75#ifdef EXPENSIVE_CHECKS
110using namespace std::placeholders;
112#define SV_NAME "slp-vectorizer"
113#define DEBUG_TYPE "SLP"
115STATISTIC(NumVectorInstructions,
"Number of vector instructions generated");
116STATISTIC(NumStridedStoreChains,
"Number of vectorized stride stores");
117STATISTIC(NumStoreChains,
"Number of vector stores created");
118STATISTIC(NumVectorizedStores,
"Number of vectorized stores");
121 "Controls which SLP graphs should be vectorized.");
125 cl::desc(
"Run the SLP vectorization passes"));
129 cl::desc(
"Enable vectorization for wider vector utilization"));
133 cl::desc(
"Only vectorize if you gain more than this "
138 cl::desc(
"Attempt to vectorize horizontal reductions"));
143 "Attempt to vectorize horizontal reductions feeding into a store"));
147 cl::desc(
"Improve the code quality by splitting alternate instructions"));
151 cl::desc(
"Reject vectorization if vector instruction count exceeds "
152 "scalar instruction count"));
156 cl::desc(
"Attempt to vectorize for this register size in bits"));
160 cl::desc(
"Maximum SLP vectorization factor (0=unlimited)"));
168 cl::desc(
"Limit the size of the SLP scheduling region per block"));
172 cl::desc(
"Attempt to vectorize for this register size in bits"));
176 cl::desc(
"Limit the recursion depth when building a vectorizable tree"));
180 cl::desc(
"Only vectorize small trees if they are fully vectorizable"));
184 cl::desc(
"Do not vectorize a bundle of PHI nodes if the product of the "
185 "bundle size and the number of incoming values exceeds this "
186 "value, to limit the compile time spent on wide PHIs"));
192 cl::desc(
"The maximum look-ahead depth for operand reordering scores"));
201 cl::desc(
"The maximum look-ahead depth for searching best rooting option"));
205 cl::desc(
"The minimum number of loads, which should be considered strided, "
206 "if the stride is > 1 or is runtime value"));
211 "The minimum number of stores, which should be considered strided, "
212 "if the stride is > 1 or is runtime value"));
216 cl::desc(
"The maximum stride, considered to be profitable."));
221 cl::desc(
"Enable SLP trees to be built from strided "
226 cl::desc(
"Disable tree reordering even if it is "
227 "profitable. Used for testing only."));
231 cl::desc(
"Generate strided loads even if they are not "
232 "profitable. Used for testing only."));
236 cl::desc(
"Display the SLP trees with Graphviz"));
240 cl::desc(
"Try to vectorize with non-power-of-2 number of elements."));
244 cl::desc(
"Force vectorization of non-vectorizable stores operands."));
249 "Use non-vectorizable instructions as potential reduction roots."));
261 cl::desc(
"Try to replace values with the idempotent instructions for "
262 "better vectorization."));
266 cl::desc(
"Loop trip count, considered by the cost model during "
267 "modeling (0=loops are ignored and considered flat code)"));
278 cl::desc(
"Use per-lane execution scale for gather/buildvector tree "
279 "entries to model LICM-hoistable buildvector sequences."));
312 return canVectorizeTy(Ty) && !Ty->isX86_FP80Ty() && !Ty->isPPC_FP128Ty() &&
324 return SI->getValueOperand()->getType();
327 return CI->getOperand(0)->getType();
330 return IE->getOperand(1)->getType();
337 "ScalableVectorType is not supported.");
352 "expected unpacked struct literal");
354 "expected all element types to be valid vector element types");
356 StructTy->getContext(),
358 return FixedVectorType::get(ElTy, 1);
371 Type *Ty,
unsigned Sz) {
376 if (NumParts == 0 || NumParts >= Sz)
391 if (NumParts == 0 || NumParts >= Sz)
396 return (Sz / RegVF) * RegVF;
408 I * VecTyNumElements, VecTyNumElements)))
410 : Mask[
I] * VecTyNumElements + J;
444 unsigned SVNumElements =
446 unsigned ShuffleMaskSize = SV->getShuffleMask().size();
447 if (SVNumElements % ShuffleMaskSize != 0)
449 unsigned GroupSize = SVNumElements / ShuffleMaskSize;
450 if (GroupSize == 0 || (VL.
size() % GroupSize) != 0)
452 unsigned NumGroup = 0;
453 for (
size_t I = 0,
E = VL.
size();
I !=
E;
I += GroupSize) {
455 Value *Src = SV->getOperand(0);
461 if (SV->getOperand(0) != Src)
464 if (!SV->isExtractSubvectorMask(Index))
466 ExpectedIndex.
set(Index / ShuffleMaskSize);
470 if (!ExpectedIndex.
all())
474 assert(NumGroup == (VL.
size() / GroupSize) &&
"Unexpected number of groups");
493 unsigned SVNumElements =
496 unsigned AccumulateLength = 0;
497 for (
Value *V : VL) {
499 for (
int M : SV->getShuffleMask())
501 : AccumulateLength + M);
502 AccumulateLength += SVNumElements;
543 return std::min<unsigned>(PartNumElems,
Size - Part * PartNumElems);
552 OS <<
"Idx: " << Idx <<
", ";
553 OS <<
"n=" << VL.
size() <<
" [" << *VL.
front() <<
", ..]";
576 if (BB !=
II->getParent())
593 Value *FirstNonUndef =
nullptr;
594 for (
Value *V : VL) {
597 if (!FirstNonUndef) {
601 if (V != FirstNonUndef)
604 return FirstNonUndef !=
nullptr;
619 bool IsCopyable =
false) {
621 return Cmp->isCommutative();
623 return BO->isCommutative() ||
624 (BO->getOpcode() == Instruction::Sub &&
632 if (match(U.getUser(),
633 m_ICmp(Pred, m_Specific(U.get()), m_Zero())) &&
634 (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE))
638 auto *I = dyn_cast<BinaryOperator>(U.get());
639 return match(U.getUser(),
640 m_Intrinsic<Intrinsic::abs>(
641 m_Specific(U.get()), m_ConstantInt(Flag))) &&
642 ((!IsCopyable && I && !I->hasNoSignedWrap()) ||
645 (BO->getOpcode() == Instruction::FSub &&
649 return match(U.getUser(),
650 m_Intrinsic<Intrinsic::fabs>(m_Specific(U.get())));
652 return I->isCommutative();
659 bool IsCopyable =
false) {
661 "The instruction is not commutative.");
665 switch (BO->getOpcode()) {
666 case Instruction::Sub:
667 case Instruction::FSub:
673 return I->isCommutableOperand(
Op);
693 constexpr unsigned IntrinsicNumOperands = 2;
694 return IntrinsicNumOperands;
696 return I->getNumOperands();
702 static_assert(std::is_same_v<T, InsertElementInst> ||
703 std::is_same_v<T, ExtractElementInst>,
713 if (CI->getValue().uge(VT->getNumElements()))
715 Index *= VT->getNumElements();
716 Index += CI->getZExtValue();
738 Type *CurrentType =
IV->getType();
739 for (
unsigned I :
IV->indices()) {
741 Index *= ST->getNumElements();
742 CurrentType = ST->getElementType(
I);
744 Index *= AT->getNumElements();
745 CurrentType = AT->getElementType();
767 return std::all_of(It, VL.
end(), [&](
Value *V) {
768 if (auto *CI = dyn_cast<CmpInst>(V))
769 return BasePred == CI->getPredicate();
770 if (auto *I = dyn_cast<Instruction>(V))
771 return I->getOpcode() == Opcode;
772 return isa<PoisonValue>(V);
800 if (MaskArg == UseMask::UndefsAsMask)
804 if (MaskArg == UseMask::FirstArg &&
Value < VF)
805 UseMask.reset(
Value);
806 else if (MaskArg == UseMask::SecondArg &&
Value >= VF)
807 UseMask.reset(
Value - VF);
815template <
bool IsPoisonOnly = false>
819 using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
827 if (!UseMask.empty()) {
838 if (*Idx < UseMask.size() && !UseMask.test(*Idx))
853 for (
unsigned I = 0,
E = VecTy->getNumElements();
I !=
E; ++
I) {
854 if (
Constant *Elem =
C->getAggregateElement(
I))
856 (UseMask.empty() || (
I < UseMask.size() && !UseMask.test(
I))))
884static std::optional<TargetTransformInfo::ShuffleKind>
897 return std::max(S, VTy->getNumElements());
900 Value *Vec1 =
nullptr;
901 Value *Vec2 =
nullptr;
906 Value *Vec = EE->getVectorOperand();
912 ShuffleMode CommonShuffleMode =
Unknown;
914 for (
unsigned I = 0,
E = VL.
size();
I <
E; ++
I) {
921 auto *Vec = EI->getVectorOperand();
935 if (Idx->getValue().uge(
Size))
937 unsigned IntIdx = Idx->getValue().getZExtValue();
944 if (!Vec1 || Vec1 == Vec) {
946 }
else if (!Vec2 || Vec2 == Vec) {
952 if (CommonShuffleMode == Permute)
956 if (Mask[
I] %
Size !=
I) {
957 CommonShuffleMode = Permute;
960 CommonShuffleMode =
Select;
963 if (CommonShuffleMode ==
Select && Vec2)
973 unsigned Opcode =
E->getOpcode();
974 assert((Opcode == Instruction::ExtractElement ||
975 Opcode == Instruction::ExtractValue) &&
976 "Expected extractelement or extractvalue instruction.");
977 if (Opcode == Instruction::ExtractElement) {
983 unsigned Idx = CI->getZExtValue();
991 if (EI->getNumIndices() != 1)
993 return *EI->idx_begin();
1027class BinOpSameOpcodeHelper {
1028 using MaskType = std::uint_fast32_t;
1030 constexpr static std::initializer_list<unsigned> SupportedOp = {
1031 Instruction::Add, Instruction::Sub, Instruction::Mul, Instruction::Shl,
1032 Instruction::AShr, Instruction::And, Instruction::Or, Instruction::Xor};
1034 "SupportedOp is not sorted.");
1052 static std::pair<ConstantInt *, unsigned>
1053 isBinOpWithConstantInt(
const Instruction *
I) {
1054 unsigned Opcode =
I->getOpcode();
1060 if (Opcode == Instruction::Sub || Opcode == Instruction::Shl ||
1061 Opcode == Instruction::AShr)
1062 return {
nullptr, 0};
1065 return {
nullptr, 0};
1067 struct InterchangeableInfo {
1070 MaskType Mask = MainOpBIT | XorBIT | OrBIT | AndBIT | SubBIT | AddBIT |
1071 MulBIT | AShrBIT | ShlBIT;
1076 MaskType SeenBefore = 0;
1077 InterchangeableInfo(
const Instruction *I) : I(I) {}
1081 bool trySet(MaskType OpcodeInMaskForm, MaskType InterchangeableMask) {
1082 if (Mask & InterchangeableMask) {
1083 SeenBefore |= OpcodeInMaskForm;
1084 Mask &= InterchangeableMask;
1089 bool equal(
unsigned Opcode) {
1090 return Opcode == I->getOpcode() && trySet(MainOpBIT, MainOpBIT);
1093 MaskType Candidate = Mask & SeenBefore;
1094 if (Candidate & MainOpBIT)
1095 return I->getOpcode();
1096 if (Candidate & ShlBIT)
1097 return Instruction::Shl;
1098 if (Candidate & AShrBIT)
1099 return Instruction::AShr;
1100 if (Candidate & MulBIT)
1101 return Instruction::Mul;
1102 if (Candidate & AddBIT)
1103 return Instruction::Add;
1104 if (Candidate & SubBIT)
1105 return Instruction::Sub;
1106 if (Candidate & AndBIT)
1107 return Instruction::And;
1108 if (Candidate & OrBIT)
1109 return Instruction::Or;
1110 if (Candidate & XorBIT)
1111 return Instruction::Xor;
1115 bool hasDefinedOpcode()
const {
return (Mask & SeenBefore) > 0; }
1118 bool hasCandidateOpcode(
unsigned Opcode)
const {
1119 MaskType Candidate = Mask & SeenBefore;
1121 case Instruction::Shl:
1122 return Candidate & ShlBIT;
1123 case Instruction::AShr:
1124 return Candidate & AShrBIT;
1125 case Instruction::Mul:
1126 return Candidate & MulBIT;
1127 case Instruction::Add:
1128 return Candidate & AddBIT;
1129 case Instruction::Sub:
1130 return Candidate & SubBIT;
1131 case Instruction::And:
1132 return Candidate & AndBIT;
1133 case Instruction::Or:
1134 return Candidate & OrBIT;
1135 case Instruction::Xor:
1136 return Candidate & XorBIT;
1137 case Instruction::LShr:
1138 case Instruction::FAdd:
1139 case Instruction::FSub:
1140 case Instruction::FMul:
1141 case Instruction::SDiv:
1142 case Instruction::UDiv:
1143 case Instruction::FDiv:
1144 case Instruction::SRem:
1145 case Instruction::URem:
1146 case Instruction::FRem:
1156 unsigned FromOpcode = I->getOpcode();
1157 if (FromOpcode == ToOpcode)
1160 auto [CI, Pos] = isBinOpWithConstantInt(I);
1161 const APInt &FromCIValue = CI->getValue();
1162 unsigned FromCIValueBitWidth = FromCIValue.
getBitWidth();
1163 Type *RHSType = I->getOperand(Pos)->getType();
1165 switch (FromOpcode) {
1166 case Instruction::Shl:
1167 if (ToOpcode == Instruction::Add && FromCIValue.
isOne())
1168 return {I->getOperand(0), I->getOperand(0)};
1169 if (ToOpcode == Instruction::Mul) {
1170 RHS = ConstantInt::get(
1174 assert(FromCIValue.
isZero() &&
"Cannot convert the instruction.");
1179 case Instruction::Mul:
1181 if (ToOpcode == Instruction::Shl) {
1182 RHS = ConstantInt::get(
1183 RHSType, APInt(FromCIValueBitWidth, FromCIValue.
logBase2()));
1185 assert(FromCIValue.
isOne() &&
"Cannot convert the instruction.");
1190 case Instruction::Add:
1191 case Instruction::Sub:
1192 if (FromCIValue.
isZero()) {
1197 "Cannot convert the instruction.");
1198 APInt NegatedVal = APInt(FromCIValue);
1199 NegatedVal.negate();
1200 RHS = ConstantInt::get(RHSType, NegatedVal);
1203 case Instruction::And:
1209 assert(FromCIValue.
isZero() &&
"Cannot convert the instruction.");
1214 Value *
LHS = I->getOperand(1 - Pos);
1223 InterchangeableInfo MainOp;
1224 InterchangeableInfo AltOp;
1226 return ::isValidForAlternation(MainOp.I->getOpcode()) &&
1229 bool initializeAltOp(
const Instruction *
I) {
1239 BinOpSameOpcodeHelper(
const Instruction *MainOp,
1240 const Instruction *AltOp =
nullptr)
1241 : MainOp(MainOp), AltOp(AltOp) {}
1242 bool add(
const Instruction *
I) {
1244 "BinOpSameOpcodeHelper only accepts BinaryOperator.");
1245 unsigned Opcode =
I->getOpcode();
1246 MaskType OpcodeInMaskForm;
1249 case Instruction::Shl:
1250 OpcodeInMaskForm = ShlBIT;
1252 case Instruction::AShr:
1253 OpcodeInMaskForm = AShrBIT;
1255 case Instruction::Mul:
1256 OpcodeInMaskForm = MulBIT;
1258 case Instruction::Add:
1259 OpcodeInMaskForm = AddBIT;
1261 case Instruction::Sub:
1262 OpcodeInMaskForm = SubBIT;
1264 case Instruction::And:
1265 OpcodeInMaskForm = AndBIT;
1267 case Instruction::Or:
1268 OpcodeInMaskForm = OrBIT;
1270 case Instruction::Xor:
1271 OpcodeInMaskForm = XorBIT;
1274 return MainOp.equal(Opcode) ||
1275 (initializeAltOp(
I) && AltOp.equal(Opcode));
1277 MaskType InterchangeableMask = OpcodeInMaskForm;
1278 ConstantInt *CI = isBinOpWithConstantInt(
I).first;
1280 constexpr MaskType CanBeAll =
1281 XorBIT | OrBIT | AndBIT | SubBIT | AddBIT | MulBIT | AShrBIT | ShlBIT;
1282 const APInt &CIValue = CI->
getValue();
1284 case Instruction::Shl:
1286 InterchangeableMask = CIValue.
isZero() ? CanBeAll : MulBIT | ShlBIT;
1287 if (CIValue.
isOne())
1288 InterchangeableMask |= AddBIT;
1290 case Instruction::Mul:
1291 if (CIValue.
isOne()) {
1292 InterchangeableMask = CanBeAll;
1296 InterchangeableMask = MulBIT | ShlBIT;
1298 case Instruction::Add:
1299 case Instruction::Sub:
1300 InterchangeableMask = CIValue.
isZero() ? CanBeAll : SubBIT | AddBIT;
1302 case Instruction::And:
1304 InterchangeableMask = CanBeAll;
1306 case Instruction::Xor:
1308 InterchangeableMask = XorBIT | OrBIT | SubBIT | AddBIT;
1312 InterchangeableMask = CanBeAll;
1316 return MainOp.trySet(OpcodeInMaskForm, InterchangeableMask) ||
1317 (initializeAltOp(
I) &&
1318 AltOp.trySet(OpcodeInMaskForm, InterchangeableMask));
1320 unsigned getMainOpcode()
const {
return MainOp.getOpcode(); }
1321 bool hasDefinedMainOpcode()
const {
return MainOp.hasDefinedOpcode(); }
1323 bool hasCandidateOpcode(
unsigned Opcode)
const {
1324 return MainOp.hasCandidateOpcode(Opcode);
1326 bool hasAltOp()
const {
return AltOp.I; }
1327 unsigned getAltOpcode()
const {
1328 return hasAltOp() ? AltOp.getOpcode() : getMainOpcode();
1330 bool hasDefinedAltOpcode()
const {
1331 return !hasAltOp() || AltOp.hasDefinedOpcode();
1334 return MainOp.getOperand(
I);
1339class InstructionsState {
1365 bool HasCopyables =
false;
1369 assert(valid() &&
"InstructionsState is invalid.");
1374 assert(valid() &&
"InstructionsState is invalid.");
1379 unsigned getOpcode()
const {
return getMainOp()->getOpcode(); }
1381 unsigned getAltOpcode()
const {
return getAltOp()->getOpcode(); }
1384 bool isAltShuffle()
const {
return getMainOp() != getAltOp(); }
1393 Instruction *getMatchingMainOpOrAltOp(Instruction *
I)
const {
1394 assert(MainOp &&
"MainOp cannot be nullptr.");
1395 if (
I->getOpcode() == MainOp->getOpcode())
1397 if (MainOp->getOpcode() == Instruction::Select &&
1398 I->getOpcode() == Instruction::ZExt && !isAltShuffle())
1401 assert(AltOp &&
"AltOp cannot be nullptr.");
1402 if (
I->getOpcode() == AltOp->getOpcode())
1404 if (!
I->isBinaryOp())
1406 BinOpSameOpcodeHelper
Converter(MainOp);
1409 if (isAltShuffle() && !
Converter.hasCandidateOpcode(MainOp->getOpcode())) {
1410 BinOpSameOpcodeHelper AltConverter(AltOp);
1411 if (AltConverter.add(
I) && AltConverter.add(AltOp) &&
1412 AltConverter.hasCandidateOpcode(AltOp->getOpcode()))
1415 if (
Converter.hasAltOp() && !isAltShuffle())
1417 return Converter.hasAltOp() ? AltOp : MainOp;
1421 bool isShiftOp()
const {
1422 return getMainOp()->isShift() && getAltOp()->isShift();
1427 return getMainOp()->isBitwiseLogicOp() && getAltOp()->isBitwiseLogicOp();
1431 bool isMulDivLikeOp()
const {
1432 constexpr std::array<unsigned, 8> MulDiv = {
1433 Instruction::Mul, Instruction::FMul, Instruction::SDiv,
1434 Instruction::UDiv, Instruction::FDiv, Instruction::SRem,
1435 Instruction::URem, Instruction::FRem};
1441 bool isAddSubLikeOp()
const {
1442 constexpr std::array<unsigned, 4>
AddSub = {
1443 Instruction::Add, Instruction::Sub, Instruction::FAdd,
1450 bool isCmpOp()
const {
1451 return (
getOpcode() == Instruction::ICmp ||
1457 bool valid()
const {
return MainOp && AltOp; }
1459 explicit operator bool()
const {
return valid(); }
1461 InstructionsState() =
delete;
1462 InstructionsState(Instruction *MainOp, Instruction *AltOp,
1463 bool HasCopyables =
false)
1464 : MainOp(MainOp), AltOp(AltOp), HasCopyables(HasCopyables) {}
1465 static InstructionsState invalid() {
return {
nullptr,
nullptr}; }
1468 bool isCopyableElement(
Value *V)
const {
1469 assert(valid() &&
"InstructionsState is invalid.");
1472 if (isAltShuffle() ||
getOpcode() == Instruction::GetElementPtr)
1477 if (
I->getParent() != MainOp->getParent() &&
1481 if (
I->getOpcode() == MainOp->getOpcode())
1483 if (!
I->isBinaryOp())
1485 BinOpSameOpcodeHelper
Converter(MainOp);
1492 bool isExpandedBinOp(
Value *V)
const {
1493 assert(valid() &&
"InstructionsState is invalid.");
1494 if (isCopyableElement(V))
1499 auto CheckForTransformedOpcode = [](
const Instruction *RefOp,
1502 case Instruction::Add:
1503 switch (ExpandingOp->getOpcode()) {
1504 case Instruction::Shl:
1515 Instruction *MainOp = getMatchingMainOpOrAltOp(ExpandingOp);
1517 "The instruction should be compatible with either main or alt op.");
1518 return CheckForTransformedOpcode(MainOp, ExpandingOp);
1523 bool isExpandedOperand(Instruction *
I,
unsigned Idx)
const {
1524 assert(isExpandedBinOp(
I) &&
"Expected an expanded binop.");
1525 switch (
I->getOpcode()) {
1526 case Instruction::Shl:
1535 bool isNonSchedulable(
Value *V)
const {
1536 assert(valid() &&
"InstructionsState is invalid.");
1543 if (getMainOp() == V)
1545 if (isCopyableElement(V)) {
1546 auto IsNonSchedulableCopyableElement = [
this](
Value *
V) {
1548 return !
I ||
isa<PHINode>(
I) ||
I->getParent() != MainOp->getParent() ||
1553 !MainOp->comesBefore(
I));
1556 return IsNonSchedulableCopyableElement(V);
1563 bool areInstructionsWithCopyableElements()
const {
1564 assert(valid() &&
"InstructionsState is invalid.");
1565 return HasCopyables;
1569std::pair<Instruction *, SmallVector<Value *>>
1571 Instruction *SelectedOp = S.getMatchingMainOpOrAltOp(
I);
1572 assert(SelectedOp &&
"Cannot convert the instruction.");
1573 if (
I->isBinaryOp()) {
1575 return std::make_pair(SelectedOp,
Converter.getOperand(SelectedOp));
1594 for (
Value *V : VL) {
1599 if (Inst->getOpcode() == Opcode)
1613 BaseOp0 == Op0 || BaseOp1 == Op1 ||
1624 "Assessing comparisons of different types?");
1634 return (BasePred == Pred &&
1636 (BasePred == SwappedPred &&
1647 return InstructionsState::invalid();
1651 return InstructionsState::invalid();
1656 (VL.
size() == 2 && InstCnt < 2))
1657 return InstructionsState::invalid();
1666 unsigned AltOpcode = Opcode;
1668 BinOpSameOpcodeHelper BinOpHelper(MainOp);
1669 bool SwappedPredsCompatible = IsCmpOp && [&]() {
1671 UniquePreds.
insert(BasePred);
1672 UniqueNonSwappedPreds.
insert(BasePred);
1673 for (
Value *V : VL) {
1680 UniqueNonSwappedPreds.
insert(CurrentPred);
1681 if (!UniquePreds.
contains(CurrentPred) &&
1682 !UniquePreds.
contains(SwappedCurrentPred))
1683 UniquePreds.
insert(CurrentPred);
1688 return UniqueNonSwappedPreds.
size() > 2 && UniquePreds.
size() == 2;
1698 return InstructionsState::invalid();
1700 bool AnyPoison = InstCnt != VL.
size();
1711 if (AnyPoison && (
I->isIntDivRem() ||
I->isFPDivRem() ||
isa<CallInst>(
I)))
1712 return InstructionsState::invalid();
1713 unsigned InstOpcode =
I->getOpcode();
1715 if (BinOpHelper.add(
I))
1720 Value *Op1 =
I->getOperand(0);
1723 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
1725 if (Opcode == AltOpcode) {
1728 "Cast isn't safe for alternation, logic needs to be updated!");
1729 AltOpcode = InstOpcode;
1736 Type *Ty0 = BaseInst->getOperand(0)->getType();
1737 Type *Ty1 = Inst->getOperand(0)->getType();
1739 assert(InstOpcode == Opcode &&
"Expected same CmpInst opcode.");
1740 assert(InstOpcode == AltOpcode &&
1741 "Alternate instructions are only supported by BinaryOperator "
1749 if ((VL.
size() == 2 || SwappedPredsCompatible) &&
1750 (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
1756 if (MainOp != AltOp) {
1759 }
else if (BasePred != CurrentPred) {
1762 "CmpInst isn't safe for alternation, logic needs to be updated!");
1767 if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
1768 AltPred == CurrentPred || AltPred == SwappedCurrentPred)
1771 }
else if (InstOpcode == Opcode) {
1772 assert(InstOpcode == AltOpcode &&
1773 "Alternate instructions are only supported by BinaryOperator and "
1776 if (Gep->getNumOperands() != 2 ||
1778 return InstructionsState::invalid();
1781 return InstructionsState::invalid();
1784 if (!LI->isSimple() || !BaseLI->isSimple())
1785 return InstructionsState::invalid();
1789 return InstructionsState::invalid();
1790 if (
Call->hasOperandBundles() &&
1792 !std::equal(
Call->op_begin() +
Call->getBundleOperandsStartIndex(),
1793 Call->op_begin() +
Call->getBundleOperandsEndIndex(),
1796 return InstructionsState::invalid();
1799 return InstructionsState::invalid();
1802 if (Mappings.
size() != BaseMappings.
size() ||
1803 Mappings.
front().ISA != BaseMappings.
front().ISA ||
1804 Mappings.
front().ScalarName != BaseMappings.
front().ScalarName ||
1805 Mappings.
front().VectorName != BaseMappings.
front().VectorName ||
1806 Mappings.
front().Shape.VF != BaseMappings.
front().Shape.VF ||
1807 Mappings.
front().Shape.Parameters !=
1808 BaseMappings.
front().Shape.Parameters)
1809 return InstructionsState::invalid();
1814 return InstructionsState::invalid();
1818 if (!BinOpHelper.hasDefinedMainOpcode() ||
1819 !BinOpHelper.hasDefinedAltOpcode())
1820 return InstructionsState::invalid();
1822 assert(MainOp &&
"Cannot find MainOp with Opcode from BinOpHelper.");
1824 assert(AltOp &&
"Cannot find AltOp with Opcode from BinOpHelper.");
1827 "Incorrect implementation of allSameOpcode.");
1828 InstructionsState S(MainOp, AltOp);
1834 "Invalid InstructionsState.");
1842 return all_of(VL, [&](
Value *V) {
return V->getType() == Ty; });
1852 unsigned Opcode = UserInst->
getOpcode();
1854 case Instruction::Load: {
1858 case Instruction::Store: {
1860 return (
SI->getPointerOperand() == Scalar);
1862 case Instruction::Call: {
1866 return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index(), TTI) &&
1867 Arg.value().get() == Scalar;
1887 return LI->isSimple();
1889 return SI->isSimple();
1891 return !
MI->isVolatile();
1899 bool ExtendingManyInputs =
false) {
1900 if (SubMask.
empty())
1903 (!ExtendingManyInputs || SubMask.
size() > Mask.size() ||
1906 "SubMask with many inputs support must be larger than the mask.");
1908 Mask.append(SubMask.
begin(), SubMask.
end());
1912 int TermValue = std::min(Mask.size(), SubMask.
size());
1913 for (
int I = 0,
E = SubMask.
size();
I <
E; ++
I) {
1915 (!ExtendingManyInputs &&
1916 (SubMask[
I] >= TermValue || Mask[SubMask[
I]] >= TermValue)))
1918 NewMask[
I] = Mask[SubMask[
I]];
1934 const size_t Sz = Order.
size();
1937 for (
unsigned I = 0;
I < Sz; ++
I) {
1939 UnusedIndices.
reset(Order[
I]);
1941 MaskedIndices.
set(
I);
1943 if (MaskedIndices.
none())
1946 "Non-synced masked/available indices.");
1950 assert(Idx >= 0 &&
"Indices must be synced.");
1960 unsigned Opcode0,
unsigned Opcode1) {
1967 OpcodeMask.
set(Lane * ScalarTyNumElements,
1968 Lane * ScalarTyNumElements + ScalarTyNumElements);
1977 "Expected scalar constants.");
1980 std::fill_n(NewVal.begin() +
I * VF, VF, V);
1987 const unsigned E = Indices.
size();
1989 for (
unsigned I = 0;
I <
E; ++
I)
1990 Mask[Indices[
I]] =
I;
1996 assert(!Mask.empty() &&
"Expected non-empty mask.");
2000 for (
unsigned I = 0,
E = Prev.
size();
I <
E; ++
I)
2002 Scalars[Mask[
I]] = Prev[
I];
2015 auto *IO = dyn_cast<Instruction>(V);
2018 return isa<PHINode>(IO) || IO->getParent() != I->getParent();
2031 return !
I->mayReadOrWriteMemory() && !
I->hasNUsesOrMore(
UsesLimit) &&
2033 auto *IU = dyn_cast<Instruction>(U);
2036 return IU->getParent() != I->getParent() || isa<PHINode>(IU);
2052 return !VL.
empty() &&
2070 return NumParts > 0 && NumParts < Sz &&
has_single_bit(Sz / NumParts) &&
2079 const unsigned Limit = std::numeric_limits<unsigned>::max()) {
2082 unsigned NumParts =
TTI.getNumberOfParts(VecTy);
2083 if (NumParts == 0 || NumParts >= Limit)
2089 if (NumParts >= Sz || PWSz % NumParts != 0 ||
2090 (PWSz / NumParts) % ScalarSz != 0 ||
2093 const unsigned NumElts = PWSz / NumParts;
2102 class ScheduleEntity;
2104 class ScheduleCopyableData;
2105 class ScheduleBundle;
2142 : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li), DT(Dt),
2143 AC(AC), DB(DB), DL(DL), ORE(ORE),
2162 MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
2175 ArrayRef<std::tuple<WeakTrackingVH, unsigned, bool, bool>>
2176 VectorValuesAndScales = {});
2198 const SmallDenseSet<Value *> &UserIgnoreLst);
2205 assert(!VectorizableTree.empty() &&
"No graph to get the first node from");
2206 return VectorizableTree.front()->Scalars;
2212 const TreeEntry &Root = *VectorizableTree.front();
2213 if (Root.State != TreeEntry::Vectorize || Root.isAltShuffle() ||
2214 !Root.Scalars.
front()->getType()->isIntegerTy())
2215 return std::nullopt;
2216 auto It = MinBWs.find(&Root);
2217 if (It != MinBWs.end())
2221 if (Root.getOpcode() == Instruction::ZExt ||
2222 Root.getOpcode() == Instruction::SExt)
2223 return std::make_pair(
cast<CastInst>(Root.getMainOp())->getSrcTy(),
2224 Root.getOpcode() == Instruction::SExt);
2225 return std::nullopt;
2231 return MinBWs.at(VectorizableTree.front().get()).second;
2236 if (ReductionBitWidth == 0 ||
2237 !VectorizableTree.front()->Scalars.front()->getType()->isIntegerTy() ||
2238 ReductionBitWidth >=
2239 DL->getTypeSizeInBits(
2240 VectorizableTree.front()->Scalars.front()->getType()))
2242 getWidenedType(VectorizableTree.front()->Scalars.front()->getType(),
2243 VectorizableTree.front()->getVectorFactor()));
2246 VectorizableTree.front()->Scalars.front()->getContext(),
2248 VectorizableTree.front()->getVectorFactor()));
2253 return VectorizableTree.front()->hasState() &&
2254 (VectorizableTree.front()->CombinedOp == TreeEntry::ReducedBitcast ||
2255 VectorizableTree.front()->CombinedOp ==
2256 TreeEntry::ReducedBitcastBSwap ||
2257 VectorizableTree.front()->CombinedOp ==
2258 TreeEntry::ReducedBitcastLoads ||
2259 VectorizableTree.front()->CombinedOp ==
2260 TreeEntry::ReducedBitcastBSwapLoads) &&
2261 VectorizableTree.front()->State == TreeEntry::Vectorize;
2266 return VectorizableTree.front()->hasState() &&
2267 VectorizableTree.front()->CombinedOp ==
2268 TreeEntry::ReducedCmpBitcast &&
2269 VectorizableTree.front()->State == TreeEntry::Vectorize;
2287 VectorizableTree.clear();
2288 ScalarToTreeEntries.clear();
2289 DeletedNodes.clear();
2290 TransformedToGatherNodes.clear();
2291 OperandsToTreeEntry.clear();
2292 ScalarsInSplitNodes.clear();
2294 NonScheduledFirst.clear();
2295 EntryToLastInstruction.clear();
2296 LastInstructionToPos.clear();
2297 LoadEntriesToVectorize.clear();
2298 IsGraphTransformMode =
false;
2299 GatheredLoadsEntriesFirst.reset();
2300 CompressEntryToData.clear();
2301 ExternalUses.clear();
2302 ExternalUsesAsOriginalScalar.clear();
2303 ExternalUsesWithNonUsers.clear();
2304 for (
auto &Iter : BlocksSchedules) {
2305 BlockScheduling *BS = Iter.second.get();
2309 ReductionBitWidth = 0;
2311 CastMaxMinBWSizes.reset();
2312 ExtraBitWidthNodes.clear();
2313 InstrElementSize.clear();
2314 UserIgnoreList =
nullptr;
2315 PostponedGathers.clear();
2316 ValueToGatherNodes.clear();
2317 TreeEntryToStridedPtrInfoMap.clear();
2318 CurrentLoopNest.clear();
2319 MergedLoopBTCs.clear();
2335 assert(!Order.
empty() &&
"expected non-empty order");
2336 const unsigned Sz = Order.
size();
2338 return P.value() ==
P.index() ||
P.value() == Sz;
2351 bool IgnoreReorder);
2364 std::optional<OrdersType>
2402 return MaxVecRegSize;
2407 return MinVecRegSize;
2415 unsigned MaxVF =
MaxVFOption.getNumOccurrences() ?
2416 MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode);
2417 return MaxVF ? MaxVF : UINT_MAX;
2439 Align Alignment,
const int64_t Diff,
2440 const size_t Sz)
const;
2480 Value *Ptr0, StridedPtrInfo &SPtrInfo)
const;
2499 Align CommonAlignment,
2501 StridedPtrInfo &SPtrInfo,
bool IsLoad)
const;
2516 StridedPtrInfo &SPtrInfo,
2517 unsigned *BestVF =
nullptr,
2518 bool TryRecursiveCheck =
true)
const;
2524 auto IsSame = [&](
const TreeEntry *TE) {
return TE->isSame(VL); };
2530 return any_of(getTreeEntries(S.getMainOp()), IsSame) ||
2531 any_of(ValueToGatherNodes.lookup(S.getMainOp()), IsSame);
2538 for (
Value *V : VL) {
2542 for (
const TreeEntry *TE : ValueToGatherNodes.lookup(V)) {
2543 if (!Visited.
insert(TE).second)
2554 ListOfKnonwnNonVectorizableLoads.insert(
hash_value(VL));
2558 template <
typename T>
2560 return ListOfKnonwnNonVectorizableLoads.contains(
hash_value(VL));
2585 OS <<
"{User:" << (
UserTE ? std::to_string(
UserTE->Idx) :
"null")
2586 <<
" EdgeIdx:" <<
EdgeIdx <<
"}";
2611 : TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes),
2612 MaxLevel(MaxLevel) {}
2668 auto AllUsersAreInternal = [U1, U2,
this](
Value *V1,
Value *V2) {
2673 auto AllUsersVectorized = [U1, U2,
this](
Value *V) {
2675 return U == U1 || U == U2 || R.isVectorized(U);
2678 return AllUsersVectorized(V1) && AllUsersVectorized(V2);
2681 if (R.TTI->isLegalBroadcastLoad(V1->getType(),
2683 ((
int)V1->getNumUses() == NumLanes ||
2684 AllUsersAreInternal(V1, V2)))
2690 auto CheckSameEntryOrFail = [&]() {
2695 any_of(TEs2, [&](TreeEntry *E) {
return Set.contains(E); }))
2704 if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
2706 return CheckSameEntryOrFail();
2709 LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
2710 LI2->getPointerOperand(), DL, SE,
true);
2711 if (!Dist || *Dist == 0) {
2714 R.TTI->isLegalMaskedGather(
2717 return CheckSameEntryOrFail();
2721 if (std::abs(*Dist) > NumLanes / 2)
2754 Value *EV2 =
nullptr;
2767 int Dist = Idx2 - Idx1;
2770 if (std::abs(Dist) == 0)
2772 if (std::abs(Dist) > NumLanes / 2)
2779 return CheckSameEntryOrFail();
2785 if (I1->getParent() != I2->getParent())
2786 return CheckSameEntryOrFail();
2795 V->getType() ==
Cond->getType()) ||
2798 V->getType() ==
Cond->getType()))
2807 (S.getMainOp()->getNumOperands() <= 2 || !MainAltOps.
empty() ||
2808 !S.isAltShuffle()) &&
2812 S.getMainOp()->getNumOperands();
2824 return CheckSameEntryOrFail();
2858 int ShallowScoreAtThisLevel =
2869 if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
2872 (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
2874 ShallowScoreAtThisLevel))
2875 return ShallowScoreAtThisLevel;
2876 assert(I1 && I2 &&
"Should have early exited.");
2883 if (I1->getNumOperands() != I2->getNumOperands())
2885 for (
unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
2886 OpIdx1 != NumOperands1; ++OpIdx1) {
2888 int MaxTmpScore = 0;
2889 unsigned MaxOpIdx2 = 0;
2890 bool FoundBest =
false;
2894 ? I2->getNumOperands()
2895 : std::min(I2->getNumOperands(), OpIdx1 + 1);
2896 assert(FromIdx <= ToIdx &&
"Bad index");
2897 for (
unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
2899 if (Op2Used.
count(OpIdx2))
2904 I1, I2, CurrLevel + 1, {});
2907 TmpScore > MaxTmpScore) {
2908 MaxTmpScore = TmpScore;
2915 Op2Used.
insert(MaxOpIdx2);
2916 ShallowScoreAtThisLevel += MaxTmpScore;
2919 return ShallowScoreAtThisLevel;
2950 struct OperandData {
2951 OperandData() =
default;
2952 OperandData(
Value *V,
bool APO,
bool IsUsed)
2953 : V(V), APO(APO), IsUsed(IsUsed) {}
2963 bool IsUsed =
false;
2972 enum class ReorderingMode {
2986 unsigned ArgSize = 0;
2992 const Loop *L =
nullptr;
/// \returns a mutable reference to the operand data stored at
/// OpsVec[OpIdx][Lane]. No bounds checking is performed here.
2995 OperandData &getData(
unsigned OpIdx,
unsigned Lane) {
2996 return OpsVec[
OpIdx][Lane];
/// Const overload: \returns a read-only reference to the operand data stored
/// at OpsVec[OpIdx][Lane]. No bounds checking is performed here.
3000 const OperandData &getData(
unsigned OpIdx,
unsigned Lane)
const {
3001 return OpsVec[
OpIdx][Lane];
3006 for (
unsigned OpIdx = 0, NumOperands = getNumOperands();
3008 for (
unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
3010 OpsVec[
OpIdx][Lane].IsUsed =
false;
/// Exchanges the operand data held at operand indices \p OpIdx1 and
/// \p OpIdx2 within the single lane \p Lane; other lanes are not touched.
3014 void swap(
unsigned OpIdx1,
unsigned OpIdx2,
unsigned Lane) {
3015 std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
3027 int getSplatScore(
unsigned Lane,
unsigned OpIdx,
unsigned Idx,
3029 Value *IdxLaneV = getData(Idx, Lane).V;
3042 unsigned UniquesCount = Uniques.
size();
3043 auto IdxIt = Uniques.
find(IdxLaneV);
3044 unsigned UniquesCntWithIdxLaneV =
3045 IdxIt != Uniques.
end() ? UniquesCount : UniquesCount + 1;
3047 auto OpIdxIt = Uniques.
find(OpIdxLaneV);
3048 unsigned UniquesCntWithOpIdxLaneV =
3049 OpIdxIt != Uniques.
end() ? UniquesCount : UniquesCount + 1;
3050 if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
3052 return std::min(
bit_ceil(UniquesCntWithOpIdxLaneV) -
3053 UniquesCntWithOpIdxLaneV,
3054 UniquesCntWithOpIdxLaneV -
3056 ((IdxIt != Uniques.
end() && UsedLanes.
test(IdxIt->second))
3057 ? UniquesCntWithIdxLaneV -
bit_floor(UniquesCntWithIdxLaneV)
3058 :
bit_ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
3067 int getExternalUseScore(
unsigned Lane,
unsigned OpIdx,
unsigned Idx)
const {
3068 Value *IdxLaneV = getData(Idx, Lane).V;
3081 return R.areAllUsersVectorized(IdxLaneI)
3089 static const int ScoreScaleFactor = 10;
3097 int Lane,
unsigned OpIdx,
unsigned Idx,
3107 int SplatScore = getSplatScore(Lane,
OpIdx, Idx, UsedLanes);
3108 if (Score <= -SplatScore) {
3112 Score += SplatScore;
3118 Score *= ScoreScaleFactor;
3119 Score += getExternalUseScore(Lane,
OpIdx, Idx);
3137 std::optional<unsigned>
3138 getBestOperand(
unsigned OpIdx,
int Lane,
int LastLane,
3142 unsigned NumOperands = getNumOperands();
3145 Value *OpLastLane = getData(
OpIdx, LastLane).V;
3148 ReorderingMode RMode = ReorderingModes[
OpIdx];
3149 if (RMode == ReorderingMode::Failed)
3150 return std::nullopt;
3153 bool OpIdxAPO = getData(
OpIdx, Lane).APO;
3159 std::optional<unsigned> Idx;
3163 BestScoresPerLanes.try_emplace(std::make_pair(
OpIdx, Lane), 0)
3169 bool IsUsed = RMode == ReorderingMode::Splat ||
3170 RMode == ReorderingMode::Constant ||
3171 RMode == ReorderingMode::Load;
3173 for (
unsigned Idx = 0; Idx != NumOperands; ++Idx) {
3175 OperandData &OpData = getData(Idx, Lane);
3177 bool OpAPO = OpData.APO;
3186 if (OpAPO != OpIdxAPO)
3191 case ReorderingMode::Load:
3192 case ReorderingMode::Opcode: {
3193 bool LeftToRight = Lane > LastLane;
3194 Value *OpLeft = (LeftToRight) ? OpLastLane :
Op;
3195 Value *OpRight = (LeftToRight) ?
Op : OpLastLane;
3196 int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
3197 OpIdx, Idx, IsUsed, UsedLanes);
3198 if (Score >
static_cast<int>(BestOp.Score) ||
3199 (Score > 0 && Score ==
static_cast<int>(BestOp.Score) &&
3202 BestOp.Score = Score;
3203 BestScoresPerLanes[std::make_pair(
OpIdx, Lane)] = Score;
3207 case ReorderingMode::Constant:
3209 (!BestOp.Score && L && L->isLoopInvariant(
Op))) {
3213 BestScoresPerLanes[std::make_pair(
OpIdx, Lane)] =
3220 case ReorderingMode::Splat:
3222 IsUsed =
Op == OpLastLane;
3223 if (
Op == OpLastLane) {
3225 BestScoresPerLanes[std::make_pair(
OpIdx, Lane)] =
3231 case ReorderingMode::Failed:
3237 getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
3241 return std::nullopt;
3248 unsigned getBestLaneToStartReordering()
const {
3249 unsigned Min = UINT_MAX;
3250 unsigned SameOpNumber = 0;
3261 for (
int I = getNumLanes();
I > 0; --
I) {
3262 unsigned Lane =
I - 1;
3263 OperandsOrderData NumFreeOpsHash =
3264 getMaxNumOperandsThatCanBeReordered(Lane);
3267 if (NumFreeOpsHash.NumOfAPOs < Min) {
3268 Min = NumFreeOpsHash.NumOfAPOs;
3269 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
3271 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
3272 }
else if (NumFreeOpsHash.NumOfAPOs == Min &&
3273 NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
3276 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
3277 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
3278 }
else if (NumFreeOpsHash.NumOfAPOs == Min &&
3279 NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
3280 auto [It, Inserted] =
3281 HashMap.
try_emplace(NumFreeOpsHash.Hash, 1, Lane);
3287 unsigned BestLane = 0;
3288 unsigned CntMin = UINT_MAX;
3290 if (
Data.second.first < CntMin) {
3291 CntMin =
Data.second.first;
3292 BestLane =
Data.second.second;
3299 struct OperandsOrderData {
3302 unsigned NumOfAPOs = UINT_MAX;
3305 unsigned NumOpsWithSameOpcodeParent = 0;
3319 OperandsOrderData getMaxNumOperandsThatCanBeReordered(
unsigned Lane)
const {
3320 unsigned CntTrue = 0;
3321 unsigned NumOperands = getNumOperands();
3331 bool AllUndefs =
true;
3332 unsigned NumOpsWithSameOpcodeParent = 0;
3337 const OperandData &OpData = getData(
OpIdx, Lane);
3344 I->getParent() != Parent) {
3345 if (NumOpsWithSameOpcodeParent == 0) {
3346 NumOpsWithSameOpcodeParent = 1;
3348 Parent =
I->getParent();
3350 --NumOpsWithSameOpcodeParent;
3353 ++NumOpsWithSameOpcodeParent;
3362 OperandsOrderData
Data;
3363 Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
3364 Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
3371 const InstructionsState &S) {
3375 return VL.
size() == getNumLanes();
3377 "Expected same number of lanes");
3378 assert(S.valid() &&
"InstructionsState is invalid.");
3384 OpsVec.resize(ArgSize);
3385 unsigned NumLanes = VL.
size();
3386 for (OperandDataVec &
Ops : OpsVec)
3387 Ops.resize(NumLanes);
3402 OpsVec[
OpIdx][Lane] = {Operands[
OpIdx][Lane],
true,
false};
3405 bool IsInverseOperation =
false;
3406 if (S.isCopyableElement(VL[Lane])) {
3408 IsInverseOperation =
3411 assert(
I &&
"Expected instruction");
3412 auto [SelectedOp,
Ops] = convertTo(
I, S);
3419 bool APO = (
OpIdx == 0) ?
false : IsInverseOperation;
3420 OpsVec[
OpIdx][Lane] = {Operands[
OpIdx][Lane], APO,
false};
/// \returns the number of operands, i.e. the current value of ArgSize.
3426 unsigned getNumOperands()
const {
return ArgSize; }
/// \returns the number of lanes, read as the size of the first operand
/// vector. OpsVec[0] is indexed unconditionally, so OpsVec must not be empty
/// when this is called.
3429 unsigned getNumLanes()
const {
return OpsVec[0].size(); }
/// \returns the V member of the operand data at (\p OpIdx, \p Lane), as
/// looked up through getData().
3432 Value *getValue(
unsigned OpIdx,
unsigned Lane)
const {
3433 return getData(
OpIdx, Lane).V;
/// \returns true if OpsVec currently holds no operand vectors.
3437 bool empty()
const {
return OpsVec.empty(); }
/// Removes every operand vector from OpsVec.
3440 void clear() { OpsVec.clear(); }
3445 bool shouldBroadcast(
Value *
Op,
unsigned OpIdx,
unsigned Lane) {
3447 "Op is expected to be getValue(OpIdx, Lane).");
3451 bool OpAPO = getData(
OpIdx, Lane).APO;
3452 bool IsInvariant = L && L->isLoopInvariant(
Op);
3454 for (
unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
3458 bool FoundCandidate =
false;
3459 for (
unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
3460 OperandData &
Data = getData(OpI, Ln);
3461 if (
Data.APO != OpAPO ||
Data.IsUsed)
3463 Value *OpILane = getValue(OpI, Lane);
3487 L->isLoopInvariant(
Data.V))) {
3488 FoundCandidate =
true;
3495 if (!FoundCandidate)
3498 return getNumLanes() == 2 || Cnt > 1;
3505 "Op is expected to be getValue(OpIdx, Lane).");
3506 bool OpAPO = getData(
OpIdx, Lane).APO;
3507 for (
unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
3511 const OperandData &
Data = getData(OpI, Ln);
3512 if (
Data.APO != OpAPO ||
Data.IsUsed)
3514 Value *OpILn = getValue(OpI, Ln);
3515 return (L && L->isLoopInvariant(OpILn)) ||
3527 const InstructionsState &S,
const BoUpSLP &R)
3528 : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R),
3529 L(R.LI->getLoopFor(S.getMainOp()->
getParent())) {
3531 appendOperands(RootVL, Operands, S);
3539 "Expected same num of lanes across all operands");
3540 for (
unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
3541 OpVL[Lane] = OpsVec[
OpIdx][Lane].V;
3549 unsigned NumOperands = getNumOperands();
3550 unsigned NumLanes = getNumLanes();
3570 unsigned FirstLane = getBestLaneToStartReordering();
3579 if (shouldBroadcast(OpLane0,
OpIdx, FirstLane) ||
3580 !canBeVectorized(OpILane0,
OpIdx, FirstLane))
3581 ReorderingModes[
OpIdx] = ReorderingMode::Splat;
3583 ReorderingModes[
OpIdx] = ReorderingMode::Load;
3585 ReorderingModes[
OpIdx] = ReorderingMode::Opcode;
3587 ReorderingModes[
OpIdx] = ReorderingMode::Constant;
3590 ReorderingModes[
OpIdx] = ReorderingMode::Splat;
3599 auto &&SkipReordering = [
this]() {
3602 for (
const OperandData &
Data : Op0)
3605 ArrayRef(OpsVec).slice(1, getNumOperands() - 1)) {
3606 if (
any_of(
Op, [&UniqueValues](
const OperandData &
Data) {
3611 return UniqueValues.
size() != 2;
3623 if (SkipReordering())
3626 bool StrategyFailed =
false;
3634 for (
unsigned I = 0;
I < NumOperands; ++
I)
3635 MainAltOps[
I].push_back(getData(
I, FirstLane).V);
3638 UsedLanes.
set(FirstLane);
3639 for (
unsigned Distance = 1; Distance != NumLanes; ++Distance) {
3641 for (
int Direction : {+1, -1}) {
3642 int Lane = FirstLane + Direction * Distance;
3643 if (Lane < 0 || Lane >= (
int)NumLanes)
3645 UsedLanes.
set(Lane);
3646 int LastLane = Lane - Direction;
3647 assert(LastLane >= 0 && LastLane < (
int)NumLanes &&
3652 std::optional<unsigned> BestIdx =
3653 getBestOperand(
OpIdx, Lane, LastLane, ReorderingModes,
3654 MainAltOps[
OpIdx], UsedLanes);
3661 swap(
OpIdx, *BestIdx, Lane);
3664 StrategyFailed =
true;
3668 OperandData &AltOp = getData(
OpIdx, Lane);
3669 InstructionsState OpS =
3671 if (OpS && OpS.isAltShuffle())
3678 if (!StrategyFailed)
3683#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3686 case ReorderingMode::Load:
3688 case ReorderingMode::Opcode:
3690 case ReorderingMode::Constant:
3692 case ReorderingMode::Splat:
3694 case ReorderingMode::Failed:
3715 const unsigned Indent = 2;
3717 for (
const OperandDataVec &OpDataVec : OpsVec) {
3718 OS <<
"Operand " << Cnt++ <<
"\n";
3719 for (
const OperandData &OpData : OpDataVec) {
3720 OS.
indent(Indent) <<
"{";
3721 if (
Value *V = OpData.V)
3725 OS <<
", APO:" << OpData.APO <<
"}\n";
3742 std::pair<std::optional<int>,
int>
3747 int BestScore = Limit;
3748 std::optional<int> Index;
3749 for (
int I :
seq<int>(0, Candidates.size())) {
3751 Candidates[
I].second,
3754 if (Score > BestScore) {
3759 return std::make_pair(Index, BestScore);
3769 DeletedInstructions.insert(
I);
3774 template <
typename T>
3777 ArrayRef<std::tuple<WeakTrackingVH, unsigned, bool, bool>>
3778 VectorValuesAndScales) {
3780 for (T *V : DeadVals) {
3785 for (T *V : DeadVals) {
3786 if (!V || !Processed.
insert(V).second)
3791 for (
Use &U :
I->operands()) {
3793 OpI && !DeletedInstructions.contains(OpI) && OpI->hasOneUser() &&
3795 (Entries.
empty() ||
none_of(Entries, [&](
const TreeEntry *Entry) {
3796 return Entry->VectorizedValue == OpI;
3800 I->dropAllReferences();
3802 for (T *V : DeadVals) {
3804 if (!
I->getParent())
3809 cast<Instruction>(U.getUser()));
3811 "trying to erase instruction with users.");
3812 I->removeFromParent();
3816 while (!DeadInsts.
empty()) {
3819 if (!VI || !VI->getParent())
3822 "Live instruction found in dead worklist!");
3823 assert(VI->use_empty() &&
"Instructions with uses are not dead.");
3830 for (
Use &OpU : VI->operands()) {
3831 Value *OpV = OpU.get();
3843 if (!DeletedInstructions.contains(OpI) &&
3844 (!OpI->getType()->isVectorTy() ||
3846 VectorValuesAndScales,
3847 [&](
const std::tuple<WeakTrackingVH, unsigned, bool, bool>
3848 &V) {
return std::get<0>(V) == OpI; })) &&
3853 VI->removeFromParent();
3855 SE->forgetValue(VI);
3862 return AnalyzedReductionsRoots.count(
I);
3867 AnalyzedReductionsRoots.insert(
I);
3872 return AnalyzedReductionVals.contains(
hash_value(VL));
3877 AnalyzedReductionVals.insert(
hash_value(VL));
3881 AnalyzedReductionsRoots.clear();
3882 AnalyzedReductionVals.clear();
3883 AnalyzedMinBWVals.clear();
3891 return MustGather.contains(V);
3895 return NonScheduledFirst.contains(V);
3900 assert(V &&
"V cannot be nullptr.");
3902 return any_of(Entries, [&](
const TreeEntry *E) {
3903 return !DeletedNodes.contains(E) && !TransformedToGatherNodes.contains(E);
3914 const InstructionsState &LocalState,
3926 bool collectValuesToDemote(
3927 const TreeEntry &E,
bool IsProfitableToDemoteRoot,
unsigned &
BitWidth,
3930 bool &IsProfitableToDemote,
bool IsTruncRoot)
const;
3939 void buildReorderableOperands(
3947 void reorderNodeWithReuses(TreeEntry &TE,
ArrayRef<int> Mask)
const;
3950 bool areAllUsersVectorized(
3955 unsigned getNumScalarInsts()
const;
3959 unsigned getNumVectorInsts()
const;
3967 const TreeEntry *getOperandEntry(
const TreeEntry *E,
unsigned Idx)
const;
/// Non-const overload of getOperandEntry(). It adds const to \p E, forwards
/// to the const overload with the same \p Idx, and casts the constness of the
/// returned pointer away again, so both overloads share one implementation.
3968 TreeEntry *getOperandEntry(TreeEntry *E,
unsigned Idx) {
3969 return const_cast<TreeEntry *
>(
3970 getOperandEntry(
const_cast<const TreeEntry *
>(E), Idx));
3976 Instruction *getRootEntryInstruction(
const TreeEntry &Entry)
const;
3980 getCastContextHint(
const TreeEntry &TE)
const;
3987 uint64_t getScaleToLoopIterations(
const TreeEntry &TE,
3988 Value *Scalar =
nullptr,
4006 uint64_t getGatherNodeEffectiveScale(
const TreeEntry &TE,
4027 unsigned InterleaveFactor = 0);
4038 bool ResizeAllowed =
false)
const;
4045 Value *vectorizeOperand(TreeEntry *
E,
unsigned NodeIdx);
4050 template <
typename BVTy,
typename ResTy,
typename... Args>
4051 ResTy processBuildVector(
const TreeEntry *
E,
Type *ScalarTy, Args &...Params);
4056 Value *createBuildVector(
const TreeEntry *
E,
Type *ScalarTy);
4062 Instruction &getLastInstructionInBundle(
const TreeEntry *
E);
4069 std::optional<TargetTransformInfo::ShuffleKind>
4081 unsigned NumParts)
const;
4093 std::optional<TargetTransformInfo::ShuffleKind>
4094 isGatherShuffledSingleRegisterEntry(
4097 unsigned SliceSize);
4111 isGatherShuffledEntry(
4114 unsigned NumParts,
bool ForOrder =
false);
4120 Type *ScalarTy)
const;
4124 void setInsertPointAfterBundle(
const TreeEntry *
E);
4134 bool isFullyVectorizableTinyTree(
bool ForReduction)
const;
4139 void tryToVectorizeGatheredLoads(
4141 std::tuple<BasicBlock *, Value *, Type *>,
4149 collectUserStores(
const BoUpSLP::TreeEntry *TE)
const;
4165 findExternalStoreUsersReorderIndices(TreeEntry *TE)
const;
4169 void reorderGatherNode(TreeEntry &TE);
4176 bool matchesShlZExt(
const TreeEntry &TE,
OrdersType &Order,
bool &IsBSwap,
4177 bool &ForLoads)
const;
4181 bool matchesInversedZExtSelect(
4182 const TreeEntry &SelectTE,
4188 bool matchesSelectOfBits(
const TreeEntry &SelectTE)
const;
/// Constructs a tree entry and binds its Container reference member to the
/// container passed in \p Container.
4193 TreeEntry(VecTreeTy &Container) : Container(Container) {}
4196 SmallVector<int> getCommonMask()
const {
4197 if (State == TreeEntry::SplitVectorize)
4199 SmallVector<int>
Mask;
4206 SmallVector<int> getSplitMask()
const {
4207 assert(State == TreeEntry::SplitVectorize && !ReorderIndices.empty() &&
4208 "Expected only split vectorize node.");
4209 unsigned CommonVF = std::max<unsigned>(
4210 CombinedEntriesWithIndices.back().second,
4211 Scalars.size() - CombinedEntriesWithIndices.back().second);
4212 const unsigned Scale =
getNumElements(Scalars.front()->getType());
4215 for (
auto [Idx,
I] :
enumerate(ReorderIndices)) {
4219 (Idx >= CombinedEntriesWithIndices.back().second
4220 ? CommonVF - CombinedEntriesWithIndices.back().second * Scale
4229 void reorderSplitNode(
unsigned Idx, ArrayRef<int> Mask,
4230 ArrayRef<int> MaskOrder);
4235 if (
Mask.size() != VL.
size() && VL.
size() == Scalars.size())
4236 return std::equal(VL.
begin(), VL.
end(), Scalars.begin());
4239 [Scalars](
Value *V,
int Idx) {
4240 return (isa<UndefValue>(V) &&
4241 Idx == PoisonMaskElem) ||
4242 (Idx != PoisonMaskElem && V == Scalars[Idx]);
4245 if (!ReorderIndices.empty()) {
4249 SmallVector<int>
Mask;
4251 if (VL.
size() == Scalars.size())
4252 return IsSame(Scalars, Mask);
4253 if (VL.
size() == ReuseShuffleIndices.size()) {
4255 return IsSame(Scalars, Mask);
4259 return IsSame(Scalars, ReuseShuffleIndices);
4263 bool hasEqualOperands(
const TreeEntry &TE)
const {
4264 if (
TE.getNumOperands() != getNumOperands())
4266 SmallBitVector
Used(getNumOperands());
4267 for (
unsigned I = 0,
E = getNumOperands();
I <
E; ++
I) {
4268 unsigned PrevCount =
Used.count();
4269 for (
unsigned K = 0;
K <
E; ++
K) {
4272 if (getOperand(K) ==
TE.getOperand(
I)) {
4278 if (PrevCount ==
Used.count())
/// \returns the vector factor of this entry: the size of ReuseShuffleIndices
/// when that list is non-empty, otherwise the number of Scalars.
4287 unsigned getVectorFactor()
const {
4288 if (!ReuseShuffleIndices.empty())
4289 return ReuseShuffleIndices.size();
4290 return Scalars.size();
/// \returns true if this entry's State is NeedToGather.
4294 bool isGather()
const {
return State == NeedToGather; }
4300 WeakTrackingVH VectorizedValue =
nullptr;
4321 enum CombinedOpcode {
4323 MinMax = Instruction::OtherOpsEnd + 1,
4326 ReducedBitcastBSwap,
4327 ReducedBitcastLoads,
4328 ReducedBitcastBSwapLoads,
4331 CombinedOpcode CombinedOp = NotCombinedOp;
4334 SmallVector<int, 4> ReuseShuffleIndices;
4337 SmallVector<unsigned, 4> ReorderIndices;
4345 VecTreeTy &Container;
4348 EdgeInfo UserTreeIndex;
4361 SmallVector<unsigned, 1> StructEVIndices;
4367 SmallVector<ValueList, 2> Operands;
4370 SmallPtrSet<const Value *, 4> CopyableElements;
4374 InstructionsState S = InstructionsState::invalid();
4377 unsigned InterleaveFactor = 0;
4380 bool DoesNotNeedToSchedule =
false;
4384 if (Operands.size() <
OpIdx + 1)
4385 Operands.resize(
OpIdx + 1);
4388 "Number of operands is greater than the number of scalars.");
4394 mutable SmallDenseMap<Value *, unsigned> ValueToLane;
/// \returns the stored InterleaveFactor of this entry.
4398 unsigned getInterleaveFactor()
const {
return InterleaveFactor; }
/// Stores \p Factor as this entry's InterleaveFactor.
4400 void setInterleave(
unsigned Factor) { InterleaveFactor = Factor; }
/// Sets the DoesNotNeedToSchedule flag of this entry to true. There is no
/// counterpart here that resets it.
4403 void setDoesNotNeedToSchedule() { DoesNotNeedToSchedule =
true; }
/// \returns the current value of the DoesNotNeedToSchedule flag.
4406 bool doesNotNeedToSchedule()
const {
return DoesNotNeedToSchedule; }
4411 setOperand(
I, Operands[
I]);
4415 void reorderOperands(ArrayRef<int> Mask) {
4423 return Operands[
OpIdx];
4429 return Operands[
OpIdx];
/// \returns the number of operand lists currently stored in Operands.
4433 unsigned getNumOperands()
const {
return Operands.size(); }
4436 Value *getSingleOperand(
unsigned OpIdx)
const {
4439 return Operands[
OpIdx][0];
/// \returns the result of isAltShuffle() on this entry's InstructionsState S.
4443 bool isAltShuffle()
const {
return S.isAltShuffle(); }
/// Forwards \p I to getMatchingMainOpOrAltOp() of this entry's
/// InstructionsState S and returns its result unchanged.
4445 Instruction *getMatchingMainOpOrAltOp(Instruction *
I)
const {
4446 return S.getMatchingMainOpOrAltOp(
I);
4454 if (
I && getMatchingMainOpOrAltOp(
I))
4456 return S.getMainOp();
4459 void setOperations(
const InstructionsState &S) {
4460 assert(S &&
"InstructionsState is invalid.");
/// \returns the main operation recorded in this entry's InstructionsState S.
4464 Instruction *getMainOp()
const {
return S.getMainOp(); }
/// \returns the alternate operation recorded in this entry's
/// InstructionsState S.
4466 Instruction *getAltOp()
const {
return S.getAltOp(); }
/// \returns the opcode reported by this entry's InstructionsState S.
4469 unsigned getOpcode()
const {
return S.getOpcode(); }
/// \returns the alternate opcode reported by this entry's
/// InstructionsState S.
4471 unsigned getAltOpcode()
const {
return S.getAltOpcode(); }
/// \returns true if this entry's InstructionsState S reports itself valid.
4473 bool hasState()
const {
return S.valid(); }
/// Records \p V in this entry's CopyableElements set. Asserts that the
/// InstructionsState S classifies \p V as a copyable element.
4476 void addCopyableElement(
Value *V) {
4477 assert(S.isCopyableElement(V) &&
"Not a copyable element.");
4478 CopyableElements.insert(V);
/// \returns true if \p V was recorded in this entry's CopyableElements set.
/// Only the per-entry set is consulted here, not the InstructionsState.
4482 bool isCopyableElement(
Value *V)
const {
4483 return CopyableElements.contains(V);
4488 bool isExpandedBinOp(
Value *V)
const {
4489 assert(hasState() &&
"InstructionsState is invalid.");
4490 if (isCopyableElement(V))
4492 return S.isExpandedBinOp(V);
4497 bool isExpandedOperand(Instruction *
I,
unsigned Idx)
const {
4498 assert(hasState() &&
"InstructionsState is invalid.");
4499 if (isCopyableElement(
I))
4501 if (!isExpandedBinOp(
I))
4503 return S.isExpandedOperand(
I, Idx);
/// \returns true if at least one value is recorded in CopyableElements.
4507 bool hasCopyableElements()
const {
return !CopyableElements.empty(); }
/// \returns a read-only reference to this entry's InstructionsState S.
4510 const InstructionsState &getOperations()
const {
return S; }
4514 unsigned findLaneForValue(
Value *V)
const {
4515 auto Res = ValueToLane.try_emplace(V, getVectorFactor());
4517 return Res.first->second;
4518 unsigned &FoundLane = Res.first->getSecond();
4519 for (
auto *It =
find(Scalars, V), *End = Scalars.end(); It != End;
4520 std::advance(It, 1)) {
4523 FoundLane = std::distance(Scalars.begin(), It);
4524 assert(FoundLane < Scalars.size() &&
"Couldn't find extract lane");
4525 if (!ReorderIndices.empty())
4526 FoundLane = ReorderIndices[FoundLane];
4527 assert(FoundLane < Scalars.size() &&
"Couldn't find extract lane");
4528 if (ReuseShuffleIndices.empty())
4530 if (
auto *RIt =
find(ReuseShuffleIndices, FoundLane);
4531 RIt != ReuseShuffleIndices.end()) {
4532 FoundLane = std::distance(ReuseShuffleIndices.begin(), RIt);
4536 assert(FoundLane < getVectorFactor() &&
"Unable to find given value.");
4543 buildAltOpShuffleMask(
const function_ref<
bool(Instruction *)> IsAltOp,
4544 SmallVectorImpl<int> &Mask,
4545 SmallVectorImpl<Value *> *OpScalars =
nullptr,
4546 SmallVectorImpl<Value *> *AltScalars =
nullptr)
const;
4549 bool isNonPowOf2Vec()
const {
4551 return IsNonPowerOf2;
4554 Value *getOrdered(
unsigned Idx)
const {
4555 if (ReorderIndices.empty())
4556 return Scalars[Idx];
4557 SmallVector<int>
Mask;
4559 return Scalars[
Mask[Idx]];
4565 dbgs() << Idx <<
".\n";
4566 for (
unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
4567 dbgs() <<
"Operand " << OpI <<
":\n";
4568 for (
const Value *V : Operands[OpI])
4571 dbgs() <<
"Scalars: \n";
4572 for (
Value *V : Scalars) {
4574 << ((S && S.isExpandedBinOp(V)) ?
" [[Expanded]]\n"
4577 dbgs() <<
"State: ";
4578 if (S && hasCopyableElements())
4579 dbgs() <<
"[[Copyable]] ";
4582 if (InterleaveFactor > 0) {
4583 dbgs() <<
"Vectorize with interleave factor " << InterleaveFactor
4586 dbgs() <<
"Vectorize\n";
4589 case ScatterVectorize:
4590 dbgs() <<
"ScatterVectorize\n";
4592 case StridedVectorize:
4593 dbgs() <<
"StridedVectorize\n";
4595 case CompressVectorize:
4596 dbgs() <<
"CompressVectorize\n";
4599 dbgs() <<
"NeedToGather\n";
4601 case CombinedVectorize:
4602 dbgs() <<
"CombinedVectorize\n";
4604 case SplitVectorize:
4605 dbgs() <<
"SplitVectorize\n";
4609 dbgs() <<
"MainOp: " << *S.getMainOp() <<
"\n";
4610 dbgs() <<
"AltOp: " << *S.getAltOp() <<
"\n";
4612 dbgs() <<
"MainOp: NULL\n";
4613 dbgs() <<
"AltOp: NULL\n";
4615 dbgs() <<
"VectorizedValue: ";
4616 if (VectorizedValue)
4617 dbgs() << *VectorizedValue <<
"\n";
4620 dbgs() <<
"ReuseShuffleIndices: ";
4621 if (ReuseShuffleIndices.empty())
4624 for (
int ReuseIdx : ReuseShuffleIndices)
4625 dbgs() << ReuseIdx <<
", ";
4627 dbgs() <<
"ReorderIndices: ";
4628 for (
unsigned ReorderIdx : ReorderIndices)
4629 dbgs() << ReorderIdx <<
", ";
4631 dbgs() <<
"UserTreeIndex: ";
4633 dbgs() << UserTreeIndex;
4635 dbgs() <<
"<invalid>";
4637 if (!StructEVIndices.empty()) {
4638 dbgs() <<
"StructEVIndices: ";
4642 if (!CombinedEntriesWithIndices.empty()) {
4643 dbgs() <<
"Combined entries: ";
4645 dbgs() <<
"Entry index " <<
P.first <<
" with offset " <<
P.second;
4656 StringRef Banner)
const {
4657 dbgs() <<
"SLP: " << Banner <<
":\n";
4659 dbgs() <<
"SLP: Costs:\n";
4660 dbgs() <<
"SLP: ReuseShuffleCost = " << ReuseShuffleCost <<
"\n";
4661 dbgs() <<
"SLP: VectorCost = " << VecCost <<
"\n";
4662 dbgs() <<
"SLP: ScalarCost = " << ScalarCost <<
"\n";
4663 dbgs() <<
"SLP: ReuseShuffleCost + VecCost - ScalarCost = "
4664 << ReuseShuffleCost + VecCost - ScalarCost <<
"\n";
4670 const InstructionsState &S,
4672 ArrayRef<int> ReuseShuffleIndices = {}) {
4673 auto Invalid = ScheduleBundle::invalid();
4674 return newTreeEntry(VL,
Invalid, S, UserTreeIdx, ReuseShuffleIndices);
4679 const InstructionsState &S,
4681 ArrayRef<int> ReuseShuffleIndices = {},
4682 ArrayRef<unsigned> ReorderIndices = {},
4683 unsigned InterleaveFactor = 0) {
4684 TreeEntry::EntryState EntryState =
4685 Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
4686 TreeEntry *
E = newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
4687 ReuseShuffleIndices, ReorderIndices);
4688 if (
E && InterleaveFactor > 0)
4689 E->setInterleave(InterleaveFactor);
4694 TreeEntry::EntryState EntryState,
4695 ScheduleBundle &Bundle,
const InstructionsState &S,
4697 ArrayRef<int> ReuseShuffleIndices = {},
4698 ArrayRef<unsigned> ReorderIndices = {}) {
4699 assert(((!Bundle && (EntryState == TreeEntry::NeedToGather ||
4700 EntryState == TreeEntry::SplitVectorize)) ||
4701 (Bundle && EntryState != TreeEntry::NeedToGather &&
4702 EntryState != TreeEntry::SplitVectorize)) &&
4703 "Need to vectorize gather entry?");
4705 if (GatheredLoadsEntriesFirst.has_value() &&
4706 EntryState == TreeEntry::NeedToGather && S &&
4707 S.getOpcode() == Instruction::Load && UserTreeIdx.EdgeIdx == UINT_MAX &&
4708 !UserTreeIdx.UserTE)
4710 VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
4711 TreeEntry *
Last = VectorizableTree.back().get();
4712 Last->Idx = VectorizableTree.size() - 1;
4713 Last->State = EntryState;
4714 if (UserTreeIdx.UserTE)
4715 OperandsToTreeEntry.try_emplace(
4716 std::make_pair(UserTreeIdx.UserTE, UserTreeIdx.EdgeIdx),
Last);
4717 Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
4718 ReuseShuffleIndices.end());
4719 if (ReorderIndices.
empty()) {
4722 Last->setOperations(S);
4725 Last->Scalars.assign(VL.
size(),
nullptr);
4727 [VL](
unsigned Idx) ->
Value * {
4728 if (Idx >= VL.size())
4729 return UndefValue::get(VL.front()->getType());
4734 Last->setOperations(S);
4735 Last->ReorderIndices.append(ReorderIndices.
begin(), ReorderIndices.
end());
4737 if (EntryState == TreeEntry::SplitVectorize) {
4738 assert(S &&
"Split nodes must have operations.");
4739 Last->setOperations(S);
4740 SmallPtrSet<Value *, 4> Processed;
4741 for (
Value *V : VL) {
4745 auto It = ScalarsInSplitNodes.find(V);
4746 if (It == ScalarsInSplitNodes.end()) {
4747 ScalarsInSplitNodes.try_emplace(V).first->getSecond().push_back(
Last);
4748 (void)Processed.
insert(V);
4749 }
else if (Processed.
insert(V).second) {
4751 "Value already associated with the node.");
4752 It->getSecond().push_back(
Last);
4755 }
else if (!
Last->isGather()) {
4758 (!S.areInstructionsWithCopyableElements() &&
4760 all_of(VL, [&](
Value *V) {
return S.isNonSchedulable(V); }))
4761 Last->setDoesNotNeedToSchedule();
4762 SmallPtrSet<Value *, 4> Processed;
4763 for (
Value *V : VL) {
4766 if (S.isCopyableElement(V)) {
4767 Last->addCopyableElement(V);
4770 auto It = ScalarToTreeEntries.find(V);
4771 if (It == ScalarToTreeEntries.end()) {
4772 ScalarToTreeEntries.try_emplace(V).first->getSecond().push_back(
Last);
4773 (void)Processed.
insert(V);
4774 }
else if (Processed.
insert(V).second) {
4776 "Value already associated with the node.");
4777 It->getSecond().push_back(
Last);
4781 assert((!Bundle.getBundle().empty() ||
Last->doesNotNeedToSchedule()) &&
4782 "Bundle and VL out of sync");
4783 if (!Bundle.getBundle().empty()) {
4784#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
4785 auto *BundleMember = Bundle.getBundle().begin();
4786 SmallPtrSet<Value *, 4> Processed;
4787 for (
Value *V : VL) {
4788 if (S.isNonSchedulable(V) || !Processed.
insert(V).second)
4792 assert(BundleMember == Bundle.getBundle().end() &&
4793 "Bundle and VL out of sync");
4795 Bundle.setTreeEntry(
Last);
4799 bool AllConstsOrCasts =
true;
4800 for (
Value *V : VL) {
4801 if (S && S.areInstructionsWithCopyableElements() &&
4802 S.isCopyableElement(V))
4803 Last->addCopyableElement(V);
4806 AllConstsOrCasts &=
I &&
I->getType()->isIntegerTy();
4807 if (UserTreeIdx.EdgeIdx != UINT_MAX || !UserTreeIdx.UserTE ||
4808 !UserTreeIdx.UserTE->isGather())
4809 ValueToGatherNodes.try_emplace(V).first->getSecond().insert(
Last);
4812 if (AllConstsOrCasts)
4814 std::make_pair(std::numeric_limits<unsigned>::max(), 1);
4815 MustGather.insert_range(VL);
4818 if (UserTreeIdx.UserTE)
4819 Last->UserTreeIndex = UserTreeIdx;
4825 TreeEntry::VecTreeTy VectorizableTree;
4830 for (
unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
4831 VectorizableTree[
Id]->dump();
4832 if (TransformedToGatherNodes.contains(VectorizableTree[Id].get()))
4833 dbgs() <<
"[[TRANSFORMED TO GATHER]]";
4834 else if (DeletedNodes.contains(VectorizableTree[Id].get()))
4835 dbgs() <<
"[[DELETED NODE]]";
4843 assert(V &&
"V cannot be nullptr.");
4844 auto It = ScalarToTreeEntries.find(V);
4845 if (It == ScalarToTreeEntries.end())
4847 return It->getSecond();
4852 assert(V &&
"V cannot be nullptr.");
4853 auto It = ScalarsInSplitNodes.find(V);
4854 if (It == ScalarsInSplitNodes.end())
4856 return It->getSecond();
4861 bool SameVF =
false)
const {
4862 assert(V &&
"V cannot be nullptr.");
4863 for (TreeEntry *TE : ScalarToTreeEntries.lookup(V))
4864 if ((!SameVF ||
TE->getVectorFactor() == VL.
size()) &&
TE->isSame(VL))
4871 class ScalarsVectorizationLegality {
4872 InstructionsState S;
4874 bool TryToFindDuplicates;
4875 bool TrySplitVectorize;
4878 ScalarsVectorizationLegality(InstructionsState S,
bool IsLegal,
4879 bool TryToFindDuplicates =
true,
4880 bool TrySplitVectorize =
false)
4881 : S(S), IsLegal(IsLegal), TryToFindDuplicates(TryToFindDuplicates),
4882 TrySplitVectorize(TrySplitVectorize) {
4883 assert((!IsLegal || (S.valid() && TryToFindDuplicates)) &&
4884 "Inconsistent state");
/// \returns a read-only reference to the InstructionsState S stored in this
/// legality result.
4886 const InstructionsState &getInstructionsState()
const {
return S; };
/// \returns the stored IsLegal flag.
4887 bool isLegal()
const {
return IsLegal; }
/// \returns the stored TryToFindDuplicates flag.
4888 bool tryToFindDuplicates()
const {
return TryToFindDuplicates; }
/// \returns the stored TrySplitVectorize flag.
4889 bool trySplitVectorize()
const {
return TrySplitVectorize; }
4894 ScalarsVectorizationLegality
4896 const EdgeInfo &UserTreeIdx)
const;
4900 TreeEntry::EntryState getScalarsVectorizationState(
4902 bool IsScatterVectorizeUserTE,
OrdersType &CurrentOrder,
4906 SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarToTreeEntries;
4909 SmallPtrSet<const TreeEntry *, 8> DeletedNodes;
4913 SmallDenseMap<const TreeEntry *, InstructionCost> TransformedToGatherNodes;
4916 SmallDenseMap<std::pair<const TreeEntry *, unsigned>, TreeEntry *>
4917 OperandsToTreeEntry;
4920 SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarsInSplitNodes;
4923 SmallDenseMap<Value *, unsigned> InstrElementSize;
4937 SmallDenseMap<const TreeEntry *, WeakTrackingVH> EntryToLastInstruction;
4941 SmallDenseMap<const Instruction *, Instruction *> LastInstructionToPos;
4946 SetVector<const TreeEntry *> PostponedGathers;
4948 using ValueToGatherNodesMap =
4949 DenseMap<Value *, SmallSetVector<const TreeEntry *, 4>>;
4950 ValueToGatherNodesMap ValueToGatherNodes;
4952 SmallDenseMap<TreeEntry *, StridedPtrInfo> TreeEntryToStridedPtrInfoMap;
4957 SetVector<unsigned> LoadEntriesToVectorize;
4960 bool IsGraphTransformMode =
false;
4963 std::optional<unsigned> GatheredLoadsEntriesFirst;
4966 SmallDenseMap<
const TreeEntry *,
4967 std::tuple<SmallVector<int>,
VectorType *, unsigned,
bool>>
4968 CompressEntryToData;
4972 SmallVector<const Loop *> CurrentLoopNest;
4976 SmallVector<const SCEV *> MergedLoopBTCs;
4979 SmallDenseMap<const Loop *, SmallVector<const Loop *>> LoopToLoopNest;
4984 SmallDenseMap<const Loop *, uint64_t> LoopNestScaleCache;
4987 struct ExternalUser {
/// Constructs an external-use record, initializing Scalar from \p S, User
/// from \p U, E from \p E and Lane from \p L.
4988 ExternalUser(
Value *S, llvm::User *U,
const TreeEntry &E,
unsigned L)
4989 : Scalar(S), User(
U), E(E), Lane(
L) {}
4992 Value *Scalar =
nullptr;
4995 llvm::User *User =
nullptr;
5003 using UserList = SmallVector<ExternalUser, 16>;
5009 bool isAliased(
const MemoryLocation &Loc1, Instruction *Inst1,
5010 Instruction *Inst2) {
5013 AliasCacheKey
Key = std::make_pair(Inst1, Inst2);
5014 auto Res = AliasCache.try_emplace(
Key);
5016 return Res.first->second;
5017 bool Aliased =
isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1));
5019 Res.first->getSecond() = Aliased;
5023 using AliasCacheKey = std::pair<Instruction *, Instruction *>;
5027 SmallDenseMap<AliasCacheKey, bool> AliasCache;
5032 BatchAAResults BatchAA;
5039 DenseSet<Instruction *> DeletedInstructions;
5042 SmallPtrSet<Instruction *, 16> AnalyzedReductionsRoots;
5045 DenseSet<size_t> AnalyzedReductionVals;
5049 DenseSet<Value *> AnalyzedMinBWVals;
5055 UserList ExternalUses;
5059 SmallPtrSet<Value *, 4> ExternalUsesAsOriginalScalar;
5063 SmallPtrSet<Value *, 4> ExternalUsesWithNonUsers;
5066 SmallPtrSet<const Value *, 32> EphValues;
5070 SetVector<Instruction *> GatherShuffleExtractSeq;
5073 DenseSet<BasicBlock *> CSEBlocks;
5076 DenseSet<size_t> ListOfKnonwnNonVectorizableLoads;
5083 class ScheduleEntity {
5084 friend class ScheduleBundle;
5085 friend class ScheduleData;
5086 friend class ScheduleCopyableData;
5089 enum class Kind { ScheduleData, ScheduleBundle, ScheduleCopyableData };
/// \returns the Kind tag K stored in this entity.
5090 Kind getKind()
const {
return K; }
/// Constructs an entity tagged with the given kind \p K.
5091 ScheduleEntity(Kind K) : K(K) {}
5095 int SchedulingPriority = 0;
5098 bool IsScheduled =
false;
5100 const Kind K = Kind::ScheduleData;
5103 ScheduleEntity() =
delete;
/// Stores \p Priority as this entity's SchedulingPriority.
5105 void setSchedulingPriority(
int Priority) { SchedulingPriority = Priority; }
5106 int getSchedulingPriority()
const {
return SchedulingPriority; }
5107 bool isReady()
const {
5109 return SD->isReady();
5111 return CD->isReady();
5117 bool hasValidDependencies()
const {
5119 return SD->hasValidDependencies();
5121 return CD->hasValidDependencies();
5125 int getUnscheduledDeps()
const {
5127 return SD->getUnscheduledDeps();
5129 return CD->getUnscheduledDeps();
5133 int incrementUnscheduledDeps(
int Incr) {
5135 return SD->incrementUnscheduledDeps(Incr);
5139 int getDependencies()
const {
5141 return SD->getDependencies();
5147 return SD->getInst();
/// Returns the current value of the IsScheduled flag.
5152 bool isScheduled()
const {
return IsScheduled; }
/// Sets the IsScheduled flag to \p Scheduled.
5153 void setScheduled(
bool Scheduled) { IsScheduled = Scheduled; }
/// Accepts every ScheduleEntity pointer unconditionally. The derived classes
/// provide their own classof() that compares getKind() with their tag.
/// NOTE(review): presumably consumed by isa<>/dyn_cast<> - confirm at the
/// call sites.
5155 static bool classof(
const ScheduleEntity *) {
return true; }
5157#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
5158 void dump(raw_ostream &OS)
const {
5160 return SD->dump(OS);
5162 return CD->dump(OS);
5173#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
5175 const BoUpSLP::ScheduleEntity &SE) {
5185 class ScheduleData final :
public ScheduleEntity {
5189 enum { InvalidDeps = -1 };
5191 ScheduleData() : ScheduleEntity(Kind::ScheduleData) {}
5192 static bool classof(
const ScheduleEntity *Entity) {
5193 return Entity->getKind() == Kind::ScheduleData;
5196 void init(
int BlockSchedulingRegionID, Instruction *
I) {
5197 NextLoadStore =
nullptr;
5198 IsScheduled =
false;
5199 SchedulingRegionID = BlockSchedulingRegionID;
5200 clearDependencies();
5206 if (hasValidDependencies()) {
5207 assert(UnscheduledDeps <= Dependencies &&
"invariant");
5209 assert(UnscheduledDeps == Dependencies &&
"invariant");
5213 assert(hasValidDependencies() && UnscheduledDeps == 0 &&
5214 "unexpected scheduled state");
/// Returns true when Dependencies differs from the InvalidDeps sentinel (-1),
/// i.e. when a dependency count has been established for this node.
5221 bool hasValidDependencies()
const {
return Dependencies != InvalidDeps; }
/// Returns true when no unscheduled dependencies remain (UnscheduledDeps is
/// exactly 0) and the node has not been marked as scheduled yet.
5225 bool isReady()
const {
return UnscheduledDeps == 0 && !IsScheduled; }
5230 int incrementUnscheduledDeps(
int Incr) {
5231 assert(hasValidDependencies() &&
5232 "increment of unscheduled deps would be meaningless");
5233 UnscheduledDeps += Incr;
5234 assert(UnscheduledDeps >= 0 &&
5235 "Expected valid number of unscheduled deps");
5236 return UnscheduledDeps;
/// Copies Dependencies into UnscheduledDeps. When Dependencies currently holds
/// InvalidDeps (as in clearDirectDependencies()), the sentinel is copied too.
5241 void resetUnscheduledDeps() { UnscheduledDeps = Dependencies; }
5244 void clearDependencies() {
5245 clearDirectDependencies();
5246 MemoryDependencies.clear();
5247 ControlDependencies.clear();
5254 void clearDirectDependencies() {
5255 Dependencies = InvalidDeps;
5256 resetUnscheduledDeps();
5257 IsScheduled =
false;
/// Returns the current UnscheduledDeps counter (InvalidDeps when no
/// dependency count has been established).
5261 int getUnscheduledDeps()
const {
return UnscheduledDeps; }
/// Returns the Dependencies counter (InvalidDeps when not yet initialized).
5263 int getDependencies()
const {
return Dependencies; }
/// Sets Dependencies to 0, after which hasValidDependencies() reports true.
/// UnscheduledDeps is not touched here.
5265 void initDependencies() { Dependencies = 0; }
/// Adds one to Dependencies. UnscheduledDeps is not touched here.
5267 void incDependencies() { Dependencies++; }
/// Returns the scheduling region ID that init() stored for this node.
5270 int getSchedulingRegionID()
const {
return SchedulingRegionID; }
5277 return MemoryDependencies;
5280 void addMemoryDependency(ScheduleData *Dep) {
5281 MemoryDependencies.push_back(Dep);
5285 return ControlDependencies;
5288 void addControlDependency(ScheduleData *Dep) {
5289 ControlDependencies.push_back(Dep);
/// Returns the NextLoadStore link; it is nullptr by default and after init().
5292 ScheduleData *getNextLoadStore()
const {
return NextLoadStore; }
/// Replaces the NextLoadStore link with \p Next.
5293 void setNextLoadStore(ScheduleData *
Next) { NextLoadStore =
Next; }
/// Streams the instruction referenced by Inst to \p OS.
5295 void dump(raw_ostream &OS)
const { OS << *Inst; }
5307 ScheduleData *NextLoadStore =
nullptr;
5311 SmallVector<ScheduleData *> MemoryDependencies;
5317 SmallVector<ScheduleData *> ControlDependencies;
5321 int SchedulingRegionID = 0;
5327 int Dependencies = InvalidDeps;
5333 int UnscheduledDeps = InvalidDeps;
5338 const BoUpSLP::ScheduleData &SD) {
5344 class ScheduleBundle final :
public ScheduleEntity {
5348 bool IsValid =
true;
5350 TreeEntry *TE =
nullptr;
5351 ScheduleBundle(
bool IsValid)
5352 : ScheduleEntity(Kind::ScheduleBundle), IsValid(IsValid) {}
5355 ScheduleBundle() : ScheduleEntity(Kind::ScheduleBundle) {}
5356 static bool classof(
const ScheduleEntity *Entity) {
5357 return Entity->getKind() == Kind::ScheduleBundle;
5362 for (
const ScheduleEntity *SD : Bundle) {
5363 if (SD->hasValidDependencies()) {
5364 assert(SD->getUnscheduledDeps() <= SD->getDependencies() &&
5367 assert(SD->getUnscheduledDeps() == SD->getDependencies() &&
5371 if (isScheduled()) {
5372 assert(SD->hasValidDependencies() && SD->getUnscheduledDeps() == 0 &&
5373 "unexpected scheduled state");
5379 int unscheduledDepsInBundle()
const {
5380 assert(*
this &&
"bundle must not be empty");
5382 for (
const ScheduleEntity *BundleMember : Bundle) {
5383 if (BundleMember->getUnscheduledDeps() == ScheduleData::InvalidDeps)
5384 return ScheduleData::InvalidDeps;
5385 Sum += BundleMember->getUnscheduledDeps();
5393 bool hasValidDependencies()
const {
5394 return all_of(Bundle, [](
const ScheduleEntity *SD) {
5395 return SD->hasValidDependencies();
5401 bool isReady()
const {
5402 assert(*
this &&
"bundle must not be empty");
5403 return unscheduledDepsInBundle() == 0 && !isScheduled();
/// Appends \p SD to the end of this bundle's member list.
5411 void add(ScheduleEntity *SD) { Bundle.push_back(SD); }
/// Associates this bundle with the tree entry \p TE.
5414 void setTreeEntry(TreeEntry *TE) { this->TE = TE; }
/// Returns the associated tree entry, or nullptr when none has been set.
5415 TreeEntry *getTreeEntry()
const {
return TE; }
/// Creates a bundle through the bool constructor with IsValid == false, so
/// the result converts to false via operator bool().
5417 static ScheduleBundle invalid() {
return {
false}; }
/// Converts to the IsValid flag: true for default-constructed bundles, false
/// for the one produced by invalid().
5419 operator bool()
const {
return IsValid; }
5422 void dump(raw_ostream &OS)
const {
5431 OS << *SD->getInst();
5445 const BoUpSLP::ScheduleBundle &Bundle) {
5456 class ScheduleCopyableData final :
public ScheduleEntity {
5463 int SchedulingRegionID = 0;
5465 ScheduleBundle &Bundle;
5468 ScheduleCopyableData(
int BlockSchedulingRegionID,
Instruction *
I,
5469 const EdgeInfo &EI, ScheduleBundle &Bundle)
5470 : ScheduleEntity(Kind::ScheduleCopyableData), Inst(
I), EI(EI),
5471 SchedulingRegionID(BlockSchedulingRegionID), Bundle(Bundle) {}
5472 static bool classof(
const ScheduleEntity *Entity) {
5473 return Entity->getKind() == Kind::ScheduleCopyableData;
5478 if (hasValidDependencies()) {
5479 assert(UnscheduledDeps <= Dependencies &&
"invariant");
5481 assert(UnscheduledDeps == Dependencies &&
"invariant");
5485 assert(hasValidDependencies() && UnscheduledDeps == 0 &&
5486 "unexpected scheduled state");
5493 bool hasValidDependencies()
const {
5494 return Dependencies != ScheduleData::InvalidDeps;
/// Returns true when no unscheduled dependencies remain (UnscheduledDeps is
/// exactly 0) and this copyable entry has not been marked as scheduled yet.
5499 bool isReady()
const {
return UnscheduledDeps == 0 && !IsScheduled; }
5504 int incrementUnscheduledDeps(
int Incr) {
5505 assert(hasValidDependencies() &&
5506 "increment of unscheduled deps would be meaningless");
5507 UnscheduledDeps += Incr;
5508 assert(UnscheduledDeps >= 0 &&
"invariant");
5509 return UnscheduledDeps;
/// Copies Dependencies into UnscheduledDeps.
5514 void resetUnscheduledDeps() { UnscheduledDeps = Dependencies; }
/// Returns the current UnscheduledDeps counter (ScheduleData::InvalidDeps
/// when no dependency count has been established).
5517 int getUnscheduledDeps()
const {
return UnscheduledDeps; }
/// Returns the Dependencies counter (ScheduleData::InvalidDeps when not yet
/// initialized).
5519 int getDependencies()
const {
return Dependencies; }
/// Sets Dependencies to 0. UnscheduledDeps is not touched here.
5521 void initDependencies() { Dependencies = 0; }
/// Adds one to Dependencies. UnscheduledDeps is not touched here.
5523 void incDependencies() { Dependencies++; }
/// Returns the scheduling region ID passed to the constructor.
5526 int getSchedulingRegionID()
const {
return SchedulingRegionID; }
5532 void clearDependencies() {
5533 Dependencies = ScheduleData::InvalidDeps;
5534 UnscheduledDeps = ScheduleData::InvalidDeps;
5535 IsScheduled =
false;
/// Returns a read-only reference to the EdgeInfo stored at construction.
5539 const EdgeInfo &getEdgeInfo()
const {
return EI; }
/// Returns the bundle this entry was bound to at construction. Bundle is a
/// reference member, so the returned object is not owned by this entry.
5542 ScheduleBundle &getBundle() {
return Bundle; }
/// Const overload of getBundle(); returns the same bound bundle read-only.
5543 const ScheduleBundle &getBundle()
const {
return Bundle; }
5545#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
5546 void dump(raw_ostream &OS)
const { OS <<
"[Copyable]" << *getInst(); }
5557 int Dependencies = ScheduleData::InvalidDeps;
5563 int UnscheduledDeps = ScheduleData::InvalidDeps;
5593 struct BlockScheduling {
5595 : BB(BB), ChunkSize(BB->
size()), ChunkPos(ChunkSize) {}
5598 ScheduledBundles.clear();
5599 ScheduledBundlesList.
clear();
5600 ScheduleCopyableDataMap.clear();
5601 ScheduleCopyableDataMapByInst.clear();
5602 ScheduleCopyableDataMapByInstUser.clear();
5603 ScheduleCopyableDataMapByUsers.clear();
5605 RecalcCopyableOperandDeps.
clear();
5606 ScheduleStart =
nullptr;
5607 ScheduleEnd =
nullptr;
5608 FirstLoadStoreInRegion =
nullptr;
5609 LastLoadStoreInRegion =
nullptr;
5610 RegionHasStackSave =
false;
5614 ScheduleRegionSizeLimit -= ScheduleRegionSize;
5617 ScheduleRegionSize = 0;
5621 ++SchedulingRegionID;
5624 ScheduleData *getScheduleData(Instruction *
I) {
5627 if (BB !=
I->getParent())
5630 ScheduleData *SD = ScheduleDataMap.lookup(
I);
5631 if (SD && isInSchedulingRegion(*SD))
5636 ScheduleData *getScheduleData(
Value *V) {
5642 ScheduleCopyableData *getScheduleCopyableData(
const EdgeInfo &EI,
5643 const Value *V)
const {
5644 if (ScheduleCopyableDataMap.empty())
5646 auto It = ScheduleCopyableDataMap.find(std::make_pair(EI, V));
5647 if (It == ScheduleCopyableDataMap.end())
5649 ScheduleCopyableData *SD = It->getSecond().get();
5650 if (!isInSchedulingRegion(*SD))
5658 getScheduleCopyableData(
const Value *User,
unsigned OperandIdx,
5660 if (ScheduleCopyableDataMapByInstUser.empty())
5662 const auto It = ScheduleCopyableDataMapByInstUser.find(
5663 std::make_pair(std::make_pair(User, OperandIdx), V));
5664 if (It == ScheduleCopyableDataMapByInstUser.end())
5667 for (ScheduleCopyableData *SD : It->getSecond()) {
5668 if (isInSchedulingRegion(*SD))
5682 bool areAllOperandsReplacedByCopyableData(Instruction *User,
5686 if (ScheduleCopyableDataMap.empty())
5688 SmallDenseMap<TreeEntry *, unsigned> PotentiallyReorderedEntriesCount;
5690 if (Entries.
empty())
5692 unsigned CurNumOps = 0;
5693 for (
const Use &U :
User->operands()) {
5699 for (TreeEntry *TE : Entries) {
5701 bool IsNonSchedulableWithParentPhiNode =
5702 TE->doesNotNeedToSchedule() &&
TE->UserTreeIndex &&
5703 TE->UserTreeIndex.UserTE->hasState() &&
5704 TE->UserTreeIndex.UserTE->State != TreeEntry::SplitVectorize &&
5705 TE->UserTreeIndex.UserTE->getOpcode() == Instruction::PHI;
5708 if (IsNonSchedulableWithParentPhiNode) {
5709 SmallPtrSet<Value *, 4> ParentsUniqueUsers;
5710 const TreeEntry *ParentTE =
TE->UserTreeIndex.UserTE;
5711 for (
Value *V : ParentTE->Scalars) {
5715 if (ParentsUniqueUsers.
insert(
PHI).second &&
5720 Inc =
count(
TE->Scalars, User);
5728 bool IsCommutativeUser =
5731 if (!IsCommutativeUser) {
5741 (!IsCommutativeUser ||
5750 "Expected commutative user with 2 first commutable operands");
5751 bool IsCommutativeWithSameOps =
5752 IsCommutativeUser &&
User->getOperand(0) ==
User->getOperand(1);
5753 if ((!IsCommutativeUser || IsCommutativeWithSameOps) &&
5755 EdgeInfo EI(TE,
U.getOperandNo());
5756 if (CurNumOps !=
NumOps || getScheduleCopyableData(EI,
Op))
5760 PotentiallyReorderedEntriesCount.
try_emplace(TE, 0)
5761 .first->getSecond() += Inc;
5764 if (PotentiallyReorderedEntriesCount.
empty())
5767 for (
auto &
P : PotentiallyReorderedEntriesCount) {
5768 SmallPtrSet<Value *, 4> ParentsUniqueUsers;
5769 bool IsNonSchedulableWithParentPhiNode =
5770 P.first->doesNotNeedToSchedule() &&
P.first->UserTreeIndex &&
5771 P.first->UserTreeIndex.UserTE->hasState() &&
5772 P.first->UserTreeIndex.UserTE->State != TreeEntry::SplitVectorize &&
5773 P.first->UserTreeIndex.UserTE->getOpcode() == Instruction::PHI;
5774 auto *It =
find(
P.first->Scalars, User);
5776 assert(It !=
P.first->Scalars.end() &&
5777 "User is not in the tree entry");
5778 int Lane = std::distance(
P.first->Scalars.begin(), It);
5779 assert(Lane >= 0 &&
"Lane is not found");
5781 Lane =
P.first->ReorderIndices[Lane];
5782 assert(Lane <
static_cast<int>(
P.first->Scalars.size()) &&
5783 "Couldn't find extract lane");
5786 if (IsNonSchedulableWithParentPhiNode) {
5787 const TreeEntry *ParentTE =
P.first->UserTreeIndex.UserTE;
5789 if (!ParentsUniqueUsers.
insert(User).second) {
5795 for (
unsigned OpIdx :
5797 P.first->getMainOp()))) {
5798 if (
P.first->getOperand(
OpIdx)[Lane] ==
Op &&
5799 getScheduleCopyableData(EdgeInfo(
P.first,
OpIdx),
Op))
5804 }
while (It !=
P.first->Scalars.end());
5806 return all_of(PotentiallyReorderedEntriesCount,
5807 [&](
const std::pair<const TreeEntry *, unsigned> &
P) {
5808 return P.second ==
NumOps - 1;
5813 getScheduleCopyableData(
const Instruction *
I)
const {
5814 if (ScheduleCopyableDataMapByInst.empty())
5816 const auto It = ScheduleCopyableDataMapByInst.find(
I);
5817 if (It == ScheduleCopyableDataMapByInst.end())
5820 for (ScheduleCopyableData *SD : It->getSecond()) {
5821 if (isInSchedulingRegion(*SD))
5828 getScheduleCopyableDataUsers(
const Instruction *User)
const {
5829 if (ScheduleCopyableDataMapByUsers.empty())
5831 const auto It = ScheduleCopyableDataMapByUsers.find(User);
5832 if (It == ScheduleCopyableDataMapByUsers.end())
5835 for (ScheduleCopyableData *SD : It->getSecond()) {
5836 if (isInSchedulingRegion(*SD))
5842 ScheduleCopyableData &addScheduleCopyableData(
const EdgeInfo &EI,
5844 int SchedulingRegionID,
5845 ScheduleBundle &Bundle) {
5846 assert(!getScheduleCopyableData(EI,
I) &&
"already in the map");
5847 ScheduleCopyableData *CD =
5848 ScheduleCopyableDataMap
5849 .try_emplace(std::make_pair(EI,
I),
5850 std::make_unique<ScheduleCopyableData>(
5851 SchedulingRegionID,
I, EI, Bundle))
5854 ScheduleCopyableDataMapByInst[
I].push_back(CD);
5858 assert(It !=
Op.end() &&
"Lane not set");
5859 SmallPtrSet<Instruction *, 4> Visited;
5861 int Lane = std::distance(
Op.begin(), It);
5862 assert(Lane >= 0 &&
"Lane not set");
5864 !EI.UserTE->ReorderIndices.empty())
5865 Lane = EI.UserTE->ReorderIndices[Lane];
5866 assert(Lane <
static_cast<int>(EI.UserTE->Scalars.size()) &&
5867 "Couldn't find extract lane");
5869 if (!Visited.
insert(In).second) {
5873 ScheduleCopyableDataMapByInstUser
5874 .try_emplace(std::make_pair(std::make_pair(In, EI.EdgeIdx),
I))
5877 ScheduleCopyableDataMapByUsers.try_emplace(
I)
5884 EdgeInfo UserEI = EI.UserTE->UserTreeIndex;
5885 if (ScheduleCopyableData *UserCD =
5886 getScheduleCopyableData(UserEI, In))
5887 ScheduleCopyableDataMapByUsers[
I].remove(UserCD);
5890 }
while (It !=
Op.end());
5892 ScheduleCopyableDataMapByUsers.try_emplace(
I).first->getSecond().insert(
5902 auto It = ScheduledBundles.find(
I);
5903 if (It == ScheduledBundles.end())
5905 return It->getSecond();
5909 bool isInSchedulingRegion(
const ScheduleEntity &SD)
const {
5911 return Data->getSchedulingRegionID() == SchedulingRegionID;
5913 return CD->getSchedulingRegionID() == SchedulingRegionID;
5915 [&](
const ScheduleEntity *BundleMember) {
5916 return isInSchedulingRegion(*BundleMember);
5922 template <
typename ReadyListType>
5923 void schedule(
const BoUpSLP &R,
const InstructionsState &S,
5924 const EdgeInfo &EI, ScheduleEntity *
Data,
5925 ReadyListType &ReadyList) {
5926 auto ProcessBundleMember = [&](ScheduleEntity *BundleMember,
5931 auto DecrUnsched = [&](
auto *
Data,
bool IsControl =
false) {
5932 if ((IsControl ||
Data->hasValidDependencies()) &&
5933 Data->incrementUnscheduledDeps(-1) == 0) {
5940 CopyableBundle.
push_back(&CD->getBundle());
5941 Bundles = CopyableBundle;
5943 Bundles = getScheduleBundles(
Data->getInst());
5945 if (!Bundles.
empty()) {
5946 for (ScheduleBundle *Bundle : Bundles) {
5947 if (Bundle->unscheduledDepsInBundle() == 0) {
5948 assert(!Bundle->isScheduled() &&
5949 "already scheduled bundle gets ready");
5950 ReadyList.insert(Bundle);
5952 <<
"SLP: gets ready: " << *Bundle <<
"\n");
5958 "already scheduled bundle gets ready");
5960 "Expected non-copyable data");
5961 ReadyList.insert(
Data);
5968 if (!ScheduleCopyableDataMap.empty()) {
5970 getScheduleCopyableData(User,
OpIdx,
I);
5971 for (ScheduleCopyableData *CD : CopyableData)
5972 DecrUnsched(CD,
false);
5973 if (!CopyableData.empty())
5976 if (ScheduleData *OpSD = getScheduleData(
I))
5977 DecrUnsched(OpSD,
false);
5983 if (!Bundles.empty()) {
5984 auto *
In = BundleMember->getInst();
5986 SmallDenseMap<const Instruction *, unsigned> OperandsUses;
5987 unsigned TotalOpCount = 0;
5990 TotalOpCount = OperandsUses[
In] = 1;
5992 for (
const Use &U :
In->operands()) {
5995 unsigned ExtraDeps = 1;
5997 for (ScheduleBundle *Bundle : Bundles) {
5998 if (
const TreeEntry *TE = Bundle->getTreeEntry()) {
5999 if (
TE->isExpandedBinOp(In))
6001 }
else if (S.isExpandedBinOp(In)) {
6005 Res.first->getSecond() += ExtraDeps;
6006 TotalOpCount += ExtraDeps;
6012 auto DecrUnschedForInst =
6014 SmallDenseSet<std::pair<const ScheduleEntity *, unsigned>>
6016 bool IsExpandedOperand =
false) {
6017 if (!ScheduleCopyableDataMap.empty()) {
6018 const EdgeInfo EI = {UserTE,
OpIdx};
6019 if (ScheduleCopyableData *CD =
6020 getScheduleCopyableData(EI,
I)) {
6021 if (!Checked.insert(std::make_pair(CD,
OpIdx)).second)
6023 DecrUnsched(CD,
false);
6027 auto It = OperandsUses.
find(
I);
6028 assert(It != OperandsUses.
end() &&
"Operand not found");
6029 if (It->second > 0) {
6030 if (ScheduleData *OpSD = getScheduleData(
I)) {
6031 if (!IsExpandedOperand &&
6032 !Checked.insert(std::make_pair(OpSD,
OpIdx)).second)
6035 assert(TotalOpCount > 0 &&
"No more operands to decrement");
6037 DecrUnsched(OpSD,
false);
6040 assert(TotalOpCount > 0 &&
"No more operands to decrement");
6046 SmallDenseSet<std::pair<const ScheduleEntity *, unsigned>> Checked;
6047 for (ScheduleBundle *Bundle : Bundles) {
6048 if (ScheduleCopyableDataMap.empty() && TotalOpCount == 0)
6050 SmallPtrSet<Value *, 4> ParentsUniqueUsers;
6053 auto *It =
find(Bundle->getTreeEntry()->Scalars, In);
6054 bool IsNonSchedulableWithParentPhiNode =
6055 Bundle->getTreeEntry()->doesNotNeedToSchedule() &&
6056 Bundle->getTreeEntry()->UserTreeIndex &&
6057 Bundle->getTreeEntry()->UserTreeIndex.UserTE->hasState() &&
6058 Bundle->getTreeEntry()->UserTreeIndex.UserTE->State !=
6059 TreeEntry::SplitVectorize &&
6060 Bundle->getTreeEntry()->UserTreeIndex.UserTE->getOpcode() ==
6064 std::distance(Bundle->getTreeEntry()->Scalars.begin(), It);
6065 assert(Lane >= 0 &&
"Lane not set");
6067 !Bundle->getTreeEntry()->ReorderIndices.empty())
6068 Lane = Bundle->getTreeEntry()->ReorderIndices[Lane];
6069 assert(Lane <
static_cast<int>(
6070 Bundle->getTreeEntry()->Scalars.size()) &&
6071 "Couldn't find extract lane");
6082 In->getNumOperands() ==
6083 Bundle->getTreeEntry()->getNumOperands() ||
6084 (
isa<ZExtInst>(In) && Bundle->getTreeEntry()->getOpcode() ==
6085 Instruction::Select) ||
6086 Bundle->getTreeEntry()->isCopyableElement(In)) &&
6087 "Missed TreeEntry operands?");
6091 if (IsNonSchedulableWithParentPhiNode) {
6092 const TreeEntry *ParentTE =
6093 Bundle->getTreeEntry()->UserTreeIndex.UserTE;
6095 if (!ParentsUniqueUsers.
insert(User).second) {
6096 It = std::find(std::next(It),
6097 Bundle->getTreeEntry()->Scalars.end(), In);
6102 for (
unsigned OpIdx :
6105 Bundle->getTreeEntry()->getOperand(
OpIdx)[Lane])) {
6109 I, Bundle->getTreeEntry(),
OpIdx, Checked,
6110 Bundle->getTreeEntry()->isExpandedOperand(In,
OpIdx));
6113 if (Bundle->getTreeEntry()->isCopyableElement(In))
6115 It = std::find(std::next(It),
6116 Bundle->getTreeEntry()->Scalars.end(), In);
6117 }
while (It != Bundle->getTreeEntry()->Scalars.end());
6122 for (Use &U : BundleMember->getInst()->operands()) {
6125 <<
"SLP: check for readiness (def): " << *
I <<
"\n");
6126 DecrUnschedForInst(BundleMember->getInst(),
U.getOperandNo(),
I);
6134 SmallPtrSet<const ScheduleData *, 4> VisitedMemory;
6135 for (ScheduleData *MemoryDep : SD->getMemoryDependencies()) {
6136 if (!VisitedMemory.
insert(MemoryDep).second)
6141 << *MemoryDep <<
"\n");
6142 DecrUnsched(MemoryDep);
6145 SmallPtrSet<const ScheduleData *, 4> VisitedControl;
6146 for (ScheduleData *Dep : SD->getControlDependencies()) {
6147 if (!VisitedControl.
insert(Dep).second)
6152 <<
"SLP: check for readiness (ctrl): " << *Dep <<
"\n");
6153 DecrUnsched(Dep,
true);
6157 SD->setScheduled(
true);
6163 if (!Entries.
empty()) {
6164 for (TreeEntry *TE : Entries) {
6166 In->getNumOperands() !=
TE->getNumOperands())
6169 PseudoBundles.
emplace_back(std::make_unique<ScheduleBundle>());
6170 BundlePtr->setTreeEntry(TE);
6175 ProcessBundleMember(SD, Bundles);
6178 Bundle.setScheduled(
true);
6180 auto AreAllBundlesScheduled =
6181 [&](
const ScheduleEntity *SD,
6185 return !SDBundles.empty() &&
6186 all_of(SDBundles, [&](
const ScheduleBundle *SDBundle) {
6187 return SDBundle->isScheduled();
6190 for (ScheduleEntity *SD : Bundle.getBundle()) {
6193 SDBundles = getScheduleBundles(SD->getInst());
6194 if (!AreAllBundlesScheduled(SD, SDBundles))
6196 SD->setScheduled(
true);
6198 ScheduleCopyableDataMap.empty()) {
6219 for (TreeEntry *TE :
R.getTreeEntries(In)) {
6220 if (
TE->isCopyableElement(In))
6223 In->getNumOperands() !=
TE->getNumOperands())
6225 if (
any_of(SDBundles, [&](
const ScheduleBundle *SDBundle) {
6226 return SDBundle->getTreeEntry() ==
TE;
6229 ScheduleBundle &PseudoBundle =
6230 *PseudoBundles.
emplace_back(std::make_unique<ScheduleBundle>());
6231 PseudoBundle.setTreeEntry(TE);
6232 PseudoBundle.add(SD);
6233 AllBundles.push_back(&PseudoBundle);
6235 ProcessBundleMember(SD, AllBundles);
6245 assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
6246 ScheduleStart->comesBefore(ScheduleEnd) &&
6247 "Not a valid scheduling region?");
6249 for (
auto *
I = ScheduleStart;
I != ScheduleEnd;
I =
I->getNextNode()) {
6251 if (!Bundles.
empty()) {
6252 for (ScheduleBundle *Bundle : Bundles) {
6253 assert(isInSchedulingRegion(*Bundle) &&
6254 "primary schedule data not in window?");
6259 auto *SD = getScheduleData(
I);
6262 assert(isInSchedulingRegion(*SD) &&
6263 "primary schedule data not in window?");
6268 [](
const ScheduleEntity *Bundle) {
6269 return Bundle->isReady();
6271 "item in ready list not ready?");
6275 template <
typename ReadyListType>
6276 void initialFillReadyList(ReadyListType &ReadyList) {
6277 SmallPtrSet<ScheduleBundle *, 16> Visited;
6278 for (
auto *
I = ScheduleStart;
I != ScheduleEnd;
I =
I->getNextNode()) {
6279 ScheduleData *SD = getScheduleData(
I);
6280 if (SD && SD->hasValidDependencies() && SD->isReady()) {
6283 for (ScheduleBundle *Bundle : Bundles) {
6284 if (!Visited.
insert(Bundle).second)
6286 if (Bundle->hasValidDependencies() && Bundle->isReady()) {
6287 ReadyList.insert(Bundle);
6289 << *Bundle <<
"\n");
6294 ReadyList.insert(SD);
6296 <<
"SLP: initially in ready list: " << *SD <<
"\n");
6307 const InstructionsState &S,
const EdgeInfo &EI);
6314 std::optional<ScheduleBundle *>
6316 const InstructionsState &S,
const EdgeInfo &EI);
6319 ScheduleData *allocateScheduleDataChunks();
6323 bool extendSchedulingRegion(
Value *V,
const InstructionsState &S);
6327 void initScheduleData(Instruction *FromI, Instruction *ToI,
6328 ScheduleData *PrevLoadStore,
6329 ScheduleData *NextLoadStore);
6333 void calculateDependencies(ScheduleBundle &Bundle,
bool InsertInReadyList,
6335 const SmallPtrSetImpl<Value *> &ExpandedOps,
6339 void resetSchedule();
6356 SmallDenseMap<Instruction *, ScheduleData *> ScheduleDataMap;
6360 SmallDenseMap<std::pair<EdgeInfo, const Value *>,
6361 std::unique_ptr<ScheduleCopyableData>>
6362 ScheduleCopyableDataMap;
6368 SmallDenseMap<const Instruction *, SmallVector<ScheduleCopyableData *>>
6369 ScheduleCopyableDataMapByInst;
6375 SmallDenseMap<std::pair<std::pair<const Value *, unsigned>,
const Value *>,
6377 ScheduleCopyableDataMapByInstUser;
6397 SmallSetVector<ScheduleCopyableData *, 4>>
6398 ScheduleCopyableDataMapByUsers;
6401 SmallDenseMap<Instruction *, SmallVector<ScheduleBundle *>>
6407 SetVector<ScheduleEntity *> ReadyInsts;
6417 ScheduleData *FirstLoadStoreInRegion =
nullptr;
6421 ScheduleData *LastLoadStoreInRegion =
nullptr;
6426 bool RegionHasStackSave =
false;
6429 int ScheduleRegionSize = 0;
6444 SmallSetVector<ScheduleData *, 8> RecalcCopyableOperandDeps;
6450 int SchedulingRegionID = 1;
6454 MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
6458 void scheduleBlock(
const BoUpSLP &R, BlockScheduling *BS);
6461 const SmallDenseSet<Value *> *UserIgnoreList =
nullptr;
6465 struct OrdersTypeDenseMapInfo {
6472 static unsigned getHashValue(
const OrdersType &V) {
6483 ScalarEvolution *SE;
6484 TargetTransformInfo *TTI;
6485 TargetLibraryInfo *TLI;
6488 AssumptionCache *AC;
6490 const DataLayout *DL;
6491 OptimizationRemarkEmitter *ORE;
6493 unsigned MaxVecRegSize;
6494 unsigned MinVecRegSize;
6497 IRBuilder<TargetFolder> Builder;
6504 DenseMap<const TreeEntry *, std::pair<uint64_t, bool>> MinBWs;
6509 unsigned ReductionBitWidth = 0;
6512 unsigned BaseGraphSize = 1;
6516 std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;
6520 DenseSet<unsigned> ExtraBitWidthNodes;
6528 SecondInfo::getEmptyKey());
6533 SecondInfo::getHashValue(Val.
EdgeIdx));
6554 ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
6565 return R.VectorizableTree[0].get();
6569 return {&
N->UserTreeIndex,
N->Container};
6573 return {&
N->UserTreeIndex + 1,
N->Container};
/// Returns the number of entries currently held in \p R's VectorizableTree.
/// \p R is dereferenced without a null check.
6600 static unsigned size(
BoUpSLP *R) {
return R->VectorizableTree.size(); }
6612 OS << Entry->Idx <<
".\n";
6615 for (
auto *V : Entry->Scalars) {
6617 if (
llvm::any_of(R->ExternalUses, [&](
const BoUpSLP::ExternalUser &EU) {
6618 return EU.Scalar == V;
6628 if (Entry->isGather())
6630 if (Entry->State == TreeEntry::ScatterVectorize ||
6631 Entry->State == TreeEntry::StridedVectorize ||
6632 Entry->State == TreeEntry::CompressVectorize)
6633 return "color=blue";
6640 for (
auto *
I : DeletedInstructions) {
6641 if (!
I->getParent()) {
6646 I->insertBefore(F->getEntryBlock(),
6647 F->getEntryBlock().getFirstNonPHIIt());
6649 I->insertBefore(F->getEntryBlock().getTerminator()->getIterator());
6652 for (
Use &U :
I->operands()) {
6654 if (
Op && !DeletedInstructions.count(
Op) &&
Op->hasOneUser() &&
6658 I->dropAllReferences();
6660 for (
auto *
I : DeletedInstructions) {
6662 "trying to erase instruction with users.");
6663 I->eraseFromParent();
6669#ifdef EXPENSIVE_CHECKS
6680 assert(!Mask.empty() && Reuses.
size() == Mask.size() &&
6681 "Expected non-empty mask.");
6684 for (
unsigned I = 0,
E = Prev.
size();
I <
E; ++
I)
6686 Reuses[Mask[
I]] = Prev[
I];
6694 bool BottomOrder =
false) {
6695 assert(!Mask.empty() &&
"Expected non-empty mask.");
6696 unsigned Sz = Mask.size();
6699 if (Order.
empty()) {
6701 std::iota(PrevOrder.
begin(), PrevOrder.
end(), 0);
6703 PrevOrder.
swap(Order);
6706 for (
unsigned I = 0;
I < Sz; ++
I)
6708 Order[
I] = PrevOrder[Mask[
I]];
6710 return Data.value() == Sz ||
Data.index() ==
Data.value();
6719 if (Order.
empty()) {
6721 std::iota(MaskOrder.
begin(), MaskOrder.
end(), 0);
6731 for (
unsigned I = 0;
I < Sz; ++
I)
6733 Order[MaskOrder[
I]] =
I;
6737std::optional<BoUpSLP::OrdersType>
6739 bool TopToBottom,
bool IgnoreReorder) {
6740 assert(TE.isGather() &&
"Expected gather node only.");
6744 Type *ScalarTy = GatheredScalars.
front()->getType();
6745 size_t NumScalars = GatheredScalars.
size();
6747 return std::nullopt;
6754 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
6756 isGatherShuffledEntry(&TE, GatheredScalars, Mask, Entries, NumParts,
6759 if (GatherShuffles.
empty() && ExtractShuffles.
empty())
6760 return std::nullopt;
6761 OrdersType CurrentOrder(NumScalars, NumScalars);
6762 if (GatherShuffles.
size() == 1 &&
6764 Entries.
front().front()->isSame(TE.Scalars)) {
6768 return std::nullopt;
6770 if (Entries.
front().front()->UserTreeIndex.UserTE ==
6771 TE.UserTreeIndex.UserTE)
6772 return std::nullopt;
6775 if (!IgnoreReorder && Entries.
front().front()->Idx == 0)
6776 return std::nullopt;
6779 if (!Entries.
front().front()->ReuseShuffleIndices.empty() &&
6780 TE.getVectorFactor() == 2 && Mask.size() == 2 &&
6783 return P.value() % 2 != static_cast<int>(P.index()) % 2;
6785 return std::nullopt;
6789 std::iota(CurrentOrder.
begin(), CurrentOrder.
end(), 0);
6790 return CurrentOrder;
6794 return all_of(Mask, [&](
int I) {
6801 if ((ExtractShuffles.
empty() && IsSplatMask(Mask) &&
6802 (Entries.
size() != 1 ||
6803 Entries.
front().front()->ReorderIndices.empty())) ||
6804 (GatherShuffles.
empty() && IsSplatMask(ExtractMask)))
6805 return std::nullopt;
6811 if (ShuffledSubMasks.
test(
I))
6813 const int VF = GetVF(
I);
6821 ShuffledSubMasks.
set(
I);
6825 int FirstMin = INT_MAX;
6826 int SecondVecFound =
false;
6828 int Idx = Mask[
I * PartSz + K];
6830 Value *V = GatheredScalars[
I * PartSz + K];
6832 SecondVecFound =
true;
6841 SecondVecFound =
true;
6845 FirstMin = (FirstMin / PartSz) * PartSz;
6847 if (SecondVecFound) {
6849 ShuffledSubMasks.
set(
I);
6853 int Idx = Mask[
I * PartSz + K];
6857 if (Idx >= PartSz) {
6860 SecondVecFound =
true;
6866 if (
static_cast<unsigned>(
I * PartSz + Idx) >= CurrentOrder.
size())
6868 if (CurrentOrder[
I * PartSz + Idx] >
6869 static_cast<unsigned>(
I * PartSz + K) &&
6870 CurrentOrder[
I * PartSz + Idx] !=
6871 static_cast<unsigned>(
I * PartSz + Idx))
6872 CurrentOrder[
I * PartSz + Idx] =
I * PartSz + K;
6875 if (SecondVecFound) {
6877 ShuffledSubMasks.
set(
I);
6883 if (!ExtractShuffles.
empty())
6884 TransformMaskToOrder(
6885 CurrentOrder, ExtractMask, PartSz, NumParts, [&](
unsigned I) {
6886 if (
I >= ExtractShuffles.
size() || !ExtractShuffles[
I])
6889 unsigned Sz =
getNumElems(TE.getVectorFactor(), PartSz,
I);
6891 int K =
I * PartSz + Idx;
6892 if (
static_cast<unsigned>(K) >= ExtractMask.
size())
6896 if (!TE.ReuseShuffleIndices.empty())
6897 K = TE.ReuseShuffleIndices[K];
6900 if (!TE.ReorderIndices.empty())
6901 K = std::distance(TE.ReorderIndices.begin(),
6902 find(TE.ReorderIndices, K));
6906 VF = std::max(VF, EI->getVectorOperandType()
6908 .getKnownMinValue());
6913 if (GatherShuffles.
size() == 1 && NumParts != 1) {
6914 if (ShuffledSubMasks.
any())
6915 return std::nullopt;
6916 PartSz = NumScalars;
6919 if (!Entries.
empty())
6920 TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](
unsigned I) {
6921 if (
I >= GatherShuffles.
size() || !GatherShuffles[
I])
6923 return std::max(Entries[
I].front()->getVectorFactor(),
6924 Entries[
I].back()->getVectorFactor());
6926 unsigned NumUndefs =
count(CurrentOrder, NumScalars);
6927 if (ShuffledSubMasks.
all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2))
6928 return std::nullopt;
6929 return std::move(CurrentOrder);
6934 bool CompareOpcodes =
true) {
6940 return (!GEP1 || GEP1->getNumOperands() == 2) &&
6941 (!GEP2 || GEP2->getNumOperands() == 2) &&
6942 (((!GEP1 ||
isConstant(GEP1->getOperand(1))) &&
6943 (!GEP2 ||
isConstant(GEP2->getOperand(1)))) ||
6946 getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)));
6950template <
typename T>
6955 return CommonAlignment;
6961 "Order is empty. Please check it before using isReverseOrder.");
6962 unsigned Sz = Order.
size();
6964 return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
6985 "Coeffs vector needs to be of correct size");
// Running extremes over the pointer SCEVs examined by the loop below; both
// start out unset.
6987 const SCEV *PtrSCEVLowest =
nullptr;
6988 const SCEV *PtrSCEVHighest =
nullptr;
6991 for (
Value *Ptr : PointerOps) {
// The first pointer seen seeds both extremes at once.
6996 if (!PtrSCEVLowest && !PtrSCEVHighest) {
6997 PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
// NOTE(review): the conditions guarding the two updates below are not visible
// in this excerpt; presumably they compare PtrSCEV against the current
// extremes -- confirm against the full source.
7004 PtrSCEVLowest = PtrSCEV;
7011 PtrSCEVHighest = PtrSCEV;
// Store size of one element, as reported by the data layout.
7019 int Size =
DL.getTypeStoreSize(ElemTy);
// Helper taking a distance and a multiplier. When M has Multiplier as its
// operand 0 or operand 1, the other of those two operands is returned.
// NOTE(review): M is defined on a line outside this excerpt; presumably it is
// Dist viewed as a multiply expression -- confirm.
7020 auto TryGetStride = [&](
const SCEV *Dist,
7021 const SCEV *Multiplier) ->
const SCEV * {
7023 if (M->getOperand(0) == Multiplier)
7024 return M->getOperand(1);
7025 if (M->getOperand(1) == Multiplier)
7026 return M->getOperand(0);
// Special case: the distance is the multiplier itself. The value returned for
// it is not visible in this excerpt.
7029 if (Multiplier == Dist)
// Stride starts out null. When the element size is not 1, or more than one
// SCEV was collected, it is computed with TryGetStride(Dist, Sz).
7034 const SCEV *Stride =
nullptr;
7035 if (
Size != 1 || SCEVs.
size() > 1) {
7037 Stride = TryGetStride(Dist, Sz);
// (distance, ordinal) pairs kept in an ordered set; the ordering comes from
// Compare, which is defined on a line not shown here.
7045 using DistOrdPair = std::pair<int64_t, int>;
7047 std::set<DistOrdPair,
decltype(Compare)> Offsets(Compare);
// Remains true only while every insertion lands at the end of Offsets.
7049 bool IsConsecutive =
true;
7050 for (
const auto [Idx, PtrSCEV] :
enumerate(SCEVs)) {
// Pointers other than the lowest one get a coefficient relative to Stride.
7052 if (PtrSCEV != PtrSCEVLowest) {
7054 const SCEV *Coeff = TryGetStride(Diff, Stride);
// SC is a constant SCEV obtained on lines outside this excerpt; its value is
// recorded per pointer in Coeffs and also used as the distance.
7060 Coeffs[Idx] = (int64_t)SC->getAPInt().getLimitedValue();
7065 Dist = SC->getAPInt().getZExtValue();
7072 auto Res = Offsets.emplace(Dist, Cnt);
// The newly inserted element must be the last element of the set for the
// sequence to still count as consecutive.
7076 IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
// A size mismatch means at least one insertion did not add a new element.
// The action taken in that case is not visible in this excerpt.
7079 if (Offsets.size() != SCEVs.
size())
// SortedIndices is reset, and only refilled (in set iteration order) when the
// pointers were not already consecutive.
7081 SortedIndices.
clear();
7082 if (!IsConsecutive) {
7086 for (
const std::pair<int64_t, int> &Pair : Offsets) {
7087 SortedIndices[Cnt] = Pair.second;
7094static std::pair<InstructionCost, InstructionCost>
7113 return TTI.getShuffleCost(Kind, DstTy, Tp, Mask,
CostKind, Index, SubTp,
7115 int NumSrcElts = Tp->getElementCount().getKnownMinValue();
7118 Mask, NumSrcElts, NumSubElts, Index)) {
7119 if (Index + NumSubElts > NumSrcElts &&
7120 Index + NumSrcElts <=
static_cast<int>(
Mask.size()))
7124 return TTI.getShuffleCost(Kind, DstTy, Tp, Mask,
CostKind, Index, SubTp,
7133 const APInt &DemandedElts,
bool Insert,
bool Extract,
7138 "ScalableVectorType is not supported.");
7141 "Incorrect usage.");
7146 unsigned ScalarTyNumElements = VecTy->getNumElements();
7149 if (!DemandedElts[
I])
7153 I * ScalarTyNumElements, VecTy);
7156 I * ScalarTyNumElements, VecTy);
7160 return TTI.getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
7169 ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) {
7170 if (Opcode == Instruction::ExtractElement) {
7176 Index * VecTy->getNumElements(), VecTy);
7179 return TTI.getVectorInstrCost(Opcode, Val,
CostKind, Index, Scalar,
7198 return TTI.getExtractWithExtendCost(Opcode, Dst, VecTy, Index,
CostKind);
7214 auto *Begin = std::next(
Mask.begin(), Index);
7215 std::iota(Begin, std::next(Begin, SubVecVF), 0);
7216 Vec = Builder.CreateShuffleVector(V, Mask);
7219 std::iota(
Mask.begin(),
Mask.end(), 0);
7220 std::iota(std::next(
Mask.begin(), Index),
7221 std::next(
Mask.begin(), Index + SubVecVF), VecVF);
7223 return Generator(Vec, V, Mask);
7226 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), SubVecVF), 0);
7227 V = Builder.CreateShuffleVector(V, ResizeMask);
7229 return Builder.CreateShuffleVector(Vec, V, Mask);
// Tail of a helper that takes SubVecVF and Index; its name and leading
// parameters are outside this excerpt.
7234 unsigned SubVecVF,
unsigned Index) {
// Mask becomes Index, Index + 1, ... NOTE(review): Mask is declared on a line
// not shown here; presumably it has SubVecVF elements -- confirm.
7236 std::iota(Mask.begin(), Mask.end(), Index);
// Single-source shuffle of Vec through that mask.
7237 return Builder.CreateShuffleVector(Vec, Mask);
// Builds CompressMask from pointer positions and reports whether a single
// stride describes them (see the final return).
7247 const unsigned Sz = PointerOps.
size();
// The first element always maps to position 0.
7250 CompressMask[0] = 0;
// Stride starts engaged with the value 0. NOTE(review): the lines that assign
// the real stride or reset the optional are not visible in this excerpt.
7252 std::optional<unsigned> Stride = 0;
// Pointers are visited in Order when one is given, otherwise in their
// original sequence.
7255 Value *Ptr = Order.
empty() ? PointerOps[
I] : PointerOps[Order[
I]];
7256 std::optional<int64_t> OptPos =
// A missing position, or one that does not fit in unsigned, is rejected; the
// rejecting statement itself is outside this excerpt.
7258 if (!OptPos || OptPos > std::numeric_limits<unsigned>::max())
7260 unsigned Pos =
static_cast<unsigned>(*OptPos);
7261 CompressMask[
I] = Pos;
// Position I is expected at Stride * I when the accesses are evenly strided.
7268 if (Pos != *Stride *
I)
// True when Stride still holds a value at the end.
7271 return Stride.has_value();
7284 InterleaveFactor = 0;
7286 const size_t Sz = VL.
size();
7294 if (AreAllUsersVectorized(V))
7297 TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy,
CostKind,
7298 Mask.empty() ?
I : Mask[
I]);
7301 if (ExtractCost <= ScalarCost)
7306 if (Order.
empty()) {
7307 Ptr0 = PointerOps.
front();
7308 PtrN = PointerOps.
back();
7310 Ptr0 = PointerOps[Order.
front()];
7311 PtrN = PointerOps[Order.
back()];
7313 std::optional<int64_t> Diff =
7317 const size_t MaxRegSize =
7321 if (*Diff / Sz >= MaxRegSize / 8)
7325 Align CommonAlignment = LI->getAlign();
7327 Ptr0, LoadVecTy, CommonAlignment,
DL,
7330 if (IsMasked && !
TTI.isLegalMaskedLoad(LoadVecTy, CommonAlignment,
7331 LI->getPointerAddressSpace()))
7337 assert(CompressMask.
size() >= 2 &&
"At least two elements are required");
7341 auto [ScalarGEPCost, VectorGEPCost] =
7343 Instruction::Load,
CostKind, ScalarTy, LoadVecTy);
7360 LoadCost =
TTI.getMemIntrinsicInstrCost(
7363 LI->getPointerAddressSpace()),
7367 TTI.getMemoryOpCost(Instruction::Load, LoadVecTy, CommonAlignment,
7368 LI->getPointerAddressSpace(),
CostKind);
7370 if (IsStrided && !IsMasked && Order.
empty()) {
7377 AlignedLoadVecTy = LoadVecTy;
7378 if (
TTI.isLegalInterleavedAccessType(AlignedLoadVecTy, CompressMask[1],
7380 LI->getPointerAddressSpace())) {
7382 VectorGEPCost +
TTI.getInterleavedMemoryOpCost(
7383 Instruction::Load, AlignedLoadVecTy,
7384 CompressMask[1], {}, CommonAlignment,
7385 LI->getPointerAddressSpace(),
CostKind, IsMasked);
7386 if (InterleavedCost < GatherCost) {
7387 InterleaveFactor = CompressMask[1];
7388 LoadVecTy = AlignedLoadVecTy;
7395 if (!Order.
empty()) {
7398 NewMask[
I] = CompressMask[Mask[
I]];
7400 CompressMask.
swap(NewMask);
7402 InstructionCost TotalVecCost = VectorGEPCost + LoadCost + CompressCost;
7403 return TotalVecCost < GatherCost;
7416 unsigned InterleaveFactor;
7420 AreAllUsersVectorized, IsMasked, InterleaveFactor,
7421 CompressMask, LoadVecTy);
// Tail of a const member's parameter list: Alignment, the pointer distance
// Diff and the element count Sz. NOTE(review): presumably this is the
// definition of isStridedLoad, which is called elsewhere in this file with
// matching trailing arguments -- confirm.
7438 Align Alignment,
const int64_t Diff,
7439 const size_t Sz)
const {
// Diff must be a multiple of Sz - 1.
// NOTE(review): Diff is int64_t and Sz is size_t, so where size_t is at least
// as wide as int64_t this modulo is evaluated in unsigned arithmetic; a
// negative Diff is converted before the remainder is taken. Sz == 1 would
// also divide by zero. The signed re-check on Stride further down covers the
// divisibility case -- confirm whether this early test is intended to reject
// negative distances.
7440 if (Diff % (Sz - 1) != 0)
// True when any pointer operand has a user U that is neither vectorized nor
// in MustGather. The loop that binds U is on a line not shown here.
7444 auto IsAnyPointerUsedOutGraph =
any_of(PointerOps, [&](
Value *V) {
7446 return !isVectorized(U) && !MustGather.contains(U);
7450 const uint64_t AbsoluteDiff = std::abs(Diff);
// Candidate conditions for a strided access; part of this condition is on
// lines outside this excerpt. The last visible clause matches an exactly
// reversed unit-stride sequence (Diff == -(Sz - 1)).
7452 if (IsAnyPointerUsedOutGraph ||
7453 (AbsoluteDiff > Sz &&
7457 Diff == -(
static_cast<int64_t
>(Sz) - 1)) {
// Stride is computed in signed arithmetic, then verified to reproduce Diff
// exactly.
7458 int64_t Stride = Diff /
static_cast<int64_t
>(Sz - 1);
7459 if (Diff != Stride *
static_cast<int64_t
>(Sz - 1))
// The target must support strided load/store for this vector type and
// alignment.
7461 if (!TTI->isLegalStridedLoadStore(VecTy, Alignment))
7472 const size_t Sz = PointerOps.
size();
7480 SortedIndices.
empty() ? PointerOps[
I] : PointerOps[SortedIndices[
I]];
7481 std::optional<int64_t>
Offset =
7483 assert(
Offset &&
"sortPtrAccesses should have validated this pointer");
7484 SortedOffsetsFromBase[
I] = *
Offset;
7501 int64_t StrideWithinGroup =
7502 SortedOffsetsFromBase[1] - SortedOffsetsFromBase[0];
7505 auto IsEndOfGroupIndex = [=, &SortedOffsetsFromBase](
unsigned Idx) {
7506 return SortedOffsetsFromBase[Idx] - SortedOffsetsFromBase[Idx - 1] !=
7511 unsigned GroupSize = FoundIt != Indices.end() ? *FoundIt : Sz;
7513 unsigned VecSz = Sz;
7514 Type *NewScalarTy = ScalarTy;
7518 bool NeedsWidening = Sz != GroupSize;
7519 const uint64_t UnitBitWidth = DL->getTypeSizeInBits(ScalarTy).getFixedValue();
7520 if (NeedsWidening) {
7521 if (Sz % GroupSize != 0)
7524 if (StrideWithinGroup != 1)
7526 VecSz = Sz / GroupSize;
7527 NewScalarTy =
Type::getIntNTy(SE->getContext(), UnitBitWidth * GroupSize);
7532 if (!
isStridedLoad(PointerOps, NewScalarTy, Alignment, Diff, VecSz))
7535 int64_t StrideIntVal = StrideWithinGroup;
7536 if (NeedsWidening) {
7539 unsigned CurrentGroupStartIdx = GroupSize;
7540 int64_t StrideBetweenGroups =
7541 SortedOffsetsFromBase[GroupSize] - SortedOffsetsFromBase[0];
7542 StrideIntVal = StrideBetweenGroups;
7543 for (; CurrentGroupStartIdx < Sz; CurrentGroupStartIdx += GroupSize) {
7544 if (SortedOffsetsFromBase[CurrentGroupStartIdx] -
7545 SortedOffsetsFromBase[CurrentGroupStartIdx - GroupSize] !=
7546 StrideBetweenGroups)
7550 auto CheckGroup = [=](
const unsigned StartIdx) ->
bool {
7553 unsigned GroupEndIdx = FoundIt != Indices.end() ? *FoundIt : Sz;
7554 return GroupEndIdx - StartIdx == GroupSize;
7556 for (
unsigned I = 0;
I < Sz;
I += GroupSize) {
7562 Type *StrideTy = DL->getIndexType(Ptr0->
getType());
7572 bool IsLoad)
const {
7578 OffsetToPointerOpIdxMap;
7579 for (
auto [Idx, Ptr] :
enumerate(PointerOps)) {
7580 const SCEV *PtrSCEV = SE->getSCEV(Ptr);
7592 Offset = SC->getAPInt().getSExtValue();
7593 if (
Offset >= std::numeric_limits<int64_t>::max() - 1) {
7600 OffsetToPointerOpIdxMap[
Offset].first.push_back(Ptr);
7601 OffsetToPointerOpIdxMap[
Offset].second.push_back(Idx);
7603 unsigned NumOffsets = OffsetToPointerOpIdxMap.
size();
7607 const unsigned Sz = PointerOps.
size();
7608 unsigned VecSz = Sz;
7609 Type *NewScalarTy = BaseTy;
7610 if (NumOffsets > 1) {
7611 if (Sz % NumOffsets != 0)
7613 VecSz = Sz / NumOffsets;
7618 DL->getTypeSizeInBits(BaseTy).getFixedValue() * NumOffsets);
7619 auto *StridedLoadTy =
7621 unsigned MinProfitableStridedOps =
7624 if (Sz * BaseTyNumElts < MinProfitableStridedOps ||
7625 !TTI->isTypeLegal(StridedLoadTy) ||
7626 !TTI->isLegalStridedLoadStore(StridedLoadTy, CommonAlignment))
7632 for (
auto [Idx, MapPair] :
enumerate(OffsetToPointerOpIdxMap)) {
7633 if (MapPair.second.first.size() != VecSz)
7635 SortedOffsetsV[Idx] = MapPair.first;
7637 sort(SortedOffsetsV);
7639 if (NumOffsets > 1) {
7640 int64_t BaseBytes = DL->getTypeStoreSize(BaseTy);
7642 if (SortedOffsetsV[
I] - SortedOffsetsV[
I - 1] != BaseBytes)
7715 auto UpdateSortedIndices =
7718 if (SortedIndicesForOffset.
empty()) {
7719 SortedIndicesForOffset.
resize(IndicesInAllPointerOps.
size());
7720 std::iota(SortedIndicesForOffset.
begin(),
7721 SortedIndicesForOffset.
end(), 0);
7723 for (
const auto [Num, Idx] :
enumerate(SortedIndicesForOffset)) {
7724 SortedIndicesDraft[Num * NumOffsets + OffsetNum] =
7725 IndicesInAllPointerOps[Idx];
7729 int64_t LowestOffset = SortedOffsetsV[0];
7735 SortedIndicesForOffset0, Coeffs0);
7738 unsigned NumCoeffs0 = Coeffs0.
size();
7739 if (NumCoeffs0 * NumOffsets != Sz)
7744 OffsetToPointerOpIdxMap[LowestOffset].second;
7745 UpdateSortedIndices(SortedIndicesForOffset0, IndicesInAllPointerOps0, 0);
7751 for (
int J :
seq<int>(1, NumOffsets)) {
7754 SortedIndicesForOffset.
clear();
7756 int64_t
Offset = SortedOffsetsV[J];
7758 OffsetToPointerOpIdxMap[
Offset].first;
7760 OffsetToPointerOpIdxMap[
Offset].second;
7762 PointerOpsForOffset, BaseTy, *DL, *SE, SortedIndicesForOffset, Coeffs);
7764 if (!StrideWithinGroup || StrideWithinGroup != Stride0)
7766 if (Coeffs.
size() != NumCoeffs0)
7769 if (Coeffs != Coeffs0)
7772 UpdateSortedIndices(SortedIndicesForOffset, IndicesInAllPointerOps, J);
7775 SortedIndices.
clear();
7776 SortedIndices = std::move(SortedIndicesDraft);
7778 SPtrInfo.
Ty = StridedLoadTy;
7785 unsigned *BestVF,
bool TryRecursiveCheck)
const {
7798 if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy))
7804 const size_t Sz = VL.
size();
7806 auto *POIter = PointerOps.
begin();
7807 for (
Value *V : VL) {
7809 if (!L || !L->isSimple())
7811 *POIter = L->getPointerOperand();
7817 bool IsSorted =
sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, Order);
// Cache for the target query below; empty until the lambda is first called.
7826 std::optional<bool> MaskedGatherLegal;
// Returns whether a masked gather of VecTy at CommonAlignment is legal and
// not forcibly scalarized by the target. The two TTI queries are only
// evaluated while the cache is empty. NOTE(review): the assignment to
// MaskedGatherLegal is on a line not shown in this excerpt.
7827 auto IsMaskedGatherLegal = [&] {
7828 if (!MaskedGatherLegal)
7830 TTI->isLegalMaskedGather(VecTy, CommonAlignment) &&
7831 !TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment);
7832 return *MaskedGatherLegal;
7839 if (!IsMaskedGatherLegal())
7850 if (Order.
empty()) {
7851 Ptr0 = PointerOps.
front();
7852 PtrN = PointerOps.
back();
7854 Ptr0 = PointerOps[Order.
front()];
7855 PtrN = PointerOps[Order.
back()];
7860 std::optional<int64_t> Diff0 =
7862 std::optional<int64_t> DiffN =
7865 "sortPtrAccesses should have validated these pointers");
7866 int64_t Diff = *DiffN - *Diff0;
7868 if (
static_cast<uint64_t>(Diff) == Sz - 1)
7871 *TLI, [&](
Value *V) {
7872 return areAllUsersVectorized(
7880 Diff, Ptr0, SPtrInfo))
7883 if (!IsMaskedGatherLegal())
7888 auto CheckForShuffledLoads = [&, &TTI = *TTI](
Align CommonAlignment,
7890 bool ProfitableGatherPointers) {
7895 auto [ScalarGEPCost, VectorGEPCost] =
7901 Type *PtrScalarTy = PointerOps.
front()->getType()->getScalarType();
7907 if (
static_cast<unsigned>(
count_if(
7930 TTI.getMemIntrinsicInstrCost(
7933 false, CommonAlignment),
7935 (ProfitableGatherPointers ? 0 : VectorGEPCost);
7943 constexpr unsigned ListLimit = 4;
7944 if (!TryRecursiveCheck || VL.
size() < ListLimit)
7947 unsigned Sz = DL->getTypeSizeInBits(ScalarTy);
7957 for (
unsigned Cnt = 0, End = VL.
size(); Cnt < End; Cnt += VF) {
7958 const unsigned SliceVF = std::min(VF, End - Cnt);
7963 PointerOps, SPtrInfo, BestVF,
7971 DemandedElts.
setBits(Cnt, Cnt + SliceVF);
7987 if (!DemandedElts.
isZero()) {
7993 if (DemandedElts[Idx])
7997 for (
const auto &[SliceStart, LS] : States) {
7998 const unsigned SliceVF = std::min<unsigned>(VF, VL.
size() - SliceStart);
8005 ArrayRef(PointerOps).slice(SliceStart, SliceVF),
8006 LI0->getPointerOperand(), Instruction::Load,
8010 if (
static_cast<unsigned>(
8012 PointerOps.
size() - 1 ||
8030 TTI.getMemoryOpCost(Instruction::Load, SubVecTy, LI0->getAlign(),
8031 LI0->getPointerAddressSpace(),
CostKind,
8036 VecLdCost += TTI.getMemIntrinsicInstrCost(
8038 Intrinsic::experimental_vp_strided_load,
8039 SubVecTy, LI0->getPointerOperand(),
8040 false, CommonAlignment),
8045 VecLdCost += TTI.getMemIntrinsicInstrCost(
8047 Intrinsic::masked_load, SubVecTy,
8048 CommonAlignment, LI0->getPointerAddressSpace()),
8054 VecLdCost += TTI.getMemIntrinsicInstrCost(
8056 Intrinsic::masked_gather, SubVecTy,
8057 LI0->getPointerOperand(),
8058 false, CommonAlignment),
8066 const unsigned SliceIdx = SliceStart / VF;
8068 ShuffleMask[Idx] = Idx / VF == SliceIdx ? VL.
size() + Idx % VF : Idx;
8077 if (MaskedGatherCost >= VecLdCost &&
8090 bool ProfitableGatherPointers =
8091 L && Sz > 2 &&
static_cast<unsigned>(
count_if(PointerOps, [L](
Value *V) {
8092 return L->isLoopInvariant(V);
8094 if (ProfitableGatherPointers ||
all_of(PointerOps, [](
Value *
P) {
8097 (
GEP &&
GEP->getNumOperands() == 2 &&
8105 if (!TryRecursiveCheck || !CheckForShuffledLoads(CommonAlignment, BestVF,
8106 ProfitableGatherPointers))
8118 all_of(VL, [](
const Value *V) {
return V->getType()->isPointerTy(); }) &&
8119 "Expected list of pointer operands.");
8124 std::pair<BasicBlock *, Value *>,
8128 .try_emplace(std::make_pair(
8132 SortedIndices.
clear();
8134 auto Key = std::make_pair(BBs[Cnt + 1],
8136 bool Found =
any_of(Bases.try_emplace(
Key).first->second,
8137 [&, &Cnt = Cnt, &Ptr = Ptr](
auto &
Base) {
8138 std::optional<int64_t> Diff =
8139 getPointersDiff(ElemTy, std::get<0>(Base.front()),
8140 ElemTy, Ptr, DL, SE,
8145 Base.emplace_back(Ptr, *Diff, Cnt + 1);
8151 if (Bases.size() > VL.
size() / 2 - 1)
8155 Bases.find(
Key)->second.emplace_back().emplace_back(Ptr, 0, Cnt + 1);
8159 if (Bases.size() == VL.
size())
8162 if (Bases.size() == 1 && (Bases.front().second.size() == 1 ||
8163 Bases.front().second.size() == VL.
size()))
8168 auto ComparePointers = [](
Value *Ptr1,
Value *Ptr2) {
8177 FirstPointers.
insert(P1);
8178 SecondPointers.
insert(P2);
8184 "Unable to find matching root.");
8187 for (
auto &
Base : Bases) {
8188 for (
auto &Vec :
Base.second) {
8189 if (Vec.size() > 1) {
8191 int64_t InitialOffset = std::get<1>(Vec[0]);
8192 bool AnyConsecutive =
8194 return std::get<1>(
P.value()) ==
8195 int64_t(
P.index()) + InitialOffset;
8199 if (!AnyConsecutive)
8204 return ComparePointers(std::get<0>(V1.front()), std::get<0>(V2.front()));
8208 for (
auto &
T : Bases)
8209 for (
const auto &Vec :
T.second)
8210 for (
const auto &
P : Vec)
8214 "Expected SortedIndices to be the size of VL");
8218std::optional<BoUpSLP::OrdersType>
8220 assert(TE.isGather() &&
"Expected gather node only.");
8221 Type *ScalarTy = TE.Scalars[0]->getType();
8224 Ptrs.
reserve(TE.Scalars.size());
8226 BBs.
reserve(TE.Scalars.size());
8227 for (
Value *V : TE.Scalars) {
8229 if (!L || !L->isSimple())
8230 return std::nullopt;
8236 if (!LoadEntriesToVectorize.contains(TE.Idx) &&
8238 return std::move(Order);
8239 return std::nullopt;
8250 if (VU->
getType() != V->getType())
8253 if (!VU->
hasOneUse() && !V->hasOneUse())
8259 if (Idx1 == std::nullopt || Idx2 == std::nullopt)
8265 bool IsReusedIdx =
false;
8267 if (IE2 == VU && !IE1)
8269 if (IE1 == V && !IE2)
8270 return V->hasOneUse();
8271 if (IE1 && IE1 != V) {
8273 IsReusedIdx |= ReusedIdx.
test(Idx1);
8274 ReusedIdx.
set(Idx1);
8275 if ((IE1 != VU && !IE1->
hasOneUse()) || IsReusedIdx)
8280 if (IE2 && IE2 != VU) {
8282 IsReusedIdx |= ReusedIdx.
test(Idx2);
8283 ReusedIdx.
set(Idx2);
8284 if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
8289 }
while (!IsReusedIdx && (IE1 || IE2));
8299std::optional<BoUpSLP::OrdersType>
8301 bool IgnoreReorder) {
8304 if (!TE.ReuseShuffleIndices.empty()) {
8306 return std::nullopt;
8314 unsigned Sz = TE.Scalars.size();
8315 if (TE.isGather()) {
8316 if (std::optional<OrdersType> CurrentOrder =
8321 ::addMask(Mask, TE.ReuseShuffleIndices);
8322 OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor());
8323 unsigned Sz = TE.Scalars.size();
8324 for (
int K = 0, E = TE.getVectorFactor() / Sz; K < E; ++K) {
8327 Res[Idx + K * Sz] =
I + K * Sz;
8329 return std::move(Res);
8332 if (Sz == 2 && TE.getVectorFactor() == 4 &&
8335 2 * TE.getVectorFactor()),
8337 return std::nullopt;
8338 if (TE.ReuseShuffleIndices.size() % Sz != 0)
8339 return std::nullopt;
8343 if (TE.ReorderIndices.empty())
8344 std::iota(ReorderMask.
begin(), ReorderMask.
end(), 0);
8347 ::addMask(ReorderMask, TE.ReuseShuffleIndices);
8348 unsigned VF = ReorderMask.
size();
8352 for (
unsigned I = 0;
I < VF;
I += Sz) {
8354 unsigned UndefCnt = 0;
8355 unsigned Limit = std::min(Sz, VF -
I);
8365 UsedVals.
test(Val) || UndefCnt > Sz / 2)
8366 return std::nullopt;
8368 for (
unsigned K = 0; K < NumParts; ++K) {
8369 unsigned Idx = Val + Sz * K;
8370 if (Idx < VF &&
I + K < VF)
8371 ResOrder[Idx] =
I + K;
8374 return std::move(ResOrder);
8376 unsigned VF = TE.getVectorFactor();
8379 TE.ReuseShuffleIndices.end());
8380 if (TE.hasState() && TE.getOpcode() == Instruction::ExtractElement &&
8382 if (isa<PoisonValue>(V))
8384 std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V));
8385 return Idx && *Idx < Sz;
8387 assert(!TE.isAltShuffle() &&
"Alternate instructions are only supported "
8388 "by BinaryOperator and CastInst.");
8390 if (TE.ReorderIndices.empty())
8391 std::iota(ReorderMask.
begin(), ReorderMask.
end(), 0);
8394 for (
unsigned I = 0;
I < VF; ++
I) {
8395 int &Idx = ReusedMask[
I];
8398 Value *V = TE.Scalars[ReorderMask[Idx]];
8400 Idx = std::distance(ReorderMask.
begin(),
find(ReorderMask, *EI));
8406 std::iota(ResOrder.
begin(), ResOrder.
end(), 0);
8407 auto *It = ResOrder.
begin();
8408 for (
unsigned K = 0; K < VF; K += Sz) {
8412 std::iota(SubMask.
begin(), SubMask.
end(), 0);
8414 transform(CurrentOrder, It, [K](
unsigned Pos) {
return Pos + K; });
8415 std::advance(It, Sz);
8418 return Data.index() ==
Data.value();
8420 return std::nullopt;
8421 return std::move(ResOrder);
8423 if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
8424 (!TE.UserTreeIndex || !TE.UserTreeIndex.UserTE->hasState() ||
8426 (TE.ReorderIndices.empty() ||
isReverseOrder(TE.ReorderIndices)))
8427 return std::nullopt;
8428 if (TE.State == TreeEntry::SplitVectorize ||
8429 ((TE.State == TreeEntry::Vectorize ||
8430 TE.State == TreeEntry::StridedVectorize ||
8431 TE.State == TreeEntry::CompressVectorize) &&
8434 assert((TE.State == TreeEntry::SplitVectorize || !TE.isAltShuffle()) &&
8435 "Alternate instructions are only supported by "
8436 "BinaryOperator and CastInst.");
8437 return TE.ReorderIndices;
8439 if (!TopToBottom && IgnoreReorder && TE.State == TreeEntry::Vectorize &&
8440 TE.isAltShuffle()) {
8441 assert(TE.ReuseShuffleIndices.empty() &&
8442 "ReuseShuffleIndices should be "
8443 "empty for alternate instructions.");
8445 TE.buildAltOpShuffleMask(
8447 assert(TE.getMatchingMainOpOrAltOp(
I) &&
8448 "Unexpected main/alternate opcode");
8452 const int VF = TE.getVectorFactor();
8457 ResOrder[Mask[
I] % VF] =
I;
8459 return std::move(ResOrder);
8461 if (!TE.ReorderIndices.empty())
8462 return TE.ReorderIndices;
8463 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
8464 if (!TE.ReorderIndices.empty())
8465 return TE.ReorderIndices;
8468 for (
auto [
I, V] :
zip(UserBVHead, TE.Scalars)) {
8476 while (
II &&
II->hasOneUse() &&
II->getParent() == BB) {
// Body of a comparison of two distinct basic blocks; it is invoked elsewhere
// in this file as CompareByBasicBlocks.
8484 assert(BB1 != BB2 &&
"Expected different basic blocks.");
// Blocks unreachable from the entry are handled first; the values returned
// for them are on lines outside this excerpt.
8485 if (!DT->isReachableFromEntry(BB1))
8487 if (!DT->isReachableFromEntry(BB2))
// Both blocks are reachable here, so each must have a dominator-tree node.
8489 auto *NodeA = DT->getNode(BB1);
8490 auto *NodeB = DT->getNode(BB2);
8491 assert(NodeA &&
"Should only process reachable instructions");
8492 assert(NodeB &&
"Should only process reachable instructions");
// Sanity check: equal DFS-in numbers if and only if the nodes are identical.
8493 assert((NodeA == NodeB) ==
8494 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
8495 "Different nodes should have different DFS numbers");
// Order by the DFS-in number of the dominator-tree nodes.
8496 return NodeA->getDFSNumIn() < NodeB->getDFSNumIn();
8498 auto PHICompare = [&](
unsigned I1,
unsigned I2) {
8499 Value *V1 = TE.Scalars[I1];
8500 Value *V2 = TE.Scalars[I2];
8513 if (FirstUserOfPhi1->getParent() != FirstUserOfPhi2->getParent())
8514 return CompareByBasicBlocks(FirstUserOfPhi1->getParent(),
8515 FirstUserOfPhi2->getParent());
8525 if (UserBVHead[I1] && !UserBVHead[I2])
8527 if (!UserBVHead[I1])
8529 if (UserBVHead[I1] == UserBVHead[I2])
8532 return CompareByBasicBlocks(UserBVHead[I1]->
getParent(),
8534 return UserBVHead[I1]->comesBefore(UserBVHead[I2]);
8547 if (EE1->getOperand(0) == EE2->getOperand(0))
8549 if (!Inst1 && Inst2)
8551 if (Inst1 && Inst2) {
8559 "Expected either instructions or arguments vector operands.");
8560 return P1->getArgNo() < P2->getArgNo();
8565 std::iota(Phis.
begin(), Phis.
end(), 0);
8568 return std::nullopt;
8569 return std::move(Phis);
8571 if (TE.isGather() &&
8572 (!TE.hasState() || !TE.isAltShuffle() ||
8573 ScalarsInSplitNodes.contains(TE.getMainOp())) &&
8577 if (((TE.hasState() && TE.getOpcode() == Instruction::ExtractElement) ||
8581 auto *EE = dyn_cast<ExtractElementInst>(V);
8582 return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
8588 canReuseExtract(TE.Scalars, CurrentOrder,
true);
8589 if (Reuse || !CurrentOrder.
empty())
8590 return std::move(CurrentOrder);
8598 int Sz = TE.Scalars.size();
8602 if (It == TE.Scalars.begin())
8606 if (It != TE.Scalars.end()) {
8608 unsigned Idx = std::distance(TE.Scalars.begin(), It);
8623 if (InsertFirstCost + PermuteCost < InsertIdxCost) {
8626 return std::move(Order);
8631 return std::nullopt;
8632 if (TE.Scalars.size() >= 3)
8637 if (TE.hasState() && TE.getOpcode() == Instruction::Load) {
8642 CurrentOrder, PointerOps, SPtrInfo);
8645 return std::move(CurrentOrder);
8647 if (std::optional<OrdersType> CurrentOrder =
8649 return CurrentOrder;
8651 return std::nullopt;
8661 for (
unsigned I = Sz,
E = Mask.size();
I <
E;
I += Sz) {
8663 if (Cluster != FirstCluster)
// Applies Mask to a tree entry. The visible steps combine a mask with the
// entry's ReuseShuffleIndices, clear its ReorderIndices and finally rewrite
// ReuseShuffleIndices. Several intermediate lines are outside this excerpt.
8669void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE,
ArrayRef<int> Mask)
const {
// Number of scalars in the entry; used as the chunk width below.
8672 const unsigned Sz =
TE.Scalars.size();
// NOTE(review): the rest of this condition and the branch it guards are not
// visible here.
8674 if (!
TE.isGather() ||
8681 addMask(NewMask,
TE.ReuseShuffleIndices);
8683 TE.ReorderIndices.clear();
// Overwrite ReuseShuffleIndices chunk by chunk with 0, 1, ..., Sz - 1.
// The loop only terminates on It == End, so it assumes the vector length is a
// multiple of Sz -- TODO confirm this is guaranteed by the caller.
8690 for (
auto *It =
TE.ReuseShuffleIndices.begin(),
8691 *End =
TE.ReuseShuffleIndices.end();
8692 It != End; std::advance(It, Sz))
8693 std::iota(It, std::next(It, Sz), 0);
8699 "Expected same size of orders");
8700 size_t Sz = Order.
size();
8703 if (Order[Idx] != Sz)
8704 UsedIndices.
set(Order[Idx]);
8706 if (SecondaryOrder.
empty()) {
8708 if (Order[Idx] == Sz && !UsedIndices.
test(Idx))
8712 if (SecondaryOrder[Idx] != Sz && Order[Idx] == Sz &&
8713 !UsedIndices.
test(SecondaryOrder[Idx]))
8714 Order[Idx] = SecondaryOrder[Idx];
// Thresholds used by the checks that follow.
// TinyVF: compared against the root entry's vector factor and scalar count.
8722 constexpr unsigned TinyVF = 2;
// TinyTree: compared against the number of entries in VectorizableTree.
8723 constexpr unsigned TinyTree = 10;
// PhiOpsLimit: compared against an entry's operand count.
8724 constexpr unsigned PhiOpsLimit = 12;
// GatherLoadsLimit: compared against a running GatherLoads counter later in
// this function.
8725 constexpr unsigned GatherLoadsLimit = 2;
// Trees with at most TinyTree entries are decided immediately; the returned
// value is on a line outside this excerpt.
8726 if (VectorizableTree.size() <= TinyTree)
// Root entry is a non-gather store or PHI, or a PtrToInt/ICmp with vector
// factor at most TinyVF, and has no reordering of its own.
8728 if (VectorizableTree.front()->hasState() &&
8729 !VectorizableTree.front()->isGather() &&
8730 (VectorizableTree.front()->getOpcode() == Instruction::Store ||
8731 VectorizableTree.front()->getOpcode() == Instruction::PHI ||
8732 (VectorizableTree.front()->getVectorFactor() <= TinyVF &&
8733 (VectorizableTree.front()->getOpcode() == Instruction::PtrToInt ||
8734 VectorizableTree.front()->getOpcode() == Instruction::ICmp))) &&
8735 VectorizableTree.front()->ReorderIndices.empty()) {
// Root is a PHI with exactly TinyVF scalars and more than PhiOpsLimit
// operands.
8739 if (VectorizableTree.front()->hasState() &&
8740 VectorizableTree.front()->getOpcode() == Instruction::PHI &&
8741 VectorizableTree.front()->Scalars.size() == TinyVF &&
8742 VectorizableTree.front()->getNumOperands() > PhiOpsLimit)
8745 if (VectorizableTree.front()->hasState() &&
8746 VectorizableTree.front()->getOpcode() == Instruction::Store &&
8747 VectorizableTree.front()->ReorderIndices.empty()) {
8748 const unsigned ReorderedSplitsCnt =
8749 count_if(VectorizableTree, [&](
const std::unique_ptr<TreeEntry> &TE) {
8750 return TE->State == TreeEntry::SplitVectorize &&
8751 !TE->ReorderIndices.empty() && TE->UserTreeIndex.UserTE &&
8752 TE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
8755 if (ReorderedSplitsCnt <= 1 &&
8757 VectorizableTree, [&](
const std::unique_ptr<TreeEntry> &TE) {
8758 return ((!TE->isGather() &&
8759 (TE->ReorderIndices.empty() ||
8760 (TE->UserTreeIndex.UserTE &&
8761 TE->UserTreeIndex.UserTE->State ==
8762 TreeEntry::Vectorize &&
8763 !TE->UserTreeIndex.UserTE->ReuseShuffleIndices
8765 (TE->isGather() && TE->ReorderIndices.empty() &&
8766 (!TE->hasState() || TE->isAltShuffle() ||
8767 TE->getOpcode() == Instruction::Load ||
8768 TE->getOpcode() == Instruction::ZExt ||
8769 TE->getOpcode() == Instruction::SExt))) &&
8770 (VectorizableTree.front()->getVectorFactor() > TinyVF ||
8771 !TE->isGather() ||
none_of(TE->Scalars, [&](
Value *V) {
8772 return !isConstant(V) && isVectorized(V);
8774 })) >= VectorizableTree.size() - ReorderedSplitsCnt)
8777 bool HasPhis =
false;
8778 bool HasLoad =
true;
8779 unsigned GatherLoads = 0;
8780 for (
const std::unique_ptr<TreeEntry> &TE :
8781 ArrayRef(VectorizableTree).drop_front()) {
8782 if (TE->State == TreeEntry::SplitVectorize)
8784 if (!TE->hasState()) {
8788 if (VectorizableTree.front()->Scalars.size() == TinyVF &&
8793 if (TE->getOpcode() == Instruction::Load && TE->ReorderIndices.empty()) {
8794 if (!TE->isGather()) {
8801 if (GatherLoads >= GatherLoadsLimit)
8804 if (TE->getOpcode() == Instruction::GetElementPtr ||
8807 if (TE->getOpcode() != Instruction::PHI &&
8808 (!TE->hasCopyableElements() ||
8810 TE->Scalars.size() / 2))
8812 if (VectorizableTree.front()->Scalars.size() == TinyVF &&
8813 TE->getNumOperands() > PhiOpsLimit)
// Reorders one half of a split node. Idx selects the half and must be 0 or 1
// (see the assertion text below); Mask is the mask to apply. The remaining
// parameter(s) and several body lines are outside this excerpt.
8822void BoUpSLP::TreeEntry::reorderSplitNode(
unsigned Idx,
ArrayRef<int> Mask,
// Only valid on entries in the SplitVectorize state.
8824 assert(State == TreeEntry::SplitVectorize &&
"Expected split user node.");
// Both working masks start as the identity permutation.
8827 std::iota(NewMask.
begin(), NewMask.
end(), 0);
8828 std::iota(NewMaskOrder.begin(), NewMaskOrder.end(), 0);
// MaskOrder is copied in starting at the beginning of NewMaskOrder.
// NOTE(review): presumably this is the Idx == 0 case -- the guarding
// condition is not visible here.
8831 copy(MaskOrder, NewMaskOrder.begin());
8833 assert(Idx == 1 &&
"Expected either 0 or 1 index.");
// For the second half, the offset comes from the last combined entry.
8834 unsigned Offset = CombinedEntriesWithIndices.
back().second;
// The node's own reorder indices are cleared here.
8843 ReorderIndices.clear();
8862 ExternalUserReorderMap;
8864 if (
any_of(VectorizableTree, [](
const std::unique_ptr<TreeEntry> &TE) {
8865 return TE->State == TreeEntry::Vectorize &&
8871 const bool IgnoreReorder =
8872 !UserIgnoreList && VectorizableTree.front()->hasState() &&
8873 (VectorizableTree.front()->
getOpcode() == Instruction::InsertElement ||
8874 VectorizableTree.front()->getOpcode() == Instruction::Store);
8878 for_each(VectorizableTree, [&, &TTIRef = *TTI](
8879 const std::unique_ptr<TreeEntry> &TE) {
8882 findExternalStoreUsersReorderIndices(TE.get());
8883 if (!ExternalUserReorderIndices.
empty()) {
8884 VFToOrderedEntries[TE->getVectorFactor()].
insert(TE.get());
8886 std::move(ExternalUserReorderIndices));
8892 if (TE->hasState() && TE->isAltShuffle() &&
8893 TE->State != TreeEntry::SplitVectorize) {
8894 Type *ScalarTy = TE->Scalars[0]->getType();
8897 unsigned Opcode0 = TE->getOpcode();
8898 unsigned Opcode1 = TE->getAltOpcode();
8902 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
8903 VFToOrderedEntries[TE->getVectorFactor()].
insert(TE.get());
8909 if (std::optional<OrdersType> CurrentOrder =
8919 const TreeEntry *UserTE = TE.get();
8921 if (!UserTE->UserTreeIndex)
8923 if (UserTE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
8924 UserTE->UserTreeIndex.UserTE->isAltShuffle() &&
8925 UserTE->UserTreeIndex.UserTE->Idx != 0)
8927 UserTE = UserTE->UserTreeIndex.UserTE;
8930 VFToOrderedEntries[TE->getVectorFactor()].
insert(TE.get());
8931 if (!(TE->State == TreeEntry::Vectorize ||
8932 TE->State == TreeEntry::StridedVectorize ||
8933 TE->State == TreeEntry::SplitVectorize ||
8934 TE->State == TreeEntry::CompressVectorize) ||
8935 !TE->ReuseShuffleIndices.empty())
8936 GathersToOrders.
try_emplace(TE.get(), *CurrentOrder);
8937 if (TE->State == TreeEntry::Vectorize &&
8938 TE->getOpcode() == Instruction::PHI)
8939 PhisToOrders.
try_emplace(TE.get(), *CurrentOrder);
8944 for (
unsigned VF = VectorizableTree.front()->getVectorFactor();
8945 !VFToOrderedEntries.
empty() && VF > 1; --VF) {
8946 auto It = VFToOrderedEntries.
find(VF);
8947 if (It == VFToOrderedEntries.
end())
8961 for (
const TreeEntry *OpTE : OrderedEntries) {
8964 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.
count(OpTE) &&
8965 OpTE->State != TreeEntry::SplitVectorize)
8968 const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
8970 if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty()) {
8971 auto It = GathersToOrders.find(OpTE);
8972 if (It != GathersToOrders.end())
8975 if (OpTE->hasState() && OpTE->isAltShuffle()) {
8976 auto It = AltShufflesToOrders.find(OpTE);
8977 if (It != AltShufflesToOrders.end())
8980 if (OpTE->State == TreeEntry::Vectorize &&
8981 OpTE->getOpcode() == Instruction::PHI) {
8982 auto It = PhisToOrders.
find(OpTE);
8983 if (It != PhisToOrders.
end())
8986 return OpTE->ReorderIndices;
8989 auto It = ExternalUserReorderMap.
find(OpTE);
8990 if (It != ExternalUserReorderMap.
end()) {
8991 const auto &ExternalUserReorderIndices = It->second;
8995 if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
8996 OrdersUses.try_emplace(
OrdersType(), 0).first->second +=
8997 ExternalUserReorderIndices.size();
8999 for (
const OrdersType &ExtOrder : ExternalUserReorderIndices)
9000 ++OrdersUses.try_emplace(ExtOrder, 0).first->second;
9007 if (OpTE->State == TreeEntry::Vectorize &&
9008 OpTE->getOpcode() == Instruction::Store && !Order.
empty()) {
9009 assert(!OpTE->isAltShuffle() &&
9010 "Alternate instructions are only supported by BinaryOperator "
9014 unsigned E = Order.
size();
9017 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
9020 ++OrdersUses.try_emplace(CurrentOrder, 0).first->second;
9022 ++OrdersUses.try_emplace(Order, 0).first->second;
9025 if (OrdersUses.empty())
9028 unsigned IdentityCnt = 0;
9029 unsigned FilledIdentityCnt = 0;
9031 for (
auto &Pair : OrdersUses) {
9033 if (!Pair.first.empty())
9034 FilledIdentityCnt += Pair.second;
9035 IdentityCnt += Pair.second;
9040 unsigned Cnt = IdentityCnt;
9041 for (
auto &Pair : OrdersUses) {
9045 if (Cnt < Pair.second ||
9046 (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt &&
9047 Cnt == Pair.second && !BestOrder.
empty() &&
9050 BestOrder = Pair.first;
9063 unsigned E = BestOrder.
size();
9065 return I < E ? static_cast<int>(I) : PoisonMaskElem;
9068 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
9070 if (TE->Scalars.size() != VF) {
9071 if (TE->ReuseShuffleIndices.size() == VF) {
9072 assert(TE->State != TreeEntry::SplitVectorize &&
9073 "Split vectorized not expected.");
9078 (!TE->UserTreeIndex ||
9079 TE->UserTreeIndex.UserTE->Scalars.size() == VF ||
9080 TE->UserTreeIndex.UserTE->Scalars.size() == TE->Scalars.size() ||
9081 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize) &&
9082 "All users must be of VF size.");
9089 if (TE->UserTreeIndex && TE->UserTreeIndex.UserTE->hasState() &&
9095 reorderNodeWithReuses(*TE, Mask);
9097 if (TE->UserTreeIndex &&
9098 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize)
9099 TE->UserTreeIndex.UserTE->reorderSplitNode(
9100 TE->UserTreeIndex.EdgeIdx, Mask, MaskOrder);
9104 if ((TE->State == TreeEntry::SplitVectorize &&
9105 TE->ReuseShuffleIndices.empty()) ||
9106 ((TE->State == TreeEntry::Vectorize ||
9107 TE->State == TreeEntry::StridedVectorize ||
9108 TE->State == TreeEntry::CompressVectorize) &&
9113 (!TE->isAltShuffle() || (TE->State == TreeEntry::SplitVectorize &&
9114 TE->ReuseShuffleIndices.empty())) &&
9115 "Alternate instructions are only supported by BinaryOperator "
9121 TE->reorderOperands(Mask);
9124 TE->reorderOperands(Mask);
9125 assert(TE->ReorderIndices.empty() &&
9126 "Expected empty reorder sequence.");
9129 if (!TE->ReuseShuffleIndices.empty()) {
9136 addMask(NewReuses, TE->ReuseShuffleIndices);
9137 TE->ReuseShuffleIndices.swap(NewReuses);
9138 }
else if (TE->UserTreeIndex &&
9139 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize)
9141 TE->UserTreeIndex.UserTE->reorderSplitNode(TE->UserTreeIndex.EdgeIdx,
// Examines the operands of UserTE and collects operand entries into Edges as
// (operand index, entry) pairs. The remaining parameters, the loop over the
// operand index I and the body of most branches are outside this excerpt, so
// the comments below describe only the visible conditions.
9147void BoUpSLP::buildReorderableOperands(
9148 TreeEntry *UserTE,
SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
// Is there already an edge for operand I whose entry is in one of the
// vectorized states (Vectorize, StridedVectorize, CompressVectorize,
// SplitVectorize)?
9152 if (
any_of(Edges, [
I](
const std::pair<unsigned, TreeEntry *> &OpData) {
9153 return OpData.first ==
I &&
9154 (OpData.second->State == TreeEntry::Vectorize ||
9155 OpData.second->State == TreeEntry::StridedVectorize ||
9156 OpData.second->State == TreeEntry::CompressVectorize ||
9157 OpData.second->State == TreeEntry::SplitVectorize);
// Opcode-specific cases for the user node; the action taken in each case is
// not visible here.
9161 if (UserTE->hasState()) {
// Extracts.
9162 if (UserTE->getOpcode() == Instruction::ExtractElement ||
9163 UserTE->getOpcode() == Instruction::ExtractValue)
// Operand 0 of an insertelement.
9165 if (UserTE->getOpcode() == Instruction::InsertElement &&
I == 0)
// Operand 1 of a vectorized or strided store.
9167 if (UserTE->getOpcode() == Instruction::Store &&
I == 1 &&
9168 (UserTE->State == TreeEntry::Vectorize ||
9169 UserTE->State == TreeEntry::StridedVectorize))
// Any operand of a vectorized, strided or compressed load.
9171 if (UserTE->getOpcode() == Instruction::Load &&
9172 (UserTE->State == TreeEntry::Vectorize ||
9173 UserTE->State == TreeEntry::StridedVectorize ||
9174 UserTE->State == TreeEntry::CompressVectorize))
// Every remaining operand must have a tree entry.
9177 TreeEntry *
TE = getOperandEntry(UserTE,
I);
9178 assert(TE &&
"Expected operand entry.");
// NOTE(review): the emplace_back below appears to sit inside this non-gather
// branch, but the lines between them are missing -- confirm.
9179 if (!
TE->isGather()) {
9182 Edges.emplace_back(
I, TE);
// Scatter-vectorized entry with neither reuse nor reorder indices.
9188 if (
TE->State == TreeEntry::ScatterVectorize &&
9189 TE->ReuseShuffleIndices.empty() &&
TE->ReorderIndices.empty())
// Gather entries are checked against the ReorderableGathers set.
9193 if (ReorderableGathers.
contains(TE))
9199 struct TreeEntryCompare {
9200 bool operator()(
const TreeEntry *LHS,
const TreeEntry *RHS)
const {
9201 if (LHS->UserTreeIndex && RHS->UserTreeIndex)
9202 return LHS->UserTreeIndex.UserTE->Idx < RHS->UserTreeIndex.UserTE->Idx;
9203 return LHS->Idx < RHS->Idx;
9212 for (
const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
9213 if (TE->State != TreeEntry::Vectorize &&
9214 TE->State != TreeEntry::StridedVectorize &&
9215 TE->State != TreeEntry::CompressVectorize &&
9216 TE->State != TreeEntry::SplitVectorize)
9217 NonVectorized.
insert(TE.get());
9218 if (std::optional<OrdersType> CurrentOrder =
9220 Queue.push(TE.get());
9221 if (!(TE->State == TreeEntry::Vectorize ||
9222 TE->State == TreeEntry::StridedVectorize ||
9223 TE->State == TreeEntry::CompressVectorize ||
9224 TE->State == TreeEntry::SplitVectorize) ||
9225 !TE->ReuseShuffleIndices.empty())
9226 GathersToOrders.
insert(TE.get());
9235 while (!Queue.empty()) {
9237 std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>>
Users;
9238 TreeEntry *TE = Queue.top();
9239 const TreeEntry *UserTE = TE->UserTreeIndex.UserTE;
9242 while (!Queue.empty()) {
9244 if (!UserTE || UserTE != TE->UserTreeIndex.UserTE)
9249 for (TreeEntry *TE : OrderedOps) {
9250 if (!(TE->State == TreeEntry::Vectorize ||
9251 TE->State == TreeEntry::StridedVectorize ||
9252 TE->State == TreeEntry::CompressVectorize ||
9253 TE->State == TreeEntry::SplitVectorize ||
9254 (TE->isGather() && GathersToOrders.
contains(TE))) ||
9255 !TE->UserTreeIndex || !TE->ReuseShuffleIndices.empty() ||
9256 !Visited.
insert(TE).second)
9260 Users.first = TE->UserTreeIndex.UserTE;
9261 Users.second.emplace_back(TE->UserTreeIndex.EdgeIdx, TE);
9266 if (
Data.first->State == TreeEntry::Vectorize &&
9269 if (
Data.first->State == TreeEntry::SplitVectorize) {
9271 Data.second.size() <= 2 &&
9272 "Expected not greater than 2 operands for split vectorize node.");
9274 [](
const auto &
Op) { return !Op.second->UserTreeIndex; }))
9277 assert(
Data.first->CombinedEntriesWithIndices.size() == 2 &&
9278 "Expected exactly 2 entries.");
9279 for (
const auto &
P :
Data.first->CombinedEntriesWithIndices) {
9280 TreeEntry &OpTE = *VectorizableTree[
P.first];
9282 if (Order.
empty() || !OpTE.ReuseShuffleIndices.empty()) {
9283 if (!OpTE.isGather() && OpTE.ReuseShuffleIndices.empty())
9285 const auto BestOrder =
9294 const unsigned E = Order.
size();
9297 return I < E ? static_cast<int>(I) : PoisonMaskElem;
9299 Data.first->reorderSplitNode(
P.second ? 1 : 0, Mask, MaskOrder);
9301 if (!OpTE.ReorderIndices.empty()) {
9302 OpTE.ReorderIndices.clear();
9303 }
else if (!OpTE.ReuseShuffleIndices.empty()) {
9306 assert(OpTE.isGather() &&
"Expected only gather/buildvector node.");
9310 if (
Data.first->ReuseShuffleIndices.empty() &&
9311 !
Data.first->ReorderIndices.empty()) {
9314 Queue.push(
Data.first);
9320 buildReorderableOperands(
Data.first,
Data.second, NonVectorized,
9332 for (
const auto &
Op :
Data.second) {
9333 TreeEntry *OpTE =
Op.second;
9334 if (!VisitedOps.
insert(OpTE).second)
9336 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.
count(OpTE))
9338 const auto Order = [&]() ->
const OrdersType {
9339 if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty())
9343 return OpTE->ReorderIndices;
9347 if (Order.
size() == 1)
9353 Value *Root = OpTE->hasState()
9356 auto GetSameNodesUsers = [&](
Value *Root) {
9358 for (
const TreeEntry *TE : ValueToGatherNodes.lookup(Root)) {
9359 if (TE != OpTE && TE->UserTreeIndex &&
9360 TE->getVectorFactor() == OpTE->getVectorFactor() &&
9361 TE->Scalars.size() == OpTE->Scalars.size() &&
9362 ((TE->ReorderIndices.empty() && OpTE->isSame(TE->Scalars)) ||
9363 (OpTE->ReorderIndices.empty() && TE->isSame(OpTE->Scalars))))
9364 Res.
insert(TE->UserTreeIndex.UserTE);
9366 for (
const TreeEntry *TE : getTreeEntries(Root)) {
9367 if (TE != OpTE && TE->UserTreeIndex &&
9368 TE->getVectorFactor() == OpTE->getVectorFactor() &&
9369 TE->Scalars.size() == OpTE->Scalars.size() &&
9370 ((TE->ReorderIndices.empty() && OpTE->isSame(TE->Scalars)) ||
9371 (OpTE->ReorderIndices.empty() && TE->isSame(OpTE->Scalars))))
9372 Res.
insert(TE->UserTreeIndex.UserTE);
9376 auto GetNumOperands = [](
const TreeEntry *TE) {
9377 if (TE->State == TreeEntry::SplitVectorize)
9378 return TE->getNumOperands();
9380 return CI->arg_size();
9381 return TE->getNumOperands();
9383 auto NodeShouldBeReorderedWithOperands = [&, TTI = TTI](
9384 const TreeEntry *TE) {
9392 const TreeEntry *
Op = getOperandEntry(TE, Idx);
9393 if (
Op->isGather() &&
Op->hasState()) {
9394 const TreeEntry *VecOp =
9395 getSameValuesTreeEntry(
Op->getMainOp(),
Op->Scalars);
9399 if (
Op->ReorderIndices.empty() &&
Op->ReuseShuffleIndices.empty())
9406 if (!RevisitedOps.
insert(UTE).second)
9408 return UTE ==
Data.first || !UTE->ReorderIndices.empty() ||
9409 !UTE->ReuseShuffleIndices.empty() ||
9410 (UTE->UserTreeIndex &&
9411 UTE->UserTreeIndex.UserTE ==
Data.first) ||
9412 (
Data.first->UserTreeIndex &&
9413 Data.first->UserTreeIndex.UserTE == UTE) ||
9414 (IgnoreReorder && UTE->UserTreeIndex &&
9415 UTE->UserTreeIndex.UserTE->Idx == 0) ||
9416 NodeShouldBeReorderedWithOperands(UTE);
9419 for (TreeEntry *UTE :
Users) {
9427 const TreeEntry *
Op = getOperandEntry(UTE, Idx);
9429 Queue.push(
const_cast<TreeEntry *
>(
Op));
9434 Data.second, [OpTE](
const std::pair<unsigned, TreeEntry *> &
P) {
9435 return P.second == OpTE;
9438 if (OpTE->State == TreeEntry::Vectorize &&
9439 OpTE->getOpcode() == Instruction::Store && !Order.
empty()) {
9440 assert(!OpTE->isAltShuffle() &&
9441 "Alternate instructions are only supported by BinaryOperator "
9445 unsigned E = Order.
size();
9448 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
9451 OrdersUses.try_emplace(CurrentOrder, 0).first->second +=
NumOps;
9453 OrdersUses.try_emplace(Order, 0).first->second +=
NumOps;
9455 auto Res = OrdersUses.try_emplace(
OrdersType(), 0);
9456 const auto AllowsReordering = [&](
const TreeEntry *TE) {
9457 if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
9458 (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
9459 (IgnoreReorder && TE->Idx == 0))
9461 if (TE->isGather()) {
9471 if (OpTE->UserTreeIndex) {
9472 TreeEntry *UserTE = OpTE->UserTreeIndex.UserTE;
9473 if (!VisitedUsers.
insert(UserTE).second)
9478 if (AllowsReordering(UserTE))
9486 if (
static_cast<unsigned>(
count_if(
9487 Ops, [UserTE, &AllowsReordering](
9488 const std::pair<unsigned, TreeEntry *> &
Op) {
9489 return AllowsReordering(
Op.second) &&
9490 Op.second->UserTreeIndex.UserTE == UserTE;
9491 })) <=
Ops.size() / 2)
9492 ++Res.first->second;
9495 if (OrdersUses.empty()) {
9500 unsigned IdentityCnt = 0;
9501 unsigned VF =
Data.second.front().second->getVectorFactor();
9503 for (
auto &Pair : OrdersUses) {
9505 IdentityCnt += Pair.second;
9510 unsigned Cnt = IdentityCnt;
9511 for (
auto &Pair : OrdersUses) {
9515 if (Cnt < Pair.second) {
9517 BestOrder = Pair.first;
9534 unsigned E = BestOrder.
size();
9536 return I < E ? static_cast<int>(I) : PoisonMaskElem;
9538 for (
const std::pair<unsigned, TreeEntry *> &
Op :
Data.second) {
9539 TreeEntry *TE =
Op.second;
9540 if (!VisitedOps.
insert(TE).second)
9543 if (TE->State == TreeEntry::Vectorize &&
9546 if (TE->ReuseShuffleIndices.size() == BestOrder.
size()) {
9547 reorderNodeWithReuses(*TE, Mask);
9551 if (TE->State != TreeEntry::Vectorize &&
9552 TE->State != TreeEntry::StridedVectorize &&
9553 TE->State != TreeEntry::CompressVectorize &&
9554 TE->State != TreeEntry::SplitVectorize &&
9555 (TE->State != TreeEntry::ScatterVectorize ||
9556 TE->ReorderIndices.empty()))
9558 assert((BestOrder.
size() == TE->ReorderIndices.size() ||
9559 TE->ReorderIndices.empty()) &&
9560 "Non-matching sizes of user/operand entries.");
9562 if (IgnoreReorder && TE == VectorizableTree.front().get())
9563 IgnoreReorder =
false;
9566 for (TreeEntry *
Gather : GatherOps) {
9568 "Unexpected reordering of gathers.");
9569 if (!
Gather->ReuseShuffleIndices.empty()) {
9579 auto IsNotProfitableAltCodeNode = [](
const TreeEntry &TE) {
9580 return TE.isAltShuffle() &&
9581 (!TE.ReuseShuffleIndices.empty() || TE.getVectorFactor() == 2 ||
9582 TE.ReorderIndices.empty());
9584 if (
Data.first->State != TreeEntry::Vectorize ||
9586 Data.first->getMainOp()) ||
9587 IsNotProfitableAltCodeNode(*
Data.first))
9588 Data.first->reorderOperands(Mask);
9590 IsNotProfitableAltCodeNode(*
Data.first) ||
9591 Data.first->State == TreeEntry::CompressVectorize) {
9595 if (
Data.first->ReuseShuffleIndices.empty() &&
9596 !
Data.first->ReorderIndices.empty() &&
9597 !IsNotProfitableAltCodeNode(*
Data.first)) {
9600 Queue.push(
Data.first);
9608 if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
9609 VectorizableTree.front()->ReuseShuffleIndices.empty())
9610 VectorizableTree.front()->ReorderIndices.
clear();
9613Instruction *BoUpSLP::getRootEntryInstruction(
const TreeEntry &Entry)
const {
9614 if (Entry.hasState() &&
9615 (Entry.getOpcode() == Instruction::Store ||
9616 Entry.getOpcode() == Instruction::Load) &&
9617 Entry.State == TreeEntry::StridedVectorize &&
9618 !Entry.ReorderIndices.empty() &&
isReverseOrder(Entry.ReorderIndices))
9625 const size_t NumVectScalars = ScalarToTreeEntries.size() + 1;
9628 for (
auto &TEPtr : VectorizableTree) {
9629 TreeEntry *Entry = TEPtr.get();
9632 if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize ||
9633 DeletedNodes.contains(Entry) ||
9634 TransformedToGatherNodes.contains(Entry))
9638 for (
int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
9639 Value *Scalar = Entry->Scalars[Lane];
9645 auto It = ScalarToExtUses.
find(Scalar);
9646 if (It != ScalarToExtUses.
end() && !ExternalUses[It->second].User)
9649 if (!IsStructScalar && Scalar->hasNUsesOrMore(NumVectScalars)) {
9650 unsigned FoundLane = Entry->findLaneForValue(Scalar);
9651 LLVM_DEBUG(
dbgs() <<
"SLP: Need to extract from lane " << FoundLane
9652 <<
" from " << *Scalar <<
"for many users.\n");
9653 It = ScalarToExtUses.
try_emplace(Scalar, ExternalUses.size()).first;
9654 ExternalUses.emplace_back(Scalar,
nullptr, *Entry, FoundLane);
9655 ExternalUsesWithNonUsers.insert(Scalar);
9660 const auto ExtI = ExternallyUsedValues.
find(Scalar);
9661 if (ExtI != ExternallyUsedValues.
end()) {
9662 unsigned FoundLane = Entry->findLaneForValue(Scalar);
9663 LLVM_DEBUG(
dbgs() <<
"SLP: Need to extract: Extra arg from lane "
9664 << FoundLane <<
" from " << *Scalar <<
".\n");
9665 ScalarToExtUses.
try_emplace(Scalar, ExternalUses.size());
9666 ExternalUses.emplace_back(Scalar,
nullptr, *Entry, FoundLane);
9677 if (UserIgnoreList && UserIgnoreList->contains(UserInst))
9682 any_of(UseEntries, [
this](
const TreeEntry *UseEntry) {
9683 return !DeletedNodes.contains(UseEntry) &&
9684 !TransformedToGatherNodes.contains(UseEntry);
9689 if (!((Scalar->getType()->getScalarType()->isPointerTy() &&
9692 all_of(UseEntries, [&](TreeEntry *UseEntry) {
9693 if (DeletedNodes.contains(UseEntry) ||
9694 TransformedToGatherNodes.contains(UseEntry))
9696 return UseEntry->State == TreeEntry::ScatterVectorize ||
9698 Scalar, getRootEntryInstruction(*UseEntry), TLI,
9701 LLVM_DEBUG(
dbgs() <<
"SLP: \tInternal user will be removed:" << *U
9704 [](TreeEntry *UseEntry) {
9705 return UseEntry->isGather();
9710 if (!IsStructScalar) {
9712 if (It != ScalarToExtUses.
end()) {
9713 ExternalUses[It->second].User =
nullptr;
9719 if (U && !IsStructScalar && Scalar->hasNUsesOrMore(
UsesLimit))
9721 unsigned FoundLane = Entry->findLaneForValue(Scalar);
9723 <<
" from lane " << FoundLane <<
" from " << *Scalar
9725 It = ScalarToExtUses.
try_emplace(Scalar, ExternalUses.size()).first;
9726 ExternalUses.emplace_back(Scalar, U, *Entry, FoundLane);
9727 ExternalUsesWithNonUsers.insert(Scalar);
9736BoUpSLP::collectUserStores(
const BoUpSLP::TreeEntry *TE)
const {
9740 for (
unsigned Lane :
seq<unsigned>(0, TE->Scalars.size())) {
9741 Value *V = TE->Scalars[Lane];
9754 if (
SI ==
nullptr || !
SI->isSimple() ||
SI->getFunction() !=
F ||
9763 auto &StoresVec = PtrToStoresMap[{
SI->getParent(),
9764 SI->getValueOperand()->getType(), Ptr}];
9767 if (StoresVec.size() > Lane)
9769 if (!StoresVec.empty()) {
9771 SI->getValueOperand()->getType(),
SI->getPointerOperand(),
9772 SI->getValueOperand()->getType(),
9773 StoresVec.front()->getPointerOperand(), *
DL, *SE,
9779 StoresVec.push_back(SI);
9784 for (
auto &
P : PtrToStoresMap) {
9799 StoreInst *S0 = StoresVec[0];
9804 StoreInst *
SI = StoresVec[Idx];
9805 std::optional<int64_t> Diff =
9807 SI->getPointerOperand(), *DL, *SE,
9813 if (StoreOffsetVec.
size() != StoresVec.
size())
9815 sort(StoreOffsetVec, llvm::less_first());
9817 int64_t PrevDist = 0;
9818 for (
const auto &
P : StoreOffsetVec) {
9819 if (Idx > 0 &&
P.first != PrevDist + 1)
9827 ReorderIndices.assign(StoresVec.
size(), 0);
9828 bool IsIdentity =
true;
9830 ReorderIndices[
P.second] =
I;
9831 IsIdentity &=
P.second ==
I;
9837 ReorderIndices.clear();
9844 for (
unsigned Idx : Order)
9845 dbgs() << Idx <<
", ";
9851BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE)
const {
9852 unsigned NumLanes =
TE->Scalars.size();
9865 if (StoresVec.
size() != NumLanes)
9870 if (!canFormVector(StoresVec, ReorderIndices))
9875 ExternalReorderIndices.
push_back(ReorderIndices);
9877 return ExternalReorderIndices;
9883 assert(TreeEntryToStridedPtrInfoMap.empty() &&
9884 "TreeEntryToStridedPtrInfoMap is not cleared");
9885 UserIgnoreList = &UserIgnoreLst;
9888 buildTreeRec(Roots, 0,
EdgeInfo());
9893 assert(TreeEntryToStridedPtrInfoMap.empty() &&
9894 "TreeEntryToStridedPtrInfoMap is not cleared");
9897 buildTreeRec(Roots, 0,
EdgeInfo());
9906 bool AddNew =
true) {
9914 for (
Value *V : VL) {
9918 if (R.isDeleted(LI) || R.isVectorized(LI) || !LI->isSimple())
9920 bool IsFound =
false;
9921 for (
auto [Map,
Data] :
zip(ClusteredDistToLoad, ClusteredLoads)) {
9922 assert(LI->getParent() ==
Data.front().first->getParent() &&
9923 LI->getType() ==
Data.front().first->getType() &&
9927 "Expected loads with the same type, same parent and same "
9928 "underlying pointer.");
9930 LI->getType(), LI->getPointerOperand(),
Data.front().first->getType(),
9931 Data.front().first->getPointerOperand(),
DL, SE,
9935 auto It = Map.find(*Dist);
9936 if (It != Map.end() && It->second != LI)
9938 if (It == Map.end()) {
9939 Data.emplace_back(LI, *Dist);
9940 Map.try_emplace(*Dist, LI);
9950 auto FindMatchingLoads =
9955 int64_t &
Offset,
unsigned &Start) {
9957 return GatheredLoads.
end();
9966 std::optional<int64_t> Dist =
9968 Data.front().first->getType(),
9969 Data.front().first->getPointerOperand(),
DL, SE,
9975 for (std::pair<LoadInst *, int64_t>
P :
Data) {
9981 unsigned NumUniques = 0;
9982 for (
auto [Cnt, Pair] :
enumerate(Loads)) {
9983 bool Used = DataLoads.
contains(Pair.first);
9984 if (!Used && !DataDists.
contains(*Dist + Pair.second)) {
9991 if (NumUniques > 0 &&
9992 (Loads.
size() == NumUniques ||
9993 (Loads.
size() - NumUniques >= 2 &&
9994 Loads.
size() - NumUniques >= Loads.
size() / 2 &&
10000 return std::next(GatheredLoads.
begin(), Idx);
10004 return GatheredLoads.
end();
10006 for (
ArrayRef<std::pair<LoadInst *, int64_t>>
Data : ClusteredLoads) {
10007 unsigned Start = 0;
10010 auto *It = FindMatchingLoads(
Data, GatheredLoads, LocalToAdd,
Repeated,
10012 while (It != GatheredLoads.
end()) {
10013 assert(!LocalToAdd.
empty() &&
"Expected some elements to add.");
10014 for (
unsigned Idx : LocalToAdd)
10021 return !ToAdd.contains(Idx) && !Repeated.contains(Idx);
10028 Loads.push_back(
Data[Idx]);
10034 GatheredLoads, [&](
ArrayRef<std::pair<LoadInst *, int64_t>> PD) {
10035 return PD.front().first->getParent() == LI->
getParent() &&
10036 PD.front().first->getType() == LI->
getType();
10038 while (It != GatheredLoads.
end()) {
10041 std::next(It), GatheredLoads.
end(),
10042 [&](
ArrayRef<std::pair<LoadInst *, int64_t>> PD) {
10043 return PD.front().first->getParent() == LI->getParent() &&
10044 PD.front().first->getType() == LI->getType();
10048 GatheredLoads.emplace_back().append(
Data.begin(),
Data.end());
10049 AddNewLoads(GatheredLoads.emplace_back());
10054void BoUpSLP::tryToVectorizeGatheredLoads(
10055 const SmallMapVector<
10056 std::tuple<BasicBlock *, Value *, Type *>,
10059 GatheredLoadsEntriesFirst = VectorizableTree.
size();
10062 LoadEntriesToVectorize.size());
10063 for (
auto [Idx, Set] :
zip(LoadEntriesToVectorize, LoadSetsToVectorize))
10064 Set.insert_range(VectorizableTree[Idx]->Scalars);
10067 auto LoadSorter = [](
const std::pair<LoadInst *, int64_t> &L1,
10068 const std::pair<LoadInst *, int64_t> &L2) {
10069 return L1.second > L2.second;
10078 return TTI->isLegalMaskedGather(Ty, Alignment) &&
10079 !TTI->forceScalarizeMaskedGather(Ty, Alignment);
10084 SmallVectorImpl<LoadInst *> &NonVectorized,
10085 bool Final,
unsigned MaxVF) {
10087 unsigned StartIdx = 0;
10088 SmallVector<int> CandidateVFs;
10092 *TTI, Loads.
front()->getType(), MaxVF);
10094 *TTI, Loads.
front()->getType(), NumElts - 1)) {
10100 if (Final && CandidateVFs.
empty())
10103 unsigned BestVF = Final ? CandidateVFs.
back() : 0;
10104 for (
unsigned NumElts : CandidateVFs) {
10105 if (Final && NumElts > BestVF)
10107 SmallVector<unsigned> MaskedGatherVectorized;
10108 for (
unsigned Cnt = StartIdx,
E = Loads.
size(); Cnt <
E;
10112 if (VectorizedLoads.count(Slice.
front()) ||
10113 VectorizedLoads.count(Slice.
back()) ||
10119 bool AllowToVectorize =
false;
10121 if (NumElts == 2) {
10122 bool IsLegalBroadcastLoad = TTI->isLegalBroadcastLoad(
10125 for (LoadInst *LI : Slice) {
10127 if (LI->hasOneUse())
10133 if (
static_cast<unsigned int>(std::distance(
10134 LI->user_begin(), LI->user_end())) != LI->getNumUses())
10136 if (!IsLegalBroadcastLoad)
10140 for (User *U : LI->users()) {
10143 for (
const TreeEntry *UTE : getTreeEntries(U)) {
10144 for (
int I :
seq<int>(UTE->getNumOperands())) {
10146 return V == LI || isa<PoisonValue>(V);
10156 AllowToVectorize = CheckIfAllowed(Slice);
10160 any_of(ValueToGatherNodes.at(Slice.front()),
10161 [=](
const TreeEntry *TE) {
10162 return TE->Scalars.size() == 2 &&
10163 ((TE->Scalars.front() == Slice.front() &&
10164 TE->Scalars.back() == Slice.back()) ||
10165 (TE->Scalars.front() == Slice.back() &&
10166 TE->Scalars.back() == Slice.front()));
10169 if (AllowToVectorize) {
10174 reinterpret_cast<Value *
const *
>(Slice.begin()), Slice.size());
10177 PointerOps, SPtrInfo, &BestVF);
10179 (BestVF > 1 &&
static_cast<unsigned>(NumElts) == 2 * BestVF)) {
10181 if (MaskedGatherVectorized.
empty() ||
10182 Cnt >= MaskedGatherVectorized.
back() + NumElts)
10187 Results.emplace_back(Values, LS);
10188 VectorizedLoads.insert_range(Slice);
10191 if (Cnt == StartIdx)
10192 StartIdx += NumElts;
10195 if (StartIdx >= Loads.
size())
10199 if (!MaskedGatherVectorized.
empty() &&
10200 Cnt < MaskedGatherVectorized.
back() + NumElts)
10201 MaskedGatherVectorized.
pop_back();
10202 Cnt += NumElts - 1;
10206 if (!AllowToVectorize || BestVF == 0)
10210 for (
unsigned Cnt : MaskedGatherVectorized) {
10212 Cnt, std::min<unsigned>(NumElts, Loads.
size() - Cnt));
10216 VectorizedLoads.insert_range(Slice);
10218 if (Cnt == StartIdx)
10219 StartIdx += NumElts;
10222 for (LoadInst *LI : Loads) {
10223 if (!VectorizedLoads.contains(LI))
10224 NonVectorized.push_back(LI);
10228 auto ProcessGatheredLoads =
10231 bool Final =
false) {
10233 for (
ArrayRef<std::pair<LoadInst *, int64_t>> LoadsDists :
10235 if (LoadsDists.size() <= 1) {
10236 NonVectorized.
push_back(LoadsDists.back().first);
10244 unsigned MaxConsecutiveDistance = 0;
10245 unsigned CurrentConsecutiveDist = 1;
10246 int64_t LastDist = LocalLoadsDists.front().second;
10247 bool AllowMaskedGather = IsMaskedGatherSupported(OriginalLoads);
10248 for (
const std::pair<LoadInst *, int64_t> &L : LocalLoadsDists) {
10251 assert(LastDist >=
L.second &&
10252 "Expected first distance always not less than second");
10253 if (
static_cast<uint64_t
>(LastDist -
L.second) ==
10254 CurrentConsecutiveDist) {
10255 ++CurrentConsecutiveDist;
10256 MaxConsecutiveDistance =
10257 std::max(MaxConsecutiveDistance, CurrentConsecutiveDist);
10261 if (!AllowMaskedGather && CurrentConsecutiveDist == 1 &&
10264 CurrentConsecutiveDist = 1;
10265 LastDist =
L.second;
10268 if (Loads.
size() <= 1)
10270 if (AllowMaskedGather)
10271 MaxConsecutiveDistance = Loads.
size();
10272 else if (MaxConsecutiveDistance < 2)
10277 GetVectorizedRanges(Loads, VectorizedLoads, SortedNonVectorized,
10278 Final, MaxConsecutiveDistance);
10280 OriginalLoads.size() == Loads.
size() &&
10281 MaxConsecutiveDistance == Loads.
size() &&
10286 VectorizedLoads.
clear();
10290 GetVectorizedRanges(OriginalLoads, VectorizedLoads,
10291 UnsortedNonVectorized, Final,
10292 OriginalLoads.size());
10293 if (SortedNonVectorized.
size() >= UnsortedNonVectorized.
size()) {
10294 SortedNonVectorized.
swap(UnsortedNonVectorized);
10295 Results.swap(UnsortedResults);
10299 LLVM_DEBUG(
dbgs() <<
"SLP: Trying to vectorize gathered loads ("
10300 << Slice.
size() <<
")\n");
10302 for (
Value *L : Slice)
10310 unsigned MaxVF = Slice.size();
10311 unsigned UserMaxVF = 0;
10312 unsigned InterleaveFactor = 0;
10317 std::optional<unsigned> InterleavedLoadsDistance = 0;
10318 unsigned Order = 0;
10319 std::optional<unsigned> CommonVF = 0;
10320 DenseMap<const TreeEntry *, unsigned> EntryToPosition;
10321 SmallPtrSet<const TreeEntry *, 8> DeinterleavedNodes;
10322 for (
auto [Idx, V] :
enumerate(Slice)) {
10323 for (
const TreeEntry *
E : ValueToGatherNodes.at(V)) {
10324 UserMaxVF = std::max<unsigned>(UserMaxVF,
E->Scalars.size());
10327 UserMaxVF = std::max<unsigned>(UserMaxVF, Idx - Pos + 1);
10329 if (*CommonVF == 0) {
10330 CommonVF =
E->Scalars.size();
10333 if (*CommonVF !=
E->Scalars.size())
10337 if (Pos != Idx && InterleavedLoadsDistance) {
10340 if (isa<Constant>(V))
10342 if (isVectorized(V))
10344 const auto &Nodes = ValueToGatherNodes.at(V);
10345 return (Nodes.size() != 1 || !Nodes.contains(E)) &&
10346 !is_contained(Slice, V);
10348 InterleavedLoadsDistance.reset();
10351 DeinterleavedNodes.
insert(
E);
10352 if (*InterleavedLoadsDistance == 0) {
10353 InterleavedLoadsDistance = Idx - Pos;
10356 if ((Idx - Pos) % *InterleavedLoadsDistance != 0 ||
10357 (Idx - Pos) / *InterleavedLoadsDistance < Order)
10358 InterleavedLoadsDistance.reset();
10359 Order = (Idx - Pos) / InterleavedLoadsDistance.value_or(1);
10363 DeinterleavedNodes.
clear();
10365 if (InterleavedLoadsDistance.value_or(0) > 1 &&
10366 CommonVF.value_or(0) != 0) {
10367 InterleaveFactor =
bit_ceil(*InterleavedLoadsDistance);
10368 unsigned VF = *CommonVF;
10373 if (InterleaveFactor <= Slice.size() &&
10374 TTI.isLegalInterleavedAccessType(
10380 ->getPointerAddressSpace()) &&
10383 UserMaxVF = InterleaveFactor * VF;
10385 InterleaveFactor = 0;
10390 unsigned ConsecutiveNodesSize = 0;
10391 if (!LoadEntriesToVectorize.empty() && InterleaveFactor == 0 &&
10392 any_of(
zip(LoadEntriesToVectorize, LoadSetsToVectorize),
10393 [&, Slice = Slice](
const auto &
P) {
10395 return std::get<1>(
P).contains(V);
10397 if (It == Slice.end())
10399 const TreeEntry &
TE =
10400 *VectorizableTree[std::get<0>(
P)];
10406 VL, VL.
front(), Order, PointerOps, SPtrInfo);
10410 ConsecutiveNodesSize += VL.
size();
10411 size_t Start = std::distance(Slice.begin(), It);
10412 size_t Sz = Slice.size() -
Start;
10413 return Sz < VL.
size() ||
10414 Slice.slice(Start, VL.
size()) != VL;
10419 if (InterleaveFactor == 0 &&
10421 [&, Slice = Slice](
unsigned Idx) {
10423 SmallVector<Value *> PointerOps;
10424 StridedPtrInfo SPtrInfo;
10425 return canVectorizeLoads(
10426 Slice.slice(Idx * UserMaxVF, UserMaxVF),
10427 Slice[Idx * UserMaxVF], Order, PointerOps,
10428 SPtrInfo) == LoadsState::ScatterVectorize;
10431 if (Slice.size() != ConsecutiveNodesSize)
10432 MaxVF = std::min<unsigned>(MaxVF, UserMaxVF);
10434 for (
unsigned VF = MaxVF; VF >= 2; VF /= 2) {
10435 bool IsVectorized =
true;
10436 for (
unsigned I = 0,
E = Slice.size();
I <
E;
I += VF) {
10438 Slice.slice(
I, std::min(VF,
E -
I));
10443 if (
any_of(
zip(LoadEntriesToVectorize, LoadSetsToVectorize),
10444 [&](
const auto &
P) {
10445 return !SubSlice.
equals(
10446 VectorizableTree[std::get<0>(
P)]
10451 unsigned Sz = VectorizableTree.size();
10452 buildTreeRec(SubSlice, 0,
EdgeInfo(), InterleaveFactor);
10453 if (Sz == VectorizableTree.size()) {
10454 IsVectorized =
false;
10457 if (InterleaveFactor > 0) {
10458 VF = 2 * (MaxVF / InterleaveFactor);
10459 InterleaveFactor = 0;
10468 NonVectorized.
append(SortedNonVectorized);
10470 return NonVectorized;
10472 for (
const auto &GLs : GatheredLoads) {
10473 const auto &
Ref = GLs.second;
10475 if (!
Ref.empty() && !NonVectorized.
empty() &&
10478 [](
unsigned S,
ArrayRef<std::pair<LoadInst *, int64_t>> LoadsDists)
10479 ->
unsigned {
return S + LoadsDists.size(); }) !=
10480 NonVectorized.
size() &&
10481 IsMaskedGatherSupported(NonVectorized)) {
10483 FinalGatheredLoads;
10484 for (LoadInst *LI : NonVectorized) {
10488 FinalGatheredLoads,
10492 (void)ProcessGatheredLoads(FinalGatheredLoads,
true);
10496 for (
unsigned Idx : LoadEntriesToVectorize) {
10497 const TreeEntry &
E = *VectorizableTree[Idx];
10500 if (!
E.ReorderIndices.empty()) {
10503 SmallVector<int> ReorderMask;
10507 buildTreeRec(GatheredScalars, 0,
EdgeInfo());
10511 if (
static_cast<unsigned>(*GatheredLoadsEntriesFirst) ==
10512 VectorizableTree.size())
10513 GatheredLoadsEntriesFirst.reset();
10523 bool AllowAlternate) {
10529 if (LI->isSimple())
10540 SubKey =
hash_value(EI->getVectorOperand());
10547 if (AllowAlternate)
10558 std::pair<size_t, size_t> OpVals =
10566 if (CI->isCommutative())
10588 SubKey =
hash_value(Gep->getPointerOperand());
10600 return std::make_pair(
Key, SubKey);
10606 Instruction *AltOp,
const TargetLibraryInfo &TLI);
10612 const unsigned VF,
unsigned MinBW,
10635static std::pair<InstructionCost, InstructionCost>
10654 FMF = FPCI->getFastMathFlags();
10657 LibCost.isValid() ? LibCost : ScalarLimit);
10671 assert(L &&
"Expected valid loop");
10677 while (L && IsLoopInvariant(L, VL))
10678 L = L->getParentLoop();
10684 assert(L &&
"Expected valid loop");
10687 SmallVector<const Loop *> &Res =
10688 LoopToLoopNest.try_emplace(L).first->getSecond();
10691 SmallVector<const Loop *> LoopNest;
10694 L =
L->getParentLoop();
10721 const InstructionsState &S,
10725 assert(S && S.getOpcode() == Instruction::ExtractValue &&
10726 "Expected extractvalue instruction state.");
10732 for (
Value *V : VL) {
10734 Aggregates.push_back(VL0->getAggregateOperand());
10738 if (
IV->getIndices() != VL0Indices ||
10741 Value *Agg =
IV->getAggregateOperand();
10745 if (AggState && AggState.getOpcode() == Instruction::Call &&
10746 !AggState.isAltShuffle() &&
/// Decides how a bundle of scalars should be represented in the SLP tree.
///
/// The decision is driven by a switch over the bundle opcode; alternate-opcode
/// bundles (S.isAltShuffle()) are dispatched as Instruction::ShuffleVector.
/// The visible return values are TreeEntry::Vectorize,
/// TreeEntry::NeedToGather, TreeEntry::CompressVectorize,
/// TreeEntry::ScatterVectorize and TreeEntry::StridedVectorize.
///
/// \p CurrentOrder, \p PointerOps and \p SPtrInfo are taken by non-const
/// reference and are read by the extract and store paths below.
/// NOTE(review): several lines of this definition are not present in this
/// listing, so the comments below describe only what the visible lines show.
10771BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
10773 bool IsScatterVectorizeUserTE,
OrdersType &CurrentOrder,
10774 SmallVectorImpl<Value *> &PointerOps,
StridedPtrInfo &SPtrInfo) {
10776 "Expected instructions with same/alternate opcodes only.");
// Alternate-opcode bundles are handled by the ShuffleVector case.
10778 unsigned ShuffleOrOp =
10779 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.
getOpcode();
10781 switch (ShuffleOrOp) {
// PHI bundles: gathered when the visible terminator check on an incoming
// value fires (see the debug message); otherwise vectorized.
10782 case Instruction::PHI: {
10785 return TreeEntry::NeedToGather;
10787 for (
Value *V : VL) {
10791 for (
Value *Incoming :
PHI->incoming_values()) {
10793 if (Term &&
Term->isTerminator()) {
10795 <<
"SLP: Need to swizzle PHINodes (terminator use).\n");
10796 return TreeEntry::NeedToGather;
10801 return TreeEntry::Vectorize;
// Extracts: vectorized when canReuseExtract() succeeds or leaves a non-empty
// CurrentOrder; the remaining visible paths gather.
10803 case Instruction::ExtractElement:
10808 Value *
Op = EI->getOperand(0);
10812 return OpI && OpI->isTerminator();
10814 return TreeEntry::NeedToGather;
10816 case Instruction::ExtractValue: {
10817 bool Reuse = canReuseExtract(VL, CurrentOrder);
10818 if (Reuse || !CurrentOrder.empty())
10819 return TreeEntry::Vectorize;
10820 SmallVector<unsigned> Indices;
10822 if (ShuffleOrOp == Instruction::ExtractValue &&
10824 return TreeEntry::Vectorize;
10826 return TreeEntry::NeedToGather;
// InsertElement bundles: the debug strings name three gather reasons -
// insertelement/poison vectors, different source vectors, and multiple uses.
10828 case Instruction::InsertElement: {
10832 for (
Value *V : VL) {
10834 LLVM_DEBUG(
dbgs() <<
"SLP: Gather of insertelement/poison vector.\n");
10835 return TreeEntry::NeedToGather;
10839 "Non-constant or undef index?");
10843 return !SourceVectors.contains(V);
10846 LLVM_DEBUG(
dbgs() <<
"SLP: Gather of insertelement vectors with "
10847 "different source vectors.\n");
10848 return TreeEntry::NeedToGather;
10853 return SourceVectors.contains(V) && !
V->hasOneUse();
10856 LLVM_DEBUG(
dbgs() <<
"SLP: Gather of insertelement vectors with "
10857 "multiple uses.\n");
10858 return TreeEntry::NeedToGather;
10861 return TreeEntry::Vectorize;
// Loads: IsGatheredNode() reports whether a tree entry found via
// getTreeEntries(V) has an index at or past *GatheredLoadsEntriesFirst.
10863 case Instruction::Load: {
10870 auto IsGatheredNode = [&]() {
10871 if (!GatheredLoadsEntriesFirst)
10876 return any_of(getTreeEntries(V), [&](
const TreeEntry *TE) {
10877 return TE->Idx >= *GatheredLoadsEntriesFirst;
10883 return TreeEntry::Vectorize;
// For the compress / scatter / strided outcomes below the same pattern
// repeats: outside graph-transform mode, with more than one tree entry
// already built, the index of the upcoming entry is recorded in
// LoadEntriesToVectorize and the bundle is gathered for now.
10885 if (!IsGraphTransformMode && VectorizableTree.size() > 1) {
10887 LoadEntriesToVectorize.insert(VectorizableTree.size());
10888 return TreeEntry::NeedToGather;
10890 return IsGatheredNode() ? TreeEntry::NeedToGather
10891 : TreeEntry::CompressVectorize;
10893 if (!IsGraphTransformMode && VectorizableTree.size() > 1) {
10895 LoadEntriesToVectorize.insert(VectorizableTree.size());
10896 return TreeEntry::NeedToGather;
10898 return IsGatheredNode() ? TreeEntry::NeedToGather
10899 : TreeEntry::ScatterVectorize;
10901 if (!IsGraphTransformMode && VectorizableTree.size() > 1) {
10903 LoadEntriesToVectorize.insert(VectorizableTree.size());
10904 return TreeEntry::NeedToGather;
10906 return IsGatheredNode() ? TreeEntry::NeedToGather
10907 : TreeEntry::StridedVectorize;
// Gather path: only selects which diagnostic to print (non-packed type when
// the type size differs from its alloc size, or non-consecutive loads).
10911 if (DL->getTypeSizeInBits(ScalarTy) !=
10912 DL->getTypeAllocSizeInBits(ScalarTy))
10913 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering loads of non-packed type.\n");
10916 return !LI || !LI->isSimple();
10920 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering non-consecutive loads.\n");
10923 return TreeEntry::NeedToGather;
// Casts: gathered when the source types differ (per the debug message).
10927 case Instruction::ZExt:
10928 case Instruction::SExt:
10929 case Instruction::FPToUI:
10930 case Instruction::FPToSI:
10931 case Instruction::FPExt:
10932 case Instruction::PtrToInt:
10933 case Instruction::IntToPtr:
10934 case Instruction::SIToFP:
10935 case Instruction::UIToFP:
10936 case Instruction::Trunc:
10937 case Instruction::FPTrunc:
10938 case Instruction::BitCast: {
10940 for (
Value *V : VL) {
10946 dbgs() <<
"SLP: Gathering casts with different src types.\n");
10947 return TreeEntry::NeedToGather;
10950 return TreeEntry::Vectorize;
// Compares: every lane's predicate must equal P0 or SwapP0, and operand 0
// must have type ComparedTy; otherwise the bundle is gathered.
10952 case Instruction::ICmp:
10953 case Instruction::FCmp: {
10958 for (
Value *V : VL) {
10962 if ((
Cmp->getPredicate() != P0 &&
Cmp->getPredicate() != SwapP0) ||
10963 Cmp->getOperand(0)->getType() != ComparedTy) {
10964 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering cmp with different predicate.\n");
10965 return TreeEntry::NeedToGather;
10968 return TreeEntry::Vectorize;
// Selects: gathered when more than one distinct condition type is collected.
// No `return` or `break` between this check and the next case label is
// visible in this listing.
10970 case Instruction::Select:
10972 SmallPtrSet<Type *, 4> CondTypes;
10973 for (
Value *V : VL) {
10980 if (CondTypes.
size() > 1) {
10983 <<
"SLP: Gathering select with different condition types.\n");
10984 return TreeEntry::NeedToGather;
// Unary/binary arithmetic, logic, shifts and freeze: vectorized unless the
// result type is floating point, the target reports FP vectorization as
// potentially unsafe, and some lane is a binary op that is not fast.
10988 case Instruction::FNeg:
10989 case Instruction::Add:
10990 case Instruction::FAdd:
10991 case Instruction::Sub:
10992 case Instruction::FSub:
10993 case Instruction::Mul:
10994 case Instruction::FMul:
10995 case Instruction::UDiv:
10996 case Instruction::SDiv:
10997 case Instruction::FDiv:
10998 case Instruction::URem:
10999 case Instruction::SRem:
11000 case Instruction::FRem:
11001 case Instruction::Shl:
11002 case Instruction::LShr:
11003 case Instruction::AShr:
11004 case Instruction::And:
11005 case Instruction::Or:
11006 case Instruction::Xor:
11007 case Instruction::Freeze:
11008 if (S.getMainOp()->getType()->isFloatingPointTy() &&
11009 TTI->isFPVectorizationPotentiallyUnsafe() &&
any_of(VL, [](
Value *V) {
11011 return I &&
I->isBinaryOp() && !
I->isFast();
11013 return TreeEntry::NeedToGather;
11014 return TreeEntry::Vectorize;
// GEPs: three passes over VL - each GEP must have exactly two operands, all
// must share the source element type Ty0, and the index operand is checked
// against Ty1 and the address space's index width.
11015 case Instruction::GetElementPtr: {
11017 for (
Value *V : VL) {
11021 if (
I->getNumOperands() != 2) {
11022 LLVM_DEBUG(
dbgs() <<
"SLP: not-vectorizable GEP (nested indexes).\n");
11023 return TreeEntry::NeedToGather;
11030 for (
Value *V : VL) {
11034 Type *CurTy =
GEP->getSourceElementType();
11035 if (Ty0 != CurTy) {
11036 LLVM_DEBUG(
dbgs() <<
"SLP: not-vectorizable GEP (different types).\n");
11037 return TreeEntry::NeedToGather;
11043 for (
Value *V : VL) {
11047 auto *
Op =
I->getOperand(1);
11049 (
Op->getType() != Ty1 &&
11051 Op->getType()->getScalarSizeInBits() >
11052 DL->getIndexSizeInBits(
11053 V->getType()->getPointerAddressSpace())))) {
11055 dbgs() <<
"SLP: not-vectorizable GEP (non-constant indexes).\n");
11056 return TreeEntry::NeedToGather;
11060 return TreeEntry::Vectorize;
// Stores: gathered when the stored type's size differs from its alloc size
// (non-packed) or when any store is not simple.
11062 case Instruction::Store: {
11064 llvm::Type *ScalarTy =
cast<StoreInst>(VL0)->getValueOperand()->getType();
11067 if (DL->getTypeSizeInBits(ScalarTy) !=
11068 DL->getTypeAllocSizeInBits(ScalarTy)) {
11069 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering stores of non-packed type.\n");
11070 return TreeEntry::NeedToGather;
11074 for (
Value *V : VL) {
11076 if (!
SI->isSimple()) {
11078 return TreeEntry::NeedToGather;
// Pick the first and last pointers, honouring CurrentOrder when it is set.
11087 if (CurrentOrder.empty()) {
11088 Ptr0 = PointerOps.
front();
11089 PtrN = PointerOps.
back();
11091 Ptr0 = PointerOps[CurrentOrder.front()];
11092 PtrN = PointerOps[CurrentOrder.back()];
// A first-to-last distance of VL.size() - 1 yields Vectorize; otherwise a
// strided store is tried before falling back to gathering.
11095 std::optional<int64_t> Dist =
11098 if (
static_cast<uint64_t
>(*Dist) == VL.size() - 1)
11099 return TreeEntry::Vectorize;
11102 CurrentOrder, *Dist, Ptr0, SPtrInfo))
11103 return TreeEntry::StridedVectorize;
11107 return TreeEntry::NeedToGather;
// Calls: the same FP-safety gate as arithmetic (without the isBinaryOp
// test), then every lane is compared against the first call CI - vectorized
// function from VFDatabase, scalar arguments, and bundle operands - and
// finally both vector call costs must not be invalid.
11109 case Instruction::Call: {
11110 if (S.getMainOp()->getType()->isFloatingPointTy() &&
11111 TTI->isFPVectorizationPotentiallyUnsafe() &&
any_of(VL, [](
Value *V) {
11113 return I && !
I->isFast();
11115 return TreeEntry::NeedToGather;
11125 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
11129 return TreeEntry::NeedToGather;
11133 LLVM_DEBUG(
dbgs() <<
"SLP: Struct-returning calls have non-extractvalue "
11135 return TreeEntry::NeedToGather;
11138 unsigned NumArgs = CI->
arg_size();
11139 SmallVector<Value *, 4> ScalarArgs(NumArgs,
nullptr);
11140 for (
unsigned J = 0; J != NumArgs; ++J)
11143 for (
Value *V : VL) {
11148 VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) ||
11150 LLVM_DEBUG(
dbgs() <<
"SLP: mismatched calls:" << *CI <<
"!=" << *V
11152 return TreeEntry::NeedToGather;
11156 for (
unsigned J = 0; J != NumArgs; ++J) {
11159 if (ScalarArgs[J] != A1J) {
11161 <<
"SLP: mismatched arguments in call:" << *CI
11162 <<
" argument " << ScalarArgs[J] <<
"!=" << A1J <<
"\n");
11163 return TreeEntry::NeedToGather;
11172 LLVM_DEBUG(
dbgs() <<
"SLP: mismatched bundle operands in calls:" << *CI
11173 <<
"!=" << *V <<
'\n');
11174 return TreeEntry::NeedToGather;
11179 auto *VecTy =
getWidenedType(S.getMainOp()->getType(), VL.size());
11181 if (!VecCallCosts.first.isValid() && !VecCallCosts.second.isValid())
11182 return TreeEntry::NeedToGather;
11184 return TreeEntry::Vectorize;
// ShuffleVector: reached both for real shufflevector bundles
// (!S.isAltShuffle()) and for alternate-opcode bundles.
11186 case Instruction::ShuffleVector: {
11187 if (!S.isAltShuffle()) {
11190 return TreeEntry::Vectorize;
11193 LLVM_DEBUG(
dbgs() <<
"SLP: ShuffleVector are not vectorized.\n");
11194 return TreeEntry::NeedToGather;
11197 return TreeEntry::Vectorize;
// Final visible return: gather.
11201 return TreeEntry::NeedToGather;
11210 PHINode *Main =
nullptr;
11215 PHIHandler() =
delete;
11217 : DT(DT), Main(Main), Phis(Phis),
11218 Operands(Main->getNumIncomingValues(),
/// Fills Operands[I][Idx] with the value for incoming index I of lane Idx,
/// for every value in Phis.
///
/// Two strategies are visible. The first assigns, per lane, the PHI's
/// incoming value at index I when its incoming block is InBB, and otherwise
/// looks the value up with getIncomingValueForBlock(InBB). The second uses a
/// map from incoming block to incoming indices (Blocks): each lane writes
/// only into the first index recorded for its block, and the remaining
/// indices of that block are then filled by copying the whole operand row.
/// NOTE(review): the condition that selects between the two strategies is
/// not present in this listing - presumably it involves FastLimit; confirm
/// against the full source.
11220 void buildOperands() {
11221 constexpr unsigned FastLimit = 4;
11230 for (
auto [Idx, V] :
enumerate(Phis)) {
11234 "Expected isa instruction or poison value.");
// In some lanes the lane value V itself is stored (the guarding condition
// is not visible here).
11235 Operands[
I][Idx] =
V;
11238 if (
P->getIncomingBlock(
I) == InBB)
11239 Operands[
I][Idx] =
P->getIncomingValue(
I);
11241 Operands[
I][Idx] =
P->getIncomingValueForBlock(InBB);
// Second strategy: group incoming indices by incoming block.
11246 SmallMapVector<BasicBlock *, SmallVector<unsigned>, 4>
11256 for (
auto [Idx, V] :
enumerate(Phis)) {
11259 Operands[
I][Idx] =
V;
11268 Operands[
I][Idx] =
P->getIncomingValue(
I);
11271 auto *It = Blocks.
find(InBB);
11272 if (It == Blocks.
end())
// Write into the first incoming index recorded for this block.
11274 Operands[It->second.front()][Idx] =
P->getIncomingValue(
I);
// Blocks that appear at more than one incoming index: copy the row at
// BasicI into each such index; the assert expects the target row to be
// empty or already equal to it.
11277 for (
const auto &
P : Blocks) {
11278 ArrayRef<unsigned> IncomingValues =
P.second;
11279 if (IncomingValues.
size() <= 1)
11282 for (
unsigned I : IncomingValues) {
11284 [&](
const auto &
Data) {
11285 return !
Data.value() ||
11286 Data.value() == Operands[BasicI][
Data.index()];
11288 "Expected empty operands list.");
11289 Operands[
I] = Operands[BasicI];
11302static std::pair<Instruction *, Instruction *>
11306 for (
Value *V : VL) {
11316 if (MainOp->
getOpcode() ==
I->getOpcode()) {
11335 "Expected different main and alt instructions.");
11336 return std::make_pair(MainOp, AltOp);
11349 const InstructionsState &S,
11351 const BoUpSLP &R,
bool BuildGatherOnly =
true) {
11355 ReuseShuffleIndices.
clear();
11361 for (
Value *V : VL) {
11378 bool RequireScheduling = S && S.getOpcode() != Instruction::PHI &&
11380 (S.areInstructionsWithCopyableElements() ||
11385 bool AreAllValuesNonConst = UniquePositions.
size() == UniqueValues.
size();
11387 if (RequireScheduling) {
11391 assert(EndIt != UniqueValues.
rend() &&
"Expected at least one non-poison.");
11392 UniqueValues.
erase(EndIt.base(), UniqueValues.
end());
11394 unsigned NumUniqueScalarValues = UniqueValues.
size();
11395 if (NumUniqueScalarValues == VL.
size()) {
11396 ReuseShuffleIndices.
clear();
11404 constexpr unsigned SmallVecWidth = 4;
11405 constexpr unsigned SmallVecUniqueThreshold = 3;
11406 if (VL.
size() == SmallVecWidth &&
11407 NumUniqueScalarValues == SmallVecUniqueThreshold && !BuildGatherOnly &&
11408 !(S && (S.getOpcode() == Instruction::Load ||
11409 S.getOpcode() == Instruction::PHI))) {
11411 ReuseShuffleIndices.
clear();
11417 auto EstimatePackPlusShuffleVsInserts = [&]() {
11419 if (UniquePositions.
size() == 1 &&
11420 (NumUniqueScalarValues == 1 ||
11422 return std::make_pair(
false,
false);
11426 constexpr unsigned MinVLForConstGatherCheck = 4;
11427 if (BuildGatherOnly && VL.
size() > MinVLForConstGatherCheck &&
11430 UniquePositions.
size() * 2 < NumUniqueScalarValues)
11431 return std::make_pair(
false,
false);
11433 assert(S && S.getOpcode() == Instruction::Load &&
"Expected load.");
11438 PointerOps, SPtrInfo);
11452 bool IsRootOperand =
11453 UserTreeIdx.
UserTE && UserTreeIdx.
UserTE->Idx == 0 && !BuildGatherOnly;
11454 if (IsRootOperand) {
11455 if (S && S.getOpcode() == Instruction::Load) {
11456 bool UseOrig = (CheckLoads(UniqueValues,
true) &&
11457 CheckLoads(VL,
false)) ||
11459 ReuseShuffleIndices, ReuseShuffleIndices.
size());
11460 return std::make_pair(
true, UseOrig);
11462 return std::make_pair(
true, !RequireScheduling);
11465 for (
auto [Idx, Val] :
enumerate(ReuseShuffleIndices))
11467 DemandedElts.
setBit(Idx);
11470 auto *UniquesVecTy =
11473 const unsigned UniquesNumParts =
11477 if (!RequireScheduling) {
11478 if (VL.
size() / NumUniqueScalarValues == 1 &&
11479 (NumParts <= 1 || UniquesNumParts >= NumParts))
11480 return std::make_pair(
true,
true);
11483 if (S && S.getOpcode() == Instruction::PHI && NumUniqueScalarValues > 1 &&
11484 UniquesNumParts <= NumParts)
11485 return std::make_pair(
true,
false);
11500 if (S && !BuildGatherOnly) {
11501 bool HasOneDup = S.getOpcode() != Instruction::Load &&
11502 NumUniqueScalarValues + 1 == VL.
size();
11503 bool MostlyUnique = NumUniqueScalarValues * 2 > VL.
size();
11504 bool IsHalfUniqueValues =
11505 NumUniqueScalarValues * 2 == VL.
size() &&
11506 (S.getOpcode() == Instruction::GetElementPtr ||
11509 NumParts * (VL.
size() > SmallVecWidth ? 1 : 2);
11511 ((MostlyUnique || IsHalfUniqueValues) && ReusesCost >
CostThreshold))
11512 return std::make_pair(
true,
true);
11517 if (S && S.getOpcode() == Instruction::Load) {
11518 bool UniquesVectorized =
11519 CheckLoads(UniqueValues,
false);
11520 if (UniquesVectorized || CheckLoads(VL,
false))
11521 return std::make_pair(
true, !UniquesVectorized);
11523 bool CanSkipBVCost =
11524 (!BuildGatherOnly && !RequireScheduling) || R.hasSameNode(S, VL);
11530 CostKind, AreAllValuesNonConst, VL);
11532 for (
const auto [Idx, V] :
enumerate(UniqueValues))
11534 UniquesDemandedElts.
clearBit(Idx);
11539 UniquesDemandedElts,
true,
11541 AreAllValuesNonConst, UniqueValues);
11542 UniquesCost += ReusesCost;
11543 if (UniquesCost <= InsertsCost)
11544 return std::make_pair(
true,
false);
11547 (R.getTreeSize() == 0 && R.isReductionTree() &&
11549 return std::make_pair(S && (!S.isAltShuffle() || !BuildGatherOnly),
11553 bool KeepOriginal = !BuildGatherOnly && !RequireScheduling;
11554 return std::make_pair(KeepOriginal, KeepOriginal);
11557 const auto [PackProfitable, UseOriginal] = EstimatePackPlusShuffleVsInserts();
11559 if (PackProfitable) {
11562 ReuseShuffleIndices.
clear();
11566 VL = std::move(UniqueValues);
11573 ReuseShuffleIndices.
clear();
11578 const InstructionsState &LocalState,
11582 constexpr unsigned SmallNodeSize = 4;
11583 if (VL.
size() <= SmallNodeSize || TTI->preferAlternateOpcodeVectorization() ||
11588 LLVM_DEBUG(
dbgs() <<
"SLP: \tChecking bundle: " << *LocalState.getMainOp()
11590 for (TreeEntry *E : getSplitTreeEntries(LocalState.getMainOp())) {
11591 if (E->isSame(VL)) {
11593 << *LocalState.getMainOp() <<
".\n");
11611 Op1Indices.
set(Idx);
11614 if ((LocalState.getAltOpcode() != LocalState.getOpcode() &&
11617 (LocalState.getAltOpcode() == LocalState.getOpcode() &&
11619 LocalState.getAltOp(), *TLI))) {
11621 Op1Indices.
set(Idx);
11628 unsigned Opcode0 = LocalState.getOpcode();
11629 unsigned Opcode1 = LocalState.getAltOpcode();
11635 if (UOp1.
size() <= 1 || UOp2.
size() <= 1 ||
11636 TTI->isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask))
11638 unsigned Op1Cnt = 0, Op2Cnt = Op1.
size();
11640 if (Op1Indices.
test(Idx)) {
11641 ReorderIndices[Op1Cnt] = Idx;
11644 ReorderIndices[Op2Cnt] = Idx;
11649 ReorderIndices.
clear();
11659 if (!ReorderIndices.
empty())
11661 unsigned NumParts = TTI->getNumberOfParts(VecTy);
11666 if (NumParts >= VL.
size())
11675 if (!LocalState.isCmpOp() && NumParts <= 1 &&
11676 (Mask.empty() || InsertCost >= NewShuffleCost))
11678 if ((LocalState.getMainOp()->isBinaryOp() &&
11679 LocalState.getAltOp()->isBinaryOp() &&
11680 (LocalState.isShiftOp() || LocalState.isBitwiseLogicOp() ||
11681 LocalState.isAddSubLikeOp() || LocalState.isMulDivLikeOp())) ||
11682 (LocalState.getMainOp()->isCast() && LocalState.getAltOp()->isCast()) ||
11683 (LocalState.getMainOp()->isUnaryOp() &&
11684 LocalState.getAltOp()->isUnaryOp())) {
11686 TTI->getArithmeticInstrCost(Opcode0, VecTy, Kind) +
11687 TTI->getArithmeticInstrCost(Opcode1, VecTy, Kind);
11692 OriginalMask[Idx] = Idx + (Op1Indices.
test(Idx) ? 0 : VL.
size());
11696 VecTy, OriginalMask, Kind);
11698 TTI->getArithmeticInstrCost(Opcode0, Op1VecTy, Kind) +
11699 TTI->getArithmeticInstrCost(Opcode1, Op2VecTy, Kind);
11701 NewVecOpsCost + InsertCost +
11702 (!VectorizableTree.empty() && VectorizableTree.front()->hasState() &&
11703 VectorizableTree.front()->getOpcode() == Instruction::Store
11707 if (NewCost >= OriginalCost)
11717class InstructionsCompatibilityAnalysis {
11722 unsigned MainOpcode = 0;
/// Returns true when \p Opcode is one of Add, Sub, LShr, Shl, SDiv, UDiv,
/// And, Or, Xor, FAdd, FSub, FMul or FDiv, and false for every other opcode.
/// NOTE(review): the same thirteen opcodes appear as case labels in a switch
/// on MainOpcode later in this class; keep the two lists in sync if either
/// changes.
11727 static bool isSupportedOpcode(
const unsigned Opcode) {
11728 return Opcode == Instruction::Add || Opcode == Instruction::Sub ||
11729 Opcode == Instruction::LShr || Opcode == Instruction::Shl ||
11730 Opcode == Instruction::SDiv || Opcode == Instruction::UDiv ||
11731 Opcode == Instruction::And || Opcode == Instruction::Or ||
11732 Opcode == Instruction::Xor || Opcode == Instruction::FAdd ||
11733 Opcode == Instruction::FSub || Opcode == Instruction::FMul ||
11734 Opcode == Instruction::FDiv;
11744 auto IsSupportedInstruction = [&](
Instruction *
I,
bool AnyUndef) {
11745 if (AnyUndef && (
I->isIntDivRem() ||
I->isFPDivRem() ||
isa<CallInst>(
I)))
11747 return I && isSupportedOpcode(
I->getOpcode()) &&
11752 SmallDenseSet<Value *, 8> Operands;
11753 SmallMapVector<unsigned, SmallVector<Instruction *>, 4> Candidates;
11754 bool AnyUndef =
false;
11755 for (
Value *V : VL) {
11763 if (Candidates.
empty()) {
11764 Candidates.
try_emplace(
I->getOpcode()).first->second.push_back(
I);
11766 Operands.
insert(
I->op_begin(),
I->op_end());
11769 if (Parent ==
I->getParent()) {
11770 Candidates.
try_emplace(
I->getOpcode()).first->second.push_back(
I);
11771 Operands.
insert(
I->op_begin(),
I->op_end());
11774 auto *NodeA = DT.
getNode(Parent);
11775 auto *NodeB = DT.
getNode(
I->getParent());
11776 assert(NodeA &&
"Should only process reachable instructions");
11777 assert(NodeB &&
"Should only process reachable instructions");
11778 assert((NodeA == NodeB) ==
11779 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
11780 "Different nodes should have different DFS numbers");
11781 if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn()) {
11782 Candidates.
clear();
11783 Candidates.
try_emplace(
I->getOpcode()).first->second.push_back(
I);
11786 Operands.
insert(
I->op_begin(),
I->op_end());
11789 unsigned BestOpcodeNum = 0;
11791 bool UsedOutside =
false;
11792 for (
const auto &
P : Candidates) {
11794 if (UsedOutside && !PUsedOutside)
11796 if (!UsedOutside && PUsedOutside)
11798 if (
P.second.size() < BestOpcodeNum)
11801 if (!PUsedOutside &&
any_of(
P.second, [&](Instruction *
I) {
11802 return Operands.contains(I);
11809 if (
P.second.size() == BestOpcodeNum) {
11810 auto *
I =
P.second.front();
11814 if (MainBOOp0 && MainBOOp0->getOpcode() ==
I->getOpcode() &&
11815 MainBOOp0->getParent() ==
I->getParent())
11817 if (MainBOOp1 && MainBOOp1->getOpcode() ==
I->getOpcode() &&
11818 MainBOOp1->getParent() ==
I->getParent())
11822 UsedOutside = PUsedOutside;
11823 for (Instruction *
I :
P.second) {
11824 if (IsSupportedInstruction(
I, AnyUndef)) {
11826 BestOpcodeNum =
P.second.size();
11836 return I &&
I->getParent() == MainOp->
getParent() &&
11849 Value *selectBestIdempotentValue()
const {
11850 assert(isSupportedOpcode(MainOpcode) &&
"Unsupported opcode");
11861 if (!S.isCopyableElement(V))
11863 assert(isSupportedOpcode(MainOpcode) &&
"Unsupported opcode");
11864 return {
V, selectBestIdempotentValue()};
11870 SmallVectorImpl<BoUpSLP::ValueList> &Operands)
const {
11872 unsigned ShuffleOrOp =
11873 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.
getOpcode();
11876 switch (ShuffleOrOp) {
11877 case Instruction::PHI: {
11881 PHIHandler Handler(DT, PH, VL);
11882 Handler.buildOperands();
11883 Operands.
assign(PH->getNumOperands(), {});
11885 Operands[
I].
assign(Handler.getOperands(
I).begin(),
11886 Handler.getOperands(
I).end());
11889 case Instruction::ExtractValue: {
11890 SmallVector<unsigned> Indices;
11894 Operands[0].
swap(Calls);
11899 case Instruction::ExtractElement:
11904 case Instruction::InsertElement:
11912 case Instruction::Load:
11916 for (
auto [V,
Op] :
zip(VL, Operands.
back())) {
11920 Op = LI->getPointerOperand();
11923 case Instruction::ZExt:
11924 case Instruction::SExt:
11925 case Instruction::FPToUI:
11926 case Instruction::FPToSI:
11927 case Instruction::FPExt:
11928 case Instruction::PtrToInt:
11929 case Instruction::IntToPtr:
11930 case Instruction::SIToFP:
11931 case Instruction::UIToFP:
11932 case Instruction::Trunc:
11933 case Instruction::FPTrunc:
11934 case Instruction::BitCast:
11935 case Instruction::ICmp:
11936 case Instruction::FCmp:
11937 case Instruction::FNeg:
11938 case Instruction::Add:
11939 case Instruction::FAdd:
11940 case Instruction::Sub:
11941 case Instruction::FSub:
11942 case Instruction::Mul:
11943 case Instruction::FMul:
11944 case Instruction::UDiv:
11945 case Instruction::SDiv:
11946 case Instruction::FDiv:
11947 case Instruction::URem:
11948 case Instruction::SRem:
11949 case Instruction::FRem:
11950 case Instruction::Shl:
11951 case Instruction::LShr:
11952 case Instruction::AShr:
11953 case Instruction::And:
11954 case Instruction::Or:
11955 case Instruction::Xor:
11956 case Instruction::Freeze:
11957 case Instruction::Store:
11958 case Instruction::ShuffleVector:
11967 auto [
Op, ConvertedOps] = convertTo(
I, S);
11972 case Instruction::Select:
11986 Operands[0][Idx] =
I->getOperand(0);
11987 Operands[1][Idx] = ConstantInt::get(
I->getType(), 1);
11988 Operands[2][Idx] = ConstantInt::getNullValue(
I->getType());
11991 auto [
Op, ConvertedOps] = convertTo(
I, S);
11996 case Instruction::GetElementPtr: {
12003 const unsigned IndexIdx = 1;
12009 return !
GEP || VL0Ty ==
GEP->getOperand(IndexIdx)->getType();
12013 ->getPointerOperandType()
12014 ->getScalarType());
12018 Operands[0][Idx] =
V;
12019 Operands[1][Idx] = ConstantInt::getNullValue(Ty);
12022 Operands[0][Idx] =
GEP->getPointerOperand();
12023 auto *
Op =
GEP->getOperand(IndexIdx);
12026 CI, Ty, CI->getValue().isSignBitSet(),
DL)
12031 case Instruction::Call: {
12038 for (
Value *V : VL) {
12040 Ops.push_back(
I ?
I->getOperand(Idx)
12059 const InstructionsState &S,
12060 const InstructionsState &CopyableS) {
12067 Instruction *SAlt = S.isAltShuffle() ? S.getAltOp() :
nullptr;
12069 const bool IsAltCommutative =
12073 buildOriginalOperands(S, SMain,
Ops);
12075 if (
Ops.size() != 2)
12087 auto *I = dyn_cast<Instruction>(V);
12088 return I && I->getOpcode() == SMainOpI->getOpcode();
12091 SmallPtrSet<Value *, 8> Operands;
12092 for (
Value *V : VL) {
12094 if (!
I ||
I == SMain)
12096 Instruction *MatchingOp = S.getMatchingMainOpOrAltOp(
I);
12097 if (MatchingOp != SMain)
12100 buildOriginalOperands(S,
I, VOps);
12101 Operands.
insert(
I->op_begin(),
I->op_end());
12103 "Expected binary operations only.");
12104 if (CheckOperands(VOps[0][0],
Ops[0][0]) ||
12105 CheckOperands(VOps[1][0],
Ops[1][0]) ||
12106 (IsCommutative && (CheckOperands(VOps[0][0],
Ops[1][0]) ||
12107 CheckOperands(VOps[1][0],
Ops[0][0])))) {
12114 buildOriginalOperands(S, MainOp, MainOps);
12116 auto BuildFirstOperandCandidates =
12117 [&](SmallVectorImpl<std::pair<Value *, Value *>> &Candidates,
12119 bool IsCommutative) {
12125 auto BuildSecondOperandCandidates =
12126 [&](SmallVectorImpl<std::pair<Value *, Value *>> &Candidates,
12128 Value *Op1,
bool IsCommutative) {
12129 if (PrevBestIdx != 1)
12131 if (PrevBestIdx != 0 && IsCommutative)
12135 auto FindBestCandidate =
12138 auto Res =
R.findBestRootPair(Candidates);
12139 Score = Res.second;
12142 isConstant(Candidates[Res.first.value_or(0)].first) &&
12143 isConstant(Candidates[Res.first.value_or(0)].second);
12147 for (
const auto [Idx,
P] :
enumerate(Candidates)) {
12149 P.second ==
P.first) {
12152 Score =
isa<LoadInst>(Candidates[Res.first.value_or(0)].first)
12162 for (
Value *V : VL) {
12164 if (!
I || (
I == MainOp && (!S.isAltShuffle() ||
I == SMain)) ||
12165 (!S.isAltShuffle() &&
I == SMain))
12168 buildOriginalOperands(S,
I == SMain ? MainOp :
I, VOps);
12170 getOperands(CopyableS,
I == MainOp ? SMain :
I);
12171 if (CopyableOps.
size() == VOps.
size() &&
12172 all_of(
zip(CopyableOps, VOps), [&](
const auto &
P) {
12173 return std::get<0>(
P) == std::get<1>(
P)[0];
12177 BuildFirstOperandCandidates(Candidates, MainOps, CopyableOps[0],
12178 CopyableOps[1], IsMainCommutative);
12179 const unsigned OpSize = Candidates.
size();
12181 S.getMatchingMainOpOrAltOp(
I) == S.getMainOp() ? SMain : SAlt;
12182 const bool IsCommutativeInst =
12183 (MatchingOp == SMain ? IsCommutative : IsAltCommutative) ||
12185 if (S.isAltShuffle() && MatchingOp == SAlt &&
12191 if (S.isAltShuffle() && MatchingOp == SMain)
12192 Operands.
insert(
I->op_begin(),
I->op_end());
12193 BuildFirstOperandCandidates(Candidates,
Ops, VOps[0][0], VOps[1][0],
12194 IsCommutativeInst);
12197 std::optional<int> BestOp =
12198 FindBestCandidate(Candidates, IsBestConst, Score);
12199 const bool IsOriginalBetter =
12200 static_cast<unsigned>(BestOp.value_or(OpSize)) >= OpSize;
12201 Candidates.
clear();
12202 BuildSecondOperandCandidates(
12203 Candidates, MainOps, IsOriginalBetter ? -1 : *BestOp, CopyableOps[0],
12204 CopyableOps[1], IsMainCommutative);
12205 const unsigned SecondOpSize = Candidates.
size();
12206 BuildSecondOperandCandidates(
12208 IsOriginalBetter ? BestOp.value_or(OpSize - 1) - OpSize : -1,
12209 VOps[0][0], VOps[1][0], IsCommutativeInst);
12210 bool IsSecondBestConst;
12212 std::optional<int> SecondBestOp =
12213 FindBestCandidate(Candidates, IsSecondBestConst, SecondScore);
12215 if (!BestOp && !SecondBestOp)
12218 const bool IsSecondOriginalBetter =
12219 static_cast<unsigned>(SecondBestOp.value_or(SecondOpSize)) >=
12221 if (IsOriginalBetter && IsSecondOriginalBetter)
12225 if (!BestOp && IsSecondOriginalBetter)
12229 if (!SecondBestOp && IsOriginalBetter)
12233 if (!IsOriginalBetter && IsBestConst && IsSecondOriginalBetter &&
12234 !IsSecondBestConst)
12238 if (BestOp && IsOriginalBetter && !IsBestConst &&
12239 !IsSecondOriginalBetter && IsSecondBestConst)
12242 if (((Score > SecondScore ||
12244 Score == SecondScore)) &&
12245 IsOriginalBetter) ||
12246 (IsSecondOriginalBetter &&
12247 (SecondScore > Score ||
12249 Score == SecondScore))))
12256 InstructionsCompatibilityAnalysis(DominatorTree &DT,
const DataLayout &
DL,
12257 const TargetTransformInfo &
TTI,
12258 const TargetLibraryInfo &TLI)
12263 bool WithProfitabilityCheck =
false,
12264 bool SkipSameCodeCheck =
false) {
12265 InstructionsState S = (SkipSameCodeCheck || !
allSameBlock(VL))
12266 ? InstructionsState::invalid()
12278 return (ZExt && ZExt->getSrcTy()->isIntegerTy(1)) ||
12282 return InstructionsState(SelectOp, SelectOp);
12284 if (S && S.isAltShuffle()) {
12285 Type *ScalarTy = S.getMainOp()->getType();
12287 unsigned Opcode0 = S.getOpcode();
12288 unsigned Opcode1 = S.getAltOpcode();
12289 SmallBitVector OpcodeMask(
12298 return !
I ||
I->getOpcode() == S.getOpcode() ||
12299 (S.getOpcode() == Instruction::Add &&
12300 I->getOpcode() == Instruction::Shl);
12306 findAndSetMainInstruction(VL, R);
12309 InstructionsState OrigS = S;
12310 S = InstructionsState(MainOp, MainOp,
true);
12311 if (OrigS && !isCopyablePreferable(VL, R, OrigS, S))
12313 if (!WithProfitabilityCheck)
12317 auto BuildCandidates =
12318 [](SmallVectorImpl<std::pair<Value *, Value *>> &Candidates,
Value *V1,
12324 if (I1 && I2 &&
I1->getOpcode() == I2->getOpcode() &&
12325 I1->getParent() != I2->getParent())
12329 if (VL.
size() == 2) {
12332 BuildCandidates(Candidates1, Operands[0][0], Operands[0][1]);
12333 BuildCandidates(Candidates2, Operands[1][0], Operands[1][1]);
12334 bool Res = !Candidates1.
empty() && !Candidates2.
empty() &&
12335 R.findBestRootPair(Candidates1).first &&
12336 R.findBestRootPair(Candidates2).first;
12338 Candidates1.
clear();
12339 Candidates2.
clear();
12340 BuildCandidates(Candidates1, Operands[0][0], Operands[1][1]);
12341 BuildCandidates(Candidates2, Operands[1][0], Operands[0][1]);
12342 Res = !Candidates1.
empty() && !Candidates2.
empty() &&
12343 R.findBestRootPair(Candidates1).first &&
12344 R.findBestRootPair(Candidates2).first;
12352 switch (MainOpcode) {
12353 case Instruction::Add:
12354 case Instruction::Sub:
12355 case Instruction::LShr:
12356 case Instruction::Shl:
12357 case Instruction::SDiv:
12358 case Instruction::UDiv:
12359 case Instruction::And:
12360 case Instruction::Or:
12361 case Instruction::Xor:
12362 case Instruction::FAdd:
12363 case Instruction::FMul:
12364 case Instruction::FSub:
12365 case Instruction::FDiv:
12371 if (VectorCost > ScalarCost)
12375 assert(Operands.
size() == 2 &&
"Unexpected number of operands!");
12376 unsigned CopyableNum =
12377 count_if(VL, [&](
Value *V) {
return S.isCopyableElement(V); });
12378 if (CopyableNum < VL.
size() / 2)
12381 const unsigned Limit = VL.
size() / 24;
12382 if ((CopyableNum >= VL.
size() - Limit ||
12383 (CopyableNum >= VL.
size() - 1 && VL.
size() > 4) ||
12392 Value *BestFrontOp =
nullptr;
12393 for (
auto [OpL, OpR] :
zip(Operands.
front(), Operands.
back())) {
12414 const unsigned BestOpcode = BestLHS->getOpcode();
12415 for (
auto [OpL, OpR] :
zip(Operands.
front(), Operands.
back())) {
12419 if (OpRI->getOpcode() == BestOpcode)
12432 constexpr unsigned Limit = 4;
12433 if (Operands.
front().size() >= Limit) {
12434 SmallDenseMap<const Value *, unsigned>
Counters;
12442 return C.second == 1;
12448 InstructionsCompatibilityAnalysis
Analysis(DT,
DL,
TTI, TLI);
12449 InstructionsState OpS =
Analysis.buildInstructionsState(
Ops, R);
12450 if (!OpS || (OpS.getOpcode() == Instruction::PHI && !
allSameBlock(
Ops)))
12452 unsigned CopyableNum =
12454 return CopyableNum <= VL.
size() / 2;
12456 if (!CheckOperand(Operands.
front()))
12464 assert(S &&
"Invalid state!");
12466 if (S.areInstructionsWithCopyableElements()) {
12467 MainOp = S.getMainOp();
12468 MainOpcode = S.getOpcode();
12469 const bool IsCommutative =
12476 for (
auto [OperandIdx, Operand] :
enumerate(OperandsForValue))
12477 Operands[OperandIdx][Idx] = Operand;
12483 if (IsCommutative) {
12489 unsigned FwdCount = 0;
12490 unsigned RevCount = 0;
12492 SmallMapVector<std::pair<unsigned, unsigned>, PairInfo, 8> PairCounts;
12493 unsigned MajID0 = 0, MajID1 = 0;
12497 unsigned ID0 = Operands[0][Idx]->getValueID();
12498 unsigned ID1 = Operands[1][Idx]->getValueID();
12501 unsigned MinID = std::min(ID0, ID1);
12502 unsigned MaxID = std::max(ID0, ID1);
12505 PairInfo &
Info = It->second;
12514 unsigned BestCount = 0;
12515 for (
const auto &
P : PairCounts) {
12516 const PairInfo &
Info =
P.second;
12518 if (
Total > BestCount) {
12520 if (
Info.FwdCount >=
Info.RevCount) {
12521 MajID0 =
P.first.first;
12522 MajID1 =
P.first.second;
12524 MajID0 =
P.first.second;
12525 MajID1 =
P.first.first;
12536 unsigned LAt0 = 0, LAt1 = 0, TotalNC = 0;
12541 if (BestCount > 0) {
12542 unsigned ID0 = Operands[0][Idx]->getValueID();
12543 unsigned ID1 = Operands[1][Idx]->getValueID();
12544 if (ID0 == MajID1 && ID1 == MajID0)
12545 std::swap(Operands[0][Idx], Operands[1][Idx]);
12553 if (TotalNC > 1 && LAt1 > LAt0 && LAt1 * 2 > TotalNC) {
12559 std::swap(Operands[0][Idx], Operands[1][Idx]);
12564 buildOriginalOperands(S, VL, Operands);
12571BoUpSLP::ScalarsVectorizationLegality
12573 const EdgeInfo &UserTreeIdx)
const {
12576 InstructionsCompatibilityAnalysis
Analysis(*DT, *DL, *TTI, *TLI);
12577 InstructionsState S =
Analysis.buildInstructionsState(
12580 bool AreScatterAllGEPSameBlock =
false;
12582 SmallVector<unsigned> SortedIndices;
12584 bool IsScatterVectorizeUserTE =
12585 UserTreeIdx.UserTE &&
12586 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
12587 AreScatterAllGEPSameBlock =
12601 *SE, SortedIndices));
12602 if (!AreScatterAllGEPSameBlock) {
12603 LLVM_DEBUG(
dbgs() <<
"SLP: Try split and if failed, gathering due to "
12604 "C,S,B,O, small shuffle. \n";
12608 return ScalarsVectorizationLegality(S,
false,
12614 assert(It != VL.
end() &&
"Expected at least one GEP.");
12617 assert(S &&
"Must be valid.");
12622 if (S.getOpcode() == Instruction::PHI) {
12623 unsigned NumIncomingValues =
12625 if (
static_cast<uint64_t
>(VL.
size()) * NumIncomingValues >
12627 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering due to wide PHI operand fan-out ("
12628 << VL.
size() <<
" lanes x " << NumIncomingValues
12629 <<
" incoming values).\n");
12630 return ScalarsVectorizationLegality(S,
false);
12638 return ScalarsVectorizationLegality(S,
false,
12644 BasicBlock *BB = S.getMainOp()->getParent();
12647 !DT->isReachableFromEntry(BB)) {
12653 return ScalarsVectorizationLegality(S,
false);
12662 return ScalarsVectorizationLegality(S,
false,
12667 if (S.getOpcode() == Instruction::ExtractElement &&
12670 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering due to scalable vector type.\n");
12671 return ScalarsVectorizationLegality(S,
false);
12678 (S.isAltShuffle() || VL.
size() < 4 ||
12685 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering due to max recursion depth.\n");
12686 return ScalarsVectorizationLegality(S,
false);
12690 LLVM_DEBUG(
dbgs() <<
"SLP: \tChecking bundle: " << *S.getMainOp() <<
".\n");
12692 const bool IsPHIWithLoop =
12693 S.getOpcode() == Instruction::PHI &&
12694 LI->getLoopFor(S.getMainOp()->getParent()) !=
nullptr;
12695 for (TreeEntry *
E : getTreeEntries(S.getMainOp())) {
12696 if (
E->isSame(VL)) {
12697 LLVM_DEBUG(
dbgs() <<
"SLP: Perfect diamond merge at " << *S.getMainOp()
12699 return ScalarsVectorizationLegality(S,
false);
12707 return ScalarsVectorizationLegality(S,
false);
12711 bool AreAllSameBlock = !AreScatterAllGEPSameBlock;
12712 bool AreAllSameInsts = AreAllSameBlock || AreScatterAllGEPSameBlock;
12713 if (!AreAllSameInsts ||
isSplat(VL) ||
12717 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering due to C,S,B,O conditions. \n";
12721 return ScalarsVectorizationLegality(S,
false);
12725 if (!EphValues.empty()) {
12726 for (
Value *V : VL) {
12727 if (EphValues.count(V)) {
12729 <<
") is ephemeral.\n");
12731 return ScalarsVectorizationLegality(S,
false,
12743 if (S.isAltShuffle()) {
12744 auto GetNumVectorizedExtracted = [&]() {
12750 all_of(
I->operands(), [&](
const Use &U) {
12751 return isa<ExtractElementInst>(U.get());
12756 else if (!
I->hasOneUser() && !areAllUsersVectorized(
I, UserIgnoreList))
12759 return std::make_pair(Vectorized, Extracted);
12761 auto [Vectorized, Extracted] = GetNumVectorizedExtracted();
12763 bool PreferScalarize = !Vectorized.
isAllOnes() && VL.size() == 2;
12764 if (!Vectorized.
isAllOnes() && !PreferScalarize) {
12767 Type *ScalarTy = VL.front()->getType();
12772 false,
true, Kind);
12774 *TTI, ScalarTy, VecTy, Vectorized,
12775 true,
false, Kind,
false);
12776 PreferScalarize = VectorizeCostEstimate > ScalarizeCostEstimate;
12778 if (PreferScalarize) {
12779 LLVM_DEBUG(
dbgs() <<
"SLP: The instructions are in tree and alternate "
12780 "node is not profitable.\n");
12781 return ScalarsVectorizationLegality(S,
false);
12786 if (UserIgnoreList && !UserIgnoreList->empty()) {
12787 for (
Value *V : VL) {
12788 if (UserIgnoreList->contains(V)) {
12789 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering due to gathered scalar.\n");
12790 return ScalarsVectorizationLegality(S,
false);
12795 return ScalarsVectorizationLegality(S,
true);
12800 unsigned InterleaveFactor) {
12803 SmallVector<int> ReuseShuffleIndices;
12807 auto TrySplitNode = [&](
const InstructionsState &LocalState) {
12813 auto Invalid = ScheduleBundle::invalid();
12814 auto *
TE = newTreeEntry(VL, TreeEntry::SplitVectorize,
Invalid, LocalState,
12815 UserTreeIdx, {}, ReorderIndices);
12820 getSameValuesTreeEntry(S.getMainOp(),
Op,
true))) {
12822 TE->CombinedEntriesWithIndices.emplace_back(VectorizableTree.size(),
12823 Idx == 0 ? 0 : Op1.
size());
12824 (void)newTreeEntry(
Op, TreeEntry::NeedToGather,
Invalid, S, {
TE, Idx});
12826 TE->CombinedEntriesWithIndices.emplace_back(VectorizableTree.size(),
12827 Idx == 0 ? 0 : Op1.
size());
12837 bool AreConsts =
false;
12838 for (
Value *V : VL) {
12850 if (AreOnlyConstsWithPHIs(VL)) {
12851 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering due to all constants and PHIs.\n");
12852 newGatherTreeEntry(VL, InstructionsState::invalid(), UserTreeIdx);
12856 ScalarsVectorizationLegality Legality =
12857 getScalarsVectorizationLegality(VL,
Depth, UserTreeIdx);
12858 InstructionsState S = Legality.getInstructionsState();
12859 if (!Legality.isLegal()) {
12860 if (Legality.trySplitVectorize()) {
12863 if (MainOp && AltOp && TrySplitNode(InstructionsState(MainOp, AltOp)))
12866 if (Legality.tryToFindDuplicates())
12868 UserTreeIdx, *
this);
12870 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
12875 if (S.isAltShuffle() && TrySplitNode(S))
12881 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
12886 bool IsScatterVectorizeUserTE =
12887 UserTreeIdx.UserTE &&
12888 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
12892 TreeEntry::EntryState State = getScalarsVectorizationState(
12893 S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps, SPtrInfo);
12894 if (State == TreeEntry::NeedToGather) {
12895 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
12902 if (VectorizableTree.empty()) {
12903 assert(CurrentLoopNest.empty() &&
"Expected empty loop nest");
12905 BasicBlock *Parent = S.getMainOp()->getParent();
12906 if (
const Loop *L = LI->getLoopFor(Parent)) {
12909 CurrentLoopNest.assign(getLoopNest(L));
12911 }
else if (!UserTreeIdx ||
12912 UserTreeIdx.UserTE->State == TreeEntry::SplitVectorize ||
12913 UserTreeIdx.UserTE->isGather() ||
12914 UserTreeIdx.UserTE->getMainOp()->getParent() !=
12915 S.getMainOp()->getParent()) {
12916 BasicBlock *Parent = S.getMainOp()->getParent();
12917 if (
const Loop *L = LI->getLoopFor(Parent)) {
12930 SmallVector<const Loop *> NewLoopNest(getLoopNest(L));
12931 unsigned CommonLen = 0;
12932 for (
const auto [L1, L2] :
zip(CurrentLoopNest, NewLoopNest)) {
12937 auto ValidateMergedBTCs = [&](
unsigned StartDepth) ->
bool {
12938 unsigned EndDepth =
12939 std::min<unsigned>(NewLoopNest.size(), MergedLoopBTCs.size());
12940 for (
unsigned D = StartDepth;
D < EndDepth; ++
D) {
12941 const SCEV *Constraint = MergedLoopBTCs[
D];
12944 const SCEV *NewBTC = SE->getBackedgeTakenCount(NewLoopNest[
D]);
12950 auto BailOutToGather = [&]() {
12952 <<
"SLP: Sibling loops have different trip counts.\n");
12953 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
12955 if (CurrentLoopNest.empty()) {
12956 if (!ValidateMergedBTCs(0)) {
12960 CurrentLoopNest.assign(NewLoopNest);
12961 }
else if (CommonLen < CurrentLoopNest.size() &&
12962 CommonLen < NewLoopNest.size()) {
12971 const Loop *SibA = CurrentLoopNest[CommonLen];
12972 const Loop *SibB = NewLoopNest[CommonLen];
12973 const SCEV *BecA = SE->getBackedgeTakenCount(SibA);
12974 const SCEV *BecB = SE->getBackedgeTakenCount(SibB);
12979 if (!ValidateMergedBTCs(CommonLen + 1)) {
12983 if (MergedLoopBTCs.size() <= CommonLen)
12984 MergedLoopBTCs.resize(CommonLen + 1,
nullptr);
12985 MergedLoopBTCs[CommonLen] = BecA;
12986 CurrentLoopNest.truncate(CommonLen);
12987 }
else if (NewLoopNest.size() > CurrentLoopNest.size()) {
12988 if (!ValidateMergedBTCs(CurrentLoopNest.size())) {
12992 CurrentLoopNest.append(
12993 std::next(NewLoopNest.begin(), CurrentLoopNest.size()),
12994 NewLoopNest.end());
13003 auto &BSRef = BlocksSchedules[BB];
13005 BSRef = std::make_unique<BlockScheduling>(BB);
13007 BlockScheduling &BS = *BSRef;
13010 std::optional<ScheduleBundle *> BundlePtr =
13011 BS.tryScheduleBundle(UniqueValues.getArrayRef(),
this, S, UserTreeIdx);
13012#ifdef EXPENSIVE_CHECKS
13016 if (!BundlePtr || (*BundlePtr && !*BundlePtr.value())) {
13017 LLVM_DEBUG(
dbgs() <<
"SLP: We are not able to schedule this bundle!\n");
13019 if (S.isAltShuffle() && ReuseShuffleIndices.
empty() && TrySplitNode(S))
13021 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
13022 NonScheduledFirst.insert(VL.front());
13023 if (S.getOpcode() == Instruction::Load &&
13024 BS.ScheduleRegionSize < BS.ScheduleRegionSizeLimit)
13028 InstructionsCompatibilityAnalysis
Analysis(*DT, *DL, *TTI, *TLI);
13030 ScheduleBundle
Empty;
13031 ScheduleBundle &Bundle = BundlePtr.value() ? *BundlePtr.value() :
Empty;
13032 LLVM_DEBUG(
dbgs() <<
"SLP: We are able to schedule this bundle.\n");
13034 unsigned ShuffleOrOp =
13035 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.
getOpcode();
13036 auto CreateOperandNodes = [&](TreeEntry *
TE,
const auto &Operands) {
13038 SmallVector<unsigned> PHIOps;
13044 if ((!S || S.getOpcode() != Instruction::PHI) || S.isAltShuffle())
13049 for (
unsigned I : PHIOps)
13050 buildTreeRec(Operands[
I],
Depth + 1, {
TE,
I});
13052 switch (ShuffleOrOp) {
13053 case Instruction::PHI: {
13055 newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
13059 TE->setOperands(Operands);
13060 CreateOperandNodes(TE, Operands);
13063 case Instruction::ExtractValue:
13064 case Instruction::ExtractElement: {
13065 if (CurrentOrder.empty()) {
13066 LLVM_DEBUG(
dbgs() <<
"SLP: Reusing or shuffling extract sequence.\n");
13069 dbgs() <<
"SLP: Reusing or shuffling of reordered extract sequence "
13071 for (
unsigned Idx : CurrentOrder)
13072 dbgs() <<
" " << Idx;
13079 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
13080 ReuseShuffleIndices, CurrentOrder);
13082 "(ExtractValueInst/ExtractElementInst).\n";
13086 TE->setOperands(Operands);
13087 if (ShuffleOrOp == Instruction::ExtractValue) {
13088 SmallVector<unsigned> Indices;
13091 TE->StructEVIndices = std::move(Indices);
13092 buildTreeRec(Operands.
front(),
Depth + 1, {TE, 0});
13097 case Instruction::InsertElement: {
13098 assert(ReuseShuffleIndices.
empty() &&
"All inserts should be unique");
13100 auto OrdCompare = [](
const std::pair<int, int> &
P1,
13101 const std::pair<int, int> &
P2) {
13102 return P1.first >
P2.first;
13105 decltype(OrdCompare)>
13106 Indices(OrdCompare);
13107 for (
int I = 0,
E = VL.size();
I <
E; ++
I) {
13109 Indices.emplace(Idx,
I);
13111 OrdersType CurrentOrder(VL.size(), VL.size());
13112 bool IsIdentity =
true;
13113 for (
int I = 0,
E = VL.size();
I <
E; ++
I) {
13114 CurrentOrder[Indices.top().second] =
I;
13115 IsIdentity &= Indices.top().second ==
I;
13119 CurrentOrder.clear();
13120 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
13122 LLVM_DEBUG(
dbgs() <<
"SLP: added a new TreeEntry (InsertElementInst).\n";
13125 TE->setOperands(Operands);
13126 buildTreeRec(
TE->getOperand(1),
Depth + 1, {TE, 1});
13129 case Instruction::Load: {
13136 TreeEntry *
TE =
nullptr;
13139 case TreeEntry::Vectorize:
13140 TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
13141 ReuseShuffleIndices, CurrentOrder, InterleaveFactor);
13142 if (CurrentOrder.empty())
13143 LLVM_DEBUG(
dbgs() <<
"SLP: added a new TreeEntry (LoadInst).\n";
13147 <<
"SLP: added a new TreeEntry (jumbled LoadInst).\n";
13150 case TreeEntry::CompressVectorize:
13152 TE = newTreeEntry(VL, TreeEntry::CompressVectorize, Bundle, S,
13153 UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
13156 <<
"SLP: added a new TreeEntry (masked LoadInst + compress).\n";
13159 case TreeEntry::StridedVectorize:
13161 TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
13162 UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
13163 TreeEntryToStridedPtrInfoMap[
TE] = SPtrInfo;
13164 LLVM_DEBUG(
dbgs() <<
"SLP: added a new TreeEntry (strided LoadInst).\n";
13167 case TreeEntry::ScatterVectorize:
13169 TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
13170 UserTreeIdx, ReuseShuffleIndices);
13173 <<
"SLP: added a new TreeEntry (non-consecutive LoadInst).\n";
13176 case TreeEntry::CombinedVectorize:
13177 case TreeEntry::SplitVectorize:
13178 case TreeEntry::NeedToGather:
13181 if (!CurrentOrder.empty() && State != TreeEntry::ScatterVectorize) {
13182 assert(Operands.
size() == 1 &&
"Expected a single operand only");
13183 SmallVector<int>
Mask;
13187 TE->setOperands(Operands);
13188 if (State == TreeEntry::ScatterVectorize)
13189 buildTreeRec(PointerOps,
Depth + 1, {
TE, 0});
13192 case Instruction::ZExt:
13193 case Instruction::SExt:
13194 case Instruction::FPToUI:
13195 case Instruction::FPToSI:
13196 case Instruction::FPExt:
13197 case Instruction::PtrToInt:
13198 case Instruction::IntToPtr:
13199 case Instruction::SIToFP:
13200 case Instruction::UIToFP:
13201 case Instruction::Trunc:
13202 case Instruction::FPTrunc:
13203 case Instruction::BitCast: {
13204 auto [PrevMaxBW, PrevMinBW] = CastMaxMinBWSizes.value_or(
13205 std::make_pair(std::numeric_limits<unsigned>::min(),
13206 std::numeric_limits<unsigned>::max()));
13207 if (ShuffleOrOp == Instruction::ZExt ||
13208 ShuffleOrOp == Instruction::SExt) {
13209 CastMaxMinBWSizes = std::make_pair(
13210 std::max<unsigned>(DL->getTypeSizeInBits(VL0->
getType()),
13212 std::min<unsigned>(
13215 }
else if (ShuffleOrOp == Instruction::Trunc) {
13216 CastMaxMinBWSizes = std::make_pair(
13217 std::max<unsigned>(
13220 std::min<unsigned>(DL->getTypeSizeInBits(VL0->
getType()),
13223 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
13224 ReuseShuffleIndices);
13225 LLVM_DEBUG(
dbgs() <<
"SLP: added a new TreeEntry (CastInst).\n";
13228 TE->setOperands(Operands);
13230 buildTreeRec(
TE->getOperand(
I),
Depth, {TE, I});
13231 if (ShuffleOrOp == Instruction::Trunc) {
13232 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
13233 }
else if (ShuffleOrOp == Instruction::SIToFP ||
13234 ShuffleOrOp == Instruction::UIToFP) {
13235 unsigned NumSignBits =
13238 APInt
Mask = DB->getDemandedBits(OpI);
13239 NumSignBits = std::max(NumSignBits,
Mask.countl_zero());
13241 if (NumSignBits * 2 >=
13243 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
13247 case Instruction::ICmp:
13248 case Instruction::FCmp: {
13251 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
13252 ReuseShuffleIndices);
13261 "Commutative Predicate mismatch");
13264 Operands.
back() =
Ops.getVL(1);
13271 if (
Cmp->getPredicate() != P0)
13275 TE->setOperands(Operands);
13276 buildTreeRec(Operands.
front(),
Depth, {TE, 0});
13277 buildTreeRec(Operands.
back(),
Depth, {TE, 1});
13278 if (ShuffleOrOp == Instruction::ICmp) {
13279 unsigned NumSignBits0 =
13281 if (NumSignBits0 * 2 >=
13283 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
13284 unsigned NumSignBits1 =
13286 if (NumSignBits1 * 2 >=
13288 ExtraBitWidthNodes.insert(getOperandEntry(TE, 1)->Idx);
13292 case Instruction::Select:
13293 case Instruction::FNeg:
13294 case Instruction::Add:
13295 case Instruction::FAdd:
13296 case Instruction::Sub:
13297 case Instruction::FSub:
13298 case Instruction::Mul:
13299 case Instruction::FMul:
13300 case Instruction::UDiv:
13301 case Instruction::SDiv:
13302 case Instruction::FDiv:
13303 case Instruction::URem:
13304 case Instruction::SRem:
13305 case Instruction::FRem:
13306 case Instruction::Shl:
13307 case Instruction::LShr:
13308 case Instruction::AShr:
13309 case Instruction::And:
13310 case Instruction::Or:
13311 case Instruction::Xor:
13312 case Instruction::Freeze: {
13313 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
13314 ReuseShuffleIndices);
13316 dbgs() <<
"SLP: added a new TreeEntry "
13317 "(SelectInst/UnaryOperator/BinaryOperator/FreezeInst).\n";
13323 Operands[0] =
Ops.getVL(0);
13324 Operands[1] =
Ops.getVL(1);
13326 TE->setOperands(Operands);
13328 buildTreeRec(
TE->getOperand(
I),
Depth + 1, {TE, I});
13331 case Instruction::GetElementPtr: {
13332 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
13333 ReuseShuffleIndices);
13334 LLVM_DEBUG(
dbgs() <<
"SLP: added a new TreeEntry (GetElementPtrInst).\n";
13336 TE->setOperands(Operands);
13339 buildTreeRec(Operands[
I],
Depth + 1, {
TE,
I});
13342 case Instruction::Store: {
13343 assert(CurrentOrder.empty() &&
13344 "Expected ordered store during tree building");
13345 if (State == TreeEntry::StridedVectorize) {
13347 newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
13348 UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
13349 TreeEntryToStridedPtrInfoMap[
TE] = SPtrInfo;
13351 dbgs() <<
"SLP: added a new TreeEntry (strided StoreInst).\n";
13353 TE->setOperands(Operands);
13354 buildTreeRec(
TE->getOperand(0),
Depth + 1, {TE, 0});
13357 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
13358 ReuseShuffleIndices, CurrentOrder);
13359 LLVM_DEBUG(
dbgs() <<
"SLP: added a new TreeEntry (StoreInst).\n";
13361 TE->setOperands(Operands);
13362 buildTreeRec(
TE->getOperand(0),
Depth + 1, {TE, 0});
13365 case Instruction::Call: {
13371 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
13372 ReuseShuffleIndices);
13373 LLVM_DEBUG(
dbgs() <<
"SLP: added a new TreeEntry (CallInst).\n";
13378 Operands[0] =
Ops.getVL(0);
13379 Operands[1] =
Ops.getVL(1);
13381 TE->setOperands(Operands);
13387 buildTreeRec(
TE->getOperand(
I),
Depth + 1, {TE, I});
13391 case Instruction::ShuffleVector: {
13392 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
13393 ReuseShuffleIndices);
13394 if (S.isAltShuffle()) {
13395 LLVM_DEBUG(
dbgs() <<
"SLP: added a new TreeEntry (isAltShuffle).\n";
13400 dbgs() <<
"SLP: added a new TreeEntry (ShuffleVectorInst).\n";
13414 "Expected different main/alternate predicates.");
13430 TE->setOperands(Operands);
13431 buildTreeRec(Operands.
front(),
Depth + 1, {TE, 0});
13432 buildTreeRec(Operands.
back(),
Depth + 1, {TE, 1});
13439 Operands[0] =
Ops.getVL(0);
13440 Operands[1] =
Ops.getVL(1);
13442 TE->setOperands(Operands);
13444 buildTreeRec(
TE->getOperand(
I),
Depth + 1, {TE, I});
13462 for (
const auto *Ty : ST->elements())
13463 if (Ty != *ST->element_begin())
13465 N *= ST->getNumElements();
13466 EltTy = *ST->element_begin();
13468 N *= AT->getNumElements();
13469 EltTy = AT->getElementType();
13472 N *= VT->getNumElements();
13473 EltTy = VT->getElementType();
13479 size_t VTSize = DL->getTypeStoreSizeInBits(
getWidenedType(EltTy,
N));
13480 if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
13481 VTSize != DL->getTypeStoreSizeInBits(T))
13488 bool ResizeAllowed)
const {
13490 assert(It != VL.
end() &&
"Expected at least one extract instruction.");
13497 Value *Vec = E0->getOperand(0);
13499 CurrentOrder.
clear();
13503 if (E0->getOpcode() == Instruction::ExtractValue) {
13515 unsigned E = VL.
size();
13516 if (!ResizeAllowed && NElts !=
E)
13519 unsigned MinIdx = NElts, MaxIdx = 0;
13524 if (Inst->getOperand(0) != Vec)
13532 const unsigned ExtIdx = *Idx;
13533 if (ExtIdx >= NElts)
13535 Indices[
I] = ExtIdx;
13536 if (MinIdx > ExtIdx)
13538 if (MaxIdx < ExtIdx)
13541 if (MaxIdx - MinIdx + 1 >
E)
13543 if (MaxIdx + 1 <=
E)
13547 bool ShouldKeepOrder =
true;
13554 for (
unsigned I = 0;
I <
E; ++
I) {
13557 const unsigned ExtIdx = Indices[
I] - MinIdx;
13558 if (CurrentOrder[ExtIdx] !=
E) {
13559 CurrentOrder.
clear();
13562 ShouldKeepOrder &= ExtIdx ==
I;
13563 CurrentOrder[ExtIdx] =
I;
13565 if (ShouldKeepOrder)
13566 CurrentOrder.
clear();
13568 return ShouldKeepOrder;
/// Returns true if \p I can be treated as having only vectorized users.
///
/// The result is the OR of two conditions:
///  1. \p I has exactly one use and either no \p VectorizedVals set was
///     supplied (null pointer) or that set contains \p I itself.
///  2. Every user of \p I passes the per-user predicate in the lambda below.
///
/// \param I              Instruction whose users are inspected.
/// \param VectorizedVals Optional set of values; may be null.
13571bool BoUpSLP::areAllUsersVectorized(
13572 Instruction *
I,
const SmallDenseSet<Value *> *VectorizedVals)
const {
13573 return (
I->hasOneUse() && (!VectorizedVals || VectorizedVals->
contains(
I))) ||
13574 all_of(
I->users(), [
this](User *U) {
// A user is accepted when it is already vectorized, when
// isVectorLikeInstWithConstOps() accepts it, or when it is an
// extractelement instruction recorded in MustGather.
13575 return isVectorized(U) || isVectorLikeInstWithConstOps(U) ||
13576 (isa<ExtractElementInst>(U) && MustGather.contains(U));
13581 const InstructionsState &S,
13582 DominatorTree &DT,
const DataLayout &DL,
13583 TargetTransformInfo &TTI,
13584 const TargetLibraryInfo &TLI);
/// Walks every entry of VectorizableTree and accumulates an unsigned scalar
/// instruction count in Count.
///
/// NOTE(review): the statements that actually modify Count, and the final
/// return, are on lines not present in this excerpt; the comments below only
/// describe the visible conditions.
13586unsigned BoUpSLP::getNumScalarInsts()
const {
13587 unsigned Count = 0;
13588 for (
const std::unique_ptr<TreeEntry> &Ptr : VectorizableTree) {
13589 const TreeEntry &
TE = *Ptr;
// Entries recorded in DeletedNodes get special handling (action not visible
// here; presumably they are skipped).
13590 if (DeletedNodes.contains(&TE))
// Gather entries and entries recorded in TransformedToGatherNodes are handled
// by a dedicated branch.
13592 if (
TE.isGather() || TransformedToGatherNodes.contains(&TE)) {
// Entries in the CombinedVectorize state are tested separately.
13606 if (
TE.State == TreeEntry::CombinedVectorize)
// Per-scalar pass over the entry. Copyable elements (when the entry has any)
// and integer/FP division or remainder instructions are singled out by the
// conditions below.
13611 for (
Value *V :
TE.Scalars) {
13613 (
TE.hasCopyableElements() &&
TE.isCopyableElement(V)))
13619 if (
I && (
I->isIntDivRem() ||
I->isFPDivRem()))
// Opcode-specific adjustment, only for entries that are not combined ops and
// that have a valid instruction state.
13632 if (
TE.CombinedOp == TreeEntry::NotCombinedOp &&
TE.hasState()) {
13633 unsigned Opcode =
TE.getOpcode();
// Select entries: judging by the assert message, this path relates to
// min/max patterns. The assert guards Count against unsigned underflow, which
// implies Count is decremented on a line not shown.
13634 if (Opcode == Instruction::Select) {
13635 for (
Value *V :
TE.Scalars) {
13636 if (
TE.hasCopyableElements() &&
TE.isCopyableElement(V))
13643 assert(
Count > 0 &&
"Underflow in scalar inst count (minmax)");
// FAdd/FSub entries: judging by the assert message, this path relates to FMA
// formation. For alternate-shuffle entries, scalars whose opcode is neither
// FAdd nor FSub are filtered by the condition below.
13647 }
else if (Opcode == Instruction::FAdd || Opcode == Instruction::FSub) {
13648 for (
Value *V :
TE.Scalars) {
13649 if (
TE.hasCopyableElements() &&
TE.isCopyableElement(V))
13652 if (!
I || (
TE.isAltShuffle() &&
I->getOpcode() != Instruction::FAdd &&
13653 I->getOpcode() != Instruction::FSub))
13657 assert(
Count > 0 &&
"Underflow in scalar inst count (fma)");
/// Walks every entry of VectorizableTree, then the ExternalUses list, and
/// accumulates an unsigned vector instruction count in Count.
///
/// NOTE(review): most statements that modify Count, and the final return, are
/// on lines not present in this excerpt; the comments below only describe the
/// visible conditions.
13667unsigned BoUpSLP::getNumVectorInsts()
const {
13668 unsigned Count = 0;
// Distinct source vectors feeding extractelement scalars of gather entries.
// The set's size is added to Count once, after the tree walk.
13669 SmallPtrSet<Value *, 4> GatherExtractSourceVecs;
13670 for (
const std::unique_ptr<TreeEntry> &Ptr : VectorizableTree) {
13671 const TreeEntry &
TE = *Ptr;
// Entries in DeletedNodes and entries in the CombinedVectorize state are
// tested first (resulting action not visible here).
13672 if (DeletedNodes.contains(&TE))
13674 if (
TE.State == TreeEntry::CombinedVectorize)
13676 bool IsGatherOrTransformed =
13677 TE.isGather() || TransformedToGatherNodes.contains(&TE);
13678 if (IsGatherOrTransformed) {
13679 if (
TE.hasState()) {
// Look for a different tree entry holding the same scalars with the same
// vector factor.
13680 if (
const TreeEntry *
E =
13681 getSameValuesTreeEntry(
TE.getMainOp(),
TE.Scalars);
13682 E &&
E != &TE &&
E->getVectorFactor() ==
TE.getVectorFactor())
// Same lookup against RevScalars (defined on a line not shown; the name
// suggests the scalars in reversed order - TODO confirm).
13685 if (
const TreeEntry *
E =
13686 getSameValuesTreeEntry(
TE.getMainOp(), RevScalars);
13687 E &&
E->getVectorFactor() ==
TE.getVectorFactor()) {
// Record the vector operand of an extractelement (EE is declared on a line
// not shown).
13699 GatherExtractSourceVecs.
insert(EE->getVectorOperand());
13701 for (
Value *V :
TE.Scalars) {
// Conditions on InsertElement/ExtractElement entries, SplitVectorize entries,
// and entries that carry a reorder or reuse-shuffle mask.
13711 if (
TE.getOpcode() == Instruction::InsertElement ||
13712 TE.getOpcode() == Instruction::ExtractElement)
13714 if (
TE.State == TreeEntry::SplitVectorize)
13718 if (!
TE.ReorderIndices.empty() || !
TE.ReuseShuffleIndices.empty())
// Add one per distinct extract source vector collected above.
13721 Count += GatherExtractSourceVecs.
size();
// Pass over external uses. CountedExtracts de-duplicates by scalar: the
// insert(...).second test below is false for a scalar already seen.
13724 SmallPtrSet<Value *, 8> CountedExtracts;
13725 for (
const ExternalUser &EU : ExternalUses) {
// Uses whose user is in EphValues, and scalars recorded in
// ExternalUsesAsOriginalScalar, are tested separately.
13728 if (EU.User && EphValues.count(EU.User))
13730 if (ExternalUsesAsOriginalScalar.contains(EU.Scalar))
13732 if (!CountedExtracts.
insert(EU.Scalar).second)
/// Fills \p Mask with a shuffle mask for this entry's scalars, using
/// \p IsAltOp to classify each scalar instruction.
///
/// \param IsAltOp    Predicate called on each scalar instruction; lanes for
///                   which it returns true get mask value Sz + Idx, where Sz
///                   is the number of scalars.
/// \param Mask       Output mask. If ReuseShuffleIndices is non-empty, the
///                   mask is remapped through it before returning.
/// \param OpScalars  Optional output pointer (its use is on lines not shown).
/// \param AltScalars Optional output pointer (its use is on lines not shown).
13739void BoUpSLP::TreeEntry::buildAltOpShuffleMask(
13740 const function_ref<
bool(Instruction *)> IsAltOp, SmallVectorImpl<int> &Mask,
13741 SmallVectorImpl<Value *> *OpScalars,
13742 SmallVectorImpl<Value *> *AltScalars)
const {
13743 unsigned Sz = Scalars.size();
// OrderMask is only consulted when ReorderIndices is non-empty (how it is
// filled is on a line not shown).
13745 SmallVector<int> OrderMask;
13746 if (!ReorderIndices.empty())
13748 for (
unsigned I = 0;
I < Sz; ++
I) {
// With a reorder present, the lane index is taken from OrderMask.
13750 if (!ReorderIndices.empty())
13751 Idx = OrderMask[
I];
// Lanes classified as alternate are offset by Sz.
13755 if (IsAltOp(OpInst)) {
13756 Mask[
I] = Sz + Idx;
// Apply the reuse shuffle: each reuse index selects an element of the mask
// built above; poison indices stay poison.
13765 if (!ReuseShuffleIndices.
empty()) {
13767 transform(ReuseShuffleIndices, NewMask.
begin(), [&Mask](
int Idx) {
13768 return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem;
13770 Mask.swap(NewMask);
13777 return InstructionsState(MainOp, AltOp).getMatchingMainOpOrAltOp(
I) == MainOp;
13787 assert(MainP != AltP &&
"Expected different main/alternate predicates.");
13796 assert((MainP ==
P || AltP ==
P || MainP == SwappedP || AltP == SwappedP) &&
13797 "CmpInst expected to match either main or alternate predicate or "
13799 return MainP !=
P && MainP != SwappedP;
13801 return InstructionsState(MainOp, AltOp).getMatchingMainOpOrAltOp(
I) == AltOp;
13819 return CI->getValue().isPowerOf2();
13825 return CI->getValue().isNegatedPowerOf2();
13830 if (IsConstant && IsUniform)
13832 else if (IsConstant)
13834 else if (IsUniform)
/// Helper class holding a scalar element type plus a set of static routines
/// for analysing and combining shuffle masks (identity detection, mask
/// composition, looking through existing shuffles, and emitting a shuffle via
/// a caller-supplied builder).
///
/// NOTE(review): this listing omits many lines of the class (access
/// specifiers, several conditions and loop bodies); comments describe only
/// what the visible lines establish.
13846class BaseShuffleAnalysis {
// Scalar type used by getVF(); set by the constructor.
13848 Type *ScalarTy =
nullptr;
13850 BaseShuffleAnalysis(
Type *ScalarTy) : ScalarTy(ScalarTy) {}
// Returns VNumElements / ScalarTyNumElements for \p V. The asserts require a
// non-null V of fixed vector type, a non-null ScalarTy, and an element count
// of V that is strictly greater than, and a multiple of, ScalarTyNumElements.
13858 unsigned getVF(
Value *V)
const {
13859 assert(V &&
"V cannot be nullptr");
13861 "V does not have FixedVectorType");
13862 assert(ScalarTy &&
"ScalarTy cannot be nullptr");
13864 unsigned VNumElements =
13866 assert(VNumElements > ScalarTyNumElements &&
13867 "the number of elements of V is not large enough");
13868 assert(VNumElements % ScalarTyNumElements == 0 &&
13869 "the number of elements of V is not a vectorized value");
13870 return VNumElements / ScalarTyNumElements;
// Tests whether \p Mask is an identity mask with respect to \p VecTy.
// Limit is the mask length; when it is a multiple of VF, the mask is examined
// in Limit / VF slices of VF elements each (the per-slice test is on lines
// not shown).
13876 static bool isIdentityMask(ArrayRef<int> Mask,
const FixedVectorType *VecTy,
13878 int Limit =
Mask.size();
13890 if (Limit % VF == 0 &&
all_of(
seq<int>(0, Limit / VF), [=](
int Idx) {
13891 ArrayRef<int> Slice =
Mask.slice(Idx * VF, VF);
// Composes \p Mask with \p ExtMask and stores the result back into \p Mask.
// For each element of ExtMask, the existing mask is indexed with
// ExtMask[I] modulo the current mask length; the new mask then replaces Mask
// via swap. (How \p LocalVF is used is on lines not shown.)
13904 static void combineMasks(
unsigned LocalVF, SmallVectorImpl<int> &Mask,
13905 ArrayRef<int> ExtMask) {
13906 unsigned VF =
Mask.size();
13908 for (
int I = 0, Sz = ExtMask.
size();
I < Sz; ++
I) {
13911 int MaskedIdx =
Mask[ExtMask[
I] % VF];
13915 Mask.swap(NewMask);
// Looks through shufflevector instructions feeding \p V, updating \p V
// (taken by reference) and \p Mask in place.
// \p SinglePermute is forwarded to isIdentityMask() and participates in the
// returned value.
13951 static bool peekThroughShuffles(
Value *&V, SmallVectorImpl<int> &Mask,
13952 bool SinglePermute) {
// Remember an identity-producing shuffle and the mask in effect when it was
// found; Mask can be swapped with IdentityMask further below.
13954 ShuffleVectorInst *IdentityOp =
nullptr;
13955 SmallVector<int> IdentityMask;
13964 if (isIdentityMask(Mask, SVTy,
false)) {
13965 if (!IdentityOp || !SinglePermute ||
13966 (isIdentityMask(Mask, SVTy,
true) &&
13968 IdentityMask.
size()))) {
13973 IdentityMask.
assign(Mask);
// Shuffles that splat element zero are tested explicitly.
13993 if (SV->isZeroEltSplat()) {
13995 IdentityMask.
assign(Mask);
// LocalVF starts as the mask length and can be replaced by the element count
// of the shuffle operand type.
13997 int LocalVF =
Mask.size();
14000 LocalVF = SVOpTy->getNumElements();
14004 static_cast<unsigned>(
I) >= SV->getShuffleMask().size())
14006 ExtMask[Idx] = SV->getMaskValue(
I);
// Case where neither shuffle operand is undef.
14016 if (!IsOp1Undef && !IsOp2Undef) {
14018 for (
int &
I : Mask) {
14021 if (SV->getMaskValue(
I % SV->getShuffleMask().size()) ==
// Fold the shuffle's own mask into Mask and continue with one of its
// operands.
14027 SmallVector<int> ShuffleMask(SV->getShuffleMask());
14028 combineMasks(LocalVF, ShuffleMask, Mask);
14029 Mask.swap(ShuffleMask);
14031 Op = SV->getOperand(0);
14033 Op = SV->getOperand(1);
14036 !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute) ||
14041 "Expected masks of same sizes.");
14046 Mask.swap(IdentityMask);
// The result requires SinglePermute; the remaining visible operands test a
// zero-element splat shuffle whose mask length matches Mask.
14048 return SinglePermute &&
14051 (Shuffle &&
Mask.size() == Shuffle->getShuffleMask().size() &&
14052 Shuffle->isZeroEltSplat() &&
14056 Shuffle->getShuffleMask()[
P.index()] == 0;
// Emits a shuffle of \p V1 (and \p V2) under \p Mask through \p Builder and
// returns the builder's result as T. The visible Builder calls are
// resizeToMatch, createIdentity, createShuffleVector and createPoison.
// NOTE(review): the conditions that select between these returns are on
// lines not shown.
14069 template <
typename T,
typename ShuffleBuilderTy,
typename...
Args>
14070 static T createShuffle(
Value *V1,
Value *V2, ArrayRef<int> Mask,
14071 ShuffleBuilderTy &Builder,
Type *ScalarTy,
14073 assert(V1 &&
"Expected at least one vector value.");
// Work on a private copy of the mask.
14075 SmallVector<int> NewMask(Mask);
14076 if (ScalarTyNumElements != 1) {
// Path that uses both V1 and V2: make the operands the same size, then split
// the mask into one mask per operand (second-operand indices are rebased by
// subtracting VF).
14082 Builder.resizeToMatch(V1, V2);
14083 int VF =
Mask.size();
14085 VF = FTy->getNumElements();
14096 for (
int I = 0,
E =
Mask.size();
I <
E; ++
I) {
14098 CombinedMask1[
I] =
Mask[
I];
14100 CombinedMask2[
I] =
Mask[
I] - VF;
// Look through shuffles feeding each operand; the boolean result is
// deliberately ignored here.
14107 (void)peekThroughShuffles(Op1, CombinedMask1,
false);
14108 (void)peekThroughShuffles(Op2, CombinedMask2,
false);
14114 for (
auto [Idx,
I] :
enumerate(CombinedMask1)) {
14117 ExtMask1[Idx] = SV1->getMaskValue(
I);
14121 ->getNumElements(),
14122 ExtMask1, UseMask::SecondArg);
14123 SmallVector<int> ExtMask2(CombinedMask2.size(),
PoisonMaskElem);
14124 for (
auto [Idx,
I] :
enumerate(CombinedMask2)) {
14127 ExtMask2[Idx] = SV2->getMaskValue(
I);
14131 ->getNumElements(),
14132 ExtMask2, UseMask::SecondArg);
// When both operands are shuffles whose first operands share a type that
// differs from the shuffle result type, step to those first operands and
// fold each shuffle's mask into the corresponding combined mask.
14133 if (SV1->getOperand(0)->getType() ==
14134 SV2->getOperand(0)->getType() &&
14135 SV1->getOperand(0)->getType() != SV1->getType() &&
14138 Op1 = SV1->getOperand(0);
14139 Op2 = SV2->getOperand(0);
14140 SmallVector<int> ShuffleMask1(SV1->getShuffleMask());
14141 int LocalVF = ShuffleMask1.size();
14143 LocalVF = FTy->getNumElements();
14144 combineMasks(LocalVF, ShuffleMask1, CombinedMask1);
14145 CombinedMask1.swap(ShuffleMask1);
14146 SmallVector<int> ShuffleMask2(SV2->getShuffleMask());
14147 LocalVF = ShuffleMask2.size();
14149 LocalVF = FTy->getNumElements();
14150 combineMasks(LocalVF, ShuffleMask2, CombinedMask2);
14151 CombinedMask2.swap(ShuffleMask2);
// Repeat until neither operand changed during the last iteration.
14154 }
while (PrevOp1 != Op1 || PrevOp2 != Op2);
14155 Builder.resizeToMatch(Op1, Op2);
14157 ->getElementCount()
14158 .getKnownMinValue(),
14160 ->getElementCount()
14161 .getKnownMinValue());
// Merge the two per-operand masks back into CombinedMask1. Second-operand
// indices are offset by VF unless both operands turned out to be the same
// value.
14162 for (
int I = 0,
E =
Mask.size();
I <
E; ++
I) {
14165 "Expected undefined mask element");
14166 CombinedMask1[
I] = CombinedMask2[
I] + (Op1 == Op2 ? 0 : VF);
14175 return Builder.createIdentity(Op1);
14176 return Builder.createShuffleVector(
14181 return Builder.createPoison(
// Path that uses V1 only: look through shuffles with SinglePermute set, then
// emit either a shuffle with the rewritten mask or the identity of V1.
14183 bool IsIdentity = peekThroughShuffles(V1, NewMask,
true);
14184 assert(V1 &&
"Expected non-null value after looking through shuffles.");
14187 return Builder.createShuffleVector(V1, NewMask,
Arguments...);
14188 return Builder.createIdentity(V1);
14194 ArrayRef<int> Mask) {
14203static std::pair<InstructionCost, InstructionCost>
14214 if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
14223 ScalarCost =
TTI.getPointersChainCost(
14224 Ptrs, BasePtr, TTI::PointersChainInfo::getUnitStride(), ScalarTy,
14228 for (
Value *V : Ptrs) {
14229 if (V == BasePtr) {
14242 if (PtrsRetainedInVecCode.
size() == Ptrs.size()) {
14247 VecCost =
TTI.getPointersChainCost(PtrsRetainedInVecCode, BasePtr,
14248 TTI::PointersChainInfo::getKnownStride(),
14258 [](
const Value *V) {
14260 return Ptr && !Ptr->hasAllConstantIndices();
14262 ? TTI::PointersChainInfo::getUnknownStride()
14263 : TTI::PointersChainInfo::getKnownStride();
14266 TTI.getPointersChainCost(Ptrs, BasePtr, PtrsInfo, ScalarTy,
CostKind);
14270 if (It != Ptrs.
end())
14275 VecCost =
TTI.getGEPCost(BaseGEP->getSourceElementType(),
14276 BaseGEP->getPointerOperand(), Indices, VecTy,
14281 return std::make_pair(ScalarCost, VecCost);
/// Attempts to reorder the scalars of gather entry \p TE by grouping them
/// under computed keys, records the permutation in TE.ReorderIndices, and
/// drops the reordering again when the cost comparison at the end does not
/// favour it.
///
/// Precondition (asserted): \p TE is a gather entry with empty
/// ReorderIndices.
///
/// NOTE(review): many lines of this function are missing from this listing
/// (key computation, cost computation, early returns); the comments describe
/// only the visible statements.
14284void BoUpSLP::reorderGatherNode(TreeEntry &TE) {
14285 assert(
TE.isGather() &&
TE.ReorderIndices.empty() &&
14286 "Expected gather node without reordering.");
14288 SmallSet<size_t, 2> LoadKeyUsed;
// Bail-out test: two-scalar entries, and entries with a valid state that are
// not alternate shuffles, are among the cases checked here (remaining
// operands and the resulting action are on lines not shown).
14292 if (
TE.Scalars.size() == 2 || (
TE.hasState() && !
TE.isAltShuffle()) ||
14297 return VectorizableTree[Idx]->isSame(TE.Scalars);
// Computes a sub-key for load \p LI under \p Key. Loads already recorded in
// LoadsMap for the same (Key, Ptr) pair are compared against LI; LI is then
// appended to that map entry.
14301 auto GenerateLoadsSubkey = [&](
size_t Key, LoadInst *LI) {
14306 auto LIt = LoadsMap.
find(std::make_pair(
Key, Ptr));
14307 if (LIt != LoadsMap.
end()) {
14308 for (LoadInst *RLI : LIt->second) {
14310 LI->
getType(), LI->getPointerOperand(), *DL, *SE,
14314 for (LoadInst *RLI : LIt->second) {
14316 LI->getPointerOperand(), *TLI)) {
// With more than two loads already recorded, a hash of the last recorded
// load's pointer operand is computed.
14321 if (LIt->second.size() > 2) {
14323 hash_value(LIt->second.back()->getPointerOperand());
14329 LoadsMap.
try_emplace(std::make_pair(
Key, Ptr)).first->second.push_back(LI);
// SortedValues: key -> (sub-key -> scalars), in insertion order.
// KeyToIndex: scalar -> positions at which it occurs.
14332 MapVector<size_t, MapVector<size_t, SmallVector<Value *>>> SortedValues;
14333 SmallDenseMap<Value *, SmallVector<unsigned>, 8> KeyToIndex;
// IsOrdered stays true while grouping would not move any scalar.
14334 bool IsOrdered =
true;
14335 unsigned NumInstructions = 0;
14339 size_t Key = 1, Idx = 1;
14347 auto &Container = SortedValues[
Key];
// Order check: the previously recorded scalar of the relevant group must sit
// at position I - 1; otherwise grouping changes the order.
14348 if (IsOrdered && !KeyToIndex.
contains(V) &&
14351 ((Container.contains(Idx) &&
14352 KeyToIndex.
at(Container[Idx].back()).back() !=
I - 1) ||
14353 (!Container.empty() && !Container.contains(Idx) &&
14354 KeyToIndex.
at(Container.back().second.back()).back() !=
I - 1)))
14356 auto &KTI = KeyToIndex[
V];
14358 Container[Idx].push_back(V);
// Only reorder when grouping changes the order and more than one instruction
// was counted.
14363 if (!IsOrdered && NumInstructions > 1) {
// Initialise every reorder index to the scalar count before filling it in.
14365 TE.ReorderIndices.resize(
TE.Scalars.size(),
TE.Scalars.size());
// Emit scalars group by group, writing both the new scalar order and the
// matching reorder indices. Cnt is the running output position.
14366 for (
const auto &
D : SortedValues) {
14367 for (
const auto &
P :
D.second) {
14369 for (
Value *V :
P.second) {
14370 ArrayRef<unsigned> Indices = KeyToIndex.
at(V);
14371 for (
auto [K, Idx] :
enumerate(Indices)) {
14372 TE.ReorderIndices[Cnt +
K] = Idx;
14373 TE.Scalars[Cnt +
K] =
V;
14375 Sz += Indices.
size();
14376 Cnt += Indices.
size();
14380 *TTI,
TE.Scalars.front()->getType(), Sz);
// Groups whose first scalar is a constant take a separate branch.
14384 }
else if (!
P.second.empty() &&
isConstant(
P.second.front())) {
// Nothing further to evaluate when the entry has a reuse mask or no reorder
// was produced (resulting action on a line not shown).
14392 if (!
TE.ReuseShuffleIndices.empty() ||
TE.ReorderIndices.empty())
14397 auto *ScalarTy =
TE.Scalars.front()->getType();
14399 for (
auto [Idx, Sz] : SubVectors) {
// Build a shuffle mask from the reorder indices for cost estimation.
14407 int Sz =
TE.Scalars.size();
14408 SmallVector<int> ReorderMask(
TE.ReorderIndices.begin(),
14409 TE.ReorderIndices.end());
14415 ReorderMask[
I] =
I +
TE.ReorderIndices.size();
14419 any_of(ReorderMask, [&](
int I) {
return I >= Sz; })
14422 VecTy, ReorderMask);
14428 DemandedElts.clearBit(
I);
14430 ReorderMask[
I] =
I;
14432 ReorderMask[
I] =
I + Sz;
14438 if (!DemandedElts.isAllOnes())
// If the reordered form costs at least as much as BVCost, undo the
// reordering by clearing ReorderIndices.
14440 if (
Cost >= BVCost) {
14441 SmallVector<int>
Mask(
TE.ReorderIndices.begin(),
TE.ReorderIndices.end());
14443 TE.ReorderIndices.clear();
14450 const InstructionsState &S,
14456 return V->getType()->getScalarType()->isFloatingPointTy();
14458 "Can only convert to FMA for floating point types");
14459 assert(S.isAddSubLikeOp() &&
"Can only convert to FMA for add/sub");
14464 for (
Value *V : VL) {
14468 if (S.isCopyableElement(
I))
14470 Instruction *MatchingI = S.getMatchingMainOpOrAltOp(
I);
14471 if (S.getMainOp() != MatchingI && S.getAltOp() != MatchingI)
14474 FMF &= FPCI->getFastMathFlags();
14478 if (!CheckForContractable(VL))
14481 InstructionsCompatibilityAnalysis
Analysis(DT,
DL,
TTI, TLI);
14488 if (OpS.isAltShuffle() || OpS.getOpcode() != Instruction::FMul)
14490 if (!CheckForContractable(Operands.
front()))
14498 for (
Value *V : VL) {
14502 if (!S.isCopyableElement(
I))
14504 FMF &= FPCI->getFastMathFlags();
14505 FMulPlusFAddCost +=
TTI.getInstructionCost(
I,
CostKind);
14508 for (
auto [V,
Op] :
zip(VL, Operands.
front())) {
14509 if (S.isCopyableElement(V))
14512 if (!
I || !
I->hasOneUse() || OpS.isCopyableElement(
I)) {
14514 FMACost +=
TTI.getInstructionCost(OpI,
CostKind);
14521 FMF &= FPCI->getFastMathFlags();
14522 FMulPlusFAddCost +=
TTI.getInstructionCost(
I,
CostKind);
/// Tests whether the Shl node \p TE, with a ZExt node as operand 0 and a
/// gathered node as operand 1, is cheaper in a bitcast-based form than in the
/// plain vector form. The final result is `BitcastCost < VecCost`.
///
/// \param TE       Shl node to inspect (asserted to have state and opcode Shl).
/// \param Order    Output ordering; it is filled with the value VF as a "not
///                 yet assigned" marker and later queried with Order.empty().
/// \param IsBSwap  Output flag. NOTE(review): its assignments are not visible
///                 in this excerpt - presumably set when the bswap form wins.
/// \param ForLoads Output flag. NOTE(review): its assignments are not visible
///                 in this excerpt - presumably set when the load form wins.
///
/// NOTE(review): many lines of this function (early-exit bodies and the
/// declarations of SrcScalarTy, VecTy, SrcVecTy, FMF, CostKind, Pos, LI,
/// BSwapCost, ...) are not visible here. The comments below describe only the
/// statements that are visible.
14530bool BoUpSLP::matchesShlZExt(
const TreeEntry &TE,
OrdersType &Order,
14531 bool &IsBSwap,
bool &ForLoads)
const {
// Precondition: the node carries state and its opcode is Shl.
14532 assert(
TE.hasState() &&
TE.getOpcode() == Instruction::Shl &&
14533 "Expected Shl node.");
// Guard on the shift node itself: it fires when the node is not in the
// Vectorize state, has reorder or reuse indices, is recorded in MinBWs, or has
// a scalar without exactly one use. The guarded statement is not visible
// (presumably an early `return false`).
14536 if (
TE.State != TreeEntry::Vectorize || !
TE.ReorderIndices.empty() ||
14537 !
TE.ReuseShuffleIndices.empty() || MinBWs.contains(&TE) ||
14538 any_of(
TE.Scalars, [](
Value *V) { return !V->hasOneUse(); }))
14540 Type *ScalarTy =
TE.getMainOp()->getType();
// Sz is the bit width of the shift's scalar type.
14546 const unsigned Sz = DL->getTypeSizeInBits(ScalarTy);
// Operand 0 is the shifted value, operand 1 is the shift amount.
14547 const TreeEntry *LhsTE = getOperandEntry(&TE, 0);
14548 const TreeEntry *RhsTE = getOperandEntry(&TE, 1);
// The shifted operand must be a vectorized ZExt node with no reorder/reuse
// indices, not recorded in MinBWs, and with every scalar used exactly once.
14550 if (!(LhsTE->State == TreeEntry::Vectorize &&
14551 LhsTE->getOpcode() == Instruction::ZExt &&
14552 LhsTE->ReorderIndices.empty() && LhsTE->ReuseShuffleIndices.empty() &&
14553 !MinBWs.contains(LhsTE) &&
14554 all_of(LhsTE->Scalars, [](
Value *V) { return V->hasOneUse(); })))
// Stride is the bit width of SrcScalarTy (declared on a line not visible here;
// presumably the ZExt source scalar type).
14557 unsigned Stride = DL->getTypeSizeInBits(SrcScalarTy);
// Stride has to be a power of two, strictly smaller than Sz and a divisor of
// Sz; the condition continues on a line that is not visible.
14558 if (!
isPowerOf2_64(Stride) || Stride >= Sz || Sz % Stride != 0 ||
// The shift-amount operand must be a gather node with no reorder/reuse indices
// that is not recorded in MinBWs.
14561 if (!(RhsTE->isGather() && RhsTE->ReorderIndices.empty() &&
14562 RhsTE->ReuseShuffleIndices.empty() && !MinBWs.contains(RhsTE)))
// Running expected shift amount: each predicate call advances CurrentValue by
// Stride and compares the constant with the value before the advance, so the
// amounts are expected to be 0, Stride, 2 * Stride, ... in lane order. The
// total must also stay within Sz. What happens for undef elements and for
// non-constant elements is on lines that are not visible.
14565 unsigned CurrentValue = 0;
14567 if (
all_of(RhsTE->Scalars,
14569 CurrentValue += Stride;
14570 if (isa<UndefValue>(V))
14572 auto *C = dyn_cast<Constant>(V);
14575 return C->getUniqueInteger() == CurrentValue - Stride;
14577 CurrentValue <= Sz) {
// NOTE(review): the lines between the condition above and the declarations
// below are not visible, so it cannot be told from here which branch the
// following search belongs to.
14580 const unsigned VF = RhsTE->getVectorFactor();
// Every slot starts as VF, which the loop below treats as "unassigned".
14581 Order.assign(VF, VF);
// Records which target positions were already taken.
14583 SmallBitVector SeenPositions(VF);
14586 if (VF * Stride > Sz)
// Walk the shift amounts and map each lane to a position Pos (its computation
// is not visible). A lane whose slot is already assigned, a position outside
// [0, VF) or a position seen before triggers a branch whose body is not
// visible.
14588 for (
const auto [Idx, V] :
enumerate(RhsTE->Scalars)) {
14594 const APInt &Val =
C->getUniqueInteger();
14599 if (Order[Idx] != VF || Pos >= VF)
14601 if (SeenPositions.test(Pos))
14603 SeenPositions.set(Pos);
// Integer type wide enough to hold all source elements side by side:
// Stride bits per element times the ZExt node's vector factor.
14611 auto *SrcType = IntegerType::getIntNTy(ScalarTy->
getContext(),
14612 Stride * LhsTE->getVectorFactor());
14614 SmallPtrSet<Value *, 4> CheckedExtracts;
14620 getCastContextHint(*getOperandEntry(LhsTE, 0));
// Cost terms: an Or reduction, a vector Shl and a vector ZExt are summed, and
// a BitCast from SrcVecTy to SrcType is costed. The variables receiving these
// values are on lines not visible (presumably VecCost and BitcastCost, which
// are compared at the end).
14622 TTI->getArithmeticReductionCost(Instruction::Or, VecTy, FMF,
CostKind) +
14623 TTI->getArithmeticInstrCost(Instruction::Shl, VecTy,
CostKind,
14624 getOperandInfo(LhsTE->Scalars)) +
14625 TTI->getCastInstrCost(
14626 Instruction::ZExt, VecTy,
14630 Instruction::BitCast, SrcType, SrcVecTy, CastCtx,
CostKind);
// A non-empty Order means the lanes are not in natural order.
14631 if (!Order.empty()) {
14633 SmallVector<int>
Mask;
14639 constexpr unsigned ByteSize = 8;
// Only when the source elements are exactly one byte wide is a bswap intrinsic
// on SrcType costed as an alternative; it replaces BitcastCost when it is not
// more expensive.
14641 DL->getTypeSizeInBits(SrcScalarTy) == ByteSize) {
14642 IntrinsicCostAttributes CostAttrs(Intrinsic::bswap, SrcType, {SrcType});
14644 TTI->getCastInstrCost(Instruction::BitCast, SrcType, SrcVecTy, CastCtx,
14646 TTI->getIntrinsicInstrCost(CostAttrs,
CostKind);
14647 if (BSwapCost <= BitcastCost) {
14648 BitcastCost = BSwapCost;
// Look through the ZExt: if its operand is a plain (non-alternate) vectorized
// Load node with no reorder/reuse indices and single-use scalars, a load of
// SrcType followed by bswap is costed as well.
14652 const TreeEntry *SrcTE = getOperandEntry(LhsTE, 0);
14653 if (SrcTE->State == TreeEntry::Vectorize &&
14654 SrcTE->ReorderIndices.empty() && SrcTE->ReuseShuffleIndices.empty() &&
14655 SrcTE->getOpcode() == Instruction::Load && !SrcTE->isAltShuffle() &&
14656 all_of(SrcTE->Scalars, [](
Value *V) { return V->hasOneUse(); })) {
14658 IntrinsicCostAttributes CostAttrs(Intrinsic::bswap, SrcType, {SrcType});
14660 TTI->getMemoryOpCost(Instruction::Load, SrcType, LI->getAlign(),
14661 LI->getPointerAddressSpace(),
CostKind) +
14662 TTI->getIntrinsicInstrCost(CostAttrs,
CostKind);
14663 if (BSwapCost <= BitcastCost) {
14665 TTI->getMemoryOpCost(Instruction::Load, SrcVecTy, LI->getAlign(),
14666 LI->getPointerAddressSpace(),
CostKind);
14667 BitcastCost = BSwapCost;
14672 }
else if (Order.empty() && DL->getTypeSizeInBits(SrcScalarTy) == ByteSize) {
// Natural lane order with byte-sized sources: the same "plain vectorized Load
// node" test is applied, and the costs of loading SrcType and of loading
// SrcVecTy are queried (their use is on lines not visible).
14674 const TreeEntry *SrcTE = getOperandEntry(LhsTE, 0);
14675 if (SrcTE->State == TreeEntry::Vectorize && SrcTE->ReorderIndices.empty() &&
14676 SrcTE->ReuseShuffleIndices.empty() &&
14677 SrcTE->getOpcode() == Instruction::Load && !SrcTE->isAltShuffle() &&
14678 all_of(SrcTE->Scalars, [](
Value *V) { return V->hasOneUse(); })) {
14681 TTI->getMemoryOpCost(Instruction::Load, SrcType, LI->getAlign(),
14682 LI->getPointerAddressSpace(),
CostKind);
14684 TTI->getMemoryOpCost(Instruction::Load, SrcVecTy, LI->getAlign(),
14685 LI->getPointerAddressSpace(),
CostKind);
// When the packed integer type differs from the shift's scalar type, the cost
// of a ZExt from SrcType to ScalarTy is added.
14689 if (SrcType != ScalarTy) {
14690 BitcastCost += TTI->getCastInstrCost(Instruction::ZExt, ScalarTy, SrcType,
// The pattern matches only if the bitcast-based form is strictly cheaper.
14693 return BitcastCost < VecCost;
/// Inspects the Select node \p SelectTE and its three operand nodes (compare,
/// operand 1, operand 2) and reports whether `VecCost < BVCost` holds for the
/// costs computed below.
///
/// \param SelectTE            Select node (asserted to have state and opcode
///                            Select).
/// \param InversedCmpsIndices Output list of indices. NOTE(review): the lines
///                            that fill it are not visible in this excerpt;
///                            only the emptiness test is.
///
/// NOTE(review): several lines (early-exit bodies and the declarations of Inst,
/// MatchCmp, Pred, VecTy, CmpTy, CostKind, VecCost, BVCost, I) are not visible
/// here; the comments describe only the visible statements.
14696bool BoUpSLP::matchesInversedZExtSelect(
14697 const TreeEntry &SelectTE,
14698 SmallVectorImpl<unsigned> &InversedCmpsIndices)
const {
14699 assert(SelectTE.hasState() && SelectTE.getOpcode() == Instruction::Select &&
14700 "Expected select node.");
// Check every scalar of the select node; the test fires when Inst is null or
// is not a ZExt (how Inst is derived from V is not visible).
14702 for (
auto [Idx, V] :
enumerate(SelectTE.Scalars)) {
14704 if (!Inst || Inst->getOpcode() != Instruction::ZExt)
// Operand 0 is the condition, operands 1 and 2 are the selected values.
14710 const auto *CmpTE = getOperandEntry(&SelectTE, 0);
14711 const auto *Op1TE = getOperandEntry(&SelectTE, 1);
14712 const auto *Op2TE = getOperandEntry(&SelectTE, 2);
// Guard: the condition node must be a vectorized alternate-opcode ICmp/FCmp
// node, and none of the three operand nodes may carry reorder or reuse
// indices. The guarded statement is not visible.
14716 if (CmpTE->State != TreeEntry::Vectorize || !CmpTE->isAltShuffle() ||
14717 (CmpTE->getOpcode() != Instruction::ICmp &&
14718 CmpTE->getOpcode() != Instruction::FCmp) ||
14719 !CmpTE->ReorderIndices.empty() || !CmpTE->ReuseShuffleIndices.empty() ||
14720 !Op1TE->ReorderIndices.empty() || !Op1TE->ReuseShuffleIndices.empty() ||
14721 !Op2TE->ReorderIndices.empty() || !Op2TE->ReuseShuffleIndices.empty())
// Both value operands have to be gather nodes.
14724 if (!Op1TE->isGather() || !Op2TE->isGather())
// Match the main compare to capture its predicate as the reference predicate.
14727 auto *
Cmp = CmpTE->getMainOp();
14730 if (!
match(Cmp, MatchCmp))
14732 CmpPredicate MainPred = Pred;
// Every compare scalar must match the same pattern and be used exactly once;
// the bodies of these checks are not visible.
14735 for (
const auto [Idx, V] :
enumerate(CmpTE->Scalars)) {
14736 if (!
match(V, MatchCmp))
14742 if (!
V->hasOneUse())
// The case where no index was collected is handled separately (the guarded
// statement is not visible).
14747 if (InversedCmpsIndices.
empty())
// Cost of a single vector compare using the main predicate.
14755 TTI->getCmpSelInstrCost(CmpTE->getOpcode(), VecTy, CmpTy, MainPred,
14756 CostKind, getOperandInfo(CmpTE->getOperand(0)),
14757 getOperandInfo(CmpTE->getOperand(1)));
// BVCost accumulates per-instruction costs over the compare scalars.
14762 for (
Value *V : CmpTE->Scalars) {
14766 BVCost += TTI->getInstructionCost(
I,
CostKind);
14768 return VecCost < BVCost;
/// Inspects the Select node \p SelectTE whose value operands are gather nodes
/// and reports whether `BitcastCost <= SelectCost` holds for the costs
/// computed below.
///
/// \param SelectTE Select node (asserted to have state and opcode Select).
///
/// NOTE(review): the bodies of the guards (presumably early `return false`
/// statements) and the declarations of VecTy, CmpTy, FMF, CostKind,
/// BitcastCost and SelectCost are not visible in this excerpt.
14771bool BoUpSLP::matchesSelectOfBits(
const TreeEntry &SelectTE)
const {
14772 assert(SelectTE.hasState() && SelectTE.getOpcode() == Instruction::Select &&
14773 "Expected select node.");
// Guards on the environment and on the select node: big-endian data layout,
// reorder/reuse indices on the node, a missing UserIgnoreList, a node that is
// not at index 0, and scalars without exactly one use are each tested.
14774 if (DL->isBigEndian())
14776 if (!SelectTE.ReorderIndices.empty() || !SelectTE.ReuseShuffleIndices.empty())
14778 if (!UserIgnoreList || SelectTE.Idx != 0)
14780 if (
any_of(SelectTE.Scalars, [](
Value *V) { return !V->hasOneUse(); }))
// A further test over the entries of UserIgnoreList; its predicate is not
// visible.
14783 if (
any_of(*UserIgnoreList,
// Operands 1 and 2 are the selected values; both must be gather nodes without
// reuse indices (operand 1 additionally without reorder indices).
14786 const TreeEntry *Op1TE = getOperandEntry(&SelectTE, 1);
14787 const TreeEntry *Op2TE = getOperandEntry(&SelectTE, 2);
14788 if (!Op1TE->isGather() || !Op2TE->isGather())
14791 if (!Op1TE->ReorderIndices.empty() || !Op1TE->ReuseShuffleIndices.empty() ||
14792 !Op2TE->ReuseShuffleIndices.empty())
14794 Type *ScalarTy = Op1TE->Scalars.front()->getType();
// Fires when any operand-2 scalar is not an integer zero.
14798 if (
any_of(Op2TE->Scalars, [](
Value *V) { return !match(V, m_ZeroInt()); }))
// Predicate flagging an element that is NOT a constant power of two whose
// log2 equals its own index, i.e. not `1 << index`. The range it is applied to
// is on a line not visible (presumably the operand-1 scalars).
14803 return !(match(P.value(), m_ConstantInt(V)) && isPowerOf2_64(V) &&
14804 Log2_64(V) == P.index());
// Integer type with one bit per lane of the select node.
14808 auto *DstTy = IntegerType::getIntNTy(ScalarTy->
getContext(),
14809 SelectTE.getVectorFactor());
// If the node is recorded in MinBWs, an effective scalar type is derived (the
// rest of that statement is not visible).
14814 auto It = MinBWs.find(&SelectTE);
14815 if (It != MinBWs.end()) {
14816 auto *EffectiveScalarTy =
// When the packed type differs from the scalar type, a ZExt from DstTy to
// ScalarTy is added to BitcastCost.
14824 if (DstTy != ScalarTy) {
14825 BitcastCost += TTI->getCastInstrCost(Instruction::ZExt, ScalarTy, DstTy,
// Cost of the alternative: a vector select plus an Or reduction.
14830 TTI->getCmpSelInstrCost(Instruction::Select, VecTy, CmpTy,
14832 getOperandInfo(Op1TE->Scalars),
14833 getOperandInfo(Op2TE->Scalars)) +
14834 TTI->getArithmeticReductionCost(Instruction::Or, VecTy, FMF,
CostKind);
// Ties go to the bitcast form.
14835 return BitcastCost <= SelectCost;
14840 BaseGraphSize = VectorizableTree.size();
// Local scope guard for the IsGraphTransformMode flag: the constructor sets
// the referenced flag to true and the destructor sets it to false. Note that
// the destructor writes false unconditionally; it does not restore whatever
// value the flag had before construction.
14842 class GraphTransformModeRAAI {
// Reference to the flag being toggled (not a saved copy of its old value).
14843 bool &SavedIsGraphTransformMode;
14846 GraphTransformModeRAAI(
bool &IsGraphTransformMode)
14847 : SavedIsGraphTransformMode(IsGraphTransformMode) {
14848 IsGraphTransformMode =
true;
14850 ~GraphTransformModeRAAI() { SavedIsGraphTransformMode =
false; }
// The guard instance lives until the end of the enclosing scope.
14851 } TransformContext(IsGraphTransformMode);
14860 const InstructionsState &S) {
14864 I2->getOperand(
Op));
14865 return all_of(Candidates, [
this](
14866 ArrayRef<std::pair<Value *, Value *>> Cand) {
14868 [](
const std::pair<Value *, Value *> &
P) {
14878 TreeEntry &E = *VectorizableTree[Idx];
14880 reorderGatherNode(E);
14885 constexpr unsigned VFLimit = 16;
14886 bool ForceLoadGather =
14887 count_if(VectorizableTree, [&](
const std::unique_ptr<TreeEntry> &TE) {
14888 return TE->isGather() && TE->hasState() &&
14889 TE->getOpcode() == Instruction::Load &&
14890 TE->getVectorFactor() < VFLimit;
14896 return TE->isSame(VL) ||
all_of(VL, [&](
Value *V) {
// Helper predicate over a tree entry E. It runs up to three similar checks,
// each asking whether some entry in a candidate list TEs satisfies
// AreReusedScalars for E.Scalars, with a per-value callback that looks up the
// entries of the value and tests whether any of them is contained in TEs.
// NOTE(review): the lines that initialise TEs and It, and the statements
// executed when a check succeeds, are not visible in this excerpt.
14905 auto CheckForSameVectorNodes = [&](
const TreeEntry &E) {
// The first two checks only apply to entries that have state.
14906 if (E.hasState()) {
// First check: the per-value lookup uses getTreeEntries.
14908 !TEs.
empty() &&
any_of(TEs, [&](
const TreeEntry *TE) {
14909 return AreReusedScalars(TE, E.Scalars, [&](
Value *V) {
14910 ArrayRef<TreeEntry *> VTEs = getTreeEntries(V);
14911 return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
14912 return is_contained(TEs, TE);
// Second check: same shape, but the per-value lookup uses getSplitTreeEntries.
14919 !TEs.
empty() &&
any_of(TEs, [&](
const TreeEntry *TE) {
14920 return AreReusedScalars(TE, E.Scalars, [&](
Value *V) {
14921 ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V);
14922 return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
14923 return is_contained(TEs, TE);
// Third check: runs when an iterator into E.Scalars found an element (the
// search that produces It is not visible); again uses getSplitTreeEntries.
14931 if (It != E.Scalars.end()) {
14933 !TEs.empty() &&
any_of(TEs, [&](
const TreeEntry *TE) {
14934 return AreReusedScalars(TE, E.Scalars, [&](
Value *V) {
14935 ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V);
14936 return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
14937 return is_contained(TEs, TE);
14947 for (
unsigned Idx :
seq<unsigned>(BaseGraphSize)) {
14948 TreeEntry &
E = *VectorizableTree[Idx];
14949 if (
E.isGather()) {
14952 unsigned MinVF =
getMinVF(2 * Sz);
14955 if (VL.
size() <= 2 || LoadEntriesToVectorize.contains(Idx) ||
14956 !(!
E.hasState() ||
E.getOpcode() == Instruction::Load ||
14962 if (ForceLoadGather &&
E.hasState() &&
E.getOpcode() == Instruction::Load)
14965 if (CheckForSameVectorNodes(
E))
14969 unsigned StartIdx = 0;
14970 unsigned End = VL.
size();
14971 SmallBitVector Processed(End);
14973 *TTI, VL.
front()->getType(), VL.
size() - 1);
14975 *TTI, VL.
front()->getType(), VF - 1)) {
14976 if (StartIdx + VF > End)
14979 bool AllStrided =
true;
14985 for (
unsigned Cnt = StartIdx; Cnt < End; Cnt += VF) {
14986 const unsigned SliceVF = std::min(VF, End - Cnt);
14993 !getSameValuesTreeEntry(Slice.
front(), Slice,
true))
15000 bool IsSplat =
isSplat(Slice);
15001 bool IsTwoRegisterSplat =
true;
15002 if (IsSplat && VF == 2) {
15006 IsTwoRegisterSplat = NumRegs2VF == 2;
15008 if (Slices.
empty() || !IsSplat || !IsTwoRegisterSplat ||
15016 (S.getOpcode() == Instruction::Load &&
// Cache for the answer of IsMainOpCheap; empty until the first call.
15021 std::optional<bool> MainOpIsCheap;
// Answers whether the main operation of S counts as cheap. The value is
// computed only while the cache is empty and is returned from the cache
// afterwards. The computation compares the instruction cost of S.getMainOp()
// against a bound that is on a line not visible in this excerpt.
15022 auto IsMainOpCheap = [&] {
15023 if (!MainOpIsCheap)
15025 TTI->getInstructionCost(S.getMainOp(),
CostKind) <
15027 return *MainOpIsCheap;
15031 if ((!UserIgnoreList ||
E.Idx != 0) && IsMainOpCheap() &&
15039 if (S.getOpcode() == Instruction::Load) {
15044 PointerOps, SPtrInfo);
15055 if (UserIgnoreList &&
E.Idx == 0)
15060 }
else if (S.getOpcode() == Instruction::ExtractElement ||
15061 (IsMainOpCheap() &&
15062 !CheckOperandsProfitability(
15079 if (VF == 2 && AllStrided && Slices.
size() > 2)
// Registers a sub-node for the slice [Cnt, Cnt + Sz) of the current entry E.
//   Idx - value stored as the first member of the recorded pair
//   Cnt - start offset of the slice, stored as the second member
//   Sz  - number of elements in the slice
15081 auto AddCombinedNode = [&](
unsigned Idx,
unsigned Cnt,
unsigned Sz) {
15082 E.CombinedEntriesWithIndices.emplace_back(Idx, Cnt);
// Mark the slice as handled.
15083 Processed.set(Cnt, Cnt + Sz);
// If the slice sits at the front of the still-open range, advance the start.
15084 if (StartIdx == Cnt)
15085 StartIdx = Cnt + Sz;
// If the slice sits at the back of the still-open range, the guarded statement
// (not visible here) presumably shrinks End accordingly.
15086 if (End == Cnt + Sz)
15089 for (
auto [Cnt, Sz] : Slices) {
15091 const TreeEntry *SameTE =
nullptr;
15093 It != Slice.
end()) {
15095 SameTE = getSameValuesTreeEntry(*It, Slice);
15097 unsigned PrevSize = VectorizableTree.size();
15098 [[maybe_unused]]
unsigned PrevEntriesSize =
15099 LoadEntriesToVectorize.size();
15100 buildTreeRec(Slice, 0,
EdgeInfo(&
E, UINT_MAX));
15101 if (PrevSize + 1 == VectorizableTree.size() && !SameTE &&
15102 VectorizableTree[PrevSize]->isGather() &&
15103 VectorizableTree[PrevSize]->hasState() &&
15104 VectorizableTree[PrevSize]->getOpcode() !=
15105 Instruction::ExtractElement &&
15107 if (UserIgnoreList &&
E.Idx == 0 && VF == 2)
15109 VectorizableTree.pop_back();
15110 assert(PrevEntriesSize == LoadEntriesToVectorize.size() &&
15111 "LoadEntriesToVectorize expected to remain the same");
15114 AddCombinedNode(PrevSize, Cnt, Sz);
15118 if (
E.CombinedEntriesWithIndices.empty() && !
E.ReorderIndices.empty()) {
15119 SmallVector<int>
Mask(
E.ReorderIndices.begin(),
E.ReorderIndices.end());
15121 E.ReorderIndices.clear();
15126 switch (
E.getOpcode()) {
15127 case Instruction::Load: {
15130 if (
E.State != TreeEntry::Vectorize)
15132 Type *ScalarTy =
E.getMainOp()->getType();
15139 TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
15140 SmallVector<int>
Mask;
15144 TTI->getMemoryOpCost(Instruction::Load, VecTy, BaseLI->getAlign(),
15145 BaseLI->getPointerAddressSpace(),
CostKind,
15149 MemIntrinsicCostAttributes(Intrinsic::experimental_vp_strided_load,
15150 VecTy, BaseLI->getPointerOperand(),
15151 false, CommonAlignment,
15158 ->getPointerOperand()
15161 SPtrInfo.
StrideVal = ConstantInt::get(StrideTy, 1);
15162 SPtrInfo.Ty = VecTy;
15163 TreeEntryToStridedPtrInfoMap[&
E] = SPtrInfo;
15164 E.State = TreeEntry::StridedVectorize;
15169 case Instruction::Store: {
15178 TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
15179 SmallVector<int>
Mask;
15183 TTI->getMemoryOpCost(Instruction::Store, VecTy, BaseSI->getAlign(),
15184 BaseSI->getPointerAddressSpace(),
CostKind,
15188 MemIntrinsicCostAttributes(Intrinsic::experimental_vp_strided_store,
15189 VecTy, BaseSI->getPointerOperand(),
15190 false, CommonAlignment,
15193 if (StridedCost < OriginalVecCost) {
15196 E.State = TreeEntry::StridedVectorize;
15198 ->getPointerOperand()
15202 SPtrInfo.Ty = VecTy;
15203 TreeEntryToStridedPtrInfoMap[&
E] = SPtrInfo;
15205 }
else if (!
E.ReorderIndices.empty()) {
// Inspects a shuffle mask with respect to interleaving. Captures the
// enclosing scope by reference and binds TTI to the pointee of the outer TTI.
// NOTE(review): the return statements are not visible in this excerpt; the
// result is presumably an interleave factor, with 0 meaning "none".
15207 auto IsInterleaveMask = [&, &TTI = *TTI](ArrayRef<int>
Mask) {
15209 assert(
Mask.size() > 1 &&
"Expected mask greater than 1 element.");
// Masks with fewer than four elements are handled by a guard whose body is
// not visible.
15210 if (
Mask.size() < 4)
// The remaining condition combines a mask test taking Factor and the fixed
// element count of VecTy (the callee name is on a line not visible) with a
// target query for the legality of an interleaved access of VecTy with that
// factor, using the alignment and address space of BaseSI.
15214 Mask, Factor, VecTy->getElementCount().getFixedValue()) &&
15215 TTI.isLegalInterleavedAccessType(
15216 VecTy, Factor, BaseSI->getAlign(),
15217 BaseSI->getPointerAddressSpace()))
15223 SmallVector<int>
Mask(
E.ReorderIndices.begin(),
E.ReorderIndices.end());
15224 unsigned InterleaveFactor = IsInterleaveMask(Mask);
15225 if (InterleaveFactor != 0)
15226 E.setInterleave(InterleaveFactor);
15230 case Instruction::Select: {
15231 if (
E.State != TreeEntry::Vectorize)
15236 E.CombinedOp = TreeEntry::MinMax;
15237 TreeEntry *CondEntry = getOperandEntry(&
E, 0);
15238 if (SelectOnly && CondEntry->UserTreeIndex &&
15239 CondEntry->State == TreeEntry::Vectorize) {
15241 CondEntry->State = TreeEntry::CombinedVectorize;
15246 SmallVector<unsigned> InversedCmpsIndices;
15247 if (matchesInversedZExtSelect(
E, InversedCmpsIndices)) {
15248 auto *CmpTE = getOperandEntry(&
E, 0);
15249 auto *Op1TE = getOperandEntry(&
E, 1);
15250 auto *Op2TE = getOperandEntry(&
E, 2);
15252 CmpTE->setOperations(
15253 InstructionsState(CmpTE->getMainOp(), CmpTE->getMainOp()));
// Helper taking an old and a new tree entry plus further parameters that are
// on a line not visible here (the body refers to a value V). It looks V up in
// ValueToGatherNodes, asserts that an entry exists, and binds C to the mapped
// value. NOTE(review): what is then done with C, OldTE and NewTE is not
// visible in this excerpt.
15256 auto UpdateGatherEntry = [&](TreeEntry *OldTE, TreeEntry *NewTE,
15260 auto It = ValueToGatherNodes.find(V);
15261 assert(It != ValueToGatherNodes.end() &&
15262 "Expected to find the value in the map.");
15263 auto &
C = It->getSecond();
15270 for (
const unsigned Idx : InversedCmpsIndices) {
15271 Value *V1 = Op1TE->Scalars[Idx];
15272 Value *V2 = Op2TE->Scalars[Idx];
15273 std::swap(Op1TE->Scalars[Idx], Op2TE->Scalars[Idx]);
15275 UpdateGatherEntry(Op1TE, Op2TE, V1);
15276 UpdateGatherEntry(Op2TE, Op1TE, V2);
15278 OperandsToTreeEntry.emplace_or_assign(std::make_pair(&
E, 1), Op1TE);
15279 OperandsToTreeEntry.emplace_or_assign(std::make_pair(&
E, 2), Op2TE);
15282 if (matchesSelectOfBits(
E)) {
15284 const TreeEntry::CombinedOpcode
Code = TreeEntry::ReducedCmpBitcast;
15285 E.CombinedOp =
Code;
15286 auto *Op1TE = getOperandEntry(&
E, 1);
15287 auto *Op2TE = getOperandEntry(&
E, 2);
15288 Op1TE->State = TreeEntry::CombinedVectorize;
15289 Op1TE->CombinedOp =
Code;
15290 Op2TE->State = TreeEntry::CombinedVectorize;
15291 Op2TE->CombinedOp =
Code;
15296 case Instruction::FSub:
15297 case Instruction::FAdd: {
15299 if (
E.State != TreeEntry::Vectorize ||
15300 !
E.getOperations().isAddSubLikeOp() ||
15301 E.getOperations().isAltShuffle())
15303 const TreeEntry *
LHS = getOperandEntry(&
E, 0);
15304 const TreeEntry *
RHS = getOperandEntry(&
E, 1);
// Predicate over an operand tree entry: it requires a node in the Vectorize
// state, with no reorder or reuse indices, whose opcode is FMul and which is
// not an alternate-opcode node. A further per-scalar condition follows; it
// accepts a scalar that is a copyable element of a node with copyable
// elements, or satisfies an alternative that is on a line not visible here
// (presumably a one-use check, going by the lambda's name).
15305 auto IsOneUseVectorFMulOperand = [](
const TreeEntry *
TE) {
15306 return TE->State == TreeEntry::Vectorize &&
15307 TE->ReorderIndices.empty() &&
TE->ReuseShuffleIndices.empty() &&
15308 TE->getOpcode() == Instruction::FMul && !
TE->isAltShuffle() &&
15310 return (TE->hasCopyableElements() &&
15311 TE->isCopyableElement(V)) ||
15315 if (!IsOneUseVectorFMulOperand(
LHS) &&
15316 (
E.getOpcode() == Instruction::FSub ||
15317 !IsOneUseVectorFMulOperand(
RHS)))
15323 E.CombinedOp = TreeEntry::FMulAdd;
15324 TreeEntry *FMulEntry = getOperandEntry(&
E, 0);
15325 if (FMulEntry->UserTreeIndex &&
15326 FMulEntry->State == TreeEntry::Vectorize) {
15328 FMulEntry->State = TreeEntry::CombinedVectorize;
15332 case Instruction::Shl: {
15333 if (
E.Idx != 0 || DL->isBigEndian())
15335 if (!UserIgnoreList)
15345 if (!matchesShlZExt(
E, Order, IsBSwap, ForLoads))
15348 TreeEntry::CombinedOpcode
Code =
15349 IsBSwap ? (ForLoads ? TreeEntry::ReducedBitcastBSwapLoads
15350 : TreeEntry::ReducedBitcastBSwap)
15351 : (ForLoads ? TreeEntry::ReducedBitcastLoads
15352 : TreeEntry::ReducedBitcast);
15353 E.CombinedOp =
Code;
15354 E.ReorderIndices = std::move(Order);
15355 TreeEntry *ZExtEntry = getOperandEntry(&
E, 0);
15356 assert(ZExtEntry->UserTreeIndex &&
15357 ZExtEntry->State == TreeEntry::Vectorize &&
15358 ZExtEntry->getOpcode() == Instruction::ZExt &&
15359 "Expected ZExt node.");
15361 ZExtEntry->State = TreeEntry::CombinedVectorize;
15362 ZExtEntry->CombinedOp =
Code;
15364 TreeEntry *LoadsEntry = getOperandEntry(ZExtEntry, 0);
15365 assert(LoadsEntry->UserTreeIndex &&
15366 LoadsEntry->State == TreeEntry::Vectorize &&
15367 LoadsEntry->getOpcode() == Instruction::Load &&
15368 "Expected Load node.");
15370 LoadsEntry->State = TreeEntry::CombinedVectorize;
15371 LoadsEntry->CombinedOp =
Code;
15373 TreeEntry *ConstEntry = getOperandEntry(&
E, 1);
15374 assert(ConstEntry->UserTreeIndex && ConstEntry->isGather() &&
15375 "Expected ZExt node.");
15377 ConstEntry->State = TreeEntry::CombinedVectorize;
15378 ConstEntry->CombinedOp =
Code;
15386 if (LoadEntriesToVectorize.empty()) {
15388 if (VectorizableTree.size() <= 1 && VectorizableTree.front()->hasState() &&
15389 VectorizableTree.front()->getOpcode() == Instruction::Load)
15392 constexpr unsigned SmallTree = 3;
15393 constexpr unsigned SmallVF = 2;
15394 if ((VectorizableTree.size() <= SmallTree &&
15395 VectorizableTree.front()->Scalars.size() == SmallVF) ||
15396 (VectorizableTree.size() <= 2 && UserIgnoreList))
15399 if (VectorizableTree.front()->isNonPowOf2Vec() &&
15403 [](
const std::unique_ptr<TreeEntry> &TE) {
15404 return TE->isGather() &&
TE->hasState() &&
15405 TE->getOpcode() == Instruction::Load &&
15413 SmallMapVector<std::tuple<BasicBlock *, Value *, Type *>,
15417 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
15418 TreeEntry &
E = *
TE;
15419 if (
E.isGather() &&
15420 ((
E.hasState() &&
E.getOpcode() == Instruction::Load) ||
15421 (!
E.hasState() &&
any_of(
E.Scalars,
15423 return isa<LoadInst>(V) &&
15424 !isVectorized(V) &&
15425 !isDeleted(cast<Instruction>(V));
15428 for (
Value *V :
E.Scalars) {
15435 *
this, V, *DL, *SE, *TTI,
15436 GatheredLoads[std::make_tuple(
15444 if (!GatheredLoads.
empty())
15445 tryToVectorizeGatheredLoads(GatheredLoads);
15455 bool IsFinalized =
false;
15481 bool SameNodesEstimated =
true;
15484 if (Ty->getScalarType()->isPointerTy()) {
15488 DL.getTypeStoreSizeInBits(Ty->getScalarType()))),
15489 Ty->getScalarType());
15507 assert(It != VL.
end() &&
"Expected at least one non-undef value.");
15510 count(VL, *It) > 1 &&
15512 if (!NeedShuffle) {
15515 return TTI.getShuffleCost(
15520 return TTI.getVectorInstrCost(Instruction::InsertElement, VecTy,
15521 CostKind, std::distance(VL.
begin(), It),
15527 return isa<PoisonValue>(V) ? PoisonMaskElem : 0;
15530 TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, 0,
15534 VecTy, ShuffleMask, CostKind,
15538 return GatherCost +
15541 : R.getGatherCost(Gathers, !Root && VL.
equals(Gathers),
15549 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
15550 unsigned NumParts) {
15551 assert(VL.
size() > NumParts &&
"Unexpected scalarized shuffle.");
15559 return std::max(Sz, VecTy->getNumElements());
15566 -> std::optional<TTI::ShuffleKind> {
15567 if (NumElts <= EltsPerVector)
15568 return std::nullopt;
15573 return std::min(S,
I);
15576 int OffsetReg1 = OffsetReg0;
15580 int FirstRegId = -1;
15581 Indices.assign(1, OffsetReg0);
15585 int Idx =
I - OffsetReg0;
15587 (Idx / NumElts) * NumParts + (Idx % NumElts) / EltsPerVector;
15588 if (FirstRegId < 0)
15589 FirstRegId = RegId;
15590 RegIndices.
insert(RegId);
15591 if (RegIndices.
size() > 2)
15592 return std::nullopt;
15593 if (RegIndices.
size() == 2) {
15595 if (Indices.
size() == 1) {
15598 std::next(Mask.begin(), Pos), Mask.end(), INT_MAX,
15599 [&](
int S,
int I) {
15600 if (I == PoisonMaskElem)
15602 int RegId = ((I - OffsetReg0) / NumElts) * NumParts +
15603 ((I - OffsetReg0) % NumElts) / EltsPerVector;
15604 if (RegId == FirstRegId)
15606 return std::min(S, I);
15609 unsigned Index = OffsetReg1 % NumElts;
15610 Indices.push_back(Index);
15611 SubVecSizes.push_back(std::min(NumElts - Index, EltsPerVector));
15613 Idx =
I - OffsetReg1;
15615 I = (Idx % NumElts) % EltsPerVector +
15616 (RegId == FirstRegId ? 0 : EltsPerVector);
15618 return ShuffleKind;
15626 if (!ShuffleKinds[Part])
15629 Part * EltsPerVector,
getNumElems(Mask.size(), EltsPerVector, Part));
15634 std::optional<TTI::ShuffleKind> RegShuffleKind =
15635 CheckPerRegistersShuffle(SubMask, Indices, SubVecSizes);
15636 if (!RegShuffleKind) {
15639 MaskSlice, std::max<unsigned>(NumElts, MaskSlice.
size())))
15641 TTI, *ShuffleKinds[Part],
15648 TTI, *RegShuffleKind,
15652 *R.TTI, VL.
front()->getType(),
alignTo(NumElts, EltsPerVector));
15653 for (
const auto [Idx, SubVecSize] :
zip(Indices, SubVecSizes)) {
15654 assert((Idx + SubVecSize) <= BaseVF &&
15655 "SK_ExtractSubvector index out of range");
15666 TTI, *ShuffleKinds[Part],
15668 if (OriginalCost < Cost)
15669 Cost = OriginalCost;
/// Accounts for permuting node \p E1 (and optionally \p E2) into the running
/// estimate by updating Cost, CommonMask and SameNodesEstimated.
///
/// \param E1        First node of the permutation.
/// \param E2        Optional second node; tested against null below.
/// \param SliceSize Number of mask elements per part; used as the stride when
///                  copying a sub-mask into CommonMask.
///
/// NOTE(review): part of the parameter list is not visible in this excerpt
/// (the body also uses Mask and Part), and neither are several branch
/// conditions and closing braces. The comments describe only the visible
/// statements.
15676 void estimateNodesPermuteCost(
const TreeEntry &E1,
const TreeEntry *E2,
15678 unsigned SliceSize) {
// While the same nodes are still being estimated, work on the slice of the
// mask that belongs to Part.
15679 if (SameNodesEstimated) {
15685 if ((InVectors.size() == 2 &&
15689 unsigned Limit =
getNumElems(Mask.size(), SliceSize, Part);
15692 "Expected all poisoned elements.");
// Place the sub-mask at the offset of this part inside the common mask.
15694 copy(SubMask, std::next(CommonMask.begin(), SliceSize * Part));
// Charge a shuffle of the current inputs (the second input is null when only
// one input vector is registered), then rewrite CommonMask in place to
// account for that shuffle.
15699 Cost += createShuffle(InVectors.front(),
15700 InVectors.size() == 1 ?
nullptr : InVectors.back(),
15702 transformMaskAfterShuffle(CommonMask, CommonMask);
15703 }
else if (InVectors.size() == 2) {
// Two inputs are pending: charge a shuffle that combines them first.
15704 Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
15705 transformMaskAfterShuffle(CommonMask, CommonMask);
// From this point on the nodes are no longer treated as the same ones.
15707 SameNodesEstimated =
false;
// Single new node combined with a single registered input.
15708 if (!E2 && InVectors.size() == 1) {
// VF becomes the largest vector factor among E1 and the registered input (the
// lines selecting between the value and tree-entry forms are not visible).
15709 unsigned VF = E1.getVectorFactor();
15711 VF = std::max(VF, getVF(V1));
15714 VF = std::max(VF, E->getVectorFactor());
// Mask entries that refer to E1 are shifted by VF so that they index the
// second shuffle operand (the per-element condition is not visible).
15716 for (
unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
15718 CommonMask[Idx] = Mask[Idx] + VF;
15719 Cost += createShuffle(InVectors.front(), &E1, CommonMask);
15720 transformMaskAfterShuffle(CommonMask, CommonMask);
// Otherwise: first charge the shuffle of E1 with E2 under Mask, then combine
// that with the previously registered input P.
15722 auto P = InVectors.front();
15723 Cost += createShuffle(&E1, E2, Mask);
15724 unsigned VF = Mask.size();
15730 VF = std::max(VF, E->getVectorFactor());
// Selected lanes are mapped to their own index, offset by VF unless InVectors
// is empty (the per-element condition is not visible).
15732 for (
unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
15734 CommonMask[Idx] = Idx + (InVectors.empty() ? 0 : VF);
15735 Cost += createShuffle(
P, InVectors.front(), CommonMask);
15736 transformMaskAfterShuffle(CommonMask, CommonMask);
/// Helper class used for shuffle cost computation; it is constructed from TTI
/// at its point of use elsewhere in the file.
/// NOTE(review): its data members, constructor and the signatures of its
/// cost-returning methods are not visible in this excerpt.
15740 class ShuffleCostBuilder {
// True when the mask is empty, or when VF equals the mask size and a further
// condition holds (that condition is on a line not visible; going by the
// function name it is presumably an identity-mask test).
15743 static bool isEmptyOrIdentity(
ArrayRef<int> Mask,
unsigned VF) {
15745 return Mask.empty() ||
15746 (VF == Mask.size() &&
15754 ~ShuffleCostBuilder() =
default;
// Two methods (signatures not visible) begin with the empty-or-identity test;
// the value returned in that case is not visible. At least one of them
// otherwise defers to the global ::getShuffleCost.
15760 if (isEmptyOrIdentity(Mask, VF))
15770 if (isEmptyOrIdentity(Mask, VF))
15772 return ::getShuffleCost(
// Intentionally a no-op in this builder.
15780 void resizeToMatch(
Value *&,
Value *&)
const {}
15790 ShuffleCostBuilder Builder(TTI);
15793 unsigned CommonVF = Mask.size();
// Returns the cost of the cast needed when the scalar type of node E differs
// from ScalarTy. The remaining parameters are on a line not visible in this
// excerpt (the body uses VF), as is the value returned when no cast is needed.
15795 auto GetNodeMinBWAffectedCost = [&](
const TreeEntry &E,
// Start from the type of the node's first scalar, treated as signed.
15799 Type *EScalarTy = E.Scalars.front()->getType();
15800 bool IsSigned =
true;
// If the node is recorded in MinBWs, take the signedness from that record (the
// accompanying type adjustment, if any, is on a line not visible).
15801 if (
auto It = R.MinBWs.find(&E); It != R.MinBWs.end()) {
15803 IsSigned = It->second.second;
15805 if (EScalarTy != ScalarTy) {
// Default to Trunc; switch to SExt/ZExt according to signedness under a size
// comparison of DstSz and SrcSz that is on a line not visible.
15806 unsigned CastOpcode = Instruction::Trunc;
15807 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
15808 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
15810 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
15811 return TTI.getCastInstrCost(CastOpcode,
getWidenedType(ScalarTy, VF),
15821 Type *EScalarTy = VecTy->getElementType();
15822 if (EScalarTy != ScalarTy) {
15824 unsigned CastOpcode = Instruction::Trunc;
15825 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
15826 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
15828 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
15829 return TTI.getCastInstrCost(
15835 if (!V1 && !V2 && !P2.isNull()) {
15838 unsigned VF = E->getVectorFactor();
15840 CommonVF = std::max(VF, E2->getVectorFactor());
15843 return Idx < 2 * static_cast<int>(CommonVF);
15845 "All elements in mask must be less than 2 * CommonVF.");
15846 if (E->Scalars.size() == E2->Scalars.size()) {
15850 for (
int &Idx : CommonMask) {
15853 if (Idx <
static_cast<int>(CommonVF) && !EMask.
empty())
15855 else if (Idx >=
static_cast<int>(CommonVF))
15856 Idx = (E2Mask.
empty() ? Idx - CommonVF : E2Mask[Idx - CommonVF]) +
15860 CommonVF = E->Scalars.size();
15861 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF) +
15862 GetNodeMinBWAffectedCost(*E2, CommonVF);
15864 ExtraCost += GetNodeMinBWAffectedCost(*E, E->getVectorFactor()) +
15865 GetNodeMinBWAffectedCost(*E2, E2->getVectorFactor());
15868 V2 = getAllOnesValue(*R.DL,
getWidenedType(ScalarTy, CommonVF));
15869 }
else if (!V1 && P2.isNull()) {
15872 unsigned VF = E->getVectorFactor();
15876 [=](
int Idx) {
return Idx < static_cast<int>(CommonVF); }) &&
15877 "All elements in mask must be less than CommonVF.");
15878 if (E->Scalars.size() == Mask.size() && VF != Mask.size()) {
15880 assert(!EMask.
empty() &&
"Expected non-empty common mask.");
15881 for (
int &Idx : CommonMask) {
15885 CommonVF = E->Scalars.size();
15886 }
else if (
unsigned Factor = E->getInterleaveFactor();
15887 Factor > 0 && E->Scalars.size() != Mask.size() &&
15891 std::iota(CommonMask.begin(), CommonMask.end(), 0);
15893 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF);
15896 if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() &&
15897 CommonVF == CommonMask.size() &&
15899 [](
const auto &&
P) {
15901 static_cast<unsigned>(
P.value()) !=
P.index();
15909 }
else if (V1 && P2.isNull()) {
15911 ExtraCost += GetValueMinBWAffectedCost(V1);
15912 CommonVF = getVF(V1);
15915 [=](
int Idx) {
return Idx < static_cast<int>(CommonVF); }) &&
15916 "All elements in mask must be less than CommonVF.");
15917 }
else if (V1 && !V2) {
15919 unsigned VF = getVF(V1);
15921 CommonVF = std::max(VF, E2->getVectorFactor());
15924 return Idx < 2 * static_cast<int>(CommonVF);
15926 "All elements in mask must be less than 2 * CommonVF.");
15927 if (E2->Scalars.size() == VF && VF != CommonVF) {
15929 assert(!E2Mask.
empty() &&
"Expected non-empty common mask.");
15930 for (
int &Idx : CommonMask) {
15933 if (Idx >=
static_cast<int>(CommonVF))
15934 Idx = E2Mask[Idx - CommonVF] + VF;
15938 ExtraCost += GetValueMinBWAffectedCost(V1);
15940 ExtraCost += GetNodeMinBWAffectedCost(
15941 *E2, std::min(CommonVF, E2->getVectorFactor()));
15942 V2 = getAllOnesValue(*R.DL,
getWidenedType(ScalarTy, CommonVF));
15943 }
else if (!V1 && V2) {
15945 unsigned VF = getVF(V2);
15947 CommonVF = std::max(VF, E1->getVectorFactor());
15950 return Idx < 2 * static_cast<int>(CommonVF);
15952 "All elements in mask must be less than 2 * CommonVF.");
15953 if (E1->Scalars.size() == VF && VF != CommonVF) {
15955 assert(!E1Mask.
empty() &&
"Expected non-empty common mask.");
15956 for (
int &Idx : CommonMask) {
15959 if (Idx >=
static_cast<int>(CommonVF))
15960 Idx = E1Mask[Idx - CommonVF] + VF;
15966 ExtraCost += GetNodeMinBWAffectedCost(
15967 *E1, std::min(CommonVF, E1->getVectorFactor()));
15969 ExtraCost += GetValueMinBWAffectedCost(V2);
15970 V2 = getAllOnesValue(*R.DL,
getWidenedType(ScalarTy, CommonVF));
15972 assert(V1 && V2 &&
"Expected both vectors.");
15973 unsigned VF = getVF(V1);
15974 CommonVF = std::max(VF, getVF(V2));
15977 return Idx < 2 * static_cast<int>(CommonVF);
15979 "All elements in mask must be less than 2 * CommonVF.");
15981 GetValueMinBWAffectedCost(V1) + GetValueMinBWAffectedCost(V2);
15984 V2 = getAllOnesValue(*R.DL,
getWidenedType(ScalarTy, CommonVF));
15989 V2 = getAllOnesValue(*R.DL,
getWidenedType(ScalarTy, CommonVF));
15992 InVectors.front() =
15994 if (InVectors.size() == 2)
15995 InVectors.pop_back();
15996 return ExtraCost + BaseShuffleAnalysis::createShuffle<InstructionCost>(
15997 V1, V2, CommonMask, Builder, ScalarTy, VL);
16004 : BaseShuffleAnalysis(ScalarTy), TTI(TTI),
16005 VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()), R(R),
16006 CheckedExtracts(CheckedExtracts) {}
16008 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
16009 unsigned NumParts,
bool &UseVecBaseAsInput) {
16010 UseVecBaseAsInput =
false;
16013 Value *VecBase =
nullptr;
16015 if (!E->ReorderIndices.empty()) {
16017 E->ReorderIndices.end());
16022 bool PrevNodeFound =
any_of(
16023 ArrayRef(R.VectorizableTree).take_front(E->Idx),
16024 [&](
const std::unique_ptr<TreeEntry> &TE) {
16025 return ((TE->hasState() && !TE->isAltShuffle() &&
16026 TE->getOpcode() == Instruction::ExtractElement) ||
16028 all_of(enumerate(TE->Scalars), [&](auto &&Data) {
16029 return VL.size() > Data.index() &&
16030 (Mask[Data.index()] == PoisonMaskElem ||
16031 isa<UndefValue>(VL[Data.index()]) ||
16032 Data.value() == VL[Data.index()]);
16040 ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
16054 VecBase = EE->getVectorOperand();
16055 UniqueBases.
insert(VecBase);
16057 if (!CheckedExtracts.
insert(V).second ||
16060 [&](
const TreeEntry *TE) {
16061 return R.DeletedNodes.contains(TE) ||
16062 R.TransformedToGatherNodes.contains(TE);
16064 (E->UserTreeIndex && E->UserTreeIndex.EdgeIdx == UINT_MAX &&
16065 !R.isVectorized(EE) &&
16067 count_if(E->UserTreeIndex.UserTE->Scalars,
16068 [&](
Value *V) { return V == EE; })) ||
16071 return isa<GetElementPtrInst>(U) &&
16072 !R.areAllUsersVectorized(cast<Instruction>(U),
16080 unsigned Idx = *EEIdx;
16082 if (EE->hasOneUse() || !PrevNodeFound) {
16088 Cost -=
TTI.getExtractWithExtendCost(
16092 Cost +=
TTI.getCastInstrCost(
16098 APInt &DemandedElts =
16099 VectorOpsToExtracts
16102 .first->getSecond();
16103 DemandedElts.
setBit(Idx);
16106 for (
const auto &[Vec, DemandedElts] : VectorOpsToExtracts)
16108 DemandedElts,
false,
16116 if (!PrevNodeFound)
16117 Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts);
16120 transformMaskAfterShuffle(CommonMask, CommonMask);
16121 SameNodesEstimated =
false;
16122 if (NumParts != 1 && UniqueBases.
size() != 1) {
16123 UseVecBaseAsInput =
true;
16131 std::optional<InstructionCost>
16135 return std::nullopt;
16139 IsFinalized =
false;
16140 CommonMask.clear();
16143 VectorizedVals.clear();
16144 SameNodesEstimated =
true;
16151 return Idx < static_cast<int>(E1.getVectorFactor());
16153 "Expected single vector shuffle mask.");
16157 if (InVectors.empty()) {
16158 CommonMask.assign(Mask.begin(), Mask.end());
16159 InVectors.assign({&E1, &E2});
16162 assert(!CommonMask.empty() &&
"Expected non-empty common mask.");
16164 unsigned NumParts =
16168 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
16169 estimateNodesPermuteCost(E1, &E2, Mask, Part, SliceSize);
16173 if (InVectors.empty()) {
16174 CommonMask.assign(Mask.begin(), Mask.end());
16175 InVectors.assign(1, &E1);
16178 assert(!CommonMask.empty() &&
"Expected non-empty common mask.");
16180 unsigned NumParts =
16184 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
16185 estimateNodesPermuteCost(E1,
nullptr, Mask, Part, SliceSize);
16186 if (!SameNodesEstimated && InVectors.size() == 1)
16187 InVectors.emplace_back(&E1);
16193 assert(InVectors.size() == 1 &&
16200 ->getOrdered(
P.index()));
16201 return EI->getVectorOperand() == V1 ||
16202 EI->getVectorOperand() == V2;
16204 "Expected extractelement vectors.");
16210 if (InVectors.empty()) {
16211 assert(CommonMask.empty() && !ForExtracts &&
16212 "Expected empty input mask/vectors.");
16213 CommonMask.assign(Mask.begin(), Mask.end());
16214 InVectors.assign(1, V1);
16220 !CommonMask.empty() &&
16224 ->getOrdered(
P.index());
16226 return P.value() == Mask[
P.index()] ||
16231 return EI->getVectorOperand() == V1;
16233 "Expected only tree entry for extractelement vectors.");
16236 assert(!InVectors.empty() && !CommonMask.empty() &&
16237 "Expected only tree entries from extracts/reused buildvectors.");
16238 unsigned VF = getVF(V1);
16239 if (InVectors.size() == 2) {
16240 Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
16241 transformMaskAfterShuffle(CommonMask, CommonMask);
16242 VF = std::max<unsigned>(VF, CommonMask.size());
16243 }
else if (
const auto *InTE =
16244 InVectors.front().dyn_cast<
const TreeEntry *>()) {
16245 VF = std::max(VF, InTE->getVectorFactor());
16249 ->getNumElements());
16251 InVectors.push_back(V1);
16252 for (
unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
16254 CommonMask[Idx] = Mask[Idx] + VF;
16257 Value *Root =
nullptr) {
16258 Cost += getBuildVectorCost(VL, Root);
16260 if (BVValues->empty() && InVectors.empty())
16261 BVValues->assign(VL.
begin(), VL.
end());
16268 unsigned VF = VL.
size();
16270 VF = std::min(VF, MaskVF);
16271 Type *VLScalarTy = VL.
front()->getType();
16295 getAllOnesValue(*R.DL, ScalarTy->getScalarType()));
16301 ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
16306 IsFinalized =
true;
16309 if (InVectors.
size() == 2)
16310 Cost += createShuffle(Vec, InVectors.
back(), CommonMask);
16312 Cost += createShuffle(Vec,
nullptr, CommonMask);
16313 transformMaskAfterShuffle(CommonMask, CommonMask);
16315 "Expected vector length for the final value before action.");
16318 Cost += createShuffle(V1, V2, Mask);
16321 InVectors.
front() = V;
16323 if (!SubVectors.empty()) {
16325 if (InVectors.
size() == 2)
16326 Cost += createShuffle(Vec, InVectors.
back(), CommonMask);
16328 Cost += createShuffle(Vec,
nullptr, CommonMask);
16329 transformMaskAfterShuffle(CommonMask, CommonMask);
16331 if (!SubVectorsMask.
empty()) {
16333 "Expected same size of masks for subvectors and common mask.");
16335 copy(SubVectorsMask, SVMask.begin());
16336 for (
auto [I1, I2] :
zip(SVMask, CommonMask)) {
16339 I1 = I2 + CommonMask.
size();
16347 for (
auto [
E, Idx] : SubVectors) {
16348 Type *EScalarTy =
E->Scalars.front()->getType();
16349 bool IsSigned =
true;
16350 if (
auto It =
R.MinBWs.find(
E); It !=
R.MinBWs.end()) {
16353 IsSigned = It->second.second;
16355 if (ScalarTy != EScalarTy) {
16356 unsigned CastOpcode = Instruction::Trunc;
16357 unsigned DstSz =
R.DL->getTypeSizeInBits(ScalarTy);
16358 unsigned SrcSz =
R.DL->getTypeSizeInBits(EScalarTy);
16360 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
16361 Cost += TTI.getCastInstrCost(
16371 if (!CommonMask.
empty()) {
16372 std::iota(std::next(CommonMask.
begin(), Idx),
16373 std::next(CommonMask.
begin(), Idx +
E->getVectorFactor()),
16379 if (!ExtMask.
empty()) {
16380 if (CommonMask.
empty()) {
16384 for (
int I = 0, Sz = ExtMask.
size();
I < Sz; ++
I) {
16387 NewMask[
I] = CommonMask[ExtMask[
I]];
16389 CommonMask.
swap(NewMask);
16392 if (CommonMask.
empty()) {
16393 assert(InVectors.
size() == 1 &&
"Expected only one vector with no mask");
16400 createShuffle(InVectors.
front(),
16401 InVectors.
size() == 2 ? InVectors.
back() :
nullptr,
16406 assert((IsFinalized || CommonMask.empty()) &&
16407 "Shuffle construction must be finalized.");
16411const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(
const TreeEntry *E,
16412 unsigned Idx)
const {
16413 TreeEntry *
Op = OperandsToTreeEntry.
at({E, Idx});
16414 assert(
Op->isSame(
E->getOperand(Idx)) &&
"Operands mismatch!");
16419 if (
TE.State == TreeEntry::ScatterVectorize ||
16420 TE.State == TreeEntry::StridedVectorize)
16422 if (
TE.State == TreeEntry::CompressVectorize)
16424 if (
TE.State == TreeEntry::Vectorize &&
TE.getOpcode() == Instruction::Load &&
16425 !
TE.isAltShuffle()) {
16426 if (
TE.ReorderIndices.empty())
16428 SmallVector<int>
Mask;
16447 if (!L->getExitingBlock())
16454uint64_t BoUpSLP::getScaleToLoopIterations(
const TreeEntry &TE,
Value *Scalar,
16470 if (LI->getLoopFor(
PHI->getParent())) {
16475 if (
PHI->getIncomingValue(
I) != Scalar)
16478 if (!Parent || LI->getLoopDepth(InBB) > LI->getLoopDepth(Parent))
16484 Parent =
U->getParent();
16485 }
else if (
TE.isGather() ||
TE.State == TreeEntry::SplitVectorize) {
16487 while (EI.UserTE) {
16488 if (EI.UserTE->isGather() ||
16489 EI.UserTE->State == TreeEntry::SplitVectorize) {
16490 EI = EI.UserTE->UserTreeIndex;
16493 if (EI.UserTE->State == TreeEntry::Vectorize &&
16494 EI.UserTE->getOpcode() == Instruction::PHI) {
16496 Parent = PH->getIncomingBlock(EI.EdgeIdx);
16498 Parent = EI.UserTE->getMainOp()->
getParent();
16505 Parent =
TE.getMainOp()->getParent();
16507 const Loop *
L = LI->getLoopFor(Parent);
16517uint64_t BoUpSLP::getLoopNestScale(
const Loop *L) {
16520 if (
auto It = LoopNestScaleCache.find(L); It != LoopNestScaleCache.end())
16525 SmallVector<const Loop *> Chain;
16526 for (
const Loop *Cur = L; Cur; Cur = Cur->getParentLoop()) {
16527 if (LoopNestScaleCache.contains(Cur))
16531 assert(!Chain.
empty() &&
"Early-return above should have handled cache hit.");
16532 uint64_t Scale = 1;
16533 if (
const Loop *Parent = Chain.
back()->getParentLoop())
16534 Scale = LoopNestScaleCache.lookup(Parent);
16538 for (
const Loop *Cur :
reverse(Chain)) {
16541 LoopNestScaleCache.try_emplace(Cur, std::max<uint64_t>(1, Scale));
16543 return std::max<uint64_t>(1, Scale);
16546uint64_t BoUpSLP::getGatherNodeEffectiveScale(
const TreeEntry &TE,
16551 assert((
TE.isGather() ||
TE.State == TreeEntry::SplitVectorize) &&
16552 "Expected gather/split tree entry.");
16554 uint64_t BaseScale = getScaleToLoopIterations(TE,
nullptr, U);
16574 bool Overflow =
false;
16575 for (
Value *V :
TE.Scalars) {
16579 uint64_t LaneScale =
16580 std::min(getScaleToLoopIterations(TE, V, U), BaseScale);
16588 uint64_t Numerator =
SaturatingAdd(Sum, uint64_t(
N - 1), &Overflow);
16591 uint64_t Avg = Numerator /
N;
16592 return std::clamp<uint64_t>(Avg, 1, BaseScale);
16596BoUpSLP::getVectorSpillReloadCost(
const TreeEntry *
E,
Type *ScalarTy,
16606 if (!
E->hasState() ||
E->getOpcode() == Instruction::Store ||
16607 E->getOpcode() == Instruction::ExtractElement ||
16608 E->getOpcode() == Instruction::ExtractValue ||
16609 E->getOpcode() == Instruction::Freeze ||
16610 (
E->getOpcode() == Instruction::Load &&
16611 E->State != TreeEntry::ScatterVectorize))
16612 return SpillsReloads;
16615 E->State == TreeEntry::Vectorize &&
E->getOpcode() == Instruction::PHI;
16616 SmallPtrSet<const TreeEntry *, 8> CountedOpEntries;
16617 SmallDenseMap<unsigned, unsigned> PressureByClass;
16618 auto AddPartsToClass = [&](
unsigned RegClass,
unsigned Parts) {
16619 assert(Parts != 0 &&
"Expected non-zero number of parts (registers).");
16620 PressureByClass[RegClass] += Parts;
16623 auto GetEntryVecTy = [&](
const TreeEntry *
TE) -> std::pair<Type *, Type *> {
16625 auto BWIt = MinBWs.find(TE);
16626 if (BWIt != MinBWs.end()) {
16632 return std::make_pair(ScalarTy,
16636 if (
E->State == TreeEntry::SplitVectorize) {
16637 for (
const auto &[Idx,
_] :
E->CombinedEntriesWithIndices) {
16638 const TreeEntry *OpTE = VectorizableTree[Idx].get();
16640 if (!CountedOpEntries.
insert(OpTE).second)
16642 auto [ScalarTy, OpVecTy] = GetEntryVecTy(OpTE);
16646 const unsigned RC =
16647 TTI->getRegisterClassForType(
true, OpVecTy);
16648 AddPartsToClass(RC, Parts);
16650 }
else if (IsPHI) {
16653 SmallDenseMap<unsigned, unsigned> MaxOpPressureByClass;
16655 const TreeEntry *OpTE = getOperandEntry(
E, Idx);
16656 auto [ScalarTy, OpVecTy] = GetEntryVecTy(OpTE);
16660 const unsigned RC =
16661 TTI->getRegisterClassForType(
true, OpVecTy);
16662 MaxOpPressureByClass[RC] = std::max(MaxOpPressureByClass[RC], Parts);
16664 for (
auto [RC, Parts] : MaxOpPressureByClass)
16665 AddPartsToClass(RC, Parts);
16670 if (
E->getOpcode() == Instruction::InsertElement && Idx == 0)
16678 const TreeEntry *OpTE = getOperandEntry(
E, Idx);
16680 if (!CountedOpEntries.
insert(OpTE).second)
16686 const unsigned RC =
16687 TTI->getRegisterClassForType(
true, OpVecTy);
16688 AddPartsToClass(RC, Parts);
16692 if (
E->getOpcode() != Instruction::Load) {
16694 if (ResParts != 0) {
16695 const unsigned RC = TTI->getRegisterClassForType(
true, VecTy);
16696 AddPartsToClass(RC, ResParts);
16698 if (VecTy != FinalVecTy) {
16699 const unsigned FinalResParts =
16701 if (FinalResParts != 0) {
16702 const unsigned RC =
16703 TTI->getRegisterClassForType(
true, FinalVecTy);
16704 AddPartsToClass(RC, FinalResParts);
16709 for (
auto [RegClass, UsedRegs] : PressureByClass) {
16710 const unsigned NumAvailRegs = TTI->getNumberOfRegisters(RegClass);
16711 if (NumAvailRegs == 0 || UsedRegs <= NumAvailRegs)
16713 const unsigned SpillCount = UsedRegs - NumAvailRegs;
16715 TTI->getRegisterClassReloadCost(RegClass,
CostKind);
16718 if (
E->Idx > 0 || !UserIgnoreList || !
E->Scalars[0]->getType()->isVoidTy())
16719 SingleRegSpillReload +=
16720 TTI->getRegisterClassSpillCost(RegClass,
CostKind);
16721 SpillsReloads += SingleRegSpillReload * SpillCount;
16723 return SpillsReloads;
16731 for (
unsigned I :
seq(VL.
size())) {
16732 if (!DemandedElts[
I])
16753 SmallPtrSetImpl<Value *> &CheckedExtracts) {
16757 if (
SLPReVec &&
E->State == TreeEntry::Vectorize &&
16758 E->getOpcode() == Instruction::InsertElement &&
16759 !
E->getOperand(1).back()->getType()->isVectorTy())
16762 return InstructionCost::getInvalid();
16767 auto It = MinBWs.find(
E);
16768 Type *OrigScalarTy = ScalarTy;
16769 if (It != MinBWs.end()) {
16775 const TreeEntry *ZExt = getOperandEntry(
E, 0);
16779 unsigned EntryVF =
E->getVectorFactor();
16783 getVectorSpillReloadCost(
E, ScalarTy, VecTy, FinalVecTy,
CostKind);
16784 if (
E->isGather() || TransformedToGatherNodes.contains(
E)) {
16788 return InstructionCost::getInvalid();
16789 return SpillsReloads +
16790 processBuildVector<ShuffleCostEstimator, InstructionCost>(
16791 E, ScalarTy, *TTI, VectorizedVals, *
this, CheckedExtracts);
16793 if (
E->State == TreeEntry::SplitVectorize) {
16794 assert(
E->CombinedEntriesWithIndices.size() == 2 &&
16795 "Expected exactly 2 combined entries.");
16796 assert(
E->ReuseShuffleIndices.empty() &&
"Expected empty reuses mask.");
16798 if (
E->ReorderIndices.empty()) {
16801 CostKind,
E->CombinedEntriesWithIndices.back().second,
16804 VectorizableTree[
E->CombinedEntriesWithIndices.back().first]
16805 ->getVectorFactor())));
16807 unsigned CommonVF =
16808 std::max(VectorizableTree[
E->CombinedEntriesWithIndices.front().first]
16809 ->getVectorFactor(),
16810 VectorizableTree[
E->CombinedEntriesWithIndices.back().first]
16811 ->getVectorFactor());
16817 VectorCost += SpillsReloads;
16818 LLVM_DEBUG(dumpTreeCosts(
E, 0, VectorCost, 0,
"Calculated costs for Tree"));
16822 SmallVector<int>
Mask;
16823 if (!
E->ReorderIndices.empty() &&
E->State != TreeEntry::CompressVectorize &&
16824 (
E->State != TreeEntry::StridedVectorize ||
16826 SmallVector<int> NewMask;
16827 if (
E->getOpcode() == Instruction::Store) {
16829 NewMask.
resize(
E->ReorderIndices.size());
16836 if (!
E->ReuseShuffleIndices.empty())
16840 "Expected non-struct vector type for shuffle cost calculation.");
16845 assert((
E->State == TreeEntry::Vectorize ||
16846 E->State == TreeEntry::ScatterVectorize ||
16847 E->State == TreeEntry::StridedVectorize ||
16848 E->State == TreeEntry::CompressVectorize) &&
16849 "Unhandled state");
16852 (
E->getOpcode() == Instruction::GetElementPtr &&
16853 E->getMainOp()->getType()->isPointerTy()) ||
16854 E->hasCopyableElements()) &&
16857 unsigned ShuffleOrOp =
16858 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector :
E->
getOpcode();
16859 if (
E->CombinedOp != TreeEntry::NotCombinedOp)
16860 ShuffleOrOp =
E->CombinedOp;
16861 SmallSetVector<Value *, 16> UniqueValues;
16862 SmallVector<unsigned, 16> UniqueIndexes;
16864 if (UniqueValues.insert(V))
16865 UniqueIndexes.push_back(Idx);
16866 const unsigned Sz = UniqueValues.size();
16867 SmallBitVector UsedScalars(Sz,
false);
16868 for (
unsigned I = 0;
I < Sz; ++
I) {
16870 !
E->isCopyableElement(UniqueValues[
I]) &&
16871 getTreeEntries(UniqueValues[
I]).
front() ==
E)
16873 UsedScalars.set(
I);
16875 auto GetCastContextHint = [&](
Value *
V) {
16877 return getCastContextHint(*OpTEs.front());
16878 InstructionsState SrcState =
getSameOpcode(
E->getOperand(0), *TLI);
16879 if (SrcState && SrcState.getOpcode() == Instruction::Load &&
16880 !SrcState.isAltShuffle())
16893 ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0);
16895 for (
unsigned I = 0;
I < Sz; ++
I) {
16896 if (UsedScalars.test(
I))
16898 ScalarCost += ScalarEltCost(
I);
16905 if (It != MinBWs.end() && !UnaryInstruction::isCast(
E->getOpcode()) &&
16907 (
E->getOpcode() != Instruction::Load ||
E->UserTreeIndex)) {
16909 if (!EI.UserTE->hasState() ||
16910 EI.UserTE->getOpcode() != Instruction::Select ||
16912 auto UserBWIt = MinBWs.find(EI.UserTE);
16913 Type *UserScalarTy =
16914 (EI.UserTE->isGather() ||
16915 EI.UserTE->State == TreeEntry::SplitVectorize)
16916 ? EI.UserTE->Scalars.front()->getType()
16917 : EI.UserTE->getOperand(EI.EdgeIdx).front()->getType();
16918 if (UserBWIt != MinBWs.end())
16920 UserBWIt->second.first);
16921 if (ScalarTy != UserScalarTy) {
16922 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
16923 unsigned SrcBWSz = DL->getTypeSizeInBits(UserScalarTy);
16924 unsigned VecOpcode;
16926 if (BWSz > SrcBWSz)
16927 VecOpcode = Instruction::Trunc;
16930 It->second.second ? Instruction::SExt : Instruction::ZExt;
16932 VecCost += TTI->getCastInstrCost(VecOpcode, UserVecTy, VecTy, CCH,
16937 VecCost += SpillsReloads;
16938 LLVM_DEBUG(dumpTreeCosts(
E, CommonCost, VecCost - CommonCost,
16939 ScalarCost,
"Calculated costs for Tree"));
16940 return VecCost - ScalarCost;
16945 assert((
E->State == TreeEntry::Vectorize ||
16946 E->State == TreeEntry::StridedVectorize ||
16947 E->State == TreeEntry::CompressVectorize) &&
16948 "Entry state expected to be Vectorize, StridedVectorize or "
16949 "MaskedLoadCompressVectorize here.");
16952 std::tie(ScalarCost, VecCost) =
16955 LLVM_DEBUG(dumpTreeCosts(
E, 0, VecCost, ScalarCost,
16956 "Calculated GEPs cost for Tree"));
16958 return VecCost - ScalarCost + SpillsReloads;
16964 return InstructionCost::getInvalid();
16965 Type *CanonicalType = Ty;
16971 IntrinsicCostAttributes CostAttrs(MinMaxID, CanonicalType,
16972 {CanonicalType, CanonicalType});
16974 TTI->getIntrinsicInstrCost(CostAttrs,
CostKind);
16977 if (VI && SelectOnly) {
16979 "Expected only for scalar type.");
16982 CI->
getOpcode(), Ty, Builder.getInt1Ty(), CI->getPredicate(),
16983 CostKind, {TTI::OK_AnyValue, TTI::OP_None},
16984 {TTI::OK_AnyValue, TTI::OP_None}, CI);
16988 auto GetFMulAddCost = [&, &TTI = *TTI](
const InstructionsState &S,
16993 switch (ShuffleOrOp) {
16994 case Instruction::PHI: {
16997 SmallPtrSet<const TreeEntry *, 4> CountedOps;
16998 for (
Value *V : UniqueValues) {
17003 ValueList Operands(
PHI->getNumIncomingValues(),
nullptr);
17004 for (
unsigned I = 0,
N =
PHI->getNumIncomingValues();
I <
N; ++
I) {
17008 if (
const TreeEntry *OpTE =
17009 getSameValuesTreeEntry(Operands.
front(), Operands))
17010 if (CountedOps.
insert(OpTE).second &&
17011 !OpTE->ReuseShuffleIndices.empty())
17012 ScalarCost +=
TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() -
17013 OpTE->Scalars.size());
17016 return CommonCost - ScalarCost + SpillsReloads;
17018 case Instruction::ExtractValue:
17019 case Instruction::ExtractElement: {
17024 if (ShuffleOrOp == Instruction::ExtractValue && !
E->StructEVIndices.empty())
17026 APInt DemandedElts;
17028 auto GetScalarCost = [&](
unsigned Idx) {
17034 if (ShuffleOrOp == Instruction::ExtractElement) {
17036 SrcVecTy = EE->getVectorOperandType();
17039 Type *AggregateTy = EV->getAggregateOperand()->getType();
17042 NumElts = ATy->getNumElements();
17048 if (
I->hasOneUse()) {
17058 Cost -= TTI->getCastInstrCost(
17064 if (DemandedElts.
isZero())
17070 return CommonCost - (DemandedElts.
isZero()
17072 : TTI.getScalarizationOverhead(
17073 SrcVecTy, DemandedElts,
false,
17076 return GetCostDiff(GetScalarCost, GetVectorCost);
17078 case Instruction::InsertElement: {
17079 assert(
E->ReuseShuffleIndices.empty() &&
17080 "Unique insertelements only are expected.");
17082 unsigned const NumElts = SrcVecTy->getNumElements();
17083 unsigned const NumScalars = VL.
size();
17085 unsigned NumOfParts =
17090 unsigned OffsetEnd = OffsetBeg;
17091 InsertMask[OffsetBeg] = 0;
17094 if (OffsetBeg > Idx)
17096 else if (OffsetEnd < Idx)
17098 InsertMask[Idx] =
I + 1;
17101 if (NumOfParts > 0 && NumOfParts < NumElts)
17102 VecScalarsSz =
PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
17103 unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
17105 unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
17106 unsigned InsertVecSz = std::min<unsigned>(
17108 ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * VecScalarsSz);
17109 bool IsWholeSubvector =
17110 OffsetBeg ==
Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
17114 if (OffsetBeg + InsertVecSz > VecSz) {
17117 InsertVecSz = VecSz;
17122 SmallVector<int>
Mask;
17123 if (!
E->ReorderIndices.empty()) {
17128 std::iota(
Mask.begin(), std::next(
Mask.begin(), InsertVecSz), 0);
17130 bool IsIdentity =
true;
17133 Mask.swap(PrevMask);
17134 for (
unsigned I = 0;
I < NumScalars; ++
I) {
17136 DemandedElts.
setBit(InsertIdx);
17137 AdjustedVL[InsertIdx] = VL[PrevMask[
I]];
17138 IsIdentity &= InsertIdx - OffsetBeg ==
I;
17139 Mask[InsertIdx - OffsetBeg] =
I;
17141 assert(
Offset < NumElts &&
"Failed to find vector index offset");
17151 InsertVecTy, Mask);
17153 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
17159 SmallBitVector InMask =
17161 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask));
17163 *TTI, ScalarTy, SrcVecTy, DemandedElts,
17166 if (!InMask.
all() && NumScalars != NumElts && !IsWholeSubvector) {
17167 if (InsertVecSz != VecSz) {
17172 for (
unsigned I = 0, End = OffsetBeg -
Offset;
I < End; ++
I)
17174 for (
unsigned I = OffsetBeg -
Offset, End = OffsetEnd -
Offset;
17178 for (
unsigned I = OffsetEnd + 1 -
Offset;
I < VecSz; ++
I)
17185 return Cost + SpillsReloads;
17187 case Instruction::ZExt:
17188 case Instruction::SExt:
17189 case Instruction::FPToUI:
17190 case Instruction::FPToSI:
17191 case Instruction::FPExt:
17192 case Instruction::PtrToInt:
17193 case Instruction::IntToPtr:
17194 case Instruction::SIToFP:
17195 case Instruction::UIToFP:
17196 case Instruction::Trunc:
17197 case Instruction::FPTrunc:
17198 case Instruction::BitCast: {
17199 auto SrcIt = MinBWs.find(getOperandEntry(
E, 0));
17202 unsigned Opcode = ShuffleOrOp;
17203 unsigned VecOpcode = Opcode;
17205 (SrcIt != MinBWs.end() || It != MinBWs.end())) {
17207 unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy->
getScalarType());
17208 if (SrcIt != MinBWs.end()) {
17209 SrcBWSz = SrcIt->second.first;
17215 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->
getScalarType());
17216 if (BWSz == SrcBWSz) {
17217 VecOpcode = Instruction::BitCast;
17218 }
else if (BWSz < SrcBWSz) {
17219 VecOpcode = Instruction::Trunc;
17220 }
else if (It != MinBWs.end()) {
17221 assert(BWSz > SrcBWSz &&
"Invalid cast!");
17222 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
17223 }
else if (SrcIt != MinBWs.end()) {
17224 assert(BWSz > SrcBWSz &&
"Invalid cast!");
17226 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
17228 }
else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
17229 !SrcIt->second.second) {
17230 VecOpcode = Instruction::UIToFP;
17233 assert(Idx == 0 &&
"Expected 0 index only");
17234 return TTI->getCastInstrCost(Opcode, VL0->
getType(),
17241 if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
17243 auto *
VI = VL0->
getOpcode() == Opcode ? VL0 :
nullptr;
17246 bool IsArithmeticExtendedReduction =
17247 E->Idx == 0 && UserIgnoreList &&
17250 return is_contained({Instruction::Add, Instruction::FAdd,
17251 Instruction::Mul, Instruction::FMul,
17252 Instruction::And, Instruction::Or,
17256 if (IsArithmeticExtendedReduction &&
17257 (VecOpcode == Instruction::ZExt || VecOpcode == Instruction::SExt))
17259 return CommonCost +
17260 TTI->getCastInstrCost(VecOpcode, VecTy, SrcVecTy, CCH,
CostKind,
17261 VecOpcode == Opcode ? VI :
nullptr);
17263 return GetCostDiff(GetScalarCost, GetVectorCost);
17265 case Instruction::FCmp:
17266 case Instruction::ICmp:
17273 case Instruction::Select: {
17274 CmpPredicate VecPred, SwappedVecPred;
17277 match(VL0, MatchCmp))
17283 auto GetScalarCost = [&](
unsigned Idx) {
17298 ShuffleOrOp == Instruction::Select &&
17313 "Expected same type for LHS/RHS");
17316 ScalarCost = TTI->getArithmeticInstrCost(
17318 getOperandInfo(
VI->getOperand(0)), getOperandInfo(
RHS));
17321 ScalarCost = TTI->getArithmeticInstrCost(
17323 getOperandInfo(
VI->getOperand(0)), getOperandInfo(
LHS));
17329 ScalarCost = TTI->getCmpSelInstrCost(
17330 E->getOpcode(), OrigScalarTy,
17335 VI->getOperand(ShuffleOrOp == Instruction::Select ? 1 : 0)),
17337 VI->getOperand(ShuffleOrOp == Instruction::Select ? 2 : 1)),
17357 if (ShuffleOrOp == Instruction::Select) {
17367 VecCost = TTI->getArithmeticInstrCost(
17368 Instruction::Or, VecTy,
CostKind, getOperandInfo(
Cond),
17369 getOperandInfo(
RHS));
17374 VecCost = TTI->getArithmeticInstrCost(
17375 Instruction::And, VecTy,
CostKind, getOperandInfo(
Cond),
17376 getOperandInfo(
LHS));
17381 VecCost = TTI->getCmpSelInstrCost(
17382 E->getOpcode(), VecTy, MaskTy, VecPred,
CostKind,
17384 E->getOperand(ShuffleOrOp == Instruction::Select ? 1 : 0)),
17386 E->getOperand(ShuffleOrOp == Instruction::Select ? 2 : 1)),
17391 assert(VecTyNumElements >= CondNumElements &&
17392 VecTyNumElements % CondNumElements == 0 &&
17393 "Cannot vectorize Instruction::Select");
17394 if (CondNumElements != VecTyNumElements) {
17404 return VecCost + CommonCost;
17406 return GetCostDiff(GetScalarCost, GetVectorCost);
17408 case TreeEntry::MinMax: {
17409 auto GetScalarCost = [&](
unsigned Idx) {
17410 return GetMinMaxCost(OrigScalarTy);
17414 return VecCost + CommonCost;
17416 return GetCostDiff(GetScalarCost, GetVectorCost);
17418 case TreeEntry::FMulAdd: {
17419 auto GetScalarCost = [&](
unsigned Idx) {
17422 return GetFMulAddCost(
E->getOperations(),
17428 for (
Value *V :
E->Scalars) {
17430 FMF &= FPCI->getFastMathFlags();
17432 FMF &= FPCIOp->getFastMathFlags();
17435 IntrinsicCostAttributes ICA(Intrinsic::fmuladd, VecTy,
17436 {VecTy, VecTy, VecTy}, FMF);
17438 return VecCost + CommonCost;
17440 return GetCostDiff(GetScalarCost, GetVectorCost);
17442 case TreeEntry::ReducedBitcast:
17443 case TreeEntry::ReducedBitcastBSwap: {
17444 auto GetScalarCost = [&, &TTI = *TTI](
unsigned Idx) {
17454 ScalarCost += TTI.getInstructionCost(ZExt,
CostKind);
17458 const TreeEntry *LhsTE = getOperandEntry(
E, 0);
17460 getCastContextHint(*getOperandEntry(LhsTE, 0));
17462 auto *SrcVecTy =
getWidenedType(SrcScalarTy, LhsTE->getVectorFactor());
17464 Instruction::BitCast, ScalarTy, SrcVecTy, CastCtx,
CostKind);
17465 if (ShuffleOrOp == TreeEntry::ReducedBitcastBSwap) {
17466 auto *SrcType = IntegerType::getIntNTy(
17468 DL->getTypeSizeInBits(SrcScalarTy) * EntryVF);
17469 IntrinsicCostAttributes CostAttrs(Intrinsic::bswap, SrcType, {SrcType});
17471 TTI.getIntrinsicInstrCost(CostAttrs,
CostKind);
17473 if (SrcType != ScalarTy) {
17475 TTI.getCastInstrCost(Instruction::ZExt, ScalarTy, SrcType,
17479 return BitcastCost + CommonCost;
17481 return GetCostDiff(GetScalarCost, GetVectorCost);
17483 case TreeEntry::ReducedBitcastLoads:
17484 case TreeEntry::ReducedBitcastBSwapLoads: {
17485 auto GetScalarCost = [&, &TTI = *TTI](
unsigned Idx) {
17495 ScalarCost += TTI.getInstructionCost(ZExt,
CostKind);
17499 ScalarCost += TTI.getInstructionCost(Load,
CostKind);
17503 const TreeEntry *LhsTE = getOperandEntry(
E, 0);
17504 const TreeEntry *LoadTE = getOperandEntry(LhsTE, 0);
17506 auto *SrcType = IntegerType::getIntNTy(
17508 DL->getTypeSizeInBits(LI0->getType()) * EntryVF);
17510 TTI.getMemoryOpCost(Instruction::Load, SrcType, LI0->getAlign(),
17511 LI0->getPointerAddressSpace(),
CostKind);
17512 if (ShuffleOrOp == TreeEntry::ReducedBitcastBSwapLoads) {
17513 IntrinsicCostAttributes CostAttrs(Intrinsic::bswap, SrcType, {SrcType});
17515 TTI.getIntrinsicInstrCost(CostAttrs,
CostKind);
17517 if (SrcType != ScalarTy) {
17519 TTI.getCastInstrCost(Instruction::ZExt, ScalarTy, SrcType,
17523 return LoadCost + CommonCost;
17525 return GetCostDiff(GetScalarCost, GetVectorCost);
17527 case TreeEntry::ReducedCmpBitcast: {
17528 auto GetScalarCost = [&, &TTI = *TTI](
unsigned Idx) {
17540 IntegerType::getIntNTy(ScalarTy->
getContext(),
E->getVectorFactor());
17542 TTI.getCastInstrCost(Instruction::BitCast, DstTy, CmpTy,
17544 if (DstTy != ScalarTy) {
17546 TTI.getCastInstrCost(Instruction::ZExt, ScalarTy, DstTy,
17549 return BitcastCost + CommonCost;
17551 return GetCostDiff(GetScalarCost, GetVectorCost);
17553 case Instruction::FNeg:
17554 case Instruction::Add:
17555 case Instruction::FAdd:
17556 case Instruction::Sub:
17557 case Instruction::FSub:
17558 case Instruction::Mul:
17559 case Instruction::FMul:
17560 case Instruction::UDiv:
17561 case Instruction::SDiv:
17562 case Instruction::FDiv:
17563 case Instruction::URem:
17564 case Instruction::SRem:
17565 case Instruction::FRem:
17566 case Instruction::Shl:
17567 case Instruction::LShr:
17568 case Instruction::AShr:
17569 case Instruction::And:
17570 case Instruction::Or:
17571 case Instruction::Xor: {
17572 auto GetScalarCost = [&](
unsigned Idx) {
17579 unsigned Lane = UniqueIndexes[Idx];
17580 Value *Op1 =
E->getOperand(0)[Lane];
17582 SmallVector<const Value *, 2> Operands(1, Op1);
17586 Op2 =
E->getOperand(1)[Lane];
17592 ShuffleOrOp, OrigScalarTy,
CostKind, Op1Info, Op2Info, Operands);
17594 I && (ShuffleOrOp == Instruction::FAdd ||
17595 ShuffleOrOp == Instruction::FSub)) {
17603 if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
17608 return CI && CI->getValue().countr_one() >= It->second.first;
17616 return TTI->getArithmeticInstrCost(ShuffleOrOp, VecTy,
CostKind, Op1Info,
17617 Op2Info, {},
nullptr, TLI) +
17620 return GetCostDiff(GetScalarCost, GetVectorCost);
17622 case Instruction::GetElementPtr: {
17623 return CommonCost + GetGEPCostDiff(VL, VL0);
17625 case Instruction::Load: {
17626 auto GetScalarCost = [&](
unsigned Idx) {
17628 return TTI->getMemoryOpCost(Instruction::Load, OrigScalarTy,
17629 VI->getAlign(),
VI->getPointerAddressSpace(),
17635 switch (
E->State) {
17636 case TreeEntry::Vectorize:
17637 if (
unsigned Factor =
E->getInterleaveFactor()) {
17638 VecLdCost = TTI->getInterleavedMemoryOpCost(
17639 Instruction::Load, VecTy, Factor, {}, LI0->getAlign(),
17640 LI0->getPointerAddressSpace(),
CostKind);
17643 VecLdCost = TTI->getMemoryOpCost(
17644 Instruction::Load, VecTy, LI0->getAlign(),
17648 case TreeEntry::StridedVectorize: {
17650 FixedVectorType *StridedLoadTy = SPtrInfo.
Ty;
17651 assert(StridedLoadTy &&
"Missing StridedPointerInfo for tree entry.");
17652 Align CommonAlignment =
17654 VecLdCost = TTI->getMemIntrinsicInstrCost(
17655 MemIntrinsicCostAttributes(Intrinsic::experimental_vp_strided_load,
17656 StridedLoadTy, LI0->getPointerOperand(),
17657 false, CommonAlignment),
17659 if (StridedLoadTy != VecTy)
17661 TTI->getCastInstrCost(Instruction::BitCast, VecTy, StridedLoadTy,
17666 case TreeEntry::CompressVectorize: {
17668 unsigned InterleaveFactor;
17669 SmallVector<int> CompressMask;
17672 if (!
E->ReorderIndices.empty()) {
17673 SmallVector<int>
Mask(
E->ReorderIndices.begin(),
17674 E->ReorderIndices.end());
17681 Scalars, PointerOps,
E->ReorderIndices, *TTI, *DL, *SE, *AC, *DT,
17682 *TLI, [](
Value *) { return true; }, IsMasked, InterleaveFactor,
17683 CompressMask, LoadVecTy);
17684 CompressEntryToData.try_emplace(
E, CompressMask, LoadVecTy,
17685 InterleaveFactor, IsMasked);
17686 Align CommonAlignment = LI0->getAlign();
17687 if (InterleaveFactor) {
17688 VecLdCost = TTI->getInterleavedMemoryOpCost(
17689 Instruction::Load, LoadVecTy, InterleaveFactor, {},
17690 CommonAlignment, LI0->getPointerAddressSpace(),
CostKind);
17691 }
else if (IsMasked) {
17692 VecLdCost = TTI->getMemIntrinsicInstrCost(
17693 MemIntrinsicCostAttributes(Intrinsic::masked_load, LoadVecTy,
17695 LI0->getPointerAddressSpace()),
17699 LoadVecTy, CompressMask,
CostKind);
17701 VecLdCost = TTI->getMemoryOpCost(
17702 Instruction::Load, LoadVecTy, CommonAlignment,
17706 LoadVecTy, CompressMask,
CostKind);
17710 case TreeEntry::ScatterVectorize: {
17711 Align CommonAlignment =
17713 VecLdCost = TTI->getMemIntrinsicInstrCost(
17714 MemIntrinsicCostAttributes(Intrinsic::masked_gather, VecTy,
17715 LI0->getPointerOperand(),
17716 false, CommonAlignment),
17720 case TreeEntry::CombinedVectorize:
17721 case TreeEntry::SplitVectorize:
17722 case TreeEntry::NeedToGather:
17725 return VecLdCost + CommonCost;
17731 if (
E->State == TreeEntry::ScatterVectorize)
17738 return Cost + GetGEPCostDiff(PointerOps, LI0->getPointerOperand());
17740 case Instruction::Store: {
17741 bool IsReorder = !
E->ReorderIndices.empty();
17742 auto GetScalarCost = [=](
unsigned Idx) {
17745 return TTI->getMemoryOpCost(Instruction::Store, OrigScalarTy,
17746 VI->getAlign(),
VI->getPointerAddressSpace(),
17754 if (
E->State == TreeEntry::StridedVectorize) {
17756 FixedVectorType *StridedStoreTy = SPtrInfo.
Ty;
17757 assert(StridedStoreTy &&
"Missing StridedPointerInfo for tree entry.");
17758 Align CommonAlignment =
17760 VecStCost = TTI->getMemIntrinsicInstrCost(
17761 MemIntrinsicCostAttributes(Intrinsic::experimental_vp_strided_store,
17763 BaseSI->getPointerOperand(),
17764 false, CommonAlignment),
17766 if (StridedStoreTy != VecTy)
17768 TTI->getCastInstrCost(Instruction::BitCast, VecTy, StridedStoreTy,
17772 assert(
E->State == TreeEntry::Vectorize &&
17773 "Expected either strided or consecutive stores.");
17774 if (
unsigned Factor =
E->getInterleaveFactor()) {
17775 assert(
E->ReuseShuffleIndices.empty() && !
E->ReorderIndices.empty() &&
17776 "No reused shuffles expected");
17778 VecStCost = TTI->getInterleavedMemoryOpCost(
17779 Instruction::Store, VecTy, Factor, {}, BaseSI->getAlign(),
17780 BaseSI->getPointerAddressSpace(),
CostKind);
17783 VecStCost = TTI->getMemoryOpCost(
17784 Instruction::Store, VecTy, BaseSI->getAlign(),
17785 BaseSI->getPointerAddressSpace(),
CostKind, OpInfo);
17788 return VecStCost + CommonCost;
17792 unsigned Idx = IsReorder ?
E->ReorderIndices[
I] :
I;
17796 return GetCostDiff(GetScalarCost, GetVectorCost) +
17797 GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
17799 case Instruction::Call: {
17800 auto GetScalarCost = [&](
unsigned Idx) {
17804 IntrinsicCostAttributes CostAttrs(
ID, *CI, 1);
17805 return TTI->getIntrinsicInstrCost(CostAttrs,
CostKind);
17816 It != MinBWs.end() ? It->second.first : 0, TTI);
17818 return std::min(VecCallCosts.first, VecCallCosts.second) + CommonCost;
17820 return GetCostDiff(GetScalarCost, GetVectorCost);
17822 case Instruction::ShuffleVector: {
17830 "Invalid Shuffle Vector Operand");
17833 auto TryFindNodeWithEqualOperands = [=]() {
17834 for (
const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
17837 if (
TE->hasState() &&
TE->isAltShuffle() &&
17838 ((
TE->getOpcode() ==
E->getOpcode() &&
17839 TE->getAltOpcode() ==
E->getAltOpcode()) ||
17840 (
TE->getOpcode() ==
E->getAltOpcode() &&
17841 TE->getAltOpcode() ==
E->getOpcode())) &&
17842 TE->hasEqualOperands(*
E))
17847 auto GetScalarCost = [&](
unsigned Idx) {
17852 assert(
E->getMatchingMainOpOrAltOp(VI) &&
17853 "Unexpected main/alternate opcode");
17855 return TTI->getInstructionCost(VI,
CostKind);
17863 if (TryFindNodeWithEqualOperands()) {
17865 dbgs() <<
"SLP: diamond match for alternate node found.\n";
17872 TTIRef.getArithmeticInstrCost(
E->getOpcode(), VecTy,
CostKind);
17874 TTIRef.getArithmeticInstrCost(
E->getAltOpcode(), VecTy,
CostKind);
17877 VecCost = TTIRef.getCmpSelInstrCost(
17878 E->getOpcode(), VecTy, MaskTy, CI0->getPredicate(),
CostKind,
17879 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
17881 VecCost += TTIRef.getCmpSelInstrCost(
17882 E->getOpcode(), VecTy, MaskTy,
17884 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
17887 Type *SrcSclTy =
E->getMainOp()->getOperand(0)->getType();
17890 auto SrcIt = MinBWs.find(getOperandEntry(
E, 0));
17891 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
17893 DL->getTypeSizeInBits(
E->getMainOp()->getOperand(0)->getType());
17894 if (SrcIt != MinBWs.end()) {
17895 SrcBWSz = SrcIt->second.first;
17899 if (BWSz <= SrcBWSz) {
17900 if (BWSz < SrcBWSz)
17902 TTIRef.getCastInstrCost(Instruction::Trunc, VecTy, SrcTy,
17906 <<
"SLP: alternate extension, which should be truncated.\n";
17912 VecCost = TTIRef.getCastInstrCost(
E->getOpcode(), VecTy, SrcTy,
17915 TTIRef.getCastInstrCost(
E->getAltOpcode(), VecTy, SrcTy,
17918 SmallVector<int>
Mask;
17919 E->buildAltOpShuffleMask(
17920 [&](Instruction *
I) {
17921 assert(
E->getMatchingMainOpOrAltOp(
I) &&
17922 "Unexpected main/alternate opcode");
17933 unsigned Opcode0 =
E->getOpcode();
17934 unsigned Opcode1 =
E->getAltOpcode();
17935 SmallBitVector OpcodeMask(
17943 return AltVecCost < VecCost ? AltVecCost : VecCost;
17949 return GetCostDiff(
17954 "Not supported shufflevector usage.");
17956 unsigned SVNumElements =
17958 ->getNumElements();
17959 unsigned GroupSize = SVNumElements / SV->getShuffleMask().size();
17960 for (
size_t I = 0, End = VL.
size();
I != End;
I += GroupSize) {
17965 "Not supported shufflevector usage.");
17968 [[maybe_unused]]
bool IsExtractSubvectorMask =
17969 SV->isExtractSubvectorMask(Index);
17970 assert(IsExtractSubvectorMask &&
17971 "Not supported shufflevector usage.");
17972 if (NextIndex != Index)
17974 NextIndex += SV->getShuffleMask().size();
17977 return ::getShuffleCost(
17984 return GetCostDiff(GetScalarCost, GetVectorCost);
17986 case Instruction::Freeze:
/// Checks whether a very small vectorizable tree (the visible conditions look
/// at trees of exactly one or exactly two nodes) counts as fully vectorizable.
/// \p ForReduction is the only parameter; its use is not visible here.
/// NOTE(review): this listing elides several original lines (see the gaps in
/// the embedded line numbers), including the statements executed when each
/// condition holds.
17993bool BoUpSLP::isFullyVectorizableTinyTree(
bool ForReduction)
const {
17995 << VectorizableTree.size() <<
" is fully vectorizable .\n");
// Predicate over a node TE and a scalar-count Limit. It requires TE to be a
// gather and combines checks on EphValues membership, on
// TE->Scalars.size() < Limit, and on ExtractElement / non-alternate Load
// opcodes. The calls that combine these pieces are partly elided.
17997 auto &&AreVectorizableGathers = [
this](
const TreeEntry *
TE,
unsigned Limit) {
17998 SmallVector<int>
Mask;
17999 return TE->isGather() &&
18001 [
this](
Value *V) { return EphValues.contains(V); }) &&
18003 TE->Scalars.size() < Limit ||
18004 (((
TE->hasState() &&
18005 TE->getOpcode() == Instruction::ExtractElement) ||
18008 (
TE->hasState() &&
TE->getOpcode() == Instruction::Load &&
18009 !
TE->isAltShuffle()) ||
// Single-node tree: tests the node state (Vectorize, StridedVectorize,
// CompressVectorize) or the gather predicate together with a vector factor
// greater than 2.
18014 if (VectorizableTree.size() == 1 &&
18015 (VectorizableTree[0]->State == TreeEntry::Vectorize ||
18016 VectorizableTree[0]->State == TreeEntry::StridedVectorize ||
18017 VectorizableTree[0]->State == TreeEntry::CompressVectorize ||
18019 AreVectorizableGathers(VectorizableTree[0].
get(),
18020 VectorizableTree[0]->Scalars.size()) &&
18021 VectorizableTree[0]->getVectorFactor() > 2)))
// Trees that do not have exactly two nodes are singled out here (the branch
// body is elided).
18024 if (VectorizableTree.size() != 2)
// Two-node tree: node 0 is in the Vectorize state and node 1 passes the gather
// predicate, using node 0's scalar count as the limit.
18031 if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
18032 AreVectorizableGathers(VectorizableTree[1].
get(),
18033 VectorizableTree[0]->Scalars.size()))
// Node 0 is a gather, or node 1 is a gather while node 0 is in none of the
// ScatterVectorize / StridedVectorize / CompressVectorize states.
18037 if (VectorizableTree[0]->
isGather() ||
18038 (VectorizableTree[1]->
isGather() &&
18039 VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
18040 VectorizableTree[0]->State != TreeEntry::StridedVectorize &&
18041 VectorizableTree[0]->State != TreeEntry::CompressVectorize))
// Early checks of a tree-shape heuristic.
// NOTE(review): the enclosing function header and many statements are elided
// from this listing; ForReduction is presumably a parameter of that function.
// An empty tree is expected to have no recorded external uses.
18052 if (VectorizableTree.empty()) {
18053 assert(ExternalUses.empty() &&
"We shouldn't have any external users");
// Looks for a gather node matching a further (elided) condition; the debug
// message indicates trees with buildvector struct values are rejected.
18059 if (
any_of(VectorizableTree, [&](
const std::unique_ptr<TreeEntry> &TE) {
18060 return TE->isGather() &&
18064 dbgs() <<
"SLP: rejecting tree with buildvector struct values of size "
18065 << VectorizableTree.size() <<
".\n");
// Cache properties of the tree and of its first node that the later checks
// query repeatedly. FrontOpcode is 0 when the first node has no state.
18071 const unsigned TreeSize = VectorizableTree.size();
18072 const TreeEntry &Front = *VectorizableTree.front();
18073 const bool FrontIsGather = Front.isGather();
18074 const bool FrontHasState = Front.hasState();
18075 const unsigned FrontOpcode = FrontHasState ? Front.getOpcode() : 0u;
// Size thresholds used by the checks in this function.
18079 constexpr unsigned Limit = 4;
18080 constexpr unsigned LargeTree = 20;
18081 constexpr unsigned LimitTreeSize = 36;
18086 if (!ForReduction) {
// Single gather node, with a special case for ExtractElement.
18089 if (TreeSize == 1 && FrontIsGather) {
18090 if (FrontHasState && FrontOpcode == Instruction::ExtractElement)
// Every node is either a gather or a SplitVectorize node.
18096 all_of(VectorizableTree, [](
const std::unique_ptr<TreeEntry> &TE) {
18097 return TE->isGather() || TE->State == TreeEntry::SplitVectorize;
18101 FrontOpcode == Instruction::ExtractElement &&
18102 (Front.getVectorFactor() == 2 ||
// True for non-instructions and for instructions whose users are not all
// vectorized (with respect to UserIgnoreList).
18106 auto *I = dyn_cast<Instruction>(V);
18107 return !I || !areAllUsersVectorized(I, UserIgnoreList);
18113 VectorizableTree[1]->isGather() &&
18114 (VectorizableTree[1]->getVectorFactor() <= 2 ||
18115 !(
isSplat(VectorizableTree[1]->Scalars) ||
18122 (!ForReduction || Front.getVectorFactor() <= 2) &&
18124 [&](
const std::unique_ptr<TreeEntry> &TE) {
18125 return TE->isGather() && TE->getVectorFactor() <= Limit &&
// Heuristic checks applied only when ForReduction is false.
// NOTE(review): ThresholdSet and ThresholdNonNegative are defined on lines
// elided from this listing; confirm their meaning there. The action taken when
// each condition holds is also elided.
18135 if (!ForReduction) {
// Without a set threshold: every node must satisfy the lambda below, which
// distinguishes gathers that are not ExtractElement from stateful PHI nodes.
18140 if (!ThresholdSet &&
18141 all_of(VectorizableTree, [&](
const std::unique_ptr<TreeEntry> &TE) {
18142 const bool IsGather = TE->isGather();
18143 const bool HasState = TE->hasState();
18144 const unsigned Op = HasState ? TE->getOpcode() : 0u;
18145 if (IsGather && (!HasState ||
Op != Instruction::ExtractElement) &&
18148 return HasState &&
Op == Instruction::PHI;
// Trees of at most Limit nodes: Compatible classifies each node and records in
// HasVectorPhi whether a PHI node in the Vectorize state was seen.
18154 if (ThresholdSet && TreeSize <= Limit) {
18155 bool HasVectorPhi =
false;
18156 auto Compatible = [&](
const std::unique_ptr<TreeEntry> &TE) {
18157 const bool IsGather = TE->isGather();
18158 const bool HasState = TE->hasState();
18159 const unsigned Op = HasState ? TE->getOpcode() : 0u;
18160 if (IsGather && (!HasState ||
Op != Instruction::ExtractElement) &&
18165 if (
Op == Instruction::InsertElement)
18167 if (
Op != Instruction::PHI)
18169 if (TE->State == TreeEntry::Vectorize)
18170 HasVectorPhi =
true;
18172 return isa<PoisonValue>(V) || MustGather.contains(V);
18175 if (
all_of(VectorizableTree, Compatible) && HasVectorPhi)
// HasSingleLoad is snapshotted into PrevLoad before the expression below is
// evaluated (the assignment target of that expression is on an elided line).
18180 if (ThresholdNonNegative) {
18181 const bool IsLargeTree = TreeSize >= LargeTree;
18182 bool HasSingleLoad =
false;
18183 if (
all_of(VectorizableTree, [&](
const std::unique_ptr<TreeEntry> &TE) {
18184 const bool IsGather = TE->isGather();
18185 const bool HasState = TE->hasState();
18186 const unsigned Op = HasState ? TE->getOpcode() : 0u;
18190 const bool PrevLoad = HasSingleLoad;
18192 HasState && !IsGather &&
18193 (
Op == Instruction::Load || TE->hasCopyableElements()) &&
18194 (TE->getVectorFactor() > 2 || TE->ReorderIndices.empty());
18196 if (
Op == Instruction::PHI)
// Narrow nodes (vector factor <= Limit) that are stores, or loads when no
// earlier node set HasSingleLoad.
18198 if (TE->getVectorFactor() <= Limit &&
18199 (
Op == Instruction::Store ||
18200 (
Op == Instruction::Load && !PrevLoad)))
18203 }
else if (HasState &&
Op == Instruction::PHI) {
18206 return IsGather && (!HasState ||
Op != Instruction::ExtractElement);
// Trees of at least 5 nodes whose first node has vector factor <= 2 and
// integer-typed scalars.
18211 if (TreeSize >= 5 && Front.getVectorFactor() <= 2 &&
18212 Front.Scalars.front()->getType()->isIntegerTy()) {
18213 bool VectorNodeFound =
false;
18214 bool AnyNonConst =
false;
18215 if (
all_of(VectorizableTree,
18216 [&](
const std::unique_ptr<TreeEntry> &TE) {
18217 if (TE->State == TreeEntry::Vectorize && TE->hasState()) {
18218 const unsigned Op = TE->getOpcode();
18219 if (
Op == Instruction::PHI ||
18220 !TE->ReorderIndices.empty())
// VectorNodeFound is tested before being set, so a second qualifying node
// takes the (elided) branch under the test.
18222 if (VectorNodeFound)
18224 VectorNodeFound =
true;
18231 return TE->isGather() ||
18232 TE->State == TreeEntry::SplitVectorize;
// Classifies a single tree node. The visible tests look at SplitVectorize
// state, PHI opcode, two-scalar ICmp root nodes in trees larger than
// LimitTreeSize, two-scalar nodes with reuse/reorder indices or alternate
// shuffles, and nodes with copyable elements.
// NOTE(review): the result produced by each test is on lines elided from this
// listing.
18242 auto IsBenignNode = [&](
const TreeEntry &TE) {
18243 if (TE.State == TreeEntry::SplitVectorize)
18245 const bool IsGather = TE.isGather();
18246 const bool HasState = TE.hasState();
// NOTE(review): getOpcode() is called unconditionally here; presumably an
// elided line above has already handled nodes without state.
18248 const unsigned Op = TE.getOpcode();
18249 if (
Op == Instruction::PHI)
18251 const unsigned ScalarsSize = TE.Scalars.size();
// Root node (Idx == 0) with two ICmp scalars in a tree above LimitTreeSize.
18252 if (TE.Idx == 0 && ScalarsSize == 2 &&
Op == Instruction::ICmp &&
18253 TreeSize > LimitTreeSize)
18255 if (ScalarsSize == 2 &&
18256 (!TE.ReuseShuffleIndices.empty() || !TE.ReorderIndices.empty() ||
18257 TE.isAltShuffle()))
18259 if (TE.hasCopyableElements() &&
// Further tree-shape heuristics built on IsBenignNode.
// NOTE(review): ThresholdSet, ThresholdNonNegative and StoreLoadNodes are
// declared on lines elided from this listing, as are the actions taken when a
// condition holds.
18269 if (!ThresholdSet) {
18271 unsigned NumGathers = 0;
// Load and Store nodes that are not gathers get dedicated handling (elided);
// every other node is classified by IsBenignNode.
18272 if (
all_of(VectorizableTree,
18273 [&](
const std::unique_ptr<TreeEntry> &TE) {
18274 const bool IsGather = TE->isGather();
18275 if (!IsGather && TE->hasState()) {
18276 const unsigned Op = TE->getOpcode();
18277 if (
Op == Instruction::Load ||
Op == Instruction::Store) {
18284 return IsBenignNode(*TE);
// Either no store/load nodes were collected, or the tree has more than
// LimitTreeSize nodes per collected store/load node and none of them matches
// the lambda below.
18286 (StoreLoadNodes.
empty() ||
18287 (TreeSize > LimitTreeSize * StoreLoadNodes.
size() &&
18289 none_of(StoreLoadNodes, [&](
const TreeEntry *TE) {
18290 return TE->getOpcode() == Instruction::Store ||
18292 return !isa<LoadInst>(V) ||
18293 areAllUsersVectorized(cast<Instruction>(V));
// Large trees: VectorNode is assigned for a node that is neither a gather, nor
// SplitVectorize, nor a PHI; the remaining nodes go through IsBenignNode.
18301 if (ThresholdNonNegative && TreeSize > LimitTreeSize) {
18302 const TreeEntry *VectorNode =
nullptr;
18303 if (
all_of(VectorizableTree, [&](
const std::unique_ptr<TreeEntry> &TE) {
18304 if (!TE->isGather() && TE->hasState() &&
18305 TE->State != TreeEntry::SplitVectorize &&
18306 TE->getOpcode() != Instruction::PHI) {
18309 VectorNode = TE.get();
18312 return IsBenignNode(*TE);
// SplitVectorize root: the lambda selects non-gather nodes whose user is the
// root node (user Idx == 0).
18319 if (ThresholdSet && TreeSize >= Limit &&
18320 Front.State == TreeEntry::SplitVectorize &&
18322 [](
const std::unique_ptr<TreeEntry> &TE) {
18323 return !TE->isGather() && TE->UserTreeIndex.UserTE &&
18324 TE->UserTreeIndex.UserTE->Idx == 0;
// Vectorized InsertElement root whose node 1 is a vectorized PHI; the lambda
// selects gather nodes.
18330 if (ThresholdSet && TreeSize > 2 && Front.State == TreeEntry::Vectorize &&
18331 FrontOpcode == Instruction::InsertElement &&
18332 VectorizableTree[1]->State == TreeEntry::Vectorize &&
18333 VectorizableTree[1]->getOpcode() == Instruction::PHI &&
18335 [](
const std::unique_ptr<TreeEntry> &TE) {
18336 return TE->isGather();
// Final checks of the tree-shape heuristic.
// NOTE(review): many statements, including the outcome of each check, are
// elided from this listing.
18348 if (isFullyVectorizableTinyTree(ForReduction))
// True for multi-node trees, or for a single stateful root that is not an
// alternate shuffle, PHI or GetElementPtr (and satisfies a further elided
// condition).
18354 const bool IsAllowedSingleBVNode =
18355 TreeSize > 1 || (FrontHasState && !Front.isAltShuffle() &&
18356 FrontOpcode != Instruction::PHI &&
18357 FrontOpcode != Instruction::GetElementPtr &&
// Looks for a gather node whose scalars are all either extractelements /
// constants, or (when allowed) values with fewer than UsesLimit uses that feed
// an insertelement.
18359 if (
any_of(VectorizableTree, [&](
const std::unique_ptr<TreeEntry> &TE) {
18360 return TE->isGather() &&
all_of(TE->Scalars, [&](
Value *V) {
18361 return isa<ExtractElementInst, Constant>(V) ||
18362 (IsAllowedSingleBVNode &&
18363 !V->hasNUsesOrMore(UsesLimit) &&
18364 any_of(V->users(), IsaPred<InsertElementInst>));
// Last node is a gather with an alternate-shuffle state.
18369 const TreeEntry &Back = *VectorizableTree.back();
18370 if (Back.isGather() && Back.hasState() && Back.isAltShuffle()) {
18371 const unsigned BackVF = Back.getVectorFactor();
18373 !Back.Scalars.front()->getType()->isVectorTy() &&
18374 TTI->getScalarizationOverhead(
18390 constexpr unsigned SmallTree = 3;
// Non-power-of-2 root combined with a count/search over gathered Load nodes
// (the enclosing call is elided).
18391 if (VectorizableTree.front()->isNonPowOf2Vec() &&
18394 [](
const std::unique_ptr<TreeEntry> &TE) {
18395 return TE->isGather() && TE->hasState() &&
18396 TE->getOpcode() == Instruction::Load &&
// NOTE(review): the embedded line numbers jump here and Idx is declared on an
// elided line; these lines may belong to a different definition.
18404 TreeEntry &E = *VectorizableTree[Idx];
18405 if (E.State == TreeEntry::SplitVectorize)
18409 if ((E.hasState() && E.getOpcode() != Instruction::Load) ||
// Per-entry bookkeeping over the whole tree.
// NOTE(review): the enclosing function header and the declarations of the
// containers filled here are elided from this listing.
18431 const TreeEntry *Root = VectorizableTree.front().get();
// A gather root is special-cased (the branch body is elided).
18432 if (Root->isGather())
18441 for (
const auto &TEPtr : VectorizableTree) {
// Entries whose CombinedOp is one of the Reduced* kinds are recorded in
// ScalarOrPseudoEntries.
18442 if (TEPtr->CombinedOp == TreeEntry::ReducedBitcast ||
18443 TEPtr->CombinedOp == TreeEntry::ReducedBitcastBSwap ||
18444 TEPtr->CombinedOp == TreeEntry::ReducedBitcastLoads ||
18445 TEPtr->CombinedOp == TreeEntry::ReducedBitcastBSwapLoads ||
18446 TEPtr->CombinedOp == TreeEntry::ReducedCmpBitcast) {
18447 ScalarOrPseudoEntries.
insert(TEPtr.get());
// For non-gather entries, remember the last instruction of the bundle both per
// entry and in the set of all last instructions.
18450 if (!TEPtr->isGather()) {
18451 Instruction *LastInst = &getLastInstructionInBundle(TEPtr.get());
18452 EntriesToLastInstruction.
try_emplace(TEPtr.get(), LastInst);
18453 LastInstructions.
insert(LastInst);
// Build the user-entry -> operand-entries map.
18455 if (TEPtr->UserTreeIndex)
18456 EntriesToOperands[TEPtr->UserTreeIndex.UserTE].push_back(TEPtr.get());
// Predicate over a call-like instruction CB. The visible checks cover calls
// that do not return, assume-like intrinsics, and a cached comparison of
// intrinsic cost against call cost keyed by the intrinsic (II).
// NOTE(review): the parameter list and the value returned by each branch are
// elided from this listing.
18463 auto NoCallIntrinsicOrDoesNotReturn = [
this, &NoCallIntrinsicCache](
18468 if (CB->doesNotReturn())
18473 if (
II->isAssumeLikeIntrinsic())
// try_emplace doubles as the cache lookup: Inserted tells whether II is new.
18475 auto [It, Inserted] = NoCallIntrinsicCache.
try_emplace(
II);
18483 bool Res = IntrCost < CallCost;
// Scan between two instructions (First / Last) of the same block, with a cache
// (CheckedInstructions) and a work budget (Budget / BudgetLimit).
// NOTE(review): the header of the helper that owns this body is elided; other
// visible code calls CheckForNonVecCallsInSameBlock with two instructions,
// which presumably is this helper.
18492 CheckedInstructions;
18493 unsigned Budget = 0;
18494 const unsigned BudgetLimit =
18499 "Expected instructions in same block.");
// Cache lookup keyed by Last: each cached entry carries a pointer and an
// integer flag. If the cached pointer equals First, the cached flag is
// returned directly.
18500 if (
auto It = CheckedInstructions.
find(
Last);
18501 It != CheckedInstructions.
end()) {
18502 const Instruction *Checked = It->second.getPointer();
18503 const bool NoCallsInCachedRange = It->second.getInt() != 0;
18504 if (Checked ==
First)
18505 return NoCallsInCachedRange;
// The scan uses reverse iterators derived from First and Last.
18519 ++
First->getIterator().getReverse(),
18521 Last->getIterator().getReverse();
// Runs until the iterators meet or the budget is exceeded.
18523 while (InstIt != PrevInstIt && Budget <= BudgetLimit) {
// A CB that is rejected by NoCallIntrinsicOrDoesNotReturn and is not
// vectorized: every last-instruction collected so far is cached with flag 0.
18528 CB && !NoCallIntrinsicOrDoesNotReturn(CB) && !
isVectorized(CB)) {
18529 for (
const Instruction *LastInst : LastInstsInRange)
18530 CheckedInstructions.
try_emplace(LastInst, &*PrevInstIt, 0);
// Collect bundle-last instructions met on the way so they can be cached too.
18533 if (LastInstructions.
contains(&*PrevInstIt))
18534 LastInstsInRange.
push_back(&*PrevInstIt);
// Completed: the scan reached InstIt. The result is true when the scan
// completed or the budget was not exceeded; it is cached for all collected
// last-instructions together with the point the scan reached.
18543 const bool Completed = PrevInstIt == InstIt;
18544 const bool NoCallsInRange = Completed || Budget <= BudgetLimit;
18545 for (
const Instruction *LastInst : LastInstsInRange)
18547 LastInst, Completed ?
First : &*PrevInstIt, NoCallsInRange ? 1 : 0);
18548 return NoCallsInRange;
// Adjusts the running Cost for one operand entry Op: adds the TTI cost of
// keeping the vector type live over a call, multiplied by the entry's
// loop-iteration scale, and subtracts a term based on the per-scalar keep-live
// cost times the number of scalars.
// NOTE(review): the construction of VecTy and the tail of the subtraction are
// on lines elided from this listing.
18550 auto AddCosts = [&](
const TreeEntry *
Op) {
18553 Type *ScalarTy =
Op->Scalars.front()->getType();
// MinBWs may hold an entry for Op; the branch body that uses it is elided.
18554 auto It = MinBWs.find(
Op);
18555 if (It != MinBWs.end())
18558 uint64_t Scale = getScaleToLoopIterations(*
Op);
18559 InstructionCost KeepLiveCost = TTI->getCostOfKeepingLiveOverCall(VecTy);
18560 KeepLiveCost *= Scale;
18561 Cost += KeepLiveCost;
18564 Cost -=
Op->Scalars.size() * TTI->getCostOfKeepingLiveOverCall(ScalarTy) *
// Cached helpers for checking blocks and loops for calls, followed by a
// worklist walk over basic blocks.
// NOTE(review): large parts of these helpers, including the header of the
// block-walking helper and the values returned, are elided from this listing.
18572 ParentOpParentToPreds;
// Per-block cached query; try_emplace seeds the cache entry with false.
18580 auto BlockHasNoReturnCall = [&](
const BasicBlock *BB) {
18581 auto [It, Inserted] = BlockHasNoReturnCallCache.
try_emplace(BB,
false);
// A call that does not return and is not the block terminator.
18586 if (CB && CB->doesNotReturn() && !CB->isTerminator()) {
// Per-loop cached query (cache: LoopBodyHasNonVecCall).
18596 auto LoopBodyHasCall = [&](
const Loop *L) {
18597 if (
auto It = LoopBodyHasNonVecCall.
find(L);
18598 It != LoopBodyHasNonVecCall.
end())
18606 if (BlockHasNoReturnCall(BB))
// Non-calls, calls accepted by NoCallIntrinsicOrDoesNotReturn, and vectorized
// calls are filtered out here.
18610 if (!CB || NoCallIntrinsicOrDoesNotReturn(CB) ||
isVectorized(CB))
// Result cache keyed by the (Root, OpParent) block pair.
18621 auto Key = std::make_pair(Root, OpParent);
18622 if (
auto It = ParentOpParentToPreds.
find(
Key);
18623 It != ParentOpParentToPreds.
end())
// Walk up the loop nest from Root's loop until a loop containing OpParent is
// reached (or no loop remains).
18631 const Loop *L = LI->getLoopFor(Root);
18632 const Loop *Outermost =
nullptr;
18633 while (L && !L->contains(OpParent)) {
18635 L = L->getParentLoop();
18637 if (Outermost && LoopBodyHasCall(Outermost))
// Worklist traversal over blocks, tracking visited blocks.
18666 while (!Worklist.
empty()) {
18668 if (BB == OpParent) {
18672 if (!Visited.
insert(BB).second)
18677 if (DT->properlyDominates(Root, BB))
18683 if (BlockHasNoReturnCall(BB))
// Reuse a previously cached answer for the (BB, OpParent) pair when present.
18685 auto Pair = std::make_pair(BB, OpParent);
18686 if (
auto It = ParentOpParentToPreds.
find(Pair);
18687 It != ParentOpParentToPreds.
end()) {
18701 if (Budget > BudgetLimit)
// Starting from entry E, follows the UserTreeIndex.UserTE chain upwards and
// tests each user against ScalarOrPseudoEntries; the first user that is not in
// that set triggers the (elided) branch.
// NOTE(review): the assertion condition and the return statements are on lines
// elided from this listing.
18714 auto FindNonScalarParentEntry = [&](
const TreeEntry *E) ->
const TreeEntry * {
18716 "Expected scalar or pseudo entry.");
18717 const TreeEntry *Entry = E;
18718 while (Entry->UserTreeIndex) {
18719 Entry = Entry->UserTreeIndex.UserTE;
18720 if (!ScalarOrPseudoEntries.
contains(Entry))
// Worklist loop over live tree entries: for each entry, its operand entries
// are looked up and the path between the entry and each operand is checked
// with CheckForNonVecCallsInSameBlock / CheckPredecessors.
// NOTE(review): the enclosing function header, how Entry is taken from
// LiveEntries, and what happens when a check fails are elided from this
// listing.
18725 while (!LiveEntries.
empty()) {
// Entries without recorded operands are skipped by the (elided) branches.
18727 const auto OpIt = EntriesToOperands.
find(Entry);
18728 if (OpIt == EntriesToOperands.
end())
18731 if (Operands.
empty())
// Scalar/pseudo entries are replaced by a parent entry found through their
// user chain.
18733 if (ScalarOrPseudoEntries.
contains(Entry)) {
18734 Entry = FindNonScalarParentEntry(Entry);
18736 for (
const TreeEntry *
Op : Operands) {
18737 if (!
Op->isGather())
// at() is used here, so Entry is expected to have a recorded last instruction.
18743 Instruction *LastInst = EntriesToLastInstruction.
at(Entry);
18745 for (
const TreeEntry *
Op : Operands) {
18746 if (!
Op->isGather())
18750 if (Entry->State == TreeEntry::SplitVectorize ||
18751 (Entry->getOpcode() != Instruction::PHI &&
Op->isGather()) ||
// For PHI users the relevant block is the incoming block of the operand's
// edge.
18757 Pred = Phi->getIncomingBlock(
Op->UserTreeIndex.EdgeIdx);
// A gather operand reaching this point must hang off a PHI entry.
18760 if (
Op->isGather()) {
18761 assert(Entry->getOpcode() == Instruction::PHI &&
18762 "Expected phi node only.");
18764 ->getIncomingBlock(
Op->UserTreeIndex.EdgeIdx);
18766 for (
Value *V :
Op->Scalars) {
18777 OpLastInst = EntriesToLastInstruction.
at(
Op);
// Same-block case: the argument order of the check depends on whether the
// entry is a PHI.
18781 if (OpParent == Parent) {
18782 if (Entry->getOpcode() == Instruction::PHI) {
18783 if (!CheckForNonVecCallsInSameBlock(LastInst, OpLastInst))
18787 if (!CheckForNonVecCallsInSameBlock(OpLastInst, LastInst))
// Different blocks: check within a block for non-PHI entries and for the
// operand, then the blocks in between via CheckPredecessors.
18793 if (Entry->getOpcode() != Instruction::PHI &&
18794 !CheckForNonVecCallsInSameBlock(
18800 if (!CheckForNonVecCallsInSameBlock(OpLastInst,
18806 if (!CheckPredecessors(Parent, Pred, OpParent)) {
// Walks two chains in lock step, starting at IE1 and IE2. Each step only
// considers a chain element when it is the chain start or has a single use;
// the loop stops once neither cursor moved in an iteration.
// NOTE(review): the enclosing function header, the loop body that advances
// I1/I2, and the returned value are elided from this listing.
18822 const auto *I1 = IE1;
18823 const auto *I2 = IE2;
18835 if (I1 && (I1 == IE1 || I1->hasOneUse()) &&
18838 if (I2 && ((I2 == IE2 || I2->
hasOneUse())) &&
18841 }
while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2));
/// Helper with two static get<U>() overloads selected through enable_if on
/// whether U is `Value *`.
/// NOTE(review): both function bodies are elided from this listing.
18848struct ValueSelect {
// Enabled when U is `Value *`: takes a named Value *V and returns `Value *`.
18849 template <
typename U>
18850 static std::enable_if_t<std::is_same_v<Value *, U>,
Value *>
get(
Value *V) {
// Enabled for every other U: the argument is unnamed (unused) and the return
// type is U.
18853 template <
typename U>
18854 static std::enable_if_t<!std::is_same_v<Value *, U>,
U>
get(
Value *) {
// Template routine that folds a list of (vector, mask) pairs in ShuffleMask
// into a chain of Action(...) calls, resizing vectors through ResizeAction and
// querying their width through GetVF.
// NOTE(review): the function name, its parameter list and several statements
// (e.g. how Mask/SecMask are bound and which lanes are rewritten) are elided
// from this listing.
18872template <
typename T>
18878 assert(!ShuffleMask.empty() &&
"Empty list of shuffles for inserts.");
// VMIt starts at the second (vector, mask) pair.
18880 auto VMIt = std::next(ShuffleMask.begin());
18883 buildUseMask(Mask.size(), Mask, UseMask::UndefsAsMask);
// Case 1: the base is not entirely undef. The first vector is resized and the
// rewritten mask lanes are offset by VF; when Res.second is set the lane's own
// index is used instead of the stored mask value.
18885 if (!IsBaseUndef.
all()) {
18887 std::pair<T *, bool> Res =
18888 ResizeAction(ShuffleMask.begin()->first, Mask,
false);
18890 for (
unsigned Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) {
18894 Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF;
18896 [[maybe_unused]]
auto *V = ValueSelect::get<T *>(
Base);
18897 assert((!V || GetVF(V) == Mask.size()) &&
18898 "Expected base vector of VF number of elements.");
18899 Prev = Action(Mask, {
nullptr, Res.first});
// Case 2: a single (vector, mask) pair.
18900 }
else if (ShuffleMask.size() == 1) {
18903 std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask,
18909 Prev = Action(Mask, {ShuffleMask.begin()->first});
// Case 3: at least two pairs. If the first two vectors have the same width
// they are combined directly, with second-vector lanes offset by Vec1VF.
18913 unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);
18914 unsigned Vec2VF = GetVF(VMIt->first);
18915 if (Vec1VF == Vec2VF) {
18919 for (
unsigned I = 0, VF = Mask.size();
I < VF; ++
I) {
18922 Mask[
I] = SecMask[
I] + Vec1VF;
18925 Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
// Otherwise both vectors are resized first and second-vector lanes are offset
// by VF.
18928 std::pair<T *, bool> Res1 = ResizeAction(ShuffleMask.begin()->first, Mask,
18930 std::pair<T *, bool> Res2 =
18931 ResizeAction(VMIt->first, VMIt->second,
false);
18933 for (
unsigned I = 0, VF = Mask.size();
I < VF; ++
I) {
18940 Mask[
I] = (Res2.second ?
I : SecMask[
I]) + VF;
18943 Prev = Action(Mask, {Res1.first, Res2.first});
18945 VMIt = std::next(VMIt);
18947 [[maybe_unused]]
bool IsBaseNotUndef = !IsBaseUndef.
all();
// Remaining pairs are folded one at a time onto the previous result Prev.
18949 for (
auto E = ShuffleMask.end(); VMIt !=
E; ++VMIt) {
18951 std::pair<T *, bool> Res =
18952 ResizeAction(VMIt->first, VMIt->second,
false);
18954 for (
unsigned I = 0, VF = Mask.size();
I < VF; ++
I) {
18957 "Multiple uses of scalars.");
18958 Mask[
I] = (Res.second ?
I : SecMask[
I]) + VF;
18963 Prev = Action(Mask, {Prev, Res.first});
// Start of a tree-cost routine: an early rejection check followed by helpers
// that decide whether a scalar counts as externally used.
// NOTE(review): the enclosing function header and many statements, including
// the values returned by the helper branches, are elided from this listing.
18972 if (
any_of(VectorizableTree, [&](
const std::unique_ptr<TreeEntry> &TE) {
18973 return TE->isGather() &&
18977 dbgs() <<
"SLP: rejecting tree with buildvector struct values of size "
18978 << VectorizableTree.size() <<
".\n");
18987 << VectorizableTree.size() <<
".\n");
// Use-count cut-off for the hasNUsesOrMore() test below: one more than the
// number of entries in ScalarToTreeEntries.
18991 const size_t NumVectScalars = ScalarToTreeEntries.size() + 1;
// Cached per-value query (cache: ExternalUseVCache).
18993 auto IsExternallyUsedV = [&](
Value *V) {
18994 auto [It, Inserted] = ExternalUseVCache.
try_emplace(V);
// Cheap cases first: single-use or void-typed values, then values with at
// least NumVectScalars uses.
18998 if (V->hasOneUse() || V->getType()->isVoidTy()) {
19000 }
else if (V->hasNUsesOrMore(NumVectScalars)) {
// Per-user checks: insertelement and store users of I (directly or through a
// single-use cast) are matched explicitly.
19006 m_InsertElt(m_Value(), m_OneUse(m_CastOrSelf(m_Specific(I))),
19009 if (match(U, m_InsertElt(m_Value(), m_Specific(I), m_ConstantInt())))
19011 if (match(U, m_Store(m_OneUse(m_CastOrSelf(m_Specific(I))), m_Value())))
19013 if (match(U, m_Store(m_Specific(I), m_Value())))
// Users with no tree entries that are not in MustGather, and users whose tree
// entries or gather nodes were deleted, are tested here.
19015 ArrayRef<TreeEntry *> Entries = getTreeEntries(U);
19016 if (Entries.empty() && !MustGather.contains(U))
19018 if (any_of(Entries,
19019 [&](TreeEntry *TE) { return DeletedNodes.contains(TE); }))
19021 return any_of(ValueToGatherNodes.lookup(U), [&](
const TreeEntry *TE) {
19022 return DeletedNodes.contains(TE);
// Entry-aware wrapper: only valid for stateful, non-gather, non-split nodes.
// Copyable elements of TE are special-cased; everything else defers to
// IsExternallyUsedV.
19029 auto IsExternallyUsed = [&](
const TreeEntry &TE,
Value *V) {
19030 assert(TE.hasState() && !TE.isGather() &&
19031 TE.State != TreeEntry::SplitVectorize &&
"Expected vector node.");
19032 if (TE.hasCopyableElements() && TE.isCopyableElement(V))
19034 return IsExternallyUsedV(V);
// Main per-node loop: records a cost for every tree entry in NodesCosts,
// collects gathered-load nodes, and records extract costs for externally used
// lanes in ExtractCosts.
// NOTE(review): the computation of C (the node cost), of ExtCost, and the
// accumulation into Cost are on lines elided from this listing.
19041 for (
const std::unique_ptr<TreeEntry> &Ptr : VectorizableTree) {
19042 TreeEntry &TE = *Ptr;
// Combined nodes get a default-constructed cost entry and are otherwise
// skipped (per the debug message).
19045 if (TE.State == TreeEntry::CombinedVectorize) {
19047 dbgs() <<
"SLP: Skipping cost for combined node that starts with "
19048 << *TE.Scalars[0] <<
".\n";
19049 TE.dump();
dbgs() <<
"SLP: Current total cost = " << Cost <<
"\n");
19050 NodesCosts.try_emplace(&TE);
// Gather / split nodes that duplicate an existing tree entry with the same
// values and vector factor also get a default cost entry.
19053 if (
TE.hasState() &&
19054 (
TE.isGather() ||
TE.State == TreeEntry::SplitVectorize)) {
19055 if (
const TreeEntry *
E =
19056 getSameValuesTreeEntry(
TE.getMainOp(),
TE.Scalars);
19057 E &&
E->getVectorFactor() ==
TE.getVectorFactor()) {
19062 <<
"SLP: Current total cost = " <<
Cost <<
"\n");
19063 NodesCosts.try_emplace(&TE);
19070 assert((!
TE.isGather() ||
TE.Idx == 0 ||
TE.UserTreeIndex) &&
19071 "Expected gather nodes with users only.");
// Scale == 0 means "not computed yet". It is computed only for non-free costs:
// gather-like nodes use getGatherNodeEffectiveScale (passing RdxRoot only for
// the root node), other nodes use getScaleToLoopIterations.
19074 uint64_t Scale = 0;
19075 bool CostIsFree =
C == 0;
19080 const bool IsGatherLike =
19081 TE.isGather() ||
TE.State == TreeEntry::SplitVectorize;
// PrevVecParent is compared with, and later updated to, the parent block of
// the node's main op. Presumably this lets a previous scale be reused for
// nodes in the same block - confirm against the elided lines.
19082 if (!CostIsFree && !
TE.isGather() &&
TE.hasState()) {
19083 if (PrevVecParent ==
TE.getMainOp()->getParent()) {
19089 if (!CostIsFree && !Scale) {
19092 ? getGatherNodeEffectiveScale(TE,
TE.Idx == 0 ? RdxRoot :
nullptr)
19093 : getScaleToLoopIterations(
TE);
19096 if (!
TE.isGather() &&
TE.hasState()) {
19097 PrevVecParent = TE.getMainOp()->getParent();
19102 NodesCosts.try_emplace(&TE,
C);
19105 <<
"SLP: Current total cost = " <<
Cost <<
"\n");
// Non-root Load nodes without a user edge are remembered as gathered loads.
19107 if (
TE.Idx > 0 && !
TE.UserTreeIndex &&
TE.hasState() &&
19108 TE.getOpcode() == Instruction::Load)
19109 GatheredLoadsNodes.insert(&TE);
// Extract costs are considered for vector nodes other than an InsertElement /
// Store root (further conditions are elided).
19110 if (!
TE.isGather() &&
TE.State != TreeEntry::SplitVectorize &&
19111 !(
TE.Idx == 0 && (
TE.getOpcode() == Instruction::InsertElement ||
19112 TE.getOpcode() == Instruction::Store)) &&
// Mark the lane of every externally used scalar.
19116 for (
Value *V :
TE.Scalars) {
19117 if (IsExternallyUsed(TE, V))
19118 DemandedElts.
setBit(
TE.findLaneForValue(V));
19120 if (!DemandedElts.
isZero()) {
19121 Type *ScalarTy =
TE.Scalars.front()->getType();
19122 auto It = MinBWs.find(&TE);
19123 if (It != MinBWs.end())
19128 DemandedElts,
false,
19130 if (ExtCost.
isValid() && ExtCost != 0) {
19132 Scale = getScaleToLoopIterations(TE);
19136 ExtractCosts.try_emplace(&TE, ExtCost);
// Builds SubtreeCosts: one tuple per tree node holding two accumulated costs
// and the list of node indices contributing to it.
// NOTE(review): several statements (the definition of Sz, the early-exit
// action, and the sources of C / ExtractCost / TotalCost) are elided from this
// listing.
19145 constexpr unsigned PartLimit = 2;
19146 const unsigned Sz =
19148 const unsigned MinVF =
getMinVF(Sz);
// Condition on narrow roots: root scalar count times PartLimit does not exceed
// MinVF, and the root is stateless or a non-Store inside a loop.
19150 VectorizableTree.front()->Scalars.size() * PartLimit <= MinVF &&
19151 (!VectorizableTree.front()->hasState() ||
19152 (VectorizableTree.front()->getOpcode() != Instruction::Store &&
19153 LI->getLoopFor(VectorizableTree.front()->getMainOp()->getParent()))))
19160 std::tuple<InstructionCost, InstructionCost, SmallVector<unsigned>>>
19161 SubtreeCosts(VectorizableTree.size());
// Propagates a node's costs up the user chain: each not-yet-visited
// (TE, UserTE) pair has TotalCost added to tuple slot 0, Cost added to slot 1,
// and TE's index appended to slot 2 (the AddToList flag presumably gates the
// append - the guarding line is elided).
19162 auto UpdateParentNodes =
19163 [&](
const TreeEntry *UserTE,
const TreeEntry *
TE,
19165 SmallDenseSet<std::pair<const TreeEntry *, const TreeEntry *>, 4>
19167 bool AddToList =
true) {
19169 VisitedUser.insert(std::make_pair(TE, UserTE)).second) {
19170 std::get<0>(SubtreeCosts[UserTE->Idx]) += TotalCost;
19171 std::get<1>(SubtreeCosts[UserTE->Idx]) +=
Cost;
19173 std::get<2>(SubtreeCosts[UserTE->Idx]).push_back(
TE->Idx);
19174 UserTE = UserTE->UserTreeIndex.UserTE;
// Seed every node with its own cost (slot 0 additionally includes the extract
// cost) and propagate it to its users.
19177 for (
const std::unique_ptr<TreeEntry> &Ptr : VectorizableTree) {
19178 TreeEntry &
TE = *Ptr;
19181 std::get<0>(SubtreeCosts[
TE.Idx]) +=
C + ExtractCost;
19182 std::get<1>(SubtreeCosts[
TE.Idx]) +=
C;
19183 if (
const TreeEntry *UserTE =
TE.UserTreeIndex.UserTE) {
19184 SmallDenseSet<std::pair<const TreeEntry *, const TreeEntry *>, 4>
19186 UpdateParentNodes(UserTE, &TE,
C + ExtractCost,
C, VisitedUser);
// Gathered-load nodes are attributed to every gather node that uses one of
// their scalars, sharing one Visited set across all of them.
19189 SmallDenseSet<std::pair<const TreeEntry *, const TreeEntry *>, 4> Visited;
19190 for (TreeEntry *TE : GatheredLoadsNodes) {
19193 for (
Value *V :
TE->Scalars) {
19194 for (
const TreeEntry *BVTE : ValueToGatherNodes.lookup(V))
19195 UpdateParentNodes(BVTE, TE, TotalCost,
Cost, Visited,
// Priority-queue setup over the per-node subtree costs.
// NOTE(review): part of the CostIndicesTy alias, the name of the queue
// variable's declaration line, and the action of the final `if` are elided
// from this listing.
19200 using CostIndicesTy =
19202 SmallVector<unsigned>>>;
// Comparator for the queue: LHS orders before RHS when its first tuple element
// (std::get<0>) is smaller, with ties broken by the smaller node Idx.
19203 struct FirstGreater {
19204 bool operator()(
const CostIndicesTy &
LHS,
const CostIndicesTy &
RHS)
const {
19205 return std::get<0>(
LHS.second) < std::get<0>(
RHS.second) ||
19206 (std::get<0>(
LHS.second) == std::get<0>(
RHS.second) &&
19207 LHS.first->Idx <
RHS.first->Idx);
19210 PriorityQueue<CostIndicesTy, SmallVector<CostIndicesTy>, FirstGreater>
// One queue item per tree node, pairing the node with its SubtreeCosts tuple.
19212 for (
const auto [Idx,
P] :
enumerate(SubtreeCosts))
19213 Worklist.emplace(VectorizableTree[Idx].
get(),
P);
// Special case: no UserIgnoreList, a Store root narrower than MinVF, and the
// queue's top item being node 0 or node 1.
19216 if (!UserIgnoreList && VectorizableTree.front()->getVectorFactor() < MinVF &&
19217 VectorizableTree.front()->hasState() &&
19218 VectorizableTree.front()->getOpcode() == Instruction::Store &&
19219 (Worklist.top().first->Idx == 0 || Worklist.top().first->Idx == 1))
// Trimming loop: repeatedly takes the top worklist item while its first cost
// is positive, compares the subtree cost with the cost of gathering the node
// instead (GatherCost), and marks unprofitable subtrees for deletion or
// conversion to gathers.
// NOTE(review): the computation of GatherCost, SubtreeCost and TrimMargin, the
// queue pop, and the bodies of several branches are elided from this listing.
19223 bool PreferTrimmedTree =
false;
19224 while (!Worklist.empty() && std::get<0>(Worklist.top().second) > 0) {
19225 TreeEntry *
TE = Worklist.top().first;
// Nodes that are never trimmed: gathers, the root, already deleted nodes, and
// (among further elided conditions) nodes under a SplitVectorize user with
// scalars that have more than one split tree entry.
19226 if (
TE->isGather() ||
TE->Idx == 0 || DeletedNodes.contains(TE) ||
19230 (
TE->UserTreeIndex &&
19231 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize &&
19233 ArrayRef<TreeEntry *> Entries = getSplitTreeEntries(V);
19234 return Entries.size() > 1;
// Vectorized, non-alternate compare nodes whose scalars use a predicate
// different from the main op's predicate.
19240 if (
TE->State == TreeEntry::Vectorize && !
TE->isAltShuffle() &&
19241 (
TE->getOpcode() == Instruction::ICmp ||
19242 TE->getOpcode() == Instruction::FCmp) &&
19244 auto *I = dyn_cast<CmpInst>(V);
19247 return I->getPredicate() !=
19248 cast<CmpInst>(TE->getMainOp())->getPredicate();
19255 InstructionCost TotalSubtreeCost = std::get<0>(Worklist.top().second);
// Subtrees cheaper than the node's scalar count are tested here (branch body
// elided).
19257 if (TotalSubtreeCost < TE->Scalars.size()) {
// Children already converted to gathers contribute their recorded gather cost
// instead of their original subtree costs.
19261 if (!TransformedToGatherNodes.empty()) {
19262 for (
unsigned Idx : std::get<2>(Worklist.top().second)) {
19263 auto It = TransformedToGatherNodes.find(VectorizableTree[Idx].
get());
19264 if (It != TransformedToGatherNodes.end()) {
19265 TotalSubtreeCost -= std::get<0>(SubtreeCosts[Idx]);
19266 SubtreeCost -= std::get<1>(SubtreeCosts[Idx]);
19267 TotalSubtreeCost += It->second;
19268 SubtreeCost += It->second;
19272 if (TotalSubtreeCost < 0 || TotalSubtreeCost < TE->Scalars.size()) {
19276 const unsigned EntryVF =
TE->getVectorFactor();
19280 DemandedElts.
setBit(Idx);
19284 auto It = MinBWs.find(TE);
19285 if (It != MinBWs.end())
// Build the shuffle mask implied by the node's reorder / reuse indices (Store
// nodes get a mask sized from ReorderIndices).
19291 SmallVector<int>
Mask;
19292 if (!
TE->ReorderIndices.empty() &&
19293 TE->State != TreeEntry::CompressVectorize &&
19294 (
TE->State != TreeEntry::StridedVectorize ||
19296 SmallVector<int> NewMask;
19297 if (
TE->getOpcode() == Instruction::Store) {
19299 NewMask.
resize(
TE->ReorderIndices.size());
19306 if (!
TE->ReuseShuffleIndices.empty())
19313 if ((!
TE->hasState() || !
TE->isAltShuffle()) &&
19315 return (TE->hasCopyableElements() && TE->isCopyableElement(V)) ||
19316 isConstant(V) || isGathered(V) || getTreeEntries(V).size() > 1;
19320 ArrayRef<unsigned> Nodes = std::get<2>(Worklist.top().second);
// Tie-break helper: subtree cost equals gather cost, the node has state, and
// no child node is an ExtractElement (plus an elided condition).
19326 auto IsEqualCostAltShuffleToTrim = [&]() {
19327 return TotalSubtreeCost == GatherCost &&
TE->hasState() &&
19329 none_of(Nodes, [&](
unsigned Idx) {
19330 return VectorizableTree[Idx]->hasState() &&
19331 VectorizableTree[Idx]->getOpcode() ==
19332 Instruction::ExtractElement;
19337 bool HasNonPowerOf2 =
any_of(Nodes, [&](
unsigned Idx) {
// Trim when the subtree is more expensive than gathering by more than
// TrimMargin, or on the equal-cost tie-break. PreferTrimmedTree records that
// an equal-cost trim happened.
19341 if (TotalSubtreeCost > GatherCost + TrimMargin ||
19342 IsEqualCostAltShuffleToTrim()) {
19343 PreferTrimmedTree |= TotalSubtreeCost == GatherCost;
19348 if (VectorizableTree.front()->hasState() &&
19349 VectorizableTree.front()->getOpcode() == Instruction::InsertElement &&
19353 <<
" - tree already profitable with cost " <<
Cost
19358 return InstructionCost::getInvalid();
19361 LLVM_DEBUG(
dbgs() <<
"SLP: Trimming unprofitable subtree at node "
19362 <<
TE->Idx <<
" with cost "
19363 << std::get<0>(Worklist.top().second)
19364 <<
" and gather cost " << GatherCost <<
".\n");
// Nodes with a user become gathers (recording GatherCost); the statements
// after that mark a node as deleted instead - presumably the else-branch for
// nodes without a user (the `else` line is elided).
19365 if (
TE->UserTreeIndex) {
19366 TransformedToGatherNodes.try_emplace(TE, GatherCost);
19367 NodesCosts.erase(TE);
19369 DeletedNodes.insert(TE);
19370 TransformedToGatherNodes.erase(TE);
19371 NodesCosts.erase(TE);
// All nodes of the trimmed subtree are deleted and lose their cost records.
19373 for (
unsigned Idx : Nodes) {
19374 TreeEntry &ChildTE = *VectorizableTree[Idx];
19375 DeletedNodes.insert(&ChildTE);
19376 TransformedToGatherNodes.erase(&ChildTE);
19377 NodesCosts.erase(&ChildTE);
// Returns the second accumulated cost of the root node (the guarding condition
// is elided).
19384 return std::get<1>(SubtreeCosts.front());
// Post-trimming handling of gathered-load nodes: for each surviving node,
// compares the cost of extracting the demanded lanes (ExtractsCost) with the
// buildvector cost (BVCost) and accumulates the chosen cost in
// LoadsExtractsCost.
// NOTE(review): the computation of ExtractsCost and BVCost and the
// declarations of DemandedElts / BVDemandedElts are elided from this listing.
19386 SmallPtrSet<TreeEntry *, 4> GatheredLoadsToDelete;
19393 for (TreeEntry *TE : GatheredLoadsNodes) {
// Nodes already deleted or converted to gathers are filtered first.
19394 if (DeletedNodes.contains(TE) || TransformedToGatherNodes.contains(TE))
19396 GatheredLoadsToDelete.
insert(TE);
// Group the node's scalars by the non-deleted gather nodes that use them and
// mark the corresponding lanes as demanded.
19399 SmallDenseMap<const TreeEntry *, SmallVector<Value *>> ValuesToInsert;
19400 for (
Value *V :
TE->Scalars) {
19401 unsigned Pos =
TE->findLaneForValue(V);
19402 for (
const TreeEntry *BVE : ValueToGatherNodes.lookup(V)) {
19403 if (DeletedNodes.contains(BVE))
19405 DemandedElts.
setBit(Pos);
19406 ValuesToInsert.
try_emplace(BVE).first->second.push_back(V);
19409 if (!DemandedElts.
isZero()) {
19410 Type *ScalarTy =
TE->Scalars.front()->getType();
19411 auto It = MinBWs.find(TE);
19412 if (It != MinBWs.end())
// Per gather node: mark the lanes that would have to be inserted.
19419 for (
const auto &[BVE, Values] : ValuesToInsert) {
19423 for (
Value *V : Values) {
19424 unsigned Pos = BVE->findLaneForValue(V);
19426 BVDemandedElts.
setBit(Pos);
19428 auto *BVVecTy =
getWidenedType(ScalarTy, BVE->getVectorFactor());
// Cheaper extracts: the node is taken back out of the to-delete set.
19434 if (ExtractsCost < BVCost) {
19435 LoadsExtractsCost += ExtractsCost;
19436 GatheredLoadsToDelete.
erase(TE);
19439 LoadsExtractsCost += BVCost;
19441 NodesCosts.erase(TE);
// Finalization after trimming: propagates deletions to users of deleted
// gathered loads, recomputes costs for nodes without a cost record, sums the
// new total, and either keeps or discards the trimming.
// NOTE(review): the declaration of NewCost, the scaling of C, and the return
// statements of the surrounding branches are elided from this listing.
// A node whose user is in GatheredLoadsToDelete is deleted and added to that
// set itself, so the deletion cascades to later nodes in iteration order.
19445 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
19446 if (
TE->UserTreeIndex &&
19447 GatheredLoadsToDelete.
contains(
TE->UserTreeIndex.UserTE)) {
19448 DeletedNodes.insert(
TE.get());
19449 NodesCosts.erase(
TE.get());
19450 GatheredLoadsToDelete.
insert(
TE.get());
19452 if (GatheredLoadsToDelete.
contains(
TE.get()))
19453 DeletedNodes.insert(
TE.get());
19456 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
// A user-less node converted to a gather is expected to be a Load.
19457 if (!
TE->UserTreeIndex && TransformedToGatherNodes.contains(
TE.get())) {
19458 assert(
TE->getOpcode() == Instruction::Load &&
"Expected load only.");
19461 if (DeletedNodes.contains(
TE.get()))
// Nodes that lost their cost record get a freshly computed entry cost.
// Invalid or zero costs are stored as-is; otherwise a scale is looked up in
// EntryToScale or computed (gather-like nodes vs. other nodes).
19463 if (!NodesCosts.contains(
TE.get())) {
19465 getEntryCost(
TE.get(), VectorizedVals, CheckedExtracts);
19466 if (!
C.isValid() ||
C == 0) {
19467 NodesCosts.try_emplace(
TE.get(),
C);
19470 uint64_t Scale = EntryToScale.
lookup(
TE.get());
19472 const bool IsGatherLike =
19473 TE->isGather() ||
TE->State == TreeEntry::SplitVectorize;
19474 Scale = IsGatherLike ? getGatherNodeEffectiveScale(*
TE.get())
19475 : getScaleToLoopIterations(*
TE.
get());
19478 NodesCosts.try_emplace(
TE.get(),
C);
19482 LLVM_DEBUG(
dbgs() <<
"SLP: Recalculate costs after tree trimming.\n");
// Sum the remaining per-node costs.
19484 for (
const auto &
P : NodesCosts) {
19485 NewCost +=
P.second;
19486 LLVM_DEBUG(
dbgs() <<
"SLP: Adding cost " <<
P.second <<
" for bundle "
19489 <<
"SLP: Current total cost = " << NewCost <<
"\n");
// If the trimmed tree is more expensive than the original (or equally
// expensive and no equal-cost trim was preferred), the trimming is undone by
// clearing the bookkeeping sets.
19491 if (NewCost + LoadsExtractsCost >
Cost ||
19492 (!PreferTrimmedTree && NewCost + LoadsExtractsCost ==
Cost)) {
19493 DeletedNodes.clear();
19494 TransformedToGatherNodes.clear();
// An InsertElement root whose operand node (node 1) was turned into a gather
// yields an invalid cost.
19499 if (VectorizableTree.size() >= 2 && VectorizableTree.front()->hasState() &&
19500 VectorizableTree.front()->getOpcode() == Instruction::InsertElement &&
19501 TransformedToGatherNodes.contains(VectorizableTree[1].get()))
19502 return InstructionCost::getInvalid();
// Same when node 1 is a vectorized ZExt/SExt/Trunc and node 2 was turned into
// a gather.
19503 if (VectorizableTree.size() >= 3 && VectorizableTree.front()->hasState() &&
19504 VectorizableTree.front()->getOpcode() == Instruction::InsertElement &&
19505 VectorizableTree[1]->hasState() &&
19506 VectorizableTree[1]->State == TreeEntry::Vectorize &&
19507 (VectorizableTree[1]->getOpcode() == Instruction::ZExt ||
19508 VectorizableTree[1]->getOpcode() == Instruction::SExt ||
19509 VectorizableTree[1]->getOpcode() == Instruction::Trunc) &&
19510 TransformedToGatherNodes.contains(VectorizableTree[2].get()))
19511 return InstructionCost::getInvalid();
/// Aggregate parameterized on T. The only member visible here, ValueMasks, is
/// an insertion-ordered map from a T key to an integer shuffle mask.
/// NOTE(review): other members and the closing brace are elided from this
/// listing.
19519template <
typename T>
struct ShuffledInsertData {
19523 MapVector<T, SmallVector<int>> ValueMasks;
19537 VectorizableTree.front()->getVectorFactor() == 2 &&
19541 VectorizableTree.front()->Scalars.front()->getType()))) {
19542 unsigned NumScalar = getNumScalarInsts();
19543 unsigned NumVector = getNumVectorInsts();
19544 LLVM_DEBUG(
dbgs() <<
"SLP: Inst count check: vector=" << NumVector
19545 <<
" scalar=" << NumScalar <<
"\n");
19546 if (NumVector > NumScalar) {
19548 << NumVector <<
" > scalar inst count " << NumScalar
19559 if (!
C.isValid() ||
C == 0)
19562 EntryToScale.
try_emplace(std::make_tuple(&TE, Scalar, U), 0)
19563 .first->getSecond();
19565 Scale = getScaleToLoopIterations(TE, Scalar, U);
19566 LLVM_DEBUG(
dbgs() <<
"Scale " << Scale <<
" For entry " << TE.Idx <<
"\n");
19570 if (UserIgnoreList) {
19572 ReductionCost = ScaleCost(ReductionCost, *VectorizableTree.front().get(),
19573 nullptr, ReductionRoot);
19577 Cost += ReductionCost;
19579 if (
any_of(ExternalUses, [](
const ExternalUser &EU) {
19587 constexpr unsigned CostLimit = 100;
19589 (VectorizableTree.size() - DeletedNodes.size()) *
19590 VectorizableTree.front()->getVectorFactor() <
19595 none_of(ExternalUses, [](
const ExternalUser &EU) {
19601 ExtractCostCalculated;
19607 std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
19614 bool AllUsersGEPSWithStoresLoads =
true;
19615 SmallBitVector UsedLanes(VectorizableTree.front()->getVectorFactor());
19617 Type *UserScalarTy =
nullptr;
19618 for (ExternalUser &EU : ExternalUses) {
19619 ScalarUserAndIdx.
emplace_back(EU.Scalar, EU.User, EU.Lane);
19620 if (EU.E.Idx == 0) {
19621 UsedLanes.
set(EU.Lane);
19627 UserScalarTy = LocalTy;
19628 }
else if (UserScalarTy != LocalTy) {
19629 AllUsersGEPSWithStoresLoads =
false;
19634 AllUsersGEPSWithStoresLoads =
false;
19639 AllUsersGEPSWithStoresLoads &= UsedLanes.
all();
19651 for (
const ExternalUser &EU : ExternalUses) {
19654 if (EphValues.count(EU.User))
19657 if (!DT->isReachableFromEntry(UserParent) || UserParent->
isEHPad() ||
19670 if (LI->getLoopFor(
PHI->getParent())) {
19672 if (
PHI->getIncomingValue(Idx) != EU.Scalar)
19676 UseBlock ? DT->findNearestCommonDominator(UseBlock, InBB) : InBB;
19684 auto [It, Inserted] = ScalarToExtractBlock.
try_emplace(EU.Scalar, UseBlock);
19685 if (!Inserted && It->second && UseBlock)
19686 It->second = DT->findNearestCommonDominator(It->second, UseBlock);
19690 for (ExternalUser &EU : ExternalUses) {
19691 LLVM_DEBUG(
dbgs() <<
"SLP: Computing cost for external use of TreeEntry "
19692 << EU.E.Idx <<
" in lane " << EU.Lane <<
"\n");
19694 else dbgs() <<
" User: nullptr\n");
19695 LLVM_DEBUG(
dbgs() <<
" Use: " << EU.Scalar->getNameOrAsOperand() <<
"\n");
19700 if (EphValues.count(EU.User))
19704 if (!CheckedScalarUser.
insert(std::make_pair(EU.Scalar, EU.User)).second ||
19706 CheckedScalarUser.
contains(std::make_pair(EU.Scalar,
nullptr))))
19714 (!DT->isReachableFromEntry(UserParent) || UserParent->isEHPad() ||
19721 (EU.E.hasState() && EU.E.getOpcode() == Instruction::InsertElement)))
19730 if (!UsedInserts.
insert(VU).second)
19734 const TreeEntry *ScalarTE = &EU.E;
19737 [
this, VU](
const ShuffledInsertData<const TreeEntry *> &
Data) {
19742 Value *Op0 =
II->getOperand(0);
19749 if (It == ShuffledInserts.
end()) {
19751 Data.InsertElements.emplace_back(VU);
19753 VecId = ShuffledInserts.
size() - 1;
19754 auto It = MinBWs.find(ScalarTE);
19755 if (It != MinBWs.end() &&
19757 .
insert(std::make_pair(ScalarTE, FTy->getElementType()))
19759 unsigned BWSz = It->second.first;
19760 unsigned DstBWSz = DL->getTypeSizeInBits(FTy->getElementType());
19761 unsigned VecOpcode;
19762 if (DstBWSz < BWSz)
19763 VecOpcode = Instruction::Trunc;
19766 It->second.second ? Instruction::SExt : Instruction::ZExt;
19771 FTy->getNumElements()),
19774 <<
" for extending externally used vector with "
19775 "non-equal minimum bitwidth.\n");
19780 It->InsertElements.front() = VU;
19781 VecId = std::distance(ShuffledInserts.
begin(), It);
19783 int InIdx = *InsertIdx;
19785 ShuffledInserts[VecId].ValueMasks[ScalarTE];
19788 Mask[InIdx] = EU.Lane;
19789 DemandedElts[VecId].setBit(InIdx);
19800 auto *ScalarTy = EU.Scalar->getType();
19801 const unsigned BundleWidth = EU.E.getVectorFactor();
19802 assert(EU.Lane < BundleWidth &&
"Extracted lane out of bounds.");
19804 const TreeEntry *Entry = &EU.E;
19805 auto It = MinBWs.find(Entry);
19808 assert(EU.User &&
"Expected user for struct extract");
19810 Indices.
assign(EV->getIndices());
19813 auto ExtractKey = std::make_pair(EU.Scalar, Indices);
19815 !ExtractCostCalculated.
insert(ExtractKey).second)
19817 if (It != MinBWs.end()) {
19822 ? Instruction::ZExt
19823 : Instruction::SExt;
19828 << ExtraCost <<
"\n");
19830 Type *ExtractTy = VecTy;
19835 *TTI, ScalarTy, Instruction::ExtractElement, ExtractTy,
CostKind,
19836 EU.Lane, EU.Scalar, ScalarUserAndIdx);
19837 LLVM_DEBUG(
dbgs() <<
" ExtractElement cost for " << *ScalarTy <<
" from "
19838 << *VecTy <<
": " << ExtraCost <<
"\n");
19841 if (Entry->Idx != 0 || Entry->getOpcode() == Instruction::GetElementPtr ||
19842 Entry->getOpcode() == Instruction::Load) {
19844 auto IsPhiInLoop = [&](
const ExternalUser &U) {
19847 const Loop *L = LI->getLoopFor(Phi->getParent());
19848 return L && (Phi->getParent() ==
I->getParent() ||
19849 L == LI->getLoopFor(
I->getParent()));
19853 if (!ValueToExtUses) {
19854 ValueToExtUses.emplace();
19855 for (
const auto &
P :
enumerate(ExternalUses)) {
19857 if (IsPhiInLoop(
P.value()))
19860 ValueToExtUses->try_emplace(
P.value().Scalar,
P.index());
19867 auto OperandIsScalar = [&](
Value *V) {
19873 return !EE->hasOneUse() || !MustGather.contains(EE);
19878 return ValueToExtUses->contains(V);
19880 bool CanBeUsedAsScalar =
all_of(Inst->operands(), OperandIsScalar);
19881 bool CanBeUsedAsScalarCast =
false;
19884 Op &&
all_of(
Op->operands(), OperandIsScalar)) {
19889 if (ScalarCost + OpCost <= ExtraCost) {
19890 CanBeUsedAsScalar = CanBeUsedAsScalarCast =
true;
19891 ScalarCost += OpCost;
19895 if (CanBeUsedAsScalar) {
19896 bool KeepScalar = ScalarCost <= ExtraCost;
19900 bool IsProfitablePHIUser =
19902 VectorizableTree.front()->Scalars.size() > 2)) &&
19903 VectorizableTree.front()->hasState() &&
19904 VectorizableTree.front()->getOpcode() == Instruction::PHI &&
19908 auto *PHIUser = dyn_cast<PHINode>(U);
19909 return (!PHIUser ||
19910 PHIUser->getParent() != VectorizableTree.front()
19916 return ValueToExtUses->contains(V);
19918 if (IsProfitablePHIUser) {
19922 (!GatheredLoadsEntriesFirst.has_value() ||
19923 Entry->Idx < *GatheredLoadsEntriesFirst)) {
19924 unsigned ScalarUsesCount =
count_if(Entry->Scalars, [&](
Value *V) {
19925 return ValueToExtUses->contains(V);
19927 auto It = ExtractsCount.
find(Entry);
19928 if (It != ExtractsCount.
end()) {
19929 assert(ScalarUsesCount >= It->getSecond().size() &&
19930 "Expected total number of external uses not less than "
19931 "number of scalar uses.");
19932 ScalarUsesCount -= It->getSecond().size();
19937 KeepScalar = ScalarUsesCount <= 1 || !
has_single_bit(ScalarUsesCount);
19940 ExternalUsesAsOriginalScalar.insert(EU.Scalar);
19941 for (
Value *V : Inst->operands()) {
19947 auto It = ValueToExtUses->find(V);
19948 if (It != ValueToExtUses->end()) {
19950 ExternalUses[It->second].User =
nullptr;
19953 ExtraCost = ScalarCost;
19954 if (!IsPhiInLoop(EU))
19955 ExtractsCount[Entry].
insert(Inst);
19956 if (CanBeUsedAsScalarCast) {
19957 ScalarOpsFromCasts.
insert(Inst->getOperand(0));
19961 for (
Value *V : IOp->operands()) {
19964 auto It = ValueToExtUses->find(V);
19965 if (It != ValueToExtUses->end()) {
19967 ExternalUses[It->second].User =
nullptr;
19982 if (!ExternalUsesAsOriginalScalar.contains(EU.Scalar)) {
19983 if (ExtraCost.
isValid() && ExtraCost != 0) {
19993 ExtraCost = ScaleCost(ExtraCost, *Entry, EU.Scalar,
nullptr);
19996 if (
const Loop *L = ExtractBB ? LI->getLoopFor(ExtractBB) :
nullptr) {
19997 uint64_t Scale = getLoopNestScale(
20000 <<
"SLP: Extract scale " << Scale <<
" (NCD block) for "
20001 << EU.Scalar->getNameOrAsOperand() <<
"\n");
20002 ExtraCost *= Scale;
20007 ExtraCost = ScaleCost(ExtraCost, *Entry, EU.Scalar,
20011 ExtractCost += ExtraCost;
20020 if (AllUsersGEPSWithStoresLoads && !Pointers.
empty()) {
20021 const TreeEntry &RootEntry = *VectorizableTree.front();
20022 const bool AnyRootKeptAsScalar =
any_of(RootEntry.Scalars, [&](
Value *V) {
20023 return ExternalUsesAsOriginalScalar.contains(V);
20025 const Value *CommonBase =
nullptr;
20026 bool HaveCommonBase =
true;
20027 for (
const Value *
P : Pointers) {
20031 else if (CommonBase !=
Op) {
20032 HaveCommonBase =
false;
20036 if (!AnyRootKeptAsScalar && HaveCommonBase) {
20038 auto *VecTy =
getWidenedType(UserScalarTy, RootEntry.Scalars.size());
20040 Pointers, CommonBase, TTI::PointersChainInfo::getUnitStride(),
20043 Pointers, CommonBase, TTI::PointersChainInfo::getUnknownStride(),
20045 ExtractCost += ScaleCost(VectorGEPCost - ScalarGEPCost, RootEntry);
20050 for (
Value *V : ScalarOpsFromCasts) {
20051 ExternalUsesAsOriginalScalar.insert(V);
20053 const auto *It =
find_if_not(TEs, [&](TreeEntry *TE) {
20054 return TransformedToGatherNodes.contains(TE) ||
20055 DeletedNodes.contains(TE);
20057 if (It != TEs.end()) {
20058 const TreeEntry *UserTE = *It;
20059 ExternalUses.emplace_back(V,
nullptr, *UserTE,
20060 UserTE->findLaneForValue(V));
20065 if (!VectorizedVals.
empty()) {
20066 const TreeEntry &Root = *VectorizableTree.front();
20067 auto BWIt = MinBWs.find(&Root);
20068 if (BWIt != MinBWs.end()) {
20069 Type *DstTy = Root.Scalars.front()->getType();
20070 unsigned OriginalSz = DL->getTypeSizeInBits(DstTy->
getScalarType());
20072 ReductionBitWidth == 0 ? BWIt->second.first : ReductionBitWidth;
20073 if (OriginalSz != SrcSz) {
20074 unsigned Opcode = Instruction::Trunc;
20075 if (OriginalSz > SrcSz)
20076 Opcode = BWIt->second.second ? Instruction::SExt : Instruction::ZExt;
20083 TTI->getCastInstrCost(Opcode, DstTy, SrcTy,
20086 CastCost = ScaleCost(CastCost, Root,
nullptr, ReductionRoot);
20096 VectorizableTree[1]->hasState() &&
20097 VectorizableTree[1]->State == TreeEntry::Vectorize &&
20098 all_of(VectorizableTree[1]->Scalars, [&](
Value *V) {
20099 return ExternalUsesAsOriginalScalar.contains(V);
20103 Cost += ExtractCost;
20104 auto &&ResizeToVF = [
this, &Cost](
const TreeEntry *TE,
ArrayRef<int> Mask,
20105 bool ForSingleMask) {
20107 unsigned VF = Mask.size();
20108 unsigned VecVF = TE->getVectorFactor();
20109 bool HasLargeIndex =
20110 any_of(Mask, [VF](
int Idx) {
return Idx >=
static_cast<int>(VF); });
20111 if ((VF != VecVF && HasLargeIndex) ||
20114 if (HasLargeIndex) {
20116 std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)),
20123 dbgs() <<
"SLP: Adding cost " <<
C
20124 <<
" for final shuffle of insertelement external users.\n";
20125 TE->dump();
dbgs() <<
"SLP: Current total cost = " << Cost <<
"\n");
20127 return std::make_pair(TE,
true);
20130 if (!ForSingleMask) {
20132 for (
unsigned I = 0;
I < VF; ++
I) {
20134 ResizeMask[Mask[
I]] = Mask[
I];
20139 TE->getMainOp()->getType(), VecVF)),
20142 dbgs() <<
"SLP: Adding cost " <<
C
20143 <<
" for final shuffle of insertelement external users.\n";
20144 TE->dump();
dbgs() <<
"SLP: Current total cost = " << Cost <<
"\n");
20149 return std::make_pair(TE,
false);
20152 for (
int I = 0, E = ShuffledInserts.
size();
I < E; ++
I) {
20153 Value *
Base = ShuffledInserts[
I].InsertElements.
front()->getOperand(0);
20154 auto Vector = ShuffledInserts[
I].ValueMasks.takeVector();
20158 assert((TEs.size() == 1 || TEs.size() == 2) &&
20159 "Expected exactly 1 or 2 tree entries.");
20160 if (TEs.size() == 1) {
20162 VF = TEs.front()->getVectorFactor();
20163 auto *FTy =
getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
20167 (
Data.index() < VF &&
20168 static_cast<int>(
Data.index()) ==
Data.value());
20172 C = ScaleCost(
C, *TEs.front());
20174 <<
" for final shuffle of insertelement "
20175 "external users.\n";
20176 TEs.front()->
dump();
20177 dbgs() <<
"SLP: Current total cost = " << Cost <<
"\n");
20183 TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor())
20184 VF = TEs.front()->getVectorFactor();
20188 auto *FTy =
getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
20191 C = ScaleCost(
C, *TEs.back());
20193 <<
" for final shuffle of vector node and external "
20194 "insertelement users.\n";
20195 if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump();
20196 dbgs() <<
"SLP: Current total cost = " << Cost <<
"\n");
20204 [](
const TreeEntry *E) { return E->getVectorFactor(); }, ResizeToVF,
20205 EstimateShufflesCost);
20208 ShuffledInserts[
I].InsertElements.
front()->getType()),
20211 Cost -= InsertCost;
20215 if (ReductionBitWidth != 0) {
20216 assert(UserIgnoreList &&
"Expected reduction tree.");
20217 const TreeEntry &E = *VectorizableTree.front();
20218 auto It = MinBWs.find(&E);
20219 if (It != MinBWs.end() && It->second.first != ReductionBitWidth) {
20220 unsigned SrcSize = It->second.first;
20221 unsigned DstSize = ReductionBitWidth;
20222 unsigned Opcode = Instruction::Trunc;
20223 if (SrcSize < DstSize) {
20224 bool IsArithmeticExtendedReduction =
20227 return is_contained({Instruction::Add, Instruction::FAdd,
20228 Instruction::Mul, Instruction::FMul,
20229 Instruction::And, Instruction::Or,
20233 if (IsArithmeticExtendedReduction)
20235 Instruction::BitCast;
20237 Opcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
20239 if (Opcode != Instruction::BitCast) {
20241 getWidenedType(Builder.getIntNTy(SrcSize), E.getVectorFactor());
20243 getWidenedType(Builder.getIntNTy(DstSize), E.getVectorFactor());
20245 switch (E.getOpcode()) {
20246 case Instruction::SExt:
20247 case Instruction::ZExt:
20248 case Instruction::Trunc: {
20249 const TreeEntry *OpTE = getOperandEntry(&E, 0);
20250 CCH = getCastContextHint(*OpTE);
20257 TTI->getCastInstrCost(Opcode, DstVecTy, SrcVecTy, CCH,
20259 CastCost = ScaleCost(CastCost, *VectorizableTree.front().get(),
20260 nullptr, ReductionRoot);
20263 <<
" for final resize for reduction from " << SrcVecTy
20264 <<
" to " << DstVecTy <<
"\n";
20265 dbgs() <<
"SLP: Current total cost = " << Cost <<
"\n");
20270 std::optional<InstructionCost> SpillCost;
20273 Cost += *SpillCost;
20279 OS <<
"SLP: Spill Cost = ";
20284 OS <<
".\nSLP: Extract Cost = " << ExtractCost <<
".\n";
20286 OS <<
"SLP: Reduction Cost = " << ReductionCost <<
".\n";
20287 OS <<
"SLP: Total Cost = " << Cost <<
".\n";
20291 ViewGraph(
this,
"SLP" + F->getName(),
false, Str);
20302std::optional<TTI::ShuffleKind>
20303BoUpSLP::tryToGatherSingleRegisterExtractElements(
20309 for (
int I = 0, E = VL.
size();
I < E; ++
I) {
20325 if (Idx >= VecTy->getNumElements()) {
20329 SmallBitVector ExtractMask(VecTy->getNumElements(),
true);
20330 ExtractMask.reset(*Idx);
20337 VectorOpToIdx[EI->getVectorOperand()].push_back(
I);
20342 stable_sort(Vectors, [](
const auto &P1,
const auto &P2) {
20343 return P1.second.size() >
P2.second.size();
20346 const int UndefSz = UndefVectorExtracts.
size();
20347 unsigned SingleMax = 0;
20348 unsigned PairMax = 0;
20349 if (!Vectors.
empty()) {
20350 SingleMax = Vectors.
front().second.size() + UndefSz;
20351 if (Vectors.
size() > 1) {
20352 auto *ItNext = std::next(Vectors.
begin());
20353 PairMax = SingleMax + ItNext->second.size();
20356 if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
20357 return std::nullopt;
20363 if (SingleMax >= PairMax && SingleMax) {
20364 for (
int Idx : Vectors.
front().second)
20365 std::swap(GatheredExtracts[Idx], VL[Idx]);
20366 }
else if (!Vectors.
empty()) {
20367 for (
unsigned Idx : {0, 1})
20368 for (
int Idx : Vectors[Idx].second)
20369 std::swap(GatheredExtracts[Idx], VL[Idx]);
20372 for (
int Idx : UndefVectorExtracts)
20373 std::swap(GatheredExtracts[Idx], VL[Idx]);
20376 std::optional<TTI::ShuffleKind> Res =
20382 return std::nullopt;
20386 for (
int I = 0,
E = GatheredExtracts.size();
I <
E; ++
I) {
20407BoUpSLP::tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
20408 SmallVectorImpl<int> &Mask,
20409 unsigned NumParts)
const {
20410 assert(NumParts > 0 &&
"NumParts expected be greater than or equal to 1.");
20417 const unsigned PartOffset = Part * SliceSize;
20420 if (PartOffset + PartSize > VL.
size())
20424 SmallVector<int> SubMask;
20425 std::optional<TTI::ShuffleKind> Res =
20426 tryToGatherSingleRegisterExtractElements(SubVL, SubMask);
20427 ShufflesRes[Part] = Res;
20428 copy(SubMask, std::next(
Mask.begin(), Part * SliceSize));
20429 if (SubVL.
size() != SliceSize)
20432 if (
none_of(ShufflesRes, [](
const std::optional<TTI::ShuffleKind> &Res) {
20433 return Res.has_value();
20435 ShufflesRes.clear();
20436 return ShufflesRes;
20439std::optional<TargetTransformInfo::ShuffleKind>
20440BoUpSLP::isGatherShuffledSingleRegisterEntry(
20442 SmallVectorImpl<const TreeEntry *> &Entries,
unsigned Part,
bool ForOrder,
20443 unsigned SliceSize) {
20446 return std::nullopt;
20447 const unsigned MaskBase = Part * SliceSize;
20450 auto GetUserEntry = [&](
const TreeEntry *
TE) {
20451 while (
TE->UserTreeIndex &&
TE->UserTreeIndex.EdgeIdx == UINT_MAX)
20452 TE =
TE->UserTreeIndex.UserTE;
20453 if (TE == VectorizableTree.front().get())
20454 return EdgeInfo(
const_cast<TreeEntry *
>(TE), 0);
20455 return TE->UserTreeIndex;
20457 auto HasGatherUser = [&](
const TreeEntry *
TE) {
20458 while (
TE->Idx != 0 &&
TE->UserTreeIndex) {
20459 if (
TE->UserTreeIndex.EdgeIdx == UINT_MAX)
20461 TE =
TE->UserTreeIndex.UserTE;
20465 const EdgeInfo TEUseEI = GetUserEntry(TE);
20466 if (!TEUseEI || (TEUseEI.UserTE->Idx == 0 && TEUseEI.UserTE->isGather() &&
20467 !TEUseEI.UserTE->hasState()))
20468 return std::nullopt;
20469 const Instruction *TEInsertPt = &getLastInstructionInBundle(TEUseEI.UserTE);
20474 TEUseEI.UserTE->hasState() ? TEUseEI.UserTE->getMainOp() :
nullptr);
20475 PHI && TEUseEI.UserTE->State != TreeEntry::SplitVectorize) {
20476 TEInsertBlock =
PHI->getIncomingBlock(TEUseEI.EdgeIdx);
20479 TEInsertBlock = TEInsertPt->
getParent();
20481 if (!DT->isReachableFromEntry(TEInsertBlock))
20482 return std::nullopt;
20483 auto *NodeUI = DT->getNode(TEInsertBlock);
20484 assert(NodeUI &&
"Should only process reachable instructions");
20486 auto CheckOrdering = [&](
const Instruction *InsertPt) {
20499 const BasicBlock *InsertBlock = InsertPt->getParent();
20500 auto *NodeEUI = DT->getNode(InsertBlock);
20503 assert((NodeUI == NodeEUI) ==
20504 (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) &&
20505 "Different nodes should have different DFS numbers");
20507 if (TEInsertPt->
getParent() != InsertBlock &&
20508 (DT->dominates(NodeUI, NodeEUI) || !DT->dominates(NodeEUI, NodeUI)))
20510 if (TEInsertPt->
getParent() == InsertBlock &&
20523 SmallDenseMap<Value *, int> UsedValuesEntry;
20524 SmallPtrSet<const Value *, 16> VisitedValue;
20525 bool IsReusedNodeFound =
false;
20526 auto CheckAndUseSameNode = [&](
const TreeEntry *TEPtr) {
20528 if (IsReusedNodeFound)
20530 if ((TEPtr->getVectorFactor() != VL.
size() &&
20531 TEPtr->Scalars.size() != VL.
size()) ||
20532 (!TEPtr->isSame(VL) && !TEPtr->isSame(
TE->Scalars)))
20534 IsReusedNodeFound =
20535 equal(
TE->Scalars, TEPtr->Scalars) &&
20536 equal(
TE->ReorderIndices, TEPtr->ReorderIndices) &&
20537 equal(
TE->ReuseShuffleIndices, TEPtr->ReuseShuffleIndices);
20540 for (
Value *V : VL) {
20547 auto CheckParentNodes = [&](
const TreeEntry *User1,
const TreeEntry *User2,
20548 unsigned EdgeIdx) {
20549 const TreeEntry *Ptr1 = User1;
20550 const TreeEntry *Ptr2 = User2;
20551 SmallDenseMap<const TreeEntry *, unsigned> PtrToIdx;
20554 EdgeIdx = Ptr2->UserTreeIndex.EdgeIdx;
20555 Ptr2 = Ptr2->UserTreeIndex.UserTE;
20558 unsigned Idx = Ptr1->UserTreeIndex.EdgeIdx;
20559 Ptr1 = Ptr1->UserTreeIndex.UserTE;
20560 if (
auto It = PtrToIdx.
find(Ptr1); It != PtrToIdx.
end())
20561 return Idx < It->second;
20567 std::optional<bool> TEInsertPtUsedOutsideBlock;
20568 auto IsTEInsertPtUsedOutsideBlock = [&] {
20569 if (!TEInsertPtUsedOutsideBlock)
20570 TEInsertPtUsedOutsideBlock =
20572 return *TEInsertPtUsedOutsideBlock;
20577 const bool TEUseEIInsertPtUsedOutside =
20578 TEUseEI && TEUseEI.UserTE && TEUseEI.UserTE->hasCopyableElements() &&
20579 !TEUseEI.UserTE->isCopyableElement(
20581 IsTEInsertPtUsedOutsideBlock();
20582 auto CheckNonSchedulableOrdering = [&](
const TreeEntry *
E,
20584 return TEUseEIInsertPtUsedOutside &&
20585 InsertPt->getNextNode() == TEInsertPt &&
20586 (!
E->hasCopyableElements() || !
E->isCopyableElement(InsertPt) ||
20592 const bool TEUserNeedsEmitFirst =
20593 TEUseEI.UserTE->State == TreeEntry::Vectorize &&
20594 TEUseEI.UserTE->hasState() &&
20595 (TEUseEI.UserTE->getOpcode() != Instruction::PHI ||
20596 TEUseEI.UserTE->isAltShuffle()) &&
20600 SmallDenseMap<const TreeEntry *, bool> ScalarsUsedOutsideBlockCache;
20601 auto AllScalarsUsedOutsideBlock = [&](
const TreeEntry *UserTE) {
20609 for (
Value *V : VL) {
20613 SmallPtrSet<const TreeEntry *, 4> VToTEs;
20615 ValueToGatherNodes.lookup(V).takeVector());
20616 if (TransformedToGatherNodes.contains(TE)) {
20617 for (TreeEntry *
E : getSplitTreeEntries(V)) {
20618 if (TE ==
E || !TransformedToGatherNodes.contains(
E) ||
20619 !
E->UserTreeIndex ||
E->UserTreeIndex.UserTE->isGather())
20621 GatherNodes.push_back(
E);
20623 for (TreeEntry *
E : getTreeEntries(V)) {
20624 if (TE ==
E || !TransformedToGatherNodes.contains(
E) ||
20625 !
E->UserTreeIndex ||
E->UserTreeIndex.UserTE->isGather())
20627 GatherNodes.push_back(
E);
20630 for (
const TreeEntry *TEPtr : GatherNodes) {
20631 if (TEPtr == TE || TEPtr->Idx == 0 || DeletedNodes.contains(TEPtr))
20634 [&](
Value *V) { return GatheredScalars.contains(V); }) &&
20635 "Must contain at least single gathered value.");
20636 assert(TEPtr->UserTreeIndex &&
20637 "Expected only single user of a gather node.");
20638 if (
any_of(TEPtr->CombinedEntriesWithIndices,
20639 [&](
const auto &
P) { return P.first == TE->Idx; }))
20641 const EdgeInfo &UseEI = TEPtr->UserTreeIndex;
20643 PHINode *UserPHI = (UseEI.UserTE->State != TreeEntry::SplitVectorize &&
20644 UseEI.UserTE->hasState())
20649 : &getLastInstructionInBundle(UseEI.UserTE);
20650 if (TEInsertPt == InsertPt) {
20652 if (TEUserNeedsEmitFirst) {
20653 if (UseEI.UserTE->State != TreeEntry::Vectorize ||
20654 (UseEI.UserTE->hasState() &&
20655 UseEI.UserTE->getOpcode() == Instruction::PHI &&
20656 !UseEI.UserTE->isAltShuffle()) ||
20657 !AllScalarsUsedOutsideBlock(UseEI.UserTE))
20665 (TEUseEI.UserTE != UseEI.UserTE || TEUseEI.EdgeIdx < UseEI.EdgeIdx))
20668 if (TEUseEI.UserTE->State == TreeEntry::Vectorize &&
20669 TEUseEI.UserTE->getOpcode() == Instruction::PHI &&
20670 UseEI.UserTE->State == TreeEntry::Vectorize &&
20671 UseEI.UserTE->getOpcode() == Instruction::PHI &&
20672 TEUseEI.UserTE != UseEI.UserTE)
20677 if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx)
20681 if (TEUseEI.UserTE != UseEI.UserTE &&
20682 (TEUseEI.UserTE->Idx < UseEI.UserTE->Idx ||
20683 HasGatherUser(TEUseEI.UserTE)))
20686 if (CheckParentNodes(TEUseEI.UserTE, UseEI.UserTE, UseEI.EdgeIdx))
20690 if (!TEUseEI.UserTE->isGather() && !UserPHI &&
20691 TEUseEI.UserTE->doesNotNeedToSchedule() !=
20692 UseEI.UserTE->doesNotNeedToSchedule() &&
20697 if ((TEInsertBlock != InsertPt->
getParent() ||
20698 TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) &&
20699 (!CheckOrdering(InsertPt) ||
20700 (UseEI.UserTE->hasCopyableElements() &&
20701 IsTEInsertPtUsedOutsideBlock() &&
20705 if (CheckAndUseSameNode(TEPtr))
20710 if (CheckNonSchedulableOrdering(UseEI.UserTE, InsertPt))
20715 const auto *It =
find_if(VTEs, [&](
const TreeEntry *MTE) {
20716 return MTE !=
TE && MTE != TEUseEI.UserTE &&
20717 !DeletedNodes.contains(MTE) &&
20718 !TransformedToGatherNodes.contains(MTE);
20720 if (It != VTEs.end()) {
20721 const TreeEntry *VTE = *It;
20722 if (
none_of(
TE->CombinedEntriesWithIndices,
20723 [&](
const auto &
P) { return P.first == VTE->Idx; })) {
20724 Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
20725 if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
20729 if (CheckAndUseSameNode(VTE))
20735 const auto *It =
find_if(VTEs, [&, MainTE = TE](
const TreeEntry *TE) {
20736 return TE != MainTE && !DeletedNodes.contains(TE) &&
20737 !TransformedToGatherNodes.contains(TE);
20739 if (It != VTEs.end()) {
20740 const TreeEntry *VTE = *It;
20741 if (ForOrder && VTE->Idx < GatheredLoadsEntriesFirst.value_or(0) &&
20742 VTEs.size() > 1 && VTE->State != TreeEntry::Vectorize) {
20743 VTEs = VTEs.drop_front();
20745 const auto *MIt =
find_if(VTEs, [](
const TreeEntry *MTE) {
20746 return MTE->State == TreeEntry::Vectorize;
20748 if (MIt == VTEs.end())
20752 if (
none_of(
TE->CombinedEntriesWithIndices,
20753 [&](
const auto &
P) { return P.first == VTE->Idx; })) {
20754 Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
20755 if (&LastBundleInst == TEInsertPt ||
20756 !CheckOrdering(&LastBundleInst) ||
20757 CheckNonSchedulableOrdering(VTE, &LastBundleInst))
20761 if (CheckAndUseSameNode(VTE))
20766 if (IsReusedNodeFound)
20768 if (VToTEs.
empty())
20770 if (UsedTEs.
empty()) {
20778 SmallPtrSet<const TreeEntry *, 4> SavedVToTEs(VToTEs);
20780 for (SmallPtrSet<const TreeEntry *, 4> &Set : UsedTEs) {
20784 if (!VToTEs.
empty()) {
20790 VToTEs = SavedVToTEs;
20795 if (Idx == UsedTEs.
size()) {
20799 if (UsedTEs.
size() == 2)
20801 UsedTEs.push_back(SavedVToTEs);
20802 Idx = UsedTEs.
size() - 1;
20808 if (UsedTEs.
empty()) {
20810 return std::nullopt;
20814 if (UsedTEs.
size() == 1) {
20817 UsedTEs.front().
end());
20818 sort(FirstEntries, [](
const TreeEntry *TE1,
const TreeEntry *TE2) {
20819 return TE1->Idx < TE2->Idx;
20822 auto *It =
find_if(FirstEntries, [=](
const TreeEntry *EntryPtr) {
20823 return (EntryPtr->getVectorFactor() ==
TE->Scalars.size() &&
20824 EntryPtr->isSame(
TE->Scalars)) ||
20825 EntryPtr->isSame(VL);
20827 if (It != FirstEntries.end() &&
20828 (IsReusedNodeFound || (*It)->getVectorFactor() == VL.size() ||
20829 ((*It)->getVectorFactor() ==
TE->Scalars.size() &&
20830 TE->ReuseShuffleIndices.size() == VL.size() &&
20831 (*It)->isSame(
TE->Scalars)))) {
20833 if (IsReusedNodeFound || (*It)->getVectorFactor() == VL.size()) {
20834 std::iota(std::next(
Mask.begin(), MaskBase),
20835 std::next(
Mask.begin(), MaskBase + VL.size()), 0);
20837 SmallVector<int> CommonMask =
TE->getCommonMask();
20848 Entries.
push_back(FirstEntries.front());
20850 for (
auto &
P : UsedValuesEntry)
20852 VF = FirstEntries.front()->getVectorFactor();
20855 assert(UsedTEs.
size() == 2 &&
"Expected at max 2 permuted entries.");
20857 DenseMap<int, const TreeEntry *> VFToTE;
20858 for (
const TreeEntry *TE : UsedTEs.front()) {
20859 unsigned VF =
TE->getVectorFactor();
20860 auto It = VFToTE.
find(VF);
20861 if (It != VFToTE.
end()) {
20862 if (It->second->Idx >
TE->Idx)
20863 It->getSecond() =
TE;
20870 UsedTEs.back().
end());
20871 sort(SecondEntries, [](
const TreeEntry *TE1,
const TreeEntry *TE2) {
20872 return TE1->Idx < TE2->Idx;
20874 for (
const TreeEntry *TE : SecondEntries) {
20875 auto It = VFToTE.
find(
TE->getVectorFactor());
20876 if (It != VFToTE.
end()) {
20885 if (Entries.
empty()) {
20887 UsedTEs.front(), [](
const TreeEntry *TE1,
const TreeEntry *TE2) {
20888 return TE1->Idx < TE2->Idx;
20890 Entries.
push_back(SecondEntries.front());
20891 VF = std::max(Entries.
front()->getVectorFactor(),
20892 Entries.
back()->getVectorFactor());
20894 VF = Entries.
front()->getVectorFactor();
20897 for (
const TreeEntry *
E : Entries)
20901 for (
auto &
P : UsedValuesEntry) {
20903 if (ValuesToEntries[Idx].
contains(
P.first)) {
20913 auto AreCompatiblePHIs = [&](
Value *
V,
Value *V1) {
20920 for (
int I = 0,
E =
PHI->getNumIncomingValues();
I <
E; ++
I) {
20922 Value *In1 = PHI1->getIncomingValue(
I);
20940 SmallDenseMap<Value *, bool> MightBeIgnoredCache;
20941 auto MightBeIgnored = [=, &MightBeIgnoredCache](
Value *
V) {
20948 !areAllUsersVectorized(
I, UserIgnoreList) &&
isSimple(
I);
20955 auto NeighborMightBeIgnored = [&](
Value *
V,
int Idx) {
20956 Value *V1 = VL[Idx];
20957 bool UsedInSameVTE =
false;
20958 auto It = UsedValuesEntry.find(V1);
20959 if (It != UsedValuesEntry.end())
20960 UsedInSameVTE = It->second == UsedValuesEntry.find(V)->second;
20961 return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE &&
20968 SmallBitVector UsedIdxs(Entries.size());
20970 for (
int I = 0,
E = VL.size();
I <
E; ++
I) {
20972 auto It = UsedValuesEntry.find(V);
20973 if (It == UsedValuesEntry.end())
20979 ((
I > 0 && NeighborMightBeIgnored(V,
I - 1)) ||
20980 (
I !=
E - 1 && NeighborMightBeIgnored(V,
I + 1)))))
20982 unsigned Idx = It->second;
20989 for (
unsigned I = 0, Sz = Entries.size();
I < Sz; ++
I) {
20990 if (!UsedIdxs.test(
I))
20996 for (std::pair<unsigned, int> &Pair : EntryLanes)
20997 if (Pair.first ==
I)
20998 Pair.first = TempEntries.
size();
21001 Entries.swap(TempEntries);
21002 if (EntryLanes.size() == Entries.size() &&
21004 .slice(MaskBase,
getNumElems(
TE->Scalars.size(), SliceSize,
21011 return std::nullopt;
21014 bool IsIdentity = Entries.size() == 1;
21017 for (
const std::pair<unsigned, int> &Pair : EntryLanes) {
21018 unsigned Idx = MaskBase + Pair.second;
21021 (ForOrder ? std::distance(
21022 Entries[Pair.first]->Scalars.begin(),
21023 find(Entries[Pair.first]->Scalars, VL[Pair.second]))
21024 : Entries[Pair.first]->findLaneForValue(VL[Pair.second]));
21025 IsIdentity &=
Mask[Idx] == Pair.second;
21027 if (ForOrder || IsIdentity || Entries.empty()) {
21028 switch (Entries.size()) {
21030 if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
21034 if (EntryLanes.size() > 2 || VL.size() <= 2)
21041 (EntryLanes.size() > Entries.size() || VL.size() <= 2)) {
21043 SmallVector<int> SubMask(std::next(
Mask.begin(), MaskBase),
21044 std::next(
Mask.begin(), MaskBase + VL.size()));
21045 int MinElement = SubMask.
front(), MaxElement = SubMask.
front();
21046 for (
int Idx : SubMask) {
21054 assert(MaxElement >= 0 && MinElement >= 0 &&
21055 MaxElement % VF >= MinElement % VF &&
21056 "Expected at least single element.");
21061 unsigned MinIdx = MinElement % VF;
21064 *TTI, VL.front()->getType(), MinIdx);
21065 auto *RegFloorTy =
getWidenedType(VL.front()->getType(), RegFloor);
21066 unsigned RegFloorParts =
21068 if (RegFloorParts > 1)
21072 std::max<unsigned>(VL.size(), (MaxElement % VF) -
Offset + 1);
21074 for (
int &Idx : SubMask) {
21077 Idx = (Idx % VF) -
Offset + (Idx >=
static_cast<int>(VF) ? NewVF : 0);
21088 auto GetShuffleCost = [&,
21089 &TTI = *TTI](ArrayRef<int>
Mask,
21092 if (Entries.size() == 1 && Entries.front()->getInterleaveFactor() > 0 &&
21094 Mask, Entries.front()->getInterleaveFactor()))
21096 return ::getShuffleCost(TTI,
21101 InstructionCost ShuffleCost = GetShuffleCost(SubMask, Entries, VecTy);
21103 SmallVector<int> FirstMask(SubMask.begin(), SubMask.end());
21104 if (Entries.size() == 1 || !Entries[0]->isGather()) {
21105 FirstShuffleCost = ShuffleCost;
21109 bool IsIdentity =
true;
21110 for (
auto [
I, Idx] :
enumerate(FirstMask)) {
21111 if (Idx >=
static_cast<int>(NewVF)) {
21116 IsIdentity &=
static_cast<int>(
I) == Idx;
21120 FirstShuffleCost = GetShuffleCost(FirstMask, Entries.front(), VecTy);
21122 *TTI, VL.front()->getType(), MaskVecTy, DemandedElts,
true,
21126 SmallVector<int> SecondMask(SubMask.begin(), SubMask.end());
21127 if (Entries.size() == 1 || !Entries[1]->isGather()) {
21128 SecondShuffleCost = ShuffleCost;
21132 bool IsIdentity =
true;
21133 for (
auto [
I, Idx] :
enumerate(SecondMask)) {
21134 if (Idx <
static_cast<int>(NewVF) && Idx >= 0) {
21140 IsIdentity &=
static_cast<int>(
I) == Idx;
21145 SecondShuffleCost = GetShuffleCost(SecondMask, Entries[1], VecTy);
21147 *TTI, VL.front()->getType(), MaskVecTy, DemandedElts,
true,
21155 *TTI, VL.front()->getType(), MaskVecTy, DemandedElts,
true,
21157 const TreeEntry *BestEntry =
nullptr;
21159 if (FirstShuffleCost < ShuffleCost) {
21160 for (
int &Idx : MaskSlice)
21161 if (Idx >=
static_cast<int>(VF))
21163 BestEntry = Entries.front();
21164 ShuffleCost = FirstShuffleCost;
21166 if (SecondShuffleCost < ShuffleCost) {
21167 for (
int &Idx : MaskSlice) {
21168 if (Idx <
static_cast<int>(VF))
21173 BestEntry = Entries[1];
21174 ShuffleCost = SecondShuffleCost;
21176 if (BuildVectorCost >= ShuffleCost) {
21179 Entries.push_back(BestEntry);
21187 std::fill(std::next(
Mask.begin(), MaskBase),
21189 return std::nullopt;
/// Checks, one register-sized slice at a time, whether the scalars \p VL of
/// the gather node \p TE can be obtained by shuffling other tree entries.
/// Every slice is delegated to isGatherShuffledSingleRegisterEntry(), which
/// receives the shared \p Mask and a fresh per-slice list appended to
/// \p Entries.
/// NOTE(review): the return type and the full parameter list are not visible
/// in this excerpt - confirm against the declaration.
21193BoUpSLP::isGatherShuffledEntry(
21197 assert(NumParts > 0 && NumParts < VL.
size() &&
21198 "Expected positive number of registers.");
// Special case for the root of VectorizableTree (the action taken is not
// shown here).
21201 if (TE == VectorizableTree.front().get() &&
21202 (!GatheredLoadsEntriesFirst.has_value() ||
21204 [](
const std::unique_ptr<TreeEntry> &TE) {
21205 return !
TE->isGather();
// Every gather node other than the root must have a user edge.
21209 assert((
TE->UserTreeIndex || TE == VectorizableTree.front().get()) &&
21210 "Expected only single user of the gather node.");
// Gather node used by another gather node through the UINT_MAX edge index.
21213 if (
TE->UserTreeIndex &&
TE->UserTreeIndex.UserTE->isGather() &&
21214 TE->UserTreeIndex.EdgeIdx == UINT_MAX &&
21216 (
TE->hasState() &&
TE->getOpcode() == Instruction::ExtractElement) ||
21219 getSameValuesTreeEntry(
TE->getMainOp(),
TE->Scalars))))
// The slice would start past the end of VL.
21224 if (Part * SliceSize >= VL.
size())
// Fresh entry list for this slice; filled in by the single-register helper.
21228 SmallVectorImpl<const TreeEntry *> &SubEntries = Entries.emplace_back();
21229 std::optional<TTI::ShuffleKind> SubRes =
21230 isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part,
21231 ForOrder, SliceSize);
21233 SubEntries.
clear();
// A found entry that spans all of VL and holds the same scalars as TE (or
// VL): the mask becomes the identity and a one-element list holding that
// entry is appended to Entries.
21236 SubEntries.
front()->getVectorFactor() == VL.
size() &&
21237 (SubEntries.
front()->isSame(
TE->Scalars) ||
21238 SubEntries.
front()->isSame(VL))) {
21240 LocalSubEntries.
swap(SubEntries);
21243 std::iota(
Mask.begin(),
Mask.end(), 0);
21245 for (
int I = 0, Sz = VL.
size();
I < Sz; ++
I)
21248 Entries.emplace_back(1, LocalSubEntries.
front());
// Predicate over per-slice results: true for a slice without a shuffle kind.
21254 [](
const std::optional<TTI::ShuffleKind> &SK) {
return !SK; })) {
21262 Type *ScalarTy)
const {
// NOTE(review): the beginning of this signature is outside the visible
// lines. The body estimates a cost for materialising the scalars VL as a
// vector: per-element work goes through EstimateInsertCost, while
// ConstantShuffleMask records lanes that are redirected to a second source
// (index I + VF).
21263 const unsigned VF = VL.
size();
// Per-lane cost helper. A scalar whose type differs from ScalarTy adds the
// cost of a Trunc cast to the running Cost.
21271 auto EstimateInsertCost = [&](
unsigned I,
Value *
V) {
21273 if (
V->getType() != ScalarTy)
21274 Cost += TTI->getCastInstrCost(Instruction::Trunc, ScalarTy,
V->getType(),
// Start from the identity mask.
21278 std::iota(ConstantShuffleMask.begin(), ConstantShuffleMask.end(), 0);
// Lane I is taken from the second shuffle source.
21285 ConstantShuffleMask[
I] =
I + VF;
21288 EstimateInsertCost(
I, V);
21291 bool IsAnyNonUndefConst =
// Extra handling when the source is not poison and at least one non-undef
// constant is present (details not visible here).
21294 if (!ForPoisonSrc && IsAnyNonUndefConst) {
21300 if (!DemandedElements.
isZero())
21304 false,
CostKind, ForPoisonSrc && !IsAnyNonUndefConst, VL);
/// Returns the instruction treated as the last instruction of the bundle that
/// corresponds to tree entry \p E. Results are recorded in
/// EntryToLastInstruction, which is consulted first.
/// Depending on the kind of node the answer comes from FindLastInst(),
/// FindFirstInst() or the schedule bundle that belongs to \p E.
21308Instruction &BoUpSLP::getLastInstructionInBundle(
const TreeEntry *
E) {
// Look for a previously recorded result.
21309 auto It = EntryToLastInstruction.find(
E);
21310 if (It != EntryToLastInstruction.end())
// Main operation and opcode are only available for entries with state.
21318 if (
E->hasState()) {
21319 Front =
E->getMainOp();
21320 Opcode =
E->getOpcode();
21327 ((GatheredLoadsEntriesFirst.has_value() && Opcode == Instruction::Load &&
21328 E->isGather() &&
E->Idx < *GatheredLoadsEntriesFirst) ||
21329 E->State == TreeEntry::SplitVectorize ||
E->hasCopyableElements() ||
21331 [=](
Value *V) ->
bool {
21332 if (Opcode == Instruction::GetElementPtr &&
21333 !isa<GetElementPtrInst>(V))
21335 auto *I = dyn_cast<Instruction>(V);
21336 return !I || !E->getMatchingMainOpOrAltOp(I) ||
21337 I->getParent() == BB || isVectorLikeInstWithConstOps(I);
21339 "Expected gathered loads or GEPs or instructions from same basic "
// Helper: walks E->Scalars and keeps track of the latest instruction in
// LastInst. Copyable elements get separate treatment (not shown). For
// instructions in different blocks the dominator tree DFS-in numbers of the
// two blocks are compared; unreachable blocks are checked explicitly.
21342 auto FindLastInst = [&]() {
21344 for (
Value *V :
E->Scalars) {
21348 if (
E->isCopyableElement(
I))
21350 if (LastInst->
getParent() ==
I->getParent()) {
21355 assert(((Opcode == Instruction::GetElementPtr &&
21357 E->State == TreeEntry::SplitVectorize ||
21360 (GatheredLoadsEntriesFirst.has_value() &&
21361 Opcode == Instruction::Load &&
E->isGather() &&
21362 E->Idx < *GatheredLoadsEntriesFirst)) &&
21363 "Expected vector-like or non-GEP in GEP node insts only.");
21364 if (!DT->isReachableFromEntry(LastInst->
getParent())) {
21368 if (!DT->isReachableFromEntry(
I->getParent()))
21370 auto *NodeA = DT->getNode(LastInst->
getParent());
21371 auto *NodeB = DT->getNode(
I->getParent());
21372 assert(NodeA &&
"Should only process reachable instructions");
21373 assert(NodeB &&
"Should only process reachable instructions");
21374 assert((NodeA == NodeB) ==
21375 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
21376 "Different nodes should have different DFS numbers");
21377 if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn())
// Helper: counterpart of FindLastInst that tracks the earliest instruction
// in FirstInst. It uses comesBefore() inside one block and the opposite
// DFS-in comparison across blocks.
21384 auto FindFirstInst = [&]() {
21386 for (
Value *V :
E->Scalars) {
21390 if (
E->isCopyableElement(
I))
21392 if (FirstInst->
getParent() ==
I->getParent()) {
21393 if (
I->comesBefore(FirstInst))
21397 assert(((Opcode == Instruction::GetElementPtr &&
21401 "Expected vector-like or non-GEP in GEP node insts only.");
21402 if (!DT->isReachableFromEntry(FirstInst->
getParent())) {
21406 if (!DT->isReachableFromEntry(
I->getParent()))
21408 auto *NodeA = DT->getNode(FirstInst->
getParent());
21409 auto *NodeB = DT->getNode(
I->getParent());
21410 assert(NodeA &&
"Should only process reachable instructions");
21411 assert(NodeB &&
"Should only process reachable instructions");
21412 assert((NodeA == NodeB) ==
21413 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
21414 "Different nodes should have different DFS numbers");
21415 if (NodeA->getDFSNumIn() > NodeB->getDFSNumIn())
// Split-vectorize nodes: start from FindLastInst() and additionally query
// the last instruction of the entries in Entries through a recursive call.
21421 if (
E->State == TreeEntry::SplitVectorize) {
21422 Res = FindLastInst();
21424 for (
auto *
E : Entries) {
21427 I = &getLastInstructionInBundle(
E);
21432 EntryToLastInstruction.try_emplace(
E, Res);
// Non-gather load nodes with an index at or after *GatheredLoadsEntriesFirst
// use the first instruction of the bundle instead of the last one.
21437 if (GatheredLoadsEntriesFirst.has_value() &&
21438 E->Idx >= *GatheredLoadsEntriesFirst && !
E->isGather() &&
21439 Opcode == Instruction::Load) {
21440 Res = FindFirstInst();
21441 EntryToLastInstruction.try_emplace(
E, Res);
// Helper: looks up the scheduling data for BB in BlocksSchedules and searches
// the bundles of the scalars for the one whose tree entry is the given E.
21447 auto FindScheduleBundle = [&](
const TreeEntry *
E) ->
const ScheduleBundle * {
21451 const auto *It = BlocksSchedules.find(BB);
21452 if (It == BlocksSchedules.end())
21454 for (
Value *V :
E->Scalars) {
21460 if (Bundles.
empty())
21463 Bundles, [&](ScheduleBundle *
B) {
return B->getTreeEntry() ==
E; });
21464 if (It != Bundles.
end())
21469 const ScheduleBundle *Bundle = FindScheduleBundle(
E);
// Vectorized node without a schedule bundle: the checks below select between
// FindLastInst() and FindFirstInst().
21470 if (!
E->isGather() && !Bundle) {
21471 if ((Opcode == Instruction::GetElementPtr &&
21474 return !isa<GetElementPtrInst>(V) && isa<Instruction>(V);
21478 return isa<PoisonValue>(V) ||
21479 (E->Idx == 0 && isa<InsertElementInst>(V)) ||
21480 E->isCopyableElement(V) ||
21481 (!isVectorLikeInstWithConstOps(V) &&
21482 isUsedOutsideBlock(V));
21484 (!
E->doesNotNeedToSchedule() ||
21487 if (!isa<Instruction>(V) ||
21488 (E->hasCopyableElements() && E->isCopyableElement(V)))
21490 return !areAllOperandsNonInsts(V);
21493 if (!isa<Instruction>(V) ||
21494 (E->hasCopyableElements() && E->isCopyableElement(V)))
21496 return MustGather.contains(V);
21498 Res = FindLastInst();
21500 Res = FindFirstInst();
21501 EntryToLastInstruction.try_emplace(
E, Res);
// Scheduled node: the instruction of the last element of the bundle is used.
21510 assert(!
E->isGather() &&
"Gathered instructions should not be scheduled");
21511 Res = Bundle->getBundle().back()->getInst();
21512 EntryToLastInstruction.try_emplace(
E, Res);
// Remaining case: fall back to the position-based search.
21535 Res = FindLastInst();
21536 assert(Res &&
"Failed to find last instruction in bundle");
21537 EntryToLastInstruction.try_emplace(
E, Res);
/// Moves the insertion point of Builder to the place where the vector code
/// for tree entry \p E is to be emitted. The position is derived from
/// getLastInstructionInBundle(E). Finally the current debug location of
/// Builder is set to the one of E's main operation.
21541void BoUpSLP::setInsertPointAfterBundle(
const TreeEntry *
E) {
21542 auto *Front =
E->getMainOp();
21543 Instruction *LastInst = &getLastInstructionInBundle(
E);
21544 assert(LastInst &&
"Failed to find last instruction in bundle");
// Candidate position: the first non-PHI position of LastInst's block, moved
// one step further when that block is a landing pad.
21549 LastInstIt = LastInst->
getParent()->getFirstNonPHIIt();
21550 if (LastInstIt != LastInst->
getParent()->end() &&
21551 LastInstIt->getParent()->isLandingPad())
21552 LastInstIt = std::next(LastInstIt);
// For the node kinds tested below the builder is placed at LastInstIt.
21555 (!
E->isGather() &&
E->State != TreeEntry::SplitVectorize &&
21556 (
E->doesNotNeedToSchedule() ||
21557 (
E->hasCopyableElements() && !
E->isCopyableElement(LastInst) &&
21559 (GatheredLoadsEntriesFirst.has_value() &&
21560 E->Idx >= *GatheredLoadsEntriesFirst && !
E->isGather() &&
21561 E->getOpcode() == Instruction::Load)) {
21562 Builder.SetInsertPoint(LastInst->
getParent(), LastInstIt);
// Other nodes: a position instruction is looked up per LastInst in
// LastInstructionToPos. NOTE(review): when none is recorded a pointer-typed
// aligned load seems to be created as that position - confirm. It is only
// remembered for entries that are not split-vectorize nodes.
21566 Builder.SetInsertPoint(
21569 if (Instruction *Res = LastInstructionToPos.lookup(LastInst)) {
21572 Res = Builder.CreateAlignedLoad(Builder.getPtrTy(),
21577 if (
E->State != TreeEntry::SplitVectorize)
21578 LastInstructionToPos.try_emplace(LastInst, Res);
21581 Builder.SetCurrentDebugLocation(Front->
getDebugLoc());
/// Builds a vector out of the scalars \p VL with a sequence of insertelement
/// instructions. When a \p Root vector is involved, the result is combined
/// with it through the \p CreateShuffle callback. Entries of NonConsts and
/// the postponed elements are inserted after the other elements.
/// NOTE(review): part of the parameter list is not visible in this excerpt.
21584Value *BoUpSLP::gather(
21586 function_ref<
Value *(
Value *,
Value *, ArrayRef<int>)> CreateShuffle) {
21592 SmallSet<int, 4> PostponedIndices;
21593 Loop *
L = LI->getLoopFor(Builder.GetInsertBlock());
// Walks up the chain of single predecessors starting at InsertBB and reports
// whether InstBB is reached; Visited guards against cycles.
21595 SmallPtrSet<BasicBlock *, 4> Visited;
21596 while (InsertBB && InsertBB != InstBB && Visited.
insert(InsertBB).second)
21597 InsertBB = InsertBB->getSinglePredecessor();
21598 return InsertBB && InsertBB == InstBB;
// Collect the indices of elements that have to be postponed: instructions
// whose block is found by CheckPredecessor, or instructions inside the loop
// L (with Root absent or loop invariant).
21600 for (
int I = 0,
E = VL.
size();
I <
E; ++
I) {
21602 if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) ||
21604 (L && (!Root ||
L->isLoopInvariant(Root)) &&
L->contains(Inst))) &&
21605 PostponedIndices.
insert(
I).second)
// Inserts one scalar at lane Pos. The scalar is integer-cast when its type
// differs from Ty. The created instruction is recorded in
// GatherShuffleExtractSeq. If the scalar belongs to a tree entry that is
// neither deleted nor transformed to a gather node, an external use with the
// matching lane is registered.
21609 auto &&CreateInsertElement = [
this](
Value *Vec,
Value *
V,
unsigned Pos,
21616 if (
Scalar->getType() != Ty) {
21627 Scalar = Builder.CreateIntCast(
21641 Vec = Builder.CreateInsertElement(Vec, Scalar, Builder.getInt32(Pos));
21646 GatherShuffleExtractSeq.insert(InsElt);
21651 const auto *It =
find_if(Entries, [&](
const TreeEntry *
E) {
21652 return !TransformedToGatherNodes.contains(
E) &&
21653 !DeletedNodes.contains(
E);
21655 if (It != Entries.
end()) {
21657 User *UserOp =
nullptr;
// For vector-typed scalars the user is searched among shufflevector operands.
21662 if (
V->getType()->isVectorTy()) {
21664 SV && SV->getOperand(0) != V && SV->getOperand(1) != V) {
21666 auto FindOperand = [](
Value *Vec,
Value *
V) -> Instruction * {
21668 if (SV->getOperand(0) == V)
21670 if (SV->getOperand(1) == V)
21676 if (Instruction *User = FindOperand(SV->getOperand(0), V))
21678 else if (Instruction *User = FindOperand(SV->getOperand(1), V))
21681 "Failed to find shufflevector, caused by resize.");
// External use without a known user.
21688 unsigned FoundLane = (*It)->findLaneForValue(V);
21689 ExternalUses.emplace_back(V,
nullptr, **It, FoundLane);
21690 ExternalUsesWithNonUsers.insert(V);
// External use with the user found above.
21696 unsigned FoundLane = (*It)->findLaneForValue(V);
21697 ExternalUses.emplace_back(V, UserOp, **It, FoundLane);
21705 SmallVector<int> NonConsts;
// Identity mask by default. If the root is a shuffle whose first operand
// already has the vector type, that operand and the shuffle's own mask are
// used instead.
21707 std::iota(
Mask.begin(),
Mask.end(), 0);
21708 Value *OriginalRoot = Root;
21711 SV->getOperand(0)->getType() == VecTy) {
21712 Root = SV->getOperand(0);
21713 Mask.assign(SV->getShuffleMask().begin(), SV->getShuffleMask().end());
// First pass over the elements of VL.
21716 for (
int I = 0,
E = VL.
size();
I <
E; ++
I) {
21725 Vec = CreateInsertElement(Vec, VL[
I],
I, ScalarTy);
21730 Vec = OriginalRoot;
21732 Vec = CreateShuffle(Root, Vec, Mask);
21734 OI && OI->use_empty() &&
21735 none_of(VectorizableTree, [&](
const std::unique_ptr<TreeEntry> &TE) {
21736 return TE->VectorizedValue == OI;
// Second pass: elements recorded in NonConsts.
21742 for (
int I : NonConsts)
21743 Vec = CreateInsertElement(Vec, VL[
I],
I, ScalarTy);
// Last pass: the postponed (value, lane) pairs.
21746 for (
const std::pair<Value *, unsigned> &Pair : PostponedInsts)
21747 Vec = CreateInsertElement(Vec, Pair.first, Pair.second, ScalarTy);
21785 bool IsFinalized =
false;
/// Thin helper around the IR builder used for emitting shuffles. Created
/// shuffle instructions are recorded in GatherShuffleExtractSeq and their
/// parent blocks in CSEBlocks.
21798 class ShuffleIRBuilder {
21811 : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq),
21812 CSEBlocks(CSEBlocks),
DL(DL) {}
21813 ~ShuffleIRBuilder() =
default;
// Two-source shuffle. The operands may first be integer-cast, depending on a
// comparison of their integer bit widths.
21819 "Expected integer vector types only.");
21825 ->getIntegerBitWidth())
21826 V2 = Builder.CreateIntCast(
21829 V1 = Builder.CreateIntCast(
21833 Value *Vec = Builder.CreateShuffleVector(V1, V2, Mask);
21835 GatherShuffleExtractSeq.insert(
I);
21836 CSEBlocks.insert(
I->getParent());
// Single-source shuffle.
21845 unsigned VF = Mask.size();
21849 Value *Vec = Builder.CreateShuffleVector(V1, Mask);
21851 GatherShuffleExtractSeq.insert(
I);
21852 CSEBlocks.insert(
I->getParent());
/// Identity shuffle: returns \p V unchanged.
21856 Value *createIdentity(
Value *V) {
return V; }
21857 Value *createPoison(
Type *Ty,
unsigned VF) {
/// Makes \p V1 and \p V2 the same length: the operand with the smaller
/// vector factor is shuffled with a mask whose first MinVF elements form an
/// identity sequence.
21862 void resizeToMatch(
Value *&V1,
Value *&V2) {
21867 int VF = std::max(V1VF, V2VF);
21868 int MinVF = std::min(V1VF, V2VF);
21870 std::iota(IdentityMask.
begin(), std::next(IdentityMask.
begin(), MinVF),
21872 Value *&
Op = MinVF == V1VF ? V1 : V2;
21873 Op = Builder.CreateShuffleVector(
Op, IdentityMask);
21875 GatherShuffleExtractSeq.insert(
I);
21876 CSEBlocks.insert(
I->getParent());
// NOTE(review): body fragment; the signature is not visible here. It builds a
// ShuffleIRBuilder from the builder and the owner's bookkeeping sets and
// forwards to BaseShuffleAnalysis::createShuffle<Value *>.
21889 assert(V1 &&
"Expected at least one vector value.");
21890 ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
21891 R.CSEBlocks, *R.DL);
21892 return BaseShuffleAnalysis::createShuffle<Value *>(
21893 V1, V2, Mask, ShuffleBuilder, ScalarTy);
21899 std::optional<bool> IsSigned = std::nullopt) {
// NOTE(review): only the tail of the signature is visible. The visible
// return integer-casts V to a vector of ScalarTy's scalar type with the same
// element count as VecTy; the conditions guarding it are not shown.
21904 return Builder.CreateIntCast(
21905 V,
VectorType::get(ScalarTy->getScalarType(), VecTy->getElementCount()),
/// Returns the vectorized value of tree entry \p E converted with
/// castToScalarTyElem(). The second argument of the visible call is true when
/// some non-poison scalar of \p E is not known to be non-negative.
/// NOTE(review): any early-return path before the cast is not visible here.
21909 Value *getVectorizedValue(
const TreeEntry &E) {
21910 Value *Vec = E.VectorizedValue;
21913 return castToScalarTyElem(Vec,
any_of(E.Scalars, [&](
Value *V) {
21914 return !isa<PoisonValue>(V) &&
21915 !isKnownNonNegative(
21916 V, SimplifyQuery(*R.DL));
21922 : BaseShuffleAnalysis(ScalarTy), Builder(Builder), R(R) {}
21926 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
21927 unsigned NumParts,
bool &UseVecBaseAsInput) {
// NOTE(review): the start of the signature is outside the visible lines.
// The body determines the base vector(s) behind the extractelement scalars of
// E. With one part or a single unique base, that base is returned after
// castToScalarTyElem(). Otherwise UseVecBaseAsInput is set and the per-part
// sub-vectors are merged with createShuffle().
21928 UseVecBaseAsInput =
false;
21930 Value *VecBase =
nullptr;
21932 if (!E->ReorderIndices.empty()) {
21934 E->ReorderIndices.end());
21937 for (
int I = 0, Sz = Mask.size();
I < Sz; ++
I) {
// The base is the extract's vector operand, or the vectorized value of the
// tree entry found for it.
21942 VecBase = EI->getVectorOperand();
21944 VecBase = TEs.front()->VectorizedValue;
21945 assert(VecBase &&
"Expected vectorized value.");
21946 UniqueBases.
insert(VecBase);
// NOTE(review): the checks below appear to decide whether the extract EI must
// stay; the visible continuation erases EI through R.eraseInstruction().
21949 if (!EI->hasOneUse() || R.ExternalUsesAsOriginalScalar.contains(EI) ||
21950 (E->UserTreeIndex && E->UserTreeIndex.EdgeIdx == UINT_MAX &&
21951 !R.isVectorized(EI) &&
21953 count_if(E->UserTreeIndex.UserTE->Scalars,
21954 [&](
Value *V) { return V == EI; })) ||
21955 (NumParts != 1 &&
count(VL, EI) > 1) ||
21957 ArrayRef<TreeEntry *> UTEs = R.getTreeEntries(U);
21958 return UTEs.empty() || UTEs.size() > 1 ||
21960 [&](const TreeEntry *TE) {
21961 return R.DeletedNodes.contains(TE) ||
21962 R.TransformedToGatherNodes.contains(TE);
21968 [&](
const std::unique_ptr<TreeEntry> &TE) {
21969 return TE->UserTreeIndex.UserTE ==
21971 is_contained(VL, EI);
21975 R.eraseInstruction(EI);
// Single register or a single base vector: return that base directly.
21977 if (NumParts == 1 || UniqueBases.
size() == 1) {
21978 assert(VecBase &&
"Expected vectorized value.");
21979 return castToScalarTyElem(VecBase);
// Several bases: the caller has to treat the returned vector as the input.
21981 UseVecBaseAsInput =
true;
21991 Value *Vec =
nullptr;
// At most two base vectors per part.
21998 constexpr int MaxBases = 2;
22000 auto VLMask =
zip(SubVL, SubMask);
// VF is accumulated as a maximum over the (scalar, mask) pairs of this part.
22001 const unsigned VF =
22002 accumulate(VLMask, 0U, [&](
unsigned S,
const auto &
D) {
22009 VecOp = TEs.front()->VectorizedValue;
22010 assert(VecOp &&
"Expected vectorized value.");
22011 const unsigned Size =
22013 return std::max(S,
Size);
// Assign each used vector operand to its base slot, selected by I / VF.
22015 for (
const auto [V,
I] : VLMask) {
22020 VecOp = TEs.front()->VectorizedValue;
22021 assert(VecOp &&
"Expected vectorized value.");
22022 VecOp = castToScalarTyElem(VecOp);
22023 Bases[
I / VF] = VecOp;
22025 if (!Bases.front())
// Two bases are merged by a shuffle; a single base is used as is.
22028 if (Bases.back()) {
22029 SubVec = createShuffle(Bases.front(), Bases.back(), SubMask);
22030 TransformToIdentity(SubMask);
22032 SubVec = Bases.front();
22038 ArrayRef<int> SubMask =
22039 Mask.slice(
P * SliceSize,
22042 return all_of(SubMask, [](
int Idx) {
22046 "Expected first part or all previous parts masked.");
22047 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
22052 unsigned SubVecVF =
22054 NewVF = std::max(NewVF, SubVecVF);
22057 for (
int &Idx : SubMask)
// Merge this part into the accumulated vector and reset the mask.
22060 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
22061 Vec = createShuffle(Vec, SubVec, VecMask);
22062 TransformToIdentity(VecMask);
// NOTE(review): the name and parameter list are not visible. The predicate
// below tests the VectorizedValue of the given tree entries. One visible
// result is std::nullopt, the other an aligned load created with a vector
// type widened to E's vector factor - presumably a placeholder, confirm.
22070 std::optional<Value *>
22076 TEs, [](
const TreeEntry *TE) {
return TE->VectorizedValue; });
22078 return std::nullopt;
22081 auto *ResVecTy =
getWidenedType(ScalarTy, E->getVectorFactor());
22082 return Builder.CreateAlignedLoad(
// Clears the finalized flag and the accumulated common mask.
// NOTE(review): the enclosing signature is not visible in this excerpt.
22089 IsFinalized =
false;
22090 CommonMask.clear();
// NOTE(review): the signatures of the following members are not visible;
// they appear to be the overloads that append inputs to the builder
// (InVectors together with CommonMask).
// Inputs given as tree entries are first turned into vectorized values.
22096 Value *V1 = getVectorizedValue(E1);
22097 Value *V2 = getVectorizedValue(E2);
22103 Value *V1 = getVectorizedValue(E1);
// Two-vector input.
22108 assert(V1 && V2 && !Mask.empty() &&
"Expected non-empty input vectors.");
22111 "castToScalarTyElem expects V1 and V2 to be FixedVectorType");
22112 V1 = castToScalarTyElem(V1);
22113 V2 = castToScalarTyElem(V2);
// Nothing recorded yet: store both vectors and take the mask as it is.
22114 if (InVectors.empty()) {
22115 InVectors.push_back(V1);
22116 InVectors.push_back(V2);
22117 CommonMask.assign(Mask.begin(), Mask.end());
// Otherwise the recorded inputs are folded into one vector first.
22120 Value *Vec = InVectors.front();
22121 if (InVectors.size() == 2) {
22122 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
22123 transformMaskAfterShuffle(CommonMask, CommonMask);
22126 Vec = createShuffle(Vec,
nullptr, CommonMask);
22127 transformMaskAfterShuffle(CommonMask, CommonMask);
// The new pair is folded as well and becomes the second input; its lanes are
// addressed with an offset of VF in the common mask.
22129 V1 = createShuffle(V1, V2, Mask);
22130 unsigned VF = std::max(getVF(V1), getVF(Vec));
22131 for (
unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
22133 CommonMask[Idx] = Idx + VF;
22134 InVectors.front() = Vec;
22135 if (InVectors.size() == 2)
22136 InVectors.back() = V1;
22138 InVectors.push_back(V1);
// Single-vector input.
22143 "castToScalarTyElem expects V1 to be FixedVectorType");
22144 V1 = castToScalarTyElem(V1);
22145 if (InVectors.empty()) {
22146 InVectors.push_back(V1);
22147 CommonMask.assign(Mask.begin(), Mask.end());
// V1 is not among the recorded inputs.
22150 const auto *It =
find(InVectors, V1);
22151 if (It == InVectors.end()) {
22152 if (InVectors.size() == 2 ||
22153 InVectors.front()->getType() != V1->
getType()) {
22154 Value *V = InVectors.front();
22155 if (InVectors.size() == 2) {
22156 V = createShuffle(InVectors.front(), InVectors.back(), CommonMask);
22157 transformMaskAfterShuffle(CommonMask, CommonMask);
22159 CommonMask.size()) {
22160 V = createShuffle(InVectors.front(),
nullptr, CommonMask);
22161 transformMaskAfterShuffle(CommonMask, CommonMask);
22163 unsigned VF = std::max(CommonMask.size(), Mask.size());
22164 for (
unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
22166 CommonMask[Idx] = V->getType() != V1->
getType()
22168 : Mask[Idx] + getVF(V1);
22169 if (V->getType() != V1->
getType())
22170 V1 = createShuffle(V1,
nullptr, Mask);
22171 InVectors.front() = V;
22172 if (InVectors.size() == 2)
22173 InVectors.back() = V1;
22175 InVectors.push_back(V1);
22180 for (
unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
22182 InVectors.push_back(V1);
// V1 is already recorded: only the common mask is updated. Lanes of the
// first input are used directly, lanes of the other input are offset by the
// largest vector factor among the inputs.
22187 for (
Value *V : InVectors)
22188 VF = std::max(VF, getVF(V));
22189 for (
unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
22191 CommonMask[Idx] = Mask[Idx] + (It == InVectors.begin() ? 0 : VF);
22200 Value *Root =
nullptr) {
// Forwards to R.gather() and supplies this builder's createShuffle() as the
// shuffle callback. NOTE(review): the start of the signature is not visible.
22201 return R.gather(VL, Root, ScalarTy,
22203 return createShuffle(V1, V2, Mask);
22212 ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
// NOTE(review): only part of the signature is visible. The body marks the
// builder as finalized, optionally runs the Action callback on the folded
// vector, inserts the requested sub-vectors, applies ExtMask to the common
// mask and returns the final shuffle of the recorded inputs.
22217 IsFinalized =
true;
// Fold the recorded inputs into one vector before the action is run.
22220 if (InVectors.
size() == 2) {
22221 Vec = createShuffle(Vec, InVectors.
back(), CommonMask);
22224 Vec = createShuffle(Vec,
nullptr, CommonMask);
22226 transformMaskAfterShuffle(CommonMask, CommonMask);
22228 "Expected vector length for the final value before action.");
// Resize the vector with a mask whose first VecVF elements are an identity
// sequence.
22232 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
22233 Vec = createShuffle(Vec,
nullptr, ResizeMask);
22235 Action(Vec, CommonMask, [
this](
Value *V1,
Value *V2, ArrayRef<int> Mask) {
22236 return createShuffle(V1, V2, Mask);
22238 InVectors.
front() = Vec;
// Sub-vector insertion.
22240 if (!SubVectors.empty()) {
22242 if (InVectors.
size() == 2) {
22243 Vec = createShuffle(Vec, InVectors.
back(), CommonMask);
22246 Vec = createShuffle(Vec,
nullptr, CommonMask);
22248 transformMaskAfterShuffle(CommonMask, CommonMask);
// Inserts the vectorized value of every (entry, index) pair into Vec. The
// covered range of CommonMask is rewritten with std::iota. ScalarTy is saved
// before and restored after the insertion.
22249 auto CreateSubVectors = [&](
Value *Vec,
22250 SmallVectorImpl<int> &CommonMask) {
22251 for (
auto [
E, Idx] : SubVectors) {
22252 Value *
V = getVectorizedValue(*
E);
22259 Type *OrigScalarTy = ScalarTy;
22262 Builder, Vec, V, InsertionIndex,
22263 std::bind(&ShuffleInstructionBuilder::createShuffle,
this, _1, _2,
22265 ScalarTy = OrigScalarTy;
22266 if (!CommonMask.
empty()) {
22267 std::iota(std::next(CommonMask.
begin(), Idx),
22268 std::next(CommonMask.
begin(), Idx +
E->getVectorFactor()),
// Without a sub-vector mask the insertion is applied to Vec directly.
22274 if (SubVectorsMask.
empty()) {
22275 Vec = CreateSubVectors(Vec, CommonMask);
// With a sub-vector mask the inserted vector and Vec are combined by a
// two-source shuffle described by SVMask.
22278 copy(SubVectorsMask, SVMask.begin());
22279 for (
auto [I1, I2] :
zip(SVMask, CommonMask)) {
22282 I1 = I2 + CommonMask.
size();
22287 Vec = createShuffle(InsertVec, Vec, SVMask);
22288 transformMaskAfterShuffle(CommonMask, SVMask);
22290 InVectors.
front() = Vec;
// Compose the external mask with the common mask.
22293 if (!ExtMask.
empty()) {
22294 if (CommonMask.
empty()) {
22298 for (
int I = 0, Sz = ExtMask.
size();
I < Sz; ++
I) {
22301 NewMask[
I] = CommonMask[ExtMask[
I]];
22303 CommonMask.
swap(NewMask);
// No mask left: the single recorded vector is the result.
22306 if (CommonMask.
empty()) {
22307 assert(InVectors.
size() == 1 &&
"Expected only one vector with no mask");
22308 return InVectors.
front();
// Otherwise emit the final one- or two-source shuffle.
22310 if (InVectors.
size() == 2)
22311 return createShuffle(InVectors.
front(), InVectors.
back(), CommonMask);
22312 return createShuffle(InVectors.
front(),
nullptr, CommonMask);
// Sanity check: either finalization has happened or no mask was accumulated.
// NOTE(review): presumably located in the destructor - the enclosing
// signature is not visible here.
22316 assert((IsFinalized || CommonMask.empty()) &&
22317 "Shuffle construction must be finalized.");
22321Value *BoUpSLP::vectorizeOperand(TreeEntry *E,
unsigned NodeIdx) {
/// Generic driver for gather nodes, parameterised by the shuffle builder type
/// \p BVTy and its result type \p ResTy. A BVTy object is constructed from
/// \p ScalarTy and the forwarded extra arguments. The scalars of \p E are
/// matched against extractelement sources and against other tree entries.
/// Whatever remains is gathered through the builder, and the result is
/// produced by BVTy::finalize().
/// NOTE(review): many interior lines are missing from this excerpt, so the
/// inline notes below describe only what the visible statements show.
22325template <
typename BVTy,
typename ResTy,
typename... Args>
22326ResTy BoUpSLP::processBuildVector(
const TreeEntry *E,
Type *ScalarTy,
22328 assert((E->isGather() || TransformedToGatherNodes.contains(E)) &&
22329 "Expected gather node.");
22330 unsigned VF = E->getVectorFactor();
22332 bool NeedFreeze =
false;
22336 E->CombinedEntriesWithIndices.size());
// A split-vectorize node that was transformed to a gather has no sub-vectors.
22337 if (E->State == TreeEntry::SplitVectorize &&
22338 TransformedToGatherNodes.contains(E)) {
22339 SubVectors.
clear();
22342 for (
auto [EIdx, Idx] : E->CombinedEntriesWithIndices)
22344 .slice(Idx, VectorizableTree[EIdx]->getVectorFactor()),
// Translate (entry index, offset) pairs into (tree entry, offset) pairs.
22347 E->CombinedEntriesWithIndices, SubVectors.
begin(), [&](
const auto &
P) {
22348 return std::make_pair(VectorizableTree[P.first].get(), P.second);
22354 E->ReorderIndices.end());
22355 if (!ReorderMask.
empty())
22361 if (!SubVectors.
empty() && !SubVectorsMask.
empty()) {
22363 if (E->Scalars[
I] == GatheredScalars[ReorderMask[
I]])
22366 SubVectorsMask.
clear();
// Helper used further down as FindReusedSplat. It inspects the user node of E
// (which must have two operands) and the slice I of Mask.
22370 unsigned I,
unsigned SliceSize,
22371 bool IsNotPoisonous) {
22373 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
22376 TreeEntry *UserTE = E->UserTreeIndex.UserTE;
22377 unsigned EdgeIdx = E->UserTreeIndex.EdgeIdx;
22378 if (UserTE->getNumOperands() != 2)
// Look for the node that feeds the other operand of the same user.
22380 if (!IsNotPoisonous) {
22381 auto *It =
find_if(
ArrayRef(VectorizableTree).drop_front(UserTE->Idx + 1),
22382 [=](
const std::unique_ptr<TreeEntry> &TE) {
22383 return TE->UserTreeIndex.UserTE == UserTE &&
22384 TE->UserTreeIndex.EdgeIdx != EdgeIdx;
22386 if (It == VectorizableTree.end())
22389 if (!(*It)->ReorderIndices.empty()) {
22393 if (!
all_of(
zip(GatheredScalars, GS), [&](
const auto &
P) {
22394 Value *V0 = std::get<0>(
P);
22395 Value *V1 = std::get<1>(
P);
22403 if ((Mask.size() < InputVF &&
22406 (Mask.size() == InputVF &&
22409 std::next(Mask.begin(),
I * SliceSize),
22410 std::next(Mask.begin(),
22417 std::next(Mask.begin(),
I * SliceSize),
22418 std::next(Mask.begin(),
// The shuffle builder and the default-constructed result.
22424 BVTy ShuffleBuilder(ScalarTy, Params...);
22425 ResTy Res = ResTy();
22429 Value *ExtractVecBase =
nullptr;
22430 bool UseVecBaseAsInput =
false;
22433 Type *OrigScalarTy = GatheredScalars.
front()->getType();
22435 unsigned NumParts =
22439 bool Resized =
false;
// Step 1: try to express scalars as extracts from existing vectors.
22441 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
22442 if (!ExtractShuffles.
empty()) {
22444 for (
auto [Idx,
I] :
enumerate(ExtractMask)) {
22450 ExtractEntries.
append(TEs.begin(), TEs.end());
// The builder may ask to postpone this node; it is then remembered in
// PostponedGathers.
22452 if (std::optional<ResTy> Delayed =
22453 ShuffleBuilder.needToDelay(E, ExtractEntries)) {
22455 PostponedGathers.insert(E);
22460 if (
Value *VecBase = ShuffleBuilder.adjustExtracts(
22461 E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) {
22462 ExtractVecBase = VecBase;
22464 if (VF == VecBaseTy->getNumElements() &&
22465 GatheredScalars.
size() != VF) {
22467 GatheredScalars.
append(VF - GatheredScalars.
size(),
// Step 2: under the conditions below, try to express scalars as shuffles of
// other tree entries.
22475 if (!ExtractShuffles.
empty() || !E->hasState() ||
22476 E->getOpcode() != Instruction::Load ||
22477 (((E->hasState() && E->getOpcode() == Instruction::Load) ||
22481 return isa<LoadInst>(V) && isVectorized(V);
22483 (E->hasState() && E->isAltShuffle()) ||
22484 all_of(E->Scalars, [
this](
Value *V) { return isVectorized(V); }) ||
22486 (E->Scalars != GatheredScalars && GatheredScalars.
size() <= 2)) {
22488 isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts);
22490 if (!GatherShuffles.
empty()) {
22491 if (std::optional<ResTy> Delayed =
22492 ShuffleBuilder.needToDelay(E, Entries)) {
22494 PostponedGathers.insert(E);
// "Perfect diamond match": a single found entry holds the same scalars as E,
// so its vector is reused with a suitable mask.
22499 if (GatherShuffles.
size() == 1 &&
22501 (Entries.
front().front()->isSame(E->Scalars) ||
22502 E->isSame(Entries.
front().front()->Scalars))) {
22505 LLVM_DEBUG(
dbgs() <<
"SLP: perfect diamond match for gather bundle "
22508 Mask.resize(E->Scalars.size());
22509 const TreeEntry *FrontTE = Entries.
front().front();
22510 if (FrontTE->ReorderIndices.empty() && E->ReorderIndices.empty() &&
22511 ((FrontTE->ReuseShuffleIndices.empty() &&
22512 E->Scalars.size() == FrontTE->Scalars.size()) ||
22513 (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) {
22514 std::iota(Mask.begin(), Mask.end(), 0);
22521 Mask[
I] = FrontTE->findLaneForValue(V);
22526 ShuffleBuilder.resetForSameNode();
// Identical node: identity mask and finalize without extra masks; otherwise
// finalize with E's common mask.
22528 if ((E->isSame(FrontTE->Scalars) &&
22529 FrontTE->ReuseShuffleIndices.empty() &&
22530 FrontTE->ReorderIndices.empty() &&
22531 E->getVectorFactor() == FrontTE->getVectorFactor()) ||
22532 (
equal(E->Scalars, FrontTE->Scalars) &&
22533 equal(E->ReorderIndices, FrontTE->ReorderIndices) &&
22534 equal(E->ReuseShuffleIndices, FrontTE->ReuseShuffleIndices))) {
22535 Mask.resize(FrontTE->getVectorFactor());
22536 std::iota(Mask.begin(), Mask.end(), 0);
22537 ShuffleBuilder.add(*FrontTE, Mask);
22538 Res = ShuffleBuilder.finalize({}, {}, {});
22540 ShuffleBuilder.add(*FrontTE, Mask);
22541 Res = ShuffleBuilder.finalize(E->getCommonMask(), {}, {});
22546 if (GatheredScalars.
size() != VF &&
22548 return any_of(TEs, [&](
const TreeEntry *TE) {
22549 return TE->getVectorFactor() == VF;
22552 GatheredScalars.
append(VF - GatheredScalars.
size(),
22556 for (
int I = 0, Sz = Mask.size();
I < Sz; ++
I) {
// Helper used further down as TryPackScalars: rewrites the scalar list in
// place and fills ReuseMask, mapping repeated values to one unique position.
22564 bool IsRootPoison) {
22567 bool IsSplat = IsRootPoison &&
isSplat(Scalars) &&
22574 int NumNonConsts = 0;
22593 Scalars.
front() = OrigV;
22596 const auto Res = UniquePositions.
try_emplace(OrigV,
I);
22597 Scalars[Res.first->second] = OrigV;
22598 ReuseMask[
I] = Res.first->second;
22601 if (NumNonConsts == 1) {
22606 if (!UndefPos.
empty() && UndefPos.
front() == 0)
22609 ReuseMask[SinglePos] = SinglePos;
22610 }
else if (!UndefPos.
empty() && IsSplat) {
22617 (E->UserTreeIndex &&
any_of(V->uses(), [E](
const Use &U) {
22620 return E->UserTreeIndex.EdgeIdx != U.getOperandNo() &&
22621 is_contained(E->UserTreeIndex.UserTE->Scalars,
22625 if (It != Scalars.
end()) {
22627 int Pos = std::distance(Scalars.
begin(), It);
22628 for (
int I : UndefPos) {
22630 ReuseMask[
I] = Pos;
22639 for (
int I : UndefPos) {
// Step 3: at least one kind of shuffle was found - feed the sources to the
// builder.
22648 if (!ExtractShuffles.
empty() || !GatherShuffles.
empty()) {
22649 bool IsNonPoisoned =
true;
22650 bool IsUsedInExpr =
true;
22651 Value *Vec1 =
nullptr;
22652 if (!ExtractShuffles.
empty()) {
22656 Value *Vec2 =
nullptr;
22657 for (
unsigned I = 0, Sz = ExtractMask.
size();
I < Sz; ++
I) {
22661 if (UseVecBaseAsInput) {
22662 Vec1 = ExtractVecBase;
22664 for (
unsigned I = 0, Sz = ExtractMask.
size();
I < Sz; ++
I) {
22670 Value *VecOp = EI->getVectorOperand();
22672 !TEs.
empty() && TEs.front()->VectorizedValue)
22673 VecOp = TEs.front()->VectorizedValue;
22676 }
else if (Vec1 != VecOp) {
22677 assert((!Vec2 || Vec2 == VecOp) &&
22678 "Expected only 1 or 2 vectors shuffle.");
22684 IsUsedInExpr =
false;
22687 ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
22690 IsUsedInExpr &= FindReusedSplat(
22693 ExtractMask.
size(), IsNotPoisonedVec);
22694 ShuffleBuilder.add(Vec1, ExtractMask,
true);
22695 IsNonPoisoned &= IsNotPoisonedVec;
22697 IsUsedInExpr =
false;
// Sources coming from other tree entries, one or two entries per slice.
22702 if (!GatherShuffles.
empty()) {
22704 if (Mask.size() == E->Scalars.size())
22709 for (
const auto [
I, TEs] :
enumerate(Entries)) {
22712 "No shuffles with empty entries list expected.");
22715 assert((TEs.size() == 1 || TEs.size() == 2) &&
22716 "Expected shuffle of 1 or 2 entries.");
22717 unsigned Limit =
getNumElems(Mask.size(), SliceSize,
I);
22720 copy(SubMask, std::next(VecMask.
begin(),
I * SliceSize));
22721 if (TEs.size() == 1) {
22722 bool IsNotPoisonedVec =
22723 TEs.front()->VectorizedValue
22727 FindReusedSplat(VecMask, TEs.
front()->getVectorFactor(),
I,
22728 SliceSize, IsNotPoisonedVec);
22729 ShuffleBuilder.add(*TEs.front(), VecMask);
22730 IsNonPoisoned &= IsNotPoisonedVec;
22732 IsUsedInExpr =
false;
22733 ShuffleBuilder.add(*TEs.front(), *TEs.back(), VecMask);
22734 if (TEs.front()->VectorizedValue && TEs.back()->VectorizedValue)
22745 int EMSz = ExtractMask.
size();
22746 int MSz = Mask.size();
22749 bool IsSingleShuffle = ExtractShuffles.
empty() || GatherShuffles.
empty();
22750 bool IsIdentityShuffle =
22751 ((UseVecBaseAsInput ||
22753 [](
const std::optional<TTI::ShuffleKind> &SK) {
22757 none_of(ExtractMask, [&](
int I) {
return I >= EMSz; }) &&
22759 (!GatherShuffles.
empty() &&
22761 [](
const std::optional<TTI::ShuffleKind> &SK) {
22765 none_of(Mask, [&](
int I) {
return I >= MSz; }) &&
22767 bool EnoughConstsForShuffle =
22777 (!IsIdentityShuffle ||
22778 (GatheredScalars.
size() == 2 &&
22786 for (
int I = 0, Sz = GatheredScalars.
size();
I < Sz; ++
I) {
22787 if (EnoughConstsForShuffle &&
isa<Constant>(GatheredScalars[
I]))
// The remaining scalars are packed, gathered into a build vector and added as
// one more input.
22795 TryPackScalars(GatheredScalars, BVMask,
true);
22796 Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.
size());
22797 ShuffleBuilder.add(BV, BVMask);
22801 (IsSingleShuffle && ((IsIdentityShuffle &&
22804 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
22807 Res = ShuffleBuilder.finalize(
22808 E->ReuseShuffleIndices, SubVectors, SubVectorsMask, E->Scalars.
size(),
22810 bool IsSplat = isSplat(NonConstants);
22811 SmallVector<int> BVMask(Mask.size(), PoisonMaskElem);
22812 TryPackScalars(NonConstants, BVMask, false);
// Cost check: returns true when SplatCost does not exceed BVCost, both being
// built from the target cost queries below.
22813 auto CheckIfSplatIsProfitable = [&]() {
22816 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
22817 Value *V = *find_if_not(NonConstants, IsaPred<UndefValue>);
22818 if (isa<ExtractElementInst>(V) || isVectorized(V))
22820 InstructionCost SplatCost = TTI->getVectorInstrCost(
22821 Instruction::InsertElement, VecTy, CostKind, 0,
22822 PoisonValue::get(VecTy), V);
22823 SmallVector<int> NewMask(Mask.begin(), Mask.end());
22824 for (auto [Idx, I] : enumerate(BVMask))
22825 if (I != PoisonMaskElem)
22826 NewMask[Idx] = Mask.size();
22828 ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc,
22829 cast<VectorType>(VecTy), NewMask, CostKind);
22830 InstructionCost BVCost = TTI->getVectorInstrCost(
22831 Instruction::InsertElement, VecTy, CostKind,
22832 *find_if(Mask, not_equal_to(PoisonMaskElem)), Vec, V);
22834 if (count(BVMask, PoisonMaskElem) <
22835 static_cast<int>(BVMask.size() - 1)) {
22836 SmallVector<int> NewMask(Mask.begin(), Mask.end());
22837 for (auto [Idx, I] : enumerate(BVMask))
22838 if (I != PoisonMaskElem)
22840 BVCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
22841 cast<VectorType>(VecTy), NewMask,
22844 return SplatCost <= BVCost;
// No splat, a tiny mask or an unprofitable splat: gather the non-constants
// on top of Vec. Otherwise a splat is built and blended in by a shuffle.
22846 if (!IsSplat || Mask.size() <= 2 || !CheckIfSplatIsProfitable()) {
22850 Vec = ShuffleBuilder.gather(NonConstants, Mask.size(), Vec);
22856 Value *BV = ShuffleBuilder.gather(Values, BVMask.
size());
22859 return I == PoisonMaskElem ? PoisonMaskElem : 0;
22862 BV = CreateShuffle(BV,
nullptr, SplatMask);
22865 Mask[Idx] = BVMask.size() + Idx;
22866 Vec = CreateShuffle(Vec, BV, Mask);
// Paths without found shuffles: pack and gather the scalars, then finalize.
22875 TryPackScalars(GatheredScalars, ReuseMask,
true);
22876 Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.size());
22877 ShuffleBuilder.add(BV, ReuseMask);
22878 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
22883 for (
auto [
I, V] :
enumerate(GatheredScalars)) {
22887 Value *BV = ShuffleBuilder.gather(GatheredScalars);
22888 ShuffleBuilder.add(BV, Mask);
22889 Res = ShuffleBuilder.finalize(
E->ReuseShuffleIndices, SubVectors,
// The result is wrapped by the builder's createFreeze() (the guarding
// condition is not visible in this excerpt).
22894 Res = ShuffleBuilder.createFreeze(Res);
/// Produces the vector value for tree entry \p E with element type
/// \p ScalarTy by forwarding to processBuildVector.
///
/// The caller visible in this file invokes it for entries where
/// E->isGather() holds or E is recorded in TransformedToGatherNodes, and
/// stores the returned value in E->VectorizedValue.
///
/// NOTE(review): this excerpt omits several original lines (the loop body
/// and the trailing arguments of the processBuildVector call); confirm
/// against the full definition before relying on the details below.
22898Value *BoUpSLP::createBuildVector(
const TreeEntry *
E,
Type *ScalarTy) {
// The walk over the combined sub-entries is skipped only when E is a
// SplitVectorize node that is also recorded in TransformedToGatherNodes.
22900 if (
E->State != TreeEntry::SplitVectorize ||
22901 !TransformedToGatherNodes.contains(
E)) {
// Only the entry index (EIdx) of each pair is bound to a name; the second
// member is ignored. The loop body is not visible here - presumably it
// handles the referenced sub-entry; TODO confirm.
22902 for (
auto [EIdx,
_] :
E->CombinedEntriesWithIndices)
// Delegate to the generic build-vector routine, instantiated with
// ShuffleInstructionBuilder and Value *.
22905 return processBuildVector<ShuffleInstructionBuilder, Value *>(
E, ScalarTy,
22913 for (
Value *V : VL)
22926 IRBuilderBase::InsertPointGuard Guard(Builder);
22928 Value *
V =
E->Scalars.front();
22930 auto It = MinBWs.find(
E);
22931 if (It != MinBWs.end()) {
22937 if (
E->VectorizedValue)
22938 return E->VectorizedValue;
22940 if (
E->isGather() || TransformedToGatherNodes.contains(
E)) {
22942 if (
E->hasState() &&
E->Idx == 0 && !UserIgnoreList)
22943 setInsertPointAfterBundle(
E);
22944 Value *Vec = createBuildVector(
E, ScalarTy);
22945 E->VectorizedValue = Vec;
22948 if (
E->State == TreeEntry::SplitVectorize) {
22949 assert(
E->CombinedEntriesWithIndices.size() == 2 &&
22950 "Expected exactly 2 combined entries.");
22951 setInsertPointAfterBundle(
E);
22953 *VectorizableTree[
E->CombinedEntriesWithIndices.front().first];
22955 ArrayRef(
E->Scalars).take_front(OpTE1.getVectorFactor())) &&
22956 "Expected same first part of scalars.");
22959 *VectorizableTree[
E->CombinedEntriesWithIndices.back().first];
22961 OpTE2.isSame(
ArrayRef(
E->Scalars).take_back(OpTE2.getVectorFactor())) &&
22962 "Expected same second part of scalars.");
22964 auto GetOperandSignedness = [&](
const TreeEntry *OpE) {
22965 bool IsSigned =
false;
22966 auto It = MinBWs.find(OpE);
22967 if (It != MinBWs.end())
22968 IsSigned = It->second.second;
22971 if (isa<PoisonValue>(V))
22973 return !isKnownNonNegative(R, SimplifyQuery(*DL));
22980 Op1 = Builder.CreateIntCast(
22985 GetOperandSignedness(&OpTE1));
22990 Op2 = Builder.CreateIntCast(
22995 GetOperandSignedness(&OpTE2));
22997 if (
E->ReorderIndices.empty()) {
23001 std::next(
Mask.begin(),
E->CombinedEntriesWithIndices.back().second),
23004 if (ScalarTyNumElements != 1) {
23008 Value *Vec = Builder.CreateShuffleVector(Op1, Mask);
23010 E->CombinedEntriesWithIndices.back().second *
23011 ScalarTyNumElements);
23012 E->VectorizedValue = Vec;
23015 unsigned CommonVF =
23016 std::max(OpTE1.getVectorFactor(), OpTE2.getVectorFactor());
23023 Op1 = Builder.CreateShuffleVector(Op1, Mask);
23029 Op2 = Builder.CreateShuffleVector(Op2, Mask);
23031 Value *Vec = Builder.CreateShuffleVector(Op1, Op2,
E->getSplitMask());
23032 E->VectorizedValue = Vec;
23036 bool IsReverseOrder =
23038 auto FinalShuffle = [&](
Value *
V,
const TreeEntry *
E) {
23041 assert(
E->ReorderIndices.empty() &&
23042 "Expected no reordering for struct types.");
23043 assert(
E->ReuseShuffleIndices.empty() &&
23044 "Expected no reuse shuffle indices for struct types.");
23048 if ((
E->State == TreeEntry::StridedVectorize && IsReverseOrder) ||
23049 E->State == TreeEntry::CompressVectorize) {
23050 ShuffleBuilder.addOrdered(V, {});
23051 }
else if (
E->getOpcode() == Instruction::Store &&
23052 (
E->State == TreeEntry::Vectorize ||
23053 E->State == TreeEntry::StridedVectorize)) {
23054 ArrayRef<int>
Mask =
23055 ArrayRef(
reinterpret_cast<const int *
>(
E->ReorderIndices.begin()),
23056 E->ReorderIndices.size());
23057 ShuffleBuilder.add(V, Mask);
23059 ShuffleBuilder.addOrdered(V,
E->ReorderIndices);
23062 E->CombinedEntriesWithIndices.size());
23064 E->CombinedEntriesWithIndices, SubVectors.begin(), [&](
const auto &
P) {
23065 return std::make_pair(VectorizableTree[P.first].get(), P.second);
23068 (
E->CombinedEntriesWithIndices.empty() ||
E->ReorderIndices.empty()) &&
23069 "Expected either combined subnodes or reordering");
23070 return ShuffleBuilder.finalize(
E->ReuseShuffleIndices, SubVectors, {});
23073 assert(!
E->isGather() &&
"Unhandled state");
23074 unsigned ShuffleOrOp =
23075 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector :
E->
getOpcode();
23076 if (!
E->isAltShuffle()) {
23077 switch (E->CombinedOp) {
23078 case TreeEntry::ReducedBitcast:
23079 case TreeEntry::ReducedBitcastBSwap:
23080 case TreeEntry::ReducedBitcastLoads:
23081 case TreeEntry::ReducedBitcastBSwapLoads:
23082 case TreeEntry::ReducedCmpBitcast:
23083 ShuffleOrOp = E->CombinedOp;
23090 auto GetOperandSignedness = [&](
unsigned Idx) {
23091 const TreeEntry *OpE = getOperandEntry(
E, Idx);
23092 bool IsSigned =
false;
23093 auto It = MinBWs.find(OpE);
23094 if (It != MinBWs.end())
23095 IsSigned = It->second.second;
23098 if (isa<PoisonValue>(V))
23100 return !isKnownNonNegative(R, SimplifyQuery(*DL));
23104 auto PropagateIRFlags = [&](
Value *
V,
unsigned Opcode = 0,
23107 SmallSetVector<Value *, 4> UniqueInsts;
23108 for (
Value *Scalar : Scalars) {
23112 if (
E->hasCopyableElements() &&
E->isCopyableElement(
I))
23117 Opcode =
E->getOpcode();
23130 bool AllNoNaNs =
true;
23131 bool AllNoInfs =
true;
23132 for (
Value *Scalar : Scalars) {
23133 if (!
E->isCopyableElement(Scalar))
23136 AllNoNaNs &= FPMO->hasNoNaNs();
23137 AllNoInfs &= FPMO->hasNoInfs();
23141 AllNoNaNs &= !CFP->isNaN();
23142 AllNoInfs &= !CFP->isInfinity();
23150 I->setHasNoNaNs(
false);
23152 I->setHasNoInfs(
false);
23155 if (!MinBWs.contains(
E) && Opcode == Instruction::Sub &&
23156 (
E->hasCopyableElements() ||
any_of(Scalars, [](
Value *Scalar) {
23160 I->setHasNoUnsignedWrap(
false);
23162 ICmp->setSameSign(
false);
23165 switch (ShuffleOrOp) {
23166 case Instruction::PHI: {
23167 assert((
E->ReorderIndices.empty() || !
E->ReuseShuffleIndices.empty() ||
23168 E != VectorizableTree.front().get() ||
E->UserTreeIndex) &&
23169 "PHI reordering is free.");
23171 Builder.SetInsertPoint(PH->getParent(),
23172 PH->getParent()->getFirstNonPHIIt());
23174 PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
23178 Builder.SetInsertPoint(PH->getParent(),
23179 PH->getParent()->getFirstInsertionPt());
23182 V = FinalShuffle(V,
E);
23184 E->VectorizedValue =
V;
23191 SmallDenseMap<BasicBlock *, unsigned, 4> VisitedBBs;
23197 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
23203 TreeEntry *OpTE = getOperandEntry(
E,
I);
23204 if (OpTE->isGather() || DeletedNodes.contains(OpTE) ||
23205 TransformedToGatherNodes.contains(OpTE)) {
23208 assert(!OpTE->VectorizedValue &&
"Expected no vectorized value.");
23209 OpTE->VectorizedValue = VecOp;
23216 Value *Vec = vectorizeOperand(
E,
I);
23217 if (VecTy != Vec->
getType()) {
23219 MinBWs.contains(getOperandEntry(
E,
I))) &&
23220 "Expected item in MinBWs.");
23221 Vec = Builder.CreateIntCast(Vec, VecTy, GetOperandSignedness(
I));
23227 "Invalid number of incoming values");
23228 assert(
E->VectorizedValue &&
"Expected vectorized value.");
23229 return E->VectorizedValue;
23232 case Instruction::ExtractElement: {
23233 Value *
V =
E->getSingleOperand(0);
23234 setInsertPointAfterBundle(
E);
23235 V = FinalShuffle(V,
E);
23236 E->VectorizedValue =
V;
23239 case Instruction::ExtractValue: {
23240 if (!
E->StructEVIndices.empty()) {
23241 setInsertPointAfterBundle(
E);
23242 Value *
V = vectorizeOperand(
E, 0);
23243 V = Builder.CreateExtractValue(V,
E->StructEVIndices);
23246 V = FinalShuffle(V,
E);
23247 E->VectorizedValue =
V;
23251 Builder.SetInsertPoint(LI);
23252 Value *Ptr = LI->getPointerOperand();
23253 LoadInst *
V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign());
23254 Value *NewV = PropagateIRFlags(V);
23255 NewV = FinalShuffle(NewV,
E);
23256 E->VectorizedValue = NewV;
23259 case Instruction::InsertElement: {
23260 assert(
E->ReuseShuffleIndices.empty() &&
"All inserts should be unique");
23261 if (
const TreeEntry *OpE = getOperandEntry(
E, 1);
23262 OpE && !OpE->isGather() && OpE->hasState() &&
23263 !OpE->hasCopyableElements())
23266 setInsertPointAfterBundle(
E);
23267 Value *
V = vectorizeOperand(
E, 1);
23269 Type *ScalarTy =
Op.front()->getType();
23272 std::pair<unsigned, bool> Res = MinBWs.lookup(getOperandEntry(
E, 1));
23273 assert(Res.first > 0 &&
"Expected item in MinBWs.");
23274 V = Builder.CreateIntCast(
23284 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
23286 const unsigned NumElts =
23288 const unsigned NumScalars =
E->Scalars.size();
23291 assert(
Offset < NumElts &&
"Failed to find vector index offset");
23294 SmallVector<int>
Mask;
23295 if (!
E->ReorderIndices.empty()) {
23300 std::iota(
Mask.begin(), std::next(
Mask.begin(), NumScalars), 0);
23303 bool IsIdentity =
true;
23305 Mask.swap(PrevMask);
23306 for (
unsigned I = 0;
I < NumScalars; ++
I) {
23309 IsIdentity &= InsertIdx -
Offset ==
I;
23312 if (!IsIdentity || NumElts != NumScalars) {
23313 Value *V2 =
nullptr;
23314 bool IsVNonPoisonous =
23316 SmallVector<int> InsertMask(Mask);
23317 if (NumElts != NumScalars &&
Offset == 0) {
23326 InsertMask[*InsertIdx] = *InsertIdx;
23332 SmallBitVector UseMask =
23333 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
23334 SmallBitVector IsFirstPoison =
23336 SmallBitVector IsFirstUndef =
23338 if (!IsFirstPoison.
all()) {
23340 for (
unsigned I = 0;
I < NumElts;
I++) {
23342 IsFirstUndef.
test(
I)) {
23343 if (IsVNonPoisonous) {
23344 InsertMask[
I] =
I < NumScalars ?
I : 0;
23349 if (Idx >= NumScalars)
23350 Idx = NumScalars - 1;
23351 InsertMask[
I] = NumScalars + Idx;
23364 V = Builder.CreateShuffleVector(V, V2, InsertMask);
23366 GatherShuffleExtractSeq.insert(
I);
23367 CSEBlocks.insert(
I->getParent());
23372 for (
unsigned I = 0;
I < NumElts;
I++) {
23376 SmallBitVector UseMask =
23377 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
23378 SmallBitVector IsFirstUndef =
23380 if ((!IsIdentity ||
Offset != 0 || !IsFirstUndef.
all()) &&
23381 NumElts != NumScalars) {
23382 if (IsFirstUndef.
all()) {
23384 SmallBitVector IsFirstPoison =
23386 if (!IsFirstPoison.
all()) {
23387 for (
unsigned I = 0;
I < NumElts;
I++) {
23389 InsertMask[
I] =
I + NumElts;
23392 V = Builder.CreateShuffleVector(
23398 GatherShuffleExtractSeq.insert(
I);
23399 CSEBlocks.insert(
I->getParent());
23403 SmallBitVector IsFirstPoison =
23405 for (
unsigned I = 0;
I < NumElts;
I++) {
23409 InsertMask[
I] += NumElts;
23411 V = Builder.CreateShuffleVector(
23412 FirstInsert->getOperand(0), V, InsertMask,
23415 GatherShuffleExtractSeq.insert(
I);
23416 CSEBlocks.insert(
I->getParent());
23421 ++NumVectorInstructions;
23422 E->VectorizedValue =
V;
23425 case Instruction::ZExt:
23426 case Instruction::SExt:
23427 case Instruction::FPToUI:
23428 case Instruction::FPToSI:
23429 case Instruction::FPExt:
23430 case Instruction::PtrToInt:
23431 case Instruction::IntToPtr:
23432 case Instruction::SIToFP:
23433 case Instruction::UIToFP:
23434 case Instruction::Trunc:
23435 case Instruction::FPTrunc:
23436 case Instruction::BitCast: {
23437 setInsertPointAfterBundle(
E);
23439 Value *InVec = vectorizeOperand(
E, 0);
23444 auto SrcIt = MinBWs.find(getOperandEntry(
E, 0));
23446 (SrcIt != MinBWs.end() || It != MinBWs.end() ||
23449 unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy);
23450 if (SrcIt != MinBWs.end())
23451 SrcBWSz = SrcIt->second.first;
23452 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->
getScalarType());
23453 if (BWSz == SrcBWSz) {
23454 VecOpcode = Instruction::BitCast;
23455 }
else if (BWSz < SrcBWSz) {
23456 VecOpcode = Instruction::Trunc;
23457 }
else if (It != MinBWs.end()) {
23458 assert(BWSz > SrcBWSz &&
"Invalid cast!");
23459 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
23460 }
else if (SrcIt != MinBWs.end()) {
23461 assert(BWSz > SrcBWSz &&
"Invalid cast!");
23463 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
23465 }
else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
23466 !SrcIt->second.second) {
23467 VecOpcode = Instruction::UIToFP;
23468 }
else if (VecOpcode == Instruction::BitCast && SrcIt != MinBWs.end() &&
23470 Type *OrigSrcScalarTy = CI->getSrcTy();
23471 auto *OrigSrcVectorTy =
23474 Builder.CreateIntCast(InVec, OrigSrcVectorTy, SrcIt->second.second);
23476 Value *
V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
23478 : Builder.CreateCast(VecOpcode, InVec, VecTy);
23479 V = FinalShuffle(V,
E);
23481 E->VectorizedValue =
V;
23482 ++NumVectorInstructions;
23485 case Instruction::FCmp:
23486 case Instruction::ICmp: {
23487 setInsertPointAfterBundle(
E);
23489 Value *
L = vectorizeOperand(
E, 0);
23490 Value *
R = vectorizeOperand(
E, 1);
23491 if (
L->getType() !=
R->getType()) {
23494 MinBWs.contains(getOperandEntry(
E, 0)) ||
23495 MinBWs.contains(getOperandEntry(
E, 1))) &&
23496 "Expected item in MinBWs.");
23499 ->getIntegerBitWidth();
23502 ->getIntegerBitWidth();
23507 auto *CI = dyn_cast<ConstantInt>(V);
23509 CI->getValue().getActiveBits() > LBW;
23513 auto *CI = dyn_cast<ConstantInt>(V);
23514 return CI && CI->getValue().getActiveBits() <= RBW;
23516 Type *CastTy =
R->getType();
23517 L = Builder.CreateIntCast(L, CastTy, GetOperandSignedness(0));
23519 Type *CastTy =
L->getType();
23520 R = Builder.CreateIntCast(R, CastTy, GetOperandSignedness(1));
23525 Value *
V = Builder.CreateCmp(P0, L, R);
23526 V = PropagateIRFlags(V);
23529 V = FinalShuffle(V,
E);
23531 E->VectorizedValue =
V;
23532 ++NumVectorInstructions;
23535 case Instruction::Select: {
23536 setInsertPointAfterBundle(
E);
23539 Value *True = vectorizeOperand(
E, 1);
23540 Value *False = vectorizeOperand(
E, 2);
23544 MinBWs.contains(getOperandEntry(
E, 1)) ||
23545 MinBWs.contains(getOperandEntry(
E, 2))) &&
23546 "Expected item in MinBWs.");
23547 if (True->
getType() != VecTy)
23548 True = Builder.CreateIntCast(True, VecTy, GetOperandSignedness(1));
23549 if (False->
getType() != VecTy)
23550 False = Builder.CreateIntCast(False, VecTy, GetOperandSignedness(2));
23555 assert(TrueNumElements >= CondNumElements &&
23556 TrueNumElements % CondNumElements == 0 &&
23557 "Cannot vectorize Instruction::Select");
23559 "Cannot vectorize Instruction::Select");
23560 if (CondNumElements != TrueNumElements) {
23563 Cond = Builder.CreateShuffleVector(
23568 "Cannot vectorize Instruction::Select");
23570 Builder.CreateSelectWithUnknownProfile(
Cond, True, False,
DEBUG_TYPE);
23571 V = FinalShuffle(V,
E);
23573 E->VectorizedValue =
V;
23574 ++NumVectorInstructions;
23577 case Instruction::FNeg: {
23578 setInsertPointAfterBundle(
E);
23580 Value *
Op = vectorizeOperand(
E, 0);
23582 Value *
V = Builder.CreateUnOp(
23584 V = PropagateIRFlags(V);
23586 V = FinalShuffle(V,
E);
23588 E->VectorizedValue =
V;
23589 ++NumVectorInstructions;
23593 case Instruction::Freeze: {
23594 setInsertPointAfterBundle(
E);
23596 Value *
Op = vectorizeOperand(
E, 0);
23598 if (
Op->getType() != VecTy) {
23600 MinBWs.contains(getOperandEntry(
E, 0))) &&
23601 "Expected item in MinBWs.");
23602 Op = Builder.CreateIntCast(
Op, VecTy, GetOperandSignedness(0));
23604 Value *
V = Builder.CreateFreeze(
Op);
23605 V = FinalShuffle(V,
E);
23607 E->VectorizedValue =
V;
23608 ++NumVectorInstructions;
23612 case Instruction::Add:
23613 case Instruction::FAdd:
23614 case Instruction::Sub:
23615 case Instruction::FSub:
23616 case Instruction::Mul:
23617 case Instruction::FMul:
23618 case Instruction::UDiv:
23619 case Instruction::SDiv:
23620 case Instruction::FDiv:
23621 case Instruction::URem:
23622 case Instruction::SRem:
23623 case Instruction::FRem:
23624 case Instruction::Shl:
23625 case Instruction::LShr:
23626 case Instruction::AShr:
23627 case Instruction::And:
23628 case Instruction::Or:
23629 case Instruction::Xor: {
23630 setInsertPointAfterBundle(
E);
23634 if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
23639 return CI && CI->getValue().countr_one() >= It->second.first;
23641 V = FinalShuffle(
I == 0 ?
RHS :
LHS,
E);
23642 E->VectorizedValue =
V;
23643 ++NumVectorInstructions;
23651 MinBWs.contains(getOperandEntry(
E, 0)) ||
23652 MinBWs.contains(getOperandEntry(
E, 1))) &&
23653 "Expected item in MinBWs.");
23655 LHS = Builder.CreateIntCast(
LHS, VecTy, GetOperandSignedness(0));
23657 RHS = Builder.CreateIntCast(
RHS, VecTy, GetOperandSignedness(1));
23660 Value *
V = Builder.CreateBinOp(
23663 V = PropagateIRFlags(V);
23665 V = FinalShuffle(V,
E);
23667 E->VectorizedValue =
V;
23668 ++NumVectorInstructions;
23672 case Instruction::Load: {
23675 setInsertPointAfterBundle(
E);
23679 FixedVectorType *StridedLoadTy =
nullptr;
23680 Value *PO = LI->getPointerOperand();
23681 if (
E->State == TreeEntry::Vectorize) {
23682 NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign());
23683 }
else if (
E->State == TreeEntry::CompressVectorize) {
23684 auto [CompressMask, LoadVecTy, InterleaveFactor, IsMasked] =
23685 CompressEntryToData.at(
E);
23686 Align CommonAlignment = LI->getAlign();
23692 for (
int I : CompressMask)
23696 MaskValues =
replicateMask(MaskValues, VecTy->getNumElements());
23699 NewLI = Builder.CreateMaskedLoad(LoadVecTy, PO, CommonAlignment,
23702 NewLI = Builder.CreateAlignedLoad(LoadVecTy, PO, CommonAlignment);
23713 }
else if (
E->State == TreeEntry::StridedVectorize) {
23716 PO = IsReverseOrder ? PtrN : Ptr0;
23717 Type *StrideTy = DL->getIndexType(PO->
getType());
23720 StridedLoadTy = SPtrInfo.Ty;
23721 assert(StridedLoadTy &&
"Missing StridedPointerInfo for tree entry.");
23722 unsigned StridedLoadEC =
23725 Value *Stride = SPtrInfo.StrideVal;
23727 const SCEV *StrideSCEV = SPtrInfo.StrideSCEV;
23728 assert(StrideSCEV &&
"Neither StrideVal nor StrideSCEV were set.");
23729 SCEVExpander Expander(*SE,
"strided-load-vec");
23730 Stride = Expander.expandCodeFor(StrideSCEV, StrideSCEV->
getType(),
23731 &*Builder.GetInsertPoint());
23734 Builder.CreateIntCast(Stride, StrideTy,
true);
23735 StrideVal = Builder.CreateMul(
23737 StrideTy, (IsReverseOrder ? -1 : 1) *
23739 DL->getTypeAllocSize(ScalarTy))));
23741 auto *Inst = Builder.CreateIntrinsic(
23742 Intrinsic::experimental_vp_strided_load,
23743 {StridedLoadTy, PO->
getType(), StrideTy},
23746 Builder.getInt32(StridedLoadEC)});
23747 Inst->addParamAttr(
23749 Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
23752 assert(
E->State == TreeEntry::ScatterVectorize &&
"Unhandled state");
23753 Value *VecPtr = vectorizeOperand(
E, 0);
23760 assert(VecTyNumElements % ScalarTyNumElements == 0 &&
23761 "Cannot expand getelementptr.");
23762 unsigned VF = VecTyNumElements / ScalarTyNumElements;
23765 return Builder.getInt64(I % ScalarTyNumElements);
23767 VecPtr = Builder.CreateGEP(
23769 Builder.CreateShuffleVector(
23775 NewLI = Builder.CreateMaskedGather(VecTy, VecPtr, CommonAlignment);
23777 Value *
V =
E->State == TreeEntry::CompressVectorize
23779 : PropagateIRFlags(NewLI);
23781 if (StridedLoadTy != VecTy)
23782 V = Builder.CreateBitOrPointerCast(V, VecTy);
23783 V = FinalShuffle(V,
E);
23784 E->VectorizedValue =
V;
23785 ++NumVectorInstructions;
23788 case Instruction::Store: {
23791 setInsertPointAfterBundle(
E);
23793 Value *VecValue = vectorizeOperand(
E, 0);
23794 if (VecValue->
getType() != VecTy)
23796 Builder.CreateIntCast(VecValue, VecTy, GetOperandSignedness(0));
23797 VecValue = FinalShuffle(VecValue,
E);
23799 Value *Ptr =
SI->getPointerOperand();
23801 if (
E->State == TreeEntry::Vectorize) {
23802 ST = Builder.CreateAlignedStore(VecValue, Ptr,
SI->getAlign());
23804 assert(
E->State == TreeEntry::StridedVectorize &&
23805 "Expected either strided or consecutive stores.");
23806 bool IsReverseOrder =
23808 if (IsReverseOrder) {
23810 Ptr =
SI->getPointerOperand();
23813 Type *StrideTy = DL->getIndexType(
SI->getPointerOperandType());
23816 FixedVectorType *StridedStoreTy = SPtrInfo.Ty;
23817 assert(StridedStoreTy &&
"Missing StridedPointerInfo for tree entry.");
23819 Value *Stride = SPtrInfo.StrideVal;
23820 assert(Stride &&
"Missing StridedPointerInfo for tree entry.");
23822 Builder.CreateIntCast(Stride, StrideTy,
true);
23824 StrideVal = Builder.CreateMul(
23827 StrideTy,
static_cast<int>(DL->getTypeAllocSize(ScalarTy))));
23828 if (StridedStoreTy != VecTy)
23829 VecValue = Builder.CreateBitOrPointerCast(VecValue, StridedStoreTy);
23830 auto *Inst = Builder.CreateIntrinsic(
23831 Intrinsic::experimental_vp_strided_store,
23832 {StridedStoreTy, Ptr->
getType(), StrideTy},
23833 {VecValue, Ptr, StrideVal,
23835 Builder.getInt32(StridedStoreEC)});
23836 Inst->addParamAttr(
23838 Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
23842 Value *
V = PropagateIRFlags(ST);
23844 E->VectorizedValue =
V;
23845 ++NumVectorInstructions;
23848 case Instruction::GetElementPtr: {
23850 setInsertPointAfterBundle(
E);
23852 Value *Op0 = vectorizeOperand(
E, 0);
23855 for (
int J = 1,
N = GEP0->getNumOperands(); J <
N; ++J) {
23856 Value *OpVec = vectorizeOperand(
E, J);
23860 Value *
V = Builder.CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs);
23863 for (
Value *V :
E->Scalars) {
23867 V = PropagateIRFlags(
I);
23870 V = FinalShuffle(V,
E);
23872 E->VectorizedValue =
V;
23873 ++NumVectorInstructions;
23877 case Instruction::Call: {
23879 setInsertPointAfterBundle(
E);
23885 It != MinBWs.end() ? It->second.first : 0, TTI);
23888 VecCallCosts.first <= VecCallCosts.second;
23890 Value *ScalarArg =
nullptr;
23896 for (
auto [Idx, Ty] :
enumerate(ContainedTys)) {
23906 ScalarArg = CEI->getArgOperand(
I);
23909 if (
ID == Intrinsic::abs && It != MinBWs.end() &&
23910 It->second.first < DL->getTypeSizeInBits(CEI->getType()))
23911 ScalarArg = Builder.getFalse();
23918 Value *OpVec = vectorizeOperand(
E,
I);
23919 ScalarArg = CEI->getArgOperand(
I);
23922 It == MinBWs.end()) {
23925 OpVec = Builder.CreateIntCast(OpVec, CastTy, GetOperandSignedness(
I));
23926 }
else if (It != MinBWs.end()) {
23927 OpVec = Builder.CreateIntCast(OpVec, VecTy, GetOperandSignedness(
I));
23936 if (!UseIntrinsic) {
23941 CF = VFDatabase(*CI).getVectorizedFunction(Shape);
23948 Value *
V = Builder.CreateCall(CF, OpVecs, OpBundles);
23950 V = PropagateIRFlags(V);
23952 V = FinalShuffle(V,
E);
23954 E->VectorizedValue =
V;
23955 ++NumVectorInstructions;
23958 case Instruction::ShuffleVector: {
23961 setInsertPointAfterBundle(
E);
23962 Value *Src = vectorizeOperand(
E, 0);
23965 SmallVector<int> NewMask(ThisMask.size());
23967 return SVSrc->getShuffleMask()[Mask];
23969 V = Builder.CreateShuffleVector(SVSrc->getOperand(0),
23970 SVSrc->getOperand(1), NewMask);
23972 V = Builder.CreateShuffleVector(Src, ThisMask);
23974 V = PropagateIRFlags(V);
23975 V = FinalShuffle(V,
E);
23983 "Invalid Shuffle Vector Operand");
23987 setInsertPointAfterBundle(
E);
23988 LHS = vectorizeOperand(
E, 0);
23989 RHS = vectorizeOperand(
E, 1);
23991 setInsertPointAfterBundle(
E);
23992 LHS = vectorizeOperand(
E, 0);
23998 assert((It != MinBWs.end() ||
23999 getOperandEntry(
E, 0)->State == TreeEntry::NeedToGather ||
24000 getOperandEntry(
E, 1)->State == TreeEntry::NeedToGather ||
24001 MinBWs.contains(getOperandEntry(
E, 0)) ||
24002 MinBWs.contains(getOperandEntry(
E, 1))) &&
24003 "Expected item in MinBWs.");
24004 Type *CastTy = VecTy;
24010 ->getIntegerBitWidth())
24016 LHS = Builder.CreateIntCast(
LHS, CastTy, GetOperandSignedness(0));
24018 RHS = Builder.CreateIntCast(
RHS, CastTy, GetOperandSignedness(1));
24023 V0 = Builder.CreateBinOp(
24025 V1 = Builder.CreateBinOp(
24028 V0 = Builder.CreateCmp(CI0->getPredicate(),
LHS,
RHS);
24031 V1 = Builder.CreateCmp(AltPred,
LHS,
RHS);
24034 unsigned SrcBWSz = DL->getTypeSizeInBits(
24036 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
24037 if (BWSz <= SrcBWSz) {
24038 if (BWSz < SrcBWSz)
24039 LHS = Builder.CreateIntCast(
LHS, VecTy, It->second.first);
24041 "Expected same type as operand.");
24042 LHS = PropagateIRFlags(
LHS);
24044 E->VectorizedValue =
LHS;
24045 ++NumVectorInstructions;
24049 V0 = Builder.CreateCast(
24051 V1 = Builder.CreateCast(
24056 for (
Value *V : {V0, V1}) {
24058 GatherShuffleExtractSeq.insert(
I);
24059 CSEBlocks.insert(
I->getParent());
24067 SmallVector<int>
Mask;
24068 E->buildAltOpShuffleMask(
24069 [
E,
this](Instruction *
I) {
24070 assert(
E->getMatchingMainOpOrAltOp(
I) &&
24071 "Unexpected main/alternate opcode");
24075 Mask, &OpScalars, &AltScalars);
24077 PropagateIRFlags(V0,
E->getOpcode(), OpScalars);
24078 PropagateIRFlags(V1,
E->getAltOpcode(), AltScalars);
24084 V = Builder.CreateShuffleVector(V0, V1, Mask);
24086 GatherShuffleExtractSeq.insert(
I);
24087 CSEBlocks.insert(
I->getParent());
24091 E->VectorizedValue =
V;
24092 ++NumVectorInstructions;
24096 case TreeEntry::ReducedBitcast:
24097 case TreeEntry::ReducedBitcastBSwap: {
24098 assert(UserIgnoreList &&
"Expected reduction operations only.");
24099 setInsertPointAfterBundle(
E);
24100 TreeEntry *ZExt = getOperandEntry(
E, 0);
24102 ZExt->getMainOp()->getType(), ZExt->getVectorFactor()));
24103 TreeEntry *
Const = getOperandEntry(
E, 1);
24105 Const->Scalars.front()->getType(),
Const->getVectorFactor()));
24106 Value *
Op = vectorizeOperand(ZExt, 0);
24109 DL->getTypeSizeInBits(
cast<CastInst>(ZExt->getMainOp())->getSrcTy()) *
24110 E->getVectorFactor());
24111 auto *OrigScalarTy = ScalarTy;
24114 Op = FinalShuffle(
Op,
E);
24115 auto *
V = Builder.CreateBitCast(
Op, SrcType);
24116 ++NumVectorInstructions;
24117 if (ShuffleOrOp == TreeEntry::ReducedBitcastBSwap) {
24118 V = Builder.CreateUnaryIntrinsic(Intrinsic::bswap, V);
24119 ++NumVectorInstructions;
24121 if (SrcType != OrigScalarTy) {
24122 V = Builder.CreateIntCast(V, OrigScalarTy,
false);
24123 ++NumVectorInstructions;
24125 E->VectorizedValue =
V;
24128 case TreeEntry::ReducedBitcastLoads:
24129 case TreeEntry::ReducedBitcastBSwapLoads: {
24130 assert(UserIgnoreList &&
"Expected reduction operations only.");
24131 TreeEntry *ZExt = getOperandEntry(
E, 0);
24132 TreeEntry *
Load = getOperandEntry(ZExt, 0);
24133 setInsertPointAfterBundle(Load);
24135 ZExt->getMainOp()->getType(), ZExt->getVectorFactor()));
24136 TreeEntry *
Const = getOperandEntry(
E, 1);
24138 Const->Scalars.front()->getType(),
Const->getVectorFactor()));
24140 Load->getMainOp()->getType(),
Load->getVectorFactor()));
24142 Value *PO = LI->getPointerOperand();
24145 DL->getTypeSizeInBits(
cast<CastInst>(ZExt->getMainOp())->getSrcTy()) *
24146 E->getVectorFactor());
24147 auto *OrigScalarTy = ScalarTy;
24148 ScalarTy = ZExt->getMainOp()->getType();
24149 Value *
V = Builder.CreateAlignedLoad(SrcTy, PO, LI->getAlign());
24150 ++NumVectorInstructions;
24151 if (ShuffleOrOp == TreeEntry::ReducedBitcastBSwapLoads) {
24152 V = Builder.CreateUnaryIntrinsic(Intrinsic::bswap, V);
24153 ++NumVectorInstructions;
24155 if (SrcTy != OrigScalarTy) {
24156 V = Builder.CreateIntCast(V, OrigScalarTy,
false);
24157 ++NumVectorInstructions;
24159 E->VectorizedValue =
V;
24162 case TreeEntry::ReducedCmpBitcast: {
24163 assert(UserIgnoreList &&
"Expected reduction operations only.");
24164 setInsertPointAfterBundle(
E);
24165 TreeEntry *Op1TE = getOperandEntry(
E, 1);
24166 TreeEntry *Op2TE = getOperandEntry(
E, 2);
24167 Op1TE->VectorizedValue =
24169 Op2TE->VectorizedValue =
24174 IntegerType::getIntNTy(ScalarTy->
getContext(),
E->getVectorFactor());
24175 auto *
V = Builder.CreateBitCast(Cmp, DstTy);
24176 ++NumVectorInstructions;
24177 if (DstTy != ScalarTy) {
24178 V = Builder.CreateIntCast(V, ScalarTy,
false);
24179 ++NumVectorInstructions;
24181 E->VectorizedValue =
V;
24198 ArrayRef<std::tuple<WeakTrackingVH, unsigned, bool, bool>>
24199 VectorValuesAndScales) {
24202 EntryToLastInstruction.clear();
24204 for (
auto &BSIter : BlocksSchedules)
24205 scheduleBlock(*
this, BSIter.second.get());
24208 for (
const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
24211 if (TE->isGather() || DeletedNodes.contains(TE.get()) ||
24212 (TE->State == TreeEntry::CombinedVectorize &&
24213 (TE->CombinedOp == TreeEntry::ReducedBitcast ||
24214 TE->CombinedOp == TreeEntry::ReducedBitcastBSwap ||
24215 ((TE->CombinedOp == TreeEntry::ReducedBitcastLoads ||
24216 TE->CombinedOp == TreeEntry::ReducedBitcastBSwapLoads ||
24217 TE->CombinedOp == TreeEntry::ReducedCmpBitcast) &&
24218 (!TE->hasState() || TE->getOpcode() != Instruction::Load)))))
24220 (void)getLastInstructionInBundle(TE.get());
24224 Builder.SetInsertPoint(ReductionRoot->
getParent(),
24227 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
24235 for (
const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
24236 if (DeletedNodes.contains(TE.get()))
24238 if (TE->isGather() && !TE->VectorizedValue && TE->UserTreeIndex.UserTE &&
24239 TE->UserTreeIndex.UserTE->hasState() &&
24240 TE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
24241 (TE->UserTreeIndex.UserTE->getOpcode() != Instruction::PHI ||
24242 TE->UserTreeIndex.UserTE->isAltShuffle()) &&
24243 !TE->UserTreeIndex.UserTE->hasCopyableElements()) {
24244 const TreeEntry *UserTE = TE->UserTreeIndex.UserTE;
24245 auto [It, Inserted] =
24246 UserTEScalarsUsedOutsideBlockCache.
try_emplace(UserTE);
24248 It->second =
all_of(UserTE->Scalars,
24249 [](
Value *V) { return isUsedOutsideBlock(V); });
24252 Instruction &LastInst = getLastInstructionInBundle(UserTE);
24256 for (
auto &Entry : GatherEntries) {
24258 Builder.SetInsertPoint(Entry.second);
24259 Builder.SetCurrentDebugLocation(Entry.second->getDebugLoc());
24264 for (
const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
24265 if (DeletedNodes.contains(TE.get()))
24267 if (GatheredLoadsEntriesFirst.has_value() &&
24268 TE->Idx >= *GatheredLoadsEntriesFirst && !TE->VectorizedValue &&
24269 (!TE->isGather() || TE->UserTreeIndex)) {
24270 assert((TE->UserTreeIndex ||
24271 (TE->getOpcode() == Instruction::Load && !TE->isGather())) &&
24272 "Expected gathered load node.");
24281 for (
const TreeEntry *E : PostponedNodes) {
24282 auto *TE =
const_cast<TreeEntry *
>(E);
24284 TE->VectorizedValue =
nullptr;
24295 (TE->UserTreeIndex.UserTE->hasState() &&
24296 TE->UserTreeIndex.UserTE->State != TreeEntry::SplitVectorize &&
24297 TE->UserTreeIndex.UserTE->getOpcode() == Instruction::PHI)) {
24306 if (UI->comesBefore(InsertPt))
24309 Builder.SetInsertPoint(InsertPt);
24311 Builder.SetInsertPoint(PrevVec);
24313 Builder.SetCurrentDebugLocation(UserI->getDebugLoc());
24316 VecI && VecI->getParent() == Builder.GetInsertBlock() &&
24317 Builder.GetInsertPoint()->comesBefore(VecI))
24318 VecI->moveBeforePreserving(*Builder.GetInsertBlock(),
24319 Builder.GetInsertPoint());
24320 if (Vec->
getType() != PrevVec->getType()) {
24322 PrevVec->getType()->isIntOrIntVectorTy() &&
24323 "Expected integer vector types only.");
24324 std::optional<bool> IsSigned;
24325 for (
Value *V : TE->Scalars) {
24327 for (
const TreeEntry *MNTE : getTreeEntries(V)) {
24328 auto It = MinBWs.find(MNTE);
24329 if (It != MinBWs.end()) {
24330 IsSigned = IsSigned.value_or(
false) || It->second.second;
24335 if (IsSigned.value_or(
false))
24338 for (
const TreeEntry *BVE : ValueToGatherNodes.lookup(V)) {
24339 auto It = MinBWs.find(BVE);
24340 if (It != MinBWs.end()) {
24341 IsSigned = IsSigned.value_or(
false) || It->second.second;
24346 if (IsSigned.value_or(
false))
24350 IsSigned.value_or(
false) ||
24354 if (IsSigned.value_or(
false))
24358 if (IsSigned.value_or(
false)) {
24360 auto It = MinBWs.find(TE->UserTreeIndex.UserTE);
24361 if (It != MinBWs.end())
24362 IsSigned = It->second.second;
24365 "Expected user node or perfect diamond match in MinBWs.");
24366 Vec = Builder.CreateIntCast(Vec, PrevVec->
getType(), *IsSigned);
24368 PrevVec->replaceAllUsesWith(Vec);
24369 PostponedValues.
try_emplace(Vec).first->second.push_back(TE);
24372 auto It = PostponedValues.
find(PrevVec);
24373 if (It != PostponedValues.
end()) {
24374 for (TreeEntry *VTE : It->getSecond())
24375 VTE->VectorizedValue = Vec;