74#ifdef EXPENSIVE_CHECKS
109using namespace std::placeholders;
111#define SV_NAME "slp-vectorizer"
112#define DEBUG_TYPE "SLP"
114STATISTIC(NumVectorInstructions,
"Number of vector instructions generated");
117 "Controls which SLP graphs should be vectorized.");
121 cl::desc(
"Run the SLP vectorization passes"));
125 cl::desc(
"Enable vectorization for wider vector utilization"));
129 cl::desc(
"Only vectorize if you gain more than this "
134 cl::desc(
"Attempt to vectorize horizontal reductions"));
139 "Attempt to vectorize horizontal reductions feeding into a store"));
143 cl::desc(
"Improve the code quality by splitting alternate instructions"));
147 cl::desc(
"Attempt to vectorize for this register size in bits"));
151 cl::desc(
"Maximum SLP vectorization factor (0=unlimited)"));
159 cl::desc(
"Limit the size of the SLP scheduling region per block"));
163 cl::desc(
"Attempt to vectorize for this register size in bits"));
167 cl::desc(
"Limit the recursion depth when building a vectorizable tree"));
171 cl::desc(
"Only vectorize small trees if they are fully vectorizable"));
177 cl::desc(
"The maximum look-ahead depth for operand reordering scores"));
186 cl::desc(
"The maximum look-ahead depth for searching best rooting option"));
190 cl::desc(
"The minimum number of loads, which should be considered strided, "
191 "if the stride is > 1 or is runtime value"));
195 cl::desc(
"The maximum stride, considered to be profitable."));
199 cl::desc(
"Disable tree reordering even if it is "
200 "profitable. Used for testing only."));
204 cl::desc(
"Generate strided loads even if they are not "
205 "profitable. Used for testing only."));
209 cl::desc(
"Display the SLP trees with Graphviz"));
213 cl::desc(
"Try to vectorize with non-power-of-2 number of elements."));
218 cl::desc(
"Try to replace values with the idempotent instructions for "
219 "better vectorization."));
223 cl::desc(
"Loop trip count, considered by the cost model during "
224 "modeling (0=loops are ignored and considered flat code)"));
256 Ty = Ty->getScalarType();
258 !Ty->isPPC_FP128Ty();
267 return SI->getValueOperand()->getType();
269 return CI->getOperand(0)->getType();
272 return IE->getOperand(1)->getType();
279 "ScalableVectorType is not supported.");
281 return VecTy->getNumElements();
295 Type *Ty,
unsigned Sz) {
300 if (NumParts == 0 || NumParts >= Sz)
315 if (NumParts == 0 || NumParts >= Sz)
320 return (Sz / RegVF) * RegVF;
332 I * VecTyNumElements, VecTyNumElements)))
334 : Mask[
I] * VecTyNumElements + J;
368 unsigned SVNumElements =
370 unsigned ShuffleMaskSize = SV->getShuffleMask().size();
371 if (SVNumElements % ShuffleMaskSize != 0)
373 unsigned GroupSize = SVNumElements / ShuffleMaskSize;
374 if (GroupSize == 0 || (VL.
size() % GroupSize) != 0)
376 unsigned NumGroup = 0;
377 for (
size_t I = 0,
E = VL.
size();
I !=
E;
I += GroupSize) {
379 Value *Src = SV->getOperand(0);
385 if (SV->getOperand(0) != Src)
388 if (!SV->isExtractSubvectorMask(Index))
390 ExpectedIndex.
set(Index / ShuffleMaskSize);
394 if (!ExpectedIndex.
all())
398 assert(NumGroup == (VL.
size() / GroupSize) &&
"Unexpected number of groups");
417 unsigned SVNumElements =
420 unsigned AccumulateLength = 0;
421 for (
Value *V : VL) {
423 for (
int M : SV->getShuffleMask())
425 : AccumulateLength + M);
426 AccumulateLength += SVNumElements;
467 return std::min<unsigned>(PartNumElems,
Size - Part * PartNumElems);
476 OS <<
"Idx: " << Idx <<
", ";
477 OS <<
"n=" << VL.
size() <<
" [" << *VL.
front() <<
", ..]";
500 if (BB !=
II->getParent())
517 Value *FirstNonUndef =
nullptr;
518 for (
Value *V : VL) {
521 if (!FirstNonUndef) {
525 if (V != FirstNonUndef)
528 return FirstNonUndef !=
nullptr;
543 bool IsCopyable =
false) {
545 return Cmp->isCommutative();
547 return BO->isCommutative() ||
548 (BO->getOpcode() == Instruction::Sub &&
556 if (match(U.getUser(),
557 m_ICmp(Pred, m_Specific(U.get()), m_Zero())) &&
558 (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE))
562 auto *I = dyn_cast<BinaryOperator>(U.get());
563 return match(U.getUser(),
564 m_Intrinsic<Intrinsic::abs>(
565 m_Specific(U.get()), m_ConstantInt(Flag))) &&
566 ((!IsCopyable && I && !I->hasNoSignedWrap()) ||
569 (BO->getOpcode() == Instruction::FSub &&
573 return match(U.getUser(),
574 m_Intrinsic<Intrinsic::fabs>(m_Specific(U.get())));
576 return I->isCommutative();
583 bool IsCopyable =
false) {
585 "The instruction is not commutative.");
589 switch (BO->getOpcode()) {
590 case Instruction::Sub:
591 case Instruction::FSub:
597 return I->isCommutableOperand(
Op);
617 constexpr unsigned IntrinsicNumOperands = 2;
618 return IntrinsicNumOperands;
620 return I->getNumOperands();
626 static_assert(std::is_same_v<T, InsertElementInst> ||
627 std::is_same_v<T, ExtractElementInst>,
637 if (CI->getValue().uge(VT->getNumElements()))
639 Index *= VT->getNumElements();
640 Index += CI->getZExtValue();
662 Type *CurrentType =
IV->getType();
663 for (
unsigned I :
IV->indices()) {
665 Index *= ST->getNumElements();
666 CurrentType = ST->getElementType(
I);
668 Index *= AT->getNumElements();
669 CurrentType = AT->getElementType();
691 return std::all_of(It, VL.
end(), [&](
Value *V) {
692 if (auto *CI = dyn_cast<CmpInst>(V))
693 return BasePred == CI->getPredicate();
694 if (auto *I = dyn_cast<Instruction>(V))
695 return I->getOpcode() == Opcode;
696 return isa<PoisonValue>(V);
724 if (MaskArg == UseMask::UndefsAsMask)
728 if (MaskArg == UseMask::FirstArg &&
Value < VF)
729 UseMask.reset(
Value);
730 else if (MaskArg == UseMask::SecondArg &&
Value >= VF)
731 UseMask.reset(
Value - VF);
739template <
bool IsPoisonOnly = false>
743 using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
751 if (!UseMask.empty()) {
762 if (*Idx < UseMask.size() && !UseMask.test(*Idx))
777 for (
unsigned I = 0,
E = VecTy->getNumElements();
I !=
E; ++
I) {
778 if (
Constant *Elem =
C->getAggregateElement(
I))
780 (UseMask.empty() || (
I < UseMask.size() && !UseMask.test(
I))))
808static std::optional<TargetTransformInfo::ShuffleKind>
815 std::accumulate(VL.
begin(), VL.
end(), 0u, [](
unsigned S,
Value *V) {
816 auto *EI = dyn_cast<ExtractElementInst>(V);
819 auto *VTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
822 return std::max(S, VTy->getNumElements());
825 Value *Vec1 =
nullptr;
826 Value *Vec2 =
nullptr;
831 Value *Vec = EE->getVectorOperand();
837 ShuffleMode CommonShuffleMode =
Unknown;
839 for (
unsigned I = 0,
E = VL.
size();
I <
E; ++
I) {
846 auto *Vec = EI->getVectorOperand();
860 if (Idx->getValue().uge(
Size))
862 unsigned IntIdx = Idx->getValue().getZExtValue();
869 if (!Vec1 || Vec1 == Vec) {
871 }
else if (!Vec2 || Vec2 == Vec) {
877 if (CommonShuffleMode == Permute)
881 if (Mask[
I] %
Size !=
I) {
882 CommonShuffleMode = Permute;
885 CommonShuffleMode =
Select;
888 if (CommonShuffleMode ==
Select && Vec2)
898 unsigned Opcode =
E->getOpcode();
899 assert((Opcode == Instruction::ExtractElement ||
900 Opcode == Instruction::ExtractValue) &&
901 "Expected extractelement or extractvalue instruction.");
902 if (Opcode == Instruction::ExtractElement) {
908 unsigned Idx = CI->getZExtValue();
916 if (EI->getNumIndices() != 1)
918 return *EI->idx_begin();
952class BinOpSameOpcodeHelper {
953 using MaskType = std::uint_fast32_t;
955 constexpr static std::initializer_list<unsigned> SupportedOp = {
956 Instruction::Add, Instruction::Sub, Instruction::Mul, Instruction::Shl,
957 Instruction::AShr, Instruction::And, Instruction::Or, Instruction::Xor};
959 "SupportedOp is not sorted.");
977 static std::pair<ConstantInt *, unsigned>
978 isBinOpWithConstantInt(
const Instruction *
I) {
979 unsigned Opcode =
I->getOpcode();
985 if (Opcode == Instruction::Sub || Opcode == Instruction::Shl ||
986 Opcode == Instruction::AShr)
992 struct InterchangeableInfo {
995 MaskType Mask = MainOpBIT | XorBIT | OrBIT | AndBIT | SubBIT | AddBIT |
996 MulBIT | AShrBIT | ShlBIT;
1001 MaskType SeenBefore = 0;
1002 InterchangeableInfo(
const Instruction *I) : I(I) {}
1006 bool trySet(MaskType OpcodeInMaskForm, MaskType InterchangeableMask) {
1007 if (Mask & InterchangeableMask) {
1008 SeenBefore |= OpcodeInMaskForm;
1009 Mask &= InterchangeableMask;
1014 bool equal(
unsigned Opcode) {
1015 return Opcode == I->getOpcode() && trySet(MainOpBIT, MainOpBIT);
1018 MaskType Candidate = Mask & SeenBefore;
1019 if (Candidate & MainOpBIT)
1020 return I->getOpcode();
1021 if (Candidate & ShlBIT)
1022 return Instruction::Shl;
1023 if (Candidate & AShrBIT)
1024 return Instruction::AShr;
1025 if (Candidate & MulBIT)
1026 return Instruction::Mul;
1027 if (Candidate & AddBIT)
1028 return Instruction::Add;
1029 if (Candidate & SubBIT)
1030 return Instruction::Sub;
1031 if (Candidate & AndBIT)
1032 return Instruction::And;
1033 if (Candidate & OrBIT)
1034 return Instruction::Or;
1035 if (Candidate & XorBIT)
1036 return Instruction::Xor;
1041 bool hasCandidateOpcode(
unsigned Opcode)
const {
1042 MaskType Candidate = Mask & SeenBefore;
1044 case Instruction::Shl:
1045 return Candidate & ShlBIT;
1046 case Instruction::AShr:
1047 return Candidate & AShrBIT;
1048 case Instruction::Mul:
1049 return Candidate & MulBIT;
1050 case Instruction::Add:
1051 return Candidate & AddBIT;
1052 case Instruction::Sub:
1053 return Candidate & SubBIT;
1054 case Instruction::And:
1055 return Candidate & AndBIT;
1056 case Instruction::Or:
1057 return Candidate & OrBIT;
1058 case Instruction::Xor:
1059 return Candidate & XorBIT;
1060 case Instruction::LShr:
1061 case Instruction::FAdd:
1062 case Instruction::FSub:
1063 case Instruction::FMul:
1064 case Instruction::SDiv:
1065 case Instruction::UDiv:
1066 case Instruction::FDiv:
1067 case Instruction::SRem:
1068 case Instruction::URem:
1069 case Instruction::FRem:
1079 unsigned FromOpcode = I->getOpcode();
1080 if (FromOpcode == ToOpcode)
1083 auto [CI, Pos] = isBinOpWithConstantInt(I);
1084 const APInt &FromCIValue = CI->getValue();
1085 unsigned FromCIValueBitWidth = FromCIValue.
getBitWidth();
1086 Type *RHSType = I->getOperand(Pos)->getType();
1088 switch (FromOpcode) {
1089 case Instruction::Shl:
1090 if (ToOpcode == Instruction::Mul) {
1091 RHS = ConstantInt::get(
1095 assert(FromCIValue.
isZero() &&
"Cannot convert the instruction.");
1100 case Instruction::Mul:
1102 if (ToOpcode == Instruction::Shl) {
1103 RHS = ConstantInt::get(
1104 RHSType, APInt(FromCIValueBitWidth, FromCIValue.
logBase2()));
1106 assert(FromCIValue.
isOne() &&
"Cannot convert the instruction.");
1111 case Instruction::Add:
1112 case Instruction::Sub:
1113 if (FromCIValue.
isZero()) {
1118 "Cannot convert the instruction.");
1119 APInt NegatedVal = APInt(FromCIValue);
1120 NegatedVal.negate();
1121 RHS = ConstantInt::get(RHSType, NegatedVal);
1124 case Instruction::And:
1130 assert(FromCIValue.
isZero() &&
"Cannot convert the instruction.");
1135 Value *
LHS = I->getOperand(1 - Pos);
1144 InterchangeableInfo MainOp;
1145 InterchangeableInfo AltOp;
1147 return ::isValidForAlternation(MainOp.I->getOpcode()) &&
1150 bool initializeAltOp(
const Instruction *
I) {
1160 BinOpSameOpcodeHelper(
const Instruction *MainOp,
1161 const Instruction *AltOp =
nullptr)
1162 : MainOp(MainOp), AltOp(AltOp) {}
1163 bool add(
const Instruction *
I) {
1165 "BinOpSameOpcodeHelper only accepts BinaryOperator.");
1166 unsigned Opcode =
I->getOpcode();
1167 MaskType OpcodeInMaskForm;
1170 case Instruction::Shl:
1171 OpcodeInMaskForm = ShlBIT;
1173 case Instruction::AShr:
1174 OpcodeInMaskForm = AShrBIT;
1176 case Instruction::Mul:
1177 OpcodeInMaskForm = MulBIT;
1179 case Instruction::Add:
1180 OpcodeInMaskForm = AddBIT;
1182 case Instruction::Sub:
1183 OpcodeInMaskForm = SubBIT;
1185 case Instruction::And:
1186 OpcodeInMaskForm = AndBIT;
1188 case Instruction::Or:
1189 OpcodeInMaskForm = OrBIT;
1191 case Instruction::Xor:
1192 OpcodeInMaskForm = XorBIT;
1195 return MainOp.equal(Opcode) ||
1196 (initializeAltOp(
I) && AltOp.equal(Opcode));
1198 MaskType InterchangeableMask = OpcodeInMaskForm;
1199 ConstantInt *CI = isBinOpWithConstantInt(
I).first;
1201 constexpr MaskType CanBeAll =
1202 XorBIT | OrBIT | AndBIT | SubBIT | AddBIT | MulBIT | AShrBIT | ShlBIT;
1203 const APInt &CIValue = CI->
getValue();
1205 case Instruction::Shl:
1207 InterchangeableMask = CIValue.
isZero() ? CanBeAll : MulBIT | ShlBIT;
1209 case Instruction::Mul:
1210 if (CIValue.
isOne()) {
1211 InterchangeableMask = CanBeAll;
1215 InterchangeableMask = MulBIT | ShlBIT;
1217 case Instruction::Add:
1218 case Instruction::Sub:
1219 InterchangeableMask = CIValue.
isZero() ? CanBeAll : SubBIT | AddBIT;
1221 case Instruction::And:
1223 InterchangeableMask = CanBeAll;
1225 case Instruction::Xor:
1227 InterchangeableMask = XorBIT | OrBIT | SubBIT | AddBIT;
1231 InterchangeableMask = CanBeAll;
1235 return MainOp.trySet(OpcodeInMaskForm, InterchangeableMask) ||
1236 (initializeAltOp(
I) &&
1237 AltOp.trySet(OpcodeInMaskForm, InterchangeableMask));
1239 unsigned getMainOpcode()
const {
return MainOp.getOpcode(); }
1241 bool hasCandidateOpcode(
unsigned Opcode)
const {
1242 return MainOp.hasCandidateOpcode(Opcode);
1244 bool hasAltOp()
const {
return AltOp.I; }
1245 unsigned getAltOpcode()
const {
1246 return hasAltOp() ? AltOp.getOpcode() : getMainOpcode();
1249 return MainOp.getOperand(
I);
1254class InstructionsState {
1280 bool HasCopyables =
false;
1284 assert(valid() &&
"InstructionsState is invalid.");
1289 assert(valid() &&
"InstructionsState is invalid.");
1294 unsigned getOpcode()
const {
return getMainOp()->getOpcode(); }
1296 unsigned getAltOpcode()
const {
return getAltOp()->getOpcode(); }
1299 bool isAltShuffle()
const {
return getMainOp() != getAltOp(); }
1308 Instruction *getMatchingMainOpOrAltOp(Instruction *
I)
const {
1309 assert(MainOp &&
"MainOp cannot be nullptr.");
1310 if (
I->getOpcode() == MainOp->getOpcode())
1312 if (MainOp->getOpcode() == Instruction::Select &&
1313 I->getOpcode() == Instruction::ZExt && !isAltShuffle())
1316 assert(AltOp &&
"AltOp cannot be nullptr.");
1317 if (
I->getOpcode() == AltOp->getOpcode())
1319 if (!
I->isBinaryOp())
1321 BinOpSameOpcodeHelper
Converter(MainOp);
1324 if (isAltShuffle() && !
Converter.hasCandidateOpcode(MainOp->getOpcode())) {
1325 BinOpSameOpcodeHelper AltConverter(AltOp);
1326 if (AltConverter.add(
I) && AltConverter.add(AltOp) &&
1327 AltConverter.hasCandidateOpcode(AltOp->getOpcode()))
1330 if (
Converter.hasAltOp() && !isAltShuffle())
1332 return Converter.hasAltOp() ? AltOp : MainOp;
1336 bool isShiftOp()
const {
1337 return getMainOp()->isShift() && getAltOp()->isShift();
1342 return getMainOp()->isBitwiseLogicOp() && getAltOp()->isBitwiseLogicOp();
1346 bool isMulDivLikeOp()
const {
1347 constexpr std::array<unsigned, 8> MulDiv = {
1348 Instruction::Mul, Instruction::FMul, Instruction::SDiv,
1349 Instruction::UDiv, Instruction::FDiv, Instruction::SRem,
1350 Instruction::URem, Instruction::FRem};
1356 bool isAddSubLikeOp()
const {
1357 constexpr std::array<unsigned, 4>
AddSub = {
1358 Instruction::Add, Instruction::Sub, Instruction::FAdd,
1365 bool isCmpOp()
const {
1366 return (
getOpcode() == Instruction::ICmp ||
1372 bool valid()
const {
return MainOp && AltOp; }
1374 explicit operator bool()
const {
return valid(); }
/// Default construction is forbidden: a state always needs explicit
/// main/alternate instructions (use invalid() for the sentinel state).
1376 InstructionsState() =
delete;
1377 InstructionsState(Instruction *MainOp, Instruction *AltOp,
1378 bool HasCopyables =
false)
1379 : MainOp(MainOp), AltOp(AltOp), HasCopyables(HasCopyables) {}
1380 static InstructionsState invalid() {
return {
nullptr,
nullptr}; }
1383 bool isCopyableElement(
Value *V)
const {
1384 assert(valid() &&
"InstructionsState is invalid.");
1387 if (isAltShuffle() ||
getOpcode() == Instruction::GetElementPtr)
1392 if (
I->getParent() != MainOp->getParent() &&
1396 if (
I->getOpcode() == MainOp->getOpcode())
1398 if (!
I->isBinaryOp())
1400 BinOpSameOpcodeHelper
Converter(MainOp);
1406 bool isNonSchedulable(
Value *V)
const {
1407 assert(valid() &&
"InstructionsState is invalid.");
1414 if (getMainOp() == V)
1416 if (isCopyableElement(V)) {
1417 auto IsNonSchedulableCopyableElement = [
this](
Value *
V) {
1419 return !
I ||
isa<PHINode>(
I) ||
I->getParent() != MainOp->getParent() ||
1424 !MainOp->comesBefore(
I));
1427 return IsNonSchedulableCopyableElement(V);
1434 bool areInstructionsWithCopyableElements()
const {
1435 assert(valid() &&
"InstructionsState is invalid.");
1436 return HasCopyables;
1440std::pair<Instruction *, SmallVector<Value *>>
1442 Instruction *SelectedOp = S.getMatchingMainOpOrAltOp(
I);
1443 assert(SelectedOp &&
"Cannot convert the instruction.");
1444 if (
I->isBinaryOp()) {
1446 return std::make_pair(SelectedOp,
Converter.getOperand(SelectedOp));
1465 for (
Value *V : VL) {
1470 if (Inst->getOpcode() == Opcode)
1484 BaseOp0 == Op0 || BaseOp1 == Op1 ||
1495 "Assessing comparisons of different types?");
1505 return (BasePred == Pred &&
1507 (BasePred == SwappedPred &&
1518 return InstructionsState::invalid();
1522 return InstructionsState::invalid();
1527 (VL.
size() == 2 && InstCnt < 2))
1528 return InstructionsState::invalid();
1537 unsigned AltOpcode = Opcode;
1539 BinOpSameOpcodeHelper BinOpHelper(MainOp);
1540 bool SwappedPredsCompatible = IsCmpOp && [&]() {
1542 UniquePreds.
insert(BasePred);
1543 UniqueNonSwappedPreds.
insert(BasePred);
1544 for (
Value *V : VL) {
1551 UniqueNonSwappedPreds.
insert(CurrentPred);
1552 if (!UniquePreds.
contains(CurrentPred) &&
1553 !UniquePreds.
contains(SwappedCurrentPred))
1554 UniquePreds.
insert(CurrentPred);
1559 return UniqueNonSwappedPreds.
size() > 2 && UniquePreds.
size() == 2;
1569 return InstructionsState::invalid();
1571 bool AnyPoison = InstCnt != VL.
size();
1582 if (AnyPoison && (
I->isIntDivRem() ||
I->isFPDivRem() ||
isa<CallInst>(
I)))
1583 return InstructionsState::invalid();
1584 unsigned InstOpcode =
I->getOpcode();
1586 if (BinOpHelper.add(
I))
1591 Value *Op1 =
I->getOperand(0);
1594 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
1596 if (Opcode == AltOpcode) {
1599 "Cast isn't safe for alternation, logic needs to be updated!");
1600 AltOpcode = InstOpcode;
1607 Type *Ty0 = BaseInst->getOperand(0)->getType();
1608 Type *Ty1 = Inst->getOperand(0)->getType();
1610 assert(InstOpcode == Opcode &&
"Expected same CmpInst opcode.");
1611 assert(InstOpcode == AltOpcode &&
1612 "Alternate instructions are only supported by BinaryOperator "
1620 if ((VL.
size() == 2 || SwappedPredsCompatible) &&
1621 (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
1627 if (MainOp != AltOp) {
1630 }
else if (BasePred != CurrentPred) {
1633 "CmpInst isn't safe for alternation, logic needs to be updated!");
1638 if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
1639 AltPred == CurrentPred || AltPred == SwappedCurrentPred)
1642 }
else if (InstOpcode == Opcode) {
1643 assert(InstOpcode == AltOpcode &&
1644 "Alternate instructions are only supported by BinaryOperator and "
1647 if (Gep->getNumOperands() != 2 ||
1649 return InstructionsState::invalid();
1652 return InstructionsState::invalid();
1655 if (!LI->isSimple() || !BaseLI->isSimple())
1656 return InstructionsState::invalid();
1660 return InstructionsState::invalid();
1661 if (
Call->hasOperandBundles() &&
1663 !std::equal(
Call->op_begin() +
Call->getBundleOperandsStartIndex(),
1664 Call->op_begin() +
Call->getBundleOperandsEndIndex(),
1667 return InstructionsState::invalid();
1670 return InstructionsState::invalid();
1673 if (Mappings.
size() != BaseMappings.
size() ||
1674 Mappings.
front().ISA != BaseMappings.
front().ISA ||
1675 Mappings.
front().ScalarName != BaseMappings.
front().ScalarName ||
1676 Mappings.
front().VectorName != BaseMappings.
front().VectorName ||
1677 Mappings.
front().Shape.VF != BaseMappings.
front().Shape.VF ||
1678 Mappings.
front().Shape.Parameters !=
1679 BaseMappings.
front().Shape.Parameters)
1680 return InstructionsState::invalid();
1685 return InstructionsState::invalid();
1690 assert(MainOp &&
"Cannot find MainOp with Opcode from BinOpHelper.");
1692 assert(AltOp &&
"Cannot find AltOp with Opcode from BinOpHelper.");
1695 "Incorrect implementation of allSameOpcode.");
1696 InstructionsState S(MainOp, AltOp);
1702 "Invalid InstructionsState.");
1710 return all_of(VL, [&](
Value *V) {
return V->getType() == Ty; });
1720 unsigned Opcode = UserInst->
getOpcode();
1722 case Instruction::Load: {
1726 case Instruction::Store: {
1728 return (
SI->getPointerOperand() == Scalar);
1730 case Instruction::Call: {
1734 return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index(), TTI) &&
1735 Arg.value().get() == Scalar;
1755 return LI->isSimple();
1757 return SI->isSimple();
1759 return !
MI->isVolatile();
1767 bool ExtendingManyInputs =
false) {
1768 if (SubMask.
empty())
1771 (!ExtendingManyInputs || SubMask.
size() > Mask.size() ||
1774 "SubMask with many inputs support must be larger than the mask.");
1776 Mask.append(SubMask.
begin(), SubMask.
end());
1780 int TermValue = std::min(Mask.size(), SubMask.
size());
1781 for (
int I = 0,
E = SubMask.
size();
I <
E; ++
I) {
1783 (!ExtendingManyInputs &&
1784 (SubMask[
I] >= TermValue || Mask[SubMask[
I]] >= TermValue)))
1786 NewMask[
I] = Mask[SubMask[
I]];
1802 const size_t Sz = Order.
size();
1805 for (
unsigned I = 0;
I < Sz; ++
I) {
1807 UnusedIndices.
reset(Order[
I]);
1809 MaskedIndices.
set(
I);
1811 if (MaskedIndices.
none())
1814 "Non-synced masked/available indices.");
1818 assert(Idx >= 0 &&
"Indices must be synced.");
1828 unsigned Opcode0,
unsigned Opcode1) {
1835 OpcodeMask.
set(Lane * ScalarTyNumElements,
1836 Lane * ScalarTyNumElements + ScalarTyNumElements);
1845 "Expected scalar constants.");
1848 std::fill_n(NewVal.begin() +
I * VF, VF, V);
1855 const unsigned E = Indices.
size();
1857 for (
unsigned I = 0;
I <
E; ++
I)
1858 Mask[Indices[
I]] =
I;
1864 assert(!Mask.empty() &&
"Expected non-empty mask.");
1868 for (
unsigned I = 0,
E = Prev.
size();
I <
E; ++
I)
1870 Scalars[Mask[
I]] = Prev[
I];
1883 auto *IO = dyn_cast<Instruction>(V);
1886 return isa<PHINode>(IO) || IO->getParent() != I->getParent();
1899 return !
I->mayReadOrWriteMemory() && !
I->hasNUsesOrMore(
UsesLimit) &&
1901 auto *IU = dyn_cast<Instruction>(U);
1904 return IU->getParent() != I->getParent() || isa<PHINode>(IU);
1920 return !VL.
empty() &&
1936 return NumParts > 0 && NumParts < Sz &&
has_single_bit(Sz / NumParts) &&
1945 const unsigned Limit = std::numeric_limits<unsigned>::max()) {
1946 unsigned NumParts =
TTI.getNumberOfParts(VecTy);
1947 if (NumParts == 0 || NumParts >= Limit)
1950 if (NumParts >= Sz || Sz % NumParts != 0 ||
1959 class ScheduleEntity;
1961 class ScheduleCopyableData;
1962 class ScheduleBundle;
1972 struct StridedPtrInfo {
1973 Value *StrideVal =
nullptr;
1974 const SCEV *StrideSCEV =
nullptr;
2000 : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li), DT(Dt),
2001 AC(AC), DB(DB), DL(DL), ORE(ORE),
2020 MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
2033 ArrayRef<std::tuple<WeakTrackingVH, unsigned, bool, bool>>
2034 VectorValuesAndScales = {});
2054 const SmallDenseSet<Value *> &UserIgnoreLst);
2061 assert(!VectorizableTree.empty() &&
"No graph to get the first node from");
2062 return VectorizableTree.front()->Scalars;
2068 const TreeEntry &Root = *VectorizableTree.front();
2069 if (Root.State != TreeEntry::Vectorize || Root.isAltShuffle() ||
2070 !Root.Scalars.
front()->getType()->isIntegerTy())
2071 return std::nullopt;
2072 auto It = MinBWs.find(&Root);
2073 if (It != MinBWs.end())
2077 if (Root.getOpcode() == Instruction::ZExt ||
2078 Root.getOpcode() == Instruction::SExt)
2079 return std::make_pair(
cast<CastInst>(Root.getMainOp())->getSrcTy(),
2080 Root.getOpcode() == Instruction::SExt);
2081 return std::nullopt;
2087 return MinBWs.at(VectorizableTree.front().get()).second;
2092 if (ReductionBitWidth == 0 ||
2093 !VectorizableTree.front()->Scalars.front()->getType()->isIntegerTy() ||
2094 ReductionBitWidth >=
2095 DL->getTypeSizeInBits(
2096 VectorizableTree.front()->Scalars.front()->getType()))
2098 VectorizableTree.front()->Scalars.front()->getType(),
2099 VectorizableTree.front()->getVectorFactor());
2102 VectorizableTree.front()->Scalars.front()->getContext(),
2104 VectorizableTree.front()->getVectorFactor());
2109 return VectorizableTree.front()->hasState() &&
2110 (VectorizableTree.front()->CombinedOp == TreeEntry::ReducedBitcast ||
2111 VectorizableTree.front()->CombinedOp ==
2112 TreeEntry::ReducedBitcastBSwap ||
2113 VectorizableTree.front()->CombinedOp ==
2114 TreeEntry::ReducedBitcastLoads ||
2115 VectorizableTree.front()->CombinedOp ==
2116 TreeEntry::ReducedBitcastBSwapLoads) &&
2117 VectorizableTree.front()->State == TreeEntry::Vectorize;
2122 return VectorizableTree.front()->hasState() &&
2123 VectorizableTree.front()->CombinedOp ==
2124 TreeEntry::ReducedCmpBitcast &&
2125 VectorizableTree.front()->State == TreeEntry::Vectorize;
2140 VectorizableTree.clear();
2141 ScalarToTreeEntries.clear();
2142 DeletedNodes.clear();
2143 TransformedToGatherNodes.clear();
2144 OperandsToTreeEntry.clear();
2145 ScalarsInSplitNodes.clear();
2147 NonScheduledFirst.clear();
2148 EntryToLastInstruction.clear();
2149 LastInstructionToPos.clear();
2150 LoadEntriesToVectorize.clear();
2151 IsGraphTransformMode =
false;
2152 GatheredLoadsEntriesFirst.reset();
2153 CompressEntryToData.clear();
2154 ExternalUses.clear();
2155 ExternalUsesAsOriginalScalar.clear();
2156 ExternalUsesWithNonUsers.clear();
2157 for (
auto &Iter : BlocksSchedules) {
2158 BlockScheduling *BS = Iter.second.get();
2162 ReductionBitWidth = 0;
2164 CastMaxMinBWSizes.reset();
2165 ExtraBitWidthNodes.clear();
2166 InstrElementSize.clear();
2167 UserIgnoreList =
nullptr;
2168 PostponedGathers.clear();
2169 ValueToGatherNodes.clear();
2170 TreeEntryToStridedPtrInfoMap.clear();
2171 CurrentLoopNest.clear();
2187 assert(!Order.
empty() &&
"expected non-empty order");
2188 const unsigned Sz = Order.
size();
2190 return P.value() ==
P.index() ||
P.value() == Sz;
2203 bool IgnoreReorder);
2216 std::optional<OrdersType>
2254 return MaxVecRegSize;
2259 return MinVecRegSize;
2267 unsigned MaxVF =
MaxVFOption.getNumOccurrences() ?
2268 MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode);
2269 return MaxVF ? MaxVF : UINT_MAX;
2291 Align Alignment,
const int64_t Diff,
2292 const size_t Sz)
const;
2332 Value *Ptr0,
Value *PtrN, StridedPtrInfo &SPtrInfo)
const;
2350 Align CommonAlignment,
2352 StridedPtrInfo &SPtrInfo)
const;
2367 StridedPtrInfo &SPtrInfo,
2368 unsigned *BestVF =
nullptr,
2369 bool TryRecursiveCheck =
true)
const;
2373 ListOfKnonwnNonVectorizableLoads.insert(
hash_value(VL));
2377 template <
typename T>
2379 return ListOfKnonwnNonVectorizableLoads.contains(
hash_value(VL));
2404 OS <<
"{User:" << (
UserTE ? std::to_string(
UserTE->Idx) :
"null")
2405 <<
" EdgeIdx:" <<
EdgeIdx <<
"}";
2430 : TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes),
2431 MaxLevel(MaxLevel) {}
2487 auto AllUsersAreInternal = [U1, U2,
this](
Value *V1,
Value *V2) {
2492 auto AllUsersVectorized = [U1, U2,
this](
Value *V) {
2494 return U == U1 || U == U2 || R.isVectorized(U);
2497 return AllUsersVectorized(V1) && AllUsersVectorized(V2);
2500 if (R.TTI->isLegalBroadcastLoad(V1->getType(),
2502 ((
int)V1->getNumUses() == NumLanes ||
2503 AllUsersAreInternal(V1, V2)))
2509 auto CheckSameEntryOrFail = [&]() {
2514 any_of(TEs2, [&](TreeEntry *E) {
return Set.contains(E); }))
2523 if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
2525 return CheckSameEntryOrFail();
2528 LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
2529 LI2->getPointerOperand(), DL, SE,
true);
2530 if (!Dist || *Dist == 0) {
2533 R.TTI->isLegalMaskedGather(
2536 return CheckSameEntryOrFail();
2540 if (std::abs(*Dist) > NumLanes / 2)
2573 Value *EV2 =
nullptr;
2586 int Dist = Idx2 - Idx1;
2589 if (std::abs(Dist) == 0)
2591 if (std::abs(Dist) > NumLanes / 2)
2598 return CheckSameEntryOrFail();
2604 if (I1->getParent() != I2->getParent())
2605 return CheckSameEntryOrFail();
2614 V->getType() ==
Cond->getType()) ||
2617 V->getType() ==
Cond->getType()))
2626 (S.getMainOp()->getNumOperands() <= 2 || !MainAltOps.
empty() ||
2627 !S.isAltShuffle()) &&
2631 S.getMainOp()->getNumOperands();
2643 return CheckSameEntryOrFail();
2677 int ShallowScoreAtThisLevel =
2688 if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
2691 (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
2693 ShallowScoreAtThisLevel))
2694 return ShallowScoreAtThisLevel;
2695 assert(I1 && I2 &&
"Should have early exited.");
2702 if (I1->getNumOperands() != I2->getNumOperands())
2704 for (
unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
2705 OpIdx1 != NumOperands1; ++OpIdx1) {
2707 int MaxTmpScore = 0;
2708 unsigned MaxOpIdx2 = 0;
2709 bool FoundBest =
false;
2713 ? I2->getNumOperands()
2714 : std::min(I2->getNumOperands(), OpIdx1 + 1);
2715 assert(FromIdx <= ToIdx &&
"Bad index");
2716 for (
unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
2718 if (Op2Used.
count(OpIdx2))
2723 I1, I2, CurrLevel + 1, {});
2726 TmpScore > MaxTmpScore) {
2727 MaxTmpScore = TmpScore;
2734 Op2Used.
insert(MaxOpIdx2);
2735 ShallowScoreAtThisLevel += MaxTmpScore;
2738 return ShallowScoreAtThisLevel;
2769 struct OperandData {
2770 OperandData() =
default;
2771 OperandData(
Value *V,
bool APO,
bool IsUsed)
2772 : V(V), APO(APO), IsUsed(IsUsed) {}
2782 bool IsUsed =
false;
2791 enum class ReorderingMode {
2805 unsigned ArgSize = 0;
2811 const Loop *L =
nullptr;
2814 OperandData &getData(
unsigned OpIdx,
unsigned Lane) {
2815 return OpsVec[
OpIdx][Lane];
2819 const OperandData &getData(
unsigned OpIdx,
unsigned Lane)
const {
2820 return OpsVec[
OpIdx][Lane];
2825 for (
unsigned OpIdx = 0, NumOperands = getNumOperands();
2827 for (
unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
2829 OpsVec[
OpIdx][Lane].IsUsed =
false;
2833 void swap(
unsigned OpIdx1,
unsigned OpIdx2,
unsigned Lane) {
2834 std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
2846 int getSplatScore(
unsigned Lane,
unsigned OpIdx,
unsigned Idx,
2848 Value *IdxLaneV = getData(Idx, Lane).V;
2861 unsigned UniquesCount = Uniques.
size();
2862 auto IdxIt = Uniques.
find(IdxLaneV);
2863 unsigned UniquesCntWithIdxLaneV =
2864 IdxIt != Uniques.
end() ? UniquesCount : UniquesCount + 1;
2866 auto OpIdxIt = Uniques.
find(OpIdxLaneV);
2867 unsigned UniquesCntWithOpIdxLaneV =
2868 OpIdxIt != Uniques.
end() ? UniquesCount : UniquesCount + 1;
2869 if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
2871 return std::min(
bit_ceil(UniquesCntWithOpIdxLaneV) -
2872 UniquesCntWithOpIdxLaneV,
2873 UniquesCntWithOpIdxLaneV -
2875 ((IdxIt != Uniques.
end() && UsedLanes.
test(IdxIt->second))
2876 ? UniquesCntWithIdxLaneV -
bit_floor(UniquesCntWithIdxLaneV)
2877 :
bit_ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
2886 int getExternalUseScore(
unsigned Lane,
unsigned OpIdx,
unsigned Idx)
const {
2887 Value *IdxLaneV = getData(Idx, Lane).V;
2900 return R.areAllUsersVectorized(IdxLaneI)
2908 static const int ScoreScaleFactor = 10;
2916 int Lane,
unsigned OpIdx,
unsigned Idx,
2926 int SplatScore = getSplatScore(Lane,
OpIdx, Idx, UsedLanes);
2927 if (Score <= -SplatScore) {
2931 Score += SplatScore;
2937 Score *= ScoreScaleFactor;
2938 Score += getExternalUseScore(Lane,
OpIdx, Idx);
2956 std::optional<unsigned>
2957 getBestOperand(
unsigned OpIdx,
int Lane,
int LastLane,
2961 unsigned NumOperands = getNumOperands();
2964 Value *OpLastLane = getData(
OpIdx, LastLane).V;
2967 ReorderingMode RMode = ReorderingModes[
OpIdx];
2968 if (RMode == ReorderingMode::Failed)
2969 return std::nullopt;
2972 bool OpIdxAPO = getData(
OpIdx, Lane).APO;
2978 std::optional<unsigned> Idx;
2982 BestScoresPerLanes.try_emplace(std::make_pair(
OpIdx, Lane), 0)
2988 bool IsUsed = RMode == ReorderingMode::Splat ||
2989 RMode == ReorderingMode::Constant ||
2990 RMode == ReorderingMode::Load;
2992 for (
unsigned Idx = 0; Idx != NumOperands; ++Idx) {
2994 OperandData &OpData = getData(Idx, Lane);
2996 bool OpAPO = OpData.APO;
3005 if (OpAPO != OpIdxAPO)
3010 case ReorderingMode::Load:
3011 case ReorderingMode::Opcode: {
3012 bool LeftToRight = Lane > LastLane;
3013 Value *OpLeft = (LeftToRight) ? OpLastLane :
Op;
3014 Value *OpRight = (LeftToRight) ?
Op : OpLastLane;
3015 int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
3016 OpIdx, Idx, IsUsed, UsedLanes);
3017 if (Score >
static_cast<int>(BestOp.Score) ||
3018 (Score > 0 && Score ==
static_cast<int>(BestOp.Score) &&
3021 BestOp.Score = Score;
3022 BestScoresPerLanes[std::make_pair(
OpIdx, Lane)] = Score;
3026 case ReorderingMode::Constant:
3028 (!BestOp.Score && L && L->isLoopInvariant(
Op))) {
3032 BestScoresPerLanes[std::make_pair(
OpIdx, Lane)] =
3039 case ReorderingMode::Splat:
3041 IsUsed =
Op == OpLastLane;
3042 if (
Op == OpLastLane) {
3044 BestScoresPerLanes[std::make_pair(
OpIdx, Lane)] =
3050 case ReorderingMode::Failed:
3056 getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
3060 return std::nullopt;
3067 unsigned getBestLaneToStartReordering()
const {
3068 unsigned Min = UINT_MAX;
3069 unsigned SameOpNumber = 0;
3080 for (
int I = getNumLanes();
I > 0; --
I) {
3081 unsigned Lane =
I - 1;
3082 OperandsOrderData NumFreeOpsHash =
3083 getMaxNumOperandsThatCanBeReordered(Lane);
3086 if (NumFreeOpsHash.NumOfAPOs < Min) {
3087 Min = NumFreeOpsHash.NumOfAPOs;
3088 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
3090 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
3091 }
else if (NumFreeOpsHash.NumOfAPOs == Min &&
3092 NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
3095 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
3096 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
3097 }
else if (NumFreeOpsHash.NumOfAPOs == Min &&
3098 NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
3099 auto [It, Inserted] =
3100 HashMap.
try_emplace(NumFreeOpsHash.Hash, 1, Lane);
3106 unsigned BestLane = 0;
3107 unsigned CntMin = UINT_MAX;
3109 if (
Data.second.first < CntMin) {
3110 CntMin =
Data.second.first;
3111 BestLane =
Data.second.second;
3118 struct OperandsOrderData {
3121 unsigned NumOfAPOs = UINT_MAX;
3124 unsigned NumOpsWithSameOpcodeParent = 0;
3138 OperandsOrderData getMaxNumOperandsThatCanBeReordered(
unsigned Lane)
const {
3139 unsigned CntTrue = 0;
3140 unsigned NumOperands = getNumOperands();
3150 bool AllUndefs =
true;
3151 unsigned NumOpsWithSameOpcodeParent = 0;
3156 const OperandData &OpData = getData(
OpIdx, Lane);
3163 I->getParent() != Parent) {
3164 if (NumOpsWithSameOpcodeParent == 0) {
3165 NumOpsWithSameOpcodeParent = 1;
3167 Parent =
I->getParent();
3169 --NumOpsWithSameOpcodeParent;
3172 ++NumOpsWithSameOpcodeParent;
3181 OperandsOrderData
Data;
3182 Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
3183 Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
3190 const InstructionsState &S) {
3194 return VL.
size() == getNumLanes();
3196 "Expected same number of lanes");
3197 assert(S.valid() &&
"InstructionsState is invalid.");
3203 OpsVec.resize(ArgSize);
3204 unsigned NumLanes = VL.
size();
3205 for (OperandDataVec &
Ops : OpsVec)
3206 Ops.resize(NumLanes);
3221 OpsVec[
OpIdx][Lane] = {Operands[
OpIdx][Lane],
true,
false};
3224 bool IsInverseOperation =
false;
3225 if (S.isCopyableElement(VL[Lane])) {
3227 IsInverseOperation =
3230 assert(
I &&
"Expected instruction");
3231 auto [SelectedOp,
Ops] = convertTo(
I, S);
3238 bool APO = (
OpIdx == 0) ?
false : IsInverseOperation;
3239 OpsVec[
OpIdx][Lane] = {Operands[
OpIdx][Lane], APO,
false};
3245 unsigned getNumOperands()
const {
return ArgSize; }
3248 unsigned getNumLanes()
const {
return OpsVec[0].size(); }
3251 Value *getValue(
unsigned OpIdx,
unsigned Lane)
const {
3252 return getData(
OpIdx, Lane).V;
3256 bool empty()
const {
return OpsVec.empty(); }
3259 void clear() { OpsVec.clear(); }
3264 bool shouldBroadcast(
Value *
Op,
unsigned OpIdx,
unsigned Lane) {
3266 "Op is expected to be getValue(OpIdx, Lane).");
3270 bool OpAPO = getData(
OpIdx, Lane).APO;
3271 bool IsInvariant = L && L->isLoopInvariant(
Op);
3273 for (
unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
3277 bool FoundCandidate =
false;
3278 for (
unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
3279 OperandData &
Data = getData(OpI, Ln);
3280 if (
Data.APO != OpAPO ||
Data.IsUsed)
3282 Value *OpILane = getValue(OpI, Lane);
3306 L->isLoopInvariant(
Data.V))) {
3307 FoundCandidate =
true;
3314 if (!FoundCandidate)
3317 return getNumLanes() == 2 || Cnt > 1;
3324 "Op is expected to be getValue(OpIdx, Lane).");
3325 bool OpAPO = getData(
OpIdx, Lane).APO;
3326 for (
unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
3330 const OperandData &
Data = getData(OpI, Ln);
3331 if (
Data.APO != OpAPO ||
Data.IsUsed)
3333 Value *OpILn = getValue(OpI, Ln);
3334 return (L && L->isLoopInvariant(OpILn)) ||
3346 const InstructionsState &S,
const BoUpSLP &R)
3347 : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R),
3348 L(R.LI->getLoopFor(S.getMainOp()->
getParent())) {
3350 appendOperands(RootVL, Operands, S);
3358 "Expected same num of lanes across all operands");
3359 for (
unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
3360 OpVL[Lane] = OpsVec[
OpIdx][Lane].V;
3368 unsigned NumOperands = getNumOperands();
3369 unsigned NumLanes = getNumLanes();
3389 unsigned FirstLane = getBestLaneToStartReordering();
3398 if (shouldBroadcast(OpLane0,
OpIdx, FirstLane) ||
3399 !canBeVectorized(OpILane0,
OpIdx, FirstLane))
3400 ReorderingModes[
OpIdx] = ReorderingMode::Splat;
3402 ReorderingModes[
OpIdx] = ReorderingMode::Load;
3404 ReorderingModes[
OpIdx] = ReorderingMode::Opcode;
3406 ReorderingModes[
OpIdx] = ReorderingMode::Constant;
3409 ReorderingModes[
OpIdx] = ReorderingMode::Splat;
3419 auto &&SkipReordering = [
this]() {
3422 for (
const OperandData &
Data : Op0)
3425 ArrayRef(OpsVec).slice(1, getNumOperands() - 1)) {
3426 if (
any_of(
Op, [&UniqueValues](
const OperandData &
Data) {
3433 return UniqueValues.
size() != 2 &&
3435 UniqueValues.
size());
3447 if (SkipReordering())
3450 bool StrategyFailed =
false;
3458 for (
unsigned I = 0;
I < NumOperands; ++
I)
3459 MainAltOps[
I].push_back(getData(
I, FirstLane).V);
3462 UsedLanes.
set(FirstLane);
3463 for (
unsigned Distance = 1; Distance != NumLanes; ++Distance) {
3465 for (
int Direction : {+1, -1}) {
3466 int Lane = FirstLane + Direction * Distance;
3467 if (Lane < 0 || Lane >= (
int)NumLanes)
3469 UsedLanes.
set(Lane);
3470 int LastLane = Lane - Direction;
3471 assert(LastLane >= 0 && LastLane < (
int)NumLanes &&
3476 std::optional<unsigned> BestIdx =
3477 getBestOperand(
OpIdx, Lane, LastLane, ReorderingModes,
3478 MainAltOps[
OpIdx], UsedLanes);
3485 swap(
OpIdx, *BestIdx, Lane);
3488 StrategyFailed =
true;
3492 OperandData &AltOp = getData(
OpIdx, Lane);
3493 InstructionsState OpS =
3495 if (OpS && OpS.isAltShuffle())
3502 if (!StrategyFailed)
3507#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3510 case ReorderingMode::Load:
3512 case ReorderingMode::Opcode:
3514 case ReorderingMode::Constant:
3516 case ReorderingMode::Splat:
3518 case ReorderingMode::Failed:
3539 const unsigned Indent = 2;
3541 for (
const OperandDataVec &OpDataVec : OpsVec) {
3542 OS <<
"Operand " << Cnt++ <<
"\n";
3543 for (
const OperandData &OpData : OpDataVec) {
3544 OS.
indent(Indent) <<
"{";
3545 if (
Value *V = OpData.V)
3549 OS <<
", APO:" << OpData.APO <<
"}\n";
3566 std::pair<std::optional<int>,
int>
3571 int BestScore = Limit;
3572 std::optional<int> Index;
3573 for (
int I :
seq<int>(0, Candidates.size())) {
3575 Candidates[
I].second,
3578 if (Score > BestScore) {
3583 return std::make_pair(Index, BestScore);
3593 DeletedInstructions.insert(
I);
3598 template <
typename T>
3601 ArrayRef<std::tuple<WeakTrackingVH, unsigned, bool, bool>>
3602 VectorValuesAndScales) {
3604 for (T *V : DeadVals) {
3609 for (T *V : DeadVals) {
3610 if (!V || !Processed.
insert(V).second)
3615 for (
Use &U :
I->operands()) {
3617 OpI && !DeletedInstructions.contains(OpI) && OpI->hasOneUser() &&
3619 (Entries.
empty() ||
none_of(Entries, [&](
const TreeEntry *Entry) {
3620 return Entry->VectorizedValue == OpI;
3624 I->dropAllReferences();
3626 for (T *V : DeadVals) {
3628 if (!
I->getParent())
3633 cast<Instruction>(U.getUser()));
3635 "trying to erase instruction with users.");
3636 I->removeFromParent();
3640 while (!DeadInsts.
empty()) {
3643 if (!VI || !VI->getParent())
3646 "Live instruction found in dead worklist!");
3647 assert(VI->use_empty() &&
"Instructions with uses are not dead.");
3654 for (
Use &OpU : VI->operands()) {
3655 Value *OpV = OpU.get();
3667 if (!DeletedInstructions.contains(OpI) &&
3668 (!OpI->getType()->isVectorTy() ||
3670 VectorValuesAndScales,
3671 [&](
const std::tuple<WeakTrackingVH, unsigned, bool, bool>
3672 &V) {
return std::get<0>(V) == OpI; })) &&
3677 VI->removeFromParent();
3679 SE->forgetValue(VI);
3686 return AnalyzedReductionsRoots.count(
I);
3691 AnalyzedReductionsRoots.insert(
I);
3696 return AnalyzedReductionVals.contains(
hash_value(VL));
3701 AnalyzedReductionVals.insert(
hash_value(VL));
3705 AnalyzedReductionsRoots.clear();
3706 AnalyzedReductionVals.clear();
3707 AnalyzedMinBWVals.clear();
3715 return MustGather.contains(V);
3719 return NonScheduledFirst.contains(V);
3724 assert(V &&
"V cannot be nullptr.");
3726 return any_of(Entries, [&](
const TreeEntry *E) {
3727 return !DeletedNodes.contains(E) && !TransformedToGatherNodes.contains(E);
3738 const InstructionsState &LocalState,
3750 bool collectValuesToDemote(
3751 const TreeEntry &E,
bool IsProfitableToDemoteRoot,
unsigned &
BitWidth,
3754 bool &IsProfitableToDemote,
bool IsTruncRoot)
const;
3763 void buildReorderableOperands(
3771 void reorderNodeWithReuses(TreeEntry &TE,
ArrayRef<int> Mask)
const;
3774 bool areAllUsersVectorized(
3784 const TreeEntry *getOperandEntry(
const TreeEntry *E,
unsigned Idx)
const;
3785 TreeEntry *getOperandEntry(TreeEntry *E,
unsigned Idx) {
3786 return const_cast<TreeEntry *
>(
3787 getOperandEntry(
const_cast<const TreeEntry *
>(E), Idx));
3793 Instruction *getRootEntryInstruction(
const TreeEntry &Entry)
const;
3797 getCastContextHint(
const TreeEntry &TE)
const;
3804 unsigned getScaleToLoopIterations(
const TreeEntry &TE,
3805 Value *Scalar =
nullptr,
3818 unsigned InterleaveFactor = 0);
3829 bool ResizeAllowed =
false)
const;
3836 Value *vectorizeOperand(TreeEntry *
E,
unsigned NodeIdx);
3841 template <
typename BVTy,
typename ResTy,
typename... Args>
3842 ResTy processBuildVector(
const TreeEntry *
E,
Type *ScalarTy, Args &...Params);
3847 Value *createBuildVector(
const TreeEntry *
E,
Type *ScalarTy);
3853 Instruction &getLastInstructionInBundle(
const TreeEntry *
E);
3860 std::optional<TargetTransformInfo::ShuffleKind>
3872 unsigned NumParts)
const;
3884 std::optional<TargetTransformInfo::ShuffleKind>
3885 isGatherShuffledSingleRegisterEntry(
3902 isGatherShuffledEntry(
3905 unsigned NumParts,
bool ForOrder =
false);
3911 Type *ScalarTy)
const;
3915 void setInsertPointAfterBundle(
const TreeEntry *
E);
3925 bool isFullyVectorizableTinyTree(
bool ForReduction)
const;
3930 void tryToVectorizeGatheredLoads(
3932 std::tuple<BasicBlock *, Value *, Type *>,
3940 collectUserStores(
const BoUpSLP::TreeEntry *TE)
const;
3956 findExternalStoreUsersReorderIndices(TreeEntry *TE)
const;
3960 void reorderGatherNode(TreeEntry &TE);
3967 bool matchesShlZExt(
const TreeEntry &TE,
OrdersType &Order,
bool &IsBSwap,
3968 bool &ForLoads)
const;
3972 bool matchesInversedZExtSelect(
3973 const TreeEntry &SelectTE,
3979 bool matchesSelectOfBits(
const TreeEntry &SelectTE)
const;
3984 TreeEntry(VecTreeTy &Container) : Container(Container) {}
3987 SmallVector<int> getCommonMask()
const {
3988 if (State == TreeEntry::SplitVectorize)
3990 SmallVector<int>
Mask;
3997 SmallVector<int> getSplitMask()
const {
3998 assert(State == TreeEntry::SplitVectorize && !ReorderIndices.empty() &&
3999 "Expected only split vectorize node.");
4000 unsigned CommonVF = std::max<unsigned>(
4001 CombinedEntriesWithIndices.back().second,
4002 Scalars.size() - CombinedEntriesWithIndices.back().second);
4003 const unsigned Scale =
getNumElements(Scalars.front()->getType());
4006 for (
auto [Idx,
I] :
enumerate(ReorderIndices)) {
4010 (Idx >= CombinedEntriesWithIndices.back().second
4011 ? CommonVF - CombinedEntriesWithIndices.back().second * Scale
4020 void reorderSplitNode(
unsigned Idx, ArrayRef<int> Mask,
4021 ArrayRef<int> MaskOrder);
4026 if (
Mask.size() != VL.
size() && VL.
size() == Scalars.size())
4027 return std::equal(VL.
begin(), VL.
end(), Scalars.begin());
4030 [Scalars](
Value *V,
int Idx) {
4031 return (isa<UndefValue>(V) &&
4032 Idx == PoisonMaskElem) ||
4033 (Idx != PoisonMaskElem && V == Scalars[Idx]);
4036 if (!ReorderIndices.empty()) {
4040 SmallVector<int>
Mask;
4042 if (VL.
size() == Scalars.size())
4043 return IsSame(Scalars, Mask);
4044 if (VL.
size() == ReuseShuffleIndices.size()) {
4046 return IsSame(Scalars, Mask);
4050 return IsSame(Scalars, ReuseShuffleIndices);
4054 bool hasEqualOperands(
const TreeEntry &TE)
const {
4055 if (
TE.getNumOperands() != getNumOperands())
4057 SmallBitVector
Used(getNumOperands());
4058 for (
unsigned I = 0,
E = getNumOperands();
I <
E; ++
I) {
4059 unsigned PrevCount =
Used.count();
4060 for (
unsigned K = 0;
K <
E; ++
K) {
4063 if (getOperand(K) ==
TE.getOperand(
I)) {
4069 if (PrevCount ==
Used.count())
4078 unsigned getVectorFactor()
const {
4079 if (!ReuseShuffleIndices.empty())
4080 return ReuseShuffleIndices.size();
4081 return Scalars.size();
4085 bool isGather()
const {
return State == NeedToGather; }
4091 WeakTrackingVH VectorizedValue =
nullptr;
4112 enum CombinedOpcode {
4114 MinMax = Instruction::OtherOpsEnd + 1,
4117 ReducedBitcastBSwap,
4118 ReducedBitcastLoads,
4119 ReducedBitcastBSwapLoads,
4122 CombinedOpcode CombinedOp = NotCombinedOp;
4125 SmallVector<int, 4> ReuseShuffleIndices;
4128 SmallVector<unsigned, 4> ReorderIndices;
4136 VecTreeTy &Container;
4139 EdgeInfo UserTreeIndex;
4152 SmallVector<ValueList, 2> Operands;
4155 SmallPtrSet<const Value *, 4> CopyableElements;
4159 InstructionsState S = InstructionsState::invalid();
4162 unsigned InterleaveFactor = 0;
4165 bool DoesNotNeedToSchedule =
false;
4169 if (Operands.size() <
OpIdx + 1)
4170 Operands.resize(
OpIdx + 1);
4173 "Number of operands is greater than the number of scalars.");
4179 mutable SmallDenseMap<Value *, unsigned> ValueToLane;
4183 unsigned getInterleaveFactor()
const {
return InterleaveFactor; }
4185 void setInterleave(
unsigned Factor) { InterleaveFactor = Factor; }
4188 void setDoesNotNeedToSchedule() { DoesNotNeedToSchedule =
true; }
4191 bool doesNotNeedToSchedule()
const {
return DoesNotNeedToSchedule; }
4196 setOperand(
I, Operands[
I]);
4200 void reorderOperands(ArrayRef<int> Mask) {
4208 return Operands[
OpIdx];
4214 return Operands[
OpIdx];
4218 unsigned getNumOperands()
const {
return Operands.size(); }
4221 Value *getSingleOperand(
unsigned OpIdx)
const {
4224 return Operands[
OpIdx][0];
4228 bool isAltShuffle()
const {
return S.isAltShuffle(); }
4230 Instruction *getMatchingMainOpOrAltOp(Instruction *
I)
const {
4231 return S.getMatchingMainOpOrAltOp(
I);
4239 if (
I && getMatchingMainOpOrAltOp(
I))
4241 return S.getMainOp();
4244 void setOperations(
const InstructionsState &S) {
4245 assert(S &&
"InstructionsState is invalid.");
4249 Instruction *getMainOp()
const {
return S.getMainOp(); }
4251 Instruction *getAltOp()
const {
return S.getAltOp(); }
4254 unsigned getOpcode()
const {
return S.getOpcode(); }
4256 unsigned getAltOpcode()
const {
return S.getAltOpcode(); }
4258 bool hasState()
const {
return S.valid(); }
4261 void addCopyableElement(
Value *V) {
4262 assert(S.isCopyableElement(V) &&
"Not a copyable element.");
4263 CopyableElements.insert(V);
4267 bool isCopyableElement(
Value *V)
const {
4268 return CopyableElements.contains(V);
4272 bool hasCopyableElements()
const {
return !CopyableElements.empty(); }
4275 const InstructionsState &getOperations()
const {
return S; }
4279 unsigned findLaneForValue(
Value *V)
const {
4280 auto Res = ValueToLane.try_emplace(V, getVectorFactor());
4282 return Res.first->second;
4283 unsigned &FoundLane = Res.first->getSecond();
4284 for (
auto *It =
find(Scalars, V), *End = Scalars.end(); It != End;
4285 std::advance(It, 1)) {
4288 FoundLane = std::distance(Scalars.begin(), It);
4289 assert(FoundLane < Scalars.size() &&
"Couldn't find extract lane");
4290 if (!ReorderIndices.empty())
4291 FoundLane = ReorderIndices[FoundLane];
4292 assert(FoundLane < Scalars.size() &&
"Couldn't find extract lane");
4293 if (ReuseShuffleIndices.empty())
4295 if (
auto *RIt =
find(ReuseShuffleIndices, FoundLane);
4296 RIt != ReuseShuffleIndices.end()) {
4297 FoundLane = std::distance(ReuseShuffleIndices.begin(), RIt);
4301 assert(FoundLane < getVectorFactor() &&
"Unable to find given value.");
4308 buildAltOpShuffleMask(
const function_ref<
bool(Instruction *)> IsAltOp,
4309 SmallVectorImpl<int> &Mask,
4310 SmallVectorImpl<Value *> *OpScalars =
nullptr,
4311 SmallVectorImpl<Value *> *AltScalars =
nullptr)
const;
4314 bool isNonPowOf2Vec()
const {
4316 return IsNonPowerOf2;
4322 hasNonWholeRegisterOrNonPowerOf2Vec(
const TargetTransformInfo &
TTI)
const {
4325 assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) &&
4326 "Reshuffling not supported with non-power-of-2 vectors yet.");
4327 return IsNonPowerOf2;
4330 Value *getOrdered(
unsigned Idx)
const {
4331 if (ReorderIndices.empty())
4332 return Scalars[Idx];
4333 SmallVector<int>
Mask;
4335 return Scalars[
Mask[Idx]];
4341 dbgs() << Idx <<
".\n";
4342 for (
unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
4343 dbgs() <<
"Operand " << OpI <<
":\n";
4344 for (
const Value *V : Operands[OpI])
4347 dbgs() <<
"Scalars: \n";
4348 for (
Value *V : Scalars)
4350 dbgs() <<
"State: ";
4351 if (S && hasCopyableElements())
4352 dbgs() <<
"[[Copyable]] ";
4355 if (InterleaveFactor > 0) {
4356 dbgs() <<
"Vectorize with interleave factor " << InterleaveFactor
4359 dbgs() <<
"Vectorize\n";
4362 case ScatterVectorize:
4363 dbgs() <<
"ScatterVectorize\n";
4365 case StridedVectorize:
4366 dbgs() <<
"StridedVectorize\n";
4368 case CompressVectorize:
4369 dbgs() <<
"CompressVectorize\n";
4372 dbgs() <<
"NeedToGather\n";
4374 case CombinedVectorize:
4375 dbgs() <<
"CombinedVectorize\n";
4377 case SplitVectorize:
4378 dbgs() <<
"SplitVectorize\n";
4382 dbgs() <<
"MainOp: " << *S.getMainOp() <<
"\n";
4383 dbgs() <<
"AltOp: " << *S.getAltOp() <<
"\n";
4385 dbgs() <<
"MainOp: NULL\n";
4386 dbgs() <<
"AltOp: NULL\n";
4388 dbgs() <<
"VectorizedValue: ";
4389 if (VectorizedValue)
4390 dbgs() << *VectorizedValue <<
"\n";
4393 dbgs() <<
"ReuseShuffleIndices: ";
4394 if (ReuseShuffleIndices.empty())
4397 for (
int ReuseIdx : ReuseShuffleIndices)
4398 dbgs() << ReuseIdx <<
", ";
4400 dbgs() <<
"ReorderIndices: ";
4401 for (
unsigned ReorderIdx : ReorderIndices)
4402 dbgs() << ReorderIdx <<
", ";
4404 dbgs() <<
"UserTreeIndex: ";
4406 dbgs() << UserTreeIndex;
4408 dbgs() <<
"<invalid>";
4410 if (!CombinedEntriesWithIndices.empty()) {
4411 dbgs() <<
"Combined entries: ";
4413 dbgs() <<
"Entry index " <<
P.first <<
" with offset " <<
P.second;
4424 StringRef Banner)
const {
4425 dbgs() <<
"SLP: " << Banner <<
":\n";
4427 dbgs() <<
"SLP: Costs:\n";
4428 dbgs() <<
"SLP: ReuseShuffleCost = " << ReuseShuffleCost <<
"\n";
4429 dbgs() <<
"SLP: VectorCost = " << VecCost <<
"\n";
4430 dbgs() <<
"SLP: ScalarCost = " << ScalarCost <<
"\n";
4431 dbgs() <<
"SLP: ReuseShuffleCost + VecCost - ScalarCost = "
4432 << ReuseShuffleCost + VecCost - ScalarCost <<
"\n";
4438 const InstructionsState &S,
4440 ArrayRef<int> ReuseShuffleIndices = {}) {
4441 auto Invalid = ScheduleBundle::invalid();
4442 return newTreeEntry(VL,
Invalid, S, UserTreeIdx, ReuseShuffleIndices);
4447 const InstructionsState &S,
4449 ArrayRef<int> ReuseShuffleIndices = {},
4450 ArrayRef<unsigned> ReorderIndices = {},
4451 unsigned InterleaveFactor = 0) {
4452 TreeEntry::EntryState EntryState =
4453 Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
4454 TreeEntry *
E = newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
4455 ReuseShuffleIndices, ReorderIndices);
4456 if (
E && InterleaveFactor > 0)
4457 E->setInterleave(InterleaveFactor);
4462 TreeEntry::EntryState EntryState,
4463 ScheduleBundle &Bundle,
const InstructionsState &S,
4465 ArrayRef<int> ReuseShuffleIndices = {},
4466 ArrayRef<unsigned> ReorderIndices = {}) {
4467 assert(((!Bundle && (EntryState == TreeEntry::NeedToGather ||
4468 EntryState == TreeEntry::SplitVectorize)) ||
4469 (Bundle && EntryState != TreeEntry::NeedToGather &&
4470 EntryState != TreeEntry::SplitVectorize)) &&
4471 "Need to vectorize gather entry?");
4473 if (GatheredLoadsEntriesFirst.has_value() &&
4474 EntryState == TreeEntry::NeedToGather && S &&
4475 S.getOpcode() == Instruction::Load && UserTreeIdx.EdgeIdx == UINT_MAX &&
4476 !UserTreeIdx.UserTE)
4478 VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
4479 TreeEntry *
Last = VectorizableTree.back().get();
4480 Last->Idx = VectorizableTree.size() - 1;
4481 Last->State = EntryState;
4482 if (UserTreeIdx.UserTE)
4483 OperandsToTreeEntry.try_emplace(
4484 std::make_pair(UserTreeIdx.UserTE, UserTreeIdx.EdgeIdx),
Last);
4489 ReuseShuffleIndices.empty()) &&
4490 "Reshuffling scalars not yet supported for nodes with padding");
4491 Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
4492 ReuseShuffleIndices.end());
4493 if (ReorderIndices.
empty()) {
4496 Last->setOperations(S);
4499 Last->Scalars.assign(VL.
size(),
nullptr);
4501 [VL](
unsigned Idx) ->
Value * {
4502 if (Idx >= VL.size())
4503 return UndefValue::get(VL.front()->getType());
4508 Last->setOperations(S);
4509 Last->ReorderIndices.append(ReorderIndices.
begin(), ReorderIndices.
end());
4511 if (EntryState == TreeEntry::SplitVectorize) {
4512 assert(S &&
"Split nodes must have operations.");
4513 Last->setOperations(S);
4514 SmallPtrSet<Value *, 4> Processed;
4515 for (
Value *V : VL) {
4519 auto It = ScalarsInSplitNodes.find(V);
4520 if (It == ScalarsInSplitNodes.end()) {
4521 ScalarsInSplitNodes.try_emplace(V).first->getSecond().push_back(
Last);
4522 (void)Processed.
insert(V);
4523 }
else if (Processed.
insert(V).second) {
4525 "Value already associated with the node.");
4526 It->getSecond().push_back(
Last);
4529 }
else if (!
Last->isGather()) {
4532 (!S.areInstructionsWithCopyableElements() &&
4534 all_of(VL, [&](
Value *V) {
return S.isNonSchedulable(V); }))
4535 Last->setDoesNotNeedToSchedule();
4536 SmallPtrSet<Value *, 4> Processed;
4537 for (
Value *V : VL) {
4540 if (S.isCopyableElement(V)) {
4541 Last->addCopyableElement(V);
4544 auto It = ScalarToTreeEntries.find(V);
4545 if (It == ScalarToTreeEntries.end()) {
4546 ScalarToTreeEntries.try_emplace(V).first->getSecond().push_back(
Last);
4547 (void)Processed.
insert(V);
4548 }
else if (Processed.
insert(V).second) {
4550 "Value already associated with the node.");
4551 It->getSecond().push_back(
Last);
4555 assert((!Bundle.getBundle().empty() ||
Last->doesNotNeedToSchedule()) &&
4556 "Bundle and VL out of sync");
4557 if (!Bundle.getBundle().empty()) {
4558#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
4559 auto *BundleMember = Bundle.getBundle().begin();
4560 SmallPtrSet<Value *, 4> Processed;
4561 for (
Value *V : VL) {
4562 if (S.isNonSchedulable(V) || !Processed.
insert(V).second)
4566 assert(BundleMember == Bundle.getBundle().end() &&
4567 "Bundle and VL out of sync");
4569 Bundle.setTreeEntry(
Last);
4573 bool AllConstsOrCasts =
true;
4574 for (
Value *V : VL) {
4575 if (S && S.areInstructionsWithCopyableElements() &&
4576 S.isCopyableElement(V))
4577 Last->addCopyableElement(V);
4580 AllConstsOrCasts &=
I &&
I->getType()->isIntegerTy();
4581 if (UserTreeIdx.EdgeIdx != UINT_MAX || !UserTreeIdx.UserTE ||
4582 !UserTreeIdx.UserTE->isGather())
4583 ValueToGatherNodes.try_emplace(V).first->getSecond().insert(
Last);
4586 if (AllConstsOrCasts)
4588 std::make_pair(std::numeric_limits<unsigned>::max(), 1);
4589 MustGather.insert_range(VL);
4592 if (UserTreeIdx.UserTE)
4593 Last->UserTreeIndex = UserTreeIdx;
4599 TreeEntry::VecTreeTy VectorizableTree;
4604 for (
unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
4605 VectorizableTree[
Id]->dump();
4606 if (TransformedToGatherNodes.contains(VectorizableTree[Id].get()))
4607 dbgs() <<
"[[TRANSFORMED TO GATHER]]";
4608 else if (DeletedNodes.contains(VectorizableTree[Id].get()))
4609 dbgs() <<
"[[DELETED NODE]]";
4617 assert(V &&
"V cannot be nullptr.");
4618 auto It = ScalarToTreeEntries.find(V);
4619 if (It == ScalarToTreeEntries.end())
4621 return It->getSecond();
4626 assert(V &&
"V cannot be nullptr.");
4627 auto It = ScalarsInSplitNodes.find(V);
4628 if (It == ScalarsInSplitNodes.end())
4630 return It->getSecond();
4635 bool SameVF =
false)
const {
4636 assert(V &&
"V cannot be nullptr.");
4637 for (TreeEntry *TE : ScalarToTreeEntries.lookup(V))
4638 if ((!SameVF ||
TE->getVectorFactor() == VL.
size()) &&
TE->isSame(VL))
4645 class ScalarsVectorizationLegality {
4646 InstructionsState S;
4648 bool TryToFindDuplicates;
4649 bool TrySplitVectorize;
4652 ScalarsVectorizationLegality(InstructionsState S,
bool IsLegal,
4653 bool TryToFindDuplicates =
true,
4654 bool TrySplitVectorize =
false)
4655 : S(S), IsLegal(IsLegal), TryToFindDuplicates(TryToFindDuplicates),
4656 TrySplitVectorize(TrySplitVectorize) {
4657 assert((!IsLegal || (S.valid() && TryToFindDuplicates)) &&
4658 "Inconsistent state");
4660 const InstructionsState &getInstructionsState()
const {
return S; };
4661 bool isLegal()
const {
return IsLegal; }
4662 bool tryToFindDuplicates()
const {
return TryToFindDuplicates; }
4663 bool trySplitVectorize()
const {
return TrySplitVectorize; }
4668 ScalarsVectorizationLegality
4670 const EdgeInfo &UserTreeIdx)
const;
4674 TreeEntry::EntryState getScalarsVectorizationState(
4676 bool IsScatterVectorizeUserTE,
OrdersType &CurrentOrder,
4677 SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo);
4680 SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarToTreeEntries;
4683 SmallPtrSet<const TreeEntry *, 8> DeletedNodes;
4687 SmallDenseMap<const TreeEntry *, InstructionCost> TransformedToGatherNodes;
4690 SmallDenseMap<std::pair<const TreeEntry *, unsigned>, TreeEntry *>
4691 OperandsToTreeEntry;
4694 SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarsInSplitNodes;
4697 SmallDenseMap<Value *, unsigned> InstrElementSize;
4711 SmallDenseMap<const TreeEntry *, WeakTrackingVH> EntryToLastInstruction;
4715 SmallDenseMap<const Instruction *, Instruction *> LastInstructionToPos;
4720 SetVector<const TreeEntry *> PostponedGathers;
4722 using ValueToGatherNodesMap =
4723 DenseMap<Value *, SmallSetVector<const TreeEntry *, 4>>;
4724 ValueToGatherNodesMap ValueToGatherNodes;
4729 SetVector<unsigned> LoadEntriesToVectorize;
4732 bool IsGraphTransformMode =
false;
4735 std::optional<unsigned> GatheredLoadsEntriesFirst;
4738 SmallDenseMap<
const TreeEntry *,
4739 std::tuple<SmallVector<int>,
VectorType *, unsigned,
bool>>
4740 CompressEntryToData;
4744 SmallVector<const Loop *> CurrentLoopNest;
4747 SmallDenseMap<const Loop *, SmallVector<const Loop *>> LoopToLoopNest;
4751 SmallDenseMap<const Loop *, unsigned> LoopToScaleFactor;
4754 struct ExternalUser {
4755 ExternalUser(
Value *S, llvm::User *U,
const TreeEntry &E,
unsigned L)
4756 : Scalar(S), User(
U), E(E), Lane(
L) {}
4759 Value *Scalar =
nullptr;
4762 llvm::User *User =
nullptr;
4770 using UserList = SmallVector<ExternalUser, 16>;
4776 bool isAliased(
const MemoryLocation &Loc1, Instruction *Inst1,
4777 Instruction *Inst2) {
4780 AliasCacheKey
Key = std::make_pair(Inst1, Inst2);
4781 auto Res = AliasCache.try_emplace(
Key);
4783 return Res.first->second;
4784 bool Aliased =
isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1));
4786 Res.first->getSecond() = Aliased;
4790 using AliasCacheKey = std::pair<Instruction *, Instruction *>;
4794 SmallDenseMap<AliasCacheKey, bool> AliasCache;
4799 BatchAAResults BatchAA;
4806 DenseSet<Instruction *> DeletedInstructions;
4809 SmallPtrSet<Instruction *, 16> AnalyzedReductionsRoots;
4812 DenseSet<size_t> AnalyzedReductionVals;
4816 DenseSet<Value *> AnalyzedMinBWVals;
4822 UserList ExternalUses;
4826 SmallPtrSet<Value *, 4> ExternalUsesAsOriginalScalar;
4830 SmallPtrSet<Value *, 4> ExternalUsesWithNonUsers;
4833 SmallPtrSet<const Value *, 32> EphValues;
4837 SetVector<Instruction *> GatherShuffleExtractSeq;
4840 DenseSet<BasicBlock *> CSEBlocks;
4843 DenseSet<size_t> ListOfKnonwnNonVectorizableLoads;
4850 class ScheduleEntity {
4851 friend class ScheduleBundle;
4852 friend class ScheduleData;
4853 friend class ScheduleCopyableData;
4856 enum class Kind { ScheduleData, ScheduleBundle, ScheduleCopyableData };
4857 Kind getKind()
const {
return K; }
4858 ScheduleEntity(Kind K) : K(K) {}
4862 int SchedulingPriority = 0;
4865 bool IsScheduled =
false;
4867 const Kind K = Kind::ScheduleData;
4870 ScheduleEntity() =
delete;
4872 void setSchedulingPriority(
int Priority) { SchedulingPriority = Priority; }
4873 int getSchedulingPriority()
const {
return SchedulingPriority; }
4874 bool isReady()
const {
4876 return SD->isReady();
4878 return CD->isReady();
4884 bool hasValidDependencies()
const {
4886 return SD->hasValidDependencies();
4888 return CD->hasValidDependencies();
4892 int getUnscheduledDeps()
const {
4894 return SD->getUnscheduledDeps();
4896 return CD->getUnscheduledDeps();
4900 int incrementUnscheduledDeps(
int Incr) {
4902 return SD->incrementUnscheduledDeps(Incr);
4906 int getDependencies()
const {
4908 return SD->getDependencies();
4914 return SD->getInst();
4919 bool isScheduled()
const {
return IsScheduled; }
4920 void setScheduled(
bool Scheduled) { IsScheduled = Scheduled; }
4922 static bool classof(
const ScheduleEntity *) {
return true; }
4924#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4925 void dump(raw_ostream &OS)
const {
4927 return SD->dump(OS);
4929 return CD->dump(OS);
4940#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4942 const BoUpSLP::ScheduleEntity &SE) {
4952 class ScheduleData final :
public ScheduleEntity {
4956 enum { InvalidDeps = -1 };
4958 ScheduleData() : ScheduleEntity(Kind::ScheduleData) {}
4959 static bool classof(
const ScheduleEntity *Entity) {
4960 return Entity->getKind() == Kind::ScheduleData;
4963 void init(
int BlockSchedulingRegionID, Instruction *
I) {
4964 NextLoadStore =
nullptr;
4965 IsScheduled =
false;
4966 SchedulingRegionID = BlockSchedulingRegionID;
4967 clearDependencies();
4973 if (hasValidDependencies()) {
4974 assert(UnscheduledDeps <= Dependencies &&
"invariant");
4976 assert(UnscheduledDeps == Dependencies &&
"invariant");
4980 assert(hasValidDependencies() && UnscheduledDeps == 0 &&
4981 "unexpected scheduled state");
4988 bool hasValidDependencies()
const {
return Dependencies != InvalidDeps; }
4992 bool isReady()
const {
return UnscheduledDeps == 0 && !IsScheduled; }
4997 int incrementUnscheduledDeps(
int Incr) {
4998 assert(hasValidDependencies() &&
4999 "increment of unscheduled deps would be meaningless");
5000 UnscheduledDeps += Incr;
5001 assert(UnscheduledDeps >= 0 &&
5002 "Expected valid number of unscheduled deps");
5003 return UnscheduledDeps;
5008 void resetUnscheduledDeps() { UnscheduledDeps = Dependencies; }
5011 void clearDependencies() {
5012 clearDirectDependencies();
5013 MemoryDependencies.clear();
5014 ControlDependencies.clear();
5021 void clearDirectDependencies() {
5022 Dependencies = InvalidDeps;
5023 resetUnscheduledDeps();
5024 IsScheduled =
false;
5028 int getUnscheduledDeps()
const {
return UnscheduledDeps; }
5030 int getDependencies()
const {
return Dependencies; }
5032 void initDependencies() { Dependencies = 0; }
5034 void incDependencies() { Dependencies++; }
5037 int getSchedulingRegionID()
const {
return SchedulingRegionID; }
5044 return MemoryDependencies;
5047 void addMemoryDependency(ScheduleData *Dep) {
5048 MemoryDependencies.push_back(Dep);
5052 return ControlDependencies;
5055 void addControlDependency(ScheduleData *Dep) {
5056 ControlDependencies.push_back(Dep);
5059 ScheduleData *getNextLoadStore()
const {
return NextLoadStore; }
5060 void setNextLoadStore(ScheduleData *
Next) { NextLoadStore =
Next; }
5062 void dump(raw_ostream &OS)
const { OS << *Inst; }
5074 ScheduleData *NextLoadStore =
nullptr;
5078 SmallVector<ScheduleData *> MemoryDependencies;
5084 SmallVector<ScheduleData *> ControlDependencies;
5088 int SchedulingRegionID = 0;
5094 int Dependencies = InvalidDeps;
5100 int UnscheduledDeps = InvalidDeps;
5105 const BoUpSLP::ScheduleData &SD) {
5111 class ScheduleBundle final :
public ScheduleEntity {
5115 bool IsValid =
true;
5117 TreeEntry *TE =
nullptr;
5118 ScheduleBundle(
bool IsValid)
5119 : ScheduleEntity(Kind::ScheduleBundle), IsValid(IsValid) {}
5122 ScheduleBundle() : ScheduleEntity(Kind::ScheduleBundle) {}
5123 static bool classof(
const ScheduleEntity *Entity) {
5124 return Entity->getKind() == Kind::ScheduleBundle;
5129 for (
const ScheduleEntity *SD : Bundle) {
5130 if (SD->hasValidDependencies()) {
5131 assert(SD->getUnscheduledDeps() <= SD->getDependencies() &&
5134 assert(SD->getUnscheduledDeps() == SD->getDependencies() &&
5138 if (isScheduled()) {
5139 assert(SD->hasValidDependencies() && SD->getUnscheduledDeps() == 0 &&
5140 "unexpected scheduled state");
5146 int unscheduledDepsInBundle()
const {
5147 assert(*
this &&
"bundle must not be empty");
5149 for (
const ScheduleEntity *BundleMember : Bundle) {
5150 if (BundleMember->getUnscheduledDeps() == ScheduleData::InvalidDeps)
5151 return ScheduleData::InvalidDeps;
5152 Sum += BundleMember->getUnscheduledDeps();
5160 bool hasValidDependencies()
const {
5161 return all_of(Bundle, [](
const ScheduleEntity *SD) {
5162 return SD->hasValidDependencies();
5168 bool isReady()
const {
5169 assert(*
this &&
"bundle must not be empty");
5170 return unscheduledDepsInBundle() == 0 && !isScheduled();
5178 void add(ScheduleEntity *SD) { Bundle.push_back(SD); }
5181 void setTreeEntry(TreeEntry *TE) { this->TE = TE; }
5182 TreeEntry *getTreeEntry()
const {
return TE; }
5184 static ScheduleBundle invalid() {
return {
false}; }
5186 operator bool()
const {
return IsValid; }
5189 void dump(raw_ostream &OS)
const {
5198 OS << *SD->getInst();
5212 const BoUpSLP::ScheduleBundle &Bundle) {
5223 class ScheduleCopyableData final :
public ScheduleEntity {
5230 int SchedulingRegionID = 0;
5232 ScheduleBundle &Bundle;
5235 ScheduleCopyableData(
int BlockSchedulingRegionID,
Instruction *
I,
5236 const EdgeInfo &EI, ScheduleBundle &Bundle)
5237 : ScheduleEntity(Kind::ScheduleCopyableData), Inst(
I), EI(EI),
5238 SchedulingRegionID(BlockSchedulingRegionID), Bundle(Bundle) {}
5239 static bool classof(
const ScheduleEntity *Entity) {
5240 return Entity->getKind() == Kind::ScheduleCopyableData;
5245 if (hasValidDependencies()) {
5246 assert(UnscheduledDeps <= Dependencies &&
"invariant");
5248 assert(UnscheduledDeps == Dependencies &&
"invariant");
5252 assert(hasValidDependencies() && UnscheduledDeps == 0 &&
5253 "unexpected scheduled state");
5260 bool hasValidDependencies()
const {
5261 return Dependencies != ScheduleData::InvalidDeps;
5266 bool isReady()
const {
return UnscheduledDeps == 0 && !IsScheduled; }
5271 int incrementUnscheduledDeps(
int Incr) {
5272 assert(hasValidDependencies() &&
5273 "increment of unscheduled deps would be meaningless");
5274 UnscheduledDeps += Incr;
5275 assert(UnscheduledDeps >= 0 &&
"invariant");
5276 return UnscheduledDeps;
5281 void resetUnscheduledDeps() { UnscheduledDeps = Dependencies; }
5284 int getUnscheduledDeps()
const {
return UnscheduledDeps; }
5286 int getDependencies()
const {
return Dependencies; }
5288 void initDependencies() { Dependencies = 0; }
5290 void incDependencies() { Dependencies++; }
5293 int getSchedulingRegionID()
const {
return SchedulingRegionID; }
5299 void clearDependencies() {
5300 Dependencies = ScheduleData::InvalidDeps;
5301 UnscheduledDeps = ScheduleData::InvalidDeps;
5302 IsScheduled =
false;
5306 const EdgeInfo &getEdgeInfo()
const {
return EI; }
5309 ScheduleBundle &getBundle() {
return Bundle; }
5310 const ScheduleBundle &getBundle()
const {
return Bundle; }
5312#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
5313 void dump(raw_ostream &OS)
const { OS <<
"[Copyable]" << *getInst(); }
5324 int Dependencies = ScheduleData::InvalidDeps;
5330 int UnscheduledDeps = ScheduleData::InvalidDeps;
5360 struct BlockScheduling {
5362 : BB(BB), ChunkSize(BB->
size()), ChunkPos(ChunkSize) {}
5365 ScheduledBundles.clear();
5366 ScheduledBundlesList.
clear();
5367 ScheduleCopyableDataMap.clear();
5368 ScheduleCopyableDataMapByInst.clear();
5369 ScheduleCopyableDataMapByInstUser.clear();
5370 ScheduleCopyableDataMapByUsers.clear();
5372 ScheduleStart =
nullptr;
5373 ScheduleEnd =
nullptr;
5374 FirstLoadStoreInRegion =
nullptr;
5375 LastLoadStoreInRegion =
nullptr;
5376 RegionHasStackSave =
false;
5380 ScheduleRegionSizeLimit -= ScheduleRegionSize;
5383 ScheduleRegionSize = 0;
5387 ++SchedulingRegionID;
5390 ScheduleData *getScheduleData(Instruction *
I) {
5393 if (BB !=
I->getParent())
5396 ScheduleData *SD = ScheduleDataMap.lookup(
I);
5397 if (SD && isInSchedulingRegion(*SD))
5402 ScheduleData *getScheduleData(
Value *V) {
5408 ScheduleCopyableData *getScheduleCopyableData(
const EdgeInfo &EI,
5409 const Value *V)
const {
5410 if (ScheduleCopyableDataMap.empty())
5412 auto It = ScheduleCopyableDataMap.find(std::make_pair(EI, V));
5413 if (It == ScheduleCopyableDataMap.end())
5415 ScheduleCopyableData *SD = It->getSecond().get();
5416 if (!isInSchedulingRegion(*SD))
5424 getScheduleCopyableData(
const Value *User,
unsigned OperandIdx,
5426 if (ScheduleCopyableDataMapByInstUser.empty())
5428 const auto It = ScheduleCopyableDataMapByInstUser.find(
5429 std::make_pair(std::make_pair(User, OperandIdx), V));
5430 if (It == ScheduleCopyableDataMapByInstUser.end())
5433 for (ScheduleCopyableData *SD : It->getSecond()) {
5434 if (isInSchedulingRegion(*SD))
5448 bool areAllOperandsReplacedByCopyableData(Instruction *User,
5452 if (ScheduleCopyableDataMap.empty())
5454 SmallDenseMap<TreeEntry *, unsigned> PotentiallyReorderedEntriesCount;
5456 if (Entries.
empty())
5458 unsigned CurNumOps = 0;
5459 for (
const Use &U :
User->operands()) {
5465 for (TreeEntry *TE : Entries) {
5467 bool IsNonSchedulableWithParentPhiNode =
5468 TE->doesNotNeedToSchedule() &&
TE->UserTreeIndex &&
5469 TE->UserTreeIndex.UserTE->hasState() &&
5470 TE->UserTreeIndex.UserTE->State != TreeEntry::SplitVectorize &&
5471 TE->UserTreeIndex.UserTE->getOpcode() == Instruction::PHI;
5474 if (IsNonSchedulableWithParentPhiNode) {
5475 SmallPtrSet<Value *, 4> ParentsUniqueUsers;
5476 const TreeEntry *ParentTE =
TE->UserTreeIndex.UserTE;
5477 for (
Value *V : ParentTE->Scalars) {
5481 if (ParentsUniqueUsers.
insert(
PHI).second &&
5486 Inc =
count(
TE->Scalars, User);
5494 bool IsCommutativeUser =
5497 if (!IsCommutativeUser) {
5507 (!IsCommutativeUser ||
5516 "Expected commutative user with 2 first commutable operands");
5517 bool IsCommutativeWithSameOps =
5518 IsCommutativeUser &&
User->getOperand(0) ==
User->getOperand(1);
5519 if ((!IsCommutativeUser || IsCommutativeWithSameOps) &&
5521 EdgeInfo EI(TE,
U.getOperandNo());
5522 if (CurNumOps !=
NumOps || getScheduleCopyableData(EI,
Op))
5526 PotentiallyReorderedEntriesCount.
try_emplace(TE, 0)
5527 .first->getSecond() += Inc;
5530 if (PotentiallyReorderedEntriesCount.
empty())
5533 for (
auto &
P : PotentiallyReorderedEntriesCount) {
5534 SmallPtrSet<Value *, 4> ParentsUniqueUsers;
5535 bool IsNonSchedulableWithParentPhiNode =
5536 P.first->doesNotNeedToSchedule() &&
P.first->UserTreeIndex &&
5537 P.first->UserTreeIndex.UserTE->hasState() &&
5538 P.first->UserTreeIndex.UserTE->State != TreeEntry::SplitVectorize &&
5539 P.first->UserTreeIndex.UserTE->getOpcode() == Instruction::PHI;
5540 auto *It =
find(
P.first->Scalars, User);
5542 assert(It !=
P.first->Scalars.end() &&
5543 "User is not in the tree entry");
5544 int Lane = std::distance(
P.first->Scalars.begin(), It);
5545 assert(Lane >= 0 &&
"Lane is not found");
5547 Lane =
P.first->ReorderIndices[Lane];
5548 assert(Lane <
static_cast<int>(
P.first->Scalars.size()) &&
5549 "Couldn't find extract lane");
5552 if (IsNonSchedulableWithParentPhiNode) {
5553 const TreeEntry *ParentTE =
P.first->UserTreeIndex.UserTE;
5555 if (!ParentsUniqueUsers.
insert(User).second) {
5561 for (
unsigned OpIdx :
5563 P.first->getMainOp()))) {
5564 if (
P.first->getOperand(
OpIdx)[Lane] ==
Op &&
5565 getScheduleCopyableData(EdgeInfo(
P.first,
OpIdx),
Op))
5570 }
while (It !=
P.first->Scalars.end());
5572 return all_of(PotentiallyReorderedEntriesCount,
5573 [&](
const std::pair<const TreeEntry *, unsigned> &
P) {
5574 return P.second ==
NumOps - 1;
5579 getScheduleCopyableData(
const Instruction *
I)
const {
5580 if (ScheduleCopyableDataMapByInst.empty())
5582 const auto It = ScheduleCopyableDataMapByInst.find(
I);
5583 if (It == ScheduleCopyableDataMapByInst.end())
5586 for (ScheduleCopyableData *SD : It->getSecond()) {
5587 if (isInSchedulingRegion(*SD))
5594 getScheduleCopyableDataUsers(
const Instruction *User)
const {
5595 if (ScheduleCopyableDataMapByUsers.empty())
5597 const auto It = ScheduleCopyableDataMapByUsers.find(User);
5598 if (It == ScheduleCopyableDataMapByUsers.end())
5601 for (ScheduleCopyableData *SD : It->getSecond()) {
5602 if (isInSchedulingRegion(*SD))
5608 ScheduleCopyableData &addScheduleCopyableData(
const EdgeInfo &EI,
5610 int SchedulingRegionID,
5611 ScheduleBundle &Bundle) {
5612 assert(!getScheduleCopyableData(EI,
I) &&
"already in the map");
5613 ScheduleCopyableData *CD =
5614 ScheduleCopyableDataMap
5615 .try_emplace(std::make_pair(EI,
I),
5616 std::make_unique<ScheduleCopyableData>(
5617 SchedulingRegionID,
I, EI, Bundle))
5620 ScheduleCopyableDataMapByInst[
I].push_back(CD);
5624 assert(It !=
Op.end() &&
"Lane not set");
5625 SmallPtrSet<Instruction *, 4> Visited;
5627 int Lane = std::distance(
Op.begin(), It);
5628 assert(Lane >= 0 &&
"Lane not set");
5630 !EI.UserTE->ReorderIndices.empty())
5631 Lane = EI.UserTE->ReorderIndices[Lane];
5632 assert(Lane <
static_cast<int>(EI.UserTE->Scalars.size()) &&
5633 "Couldn't find extract lane");
5635 if (!Visited.
insert(In).second) {
5639 ScheduleCopyableDataMapByInstUser
5640 .try_emplace(std::make_pair(std::make_pair(In, EI.EdgeIdx),
I))
5643 ScheduleCopyableDataMapByUsers.try_emplace(
I)
5650 EdgeInfo UserEI = EI.UserTE->UserTreeIndex;
5651 if (ScheduleCopyableData *UserCD =
5652 getScheduleCopyableData(UserEI, In))
5653 ScheduleCopyableDataMapByUsers[
I].remove(UserCD);
5656 }
while (It !=
Op.end());
5658 ScheduleCopyableDataMapByUsers.try_emplace(
I).first->getSecond().insert(
5668 auto It = ScheduledBundles.find(
I);
5669 if (It == ScheduledBundles.end())
5671 return It->getSecond();
5675 bool isInSchedulingRegion(
const ScheduleEntity &SD)
const {
5677 return Data->getSchedulingRegionID() == SchedulingRegionID;
5679 return CD->getSchedulingRegionID() == SchedulingRegionID;
5681 [&](
const ScheduleEntity *BundleMember) {
5682 return isInSchedulingRegion(*BundleMember);
5688 template <
typename ReadyListType>
5689 void schedule(
const BoUpSLP &R,
const InstructionsState &S,
5690 const EdgeInfo &EI, ScheduleEntity *
Data,
5691 ReadyListType &ReadyList) {
5692 auto ProcessBundleMember = [&](ScheduleEntity *BundleMember,
5697 auto DecrUnsched = [&](
auto *
Data,
bool IsControl =
false) {
5698 if ((IsControl ||
Data->hasValidDependencies()) &&
5699 Data->incrementUnscheduledDeps(-1) == 0) {
5706 CopyableBundle.
push_back(&CD->getBundle());
5707 Bundles = CopyableBundle;
5709 Bundles = getScheduleBundles(
Data->getInst());
5711 if (!Bundles.
empty()) {
5712 for (ScheduleBundle *Bundle : Bundles) {
5713 if (Bundle->unscheduledDepsInBundle() == 0) {
5714 assert(!Bundle->isScheduled() &&
5715 "already scheduled bundle gets ready");
5716 ReadyList.insert(Bundle);
5718 <<
"SLP: gets ready: " << *Bundle <<
"\n");
5724 "already scheduled bundle gets ready");
5726 "Expected non-copyable data");
5727 ReadyList.insert(
Data);
5734 if (!ScheduleCopyableDataMap.empty()) {
5736 getScheduleCopyableData(User,
OpIdx,
I);
5737 for (ScheduleCopyableData *CD : CopyableData)
5738 DecrUnsched(CD,
false);
5739 if (!CopyableData.empty())
5742 if (ScheduleData *OpSD = getScheduleData(
I))
5743 DecrUnsched(OpSD,
false);
5749 if (!Bundles.empty()) {
5750 auto *
In = BundleMember->getInst();
5752 SmallDenseMap<const Instruction *, unsigned> OperandsUses;
5753 unsigned TotalOpCount = 0;
5756 TotalOpCount = OperandsUses[
In] = 1;
5758 for (
const Use &U :
In->operands()) {
5761 ++Res.first->getSecond();
5768 auto DecrUnschedForInst =
5770 SmallDenseSet<std::pair<const ScheduleEntity *, unsigned>>
5772 if (!ScheduleCopyableDataMap.empty()) {
5773 const EdgeInfo EI = {UserTE,
OpIdx};
5774 if (ScheduleCopyableData *CD =
5775 getScheduleCopyableData(EI,
I)) {
5776 if (!Checked.insert(std::make_pair(CD,
OpIdx)).second)
5778 DecrUnsched(CD,
false);
5782 auto It = OperandsUses.
find(
I);
5783 assert(It != OperandsUses.
end() &&
"Operand not found");
5784 if (It->second > 0) {
5785 if (ScheduleData *OpSD = getScheduleData(
I)) {
5786 if (!Checked.insert(std::make_pair(OpSD,
OpIdx)).second)
5789 assert(TotalOpCount > 0 &&
"No more operands to decrement");
5791 DecrUnsched(OpSD,
false);
5794 assert(TotalOpCount > 0 &&
"No more operands to decrement");
5800 SmallDenseSet<std::pair<const ScheduleEntity *, unsigned>> Checked;
5801 for (ScheduleBundle *Bundle : Bundles) {
5802 if (ScheduleCopyableDataMap.empty() && TotalOpCount == 0)
5804 SmallPtrSet<Value *, 4> ParentsUniqueUsers;
5807 auto *It =
find(Bundle->getTreeEntry()->Scalars, In);
5808 bool IsNonSchedulableWithParentPhiNode =
5809 Bundle->getTreeEntry()->doesNotNeedToSchedule() &&
5810 Bundle->getTreeEntry()->UserTreeIndex &&
5811 Bundle->getTreeEntry()->UserTreeIndex.UserTE->hasState() &&
5812 Bundle->getTreeEntry()->UserTreeIndex.UserTE->State !=
5813 TreeEntry::SplitVectorize &&
5814 Bundle->getTreeEntry()->UserTreeIndex.UserTE->getOpcode() ==
5818 std::distance(Bundle->getTreeEntry()->Scalars.begin(), It);
5819 assert(Lane >= 0 &&
"Lane not set");
5821 !Bundle->getTreeEntry()->ReorderIndices.empty())
5822 Lane = Bundle->getTreeEntry()->ReorderIndices[Lane];
5823 assert(Lane <
static_cast<int>(
5824 Bundle->getTreeEntry()->Scalars.size()) &&
5825 "Couldn't find extract lane");
5836 In->getNumOperands() ==
5837 Bundle->getTreeEntry()->getNumOperands() ||
5838 (
isa<ZExtInst>(In) && Bundle->getTreeEntry()->getOpcode() ==
5839 Instruction::Select) ||
5840 Bundle->getTreeEntry()->isCopyableElement(In)) &&
5841 "Missed TreeEntry operands?");
5845 if (IsNonSchedulableWithParentPhiNode) {
5846 const TreeEntry *ParentTE =
5847 Bundle->getTreeEntry()->UserTreeIndex.UserTE;
5849 if (!ParentsUniqueUsers.
insert(User).second) {
5850 It = std::find(std::next(It),
5851 Bundle->getTreeEntry()->Scalars.end(), In);
5856 for (
unsigned OpIdx :
5859 Bundle->getTreeEntry()->getOperand(
OpIdx)[Lane])) {
5862 DecrUnschedForInst(
I, Bundle->getTreeEntry(),
OpIdx, Checked);
5865 if (Bundle->getTreeEntry()->isCopyableElement(In))
5867 It = std::find(std::next(It),
5868 Bundle->getTreeEntry()->Scalars.end(), In);
5869 }
while (It != Bundle->getTreeEntry()->Scalars.end());
5874 for (Use &U : BundleMember->getInst()->operands()) {
5877 <<
"SLP: check for readiness (def): " << *
I <<
"\n");
5878 DecrUnschedForInst(BundleMember->getInst(),
U.getOperandNo(),
I);
5886 SmallPtrSet<const ScheduleData *, 4> VisitedMemory;
5887 for (ScheduleData *MemoryDep : SD->getMemoryDependencies()) {
5888 if (!VisitedMemory.
insert(MemoryDep).second)
5893 << *MemoryDep <<
"\n");
5894 DecrUnsched(MemoryDep);
5897 SmallPtrSet<const ScheduleData *, 4> VisitedControl;
5898 for (ScheduleData *Dep : SD->getControlDependencies()) {
5899 if (!VisitedControl.
insert(Dep).second)
5904 <<
"SLP: check for readiness (ctrl): " << *Dep <<
"\n");
5905 DecrUnsched(Dep,
true);
5909 SD->setScheduled(
true);
5915 if (!Entries.
empty()) {
5916 for (TreeEntry *TE : Entries) {
5918 In->getNumOperands() !=
TE->getNumOperands())
5921 PseudoBundles.
emplace_back(std::make_unique<ScheduleBundle>());
5922 BundlePtr->setTreeEntry(TE);
5927 ProcessBundleMember(SD, Bundles);
5930 Bundle.setScheduled(
true);
5932 auto AreAllBundlesScheduled =
5933 [&](
const ScheduleEntity *SD,
5937 return !SDBundles.empty() &&
5938 all_of(SDBundles, [&](
const ScheduleBundle *SDBundle) {
5939 return SDBundle->isScheduled();
5942 for (ScheduleEntity *SD : Bundle.getBundle()) {
5945 SDBundles = getScheduleBundles(SD->getInst());
5946 if (AreAllBundlesScheduled(SD, SDBundles)) {
5947 SD->setScheduled(
true);
5960 assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
5961 ScheduleStart->comesBefore(ScheduleEnd) &&
5962 "Not a valid scheduling region?");
5964 for (
auto *
I = ScheduleStart;
I != ScheduleEnd;
I =
I->getNextNode()) {
5966 if (!Bundles.
empty()) {
5967 for (ScheduleBundle *Bundle : Bundles) {
5968 assert(isInSchedulingRegion(*Bundle) &&
5969 "primary schedule data not in window?");
5974 auto *SD = getScheduleData(
I);
5977 assert(isInSchedulingRegion(*SD) &&
5978 "primary schedule data not in window?");
5983 [](
const ScheduleEntity *Bundle) {
5984 return Bundle->isReady();
5986 "item in ready list not ready?");
5990 template <
typename ReadyListType>
5991 void initialFillReadyList(ReadyListType &ReadyList) {
5992 SmallPtrSet<ScheduleBundle *, 16> Visited;
5993 for (
auto *
I = ScheduleStart;
I != ScheduleEnd;
I =
I->getNextNode()) {
5994 ScheduleData *SD = getScheduleData(
I);
5995 if (SD && SD->hasValidDependencies() && SD->isReady()) {
5998 for (ScheduleBundle *Bundle : Bundles) {
5999 if (!Visited.
insert(Bundle).second)
6001 if (Bundle->hasValidDependencies() && Bundle->isReady()) {
6002 ReadyList.insert(Bundle);
6004 << *Bundle <<
"\n");
6009 ReadyList.insert(SD);
6011 <<
"SLP: initially in ready list: " << *SD <<
"\n");
6022 const InstructionsState &S,
const EdgeInfo &EI);
6029 std::optional<ScheduleBundle *>
6031 const InstructionsState &S,
const EdgeInfo &EI);
6034 ScheduleData *allocateScheduleDataChunks();
6038 bool extendSchedulingRegion(
Value *V,
const InstructionsState &S);
6042 void initScheduleData(Instruction *FromI, Instruction *ToI,
6043 ScheduleData *PrevLoadStore,
6044 ScheduleData *NextLoadStore);
6048 void calculateDependencies(ScheduleBundle &Bundle,
bool InsertInReadyList,
6053 void resetSchedule();
6070 SmallDenseMap<Instruction *, ScheduleData *> ScheduleDataMap;
6074 SmallDenseMap<std::pair<EdgeInfo, const Value *>,
6075 std::unique_ptr<ScheduleCopyableData>>
6076 ScheduleCopyableDataMap;
6082 SmallDenseMap<const Instruction *, SmallVector<ScheduleCopyableData *>>
6083 ScheduleCopyableDataMapByInst;
6089 SmallDenseMap<std::pair<std::pair<const Value *, unsigned>,
const Value *>,
6091 ScheduleCopyableDataMapByInstUser;
6111 SmallSetVector<ScheduleCopyableData *, 4>>
6112 ScheduleCopyableDataMapByUsers;
6115 SmallDenseMap<Instruction *, SmallVector<ScheduleBundle *>>
6121 SetVector<ScheduleEntity *> ReadyInsts;
6131 ScheduleData *FirstLoadStoreInRegion =
nullptr;
6135 ScheduleData *LastLoadStoreInRegion =
nullptr;
6140 bool RegionHasStackSave =
false;
6143 int ScheduleRegionSize = 0;
6152 int SchedulingRegionID = 1;
6156 MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
6160 void scheduleBlock(
const BoUpSLP &R, BlockScheduling *BS);
6163 const SmallDenseSet<Value *> *UserIgnoreList =
nullptr;
6167 struct OrdersTypeDenseMapInfo {
6180 static unsigned getHashValue(
const OrdersType &V) {
6191 ScalarEvolution *SE;
6192 TargetTransformInfo *TTI;
6193 TargetLibraryInfo *TLI;
6196 AssumptionCache *AC;
6198 const DataLayout *DL;
6199 OptimizationRemarkEmitter *ORE;
6201 unsigned MaxVecRegSize;
6202 unsigned MinVecRegSize;
6205 IRBuilder<TargetFolder> Builder;
6212 DenseMap<const TreeEntry *, std::pair<uint64_t, bool>> MinBWs;
6217 unsigned ReductionBitWidth = 0;
6220 unsigned BaseGraphSize = 1;
6224 std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;
6228 DenseSet<unsigned> ExtraBitWidthNodes;
6236 SecondInfo::getEmptyKey());
6241 SecondInfo::getTombstoneKey());
6246 SecondInfo::getHashValue(Val.
EdgeIdx));
6267 ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
6278 return R.VectorizableTree[0].get();
6282 return {&
N->UserTreeIndex,
N->Container};
6286 return {&
N->UserTreeIndex + 1,
N->Container};
6313 static unsigned size(
BoUpSLP *R) {
return R->VectorizableTree.size(); }
6325 OS << Entry->Idx <<
".\n";
6328 for (
auto *V : Entry->Scalars) {
6330 if (
llvm::any_of(R->ExternalUses, [&](
const BoUpSLP::ExternalUser &EU) {
6331 return EU.Scalar == V;
6341 if (Entry->isGather())
6343 if (Entry->State == TreeEntry::ScatterVectorize ||
6344 Entry->State == TreeEntry::StridedVectorize ||
6345 Entry->State == TreeEntry::CompressVectorize)
6346 return "color=blue";
6353 for (
auto *
I : DeletedInstructions) {
6354 if (!
I->getParent()) {
6359 I->insertBefore(F->getEntryBlock(),
6360 F->getEntryBlock().getFirstNonPHIIt());
6362 I->insertBefore(F->getEntryBlock().getTerminator()->getIterator());
6365 for (
Use &U :
I->operands()) {
6367 if (
Op && !DeletedInstructions.count(
Op) &&
Op->hasOneUser() &&
6371 I->dropAllReferences();
6373 for (
auto *
I : DeletedInstructions) {
6375 "trying to erase instruction with users.");
6376 I->eraseFromParent();
6382#ifdef EXPENSIVE_CHECKS
6393 assert(!Mask.empty() && Reuses.
size() == Mask.size() &&
6394 "Expected non-empty mask.");
6397 for (
unsigned I = 0,
E = Prev.
size();
I <
E; ++
I)
6399 Reuses[Mask[
I]] = Prev[
I];
6407 bool BottomOrder =
false) {
6408 assert(!Mask.empty() &&
"Expected non-empty mask.");
6409 unsigned Sz = Mask.size();
6412 if (Order.
empty()) {
6414 std::iota(PrevOrder.
begin(), PrevOrder.
end(), 0);
6416 PrevOrder.
swap(Order);
6419 for (
unsigned I = 0;
I < Sz; ++
I)
6421 Order[
I] = PrevOrder[Mask[
I]];
6423 return Data.value() == Sz ||
Data.index() ==
Data.value();
6432 if (Order.
empty()) {
6434 std::iota(MaskOrder.
begin(), MaskOrder.
end(), 0);
6444 for (
unsigned I = 0;
I < Sz; ++
I)
6446 Order[MaskOrder[
I]] =
I;
6450std::optional<BoUpSLP::OrdersType>
6452 bool TopToBottom,
bool IgnoreReorder) {
6453 assert(TE.isGather() &&
"Expected gather node only.");
6457 Type *ScalarTy = GatheredScalars.
front()->getType();
6458 size_t NumScalars = GatheredScalars.
size();
6460 return std::nullopt;
6467 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
6469 isGatherShuffledEntry(&TE, GatheredScalars, Mask, Entries, NumParts,
6472 if (GatherShuffles.
empty() && ExtractShuffles.
empty())
6473 return std::nullopt;
6474 OrdersType CurrentOrder(NumScalars, NumScalars);
6475 if (GatherShuffles.
size() == 1 &&
6477 Entries.
front().front()->isSame(TE.Scalars)) {
6481 return std::nullopt;
6483 if (Entries.
front().front()->UserTreeIndex.UserTE ==
6484 TE.UserTreeIndex.UserTE)
6485 return std::nullopt;
6488 if (!IgnoreReorder && Entries.
front().front()->Idx == 0)
6489 return std::nullopt;
6492 if (!Entries.
front().front()->ReuseShuffleIndices.empty() &&
6493 TE.getVectorFactor() == 2 && Mask.size() == 2 &&
6496 return P.value() % 2 != static_cast<int>(P.index()) % 2;
6498 return std::nullopt;
6502 std::iota(CurrentOrder.
begin(), CurrentOrder.
end(), 0);
6503 return CurrentOrder;
6507 return all_of(Mask, [&](
int I) {
6514 if ((ExtractShuffles.
empty() && IsSplatMask(Mask) &&
6515 (Entries.
size() != 1 ||
6516 Entries.
front().front()->ReorderIndices.empty())) ||
6517 (GatherShuffles.
empty() && IsSplatMask(ExtractMask)))
6518 return std::nullopt;
6524 if (ShuffledSubMasks.
test(
I))
6526 const int VF = GetVF(
I);
6534 ShuffledSubMasks.
set(
I);
6538 int FirstMin = INT_MAX;
6539 int SecondVecFound =
false;
6541 int Idx = Mask[
I * PartSz + K];
6543 Value *V = GatheredScalars[
I * PartSz + K];
6545 SecondVecFound =
true;
6554 SecondVecFound =
true;
6558 FirstMin = (FirstMin / PartSz) * PartSz;
6560 if (SecondVecFound) {
6562 ShuffledSubMasks.
set(
I);
6566 int Idx = Mask[
I * PartSz + K];
6570 if (Idx >= PartSz) {
6571 SecondVecFound =
true;
6574 if (CurrentOrder[
I * PartSz + Idx] >
6575 static_cast<unsigned>(
I * PartSz + K) &&
6576 CurrentOrder[
I * PartSz + Idx] !=
6577 static_cast<unsigned>(
I * PartSz + Idx))
6578 CurrentOrder[
I * PartSz + Idx] =
I * PartSz + K;
6581 if (SecondVecFound) {
6583 ShuffledSubMasks.
set(
I);
6589 if (!ExtractShuffles.
empty())
6590 TransformMaskToOrder(
6591 CurrentOrder, ExtractMask, PartSz, NumParts, [&](
unsigned I) {
6592 if (!ExtractShuffles[
I])
6595 unsigned Sz =
getNumElems(TE.getVectorFactor(), PartSz,
I);
6597 int K =
I * PartSz + Idx;
6600 if (!TE.ReuseShuffleIndices.empty())
6601 K = TE.ReuseShuffleIndices[K];
6604 if (!TE.ReorderIndices.empty())
6605 K = std::distance(TE.ReorderIndices.begin(),
6606 find(TE.ReorderIndices, K));
6612 .getKnownMinValue());
6617 if (GatherShuffles.
size() == 1 && NumParts != 1) {
6618 if (ShuffledSubMasks.
any())
6619 return std::nullopt;
6620 PartSz = NumScalars;
6623 if (!Entries.
empty())
6624 TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](
unsigned I) {
6625 if (!GatherShuffles[
I])
6627 return std::max(Entries[
I].front()->getVectorFactor(),
6628 Entries[
I].back()->getVectorFactor());
6630 unsigned NumUndefs =
count(CurrentOrder, NumScalars);
6631 if (ShuffledSubMasks.
all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2))
6632 return std::nullopt;
6633 return std::move(CurrentOrder);
6638 bool CompareOpcodes =
true) {
6644 return (!GEP1 || GEP1->getNumOperands() == 2) &&
6645 (!GEP2 || GEP2->getNumOperands() == 2) &&
6646 (((!GEP1 ||
isConstant(GEP1->getOperand(1))) &&
6647 (!GEP2 ||
isConstant(GEP2->getOperand(1)))) ||
6650 getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)));
6654template <
typename T>
6659 return CommonAlignment;
6665 "Order is empty. Please check it before using isReverseOrder.");
6666 unsigned Sz = Order.
size();
6668 return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
6689 "Coeffs vector needs to be of correct size");
6691 const SCEV *PtrSCEVLowest =
nullptr;
6692 const SCEV *PtrSCEVHighest =
nullptr;
6695 for (
Value *Ptr : PointerOps) {
6700 if (!PtrSCEVLowest && !PtrSCEVHighest) {
6701 PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
6708 PtrSCEVLowest = PtrSCEV;
6715 PtrSCEVHighest = PtrSCEV;
6723 int Size =
DL.getTypeStoreSize(ElemTy);
6724 auto TryGetStride = [&](
const SCEV *Dist,
6725 const SCEV *Multiplier) ->
const SCEV * {
6727 if (M->getOperand(0) == Multiplier)
6728 return M->getOperand(1);
6729 if (M->getOperand(1) == Multiplier)
6730 return M->getOperand(0);
6733 if (Multiplier == Dist)
6738 const SCEV *Stride =
nullptr;
6739 if (
Size != 1 || SCEVs.
size() > 2) {
6741 Stride = TryGetStride(Dist, Sz);
6749 using DistOrdPair = std::pair<int64_t, int>;
6751 std::set<DistOrdPair,
decltype(Compare)> Offsets(Compare);
6753 bool IsConsecutive =
true;
6754 for (
const auto [Idx, PtrSCEV] :
enumerate(SCEVs)) {
6756 if (PtrSCEV != PtrSCEVLowest) {
6758 const SCEV *Coeff = TryGetStride(Diff, Stride);
6764 Coeffs[Idx] = (int64_t)SC->getAPInt().getLimitedValue();
6769 Dist = SC->getAPInt().getZExtValue();
6776 auto Res = Offsets.emplace(Dist, Cnt);
6780 IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
6783 if (Offsets.size() != SCEVs.
size())
6785 SortedIndices.
clear();
6786 if (!IsConsecutive) {
6790 for (
const std::pair<int64_t, int> &Pair : Offsets) {
6791 SortedIndices[Cnt] = Pair.second;
6798static std::pair<InstructionCost, InstructionCost>
6817 return TTI.getShuffleCost(Kind, DstTy, Tp, Mask,
CostKind, Index, SubTp,
6819 int NumSrcElts = Tp->getElementCount().getKnownMinValue();
6822 Mask, NumSrcElts, NumSubElts, Index)) {
6823 if (Index + NumSubElts > NumSrcElts &&
6824 Index + NumSrcElts <=
static_cast<int>(
Mask.size()))
6828 return TTI.getShuffleCost(Kind, DstTy, Tp, Mask,
CostKind, Index, SubTp,
6841 "ScalableVectorType is not supported.");
6844 "Incorrect usage.");
6849 unsigned ScalarTyNumElements = VecTy->getNumElements();
6852 if (!DemandedElts[
I])
6856 I * ScalarTyNumElements, VecTy);
6859 I * ScalarTyNumElements, VecTy);
6863 return TTI.getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
6872 ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) {
6873 if (Opcode == Instruction::ExtractElement) {
6879 Index * VecTy->getNumElements(), VecTy);
6882 return TTI.getVectorInstrCost(Opcode, Val,
CostKind, Index, Scalar,
6895 getWidenedType(VecTy->getElementType(), ScalarTy->getNumElements());
6897 Index * ScalarTy->getNumElements(), SubTp) +
6901 return TTI.getExtractWithExtendCost(Opcode, Dst, VecTy, Index,
CostKind);
6917 auto *Begin = std::next(
Mask.begin(), Index);
6918 std::iota(Begin, std::next(Begin, SubVecVF), 0);
6919 Vec = Builder.CreateShuffleVector(V, Mask);
6922 std::iota(
Mask.begin(),
Mask.end(), 0);
6923 std::iota(std::next(
Mask.begin(), Index),
6924 std::next(
Mask.begin(), Index + SubVecVF), VecVF);
6926 return Generator(Vec, V, Mask);
6929 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), SubVecVF), 0);
6930 V = Builder.CreateShuffleVector(V, ResizeMask);
6932 return Builder.CreateShuffleVector(Vec, V, Mask);
6937 unsigned SubVecVF,
unsigned Index) {
6939 std::iota(Mask.begin(), Mask.end(), Index);
6940 return Builder.CreateShuffleVector(Vec, Mask);
6950 const unsigned Sz = PointerOps.
size();
6953 CompressMask[0] = 0;
6955 std::optional<unsigned> Stride = 0;
6958 Value *Ptr = Order.
empty() ? PointerOps[
I] : PointerOps[Order[
I]];
6959 std::optional<int64_t> OptPos =
6961 if (!OptPos || OptPos > std::numeric_limits<unsigned>::max())
6963 unsigned Pos =
static_cast<unsigned>(*OptPos);
6964 CompressMask[
I] = Pos;
6971 if (Pos != *Stride *
I)
6974 return Stride.has_value();
6987 InterleaveFactor = 0;
6989 const size_t Sz = VL.
size();
6997 if (AreAllUsersVectorized(V))
7000 TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy,
CostKind,
7001 Mask.empty() ?
I : Mask[
I]);
7004 if (ExtractCost <= ScalarCost)
7009 if (Order.
empty()) {
7010 Ptr0 = PointerOps.
front();
7011 PtrN = PointerOps.
back();
7013 Ptr0 = PointerOps[Order.
front()];
7014 PtrN = PointerOps[Order.
back()];
7016 std::optional<int64_t> Diff =
7020 const size_t MaxRegSize =
7024 if (*Diff / Sz >= MaxRegSize / 8)
7028 Align CommonAlignment = LI->getAlign();
7030 Ptr0, LoadVecTy, CommonAlignment,
DL,
7033 if (IsMasked && !
TTI.isLegalMaskedLoad(LoadVecTy, CommonAlignment,
7034 LI->getPointerAddressSpace()))
7040 assert(CompressMask.
size() >= 2 &&
"At least two elements are required");
7044 auto [ScalarGEPCost, VectorGEPCost] =
7046 Instruction::GetElementPtr,
CostKind, ScalarTy, LoadVecTy);
7063 LoadCost =
TTI.getMemIntrinsicInstrCost(
7066 LI->getPointerAddressSpace()),
7070 TTI.getMemoryOpCost(Instruction::Load, LoadVecTy, CommonAlignment,
7071 LI->getPointerAddressSpace(),
CostKind);
7073 if (IsStrided && !IsMasked && Order.
empty()) {
7080 AlignedLoadVecTy = LoadVecTy;
7081 if (
TTI.isLegalInterleavedAccessType(AlignedLoadVecTy, CompressMask[1],
7083 LI->getPointerAddressSpace())) {
7085 VectorGEPCost +
TTI.getInterleavedMemoryOpCost(
7086 Instruction::Load, AlignedLoadVecTy,
7087 CompressMask[1], {}, CommonAlignment,
7088 LI->getPointerAddressSpace(),
CostKind, IsMasked);
7089 if (InterleavedCost < GatherCost) {
7090 InterleaveFactor = CompressMask[1];
7091 LoadVecTy = AlignedLoadVecTy;
7098 if (!Order.
empty()) {
7101 NewMask[
I] = CompressMask[Mask[
I]];
7103 CompressMask.
swap(NewMask);
7105 InstructionCost TotalVecCost = VectorGEPCost + LoadCost + CompressCost;
7106 return TotalVecCost < GatherCost;
7119 unsigned InterleaveFactor;
7123 AreAllUsersVectorized, IsMasked, InterleaveFactor,
7124 CompressMask, LoadVecTy);
7141 Align Alignment,
const int64_t Diff,
7142 const size_t Sz)
const {
7143 if (Diff % (Sz - 1) != 0)
7147 auto IsAnyPointerUsedOutGraph =
any_of(PointerOps, [&](
Value *V) {
7149 return !isVectorized(U) && !MustGather.contains(U);
7153 const uint64_t AbsoluteDiff = std::abs(Diff);
7155 if (IsAnyPointerUsedOutGraph ||
7156 (AbsoluteDiff > Sz &&
7159 AbsoluteDiff % Sz == 0 &&
has_single_bit(AbsoluteDiff / Sz)))) ||
7160 Diff == -(
static_cast<int64_t
>(Sz) - 1)) {
7161 int64_t Stride = Diff /
static_cast<int64_t
>(Sz - 1);
7162 if (Diff != Stride *
static_cast<int64_t
>(Sz - 1))
7164 if (!TTI->isLegalStridedLoadStore(VecTy, Alignment))
7174 Value *Ptr0,
Value *PtrN, StridedPtrInfo &SPtrInfo)
const {
7175 const size_t Sz = PointerOps.
size();
7183 SortedIndices.
empty() ? PointerOps[
I] : PointerOps[SortedIndices[
I]];
7184 std::optional<int64_t>
Offset =
7186 assert(
Offset &&
"sortPtrAccesses should have validated this pointer");
7187 SortedOffsetsFromBase[
I] = *
Offset;
7204 int64_t StrideWithinGroup =
7205 SortedOffsetsFromBase[1] - SortedOffsetsFromBase[0];
7208 auto IsEndOfGroupIndex = [=, &SortedOffsetsFromBase](
unsigned Idx) {
7209 return SortedOffsetsFromBase[Idx] - SortedOffsetsFromBase[Idx - 1] !=
7214 unsigned GroupSize = FoundIt != Indices.end() ? *FoundIt : Sz;
7216 unsigned VecSz = Sz;
7217 Type *NewScalarTy = ScalarTy;
7221 bool NeedsWidening = Sz != GroupSize;
7222 if (NeedsWidening) {
7223 if (Sz % GroupSize != 0)
7226 if (StrideWithinGroup != 1)
7228 VecSz = Sz / GroupSize;
7231 DL->getTypeSizeInBits(ScalarTy).getFixedValue() * GroupSize);
7234 if (!
isStridedLoad(PointerOps, NewScalarTy, Alignment, Diff, VecSz))
7237 int64_t StrideIntVal = StrideWithinGroup;
7238 if (NeedsWidening) {
7241 unsigned CurrentGroupStartIdx = GroupSize;
7242 int64_t StrideBetweenGroups =
7243 SortedOffsetsFromBase[GroupSize] - SortedOffsetsFromBase[0];
7244 StrideIntVal = StrideBetweenGroups;
7245 for (; CurrentGroupStartIdx < Sz; CurrentGroupStartIdx += GroupSize) {
7246 if (SortedOffsetsFromBase[CurrentGroupStartIdx] -
7247 SortedOffsetsFromBase[CurrentGroupStartIdx - GroupSize] !=
7248 StrideBetweenGroups)
7252 auto CheckGroup = [=](
const unsigned StartIdx) ->
bool {
7255 unsigned GroupEndIdx = FoundIt != Indices.end() ? *FoundIt : Sz;
7256 return GroupEndIdx - StartIdx == GroupSize;
7258 for (
unsigned I = 0;
I < Sz;
I += GroupSize) {
7264 Type *StrideTy = DL->getIndexType(Ptr0->
getType());
7273 StridedPtrInfo &SPtrInfo)
const {
7279 OffsetToPointerOpIdxMap;
7280 for (
auto [Idx, Ptr] :
enumerate(PointerOps)) {
7281 const SCEV *PtrSCEV = SE->getSCEV(Ptr);
7293 Offset = SC->getAPInt().getSExtValue();
7294 if (
Offset >= std::numeric_limits<int64_t>::max() - 1) {
7301 OffsetToPointerOpIdxMap[
Offset].first.push_back(Ptr);
7302 OffsetToPointerOpIdxMap[
Offset].second.push_back(Idx);
7304 unsigned NumOffsets = OffsetToPointerOpIdxMap.
size();
7308 const unsigned Sz = PointerOps.
size();
7309 unsigned VecSz = Sz;
7310 Type *NewScalarTy = ScalarTy;
7311 if (NumOffsets > 1) {
7312 if (Sz % NumOffsets != 0)
7314 VecSz = Sz / NumOffsets;
7317 DL->getTypeSizeInBits(ScalarTy).getFixedValue() * NumOffsets);
7320 if (Sz <= MinProfitableStridedLoads || !TTI->isTypeLegal(StridedLoadTy) ||
7321 !TTI->isLegalStridedLoadStore(StridedLoadTy, CommonAlignment))
7327 for (
auto [Idx, MapPair] :
enumerate(OffsetToPointerOpIdxMap)) {
7328 if (MapPair.second.first.size() != VecSz)
7330 SortedOffsetsV[Idx] = MapPair.first;
7332 sort(SortedOffsetsV);
7334 if (NumOffsets > 1) {
7336 if (SortedOffsetsV[
I] - SortedOffsetsV[
I - 1] != 1)
7409 auto UpdateSortedIndices =
7412 if (SortedIndicesForOffset.
empty()) {
7413 SortedIndicesForOffset.
resize(IndicesInAllPointerOps.
size());
7414 std::iota(SortedIndicesForOffset.
begin(),
7415 SortedIndicesForOffset.
end(), 0);
7417 for (
const auto [Num, Idx] :
enumerate(SortedIndicesForOffset)) {
7418 SortedIndicesDraft[Num * NumOffsets + OffsetNum] =
7419 IndicesInAllPointerOps[Idx];
7423 int64_t LowestOffset = SortedOffsetsV[0];
7429 SortedIndicesForOffset0, Coeffs0);
7432 unsigned NumCoeffs0 = Coeffs0.
size();
7433 if (NumCoeffs0 * NumOffsets != Sz)
7438 OffsetToPointerOpIdxMap[LowestOffset].second;
7439 UpdateSortedIndices(SortedIndicesForOffset0, IndicesInAllPointerOps0, 0);
7445 for (
int J :
seq<int>(1, NumOffsets)) {
7448 SortedIndicesForOffset.
clear();
7450 int64_t
Offset = SortedOffsetsV[J];
7452 OffsetToPointerOpIdxMap[
Offset].first;
7454 OffsetToPointerOpIdxMap[
Offset].second;
7455 const SCEV *StrideWithinGroup =
7457 SortedIndicesForOffset, Coeffs);
7459 if (!StrideWithinGroup || StrideWithinGroup != Stride0)
7461 if (Coeffs.
size() != NumCoeffs0)
7464 if (Coeffs != Coeffs0)
7467 UpdateSortedIndices(SortedIndicesForOffset, IndicesInAllPointerOps, J);
7470 SortedIndices.
clear();
7471 SortedIndices = std::move(SortedIndicesDraft);
7472 SPtrInfo.StrideSCEV = Stride0;
7473 SPtrInfo.Ty = StridedLoadTy;
7480 unsigned *BestVF,
bool TryRecursiveCheck)
const {
7493 if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy))
7499 const size_t Sz = VL.
size();
7501 auto *POIter = PointerOps.
begin();
7502 for (
Value *V : VL) {
7504 if (!L || !L->isSimple())
7506 *POIter = L->getPointerOperand();
7512 bool IsSorted =
sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, Order);
7521 if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
7522 TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
7533 if (Order.
empty()) {
7534 Ptr0 = PointerOps.
front();
7535 PtrN = PointerOps.
back();
7537 Ptr0 = PointerOps[Order.
front()];
7538 PtrN = PointerOps[Order.
back()];
7543 std::optional<int64_t> Diff0 =
7545 std::optional<int64_t> DiffN =
7548 "sortPtrAccesses should have validated these pointers");
7549 int64_t Diff = *DiffN - *Diff0;
7551 if (
static_cast<uint64_t>(Diff) == Sz - 1)
7554 *TLI, [&](
Value *V) {
7555 return areAllUsersVectorized(
7563 Diff, Ptr0, PtrN, SPtrInfo))
7566 if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
7567 TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
7572 auto CheckForShuffledLoads = [&, &TTI = *TTI](
Align CommonAlignment,
7574 bool ProfitableGatherPointers) {
7579 auto [ScalarGEPCost, VectorGEPCost] =
7581 Instruction::GetElementPtr,
CostKind, ScalarTy, VecTy);
7585 Type *PtrScalarTy = PointerOps.
front()->getType()->getScalarType();
7587 if (
static_cast<unsigned>(
count_if(
7606 return C + TTI.getInstructionCost(
7612 TTI.getMemIntrinsicInstrCost(
7615 false, CommonAlignment),
7617 (ProfitableGatherPointers ? 0 : VectorGEPCost);
7625 constexpr unsigned ListLimit = 4;
7626 if (!TryRecursiveCheck || VL.
size() < ListLimit)
7635 unsigned Sz = DL->getTypeSizeInBits(ScalarTy);
7645 for (
unsigned Cnt = 0, End = VL.
size(); Cnt + VF <= End; Cnt += VF) {
7650 PointerOps, SPtrInfo, BestVF,
7658 DemandedElts.
setBits(Cnt, Cnt + VF);
7674 if (!DemandedElts.
isZero()) {
7680 if (DemandedElts[Idx])
7691 LI0->getPointerOperand(),
7692 Instruction::GetElementPtr,
CostKind, ScalarTy,
7696 if (
static_cast<unsigned>(
7698 PointerOps.
size() - 1 ||
7717 TTI.getMemoryOpCost(Instruction::Load, SubVecTy, LI0->getAlign(),
7718 LI0->getPointerAddressSpace(),
CostKind,
7723 VecLdCost += TTI.getMemIntrinsicInstrCost(
7725 Intrinsic::experimental_vp_strided_load,
7726 SubVecTy, LI0->getPointerOperand(),
7727 false, CommonAlignment),
7732 VecLdCost += TTI.getMemIntrinsicInstrCost(
7734 Intrinsic::masked_load, SubVecTy,
7735 CommonAlignment, LI0->getPointerAddressSpace()),
7741 VecLdCost += TTI.getMemIntrinsicInstrCost(
7743 Intrinsic::masked_gather, SubVecTy,
7744 LI0->getPointerOperand(),
7745 false, CommonAlignment),
7755 ShuffleMask[Idx] = Idx / VF ==
I ? VL.
size() + Idx % VF : Idx;
7764 if (MaskedGatherCost >= VecLdCost &&
7777 bool ProfitableGatherPointers =
7778 L && Sz > 2 &&
static_cast<unsigned>(
count_if(PointerOps, [L](
Value *V) {
7779 return L->isLoopInvariant(V);
7781 if (ProfitableGatherPointers ||
all_of(PointerOps, [](
Value *
P) {
7784 (
GEP &&
GEP->getNumOperands() == 2 &&
7792 if (!TryRecursiveCheck || !CheckForShuffledLoads(CommonAlignment, BestVF,
7793 ProfitableGatherPointers))
7805 all_of(VL, [](
const Value *V) {
return V->getType()->isPointerTy(); }) &&
7806 "Expected list of pointer operands.");
7811 std::pair<BasicBlock *, Value *>,
7815 .try_emplace(std::make_pair(
7819 SortedIndices.
clear();
7821 auto Key = std::make_pair(BBs[Cnt + 1],
7823 bool Found =
any_of(Bases.try_emplace(
Key).first->second,
7824 [&, &Cnt = Cnt, &Ptr = Ptr](
auto &
Base) {
7825 std::optional<int64_t> Diff =
7826 getPointersDiff(ElemTy, std::get<0>(Base.front()),
7827 ElemTy, Ptr, DL, SE,
7832 Base.emplace_back(Ptr, *Diff, Cnt + 1);
7838 if (Bases.size() > VL.
size() / 2 - 1)
7842 Bases.find(
Key)->second.emplace_back().emplace_back(Ptr, 0, Cnt + 1);
7846 if (Bases.size() == VL.
size())
7849 if (Bases.size() == 1 && (Bases.front().second.size() == 1 ||
7850 Bases.front().second.size() == VL.
size()))
7855 auto ComparePointers = [](
Value *Ptr1,
Value *Ptr2) {
7864 FirstPointers.
insert(P1);
7865 SecondPointers.
insert(P2);
7871 "Unable to find matching root.");
7874 for (
auto &
Base : Bases) {
7875 for (
auto &Vec :
Base.second) {
7876 if (Vec.size() > 1) {
7878 int64_t InitialOffset = std::get<1>(Vec[0]);
7879 bool AnyConsecutive =
7881 return std::get<1>(
P.value()) ==
7882 int64_t(
P.index()) + InitialOffset;
7886 if (!AnyConsecutive)
7891 return ComparePointers(std::get<0>(V1.front()), std::get<0>(V2.front()));
7895 for (
auto &
T : Bases)
7896 for (
const auto &Vec :
T.second)
7897 for (
const auto &
P : Vec)
7901 "Expected SortedIndices to be the size of VL");
7905std::optional<BoUpSLP::OrdersType>
7907 assert(TE.isGather() &&
"Expected gather node only.");
7908 Type *ScalarTy = TE.Scalars[0]->getType();
7911 Ptrs.
reserve(TE.Scalars.size());
7913 BBs.
reserve(TE.Scalars.size());
7914 for (
Value *V : TE.Scalars) {
7916 if (!L || !L->isSimple())
7917 return std::nullopt;
7923 if (!LoadEntriesToVectorize.contains(TE.Idx) &&
7925 return std::move(Order);
7926 return std::nullopt;
7937 if (VU->
getType() != V->getType())
7940 if (!VU->
hasOneUse() && !V->hasOneUse())
7946 if (Idx1 == std::nullopt || Idx2 == std::nullopt)
7953 bool IsReusedIdx =
false;
7955 if (IE2 == VU && !IE1)
7957 if (IE1 == V && !IE2)
7958 return V->hasOneUse();
7959 if (IE1 && IE1 != V) {
7961 IsReusedIdx |= ReusedIdx.
test(Idx1);
7962 ReusedIdx.
set(Idx1);
7963 if ((IE1 != VU && !IE1->
hasOneUse()) || IsReusedIdx)
7968 if (IE2 && IE2 != VU) {
7970 IsReusedIdx |= ReusedIdx.
test(Idx2);
7971 ReusedIdx.
set(Idx2);
7972 if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
7977 }
while (!IsReusedIdx && (IE1 || IE2));
7987std::optional<BoUpSLP::OrdersType>
7989 bool IgnoreReorder) {
7992 if (!TE.ReuseShuffleIndices.empty()) {
7994 assert(!TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI) &&
7995 "Reshuffling scalars not yet supported for nodes with padding");
7998 return std::nullopt;
8006 unsigned Sz = TE.Scalars.size();
8007 if (TE.isGather()) {
8008 if (std::optional<OrdersType> CurrentOrder =
8013 ::addMask(Mask, TE.ReuseShuffleIndices);
8014 OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor());
8015 unsigned Sz = TE.Scalars.size();
8016 for (
int K = 0, E = TE.getVectorFactor() / Sz; K < E; ++K) {
8019 Res[Idx + K * Sz] =
I + K * Sz;
8021 return std::move(Res);
8024 if (Sz == 2 && TE.getVectorFactor() == 4 &&
8026 2 * TE.getVectorFactor())) == 1)
8027 return std::nullopt;
8028 if (TE.ReuseShuffleIndices.size() % Sz != 0)
8029 return std::nullopt;
8033 if (TE.ReorderIndices.empty())
8034 std::iota(ReorderMask.
begin(), ReorderMask.
end(), 0);
8037 ::addMask(ReorderMask, TE.ReuseShuffleIndices);
8038 unsigned VF = ReorderMask.
size();
8042 for (
unsigned I = 0;
I < VF;
I += Sz) {
8044 unsigned UndefCnt = 0;
8045 unsigned Limit = std::min(Sz, VF -
I);
8054 Val >=
static_cast<int>(NumParts) || UsedVals.
test(Val) ||
8056 return std::nullopt;
8058 for (
unsigned K = 0; K < NumParts; ++K) {
8059 unsigned Idx = Val + Sz * K;
8060 if (Idx < VF &&
I + K < VF)
8061 ResOrder[Idx] =
I + K;
8064 return std::move(ResOrder);
8066 unsigned VF = TE.getVectorFactor();
8069 TE.ReuseShuffleIndices.end());
8070 if (TE.hasState() && TE.getOpcode() == Instruction::ExtractElement &&
8072 if (isa<PoisonValue>(V))
8074 std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V));
8075 return Idx && *Idx < Sz;
8077 assert(!TE.isAltShuffle() &&
"Alternate instructions are only supported "
8078 "by BinaryOperator and CastInst.");
8080 if (TE.ReorderIndices.empty())
8081 std::iota(ReorderMask.
begin(), ReorderMask.
end(), 0);
8084 for (
unsigned I = 0;
I < VF; ++
I) {
8085 int &Idx = ReusedMask[
I];
8088 Value *V = TE.Scalars[ReorderMask[Idx]];
8090 Idx = std::distance(ReorderMask.
begin(),
find(ReorderMask, *EI));
8096 std::iota(ResOrder.
begin(), ResOrder.
end(), 0);
8097 auto *It = ResOrder.
begin();
8098 for (
unsigned K = 0; K < VF; K += Sz) {
8102 std::iota(SubMask.
begin(), SubMask.
end(), 0);
8104 transform(CurrentOrder, It, [K](
unsigned Pos) {
return Pos + K; });
8105 std::advance(It, Sz);
8108 return Data.index() ==
Data.value();
8110 return std::nullopt;
8111 return std::move(ResOrder);
8113 if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
8114 (!TE.UserTreeIndex || !TE.UserTreeIndex.UserTE->hasState() ||
8116 (TE.ReorderIndices.empty() ||
isReverseOrder(TE.ReorderIndices)))
8117 return std::nullopt;
8118 if (TE.State == TreeEntry::SplitVectorize ||
8119 ((TE.State == TreeEntry::Vectorize ||
8120 TE.State == TreeEntry::StridedVectorize ||
8121 TE.State == TreeEntry::CompressVectorize) &&
8124 assert((TE.State == TreeEntry::SplitVectorize || !TE.isAltShuffle()) &&
8125 "Alternate instructions are only supported by "
8126 "BinaryOperator and CastInst.");
8127 return TE.ReorderIndices;
8129 if (!TopToBottom && IgnoreReorder && TE.State == TreeEntry::Vectorize &&
8130 TE.isAltShuffle()) {
8131 assert(TE.ReuseShuffleIndices.empty() &&
8132 "ReuseShuffleIndices should be "
8133 "empty for alternate instructions.");
8135 TE.buildAltOpShuffleMask(
8137 assert(TE.getMatchingMainOpOrAltOp(
I) &&
8138 "Unexpected main/alternate opcode");
8142 const int VF = TE.getVectorFactor();
8147 ResOrder[Mask[
I] % VF] =
I;
8149 return std::move(ResOrder);
8151 if (!TE.ReorderIndices.empty())
8152 return TE.ReorderIndices;
8153 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
8154 if (!TE.ReorderIndices.empty())
8155 return TE.ReorderIndices;
8158 for (
auto [
I, V] :
zip(UserBVHead, TE.Scalars)) {
8166 while (
II &&
II->hasOneUse() &&
II->getParent() == BB) {
8174 assert(BB1 != BB2 &&
"Expected different basic blocks.");
8175 if (!DT->isReachableFromEntry(BB1))
8177 if (!DT->isReachableFromEntry(BB2))
8179 auto *NodeA = DT->getNode(BB1);
8180 auto *NodeB = DT->getNode(BB2);
8181 assert(NodeA &&
"Should only process reachable instructions");
8182 assert(NodeB &&
"Should only process reachable instructions");
8183 assert((NodeA == NodeB) ==
8184 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
8185 "Different nodes should have different DFS numbers");
8186 return NodeA->getDFSNumIn() < NodeB->getDFSNumIn();
8188 auto PHICompare = [&](
unsigned I1,
unsigned I2) {
8189 Value *V1 = TE.Scalars[I1];
8190 Value *V2 = TE.Scalars[I2];
8203 if (FirstUserOfPhi1->getParent() != FirstUserOfPhi2->getParent())
8204 return CompareByBasicBlocks(FirstUserOfPhi1->getParent(),
8205 FirstUserOfPhi2->getParent());
8215 if (UserBVHead[I1] && !UserBVHead[I2])
8217 if (!UserBVHead[I1])
8219 if (UserBVHead[I1] == UserBVHead[I2])
8222 return CompareByBasicBlocks(UserBVHead[I1]->
getParent(),
8224 return UserBVHead[I1]->comesBefore(UserBVHead[I2]);
8237 if (EE1->getOperand(0) == EE2->getOperand(0))
8239 if (!Inst1 && Inst2)
8241 if (Inst1 && Inst2) {
8249 "Expected either instructions or arguments vector operands.");
8250 return P1->getArgNo() < P2->getArgNo();
8255 std::iota(Phis.
begin(), Phis.
end(), 0);
8258 return std::nullopt;
8259 return std::move(Phis);
8261 if (TE.isGather() &&
8262 (!TE.hasState() || !TE.isAltShuffle() ||
8263 ScalarsInSplitNodes.contains(TE.getMainOp())) &&
8267 if (((TE.hasState() && TE.getOpcode() == Instruction::ExtractElement) ||
8271 auto *EE = dyn_cast<ExtractElementInst>(V);
8272 return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
8278 canReuseExtract(TE.Scalars, CurrentOrder,
true);
8279 if (Reuse || !CurrentOrder.
empty())
8280 return std::move(CurrentOrder);
8288 int Sz = TE.Scalars.size();
8292 if (It == TE.Scalars.begin())
8295 if (It != TE.Scalars.end()) {
8297 unsigned Idx = std::distance(TE.Scalars.begin(), It);
8312 if (InsertFirstCost + PermuteCost < InsertIdxCost) {
8315 return std::move(Order);
8320 return std::nullopt;
8321 if (TE.Scalars.size() >= 3)
8326 if (TE.hasState() && TE.getOpcode() == Instruction::Load) {
8328 StridedPtrInfo SPtrInfo;
8331 CurrentOrder, PointerOps, SPtrInfo);
8334 return std::move(CurrentOrder);
8339 if (std::optional<OrdersType> CurrentOrder =
8341 return CurrentOrder;
8343 return std::nullopt;
8353 for (
unsigned I = Sz,
E = Mask.size();
I <
E;
I += Sz) {
8355 if (Cluster != FirstCluster)
8361void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE,
ArrayRef<int> Mask)
const {
8364 const unsigned Sz =
TE.Scalars.size();
8366 if (!
TE.isGather() ||
8373 addMask(NewMask,
TE.ReuseShuffleIndices);
8375 TE.ReorderIndices.clear();
8382 for (
auto *It =
TE.ReuseShuffleIndices.begin(),
8383 *End =
TE.ReuseShuffleIndices.end();
8384 It != End; std::advance(It, Sz))
8385 std::iota(It, std::next(It, Sz), 0);
8391 "Expected same size of orders");
8392 size_t Sz = Order.
size();
8395 if (Order[Idx] != Sz)
8396 UsedIndices.
set(Order[Idx]);
8398 if (SecondaryOrder.
empty()) {
8400 if (Order[Idx] == Sz && !UsedIndices.
test(Idx))
8404 if (SecondaryOrder[Idx] != Sz && Order[Idx] == Sz &&
8405 !UsedIndices.
test(SecondaryOrder[Idx]))
8406 Order[Idx] = SecondaryOrder[Idx];
8414 constexpr unsigned TinyVF = 2;
8415 constexpr unsigned TinyTree = 10;
8416 constexpr unsigned PhiOpsLimit = 12;
8417 constexpr unsigned GatherLoadsLimit = 2;
8418 if (VectorizableTree.size() <= TinyTree)
8420 if (VectorizableTree.front()->hasState() &&
8421 !VectorizableTree.front()->isGather() &&
8422 (VectorizableTree.front()->getOpcode() == Instruction::Store ||
8423 VectorizableTree.front()->getOpcode() == Instruction::PHI ||
8424 (VectorizableTree.front()->getVectorFactor() <= TinyVF &&
8425 (VectorizableTree.front()->getOpcode() == Instruction::PtrToInt ||
8426 VectorizableTree.front()->getOpcode() == Instruction::ICmp))) &&
8427 VectorizableTree.front()->ReorderIndices.empty()) {
8431 if (VectorizableTree.front()->hasState() &&
8432 VectorizableTree.front()->getOpcode() == Instruction::PHI &&
8433 VectorizableTree.front()->Scalars.size() == TinyVF &&
8434 VectorizableTree.front()->getNumOperands() > PhiOpsLimit)
8437 if (VectorizableTree.front()->hasState() &&
8438 VectorizableTree.front()->getOpcode() == Instruction::Store &&
8439 VectorizableTree.front()->ReorderIndices.empty()) {
8440 const unsigned ReorderedSplitsCnt =
8441 count_if(VectorizableTree, [&](
const std::unique_ptr<TreeEntry> &TE) {
8442 return TE->State == TreeEntry::SplitVectorize &&
8443 !TE->ReorderIndices.empty() && TE->UserTreeIndex.UserTE &&
8444 TE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
8447 if (ReorderedSplitsCnt <= 1 &&
8449 VectorizableTree, [&](
const std::unique_ptr<TreeEntry> &TE) {
8450 return ((!TE->isGather() &&
8451 (TE->ReorderIndices.empty() ||
8452 (TE->UserTreeIndex.UserTE &&
8453 TE->UserTreeIndex.UserTE->State ==
8454 TreeEntry::Vectorize &&
8455 !TE->UserTreeIndex.UserTE->ReuseShuffleIndices
8457 (TE->isGather() && TE->ReorderIndices.empty() &&
8458 (!TE->hasState() || TE->isAltShuffle() ||
8459 TE->getOpcode() == Instruction::Load ||
8460 TE->getOpcode() == Instruction::ZExt ||
8461 TE->getOpcode() == Instruction::SExt))) &&
8462 (VectorizableTree.front()->getVectorFactor() > TinyVF ||
8463 !TE->isGather() ||
none_of(TE->Scalars, [&](
Value *V) {
8464 return !isConstant(V) && isVectorized(V);
8466 })) >= VectorizableTree.size() - ReorderedSplitsCnt)
8469 bool HasPhis =
false;
8470 bool HasLoad =
true;
8471 unsigned GatherLoads = 0;
8472 for (
const std::unique_ptr<TreeEntry> &TE :
8473 ArrayRef(VectorizableTree).drop_front()) {
8474 if (TE->State == TreeEntry::SplitVectorize)
8476 if (!TE->hasState()) {
8480 if (VectorizableTree.front()->Scalars.size() == TinyVF &&
8485 if (TE->getOpcode() == Instruction::Load && TE->ReorderIndices.empty()) {
8486 if (!TE->isGather()) {
8493 if (GatherLoads >= GatherLoadsLimit)
8496 if (TE->getOpcode() == Instruction::GetElementPtr ||
8499 if (TE->getOpcode() != Instruction::PHI &&
8500 (!TE->hasCopyableElements() ||
8502 TE->Scalars.size() / 2))
8504 if (VectorizableTree.front()->Scalars.size() == TinyVF &&
8505 TE->getNumOperands() > PhiOpsLimit)
8514void BoUpSLP::TreeEntry::reorderSplitNode(
unsigned Idx,
ArrayRef<int> Mask,
8516 assert(State == TreeEntry::SplitVectorize &&
"Expected split user node.");
8519 std::iota(NewMask.
begin(), NewMask.
end(), 0);
8520 std::iota(NewMaskOrder.begin(), NewMaskOrder.end(), 0);
8523 copy(MaskOrder, NewMaskOrder.begin());
8525 assert(Idx == 1 &&
"Expected either 0 or 1 index.");
8526 unsigned Offset = CombinedEntriesWithIndices.
back().second;
8535 ReorderIndices.clear();
8554 ExternalUserReorderMap;
8558 for_each(VectorizableTree, [&, &TTIRef = *TTI](
8559 const std::unique_ptr<TreeEntry> &TE) {
8562 findExternalStoreUsersReorderIndices(TE.get());
8563 if (!ExternalUserReorderIndices.
empty()) {
8564 VFToOrderedEntries[TE->getVectorFactor()].
insert(TE.get());
8566 std::move(ExternalUserReorderIndices));
8572 if (TE->hasState() && TE->isAltShuffle() &&
8573 TE->State != TreeEntry::SplitVectorize) {
8574 Type *ScalarTy = TE->Scalars[0]->getType();
8576 unsigned Opcode0 = TE->getOpcode();
8577 unsigned Opcode1 = TE->getAltOpcode();
8581 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
8582 VFToOrderedEntries[TE->getVectorFactor()].
insert(TE.get());
8588 bool IgnoreReorder =
8589 !UserIgnoreList && VectorizableTree.front()->hasState() &&
8590 (VectorizableTree.front()->
getOpcode() == Instruction::InsertElement ||
8591 VectorizableTree.front()->getOpcode() == Instruction::Store);
8592 if (std::optional<OrdersType> CurrentOrder =
8602 const TreeEntry *UserTE = TE.get();
8604 if (!UserTE->UserTreeIndex)
8606 if (UserTE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
8607 UserTE->UserTreeIndex.UserTE->isAltShuffle() &&
8608 UserTE->UserTreeIndex.UserTE->Idx != 0)
8610 UserTE = UserTE->UserTreeIndex.UserTE;
8613 VFToOrderedEntries[TE->getVectorFactor()].
insert(TE.get());
8614 if (!(TE->State == TreeEntry::Vectorize ||
8615 TE->State == TreeEntry::StridedVectorize ||
8616 TE->State == TreeEntry::SplitVectorize ||
8617 TE->State == TreeEntry::CompressVectorize) ||
8618 !TE->ReuseShuffleIndices.empty())
8619 GathersToOrders.
try_emplace(TE.get(), *CurrentOrder);
8620 if (TE->State == TreeEntry::Vectorize &&
8621 TE->getOpcode() == Instruction::PHI)
8622 PhisToOrders.
try_emplace(TE.get(), *CurrentOrder);
8627 for (
unsigned VF = VectorizableTree.front()->getVectorFactor();
8628 !VFToOrderedEntries.
empty() && VF > 1; VF -= 2 - (VF & 1U)) {
8629 auto It = VFToOrderedEntries.
find(VF);
8630 if (It == VFToOrderedEntries.
end())
8644 for (
const TreeEntry *OpTE : OrderedEntries) {
8647 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.
count(OpTE) &&
8648 OpTE->State != TreeEntry::SplitVectorize)
8651 const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
8653 if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty()) {
8654 auto It = GathersToOrders.find(OpTE);
8655 if (It != GathersToOrders.end())
8658 if (OpTE->hasState() && OpTE->isAltShuffle()) {
8659 auto It = AltShufflesToOrders.find(OpTE);
8660 if (It != AltShufflesToOrders.end())
8663 if (OpTE->State == TreeEntry::Vectorize &&
8664 OpTE->getOpcode() == Instruction::PHI) {
8665 auto It = PhisToOrders.
find(OpTE);
8666 if (It != PhisToOrders.
end())
8669 return OpTE->ReorderIndices;
8672 auto It = ExternalUserReorderMap.
find(OpTE);
8673 if (It != ExternalUserReorderMap.
end()) {
8674 const auto &ExternalUserReorderIndices = It->second;
8678 if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
8679 OrdersUses.try_emplace(
OrdersType(), 0).first->second +=
8680 ExternalUserReorderIndices.size();
8682 for (
const OrdersType &ExtOrder : ExternalUserReorderIndices)
8683 ++OrdersUses.try_emplace(ExtOrder, 0).first->second;
8690 if (OpTE->State == TreeEntry::Vectorize &&
8691 OpTE->getOpcode() == Instruction::Store && !Order.
empty()) {
8692 assert(!OpTE->isAltShuffle() &&
8693 "Alternate instructions are only supported by BinaryOperator "
8697 unsigned E = Order.
size();
8700 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
8703 ++OrdersUses.try_emplace(CurrentOrder, 0).first->second;
8705 ++OrdersUses.try_emplace(Order, 0).first->second;
8708 if (OrdersUses.empty())
8711 unsigned IdentityCnt = 0;
8712 unsigned FilledIdentityCnt = 0;
8714 for (
auto &Pair : OrdersUses) {
8716 if (!Pair.first.empty())
8717 FilledIdentityCnt += Pair.second;
8718 IdentityCnt += Pair.second;
8723 unsigned Cnt = IdentityCnt;
8724 for (
auto &Pair : OrdersUses) {
8728 if (Cnt < Pair.second ||
8729 (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt &&
8730 Cnt == Pair.second && !BestOrder.
empty() &&
8733 BestOrder = Pair.first;
8746 unsigned E = BestOrder.
size();
8748 return I < E ? static_cast<int>(I) : PoisonMaskElem;
8751 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
8753 if (TE->Scalars.size() != VF) {
8754 if (TE->ReuseShuffleIndices.size() == VF) {
8755 assert(TE->State != TreeEntry::SplitVectorize &&
8756 "Split vectorized not expected.");
8761 (!TE->UserTreeIndex ||
8762 TE->UserTreeIndex.UserTE->Scalars.size() == VF ||
8763 TE->UserTreeIndex.UserTE->Scalars.size() == TE->Scalars.size() ||
8764 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize) &&
8765 "All users must be of VF size.");
8772 if (TE->UserTreeIndex && TE->UserTreeIndex.UserTE->hasState() &&
8778 reorderNodeWithReuses(*TE, Mask);
8780 if (TE->UserTreeIndex &&
8781 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize)
8782 TE->UserTreeIndex.UserTE->reorderSplitNode(
8783 TE->UserTreeIndex.EdgeIdx, Mask, MaskOrder);
8787 if ((TE->State == TreeEntry::SplitVectorize &&
8788 TE->ReuseShuffleIndices.empty()) ||
8789 ((TE->State == TreeEntry::Vectorize ||
8790 TE->State == TreeEntry::StridedVectorize ||
8791 TE->State == TreeEntry::CompressVectorize) &&
8796 (!TE->isAltShuffle() || (TE->State == TreeEntry::SplitVectorize &&
8797 TE->ReuseShuffleIndices.empty())) &&
8798 "Alternate instructions are only supported by BinaryOperator "
8804 TE->reorderOperands(Mask);
8807 TE->reorderOperands(Mask);
8808 assert(TE->ReorderIndices.empty() &&
8809 "Expected empty reorder sequence.");
8812 if (!TE->ReuseShuffleIndices.empty()) {
8819 addMask(NewReuses, TE->ReuseShuffleIndices);
8820 TE->ReuseShuffleIndices.swap(NewReuses);
8821 }
else if (TE->UserTreeIndex &&
8822 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize)
8824 TE->UserTreeIndex.UserTE->reorderSplitNode(TE->UserTreeIndex.EdgeIdx,
8830void BoUpSLP::buildReorderableOperands(
8831 TreeEntry *UserTE,
SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
8835 if (
any_of(Edges, [
I](
const std::pair<unsigned, TreeEntry *> &OpData) {
8836 return OpData.first ==
I &&
8837 (OpData.second->State == TreeEntry::Vectorize ||
8838 OpData.second->State == TreeEntry::StridedVectorize ||
8839 OpData.second->State == TreeEntry::CompressVectorize ||
8840 OpData.second->State == TreeEntry::SplitVectorize);
8844 if (UserTE->hasState()) {
8845 if (UserTE->getOpcode() == Instruction::ExtractElement ||
8846 UserTE->getOpcode() == Instruction::ExtractValue)
8848 if (UserTE->getOpcode() == Instruction::InsertElement &&
I == 0)
8850 if (UserTE->getOpcode() == Instruction::Store &&
8851 UserTE->State == TreeEntry::Vectorize &&
I == 1)
8853 if (UserTE->getOpcode() == Instruction::Load &&
8854 (UserTE->State == TreeEntry::Vectorize ||
8855 UserTE->State == TreeEntry::StridedVectorize ||
8856 UserTE->State == TreeEntry::CompressVectorize))
8859 TreeEntry *
TE = getOperandEntry(UserTE,
I);
8860 assert(TE &&
"Expected operand entry.");
8861 if (!
TE->isGather()) {
8864 Edges.emplace_back(
I, TE);
8870 if (
TE->State == TreeEntry::ScatterVectorize &&
8871 TE->ReuseShuffleIndices.empty() &&
TE->ReorderIndices.empty())
8875 if (ReorderableGathers.
contains(TE))
8881 struct TreeEntryCompare {
8882 bool operator()(
const TreeEntry *LHS,
const TreeEntry *RHS)
const {
8883 if (LHS->UserTreeIndex && RHS->UserTreeIndex)
8884 return LHS->UserTreeIndex.UserTE->Idx < RHS->UserTreeIndex.UserTE->Idx;
8885 return LHS->Idx < RHS->Idx;
8894 for (
const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
8895 if (TE->State != TreeEntry::Vectorize &&
8896 TE->State != TreeEntry::StridedVectorize &&
8897 TE->State != TreeEntry::CompressVectorize &&
8898 TE->State != TreeEntry::SplitVectorize)
8899 NonVectorized.
insert(TE.get());
8900 if (std::optional<OrdersType> CurrentOrder =
8902 Queue.push(TE.get());
8903 if (!(TE->State == TreeEntry::Vectorize ||
8904 TE->State == TreeEntry::StridedVectorize ||
8905 TE->State == TreeEntry::CompressVectorize ||
8906 TE->State == TreeEntry::SplitVectorize) ||
8907 !TE->ReuseShuffleIndices.empty())
8908 GathersToOrders.
insert(TE.get());
8917 while (!Queue.empty()) {
8919 std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>>
Users;
8920 TreeEntry *TE = Queue.top();
8921 const TreeEntry *UserTE = TE->UserTreeIndex.UserTE;
8924 while (!Queue.empty()) {
8926 if (!UserTE || UserTE != TE->UserTreeIndex.UserTE)
8931 for (TreeEntry *TE : OrderedOps) {
8932 if (!(TE->State == TreeEntry::Vectorize ||
8933 TE->State == TreeEntry::StridedVectorize ||
8934 TE->State == TreeEntry::CompressVectorize ||
8935 TE->State == TreeEntry::SplitVectorize ||
8936 (TE->isGather() && GathersToOrders.
contains(TE))) ||
8937 !TE->UserTreeIndex || !TE->ReuseShuffleIndices.empty() ||
8938 !Visited.
insert(TE).second)
8942 Users.first = TE->UserTreeIndex.UserTE;
8943 Users.second.emplace_back(TE->UserTreeIndex.EdgeIdx, TE);
8947 if (
Data.first->State == TreeEntry::SplitVectorize) {
8949 Data.second.size() <= 2 &&
8950 "Expected not greater than 2 operands for split vectorize node.");
8952 [](
const auto &
Op) { return !Op.second->UserTreeIndex; }))
8955 assert(
Data.first->CombinedEntriesWithIndices.size() == 2 &&
8956 "Expected exactly 2 entries.");
8957 for (
const auto &
P :
Data.first->CombinedEntriesWithIndices) {
8958 TreeEntry &OpTE = *VectorizableTree[
P.first];
8960 if (Order.
empty() || !OpTE.ReuseShuffleIndices.empty()) {
8961 if (!OpTE.isGather() && OpTE.ReuseShuffleIndices.empty())
8963 const auto BestOrder =
8972 const unsigned E = Order.
size();
8975 return I < E ? static_cast<int>(I) : PoisonMaskElem;
8977 Data.first->reorderSplitNode(
P.second ? 1 : 0, Mask, MaskOrder);
8979 if (!OpTE.ReorderIndices.empty()) {
8980 OpTE.ReorderIndices.clear();
8981 }
else if (!OpTE.ReuseShuffleIndices.empty()) {
8984 assert(OpTE.isGather() &&
"Expected only gather/buildvector node.");
8988 if (
Data.first->ReuseShuffleIndices.empty() &&
8989 !
Data.first->ReorderIndices.empty()) {
8992 Queue.push(
Data.first);
8998 buildReorderableOperands(
Data.first,
Data.second, NonVectorized,
9010 for (
const auto &
Op :
Data.second) {
9011 TreeEntry *OpTE =
Op.second;
9012 if (!VisitedOps.
insert(OpTE).second)
9014 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.
count(OpTE))
9016 const auto Order = [&]() ->
const OrdersType {
9017 if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty())
9021 return OpTE->ReorderIndices;
9025 if (Order.
size() == 1)
9031 Value *Root = OpTE->hasState()
9034 auto GetSameNodesUsers = [&](
Value *Root) {
9036 for (
const TreeEntry *TE : ValueToGatherNodes.lookup(Root)) {
9037 if (TE != OpTE && TE->UserTreeIndex &&
9038 TE->getVectorFactor() == OpTE->getVectorFactor() &&
9039 TE->Scalars.size() == OpTE->Scalars.size() &&
9040 ((TE->ReorderIndices.empty() && OpTE->isSame(TE->Scalars)) ||
9041 (OpTE->ReorderIndices.empty() && TE->isSame(OpTE->Scalars))))
9042 Res.
insert(TE->UserTreeIndex.UserTE);
9044 for (
const TreeEntry *TE : getTreeEntries(Root)) {
9045 if (TE != OpTE && TE->UserTreeIndex &&
9046 TE->getVectorFactor() == OpTE->getVectorFactor() &&
9047 TE->Scalars.size() == OpTE->Scalars.size() &&
9048 ((TE->ReorderIndices.empty() && OpTE->isSame(TE->Scalars)) ||
9049 (OpTE->ReorderIndices.empty() && TE->isSame(OpTE->Scalars))))
9050 Res.
insert(TE->UserTreeIndex.UserTE);
9054 auto GetNumOperands = [](
const TreeEntry *TE) {
9055 if (TE->State == TreeEntry::SplitVectorize)
9056 return TE->getNumOperands();
9058 return CI->arg_size();
9059 return TE->getNumOperands();
9061 auto NodeShouldBeReorderedWithOperands = [&, TTI = TTI](
9062 const TreeEntry *TE) {
9070 const TreeEntry *
Op = getOperandEntry(TE, Idx);
9071 if (
Op->isGather() &&
Op->hasState()) {
9072 const TreeEntry *VecOp =
9073 getSameValuesTreeEntry(
Op->getMainOp(),
Op->Scalars);
9077 if (
Op->ReorderIndices.empty() &&
Op->ReuseShuffleIndices.empty())
9084 if (!RevisitedOps.
insert(UTE).second)
9086 return UTE ==
Data.first || !UTE->ReorderIndices.empty() ||
9087 !UTE->ReuseShuffleIndices.empty() ||
9088 (UTE->UserTreeIndex &&
9089 UTE->UserTreeIndex.UserTE ==
Data.first) ||
9090 (
Data.first->UserTreeIndex &&
9091 Data.first->UserTreeIndex.UserTE == UTE) ||
9092 (IgnoreReorder && UTE->UserTreeIndex &&
9093 UTE->UserTreeIndex.UserTE->Idx == 0) ||
9094 NodeShouldBeReorderedWithOperands(UTE);
9097 for (TreeEntry *UTE :
Users) {
9105 const TreeEntry *
Op = getOperandEntry(UTE, Idx);
9107 Queue.push(
const_cast<TreeEntry *
>(
Op));
9112 Data.second, [OpTE](
const std::pair<unsigned, TreeEntry *> &
P) {
9113 return P.second == OpTE;
9116 if (OpTE->State == TreeEntry::Vectorize &&
9117 OpTE->getOpcode() == Instruction::Store && !Order.
empty()) {
9118 assert(!OpTE->isAltShuffle() &&
9119 "Alternate instructions are only supported by BinaryOperator "
9123 unsigned E = Order.
size();
9126 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
9129 OrdersUses.try_emplace(CurrentOrder, 0).first->second +=
NumOps;
9131 OrdersUses.try_emplace(Order, 0).first->second +=
NumOps;
9133 auto Res = OrdersUses.try_emplace(
OrdersType(), 0);
9134 const auto AllowsReordering = [&](
const TreeEntry *TE) {
9135 if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
9136 (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
9137 (IgnoreReorder && TE->Idx == 0))
9139 if (TE->isGather()) {
9149 if (OpTE->UserTreeIndex) {
9150 TreeEntry *UserTE = OpTE->UserTreeIndex.UserTE;
9151 if (!VisitedUsers.
insert(UserTE).second)
9156 if (AllowsReordering(UserTE))
9164 if (
static_cast<unsigned>(
count_if(
9165 Ops, [UserTE, &AllowsReordering](
9166 const std::pair<unsigned, TreeEntry *> &
Op) {
9167 return AllowsReordering(
Op.second) &&
9168 Op.second->UserTreeIndex.UserTE == UserTE;
9169 })) <=
Ops.size() / 2)
9170 ++Res.first->second;
9173 if (OrdersUses.empty()) {
9178 unsigned IdentityCnt = 0;
9179 unsigned VF =
Data.second.front().second->getVectorFactor();
9181 for (
auto &Pair : OrdersUses) {
9183 IdentityCnt += Pair.second;
9188 unsigned Cnt = IdentityCnt;
9189 for (
auto &Pair : OrdersUses) {
9193 if (Cnt < Pair.second) {
9195 BestOrder = Pair.first;
9212 unsigned E = BestOrder.
size();
9214 return I < E ? static_cast<int>(I) : PoisonMaskElem;
9216 for (
const std::pair<unsigned, TreeEntry *> &
Op :
Data.second) {
9217 TreeEntry *TE =
Op.second;
9218 if (!VisitedOps.
insert(TE).second)
9220 if (TE->ReuseShuffleIndices.size() == BestOrder.
size()) {
9221 reorderNodeWithReuses(*TE, Mask);
9225 if (TE->State != TreeEntry::Vectorize &&
9226 TE->State != TreeEntry::StridedVectorize &&
9227 TE->State != TreeEntry::CompressVectorize &&
9228 TE->State != TreeEntry::SplitVectorize &&
9229 (TE->State != TreeEntry::ScatterVectorize ||
9230 TE->ReorderIndices.empty()))
9232 assert((BestOrder.
size() == TE->ReorderIndices.size() ||
9233 TE->ReorderIndices.empty()) &&
9234 "Non-matching sizes of user/operand entries.");
9236 if (IgnoreReorder && TE == VectorizableTree.front().get())
9237 IgnoreReorder =
false;
9240 for (TreeEntry *
Gather : GatherOps) {
9242 "Unexpected reordering of gathers.");
9243 if (!
Gather->ReuseShuffleIndices.empty()) {
9253 auto IsNotProfitableAltCodeNode = [](
const TreeEntry &TE) {
9254 return TE.isAltShuffle() &&
9255 (!TE.ReuseShuffleIndices.empty() || TE.getVectorFactor() == 2 ||
9256 TE.ReorderIndices.empty());
9258 if (
Data.first->State != TreeEntry::Vectorize ||
9260 Data.first->getMainOp()) ||
9261 IsNotProfitableAltCodeNode(*
Data.first))
9262 Data.first->reorderOperands(Mask);
9264 IsNotProfitableAltCodeNode(*
Data.first) ||
9265 Data.first->State == TreeEntry::StridedVectorize ||
9266 Data.first->State == TreeEntry::CompressVectorize) {
9270 if (
Data.first->ReuseShuffleIndices.empty() &&
9271 !
Data.first->ReorderIndices.empty() &&
9272 !IsNotProfitableAltCodeNode(*
Data.first)) {
9275 Queue.push(
Data.first);
9283 if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
9284 VectorizableTree.front()->ReuseShuffleIndices.empty())
9285 VectorizableTree.front()->ReorderIndices.
clear();
9288Instruction *BoUpSLP::getRootEntryInstruction(
const TreeEntry &Entry)
const {
9289 if (Entry.hasState() &&
9290 (Entry.getOpcode() == Instruction::Store ||
9291 Entry.getOpcode() == Instruction::Load) &&
9292 Entry.State == TreeEntry::StridedVectorize &&
9293 !Entry.ReorderIndices.empty() &&
isReverseOrder(Entry.ReorderIndices))
9300 const size_t NumVectScalars = ScalarToTreeEntries.size() + 1;
9303 for (
auto &TEPtr : VectorizableTree) {
9304 TreeEntry *Entry = TEPtr.get();
9307 if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize ||
9308 DeletedNodes.contains(Entry) ||
9309 TransformedToGatherNodes.contains(Entry))
9313 for (
int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
9314 Value *Scalar = Entry->Scalars[Lane];
9319 auto It = ScalarToExtUses.
find(Scalar);
9320 if (It != ScalarToExtUses.
end() && !ExternalUses[It->second].User)
9323 if (Scalar->hasNUsesOrMore(NumVectScalars)) {
9324 unsigned FoundLane = Entry->findLaneForValue(Scalar);
9325 LLVM_DEBUG(
dbgs() <<
"SLP: Need to extract from lane " << FoundLane
9326 <<
" from " << *Scalar <<
"for many users.\n");
9327 It = ScalarToExtUses.
try_emplace(Scalar, ExternalUses.size()).first;
9328 ExternalUses.emplace_back(Scalar,
nullptr, *Entry, FoundLane);
9329 ExternalUsesWithNonUsers.insert(Scalar);
9334 const auto ExtI = ExternallyUsedValues.
find(Scalar);
9335 if (ExtI != ExternallyUsedValues.
end()) {
9336 unsigned FoundLane = Entry->findLaneForValue(Scalar);
9337 LLVM_DEBUG(
dbgs() <<
"SLP: Need to extract: Extra arg from lane "
9338 << FoundLane <<
" from " << *Scalar <<
".\n");
9339 ScalarToExtUses.
try_emplace(Scalar, ExternalUses.size());
9340 ExternalUses.emplace_back(Scalar,
nullptr, *Entry, FoundLane);
9351 if (UserIgnoreList && UserIgnoreList->contains(UserInst))
9356 any_of(UseEntries, [
this](
const TreeEntry *UseEntry) {
9357 return !DeletedNodes.contains(UseEntry) &&
9358 !TransformedToGatherNodes.contains(UseEntry);
9363 if (!((Scalar->getType()->getScalarType()->isPointerTy() &&
9366 all_of(UseEntries, [&](TreeEntry *UseEntry) {
9367 if (DeletedNodes.contains(UseEntry) ||
9368 TransformedToGatherNodes.contains(UseEntry))
9370 return UseEntry->State == TreeEntry::ScatterVectorize ||
9372 Scalar, getRootEntryInstruction(*UseEntry), TLI,
9375 LLVM_DEBUG(
dbgs() <<
"SLP: \tInternal user will be removed:" << *U
9378 [](TreeEntry *UseEntry) {
9379 return UseEntry->isGather();
9385 if (It != ScalarToExtUses.
end()) {
9386 ExternalUses[It->second].User =
nullptr;
9391 if (U && Scalar->hasNUsesOrMore(
UsesLimit))
9393 unsigned FoundLane = Entry->findLaneForValue(Scalar);
9395 <<
" from lane " << FoundLane <<
" from " << *Scalar
9397 It = ScalarToExtUses.
try_emplace(Scalar, ExternalUses.size()).first;
9398 ExternalUses.emplace_back(Scalar, U, *Entry, FoundLane);
9399 ExternalUsesWithNonUsers.insert(Scalar);
9408BoUpSLP::collectUserStores(
const BoUpSLP::TreeEntry *TE)
const {
9412 for (
unsigned Lane :
seq<unsigned>(0, TE->Scalars.size())) {
9413 Value *V = TE->Scalars[Lane];
9426 if (
SI ==
nullptr || !
SI->isSimple() ||
SI->getFunction() !=
F ||
9435 auto &StoresVec = PtrToStoresMap[{
SI->getParent(),
9436 SI->getValueOperand()->getType(), Ptr}];
9439 if (StoresVec.size() > Lane)
9441 if (!StoresVec.empty()) {
9443 SI->getValueOperand()->getType(),
SI->getPointerOperand(),
9444 SI->getValueOperand()->getType(),
9445 StoresVec.front()->getPointerOperand(), *
DL, *SE,
9451 StoresVec.push_back(SI);
9456 for (
auto &
P : PtrToStoresMap) {
9471 StoreInst *S0 = StoresVec[0];
9476 StoreInst *
SI = StoresVec[Idx];
9477 std::optional<int64_t> Diff =
9479 SI->getPointerOperand(), *DL, *SE,
9485 if (StoreOffsetVec.
size() != StoresVec.
size())
9487 sort(StoreOffsetVec, llvm::less_first());
9489 int64_t PrevDist = 0;
9490 for (
const auto &
P : StoreOffsetVec) {
9491 if (Idx > 0 &&
P.first != PrevDist + 1)
9499 ReorderIndices.assign(StoresVec.
size(), 0);
9500 bool IsIdentity =
true;
9502 ReorderIndices[
P.second] =
I;
9503 IsIdentity &=
P.second ==
I;
9509 ReorderIndices.clear();
9516 for (
unsigned Idx : Order)
9517 dbgs() << Idx <<
", ";
9523BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE)
const {
9524 unsigned NumLanes =
TE->Scalars.size();
9537 if (StoresVec.
size() != NumLanes)
9542 if (!canFormVector(StoresVec, ReorderIndices))
9547 ExternalReorderIndices.
push_back(ReorderIndices);
9549 return ExternalReorderIndices;
9555 assert(TreeEntryToStridedPtrInfoMap.empty() &&
9556 "TreeEntryToStridedPtrInfoMap is not cleared");
9557 UserIgnoreList = &UserIgnoreLst;
9560 buildTreeRec(Roots, 0,
EdgeInfo());
9565 assert(TreeEntryToStridedPtrInfoMap.empty() &&
9566 "TreeEntryToStridedPtrInfoMap is not cleared");
9569 buildTreeRec(Roots, 0,
EdgeInfo());
9578 bool AddNew =
true) {
9586 for (
Value *V : VL) {
9590 if (R.isDeleted(LI) || R.isVectorized(LI) || !LI->isSimple())
9592 bool IsFound =
false;
9593 for (
auto [Map,
Data] :
zip(ClusteredDistToLoad, ClusteredLoads)) {
9594 assert(LI->getParent() ==
Data.front().first->getParent() &&
9595 LI->getType() ==
Data.front().first->getType() &&
9599 "Expected loads with the same type, same parent and same "
9600 "underlying pointer.");
9602 LI->getType(), LI->getPointerOperand(),
Data.front().first->getType(),
9603 Data.front().first->getPointerOperand(),
DL, SE,
9607 auto It = Map.find(*Dist);
9608 if (It != Map.end() && It->second != LI)
9610 if (It == Map.end()) {
9611 Data.emplace_back(LI, *Dist);
9612 Map.try_emplace(*Dist, LI);
9622 auto FindMatchingLoads =
9627 int64_t &
Offset,
unsigned &Start) {
9629 return GatheredLoads.
end();
9638 std::optional<int64_t> Dist =
9640 Data.front().first->getType(),
9641 Data.front().first->getPointerOperand(),
DL, SE,
9647 for (std::pair<LoadInst *, int64_t>
P :
Data) {
9653 unsigned NumUniques = 0;
9654 for (
auto [Cnt, Pair] :
enumerate(Loads)) {
9655 bool Used = DataLoads.
contains(Pair.first);
9656 if (!Used && !DataDists.
contains(*Dist + Pair.second)) {
9663 if (NumUniques > 0 &&
9664 (Loads.
size() == NumUniques ||
9665 (Loads.
size() - NumUniques >= 2 &&
9666 Loads.
size() - NumUniques >= Loads.
size() / 2 &&
9672 return std::next(GatheredLoads.
begin(), Idx);
9676 return GatheredLoads.
end();
9678 for (
ArrayRef<std::pair<LoadInst *, int64_t>>
Data : ClusteredLoads) {
9682 auto *It = FindMatchingLoads(
Data, GatheredLoads, LocalToAdd,
Repeated,
9684 while (It != GatheredLoads.
end()) {
9685 assert(!LocalToAdd.
empty() &&
"Expected some elements to add.");
9686 for (
unsigned Idx : LocalToAdd)
9693 return !ToAdd.contains(Idx) && !Repeated.contains(Idx);
9700 Loads.push_back(
Data[Idx]);
9706 GatheredLoads, [&](
ArrayRef<std::pair<LoadInst *, int64_t>> PD) {
9707 return PD.front().first->getParent() == LI->
getParent() &&
9708 PD.front().first->getType() == LI->
getType();
9710 while (It != GatheredLoads.
end()) {
9713 std::next(It), GatheredLoads.
end(),
9714 [&](
ArrayRef<std::pair<LoadInst *, int64_t>> PD) {
9715 return PD.front().first->getParent() == LI->getParent() &&
9716 PD.front().first->getType() == LI->getType();
9720 GatheredLoads.emplace_back().append(
Data.begin(),
Data.end());
9721 AddNewLoads(GatheredLoads.emplace_back());
9726void BoUpSLP::tryToVectorizeGatheredLoads(
9727 const SmallMapVector<
9728 std::tuple<BasicBlock *, Value *, Type *>,
9731 GatheredLoadsEntriesFirst = VectorizableTree.
size();
9734 LoadEntriesToVectorize.size());
9735 for (
auto [Idx, Set] :
zip(LoadEntriesToVectorize, LoadSetsToVectorize))
9736 Set.insert_range(VectorizableTree[Idx]->Scalars);
9739 auto LoadSorter = [](
const std::pair<LoadInst *, int64_t> &L1,
9740 const std::pair<LoadInst *, int64_t> &L2) {
9741 return L1.second > L2.second;
9748 auto *Ty =
getWidenedType(Loads.front()->getType(), Loads.size());
9749 return TTI->isLegalMaskedGather(Ty, Alignment) &&
9750 !TTI->forceScalarizeMaskedGather(Ty, Alignment);
9755 SmallVectorImpl<LoadInst *> &NonVectorized,
9756 bool Final,
unsigned MaxVF) {
9758 unsigned StartIdx = 0;
9759 SmallVector<int> CandidateVFs;
9763 *TTI, Loads.
front()->getType(), MaxVF);
9765 *TTI, Loads.
front()->getType(), NumElts - 1)) {
9771 if (Final && CandidateVFs.
empty())
9774 unsigned BestVF = Final ? CandidateVFs.
back() : 0;
9775 for (
unsigned NumElts : CandidateVFs) {
9776 if (Final && NumElts > BestVF)
9778 SmallVector<unsigned> MaskedGatherVectorized;
9779 for (
unsigned Cnt = StartIdx,
E = Loads.
size(); Cnt <
E;
9783 if (VectorizedLoads.count(Slice.
front()) ||
9784 VectorizedLoads.count(Slice.
back()) ||
9790 bool AllowToVectorize =
false;
9793 bool IsLegalBroadcastLoad = TTI->isLegalBroadcastLoad(
9796 for (LoadInst *LI : Slice) {
9798 if (LI->hasOneUse())
9804 if (
static_cast<unsigned int>(std::distance(
9805 LI->user_begin(), LI->user_end())) != LI->getNumUses())
9807 if (!IsLegalBroadcastLoad)
9811 for (User *U : LI->users()) {
9814 for (
const TreeEntry *UTE : getTreeEntries(U)) {
9815 for (
int I :
seq<int>(UTE->getNumOperands())) {
9817 return V == LI || isa<PoisonValue>(V);
9827 AllowToVectorize = CheckIfAllowed(Slice);
9831 any_of(ValueToGatherNodes.at(Slice.front()),
9832 [=](
const TreeEntry *TE) {
9833 return TE->Scalars.size() == 2 &&
9834 ((TE->Scalars.front() == Slice.front() &&
9835 TE->Scalars.back() == Slice.back()) ||
9836 (TE->Scalars.front() == Slice.back() &&
9837 TE->Scalars.back() == Slice.front()));
9842 if (AllowToVectorize) {
9847 reinterpret_cast<Value *
const *
>(Slice.begin()), Slice.size());
9848 StridedPtrInfo SPtrInfo;
9850 PointerOps, SPtrInfo, &BestVF);
9852 (BestVF > 1 &&
static_cast<unsigned>(NumElts) == 2 * BestVF)) {
9854 if (MaskedGatherVectorized.
empty() ||
9855 Cnt >= MaskedGatherVectorized.
back() + NumElts)
9860 Results.emplace_back(Values, LS);
9861 VectorizedLoads.insert_range(Slice);
9864 if (Cnt == StartIdx)
9865 StartIdx += NumElts;
9868 if (StartIdx >= Loads.
size())
9872 if (!MaskedGatherVectorized.
empty() &&
9873 Cnt < MaskedGatherVectorized.
back() + NumElts)
9879 if (!AllowToVectorize || BestVF == 0)
9883 for (
unsigned Cnt : MaskedGatherVectorized) {
9885 Cnt, std::min<unsigned>(NumElts, Loads.
size() - Cnt));
9889 VectorizedLoads.insert_range(Slice);
9891 if (Cnt == StartIdx)
9892 StartIdx += NumElts;
9895 for (LoadInst *LI : Loads) {
9896 if (!VectorizedLoads.contains(LI))
9897 NonVectorized.push_back(LI);
9901 auto ProcessGatheredLoads =
9904 bool Final =
false) {
9906 for (
ArrayRef<std::pair<LoadInst *, int64_t>> LoadsDists :
9908 if (LoadsDists.size() <= 1) {
9909 NonVectorized.
push_back(LoadsDists.back().first);
9917 unsigned MaxConsecutiveDistance = 0;
9918 unsigned CurrentConsecutiveDist = 1;
9919 int64_t LastDist = LocalLoadsDists.front().second;
9920 bool AllowMaskedGather = IsMaskedGatherSupported(OriginalLoads);
9921 for (
const std::pair<LoadInst *, int64_t> &L : LocalLoadsDists) {
9924 assert(LastDist >=
L.second &&
9925 "Expected first distance always not less than second");
9926 if (
static_cast<uint64_t
>(LastDist -
L.second) ==
9927 CurrentConsecutiveDist) {
9928 ++CurrentConsecutiveDist;
9929 MaxConsecutiveDistance =
9930 std::max(MaxConsecutiveDistance, CurrentConsecutiveDist);
9934 if (!AllowMaskedGather && CurrentConsecutiveDist == 1 &&
9937 CurrentConsecutiveDist = 1;
9938 LastDist =
L.second;
9941 if (Loads.
size() <= 1)
9943 if (AllowMaskedGather)
9944 MaxConsecutiveDistance = Loads.
size();
9945 else if (MaxConsecutiveDistance < 2)
9950 GetVectorizedRanges(Loads, VectorizedLoads, SortedNonVectorized,
9951 Final, MaxConsecutiveDistance);
9953 OriginalLoads.size() == Loads.
size() &&
9954 MaxConsecutiveDistance == Loads.
size() &&
9959 VectorizedLoads.
clear();
9963 GetVectorizedRanges(OriginalLoads, VectorizedLoads,
9964 UnsortedNonVectorized, Final,
9965 OriginalLoads.size());
9966 if (SortedNonVectorized.
size() >= UnsortedNonVectorized.
size()) {
9967 SortedNonVectorized.
swap(UnsortedNonVectorized);
9968 Results.swap(UnsortedResults);
9973 << Slice.
size() <<
")\n");
9975 for (
Value *L : Slice)
9983 unsigned MaxVF = Slice.size();
9984 unsigned UserMaxVF = 0;
9985 unsigned InterleaveFactor = 0;
9990 std::optional<unsigned> InterleavedLoadsDistance = 0;
9992 std::optional<unsigned> CommonVF = 0;
9993 DenseMap<const TreeEntry *, unsigned> EntryToPosition;
9994 SmallPtrSet<const TreeEntry *, 8> DeinterleavedNodes;
9995 for (
auto [Idx, V] :
enumerate(Slice)) {
9996 for (
const TreeEntry *
E : ValueToGatherNodes.at(V)) {
9997 UserMaxVF = std::max<unsigned>(UserMaxVF,
E->Scalars.size());
10000 UserMaxVF = std::max<unsigned>(UserMaxVF, Idx - Pos + 1);
10002 if (*CommonVF == 0) {
10003 CommonVF =
E->Scalars.size();
10006 if (*CommonVF !=
E->Scalars.size())
10010 if (Pos != Idx && InterleavedLoadsDistance) {
10013 if (isa<Constant>(V))
10015 if (isVectorized(V))
10017 const auto &Nodes = ValueToGatherNodes.at(V);
10018 return (Nodes.size() != 1 || !Nodes.contains(E)) &&
10019 !is_contained(Slice, V);
10021 InterleavedLoadsDistance.reset();
10024 DeinterleavedNodes.
insert(
E);
10025 if (*InterleavedLoadsDistance == 0) {
10026 InterleavedLoadsDistance = Idx - Pos;
10029 if ((Idx - Pos) % *InterleavedLoadsDistance != 0 ||
10030 (Idx - Pos) / *InterleavedLoadsDistance < Order)
10031 InterleavedLoadsDistance.reset();
10032 Order = (Idx - Pos) / InterleavedLoadsDistance.value_or(1);
10036 DeinterleavedNodes.
clear();
10038 if (InterleavedLoadsDistance.value_or(0) > 1 &&
10039 CommonVF.value_or(0) != 0) {
10040 InterleaveFactor =
bit_ceil(*InterleavedLoadsDistance);
10041 unsigned VF = *CommonVF;
10044 StridedPtrInfo SPtrInfo;
10046 if (InterleaveFactor <= Slice.size() &&
10047 TTI.isLegalInterleavedAccessType(
10055 UserMaxVF = InterleaveFactor * VF;
10057 InterleaveFactor = 0;
10062 unsigned ConsecutiveNodesSize = 0;
10063 if (!LoadEntriesToVectorize.empty() && InterleaveFactor == 0 &&
10064 any_of(
zip(LoadEntriesToVectorize, LoadSetsToVectorize),
10065 [&, Slice = Slice](
const auto &
P) {
10067 return std::get<1>(
P).contains(V);
10069 if (It == Slice.end())
10071 const TreeEntry &
TE =
10072 *VectorizableTree[std::get<0>(
P)];
10076 StridedPtrInfo SPtrInfo;
10078 VL, VL.
front(), Order, PointerOps, SPtrInfo);
10082 ConsecutiveNodesSize += VL.
size();
10083 size_t Start = std::distance(Slice.begin(), It);
10084 size_t Sz = Slice.size() -
Start;
10085 return Sz < VL.
size() ||
10086 Slice.slice(Start, VL.
size()) != VL;
10091 if (InterleaveFactor == 0 &&
10093 [&, Slice = Slice](
unsigned Idx) {
10095 SmallVector<Value *> PointerOps;
10096 StridedPtrInfo SPtrInfo;
10097 return canVectorizeLoads(
10098 Slice.slice(Idx * UserMaxVF, UserMaxVF),
10099 Slice[Idx * UserMaxVF], Order, PointerOps,
10100 SPtrInfo) == LoadsState::ScatterVectorize;
10103 if (Slice.size() != ConsecutiveNodesSize)
10104 MaxVF = std::min<unsigned>(MaxVF, UserMaxVF);
10106 for (
unsigned VF = MaxVF; VF >= 2; VF /= 2) {
10107 bool IsVectorized =
true;
10108 for (
unsigned I = 0,
E = Slice.size();
I <
E;
I += VF) {
10110 Slice.slice(
I, std::min(VF,
E -
I));
10115 if (
any_of(
zip(LoadEntriesToVectorize, LoadSetsToVectorize),
10116 [&](
const auto &
P) {
10117 return !SubSlice.
equals(
10118 VectorizableTree[std::get<0>(
P)]
10123 unsigned Sz = VectorizableTree.size();
10124 buildTreeRec(SubSlice, 0,
EdgeInfo(), InterleaveFactor);
10125 if (Sz == VectorizableTree.size()) {
10126 IsVectorized =
false;
10129 if (InterleaveFactor > 0) {
10130 VF = 2 * (MaxVF / InterleaveFactor);
10131 InterleaveFactor = 0;
10140 NonVectorized.
append(SortedNonVectorized);
10142 return NonVectorized;
10144 for (
const auto &GLs : GatheredLoads) {
10145 const auto &
Ref = GLs.second;
10147 if (!
Ref.empty() && !NonVectorized.
empty() &&
10149 Ref.begin(),
Ref.end(), 0u,
10150 [](
unsigned S,
ArrayRef<std::pair<LoadInst *, int64_t>> LoadsDists)
10151 ->
unsigned { return S + LoadsDists.size(); }) !=
10152 NonVectorized.
size() &&
10153 IsMaskedGatherSupported(NonVectorized)) {
10155 FinalGatheredLoads;
10156 for (LoadInst *LI : NonVectorized) {
10160 FinalGatheredLoads,
10164 (void)ProcessGatheredLoads(FinalGatheredLoads,
true);
10168 for (
unsigned Idx : LoadEntriesToVectorize) {
10169 const TreeEntry &
E = *VectorizableTree[Idx];
10172 if (!
E.ReorderIndices.empty()) {
10175 SmallVector<int> ReorderMask;
10179 buildTreeRec(GatheredScalars, 0,
EdgeInfo());
10183 if (
static_cast<unsigned>(*GatheredLoadsEntriesFirst) ==
10184 VectorizableTree.size())
10185 GatheredLoadsEntriesFirst.reset();
10195 bool AllowAlternate) {
10201 if (LI->isSimple())
10212 SubKey =
hash_value(EI->getVectorOperand());
10219 if (AllowAlternate)
10230 std::pair<size_t, size_t> OpVals =
10238 if (CI->isCommutative())
10260 SubKey =
hash_value(Gep->getPointerOperand());
10272 return std::make_pair(
Key, SubKey);
10278 Instruction *AltOp,
const TargetLibraryInfo &TLI);
10284 const unsigned VF,
unsigned MinBW,
10307static std::pair<InstructionCost, InstructionCost>
10327 FMF = FPCI->getFastMathFlags();
10330 LibCost.isValid() ? LibCost : ScalarLimit);
10344 assert(L &&
"Expected valid loop");
10350 while (L && IsLoopInvariant(L, VL))
10351 L = L->getParentLoop();
10357 assert(L &&
"Expected valid loop");
10360 SmallVector<const Loop *> &Res =
10361 LoopToLoopNest.try_emplace(L).first->getSecond();
10364 SmallVector<const Loop *> LoopNest;
10367 L =
L->getParentLoop();
10373BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
10375 bool IsScatterVectorizeUserTE,
OrdersType &CurrentOrder,
10376 SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo) {
10378 "Expected instructions with same/alternate opcodes only.");
10380 unsigned ShuffleOrOp =
10381 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.
getOpcode();
10383 switch (ShuffleOrOp) {
10384 case Instruction::PHI: {
10387 return TreeEntry::NeedToGather;
10389 for (
Value *V : VL) {
10393 for (
Value *Incoming :
PHI->incoming_values()) {
10395 if (Term &&
Term->isTerminator()) {
10397 <<
"SLP: Need to swizzle PHINodes (terminator use).\n");
10398 return TreeEntry::NeedToGather;
10403 return TreeEntry::Vectorize;
10405 case Instruction::ExtractElement:
10412 return TreeEntry::NeedToGather;
10414 case Instruction::ExtractValue: {
10415 bool Reuse = canReuseExtract(VL, CurrentOrder);
10419 return TreeEntry::NeedToGather;
10420 if (Reuse || !CurrentOrder.empty())
10421 return TreeEntry::Vectorize;
10423 return TreeEntry::NeedToGather;
10425 case Instruction::InsertElement: {
10429 for (
Value *V : VL) {
10431 LLVM_DEBUG(
dbgs() <<
"SLP: Gather of insertelement/poison vector.\n");
10432 return TreeEntry::NeedToGather;
10436 "Non-constant or undef index?");
10440 return !SourceVectors.contains(V);
10443 LLVM_DEBUG(
dbgs() <<
"SLP: Gather of insertelement vectors with "
10444 "different source vectors.\n");
10445 return TreeEntry::NeedToGather;
10450 return SourceVectors.contains(V) && !
V->hasOneUse();
10453 LLVM_DEBUG(
dbgs() <<
"SLP: Gather of insertelement vectors with "
10454 "multiple uses.\n");
10455 return TreeEntry::NeedToGather;
10458 return TreeEntry::Vectorize;
10460 case Instruction::Load: {
10467 auto IsGatheredNode = [&]() {
10468 if (!GatheredLoadsEntriesFirst)
10473 return any_of(getTreeEntries(V), [&](
const TreeEntry *TE) {
10474 return TE->Idx >= *GatheredLoadsEntriesFirst;
10480 return TreeEntry::Vectorize;
10482 if (!IsGraphTransformMode && !VectorizableTree.empty()) {
10484 LoadEntriesToVectorize.insert(VectorizableTree.size());
10485 return TreeEntry::NeedToGather;
10487 return IsGatheredNode() ? TreeEntry::NeedToGather
10488 : TreeEntry::CompressVectorize;
10490 if (!IsGraphTransformMode && !VectorizableTree.empty()) {
10492 LoadEntriesToVectorize.insert(VectorizableTree.size());
10493 return TreeEntry::NeedToGather;
10495 return IsGatheredNode() ? TreeEntry::NeedToGather
10496 : TreeEntry::ScatterVectorize;
10498 if (!IsGraphTransformMode && VectorizableTree.size() > 1) {
10500 LoadEntriesToVectorize.insert(VectorizableTree.size());
10501 return TreeEntry::NeedToGather;
10503 return IsGatheredNode() ? TreeEntry::NeedToGather
10504 : TreeEntry::StridedVectorize;
10508 if (DL->getTypeSizeInBits(ScalarTy) !=
10509 DL->getTypeAllocSizeInBits(ScalarTy))
10510 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering loads of non-packed type.\n");
10513 return !LI || !LI->isSimple();
10517 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering non-consecutive loads.\n");
10520 return TreeEntry::NeedToGather;
10524 case Instruction::ZExt:
10525 case Instruction::SExt:
10526 case Instruction::FPToUI:
10527 case Instruction::FPToSI:
10528 case Instruction::FPExt:
10529 case Instruction::PtrToInt:
10530 case Instruction::IntToPtr:
10531 case Instruction::SIToFP:
10532 case Instruction::UIToFP:
10533 case Instruction::Trunc:
10534 case Instruction::FPTrunc:
10535 case Instruction::BitCast: {
10537 for (
Value *V : VL) {
10543 dbgs() <<
"SLP: Gathering casts with different src types.\n");
10544 return TreeEntry::NeedToGather;
10547 return TreeEntry::Vectorize;
10549 case Instruction::ICmp:
10550 case Instruction::FCmp: {
10555 for (
Value *V : VL) {
10559 if ((
Cmp->getPredicate() != P0 &&
Cmp->getPredicate() != SwapP0) ||
10560 Cmp->getOperand(0)->getType() != ComparedTy) {
10561 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering cmp with different predicate.\n");
10562 return TreeEntry::NeedToGather;
10565 return TreeEntry::Vectorize;
10567 case Instruction::Select:
10569 SmallPtrSet<Type *, 4> CondTypes;
10570 for (
Value *V : VL) {
10577 if (CondTypes.
size() > 1) {
10580 <<
"SLP: Gathering select with different condition types.\n");
10581 return TreeEntry::NeedToGather;
10585 case Instruction::FNeg:
10586 case Instruction::Add:
10587 case Instruction::FAdd:
10588 case Instruction::Sub:
10589 case Instruction::FSub:
10590 case Instruction::Mul:
10591 case Instruction::FMul:
10592 case Instruction::UDiv:
10593 case Instruction::SDiv:
10594 case Instruction::FDiv:
10595 case Instruction::URem:
10596 case Instruction::SRem:
10597 case Instruction::FRem:
10598 case Instruction::Shl:
10599 case Instruction::LShr:
10600 case Instruction::AShr:
10601 case Instruction::And:
10602 case Instruction::Or:
10603 case Instruction::Xor:
10604 case Instruction::Freeze:
10605 if (S.getMainOp()->getType()->isFloatingPointTy() &&
10606 TTI->isFPVectorizationPotentiallyUnsafe() &&
any_of(VL, [](
Value *V) {
10608 return I &&
I->isBinaryOp() && !
I->isFast();
10610 return TreeEntry::NeedToGather;
10611 return TreeEntry::Vectorize;
10612 case Instruction::GetElementPtr: {
10614 for (
Value *V : VL) {
10618 if (
I->getNumOperands() != 2) {
10619 LLVM_DEBUG(
dbgs() <<
"SLP: not-vectorizable GEP (nested indexes).\n");
10620 return TreeEntry::NeedToGather;
10627 for (
Value *V : VL) {
10631 Type *CurTy =
GEP->getSourceElementType();
10632 if (Ty0 != CurTy) {
10633 LLVM_DEBUG(
dbgs() <<
"SLP: not-vectorizable GEP (different types).\n");
10634 return TreeEntry::NeedToGather;
10640 for (
Value *V : VL) {
10644 auto *
Op =
I->getOperand(1);
10646 (
Op->getType() != Ty1 &&
10648 Op->getType()->getScalarSizeInBits() >
10649 DL->getIndexSizeInBits(
10650 V->getType()->getPointerAddressSpace())))) {
10652 dbgs() <<
"SLP: not-vectorizable GEP (non-constant indexes).\n");
10653 return TreeEntry::NeedToGather;
10657 return TreeEntry::Vectorize;
10659 case Instruction::Store: {
10661 llvm::Type *ScalarTy =
cast<StoreInst>(VL0)->getValueOperand()->getType();
10664 if (DL->getTypeSizeInBits(ScalarTy) !=
10665 DL->getTypeAllocSizeInBits(ScalarTy)) {
10666 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering stores of non-packed type.\n");
10667 return TreeEntry::NeedToGather;
10671 for (
Value *V : VL) {
10673 if (!
SI->isSimple()) {
10675 return TreeEntry::NeedToGather;
10684 if (CurrentOrder.empty()) {
10685 Ptr0 = PointerOps.
front();
10686 PtrN = PointerOps.
back();
10688 Ptr0 = PointerOps[CurrentOrder.front()];
10689 PtrN = PointerOps[CurrentOrder.back()];
10691 std::optional<int64_t> Dist =
10694 if (
static_cast<uint64_t
>(*Dist) == VL.size() - 1)
10695 return TreeEntry::Vectorize;
10699 return TreeEntry::NeedToGather;
10701 case Instruction::Call: {
10702 if (S.getMainOp()->getType()->isFloatingPointTy() &&
10703 TTI->isFPVectorizationPotentiallyUnsafe() &&
any_of(VL, [](
Value *V) {
10705 return I && !
I->isFast();
10707 return TreeEntry::NeedToGather;
10717 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
10721 return TreeEntry::NeedToGather;
10724 unsigned NumArgs = CI->
arg_size();
10725 SmallVector<Value *, 4> ScalarArgs(NumArgs,
nullptr);
10726 for (
unsigned J = 0; J != NumArgs; ++J)
10729 for (
Value *V : VL) {
10734 VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) ||
10736 LLVM_DEBUG(
dbgs() <<
"SLP: mismatched calls:" << *CI <<
"!=" << *V
10738 return TreeEntry::NeedToGather;
10742 for (
unsigned J = 0; J != NumArgs; ++J) {
10745 if (ScalarArgs[J] != A1J) {
10747 <<
"SLP: mismatched arguments in call:" << *CI
10748 <<
" argument " << ScalarArgs[J] <<
"!=" << A1J <<
"\n");
10749 return TreeEntry::NeedToGather;
10758 LLVM_DEBUG(
dbgs() <<
"SLP: mismatched bundle operands in calls:" << *CI
10759 <<
"!=" << *V <<
'\n');
10760 return TreeEntry::NeedToGather;
10765 auto *VecTy =
getWidenedType(S.getMainOp()->getType(), VL.size());
10767 if (!VecCallCosts.first.isValid() && !VecCallCosts.second.isValid())
10768 return TreeEntry::NeedToGather;
10770 return TreeEntry::Vectorize;
10772 case Instruction::ShuffleVector: {
10773 if (!S.isAltShuffle()) {
10776 return TreeEntry::Vectorize;
10779 LLVM_DEBUG(
dbgs() <<
"SLP: ShuffleVector are not vectorized.\n");
10780 return TreeEntry::NeedToGather;
10783 return TreeEntry::Vectorize;
10787 return TreeEntry::NeedToGather;
10796 PHINode *Main =
nullptr;
10801 PHIHandler() =
delete;
10803 : DT(DT), Main(Main), Phis(Phis),
10804 Operands(Main->getNumIncomingValues(),
10806 void buildOperands() {
10807 constexpr unsigned FastLimit = 4;
10816 for (
auto [Idx, V] :
enumerate(Phis)) {
10820 "Expected isa instruction or poison value.");
10821 Operands[
I][Idx] =
V;
10824 if (
P->getIncomingBlock(
I) == InBB)
10825 Operands[
I][Idx] =
P->getIncomingValue(
I);
10827 Operands[
I][Idx] =
P->getIncomingValueForBlock(InBB);
10832 SmallMapVector<BasicBlock *, SmallVector<unsigned>, 4>
10842 for (
auto [Idx, V] :
enumerate(Phis)) {
10845 Operands[
I][Idx] =
V;
10854 Operands[
I][Idx] =
P->getIncomingValue(
I);
10857 auto *It = Blocks.
find(InBB);
10858 if (It == Blocks.
end())
10860 Operands[It->second.front()][Idx] =
P->getIncomingValue(
I);
10863 for (
const auto &
P : Blocks) {
10864 ArrayRef<unsigned> IncomingValues =
P.second;
10865 if (IncomingValues.
size() <= 1)
10868 for (
unsigned I : IncomingValues) {
10870 [&](
const auto &
Data) {
10871 return !
Data.value() ||
10872 Data.value() == Operands[BasicI][
Data.index()];
10874 "Expected empty operands list.");
10875 Operands[
I] = Operands[BasicI];
10888static std::pair<Instruction *, Instruction *>
10892 for (
Value *V : VL) {
10902 if (MainOp->
getOpcode() ==
I->getOpcode()) {
10921 "Expected different main and alt instructions.");
10922 return std::make_pair(MainOp, AltOp);
10935 const InstructionsState &S,
10937 bool TryPad =
false) {
10941 for (
Value *V : VL) {
10957 size_t NumUniqueScalarValues = UniqueValues.
size();
10960 if (NumUniqueScalarValues == VL.
size() &&
10962 ReuseShuffleIndices.
clear();
10967 if ((UserTreeIdx.
UserTE &&
10968 UserTreeIdx.
UserTE->hasNonWholeRegisterOrNonPowerOf2Vec(
TTI)) ||
10970 LLVM_DEBUG(
dbgs() <<
"SLP: Reshuffling scalars not yet supported "
10971 "for nodes with padding.\n");
10972 ReuseShuffleIndices.
clear();
10977 if (NumUniqueScalarValues <= 1 || !IsFullVectors ||
10981 if (TryPad && UniquePositions.
size() > 1 && NumUniqueScalarValues > 1 &&
10982 S.getMainOp()->isSafeToRemove() &&
10983 (S.areInstructionsWithCopyableElements() ||
10987 TTI, UniqueValues.
front()->getType(), UniqueValues.
size());
10988 PWSz = std::min<unsigned>(PWSz, VL.
size());
10989 if (PWSz == VL.
size()) {
10993 ReuseShuffleIndices.
clear();
10997 UniqueValues.
end());
10998 PaddedUniqueValues.
append(
10999 PWSz - UniqueValues.
size(),
11003 if ((!S.areInstructionsWithCopyableElements() &&
11005 (S.areInstructionsWithCopyableElements() && S.isMulDivLikeOp() &&
11006 (S.getMainOp()->isIntDivRem() || S.getMainOp()->isFPDivRem() ||
11009 ReuseShuffleIndices.
clear();
11012 VL = std::move(PaddedUniqueValues);
11017 ReuseShuffleIndices.
clear();
11020 VL = std::move(UniqueValues);
11025 const InstructionsState &LocalState,
11029 constexpr unsigned SmallNodeSize = 4;
11030 if (VL.
size() <= SmallNodeSize || TTI->preferAlternateOpcodeVectorization() ||
11035 LLVM_DEBUG(
dbgs() <<
"SLP: \tChecking bundle: " << *LocalState.getMainOp()
11037 for (TreeEntry *E : getSplitTreeEntries(LocalState.getMainOp())) {
11038 if (E->isSame(VL)) {
11040 << *LocalState.getMainOp() <<
".\n");
11058 Op1Indices.
set(Idx);
11061 if ((LocalState.getAltOpcode() != LocalState.getOpcode() &&
11064 (LocalState.getAltOpcode() == LocalState.getOpcode() &&
11066 LocalState.getAltOp(), *TLI))) {
11068 Op1Indices.
set(Idx);
11075 unsigned Opcode0 = LocalState.getOpcode();
11076 unsigned Opcode1 = LocalState.getAltOpcode();
11082 if (UOp1.
size() <= 1 || UOp2.
size() <= 1 ||
11083 TTI->isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask) ||
11088 unsigned Op1Cnt = 0, Op2Cnt = Op1.
size();
11090 if (Op1Indices.
test(Idx)) {
11091 ReorderIndices[Op1Cnt] = Idx;
11094 ReorderIndices[Op2Cnt] = Idx;
11099 ReorderIndices.
clear();
11101 if (!ReorderIndices.
empty())
11103 unsigned NumParts = TTI->getNumberOfParts(VecTy);
11108 if (NumParts >= VL.
size())
11117 if (!LocalState.isCmpOp() && NumParts <= 1 &&
11118 (Mask.empty() || InsertCost >= NewShuffleCost))
11120 if ((LocalState.getMainOp()->isBinaryOp() &&
11121 LocalState.getAltOp()->isBinaryOp() &&
11122 (LocalState.isShiftOp() || LocalState.isBitwiseLogicOp() ||
11123 LocalState.isAddSubLikeOp() || LocalState.isMulDivLikeOp())) ||
11124 (LocalState.getMainOp()->isCast() && LocalState.getAltOp()->isCast()) ||
11125 (LocalState.getMainOp()->isUnaryOp() &&
11126 LocalState.getAltOp()->isUnaryOp())) {
11128 TTI->getArithmeticInstrCost(Opcode0, VecTy, Kind) +
11129 TTI->getArithmeticInstrCost(Opcode1, VecTy, Kind);
11134 OriginalMask[Idx] = Idx + (Op1Indices.
test(Idx) ? 0 : VL.
size());
11138 VecTy, OriginalMask, Kind);
11140 TTI->getArithmeticInstrCost(Opcode0, Op1VecTy, Kind) +
11141 TTI->getArithmeticInstrCost(Opcode1, Op2VecTy, Kind);
11143 NewVecOpsCost + InsertCost +
11144 (!VectorizableTree.empty() && VectorizableTree.front()->hasState() &&
11145 VectorizableTree.front()->getOpcode() == Instruction::Store
11149 if (NewCost >= OriginalCost)
11159class InstructionsCompatibilityAnalysis {
11164 unsigned MainOpcode = 0;
11169 static bool isSupportedOpcode(
const unsigned Opcode) {
11170 return Opcode == Instruction::Add || Opcode == Instruction::Sub ||
11171 Opcode == Instruction::LShr || Opcode == Instruction::Shl ||
11172 Opcode == Instruction::SDiv || Opcode == Instruction::UDiv ||
11173 Opcode == Instruction::And || Opcode == Instruction::Or ||
11174 Opcode == Instruction::Xor || Opcode == Instruction::FAdd ||
11175 Opcode == Instruction::FSub || Opcode == Instruction::FMul ||
11176 Opcode == Instruction::FDiv;
11186 auto IsSupportedInstruction = [&](
Instruction *
I,
bool AnyUndef) {
11187 if (AnyUndef && (
I->isIntDivRem() ||
I->isFPDivRem() ||
isa<CallInst>(
I)))
11189 return I && isSupportedOpcode(
I->getOpcode()) &&
11194 SmallDenseSet<Value *, 8> Operands;
11195 SmallMapVector<unsigned, SmallVector<Instruction *>, 4> Candidates;
11196 bool AnyUndef =
false;
11197 for (
Value *V : VL) {
11205 if (Candidates.
empty()) {
11206 Candidates.
try_emplace(
I->getOpcode()).first->second.push_back(
I);
11208 Operands.
insert(
I->op_begin(),
I->op_end());
11211 if (Parent ==
I->getParent()) {
11212 Candidates.
try_emplace(
I->getOpcode()).first->second.push_back(
I);
11213 Operands.
insert(
I->op_begin(),
I->op_end());
11216 auto *NodeA = DT.
getNode(Parent);
11217 auto *NodeB = DT.
getNode(
I->getParent());
11218 assert(NodeA &&
"Should only process reachable instructions");
11219 assert(NodeB &&
"Should only process reachable instructions");
11220 assert((NodeA == NodeB) ==
11221 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
11222 "Different nodes should have different DFS numbers");
11223 if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn()) {
11224 Candidates.
clear();
11225 Candidates.
try_emplace(
I->getOpcode()).first->second.push_back(
I);
11228 Operands.
insert(
I->op_begin(),
I->op_end());
11231 unsigned BestOpcodeNum = 0;
11233 bool UsedOutside =
false;
11234 for (
const auto &
P : Candidates) {
11236 if (UsedOutside && !PUsedOutside)
11238 if (!UsedOutside && PUsedOutside)
11240 if (
P.second.size() < BestOpcodeNum)
11243 if (!PUsedOutside &&
any_of(
P.second, [&](Instruction *
I) {
11244 return Operands.contains(I);
11247 UsedOutside = PUsedOutside;
11248 for (Instruction *
I :
P.second) {
11249 if (IsSupportedInstruction(
I, AnyUndef)) {
11251 BestOpcodeNum =
P.second.size();
11261 return I &&
I->getParent() == MainOp->
getParent() &&
11274 Value *selectBestIdempotentValue()
const {
11275 assert(isSupportedOpcode(MainOpcode) &&
"Unsupported opcode");
11286 if (!S.isCopyableElement(V))
11288 assert(isSupportedOpcode(MainOpcode) &&
"Unsupported opcode");
11289 return {
V, selectBestIdempotentValue()};
11295 SmallVectorImpl<BoUpSLP::ValueList> &Operands)
const {
11297 unsigned ShuffleOrOp =
11298 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.
getOpcode();
11301 switch (ShuffleOrOp) {
11302 case Instruction::PHI: {
11306 PHIHandler Handler(DT, PH, VL);
11307 Handler.buildOperands();
11308 Operands.
assign(PH->getNumOperands(), {});
11310 Operands[
I].
assign(Handler.getOperands(
I).begin(),
11311 Handler.getOperands(
I).end());
11314 case Instruction::ExtractValue:
11315 case Instruction::ExtractElement:
11320 case Instruction::InsertElement:
11328 case Instruction::Load:
11332 for (
auto [V,
Op] :
zip(VL, Operands.
back())) {
11336 Op = LI->getPointerOperand();
11339 case Instruction::ZExt:
11340 case Instruction::SExt:
11341 case Instruction::FPToUI:
11342 case Instruction::FPToSI:
11343 case Instruction::FPExt:
11344 case Instruction::PtrToInt:
11345 case Instruction::IntToPtr:
11346 case Instruction::SIToFP:
11347 case Instruction::UIToFP:
11348 case Instruction::Trunc:
11349 case Instruction::FPTrunc:
11350 case Instruction::BitCast:
11351 case Instruction::ICmp:
11352 case Instruction::FCmp:
11353 case Instruction::FNeg:
11354 case Instruction::Add:
11355 case Instruction::FAdd:
11356 case Instruction::Sub:
11357 case Instruction::FSub:
11358 case Instruction::Mul:
11359 case Instruction::FMul:
11360 case Instruction::UDiv:
11361 case Instruction::SDiv:
11362 case Instruction::FDiv:
11363 case Instruction::URem:
11364 case Instruction::SRem:
11365 case Instruction::FRem:
11366 case Instruction::Shl:
11367 case Instruction::LShr:
11368 case Instruction::AShr:
11369 case Instruction::And:
11370 case Instruction::Or:
11371 case Instruction::Xor:
11372 case Instruction::Freeze:
11373 case Instruction::Store:
11374 case Instruction::ShuffleVector:
11383 auto [
Op, ConvertedOps] = convertTo(
I, S);
11388 case Instruction::Select:
11402 Operands[0][Idx] =
I->getOperand(0);
11403 Operands[1][Idx] = ConstantInt::get(
I->getType(), 1);
11404 Operands[2][Idx] = ConstantInt::getNullValue(
I->getType());
11407 auto [
Op, ConvertedOps] = convertTo(
I, S);
11412 case Instruction::GetElementPtr: {
11419 const unsigned IndexIdx = 1;
11425 return !
GEP || VL0Ty ==
GEP->getOperand(IndexIdx)->getType();
11429 ->getPointerOperandType()
11430 ->getScalarType());
11434 Operands[0][Idx] =
V;
11435 Operands[1][Idx] = ConstantInt::getNullValue(Ty);
11438 Operands[0][Idx] =
GEP->getPointerOperand();
11439 auto *
Op =
GEP->getOperand(IndexIdx);
11442 CI, Ty, CI->getValue().isSignBitSet(),
DL)
11447 case Instruction::Call: {
11454 for (
Value *V : VL) {
11456 Ops.push_back(
I ?
I->getOperand(Idx)
11475 const InstructionsState &S,
11476 const InstructionsState &CopyableS) {
11483 Instruction *SAlt = S.isAltShuffle() ? S.getAltOp() :
nullptr;
11485 const bool IsAltCommutative =
11489 buildOriginalOperands(S, SMain,
Ops);
11491 if (
Ops.size() != 2)
11503 auto *I = dyn_cast<Instruction>(V);
11504 return I && I->getOpcode() == SMainOpI->getOpcode();
11507 SmallPtrSet<Value *, 8> Operands;
11508 for (
Value *V : VL) {
11510 if (!
I ||
I == SMain)
11512 Instruction *MatchingOp = S.getMatchingMainOpOrAltOp(
I);
11513 if (MatchingOp != SMain)
11516 buildOriginalOperands(S,
I, VOps);
11517 Operands.
insert(
I->op_begin(),
I->op_end());
11519 "Expected binary operations only.");
11520 if (CheckOperands(VOps[0][0],
Ops[0][0]) ||
11521 CheckOperands(VOps[1][0],
Ops[1][0]) ||
11522 (IsCommutative && (CheckOperands(VOps[0][0],
Ops[1][0]) ||
11523 CheckOperands(VOps[1][0],
Ops[0][0])))) {
11530 buildOriginalOperands(S, MainOp, MainOps);
11532 auto BuildFirstOperandCandidates =
11533 [&](SmallVectorImpl<std::pair<Value *, Value *>> &Candidates,
11535 bool IsCommutative) {
11541 auto BuildSecondOperandCandidates =
11542 [&](SmallVectorImpl<std::pair<Value *, Value *>> &Candidates,
11544 Value *Op1,
bool IsCommutative) {
11545 if (PrevBestIdx != 1)
11547 if (PrevBestIdx != 0 && IsCommutative)
11551 auto FindBestCandidate =
11554 auto Res =
R.findBestRootPair(Candidates);
11555 Score = Res.second;
11558 isConstant(Candidates[Res.first.value_or(0)].first) &&
11559 isConstant(Candidates[Res.first.value_or(0)].second);
11563 for (
const auto [Idx,
P] :
enumerate(Candidates)) {
11565 P.second ==
P.first) {
11568 Score =
isa<LoadInst>(Candidates[Res.first.value_or(0)].first)
11578 for (
Value *V : VL) {
11580 if (!
I || (
I == MainOp && (!S.isAltShuffle() ||
I == SMain)) ||
11581 (!S.isAltShuffle() &&
I == SMain))
11584 buildOriginalOperands(S,
I == SMain ? MainOp :
I, VOps);
11587 if (CopyableOps.
size() == VOps.
size() &&
11588 all_of(
zip(CopyableOps, VOps), [&](
const auto &
P) {
11589 return std::get<0>(
P) == std::get<1>(
P)[0];
11593 BuildFirstOperandCandidates(Candidates, MainOps, CopyableOps[0],
11594 CopyableOps[1], IsMainCommutative);
11595 const unsigned OpSize = Candidates.
size();
11597 S.getMatchingMainOpOrAltOp(
I) == S.getMainOp() ? SMain : SAlt;
11598 const bool IsCommutativeInst =
11599 (MatchingOp == SMain ? IsCommutative : IsAltCommutative) ||
11601 if (S.isAltShuffle() && MatchingOp == SAlt &&
11607 if (S.isAltShuffle() && MatchingOp == SMain)
11608 Operands.
insert(
I->op_begin(),
I->op_end());
11609 BuildFirstOperandCandidates(Candidates,
Ops, VOps[0][0], VOps[1][0],
11610 IsCommutativeInst);
11613 std::optional<int> BestOp =
11614 FindBestCandidate(Candidates, IsBestConst, Score);
11615 const bool IsOriginalBetter =
11616 static_cast<unsigned>(BestOp.value_or(OpSize)) >= OpSize;
11617 Candidates.
clear();
11618 BuildSecondOperandCandidates(
11619 Candidates, MainOps, IsOriginalBetter ? -1 : *BestOp, CopyableOps[0],
11620 CopyableOps[1], IsMainCommutative);
11621 const unsigned SecondOpSize = Candidates.
size();
11622 BuildSecondOperandCandidates(
11624 IsOriginalBetter ? BestOp.value_or(OpSize - 1) - OpSize : -1,
11625 VOps[0][0], VOps[1][0], IsCommutativeInst);
11626 bool IsSecondBestConst;
11628 std::optional<int> SecondBestOp =
11629 FindBestCandidate(Candidates, IsSecondBestConst, SecondScore);
11631 if (!BestOp && !SecondBestOp)
11634 const bool IsSecondOriginalBetter =
11635 static_cast<unsigned>(SecondBestOp.value_or(SecondOpSize)) >=
11637 if (IsOriginalBetter && IsSecondOriginalBetter)
11641 if (!BestOp && IsSecondOriginalBetter)
11645 if (!SecondBestOp && IsOriginalBetter)
11649 if (!IsOriginalBetter && IsBestConst && IsSecondOriginalBetter &&
11650 !IsSecondBestConst)
11654 if (BestOp && IsOriginalBetter && !IsBestConst &&
11655 !IsSecondOriginalBetter && IsSecondBestConst)
11658 if (((Score > SecondScore ||
11660 Score == SecondScore)) &&
11661 IsOriginalBetter) ||
11662 (IsSecondOriginalBetter &&
11663 (SecondScore > Score ||
11665 Score == SecondScore))))
11672 InstructionsCompatibilityAnalysis(DominatorTree &DT,
const DataLayout &
DL,
11673 const TargetTransformInfo &
TTI,
11674 const TargetLibraryInfo &TLI)
11679 bool WithProfitabilityCheck =
false,
11680 bool SkipSameCodeCheck =
false) {
11681 InstructionsState S = (SkipSameCodeCheck || !
allSameBlock(VL))
11682 ? InstructionsState::invalid()
11694 return (ZExt && ZExt->getSrcTy()->isIntegerTy(1)) ||
11698 return InstructionsState(SelectOp, SelectOp);
11700 if (S && S.isAltShuffle()) {
11701 Type *ScalarTy = S.getMainOp()->getType();
11703 unsigned Opcode0 = S.getOpcode();
11704 unsigned Opcode1 = S.getAltOpcode();
11705 SmallBitVector OpcodeMask(
11714 return !
I ||
I->getOpcode() == S.getOpcode();
11720 findAndSetMainInstruction(VL, R);
11723 InstructionsState OrigS = S;
11724 S = InstructionsState(MainOp, MainOp,
true);
11725 if (OrigS && !isCopyablePreferable(VL, R, OrigS, S))
11727 if (!WithProfitabilityCheck)
11731 auto BuildCandidates =
11732 [](SmallVectorImpl<std::pair<Value *, Value *>> &Candidates,
Value *V1,
11738 if (I1 && I2 &&
I1->getOpcode() == I2->getOpcode() &&
11739 I1->getParent() != I2->getParent())
11743 if (VL.
size() == 2) {
11746 BuildCandidates(Candidates1, Operands[0][0], Operands[0][1]);
11747 BuildCandidates(Candidates2, Operands[1][0], Operands[1][1]);
11748 bool Res = !Candidates1.
empty() && !Candidates2.
empty() &&
11749 R.findBestRootPair(Candidates1).first &&
11750 R.findBestRootPair(Candidates2).first;
11752 Candidates1.
clear();
11753 Candidates2.
clear();
11754 BuildCandidates(Candidates1, Operands[0][0], Operands[1][1]);
11755 BuildCandidates(Candidates2, Operands[1][0], Operands[0][1]);
11756 Res = !Candidates1.
empty() && !Candidates2.
empty() &&
11757 R.findBestRootPair(Candidates1).first &&
11758 R.findBestRootPair(Candidates2).first;
11765 FixedVectorType *VecTy =
11767 switch (MainOpcode) {
11768 case Instruction::Add:
11769 case Instruction::Sub:
11770 case Instruction::LShr:
11771 case Instruction::Shl:
11772 case Instruction::SDiv:
11773 case Instruction::UDiv:
11774 case Instruction::And:
11775 case Instruction::Or:
11776 case Instruction::Xor:
11777 case Instruction::FAdd:
11778 case Instruction::FMul:
11779 case Instruction::FSub:
11780 case Instruction::FDiv:
11786 if (VectorCost > ScalarCost)
11790 assert(Operands.
size() == 2 &&
"Unexpected number of operands!");
11791 unsigned CopyableNum =
11792 count_if(VL, [&](
Value *V) {
return S.isCopyableElement(V); });
11793 if (CopyableNum < VL.
size() / 2)
11796 const unsigned Limit = VL.
size() / 24;
11797 if ((CopyableNum >= VL.
size() - Limit ||
11798 (CopyableNum >= VL.
size() - 1 && VL.
size() > 4) ||
11807 for (
auto [OpL, OpR] :
zip(Operands.
front(), Operands.
back())) {
11829 constexpr unsigned Limit = 4;
11830 if (Operands.
front().size() >= Limit) {
11831 SmallDenseMap<const Value *, unsigned>
Counters;
11839 return C.second == 1;
11845 InstructionsCompatibilityAnalysis
Analysis(DT,
DL,
TTI, TLI);
11846 InstructionsState OpS =
Analysis.buildInstructionsState(
Ops, R);
11847 if (!OpS || (OpS.getOpcode() == Instruction::PHI && !
allSameBlock(
Ops)))
11849 unsigned CopyableNum =
11851 return CopyableNum <= VL.
size() / 2;
11853 if (!CheckOperand(Operands.
front()))
11861 assert(S &&
"Invalid state!");
11863 if (S.areInstructionsWithCopyableElements()) {
11864 MainOp = S.getMainOp();
11865 MainOpcode = S.getOpcode();
11870 for (
auto [OperandIdx, Operand] :
enumerate(OperandsForValue))
11871 Operands[OperandIdx][Idx] = Operand;
11874 buildOriginalOperands(S, VL, Operands);
11881BoUpSLP::ScalarsVectorizationLegality
11883 const EdgeInfo &UserTreeIdx)
const {
11886 InstructionsCompatibilityAnalysis
Analysis(*DT, *DL, *TTI, *TLI);
11887 InstructionsState S =
Analysis.buildInstructionsState(
11890 bool AreScatterAllGEPSameBlock =
false;
11892 SmallVector<unsigned> SortedIndices;
11894 bool IsScatterVectorizeUserTE =
11895 UserTreeIdx.UserTE &&
11896 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
11897 AreScatterAllGEPSameBlock =
11911 *SE, SortedIndices));
11912 if (!AreScatterAllGEPSameBlock) {
11913 LLVM_DEBUG(
dbgs() <<
"SLP: Try split and if failed, gathering due to "
11914 "C,S,B,O, small shuffle. \n";
11918 return ScalarsVectorizationLegality(S,
false,
11924 assert(It != VL.
end() &&
"Expected at least one GEP.");
11927 assert(S &&
"Must be valid.");
11933 return ScalarsVectorizationLegality(S,
false,
11939 BasicBlock *BB = S.getMainOp()->getParent();
11942 !DT->isReachableFromEntry(BB)) {
11948 return ScalarsVectorizationLegality(S,
false);
11957 return ScalarsVectorizationLegality(S,
false,
11962 if (S.getOpcode() == Instruction::ExtractElement &&
11965 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering due to scalable vector type.\n");
11966 return ScalarsVectorizationLegality(S,
false);
11973 (S.isAltShuffle() || VL.
size() < 4 ||
11980 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering due to max recursion depth.\n");
11981 return ScalarsVectorizationLegality(S,
false);
11985 LLVM_DEBUG(
dbgs() <<
"SLP: \tChecking bundle: " << *S.getMainOp() <<
".\n");
11986 for (TreeEntry *
E : getTreeEntries(S.getMainOp())) {
11987 if (
E->isSame(VL)) {
11988 LLVM_DEBUG(
dbgs() <<
"SLP: Perfect diamond merge at " << *S.getMainOp()
11990 return ScalarsVectorizationLegality(S,
false);
11995 (S.getOpcode() == Instruction::PHI &&
isa<PHINode>(V) &&
11996 LI->getLoopFor(S.getMainOp()->getParent()) &&
12000 return ScalarsVectorizationLegality(S,
false);
12004 bool AreAllSameBlock = !AreScatterAllGEPSameBlock;
12005 bool AreAllSameInsts = AreAllSameBlock || AreScatterAllGEPSameBlock;
12006 if (!AreAllSameInsts ||
isSplat(VL) ||
12010 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering due to C,S,B,O conditions. \n";
12014 return ScalarsVectorizationLegality(S,
false);
12018 if (!EphValues.empty()) {
12019 for (
Value *V : VL) {
12020 if (EphValues.count(V)) {
12022 <<
") is ephemeral.\n");
12024 return ScalarsVectorizationLegality(S,
false,
12036 if (S.isAltShuffle()) {
12037 auto GetNumVectorizedExtracted = [&]() {
12043 all_of(
I->operands(), [&](
const Use &U) {
12044 return isa<ExtractElementInst>(U.get());
12049 else if (!
I->hasOneUser() && !areAllUsersVectorized(
I, UserIgnoreList))
12052 return std::make_pair(Vectorized, Extracted);
12054 auto [Vectorized, Extracted] = GetNumVectorizedExtracted();
12056 bool PreferScalarize = !Vectorized.
isAllOnes() && VL.size() == 2;
12057 if (!Vectorized.
isAllOnes() && !PreferScalarize) {
12060 Type *ScalarTy = VL.front()->getType();
12065 false,
true, Kind);
12067 *TTI, ScalarTy, VecTy, Vectorized,
12068 true,
false, Kind,
false);
12069 PreferScalarize = VectorizeCostEstimate > ScalarizeCostEstimate;
12071 if (PreferScalarize) {
12072 LLVM_DEBUG(
dbgs() <<
"SLP: The instructions are in tree and alternate "
12073 "node is not profitable.\n");
12074 return ScalarsVectorizationLegality(S,
false);
12079 if (UserIgnoreList && !UserIgnoreList->empty()) {
12080 for (
Value *V : VL) {
12081 if (UserIgnoreList->contains(V)) {
12082 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering due to gathered scalar.\n");
12083 return ScalarsVectorizationLegality(S,
false);
12088 return ScalarsVectorizationLegality(S,
true);
12093 unsigned InterleaveFactor) {
12096 SmallVector<int> ReuseShuffleIndices;
12100 auto TrySplitNode = [&](
const InstructionsState &LocalState) {
12106 auto Invalid = ScheduleBundle::invalid();
12107 auto *
TE = newTreeEntry(VL, TreeEntry::SplitVectorize,
Invalid, LocalState,
12108 UserTreeIdx, {}, ReorderIndices);
12113 getSameValuesTreeEntry(S.getMainOp(),
Op,
true))) {
12115 TE->CombinedEntriesWithIndices.emplace_back(VectorizableTree.size(),
12116 Idx == 0 ? 0 : Op1.
size());
12117 (void)newTreeEntry(
Op, TreeEntry::NeedToGather,
Invalid, S, {
TE, Idx});
12119 TE->CombinedEntriesWithIndices.emplace_back(VectorizableTree.size(),
12120 Idx == 0 ? 0 : Op1.
size());
12130 bool AreConsts =
false;
12131 for (
Value *V : VL) {
12143 if (AreOnlyConstsWithPHIs(VL)) {
12144 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering due to all constants and PHIs.\n");
12145 newGatherTreeEntry(VL, InstructionsState::invalid(), UserTreeIdx);
12149 ScalarsVectorizationLegality Legality =
12150 getScalarsVectorizationLegality(VL,
Depth, UserTreeIdx);
12151 InstructionsState S = Legality.getInstructionsState();
12152 if (!Legality.isLegal()) {
12153 if (Legality.trySplitVectorize()) {
12156 if (MainOp && AltOp && TrySplitNode(InstructionsState(MainOp, AltOp)))
12159 if (Legality.tryToFindDuplicates())
12162 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
12167 if (S.isAltShuffle() && TrySplitNode(S))
12173 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
12178 bool IsScatterVectorizeUserTE =
12179 UserTreeIdx.UserTE &&
12180 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
12183 StridedPtrInfo SPtrInfo;
12184 TreeEntry::EntryState State = getScalarsVectorizationState(
12185 S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps, SPtrInfo);
12186 if (State == TreeEntry::NeedToGather) {
12187 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
12194 if (VectorizableTree.empty()) {
12195 assert(CurrentLoopNest.empty() &&
"Expected empty loop nest");
12197 BasicBlock *Parent = S.getMainOp()->getParent();
12198 if (
const Loop *L = LI->getLoopFor(Parent)) {
12201 CurrentLoopNest.assign(getLoopNest(L));
12203 }
else if (!UserTreeIdx ||
12204 UserTreeIdx.UserTE->State == TreeEntry::SplitVectorize ||
12205 UserTreeIdx.UserTE->isGather() ||
12206 UserTreeIdx.UserTE->getMainOp()->getParent() !=
12207 S.getMainOp()->getParent()) {
12208 BasicBlock *Parent = S.getMainOp()->getParent();
12209 if (
const Loop *L = LI->getLoopFor(Parent)) {
12214 SmallVector<const Loop *> NewLoopNest(getLoopNest(L));
12215 for (
const auto [L1, L2] :
zip(CurrentLoopNest, NewLoopNest)) {
12218 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
12222 if (NewLoopNest.size() > CurrentLoopNest.size())
12223 CurrentLoopNest.append(std::next(NewLoopNest.begin(), CurrentLoopNest.size()),
12224 NewLoopNest.end());
12231 auto &BSRef = BlocksSchedules[BB];
12233 BSRef = std::make_unique<BlockScheduling>(BB);
12235 BlockScheduling &BS = *BSRef;
12238 std::optional<ScheduleBundle *> BundlePtr =
12239 BS.tryScheduleBundle(UniqueValues.getArrayRef(),
this, S, UserTreeIdx);
12240#ifdef EXPENSIVE_CHECKS
12244 if (!BundlePtr || (*BundlePtr && !*BundlePtr.value())) {
12245 LLVM_DEBUG(
dbgs() <<
"SLP: We are not able to schedule this bundle!\n");
12247 if (S.isAltShuffle() && ReuseShuffleIndices.
empty() && TrySplitNode(S))
12249 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
12250 NonScheduledFirst.insert(VL.front());
12251 if (S.getOpcode() == Instruction::Load &&
12252 BS.ScheduleRegionSize < BS.ScheduleRegionSizeLimit)
12256 InstructionsCompatibilityAnalysis
Analysis(*DT, *DL, *TTI, *TLI);
12258 ScheduleBundle
Empty;
12259 ScheduleBundle &Bundle = BundlePtr.value() ? *BundlePtr.value() :
Empty;
12260 LLVM_DEBUG(
dbgs() <<
"SLP: We are able to schedule this bundle.\n");
12262 unsigned ShuffleOrOp =
12263 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.
getOpcode();
12264 auto CreateOperandNodes = [&](TreeEntry *
TE,
const auto &Operands) {
12266 SmallVector<unsigned> PHIOps;
12272 if ((!S || S.getOpcode() != Instruction::PHI) || S.isAltShuffle())
12277 for (
unsigned I : PHIOps)
12278 buildTreeRec(Operands[
I],
Depth + 1, {
TE,
I});
12280 switch (ShuffleOrOp) {
12281 case Instruction::PHI: {
12283 newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
12287 TE->setOperands(Operands);
12288 CreateOperandNodes(TE, Operands);
12291 case Instruction::ExtractValue:
12292 case Instruction::ExtractElement: {
12293 if (CurrentOrder.empty()) {
12294 LLVM_DEBUG(
dbgs() <<
"SLP: Reusing or shuffling extract sequence.\n");
12297 dbgs() <<
"SLP: Reusing or shuffling of reordered extract sequence "
12299 for (
unsigned Idx : CurrentOrder)
12300 dbgs() <<
" " << Idx;
12307 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
12308 ReuseShuffleIndices, CurrentOrder);
12310 "(ExtractValueInst/ExtractElementInst).\n";
12314 TE->setOperands(Operands);
12317 case Instruction::InsertElement: {
12318 assert(ReuseShuffleIndices.
empty() &&
"All inserts should be unique");
12320 auto OrdCompare = [](
const std::pair<int, int> &
P1,
12321 const std::pair<int, int> &
P2) {
12322 return P1.first >
P2.first;
12325 decltype(OrdCompare)>
12326 Indices(OrdCompare);
12327 for (
int I = 0,
E = VL.size();
I <
E; ++
I) {
12329 Indices.emplace(Idx,
I);
12331 OrdersType CurrentOrder(VL.size(), VL.size());
12332 bool IsIdentity =
true;
12333 for (
int I = 0,
E = VL.size();
I <
E; ++
I) {
12334 CurrentOrder[Indices.top().second] =
I;
12335 IsIdentity &= Indices.top().second ==
I;
12339 CurrentOrder.clear();
12340 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
12342 LLVM_DEBUG(
dbgs() <<
"SLP: added a new TreeEntry (InsertElementInst).\n";
12345 TE->setOperands(Operands);
12346 buildTreeRec(
TE->getOperand(1),
Depth + 1, {TE, 1});
12349 case Instruction::Load: {
12356 TreeEntry *
TE =
nullptr;
12359 case TreeEntry::Vectorize:
12360 TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
12361 ReuseShuffleIndices, CurrentOrder, InterleaveFactor);
12362 if (CurrentOrder.empty())
12363 LLVM_DEBUG(
dbgs() <<
"SLP: added a new TreeEntry (LoadInst).\n";
12367 <<
"SLP: added a new TreeEntry (jumbled LoadInst).\n";
12370 case TreeEntry::CompressVectorize:
12372 TE = newTreeEntry(VL, TreeEntry::CompressVectorize, Bundle, S,
12373 UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
12376 <<
"SLP: added a new TreeEntry (masked LoadInst + compress).\n";
12379 case TreeEntry::StridedVectorize:
12381 TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
12382 UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
12383 TreeEntryToStridedPtrInfoMap[
TE] = SPtrInfo;
12384 LLVM_DEBUG(
dbgs() <<
"SLP: added a new TreeEntry (strided LoadInst).\n";
12387 case TreeEntry::ScatterVectorize:
12389 TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
12390 UserTreeIdx, ReuseShuffleIndices);
12393 <<
"SLP: added a new TreeEntry (non-consecutive LoadInst).\n";
12396 case TreeEntry::CombinedVectorize:
12397 case TreeEntry::SplitVectorize:
12398 case TreeEntry::NeedToGather:
12401 if (!CurrentOrder.empty() && State != TreeEntry::ScatterVectorize) {
12402 assert(Operands.
size() == 1 &&
"Expected a single operand only");
12403 SmallVector<int>
Mask;
12407 TE->setOperands(Operands);
12408 if (State == TreeEntry::ScatterVectorize)
12409 buildTreeRec(PointerOps,
Depth + 1, {
TE, 0});
12412 case Instruction::ZExt:
12413 case Instruction::SExt:
12414 case Instruction::FPToUI:
12415 case Instruction::FPToSI:
12416 case Instruction::FPExt:
12417 case Instruction::PtrToInt:
12418 case Instruction::IntToPtr:
12419 case Instruction::SIToFP:
12420 case Instruction::UIToFP:
12421 case Instruction::Trunc:
12422 case Instruction::FPTrunc:
12423 case Instruction::BitCast: {
12424 auto [PrevMaxBW, PrevMinBW] = CastMaxMinBWSizes.value_or(
12425 std::make_pair(std::numeric_limits<unsigned>::min(),
12426 std::numeric_limits<unsigned>::max()));
12427 if (ShuffleOrOp == Instruction::ZExt ||
12428 ShuffleOrOp == Instruction::SExt) {
12429 CastMaxMinBWSizes = std::make_pair(
12430 std::max<unsigned>(DL->getTypeSizeInBits(VL0->
getType()),
12432 std::min<unsigned>(
12435 }
else if (ShuffleOrOp == Instruction::Trunc) {
12436 CastMaxMinBWSizes = std::make_pair(
12437 std::max<unsigned>(
12440 std::min<unsigned>(DL->getTypeSizeInBits(VL0->
getType()),
12443 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
12444 ReuseShuffleIndices);
12445 LLVM_DEBUG(
dbgs() <<
"SLP: added a new TreeEntry (CastInst).\n";
12448 TE->setOperands(Operands);
12450 buildTreeRec(
TE->getOperand(
I),
Depth, {TE, I});
12451 if (ShuffleOrOp == Instruction::Trunc) {
12452 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
12453 }
else if (ShuffleOrOp == Instruction::SIToFP ||
12454 ShuffleOrOp == Instruction::UIToFP) {
12455 unsigned NumSignBits =
12458 APInt
Mask = DB->getDemandedBits(OpI);
12459 NumSignBits = std::max(NumSignBits,
Mask.countl_zero());
12461 if (NumSignBits * 2 >=
12463 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
12467 case Instruction::ICmp:
12468 case Instruction::FCmp: {
12471 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
12472 ReuseShuffleIndices);
12481 "Commutative Predicate mismatch");
12484 Operands.
back() =
Ops.getVL(1);
12491 if (
Cmp->getPredicate() != P0)
12495 TE->setOperands(Operands);
12496 buildTreeRec(Operands.
front(),
Depth, {TE, 0});
12497 buildTreeRec(Operands.
back(),
Depth, {TE, 1});
12498 if (ShuffleOrOp == Instruction::ICmp) {
12499 unsigned NumSignBits0 =
12501 if (NumSignBits0 * 2 >=
12503 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
12504 unsigned NumSignBits1 =
12506 if (NumSignBits1 * 2 >=
12508 ExtraBitWidthNodes.insert(getOperandEntry(TE, 1)->Idx);
12512 case Instruction::Select:
12513 case Instruction::FNeg:
12514 case Instruction::Add:
12515 case Instruction::FAdd:
12516 case Instruction::Sub:
12517 case Instruction::FSub:
12518 case Instruction::Mul:
12519 case Instruction::FMul:
12520 case Instruction::UDiv:
12521 case Instruction::SDiv:
12522 case Instruction::FDiv:
12523 case Instruction::URem:
12524 case Instruction::SRem:
12525 case Instruction::FRem:
12526 case Instruction::Shl:
12527 case Instruction::LShr:
12528 case Instruction::AShr:
12529 case Instruction::And:
12530 case Instruction::Or:
12531 case Instruction::Xor:
12532 case Instruction::Freeze: {
12533 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
12534 ReuseShuffleIndices);
12536 dbgs() <<
"SLP: added a new TreeEntry "
12537 "(SelectInst/UnaryOperator/BinaryOperator/FreezeInst).\n";
12543 Operands[0] =
Ops.getVL(0);
12544 Operands[1] =
Ops.getVL(1);
12546 TE->setOperands(Operands);
12548 buildTreeRec(
TE->getOperand(
I),
Depth + 1, {TE, I});
12551 case Instruction::GetElementPtr: {
12552 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
12553 ReuseShuffleIndices);
12554 LLVM_DEBUG(
dbgs() <<
"SLP: added a new TreeEntry (GetElementPtrInst).\n";
12556 TE->setOperands(Operands);
12559 buildTreeRec(Operands[
I],
Depth + 1, {
TE,
I});
12562 case Instruction::Store: {
12563 bool Consecutive = CurrentOrder.empty();
12566 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
12567 ReuseShuffleIndices, CurrentOrder);
12569 LLVM_DEBUG(
dbgs() <<
"SLP: added a new TreeEntry (StoreInst).\n";
12573 dbgs() <<
"SLP: added a new TreeEntry (jumbled StoreInst).\n";
12575 TE->setOperands(Operands);
12576 buildTreeRec(
TE->getOperand(0),
Depth + 1, {TE, 0});
12579 case Instruction::Call: {
12585 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
12586 ReuseShuffleIndices);
12587 LLVM_DEBUG(
dbgs() <<
"SLP: added a new TreeEntry (CallInst).\n";
12592 Operands[0] =
Ops.getVL(0);
12593 Operands[1] =
Ops.getVL(1);
12595 TE->setOperands(Operands);
12601 buildTreeRec(
TE->getOperand(
I),
Depth + 1, {TE, I});
12605 case Instruction::ShuffleVector: {
12606 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
12607 ReuseShuffleIndices);
12608 if (S.isAltShuffle()) {
12609 LLVM_DEBUG(
dbgs() <<
"SLP: added a new TreeEntry (isAltShuffle).\n";
12614 dbgs() <<
"SLP: added a new TreeEntry (ShuffleVectorInst).\n";
12628 "Expected different main/alternate predicates.");
12644 TE->setOperands(Operands);
12645 buildTreeRec(Operands.
front(),
Depth + 1, {TE, 0});
12646 buildTreeRec(Operands.
back(),
Depth + 1, {TE, 1});
12653 Operands[0] =
Ops.getVL(0);
12654 Operands[1] =
Ops.getVL(1);
12656 TE->setOperands(Operands);
12658 buildTreeRec(
TE->getOperand(
I),
Depth + 1, {TE, I});
12676 for (
const auto *Ty : ST->elements())
12677 if (Ty != *ST->element_begin())
12679 N *= ST->getNumElements();
12680 EltTy = *ST->element_begin();
12682 N *= AT->getNumElements();
12683 EltTy = AT->getElementType();
12686 N *= VT->getNumElements();
12687 EltTy = VT->getElementType();
12693 size_t VTSize = DL->getTypeStoreSizeInBits(
getWidenedType(EltTy,
N));
12694 if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
12695 VTSize != DL->getTypeStoreSizeInBits(T))
12702 bool ResizeAllowed)
const {
12704 assert(It != VL.
end() &&
"Expected at least one extract instruction.");
12711 Value *Vec = E0->getOperand(0);
12713 CurrentOrder.
clear();
12717 if (E0->getOpcode() == Instruction::ExtractValue) {
12729 unsigned E = VL.
size();
12730 if (!ResizeAllowed && NElts !=
E)
12733 unsigned MinIdx = NElts, MaxIdx = 0;
12738 if (Inst->getOperand(0) != Vec)
12746 const unsigned ExtIdx = *Idx;
12747 if (ExtIdx >= NElts)
12749 Indices[
I] = ExtIdx;
12750 if (MinIdx > ExtIdx)
12752 if (MaxIdx < ExtIdx)
12755 if (MaxIdx - MinIdx + 1 >
E)
12757 if (MaxIdx + 1 <=
E)
12761 bool ShouldKeepOrder =
true;
12768 for (
unsigned I = 0;
I <
E; ++
I) {
12771 const unsigned ExtIdx = Indices[
I] - MinIdx;
12772 if (CurrentOrder[ExtIdx] !=
E) {
12773 CurrentOrder.
clear();
12776 ShouldKeepOrder &= ExtIdx ==
I;
12777 CurrentOrder[ExtIdx] =
I;
12779 if (ShouldKeepOrder)
12780 CurrentOrder.
clear();
12782 return ShouldKeepOrder;
12785bool BoUpSLP::areAllUsersVectorized(
12786 Instruction *
I,
const SmallDenseSet<Value *> *VectorizedVals)
const {
12787 return (
I->hasOneUse() && (!VectorizedVals || VectorizedVals->
contains(
I))) ||
12788 all_of(
I->users(), [
this](User *U) {
12789 return isVectorized(U) || isVectorLikeInstWithConstOps(U) ||
12790 (isa<ExtractElementInst>(U) && MustGather.contains(U));
12794void BoUpSLP::TreeEntry::buildAltOpShuffleMask(
12795 const function_ref<
bool(Instruction *)> IsAltOp, SmallVectorImpl<int> &Mask,
12796 SmallVectorImpl<Value *> *OpScalars,
12797 SmallVectorImpl<Value *> *AltScalars)
const {
12798 unsigned Sz = Scalars.size();
12800 SmallVector<int> OrderMask;
12801 if (!ReorderIndices.empty())
12803 for (
unsigned I = 0;
I < Sz; ++
I) {
12805 if (!ReorderIndices.empty())
12806 Idx = OrderMask[
I];
12810 if (IsAltOp(OpInst)) {
12811 Mask[
I] = Sz + Idx;
12820 if (!ReuseShuffleIndices.
empty()) {
12822 transform(ReuseShuffleIndices, NewMask.
begin(), [&Mask](
int Idx) {
12823 return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem;
12825 Mask.swap(NewMask);
12832 return InstructionsState(MainOp, AltOp).getMatchingMainOpOrAltOp(
I) == MainOp;
12842 assert(MainP != AltP &&
"Expected different main/alternate predicates.");
12851 assert((MainP ==
P || AltP ==
P || MainP == SwappedP || AltP == SwappedP) &&
12852 "CmpInst expected to match either main or alternate predicate or "
12854 return MainP !=
P && MainP != SwappedP;
12856 return InstructionsState(MainOp, AltOp).getMatchingMainOpOrAltOp(
I) == AltOp;
12874 return CI->getValue().isPowerOf2();
12880 return CI->getValue().isNegatedPowerOf2();
12885 if (IsConstant && IsUniform)
12887 else if (IsConstant)
12889 else if (IsUniform)
12901class BaseShuffleAnalysis {
12903 Type *ScalarTy =
nullptr;
12905 BaseShuffleAnalysis(
Type *ScalarTy) : ScalarTy(ScalarTy) {}
12913 unsigned getVF(
Value *V)
const {
12914 assert(V &&
"V cannot be nullptr");
12916 "V does not have FixedVectorType");
12917 assert(ScalarTy &&
"ScalarTy cannot be nullptr");
12919 unsigned VNumElements =
12921 assert(VNumElements > ScalarTyNumElements &&
12922 "the number of elements of V is not large enough");
12923 assert(VNumElements % ScalarTyNumElements == 0 &&
12924 "the number of elements of V is not a vectorized value");
12925 return VNumElements / ScalarTyNumElements;
12931 static bool isIdentityMask(ArrayRef<int> Mask,
const FixedVectorType *VecTy,
12933 int Limit =
Mask.size();
12945 if (Limit % VF == 0 &&
all_of(
seq<int>(0, Limit / VF), [=](
int Idx) {
12946 ArrayRef<int> Slice =
Mask.slice(Idx * VF, VF);
12959 static void combineMasks(
unsigned LocalVF, SmallVectorImpl<int> &Mask,
12960 ArrayRef<int> ExtMask) {
12961 unsigned VF =
Mask.size();
12963 for (
int I = 0, Sz = ExtMask.
size();
I < Sz; ++
I) {
12966 int MaskedIdx =
Mask[ExtMask[
I] % VF];
12970 Mask.swap(NewMask);
13006 static bool peekThroughShuffles(
Value *&V, SmallVectorImpl<int> &Mask,
13007 bool SinglePermute) {
13009 ShuffleVectorInst *IdentityOp =
nullptr;
13010 SmallVector<int> IdentityMask;
13019 if (isIdentityMask(Mask, SVTy,
false)) {
13020 if (!IdentityOp || !SinglePermute ||
13021 (isIdentityMask(Mask, SVTy,
true) &&
13023 IdentityMask.
size()))) {
13028 IdentityMask.
assign(Mask);
13048 if (SV->isZeroEltSplat()) {
13050 IdentityMask.
assign(Mask);
13052 int LocalVF =
Mask.size();
13055 LocalVF = SVOpTy->getNumElements();
13059 static_cast<unsigned>(
I) >= SV->getShuffleMask().size())
13061 ExtMask[Idx] = SV->getMaskValue(
I);
13071 if (!IsOp1Undef && !IsOp2Undef) {
13073 for (
int &
I : Mask) {
13076 if (SV->getMaskValue(
I % SV->getShuffleMask().size()) ==
13082 SmallVector<int> ShuffleMask(SV->getShuffleMask());
13083 combineMasks(LocalVF, ShuffleMask, Mask);
13084 Mask.swap(ShuffleMask);
13086 Op = SV->getOperand(0);
13088 Op = SV->getOperand(1);
13091 !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute) ||
13096 "Expected masks of same sizes.");
13101 Mask.swap(IdentityMask);
13103 return SinglePermute &&
13106 (Shuffle &&
Mask.size() == Shuffle->getShuffleMask().size() &&
13107 Shuffle->isZeroEltSplat() &&
13111 Shuffle->getShuffleMask()[
P.index()] == 0;
13124 template <
typename T,
typename ShuffleBuilderTy>
13125 static T createShuffle(
Value *V1,
Value *V2, ArrayRef<int> Mask,
13126 ShuffleBuilderTy &Builder,
Type *ScalarTy) {
13127 assert(V1 &&
"Expected at least one vector value.");
13129 SmallVector<int> NewMask(Mask);
13130 if (ScalarTyNumElements != 1) {
13136 Builder.resizeToMatch(V1, V2);
13137 int VF =
Mask.size();
13139 VF = FTy->getNumElements();
13150 for (
int I = 0,
E =
Mask.size();
I <
E; ++
I) {
13152 CombinedMask1[
I] =
Mask[
I];
13154 CombinedMask2[
I] =
Mask[
I] - VF;
13161 (void)peekThroughShuffles(Op1, CombinedMask1,
false);
13162 (void)peekThroughShuffles(Op2, CombinedMask2,
false);
13168 for (
auto [Idx,
I] :
enumerate(CombinedMask1)) {
13171 ExtMask1[Idx] = SV1->getMaskValue(
I);
13175 ->getNumElements(),
13176 ExtMask1, UseMask::SecondArg);
13177 SmallVector<int> ExtMask2(CombinedMask2.size(),
PoisonMaskElem);
13178 for (
auto [Idx,
I] :
enumerate(CombinedMask2)) {
13181 ExtMask2[Idx] = SV2->getMaskValue(
I);
13185 ->getNumElements(),
13186 ExtMask2, UseMask::SecondArg);
13187 if (SV1->getOperand(0)->getType() ==
13188 SV2->getOperand(0)->getType() &&
13189 SV1->getOperand(0)->getType() != SV1->getType() &&
13192 Op1 = SV1->getOperand(0);
13193 Op2 = SV2->getOperand(0);
13194 SmallVector<int> ShuffleMask1(SV1->getShuffleMask());
13195 int LocalVF = ShuffleMask1.size();
13197 LocalVF = FTy->getNumElements();
13198 combineMasks(LocalVF, ShuffleMask1, CombinedMask1);
13199 CombinedMask1.swap(ShuffleMask1);
13200 SmallVector<int> ShuffleMask2(SV2->getShuffleMask());
13201 LocalVF = ShuffleMask2.size();
13203 LocalVF = FTy->getNumElements();
13204 combineMasks(LocalVF, ShuffleMask2, CombinedMask2);
13205 CombinedMask2.swap(ShuffleMask2);
13208 }
while (PrevOp1 != Op1 || PrevOp2 != Op2);
13209 Builder.resizeToMatch(Op1, Op2);
13211 ->getElementCount()
13212 .getKnownMinValue(),
13214 ->getElementCount()
13215 .getKnownMinValue());
13216 for (
int I = 0,
E =
Mask.size();
I <
E; ++
I) {
13219 "Expected undefined mask element");
13220 CombinedMask1[
I] = CombinedMask2[
I] + (Op1 == Op2 ? 0 : VF);
13229 return Builder.createIdentity(Op1);
13230 return Builder.createShuffleVector(
13235 return Builder.createPoison(
13237 bool IsIdentity = peekThroughShuffles(V1, NewMask,
true);
13238 assert(V1 &&
"Expected non-null value after looking through shuffles.");
13241 return Builder.createShuffleVector(V1, NewMask);
13242 return Builder.createIdentity(V1);
13248 ArrayRef<int> Mask) {
13257static std::pair<InstructionCost, InstructionCost>
13268 if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
13277 ScalarCost =
TTI.getPointersChainCost(
13278 Ptrs, BasePtr, TTI::PointersChainInfo::getUnitStride(), ScalarTy,
13282 for (
Value *V : Ptrs) {
13283 if (V == BasePtr) {
13296 if (PtrsRetainedInVecCode.
size() == Ptrs.size()) {
13301 VecCost =
TTI.getPointersChainCost(PtrsRetainedInVecCode, BasePtr,
13302 TTI::PointersChainInfo::getKnownStride(),
13312 [](
const Value *V) {
13314 return Ptr && !Ptr->hasAllConstantIndices();
13316 ? TTI::PointersChainInfo::getUnknownStride()
13317 : TTI::PointersChainInfo::getKnownStride();
13320 TTI.getPointersChainCost(Ptrs, BasePtr, PtrsInfo, ScalarTy,
CostKind);
13324 if (It != Ptrs.
end())
13329 VecCost =
TTI.getGEPCost(BaseGEP->getSourceElementType(),
13330 BaseGEP->getPointerOperand(), Indices, VecTy,
13335 return std::make_pair(ScalarCost, VecCost);
13338void BoUpSLP::reorderGatherNode(TreeEntry &TE) {
13339 assert(
TE.isGather() &&
TE.ReorderIndices.empty() &&
13340 "Expected gather node without reordering.");
13342 SmallSet<size_t, 2> LoadKeyUsed;
13346 if (
TE.Scalars.size() == 2 || (
TE.hasState() && !
TE.isAltShuffle()) ||
13351 return VectorizableTree[Idx]->isSame(TE.Scalars);
13355 auto GenerateLoadsSubkey = [&](
size_t Key, LoadInst *LI) {
13360 auto LIt = LoadsMap.
find(std::make_pair(
Key, Ptr));
13361 if (LIt != LoadsMap.
end()) {
13362 for (LoadInst *RLI : LIt->second) {
13364 LI->
getType(), LI->getPointerOperand(), *DL, *SE,
13368 for (LoadInst *RLI : LIt->second) {
13370 LI->getPointerOperand(), *TLI)) {
13375 if (LIt->second.size() > 2) {
13377 hash_value(LIt->second.back()->getPointerOperand());
13383 LoadsMap.
try_emplace(std::make_pair(
Key, Ptr)).first->second.push_back(LI);
13386 MapVector<size_t, MapVector<size_t, SmallVector<Value *>>> SortedValues;
13387 SmallDenseMap<Value *, SmallVector<unsigned>, 8> KeyToIndex;
13388 bool IsOrdered =
true;
13389 unsigned NumInstructions = 0;
13393 size_t Key = 1, Idx = 1;
13401 auto &Container = SortedValues[
Key];
13402 if (IsOrdered && !KeyToIndex.
contains(V) &&
13405 ((Container.contains(Idx) &&
13406 KeyToIndex.
at(Container[Idx].back()).back() !=
I - 1) ||
13407 (!Container.empty() && !Container.contains(Idx) &&
13408 KeyToIndex.
at(Container.back().second.back()).back() !=
I - 1)))
13410 auto &KTI = KeyToIndex[
V];
13412 Container[Idx].push_back(V);
13417 if (!IsOrdered && NumInstructions > 1) {
13419 TE.ReorderIndices.resize(
TE.Scalars.size(),
TE.Scalars.size());
13420 for (
const auto &
D : SortedValues) {
13421 for (
const auto &
P :
D.second) {
13423 for (
Value *V :
P.second) {
13424 ArrayRef<unsigned> Indices = KeyToIndex.
at(V);
13425 for (
auto [K, Idx] :
enumerate(Indices)) {
13426 TE.ReorderIndices[Cnt +
K] = Idx;
13427 TE.Scalars[Cnt +
K] =
V;
13429 Sz += Indices.
size();
13430 Cnt += Indices.
size();
13434 *TTI,
TE.Scalars.front()->getType(), Sz);
13438 }
else if (!
P.second.empty() &&
isConstant(
P.second.front())) {
13446 if (!
TE.ReuseShuffleIndices.empty() ||
TE.ReorderIndices.empty())
13451 auto *ScalarTy =
TE.Scalars.front()->getType();
13453 for (
auto [Idx, Sz] : SubVectors) {
13460 int Sz =
TE.Scalars.size();
13461 SmallVector<int> ReorderMask(
TE.ReorderIndices.begin(),
13462 TE.ReorderIndices.end());
13468 ReorderMask[
I] =
I +
TE.ReorderIndices.size();
13472 any_of(ReorderMask, [&](
int I) {
return I >= Sz; })
13475 VecTy, ReorderMask);
13481 DemandedElts.clearBit(
I);
13483 ReorderMask[
I] =
I;
13485 ReorderMask[
I] =
I + Sz;
13491 if (!DemandedElts.isAllOnes())
13493 if (
Cost >= BVCost) {
13494 SmallVector<int>
Mask(
TE.ReorderIndices.begin(),
TE.ReorderIndices.end());
13496 TE.ReorderIndices.clear();
13503 const InstructionsState &S,
13509 return V->getType()->getScalarType()->isFloatingPointTy();
13511 "Can only convert to FMA for floating point types");
13512 assert(S.isAddSubLikeOp() &&
"Can only convert to FMA for add/sub");
13517 for (
Value *V : VL) {
13521 if (S.isCopyableElement(
I))
13523 Instruction *MatchingI = S.getMatchingMainOpOrAltOp(
I);
13524 if (S.getMainOp() != MatchingI && S.getAltOp() != MatchingI)
13527 FMF &= FPCI->getFastMathFlags();
13531 if (!CheckForContractable(VL))
13534 InstructionsCompatibilityAnalysis
Analysis(DT,
DL,
TTI, TLI);
13541 if (OpS.isAltShuffle() || OpS.getOpcode() != Instruction::FMul)
13543 if (!CheckForContractable(Operands.
front()))
13551 for (
Value *V : VL) {
13555 if (!S.isCopyableElement(
I))
13557 FMF &= FPCI->getFastMathFlags();
13558 FMulPlusFAddCost +=
TTI.getInstructionCost(
I,
CostKind);
13561 for (
auto [V,
Op] :
zip(VL, Operands.
front())) {
13562 if (S.isCopyableElement(V))
13565 if (!
I || !
I->hasOneUse() || OpS.isCopyableElement(
I)) {
13567 FMACost +=
TTI.getInstructionCost(OpI,
CostKind);
13574 FMF &= FPCI->getFastMathFlags();
13575 FMulPlusFAddCost +=
TTI.getInstructionCost(
I,
CostKind);
13583bool BoUpSLP::matchesShlZExt(
const TreeEntry &TE,
OrdersType &Order,
13584 bool &IsBSwap,
bool &ForLoads)
const {
13585 assert(
TE.hasState() &&
TE.getOpcode() == Instruction::Shl &&
13586 "Expected Shl node.");
13589 if (
TE.State != TreeEntry::Vectorize || !
TE.ReorderIndices.empty() ||
13590 !
TE.ReuseShuffleIndices.empty() || MinBWs.contains(&TE) ||
13591 any_of(
TE.Scalars, [](
Value *V) { return !V->hasOneUse(); }))
13593 Type *ScalarTy =
TE.getMainOp()->getType();
13599 const unsigned Sz = DL->getTypeSizeInBits(ScalarTy);
13600 const TreeEntry *LhsTE = getOperandEntry(&TE, 0);
13601 const TreeEntry *RhsTE = getOperandEntry(&TE, 1);
13603 if (!(LhsTE->State == TreeEntry::Vectorize &&
13604 LhsTE->getOpcode() == Instruction::ZExt &&
13605 LhsTE->ReorderIndices.empty() && LhsTE->ReuseShuffleIndices.empty() &&
13606 !MinBWs.contains(LhsTE) &&
13607 all_of(LhsTE->Scalars, [](
Value *V) { return V->hasOneUse(); })))
13610 unsigned Stride = DL->getTypeSizeInBits(SrcScalarTy);
13611 if (!
isPowerOf2_64(Stride) || Stride >= Sz || Sz % Stride != 0 ||
13614 if (!(RhsTE->isGather() && RhsTE->ReorderIndices.empty() &&
13615 RhsTE->ReuseShuffleIndices.empty() && !MinBWs.contains(RhsTE)))
13618 unsigned CurrentValue = 0;
13620 if (
all_of(RhsTE->Scalars,
13622 CurrentValue += Stride;
13623 if (isa<UndefValue>(V))
13625 auto *C = dyn_cast<Constant>(V);
13628 return C->getUniqueInteger() == CurrentValue - Stride;
13630 CurrentValue <= Sz) {
13633 const unsigned VF = RhsTE->getVectorFactor();
13634 Order.assign(VF, VF);
13636 SmallBitVector SeenPositions(VF);
13639 if (VF * Stride > Sz)
13641 for (
const auto [Idx, V] :
enumerate(RhsTE->Scalars)) {
13647 const APInt &Val =
C->getUniqueInteger();
13652 if (Order[Idx] != VF || Pos >= VF)
13654 if (SeenPositions.test(Pos))
13656 SeenPositions.set(Pos);
13664 auto *SrcType = IntegerType::getIntNTy(ScalarTy->
getContext(),
13665 Stride * LhsTE->getVectorFactor());
13667 SmallPtrSet<Value *, 4> CheckedExtracts;
13669 auto *SrcVecTy =
getWidenedType(SrcScalarTy, LhsTE->getVectorFactor());
13671 getCastContextHint(*getOperandEntry(LhsTE, 0));
13673 TTI->getArithmeticReductionCost(Instruction::Or, VecTy, FMF,
CostKind) +
13674 TTI->getArithmeticInstrCost(Instruction::Shl, VecTy,
CostKind,
13675 getOperandInfo(LhsTE->Scalars)) +
13676 TTI->getCastInstrCost(
13677 Instruction::ZExt, VecTy,
13681 Instruction::BitCast, SrcType, SrcVecTy, CastCtx,
CostKind);
13682 if (!Order.empty()) {
13684 SmallVector<int>
Mask;
13690 constexpr unsigned ByteSize = 8;
13692 DL->getTypeSizeInBits(SrcScalarTy) == ByteSize) {
13693 IntrinsicCostAttributes CostAttrs(Intrinsic::bswap, SrcType, {SrcType});
13695 TTI->getCastInstrCost(Instruction::BitCast, SrcType, SrcVecTy, CastCtx,
13697 TTI->getIntrinsicInstrCost(CostAttrs,
CostKind);
13698 if (BSwapCost <= BitcastCost) {
13699 BitcastCost = BSwapCost;
13703 const TreeEntry *SrcTE = getOperandEntry(LhsTE, 0);
13704 if (SrcTE->State == TreeEntry::Vectorize &&
13705 SrcTE->ReorderIndices.empty() && SrcTE->ReuseShuffleIndices.empty() &&
13706 SrcTE->getOpcode() == Instruction::Load && !SrcTE->isAltShuffle() &&
13707 all_of(SrcTE->Scalars, [](
Value *V) { return V->hasOneUse(); })) {
13709 IntrinsicCostAttributes CostAttrs(Intrinsic::bswap, SrcType, {SrcType});
13711 TTI->getMemoryOpCost(Instruction::Load, SrcType, LI->getAlign(),
13712 LI->getPointerAddressSpace(),
CostKind) +
13713 TTI->getIntrinsicInstrCost(CostAttrs,
CostKind);
13714 if (BSwapCost <= BitcastCost) {
13716 TTI->getMemoryOpCost(Instruction::Load, SrcVecTy, LI->getAlign(),
13717 LI->getPointerAddressSpace(),
CostKind);
13718 BitcastCost = BSwapCost;
13723 }
else if (Order.empty() && DL->getTypeSizeInBits(SrcScalarTy) == ByteSize) {
13725 const TreeEntry *SrcTE = getOperandEntry(LhsTE, 0);
13726 if (SrcTE->State == TreeEntry::Vectorize && SrcTE->ReorderIndices.empty() &&
13727 SrcTE->ReuseShuffleIndices.empty() &&
13728 SrcTE->getOpcode() == Instruction::Load && !SrcTE->isAltShuffle() &&
13729 all_of(SrcTE->Scalars, [](
Value *V) { return V->hasOneUse(); })) {
13732 TTI->getMemoryOpCost(Instruction::Load, SrcType, LI->getAlign(),
13733 LI->getPointerAddressSpace(),
CostKind);
13735 TTI->getMemoryOpCost(Instruction::Load, SrcVecTy, LI->getAlign(),
13736 LI->getPointerAddressSpace(),
CostKind);
13740 if (SrcType != ScalarTy) {
13741 BitcastCost += TTI->getCastInstrCost(Instruction::ZExt, ScalarTy, SrcType,
13744 return BitcastCost < VecCost;
13747bool BoUpSLP::matchesInversedZExtSelect(
13748 const TreeEntry &SelectTE,
13749 SmallVectorImpl<unsigned> &InversedCmpsIndices)
const {
13750 assert(SelectTE.hasState() && SelectTE.getOpcode() == Instruction::Select &&
13751 "Expected select node.");
13753 for (
auto [Idx, V] :
enumerate(SelectTE.Scalars)) {
13755 if (!Inst || Inst->getOpcode() != Instruction::ZExt)
13761 const auto *CmpTE = getOperandEntry(&SelectTE, 0);
13762 const auto *Op1TE = getOperandEntry(&SelectTE, 1);
13763 const auto *Op2TE = getOperandEntry(&SelectTE, 2);
13767 if (CmpTE->State != TreeEntry::Vectorize || !CmpTE->isAltShuffle() ||
13768 (CmpTE->getOpcode() != Instruction::ICmp &&
13769 CmpTE->getOpcode() != Instruction::FCmp) ||
13770 !CmpTE->ReorderIndices.empty() || !CmpTE->ReuseShuffleIndices.empty() ||
13771 !Op1TE->ReorderIndices.empty() || !Op1TE->ReuseShuffleIndices.empty() ||
13772 !Op2TE->ReorderIndices.empty() || !Op2TE->ReuseShuffleIndices.empty())
13775 if (!Op1TE->isGather() || !Op2TE->isGather())
13778 auto *
Cmp = CmpTE->getMainOp();
13781 if (!
match(Cmp, MatchCmp))
13783 CmpPredicate MainPred = Pred;
13786 for (
const auto [Idx, V] :
enumerate(CmpTE->Scalars)) {
13787 if (!
match(V, MatchCmp))
13793 if (!
V->hasOneUse())
13798 if (InversedCmpsIndices.
empty())
13806 TTI->getCmpSelInstrCost(CmpTE->getOpcode(), VecTy, CmpTy, MainPred,
13807 CostKind, getOperandInfo(CmpTE->getOperand(0)),
13808 getOperandInfo(CmpTE->getOperand(1)));
13813 for (
Value *V : CmpTE->Scalars) {
13817 BVCost += TTI->getInstructionCost(
I,
CostKind);
13819 return VecCost < BVCost;
13822bool BoUpSLP::matchesSelectOfBits(
const TreeEntry &SelectTE)
const {
13823 assert(SelectTE.hasState() && SelectTE.getOpcode() == Instruction::Select &&
13824 "Expected select node.");
13825 if (DL->isBigEndian())
13827 if (!SelectTE.ReorderIndices.empty() || !SelectTE.ReuseShuffleIndices.empty())
13829 if (!UserIgnoreList)
13831 if (
any_of(SelectTE.Scalars, [](
Value *V) { return !V->hasOneUse(); }))
13834 if (
any_of(*UserIgnoreList,
13837 const TreeEntry *Op1TE = getOperandEntry(&SelectTE, 1);
13838 const TreeEntry *Op2TE = getOperandEntry(&SelectTE, 2);
13839 if (!Op1TE->isGather() || !Op2TE->isGather())
13842 if (!Op1TE->ReorderIndices.empty() || !Op1TE->ReuseShuffleIndices.empty() ||
13843 !Op2TE->ReuseShuffleIndices.empty())
13845 Type *ScalarTy = Op1TE->Scalars.front()->getType();
13849 if (
any_of(Op2TE->Scalars, [](
Value *V) { return !match(V, m_ZeroInt()); }))
13854 return !(match(P.value(), m_ConstantInt(V)) && isPowerOf2_64(V) &&
13855 Log2_64(V) == P.index());
13859 auto *DstTy = IntegerType::getIntNTy(ScalarTy->
getContext(),
13860 SelectTE.getVectorFactor());
13864 auto It = MinBWs.find(&SelectTE);
13865 if (It != MinBWs.end()) {
13866 auto *EffectiveScalarTy =
13868 VecTy =
getWidenedType(EffectiveScalarTy, SelectTE.getVectorFactor());
13873 if (DstTy != ScalarTy) {
13874 BitcastCost += TTI->getCastInstrCost(Instruction::ZExt, ScalarTy, DstTy,
13879 TTI->getCmpSelInstrCost(Instruction::Select, VecTy, CmpTy,
13881 getOperandInfo(Op1TE->Scalars),
13882 getOperandInfo(Op2TE->Scalars)) +
13883 TTI->getArithmeticReductionCost(Instruction::Or, VecTy, FMF,
CostKind);
13884 return BitcastCost <= SelectCost;
13889 BaseGraphSize = VectorizableTree.size();
13891 class GraphTransformModeRAAI {
13892 bool &SavedIsGraphTransformMode;
13895 GraphTransformModeRAAI(
bool &IsGraphTransformMode)
13896 : SavedIsGraphTransformMode(IsGraphTransformMode) {
13897 IsGraphTransformMode =
true;
13899 ~GraphTransformModeRAAI() { SavedIsGraphTransformMode =
false; }
13900 } TransformContext(IsGraphTransformMode);
13909 const InstructionsState &S) {
13913 I2->getOperand(
Op));
13914 return all_of(Candidates, [
this](
13915 ArrayRef<std::pair<Value *, Value *>> Cand) {
13917 [](
const std::pair<Value *, Value *> &
P) {
13927 TreeEntry &E = *VectorizableTree[Idx];
13929 reorderGatherNode(E);
13934 constexpr unsigned VFLimit = 16;
13935 bool ForceLoadGather =
13936 count_if(VectorizableTree, [&](
const std::unique_ptr<TreeEntry> &TE) {
13937 return TE->isGather() && TE->hasState() &&
13938 TE->getOpcode() == Instruction::Load &&
13939 TE->getVectorFactor() < VFLimit;
13945 return TE->isSame(VL) ||
all_of(VL, [&](
Value *V) {
13954 auto CheckForSameVectorNodes = [&](
const TreeEntry &E) {
13955 if (E.hasState()) {
13957 !TEs.
empty() &&
any_of(TEs, [&](
const TreeEntry *TE) {
13958 return AreReusedScalars(TE, E.Scalars, [&](
Value *V) {
13959 ArrayRef<TreeEntry *> VTEs = getTreeEntries(V);
13960 return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
13961 return is_contained(TEs, TE);
13968 !TEs.
empty() &&
any_of(TEs, [&](
const TreeEntry *TE) {
13969 return AreReusedScalars(TE, E.Scalars, [&](
Value *V) {
13970 ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V);
13971 return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
13972 return is_contained(TEs, TE);
13980 if (It != E.Scalars.end()) {
13982 !TEs.empty() &&
any_of(TEs, [&](
const TreeEntry *TE) {
13983 return AreReusedScalars(TE, E.Scalars, [&](
Value *V) {
13984 ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V);
13985 return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
13986 return is_contained(TEs, TE);
13996 for (
unsigned Idx :
seq<unsigned>(BaseGraphSize)) {
13997 TreeEntry &
E = *VectorizableTree[Idx];
13998 if (
E.isGather()) {
14001 unsigned MinVF =
getMinVF(2 * Sz);
14004 if (VL.
size() <= 2 || LoadEntriesToVectorize.contains(Idx) ||
14005 !(!
E.hasState() ||
E.getOpcode() == Instruction::Load ||
14011 if (ForceLoadGather &&
E.hasState() &&
E.getOpcode() == Instruction::Load)
14014 if (CheckForSameVectorNodes(
E))
14018 unsigned StartIdx = 0;
14019 unsigned End = VL.
size();
14020 SmallBitVector Processed(End);
14022 *TTI, VL.
front()->getType(), VL.
size() - 1);
14024 *TTI, VL.
front()->getType(), VF - 1)) {
14025 if (StartIdx + VF > End)
14028 bool AllStrided =
true;
14029 for (
unsigned Cnt = StartIdx; Cnt + VF <= End; Cnt += VF) {
14034 !getSameValuesTreeEntry(Slice.
front(), Slice,
true))
14041 bool IsSplat =
isSplat(Slice);
14042 bool IsTwoRegisterSplat =
true;
14043 if (IsSplat && VF == 2) {
14046 IsTwoRegisterSplat = NumRegs2VF == 2;
14048 if (Slices.
empty() || !IsSplat || !IsTwoRegisterSplat ||
14056 (S.getOpcode() == Instruction::Load &&
14058 (S.getOpcode() != Instruction::Load &&
14064 if ((!UserIgnoreList ||
E.Idx != 0) &&
14065 TTI->getInstructionCost(S.getMainOp(),
CostKind) <
14074 if (S.getOpcode() == Instruction::Load) {
14077 StridedPtrInfo SPtrInfo;
14079 PointerOps, SPtrInfo);
14090 if (UserIgnoreList &&
E.Idx == 0)
14095 }
else if (S.getOpcode() == Instruction::ExtractElement ||
14096 (TTI->getInstructionCost(S.getMainOp(),
CostKind) <
14098 !CheckOperandsProfitability(
14115 if (VF == 2 && AllStrided && Slices.
size() > 2)
14117 auto AddCombinedNode = [&](
unsigned Idx,
unsigned Cnt,
unsigned Sz) {
14118 E.CombinedEntriesWithIndices.emplace_back(Idx, Cnt);
14119 Processed.set(Cnt, Cnt + Sz);
14120 if (StartIdx == Cnt)
14121 StartIdx = Cnt + Sz;
14122 if (End == Cnt + Sz)
14125 for (
auto [Cnt, Sz] : Slices) {
14127 const TreeEntry *SameTE =
nullptr;
14129 It != Slice.
end()) {
14131 SameTE = getSameValuesTreeEntry(*It, Slice);
14133 unsigned PrevSize = VectorizableTree.size();
14134 [[maybe_unused]]
unsigned PrevEntriesSize =
14135 LoadEntriesToVectorize.size();
14136 buildTreeRec(Slice, 0,
EdgeInfo(&
E, UINT_MAX));
14137 if (PrevSize + 1 == VectorizableTree.size() && !SameTE &&
14138 VectorizableTree[PrevSize]->isGather() &&
14139 VectorizableTree[PrevSize]->hasState() &&
14140 VectorizableTree[PrevSize]->getOpcode() !=
14141 Instruction::ExtractElement &&
14143 if (UserIgnoreList &&
E.Idx == 0 && VF == 2)
14145 VectorizableTree.pop_back();
14146 assert(PrevEntriesSize == LoadEntriesToVectorize.size() &&
14147 "LoadEntriesToVectorize expected to remain the same");
14150 AddCombinedNode(PrevSize, Cnt, Sz);
14154 if (
E.CombinedEntriesWithIndices.empty() && !
E.ReorderIndices.empty()) {
14155 SmallVector<int>
Mask(
E.ReorderIndices.begin(),
E.ReorderIndices.end());
14157 E.ReorderIndices.clear();
14162 switch (
E.getOpcode()) {
14163 case Instruction::Load: {
14166 if (
E.State != TreeEntry::Vectorize)
14168 Type *ScalarTy =
E.getMainOp()->getType();
14174 TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
14175 SmallVector<int>
Mask;
14179 TTI->getMemoryOpCost(Instruction::Load, VecTy, BaseLI->getAlign(),
14180 BaseLI->getPointerAddressSpace(),
CostKind,
14184 MemIntrinsicCostAttributes(Intrinsic::experimental_vp_strided_load,
14185 VecTy, BaseLI->getPointerOperand(),
14186 false, CommonAlignment,
14193 ->getPointerOperand()
14195 StridedPtrInfo SPtrInfo;
14196 SPtrInfo.StrideVal = ConstantInt::get(StrideTy, 1);
14197 SPtrInfo.Ty = VecTy;
14198 TreeEntryToStridedPtrInfoMap[&
E] = SPtrInfo;
14199 E.State = TreeEntry::StridedVectorize;
14204 case Instruction::Store: {
14212 TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
14213 SmallVector<int>
Mask;
14217 TTI->getMemoryOpCost(Instruction::Store, VecTy, BaseSI->getAlign(),
14218 BaseSI->getPointerAddressSpace(),
CostKind,
14222 MemIntrinsicCostAttributes(Intrinsic::experimental_vp_strided_store,
14223 VecTy, BaseSI->getPointerOperand(),
14224 false, CommonAlignment,
14227 if (StridedCost < OriginalVecCost)
14230 E.State = TreeEntry::StridedVectorize;
14231 }
else if (!
E.ReorderIndices.empty()) {
14233 auto IsInterleaveMask = [&, &TTI = *TTI](ArrayRef<int>
Mask) {
14235 assert(
Mask.size() > 1 &&
"Expected mask greater than 1 element.");
14236 if (
Mask.size() < 4)
14240 Mask, Factor, VecTy->getElementCount().getFixedValue()) &&
14241 TTI.isLegalInterleavedAccessType(
14242 VecTy, Factor, BaseSI->getAlign(),
14243 BaseSI->getPointerAddressSpace()))
14249 SmallVector<int>
Mask(
E.ReorderIndices.begin(),
E.ReorderIndices.end());
14250 unsigned InterleaveFactor = IsInterleaveMask(Mask);
14251 if (InterleaveFactor != 0)
14252 E.setInterleave(InterleaveFactor);
14256 case Instruction::Select: {
14257 if (
E.State != TreeEntry::Vectorize)
14262 E.CombinedOp = TreeEntry::MinMax;
14263 TreeEntry *CondEntry = getOperandEntry(&
E, 0);
14264 if (SelectOnly && CondEntry->UserTreeIndex &&
14265 CondEntry->State == TreeEntry::Vectorize) {
14267 CondEntry->State = TreeEntry::CombinedVectorize;
14272 SmallVector<unsigned> InversedCmpsIndices;
14273 if (matchesInversedZExtSelect(
E, InversedCmpsIndices)) {
14274 auto *CmpTE = getOperandEntry(&
E, 0);
14275 auto *Op1TE = getOperandEntry(&
E, 1);
14276 auto *Op2TE = getOperandEntry(&
E, 2);
14278 CmpTE->setOperations(
14279 InstructionsState(CmpTE->getMainOp(), CmpTE->getMainOp()));
14282 auto UpdateGatherEntry = [&](TreeEntry *OldTE, TreeEntry *NewTE,
14286 auto It = ValueToGatherNodes.find(V);
14287 assert(It != ValueToGatherNodes.end() &&
14288 "Expected to find the value in the map.");
14289 auto &
C = It->getSecond();
14296 for (
const unsigned Idx : InversedCmpsIndices) {
14297 Value *V1 = Op1TE->Scalars[Idx];
14298 Value *V2 = Op2TE->Scalars[Idx];
14299 std::swap(Op1TE->Scalars[Idx], Op2TE->Scalars[Idx]);
14301 UpdateGatherEntry(Op1TE, Op2TE, V1);
14302 UpdateGatherEntry(Op2TE, Op1TE, V2);
14304 OperandsToTreeEntry.emplace_or_assign(std::make_pair(&
E, 1), Op1TE);
14305 OperandsToTreeEntry.emplace_or_assign(std::make_pair(&
E, 2), Op2TE);
14308 if (matchesSelectOfBits(
E)) {
14310 const TreeEntry::CombinedOpcode
Code = TreeEntry::ReducedCmpBitcast;
14311 E.CombinedOp =
Code;
14312 auto *Op1TE = getOperandEntry(&
E, 1);
14313 auto *Op2TE = getOperandEntry(&
E, 2);
14314 Op1TE->State = TreeEntry::CombinedVectorize;
14315 Op1TE->CombinedOp =
Code;
14316 Op2TE->State = TreeEntry::CombinedVectorize;
14317 Op2TE->CombinedOp =
Code;
14322 case Instruction::FSub:
14323 case Instruction::FAdd: {
14325 if (
E.State != TreeEntry::Vectorize ||
14326 !
E.getOperations().isAddSubLikeOp())
14332 E.CombinedOp = TreeEntry::FMulAdd;
14333 TreeEntry *FMulEntry = getOperandEntry(&
E, 0);
14334 if (FMulEntry->UserTreeIndex &&
14335 FMulEntry->State == TreeEntry::Vectorize) {
14337 FMulEntry->State = TreeEntry::CombinedVectorize;
14341 case Instruction::Shl: {
14342 if (
E.Idx != 0 || DL->isBigEndian())
14344 if (!UserIgnoreList)
14354 if (!matchesShlZExt(
E, Order, IsBSwap, ForLoads))
14357 TreeEntry::CombinedOpcode
Code =
14358 IsBSwap ? (ForLoads ? TreeEntry::ReducedBitcastBSwapLoads
14359 : TreeEntry::ReducedBitcastBSwap)
14360 : (ForLoads ? TreeEntry::ReducedBitcastLoads
14361 : TreeEntry::ReducedBitcast);
14362 E.CombinedOp =
Code;
14363 E.ReorderIndices = std::move(Order);
14364 TreeEntry *ZExtEntry = getOperandEntry(&
E, 0);
14365 assert(ZExtEntry->UserTreeIndex &&
14366 ZExtEntry->State == TreeEntry::Vectorize &&
14367 ZExtEntry->getOpcode() == Instruction::ZExt &&
14368 "Expected ZExt node.");
14370 ZExtEntry->State = TreeEntry::CombinedVectorize;
14371 ZExtEntry->CombinedOp =
Code;
14373 TreeEntry *LoadsEntry = getOperandEntry(ZExtEntry, 0);
14374 assert(LoadsEntry->UserTreeIndex &&
14375 LoadsEntry->State == TreeEntry::Vectorize &&
14376 LoadsEntry->getOpcode() == Instruction::Load &&
14377 "Expected Load node.");
14379 LoadsEntry->State = TreeEntry::CombinedVectorize;
14380 LoadsEntry->CombinedOp =
Code;
14382 TreeEntry *ConstEntry = getOperandEntry(&
E, 1);
14383 assert(ConstEntry->UserTreeIndex && ConstEntry->isGather() &&
14384 "Expected ZExt node.");
14386 ConstEntry->State = TreeEntry::CombinedVectorize;
14387 ConstEntry->CombinedOp =
Code;
14395 if (LoadEntriesToVectorize.empty()) {
14397 if (VectorizableTree.size() <= 1 && VectorizableTree.front()->hasState() &&
14398 VectorizableTree.front()->getOpcode() == Instruction::Load)
14401 constexpr unsigned SmallTree = 3;
14402 constexpr unsigned SmallVF = 2;
14403 if ((VectorizableTree.size() <= SmallTree &&
14404 VectorizableTree.front()->Scalars.size() == SmallVF) ||
14405 (VectorizableTree.size() <= 2 && UserIgnoreList))
14408 if (VectorizableTree.front()->isNonPowOf2Vec() &&
14412 [](
const std::unique_ptr<TreeEntry> &TE) {
14413 return TE->isGather() &&
TE->hasState() &&
14414 TE->getOpcode() == Instruction::Load &&
14422 SmallMapVector<std::tuple<BasicBlock *, Value *, Type *>,
14426 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
14427 TreeEntry &
E = *
TE;
14428 if (
E.isGather() &&
14429 ((
E.hasState() &&
E.getOpcode() == Instruction::Load) ||
14430 (!
E.hasState() &&
any_of(
E.Scalars,
14432 return isa<LoadInst>(V) &&
14433 !isVectorized(V) &&
14434 !isDeleted(cast<Instruction>(V));
14437 for (
Value *V :
E.Scalars) {
14444 *
this, V, *DL, *SE, *TTI,
14445 GatheredLoads[std::make_tuple(
14453 if (!GatheredLoads.
empty())
14454 tryToVectorizeGatheredLoads(GatheredLoads);
14464 bool IsFinalized =
false;
14477 bool SameNodesEstimated =
true;
14480 if (Ty->getScalarType()->isPointerTy()) {
14484 DL.getTypeStoreSizeInBits(Ty->getScalarType()))),
14485 Ty->getScalarType());
14503 assert(It != VL.
end() &&
"Expected at least one non-undef value.");
14506 count(VL, *It) > 1 &&
14508 if (!NeedShuffle) {
14511 return TTI.getShuffleCost(
14516 return TTI.getVectorInstrCost(Instruction::InsertElement, VecTy,
14517 CostKind, std::distance(VL.
begin(), It),
14523 return isa<PoisonValue>(V) ? PoisonMaskElem : 0;
14526 TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, 0,
14530 VecTy, ShuffleMask, CostKind,
14534 return GatherCost +
14537 : R.getGatherCost(Gathers, !Root && VL.
equals(Gathers),
14545 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
14546 unsigned NumParts) {
14547 assert(VL.
size() > NumParts &&
"Unexpected scalarized shuffle.");
14549 std::accumulate(VL.
begin(), VL.
end(), 0, [](
unsigned Sz,
Value *V) {
14550 auto *EE = dyn_cast<ExtractElementInst>(V);
14553 auto *VecTy = dyn_cast<FixedVectorType>(EE->getVectorOperandType());
14556 return std::max(Sz, VecTy->getNumElements());
14563 -> std::optional<TTI::ShuffleKind> {
14564 if (NumElts <= EltsPerVector)
14565 return std::nullopt;
14567 alignDown(std::accumulate(Mask.begin(), Mask.end(), INT_MAX,
14569 if (I == PoisonMaskElem)
14571 return std::min(S, I);
14574 int OffsetReg1 = OffsetReg0;
14578 int FirstRegId = -1;
14579 Indices.assign(1, OffsetReg0);
14583 int Idx =
I - OffsetReg0;
14585 (Idx / NumElts) * NumParts + (Idx % NumElts) / EltsPerVector;
14586 if (FirstRegId < 0)
14587 FirstRegId = RegId;
14588 RegIndices.
insert(RegId);
14589 if (RegIndices.
size() > 2)
14590 return std::nullopt;
14591 if (RegIndices.
size() == 2) {
14593 if (Indices.
size() == 1) {
14596 std::next(Mask.begin(), Pos), Mask.end(), INT_MAX,
14597 [&](
int S,
int I) {
14598 if (I == PoisonMaskElem)
14600 int RegId = ((I - OffsetReg0) / NumElts) * NumParts +
14601 ((I - OffsetReg0) % NumElts) / EltsPerVector;
14602 if (RegId == FirstRegId)
14604 return std::min(S, I);
14607 unsigned Index = OffsetReg1 % NumElts;
14608 Indices.push_back(Index);
14609 SubVecSizes.push_back(std::min(NumElts - Index, EltsPerVector));
14611 Idx =
I - OffsetReg1;
14613 I = (Idx % NumElts) % EltsPerVector +
14614 (RegId == FirstRegId ? 0 : EltsPerVector);
14616 return ShuffleKind;
14624 if (!ShuffleKinds[Part])
14627 Part * EltsPerVector,
getNumElems(Mask.size(), EltsPerVector, Part));
14632 std::optional<TTI::ShuffleKind> RegShuffleKind =
14633 CheckPerRegistersShuffle(SubMask, Indices, SubVecSizes);
14634 if (!RegShuffleKind) {
14637 MaskSlice, std::max<unsigned>(NumElts, MaskSlice.
size())))
14650 *R.TTI, VL.
front()->getType(),
alignTo(NumElts, EltsPerVector));
14651 for (
const auto [Idx, SubVecSize] :
zip(Indices, SubVecSizes)) {
14652 assert((Idx + SubVecSize) <= BaseVF &&
14653 "SK_ExtractSubvector index out of range");
14663 TTI, *ShuffleKinds[Part],
getWidenedType(ScalarTy, NumElts), SubMask);
14664 if (OriginalCost < Cost)
14665 Cost = OriginalCost;
14672 void estimateNodesPermuteCost(
const TreeEntry &E1,
const TreeEntry *E2,
14674 unsigned SliceSize) {
14675 if (SameNodesEstimated) {
14681 if ((InVectors.size() == 2 &&
14685 unsigned Limit =
getNumElems(Mask.size(), SliceSize, Part);
14688 "Expected all poisoned elements.");
14690 copy(SubMask, std::next(CommonMask.begin(), SliceSize * Part));
14695 Cost += createShuffle(InVectors.front(),
14696 InVectors.size() == 1 ?
nullptr : InVectors.back(),
14698 transformMaskAfterShuffle(CommonMask, CommonMask);
14699 }
else if (InVectors.size() == 2) {
14700 Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
14701 transformMaskAfterShuffle(CommonMask, CommonMask);
14703 SameNodesEstimated =
false;
14704 if (!E2 && InVectors.size() == 1) {
14705 unsigned VF = E1.getVectorFactor();
14707 VF = std::max(VF, getVF(V1));
14710 VF = std::max(VF, E->getVectorFactor());
14712 for (
unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
14714 CommonMask[Idx] = Mask[Idx] + VF;
14715 Cost += createShuffle(InVectors.front(), &E1, CommonMask);
14716 transformMaskAfterShuffle(CommonMask, CommonMask);
14718 auto P = InVectors.front();
14719 Cost += createShuffle(&E1, E2, Mask);
14720 unsigned VF = Mask.size();
14726 VF = std::max(VF, E->getVectorFactor());
14728 for (
unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
14730 CommonMask[Idx] = Idx + (InVectors.empty() ? 0 : VF);
14731 Cost += createShuffle(
P, InVectors.front(), CommonMask);
14732 transformMaskAfterShuffle(CommonMask, CommonMask);
14736 class ShuffleCostBuilder {
14739 static bool isEmptyOrIdentity(
ArrayRef<int> Mask,
unsigned VF) {
14741 return Mask.empty() ||
14742 (VF == Mask.size() &&
14750 ~ShuffleCostBuilder() =
default;
14756 if (isEmptyOrIdentity(Mask, VF))
14765 if (isEmptyOrIdentity(Mask, VF))
14774 void resizeToMatch(
Value *&,
Value *&)
const {}
14784 ShuffleCostBuilder Builder(TTI);
14787 unsigned CommonVF = Mask.size();
14789 auto GetNodeMinBWAffectedCost = [&](
const TreeEntry &E,
14793 Type *EScalarTy = E.Scalars.front()->getType();
14794 bool IsSigned =
true;
14795 if (
auto It = R.MinBWs.find(&E); It != R.MinBWs.end()) {
14797 IsSigned = It->second.second;
14799 if (EScalarTy != ScalarTy) {
14800 unsigned CastOpcode = Instruction::Trunc;
14801 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
14802 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
14804 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
14805 return TTI.getCastInstrCost(CastOpcode,
getWidenedType(ScalarTy, VF),
14815 Type *EScalarTy = VecTy->getElementType();
14816 if (EScalarTy != ScalarTy) {
14818 unsigned CastOpcode = Instruction::Trunc;
14819 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
14820 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
14822 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
14823 return TTI.getCastInstrCost(
14829 if (!V1 && !V2 && !P2.isNull()) {
14832 unsigned VF = E->getVectorFactor();
14834 CommonVF = std::max(VF, E2->getVectorFactor());
14837 return Idx < 2 * static_cast<int>(CommonVF);
14839 "All elements in mask must be less than 2 * CommonVF.");
14840 if (E->Scalars.size() == E2->Scalars.size()) {
14844 for (
int &Idx : CommonMask) {
14847 if (Idx <
static_cast<int>(CommonVF) && !EMask.
empty())
14849 else if (Idx >=
static_cast<int>(CommonVF))
14850 Idx = (E2Mask.
empty() ? Idx - CommonVF : E2Mask[Idx - CommonVF]) +
14854 CommonVF = E->Scalars.size();
14855 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF) +
14856 GetNodeMinBWAffectedCost(*E2, CommonVF);
14858 ExtraCost += GetNodeMinBWAffectedCost(*E, E->getVectorFactor()) +
14859 GetNodeMinBWAffectedCost(*E2, E2->getVectorFactor());
14862 V2 = getAllOnesValue(*R.DL,
getWidenedType(ScalarTy, CommonVF));
14863 }
else if (!V1 && P2.isNull()) {
14866 unsigned VF = E->getVectorFactor();
14870 [=](
int Idx) {
return Idx < static_cast<int>(CommonVF); }) &&
14871 "All elements in mask must be less than CommonVF.");
14872 if (E->Scalars.size() == Mask.size() && VF != Mask.size()) {
14874 assert(!EMask.
empty() &&
"Expected non-empty common mask.");
14875 for (
int &Idx : CommonMask) {
14879 CommonVF = E->Scalars.size();
14880 }
else if (
unsigned Factor = E->getInterleaveFactor();
14881 Factor > 0 && E->Scalars.size() != Mask.size() &&
14885 std::iota(CommonMask.begin(), CommonMask.end(), 0);
14887 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF);
14890 if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() &&
14891 CommonVF == CommonMask.size() &&
14893 [](
const auto &&
P) {
14895 static_cast<unsigned>(
P.value()) !=
P.index();
14903 }
else if (V1 && P2.isNull()) {
14905 ExtraCost += GetValueMinBWAffectedCost(V1);
14906 CommonVF = getVF(V1);
14909 [=](
int Idx) {
return Idx < static_cast<int>(CommonVF); }) &&
14910 "All elements in mask must be less than CommonVF.");
14911 }
else if (V1 && !V2) {
14913 unsigned VF = getVF(V1);
14915 CommonVF = std::max(VF, E2->getVectorFactor());
14918 return Idx < 2 * static_cast<int>(CommonVF);
14920 "All elements in mask must be less than 2 * CommonVF.");
14921 if (E2->Scalars.size() == VF && VF != CommonVF) {
14923 assert(!E2Mask.
empty() &&
"Expected non-empty common mask.");
14924 for (
int &Idx : CommonMask) {
14927 if (Idx >=
static_cast<int>(CommonVF))
14928 Idx = E2Mask[Idx - CommonVF] + VF;
14932 ExtraCost += GetValueMinBWAffectedCost(V1);
14934 ExtraCost += GetNodeMinBWAffectedCost(
14935 *E2, std::min(CommonVF, E2->getVectorFactor()));
14936 V2 = getAllOnesValue(*R.DL,
getWidenedType(ScalarTy, CommonVF));
14937 }
else if (!V1 && V2) {
14939 unsigned VF = getVF(V2);
14941 CommonVF = std::max(VF, E1->getVectorFactor());
14944 return Idx < 2 * static_cast<int>(CommonVF);
14946 "All elements in mask must be less than 2 * CommonVF.");
14947 if (E1->Scalars.size() == VF && VF != CommonVF) {
14949 assert(!E1Mask.
empty() &&
"Expected non-empty common mask.");
14950 for (
int &Idx : CommonMask) {
14953 if (Idx >=
static_cast<int>(CommonVF))
14954 Idx = E1Mask[Idx - CommonVF] + VF;
14960 ExtraCost += GetNodeMinBWAffectedCost(
14961 *E1, std::min(CommonVF, E1->getVectorFactor()));
14963 ExtraCost += GetValueMinBWAffectedCost(V2);
14964 V2 = getAllOnesValue(*R.DL,
getWidenedType(ScalarTy, CommonVF));
14966 assert(V1 && V2 &&
"Expected both vectors.");
14967 unsigned VF = getVF(V1);
14968 CommonVF = std::max(VF, getVF(V2));
14971 return Idx < 2 * static_cast<int>(CommonVF);
14973 "All elements in mask must be less than 2 * CommonVF.");
14975 GetValueMinBWAffectedCost(V1) + GetValueMinBWAffectedCost(V2);
14978 V2 = getAllOnesValue(*R.DL,
getWidenedType(ScalarTy, CommonVF));
14983 V2 = getAllOnesValue(*R.DL,
getWidenedType(ScalarTy, CommonVF));
14986 InVectors.front() =
14988 if (InVectors.size() == 2)
14989 InVectors.pop_back();
14990 return ExtraCost + BaseShuffleAnalysis::createShuffle<InstructionCost>(
14991 V1, V2, CommonMask, Builder, ScalarTy);
14998 : BaseShuffleAnalysis(ScalarTy), TTI(TTI),
14999 VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()), R(R),
15000 CheckedExtracts(CheckedExtracts) {}
15002 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
15003 unsigned NumParts,
bool &UseVecBaseAsInput) {
15004 UseVecBaseAsInput =
false;
15007 Value *VecBase =
nullptr;
15009 if (!E->ReorderIndices.empty()) {
15011 E->ReorderIndices.end());
15016 bool PrevNodeFound =
any_of(
15017 ArrayRef(R.VectorizableTree).take_front(E->Idx),
15018 [&](
const std::unique_ptr<TreeEntry> &TE) {
15019 return ((TE->hasState() && !TE->isAltShuffle() &&
15020 TE->getOpcode() == Instruction::ExtractElement) ||
15022 all_of(enumerate(TE->Scalars), [&](auto &&Data) {
15023 return VL.size() > Data.index() &&
15024 (Mask[Data.index()] == PoisonMaskElem ||
15025 isa<UndefValue>(VL[Data.index()]) ||
15026 Data.value() == VL[Data.index()]);
15034 ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
15048 VecBase = EE->getVectorOperand();
15049 UniqueBases.
insert(VecBase);
15051 if (!CheckedExtracts.
insert(V).second ||
15054 [&](
const TreeEntry *TE) {
15055 return R.DeletedNodes.contains(TE) ||
15056 R.TransformedToGatherNodes.contains(TE);
15058 (E->UserTreeIndex && E->UserTreeIndex.EdgeIdx == UINT_MAX &&
15059 !R.isVectorized(EE) &&
15061 count_if(E->UserTreeIndex.UserTE->Scalars,
15062 [&](
Value *V) { return V == EE; })) ||
15065 return isa<GetElementPtrInst>(U) &&
15066 !R.areAllUsersVectorized(cast<Instruction>(U),
15074 unsigned Idx = *EEIdx;
15076 if (EE->hasOneUse() || !PrevNodeFound) {
15082 Cost -=
TTI.getExtractWithExtendCost(
15086 Cost +=
TTI.getCastInstrCost(
15092 APInt &DemandedElts =
15093 VectorOpsToExtracts
15096 .first->getSecond();
15097 DemandedElts.
setBit(Idx);
15100 for (
const auto &[Vec, DemandedElts] : VectorOpsToExtracts)
15102 DemandedElts,
false,
15110 if (!PrevNodeFound)
15111 Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts);
15114 transformMaskAfterShuffle(CommonMask, CommonMask);
15115 SameNodesEstimated =
false;
15116 if (NumParts != 1 && UniqueBases.
size() != 1) {
15117 UseVecBaseAsInput =
true;
15125 std::optional<InstructionCost>
15129 return std::nullopt;
15133 IsFinalized =
false;
15134 CommonMask.clear();
15137 VectorizedVals.clear();
15138 SameNodesEstimated =
true;
15144 return Idx < static_cast<int>(E1.getVectorFactor());
15146 "Expected single vector shuffle mask.");
15150 if (InVectors.empty()) {
15151 CommonMask.assign(Mask.begin(), Mask.end());
15152 InVectors.assign({&E1, &E2});
15155 assert(!CommonMask.empty() &&
"Expected non-empty common mask.");
15160 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
15161 estimateNodesPermuteCost(E1, &E2, Mask, Part, SliceSize);
15164 if (InVectors.empty()) {
15165 CommonMask.assign(Mask.begin(), Mask.end());
15166 InVectors.assign(1, &E1);
15169 assert(!CommonMask.empty() &&
"Expected non-empty common mask.");
15174 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
15175 estimateNodesPermuteCost(E1,
nullptr, Mask, Part, SliceSize);
15176 if (!SameNodesEstimated && InVectors.size() == 1)
15177 InVectors.emplace_back(&E1);
15183 assert(InVectors.size() == 1 &&
15190 ->getOrdered(
P.index()));
15191 return EI->getVectorOperand() == V1 ||
15192 EI->getVectorOperand() == V2;
15194 "Expected extractelement vectors.");
15198 if (InVectors.empty()) {
15199 assert(CommonMask.empty() && !ForExtracts &&
15200 "Expected empty input mask/vectors.");
15201 CommonMask.assign(Mask.begin(), Mask.end());
15202 InVectors.assign(1, V1);
15208 !CommonMask.empty() &&
15212 ->getOrdered(
P.index());
15214 return P.value() == Mask[
P.index()] ||
15219 return EI->getVectorOperand() == V1;
15221 "Expected only tree entry for extractelement vectors.");
15224 assert(!InVectors.empty() && !CommonMask.empty() &&
15225 "Expected only tree entries from extracts/reused buildvectors.");
15226 unsigned VF = getVF(V1);
15227 if (InVectors.size() == 2) {
15228 Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
15229 transformMaskAfterShuffle(CommonMask, CommonMask);
15230 VF = std::max<unsigned>(VF, CommonMask.size());
15231 }
else if (
const auto *InTE =
15232 InVectors.front().dyn_cast<
const TreeEntry *>()) {
15233 VF = std::max(VF, InTE->getVectorFactor());
15237 ->getNumElements());
15239 InVectors.push_back(V1);
15240 for (
unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
15242 CommonMask[Idx] = Mask[Idx] + VF;
15245 Value *Root =
nullptr) {
15246 Cost += getBuildVectorCost(VL, Root);
15250 unsigned VF = VL.
size();
15252 VF = std::min(VF, MaskVF);
15253 Type *VLScalarTy = VL.
front()->getType();
15277 getAllOnesValue(*R.DL, ScalarTy->getScalarType()));
15283 ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
15288 IsFinalized =
true;
15291 if (InVectors.
size() == 2)
15292 Cost += createShuffle(Vec, InVectors.
back(), CommonMask);
15294 Cost += createShuffle(Vec,
nullptr, CommonMask);
15295 transformMaskAfterShuffle(CommonMask, CommonMask);
15297 "Expected vector length for the final value before action.");
15300 Cost += createShuffle(V1, V2, Mask);
15303 InVectors.
front() = V;
15305 if (!SubVectors.empty()) {
15307 if (InVectors.
size() == 2)
15308 Cost += createShuffle(Vec, InVectors.
back(), CommonMask);
15310 Cost += createShuffle(Vec,
nullptr, CommonMask);
15311 transformMaskAfterShuffle(CommonMask, CommonMask);
15313 if (!SubVectorsMask.
empty()) {
15315 "Expected same size of masks for subvectors and common mask.");
15317 copy(SubVectorsMask, SVMask.begin());
15318 for (
auto [I1, I2] :
zip(SVMask, CommonMask)) {
15321 I1 = I2 + CommonMask.
size();
15328 for (
auto [
E, Idx] : SubVectors) {
15329 Type *EScalarTy =
E->Scalars.front()->getType();
15330 bool IsSigned =
true;
15331 if (
auto It =
R.MinBWs.find(
E); It !=
R.MinBWs.end()) {
15334 IsSigned = It->second.second;
15336 if (ScalarTy != EScalarTy) {
15337 unsigned CastOpcode = Instruction::Trunc;
15338 unsigned DstSz =
R.DL->getTypeSizeInBits(ScalarTy);
15339 unsigned SrcSz =
R.DL->getTypeSizeInBits(EScalarTy);
15341 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
15342 Cost += TTI.getCastInstrCost(
15351 if (!CommonMask.
empty()) {
15352 std::iota(std::next(CommonMask.
begin(), Idx),
15353 std::next(CommonMask.
begin(), Idx +
E->getVectorFactor()),
15359 if (!ExtMask.
empty()) {
15360 if (CommonMask.
empty()) {
15364 for (
int I = 0, Sz = ExtMask.
size();
I < Sz; ++
I) {
15367 NewMask[
I] = CommonMask[ExtMask[
I]];
15369 CommonMask.
swap(NewMask);
15372 if (CommonMask.
empty()) {
15373 assert(InVectors.
size() == 1 &&
"Expected only one vector with no mask");
15377 createShuffle(InVectors.
front(),
15378 InVectors.
size() == 2 ? InVectors.
back() :
nullptr,
15383 assert((IsFinalized || CommonMask.empty()) &&
15384 "Shuffle construction must be finalized.");
15388const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(
const TreeEntry *E,
15389 unsigned Idx)
const {
15390 TreeEntry *
Op = OperandsToTreeEntry.
at({E, Idx});
15391 assert(
Op->isSame(
E->getOperand(Idx)) &&
"Operands mismatch!");
15396 if (
TE.State == TreeEntry::ScatterVectorize ||
15397 TE.State == TreeEntry::StridedVectorize)
15399 if (
TE.State == TreeEntry::CompressVectorize)
15401 if (
TE.State == TreeEntry::Vectorize &&
TE.getOpcode() == Instruction::Load &&
15402 !
TE.isAltShuffle()) {
15403 if (
TE.ReorderIndices.empty())
15405 SmallVector<int>
Mask;
15424 if (!L->getExitingBlock())
15431unsigned BoUpSLP::getScaleToLoopIterations(
const TreeEntry &TE,
Value *Scalar,
15435 Parent =
U->getParent();
15436 }
else if (
TE.isGather() ||
TE.State == TreeEntry::SplitVectorize) {
15438 while (EI.UserTE) {
15439 if (EI.UserTE->isGather() ||
15440 EI.UserTE->State == TreeEntry::SplitVectorize) {
15441 EI = EI.UserTE->UserTreeIndex;
15444 if (EI.UserTE->State == TreeEntry::Vectorize &&
15445 EI.UserTE->getOpcode() == Instruction::PHI) {
15447 Parent = PH->getIncomingBlock(EI.EdgeIdx);
15449 Parent = EI.UserTE->getMainOp()->
getParent();
15456 Parent =
TE.getMainOp()->getParent();
15458 if (
const Loop *L = LI->getLoopFor(Parent)) {
15459 const auto It = LoopToScaleFactor.find(L);
15460 if (It != LoopToScaleFactor.end())
15462 unsigned Scale = 1;
15466 for (
const Loop *LN : getLoopNest(NonInvL)) {
15469 auto LNRes = LoopToScaleFactor.try_emplace(LN, 0);
15470 auto &LoopScale = LNRes.first->getSecond();
15471 if (!LNRes.second) {
15472 Scale *= LoopScale;
15479 LoopToScaleFactor.try_emplace(L, Scale);
15487 SmallPtrSetImpl<Value *> &CheckedExtracts) {
15492 return InstructionCost::getInvalid();
15497 auto It = MinBWs.find(
E);
15498 Type *OrigScalarTy = ScalarTy;
15499 if (It != MinBWs.end()) {
15505 const TreeEntry *ZExt = getOperandEntry(
E, 0);
15509 unsigned EntryVF =
E->getVectorFactor();
15512 if (
E->isGather() || TransformedToGatherNodes.contains(
E)) {
15516 return InstructionCost::getInvalid();
15518 ScalarTy = VL.
front()->getType();
15519 return processBuildVector<ShuffleCostEstimator, InstructionCost>(
15520 E, ScalarTy, *TTI, VectorizedVals, *
this, CheckedExtracts);
15522 if (
E->State == TreeEntry::SplitVectorize) {
15523 assert(
E->CombinedEntriesWithIndices.size() == 2 &&
15524 "Expected exactly 2 combined entries.");
15525 assert(
E->ReuseShuffleIndices.empty() &&
"Expected empty reuses mask.");
15527 if (
E->ReorderIndices.empty()) {
15530 E->CombinedEntriesWithIndices.back().second,
15533 VectorizableTree[
E->CombinedEntriesWithIndices.back().first]
15534 ->getVectorFactor()));
15536 unsigned CommonVF =
15537 std::max(VectorizableTree[
E->CombinedEntriesWithIndices.front().first]
15538 ->getVectorFactor(),
15539 VectorizableTree[
E->CombinedEntriesWithIndices.back().first]
15540 ->getVectorFactor());
15545 LLVM_DEBUG(dumpTreeCosts(
E, 0, VectorCost, 0,
"Calculated costs for Tree"));
15549 SmallVector<int>
Mask;
15550 if (!
E->ReorderIndices.empty() &&
E->State != TreeEntry::CompressVectorize &&
15551 (
E->State != TreeEntry::StridedVectorize ||
15553 SmallVector<int> NewMask;
15554 if (
E->getOpcode() == Instruction::Store) {
15556 NewMask.
resize(
E->ReorderIndices.size());
15563 if (!
E->ReuseShuffleIndices.empty())
15568 assert((
E->State == TreeEntry::Vectorize ||
15569 E->State == TreeEntry::ScatterVectorize ||
15570 E->State == TreeEntry::StridedVectorize ||
15571 E->State == TreeEntry::CompressVectorize) &&
15572 "Unhandled state");
15575 (
E->getOpcode() == Instruction::GetElementPtr &&
15576 E->getMainOp()->getType()->isPointerTy()) ||
15577 E->hasCopyableElements()) &&
15580 unsigned ShuffleOrOp =
15581 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector :
E->
getOpcode();
15582 if (
E->CombinedOp != TreeEntry::NotCombinedOp)
15583 ShuffleOrOp =
E->CombinedOp;
15584 SmallSetVector<Value *, 16> UniqueValues;
15585 SmallVector<unsigned, 16> UniqueIndexes;
15587 if (UniqueValues.insert(V))
15588 UniqueIndexes.push_back(Idx);
15589 const unsigned Sz = UniqueValues.size();
15590 SmallBitVector UsedScalars(Sz,
false);
15591 for (
unsigned I = 0;
I < Sz; ++
I) {
15593 !
E->isCopyableElement(UniqueValues[
I]) &&
15594 getTreeEntries(UniqueValues[
I]).
front() ==
E)
15596 UsedScalars.set(
I);
15598 auto GetCastContextHint = [&](
Value *
V) {
15600 return getCastContextHint(*OpTEs.front());
15601 InstructionsState SrcState =
getSameOpcode(
E->getOperand(0), *TLI);
15602 if (SrcState && SrcState.getOpcode() == Instruction::Load &&
15603 !SrcState.isAltShuffle())
15616 ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0);
15618 for (
unsigned I = 0;
I < Sz; ++
I) {
15619 if (UsedScalars.test(
I))
15621 ScalarCost += ScalarEltCost(
I);
15628 if (It != MinBWs.end() && !UnaryInstruction::isCast(
E->getOpcode()) &&
15630 (
E->getOpcode() != Instruction::Load ||
E->UserTreeIndex)) {
15632 if (!EI.UserTE->hasState() ||
15633 EI.UserTE->getOpcode() != Instruction::Select ||
15635 auto UserBWIt = MinBWs.find(EI.UserTE);
15636 Type *UserScalarTy =
15637 (EI.UserTE->isGather() ||
15638 EI.UserTE->State == TreeEntry::SplitVectorize)
15639 ? EI.UserTE->Scalars.front()->getType()
15640 : EI.UserTE->getOperand(EI.EdgeIdx).front()->getType();
15641 if (UserBWIt != MinBWs.end())
15643 UserBWIt->second.first);
15644 if (ScalarTy != UserScalarTy) {
15645 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
15646 unsigned SrcBWSz = DL->getTypeSizeInBits(UserScalarTy);
15647 unsigned VecOpcode;
15649 if (BWSz > SrcBWSz)
15650 VecOpcode = Instruction::Trunc;
15653 It->second.second ? Instruction::SExt : Instruction::ZExt;
15655 VecCost += TTI->getCastInstrCost(VecOpcode, UserVecTy, VecTy, CCH,
15660 LLVM_DEBUG(dumpTreeCosts(
E, CommonCost, VecCost - CommonCost,
15661 ScalarCost,
"Calculated costs for Tree"));
15662 return VecCost - ScalarCost;
15667 assert((
E->State == TreeEntry::Vectorize ||
15668 E->State == TreeEntry::StridedVectorize ||
15669 E->State == TreeEntry::CompressVectorize) &&
15670 "Entry state expected to be Vectorize, StridedVectorize or "
15671 "MaskedLoadCompressVectorize here.");
15675 *TTI, Ptrs, BasePtr,
E->getOpcode(),
CostKind, OrigScalarTy, VecTy);
15676 LLVM_DEBUG(dumpTreeCosts(
E, 0, VecCost, ScalarCost,
15677 "Calculated GEPs cost for Tree"));
15679 return VecCost - ScalarCost;
15685 return InstructionCost::getInvalid();
15686 Type *CanonicalType = Ty;
15692 IntrinsicCostAttributes CostAttrs(MinMaxID, CanonicalType,
15693 {CanonicalType, CanonicalType});
15695 TTI->getIntrinsicInstrCost(CostAttrs,
CostKind);
15698 if (VI && SelectOnly) {
15700 "Expected only for scalar type.");
15703 CI->
getOpcode(), Ty, Builder.getInt1Ty(), CI->getPredicate(),
15704 CostKind, {TTI::OK_AnyValue, TTI::OP_None},
15705 {TTI::OK_AnyValue, TTI::OP_None}, CI);
15709 auto GetFMulAddCost = [&, &TTI = *TTI](
const InstructionsState &S,
15714 switch (ShuffleOrOp) {
15715 case Instruction::PHI: {
15718 SmallPtrSet<const TreeEntry *, 4> CountedOps;
15719 for (
Value *V : UniqueValues) {
15724 ValueList Operands(
PHI->getNumIncomingValues(),
nullptr);
15725 for (
unsigned I = 0,
N =
PHI->getNumIncomingValues();
I <
N; ++
I) {
15729 if (
const TreeEntry *OpTE =
15730 getSameValuesTreeEntry(Operands.
front(), Operands))
15731 if (CountedOps.
insert(OpTE).second &&
15732 !OpTE->ReuseShuffleIndices.empty())
15733 ScalarCost +=
TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() -
15734 OpTE->Scalars.size());
15737 return CommonCost - ScalarCost;
15739 case Instruction::ExtractValue:
15740 case Instruction::ExtractElement: {
15741 APInt DemandedElts;
15743 auto GetScalarCost = [&](
unsigned Idx) {
15749 if (ShuffleOrOp == Instruction::ExtractElement) {
15751 SrcVecTy = EE->getVectorOperandType();
15754 Type *AggregateTy = EV->getAggregateOperand()->getType();
15757 NumElts = ATy->getNumElements();
15763 if (
I->hasOneUse()) {
15773 Cost -= TTI->getCastInstrCost(
15779 if (DemandedElts.
isZero())
15785 return CommonCost - (DemandedElts.
isZero()
15787 : TTI.getScalarizationOverhead(
15788 SrcVecTy, DemandedElts,
false,
15791 return GetCostDiff(GetScalarCost, GetVectorCost);
15793 case Instruction::InsertElement: {
15794 assert(
E->ReuseShuffleIndices.empty() &&
15795 "Unique insertelements only are expected.");
15797 unsigned const NumElts = SrcVecTy->getNumElements();
15798 unsigned const NumScalars = VL.
size();
15804 unsigned OffsetEnd = OffsetBeg;
15805 InsertMask[OffsetBeg] = 0;
15808 if (OffsetBeg > Idx)
15810 else if (OffsetEnd < Idx)
15812 InsertMask[Idx] =
I + 1;
15815 if (NumOfParts > 0 && NumOfParts < NumElts)
15816 VecScalarsSz =
PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
15817 unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
15819 unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
15820 unsigned InsertVecSz = std::min<unsigned>(
15822 ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * VecScalarsSz);
15823 bool IsWholeSubvector =
15824 OffsetBeg ==
Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
15828 if (OffsetBeg + InsertVecSz > VecSz) {
15831 InsertVecSz = VecSz;
15836 SmallVector<int>
Mask;
15837 if (!
E->ReorderIndices.empty()) {
15842 std::iota(
Mask.begin(), std::next(
Mask.begin(), InsertVecSz), 0);
15844 bool IsIdentity =
true;
15846 Mask.swap(PrevMask);
15847 for (
unsigned I = 0;
I < NumScalars; ++
I) {
15849 DemandedElts.
setBit(InsertIdx);
15850 IsIdentity &= InsertIdx - OffsetBeg ==
I;
15851 Mask[InsertIdx - OffsetBeg] =
I;
15853 assert(
Offset < NumElts &&
"Failed to find vector index offset");
15867 InsertVecTy, Mask);
15869 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
15875 SmallBitVector InMask =
15877 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask));
15878 if (!InMask.
all() && NumScalars != NumElts && !IsWholeSubvector) {
15879 if (InsertVecSz != VecSz) {
15884 for (
unsigned I = 0, End = OffsetBeg -
Offset;
I < End; ++
I)
15886 for (
unsigned I = OffsetBeg -
Offset, End = OffsetEnd -
Offset;
15890 for (
unsigned I = OffsetEnd + 1 -
Offset;
I < VecSz; ++
I)
15899 case Instruction::ZExt:
15900 case Instruction::SExt:
15901 case Instruction::FPToUI:
15902 case Instruction::FPToSI:
15903 case Instruction::FPExt:
15904 case Instruction::PtrToInt:
15905 case Instruction::IntToPtr:
15906 case Instruction::SIToFP:
15907 case Instruction::UIToFP:
15908 case Instruction::Trunc:
15909 case Instruction::FPTrunc:
15910 case Instruction::BitCast: {
15911 auto SrcIt = MinBWs.find(getOperandEntry(
E, 0));
15914 unsigned Opcode = ShuffleOrOp;
15915 unsigned VecOpcode = Opcode;
15917 (SrcIt != MinBWs.end() || It != MinBWs.end())) {
15919 unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy->
getScalarType());
15920 if (SrcIt != MinBWs.end()) {
15921 SrcBWSz = SrcIt->second.first;
15927 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->
getScalarType());
15928 if (BWSz == SrcBWSz) {
15929 VecOpcode = Instruction::BitCast;
15930 }
else if (BWSz < SrcBWSz) {
15931 VecOpcode = Instruction::Trunc;
15932 }
else if (It != MinBWs.end()) {
15933 assert(BWSz > SrcBWSz &&
"Invalid cast!");
15934 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
15935 }
else if (SrcIt != MinBWs.end()) {
15936 assert(BWSz > SrcBWSz &&
"Invalid cast!");
15938 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
15940 }
else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
15941 !SrcIt->second.second) {
15942 VecOpcode = Instruction::UIToFP;
15945 assert(Idx == 0 &&
"Expected 0 index only");
15946 return TTI->getCastInstrCost(Opcode, VL0->
getType(),
15953 if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
15955 auto *
VI = VL0->
getOpcode() == Opcode ? VL0 :
nullptr;
15958 bool IsArithmeticExtendedReduction =
15959 E->Idx == 0 && UserIgnoreList &&
15962 return is_contained({Instruction::Add, Instruction::FAdd,
15963 Instruction::Mul, Instruction::FMul,
15964 Instruction::And, Instruction::Or,
15968 if (IsArithmeticExtendedReduction &&
15969 (VecOpcode == Instruction::ZExt || VecOpcode == Instruction::SExt))
15971 return CommonCost +
15972 TTI->getCastInstrCost(VecOpcode, VecTy, SrcVecTy, CCH,
CostKind,
15973 VecOpcode == Opcode ? VI :
nullptr);
15975 return GetCostDiff(GetScalarCost, GetVectorCost);
15977 case Instruction::FCmp:
15978 case Instruction::ICmp:
15979 case Instruction::Select: {
15980 CmpPredicate VecPred, SwappedVecPred;
15983 match(VL0, MatchCmp))
15989 auto GetScalarCost = [&](
unsigned Idx) {
16003 !
match(VI, MatchCmp)) ||
16011 E->getOpcode(), OrigScalarTy, Builder.getInt1Ty(), CurrentPred,
16012 CostKind, getOperandInfo(
VI->getOperand(0)),
16013 getOperandInfo(
VI->getOperand(1)), VI);
16024 TTI->getCmpSelInstrCost(
E->getOpcode(), VecTy, MaskTy, VecPred,
16025 CostKind, getOperandInfo(
E->getOperand(0)),
16026 getOperandInfo(
E->getOperand(1)), VL0);
16030 unsigned CondNumElements = CondType->getNumElements();
16032 assert(VecTyNumElements >= CondNumElements &&
16033 VecTyNumElements % CondNumElements == 0 &&
16034 "Cannot vectorize Instruction::Select");
16035 if (CondNumElements != VecTyNumElements) {
16044 return VecCost + CommonCost;
16046 return GetCostDiff(GetScalarCost, GetVectorCost);
16048 case TreeEntry::MinMax: {
16049 auto GetScalarCost = [&](
unsigned Idx) {
16050 return GetMinMaxCost(OrigScalarTy);
16054 return VecCost + CommonCost;
16056 return GetCostDiff(GetScalarCost, GetVectorCost);
16058 case TreeEntry::FMulAdd: {
16059 auto GetScalarCost = [&](
unsigned Idx) {
16062 return GetFMulAddCost(
E->getOperations(),
16068 for (
Value *V :
E->Scalars) {
16070 FMF &= FPCI->getFastMathFlags();
16072 FMF &= FPCIOp->getFastMathFlags();
16075 IntrinsicCostAttributes ICA(Intrinsic::fmuladd, VecTy,
16076 {VecTy, VecTy, VecTy}, FMF);
16078 return VecCost + CommonCost;
16080 return GetCostDiff(GetScalarCost, GetVectorCost);
16082 case TreeEntry::ReducedBitcast:
16083 case TreeEntry::ReducedBitcastBSwap: {
16084 auto GetScalarCost = [&, &TTI = *TTI](
unsigned Idx) {
16094 ScalarCost += TTI.getInstructionCost(ZExt,
CostKind);
16098 const TreeEntry *LhsTE = getOperandEntry(
E, 0);
16100 getCastContextHint(*getOperandEntry(LhsTE, 0));
16102 auto *SrcVecTy =
getWidenedType(SrcScalarTy, LhsTE->getVectorFactor());
16104 Instruction::BitCast, ScalarTy, SrcVecTy, CastCtx,
CostKind);
16105 if (ShuffleOrOp == TreeEntry::ReducedBitcastBSwap) {
16106 auto *SrcType = IntegerType::getIntNTy(
16108 DL->getTypeSizeInBits(SrcScalarTy) * EntryVF);
16109 IntrinsicCostAttributes CostAttrs(Intrinsic::bswap, SrcType, {SrcType});
16111 TTI.getIntrinsicInstrCost(CostAttrs,
CostKind);
16113 if (SrcType != ScalarTy) {
16115 TTI.getCastInstrCost(Instruction::ZExt, ScalarTy, SrcType,
16119 return BitcastCost + CommonCost;
16121 return GetCostDiff(GetScalarCost, GetVectorCost);
16123 case TreeEntry::ReducedBitcastLoads:
16124 case TreeEntry::ReducedBitcastBSwapLoads: {
16125 auto GetScalarCost = [&, &TTI = *TTI](
unsigned Idx) {
16135 ScalarCost += TTI.getInstructionCost(ZExt,
CostKind);
16139 ScalarCost += TTI.getInstructionCost(Load,
CostKind);
16143 const TreeEntry *LhsTE = getOperandEntry(
E, 0);
16144 const TreeEntry *LoadTE = getOperandEntry(LhsTE, 0);
16146 auto *SrcType = IntegerType::getIntNTy(
16148 DL->getTypeSizeInBits(LI0->getType()) * EntryVF);
16150 TTI.getMemoryOpCost(Instruction::Load, SrcType, LI0->getAlign(),
16151 LI0->getPointerAddressSpace(),
CostKind);
16152 if (ShuffleOrOp == TreeEntry::ReducedBitcastBSwapLoads) {
16153 IntrinsicCostAttributes CostAttrs(Intrinsic::bswap, SrcType, {SrcType});
16155 TTI.getIntrinsicInstrCost(CostAttrs,
CostKind);
16157 if (SrcType != ScalarTy) {
16159 TTI.getCastInstrCost(Instruction::ZExt, ScalarTy, SrcType,
16163 return LoadCost + CommonCost;
16165 return GetCostDiff(GetScalarCost, GetVectorCost);
16167 case TreeEntry::ReducedCmpBitcast: {
16168 auto GetScalarCost = [&, &TTI = *TTI](
unsigned Idx) {
16180 IntegerType::getIntNTy(ScalarTy->
getContext(),
E->getVectorFactor());
16182 TTI.getCastInstrCost(Instruction::BitCast, DstTy, CmpTy,
16184 if (DstTy != ScalarTy) {
16186 TTI.getCastInstrCost(Instruction::ZExt, ScalarTy, DstTy,
16189 return BitcastCost + CommonCost;
16191 return GetCostDiff(GetScalarCost, GetVectorCost);
16193 case Instruction::FNeg:
16194 case Instruction::Add:
16195 case Instruction::FAdd:
16196 case Instruction::Sub:
16197 case Instruction::FSub:
16198 case Instruction::Mul:
16199 case Instruction::FMul:
16200 case Instruction::UDiv:
16201 case Instruction::SDiv:
16202 case Instruction::FDiv:
16203 case Instruction::URem:
16204 case Instruction::SRem:
16205 case Instruction::FRem:
16206 case Instruction::Shl:
16207 case Instruction::LShr:
16208 case Instruction::AShr:
16209 case Instruction::And:
16210 case Instruction::Or:
16211 case Instruction::Xor: {
16212 auto GetScalarCost = [&](
unsigned Idx) {
16219 unsigned Lane = UniqueIndexes[Idx];
16220 Value *Op1 =
E->getOperand(0)[Lane];
16222 SmallVector<const Value *, 2> Operands(1, Op1);
16226 Op2 =
E->getOperand(1)[Lane];
16232 ShuffleOrOp, OrigScalarTy,
CostKind, Op1Info, Op2Info, Operands);
16234 I && (ShuffleOrOp == Instruction::FAdd ||
16235 ShuffleOrOp == Instruction::FSub)) {
16243 if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
16248 return CI && CI->getValue().countr_one() >= It->second.first;
16256 return TTI->getArithmeticInstrCost(ShuffleOrOp, VecTy,
CostKind, Op1Info,
16257 Op2Info, {},
nullptr, TLI) +
16260 return GetCostDiff(GetScalarCost, GetVectorCost);
16262 case Instruction::GetElementPtr: {
16263 return CommonCost + GetGEPCostDiff(VL, VL0);
16265 case Instruction::Load: {
16266 auto GetScalarCost = [&](
unsigned Idx) {
16268 return TTI->getMemoryOpCost(Instruction::Load, OrigScalarTy,
16269 VI->getAlign(),
VI->getPointerAddressSpace(),
16275 switch (
E->State) {
16276 case TreeEntry::Vectorize:
16277 if (
unsigned Factor =
E->getInterleaveFactor()) {
16278 VecLdCost = TTI->getInterleavedMemoryOpCost(
16279 Instruction::Load, VecTy, Factor, {}, LI0->getAlign(),
16280 LI0->getPointerAddressSpace(),
CostKind);
16283 VecLdCost = TTI->getMemoryOpCost(
16284 Instruction::Load, VecTy, LI0->getAlign(),
16288 case TreeEntry::StridedVectorize: {
16289 const StridedPtrInfo &SPtrInfo = TreeEntryToStridedPtrInfoMap.at(
E);
16290 FixedVectorType *StridedLoadTy = SPtrInfo.Ty;
16291 assert(StridedLoadTy &&
"Missing StridedPointerInfo for tree entry.");
16292 Align CommonAlignment =
16294 VecLdCost = TTI->getMemIntrinsicInstrCost(
16295 MemIntrinsicCostAttributes(Intrinsic::experimental_vp_strided_load,
16296 StridedLoadTy, LI0->getPointerOperand(),
16297 false, CommonAlignment),
16299 if (StridedLoadTy != VecTy)
16301 TTI->getCastInstrCost(Instruction::BitCast, VecTy, StridedLoadTy,
16306 case TreeEntry::CompressVectorize: {
16308 unsigned InterleaveFactor;
16309 SmallVector<int> CompressMask;
16312 if (!
E->ReorderIndices.empty()) {
16313 SmallVector<int>
Mask(
E->ReorderIndices.begin(),
16314 E->ReorderIndices.end());
16321 Scalars, PointerOps,
E->ReorderIndices, *TTI, *DL, *SE, *AC, *DT,
16322 *TLI, [](
Value *) { return true; }, IsMasked, InterleaveFactor,
16323 CompressMask, LoadVecTy);
16324 assert(IsVectorized &&
"Failed to vectorize load");
16325 CompressEntryToData.try_emplace(
E, CompressMask, LoadVecTy,
16326 InterleaveFactor, IsMasked);
16327 Align CommonAlignment = LI0->getAlign();
16328 if (InterleaveFactor) {
16329 VecLdCost = TTI->getInterleavedMemoryOpCost(
16330 Instruction::Load, LoadVecTy, InterleaveFactor, {},
16331 CommonAlignment, LI0->getPointerAddressSpace(),
CostKind);
16332 }
else if (IsMasked) {
16333 VecLdCost = TTI->getMemIntrinsicInstrCost(
16334 MemIntrinsicCostAttributes(Intrinsic::masked_load, LoadVecTy,
16336 LI0->getPointerAddressSpace()),
16340 LoadVecTy, CompressMask,
CostKind);
16342 VecLdCost = TTI->getMemoryOpCost(
16343 Instruction::Load, LoadVecTy, CommonAlignment,
16347 LoadVecTy, CompressMask,
CostKind);
16351 case TreeEntry::ScatterVectorize: {
16352 Align CommonAlignment =
16354 VecLdCost = TTI->getMemIntrinsicInstrCost(
16355 MemIntrinsicCostAttributes(Intrinsic::masked_gather, VecTy,
16356 LI0->getPointerOperand(),
16357 false, CommonAlignment),
16361 case TreeEntry::CombinedVectorize:
16362 case TreeEntry::SplitVectorize:
16363 case TreeEntry::NeedToGather:
16366 return VecLdCost + CommonCost;
16372 if (
E->State == TreeEntry::ScatterVectorize)
16379 return Cost + GetGEPCostDiff(PointerOps, LI0->getPointerOperand());
16381 case Instruction::Store: {
16382 bool IsReorder = !
E->ReorderIndices.empty();
16383 auto GetScalarCost = [=](
unsigned Idx) {
16386 return TTI->getMemoryOpCost(Instruction::Store, OrigScalarTy,
16387 VI->getAlign(),
VI->getPointerAddressSpace(),
16395 if (
E->State == TreeEntry::StridedVectorize) {
16396 Align CommonAlignment =
16398 VecStCost = TTI->getMemIntrinsicInstrCost(
16399 MemIntrinsicCostAttributes(Intrinsic::experimental_vp_strided_store,
16400 VecTy, BaseSI->getPointerOperand(),
16401 false, CommonAlignment),
16404 assert(
E->State == TreeEntry::Vectorize &&
16405 "Expected either strided or consecutive stores.");
16406 if (
unsigned Factor =
E->getInterleaveFactor()) {
16407 assert(
E->ReuseShuffleIndices.empty() && !
E->ReorderIndices.empty() &&
16408 "No reused shuffles expected");
16410 VecStCost = TTI->getInterleavedMemoryOpCost(
16411 Instruction::Store, VecTy, Factor, {}, BaseSI->getAlign(),
16412 BaseSI->getPointerAddressSpace(),
CostKind);
16415 VecStCost = TTI->getMemoryOpCost(
16416 Instruction::Store, VecTy, BaseSI->getAlign(),
16417 BaseSI->getPointerAddressSpace(),
CostKind, OpInfo);
16420 return VecStCost + CommonCost;
16424 unsigned Idx = IsReorder ?
E->ReorderIndices[
I] :
I;
16428 return GetCostDiff(GetScalarCost, GetVectorCost) +
16429 GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
16431 case Instruction::Call: {
16432 auto GetScalarCost = [&](
unsigned Idx) {
16436 IntrinsicCostAttributes CostAttrs(
ID, *CI, 1);
16437 return TTI->getIntrinsicInstrCost(CostAttrs,
CostKind);
16447 CI,
ID, VecTy->getNumElements(),
16448 It != MinBWs.end() ? It->second.first : 0, TTI);
16450 return std::min(VecCallCosts.first, VecCallCosts.second) + CommonCost;
16452 return GetCostDiff(GetScalarCost, GetVectorCost);
16454 case Instruction::ShuffleVector: {
16462 "Invalid Shuffle Vector Operand");
16465 auto TryFindNodeWithEqualOperands = [=]() {
16466 for (
const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
16469 if (
TE->hasState() &&
TE->isAltShuffle() &&
16470 ((
TE->getOpcode() ==
E->getOpcode() &&
16471 TE->getAltOpcode() ==
E->getAltOpcode()) ||
16472 (
TE->getOpcode() ==
E->getAltOpcode() &&
16473 TE->getAltOpcode() ==
E->getOpcode())) &&
16474 TE->hasEqualOperands(*
E))
16479 auto GetScalarCost = [&](
unsigned Idx) {
16484 assert(
E->getMatchingMainOpOrAltOp(VI) &&
16485 "Unexpected main/alternate opcode");
16487 return TTI->getInstructionCost(VI,
CostKind);
16495 if (TryFindNodeWithEqualOperands()) {
16497 dbgs() <<
"SLP: diamond match for alternate node found.\n";
16504 TTIRef.getArithmeticInstrCost(
E->getOpcode(), VecTy,
CostKind);
16506 TTIRef.getArithmeticInstrCost(
E->getAltOpcode(), VecTy,
CostKind);
16509 VecCost = TTIRef.getCmpSelInstrCost(
16510 E->getOpcode(), VecTy, MaskTy, CI0->getPredicate(),
CostKind,
16511 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
16513 VecCost += TTIRef.getCmpSelInstrCost(
16514 E->getOpcode(), VecTy, MaskTy,
16516 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
16519 Type *SrcSclTy =
E->getMainOp()->getOperand(0)->getType();
16522 auto SrcIt = MinBWs.find(getOperandEntry(
E, 0));
16523 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
16525 DL->getTypeSizeInBits(
E->getMainOp()->getOperand(0)->getType());
16526 if (SrcIt != MinBWs.end()) {
16527 SrcBWSz = SrcIt->second.first;
16531 if (BWSz <= SrcBWSz) {
16532 if (BWSz < SrcBWSz)
16534 TTIRef.getCastInstrCost(Instruction::Trunc, VecTy, SrcTy,
16538 <<
"SLP: alternate extension, which should be truncated.\n";
16544 VecCost = TTIRef.getCastInstrCost(
E->getOpcode(), VecTy, SrcTy,
16547 TTIRef.getCastInstrCost(
E->getAltOpcode(), VecTy, SrcTy,
16550 SmallVector<int>
Mask;
16551 E->buildAltOpShuffleMask(
16552 [&](Instruction *
I) {
16553 assert(
E->getMatchingMainOpOrAltOp(
I) &&
16554 "Unexpected main/alternate opcode");
16565 unsigned Opcode0 =
E->getOpcode();
16566 unsigned Opcode1 =
E->getAltOpcode();
16567 SmallBitVector OpcodeMask(
16571 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
16573 VecTy, Opcode0, Opcode1, OpcodeMask,
CostKind);
16574 return AltVecCost < VecCost ? AltVecCost : VecCost;
16580 return GetCostDiff(
16585 "Not supported shufflevector usage.");
16587 unsigned SVNumElements =
16589 ->getNumElements();
16590 unsigned GroupSize = SVNumElements / SV->getShuffleMask().size();
16591 for (
size_t I = 0, End = VL.
size();
I != End;
I += GroupSize) {
16596 "Not supported shufflevector usage.");
16599 [[maybe_unused]]
bool IsExtractSubvectorMask =
16600 SV->isExtractSubvectorMask(Index);
16601 assert(IsExtractSubvectorMask &&
16602 "Not supported shufflevector usage.");
16603 if (NextIndex != Index)
16605 NextIndex += SV->getShuffleMask().size();
16608 return ::getShuffleCost(
16614 return GetCostDiff(GetScalarCost, GetVectorCost);
16616 case Instruction::Freeze:
16623bool BoUpSLP::isFullyVectorizableTinyTree(
bool ForReduction)
const {
16625 << VectorizableTree.size() <<
" is fully vectorizable .\n");
16627 auto &&AreVectorizableGathers = [
this](
const TreeEntry *
TE,
unsigned Limit) {
16628 SmallVector<int>
Mask;
16629 return TE->isGather() &&
16631 [
this](
Value *V) { return EphValues.contains(V); }) &&
16633 TE->Scalars.size() < Limit ||
16634 (((
TE->hasState() &&
16635 TE->getOpcode() == Instruction::ExtractElement) ||
16638 (
TE->hasState() &&
TE->getOpcode() == Instruction::Load &&
16639 !
TE->isAltShuffle()) ||
16644 if (VectorizableTree.size() == 1 &&
16645 (VectorizableTree[0]->State == TreeEntry::Vectorize ||
16646 VectorizableTree[0]->State == TreeEntry::StridedVectorize ||
16647 VectorizableTree[0]->State == TreeEntry::CompressVectorize ||
16649 AreVectorizableGathers(VectorizableTree[0].
get(),
16650 VectorizableTree[0]->Scalars.size()) &&
16651 VectorizableTree[0]->getVectorFactor() > 2)))
16654 if (VectorizableTree.size() != 2)
16661 if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
16662 AreVectorizableGathers(VectorizableTree[1].
get(),
16663 VectorizableTree[0]->Scalars.size()))
16667 if (VectorizableTree[0]->
isGather() ||
16668 (VectorizableTree[1]->
isGather() &&
16669 VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
16670 VectorizableTree[0]->State != TreeEntry::StridedVectorize &&
16671 VectorizableTree[0]->State != TreeEntry::CompressVectorize))
16682 if (VectorizableTree.empty()) {
16683 assert(ExternalUses.empty() &&
"We shouldn't have any external users");
16688 if (VectorizableTree.size() == 1 && !ForReduction &&
16689 VectorizableTree.front()->isGather() &&
16690 VectorizableTree.front()->hasState() &&
16691 VectorizableTree.front()->getOpcode() == Instruction::ExtractElement)
16694 if (VectorizableTree.size() == 2 &&
16696 VectorizableTree[1]->isGather() &&
16697 (VectorizableTree[1]->getVectorFactor() <= 2 ||
16698 !(
isSplat(VectorizableTree[1]->Scalars) ||
16704 constexpr int Limit = 4;
16706 (!ForReduction || VectorizableTree.front()->getVectorFactor() <= 2) &&
16708 [&](
const std::unique_ptr<TreeEntry> &TE) {
16709 return TE->isGather() && TE->getVectorFactor() <= Limit &&
16721 !VectorizableTree.empty() &&
16722 all_of(VectorizableTree, [&](
const std::unique_ptr<TreeEntry> &TE) {
16723 return (TE->isGather() &&
16724 (!TE->hasState() ||
16725 TE->getOpcode() != Instruction::ExtractElement) &&
16727 (TE->hasState() && TE->getOpcode() == Instruction::PHI);
16734 VectorizableTree.size() <= Limit &&
16735 all_of(VectorizableTree,
16736 [&](
const std::unique_ptr<TreeEntry> &TE) {
16737 return (TE->isGather() &&
16738 (!TE->hasState() ||
16739 TE->getOpcode() != Instruction::ExtractElement) &&
16743 (TE->getOpcode() == Instruction::InsertElement ||
16744 (TE->getOpcode() == Instruction::PHI &&
16746 return isa<PoisonValue>(V) || MustGather.contains(V);
16749 any_of(VectorizableTree, [&](
const std::unique_ptr<TreeEntry> &TE) {
16750 return TE->State == TreeEntry::Vectorize &&
16751 TE->getOpcode() == Instruction::PHI;
16756 constexpr unsigned LargeTree = 20;
16757 bool HasSingleLoad =
false;
16759 all_of(VectorizableTree, [&](
const std::unique_ptr<TreeEntry> &TE) {
16760 bool PrevLoad = HasSingleLoad;
16762 TE->hasState() && !TE->isGather() &&
16763 (TE->getOpcode() == Instruction::Load ||
16764 TE->hasCopyableElements()) &&
16765 (TE->getVectorFactor() > 2 || TE->ReorderIndices.empty());
16766 return (TE->hasState() &&
16767 (TE->getOpcode() == Instruction::PHI ||
16768 (VectorizableTree.size() >= LargeTree &&
16769 (TE->getOpcode() == Instruction::Store ||
16770 (TE->getOpcode() == Instruction::Load && !PrevLoad)) &&
16771 TE->getVectorFactor() <= Limit))) ||
16773 (!TE->hasState() ||
16774 TE->getOpcode() != Instruction::ExtractElement));
16779 bool VectorNodeFound =
false;
16780 bool AnyNonConst =
false;
16781 if (!ForReduction &&
SLPCostThreshold >= 0 && VectorizableTree.size() >= 5 &&
16782 VectorizableTree.front()->getVectorFactor() <= 2 &&
16783 VectorizableTree.front()->Scalars.front()->getType()->isIntegerTy() &&
16784 all_of(VectorizableTree,
16785 [&](
const std::unique_ptr<TreeEntry> &TE) {
16786 if (TE->State == TreeEntry::Vectorize && TE->hasState()) {
16787 if (TE->hasState() && (TE->getOpcode() == Instruction::PHI ||
16788 !TE->ReorderIndices.empty()))
16790 bool PrevVectorNodeFound = VectorNodeFound;
16791 VectorNodeFound =
true;
16792 return !PrevVectorNodeFound;
16795 return TE->isGather() || TE->State == TreeEntry::SplitVectorize;
16803 unsigned NumGathers = 0;
16804 constexpr int LimitTreeSize = 36;
16806 all_of(VectorizableTree,
16807 [&](
const std::unique_ptr<TreeEntry> &TE) {
16808 if (!TE->isGather() && TE->hasState() &&
16809 (TE->getOpcode() == Instruction::Load ||
16810 TE->getOpcode() == Instruction::Store)) {
16814 if (TE->isGather())
16816 return TE->State == TreeEntry::SplitVectorize ||
16817 (TE->Idx == 0 && TE->Scalars.size() == 2 &&
16818 TE->hasState() && TE->getOpcode() == Instruction::ICmp &&
16819 VectorizableTree.size() > LimitTreeSize) ||
16823 (TE->getOpcode() == Instruction::PHI ||
16824 (TE->hasCopyableElements() &&
16827 TE->Scalars.size() / 2) ||
16828 ((!TE->ReuseShuffleIndices.empty() ||
16829 !TE->ReorderIndices.empty() || TE->isAltShuffle()) &&
16830 TE->Scalars.size() == 2)));
16832 (StoreLoadNodes.
empty() ||
16833 (VectorizableTree.size() > LimitTreeSize * StoreLoadNodes.
size() &&
16834 (NumGathers > 0 ||
none_of(StoreLoadNodes, [&](
const TreeEntry *TE) {
16835 return TE->getOpcode() == Instruction::Store ||
16837 return !isa<LoadInst>(V) ||
16838 areAllUsersVectorized(cast<Instruction>(V));
16846 VectorizableTree.front()->State == TreeEntry::SplitVectorize &&
16847 VectorizableTree.size() >= Limit &&
16849 [&](
const std::unique_ptr<TreeEntry> &TE) {
16850 return !TE->isGather() && TE->UserTreeIndex.UserTE &&
16851 TE->UserTreeIndex.UserTE->Idx == 0;
16858 VectorizableTree.size() > 2 &&
16859 VectorizableTree.front()->State == TreeEntry::Vectorize &&
16860 VectorizableTree.front()->getOpcode() == Instruction::InsertElement &&
16861 VectorizableTree[1]->State == TreeEntry::Vectorize &&
16862 VectorizableTree[1]->getOpcode() == Instruction::PHI &&
16864 ArrayRef(VectorizableTree).drop_front(2),
16865 [&](
const std::unique_ptr<TreeEntry> &TE) {
return TE->isGather(); }))
16875 if (isFullyVectorizableTinyTree(ForReduction))
16880 bool IsAllowedSingleBVNode =
16881 VectorizableTree.
size() > 1 ||
16882 (VectorizableTree.size() == 1 && VectorizableTree.front()->hasState() &&
16883 !VectorizableTree.front()->isAltShuffle() &&
16884 VectorizableTree.front()->getOpcode() != Instruction::PHI &&
16885 VectorizableTree.front()->getOpcode() != Instruction::GetElementPtr &&
16887 if (
any_of(VectorizableTree, [&](
const std::unique_ptr<TreeEntry> &TE) {
16888 return TE->isGather() &&
all_of(TE->Scalars, [&](
Value *V) {
16889 return isa<ExtractElementInst, Constant>(V) ||
16890 (IsAllowedSingleBVNode &&
16891 !V->hasNUsesOrMore(UsesLimit) &&
16892 any_of(V->users(), IsaPred<InsertElementInst>));
16897 if (VectorizableTree.back()->isGather() &&
16898 VectorizableTree.back()->hasState() &&
16899 VectorizableTree.back()->isAltShuffle() &&
16900 VectorizableTree.back()->getVectorFactor() > 2 &&
16902 !VectorizableTree.back()->Scalars.front()->getType()->isVectorTy() &&
16903 TTI->getScalarizationOverhead(
16904 getWidenedType(VectorizableTree.back()->Scalars.front()->getType(),
16905 VectorizableTree.back()->getVectorFactor()),
16918 constexpr unsigned SmallTree = 3;
16919 if (VectorizableTree.front()->isNonPowOf2Vec() &&
16922 [](
const std::unique_ptr<TreeEntry> &TE) {
16923 return TE->isGather() && TE->hasState() &&
16924 TE->getOpcode() == Instruction::Load &&
16932 TreeEntry &E = *VectorizableTree[Idx];
16933 if (E.State == TreeEntry::SplitVectorize)
16937 if ((E.hasState() && E.getOpcode() != Instruction::Load) ||
16956 const TreeEntry *Root = VectorizableTree.front().get();
16957 if (Root->isGather())
16966 for (
const auto &TEPtr : VectorizableTree) {
16967 if (TEPtr->CombinedOp == TreeEntry::ReducedBitcast ||
16968 TEPtr->CombinedOp == TreeEntry::ReducedBitcastBSwap ||
16969 TEPtr->CombinedOp == TreeEntry::ReducedBitcastLoads ||
16970 TEPtr->CombinedOp == TreeEntry::ReducedBitcastBSwapLoads ||
16971 TEPtr->CombinedOp == TreeEntry::ReducedCmpBitcast) {
16972 ScalarOrPseudoEntries.
insert(TEPtr.get());
16975 if (!TEPtr->isGather()) {
16976 Instruction *LastInst = &getLastInstructionInBundle(TEPtr.get());
16977 EntriesToLastInstruction.
try_emplace(TEPtr.get(), LastInst);
16978 LastInstructions.
insert(LastInst);
16980 if (TEPtr->UserTreeIndex)
16981 EntriesToOperands[TEPtr->UserTreeIndex.UserTE].push_back(TEPtr.get());
16988 if (
II->isAssumeLikeIntrinsic())
16995 return IntrCost < CallCost;
17002 CheckedInstructions;
17003 unsigned Budget = 0;
17004 const unsigned BudgetLimit =
17009 "Expected instructions in same block.");
17010 if (
auto It = CheckedInstructions.
find(
Last);
17011 It != CheckedInstructions.
end()) {
17012 const Instruction *Checked = It->second.getPointer();
17014 return It->second.getInt() != 0;
17020 ++
First->getIterator().getReverse(),
17022 Last->getIterator().getReverse();
17024 while (InstIt != PrevInstIt && Budget <= BudgetLimit) {
17030 for (
const Instruction *LastInst : LastInstsInRange)
17031 CheckedInstructions.
try_emplace(LastInst, &*PrevInstIt, 0);
17034 if (LastInstructions.
contains(&*PrevInstIt))
17035 LastInstsInRange.
push_back(&*PrevInstIt);
17040 for (
const Instruction *LastInst : LastInstsInRange)
17042 LastInst, PrevInstIt == InstIt ?
First : &*PrevInstIt,
17043 Budget <= BudgetLimit ? 1 : 0);
17044 return Budget <= BudgetLimit;
17046 auto AddCosts = [&](
const TreeEntry *
Op) {
17049 Type *ScalarTy =
Op->Scalars.front()->getType();
17050 auto It = MinBWs.find(
Op);
17051 if (It != MinBWs.end())
17054 unsigned Scale = getScaleToLoopIterations(*
Op);
17055 InstructionCost KeepLiveCost = TTI->getCostOfKeepingLiveOverCall(VecTy);
17056 KeepLiveCost *= Scale;
17057 Cost += KeepLiveCost;
17060 Cost -=
Op->Scalars.size() * TTI->getCostOfKeepingLiveOverCall(ScalarTy) *
17068 ParentOpParentToPreds;
17071 auto Key = std::make_pair(Root, OpParent);
17072 if (
auto It = ParentOpParentToPreds.
find(
Key);
17073 It != ParentOpParentToPreds.
end())
17085 for (
const auto &KeyPair : ParentsPairsToAdd) {
17087 "Should not have been added before.");
17091 while (!Worklist.
empty()) {
17093 if (BB == OpParent || !Visited.
insert(BB).second)
17095 auto Pair = std::make_pair(BB, OpParent);
17096 if (
auto It = ParentOpParentToPreds.
find(Pair);
17097 It != ParentOpParentToPreds.
end()) {
17101 ParentsPairsToAdd.
insert(Pair);
17106 if (Budget > BudgetLimit)
17118 auto FindNonScalarParentEntry = [&](
const TreeEntry *E) ->
const TreeEntry * {
17120 "Expected scalar or pseudo entry.");
17121 const TreeEntry *Entry = E;
17122 while (Entry->UserTreeIndex) {
17123 Entry = Entry->UserTreeIndex.UserTE;
17124 if (!ScalarOrPseudoEntries.
contains(Entry))
17129 while (!LiveEntries.
empty()) {
17132 if (Operands.
empty())
17134 if (ScalarOrPseudoEntries.
contains(Entry)) {
17135 Entry = FindNonScalarParentEntry(Entry);
17137 for (
const TreeEntry *
Op : Operands) {
17138 if (!
Op->isGather())
17144 Instruction *LastInst = EntriesToLastInstruction.
at(Entry);
17146 for (
const TreeEntry *
Op : Operands) {
17147 if (!
Op->isGather())
17151 if (Entry->State == TreeEntry::SplitVectorize ||
17152 (Entry->getOpcode() != Instruction::PHI &&
Op->isGather()) ||
17158 Pred = Phi->getIncomingBlock(
Op->UserTreeIndex.EdgeIdx);
17161 if (
Op->isGather()) {
17162 assert(Entry->getOpcode() == Instruction::PHI &&
17163 "Expected phi node only.");
17165 ->getIncomingBlock(
Op->UserTreeIndex.EdgeIdx);
17167 for (
Value *V :
Op->Scalars) {
17178 OpLastInst = EntriesToLastInstruction.
at(
Op);
17182 if (OpParent == Parent) {
17183 if (Entry->getOpcode() == Instruction::PHI) {
17184 if (!CheckForNonVecCallsInSameBlock(LastInst, OpLastInst))
17188 if (!CheckForNonVecCallsInSameBlock(OpLastInst, LastInst))
17194 if (Entry->getOpcode() != Instruction::PHI &&
17195 !CheckForNonVecCallsInSameBlock(
17201 if (!CheckForNonVecCallsInSameBlock(OpLastInst,
17207 if (!CheckPredecessors(Parent, Pred, OpParent)) {
17223 const auto *I1 = IE1;
17224 const auto *I2 = IE2;
17236 if (I1 && (I1 == IE1 || I1->hasOneUse()) &&
17239 if (I2 && ((I2 == IE2 || I2->
hasOneUse())) &&
17242 }
while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2));
17249struct ValueSelect {
17250 template <
typename U>
17251 static std::enable_if_t<std::is_same_v<Value *, U>,
Value *>
get(
Value *V) {
17254 template <
typename U>
17255 static std::enable_if_t<!std::is_same_v<Value *, U>,
U>
get(
Value *) {
17273template <
typename T>
17279 assert(!ShuffleMask.empty() &&
"Empty list of shuffles for inserts.");
17281 auto VMIt = std::next(ShuffleMask.begin());
17284 buildUseMask(Mask.size(), Mask, UseMask::UndefsAsMask);
17286 if (!IsBaseUndef.
all()) {
17288 std::pair<T *, bool> Res =
17289 ResizeAction(ShuffleMask.begin()->first, Mask,
false);
17291 for (
unsigned Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) {
17295 Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF;
17297 [[maybe_unused]]
auto *V = ValueSelect::get<T *>(
Base);
17298 assert((!V || GetVF(V) == Mask.size()) &&
17299 "Expected base vector of VF number of elements.");
17300 Prev = Action(Mask, {
nullptr, Res.first});
17301 }
else if (ShuffleMask.size() == 1) {
17304 std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask,
17310 Prev = Action(Mask, {ShuffleMask.begin()->first});
17314 unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);
17315 unsigned Vec2VF = GetVF(VMIt->first);
17316 if (Vec1VF == Vec2VF) {
17320 for (
unsigned I = 0, VF = Mask.size();
I < VF; ++
I) {
17323 Mask[
I] = SecMask[
I] + Vec1VF;
17326 Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
17329 std::pair<T *, bool> Res1 = ResizeAction(ShuffleMask.begin()->first, Mask,
17331 std::pair<T *, bool> Res2 =
17332 ResizeAction(VMIt->first, VMIt->second,
false);
17334 for (
unsigned I = 0, VF = Mask.size();
I < VF; ++
I) {
17341 Mask[
I] = (Res2.second ?
I : SecMask[
I]) + VF;
17344 Prev = Action(Mask, {Res1.first, Res2.first});
17346 VMIt = std::next(VMIt);
17348 [[maybe_unused]]
bool IsBaseNotUndef = !IsBaseUndef.
all();
17350 for (
auto E = ShuffleMask.end(); VMIt !=
E; ++VMIt) {
17352 std::pair<T *, bool> Res =
17353 ResizeAction(VMIt->first, VMIt->second,
false);
17355 for (
unsigned I = 0, VF = Mask.size();
I < VF; ++
I) {
17358 "Multiple uses of scalars.");
17359 Mask[
I] = (Res.second ?
I : SecMask[
I]) + VF;
17364 Prev = Action(Mask, {Prev, Res.first});
17376 << VectorizableTree.size() <<
".\n");
17377 auto IsExternallyUsed = [&](
const TreeEntry &TE,
Value *V) {
17378 assert(TE.hasState() && !TE.isGather() &&
17379 TE.State != TreeEntry::SplitVectorize &&
"Expected vector node.");
17380 if (V->hasOneUse() || V->getType()->isVoidTy())
17382 if (TE.hasCopyableElements() && TE.isCopyableElement(V))
17384 const size_t NumVectScalars = ScalarToTreeEntries.size() + 1;
17385 if (V->hasNUsesOrMore(NumVectScalars))
17391 if (match(U, m_InsertElt(m_Value(),
17392 m_OneUse(m_CastOrSelf(m_Specific(I))),
17396 m_InsertElt(m_Value(), m_Specific(I), m_ConstantInt())))
17398 if (match(U, m_Store(m_OneUse(m_CastOrSelf(m_Specific(I))),
17401 if (match(U, m_Store(m_Specific(I), m_Value())))
17403 ArrayRef<TreeEntry *> Entries = getTreeEntries(U);
17404 if (Entries.empty() && !MustGather.contains(U))
17406 if (any_of(Entries, [&](TreeEntry *TE) {
17407 return DeletedNodes.contains(TE);
17410 return any_of(ValueToGatherNodes.lookup(U),
17411 [&](
const TreeEntry *TE) {
17412 return DeletedNodes.contains(TE);
17419 unsigned PrevScale = 0;
17421 for (
const std::unique_ptr<TreeEntry> &Ptr : VectorizableTree) {
17422 TreeEntry &TE = *Ptr;
17425 if (TE.State == TreeEntry::CombinedVectorize) {
17427 dbgs() <<
"SLP: Skipping cost for combined node that starts with "
17428 << *TE.Scalars[0] <<
".\n";
17429 TE.dump();
dbgs() <<
"SLP: Current total cost = " << Cost <<
"\n");
17430 NodesCosts.try_emplace(&TE);
17433 if (TE.hasState() &&
17434 (TE.isGather() || TE.State == TreeEntry::SplitVectorize)) {
17435 if (
const TreeEntry *
E =
17436 getSameValuesTreeEntry(TE.getMainOp(), TE.Scalars);
17437 E &&
E->getVectorFactor() == TE.getVectorFactor()) {
17442 <<
"SLP: Current total cost = " << Cost <<
"\n");
17443 NodesCosts.try_emplace(&TE);
17450 assert((!
TE.isGather() ||
TE.Idx == 0 ||
TE.UserTreeIndex) &&
17451 "Expected gather nodes with users only.");
17454 unsigned Scale = 0;
17455 bool CostIsFree =
C == 0;
17456 if (!CostIsFree && !
TE.isGather() &&
TE.hasState()) {
17457 if (PrevVecParent ==
TE.getMainOp()->getParent()) {
17463 if (!CostIsFree && !Scale) {
17464 Scale = getScaleToLoopIterations(TE);
17467 if (!
TE.isGather() &&
TE.hasState()) {
17468 PrevVecParent =
TE.getMainOp()->getParent();
17473 NodesCosts.try_emplace(&TE,
C);
17476 <<
"SLP: Current total cost = " <<
Cost <<
"\n");
17478 if (
TE.Idx > 0 && !
TE.UserTreeIndex &&
TE.hasState() &&
17479 TE.getOpcode() == Instruction::Load)
17480 GatheredLoadsNodes.insert(&TE);
17481 if (!
TE.isGather() &&
TE.State != TreeEntry::SplitVectorize &&
17482 !(
TE.Idx == 0 && (
TE.getOpcode() == Instruction::InsertElement ||
17483 TE.getOpcode() == Instruction::Store))) {
17486 for (
Value *V :
TE.Scalars) {
17487 if (IsExternallyUsed(TE, V))
17488 DemandedElts.
setBit(
TE.findLaneForValue(V));
17490 if (!DemandedElts.
isZero()) {
17491 Type *ScalarTy =
TE.Scalars.front()->getType();
17492 auto It = MinBWs.find(&TE);
17493 if (It != MinBWs.end())
17497 *TTI, ScalarTy, VecTy, DemandedElts,
false,
17499 if (ExtCost.
isValid() && ExtCost != 0) {
17501 Scale = getScaleToLoopIterations(TE);
17505 ExtractCosts.try_emplace(&TE, ExtCost);
17514 constexpr unsigned PartLimit = 2;
17515 const unsigned Sz =
17517 const unsigned MinVF =
getMinVF(Sz);
17519 VectorizableTree.front()->Scalars.size() * PartLimit <= MinVF &&
17520 (!VectorizableTree.front()->hasState() ||
17521 (VectorizableTree.front()->getOpcode() != Instruction::Store &&
17522 LI->getLoopFor(VectorizableTree.front()->getMainOp()->getParent()))))
17529 std::tuple<InstructionCost, InstructionCost, SmallVector<unsigned>>>
17530 SubtreeCosts(VectorizableTree.size());
17531 auto UpdateParentNodes =
17532 [&](
const TreeEntry *UserTE,
const TreeEntry *
TE,
17534 SmallDenseSet<std::pair<const TreeEntry *, const TreeEntry *>, 4>
17536 bool AddToList =
true) {
17538 VisitedUser.insert(std::make_pair(TE, UserTE)).second) {
17539 std::get<0>(SubtreeCosts[UserTE->Idx]) += TotalCost;
17540 std::get<1>(SubtreeCosts[UserTE->Idx]) +=
Cost;
17542 std::get<2>(SubtreeCosts[UserTE->Idx]).push_back(
TE->Idx);
17543 UserTE = UserTE->UserTreeIndex.UserTE;
17546 for (
const std::unique_ptr<TreeEntry> &Ptr : VectorizableTree) {
17547 TreeEntry &
TE = *Ptr;
17550 std::get<0>(SubtreeCosts[
TE.Idx]) +=
C + ExtractCost;
17551 std::get<1>(SubtreeCosts[
TE.Idx]) +=
C;
17552 if (
const TreeEntry *UserTE =
TE.UserTreeIndex.UserTE) {
17553 SmallDenseSet<std::pair<const TreeEntry *, const TreeEntry *>, 4>
17555 UpdateParentNodes(UserTE, &TE,
C + ExtractCost,
C, VisitedUser);
17558 SmallDenseSet<std::pair<const TreeEntry *, const TreeEntry *>, 4> Visited;
17559 for (TreeEntry *TE : GatheredLoadsNodes) {
17562 for (
Value *V :
TE->Scalars) {
17563 for (
const TreeEntry *BVTE : ValueToGatherNodes.lookup(V))
17564 UpdateParentNodes(BVTE, TE, TotalCost,
Cost, Visited,
17569 using CostIndicesTy =
17571 SmallVector<unsigned>>>;
17572 struct FirstGreater {
17573 bool operator()(
const CostIndicesTy &
LHS,
const CostIndicesTy &
RHS)
const {
17574 return std::get<0>(
LHS.second) < std::get<0>(
RHS.second) ||
17575 (std::get<0>(
LHS.second) == std::get<0>(
RHS.second) &&
17576 LHS.first->Idx <
RHS.first->Idx);
17579 PriorityQueue<CostIndicesTy, SmallVector<CostIndicesTy>, FirstGreater>
17581 for (
const auto [Idx,
P] :
enumerate(SubtreeCosts))
17582 Worklist.emplace(VectorizableTree[Idx].
get(),
P);
17585 if (!UserIgnoreList && VectorizableTree.front()->getVectorFactor() < MinVF &&
17586 VectorizableTree.front()->hasState() &&
17587 VectorizableTree.front()->getOpcode() == Instruction::Store &&
17588 (Worklist.top().first->Idx == 0 || Worklist.top().first->Idx == 1))
17592 while (!Worklist.empty() && std::get<0>(Worklist.top().second) > 0) {
17593 TreeEntry *
TE = Worklist.top().first;
17594 if (
TE->isGather() ||
TE->Idx == 0 || DeletedNodes.contains(TE) ||
17597 (
TE->UserTreeIndex &&
17598 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize &&
17600 ArrayRef<TreeEntry *> Entries = getSplitTreeEntries(V);
17601 return Entries.size() > 1;
17607 if (
TE->State == TreeEntry::Vectorize && !
TE->isAltShuffle() &&
17608 (
TE->getOpcode() == Instruction::ICmp ||
17609 TE->getOpcode() == Instruction::FCmp) &&
17611 auto *I = dyn_cast<CmpInst>(V);
17614 return I->getPredicate() !=
17615 cast<CmpInst>(TE->getMainOp())->getPredicate();
17622 InstructionCost TotalSubtreeCost = std::get<0>(Worklist.top().second);
17624 if (TotalSubtreeCost < TE->Scalars.size()) {
17628 if (!TransformedToGatherNodes.empty()) {
17629 for (
unsigned Idx : std::get<2>(Worklist.top().second)) {
17630 auto It = TransformedToGatherNodes.find(VectorizableTree[Idx].
get());
17631 if (It != TransformedToGatherNodes.end()) {
17632 TotalSubtreeCost -= std::get<0>(SubtreeCosts[Idx]);
17633 SubtreeCost -= std::get<1>(SubtreeCosts[Idx]);
17634 TotalSubtreeCost += It->second;
17635 SubtreeCost += It->second;
17639 if (TotalSubtreeCost < 0 || TotalSubtreeCost < TE->Scalars.size()) {
17643 const unsigned Sz =
TE->Scalars.size();
17651 auto It = MinBWs.find(TE);
17652 if (It != MinBWs.end())
17655 ScalarTy =
TE->Scalars.front()->getType();
17657 const unsigned EntryVF =
TE->getVectorFactor();
17660 *TTI, ScalarTy, VecTy, DemandedElts,
17662 SmallVector<int>
Mask;
17663 if (!
TE->ReorderIndices.empty() &&
17664 TE->State != TreeEntry::CompressVectorize &&
17665 (
TE->State != TreeEntry::StridedVectorize ||
17667 SmallVector<int> NewMask;
17668 if (
TE->getOpcode() == Instruction::Store) {
17670 NewMask.
resize(
TE->ReorderIndices.size());
17677 if (!
TE->ReuseShuffleIndices.empty())
17684 if ((!
TE->hasState() || !
TE->isAltShuffle()) &&
17686 return (TE->hasCopyableElements() && TE->isCopyableElement(V)) ||
17687 isConstant(V) || isGathered(V) || getTreeEntries(V).size() > 1;
17691 if (TotalSubtreeCost > GatherCost) {
17694 if (VectorizableTree.front()->hasState() &&
17695 VectorizableTree.front()->getOpcode() == Instruction::InsertElement &&
17697 return InstructionCost::getInvalid();
17699 LLVM_DEBUG(
dbgs() <<
"SLP: Trimming unprofitable subtree at node "
17700 <<
TE->Idx <<
" with cost "
17701 << std::get<0>(Worklist.top().second)
17702 <<
" and gather cost " << GatherCost <<
".\n");
17703 if (
TE->UserTreeIndex) {
17704 TransformedToGatherNodes.try_emplace(TE, GatherCost);
17705 NodesCosts.erase(TE);
17707 DeletedNodes.insert(TE);
17708 TransformedToGatherNodes.erase(TE);
17709 NodesCosts.erase(TE);
17711 for (
unsigned Idx : std::get<2>(Worklist.top().second)) {
17712 TreeEntry &ChildTE = *VectorizableTree[Idx];
17713 DeletedNodes.insert(&ChildTE);
17714 TransformedToGatherNodes.erase(&ChildTE);
17715 NodesCosts.erase(&ChildTE);
17722 return std::get<1>(SubtreeCosts.front());
17724 SmallPtrSet<TreeEntry *, 4> GatheredLoadsToDelete;
17731 for (TreeEntry *TE : GatheredLoadsNodes) {
17732 if (DeletedNodes.contains(TE) || TransformedToGatherNodes.contains(TE))
17734 GatheredLoadsToDelete.
insert(TE);
17737 SmallDenseMap<const TreeEntry *, SmallVector<Value *>> ValuesToInsert;
17738 for (
Value *V :
TE->Scalars) {
17739 unsigned Pos =
TE->findLaneForValue(V);
17740 for (
const TreeEntry *BVE : ValueToGatherNodes.lookup(V)) {
17741 if (DeletedNodes.contains(BVE))
17743 DemandedElts.
setBit(Pos);
17744 ValuesToInsert.
try_emplace(BVE).first->second.push_back(V);
17747 if (!DemandedElts.
isZero()) {
17748 Type *ScalarTy =
TE->Scalars.front()->getType();
17749 auto It = MinBWs.find(TE);
17750 if (It != MinBWs.end())
17754 *TTI, ScalarTy, VecTy, DemandedElts,
17757 for (
const auto &[BVE, Values] : ValuesToInsert) {
17761 for (
Value *V : Values) {
17762 unsigned Pos = BVE->findLaneForValue(V);
17764 BVDemandedElts.
setBit(Pos);
17766 auto *BVVecTy =
getWidenedType(ScalarTy, BVE->getVectorFactor());
17768 *TTI, ScalarTy, BVVecTy, BVDemandedElts,
17772 if (ExtractsCost < BVCost) {
17773 LoadsExtractsCost += ExtractsCost;
17774 GatheredLoadsToDelete.
erase(TE);
17777 LoadsExtractsCost += BVCost;
17779 NodesCosts.erase(TE);
17783 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
17784 if (
TE->UserTreeIndex &&
17785 GatheredLoadsToDelete.
contains(
TE->UserTreeIndex.UserTE)) {
17786 DeletedNodes.insert(
TE.get());
17787 NodesCosts.erase(
TE.get());
17788 GatheredLoadsToDelete.
insert(
TE.get());
17790 if (GatheredLoadsToDelete.
contains(
TE.get()))
17791 DeletedNodes.insert(
TE.get());
17794 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
17795 if (!
TE->UserTreeIndex && TransformedToGatherNodes.contains(
TE.get())) {
17796 assert(
TE->getOpcode() == Instruction::Load &&
"Expected load only.");
17799 if (DeletedNodes.contains(
TE.get()))
17801 if (!NodesCosts.contains(
TE.get())) {
17803 getEntryCost(
TE.get(), VectorizedVals, CheckedExtracts);
17804 if (!
C.isValid() ||
C == 0) {
17805 NodesCosts.try_emplace(
TE.get(),
C);
17808 unsigned Scale = EntryToScale.
lookup(
TE.get());
17810 Scale = getScaleToLoopIterations(*
TE.get());
17812 NodesCosts.try_emplace(
TE.get(),
C);
17816 LLVM_DEBUG(
dbgs() <<
"SLP: Recalculate costs after tree trimming.\n");
17818 for (
const auto &
P : NodesCosts) {
17819 NewCost +=
P.second;
17820 LLVM_DEBUG(
dbgs() <<
"SLP: Adding cost " <<
P.second <<
" for bundle "
17823 <<
"SLP: Current total cost = " <<
Cost <<
"\n");
17825 if (NewCost + LoadsExtractsCost >=
Cost) {
17826 DeletedNodes.clear();
17827 TransformedToGatherNodes.clear();
17832 if (VectorizableTree.size()>= 2 && VectorizableTree.front()->hasState() &&
17833 VectorizableTree.front()->getOpcode() == Instruction::InsertElement &&
17834 TransformedToGatherNodes.contains(VectorizableTree[1].get()))
17835 return InstructionCost::getInvalid();
17836 if (VectorizableTree.size() >= 3 && VectorizableTree.front()->hasState() &&
17837 VectorizableTree.front()->getOpcode() == Instruction::InsertElement &&
17838 VectorizableTree[1]->hasState() &&
17839 VectorizableTree[1]->State == TreeEntry::Vectorize &&
17840 (VectorizableTree[1]->getOpcode() == Instruction::ZExt ||
17841 VectorizableTree[1]->getOpcode() == Instruction::SExt ||
17842 VectorizableTree[1]->getOpcode() == Instruction::Trunc) &&
17843 TransformedToGatherNodes.contains(VectorizableTree[2].get()))
17844 return InstructionCost::getInvalid();
17852template <
typename T>
struct ShuffledInsertData {
17856 MapVector<T, SmallVector<int>> ValueMasks;
17868 if (!
C.isValid() ||
C == 0)
17870 unsigned &Scale = EntryToScale.
try_emplace(&TE, 0).first->getSecond();
17872 Scale = getScaleToLoopIterations(TE, Scalar, U);
17876 if (UserIgnoreList) {
17878 assert(It != UserIgnoreList->end() &&
"Expected reduction instruction.");
17881 ReductionCost = ScaleCost(ReductionCost, *VectorizableTree.front().get(),
17882 nullptr, ReductionRoot);
17886 Cost += ReductionCost;
17890 constexpr unsigned CostLimit = 100;
17892 (VectorizableTree.size() - DeletedNodes.size()) *
17893 VectorizableTree.front()->getVectorFactor() <
17898 none_of(ExternalUses, [](
const ExternalUser &EU) {
17909 std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
17916 for (ExternalUser &EU : ExternalUses) {
17917 ScalarUserAndIdx.
emplace_back(EU.Scalar, EU.User, EU.Lane);
17920 for (ExternalUser &EU : ExternalUses) {
17921 LLVM_DEBUG(
dbgs() <<
"SLP: Computing cost for external use of TreeEntry "
17922 << EU.E.Idx <<
" in lane " << EU.Lane <<
"\n");
17924 else dbgs() <<
" User: nullptr\n");
17925 LLVM_DEBUG(
dbgs() <<
" Use: " << EU.Scalar->getNameOrAsOperand() <<
"\n");
17930 if (EphValues.count(EU.User))
17934 if (!CheckedScalarUser.
insert(std::make_pair(EU.Scalar, EU.User)).second ||
17936 CheckedScalarUser.
contains(std::make_pair(EU.Scalar,
nullptr))))
17944 (!DT->isReachableFromEntry(UserParent) || UserParent->isEHPad() ||
17950 !ExtractCostCalculated.
insert(EU.Scalar).second)
17963 if (!UsedInserts.
insert(VU).second)
17967 const TreeEntry *ScalarTE = &EU.E;
17970 [
this, VU](
const ShuffledInsertData<const TreeEntry *> &
Data) {
17975 Value *Op0 =
II->getOperand(0);
17982 if (It == ShuffledInserts.
end()) {
17984 Data.InsertElements.emplace_back(VU);
17986 VecId = ShuffledInserts.
size() - 1;
17987 auto It = MinBWs.find(ScalarTE);
17988 if (It != MinBWs.end() &&
17990 .
insert(std::make_pair(ScalarTE, FTy->getElementType()))
17992 unsigned BWSz = It->second.first;
17993 unsigned DstBWSz = DL->getTypeSizeInBits(FTy->getElementType());
17994 unsigned VecOpcode;
17995 if (DstBWSz < BWSz)
17996 VecOpcode = Instruction::Trunc;
17999 It->second.second ? Instruction::SExt : Instruction::ZExt;
18004 FTy->getNumElements()),
18007 <<
" for extending externally used vector with "
18008 "non-equal minimum bitwidth.\n");
18013 It->InsertElements.front() = VU;
18014 VecId = std::distance(ShuffledInserts.
begin(), It);
18016 int InIdx = *InsertIdx;
18018 ShuffledInserts[VecId].ValueMasks[ScalarTE];
18021 Mask[InIdx] = EU.Lane;
18022 DemandedElts[VecId].setBit(InIdx);
18033 auto *ScalarTy = EU.Scalar->getType();
18034 const unsigned BundleWidth = EU.E.getVectorFactor();
18035 assert(EU.Lane < BundleWidth &&
"Extracted lane out of bounds.");
18037 const TreeEntry *Entry = &EU.E;
18038 auto It = MinBWs.find(Entry);
18039 if (It != MinBWs.end()) {
18044 ? Instruction::ZExt
18045 : Instruction::SExt;
18050 << ExtraCost <<
"\n");
18054 CostKind, EU.Lane, EU.Scalar, ScalarUserAndIdx);
18055 LLVM_DEBUG(
dbgs() <<
" ExtractElement cost for " << *ScalarTy <<
" from "
18056 << *VecTy <<
": " << ExtraCost <<
"\n");
18059 if (Entry->Idx != 0 || Entry->getOpcode() == Instruction::GetElementPtr ||
18060 Entry->getOpcode() == Instruction::Load) {
18062 auto IsPhiInLoop = [&](
const ExternalUser &U) {
18065 const Loop *L = LI->getLoopFor(Phi->getParent());
18066 return L && (Phi->getParent() ==
I->getParent() ||
18067 L == LI->getLoopFor(
I->getParent()));
18071 if (!ValueToExtUses) {
18072 ValueToExtUses.emplace();
18073 for (
const auto &
P :
enumerate(ExternalUses)) {
18075 if (IsPhiInLoop(
P.value()))
18078 ValueToExtUses->try_emplace(
P.value().Scalar,
P.index());
18085 auto OperandIsScalar = [&](
Value *V) {
18091 return !EE->hasOneUse() || !MustGather.contains(EE);
18094 return ValueToExtUses->contains(V);
18096 bool CanBeUsedAsScalar =
all_of(Inst->operands(), OperandIsScalar);
18097 bool CanBeUsedAsScalarCast =
false;
18100 Op &&
all_of(
Op->operands(), OperandIsScalar)) {
18105 if (ScalarCost + OpCost <= ExtraCost) {
18106 CanBeUsedAsScalar = CanBeUsedAsScalarCast =
true;
18107 ScalarCost += OpCost;
18111 if (CanBeUsedAsScalar) {
18112 bool KeepScalar = ScalarCost <= ExtraCost;
18116 bool IsProfitablePHIUser =
18118 VectorizableTree.front()->Scalars.size() > 2)) &&
18119 VectorizableTree.front()->hasState() &&
18120 VectorizableTree.front()->getOpcode() == Instruction::PHI &&
18124 auto *PHIUser = dyn_cast<PHINode>(U);
18125 return (!PHIUser ||
18126 PHIUser->getParent() !=
18128 VectorizableTree.front()->getMainOp())
18133 return ValueToExtUses->contains(V);
18135 if (IsProfitablePHIUser) {
18139 (!GatheredLoadsEntriesFirst.has_value() ||
18140 Entry->Idx < *GatheredLoadsEntriesFirst)) {
18141 unsigned ScalarUsesCount =
count_if(Entry->Scalars, [&](
Value *V) {
18142 return ValueToExtUses->contains(V);
18144 auto It = ExtractsCount.
find(Entry);
18145 if (It != ExtractsCount.
end()) {
18146 assert(ScalarUsesCount >= It->getSecond().size() &&
18147 "Expected total number of external uses not less than "
18148 "number of scalar uses.");
18149 ScalarUsesCount -= It->getSecond().size();
18154 KeepScalar = ScalarUsesCount <= 1 || !
has_single_bit(ScalarUsesCount);
18157 ExternalUsesAsOriginalScalar.insert(EU.Scalar);
18158 for (
Value *V : Inst->operands()) {
18159 auto It = ValueToExtUses->find(V);
18160 if (It != ValueToExtUses->end()) {
18162 ExternalUses[It->second].User =
nullptr;
18165 ExtraCost = ScalarCost;
18166 if (!IsPhiInLoop(EU))
18167 ExtractsCount[Entry].
insert(Inst);
18168 if (CanBeUsedAsScalarCast) {
18169 ScalarOpsFromCasts.
insert(Inst->getOperand(0));
18173 for (
Value *V : IOp->operands()) {
18174 auto It = ValueToExtUses->find(V);
18175 if (It != ValueToExtUses->end()) {
18177 ExternalUses[It->second].User =
nullptr;
18186 ExtraCost = ScaleCost(ExtraCost, *Entry, EU.Scalar,
18189 ExtractCost += ExtraCost;
18193 for (
Value *V : ScalarOpsFromCasts) {
18194 ExternalUsesAsOriginalScalar.insert(V);
18196 const auto *It =
find_if_not(TEs, [&](TreeEntry *TE) {
18197 return TransformedToGatherNodes.contains(TE) ||
18198 DeletedNodes.contains(TE);
18200 if (It != TEs.end()) {
18201 const TreeEntry *UserTE = *It;
18202 ExternalUses.emplace_back(V,
nullptr, *UserTE,
18203 UserTE->findLaneForValue(V));
18208 if (!VectorizedVals.
empty()) {
18209 const TreeEntry &Root = *VectorizableTree.front();
18210 auto BWIt = MinBWs.find(&Root);
18211 if (BWIt != MinBWs.end()) {
18212 Type *DstTy = Root.Scalars.front()->getType();
18213 unsigned OriginalSz = DL->getTypeSizeInBits(DstTy->
getScalarType());
18215 ReductionBitWidth == 0 ? BWIt->second.first : ReductionBitWidth;
18216 if (OriginalSz != SrcSz) {
18217 unsigned Opcode = Instruction::Trunc;
18218 if (OriginalSz > SrcSz)
18219 Opcode = BWIt->second.second ? Instruction::SExt : Instruction::ZExt;
18226 TTI->getCastInstrCost(Opcode, DstTy, SrcTy,
18229 CastCost = ScaleCost(CastCost, Root,
nullptr, ReductionRoot);
18239 VectorizableTree[1]->hasState() &&
18240 VectorizableTree[1]->State == TreeEntry::Vectorize &&
18241 all_of(VectorizableTree[1]->Scalars, [&](
Value *V) {
18242 return ExternalUsesAsOriginalScalar.contains(V);
18246 Cost += ExtractCost;
18247 auto &&ResizeToVF = [
this, &Cost](
const TreeEntry *TE,
ArrayRef<int> Mask,
18248 bool ForSingleMask) {
18250 unsigned VF = Mask.size();
18251 unsigned VecVF = TE->getVectorFactor();
18252 bool HasLargeIndex =
18253 any_of(Mask, [VF](
int Idx) {
return Idx >=
static_cast<int>(VF); });
18254 if ((VF != VecVF && HasLargeIndex) ||
18257 if (HasLargeIndex) {
18259 std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)),
18265 dbgs() <<
"SLP: Adding cost " <<
C
18266 <<
" for final shuffle of insertelement external users.\n";
18267 TE->dump();
dbgs() <<
"SLP: Current total cost = " << Cost <<
"\n");
18269 return std::make_pair(TE,
true);
18272 if (!ForSingleMask) {
18274 for (
unsigned I = 0;
I < VF; ++
I) {
18276 ResizeMask[Mask[
I]] = Mask[
I];
18283 dbgs() <<
"SLP: Adding cost " <<
C
18284 <<
" for final shuffle of insertelement external users.\n";
18285 TE->dump();
dbgs() <<
"SLP: Current total cost = " << Cost <<
"\n");
18290 return std::make_pair(TE,
false);
18293 for (
int I = 0, E = ShuffledInserts.
size();
I < E; ++
I) {
18294 Value *
Base = ShuffledInserts[
I].InsertElements.
front()->getOperand(0);
18295 auto Vector = ShuffledInserts[
I].ValueMasks.takeVector();
18299 assert((TEs.size() == 1 || TEs.size() == 2) &&
18300 "Expected exactly 1 or 2 tree entries.");
18301 if (TEs.size() == 1) {
18303 VF = TEs.front()->getVectorFactor();
18304 auto *FTy =
getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
18308 (
Data.index() < VF &&
18309 static_cast<int>(
Data.index()) ==
Data.value());
18313 C = ScaleCost(
C, *TEs.front());
18315 <<
" for final shuffle of insertelement "
18316 "external users.\n";
18317 TEs.front()->
dump();
18318 dbgs() <<
"SLP: Current total cost = " << Cost <<
"\n");
18324 TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor())
18325 VF = TEs.front()->getVectorFactor();
18329 auto *FTy =
getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
18332 C = ScaleCost(
C, *TEs.back());
18334 <<
" for final shuffle of vector node and external "
18335 "insertelement users.\n";
18336 if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump();
18337 dbgs() <<
"SLP: Current total cost = " << Cost <<
"\n");
18345 [](
const TreeEntry *E) { return E->getVectorFactor(); }, ResizeToVF,
18346 EstimateShufflesCost);
18349 ShuffledInserts[
I].InsertElements.
front()->getType()),
18352 Cost -= InsertCost;
18356 if (ReductionBitWidth != 0) {
18357 assert(UserIgnoreList &&
"Expected reduction tree.");
18358 const TreeEntry &E = *VectorizableTree.front();
18359 auto It = MinBWs.find(&E);
18360 if (It != MinBWs.end() && It->second.first != ReductionBitWidth) {
18361 unsigned SrcSize = It->second.first;
18362 unsigned DstSize = ReductionBitWidth;
18363 unsigned Opcode = Instruction::Trunc;
18364 if (SrcSize < DstSize) {
18365 bool IsArithmeticExtendedReduction =
18368 return is_contained({Instruction::Add, Instruction::FAdd,
18369 Instruction::Mul, Instruction::FMul,
18370 Instruction::And, Instruction::Or,
18374 if (IsArithmeticExtendedReduction)
18376 Instruction::BitCast;
18378 Opcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
18380 if (Opcode != Instruction::BitCast) {
18382 getWidenedType(Builder.getIntNTy(SrcSize), E.getVectorFactor());
18384 getWidenedType(Builder.getIntNTy(DstSize), E.getVectorFactor());
18386 switch (E.getOpcode()) {
18387 case Instruction::SExt:
18388 case Instruction::ZExt:
18389 case Instruction::Trunc: {
18390 const TreeEntry *OpTE = getOperandEntry(&E, 0);
18391 CCH = getCastContextHint(*OpTE);
18398 TTI->getCastInstrCost(Opcode, DstVecTy, SrcVecTy, CCH,
18400 CastCost = ScaleCost(CastCost, *VectorizableTree.front().get(),
18401 nullptr, ReductionRoot);
18404 <<
" for final resize for reduction from " << SrcVecTy
18405 <<
" to " << DstVecTy <<
"\n";
18406 dbgs() <<
"SLP: Current total cost = " << Cost <<
"\n");
18411 std::optional<InstructionCost> SpillCost;
18414 Cost += *SpillCost;
18420 OS <<
"SLP: Spill Cost = ";
18425 OS <<
".\nSLP: Extract Cost = " << ExtractCost <<
".\n";
18427 OS <<
"SLP: Reduction Cost = " << ReductionCost <<
".\n";
18428 OS <<
"SLP: Total Cost = " << Cost <<
".\n";
18432 ViewGraph(
this,
"SLP" + F->getName(),
false, Str);
18443std::optional<TTI::ShuffleKind>
18444BoUpSLP::tryToGatherSingleRegisterExtractElements(
18450 for (
int I = 0, E = VL.
size();
I < E; ++
I) {
18466 if (Idx >= VecTy->getNumElements()) {
18470 SmallBitVector ExtractMask(VecTy->getNumElements(),
true);
18471 ExtractMask.reset(*Idx);
18476 VectorOpToIdx[EI->getVectorOperand()].push_back(
I);
18481 stable_sort(Vectors, [](
const auto &P1,
const auto &P2) {
18482 return P1.second.size() >
P2.second.size();
18485 const int UndefSz = UndefVectorExtracts.
size();
18486 unsigned SingleMax = 0;
18487 unsigned PairMax = 0;
18488 if (!Vectors.
empty()) {
18489 SingleMax = Vectors.
front().second.size() + UndefSz;
18490 if (Vectors.
size() > 1) {
18491 auto *ItNext = std::next(Vectors.
begin());
18492 PairMax = SingleMax + ItNext->second.size();
18495 if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
18496 return std::nullopt;
18502 if (SingleMax >= PairMax && SingleMax) {
18503 for (
int Idx : Vectors.
front().second)
18504 std::swap(GatheredExtracts[Idx], VL[Idx]);
18505 }
else if (!Vectors.
empty()) {
18506 for (
unsigned Idx : {0, 1})
18507 for (
int Idx : Vectors[Idx].second)
18508 std::swap(GatheredExtracts[Idx], VL[Idx]);
18511 for (
int Idx : UndefVectorExtracts)
18512 std::swap(GatheredExtracts[Idx], VL[Idx]);
18515 std::optional<TTI::ShuffleKind> Res =
18521 return std::nullopt;
18525 for (
int I = 0,
E = GatheredExtracts.size();
I <
E; ++
I) {
18546BoUpSLP::tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
18547 SmallVectorImpl<int> &Mask,
18548 unsigned NumParts)
const {
18549 assert(NumParts > 0 &&
"NumParts expected be greater than or equal to 1.");
18556 const unsigned PartOffset = Part * SliceSize;
18559 if (PartOffset + PartSize > VL.
size())
18563 SmallVector<int> SubMask;
18564 std::optional<TTI::ShuffleKind> Res =
18565 tryToGatherSingleRegisterExtractElements(SubVL, SubMask);
18566 ShufflesRes[Part] = Res;
18567 copy(SubMask, std::next(
Mask.begin(), Part * SliceSize));
18569 if (
none_of(ShufflesRes, [](
const std::optional<TTI::ShuffleKind> &Res) {
18570 return Res.has_value();
18572 ShufflesRes.clear();
18573 return ShufflesRes;
18576std::optional<TargetTransformInfo::ShuffleKind>
18577BoUpSLP::isGatherShuffledSingleRegisterEntry(
18579 SmallVectorImpl<const TreeEntry *> &Entries,
unsigned Part,
bool ForOrder) {
18582 return std::nullopt;
18585 auto GetUserEntry = [&](
const TreeEntry *
TE) {
18586 while (
TE->UserTreeIndex &&
TE->UserTreeIndex.EdgeIdx == UINT_MAX)
18587 TE =
TE->UserTreeIndex.UserTE;
18588 if (TE == VectorizableTree.front().get())
18589 return EdgeInfo(
const_cast<TreeEntry *
>(TE), 0);
18590 return TE->UserTreeIndex;
18592 auto HasGatherUser = [&](
const TreeEntry *
TE) {
18593 while (
TE->Idx != 0 &&
TE->UserTreeIndex) {
18594 if (
TE->UserTreeIndex.EdgeIdx == UINT_MAX)
18596 TE =
TE->UserTreeIndex.UserTE;
18600 const EdgeInfo TEUseEI = GetUserEntry(TE);
18601 if (!TEUseEI || (TEUseEI.UserTE->Idx == 0 && TEUseEI.UserTE->isGather() &&
18602 !TEUseEI.UserTE->hasState()))
18603 return std::nullopt;
18604 const Instruction *TEInsertPt = &getLastInstructionInBundle(TEUseEI.UserTE);
18609 TEUseEI.UserTE->hasState() ? TEUseEI.UserTE->getMainOp() :
nullptr);
18610 PHI && TEUseEI.UserTE->State != TreeEntry::SplitVectorize) {
18611 TEInsertBlock =
PHI->getIncomingBlock(TEUseEI.EdgeIdx);
18614 TEInsertBlock = TEInsertPt->
getParent();
18616 if (!DT->isReachableFromEntry(TEInsertBlock))
18617 return std::nullopt;
18618 auto *NodeUI = DT->getNode(TEInsertBlock);
18619 assert(NodeUI &&
"Should only process reachable instructions");
18621 auto CheckOrdering = [&](
const Instruction *InsertPt) {
18634 const BasicBlock *InsertBlock = InsertPt->getParent();
18635 auto *NodeEUI = DT->getNode(InsertBlock);
18638 assert((NodeUI == NodeEUI) ==
18639 (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) &&
18640 "Different nodes should have different DFS numbers");
18642 if (TEInsertPt->
getParent() != InsertBlock &&
18643 (DT->dominates(NodeUI, NodeEUI) || !DT->dominates(NodeEUI, NodeUI)))
18645 if (TEInsertPt->
getParent() == InsertBlock &&
18658 SmallDenseMap<Value *, int> UsedValuesEntry;
18659 SmallPtrSet<const Value *, 16> VisitedValue;
18660 bool IsReusedNodeFound =
false;
18661 auto CheckAndUseSameNode = [&](
const TreeEntry *TEPtr) {
18663 if (IsReusedNodeFound)
18665 if ((TEPtr->getVectorFactor() != VL.
size() &&
18666 TEPtr->Scalars.size() != VL.
size()) ||
18667 (!TEPtr->isSame(VL) && !TEPtr->isSame(
TE->Scalars)))
18669 IsReusedNodeFound =
18670 equal(
TE->Scalars, TEPtr->Scalars) &&
18671 equal(
TE->ReorderIndices, TEPtr->ReorderIndices) &&
18672 equal(
TE->ReuseShuffleIndices, TEPtr->ReuseShuffleIndices);
18675 for (
Value *V : VL) {
18682 auto CheckParentNodes = [&](
const TreeEntry *User1,
const TreeEntry *User2,
18683 unsigned EdgeIdx) {
18684 const TreeEntry *Ptr1 = User1;
18685 const TreeEntry *Ptr2 = User2;
18686 SmallDenseMap<const TreeEntry *, unsigned> PtrToIdx;
18689 EdgeIdx = Ptr2->UserTreeIndex.EdgeIdx;
18690 Ptr2 = Ptr2->UserTreeIndex.UserTE;
18693 unsigned Idx = Ptr1->UserTreeIndex.EdgeIdx;
18694 Ptr1 = Ptr1->UserTreeIndex.UserTE;
18695 if (
auto It = PtrToIdx.
find(Ptr1); It != PtrToIdx.
end())
18696 return Idx < It->second;
18700 auto CheckNonSchedulableOrdering = [&](
const TreeEntry *
E,
18702 return TEUseEI && TEUseEI.UserTE && TEUseEI.UserTE->hasCopyableElements() &&
18703 !TEUseEI.UserTE->isCopyableElement(
18706 InsertPt->getNextNode() == TEInsertPt &&
18707 (!
E->hasCopyableElements() || !
E->isCopyableElement(InsertPt) ||
18710 for (
Value *V : VL) {
18714 SmallPtrSet<const TreeEntry *, 4> VToTEs;
18716 ValueToGatherNodes.lookup(V).takeVector());
18717 if (TransformedToGatherNodes.contains(TE)) {
18718 for (TreeEntry *
E : getSplitTreeEntries(V)) {
18719 if (TE ==
E || !TransformedToGatherNodes.contains(
E) ||
18720 !
E->UserTreeIndex ||
E->UserTreeIndex.UserTE->isGather())
18722 GatherNodes.push_back(
E);
18724 for (TreeEntry *
E : getTreeEntries(V)) {
18725 if (TE ==
E || !TransformedToGatherNodes.contains(
E) ||
18726 !
E->UserTreeIndex ||
E->UserTreeIndex.UserTE->isGather())
18728 GatherNodes.push_back(
E);
18731 for (
const TreeEntry *TEPtr : GatherNodes) {
18732 if (TEPtr == TE || TEPtr->Idx == 0 || DeletedNodes.contains(TEPtr))
18735 [&](
Value *V) { return GatheredScalars.contains(V); }) &&
18736 "Must contain at least single gathered value.");
18737 assert(TEPtr->UserTreeIndex &&
18738 "Expected only single user of a gather node.");
18739 if (
any_of(TEPtr->CombinedEntriesWithIndices,
18740 [&](
const auto &
P) { return P.first == TE->Idx; }))
18742 const EdgeInfo &UseEI = TEPtr->UserTreeIndex;
18744 PHINode *UserPHI = (UseEI.UserTE->State != TreeEntry::SplitVectorize &&
18745 UseEI.UserTE->hasState())
18750 : &getLastInstructionInBundle(UseEI.UserTE);
18751 if (TEInsertPt == InsertPt) {
18753 if (TEUseEI.UserTE->State == TreeEntry::Vectorize &&
18754 (TEUseEI.UserTE->getOpcode() != Instruction::PHI ||
18755 TEUseEI.UserTE->isAltShuffle()) &&
18757 if (UseEI.UserTE->State != TreeEntry::Vectorize ||
18758 (UseEI.UserTE->hasState() &&
18759 UseEI.UserTE->getOpcode() == Instruction::PHI &&
18760 !UseEI.UserTE->isAltShuffle()) ||
18769 (TEUseEI.UserTE != UseEI.UserTE || TEUseEI.EdgeIdx < UseEI.EdgeIdx))
18772 if (TEUseEI.UserTE->State == TreeEntry::Vectorize &&
18773 TEUseEI.UserTE->getOpcode() == Instruction::PHI &&
18774 UseEI.UserTE->State == TreeEntry::Vectorize &&
18775 UseEI.UserTE->getOpcode() == Instruction::PHI &&
18776 TEUseEI.UserTE != UseEI.UserTE)
18781 if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx)
18785 if (TEUseEI.UserTE != UseEI.UserTE &&
18786 (TEUseEI.UserTE->Idx < UseEI.UserTE->Idx ||
18787 HasGatherUser(TEUseEI.UserTE)))
18790 if (CheckParentNodes(TEUseEI.UserTE, UseEI.UserTE, UseEI.EdgeIdx))
18794 if (!TEUseEI.UserTE->isGather() && !UserPHI &&
18795 TEUseEI.UserTE->doesNotNeedToSchedule() !=
18796 UseEI.UserTE->doesNotNeedToSchedule() &&
18801 if ((TEInsertBlock != InsertPt->
getParent() ||
18802 TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) &&
18803 (!CheckOrdering(InsertPt) ||
18804 (UseEI.UserTE->hasCopyableElements() &&
18809 if (CheckAndUseSameNode(TEPtr))
18814 if (CheckNonSchedulableOrdering(UseEI.UserTE, InsertPt))
18819 const auto *It =
find_if(VTEs, [&](
const TreeEntry *MTE) {
18820 return MTE !=
TE && MTE != TEUseEI.UserTE &&
18821 !DeletedNodes.contains(MTE) &&
18822 !TransformedToGatherNodes.contains(MTE);
18824 if (It != VTEs.end()) {
18825 const TreeEntry *VTE = *It;
18826 if (
none_of(
TE->CombinedEntriesWithIndices,
18827 [&](
const auto &
P) { return P.first == VTE->Idx; })) {
18828 Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
18829 if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
18833 if (CheckAndUseSameNode(VTE))
18839 const auto *It =
find_if(VTEs, [&, MainTE = TE](
const TreeEntry *TE) {
18840 return TE != MainTE && !DeletedNodes.contains(TE) &&
18841 !TransformedToGatherNodes.contains(TE);
18843 if (It != VTEs.end()) {
18844 const TreeEntry *VTE = *It;
18845 if (ForOrder && VTE->Idx < GatheredLoadsEntriesFirst.value_or(0) &&
18846 VTEs.size() > 1 && VTE->State != TreeEntry::Vectorize) {
18847 VTEs = VTEs.drop_front();
18849 const auto *MIt =
find_if(VTEs, [](
const TreeEntry *MTE) {
18850 return MTE->State == TreeEntry::Vectorize;
18852 if (MIt == VTEs.end())
18856 if (
none_of(
TE->CombinedEntriesWithIndices,
18857 [&](
const auto &
P) { return P.first == VTE->Idx; })) {
18858 Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
18859 if (&LastBundleInst == TEInsertPt ||
18860 !CheckOrdering(&LastBundleInst) ||
18861 CheckNonSchedulableOrdering(VTE, &LastBundleInst))
18865 if (CheckAndUseSameNode(VTE))
18870 if (IsReusedNodeFound)
18872 if (VToTEs.
empty())
18874 if (UsedTEs.
empty()) {
18882 SmallPtrSet<const TreeEntry *, 4> SavedVToTEs(VToTEs);
18884 for (SmallPtrSet<const TreeEntry *, 4> &Set : UsedTEs) {
18888 if (!VToTEs.
empty()) {
18894 VToTEs = SavedVToTEs;
18899 if (Idx == UsedTEs.
size()) {
18903 if (UsedTEs.
size() == 2)
18905 UsedTEs.push_back(SavedVToTEs);
18906 Idx = UsedTEs.
size() - 1;
18912 if (UsedTEs.
empty()) {
18914 return std::nullopt;
18918 if (UsedTEs.
size() == 1) {
18921 UsedTEs.front().
end());
18922 sort(FirstEntries, [](
const TreeEntry *TE1,
const TreeEntry *TE2) {
18923 return TE1->Idx < TE2->Idx;
18926 auto *It =
find_if(FirstEntries, [=](
const TreeEntry *EntryPtr) {
18927 return EntryPtr->isSame(VL) || EntryPtr->isSame(
TE->Scalars);
18929 if (It != FirstEntries.end() &&
18930 (IsReusedNodeFound || (*It)->getVectorFactor() == VL.size() ||
18931 ((*It)->getVectorFactor() ==
TE->Scalars.size() &&
18932 TE->ReuseShuffleIndices.size() == VL.size() &&
18933 (*It)->isSame(
TE->Scalars)))) {
18935 if (IsReusedNodeFound || (*It)->getVectorFactor() == VL.size()) {
18936 std::iota(std::next(
Mask.begin(), Part * VL.size()),
18937 std::next(
Mask.begin(), (Part + 1) * VL.size()), 0);
18939 SmallVector<int> CommonMask =
TE->getCommonMask();
18950 Entries.
push_back(FirstEntries.front());
18952 for (
auto &
P : UsedValuesEntry)
18954 VF = FirstEntries.front()->getVectorFactor();
18957 assert(UsedTEs.
size() == 2 &&
"Expected at max 2 permuted entries.");
18959 DenseMap<int, const TreeEntry *> VFToTE;
18960 for (
const TreeEntry *TE : UsedTEs.front()) {
18961 unsigned VF =
TE->getVectorFactor();
18962 auto It = VFToTE.
find(VF);
18963 if (It != VFToTE.
end()) {
18964 if (It->second->Idx >
TE->Idx)
18965 It->getSecond() =
TE;
18972 UsedTEs.back().
end());
18973 sort(SecondEntries, [](
const TreeEntry *TE1,
const TreeEntry *TE2) {
18974 return TE1->Idx < TE2->Idx;
18976 for (
const TreeEntry *TE : SecondEntries) {
18977 auto It = VFToTE.
find(
TE->getVectorFactor());
18978 if (It != VFToTE.
end()) {
18987 if (Entries.
empty()) {
18989 UsedTEs.front(), [](
const TreeEntry *TE1,
const TreeEntry *TE2) {
18990 return TE1->Idx < TE2->Idx;
18992 Entries.
push_back(SecondEntries.front());
18993 VF = std::max(Entries.
front()->getVectorFactor(),
18994 Entries.
back()->getVectorFactor());
18996 VF = Entries.
front()->getVectorFactor();
18999 for (
const TreeEntry *
E : Entries)
19003 for (
auto &
P : UsedValuesEntry) {
19005 if (ValuesToEntries[Idx].
contains(
P.first)) {
19015 auto AreCompatiblePHIs = [&](
Value *
V,
Value *V1) {
19022 for (
int I = 0,
E =
PHI->getNumIncomingValues();
I <
E; ++
I) {
19024 Value *In1 = PHI1->getIncomingValue(
I);
19039 auto MightBeIgnored = [=](
Value *
V) {
19043 !areAllUsersVectorized(
I, UserIgnoreList) &&
isSimple(
I);
19048 auto NeighborMightBeIgnored = [&](
Value *
V,
int Idx) {
19049 Value *V1 = VL[Idx];
19050 bool UsedInSameVTE =
false;
19051 auto It = UsedValuesEntry.find(V1);
19052 if (It != UsedValuesEntry.end())
19053 UsedInSameVTE = It->second == UsedValuesEntry.find(V)->second;
19054 return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE &&
19061 SmallBitVector UsedIdxs(Entries.size());
19063 for (
int I = 0,
E = VL.size();
I <
E; ++
I) {
19065 auto It = UsedValuesEntry.find(V);
19066 if (It == UsedValuesEntry.end())
19072 ((
I > 0 && NeighborMightBeIgnored(V,
I - 1)) ||
19073 (
I !=
E - 1 && NeighborMightBeIgnored(V,
I + 1)))))
19075 unsigned Idx = It->second;
19082 for (
unsigned I = 0, Sz = Entries.size();
I < Sz; ++
I) {
19083 if (!UsedIdxs.test(
I))
19089 for (std::pair<unsigned, int> &Pair : EntryLanes)
19090 if (Pair.first ==
I)
19091 Pair.first = TempEntries.
size();
19094 Entries.swap(TempEntries);
19095 if (EntryLanes.size() == Entries.size() &&
19097 .slice(Part * VL.size(),
19098 std::min<int>(VL.size(),
TE->Scalars.size())))) {
19104 return std::nullopt;
19107 bool IsIdentity = Entries.size() == 1;
19110 for (
const std::pair<unsigned, int> &Pair : EntryLanes) {
19111 unsigned Idx = Part * VL.size() + Pair.second;
19114 (ForOrder ? std::distance(
19115 Entries[Pair.first]->Scalars.begin(),
19116 find(Entries[Pair.first]->Scalars, VL[Pair.second]))
19117 : Entries[Pair.first]->findLaneForValue(VL[Pair.second]));
19118 IsIdentity &=
Mask[Idx] == Pair.second;
19120 if (ForOrder || IsIdentity || Entries.empty()) {
19121 switch (Entries.size()) {
19123 if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
19127 if (EntryLanes.size() > 2 || VL.size() <= 2)
19134 (EntryLanes.size() > Entries.size() || VL.size() <= 2)) {
19136 SmallVector<int> SubMask(std::next(
Mask.begin(), Part * VL.size()),
19137 std::next(
Mask.begin(), (Part + 1) * VL.size()));
19138 int MinElement = SubMask.
front(), MaxElement = SubMask.
front();
19139 for (
int Idx : SubMask) {
19147 assert(MaxElement >= 0 && MinElement >= 0 &&
19148 MaxElement % VF >= MinElement % VF &&
19149 "Expected at least single element.");
19150 unsigned NewVF = std::max<unsigned>(
19152 (MaxElement % VF) -
19153 (MinElement % VF) + 1));
19155 for (
int &Idx : SubMask) {
19158 Idx = ((Idx % VF) - (((MinElement % VF) / NewVF) * NewVF)) % NewVF +
19159 (Idx >=
static_cast<int>(VF) ? NewVF : 0);
19167 auto *MaskVecTy =
getWidenedType(VL.front()->getType(), SubMask.size());
19168 auto GetShuffleCost = [&,
19169 &TTI = *TTI](ArrayRef<int>
Mask,
19172 if (Entries.size() == 1 && Entries.front()->getInterleaveFactor() > 0 &&
19174 Mask, Entries.front()->getInterleaveFactor()))
19176 return ::getShuffleCost(TTI,
19181 InstructionCost ShuffleCost = GetShuffleCost(SubMask, Entries, VecTy);
19183 SmallVector<int> FirstMask(SubMask.begin(), SubMask.end());
19184 if (Entries.size() == 1 || !Entries[0]->isGather()) {
19185 FirstShuffleCost = ShuffleCost;
19189 bool IsIdentity =
true;
19190 for (
auto [
I, Idx] :
enumerate(FirstMask)) {
19191 if (Idx >=
static_cast<int>(NewVF)) {
19196 IsIdentity &=
static_cast<int>(
I) == Idx;
19200 FirstShuffleCost = GetShuffleCost(FirstMask, Entries.front(), VecTy);
19202 *TTI, VL.front()->getType(), MaskVecTy, DemandedElts,
true,
19206 SmallVector<int> SecondMask(SubMask.begin(), SubMask.end());
19207 if (Entries.size() == 1 || !Entries[1]->isGather()) {
19208 SecondShuffleCost = ShuffleCost;
19212 bool IsIdentity =
true;
19213 for (
auto [
I, Idx] :
enumerate(SecondMask)) {
19214 if (Idx <
static_cast<int>(NewVF) && Idx >= 0) {
19220 IsIdentity &=
static_cast<int>(
I) == Idx;
19225 SecondShuffleCost = GetShuffleCost(SecondMask, Entries[1], VecTy);
19227 *TTI, VL.front()->getType(), MaskVecTy, DemandedElts,
true,
19235 *TTI, VL.front()->getType(), MaskVecTy, DemandedElts,
true,
19237 const TreeEntry *BestEntry =
nullptr;
19238 if (FirstShuffleCost < ShuffleCost) {
19239 std::for_each(std::next(
Mask.begin(), Part * VL.size()),
19240 std::next(
Mask.begin(), (Part + 1) * VL.size()),
19242 if (Idx >= static_cast<int>(VF))
19243 Idx = PoisonMaskElem;
19245 BestEntry = Entries.front();
19246 ShuffleCost = FirstShuffleCost;
19248 if (SecondShuffleCost < ShuffleCost) {
19249 std::for_each(std::next(
Mask.begin(), Part * VL.size()),
19250 std::next(
Mask.begin(), (Part + 1) * VL.size()),
19252 if (Idx < static_cast<int>(VF))
19253 Idx = PoisonMaskElem;
19257 BestEntry = Entries[1];
19258 ShuffleCost = SecondShuffleCost;
19260 if (BuildVectorCost >= ShuffleCost) {
19263 Entries.push_back(BestEntry);
19271 std::fill(std::next(
Mask.begin(), Part * VL.size()),
19273 return std::nullopt;
19277BoUpSLP::isGatherShuffledEntry(
19281 assert(NumParts > 0 && NumParts < VL.
size() &&
19282 "Expected positive number of registers.");
19285 if (TE == VectorizableTree.front().get() &&
19286 (!GatheredLoadsEntriesFirst.has_value() ||
19288 [](
const std::unique_ptr<TreeEntry> &TE) {
19289 return !
TE->isGather();
19294 if (
TE->hasNonWholeRegisterOrNonPowerOf2Vec(*TTI))
19297 assert((
TE->UserTreeIndex || TE == VectorizableTree.front().get()) &&
19298 "Expected only single user of the gather node.");
19300 "Number of scalars must be divisible by NumParts.");
19301 if (
TE->UserTreeIndex &&
TE->UserTreeIndex.UserTE->isGather() &&
19302 TE->UserTreeIndex.EdgeIdx == UINT_MAX &&
19304 (
TE->hasState() &&
TE->getOpcode() == Instruction::ExtractElement) ||
19307 getSameValuesTreeEntry(
TE->getMainOp(),
TE->Scalars))))
19314 SmallVectorImpl<const TreeEntry *> &SubEntries = Entries.emplace_back();
19315 std::optional<TTI::ShuffleKind> SubRes =
19316 isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part,
19319 SubEntries.
clear();
19322 SubEntries.
front()->getVectorFactor() == VL.
size() &&
19323 (SubEntries.
front()->isSame(
TE->Scalars) ||
19324 SubEntries.
front()->isSame(VL))) {
19326 LocalSubEntries.
swap(SubEntries);
19329 std::iota(
Mask.begin(),
Mask.end(), 0);
19331 for (
int I = 0, Sz = VL.
size();
I < Sz; ++
I)
19334 Entries.emplace_back(1, LocalSubEntries.
front());
19340 [](
const std::optional<TTI::ShuffleKind> &SK) {
return !SK; })) {
19348 Type *ScalarTy)
const {
19349 const unsigned VF = VL.
size();
19357 auto EstimateInsertCost = [&](
unsigned I,
Value *
V) {
19359 if (
V->getType() != ScalarTy)
19360 Cost += TTI->getCastInstrCost(Instruction::Trunc, ScalarTy,
V->getType(),
19364 std::iota(ConstantShuffleMask.begin(), ConstantShuffleMask.end(), 0);
19371 ConstantShuffleMask[
I] =
I + VF;
19374 EstimateInsertCost(
I, V);
19377 bool IsAnyNonUndefConst =
19380 if (!ForPoisonSrc && IsAnyNonUndefConst) {
19382 ConstantShuffleMask);
19386 if (!DemandedElements.
isZero())
19390 ForPoisonSrc && !IsAnyNonUndefConst, VL);
19394Instruction &BoUpSLP::getLastInstructionInBundle(
const TreeEntry *
E) {
19395 auto It = EntryToLastInstruction.find(
E);
19396 if (It != EntryToLastInstruction.end())
19404 if (
E->hasState()) {
19405 Front =
E->getMainOp();
19406 Opcode =
E->getOpcode();
19413 ((GatheredLoadsEntriesFirst.has_value() && Opcode == Instruction::Load &&
19414 E->isGather() &&
E->Idx < *GatheredLoadsEntriesFirst) ||
19415 E->State == TreeEntry::SplitVectorize ||
E->hasCopyableElements() ||
19417 [=](
Value *V) ->
bool {
19418 if (Opcode == Instruction::GetElementPtr &&
19419 !isa<GetElementPtrInst>(V))
19421 auto *I = dyn_cast<Instruction>(V);
19422 return !I || !E->getMatchingMainOpOrAltOp(I) ||
19423 I->getParent() == BB || isVectorLikeInstWithConstOps(I);
19425 "Expected gathered loads or GEPs or instructions from same basic "
19428 auto FindLastInst = [&]() {
19430 for (
Value *V :
E->Scalars) {
19434 if (
E->isCopyableElement(
I))
19436 if (LastInst->
getParent() ==
I->getParent()) {
19441 assert(((Opcode == Instruction::GetElementPtr &&
19443 E->State == TreeEntry::SplitVectorize ||
19446 (GatheredLoadsEntriesFirst.has_value() &&
19447 Opcode == Instruction::Load &&
E->isGather() &&
19448 E->Idx < *GatheredLoadsEntriesFirst)) &&
19449 "Expected vector-like or non-GEP in GEP node insts only.");
19450 if (!DT->isReachableFromEntry(LastInst->
getParent())) {
19454 if (!DT->isReachableFromEntry(
I->getParent()))
19456 auto *NodeA = DT->getNode(LastInst->
getParent());
19457 auto *NodeB = DT->getNode(
I->getParent());
19458 assert(NodeA &&
"Should only process reachable instructions");
19459 assert(NodeB &&
"Should only process reachable instructions");
19460 assert((NodeA == NodeB) ==
19461 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
19462 "Different nodes should have different DFS numbers");
19463 if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn())
19470 auto FindFirstInst = [&]() {
19472 for (
Value *V :
E->Scalars) {
19476 if (
E->isCopyableElement(
I))
19478 if (FirstInst->
getParent() ==
I->getParent()) {
19479 if (
I->comesBefore(FirstInst))
19483 assert(((Opcode == Instruction::GetElementPtr &&
19487 "Expected vector-like or non-GEP in GEP node insts only.");
19488 if (!DT->isReachableFromEntry(FirstInst->
getParent())) {
19492 if (!DT->isReachableFromEntry(
I->getParent()))
19494 auto *NodeA = DT->getNode(FirstInst->
getParent());
19495 auto *NodeB = DT->getNode(
I->getParent());
19496 assert(NodeA &&
"Should only process reachable instructions");
19497 assert(NodeB &&
"Should only process reachable instructions");
19498 assert((NodeA == NodeB) ==
19499 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
19500 "Different nodes should have different DFS numbers");
19501 if (NodeA->getDFSNumIn() > NodeB->getDFSNumIn())
19507 if (
E->State == TreeEntry::SplitVectorize) {
19508 Res = FindLastInst();
19510 for (
auto *
E : Entries) {
19513 I = &getLastInstructionInBundle(
E);
19518 EntryToLastInstruction.try_emplace(
E, Res);
19523 if (GatheredLoadsEntriesFirst.has_value() &&
19524 E->Idx >= *GatheredLoadsEntriesFirst && !
E->isGather() &&
19525 Opcode == Instruction::Load) {
19526 Res = FindFirstInst();
19527 EntryToLastInstruction.try_emplace(
E, Res);
19533 auto FindScheduleBundle = [&](
const TreeEntry *
E) ->
const ScheduleBundle * {
19537 const auto *It = BlocksSchedules.find(BB);
19538 if (It == BlocksSchedules.end())
19540 for (
Value *V :
E->Scalars) {
19546 if (Bundles.
empty())
19549 Bundles, [&](ScheduleBundle *
B) {
return B->getTreeEntry() ==
E; });
19550 if (It != Bundles.
end())
19555 const ScheduleBundle *Bundle = FindScheduleBundle(
E);
19556 if (!
E->isGather() && !Bundle) {
19557 if ((Opcode == Instruction::GetElementPtr &&
19560 return !isa<GetElementPtrInst>(V) && isa<Instruction>(V);
19564 return isa<PoisonValue>(V) ||
19565 (E->Idx == 0 && isa<InsertElementInst>(V)) ||
19566 E->isCopyableElement(V) ||
19567 (!isVectorLikeInstWithConstOps(V) &&
19568 isUsedOutsideBlock(V));
19570 (!
E->doesNotNeedToSchedule() ||
19573 if (!isa<Instruction>(V) ||
19574 (E->hasCopyableElements() && E->isCopyableElement(V)))
19576 return !areAllOperandsNonInsts(V);
19579 if (!isa<Instruction>(V) ||
19580 (E->hasCopyableElements() && E->isCopyableElement(V)))
19582 return MustGather.contains(V);
19584 Res = FindLastInst();
19586 Res = FindFirstInst();
19587 EntryToLastInstruction.try_emplace(
E, Res);
19596 assert(!
E->isGather() &&
"Gathered instructions should not be scheduled");
19597 Res = Bundle->getBundle().back()->getInst();
19598 EntryToLastInstruction.try_emplace(
E, Res);
19621 Res = FindLastInst();
19622 assert(Res &&
"Failed to find last instruction in bundle");
19623 EntryToLastInstruction.try_emplace(
E, Res);
19627void BoUpSLP::setInsertPointAfterBundle(
const TreeEntry *
E) {
19628 auto *Front =
E->getMainOp();
19629 Instruction *LastInst = &getLastInstructionInBundle(
E);
19630 assert(LastInst &&
"Failed to find last instruction in bundle");
19635 LastInstIt = LastInst->
getParent()->getFirstNonPHIIt();
19636 if (LastInstIt != LastInst->
getParent()->end() &&
19637 LastInstIt->getParent()->isLandingPad())
19638 LastInstIt = std::next(LastInstIt);
19641 (!
E->isGather() &&
E->State != TreeEntry::SplitVectorize &&
19642 (
E->doesNotNeedToSchedule() ||
19643 (
E->hasCopyableElements() && !
E->isCopyableElement(LastInst) &&
19645 (GatheredLoadsEntriesFirst.has_value() &&
19646 E->Idx >= *GatheredLoadsEntriesFirst && !
E->isGather() &&
19647 E->getOpcode() == Instruction::Load)) {
19648 Builder.SetInsertPoint(LastInst->
getParent(), LastInstIt);
19652 Builder.SetInsertPoint(
19655 if (Instruction *Res = LastInstructionToPos.lookup(LastInst)) {
19658 Res = Builder.CreateAlignedLoad(Builder.getPtrTy(),
19663 LastInstructionToPos.try_emplace(LastInst, Res);
19666 Builder.SetCurrentDebugLocation(Front->
getDebugLoc());
19669Value *BoUpSLP::gather(
19671 function_ref<
Value *(
Value *,
Value *, ArrayRef<int>)> CreateShuffle) {
19677 SmallSet<int, 4> PostponedIndices;
19678 Loop *
L = LI->getLoopFor(Builder.GetInsertBlock());
19680 SmallPtrSet<BasicBlock *, 4> Visited;
19681 while (InsertBB && InsertBB != InstBB && Visited.
insert(InsertBB).second)
19682 InsertBB = InsertBB->getSinglePredecessor();
19683 return InsertBB && InsertBB == InstBB;
19685 for (
int I = 0,
E = VL.
size();
I <
E; ++
I) {
19687 if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) ||
19689 (L && (!Root ||
L->isLoopInvariant(Root)) &&
L->contains(Inst))) &&
19690 PostponedIndices.
insert(
I).second)
19694 auto &&CreateInsertElement = [
this](
Value *Vec,
Value *
V,
unsigned Pos,
19701 if (
Scalar->getType() != Ty) {
19712 Scalar = Builder.CreateIntCast(
19726 Vec = Builder.CreateInsertElement(Vec, Scalar, Builder.getInt32(Pos));
19731 GatherShuffleExtractSeq.insert(InsElt);
19736 const auto *It =
find_if(Entries, [&](
const TreeEntry *
E) {
19737 return !TransformedToGatherNodes.contains(
E) &&
19738 !DeletedNodes.contains(
E);
19740 if (It != Entries.
end()) {
19742 User *UserOp =
nullptr;
19747 if (
V->getType()->isVectorTy()) {
19749 SV && SV->getOperand(0) != V && SV->getOperand(1) != V) {
19751 auto FindOperand = [](
Value *Vec,
Value *
V) -> Instruction * {
19753 if (SV->getOperand(0) == V)
19755 if (SV->getOperand(1) == V)
19761 if (Instruction *User = FindOperand(SV->getOperand(0), V))
19763 else if (Instruction *User = FindOperand(SV->getOperand(1), V))
19766 "Failed to find shufflevector, caused by resize.");
19772 unsigned FoundLane = (*It)->findLaneForValue(V);
19773 ExternalUses.emplace_back(V, UserOp, **It, FoundLane);
19781 SmallVector<int> NonConsts;
19783 std::iota(
Mask.begin(),
Mask.end(), 0);
19784 Value *OriginalRoot = Root;
19787 SV->getOperand(0)->getType() == VecTy) {
19788 Root = SV->getOperand(0);
19789 Mask.assign(SV->getShuffleMask().begin(), SV->getShuffleMask().end());
19792 for (
int I = 0,
E = VL.
size();
I <
E; ++
I) {
19801 Vec = CreateInsertElement(Vec, VL[
I],
I, ScalarTy);
19806 Vec = OriginalRoot;
19808 Vec = CreateShuffle(Root, Vec, Mask);
19810 OI && OI->use_empty() &&
19811 none_of(VectorizableTree, [&](
const std::unique_ptr<TreeEntry> &TE) {
19812 return TE->VectorizedValue == OI;
19818 for (
int I : NonConsts)
19819 Vec = CreateInsertElement(Vec, VL[
I],
I, ScalarTy);
19822 for (
const std::pair<Value *, unsigned> &Pair : PostponedInsts)
19823 Vec = CreateInsertElement(Vec, Pair.first, Pair.second, ScalarTy);
19861 bool IsFinalized =
false;
19874 class ShuffleIRBuilder {
19887 : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq),
19888 CSEBlocks(CSEBlocks),
DL(DL) {}
19889 ~ShuffleIRBuilder() =
default;
19895 "Expected integer vector types only.");
19901 ->getIntegerBitWidth())
19902 V2 = Builder.CreateIntCast(
19905 V1 = Builder.CreateIntCast(
19909 Value *Vec = Builder.CreateShuffleVector(V1, V2, Mask);
19911 GatherShuffleExtractSeq.insert(
I);
19912 CSEBlocks.insert(
I->getParent());
19921 unsigned VF = Mask.size();
19925 Value *Vec = Builder.CreateShuffleVector(V1, Mask);
19927 GatherShuffleExtractSeq.insert(
I);
19928 CSEBlocks.insert(
I->getParent());
19932 Value *createIdentity(
Value *V) {
return V; }
19933 Value *createPoison(
Type *Ty,
unsigned VF) {
19938 void resizeToMatch(
Value *&V1,
Value *&V2) {
19943 int VF = std::max(V1VF, V2VF);
19944 int MinVF = std::min(V1VF, V2VF);
19946 std::iota(IdentityMask.
begin(), std::next(IdentityMask.
begin(), MinVF),
19948 Value *&
Op = MinVF == V1VF ? V1 : V2;
19949 Op = Builder.CreateShuffleVector(
Op, IdentityMask);
19951 GatherShuffleExtractSeq.insert(
I);
19952 CSEBlocks.insert(
I->getParent());
19965 assert(V1 &&
"Expected at least one vector value.");
19966 ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
19967 R.CSEBlocks, *R.DL);
19968 return BaseShuffleAnalysis::createShuffle<Value *>(
19969 V1, V2, Mask, ShuffleBuilder, ScalarTy);
19975 std::optional<bool> IsSigned = std::nullopt) {
19978 if (VecTy->getElementType() == ScalarTy->getScalarType())
19980 return Builder.CreateIntCast(
19981 V,
VectorType::get(ScalarTy->getScalarType(), VecTy->getElementCount()),
19985 Value *getVectorizedValue(
const TreeEntry &E) {
19986 Value *Vec = E.VectorizedValue;
19989 return castToScalarTyElem(Vec,
any_of(E.Scalars, [&](
Value *V) {
19990 return !isa<PoisonValue>(V) &&
19991 !isKnownNonNegative(
19992 V, SimplifyQuery(*R.DL));
19998 : BaseShuffleAnalysis(ScalarTy), Builder(Builder), R(R) {}
20002 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
20003 unsigned NumParts,
bool &UseVecBaseAsInput) {
20004 UseVecBaseAsInput =
false;
20006 Value *VecBase =
nullptr;
20008 if (!E->ReorderIndices.empty()) {
20010 E->ReorderIndices.end());
20013 for (
int I = 0, Sz = Mask.size();
I < Sz; ++
I) {
20018 VecBase = EI->getVectorOperand();
20020 VecBase = TEs.front()->VectorizedValue;
20021 assert(VecBase &&
"Expected vectorized value.");
20022 UniqueBases.
insert(VecBase);
20025 if (!EI->hasOneUse() || R.ExternalUsesAsOriginalScalar.contains(EI) ||
20026 (E->UserTreeIndex && E->UserTreeIndex.EdgeIdx == UINT_MAX &&
20027 !R.isVectorized(EI) &&
20029 count_if(E->UserTreeIndex.UserTE->Scalars,
20030 [&](
Value *V) { return V == EI; })) ||
20031 (NumParts != 1 &&
count(VL, EI) > 1) ||
20033 ArrayRef<TreeEntry *> UTEs = R.getTreeEntries(U);
20034 return UTEs.empty() || UTEs.size() > 1 ||
20036 [&](const TreeEntry *TE) {
20037 return R.DeletedNodes.contains(TE) ||
20038 R.TransformedToGatherNodes.contains(TE);
20044 [&](
const std::unique_ptr<TreeEntry> &TE) {
20045 return TE->UserTreeIndex.UserTE ==
20047 is_contained(VL, EI);
20051 R.eraseInstruction(EI);
20053 if (NumParts == 1 || UniqueBases.
size() == 1) {
20054 assert(VecBase &&
"Expected vectorized value.");
20055 return castToScalarTyElem(VecBase);
20057 UseVecBaseAsInput =
true;
20067 Value *Vec =
nullptr;
20074 constexpr int MaxBases = 2;
20076 auto VLMask =
zip(SubVL, SubMask);
20077 const unsigned VF = std::accumulate(
20078 VLMask.begin(), VLMask.end(), 0U, [&](
unsigned S,
const auto &
D) {
20079 if (std::get<1>(D) == PoisonMaskElem)
20082 cast<ExtractElementInst>(std::get<0>(D))->getVectorOperand();
20083 if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecOp);
20085 VecOp = TEs.front()->VectorizedValue;
20086 assert(VecOp &&
"Expected vectorized value.");
20087 const unsigned Size =
20088 cast<FixedVectorType>(VecOp->getType())->getNumElements();
20089 return std::max(S, Size);
20091 for (
const auto [V,
I] : VLMask) {
20096 VecOp = TEs.front()->VectorizedValue;
20097 assert(VecOp &&
"Expected vectorized value.");
20098 VecOp = castToScalarTyElem(VecOp);
20099 Bases[
I / VF] = VecOp;
20101 if (!Bases.front())
20104 if (Bases.back()) {
20105 SubVec = createShuffle(Bases.front(), Bases.back(), SubMask);
20106 TransformToIdentity(SubMask);
20108 SubVec = Bases.front();
20114 ArrayRef<int> SubMask =
20115 Mask.slice(
P * SliceSize,
20118 return all_of(SubMask, [](
int Idx) {
20122 "Expected first part or all previous parts masked.");
20123 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
20128 unsigned SubVecVF =
20130 NewVF = std::max(NewVF, SubVecVF);
20133 for (
int &Idx : SubMask)
20136 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
20137 Vec = createShuffle(Vec, SubVec, VecMask);
20138 TransformToIdentity(VecMask);
20146 std::optional<Value *>
20152 TEs, [](
const TreeEntry *TE) {
return TE->VectorizedValue; });
20154 return std::nullopt;
20157 auto *ResVecTy =
getWidenedType(ScalarTy, E->getVectorFactor());
20158 return Builder.CreateAlignedLoad(
20165 IsFinalized =
false;
20166 CommonMask.clear();
20172 Value *V1 = getVectorizedValue(E1);
20173 Value *V2 = getVectorizedValue(E2);
20179 Value *V1 = getVectorizedValue(E1);
20184 assert(V1 && V2 && !Mask.empty() &&
"Expected non-empty input vectors.");
20187 "castToScalarTyElem expects V1 and V2 to be FixedVectorType");
20188 V1 = castToScalarTyElem(V1);
20189 V2 = castToScalarTyElem(V2);
20190 if (InVectors.empty()) {
20191 InVectors.push_back(V1);
20192 InVectors.push_back(V2);
20193 CommonMask.assign(Mask.begin(), Mask.end());
20196 Value *Vec = InVectors.front();
20197 if (InVectors.size() == 2) {
20198 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
20199 transformMaskAfterShuffle(CommonMask, CommonMask);
20202 Vec = createShuffle(Vec,
nullptr, CommonMask);
20203 transformMaskAfterShuffle(CommonMask, CommonMask);
20205 V1 = createShuffle(V1, V2, Mask);
20206 unsigned VF = std::max(getVF(V1), getVF(Vec));
20207 for (
unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
20209 CommonMask[Idx] = Idx + VF;
20210 InVectors.front() = Vec;
20211 if (InVectors.size() == 2)
20212 InVectors.back() = V1;
20214 InVectors.push_back(V1);
20219 "castToScalarTyElem expects V1 to be FixedVectorType");
20220 V1 = castToScalarTyElem(V1);
20221 if (InVectors.empty()) {
20222 InVectors.push_back(V1);
20223 CommonMask.assign(Mask.begin(), Mask.end());
20226 const auto *It =
find(InVectors, V1);
20227 if (It == InVectors.end()) {
20228 if (InVectors.size() == 2 ||
20229 InVectors.front()->getType() != V1->
getType()) {
20230 Value *V = InVectors.front();
20231 if (InVectors.size() == 2) {
20232 V = createShuffle(InVectors.front(), InVectors.back(), CommonMask);
20233 transformMaskAfterShuffle(CommonMask, CommonMask);
20235 CommonMask.size()) {
20236 V = createShuffle(InVectors.front(),
nullptr, CommonMask);
20237 transformMaskAfterShuffle(CommonMask, CommonMask);
20239 unsigned VF = std::max(CommonMask.size(), Mask.size());
20240 for (
unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
20242 CommonMask[Idx] = V->getType() != V1->
getType()
20244 : Mask[Idx] + getVF(V1);
20245 if (V->getType() != V1->
getType())
20246 V1 = createShuffle(V1,
nullptr, Mask);
20247 InVectors.front() = V;
20248 if (InVectors.size() == 2)
20249 InVectors.back() = V1;
20251 InVectors.push_back(V1);
20256 for (
unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
20258 InVectors.push_back(V1);
20263 for (
Value *V : InVectors)
20264 VF = std::max(VF, getVF(V));
20265 for (
unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
20267 CommonMask[Idx] = Mask[Idx] + (It == InVectors.begin() ? 0 : VF);
20276 Value *Root =
nullptr) {
20277 return R.gather(VL, Root, ScalarTy,
20279 return createShuffle(V1, V2, Mask);
20288 ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
20293 IsFinalized =
true;
20296 if (InVectors.
size() == 2) {
20297 Vec = createShuffle(Vec, InVectors.
back(), CommonMask);
20300 Vec = createShuffle(Vec,
nullptr, CommonMask);
20302 transformMaskAfterShuffle(CommonMask, CommonMask);
20304 "Expected vector length for the final value before action.");
20308 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
20309 Vec = createShuffle(Vec,
nullptr, ResizeMask);
20311 Action(Vec, CommonMask, [
this](
Value *V1,
Value *V2, ArrayRef<int> Mask) {
20312 return createShuffle(V1, V2, Mask);
20314 InVectors.
front() = Vec;
20316 if (!SubVectors.empty()) {
20318 if (InVectors.
size() == 2) {
20319 Vec = createShuffle(Vec, InVectors.
back(), CommonMask);
20322 Vec = createShuffle(Vec,
nullptr, CommonMask);
20324 transformMaskAfterShuffle(CommonMask, CommonMask);
20325 auto CreateSubVectors = [&](
Value *Vec,
20326 SmallVectorImpl<int> &CommonMask) {
20327 for (
auto [
E, Idx] : SubVectors) {
20328 Value *
V = getVectorizedValue(*
E);
20335 Type *OrigScalarTy = ScalarTy;
20338 Builder, Vec, V, InsertionIndex,
20339 std::bind(&ShuffleInstructionBuilder::createShuffle,
this, _1, _2,
20341 ScalarTy = OrigScalarTy;
20342 if (!CommonMask.
empty()) {
20343 std::iota(std::next(CommonMask.
begin(), Idx),
20344 std::next(CommonMask.
begin(), Idx +
E->getVectorFactor()),
20350 if (SubVectorsMask.
empty()) {
20351 Vec = CreateSubVectors(Vec, CommonMask);
20354 copy(SubVectorsMask, SVMask.begin());
20355 for (
auto [I1, I2] :
zip(SVMask, CommonMask)) {
20358 I1 = I2 + CommonMask.
size();
20363 Vec = createShuffle(InsertVec, Vec, SVMask);
20364 transformMaskAfterShuffle(CommonMask, SVMask);
20366 InVectors.
front() = Vec;
20369 if (!ExtMask.
empty()) {
20370 if (CommonMask.
empty()) {
20374 for (
int I = 0, Sz = ExtMask.
size();
I < Sz; ++
I) {
20377 NewMask[
I] = CommonMask[ExtMask[
I]];
20379 CommonMask.
swap(NewMask);
20382 if (CommonMask.
empty()) {
20383 assert(InVectors.
size() == 1 &&
"Expected only one vector with no mask");
20384 return InVectors.
front();
20386 if (InVectors.
size() == 2)
20387 return createShuffle(InVectors.
front(), InVectors.
back(), CommonMask);
20388 return createShuffle(InVectors.
front(),
nullptr, CommonMask);
20392 assert((IsFinalized || CommonMask.empty()) &&
20393 "Shuffle construction must be finalized.");
20397Value *BoUpSLP::vectorizeOperand(TreeEntry *E,
unsigned NodeIdx) {
20401template <
typename BVTy,
typename ResTy,
typename... Args>
20402ResTy BoUpSLP::processBuildVector(
const TreeEntry *E,
Type *ScalarTy,
20404 assert((E->isGather() || TransformedToGatherNodes.contains(E)) &&
20405 "Expected gather node.");
20406 unsigned VF = E->getVectorFactor();
20408 bool NeedFreeze =
false;
20412 E->CombinedEntriesWithIndices.size());
20413 if (E->State == TreeEntry::SplitVectorize &&
20414 TransformedToGatherNodes.contains(E)) {
20415 SubVectors.
clear();
20418 for (
auto [EIdx, Idx] : E->CombinedEntriesWithIndices)
20420 .slice(Idx, VectorizableTree[EIdx]->getVectorFactor()),
20423 E->CombinedEntriesWithIndices, SubVectors.
begin(), [&](
const auto &
P) {
20424 return std::make_pair(VectorizableTree[P.first].get(), P.second);
20430 E->ReorderIndices.end());
20431 if (!ReorderMask.
empty())
20437 if (!SubVectors.
empty() && !SubVectorsMask.
empty()) {
20439 if (E->Scalars[
I] == GatheredScalars[ReorderMask[
I]])
20442 SubVectorsMask.
clear();
20446 unsigned I,
unsigned SliceSize,
20447 bool IsNotPoisonous) {
20449 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
20452 TreeEntry *UserTE = E->UserTreeIndex.UserTE;
20453 unsigned EdgeIdx = E->UserTreeIndex.EdgeIdx;
20454 if (UserTE->getNumOperands() != 2)
20456 if (!IsNotPoisonous) {
20457 auto *It =
find_if(
ArrayRef(VectorizableTree).drop_front(UserTE->Idx + 1),
20458 [=](
const std::unique_ptr<TreeEntry> &TE) {
20459 return TE->UserTreeIndex.UserTE == UserTE &&
20460 TE->UserTreeIndex.EdgeIdx != EdgeIdx;
20462 if (It == VectorizableTree.end())
20465 if (!(*It)->ReorderIndices.empty()) {
20469 if (!
all_of(
zip(GatheredScalars, GS), [&](
const auto &
P) {
20470 Value *V0 = std::get<0>(
P);
20471 Value *V1 = std::get<1>(
P);
20479 if ((Mask.size() < InputVF &&
20482 (Mask.size() == InputVF &&
20485 std::next(Mask.begin(),
I * SliceSize),
20486 std::next(Mask.begin(),
20493 std::next(Mask.begin(),
I * SliceSize),
20494 std::next(Mask.begin(),
20500 BVTy ShuffleBuilder(ScalarTy, Params...);
20501 ResTy Res = ResTy();
20505 Value *ExtractVecBase =
nullptr;
20506 bool UseVecBaseAsInput =
false;
20509 Type *OrigScalarTy = GatheredScalars.
front()->getType();
20514 bool Resized =
false;
20516 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
20517 if (!ExtractShuffles.
empty()) {
20519 for (
auto [Idx,
I] :
enumerate(ExtractMask)) {
20525 ExtractEntries.
append(TEs.begin(), TEs.end());
20527 if (std::optional<ResTy> Delayed =
20528 ShuffleBuilder.needToDelay(E, ExtractEntries)) {
20530 PostponedGathers.insert(E);
20535 if (
Value *VecBase = ShuffleBuilder.adjustExtracts(
20536 E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) {
20537 ExtractVecBase = VecBase;
20539 if (VF == VecBaseTy->getNumElements() &&
20540 GatheredScalars.
size() != VF) {
20542 GatheredScalars.
append(VF - GatheredScalars.
size(),
20550 if (!ExtractShuffles.
empty() || !E->hasState() ||
20551 E->getOpcode() != Instruction::Load ||
20552 (((E->hasState() && E->getOpcode() == Instruction::Load) ||
20556 return isa<LoadInst>(V) && isVectorized(V);
20558 (E->hasState() && E->isAltShuffle()) ||
20559 all_of(E->Scalars, [
this](
Value *V) { return isVectorized(V); }) ||
20561 (E->Scalars != GatheredScalars && GatheredScalars.
size() <= 2)) {
20563 isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts);
20565 if (!GatherShuffles.
empty()) {
20566 if (std::optional<ResTy> Delayed =
20567 ShuffleBuilder.needToDelay(E, Entries)) {
20569 PostponedGathers.insert(E);
20574 if (GatherShuffles.
size() == 1 &&
20576 Entries.
front().front()->isSame(E->Scalars)) {
20579 LLVM_DEBUG(
dbgs() <<
"SLP: perfect diamond match for gather bundle "
20582 Mask.resize(E->Scalars.size());
20583 const TreeEntry *FrontTE = Entries.
front().front();
20584 if (FrontTE->ReorderIndices.empty() &&
20585 ((FrontTE->ReuseShuffleIndices.empty() &&
20586 E->Scalars.size() == FrontTE->Scalars.size()) ||
20587 (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) {
20588 std::iota(Mask.begin(), Mask.end(), 0);
20595 Mask[
I] = FrontTE->findLaneForValue(V);
20600 ShuffleBuilder.resetForSameNode();
20602 if (
equal(E->Scalars, FrontTE->Scalars) &&
20603 equal(E->ReorderIndices, FrontTE->ReorderIndices) &&
20604 equal(E->ReuseShuffleIndices, FrontTE->ReuseShuffleIndices)) {
20605 Mask.resize(FrontTE->getVectorFactor());
20606 std::iota(Mask.begin(), Mask.end(), 0);
20607 ShuffleBuilder.add(*FrontTE, Mask);
20608 Res = ShuffleBuilder.finalize({}, {}, {});
20610 ShuffleBuilder.add(*FrontTE, Mask);
20611 Res = ShuffleBuilder.finalize(E->getCommonMask(), {}, {});
20616 if (GatheredScalars.
size() != VF &&
20618 return any_of(TEs, [&](
const TreeEntry *TE) {
20619 return TE->getVectorFactor() == VF;
20622 GatheredScalars.
append(VF - GatheredScalars.
size(),
20626 for (
int I = 0, Sz = Mask.size();
I < Sz; ++
I) {
20634 bool IsRootPoison) {
20637 bool IsSplat = IsRootPoison &&
isSplat(Scalars) &&
20644 int NumNonConsts = 0;
20663 Scalars.
front() = OrigV;
20666 const auto Res = UniquePositions.
try_emplace(OrigV,
I);
20667 Scalars[Res.first->second] = OrigV;
20668 ReuseMask[
I] = Res.first->second;
20671 if (NumNonConsts == 1) {
20676 if (!UndefPos.
empty() && UndefPos.
front() == 0)
20679 ReuseMask[SinglePos] = SinglePos;
20680 }
else if (!UndefPos.
empty() && IsSplat) {
20687 (E->UserTreeIndex &&
any_of(V->uses(), [E](
const Use &U) {
20690 return E->UserTreeIndex.EdgeIdx != U.getOperandNo() &&
20691 is_contained(E->UserTreeIndex.UserTE->Scalars,
20695 if (It != Scalars.
end()) {
20697 int Pos = std::distance(Scalars.
begin(), It);
20698 for (
int I : UndefPos) {
20700 ReuseMask[
I] = Pos;
20709 for (
int I : UndefPos) {
20718 if (!ExtractShuffles.
empty() || !GatherShuffles.
empty()) {
20719 bool IsNonPoisoned =
true;
20720 bool IsUsedInExpr =
true;
20721 Value *Vec1 =
nullptr;
20722 if (!ExtractShuffles.
empty()) {
20726 Value *Vec2 =
nullptr;
20727 for (
unsigned I = 0, Sz = ExtractMask.
size();
I < Sz; ++
I) {
20731 if (UseVecBaseAsInput) {
20732 Vec1 = ExtractVecBase;
20734 for (
unsigned I = 0, Sz = ExtractMask.
size();
I < Sz; ++
I) {
20740 Value *VecOp = EI->getVectorOperand();
20742 !TEs.
empty() && TEs.front()->VectorizedValue)
20743 VecOp = TEs.front()->VectorizedValue;
20746 }
else if (Vec1 != VecOp) {
20747 assert((!Vec2 || Vec2 == VecOp) &&
20748 "Expected only 1 or 2 vectors shuffle.");
20754 IsUsedInExpr =
false;
20757 ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
20760 IsUsedInExpr &= FindReusedSplat(
20763 ExtractMask.
size(), IsNotPoisonedVec);
20764 ShuffleBuilder.add(Vec1, ExtractMask,
true);
20765 IsNonPoisoned &= IsNotPoisonedVec;
20767 IsUsedInExpr =
false;
20772 if (!GatherShuffles.
empty()) {
20773 unsigned SliceSize =
20777 for (
const auto [
I, TEs] :
enumerate(Entries)) {
20780 "No shuffles with empty entries list expected.");
20783 assert((TEs.size() == 1 || TEs.size() == 2) &&
20784 "Expected shuffle of 1 or 2 entries.");
20785 unsigned Limit =
getNumElems(Mask.size(), SliceSize,
I);
20788 copy(SubMask, std::next(VecMask.
begin(),
I * SliceSize));
20789 if (TEs.size() == 1) {
20790 bool IsNotPoisonedVec =
20791 TEs.front()->VectorizedValue
20795 FindReusedSplat(VecMask, TEs.
front()->getVectorFactor(),
I,
20796 SliceSize, IsNotPoisonedVec);
20797 ShuffleBuilder.add(*TEs.front(), VecMask);
20798 IsNonPoisoned &= IsNotPoisonedVec;
20800 IsUsedInExpr =
false;
20801 ShuffleBuilder.add(*TEs.front(), *TEs.back(), VecMask);
20802 if (TEs.front()->VectorizedValue && TEs.back()->VectorizedValue)
20813 int EMSz = ExtractMask.
size();
20814 int MSz = Mask.size();
20817 bool IsSingleShuffle = ExtractShuffles.
empty() || GatherShuffles.
empty();
20818 bool IsIdentityShuffle =
20819 ((UseVecBaseAsInput ||
20821 [](
const std::optional<TTI::ShuffleKind> &SK) {
20825 none_of(ExtractMask, [&](
int I) {
return I >= EMSz; }) &&
20827 (!GatherShuffles.
empty() &&
20829 [](
const std::optional<TTI::ShuffleKind> &SK) {
20833 none_of(Mask, [&](
int I) {
return I >= MSz; }) &&
20835 bool EnoughConstsForShuffle =
20845 (!IsIdentityShuffle ||
20846 (GatheredScalars.
size() == 2 &&
20854 for (
int I = 0, Sz = GatheredScalars.
size();
I < Sz; ++
I) {
20855 if (EnoughConstsForShuffle &&
isa<Constant>(GatheredScalars[
I]))
20863 TryPackScalars(GatheredScalars, BVMask,
true);
20864 Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.
size());
20865 ShuffleBuilder.add(BV, BVMask);
20869 (IsSingleShuffle && ((IsIdentityShuffle &&
20872 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
20875 Res = ShuffleBuilder.finalize(
20876 E->ReuseShuffleIndices, SubVectors, SubVectorsMask, E->Scalars.
size(),
20878 bool IsSplat = isSplat(NonConstants);
20879 SmallVector<int> BVMask(Mask.size(), PoisonMaskElem);
20880 TryPackScalars(NonConstants, BVMask, false);
20881 auto CheckIfSplatIsProfitable = [&]() {
20884 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
20885 Value *V = *find_if_not(NonConstants, IsaPred<UndefValue>);
20886 if (isa<ExtractElementInst>(V) || isVectorized(V))
20888 InstructionCost SplatCost = TTI->getVectorInstrCost(
20889 Instruction::InsertElement, VecTy, CostKind, 0,
20890 PoisonValue::get(VecTy), V);
20891 SmallVector<int> NewMask(Mask.begin(), Mask.end());
20892 for (auto [Idx, I] : enumerate(BVMask))
20893 if (I != PoisonMaskElem)
20894 NewMask[Idx] = Mask.size();
20895 SplatCost += ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy,
20896 NewMask, CostKind);
20897 InstructionCost BVCost = TTI->getVectorInstrCost(
20898 Instruction::InsertElement, VecTy, CostKind,
20899 *find_if(Mask, not_equal_to(PoisonMaskElem)), Vec, V);
20901 if (count(BVMask, PoisonMaskElem) <
20902 static_cast<int>(BVMask.size() - 1)) {
20903 SmallVector<int> NewMask(Mask.begin(), Mask.end());
20904 for (auto [Idx, I] : enumerate(BVMask))
20905 if (I != PoisonMaskElem)
20907 BVCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
20908 VecTy, NewMask, CostKind);
20910 return SplatCost <= BVCost;
20912 if (!IsSplat || Mask.size() <= 2 || !CheckIfSplatIsProfitable()) {
20916 Vec = ShuffleBuilder.gather(NonConstants, Mask.size(), Vec);
20922 Value *BV = ShuffleBuilder.gather(Values, BVMask.
size());
20925 return I == PoisonMaskElem ? PoisonMaskElem : 0;
20928 BV = CreateShuffle(BV,
nullptr, SplatMask);
20931 Mask[Idx] = BVMask.size() + Idx;
20932 Vec = CreateShuffle(Vec, BV, Mask);
20941 TryPackScalars(GatheredScalars, ReuseMask,
true);
20942 Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.size());
20943 ShuffleBuilder.add(BV, ReuseMask);
20944 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
20949 for (
auto [
I, V] :
enumerate(GatheredScalars)) {
20953 Value *BV = ShuffleBuilder.gather(GatheredScalars);
20954 ShuffleBuilder.add(BV, Mask);
20955 Res = ShuffleBuilder.finalize(
E->ReuseShuffleIndices, SubVectors,
20960 Res = ShuffleBuilder.createFreeze(Res);
20964Value *BoUpSLP::createBuildVector(
const TreeEntry *
E,
Type *ScalarTy) {
20966 if (
E->State != TreeEntry::SplitVectorize ||
20967 !TransformedToGatherNodes.contains(
E)) {
20968 for (
auto [EIdx,
_] :
E->CombinedEntriesWithIndices)
20971 return processBuildVector<ShuffleInstructionBuilder, Value *>(
E, ScalarTy,
20979 for (
Value *V : VL)
20992 IRBuilderBase::InsertPointGuard Guard(Builder);
20994 Value *
V =
E->Scalars.front();
20995 Type *ScalarTy =
V->getType();
20998 auto It = MinBWs.find(
E);
20999 if (It != MinBWs.end()) {
21005 if (
E->VectorizedValue)
21006 return E->VectorizedValue;
21008 if (
E->isGather() || TransformedToGatherNodes.contains(
E)) {
21010 if (
E->hasState() &&
E->Idx == 0 && !UserIgnoreList)
21011 setInsertPointAfterBundle(
E);
21012 Value *Vec = createBuildVector(
E, ScalarTy);
21013 E->VectorizedValue = Vec;
21016 if (
E->State == TreeEntry::SplitVectorize) {
21017 assert(
E->CombinedEntriesWithIndices.size() == 2 &&
21018 "Expected exactly 2 combined entries.");
21019 setInsertPointAfterBundle(
E);
21021 *VectorizableTree[
E->CombinedEntriesWithIndices.front().first];
21023 ArrayRef(
E->Scalars).take_front(OpTE1.getVectorFactor())) &&
21024 "Expected same first part of scalars.");
21027 *VectorizableTree[
E->CombinedEntriesWithIndices.back().first];
21029 OpTE2.isSame(
ArrayRef(
E->Scalars).take_back(OpTE2.getVectorFactor())) &&
21030 "Expected same second part of scalars.");
21032 auto GetOperandSignedness = [&](
const TreeEntry *OpE) {
21033 bool IsSigned =
false;
21034 auto It = MinBWs.find(OpE);
21035 if (It != MinBWs.end())
21036 IsSigned = It->second.second;
21039 if (isa<PoisonValue>(V))
21041 return !isKnownNonNegative(R, SimplifyQuery(*DL));
21048 Op1 = Builder.CreateIntCast(
21053 GetOperandSignedness(&OpTE1));
21058 Op2 = Builder.CreateIntCast(
21063 GetOperandSignedness(&OpTE2));
21065 if (
E->ReorderIndices.empty()) {
21069 std::next(
Mask.begin(),
E->CombinedEntriesWithIndices.back().second),
21072 if (ScalarTyNumElements != 1) {
21076 Value *Vec = Builder.CreateShuffleVector(Op1, Mask);
21078 E->CombinedEntriesWithIndices.back().second *
21079 ScalarTyNumElements);
21080 E->VectorizedValue = Vec;
21083 unsigned CommonVF =
21084 std::max(OpTE1.getVectorFactor(), OpTE2.getVectorFactor());
21091 Op1 = Builder.CreateShuffleVector(Op1, Mask);
21097 Op2 = Builder.CreateShuffleVector(Op2, Mask);
21099 Value *Vec = Builder.CreateShuffleVector(Op1, Op2,
E->getSplitMask());
21100 E->VectorizedValue = Vec;
21104 bool IsReverseOrder =
21106 auto FinalShuffle = [&](
Value *
V,
const TreeEntry *
E) {
21108 if (
E->getOpcode() == Instruction::Store &&
21109 E->State == TreeEntry::Vectorize) {
21110 ArrayRef<int>
Mask =
21111 ArrayRef(
reinterpret_cast<const int *
>(
E->ReorderIndices.begin()),
21112 E->ReorderIndices.size());
21113 ShuffleBuilder.add(V, Mask);
21114 }
else if ((
E->State == TreeEntry::StridedVectorize && IsReverseOrder) ||
21115 E->State == TreeEntry::CompressVectorize) {
21116 ShuffleBuilder.addOrdered(V, {});
21118 ShuffleBuilder.addOrdered(V,
E->ReorderIndices);
21121 E->CombinedEntriesWithIndices.size());
21123 E->CombinedEntriesWithIndices, SubVectors.begin(), [&](
const auto &
P) {
21124 return std::make_pair(VectorizableTree[P.first].get(), P.second);
21127 (
E->CombinedEntriesWithIndices.empty() ||
E->ReorderIndices.empty()) &&
21128 "Expected either combined subnodes or reordering");
21129 return ShuffleBuilder.finalize(
E->ReuseShuffleIndices, SubVectors, {});
21132 assert(!
E->isGather() &&
"Unhandled state");
21133 unsigned ShuffleOrOp =
21134 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector :
E->
getOpcode();
21135 if (!
E->isAltShuffle()) {
21136 switch (E->CombinedOp) {
21137 case TreeEntry::ReducedBitcast:
21138 case TreeEntry::ReducedBitcastBSwap:
21139 case TreeEntry::ReducedBitcastLoads:
21140 case TreeEntry::ReducedBitcastBSwapLoads:
21141 case TreeEntry::ReducedCmpBitcast:
21142 ShuffleOrOp = E->CombinedOp;
21149 auto GetOperandSignedness = [&](
unsigned Idx) {
21150 const TreeEntry *OpE = getOperandEntry(
E, Idx);
21151 bool IsSigned =
false;
21152 auto It = MinBWs.find(OpE);
21153 if (It != MinBWs.end())
21154 IsSigned = It->second.second;
21157 if (isa<PoisonValue>(V))
21159 return !isKnownNonNegative(R, SimplifyQuery(*DL));
21163 switch (ShuffleOrOp) {
21164 case Instruction::PHI: {
21165 assert((
E->ReorderIndices.empty() || !
E->ReuseShuffleIndices.empty() ||
21166 E != VectorizableTree.front().get() ||
E->UserTreeIndex) &&
21167 "PHI reordering is free.");
21169 Builder.SetInsertPoint(PH->getParent(),
21170 PH->getParent()->getFirstNonPHIIt());
21172 PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
21176 Builder.SetInsertPoint(PH->getParent(),
21177 PH->getParent()->getFirstInsertionPt());
21180 V = FinalShuffle(V,
E);
21182 E->VectorizedValue =
V;
21189 SmallDenseMap<BasicBlock *, unsigned, 4> VisitedBBs;
21195 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
21201 TreeEntry *OpTE = getOperandEntry(
E,
I);
21202 if (OpTE->isGather() || DeletedNodes.contains(OpTE) ||
21203 TransformedToGatherNodes.contains(OpTE)) {
21206 assert(!OpTE->VectorizedValue &&
"Expected no vectorized value.");
21207 OpTE->VectorizedValue = VecOp;
21214 Value *Vec = vectorizeOperand(
E,
I);
21215 if (VecTy != Vec->
getType()) {
21217 MinBWs.contains(getOperandEntry(
E,
I))) &&
21218 "Expected item in MinBWs.");
21219 Vec = Builder.CreateIntCast(Vec, VecTy, GetOperandSignedness(
I));
21225 "Invalid number of incoming values");
21226 assert(
E->VectorizedValue &&
"Expected vectorized value.");
21227 return E->VectorizedValue;
21230 case Instruction::ExtractElement: {
21231 Value *
V =
E->getSingleOperand(0);
21232 setInsertPointAfterBundle(
E);
21233 V = FinalShuffle(V,
E);
21234 E->VectorizedValue =
V;
21237 case Instruction::ExtractValue: {
21239 Builder.SetInsertPoint(LI);
21240 Value *Ptr = LI->getPointerOperand();
21241 LoadInst *
V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign());
21243 NewV = FinalShuffle(NewV,
E);
21244 E->VectorizedValue = NewV;
21247 case Instruction::InsertElement: {
21248 assert(
E->ReuseShuffleIndices.empty() &&
"All inserts should be unique");
21249 if (
const TreeEntry *OpE = getOperandEntry(
E, 1);
21250 OpE && !OpE->isGather() && OpE->hasState() &&
21251 !OpE->hasCopyableElements())
21254 setInsertPointAfterBundle(
E);
21255 Value *
V = vectorizeOperand(
E, 1);
21257 Type *ScalarTy =
Op.front()->getType();
21260 std::pair<unsigned, bool> Res = MinBWs.lookup(getOperandEntry(
E, 1));
21261 assert(Res.first > 0 &&
"Expected item in MinBWs.");
21262 V = Builder.CreateIntCast(
21272 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
21274 const unsigned NumElts =
21276 const unsigned NumScalars =
E->Scalars.size();
21279 assert(
Offset < NumElts &&
"Failed to find vector index offset");
21282 SmallVector<int>
Mask;
21283 if (!
E->ReorderIndices.empty()) {
21288 std::iota(
Mask.begin(), std::next(
Mask.begin(), NumScalars), 0);
21291 bool IsIdentity =
true;
21293 Mask.swap(PrevMask);
21294 for (
unsigned I = 0;
I < NumScalars; ++
I) {
21297 IsIdentity &= InsertIdx -
Offset ==
I;
21300 if (!IsIdentity || NumElts != NumScalars) {
21301 Value *V2 =
nullptr;
21302 bool IsVNonPoisonous =
21304 SmallVector<int> InsertMask(Mask);
21305 if (NumElts != NumScalars &&
Offset == 0) {
21314 InsertMask[*InsertIdx] = *InsertIdx;
21320 SmallBitVector UseMask =
21321 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
21322 SmallBitVector IsFirstPoison =
21324 SmallBitVector IsFirstUndef =
21326 if (!IsFirstPoison.
all()) {
21328 for (
unsigned I = 0;
I < NumElts;
I++) {
21330 IsFirstUndef.
test(
I)) {
21331 if (IsVNonPoisonous) {
21332 InsertMask[
I] =
I < NumScalars ?
I : 0;
21337 if (Idx >= NumScalars)
21338 Idx = NumScalars - 1;
21339 InsertMask[
I] = NumScalars + Idx;
21352 V = Builder.CreateShuffleVector(V, V2, InsertMask);
21354 GatherShuffleExtractSeq.insert(
I);
21355 CSEBlocks.insert(
I->getParent());
21360 for (
unsigned I = 0;
I < NumElts;
I++) {
21364 SmallBitVector UseMask =
21365 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
21366 SmallBitVector IsFirstUndef =
21368 if ((!IsIdentity ||
Offset != 0 || !IsFirstUndef.
all()) &&
21369 NumElts != NumScalars) {
21370 if (IsFirstUndef.
all()) {
21372 SmallBitVector IsFirstPoison =
21374 if (!IsFirstPoison.
all()) {
21375 for (
unsigned I = 0;
I < NumElts;
I++) {
21377 InsertMask[
I] =
I + NumElts;
21380 V = Builder.CreateShuffleVector(
21386 GatherShuffleExtractSeq.insert(
I);
21387 CSEBlocks.insert(
I->getParent());
21391 SmallBitVector IsFirstPoison =
21393 for (
unsigned I = 0;
I < NumElts;
I++) {
21397 InsertMask[
I] += NumElts;
21399 V = Builder.CreateShuffleVector(
21400 FirstInsert->getOperand(0), V, InsertMask,
21403 GatherShuffleExtractSeq.insert(
I);
21404 CSEBlocks.insert(
I->getParent());
21409 ++NumVectorInstructions;
21410 E->VectorizedValue =
V;
21413 case Instruction::ZExt:
21414 case Instruction::SExt:
21415 case Instruction::FPToUI:
21416 case Instruction::FPToSI:
21417 case Instruction::FPExt:
21418 case Instruction::PtrToInt:
21419 case Instruction::IntToPtr:
21420 case Instruction::SIToFP:
21421 case Instruction::UIToFP:
21422 case Instruction::Trunc:
21423 case Instruction::FPTrunc:
21424 case Instruction::BitCast: {
21425 setInsertPointAfterBundle(
E);
21427 Value *InVec = vectorizeOperand(
E, 0);
21432 auto SrcIt = MinBWs.find(getOperandEntry(
E, 0));
21434 (SrcIt != MinBWs.end() || It != MinBWs.end() ||
21437 unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy);
21438 if (SrcIt != MinBWs.end())
21439 SrcBWSz = SrcIt->second.first;
21440 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->
getScalarType());
21441 if (BWSz == SrcBWSz) {
21442 VecOpcode = Instruction::BitCast;
21443 }
else if (BWSz < SrcBWSz) {
21444 VecOpcode = Instruction::Trunc;
21445 }
else if (It != MinBWs.end()) {
21446 assert(BWSz > SrcBWSz &&
"Invalid cast!");
21447 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
21448 }
else if (SrcIt != MinBWs.end()) {
21449 assert(BWSz > SrcBWSz &&
"Invalid cast!");
21451 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
21453 }
else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
21454 !SrcIt->second.second) {
21455 VecOpcode = Instruction::UIToFP;
21456 }
else if (VecOpcode == Instruction::BitCast && SrcIt != MinBWs.end() &&
21458 Type *OrigSrcScalarTy = CI->getSrcTy();
21459 auto *OrigSrcVectorTy =
21462 Builder.CreateIntCast(InVec, OrigSrcVectorTy, SrcIt->second.second);
21464 Value *
V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
21466 : Builder.CreateCast(VecOpcode, InVec, VecTy);
21467 V = FinalShuffle(V,
E);
21469 E->VectorizedValue =
V;
21470 ++NumVectorInstructions;
21473 case Instruction::FCmp:
21474 case Instruction::ICmp: {
21475 setInsertPointAfterBundle(
E);
21477 Value *
L = vectorizeOperand(
E, 0);
21478 Value *
R = vectorizeOperand(
E, 1);
21479 if (
L->getType() !=
R->getType()) {
21482 MinBWs.contains(getOperandEntry(
E, 0)) ||
21483 MinBWs.contains(getOperandEntry(
E, 1))) &&
21484 "Expected item in MinBWs.");
21487 ->getIntegerBitWidth();
21490 ->getIntegerBitWidth();
21493 Type *CastTy =
R->getType();
21494 L = Builder.CreateIntCast(L, CastTy, GetOperandSignedness(0));
21496 Type *CastTy =
L->getType();
21497 R = Builder.CreateIntCast(R, CastTy, GetOperandSignedness(1));
21502 Value *
V = Builder.CreateCmp(P0, L, R);
21505 ICmp->setSameSign(
false);
21508 V = FinalShuffle(V,
E);
21510 E->VectorizedValue =
V;
21511 ++NumVectorInstructions;
21514 case Instruction::Select: {
21515 setInsertPointAfterBundle(
E);
21518 Value *True = vectorizeOperand(
E, 1);
21519 Value *False = vectorizeOperand(
E, 2);
21523 MinBWs.contains(getOperandEntry(
E, 1)) ||
21524 MinBWs.contains(getOperandEntry(
E, 2))) &&
21525 "Expected item in MinBWs.");
21526 if (True->
getType() != VecTy)
21527 True = Builder.CreateIntCast(True, VecTy, GetOperandSignedness(1));
21528 if (False->
getType() != VecTy)
21529 False = Builder.CreateIntCast(False, VecTy, GetOperandSignedness(2));
21534 assert(TrueNumElements >= CondNumElements &&
21535 TrueNumElements % CondNumElements == 0 &&
21536 "Cannot vectorize Instruction::Select");
21538 "Cannot vectorize Instruction::Select");
21539 if (CondNumElements != TrueNumElements) {
21542 Cond = Builder.CreateShuffleVector(
21547 "Cannot vectorize Instruction::Select");
21549 Builder.CreateSelectWithUnknownProfile(
Cond, True, False,
DEBUG_TYPE);
21550 V = FinalShuffle(V,
E);
21552 E->VectorizedValue =
V;
21553 ++NumVectorInstructions;
21556 case Instruction::FNeg: {
21557 setInsertPointAfterBundle(
E);
21559 Value *
Op = vectorizeOperand(
E, 0);
21561 Value *
V = Builder.CreateUnOp(
21567 V = FinalShuffle(V,
E);
21569 E->VectorizedValue =
V;
21570 ++NumVectorInstructions;
21574 case Instruction::Freeze: {
21575 setInsertPointAfterBundle(
E);
21577 Value *
Op = vectorizeOperand(
E, 0);
21579 if (
Op->getType() != VecTy) {
21581 MinBWs.contains(getOperandEntry(
E, 0))) &&
21582 "Expected item in MinBWs.");
21583 Op = Builder.CreateIntCast(
Op, VecTy, GetOperandSignedness(0));
21585 Value *
V = Builder.CreateFreeze(
Op);
21586 V = FinalShuffle(V,
E);
21588 E->VectorizedValue =
V;
21589 ++NumVectorInstructions;
21593 case Instruction::Add:
21594 case Instruction::FAdd:
21595 case Instruction::Sub:
21596 case Instruction::FSub:
21597 case Instruction::Mul:
21598 case Instruction::FMul:
21599 case Instruction::UDiv:
21600 case Instruction::SDiv:
21601 case Instruction::FDiv:
21602 case Instruction::URem:
21603 case Instruction::SRem:
21604 case Instruction::FRem:
21605 case Instruction::Shl:
21606 case Instruction::LShr:
21607 case Instruction::AShr:
21608 case Instruction::And:
21609 case Instruction::Or:
21610 case Instruction::Xor: {
21611 setInsertPointAfterBundle(
E);
21615 if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
21620 return CI && CI->getValue().countr_one() >= It->second.first;
21622 V = FinalShuffle(
I == 0 ?
RHS :
LHS,
E);
21623 E->VectorizedValue =
V;
21624 ++NumVectorInstructions;
21632 MinBWs.contains(getOperandEntry(
E, 0)) ||
21633 MinBWs.contains(getOperandEntry(
E, 1))) &&
21634 "Expected item in MinBWs.");
21636 LHS = Builder.CreateIntCast(
LHS, VecTy, GetOperandSignedness(0));
21638 RHS = Builder.CreateIntCast(
RHS, VecTy, GetOperandSignedness(1));
21641 Value *
V = Builder.CreateBinOp(
21648 if (!MinBWs.contains(
E) && ShuffleOrOp == Instruction::Sub &&
21650 return isa<PoisonValue>(V) ||
21651 (E->hasCopyableElements() && E->isCopyableElement(V)) ||
21652 isCommutative(cast<Instruction>(V));
21654 I->setHasNoUnsignedWrap(
false);
21657 V = FinalShuffle(V,
E);
21659 E->VectorizedValue =
V;
21660 ++NumVectorInstructions;
21664 case Instruction::Load: {
21667 setInsertPointAfterBundle(
E);
21671 FixedVectorType *StridedLoadTy =
nullptr;
21672 Value *PO = LI->getPointerOperand();
21673 if (
E->State == TreeEntry::Vectorize) {
21674 NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign());
21675 }
else if (
E->State == TreeEntry::CompressVectorize) {
21676 auto [CompressMask, LoadVecTy, InterleaveFactor, IsMasked] =
21677 CompressEntryToData.at(
E);
21678 Align CommonAlignment = LI->getAlign();
21684 for (
int I : CompressMask)
21688 MaskValues =
replicateMask(MaskValues, VecTy->getNumElements());
21691 NewLI = Builder.CreateMaskedLoad(LoadVecTy, PO, CommonAlignment,
21694 NewLI = Builder.CreateAlignedLoad(LoadVecTy, PO, CommonAlignment);
21705 }
else if (
E->State == TreeEntry::StridedVectorize) {
21708 PO = IsReverseOrder ? PtrN : Ptr0;
21709 Type *StrideTy = DL->getIndexType(PO->
getType());
21711 const StridedPtrInfo &SPtrInfo = TreeEntryToStridedPtrInfoMap.at(
E);
21712 StridedLoadTy = SPtrInfo.Ty;
21713 assert(StridedLoadTy &&
"Missing StridedPoinerInfo for tree entry.");
21714 unsigned StridedLoadEC =
21717 Value *Stride = SPtrInfo.StrideVal;
21719 const SCEV *StrideSCEV = SPtrInfo.StrideSCEV;
21720 assert(StrideSCEV &&
"Neither StrideVal nor StrideSCEV were set.");
21721 SCEVExpander Expander(*SE,
"strided-load-vec");
21722 Stride = Expander.expandCodeFor(StrideSCEV, StrideSCEV->
getType(),
21723 &*Builder.GetInsertPoint());
21726 Builder.CreateIntCast(Stride, StrideTy,
true);
21727 StrideVal = Builder.CreateMul(
21729 StrideTy, (IsReverseOrder ? -1 : 1) *
21731 DL->getTypeAllocSize(ScalarTy))));
21733 auto *Inst = Builder.CreateIntrinsic(
21734 Intrinsic::experimental_vp_strided_load,
21735 {StridedLoadTy, PO->
getType(), StrideTy},
21738 Builder.getInt32(StridedLoadEC)});
21739 Inst->addParamAttr(
21741 Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
21744 assert(
E->State == TreeEntry::ScatterVectorize &&
"Unhandled state");
21745 Value *VecPtr = vectorizeOperand(
E, 0);
21750 unsigned ScalarTyNumElements =
21752 unsigned VecTyNumElements =
21754 assert(VecTyNumElements % ScalarTyNumElements == 0 &&
21755 "Cannot expand getelementptr.");
21756 unsigned VF = VecTyNumElements / ScalarTyNumElements;
21759 return Builder.getInt64(I % ScalarTyNumElements);
21761 VecPtr = Builder.CreateGEP(
21762 VecTy->getElementType(),
21763 Builder.CreateShuffleVector(
21769 NewLI = Builder.CreateMaskedGather(VecTy, VecPtr, CommonAlignment);
21771 Value *
V =
E->State == TreeEntry::CompressVectorize
21775 if (StridedLoadTy != VecTy)
21776 V = Builder.CreateBitOrPointerCast(V, VecTy);
21777 V = FinalShuffle(V,
E);
21778 E->VectorizedValue =
V;
21779 ++NumVectorInstructions;
21782 case Instruction::Store: {
21785 setInsertPointAfterBundle(
E);
21787 Value *VecValue = vectorizeOperand(
E, 0);
21788 if (VecValue->
getType() != VecTy)
21790 Builder.CreateIntCast(VecValue, VecTy, GetOperandSignedness(0));
21791 VecValue = FinalShuffle(VecValue,
E);
21793 Value *Ptr =
SI->getPointerOperand();
21795 if (
E->State == TreeEntry::Vectorize) {
21796 ST = Builder.CreateAlignedStore(VecValue, Ptr,
SI->getAlign());
21798 assert(
E->State == TreeEntry::StridedVectorize &&
21799 "Expected either strided or consecutive stores.");
21800 if (!
E->ReorderIndices.empty()) {
21802 Ptr =
SI->getPointerOperand();
21805 Type *StrideTy = DL->getIndexType(
SI->getPointerOperandType());
21806 auto *Inst = Builder.CreateIntrinsic(
21807 Intrinsic::experimental_vp_strided_store,
21808 {VecTy, Ptr->
getType(), StrideTy},
21811 StrideTy, -
static_cast<int>(DL->getTypeAllocSize(ScalarTy))),
21812 Builder.getAllOnesMask(VecTy->getElementCount()),
21813 Builder.getInt32(
E->Scalars.size())});
21814 Inst->addParamAttr(
21816 Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
21822 E->VectorizedValue =
V;
21823 ++NumVectorInstructions;
21826 case Instruction::GetElementPtr: {
21828 setInsertPointAfterBundle(
E);
21830 Value *Op0 = vectorizeOperand(
E, 0);
21833 for (
int J = 1,
N = GEP0->getNumOperands(); J <
N; ++J) {
21834 Value *OpVec = vectorizeOperand(
E, J);
21838 Value *
V = Builder.CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs);
21841 for (
Value *V :
E->Scalars) {
21848 V = FinalShuffle(V,
E);
21850 E->VectorizedValue =
V;
21851 ++NumVectorInstructions;
21855 case Instruction::Call: {
21857 setInsertPointAfterBundle(
E);
21862 CI,
ID, VecTy->getNumElements(),
21863 It != MinBWs.end() ? It->second.first : 0, TTI);
21866 VecCallCosts.first <= VecCallCosts.second;
21868 Value *ScalarArg =
nullptr;
21879 ScalarArg = CEI->getArgOperand(
I);
21882 if (
ID == Intrinsic::abs && It != MinBWs.end() &&
21883 It->second.first < DL->getTypeSizeInBits(CEI->getType()))
21884 ScalarArg = Builder.getFalse();
21891 Value *OpVec = vectorizeOperand(
E,
I);
21892 ScalarArg = CEI->getArgOperand(
I);
21895 It == MinBWs.end()) {
21898 OpVec = Builder.CreateIntCast(OpVec, CastTy, GetOperandSignedness(
I));
21899 }
else if (It != MinBWs.end()) {
21900 OpVec = Builder.CreateIntCast(OpVec, VecTy, GetOperandSignedness(
I));
21909 if (!UseIntrinsic) {
21914 CF = VFDatabase(*CI).getVectorizedFunction(Shape);
21921 Value *
V = Builder.CreateCall(CF, OpVecs, OpBundles);
21925 V = FinalShuffle(V,
E);
21927 E->VectorizedValue =
V;
21928 ++NumVectorInstructions;
21931 case Instruction::ShuffleVector: {
21934 setInsertPointAfterBundle(
E);
21935 Value *Src = vectorizeOperand(
E, 0);
21938 SmallVector<int> NewMask(ThisMask.size());
21940 return SVSrc->getShuffleMask()[Mask];
21942 V = Builder.CreateShuffleVector(SVSrc->getOperand(0),
21943 SVSrc->getOperand(1), NewMask);
21945 V = Builder.CreateShuffleVector(Src, ThisMask);
21950 V = FinalShuffle(V,
E);
21958 "Invalid Shuffle Vector Operand");
21962 setInsertPointAfterBundle(
E);
21963 LHS = vectorizeOperand(
E, 0);
21964 RHS = vectorizeOperand(
E, 1);
21966 setInsertPointAfterBundle(
E);
21967 LHS = vectorizeOperand(
E, 0);
21973 assert((It != MinBWs.end() ||
21974 getOperandEntry(
E, 0)->State == TreeEntry::NeedToGather ||
21975 getOperandEntry(
E, 1)->State == TreeEntry::NeedToGather ||
21976 MinBWs.contains(getOperandEntry(
E, 0)) ||
21977 MinBWs.contains(getOperandEntry(
E, 1))) &&
21978 "Expected item in MinBWs.");
21979 Type *CastTy = VecTy;
21985 ->getIntegerBitWidth())
21991 LHS = Builder.CreateIntCast(
LHS, CastTy, GetOperandSignedness(0));
21993 RHS = Builder.CreateIntCast(
RHS, CastTy, GetOperandSignedness(1));
21998 V0 = Builder.CreateBinOp(
22000 V1 = Builder.CreateBinOp(
22003 V0 = Builder.CreateCmp(CI0->getPredicate(),
LHS,
RHS);
22006 V1 = Builder.CreateCmp(AltPred,
LHS,
RHS);
22009 unsigned SrcBWSz = DL->getTypeSizeInBits(
22011 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
22012 if (BWSz <= SrcBWSz) {
22013 if (BWSz < SrcBWSz)
22014 LHS = Builder.CreateIntCast(
LHS, VecTy, It->second.first);
22016 "Expected same type as operand.");
22020 E->VectorizedValue =
LHS;
22021 ++NumVectorInstructions;
22025 V0 = Builder.CreateCast(
22027 V1 = Builder.CreateCast(
22032 for (
Value *V : {V0, V1}) {
22034 GatherShuffleExtractSeq.insert(
I);
22035 CSEBlocks.insert(
I->getParent());
22043 SmallVector<int>
Mask;
22044 E->buildAltOpShuffleMask(
22045 [
E,
this](Instruction *
I) {
22046 assert(
E->getMatchingMainOpOrAltOp(
I) &&
22047 "Unexpected main/alternate opcode");
22051 Mask, &OpScalars, &AltScalars);
22055 auto DropNuwFlag = [&](
Value *Vec,
unsigned Opcode) {
22058 I && Opcode == Instruction::Sub && !MinBWs.contains(
E) &&
22060 if (isa<PoisonValue>(V))
22062 if (E->hasCopyableElements() && E->isCopyableElement(V))
22064 auto *IV = cast<Instruction>(V);
22065 return IV->getOpcode() == Instruction::Sub && isCommutative(IV);
22067 I->setHasNoUnsignedWrap(
false);
22069 DropNuwFlag(V0,
E->getOpcode());
22070 DropNuwFlag(V1,
E->getAltOpcode());
22076 V = Builder.CreateShuffleVector(V0, V1, Mask);
22079 GatherShuffleExtractSeq.insert(
I);
22080 CSEBlocks.insert(
I->getParent());
22084 E->VectorizedValue =
V;
22085 ++NumVectorInstructions;
22089 case TreeEntry::ReducedBitcast:
22090 case TreeEntry::ReducedBitcastBSwap: {
22091 assert(UserIgnoreList &&
"Expected reduction operations only.");
22092 setInsertPointAfterBundle(
E);
22093 TreeEntry *ZExt = getOperandEntry(
E, 0);
22095 ZExt->getMainOp()->getType(), ZExt->getVectorFactor()));
22096 TreeEntry *
Const = getOperandEntry(
E, 1);
22098 Const->Scalars.front()->getType(),
Const->getVectorFactor()));
22099 Value *
Op = vectorizeOperand(ZExt, 0);
22102 DL->getTypeSizeInBits(
cast<CastInst>(ZExt->getMainOp())->getSrcTy()) *
22103 E->getVectorFactor());
22104 auto *OrigScalarTy = ScalarTy;
22107 Op = FinalShuffle(
Op,
E);
22108 auto *
V = Builder.CreateBitCast(
Op, SrcType);
22109 ++NumVectorInstructions;
22110 if (ShuffleOrOp == TreeEntry::ReducedBitcastBSwap) {
22111 V = Builder.CreateUnaryIntrinsic(Intrinsic::bswap, V);
22112 ++NumVectorInstructions;
22114 if (SrcType != OrigScalarTy) {
22115 V = Builder.CreateIntCast(V, OrigScalarTy,
false);
22116 ++NumVectorInstructions;
22118 E->VectorizedValue =
V;
22121 case TreeEntry::ReducedBitcastLoads:
22122 case TreeEntry::ReducedBitcastBSwapLoads: {
22123 assert(UserIgnoreList &&
"Expected reduction operations only.");
22124 TreeEntry *ZExt = getOperandEntry(
E, 0);
22125 TreeEntry *
Load = getOperandEntry(ZExt, 0);
22126 setInsertPointAfterBundle(Load);
22128 ZExt->getMainOp()->getType(), ZExt->getVectorFactor()));
22129 TreeEntry *
Const = getOperandEntry(
E, 1);
22131 Const->Scalars.front()->getType(),
Const->getVectorFactor()));
22133 Load->getMainOp()->getType(),
Load->getVectorFactor()));
22135 Value *PO = LI->getPointerOperand();
22138 DL->getTypeSizeInBits(
cast<CastInst>(ZExt->getMainOp())->getSrcTy()) *
22139 E->getVectorFactor());
22140 auto *OrigScalarTy = ScalarTy;
22141 ScalarTy = ZExt->getMainOp()->getType();
22142 Value *
V = Builder.CreateAlignedLoad(SrcTy, PO, LI->getAlign());
22143 ++NumVectorInstructions;
22144 if (ShuffleOrOp == TreeEntry::ReducedBitcastBSwapLoads) {
22145 V = Builder.CreateUnaryIntrinsic(Intrinsic::bswap, V);
22146 ++NumVectorInstructions;
22148 if (SrcTy != OrigScalarTy) {
22149 V = Builder.CreateIntCast(V, OrigScalarTy,
false);
22150 ++NumVectorInstructions;
22152 E->VectorizedValue =
V;
22155 case TreeEntry::ReducedCmpBitcast: {
22156 assert(UserIgnoreList &&
"Expected reduction operations only.");
22157 setInsertPointAfterBundle(
E);
22158 TreeEntry *Op1TE = getOperandEntry(
E, 1);
22159 TreeEntry *Op2TE = getOperandEntry(
E, 2);
22160 Op1TE->VectorizedValue =
22162 Op2TE->VectorizedValue =
22167 IntegerType::getIntNTy(ScalarTy->
getContext(),
E->getVectorFactor());
22168 auto *
V = Builder.CreateBitCast(Cmp, DstTy);
22169 ++NumVectorInstructions;
22170 if (DstTy != ScalarTy) {
22171 V = Builder.CreateIntCast(V, ScalarTy,
false);
22172 ++NumVectorInstructions;
22174 E->VectorizedValue =
V;
22191 ArrayRef<std::tuple<WeakTrackingVH, unsigned, bool, bool>>
22192 VectorValuesAndScales) {
22195 EntryToLastInstruction.clear();
22197 for (
auto &BSIter : BlocksSchedules)
22198 scheduleBlock(*
this, BSIter.second.get());
22201 for (
const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
22204 if (TE->isGather() || DeletedNodes.contains(TE.get()) ||
22205 (TE->State == TreeEntry::CombinedVectorize &&
22206 (TE->CombinedOp == TreeEntry::ReducedBitcast ||
22207 TE->CombinedOp == TreeEntry::ReducedBitcastBSwap ||
22208 ((TE->CombinedOp == TreeEntry::ReducedBitcastLoads ||
22209 TE->CombinedOp == TreeEntry::ReducedBitcastBSwapLoads ||
22210 TE->CombinedOp == TreeEntry::ReducedCmpBitcast) &&
22211 (!TE->hasState() || TE->getOpcode() != Instruction::Load)))))
22213 (void)getLastInstructionInBundle(TE.get());
22217 Builder.SetInsertPoint(ReductionRoot->
getParent(),
22220 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
22224 for (
const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
22225 if (DeletedNodes.contains(TE.get()))
22227 if (TE->isGather() && !TE->VectorizedValue && TE->UserTreeIndex.UserTE &&
22228 TE->UserTreeIndex.UserTE->hasState() &&
22229 TE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
22230 (TE->UserTreeIndex.UserTE->getOpcode() != Instruction::PHI ||
22231 TE->UserTreeIndex.UserTE->isAltShuffle()) &&
22232 !TE->UserTreeIndex.UserTE->hasCopyableElements() &&
22233 all_of(TE->UserTreeIndex.UserTE->Scalars,
22234 [](
Value *V) { return isUsedOutsideBlock(V); })) {
22236 getLastInstructionInBundle(TE->UserTreeIndex.UserTE);
22240 for (
auto &Entry : GatherEntries) {
22242 Builder.SetInsertPoint(Entry.second);
22243 Builder.SetCurrentDebugLocation(Entry.second->getDebugLoc());
22248 for (
const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
22249 if (DeletedNodes.contains(TE.get()))
22251 if (GatheredLoadsEntriesFirst.has_value() &&
22252 TE->Idx >= *GatheredLoadsEntriesFirst && !TE->VectorizedValue &&
22253 (!TE->isGather() || TE->UserTreeIndex)) {
22254 assert((TE->UserTreeIndex ||
22255 (TE->getOpcode() == Instruction::Load && !TE->isGather())) &&
22256 "Expected gathered load node.");
22265 for (
const TreeEntry *E : PostponedNodes) {
22266 auto *TE =
const_cast<TreeEntry *
>(E);
22268 TE->VectorizedValue =
nullptr;
22279 (TE->UserTreeIndex.UserTE->hasState() &&
22280 TE->UserTreeIndex.UserTE->State != TreeEntry::SplitVectorize &&
22281 TE->UserTreeIndex.UserTE->getOpcode() == Instruction::PHI)) {
22290 if (UI->comesBefore(InsertPt))
22293 Builder.SetInsertPoint(InsertPt);
22295 Builder.SetInsertPoint(PrevVec);
22297 Builder.SetCurrentDebugLocation(UserI->getDebugLoc());
22300 VecI && VecI->getParent() == Builder.GetInsertBlock() &&
22301 Builder.GetInsertPoint()->comesBefore(VecI))
22302 VecI->moveBeforePreserving(*Builder.GetInsertBlock(),
22303 Builder.GetInsertPoint());
22304 if (Vec->
getType() != PrevVec->getType()) {
22306 PrevVec->getType()->isIntOrIntVectorTy() &&
22307 "Expected integer vector types only.");
22308 std::optional<bool> IsSigned;
22309 for (
Value *V : TE->Scalars) {
22311 for (
const TreeEntry *MNTE : getTreeEntries(V)) {
22312 auto It = MinBWs.find(MNTE);
22313 if (It != MinBWs.end()) {
22314 IsSigned = IsSigned.value_or(
false) || It->second.second;
22319 if (IsSigned.value_or(
false))
22322 for (
const TreeEntry *BVE : ValueToGatherNodes.lookup(V)) {
22323 auto It = MinBWs.find(BVE);
22324 if (It != MinBWs.end()) {
22325 IsSigned = IsSigned.value_or(
false) || It->second.second;
22330 if (IsSigned.value_or(
false))
22334 IsSigned.value_or(
false) ||
22338 if (IsSigned.value_or(
false))
22342 if (IsSigned.value_or(
false)) {
22344 auto It = MinBWs.find(TE->UserTreeIndex.UserTE);
22345 if (It != MinBWs.end())
22346 IsSigned = It->second.second;
22349 "Expected user node or perfect diamond match in MinBWs.");
22350 Vec = Builder.CreateIntCast(Vec, PrevVec->
getType(), *IsSigned);
22352 PrevVec->replaceAllUsesWith(Vec);
22353 PostponedValues.
try_emplace(Vec).first->second.push_back(TE);
22356 auto It = PostponedValues.
find(PrevVec);
22357 if (It != PostponedValues.
end()) {
22358 for (TreeEntry *VTE : It->getSecond())
22359 VTE->VectorizedValue = Vec;
22379 for (
const auto &ExternalUse : ExternalUses) {
22380 Value *Scalar = ExternalUse.Scalar;
22387 const TreeEntry *E = &ExternalUse.E;
22388 assert(E &&
"Invalid scalar");
22389 assert(!E->isGather() &&
"Extracting from a gather list");
22391 if (E->getOpcode() == Instruction::GetElementPtr &&
22395 Value *Vec = E->VectorizedValue;
22396 assert(Vec &&
"Can't find vectorizable value");
22398 Value *Lane = Builder.getInt32(ExternalUse.Lane);
22399 auto ExtractAndExtendIfNeeded = [&](
Value *Vec) {
22400 if (Scalar->getType() != Vec->
getType()) {
22401 Value *Ex =
nullptr;
22402 Value *ExV =
nullptr;
22404 bool ReplaceInst = Inst && ExternalUsesAsOriginalScalar.contains(Inst);
22405 auto It = ScalarToEEs.
find(Scalar);
22406 if (It != ScalarToEEs.
end()) {
22409 auto EEIt = It->second.find(ReplaceInst ? Inst->getParent()
22410 : Builder.GetInsertBlock());
22411 if (EEIt != It->second.end()) {
22412 Value *PrevV = EEIt->second.first;
22414 I && !ReplaceInst &&
22415 Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() &&
22416 Builder.GetInsertPoint()->comesBefore(
I)) {
22417 I->moveBefore(*Builder.GetInsertPoint()->getParent(),
22418 Builder.GetInsertPoint());
22423 ExV = EEIt->second.second ? EEIt->second.second : Ex;
22432 IgnoredExtracts.
insert(EE);
22435 auto *CloneInst = Inst->clone();
22436 CloneInst->insertBefore(Inst->getIterator());
22437 if (Inst->hasName())
22438 CloneInst->takeName(Inst);
22443 Value *V = ES->getVectorOperand();
22446 V = ETEs.front()->VectorizedValue;
22448 !
IV ||
IV == Vec ||
IV->getParent() != IVec->getParent() ||
22449 IV->comesBefore(IVec))
22450 Ex = Builder.CreateExtractElement(V, ES->getIndexOperand());
22452 Ex = Builder.CreateExtractElement(Vec, Lane);
22453 }
else if (
auto *VecTy =
22456 unsigned VecTyNumElements = VecTy->getNumElements();
22461 ExternalUse.Lane * VecTyNumElements);
22463 Ex = Builder.CreateExtractElement(Vec, Lane);
22468 if (Scalar->getType() != Ex->
getType())
22469 ExV = Builder.CreateIntCast(
22474 : &F->getEntryBlock(),
22475 std::make_pair(Ex, ExV));
22481 GatherShuffleExtractSeq.insert(ExI);
22482 CSEBlocks.insert(ExI->getParent());
22488 "In-tree scalar of vector type is not insertelement?");
22497 if (!ScalarsWithNullptrUser.
insert(Scalar).second)
22500 (ExternallyUsedValues.
count(Scalar) ||
22501 ExternalUsesWithNonUsers.count(Scalar) ||
22502 ExternalUsesAsOriginalScalar.contains(Scalar) ||
22506 if (ExternalUsesAsOriginalScalar.contains(U))
22508 ArrayRef<TreeEntry *> UseEntries = getTreeEntries(U);
22509 return !UseEntries.empty() &&
22510 (E->State == TreeEntry::Vectorize ||
22511 E->State == TreeEntry::StridedVectorize ||
22512 E->State == TreeEntry::CompressVectorize) &&
22513 any_of(UseEntries, [&, TTI = TTI](TreeEntry *UseEntry) {
22514 return (UseEntry->State == TreeEntry::Vectorize ||
22516 TreeEntry::StridedVectorize ||
22518 TreeEntry::CompressVectorize) &&
22519 doesInTreeUserNeedToExtract(
22520 Scalar, getRootEntryInstruction(*UseEntry),
22524 "Scalar with nullptr User must be registered in "
22525 "ExternallyUsedValues map or remain as scalar in vectorized "
22529 if (
PHI->getParent()->isLandingPad())
22530 Builder.SetInsertPoint(
22533 PHI->getParent()->getLandingPadInst()->getIterator()));
22535 Builder.SetInsertPoint(
PHI->getParent(),
22536 PHI->getParent()->getFirstNonPHIIt());
22538 Builder.SetInsertPoint(VecI->getParent(),
22539 std::next(VecI->getIterator()));
22542 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
22544 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
22546 if (Scalar != NewInst) {
22549 "Extractelements should not be replaced.");
22550 Scalar->replaceAllUsesWith(NewInst);
22560 if (!UsedInserts.
insert(VU).second)
22563 auto BWIt = MinBWs.find(E);
22565 auto *ScalarTy = FTy->getElementType();
22566 auto Key = std::make_pair(Vec, ScalarTy);
22567 auto VecIt = VectorCasts.
find(
Key);
22568 if (VecIt == VectorCasts.
end()) {
22571 if (IVec->getParent()->isLandingPad())
22572 Builder.SetInsertPoint(IVec->getParent(),
22573 std::next(IVec->getParent()
22574 ->getLandingPadInst()
22577 Builder.SetInsertPoint(
22578 IVec->getParent()->getFirstNonPHIOrDbgOrLifetime());
22580 Builder.SetInsertPoint(IVec->getNextNode());
22582 Vec = Builder.CreateIntCast(
22587 BWIt->second.second);
22590 Vec = VecIt->second;
22597 ShuffledInserts, [VU](
const ShuffledInsertData<Value *> &
Data) {
22604 unsigned Idx = *InsertIdx;
22605 if (It == ShuffledInserts.
end()) {
22607 It = std::next(ShuffledInserts.
begin(),
22608 ShuffledInserts.
size() - 1);
22613 Mask[Idx] = ExternalUse.Lane;
22625 for (
unsigned I :
seq<unsigned>(0, PH->getNumIncomingValues())) {
22626 if (PH->getIncomingValue(
I) == Scalar) {
22628 PH->getIncomingBlock(
I)->getTerminator();
22630 Builder.SetInsertPoint(VecI->getParent(),
22631 std::next(VecI->getIterator()));
22633 Builder.SetInsertPoint(PH->getIncomingBlock(
I)->getTerminator());
22635 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
22636 PH->setOperand(
I, NewInst);
22641 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
22645 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
22646 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
22657 for (
int I = 0, E = Mask.size();
I < E; ++
I) {
22659 CombinedMask1[
I] = Mask[
I];
22661 CombinedMask2[
I] = Mask[
I] - VF;
22663 ShuffleInstructionBuilder ShuffleBuilder(
22665 ShuffleBuilder.add(V1, CombinedMask1);
22667 ShuffleBuilder.add(V2, CombinedMask2);
22668 return ShuffleBuilder.finalize({}, {}, {});
22671 auto &&ResizeToVF = [&CreateShuffle](
Value *Vec, ArrayRef<int>
Mask,
22672 bool ForSingleMask) {
22673 unsigned VF =
Mask.size();
22676 if (
any_of(Mask, [VF](
int Idx) {
return Idx >=
static_cast<int>(VF); })) {
22677 Vec = CreateShuffle(Vec,
nullptr, Mask);
22678 return std::make_pair(Vec,
true);
22680 if (!ForSingleMask) {
22682 for (
unsigned I = 0;
I < VF; ++
I) {
22686 Vec = CreateShuffle(Vec,
nullptr, ResizeMask);
22690 return std::make_pair(Vec,
false);
22694 for (
int I = 0,
E = ShuffledInserts.size();
I <
E; ++
I) {
22697 InsertElementInst *FirstInsert = ShuffledInserts[
I].InsertElements.front();
22698 InsertElementInst *LastInsert = ShuffledInserts[
I].InsertElements.back();
22699 Builder.SetInsertPoint(LastInsert);
22700 auto Vector = ShuffledInserts[
I].ValueMasks.takeVector();
22705 return cast<VectorType>(Vec->getType())
22706 ->getElementCount()
22707 .getKnownMinValue();
22710 [FirstInsert, &CreateShuffle](ArrayRef<int> Mask,
22712 assert((Vals.size() == 1 || Vals.size() == 2) &&
22713 "Expected exactly 1 or 2 input values.");
22714 if (Vals.size() == 1) {
22717 if (Mask.size() != cast<FixedVectorType>(Vals.front()->getType())
22718 ->getNumElements() ||
22719 !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
22720 return CreateShuffle(Vals.front(), nullptr, Mask);
22721 return Vals.front();
22723 return CreateShuffle(Vals.
front() ? Vals.
front()
22725 Vals.
back(), Mask);
22727 auto It = ShuffledInserts[
I].InsertElements.rbegin();
22729 InsertElementInst *
II =
nullptr;
22730 if (It != ShuffledInserts[
I].InsertElements.rend())
22733 while (It != ShuffledInserts[
I].InsertElements.rend()) {
22734 assert(
II &&
"Must be an insertelement instruction.");
22741 for (Instruction *
II :
reverse(Inserts)) {
22742 II->replaceUsesOfWith(
II->getOperand(0), NewInst);
22744 if (
II->getParent() == NewI->getParent() &&
II->comesBefore(NewI))
22745 II->moveAfter(NewI);
22749 for (InsertElementInst *IE :
reverse(ShuffledInserts[
I].InsertElements)) {
22750 IE->replaceUsesOfWith(
IE->getOperand(0),
22752 IE->replaceUsesOfWith(
IE->getOperand(1),
22756 CSEBlocks.insert(LastInsert->
getParent());
22761 for (
auto &TEPtr : VectorizableTree) {
22762 TreeEntry *
Entry = TEPtr.get();
22765 if (
Entry->isGather() ||
Entry->State == TreeEntry::SplitVectorize ||
22766 DeletedNodes.contains(Entry) ||
22767 TransformedToGatherNodes.contains(Entry))
22770 if (
Entry->CombinedOp == TreeEntry::ReducedBitcast ||
22771 Entry->CombinedOp == TreeEntry::ReducedBitcastBSwap ||
22772 Entry->CombinedOp == TreeEntry::ReducedBitcastLoads ||
22773 Entry->CombinedOp == TreeEntry::ReducedBitcastBSwapLoads ||
22774 Entry->CombinedOp == TreeEntry::ReducedCmpBitcast) {
22776 if (!
Entry->hasState()) {
22783 if (!
I ||
Entry->isCopyableElement(
I))
22791 assert(
Entry->VectorizedValue &&
"Can't find vectorizable value");
22794 for (
int Lane = 0, LE =
Entry->Scalars.size(); Lane != LE; ++Lane) {
22797 if (
Entry->getOpcode() == Instruction::GetElementPtr &&
22801 EE && IgnoredExtracts.contains(EE))
22808 for (User *U :
Scalar->users()) {
22813 (UserIgnoreList && UserIgnoreList->contains(U)) ||
22816 "Deleting out-of-tree value");
22820 LLVM_DEBUG(
dbgs() <<
"SLP: \tErasing scalar:" << *Scalar <<
".\n");
22829 V->mergeDIAssignID(RemovedInsts);
22832 if (UserIgnoreList) {
22833 for (Instruction *
I : RemovedInsts) {
22834 const TreeEntry *
IE = getTreeEntries(
I).front();
22836 !SplitEntries.empty() && SplitEntries.front()->Idx <
IE->Idx)
22837 IE = SplitEntries.front();
22838 if (
IE->Idx != 0 &&
22839 !(VectorizableTree.front()->isGather() &&
IE->UserTreeIndex &&
22840 (ValueToGatherNodes.lookup(
I).contains(
22841 VectorizableTree.front().get()) ||
22842 (
IE->UserTreeIndex.UserTE == VectorizableTree.front().get() &&
22843 IE->UserTreeIndex.EdgeIdx == UINT_MAX))) &&
22844 !(VectorizableTree.front()->State == TreeEntry::SplitVectorize &&
22845 IE->UserTreeIndex &&
22847 !(GatheredLoadsEntriesFirst.has_value() &&
22848 IE->Idx >= *GatheredLoadsEntriesFirst &&
22849 VectorizableTree.front()->isGather() &&
22851 !(!VectorizableTree.front()->isGather() &&
22852 VectorizableTree.front()->isCopyableElement(
I)))
22857 bool IsPoisoningLogicalOp = isa<SelectInst>(U.getUser()) &&
22858 (match(U.getUser(), m_LogicalAnd()) ||
22859 match(U.getUser(), m_LogicalOr())) &&
22860 U.getOperandNo() == 0;
22861 if (IsPoisoningLogicalOp) {
22862 LogicalOpSelects.push_back(cast<SelectInst>(U.getUser()));
22865 return UserIgnoreList->contains(
U.getUser());
22869 for (SelectInst *SI : LogicalOpSelects)
22879 Builder.ClearInsertionPoint();
22880 InstrElementSize.clear();
22882 const TreeEntry &RootTE = *VectorizableTree.front();
22883 Value *Vec = RootTE.VectorizedValue;
22884 if (
auto It = MinBWs.find(&RootTE); ReductionBitWidth != 0 &&
22885 It != MinBWs.end() &&
22886 ReductionBitWidth != It->second.first) {
22887 IRBuilder<>::InsertPointGuard Guard(Builder);
22888 Builder.SetInsertPoint(ReductionRoot->getParent(),
22889 ReductionRoot->getIterator());
22891 Vec = Builder.CreateIntCast(Vec, Builder.getIntNTy(ReductionBitWidth),
22892 It->second.second);
22895 Vec = Builder.CreateIntCast(
22897 VectorType::get(Builder.getIntNTy(ReductionBitWidth),
22899 It->second.second);
22906 LLVM_DEBUG(
dbgs() <<
"SLP: Optimizing " << GatherShuffleExtractSeq.size()
22907 <<
" gather sequences instructions.\n");
22914 Loop *L = LI->getLoopFor(
I->getParent());
22919 BasicBlock *PreHeader = L->getLoopPreheader();
22927 auto *OpI = dyn_cast<Instruction>(V);
22928 return OpI && L->contains(OpI);
22934 CSEBlocks.insert(PreHeader);
22939 CSEWorkList.
reserve(CSEBlocks.size());
22942 assert(DT->isReachableFromEntry(
N));
22949 assert((
A ==
B) == (
A->getDFSNumIn() ==
B->getDFSNumIn()) &&
22950 "Different nodes should have different DFS numbers");
22951 return A->getDFSNumIn() <
B->getDFSNumIn();
22959 auto &&IsIdenticalOrLessDefined = [TTI = TTI](
Instruction *I1,
22962 if (I1->getType() != I2->getType())
22967 return I1->isIdenticalTo(I2);
22968 if (SI1->isIdenticalTo(SI2))
22970 for (
int I = 0, E = SI1->getNumOperands();
I < E; ++
I)
22971 if (SI1->getOperand(
I) != SI2->getOperand(
I))
22974 NewMask.
assign(SI2->getShuffleMask().begin(), SI2->getShuffleMask().end());
22978 unsigned LastUndefsCnt = 0;
22979 for (
int I = 0, E = NewMask.
size();
I < E; ++
I) {
22985 NewMask[
I] != SM1[
I])
22988 NewMask[
I] = SM1[
I];
22992 return SM1.
size() - LastUndefsCnt > 1 &&
22996 SM1.
size() - LastUndefsCnt));
23002 for (
auto I = CSEWorkList.
begin(), E = CSEWorkList.
end();
I != E; ++
I) {
23004 (
I == CSEWorkList.
begin() || !DT->dominates(*
I, *std::prev(
I))) &&
23005 "Worklist not sorted properly!");
23012 !GatherShuffleExtractSeq.contains(&In))
23017 bool Replaced =
false;
23020 if (IsIdenticalOrLessDefined(&In, V, NewMask) &&
23021 DT->dominates(V->getParent(), In.getParent())) {
23022 In.replaceAllUsesWith(V);
23025 if (!NewMask.
empty())
23026 SI->setShuffleMask(NewMask);
23031 GatherShuffleExtractSeq.contains(V) &&
23032 IsIdenticalOrLessDefined(V, &In, NewMask) &&
23033 DT->dominates(In.getParent(), V->getParent())) {
23035 V->replaceAllUsesWith(&In);
23038 if (!NewMask.
empty())
23039 SI->setShuffleMask(NewMask);
23047 Visited.push_back(&In);
23052 GatherShuffleExtractSeq.clear();
// Builds a ScheduleBundle for the scalars in VL under instruction state S.
// A fresh bundle is appended to ScheduledBundlesList; for each schedulable
// scalar, either a ScheduleCopyableData entry (copyable elements) or the
// instruction's ScheduleData is added to the bundle. Non-copyable members are
// also registered in the ScheduledBundles map.
// NOTE(review): this file region is a lossy extraction — several original
// source lines are missing between the numbered fragments; code text is kept
// byte-identical.
23055BoUpSLP::ScheduleBundle &BoUpSLP::BlockScheduling::buildBundle(
23058 ScheduledBundlesList.emplace_back(std::make_unique<ScheduleBundle>());
23059 for (
Value *V : VL) {
// Values that require no scheduling do not contribute bundle members.
23060 if (S.isNonSchedulable(V))
// Copyable elements get a dedicated ScheduleCopyableData keyed by the
// user edge EI and the current scheduling region.
23063 if (S.isCopyableElement(V)) {
23065 ScheduleCopyableData &SD =
23066 addScheduleCopyableData(EI,
I, SchedulingRegionID, *BundlePtr);
23068 BundlePtr->add(&SD);
// Regular members must already have ScheduleData in this block's region.
23071 ScheduleData *BundleMember = getScheduleData(V);
23072 assert(BundleMember &&
"no ScheduleData for bundle member "
23073 "(maybe not in same basic block)");
23075 BundlePtr->add(BundleMember);
// Record the instruction -> bundle mapping for later lookups.
23076 ScheduledBundles.try_emplace(
I).first->getSecond().push_back(
23079 assert(BundlePtr && *BundlePtr &&
"Failed to find schedule bundle");
// Attempts to schedule the scalars VL as one bundle. Returns std::nullopt on
// the many early-exit paths visible below (non-schedulable/copyable-element
// corner cases, failure to extend the scheduling region, etc.); otherwise the
// bundle is built via buildBundle and its dependencies are computed.
// NOTE(review): lossy extraction — numbered fragments below skip original
// lines, so several conditions/bodies are only partially visible; comments
// are limited to what the surviving text establishes.
23085std::optional<BoUpSLP::ScheduleBundle *>
23087 const InstructionsState &S,
// Fast predicate: the whole bundle needs no scheduling at all.
23100 bool HasCopyables = S.areInstructionsWithCopyableElements();
23101 bool DoesNotRequireScheduling =
23103 all_of(VL, [&](
Value *V) {
return S.isNonSchedulable(V); });
// Series of bail-outs for copyable-element configurations that the
// scheduler cannot (or should not) handle; each returns std::nullopt.
23104 if (!DoesNotRequireScheduling && S.areInstructionsWithCopyableElements() &&
23105 EI && EI.UserTE->hasState() && EI.UserTE->doesNotNeedToSchedule() &&
23106 EI.UserTE->getOpcode() != Instruction::PHI &&
23107 EI.UserTE->getOpcode() != Instruction::InsertElement &&
23109 auto *I = dyn_cast<Instruction>(V);
23112 for (User *U : I->users()) {
23113 auto *UI = cast<Instruction>(U);
23114 if (isa<BinaryOperator>(UI))
23119 return std::nullopt;
23120 if (S.areInstructionsWithCopyableElements() && EI && EI.UserTE->hasState() &&
23121 EI.UserTE->hasCopyableElements() &&
23122 EI.UserTE->getMainOp()->getParent() == S.getMainOp()->getParent() &&
23124 if (S.isCopyableElement(V))
23128 return std::nullopt;
23131 if (S.areInstructionsWithCopyableElements() &&
any_of(VL, [&](
Value *V) {
23144 return std::nullopt;
// Parent-PHI special case: duplicated (value, operand) pairs among the
// parent's scalars make the bundle non-schedulable.
23145 if (S.areInstructionsWithCopyableElements() && EI) {
23146 bool IsNonSchedulableWithParentPhiNode =
23147 EI.UserTE->doesNotNeedToSchedule() && EI.UserTE->UserTreeIndex &&
23148 EI.UserTE->UserTreeIndex.UserTE->hasState() &&
23149 EI.UserTE->UserTreeIndex.UserTE->State != TreeEntry::SplitVectorize &&
23150 EI.UserTE->UserTreeIndex.UserTE->getOpcode() == Instruction::PHI;
23151 if (IsNonSchedulableWithParentPhiNode) {
23152 SmallSet<std::pair<Value *, Value *>, 4> Values;
23153 for (
const auto [Idx, V] :
23154 enumerate(EI.UserTE->UserTreeIndex.UserTE->Scalars)) {
23155 Value *
Op = EI.UserTE->UserTreeIndex.UserTE->getOperand(
23156 EI.UserTE->UserTreeIndex.EdgeIdx)[Idx];
23160 if (!Values.
insert(std::make_pair(V,
Op)).second)
23161 return std::nullopt;
23167 if (EI.UserTE->hasCopyableElements() &&
23168 EI.UserTE->isCopyableElement(V))
23170 ArrayRef<TreeEntry *> Entries = SLP->getTreeEntries(V);
23171 return any_of(Entries, [](const TreeEntry *TE) {
23172 return TE->doesNotNeedToSchedule() && TE->UserTreeIndex &&
23173 TE->UserTreeIndex.UserTE->hasState() &&
23174 TE->UserTreeIndex.UserTE->State !=
23175 TreeEntry::SplitVectorize &&
23176 TE->UserTreeIndex.UserTE->getOpcode() == Instruction::PHI;
23179 return std::nullopt;
// Nothing to schedule: still verify that operands fully replaced by
// copyable data do not carry stale dependencies.
23182 if (DoesNotRequireScheduling) {
23186 for (
Value *V : VL) {
23188 if (!
I || (HasCopyables && S.isCopyableElement(V)))
23190 SmallDenseMap<std::pair<Instruction *, Value *>,
unsigned> UserOpToNumOps;
23191 for (
const Use &U :
I->operands()) {
23194 .first->getSecond();
23197 Op && areAllOperandsReplacedByCopyableData(
I,
Op, *SLP,
NumOps)) {
23198 if (ScheduleData *OpSD = getScheduleData(
Op);
23199 OpSD && OpSD->hasValidDependencies())
23201 return std::nullopt;
23211 if (S.areInstructionsWithCopyableElements() && EI.UserTE &&
23212 EI.UserTE->State == TreeEntry::SplitVectorize &&
23214 return !S.isNonSchedulable(V) && S.isCopyableElement(V);
23216 return std::nullopt;
23220 LLVM_DEBUG(
dbgs() <<
"SLP: bundle: " << *S.getMainOp() <<
"\n");
// Local helper: computes dependencies for the freshly built bundle and,
// on re-schedule, drains the ready list until the bundle becomes ready.
23222 auto TryScheduleBundleImpl = [=](
bool ReSchedule, ScheduleBundle &Bundle) {
23225 SmallVector<ScheduleData *> ControlDependentMembers;
// Invalidate cached dependencies that the new bundle makes stale.
23226 auto CheckIfNeedToClearDeps = [&](ScheduleBundle &Bundle) {
23227 SmallDenseMap<std::pair<Instruction *, Value *>,
unsigned> UserOpToNumOps;
23228 for (ScheduleEntity *SE : Bundle.getBundle()) {
23230 if (ScheduleData *BundleMember = getScheduleData(SD->getInst());
23231 BundleMember && BundleMember->hasValidDependencies()) {
23232 BundleMember->clearDirectDependencies();
23233 if (RegionHasStackSave ||
23235 BundleMember->getInst()))
23236 ControlDependentMembers.
push_back(BundleMember);
23241 if (SD->hasValidDependencies() &&
23242 (!S.areInstructionsWithCopyableElements() ||
23243 !S.isCopyableElement(SD->getInst())) &&
23244 !getScheduleCopyableData(SD->getInst()).empty() && EI.UserTE &&
23245 EI.UserTE->hasState() &&
23246 (!EI.UserTE->hasCopyableElements() ||
23247 !EI.UserTE->isCopyableElement(SD->getInst())))
23248 SD->clearDirectDependencies();
23249 for (
const Use &U : SD->getInst()->operands()) {
23252 .
try_emplace(std::make_pair(SD->getInst(),
U.get()), 0)
23253 .first->getSecond();
23256 Op && areAllOperandsReplacedByCopyableData(SD->getInst(),
Op,
23258 if (ScheduleData *OpSD = getScheduleData(
Op);
23259 OpSD && OpSD->hasValidDependencies()) {
23260 OpSD->clearDirectDependencies();
23261 if (RegionHasStackSave ||
23263 ControlDependentMembers.
push_back(OpSD);
// The scheduling region grew: cached dependencies are invalid, clear
// them for every ScheduleData / copyable data inside the region.
23274 if (OldScheduleEnd && ScheduleEnd != OldScheduleEnd) {
23275 for_each(ScheduleDataMap, [&](
auto &
P) {
23276 if (BB !=
P.first->getParent())
23278 ScheduleData *SD =
P.second;
23279 if (isInSchedulingRegion(*SD))
23280 SD->clearDependencies();
23282 for_each(ScheduleCopyableDataMapByInst, [&](
auto &
P) {
23283 for_each(
P.second, [&](ScheduleCopyableData *SD) {
23284 if (isInSchedulingRegion(*SD))
23285 SD->clearDependencies();
23292 if (Bundle && !Bundle.getBundle().empty()) {
23293 if (S.areInstructionsWithCopyableElements() ||
23294 !ScheduleCopyableDataMap.empty())
23295 CheckIfNeedToClearDeps(Bundle);
23296 LLVM_DEBUG(
dbgs() <<
"SLP: try schedule bundle " << Bundle <<
" in block "
23298 calculateDependencies(Bundle, !ReSchedule, SLP,
23299 ControlDependentMembers);
23300 }
else if (!ControlDependentMembers.
empty()) {
23301 ScheduleBundle
Invalid = ScheduleBundle::invalid();
23302 calculateDependencies(
Invalid, !ReSchedule, SLP,
23303 ControlDependentMembers);
23308 initialFillReadyList(ReadyInsts);
// Drain ready entities until this bundle is ready (or list empties).
23315 while (((!Bundle && ReSchedule) || (Bundle && !Bundle.isReady())) &&
23316 !ReadyInsts.empty()) {
23317 ScheduleEntity *Picked = ReadyInsts.pop_back_val();
23318 assert(Picked->isReady() &&
"must be ready to schedule");
23319 schedule(*SLP, S, EI, Picked, ReadyInsts);
23320 if (Picked == &Bundle)
// Make sure every schedulable member lies inside the scheduling region;
// failure here aborts the whole bundle.
23327 for (
Value *V : VL) {
23328 if (S.isNonSchedulable(V))
23330 if (!extendSchedulingRegion(V, S)) {
23337 ScheduleBundle
Invalid = ScheduleBundle::invalid();
23338 TryScheduleBundleImpl(
false,
Invalid);
23339 return std::nullopt;
// If any member was already scheduled we must reset and re-schedule.
23343 bool ReSchedule =
false;
23344 for (
Value *V : VL) {
23345 if (S.isNonSchedulable(V))
23349 if (!CopyableData.
empty()) {
23350 for (ScheduleCopyableData *SD : CopyableData)
23351 ReadyInsts.remove(SD);
23353 ScheduleData *BundleMember = getScheduleData(V);
23354 assert((BundleMember || S.isCopyableElement(V)) &&
23355 "no ScheduleData for bundle member (maybe not in same basic block)");
23361 ReadyInsts.remove(BundleMember);
23363 !Bundles.
empty()) {
23364 for (ScheduleBundle *
B : Bundles)
23365 ReadyInsts.remove(
B);
23368 if (!S.isCopyableElement(V) && !BundleMember->isScheduled())
23375 LLVM_DEBUG(
dbgs() <<
"SLP: reset schedule because " << *BundleMember
23376 <<
" was already scheduled\n");
23380 ScheduleBundle &Bundle = buildBundle(VL, S, EI);
23381 TryScheduleBundleImpl(ReSchedule, Bundle);
// Scheduling failed: undo the bundle registration and all per-member
// copyable-data bookkeeping, then recompute affected dependencies.
23382 if (!Bundle.isReady()) {
23383 for (ScheduleEntity *BD : Bundle.getBundle()) {
23387 if (BD->isReady()) {
23389 if (Bundles.
empty()) {
23390 ReadyInsts.insert(BD);
23393 for (ScheduleBundle *
B : Bundles)
23395 ReadyInsts.insert(
B);
23398 ScheduledBundlesList.pop_back();
23399 SmallVector<ScheduleData *> ControlDependentMembers;
23400 for (
Value *V : VL) {
23401 if (S.isNonSchedulable(V))
23404 if (S.isCopyableElement(
I)) {
23407 auto KV = std::make_pair(EI,
I);
23408 assert(ScheduleCopyableDataMap.contains(KV) &&
23409 "no ScheduleCopyableData for copyable element");
23410 ScheduleCopyableData *SD =
23411 ScheduleCopyableDataMapByInst.find(
I)->getSecond().pop_back_val();
23412 ScheduleCopyableDataMapByUsers[
I].remove(SD);
23415 const auto *It =
find(
Op,
I);
23416 assert(It !=
Op.end() &&
"Lane not set");
23417 SmallPtrSet<Instruction *, 4> Visited;
23419 int Lane = std::distance(
Op.begin(), It);
23420 assert(Lane >= 0 &&
"Lane not set");
23422 !EI.UserTE->ReorderIndices.empty())
23423 Lane = EI.UserTE->ReorderIndices[Lane];
23424 assert(Lane <
static_cast<int>(EI.UserTE->Scalars.size()) &&
23425 "Couldn't find extract lane");
23427 if (!Visited.
insert(In).second) {
23431 ScheduleCopyableDataMapByInstUser
23432 [std::make_pair(std::make_pair(In, EI.EdgeIdx),
I)]
23435 }
while (It !=
Op.end());
23437 if (ScheduleCopyableData *UserCD = getScheduleCopyableData(UserEI,
I))
23438 ScheduleCopyableDataMapByUsers[
I].insert(UserCD);
23440 if (ScheduleCopyableDataMapByUsers[
I].
empty())
23441 ScheduleCopyableDataMapByUsers.erase(
I);
23442 ScheduleCopyableDataMap.erase(KV);
23444 if (ScheduleData *OpSD = getScheduleData(
I);
23445 OpSD && OpSD->hasValidDependencies()) {
23446 OpSD->clearDirectDependencies();
23447 if (RegionHasStackSave ||
23449 ControlDependentMembers.
push_back(OpSD);
23453 ScheduledBundles.find(
I)->getSecond().pop_back();
23455 if (!ControlDependentMembers.
empty()) {
23456 ScheduleBundle
Invalid = ScheduleBundle::invalid();
23457 calculateDependencies(
Invalid,
false, SLP,
23458 ControlDependentMembers);
23460 return std::nullopt;
// Bump-allocates one ScheduleData from the current chunk, growing
// ScheduleDataChunks by a fresh ChunkSize-element array when the current one
// is exhausted. (Lossy extraction: the ChunkPos reset after growth is not
// visible here — presumably on an elided line; TODO confirm.)
23465BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
23467 if (ChunkPos >= ChunkSize) {
23468 ScheduleDataChunks.push_back(std::make_unique<ScheduleData[]>(ChunkSize));
23471 return &(ScheduleDataChunks.back()[ChunkPos++]);
// Extends the [ScheduleStart, ScheduleEnd) scheduling region of this block so
// that it covers instruction V. Initializes the region on first use, then
// searches simultaneously up and down from the current region boundaries
// (skipping assume-like intrinsics) and grows whichever end reaches V.
// Returns false when the region would exceed ScheduleRegionSizeLimit.
// NOTE(review): lossy extraction — several original lines between the
// numbered fragments are missing; code text kept byte-identical.
23474bool BoUpSLP::BlockScheduling::extendSchedulingRegion(
23475 Value *V,
const InstructionsState &S) {
23477 assert(
I &&
"bundle member must be an instruction");
// Already covered: nothing to do.
23478 if (getScheduleData(
I))
// First instruction in this block: seed a one-instruction region.
23480 if (!ScheduleStart) {
23482 initScheduleData(
I,
I->getNextNode(),
nullptr,
nullptr);
23484 ScheduleEnd =
I->getNextNode();
23485 assert(ScheduleEnd &&
"tried to vectorize a terminator?");
23486 LLVM_DEBUG(
dbgs() <<
"SLP: initialize schedule region to " << *
I <<
"\n");
23494 ++ScheduleStart->getIterator().getReverse();
// Assume-like intrinsics are scheduling-transparent and are skipped while
// probing in both directions.
23500 return II->isAssumeLikeIntrinsic();
23503 UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
23504 DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
// Walk both directions at once until V is found or a block edge is hit.
23505 while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter !=
I &&
23507 if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
23508 LLVM_DEBUG(
dbgs() <<
"SLP: exceeded schedule region size limit\n");
23515 UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
23516 DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
// V lies above the current region: extend the region start down to V.
23518 if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter ==
I)) {
23519 assert(
I->getParent() == ScheduleStart->getParent() &&
23520 "Instruction is in wrong basic block.");
23521 initScheduleData(
I, ScheduleStart,
nullptr, FirstLoadStoreInRegion);
// Otherwise V lies below: extend the region end past V.
23527 assert((UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter ==
I)) &&
23528 "Expected to reach top of the basic block or instruction down the "
23530 assert(
I->getParent() == ScheduleEnd->getParent() &&
23531 "Instruction is in wrong basic block.");
23532 initScheduleData(ScheduleEnd,
I->getNextNode(), LastLoadStoreInRegion,
23534 ScheduleEnd =
I->getNextNode();
23535 assert(ScheduleEnd &&
"tried to vectorize a terminator?");
23536 LLVM_DEBUG(
dbgs() <<
"SLP: extend schedule region end to " << *
I <<
"\n");
// Creates/initializes ScheduleData for every instruction in [FromI, ToI) and
// threads memory-accessing instructions into the region's load/store chain
// (FirstLoadStoreInRegion .. LastLoadStoreInRegion), linking the chain tail
// to NextLoadStore. Also records whether the region contains a stacksave.
// NOTE(review): lossy extraction — code text kept byte-identical.
23540void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
23542 ScheduleData *PrevLoadStore,
23543 ScheduleData *NextLoadStore) {
23544 ScheduleData *CurrentLoadStore = PrevLoadStore;
// Reuse a previously allocated ScheduleData for I, or allocate a new one.
23549 ScheduleData *SD = ScheduleDataMap.lookup(
I);
23551 SD = allocateScheduleDataChunks();
23552 ScheduleDataMap[
I] = SD;
23554 assert(!isInSchedulingRegion(*SD) &&
23555 "new ScheduleData already in scheduling region");
23556 SD->init(SchedulingRegionID,
I);
// Simple invariant loads never alias and can be ignored for the
// memory-dependency chain.
23563 return LI && LI->isSimple() &&
23564 LI->getMetadata(LLVMContext::MD_invariant_load);
// Link memory-touching instructions (excluding ignorable loads and
// pseudoprobe-like intrinsics) into the load/store chain.
23567 if (
I->mayReadOrWriteMemory() &&
23569 !CanIgnoreLoad(
I) &&
23573 Intrinsic::pseudoprobe))) {
23575 if (CurrentLoadStore) {
23576 CurrentLoadStore->setNextLoadStore(SD);
23578 FirstLoadStoreInRegion = SD;
23580 CurrentLoadStore = SD;
// A stacksave/stackrestore in the region forces extra control deps later.
23585 RegionHasStackSave =
true;
// Splice the new sub-chain into the existing region chain.
23587 if (NextLoadStore) {
23588 if (CurrentLoadStore)
23589 CurrentLoadStore->setNextLoadStore(NextLoadStore);
23591 LastLoadStoreInRegion = CurrentLoadStore;
// Computes (or refreshes) the dependency counts for Bundle and everything it
// transitively affects, using a worklist seeded with the bundle's first
// member. Each node gets def-use dependencies, control dependencies (around
// stacksave and other region-wide hazards), and memory dependencies derived
// from the region's load/store chain with alias queries. When
// InsertInReadyList is set, entities that become ready are pushed into
// ReadyInsts.
// NOTE(review): lossy extraction — numbered fragments skip original lines;
// code text kept byte-identical and comments limited to visible behavior.
23595void BoUpSLP::BlockScheduling::calculateDependencies(
23596 ScheduleBundle &Bundle,
bool InsertInReadyList,
BoUpSLP *SLP,
23598 SmallVector<ScheduleEntity *> WorkList;
// Per-entity dependency initialization; handles both copyable data and
// plain ScheduleData members.
23599 auto ProcessNode = [&](ScheduleEntity *SE) {
23601 if (CD->hasValidDependencies())
23604 CD->initDependencies();
23605 CD->resetUnscheduledDeps();
// For copyable data, dependencies come from the user tree-entry edge.
23606 const EdgeInfo &EI = CD->getEdgeInfo();
23609 const auto *It =
find(
Op, CD->getInst());
23610 assert(It !=
Op.end() &&
"Lane not set");
23611 SmallPtrSet<Instruction *, 4> Visited;
23613 int Lane = std::distance(
Op.begin(), It);
23614 assert(Lane >= 0 &&
"Lane not set");
23616 !EI.UserTE->ReorderIndices.empty())
23617 Lane = EI.UserTE->ReorderIndices[Lane];
23618 assert(Lane <
static_cast<int>(EI.UserTE->Scalars.size()) &&
23619 "Couldn't find extract lane");
23621 if (EI.UserTE->isCopyableElement(In)) {
23624 if (ScheduleCopyableData *UseSD =
23625 getScheduleCopyableData(EI.UserTE->UserTreeIndex, In)) {
23626 CD->incDependencies();
23627 if (!UseSD->isScheduled())
23628 CD->incrementUnscheduledDeps(1);
23629 if (!UseSD->hasValidDependencies() ||
23630 (InsertInReadyList && UseSD->isReady()))
23633 }
else if (Visited.
insert(In).second) {
23634 if (ScheduleData *UseSD = getScheduleData(In)) {
23635 CD->incDependencies();
23636 if (!UseSD->isScheduled())
23637 CD->incrementUnscheduledDeps(1);
23638 if (!UseSD->hasValidDependencies() ||
23639 (InsertInReadyList && UseSD->isReady()))
23644 }
while (It !=
Op.end());
// A dependency-free copyable node may still need a pseudo-dependency
// (see condition below) to keep it from being scheduled too early.
23645 if (CD->isReady() && CD->getDependencies() == 0 &&
23646 (EI.UserTE->hasState() &&
23647 (EI.UserTE->getMainOp()->getParent() !=
23648 CD->getInst()->getParent() ||
23650 (EI.UserTE->getMainOp()->hasNUsesOrMore(
UsesLimit) ||
23651 any_of(EI.UserTE->getMainOp()->users(), [&](User *U) {
23652 auto *IU = dyn_cast<Instruction>(U);
23655 return IU->getParent() == EI.UserTE->getMainOp()->getParent();
23661 CD->incDependencies();
23662 CD->incrementUnscheduledDeps(1);
// Plain ScheduleData member: (re)compute def-use dependencies.
23668 if (BundleMember->hasValidDependencies())
23670 LLVM_DEBUG(
dbgs() <<
"SLP: update deps of " << *BundleMember <<
"\n");
23671 BundleMember->initDependencies();
23672 BundleMember->resetUnscheduledDeps();
23674 SmallDenseMap<Value *, unsigned> UserToNumOps;
23675 for (User *U : BundleMember->getInst()->users()) {
23678 if (ScheduleData *UseSD = getScheduleData(U)) {
23682 if (areAllOperandsReplacedByCopyableData(
23685 BundleMember->incDependencies();
23686 if (!UseSD->isScheduled())
23687 BundleMember->incrementUnscheduledDeps(1);
23688 if (!UseSD->hasValidDependencies() ||
23689 (InsertInReadyList && UseSD->isReady()))
// Copyable users of this instruction also count as dependencies.
23693 for (ScheduleCopyableData *UseSD :
23694 getScheduleCopyableDataUsers(BundleMember->getInst())) {
23695 BundleMember->incDependencies();
23696 if (!UseSD->isScheduled())
23697 BundleMember->incrementUnscheduledDeps(1);
23698 if (!UseSD->hasValidDependencies() ||
23699 (InsertInReadyList && UseSD->isReady()))
// Helper: register a control dependency from I onto BundleMember.
23703 SmallPtrSet<const Instruction *, 4> Visited;
23706 if (!Visited.
insert(
I).second)
23708 auto *DepDest = getScheduleData(
I);
23709 assert(DepDest &&
"must be in schedule window");
23710 DepDest->addControlDependency(BundleMember);
23711 BundleMember->incDependencies();
23712 if (!DepDest->isScheduled())
23713 BundleMember->incrementUnscheduledDeps(1);
23714 if (!DepDest->hasValidDependencies() ||
23715 (InsertInReadyList && DepDest->isReady()))
// Walk the rest of the region making later instructions control-
// dependent where required (three similar walks below for different
// hazard kinds, incl. stacksave handling).
23723 for (Instruction *
I = BundleMember->getInst()->getNextNode();
23724 I != ScheduleEnd;
I =
I->getNextNode()) {
23729 MakeControlDependent(
I);
23737 if (RegionHasStackSave) {
23742 match(BundleMember->getInst(),
23744 for (Instruction *
I = BundleMember->getInst()->getNextNode();
23745 I != ScheduleEnd;
I =
I->getNextNode()) {
23756 MakeControlDependent(
I);
23766 BundleMember->getInst()->mayReadOrWriteMemory()) {
23767 for (Instruction *
I = BundleMember->getInst()->getNextNode();
23768 I != ScheduleEnd;
I =
I->getNextNode()) {
23774 MakeControlDependent(
I);
// Memory dependencies: scan the load/store chain after this member and
// add a dependency on every possibly-aliasing access.
23781 ScheduleData *NextLoadStore = BundleMember->getNextLoadStore();
23782 if (!NextLoadStore)
23786 "NextLoadStore list for non memory effecting bundle?");
23789 unsigned NumAliased = 0;
23790 unsigned DistToSrc = 1;
23791 bool IsNonSimpleSrc = !SrcLoc.
Ptr || !
isSimple(SrcInst);
23793 for (ScheduleData *DepDest = NextLoadStore; DepDest;
23794 DepDest = DepDest->getNextLoadStore()) {
23795 assert(isInSchedulingRegion(*DepDest) &&
"Expected to be in region");
23805 ((SrcMayWrite || DepDest->getInst()->mayWriteToMemory()) &&
23807 SLP->isAliased(SrcLoc, SrcInst, DepDest->getInst())))) {
23814 DepDest->addMemoryDependency(BundleMember);
23815 BundleMember->incDependencies();
23816 if (!DepDest->isScheduled())
23817 BundleMember->incrementUnscheduledDeps(1);
23818 if (!DepDest->hasValidDependencies() ||
23819 (InsertInReadyList && DepDest->isReady()))
// Worklist loop: process this bundle and every entity discovered above.
23843 "expected at least one instruction to schedule");
23845 WorkList.
push_back(Bundle.getBundle().front());
23847 SmallPtrSet<ScheduleBundle *, 16> Visited;
23848 while (!WorkList.
empty()) {
23853 CopyableBundle.
push_back(&CD->getBundle());
23854 Bundles = CopyableBundle;
23856 Bundles = getScheduleBundles(SD->getInst());
23858 if (Bundles.
empty()) {
23859 if (!SD->hasValidDependencies())
23861 if (InsertInReadyList && SD->isReady()) {
23862 ReadyInsts.insert(SD);
23863 LLVM_DEBUG(
dbgs() <<
"SLP: gets ready on update: " << *SD <<
"\n");
23867 for (ScheduleBundle *Bundle : Bundles) {
23868 if (Bundle->hasValidDependencies() || !Visited.
insert(Bundle).second)
23870 assert(isInSchedulingRegion(*Bundle) &&
23871 "ScheduleData not in scheduling region");
23872 for_each(Bundle->getBundle(), ProcessNode);
23874 if (InsertInReadyList && SD->isReady()) {
23875 for (ScheduleBundle *Bundle : Bundles) {
23876 assert(isInSchedulingRegion(*Bundle) &&
23877 "ScheduleData not in scheduling region");
23878 if (!Bundle->isReady())
23880 ReadyInsts.insert(Bundle);
// Clears the "scheduled" flag and unscheduled-dependency counters on every
// ScheduleData, ScheduleCopyableData and ScheduleBundle inside this block's
// scheduling region, then empties the ready list so scheduling can restart
// from a clean state.
23888void BoUpSLP::BlockScheduling::resetSchedule() {
23890 "tried to reset schedule on block which has not been scheduled");
// Plain per-instruction ScheduleData (only entries belonging to BB).
23891 for_each(ScheduleDataMap, [&](
auto &
P) {
23892 if (BB !=
P.first->getParent())
23894 ScheduleData *SD =
P.second;
23895 if (isInSchedulingRegion(*SD)) {
23896 SD->setScheduled(
false);
23897 SD->resetUnscheduledDeps();
// Copyable data indexed by instruction.
23900 for_each(ScheduleCopyableDataMapByInst, [&](
auto &
P) {
23901 for_each(
P.second, [&](ScheduleCopyableData *SD) {
23902 if (isInSchedulingRegion(*SD)) {
23903 SD->setScheduled(false);
23904 SD->resetUnscheduledDeps();
// Whole bundles.
23908 for_each(ScheduledBundles, [&](
auto &
P) {
23909 for_each(
P.second, [&](ScheduleBundle *Bundle) {
23910 if (isInSchedulingRegion(*Bundle))
23911 Bundle->setScheduled(false);
// Copyable data indexed by (edge, instruction) key.
23915 for (
auto &
P : ScheduleCopyableDataMap) {
23916 if (isInSchedulingRegion(*
P.second)) {
23917 P.second->setScheduled(
false);
23918 P.second->resetUnscheduledDeps();
23921 ReadyInsts.clear();
23924void BoUpSLP::scheduleBlock(
const BoUpSLP &R, BlockScheduling *BS) {
23925 if (!BS->ScheduleStart)
23928 LLVM_DEBUG(
dbgs() <<
"SLP: schedule block " << BS->BB->getName() <<
"\n");
23935 BS->resetSchedule();
23942 struct ScheduleDataCompare {
23943 bool operator()(
const ScheduleEntity *SD1,
23944 const ScheduleEntity *SD2)
const {
23945 return SD2->getSchedulingPriority() < SD1->getSchedulingPriority();
23948 std::set<ScheduleEntity *, ScheduleDataCompare> ReadyInsts;
23953 for (
auto *
I = BS->ScheduleStart;
I != BS->ScheduleEnd;
23954 I =
I->getNextNode()) {
23956 if (!Bundles.
empty()) {
23957 for (ScheduleBundle *Bundle : Bundles) {
23958 Bundle->setSchedulingPriority(Idx++);
23959 if (!Bundle->hasValidDependencies())
23960 BS->calculateDependencies(*Bundle,
false,
this);
23963 for (ScheduleCopyableData *SD :
reverse(SDs)) {
23964 ScheduleBundle &Bundle = SD->getBundle();
23965 Bundle.setSchedulingPriority(Idx++);
23966 if (!Bundle.hasValidDependencies())
23967 BS->calculateDependencies(Bundle,
false,
this);
23972 BS->getScheduleCopyableDataUsers(
I);
23973 if (ScheduleData *SD = BS->getScheduleData(
I)) {
23976 SDTEs.
front()->doesNotNeedToSchedule() ||
23978 "scheduler and vectorizer bundle mismatch");
23979 SD->setSchedulingPriority(Idx++);
23980 if (!CopyableData.
empty() ||
23981 any_of(
R.ValueToGatherNodes.lookup(
I), [&](
const TreeEntry *TE) {
23982 assert(TE->isGather() &&
"expected gather node");
23983 return TE->hasState() && TE->hasCopyableElements() &&
23984 TE->isCopyableElement(I);
23986 SD->clearDirectDependencies();
23991 ScheduleBundle Bundle;
23993 BS->calculateDependencies(Bundle,
false,
this);
23996 for (ScheduleCopyableData *SD :
reverse(CopyableData)) {
23997 ScheduleBundle &Bundle = SD->getBundle();
23998 Bundle.setSchedulingPriority(Idx++);
23999 if (!Bundle.hasValidDependencies())
24000 BS->calculateDependencies(Bundle,
false,
this);
24003 BS->initialFillReadyList(ReadyInsts);
24005 Instruction *LastScheduledInst = BS->ScheduleEnd;
24008 SmallPtrSet<Instruction *, 16> Scheduled;
24009 while (!ReadyInsts.empty()) {
24010 auto *Picked = *ReadyInsts.begin();
24011 ReadyInsts.erase(ReadyInsts.begin());
24016 for (
const ScheduleEntity *BundleMember : Bundle->getBundle()) {
24017 Instruction *PickedInst = BundleMember->getInst();
24019 bool IsCopyable = Bundle->getTreeEntry()->isCopyableElement(PickedInst);
24020 if ((IsCopyable && BS->getScheduleData(PickedInst)) ||
24021 (!IsCopyable && !Scheduled.
insert(PickedInst).second))
24023 if (PickedInst->
getNextNode() != LastScheduledInst)
24025 LastScheduledInst = PickedInst;
24027 EntryToLastInstruction.try_emplace(Bundle->getTreeEntry(),
24028 LastScheduledInst);
24032 if (PickedInst->
getNextNode() != LastScheduledInst)
24034 LastScheduledInst = PickedInst;
24036 auto Invalid = InstructionsState::invalid();
24041#ifdef EXPENSIVE_CHECKS
24045#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
24047 for (
auto *
I = BS->ScheduleStart;
I != BS->ScheduleEnd;
24048 I =
I->getNextNode()) {
24051 [](
const ScheduleBundle *Bundle) {
24052 return Bundle->isScheduled();
24054 "must be scheduled at this point");
24059 BS->ScheduleStart =
nullptr;
24067 return DL->getTypeSizeInBits(Store->getValueOperand()->getType());
24072 auto E = InstrElementSize.find(V);
24073 if (E != InstrElementSize.end())
24090 Value *FirstNonBool =
nullptr;
24091 while (!Worklist.
empty()) {
24096 auto *Ty =
I->getType();
24099 if (Ty != Builder.getInt1Ty() && !FirstNonBool)
24107 Width = std::max<unsigned>(Width, DL->getTypeSizeInBits(Ty));
24115 for (
Use &U :
I->operands()) {
24117 if (Visited.
insert(J).second &&
24123 FirstNonBool = U.get();
24134 if (V->getType() == Builder.getInt1Ty() && FirstNonBool)
24136 Width = DL->getTypeSizeInBits(V->getType());
24140 InstrElementSize[
I] = Width;
24145bool BoUpSLP::collectValuesToDemote(
24146 const TreeEntry &E,
bool IsProfitableToDemoteRoot,
unsigned &
BitWidth,
24149 bool &IsProfitableToDemote,
bool IsTruncRoot)
const {
24154 unsigned OrigBitWidth =
24155 DL->getTypeSizeInBits(E.Scalars.front()->getType()->getScalarType());
24169 if (isa<PoisonValue>(R))
24171 return !isKnownNonNegative(R, SimplifyQuery(*DL));
24173 auto IsPotentiallyTruncated = [&](
Value *V,
unsigned &
BitWidth) ->
bool {
24176 if (getTreeEntries(V).
size() > 1)
24182 if ((!IsSignedNode || IsSignedVal) && OrigBitWidth >
BitWidth) {
24188 unsigned BitWidth1 = OrigBitWidth - NumSignBits;
24192 APInt
Mask = DB->getDemandedBits(
I);
24193 unsigned BitWidth2 =
24194 std::max<unsigned>(1,
Mask.getBitWidth() -
Mask.countl_zero());
24195 while (!IsSignedNode && BitWidth2 < OrigBitWidth) {
24201 BitWidth1 = std::min(BitWidth1, BitWidth2);
24206 auto FinalAnalysis = [&, TTI = TTI]() {
24207 if (!IsProfitableToDemote)
24210 E.Scalars, std::bind(IsPotentiallyTruncated, _1, std::ref(
BitWidth)));
24212 if (Res &&
E.isGather()) {
24213 if (
E.hasState()) {
24214 if (
const TreeEntry *SameTE =
24215 getSameValuesTreeEntry(
E.getMainOp(),
E.Scalars))
24216 if (collectValuesToDemote(*SameTE, IsProfitableToDemoteRoot,
BitWidth,
24217 ToDemote, Visited, NodesToKeepBWs,
24218 MaxDepthLevel, IsProfitableToDemote,
24226 SmallPtrSet<Value *, 4> UniqueBases;
24227 for (
Value *V :
E.Scalars) {
24231 UniqueBases.
insert(EE->getVectorOperand());
24233 const unsigned VF =
E.Scalars.size();
24234 Type *OrigScalarTy =
E.Scalars.front()->getType();
24235 if (UniqueBases.
size() <= 2 ||
24248 if (
E.isGather() || !Visited.
insert(&
E).second ||
24250 return !isa<Constant>(V) && all_of(V->users(), [&](User *U) {
24251 return isa<InsertElementInst>(U) && !isVectorized(U);
24254 return FinalAnalysis();
24257 return !isa<Constant>(V) && !all_of(V->users(), [=](User *U) {
24258 return isVectorized(U) ||
24259 (E.Idx == 0 && UserIgnoreList &&
24260 UserIgnoreList->contains(U)) ||
24261 (!isa<CmpInst>(U) && U->getType()->isSized() &&
24262 !U->getType()->isScalableTy() &&
24263 DL->getTypeSizeInBits(U->getType()) <= BitWidth);
24264 }) && !IsPotentiallyTruncated(V,
BitWidth);
24269 bool &NeedToExit) {
24270 NeedToExit =
false;
24271 unsigned InitLevel = MaxDepthLevel;
24272 for (
const TreeEntry *
Op : Operands) {
24273 unsigned Level = InitLevel;
24274 if (!collectValuesToDemote(*
Op, IsProfitableToDemoteRoot,
BitWidth,
24275 ToDemote, Visited, NodesToKeepBWs, Level,
24276 IsProfitableToDemote, IsTruncRoot)) {
24277 if (!IsProfitableToDemote)
24280 if (!FinalAnalysis())
24284 MaxDepthLevel = std::max(MaxDepthLevel, Level);
24288 auto AttemptCheckBitwidth =
24289 [&](function_ref<bool(
unsigned,
unsigned)> Checker,
bool &NeedToExit) {
24291 NeedToExit =
false;
24292 unsigned BestFailBitwidth = 0;
24294 if (Checker(
BitWidth, OrigBitWidth))
24296 if (BestFailBitwidth == 0 && FinalAnalysis())
24300 if (BestFailBitwidth == 0) {
24311 auto TryProcessInstruction =
24313 function_ref<bool(
unsigned,
unsigned)> Checker = {}) {
24314 if (Operands.empty()) {
24317 for (
Value *V :
E.Scalars)
24318 (void)IsPotentiallyTruncated(V,
BitWidth);
24323 return !V->hasOneUse() && !IsPotentiallyTruncated(V, BitWidth);
24326 bool NeedToExit =
false;
24327 if (Checker && !AttemptCheckBitwidth(Checker, NeedToExit))
24331 if (!ProcessOperands(Operands, NeedToExit))
24340 return IsProfitableToDemote;
24343 if (
E.State == TreeEntry::SplitVectorize)
24344 return TryProcessInstruction(
24346 {VectorizableTree[
E.CombinedEntriesWithIndices.front().first].get(),
24347 VectorizableTree[
E.CombinedEntriesWithIndices.back().first].get()});
24349 if (
E.isAltShuffle()) {
24351 auto IsDangerousOpcode = [](
unsigned Opcode) {
24353 case Instruction::Shl:
24354 case Instruction::AShr:
24355 case Instruction::LShr:
24356 case Instruction::UDiv:
24357 case Instruction::SDiv:
24358 case Instruction::URem:
24359 case Instruction::SRem:
24366 if (IsDangerousOpcode(
E.getAltOpcode()))
24367 return FinalAnalysis();
24370 switch (
E.getOpcode()) {
24374 case Instruction::Trunc:
24375 if (IsProfitableToDemoteRoot)
24376 IsProfitableToDemote =
true;
24377 return TryProcessInstruction(
BitWidth);
24378 case Instruction::ZExt:
24379 case Instruction::SExt:
24380 if (
E.UserTreeIndex.UserTE &&
E.UserTreeIndex.UserTE->hasState() &&
24381 E.UserTreeIndex.UserTE->getOpcode() == Instruction::BitCast &&
24382 E.UserTreeIndex.UserTE->getMainOp()->getType()->isFPOrFPVectorTy())
24384 IsProfitableToDemote =
true;
24385 return TryProcessInstruction(
BitWidth);
24389 case Instruction::Add:
24390 case Instruction::Sub:
24391 case Instruction::Mul:
24392 case Instruction::And:
24393 case Instruction::Or:
24394 case Instruction::Xor: {
24395 return TryProcessInstruction(
24396 BitWidth, {getOperandEntry(&
E, 0), getOperandEntry(&
E, 1)});
24398 case Instruction::Freeze:
24399 return TryProcessInstruction(
BitWidth, getOperandEntry(&
E, 0));
24400 case Instruction::Shl: {
24403 auto ShlChecker = [&](
unsigned BitWidth, unsigned) {
24405 if (isa<PoisonValue>(V))
24407 if (E.isCopyableElement(V))
24409 auto *I = cast<Instruction>(V);
24410 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
24411 return AmtKnownBits.getMaxValue().ult(BitWidth);
24414 return TryProcessInstruction(
24415 BitWidth, {getOperandEntry(&
E, 0), getOperandEntry(&
E, 1)}, ShlChecker);
24417 case Instruction::LShr: {
24421 auto LShrChecker = [&](
unsigned BitWidth,
unsigned OrigBitWidth) {
24423 if (isa<PoisonValue>(V))
24425 APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
24426 if (E.isCopyableElement(V))
24427 return MaskedValueIsZero(V, ShiftedBits, SimplifyQuery(*DL));
24428 auto *I = cast<Instruction>(V);
24429 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
24430 return AmtKnownBits.getMaxValue().ult(BitWidth) &&
24431 MaskedValueIsZero(I->getOperand(0), ShiftedBits,
24432 SimplifyQuery(*DL));
24435 return TryProcessInstruction(
24436 BitWidth, {getOperandEntry(&
E, 0), getOperandEntry(&
E, 1)},
24439 case Instruction::AShr: {
24443 auto AShrChecker = [&](
unsigned BitWidth,
unsigned OrigBitWidth) {
24445 if (isa<PoisonValue>(V))
24447 auto *I = cast<Instruction>(V);
24448 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
24449 unsigned ShiftedBits = OrigBitWidth - BitWidth;
24450 return AmtKnownBits.getMaxValue().ult(BitWidth) &&
24452 ComputeNumSignBits(I->getOperand(0), *DL, AC, nullptr, DT);
24455 return TryProcessInstruction(
24456 BitWidth, {getOperandEntry(&
E, 0), getOperandEntry(&
E, 1)},
24459 case Instruction::UDiv:
24460 case Instruction::URem: {
24462 auto Checker = [&](
unsigned BitWidth,
unsigned OrigBitWidth) {
24465 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
24466 if (E.hasCopyableElements() && E.isCopyableElement(V))
24467 return MaskedValueIsZero(V, Mask, SimplifyQuery(*DL));
24468 auto *I = cast<Instruction>(V);
24469 return MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)) &&
24470 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
24473 return TryProcessInstruction(
24474 BitWidth, {getOperandEntry(&
E, 0), getOperandEntry(&
E, 1)}, Checker);
24478 case Instruction::Select: {
24479 return TryProcessInstruction(
24480 BitWidth, {getOperandEntry(&
E, 1), getOperandEntry(&
E, 2)});
24484 case Instruction::PHI: {
24485 const unsigned NumOps =
E.getNumOperands();
24488 [&](
unsigned Idx) { return getOperandEntry(&E, Idx); });
24493 case Instruction::Call: {
24498 if (
ID != Intrinsic::abs &&
ID != Intrinsic::smin &&
24499 ID != Intrinsic::smax &&
ID != Intrinsic::umin &&
ID != Intrinsic::umax)
24502 function_ref<bool(
unsigned,
unsigned)> CallChecker;
24503 auto CompChecker = [&](
unsigned BitWidth,
unsigned OrigBitWidth) {
24506 auto *I = cast<Instruction>(V);
24507 if (ID == Intrinsic::umin || ID == Intrinsic::umax) {
24508 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
24509 return MaskedValueIsZero(I->getOperand(0), Mask,
24510 SimplifyQuery(*DL)) &&
24511 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
24513 assert((
ID == Intrinsic::smin ||
ID == Intrinsic::smax) &&
24514 "Expected min/max intrinsics only.");
24515 unsigned SignBits = OrigBitWidth -
BitWidth;
24517 unsigned Op0SignBits =
24519 unsigned Op1SignBits =
24521 return SignBits <= Op0SignBits &&
24522 ((SignBits != Op0SignBits &&
24525 SimplifyQuery(*DL))) &&
24526 SignBits <= Op1SignBits &&
24527 ((SignBits != Op1SignBits &&
24532 auto AbsChecker = [&](
unsigned BitWidth,
unsigned OrigBitWidth) {
24535 auto *I = cast<Instruction>(V);
24536 unsigned SignBits = OrigBitWidth - BitWidth;
24537 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
24538 unsigned Op0SignBits =
24539 ComputeNumSignBits(I->getOperand(0), *DL, AC, nullptr, DT);
24540 return SignBits <= Op0SignBits &&
24541 ((SignBits != Op0SignBits &&
24542 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
24543 MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)));
24546 if (
ID != Intrinsic::abs) {
24547 Operands.push_back(getOperandEntry(&
E, 1));
24548 CallChecker = CompChecker;
24550 CallChecker = AbsChecker;
24553 std::numeric_limits<InstructionCost::CostType>::max();
24555 unsigned VF =
E.Scalars.size();
24557 auto Checker = [&](
unsigned BitWidth, unsigned) {
24565 if (
Cost < BestCost) {
24571 [[maybe_unused]]
bool NeedToExit;
24572 (void)AttemptCheckBitwidth(Checker, NeedToExit);
24574 return TryProcessInstruction(
BitWidth, Operands, CallChecker);
24582 return FinalAnalysis();
24589 bool IsStoreOrInsertElt =
24590 VectorizableTree.front()->hasState() &&
24591 (VectorizableTree.front()->
getOpcode() == Instruction::Store ||
24592 VectorizableTree.front()->getOpcode() == Instruction::InsertElement);
24593 if ((IsStoreOrInsertElt || UserIgnoreList) &&
24594 ExtraBitWidthNodes.size() <= 1 &&
24595 (!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 ||
24596 CastMaxMinBWSizes->first / CastMaxMinBWSizes->second <= 2))
24599 unsigned NodeIdx = 0;
24600 if (IsStoreOrInsertElt && !VectorizableTree.front()->isGather())
24604 assert((VectorizableTree[NodeIdx]->
isGather() || NodeIdx != 0 ||
24605 !VectorizableTree[NodeIdx]->UserTreeIndex) &&
24606 "Unexpected tree is graph.");
24610 bool IsTruncRoot =
false;
24611 bool IsProfitableToDemoteRoot = !IsStoreOrInsertElt;
24614 if (NodeIdx != 0 &&
24615 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
24616 VectorizableTree[NodeIdx]->
getOpcode() == Instruction::Trunc) {
24617 assert(IsStoreOrInsertElt &&
"Expected store/insertelement seeded graph.");
24618 IsTruncRoot =
true;
24620 IsProfitableToDemoteRoot =
true;
24625 if (AnalyzedMinBWVals.contains(VectorizableTree[NodeIdx]->Scalars.front()))
24629 auto ComputeMaxBitWidth =
24630 [&](
const TreeEntry &E,
bool IsTopRoot,
bool IsProfitableToDemoteRoot,
24631 unsigned Limit,
bool IsTruncRoot,
bool IsSignedCmp) ->
unsigned {
24635 if (E.isGather() && IsTruncRoot && E.UserTreeIndex &&
24636 !NodesToKeepBWs.
contains(E.Idx) &&
24637 E.Idx > (IsStoreOrInsertElt ? 2u : 1u) &&
24639 return V->hasOneUse() || isa<Constant>(V) ||
24640 (!V->hasNUsesOrMore(UsesLimit) &&
24641 none_of(V->users(), [&](User *U) {
24642 ArrayRef<TreeEntry *> TEs = getTreeEntries(U);
24643 const TreeEntry *UserTE = E.UserTreeIndex.UserTE;
24644 if (TEs.empty() || is_contained(TEs, UserTE))
24646 if (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
24648 isa<SIToFPInst, UIToFPInst>(U) ||
24649 (UserTE->hasState() &&
24650 (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
24651 SelectInst>(UserTE->getMainOp()) ||
24652 isa<SIToFPInst, UIToFPInst>(UserTE->getMainOp()))))
24654 unsigned UserTESz = DL->getTypeSizeInBits(
24655 UserTE->Scalars.front()->getType());
24656 if (all_of(TEs, [&](const TreeEntry *TE) {
24657 auto It = MinBWs.find(TE);
24658 return It != MinBWs.end() &&
24659 It->second.first > UserTESz;
24662 return DL->getTypeSizeInBits(U->getType()) > UserTESz;
24666 const TreeEntry *UserTE = E.UserTreeIndex.UserTE;
24667 auto It = MinBWs.find(UserTE);
24668 if (It != MinBWs.end())
24669 return It->second.first;
24670 unsigned MaxBitWidth =
24671 DL->getTypeSizeInBits(UserTE->Scalars.front()->getType());
24672 MaxBitWidth =
bit_ceil(MaxBitWidth);
24673 if (MaxBitWidth < 8 && MaxBitWidth > 1)
24675 return MaxBitWidth;
24681 unsigned VF = E.getVectorFactor();
24682 Type *ScalarTy = E.Scalars.front()->getType();
24689 [&](
Value *V) { return AnalyzedMinBWVals.contains(V); }))
24698 unsigned MaxBitWidth = 1u;
24706 bool IsKnownPositive = !IsSignedCmp &&
all_of(E.Scalars, [&](
Value *R) {
24707 if (isa<PoisonValue>(R))
24709 KnownBits Known = computeKnownBits(R, *DL);
24710 return Known.isNonNegative();
24713 if (!IsKnownPositive && !IsTopRoot && E.UserTreeIndex &&
24714 E.UserTreeIndex.UserTE->hasState() &&
24715 E.UserTreeIndex.UserTE->getOpcode() == Instruction::UIToFP)
24717 std::min(DL->getTypeSizeInBits(
24718 E.UserTreeIndex.UserTE->Scalars.front()->getType()),
24719 DL->getTypeSizeInBits(ScalarTy));
24723 for (
Value *Root : E.Scalars) {
24729 unsigned BitWidth1 = NumTypeBits - NumSignBits;
24745 if (!IsKnownPositive)
24750 MaxBitWidth = std::max(BitWidth1, MaxBitWidth);
24753 APInt Mask = DB->getDemandedBits(
I);
24754 unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
24756 std::max<unsigned>(std::min(BitWidth1, BitWidth2), MaxBitWidth);
24759 if (MaxBitWidth < 8 && MaxBitWidth > 1)
24764 if (NumParts > 1 &&
24772 unsigned Opcode = E.getOpcode();
24773 bool IsProfitableToDemote = Opcode == Instruction::Trunc ||
24774 Opcode == Instruction::SExt ||
24775 Opcode == Instruction::ZExt || NumParts > 1;
24780 unsigned MaxDepthLevel = IsTruncRoot ? Limit : 1;
24781 bool NeedToDemote = IsProfitableToDemote;
24783 if (!collectValuesToDemote(E, IsProfitableToDemoteRoot, MaxBitWidth,
24784 ToDemote, Visited, NodesToKeepBWs, MaxDepthLevel,
24785 NeedToDemote, IsTruncRoot) ||
24786 (MaxDepthLevel <= Limit &&
24787 !(((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
24788 (!IsTopRoot || !(IsStoreOrInsertElt || UserIgnoreList) ||
24789 DL->getTypeSizeInBits(TreeRootIT) /
24790 DL->getTypeSizeInBits(
24791 E.getMainOp()->getOperand(0)->getType()) >
24795 MaxBitWidth =
bit_ceil(MaxBitWidth);
24797 return MaxBitWidth;
24804 if (UserIgnoreList &&
24808 if (
all_of(*UserIgnoreList,
24813 VectorizableTree.front()->State == TreeEntry::Vectorize &&
24814 VectorizableTree.front()->getOpcode() == Instruction::ZExt &&
24815 cast<CastInst>(VectorizableTree.front()->getMainOp())->getSrcTy() ==
24816 Builder.getInt1Ty()) {
24817 ReductionBitWidth = 1;
24819 for (
Value *V : *UserIgnoreList) {
24823 TypeSize NumTypeBits = DL->getTypeSizeInBits(
V->getType());
24824 unsigned BitWidth1 = NumTypeBits - NumSignBits;
24827 unsigned BitWidth2 = BitWidth1;
24830 BitWidth2 =
Mask.getBitWidth() -
Mask.countl_zero();
24832 ReductionBitWidth =
24833 std::max(std::min(BitWidth1, BitWidth2), ReductionBitWidth);
24835 if (ReductionBitWidth < 8 && ReductionBitWidth > 1)
24836 ReductionBitWidth = 8;
24838 ReductionBitWidth =
bit_ceil(ReductionBitWidth);
24841 bool IsTopRoot = NodeIdx == 0;
24842 while (NodeIdx < VectorizableTree.size() &&
24843 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
24844 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
24845 RootDemotes.push_back(NodeIdx);
24847 IsTruncRoot =
true;
24849 bool IsSignedCmp =
false;
24850 if (UserIgnoreList &&
24854 IsSignedCmp =
true;
24855 while (NodeIdx < VectorizableTree.size()) {
24857 unsigned Limit = 2;
24859 ReductionBitWidth ==
24860 DL->getTypeSizeInBits(
24861 VectorizableTree.front()->Scalars.front()->getType()))
24863 unsigned MaxBitWidth = ComputeMaxBitWidth(
24864 *VectorizableTree[NodeIdx], IsTopRoot, IsProfitableToDemoteRoot, Limit,
24865 IsTruncRoot, IsSignedCmp);
24866 if (ReductionBitWidth != 0 && (IsTopRoot || !RootDemotes.empty())) {
24867 if (MaxBitWidth != 0 && ReductionBitWidth < MaxBitWidth)
24868 ReductionBitWidth =
bit_ceil(MaxBitWidth);
24869 else if (MaxBitWidth == 0)
24870 ReductionBitWidth = 0;
24873 for (
unsigned Idx : RootDemotes) {
24874 if (
all_of(VectorizableTree[Idx]->Scalars, [&](
Value *V) {
24875 uint32_t OrigBitWidth =
24876 DL->getTypeSizeInBits(
V->getType()->getScalarType());
24877 if (OrigBitWidth > MaxBitWidth) {
24885 RootDemotes.clear();
24887 IsProfitableToDemoteRoot =
true;
24889 if (ExtraBitWidthNodes.empty()) {
24890 NodeIdx = VectorizableTree.size();
24892 unsigned NewIdx = 0;
24894 NewIdx = *ExtraBitWidthNodes.begin();
24895 ExtraBitWidthNodes.erase(ExtraBitWidthNodes.begin());
24896 }
while (NewIdx <= NodeIdx && !ExtraBitWidthNodes.empty());
24899 NodeIdx < VectorizableTree.size() &&
24900 VectorizableTree[NodeIdx]->UserTreeIndex &&
24901 VectorizableTree[NodeIdx]->UserTreeIndex.EdgeIdx == 0 &&
24902 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->hasState() &&
24903 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->getOpcode() ==
24904 Instruction::Trunc &&
24905 !VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->isAltShuffle();
24907 NodeIdx < VectorizableTree.size() &&
24908 VectorizableTree[NodeIdx]->UserTreeIndex &&
24909 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->hasState() &&
24910 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->getOpcode() ==
24911 Instruction::ICmp &&
24913 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->Scalars,
24915 auto *IC = dyn_cast<ICmpInst>(V);
24916 return IC && (IC->isSigned() ||
24917 !isKnownNonNegative(IC->getOperand(0),
24918 SimplifyQuery(*DL)) ||
24919 !isKnownNonNegative(IC->getOperand(1),
24920 SimplifyQuery(*DL)));
24926 if (MaxBitWidth == 0 ||
24930 if (UserIgnoreList)
24931 AnalyzedMinBWVals.insert_range(TreeRoot);
24938 for (
unsigned Idx : ToDemote) {
24939 TreeEntry *
TE = VectorizableTree[Idx].get();
24940 if (MinBWs.contains(TE))
24943 if (isa<PoisonValue>(R))
24945 return !isKnownNonNegative(R, SimplifyQuery(*DL));
24947 MinBWs.try_emplace(TE, MaxBitWidth, IsSigned);
24988 DL = &
F.getDataLayout();
24996 if (!
TTI->getNumberOfRegisters(
TTI->getRegisterClassForType(
true))) {
24998 dbgs() <<
"SLP: Didn't find any vector registers for target, abort.\n");
25003 if (
F.hasFnAttribute(Attribute::NoImplicitFloat))
25006 LLVM_DEBUG(
dbgs() <<
"SLP: Analyzing blocks in " <<
F.getName() <<
".\n");
25010 BoUpSLP R(&
F,
SE,
TTI,
TLI,
AA,
LI,
DT,
AC,
DB,
DL, ORE_);
25016 DT->updateDFSNumbers();
25019 for (
auto *BB :
post_order(&
F.getEntryBlock())) {
25024 R.clearReductionData();
25025 collectSeedInstructions(BB);
25028 if (!Stores.empty()) {
25030 <<
" underlying objects.\n");
25031 Changed |= vectorizeStoreChains(R);
25035 Changed |= vectorizeChainsInBlock(BB, R);
25040 if (!GEPs.empty()) {
25042 <<
" underlying objects.\n");
25043 Changed |= vectorizeGEPIndices(BB, R);
25048 R.optimizeGatherSequence();
25056 unsigned Idx,
unsigned MinVF,
25061 const unsigned Sz = R.getVectorElementSize(Chain[0]);
25062 unsigned VF = Chain.
size();
25068 VF < 2 || VF < MinVF) {
25076 LLVM_DEBUG(
dbgs() <<
"SLP: Analyzing " << VF <<
" stores at offset " << Idx
25080 for (
Value *V : Chain)
25083 InstructionsCompatibilityAnalysis
Analysis(*DT, *
DL, *
TTI, *TLI);
25084 InstructionsState S =
25088 bool IsAllowedSize =
25092 if ((!IsAllowedSize && S && S.getOpcode() != Instruction::Load &&
25093 (!S.getMainOp()->isSafeToRemove() ||
25096 return !isa<ExtractElementInst>(V) &&
25097 (V->getNumUses() > Chain.size() ||
25098 any_of(V->users(), [&](User *U) {
25099 return !Stores.contains(U);
25102 (ValOps.
size() > Chain.size() / 2 && !S)) {
25103 Size = (!IsAllowedSize && S) ? 1 : 2;
25107 R.buildTree(Chain);
25109 if (
R.isTreeTinyAndNotFullyVectorizable()) {
25110 if (
R.isGathered(Chain.front()) ||
25112 return std::nullopt;
25113 Size =
R.getCanonicalGraphSize();
25116 if (
R.isProfitableToReorder()) {
25117 R.reorderTopToBottom();
25118 R.reorderBottomToTop();
25120 R.transformNodes();
25121 R.computeMinimumValueSizes();
25124 R.buildExternalUses();
25126 Size =
R.getCanonicalGraphSize();
25127 if (S && S.getOpcode() == Instruction::Load)
25135 using namespace ore;
25137 R.getORE()->emit(OptimizationRemark(
SV_NAME,
"StoresVectorized",
25139 <<
"Stores SLP vectorized with cost " <<
NV(
"Cost",
Cost)
25140 <<
" and with tree size "
25141 <<
NV(
"TreeSize",
R.getTreeSize()));
25154 Sizes.begin(), Sizes.end(),
static_cast<uint64_t>(0),
25155 [&](
uint64_t V,
const std::pair<unsigned, unsigned> &Val) {
25156 unsigned Size = Val.first;
25168 Sizes.begin(), Sizes.end(),
static_cast<uint64_t>(0),
25169 [&](
uint64_t V,
const std::pair<unsigned, unsigned> &Val) {
25170 unsigned P = Val.first;
25173 return V + (P - Mean) * (P - Mean);
25176 return Dev * 96 / (Mean * Mean) == 0;
25184class RelatedStoreInsts {
25187 : AllStores(AllStores) {
25188 reset(BaseInstrIdx);
25191 void reset(
unsigned NewBaseInstr) {
25192 assert(NewBaseInstr < AllStores.size() &&
25193 "Instruction index out of bounds");
25194 BaseInstrIdx = NewBaseInstr;
25196 insertOrLookup(NewBaseInstr, 0);
25203 std::optional<unsigned> insertOrLookup(
unsigned InstrIdx, int64_t PtrDist) {
25204 auto [It,
Inserted] = Instrs.emplace(PtrDist, InstrIdx);
25205 return Inserted ? std::nullopt : std::make_optional(It->second);
25208 using DistToInstMap = std::map<int64_t, unsigned>;
25209 const DistToInstMap &getStores()
const {
return Instrs; }
25213 std::optional<int64_t> getPointerDiff(StoreInst &SI,
const DataLayout &
DL,
25214 ScalarEvolution &SE)
const {
25215 StoreInst &BaseStore = *AllStores[BaseInstrIdx];
25218 SI.getValueOperand()->getType(),
SI.getPointerOperand(),
DL, SE,
25224 void rebase(
unsigned MinSafeIdx,
unsigned NewBaseInstIdx,
25225 int64_t DistFromCurBase) {
25226 DistToInstMap PrevSet = std::move(Instrs);
25227 reset(NewBaseInstIdx);
25232 for (
auto [Dist, InstIdx] : PrevSet) {
25233 if (InstIdx >= MinSafeIdx)
25234 insertOrLookup(InstIdx, Dist - DistFromCurBase);
25240 DistToInstMap::reverse_iterator LastVectorizedStore =
find_if(
25241 reverse(Instrs), [&](
const std::pair<int64_t, unsigned> &DistAndIdx) {
25242 return VectorizedStores.
contains(AllStores[DistAndIdx.second]);
25247 DistToInstMap::iterator VectorizedStoresEnd = LastVectorizedStore.base();
25248 Instrs.erase(Instrs.begin(), VectorizedStoresEnd);
25253 unsigned BaseInstrIdx;
25256 DistToInstMap Instrs;
25264bool SLPVectorizerPass::vectorizeStores(
25266 DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>>
25273 auto TryToVectorize = [&](
const RelatedStoreInsts::DistToInstMap &StoreSeq) {
25274 int64_t PrevDist = -1;
25278 auto &[Dist, InstIdx] =
Data;
25279 if (Operands.
empty() || Dist - PrevDist == 1) {
25282 if (Idx != StoreSeq.size() - 1)
25291 if (Operands.
size() <= 1 ||
25293 .
insert({Operands.front(),
25294 cast<StoreInst>(Operands.front())->getValueOperand(),
25296 cast<StoreInst>(Operands.back())->getValueOperand(),
25301 unsigned MaxVecRegSize =
R.getMaxVecRegSize();
25302 unsigned EltSize =
R.getVectorElementSize(Operands[0]);
25306 std::min(
R.getMaximumVF(EltSize, Instruction::Store), MaxElts);
25308 Type *StoreTy =
Store->getValueOperand()->getType();
25309 Type *ValueTy = StoreTy;
25311 ValueTy = Trunc->getSrcTy();
25320 R.getMinVF(DL->getTypeStoreSizeInBits(StoreScalarTy)), StoreScalarTy,
25323 MinVF = std::max<unsigned>(2, MinVF);
25325 if (MaxVF < MinVF) {
25326 LLVM_DEBUG(
dbgs() <<
"SLP: Vectorization infeasible as MaxVF (" << MaxVF
25328 <<
"MinVF (" << MinVF <<
")\n");
25332 unsigned NonPowerOf2VF = 0;
25337 unsigned CandVF = std::clamp<unsigned>(Operands.
size(), MinVF, MaxVF);
25339 NonPowerOf2VF = CandVF;
25340 assert(NonPowerOf2VF != MaxVF &&
25341 "Non-power-of-2 VF should not be equal to MaxVF");
25348 unsigned MaxRegVF = MaxVF;
25350 MaxVF = std::min<unsigned>(MaxVF,
bit_floor(Operands.
size()));
25351 if (MaxVF < MinVF) {
25352 LLVM_DEBUG(
dbgs() <<
"SLP: Vectorization infeasible as MaxVF (" << MaxVF
25354 <<
"MinVF (" << MinVF <<
")\n");
25358 SmallVector<unsigned> CandidateVFs;
25359 for (
unsigned VF = std::max(MaxVF, NonPowerOf2VF); VF >= MinVF;
25363 unsigned End = Operands.
size();
25364 unsigned Repeat = 0;
25365 constexpr unsigned MaxAttempts = 4;
25371 Operands.
size(), {1, 1});
25374 DenseMap<Value *, std::pair<unsigned, unsigned>> NonSchedulable;
25375 auto IsNotVectorized = [](
const std::pair<unsigned, unsigned> &
P) {
25376 return P.first > 0;
25378 auto IsVectorized = [](
const std::pair<unsigned, unsigned> &
P) {
25379 return P.first == 0;
25381 auto VFIsProfitable = [](
unsigned Size,
25382 const std::pair<unsigned, unsigned> &
P) {
25383 return Size >=
P.first;
25385 auto FirstSizeSame = [](
unsigned Size,
25386 const std::pair<unsigned, unsigned> &
P) {
25387 return Size ==
P.first;
25391 bool RepeatChanged =
false;
25392 bool AnyProfitableGraph =
false;
25393 for (
unsigned VF : CandidateVFs) {
25394 AnyProfitableGraph =
false;
25395 unsigned FirstUnvecStore = std::distance(
25396 RangeSizes.begin(),
find_if(RangeSizes, IsNotVectorized));
25400 while (FirstUnvecStore < End) {
25401 unsigned FirstVecStore = std::distance(
25402 RangeSizes.begin(),
25403 find_if(RangeSizes.drop_front(FirstUnvecStore), IsVectorized));
25404 unsigned MaxSliceEnd = FirstVecStore >= End ? End : FirstVecStore;
25405 for (
unsigned SliceStartIdx = FirstUnvecStore;
25406 SliceStartIdx + VF <= MaxSliceEnd;) {
25416 ->getValueOperand()
25419 ->getValueOperand()
25422 "Expected all operands of same type.");
25423 if (!NonSchedulable.
empty()) {
25424 auto [NonSchedSizeMax, NonSchedSizeMin] =
25426 if (NonSchedSizeMax > 0 && NonSchedSizeMin <= VF) {
25429 SliceStartIdx += NonSchedSizeMax;
25434 std::optional<bool> Res =
25435 vectorizeStoreChain(Slice, R, SliceStartIdx, MinVF, TreeSize);
25441 .first->getSecond()
25447 AnyProfitableGraph = RepeatChanged =
Changed =
true;
25450 for (std::pair<unsigned, unsigned> &
P :
25451 RangeSizes.slice(SliceStartIdx, VF))
25452 P.first =
P.second = 0;
25453 if (SliceStartIdx < FirstUnvecStore + MinVF) {
25454 for (std::pair<unsigned, unsigned> &
P : RangeSizes.slice(
25455 FirstUnvecStore, SliceStartIdx - FirstUnvecStore))
25456 P.first =
P.second = 0;
25457 FirstUnvecStore = SliceStartIdx + VF;
25459 if (SliceStartIdx > MaxSliceEnd - VF - MinVF) {
25460 for (std::pair<unsigned, unsigned> &
P :
25461 RangeSizes.slice(SliceStartIdx + VF,
25462 MaxSliceEnd - (SliceStartIdx + VF)))
25463 P.first =
P.second = 0;
25464 if (MaxSliceEnd == End)
25465 End = SliceStartIdx;
25466 MaxSliceEnd = SliceStartIdx;
25468 SliceStartIdx += VF;
25471 if (VF > 2 && Res &&
25472 !
all_of(RangeSizes.slice(SliceStartIdx, VF),
25473 std::bind(VFIsProfitable, TreeSize, _1))) {
25474 SliceStartIdx += VF;
25479 if (VF > MaxRegVF && TreeSize > 1 &&
25480 all_of(RangeSizes.slice(SliceStartIdx, VF),
25481 std::bind(FirstSizeSame, TreeSize, _1))) {
25482 SliceStartIdx += VF;
25483 while (SliceStartIdx != MaxSliceEnd &&
25484 RangeSizes[SliceStartIdx].first == TreeSize)
25489 for (std::pair<unsigned, unsigned> &
P :
25490 RangeSizes.slice(SliceStartIdx, VF))
25491 P.second = std::max(
P.second, TreeSize);
25493 AnyProfitableGraph =
true;
25495 if (FirstUnvecStore >= End)
25497 if (MaxSliceEnd - FirstUnvecStore < VF &&
25498 MaxSliceEnd - FirstUnvecStore >= MinVF)
25499 AnyProfitableGraph =
true;
25500 FirstUnvecStore = std::distance(
25501 RangeSizes.begin(),
25502 find_if(RangeSizes.drop_front(MaxSliceEnd), IsNotVectorized));
25504 if (!AnyProfitableGraph && VF >= MaxRegVF &&
has_single_bit(VF))
25507 if (VF == MaxRegVF)
25508 for (std::pair<unsigned, unsigned> &
P : RangeSizes)
25510 P.first = std::max(
P.second,
P.first);
25513 if (
all_of(RangeSizes, IsVectorized))
25516 if (Repeat >= MaxAttempts ||
25517 (Repeat > 1 && (RepeatChanged || !AnyProfitableGraph)))
25519 constexpr unsigned StoresLimit = 64;
25520 const unsigned MaxTotalNum = std::min<unsigned>(
25522 static_cast<unsigned>(
25524 std::distance(RangeSizes.begin(),
25525 find_if(RangeSizes, IsNotVectorized)) +
25527 unsigned VF =
bit_ceil(CandidateVFs.front()) * 2;
25528 if (VF > MaxTotalNum || VF >= StoresLimit)
25530 for (std::pair<unsigned, unsigned> &
P : RangeSizes) {
25532 P.first = std::max(
P.second,
P.first);
25536 CandidateVFs.clear();
25539 if (
bit_floor(Limit) == VF && Limit != VF)
25540 CandidateVFs.push_back(Limit);
25541 CandidateVFs.push_back(VF);
25581 auto FillStoresSet = [&](
unsigned Idx, StoreInst *
SI) {
25582 std::optional<int64_t> PtrDist;
25583 auto *RelatedStores =
find_if(
25584 SortedStores, [&PtrDist, SI,
this](
const RelatedStoreInsts &StoreSeq) {
25585 PtrDist = StoreSeq.getPointerDiff(*SI, *DL, *SE);
25586 return PtrDist.has_value();
25590 if (RelatedStores == SortedStores.
end()) {
25598 if (std::optional<unsigned> PrevInst =
25599 RelatedStores->insertOrLookup(Idx, *PtrDist)) {
25600 TryToVectorize(RelatedStores->getStores());
25601 RelatedStores->clearVectorizedStores(VectorizedStores);
25602 RelatedStores->rebase(*PrevInst + 1,
25607 Type *PrevValTy =
nullptr;
25609 if (
R.isDeleted(SI))
25612 PrevValTy =
SI->getValueOperand()->getType();
25614 if (PrevValTy !=
SI->getValueOperand()->getType()) {
25615 for (RelatedStoreInsts &StoreSeq : SortedStores)
25616 TryToVectorize(StoreSeq.getStores());
25617 SortedStores.clear();
25618 PrevValTy =
SI->getValueOperand()->getType();
25620 FillStoresSet(
I, SI);
25624 for (RelatedStoreInsts &StoreSeq : SortedStores)
25625 TryToVectorize(StoreSeq.getStores());
25630void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
25638 for (Instruction &
I : *BB) {
25642 if (!
SI->isSimple())
25653 if (
GEP->getNumIndices() != 1)
25655 Value *Idx =
GEP->idx_begin()->get();
25660 if (
GEP->getType()->isVectorTy())
25672 LLVM_DEBUG(
dbgs() <<
"SLP: Trying to vectorize a list of length = "
25673 << VL.
size() <<
".\n");
25684 for (
Value *V : VL) {
25685 Type *Ty =
V->getType();
25689 R.getORE()->emit([&]() {
25690 std::string TypeStr;
25691 llvm::raw_string_ostream OS(TypeStr);
25693 return OptimizationRemarkMissed(
SV_NAME,
"UnsupportedType", I0)
25694 <<
"Cannot SLP vectorize list: type "
25695 << TypeStr +
" is unsupported by vectorizer";
25702 unsigned Sz =
R.getVectorElementSize(I0);
25703 unsigned MinVF =
R.getMinVF(Sz);
25704 unsigned MaxVF = std::max<unsigned>(
25706 MaxVF = std::min(
R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
25708 R.getORE()->emit([&]() {
25709 return OptimizationRemarkMissed(
SV_NAME,
"SmallVF", I0)
25710 <<
"Cannot SLP vectorize list: vectorization factor "
25711 <<
"less than 2 is not supported";
25717 bool CandidateFound =
false;
25720 unsigned NextInst = 0, MaxInst = VL.size();
25721 for (
unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF;
25727 if (TTI->getNumberOfParts(VecTy) == VF)
25729 for (
unsigned I = NextInst;
I < MaxInst; ++
I) {
25730 unsigned ActualVF = std::min(MaxInst -
I, VF);
25735 if (MaxVFOnly && ActualVF < MaxVF)
25737 if ((VF > MinVF && ActualVF < VF) || (VF == MinVF && ActualVF < 2))
25742 for (
Value *V : VL.drop_front(
I)) {
25746 !Inst || !
R.isDeleted(Inst)) {
25749 if (Idx == ActualVF)
25754 if (Idx != ActualVF)
25757 LLVM_DEBUG(
dbgs() <<
"SLP: Analyzing " << ActualVF <<
" operations "
25761 if (
R.isTreeTinyAndNotFullyVectorizable())
25763 if (
R.isProfitableToReorder()) {
25764 R.reorderTopToBottom();
25767 R.transformNodes();
25768 R.computeMinimumValueSizes();
25770 R.buildExternalUses();
25773 CandidateFound =
true;
25774 MinCost = std::min(MinCost,
Cost);
25777 <<
" for VF=" << ActualVF <<
"\n");
25780 R.getORE()->emit(OptimizationRemark(
SV_NAME,
"VectorizedList",
25782 <<
"SLP vectorized with cost " <<
ore::NV(
"Cost",
Cost)
25783 <<
" and with tree size "
25784 <<
ore::NV(
"TreeSize",
R.getTreeSize()));
25795 if (!
Changed && CandidateFound) {
25796 R.getORE()->emit([&]() {
25797 return OptimizationRemarkMissed(
SV_NAME,
"NotBeneficial", I0)
25798 <<
"List vectorization was possible but not beneficial with cost "
25799 <<
ore::NV(
"Cost", MinCost) <<
" >= "
25803 R.getORE()->emit([&]() {
25804 return OptimizationRemarkMissed(
SV_NAME,
"NotPossible", I0)
25805 <<
"Cannot SLP vectorize list: vectorization was impossible"
25806 <<
" with available vectorization factors";
25841 using ReductionOpsType = SmallVector<Value *, 16>;
25842 using ReductionOpsListType = SmallVector<ReductionOpsType, 2>;
25843 ReductionOpsListType ReductionOps;
25847 SmallDenseMap<Value *, SmallVector<Instruction *>, 16> ReducedValsToOps;
25848 WeakTrackingVH ReductionRoot;
25853 bool IsSupportedHorRdxIdentityOp =
false;
25859 VectorValuesAndScales;
25861 static bool isCmpSelMinMax(Instruction *
I) {
25869 static bool isBoolLogicOp(Instruction *
I) {
25876 ReductionOrdering RK = ReductionOrdering::None;
25877 static ReductionOrdering isVectorizable(
RecurKind Kind, Instruction *
I,
25878 bool TwoElementReduction =
false) {
25879 if (Kind == RecurKind::None)
25880 return ReductionOrdering::None;
25885 return ReductionOrdering::Unordered;
25888 if (TwoElementReduction)
25889 return ReductionOrdering::Unordered;
25891 if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
25895 return I->getFastMathFlags().noNaNs() ? ReductionOrdering::Unordered
25896 : ReductionOrdering::Ordered;
25899 if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum)
25900 return ReductionOrdering::Unordered;
25902 if (
I->isAssociative())
25903 return ReductionOrdering::Unordered;
25905 return ::isCommutative(
I) ? ReductionOrdering::Ordered
25906 : ReductionOrdering::None;
25909 static Value *getRdxOperand(Instruction *
I,
unsigned Index) {
25915 return I->getOperand(2);
25916 return I->getOperand(Index);
25921 Value *
RHS,
const Twine &Name,
bool UseSelect) {
25925 case RecurKind::Or: {
25934 case RecurKind::And: {
25944 case RecurKind::Add:
25945 case RecurKind::Mul:
25946 case RecurKind::Xor:
25947 case RecurKind::FAdd:
25948 case RecurKind::FMul: {
25953 case RecurKind::SMax:
25954 case RecurKind::SMin:
25955 case RecurKind::UMax:
25956 case RecurKind::UMin:
25964 case RecurKind::FMax:
25965 case RecurKind::FMin:
25966 case RecurKind::FMaximum:
25967 case RecurKind::FMinimum:
25968 case RecurKind::FMaximumNum:
25969 case RecurKind::FMinimumNum: {
25982 const ReductionOpsListType &ReductionOps) {
25983 bool UseSelect = ReductionOps.size() == 2 ||
25985 (ReductionOps.size() == 1 &&
25987 assert((!UseSelect || ReductionOps.size() != 2 ||
25989 "Expected cmp + select pairs for reduction");
25990 Value *
Op = createOp(Builder, RdxKind,
LHS,
RHS, Name, UseSelect);
26008 return RecurKind::None;
26010 return RecurKind::Add;
26012 return RecurKind::Mul;
26015 return RecurKind::And;
26018 return RecurKind::Or;
26020 return RecurKind::Xor;
26022 return RecurKind::FAdd;
26024 return RecurKind::FMul;
26027 return RecurKind::FMax;
26029 return RecurKind::FMin;
26032 return RecurKind::FMaximum;
26034 return RecurKind::FMinimum;
26040 return RecurKind::SMax;
26042 return RecurKind::SMin;
26044 return RecurKind::UMax;
26046 return RecurKind::UMin;
26072 return RecurKind::None;
26076 return RecurKind::None;
26079 return RecurKind::None;
26083 return RecurKind::None;
26088 return RecurKind::None;
26091 return RecurKind::SMax;
26094 return RecurKind::SMin;
26097 return RecurKind::UMax;
26100 return RecurKind::UMin;
26103 return RecurKind::None;
26107 static unsigned getFirstOperandIndex(Instruction *
I) {
26108 return isCmpSelMinMax(
I) ? 1 : 0;
26113 static unsigned getNumberOfOperands(Instruction *
I) {
26114 return isCmpSelMinMax(
I) ? 3 : 2;
26119 static bool hasSameParent(Instruction *
I, BasicBlock *BB) {
26120 if (isCmpSelMinMax(
I) || isBoolLogicOp(
I)) {
26123 return Sel->getParent() == BB &&
Cmp &&
Cmp->getParent() == BB;
26125 return I->getParent() == BB;
26129 static bool hasRequiredNumberOfUses(
bool IsCmpSelMinMax, Instruction *
I) {
26130 if (IsCmpSelMinMax) {
26134 return Sel->hasNUses(2) && Sel->getCondition()->hasOneUse();
26135 return I->hasNUses(2);
26143 void initReductionOps(Instruction *
I) {
26144 if (isCmpSelMinMax(
I))
26145 ReductionOps.assign(2, ReductionOpsType());
26147 ReductionOps.assign(1, ReductionOpsType());
26151 void addReductionOps(Instruction *
I) {
26152 if (isCmpSelMinMax(
I)) {
26154 ReductionOps[1].emplace_back(
I);
26156 ReductionOps[0].emplace_back(
I);
26161 int Sz =
Data.size();
26170 void optimizeReducedVals(BoUpSLP &R, DominatorTree &DT,
const DataLayout &
DL,
26171 const TargetTransformInfo &
TTI,
26172 const TargetLibraryInfo &TLI) {
26173 SmallDenseMap<unsigned, unsigned> UsedReductionOpIds;
26174 for (
const auto [Idx, Vals] :
enumerate(ReducedVals)) {
26179 auto ZExtIt = UsedReductionOpIds.
find(Instruction::ZExt);
26180 auto SelectIt = UsedReductionOpIds.
find(Instruction::Select);
26181 if (ZExtIt != UsedReductionOpIds.
end() &&
26182 SelectIt != UsedReductionOpIds.
end()) {
26183 unsigned ZExtIdx = ZExtIt->second;
26184 unsigned SelectIdx = SelectIt->second;
26187 if (ZExt->getSrcTy()->isIntegerTy(1) &&
26188 ZExt->getType() == ReducedVals[SelectIdx].front()->getType()) {
26189 ReducedVals[ZExtIdx].
append(ReducedVals[SelectIdx]);
26190 ReducedVals.
erase(std::next(ReducedVals.
begin(), SelectIdx));
26195 if (ReducedVals.
size() == 2 &&
26196 (ReducedVals.
front().size() == 1 || ReducedVals.
back().size() == 1)) {
26198 ReducedVals.
back().size());
26201 std::next(
Ops.begin(), ReducedVals.
front().size()));
26202 InstructionsCompatibilityAnalysis
Analysis(DT,
DL,
TTI, TLI);
26203 InstructionsState OpS =
Analysis.buildInstructionsState(
26206 if (OpS && OpS.areInstructionsWithCopyableElements() &&
26207 OpS.getOpcode() == Instruction::Shl) {
26209 if (ReducedVals.
back().size() == 1 && ReducedVals.
front().size() != 1)
26214 auto Comparator = [](
Value *V1,
Value *V2) {
26215 ConstantInt *C1, *C2;
26223 ReducedVals.
front().append(ReducedVals.
back());
26232 : ReductionRoot(
I), ReductionLimit(2) {
26233 RdxKind = HorizontalReduction::getRdxKind(
I);
26234 ReductionOps.emplace_back().push_back(
I);
26237 ReducedValsToOps[
V].push_back(
I);
26240 bool matchReductionForOperands() {
26243 assert(ReductionRoot &&
"Reduction root is not set!");
26246 return Ops.size() == 2;
26248 return RK != ReductionOrdering::None;
26252 bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root,
26253 ScalarEvolution &SE, DominatorTree &DT,
26254 const DataLayout &
DL,
26255 const TargetTransformInfo &
TTI,
26256 const TargetLibraryInfo &TLI) {
26257 RdxKind = HorizontalReduction::getRdxKind(Root);
26258 RK = isVectorizable(RdxKind, Root);
26259 if (RK == ReductionOrdering::None)
26271 if (!Sel->getCondition()->hasOneUse())
26272 RK = ReductionOrdering::Ordered;
26274 ReductionRoot = Root;
26279 bool IsCmpSelMinMax = isCmpSelMinMax(Root);
26281 1, std::make_pair(Root, 0));
26287 SmallVectorImpl<Value *> &PossibleReducedVals,
26288 SmallVectorImpl<Instruction *> &ReductionOps,
26291 getNumberOfOperands(TreeN)))) {
26292 Value *EdgeVal = getRdxOperand(TreeN,
I);
26293 ReducedValsToOps[EdgeVal].push_back(TreeN);
26301 IsCmpSelMinMax != isCmpSelMinMax(EdgeInst);
26302 ReductionOrdering CurrentRK = IsReducedVal
26303 ? ReductionOrdering::None
26304 : isVectorizable(RdxKind, EdgeInst);
26305 if (!IsReducedVal && CurrentRK == ReductionOrdering::Unordered &&
26306 RK == ReductionOrdering::Unordered &&
26307 !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst)) {
26308 IsReducedVal =
true;
26309 CurrentRK = ReductionOrdering::None;
26310 if (PossibleReducedVals.size() < ReductionLimit)
26311 PossibleOrderedReductionOps.
emplace_back(EdgeInst, Level);
26313 if (CurrentRK == ReductionOrdering::None ||
26314 (
R.isAnalyzedReductionRoot(EdgeInst) &&
26316 PossibleReducedVals.push_back(EdgeVal);
26319 if (CurrentRK == ReductionOrdering::Ordered)
26320 RK = ReductionOrdering::Ordered;
26321 ReductionOps.push_back(EdgeInst);
26330 size_t, SmallMapVector<size_t, SmallMapVector<Value *, unsigned, 2>, 2>,
26332 PossibleReducedVals;
26333 initReductionOps(Root);
26335 SmallSet<size_t, 2> LoadKeyUsed;
26337 auto GenerateLoadsSubkey = [&](
size_t Key, LoadInst *LI) {
26342 auto LIt = LoadsMap.
find(std::make_pair(
Key, Ptr));
26343 if (LIt != LoadsMap.
end()) {
26344 for (LoadInst *RLI : LIt->second) {
26350 for (LoadInst *RLI : LIt->second) {
26357 if (LIt->second.size() > 2) {
26359 hash_value(LIt->second.back()->getPointerOperand());
26365 .first->second.push_back(LI);
26370 while (!Worklist.empty()) {
26371 auto [TreeN,
Level] = Worklist.pop_back_val();
26374 CheckOperands(TreeN, PossibleRedVals, PossibleReductionOps, Level);
26375 addReductionOps(TreeN);
26376 ReducedValsCandidates.
append(PossibleRedVals.
begin(),
26377 PossibleRedVals.
end());
26378 for (Instruction *
I :
reverse(PossibleReductionOps))
26379 Worklist.emplace_back(
I,
I->getParent() == BB ? 0 : Level + 1);
26383 if (Worklist.empty() && ReducedValsCandidates.
size() < ReductionLimit &&
26384 !PossibleOrderedReductionOps.
empty() &&
26385 RK == ReductionOrdering::Unordered) {
26386 RK = ReductionOrdering::Ordered;
26387 SmallPtrSet<const Instruction *, 4>
Ops;
26388 for (
const auto &
P : PossibleOrderedReductionOps)
26389 Ops.insert(
P.first);
26392 return I &&
Ops.contains(
I);
26394 Worklist.append(PossibleOrderedReductionOps.begin(),
26395 PossibleOrderedReductionOps.end());
26396 PossibleOrderedReductionOps.
clear();
26401 for (
Value *V : ReducedValsCandidates) {
26405 ++PossibleReducedVals[
Key][Idx].
try_emplace(V, 0).first->second;
26407 auto PossibleReducedValsVect = PossibleReducedVals.
takeVector();
26410 for (
auto &PossibleReducedVals : PossibleReducedValsVect) {
26411 auto PossibleRedVals = PossibleReducedVals.second.
takeVector();
26413 for (
auto &Slice : PossibleRedVals) {
26415 auto RedValsVect = Slice.second.takeVector();
26417 for (
const std::pair<Value *, unsigned> &
Data : RedValsVect)
26418 PossibleRedValsVect.
back().append(
Data.second,
Data.first);
26420 stable_sort(PossibleRedValsVect, [](
const auto &P1,
const auto &P2) {
26421 return P1.size() >
P2.size();
26428 }
else if (!isGoodForReduction(
Data)) {
26431 if (!LI || !LastLI ||
26436 ReducedVals.
back().append(
Data.rbegin(),
Data.rend());
26441 optimizeReducedVals(R, DT,
DL,
TTI, TLI);
26445 return P1.size() >
P2.size();
26451 Value *tryToReduce(BoUpSLP &V,
const DataLayout &
DL, TargetTransformInfo *
TTI,
26452 const TargetLibraryInfo &TLI, AssumptionCache *AC,
26453 DominatorTree &DT) {
26454 constexpr unsigned RegMaxNumber = 4;
26455 constexpr unsigned RedValsMaxNumber = 128;
26459 if (
unsigned NumReducedVals = std::accumulate(
26460 ReducedVals.
begin(), ReducedVals.
end(), 0,
26462 if (!isGoodForReduction(Vals))
26464 return Num + Vals.size();
26466 NumReducedVals < ReductionLimit &&
26470 for (ReductionOpsType &RdxOps : ReductionOps)
26471 for (
Value *RdxOp : RdxOps)
26476 IRBuilder<TargetFolder> Builder(ReductionRoot->getContext(),
26482 DenseMap<Value *, WeakTrackingVH> TrackedVals(ReducedVals.
size() *
26483 ReducedVals.
front().size());
26487 auto &&GetCmpForMinMaxReduction = [](
Instruction *RdxRootInst) {
26489 "Expected min/max reduction to have select root instruction");
26492 "Expected min/max reduction to have compare condition");
26496 bool AnyBoolLogicOp =
any_of(ReductionOps.back(), [](
Value *V) {
26497 return isBoolLogicOp(cast<Instruction>(V));
26500 auto GetNewVectorizedTree = [&](
Value *VectorizedTree,
Value *Res) {
26501 if (VectorizedTree) {
26505 if (AnyBoolLogicOp) {
26506 auto It = ReducedValsToOps.
find(VectorizedTree);
26507 auto It1 = ReducedValsToOps.
find(Res);
26508 if ((It == ReducedValsToOps.
end() && It1 == ReducedValsToOps.
end()) ||
26510 (It != ReducedValsToOps.
end() &&
26511 any_of(It->getSecond(), [&](Instruction *
I) {
26512 return isBoolLogicOp(I) &&
26513 getRdxOperand(I, 0) == VectorizedTree;
26517 (It1 != ReducedValsToOps.
end() &&
26518 any_of(It1->getSecond(), [&](Instruction *
I) {
26519 return isBoolLogicOp(I) && getRdxOperand(I, 0) == Res;
26523 VectorizedTree = Builder.
CreateFreeze(VectorizedTree);
26527 return createOp(Builder, RdxKind, VectorizedTree, Res,
"op.rdx",
26533 SmallDenseSet<Value *> IgnoreList(ReductionOps.size() *
26534 ReductionOps.front().size());
26535 for (ReductionOpsType &RdxOps : ReductionOps)
26536 for (
Value *RdxOp : RdxOps) {
26539 IgnoreList.insert(RdxOp);
26542 FastMathFlags RdxFMF;
26544 for (
Value *U : IgnoreList)
26546 RdxFMF &= FPMO->getFastMathFlags();
26549 if (RK == ReductionOrdering::Ordered)
26550 IgnoreList.
clear();
26556 for (
Value *V : Candidates)
26557 TrackedVals.try_emplace(V, V);
26559 auto At = [](SmallMapVector<Value *, unsigned, 16> &MV,
26560 Value *
V) ->
unsigned & {
26561 auto *It = MV.
find(V);
26562 assert(It != MV.
end() &&
"Unable to find given key.");
26566 DenseMap<Value *, unsigned> VectorizedVals(ReducedVals.
size());
26569 SmallPtrSet<Value *, 4> RequiredExtract;
26570 WeakTrackingVH VectorizedTree =
nullptr;
26571 bool CheckForReusedReductionOps =
false;
26577 const bool TwoGroupsOnly = ReducedVals.
size() == 2;
26578 const bool TwoGroupsOfSameSmallSize =
26580 ReducedVals.
front().size() == ReducedVals.
back().size() &&
26581 ReducedVals.
front().size() < ReductionLimit;
26586 States.
back().getOpcode() == Instruction::Load)) {
26587 LocalReducedVals.
emplace_back().append(RV.begin(), RV.end());
26588 States.
push_back(InstructionsState::invalid());
26591 if (!LocalReducedVals.
empty() &&
26594 LocalReducedVals.
emplace_back().append(RV.begin(), RV.end());
26600 if (!TwoGroupsOfSameSmallSize) {
26602 if (!LocalReducedVals.
empty())
26603 Ops = LocalReducedVals.
back();
26604 Ops.append(RV.begin(), RV.end());
26605 InstructionsCompatibilityAnalysis
Analysis(DT,
DL, *
TTI, TLI);
26606 InstructionsState OpS =
Analysis.buildInstructionsState(
26609 if (OpS && OpS.areInstructionsWithCopyableElements()) {
26610 if (LocalReducedVals.
empty()) {
26615 LocalReducedVals.
back().swap(
Ops);
26616 States.
back() = OpS;
26621 if (TwoGroupsOnly) {
26623 OpS = InstructionsState(MainOp, AltOp);
26627 if (MainOp && AltOp &&
26628 V.canBuildSplitNode(
Ops, OpS, Op1, Op2, ReorderIndices)) {
26629 if (LocalReducedVals.
empty()) {
26634 LocalReducedVals.
back().swap(
Ops);
26635 States.
back() = OpS;
26640 LocalReducedVals.
emplace_back().append(RV.begin(), RV.end());
26643 ReducedVals.swap(LocalReducedVals);
26644 for (
unsigned I = 0,
E = ReducedVals.
size();
I <
E; ++
I) {
26646 InstructionsState S = States[
I];
26649 DenseMap<Value *, Value *> TrackedToOrig(2 * OrigReducedVals.
size());
26650 for (
Value *ReducedVal : OrigReducedVals) {
26651 Value *RdxVal = TrackedVals.at(ReducedVal);
26658 (!S || (!S.getMatchingMainOpOrAltOp(Inst) &&
26659 !S.isCopyableElement(Inst)))) ||
26661 !S.isCopyableElement(RdxVal)))
26664 TrackedToOrig.try_emplace(RdxVal, ReducedVal);
26666 bool ShuffledExtracts =
false;
26668 if (S && S.getOpcode() == Instruction::ExtractElement &&
26669 !S.isAltShuffle() &&
I + 1 <
E) {
26671 for (
Value *RV : ReducedVals[
I + 1]) {
26672 Value *RdxVal = TrackedVals.at(RV);
26679 CommonCandidates.push_back(RdxVal);
26680 TrackedToOrig.try_emplace(RdxVal, RV);
26682 SmallVector<int>
Mask;
26685 Candidates.
swap(CommonCandidates);
26686 ShuffledExtracts =
true;
26692 if (RK == ReductionOrdering::Ordered)
26695 Value *OrigV = TrackedToOrig.at(Candidates.
front());
26696 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
26698 Res = createOp(Builder, RdxKind, Res, VC,
"const.rdx", ReductionOps);
26699 Value *OrigV = TrackedToOrig.at(VC);
26700 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
26702 V.analyzedReductionRoot(ResI);
26704 VectorizedTree = GetNewVectorizedTree(VectorizedTree, Res);
26708 unsigned NumReducedVals = Candidates.
size();
26709 if (NumReducedVals < ReductionLimit &&
26710 (NumReducedVals < 2 || !
isSplat(Candidates)))
26715 IsSupportedHorRdxIdentityOp =
26716 RK == ReductionOrdering::Unordered && RdxKind != RecurKind::Mul &&
26717 RdxKind != RecurKind::FMul && RdxKind != RecurKind::FMulAdd;
26719 SmallMapVector<Value *, unsigned, 16> SameValuesCounter;
26720 if (IsSupportedHorRdxIdentityOp)
26721 for (
Value *V : Candidates) {
26722 Value *OrigV = TrackedToOrig.at(V);
26723 ++SameValuesCounter.
try_emplace(OrigV).first->second;
26735 bool SameScaleFactor =
false;
26736 bool OptReusedScalars = IsSupportedHorRdxIdentityOp &&
26737 SameValuesCounter.
size() != Candidates.size();
26739 if (OptReusedScalars) {
26741 (RdxKind == RecurKind::Add || RdxKind == RecurKind::FAdd ||
26742 RdxKind == RecurKind::Xor) &&
26744 [&SameValuesCounter](
const std::pair<Value *, unsigned> &
P) {
26745 return P.second == SameValuesCounter.
front().second;
26747 Candidates.resize(SameValuesCounter.
size());
26748 transform(SameValuesCounter, Candidates.begin(),
26749 [&](
const auto &
P) { return TrackedVals.at(P.first); });
26750 NumReducedVals = Candidates.size();
26752 if (NumReducedVals == 1) {
26753 Value *OrigV = TrackedToOrig.at(Candidates.front());
26754 unsigned Cnt = At(SameValuesCounter, OrigV);
26756 emitScaleForReusedOps(Candidates.front(), Builder, Cnt);
26757 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
26758 VectorizedVals.try_emplace(OrigV, Cnt);
26759 ExternallyUsedValues.
insert(OrigV);
26764 unsigned MaxVecRegSize =
V.getMaxVecRegSize();
26765 unsigned EltSize =
V.getVectorElementSize(Candidates[0]);
26766 const unsigned MaxElts = std::clamp<unsigned>(
26768 RegMaxNumber * RedValsMaxNumber);
26770 unsigned ReduxWidth = NumReducedVals;
26771 auto GetVectorFactor = [&, &
TTI = *
TTI](
unsigned ReduxWidth) {
26772 unsigned NumParts, NumRegs;
26773 Type *ScalarTy = Candidates.front()->getType();
26780 while (NumParts > NumRegs) {
26781 assert(ReduxWidth > 0 &&
"ReduxWidth is unexpectedly 0.");
26782 ReduxWidth =
bit_floor(ReduxWidth - 1);
26788 if (NumParts > NumRegs / 2)
26793 ReduxWidth = GetVectorFactor(ReduxWidth);
26794 ReduxWidth = std::min(ReduxWidth, MaxElts);
26796 unsigned Start = 0;
26797 unsigned Pos =
Start;
26799 unsigned PrevReduxWidth = ReduxWidth;
26800 bool CheckForReusedReductionOpsLocal =
false;
26801 auto AdjustReducedVals = [&](
bool IgnoreVL =
false) {
26802 bool IsAnyRedOpGathered =
26804 (RK == ReductionOrdering::Ordered ||
V.isAnyGathered(IgnoreList));
26805 if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) {
26808 CheckForReusedReductionOpsLocal |= IsAnyRedOpGathered;
26811 if (Pos < NumReducedVals - ReduxWidth + 1)
26812 return IsAnyRedOpGathered;
26815 if (ReduxWidth > 1)
26816 ReduxWidth = GetVectorFactor(ReduxWidth);
26817 return IsAnyRedOpGathered;
26819 bool AnyVectorized =
false;
26820 SmallDenseSet<std::pair<unsigned, unsigned>, 8> IgnoredCandidates;
26821 while (Pos < NumReducedVals - ReduxWidth + 1 &&
26822 ReduxWidth >= ReductionLimit) {
26825 if (CheckForReusedReductionOpsLocal && PrevReduxWidth != ReduxWidth &&
26827 CheckForReusedReductionOps =
true;
26830 PrevReduxWidth = ReduxWidth;
26833 if (IgnoredCandidates.
contains(std::make_pair(Pos, ReduxWidth)) ||
26836 std::make_pair(Pos,
bit_floor(ReduxWidth))) ||
26838 std::make_pair(Pos + (ReduxWidth -
bit_floor(ReduxWidth)),
26840 V.areAnalyzedReductionVals(VL)) {
26841 (void)AdjustReducedVals(
true);
26848 return RedValI &&
V.isDeleted(RedValI);
26851 if (RK == ReductionOrdering::Ordered)
26854 V.buildTree(VL, IgnoreList);
26855 if (
V.isTreeTinyAndNotFullyVectorizable(RK ==
26856 ReductionOrdering::Unordered)) {
26857 if (!AdjustReducedVals())
26858 V.analyzedReductionVals(VL);
26861 V.reorderTopToBottom();
26864 VL.front()->getType()->isIntOrIntVectorTy() ||
26865 ReductionLimit > 2 ||
26866 RK == ReductionOrdering::Ordered);
26870 ExternallyUsedValues);
26874 LocalExternallyUsedValues.insert(ReductionRoot);
26875 for (
unsigned Cnt = 0, Sz = ReducedVals.
size(); Cnt < Sz; ++Cnt) {
26876 if (Cnt ==
I || (ShuffledExtracts && Cnt ==
I - 1))
26878 for (
Value *V : ReducedVals[Cnt])
26880 LocalExternallyUsedValues.insert(TrackedVals[V]);
26882 if (!IsSupportedHorRdxIdentityOp) {
26885 "Reused values counter map is not empty");
26886 for (
unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
26887 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
26889 Value *
V = Candidates[Cnt];
26890 Value *OrigV = TrackedToOrig.at(V);
26891 ++SameValuesCounter.
try_emplace(OrigV).first->second;
26894 V.transformNodes();
26895 V.computeMinimumValueSizes();
26900 SmallPtrSet<Value *, 4> Visited;
26901 for (
unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
26902 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
26904 Value *RdxVal = Candidates[Cnt];
26905 if (
auto It = TrackedVals.find(RdxVal); It != TrackedVals.end())
26906 RdxVal = It->second;
26907 if (!Visited.
insert(RdxVal).second)
26911 if (!VLScalars.contains(RdxVal) &&
V.isVectorized(RdxVal)) {
26912 LocalExternallyUsedValues.insert(RdxVal);
26915 Value *OrigV = TrackedToOrig.at(RdxVal);
26917 VectorizedVals.lookup(OrigV) + At(SameValuesCounter, OrigV);
26918 if (
NumOps != ReducedValsToOps.
at(OrigV).size())
26919 LocalExternallyUsedValues.insert(RdxVal);
26922 if (!IsSupportedHorRdxIdentityOp)
26923 SameValuesCounter.
clear();
26924 for (
Value *RdxVal : VL)
26925 if (RequiredExtract.
contains(RdxVal))
26926 LocalExternallyUsedValues.insert(RdxVal);
26927 V.buildExternalUses(LocalExternallyUsedValues);
26931 if (RK == ReductionOrdering::Ordered ||
V.isReducedBitcastRoot() ||
26932 V.isReducedCmpBitcastRoot())
26936 getReductionCost(
TTI, VL, SameValuesCounter, IsCmpSelMinMax,
26937 RdxFMF, V, DT,
DL, TLI);
26940 <<
" for reduction\n");
26944 V.getORE()->emit([&]() {
26945 return OptimizationRemarkMissed(
SV_NAME,
"HorSLPNotBeneficial",
26946 ReducedValsToOps.
at(VL[0]).front())
26947 <<
"Vectorizing horizontal reduction is possible "
26948 <<
"but not beneficial with cost " <<
ore::NV(
"Cost",
Cost)
26949 <<
" and threshold "
26952 if (!AdjustReducedVals()) {
26953 V.analyzedReductionVals(VL);
26955 if (ReduxWidth > ReductionLimit &&
V.isTreeNotExtendable()) {
26958 *
TTI, VL.front()->getType(), ReduxWidth - 1);
26959 VF >= ReductionLimit;
26961 *
TTI, VL.front()->getType(), VF - 1)) {
26963 V.getCanonicalGraphSize() !=
V.getTreeSize())
26966 IgnoredCandidates.
insert(std::make_pair(
Offset + Idx, VF));
26973 LLVM_DEBUG(
dbgs() <<
"SLP: Vectorizing horizontal reduction at cost:"
26974 <<
Cost <<
". (HorRdx)\n");
26975 V.getORE()->emit([&]() {
26976 return OptimizationRemark(
SV_NAME,
"VectorizedHorizontalReduction",
26977 ReducedValsToOps.
at(VL[0]).front())
26978 <<
"Vectorized horizontal reduction with cost "
26979 <<
ore::NV(
"Cost",
Cost) <<
" and with tree size "
26980 <<
ore::NV(
"TreeSize",
V.getTreeSize());
26989 if (IsCmpSelMinMax)
26990 InsertPt = GetCmpForMinMaxReduction(RdxRootInst);
26993 Value *VectorizedRoot =
V.vectorizeTree(
26994 LocalExternallyUsedValues, InsertPt, VectorValuesAndScales);
26997 for (
Value *RdxVal : Candidates) {
26998 Value *OrigVal = TrackedToOrig.at(RdxVal);
26999 Value *TransformedRdxVal = TrackedVals.at(OrigVal);
27000 if (TransformedRdxVal != RdxVal)
27001 TrackedToOrig.try_emplace(TransformedRdxVal, OrigVal);
27004 if (RK == ReductionOrdering::Ordered) {
27007 assert(VectorizedRoot &&
"Expected vectorized tree");
27010 for (
Value *RdxVal : VL)
27011 ++VectorizedVals.try_emplace(RdxVal).first->getSecond();
27014 ReduxWidth = NumReducedVals - Pos;
27015 if (ReduxWidth > 1)
27016 ReduxWidth = GetVectorFactor(NumReducedVals - Pos);
27017 AnyVectorized =
true;
27018 VectorizedTree = ReductionRoot;
27027 VectorizedRoot = Builder.
CreateFreeze(VectorizedRoot);
27030 if (OptReusedScalars && !SameScaleFactor) {
27031 VectorizedRoot = emitReusedOps(VectorizedRoot, Builder, V,
27032 SameValuesCounter, TrackedToOrig);
27035 Type *ScalarTy = VL.front()->getType();
27040 OptReusedScalars && SameScaleFactor
27041 ? SameValuesCounter.
front().second
27044 ?
V.isSignedMinBitwidthRootNode()
27046 V.isReducedBitcastRoot() ||
V.isReducedCmpBitcastRoot());
27049 for (
Value *RdxVal : VL) {
27050 Value *OrigV = TrackedToOrig.at(RdxVal);
27051 if (IsSupportedHorRdxIdentityOp) {
27052 VectorizedVals.try_emplace(OrigV, At(SameValuesCounter, OrigV));
27055 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
27056 if (!
V.isVectorized(RdxVal))
27057 RequiredExtract.
insert(RdxVal);
27061 ReduxWidth = NumReducedVals - Pos;
27062 if (ReduxWidth > 1)
27063 ReduxWidth = GetVectorFactor(NumReducedVals - Pos);
27064 AnyVectorized =
true;
27066 if (OptReusedScalars && !AnyVectorized) {
27067 for (
const std::pair<Value *, unsigned> &
P : SameValuesCounter) {
27068 Value *RdxVal = TrackedVals.at(
P.first);
27069 Value *RedVal = emitScaleForReusedOps(RdxVal, Builder,
P.second);
27070 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
27071 VectorizedVals.try_emplace(
P.first,
P.second);
27078 if (RK == ReductionOrdering::Ordered)
27079 return VectorizedTree;
27081 if (!VectorValuesAndScales.
empty())
27082 VectorizedTree = GetNewVectorizedTree(
27084 emitReduction(Builder, *
TTI, ReductionRoot->getType()));
27086 if (!VectorizedTree) {
27087 if (!CheckForReusedReductionOps) {
27088 for (ReductionOpsType &RdxOps : ReductionOps)
27089 for (
Value *RdxOp : RdxOps)
27111 auto FixBoolLogicalOps =
27114 if (!AnyBoolLogicOp)
27116 if (isBoolLogicOp(RedOp1) && ((!InitStep &&
LHS == VectorizedTree) ||
27117 getRdxOperand(RedOp1, 0) ==
LHS ||
27120 bool NeedFreeze =
LHS != VectorizedTree;
27121 if (isBoolLogicOp(RedOp2) && ((!InitStep &&
RHS == VectorizedTree) ||
27122 getRdxOperand(RedOp2, 0) ==
RHS ||
27125 if ((InitStep ||
RHS != VectorizedTree) &&
27126 getRdxOperand(RedOp2, 0) ==
RHS &&
27127 ((isBoolLogicOp(RedOp1) &&
27128 getRdxOperand(RedOp1, 1) == RedOp2) ||
27132 return OpI && isBoolLogicOp(OpI) &&
27133 getRdxOperand(OpI, 1) == RedOp2;
27136 NeedFreeze =
false;
27150 unsigned Sz = InstVals.
size();
27152 for (
unsigned I = 0,
E = (Sz / 2) * 2;
I <
E;
I += 2) {
27155 Value *RdxVal1 = InstVals[
I].second;
27156 Value *StableRdxVal1 = RdxVal1;
27157 auto It1 = TrackedVals.find(RdxVal1);
27158 if (It1 != TrackedVals.end())
27159 StableRdxVal1 = It1->second;
27160 Value *RdxVal2 = InstVals[
I + 1].second;
27161 Value *StableRdxVal2 = RdxVal2;
27162 auto It2 = TrackedVals.find(RdxVal2);
27163 if (It2 != TrackedVals.end())
27164 StableRdxVal2 = It2->second;
27168 FixBoolLogicalOps(StableRdxVal1, StableRdxVal2, InstVals[
I].first,
27170 Value *ExtraRed = createOp(Builder, RdxKind, StableRdxVal1,
27171 StableRdxVal2,
"op.rdx", ReductionOps);
27172 ExtraReds[
I / 2] = std::make_pair(InstVals[
I].first, ExtraRed);
27175 ExtraReds[Sz / 2] = InstVals.
back();
27181 SmallPtrSet<Value *, 8> Visited;
27183 for (
Value *RdxVal : Candidates) {
27184 if (!Visited.
insert(RdxVal).second)
27186 unsigned NumOps = VectorizedVals.lookup(RdxVal);
27187 for (Instruction *RedOp :
27193 bool InitStep =
true;
27194 while (ExtraReductions.
size() > 1) {
27196 FinalGen(ExtraReductions, InitStep);
27197 ExtraReductions.
swap(NewReds);
27200 VectorizedTree = ExtraReductions.
front().second;
27202 ReductionRoot->replaceAllUsesWith(VectorizedTree);
27209 SmallPtrSet<Value *, 4> IgnoreSet;
27218 for (
auto *U :
Ignore->users()) {
27220 "All users must be either in the reduction ops list.");
27223 if (!
Ignore->use_empty()) {
27225 Ignore->replaceAllUsesWith(
P);
27228 V.removeInstructionsAndOperands(RdxOps, VectorValuesAndScales);
27230 return VectorizedTree;
27236 Value *createSingleOp(IRBuilderBase &Builder,
const TargetTransformInfo &
TTI,
27237 Value *Vec,
unsigned Scale,
bool IsSigned,
Type *DestTy,
27238 bool ReducedInTree) {
27240 if (ReducedInTree) {
27263 Rdx, emitReduction(Lane, Builder, &
TTI, DestTy),
I);
27266 Rdx = emitReduction(Vec, Builder, &
TTI, DestTy);
27268 if (Rdx->
getType() != DestTy)
27274 Rdx = emitScaleForReusedOps(Rdx, Builder, Scale);
27281 const SmallMapVector<Value *, unsigned, 16> SameValuesCounter,
27282 bool IsCmpSelMinMax, FastMathFlags FMF,
const BoUpSLP &R,
27283 DominatorTree &DT,
const DataLayout &
DL,
const TargetLibraryInfo &TLI) {
27285 Type *ScalarTy = ReducedVals.
front()->getType();
27286 unsigned ReduxWidth = ReducedVals.
size();
27287 FixedVectorType *VectorTy =
R.getReductionType();
27292 auto EvaluateScalarCost = [&](function_ref<
InstructionCost()> GenCostFn) {
27295 int Cnt = ReducedVals.
size();
27296 for (
Value *RdxVal : ReducedVals) {
27300 unsigned SameValueCount = SameValuesCounter.
lookup(RdxVal);
27301 Cost += (SameValueCount ? SameValueCount - 1 : 0) * GenCostFn();
27306 unsigned SameValueCount = SameValuesCounter.
lookup(RdxVal);
27307 Cost += (SameValueCount ? SameValueCount : 1) * GenCostFn();
27311 for (User *U : RdxVal->
users()) {
27313 if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) {
27314 if (RdxKind == RecurKind::FAdd) {
27324 FMACost -= FMulCost;
27326 ScalarCost += FMACost;
27333 ScalarCost = InstructionCost::getInvalid();
27337 Cost += ScalarCost;
27339 Cost += GenCostFn();
27348 bool DoesRequireReductionOp = !AllConsts && VectorValuesAndScales.
empty();
27350 case RecurKind::Add:
27351 case RecurKind::Mul:
27352 case RecurKind::Or:
27353 case RecurKind::And:
27354 case RecurKind::Xor:
27355 case RecurKind::FAdd:
27356 case RecurKind::FMul: {
27359 if (DoesRequireReductionOp) {
27362 unsigned ScalarTyNumElements = VecTy->getNumElements();
27367 ReducedVals.size()),
27378 auto [RType, IsSigned] =
R.getRootNodeTypeWithNoCast().value_or(
27379 std::make_pair(RedTy,
true));
27380 if (RType == RedTy) {
27385 RdxOpcode, !IsSigned, RedTy,
27391 auto [RType, IsSigned] =
R.getRootNodeTypeWithNoCast().value_or(
27392 std::make_pair(RedTy,
true));
27395 if (RdxKind == RecurKind::FAdd) {
27400 for (
Value *RdxVal : ReducedVals) {
27406 FMF &= FPCI->getFastMathFlags();
27409 if (!
Ops.empty()) {
27414 IntrinsicCostAttributes ICA(Intrinsic::fmuladd, RVecTy,
27415 {RVecTy, RVecTy, RVecTy}, FMF);
27421 Instruction::FMul, RVecTy,
CostKind);
27423 <<
"Minus vector FMul cost: " << FMulCost <<
"\n");
27424 FMACost -= FMulCost;
27428 if (FMACost.isValid())
27429 VectorCost += FMACost;
27433 if (RType != RedTy) {
27434 unsigned Opcode = Instruction::Trunc;
27436 Opcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
27442 ScalarCost = EvaluateScalarCost([&]() {
27447 case RecurKind::FMax:
27448 case RecurKind::FMin:
27449 case RecurKind::FMaximum:
27450 case RecurKind::FMinimum:
27451 case RecurKind::SMax:
27452 case RecurKind::SMin:
27453 case RecurKind::UMax:
27454 case RecurKind::UMin: {
27457 if (DoesRequireReductionOp) {
27463 auto [RType, IsSigned] =
R.getRootNodeTypeWithNoCast().value_or(
27464 std::make_pair(RedTy,
true));
27466 IntrinsicCostAttributes ICA(Id, RVecTy, {RVecTy, RVecTy}, FMF);
27468 if (RType != RedTy) {
27469 unsigned Opcode = Instruction::Trunc;
27471 Opcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
27477 ScalarCost = EvaluateScalarCost([&]() {
27478 IntrinsicCostAttributes ICA(Id, ScalarTy, {ScalarTy, ScalarTy}, FMF);
27487 LLVM_DEBUG(
dbgs() <<
"SLP: Adding cost " << VectorCost - ScalarCost
27489 <<
" (It is a splitting reduction)\n");
27490 return VectorCost - ScalarCost;
27496 Value *emitReduction(IRBuilderBase &Builder,
const TargetTransformInfo &
TTI,
27498 Value *ReducedSubTree =
nullptr;
27500 auto CreateSingleOp = [&](
Value *Vec,
unsigned Scale,
bool IsSigned,
27501 bool ReducedInTree) {
27502 Value *Rdx = createSingleOp(Builder,
TTI, Vec, Scale, IsSigned, DestTy,
27504 if (ReducedSubTree)
27505 ReducedSubTree = createOp(Builder, RdxKind, ReducedSubTree, Rdx,
27506 "op.rdx", ReductionOps);
27508 ReducedSubTree = Rdx;
27510 if (VectorValuesAndScales.
size() == 1) {
27511 const auto &[Vec, Scale, IsSigned, ReducedInTree] =
27512 VectorValuesAndScales.
front();
27513 CreateSingleOp(Vec, Scale, IsSigned, ReducedInTree);
27514 return ReducedSubTree;
27518 Value *VecRes =
nullptr;
27519 bool VecResSignedness =
false;
27520 auto CreateVecOp = [&](
Value *Vec,
unsigned Cnt,
bool IsSigned,
27521 bool ReducedInTree) {
27522 if (ReducedInTree) {
27523 CreateSingleOp(Vec, Cnt, IsSigned, ReducedInTree);
27531 case RecurKind::Add: {
27532 if (ScalarTy == Builder.
getInt1Ty() && ScalarTy != DestTy) {
27535 <<
". (HorRdx)\n");
27538 std::iota(std::next(
Mask.begin(), VF *
I),
27539 std::next(
Mask.begin(), VF * (
I + 1)), 0);
27540 ++NumVectorInstructions;
27551 LLVM_DEBUG(
dbgs() <<
"SLP: Add (to-mul) " << Cnt <<
"of " << Vec
27552 <<
". (HorRdx)\n");
27553 ++NumVectorInstructions;
27557 case RecurKind::Xor: {
27560 <<
"SLP: Xor " << Cnt <<
"of " << Vec <<
". (HorRdx)\n");
27565 case RecurKind::FAdd: {
27569 LLVM_DEBUG(
dbgs() <<
"SLP: FAdd (to-fmul) " << Cnt <<
"of " << Vec
27570 <<
". (HorRdx)\n");
27571 ++NumVectorInstructions;
27575 case RecurKind::And:
27576 case RecurKind::Or:
27577 case RecurKind::SMax:
27578 case RecurKind::SMin:
27579 case RecurKind::UMax:
27580 case RecurKind::UMin:
27581 case RecurKind::FMax:
27582 case RecurKind::FMin:
27583 case RecurKind::FMaximum:
27584 case RecurKind::FMinimum:
27587 case RecurKind::Sub:
27588 case RecurKind::AddChainWithSubs:
27589 case RecurKind::Mul:
27590 case RecurKind::FMul:
27591 case RecurKind::FMulAdd:
27592 case RecurKind::AnyOf:
27593 case RecurKind::FindIV:
27594 case RecurKind::FindLast:
27595 case RecurKind::FMaxNum:
27596 case RecurKind::FMinNum:
27597 case RecurKind::FMaximumNum:
27598 case RecurKind::FMinimumNum:
27599 case RecurKind::None:
27606 VecResSignedness = IsSigned;
27608 ++NumVectorInstructions;
27609 if (ScalarTy == Builder.
getInt1Ty() && ScalarTy != DestTy &&
27615 std::iota(
Mask.begin(),
Mask.end(), 0);
27617 if (VecResVF < VecVF) {
27621 if (VecResVF != VecVF) {
27623 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
27632 "Expected the number of elements in VecRes to be a multiple "
27633 "of the number of elements in DestTy");
27649 if (VecResVF < VecVF) {
27655 if (VecResVF != VecVF)
27657 Op = createOp(Builder, RdxKind,
Op, Vec,
"rdx.op", ReductionOps);
27658 if (VecResVF != VecVF)
27663 for (
auto [Vec, Scale, IsSigned, ReducedInTree] : VectorValuesAndScales)
27664 CreateVecOp(Vec, Scale, IsSigned, ReducedInTree);
27665 CreateSingleOp(VecRes, 1,
false,
27668 return ReducedSubTree;
27672 Value *emitReduction(
Value *VectorizedValue, IRBuilderBase &Builder,
27673 const TargetTransformInfo *
TTI,
Type *DestTy) {
27674 assert(VectorizedValue &&
"Need to have a vectorized tree node");
27675 assert(RdxKind != RecurKind::FMulAdd &&
27676 "A call to the llvm.fmuladd intrinsic is not handled yet");
27679 if (FTy->getScalarType() == Builder.
getInt1Ty() &&
27680 RdxKind == RecurKind::Add &&
27685 VectorizedValue, Builder.
getIntNTy(FTy->getNumElements()));
27686 ++NumVectorInstructions;
27689 ++NumVectorInstructions;
27694 Value *emitScaleForReusedOps(
Value *VectorizedValue, IRBuilderBase &Builder,
27696 assert(IsSupportedHorRdxIdentityOp &&
27697 "The optimization of matched scalar identity horizontal reductions "
27698 "must be supported.");
27700 return VectorizedValue;
27702 case RecurKind::Add: {
27705 ConstantInt::get(VectorizedValue->
getType(), Cnt,
27708 << VectorizedValue <<
". (HorRdx)\n");
27709 return Builder.
CreateMul(VectorizedValue, Scale);
27711 case RecurKind::Xor: {
27713 LLVM_DEBUG(
dbgs() <<
"SLP: Xor " << Cnt <<
"of " << VectorizedValue
27714 <<
". (HorRdx)\n");
27717 return VectorizedValue;
27719 case RecurKind::FAdd: {
27721 Value *Scale = ConstantFP::get(VectorizedValue->
getType(), Cnt);
27723 << VectorizedValue <<
". (HorRdx)\n");
27724 return Builder.
CreateFMul(VectorizedValue, Scale);
27726 case RecurKind::And:
27727 case RecurKind::Or:
27728 case RecurKind::SMax:
27729 case RecurKind::SMin:
27730 case RecurKind::UMax:
27731 case RecurKind::UMin:
27732 case RecurKind::FMax:
27733 case RecurKind::FMin:
27734 case RecurKind::FMaximum:
27735 case RecurKind::FMinimum:
27737 return VectorizedValue;
27738 case RecurKind::Sub:
27739 case RecurKind::AddChainWithSubs:
27740 case RecurKind::Mul:
27741 case RecurKind::FMul:
27742 case RecurKind::FMulAdd:
27743 case RecurKind::AnyOf:
27744 case RecurKind::FindIV:
27745 case RecurKind::FindLast:
27746 case RecurKind::FMaxNum:
27747 case RecurKind::FMinNum:
27748 case RecurKind::FMaximumNum:
27749 case RecurKind::FMinimumNum:
27750 case RecurKind::None:
27759 emitReusedOps(
Value *VectorizedValue, IRBuilderBase &Builder, BoUpSLP &R,
27760 const SmallMapVector<Value *, unsigned, 16> &SameValuesCounter,
27761 const DenseMap<Value *, Value *> &TrackedToOrig) {
27762 assert(IsSupportedHorRdxIdentityOp &&
27763 "The optimization of matched scalar identity horizontal reductions "
27764 "must be supported.");
27767 if (VTy->getElementType() != VL.
front()->getType()) {
27771 R.isSignedMinBitwidthRootNode());
27774 case RecurKind::Add: {
27777 for (
Value *V : VL) {
27778 unsigned Cnt = SameValuesCounter.
lookup(TrackedToOrig.
at(V));
27779 Vals.
push_back(ConstantInt::get(
V->getType(), Cnt,
false));
27783 << VectorizedValue <<
". (HorRdx)\n");
27784 return Builder.
CreateMul(VectorizedValue, Scale);
27786 case RecurKind::And:
27787 case RecurKind::Or:
27790 <<
". (HorRdx)\n");
27791 return VectorizedValue;
27792 case RecurKind::SMax:
27793 case RecurKind::SMin:
27794 case RecurKind::UMax:
27795 case RecurKind::UMin:
27796 case RecurKind::FMax:
27797 case RecurKind::FMin:
27798 case RecurKind::FMaximum:
27799 case RecurKind::FMinimum:
27802 <<
". (HorRdx)\n");
27803 return VectorizedValue;
27804 case RecurKind::Xor: {
27809 SmallVector<int>
Mask(
27812 std::iota(
Mask.begin(),
Mask.end(), 0);
27813 bool NeedShuffle =
false;
27814 for (
unsigned I = 0, VF = VL.size();
I < VF; ++
I) {
27816 unsigned Cnt = SameValuesCounter.
lookup(TrackedToOrig.
at(V));
27817 if (Cnt % 2 == 0) {
27819 NeedShuffle =
true;
27825 dbgs() <<
"> of " << VectorizedValue <<
". (HorRdx)\n");
27829 ConstantVector::getNullValue(VectorizedValue->
getType()), Mask);
27830 return VectorizedValue;
27832 case RecurKind::FAdd: {
27835 for (
Value *V : VL) {
27836 unsigned Cnt = SameValuesCounter.
lookup(TrackedToOrig.
at(V));
27837 Vals.
push_back(ConstantFP::get(
V->getType(), Cnt));
27840 return Builder.
CreateFMul(VectorizedValue, Scale);
27842 case RecurKind::Sub:
27843 case RecurKind::AddChainWithSubs:
27844 case RecurKind::Mul:
27845 case RecurKind::FMul:
27846 case RecurKind::FMulAdd:
27847 case RecurKind::AnyOf:
27848 case RecurKind::FindIV:
27849 case RecurKind::FindLast:
27850 case RecurKind::FMaxNum:
27851 case RecurKind::FMinNum:
27852 case RecurKind::FMaximumNum:
27853 case RecurKind::FMinimumNum:
27854 case RecurKind::None:
27864 return HorizontalReduction::getRdxKind(V);
27870 unsigned AggregateSize = 1;
27872 Type *CurrentType =
IV->getType();
27875 for (
auto *Elt : ST->elements())
27876 if (Elt != ST->getElementType(0))
27877 return std::nullopt;
27878 AggregateSize *= ST->getNumElements();
27879 CurrentType = ST->getElementType(0);
27881 AggregateSize *= AT->getNumElements();
27882 CurrentType = AT->getElementType();
27884 AggregateSize *= VT->getNumElements();
27885 return AggregateSize;
27887 return AggregateSize;
27889 return std::nullopt;
27898 unsigned OperandOffset,
const BoUpSLP &R) {
27901 std::optional<unsigned> OperandIndex =
27903 if (!OperandIndex || R.isDeleted(LastInsertInst))
27907 BuildVectorOpds, InsertElts, *OperandIndex, R);
27910 BuildVectorOpds[*OperandIndex] = InsertedOperand;
27911 InsertElts[*OperandIndex] = LastInsertInst;
27914 }
while (LastInsertInst !=
nullptr &&
27941 "Expected insertelement or insertvalue instruction!");
27944 "Expected empty result vectors!");
27947 if (!AggregateSize)
27949 BuildVectorOpds.
resize(*AggregateSize);
27950 InsertElts.
resize(*AggregateSize);
27955 if (BuildVectorOpds.
size() >= 2)
27973 auto DominatedReduxValue = [&](
Value *R) {
27981 if (
P->getIncomingBlock(0) == ParentBB) {
27983 }
else if (
P->getIncomingBlock(1) == ParentBB) {
27987 if (Rdx && DominatedReduxValue(Rdx))
28000 if (
P->getIncomingBlock(0) == BBLatch) {
28002 }
else if (
P->getIncomingBlock(1) == BBLatch) {
28006 if (Rdx && DominatedReduxValue(Rdx))
28042 "Expected binop, select, or intrinsic for reduction matching");
28044 Root->
getOperand(HorizontalReduction::getFirstOperandIndex(Root));
28046 Root->
getOperand(HorizontalReduction::getFirstOperandIndex(Root) + 1);
28057 Value *Op0 =
nullptr;
28058 Value *Op1 =
nullptr;
28067 Value *B0 =
nullptr, *B1 =
nullptr;
28072bool SLPVectorizerPass::vectorizeHorReduction(
28073 PHINode *
P, Instruction *Root, BasicBlock *BB,
BoUpSLP &R,
28074 SmallVectorImpl<WeakTrackingVH> &PostponedInsts) {
28083 auto SelectRoot = [&]() {
28085 HorizontalReduction::getRdxKind(Root) != RecurKind::None)
28102 std::queue<std::pair<Instruction *, unsigned>>
Stack;
28103 Stack.emplace(SelectRoot(), 0);
28104 SmallPtrSet<Value *, 8> VisitedInstrs;
28107 if (
R.isAnalyzedReductionRoot(Inst))
28112 if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DT, *DL, *TTI, *TLI))
28114 return HorRdx.tryToReduce(R, *DL, TTI, *TLI, AC, *DT);
28116 auto TryAppendToPostponedInsts = [&](
Instruction *FutureSeed) {
28117 if (TryOperandsAsNewSeeds && FutureSeed == Root) {
28129 while (!
Stack.empty()) {
28132 std::tie(Inst, Level) =
Stack.front();
28137 if (
R.isDeleted(Inst))
28139 if (
Value *VectorizedV = TryToReduce(Inst)) {
28143 Stack.emplace(
I, Level);
28146 if (
R.isDeleted(Inst))
28150 if (!TryAppendToPostponedInsts(Inst)) {
28161 if (VisitedInstrs.
insert(
Op).second)
28166 !
R.isDeleted(
I) &&
I->getParent() == BB)
28167 Stack.emplace(
I, Level);
28172bool SLPVectorizerPass::tryToVectorize(Instruction *
I,
BoUpSLP &R) {
28179 if ((
I->getOpcode() == Instruction::FAdd ||
28180 I->getOpcode() == Instruction::FSub) &&
28190 if (!Op0 || !Op1 || Op0->getParent() !=
P || Op1->getParent() !=
P ||
28191 R.isDeleted(Op0) ||
R.isDeleted(Op1))
28201 if (
A &&
B &&
B->hasOneUse()) {
28204 if (B0 && B0->getParent() ==
P && !
R.isDeleted(B0))
28206 if (B1 && B1->getParent() ==
P && !
R.isDeleted(B1))
28210 if (
B &&
A &&
A->hasOneUse()) {
28213 if (A0 && A0->getParent() ==
P && !
R.isDeleted(A0))
28215 if (A1 && A1->getParent() ==
P && !
R.isDeleted(A1))
28219 auto TryToReduce = [
this, &
R, &TTI = *TTI](
Instruction *Inst,
28223 Type *Ty = Inst->getType();
28227 if (!HorRdx.matchReductionForOperands())
28233 TTI.getScalarizationOverhead(
28236 TTI.getInstructionCost(Inst,
CostKind);
28239 case RecurKind::Add:
28240 case RecurKind::Mul:
28241 case RecurKind::Or:
28242 case RecurKind::And:
28243 case RecurKind::Xor:
28244 case RecurKind::FAdd:
28245 case RecurKind::FMul: {
28248 FMF = FPCI->getFastMathFlags();
28249 RedCost = TTI.getArithmeticReductionCost(Inst->getOpcode(), VecTy, FMF,
28256 if (RedCost >= ScalarCost)
28259 return HorRdx.tryToReduce(R, *DL, &TTI, *TLI, AC, *DT) !=
nullptr;
28261 if (Candidates.
size() == 1)
28262 return TryToReduce(
I, {Op0, Op1}) || tryToVectorizeList({Op0, Op1},
R);
28265 std::optional<int> BestCandidate =
R.findBestRootPair(Candidates).first;
28266 if (!BestCandidate)
28268 return (*BestCandidate == 0 &&
28269 TryToReduce(
I, {Candidates[*BestCandidate].first,
28270 Candidates[*BestCandidate].second})) ||
28271 tryToVectorizeList({Candidates[*BestCandidate].first,
28272 Candidates[*BestCandidate].second},
28276bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *
P, Instruction *Root,
28277 BasicBlock *BB,
BoUpSLP &R) {
28279 bool Res = vectorizeHorReduction(
P, Root, BB, R, PostponedInsts);
28280 Res |= tryToVectorize(PostponedInsts, R);
28287 for (
Value *V : Insts)
28289 Res |= tryToVectorize(Inst, R);
28293bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
28296 if (!
R.canMapToVector(IVI->
getType()))
28299 SmallVector<Value *, 16> BuildVectorOpds;
28300 SmallVector<Value *, 16> BuildVectorInsts;
28304 if (MaxVFOnly && BuildVectorOpds.
size() == 2) {
28305 R.getORE()->emit([&]() {
28306 return OptimizationRemarkMissed(
SV_NAME,
"NotPossible", IVI)
28307 <<
"Cannot SLP vectorize list: only 2 elements of buildvalue, "
28308 "trying reduction first.";
28312 LLVM_DEBUG(
dbgs() <<
"SLP: array mappable to vector: " << *IVI <<
"\n");
28314 return tryToVectorizeList(BuildVectorOpds, R, MaxVFOnly);
28317bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
28320 SmallVector<Value *, 16> BuildVectorInsts;
28321 SmallVector<Value *, 16> BuildVectorOpds;
28322 SmallVector<int>
Mask;
28328 if (MaxVFOnly && BuildVectorInsts.
size() == 2) {
28329 R.getORE()->emit([&]() {
28330 return OptimizationRemarkMissed(
SV_NAME,
"NotPossible", IEI)
28331 <<
"Cannot SLP vectorize list: only 2 elements of buildvector, "
28332 "trying reduction first.";
28336 LLVM_DEBUG(
dbgs() <<
"SLP: array mappable to vector: " << *IEI <<
"\n");
28337 return tryToVectorizeList(BuildVectorInsts, R, MaxVFOnly);
28340template <
typename T>
28345 bool MaxVFOnly,
BoUpSLP &R) {
28358 if (!
I || R.isDeleted(
I)) {
28362 auto *SameTypeIt = IncIt;
28365 AreCompatible(VL, *SameTypeIt))) {
28368 if (
I && !R.isDeleted(
I))
28373 unsigned NumElts = VL.
size();
28374 LLVM_DEBUG(
dbgs() <<
"SLP: Trying to vectorize starting at nodes ("
28375 << NumElts <<
")\n");
28385 if (NumElts > 1 && TryToVectorizeHelper(
ArrayRef(VL), MaxVFOnly)) {
28388 VL.
swap(Candidates);
28389 Candidates.
clear();
28397 auto GetMinNumElements = [&R](
Value *V) {
28398 unsigned EltSize = R.getVectorElementSize(V);
28399 return std::max(2U, R.getMaxVecRegSize() / EltSize);
28401 if (NumElts < GetMinNumElements(*IncIt) &&
28402 (Candidates.
empty() ||
28403 Candidates.
front()->getType() == (*IncIt)->getType())) {
28411 if (Candidates.
size() > 1 &&
28412 (SameTypeIt ==
E || (*SameTypeIt)->getType() != (*IncIt)->getType())) {
28413 if (TryToVectorizeHelper(Candidates,
false)) {
28416 }
else if (MaxVFOnly) {
28419 for (
auto *It = Candidates.
begin(), *End = Candidates.
end(); It != End;
28422 if (!
I || R.isDeleted(
I)) {
28426 auto *SameTypeIt = It;
28427 while (SameTypeIt != End &&
28430 AreCompatible(*SameTypeIt, *It))) {
28433 if (
I && !R.isDeleted(
I))
28436 unsigned NumElts = VL.
size();
28437 if (NumElts > 1 && TryToVectorizeHelper(
ArrayRef(VL),
28443 Candidates.
clear();
28447 IncIt = SameTypeIt;
28459template <
bool IsCompatibility>
28464 "Expected valid element types only.");
28466 return IsCompatibility;
28469 if (CI1->getOperand(0)->getType()->getTypeID() <
28471 return !IsCompatibility;
28472 if (CI1->getOperand(0)->getType()->getTypeID() >
28475 if (CI1->getOperand(0)->getType()->getScalarSizeInBits() <
28477 return !IsCompatibility;
28478 if (CI1->getOperand(0)->getType()->getScalarSizeInBits() >
28487 if (BasePred1 < BasePred2)
28488 return !IsCompatibility;
28489 if (BasePred1 > BasePred2)
28492 bool CI1Preds = Pred1 == BasePred1;
28493 bool CI2Preds = Pred2 == BasePred1;
28494 for (
int I = 0,
E = CI1->getNumOperands();
I <
E; ++
I) {
28495 auto *Op1 = CI1->getOperand(CI1Preds ?
I :
E -
I - 1);
28499 if (Op1->getValueID() < Op2->getValueID())
28500 return !IsCompatibility;
28501 if (Op1->getValueID() > Op2->getValueID())
28505 if (IsCompatibility) {
28506 if (I1->getParent() != I2->getParent())
28513 return NodeI2 !=
nullptr;
28516 assert((NodeI1 == NodeI2) ==
28518 "Different nodes should have different DFS numbers");
28519 if (NodeI1 != NodeI2)
28523 if (S && (IsCompatibility || !S.isAltShuffle()))
28525 if (IsCompatibility)
28527 if (I1->getOpcode() != I2->getOpcode())
28528 return I1->getOpcode() < I2->getOpcode();
28531 return IsCompatibility;
28534template <
typename ItT>
28540 if (R.isDeleted(
I))
28544 Changed |= vectorizeRootInstruction(
nullptr, RootOp, BB, R);
28545 if (R.isDeleted(
I))
28551 if (R.isDeleted(
I))
28557 auto CompareSorter = [&](
Value *V,
Value *V2) {
28573 if (Vals.
size() <= 1)
28576 Vals, CompareSorter, AreCompatibleCompares,
28579 bool ArePossiblyReducedInOtherBlock =
any_of(Candidates, [](
Value *V) {
28581 auto *Select = dyn_cast<SelectInst>(U);
28583 Select->getParent() != cast<Instruction>(V)->getParent();
28586 if (ArePossiblyReducedInOtherBlock)
28588 return tryToVectorizeList(Candidates, R, MaxVFOnly);
28594bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions,
28597 "This function only accepts Insert instructions");
28598 bool OpsChanged =
false;
28600 for (
auto *
I :
reverse(Instructions)) {
28606 vectorizeInsertValueInst(LastInsertValue, BB, R,
true);
28609 vectorizeInsertElementInst(LastInsertElem, BB, R,
true);
28612 if (
R.isDeleted(
I))
28614 OpsChanged |= vectorizeHorReduction(
nullptr,
I, BB, R, PostponedInsts);
28620 vectorizeInsertValueInst(LastInsertValue, BB, R,
false);
28622 OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R,
28627 OpsChanged |= tryToVectorize(PostponedInsts, R);
28633bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB,
BoUpSLP &R) {
28635 SmallVector<Value *, 4> Incoming;
28636 SmallPtrSet<Value *, 16> VisitedInstrs;
28640 DenseMap<Value *, SmallVector<Value *, 4>> PHIToOpcodes;
28641 auto PHICompare = [
this, &PHIToOpcodes](
Value *V1,
Value *V2) {
28644 "Expected vectorizable types only.");
28654 V2->getType()->getScalarSizeInBits())
28657 V2->getType()->getScalarSizeInBits())
28661 if (Opcodes1.
size() < Opcodes2.
size())
28663 if (Opcodes1.
size() > Opcodes2.
size())
28665 for (
int I = 0,
E = Opcodes1.
size();
I <
E; ++
I) {
28674 return NodeI2 !=
nullptr;
28677 assert((NodeI1 == NodeI2) ==
28679 "Different nodes should have different DFS numbers");
28680 if (NodeI1 != NodeI2)
28683 if (S && !S.isAltShuffle() &&
I1->getOpcode() == I2->getOpcode()) {
28699 DT->getNode(V1->getParent());
28701 DT->getNode(V2->getParent());
28703 return NodeI2 !=
nullptr;
28706 assert((NodeI1 == NodeI2) ==
28708 "Different nodes should have different DFS numbers");
28709 if (NodeI1 != NodeI2)
28711 return V1->comesBefore(V2);
28724 return *Id1 < *Id2;
28728 if (
I1->getOpcode() == I2->getOpcode())
28730 return I1->getOpcode() < I2->getOpcode();
28753 auto ValID1 = Opcodes1[
I]->getValueID();
28754 auto ValID2 = Opcodes2[
I]->getValueID();
28755 if (ValID1 == ValID2)
28757 if (ValID1 < ValID2)
28759 if (ValID1 > ValID2)
28768 assert(U1 && U2 &&
"The only thing left should be undef & undef.");
28774 if (VL.empty() || V1 == VL.back())
28776 Value *V2 = VL.back();
28781 if (Opcodes1.
size() != Opcodes2.
size())
28783 for (
int I = 0,
E = Opcodes1.
size();
I <
E; ++
I) {
28789 if (
R.isDeleted(I1) ||
R.isDeleted(I2))
28791 if (
I1->getParent() != I2->getParent())
28799 if (Opcodes1[
I]->getValueID() != Opcodes2[
I]->getValueID())
28805 bool HaveVectorizedPhiNodes =
false;
28809 for (Instruction &
I : *BB) {
28816 if (!VisitedInstrs.
count(
P) && !
R.isDeleted(
P) &&
28821 if (Incoming.
size() <= 1)
28826 for (
Value *V : Incoming) {
28827 SmallVectorImpl<Value *> &Opcodes =
28829 if (!Opcodes.
empty())
28831 SmallVector<Value *, 4> Nodes(1, V);
28832 SmallPtrSet<Value *, 4> Visited;
28833 while (!Nodes.empty()) {
28837 for (
Value *V :
PHI->incoming_values()) {
28839 Nodes.push_back(PHI1);
28848 Incoming, PHICompare, AreCompatiblePHIs,
28850 return tryToVectorizeList(Candidates, R, MaxVFOnly);
28853 Changed |= HaveVectorizedPhiNodes;
28854 if (HaveVectorizedPhiNodes &&
any_of(PHIToOpcodes, [&](
const auto &
P) {
28856 return !
PHI ||
R.isDeleted(
PHI);
28858 PHIToOpcodes.
clear();
28860 }
while (HaveVectorizedPhiNodes);
28862 VisitedInstrs.
clear();
28864 InstSetVector PostProcessInserts;
28865 SmallSetVector<CmpInst *, 8> PostProcessCmps;
28868 auto VectorizeInsertsAndCmps = [&](
bool VectorizeCmps) {
28869 bool Changed = vectorizeInserts(PostProcessInserts, BB, R);
28870 if (VectorizeCmps) {
28872 PostProcessCmps.
clear();
28874 PostProcessInserts.clear();
28880 return PostProcessCmps.
contains(Cmp);
28882 PostProcessInserts.contains(
I);
28888 return I->use_empty() &&
28898 if (
R.isDeleted(&*It))
28901 if (!VisitedInstrs.
insert(&*It).second) {
28902 if (HasNoUsers(&*It) &&
28903 VectorizeInsertsAndCmps(It->isTerminator())) {
28916 if (
P->getNumIncomingValues() == 2) {
28919 if (Root && vectorizeRootInstruction(
P, Root, BB, R)) {
28933 if (BB ==
P->getIncomingBlock(
I) ||
28934 !DT->isReachableFromEntry(
P->getIncomingBlock(
I)))
28940 PI && !IsInPostProcessInstrs(PI)) {
28942 vectorizeRootInstruction(
nullptr, PI,
P->getIncomingBlock(
I), R);
28944 if (Res &&
R.isDeleted(
P)) {
28954 if (HasNoUsers(&*It)) {
28955 bool OpsChanged =
false;
28966 TryToVectorizeRoot |= (
I == Stores.
end() ||
I->second.size() == 1) &&
28967 SI->getValueOperand()->hasOneUse();
28969 if (TryToVectorizeRoot) {
28970 for (
auto *V : It->operand_values()) {
28974 VI && !IsInPostProcessInstrs(VI))
28976 OpsChanged |= vectorizeRootInstruction(
nullptr, VI, BB, R);
28983 VectorizeInsertsAndCmps(It->isTerminator());
28995 PostProcessInserts.insert(&*It);
29003bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB,
BoUpSLP &R) {
29005 for (
auto &Entry : GEPs) {
29008 if (
Entry.second.size() < 2)
29011 LLVM_DEBUG(
dbgs() <<
"SLP: Analyzing a getelementptr list of length "
29012 <<
Entry.second.size() <<
".\n");
29020 return !R.isDeleted(GEP);
29022 if (It ==
Entry.second.end())
29024 unsigned MaxVecRegSize =
R.getMaxVecRegSize();
29025 unsigned EltSize =
R.getVectorElementSize(*(*It)->idx_begin());
29026 if (MaxVecRegSize < EltSize)
29029 unsigned MaxElts = MaxVecRegSize / EltSize;
29030 for (
unsigned BI = 0, BE =
Entry.second.size(); BI < BE; BI += MaxElts) {
29031 auto Len = std::min<unsigned>(BE - BI, MaxElts);
29044 Candidates.remove_if([&R](
Value *
I) {
29054 for (
int I = 0,
E = GEPList.size();
I <
E && Candidates.
size() > 1; ++
I) {
29055 auto *GEPI = GEPList[
I];
29056 if (!Candidates.count(GEPI))
29058 const SCEV *SCEVI = SE->getSCEV(GEPList[
I]);
29059 for (
int J =
I + 1; J <
E && Candidates.
size() > 1; ++J) {
29060 auto *GEPJ = GEPList[J];
29061 if (!Candidates.count(GEPJ))
29063 const SCEV *SCEVJ = SE->getSCEV(GEPList[J]);
29065 Candidates.remove(GEPI);
29066 Candidates.remove(GEPJ);
29067 }
else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
29068 Candidates.remove(GEPJ);
29075 if (Candidates.
size() < 2)
29081 SmallVector<Value *, 16> Bundle(Candidates.
size());
29082 auto BundleIndex = 0
u;
29083 for (
auto *V : Candidates) {
29085 auto *GEPIdx =
GEP->idx_begin()->get();
29087 Bundle[BundleIndex++] = GEPIdx;
29099 Changed |= tryToVectorizeList(Bundle, R);
29105bool SLPVectorizerPass::vectorizeStoreChains(
BoUpSLP &R) {
29110 auto &&StoreSorter = [
this](StoreInst *
V, StoreInst *V2) {
29111 if (
V->getValueOperand()->getType()->getTypeID() <
29114 if (
V->getValueOperand()->getType()->getTypeID() >
29117 if (
V->getPointerOperandType()->getTypeID() <
29118 V2->getPointerOperandType()->getTypeID())
29120 if (
V->getPointerOperandType()->getTypeID() >
29121 V2->getPointerOperandType()->getTypeID())
29123 if (
V->getValueOperand()->getType()->getScalarSizeInBits() <
29126 if (
V->getValueOperand()->getType()->getScalarSizeInBits() >
29133 DomTreeNodeBase<llvm::BasicBlock> *NodeI1 = DT->getNode(
I1->getParent());
29134 DomTreeNodeBase<llvm::BasicBlock> *NodeI2 = DT->getNode(I2->getParent());
29135 assert(NodeI1 &&
"Should only process reachable instructions");
29136 assert(NodeI2 &&
"Should only process reachable instructions");
29137 assert((NodeI1 == NodeI2) ==
29139 "Different nodes should have different DFS numbers");
29140 if (NodeI1 != NodeI2)
29142 return I1->getOpcode() < I2->getOpcode();
29148 return V->getValueOperand()->getValueID() <
29152 bool SameParent =
true;
29158 StoreInst *V2 = VL.
back();
29183 SameParent &=
I1 && I2 &&
I1->getParent() == I2->getParent();
29185 for (
auto [SI, V] :
zip(VL, NewVL))
29186 V =
SI->getValueOperand();
29187 NewVL.back() = V1->getValueOperand();
29188 InstructionsCompatibilityAnalysis
Analysis(*DT, *DL, *TTI, *TLI);
29189 InstructionsState S =
Analysis.buildInstructionsState(
29197 return V1->getValueOperand()->
getValueID() ==
29202 DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>> Attempted;
29203 for (
auto &Pair : Stores) {
29204 if (Pair.second.size() < 2)
29208 << Pair.second.size() <<
".\n");
29217 Pair.second.rend());
29219 ReversedStores, StoreSorter, AreCompatibleStores,
29221 return vectorizeStores(Candidates, R, Attempted);
for(const MachineOperand &MO :llvm::drop_begin(OldMI.operands(), Desc.getNumOperands()))
static unsigned getIntrinsicID(const SDNode *N)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static bool isConstant(const MachineInstr &MI)
AMDGPU Register Bank Select
ReachingDefInfo InstSet InstSet & Ignore
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Function Alias Analysis false
Function Alias Analysis Results
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
block Block Frequency Analysis
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
static cl::opt< IntrinsicCostStrategy > IntrinsicCost("intrinsic-cost-strategy", cl::desc("Costing strategy for intrinsic instructions"), cl::init(IntrinsicCostStrategy::InstructionCost), cl::values(clEnumValN(IntrinsicCostStrategy::InstructionCost, "instruction-cost", "Use TargetTransformInfo::getInstructionCost"), clEnumValN(IntrinsicCostStrategy::IntrinsicCost, "intrinsic-cost", "Use TargetTransformInfo::getIntrinsicInstrCost"), clEnumValN(IntrinsicCostStrategy::TypeBasedIntrinsicCost, "type-based-intrinsic-cost", "Calculate the intrinsic cost based only on argument types")))
static APInt getElementIndex(TypeSize ElemSize, APInt &Offset)
This file provides an implementation of debug counters.
#define DEBUG_COUNTER(VARNAME, COUNTERNAME, DESC)
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
static bool runImpl(Function &F, const TargetLowering &TLI, const LibcallLoweringInfo &Libcalls, AssumptionCache *AC)
This is the interface for a simple mod/ref and alias analysis over globals.
static Value * getCondition(Instruction *I)
static void setCondition(Instruction *I, Value *NewCond)
ManagedStatic< HTTPClientCleanup > Cleanup
static Type * getIndexType(Value *In)
Module.h This file contains the declarations for the Module class.
This defines the Use class.
iv Induction Variable Users
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static bool isSplat(Value *V)
Return true if V is a splat of a value (which is used when multiplying a matrix with a scalar).
This file provides utility analysis objects describing memory locations.
MachineInstr unsigned OpIdx
uint64_t IntrinsicInst * II
static bool IsSelect(MachineInstr &MI)
This file defines the PriorityQueue class.
const SmallVectorImpl< MachineOperand > & Cond
static std::optional< OperandInfo > getOperandInfo(const MachineOperand &MO)
static bool isValid(const char C)
Returns true if C is a valid mangled character: <0-9a-zA-Z_>.
static cl::opt< bool > RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden, cl::desc("Run the SLP vectorization passes"))
static bool isAlternateInstruction(Instruction *I, Instruction *MainOp, Instruction *AltOp, const TargetLibraryInfo &TLI)
Checks if the specified instruction I is an alternate operation for the given MainOp and AltOp instru...
static FixedVectorType * getWidenedType(Type *ScalarTy, unsigned VF)
static bool doesNotNeedToSchedule(ArrayRef< Value * > VL)
Checks if the specified array of instructions does not require scheduling.
static bool isVectorLikeInstWithConstOps(Value *V)
Checks if V is one of vector-like instructions, i.e.
static cl::opt< bool > SplitAlternateInstructions("slp-split-alternate-instructions", cl::init(true), cl::Hidden, cl::desc("Improve the code quality by splitting alternate instructions"))
static bool checkTreeSizes(ArrayRef< std::pair< unsigned, unsigned > > Sizes)
Checks if the quadratic mean deviation is less than 90% of the mean size.
static bool isRepeatedNonIdentityClusteredMask(ArrayRef< int > Mask, unsigned Sz)
Checks if the given mask is a "clustered" mask with the same clusters of size Sz, which are not ident...
static bool isMaskedLoadCompress(ArrayRef< Value * > VL, ArrayRef< Value * > PointerOps, ArrayRef< unsigned > Order, const TargetTransformInfo &TTI, const DataLayout &DL, ScalarEvolution &SE, AssumptionCache &AC, const DominatorTree &DT, const TargetLibraryInfo &TLI, const function_ref< bool(Value *)> AreAllUsersVectorized, bool &IsMasked, unsigned &InterleaveFactor, SmallVectorImpl< int > &CompressMask, VectorType *&LoadVecTy)
Checks if the VL can be transformed to a (masked)load + compress or (masked) interleaved load.
static const unsigned MaxPHINumOperands
Maximum allowed number of operands in the PHI nodes.
static bool isUsedOutsideBlock(Value *V)
Checks if the provided value does not require scheduling.
static cl::opt< bool > VectorizeCopyableElements("slp-copyable-elements", cl::init(true), cl::Hidden, cl::desc("Try to replace values with the idempotent instructions for " "better vectorization."))
Enables vectorization of copyable elements.
static cl::opt< int > MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static bool allSameOpcode(ArrayRef< Value * > VL)
static InstructionCost canConvertToFMA(ArrayRef< Value * > VL, const InstructionsState &S, DominatorTree &DT, const DataLayout &DL, TargetTransformInfo &TTI, const TargetLibraryInfo &TLI)
Check if we can convert fadd/fsub sequence to FMAD.
static cl::opt< unsigned > MaxProfitableLoadStride("slp-max-stride", cl::init(8), cl::Hidden, cl::desc("The maximum stride, considered to be profitable."))
static bool isCommutative(Instruction *I, Value *ValWithUses, bool IsCopyable=false)
static unsigned getLoopTripCount(const Loop *L, ScalarEvolution &SE)
Get the assumed loop trip count for the loop L.
static bool findBuildAggregate(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, const BoUpSLP &R)
Recognize construction of vectors like ra = insertelement <4 x float> poison, float s0,...
static bool clusterSortPtrAccesses(ArrayRef< Value * > VL, ArrayRef< BasicBlock * > BBs, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
static unsigned getNumElements(Type *Ty)
static SmallBitVector buildUseMask(int VF, ArrayRef< int > Mask, UseMask MaskArg)
Prepares a use bitset for the given mask either for the first argument or for the second.
static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0, Value *Op1, const TargetLibraryInfo &TLI)
Checks if the provided operands of 2 cmp instructions are compatible, i.e.
static void reorderScalars(SmallVectorImpl< Value * > &Scalars, ArrayRef< int > Mask)
Reorders the list of scalars in accordance with the given Mask.
static Value * createInsertVector(IRBuilderBase &Builder, Value *Vec, Value *V, unsigned Index, function_ref< Value *(Value *, Value *, ArrayRef< int >)> Generator={})
Creates subvector insert.
static void findBuildAggregateRec(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, unsigned OperandOffset, const BoUpSLP &R)
static unsigned getNumElems(unsigned Size, unsigned PartNumElems, unsigned Part)
Returns correct remaining number of elements, considering total amount Size, (power-of-2 number) of e...
static InstructionCost getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={})
Returns the cost of the shuffle instructions with the given Kind, vector type Tp and optional Mask.
static bool isSimple(Instruction *I)
static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns true if widened type of Ty elements with size Sz represents full vector type,...
static const int MinScheduleRegionSize
If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling regions to be handled.
static cl::opt< unsigned > MinProfitableStridedLoads("slp-min-strided-loads", cl::init(2), cl::Hidden, cl::desc("The minimum number of loads, which should be considered strided, " "if the stride is > 1 or is runtime value"))
static bool isFirstInsertElement(const InsertElementInst *IE1, const InsertElementInst *IE2)
Checks if the IE1 instructions is followed by IE2 instruction in the buildvector sequence.
static cl::opt< int > LookAheadMaxDepth("slp-max-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for operand reordering scores"))
static cl::opt< unsigned > MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden, cl::desc("Maximum SLP vectorization factor (0=unlimited)"))
static void reorderReuses(SmallVectorImpl< int > &Reuses, ArrayRef< int > Mask)
Reorders the given Reuses mask according to the given Mask.
static void combineOrders(MutableArrayRef< unsigned > Order, ArrayRef< unsigned > SecondaryOrder)
static const unsigned MaxMemDepDistance
static cl::opt< bool > ViewSLPTree("view-slp-tree", cl::Hidden, cl::desc("Display the SLP trees with Graphviz"))
static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, TargetLibraryInfo *TLI, const TargetTransformInfo *TTI)
static DebugLoc getDebugLocFromPHI(PHINode &PN)
static std::optional< unsigned > getExtractIndex(const Instruction *E)
static cl::opt< bool > VectorizeNonPowerOf2("slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden, cl::desc("Try to vectorize with non-power-of-2 number of elements."))
static cl::opt< unsigned > MinTreeSize("slp-min-tree-size", cl::init(3), cl::Hidden, cl::desc("Only vectorize small trees if they are fully vectorizable"))
static void reorderOrder(SmallVectorImpl< unsigned > &Order, ArrayRef< int > Mask, bool BottomOrder=false)
Reorders the given Order according to the given Mask.
static cl::opt< bool > ForceStridedLoads("slp-force-strided-loads", cl::init(false), cl::Hidden, cl::desc("Generate strided loads even if they are not " "profitable. Used for testing only."))
static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not less than Sz, which forms type,...
static bool isMainInstruction(Instruction *I, Instruction *MainOp, Instruction *AltOp, const TargetLibraryInfo &TLI)
Checks if the specified instruction I is an main operation for the given MainOp and AltOp instruction...
static T * performExtractsShuffleAction(MutableArrayRef< std::pair< T *, SmallVector< int > > > ShuffleMask, Value *Base, function_ref< unsigned(T *)> GetVF, function_ref< std::pair< T *, bool >(T *, ArrayRef< int >, bool)> ResizeAction, function_ref< T *(ArrayRef< int >, ArrayRef< T * >)> Action)
Does the analysis of the provided shuffle masks and performs the requested actions on the vectors wit...
static cl::opt< bool > ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions"))
static bool isConstant(Value *V)
static bool isSplat(ArrayRef< Value * > VL)
static cl::opt< int > SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, cl::desc("Only vectorize if you gain more than this " "number "))
static unsigned getPartNumElems(unsigned Size, unsigned NumParts)
Returns power-of-2 number of elements in a single register (part), given the total number of elements...
static bool allConstant(ArrayRef< Value * > VL)
static constexpr int UsesLimit
static std::optional< unsigned > getElementIndex(const Value *Inst, unsigned Offset=0)
static bool isReductionCandidate(Instruction *I)
\Returns true if I is a candidate instruction for reduction vectorization.
static unsigned getShufflevectorNumGroups(ArrayRef< Value * > VL)
static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI, const TargetLibraryInfo &TLI)
static std::pair< size_t, size_t > generateKeySubkey(Value *V, const TargetLibraryInfo *TLI, function_ref< hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator, bool AllowAlternate)
Generates key/subkey pair for the given value to provide effective sorting of the values and better d...
static cl::opt< bool > ShouldStartVectorizeHorAtStore("slp-vectorize-hor-store", cl::init(false), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions feeding into a store"))
static unsigned getNumberOfPotentiallyCommutativeOps(Instruction *I)
static InstructionCost getScalarizationOverhead(const TargetTransformInfo &TTI, Type *ScalarTy, VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={})
This is similar to TargetTransformInfo::getScalarizationOverhead, but if ScalarTy is a FixedVectorTyp...
static bool isCommutableOperand(Instruction *I, Value *ValWithUses, unsigned Op, bool IsCopyable=false)
Checks if the operand is commutative.
static bool buildCompressMask(ArrayRef< Value * > PointerOps, ArrayRef< unsigned > Order, Type *ScalarTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< int > &CompressMask)
Builds compress-like mask for shuffles for the given PointerOps, ordered with Order.
static std::pair< InstructionCost, InstructionCost > getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, ArrayRef< Type * > ArgTys)
Calculates the costs of vectorized intrinsic (if possible) and vectorized function (if possible) call...
static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements, SmallVectorImpl< int > &Mask)
static cl::opt< bool > SLPReVec("slp-revec", cl::init(false), cl::Hidden, cl::desc("Enable vectorization for wider vector utilization"))
static bool isValidForAlternation(unsigned Opcode)
static bool areAllOperandsNonInsts(Value *V)
Checks if the provided value does not require scheduling.
static SmallVector< Constant * > replicateMask(ArrayRef< Constant * > Val, unsigned VF)
Replicates the given Val VF times.
static SmallVector< Type * > buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID, const unsigned VF, unsigned MinBW, const TargetTransformInfo *TTI)
Builds the arguments types vector for the given call instruction with the given ID for the specified ...
static cl::opt< int > RootLookAheadMaxDepth("slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for searching best rooting option"))
static const unsigned AliasedCheckLimit
static Type * getValueType(Value *V)
Returns the type of the given value/instruction V.
static std::string shortBundleName(ArrayRef< Value * > VL, int Idx=-1)
Print a short descriptor of the instruction bundle suitable for debug output.
static cl::opt< unsigned > LoopAwareTripCount("slp-cost-loop-trip-count", cl::init(2), cl::Hidden, cl::desc("Loop trip count, considered by the cost model during " "modeling (0=loops are ignored and considered flat code)"))
static LLVM_DUMP_METHOD void dumpOrder(const BoUpSLP::OrdersType &Order)
static bool isValidElementType(Type *Ty)
Predicate for the element types that the SLP vectorizer supports.
static Instruction * getReductionInstr(const DominatorTree *DT, PHINode *P, BasicBlock *ParentBB, LoopInfo *LI)
Try and get a reduction instruction from a phi node.
static SmallVector< int > calculateShufflevectorMask(ArrayRef< Value * > VL)
static bool doesNotNeedToBeScheduled(Value *V)
Checks if the specified value does not require scheduling.
static bool allSameType(ArrayRef< Value * > VL)
static MemoryLocation getLocation(Instruction *I)
static bool allSameBlock(ArrayRef< Value * > VL)
static unsigned getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not greater than Sz, which forms type,...
static bool areTwoInsertFromSameBuildVector(InsertElementInst *VU, InsertElementInst *V, function_ref< Value *(InsertElementInst *)> GetBaseOperand)
Check if two insertelement instructions are from the same buildvector.
static bool arePointersCompatible(Value *Ptr1, Value *Ptr2, const TargetLibraryInfo &TLI, bool CompareOpcodes=true)
static std::pair< InstructionCost, InstructionCost > getGEPCosts(const TargetTransformInfo &TTI, ArrayRef< Value * > Ptrs, Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind, Type *ScalarTy, VectorType *VecTy)
Calculate the scalar and the vector costs from vectorizing set of GEPs.
static SmallBitVector isUndefVector(const Value *V, const SmallBitVector &UseMask={})
Checks if the given value is actually an undefined constant vector.
static Instruction * findInstructionWithOpcode(ArrayRef< Value * > VL, unsigned Opcode)
Find an instruction with a specific opcode in VL.
static const SCEV * calculateRtStride(ArrayRef< Value * > PointerOps, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices, SmallVectorImpl< int64_t > &Coeffs)
Checks if the provided list of pointers Pointers represents the strided pointers for type ElemTy.
static InstructionCost getExtractWithExtendCost(const TargetTransformInfo &TTI, unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput)
This is similar to TargetTransformInfo::getExtractWithExtendCost, but if Dst is a FixedVectorType,...
static InstructionsState getSameOpcode(ArrayRef< Value * > VL, const TargetLibraryInfo &TLI)
static cl::opt< int > ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden, cl::desc("Limit the size of the SLP scheduling region per block"))
Limits the size of scheduling regions in a block.
static std::pair< Instruction *, Instruction * > getMainAltOpsNoStateVL(ArrayRef< Value * > VL)
Returns main/alternate instructions for the given VL.
static const Loop * findInnermostNonInvariantLoop(const Loop *L, ArrayRef< Value * > VL)
Find the innermost loop starting from L, for which at least a single value in VL is not invariant.
static Instruction * tryGetSecondaryReductionRoot(PHINode *Phi, Instruction *Root)
We could have an initial reduction that is not an add.
static RecurKind getRdxKind(Value *V)
Gets recurrence kind from the specified value.
static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1)
static void gatherPossiblyVectorizableLoads(const BoUpSLP &R, ArrayRef< Value * > VL, const DataLayout &DL, ScalarEvolution &SE, const TargetTransformInfo &TTI, SmallVectorImpl< SmallVector< std::pair< LoadInst *, int64_t > > > &GatheredLoads, bool AddNew=true)
Tries to find subvector of loads and builds new vector of only loads if can be profitable.
static cl::opt< int > MinVectorRegSizeOption("slp-min-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static std::optional< TargetTransformInfo::ShuffleKind > isFixedVectorShuffle(ArrayRef< Value * > VL, SmallVectorImpl< int > &Mask, AssumptionCache *AC)
Checks if the vector of instructions can be represented as a shuffle, like: x0 = extractelement <4 x ...
static bool tryToVectorizeSequence(SmallVectorImpl< T * > &Incoming, function_ref< bool(T *, T *)> Comparator, function_ref< bool(ArrayRef< T * >, T *)> AreCompatible, function_ref< bool(ArrayRef< T * >, bool)> TryToVectorizeHelper, bool MaxVFOnly, BoUpSLP &R)
static std::optional< unsigned > getAggregateSize(Instruction *InsertInst)
static unsigned getNumberOfParts(const TargetTransformInfo &TTI, VectorType *VecTy, const unsigned Limit=std::numeric_limits< unsigned >::max())
Returns number of parts, the type VecTy will be split at the codegen phase.
static std::optional< unsigned > getInsertExtractIndex(const Value *Inst, unsigned Offset)
static cl::opt< unsigned > RecursionMaxDepth("slp-recursion-max-depth", cl::init(12), cl::Hidden, cl::desc("Limit the recursion depth when building a vectorizable tree"))
static bool tryToFindDuplicates(SmallVectorImpl< Value * > &VL, SmallVectorImpl< int > &ReuseShuffleIndices, const TargetTransformInfo &TTI, const TargetLibraryInfo &TLI, const InstructionsState &S, const BoUpSLP::EdgeInfo &UserTreeIdx, bool TryPad=false)
Checks that every instruction appears once in the list and if not, packs them, building ReuseShuffleI...
static Align computeCommonAlignment(ArrayRef< Value * > VL)
Calculates minimal alignment as a common alignment.
static void addMask(SmallVectorImpl< int > &Mask, ArrayRef< int > SubMask, bool ExtendingManyInputs=false)
Shuffles Mask in accordance with the given SubMask.
static void fixupOrderingIndices(MutableArrayRef< unsigned > Order)
Order may have elements assigned special value (size) which is out of bounds.
static Value * createExtractVector(IRBuilderBase &Builder, Value *Vec, unsigned SubVecVF, unsigned Index)
Generates subvector extract using Generator or using default shuffle.
static cl::opt< bool > DisableTreeReorder("slp-disable-tree-reorder", cl::init(false), cl::Hidden, cl::desc("Disable tree reordering even if it is " "profitable. Used for testing only."))
static Instruction * getNonPhiOperand(Instruction *I, PHINode *Phi)
Returns the first operand of I that does not match Phi.
static InstructionCost getVectorInstrCost(const TargetTransformInfo &TTI, Type *ScalarTy, unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Scalar, ArrayRef< std::tuple< Value *, User *, int > > ScalarUserAndIdx)
This is similar to TargetTransformInfo::getVectorInstrCost, but if ScalarTy is a FixedVectorType,...
static SmallBitVector getAltInstrMask(ArrayRef< Value * > VL, Type *ScalarTy, unsigned Opcode0, unsigned Opcode1)
static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI, const DominatorTree &DT)
Compare two cmp instructions.
static void inversePermutation(ArrayRef< unsigned > Indices, SmallVectorImpl< int > &Mask)
static bool isReverseOrder(ArrayRef< unsigned > Order)
Check if Order represents reverse order.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
This file defines generic set operations that may be used on set's of different types,...
This file implements a set that has insertion order iteration characteristics.
This file implements the SmallBitVector class.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallString class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
static SymbolRef::Type getType(const Symbol *Sym)
static const int BlockSize
LocallyHashedType DenseMapInfo< LocallyHashedType >::Empty
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
static SmallVector< VPValue *, 4 > getOperands(ArrayRef< VPValue * > Values, unsigned OperandIndex)
static const uint32_t IV[8]
Merges shuffle masks and emits final shuffle instruction, if required.
InstructionCost finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &, function_ref< Value *(Value *, Value *, ArrayRef< int >)>)> Action={})
Finalize emission of the shuffles.
void add(const TreeEntry &E1, ArrayRef< int > Mask)
ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI, ArrayRef< Value * > VectorizedVals, BoUpSLP &R, SmallPtrSetImpl< Value * > &CheckedExtracts)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
std::optional< InstructionCost > needToDelay(const TreeEntry *, ArrayRef< SmallVector< const TreeEntry * > >) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
InstructionCost createFreeze(InstructionCost Cost)
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
void resetForSameNode()
Reset the builder to handle perfect diamond match.
void add(Value *V1, ArrayRef< int > Mask, bool ForExtracts=false)
Adds another one input vector and the mask for the shuffling.
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
Merges shuffle masks and emits final shuffle instruction, if required.
Value * createFreeze(Value *V)
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
void resetForSameNode()
Reset the builder to handle perfect diamond match.
void addOrdered(Value *V1, ArrayRef< unsigned > Order)
Adds another one input vector and the mask for the shuffling.
Value * finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &, function_ref< Value *(Value *, Value *, ArrayRef< int >)>)> Action={})
Finalize emission of the shuffles.
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
Adjusts extractelements after reusing them.
void add(Value *V1, ArrayRef< int > Mask, bool=false)
Adds another one input vector and the mask for the shuffling.
std::optional< Value * > needToDelay(const TreeEntry *E, ArrayRef< SmallVector< const TreeEntry * > > Deps) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
~ShuffleInstructionBuilder()
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Adds single input vector (in form of tree entry) and the mask for its shuffling.
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
Adds 2 input vectors (in form of tree entries) and the mask for their shuffling.
ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
A manager for alias analyses.
Class for arbitrary precision integers.
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
void clearBit(unsigned BitPosition)
Set a given bit to 0.
uint64_t getZExtValue() const
Get zero extended value.
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
unsigned getBitWidth() const
Return the number of bits in the APInt.
bool ult(const APInt &RHS) const
Unsigned less than comparison.
bool isNegative() const
Determine sign of this APInt.
void clearAllBits()
Set every bit to 0.
unsigned logBase2() const
void setAllBits()
Set every bit to 1.
void setBits(unsigned loBit, unsigned hiBit)
Set the bits from loBit (inclusive) to hiBit (exclusive) to 1.
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
bool isOne() const
Determine if this is a value of 1.
static APInt getBitsSetFrom(unsigned numBits, unsigned loBit)
Constructs an APInt value that has a contiguous range of bits set.
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
PassT::Result * getCachedResult(IRUnitT &IR) const
Get the cached result of an analysis pass for a given IR unit.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
bool equals(ArrayRef RHS) const
equals - Check for element-wise equality.
const T & back() const
back - Get the last element.
ArrayRef< T > take_front(size_t N=1) const
Return a copy of *this with only the first N elements.
ArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
const T & front() const
front - Get the first element.
size_t size() const
size - Get the array size.
bool empty() const
empty - Check if the array is empty.
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
const T & consume_front()
consume_front() - Returns the first element and drops it from ArrayRef.
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
LLVM Basic Block Representation.
iterator begin()
Instruction iterator methods.
const Function * getParent() const
Return the enclosing method, or null if none.
InstListType::reverse_iterator reverse_iterator
InstListType::iterator iterator
Instruction iterators...
LLVM_ABI const_iterator getFirstNonPHIOrDbgOrAlloca() const
Returns an iterator to the first instruction in this block that is not a PHINode, a debug intrinsic,...
InstListType::const_reverse_iterator const_reverse_iterator
bool isEHPad() const
Return true if this basic block is an exception handling block.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Represents analyses that only rely on functions' control flow.
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
unsigned getBundleOperandsEndIndex() const
Return the index of the last bundle operand in the Use array.
LLVM_ABI void getOperandBundlesAsDefs(SmallVectorImpl< OperandBundleDef > &Defs) const
Return the list of operand bundles attached to this instruction as a vector of OperandBundleDefs.
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
bool hasIdenticalOperandBundleSchema(const CallBase &Other) const
Return true if Other has the same sequence of operand bundle tags with the same number of operands on...
unsigned getBundleOperandsStartIndex() const
Return the index of the first bundle operand in the Use array.
Value * getArgOperand(unsigned i) const
FunctionType * getFunctionType() const
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
unsigned arg_size() const
bool hasOperandBundles() const
Return true if this User has any operand bundles.
This class represents a function call, abstracting a target machine's calling convention.
This is the base class for all instructions that perform data casts.
This class is the base class for the comparison instructions.
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
@ ICMP_SLT
signed less than
@ ICMP_SLE
signed less or equal
@ ICMP_UGE
unsigned greater or equal
@ ICMP_UGT
unsigned greater than
@ ICMP_SGT
signed greater than
@ ICMP_ULT
unsigned less than
@ ICMP_SGE
signed greater or equal
@ ICMP_ULE
unsigned less or equal
Predicate getSwappedPredicate() const
For example, EQ->EQ, SLE->SGE, ULT->UGT, OEQ->OEQ, ULE->UGE, OLT->OGT, etc.
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Predicate getPredicate() const
Return the predicate for this instruction.
static LLVM_ABI std::optional< CmpPredicate > getMatching(CmpPredicate A, CmpPredicate B)
Compares two CmpPredicates taking samesign into account and returns the canonicalized CmpPredicate if...
bool hasSameSign() const
Query samesign information, for optimizations.
static LLVM_ABI Constant * getIntToPtr(Constant *C, Type *Ty, bool OnlyIfReduced=false)
static LLVM_ABI Constant * getBinOpIdentity(unsigned Opcode, Type *Ty, bool AllowRHSConstant=false, bool NSZ=false)
Return the identity constant for a binary opcode.
This is the shared class of boolean and integer constants.
static LLVM_ABI ConstantInt * getTrue(LLVMContext &Context)
static ConstantInt * getSigned(IntegerType *Ty, int64_t V, bool ImplicitTrunc=false)
Return a ConstantInt with the specified value for the specified type.
static LLVM_ABI ConstantInt * getFalse(LLVMContext &Context)
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
const APInt & getValue() const
Return the constant as an APInt value reference.
static LLVM_ABI Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
static LLVM_ABI Constant * get(ArrayRef< Constant * > V)
This is an important base class in LLVM.
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
static bool shouldExecute(CounterInfo &Counter)
static DebugLoc getUnknown()
An analysis that produces DemandedBits for a function.
ValueT & at(const_arg_type_t< KeyT > Val)
at - Return the entry for the specified key, or abort if no such entry exists.
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
iterator find(const_arg_type_t< KeyT > Val)
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
bool erase(const KeyT &Val)
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Implements a dense probed hash-table based set.
Base class for the actual dominator tree node.
unsigned getDFSNumIn() const
getDFSNumIn/getDFSNumOut - These return the DFS visitation order for nodes in the dominator tree.
Analysis pass which computes a DominatorTree.
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
LLVM_ABI bool isReachableFromEntry(const Use &U) const
Provide an overload for a Use.
LLVM_ABI bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
static constexpr ElementCount getFixed(ScalarTy MinVal)
Convenience struct for specifying and reasoning about fast-math flags.
bool allowReassoc() const
Flag queries.
bool allowContract() const
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
ArrayRef< Type * > params() const
Type * getReturnType() const
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
An instruction for type-safe pointer arithmetic to access elements of arrays and structs
For the node iterator we just need to turn the TreeEntry iterator into a TreeEntry* iterator so that ...
nodes_iterator operator++()
nodes_iterator(const ItTy &It2)
bool operator!=(const nodes_iterator &N2) const
Common base class shared among various IRBuilders.
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
IntegerType * getInt1Ty()
Fetch the type representing a single bit.
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Value * CreateFreeze(Value *V, const Twine &Name="")
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
LLVM_ABI Value * CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 2 operands which is mangled on the first type.
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
LLVM_ABI CallInst * CreateUnaryIntrinsic(Intrinsic::ID ID, Value *V, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 1 operand which is mangled on its type.
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
LLVM_ABI Value * CreateSelectWithUnknownProfile(Value *C, Value *True, Value *False, StringRef PassName, const Twine &Name="")
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Value * CreateFMul(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
This instruction inserts a single (scalar) element into a VectorType value.
VectorType * getType() const
Overload to return most specific vector type.
static InstructionCost getInvalid(CostType Val=0)
bool mayReadOrWriteMemory() const
Return true if this instruction may read or write memory.
LLVM_ABI bool mayWriteToMemory() const LLVM_READONLY
Return true if this instruction may modify memory.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
LLVM_ABI void moveAfter(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
LLVM_ABI bool isCommutative() const LLVM_READONLY
Return true if the instruction is commutative:
Instruction * user_back()
Specialize the methods defined in Value, as we know that an instruction can only be used by other ins...
LLVM_ABI bool comesBefore(const Instruction *Other) const
Given an instruction Other in the same basic block as this instruction, return true if this instructi...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
LLVM_ABI bool isIdenticalTo(const Instruction *I) const LLVM_READONLY
Return true if the specified instruction is exactly identical to the current one.
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
const SmallVectorImpl< Type * > & getArgTypes() const
An instruction for reading from memory.
Value * getPointerOperand()
Analysis pass that exposes the LoopInfo for a function.
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
Represents a single loop in the control flow graph.
This class implements a map that also provides access to all stored values in a deterministic order.
VectorType takeVector()
Clear the MapVector and return the underlying vector.
iterator find(const KeyT &Key)
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
ValueT lookup(const KeyT &Key) const
std::pair< KeyT, ValueT > & front()
Information for memory intrinsic cost model.
This is the common base class for memset/memcpy/memmove.
Representation for a specific memory location.
static LLVM_ABI MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
const Value * Ptr
The address of the start of the location.
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
T & front() const
front - Get the first element.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
BasicBlock * getIncomingBlock(unsigned i) const
Return incoming basic block number i.
Value * getIncomingValue(unsigned i) const
Return incoming value number i.
unsigned getNumIncomingValues() const
Return the number of incoming edges.
Pass interface - Implemented by all 'passes'.
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
A discriminated union of two or more pointer types, with the discriminator in the low bit of the poin...
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
PriorityQueue - This class behaves like std::priority_queue and provides a few additional convenience...
void clear()
clear - Erase all elements from the queue.
unsigned getOpcode() const
static bool isIntMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is an integer min/max kind.
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
This class represents an analyzed expression in the program.
LLVM_ABI bool isZero() const
Return true if the expression is a constant zero.
LLVM_ABI bool isNonConstantNegative() const
Return true if the specified scev is negated, but not a constant.
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
LLVM_ABI const SCEV * getConstant(ConstantInt *V)
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI const SCEV * getMinusSCEV(SCEVUse LHS, SCEVUse RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
LLVM_ABI const SCEV * getMulExpr(SmallVectorImpl< SCEVUse > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
LLVM_ABI unsigned getSmallConstantTripCount(const Loop *L)
Returns the exact trip count of the loop if we can compute it, and the result is a small constant.
LLVM_ABI const SCEV * getAddExpr(SmallVectorImpl< SCEVUse > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
LLVM_ABI const SCEV * getUDivExactExpr(SCEVUse LHS, SCEVUse RHS)
Get a canonical unsigned division expression, or something simpler if possible.
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
ArrayRef< value_type > getArrayRef() const
size_type size() const
Determine the number of elements in the SetVector.
const value_type & front() const
Return the first element of the SetVector.
void insert_range(Range &&R)
Vector takeVector()
Clear the SetVector and return the underlying vector.
bool contains(const_arg_type key) const
Check if the SetVector contains the given key.
void clear()
Completely clear the SetVector.
bool empty() const
Determine if the SetVector is empty or not.
bool insert(const value_type &X)
Insert a new element into the SetVector.
static LLVM_ABI bool isZeroEltSplatMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses all elements with the same value as the first element of exa...
static LLVM_ABI bool isOneUseSingleSourceMask(ArrayRef< int > Mask, int VF)
Return true if this shuffle mask represents "clustered" mask of size VF, i.e.
static LLVM_ABI bool isDeInterleaveMaskOfFactor(ArrayRef< int > Mask, unsigned Factor, unsigned &Index)
Check if the mask is a DE-interleave mask of the given factor Factor like: <Index,...
static LLVM_ABI bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
static LLVM_ABI bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static LLVM_ABI bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
static LLVM_ABI bool isInsertSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &NumSubElts, int &Index)
Return true if this shuffle mask is an insert subvector mask.
static LLVM_ABI bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
int find_first() const
Returns the index of the first set bit, -1 if none of the bits are set.
bool test(unsigned Idx) const
int find_next(unsigned Prev) const
Returns the index of the next set bit following the "Prev" bit.
bool all() const
Returns true if all bits are set.
size_type size() const
Returns the number of bits in this bitvector.
bool any() const
Returns true if any bit is set.
size_type count() const
Returns the number of bits which are set.
bool none() const
Returns true if none of the bits are set.
Implements a dense probed hash-table based set with some number of buckets stored inline.
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
bool erase(PtrType Ptr)
Remove pointer from the set.
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
void insert_range(Range &&R)
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
A SetVector that performs no allocations if smaller than a certain size.
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
bool contains(const T &V) const
Check if the SmallSet contains the given element.
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void assign(size_type NumElts, ValueParamT Elt)
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
iterator erase(const_iterator CI)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void swap(SmallVectorImpl &RHS)
void push_back(const T &Elt)
reverse_iterator rbegin()
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
Type * getPointerOperandType() const
Value * getValueOperand()
Value * getPointerOperand()
StringRef - Represent a constant reference to a string, i.e.
TargetFolder - Create constants with target dependent folding.
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
The instances of the Type class are immutable: once they are created, they are never changed.
LLVM_ABI bool isEmptyTy() const
Return true if this type is empty, that is, it has no elements or all of its elements are empty.
bool isVectorTy() const
True if this is an instance of VectorType.
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
bool isPointerTy() const
True if this is an instance of PointerType.
LLVM_ABI unsigned getStructNumElements() const
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
bool isSingleValueType() const
Return true if the type is a valid type for a register in codegen.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
LLVM_ABI Type * getWithNewType(Type *EltTy) const
Given vector type, change the element type, whilst keeping the old number of elements.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
LLVM_ABI void print(raw_ostream &O, bool IsForDebug=false, bool NoDetails=false) const
Print the current type.
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
bool isIntegerTy() const
True if this is an instance of IntegerType.
TypeID getTypeID() const
Return the type id for the type.
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
bool isVoidTy() const
Return true if this is 'void'.
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
A Use represents the edge between a Value definition and its users.
User(Type *ty, unsigned vty, AllocInfo AllocInfo)
LLVM_ABI bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
Value * getOperand(unsigned i) const
unsigned getNumOperands() const
iterator_range< value_op_iterator > operand_values()
The Vector Function Database.
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
user_iterator user_begin()
bool hasOneUse() const
Return true if there is exactly one use of this value.
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
LLVMContext & getContext() const
All values hold a context through their type.
iterator_range< user_iterator > users()
unsigned getValueID() const
Return an ID for the concrete type of this object.
bool hasUseList() const
Check if this Value has a use-list.
LLVM_ABI bool hasNUsesOrMore(unsigned N) const
Return true if this value has N uses or more.
LLVM_ABI bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
LLVM_ABI User * getUniqueUndroppableUser()
Return true if there is exactly one unique user of this value that cannot be dropped (that user can h...
LLVM_ABI unsigned getNumUses() const
This method computes the number of uses of this Value.
iterator_range< use_iterator > uses()
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Base class of all SIMD vector types.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
static LLVM_ABI bool isValidElementType(Type *ElemTy)
Return true if the specified type is valid as a element type.
Type * getElementType() const
std::pair< iterator, bool > insert(const ValueT &V)
iterator find(const_arg_type_t< ValueT > V)
void insert_range(Range &&R)
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
An efficient, type-erasing, non-owning reference to a callable.
An opaque object representing a hash code.
const ParentTy * getParent() const
self_iterator getIterator()
NodeTy * getNextNode()
Get the next node, or nullptr for the list tail.
SmallVector< BoUpSLP::EdgeInfo, 1 >::iterator I
iterator_adaptor_base()=default
A range adaptor for a pair of iterators.
This class implements an extremely fast bulk output stream that can only output to a stream.
raw_ostream & indent(unsigned NumSpaces)
indent - Insert 'NumSpaces' spaces.
A raw_ostream that writes to an std::string.
A raw_ostream that writes to an SmallVector or SmallString.
A helper class used for scoring candidates for two consecutive lanes.
static const int ScoreReversedLoads
Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
static const int ScoreFail
Score for failing to find a decent match.
static const int ScoreUndef
Matching with an undef is preferable to failing.
int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2, ArrayRef< Value * > MainAltOps) const
static const int ScoreConsecutiveExtracts
ExtractElementInst from same vector and consecutive indexes.
int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1, Instruction *U2, int CurrLevel, ArrayRef< Value * > MainAltOps) const
Go through the operands of LHS and RHS recursively until MaxLevel, and return the cumulative score.
static const int ScoreReversedExtracts
ExtractElementInst from same vector and reversed indices.
LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R, int NumLanes, int MaxLevel)
static const int ScoreSplat
Identical instructions (a.k.a. splat or broadcast).
static const int ScoreConstants
Constants.
static const int ScoreMaskedGatherCandidate
A load candidate for masked gather.
static const int ScoreSplatLoads
The same load multiple times.
static const int ScoreAllUserVectorized
Score if all users are vectorized.
static const int ScoreSameOpcode
Instructions with the same opcode.
static const int ScoreAltOpcodes
Instructions with alt opcodes (e.g, add + sub).
static const int ScoreConsecutiveLoads
Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
A helper data structure to hold the operands of a vector of instructions.
LLVM_DUMP_METHOD raw_ostream & print(raw_ostream &OS) const
static LLVM_DUMP_METHOD void dumpMode(ReorderingMode RMode)
Debug print.
LLVM_DUMP_METHOD void dump() const
Debug print.
static LLVM_DUMP_METHOD raw_ostream & printMode(ReorderingMode RMode, raw_ostream &OS)
friend raw_ostream & operator<<(raw_ostream &OS, ReorderingMode RMode)
ValueList getVL(unsigned OpIdx) const
Returns a value vector with the operands across all lanes for the operand at OpIdx.
static LLVM_DUMP_METHOD StringRef getModeStr(ReorderingMode RMode)
VLOperands(ArrayRef< Value * > RootVL, ArrayRef< ValueList > Operands, const InstructionsState &S, const BoUpSLP &R)
Initialize with all the operands of the instruction vector RootVL.
Bottom Up SLP Vectorizer.
static bool isIdentityOrder(ArrayRef< unsigned > Order)
Does this non-empty order represent an identity order?
bool isProfitableToReorder() const
Checks if it is profitable to reorder the current tree.
void eraseInstruction(Instruction *I)
Removes an instruction from its block and eventually deletes it.
bool isAnalyzedReductionRoot(Instruction *I) const
Checks if the instruction was already analyzed for being possible reduction root.
bool areAnalyzedReductionVals(ArrayRef< Value * > VL) const
Checks if the provided list of reduced values was checked already for vectorization.
LoadsState
Tracks the state we can represent the loads in the given sequence.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::ScheduleEntity &SE)
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::ScheduleBundle &Bundle)
void analyzedReductionVals(ArrayRef< Value * > VL)
Adds the list of reduced values to list of already checked values for the vectorization.
std::optional< OrdersType > findPartiallyOrderedLoads(const TreeEntry &TE)
Sort loads into increasing pointers offsets to allow greater clustering.
unsigned getMaxVecRegSize() const
OptimizationRemarkEmitter * getORE()
void reorderTopToBottom()
Reorders the current graph to the most profitable order starting from the root node to the leaf nodes...
bool isReducedCmpBitcastRoot() const
Returns true if the tree results in the reduced cmp bitcast root.
std::optional< std::pair< Type *, bool > > getRootNodeTypeWithNoCast() const
Returns the type/is-signed info for the root node in the graph without casting.
unsigned getTreeSize() const
void reorderBottomToTop(bool IgnoreReorder=false)
Reorders the current graph to the most profitable order starting from leaves to the root.
InstructionCost getSpillCost()
bool isVectorized(const Value *V) const
Check if the value is vectorized in the tree.
bool isAnyGathered(const SmallDenseSet< Value * > &Vals) const
Checks if the given value is gathered in one of the nodes.
unsigned getCanonicalGraphSize() const
Returns the base graph size, before any transformations.
bool isStridedLoad(ArrayRef< Value * > PointerOps, Type *ScalarTy, Align Alignment, const int64_t Diff, const size_t Sz) const
Checks if strided loads can be generated out of VL loads with pointers PointerOps:
SmallVector< StoreInst *, 8 > StoreList
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
ArrayRef< Value * > getRootNodeScalars() const
Return the scalars of the root node.
unsigned getMinVecRegSize() const
void removeInstructionsAndOperands(ArrayRef< T * > DeadVals, ArrayRef< std::tuple< WeakTrackingVH, unsigned, bool, bool > > VectorValuesAndScales)
Remove instructions from the parent function and clear the operands of DeadVals instructions,...
unsigned getVectorElementSize(Value *V)
unsigned getMinVF(unsigned Sz) const
void clearReductionData()
Clear the list of the analyzed reduction root instructions.
LoadsState canVectorizeLoads(ArrayRef< Value * > VL, const Value *VL0, SmallVectorImpl< unsigned > &Order, SmallVectorImpl< Value * > &PointerOps, StridedPtrInfo &SPtrInfo, unsigned *BestVF=nullptr, bool TryRecursiveCheck=true) const
Checks if the given array of loads can be represented as a vectorized, scatter or just simple gather.
void computeMinimumValueSizes()
Compute the minimum type sizes required to represent the entries in a vectorizable tree.
FixedVectorType * getReductionType() const
Returns reduction type after minbitdth analysis.
SmallVector< unsigned, 4 > OrdersType
SmallVector< Instruction *, 16 > InstrList
std::pair< std::optional< int >, int > findBestRootPair(ArrayRef< std::pair< Value *, Value * > > Candidates, int Limit=LookAheadHeuristics::ScoreFail) const
Evaluate each pair in Candidates and return index into Candidates for a pair which have highest score...
void analyzedReductionRoot(Instruction *I)
Register given instruction as already analyzed for being possible reduction root.
bool isDeleted(Instruction *I) const
Checks if the instruction is marked for deletion.
InstructionCost getTreeCost(InstructionCost TreeCost, ArrayRef< Value * > VectorizedVals={}, InstructionCost ReductionCost=TTI::TCC_Free)
void transformNodes()
Transforms graph nodes to target specific representations, if profitable.
bool analyzeRtStrideCandidate(ArrayRef< Value * > PointerOps, Type *ScalarTy, Align CommonAlignment, SmallVectorImpl< unsigned > &SortedIndices, StridedPtrInfo &SPtrInfo) const
Return true if an array of scalar loads can be replaced with a strided load (with run-time stride).
bool isSignedMinBitwidthRootNode() const
Checks if the root graph node can be emitted with narrower bitwidth at codegen and returns it signedn...
bool canBuildSplitNode(ArrayRef< Value * > VL, const InstructionsState &LocalState, SmallVectorImpl< Value * > &Op1, SmallVectorImpl< Value * > &Op2, OrdersType &ReorderIndices) const
Checks if it is legal and profitable to build SplitVectorize node for the given VL.
void buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues={})
Builds external uses of the vectorized scalars, i.e.
bool isTreeTinyAndNotFullyVectorizable(bool ForReduction=false) const
void registerNonVectorizableLoads(ArrayRef< T * > VL)
Registers non-vectorizable sequence of loads.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::ScheduleData &SD)
unsigned canMapToVector(Type *T) const
Check if homogeneous aggregate is isomorphic to some VectorType.
void deleteTree()
Clear the internal data structures that are created by 'buildTree'.
void buildTree(ArrayRef< Value * > Roots, const SmallDenseSet< Value * > &UserIgnoreLst)
Construct a vectorizable tree that starts at Roots, ignoring users for the purpose of scheduling and ...
SmallDenseSet< Value *, 4 > ExtraValueToDebugLocsMap
bool isTreeNotExtendable() const
Checks if the graph and all its subgraphs cannot be better vectorized.
bool areKnownNonVectorizableLoads(ArrayRef< T * > VL) const
Checks if the given loads sequence is known as not vectorizable.
SmallVector< Value *, 8 > ValueList
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB, const DataLayout *DL, OptimizationRemarkEmitter *ORE)
bool isReducedBitcastRoot() const
Returns true if the tree results in one of the reduced bitcasts variants.
bool isGathered(const Value *V) const
Checks if the given value is gathered in one of the nodes.
std::optional< OrdersType > findReusedOrderedScalars(const TreeEntry &TE, bool TopToBottom, bool IgnoreReorder)
Checks if the specified gather tree entry TE can be represented as a shuffled vector entry + (possibl...
bool analyzeConstantStrideCandidate(const ArrayRef< Value * > PointerOps, Type *ElemTy, Align Alignment, const SmallVectorImpl< unsigned > &SortedIndices, const int64_t Diff, Value *Ptr0, Value *PtrN, StridedPtrInfo &SPtrInfo) const
Return true if an array of scalar loads can be replaced with a strided load (with constant stride).
bool isNotScheduled(const Value *V) const
Checks if the specified value was not schedule.
Value * vectorizeTree()
Vectorize the tree that starts with the elements in VL.
InstructionCost calculateTreeCostAndTrimNonProfitable(ArrayRef< Value * > VectorizedVals={})
Calculates the cost of the subtrees, trims non-profitable ones and returns final cost.
std::optional< OrdersType > getReorderingData(const TreeEntry &TE, bool TopToBottom, bool IgnoreReorder)
Gets reordering data for the given tree entry.
SmallPtrSet< Value *, 16 > ValueSet
void optimizeGatherSequence()
Perform LICM and CSE on the newly generated gather sequences.
Function * getVectorizedFunction(const VFShape &Shape) const
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char IsConst[]
Key for Kernel::Arg::Metadata::mIsConst.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
@ C
The default llvm calling convention, compatible with C.
@ BasicBlock
Various leaf nodes.
bool isBitwiseLogicOp(unsigned Opcode)
Whether this is bitwise logic opcode.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMaxNum(const Opnd0 &Op0, const Opnd1 &Op1)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
DisjointOr_match< LHS, RHS > m_DisjointOr(const LHS &L, const RHS &R)
auto match_fn(const Pattern &P)
A match functor that can be used as a UnaryPredicate in functional algorithms like all_of.
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMinimum(const Opnd0 &Op0, const Opnd1 &Op1)
MaxMin_match< ICmpInst, LHS, RHS, smin_pred_ty > m_SMin(const LHS &L, const RHS &R)
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMaximum(const Opnd0 &Op0, const Opnd1 &Op1)
BinaryOp_match< LHS, RHS, Instruction::FAdd > m_FAdd(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
MaxMin_match< ICmpInst, LHS, RHS, umax_pred_ty > m_UMax(const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
NoWrapTrunc_match< OpTy, TruncInst::NoUnsignedWrap > m_NUWTrunc(const OpTy &Op)
Matches trunc nuw.
MaxMin_match< ICmpInst, LHS, RHS, smax_pred_ty > m_SMax(const LHS &L, const RHS &R)
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
auto m_Undef()
Match an arbitrary undef constant.
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMinNum(const Opnd0 &Op0, const Opnd1 &Op1)
BinaryOp_match< LHS, RHS, Instruction::Or > m_Or(const LHS &L, const RHS &R)
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
MaxMin_match< ICmpInst, LHS, RHS, umin_pred_ty > m_UMin(const LHS &L, const RHS &R)
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
initializer< Ty > init(const Ty &Val)
unsigned combineHashValue(unsigned a, unsigned b)
Simplistic combination of 32-bit hash values into 32-bit hash values.
@ User
could "use" a pointer
DiagnosticInfoOptimizationBase::Argument NV
NodeAddr< CodeNode * > Code
friend class Instruction
Iterator for Instructions in a `BasicBlock.
LLVM_ABI iterator begin() const
LLVM_ABI Instruction & front() const
A private "module" namespace for types and utilities used by this pass.
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
LLVM_ABI Value * createSimpleReduction(IRBuilderBase &B, Value *Src, RecurKind RdxKind)
Create a reduction of the given vector.
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
constexpr auto not_equal_to(T &&Arg)
Functor variant of std::not_equal_to that can be used as a UnaryPredicate in functional algorithms li...
FunctionAddr VTableAddr Value
void stable_sort(R &&Range)
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
void fill(R &&Range, T &&Value)
Provide wrappers to std::fill which take ranges instead of having to pass begin/end explicitly.
UnaryFunction for_each(R &&Range, UnaryFunction F)
Provide wrappers to std::for_each which take ranges instead of having to pass begin/end explicitly.
LLVM_ABI std::optional< unsigned > getLoopEstimatedTripCount(Loop *L, unsigned *EstimatedLoopInvocationWeight=nullptr)
Return either:
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
hash_code hash_value(const FixedPointSemantics &Val)
LLVM_ABI Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
MaybeAlign getAlign(const CallInst &I, unsigned Index)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
LLVM_ABI bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
LLVM_ABI Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
@ LLVM_MARK_AS_BITMASK_ENUM
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
auto pred_end(const MachineBasicBlock *BB)
void set_intersect(S1Ty &S1, const S2Ty &S2)
set_intersect(A, B) - Compute A := A ^ B Identical to set_intersection, except that it works on set<>...
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
LLVM_ABI bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
LLVM_ABI void salvageDebugInfo(const MachineRegisterInfo &MRI, MachineInstr &MI)
Assuming the instruction MI is going to be deleted, attempt to salvage debug users of MI by writing t...
scope_exit(Callable) -> scope_exit< Callable >
constexpr from_range_t from_range
LLVM_ABI std::pair< Intrinsic::ID, bool > canConvertToMinOrMaxIntrinsic(ArrayRef< Value * > VL)
Check if the values in VL are select instructions that can be converted to a min or max (vector) intr...
auto dyn_cast_if_present(const Y &Val)
dyn_cast_if_present<X> - Functionally identical to dyn_cast, except that a null (or none in the case ...
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
bool set_is_subset(const S1Ty &S1, const S2Ty &S2)
set_is_subset(A, B) - Return true iff A in B
void interleaveComma(const Container &c, StreamT &os, UnaryFunctor each_fn)
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
auto cast_or_null(const Y &Val)
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value and is Skew mod Align.
iterator_range< po_iterator< T > > post_order(const T &G)
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
LLVM_ABI bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr, bool UseVariableInfo=true, bool IgnoreUBImplyingAttrs=true)
Return true if the instruction does not have any effects besides calculating the result and does not ...
LLVM_ABI Instruction * propagateMetadata(Instruction *I, ArrayRef< Value * > VL)
Specifically, let Kinds = [MD_tbaa, MD_alias_scope, MD_noalias, MD_fpmath, MD_nontemporal,...
bool isa_and_nonnull(const Y &Val)
auto binary_search(R &&Range, T &&Value)
Provide wrappers to std::binary_search which take ranges instead of having to pass begin/end explicit...
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
constexpr auto equal_to(T &&Arg)
Functor variant of std::equal_to that can be used as a UnaryPredicate in functional algorithms like a...
bool isGather(IntrinsicInst *IntInst)
const Value * getPointerOperand(const Value *V)
A helper function that returns the pointer operand of a load, store or GEP instruction.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
LLVM_ABI bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &SQ, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
DomTreeNodeBase< BasicBlock > DomTreeNode
auto dyn_cast_or_null(const Y &Val)
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container:
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
constexpr bool has_single_bit(T Value) noexcept
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
LLVM_ABI bool isInstructionTriviallyDead(Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction is not used, and the instruction will return.
LLVM_ABI llvm::SmallVector< int, 16 > createStrideMask(unsigned Start, unsigned Stride, unsigned VF)
Create a stride shuffle mask.
auto reverse(ContainerTy &&C)
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
LLVM_ABI llvm::SmallVector< int, 16 > createReplicatedMask(unsigned ReplicationFactor, unsigned VF)
Create a mask with replicated elements.
auto find_if_not(R &&Range, UnaryPredicate P)
LLVM_ABI bool isSafeToLoadUnconditionally(Value *V, Align Alignment, const APInt &Size, const DataLayout &DL, Instruction *ScanFrom, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr)
Return true if we know that executing a load from this value cannot trap.
bool isa_and_present(const Y &Val)
isa_and_present<X> - Functionally identical to isa, except that a null value is accepted.
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
bool isPointerTy(const Type *T)
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
auto make_first_range(ContainerTy &&c)
Given a container of pairs, return a range over the first elements.
LLVM_ABI std::optional< int64_t > getPointersDiff(Type *ElemTyA, Value *PtrA, Type *ElemTyB, Value *PtrB, const DataLayout &DL, ScalarEvolution &SE, bool StrictCheck=false, bool CheckType=true)
Returns the distance between the pointers PtrA and PtrB iff they are compatible and it is possible to...
LLVM_ABI bool wouldInstructionBeTriviallyDead(const Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction would have no side effects if it was not used.
constexpr bool is_sorted_constexpr(R &&Range, Cmp C=Cmp{})
Check if elements in a range R are sorted with respect to a comparator C.
bool isModOrRefSet(const ModRefInfo MRI)
LLVM_ABI bool sortPtrAccesses(ArrayRef< Value * > VL, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
Attempt to sort the pointers in VL and return the sorted indices in SortedIndices,...
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
LLVM_ABI void propagateIRFlags(Value *I, ArrayRef< Value * > VL, Value *OpValue=nullptr, bool IncludeWrapFlags=true)
Get the intersection (logical and) of all of the potential IR flags of each scalar operation (VL) tha...
LLVM_ATTRIBUTE_VISIBILITY_DEFAULT AnalysisKey InnerAnalysisManagerProxy< AnalysisManagerT, IRUnitT, ExtraArgTs... >::Key
MutableArrayRef(T &OneElt) -> MutableArrayRef< T >
constexpr int PoisonMaskElem
iterator_range(Container &&) -> iterator_range< llvm::detail::IterOfRange< Container > >
@ Ref
The access may reference the value stored in memory.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
LLVM_ABI CmpInst::Predicate getMinMaxReductionPredicate(RecurKind RK)
Returns the comparison predicate used when expanding a min/max reduction.
FunctionAddr VTableAddr uintptr_t uintptr_t Data
RecurKind
These are the kinds of recurrences that we support.
LLVM_ABI bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic has a scalar operand.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
FunctionAddr VTableAddr Next
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
DWARFExpression::Operation Op
auto max_element(R &&Range)
Provide wrappers to std::max_element which take ranges instead of having to pass begin/end explicitly...
void ViewGraph(const GraphType &G, const Twine &Name, bool ShortNames=false, const Twine &Title="", GraphProgram::Name Program=GraphProgram::DOT)
ViewGraph - Emit a dot graph, run 'dot', run gv on the postscript file, then cleanup.
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI unsigned ComputeNumSignBits(const Value *Op, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Return the number of times the sign bit of the register is replicated into the other bits.
OutputIt copy(R &&Range, OutputIt Out)
auto make_second_range(ContainerTy &&c)
Given a container of pairs, return a range over the second elements.
constexpr unsigned BitWidth
LLVM_ABI bool isGuaranteedToTransferExecutionToSuccessor(const Instruction *I)
Return true if this function can prove that the instruction I will always transfer execution to one o...
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
auto pred_begin(const MachineBasicBlock *BB)
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
void erase_if(Container &C, UnaryPredicate P)
Provide a container algorithm similar to C++ Library Fundamentals v2's erase_if which is equivalent t...
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
template class LLVM_TEMPLATE_ABI DomTreeNodeBase< BasicBlock >
bool equal(L &&LRange, R &&RRange)
Wrapper function around std::equal to detect if pair-wise elements between two ranges are the same.
LLVM_ABI bool isGuaranteedNotToBePoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Returns true if V cannot be poison, but may be undef.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
LLVM_ABI const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=MaxLookupSearchDepth)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
LLVM_ABI Constant * ConstantFoldIntegerCast(Constant *C, Type *DestTy, bool IsSigned, const DataLayout &DL)
Constant fold a zext, sext or trunc, depending on IsSigned and whether the DestTy is wider or narrowe...
LLVM_ABI bool isKnownNonNegative(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Returns true if the give value is known to be non-negative.
LLVM_ABI bool mayHaveNonDefUseDependency(const Instruction &I)
Returns true if the result or effects of the given instructions I depend values not reachable through...
LLVM_ABI bool isTriviallyVectorizable(Intrinsic::ID ID)
Identify if the intrinsic is trivially vectorizable.
constexpr detail::IsaCheckPredicate< Types... > IsaPred
Function object wrapper for the llvm::isa type check.
LLVM_ABI bool isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic is overloaded on the type of the operand at index OpdI...
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Used to keep track of an operand bundle.
static LLVM_ABI void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
BoUpSLP::TreeEntry TreeEntry
std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R)
static std::string getNodeAttributes(const TreeEntry *Entry, const BoUpSLP *)
DOTGraphTraits(bool IsSimple=false)
DOTGraphTraits - Template class that can be specialized to customize how graphs are converted to 'dot...
DefaultDOTGraphTraits(bool simple=false)
DenseMapInfo< BoUpSLP::TreeEntry * > FirstInfo
static bool isEqual(const BoUpSLP::EdgeInfo &LHS, const BoUpSLP::EdgeInfo &RHS)
static BoUpSLP::EdgeInfo getEmptyKey()
DenseMapInfo< unsigned > SecondInfo
static unsigned getHashValue(const BoUpSLP::EdgeInfo &Val)
static BoUpSLP::EdgeInfo getTombstoneKey()
An information struct used to provide DenseMap with the various necessary components for a given valu...
Add the VectorizableTree to the index iterator to be able to return TreeEntry pointers.
ChildIteratorType(SmallVector< BoUpSLP::EdgeInfo, 1 >::iterator W, ContainerTy &VT)
ContainerTy & VectorizableTree
static ChildIteratorType child_end(NodeRef N)
static NodeRef getEntryNode(BoUpSLP &R)
static ChildIteratorType child_begin(NodeRef N)
static nodes_iterator nodes_begin(BoUpSLP *R)
TreeEntry * NodeRef
NodeRef has to be a pointer per the GraphWriter.
static unsigned size(BoUpSLP *R)
BoUpSLP::TreeEntry TreeEntry
static nodes_iterator nodes_end(BoUpSLP *R)
BoUpSLP::TreeEntry::VecTreeTy ContainerTy
Incoming for lane mask phi as machine instruction, incoming register Reg and incoming block Block are...
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
A memory-efficient immutable range with a single value repeated N times.
TargetTransformInfo * TTI
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
bool runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_, TargetLibraryInfo *TLI_, AAResults *AA_, LoopInfo *LI_, DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_, OptimizationRemarkEmitter *ORE_)
A MapVector that performs no allocations if smaller than a certain size.
static VFShape get(const FunctionType *FTy, ElementCount EC, bool HasGlobalPred)
Retrieve the basic vectorization shape of the function, where all parameters are mapped to VFParamKin...
Function object to check whether the first component of a container supported by std::get (like std::...
Function object to check whether the second component of a container supported by std::get (like std:...
This structure holds any data we need about the edges being traversed during buildTreeRec().
bool operator==(const EdgeInfo &Other) const
EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
unsigned EdgeIdx
The operand index of the use.
void dump(raw_ostream &OS) const
Debug print.
LLVM_DUMP_METHOD void dump() const
TreeEntry * UserTE
The user TreeEntry.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::EdgeInfo &EI)