#ifdef EXPENSIVE_CHECKS
using namespace std::placeholders;
#define SV_NAME "slp-vectorizer"
#define DEBUG_TYPE "SLP"
STATISTIC(NumVectorInstructions,
          "Number of vector instructions generated");
        "Controls which SLP graphs should be vectorized.");
    cl::desc("Run the SLP vectorization passes"));
    cl::desc("Enable vectorization for wider vector utilization"));
    cl::desc("Only vectorize if you gain more than this "
    cl::desc("When true, SLP vectorizer bypasses profitability checks based on "
             "heuristics and makes vectorization decision via cost modeling."));
    cl::desc("Attempt to vectorize horizontal reductions"));
        "Attempt to vectorize horizontal reductions feeding into a store"));
    cl::desc("Improve the code quality by splitting alternate instructions"));
    cl::desc("Attempt to vectorize for this register size in bits"));
    cl::desc("Maximum SLP vectorization factor (0=unlimited)"));
    cl::desc("Limit the size of the SLP scheduling region per block"));
    cl::desc("Attempt to vectorize for this register size in bits"));
    cl::desc("Limit the recursion depth when building a vectorizable tree"));
    cl::desc("Only vectorize small trees if they are fully vectorizable"));
    cl::desc("The maximum look-ahead depth for operand reordering scores"));
    cl::desc("The maximum look-ahead depth for searching best rooting option"));
    cl::desc("The minimum number of loads, which should be considered strided, "
             "if the stride is > 1 or is runtime value"));
    cl::desc("The maximum stride, considered to be profitable."));
    cl::desc("Disable tree reordering even if it is "
             "profitable. Used for testing only."));
    cl::desc("Generate strided loads even if they are not "
             "profitable. Used for testing only."));
    cl::desc("Display the SLP trees with Graphviz"));
    cl::desc("Try to vectorize with non-power-of-2 number of elements."));
    cl::desc("Try to replace values with the idempotent instructions for "
             "better vectorization."));
  Ty = Ty->getScalarType();
         !Ty->isPPC_FP128Ty();
    return SI->getValueOperand()->getType();
    return CI->getOperand(0)->getType();
    return IE->getOperand(1)->getType();
         "ScalableVectorType is not supported.");
    return VecTy->getNumElements();
                                          Type *Ty, unsigned Sz) {
  if (NumParts == 0 || NumParts >= Sz)
  if (NumParts == 0 || NumParts >= Sz)
  return (Sz / RegVF) * RegVF;
                              I * VecTyNumElements, VecTyNumElements)))
                   : Mask[I] * VecTyNumElements + J;
  unsigned SVNumElements =
  unsigned ShuffleMaskSize = SV->getShuffleMask().size();
  if (SVNumElements % ShuffleMaskSize != 0)
  unsigned GroupSize = SVNumElements / ShuffleMaskSize;
  if (GroupSize == 0 || (VL.size() % GroupSize) != 0)
  unsigned NumGroup = 0;
  for (size_t I = 0, E = VL.size(); I != E; I += GroupSize) {
    Value *Src = SV->getOperand(0);
      if (SV->getOperand(0) != Src)
      if (!SV->isExtractSubvectorMask(Index))
      ExpectedIndex.set(Index / ShuffleMaskSize);
    if (!ExpectedIndex.all())
  assert(NumGroup == (VL.size() / GroupSize) && "Unexpected number of groups");
  unsigned SVNumElements =
  unsigned AccumulateLength = 0;
  for (Value *V : VL) {
      for (int M : SV->getShuffleMask())
                              : AccumulateLength + M);
    AccumulateLength += SVNumElements;
  return std::min<unsigned>(PartNumElems, Size - Part * PartNumElems);
  OS << "Idx: " << Idx << ", ";
  OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]";
    if (BB != II->getParent())
  Value *FirstNonUndef = nullptr;
  for (Value *V : VL) {
    if (!FirstNonUndef) {
    if (V != FirstNonUndef)
  return FirstNonUndef != nullptr;
                          bool IsCopyable = false) {
    return Cmp->isCommutative();
    return BO->isCommutative() ||
           (BO->getOpcode() == Instruction::Sub &&
              if (match(U.getUser(),
                        m_ICmp(Pred, m_Specific(U.get()), m_Zero())) &&
                  (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE))
              auto *I = dyn_cast<BinaryOperator>(U.get());
              return match(U.getUser(),
                           m_Intrinsic<Intrinsic::abs>(
                               m_Specific(U.get()), m_ConstantInt(Flag))) &&
                     ((!IsCopyable && I && !I->hasNoSignedWrap()) ||
           (BO->getOpcode() == Instruction::FSub &&
              return match(U.getUser(),
                           m_Intrinsic<Intrinsic::fabs>(m_Specific(U.get())));
    return I->isCommutative();
                                  bool IsCopyable = false) {
         "The instruction is not commutative.");
    switch (BO->getOpcode()) {
    case Instruction::Sub:
    case Instruction::FSub:
  return I->isCommutableOperand(Op);
    constexpr unsigned IntrinsicNumOperands = 2;
    return IntrinsicNumOperands;
  return I->getNumOperands();
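// Note: the isCommutative() logic above deliberately treats sub/fsub as
// commutative in narrow cases, e.g. when the result only feeds llvm.abs /
// llvm.fabs or an eq/ne-with-zero compare, because swapping the operands
// cannot change the observable result of those users.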
  static_assert(std::is_same_v<T, InsertElementInst> ||
                    std::is_same_v<T, ExtractElementInst>,
    if (CI->getValue().uge(VT->getNumElements()))
    Index *= VT->getNumElements();
    Index += CI->getZExtValue();
    Type *CurrentType = IV->getType();
    for (unsigned I : IV->indices()) {
        Index *= ST->getNumElements();
        CurrentType = ST->getElementType(I);
        Index *= AT->getNumElements();
        CurrentType = AT->getElementType();
  return std::all_of(It, VL.end(), [&](Value *V) {
    if (auto *CI = dyn_cast<CmpInst>(V))
      return BasePred == CI->getPredicate();
    if (auto *I = dyn_cast<Instruction>(V))
      return I->getOpcode() == Opcode;
    return isa<PoisonValue>(V);
      if (MaskArg == UseMask::UndefsAsMask)
      if (MaskArg == UseMask::FirstArg && Value < VF)
        UseMask.reset(Value);
      else if (MaskArg == UseMask::SecondArg && Value >= VF)
        UseMask.reset(Value - VF);
template <bool IsPoisonOnly = false>
  using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
    if (!UseMask.empty()) {
        if (*Idx < UseMask.size() && !UseMask.test(*Idx))
    for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
      if (Constant *Elem = C->getAggregateElement(I))
            (UseMask.empty() || (I < UseMask.size() && !UseMask.test(I))))
static std::optional<TargetTransformInfo::ShuffleKind>
      std::accumulate(VL.begin(), VL.end(), 0u, [](unsigned S, Value *V) {
        auto *EI = dyn_cast<ExtractElementInst>(V);
        auto *VTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
        return std::max(S, VTy->getNumElements());
  Value *Vec1 = nullptr;
  Value *Vec2 = nullptr;
    Value *Vec = EE->getVectorOperand();
  ShuffleMode CommonShuffleMode = Unknown;
  for (unsigned I = 0, E = VL.size(); I < E; ++I) {
    auto *Vec = EI->getVectorOperand();
    if (Idx->getValue().uge(Size))
    unsigned IntIdx = Idx->getValue().getZExtValue();
    if (!Vec1 || Vec1 == Vec) {
    } else if (!Vec2 || Vec2 == Vec) {
      if (CommonShuffleMode == Permute)
    if (Mask[I] % Size != I) {
      CommonShuffleMode = Permute;
    CommonShuffleMode = Select;
  if (CommonShuffleMode == Select && Vec2)
  unsigned Opcode = E->getOpcode();
  assert((Opcode == Instruction::ExtractElement ||
          Opcode == Instruction::ExtractValue) &&
         "Expected extractelement or extractvalue instruction.");
  if (Opcode == Instruction::ExtractElement) {
    unsigned Idx = CI->getZExtValue();
  if (EI->getNumIndices() != 1)
  return *EI->idx_begin();
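// The extractelement scan above classifies a bundle of extracts as a single
// shuffle: indices that stay in place across at most two source vectors form
// a Select-style shuffle, anything else degrades to a full Permute.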
class BinOpSameOpcodeHelper {
  using MaskType = std::uint_fast16_t;
  constexpr static std::initializer_list<unsigned> SupportedOp = {
      Instruction::Add, Instruction::Sub, Instruction::Mul, Instruction::Shl,
      Instruction::AShr, Instruction::And, Instruction::Or, Instruction::Xor};
    MainOpBIT = 0b100000000,
  static std::pair<ConstantInt *, unsigned>
  isBinOpWithConstantInt(const Instruction *I) {
    unsigned Opcode = I->getOpcode();
    if (Opcode == Instruction::Sub || Opcode == Instruction::Shl ||
        Opcode == Instruction::AShr)
  struct InterchangeableInfo {
    MaskType Mask = MainOpBIT | XorBIT | OrBIT | AndBIT | SubBIT | AddBIT |
                    MulBIT | AShrBIT | ShlBIT;
    MaskType SeenBefore = 0;
    InterchangeableInfo(const Instruction *I) : I(I) {}
    bool trySet(MaskType OpcodeInMaskForm, MaskType InterchangeableMask) {
      if (Mask & InterchangeableMask) {
        SeenBefore |= OpcodeInMaskForm;
        Mask &= InterchangeableMask;
    bool equal(unsigned Opcode) {
      return Opcode == I->getOpcode() && trySet(MainOpBIT, MainOpBIT);
      MaskType Candidate = Mask & SeenBefore;
      if (Candidate & MainOpBIT)
        return I->getOpcode();
      if (Candidate & ShlBIT)
        return Instruction::Shl;
      if (Candidate & AShrBIT)
        return Instruction::AShr;
      if (Candidate & MulBIT)
        return Instruction::Mul;
      if (Candidate & AddBIT)
        return Instruction::Add;
      if (Candidate & SubBIT)
        return Instruction::Sub;
      if (Candidate & AndBIT)
        return Instruction::And;
      if (Candidate & OrBIT)
        return Instruction::Or;
      if (Candidate & XorBIT)
        return Instruction::Xor;
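// Each supported binary opcode is represented by one bit of MaskType; Mask
// starts with every candidate bit set and is narrowed as instructions are
// added, while SeenBefore records which opcodes actually appeared. For
// example, `x << 1` keeps both the Shl and Mul bits alive because it can be
// rewritten as `x * 2`.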
    bool hasCandidateOpcode(unsigned Opcode) const {
      MaskType Candidate = Mask & SeenBefore;
      case Instruction::Shl:
        return Candidate & ShlBIT;
      case Instruction::AShr:
        return Candidate & AShrBIT;
      case Instruction::Mul:
        return Candidate & MulBIT;
      case Instruction::Add:
        return Candidate & AddBIT;
      case Instruction::Sub:
        return Candidate & SubBIT;
      case Instruction::And:
        return Candidate & AndBIT;
      case Instruction::Or:
        return Candidate & OrBIT;
      case Instruction::Xor:
        return Candidate & XorBIT;
      case Instruction::LShr:
      case Instruction::FAdd:
      case Instruction::FSub:
      case Instruction::FMul:
      case Instruction::SDiv:
      case Instruction::UDiv:
      case Instruction::FDiv:
      case Instruction::SRem:
      case Instruction::URem:
      case Instruction::FRem:
    unsigned FromOpcode = I->getOpcode();
    if (FromOpcode == ToOpcode)
    auto [CI, Pos] = isBinOpWithConstantInt(I);
    const APInt &FromCIValue = CI->getValue();
    unsigned FromCIValueBitWidth = FromCIValue.getBitWidth();
    switch (FromOpcode) {
    case Instruction::Shl:
      if (ToOpcode == Instruction::Mul) {
        assert(FromCIValue.isZero() && "Cannot convert the instruction.");
        ToCIValue = ToOpcode == Instruction::And
                        : APInt::getZero(FromCIValueBitWidth);
    case Instruction::Mul:
      if (ToOpcode == Instruction::Shl) {
        ToCIValue = APInt(FromCIValueBitWidth, FromCIValue.logBase2());
        assert(FromCIValue.isOne() && "Cannot convert the instruction.");
        ToCIValue = ToOpcode == Instruction::And
                        : APInt::getZero(FromCIValueBitWidth);
    case Instruction::Add:
    case Instruction::Sub:
      if (FromCIValue.isZero()) {
               "Cannot convert the instruction.");
        ToCIValue = FromCIValue;
    case Instruction::And:
      ToCIValue = ToOpcode == Instruction::Mul
                      : APInt::getZero(FromCIValueBitWidth);
      assert(FromCIValue.isZero() && "Cannot convert the instruction.");
    Value *LHS = I->getOperand(1 - Pos);
        ConstantInt::get(I->getOperand(Pos)->getType(), ToCIValue);
        ((FromOpcode == Instruction::Add || FromOpcode == Instruction::Or ||
          FromOpcode == Instruction::Xor) &&
         ToOpcode == Instruction::Sub))
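// The conversion above rewrites the constant operand when switching opcodes,
// e.g. shl-by-C becomes mul-by-(1 << C), and mul-by-1 or add-of-0 map onto
// the identity constant of the target opcode, so the same scalar can join a
// bundle whose MainOp uses a different but interchangeable opcode.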
  InterchangeableInfo MainOp;
  InterchangeableInfo AltOp;
    return ::isValidForAlternation(MainOp.I->getOpcode()) &&
  bool initializeAltOp(const Instruction *I) {
  BinOpSameOpcodeHelper(const Instruction *MainOp,
                        const Instruction *AltOp = nullptr)
      : MainOp(MainOp), AltOp(AltOp) {
  bool add(const Instruction *I) {
           "BinOpSameOpcodeHelper only accepts BinaryOperator.");
    unsigned Opcode = I->getOpcode();
    MaskType OpcodeInMaskForm;
    case Instruction::Shl:
      OpcodeInMaskForm = ShlBIT;
    case Instruction::AShr:
      OpcodeInMaskForm = AShrBIT;
    case Instruction::Mul:
      OpcodeInMaskForm = MulBIT;
    case Instruction::Add:
      OpcodeInMaskForm = AddBIT;
    case Instruction::Sub:
      OpcodeInMaskForm = SubBIT;
    case Instruction::And:
      OpcodeInMaskForm = AndBIT;
    case Instruction::Or:
      OpcodeInMaskForm = OrBIT;
    case Instruction::Xor:
      OpcodeInMaskForm = XorBIT;
      return MainOp.equal(Opcode) ||
             (initializeAltOp(I) && AltOp.equal(Opcode));
    MaskType InterchangeableMask = OpcodeInMaskForm;
    ConstantInt *CI = isBinOpWithConstantInt(I).first;
      constexpr MaskType CanBeAll =
          XorBIT | OrBIT | AndBIT | SubBIT | AddBIT | MulBIT | AShrBIT | ShlBIT;
      const APInt &CIValue = CI->getValue();
      case Instruction::Shl:
        InterchangeableMask = CIValue.isZero() ? CanBeAll : MulBIT | ShlBIT;
      case Instruction::Mul:
        if (CIValue.isOne()) {
          InterchangeableMask = CanBeAll;
        InterchangeableMask = MulBIT | ShlBIT;
      case Instruction::Add:
      case Instruction::Sub:
        InterchangeableMask = CIValue.isZero() ? CanBeAll : SubBIT | AddBIT;
      case Instruction::And:
          InterchangeableMask = CanBeAll;
      case Instruction::Xor:
          InterchangeableMask = XorBIT | OrBIT | SubBIT | AddBIT;
        InterchangeableMask = CanBeAll;
    return MainOp.trySet(OpcodeInMaskForm, InterchangeableMask) ||
           (initializeAltOp(I) &&
            AltOp.trySet(OpcodeInMaskForm, InterchangeableMask));
  unsigned getMainOpcode() const { return MainOp.getOpcode(); }
  bool hasCandidateOpcode(unsigned Opcode) const {
    return MainOp.hasCandidateOpcode(Opcode);
  bool hasAltOp() const { return AltOp.I; }
  unsigned getAltOpcode() const {
    return hasAltOp() ? AltOp.getOpcode() : getMainOpcode();
    return MainOp.getOperand(I);
class InstructionsState {
  bool HasCopyables = false;
    assert(valid() && "InstructionsState is invalid.");
    assert(valid() && "InstructionsState is invalid.");
  unsigned getOpcode() const { return getMainOp()->getOpcode(); }
  unsigned getAltOpcode() const { return getAltOp()->getOpcode(); }
  bool isAltShuffle() const { return getMainOp() != getAltOp(); }
  Instruction *getMatchingMainOpOrAltOp(Instruction *I) const {
    assert(MainOp && "MainOp cannot be nullptr.");
    if (I->getOpcode() == MainOp->getOpcode())
    assert(AltOp && "AltOp cannot be nullptr.");
    if (I->getOpcode() == AltOp->getOpcode())
    if (!I->isBinaryOp())
    BinOpSameOpcodeHelper Converter(MainOp);
    if (isAltShuffle() && !Converter.hasCandidateOpcode(MainOp->getOpcode())) {
      BinOpSameOpcodeHelper AltConverter(AltOp);
      if (AltConverter.add(I) && AltConverter.add(AltOp) &&
          AltConverter.hasCandidateOpcode(AltOp->getOpcode()))
    if (Converter.hasAltOp() && !isAltShuffle())
    return Converter.hasAltOp() ? AltOp : MainOp;
  bool isShiftOp() const {
    return getMainOp()->isShift() && getAltOp()->isShift();
    return getMainOp()->isBitwiseLogicOp() && getAltOp()->isBitwiseLogicOp();
  bool isMulDivLikeOp() const {
    constexpr std::array<unsigned, 8> MulDiv = {
        Instruction::Mul, Instruction::FMul, Instruction::SDiv,
        Instruction::UDiv, Instruction::FDiv, Instruction::SRem,
        Instruction::URem, Instruction::FRem};
  bool isAddSubLikeOp() const {
    constexpr std::array<unsigned, 4> AddSub = {
        Instruction::Add, Instruction::Sub, Instruction::FAdd,
  bool isCmpOp() const {
    return (getOpcode() == Instruction::ICmp ||
  bool valid() const { return MainOp && AltOp; }
  explicit operator bool() const { return valid(); }
  InstructionsState() = delete;
  InstructionsState(Instruction *MainOp, Instruction *AltOp,
                    bool HasCopyables = false)
      : MainOp(MainOp), AltOp(AltOp), HasCopyables(HasCopyables) {}
  static InstructionsState invalid() { return {nullptr, nullptr}; }
  bool isCopyableElement(Value *V) const {
    assert(valid() && "InstructionsState is invalid.");
    if (isAltShuffle() || getOpcode() == Instruction::GetElementPtr)
    if (I->getParent() != MainOp->getParent() &&
    if (I->getOpcode() == MainOp->getOpcode())
    if (!I->isBinaryOp())
    BinOpSameOpcodeHelper Converter(MainOp);
  bool isNonSchedulable(Value *V) const {
    assert(valid() && "InstructionsState is invalid.");
    if (getMainOp() == V)
    if (isCopyableElement(V)) {
      auto IsNonSchedulableCopyableElement = [this](Value *V) {
        return !I || isa<PHINode>(I) ||
               I->getParent() != MainOp->getParent() ||
                !MainOp->comesBefore(I));
      return IsNonSchedulableCopyableElement(V);
  bool areInstructionsWithCopyableElements() const {
    assert(valid() && "InstructionsState is invalid.");
    return HasCopyables;
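// InstructionsState summarizes a bundle: MainOp/AltOp carry the (up to two)
// opcodes of an alternate-shuffle bundle, and "copyable" elements are scalars
// whose opcode differs from MainOp but which can still be modelled in the
// bundle (e.g. via an interchangeable binary opcode), provided they do not
// require scheduling before MainOp.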
std::pair<Instruction *, SmallVector<Value *>>
  Instruction *SelectedOp = S.getMatchingMainOpOrAltOp(I);
  assert(SelectedOp && "Cannot convert the instruction.");
  if (I->isBinaryOp()) {
    return std::make_pair(SelectedOp, Converter.getOperand(SelectedOp));
  for (Value *V : VL) {
      if (Inst->getOpcode() == Opcode)
         BaseOp0 == Op0 || BaseOp1 == Op1 ||
         "Assessing comparisons of different types?");
  return (BasePred == Pred &&
         (BasePred == SwappedPred &&
    return InstructionsState::invalid();
    return InstructionsState::invalid();
      (VL.size() == 2 && InstCnt < 2))
    return InstructionsState::invalid();
  unsigned AltOpcode = Opcode;
  BinOpSameOpcodeHelper BinOpHelper(MainOp);
  bool SwappedPredsCompatible = IsCmpOp && [&]() {
    UniquePreds.insert(BasePred);
    UniqueNonSwappedPreds.insert(BasePred);
    for (Value *V : VL) {
      UniqueNonSwappedPreds.insert(CurrentPred);
      if (!UniquePreds.contains(CurrentPred) &&
          !UniquePreds.contains(SwappedCurrentPred))
        UniquePreds.insert(CurrentPred);
    return UniqueNonSwappedPreds.size() > 2 && UniquePreds.size() == 2;
    return InstructionsState::invalid();
  bool AnyPoison = InstCnt != VL.size();
    if (AnyPoison && (I->isIntDivRem() || I->isFPDivRem() || isa<CallInst>(I)))
      return InstructionsState::invalid();
    unsigned InstOpcode = I->getOpcode();
      if (BinOpHelper.add(I))
      Value *Op1 = I->getOperand(0);
      if (InstOpcode == Opcode || InstOpcode == AltOpcode)
      if (Opcode == AltOpcode) {
               "Cast isn't safe for alternation, logic needs to be updated!");
        AltOpcode = InstOpcode;
      Type *Ty0 = BaseInst->getOperand(0)->getType();
      Type *Ty1 = Inst->getOperand(0)->getType();
        assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
        assert(InstOpcode == AltOpcode &&
               "Alternate instructions are only supported by BinaryOperator "
        if ((VL.size() == 2 || SwappedPredsCompatible) &&
            (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
        if (MainOp != AltOp) {
        } else if (BasePred != CurrentPred) {
              "CmpInst isn't safe for alternation, logic needs to be updated!");
          if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
              AltPred == CurrentPred || AltPred == SwappedCurrentPred)
    } else if (InstOpcode == Opcode) {
      assert(InstOpcode == AltOpcode &&
             "Alternate instructions are only supported by BinaryOperator and "
        if (Gep->getNumOperands() != 2 ||
          return InstructionsState::invalid();
          return InstructionsState::invalid();
        if (!LI->isSimple() || !BaseLI->isSimple())
          return InstructionsState::invalid();
          return InstructionsState::invalid();
        if (Call->hasOperandBundles() &&
            !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
                        Call->op_begin() + Call->getBundleOperandsEndIndex(),
          return InstructionsState::invalid();
          return InstructionsState::invalid();
          if (Mappings.size() != BaseMappings.size() ||
              Mappings.front().ISA != BaseMappings.front().ISA ||
              Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
              Mappings.front().VectorName != BaseMappings.front().VectorName ||
              Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
              Mappings.front().Shape.Parameters !=
                  BaseMappings.front().Shape.Parameters)
            return InstructionsState::invalid();
      return InstructionsState::invalid();
  assert(MainOp && "Cannot find MainOp with Opcode from BinOpHelper.");
  assert(MainOp && "Cannot find AltOp with Opcode from BinOpHelper.");
         "Incorrect implementation of allSameOpcode.");
  InstructionsState S(MainOp, AltOp);
         "Invalid InstructionsState.");
  return all_of(VL, [&](Value *V) { return V->getType() == Ty; });
  unsigned Opcode = UserInst->getOpcode();
  case Instruction::Load: {
  case Instruction::Store: {
    return (SI->getPointerOperand() == Scalar);
  case Instruction::Call: {
      return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index(), TTI) &&
             Arg.value().get() == Scalar;
    return LI->isSimple();
    return SI->isSimple();
    return !MI->isVolatile();
                    bool ExtendingManyInputs = false) {
  if (SubMask.empty())
         (!ExtendingManyInputs || SubMask.size() > Mask.size() ||
         "SubMask with many inputs support must be larger than the mask.");
    Mask.append(SubMask.begin(), SubMask.end());
  int TermValue = std::min(Mask.size(), SubMask.size());
  for (int I = 0, E = SubMask.size(); I < E; ++I) {
        (!ExtendingManyInputs &&
         (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)))
    NewMask[I] = Mask[SubMask[I]];
  const size_t Sz = Order.size();
  for (unsigned I = 0; I < Sz; ++I) {
      UnusedIndices.reset(Order[I]);
      MaskedIndices.set(I);
  if (MaskedIndices.none())
         "Non-synced masked/available indices.");
    assert(Idx >= 0 && "Indices must be synced.");
                                unsigned Opcode0, unsigned Opcode1) {
      OpcodeMask.set(Lane * ScalarTyNumElements,
                     Lane * ScalarTyNumElements + ScalarTyNumElements);
         "Expected scalar constants.");
    std::fill_n(NewVal.begin() + I * VF, VF, V);
  const unsigned E = Indices.size();
  for (unsigned I = 0; I < E; ++I)
    Mask[Indices[I]] = I;
  assert(!Mask.empty() && "Expected non-empty mask.");
  for (unsigned I = 0, E = Prev.size(); I < E; ++I)
      Scalars[Mask[I]] = Prev[I];
    auto *IO = dyn_cast<Instruction>(V);
    return isa<PHINode>(IO) || IO->getParent() != I->getParent();
  return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(UsesLimit) &&
           auto *IU = dyn_cast<Instruction>(U);
           return IU->getParent() != I->getParent() || isa<PHINode>(IU);
  return !VL.empty() &&
  return NumParts > 0 && NumParts < Sz && has_single_bit(Sz / NumParts) &&
    const unsigned Limit = std::numeric_limits<unsigned>::max()) {
  unsigned NumParts = TTI.getNumberOfParts(VecTy);
  if (NumParts == 0 || NumParts >= Limit)
  if (NumParts >= Sz || Sz % NumParts != 0 ||
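// The fragments above are small predicates and helpers: per-user checks for
// whether a scalar operand must stay scalar (store pointers, scalar call
// arguments), mask-composition and order-fixup utilities, and
// register-splitting queries built on TTI.getNumberOfParts().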
  class ScheduleEntity;
  class ScheduleCopyableData;
  class ScheduleBundle;
  struct StridedPtrInfo {
    Value *StrideVal = nullptr;
    const SCEV *StrideSCEV = nullptr;
      : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li), DT(Dt),
        AC(AC), DB(DB), DL(DL), ORE(ORE),
    MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
      ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales = {});
                      const SmallDenseSet<Value *> &UserIgnoreLst);
    assert(!VectorizableTree.empty() && "No graph to get the first node from");
    return VectorizableTree.front()->Scalars;
    const TreeEntry &Root = *VectorizableTree.front();
    if (Root.State != TreeEntry::Vectorize || Root.isAltShuffle() ||
        !Root.Scalars.front()->getType()->isIntegerTy())
      return std::nullopt;
    auto It = MinBWs.find(&Root);
    if (It != MinBWs.end())
    if (Root.getOpcode() == Instruction::ZExt ||
        Root.getOpcode() == Instruction::SExt)
      return std::make_pair(cast<CastInst>(Root.getMainOp())->getSrcTy(),
                            Root.getOpcode() == Instruction::SExt);
    return std::nullopt;
    return MinBWs.at(VectorizableTree.front().get()).second;
    if (ReductionBitWidth == 0 ||
        !VectorizableTree.front()->Scalars.front()->getType()->isIntegerTy() ||
        ReductionBitWidth >=
            DL->getTypeSizeInBits(
                VectorizableTree.front()->Scalars.front()->getType()))
          VectorizableTree.front()->Scalars.front()->getType(),
          VectorizableTree.front()->getVectorFactor());
          VectorizableTree.front()->Scalars.front()->getContext(),
          VectorizableTree.front()->getVectorFactor());
    return VectorizableTree.front()->hasState() &&
           (VectorizableTree.front()->CombinedOp == TreeEntry::ReducedBitcast ||
            VectorizableTree.front()->CombinedOp ==
                TreeEntry::ReducedBitcastBSwap) &&
           VectorizableTree.front()->State == TreeEntry::Vectorize;
    VectorizableTree.clear();
    ScalarToTreeEntries.clear();
    DeletedNodes.clear();
    TransformedToGatherNodes.clear();
    OperandsToTreeEntry.clear();
    ScalarsInSplitNodes.clear();
    NonScheduledFirst.clear();
    EntryToLastInstruction.clear();
    LastInstructionToPos.clear();
    LoadEntriesToVectorize.clear();
    IsGraphTransformMode = false;
    GatheredLoadsEntriesFirst.reset();
    CompressEntryToData.clear();
    ExternalUses.clear();
    ExternalUsesAsOriginalScalar.clear();
    ExternalUsesWithNonUsers.clear();
    for (auto &Iter : BlocksSchedules) {
      BlockScheduling *BS = Iter.second.get();
    ReductionBitWidth = 0;
    CastMaxMinBWSizes.reset();
    ExtraBitWidthNodes.clear();
    InstrElementSize.clear();
    UserIgnoreList = nullptr;
    PostponedGathers.clear();
    ValueToGatherNodes.clear();
    TreeEntryToStridedPtrInfoMap.clear();
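// Resetting every container here returns the BoUpSLP instance to a clean
// state so the next tree construction starts from an empty graph; any entry
// left behind would alias state from a previously vectorized bundle.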
    assert(!Order.empty() && "expected non-empty order");
    const unsigned Sz = Order.size();
      return P.value() == P.index() || P.value() == Sz;
                             bool IgnoreReorder);
  std::optional<OrdersType>
    return MaxVecRegSize;
    return MinVecRegSize;
    unsigned MaxVF = MaxVFOption.getNumOccurrences()
                         ? MaxVFOption
                         : TTI->getMaximumVF(ElemWidth, Opcode);
    return MaxVF ? MaxVF : UINT_MAX;
                             Align Alignment, const int64_t Diff,
                             const size_t Sz) const;
                      Value *Ptr0, Value *PtrN, StridedPtrInfo &SPtrInfo) const;
                            Align CommonAlignment,
                            StridedPtrInfo &SPtrInfo) const;
                       StridedPtrInfo &SPtrInfo,
                       unsigned *BestVF = nullptr,
                       bool TryRecursiveCheck = true) const;
    ListOfKnonwnNonVectorizableLoads.insert(hash_value(VL));
  template <typename T>
    return ListOfKnonwnNonVectorizableLoads.contains(hash_value(VL));
      OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
         << " EdgeIdx:" << EdgeIdx << "}";
        : TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes),
          MaxLevel(MaxLevel) {}
    auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) {
      auto AllUsersVectorized = [U1, U2, this](Value *V) {
          return U == U1 || U == U2 || R.isVectorized(U);
      return AllUsersVectorized(V1) && AllUsersVectorized(V2);
    if (R.TTI->isLegalBroadcastLoad(V1->getType(),
        ((int)V1->getNumUses() == NumLanes ||
         AllUsersAreInternal(V1, V2)))
    auto CheckSameEntryOrFail = [&]() {
          any_of(TEs2, [&](TreeEntry *E) { return Set.contains(E); }))
      if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
        return CheckSameEntryOrFail();
          LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
          LI2->getPointerOperand(), DL, SE, true);
      if (!Dist || *Dist == 0) {
            R.TTI->isLegalMaskedGather(
        return CheckSameEntryOrFail();
      if (std::abs(*Dist) > NumLanes / 2)
      Value *EV2 = nullptr;
        int Dist = Idx2 - Idx1;
        if (std::abs(Dist) == 0)
        if (std::abs(Dist) > NumLanes / 2)
      return CheckSameEntryOrFail();
      if (I1->getParent() != I2->getParent())
        return CheckSameEntryOrFail();
          (S.getMainOp()->getNumOperands() <= 2 || !MainAltOps.empty() ||
           !S.isAltShuffle()) &&
              S.getMainOp()->getNumOperands();
    return CheckSameEntryOrFail();
    int ShallowScoreAtThisLevel =
    if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
        (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
                             ShallowScoreAtThisLevel))
      return ShallowScoreAtThisLevel;
    assert(I1 && I2 && "Should have early exited.");
    for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
         OpIdx1 != NumOperands1; ++OpIdx1) {
      int MaxTmpScore = 0;
      unsigned MaxOpIdx2 = 0;
      bool FoundBest = false;
              ? I2->getNumOperands()
              : std::min(I2->getNumOperands(), OpIdx1 + 1);
      assert(FromIdx <= ToIdx && "Bad index");
      for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
        if (Op2Used.count(OpIdx2))
                                   I1, I2, CurrLevel + 1, {});
            TmpScore > MaxTmpScore) {
          MaxTmpScore = TmpScore;
        Op2Used.insert(MaxOpIdx2);
        ShallowScoreAtThisLevel += MaxTmpScore;
    return ShallowScoreAtThisLevel;
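// The recursive look-ahead score: operands of I1 are greedily paired with the
// best-scoring, not-yet-used operands of I2, and the accumulated score at
// this level measures how similar the two instructions look for operand
// reordering purposes.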
2754 struct OperandData {
2755 OperandData() =
default;
2756 OperandData(
Value *V,
bool APO,
bool IsUsed)
2757 : V(V), APO(APO), IsUsed(IsUsed) {}
2767 bool IsUsed =
false;
2776 enum class ReorderingMode {
2790 unsigned ArgSize = 0;
2796 const Loop *L =
nullptr;
2799 OperandData &getData(
unsigned OpIdx,
unsigned Lane) {
2800 return OpsVec[
OpIdx][Lane];
2804 const OperandData &getData(
unsigned OpIdx,
unsigned Lane)
const {
2805 return OpsVec[
OpIdx][Lane];
2810 for (
unsigned OpIdx = 0, NumOperands = getNumOperands();
2812 for (
unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
2814 OpsVec[
OpIdx][Lane].IsUsed =
false;
2818 void swap(
unsigned OpIdx1,
unsigned OpIdx2,
unsigned Lane) {
2819 std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
2831 int getSplatScore(
unsigned Lane,
unsigned OpIdx,
unsigned Idx,
2833 Value *IdxLaneV = getData(Idx, Lane).V;
2846 unsigned UniquesCount = Uniques.
size();
2847 auto IdxIt = Uniques.
find(IdxLaneV);
2848 unsigned UniquesCntWithIdxLaneV =
2849 IdxIt != Uniques.
end() ? UniquesCount : UniquesCount + 1;
2851 auto OpIdxIt = Uniques.
find(OpIdxLaneV);
2852 unsigned UniquesCntWithOpIdxLaneV =
2853 OpIdxIt != Uniques.
end() ? UniquesCount : UniquesCount + 1;
2854 if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
2856 return std::min(
bit_ceil(UniquesCntWithOpIdxLaneV) -
2857 UniquesCntWithOpIdxLaneV,
2858 UniquesCntWithOpIdxLaneV -
2860 ((IdxIt != Uniques.
end() && UsedLanes.
test(IdxIt->second))
2861 ? UniquesCntWithIdxLaneV -
bit_floor(UniquesCntWithIdxLaneV)
2862 :
bit_ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
2871 int getExternalUseScore(
unsigned Lane,
unsigned OpIdx,
unsigned Idx)
const {
2872 Value *IdxLaneV = getData(Idx, Lane).V;
2885 return R.areAllUsersVectorized(IdxLaneI)
2893 static const int ScoreScaleFactor = 10;
2901 int Lane,
unsigned OpIdx,
unsigned Idx,
2911 int SplatScore = getSplatScore(Lane,
OpIdx, Idx, UsedLanes);
2912 if (Score <= -SplatScore) {
2916 Score += SplatScore;
2922 Score *= ScoreScaleFactor;
2923 Score += getExternalUseScore(Lane,
OpIdx, Idx);
2941 std::optional<unsigned>
2942 getBestOperand(
unsigned OpIdx,
int Lane,
int LastLane,
2946 unsigned NumOperands = getNumOperands();
2949 Value *OpLastLane = getData(
OpIdx, LastLane).V;
2952 ReorderingMode RMode = ReorderingModes[
OpIdx];
2953 if (RMode == ReorderingMode::Failed)
2954 return std::nullopt;
2957 bool OpIdxAPO = getData(
OpIdx, Lane).APO;
2963 std::optional<unsigned> Idx;
2967 BestScoresPerLanes.try_emplace(std::make_pair(
OpIdx, Lane), 0)
2973 bool IsUsed = RMode == ReorderingMode::Splat ||
2974 RMode == ReorderingMode::Constant ||
2975 RMode == ReorderingMode::Load;
2977 for (
unsigned Idx = 0; Idx != NumOperands; ++Idx) {
2979 OperandData &OpData = getData(Idx, Lane);
2981 bool OpAPO = OpData.APO;
2990 if (OpAPO != OpIdxAPO)
2995 case ReorderingMode::Load:
2996 case ReorderingMode::Opcode: {
2997 bool LeftToRight = Lane > LastLane;
2998 Value *OpLeft = (LeftToRight) ? OpLastLane :
Op;
2999 Value *OpRight = (LeftToRight) ?
Op : OpLastLane;
3000 int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
3001 OpIdx, Idx, IsUsed, UsedLanes);
3002 if (Score >
static_cast<int>(BestOp.Score) ||
3003 (Score > 0 && Score ==
static_cast<int>(BestOp.Score) &&
3006 BestOp.Score = Score;
3007 BestScoresPerLanes[std::make_pair(
OpIdx, Lane)] = Score;
3011 case ReorderingMode::Constant:
3013 (!BestOp.Score && L && L->isLoopInvariant(
Op))) {
3017 BestScoresPerLanes[std::make_pair(
OpIdx, Lane)] =
3024 case ReorderingMode::Splat:
3026 IsUsed =
Op == OpLastLane;
3027 if (
Op == OpLastLane) {
3029 BestScoresPerLanes[std::make_pair(
OpIdx, Lane)] =
3035 case ReorderingMode::Failed:
3041 getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
3045 return std::nullopt;
3052 unsigned getBestLaneToStartReordering()
const {
3053 unsigned Min = UINT_MAX;
3054 unsigned SameOpNumber = 0;
3065 for (
int I = getNumLanes();
I > 0; --
I) {
3066 unsigned Lane =
I - 1;
3067 OperandsOrderData NumFreeOpsHash =
3068 getMaxNumOperandsThatCanBeReordered(Lane);
3071 if (NumFreeOpsHash.NumOfAPOs < Min) {
3072 Min = NumFreeOpsHash.NumOfAPOs;
3073 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
3075 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
3076 }
else if (NumFreeOpsHash.NumOfAPOs == Min &&
3077 NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
3080 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
3081 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
3082 }
else if (NumFreeOpsHash.NumOfAPOs == Min &&
3083 NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
3084 auto [It, Inserted] =
3085 HashMap.
try_emplace(NumFreeOpsHash.Hash, 1, Lane);
3091 unsigned BestLane = 0;
3092 unsigned CntMin = UINT_MAX;
3094 if (
Data.second.first < CntMin) {
3095 CntMin =
Data.second.first;
3096 BestLane =
Data.second.second;
3103 struct OperandsOrderData {
3106 unsigned NumOfAPOs = UINT_MAX;
3109 unsigned NumOpsWithSameOpcodeParent = 0;
3123 OperandsOrderData getMaxNumOperandsThatCanBeReordered(
unsigned Lane)
const {
3124 unsigned CntTrue = 0;
3125 unsigned NumOperands = getNumOperands();
3135 bool AllUndefs =
true;
3136 unsigned NumOpsWithSameOpcodeParent = 0;
3141 const OperandData &OpData = getData(
OpIdx, Lane);
3148 I->getParent() != Parent) {
3149 if (NumOpsWithSameOpcodeParent == 0) {
3150 NumOpsWithSameOpcodeParent = 1;
3152 Parent =
I->getParent();
3154 --NumOpsWithSameOpcodeParent;
3157 ++NumOpsWithSameOpcodeParent;
3166 OperandsOrderData
Data;
3167 Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
3168 Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
3175 const InstructionsState &S) {
3179 return VL.
size() == getNumLanes();
3181 "Expected same number of lanes");
3182 assert(S.valid() &&
"InstructionsState is invalid.");
3188 OpsVec.resize(ArgSize);
3189 unsigned NumLanes = VL.
size();
3190 for (OperandDataVec &
Ops : OpsVec)
3191 Ops.resize(NumLanes);
3206 OpsVec[
OpIdx][Lane] = {Operands[
OpIdx][Lane],
true,
false};
3209 bool IsInverseOperation =
false;
3210 if (S.isCopyableElement(VL[Lane])) {
3212 IsInverseOperation =
3215 assert(
I &&
"Expected instruction");
3216 auto [SelectedOp,
Ops] = convertTo(
I, S);
3223 bool APO = (
OpIdx == 0) ?
false : IsInverseOperation;
3224 OpsVec[
OpIdx][Lane] = {Operands[
OpIdx][Lane], APO,
false};
3230 unsigned getNumOperands()
const {
return ArgSize; }
3233 unsigned getNumLanes()
const {
return OpsVec[0].size(); }
3236 Value *getValue(
unsigned OpIdx,
unsigned Lane)
const {
3237 return getData(
OpIdx, Lane).V;
3241 bool empty()
const {
return OpsVec.empty(); }
3244 void clear() { OpsVec.clear(); }
3249 bool shouldBroadcast(
Value *
Op,
unsigned OpIdx,
unsigned Lane) {
3251 "Op is expected to be getValue(OpIdx, Lane).");
3255 bool OpAPO = getData(
OpIdx, Lane).APO;
3256 bool IsInvariant = L && L->isLoopInvariant(
Op);
3258 for (
unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
3262 bool FoundCandidate =
false;
3263 for (
unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
3264 OperandData &
Data = getData(OpI, Ln);
3265 if (
Data.APO != OpAPO ||
Data.IsUsed)
3267 Value *OpILane = getValue(OpI, Lane);
3291 L->isLoopInvariant(
Data.V))) {
3292 FoundCandidate =
true;
3299 if (!FoundCandidate)
3302 return getNumLanes() == 2 || Cnt > 1;
3309 "Op is expected to be getValue(OpIdx, Lane).");
3310 bool OpAPO = getData(
OpIdx, Lane).APO;
3311 for (
unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
3315 const OperandData &
Data = getData(OpI, Ln);
3316 if (
Data.APO != OpAPO ||
Data.IsUsed)
3318 Value *OpILn = getValue(OpI, Ln);
3319 return (L && L->isLoopInvariant(OpILn)) ||
3331 const InstructionsState &S,
const BoUpSLP &R)
3332 : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R),
3333 L(R.LI->getLoopFor(S.getMainOp()->
getParent())) {
3335 appendOperands(RootVL, Operands, S);
3343 "Expected same num of lanes across all operands");
3344 for (
unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
3345 OpVL[Lane] = OpsVec[
OpIdx][Lane].V;
3353 unsigned NumOperands = getNumOperands();
3354 unsigned NumLanes = getNumLanes();
3374 unsigned FirstLane = getBestLaneToStartReordering();
3383 if (shouldBroadcast(OpLane0,
OpIdx, FirstLane) ||
3384 !canBeVectorized(OpILane0,
OpIdx, FirstLane))
3385 ReorderingModes[
OpIdx] = ReorderingMode::Splat;
3387 ReorderingModes[
OpIdx] = ReorderingMode::Load;
3389 ReorderingModes[
OpIdx] = ReorderingMode::Opcode;
3391 ReorderingModes[
OpIdx] = ReorderingMode::Constant;
3394 ReorderingModes[
OpIdx] = ReorderingMode::Splat;
3404 auto &&SkipReordering = [
this]() {
3407 for (
const OperandData &
Data : Op0)
3410 ArrayRef(OpsVec).slice(1, getNumOperands() - 1)) {
3411 if (
any_of(
Op, [&UniqueValues](
const OperandData &
Data) {
3418 return UniqueValues.
size() != 2 &&
3420 UniqueValues.
size());
3432 if (SkipReordering())
3435 bool StrategyFailed =
false;
3443 for (
unsigned I = 0;
I < NumOperands; ++
I)
3444 MainAltOps[
I].push_back(getData(
I, FirstLane).V);
3447 UsedLanes.
set(FirstLane);
3448 for (
unsigned Distance = 1; Distance != NumLanes; ++Distance) {
3450 for (
int Direction : {+1, -1}) {
3451 int Lane = FirstLane + Direction * Distance;
3452 if (Lane < 0 || Lane >= (
int)NumLanes)
3454 UsedLanes.
set(Lane);
3455 int LastLane = Lane - Direction;
3456 assert(LastLane >= 0 && LastLane < (
int)NumLanes &&
3461 std::optional<unsigned> BestIdx =
3462 getBestOperand(
OpIdx, Lane, LastLane, ReorderingModes,
3463 MainAltOps[
OpIdx], UsedLanes);
3470 swap(
OpIdx, *BestIdx, Lane);
3473 StrategyFailed =
true;
3477 OperandData &AltOp = getData(
OpIdx, Lane);
3478 InstructionsState OpS =
3480 if (OpS && OpS.isAltShuffle())
3487 if (!StrategyFailed)
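// Operand reordering proceeds greedily: pick the most constrained lane first,
// then walk outward in both directions, choosing for each lane the operand
// with the best look-ahead score against the previously fixed lane;
// StrategyFailed records that at least one lane could not be matched.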
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
      case ReorderingMode::Load:
      case ReorderingMode::Opcode:
      case ReorderingMode::Constant:
      case ReorderingMode::Splat:
      case ReorderingMode::Failed:
      const unsigned Indent = 2;
      for (const OperandDataVec &OpDataVec : OpsVec) {
        OS << "Operand " << Cnt++ << "\n";
        for (const OperandData &OpData : OpDataVec) {
          OS.indent(Indent) << "{";
          if (Value *V = OpData.V)
          OS << ", APO:" << OpData.APO << "}\n";
    int BestScore = Limit;
    std::optional<int> Index;
    for (int I : seq<int>(0, Candidates.size())) {
                               Candidates[I].second,
      if (Score > BestScore) {
    DeletedInstructions.insert(I);
3583 template <
typename T>
3586 ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales) {
3588 for (T *V : DeadVals) {
3593 for (T *V : DeadVals) {
3594 if (!V || !Processed.
insert(V).second)
3599 for (
Use &U :
I->operands()) {
3601 OpI && !DeletedInstructions.contains(OpI) && OpI->hasOneUser() &&
3603 (Entries.
empty() ||
none_of(Entries, [&](
const TreeEntry *Entry) {
3604 return Entry->VectorizedValue == OpI;
3608 I->dropAllReferences();
3610 for (T *V : DeadVals) {
3612 if (!
I->getParent())
3617 cast<Instruction>(U.getUser()));
3619 "trying to erase instruction with users.");
3620 I->removeFromParent();
3624 while (!DeadInsts.
empty()) {
3627 if (!VI || !VI->getParent())
3630 "Live instruction found in dead worklist!");
3631 assert(VI->use_empty() &&
"Instructions with uses are not dead.");
3638 for (
Use &OpU : VI->operands()) {
3639 Value *OpV = OpU.get();
3651 if (!DeletedInstructions.contains(OpI) &&
3652 (!OpI->getType()->isVectorTy() ||
3653 none_of(VectorValuesAndScales,
3654 [&](
const std::tuple<Value *, unsigned, bool> &V) {
3655 return std::get<0>(V) == OpI;
3661 VI->removeFromParent();
3663 SE->forgetValue(VI);
3670 return AnalyzedReductionsRoots.count(
I);
3675 AnalyzedReductionsRoots.insert(
I);
3680 return AnalyzedReductionVals.contains(
hash_value(VL));
3685 AnalyzedReductionVals.insert(
hash_value(VL));
3689 AnalyzedReductionsRoots.clear();
3690 AnalyzedReductionVals.clear();
3691 AnalyzedMinBWVals.clear();
3699 return MustGather.contains(V);
3703 return NonScheduledFirst.contains(V);
3708 assert(V &&
"V cannot be nullptr.");
3709 return ScalarToTreeEntries.contains(V);
3719 bool collectValuesToDemote(
3720 const TreeEntry &E,
bool IsProfitableToDemoteRoot,
unsigned &
BitWidth,
3723 bool &IsProfitableToDemote,
bool IsTruncRoot)
const;
3732 void buildReorderableOperands(
3740 void reorderNodeWithReuses(TreeEntry &TE,
ArrayRef<int> Mask)
const;
3743 bool areAllUsersVectorized(
3753 const TreeEntry *getOperandEntry(
const TreeEntry *E,
unsigned Idx)
const;
3754 TreeEntry *getOperandEntry(TreeEntry *E,
unsigned Idx) {
3755 return const_cast<TreeEntry *
>(
3756 getOperandEntry(
const_cast<const TreeEntry *
>(E), Idx));
3762 Instruction *getRootEntryInstruction(
const TreeEntry &Entry)
const;
3766 getCastContextHint(
const TreeEntry &TE)
const;
3780 const InstructionsState &LocalState,
3787 unsigned InterleaveFactor = 0);
3798 bool ResizeAllowed =
false)
const;
3805 Value *vectorizeOperand(TreeEntry *
E,
unsigned NodeIdx);
3810 template <
typename BVTy,
typename ResTy,
typename... Args>
3811 ResTy processBuildVector(
const TreeEntry *
E,
Type *ScalarTy, Args &...Params);
3816 Value *createBuildVector(
const TreeEntry *
E,
Type *ScalarTy);
3822 Instruction &getLastInstructionInBundle(
const TreeEntry *
E);
3829 std::optional<TargetTransformInfo::ShuffleKind>
3841 unsigned NumParts)
const;
3853 std::optional<TargetTransformInfo::ShuffleKind>
3854 isGatherShuffledSingleRegisterEntry(
3871 isGatherShuffledEntry(
3874 unsigned NumParts,
bool ForOrder =
false);
3880 Type *ScalarTy)
const;
3884 void setInsertPointAfterBundle(
const TreeEntry *
E);
3894 bool isFullyVectorizableTinyTree(
bool ForReduction)
const;
3899 void tryToVectorizeGatheredLoads(
3901 std::tuple<BasicBlock *, Value *, Type *>,
3909 collectUserStores(
const BoUpSLP::TreeEntry *TE)
const;
3925 findExternalStoreUsersReorderIndices(TreeEntry *TE)
const;
3929 void reorderGatherNode(TreeEntry &TE);
3935 bool matchesShlZExt(
const TreeEntry &TE,
OrdersType &Order,
3936 bool &IsBSwap)
const;
3941 TreeEntry(VecTreeTy &Container) : Container(Container) {}
3944 SmallVector<int> getCommonMask()
const {
3945 if (State == TreeEntry::SplitVectorize)
3947 SmallVector<int>
Mask;
3954 SmallVector<int> getSplitMask()
const {
3955 assert(State == TreeEntry::SplitVectorize && !ReorderIndices.empty() &&
3956 "Expected only split vectorize node.");
3958 unsigned CommonVF = std::max<unsigned>(
3959 CombinedEntriesWithIndices.back().second,
3960 Scalars.size() - CombinedEntriesWithIndices.back().second);
3961 for (
auto [Idx,
I] :
enumerate(ReorderIndices))
3963 Idx + (Idx >= CombinedEntriesWithIndices.back().second
3964 ? CommonVF - CombinedEntriesWithIndices.back().second
3971 void reorderSplitNode(
unsigned Idx, ArrayRef<int> Mask,
3972 ArrayRef<int> MaskOrder);
3977 if (
Mask.size() != VL.
size() && VL.
size() == Scalars.size())
3978 return std::equal(VL.
begin(), VL.
end(), Scalars.begin());
3981 [Scalars](
Value *V,
int Idx) {
3982 return (isa<UndefValue>(V) &&
3983 Idx == PoisonMaskElem) ||
3984 (Idx != PoisonMaskElem && V == Scalars[Idx]);
3987 if (!ReorderIndices.empty()) {
3991 SmallVector<int>
Mask;
3993 if (VL.
size() == Scalars.size())
3994 return IsSame(Scalars, Mask);
3995 if (VL.
size() == ReuseShuffleIndices.size()) {
3997 return IsSame(Scalars, Mask);
4001 return IsSame(Scalars, ReuseShuffleIndices);
4005 bool hasEqualOperands(
const TreeEntry &TE)
const {
4006 if (
TE.getNumOperands() != getNumOperands())
4008 SmallBitVector
Used(getNumOperands());
4009 for (
unsigned I = 0,
E = getNumOperands();
I <
E; ++
I) {
4010 unsigned PrevCount =
Used.count();
4011 for (
unsigned K = 0;
K <
E; ++
K) {
4014 if (getOperand(K) ==
TE.getOperand(
I)) {
4020 if (PrevCount ==
Used.count())
4029 unsigned getVectorFactor()
const {
4030 if (!ReuseShuffleIndices.empty())
4031 return ReuseShuffleIndices.size();
4032 return Scalars.size();
4036 bool isGather()
const {
return State == NeedToGather; }
4042 WeakTrackingVH VectorizedValue =
nullptr;
4063 enum CombinedOpcode {
4065 MinMax = Instruction::OtherOpsEnd + 1,
4068 ReducedBitcastBSwap,
4070 CombinedOpcode CombinedOp = NotCombinedOp;
4073 SmallVector<int, 4> ReuseShuffleIndices;
4076 SmallVector<unsigned, 4> ReorderIndices;
4084 VecTreeTy &Container;
4087 EdgeInfo UserTreeIndex;
4100 SmallVector<ValueList, 2> Operands;
4103 SmallPtrSet<const Value *, 4> CopyableElements;
4107 InstructionsState S = InstructionsState::invalid();
4110 unsigned InterleaveFactor = 0;
4113 bool DoesNotNeedToSchedule =
false;
4117 if (Operands.size() <
OpIdx + 1)
4118 Operands.resize(
OpIdx + 1);
4121 "Number of operands is greater than the number of scalars.");
4128 unsigned getInterleaveFactor()
const {
return InterleaveFactor; }
4130 void setInterleave(
unsigned Factor) { InterleaveFactor = Factor; }
4133 void setDoesNotNeedToSchedule() { DoesNotNeedToSchedule =
true; }
4136 bool doesNotNeedToSchedule()
const {
return DoesNotNeedToSchedule; }
4141 setOperand(
I, Operands[
I]);
4145 void reorderOperands(ArrayRef<int> Mask) {
4153 return Operands[
OpIdx];
4159 return Operands[
OpIdx];
4163 unsigned getNumOperands()
const {
return Operands.size(); }
4166 Value *getSingleOperand(
unsigned OpIdx)
const {
4169 return Operands[
OpIdx][0];
4173 bool isAltShuffle()
const {
return S.isAltShuffle(); }
4175 Instruction *getMatchingMainOpOrAltOp(Instruction *
I)
const {
4176 return S.getMatchingMainOpOrAltOp(
I);
4184 if (
I && getMatchingMainOpOrAltOp(
I))
4186 return S.getMainOp();
4189 void setOperations(
const InstructionsState &S) {
4190 assert(S &&
"InstructionsState is invalid.");
4194 Instruction *getMainOp()
const {
return S.getMainOp(); }
4196 Instruction *getAltOp()
const {
return S.getAltOp(); }
4199 unsigned getOpcode()
const {
return S.getOpcode(); }
4201 unsigned getAltOpcode()
const {
return S.getAltOpcode(); }
4203 bool hasState()
const {
return S.valid(); }
4206 void addCopyableElement(
Value *V) {
4207 assert(S.isCopyableElement(V) &&
"Not a copyable element.");
4208 CopyableElements.insert(V);
4212 bool isCopyableElement(
Value *V)
const {
4213 return CopyableElements.contains(V);
4217 bool hasCopyableElements()
const {
return !CopyableElements.empty(); }
4220 const InstructionsState &getOperations()
const {
return S; }
4224 unsigned findLaneForValue(
Value *V)
const {
4225 unsigned FoundLane = getVectorFactor();
4226 for (
auto *It =
find(Scalars, V), *End = Scalars.end(); It != End;
4227 std::advance(It, 1)) {
4230 FoundLane = std::distance(Scalars.begin(), It);
4231 assert(FoundLane < Scalars.size() &&
"Couldn't find extract lane");
4232 if (!ReorderIndices.empty())
4233 FoundLane = ReorderIndices[FoundLane];
4234 assert(FoundLane < Scalars.size() &&
"Couldn't find extract lane");
4235 if (ReuseShuffleIndices.empty())
4237 if (
auto *RIt =
find(ReuseShuffleIndices, FoundLane);
4238 RIt != ReuseShuffleIndices.end()) {
4239 FoundLane = std::distance(ReuseShuffleIndices.begin(), RIt);
4243 assert(FoundLane < getVectorFactor() &&
"Unable to find given value.");
4250 buildAltOpShuffleMask(
const function_ref<
bool(Instruction *)> IsAltOp,
4251 SmallVectorImpl<int> &Mask,
4252 SmallVectorImpl<Value *> *OpScalars =
nullptr,
4253 SmallVectorImpl<Value *> *AltScalars =
nullptr)
const;
4256 bool isNonPowOf2Vec()
const {
4258 return IsNonPowerOf2;
4264 hasNonWholeRegisterOrNonPowerOf2Vec(
const TargetTransformInfo &
TTI)
const {
4267 assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) &&
4268 "Reshuffling not supported with non-power-of-2 vectors yet.");
4269 return IsNonPowerOf2;
4272 Value *getOrdered(
unsigned Idx)
const {
4273 if (ReorderIndices.empty())
4274 return Scalars[Idx];
4275 SmallVector<int>
Mask;
4277 return Scalars[
Mask[Idx]];
      dbgs() << Idx << ".\n";
      for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
        dbgs() << "Operand " << OpI << ":\n";
        for (const Value *V : Operands[OpI])
      dbgs() << "Scalars: \n";
      for (Value *V : Scalars)
      dbgs() << "State: ";
      if (S && hasCopyableElements())
        dbgs() << "[[Copyable]] ";
        if (InterleaveFactor > 0) {
          dbgs() << "Vectorize with interleave factor " << InterleaveFactor
          dbgs() << "Vectorize\n";
      case ScatterVectorize:
        dbgs() << "ScatterVectorize\n";
      case StridedVectorize:
        dbgs() << "StridedVectorize\n";
      case CompressVectorize:
        dbgs() << "CompressVectorize\n";
        dbgs() << "NeedToGather\n";
      case CombinedVectorize:
        dbgs() << "CombinedVectorize\n";
      case SplitVectorize:
        dbgs() << "SplitVectorize\n";
        dbgs() << "MainOp: " << *S.getMainOp() << "\n";
        dbgs() << "AltOp: " << *S.getAltOp() << "\n";
        dbgs() << "MainOp: NULL\n";
        dbgs() << "AltOp: NULL\n";
      dbgs() << "VectorizedValue: ";
      if (VectorizedValue)
        dbgs() << *VectorizedValue << "\n";
      dbgs() << "ReuseShuffleIndices: ";
      if (ReuseShuffleIndices.empty())
        for (int ReuseIdx : ReuseShuffleIndices)
          dbgs() << ReuseIdx << ", ";
      dbgs() << "ReorderIndices: ";
      for (unsigned ReorderIdx : ReorderIndices)
        dbgs() << ReorderIdx << ", ";
      dbgs() << "UserTreeIndex: ";
        dbgs() << UserTreeIndex;
        dbgs() << "<invalid>";
      if (!CombinedEntriesWithIndices.empty()) {
        dbgs() << "Combined entries: ";
          dbgs() << "Entry index " << P.first << " with offset " << P.second;
                        StringRef Banner) const {
      dbgs() << "SLP: " << Banner << ":\n";
      dbgs() << "SLP: Costs:\n";
      dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n";
      dbgs() << "SLP: VectorCost = " << VecCost << "\n";
      dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n";
      dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = "
             << ReuseShuffleCost + VecCost - ScalarCost << "\n";
4380 const InstructionsState &S,
4382 ArrayRef<int> ReuseShuffleIndices = {}) {
4383 auto Invalid = ScheduleBundle::invalid();
4384 return newTreeEntry(VL,
Invalid, S, UserTreeIdx, ReuseShuffleIndices);
4389 const InstructionsState &S,
4391 ArrayRef<int> ReuseShuffleIndices = {},
4392 ArrayRef<unsigned> ReorderIndices = {},
4393 unsigned InterleaveFactor = 0) {
4394 TreeEntry::EntryState EntryState =
4395 Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
4396 TreeEntry *
E = newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
4397 ReuseShuffleIndices, ReorderIndices);
4398 if (
E && InterleaveFactor > 0)
4399 E->setInterleave(InterleaveFactor);
4404 TreeEntry::EntryState EntryState,
4405 ScheduleBundle &Bundle,
const InstructionsState &S,
4407 ArrayRef<int> ReuseShuffleIndices = {},
4408 ArrayRef<unsigned> ReorderIndices = {}) {
4409 assert(((!Bundle && (EntryState == TreeEntry::NeedToGather ||
4410 EntryState == TreeEntry::SplitVectorize)) ||
4411 (Bundle && EntryState != TreeEntry::NeedToGather &&
4412 EntryState != TreeEntry::SplitVectorize)) &&
4413 "Need to vectorize gather entry?");
4415 if (GatheredLoadsEntriesFirst.has_value() &&
4416 EntryState == TreeEntry::NeedToGather && S &&
4417 S.getOpcode() == Instruction::Load && UserTreeIdx.EdgeIdx == UINT_MAX &&
4418 !UserTreeIdx.UserTE)
4420 VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
4421 TreeEntry *
Last = VectorizableTree.back().get();
4422 Last->Idx = VectorizableTree.size() - 1;
4423 Last->State = EntryState;
4424 if (UserTreeIdx.UserTE)
4425 OperandsToTreeEntry.try_emplace(
4426 std::make_pair(UserTreeIdx.UserTE, UserTreeIdx.EdgeIdx),
Last);
4431 ReuseShuffleIndices.empty()) &&
4432 "Reshuffling scalars not yet supported for nodes with padding");
4433 Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
4434 ReuseShuffleIndices.end());
4435 if (ReorderIndices.
empty()) {
4438 Last->setOperations(S);
4441 Last->Scalars.assign(VL.
size(),
nullptr);
4443 [VL](
unsigned Idx) ->
Value * {
4444 if (Idx >= VL.size())
4445 return UndefValue::get(VL.front()->getType());
4450 Last->setOperations(S);
4451 Last->ReorderIndices.append(ReorderIndices.
begin(), ReorderIndices.
end());
4453 if (EntryState == TreeEntry::SplitVectorize) {
4454 assert(S &&
"Split nodes must have operations.");
4455 Last->setOperations(S);
4456 SmallPtrSet<Value *, 4> Processed;
4457 for (
Value *V : VL) {
4461 auto It = ScalarsInSplitNodes.find(V);
4462 if (It == ScalarsInSplitNodes.end()) {
4463 ScalarsInSplitNodes.try_emplace(V).first->getSecond().push_back(
Last);
4464 (void)Processed.
insert(V);
4465 }
else if (Processed.
insert(V).second) {
4467 "Value already associated with the node.");
4468 It->getSecond().push_back(
Last);
4471 }
else if (!
Last->isGather()) {
4474 (!S.areInstructionsWithCopyableElements() &&
4476 all_of(VL, [&](
Value *V) {
return S.isNonSchedulable(V); }))
4477 Last->setDoesNotNeedToSchedule();
4478 SmallPtrSet<Value *, 4> Processed;
4479 for (
Value *V : VL) {
4482 if (S.isCopyableElement(V)) {
4483 Last->addCopyableElement(V);
4486 auto It = ScalarToTreeEntries.find(V);
4487 if (It == ScalarToTreeEntries.end()) {
4488 ScalarToTreeEntries.try_emplace(V).first->getSecond().push_back(
Last);
4489 (void)Processed.
insert(V);
4490 }
else if (Processed.
insert(V).second) {
4492 "Value already associated with the node.");
4493 It->getSecond().push_back(
Last);
4497 assert((!Bundle.getBundle().empty() ||
Last->doesNotNeedToSchedule()) &&
4498 "Bundle and VL out of sync");
4499 if (!Bundle.getBundle().empty()) {
4500#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
4501 auto *BundleMember = Bundle.getBundle().begin();
4502 SmallPtrSet<Value *, 4> Processed;
4503 for (
Value *V : VL) {
4504 if (S.isNonSchedulable(V) || !Processed.
insert(V).second)
4508 assert(BundleMember == Bundle.getBundle().end() &&
4509 "Bundle and VL out of sync");
4511 Bundle.setTreeEntry(
Last);
4515 bool AllConstsOrCasts =
true;
4516 for (
Value *V : VL) {
4517 if (S && S.areInstructionsWithCopyableElements() &&
4518 S.isCopyableElement(V))
4519 Last->addCopyableElement(V);
4522 AllConstsOrCasts &=
I &&
I->getType()->isIntegerTy();
4523 if (UserTreeIdx.EdgeIdx != UINT_MAX || !UserTreeIdx.UserTE ||
4524 !UserTreeIdx.UserTE->isGather())
4525 ValueToGatherNodes.try_emplace(V).first->getSecond().insert(
Last);
4528 if (AllConstsOrCasts)
4530 std::make_pair(std::numeric_limits<unsigned>::max(), 1);
4531 MustGather.insert_range(VL);
4534 if (UserTreeIdx.UserTE)
4535 Last->UserTreeIndex = UserTreeIdx;
4541 TreeEntry::VecTreeTy VectorizableTree;
4546 for (
unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
4547 VectorizableTree[
Id]->dump();
4548 if (TransformedToGatherNodes.contains(VectorizableTree[Id].get()))
4549 dbgs() <<
"[[TRANSFORMED TO GATHER]]";
4550 else if (DeletedNodes.contains(VectorizableTree[Id].get()))
4551 dbgs() <<
"[[DELETED NODE]]";
4559 assert(V &&
"V cannot be nullptr.");
4560 auto It = ScalarToTreeEntries.find(V);
4561 if (It == ScalarToTreeEntries.end())
4563 return It->getSecond();
4568 assert(V &&
"V cannot be nullptr.");
4569 auto It = ScalarsInSplitNodes.find(V);
4570 if (It == ScalarsInSplitNodes.end())
4572 return It->getSecond();
4577 bool SameVF =
false)
const {
4578 assert(V &&
"V cannot be nullptr.");
4579 for (TreeEntry *TE : ScalarToTreeEntries.lookup(V))
4580 if ((!SameVF ||
TE->getVectorFactor() == VL.
size()) &&
TE->isSame(VL))
4591 bool areAltOperandsProfitable(
const InstructionsState &S,
4596 class ScalarsVectorizationLegality {
4597 InstructionsState S;
4599 bool TryToFindDuplicates;
4600 bool TrySplitVectorize;
4603 ScalarsVectorizationLegality(InstructionsState S,
bool IsLegal,
4604 bool TryToFindDuplicates =
true,
4605 bool TrySplitVectorize =
false)
4606 : S(S), IsLegal(IsLegal), TryToFindDuplicates(TryToFindDuplicates),
4607 TrySplitVectorize(TrySplitVectorize) {
4608 assert((!IsLegal || (S.valid() && TryToFindDuplicates)) &&
4609 "Inconsistent state");
4611 const InstructionsState &getInstructionsState()
const {
return S; };
4612 bool isLegal()
const {
return IsLegal; }
4613 bool tryToFindDuplicates()
const {
return TryToFindDuplicates; }
4614 bool trySplitVectorize()
const {
return TrySplitVectorize; }
4619 ScalarsVectorizationLegality
4622 bool TryCopyableElementsVectorization)
const;
  TreeEntry::EntryState getScalarsVectorizationState(
      bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
      SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo);

  SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarToTreeEntries;

  SmallPtrSet<const TreeEntry *, 8> DeletedNodes;

  SmallDenseMap<const TreeEntry *, InstructionCost> TransformedToGatherNodes;

  SmallDenseMap<std::pair<const TreeEntry *, unsigned>, TreeEntry *>
      OperandsToTreeEntry;

  SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarsInSplitNodes;

  SmallDenseMap<Value *, unsigned> InstrElementSize;

  SmallDenseMap<const TreeEntry *, WeakTrackingVH> EntryToLastInstruction;

  SmallDenseMap<const Instruction *, Instruction *> LastInstructionToPos;

  SetVector<const TreeEntry *> PostponedGathers;

  using ValueToGatherNodesMap =
      DenseMap<Value *, SmallSetVector<const TreeEntry *, 4>>;
  ValueToGatherNodesMap ValueToGatherNodes;

  SetVector<unsigned> LoadEntriesToVectorize;

  bool IsGraphTransformMode = false;

  std::optional<unsigned> GatheredLoadsEntriesFirst;

  SmallDenseMap<const TreeEntry *,
                std::tuple<SmallVector<int>, VectorType *, unsigned, bool>>
      CompressEntryToData;
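  /// An external use of a scalar that belongs to the vectorized tree: the
  /// scalar itself, the out-of-tree user, the tree entry the scalar comes
  /// from, and the lane it occupies in that entry.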
  struct ExternalUser {
    ExternalUser(Value *S, llvm::User *U, const TreeEntry &E, unsigned L)
        : Scalar(S), User(U), E(E), Lane(L) {}

    Value *Scalar = nullptr;

    llvm::User *User = nullptr;

  using UserList = SmallVector<ExternalUser, 16>;
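  /// Returns whether \p Inst1 (accessing \p Loc1) and \p Inst2 may alias.
  /// The result of each (Inst1, Inst2) query is memoized in AliasCache, so the
  /// potentially expensive BatchAA query is issued at most once per pair.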
  bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
                 Instruction *Inst2) {
    AliasCacheKey Key = std::make_pair(Inst1, Inst2);
    auto Res = AliasCache.try_emplace(Key);
      return Res.first->second;
    bool Aliased = isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1));
    Res.first->getSecond() = Aliased;

  using AliasCacheKey = std::pair<Instruction *, Instruction *>;

  SmallDenseMap<AliasCacheKey, bool> AliasCache;

  BatchAAResults BatchAA;

  DenseSet<Instruction *> DeletedInstructions;

  SmallPtrSet<Instruction *, 16> AnalyzedReductionsRoots;

  DenseSet<size_t> AnalyzedReductionVals;

  DenseSet<Value *> AnalyzedMinBWVals;

  UserList ExternalUses;

  SmallPtrSet<Value *, 4> ExternalUsesAsOriginalScalar;

  SmallPtrSet<Value *, 4> ExternalUsesWithNonUsers;

  SmallPtrSet<const Value *, 32> EphValues;

  SetVector<Instruction *> GatherShuffleExtractSeq;

  DenseSet<BasicBlock *> CSEBlocks;

  DenseSet<size_t> ListOfKnonwnNonVectorizableLoads;
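  /// Base class for the units the scheduler works with. A ScheduleEntity is
  /// either per-instruction ScheduleData, a ScheduleBundle of entities that
  /// are vectorized together, or ScheduleCopyableData modelling a copyable
  /// element on a particular user edge; queries are dispatched to the
  /// concrete kind.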
  class ScheduleEntity {
    friend class ScheduleBundle;
    friend class ScheduleData;
    friend class ScheduleCopyableData;

    enum class Kind { ScheduleData, ScheduleBundle, ScheduleCopyableData };
    Kind getKind() const { return K; }
    ScheduleEntity(Kind K) : K(K) {}

    int SchedulingPriority = 0;

    bool IsScheduled = false;

    const Kind K = Kind::ScheduleData;

    ScheduleEntity() = delete;

    void setSchedulingPriority(int Priority) { SchedulingPriority = Priority; }
    int getSchedulingPriority() const { return SchedulingPriority; }
    bool isReady() const {
        return SD->isReady();
        return CD->isReady();

    bool hasValidDependencies() const {
        return SD->hasValidDependencies();
        return CD->hasValidDependencies();

    int getUnscheduledDeps() const {
        return SD->getUnscheduledDeps();
        return CD->getUnscheduledDeps();

    int incrementUnscheduledDeps(int Incr) {
        return SD->incrementUnscheduledDeps(Incr);

    int getDependencies() const {
        return SD->getDependencies();

        return SD->getInst();

    bool isScheduled() const { return IsScheduled; }
    void setScheduled(bool Scheduled) { IsScheduled = Scheduled; }

    static bool classof(const ScheduleEntity *) { return true; }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    void dump(raw_ostream &OS) const {
        return SD->dump(OS);
        return CD->dump(OS);

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
                              const BoUpSLP::ScheduleEntity &SE) {
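  /// Per-instruction scheduling state: the scheduling region the instruction
  /// belongs to, its dependency counters (Dependencies / UnscheduledDeps), and
  /// the lists of memory and control dependencies discovered for it.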
  class ScheduleData final : public ScheduleEntity {
    enum { InvalidDeps = -1 };

    ScheduleData() : ScheduleEntity(Kind::ScheduleData) {}
    static bool classof(const ScheduleEntity *Entity) {
      return Entity->getKind() == Kind::ScheduleData;

    void init(int BlockSchedulingRegionID, Instruction *I) {
      NextLoadStore = nullptr;
      IsScheduled = false;
      SchedulingRegionID = BlockSchedulingRegionID;
      clearDependencies();

      if (hasValidDependencies()) {
        assert(UnscheduledDeps <= Dependencies && "invariant");
        assert(UnscheduledDeps == Dependencies && "invariant");

        assert(hasValidDependencies() && UnscheduledDeps == 0 &&
               "unexpected scheduled state");

    bool hasValidDependencies() const { return Dependencies != InvalidDeps; }

    bool isReady() const { return UnscheduledDeps == 0 && !IsScheduled; }

    int incrementUnscheduledDeps(int Incr) {
      assert(hasValidDependencies() &&
             "increment of unscheduled deps would be meaningless");
      UnscheduledDeps += Incr;
      assert(UnscheduledDeps >= 0 &&
             "Expected valid number of unscheduled deps");
      return UnscheduledDeps;

    void resetUnscheduledDeps() { UnscheduledDeps = Dependencies; }

    void clearDependencies() {
      clearDirectDependencies();
      MemoryDependencies.clear();
      ControlDependencies.clear();

    void clearDirectDependencies() {
      Dependencies = InvalidDeps;
      resetUnscheduledDeps();
      IsScheduled = false;

    int getUnscheduledDeps() const { return UnscheduledDeps; }
    int getDependencies() const { return Dependencies; }
    void initDependencies() { Dependencies = 0; }
    void incDependencies() { Dependencies++; }

    int getSchedulingRegionID() const { return SchedulingRegionID; }

      return MemoryDependencies;

    void addMemoryDependency(ScheduleData *Dep) {
      MemoryDependencies.push_back(Dep);

      return ControlDependencies;

    void addControlDependency(ScheduleData *Dep) {
      ControlDependencies.push_back(Dep);

    ScheduleData *getNextLoadStore() const { return NextLoadStore; }
    void setNextLoadStore(ScheduleData *Next) { NextLoadStore = Next; }

    void dump(raw_ostream &OS) const { OS << *Inst; }

    ScheduleData *NextLoadStore = nullptr;

    SmallVector<ScheduleData *> MemoryDependencies;

    SmallVector<ScheduleData *> ControlDependencies;

    int SchedulingRegionID = 0;

    int Dependencies = InvalidDeps;

    int UnscheduledDeps = InvalidDeps;

                              const BoUpSLP::ScheduleData &SD) {
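  /// A bundle of ScheduleEntities that must be scheduled together as one
  /// vectorized group; it is ready only when none of its members has
  /// unscheduled dependencies left.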
  class ScheduleBundle final : public ScheduleEntity {
    bool IsValid = true;

    TreeEntry *TE = nullptr;
    ScheduleBundle(bool IsValid)
        : ScheduleEntity(Kind::ScheduleBundle), IsValid(IsValid) {}

    ScheduleBundle() : ScheduleEntity(Kind::ScheduleBundle) {}
    static bool classof(const ScheduleEntity *Entity) {
      return Entity->getKind() == Kind::ScheduleBundle;

      for (const ScheduleEntity *SD : Bundle) {
        if (SD->hasValidDependencies()) {
          assert(SD->getUnscheduledDeps() <= SD->getDependencies() &&
          assert(SD->getUnscheduledDeps() == SD->getDependencies() &&
        if (isScheduled()) {
          assert(SD->hasValidDependencies() && SD->getUnscheduledDeps() == 0 &&
                 "unexpected scheduled state");

    int unscheduledDepsInBundle() const {
      assert(*this && "bundle must not be empty");
      for (const ScheduleEntity *BundleMember : Bundle) {
        if (BundleMember->getUnscheduledDeps() == ScheduleData::InvalidDeps)
          return ScheduleData::InvalidDeps;
        Sum += BundleMember->getUnscheduledDeps();

    bool hasValidDependencies() const {
      return all_of(Bundle, [](const ScheduleEntity *SD) {
        return SD->hasValidDependencies();

    bool isReady() const {
      assert(*this && "bundle must not be empty");
      return unscheduledDepsInBundle() == 0 && !isScheduled();

    void add(ScheduleEntity *SD) { Bundle.push_back(SD); }

    void setTreeEntry(TreeEntry *TE) { this->TE = TE; }
    TreeEntry *getTreeEntry() const { return TE; }

    static ScheduleBundle invalid() { return {false}; }

    operator bool() const { return IsValid; }

    void dump(raw_ostream &OS) const {
        OS << *SD->getInst();

                              const BoUpSLP::ScheduleBundle &Bundle) {
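  /// Scheduling state for a copyable element: it mirrors the dependency
  /// counters of ScheduleData, but is keyed by the user edge (tree entry,
  /// operand index) and the copied instruction, and keeps a reference to the
  /// bundle it was created for.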
  class ScheduleCopyableData final : public ScheduleEntity {
    int SchedulingRegionID = 0;

    ScheduleBundle &Bundle;

    ScheduleCopyableData(int BlockSchedulingRegionID, Instruction *I,
                         const EdgeInfo &EI, ScheduleBundle &Bundle)
        : ScheduleEntity(Kind::ScheduleCopyableData), Inst(I), EI(EI),
          SchedulingRegionID(BlockSchedulingRegionID), Bundle(Bundle) {}
    static bool classof(const ScheduleEntity *Entity) {
      return Entity->getKind() == Kind::ScheduleCopyableData;

      if (hasValidDependencies()) {
        assert(UnscheduledDeps <= Dependencies && "invariant");
        assert(UnscheduledDeps == Dependencies && "invariant");

        assert(hasValidDependencies() && UnscheduledDeps == 0 &&
               "unexpected scheduled state");

    bool hasValidDependencies() const {
      return Dependencies != ScheduleData::InvalidDeps;

    bool isReady() const { return UnscheduledDeps == 0 && !IsScheduled; }

    int incrementUnscheduledDeps(int Incr) {
      assert(hasValidDependencies() &&
             "increment of unscheduled deps would be meaningless");
      UnscheduledDeps += Incr;
      assert(UnscheduledDeps >= 0 && "invariant");
      return UnscheduledDeps;

    void resetUnscheduledDeps() { UnscheduledDeps = Dependencies; }

    int getUnscheduledDeps() const { return UnscheduledDeps; }
    int getDependencies() const { return Dependencies; }
    void initDependencies() { Dependencies = 0; }
    void incDependencies() { Dependencies++; }

    int getSchedulingRegionID() const { return SchedulingRegionID; }

    void clearDependencies() {
      Dependencies = ScheduleData::InvalidDeps;
      UnscheduledDeps = ScheduleData::InvalidDeps;
      IsScheduled = false;

    const EdgeInfo &getEdgeInfo() const { return EI; }

    ScheduleBundle &getBundle() { return Bundle; }
    const ScheduleBundle &getBundle() const { return Bundle; }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    void dump(raw_ostream &OS) const { OS << "[Copyable]" << *getInst(); }

    int Dependencies = ScheduleData::InvalidDeps;

    int UnscheduledDeps = ScheduleData::InvalidDeps;
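  /// Per-basic-block scheduling state: the current scheduling region
  /// [ScheduleStart, ScheduleEnd), the per-instruction and per-copyable
  /// lookup maps, and the ready list used while scheduling the region.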
  struct BlockScheduling {
        : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}

      ScheduledBundles.clear();
      ScheduledBundlesList.clear();
      ScheduleCopyableDataMap.clear();
      ScheduleCopyableDataMapByInst.clear();
      ScheduleCopyableDataMapByInstUser.clear();
      ScheduleCopyableDataMapByUsers.clear();
      ScheduleStart = nullptr;
      ScheduleEnd = nullptr;
      FirstLoadStoreInRegion = nullptr;
      LastLoadStoreInRegion = nullptr;
      RegionHasStackSave = false;

      ScheduleRegionSizeLimit -= ScheduleRegionSize;
      ScheduleRegionSize = 0;

      ++SchedulingRegionID;

    ScheduleData *getScheduleData(Instruction *I) {
      if (BB != I->getParent())
      ScheduleData *SD = ScheduleDataMap.lookup(I);
      if (SD && isInSchedulingRegion(*SD))

    ScheduleData *getScheduleData(Value *V) {

    ScheduleCopyableData *getScheduleCopyableData(const EdgeInfo &EI,
                                                  const Value *V) const {
      if (ScheduleCopyableDataMap.empty())
      auto It = ScheduleCopyableDataMap.find(std::make_pair(EI, V));
      if (It == ScheduleCopyableDataMap.end())
      ScheduleCopyableData *SD = It->getSecond().get();
      if (!isInSchedulingRegion(*SD))
5365 getScheduleCopyableData(
const Value *User,
unsigned OperandIdx,
5367 if (ScheduleCopyableDataMapByInstUser.empty())
5369 const auto It = ScheduleCopyableDataMapByInstUser.find(
5370 std::make_pair(std::make_pair(User, OperandIdx), V));
5371 if (It == ScheduleCopyableDataMapByInstUser.end())
5374 for (ScheduleCopyableData *SD : It->getSecond()) {
5375 if (isInSchedulingRegion(*SD))
5389 bool areAllOperandsReplacedByCopyableData(Instruction *User,
5393 if (ScheduleCopyableDataMap.empty())
5395 SmallDenseMap<TreeEntry *, unsigned> PotentiallyReorderedEntriesCount;
5397 if (Entries.
empty())
5399 unsigned CurNumOps = 0;
5400 for (
const Use &U :
User->operands()) {
5406 for (TreeEntry *TE : Entries) {
5408 bool IsNonSchedulableWithParentPhiNode =
5409 TE->doesNotNeedToSchedule() &&
TE->UserTreeIndex &&
5410 TE->UserTreeIndex.UserTE->hasState() &&
5411 TE->UserTreeIndex.UserTE->State != TreeEntry::SplitVectorize &&
5412 TE->UserTreeIndex.UserTE->getOpcode() == Instruction::PHI;
5415 if (IsNonSchedulableWithParentPhiNode) {
5416 SmallPtrSet<Value *, 4> ParentsUniqueUsers;
5417 const TreeEntry *ParentTE =
TE->UserTreeIndex.UserTE;
5418 for (
Value *V : ParentTE->Scalars) {
5422 if (ParentsUniqueUsers.
insert(
PHI).second &&
5427 Inc =
count(
TE->Scalars, User);
5435 bool IsCommutativeUser =
5438 if (!IsCommutativeUser) {
5448 (!IsCommutativeUser ||
5457 "Expected commutative user with 2 first commutable operands");
5458 bool IsCommutativeWithSameOps =
5459 IsCommutativeUser &&
User->getOperand(0) ==
User->getOperand(1);
5460 if ((!IsCommutativeUser || IsCommutativeWithSameOps) &&
5462 EdgeInfo EI(TE,
U.getOperandNo());
5463 if (CurNumOps !=
NumOps || getScheduleCopyableData(EI,
Op))
5467 PotentiallyReorderedEntriesCount.
try_emplace(TE, 0)
5468 .first->getSecond() += Inc;
5471 if (PotentiallyReorderedEntriesCount.
empty())
5474 for (
auto &
P : PotentiallyReorderedEntriesCount) {
5475 SmallPtrSet<Value *, 4> ParentsUniqueUsers;
5476 bool IsNonSchedulableWithParentPhiNode =
5477 P.first->doesNotNeedToSchedule() &&
P.first->UserTreeIndex &&
5478 P.first->UserTreeIndex.UserTE->hasState() &&
5479 P.first->UserTreeIndex.UserTE->State != TreeEntry::SplitVectorize &&
5480 P.first->UserTreeIndex.UserTE->getOpcode() == Instruction::PHI;
5481 auto *It =
find(
P.first->Scalars, User);
5483 assert(It !=
P.first->Scalars.end() &&
5484 "User is not in the tree entry");
5485 int Lane = std::distance(
P.first->Scalars.begin(), It);
5486 assert(Lane >= 0 &&
"Lane is not found");
5488 Lane =
P.first->ReorderIndices[Lane];
5489 assert(Lane <
static_cast<int>(
P.first->Scalars.size()) &&
5490 "Couldn't find extract lane");
5493 if (IsNonSchedulableWithParentPhiNode) {
5494 const TreeEntry *ParentTE =
P.first->UserTreeIndex.UserTE;
5496 if (!ParentsUniqueUsers.
insert(User).second) {
5502 for (
unsigned OpIdx :
5504 P.first->getMainOp()))) {
5505 if (
P.first->getOperand(
OpIdx)[Lane] ==
Op &&
5506 getScheduleCopyableData(EdgeInfo(
P.first,
OpIdx),
Op))
5511 }
while (It !=
P.first->Scalars.end());
5513 return all_of(PotentiallyReorderedEntriesCount,
5514 [&](
const std::pair<const TreeEntry *, unsigned> &
P) {
5515 return P.second ==
NumOps - 1;
5520 getScheduleCopyableData(
const Instruction *
I)
const {
5521 if (ScheduleCopyableDataMapByInst.empty())
5523 const auto It = ScheduleCopyableDataMapByInst.find(
I);
5524 if (It == ScheduleCopyableDataMapByInst.end())
5527 for (ScheduleCopyableData *SD : It->getSecond()) {
5528 if (isInSchedulingRegion(*SD))
5535 getScheduleCopyableDataUsers(
const Instruction *User)
const {
5536 if (ScheduleCopyableDataMapByUsers.empty())
5538 const auto It = ScheduleCopyableDataMapByUsers.find(User);
5539 if (It == ScheduleCopyableDataMapByUsers.end())
5542 for (ScheduleCopyableData *SD : It->getSecond()) {
5543 if (isInSchedulingRegion(*SD))
5549 ScheduleCopyableData &addScheduleCopyableData(
const EdgeInfo &EI,
5551 int SchedulingRegionID,
5552 ScheduleBundle &Bundle) {
5553 assert(!getScheduleCopyableData(EI,
I) &&
"already in the map");
5554 ScheduleCopyableData *CD =
5555 ScheduleCopyableDataMap
5556 .try_emplace(std::make_pair(EI,
I),
5557 std::make_unique<ScheduleCopyableData>(
5558 SchedulingRegionID,
I, EI, Bundle))
5561 ScheduleCopyableDataMapByInst[
I].push_back(CD);
5565 assert(It !=
Op.end() &&
"Lane not set");
5566 SmallPtrSet<Instruction *, 4> Visited;
5568 int Lane = std::distance(
Op.begin(), It);
5569 assert(Lane >= 0 &&
"Lane not set");
5571 !EI.UserTE->ReorderIndices.empty())
5572 Lane = EI.UserTE->ReorderIndices[Lane];
5573 assert(Lane <
static_cast<int>(EI.UserTE->Scalars.size()) &&
5574 "Couldn't find extract lane");
5576 if (!Visited.
insert(In).second) {
5580 ScheduleCopyableDataMapByInstUser
5581 .try_emplace(std::make_pair(std::make_pair(In, EI.EdgeIdx),
I))
5584 ScheduleCopyableDataMapByUsers.try_emplace(
I)
5591 EdgeInfo UserEI = EI.UserTE->UserTreeIndex;
5592 if (ScheduleCopyableData *UserCD =
5593 getScheduleCopyableData(UserEI, In))
5594 ScheduleCopyableDataMapByUsers[
I].remove(UserCD);
5597 }
while (It !=
Op.end());
5599 ScheduleCopyableDataMapByUsers.try_emplace(
I).first->getSecond().insert(
5609 auto It = ScheduledBundles.find(
I);
5610 if (It == ScheduledBundles.end())
5612 return It->getSecond();
5616 bool isInSchedulingRegion(
const ScheduleEntity &SD)
const {
5618 return Data->getSchedulingRegionID() == SchedulingRegionID;
5620 return CD->getSchedulingRegionID() == SchedulingRegionID;
5622 [&](
const ScheduleEntity *BundleMember) {
5623 return isInSchedulingRegion(*BundleMember);
5629 template <
typename ReadyListType>
5630 void schedule(
const BoUpSLP &R,
const InstructionsState &S,
5631 const EdgeInfo &EI, ScheduleEntity *
Data,
5632 ReadyListType &ReadyList) {
5633 auto ProcessBundleMember = [&](ScheduleEntity *BundleMember,
5638 auto DecrUnsched = [&](
auto *
Data,
bool IsControl =
false) {
5639 if ((IsControl ||
Data->hasValidDependencies()) &&
5640 Data->incrementUnscheduledDeps(-1) == 0) {
5647 CopyableBundle.
push_back(&CD->getBundle());
5648 Bundles = CopyableBundle;
5650 Bundles = getScheduleBundles(
Data->getInst());
5652 if (!Bundles.
empty()) {
5653 for (ScheduleBundle *Bundle : Bundles) {
5654 if (Bundle->unscheduledDepsInBundle() == 0) {
5655 assert(!Bundle->isScheduled() &&
5656 "already scheduled bundle gets ready");
5657 ReadyList.insert(Bundle);
5659 <<
"SLP: gets ready: " << *Bundle <<
"\n");
5665 "already scheduled bundle gets ready");
5667 "Expected non-copyable data");
5668 ReadyList.insert(
Data);
5675 if (!ScheduleCopyableDataMap.empty()) {
5677 getScheduleCopyableData(User,
OpIdx,
I);
5678 for (ScheduleCopyableData *CD : CopyableData)
5679 DecrUnsched(CD,
false);
5680 if (!CopyableData.empty())
5683 if (ScheduleData *OpSD = getScheduleData(
I))
5684 DecrUnsched(OpSD,
false);
5690 if (!Bundles.empty()) {
5691 auto *
In = BundleMember->getInst();
5693 SmallDenseMap<const Instruction *, unsigned> OperandsUses;
5694 unsigned TotalOpCount = 0;
5697 TotalOpCount = OperandsUses[
In] = 1;
5699 for (
const Use &U :
In->operands()) {
5702 ++Res.first->getSecond();
5709 auto DecrUnschedForInst =
5711 SmallDenseSet<std::pair<const ScheduleEntity *, unsigned>>
5713 if (!ScheduleCopyableDataMap.empty()) {
5714 const EdgeInfo EI = {UserTE,
OpIdx};
5715 if (ScheduleCopyableData *CD =
5716 getScheduleCopyableData(EI,
I)) {
5717 if (!Checked.insert(std::make_pair(CD,
OpIdx)).second)
5719 DecrUnsched(CD,
false);
5723 auto It = OperandsUses.
find(
I);
5724 assert(It != OperandsUses.
end() &&
"Operand not found");
5725 if (It->second > 0) {
5726 if (ScheduleData *OpSD = getScheduleData(
I)) {
5727 if (!Checked.insert(std::make_pair(OpSD,
OpIdx)).second)
5730 assert(TotalOpCount > 0 &&
"No more operands to decrement");
5732 DecrUnsched(OpSD,
false);
5735 assert(TotalOpCount > 0 &&
"No more operands to decrement");
5741 SmallDenseSet<std::pair<const ScheduleEntity *, unsigned>> Checked;
5742 for (ScheduleBundle *Bundle : Bundles) {
5743 if (ScheduleCopyableDataMap.empty() && TotalOpCount == 0)
5745 SmallPtrSet<Value *, 4> ParentsUniqueUsers;
5748 auto *It =
find(Bundle->getTreeEntry()->Scalars, In);
5749 bool IsNonSchedulableWithParentPhiNode =
5750 Bundle->getTreeEntry()->doesNotNeedToSchedule() &&
5751 Bundle->getTreeEntry()->UserTreeIndex &&
5752 Bundle->getTreeEntry()->UserTreeIndex.UserTE->hasState() &&
5753 Bundle->getTreeEntry()->UserTreeIndex.UserTE->State !=
5754 TreeEntry::SplitVectorize &&
5755 Bundle->getTreeEntry()->UserTreeIndex.UserTE->getOpcode() ==
5759 std::distance(Bundle->getTreeEntry()->Scalars.begin(), It);
5760 assert(Lane >= 0 &&
"Lane not set");
5762 !Bundle->getTreeEntry()->ReorderIndices.empty())
5763 Lane = Bundle->getTreeEntry()->ReorderIndices[Lane];
5764 assert(Lane <
static_cast<int>(
5765 Bundle->getTreeEntry()->Scalars.size()) &&
5766 "Couldn't find extract lane");
5776 In->getNumOperands() ==
5777 Bundle->getTreeEntry()->getNumOperands() ||
5778 Bundle->getTreeEntry()->isCopyableElement(In)) &&
5779 "Missed TreeEntry operands?");
5783 if (IsNonSchedulableWithParentPhiNode) {
5784 const TreeEntry *ParentTE =
5785 Bundle->getTreeEntry()->UserTreeIndex.UserTE;
5787 if (!ParentsUniqueUsers.
insert(User).second) {
5788 It = std::find(std::next(It),
5789 Bundle->getTreeEntry()->Scalars.end(), In);
5794 for (
unsigned OpIdx :
5797 Bundle->getTreeEntry()->getOperand(
OpIdx)[Lane])) {
5800 DecrUnschedForInst(
I, Bundle->getTreeEntry(),
OpIdx, Checked);
5803 if (Bundle->getTreeEntry()->isCopyableElement(In))
5805 It = std::find(std::next(It),
5806 Bundle->getTreeEntry()->Scalars.end(), In);
5807 }
while (It != Bundle->getTreeEntry()->Scalars.end());
5812 for (Use &U : BundleMember->getInst()->operands()) {
5815 <<
"SLP: check for readiness (def): " << *
I <<
"\n");
5816 DecrUnschedForInst(BundleMember->getInst(),
U.getOperandNo(),
I);
5824 SmallPtrSet<const ScheduleData *, 4> VisitedMemory;
5825 for (ScheduleData *MemoryDep : SD->getMemoryDependencies()) {
5826 if (!VisitedMemory.
insert(MemoryDep).second)
5831 << *MemoryDep <<
"\n");
5832 DecrUnsched(MemoryDep);
5835 SmallPtrSet<const ScheduleData *, 4> VisitedControl;
5836 for (ScheduleData *Dep : SD->getControlDependencies()) {
5837 if (!VisitedControl.
insert(Dep).second)
5842 <<
"SLP: check for readiness (ctrl): " << *Dep <<
"\n");
5843 DecrUnsched(Dep,
true);
5847 SD->setScheduled(
true);
5852 if (
R.isVectorized(In)) {
5854 for (TreeEntry *TE : Entries) {
5856 In->getNumOperands() !=
TE->getNumOperands())
5859 PseudoBundles.
emplace_back(std::make_unique<ScheduleBundle>());
5860 BundlePtr->setTreeEntry(TE);
5865 ProcessBundleMember(SD, Bundles);
5868 Bundle.setScheduled(
true);
5870 auto AreAllBundlesScheduled =
5871 [&](
const ScheduleEntity *SD,
5875 return !SDBundles.empty() &&
5876 all_of(SDBundles, [&](
const ScheduleBundle *SDBundle) {
5877 return SDBundle->isScheduled();
5880 for (ScheduleEntity *SD : Bundle.getBundle()) {
5883 SDBundles = getScheduleBundles(SD->getInst());
5884 if (AreAllBundlesScheduled(SD, SDBundles)) {
5885 SD->setScheduled(
true);
      assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
             ScheduleStart->comesBefore(ScheduleEnd) &&
             "Not a valid scheduling region?");

      for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
        if (!Bundles.empty()) {
          for (ScheduleBundle *Bundle : Bundles) {
            assert(isInSchedulingRegion(*Bundle) &&
                   "primary schedule data not in window?");
        auto *SD = getScheduleData(I);
        assert(isInSchedulingRegion(*SD) &&
               "primary schedule data not in window?");

                    [](const ScheduleEntity *Bundle) {
                      return Bundle->isReady();
             "item in ready list not ready?");
    template <typename ReadyListType>
    void initialFillReadyList(ReadyListType &ReadyList) {
      SmallPtrSet<ScheduleBundle *, 16> Visited;
      for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
        ScheduleData *SD = getScheduleData(I);
        if (SD && SD->hasValidDependencies() && SD->isReady()) {
          for (ScheduleBundle *Bundle : Bundles) {
            if (!Visited.insert(Bundle).second)
            if (Bundle->hasValidDependencies() && Bundle->isReady()) {
              ReadyList.insert(Bundle);
                         << *Bundle << "\n");
          ReadyList.insert(SD);
                     << "SLP: initially in ready list: " << *SD << "\n");
                     const InstructionsState &S, const EdgeInfo &EI);

    std::optional<ScheduleBundle *>
                     const InstructionsState &S, const EdgeInfo &EI);

    ScheduleData *allocateScheduleDataChunks();

    bool extendSchedulingRegion(Value *V, const InstructionsState &S);

    void initScheduleData(Instruction *FromI, Instruction *ToI,
                          ScheduleData *PrevLoadStore,
                          ScheduleData *NextLoadStore);

    void calculateDependencies(ScheduleBundle &Bundle, bool InsertInReadyList,

    void resetSchedule();

    SmallDenseMap<Instruction *, ScheduleData *> ScheduleDataMap;

    SmallDenseMap<std::pair<EdgeInfo, const Value *>,
                  std::unique_ptr<ScheduleCopyableData>>
        ScheduleCopyableDataMap;

    SmallDenseMap<const Instruction *, SmallVector<ScheduleCopyableData *>>
        ScheduleCopyableDataMapByInst;

    SmallDenseMap<std::pair<std::pair<const Value *, unsigned>, const Value *>,
        ScheduleCopyableDataMapByInstUser;

                  SmallSetVector<ScheduleCopyableData *, 4>>
        ScheduleCopyableDataMapByUsers;

    SmallDenseMap<Instruction *, SmallVector<ScheduleBundle *>>

    SetVector<ScheduleEntity *> ReadyInsts;

    ScheduleData *FirstLoadStoreInRegion = nullptr;

    ScheduleData *LastLoadStoreInRegion = nullptr;

    bool RegionHasStackSave = false;

    int ScheduleRegionSize = 0;

    int SchedulingRegionID = 1;

  MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;

  void scheduleBlock(const BoUpSLP &R, BlockScheduling *BS);

  const SmallDenseSet<Value *> *UserIgnoreList = nullptr;
  struct OrdersTypeDenseMapInfo {
    static unsigned getHashValue(const OrdersType &V) {

  ScalarEvolution *SE;
  TargetTransformInfo *TTI;
  TargetLibraryInfo *TLI;
  AssumptionCache *AC;
  const DataLayout *DL;
  OptimizationRemarkEmitter *ORE;

  unsigned MaxVecRegSize;
  unsigned MinVecRegSize;

  IRBuilder<TargetFolder> Builder;

  DenseMap<const TreeEntry *, std::pair<uint64_t, bool>> MinBWs;

  unsigned ReductionBitWidth = 0;

  unsigned BaseGraphSize = 1;

  std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;

  DenseSet<unsigned> ExtraBitWidthNodes;
6174 SecondInfo::getEmptyKey());
6179 SecondInfo::getTombstoneKey());
6184 SecondInfo::getHashValue(Val.
EdgeIdx));
6205 ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
6216 return R.VectorizableTree[0].get();
6220 return {&
N->UserTreeIndex,
N->Container};
6224 return {&
N->UserTreeIndex + 1,
N->Container};
6251 static unsigned size(
BoUpSLP *R) {
return R->VectorizableTree.size(); }
6263 OS << Entry->Idx <<
".\n";
6266 for (
auto *V : Entry->Scalars) {
6268 if (
llvm::any_of(R->ExternalUses, [&](
const BoUpSLP::ExternalUser &EU) {
6269 return EU.Scalar == V;
6279 if (Entry->isGather())
6281 if (Entry->State == TreeEntry::ScatterVectorize ||
6282 Entry->State == TreeEntry::StridedVectorize ||
6283 Entry->State == TreeEntry::CompressVectorize)
6284 return "color=blue";
6291 for (
auto *
I : DeletedInstructions) {
6292 if (!
I->getParent()) {
6297 I->insertBefore(F->getEntryBlock(),
6298 F->getEntryBlock().getFirstNonPHIIt());
6300 I->insertBefore(F->getEntryBlock().getTerminator()->getIterator());
6303 for (
Use &U :
I->operands()) {
6305 if (
Op && !DeletedInstructions.count(
Op) &&
Op->hasOneUser() &&
6309 I->dropAllReferences();
6311 for (
auto *
I : DeletedInstructions) {
6313 "trying to erase instruction with users.");
6314 I->eraseFromParent();
#ifdef EXPENSIVE_CHECKS

  assert(!Mask.empty() && Reuses.size() == Mask.size() &&
         "Expected non-empty mask.");
  for (unsigned I = 0, E = Prev.size(); I < E; ++I)
    Reuses[Mask[I]] = Prev[I];

                         bool BottomOrder = false) {
  assert(!Mask.empty() && "Expected non-empty mask.");
  unsigned Sz = Mask.size();
  if (Order.empty()) {
    std::iota(PrevOrder.begin(), PrevOrder.end(), 0);
    PrevOrder.swap(Order);
  for (unsigned I = 0; I < Sz; ++I)
    Order[I] = PrevOrder[Mask[I]];
        return Data.value() == Sz || Data.index() == Data.value();
  if (Order.empty()) {
    std::iota(MaskOrder.begin(), MaskOrder.end(), 0);
  for (unsigned I = 0; I < Sz; ++I)
    Order[MaskOrder[I]] = I;
6388std::optional<BoUpSLP::OrdersType>
6390 bool TopToBottom,
bool IgnoreReorder) {
6391 assert(TE.isGather() &&
"Expected gather node only.");
6395 Type *ScalarTy = GatheredScalars.
front()->getType();
6396 size_t NumScalars = GatheredScalars.
size();
6398 return std::nullopt;
6405 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
6407 isGatherShuffledEntry(&TE, GatheredScalars, Mask, Entries, NumParts,
6410 if (GatherShuffles.
empty() && ExtractShuffles.
empty())
6411 return std::nullopt;
6412 OrdersType CurrentOrder(NumScalars, NumScalars);
6413 if (GatherShuffles.
size() == 1 &&
6415 Entries.
front().front()->isSame(TE.Scalars)) {
6419 return std::nullopt;
6421 if (Entries.
front().front()->UserTreeIndex.UserTE ==
6422 TE.UserTreeIndex.UserTE)
6423 return std::nullopt;
6426 if (!IgnoreReorder && Entries.
front().front()->Idx == 0)
6427 return std::nullopt;
6430 if (!Entries.
front().front()->ReuseShuffleIndices.empty() &&
6431 TE.getVectorFactor() == 2 && Mask.size() == 2 &&
6434 return P.value() % 2 != static_cast<int>(P.index()) % 2;
6436 return std::nullopt;
6440 std::iota(CurrentOrder.
begin(), CurrentOrder.
end(), 0);
6441 return CurrentOrder;
6445 return all_of(Mask, [&](
int I) {
6452 if ((ExtractShuffles.
empty() && IsSplatMask(Mask) &&
6453 (Entries.
size() != 1 ||
6454 Entries.
front().front()->ReorderIndices.empty())) ||
6455 (GatherShuffles.
empty() && IsSplatMask(ExtractMask)))
6456 return std::nullopt;
6462 if (ShuffledSubMasks.
test(
I))
6464 const int VF = GetVF(
I);
6472 ShuffledSubMasks.
set(
I);
6476 int FirstMin = INT_MAX;
6477 int SecondVecFound =
false;
6479 int Idx = Mask[
I * PartSz + K];
6481 Value *V = GatheredScalars[
I * PartSz + K];
6483 SecondVecFound =
true;
6492 SecondVecFound =
true;
6496 FirstMin = (FirstMin / PartSz) * PartSz;
6498 if (SecondVecFound) {
6500 ShuffledSubMasks.
set(
I);
6504 int Idx = Mask[
I * PartSz + K];
6508 if (Idx >= PartSz) {
6509 SecondVecFound =
true;
6512 if (CurrentOrder[
I * PartSz + Idx] >
6513 static_cast<unsigned>(
I * PartSz + K) &&
6514 CurrentOrder[
I * PartSz + Idx] !=
6515 static_cast<unsigned>(
I * PartSz + Idx))
6516 CurrentOrder[
I * PartSz + Idx] =
I * PartSz + K;
6519 if (SecondVecFound) {
6521 ShuffledSubMasks.
set(
I);
6527 if (!ExtractShuffles.
empty())
6528 TransformMaskToOrder(
6529 CurrentOrder, ExtractMask, PartSz, NumParts, [&](
unsigned I) {
6530 if (!ExtractShuffles[
I])
6533 unsigned Sz =
getNumElems(TE.getVectorFactor(), PartSz,
I);
6535 int K =
I * PartSz + Idx;
6538 if (!TE.ReuseShuffleIndices.empty())
6539 K = TE.ReuseShuffleIndices[K];
6542 if (!TE.ReorderIndices.empty())
6543 K = std::distance(TE.ReorderIndices.begin(),
6544 find(TE.ReorderIndices, K));
6550 .getKnownMinValue());
6555 if (GatherShuffles.
size() == 1 && NumParts != 1) {
6556 if (ShuffledSubMasks.
any())
6557 return std::nullopt;
6558 PartSz = NumScalars;
6561 if (!Entries.
empty())
6562 TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](
unsigned I) {
6563 if (!GatherShuffles[
I])
6565 return std::max(Entries[
I].front()->getVectorFactor(),
6566 Entries[
I].back()->getVectorFactor());
6568 unsigned NumUndefs =
count(CurrentOrder, NumScalars);
6569 if (ShuffledSubMasks.
all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2))
6570 return std::nullopt;
6571 return std::move(CurrentOrder);
                                 bool CompareOpcodes = true) {
  return (!GEP1 || GEP1->getNumOperands() == 2) &&
         (!GEP2 || GEP2->getNumOperands() == 2) &&
         (((!GEP1 || isConstant(GEP1->getOperand(1))) &&
           (!GEP2 || isConstant(GEP2->getOperand(1)))) ||
          getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)));

template <typename T>
  return CommonAlignment;

         "Order is empty. Please check it before using isReverseOrder.");
  unsigned Sz = Order.size();
    return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
6627 "Coeffs vector needs to be of correct size");
6629 const SCEV *PtrSCEVLowest =
nullptr;
6630 const SCEV *PtrSCEVHighest =
nullptr;
6633 for (
Value *Ptr : PointerOps) {
6638 if (!PtrSCEVLowest && !PtrSCEVHighest) {
6639 PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
6646 PtrSCEVLowest = PtrSCEV;
6653 PtrSCEVHighest = PtrSCEV;
6661 int Size =
DL.getTypeStoreSize(ElemTy);
6662 auto TryGetStride = [&](
const SCEV *Dist,
6663 const SCEV *Multiplier) ->
const SCEV * {
6665 if (M->getOperand(0) == Multiplier)
6666 return M->getOperand(1);
6667 if (M->getOperand(1) == Multiplier)
6668 return M->getOperand(0);
6671 if (Multiplier == Dist)
6676 const SCEV *Stride =
nullptr;
6677 if (
Size != 1 || SCEVs.
size() > 2) {
6679 Stride = TryGetStride(Dist, Sz);
6687 using DistOrdPair = std::pair<int64_t, int>;
6689 std::set<DistOrdPair,
decltype(Compare)> Offsets(Compare);
6691 bool IsConsecutive =
true;
6692 for (
const auto [Idx, PtrSCEV] :
enumerate(SCEVs)) {
6694 if (PtrSCEV != PtrSCEVLowest) {
6696 const SCEV *Coeff = TryGetStride(Diff, Stride);
6702 Coeffs[Idx] = (int64_t)SC->getAPInt().getLimitedValue();
6707 Dist = SC->getAPInt().getZExtValue();
6714 auto Res = Offsets.emplace(Dist, Cnt);
6718 IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
6721 if (Offsets.size() != SCEVs.
size())
6723 SortedIndices.
clear();
6724 if (!IsConsecutive) {
6728 for (
const std::pair<int64_t, int> &Pair : Offsets) {
6729 SortedIndices[Cnt] = Pair.second;
6736static std::pair<InstructionCost, InstructionCost>
6755 return TTI.getShuffleCost(Kind, DstTy, Tp, Mask,
CostKind, Index, SubTp,
6757 int NumSrcElts = Tp->getElementCount().getKnownMinValue();
6760 Mask, NumSrcElts, NumSubElts, Index)) {
6761 if (Index + NumSubElts > NumSrcElts &&
6762 Index + NumSrcElts <=
static_cast<int>(
Mask.size()))
6766 return TTI.getShuffleCost(Kind, DstTy, Tp, Mask,
CostKind, Index, SubTp,
6779 "ScalableVectorType is not supported.");
6782 "Incorrect usage.");
6787 unsigned ScalarTyNumElements = VecTy->getNumElements();
6790 if (!DemandedElts[
I])
6794 I * ScalarTyNumElements, VecTy);
6797 I * ScalarTyNumElements, VecTy);
6801 return TTI.getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
6810 ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) {
6811 if (Opcode == Instruction::ExtractElement) {
6817 Index * VecTy->getNumElements(), VecTy);
6820 return TTI.getVectorInstrCost(Opcode, Val,
CostKind, Index, Scalar,
6833 getWidenedType(VecTy->getElementType(), ScalarTy->getNumElements());
6835 Index * ScalarTy->getNumElements(), SubTp) +
6839 return TTI.getExtractWithExtendCost(Opcode, Dst, VecTy, Index,
CostKind);
6855 auto *Begin = std::next(
Mask.begin(), Index);
6856 std::iota(Begin, std::next(Begin, SubVecVF), 0);
6857 Vec = Builder.CreateShuffleVector(V, Mask);
6860 std::iota(
Mask.begin(),
Mask.end(), 0);
6861 std::iota(std::next(
Mask.begin(), Index),
6862 std::next(
Mask.begin(), Index + SubVecVF), VecVF);
6864 return Generator(Vec, V, Mask);
6867 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), SubVecVF), 0);
6868 V = Builder.CreateShuffleVector(V, ResizeMask);
6870 return Builder.CreateShuffleVector(Vec, V, Mask);
6875 unsigned SubVecVF,
unsigned Index) {
6877 std::iota(Mask.begin(), Mask.end(), Index);
6878 return Builder.CreateShuffleVector(Vec, Mask);
6888 const unsigned Sz = PointerOps.
size();
6891 CompressMask[0] = 0;
6893 std::optional<unsigned> Stride = 0;
6896 Value *Ptr = Order.
empty() ? PointerOps[
I] : PointerOps[Order[
I]];
6897 std::optional<int64_t> OptPos =
6899 if (!OptPos || OptPos > std::numeric_limits<unsigned>::max())
6901 unsigned Pos =
static_cast<unsigned>(*OptPos);
6902 CompressMask[
I] = Pos;
6909 if (Pos != *Stride *
I)
6912 return Stride.has_value();
6925 InterleaveFactor = 0;
6927 const size_t Sz = VL.
size();
6935 if (AreAllUsersVectorized(V))
6938 TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy,
CostKind,
6939 Mask.empty() ?
I : Mask[
I]);
6942 if (ExtractCost <= ScalarCost)
6947 if (Order.
empty()) {
6948 Ptr0 = PointerOps.
front();
6949 PtrN = PointerOps.
back();
6951 Ptr0 = PointerOps[Order.
front()];
6952 PtrN = PointerOps[Order.
back()];
6954 std::optional<int64_t> Diff =
6958 const size_t MaxRegSize =
6962 if (*Diff / Sz >= MaxRegSize / 8)
6966 Align CommonAlignment = LI->getAlign();
6968 Ptr0, LoadVecTy, CommonAlignment,
DL,
6971 if (IsMasked && !
TTI.isLegalMaskedLoad(LoadVecTy, CommonAlignment,
6972 LI->getPointerAddressSpace()))
6978 assert(CompressMask.
size() >= 2 &&
"At least two elements are required");
6982 auto [ScalarGEPCost, VectorGEPCost] =
6984 Instruction::GetElementPtr,
CostKind, ScalarTy, LoadVecTy);
7001 LoadCost =
TTI.getMemIntrinsicInstrCost(
7004 LI->getPointerAddressSpace()),
7008 TTI.getMemoryOpCost(Instruction::Load, LoadVecTy, CommonAlignment,
7009 LI->getPointerAddressSpace(),
CostKind);
7011 if (IsStrided && !IsMasked && Order.
empty()) {
7018 AlignedLoadVecTy = LoadVecTy;
7019 if (
TTI.isLegalInterleavedAccessType(AlignedLoadVecTy, CompressMask[1],
7021 LI->getPointerAddressSpace())) {
7023 VectorGEPCost +
TTI.getInterleavedMemoryOpCost(
7024 Instruction::Load, AlignedLoadVecTy,
7025 CompressMask[1], {}, CommonAlignment,
7026 LI->getPointerAddressSpace(),
CostKind, IsMasked);
7027 if (InterleavedCost < GatherCost) {
7028 InterleaveFactor = CompressMask[1];
7029 LoadVecTy = AlignedLoadVecTy;
7036 if (!Order.
empty()) {
7039 NewMask[
I] = CompressMask[Mask[
I]];
7041 CompressMask.
swap(NewMask);
7043 InstructionCost TotalVecCost = VectorGEPCost + LoadCost + CompressCost;
7044 return TotalVecCost < GatherCost;
7057 unsigned InterleaveFactor;
7061 AreAllUsersVectorized, IsMasked, InterleaveFactor,
7062 CompressMask, LoadVecTy);
                              Align Alignment, const int64_t Diff,
                              const size_t Sz) const {
  if (Diff % (Sz - 1) != 0)
  auto IsAnyPointerUsedOutGraph = any_of(PointerOps, [&](Value *V) {
      return !isVectorized(U) && !MustGather.contains(U);
  const uint64_t AbsoluteDiff = std::abs(Diff);
  if (IsAnyPointerUsedOutGraph ||
      (AbsoluteDiff > Sz &&
       AbsoluteDiff % Sz == 0 && has_single_bit(AbsoluteDiff / Sz)))) ||
      Diff == -(static_cast<int64_t>(Sz) - 1)) {
    int64_t Stride = Diff / static_cast<int64_t>(Sz - 1);
    if (Diff != Stride * static_cast<int64_t>(Sz - 1))
    if (!TTI->isLegalStridedLoadStore(VecTy, Alignment))
7112 Value *Ptr0,
Value *PtrN, StridedPtrInfo &SPtrInfo)
const {
7113 const size_t Sz = PointerOps.
size();
7121 SortedIndices.
empty() ? PointerOps[
I] : PointerOps[SortedIndices[
I]];
7122 std::optional<int64_t>
Offset =
7124 assert(
Offset &&
"sortPtrAccesses should have validated this pointer");
7125 SortedOffsetsFromBase[
I] = *
Offset;
7142 int64_t StrideWithinGroup =
7143 SortedOffsetsFromBase[1] - SortedOffsetsFromBase[0];
7146 auto IsEndOfGroupIndex = [=, &SortedOffsetsFromBase](
unsigned Idx) {
7147 return SortedOffsetsFromBase[Idx] - SortedOffsetsFromBase[Idx - 1] !=
7152 unsigned GroupSize = FoundIt != Indices.end() ? *FoundIt : Sz;
7154 unsigned VecSz = Sz;
7155 Type *NewScalarTy = ScalarTy;
7159 bool NeedsWidening = Sz != GroupSize;
7160 if (NeedsWidening) {
7161 if (Sz % GroupSize != 0)
7164 if (StrideWithinGroup != 1)
7166 VecSz = Sz / GroupSize;
7169 DL->getTypeSizeInBits(ScalarTy).getFixedValue() * GroupSize);
7172 if (!
isStridedLoad(PointerOps, NewScalarTy, Alignment, Diff, VecSz))
7175 int64_t StrideIntVal = StrideWithinGroup;
7176 if (NeedsWidening) {
7179 unsigned CurrentGroupStartIdx = GroupSize;
7180 int64_t StrideBetweenGroups =
7181 SortedOffsetsFromBase[GroupSize] - SortedOffsetsFromBase[0];
7182 StrideIntVal = StrideBetweenGroups;
7183 for (; CurrentGroupStartIdx < Sz; CurrentGroupStartIdx += GroupSize) {
7184 if (SortedOffsetsFromBase[CurrentGroupStartIdx] -
7185 SortedOffsetsFromBase[CurrentGroupStartIdx - GroupSize] !=
7186 StrideBetweenGroups)
7190 auto CheckGroup = [=](
const unsigned StartIdx) ->
bool {
7193 unsigned GroupEndIdx = FoundIt != Indices.end() ? *FoundIt : Sz;
7194 return GroupEndIdx - StartIdx == GroupSize;
7196 for (
unsigned I = 0;
I < Sz;
I += GroupSize) {
7202 Type *StrideTy = DL->getIndexType(Ptr0->
getType());
7211 StridedPtrInfo &SPtrInfo)
const {
7217 OffsetToPointerOpIdxMap;
7218 for (
auto [Idx, Ptr] :
enumerate(PointerOps)) {
7219 const SCEV *PtrSCEV = SE->getSCEV(Ptr);
7231 Offset = SC->getAPInt().getSExtValue();
7235 OffsetToPointerOpIdxMap[
Offset].first.push_back(Ptr);
7236 OffsetToPointerOpIdxMap[
Offset].second.push_back(Idx);
7238 unsigned NumOffsets = OffsetToPointerOpIdxMap.
size();
7242 const unsigned Sz = PointerOps.
size();
7243 unsigned VecSz = Sz;
7244 Type *NewScalarTy = ScalarTy;
7245 if (NumOffsets > 1) {
7246 if (Sz % NumOffsets != 0)
7248 VecSz = Sz / NumOffsets;
7251 DL->getTypeSizeInBits(ScalarTy).getFixedValue() * NumOffsets);
7254 if (Sz <= MinProfitableStridedLoads || !TTI->isTypeLegal(StridedLoadTy) ||
7255 !TTI->isLegalStridedLoadStore(StridedLoadTy, CommonAlignment))
7261 for (
auto [Idx, MapPair] :
enumerate(OffsetToPointerOpIdxMap)) {
7262 if (MapPair.second.first.size() != VecSz)
7264 SortedOffsetsV[Idx] = MapPair.first;
7266 sort(SortedOffsetsV);
7268 if (NumOffsets > 1) {
7270 if (SortedOffsetsV[
I] - SortedOffsetsV[
I - 1] != 1)
7343 auto UpdateSortedIndices =
7346 if (SortedIndicesForOffset.
empty()) {
7347 SortedIndicesForOffset.
resize(IndicesInAllPointerOps.
size());
7348 std::iota(SortedIndicesForOffset.
begin(),
7349 SortedIndicesForOffset.
end(), 0);
7351 for (
const auto [Num, Idx] :
enumerate(SortedIndicesForOffset)) {
7352 SortedIndicesDraft[Num * NumOffsets + OffsetNum] =
7353 IndicesInAllPointerOps[Idx];
7357 int64_t LowestOffset = SortedOffsetsV[0];
7363 SortedIndicesForOffset0, Coeffs0);
7366 unsigned NumCoeffs0 = Coeffs0.
size();
7367 if (NumCoeffs0 * NumOffsets != Sz)
7372 OffsetToPointerOpIdxMap[LowestOffset].second;
7373 UpdateSortedIndices(SortedIndicesForOffset0, IndicesInAllPointerOps0, 0);
7379 for (
int J :
seq<int>(1, NumOffsets)) {
7382 SortedIndicesForOffset.
clear();
7384 int64_t
Offset = SortedOffsetsV[J];
7386 OffsetToPointerOpIdxMap[
Offset].first;
7388 OffsetToPointerOpIdxMap[
Offset].second;
7389 const SCEV *StrideWithinGroup =
7391 SortedIndicesForOffset, Coeffs);
7393 if (!StrideWithinGroup || StrideWithinGroup != Stride0)
7395 if (Coeffs.
size() != NumCoeffs0)
7398 if (Coeffs != Coeffs0)
7401 UpdateSortedIndices(SortedIndicesForOffset, IndicesInAllPointerOps, J);
7404 SortedIndices.
clear();
7405 SortedIndices = std::move(SortedIndicesDraft);
7406 SPtrInfo.StrideSCEV = Stride0;
7407 SPtrInfo.Ty = StridedLoadTy;
7414 unsigned *BestVF,
bool TryRecursiveCheck)
const {
7427 if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy))
7433 const size_t Sz = VL.
size();
7435 auto *POIter = PointerOps.
begin();
7436 for (
Value *V : VL) {
7438 if (!L || !L->isSimple())
7440 *POIter = L->getPointerOperand();
7446 bool IsSorted =
sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, Order);
7455 if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
7456 TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
7467 if (Order.
empty()) {
7468 Ptr0 = PointerOps.
front();
7469 PtrN = PointerOps.
back();
7471 Ptr0 = PointerOps[Order.
front()];
7472 PtrN = PointerOps[Order.
back()];
7477 std::optional<int64_t> Diff0 =
7479 std::optional<int64_t> DiffN =
7482 "sortPtrAccesses should have validated these pointers");
7483 int64_t Diff = *DiffN - *Diff0;
7485 if (
static_cast<uint64_t>(Diff) == Sz - 1)
7488 *TLI, [&](
Value *V) {
7489 return areAllUsersVectorized(
7497 Diff, Ptr0, PtrN, SPtrInfo))
7500 if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
7501 TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
7506 auto CheckForShuffledLoads = [&, &TTI = *TTI](
Align CommonAlignment,
7508 bool ProfitableGatherPointers) {
7513 auto [ScalarGEPCost, VectorGEPCost] =
7515 Instruction::GetElementPtr,
CostKind, ScalarTy, VecTy);
7519 Type *PtrScalarTy = PointerOps.
front()->getType()->getScalarType();
7521 if (
static_cast<unsigned>(
count_if(
7540 return C + TTI.getInstructionCost(
7546 TTI.getMemIntrinsicInstrCost(
7549 false, CommonAlignment),
7551 (ProfitableGatherPointers ? 0 : VectorGEPCost);
7559 constexpr unsigned ListLimit = 4;
7560 if (!TryRecursiveCheck || VL.
size() < ListLimit)
7569 unsigned Sz = DL->getTypeSizeInBits(ScalarTy);
7579 for (
unsigned Cnt = 0, End = VL.
size(); Cnt + VF <= End; Cnt += VF) {
7584 PointerOps, SPtrInfo, BestVF,
7592 DemandedElts.
setBits(Cnt, Cnt + VF);
7608 if (!DemandedElts.
isZero()) {
7614 if (DemandedElts[Idx])
7625 LI0->getPointerOperand(),
7626 Instruction::GetElementPtr,
CostKind, ScalarTy,
7630 if (
static_cast<unsigned>(
7632 PointerOps.
size() - 1 ||
7651 TTI.getMemoryOpCost(Instruction::Load, SubVecTy, LI0->getAlign(),
7652 LI0->getPointerAddressSpace(),
CostKind,
7657 VecLdCost += TTI.getMemIntrinsicInstrCost(
7659 Intrinsic::experimental_vp_strided_load,
7660 SubVecTy, LI0->getPointerOperand(),
7661 false, CommonAlignment),
7666 VecLdCost += TTI.getMemIntrinsicInstrCost(
7668 Intrinsic::masked_load, SubVecTy,
7669 CommonAlignment, LI0->getPointerAddressSpace()),
7675 VecLdCost += TTI.getMemIntrinsicInstrCost(
7677 Intrinsic::masked_gather, SubVecTy,
7678 LI0->getPointerOperand(),
7679 false, CommonAlignment),
7689 ShuffleMask[Idx] = Idx / VF ==
I ? VL.
size() + Idx % VF : Idx;
7698 if (MaskedGatherCost >= VecLdCost &&
7711 bool ProfitableGatherPointers =
7712 L && Sz > 2 &&
static_cast<unsigned>(
count_if(PointerOps, [L](
Value *V) {
7713 return L->isLoopInvariant(V);
7715 if (ProfitableGatherPointers ||
all_of(PointerOps, [](
Value *
P) {
7718 (
GEP &&
GEP->getNumOperands() == 2 &&
7726 if (!TryRecursiveCheck || !CheckForShuffledLoads(CommonAlignment, BestVF,
7727 ProfitableGatherPointers))
7739 all_of(VL, [](
const Value *V) {
return V->getType()->isPointerTy(); }) &&
7740 "Expected list of pointer operands.");
7745 std::pair<BasicBlock *, Value *>,
7749 .try_emplace(std::make_pair(
7753 SortedIndices.
clear();
7755 auto Key = std::make_pair(BBs[Cnt + 1],
7757 bool Found =
any_of(Bases.try_emplace(
Key).first->second,
7758 [&, &Cnt = Cnt, &Ptr = Ptr](
auto &
Base) {
7759 std::optional<int64_t> Diff =
7760 getPointersDiff(ElemTy, std::get<0>(Base.front()),
7761 ElemTy, Ptr, DL, SE,
7766 Base.emplace_back(Ptr, *Diff, Cnt + 1);
7772 if (Bases.size() > VL.
size() / 2 - 1)
7776 Bases.find(
Key)->second.emplace_back().emplace_back(Ptr, 0, Cnt + 1);
7780 if (Bases.size() == VL.
size())
7783 if (Bases.size() == 1 && (Bases.front().second.size() == 1 ||
7784 Bases.front().second.size() == VL.
size()))
7789 auto ComparePointers = [](
Value *Ptr1,
Value *Ptr2) {
7798 FirstPointers.
insert(P1);
7799 SecondPointers.
insert(P2);
7805 "Unable to find matching root.");
7808 for (
auto &
Base : Bases) {
7809 for (
auto &Vec :
Base.second) {
7810 if (Vec.size() > 1) {
7812 int64_t InitialOffset = std::get<1>(Vec[0]);
7813 bool AnyConsecutive =
7815 return std::get<1>(
P.value()) ==
7816 int64_t(
P.index()) + InitialOffset;
7820 if (!AnyConsecutive)
7825 return ComparePointers(std::get<0>(V1.front()), std::get<0>(V2.front()));
7829 for (
auto &
T : Bases)
7830 for (
const auto &Vec :
T.second)
7831 for (
const auto &
P : Vec)
7835 "Expected SortedIndices to be the size of VL");
std::optional<BoUpSLP::OrdersType>
  assert(TE.isGather() && "Expected gather node only.");
  Type *ScalarTy = TE.Scalars[0]->getType();
  Ptrs.reserve(TE.Scalars.size());
  BBs.reserve(TE.Scalars.size());
  for (Value *V : TE.Scalars) {
    if (!L || !L->isSimple())
      return std::nullopt;
  if (!LoadEntriesToVectorize.contains(TE.Idx) &&
    return std::move(Order);
  return std::nullopt;
7871 if (VU->
getType() != V->getType())
7874 if (!VU->
hasOneUse() && !V->hasOneUse())
7880 if (Idx1 == std::nullopt || Idx2 == std::nullopt)
7887 bool IsReusedIdx =
false;
7889 if (IE2 == VU && !IE1)
7891 if (IE1 == V && !IE2)
7892 return V->hasOneUse();
7893 if (IE1 && IE1 != V) {
7895 IsReusedIdx |= ReusedIdx.
test(Idx1);
7896 ReusedIdx.
set(Idx1);
7897 if ((IE1 != VU && !IE1->
hasOneUse()) || IsReusedIdx)
7902 if (IE2 && IE2 != VU) {
7904 IsReusedIdx |= ReusedIdx.
test(Idx2);
7905 ReusedIdx.
set(Idx2);
7906 if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
7911 }
while (!IsReusedIdx && (IE1 || IE2));
7921std::optional<BoUpSLP::OrdersType>
7923 bool IgnoreReorder) {
7926 if (!TE.ReuseShuffleIndices.empty()) {
7928 assert(!TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI) &&
7929 "Reshuffling scalars not yet supported for nodes with padding");
7932 return std::nullopt;
7940 unsigned Sz = TE.Scalars.size();
7941 if (TE.isGather()) {
7942 if (std::optional<OrdersType> CurrentOrder =
7947 ::addMask(Mask, TE.ReuseShuffleIndices);
7948 OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor());
7949 unsigned Sz = TE.Scalars.size();
7950 for (
int K = 0, E = TE.getVectorFactor() / Sz; K < E; ++K) {
7953 Res[Idx + K * Sz] =
I + K * Sz;
7955 return std::move(Res);
7958 if (Sz == 2 && TE.getVectorFactor() == 4 &&
7960 2 * TE.getVectorFactor())) == 1)
7961 return std::nullopt;
7962 if (TE.ReuseShuffleIndices.size() % Sz != 0)
7963 return std::nullopt;
7967 if (TE.ReorderIndices.empty())
7968 std::iota(ReorderMask.
begin(), ReorderMask.
end(), 0);
7971 ::addMask(ReorderMask, TE.ReuseShuffleIndices);
7972 unsigned VF = ReorderMask.
size();
7976 for (
unsigned I = 0;
I < VF;
I += Sz) {
7978 unsigned UndefCnt = 0;
7979 unsigned Limit = std::min(Sz, VF -
I);
7988 Val >=
static_cast<int>(NumParts) || UsedVals.
test(Val) ||
7990 return std::nullopt;
7992 for (
unsigned K = 0; K < NumParts; ++K) {
7993 unsigned Idx = Val + Sz * K;
7994 if (Idx < VF &&
I + K < VF)
7995 ResOrder[Idx] =
I + K;
7998 return std::move(ResOrder);
8000 unsigned VF = TE.getVectorFactor();
8003 TE.ReuseShuffleIndices.end());
8004 if (TE.hasState() && TE.getOpcode() == Instruction::ExtractElement &&
8006 if (isa<PoisonValue>(V))
8008 std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V));
8009 return Idx && *Idx < Sz;
8011 assert(!TE.isAltShuffle() &&
"Alternate instructions are only supported "
8012 "by BinaryOperator and CastInst.");
8014 if (TE.ReorderIndices.empty())
8015 std::iota(ReorderMask.
begin(), ReorderMask.
end(), 0);
8018 for (
unsigned I = 0;
I < VF; ++
I) {
8019 int &Idx = ReusedMask[
I];
8022 Value *V = TE.Scalars[ReorderMask[Idx]];
8024 Idx = std::distance(ReorderMask.
begin(),
find(ReorderMask, *EI));
8030 std::iota(ResOrder.
begin(), ResOrder.
end(), 0);
8031 auto *It = ResOrder.
begin();
8032 for (
unsigned K = 0; K < VF; K += Sz) {
8036 std::iota(SubMask.
begin(), SubMask.
end(), 0);
8038 transform(CurrentOrder, It, [K](
unsigned Pos) {
return Pos + K; });
8039 std::advance(It, Sz);
8042 return Data.index() ==
Data.value();
8044 return std::nullopt;
8045 return std::move(ResOrder);
8047 if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
8048 (!TE.UserTreeIndex || !TE.UserTreeIndex.UserTE->hasState() ||
8050 (TE.ReorderIndices.empty() ||
isReverseOrder(TE.ReorderIndices)))
8051 return std::nullopt;
8052 if (TE.State == TreeEntry::SplitVectorize ||
8053 ((TE.State == TreeEntry::Vectorize ||
8054 TE.State == TreeEntry::StridedVectorize ||
8055 TE.State == TreeEntry::CompressVectorize) &&
8058 assert((TE.State == TreeEntry::SplitVectorize || !TE.isAltShuffle()) &&
8059 "Alternate instructions are only supported by "
8060 "BinaryOperator and CastInst.");
8061 return TE.ReorderIndices;
8063 if (!TopToBottom && IgnoreReorder && TE.State == TreeEntry::Vectorize &&
8064 TE.isAltShuffle()) {
8065 assert(TE.ReuseShuffleIndices.empty() &&
8066 "ReuseShuffleIndices should be "
8067 "empty for alternate instructions.");
8069 TE.buildAltOpShuffleMask(
8071 assert(TE.getMatchingMainOpOrAltOp(
I) &&
8072 "Unexpected main/alternate opcode");
8076 const int VF = TE.getVectorFactor();
8081 ResOrder[Mask[
I] % VF] =
I;
8083 return std::move(ResOrder);
8085 if (!TE.ReorderIndices.empty())
8086 return TE.ReorderIndices;
8087 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
8088 if (!TE.ReorderIndices.empty())
8089 return TE.ReorderIndices;
8092 for (
auto [
I, V] :
zip(UserBVHead, TE.Scalars)) {
8100 while (
II &&
II->hasOneUse() &&
II->getParent() == BB) {
8108 assert(BB1 != BB2 &&
"Expected different basic blocks.");
8109 if (!DT->isReachableFromEntry(BB1))
8111 if (!DT->isReachableFromEntry(BB2))
8113 auto *NodeA = DT->getNode(BB1);
8114 auto *NodeB = DT->getNode(BB2);
8115 assert(NodeA &&
"Should only process reachable instructions");
8116 assert(NodeB &&
"Should only process reachable instructions");
8117 assert((NodeA == NodeB) ==
8118 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
8119 "Different nodes should have different DFS numbers");
8120 return NodeA->getDFSNumIn() < NodeB->getDFSNumIn();
8122 auto PHICompare = [&](
unsigned I1,
unsigned I2) {
8123 Value *V1 = TE.Scalars[I1];
8124 Value *V2 = TE.Scalars[I2];
8137 if (FirstUserOfPhi1->getParent() != FirstUserOfPhi2->getParent())
8138 return CompareByBasicBlocks(FirstUserOfPhi1->getParent(),
8139 FirstUserOfPhi2->getParent());
8149 if (UserBVHead[I1] && !UserBVHead[I2])
8151 if (!UserBVHead[I1])
8153 if (UserBVHead[I1] == UserBVHead[I2])
8156 return CompareByBasicBlocks(UserBVHead[I1]->
getParent(),
8158 return UserBVHead[I1]->comesBefore(UserBVHead[I2]);
8171 if (EE1->getOperand(0) == EE2->getOperand(0))
8173 if (!Inst1 && Inst2)
8175 if (Inst1 && Inst2) {
8183 "Expected either instructions or arguments vector operands.");
8184 return P1->getArgNo() < P2->getArgNo();
8189 std::iota(Phis.
begin(), Phis.
end(), 0);
8192 return std::nullopt;
8193 return std::move(Phis);
8195 if (TE.isGather() &&
8196 (!TE.hasState() || !TE.isAltShuffle() ||
8197 ScalarsInSplitNodes.contains(TE.getMainOp())) &&
8201 if (((TE.hasState() && TE.getOpcode() == Instruction::ExtractElement) ||
8205 auto *EE = dyn_cast<ExtractElementInst>(V);
8206 return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
8212 canReuseExtract(TE.Scalars, CurrentOrder,
true);
8213 if (Reuse || !CurrentOrder.
empty())
8214 return std::move(CurrentOrder);
8222 int Sz = TE.Scalars.size();
8226 if (It == TE.Scalars.begin())
8229 if (It != TE.Scalars.end()) {
8231 unsigned Idx = std::distance(TE.Scalars.begin(), It);
8246 if (InsertFirstCost + PermuteCost < InsertIdxCost) {
8249 return std::move(Order);
8254 return std::nullopt;
8255 if (TE.Scalars.size() >= 3)
8260 if (TE.hasState() && TE.getOpcode() == Instruction::Load) {
8262 StridedPtrInfo SPtrInfo;
8265 CurrentOrder, PointerOps, SPtrInfo);
8268 return std::move(CurrentOrder);
8273 if (std::optional<OrdersType> CurrentOrder =
8275 return CurrentOrder;
8277 return std::nullopt;
8287 for (
unsigned I = Sz,
E = Mask.size();
I <
E;
I += Sz) {
8289 if (Cluster != FirstCluster)
8295void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE,
ArrayRef<int> Mask)
const {
8298 const unsigned Sz =
TE.Scalars.size();
8300 if (!
TE.isGather() ||
8307 addMask(NewMask,
TE.ReuseShuffleIndices);
8309 TE.ReorderIndices.clear();
8316 for (
auto *It =
TE.ReuseShuffleIndices.begin(),
8317 *End =
TE.ReuseShuffleIndices.end();
8318 It != End; std::advance(It, Sz))
8319 std::iota(It, std::next(It, Sz), 0);
8325 "Expected same size of orders");
8326 size_t Sz = Order.
size();
8329 if (Order[Idx] != Sz)
8330 UsedIndices.
set(Order[Idx]);
8332 if (SecondaryOrder.
empty()) {
8334 if (Order[Idx] == Sz && !UsedIndices.
test(Idx))
8338 if (SecondaryOrder[Idx] != Sz && Order[Idx] == Sz &&
8339 !UsedIndices.
test(SecondaryOrder[Idx]))
8340 Order[Idx] = SecondaryOrder[Idx];
8348 constexpr unsigned TinyVF = 2;
8349 constexpr unsigned TinyTree = 10;
8350 constexpr unsigned PhiOpsLimit = 12;
8351 constexpr unsigned GatherLoadsLimit = 2;
8352 if (VectorizableTree.size() <= TinyTree)
8354 if (VectorizableTree.front()->hasState() &&
8355 !VectorizableTree.front()->isGather() &&
8356 (VectorizableTree.front()->getOpcode() == Instruction::Store ||
8357 VectorizableTree.front()->getOpcode() == Instruction::PHI ||
8358 (VectorizableTree.front()->getVectorFactor() <= TinyVF &&
8359 (VectorizableTree.front()->getOpcode() == Instruction::PtrToInt ||
8360 VectorizableTree.front()->getOpcode() == Instruction::ICmp))) &&
8361 VectorizableTree.front()->ReorderIndices.empty()) {
8365 if (VectorizableTree.front()->hasState() &&
8366 VectorizableTree.front()->getOpcode() == Instruction::PHI &&
8367 VectorizableTree.front()->Scalars.size() == TinyVF &&
8368 VectorizableTree.front()->getNumOperands() > PhiOpsLimit)
8371 if (VectorizableTree.front()->hasState() &&
8372 VectorizableTree.front()->getOpcode() == Instruction::Store &&
8373 VectorizableTree.front()->ReorderIndices.empty()) {
    const unsigned ReorderedSplitsCnt =
        count_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
          return TE->State == TreeEntry::SplitVectorize &&
                 !TE->ReorderIndices.empty() && TE->UserTreeIndex.UserTE &&
                 TE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
    if (ReorderedSplitsCnt <= 1 &&
            VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
              return ((!TE->isGather() &&
                       (TE->ReorderIndices.empty() ||
                        (TE->UserTreeIndex.UserTE &&
                         TE->UserTreeIndex.UserTE->State ==
                             TreeEntry::Vectorize &&
                         !TE->UserTreeIndex.UserTE->ReuseShuffleIndices
                      (TE->isGather() && TE->ReorderIndices.empty() &&
                       (!TE->hasState() || TE->isAltShuffle() ||
                        TE->getOpcode() == Instruction::Load ||
                        TE->getOpcode() == Instruction::ZExt ||
                        TE->getOpcode() == Instruction::SExt))) &&
                     (VectorizableTree.front()->getVectorFactor() > TinyVF ||
                      !TE->isGather() ||
                      none_of(TE->Scalars, [&](Value *V) {
                        return !isConstant(V) && isVectorized(V);
            })) >= VectorizableTree.size() - ReorderedSplitsCnt)
  bool HasPhis = false;
  bool HasLoad = true;
  unsigned GatherLoads = 0;
  for (const std::unique_ptr<TreeEntry> &TE :
       ArrayRef(VectorizableTree).drop_front()) {
8408 if (TE->State == TreeEntry::SplitVectorize)
8410 if (!TE->hasState()) {
8414 if (VectorizableTree.front()->Scalars.size() == TinyVF &&
8419 if (TE->getOpcode() == Instruction::Load && TE->ReorderIndices.empty()) {
8420 if (!TE->isGather()) {
8427 if (GatherLoads >= GatherLoadsLimit)
8430 if (TE->getOpcode() == Instruction::GetElementPtr ||
8433 if (TE->getOpcode() != Instruction::PHI &&
8434 (!TE->hasCopyableElements() ||
8436 TE->Scalars.size() / 2))
8438 if (VectorizableTree.front()->Scalars.size() == TinyVF &&
8439 TE->getNumOperands() > PhiOpsLimit)
void BoUpSLP::TreeEntry::reorderSplitNode(unsigned Idx, ArrayRef<int> Mask,
  assert(State == TreeEntry::SplitVectorize && "Expected split user node.");
  std::iota(NewMask.begin(), NewMask.end(), 0);
  std::iota(NewMaskOrder.begin(), NewMaskOrder.end(), 0);
  copy(MaskOrder, NewMaskOrder.begin());
    assert(Idx == 1 && "Expected either 0 or 1 index.");
    unsigned Offset = CombinedEntriesWithIndices.back().second;
  ReorderIndices.clear();
8488 ExternalUserReorderMap;
8492 for_each(VectorizableTree, [&, &TTIRef = *TTI](
8493 const std::unique_ptr<TreeEntry> &TE) {
8496 findExternalStoreUsersReorderIndices(TE.get());
8497 if (!ExternalUserReorderIndices.
empty()) {
8498 VFToOrderedEntries[TE->getVectorFactor()].
insert(TE.get());
8500 std::move(ExternalUserReorderIndices));
8506 if (TE->hasState() && TE->isAltShuffle() &&
8507 TE->State != TreeEntry::SplitVectorize) {
8508 Type *ScalarTy = TE->Scalars[0]->getType();
8510 unsigned Opcode0 = TE->getOpcode();
8511 unsigned Opcode1 = TE->getAltOpcode();
8515 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
8516 VFToOrderedEntries[TE->getVectorFactor()].
insert(TE.get());
8522 bool IgnoreReorder =
8523 !UserIgnoreList && VectorizableTree.front()->hasState() &&
8524 (VectorizableTree.front()->
getOpcode() == Instruction::InsertElement ||
8525 VectorizableTree.front()->getOpcode() == Instruction::Store);
8526 if (std::optional<OrdersType> CurrentOrder =
8536 const TreeEntry *UserTE = TE.get();
8538 if (!UserTE->UserTreeIndex)
8540 if (UserTE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
8541 UserTE->UserTreeIndex.UserTE->isAltShuffle() &&
8542 UserTE->UserTreeIndex.UserTE->Idx != 0)
8544 UserTE = UserTE->UserTreeIndex.UserTE;
8547 VFToOrderedEntries[TE->getVectorFactor()].
insert(TE.get());
8548 if (!(TE->State == TreeEntry::Vectorize ||
8549 TE->State == TreeEntry::StridedVectorize ||
8550 TE->State == TreeEntry::SplitVectorize ||
8551 TE->State == TreeEntry::CompressVectorize) ||
8552 !TE->ReuseShuffleIndices.empty())
8553 GathersToOrders.
try_emplace(TE.get(), *CurrentOrder);
8554 if (TE->State == TreeEntry::Vectorize &&
8555 TE->getOpcode() == Instruction::PHI)
8556 PhisToOrders.
try_emplace(TE.get(), *CurrentOrder);
8561 for (
unsigned VF = VectorizableTree.front()->getVectorFactor();
8562 !VFToOrderedEntries.
empty() && VF > 1; VF -= 2 - (VF & 1U)) {
8563 auto It = VFToOrderedEntries.
find(VF);
8564 if (It == VFToOrderedEntries.
end())
8578 for (
const TreeEntry *OpTE : OrderedEntries) {
8581 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.
count(OpTE) &&
8582 OpTE->State != TreeEntry::SplitVectorize)
8585 const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
8587 if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty()) {
8588 auto It = GathersToOrders.find(OpTE);
8589 if (It != GathersToOrders.end())
8592 if (OpTE->hasState() && OpTE->isAltShuffle()) {
8593 auto It = AltShufflesToOrders.find(OpTE);
8594 if (It != AltShufflesToOrders.end())
8597 if (OpTE->State == TreeEntry::Vectorize &&
8598 OpTE->getOpcode() == Instruction::PHI) {
8599 auto It = PhisToOrders.
find(OpTE);
8600 if (It != PhisToOrders.
end())
8603 return OpTE->ReorderIndices;
8606 auto It = ExternalUserReorderMap.
find(OpTE);
8607 if (It != ExternalUserReorderMap.
end()) {
8608 const auto &ExternalUserReorderIndices = It->second;
8612 if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
8613 OrdersUses.try_emplace(
OrdersType(), 0).first->second +=
8614 ExternalUserReorderIndices.size();
8616 for (
const OrdersType &ExtOrder : ExternalUserReorderIndices)
8617 ++OrdersUses.try_emplace(ExtOrder, 0).first->second;
8624 if (OpTE->State == TreeEntry::Vectorize &&
8625 OpTE->getOpcode() == Instruction::Store && !Order.
empty()) {
8626 assert(!OpTE->isAltShuffle() &&
8627 "Alternate instructions are only supported by BinaryOperator "
8631 unsigned E = Order.
size();
8634 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
8637 ++OrdersUses.try_emplace(CurrentOrder, 0).first->second;
8639 ++OrdersUses.try_emplace(Order, 0).first->second;
    if (OrdersUses.empty())
    unsigned IdentityCnt = 0;
    unsigned FilledIdentityCnt = 0;
    for (auto &Pair : OrdersUses) {
      if (!Pair.first.empty())
        FilledIdentityCnt += Pair.second;
      IdentityCnt += Pair.second;
    unsigned Cnt = IdentityCnt;
    for (auto &Pair : OrdersUses) {
      if (Cnt < Pair.second ||
          (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt &&
           Cnt == Pair.second && !BestOrder.empty() &&
        BestOrder = Pair.first;
    unsigned E = BestOrder.size();
      return I < E ? static_cast<int>(I) : PoisonMaskElem;
8685 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
8687 if (TE->Scalars.size() != VF) {
8688 if (TE->ReuseShuffleIndices.size() == VF) {
8689 assert(TE->State != TreeEntry::SplitVectorize &&
8690 "Split vectorized not expected.");
8695 (!TE->UserTreeIndex ||
8696 TE->UserTreeIndex.UserTE->Scalars.size() == VF ||
8697 TE->UserTreeIndex.UserTE->Scalars.size() == TE->Scalars.size() ||
8698 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize) &&
8699 "All users must be of VF size.");
8706 if (TE->UserTreeIndex && TE->UserTreeIndex.UserTE->hasState() &&
8712 reorderNodeWithReuses(*TE, Mask);
8714 if (TE->UserTreeIndex &&
8715 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize)
8716 TE->UserTreeIndex.UserTE->reorderSplitNode(
8717 TE->UserTreeIndex.EdgeIdx, Mask, MaskOrder);
8721 if ((TE->State == TreeEntry::SplitVectorize &&
8722 TE->ReuseShuffleIndices.empty()) ||
8723 ((TE->State == TreeEntry::Vectorize ||
8724 TE->State == TreeEntry::StridedVectorize ||
8725 TE->State == TreeEntry::CompressVectorize) &&
8730 (!TE->isAltShuffle() || (TE->State == TreeEntry::SplitVectorize &&
8731 TE->ReuseShuffleIndices.empty())) &&
8732 "Alternate instructions are only supported by BinaryOperator "
8738 TE->reorderOperands(Mask);
8741 TE->reorderOperands(Mask);
8742 assert(TE->ReorderIndices.empty() &&
8743 "Expected empty reorder sequence.");
    if (!TE->ReuseShuffleIndices.empty()) {
      addMask(NewReuses, TE->ReuseShuffleIndices);
      TE->ReuseShuffleIndices.swap(NewReuses);
    } else if (TE->UserTreeIndex &&
               TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize)
      TE->UserTreeIndex.UserTE->reorderSplitNode(TE->UserTreeIndex.EdgeIdx,
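// Illustrative sketch, not part of the original source: the OrdersUses counting
// above boils down to "vote for every candidate order, prefer the identity
// order on ties, and pick the most popular one". A simplified standalone
// version, where an order is a std::vector<unsigned> and the empty vector
// stands for the identity order, might look like:
#include <map>
#include <vector>

using SimpleOrder = std::vector<unsigned>;

static SimpleOrder
pickMostPopularOrder(const std::map<SimpleOrder, unsigned> &OrdersUses) {
  SimpleOrder Best;     // empty == identity, the initial best choice
  unsigned BestCnt = 0;
  auto IdentIt = OrdersUses.find(SimpleOrder());
  if (IdentIt != OrdersUses.end())
    BestCnt = IdentIt->second;
  for (const auto &Pair : OrdersUses)
    if (Pair.second > BestCnt) { // only a strictly higher count wins
      Best = Pair.first;
      BestCnt = Pair.second;
    }
  return Best; // empty result means "keep the current lane order"
}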
void BoUpSLP::buildReorderableOperands(
    TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
    if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) {
          return OpData.first == I &&
                 (OpData.second->State == TreeEntry::Vectorize ||
                  OpData.second->State == TreeEntry::StridedVectorize ||
                  OpData.second->State == TreeEntry::CompressVectorize ||
                  OpData.second->State == TreeEntry::SplitVectorize);
    if (UserTE->hasState()) {
      if (UserTE->getOpcode() == Instruction::ExtractElement ||
          UserTE->getOpcode() == Instruction::ExtractValue)
      if (UserTE->getOpcode() == Instruction::InsertElement && I == 0)
      if (UserTE->getOpcode() == Instruction::Store &&
          UserTE->State == TreeEntry::Vectorize && I == 1)
      if (UserTE->getOpcode() == Instruction::Load &&
          (UserTE->State == TreeEntry::Vectorize ||
           UserTE->State == TreeEntry::StridedVectorize ||
           UserTE->State == TreeEntry::CompressVectorize))
    TreeEntry *TE = getOperandEntry(UserTE, I);
    assert(TE && "Expected operand entry.");
    if (!TE->isGather()) {
      Edges.emplace_back(I, TE);
    if (TE->State == TreeEntry::ScatterVectorize &&
        TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
    if (ReorderableGathers.contains(TE))
  struct TreeEntryCompare {
    bool operator()(const TreeEntry *LHS, const TreeEntry *RHS) const {
      if (LHS->UserTreeIndex && RHS->UserTreeIndex)
        return LHS->UserTreeIndex.UserTE->Idx < RHS->UserTreeIndex.UserTE->Idx;
      return LHS->Idx < RHS->Idx;
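// Illustrative sketch, not part of the original source: a comparator like the
// one above is typically plugged into a std::priority_queue (note the
// Queue.push()/Queue.top() calls below) so entries are visited in a
// deterministic, index-based order. A cut-down standalone model, with Node
// standing in for TreeEntry and only an Idx field, could be:
#include <queue>
#include <vector>

struct Node {
  unsigned Idx = 0;
};

struct NodeCompare {
  bool operator()(const Node *LHS, const Node *RHS) const {
    return LHS->Idx < RHS->Idx; // "less" puts the largest Idx at the top
  }
};

static const Node *topOfQueue(const std::vector<Node *> &Nodes) {
  std::priority_queue<const Node *, std::vector<const Node *>, NodeCompare> Q;
  for (const Node *N : Nodes)
    Q.push(N);
  return Q.empty() ? nullptr : Q.top(); // the node with the highest Idx
}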
8828 for (
const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
8829 if (TE->State != TreeEntry::Vectorize &&
8830 TE->State != TreeEntry::StridedVectorize &&
8831 TE->State != TreeEntry::CompressVectorize &&
8832 TE->State != TreeEntry::SplitVectorize)
8833 NonVectorized.
insert(TE.get());
8834 if (std::optional<OrdersType> CurrentOrder =
8836 Queue.push(TE.get());
8837 if (!(TE->State == TreeEntry::Vectorize ||
8838 TE->State == TreeEntry::StridedVectorize ||
8839 TE->State == TreeEntry::CompressVectorize ||
8840 TE->State == TreeEntry::SplitVectorize) ||
8841 !TE->ReuseShuffleIndices.empty())
8842 GathersToOrders.
insert(TE.get());
8851 while (!Queue.empty()) {
8853 std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>>
Users;
8854 TreeEntry *TE = Queue.top();
8855 const TreeEntry *UserTE = TE->UserTreeIndex.UserTE;
8858 while (!Queue.empty()) {
8860 if (!UserTE || UserTE != TE->UserTreeIndex.UserTE)
8865 for (TreeEntry *TE : OrderedOps) {
8866 if (!(TE->State == TreeEntry::Vectorize ||
8867 TE->State == TreeEntry::StridedVectorize ||
8868 TE->State == TreeEntry::CompressVectorize ||
8869 TE->State == TreeEntry::SplitVectorize ||
8870 (TE->isGather() && GathersToOrders.
contains(TE))) ||
8871 !TE->UserTreeIndex || !TE->ReuseShuffleIndices.empty() ||
8872 !Visited.
insert(TE).second)
8876 Users.first = TE->UserTreeIndex.UserTE;
8877 Users.second.emplace_back(TE->UserTreeIndex.EdgeIdx, TE);
8881 if (
Data.first->State == TreeEntry::SplitVectorize) {
8883 Data.second.size() <= 2 &&
8884 "Expected not greater than 2 operands for split vectorize node.");
8886 [](
const auto &
Op) { return !Op.second->UserTreeIndex; }))
8889 assert(
Data.first->CombinedEntriesWithIndices.size() == 2 &&
8890 "Expected exactly 2 entries.");
8891 for (
const auto &
P :
Data.first->CombinedEntriesWithIndices) {
8892 TreeEntry &OpTE = *VectorizableTree[
P.first];
8894 if (Order.
empty() || !OpTE.ReuseShuffleIndices.empty()) {
8895 if (!OpTE.isGather() && OpTE.ReuseShuffleIndices.empty())
8897 const auto BestOrder =
8906 const unsigned E = Order.
size();
8909 return I < E ? static_cast<int>(I) : PoisonMaskElem;
8911 Data.first->reorderSplitNode(
P.second ? 1 : 0, Mask, MaskOrder);
8913 if (!OpTE.ReorderIndices.empty()) {
8914 OpTE.ReorderIndices.clear();
8915 }
else if (!OpTE.ReuseShuffleIndices.empty()) {
8918 assert(OpTE.isGather() &&
"Expected only gather/buildvector node.");
8922 if (
Data.first->ReuseShuffleIndices.empty() &&
8923 !
Data.first->ReorderIndices.empty()) {
8926 Queue.push(
Data.first);
8932 buildReorderableOperands(
Data.first,
Data.second, NonVectorized,
8944 for (
const auto &
Op :
Data.second) {
8945 TreeEntry *OpTE =
Op.second;
8946 if (!VisitedOps.
insert(OpTE).second)
8948 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.
count(OpTE))
8950 const auto Order = [&]() ->
const OrdersType {
8951 if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty())
8955 return OpTE->ReorderIndices;
8959 if (Order.
size() == 1)
8965 Value *Root = OpTE->hasState()
8968 auto GetSameNodesUsers = [&](
Value *Root) {
8970 for (
const TreeEntry *TE : ValueToGatherNodes.lookup(Root)) {
8971 if (TE != OpTE && TE->UserTreeIndex &&
8972 TE->getVectorFactor() == OpTE->getVectorFactor() &&
8973 TE->Scalars.size() == OpTE->Scalars.size() &&
8974 ((TE->ReorderIndices.empty() && OpTE->isSame(TE->Scalars)) ||
8975 (OpTE->ReorderIndices.empty() && TE->isSame(OpTE->Scalars))))
8976 Res.
insert(TE->UserTreeIndex.UserTE);
8978 for (
const TreeEntry *TE : getTreeEntries(Root)) {
8979 if (TE != OpTE && TE->UserTreeIndex &&
8980 TE->getVectorFactor() == OpTE->getVectorFactor() &&
8981 TE->Scalars.size() == OpTE->Scalars.size() &&
8982 ((TE->ReorderIndices.empty() && OpTE->isSame(TE->Scalars)) ||
8983 (OpTE->ReorderIndices.empty() && TE->isSame(OpTE->Scalars))))
8984 Res.
insert(TE->UserTreeIndex.UserTE);
8988 auto GetNumOperands = [](
const TreeEntry *TE) {
8989 if (TE->State == TreeEntry::SplitVectorize)
8990 return TE->getNumOperands();
8992 return CI->arg_size();
8993 return TE->getNumOperands();
8995 auto NodeShouldBeReorderedWithOperands = [&, TTI = TTI](
8996 const TreeEntry *TE) {
9004 const TreeEntry *
Op = getOperandEntry(TE, Idx);
9005 if (
Op->isGather() &&
Op->hasState()) {
9006 const TreeEntry *VecOp =
9007 getSameValuesTreeEntry(
Op->getMainOp(),
Op->Scalars);
9011 if (
Op->ReorderIndices.empty() &&
Op->ReuseShuffleIndices.empty())
9018 if (!RevisitedOps.
insert(UTE).second)
9020 return UTE ==
Data.first || !UTE->ReorderIndices.empty() ||
9021 !UTE->ReuseShuffleIndices.empty() ||
9022 (UTE->UserTreeIndex &&
9023 UTE->UserTreeIndex.UserTE ==
Data.first) ||
9024 (
Data.first->UserTreeIndex &&
9025 Data.first->UserTreeIndex.UserTE == UTE) ||
9026 (IgnoreReorder && UTE->UserTreeIndex &&
9027 UTE->UserTreeIndex.UserTE->Idx == 0) ||
9028 NodeShouldBeReorderedWithOperands(UTE);
9031 for (TreeEntry *UTE :
Users) {
9039 const TreeEntry *
Op = getOperandEntry(UTE, Idx);
9041 Queue.push(
const_cast<TreeEntry *
>(
Op));
9046 Data.second, [OpTE](
const std::pair<unsigned, TreeEntry *> &
P) {
9047 return P.second == OpTE;
9050 if (OpTE->State == TreeEntry::Vectorize &&
9051 OpTE->getOpcode() == Instruction::Store && !Order.
empty()) {
9052 assert(!OpTE->isAltShuffle() &&
9053 "Alternate instructions are only supported by BinaryOperator "
9057 unsigned E = Order.
size();
9060 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
9063 OrdersUses.try_emplace(CurrentOrder, 0).first->second +=
NumOps;
9065 OrdersUses.try_emplace(Order, 0).first->second +=
NumOps;
9067 auto Res = OrdersUses.try_emplace(
OrdersType(), 0);
9068 const auto AllowsReordering = [&](
const TreeEntry *TE) {
9069 if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
9070 (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
9071 (IgnoreReorder && TE->Idx == 0))
9073 if (TE->isGather()) {
9083 if (OpTE->UserTreeIndex) {
9084 TreeEntry *UserTE = OpTE->UserTreeIndex.UserTE;
9085 if (!VisitedUsers.
insert(UserTE).second)
9090 if (AllowsReordering(UserTE))
9098 if (
static_cast<unsigned>(
count_if(
9099 Ops, [UserTE, &AllowsReordering](
9100 const std::pair<unsigned, TreeEntry *> &
Op) {
9101 return AllowsReordering(
Op.second) &&
9102 Op.second->UserTreeIndex.UserTE == UserTE;
9103 })) <=
Ops.size() / 2)
9104 ++Res.first->second;
9107 if (OrdersUses.empty()) {
9112 unsigned IdentityCnt = 0;
9113 unsigned VF =
Data.second.front().second->getVectorFactor();
9115 for (
auto &Pair : OrdersUses) {
9117 IdentityCnt += Pair.second;
9122 unsigned Cnt = IdentityCnt;
9123 for (
auto &Pair : OrdersUses) {
9127 if (Cnt < Pair.second) {
9129 BestOrder = Pair.first;
9146 unsigned E = BestOrder.
size();
9148 return I < E ? static_cast<int>(I) : PoisonMaskElem;
9150 for (
const std::pair<unsigned, TreeEntry *> &
Op :
Data.second) {
9151 TreeEntry *TE =
Op.second;
9152 if (!VisitedOps.
insert(TE).second)
9154 if (TE->ReuseShuffleIndices.size() == BestOrder.
size()) {
9155 reorderNodeWithReuses(*TE, Mask);
9159 if (TE->State != TreeEntry::Vectorize &&
9160 TE->State != TreeEntry::StridedVectorize &&
9161 TE->State != TreeEntry::CompressVectorize &&
9162 TE->State != TreeEntry::SplitVectorize &&
9163 (TE->State != TreeEntry::ScatterVectorize ||
9164 TE->ReorderIndices.empty()))
9166 assert((BestOrder.
size() == TE->ReorderIndices.size() ||
9167 TE->ReorderIndices.empty()) &&
9168 "Non-matching sizes of user/operand entries.");
9170 if (IgnoreReorder && TE == VectorizableTree.front().get())
9171 IgnoreReorder =
false;
9174 for (TreeEntry *
Gather : GatherOps) {
9176 "Unexpected reordering of gathers.");
9177 if (!
Gather->ReuseShuffleIndices.empty()) {
9187 auto IsNotProfitableAltCodeNode = [](
const TreeEntry &TE) {
9188 return TE.isAltShuffle() &&
9189 (!TE.ReuseShuffleIndices.empty() || TE.getVectorFactor() == 2 ||
9190 TE.ReorderIndices.empty());
9192 if (
Data.first->State != TreeEntry::Vectorize ||
9194 Data.first->getMainOp()) ||
9195 IsNotProfitableAltCodeNode(*
Data.first))
9196 Data.first->reorderOperands(Mask);
9198 IsNotProfitableAltCodeNode(*
Data.first) ||
9199 Data.first->State == TreeEntry::StridedVectorize ||
9200 Data.first->State == TreeEntry::CompressVectorize) {
9204 if (
Data.first->ReuseShuffleIndices.empty() &&
9205 !
Data.first->ReorderIndices.empty() &&
9206 !IsNotProfitableAltCodeNode(*
Data.first)) {
9209 Queue.push(
Data.first);
  if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
      VectorizableTree.front()->ReuseShuffleIndices.empty())
    VectorizableTree.front()->ReorderIndices.clear();
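// Illustrative sketch, not part of the original source: several places above
// turn an order into a shuffle mask in which any slot equal to the order's
// size becomes a poison element. A standalone version of that mapping, using
// -1 as the conventional "undef" lane marker, could be:
#include <vector>

static std::vector<int> orderToMask(const std::vector<unsigned> &Order) {
  const int Poison = -1; // stand-in for PoisonMaskElem
  const unsigned E = Order.size();
  std::vector<int> Mask(E, Poison);
  for (unsigned Lane = 0; Lane < E; ++Lane)
    Mask[Lane] = Order[Lane] < E ? static_cast<int>(Order[Lane]) : Poison;
  return Mask;
}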
Instruction *BoUpSLP::getRootEntryInstruction(const TreeEntry &Entry) const {
  if (Entry.hasState() &&
      (Entry.getOpcode() == Instruction::Store ||
       Entry.getOpcode() == Instruction::Load) &&
      Entry.State == TreeEntry::StridedVectorize &&
      !Entry.ReorderIndices.empty() && isReverseOrder(Entry.ReorderIndices))
9234 const size_t NumVectScalars = ScalarToTreeEntries.size() + 1;
9237 for (
auto &TEPtr : VectorizableTree) {
9238 TreeEntry *Entry = TEPtr.get();
9241 if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize ||
9242 DeletedNodes.contains(Entry) ||
9243 TransformedToGatherNodes.contains(Entry))
9247 for (
int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
9248 Value *Scalar = Entry->Scalars[Lane];
9253 auto It = ScalarToExtUses.
find(Scalar);
9254 if (It != ScalarToExtUses.
end() && !ExternalUses[It->second].User)
9257 if (Scalar->hasNUsesOrMore(NumVectScalars)) {
9258 unsigned FoundLane = Entry->findLaneForValue(Scalar);
9259 LLVM_DEBUG(
dbgs() <<
"SLP: Need to extract from lane " << FoundLane
9260 <<
" from " << *Scalar <<
"for many users.\n");
9261 It = ScalarToExtUses.
try_emplace(Scalar, ExternalUses.size()).first;
9262 ExternalUses.emplace_back(Scalar,
nullptr, *Entry, FoundLane);
9263 ExternalUsesWithNonUsers.insert(Scalar);
9268 const auto ExtI = ExternallyUsedValues.
find(Scalar);
9269 if (ExtI != ExternallyUsedValues.
end()) {
9270 unsigned FoundLane = Entry->findLaneForValue(Scalar);
9271 LLVM_DEBUG(
dbgs() <<
"SLP: Need to extract: Extra arg from lane "
9272 << FoundLane <<
" from " << *Scalar <<
".\n");
9273 ScalarToExtUses.
try_emplace(Scalar, ExternalUses.size());
9274 ExternalUses.emplace_back(Scalar,
nullptr, *Entry, FoundLane);
9285 if (UserIgnoreList && UserIgnoreList->contains(UserInst))
9290 any_of(UseEntries, [
this](
const TreeEntry *UseEntry) {
9291 return !DeletedNodes.contains(UseEntry) &&
9292 !TransformedToGatherNodes.contains(UseEntry);
9297 if (!((Scalar->getType()->getScalarType()->isPointerTy() &&
9300 all_of(UseEntries, [&](TreeEntry *UseEntry) {
9301 if (DeletedNodes.contains(UseEntry) ||
9302 TransformedToGatherNodes.contains(UseEntry))
9304 return UseEntry->State == TreeEntry::ScatterVectorize ||
9306 Scalar, getRootEntryInstruction(*UseEntry), TLI,
9309 LLVM_DEBUG(
dbgs() <<
"SLP: \tInternal user will be removed:" << *U
9312 [](TreeEntry *UseEntry) {
9313 return UseEntry->isGather();
        if (It != ScalarToExtUses.end()) {
          ExternalUses[It->second].User = nullptr;
        if (U && Scalar->hasNUsesOrMore(UsesLimit))
        unsigned FoundLane = Entry->findLaneForValue(Scalar);
                 << " from lane " << FoundLane << " from " << *Scalar
        It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
        ExternalUses.emplace_back(Scalar, U, *Entry, FoundLane);
        ExternalUsesWithNonUsers.insert(Scalar);
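// Illustrative sketch, not part of the original source: the external-use
// bookkeeping above records a (scalar, user, lane) triple for every scalar
// that escapes the tree, deduplicating scalars through a map into the use
// list. A heavily simplified standalone model, with strings standing in for
// Value*/User*, might be:
#include <map>
#include <string>
#include <vector>

struct ExternalUse {
  std::string Scalar;
  std::string User; // empty means "extract for any user"
  unsigned Lane;
};

static void recordExternalUse(std::vector<ExternalUse> &Uses,
                              std::map<std::string, unsigned> &ScalarToUse,
                              const std::string &Scalar,
                              const std::string &User, unsigned Lane) {
  auto [It, Inserted] = ScalarToUse.try_emplace(Scalar, Uses.size());
  if (Inserted)
    Uses.push_back({Scalar, User, Lane}); // first external use of this scalar
  else
    Uses[It->second].User.clear(); // seen again: extract it unconditionally
}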
9342BoUpSLP::collectUserStores(
const BoUpSLP::TreeEntry *TE)
const {
9346 for (
unsigned Lane :
seq<unsigned>(0, TE->Scalars.size())) {
9347 Value *V = TE->Scalars[Lane];
9360 if (
SI ==
nullptr || !
SI->isSimple() ||
SI->getFunction() !=
F ||
9369 auto &StoresVec = PtrToStoresMap[{
SI->getParent(),
9370 SI->getValueOperand()->getType(), Ptr}];
9373 if (StoresVec.size() > Lane)
9375 if (!StoresVec.empty()) {
9377 SI->getValueOperand()->getType(),
SI->getPointerOperand(),
9378 SI->getValueOperand()->getType(),
9379 StoresVec.front()->getPointerOperand(), *
DL, *SE,
9385 StoresVec.push_back(SI);
9390 for (
auto &
P : PtrToStoresMap) {
  StoreInst *S0 = StoresVec[0];
    StoreInst *SI = StoresVec[Idx];
    std::optional<int64_t> Diff =
        SI->getPointerOperand(), *DL, *SE,
  if (StoreOffsetVec.size() != StoresVec.size())
  sort(StoreOffsetVec, llvm::less_first());
  int64_t PrevDist = 0;
  for (const auto &P : StoreOffsetVec) {
    if (Idx > 0 && P.first != PrevDist + 1)
  ReorderIndices.assign(StoresVec.size(), 0);
  bool IsIdentity = true;
    ReorderIndices[P.second] = I;
    IsIdentity &= P.second == I;
    ReorderIndices.clear();
  for (unsigned Idx : Order)
    dbgs() << Idx << ", ";
BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const {
  unsigned NumLanes = TE->Scalars.size();
    if (StoresVec.size() != NumLanes)
    if (!canFormVector(StoresVec, ReorderIndices))
    ExternalReorderIndices.push_back(ReorderIndices);
  return ExternalReorderIndices;
  assert(TreeEntryToStridedPtrInfoMap.empty() &&
         "TreeEntryToStridedPtrInfoMap is not cleared");
  UserIgnoreList = &UserIgnoreLst;
  buildTreeRec(Roots, 0, EdgeInfo());
  assert(TreeEntryToStridedPtrInfoMap.empty() &&
         "TreeEntryToStridedPtrInfoMap is not cleared");
  buildTreeRec(Roots, 0, EdgeInfo());
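// Illustrative sketch, not part of the original source: canFormVector, a bit
// above, pairs each store with its offset from the first store, sorts the
// pairs by offset, requires the offsets to be consecutive, and keeps a reorder
// only when it is not the identity. A standalone model on (offset, lane) pairs
// could be:
#include <algorithm>
#include <cstdint>
#include <utility>
#include <vector>

// Returns true and fills ReorderIndices if the offsets form one contiguous
// chain. ReorderIndices stays empty when the stores are already in order.
static bool formsConsecutiveChain(
    std::vector<std::pair<int64_t, unsigned>> OffsetAndLane,
    std::vector<unsigned> &ReorderIndices) {
  std::sort(OffsetAndLane.begin(), OffsetAndLane.end());
  for (unsigned I = 1; I < OffsetAndLane.size(); ++I)
    if (OffsetAndLane[I].first != OffsetAndLane[I - 1].first + 1)
      return false; // a gap: these stores cannot form one contiguous vector
  ReorderIndices.assign(OffsetAndLane.size(), 0);
  bool IsIdentity = true;
  for (unsigned I = 0; I < OffsetAndLane.size(); ++I) {
    ReorderIndices[OffsetAndLane[I].second] = I; // lane -> position in chain
    IsIdentity &= OffsetAndLane[I].second == I;
  }
  if (IsIdentity)
    ReorderIndices.clear();
  return true;
}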
9512 bool AddNew =
true) {
9520 for (
Value *V : VL) {
9524 if (R.isDeleted(LI) || R.isVectorized(LI) || !LI->isSimple())
9526 bool IsFound =
false;
9527 for (
auto [Map,
Data] :
zip(ClusteredDistToLoad, ClusteredLoads)) {
9528 assert(LI->getParent() ==
Data.front().first->getParent() &&
9529 LI->getType() ==
Data.front().first->getType() &&
9533 "Expected loads with the same type, same parent and same "
9534 "underlying pointer.");
9536 LI->getType(), LI->getPointerOperand(),
Data.front().first->getType(),
9537 Data.front().first->getPointerOperand(),
DL, SE,
9541 auto It = Map.find(*Dist);
9542 if (It != Map.end() && It->second != LI)
9544 if (It == Map.end()) {
9545 Data.emplace_back(LI, *Dist);
9546 Map.try_emplace(*Dist, LI);
9556 auto FindMatchingLoads =
9561 int64_t &
Offset,
unsigned &Start) {
9563 return GatheredLoads.
end();
9572 std::optional<int64_t> Dist =
9574 Data.front().first->getType(),
9575 Data.front().first->getPointerOperand(),
DL, SE,
9581 for (std::pair<LoadInst *, int64_t>
P :
Data) {
9587 unsigned NumUniques = 0;
9588 for (
auto [Cnt, Pair] :
enumerate(Loads)) {
9589 bool Used = DataLoads.
contains(Pair.first);
9590 if (!Used && !DataDists.
contains(*Dist + Pair.second)) {
9594 Repeated.insert(Cnt);
9597 if (NumUniques > 0 &&
9598 (Loads.
size() == NumUniques ||
9599 (Loads.
size() - NumUniques >= 2 &&
9600 Loads.
size() - NumUniques >= Loads.
size() / 2 &&
9606 return std::next(GatheredLoads.
begin(), Idx);
9610 return GatheredLoads.
end();
9612 for (
ArrayRef<std::pair<LoadInst *, int64_t>>
Data : ClusteredLoads) {
9616 auto *It = FindMatchingLoads(
Data, GatheredLoads, LocalToAdd, Repeated,
9618 while (It != GatheredLoads.
end()) {
9619 assert(!LocalToAdd.
empty() &&
"Expected some elements to add.");
9620 for (
unsigned Idx : LocalToAdd)
9623 It = FindMatchingLoads(
Data, GatheredLoads, LocalToAdd, Repeated,
Offset,
9627 return !ToAdd.contains(Idx) && !Repeated.contains(Idx);
      Loads.push_back(Data[Idx]);
        GatheredLoads, [&](ArrayRef<std::pair<LoadInst *, int64_t>> PD) {
          return PD.front().first->getParent() == LI->getParent() &&
                 PD.front().first->getType() == LI->getType();
    while (It != GatheredLoads.end()) {
          std::next(It), GatheredLoads.end(),
          [&](ArrayRef<std::pair<LoadInst *, int64_t>> PD) {
            return PD.front().first->getParent() == LI->getParent() &&
                   PD.front().first->getType() == LI->getType();
      GatheredLoads.emplace_back().append(Data.begin(), Data.end());
    AddNewLoads(GatheredLoads.emplace_back());
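// Illustrative sketch, not part of the original source: the gathering logic
// above buckets loads by their constant distance from the first load of a
// cluster, so each distance appears at most once per cluster and duplicates
// start a new cluster. A reduced standalone model, where the "distance" is an
// int64_t computed elsewhere and loads are just names, might be:
#include <cstdint>
#include <map>
#include <string>
#include <utility>
#include <vector>

using LoadAndDist = std::pair<std::string, int64_t>; // (load name, distance)

static void clusterByDistance(const std::vector<LoadAndDist> &Loads,
                              std::vector<std::vector<LoadAndDist>> &Clusters) {
  std::vector<std::map<int64_t, std::string>> SeenPerCluster;
  for (const auto &[Name, Dist] : Loads) {
    bool Placed = false;
    for (unsigned I = 0; I < Clusters.size(); ++I) {
      // A cluster may hold each distance only once.
      if (SeenPerCluster[I].try_emplace(Dist, Name).second) {
        Clusters[I].push_back({Name, Dist});
        Placed = true;
        break;
      }
    }
    if (!Placed) {
      Clusters.push_back({{Name, Dist}});
      SeenPerCluster.push_back({{Dist, Name}});
    }
  }
}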
9660void BoUpSLP::tryToVectorizeGatheredLoads(
9661 const SmallMapVector<
9662 std::tuple<BasicBlock *, Value *, Type *>,
9665 GatheredLoadsEntriesFirst = VectorizableTree.
size();
9668 LoadEntriesToVectorize.size());
9669 for (
auto [Idx, Set] :
zip(LoadEntriesToVectorize, LoadSetsToVectorize))
9670 Set.insert_range(VectorizableTree[Idx]->Scalars);
9673 auto LoadSorter = [](
const std::pair<LoadInst *, int64_t> &L1,
9674 const std::pair<LoadInst *, int64_t> &L2) {
9675 return L1.second > L2.second;
9682 auto *Ty =
getWidenedType(Loads.front()->getType(), Loads.size());
9683 return TTI->isLegalMaskedGather(Ty, Alignment) &&
9684 !TTI->forceScalarizeMaskedGather(Ty, Alignment);
9689 SmallVectorImpl<LoadInst *> &NonVectorized,
9690 bool Final,
unsigned MaxVF) {
9692 unsigned StartIdx = 0;
9693 SmallVector<int> CandidateVFs;
9697 *TTI, Loads.
front()->getType(), MaxVF);
9699 *TTI, Loads.
front()->getType(), NumElts - 1)) {
9705 if (Final && CandidateVFs.
empty())
9708 unsigned BestVF = Final ? CandidateVFs.
back() : 0;
9709 for (
unsigned NumElts : CandidateVFs) {
9710 if (Final && NumElts > BestVF)
9712 SmallVector<unsigned> MaskedGatherVectorized;
9713 for (
unsigned Cnt = StartIdx,
E = Loads.
size(); Cnt <
E;
9717 if (VectorizedLoads.count(Slice.
front()) ||
9718 VectorizedLoads.count(Slice.
back()) ||
9724 bool AllowToVectorize =
false;
9727 bool IsLegalBroadcastLoad = TTI->isLegalBroadcastLoad(
9730 for (LoadInst *LI : Slice) {
9732 if (LI->hasOneUse())
9738 if (
static_cast<unsigned int>(std::distance(
9739 LI->user_begin(), LI->user_end())) != LI->getNumUses())
9741 if (!IsLegalBroadcastLoad)
9745 for (User *U : LI->users()) {
9748 for (
const TreeEntry *UTE : getTreeEntries(U)) {
9749 for (
int I :
seq<int>(UTE->getNumOperands())) {
9751 return V == LI || isa<PoisonValue>(V);
9761 AllowToVectorize = CheckIfAllowed(Slice);
9765 any_of(ValueToGatherNodes.at(Slice.front()),
9766 [=](
const TreeEntry *TE) {
9767 return TE->Scalars.size() == 2 &&
9768 ((TE->Scalars.front() == Slice.front() &&
9769 TE->Scalars.back() == Slice.back()) ||
9770 (TE->Scalars.front() == Slice.back() &&
9771 TE->Scalars.back() == Slice.front()));
9776 if (AllowToVectorize) {
9781 reinterpret_cast<Value *
const *
>(Slice.begin()), Slice.size());
9782 StridedPtrInfo SPtrInfo;
9784 PointerOps, SPtrInfo, &BestVF);
9786 (BestVF > 1 &&
static_cast<unsigned>(NumElts) == 2 * BestVF)) {
9788 if (MaskedGatherVectorized.
empty() ||
9789 Cnt >= MaskedGatherVectorized.
back() + NumElts)
9794 Results.emplace_back(Values, LS);
9795 VectorizedLoads.insert_range(Slice);
9798 if (Cnt == StartIdx)
9799 StartIdx += NumElts;
9802 if (StartIdx >= Loads.
size())
9806 if (!MaskedGatherVectorized.
empty() &&
9807 Cnt < MaskedGatherVectorized.
back() + NumElts)
9813 if (!AllowToVectorize || BestVF == 0)
9817 for (
unsigned Cnt : MaskedGatherVectorized) {
9819 Cnt, std::min<unsigned>(NumElts, Loads.
size() - Cnt));
9823 VectorizedLoads.insert_range(Slice);
9825 if (Cnt == StartIdx)
9826 StartIdx += NumElts;
9829 for (LoadInst *LI : Loads) {
9830 if (!VectorizedLoads.contains(LI))
9831 NonVectorized.push_back(LI);
9835 auto ProcessGatheredLoads =
9838 bool Final =
false) {
9840 for (
ArrayRef<std::pair<LoadInst *, int64_t>> LoadsDists :
9842 if (LoadsDists.size() <= 1) {
9843 NonVectorized.
push_back(LoadsDists.back().first);
9851 unsigned MaxConsecutiveDistance = 0;
9852 unsigned CurrentConsecutiveDist = 1;
9853 int64_t LastDist = LocalLoadsDists.front().second;
9854 bool AllowMaskedGather = IsMaskedGatherSupported(OriginalLoads);
9855 for (
const std::pair<LoadInst *, int64_t> &L : LocalLoadsDists) {
9858 assert(LastDist >=
L.second &&
9859 "Expected first distance always not less than second");
9860 if (
static_cast<uint64_t
>(LastDist -
L.second) ==
9861 CurrentConsecutiveDist) {
9862 ++CurrentConsecutiveDist;
9863 MaxConsecutiveDistance =
9864 std::max(MaxConsecutiveDistance, CurrentConsecutiveDist);
9868 if (!AllowMaskedGather && CurrentConsecutiveDist == 1 &&
9871 CurrentConsecutiveDist = 1;
9872 LastDist =
L.second;
9875 if (Loads.
size() <= 1)
9877 if (AllowMaskedGather)
9878 MaxConsecutiveDistance = Loads.
size();
9879 else if (MaxConsecutiveDistance < 2)
9884 GetVectorizedRanges(Loads, VectorizedLoads, SortedNonVectorized,
9885 Final, MaxConsecutiveDistance);
9887 OriginalLoads.size() == Loads.
size() &&
9888 MaxConsecutiveDistance == Loads.
size() &&
9893 VectorizedLoads.
clear();
9897 GetVectorizedRanges(OriginalLoads, VectorizedLoads,
9898 UnsortedNonVectorized, Final,
9899 OriginalLoads.size());
9900 if (SortedNonVectorized.
size() >= UnsortedNonVectorized.
size()) {
9901 SortedNonVectorized.
swap(UnsortedNonVectorized);
9902 Results.swap(UnsortedResults);
9907 << Slice.
size() <<
")\n");
9909 for (
Value *L : Slice)
9917 unsigned MaxVF = Slice.size();
9918 unsigned UserMaxVF = 0;
9919 unsigned InterleaveFactor = 0;
9924 std::optional<unsigned> InterleavedLoadsDistance = 0;
9926 std::optional<unsigned> CommonVF = 0;
9927 DenseMap<const TreeEntry *, unsigned> EntryToPosition;
9928 SmallPtrSet<const TreeEntry *, 8> DeinterleavedNodes;
9929 for (
auto [Idx, V] :
enumerate(Slice)) {
9930 for (
const TreeEntry *
E : ValueToGatherNodes.at(V)) {
9931 UserMaxVF = std::max<unsigned>(UserMaxVF,
E->Scalars.size());
9934 UserMaxVF = std::max<unsigned>(UserMaxVF, Idx - Pos + 1);
9936 if (*CommonVF == 0) {
9937 CommonVF =
E->Scalars.size();
9940 if (*CommonVF !=
E->Scalars.size())
9944 if (Pos != Idx && InterleavedLoadsDistance) {
9947 if (isa<Constant>(V))
9949 if (isVectorized(V))
9951 const auto &Nodes = ValueToGatherNodes.at(V);
9952 return (Nodes.size() != 1 || !Nodes.contains(E)) &&
9953 !is_contained(Slice, V);
9955 InterleavedLoadsDistance.reset();
9959 if (*InterleavedLoadsDistance == 0) {
9960 InterleavedLoadsDistance = Idx - Pos;
9963 if ((Idx - Pos) % *InterleavedLoadsDistance != 0 ||
9964 (Idx - Pos) / *InterleavedLoadsDistance < Order)
9965 InterleavedLoadsDistance.reset();
9966 Order = (Idx - Pos) / InterleavedLoadsDistance.value_or(1);
9970 DeinterleavedNodes.
clear();
9972 if (InterleavedLoadsDistance.value_or(0) > 1 &&
9973 CommonVF.value_or(0) != 0) {
9974 InterleaveFactor =
bit_ceil(*InterleavedLoadsDistance);
9975 unsigned VF = *CommonVF;
9978 StridedPtrInfo SPtrInfo;
9980 if (InterleaveFactor <= Slice.size() &&
9981 TTI.isLegalInterleavedAccessType(
9989 UserMaxVF = InterleaveFactor * VF;
9991 InterleaveFactor = 0;
9996 unsigned ConsecutiveNodesSize = 0;
9997 if (!LoadEntriesToVectorize.empty() && InterleaveFactor == 0 &&
9998 any_of(
zip(LoadEntriesToVectorize, LoadSetsToVectorize),
9999 [&, Slice = Slice](
const auto &
P) {
10001 return std::get<1>(
P).contains(V);
10003 if (It == Slice.end())
10005 const TreeEntry &
TE =
10006 *VectorizableTree[std::get<0>(
P)];
10010 StridedPtrInfo SPtrInfo;
10012 VL, VL.
front(), Order, PointerOps, SPtrInfo);
10016 ConsecutiveNodesSize += VL.
size();
10017 size_t Start = std::distance(Slice.begin(), It);
10018 size_t Sz = Slice.size() -
Start;
10019 return Sz < VL.
size() ||
10020 Slice.slice(Start, VL.
size()) != VL;
10025 if (InterleaveFactor == 0 &&
10027 [&, Slice = Slice](
unsigned Idx) {
10029 SmallVector<Value *> PointerOps;
10030 StridedPtrInfo SPtrInfo;
10031 return canVectorizeLoads(
10032 Slice.slice(Idx * UserMaxVF, UserMaxVF),
10033 Slice[Idx * UserMaxVF], Order, PointerOps,
10034 SPtrInfo) == LoadsState::ScatterVectorize;
10037 if (Slice.size() != ConsecutiveNodesSize)
10038 MaxVF = std::min<unsigned>(MaxVF, UserMaxVF);
      for (unsigned VF = MaxVF; VF >= 2; VF /= 2) {
        bool IsVectorized = true;
        for (unsigned I = 0, E = Slice.size(); I < E; I += VF) {
            Slice.slice(I, std::min(VF, E - I));
10049 if (
any_of(
zip(LoadEntriesToVectorize, LoadSetsToVectorize),
10050 [&](
const auto &
P) {
10051 return !SubSlice.
equals(
10052 VectorizableTree[std::get<0>(
P)]
10057 unsigned Sz = VectorizableTree.size();
10058 buildTreeRec(SubSlice, 0,
EdgeInfo(), InterleaveFactor);
10059 if (Sz == VectorizableTree.size()) {
10060 IsVectorized =
false;
10063 if (InterleaveFactor > 0) {
10064 VF = 2 * (MaxVF / InterleaveFactor);
10065 InterleaveFactor = 0;
10074 NonVectorized.
append(SortedNonVectorized);
10076 return NonVectorized;
10078 for (
const auto &GLs : GatheredLoads) {
10079 const auto &
Ref = GLs.second;
10081 if (!
Ref.empty() && !NonVectorized.
empty() &&
10083 Ref.begin(),
Ref.end(), 0u,
10084 [](
unsigned S,
ArrayRef<std::pair<LoadInst *, int64_t>> LoadsDists)
10085 ->
unsigned { return S + LoadsDists.size(); }) !=
10086 NonVectorized.
size() &&
10087 IsMaskedGatherSupported(NonVectorized)) {
10089 FinalGatheredLoads;
10090 for (LoadInst *LI : NonVectorized) {
10094 FinalGatheredLoads,
10098 (void)ProcessGatheredLoads(FinalGatheredLoads,
true);
  for (unsigned Idx : LoadEntriesToVectorize) {
    const TreeEntry &E = *VectorizableTree[Idx];
    if (!E.ReorderIndices.empty()) {
      SmallVector<int> ReorderMask;
    buildTreeRec(GatheredScalars, 0, EdgeInfo());
  if (static_cast<unsigned>(*GatheredLoadsEntriesFirst) ==
      VectorizableTree.size())
    GatheredLoadsEntriesFirst.reset();
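// Illustrative sketch, not part of the original source: the vectorization loop
// above tries the largest vector factor first and, when a slice fails to build
// a tree, retries the remaining work with half the factor. A standalone
// outline of that "halve the VF and re-slice" control flow, with a
// caller-provided TryVectorize callback standing in for buildTreeRec, could be:
#include <algorithm>
#include <functional>
#include <vector>

static void vectorizeBySlices(
    const std::vector<int> &Items, unsigned MaxVF,
    const std::function<bool(const std::vector<int> &)> &TryVectorize) {
  for (unsigned VF = MaxVF; VF >= 2; VF /= 2) {
    bool AllSlicesVectorized = true;
    for (unsigned I = 0, E = Items.size(); I < E; I += VF) {
      unsigned Len = std::min<unsigned>(VF, E - I);
      std::vector<int> Slice(Items.begin() + I, Items.begin() + I + Len);
      AllSlicesVectorized &= TryVectorize(Slice); // may fail for this slice
    }
    if (AllSlicesVectorized)
      break; // everything handled at this factor, no need to go smaller
  }
}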
                                bool AllowAlternate) {
    if (LI->isSimple())
      SubKey = hash_value(EI->getVectorOperand());
    if (AllowAlternate)
    std::pair<size_t, size_t> OpVals =
    if (CI->isCommutative())
      SubKey = hash_value(Gep->getPointerOperand());
  return std::make_pair(Key, SubKey);
                                Instruction *AltOp, const TargetLibraryInfo &TLI);
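// Illustrative sketch, not part of the original source: getAltInstrMask, used a
// few lines below, conceptually produces a per-lane bitmask in which a set bit
// means "this lane uses the alternate opcode rather than the main one". A
// standalone model over a vector of opcodes could be:
#include <vector>

static std::vector<bool>
buildAltOpcodeMask(const std::vector<unsigned> &Opcodes, unsigned MainOpcode,
                   unsigned AltOpcode) {
  std::vector<bool> Mask(Opcodes.size(), false);
  for (unsigned Lane = 0; Lane < Opcodes.size(); ++Lane)
    if (Opcodes[Lane] == AltOpcode && AltOpcode != MainOpcode)
      Mask[Lane] = true; // this lane executes the alternate operation
  return Mask;
}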
10214bool BoUpSLP::areAltOperandsProfitable(
const InstructionsState &S,
10216 Type *ScalarTy = S.getMainOp()->getType();
10217 unsigned Opcode0 = S.getOpcode();
10218 unsigned Opcode1 = S.getAltOpcode();
10219 SmallBitVector OpcodeMask(
getAltInstrMask(VL, ScalarTy, Opcode0, Opcode1));
10222 Opcode1, OpcodeMask))
10225 for (
unsigned I :
seq<unsigned>(S.getMainOp()->getNumOperands())) {
10228 for (
Value *V : VL) {
10230 Operands.
back().push_back(
10237 if (Operands.
size() == 2) {
10241 Candidates[0] = std::make_pair(Operands[0][
I], Operands[0][
I + 1]);
10242 Candidates[1] = std::make_pair(Operands[0][
I], Operands[1][
I + 1]);
10243 Candidates[2] = std::make_pair(Operands[1][
I], Operands[0][
I + 1]);
10245 switch (Res.value_or(0)) {
10249 std::swap(Operands[0][
I + 1], Operands[1][
I + 1]);
10259 DenseSet<unsigned> UniqueOpcodes;
10260 constexpr unsigned NumAltInsts = 3;
10261 unsigned NonInstCnt = 0;
10264 unsigned UndefCnt = 0;
10266 unsigned ExtraShuffleInsts = 0;
10269 if (Operands.
size() == 2) {
10271 if (Operands.
front() == Operands.
back()) {
10275 return is_contained(Operands.back(), V);
10278 ++ExtraShuffleInsts;
10281 const Loop *
L = LI->getLoopFor(S.getMainOp()->getParent());
10293 DenseMap<Value *, unsigned> Uniques;
10303 if (!Res.second && Res.first->second == 1)
10304 ++ExtraShuffleInsts;
10305 ++Res.first->getSecond();
10307 UniqueOpcodes.
insert(
I->getOpcode());
10308 else if (Res.second)
10311 return none_of(Uniques, [&](
const auto &
P) {
10312 return P.first->hasNUsesOrMore(
P.second + 1) &&
10313 none_of(
P.first->users(), [&](User *U) {
10314 return isVectorized(U) || Uniques.contains(U);
10323 (UndefCnt < (VL.size() - 1) * S.getMainOp()->getNumOperands() &&
10324 (UniqueOpcodes.
size() + NonInstCnt + ExtraShuffleInsts +
10325 NumAltInsts) < S.getMainOp()->getNumOperands() * VL.size());
10332 const unsigned VF,
unsigned MinBW,
10355static std::pair<InstructionCost, InstructionCost>
10375 FMF = FPCI->getFastMathFlags();
10378 LibCost.isValid() ? LibCost : ScalarLimit);
10388BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
10390 bool IsScatterVectorizeUserTE,
OrdersType &CurrentOrder,
10391 SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo) {
10393 "Expected instructions with same/alternate opcodes only.");
10395 unsigned ShuffleOrOp =
10396 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.
getOpcode();
10398 switch (ShuffleOrOp) {
10399 case Instruction::PHI: {
10402 return TreeEntry::NeedToGather;
10404 for (
Value *V : VL) {
10408 for (
Value *Incoming :
PHI->incoming_values()) {
10410 if (Term &&
Term->isTerminator()) {
10412 <<
"SLP: Need to swizzle PHINodes (terminator use).\n");
10413 return TreeEntry::NeedToGather;
10418 return TreeEntry::Vectorize;
10420 case Instruction::ExtractElement:
10427 return TreeEntry::NeedToGather;
10429 case Instruction::ExtractValue: {
10430 bool Reuse = canReuseExtract(VL, CurrentOrder);
10434 return TreeEntry::NeedToGather;
10435 if (Reuse || !CurrentOrder.empty())
10436 return TreeEntry::Vectorize;
10438 return TreeEntry::NeedToGather;
10440 case Instruction::InsertElement: {
10444 for (
Value *V : VL) {
10446 LLVM_DEBUG(
dbgs() <<
"SLP: Gather of insertelement/poison vector.\n");
10447 return TreeEntry::NeedToGather;
10451 "Non-constant or undef index?");
10455 return !SourceVectors.contains(V);
10458 LLVM_DEBUG(
dbgs() <<
"SLP: Gather of insertelement vectors with "
10459 "different source vectors.\n");
10460 return TreeEntry::NeedToGather;
10465 return SourceVectors.contains(V) && !
V->hasOneUse();
10468 LLVM_DEBUG(
dbgs() <<
"SLP: Gather of insertelement vectors with "
10469 "multiple uses.\n");
10470 return TreeEntry::NeedToGather;
10473 return TreeEntry::Vectorize;
10475 case Instruction::Load: {
10482 auto IsGatheredNode = [&]() {
10483 if (!GatheredLoadsEntriesFirst)
10488 return any_of(getTreeEntries(V), [&](
const TreeEntry *TE) {
10489 return TE->Idx >= *GatheredLoadsEntriesFirst;
10495 return TreeEntry::Vectorize;
10497 if (!IsGraphTransformMode && !VectorizableTree.empty()) {
10499 LoadEntriesToVectorize.insert(VectorizableTree.size());
10500 return TreeEntry::NeedToGather;
10502 return IsGatheredNode() ? TreeEntry::NeedToGather
10503 : TreeEntry::CompressVectorize;
10505 if (!IsGraphTransformMode && !VectorizableTree.empty()) {
10507 LoadEntriesToVectorize.insert(VectorizableTree.size());
10508 return TreeEntry::NeedToGather;
10510 return IsGatheredNode() ? TreeEntry::NeedToGather
10511 : TreeEntry::ScatterVectorize;
10513 if (!IsGraphTransformMode && VectorizableTree.size() > 1) {
10515 LoadEntriesToVectorize.insert(VectorizableTree.size());
10516 return TreeEntry::NeedToGather;
10518 return IsGatheredNode() ? TreeEntry::NeedToGather
10519 : TreeEntry::StridedVectorize;
10523 if (DL->getTypeSizeInBits(ScalarTy) !=
10524 DL->getTypeAllocSizeInBits(ScalarTy))
10525 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering loads of non-packed type.\n");
10528 return !LI || !LI->isSimple();
10532 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering non-consecutive loads.\n");
10535 return TreeEntry::NeedToGather;
10539 case Instruction::ZExt:
10540 case Instruction::SExt:
10541 case Instruction::FPToUI:
10542 case Instruction::FPToSI:
10543 case Instruction::FPExt:
10544 case Instruction::PtrToInt:
10545 case Instruction::IntToPtr:
10546 case Instruction::SIToFP:
10547 case Instruction::UIToFP:
10548 case Instruction::Trunc:
10549 case Instruction::FPTrunc:
10550 case Instruction::BitCast: {
10552 for (
Value *V : VL) {
10558 dbgs() <<
"SLP: Gathering casts with different src types.\n");
10559 return TreeEntry::NeedToGather;
10562 return TreeEntry::Vectorize;
10564 case Instruction::ICmp:
10565 case Instruction::FCmp: {
10570 for (
Value *V : VL) {
10574 if ((
Cmp->getPredicate() != P0 &&
Cmp->getPredicate() != SwapP0) ||
10575 Cmp->getOperand(0)->getType() != ComparedTy) {
10576 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering cmp with different predicate.\n");
10577 return TreeEntry::NeedToGather;
10580 return TreeEntry::Vectorize;
10582 case Instruction::Select:
10583 case Instruction::FNeg:
10584 case Instruction::Add:
10585 case Instruction::FAdd:
10586 case Instruction::Sub:
10587 case Instruction::FSub:
10588 case Instruction::Mul:
10589 case Instruction::FMul:
10590 case Instruction::UDiv:
10591 case Instruction::SDiv:
10592 case Instruction::FDiv:
10593 case Instruction::URem:
10594 case Instruction::SRem:
10595 case Instruction::FRem:
10596 case Instruction::Shl:
10597 case Instruction::LShr:
10598 case Instruction::AShr:
10599 case Instruction::And:
10600 case Instruction::Or:
10601 case Instruction::Xor:
10602 case Instruction::Freeze:
10603 if (S.getMainOp()->getType()->isFloatingPointTy() &&
10604 TTI->isFPVectorizationPotentiallyUnsafe() &&
any_of(VL, [](
Value *V) {
10606 return I &&
I->isBinaryOp() && !
I->isFast();
10608 return TreeEntry::NeedToGather;
10609 return TreeEntry::Vectorize;
10610 case Instruction::GetElementPtr: {
10612 for (
Value *V : VL) {
10616 if (
I->getNumOperands() != 2) {
10617 LLVM_DEBUG(
dbgs() <<
"SLP: not-vectorizable GEP (nested indexes).\n");
10618 return TreeEntry::NeedToGather;
10625 for (
Value *V : VL) {
10629 Type *CurTy =
GEP->getSourceElementType();
10630 if (Ty0 != CurTy) {
10631 LLVM_DEBUG(
dbgs() <<
"SLP: not-vectorizable GEP (different types).\n");
10632 return TreeEntry::NeedToGather;
10638 for (
Value *V : VL) {
10642 auto *
Op =
I->getOperand(1);
10644 (
Op->getType() != Ty1 &&
10646 Op->getType()->getScalarSizeInBits() >
10647 DL->getIndexSizeInBits(
10648 V->getType()->getPointerAddressSpace())))) {
10650 dbgs() <<
"SLP: not-vectorizable GEP (non-constant indexes).\n");
10651 return TreeEntry::NeedToGather;
10655 return TreeEntry::Vectorize;
10657 case Instruction::Store: {
10659 llvm::Type *ScalarTy =
cast<StoreInst>(VL0)->getValueOperand()->getType();
10662 if (DL->getTypeSizeInBits(ScalarTy) !=
10663 DL->getTypeAllocSizeInBits(ScalarTy)) {
10664 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering stores of non-packed type.\n");
10665 return TreeEntry::NeedToGather;
10669 for (
Value *V : VL) {
10671 if (!
SI->isSimple()) {
10673 return TreeEntry::NeedToGather;
10682 if (CurrentOrder.empty()) {
10683 Ptr0 = PointerOps.
front();
10684 PtrN = PointerOps.
back();
10686 Ptr0 = PointerOps[CurrentOrder.front()];
10687 PtrN = PointerOps[CurrentOrder.back()];
10689 std::optional<int64_t> Dist =
10692 if (
static_cast<uint64_t
>(*Dist) == VL.size() - 1)
10693 return TreeEntry::Vectorize;
10697 return TreeEntry::NeedToGather;
10699 case Instruction::Call: {
10700 if (S.getMainOp()->getType()->isFloatingPointTy() &&
10701 TTI->isFPVectorizationPotentiallyUnsafe() &&
any_of(VL, [](
Value *V) {
10703 return I && !
I->isFast();
10705 return TreeEntry::NeedToGather;
10715 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
10719 return TreeEntry::NeedToGather;
10722 unsigned NumArgs = CI->
arg_size();
10723 SmallVector<Value *, 4> ScalarArgs(NumArgs,
nullptr);
10724 for (
unsigned J = 0; J != NumArgs; ++J)
10727 for (
Value *V : VL) {
10732 VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) ||
10734 LLVM_DEBUG(
dbgs() <<
"SLP: mismatched calls:" << *CI <<
"!=" << *V
10736 return TreeEntry::NeedToGather;
10740 for (
unsigned J = 0; J != NumArgs; ++J) {
10743 if (ScalarArgs[J] != A1J) {
10745 <<
"SLP: mismatched arguments in call:" << *CI
10746 <<
" argument " << ScalarArgs[J] <<
"!=" << A1J <<
"\n");
10747 return TreeEntry::NeedToGather;
10756 LLVM_DEBUG(
dbgs() <<
"SLP: mismatched bundle operands in calls:" << *CI
10757 <<
"!=" << *V <<
'\n');
10758 return TreeEntry::NeedToGather;
10763 auto *VecTy =
getWidenedType(S.getMainOp()->getType(), VL.size());
10765 if (!VecCallCosts.first.isValid() && !VecCallCosts.second.isValid())
10766 return TreeEntry::NeedToGather;
10768 return TreeEntry::Vectorize;
  case Instruction::ShuffleVector: {
    if (!S.isAltShuffle()) {
      return TreeEntry::Vectorize;
      LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
      return TreeEntry::NeedToGather;
          << "SLP: ShuffleVector not vectorized, operands are buildvector and "
             "the whole alt sequence is not profitable.\n");
      return TreeEntry::NeedToGather;
    return TreeEntry::Vectorize;
    return TreeEntry::NeedToGather;
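// Illustrative sketch, not part of the original source: the PHIHandler below
// groups PHI operands so that, for each incoming block of the main PHI, the
// value coming from the matching block of every other PHI lands in the same
// lane. A reduced standalone model, with a PHI represented as a map from
// incoming-block name to value, could be:
#include <map>
#include <string>
#include <vector>

using SimplePhi = std::map<std::string, int>; // incoming block -> value

static std::vector<std::vector<int>>
gatherPhiOperands(const std::vector<std::string> &IncomingBlocks,
                  const std::vector<SimplePhi> &Phis, int PoisonValue) {
  // One operand vector per incoming block, one lane per PHI.
  std::vector<std::vector<int>> Operands(
      IncomingBlocks.size(), std::vector<int>(Phis.size(), PoisonValue));
  for (unsigned I = 0; I < IncomingBlocks.size(); ++I)
    for (unsigned Lane = 0; Lane < Phis.size(); ++Lane) {
      auto It = Phis[Lane].find(IncomingBlocks[I]);
      if (It != Phis[Lane].end())
        Operands[I][Lane] = It->second; // value flowing in from this block
    }
  return Operands;
}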
  PHINode *Main = nullptr;
  PHIHandler() = delete;
      : DT(DT), Main(Main), Phis(Phis),
        Operands(Main->getNumIncomingValues(),
  void buildOperands() {
    constexpr unsigned FastLimit = 4;
      for (auto [Idx, V] : enumerate(Phis)) {
               "Expected isa instruction or poison value.");
          Operands[I][Idx] = V;
          if (P->getIncomingBlock(I) == InBB)
            Operands[I][Idx] = P->getIncomingValue(I);
          Operands[I][Idx] = P->getIncomingValueForBlock(InBB);
10837 SmallMapVector<BasicBlock *, SmallVector<unsigned>, 4>
10847 for (
auto [Idx, V] :
enumerate(Phis)) {
10850 Operands[
I][Idx] =
V;
10859 Operands[
I][Idx] =
P->getIncomingValue(
I);
10862 auto *It = Blocks.
find(InBB);
10863 if (It == Blocks.
end())
10865 Operands[It->second.front()][Idx] =
P->getIncomingValue(
I);
10868 for (
const auto &
P : Blocks) {
10869 ArrayRef<unsigned> IncomingValues =
P.second;
10870 if (IncomingValues.
size() <= 1)
10873 for (
unsigned I : IncomingValues) {
10875 [&](
const auto &
Data) {
10876 return !
Data.value() ||
10877 Data.value() == Operands[BasicI][
Data.index()];
10879 "Expected empty operands list.");
10880 Operands[
I] = Operands[BasicI];
10893static std::pair<Instruction *, Instruction *>
10897 for (
Value *V : VL) {
10907 if (MainOp->
getOpcode() ==
I->getOpcode()) {
10926 "Expected different main and alt instructions.");
10927 return std::make_pair(MainOp, AltOp);
10940 const InstructionsState &S,
10942 bool TryPad =
false) {
10946 for (
Value *V : VL) {
10962 size_t NumUniqueScalarValues = UniqueValues.
size();
10965 if (NumUniqueScalarValues == VL.
size() &&
10967 ReuseShuffleIndices.
clear();
10972 if ((UserTreeIdx.
UserTE &&
10973 UserTreeIdx.
UserTE->hasNonWholeRegisterOrNonPowerOf2Vec(
TTI)) ||
10975 LLVM_DEBUG(
dbgs() <<
"SLP: Reshuffling scalars not yet supported "
10976 "for nodes with padding.\n");
10977 ReuseShuffleIndices.
clear();
    if (NumUniqueScalarValues <= 1 || !IsFullVectors ||
    if (TryPad && UniquePositions.size() > 1 && NumUniqueScalarValues > 1 &&
        S.getMainOp()->isSafeToRemove() &&
        (S.areInstructionsWithCopyableElements() ||
          TTI, UniqueValues.front()->getType(), UniqueValues.size());
      PWSz = std::min<unsigned>(PWSz, VL.size());
      if (PWSz == VL.size()) {
        ReuseShuffleIndices.clear();
                                   UniqueValues.end());
        PaddedUniqueValues.append(PWSz - UniqueValues.size(),
        if ((!S.areInstructionsWithCopyableElements() &&
            (S.areInstructionsWithCopyableElements() && S.isMulDivLikeOp() &&
             (S.getMainOp()->isIntDivRem() || S.getMainOp()->isFPDivRem() ||
          ReuseShuffleIndices.clear();
        VL = std::move(PaddedUniqueValues);
    ReuseShuffleIndices.clear();
    VL = std::move(UniqueValues);
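// Illustrative sketch, not part of the original source: the duplicate handling
// above shrinks the bundle to its unique scalars and records, for every
// original lane, which unique scalar it should be shuffled from. A standalone
// model over plain ints could be:
#include <map>
#include <vector>

static std::vector<int> buildUniqueList(const std::vector<int> &VL,
                                        std::vector<int> &ReuseIndices) {
  std::vector<int> Unique;
  std::map<int, int> Position; // scalar -> index into Unique
  ReuseIndices.clear();
  for (int V : VL) {
    auto [It, Inserted] =
        Position.try_emplace(V, static_cast<int>(Unique.size()));
    if (Inserted)
      Unique.push_back(V);
    ReuseIndices.push_back(It->second); // original lane reuses this unique slot
  }
  if (Unique.size() == VL.size())
    ReuseIndices.clear(); // no duplicates: no reuse shuffle needed
  return Unique;
}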
11030 const InstructionsState &LocalState,
11031 SmallVectorImpl<Value *> &Op1,
11032 SmallVectorImpl<Value *> &Op2,
11034 constexpr unsigned SmallNodeSize = 4;
11035 if (VL.
size() <= SmallNodeSize || TTI->preferAlternateOpcodeVectorization() ||
11040 LLVM_DEBUG(
dbgs() <<
"SLP: \tChecking bundle: " << *LocalState.getMainOp()
11042 for (TreeEntry *
E : getSplitTreeEntries(LocalState.getMainOp())) {
11043 if (
E->isSame(VL)) {
11045 << *LocalState.getMainOp() <<
".\n");
11057 ReorderIndices.assign(VL.
size(), VL.
size());
11058 SmallBitVector Op1Indices(VL.
size());
11063 Op1Indices.set(Idx);
11066 if ((LocalState.getAltOpcode() != LocalState.getOpcode() &&
11069 (LocalState.getAltOpcode() == LocalState.getOpcode() &&
11071 LocalState.getAltOp(), *TLI))) {
11073 Op1Indices.set(Idx);
11080 unsigned Opcode0 = LocalState.getOpcode();
11081 unsigned Opcode1 = LocalState.getAltOpcode();
11082 SmallBitVector OpcodeMask(
getAltInstrMask(VL, ScalarTy, Opcode0, Opcode1));
11087 if (UOp1.size() <= 1 || UOp2.size() <= 1 ||
11088 TTI->isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask) ||
11093 unsigned Op1Cnt = 0, Op2Cnt = Op1.
size();
11095 if (Op1Indices.test(Idx)) {
11096 ReorderIndices[Op1Cnt] = Idx;
11099 ReorderIndices[Op2Cnt] = Idx;
11104 ReorderIndices.clear();
11105 SmallVector<int>
Mask;
11106 if (!ReorderIndices.empty())
11108 unsigned NumParts = TTI->getNumberOfParts(VecTy);
11113 if (NumParts >= VL.
size())
11118 FixedVectorType *SubVecTy =
11122 if (!LocalState.isCmpOp() && NumParts <= 1 &&
11123 (
Mask.empty() || InsertCost >= NewShuffleCost))
11125 if ((LocalState.getMainOp()->isBinaryOp() &&
11126 LocalState.getAltOp()->isBinaryOp() &&
11127 (LocalState.isShiftOp() || LocalState.isBitwiseLogicOp() ||
11128 LocalState.isAddSubLikeOp() || LocalState.isMulDivLikeOp())) ||
11129 (LocalState.getMainOp()->isCast() && LocalState.getAltOp()->isCast()) ||
11130 (LocalState.getMainOp()->isUnaryOp() &&
11131 LocalState.getAltOp()->isUnaryOp())) {
11133 TTI->getArithmeticInstrCost(Opcode0, VecTy, Kind) +
11134 TTI->getArithmeticInstrCost(Opcode1, VecTy, Kind);
11139 OriginalMask[Idx] = Idx + (Op1Indices.test(Idx) ? 0 : VL.
size());
11143 VecTy, OriginalMask, Kind);
11145 TTI->getArithmeticInstrCost(Opcode0, Op1VecTy, Kind) +
11146 TTI->getArithmeticInstrCost(Opcode1, Op2VecTy, Kind);
11148 NewVecOpsCost + InsertCost +
11149 (!VectorizableTree.empty() && VectorizableTree.front()->hasState() &&
11150 VectorizableTree.front()->getOpcode() == Instruction::Store
11154 if (NewCost >= OriginalCost)
11164class InstructionsCompatibilityAnalysis {
11166 const DataLayout &
DL;
11167 const TargetTransformInfo &
TTI;
11168 const TargetLibraryInfo &TLI;
11169 unsigned MainOpcode = 0;
11174 static bool isSupportedOpcode(
const unsigned Opcode) {
11175 return Opcode == Instruction::Add || Opcode == Instruction::Sub ||
11176 Opcode == Instruction::LShr || Opcode == Instruction::Shl ||
11177 Opcode == Instruction::SDiv || Opcode == Instruction::UDiv ||
11178 Opcode == Instruction::And || Opcode == Instruction::Or ||
11179 Opcode == Instruction::Xor || Opcode == Instruction::FAdd ||
11180 Opcode == Instruction::FSub || Opcode == Instruction::FMul ||
11181 Opcode == Instruction::FDiv;
11191 auto IsSupportedInstruction = [&](
Instruction *
I,
bool AnyUndef) {
11192 if (AnyUndef && (
I->isIntDivRem() ||
I->isFPDivRem() ||
isa<CallInst>(
I)))
11194 return I && isSupportedOpcode(
I->getOpcode()) &&
11199 SmallDenseSet<Value *, 8> Operands;
11200 SmallMapVector<unsigned, SmallVector<Instruction *>, 4> Candidates;
11201 bool AnyUndef =
false;
11202 for (
Value *V : VL) {
11210 if (Candidates.
empty()) {
11211 Candidates.
try_emplace(
I->getOpcode()).first->second.push_back(
I);
11213 Operands.
insert(
I->op_begin(),
I->op_end());
11216 if (Parent ==
I->getParent()) {
11217 Candidates.
try_emplace(
I->getOpcode()).first->second.push_back(
I);
11218 Operands.
insert(
I->op_begin(),
I->op_end());
11221 auto *NodeA = DT.
getNode(Parent);
11222 auto *NodeB = DT.
getNode(
I->getParent());
11223 assert(NodeA &&
"Should only process reachable instructions");
11224 assert(NodeB &&
"Should only process reachable instructions");
11225 assert((NodeA == NodeB) ==
11226 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
11227 "Different nodes should have different DFS numbers");
11228 if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn()) {
11229 Candidates.
clear();
11230 Candidates.
try_emplace(
I->getOpcode()).first->second.push_back(
I);
11233 Operands.
insert(
I->op_begin(),
I->op_end());
11236 unsigned BestOpcodeNum = 0;
11238 bool UsedOutside =
false;
11239 for (
const auto &
P : Candidates) {
11241 if (UsedOutside && !PUsedOutside)
11243 if (!UsedOutside && PUsedOutside)
11245 if (
P.second.size() < BestOpcodeNum)
11248 if (!PUsedOutside &&
any_of(
P.second, [&](Instruction *
I) {
11249 return Operands.contains(I);
11252 UsedOutside = PUsedOutside;
11253 for (Instruction *
I :
P.second) {
11254 if (IsSupportedInstruction(
I, AnyUndef)) {
11256 BestOpcodeNum =
P.second.size();
11266 return I &&
I->getParent() == MainOp->
getParent() &&
11279 Value *selectBestIdempotentValue()
const {
11280 assert(isSupportedOpcode(MainOpcode) &&
"Unsupported opcode");
11291 if (!S.isCopyableElement(V))
11293 assert(isSupportedOpcode(MainOpcode) &&
"Unsupported opcode");
11294 return {
V, selectBestIdempotentValue()};
11300 SmallVectorImpl<BoUpSLP::ValueList> &Operands)
const {
11302 unsigned ShuffleOrOp =
11303 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.
getOpcode();
11306 switch (ShuffleOrOp) {
11307 case Instruction::PHI: {
11311 PHIHandler Handler(DT, PH, VL);
11312 Handler.buildOperands();
11313 Operands.
assign(PH->getNumOperands(), {});
11315 Operands[
I].
assign(Handler.getOperands(
I).begin(),
11316 Handler.getOperands(
I).end());
11319 case Instruction::ExtractValue:
11320 case Instruction::ExtractElement:
11325 case Instruction::InsertElement:
11333 case Instruction::Load:
11337 for (
auto [V,
Op] :
zip(VL, Operands.
back())) {
11341 Op = LI->getPointerOperand();
11344 case Instruction::ZExt:
11345 case Instruction::SExt:
11346 case Instruction::FPToUI:
11347 case Instruction::FPToSI:
11348 case Instruction::FPExt:
11349 case Instruction::PtrToInt:
11350 case Instruction::IntToPtr:
11351 case Instruction::SIToFP:
11352 case Instruction::UIToFP:
11353 case Instruction::Trunc:
11354 case Instruction::FPTrunc:
11355 case Instruction::BitCast:
11356 case Instruction::ICmp:
11357 case Instruction::FCmp:
11358 case Instruction::Select:
11359 case Instruction::FNeg:
11360 case Instruction::Add:
11361 case Instruction::FAdd:
11362 case Instruction::Sub:
11363 case Instruction::FSub:
11364 case Instruction::Mul:
11365 case Instruction::FMul:
11366 case Instruction::UDiv:
11367 case Instruction::SDiv:
11368 case Instruction::FDiv:
11369 case Instruction::URem:
11370 case Instruction::SRem:
11371 case Instruction::FRem:
11372 case Instruction::Shl:
11373 case Instruction::LShr:
11374 case Instruction::AShr:
11375 case Instruction::And:
11376 case Instruction::Or:
11377 case Instruction::Xor:
11378 case Instruction::Freeze:
11379 case Instruction::Store:
11380 case Instruction::ShuffleVector:
11389 auto [
Op, ConvertedOps] = convertTo(
I, S);
11394 case Instruction::GetElementPtr: {
11401 const unsigned IndexIdx = 1;
11407 return !
GEP || VL0Ty ==
GEP->getOperand(IndexIdx)->getType();
11411 ->getPointerOperandType()
11412 ->getScalarType());
11416 Operands[0][Idx] =
V;
11417 Operands[1][Idx] = ConstantInt::getNullValue(Ty);
11420 Operands[0][Idx] =
GEP->getPointerOperand();
11421 auto *
Op =
GEP->getOperand(IndexIdx);
11424 CI, Ty, CI->getValue().isSignBitSet(),
DL)
11429 case Instruction::Call: {
11436 for (
Value *V : VL) {
11438 Ops.push_back(
I ?
I->getOperand(Idx)
11451 InstructionsCompatibilityAnalysis(DominatorTree &DT,
const DataLayout &
DL,
11452 const TargetTransformInfo &
TTI,
11453 const TargetLibraryInfo &TLI)
11458 bool TryCopyableElementsVectorization,
11459 bool WithProfitabilityCheck =
false,
11460 bool SkipSameCodeCheck =
false) {
11461 InstructionsState S = (SkipSameCodeCheck || !
allSameBlock(VL))
11462 ? InstructionsState::invalid()
11468 findAndSetMainInstruction(VL, R);
11470 return InstructionsState::invalid();
11471 S = InstructionsState(MainOp, MainOp,
true);
11472 if (!WithProfitabilityCheck)
    auto BuildCandidates =
        [](SmallVectorImpl<std::pair<Value *, Value *>> &Candidates, Value *V1,
    // ...
          if (I1 && I2 && I1->getOpcode() == I2->getOpcode() &&
              I1->getParent() != I2->getParent())
    // ...
    if (VL.size() == 2) {
      // ...
      BuildCandidates(Candidates1, Operands[0][0], Operands[0][1]);
      BuildCandidates(Candidates2, Operands[1][0], Operands[1][1]);
      bool Res = !Candidates1.empty() && !Candidates2.empty() &&
                 R.findBestRootPair(Candidates1) &&
                 R.findBestRootPair(Candidates2);
      // ...
        Candidates1.clear();
        Candidates2.clear();
        BuildCandidates(Candidates1, Operands[0][0], Operands[1][1]);
        BuildCandidates(Candidates2, Operands[1][0], Operands[0][1]);
        Res = !Candidates1.empty() && !Candidates2.empty() &&
              R.findBestRootPair(Candidates1) &&
              R.findBestRootPair(Candidates2);
      // ...
        return InstructionsState::invalid();
    // ...
    FixedVectorType *VecTy =
    // ...
    switch (MainOpcode) {
    case Instruction::Add:
    case Instruction::Sub:
    case Instruction::LShr:
    case Instruction::Shl:
    case Instruction::SDiv:
    case Instruction::UDiv:
    case Instruction::And:
    case Instruction::Or:
    case Instruction::Xor:
    case Instruction::FAdd:
    case Instruction::FMul:
    case Instruction::FSub:
    case Instruction::FDiv:
    // ...
    if (VectorCost > ScalarCost)
      return InstructionsState::invalid();
    // ...
    assert(Operands.size() == 2 && "Unexpected number of operands!");
    unsigned CopyableNum =
        count_if(VL, [&](Value *V) { return S.isCopyableElement(V); });
    if (CopyableNum < VL.size() / 2)
    // ...
    const unsigned Limit = VL.size() / 24;
    if ((CopyableNum >= VL.size() - Limit ||
         (CopyableNum >= VL.size() - 1 && VL.size() > 4) ||
    // ...
      return InstructionsState::invalid();
    for (auto &Ops : Operands) {
      // ...
        return InstructionsState::invalid();
      // ...
      constexpr unsigned Limit = 4;
      if (Operands.front().size() >= Limit) {
        SmallDenseMap<const Value *, unsigned> Counters;
        // ...
          return C.second == 1;
      // ...
      InstructionsCompatibilityAnalysis Analysis(DT, DL, TTI, TLI);
      InstructionsState OpS = Analysis.buildInstructionsState(
      // ...
      if (!OpS || (OpS.getOpcode() == Instruction::PHI && !allSameBlock(Ops)))
      // ...
      unsigned CopyableNum =
      // ...
      return CopyableNum <= VL.size() / 2;
    // ...
    if (!CheckOperand(Operands.front()))
      return InstructionsState::invalid();
    // ...
    assert(S && "Invalid state!");
    // ...
    if (S.areInstructionsWithCopyableElements()) {
      MainOp = S.getMainOp();
      MainOpcode = S.getOpcode();
      // ...
      for (auto [OperandIdx, Operand] : enumerate(OperandsForValue))
        Operands[OperandIdx][Idx] = Operand;
    // ...
    buildOriginalOperands(S, VL, Operands);
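// Legality pre-check for a candidate bundle: builds an InstructionsState for
// the scalars and rejects bundles that have to be gathered (ephemeral values,
// scalable vector extracts, exceeded recursion depth, unprofitable alternate
// shuffles, already gathered scalars, etc.). The returned
// ScalarsVectorizationLegality carries the state plus the verdict that
// buildTreeRec acts on below.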
BoUpSLP::ScalarsVectorizationLegality BoUpSLP::getScalarsVectorizationLegality(
    // ...
    bool TryCopyableElementsVectorization) const {
  // ...
  InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
  InstructionsState S = Analysis.buildInstructionsState(
      VL, *this, TryCopyableElementsVectorization,
      true, TryCopyableElementsVectorization);
  // ...
  bool AreScatterAllGEPSameBlock = false;
  // ...
  SmallVector<unsigned> SortedIndices;
  // ...
  bool IsScatterVectorizeUserTE =
      UserTreeIdx.UserTE &&
      UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
  AreScatterAllGEPSameBlock =
      // ...
      *SE, SortedIndices));
  if (!AreScatterAllGEPSameBlock) {
    LLVM_DEBUG(dbgs() << "SLP: Try split and if failed, gathering due to "
                         "C,S,B,O, small shuffle. \n";
    // ...
    return ScalarsVectorizationLegality(S, false,
  // ...
  assert(It != VL.end() && "Expected at least one GEP.");
  // ...
  assert(S && "Must be valid.");
  // ...
    return ScalarsVectorizationLegality(S, false,
  // ...
  BasicBlock *BB = S.getMainOp()->getParent();
  // ...
      !DT->isReachableFromEntry(BB)) {
    // ...
    return ScalarsVectorizationLegality(S, false);
  // ...
    return ScalarsVectorizationLegality(S, false,
  // ...
  if (S.getOpcode() == Instruction::ExtractElement &&
      // ...
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n");
    return ScalarsVectorizationLegality(S, false);
      (S.isAltShuffle() || VL.size() < 4 ||
  // ...
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
    return ScalarsVectorizationLegality(S, false);
  // ...
  LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.getMainOp() << ".\n");
  for (TreeEntry *E : getTreeEntries(S.getMainOp())) {
    if (E->isSame(VL)) {
      LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.getMainOp()
      // ...
      return ScalarsVectorizationLegality(S, false);
  // ...
      (S.getOpcode() == Instruction::PHI && isa<PHINode>(V) &&
       LI->getLoopFor(S.getMainOp()->getParent()) &&
    // ...
    return ScalarsVectorizationLegality(S, false);
  // ...
    if (!S || !S.isAltShuffle() || VL.size() > 2)
    // ...
    SmallVector<unsigned, 8> InstsCount;
    for (Value *V : VL) {
      // ...
        return isa<Instruction>(Op) || isVectorLikeInstWithConstOps(Op);
    // ...
    bool IsCommutative =
    // ...
    if ((IsCommutative &&
         std::accumulate(InstsCount.begin(), InstsCount.end(), 0) < 2) ||
        // ...
         all_of(InstsCount, [](unsigned ICnt) { return ICnt < 2; })))
    // ...
    assert(VL.size() == 2 && "Expected only 2 alternate op instructions.");
    // ...
    for (int Op : seq<int>(S.getMainOp()->getNumOperands()))
      // ...
                              I2->getOperand(Op));
    if (static_cast<unsigned>(count_if(
            Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
              // ...
            })) >= S.getMainOp()->getNumOperands() / 2)
    // ...
    if (S.getMainOp()->getNumOperands() > 2)
    // ...
    if (IsCommutative) {
      // ...
      Candidates.clear();
      for (int Op = 0, E = S.getMainOp()->getNumOperands(); Op < E; ++Op)
        // ...
                                I2->getOperand((Op + 1) % E));
      // ...
              Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
  bool AreAllSameBlock = !AreScatterAllGEPSameBlock;
  bool AreAllSameInsts = AreAllSameBlock || AreScatterAllGEPSameBlock;
  if (!AreAllSameInsts || isSplat(VL) ||
      // ...
      NotProfitableForVectorization(VL)) {
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n";
    // ...
    return ScalarsVectorizationLegality(S, false);
  // ...
  if (!EphValues.empty()) {
    for (Value *V : VL) {
      if (EphValues.count(V)) {
        // ...
                   << ") is ephemeral.\n");
        // ...
        return ScalarsVectorizationLegality(S, false,
  // ...
  if (S.isAltShuffle()) {
    auto GetNumVectorizedExtracted = [&]() {
      // ...
          all_of(I->operands(), [&](const Use &U) {
            return isa<ExtractElementInst>(U.get());
      // ...
        else if (!I->hasOneUser() && !areAllUsersVectorized(I, UserIgnoreList))
      // ...
      return std::make_pair(Vectorized, Extracted);
    // ...
    auto [Vectorized, Extracted] = GetNumVectorizedExtracted();
    // ...
    bool PreferScalarize = !Vectorized.isAllOnes() && VL.size() == 2;
    if (!Vectorized.isAllOnes() && !PreferScalarize) {
      // ...
      Type *ScalarTy = VL.front()->getType();
      // ...
          false, true, Kind);
      // ...
          *TTI, ScalarTy, VecTy, Vectorized,
          true, false, Kind, false);
      PreferScalarize = VectorizeCostEstimate > ScalarizeCostEstimate;
    // ...
    if (PreferScalarize) {
      LLVM_DEBUG(dbgs() << "SLP: The instructions are in tree and alternate "
                           "node is not profitable.\n");
      return ScalarsVectorizationLegality(S, false);
  // ...
  if (UserIgnoreList && !UserIgnoreList->empty()) {
    for (Value *V : VL) {
      if (UserIgnoreList->contains(V)) {
        LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
        return ScalarsVectorizationLegality(S, false);
  // ...
  return ScalarsVectorizationLegality(S, true);
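// buildTreeRec: recursively builds the SLP graph for the bundle VL. It first
// handles constant/PHI-only bundles and legality failures (gathering or
// splitting the node), then tries to schedule the bundle, and finally
// switches over the (shuffle-or-)opcode to create the matching TreeEntry kind
// and recurse into the operands.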
                          unsigned InterleaveFactor) {
  // ...
  SmallVector<int> ReuseShuffleIndices;
  // ...
  auto TrySplitNode = [&](const InstructionsState &LocalState) {
    // ...
    if (!canBuildSplitNode(VL, LocalState, Op1, Op2, ReorderIndices))
    // ...
    auto Invalid = ScheduleBundle::invalid();
    auto *TE = newTreeEntry(VL, TreeEntry::SplitVectorize, Invalid, LocalState,
                            UserTreeIdx, {}, ReorderIndices);
    // ...
            getSameValuesTreeEntry(S.getMainOp(), Op, true))) {
        // ...
        TE->CombinedEntriesWithIndices.emplace_back(VectorizableTree.size(),
                                                    Idx == 0 ? 0 : Op1.size());
        (void)newTreeEntry(Op, TreeEntry::NeedToGather, Invalid, S, {TE, Idx});
      // ...
      TE->CombinedEntriesWithIndices.emplace_back(VectorizableTree.size(),
                                                  Idx == 0 ? 0 : Op1.size());
  // ...
  bool AreConsts = false;
  for (Value *V : VL) {
  // ...
  if (AreOnlyConstsWithPHIs(VL)) {
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to all constants and PHIs.\n");
    newGatherTreeEntry(VL, InstructionsState::invalid(), UserTreeIdx);
  // ...
  ScalarsVectorizationLegality Legality = getScalarsVectorizationLegality(
      VL, Depth, UserTreeIdx, false);
  InstructionsState S = Legality.getInstructionsState();
  if (!Legality.isLegal()) {
    if (Legality.trySplitVectorize()) {
      // ...
      if (MainOp && AltOp && TrySplitNode(InstructionsState(MainOp, AltOp)))
    // ...
    Legality = getScalarsVectorizationLegality(
        VL, Depth, UserTreeIdx, true);
    if (!Legality.isLegal()) {
      if (Legality.tryToFindDuplicates())
      // ...
      newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
    // ...
    S = Legality.getInstructionsState();
  // ...
  if (S.isAltShuffle() && TrySplitNode(S))
  // ...
    newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
  // ...
  bool IsScatterVectorizeUserTE =
      UserTreeIdx.UserTE &&
      UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
  // ...
  StridedPtrInfo SPtrInfo;
  TreeEntry::EntryState State = getScalarsVectorizationState(
      S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps, SPtrInfo);
  if (State == TreeEntry::NeedToGather) {
    newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
  // ...
  auto &BSRef = BlocksSchedules[BB];
  // ...
    BSRef = std::make_unique<BlockScheduling>(BB);
  // ...
  BlockScheduling &BS = *BSRef;
  // ...
  std::optional<ScheduleBundle *> BundlePtr =
      BS.tryScheduleBundle(UniqueValues.getArrayRef(), this, S, UserTreeIdx);
#ifdef EXPENSIVE_CHECKS
  // ...
  if (!BundlePtr || (*BundlePtr && !*BundlePtr.value())) {
    LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
    // ...
    if (S.isAltShuffle() && ReuseShuffleIndices.empty() && TrySplitNode(S))
    // ...
    newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
    NonScheduledFirst.insert(VL.front());
    if (S.getOpcode() == Instruction::Load &&
        BS.ScheduleRegionSize < BS.ScheduleRegionSizeLimit)
  // ...
  InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
  // ...
  ScheduleBundle Empty;
  ScheduleBundle &Bundle = BundlePtr.value() ? *BundlePtr.value() : Empty;
  LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
  // ...
  unsigned ShuffleOrOp =
      S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
  auto CreateOperandNodes = [&](TreeEntry *TE, const auto &Operands) {
    // ...
    SmallVector<unsigned> PHIOps;
    // ...
      if ((!S || S.getOpcode() != Instruction::PHI) || S.isAltShuffle())
    // ...
    for (unsigned I : PHIOps)
      buildTreeRec(Operands[I], Depth + 1, {TE, I});
  switch (ShuffleOrOp) {
  case Instruction::PHI: {
    // ...
        newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
    // ...
    TE->setOperands(Operands);
    CreateOperandNodes(TE, Operands);
  // ...
  case Instruction::ExtractValue:
  case Instruction::ExtractElement: {
    if (CurrentOrder.empty()) {
      LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
    // ...
        dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
        // ...
        for (unsigned Idx : CurrentOrder)
          dbgs() << " " << Idx;
    // ...
    TreeEntry *TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,
                                 ReuseShuffleIndices, CurrentOrder);
    // ...
               "(ExtractValueInst/ExtractElementInst).\n";
    // ...
    TE->setOperands(Operands);
  // ...
  case Instruction::InsertElement: {
    assert(ReuseShuffleIndices.empty() && "All inserts should be unique");
    // ...
    auto OrdCompare = [](const std::pair<int, int> &P1,
                         const std::pair<int, int> &P2) {
      return P1.first > P2.first;
    // ...
                        decltype(OrdCompare)>
        Indices(OrdCompare);
    for (int I = 0, E = VL.size(); I < E; ++I) {
      // ...
      Indices.emplace(Idx, I);
    // ...
    OrdersType CurrentOrder(VL.size(), VL.size());
    bool IsIdentity = true;
    for (int I = 0, E = VL.size(); I < E; ++I) {
      CurrentOrder[Indices.top().second] = I;
      IsIdentity &= Indices.top().second == I;
    // ...
      CurrentOrder.clear();
    TreeEntry *TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,
    // ...
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (InsertElementInst).\n";
    // ...
    TE->setOperands(Operands);
    buildTreeRec(TE->getOperand(1), Depth + 1, {TE, 1});
  case Instruction::Load: {
    // ...
    TreeEntry *TE = nullptr;
    // ...
    case TreeEntry::Vectorize:
      TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,
                        ReuseShuffleIndices, CurrentOrder, InterleaveFactor);
      if (CurrentOrder.empty())
        LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (LoadInst).\n";
      // ...
            << "SLP: added a new TreeEntry (jumbled LoadInst).\n";
      // ...
    case TreeEntry::CompressVectorize:
      // ...
      TE = newTreeEntry(VL, TreeEntry::CompressVectorize, Bundle, S,
                        UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
      // ...
          << "SLP: added a new TreeEntry (masked LoadInst + compress).\n";
      // ...
    case TreeEntry::StridedVectorize:
      // ...
      TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
                        UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
      TreeEntryToStridedPtrInfoMap[TE] = SPtrInfo;
      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (strided LoadInst).\n";
      // ...
    case TreeEntry::ScatterVectorize:
      // ...
      TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
                        UserTreeIdx, ReuseShuffleIndices);
      // ...
          << "SLP: added a new TreeEntry (non-consecutive LoadInst).\n";
      // ...
    case TreeEntry::CombinedVectorize:
    case TreeEntry::SplitVectorize:
    case TreeEntry::NeedToGather:
    // ...
    if (!CurrentOrder.empty() && State != TreeEntry::ScatterVectorize) {
      assert(Operands.size() == 1 && "Expected a single operand only");
      SmallVector<int> Mask;
    // ...
    TE->setOperands(Operands);
    if (State == TreeEntry::ScatterVectorize)
      buildTreeRec(PointerOps, Depth + 1, {TE, 0});
  // ...
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    auto [PrevMaxBW, PrevMinBW] = CastMaxMinBWSizes.value_or(
        std::make_pair(std::numeric_limits<unsigned>::min(),
                       std::numeric_limits<unsigned>::max()));
    if (ShuffleOrOp == Instruction::ZExt ||
        ShuffleOrOp == Instruction::SExt) {
      CastMaxMinBWSizes = std::make_pair(
          std::max<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
          // ...
          std::min<unsigned>(
          // ...
    } else if (ShuffleOrOp == Instruction::Trunc) {
      CastMaxMinBWSizes = std::make_pair(
          std::max<unsigned>(
          // ...
          std::min<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
          // ...
    TreeEntry *TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,
                                 ReuseShuffleIndices);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CastInst).\n";
    // ...
    TE->setOperands(Operands);
    // ...
      buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
    if (ShuffleOrOp == Instruction::Trunc) {
      ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
    } else if (ShuffleOrOp == Instruction::SIToFP ||
               ShuffleOrOp == Instruction::UIToFP) {
      unsigned NumSignBits =
      // ...
        APInt Mask = DB->getDemandedBits(OpI);
        NumSignBits = std::max(NumSignBits, Mask.countl_zero());
      // ...
      if (NumSignBits * 2 >=
          // ...
        ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
  case Instruction::ICmp:
  case Instruction::FCmp: {
    // ...
    TreeEntry *TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,
                                 ReuseShuffleIndices);
    // ...
           "Commutative Predicate mismatch");
    // ...
    Operands.back() = Ops.getVL(1);
    // ...
      if (Cmp->getPredicate() != P0)
    // ...
    TE->setOperands(Operands);
    buildTreeRec(Operands.front(), Depth + 1, {TE, 0});
    buildTreeRec(Operands.back(), Depth + 1, {TE, 1});
    if (ShuffleOrOp == Instruction::ICmp) {
      unsigned NumSignBits0 =
      // ...
      if (NumSignBits0 * 2 >=
          // ...
        ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
      unsigned NumSignBits1 =
      // ...
      if (NumSignBits1 * 2 >=
          // ...
        ExtraBitWidthNodes.insert(getOperandEntry(TE, 1)->Idx);
  // ...
  case Instruction::Select:
  case Instruction::FNeg:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::Freeze: {
    TreeEntry *TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,
                                 ReuseShuffleIndices);
    // ...
        dbgs() << "SLP: added a new TreeEntry "
                  "(SelectInst/UnaryOperator/BinaryOperator/FreezeInst).\n";
    // ...
    Operands[0] = Ops.getVL(0);
    Operands[1] = Ops.getVL(1);
    // ...
    TE->setOperands(Operands);
    // ...
      buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
  case Instruction::GetElementPtr: {
    TreeEntry *TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,
                                 ReuseShuffleIndices);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (GetElementPtrInst).\n";
    // ...
    TE->setOperands(Operands);
    // ...
      buildTreeRec(Operands[I], Depth + 1, {TE, I});
  // ...
  case Instruction::Store: {
    bool Consecutive = CurrentOrder.empty();
    // ...
    TreeEntry *TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,
                                 ReuseShuffleIndices, CurrentOrder);
    // ...
      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (StoreInst).\n";
      // ...
          dbgs() << "SLP: added a new TreeEntry (jumbled StoreInst).\n";
    // ...
    TE->setOperands(Operands);
    buildTreeRec(TE->getOperand(0), Depth + 1, {TE, 0});
  // ...
  case Instruction::Call: {
    // ...
    TreeEntry *TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,
                                 ReuseShuffleIndices);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CallInst).\n";
    // ...
    Operands[0] = Ops.getVL(0);
    Operands[1] = Ops.getVL(1);
    // ...
    TE->setOperands(Operands);
    // ...
      buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
  // ...
  case Instruction::ShuffleVector: {
    TreeEntry *TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,
                                 ReuseShuffleIndices);
    if (S.isAltShuffle()) {
      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (isAltShuffle).\n";
      // ...
          dbgs() << "SLP: added a new TreeEntry (ShuffleVectorInst).\n";
    // ...
           "Expected different main/alternate predicates.");
    // ...
      TE->setOperands(Operands);
      buildTreeRec(Operands.front(), Depth + 1, {TE, 0});
      buildTreeRec(Operands.back(), Depth + 1, {TE, 1});
    // ...
    Operands[0] = Ops.getVL(0);
    Operands[1] = Ops.getVL(1);
    // ...
    TE->setOperands(Operands);
    // ...
      buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
    for (const auto *Ty : ST->elements())
      if (Ty != *ST->element_begin())
      // ...
    N *= ST->getNumElements();
    EltTy = *ST->element_begin();
    // ...
    N *= AT->getNumElements();
    EltTy = AT->getElementType();
    // ...
    N *= VT->getNumElements();
    EltTy = VT->getElementType();
  // ...
  size_t VTSize = DL->getTypeStoreSizeInBits(getWidenedType(EltTy, N));
  if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
      VTSize != DL->getTypeStoreSizeInBits(T))
  // ...
                              bool ResizeAllowed) const {
  // ...
  assert(It != VL.end() && "Expected at least one extract instruction.");
  // ...
  Value *Vec = E0->getOperand(0);
  // ...
  CurrentOrder.clear();
  // ...
  if (E0->getOpcode() == Instruction::ExtractValue) {
  // ...
  unsigned E = VL.size();
  if (!ResizeAllowed && NElts != E)
  // ...
  unsigned MinIdx = NElts, MaxIdx = 0;
  // ...
    if (Inst->getOperand(0) != Vec)
    // ...
    const unsigned ExtIdx = *Idx;
    if (ExtIdx >= NElts)
    // ...
    Indices[I] = ExtIdx;
    if (MinIdx > ExtIdx)
    // ...
    if (MaxIdx < ExtIdx)
  // ...
  if (MaxIdx - MinIdx + 1 > E)
  // ...
  if (MaxIdx + 1 <= E)
  // ...
  bool ShouldKeepOrder = true;
  // ...
  for (unsigned I = 0; I < E; ++I) {
    // ...
    const unsigned ExtIdx = Indices[I] - MinIdx;
    if (CurrentOrder[ExtIdx] != E) {
      CurrentOrder.clear();
    // ...
    ShouldKeepOrder &= ExtIdx == I;
    CurrentOrder[ExtIdx] = I;
  // ...
  if (ShouldKeepOrder)
    CurrentOrder.clear();
  // ...
  return ShouldKeepOrder;
bool BoUpSLP::areAllUsersVectorized(
    Instruction *I, const SmallDenseSet<Value *> *VectorizedVals) const {
  return (I->hasOneUse() && (!VectorizedVals || VectorizedVals->contains(I))) ||
         all_of(I->users(), [this](User *U) {
           return isVectorized(U) || isVectorLikeInstWithConstOps(U) ||
                  (isa<ExtractElementInst>(U) && MustGather.contains(U));
// ...
void BoUpSLP::TreeEntry::buildAltOpShuffleMask(
    const function_ref<bool(Instruction *)> IsAltOp, SmallVectorImpl<int> &Mask,
    SmallVectorImpl<Value *> *OpScalars,
    SmallVectorImpl<Value *> *AltScalars) const {
  unsigned Sz = Scalars.size();
  // ...
  SmallVector<int> OrderMask;
  if (!ReorderIndices.empty())
  // ...
  for (unsigned I = 0; I < Sz; ++I) {
    // ...
    if (!ReorderIndices.empty())
      Idx = OrderMask[I];
    // ...
    if (IsAltOp(OpInst)) {
      Mask[I] = Sz + Idx;
  // ...
  if (!ReuseShuffleIndices.empty()) {
    // ...
    transform(ReuseShuffleIndices, NewMask.begin(), [&Mask](int Idx) {
      return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem;
    // ...
    Mask.swap(NewMask);
// ...
    return InstructionsState(MainOp, AltOp).getMatchingMainOpOrAltOp(I) ==
           MainOp;
  // ...
  assert(MainP != AltP && "Expected different main/alternate predicates.");
  // ...
  assert((MainP == P || AltP == P || MainP == SwappedP || AltP == SwappedP) &&
         "CmpInst expected to match either main or alternate predicate or "
         // ...
  return MainP != P && MainP != SwappedP;
  // ...
  return InstructionsState(MainOp, AltOp).getMatchingMainOpOrAltOp(I) == AltOp;
// ...
  const auto *Op0 = Ops.front();
  // ...
      return CI->getValue().isPowerOf2();
  // ...
      return CI->getValue().isNegatedPowerOf2();
  // ...
  if (IsConstant && IsUniform)
  // ...
  else if (IsConstant)
  // ...
  else if (IsUniform)
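/// Base class for the shuffle-mask analysis shared by the cost estimator and
/// the code generator: it computes the vectorization factor of a value,
/// detects identity masks, combines masks of chained shufflevectors, and
/// looks through shuffles to find the original source vectors.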
class BaseShuffleAnalysis {
  // ...
  Type *ScalarTy = nullptr;

  BaseShuffleAnalysis(Type *ScalarTy) : ScalarTy(ScalarTy) {}
  // ...
  unsigned getVF(Value *V) const {
    assert(V && "V cannot be nullptr");
    // ...
           "V does not have FixedVectorType");
    assert(ScalarTy && "ScalarTy cannot be nullptr");
    // ...
    unsigned VNumElements =
    // ...
    assert(VNumElements > ScalarTyNumElements &&
           "the number of elements of V is not large enough");
    assert(VNumElements % ScalarTyNumElements == 0 &&
           "the number of elements of V is not a vectorized value");
    return VNumElements / ScalarTyNumElements;
  // ...
  static bool isIdentityMask(ArrayRef<int> Mask, const FixedVectorType *VecTy,
  // ...
    int Limit = Mask.size();
    // ...
    if (Limit % VF == 0 && all_of(seq<int>(0, Limit / VF), [=](int Idx) {
          ArrayRef<int> Slice = Mask.slice(Idx * VF, VF);
  // ...
  static void combineMasks(unsigned LocalVF, SmallVectorImpl<int> &Mask,
                           ArrayRef<int> ExtMask) {
    unsigned VF = Mask.size();
    // ...
    for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
      // ...
      int MaskedIdx = Mask[ExtMask[I] % VF];
    // ...
    Mask.swap(NewMask);
  // ...
  static bool peekThroughShuffles(Value *&V, SmallVectorImpl<int> &Mask,
                                  bool SinglePermute) {
    // ...
    ShuffleVectorInst *IdentityOp = nullptr;
    SmallVector<int> IdentityMask;
    // ...
      if (isIdentityMask(Mask, SVTy, false)) {
        if (!IdentityOp || !SinglePermute ||
            (isIdentityMask(Mask, SVTy, true) &&
             // ...
                            IdentityMask.size()))) {
          // ...
          IdentityMask.assign(Mask);
      // ...
      if (SV->isZeroEltSplat()) {
        // ...
        IdentityMask.assign(Mask);
      // ...
      int LocalVF = Mask.size();
      // ...
        LocalVF = SVOpTy->getNumElements();
      // ...
            static_cast<unsigned>(I) >= SV->getShuffleMask().size())
        // ...
        ExtMask[Idx] = SV->getMaskValue(I);
      // ...
      if (!IsOp1Undef && !IsOp2Undef) {
        // ...
        for (int &I : Mask) {
          // ...
          if (SV->getMaskValue(I % SV->getShuffleMask().size()) ==
          // ...
        SmallVector<int> ShuffleMask(SV->getShuffleMask());
        combineMasks(LocalVF, ShuffleMask, Mask);
        Mask.swap(ShuffleMask);
      // ...
        Op = SV->getOperand(0);
      // ...
        Op = SV->getOperand(1);
      // ...
          !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute) ||
          // ...
               "Expected masks of same sizes.");
        // ...
        Mask.swap(IdentityMask);
      // ...
      return SinglePermute &&
             // ...
              (Shuffle && Mask.size() == Shuffle->getShuffleMask().size() &&
               Shuffle->isZeroEltSplat() &&
               // ...
                 Shuffle->getShuffleMask()[P.index()] == 0;
  template <typename T, typename ShuffleBuilderTy>
  static T createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask,
                         ShuffleBuilderTy &Builder, Type *ScalarTy) {
    assert(V1 && "Expected at least one vector value.");
    // ...
    SmallVector<int> NewMask(Mask);
    if (ScalarTyNumElements != 1) {
    // ...
      Builder.resizeToMatch(V1, V2);
      int VF = Mask.size();
      // ...
        VF = FTy->getNumElements();
      // ...
      for (int I = 0, E = Mask.size(); I < E; ++I) {
        // ...
          CombinedMask1[I] = Mask[I];
        // ...
          CombinedMask2[I] = Mask[I] - VF;
      // ...
        (void)peekThroughShuffles(Op1, CombinedMask1, false);
        (void)peekThroughShuffles(Op2, CombinedMask2, false);
        // ...
          for (auto [Idx, I] : enumerate(CombinedMask1)) {
            // ...
            ExtMask1[Idx] = SV1->getMaskValue(I);
          // ...
                  ->getNumElements(),
              ExtMask1, UseMask::SecondArg);
          SmallVector<int> ExtMask2(CombinedMask2.size(), PoisonMaskElem);
          for (auto [Idx, I] : enumerate(CombinedMask2)) {
            // ...
            ExtMask2[Idx] = SV2->getMaskValue(I);
          // ...
                  ->getNumElements(),
              ExtMask2, UseMask::SecondArg);
          if (SV1->getOperand(0)->getType() ==
                  SV2->getOperand(0)->getType() &&
              SV1->getOperand(0)->getType() != SV1->getType() &&
              // ...
            Op1 = SV1->getOperand(0);
            Op2 = SV2->getOperand(0);
            SmallVector<int> ShuffleMask1(SV1->getShuffleMask());
            int LocalVF = ShuffleMask1.size();
            // ...
              LocalVF = FTy->getNumElements();
            combineMasks(LocalVF, ShuffleMask1, CombinedMask1);
            CombinedMask1.swap(ShuffleMask1);
            SmallVector<int> ShuffleMask2(SV2->getShuffleMask());
            LocalVF = ShuffleMask2.size();
            // ...
              LocalVF = FTy->getNumElements();
            combineMasks(LocalVF, ShuffleMask2, CombinedMask2);
            CombinedMask2.swap(ShuffleMask2);
      // ...
      } while (PrevOp1 != Op1 || PrevOp2 != Op2);
      Builder.resizeToMatch(Op1, Op2);
      // ...
               ->getElementCount()
               .getKnownMinValue(),
           // ...
               ->getElementCount()
               .getKnownMinValue());
      for (int I = 0, E = Mask.size(); I < E; ++I) {
        // ...
               "Expected undefined mask element");
        CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF);
      // ...
        return Builder.createIdentity(Op1);
      return Builder.createShuffleVector(
    // ...
      return Builder.createPoison(
    // ...
    bool IsIdentity = peekThroughShuffles(V1, NewMask, true);
    assert(V1 && "Expected non-null value after looking through shuffles.");
    // ...
      return Builder.createShuffleVector(V1, NewMask);
    return Builder.createIdentity(V1);
  // ...
                             ArrayRef<int> Mask) {
static std::pair<InstructionCost, InstructionCost>
// ...
  if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
    // ...
    ScalarCost = TTI.getPointersChainCost(
        Ptrs, BasePtr, TTI::PointersChainInfo::getUnitStride(), ScalarTy,
    // ...
    for (Value *V : Ptrs) {
      if (V == BasePtr) {
      // ...
    if (PtrsRetainedInVecCode.size() == Ptrs.size()) {
      // ...
      VecCost = TTI.getPointersChainCost(PtrsRetainedInVecCode, BasePtr,
                                         TTI::PointersChainInfo::getKnownStride(),
    // ...
            [](const Value *V) {
              // ...
              return Ptr && !Ptr->hasAllConstantIndices();
            // ...
            ? TTI::PointersChainInfo::getUnknownStride()
            : TTI::PointersChainInfo::getKnownStride();
    // ...
        TTI.getPointersChainCost(Ptrs, BasePtr, PtrsInfo, ScalarTy, CostKind);
    // ...
    if (It != Ptrs.end())
    // ...
    VecCost = TTI.getGEPCost(BaseGEP->getSourceElementType(),
                             BaseGEP->getPointerOperand(), Indices, VecTy,
  // ...
  return std::make_pair(ScalarCost, VecCost);
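// Tries to reorder the scalars of a gather node so that values with the same
// "key" (e.g. loads from related pointers, constants) end up adjacent, which
// makes the resulting build-vector/shuffle sequence cheaper. The new order is
// kept only if its estimated cost beats the plain build-vector cost.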
void BoUpSLP::reorderGatherNode(TreeEntry &TE) {
  assert(TE.isGather() && TE.ReorderIndices.empty() &&
         "Expected gather node without reordering.");
  // ...
  SmallSet<size_t, 2> LoadKeyUsed;
  // ...
  if (TE.Scalars.size() == 2 || (TE.hasState() && !TE.isAltShuffle()) ||
      // ...
        return VectorizableTree[Idx]->isSame(TE.Scalars);
  // ...
  auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
    // ...
    auto LIt = LoadsMap.find(std::make_pair(Key, Ptr));
    if (LIt != LoadsMap.end()) {
      for (LoadInst *RLI : LIt->second) {
        // ...
            LI->getType(), LI->getPointerOperand(), *DL, *SE,
        // ...
      for (LoadInst *RLI : LIt->second) {
        // ...
                                  LI->getPointerOperand(), *TLI)) {
        // ...
      if (LIt->second.size() > 2) {
        // ...
            hash_value(LIt->second.back()->getPointerOperand());
    // ...
    LoadsMap.try_emplace(std::make_pair(Key, Ptr)).first->second.push_back(LI);
  // ...
  MapVector<size_t, MapVector<size_t, SmallVector<Value *>>> SortedValues;
  SmallDenseMap<Value *, SmallVector<unsigned>, 8> KeyToIndex;
  bool IsOrdered = true;
  unsigned NumInstructions = 0;
  // ...
    size_t Key = 1, Idx = 1;
    // ...
    auto &Container = SortedValues[Key];
    if (IsOrdered && !KeyToIndex.contains(V) &&
        // ...
        ((Container.contains(Idx) &&
          KeyToIndex.at(Container[Idx].back()).back() != I - 1) ||
         (!Container.empty() && !Container.contains(Idx) &&
          KeyToIndex.at(Container.back().second.back()).back() != I - 1)))
    // ...
    auto &KTI = KeyToIndex[V];
    // ...
    Container[Idx].push_back(V);
  // ...
  if (!IsOrdered && NumInstructions > 1) {
    // ...
    TE.ReorderIndices.resize(TE.Scalars.size(), TE.Scalars.size());
    for (const auto &D : SortedValues) {
      for (const auto &P : D.second) {
        // ...
          for (Value *V : P.second) {
            ArrayRef<unsigned> Indices = KeyToIndex.at(V);
            for (auto [K, Idx] : enumerate(Indices)) {
              TE.ReorderIndices[Cnt + K] = Idx;
              TE.Scalars[Cnt + K] = V;
            // ...
            Sz += Indices.size();
            Cnt += Indices.size();
          // ...
              *TTI, TE.Scalars.front()->getType(), Sz);
        // ...
        } else if (!P.second.empty() && isConstant(P.second.front())) {
  // ...
  if (!TE.ReuseShuffleIndices.empty() || TE.ReorderIndices.empty())
  // ...
  auto *ScalarTy = TE.Scalars.front()->getType();
  // ...
  for (auto [Idx, Sz] : SubVectors) {
  // ...
  int Sz = TE.Scalars.size();
  SmallVector<int> ReorderMask(TE.ReorderIndices.begin(),
                               TE.ReorderIndices.end());
  // ...
      ReorderMask[I] = I + TE.ReorderIndices.size();
  // ...
      any_of(ReorderMask, [&](int I) { return I >= Sz; })
  // ...
                             VecTy, ReorderMask);
  // ...
      DemandedElts.clearBit(I);
      // ...
      ReorderMask[I] = I;
    // ...
      ReorderMask[I] = I + Sz;
  // ...
  if (!DemandedElts.isAllOnes())
  // ...
  if (Cost >= BVCost) {
    SmallVector<int> Mask(TE.ReorderIndices.begin(), TE.ReorderIndices.end());
    // ...
    TE.ReorderIndices.clear();
                             const InstructionsState &S,
// ...
        return V->getType()->getScalarType()->isFloatingPointTy();
      // ...
      "Can only convert to FMA for floating point types");
  assert(S.isAddSubLikeOp() && "Can only convert to FMA for add/sub");
  // ...
    for (Value *V : VL) {
      // ...
      if (S.isCopyableElement(I))
      // ...
      Instruction *MatchingI = S.getMatchingMainOpOrAltOp(I);
      if (S.getMainOp() != MatchingI && S.getAltOp() != MatchingI)
      // ...
        FMF &= FPCI->getFastMathFlags();
  // ...
  if (!CheckForContractable(VL))
  // ...
  InstructionsCompatibilityAnalysis Analysis(DT, DL, TTI, TLI);
  // ...
  if (OpS.isAltShuffle() || OpS.getOpcode() != Instruction::FMul)
  // ...
  if (!CheckForContractable(Operands.front()))
  // ...
  for (Value *V : VL) {
    // ...
    if (!S.isCopyableElement(I))
      // ...
      FMF &= FPCI->getFastMathFlags();
    FMulPlusFAddCost += TTI.getInstructionCost(I, CostKind);
  // ...
  for (auto [V, Op] : zip(VL, Operands.front())) {
    if (S.isCopyableElement(V))
    // ...
    if (!I || !I->hasOneUse() || OpS.isCopyableElement(I)) {
      // ...
      FMACost += TTI.getInstructionCost(OpI, CostKind);
    // ...
      FMF &= FPCI->getFastMathFlags();
    FMulPlusFAddCost += TTI.getInstructionCost(I, CostKind);
bool BoUpSLP::matchesShlZExt(const TreeEntry &TE, OrdersType &Order,
                             bool &IsBSwap) const {
  assert(TE.hasState() && TE.getOpcode() == Instruction::Shl &&
         "Expected Shl node.");
  // ...
  if (TE.State != TreeEntry::Vectorize || !TE.ReorderIndices.empty() ||
      !TE.ReuseShuffleIndices.empty() || MinBWs.contains(&TE) ||
      any_of(TE.Scalars, [](Value *V) { return !V->hasOneUse(); }))
  // ...
  Type *ScalarTy = TE.getMainOp()->getType();
  // ...
  const unsigned Sz = DL->getTypeSizeInBits(ScalarTy);
  // ...
  const TreeEntry *LhsTE = getOperandEntry(&TE, 0);
  const TreeEntry *RhsTE = getOperandEntry(&TE, 1);
  // ...
  if (!(LhsTE->State == TreeEntry::Vectorize &&
        LhsTE->getOpcode() == Instruction::ZExt &&
        LhsTE->ReorderIndices.empty() && LhsTE->ReuseShuffleIndices.empty() &&
        !MinBWs.contains(LhsTE) &&
        all_of(LhsTE->Scalars, [](Value *V) { return V->hasOneUse(); })))
  // ...
  unsigned Stride = DL->getTypeSizeInBits(SrcScalarTy);
  // ...
  if (!(RhsTE->isGather() && RhsTE->ReorderIndices.empty() &&
        RhsTE->ReuseShuffleIndices.empty() && !MinBWs.contains(RhsTE)))
  // ...
  unsigned CurrentValue = 0;
  // ...
  if (all_of(RhsTE->Scalars,
             // ...
               CurrentValue += Stride;
               if (isa<UndefValue>(V))
               // ...
               auto *C = dyn_cast<Constant>(V);
               // ...
               return C->getUniqueInteger() == CurrentValue - Stride;
      // ...
      CurrentValue == Sz) {
    // ...
    const unsigned VF = RhsTE->getVectorFactor();
    Order.assign(VF, VF);
    // ...
    if (VF * Stride != Sz)
    // ...
    for (const auto [Idx, V] : enumerate(RhsTE->Scalars)) {
      // ...
      const APInt &Val = C->getUniqueInteger();
      // ...
      if (Order[Idx] != VF || Pos >= VF)
  // ...
  SmallPtrSet<Value *, 4> CheckedExtracts;
  // ...
  auto *SrcVecTy = getWidenedType(SrcScalarTy, LhsTE->getVectorFactor());
  // ...
      getCastContextHint(*getOperandEntry(LhsTE, 0));
  // ...
      TTI->getArithmeticReductionCost(Instruction::Or, VecTy, FMF, CostKind) +
      TTI->getArithmeticInstrCost(Instruction::Shl, VecTy, CostKind,
                                  getOperandInfo(LhsTE->Scalars)) +
      TTI->getCastInstrCost(
          Instruction::ZExt, VecTy,
  // ...
      Instruction::BitCast, ScalarTy, SrcVecTy, CastCtx, CostKind);
  if (!Order.empty()) {
    // ...
    SmallVector<int> Mask;
  // ...
  constexpr unsigned ByteSize = 8;
  // ...
      DL->getTypeSizeInBits(SrcScalarTy) == ByteSize) {
    IntrinsicCostAttributes CostAttrs(Intrinsic::bswap, ScalarTy, {ScalarTy});
    // ...
        TTI->getCastInstrCost(Instruction::BitCast, ScalarTy, SrcVecTy, CastCtx,
    // ...
        TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
    if (BSwapCost <= BitcastCost) {
      BitcastCost = BSwapCost;
  // ...
  return BitcastCost < VecCost;
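// Post-build graph transformations: reorders gather nodes, splits wide gather
// nodes into smaller vectorizable slices, converts suitable loads/stores to
// strided or interleaved accesses, and recognizes combined operations such as
// min/max selects, fmuladd, and the shl+zext reduced-bitcast/bswap pattern.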
  BaseGraphSize = VectorizableTree.size();
  // ...
  class GraphTransformModeRAAI {
    bool &SavedIsGraphTransformMode;
    // ...
    GraphTransformModeRAAI(bool &IsGraphTransformMode)
        : SavedIsGraphTransformMode(IsGraphTransformMode) {
      IsGraphTransformMode = true;
    // ...
    ~GraphTransformModeRAAI() { SavedIsGraphTransformMode = false; }
  } TransformContext(IsGraphTransformMode);
  // ...
                                      const InstructionsState &S) {
    // ...
                              I2->getOperand(Op));
    // ...
            Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
              // ...
                     [](const std::pair<Value *, Value *> &P) {
  // ...
    TreeEntry &E = *VectorizableTree[Idx];
    // ...
      reorderGatherNode(E);
  // ...
  constexpr unsigned VFLimit = 16;
  bool ForceLoadGather =
      count_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
        return TE->isGather() && TE->hasState() &&
               TE->getOpcode() == Instruction::Load &&
               TE->getVectorFactor() < VFLimit;
  // ...
    return TE->isSame(VL) || all_of(VL, [&](Value *V) {
  // ...
  auto CheckForSameVectorNodes = [&](const TreeEntry &E) {
    if (E.hasState()) {
      // ...
          !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
            return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
              ArrayRef<TreeEntry *> VTEs = getTreeEntries(V);
              return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
                return is_contained(TEs, TE);
      // ...
          !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
            return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
              ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V);
              return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
                return is_contained(TEs, TE);
      // ...
      if (It != E.Scalars.end()) {
        // ...
            !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
              return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
                ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V);
                return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
                  return is_contained(TEs, TE);
  for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
    TreeEntry &E = *VectorizableTree[Idx];
    if (E.isGather()) {
      // ...
      unsigned MinVF = getMinVF(2 * Sz);
      // ...
      if (VL.size() <= 2 || LoadEntriesToVectorize.contains(Idx) ||
          !(!E.hasState() || E.getOpcode() == Instruction::Load ||
      // ...
      if (ForceLoadGather && E.hasState() && E.getOpcode() == Instruction::Load)
      // ...
      if (CheckForSameVectorNodes(E))
      // ...
      unsigned StartIdx = 0;
      unsigned End = VL.size();
      SmallBitVector Processed(End);
      // ...
          *TTI, VL.front()->getType(), VL.size() - 1);
      // ...
               *TTI, VL.front()->getType(), VF - 1)) {
        if (StartIdx + VF > End)
        // ...
        bool AllStrided = true;
        for (unsigned Cnt = StartIdx; Cnt + VF <= End; Cnt += VF) {
          // ...
              !getSameValuesTreeEntry(Slice.front(), Slice, true))
          // ...
          bool IsSplat = isSplat(Slice);
          bool IsTwoRegisterSplat = true;
          if (IsSplat && VF == 2) {
            // ...
            IsTwoRegisterSplat = NumRegs2VF == 2;
          // ...
          if (Slices.empty() || !IsSplat || !IsTwoRegisterSplat ||
              // ...
              (S.getOpcode() == Instruction::Load &&
              // ...
              (S.getOpcode() != Instruction::Load &&
          // ...
            if ((!UserIgnoreList || E.Idx != 0) &&
                TTI->getInstructionCost(S.getMainOp(), CostKind) <
            // ...
            if (S.getOpcode() == Instruction::Load) {
              // ...
              StridedPtrInfo SPtrInfo;
              // ...
                                PointerOps, SPtrInfo);
              // ...
              if (UserIgnoreList && E.Idx == 0)
              // ...
            } else if (S.getOpcode() == Instruction::ExtractElement ||
                       (TTI->getInstructionCost(S.getMainOp(), CostKind) <
                       // ...
                        !CheckOperandsProfitability(
        // ...
        if (VF == 2 && AllStrided && Slices.size() > 2)
        // ...
        auto AddCombinedNode = [&](unsigned Idx, unsigned Cnt, unsigned Sz) {
          E.CombinedEntriesWithIndices.emplace_back(Idx, Cnt);
          Processed.set(Cnt, Cnt + Sz);
          if (StartIdx == Cnt)
            StartIdx = Cnt + Sz;
          if (End == Cnt + Sz)
        // ...
        for (auto [Cnt, Sz] : Slices) {
          // ...
          const TreeEntry *SameTE = nullptr;
          // ...
              It != Slice.end()) {
            // ...
            SameTE = getSameValuesTreeEntry(*It, Slice);
          // ...
          unsigned PrevSize = VectorizableTree.size();
          [[maybe_unused]] unsigned PrevEntriesSize =
              LoadEntriesToVectorize.size();
          buildTreeRec(Slice, 0, EdgeInfo(&E, UINT_MAX));
          if (PrevSize + 1 == VectorizableTree.size() && !SameTE &&
              VectorizableTree[PrevSize]->isGather() &&
              VectorizableTree[PrevSize]->hasState() &&
              VectorizableTree[PrevSize]->getOpcode() !=
                  Instruction::ExtractElement &&
              // ...
            if (UserIgnoreList && E.Idx == 0 && VF == 2)
            // ...
            VectorizableTree.pop_back();
            assert(PrevEntriesSize == LoadEntriesToVectorize.size() &&
                   "LoadEntriesToVectorize expected to remain the same");
          // ...
          AddCombinedNode(PrevSize, Cnt, Sz);
    // ...
    if (E.CombinedEntriesWithIndices.empty() && !E.ReorderIndices.empty()) {
      SmallVector<int> Mask(E.ReorderIndices.begin(), E.ReorderIndices.end());
      // ...
      E.ReorderIndices.clear();
    switch (E.getOpcode()) {
    case Instruction::Load: {
      // ...
      if (E.State != TreeEntry::Vectorize)
      // ...
      Type *ScalarTy = E.getMainOp()->getType();
      // ...
          TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
        SmallVector<int> Mask;
        // ...
            TTI->getMemoryOpCost(Instruction::Load, VecTy, BaseLI->getAlign(),
                                 BaseLI->getPointerAddressSpace(), CostKind,
        // ...
            MemIntrinsicCostAttributes(Intrinsic::experimental_vp_strided_load,
                                       VecTy, BaseLI->getPointerOperand(),
                                       false, CommonAlignment,
        // ...
                  ->getPointerOperand()
          // ...
          StridedPtrInfo SPtrInfo;
          SPtrInfo.StrideVal = ConstantInt::get(StrideTy, 1);
          SPtrInfo.Ty = VecTy;
          TreeEntryToStridedPtrInfoMap[&E] = SPtrInfo;
          E.State = TreeEntry::StridedVectorize;
    // ...
    case Instruction::Store: {
      // ...
          TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
        SmallVector<int> Mask;
        // ...
            TTI->getMemoryOpCost(Instruction::Store, VecTy, BaseSI->getAlign(),
                                 BaseSI->getPointerAddressSpace(), CostKind,
        // ...
            MemIntrinsicCostAttributes(Intrinsic::experimental_vp_strided_store,
                                       VecTy, BaseSI->getPointerOperand(),
                                       false, CommonAlignment,
        // ...
        if (StridedCost < OriginalVecCost)
          // ...
          E.State = TreeEntry::StridedVectorize;
      } else if (!E.ReorderIndices.empty()) {
        // ...
        auto IsInterleaveMask = [&, &TTI = *TTI](ArrayRef<int> Mask) {
          // ...
          assert(Mask.size() > 1 && "Expected mask greater than 1 element.");
          if (Mask.size() < 4)
          // ...
                  Mask, Factor, VecTy->getElementCount().getFixedValue()) &&
              TTI.isLegalInterleavedAccessType(
                  VecTy, Factor, BaseSI->getAlign(),
                  BaseSI->getPointerAddressSpace()))
        // ...
        SmallVector<int> Mask(E.ReorderIndices.begin(), E.ReorderIndices.end());
        unsigned InterleaveFactor = IsInterleaveMask(Mask);
        if (InterleaveFactor != 0)
          E.setInterleave(InterleaveFactor);
    // ...
    case Instruction::Select: {
      if (E.State != TreeEntry::Vectorize)
      // ...
      E.CombinedOp = TreeEntry::MinMax;
      TreeEntry *CondEntry = getOperandEntry(&E, 0);
      if (SelectOnly && CondEntry->UserTreeIndex &&
          CondEntry->State == TreeEntry::Vectorize) {
        // ...
        CondEntry->State = TreeEntry::CombinedVectorize;
    // ...
    case Instruction::FSub:
    case Instruction::FAdd: {
      // ...
      if (E.State != TreeEntry::Vectorize ||
          !E.getOperations().isAddSubLikeOp())
      // ...
      E.CombinedOp = TreeEntry::FMulAdd;
      TreeEntry *FMulEntry = getOperandEntry(&E, 0);
      if (FMulEntry->UserTreeIndex &&
          FMulEntry->State == TreeEntry::Vectorize) {
        // ...
        FMulEntry->State = TreeEntry::CombinedVectorize;
    // ...
    case Instruction::Shl: {
      // ...
      if (!UserIgnoreList)
      // ...
      if (!matchesShlZExt(E, Order, IsBSwap))
      // ...
      TreeEntry::CombinedOpcode Code =
          IsBSwap ? TreeEntry::ReducedBitcastBSwap : TreeEntry::ReducedBitcast;
      E.CombinedOp = Code;
      // ...
      E.ReorderIndices = std::move(Order);
      TreeEntry *ZExtEntry = getOperandEntry(&E, 0);
      assert(ZExtEntry->UserTreeIndex &&
             ZExtEntry->State == TreeEntry::Vectorize &&
             ZExtEntry->getOpcode() == Instruction::ZExt &&
             "Expected ZExt node.");
      // ...
      ZExtEntry->State = TreeEntry::CombinedVectorize;
      ZExtEntry->CombinedOp = Code;
      TreeEntry *ConstEntry = getOperandEntry(&E, 1);
      assert(ConstEntry->UserTreeIndex && ConstEntry->isGather() &&
             "Expected gather node.");
      // ...
      ConstEntry->State = TreeEntry::CombinedVectorize;
      ConstEntry->CombinedOp = Code;
  if (LoadEntriesToVectorize.empty()) {
    // ...
    if (VectorizableTree.size() <= 1 && VectorizableTree.front()->hasState() &&
        VectorizableTree.front()->getOpcode() == Instruction::Load)
    // ...
    constexpr unsigned SmallTree = 3;
    constexpr unsigned SmallVF = 2;
    if ((VectorizableTree.size() <= SmallTree &&
         VectorizableTree.front()->Scalars.size() == SmallVF) ||
        (VectorizableTree.size() <= 2 && UserIgnoreList))
    // ...
    if (VectorizableTree.front()->isNonPowOf2Vec() &&
        // ...
               [](const std::unique_ptr<TreeEntry> &TE) {
                 return TE->isGather() && TE->hasState() &&
                        TE->getOpcode() == Instruction::Load &&
  // ...
  SmallMapVector<std::tuple<BasicBlock *, Value *, Type *>,
  // ...
  for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    TreeEntry &E = *TE;
    if (E.isGather() &&
        ((E.hasState() && E.getOpcode() == Instruction::Load) ||
         (!E.hasState() && any_of(E.Scalars,
                                  // ...
                                    return isa<LoadInst>(V) &&
                                           !isVectorized(V) &&
                                           !isDeleted(cast<Instruction>(V));
      // ...
      for (Value *V : E.Scalars) {
        // ...
            *this, V, *DL, *SE, *TTI,
            GatheredLoads[std::make_tuple(
  // ...
  if (!GatheredLoads.empty())
    tryToVectorizeGatheredLoads(GatheredLoads);
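// Cost-model helper that mirrors the shuffle builder used during codegen: it
// accumulates the cost of the gathers, extracts, and shuffles needed to
// materialize a node, splitting masks per vector register and reusing
// previously estimated nodes where possible.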
  bool IsFinalized = false;
  // ...
  bool SameNodesEstimated = true;
  // ...
    if (Ty->getScalarType()->isPointerTy()) {
      // ...
                               DL.getTypeStoreSizeInBits(Ty->getScalarType()))),
          Ty->getScalarType());
  // ...
    assert(It != VL.end() && "Expected at least one non-undef value.");
    // ...
        count(VL, *It) > 1 &&
    // ...
    if (!NeedShuffle) {
      // ...
      return TTI.getShuffleCost(
      // ...
      return TTI.getVectorInstrCost(Instruction::InsertElement, VecTy,
                                    CostKind, std::distance(VL.begin(), It),
    // ...
      return isa<PoisonValue>(V) ? PoisonMaskElem : 0;
    // ...
        TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, 0,
        // ...
                        VecTy, ShuffleMask, CostKind,
    // ...
    return GatherCost +
           // ...
               : R.getGatherCost(Gathers, !Root && VL.equals(Gathers),
  // ...
                    ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
                    unsigned NumParts) {
    assert(VL.size() > NumParts && "Unexpected scalarized shuffle.");
    // ...
        std::accumulate(VL.begin(), VL.end(), 0, [](unsigned Sz, Value *V) {
          auto *EE = dyn_cast<ExtractElementInst>(V);
          // ...
          auto *VecTy = dyn_cast<FixedVectorType>(EE->getVectorOperandType());
          // ...
          return std::max(Sz, VecTy->getNumElements());
    // ...
        -> std::optional<TTI::ShuffleKind> {
      if (NumElts <= EltsPerVector)
        return std::nullopt;
      // ...
          alignDown(std::accumulate(Mask.begin(), Mask.end(), INT_MAX,
                                    // ...
                                      if (I == PoisonMaskElem)
                                      // ...
                                      return std::min(S, I);
      // ...
      int OffsetReg1 = OffsetReg0;
      // ...
      int FirstRegId = -1;
      Indices.assign(1, OffsetReg0);
      // ...
        int Idx = I - OffsetReg0;
        // ...
            (Idx / NumElts) * NumParts + (Idx % NumElts) / EltsPerVector;
        if (FirstRegId < 0)
          FirstRegId = RegId;
        RegIndices.insert(RegId);
        if (RegIndices.size() > 2)
          return std::nullopt;
        if (RegIndices.size() == 2) {
          // ...
          if (Indices.size() == 1) {
            // ...
                std::next(Mask.begin(), Pos), Mask.end(), INT_MAX,
                [&](int S, int I) {
                  if (I == PoisonMaskElem)
                  // ...
                  int RegId = ((I - OffsetReg0) / NumElts) * NumParts +
                              ((I - OffsetReg0) % NumElts) / EltsPerVector;
                  if (RegId == FirstRegId)
                  // ...
                  return std::min(S, I);
          // ...
          unsigned Index = OffsetReg1 % NumElts;
          Indices.push_back(Index);
          SubVecSizes.push_back(std::min(NumElts - Index, EltsPerVector));
        // ...
        Idx = I - OffsetReg1;
        // ...
        I = (Idx % NumElts) % EltsPerVector +
            (RegId == FirstRegId ? 0 : EltsPerVector);
      // ...
      return ShuffleKind;
    // ...
      if (!ShuffleKinds[Part])
      // ...
          Part * EltsPerVector, getNumElems(Mask.size(), EltsPerVector, Part));
      // ...
      std::optional<TTI::ShuffleKind> RegShuffleKind =
          CheckPerRegistersShuffle(SubMask, Indices, SubVecSizes);
      if (!RegShuffleKind) {
        // ...
                MaskSlice, std::max<unsigned>(NumElts, MaskSlice.size())))
      // ...
          *R.TTI, VL.front()->getType(), alignTo(NumElts, EltsPerVector));
      for (const auto [Idx, SubVecSize] : zip(Indices, SubVecSizes)) {
        assert((Idx + SubVecSize) <= BaseVF &&
               "SK_ExtractSubvector index out of range");
      // ...
          TTI, *ShuffleKinds[Part], getWidenedType(ScalarTy, NumElts), SubMask);
      if (OriginalCost < Cost)
        Cost = OriginalCost;
  void estimateNodesPermuteCost(const TreeEntry &E1, const TreeEntry *E2,
                                // ...
                                unsigned SliceSize) {
    if (SameNodesEstimated) {
      // ...
      if ((InVectors.size() == 2 &&
      // ...
        unsigned Limit = getNumElems(Mask.size(), SliceSize, Part);
        // ...
               "Expected all poisoned elements.");
        // ...
        copy(SubMask, std::next(CommonMask.begin(), SliceSize * Part));
      // ...
      Cost += createShuffle(InVectors.front(),
                            InVectors.size() == 1 ? nullptr : InVectors.back(),
      // ...
      transformMaskAfterShuffle(CommonMask, CommonMask);
    } else if (InVectors.size() == 2) {
      Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    // ...
    SameNodesEstimated = false;
    if (!E2 && InVectors.size() == 1) {
      unsigned VF = E1.getVectorFactor();
      // ...
        VF = std::max(VF, getVF(V1));
      // ...
        VF = std::max(VF, E->getVectorFactor());
      // ...
      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
        // ...
          CommonMask[Idx] = Mask[Idx] + VF;
      Cost += createShuffle(InVectors.front(), &E1, CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    // ...
      auto P = InVectors.front();
      Cost += createShuffle(&E1, E2, Mask);
      unsigned VF = Mask.size();
      // ...
        VF = std::max(VF, E->getVectorFactor());
      // ...
      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
        // ...
          CommonMask[Idx] = Idx + (InVectors.empty() ? 0 : VF);
      Cost += createShuffle(P, InVectors.front(), CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
  // ...
  class ShuffleCostBuilder {
    // ...
    static bool isEmptyOrIdentity(ArrayRef<int> Mask, unsigned VF) {
      // ...
      return Mask.empty() ||
             (VF == Mask.size() &&
    // ...
    ~ShuffleCostBuilder() = default;
    // ...
      if (isEmptyOrIdentity(Mask, VF))
    // ...
      if (isEmptyOrIdentity(Mask, VF))
    // ...
    void resizeToMatch(Value *&, Value *&) const {}
    ShuffleCostBuilder Builder(TTI);
    // ...
    unsigned CommonVF = Mask.size();
    // ...
    auto GetNodeMinBWAffectedCost = [&](const TreeEntry &E,
    // ...
      Type *EScalarTy = E.Scalars.front()->getType();
      bool IsSigned = true;
      if (auto It = R.MinBWs.find(&E); It != R.MinBWs.end()) {
        // ...
        IsSigned = It->second.second;
      // ...
      if (EScalarTy != ScalarTy) {
        unsigned CastOpcode = Instruction::Trunc;
        unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
        unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
        // ...
          CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
        return TTI.getCastInstrCost(CastOpcode, getWidenedType(ScalarTy, VF),
    // ...
      Type *EScalarTy = VecTy->getElementType();
      if (EScalarTy != ScalarTy) {
        // ...
        unsigned CastOpcode = Instruction::Trunc;
        unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
        unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
        // ...
          CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
        return TTI.getCastInstrCost(
    // ...
    if (!V1 && !V2 && !P2.isNull()) {
      // ...
      unsigned VF = E->getVectorFactor();
      // ...
      CommonVF = std::max(VF, E2->getVectorFactor());
      // ...
               return Idx < 2 * static_cast<int>(CommonVF);
             // ...
             "All elements in mask must be less than 2 * CommonVF.");
      if (E->Scalars.size() == E2->Scalars.size()) {
        // ...
        for (int &Idx : CommonMask) {
          // ...
          if (Idx < static_cast<int>(CommonVF) && !EMask.empty())
          // ...
          else if (Idx >= static_cast<int>(CommonVF))
            Idx = (E2Mask.empty() ? Idx - CommonVF : E2Mask[Idx - CommonVF]) +
        // ...
        CommonVF = E->Scalars.size();
        ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF) +
                     GetNodeMinBWAffectedCost(*E2, CommonVF);
      // ...
        ExtraCost += GetNodeMinBWAffectedCost(*E, E->getVectorFactor()) +
                     GetNodeMinBWAffectedCost(*E2, E2->getVectorFactor());
      // ...
      V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
    } else if (!V1 && P2.isNull()) {
      // ...
      unsigned VF = E->getVectorFactor();
      // ...
                    [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
             "All elements in mask must be less than CommonVF.");
      if (E->Scalars.size() == Mask.size() && VF != Mask.size()) {
        // ...
        assert(!EMask.empty() && "Expected non-empty common mask.");
        for (int &Idx : CommonMask) {
        // ...
        CommonVF = E->Scalars.size();
      } else if (unsigned Factor = E->getInterleaveFactor();
                 Factor > 0 && E->Scalars.size() != Mask.size() &&
                 // ...
        std::iota(CommonMask.begin(), CommonMask.end(), 0);
      // ...
      ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF);
      // ...
      if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() &&
          CommonVF == CommonMask.size() &&
          // ...
                 [](const auto &&P) {
                   // ...
                          static_cast<unsigned>(P.value()) != P.index();
      // ...
    } else if (V1 && P2.isNull()) {
      // ...
      ExtraCost += GetValueMinBWAffectedCost(V1);
      CommonVF = getVF(V1);
      // ...
                    [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
             "All elements in mask must be less than CommonVF.");
    } else if (V1 && !V2) {
      // ...
      unsigned VF = getVF(V1);
      // ...
      CommonVF = std::max(VF, E2->getVectorFactor());
      // ...
               return Idx < 2 * static_cast<int>(CommonVF);
             // ...
             "All elements in mask must be less than 2 * CommonVF.");
      if (E2->Scalars.size() == VF && VF != CommonVF) {
        // ...
        assert(!E2Mask.empty() && "Expected non-empty common mask.");
        for (int &Idx : CommonMask) {
          // ...
          if (Idx >= static_cast<int>(CommonVF))
            Idx = E2Mask[Idx - CommonVF] + VF;
        // ...
        ExtraCost += GetValueMinBWAffectedCost(V1);
      // ...
      ExtraCost += GetNodeMinBWAffectedCost(
          *E2, std::min(CommonVF, E2->getVectorFactor()));
      V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
    } else if (!V1 && V2) {
      // ...
      unsigned VF = getVF(V2);
      // ...
      CommonVF = std::max(VF, E1->getVectorFactor());
      // ...
               return Idx < 2 * static_cast<int>(CommonVF);
             // ...
             "All elements in mask must be less than 2 * CommonVF.");
      if (E1->Scalars.size() == VF && VF != CommonVF) {
        // ...
        assert(!E1Mask.empty() && "Expected non-empty common mask.");
        for (int &Idx : CommonMask) {
          // ...
          if (Idx >= static_cast<int>(CommonVF))
            Idx = E1Mask[Idx - CommonVF] + VF;
        // ...
      ExtraCost += GetNodeMinBWAffectedCost(
          *E1, std::min(CommonVF, E1->getVectorFactor()));
      // ...
      ExtraCost += GetValueMinBWAffectedCost(V2);
      V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
    // ...
      assert(V1 && V2 && "Expected both vectors.");
      unsigned VF = getVF(V1);
      CommonVF = std::max(VF, getVF(V2));
      // ...
               return Idx < 2 * static_cast<int>(CommonVF);
             // ...
             "All elements in mask must be less than 2 * CommonVF.");
      // ...
          GetValueMinBWAffectedCost(V1) + GetValueMinBWAffectedCost(V2);
      // ...
        V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
    // ...
        V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
    // ...
    InVectors.front() =
    // ...
    if (InVectors.size() == 2)
      InVectors.pop_back();
    return ExtraCost + BaseShuffleAnalysis::createShuffle<InstructionCost>(
                           V1, V2, CommonMask, Builder, ScalarTy);
      : BaseShuffleAnalysis(ScalarTy), TTI(TTI),
        VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()), R(R),
        CheckedExtracts(CheckedExtracts) {}
  // ...
                        ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
                        unsigned NumParts, bool &UseVecBaseAsInput) {
    UseVecBaseAsInput = false;
    // ...
    Value *VecBase = nullptr;
    // ...
    if (!E->ReorderIndices.empty()) {
      // ...
                                   E->ReorderIndices.end());
    // ...
    bool PrevNodeFound = any_of(
        ArrayRef(R.VectorizableTree).take_front(E->Idx),
        [&](const std::unique_ptr<TreeEntry> &TE) {
          return ((TE->hasState() && !TE->isAltShuffle() &&
                   TE->getOpcode() == Instruction::ExtractElement) ||
                  // ...
                 all_of(enumerate(TE->Scalars), [&](auto &&Data) {
                   return VL.size() > Data.index() &&
                          (Mask[Data.index()] == PoisonMaskElem ||
                           isa<UndefValue>(VL[Data.index()]) ||
                           Data.value() == VL[Data.index()]);
    // ...
      ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
      // ...
        VecBase = EE->getVectorOperand();
        UniqueBases.insert(VecBase);
        // ...
        if (!CheckedExtracts.insert(V).second ||
            // ...
                   [&](const TreeEntry *TE) {
                     return R.DeletedNodes.contains(TE) ||
                            R.TransformedToGatherNodes.contains(TE);
            // ...
            (E->UserTreeIndex && E->UserTreeIndex.EdgeIdx == UINT_MAX &&
             !R.isVectorized(EE) &&
             // ...
                 count_if(E->UserTreeIndex.UserTE->Scalars,
                          [&](Value *V) { return V == EE; })) ||
            // ...
              return isa<GetElementPtrInst>(U) &&
                     !R.areAllUsersVectorized(cast<Instruction>(U),
        // ...
        unsigned Idx = *EEIdx;
        // ...
        if (EE->hasOneUse() || !PrevNodeFound) {
          // ...
            Cost -= TTI.getExtractWithExtendCost(
            // ...
            Cost += TTI.getCastInstrCost(
        // ...
        APInt &DemandedElts =
            VectorOpsToExtracts
                // ...
                .first->getSecond();
        DemandedElts.setBit(Idx);
    // ...
    for (const auto &[Vec, DemandedElts] : VectorOpsToExtracts)
      // ...
                                        DemandedElts, false,
    // ...
    if (!PrevNodeFound)
      Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts);
    // ...
    transformMaskAfterShuffle(CommonMask, CommonMask);
    SameNodesEstimated = false;
    if (NumParts != 1 && UniqueBases.size() != 1) {
      UseVecBaseAsInput = true;
  // ...
  std::optional<InstructionCost>
  // ...
      return std::nullopt;
  // ...
    IsFinalized = false;
    CommonMask.clear();
    // ...
    VectorizedVals.clear();
    SameNodesEstimated = true;
  // ...
             return Idx < static_cast<int>(E1.getVectorFactor());
           // ...
           "Expected single vector shuffle mask.");
  // ...
    if (InVectors.empty()) {
      CommonMask.assign(Mask.begin(), Mask.end());
      InVectors.assign({&E1, &E2});
    // ...
    assert(!CommonMask.empty() && "Expected non-empty common mask.");
    // ...
    unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
    estimateNodesPermuteCost(E1, &E2, Mask, Part, SliceSize);
  // ...
    if (InVectors.empty()) {
      CommonMask.assign(Mask.begin(), Mask.end());
      InVectors.assign(1, &E1);
    // ...
    assert(!CommonMask.empty() && "Expected non-empty common mask.");
    // ...
    unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
    estimateNodesPermuteCost(E1, nullptr, Mask, Part, SliceSize);
    if (!SameNodesEstimated && InVectors.size() == 1)
      InVectors.emplace_back(&E1);
  // ...
    assert(InVectors.size() == 1 &&
           // ...
                       ->getOrdered(P.index()));
             return EI->getVectorOperand() == V1 ||
                    EI->getVectorOperand() == V2;
           // ...
           "Expected extractelement vectors.");
    if (InVectors.empty()) {
      assert(CommonMask.empty() && !ForExtracts &&
             "Expected empty input mask/vectors.");
      CommonMask.assign(Mask.begin(), Mask.end());
      InVectors.assign(1, V1);
14729 !CommonMask.empty() &&
14733 ->getOrdered(
P.index());
14735 return P.value() == Mask[
P.index()] ||
14740 return EI->getVectorOperand() == V1;
14742 "Expected only tree entry for extractelement vectors.");
    assert(!InVectors.empty() && !CommonMask.empty() &&
           "Expected only tree entries from extracts/reused buildvectors.");
    unsigned VF = getVF(V1);
    if (InVectors.size() == 2) {
      Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
      VF = std::max<unsigned>(VF, CommonMask.size());
    } else if (const auto *InTE =
                   InVectors.front().dyn_cast<const TreeEntry *>()) {
      VF = std::max(VF, InTE->getVectorFactor());
14758 ->getNumElements());
    InVectors.push_back(V1);
    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
      if (Mask[Idx] != PoisonMaskElem)
        CommonMask[Idx] = Mask[Idx] + VF;
14766 Value *Root =
nullptr) {
14767 Cost += getBuildVectorCost(VL, Root);
14771 unsigned VF = VL.
size();
14773 VF = std::min(VF, MaskVF);
14774 Type *VLScalarTy = VL.
front()->getType();
14798 getAllOnesValue(*R.DL, ScalarTy->getScalarType()));
14804 ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
14809 IsFinalized =
true;
14812 if (InVectors.
size() == 2)
14813 Cost += createShuffle(Vec, InVectors.
back(), CommonMask);
14815 Cost += createShuffle(Vec,
nullptr, CommonMask);
14816 transformMaskAfterShuffle(CommonMask, CommonMask);
14818 "Expected vector length for the final value before action.");
14821 Cost += createShuffle(V1, V2, Mask);
14824 InVectors.
front() = V;
14826 if (!SubVectors.empty()) {
14828 if (InVectors.
size() == 2)
14829 Cost += createShuffle(Vec, InVectors.
back(), CommonMask);
14831 Cost += createShuffle(Vec,
nullptr, CommonMask);
14832 transformMaskAfterShuffle(CommonMask, CommonMask);
14834 if (!SubVectorsMask.
empty()) {
14836 "Expected same size of masks for subvectors and common mask.");
14838 copy(SubVectorsMask, SVMask.begin());
14839 for (
auto [I1, I2] :
zip(SVMask, CommonMask)) {
14842 I1 = I2 + CommonMask.
size();
14849 for (
auto [
E, Idx] : SubVectors) {
14850 Type *EScalarTy =
E->Scalars.front()->getType();
14851 bool IsSigned =
true;
14852 if (
auto It =
R.MinBWs.find(
E); It !=
R.MinBWs.end()) {
14855 IsSigned = It->second.second;
14857 if (ScalarTy != EScalarTy) {
14858 unsigned CastOpcode = Instruction::Trunc;
14859 unsigned DstSz =
R.DL->getTypeSizeInBits(ScalarTy);
14860 unsigned SrcSz =
R.DL->getTypeSizeInBits(EScalarTy);
14862 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
14863 Cost += TTI.getCastInstrCost(
14872 if (!CommonMask.
empty()) {
14873 std::iota(std::next(CommonMask.
begin(), Idx),
14874 std::next(CommonMask.
begin(), Idx +
E->getVectorFactor()),
14880 if (!ExtMask.
empty()) {
14881 if (CommonMask.
empty()) {
14885 for (
int I = 0, Sz = ExtMask.
size();
I < Sz; ++
I) {
14888 NewMask[
I] = CommonMask[ExtMask[
I]];
14890 CommonMask.
swap(NewMask);
14893 if (CommonMask.
empty()) {
14894 assert(InVectors.
size() == 1 &&
"Expected only one vector with no mask");
14898 createShuffle(InVectors.
front(),
14899 InVectors.
size() == 2 ? InVectors.
back() :
nullptr,
14904 assert((IsFinalized || CommonMask.empty()) &&
14905 "Shuffle construction must be finalized.");
const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E,
                                                   unsigned Idx) const {
  TreeEntry *Op = OperandsToTreeEntry.at({E, Idx});
  assert(Op->isSame(E->getOperand(Idx)) && "Operands mismatch!");
  return Op;
}
  if (TE.State == TreeEntry::ScatterVectorize ||
      TE.State == TreeEntry::StridedVectorize)
    return TTI::CastContextHint::GatherScatter;
  if (TE.State == TreeEntry::CompressVectorize)
    return TTI::CastContextHint::Masked;
  if (TE.State == TreeEntry::Vectorize &&
      TE.getOpcode() == Instruction::Load && !TE.isAltShuffle()) {
    if (TE.ReorderIndices.empty())
      return TTI::CastContextHint::Normal;
    SmallVector<int> Mask;
14936 SmallPtrSetImpl<Value *> &CheckedExtracts) {
14941 return InstructionCost::getInvalid();
14946 auto It = MinBWs.find(
E);
14947 Type *OrigScalarTy = ScalarTy;
14948 if (It != MinBWs.end()) {
14954 const TreeEntry *ZExt = getOperandEntry(
E, 0);
14958 unsigned EntryVF =
E->getVectorFactor();
14961 if (
E->isGather() || TransformedToGatherNodes.contains(
E)) {
14965 return InstructionCost::getInvalid();
14967 ScalarTy = VL.
front()->getType();
14968 return processBuildVector<ShuffleCostEstimator, InstructionCost>(
14969 E, ScalarTy, *TTI, VectorizedVals, *
this, CheckedExtracts);
14971 if (
E->State == TreeEntry::SplitVectorize) {
14972 assert(
E->CombinedEntriesWithIndices.size() == 2 &&
14973 "Expected exactly 2 combined entries.");
14974 assert(
E->ReuseShuffleIndices.empty() &&
"Expected empty reuses mask.");
14976 if (
E->ReorderIndices.empty()) {
14979 E->CombinedEntriesWithIndices.back().second,
14982 VectorizableTree[
E->CombinedEntriesWithIndices.back().first]
14983 ->getVectorFactor()));
14985 unsigned CommonVF =
14986 std::max(VectorizableTree[
E->CombinedEntriesWithIndices.front().first]
14987 ->getVectorFactor(),
14988 VectorizableTree[
E->CombinedEntriesWithIndices.back().first]
14989 ->getVectorFactor());
14994 LLVM_DEBUG(dumpTreeCosts(
E, 0, VectorCost, 0,
"Calculated costs for Tree"));
14998 SmallVector<int>
Mask;
14999 if (!
E->ReorderIndices.empty() &&
E->State != TreeEntry::CompressVectorize &&
15000 (
E->State != TreeEntry::StridedVectorize ||
15002 SmallVector<int> NewMask;
15003 if (
E->getOpcode() == Instruction::Store) {
15005 NewMask.
resize(
E->ReorderIndices.size());
15012 if (!
E->ReuseShuffleIndices.empty())
15017 assert((
E->State == TreeEntry::Vectorize ||
15018 E->State == TreeEntry::ScatterVectorize ||
15019 E->State == TreeEntry::StridedVectorize ||
15020 E->State == TreeEntry::CompressVectorize) &&
15021 "Unhandled state");
15024 (
E->getOpcode() == Instruction::GetElementPtr &&
15025 E->getMainOp()->getType()->isPointerTy()) ||
15026 E->hasCopyableElements()) &&
15029 unsigned ShuffleOrOp =
15030 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector :
E->
getOpcode();
15031 if (
E->CombinedOp != TreeEntry::NotCombinedOp)
15032 ShuffleOrOp =
E->CombinedOp;
15033 SmallSetVector<Value *, 16> UniqueValues;
15034 SmallVector<unsigned, 16> UniqueIndexes;
15036 if (UniqueValues.insert(V))
15037 UniqueIndexes.push_back(Idx);
15038 const unsigned Sz = UniqueValues.size();
15039 SmallBitVector UsedScalars(Sz,
false);
15040 for (
unsigned I = 0;
I < Sz; ++
I) {
15042 !
E->isCopyableElement(UniqueValues[
I]) &&
15043 getTreeEntries(UniqueValues[
I]).
front() ==
E)
15045 UsedScalars.set(
I);
15047 auto GetCastContextHint = [&](
Value *
V) {
15049 return getCastContextHint(*OpTEs.front());
15050 InstructionsState SrcState =
getSameOpcode(
E->getOperand(0), *TLI);
15051 if (SrcState && SrcState.getOpcode() == Instruction::Load &&
15052 !SrcState.isAltShuffle())
15065 ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0);
15067 for (
unsigned I = 0;
I < Sz; ++
I) {
15068 if (UsedScalars.test(
I))
15070 ScalarCost += ScalarEltCost(
I);
15077 if (It != MinBWs.end() && !UnaryInstruction::isCast(
E->getOpcode()) &&
15079 (
E->getOpcode() != Instruction::Load ||
E->UserTreeIndex)) {
15081 if (!EI.UserTE->hasState() ||
15082 EI.UserTE->getOpcode() != Instruction::Select ||
15084 auto UserBWIt = MinBWs.find(EI.UserTE);
15085 Type *UserScalarTy =
15086 (EI.UserTE->isGather() ||
15087 EI.UserTE->State == TreeEntry::SplitVectorize)
15088 ? EI.UserTE->Scalars.front()->getType()
15089 : EI.UserTE->getOperand(EI.EdgeIdx).front()->getType();
15090 if (UserBWIt != MinBWs.end())
15092 UserBWIt->second.first);
15093 if (ScalarTy != UserScalarTy) {
15094 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
15095 unsigned SrcBWSz = DL->getTypeSizeInBits(UserScalarTy);
15096 unsigned VecOpcode;
15098 if (BWSz > SrcBWSz)
15099 VecOpcode = Instruction::Trunc;
15102 It->second.second ? Instruction::SExt : Instruction::ZExt;
15104 VecCost += TTI->getCastInstrCost(VecOpcode, UserVecTy, VecTy, CCH,
15109 LLVM_DEBUG(dumpTreeCosts(
E, CommonCost, VecCost - CommonCost,
15110 ScalarCost,
"Calculated costs for Tree"));
15111 return VecCost - ScalarCost;
15116 assert((
E->State == TreeEntry::Vectorize ||
15117 E->State == TreeEntry::StridedVectorize ||
15118 E->State == TreeEntry::CompressVectorize) &&
15119 "Entry state expected to be Vectorize, StridedVectorize or "
15120 "MaskedLoadCompressVectorize here.");
15124 *TTI, Ptrs, BasePtr,
E->getOpcode(),
CostKind, OrigScalarTy, VecTy);
15125 LLVM_DEBUG(dumpTreeCosts(
E, 0, VecCost, ScalarCost,
15126 "Calculated GEPs cost for Tree"));
15128 return VecCost - ScalarCost;
15134 return InstructionCost::getInvalid();
15135 Type *CanonicalType = Ty;
15141 IntrinsicCostAttributes CostAttrs(MinMaxID, CanonicalType,
15142 {CanonicalType, CanonicalType});
15144 TTI->getIntrinsicInstrCost(CostAttrs,
CostKind);
15147 if (VI && SelectOnly) {
15149 "Expected only for scalar type.");
15152 CI->
getOpcode(), Ty, Builder.getInt1Ty(), CI->getPredicate(),
15153 CostKind, {TTI::OK_AnyValue, TTI::OP_None},
15154 {TTI::OK_AnyValue, TTI::OP_None}, CI);
15158 auto GetFMulAddCost = [&, &TTI = *TTI](
const InstructionsState &S,
  switch (ShuffleOrOp) {
  case Instruction::PHI: {
15167 SmallPtrSet<const TreeEntry *, 4> CountedOps;
15168 for (
Value *V : UniqueValues) {
15173 ValueList Operands(
PHI->getNumIncomingValues(),
nullptr);
15174 for (
unsigned I = 0,
N =
PHI->getNumIncomingValues();
I <
N; ++
I) {
15178 if (
const TreeEntry *OpTE =
15179 getSameValuesTreeEntry(Operands.
front(), Operands))
15180 if (CountedOps.
insert(OpTE).second &&
15181 !OpTE->ReuseShuffleIndices.empty())
15182 ScalarCost +=
TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() -
15183 OpTE->Scalars.size());
15186 return CommonCost - ScalarCost;
  case Instruction::ExtractValue:
  case Instruction::ExtractElement: {
15190 APInt DemandedElts;
15192 auto GetScalarCost = [&](
unsigned Idx) {
15198 if (ShuffleOrOp == Instruction::ExtractElement) {
15200 SrcVecTy = EE->getVectorOperandType();
15203 Type *AggregateTy = EV->getAggregateOperand()->getType();
15206 NumElts = ATy->getNumElements();
15212 if (
I->hasOneUse()) {
15222 Cost -= TTI->getCastInstrCost(
15228 if (DemandedElts.
isZero())
15234 return CommonCost - (DemandedElts.
isZero()
15236 : TTI.getScalarizationOverhead(
15237 SrcVecTy, DemandedElts,
false,
15240 return GetCostDiff(GetScalarCost, GetVectorCost);
  case Instruction::InsertElement: {
    assert(E->ReuseShuffleIndices.empty() &&
           "Unique insertelements only are expected.");
15246 unsigned const NumElts = SrcVecTy->getNumElements();
15247 unsigned const NumScalars = VL.
size();
15253 unsigned OffsetEnd = OffsetBeg;
15254 InsertMask[OffsetBeg] = 0;
15257 if (OffsetBeg > Idx)
15259 else if (OffsetEnd < Idx)
15261 InsertMask[Idx] =
I + 1;
15264 if (NumOfParts > 0 && NumOfParts < NumElts)
15265 VecScalarsSz =
PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
15266 unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
15268 unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
15269 unsigned InsertVecSz = std::min<unsigned>(
15271 ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * VecScalarsSz);
15272 bool IsWholeSubvector =
15273 OffsetBeg ==
Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
15277 if (OffsetBeg + InsertVecSz > VecSz) {
15280 InsertVecSz = VecSz;
15285 SmallVector<int>
Mask;
15286 if (!
E->ReorderIndices.empty()) {
15291 std::iota(
Mask.begin(), std::next(
Mask.begin(), InsertVecSz), 0);
15293 bool IsIdentity =
true;
15295 Mask.swap(PrevMask);
15296 for (
unsigned I = 0;
I < NumScalars; ++
I) {
15298 DemandedElts.
setBit(InsertIdx);
15299 IsIdentity &= InsertIdx - OffsetBeg ==
I;
15300 Mask[InsertIdx - OffsetBeg] =
I;
15302 assert(
Offset < NumElts &&
"Failed to find vector index offset");
15316 InsertVecTy, Mask);
15318 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
15324 SmallBitVector InMask =
15326 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask));
15327 if (!InMask.
all() && NumScalars != NumElts && !IsWholeSubvector) {
15328 if (InsertVecSz != VecSz) {
15333 for (
unsigned I = 0, End = OffsetBeg -
Offset;
I < End; ++
I)
15335 for (
unsigned I = OffsetBeg -
Offset, End = OffsetEnd -
Offset;
15339 for (
unsigned I = OffsetEnd + 1 -
Offset;
I < VecSz; ++
I)
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
15360 auto SrcIt = MinBWs.find(getOperandEntry(
E, 0));
15363 unsigned Opcode = ShuffleOrOp;
15364 unsigned VecOpcode = Opcode;
15366 (SrcIt != MinBWs.end() || It != MinBWs.end())) {
15368 unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy->
getScalarType());
15369 if (SrcIt != MinBWs.end()) {
15370 SrcBWSz = SrcIt->second.first;
15376 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->
getScalarType());
      if (BWSz == SrcBWSz) {
        VecOpcode = Instruction::BitCast;
      } else if (BWSz < SrcBWSz) {
        VecOpcode = Instruction::Trunc;
      } else if (It != MinBWs.end()) {
        assert(BWSz > SrcBWSz && "Invalid cast!");
        VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
      } else if (SrcIt != MinBWs.end()) {
        assert(BWSz > SrcBWSz && "Invalid cast!");
        VecOpcode =
            SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
      } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
                 !SrcIt->second.second) {
        VecOpcode = Instruction::UIToFP;
15394 assert(Idx == 0 &&
"Expected 0 index only");
15395 return TTI->getCastInstrCost(Opcode, VL0->
getType(),
15402 if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
15404 auto *
VI = VL0->
getOpcode() == Opcode ? VL0 :
nullptr;
15407 bool IsArithmeticExtendedReduction =
15408 E->Idx == 0 && UserIgnoreList &&
15411 return is_contained({Instruction::Add, Instruction::FAdd,
15412 Instruction::Mul, Instruction::FMul,
15413 Instruction::And, Instruction::Or,
15417 if (IsArithmeticExtendedReduction &&
15418 (VecOpcode == Instruction::ZExt || VecOpcode == Instruction::SExt))
15420 return CommonCost +
15421 TTI->getCastInstrCost(VecOpcode, VecTy, SrcVecTy, CCH,
CostKind,
15422 VecOpcode == Opcode ? VI :
nullptr);
15424 return GetCostDiff(GetScalarCost, GetVectorCost);
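// NOTE (editorial sketch, hypothetical helper, not LLVM API): every case in
// this switch follows the same GetCostDiff pattern - sum the cost of keeping
// each unique scalar, take the cost of the single vector instruction plus the
// common shuffle cost, and report the difference, so a negative result means
// vectorization pays off.
//
//   InstructionCost
//   costDiff(function_ref<InstructionCost(unsigned)> ScalarCost,
//            function_ref<InstructionCost()> VectorCost, unsigned NumLanes) {
//     InstructionCost Scalars = 0;
//     for (unsigned Lane = 0; Lane < NumLanes; ++Lane)
//       Scalars += ScalarCost(Lane);   // cost of each lane left scalar
//     return VectorCost() - Scalars;   // < 0 => vector form is cheaper
//   }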
  case Instruction::FCmp:
  case Instruction::ICmp:
  case Instruction::Select: {
15429 CmpPredicate VecPred, SwappedVecPred;
15432 match(VL0, MatchCmp))
15438 auto GetScalarCost = [&](
unsigned Idx) {
15448 !
match(VI, MatchCmp)) ||
15456 E->getOpcode(), OrigScalarTy, Builder.getInt1Ty(), CurrentPred,
15457 CostKind, getOperandInfo(
VI->getOperand(0)),
15458 getOperandInfo(
VI->getOperand(1)), VI);
15469 TTI->getCmpSelInstrCost(
E->getOpcode(), VecTy, MaskTy, VecPred,
15470 CostKind, getOperandInfo(
E->getOperand(0)),
15471 getOperandInfo(
E->getOperand(1)), VL0);
15475 unsigned CondNumElements = CondType->getNumElements();
15477 assert(VecTyNumElements >= CondNumElements &&
15478 VecTyNumElements % CondNumElements == 0 &&
15479 "Cannot vectorize Instruction::Select");
15480 if (CondNumElements != VecTyNumElements) {
15489 return VecCost + CommonCost;
15491 return GetCostDiff(GetScalarCost, GetVectorCost);
15493 case TreeEntry::MinMax: {
15494 auto GetScalarCost = [&](
unsigned Idx) {
15495 return GetMinMaxCost(OrigScalarTy);
15499 return VecCost + CommonCost;
15501 return GetCostDiff(GetScalarCost, GetVectorCost);
15503 case TreeEntry::FMulAdd: {
15504 auto GetScalarCost = [&](
unsigned Idx) {
15507 return GetFMulAddCost(
E->getOperations(),
15513 for (
Value *V :
E->Scalars) {
15515 FMF &= FPCI->getFastMathFlags();
15517 FMF &= FPCIOp->getFastMathFlags();
15520 IntrinsicCostAttributes ICA(Intrinsic::fmuladd, VecTy,
15521 {VecTy, VecTy, VecTy}, FMF);
15523 return VecCost + CommonCost;
15525 return GetCostDiff(GetScalarCost, GetVectorCost);
15527 case TreeEntry::ReducedBitcast:
15528 case TreeEntry::ReducedBitcastBSwap: {
15529 auto GetScalarCost = [&, &TTI = *TTI](
unsigned Idx) {
15539 ScalarCost += TTI.getInstructionCost(ZExt,
CostKind);
15543 const TreeEntry *LhsTE = getOperandEntry(
E, 0);
15545 getCastContextHint(*getOperandEntry(LhsTE, 0));
15547 auto *SrcVecTy =
getWidenedType(SrcScalarTy, LhsTE->getVectorFactor());
15549 Instruction::BitCast, ScalarTy, SrcVecTy, CastCtx,
CostKind);
15550 if (ShuffleOrOp == TreeEntry::ReducedBitcastBSwap) {
15551 auto *OrigScalarTy =
E->getMainOp()->getType();
15552 IntrinsicCostAttributes CostAttrs(Intrinsic::bswap, OrigScalarTy,
15555 TTI.getIntrinsicInstrCost(CostAttrs,
CostKind);
15558 return BitcastCost + CommonCost;
15560 return GetCostDiff(GetScalarCost, GetVectorCost);
  case Instruction::FNeg:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    auto GetScalarCost = [&](unsigned Idx) {
15588 unsigned Lane = UniqueIndexes[Idx];
15589 Value *Op1 =
E->getOperand(0)[Lane];
15591 SmallVector<const Value *, 2> Operands(1, Op1);
15595 Op2 =
E->getOperand(1)[Lane];
15601 ShuffleOrOp, OrigScalarTy,
CostKind, Op1Info, Op2Info, Operands);
15603 I && (ShuffleOrOp == Instruction::FAdd ||
15604 ShuffleOrOp == Instruction::FSub)) {
15612 if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
15617 return CI && CI->getValue().countr_one() >= It->second.first;
15625 return TTI->getArithmeticInstrCost(ShuffleOrOp, VecTy,
CostKind, Op1Info,
15626 Op2Info, {},
nullptr, TLI) +
15629 return GetCostDiff(GetScalarCost, GetVectorCost);
  case Instruction::GetElementPtr: {
    return CommonCost + GetGEPCostDiff(VL, VL0);
  }
  case Instruction::Load: {
15635 auto GetScalarCost = [&](
unsigned Idx) {
15637 return TTI->getMemoryOpCost(Instruction::Load, OrigScalarTy,
15638 VI->getAlign(),
VI->getPointerAddressSpace(),
15644 switch (
E->State) {
15645 case TreeEntry::Vectorize:
15646 if (
unsigned Factor =
E->getInterleaveFactor()) {
15647 VecLdCost = TTI->getInterleavedMemoryOpCost(
15648 Instruction::Load, VecTy, Factor, {}, LI0->getAlign(),
15649 LI0->getPointerAddressSpace(),
CostKind);
15652 VecLdCost = TTI->getMemoryOpCost(
15653 Instruction::Load, VecTy, LI0->getAlign(),
15657 case TreeEntry::StridedVectorize: {
15658 const StridedPtrInfo &SPtrInfo = TreeEntryToStridedPtrInfoMap.at(
E);
15659 FixedVectorType *StridedLoadTy = SPtrInfo.Ty;
15660 assert(StridedLoadTy &&
"Missing StridedPointerInfo for tree entry.");
15661 Align CommonAlignment =
15663 VecLdCost = TTI->getMemIntrinsicInstrCost(
15664 MemIntrinsicCostAttributes(Intrinsic::experimental_vp_strided_load,
15665 StridedLoadTy, LI0->getPointerOperand(),
15666 false, CommonAlignment),
15668 if (StridedLoadTy != VecTy)
15670 TTI->getCastInstrCost(Instruction::BitCast, VecTy, StridedLoadTy,
15675 case TreeEntry::CompressVectorize: {
15677 unsigned InterleaveFactor;
15678 SmallVector<int> CompressMask;
15681 if (!
E->ReorderIndices.empty()) {
15682 SmallVector<int>
Mask(
E->ReorderIndices.begin(),
15683 E->ReorderIndices.end());
15690 Scalars, PointerOps,
E->ReorderIndices, *TTI, *DL, *SE, *AC, *DT,
15691 *TLI, [](
Value *) { return true; }, IsMasked, InterleaveFactor,
15692 CompressMask, LoadVecTy);
15693 assert(IsVectorized &&
"Failed to vectorize load");
15694 CompressEntryToData.try_emplace(
E, CompressMask, LoadVecTy,
15695 InterleaveFactor, IsMasked);
15696 Align CommonAlignment = LI0->getAlign();
15697 if (InterleaveFactor) {
15698 VecLdCost = TTI->getInterleavedMemoryOpCost(
15699 Instruction::Load, LoadVecTy, InterleaveFactor, {},
15700 CommonAlignment, LI0->getPointerAddressSpace(),
CostKind);
15701 }
else if (IsMasked) {
15702 VecLdCost = TTI->getMemIntrinsicInstrCost(
15703 MemIntrinsicCostAttributes(Intrinsic::masked_load, LoadVecTy,
15705 LI0->getPointerAddressSpace()),
15709 LoadVecTy, CompressMask,
CostKind);
15711 VecLdCost = TTI->getMemoryOpCost(
15712 Instruction::Load, LoadVecTy, CommonAlignment,
15716 LoadVecTy, CompressMask,
CostKind);
15720 case TreeEntry::ScatterVectorize: {
15721 Align CommonAlignment =
15723 VecLdCost = TTI->getMemIntrinsicInstrCost(
15724 MemIntrinsicCostAttributes(Intrinsic::masked_gather, VecTy,
15725 LI0->getPointerOperand(),
15726 false, CommonAlignment),
15730 case TreeEntry::CombinedVectorize:
15731 case TreeEntry::SplitVectorize:
15732 case TreeEntry::NeedToGather:
15735 return VecLdCost + CommonCost;
15741 if (
E->State == TreeEntry::ScatterVectorize)
15748 return Cost + GetGEPCostDiff(PointerOps, LI0->getPointerOperand());
  case Instruction::Store: {
    bool IsReorder = !E->ReorderIndices.empty();
15752 auto GetScalarCost = [=](
unsigned Idx) {
15755 return TTI->getMemoryOpCost(Instruction::Store, OrigScalarTy,
15756 VI->getAlign(),
VI->getPointerAddressSpace(),
15764 if (
E->State == TreeEntry::StridedVectorize) {
15765 Align CommonAlignment =
15767 VecStCost = TTI->getMemIntrinsicInstrCost(
15768 MemIntrinsicCostAttributes(Intrinsic::experimental_vp_strided_store,
15769 VecTy, BaseSI->getPointerOperand(),
15770 false, CommonAlignment),
15773 assert(
E->State == TreeEntry::Vectorize &&
15774 "Expected either strided or consecutive stores.");
15775 if (
unsigned Factor =
E->getInterleaveFactor()) {
15776 assert(
E->ReuseShuffleIndices.empty() && !
E->ReorderIndices.empty() &&
15777 "No reused shuffles expected");
15779 VecStCost = TTI->getInterleavedMemoryOpCost(
15780 Instruction::Store, VecTy, Factor, {}, BaseSI->getAlign(),
15781 BaseSI->getPointerAddressSpace(),
CostKind);
15784 VecStCost = TTI->getMemoryOpCost(
15785 Instruction::Store, VecTy, BaseSI->getAlign(),
15786 BaseSI->getPointerAddressSpace(),
CostKind, OpInfo);
15789 return VecStCost + CommonCost;
15793 unsigned Idx = IsReorder ?
E->ReorderIndices[
I] :
I;
15797 return GetCostDiff(GetScalarCost, GetVectorCost) +
15798 GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
  case Instruction::Call: {
    auto GetScalarCost = [&](unsigned Idx) {
15805 IntrinsicCostAttributes CostAttrs(
ID, *CI, 1);
15806 return TTI->getIntrinsicInstrCost(CostAttrs,
CostKind);
15816 CI,
ID, VecTy->getNumElements(),
15817 It != MinBWs.end() ? It->second.first : 0, TTI);
15819 return std::min(VecCallCosts.first, VecCallCosts.second) + CommonCost;
15821 return GetCostDiff(GetScalarCost, GetVectorCost);
15823 case Instruction::ShuffleVector: {
15831 "Invalid Shuffle Vector Operand");
    auto TryFindNodeWithEqualOperands = [=]() {
      for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
        if (TE->hasState() && TE->isAltShuffle() &&
            ((TE->getOpcode() == E->getOpcode() &&
              TE->getAltOpcode() == E->getAltOpcode()) ||
             (TE->getOpcode() == E->getAltOpcode() &&
              TE->getAltOpcode() == E->getOpcode())) &&
            TE->hasEqualOperands(*E))
15848 auto GetScalarCost = [&](
unsigned Idx) {
15853 assert(
E->getMatchingMainOpOrAltOp(VI) &&
15854 "Unexpected main/alternate opcode");
15856 return TTI->getInstructionCost(VI,
CostKind);
15864 if (TryFindNodeWithEqualOperands()) {
15866 dbgs() <<
"SLP: diamond match for alternate node found.\n";
15873 TTIRef.getArithmeticInstrCost(
E->getOpcode(), VecTy,
CostKind);
15875 TTIRef.getArithmeticInstrCost(
E->getAltOpcode(), VecTy,
CostKind);
15878 VecCost = TTIRef.getCmpSelInstrCost(
15879 E->getOpcode(), VecTy, MaskTy, CI0->getPredicate(),
CostKind,
15880 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
15882 VecCost += TTIRef.getCmpSelInstrCost(
15883 E->getOpcode(), VecTy, MaskTy,
15885 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
15888 Type *SrcSclTy =
E->getMainOp()->getOperand(0)->getType();
15891 auto SrcIt = MinBWs.find(getOperandEntry(
E, 0));
15892 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
15894 DL->getTypeSizeInBits(
E->getMainOp()->getOperand(0)->getType());
15895 if (SrcIt != MinBWs.end()) {
15896 SrcBWSz = SrcIt->second.first;
15900 if (BWSz <= SrcBWSz) {
15901 if (BWSz < SrcBWSz)
15903 TTIRef.getCastInstrCost(Instruction::Trunc, VecTy, SrcTy,
15907 <<
"SLP: alternate extension, which should be truncated.\n";
15913 VecCost = TTIRef.getCastInstrCost(
E->getOpcode(), VecTy, SrcTy,
15916 TTIRef.getCastInstrCost(
E->getAltOpcode(), VecTy, SrcTy,
15919 SmallVector<int>
Mask;
15920 E->buildAltOpShuffleMask(
15921 [&](Instruction *
I) {
15922 assert(
E->getMatchingMainOpOrAltOp(
I) &&
15923 "Unexpected main/alternate opcode");
15934 unsigned Opcode0 =
E->getOpcode();
15935 unsigned Opcode1 =
E->getAltOpcode();
15936 SmallBitVector OpcodeMask(
15940 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
15942 VecTy, Opcode0, Opcode1, OpcodeMask,
CostKind);
15943 return AltVecCost < VecCost ? AltVecCost : VecCost;
15949 return GetCostDiff(
15954 "Not supported shufflevector usage.");
15956 unsigned SVNumElements =
15958 ->getNumElements();
15959 unsigned GroupSize = SVNumElements / SV->getShuffleMask().size();
15960 for (
size_t I = 0, End = VL.
size();
I != End;
I += GroupSize) {
15965 "Not supported shufflevector usage.");
15968 [[maybe_unused]]
bool IsExtractSubvectorMask =
15969 SV->isExtractSubvectorMask(Index);
15970 assert(IsExtractSubvectorMask &&
15971 "Not supported shufflevector usage.");
15972 if (NextIndex != Index)
15974 NextIndex += SV->getShuffleMask().size();
15977 return ::getShuffleCost(
15983 return GetCostDiff(GetScalarCost, GetVectorCost);
15985 case Instruction::Freeze:
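// NOTE (editorial comment, not from the original file): the predicate below
// decides whether a very small tree (one or two nodes) is still worth
// vectorizing, e.g. a vectorizable root whose single operand node is a gather
// made entirely of extractelements, non-shuffled loads, or constants; gathers
// of arbitrary scalars make such tiny trees unprofitable.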
bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
15994 << VectorizableTree.size() <<
" is fully vectorizable .\n");
15996 auto &&AreVectorizableGathers = [
this](
const TreeEntry *
TE,
unsigned Limit) {
15997 SmallVector<int>
Mask;
15998 return TE->isGather() &&
16000 [
this](
Value *V) { return EphValues.contains(V); }) &&
16002 TE->Scalars.size() < Limit ||
16003 (((
TE->hasState() &&
16004 TE->getOpcode() == Instruction::ExtractElement) ||
16007 (
TE->hasState() &&
TE->getOpcode() == Instruction::Load &&
16008 !
TE->isAltShuffle()) ||
16013 if (VectorizableTree.size() == 1 &&
16014 (VectorizableTree[0]->State == TreeEntry::Vectorize ||
16015 VectorizableTree[0]->State == TreeEntry::StridedVectorize ||
16016 VectorizableTree[0]->State == TreeEntry::CompressVectorize ||
16018 AreVectorizableGathers(VectorizableTree[0].
get(),
16019 VectorizableTree[0]->Scalars.size()) &&
16020 VectorizableTree[0]->getVectorFactor() > 2)))
16023 if (VectorizableTree.size() != 2)
16030 if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
16031 AreVectorizableGathers(VectorizableTree[1].
get(),
16032 VectorizableTree[0]->Scalars.size()))
16036 if (VectorizableTree[0]->
isGather() ||
16037 (VectorizableTree[1]->
isGather() &&
16038 VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
16039 VectorizableTree[0]->State != TreeEntry::StridedVectorize &&
16040 VectorizableTree[0]->State != TreeEntry::CompressVectorize))
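// NOTE (editorial comment, an assumption based on the surrounding code): the
// helper below walks up from Root through or/shl chains over zero-extended
// loads, i.e. the classic load-combining idiom, roughly:
//
//   // uint32_t X = P[0] | (P[1] << 8) | (P[2] << 16) | (P[3] << 24);
//
// When the whole tree (or the scalars feeding a reduction or store) matches
// this shape, SLP assumes the backend's load combining produces better code
// and skips vectorization ("SLP: Assume load combining for tree starting at").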
16048 bool MustMatchOrInst) {
16052 Value *ZextLoad = Root;
16053 const APInt *ShAmtC;
16054 bool FoundOr =
false;
16058 ShAmtC->
urem(8) == 0))) {
16060 ZextLoad = BinOp->getOperand(0);
16061 if (BinOp->getOpcode() == Instruction::Or)
16066 if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root ||
16073 Type *SrcTy = Load->getType();
16074 unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts;
16080 LLVM_DEBUG(
dbgs() <<
"SLP: Assume load combining for tree starting at "
16090 unsigned NumElts = VectorizableTree[0]->Scalars.size();
16091 Value *FirstReduced = VectorizableTree[0]->Scalars[0];
16099 unsigned NumElts = Stores.
size();
16100 for (
Value *Scalar : Stores) {
16114 if (VectorizableTree.empty()) {
16115 assert(ExternalUses.empty() &&
"We shouldn't have any external users");
16121 if (VectorizableTree.size() == 2 &&
16123 VectorizableTree[1]->isGather() &&
16124 (VectorizableTree[1]->getVectorFactor() <= 2 ||
16125 !(
isSplat(VectorizableTree[1]->Scalars) ||
16133 constexpr int Limit = 4;
16135 !VectorizableTree.empty() &&
16136 all_of(VectorizableTree, [&](
const std::unique_ptr<TreeEntry> &TE) {
16137 return (TE->isGather() &&
16138 (!TE->hasState() ||
16139 TE->getOpcode() != Instruction::ExtractElement) &&
16141 (TE->hasState() && TE->getOpcode() == Instruction::PHI);
16148 VectorizableTree.size() <= Limit &&
16149 all_of(VectorizableTree,
16150 [&](
const std::unique_ptr<TreeEntry> &TE) {
16151 return (TE->isGather() &&
16152 (!TE->hasState() ||
16153 TE->getOpcode() != Instruction::ExtractElement) &&
16157 (TE->getOpcode() == Instruction::InsertElement ||
16158 (TE->getOpcode() == Instruction::PHI &&
16160 return isa<PoisonValue>(V) || MustGather.contains(V);
16163 any_of(VectorizableTree, [&](
const std::unique_ptr<TreeEntry> &TE) {
16164 return TE->State == TreeEntry::Vectorize &&
16165 TE->getOpcode() == Instruction::PHI;
16172 unsigned NumGathers = 0;
16173 constexpr int LimitTreeSize = 36;
16175 all_of(VectorizableTree,
16176 [&](
const std::unique_ptr<TreeEntry> &TE) {
16177 if (!TE->isGather() && TE->hasState() &&
16178 (TE->getOpcode() == Instruction::Load ||
16179 TE->getOpcode() == Instruction::Store)) {
16183 if (TE->isGather())
16185 return TE->State == TreeEntry::SplitVectorize ||
16186 (TE->Idx == 0 && TE->Scalars.size() == 2 &&
16187 TE->hasState() && TE->getOpcode() == Instruction::ICmp &&
16188 VectorizableTree.size() > LimitTreeSize) ||
16192 (TE->getOpcode() == Instruction::PHI ||
16193 (TE->hasCopyableElements() &&
16196 TE->Scalars.size() / 2) ||
16197 ((!TE->ReuseShuffleIndices.empty() ||
16198 !TE->ReorderIndices.empty() || TE->isAltShuffle()) &&
16199 TE->Scalars.size() == 2)));
16201 (StoreLoadNodes.
empty() ||
16202 (VectorizableTree.size() > LimitTreeSize * StoreLoadNodes.
size() &&
16203 (NumGathers > 0 ||
none_of(StoreLoadNodes, [&](
const TreeEntry *TE) {
16204 return TE->getOpcode() == Instruction::Store ||
16206 return !isa<LoadInst>(V) ||
16207 areAllUsersVectorized(cast<Instruction>(V));
16215 VectorizableTree.front()->State == TreeEntry::SplitVectorize &&
16216 VectorizableTree.size() >= Limit &&
16218 [&](
const std::unique_ptr<TreeEntry> &TE) {
16219 return !TE->isGather() && TE->UserTreeIndex.UserTE &&
16220 TE->UserTreeIndex.UserTE->Idx == 0;
16227 VectorizableTree.size() > 2 &&
16228 VectorizableTree.front()->State == TreeEntry::Vectorize &&
16229 VectorizableTree.front()->getOpcode() == Instruction::InsertElement &&
16230 VectorizableTree[1]->State == TreeEntry::Vectorize &&
16231 VectorizableTree[1]->getOpcode() == Instruction::PHI &&
16233 ArrayRef(VectorizableTree).drop_front(2),
16234 [&](
const std::unique_ptr<TreeEntry> &TE) {
return TE->isGather(); }))
16244 if (isFullyVectorizableTinyTree(ForReduction))
16249 bool IsAllowedSingleBVNode =
16250 VectorizableTree.
size() > 1 ||
16251 (VectorizableTree.size() == 1 && VectorizableTree.front()->hasState() &&
16252 !VectorizableTree.front()->isAltShuffle() &&
16253 VectorizableTree.front()->getOpcode() != Instruction::PHI &&
16254 VectorizableTree.front()->getOpcode() != Instruction::GetElementPtr &&
16256 if (
any_of(VectorizableTree, [&](
const std::unique_ptr<TreeEntry> &TE) {
16257 return TE->isGather() &&
all_of(TE->Scalars, [&](
Value *V) {
16258 return isa<ExtractElementInst, Constant>(V) ||
16259 (IsAllowedSingleBVNode &&
16260 !V->hasNUsesOrMore(UsesLimit) &&
16261 any_of(V->users(), IsaPred<InsertElementInst>));
16266 if (VectorizableTree.back()->isGather() &&
16267 VectorizableTree.back()->hasState() &&
16268 VectorizableTree.back()->isAltShuffle() &&
16269 VectorizableTree.back()->getVectorFactor() > 2 &&
16271 !VectorizableTree.back()->Scalars.front()->getType()->isVectorTy() &&
16272 TTI->getScalarizationOverhead(
16273 getWidenedType(VectorizableTree.back()->Scalars.front()->getType(),
16274 VectorizableTree.back()->getVectorFactor()),
16287 constexpr unsigned SmallTree = 3;
16288 if (VectorizableTree.front()->isNonPowOf2Vec() &&
16291 [](
const std::unique_ptr<TreeEntry> &TE) {
16292 return TE->isGather() && TE->hasState() &&
16293 TE->getOpcode() == Instruction::Load &&
16301 TreeEntry &E = *VectorizableTree[Idx];
16302 if (E.State == TreeEntry::SplitVectorize)
16306 if ((E.hasState() && E.getOpcode() != Instruction::Load) ||
16325 const TreeEntry *Root = VectorizableTree.front().get();
16326 if (Root->isGather())
16335 for (
const auto &TEPtr : VectorizableTree) {
16336 if (TEPtr->CombinedOp == TreeEntry::ReducedBitcast ||
16337 TEPtr->CombinedOp == TreeEntry::ReducedBitcastBSwap) {
16338 ScalarOrPseudoEntries.
insert(TEPtr.get());
16341 if (!TEPtr->isGather()) {
16342 Instruction *LastInst = &getLastInstructionInBundle(TEPtr.get());
16343 EntriesToLastInstruction.
try_emplace(TEPtr.get(), LastInst);
16344 LastInstructions.
insert(LastInst);
16346 if (TEPtr->UserTreeIndex)
16347 EntriesToOperands[TEPtr->UserTreeIndex.UserTE].push_back(TEPtr.get());
16354 if (
II->isAssumeLikeIntrinsic())
16361 return IntrCost < CallCost;
16368 CheckedInstructions;
16369 unsigned Budget = 0;
16370 const unsigned BudgetLimit =
16375 "Expected instructions in same block.");
16376 if (
auto It = CheckedInstructions.
find(
Last);
16377 It != CheckedInstructions.
end()) {
16378 const Instruction *Checked = It->second.getPointer();
16380 return It->second.getInt() != 0;
16386 ++
First->getIterator().getReverse(),
16388 Last->getIterator().getReverse();
16390 while (InstIt != PrevInstIt && Budget <= BudgetLimit) {
16396 for (
const Instruction *LastInst : LastInstsInRange)
16397 CheckedInstructions.
try_emplace(LastInst, &*PrevInstIt, 0);
16400 if (LastInstructions.
contains(&*PrevInstIt))
16401 LastInstsInRange.
push_back(&*PrevInstIt);
16406 for (
const Instruction *LastInst : LastInstsInRange)
16408 LastInst, PrevInstIt == InstIt ?
First : &*PrevInstIt,
16409 Budget <= BudgetLimit ? 1 : 0);
16410 return Budget <= BudgetLimit;
16412 auto AddCosts = [&](
const TreeEntry *
Op) {
16415 Type *ScalarTy =
Op->Scalars.front()->getType();
16416 auto It = MinBWs.find(
Op);
16417 if (It != MinBWs.end())
16420 Cost += TTI->getCostOfKeepingLiveOverCall(VecTy);
16423 Cost -=
Op->Scalars.size() * TTI->getCostOfKeepingLiveOverCall(ScalarTy);
16430 ParentOpParentToPreds;
16433 auto Key = std::make_pair(Root, OpParent);
16434 if (
auto It = ParentOpParentToPreds.
find(
Key);
16435 It != ParentOpParentToPreds.
end())
16447 for (
const auto &KeyPair : ParentsPairsToAdd) {
16449 "Should not have been added before.");
16453 while (!Worklist.
empty()) {
16455 if (BB == OpParent || !Visited.
insert(BB).second)
16457 auto Pair = std::make_pair(BB, OpParent);
16458 if (
auto It = ParentOpParentToPreds.
find(Pair);
16459 It != ParentOpParentToPreds.
end()) {
16463 ParentsPairsToAdd.
insert(Pair);
16468 if (Budget > BudgetLimit)
16480 auto FindNonScalarParentEntry = [&](
const TreeEntry *E) ->
const TreeEntry * {
16482 "Expected scalar or pseudo entry.");
16483 const TreeEntry *Entry = E;
16484 while (Entry->UserTreeIndex) {
16485 Entry = Entry->UserTreeIndex.UserTE;
16486 if (!ScalarOrPseudoEntries.
contains(Entry))
16491 while (!LiveEntries.
empty()) {
16494 if (Operands.
empty())
16496 if (ScalarOrPseudoEntries.
contains(Entry)) {
16497 Entry = FindNonScalarParentEntry(Entry);
16499 for (
const TreeEntry *
Op : Operands) {
16500 if (!
Op->isGather())
16506 Instruction *LastInst = EntriesToLastInstruction.
at(Entry);
16508 for (
const TreeEntry *
Op : Operands) {
16509 if (!
Op->isGather())
16513 if (Entry->State == TreeEntry::SplitVectorize ||
16514 (Entry->getOpcode() != Instruction::PHI &&
Op->isGather()) ||
16520 Pred = Phi->getIncomingBlock(
Op->UserTreeIndex.EdgeIdx);
16523 if (
Op->isGather()) {
16524 assert(Entry->getOpcode() == Instruction::PHI &&
16525 "Expected phi node only.");
16527 ->getIncomingBlock(
Op->UserTreeIndex.EdgeIdx);
16529 for (
Value *V :
Op->Scalars) {
16540 OpLastInst = EntriesToLastInstruction.
at(
Op);
16544 if (OpParent == Parent) {
16545 if (Entry->getOpcode() == Instruction::PHI) {
16546 if (!CheckForNonVecCallsInSameBlock(LastInst, OpLastInst))
16550 if (!CheckForNonVecCallsInSameBlock(OpLastInst, LastInst))
16556 if (Entry->getOpcode() != Instruction::PHI &&
16557 !CheckForNonVecCallsInSameBlock(
16563 if (!CheckForNonVecCallsInSameBlock(OpLastInst,
16569 if (!CheckPredecessors(Parent, Pred, OpParent)) {
16585 const auto *I1 = IE1;
16586 const auto *I2 = IE2;
16598 if (I1 && (I1 == IE1 || I1->hasOneUse()) &&
16601 if (I2 && ((I2 == IE2 || I2->
hasOneUse())) &&
16604 }
while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2));
struct ValueSelect {
  template <typename U>
  static std::enable_if_t<std::is_same_v<Value *, U>, Value *> get(Value *V) {
    return V;
  }
  template <typename U>
  static std::enable_if_t<!std::is_same_v<Value *, U>, U> get(Value *) {
    return U();
  }
};
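// NOTE (editorial comment, not from the original file): ValueSelect is a small
// SFINAE switch used by the templated shuffle-action driver below, which is
// instantiated both over IR values (codegen) and over tree entries / costs
// (cost estimation). get<U>(V) forwards V only when U is Value *, otherwise it
// returns a default-constructed U. A minimal illustration, assuming some
// Value *V:
//
//   Value *Same = ValueSelect::get<Value *>(V);           // forwards V as-is
//   InstructionCost Zero = ValueSelect::get<InstructionCost>(V); // yields U()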
16635template <
typename T>
16641 assert(!ShuffleMask.empty() &&
"Empty list of shuffles for inserts.");
16643 auto VMIt = std::next(ShuffleMask.begin());
16646 buildUseMask(Mask.size(), Mask, UseMask::UndefsAsMask);
16648 if (!IsBaseUndef.
all()) {
16650 std::pair<T *, bool> Res =
16651 ResizeAction(ShuffleMask.begin()->first, Mask,
false);
16653 for (
unsigned Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) {
16657 Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF;
16659 [[maybe_unused]]
auto *V = ValueSelect::get<T *>(
Base);
16660 assert((!V || GetVF(V) == Mask.size()) &&
16661 "Expected base vector of VF number of elements.");
16662 Prev = Action(Mask, {
nullptr, Res.first});
16663 }
else if (ShuffleMask.size() == 1) {
16666 std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask,
16672 Prev = Action(Mask, {ShuffleMask.begin()->first});
16676 unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);
16677 unsigned Vec2VF = GetVF(VMIt->first);
16678 if (Vec1VF == Vec2VF) {
16682 for (
unsigned I = 0, VF = Mask.size();
I < VF; ++
I) {
16685 Mask[
I] = SecMask[
I] + Vec1VF;
16688 Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
16691 std::pair<T *, bool> Res1 = ResizeAction(ShuffleMask.begin()->first, Mask,
16693 std::pair<T *, bool> Res2 =
16694 ResizeAction(VMIt->first, VMIt->second,
false);
16696 for (
unsigned I = 0, VF = Mask.size();
I < VF; ++
I) {
16703 Mask[
I] = (Res2.second ?
I : SecMask[
I]) + VF;
16706 Prev = Action(Mask, {Res1.first, Res2.first});
16708 VMIt = std::next(VMIt);
16710 [[maybe_unused]]
bool IsBaseNotUndef = !IsBaseUndef.
all();
16712 for (
auto E = ShuffleMask.end(); VMIt !=
E; ++VMIt) {
16714 std::pair<T *, bool> Res =
16715 ResizeAction(VMIt->first, VMIt->second,
false);
16717 for (
unsigned I = 0, VF = Mask.size();
I < VF; ++
I) {
16720 "Multiple uses of scalars.");
16721 Mask[
I] = (Res.second ?
I : SecMask[
I]) + VF;
16726 Prev = Action(Mask, {Prev, Res.first});
16737 << VectorizableTree.size() <<
".\n");
16739 for (
const std::unique_ptr<TreeEntry> &Ptr : VectorizableTree) {
16740 TreeEntry &TE = *Ptr;
16743 if (TE.State == TreeEntry::CombinedVectorize) {
16745 dbgs() <<
"SLP: Skipping cost for combined node that starts with "
16746 << *TE.Scalars[0] <<
".\n";
16747 TE.dump();
dbgs() <<
"SLP: Current total cost = " << Cost <<
"\n");
16751 if (TE.hasState() &&
16752 (TE.isGather() || TE.State == TreeEntry::SplitVectorize)) {
16753 if (
const TreeEntry *E =
16754 getSameValuesTreeEntry(TE.getMainOp(), TE.Scalars);
16755 E && E->getVectorFactor() == TE.getVectorFactor()) {
16760 <<
"SLP: Current total cost = " << Cost <<
"\n");
16768 assert((!TE.isGather() || TE.Idx == 0 || TE.UserTreeIndex) &&
16769 "Expected gather nodes with users only.");
16776 <<
"SLP: Current total cost = " << Cost <<
"\n");
16778 if (TE.Idx > 0 && !TE.UserTreeIndex && TE.hasState() &&
16779 TE.getOpcode() == Instruction::Load)
16780 GatheredLoadsNodes.
insert(&TE);
16787 constexpr unsigned PartLimit = 2;
16788 const unsigned Sz =
16790 const unsigned MinVF =
getMinVF(Sz);
16792 VectorizableTree.front()->Scalars.size() * PartLimit <= MinVF &&
16793 (!VectorizableTree.front()->hasState() ||
16794 (VectorizableTree.front()->getOpcode() != Instruction::Store &&
16795 LI->getLoopFor(VectorizableTree.front()->getMainOp()->getParent()))))
16798 VectorizableTree.size());
16799 auto UpdateParentNodes =
16803 bool AddToList =
true) {
16805 VisitedUser.insert(std::make_pair(TE, UserTE)).second) {
16806 SubtreeCosts[UserTE->Idx].first +=
C;
16808 SubtreeCosts[UserTE->Idx].second.
push_back(TE->Idx);
16809 UserTE = UserTE->UserTreeIndex.UserTE;
16812 for (
const std::unique_ptr<TreeEntry> &Ptr : VectorizableTree) {
16813 TreeEntry &TE = *Ptr;
16815 SubtreeCosts[TE.Idx].first +=
C;
16816 if (
const TreeEntry *UserTE = TE.UserTreeIndex.UserTE) {
16819 UpdateParentNodes(UserTE, &TE,
C, VisitedUser);
16823 for (TreeEntry *TE : GatheredLoadsNodes) {
16825 for (
Value *V : TE->Scalars) {
16826 for (
const TreeEntry *BVTE : ValueToGatherNodes.lookup(V))
16827 UpdateParentNodes(BVTE, TE,
C, Visited,
false);
  using CostIndicesTy =
      std::pair<TreeEntry *, std::pair<InstructionCost, SmallVector<unsigned>>>;
  struct FirstGreater {
    bool operator()(const CostIndicesTy &LHS, const CostIndicesTy &RHS) const {
      return LHS.second.first < RHS.second.first ||
             (LHS.second.first == RHS.second.first &&
              LHS.first->Idx < RHS.first->Idx);
    }
  };
16842 for (
const auto [Idx,
P] :
enumerate(SubtreeCosts))
16843 Worklist.emplace(VectorizableTree[Idx].
get(),
P);
16846 if (!UserIgnoreList && VectorizableTree.front()->getVectorFactor() < MinVF &&
16847 VectorizableTree.front()->hasState() &&
16848 VectorizableTree.front()->getOpcode() == Instruction::Store &&
16849 (Worklist.top().first->Idx == 0 || Worklist.top().first->Idx == 1))
16854 while (!Worklist.empty() && Worklist.top().second.first > 0) {
16855 TreeEntry *TE = Worklist.top().first;
16856 if (TE->isGather() || TE->Idx == 0 || DeletedNodes.contains(TE) ||
16859 (TE->UserTreeIndex &&
16860 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize &&
16862 ArrayRef<TreeEntry *> Entries = getSplitTreeEntries(V);
16863 return Entries.size() > 1;
16871 if (SubtreeCost < TE->Scalars.size()) {
16875 if (!TransformedToGatherNodes.empty()) {
16876 for (
unsigned Idx : Worklist.top().second.second) {
16877 auto It = TransformedToGatherNodes.find(VectorizableTree[Idx].
get());
16878 if (It != TransformedToGatherNodes.end()) {
16879 SubtreeCost -= SubtreeCosts[Idx].first;
16880 SubtreeCost += It->second;
16884 if (SubtreeCost < 0 || SubtreeCost < TE->Scalars.size()) {
16888 const unsigned Sz = TE->Scalars.size();
16890 for (
auto [Idx, V] :
enumerate(TE->Scalars)) {
16897 const unsigned EntryVF = TE->getVectorFactor();
16900 *TTI, ScalarTy, VecTy, DemandedElts,
16903 if (!TE->ReorderIndices.empty() &&
16904 TE->State != TreeEntry::CompressVectorize &&
16905 (TE->State != TreeEntry::StridedVectorize ||
16908 if (TE->getOpcode() == Instruction::Store) {
16910 NewMask.
resize(TE->ReorderIndices.size());
16911 copy(TE->ReorderIndices, NewMask.
begin());
16917 if (!TE->ReuseShuffleIndices.empty())
16918 ::addMask(Mask, TE->ReuseShuffleIndices);
16925 return (TE->hasCopyableElements() && TE->isCopyableElement(V)) ||
16926 isConstant(V) || isGathered(V) || getTreeEntries(V).size() > 1;
16930 if (SubtreeCost > GatherCost) {
16933 if (VectorizableTree.front()->hasState() &&
16934 VectorizableTree.front()->getOpcode() == Instruction::InsertElement &&
16938 LLVM_DEBUG(
dbgs() <<
"SLP: Trimming unprofitable subtree at node "
16939 << TE->Idx <<
" with cost "
16940 << Worklist.top().second.first <<
" and gather cost "
16941 << GatherCost <<
".\n");
16942 if (TE->UserTreeIndex) {
16943 TransformedToGatherNodes.try_emplace(TE, GatherCost);
16944 NodesCosts.
erase(TE);
16946 DeletedNodes.insert(TE);
16947 TransformedToGatherNodes.erase(TE);
16948 NodesCosts.
erase(TE);
16950 for (
unsigned Idx : Worklist.top().second.second) {
16951 TreeEntry &ChildTE = *VectorizableTree[Idx];
16952 DeletedNodes.insert(&ChildTE);
16953 TransformedToGatherNodes.erase(&ChildTE);
16954 NodesCosts.
erase(&ChildTE);
16961 return SubtreeCosts.
front().first;
16970 for (TreeEntry *TE : GatheredLoadsNodes) {
16971 if (DeletedNodes.contains(TE) || TransformedToGatherNodes.contains(TE))
16973 GatheredLoadsToDelete.
insert(TE);
16977 for (
Value *V : TE->Scalars) {
16978 unsigned Pos = TE->findLaneForValue(V);
16979 for (
const TreeEntry *BVE : ValueToGatherNodes.lookup(V)) {
16980 if (DeletedNodes.contains(BVE))
16982 DemandedElts.
setBit(Pos);
16983 ValuesToInsert.
try_emplace(BVE).first->second.push_back(V);
16986 if (!DemandedElts.
isZero()) {
16987 Type *ScalarTy = TE->Scalars.front()->getType();
16990 *TTI, ScalarTy, VecTy, DemandedElts,
16993 for (
const auto &[BVE, Values] : ValuesToInsert) {
16997 for (
Value *V : Values) {
16998 unsigned Pos = BVE->findLaneForValue(V);
17000 BVDemandedElts.
setBit(Pos);
17002 auto *BVVecTy =
getWidenedType(ScalarTy, BVE->getVectorFactor());
17004 *TTI, ScalarTy, BVVecTy, BVDemandedElts,
17008 if (ExtractsCost < BVCost) {
17009 LoadsExtractsCost += ExtractsCost;
17010 GatheredLoadsToDelete.
erase(TE);
17013 LoadsExtractsCost += BVCost;
17015 NodesCosts.
erase(TE);
17019 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
17020 if (TE->UserTreeIndex &&
17021 GatheredLoadsToDelete.
contains(TE->UserTreeIndex.UserTE)) {
17022 DeletedNodes.insert(TE.get());
17023 NodesCosts.
erase(TE.get());
17024 GatheredLoadsToDelete.
insert(TE.get());
17028 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
17029 if (!TE->UserTreeIndex && TransformedToGatherNodes.contains(TE.get())) {
17030 assert(TE->getOpcode() == Instruction::Load &&
"Expected load only.");
17033 if (DeletedNodes.contains(TE.get()))
17035 if (!NodesCosts.
contains(TE.get())) {
17037 getEntryCost(TE.get(), VectorizedVals, CheckedExtracts);
17042 LLVM_DEBUG(
dbgs() <<
"SLP: Recalculate costs after tree trimming.\n");
17044 for (
const auto &
P : NodesCosts) {
17045 NewCost +=
P.second;
17046 LLVM_DEBUG(
dbgs() <<
"SLP: Adding cost " <<
P.second <<
" for bundle "
17049 <<
"SLP: Current total cost = " << Cost <<
"\n");
17051 if (NewCost + LoadsExtractsCost >= Cost) {
17052 DeletedNodes.clear();
17053 TransformedToGatherNodes.clear();
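// NOTE (editorial summary, not from the original file) of the trimming logic
// above: per-node costs are accumulated into per-subtree costs, the subtrees
// are visited from most to least expensive via a max-heap ordered by
// FirstGreater, and any subtree whose vectorized cost exceeds the cost of
// simply gathering its scalars is turned back into a gather (or deleted
// together with its children). If the recomputed total, including the extra
// extract costs for gathered loads, is not an improvement, the whole trimming
// decision is rolled back, as in the branch just above.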
template <typename T> struct ShuffledInsertData {
17076 none_of(ExternalUses, [](
const ExternalUser &EU) {
17087 std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
17094 for (ExternalUser &EU : ExternalUses) {
17095 ScalarUserAndIdx.
emplace_back(EU.Scalar, EU.User, EU.Lane);
17098 for (ExternalUser &EU : ExternalUses) {
17099 LLVM_DEBUG(
dbgs() <<
"SLP: Computing cost for external use of TreeEntry "
17100 << EU.E.Idx <<
" in lane " << EU.Lane <<
"\n");
17102 else dbgs() <<
" User: nullptr\n");
17103 LLVM_DEBUG(
dbgs() <<
" Use: " << EU.Scalar->getNameOrAsOperand() <<
"\n");
17108 if (EphValues.count(EU.User))
17112 if (!CheckedScalarUser.
insert(std::make_pair(EU.Scalar, EU.User)).second ||
17114 CheckedScalarUser.
contains(std::make_pair(EU.Scalar,
nullptr))))
17122 (!DT->isReachableFromEntry(UserParent) || UserParent->isEHPad() ||
17128 !ExtractCostCalculated.
insert(EU.Scalar).second)
17141 if (!UsedInserts.
insert(VU).second)
17145 const TreeEntry *ScalarTE = &EU.E;
17148 [
this, VU](
const ShuffledInsertData<const TreeEntry *> &
Data) {
17153 Value *Op0 =
II->getOperand(0);
17160 if (It == ShuffledInserts.
end()) {
17162 Data.InsertElements.emplace_back(VU);
17164 VecId = ShuffledInserts.
size() - 1;
17165 auto It = MinBWs.find(ScalarTE);
17166 if (It != MinBWs.end() &&
17168 .
insert(std::make_pair(ScalarTE, FTy->getElementType()))
17170 unsigned BWSz = It->second.first;
17171 unsigned DstBWSz = DL->getTypeSizeInBits(FTy->getElementType());
17172 unsigned VecOpcode;
17173 if (DstBWSz < BWSz)
17174 VecOpcode = Instruction::Trunc;
17177 It->second.second ? Instruction::SExt : Instruction::ZExt;
17182 FTy->getNumElements()),
17185 <<
" for extending externally used vector with "
17186 "non-equal minimum bitwidth.\n");
17191 It->InsertElements.front() = VU;
17192 VecId = std::distance(ShuffledInserts.
begin(), It);
17194 int InIdx = *InsertIdx;
17196 ShuffledInserts[VecId].ValueMasks[ScalarTE];
17199 Mask[InIdx] = EU.Lane;
17200 DemandedElts[VecId].setBit(InIdx);
17211 auto *ScalarTy = EU.Scalar->getType();
17212 const unsigned BundleWidth = EU.E.getVectorFactor();
17213 assert(EU.Lane < BundleWidth &&
"Extracted lane out of bounds.");
17215 const TreeEntry *Entry = &EU.E;
17216 auto It = MinBWs.find(Entry);
17217 if (It != MinBWs.end()) {
17222 ? Instruction::ZExt
17223 : Instruction::SExt;
17228 << ExtraCost <<
"\n");
17232 CostKind, EU.Lane, EU.Scalar, ScalarUserAndIdx);
17233 LLVM_DEBUG(
dbgs() <<
" ExtractElement cost for " << *ScalarTy <<
" from "
17234 << *VecTy <<
": " << ExtraCost <<
"\n");
17237 if (Entry->Idx != 0 || Entry->getOpcode() == Instruction::GetElementPtr ||
17238 Entry->getOpcode() == Instruction::Load) {
17240 auto IsPhiInLoop = [&](
const ExternalUser &U) {
17243 const Loop *L = LI->getLoopFor(Phi->getParent());
17244 return L && (Phi->getParent() ==
I->getParent() ||
17245 L == LI->getLoopFor(
I->getParent()));
17249 if (!ValueToExtUses) {
17250 ValueToExtUses.emplace();
17251 for (
const auto &
P :
enumerate(ExternalUses)) {
17253 if (IsPhiInLoop(
P.value()))
17256 ValueToExtUses->try_emplace(
P.value().Scalar,
P.index());
17263 auto OperandIsScalar = [&](
Value *V) {
17269 return !EE->hasOneUse() || !MustGather.contains(EE);
17272 return ValueToExtUses->contains(V);
17274 bool CanBeUsedAsScalar =
all_of(Inst->operands(), OperandIsScalar);
17275 bool CanBeUsedAsScalarCast =
false;
17278 Op &&
all_of(
Op->operands(), OperandIsScalar)) {
17283 if (ScalarCost + OpCost <= ExtraCost) {
17284 CanBeUsedAsScalar = CanBeUsedAsScalarCast =
true;
17285 ScalarCost += OpCost;
17289 if (CanBeUsedAsScalar) {
17290 bool KeepScalar = ScalarCost <= ExtraCost;
17294 bool IsProfitablePHIUser =
17296 VectorizableTree.front()->Scalars.size() > 2)) &&
17297 VectorizableTree.front()->hasState() &&
17298 VectorizableTree.front()->getOpcode() == Instruction::PHI &&
17302 auto *PHIUser = dyn_cast<PHINode>(U);
17303 return (!PHIUser ||
17304 PHIUser->getParent() !=
17306 VectorizableTree.front()->getMainOp())
17311 return ValueToExtUses->contains(V);
17313 if (IsProfitablePHIUser) {
17317 (!GatheredLoadsEntriesFirst.has_value() ||
17318 Entry->Idx < *GatheredLoadsEntriesFirst)) {
17319 unsigned ScalarUsesCount =
count_if(Entry->Scalars, [&](
Value *V) {
17320 return ValueToExtUses->contains(V);
17322 auto It = ExtractsCount.
find(Entry);
17323 if (It != ExtractsCount.
end()) {
17324 assert(ScalarUsesCount >= It->getSecond().size() &&
17325 "Expected total number of external uses not less than "
17326 "number of scalar uses.");
17327 ScalarUsesCount -= It->getSecond().size();
17332 KeepScalar = ScalarUsesCount <= 1 || !
has_single_bit(ScalarUsesCount);
17335 ExternalUsesAsOriginalScalar.insert(EU.Scalar);
17336 for (
Value *V : Inst->operands()) {
17337 auto It = ValueToExtUses->find(V);
17338 if (It != ValueToExtUses->end()) {
17340 ExternalUses[It->second].User =
nullptr;
17343 ExtraCost = ScalarCost;
17344 if (!IsPhiInLoop(EU))
17345 ExtractsCount[Entry].
insert(Inst);
17346 if (CanBeUsedAsScalarCast) {
17347 ScalarOpsFromCasts.
insert(Inst->getOperand(0));
17351 for (
Value *V : IOp->operands()) {
17352 auto It = ValueToExtUses->find(V);
17353 if (It != ValueToExtUses->end()) {
17355 ExternalUses[It->second].User =
nullptr;
17364 ExtractCost += ExtraCost;
17368 for (
Value *V : ScalarOpsFromCasts) {
17369 ExternalUsesAsOriginalScalar.insert(V);
17371 const auto *It =
find_if_not(TEs, [&](TreeEntry *TE) {
17372 return TransformedToGatherNodes.contains(TE) ||
17373 DeletedNodes.contains(TE);
17375 if (It != TEs.end()) {
17376 const TreeEntry *UserTE = *It;
17377 ExternalUses.emplace_back(V,
nullptr, *UserTE,
17378 UserTE->findLaneForValue(V));
17383 if (!VectorizedVals.
empty()) {
17384 const TreeEntry &Root = *VectorizableTree.front();
17385 auto BWIt = MinBWs.find(&Root);
17386 if (BWIt != MinBWs.end()) {
17387 Type *DstTy = Root.Scalars.front()->getType();
17388 unsigned OriginalSz = DL->getTypeSizeInBits(DstTy->
getScalarType());
17390 ReductionBitWidth == 0 ? BWIt->second.first : ReductionBitWidth;
17391 if (OriginalSz != SrcSz) {
17392 unsigned Opcode = Instruction::Trunc;
17393 if (OriginalSz > SrcSz)
17394 Opcode = BWIt->second.second ? Instruction::SExt : Instruction::ZExt;
17400 Cost += TTI->getCastInstrCost(Opcode, DstTy, SrcTy,
17411 VectorizableTree[1]->hasState() &&
17412 VectorizableTree[1]->State == TreeEntry::Vectorize &&
17413 all_of(VectorizableTree[1]->Scalars, [&](
Value *V) {
17414 return ExternalUsesAsOriginalScalar.contains(V);
17418 Cost += ExtractCost;
17419 auto &&ResizeToVF = [
this, &Cost](
const TreeEntry *TE,
ArrayRef<int> Mask,
17420 bool ForSingleMask) {
17422 unsigned VF = Mask.size();
17423 unsigned VecVF = TE->getVectorFactor();
17424 bool HasLargeIndex =
17425 any_of(Mask, [VF](
int Idx) {
return Idx >=
static_cast<int>(VF); });
17426 if ((VF != VecVF && HasLargeIndex) ||
17429 if (HasLargeIndex) {
17431 std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)),
17437 dbgs() <<
"SLP: Adding cost " <<
C
17438 <<
" for final shuffle of insertelement external users.\n";
17439 TE->dump();
dbgs() <<
"SLP: Current total cost = " << Cost <<
"\n");
17441 return std::make_pair(TE,
true);
17444 if (!ForSingleMask) {
17446 for (
unsigned I = 0;
I < VF; ++
I) {
17448 ResizeMask[Mask[
I]] = Mask[
I];
17455 dbgs() <<
"SLP: Adding cost " <<
C
17456 <<
" for final shuffle of insertelement external users.\n";
17457 TE->dump();
dbgs() <<
"SLP: Current total cost = " << Cost <<
"\n");
17462 return std::make_pair(TE,
false);
17465 for (
int I = 0, E = ShuffledInserts.
size();
I < E; ++
I) {
17466 Value *
Base = ShuffledInserts[
I].InsertElements.
front()->getOperand(0);
17467 auto Vector = ShuffledInserts[
I].ValueMasks.takeVector();
17471 assert((TEs.size() == 1 || TEs.size() == 2) &&
17472 "Expected exactly 1 or 2 tree entries.");
17473 if (TEs.size() == 1) {
17475 VF = TEs.front()->getVectorFactor();
17476 auto *FTy =
getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
17480 (
Data.index() < VF &&
17481 static_cast<int>(
Data.index()) ==
Data.value());
17486 <<
" for final shuffle of insertelement "
17487 "external users.\n";
17488 TEs.front()->
dump();
17489 dbgs() <<
"SLP: Current total cost = " << Cost <<
"\n");
17495 TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor())
17496 VF = TEs.front()->getVectorFactor();
17500 auto *FTy =
getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
17504 <<
" for final shuffle of vector node and external "
17505 "insertelement users.\n";
17506 if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump();
17507 dbgs() <<
"SLP: Current total cost = " << Cost <<
"\n");
17515 [](
const TreeEntry *E) { return E->getVectorFactor(); }, ResizeToVF,
17516 EstimateShufflesCost);
17519 ShuffledInserts[
I].InsertElements.
front()->getType()),
17522 Cost -= InsertCost;
  if (ReductionBitWidth != 0) {
    assert(UserIgnoreList && "Expected reduction tree.");
    const TreeEntry &E = *VectorizableTree.front();
    auto It = MinBWs.find(&E);
    if (It != MinBWs.end() && It->second.first != ReductionBitWidth) {
      unsigned SrcSize = It->second.first;
      unsigned DstSize = ReductionBitWidth;
      unsigned Opcode = Instruction::Trunc;
      if (SrcSize < DstSize) {
        bool IsArithmeticExtendedReduction =
            all_of(*UserIgnoreList, [](Value *V) {
              auto *I = cast<Instruction>(V);
              return is_contained({Instruction::Add, Instruction::FAdd,
                                   Instruction::Mul, Instruction::FMul,
                                   Instruction::And, Instruction::Or,
                                   /* ... */},
                                  I->getOpcode());
            });
        if (IsArithmeticExtendedReduction)
          Opcode = Instruction::BitCast;
        else
          Opcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
      }
      if (Opcode != Instruction::BitCast) {
        auto *SrcVecTy =
            getWidenedType(Builder.getIntNTy(SrcSize), E.getVectorFactor());
        auto *DstVecTy =
            getWidenedType(Builder.getIntNTy(DstSize), E.getVectorFactor());
        TTI::CastContextHint CCH = TTI::CastContextHint::None;
        switch (E.getOpcode()) {
        case Instruction::SExt:
        case Instruction::ZExt:
        case Instruction::Trunc: {
          const TreeEntry *OpTE = getOperandEntry(&E, 0);
          CCH = getCastContextHint(*OpTE);
          break;
        }
        default:
          break;
        }
        InstructionCost CastCost = 0;
        CastCost += TTI->getCastInstrCost(Opcode, DstVecTy, SrcVecTy, CCH,
                                          /* ... */);
        Cost += CastCost;
        LLVM_DEBUG(dbgs() << "SLP: Adding cost " << CastCost
                          << " for final resize for reduction from "
                          << SrcVecTy << " to " << DstVecTy << "\n";
                   dbgs() << "SLP: Current total cost = " << Cost << "\n");
      }
    }
  }
  std::optional<InstructionCost> SpillCost;
  /* ... */ {
    SpillCost = getSpillCost();
    Cost += *SpillCost;
  }
#ifndef NDEBUG
  SmallString<256> Str;
  {
    raw_svector_ostream OS(Str);
    OS << "SLP: Spill Cost = ";
    /* ... */
    OS << ".\nSLP: Extract Cost = " << ExtractCost << ".\n"
       << "SLP: Total Cost = " << Cost << ".\n";
  }
  LLVM_DEBUG(dbgs() << Str);
  if (ViewSLPTree)
    ViewGraph(this, "SLP" + F->getName(), false, Str);
#endif
  return Cost;
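// Scans a list of gathered scalars for extractelement instructions that all
// read from at most two common source vectors and, when found, rewrites them
// as a single-register shuffle, returning the shuffle kind (or std::nullopt).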
std::optional<TTI::ShuffleKind>
BoUpSLP::tryToGatherSingleRegisterExtractElements(
    MutableArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) const {
  MapVector<Value *, SmallVector<int>> VectorOpToIdx;
  SmallVector<int> UndefVectorExtracts;
  for (int I = 0, E = VL.size(); I < E; ++I) {
    auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
    /* ... */
    if (Idx >= VecTy->getNumElements()) {
      UndefVectorExtracts.push_back(I);
      continue;
    }
    SmallBitVector ExtractMask(VecTy->getNumElements(), true);
    ExtractMask.reset(*Idx);
    /* ... */
    VectorOpToIdx[EI->getVectorOperand()].push_back(I);
  }
  SmallVector<std::pair<Value *, SmallVector<int>>> Vectors =
      VectorOpToIdx.takeVector();
  stable_sort(Vectors, [](const auto &P1, const auto &P2) {
    return P1.second.size() > P2.second.size();
  });
  const int UndefSz = UndefVectorExtracts.size();
  unsigned SingleMax = 0;
  unsigned PairMax = 0;
  if (!Vectors.empty()) {
    SingleMax = Vectors.front().second.size() + UndefSz;
    if (Vectors.size() > 1) {
      auto *ItNext = std::next(Vectors.begin());
      PairMax = SingleMax + ItNext->second.size();
    }
  }
  if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
    return std::nullopt;
  /* ... */
  if (SingleMax >= PairMax && SingleMax) {
    for (int Idx : Vectors.front().second)
      std::swap(GatheredExtracts[Idx], VL[Idx]);
  } else if (!Vectors.empty()) {
    for (unsigned Idx : {0, 1})
      for (int Idx : Vectors[Idx].second)
        std::swap(GatheredExtracts[Idx], VL[Idx]);
  }
  for (int Idx : UndefVectorExtracts)
    std::swap(GatheredExtracts[Idx], VL[Idx]);
  std::optional<TTI::ShuffleKind> Res = /* ... */;
  if (!Res)
    return std::nullopt;
  /* ... */
  for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) {
    /* ... */
  }
  return Res;
}
SmallVector<std::optional<TTI::ShuffleKind>>
BoUpSLP::tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
                                    SmallVectorImpl<int> &Mask,
                                    unsigned NumParts) const {
  assert(NumParts > 0 && "NumParts expected be greater than or equal to 1.");
  SmallVector<std::optional<TTI::ShuffleKind>> ShufflesRes(NumParts);
  Mask.assign(VL.size(), PoisonMaskElem);
  unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
  for (unsigned Part : seq<unsigned>(NumParts)) {
    MutableArrayRef<Value *> SubVL =
        MutableArrayRef(VL).slice(Part * SliceSize, /* ... */);
    SmallVector<int> SubMask;
    std::optional<TTI::ShuffleKind> Res =
        tryToGatherSingleRegisterExtractElements(SubVL, SubMask);
    ShufflesRes[Part] = Res;
    copy(SubMask, std::next(Mask.begin(), Part * SliceSize));
  }
  if (none_of(ShufflesRes, [](const std::optional<TTI::ShuffleKind> &Res) {
        return Res.has_value();
      }))
    ShufflesRes.clear();
  return ShufflesRes;
}
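// Checks whether the gathered scalars of TE (for a single vector register)
// can instead be produced by shuffling one or two already-built tree entries;
// on success the matching entries and the shuffle mask are recorded.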
std::optional<TargetTransformInfo::ShuffleKind>
BoUpSLP::isGatherShuffledSingleRegisterEntry(
    const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
    SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part,
    bool ForOrder) {
  Entries.clear();
  if (/* ... */)
    return std::nullopt;
  auto GetUserEntry = [&](const TreeEntry *TE) {
    while (TE->UserTreeIndex && TE->UserTreeIndex.EdgeIdx == UINT_MAX)
      TE = TE->UserTreeIndex.UserTE;
    if (TE == VectorizableTree.front().get())
      return EdgeInfo(const_cast<TreeEntry *>(TE), 0);
    return TE->UserTreeIndex;
  };
  auto HasGatherUser = [&](const TreeEntry *TE) {
    while (TE->Idx != 0 && TE->UserTreeIndex) {
      if (TE->UserTreeIndex.EdgeIdx == UINT_MAX)
        return true;
      TE = TE->UserTreeIndex.UserTE;
    }
    return false;
  };
  const EdgeInfo TEUseEI = GetUserEntry(TE);
  if (!TEUseEI)
    return std::nullopt;
  const Instruction *TEInsertPt = &getLastInstructionInBundle(TEUseEI.UserTE);
  const BasicBlock *TEInsertBlock = nullptr;
  if (auto *PHI = dyn_cast_or_null<PHINode>(
          TEUseEI.UserTE->hasState() ? TEUseEI.UserTE->getMainOp() : nullptr);
      PHI && TEUseEI.UserTE->State != TreeEntry::SplitVectorize) {
    TEInsertBlock = PHI->getIncomingBlock(TEUseEI.EdgeIdx);
    /* ... */
  } else {
    TEInsertBlock = TEInsertPt->getParent();
  }
  if (!DT->isReachableFromEntry(TEInsertBlock))
    return std::nullopt;
  auto *NodeUI = DT->getNode(TEInsertBlock);
  assert(NodeUI && "Should only process reachable instructions");
  auto CheckOrdering = [&](const Instruction *InsertPt) {
    const BasicBlock *InsertBlock = InsertPt->getParent();
    auto *NodeEUI = DT->getNode(InsertBlock);
    /* ... */
    assert((NodeUI == NodeEUI) ==
               (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) &&
           "Different nodes should have different DFS numbers");
    if (TEInsertPt->getParent() != InsertBlock &&
        (DT->dominates(NodeUI, NodeEUI) || !DT->dominates(NodeEUI, NodeUI)))
      return false;
    if (TEInsertPt->getParent() == InsertBlock && /* ... */)
      return false;
    return true;
  };
  SmallDenseMap<Value *, int> UsedValuesEntry;
  SmallPtrSet<const Value *, 16> VisitedValue;
  auto CheckAndUseSameNode = [&](const TreeEntry *TEPtr) {
    if ((TEPtr->getVectorFactor() != VL.size() &&
         TEPtr->Scalars.size() != VL.size()) ||
        (!TEPtr->isSame(VL) && !TEPtr->isSame(TE->Scalars)))
      return false;
    /* ... */
    for (Value *V : VL) {
      /* ... */
    }
    return true;
  };
  auto CheckParentNodes = [&](const TreeEntry *User1, const TreeEntry *User2,
                              unsigned EdgeIdx) {
    const TreeEntry *Ptr1 = User1;
    const TreeEntry *Ptr2 = User2;
    SmallDenseMap<const TreeEntry *, unsigned> PtrToIdx;
    while (/* ... */) {
      /* ... */
      EdgeIdx = Ptr2->UserTreeIndex.EdgeIdx;
      Ptr2 = Ptr2->UserTreeIndex.UserTE;
    }
    while (/* ... */) {
      unsigned Idx = Ptr1->UserTreeIndex.EdgeIdx;
      Ptr1 = Ptr1->UserTreeIndex.UserTE;
      if (auto It = PtrToIdx.find(Ptr1); It != PtrToIdx.end())
        return Idx < It->second;
    }
    return false;
  };
  auto CheckNonSchedulableOrdering = [&](const TreeEntry *E,
                                         const Instruction *InsertPt) {
    return TEUseEI && TEUseEI.UserTE &&
           TEUseEI.UserTE->hasCopyableElements() &&
           !TEUseEI.UserTE->isCopyableElement(/* ... */) &&
           InsertPt->getNextNode() == TEInsertPt &&
           (!E->hasCopyableElements() || !E->isCopyableElement(InsertPt) ||
            /* ... */);
  };
17863 for (
Value *V : VL) {
17867 SmallPtrSet<const TreeEntry *, 4> VToTEs;
17869 ValueToGatherNodes.lookup(V).takeVector());
17870 if (TransformedToGatherNodes.contains(TE)) {
17871 for (TreeEntry *
E : getSplitTreeEntries(V)) {
17872 if (TE ==
E || !TransformedToGatherNodes.contains(
E) ||
17873 !
E->UserTreeIndex ||
E->UserTreeIndex.UserTE->isGather())
17875 GatherNodes.push_back(
E);
17877 for (TreeEntry *
E : getTreeEntries(V)) {
17878 if (TE ==
E || !TransformedToGatherNodes.contains(
E) ||
17879 !
E->UserTreeIndex ||
E->UserTreeIndex.UserTE->isGather())
17881 GatherNodes.push_back(
E);
17884 for (
const TreeEntry *TEPtr : GatherNodes) {
17885 if (TEPtr == TE || TEPtr->Idx == 0 || DeletedNodes.contains(TEPtr))
17888 [&](
Value *V) { return GatheredScalars.contains(V); }) &&
17889 "Must contain at least single gathered value.");
17890 assert(TEPtr->UserTreeIndex &&
17891 "Expected only single user of a gather node.");
17892 const EdgeInfo &UseEI = TEPtr->UserTreeIndex;
17894 PHINode *UserPHI = (UseEI.UserTE->State != TreeEntry::SplitVectorize &&
17895 UseEI.UserTE->hasState())
17900 : &getLastInstructionInBundle(UseEI.UserTE);
17901 if (TEInsertPt == InsertPt) {
17903 if (TEUseEI.UserTE->State == TreeEntry::Vectorize &&
17904 (TEUseEI.UserTE->getOpcode() != Instruction::PHI ||
17905 TEUseEI.UserTE->isAltShuffle()) &&
17907 if (UseEI.UserTE->State != TreeEntry::Vectorize ||
17908 (UseEI.UserTE->hasState() &&
17909 UseEI.UserTE->getOpcode() == Instruction::PHI &&
17910 !UseEI.UserTE->isAltShuffle()) ||
17919 (TEUseEI.UserTE != UseEI.UserTE || TEUseEI.EdgeIdx < UseEI.EdgeIdx))
17922 if (TEUseEI.UserTE->State == TreeEntry::Vectorize &&
17923 TEUseEI.UserTE->getOpcode() == Instruction::PHI &&
17924 UseEI.UserTE->State == TreeEntry::Vectorize &&
17925 UseEI.UserTE->getOpcode() == Instruction::PHI &&
17926 TEUseEI.UserTE != UseEI.UserTE)
17931 if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx)
17935 if (TEUseEI.UserTE != UseEI.UserTE &&
17936 (TEUseEI.UserTE->Idx < UseEI.UserTE->Idx ||
17937 HasGatherUser(TEUseEI.UserTE)))
17940 if (CheckParentNodes(TEUseEI.UserTE, UseEI.UserTE, UseEI.EdgeIdx))
17944 if (!TEUseEI.UserTE->isGather() && !UserPHI &&
17945 TEUseEI.UserTE->doesNotNeedToSchedule() !=
17946 UseEI.UserTE->doesNotNeedToSchedule() &&
17951 if ((TEInsertBlock != InsertPt->
getParent() ||
17952 TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) &&
17953 (!CheckOrdering(InsertPt) ||
17954 (UseEI.UserTE->hasCopyableElements() &&
17959 if (CheckAndUseSameNode(TEPtr))
17964 if (CheckNonSchedulableOrdering(UseEI.UserTE, InsertPt))
17969 const auto *It =
find_if(VTEs, [&](
const TreeEntry *MTE) {
17970 return MTE !=
TE && MTE != TEUseEI.UserTE &&
17971 !DeletedNodes.contains(MTE) &&
17972 !TransformedToGatherNodes.contains(MTE);
17974 if (It != VTEs.end()) {
17975 const TreeEntry *VTE = *It;
17976 if (
none_of(
TE->CombinedEntriesWithIndices,
17977 [&](
const auto &
P) { return P.first == VTE->Idx; })) {
17978 Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
17979 if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
17983 if (CheckAndUseSameNode(VTE))
17989 const auto *It =
find_if(VTEs, [&, MainTE = TE](
const TreeEntry *TE) {
17990 return TE != MainTE && !DeletedNodes.contains(TE) &&
17991 !TransformedToGatherNodes.contains(TE);
17993 if (It != VTEs.end()) {
17994 const TreeEntry *VTE = *It;
17995 if (ForOrder && VTE->Idx < GatheredLoadsEntriesFirst.value_or(0) &&
17996 VTEs.size() > 1 && VTE->State != TreeEntry::Vectorize) {
17997 VTEs = VTEs.drop_front();
17999 const auto *MIt =
find_if(VTEs, [](
const TreeEntry *MTE) {
18000 return MTE->State == TreeEntry::Vectorize;
18002 if (MIt == VTEs.end())
18006 if (
none_of(
TE->CombinedEntriesWithIndices,
18007 [&](
const auto &
P) { return P.first == VTE->Idx; })) {
18008 Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
18009 if (&LastBundleInst == TEInsertPt ||
18010 !CheckOrdering(&LastBundleInst) ||
18011 CheckNonSchedulableOrdering(VTE, &LastBundleInst))
18015 if (CheckAndUseSameNode(VTE))
18020 if (VToTEs.
empty())
18022 if (UsedTEs.
empty()) {
18030 SmallPtrSet<const TreeEntry *, 4> SavedVToTEs(VToTEs);
18032 for (SmallPtrSet<const TreeEntry *, 4> &Set : UsedTEs) {
18036 if (!VToTEs.
empty()) {
18042 VToTEs = SavedVToTEs;
18047 if (Idx == UsedTEs.
size()) {
18051 if (UsedTEs.
size() == 2)
18053 UsedTEs.push_back(SavedVToTEs);
18054 Idx = UsedTEs.
size() - 1;
18060 if (UsedTEs.
empty()) {
18062 return std::nullopt;
18066 if (UsedTEs.
size() == 1) {
18069 UsedTEs.front().
end());
18070 sort(FirstEntries, [](
const TreeEntry *TE1,
const TreeEntry *TE2) {
18071 return TE1->Idx < TE2->Idx;
18074 auto *It =
find_if(FirstEntries, [=](
const TreeEntry *EntryPtr) {
18075 return EntryPtr->isSame(VL) || EntryPtr->isSame(
TE->Scalars);
18077 if (It != FirstEntries.end() &&
18078 ((*It)->getVectorFactor() == VL.size() ||
18079 ((*It)->getVectorFactor() ==
TE->Scalars.size() &&
18080 TE->ReuseShuffleIndices.size() == VL.size() &&
18081 (*It)->isSame(
TE->Scalars)))) {
18083 if ((*It)->getVectorFactor() == VL.size()) {
18084 std::iota(std::next(
Mask.begin(), Part * VL.size()),
18085 std::next(
Mask.begin(), (Part + 1) * VL.size()), 0);
18087 SmallVector<int> CommonMask =
TE->getCommonMask();
18098 Entries.
push_back(FirstEntries.front());
18100 for (
auto &
P : UsedValuesEntry)
18102 VF = FirstEntries.front()->getVectorFactor();
18105 assert(UsedTEs.
size() == 2 &&
"Expected at max 2 permuted entries.");
18107 DenseMap<int, const TreeEntry *> VFToTE;
18108 for (
const TreeEntry *TE : UsedTEs.front()) {
18109 unsigned VF =
TE->getVectorFactor();
18110 auto It = VFToTE.
find(VF);
18111 if (It != VFToTE.
end()) {
18112 if (It->second->Idx >
TE->Idx)
18113 It->getSecond() =
TE;
18120 UsedTEs.back().
end());
18121 sort(SecondEntries, [](
const TreeEntry *TE1,
const TreeEntry *TE2) {
18122 return TE1->Idx < TE2->Idx;
18124 for (
const TreeEntry *TE : SecondEntries) {
18125 auto It = VFToTE.
find(
TE->getVectorFactor());
18126 if (It != VFToTE.
end()) {
18135 if (Entries.
empty()) {
18137 UsedTEs.front(), [](
const TreeEntry *TE1,
const TreeEntry *TE2) {
18138 return TE1->Idx < TE2->Idx;
18140 Entries.
push_back(SecondEntries.front());
18141 VF = std::max(Entries.
front()->getVectorFactor(),
18142 Entries.
back()->getVectorFactor());
18144 VF = Entries.
front()->getVectorFactor();
18147 for (
const TreeEntry *
E : Entries)
18151 for (
auto &
P : UsedValuesEntry) {
18153 if (ValuesToEntries[Idx].
contains(
P.first)) {
18163 auto AreCompatiblePHIs = [&](
Value *
V,
Value *V1) {
18170 for (
int I = 0,
E =
PHI->getNumIncomingValues();
I <
E; ++
I) {
18172 Value *In1 = PHI1->getIncomingValue(
I);
18187 auto MightBeIgnored = [=](
Value *
V) {
18191 !areAllUsersVectorized(
I, UserIgnoreList) &&
isSimple(
I);
18196 auto NeighborMightBeIgnored = [&](
Value *
V,
int Idx) {
18197 Value *V1 = VL[Idx];
18198 bool UsedInSameVTE =
false;
18199 auto It = UsedValuesEntry.find(V1);
18200 if (It != UsedValuesEntry.end())
18201 UsedInSameVTE = It->second == UsedValuesEntry.find(V)->second;
18202 return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE &&
18209 SmallBitVector UsedIdxs(Entries.size());
18211 for (
int I = 0,
E = VL.size();
I <
E; ++
I) {
18213 auto It = UsedValuesEntry.find(V);
18214 if (It == UsedValuesEntry.end())
18220 ((
I > 0 && NeighborMightBeIgnored(V,
I - 1)) ||
18221 (
I !=
E - 1 && NeighborMightBeIgnored(V,
I + 1)))))
18223 unsigned Idx = It->second;
18230 for (
unsigned I = 0, Sz = Entries.size();
I < Sz; ++
I) {
18231 if (!UsedIdxs.test(
I))
18237 for (std::pair<unsigned, int> &Pair : EntryLanes)
18238 if (Pair.first ==
I)
18239 Pair.first = TempEntries.
size();
18242 Entries.swap(TempEntries);
18243 if (EntryLanes.size() == Entries.size() &&
18245 .slice(Part * VL.size(),
18246 std::min<int>(VL.size(),
TE->Scalars.size())))) {
18252 return std::nullopt;
18255 bool IsIdentity = Entries.size() == 1;
18258 for (
const std::pair<unsigned, int> &Pair : EntryLanes) {
18259 unsigned Idx = Part * VL.size() + Pair.second;
18262 (ForOrder ? std::distance(
18263 Entries[Pair.first]->Scalars.begin(),
18264 find(Entries[Pair.first]->Scalars, VL[Pair.second]))
18265 : Entries[Pair.first]->findLaneForValue(VL[Pair.second]));
18266 IsIdentity &=
Mask[Idx] == Pair.second;
18268 if (ForOrder || IsIdentity || Entries.empty()) {
18269 switch (Entries.size()) {
18271 if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
18275 if (EntryLanes.size() > 2 || VL.size() <= 2)
18282 (EntryLanes.size() > Entries.size() || VL.size() <= 2)) {
18284 SmallVector<int> SubMask(std::next(
Mask.begin(), Part * VL.size()),
18285 std::next(
Mask.begin(), (Part + 1) * VL.size()));
18286 int MinElement = SubMask.
front(), MaxElement = SubMask.
front();
18287 for (
int Idx : SubMask) {
18295 assert(MaxElement >= 0 && MinElement >= 0 &&
18296 MaxElement % VF >= MinElement % VF &&
18297 "Expected at least single element.");
18298 unsigned NewVF = std::max<unsigned>(
18300 (MaxElement % VF) -
18301 (MinElement % VF) + 1));
18303 for (
int &Idx : SubMask) {
18306 Idx = ((Idx % VF) - (((MinElement % VF) / NewVF) * NewVF)) % NewVF +
18307 (Idx >=
static_cast<int>(VF) ? NewVF : 0);
18315 auto *MaskVecTy =
getWidenedType(VL.front()->getType(), SubMask.size());
18316 auto GetShuffleCost = [&,
18317 &TTI = *TTI](ArrayRef<int>
Mask,
18320 if (Entries.size() == 1 && Entries.front()->getInterleaveFactor() > 0 &&
18322 Mask, Entries.front()->getInterleaveFactor()))
18324 return ::getShuffleCost(TTI,
18329 InstructionCost ShuffleCost = GetShuffleCost(SubMask, Entries, VecTy);
18331 SmallVector<int> FirstMask(SubMask.begin(), SubMask.end());
18332 if (Entries.size() == 1 || !Entries[0]->isGather()) {
18333 FirstShuffleCost = ShuffleCost;
18337 bool IsIdentity =
true;
18338 for (
auto [
I, Idx] :
enumerate(FirstMask)) {
18339 if (Idx >=
static_cast<int>(NewVF)) {
18344 IsIdentity &=
static_cast<int>(
I) == Idx;
18348 FirstShuffleCost = GetShuffleCost(FirstMask, Entries.front(), VecTy);
18350 *TTI, VL.front()->getType(), MaskVecTy, DemandedElts,
true,
18354 SmallVector<int> SecondMask(SubMask.begin(), SubMask.end());
18355 if (Entries.size() == 1 || !Entries[1]->isGather()) {
18356 SecondShuffleCost = ShuffleCost;
18360 bool IsIdentity =
true;
18361 for (
auto [
I, Idx] :
enumerate(SecondMask)) {
18362 if (Idx <
static_cast<int>(NewVF) && Idx >= 0) {
18368 IsIdentity &=
static_cast<int>(
I) == Idx;
18373 SecondShuffleCost = GetShuffleCost(SecondMask, Entries[1], VecTy);
18375 *TTI, VL.front()->getType(), MaskVecTy, DemandedElts,
true,
18383 *TTI, VL.front()->getType(), MaskVecTy, DemandedElts,
true,
18385 const TreeEntry *BestEntry =
nullptr;
18386 if (FirstShuffleCost < ShuffleCost) {
18387 std::for_each(std::next(
Mask.begin(), Part * VL.size()),
18388 std::next(
Mask.begin(), (Part + 1) * VL.size()),
18390 if (Idx >= static_cast<int>(VF))
18391 Idx = PoisonMaskElem;
18393 BestEntry = Entries.front();
18394 ShuffleCost = FirstShuffleCost;
18396 if (SecondShuffleCost < ShuffleCost) {
18397 std::for_each(std::next(
Mask.begin(), Part * VL.size()),
18398 std::next(
Mask.begin(), (Part + 1) * VL.size()),
18400 if (Idx < static_cast<int>(VF))
18401 Idx = PoisonMaskElem;
18405 BestEntry = Entries[1];
18406 ShuffleCost = SecondShuffleCost;
18408 if (BuildVectorCost >= ShuffleCost) {
18411 Entries.push_back(BestEntry);
18419 std::fill(std::next(
Mask.begin(), Part * VL.size()),
18421 return std::nullopt;
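// Per-register driver: splits the gathered scalars into NumParts slices and
// queries isGatherShuffledSingleRegisterEntry for each slice, collecting the
// per-part shuffle kinds and the source tree entries.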
SmallVector<std::optional<TTI::ShuffleKind>>
BoUpSLP::isGatherShuffledEntry(
    const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
    SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries,
    unsigned NumParts, bool ForOrder) {
  assert(NumParts > 0 && NumParts < VL.size() &&
         "Expected positive number of registers.");
  Entries.clear();
  if (TE == VectorizableTree.front().get() &&
      (!GatheredLoadsEntriesFirst.has_value() ||
       none_of(/* ... */,
               [](const std::unique_ptr<TreeEntry> &TE) {
                 return !TE->isGather();
               })))
    return {};
  if (TE->hasNonWholeRegisterOrNonPowerOf2Vec(*TTI))
    return {};
  Mask.assign(VL.size(), PoisonMaskElem);
  assert((TE->UserTreeIndex || TE == VectorizableTree.front().get()) &&
         "Expected only single user of the gather node.");
  assert(VL.size() % NumParts == 0 &&
         "Number of scalars must be divisible by NumParts.");
18449 if (
TE->UserTreeIndex &&
TE->UserTreeIndex.UserTE->isGather() &&
18450 TE->UserTreeIndex.EdgeIdx == UINT_MAX &&
18452 (
TE->hasState() &&
TE->getOpcode() == Instruction::ExtractElement) ||
18455 getSameValuesTreeEntry(
TE->getMainOp(),
TE->Scalars))))
18462 SmallVectorImpl<const TreeEntry *> &SubEntries = Entries.emplace_back();
18463 std::optional<TTI::ShuffleKind> SubRes =
18464 isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part,
18467 SubEntries.
clear();
18470 SubEntries.
front()->getVectorFactor() == VL.
size() &&
18471 (SubEntries.
front()->isSame(
TE->Scalars) ||
18472 SubEntries.
front()->isSame(VL))) {
18474 LocalSubEntries.
swap(SubEntries);
18477 std::iota(
Mask.begin(),
Mask.end(), 0);
18479 for (
int I = 0, Sz = VL.
size();
I < Sz; ++
I)
18482 Entries.emplace_back(1, LocalSubEntries.
front());
18488 [](
const std::optional<TTI::ShuffleKind> &SK) {
return !SK; })) {
InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
                                       Type *ScalarTy) const {
  const unsigned VF = VL.size();
  InstructionCost Cost;
  /* ... */
  auto EstimateInsertCost = [&](unsigned I, Value *V) {
    if (V->getType() != ScalarTy)
      Cost += TTI->getCastInstrCost(Instruction::Trunc, ScalarTy, V->getType(),
                                    /* ... */);
    /* ... */
  };
  SmallVector<int> ConstantShuffleMask(VF, PoisonMaskElem);
  std::iota(ConstantShuffleMask.begin(), ConstantShuffleMask.end(), 0);
  for (auto [I, V] : enumerate(VL)) {
    /* ... */
      ConstantShuffleMask[I] = I + VF;
    /* ... */
      EstimateInsertCost(I, V);
  }
  bool IsAnyNonUndefConst = /* ... */;
  if (!ForPoisonSrc && IsAnyNonUndefConst) {
    Cost += ::getShuffleCost(/* ... */, ConstantShuffleMask);
    /* ... */
  }
  if (!DemandedElements.isZero())
    Cost += /* ... */(/* ... */,
                      ForPoisonSrc && !IsAnyNonUndefConst, VL);
  return Cost;
}
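// Returns the last instruction of the bundle for tree entry E, i.e. the point
// after which the vectorized value of E can safely be materialized; results
// are cached in EntryToLastInstruction.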
Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
  auto It = EntryToLastInstruction.find(E);
  if (It != EntryToLastInstruction.end())
    return *It->second;
  /* ... */
  if (E->hasState()) {
    Front = E->getMainOp();
    Opcode = E->getOpcode();
  }
  /* ... */
  assert(((GatheredLoadsEntriesFirst.has_value() &&
           Opcode == Instruction::Load && E->isGather() &&
           E->Idx < *GatheredLoadsEntriesFirst) ||
          E->State == TreeEntry::SplitVectorize || E->hasCopyableElements() ||
          all_of(E->Scalars,
                 [=](Value *V) -> bool {
                   if (Opcode == Instruction::GetElementPtr &&
                       !isa<GetElementPtrInst>(V))
                     return true;
                   auto *I = dyn_cast<Instruction>(V);
                   return !I || !E->getMatchingMainOpOrAltOp(I) ||
                          I->getParent() == BB ||
                          isVectorLikeInstWithConstOps(I);
                 })) &&
         "Expected gathered loads or GEPs or instructions from same basic "
         /* ... */);
18576 auto FindLastInst = [&]() {
18578 for (
Value *V :
E->Scalars) {
18582 if (
E->isCopyableElement(
I))
18584 if (LastInst->
getParent() ==
I->getParent()) {
18589 assert(((Opcode == Instruction::GetElementPtr &&
18591 E->State == TreeEntry::SplitVectorize ||
18594 (GatheredLoadsEntriesFirst.has_value() &&
18595 Opcode == Instruction::Load &&
E->isGather() &&
18596 E->Idx < *GatheredLoadsEntriesFirst)) &&
18597 "Expected vector-like or non-GEP in GEP node insts only.");
18598 if (!DT->isReachableFromEntry(LastInst->
getParent())) {
18602 if (!DT->isReachableFromEntry(
I->getParent()))
18604 auto *NodeA = DT->getNode(LastInst->
getParent());
18605 auto *NodeB = DT->getNode(
I->getParent());
18606 assert(NodeA &&
"Should only process reachable instructions");
18607 assert(NodeB &&
"Should only process reachable instructions");
18608 assert((NodeA == NodeB) ==
18609 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
18610 "Different nodes should have different DFS numbers");
18611 if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn())
18618 auto FindFirstInst = [&]() {
18620 for (
Value *V :
E->Scalars) {
18624 if (
E->isCopyableElement(
I))
18626 if (FirstInst->
getParent() ==
I->getParent()) {
18627 if (
I->comesBefore(FirstInst))
18631 assert(((Opcode == Instruction::GetElementPtr &&
18635 "Expected vector-like or non-GEP in GEP node insts only.");
18636 if (!DT->isReachableFromEntry(FirstInst->
getParent())) {
18640 if (!DT->isReachableFromEntry(
I->getParent()))
18642 auto *NodeA = DT->getNode(FirstInst->
getParent());
18643 auto *NodeB = DT->getNode(
I->getParent());
18644 assert(NodeA &&
"Should only process reachable instructions");
18645 assert(NodeB &&
"Should only process reachable instructions");
18646 assert((NodeA == NodeB) ==
18647 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
18648 "Different nodes should have different DFS numbers");
18649 if (NodeA->getDFSNumIn() > NodeB->getDFSNumIn())
18655 if (
E->State == TreeEntry::SplitVectorize) {
18656 Res = FindLastInst();
18658 for (
auto *
E : Entries) {
18661 I = &getLastInstructionInBundle(
E);
18666 EntryToLastInstruction.try_emplace(
E, Res);
18671 if (GatheredLoadsEntriesFirst.has_value() &&
18672 E->Idx >= *GatheredLoadsEntriesFirst && !
E->isGather() &&
18673 Opcode == Instruction::Load) {
18674 Res = FindFirstInst();
18675 EntryToLastInstruction.try_emplace(
E, Res);
18681 auto FindScheduleBundle = [&](
const TreeEntry *
E) ->
const ScheduleBundle * {
18685 const auto *It = BlocksSchedules.find(BB);
18686 if (It == BlocksSchedules.end())
18688 for (
Value *V :
E->Scalars) {
18694 if (Bundles.
empty())
18697 Bundles, [&](ScheduleBundle *
B) {
return B->getTreeEntry() ==
E; });
18698 if (It != Bundles.
end())
18703 const ScheduleBundle *Bundle = FindScheduleBundle(
E);
18704 if (!
E->isGather() && !Bundle) {
18705 if ((Opcode == Instruction::GetElementPtr &&
18708 return !isa<GetElementPtrInst>(V) && isa<Instruction>(V);
18712 return isa<PoisonValue>(V) ||
18713 (E->Idx == 0 && isa<InsertElementInst>(V)) ||
18714 E->isCopyableElement(V) ||
18715 (!isVectorLikeInstWithConstOps(V) &&
18716 isUsedOutsideBlock(V));
18718 (!
E->doesNotNeedToSchedule() ||
18721 if (!isa<Instruction>(V) ||
18722 (E->hasCopyableElements() && E->isCopyableElement(V)))
18724 return !areAllOperandsNonInsts(V);
18727 if (!isa<Instruction>(V) ||
18728 (E->hasCopyableElements() && E->isCopyableElement(V)))
18730 return MustGather.contains(V);
18732 Res = FindLastInst();
18734 Res = FindFirstInst();
18735 EntryToLastInstruction.try_emplace(
E, Res);
18744 assert(!
E->isGather() &&
"Gathered instructions should not be scheduled");
18745 Res = Bundle->getBundle().back()->getInst();
18746 EntryToLastInstruction.try_emplace(
E, Res);
18769 Res = FindLastInst();
18770 assert(Res &&
"Failed to find last instruction in bundle");
18771 EntryToLastInstruction.try_emplace(
E, Res);
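// Positions the IR builder right after the last instruction of the bundle for
// E (or at a block-level insertion point for PHIs and gathered loads), so the
// vectorized code for E is emitted at a valid location.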
void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
  auto *Front = E->getMainOp();
  Instruction *LastInst = &getLastInstructionInBundle(E);
  assert(LastInst && "Failed to find last instruction in bundle");
  /* ... */
    LastInstIt = LastInst->getParent()->getFirstNonPHIIt();
  if (LastInstIt != LastInst->getParent()->end() &&
      LastInstIt->getParent()->isLandingPad())
    LastInstIt = std::next(LastInstIt);
  if (/* ... */
      (!E->isGather() && E->State != TreeEntry::SplitVectorize &&
       (E->doesNotNeedToSchedule() ||
        (E->hasCopyableElements() && !E->isCopyableElement(LastInst) &&
         /* ... */))) ||
      (GatheredLoadsEntriesFirst.has_value() &&
       E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
       E->getOpcode() == Instruction::Load)) {
    Builder.SetInsertPoint(LastInst->getParent(), LastInstIt);
  } else {
    Builder.SetInsertPoint(
        /* ... */);
    if (Instruction *Res = LastInstructionToPos.lookup(LastInst)) {
      /* ... */
    } else {
      Res = Builder.CreateAlignedLoad(Builder.getPtrTy(),
                                      /* ... */);
      /* ... */
      LastInstructionToPos.try_emplace(LastInst, Res);
    }
  }
  Builder.SetCurrentDebugLocation(Front->getDebugLoc());
}
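// Materializes a gather sequence of insertelement instructions for VL,
// postponing scalars that are defined after the current insertion point
// (for example, values computed inside the containing loop body).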
Value *BoUpSLP::gather(
    ArrayRef<Value *> VL, Value *Root, Type *ScalarTy,
    function_ref<Value *(Value *, Value *, ArrayRef<int>)> CreateShuffle) {
  SmallVector<std::pair<Value *, unsigned>> PostponedInsts;
  SmallSet<int, 4> PostponedIndices;
  Loop *L = LI->getLoopFor(Builder.GetInsertBlock());
  auto &&CheckPredecessor = [](BasicBlock *InstBB, BasicBlock *InsertBB) {
    SmallPtrSet<BasicBlock *, 4> Visited;
    while (InsertBB && InsertBB != InstBB && Visited.insert(InsertBB).second)
      InsertBB = InsertBB->getSinglePredecessor();
    return InsertBB && InsertBB == InstBB;
  };
  for (int I = 0, E = VL.size(); I < E; ++I) {
    if (auto *Inst = dyn_cast<Instruction>(VL[I]))
      if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) ||
           /* ... */
           (L && (!Root || L->isLoopInvariant(Root)) && L->contains(Inst))) &&
          PostponedIndices.insert(I).second)
        PostponedInsts.emplace_back(Inst, I);
  }
18842 auto &&CreateInsertElement = [
this](
Value *Vec,
Value *
V,
unsigned Pos,
18845 if (
Scalar->getType() != Ty) {
18856 Scalar = Builder.CreateIntCast(
18870 Vec = Builder.CreateInsertElement(Vec, Scalar, Builder.getInt32(Pos));
18875 GatherShuffleExtractSeq.insert(InsElt);
18880 const auto *It =
find_if(Entries, [&](
const TreeEntry *
E) {
18881 return !TransformedToGatherNodes.contains(
E) &&
18882 !DeletedNodes.contains(
E);
18884 if (It != Entries.
end()) {
18886 User *UserOp =
nullptr;
18891 if (
V->getType()->isVectorTy()) {
18893 SV && SV->getOperand(0) != V && SV->getOperand(1) != V) {
18895 auto FindOperand = [](
Value *Vec,
Value *
V) -> Instruction * {
18897 if (SV->getOperand(0) == V)
18899 if (SV->getOperand(1) == V)
18905 if (Instruction *User = FindOperand(SV->getOperand(0), V))
18907 else if (Instruction *User = FindOperand(SV->getOperand(1), V))
18910 "Failed to find shufflevector, caused by resize.");
18916 unsigned FoundLane = (*It)->findLaneForValue(V);
18917 ExternalUses.emplace_back(V, UserOp, **It, FoundLane);
18925 SmallVector<int> NonConsts;
18927 std::iota(
Mask.begin(),
Mask.end(), 0);
18928 Value *OriginalRoot = Root;
18931 SV->getOperand(0)->getType() == VecTy) {
18932 Root = SV->getOperand(0);
18933 Mask.assign(SV->getShuffleMask().begin(), SV->getShuffleMask().end());
18936 for (
int I = 0,
E = VL.
size();
I <
E; ++
I) {
18945 Vec = CreateInsertElement(Vec, VL[
I],
I, ScalarTy);
18950 Vec = OriginalRoot;
18952 Vec = CreateShuffle(Root, Vec, Mask);
18954 OI && OI->use_empty() &&
18955 none_of(VectorizableTree, [&](
const std::unique_ptr<TreeEntry> &TE) {
18956 return TE->VectorizedValue == OI;
18962 for (
int I : NonConsts)
18963 Vec = CreateInsertElement(Vec, VL[
I],
I, ScalarTy);
18966 for (
const std::pair<Value *, unsigned> &Pair : PostponedInsts)
18967 Vec = CreateInsertElement(Vec, Pair.first, Pair.second, ScalarTy);
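// ShuffleInstructionBuilder: helper that accumulates input vectors and a
// common shuffle mask for a node and emits the actual shufflevector /
// insertelement IR when the final value is requested via finalize().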
19005 bool IsFinalized =
false;
19018 class ShuffleIRBuilder {
19031 : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq),
19032 CSEBlocks(CSEBlocks),
DL(DL) {}
19033 ~ShuffleIRBuilder() =
default;
19039 "Expected integer vector types only.");
19045 ->getIntegerBitWidth())
19046 V2 = Builder.CreateIntCast(
19049 V1 = Builder.CreateIntCast(
19053 Value *Vec = Builder.CreateShuffleVector(V1, V2, Mask);
19055 GatherShuffleExtractSeq.insert(
I);
19056 CSEBlocks.insert(
I->getParent());
19065 unsigned VF = Mask.size();
19069 Value *Vec = Builder.CreateShuffleVector(V1, Mask);
19071 GatherShuffleExtractSeq.insert(
I);
19072 CSEBlocks.insert(
I->getParent());
19076 Value *createIdentity(
Value *V) {
return V; }
19077 Value *createPoison(
Type *Ty,
unsigned VF) {
19082 void resizeToMatch(
Value *&V1,
Value *&V2) {
19087 int VF = std::max(V1VF, V2VF);
19088 int MinVF = std::min(V1VF, V2VF);
19090 std::iota(IdentityMask.
begin(), std::next(IdentityMask.
begin(), MinVF),
19092 Value *&
Op = MinVF == V1VF ? V1 : V2;
19093 Op = Builder.CreateShuffleVector(
Op, IdentityMask);
19095 GatherShuffleExtractSeq.insert(
I);
19096 CSEBlocks.insert(
I->getParent());
19109 assert(V1 &&
"Expected at least one vector value.");
19110 ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
19111 R.CSEBlocks, *R.DL);
19112 return BaseShuffleAnalysis::createShuffle<Value *>(
19113 V1, V2, Mask, ShuffleBuilder, ScalarTy);
19119 std::optional<bool> IsSigned = std::nullopt) {
19122 if (VecTy->getElementType() == ScalarTy->getScalarType())
19124 return Builder.CreateIntCast(
19125 V,
VectorType::get(ScalarTy->getScalarType(), VecTy->getElementCount()),
19129 Value *getVectorizedValue(
const TreeEntry &E) {
19130 Value *Vec = E.VectorizedValue;
19133 return castToScalarTyElem(Vec,
any_of(E.Scalars, [&](
Value *V) {
19134 return !isa<PoisonValue>(V) &&
19135 !isKnownNonNegative(
19136 V, SimplifyQuery(*R.DL));
19142 : BaseShuffleAnalysis(ScalarTy), Builder(Builder), R(R) {}
19146 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
19147 unsigned NumParts,
bool &UseVecBaseAsInput) {
19148 UseVecBaseAsInput =
false;
19150 Value *VecBase =
nullptr;
19152 if (!E->ReorderIndices.empty()) {
19154 E->ReorderIndices.end());
19157 for (
int I = 0, Sz = Mask.size();
I < Sz; ++
I) {
19162 VecBase = EI->getVectorOperand();
19164 VecBase = TEs.front()->VectorizedValue;
19165 assert(VecBase &&
"Expected vectorized value.");
19166 UniqueBases.
insert(VecBase);
19169 if (!EI->hasOneUse() || R.ExternalUsesAsOriginalScalar.contains(EI) ||
19170 (E->UserTreeIndex && E->UserTreeIndex.EdgeIdx == UINT_MAX &&
19171 !R.isVectorized(EI) &&
19173 count_if(E->UserTreeIndex.UserTE->Scalars,
19174 [&](
Value *V) { return V == EI; })) ||
19175 (NumParts != 1 &&
count(VL, EI) > 1) ||
19177 ArrayRef<TreeEntry *> UTEs = R.getTreeEntries(U);
19178 return UTEs.empty() || UTEs.size() > 1 ||
19180 [&](const TreeEntry *TE) {
19181 return R.DeletedNodes.contains(TE) ||
19182 R.TransformedToGatherNodes.contains(TE);
19188 [&](
const std::unique_ptr<TreeEntry> &TE) {
19189 return TE->UserTreeIndex.UserTE ==
19191 is_contained(VL, EI);
19195 R.eraseInstruction(EI);
19197 if (NumParts == 1 || UniqueBases.
size() == 1) {
19198 assert(VecBase &&
"Expected vectorized value.");
19199 return castToScalarTyElem(VecBase);
19201 UseVecBaseAsInput =
true;
19211 Value *Vec =
nullptr;
19218 constexpr int MaxBases = 2;
19220 auto VLMask =
zip(SubVL, SubMask);
19221 const unsigned VF = std::accumulate(
19222 VLMask.begin(), VLMask.end(), 0U, [&](
unsigned S,
const auto &
D) {
19223 if (std::get<1>(D) == PoisonMaskElem)
19226 cast<ExtractElementInst>(std::get<0>(D))->getVectorOperand();
19227 if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecOp);
19229 VecOp = TEs.front()->VectorizedValue;
19230 assert(VecOp &&
"Expected vectorized value.");
19231 const unsigned Size =
19232 cast<FixedVectorType>(VecOp->getType())->getNumElements();
19233 return std::max(S, Size);
19235 for (
const auto [V,
I] : VLMask) {
19240 VecOp = TEs.front()->VectorizedValue;
19241 assert(VecOp &&
"Expected vectorized value.");
19242 VecOp = castToScalarTyElem(VecOp);
19243 Bases[
I / VF] = VecOp;
19245 if (!Bases.front())
19248 if (Bases.back()) {
19249 SubVec = createShuffle(Bases.front(), Bases.back(), SubMask);
19250 TransformToIdentity(SubMask);
19252 SubVec = Bases.front();
19258 ArrayRef<int> SubMask =
19259 Mask.slice(
P * SliceSize,
19262 return all_of(SubMask, [](
int Idx) {
19266 "Expected first part or all previous parts masked.");
19267 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
19272 unsigned SubVecVF =
19274 NewVF = std::max(NewVF, SubVecVF);
19277 for (
int &Idx : SubMask)
19280 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
19281 Vec = createShuffle(Vec, SubVec, VecMask);
19282 TransformToIdentity(VecMask);
19290 std::optional<Value *>
19296 TEs, [](
const TreeEntry *TE) {
return TE->VectorizedValue; });
19298 return std::nullopt;
19301 auto *ResVecTy =
getWidenedType(ScalarTy, E->getVectorFactor());
19302 return Builder.CreateAlignedLoad(
19309 IsFinalized =
false;
19310 CommonMask.clear();
19316 Value *V1 = getVectorizedValue(E1);
19317 Value *V2 = getVectorizedValue(E2);
19323 Value *V1 = getVectorizedValue(E1);
19328 assert(V1 && V2 && !Mask.empty() &&
"Expected non-empty input vectors.");
19331 "castToScalarTyElem expects V1 and V2 to be FixedVectorType");
19332 V1 = castToScalarTyElem(V1);
19333 V2 = castToScalarTyElem(V2);
19334 if (InVectors.empty()) {
19335 InVectors.push_back(V1);
19336 InVectors.push_back(V2);
19337 CommonMask.assign(Mask.begin(), Mask.end());
19340 Value *Vec = InVectors.front();
19341 if (InVectors.size() == 2) {
19342 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
19343 transformMaskAfterShuffle(CommonMask, CommonMask);
19346 Vec = createShuffle(Vec,
nullptr, CommonMask);
19347 transformMaskAfterShuffle(CommonMask, CommonMask);
19349 V1 = createShuffle(V1, V2, Mask);
19350 unsigned VF = std::max(getVF(V1), getVF(Vec));
19351 for (
unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
19353 CommonMask[Idx] = Idx + VF;
19354 InVectors.front() = Vec;
19355 if (InVectors.size() == 2)
19356 InVectors.back() = V1;
19358 InVectors.push_back(V1);
19363 "castToScalarTyElem expects V1 to be FixedVectorType");
19364 V1 = castToScalarTyElem(V1);
19365 if (InVectors.empty()) {
19366 InVectors.push_back(V1);
19367 CommonMask.assign(Mask.begin(), Mask.end());
19370 const auto *It =
find(InVectors, V1);
19371 if (It == InVectors.end()) {
19372 if (InVectors.size() == 2 ||
19373 InVectors.front()->getType() != V1->
getType()) {
19374 Value *V = InVectors.front();
19375 if (InVectors.size() == 2) {
19376 V = createShuffle(InVectors.front(), InVectors.back(), CommonMask);
19377 transformMaskAfterShuffle(CommonMask, CommonMask);
19379 CommonMask.size()) {
19380 V = createShuffle(InVectors.front(),
nullptr, CommonMask);
19381 transformMaskAfterShuffle(CommonMask, CommonMask);
19383 unsigned VF = std::max(CommonMask.size(), Mask.size());
19384 for (
unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
19386 CommonMask[Idx] = V->getType() != V1->
getType()
19388 : Mask[Idx] + getVF(V1);
19389 if (V->getType() != V1->
getType())
19390 V1 = createShuffle(V1,
nullptr, Mask);
19391 InVectors.front() = V;
19392 if (InVectors.size() == 2)
19393 InVectors.back() = V1;
19395 InVectors.push_back(V1);
19400 for (
unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
19402 InVectors.push_back(V1);
19407 for (
Value *V : InVectors)
19408 VF = std::max(VF, getVF(V));
19409 for (
unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
19411 CommonMask[Idx] = Mask[Idx] + (It == InVectors.begin() ? 0 : VF);
19420 Value *Root =
nullptr) {
19421 return R.gather(VL, Root, ScalarTy,
19423 return createShuffle(V1, V2, Mask);
19432 ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
19437 IsFinalized =
true;
19440 if (InVectors.
size() == 2) {
19441 Vec = createShuffle(Vec, InVectors.
back(), CommonMask);
19444 Vec = createShuffle(Vec,
nullptr, CommonMask);
19446 transformMaskAfterShuffle(CommonMask, CommonMask);
19448 "Expected vector length for the final value before action.");
19452 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
19453 Vec = createShuffle(Vec,
nullptr, ResizeMask);
19455 Action(Vec, CommonMask, [
this](
Value *V1,
Value *V2, ArrayRef<int> Mask) {
19456 return createShuffle(V1, V2, Mask);
19458 InVectors.
front() = Vec;
19460 if (!SubVectors.empty()) {
19462 if (InVectors.
size() == 2) {
19463 Vec = createShuffle(Vec, InVectors.
back(), CommonMask);
19466 Vec = createShuffle(Vec,
nullptr, CommonMask);
19468 transformMaskAfterShuffle(CommonMask, CommonMask);
19469 auto CreateSubVectors = [&](
Value *Vec,
19470 SmallVectorImpl<int> &CommonMask) {
19471 for (
auto [
E, Idx] : SubVectors) {
19472 Value *
V = getVectorizedValue(*
E);
19479 Type *OrigScalarTy = ScalarTy;
19482 Builder, Vec, V, InsertionIndex,
19483 std::bind(&ShuffleInstructionBuilder::createShuffle,
this, _1, _2,
19485 ScalarTy = OrigScalarTy;
19486 if (!CommonMask.
empty()) {
19487 std::iota(std::next(CommonMask.
begin(), Idx),
19488 std::next(CommonMask.
begin(), Idx +
E->getVectorFactor()),
19494 if (SubVectorsMask.
empty()) {
19495 Vec = CreateSubVectors(Vec, CommonMask);
19498 copy(SubVectorsMask, SVMask.begin());
19499 for (
auto [I1, I2] :
zip(SVMask, CommonMask)) {
19502 I1 = I2 + CommonMask.
size();
19507 Vec = createShuffle(InsertVec, Vec, SVMask);
19508 transformMaskAfterShuffle(CommonMask, SVMask);
19510 InVectors.
front() = Vec;
19513 if (!ExtMask.
empty()) {
19514 if (CommonMask.
empty()) {
19518 for (
int I = 0, Sz = ExtMask.
size();
I < Sz; ++
I) {
19521 NewMask[
I] = CommonMask[ExtMask[
I]];
19523 CommonMask.
swap(NewMask);
19526 if (CommonMask.
empty()) {
19527 assert(InVectors.
size() == 1 &&
"Expected only one vector with no mask");
19528 return InVectors.
front();
19530 if (InVectors.
size() == 2)
19531 return createShuffle(InVectors.
front(), InVectors.
back(), CommonMask);
19532 return createShuffle(InVectors.
front(),
nullptr, CommonMask);
19536 assert((IsFinalized || CommonMask.empty()) &&
19537 "Shuffle construction must be finalized.");
Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx) {
  /* ... */
}

template <typename BVTy, typename ResTy, typename... Args>
ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
                                  Args &...Params) {
  assert((E->isGather() || TransformedToGatherNodes.contains(E)) &&
         "Expected gather node.");
  unsigned VF = E->getVectorFactor();
  bool NeedFreeze = false;
  SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
      E->CombinedEntriesWithIndices.size());
19557 if (E->State == TreeEntry::SplitVectorize &&
19558 TransformedToGatherNodes.contains(E)) {
19559 SubVectors.
clear();
19562 for (
auto [EIdx, Idx] : E->CombinedEntriesWithIndices)
19564 .slice(Idx, VectorizableTree[EIdx]->getVectorFactor()),
19567 E->CombinedEntriesWithIndices, SubVectors.
begin(), [&](
const auto &
P) {
19568 return std::make_pair(VectorizableTree[P.first].get(), P.second);
19574 E->ReorderIndices.end());
19575 if (!ReorderMask.
empty())
19581 if (!SubVectors.
empty() && !SubVectorsMask.
empty()) {
19583 if (E->Scalars[
I] == GatheredScalars[ReorderMask[
I]])
19586 SubVectorsMask.
clear();
19590 unsigned I,
unsigned SliceSize,
19591 bool IsNotPoisonous) {
19593 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
19596 TreeEntry *UserTE = E->UserTreeIndex.UserTE;
19597 unsigned EdgeIdx = E->UserTreeIndex.EdgeIdx;
19598 if (UserTE->getNumOperands() != 2)
19600 if (!IsNotPoisonous) {
19601 auto *It =
find_if(
ArrayRef(VectorizableTree).drop_front(UserTE->Idx + 1),
19602 [=](
const std::unique_ptr<TreeEntry> &TE) {
19603 return TE->UserTreeIndex.UserTE == UserTE &&
19604 TE->UserTreeIndex.EdgeIdx != EdgeIdx;
19606 if (It == VectorizableTree.end())
19609 if (!(*It)->ReorderIndices.empty()) {
19613 if (!
all_of(
zip(GatheredScalars, GS), [&](
const auto &
P) {
19614 Value *V0 = std::get<0>(
P);
19615 Value *V1 = std::get<1>(
P);
19623 if ((Mask.size() < InputVF &&
19626 (Mask.size() == InputVF &&
19629 std::next(Mask.begin(),
I * SliceSize),
19630 std::next(Mask.begin(),
19637 std::next(Mask.begin(),
I * SliceSize),
19638 std::next(Mask.begin(),
19644 BVTy ShuffleBuilder(ScalarTy, Params...);
19645 ResTy Res = ResTy();
19649 Value *ExtractVecBase =
nullptr;
19650 bool UseVecBaseAsInput =
false;
19653 Type *OrigScalarTy = GatheredScalars.
front()->getType();
19658 bool Resized =
false;
19660 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
19661 if (!ExtractShuffles.
empty()) {
19663 for (
auto [Idx,
I] :
enumerate(ExtractMask)) {
19669 ExtractEntries.
append(TEs.begin(), TEs.end());
19671 if (std::optional<ResTy> Delayed =
19672 ShuffleBuilder.needToDelay(E, ExtractEntries)) {
19674 PostponedGathers.insert(E);
19679 if (
Value *VecBase = ShuffleBuilder.adjustExtracts(
19680 E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) {
19681 ExtractVecBase = VecBase;
19683 if (VF == VecBaseTy->getNumElements() &&
19684 GatheredScalars.
size() != VF) {
19686 GatheredScalars.
append(VF - GatheredScalars.
size(),
19694 if (!ExtractShuffles.
empty() || !E->hasState() ||
19695 E->getOpcode() != Instruction::Load ||
19696 (((E->hasState() && E->getOpcode() == Instruction::Load) ||
19700 return isa<LoadInst>(V) && isVectorized(V);
19702 (E->hasState() && E->isAltShuffle()) ||
19703 all_of(E->Scalars, [
this](
Value *V) { return isVectorized(V); }) ||
19705 (E->Scalars != GatheredScalars && GatheredScalars.
size() <= 2)) {
19707 isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts);
19709 if (!GatherShuffles.
empty()) {
19710 if (std::optional<ResTy> Delayed =
19711 ShuffleBuilder.needToDelay(E, Entries)) {
19713 PostponedGathers.insert(E);
19718 if (GatherShuffles.
size() == 1 &&
19720 Entries.
front().front()->isSame(E->Scalars)) {
19723 LLVM_DEBUG(
dbgs() <<
"SLP: perfect diamond match for gather bundle "
19726 Mask.resize(E->Scalars.size());
19727 const TreeEntry *FrontTE = Entries.
front().front();
19728 if (FrontTE->ReorderIndices.empty() &&
19729 ((FrontTE->ReuseShuffleIndices.empty() &&
19730 E->Scalars.size() == FrontTE->Scalars.size()) ||
19731 (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) {
19732 std::iota(Mask.begin(), Mask.end(), 0);
19739 Mask[
I] = FrontTE->findLaneForValue(V);
19744 ShuffleBuilder.resetForSameNode();
19745 ShuffleBuilder.add(*FrontTE, Mask);
19747 Res = ShuffleBuilder.finalize(E->getCommonMask(), {}, {});
19751 if (GatheredScalars.
size() != VF &&
19753 return any_of(TEs, [&](
const TreeEntry *TE) {
19754 return TE->getVectorFactor() == VF;
19757 GatheredScalars.
append(VF - GatheredScalars.
size(),
19761 for (
int I = 0, Sz = Mask.size();
I < Sz; ++
I) {
19769 bool IsRootPoison) {
19772 bool IsSplat = IsRootPoison &&
isSplat(Scalars) &&
19779 int NumNonConsts = 0;
19798 Scalars.
front() = OrigV;
19801 const auto Res = UniquePositions.
try_emplace(OrigV,
I);
19802 Scalars[Res.first->second] = OrigV;
19803 ReuseMask[
I] = Res.first->second;
19806 if (NumNonConsts == 1) {
19811 if (!UndefPos.
empty() && UndefPos.
front() == 0)
19814 ReuseMask[SinglePos] = SinglePos;
19815 }
else if (!UndefPos.
empty() && IsSplat) {
19822 (E->UserTreeIndex &&
any_of(V->uses(), [E](
const Use &U) {
19825 return E->UserTreeIndex.EdgeIdx != U.getOperandNo() &&
19826 is_contained(E->UserTreeIndex.UserTE->Scalars,
19830 if (It != Scalars.
end()) {
19832 int Pos = std::distance(Scalars.
begin(), It);
19833 for (
int I : UndefPos) {
19835 ReuseMask[
I] = Pos;
19844 for (
int I : UndefPos) {
19853 if (!ExtractShuffles.
empty() || !GatherShuffles.
empty()) {
19854 bool IsNonPoisoned =
true;
19855 bool IsUsedInExpr =
true;
19856 Value *Vec1 =
nullptr;
19857 if (!ExtractShuffles.
empty()) {
19861 Value *Vec2 =
nullptr;
19862 for (
unsigned I = 0, Sz = ExtractMask.
size();
I < Sz; ++
I) {
19866 if (UseVecBaseAsInput) {
19867 Vec1 = ExtractVecBase;
19869 for (
unsigned I = 0, Sz = ExtractMask.
size();
I < Sz; ++
I) {
19875 Value *VecOp = EI->getVectorOperand();
19877 !TEs.
empty() && TEs.front()->VectorizedValue)
19878 VecOp = TEs.front()->VectorizedValue;
19881 }
else if (Vec1 != VecOp) {
19882 assert((!Vec2 || Vec2 == VecOp) &&
19883 "Expected only 1 or 2 vectors shuffle.");
19889 IsUsedInExpr =
false;
19892 ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
19895 IsUsedInExpr &= FindReusedSplat(
19898 ExtractMask.
size(), IsNotPoisonedVec);
19899 ShuffleBuilder.add(Vec1, ExtractMask,
true);
19900 IsNonPoisoned &= IsNotPoisonedVec;
19902 IsUsedInExpr =
false;
19907 if (!GatherShuffles.
empty()) {
19908 unsigned SliceSize =
19912 for (
const auto [
I, TEs] :
enumerate(Entries)) {
19915 "No shuffles with empty entries list expected.");
19918 assert((TEs.size() == 1 || TEs.size() == 2) &&
19919 "Expected shuffle of 1 or 2 entries.");
19920 unsigned Limit =
getNumElems(Mask.size(), SliceSize,
I);
19923 copy(SubMask, std::next(VecMask.
begin(),
I * SliceSize));
19924 if (TEs.size() == 1) {
19925 bool IsNotPoisonedVec =
19926 TEs.front()->VectorizedValue
19930 FindReusedSplat(VecMask, TEs.
front()->getVectorFactor(),
I,
19931 SliceSize, IsNotPoisonedVec);
19932 ShuffleBuilder.add(*TEs.front(), VecMask);
19933 IsNonPoisoned &= IsNotPoisonedVec;
19935 IsUsedInExpr =
false;
19936 ShuffleBuilder.add(*TEs.front(), *TEs.back(), VecMask);
19937 if (TEs.front()->VectorizedValue && TEs.back()->VectorizedValue)
19948 int EMSz = ExtractMask.
size();
19949 int MSz = Mask.size();
19952 bool IsSingleShuffle = ExtractShuffles.
empty() || GatherShuffles.
empty();
19953 bool IsIdentityShuffle =
19954 ((UseVecBaseAsInput ||
19956 [](
const std::optional<TTI::ShuffleKind> &SK) {
19960 none_of(ExtractMask, [&](
int I) {
return I >= EMSz; }) &&
19962 (!GatherShuffles.
empty() &&
19964 [](
const std::optional<TTI::ShuffleKind> &SK) {
19968 none_of(Mask, [&](
int I) {
return I >= MSz; }) &&
19970 bool EnoughConstsForShuffle =
19980 (!IsIdentityShuffle ||
19981 (GatheredScalars.
size() == 2 &&
19989 for (
int I = 0, Sz = GatheredScalars.
size();
I < Sz; ++
I) {
19990 if (EnoughConstsForShuffle &&
isa<Constant>(GatheredScalars[
I]))
19998 TryPackScalars(GatheredScalars, BVMask,
true);
19999 Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.
size());
20000 ShuffleBuilder.add(BV, BVMask);
20004 (IsSingleShuffle && ((IsIdentityShuffle &&
20007 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
20010 Res = ShuffleBuilder.finalize(
20011 E->ReuseShuffleIndices, SubVectors, SubVectorsMask, E->Scalars.
size(),
20013 bool IsSplat = isSplat(NonConstants);
20014 SmallVector<int> BVMask(Mask.size(), PoisonMaskElem);
20015 TryPackScalars(NonConstants, BVMask, false);
20016 auto CheckIfSplatIsProfitable = [&]() {
20019 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
20020 Value *V = *find_if_not(NonConstants, IsaPred<UndefValue>);
20021 if (isa<ExtractElementInst>(V) || isVectorized(V))
20023 InstructionCost SplatCost = TTI->getVectorInstrCost(
20024 Instruction::InsertElement, VecTy, CostKind, 0,
20025 PoisonValue::get(VecTy), V);
20026 SmallVector<int> NewMask(Mask.begin(), Mask.end());
20027 for (auto [Idx, I] : enumerate(BVMask))
20028 if (I != PoisonMaskElem)
20029 NewMask[Idx] = Mask.size();
20030 SplatCost += ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy,
20031 NewMask, CostKind);
20032 InstructionCost BVCost = TTI->getVectorInstrCost(
20033 Instruction::InsertElement, VecTy, CostKind,
20034 *find_if(Mask, not_equal_to(PoisonMaskElem)), Vec, V);
20036 if (count(BVMask, PoisonMaskElem) <
20037 static_cast<int>(BVMask.size() - 1)) {
20038 SmallVector<int> NewMask(Mask.begin(), Mask.end());
20039 for (auto [Idx, I] : enumerate(BVMask))
20040 if (I != PoisonMaskElem)
20042 BVCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
20043 VecTy, NewMask, CostKind);
20045 return SplatCost <= BVCost;
20047 if (!IsSplat || Mask.size() <= 2 || !CheckIfSplatIsProfitable()) {
20051 Vec = ShuffleBuilder.gather(NonConstants, Mask.size(), Vec);
20057 Value *BV = ShuffleBuilder.gather(Values, BVMask.
size());
20060 return I == PoisonMaskElem ? PoisonMaskElem : 0;
20063 BV = CreateShuffle(BV,
nullptr, SplatMask);
20066 Mask[Idx] = BVMask.size() + Idx;
20067 Vec = CreateShuffle(Vec, BV, Mask);
20076 TryPackScalars(GatheredScalars, ReuseMask,
true);
20077 Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.size());
20078 ShuffleBuilder.add(BV, ReuseMask);
20079 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
20084 for (
auto [
I, V] :
enumerate(GatheredScalars)) {
20088 Value *BV = ShuffleBuilder.gather(GatheredScalars);
20089 ShuffleBuilder.add(BV, Mask);
20090 Res = ShuffleBuilder.finalize(
E->ReuseShuffleIndices, SubVectors,
20095 Res = ShuffleBuilder.createFreeze(Res);
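// Entry point for emitting the vector value of a gather node; delegates to
// processBuildVector instantiated with the IR-emitting
// ShuffleInstructionBuilder.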
Value *BoUpSLP::createBuildVector(const TreeEntry *E, Type *ScalarTy) {
  if (E->State != TreeEntry::SplitVectorize ||
      !TransformedToGatherNodes.contains(E)) {
    for (auto [EIdx, _] : E->CombinedEntriesWithIndices)
      /* ... */;
  }
  return processBuildVector<ShuffleInstructionBuilder, Value *>(E, ScalarTy,
                                                                /* ... */);
}

/* ... */
  for (Value *V : VL)
    /* ... */;
/* ... */

Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
  IRBuilderBase::InsertPointGuard Guard(Builder);
20129 Value *
V =
E->Scalars.front();
20130 Type *ScalarTy =
V->getType();
20133 auto It = MinBWs.find(
E);
20134 if (It != MinBWs.end()) {
20140 if (
E->VectorizedValue)
20141 return E->VectorizedValue;
20143 if (
E->isGather() || TransformedToGatherNodes.contains(
E)) {
20145 if (
E->hasState() &&
E->Idx == 0 && !UserIgnoreList)
20146 setInsertPointAfterBundle(
E);
20147 Value *Vec = createBuildVector(
E, ScalarTy);
20148 E->VectorizedValue = Vec;
20151 if (
E->State == TreeEntry::SplitVectorize) {
20152 assert(
E->CombinedEntriesWithIndices.size() == 2 &&
20153 "Expected exactly 2 combined entries.");
20154 setInsertPointAfterBundle(
E);
20156 *VectorizableTree[
E->CombinedEntriesWithIndices.front().first];
20158 ArrayRef(
E->Scalars).take_front(OpTE1.getVectorFactor())) &&
20159 "Expected same first part of scalars.");
20162 *VectorizableTree[
E->CombinedEntriesWithIndices.back().first];
20164 OpTE2.isSame(
ArrayRef(
E->Scalars).take_back(OpTE2.getVectorFactor())) &&
20165 "Expected same second part of scalars.");
20167 auto GetOperandSignedness = [&](
const TreeEntry *OpE) {
20168 bool IsSigned =
false;
20169 auto It = MinBWs.find(OpE);
20170 if (It != MinBWs.end())
20171 IsSigned = It->second.second;
20174 if (isa<PoisonValue>(V))
20176 return !isKnownNonNegative(R, SimplifyQuery(*DL));
20183 Op1 = Builder.CreateIntCast(
20188 GetOperandSignedness(&OpTE1));
20193 Op2 = Builder.CreateIntCast(
20198 GetOperandSignedness(&OpTE2));
20200 if (
E->ReorderIndices.empty()) {
20204 std::next(
Mask.begin(),
E->CombinedEntriesWithIndices.back().second),
20207 if (ScalarTyNumElements != 1) {
20211 Value *Vec = Builder.CreateShuffleVector(Op1, Mask);
20213 E->CombinedEntriesWithIndices.back().second *
20214 ScalarTyNumElements);
20215 E->VectorizedValue = Vec;
20218 unsigned CommonVF =
20219 std::max(OpTE1.getVectorFactor(), OpTE2.getVectorFactor());
20222 std::iota(
Mask.begin(), std::next(
Mask.begin(), OpTE1.getVectorFactor()),
20224 Op1 = Builder.CreateShuffleVector(Op1, Mask);
20228 std::iota(
Mask.begin(), std::next(
Mask.begin(), OpTE2.getVectorFactor()),
20230 Op2 = Builder.CreateShuffleVector(Op2, Mask);
20232 Value *Vec = Builder.CreateShuffleVector(Op1, Op2,
E->getSplitMask());
20233 E->VectorizedValue = Vec;
20237 bool IsReverseOrder =
20239 auto FinalShuffle = [&](
Value *
V,
const TreeEntry *
E) {
20241 if (
E->getOpcode() == Instruction::Store &&
20242 E->State == TreeEntry::Vectorize) {
20243 ArrayRef<int>
Mask =
20244 ArrayRef(
reinterpret_cast<const int *
>(
E->ReorderIndices.begin()),
20245 E->ReorderIndices.size());
20246 ShuffleBuilder.add(V, Mask);
20247 }
else if ((
E->State == TreeEntry::StridedVectorize && IsReverseOrder) ||
20248 E->State == TreeEntry::CompressVectorize) {
20249 ShuffleBuilder.addOrdered(V, {});
20251 ShuffleBuilder.addOrdered(V,
E->ReorderIndices);
20254 E->CombinedEntriesWithIndices.size());
20256 E->CombinedEntriesWithIndices, SubVectors.begin(), [&](
const auto &
P) {
20257 return std::make_pair(VectorizableTree[P.first].get(), P.second);
20260 (
E->CombinedEntriesWithIndices.empty() ||
E->ReorderIndices.empty()) &&
20261 "Expected either combined subnodes or reordering");
20262 return ShuffleBuilder.finalize(
E->ReuseShuffleIndices, SubVectors, {});
20265 assert(!
E->isGather() &&
"Unhandled state");
20266 unsigned ShuffleOrOp =
20267 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector :
E->
getOpcode();
20268 if (!
E->isAltShuffle()) {
20269 switch (E->CombinedOp) {
20270 case TreeEntry::ReducedBitcast:
20271 case TreeEntry::ReducedBitcastBSwap:
20272 ShuffleOrOp = E->CombinedOp;
20279 auto GetOperandSignedness = [&](
unsigned Idx) {
20280 const TreeEntry *OpE = getOperandEntry(
E, Idx);
20281 bool IsSigned =
false;
20282 auto It = MinBWs.find(OpE);
20283 if (It != MinBWs.end())
20284 IsSigned = It->second.second;
20287 if (isa<PoisonValue>(V))
20289 return !isKnownNonNegative(R, SimplifyQuery(*DL));
20293 switch (ShuffleOrOp) {
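    // Emit the vectorized IR for this entry according to its (possibly
    // alternate/combined) opcode; each case computes V and records it in
    // E->VectorizedValue.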
20294 case Instruction::PHI: {
20295 assert((
E->ReorderIndices.empty() || !
E->ReuseShuffleIndices.empty() ||
20296 E != VectorizableTree.front().get() ||
E->UserTreeIndex) &&
20297 "PHI reordering is free.");
20299 Builder.SetInsertPoint(PH->getParent(),
20300 PH->getParent()->getFirstNonPHIIt());
20302 PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
20306 Builder.SetInsertPoint(PH->getParent(),
20307 PH->getParent()->getFirstInsertionPt());
20310 V = FinalShuffle(V,
E);
20312 E->VectorizedValue =
V;
20319 SmallDenseMap<BasicBlock *, unsigned, 4> VisitedBBs;
20325 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
20331 TreeEntry *OpTE = getOperandEntry(
E,
I);
20332 if (OpTE->isGather() || DeletedNodes.contains(OpTE) ||
20333 TransformedToGatherNodes.contains(OpTE)) {
20336 assert(!OpTE->VectorizedValue &&
"Expected no vectorized value.");
20337 OpTE->VectorizedValue = VecOp;
20344 Value *Vec = vectorizeOperand(
E,
I);
20345 if (VecTy != Vec->
getType()) {
20347 MinBWs.contains(getOperandEntry(
E,
I))) &&
20348 "Expected item in MinBWs.");
20349 Vec = Builder.CreateIntCast(Vec, VecTy, GetOperandSignedness(
I));
20355 "Invalid number of incoming values");
20356 assert(
E->VectorizedValue &&
"Expected vectorized value.");
20357 return E->VectorizedValue;
20360 case Instruction::ExtractElement: {
20361 Value *
V =
E->getSingleOperand(0);
20362 setInsertPointAfterBundle(
E);
20363 V = FinalShuffle(V,
E);
20364 E->VectorizedValue =
V;
20367 case Instruction::ExtractValue: {
20369 Builder.SetInsertPoint(LI);
20370 Value *Ptr = LI->getPointerOperand();
20371 LoadInst *
V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign());
20373 NewV = FinalShuffle(NewV,
E);
20374 E->VectorizedValue = NewV;
20377 case Instruction::InsertElement: {
20378 assert(
E->ReuseShuffleIndices.empty() &&
"All inserts should be unique");
20379 if (
const TreeEntry *OpE = getOperandEntry(
E, 1);
20380 OpE && !OpE->isGather() && OpE->hasState() &&
20381 !OpE->hasCopyableElements())
20384 setInsertPointAfterBundle(
E);
20385 Value *
V = vectorizeOperand(
E, 1);
20387 Type *ScalarTy =
Op.front()->getType();
20390 std::pair<unsigned, bool> Res = MinBWs.lookup(getOperandEntry(
E, 1));
20391 assert(Res.first > 0 &&
"Expected item in MinBWs.");
20392 V = Builder.CreateIntCast(
20402 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
20404 const unsigned NumElts =
20406 const unsigned NumScalars =
E->Scalars.size();
20409 assert(
Offset < NumElts &&
"Failed to find vector index offset");
20412 SmallVector<int>
Mask;
20413 if (!
E->ReorderIndices.empty()) {
20418 std::iota(
Mask.begin(), std::next(
Mask.begin(), NumScalars), 0);
20421 bool IsIdentity =
true;
20423 Mask.swap(PrevMask);
20424 for (
unsigned I = 0;
I < NumScalars; ++
I) {
20427 IsIdentity &= InsertIdx -
Offset ==
I;
20430 if (!IsIdentity || NumElts != NumScalars) {
20431 Value *V2 =
nullptr;
20432 bool IsVNonPoisonous =
20434 SmallVector<int> InsertMask(Mask);
20435 if (NumElts != NumScalars &&
Offset == 0) {
20444 InsertMask[*InsertIdx] = *InsertIdx;
20450 SmallBitVector UseMask =
20451 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
20452 SmallBitVector IsFirstPoison =
20454 SmallBitVector IsFirstUndef =
20456 if (!IsFirstPoison.
all()) {
20458 for (
unsigned I = 0;
I < NumElts;
I++) {
20460 IsFirstUndef.
test(
I)) {
20461 if (IsVNonPoisonous) {
20462 InsertMask[
I] =
I < NumScalars ?
I : 0;
20467 if (Idx >= NumScalars)
20468 Idx = NumScalars - 1;
20469 InsertMask[
I] = NumScalars + Idx;
20482 V = Builder.CreateShuffleVector(V, V2, InsertMask);
20484 GatherShuffleExtractSeq.insert(
I);
20485 CSEBlocks.insert(
I->getParent());
20490 for (
unsigned I = 0;
I < NumElts;
I++) {
20494 SmallBitVector UseMask =
20495 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
20496 SmallBitVector IsFirstUndef =
20498 if ((!IsIdentity ||
Offset != 0 || !IsFirstUndef.
all()) &&
20499 NumElts != NumScalars) {
20500 if (IsFirstUndef.
all()) {
20502 SmallBitVector IsFirstPoison =
20504 if (!IsFirstPoison.
all()) {
20505 for (
unsigned I = 0;
I < NumElts;
I++) {
20507 InsertMask[
I] =
I + NumElts;
20510 V = Builder.CreateShuffleVector(
20516 GatherShuffleExtractSeq.insert(
I);
20517 CSEBlocks.insert(
I->getParent());
20521 SmallBitVector IsFirstPoison =
20523 for (
unsigned I = 0;
I < NumElts;
I++) {
20527 InsertMask[
I] += NumElts;
20529 V = Builder.CreateShuffleVector(
20530 FirstInsert->getOperand(0), V, InsertMask,
20533 GatherShuffleExtractSeq.insert(
I);
20534 CSEBlocks.insert(
I->getParent());
20539 ++NumVectorInstructions;
20540 E->VectorizedValue =
V;
20543 case Instruction::ZExt:
20544 case Instruction::SExt:
20545 case Instruction::FPToUI:
20546 case Instruction::FPToSI:
20547 case Instruction::FPExt:
20548 case Instruction::PtrToInt:
20549 case Instruction::IntToPtr:
20550 case Instruction::SIToFP:
20551 case Instruction::UIToFP:
20552 case Instruction::Trunc:
20553 case Instruction::FPTrunc:
20554 case Instruction::BitCast: {
20555 setInsertPointAfterBundle(
E);
20557 Value *InVec = vectorizeOperand(
E, 0);
20562 auto SrcIt = MinBWs.find(getOperandEntry(
E, 0));
20564 (SrcIt != MinBWs.end() || It != MinBWs.end() ||
20567 unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy);
20568 if (SrcIt != MinBWs.end())
20569 SrcBWSz = SrcIt->second.first;
20570 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->
getScalarType());
20571 if (BWSz == SrcBWSz) {
20572 VecOpcode = Instruction::BitCast;
20573 }
else if (BWSz < SrcBWSz) {
20574 VecOpcode = Instruction::Trunc;
20575 }
else if (It != MinBWs.end()) {
20576 assert(BWSz > SrcBWSz &&
"Invalid cast!");
20577 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
20578 }
else if (SrcIt != MinBWs.end()) {
20579 assert(BWSz > SrcBWSz &&
"Invalid cast!");
20581 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
20583 }
else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
20584 !SrcIt->second.second) {
20585 VecOpcode = Instruction::UIToFP;
20586 }
else if (VecOpcode == Instruction::BitCast && SrcIt != MinBWs.end() &&
20588 Type *OrigSrcScalarTy = CI->getSrcTy();
20589 auto *OrigSrcVectorTy =
20592 Builder.CreateIntCast(InVec, OrigSrcVectorTy, SrcIt->second.second);
20594 Value *
V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
20596 : Builder.CreateCast(VecOpcode, InVec, VecTy);
20597 V = FinalShuffle(V,
E);
20599 E->VectorizedValue =
V;
20600 ++NumVectorInstructions;
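      // Compares: operands demoted to different bit widths are first brought
      // to a common integer type before the vector compare is created.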
20603 case Instruction::FCmp:
20604 case Instruction::ICmp: {
20605 setInsertPointAfterBundle(
E);
20607 Value *
L = vectorizeOperand(
E, 0);
20608 Value *
R = vectorizeOperand(
E, 1);
20609 if (
L->getType() !=
R->getType()) {
20612 MinBWs.contains(getOperandEntry(
E, 0)) ||
20613 MinBWs.contains(getOperandEntry(
E, 1))) &&
20614 "Expected item in MinBWs.");
20619 ->getIntegerBitWidth()) {
20620 Type *CastTy =
R->getType();
20621 L = Builder.CreateIntCast(L, CastTy, GetOperandSignedness(0));
20623 Type *CastTy =
L->getType();
20624 R = Builder.CreateIntCast(R, CastTy, GetOperandSignedness(1));
20629 Value *
V = Builder.CreateCmp(P0, L, R);
20632 ICmp->setSameSign(
false);
20635 V = FinalShuffle(V,
E);
20637 E->VectorizedValue =
V;
20638 ++NumVectorInstructions;
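      // Selects: the condition vector may have fewer elements than the
      // true/false operands; if so it is replicated with a shuffle so the
      // element counts match before the select is created.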
20641 case Instruction::Select: {
20642 setInsertPointAfterBundle(
E);
20645 Value *True = vectorizeOperand(
E, 1);
20646 Value *False = vectorizeOperand(
E, 2);
20650 MinBWs.contains(getOperandEntry(
E, 1)) ||
20651 MinBWs.contains(getOperandEntry(
E, 2))) &&
20652 "Expected item in MinBWs.");
20653 if (True->
getType() != VecTy)
20654 True = Builder.CreateIntCast(True, VecTy, GetOperandSignedness(1));
20655 if (False->
getType() != VecTy)
20656 False = Builder.CreateIntCast(False, VecTy, GetOperandSignedness(2));
20661 assert(TrueNumElements >= CondNumElements &&
20662 TrueNumElements % CondNumElements == 0 &&
20663 "Cannot vectorize Instruction::Select");
20665 "Cannot vectorize Instruction::Select");
20666 if (CondNumElements != TrueNumElements) {
20669 Cond = Builder.CreateShuffleVector(
20674 "Cannot vectorize Instruction::Select");
20676 Builder.CreateSelectWithUnknownProfile(
Cond, True, False,
DEBUG_TYPE);
20677 V = FinalShuffle(V,
E);
20679 E->VectorizedValue =
V;
20680 ++NumVectorInstructions;
20683 case Instruction::FNeg: {
20684 setInsertPointAfterBundle(
E);
20686 Value *
Op = vectorizeOperand(
E, 0);
20688 Value *
V = Builder.CreateUnOp(
20694 V = FinalShuffle(V,
E);
20696 E->VectorizedValue =
V;
20697 ++NumVectorInstructions;
20701 case Instruction::Freeze: {
20702 setInsertPointAfterBundle(
E);
20704 Value *
Op = vectorizeOperand(
E, 0);
20706 if (
Op->getType() != VecTy) {
20708 MinBWs.contains(getOperandEntry(
E, 0))) &&
20709 "Expected item in MinBWs.");
20710 Op = Builder.CreateIntCast(
Op, VecTy, GetOperandSignedness(0));
20712 Value *
V = Builder.CreateFreeze(
Op);
20713 V = FinalShuffle(V,
E);
20715 E->VectorizedValue =
V;
20716 ++NumVectorInstructions;
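      // Binary operators: an 'and' whose constant mask already covers the
      // demoted bit width can reuse an operand directly, and 'nuw' is dropped
      // from a vectorized 'sub' when copyable/commutative elements may have
      // had their operands swapped.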
20720 case Instruction::Add:
20721 case Instruction::FAdd:
20722 case Instruction::Sub:
20723 case Instruction::FSub:
20724 case Instruction::Mul:
20725 case Instruction::FMul:
20726 case Instruction::UDiv:
20727 case Instruction::SDiv:
20728 case Instruction::FDiv:
20729 case Instruction::URem:
20730 case Instruction::SRem:
20731 case Instruction::FRem:
20732 case Instruction::Shl:
20733 case Instruction::LShr:
20734 case Instruction::AShr:
20735 case Instruction::And:
20736 case Instruction::Or:
20737 case Instruction::Xor: {
20738 setInsertPointAfterBundle(
E);
20742 if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
20747 return CI && CI->getValue().countr_one() >= It->second.first;
20749 V = FinalShuffle(
I == 0 ?
RHS :
LHS,
E);
20750 E->VectorizedValue =
V;
20751 ++NumVectorInstructions;
20759 MinBWs.contains(getOperandEntry(
E, 0)) ||
20760 MinBWs.contains(getOperandEntry(
E, 1))) &&
20761 "Expected item in MinBWs.");
20763 LHS = Builder.CreateIntCast(
LHS, VecTy, GetOperandSignedness(0));
20765 RHS = Builder.CreateIntCast(
RHS, VecTy, GetOperandSignedness(1));
20768 Value *
V = Builder.CreateBinOp(
20775 if (!MinBWs.contains(
E) && ShuffleOrOp == Instruction::Sub &&
20777 return isa<PoisonValue>(V) ||
20778 (E->hasCopyableElements() && E->isCopyableElement(V)) ||
20779 isCommutative(cast<Instruction>(V));
20781 I->setHasNoUnsignedWrap(
false);
20784 V = FinalShuffle(V,
E);
20786 E->VectorizedValue =
V;
20787 ++NumVectorInstructions;
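      // Loads are emitted according to the entry state: a plain wide load for
      // consecutive accesses (Vectorize), a masked/interleaved load plus
      // compress shuffle (CompressVectorize), the
      // llvm.experimental.vp.strided.load intrinsic for constant- or
      // runtime-strided accesses (StridedVectorize), and a masked gather for
      // arbitrary pointers (ScatterVectorize).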
20791 case Instruction::Load: {
20794 setInsertPointAfterBundle(
E);
20798 FixedVectorType *StridedLoadTy =
nullptr;
20799 Value *PO = LI->getPointerOperand();
20800 if (
E->State == TreeEntry::Vectorize) {
20801 NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign());
20802 }
else if (
E->State == TreeEntry::CompressVectorize) {
20803 auto [CompressMask, LoadVecTy, InterleaveFactor, IsMasked] =
20804 CompressEntryToData.at(
E);
20805 Align CommonAlignment = LI->getAlign();
20811 for (
int I : CompressMask)
20815 MaskValues =
replicateMask(MaskValues, VecTy->getNumElements());
20818 NewLI = Builder.CreateMaskedLoad(LoadVecTy, PO, CommonAlignment,
20821 NewLI = Builder.CreateAlignedLoad(LoadVecTy, PO, CommonAlignment);
20832 }
else if (
E->State == TreeEntry::StridedVectorize) {
20835 PO = IsReverseOrder ? PtrN : Ptr0;
20836 Type *StrideTy = DL->getIndexType(PO->
getType());
20838 const StridedPtrInfo &SPtrInfo = TreeEntryToStridedPtrInfoMap.at(
E);
20839 StridedLoadTy = SPtrInfo.Ty;
20840 assert(StridedLoadTy &&
"Missing StridedPoinerInfo for tree entry.");
20841 unsigned StridedLoadEC =
20844 Value *Stride = SPtrInfo.StrideVal;
20846 const SCEV *StrideSCEV = SPtrInfo.StrideSCEV;
20847 assert(StrideSCEV &&
"Neither StrideVal nor StrideSCEV were set.");
20848 SCEVExpander Expander(*SE,
"strided-load-vec");
20849 Stride = Expander.expandCodeFor(StrideSCEV, StrideSCEV->
getType(),
20850 &*Builder.GetInsertPoint());
20853 Builder.CreateIntCast(Stride, StrideTy,
true);
20854 StrideVal = Builder.CreateMul(
20856 StrideTy, (IsReverseOrder ? -1 : 1) *
20858 DL->getTypeAllocSize(ScalarTy))));
20860 auto *Inst = Builder.CreateIntrinsic(
20861 Intrinsic::experimental_vp_strided_load,
20862 {StridedLoadTy, PO->
getType(), StrideTy},
20865 Builder.getInt32(StridedLoadEC)});
20866 Inst->addParamAttr(
20868 Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
20871 assert(
E->State == TreeEntry::ScatterVectorize &&
"Unhandled state");
20872 Value *VecPtr = vectorizeOperand(
E, 0);
20877 unsigned ScalarTyNumElements =
20879 unsigned VecTyNumElements =
20881 assert(VecTyNumElements % ScalarTyNumElements == 0 &&
20882 "Cannot expand getelementptr.");
20883 unsigned VF = VecTyNumElements / ScalarTyNumElements;
20886 return Builder.getInt64(I % ScalarTyNumElements);
20888 VecPtr = Builder.CreateGEP(
20889 VecTy->getElementType(),
20890 Builder.CreateShuffleVector(
20896 NewLI = Builder.CreateMaskedGather(VecTy, VecPtr, CommonAlignment);
20898 Value *
V =
E->State == TreeEntry::CompressVectorize
20902 if (StridedLoadTy != VecTy)
20903 V = Builder.CreateBitOrPointerCast(V, VecTy);
20904 V = FinalShuffle(V,
E);
20905 E->VectorizedValue =
V;
20906 ++NumVectorInstructions;
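      // Rough shape of the emitted IR for a reversed strided load of four i32
      // elements (exact type mangling and mask printing may differ):
      //   %v = call <4 x i32> @llvm.experimental.vp.strided.load.v4i32.p0.i64(
      //            ptr %base, i64 -4, <4 x i1> splat (i1 true), i32 4)
      // The strided-store path below uses llvm.experimental.vp.strided.store
      // with the same byte-stride convention.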
20909 case Instruction::Store: {
20912 setInsertPointAfterBundle(
E);
20914 Value *VecValue = vectorizeOperand(
E, 0);
20915 if (VecValue->
getType() != VecTy)
20917 Builder.CreateIntCast(VecValue, VecTy, GetOperandSignedness(0));
20918 VecValue = FinalShuffle(VecValue,
E);
20920 Value *Ptr =
SI->getPointerOperand();
20922 if (
E->State == TreeEntry::Vectorize) {
20923 ST = Builder.CreateAlignedStore(VecValue, Ptr,
SI->getAlign());
20925 assert(
E->State == TreeEntry::StridedVectorize &&
20926 "Expected either strided or consecutive stores.");
20927 if (!
E->ReorderIndices.empty()) {
20929 Ptr =
SI->getPointerOperand();
20932 Type *StrideTy = DL->getIndexType(
SI->getPointerOperandType());
20933 auto *Inst = Builder.CreateIntrinsic(
20934 Intrinsic::experimental_vp_strided_store,
20935 {VecTy, Ptr->
getType(), StrideTy},
20938 StrideTy, -
static_cast<int>(DL->getTypeAllocSize(ScalarTy))),
20939 Builder.getAllOnesMask(VecTy->getElementCount()),
20940 Builder.getInt32(
E->Scalars.size())});
20941 Inst->addParamAttr(
20943 Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
20949 E->VectorizedValue =
V;
20950 ++NumVectorInstructions;
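      // GetElementPtr: the base pointer and every index operand are vectorized
      // separately and recombined into a single vector GEP.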
20953 case Instruction::GetElementPtr: {
20955 setInsertPointAfterBundle(
E);
20957 Value *Op0 = vectorizeOperand(
E, 0);
20960 for (
int J = 1,
N = GEP0->getNumOperands(); J <
N; ++J) {
20961 Value *OpVec = vectorizeOperand(
E, J);
20965 Value *
V = Builder.CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs);
20968 for (
Value *V :
E->Scalars) {
20975 V = FinalShuffle(V,
E);
20977 E->VectorizedValue =
V;
20978 ++NumVectorInstructions;
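      // Calls are vectorized either through a vector intrinsic or through a
      // vector library function found via VFDatabase, whichever the cost model
      // deemed cheaper; arguments that must stay scalar are forwarded as-is.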
20982 case Instruction::Call: {
20984 setInsertPointAfterBundle(
E);
20989 CI,
ID, VecTy->getNumElements(),
20990 It != MinBWs.end() ? It->second.first : 0, TTI);
20993 VecCallCosts.first <= VecCallCosts.second;
20995 Value *ScalarArg =
nullptr;
21006 ScalarArg = CEI->getArgOperand(
I);
21009 if (
ID == Intrinsic::abs && It != MinBWs.end() &&
21010 It->second.first < DL->getTypeSizeInBits(CEI->getType()))
21011 ScalarArg = Builder.getFalse();
21018 Value *OpVec = vectorizeOperand(
E,
I);
21019 ScalarArg = CEI->getArgOperand(
I);
21022 It == MinBWs.end()) {
21025 OpVec = Builder.CreateIntCast(OpVec, CastTy, GetOperandSignedness(
I));
21026 }
else if (It != MinBWs.end()) {
21027 OpVec = Builder.CreateIntCast(OpVec, VecTy, GetOperandSignedness(
I));
21036 if (!UseIntrinsic) {
21041 CF = VFDatabase(*CI).getVectorizedFunction(Shape);
21048 Value *
V = Builder.CreateCall(CF, OpVecs, OpBundles);
21051 V = FinalShuffle(V,
E);
21053 E->VectorizedValue =
V;
21054 ++NumVectorInstructions;
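      // ShuffleVector entries cover both real shufflevector instructions and
      // alternate-opcode bundles: the latter emit both opcodes over the full
      // width and blend the results with a mask built by
      // buildAltOpShuffleMask().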
21057 case Instruction::ShuffleVector: {
21060 setInsertPointAfterBundle(
E);
21061 Value *Src = vectorizeOperand(
E, 0);
21064 SmallVector<int> NewMask(ThisMask.size());
21066 return SVSrc->getShuffleMask()[Mask];
21068 V = Builder.CreateShuffleVector(SVSrc->getOperand(0),
21069 SVSrc->getOperand(1), NewMask);
21071 V = Builder.CreateShuffleVector(Src, ThisMask);
21076 V = FinalShuffle(V,
E);
21084 "Invalid Shuffle Vector Operand");
21088 setInsertPointAfterBundle(
E);
21089 LHS = vectorizeOperand(
E, 0);
21090 RHS = vectorizeOperand(
E, 1);
21092 setInsertPointAfterBundle(
E);
21093 LHS = vectorizeOperand(
E, 0);
21099 assert((It != MinBWs.end() ||
21100 getOperandEntry(
E, 0)->State == TreeEntry::NeedToGather ||
21101 getOperandEntry(
E, 1)->State == TreeEntry::NeedToGather ||
21102 MinBWs.contains(getOperandEntry(
E, 0)) ||
21103 MinBWs.contains(getOperandEntry(
E, 1))) &&
21104 "Expected item in MinBWs.");
21105 Type *CastTy = VecTy;
21111 ->getIntegerBitWidth())
21117 LHS = Builder.CreateIntCast(
LHS, CastTy, GetOperandSignedness(0));
21119 RHS = Builder.CreateIntCast(
RHS, CastTy, GetOperandSignedness(1));
21124 V0 = Builder.CreateBinOp(
21126 V1 = Builder.CreateBinOp(
21129 V0 = Builder.CreateCmp(CI0->getPredicate(),
LHS,
RHS);
21132 V1 = Builder.CreateCmp(AltPred,
LHS,
RHS);
21135 unsigned SrcBWSz = DL->getTypeSizeInBits(
21137 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
21138 if (BWSz <= SrcBWSz) {
21139 if (BWSz < SrcBWSz)
21140 LHS = Builder.CreateIntCast(
LHS, VecTy, It->second.first);
21142 "Expected same type as operand.");
21146 E->VectorizedValue =
LHS;
21147 ++NumVectorInstructions;
21151 V0 = Builder.CreateCast(
21153 V1 = Builder.CreateCast(
21158 for (
Value *V : {V0, V1}) {
21160 GatherShuffleExtractSeq.insert(
I);
21161 CSEBlocks.insert(
I->getParent());
21169 SmallVector<int>
Mask;
21170 E->buildAltOpShuffleMask(
21171 [
E,
this](Instruction *
I) {
21172 assert(
E->getMatchingMainOpOrAltOp(
I) &&
21173 "Unexpected main/alternate opcode");
21177 Mask, &OpScalars, &AltScalars);
21181 auto DropNuwFlag = [&](
Value *Vec,
unsigned Opcode) {
21184 I && Opcode == Instruction::Sub && !MinBWs.contains(
E) &&
21186 if (isa<PoisonValue>(V))
21188 if (E->hasCopyableElements() && E->isCopyableElement(V))
21190 auto *IV = cast<Instruction>(V);
21191 return IV->getOpcode() == Instruction::Sub && isCommutative(IV);
21193 I->setHasNoUnsignedWrap(
false);
21195 DropNuwFlag(V0,
E->getOpcode());
21196 DropNuwFlag(V1,
E->getAltOpcode());
21202 V = Builder.CreateShuffleVector(V0, V1, Mask);
21205 GatherShuffleExtractSeq.insert(
I);
21206 CSEBlocks.insert(
I->getParent());
21210 E->VectorizedValue =
V;
21211 ++NumVectorInstructions;
21215 case TreeEntry::ReducedBitcast:
21216 case TreeEntry::ReducedBitcastBSwap: {
21217 assert(UserIgnoreList &&
"Expected reduction operations only.");
21218 setInsertPointAfterBundle(
E);
21219 TreeEntry *ZExt = getOperandEntry(
E, 0);
21221 ZExt->getMainOp()->getType(), ZExt->getVectorFactor()));
21222 TreeEntry *
Const = getOperandEntry(
E, 1);
21224 Const->Scalars.front()->getType(),
Const->getVectorFactor()));
21225 Value *
Op = vectorizeOperand(ZExt, 0);
21228 Op = FinalShuffle(
Op,
E);
21229 auto *
V = Builder.CreateBitCast(
21232 DL->getTypeSizeInBits(ZExt->getMainOp()->getType())));
21233 if (ShuffleOrOp == TreeEntry::ReducedBitcastBSwap)
21234 V = Builder.CreateUnaryIntrinsic(Intrinsic::bswap, V);
21235 E->VectorizedValue =
V;
21236 ++NumVectorInstructions;
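// Emit vectorized code for the whole tree: vectorize every entry, rewrite
// external uses of the original scalars as extracts from the new vectors, and
// erase scalar instructions that have become dead.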
21253 ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales) {
21256 EntryToLastInstruction.clear();
21258 for (
auto &BSIter : BlocksSchedules)
21259 scheduleBlock(*
this, BSIter.second.get());
21262 for (
const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
21263 if (TE->isGather() || DeletedNodes.contains(TE.get()) ||
21264 (TE->State == TreeEntry::CombinedVectorize &&
21265 (TE->CombinedOp == TreeEntry::ReducedBitcast ||
21266 TE->CombinedOp == TreeEntry::ReducedBitcastBSwap)))
21268 (void)getLastInstructionInBundle(TE.get());
21272 Builder.SetInsertPoint(ReductionRoot->
getParent(),
21275 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
21279 for (
const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
21280 if (DeletedNodes.contains(TE.get()))
21282 if (TE->isGather() && !TE->VectorizedValue && TE->UserTreeIndex.UserTE &&
21283 TE->UserTreeIndex.UserTE->hasState() &&
21284 TE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
21285 (TE->UserTreeIndex.UserTE->getOpcode() != Instruction::PHI ||
21286 TE->UserTreeIndex.UserTE->isAltShuffle()) &&
21287 !TE->UserTreeIndex.UserTE->hasCopyableElements() &&
21288 all_of(TE->UserTreeIndex.UserTE->Scalars,
21289 [](
Value *V) { return isUsedOutsideBlock(V); })) {
21291 getLastInstructionInBundle(TE->UserTreeIndex.UserTE);
21295 for (
auto &Entry : GatherEntries) {
21297 Builder.SetInsertPoint(Entry.second);
21298 Builder.SetCurrentDebugLocation(Entry.second->getDebugLoc());
21303 for (
const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
21304 if (DeletedNodes.contains(TE.get()))
21306 if (GatheredLoadsEntriesFirst.has_value() &&
21307 TE->Idx >= *GatheredLoadsEntriesFirst && !TE->VectorizedValue &&
21308 (!TE->isGather() || TE->UserTreeIndex)) {
21309 assert((TE->UserTreeIndex ||
21310 (TE->getOpcode() == Instruction::Load && !TE->isGather())) &&
21311 "Expected gathered load node.");
21320 for (
const TreeEntry *E : PostponedNodes) {
21321 auto *TE =
const_cast<TreeEntry *
>(E);
21323 TE->VectorizedValue =
nullptr;
21334 (TE->UserTreeIndex.UserTE->hasState() &&
21335 TE->UserTreeIndex.UserTE->getOpcode() == Instruction::PHI)) {
21344 if (UI->comesBefore(InsertPt))
21347 Builder.SetInsertPoint(InsertPt);
21349 Builder.SetInsertPoint(PrevVec);
21351 Builder.SetCurrentDebugLocation(UserI->getDebugLoc());
21354 VecI && VecI->getParent() == Builder.GetInsertBlock() &&
21355 Builder.GetInsertPoint()->comesBefore(VecI))
21356 VecI->moveBeforePreserving(*Builder.GetInsertBlock(),
21357 Builder.GetInsertPoint());
21358 if (Vec->
getType() != PrevVec->getType()) {
21360 PrevVec->getType()->isIntOrIntVectorTy() &&
21361 "Expected integer vector types only.");
21362 std::optional<bool> IsSigned;
21363 for (
Value *V : TE->Scalars) {
21365 for (
const TreeEntry *MNTE : getTreeEntries(V)) {
21366 auto It = MinBWs.find(MNTE);
21367 if (It != MinBWs.end()) {
21368 IsSigned = IsSigned.value_or(
false) || It->second.second;
21373 if (IsSigned.value_or(
false))
21376 for (
const TreeEntry *BVE : ValueToGatherNodes.lookup(V)) {
21377 auto It = MinBWs.find(BVE);
21378 if (It != MinBWs.end()) {
21379 IsSigned = IsSigned.value_or(
false) || It->second.second;
21384 if (IsSigned.value_or(
false))
21388 IsSigned.value_or(
false) ||
21392 if (IsSigned.value_or(
false))
21396 if (IsSigned.value_or(
false)) {
21398 auto It = MinBWs.find(TE->UserTreeIndex.UserTE);
21399 if (It != MinBWs.end())
21400 IsSigned = It->second.second;
21403 "Expected user node or perfect diamond match in MinBWs.");
21404 Vec = Builder.CreateIntCast(Vec, PrevVec->
getType(), *IsSigned);
21406 PrevVec->replaceAllUsesWith(Vec);
21407 PostponedValues.
try_emplace(Vec).first->second.push_back(TE);
21410 auto It = PostponedValues.
find(PrevVec);
21411 if (It != PostponedValues.
end()) {
21412 for (TreeEntry *VTE : It->getSecond())
21413 VTE->VectorizedValue = Vec;
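  // Rewrite every recorded external use: extract the scalar's lane from its
  // vectorized value (re-using cached extracts per insertion block) and, if
  // the entry was demoted, extend it back to the original scalar type.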
21433 for (
const auto &ExternalUse : ExternalUses) {
21434 Value *Scalar = ExternalUse.Scalar;
21441 const TreeEntry *E = &ExternalUse.E;
21442 assert(E &&
"Invalid scalar");
21443 assert(!E->isGather() &&
"Extracting from a gather list");
21445 if (E->getOpcode() == Instruction::GetElementPtr &&
21449 Value *Vec = E->VectorizedValue;
21450 assert(Vec &&
"Can't find vectorizable value");
21452 Value *Lane = Builder.getInt32(ExternalUse.Lane);
21453 auto ExtractAndExtendIfNeeded = [&](
Value *Vec) {
21454 if (Scalar->getType() != Vec->
getType()) {
21455 Value *Ex =
nullptr;
21456 Value *ExV =
nullptr;
21458 bool ReplaceInst = Inst && ExternalUsesAsOriginalScalar.contains(Inst);
21459 auto It = ScalarToEEs.
find(Scalar);
21460 if (It != ScalarToEEs.
end()) {
21463 auto EEIt = It->second.find(ReplaceInst ? Inst->getParent()
21464 : Builder.GetInsertBlock());
21465 if (EEIt != It->second.end()) {
21466 Value *PrevV = EEIt->second.first;
21468 I && !ReplaceInst &&
21469 Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() &&
21470 Builder.GetInsertPoint()->comesBefore(
I)) {
21471 I->moveBefore(*Builder.GetInsertPoint()->getParent(),
21472 Builder.GetInsertPoint());
21477 ExV = EEIt->second.second ? EEIt->second.second : Ex;
21486 IgnoredExtracts.
insert(EE);
21489 auto *CloneInst = Inst->clone();
21490 CloneInst->insertBefore(Inst->getIterator());
21491 if (Inst->hasName())
21492 CloneInst->takeName(Inst);
21497 Value *V = ES->getVectorOperand();
21500 V = ETEs.front()->VectorizedValue;
21502 !
IV ||
IV == Vec ||
IV->getParent() != IVec->getParent() ||
21503 IV->comesBefore(IVec))
21504 Ex = Builder.CreateExtractElement(V, ES->getIndexOperand());
21506 Ex = Builder.CreateExtractElement(Vec, Lane);
21507 }
else if (
auto *VecTy =
21510 unsigned VecTyNumElements = VecTy->getNumElements();
21515 ExternalUse.Lane * VecTyNumElements);
21517 Ex = Builder.CreateExtractElement(Vec, Lane);
21522 if (Scalar->getType() != Ex->
getType())
21523 ExV = Builder.CreateIntCast(
21528 : &F->getEntryBlock(),
21529 std::make_pair(Ex, ExV));
21535 GatherShuffleExtractSeq.insert(ExI);
21536 CSEBlocks.insert(ExI->getParent());
21542 "In-tree scalar of vector type is not insertelement?");
21551 if (!ScalarsWithNullptrUser.
insert(Scalar).second)
21554 (ExternallyUsedValues.
count(Scalar) ||
21555 ExternalUsesWithNonUsers.count(Scalar) ||
21556 ExternalUsesAsOriginalScalar.contains(Scalar) ||
21560 if (ExternalUsesAsOriginalScalar.contains(U))
21562 ArrayRef<TreeEntry *> UseEntries = getTreeEntries(U);
21563 return !UseEntries.empty() &&
21564 (E->State == TreeEntry::Vectorize ||
21565 E->State == TreeEntry::StridedVectorize ||
21566 E->State == TreeEntry::CompressVectorize) &&
21567 any_of(UseEntries, [&, TTI = TTI](TreeEntry *UseEntry) {
21568 return (UseEntry->State == TreeEntry::Vectorize ||
21570 TreeEntry::StridedVectorize ||
21572 TreeEntry::CompressVectorize) &&
21573 doesInTreeUserNeedToExtract(
21574 Scalar, getRootEntryInstruction(*UseEntry),
21578 "Scalar with nullptr User must be registered in "
21579 "ExternallyUsedValues map or remain as scalar in vectorized "
21583 if (
PHI->getParent()->isLandingPad())
21584 Builder.SetInsertPoint(
21587 PHI->getParent()->getLandingPadInst()->getIterator()));
21589 Builder.SetInsertPoint(
PHI->getParent(),
21590 PHI->getParent()->getFirstNonPHIIt());
21592 Builder.SetInsertPoint(VecI->getParent(),
21593 std::next(VecI->getIterator()));
21596 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
21598 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
21600 if (Scalar != NewInst) {
21603 "Extractelements should not be replaced.");
21604 Scalar->replaceAllUsesWith(NewInst);
21614 if (!UsedInserts.
insert(VU).second)
21617 auto BWIt = MinBWs.find(E);
21619 auto *ScalarTy = FTy->getElementType();
21620 auto Key = std::make_pair(Vec, ScalarTy);
21621 auto VecIt = VectorCasts.
find(
Key);
21622 if (VecIt == VectorCasts.
end()) {
21625 if (IVec->getParent()->isLandingPad())
21626 Builder.SetInsertPoint(IVec->getParent(),
21627 std::next(IVec->getParent()
21628 ->getLandingPadInst()
21631 Builder.SetInsertPoint(
21632 IVec->getParent()->getFirstNonPHIOrDbgOrLifetime());
21634 Builder.SetInsertPoint(IVec->getNextNode());
21636 Vec = Builder.CreateIntCast(
21641 BWIt->second.second);
21644 Vec = VecIt->second;
21651 ShuffledInserts, [VU](
const ShuffledInsertData<Value *> &
Data) {
21658 unsigned Idx = *InsertIdx;
21659 if (It == ShuffledInserts.
end()) {
21661 It = std::next(ShuffledInserts.
begin(),
21662 ShuffledInserts.
size() - 1);
21667 Mask[Idx] = ExternalUse.Lane;
21679 for (
unsigned I :
seq<unsigned>(0, PH->getNumIncomingValues())) {
21680 if (PH->getIncomingValue(
I) == Scalar) {
21682 PH->getIncomingBlock(
I)->getTerminator();
21684 Builder.SetInsertPoint(VecI->getParent(),
21685 std::next(VecI->getIterator()));
21687 Builder.SetInsertPoint(PH->getIncomingBlock(
I)->getTerminator());
21689 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
21690 PH->setOperand(
I, NewInst);
21695 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
21699 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
21700 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
21711 for (
int I = 0, E = Mask.size();
I < E; ++
I) {
21713 CombinedMask1[
I] = Mask[
I];
21715 CombinedMask2[
I] = Mask[
I] - VF;
21717 ShuffleInstructionBuilder ShuffleBuilder(
21719 ShuffleBuilder.add(V1, CombinedMask1);
21721 ShuffleBuilder.add(V2, CombinedMask2);
21722 return ShuffleBuilder.finalize({}, {}, {});
21725 auto &&ResizeToVF = [&CreateShuffle](
Value *Vec, ArrayRef<int>
Mask,
21726 bool ForSingleMask) {
21727 unsigned VF =
Mask.size();
21730 if (
any_of(Mask, [VF](
int Idx) {
return Idx >=
static_cast<int>(VF); })) {
21731 Vec = CreateShuffle(Vec,
nullptr, Mask);
21732 return std::make_pair(Vec,
true);
21734 if (!ForSingleMask) {
21736 for (
unsigned I = 0;
I < VF; ++
I) {
21740 Vec = CreateShuffle(Vec,
nullptr, ResizeMask);
21744 return std::make_pair(Vec,
false);
21748 for (
int I = 0,
E = ShuffledInserts.size();
I <
E; ++
I) {
21751 InsertElementInst *FirstInsert = ShuffledInserts[
I].InsertElements.front();
21752 InsertElementInst *LastInsert = ShuffledInserts[
I].InsertElements.back();
21753 Builder.SetInsertPoint(LastInsert);
21754 auto Vector = ShuffledInserts[
I].ValueMasks.takeVector();
21759 return cast<VectorType>(Vec->getType())
21760 ->getElementCount()
21761 .getKnownMinValue();
21764 [FirstInsert, &CreateShuffle](ArrayRef<int> Mask,
21766 assert((Vals.size() == 1 || Vals.size() == 2) &&
21767 "Expected exactly 1 or 2 input values.");
21768 if (Vals.size() == 1) {
21771 if (Mask.size() != cast<FixedVectorType>(Vals.front()->getType())
21772 ->getNumElements() ||
21773 !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
21774 return CreateShuffle(Vals.front(), nullptr, Mask);
21775 return Vals.front();
21777 return CreateShuffle(Vals.
front() ? Vals.
front()
21779 Vals.
back(), Mask);
21781 auto It = ShuffledInserts[
I].InsertElements.rbegin();
21783 InsertElementInst *
II =
nullptr;
21784 if (It != ShuffledInserts[
I].InsertElements.rend())
21787 while (It != ShuffledInserts[
I].InsertElements.rend()) {
21788 assert(
II &&
"Must be an insertelement instruction.");
21795 for (Instruction *
II :
reverse(Inserts)) {
21796 II->replaceUsesOfWith(
II->getOperand(0), NewInst);
21798 if (
II->getParent() == NewI->getParent() &&
II->comesBefore(NewI))
21799 II->moveAfter(NewI);
21803 for (InsertElementInst *IE :
reverse(ShuffledInserts[
I].InsertElements)) {
21804 IE->replaceUsesOfWith(
IE->getOperand(0),
21806 IE->replaceUsesOfWith(
IE->getOperand(1),
21810 CSEBlocks.insert(LastInsert->
getParent());
21815 for (
auto &TEPtr : VectorizableTree) {
21816 TreeEntry *
Entry = TEPtr.get();
21819 if (
Entry->isGather() ||
Entry->State == TreeEntry::SplitVectorize ||
21820 DeletedNodes.contains(Entry) ||
21821 TransformedToGatherNodes.contains(Entry))
21824 if (
Entry->CombinedOp == TreeEntry::ReducedBitcast ||
21825 Entry->CombinedOp == TreeEntry::ReducedBitcastBSwap) {
21827 if (!
Entry->hasState()) {
21834 if (!
I ||
Entry->isCopyableElement(
I))
21842 assert(
Entry->VectorizedValue &&
"Can't find vectorizable value");
21845 for (
int Lane = 0, LE =
Entry->Scalars.size(); Lane != LE; ++Lane) {
21848 if (
Entry->getOpcode() == Instruction::GetElementPtr &&
21852 EE && IgnoredExtracts.contains(EE))
21859 for (User *U :
Scalar->users()) {
21864 (UserIgnoreList && UserIgnoreList->contains(U)) ||
21867 "Deleting out-of-tree value");
21871 LLVM_DEBUG(
dbgs() <<
"SLP: \tErasing scalar:" << *Scalar <<
".\n");
21880 V->mergeDIAssignID(RemovedInsts);
21883 if (UserIgnoreList) {
21884 for (Instruction *
I : RemovedInsts) {
21885 const TreeEntry *
IE = getTreeEntries(
I).front();
21887 !SplitEntries.empty() && SplitEntries.front()->Idx <
IE->Idx)
21888 IE = SplitEntries.front();
21889 if (
IE->Idx != 0 &&
21890 !(VectorizableTree.front()->isGather() &&
IE->UserTreeIndex &&
21891 (ValueToGatherNodes.lookup(
I).contains(
21892 VectorizableTree.front().get()) ||
21893 (
IE->UserTreeIndex.UserTE == VectorizableTree.front().get() &&
21894 IE->UserTreeIndex.EdgeIdx == UINT_MAX))) &&
21895 !(VectorizableTree.front()->State == TreeEntry::SplitVectorize &&
21896 IE->UserTreeIndex &&
21898 !(GatheredLoadsEntriesFirst.has_value() &&
21899 IE->Idx >= *GatheredLoadsEntriesFirst &&
21900 VectorizableTree.front()->isGather() &&
21902 !(!VectorizableTree.front()->isGather() &&
21903 VectorizableTree.front()->isCopyableElement(
I)))
21908 bool IsPoisoningLogicalOp = isa<SelectInst>(U.getUser()) &&
21909 (match(U.getUser(), m_LogicalAnd()) ||
21910 match(U.getUser(), m_LogicalOr())) &&
21911 U.getOperandNo() == 0;
21912 if (IsPoisoningLogicalOp) {
21913 LogicalOpSelects.push_back(cast<SelectInst>(U.getUser()));
21916 return UserIgnoreList->contains(
U.getUser());
21920 for (SelectInst *SI : LogicalOpSelects)
21930 Builder.ClearInsertionPoint();
21931 InstrElementSize.clear();
21933 const TreeEntry &RootTE = *VectorizableTree.front();
21934 Value *Vec = RootTE.VectorizedValue;
21935 if (
auto It = MinBWs.find(&RootTE); ReductionBitWidth != 0 &&
21936 It != MinBWs.end() &&
21937 ReductionBitWidth != It->second.first) {
21938 IRBuilder<>::InsertPointGuard Guard(Builder);
21939 Builder.SetInsertPoint(ReductionRoot->getParent(),
21940 ReductionRoot->getIterator());
21941 Vec = Builder.CreateIntCast(
21943 VectorType::get(Builder.getIntNTy(ReductionBitWidth),
21945 It->second.second);
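// CSE the emitted gather/shuffle/extract sequences: sequences that are
// invariant in a loop are hoisted to the preheader, and an instruction that is
// identical to (or less defined than) a dominating one is replaced by it.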
21951 LLVM_DEBUG(
dbgs() <<
"SLP: Optimizing " << GatherShuffleExtractSeq.size()
21952 <<
" gather sequences instructions.\n");
21959 Loop *L = LI->getLoopFor(
I->getParent());
21964 BasicBlock *PreHeader = L->getLoopPreheader();
21972 auto *OpI = dyn_cast<Instruction>(V);
21973 return OpI && L->contains(OpI);
21979 CSEBlocks.insert(PreHeader);
21984 CSEWorkList.
reserve(CSEBlocks.size());
21987 assert(DT->isReachableFromEntry(
N));
    assert((A == B) == (A->getDFSNumIn() == B->getDFSNumIn()) &&
           "Different nodes should have different DFS numbers");
    return A->getDFSNumIn() < B->getDFSNumIn();
22004 auto &&IsIdenticalOrLessDefined = [TTI = TTI](
Instruction *I1,
22007 if (I1->getType() != I2->getType())
22012 return I1->isIdenticalTo(I2);
22013 if (SI1->isIdenticalTo(SI2))
22015 for (
int I = 0, E = SI1->getNumOperands();
I < E; ++
I)
22016 if (SI1->getOperand(
I) != SI2->getOperand(
I))
22019 NewMask.
assign(SI2->getShuffleMask().begin(), SI2->getShuffleMask().end());
22023 unsigned LastUndefsCnt = 0;
22024 for (
int I = 0, E = NewMask.
size();
I < E; ++
I) {
22030 NewMask[
I] != SM1[
I])
22033 NewMask[
I] = SM1[
I];
22037 return SM1.
size() - LastUndefsCnt > 1 &&
22041 SM1.
size() - LastUndefsCnt));
22047 for (
auto I = CSEWorkList.
begin(), E = CSEWorkList.
end();
I != E; ++
I) {
22049 (
I == CSEWorkList.
begin() || !DT->dominates(*
I, *std::prev(
I))) &&
22050 "Worklist not sorted properly!");
22057 !GatherShuffleExtractSeq.contains(&In))
22062 bool Replaced =
false;
22065 if (IsIdenticalOrLessDefined(&In, V, NewMask) &&
22066 DT->dominates(V->getParent(), In.getParent())) {
22067 In.replaceAllUsesWith(V);
22070 if (!NewMask.
empty())
22071 SI->setShuffleMask(NewMask);
22076 GatherShuffleExtractSeq.contains(V) &&
22077 IsIdenticalOrLessDefined(V, &In, NewMask) &&
22078 DT->dominates(In.getParent(), V->getParent())) {
22080 V->replaceAllUsesWith(&In);
22083 if (!NewMask.
empty())
22084 SI->setShuffleMask(NewMask);
22092 Visited.push_back(&In);
22097 GatherShuffleExtractSeq.clear();
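// Scheduling: a ScheduleBundle groups the ScheduleData (or copyable data) of
// all scalars in one tree entry so they can be scheduled and moved as a unit.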
22100BoUpSLP::ScheduleBundle &BoUpSLP::BlockScheduling::buildBundle(
22103 ScheduledBundlesList.emplace_back(std::make_unique<ScheduleBundle>());
22104 for (
Value *V : VL) {
22105 if (S.isNonSchedulable(V))
22108 if (S.isCopyableElement(V)) {
22110 ScheduleCopyableData &SD =
22111 addScheduleCopyableData(EI,
I, SchedulingRegionID, *BundlePtr);
22113 BundlePtr->add(&SD);
22116 ScheduleData *BundleMember = getScheduleData(V);
22117 assert(BundleMember &&
"no ScheduleData for bundle member "
22118 "(maybe not in same basic block)");
22120 BundlePtr->add(BundleMember);
22121 ScheduledBundles.try_emplace(
I).first->getSecond().push_back(
22124 assert(BundlePtr && *BundlePtr &&
"Failed to find schedule bundle");
22130std::optional<BoUpSLP::ScheduleBundle *>
22132 const InstructionsState &S,
22145 if (S.areInstructionsWithCopyableElements() && EI && EI.UserTE->hasState() &&
22146 EI.UserTE->doesNotNeedToSchedule() &&
22147 EI.UserTE->getOpcode() != Instruction::PHI &&
22149 auto *I = dyn_cast<Instruction>(V);
22150 if (!I || I->hasOneUser())
22152 for (User *U : I->users()) {
22153 auto *UI = cast<Instruction>(U);
22154 if (isa<BinaryOperator>(UI))
22159 return std::nullopt;
22160 if (S.areInstructionsWithCopyableElements() && EI && EI.UserTE->hasState() &&
22161 EI.UserTE->hasCopyableElements() &&
22162 EI.UserTE->getMainOp()->getParent() == S.getMainOp()->getParent() &&
22164 if (S.isCopyableElement(V))
22168 return std::nullopt;
22171 if (S.areInstructionsWithCopyableElements() &&
any_of(VL, [&](
Value *V) {
22184 return std::nullopt;
22185 if (S.areInstructionsWithCopyableElements() && EI) {
22186 bool IsNonSchedulableWithParentPhiNode =
22187 EI.UserTE->doesNotNeedToSchedule() && EI.UserTE->UserTreeIndex &&
22188 EI.UserTE->UserTreeIndex.UserTE->hasState() &&
22189 EI.UserTE->UserTreeIndex.UserTE->State != TreeEntry::SplitVectorize &&
22190 EI.UserTE->UserTreeIndex.UserTE->getOpcode() == Instruction::PHI;
22191 if (IsNonSchedulableWithParentPhiNode) {
22192 SmallSet<std::pair<Value *, Value *>, 4> Values;
22193 for (
const auto [Idx, V] :
22194 enumerate(EI.UserTE->UserTreeIndex.UserTE->Scalars)) {
22195 Value *
Op = EI.UserTE->UserTreeIndex.UserTE->getOperand(
22196 EI.UserTE->UserTreeIndex.EdgeIdx)[Idx];
22200 if (!Values.
insert(std::make_pair(V,
Op)).second)
22201 return std::nullopt;
22207 if (EI.UserTE->hasCopyableElements() &&
22208 EI.UserTE->isCopyableElement(V))
22210 ArrayRef<TreeEntry *> Entries = SLP->getTreeEntries(V);
22211 return any_of(Entries, [](const TreeEntry *TE) {
22212 return TE->doesNotNeedToSchedule() && TE->UserTreeIndex &&
22213 TE->UserTreeIndex.UserTE->hasState() &&
22214 TE->UserTreeIndex.UserTE->State !=
22215 TreeEntry::SplitVectorize &&
22216 TE->UserTreeIndex.UserTE->getOpcode() == Instruction::PHI;
22219 return std::nullopt;
22222 bool HasCopyables = S.areInstructionsWithCopyableElements();
22224 all_of(VL, [&](
Value *V) {
return S.isNonSchedulable(V); }))) {
22228 SmallVector<ScheduleData *> ControlDependentMembers;
22229 for (
Value *V : VL) {
22231 if (!
I || (HasCopyables && S.isCopyableElement(V)))
22233 SmallDenseMap<std::pair<Instruction *, Value *>,
unsigned> UserOpToNumOps;
22234 for (
const Use &U :
I->operands()) {
22237 .first->getSecond();
22240 Op && areAllOperandsReplacedByCopyableData(
I,
Op, *SLP,
NumOps)) {
22241 if (ScheduleData *OpSD = getScheduleData(
Op);
22242 OpSD && OpSD->hasValidDependencies())
22244 return std::nullopt;
22253 LLVM_DEBUG(
dbgs() <<
"SLP: bundle: " << *S.getMainOp() <<
"\n");
22255 auto TryScheduleBundleImpl = [=](
bool ReSchedule, ScheduleBundle &Bundle) {
22258 SmallVector<ScheduleData *> ControlDependentMembers;
22259 auto CheckIfNeedToClearDeps = [&](ScheduleBundle &Bundle) {
22260 SmallDenseMap<std::pair<Instruction *, Value *>,
unsigned> UserOpToNumOps;
22261 for (ScheduleEntity *SE : Bundle.getBundle()) {
22263 if (ScheduleData *BundleMember = getScheduleData(SD->getInst());
22264 BundleMember && BundleMember->hasValidDependencies()) {
22265 BundleMember->clearDirectDependencies();
22266 if (RegionHasStackSave ||
22268 BundleMember->getInst()))
22269 ControlDependentMembers.
push_back(BundleMember);
22274 if (SD->hasValidDependencies() &&
22275 (!S.areInstructionsWithCopyableElements() ||
22276 !S.isCopyableElement(SD->getInst())) &&
22277 !getScheduleCopyableData(SD->getInst()).empty() && EI.UserTE &&
22278 EI.UserTE->hasState() &&
22279 (!EI.UserTE->hasCopyableElements() ||
22280 !EI.UserTE->isCopyableElement(SD->getInst())))
22281 SD->clearDirectDependencies();
22282 for (
const Use &U : SD->getInst()->operands()) {
22285 .
try_emplace(std::make_pair(SD->getInst(),
U.get()), 0)
22286 .first->getSecond();
22289 Op && areAllOperandsReplacedByCopyableData(SD->getInst(),
Op,
22291 if (ScheduleData *OpSD = getScheduleData(
Op);
22292 OpSD && OpSD->hasValidDependencies()) {
22293 OpSD->clearDirectDependencies();
22294 if (RegionHasStackSave ||
22296 ControlDependentMembers.
push_back(OpSD);
22307 if (OldScheduleEnd && ScheduleEnd != OldScheduleEnd) {
22308 for_each(ScheduleDataMap, [&](
auto &
P) {
22309 if (BB !=
P.first->getParent())
22311 ScheduleData *SD =
P.second;
22312 if (isInSchedulingRegion(*SD))
22313 SD->clearDependencies();
22315 for_each(ScheduleCopyableDataMapByInst, [&](
auto &
P) {
22316 for_each(
P.second, [&](ScheduleCopyableData *SD) {
22317 if (isInSchedulingRegion(*SD))
22318 SD->clearDependencies();
22325 if (Bundle && !Bundle.getBundle().empty()) {
22326 if (S.areInstructionsWithCopyableElements() ||
22327 !ScheduleCopyableDataMap.empty())
22328 CheckIfNeedToClearDeps(Bundle);
22329 LLVM_DEBUG(
dbgs() <<
"SLP: try schedule bundle " << Bundle <<
" in block "
22331 calculateDependencies(Bundle, !ReSchedule, SLP,
22332 ControlDependentMembers);
22333 }
else if (!ControlDependentMembers.
empty()) {
22334 ScheduleBundle
Invalid = ScheduleBundle::invalid();
22335 calculateDependencies(
Invalid, !ReSchedule, SLP,
22336 ControlDependentMembers);
22341 initialFillReadyList(ReadyInsts);
22348 while (((!Bundle && ReSchedule) || (Bundle && !Bundle.isReady())) &&
22349 !ReadyInsts.empty()) {
22350 ScheduleEntity *Picked = ReadyInsts.pop_back_val();
22351 assert(Picked->isReady() &&
"must be ready to schedule");
22352 schedule(*SLP, S, EI, Picked, ReadyInsts);
22353 if (Picked == &Bundle)
22360 for (
Value *V : VL) {
22361 if (S.isNonSchedulable(V))
22363 if (!extendSchedulingRegion(V, S)) {
22370 ScheduleBundle
Invalid = ScheduleBundle::invalid();
22371 TryScheduleBundleImpl(
false,
Invalid);
22372 return std::nullopt;
22376 bool ReSchedule =
false;
22377 for (
Value *V : VL) {
22378 if (S.isNonSchedulable(V))
22382 if (!CopyableData.
empty()) {
22383 for (ScheduleCopyableData *SD : CopyableData)
22384 ReadyInsts.remove(SD);
22386 ScheduleData *BundleMember = getScheduleData(V);
22387 assert((BundleMember || S.isCopyableElement(V)) &&
22388 "no ScheduleData for bundle member (maybe not in same basic block)");
22394 ReadyInsts.remove(BundleMember);
22396 !Bundles.
empty()) {
22397 for (ScheduleBundle *
B : Bundles)
22398 ReadyInsts.remove(
B);
22401 if (!S.isCopyableElement(V) && !BundleMember->isScheduled())
22408 LLVM_DEBUG(
dbgs() <<
"SLP: reset schedule because " << *BundleMember
22409 <<
" was already scheduled\n");
22413 ScheduleBundle &Bundle = buildBundle(VL, S, EI);
22414 TryScheduleBundleImpl(ReSchedule, Bundle);
22415 if (!Bundle.isReady()) {
22416 for (ScheduleEntity *BD : Bundle.getBundle()) {
22420 if (BD->isReady()) {
22422 if (Bundles.
empty()) {
22423 ReadyInsts.insert(BD);
22426 for (ScheduleBundle *
B : Bundles)
22428 ReadyInsts.insert(
B);
22431 ScheduledBundlesList.pop_back();
22432 SmallVector<ScheduleData *> ControlDependentMembers;
22433 for (
Value *V : VL) {
22434 if (S.isNonSchedulable(V))
22437 if (S.isCopyableElement(
I)) {
22440 auto KV = std::make_pair(EI,
I);
22441 assert(ScheduleCopyableDataMap.contains(KV) &&
22442 "no ScheduleCopyableData for copyable element");
22443 ScheduleCopyableData *SD =
22444 ScheduleCopyableDataMapByInst.find(
I)->getSecond().pop_back_val();
22445 ScheduleCopyableDataMapByUsers[
I].remove(SD);
22448 const auto *It =
find(
Op,
I);
22449 assert(It !=
Op.end() &&
"Lane not set");
22450 SmallPtrSet<Instruction *, 4> Visited;
22452 int Lane = std::distance(
Op.begin(), It);
22453 assert(Lane >= 0 &&
"Lane not set");
22455 !EI.UserTE->ReorderIndices.empty())
22456 Lane = EI.UserTE->ReorderIndices[Lane];
22457 assert(Lane <
static_cast<int>(EI.UserTE->Scalars.size()) &&
22458 "Couldn't find extract lane");
22460 if (!Visited.
insert(In).second) {
22464 ScheduleCopyableDataMapByInstUser
22465 [std::make_pair(std::make_pair(In, EI.EdgeIdx),
I)]
22468 }
while (It !=
Op.end());
22470 if (ScheduleCopyableData *UserCD = getScheduleCopyableData(UserEI,
I))
22471 ScheduleCopyableDataMapByUsers[
I].insert(UserCD);
22473 if (ScheduleCopyableDataMapByUsers[
I].
empty())
22474 ScheduleCopyableDataMapByUsers.erase(
I);
22475 ScheduleCopyableDataMap.erase(KV);
22477 if (ScheduleData *OpSD = getScheduleData(
I);
22478 OpSD && OpSD->hasValidDependencies()) {
22479 OpSD->clearDirectDependencies();
22480 if (RegionHasStackSave ||
22482 ControlDependentMembers.
push_back(OpSD);
22486 ScheduledBundles.find(
I)->getSecond().pop_back();
22488 if (!ControlDependentMembers.
empty()) {
22489 ScheduleBundle
Invalid = ScheduleBundle::invalid();
22490 calculateDependencies(
Invalid,
false, SLP,
22491 ControlDependentMembers);
22493 return std::nullopt;
BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
  if (ChunkPos >= ChunkSize) {
    ScheduleDataChunks.push_back(std::make_unique<ScheduleData[]>(ChunkSize));
    ChunkPos = 0;
  }
  return &(ScheduleDataChunks.back()[ChunkPos++]);
}
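// Grow the scheduling region so that it covers instruction V, giving up once
// the ScheduleRegionSizeLimit would be exceeded.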
22507bool BoUpSLP::BlockScheduling::extendSchedulingRegion(
22508 Value *V,
const InstructionsState &S) {
22510 assert(
I &&
"bundle member must be an instruction");
22511 if (getScheduleData(
I))
22513 if (!ScheduleStart) {
22515 initScheduleData(
I,
I->getNextNode(),
nullptr,
nullptr);
22517 ScheduleEnd =
I->getNextNode();
22518 assert(ScheduleEnd &&
"tried to vectorize a terminator?");
22519 LLVM_DEBUG(
dbgs() <<
"SLP: initialize schedule region to " << *
I <<
"\n");
22527 ++ScheduleStart->getIterator().getReverse();
22533 return II->isAssumeLikeIntrinsic();
22536 UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
22537 DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
22538 while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter !=
I &&
22540 if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
22541 LLVM_DEBUG(
dbgs() <<
"SLP: exceeded schedule region size limit\n");
22548 UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
22549 DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
22551 if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter ==
I)) {
22552 assert(
I->getParent() == ScheduleStart->getParent() &&
22553 "Instruction is in wrong basic block.");
22554 initScheduleData(
I, ScheduleStart,
nullptr, FirstLoadStoreInRegion);
22560 assert((UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter ==
I)) &&
22561 "Expected to reach top of the basic block or instruction down the "
22563 assert(
I->getParent() == ScheduleEnd->getParent() &&
22564 "Instruction is in wrong basic block.");
22565 initScheduleData(ScheduleEnd,
I->getNextNode(), LastLoadStoreInRegion,
22567 ScheduleEnd =
I->getNextNode();
22568 assert(ScheduleEnd &&
"tried to vectorize a terminator?");
22569 LLVM_DEBUG(
dbgs() <<
"SLP: extend schedule region end to " << *
I <<
"\n");
22573void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
22575 ScheduleData *PrevLoadStore,
22576 ScheduleData *NextLoadStore) {
22577 ScheduleData *CurrentLoadStore = PrevLoadStore;
22582 ScheduleData *SD = ScheduleDataMap.lookup(
I);
22584 SD = allocateScheduleDataChunks();
22585 ScheduleDataMap[
I] = SD;
22587 assert(!isInSchedulingRegion(*SD) &&
22588 "new ScheduleData already in scheduling region");
22589 SD->init(SchedulingRegionID,
I);
22596 return LI && LI->isSimple() &&
22597 LI->getMetadata(LLVMContext::MD_invariant_load);
22600 if (
I->mayReadOrWriteMemory() &&
22602 !CanIgnoreLoad(
I) &&
22606 Intrinsic::pseudoprobe))) {
22608 if (CurrentLoadStore) {
22609 CurrentLoadStore->setNextLoadStore(SD);
22611 FirstLoadStoreInRegion = SD;
22613 CurrentLoadStore = SD;
22618 RegionHasStackSave =
true;
22620 if (NextLoadStore) {
22621 if (CurrentLoadStore)
22622 CurrentLoadStore->setNextLoadStore(NextLoadStore);
22624 LastLoadStoreInRegion = CurrentLoadStore;
22628void BoUpSLP::BlockScheduling::calculateDependencies(
22629 ScheduleBundle &Bundle,
bool InsertInReadyList,
BoUpSLP *SLP,
22631 SmallVector<ScheduleEntity *> WorkList;
22632 auto ProcessNode = [&](ScheduleEntity *SE) {
22634 if (CD->hasValidDependencies())
22637 CD->initDependencies();
22638 CD->resetUnscheduledDeps();
22639 const EdgeInfo &EI = CD->getEdgeInfo();
22642 const auto *It =
find(
Op, CD->getInst());
22643 assert(It !=
Op.end() &&
"Lane not set");
22644 SmallPtrSet<Instruction *, 4> Visited;
22646 int Lane = std::distance(
Op.begin(), It);
22647 assert(Lane >= 0 &&
"Lane not set");
22649 !EI.UserTE->ReorderIndices.empty())
22650 Lane = EI.UserTE->ReorderIndices[Lane];
22651 assert(Lane <
static_cast<int>(EI.UserTE->Scalars.size()) &&
22652 "Couldn't find extract lane");
22654 if (EI.UserTE->isCopyableElement(In)) {
22657 if (ScheduleCopyableData *UseSD =
22658 getScheduleCopyableData(EI.UserTE->UserTreeIndex, In)) {
22659 CD->incDependencies();
22660 if (!UseSD->isScheduled())
22661 CD->incrementUnscheduledDeps(1);
22662 if (!UseSD->hasValidDependencies() ||
22663 (InsertInReadyList && UseSD->isReady()))
22666 }
else if (Visited.
insert(In).second) {
22667 if (ScheduleData *UseSD = getScheduleData(In)) {
22668 CD->incDependencies();
22669 if (!UseSD->isScheduled())
22670 CD->incrementUnscheduledDeps(1);
22671 if (!UseSD->hasValidDependencies() ||
22672 (InsertInReadyList && UseSD->isReady()))
22677 }
while (It !=
Op.end());
22678 if (CD->isReady() && CD->getDependencies() == 0 &&
22679 (EI.UserTE->hasState() &&
22680 (EI.UserTE->getMainOp()->getParent() !=
22681 CD->getInst()->getParent() ||
22683 (EI.UserTE->getMainOp()->hasNUsesOrMore(
UsesLimit) ||
22684 any_of(EI.UserTE->getMainOp()->users(), [&](User *U) {
22685 auto *IU = dyn_cast<Instruction>(U);
22688 return IU->getParent() == EI.UserTE->getMainOp()->getParent();
22694 CD->incDependencies();
22695 CD->incrementUnscheduledDeps(1);
22701 if (BundleMember->hasValidDependencies())
22703 LLVM_DEBUG(
dbgs() <<
"SLP: update deps of " << *BundleMember <<
"\n");
22704 BundleMember->initDependencies();
22705 BundleMember->resetUnscheduledDeps();
22707 SmallDenseMap<Value *, unsigned> UserToNumOps;
22708 for (User *U : BundleMember->getInst()->users()) {
22711 if (ScheduleData *UseSD = getScheduleData(U)) {
22715 if (areAllOperandsReplacedByCopyableData(
22718 BundleMember->incDependencies();
22719 if (!UseSD->isScheduled())
22720 BundleMember->incrementUnscheduledDeps(1);
22721 if (!UseSD->hasValidDependencies() ||
22722 (InsertInReadyList && UseSD->isReady()))
22726 for (ScheduleCopyableData *UseSD :
22727 getScheduleCopyableDataUsers(BundleMember->getInst())) {
22728 BundleMember->incDependencies();
22729 if (!UseSD->isScheduled())
22730 BundleMember->incrementUnscheduledDeps(1);
22731 if (!UseSD->hasValidDependencies() ||
22732 (InsertInReadyList && UseSD->isReady()))
22736 SmallPtrSet<const Instruction *, 4> Visited;
22739 if (!Visited.
insert(
I).second)
22741 auto *DepDest = getScheduleData(
I);
22742 assert(DepDest &&
"must be in schedule window");
22743 DepDest->addControlDependency(BundleMember);
22744 BundleMember->incDependencies();
22745 if (!DepDest->isScheduled())
22746 BundleMember->incrementUnscheduledDeps(1);
22747 if (!DepDest->hasValidDependencies() ||
22748 (InsertInReadyList && DepDest->isReady()))
22756 for (Instruction *
I = BundleMember->getInst()->getNextNode();
22757 I != ScheduleEnd;
I =
I->getNextNode()) {
22762 MakeControlDependent(
I);
22770 if (RegionHasStackSave) {
22775 match(BundleMember->getInst(),
22777 for (Instruction *
I = BundleMember->getInst()->getNextNode();
22778 I != ScheduleEnd;
I =
I->getNextNode()) {
22789 MakeControlDependent(
I);
22799 BundleMember->getInst()->mayReadOrWriteMemory()) {
22800 for (Instruction *
I = BundleMember->getInst()->getNextNode();
22801 I != ScheduleEnd;
I =
I->getNextNode()) {
22807 MakeControlDependent(
I);
22814 ScheduleData *NextLoadStore = BundleMember->getNextLoadStore();
22815 if (!NextLoadStore)
22819 "NextLoadStore list for non memory effecting bundle?");
22822 unsigned NumAliased = 0;
22823 unsigned DistToSrc = 1;
22824 bool IsNonSimpleSrc = !SrcLoc.
Ptr || !
isSimple(SrcInst);
22826 for (ScheduleData *DepDest = NextLoadStore; DepDest;
22827 DepDest = DepDest->getNextLoadStore()) {
22828 assert(isInSchedulingRegion(*DepDest) &&
"Expected to be in region");
22838 ((SrcMayWrite || DepDest->getInst()->mayWriteToMemory()) &&
22840 SLP->isAliased(SrcLoc, SrcInst, DepDest->getInst())))) {
22847 DepDest->addMemoryDependency(BundleMember);
22848 BundleMember->incDependencies();
22849 if (!DepDest->isScheduled())
22850 BundleMember->incrementUnscheduledDeps(1);
22851 if (!DepDest->hasValidDependencies() ||
22852 (InsertInReadyList && DepDest->isReady()))
22876 "expected at least one instruction to schedule");
22878 WorkList.
push_back(Bundle.getBundle().front());
22880 SmallPtrSet<ScheduleBundle *, 16> Visited;
22881 while (!WorkList.
empty()) {
22886 CopyableBundle.
push_back(&CD->getBundle());
22887 Bundles = CopyableBundle;
22889 Bundles = getScheduleBundles(SD->getInst());
22891 if (Bundles.
empty()) {
22892 if (!SD->hasValidDependencies())
22894 if (InsertInReadyList && SD->isReady()) {
22895 ReadyInsts.insert(SD);
22896 LLVM_DEBUG(
dbgs() <<
"SLP: gets ready on update: " << *SD <<
"\n");
22900 for (ScheduleBundle *Bundle : Bundles) {
22901 if (Bundle->hasValidDependencies() || !Visited.
insert(Bundle).second)
22903 assert(isInSchedulingRegion(*Bundle) &&
22904 "ScheduleData not in scheduling region");
22905 for_each(Bundle->getBundle(), ProcessNode);
22907 if (InsertInReadyList && SD->isReady()) {
22908 for (ScheduleBundle *Bundle : Bundles) {
22909 assert(isInSchedulingRegion(*Bundle) &&
22910 "ScheduleData not in scheduling region");
22911 if (!Bundle->isReady())
22913 ReadyInsts.insert(Bundle);
22921void BoUpSLP::BlockScheduling::resetSchedule() {
22923 "tried to reset schedule on block which has not been scheduled");
22924 for_each(ScheduleDataMap, [&](
auto &
P) {
22925 if (BB !=
P.first->getParent())
22927 ScheduleData *SD =
P.second;
22928 if (isInSchedulingRegion(*SD)) {
22929 SD->setScheduled(
false);
22930 SD->resetUnscheduledDeps();
22933 for_each(ScheduleCopyableDataMapByInst, [&](
auto &
P) {
22934 for_each(
P.second, [&](ScheduleCopyableData *SD) {
22935 if (isInSchedulingRegion(*SD)) {
22936 SD->setScheduled(false);
22937 SD->resetUnscheduledDeps();
22941 for_each(ScheduledBundles, [&](
auto &
P) {
22942 for_each(
P.second, [&](ScheduleBundle *Bundle) {
22943 if (isInSchedulingRegion(*Bundle))
22944 Bundle->setScheduled(false);
22948 for (
auto &
P : ScheduleCopyableDataMap) {
22949 if (isInSchedulingRegion(*
P.second)) {
22950 P.second->setScheduled(
false);
22951 P.second->resetUnscheduledDeps();
22954 ReadyInsts.clear();
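// List-schedule the block: bundles are prioritized in their original
// instruction order, and ready bundles are popped and their instructions moved
// up immediately before the last scheduled instruction.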
22957void BoUpSLP::scheduleBlock(
const BoUpSLP &R, BlockScheduling *BS) {
22958 if (!BS->ScheduleStart)
22961 LLVM_DEBUG(
dbgs() <<
"SLP: schedule block " << BS->BB->getName() <<
"\n");
22968 BS->resetSchedule();
  struct ScheduleDataCompare {
    bool operator()(const ScheduleEntity *SD1,
                    const ScheduleEntity *SD2) const {
      return SD2->getSchedulingPriority() < SD1->getSchedulingPriority();
    }
  };
  std::set<ScheduleEntity *, ScheduleDataCompare> ReadyInsts;
22986 for (
auto *
I = BS->ScheduleStart;
I != BS->ScheduleEnd;
22987 I =
I->getNextNode()) {
22989 if (!Bundles.
empty()) {
22990 for (ScheduleBundle *Bundle : Bundles) {
22991 Bundle->setSchedulingPriority(Idx++);
22992 if (!Bundle->hasValidDependencies())
22993 BS->calculateDependencies(*Bundle,
false,
this);
22996 for (ScheduleCopyableData *SD :
reverse(SDs)) {
22997 ScheduleBundle &Bundle = SD->getBundle();
22998 Bundle.setSchedulingPriority(Idx++);
22999 if (!Bundle.hasValidDependencies())
23000 BS->calculateDependencies(Bundle,
false,
this);
23005 BS->getScheduleCopyableDataUsers(
I);
23006 if (ScheduleData *SD = BS->getScheduleData(
I)) {
23009 SDTEs.
front()->doesNotNeedToSchedule() ||
23011 "scheduler and vectorizer bundle mismatch");
23012 SD->setSchedulingPriority(Idx++);
23013 if (!SD->hasValidDependencies() &&
23014 (!CopyableData.
empty() ||
23015 any_of(
R.ValueToGatherNodes.lookup(
I), [&](
const TreeEntry *TE) {
23016 assert(TE->isGather() &&
"expected gather node");
23017 return TE->hasState() && TE->hasCopyableElements() &&
23018 TE->isCopyableElement(I);
23024 ScheduleBundle Bundle;
23026 BS->calculateDependencies(Bundle,
false,
this);
23029 for (ScheduleCopyableData *SD :
reverse(CopyableData)) {
23030 ScheduleBundle &Bundle = SD->getBundle();
23031 Bundle.setSchedulingPriority(Idx++);
23032 if (!Bundle.hasValidDependencies())
23033 BS->calculateDependencies(Bundle,
false,
this);
23036 BS->initialFillReadyList(ReadyInsts);
23038 Instruction *LastScheduledInst = BS->ScheduleEnd;
23041 SmallPtrSet<Instruction *, 16> Scheduled;
23042 while (!ReadyInsts.empty()) {
23043 auto *Picked = *ReadyInsts.begin();
23044 ReadyInsts.erase(ReadyInsts.begin());
23049 for (
const ScheduleEntity *BundleMember : Bundle->getBundle()) {
23050 Instruction *PickedInst = BundleMember->getInst();
23052 bool IsCopyable = Bundle->getTreeEntry()->isCopyableElement(PickedInst);
23053 if ((IsCopyable && BS->getScheduleData(PickedInst)) ||
23054 (!IsCopyable && !Scheduled.
insert(PickedInst).second))
23056 if (PickedInst->
getNextNode() != LastScheduledInst)
23058 LastScheduledInst = PickedInst;
23060 EntryToLastInstruction.try_emplace(Bundle->getTreeEntry(),
23061 LastScheduledInst);
23065 if (PickedInst->
getNextNode() != LastScheduledInst)
23067 LastScheduledInst = PickedInst;
23069 auto Invalid = InstructionsState::invalid();
23074#ifdef EXPENSIVE_CHECKS
23078#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
23080 for (
auto *
I = BS->ScheduleStart;
I != BS->ScheduleEnd;
23081 I =
I->getNextNode()) {
23084 [](
const ScheduleBundle *Bundle) {
23085 return Bundle->isScheduled();
23087 "must be scheduled at this point");
23092 BS->ScheduleStart =
nullptr;
23100 return DL->getTypeSizeInBits(Store->getValueOperand()->getType());
23105 auto E = InstrElementSize.find(V);
23106 if (E != InstrElementSize.end())
23123 Value *FirstNonBool =
nullptr;
23124 while (!Worklist.
empty()) {
23129 auto *Ty =
I->getType();
23132 if (Ty != Builder.getInt1Ty() && !FirstNonBool)
23140 Width = std::max<unsigned>(Width, DL->getTypeSizeInBits(Ty));
23148 for (
Use &U :
I->operands()) {
23150 if (Visited.
insert(J).second &&
23156 FirstNonBool = U.get();
23167 if (V->getType() == Builder.getInt1Ty() && FirstNonBool)
23169 Width = DL->getTypeSizeInBits(V->getType());
23173 InstrElementSize[
I] = Width;
23178bool BoUpSLP::collectValuesToDemote(
23179 const TreeEntry &E,
bool IsProfitableToDemoteRoot,
unsigned &
BitWidth,
23182 bool &IsProfitableToDemote,
bool IsTruncRoot)
const {
23187 unsigned OrigBitWidth =
23188 DL->getTypeSizeInBits(E.Scalars.front()->getType()->getScalarType());
23202 if (isa<PoisonValue>(R))
23204 return !isKnownNonNegative(R, SimplifyQuery(*DL));
23206 auto IsPotentiallyTruncated = [&](
Value *V,
unsigned &
BitWidth) ->
bool {
23209 if (getTreeEntries(V).
size() > 1)
23215 if ((!IsSignedNode || IsSignedVal) && OrigBitWidth >
BitWidth) {
23221 unsigned BitWidth1 = OrigBitWidth - NumSignBits;
23225 APInt
Mask = DB->getDemandedBits(
I);
23226 unsigned BitWidth2 =
23227 std::max<unsigned>(1,
Mask.getBitWidth() -
Mask.countl_zero());
23228 while (!IsSignedNode && BitWidth2 < OrigBitWidth) {
23234 BitWidth1 = std::min(BitWidth1, BitWidth2);
23239 auto FinalAnalysis = [&, TTI = TTI]() {
23240 if (!IsProfitableToDemote)
23243 E.Scalars, std::bind(IsPotentiallyTruncated, _1, std::ref(
BitWidth)));
23245 if (Res &&
E.isGather()) {
23246 if (
E.hasState()) {
23247 if (
const TreeEntry *SameTE =
23248 getSameValuesTreeEntry(
E.getMainOp(),
E.Scalars);
23250 if (collectValuesToDemote(*SameTE, IsProfitableToDemoteRoot,
BitWidth,
23251 ToDemote, Visited, NodesToKeepBWs,
23252 MaxDepthLevel, IsProfitableToDemote,
23260 SmallPtrSet<Value *, 4> UniqueBases;
23261 for (
Value *V :
E.Scalars) {
23265 UniqueBases.
insert(EE->getVectorOperand());
23267 const unsigned VF =
E.Scalars.size();
23268 Type *OrigScalarTy =
E.Scalars.front()->getType();
23269 if (UniqueBases.
size() <= 2 ||
23282 if (
E.isGather() || !Visited.
insert(&
E).second ||
23284 return !isa<Constant>(V) && all_of(V->users(), [&](User *U) {
23285 return isa<InsertElementInst>(U) && !isVectorized(U);
23288 return FinalAnalysis();
23291 return !isa<Constant>(V) && !all_of(V->users(), [=](User *U) {
23292 return isVectorized(U) ||
23293 (E.Idx == 0 && UserIgnoreList &&
23294 UserIgnoreList->contains(U)) ||
23295 (!isa<CmpInst>(U) && U->getType()->isSized() &&
23296 !U->getType()->isScalableTy() &&
23297 DL->getTypeSizeInBits(U->getType()) <= BitWidth);
23298 }) && !IsPotentiallyTruncated(V,
BitWidth);
23303 bool &NeedToExit) {
23304 NeedToExit =
false;
23305 unsigned InitLevel = MaxDepthLevel;
23306 for (
const TreeEntry *
Op : Operands) {
23307 unsigned Level = InitLevel;
23308 if (!collectValuesToDemote(*
Op, IsProfitableToDemoteRoot,
BitWidth,
23309 ToDemote, Visited, NodesToKeepBWs, Level,
23310 IsProfitableToDemote, IsTruncRoot)) {
23311 if (!IsProfitableToDemote)
23314 if (!FinalAnalysis())
23318 MaxDepthLevel = std::max(MaxDepthLevel, Level);
23322 auto AttemptCheckBitwidth =
23323 [&](function_ref<bool(
unsigned,
unsigned)> Checker,
bool &NeedToExit) {
23325 NeedToExit =
false;
23326 unsigned BestFailBitwidth = 0;
23328 if (Checker(
BitWidth, OrigBitWidth))
23330 if (BestFailBitwidth == 0 && FinalAnalysis())
23334 if (BestFailBitwidth == 0) {
23345 auto TryProcessInstruction =
23347 function_ref<bool(
unsigned,
unsigned)> Checker = {}) {
23348 if (Operands.empty()) {
23351 for (
Value *V :
E.Scalars)
23352 (void)IsPotentiallyTruncated(V,
BitWidth);
23357 return !V->hasOneUse() && !IsPotentiallyTruncated(V, BitWidth);
23360 bool NeedToExit =
false;
23361 if (Checker && !AttemptCheckBitwidth(Checker, NeedToExit))
23365 if (!ProcessOperands(Operands, NeedToExit))
23374 return IsProfitableToDemote;
23377 if (
E.State == TreeEntry::SplitVectorize)
23378 return TryProcessInstruction(
23380 {VectorizableTree[
E.CombinedEntriesWithIndices.front().first].get(),
23381 VectorizableTree[
E.CombinedEntriesWithIndices.back().first].get()});
23383 if (
E.isAltShuffle()) {
23385 auto IsDangerousOpcode = [](
unsigned Opcode) {
23387 case Instruction::Shl:
23388 case Instruction::AShr:
23389 case Instruction::LShr:
23390 case Instruction::UDiv:
23391 case Instruction::SDiv:
23392 case Instruction::URem:
23393 case Instruction::SRem:
23400 if (IsDangerousOpcode(
E.getAltOpcode()))
23401 return FinalAnalysis();
23404 switch (
E.getOpcode()) {
23408 case Instruction::Trunc:
23409 if (IsProfitableToDemoteRoot)
23410 IsProfitableToDemote =
true;
23411 return TryProcessInstruction(
BitWidth);
23412 case Instruction::ZExt:
23413 case Instruction::SExt:
23414 if (
E.UserTreeIndex.UserTE &&
E.UserTreeIndex.UserTE->hasState() &&
23415 E.UserTreeIndex.UserTE->getOpcode() == Instruction::BitCast &&
23416 E.UserTreeIndex.UserTE->getMainOp()->getType()->isFPOrFPVectorTy())
23418 IsProfitableToDemote =
true;
23419 return TryProcessInstruction(
BitWidth);
23423 case Instruction::Add:
23424 case Instruction::Sub:
23425 case Instruction::Mul:
23426 case Instruction::And:
23427 case Instruction::Or:
23428 case Instruction::Xor: {
23429 return TryProcessInstruction(
23430 BitWidth, {getOperandEntry(&
E, 0), getOperandEntry(&
E, 1)});
23432 case Instruction::Freeze:
23433 return TryProcessInstruction(
BitWidth, getOperandEntry(&
E, 0));
23434 case Instruction::Shl: {
23437    auto ShlChecker = [&](unsigned BitWidth, unsigned) {
23439      if (isa<PoisonValue>(V))
23441      if (E.isCopyableElement(V))
23443      auto *I = cast<Instruction>(V);
23444      KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
23445      return AmtKnownBits.getMaxValue().ult(BitWidth);
23448    return TryProcessInstruction(
23449        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, ShlChecker);
23451  case Instruction::LShr: {
23455    auto LShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
23457      if (isa<PoisonValue>(V))
23459      APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
23460      if (E.isCopyableElement(V))
23461        return MaskedValueIsZero(V, ShiftedBits, SimplifyQuery(*DL));
23462      auto *I = cast<Instruction>(V);
23463      KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
23464      return AmtKnownBits.getMaxValue().ult(BitWidth) &&
23465             MaskedValueIsZero(I->getOperand(0), ShiftedBits,
23466                               SimplifyQuery(*DL));
23469    return TryProcessInstruction(
23470        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
23473 case Instruction::AShr: {
23477 auto AShrChecker = [&](
unsigned BitWidth,
unsigned OrigBitWidth) {
23479 if (isa<PoisonValue>(V))
23481 auto *I = cast<Instruction>(V);
23482 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
23483 unsigned ShiftedBits = OrigBitWidth - BitWidth;
23484 return AmtKnownBits.getMaxValue().ult(BitWidth) &&
23486 ComputeNumSignBits(I->getOperand(0), *DL, AC, nullptr, DT);
23489 return TryProcessInstruction(
23490 BitWidth, {getOperandEntry(&
E, 0), getOperandEntry(&
E, 1)},
23493 case Instruction::UDiv:
23494 case Instruction::URem: {
23496 auto Checker = [&](
unsigned BitWidth,
unsigned OrigBitWidth) {
23499 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
23500 if (E.hasCopyableElements() && E.isCopyableElement(V))
23501 return MaskedValueIsZero(V, Mask, SimplifyQuery(*DL));
23502 auto *I = cast<Instruction>(V);
23503 return MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)) &&
23504 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
23507 return TryProcessInstruction(
23508 BitWidth, {getOperandEntry(&
E, 0), getOperandEntry(&
E, 1)}, Checker);
23512 case Instruction::Select: {
23513    return TryProcessInstruction(
23514        BitWidth, {getOperandEntry(&E, 1), getOperandEntry(&E, 2)});
23518  case Instruction::PHI: {
23519    const unsigned NumOps = E.getNumOperands();
23522        [&](unsigned Idx) { return getOperandEntry(&E, Idx); });
23527 case Instruction::Call: {
23532 if (
ID != Intrinsic::abs &&
ID != Intrinsic::smin &&
23533 ID != Intrinsic::smax &&
ID != Intrinsic::umin &&
ID != Intrinsic::umax)
23536 function_ref<bool(
unsigned,
unsigned)> CallChecker;
23537 auto CompChecker = [&](
unsigned BitWidth,
unsigned OrigBitWidth) {
23540 auto *I = cast<Instruction>(V);
23541 if (ID == Intrinsic::umin || ID == Intrinsic::umax) {
23542 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
23543 return MaskedValueIsZero(I->getOperand(0), Mask,
23544 SimplifyQuery(*DL)) &&
23545 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
23547 assert((
ID == Intrinsic::smin ||
ID == Intrinsic::smax) &&
23548 "Expected min/max intrinsics only.");
23549 unsigned SignBits = OrigBitWidth -
BitWidth;
23551 unsigned Op0SignBits =
23553 unsigned Op1SignBits =
23555 return SignBits <= Op0SignBits &&
23556 ((SignBits != Op0SignBits &&
23559 SimplifyQuery(*DL))) &&
23560 SignBits <= Op1SignBits &&
23561 ((SignBits != Op1SignBits &&
23566 auto AbsChecker = [&](
unsigned BitWidth,
unsigned OrigBitWidth) {
23569 auto *I = cast<Instruction>(V);
23570 unsigned SignBits = OrigBitWidth - BitWidth;
23571 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
23572 unsigned Op0SignBits =
23573 ComputeNumSignBits(I->getOperand(0), *DL, AC, nullptr, DT);
23574 return SignBits <= Op0SignBits &&
23575 ((SignBits != Op0SignBits &&
23576 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
23577 MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)));
23580 if (
ID != Intrinsic::abs) {
23581 Operands.push_back(getOperandEntry(&
E, 1));
23582 CallChecker = CompChecker;
23584 CallChecker = AbsChecker;
23587 std::numeric_limits<InstructionCost::CostType>::max();
23589 unsigned VF =
E.Scalars.size();
23591 auto Checker = [&](
unsigned BitWidth, unsigned) {
23599 if (
Cost < BestCost) {
23605 [[maybe_unused]]
bool NeedToExit;
23606 (void)AttemptCheckBitwidth(Checker, NeedToExit);
23608 return TryProcessInstruction(
BitWidth, Operands, CallChecker);
23616 return FinalAnalysis();
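// computeMinimumValueSizes (below) drives the bit-width minimization: starting
// from store/insertelement roots (and reduction roots via UserIgnoreList) it
// calls ComputeMaxBitWidth per node, records the chosen width and signedness
// in MinBWs, and rounds ReductionBitWidth up to a power of two. Roughly, a
// chain that zero-extends i8 values, operates on them in i32, and only feeds
// a narrow store can then be vectorized on the narrow element type, with at
// most one extend left outside the vectorized code.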
23623 bool IsStoreOrInsertElt =
23624 VectorizableTree.front()->hasState() &&
23625 (VectorizableTree.front()->
getOpcode() == Instruction::Store ||
23626 VectorizableTree.front()->getOpcode() == Instruction::InsertElement);
23627 if ((IsStoreOrInsertElt || UserIgnoreList) &&
23628 ExtraBitWidthNodes.size() <= 1 &&
23629 (!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 ||
23630 CastMaxMinBWSizes->first / CastMaxMinBWSizes->second <= 2))
23633 unsigned NodeIdx = 0;
23634 if (IsStoreOrInsertElt && !VectorizableTree.front()->isGather())
23638 assert((VectorizableTree[NodeIdx]->
isGather() || NodeIdx != 0 ||
23639 !VectorizableTree[NodeIdx]->UserTreeIndex) &&
23640 "Unexpected tree is graph.");
23644 bool IsTruncRoot =
false;
23645 bool IsProfitableToDemoteRoot = !IsStoreOrInsertElt;
23648 if (NodeIdx != 0 &&
23649 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
23650 VectorizableTree[NodeIdx]->
getOpcode() == Instruction::Trunc) {
23651 assert(IsStoreOrInsertElt &&
"Expected store/insertelement seeded graph.");
23652 IsTruncRoot =
true;
23654 IsProfitableToDemoteRoot =
true;
23659 if (AnalyzedMinBWVals.contains(VectorizableTree[NodeIdx]->Scalars.front()))
23663 auto ComputeMaxBitWidth =
23664      [&](const TreeEntry &E, bool IsTopRoot, bool IsProfitableToDemoteRoot,
23665          unsigned Limit, bool IsTruncRoot, bool IsSignedCmp) -> unsigned {
23669 if (E.isGather() && IsTruncRoot && E.UserTreeIndex &&
23670 !NodesToKeepBWs.
contains(E.Idx) &&
23671 E.Idx > (IsStoreOrInsertElt ? 2u : 1u) &&
23673 return V->hasOneUse() || isa<Constant>(V) ||
23674 (!V->hasNUsesOrMore(UsesLimit) &&
23675 none_of(V->users(), [&](User *U) {
23676 ArrayRef<TreeEntry *> TEs = getTreeEntries(U);
23677 const TreeEntry *UserTE = E.UserTreeIndex.UserTE;
23678 if (TEs.empty() || is_contained(TEs, UserTE))
23680 if (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
23682 isa<SIToFPInst, UIToFPInst>(U) ||
23683 (UserTE->hasState() &&
23684 (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
23685 SelectInst>(UserTE->getMainOp()) ||
23686 isa<SIToFPInst, UIToFPInst>(UserTE->getMainOp()))))
23688 unsigned UserTESz = DL->getTypeSizeInBits(
23689 UserTE->Scalars.front()->getType());
23690 if (all_of(TEs, [&](const TreeEntry *TE) {
23691 auto It = MinBWs.find(TE);
23692 return It != MinBWs.end() &&
23693 It->second.first > UserTESz;
23696 return DL->getTypeSizeInBits(U->getType()) > UserTESz;
23700 const TreeEntry *UserTE = E.UserTreeIndex.UserTE;
23701 auto It = MinBWs.find(UserTE);
23702 if (It != MinBWs.end())
23703 return It->second.first;
23704 unsigned MaxBitWidth =
23705 DL->getTypeSizeInBits(UserTE->Scalars.front()->getType());
23706 MaxBitWidth =
bit_ceil(MaxBitWidth);
23707 if (MaxBitWidth < 8 && MaxBitWidth > 1)
23709 return MaxBitWidth;
23715 unsigned VF = E.getVectorFactor();
23716 Type *ScalarTy = E.Scalars.front()->getType();
23723 [&](
Value *V) { return AnalyzedMinBWVals.contains(V); }))
23732 unsigned MaxBitWidth = 1u;
23740 bool IsKnownPositive = !IsSignedCmp &&
all_of(E.Scalars, [&](
Value *R) {
23741 if (isa<PoisonValue>(R))
23743 KnownBits Known = computeKnownBits(R, *DL);
23744 return Known.isNonNegative();
23747 if (!IsKnownPositive && !IsTopRoot && E.UserTreeIndex &&
23748 E.UserTreeIndex.UserTE->hasState() &&
23749 E.UserTreeIndex.UserTE->getOpcode() == Instruction::UIToFP)
23751 std::min(DL->getTypeSizeInBits(
23752 E.UserTreeIndex.UserTE->Scalars.front()->getType()),
23753 DL->getTypeSizeInBits(ScalarTy));
23757 for (
Value *Root : E.Scalars) {
23763 unsigned BitWidth1 = NumTypeBits - NumSignBits;
23779 if (!IsKnownPositive)
23784 MaxBitWidth = std::max(BitWidth1, MaxBitWidth);
23787 APInt Mask = DB->getDemandedBits(
I);
23788 unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
23790 std::max<unsigned>(std::min(BitWidth1, BitWidth2), MaxBitWidth);
23793 if (MaxBitWidth < 8 && MaxBitWidth > 1)
23798 if (NumParts > 1 &&
23806 unsigned Opcode = E.getOpcode();
23807 bool IsProfitableToDemote = Opcode == Instruction::Trunc ||
23808 Opcode == Instruction::SExt ||
23809 Opcode == Instruction::ZExt || NumParts > 1;
23814 unsigned MaxDepthLevel = IsTruncRoot ? Limit : 1;
23815 bool NeedToDemote = IsProfitableToDemote;
23817 if (!collectValuesToDemote(E, IsProfitableToDemoteRoot, MaxBitWidth,
23818 ToDemote, Visited, NodesToKeepBWs, MaxDepthLevel,
23819 NeedToDemote, IsTruncRoot) ||
23820 (MaxDepthLevel <= Limit &&
23821 !(((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
23822 (!IsTopRoot || !(IsStoreOrInsertElt || UserIgnoreList) ||
23823 DL->getTypeSizeInBits(TreeRootIT) /
23824 DL->getTypeSizeInBits(
23825 E.getMainOp()->getOperand(0)->getType()) >
23829 MaxBitWidth =
bit_ceil(MaxBitWidth);
23831 return MaxBitWidth;
23838 if (UserIgnoreList &&
23842 if (
all_of(*UserIgnoreList,
23847 VectorizableTree.front()->State == TreeEntry::Vectorize &&
23848 VectorizableTree.front()->getOpcode() == Instruction::ZExt &&
23849 cast<CastInst>(VectorizableTree.front()->getMainOp())->getSrcTy() ==
23850 Builder.getInt1Ty()) {
23851 ReductionBitWidth = 1;
23853 for (
Value *V : *UserIgnoreList) {
23857 TypeSize NumTypeBits = DL->getTypeSizeInBits(
V->getType());
23858 unsigned BitWidth1 = NumTypeBits - NumSignBits;
23861 unsigned BitWidth2 = BitWidth1;
23864 BitWidth2 =
Mask.getBitWidth() -
Mask.countl_zero();
23866 ReductionBitWidth =
23867 std::max(std::min(BitWidth1, BitWidth2), ReductionBitWidth);
23869 if (ReductionBitWidth < 8 && ReductionBitWidth > 1)
23870 ReductionBitWidth = 8;
23872 ReductionBitWidth =
bit_ceil(ReductionBitWidth);
23875 bool IsTopRoot = NodeIdx == 0;
23876 while (NodeIdx < VectorizableTree.size() &&
23877 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
23878 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
23879 RootDemotes.push_back(NodeIdx);
23881 IsTruncRoot =
true;
23883 bool IsSignedCmp =
false;
23884 if (UserIgnoreList &&
23888 IsSignedCmp =
true;
23889 while (NodeIdx < VectorizableTree.size()) {
23891 unsigned Limit = 2;
23893 ReductionBitWidth ==
23894 DL->getTypeSizeInBits(
23895 VectorizableTree.front()->Scalars.front()->getType()))
23897 unsigned MaxBitWidth = ComputeMaxBitWidth(
23898 *VectorizableTree[NodeIdx], IsTopRoot, IsProfitableToDemoteRoot, Limit,
23899 IsTruncRoot, IsSignedCmp);
23900 if (ReductionBitWidth != 0 && (IsTopRoot || !RootDemotes.empty())) {
23901 if (MaxBitWidth != 0 && ReductionBitWidth < MaxBitWidth)
23902 ReductionBitWidth =
bit_ceil(MaxBitWidth);
23903 else if (MaxBitWidth == 0)
23904 ReductionBitWidth = 0;
23907 for (
unsigned Idx : RootDemotes) {
23908 if (
all_of(VectorizableTree[Idx]->Scalars, [&](
Value *V) {
23909 uint32_t OrigBitWidth =
23910 DL->getTypeSizeInBits(
V->getType()->getScalarType());
23911 if (OrigBitWidth > MaxBitWidth) {
23919 RootDemotes.clear();
23921 IsProfitableToDemoteRoot =
true;
23923 if (ExtraBitWidthNodes.empty()) {
23924 NodeIdx = VectorizableTree.size();
23926 unsigned NewIdx = 0;
23928 NewIdx = *ExtraBitWidthNodes.begin();
23929 ExtraBitWidthNodes.erase(ExtraBitWidthNodes.begin());
23930 }
while (NewIdx <= NodeIdx && !ExtraBitWidthNodes.empty());
23933 NodeIdx < VectorizableTree.size() &&
23934 VectorizableTree[NodeIdx]->UserTreeIndex &&
23935 VectorizableTree[NodeIdx]->UserTreeIndex.EdgeIdx == 0 &&
23936 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->hasState() &&
23937 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->getOpcode() ==
23938 Instruction::Trunc &&
23939 !VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->isAltShuffle();
23941 NodeIdx < VectorizableTree.size() &&
23942 VectorizableTree[NodeIdx]->UserTreeIndex &&
23943 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->hasState() &&
23944 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->getOpcode() ==
23945 Instruction::ICmp &&
23947 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->Scalars,
23949 auto *IC = dyn_cast<ICmpInst>(V);
23950 return IC && (IC->isSigned() ||
23951 !isKnownNonNegative(IC->getOperand(0),
23952 SimplifyQuery(*DL)) ||
23953 !isKnownNonNegative(IC->getOperand(1),
23954 SimplifyQuery(*DL)));
23960 if (MaxBitWidth == 0 ||
23964 if (UserIgnoreList)
23965 AnalyzedMinBWVals.insert_range(TreeRoot);
23972 for (
unsigned Idx : ToDemote) {
23973 TreeEntry *
TE = VectorizableTree[Idx].get();
23974 if (MinBWs.contains(TE))
23977 if (isa<PoisonValue>(R))
23979 return !isKnownNonNegative(R, SimplifyQuery(*DL));
23981 MinBWs.try_emplace(TE, MaxBitWidth, IsSigned);
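// SLPVectorizerPass::runImpl (below): bail out early if the target reports no
// vector registers or the function is marked NoImplicitFloat, then build a
// BoUpSLP instance and visit the blocks in post order, collecting seed stores
// and GEPs per block and trying store chains, in-block chains, and GEP index
// lists before cleaning up with optimizeGatherSequence().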
24022  DL = &F.getDataLayout();
24030  if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true))) {
24032        dbgs() << "SLP: Didn't find any vector registers for target, abort.\n");
24037  if (F.hasFnAttribute(Attribute::NoImplicitFloat))
24040  LLVM_DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
24044  BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_);
24050  DT->updateDFSNumbers();
24053  for (auto *BB : post_order(&F.getEntryBlock())) {
24058    R.clearReductionData();
24059    collectSeedInstructions(BB);
24062    if (!Stores.empty()) {
24064               << " underlying objects.\n");
24065      Changed |= vectorizeStoreChains(R);
24069    Changed |= vectorizeChainsInBlock(BB, R);
24074    if (!GEPs.empty()) {
24076               << " underlying objects.\n");
24077      Changed |= vectorizeGEPIndices(BB, R);
24058 R.clearReductionData();
24059 collectSeedInstructions(BB);
24062 if (!Stores.empty()) {
24064 <<
" underlying objects.\n");
24065 Changed |= vectorizeStoreChains(R);
24069 Changed |= vectorizeChainsInBlock(BB, R);
24074 if (!GEPs.empty()) {
24076 <<
" underlying objects.\n");
24077 Changed |= vectorizeGEPIndices(BB, R);
24082 R.optimizeGatherSequence();
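// vectorizeStoreChain (below) takes one chain of consecutive stores, checks
// that the chain is worth analyzing (element size, VF >= MinVF, stored values
// not better served by load combining), builds the SLP tree for it, reorders
// and transforms the tree, and emits a "StoresVectorized" remark with the cost
// and tree size when vectorization is applied.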
24090                                                 unsigned Idx, unsigned MinVF,
24095  const unsigned Sz = R.getVectorElementSize(Chain[0]);
24096  unsigned VF = Chain.size();
24102      VF < 2 || VF < MinVF) {
24110  LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx
24114  for (Value *V : Chain)
24117  InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
24118  InstructionsState S = Analysis.buildInstructionsState(
24122 bool IsAllowedSize =
24126 if ((!IsAllowedSize && S && S.getOpcode() != Instruction::Load &&
24127 (!S.getMainOp()->isSafeToRemove() ||
24130 return !isa<ExtractElementInst>(V) &&
24131 (V->getNumUses() > Chain.size() ||
24132 any_of(V->users(), [&](User *U) {
24133 return !Stores.contains(U);
24136 (ValOps.
size() > Chain.size() / 2 && !S)) {
24137 Size = (!IsAllowedSize && S) ? 1 : 2;
24141 if (
R.isLoadCombineCandidate(Chain))
24143 R.buildTree(Chain);
24145 if (
R.isTreeTinyAndNotFullyVectorizable()) {
24146 if (
R.isGathered(Chain.front()) ||
24148 return std::nullopt;
24149 Size =
R.getCanonicalGraphSize();
24152 if (
R.isProfitableToReorder()) {
24153 R.reorderTopToBottom();
24154 R.reorderBottomToTop();
24156 R.transformNodes();
24157 R.computeMinimumValueSizes();
24160 R.buildExternalUses();
24162 Size =
R.getCanonicalGraphSize();
24163 if (S && S.getOpcode() == Instruction::Load)
24171 using namespace ore;
24173  R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",
24175                   << "Stores SLP vectorized with cost " << NV("Cost", Cost)
24176                   << " and with tree size "
24177                   << NV("TreeSize", R.getTreeSize()));
24190 Sizes.begin(), Sizes.end(),
static_cast<uint64_t>(0),
24191 [&](
uint64_t V,
const std::pair<unsigned, unsigned> &Val) {
24192 unsigned Size = Val.first;
24204 Sizes.begin(), Sizes.end(),
static_cast<uint64_t>(0),
24205 [&](
uint64_t V,
const std::pair<unsigned, unsigned> &Val) {
24206 unsigned P = Val.first;
24209 return V + (P - Mean) * (P - Mean);
24212 return Dev * 96 / (Mean * Mean) == 0;
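// RelatedStoreInsts groups stores that are a known constant pointer distance
// from a chosen base store. Instrs maps distance -> index into AllStores, so a
// sorted walk over the map yields the stores in address order; rebase() keeps
// the map usable after some stores were vectorized or the base store changed.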
24220class RelatedStoreInsts {
24223 : AllStores(AllStores) {
24224 reset(BaseInstrIdx);
24227  void reset(unsigned NewBaseInstr) {
24228    assert(NewBaseInstr < AllStores.size() &&
24229           "Instruction index out of bounds");
24230    BaseInstrIdx = NewBaseInstr;
24232    insertOrLookup(NewBaseInstr, 0);
24239  std::optional<unsigned> insertOrLookup(unsigned InstrIdx, int64_t PtrDist) {
24240    auto [It, Inserted] = Instrs.emplace(PtrDist, InstrIdx);
24241    return Inserted ? std::nullopt : std::make_optional(It->second);
24244  using DistToInstMap = std::map<int64_t, unsigned>;
24245  const DistToInstMap &getStores() const { return Instrs; }
24249  std::optional<int64_t> getPointerDiff(StoreInst &SI, const DataLayout &DL,
24250                                        ScalarEvolution &SE) const {
24251    StoreInst &BaseStore = *AllStores[BaseInstrIdx];
24254                           SI.getValueOperand()->getType(), SI.getPointerOperand(), DL, SE,
24260  void rebase(unsigned MinSafeIdx, unsigned NewBaseInstIdx,
24261              int64_t DistFromCurBase) {
24262    DistToInstMap PrevSet = std::move(Instrs);
24263    reset(NewBaseInstIdx);
24268    for (auto [Dist, InstIdx] : PrevSet) {
24269      if (InstIdx >= MinSafeIdx)
24270        insertOrLookup(InstIdx, Dist - DistFromCurBase);
24276    DistToInstMap::reverse_iterator LastVectorizedStore = find_if(
24277        reverse(Instrs), [&](const std::pair<int64_t, unsigned> &DistAndIdx) {
24278          return VectorizedStores.contains(AllStores[DistAndIdx.second]);
24283 DistToInstMap::iterator VectorizedStoresEnd = LastVectorizedStore.base();
24284 Instrs.erase(Instrs.begin(), VectorizedStoresEnd);
24289 unsigned BaseInstrIdx;
24292 DistToInstMap Instrs;
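// vectorizeStores (below) buckets the collected stores into RelatedStoreInsts
// sets by pointer distance and then, per set, tries decreasing vectorization
// factors over sliding windows of the distance-sorted stores (TryToVectorize),
// tracking per-store results in RangeSizes so unprofitable slices are not
// retried with the same VF.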
24300bool SLPVectorizerPass::vectorizeStores(
24302 DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>>
24309 auto TryToVectorize = [&](
const RelatedStoreInsts::DistToInstMap &StoreSeq) {
24310 int64_t PrevDist = -1;
24314 auto &[Dist, InstIdx] =
Data;
24315 if (Operands.
empty() || Dist - PrevDist == 1) {
24318 if (Idx != StoreSeq.size() - 1)
24327 if (Operands.
size() <= 1 ||
24329 .
insert({Operands.front(),
24330 cast<StoreInst>(Operands.front())->getValueOperand(),
24332 cast<StoreInst>(Operands.back())->getValueOperand(),
24337 unsigned MaxVecRegSize =
R.getMaxVecRegSize();
24338 unsigned EltSize =
R.getVectorElementSize(Operands[0]);
24342 std::min(
R.getMaximumVF(EltSize, Instruction::Store), MaxElts);
24344 Type *StoreTy =
Store->getValueOperand()->getType();
24345 Type *ValueTy = StoreTy;
24347 ValueTy = Trunc->getSrcTy();
24356 R.getMinVF(DL->getTypeStoreSizeInBits(StoreScalarTy)), StoreScalarTy,
24359 MinVF = std::max<unsigned>(2, MinVF);
24361 if (MaxVF < MinVF) {
24362 LLVM_DEBUG(
dbgs() <<
"SLP: Vectorization infeasible as MaxVF (" << MaxVF
24364 <<
"MinVF (" << MinVF <<
")\n");
24368 unsigned NonPowerOf2VF = 0;
24373 unsigned CandVF = std::clamp<unsigned>(Operands.
size(), MinVF, MaxVF);
24375 NonPowerOf2VF = CandVF;
24376 assert(NonPowerOf2VF != MaxVF &&
24377 "Non-power-of-2 VF should not be equal to MaxVF");
24384 unsigned MaxRegVF = MaxVF;
24386 MaxVF = std::min<unsigned>(MaxVF,
bit_floor(Operands.
size()));
24387 if (MaxVF < MinVF) {
24388 LLVM_DEBUG(
dbgs() <<
"SLP: Vectorization infeasible as MaxVF (" << MaxVF
24390 <<
"MinVF (" << MinVF <<
")\n");
24394 SmallVector<unsigned> CandidateVFs;
24395 for (
unsigned VF = std::max(MaxVF, NonPowerOf2VF); VF >= MinVF;
24399 unsigned End = Operands.
size();
24400 unsigned Repeat = 0;
24401 constexpr unsigned MaxAttempts = 4;
24407 Operands.
size(), {1, 1});
24410 DenseMap<Value *, std::pair<unsigned, unsigned>> NonSchedulable;
24411 auto IsNotVectorized = [](
const std::pair<unsigned, unsigned> &
P) {
24412 return P.first > 0;
24414 auto IsVectorized = [](
const std::pair<unsigned, unsigned> &
P) {
24415 return P.first == 0;
24417 auto VFIsProfitable = [](
unsigned Size,
24418 const std::pair<unsigned, unsigned> &
P) {
24419 return Size >=
P.first;
24421 auto FirstSizeSame = [](
unsigned Size,
24422 const std::pair<unsigned, unsigned> &
P) {
24423 return Size ==
P.first;
24427 bool RepeatChanged =
false;
24428 bool AnyProfitableGraph =
false;
24429 for (
unsigned VF : CandidateVFs) {
24430 AnyProfitableGraph =
false;
24431 unsigned FirstUnvecStore = std::distance(
24432 RangeSizes.begin(),
find_if(RangeSizes, IsNotVectorized));
24436 while (FirstUnvecStore < End) {
24437 unsigned FirstVecStore = std::distance(
24438 RangeSizes.begin(),
24439 find_if(RangeSizes.drop_front(FirstUnvecStore), IsVectorized));
24440 unsigned MaxSliceEnd = FirstVecStore >= End ? End : FirstVecStore;
24441 for (
unsigned SliceStartIdx = FirstUnvecStore;
24442 SliceStartIdx + VF <= MaxSliceEnd;) {
24452 ->getValueOperand()
24455 ->getValueOperand()
24458 "Expected all operands of same type.");
24459 if (!NonSchedulable.
empty()) {
24460 auto [NonSchedSizeMax, NonSchedSizeMin] =
24462 if (NonSchedSizeMax > 0 && NonSchedSizeMin <= VF) {
24465 SliceStartIdx += NonSchedSizeMax;
24470 std::optional<bool> Res =
24471 vectorizeStoreChain(Slice, R, SliceStartIdx, MinVF, TreeSize);
24477 .first->getSecond()
24485 AnyProfitableGraph = RepeatChanged =
Changed =
true;
24488 for (std::pair<unsigned, unsigned> &
P :
24489 RangeSizes.slice(SliceStartIdx, VF))
24490 P.first =
P.second = 0;
24491 if (SliceStartIdx < FirstUnvecStore + MinVF) {
24492 for (std::pair<unsigned, unsigned> &
P : RangeSizes.slice(
24493 FirstUnvecStore, SliceStartIdx - FirstUnvecStore))
24494 P.first =
P.second = 0;
24495 FirstUnvecStore = SliceStartIdx + VF;
24497 if (SliceStartIdx > MaxSliceEnd - VF - MinVF) {
24498 for (std::pair<unsigned, unsigned> &
P :
24499 RangeSizes.slice(SliceStartIdx + VF,
24500 MaxSliceEnd - (SliceStartIdx + VF)))
24501 P.first =
P.second = 0;
24502 if (MaxSliceEnd == End)
24503 End = SliceStartIdx;
24504 MaxSliceEnd = SliceStartIdx;
24506 SliceStartIdx += VF;
24509 if (VF > 2 && Res &&
24510 !
all_of(RangeSizes.slice(SliceStartIdx, VF),
24511 std::bind(VFIsProfitable, TreeSize, _1))) {
24512 SliceStartIdx += VF;
24517 if (VF > MaxRegVF && TreeSize > 1 &&
24518 all_of(RangeSizes.slice(SliceStartIdx, VF),
24519 std::bind(FirstSizeSame, TreeSize, _1))) {
24520 SliceStartIdx += VF;
24521 while (SliceStartIdx != MaxSliceEnd &&
24522 RangeSizes[SliceStartIdx].first == TreeSize)
24527 for (std::pair<unsigned, unsigned> &
P :
24528 RangeSizes.slice(SliceStartIdx, VF))
24529 P.second = std::max(
P.second, TreeSize);
24531 AnyProfitableGraph =
true;
24533 if (FirstUnvecStore >= End)
24535 if (MaxSliceEnd - FirstUnvecStore < VF &&
24536 MaxSliceEnd - FirstUnvecStore >= MinVF)
24537 AnyProfitableGraph =
true;
24538 FirstUnvecStore = std::distance(
24539 RangeSizes.begin(),
24540 find_if(RangeSizes.drop_front(MaxSliceEnd), IsNotVectorized));
24542 if (!AnyProfitableGraph && VF >= MaxRegVF &&
has_single_bit(VF))
24545 if (VF == MaxRegVF)
24546 for (std::pair<unsigned, unsigned> &
P : RangeSizes)
24548 P.first = std::max(
P.second,
P.first);
24551 if (
all_of(RangeSizes, IsVectorized))
24554 if (Repeat >= MaxAttempts ||
24555 (Repeat > 1 && (RepeatChanged || !AnyProfitableGraph)))
24557 constexpr unsigned StoresLimit = 64;
24558 const unsigned MaxTotalNum = std::min<unsigned>(
24560 static_cast<unsigned>(
24562 std::distance(RangeSizes.begin(),
24563 find_if(RangeSizes, IsNotVectorized)) +
24565 unsigned VF =
bit_ceil(CandidateVFs.front()) * 2;
24568 CandidateVFs.clear();
24570 CandidateVFs.push_back(Limit);
24571 if (VF > MaxTotalNum || VF >= StoresLimit)
24573 for (std::pair<unsigned, unsigned> &
P : RangeSizes) {
24575 P.first = std::max(
P.second,
P.first);
24579 CandidateVFs.push_back(VF);
24619 auto FillStoresSet = [&](
unsigned Idx, StoreInst *
SI) {
24620 std::optional<int64_t> PtrDist;
24621 auto *RelatedStores =
find_if(
24622 SortedStores, [&PtrDist, SI,
this](
const RelatedStoreInsts &StoreSeq) {
24623 PtrDist = StoreSeq.getPointerDiff(*SI, *DL, *SE);
24624 return PtrDist.has_value();
24628 if (RelatedStores == SortedStores.
end()) {
24636 if (std::optional<unsigned> PrevInst =
24637 RelatedStores->insertOrLookup(Idx, *PtrDist)) {
24638 TryToVectorize(RelatedStores->getStores());
24639 RelatedStores->clearVectorizedStores(VectorizedStores);
24640 RelatedStores->rebase(*PrevInst + 1,
24645 Type *PrevValTy =
nullptr;
24647 if (
R.isDeleted(SI))
24650 PrevValTy =
SI->getValueOperand()->getType();
24652 if (PrevValTy !=
SI->getValueOperand()->getType()) {
24653 for (RelatedStoreInsts &StoreSeq : SortedStores)
24654 TryToVectorize(StoreSeq.getStores());
24655 SortedStores.clear();
24656 PrevValTy =
SI->getValueOperand()->getType();
24658 FillStoresSet(
I, SI);
24662 for (RelatedStoreInsts &StoreSeq : SortedStores)
24663 TryToVectorize(StoreSeq.getStores());
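// collectSeedInstructions (below) scans a basic block for seeds: simple stores
// of vectorizable-typed values go into Stores (keyed by underlying object),
// and single-index GEPs with a non-constant index and non-vector result type
// go into GEPs for later index vectorization.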
24668void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
24676  for (Instruction &I : *BB) {
24680      if (!SI->isSimple())
24691      if (GEP->getNumIndices() != 1)
24693      Value *Idx = GEP->idx_begin()->get();
24698      if (GEP->getType()->isVectorTy())
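// tryToVectorizeList (below) handles a flat list of scalars: it rejects
// unsupported element types with an "UnsupportedType" remark, derives
// MinVF/MaxVF from the element size, then tries sub-lists of decreasing width,
// ending in either a "VectorizedList" remark or a "NotBeneficial"/"NotPossible"
// remark depending on the outcome.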
24710 LLVM_DEBUG(
dbgs() <<
"SLP: Trying to vectorize a list of length = "
24711 << VL.
size() <<
".\n");
24722 for (
Value *V : VL) {
24723 Type *Ty =
V->getType();
24727 R.getORE()->emit([&]() {
24728 std::string TypeStr;
24729 llvm::raw_string_ostream OS(TypeStr);
24731 return OptimizationRemarkMissed(
SV_NAME,
"UnsupportedType", I0)
24732 <<
"Cannot SLP vectorize list: type "
24733 << TypeStr +
" is unsupported by vectorizer";
24740 unsigned Sz =
R.getVectorElementSize(I0);
24741 unsigned MinVF =
R.getMinVF(Sz);
24742 unsigned MaxVF = std::max<unsigned>(
24744 MaxVF = std::min(
R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
24746 R.getORE()->emit([&]() {
24747 return OptimizationRemarkMissed(
SV_NAME,
"SmallVF", I0)
24748 <<
"Cannot SLP vectorize list: vectorization factor "
24749 <<
"less than 2 is not supported";
24755 bool CandidateFound =
false;
24758 unsigned NextInst = 0, MaxInst = VL.size();
24759 for (
unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF;
24765 if (TTI->getNumberOfParts(VecTy) == VF)
24767 for (
unsigned I = NextInst;
I < MaxInst; ++
I) {
24768 unsigned ActualVF = std::min(MaxInst -
I, VF);
24773 if (MaxVFOnly && ActualVF < MaxVF)
24775 if ((VF > MinVF && ActualVF < VF) || (VF == MinVF && ActualVF < 2))
24780 for (
Value *V : VL.drop_front(
I)) {
24784 !Inst || !
R.isDeleted(Inst)) {
24787 if (Idx == ActualVF)
24792 if (Idx != ActualVF)
24795 LLVM_DEBUG(
dbgs() <<
"SLP: Analyzing " << ActualVF <<
" operations "
24799 if (
R.isTreeTinyAndNotFullyVectorizable())
24801 if (
R.isProfitableToReorder()) {
24802 R.reorderTopToBottom();
24805 R.transformNodes();
24806 R.computeMinimumValueSizes();
24808 R.buildExternalUses();
24811 CandidateFound =
true;
24812 MinCost = std::min(MinCost,
Cost);
24815 <<
" for VF=" << ActualVF <<
"\n");
24818 R.getORE()->emit(OptimizationRemark(
SV_NAME,
"VectorizedList",
24820 <<
"SLP vectorized with cost " <<
ore::NV(
"Cost",
Cost)
24821 <<
" and with tree size "
24822 <<
ore::NV(
"TreeSize",
R.getTreeSize()));
24833 if (!
Changed && CandidateFound) {
24834 R.getORE()->emit([&]() {
24835 return OptimizationRemarkMissed(
SV_NAME,
"NotBeneficial", I0)
24836 <<
"List vectorization was possible but not beneficial with cost "
24837 <<
ore::NV(
"Cost", MinCost) <<
" >= "
24841 R.getORE()->emit([&]() {
24842 return OptimizationRemarkMissed(
SV_NAME,
"NotPossible", I0)
24843 <<
"Cannot SLP vectorize list: vectorization was impossible"
24844 <<
" with available vectorization factors";
24879 using ReductionOpsType = SmallVector<Value *, 16>;
24880 using ReductionOpsListType = SmallVector<ReductionOpsType, 2>;
24881 ReductionOpsListType ReductionOps;
24885 SmallDenseMap<Value *, SmallVector<Instruction *>, 16> ReducedValsToOps;
24886 WeakTrackingVH ReductionRoot;
24891 bool IsSupportedHorRdxIdentityOp =
false;
24898  static bool isCmpSelMinMax(Instruction *I) {
24906  static bool isBoolLogicOp(Instruction *I) {
24912  static bool isVectorizable(RecurKind Kind, Instruction *I,
24913                             bool TwoElementReduction = false) {
24914    if (Kind == RecurKind::None)
24923    if (TwoElementReduction)
24926    if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
24930      return I->getFastMathFlags().noNaNs();
24933    if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum)
24936    return I->isAssociative();
24939  static Value *getRdxOperand(Instruction *I, unsigned Index) {
24945 return I->getOperand(2);
24946 return I->getOperand(Index);
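// createOp emits one scalar reduction step: Or/And can use a select form when
// UseSelect is set, Add/Mul/Xor/FAdd/FMul become plain binops, integer min/max
// is emitted either as cmp+select or as the matching intrinsic, and the FP
// min/max kinds go through their intrinsics.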
24951 Value *
RHS,
const Twine &Name,
bool UseSelect) {
24955 case RecurKind::Or: {
24964 case RecurKind::And: {
24974 case RecurKind::Add:
24975 case RecurKind::Mul:
24976 case RecurKind::Xor:
24977 case RecurKind::FAdd:
24978 case RecurKind::FMul: {
24983 case RecurKind::SMax:
24984 case RecurKind::SMin:
24985 case RecurKind::UMax:
24986 case RecurKind::UMin:
24994 case RecurKind::FMax:
24995 case RecurKind::FMin:
24996 case RecurKind::FMaximum:
24997 case RecurKind::FMinimum:
24998 case RecurKind::FMaximumNum:
24999 case RecurKind::FMinimumNum: {
25012 const ReductionOpsListType &ReductionOps) {
25013 bool UseSelect = ReductionOps.size() == 2 ||
25015 (ReductionOps.size() == 1 &&
25017 assert((!UseSelect || ReductionOps.size() != 2 ||
25019 "Expected cmp + select pairs for reduction");
25020 Value *
Op = createOp(Builder, RdxKind,
LHS,
RHS, Name, UseSelect);
25038 return RecurKind::None;
25040 return RecurKind::Add;
25042 return RecurKind::Mul;
25045 return RecurKind::And;
25048 return RecurKind::Or;
25050 return RecurKind::Xor;
25052 return RecurKind::FAdd;
25054 return RecurKind::FMul;
25057 return RecurKind::FMax;
25059 return RecurKind::FMin;
25062 return RecurKind::FMaximum;
25064 return RecurKind::FMinimum;
25070 return RecurKind::SMax;
25072 return RecurKind::SMin;
25074 return RecurKind::UMax;
25076 return RecurKind::UMin;
25102 return RecurKind::None;
25106 return RecurKind::None;
25109 return RecurKind::None;
25113 return RecurKind::None;
25118 return RecurKind::None;
25121 return RecurKind::SMax;
25124 return RecurKind::SMin;
25127 return RecurKind::UMax;
25130 return RecurKind::UMin;
25133 return RecurKind::None;
25137  static unsigned getFirstOperandIndex(Instruction *I) {
25138    return isCmpSelMinMax(I) ? 1 : 0;
25143  static unsigned getNumberOfOperands(Instruction *I) {
25144    return isCmpSelMinMax(I) ? 3 : 2;
25149  static bool hasSameParent(Instruction *I, BasicBlock *BB) {
25150    if (isCmpSelMinMax(I) || isBoolLogicOp(I)) {
25153      return Sel->getParent() == BB && Cmp && Cmp->getParent() == BB;
25155    return I->getParent() == BB;
25159  static bool hasRequiredNumberOfUses(bool IsCmpSelMinMax, Instruction *I) {
25160    if (IsCmpSelMinMax) {
25164      return Sel->hasNUses(2) && Sel->getCondition()->hasOneUse();
25165    return I->hasNUses(2);
25173  void initReductionOps(Instruction *I) {
25174    if (isCmpSelMinMax(I))
25175      ReductionOps.assign(2, ReductionOpsType());
25177      ReductionOps.assign(1, ReductionOpsType());
25181  void addReductionOps(Instruction *I) {
25182    if (isCmpSelMinMax(I)) {
25184      ReductionOps[1].emplace_back(I);
25186      ReductionOps[0].emplace_back(I);
25191    int Sz = Data.size();
25200      : ReductionRoot(I), ReductionLimit(2) {
25201    RdxKind = HorizontalReduction::getRdxKind(I);
25202    ReductionOps.emplace_back().push_back(I);
25205      ReducedValsToOps[V].push_back(I);
25208 bool matchReductionForOperands()
const {
25211 assert(ReductionRoot &&
"Reduction root is not set!");
25214 return Ops.size() == 2;
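// matchAssociativeReduction (below) checks that Root really heads a reduction
// of the detected kind and then walks the operand tree: operands that are
// further reduction ops of the same kind (with the required single-use
// structure) extend the tree, everything else becomes a reduced value.
// Reduced values are grouped by key (e.g. loads off the same base pointer) so
// similar values land in the same slice of ReducedVals.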
25222 bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root,
25223 ScalarEvolution &SE,
const DataLayout &
DL,
25224 const TargetLibraryInfo &TLI) {
25225 RdxKind = HorizontalReduction::getRdxKind(Root);
25226 if (!isVectorizable(RdxKind, Root))
25238 if (!Sel->getCondition()->hasOneUse())
25241 ReductionRoot = Root;
25246 bool IsCmpSelMinMax = isCmpSelMinMax(Root);
25248 1, std::make_pair(Root, 0));
25253 SmallVectorImpl<Value *> &PossibleReducedVals,
25254 SmallVectorImpl<Instruction *> &ReductionOps,
25257 getNumberOfOperands(TreeN)))) {
25258 Value *EdgeVal = getRdxOperand(TreeN,
I);
25259 ReducedValsToOps[EdgeVal].push_back(TreeN);
25267 IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
25268 !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
25269 !isVectorizable(RdxKind, EdgeInst) ||
25270 (
R.isAnalyzedReductionRoot(EdgeInst) &&
25272 PossibleReducedVals.push_back(EdgeVal);
25275 ReductionOps.push_back(EdgeInst);
25284 size_t, SmallMapVector<size_t, SmallMapVector<Value *, unsigned, 2>, 2>,
25286 PossibleReducedVals;
25287 initReductionOps(Root);
25289 SmallSet<size_t, 2> LoadKeyUsed;
25291 auto GenerateLoadsSubkey = [&](
size_t Key, LoadInst *LI) {
25296 auto LIt = LoadsMap.
find(std::make_pair(
Key, Ptr));
25297 if (LIt != LoadsMap.
end()) {
25298 for (LoadInst *RLI : LIt->second) {
25304 for (LoadInst *RLI : LIt->second) {
25311 if (LIt->second.size() > 2) {
25313 hash_value(LIt->second.back()->getPointerOperand());
25319 .first->second.push_back(LI);
25323 while (!Worklist.empty()) {
25324 auto [TreeN,
Level] = Worklist.pop_back_val();
25327 CheckOperands(TreeN, PossibleRedVals, PossibleReductionOps, Level);
25328 addReductionOps(TreeN);
25331 for (
Value *V : PossibleRedVals) {
25335 ++PossibleReducedVals[
Key][Idx].
try_emplace(V, 0).first->second;
25337 for (Instruction *
I :
reverse(PossibleReductionOps))
25338 Worklist.emplace_back(
I,
I->getParent() == BB ? 0 : Level + 1);
25340 auto PossibleReducedValsVect = PossibleReducedVals.
takeVector();
25343 for (
auto &PossibleReducedVals : PossibleReducedValsVect) {
25344 auto PossibleRedVals = PossibleReducedVals.second.
takeVector();
25346 for (
auto &Slice : PossibleRedVals) {
25348 auto RedValsVect = Slice.second.takeVector();
25350 for (
const std::pair<Value *, unsigned> &
Data : RedValsVect)
25351 PossibleRedValsVect.
back().append(
Data.second,
Data.first);
25353 stable_sort(PossibleRedValsVect, [](
const auto &P1,
const auto &P2) {
25354 return P1.size() >
P2.size();
25361 }
else if (!isGoodForReduction(
Data)) {
25364 if (!LI || !LastLI ||
25369 ReducedVals.
back().append(
Data.rbegin(),
Data.rend());
25375 return P1.size() >
P2.size();
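// tryToReduce (below) is the code-generation half: it takes the grouped
// reduced values, picks a register-size-driven ReduxWidth, repeatedly builds an
// SLP tree for a window of values, compares vector vs. scalar reduction cost
// (getReductionCost), emits the vectorized subtree plus the final reduction,
// and folds any leftover scalars back in with createOp. It returns the value
// that replaces the original reduction root, or nullptr if nothing was done.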
25381 Value *tryToReduce(BoUpSLP &V,
const DataLayout &
DL, TargetTransformInfo *
TTI,
25382 const TargetLibraryInfo &TLI, AssumptionCache *AC,
25383 DominatorTree &DT) {
25384 constexpr unsigned RegMaxNumber = 4;
25385 constexpr unsigned RedValsMaxNumber = 128;
25389 if (
unsigned NumReducedVals = std::accumulate(
25390 ReducedVals.
begin(), ReducedVals.
end(), 0,
25392 if (!isGoodForReduction(Vals))
25394 return Num + Vals.size();
25396 NumReducedVals < ReductionLimit &&
25400 for (ReductionOpsType &RdxOps : ReductionOps)
25401 for (
Value *RdxOp : RdxOps)
25406 IRBuilder<TargetFolder> Builder(ReductionRoot->getContext(),
25412 DenseMap<Value *, WeakTrackingVH> TrackedVals(ReducedVals.
size() *
25413 ReducedVals.
front().size());
25417 auto &&GetCmpForMinMaxReduction = [](
Instruction *RdxRootInst) {
25419 "Expected min/max reduction to have select root instruction");
25422 "Expected min/max reduction to have compare condition");
25426 bool AnyBoolLogicOp =
any_of(ReductionOps.back(), [](
Value *V) {
25427 return isBoolLogicOp(cast<Instruction>(V));
25430 auto GetNewVectorizedTree = [&](
Value *VectorizedTree,
Value *Res) {
25431 if (VectorizedTree) {
25435 if (AnyBoolLogicOp) {
25436 auto It = ReducedValsToOps.
find(VectorizedTree);
25437 auto It1 = ReducedValsToOps.
find(Res);
25438 if ((It == ReducedValsToOps.
end() && It1 == ReducedValsToOps.
end()) ||
25440 (It != ReducedValsToOps.
end() &&
25441 any_of(It->getSecond(), [&](Instruction *
I) {
25442 return isBoolLogicOp(I) &&
25443 getRdxOperand(I, 0) == VectorizedTree;
25447 (It1 != ReducedValsToOps.
end() &&
25448 any_of(It1->getSecond(), [&](Instruction *
I) {
25449 return isBoolLogicOp(I) && getRdxOperand(I, 0) == Res;
25453 VectorizedTree = Builder.
CreateFreeze(VectorizedTree);
25457 return createOp(Builder, RdxKind, VectorizedTree, Res,
"op.rdx",
25463 SmallDenseSet<Value *> IgnoreList(ReductionOps.size() *
25464 ReductionOps.front().size());
25465 for (ReductionOpsType &RdxOps : ReductionOps)
25466 for (
Value *RdxOp : RdxOps) {
25469 IgnoreList.insert(RdxOp);
25472 FastMathFlags RdxFMF;
25474 for (
Value *U : IgnoreList)
25476 RdxFMF &= FPMO->getFastMathFlags();
25482 for (
Value *V : Candidates)
25483 TrackedVals.try_emplace(V, V);
25485 auto At = [](SmallMapVector<Value *, unsigned, 16> &MV,
25486 Value *
V) ->
unsigned & {
25487 auto *It = MV.
find(V);
25488 assert(It != MV.
end() &&
"Unable to find given key.");
25492 DenseMap<Value *, unsigned> VectorizedVals(ReducedVals.
size());
25495 SmallPtrSet<Value *, 4> RequiredExtract;
25496 WeakTrackingVH VectorizedTree =
nullptr;
25497 bool CheckForReusedReductionOps =
false;
25507 States.
back().getOpcode() == Instruction::Load)) {
25508 LocalReducedVals.
emplace_back().append(RV.begin(), RV.end());
25509 States.
push_back(InstructionsState::invalid());
25512 if (!LocalReducedVals.
empty() &&
25515 LocalReducedVals.
emplace_back().append(RV.begin(), RV.end());
25520 if (!LocalReducedVals.
empty())
25521 Ops = LocalReducedVals.
back();
25522 Ops.append(RV.begin(), RV.end());
25523 InstructionsCompatibilityAnalysis
Analysis(DT,
DL, *
TTI, TLI);
25524 InstructionsState OpS =
25526 if (LocalReducedVals.
empty()) {
25532 LocalReducedVals.
back().swap(
Ops);
25533 States.
back() = OpS;
25536 LocalReducedVals.
emplace_back().append(RV.begin(), RV.end());
25539 ReducedVals.swap(LocalReducedVals);
25540 for (
unsigned I = 0,
E = ReducedVals.
size();
I <
E; ++
I) {
25542 InstructionsState S = States[
I];
25545 DenseMap<Value *, Value *> TrackedToOrig(2 * OrigReducedVals.
size());
25546 for (
Value *ReducedVal : OrigReducedVals) {
25547 Value *RdxVal = TrackedVals.at(ReducedVal);
25554 (!S || (!S.getMatchingMainOpOrAltOp(Inst) &&
25555 !S.isCopyableElement(Inst)))) ||
25557 !S.isCopyableElement(RdxVal)))
25560 TrackedToOrig.try_emplace(RdxVal, ReducedVal);
25562 bool ShuffledExtracts =
false;
25564 if (S && S.getOpcode() == Instruction::ExtractElement &&
25565 !S.isAltShuffle() &&
I + 1 <
E) {
25567 for (
Value *RV : ReducedVals[
I + 1]) {
25568 Value *RdxVal = TrackedVals.at(RV);
25575 CommonCandidates.push_back(RdxVal);
25576 TrackedToOrig.try_emplace(RdxVal, RV);
25578 SmallVector<int>
Mask;
25581 Candidates.
swap(CommonCandidates);
25582 ShuffledExtracts =
true;
25589 Value *OrigV = TrackedToOrig.at(Candidates.
front());
25590 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
25592 Res = createOp(Builder, RdxKind, Res, VC,
"const.rdx", ReductionOps);
25593 Value *OrigV = TrackedToOrig.at(VC);
25594 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
25596 V.analyzedReductionRoot(ResI);
25598 VectorizedTree = GetNewVectorizedTree(VectorizedTree, Res);
25602 unsigned NumReducedVals = Candidates.
size();
25603 if (NumReducedVals < ReductionLimit &&
25604 (NumReducedVals < 2 || !
isSplat(Candidates)))
25609 IsSupportedHorRdxIdentityOp = RdxKind != RecurKind::Mul &&
25610 RdxKind != RecurKind::FMul &&
25611 RdxKind != RecurKind::FMulAdd;
25613 SmallMapVector<Value *, unsigned, 16> SameValuesCounter;
25614 if (IsSupportedHorRdxIdentityOp)
25615 for (
Value *V : Candidates) {
25616 Value *OrigV = TrackedToOrig.at(V);
25617 ++SameValuesCounter.
try_emplace(OrigV).first->second;
25629 bool SameScaleFactor =
false;
25630 bool OptReusedScalars = IsSupportedHorRdxIdentityOp &&
25631 SameValuesCounter.
size() != Candidates.size();
25633 if (OptReusedScalars) {
25635 (RdxKind == RecurKind::Add || RdxKind == RecurKind::FAdd ||
25636 RdxKind == RecurKind::Xor) &&
25638 [&SameValuesCounter](
const std::pair<Value *, unsigned> &
P) {
25639 return P.second == SameValuesCounter.
front().second;
25641 Candidates.resize(SameValuesCounter.
size());
25642 transform(SameValuesCounter, Candidates.begin(),
25643 [&](
const auto &
P) { return TrackedVals.at(P.first); });
25644 NumReducedVals = Candidates.size();
25646 if (NumReducedVals == 1) {
25647 Value *OrigV = TrackedToOrig.at(Candidates.front());
25648 unsigned Cnt = At(SameValuesCounter, OrigV);
25650 emitScaleForReusedOps(Candidates.front(), Builder, Cnt);
25651 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
25652 VectorizedVals.try_emplace(OrigV, Cnt);
25653 ExternallyUsedValues.
insert(OrigV);
25658 unsigned MaxVecRegSize =
V.getMaxVecRegSize();
25659 unsigned EltSize =
V.getVectorElementSize(Candidates[0]);
25660 const unsigned MaxElts = std::clamp<unsigned>(
25662 RegMaxNumber * RedValsMaxNumber);
25664 unsigned ReduxWidth = NumReducedVals;
25665 auto GetVectorFactor = [&, &
TTI = *
TTI](
unsigned ReduxWidth) {
25666 unsigned NumParts, NumRegs;
25667 Type *ScalarTy = Candidates.front()->getType();
25674 while (NumParts > NumRegs) {
25675 assert(ReduxWidth > 0 &&
"ReduxWidth is unexpectedly 0.");
25676 ReduxWidth =
bit_floor(ReduxWidth - 1);
25682 if (NumParts > NumRegs / 2)
25687 ReduxWidth = GetVectorFactor(ReduxWidth);
25688 ReduxWidth = std::min(ReduxWidth, MaxElts);
25690 unsigned Start = 0;
25691 unsigned Pos =
Start;
25693 unsigned PrevReduxWidth = ReduxWidth;
25694 bool CheckForReusedReductionOpsLocal =
false;
25695 auto AdjustReducedVals = [&](
bool IgnoreVL =
false) {
25696 bool IsAnyRedOpGathered = !IgnoreVL &&
V.isAnyGathered(IgnoreList);
25697 if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) {
25700 CheckForReusedReductionOpsLocal |= IsAnyRedOpGathered;
25703 if (Pos < NumReducedVals - ReduxWidth + 1)
25704 return IsAnyRedOpGathered;
25707 if (ReduxWidth > 1)
25708 ReduxWidth = GetVectorFactor(ReduxWidth);
25709 return IsAnyRedOpGathered;
25711 bool AnyVectorized =
false;
25712 SmallDenseSet<std::pair<unsigned, unsigned>, 8> IgnoredCandidates;
25713 while (Pos < NumReducedVals - ReduxWidth + 1 &&
25714 ReduxWidth >= ReductionLimit) {
25717 if (CheckForReusedReductionOpsLocal && PrevReduxWidth != ReduxWidth &&
25719 CheckForReusedReductionOps =
true;
25722 PrevReduxWidth = ReduxWidth;
25725 if (IgnoredCandidates.
contains(std::make_pair(Pos, ReduxWidth)) ||
25728 std::make_pair(Pos,
bit_floor(ReduxWidth))) ||
25730 std::make_pair(Pos + (ReduxWidth -
bit_floor(ReduxWidth)),
25732 V.areAnalyzedReductionVals(VL)) {
25733 (void)AdjustReducedVals(
true);
25740 return RedValI &&
V.isDeleted(RedValI);
25743 V.buildTree(VL, IgnoreList);
25744 if (
V.isTreeTinyAndNotFullyVectorizable(
true)) {
25745 if (!AdjustReducedVals())
25746 V.analyzedReductionVals(VL);
25749 if (
V.isLoadCombineReductionCandidate(RdxKind)) {
25750 if (!AdjustReducedVals())
25751 V.analyzedReductionVals(VL);
25754 V.reorderTopToBottom();
25757 VL.front()->getType()->isIntOrIntVectorTy() ||
25758 ReductionLimit > 2);
25762 ExternallyUsedValues);
25766 LocalExternallyUsedValues.insert(ReductionRoot);
25767 for (
unsigned Cnt = 0, Sz = ReducedVals.
size(); Cnt < Sz; ++Cnt) {
25768 if (Cnt ==
I || (ShuffledExtracts && Cnt ==
I - 1))
25770 for (
Value *V : ReducedVals[Cnt])
25772 LocalExternallyUsedValues.insert(TrackedVals[V]);
25774 if (!IsSupportedHorRdxIdentityOp) {
25777 "Reused values counter map is not empty");
25778 for (
unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
25779 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
25781 Value *
V = Candidates[Cnt];
25782 Value *OrigV = TrackedToOrig.at(V);
25783 ++SameValuesCounter.
try_emplace(OrigV).first->second;
25786 V.transformNodes();
25787 V.computeMinimumValueSizes();
25792 SmallPtrSet<Value *, 4> Visited;
25793 for (
unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
25794 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
25796 Value *RdxVal = Candidates[Cnt];
25797 if (
auto It = TrackedVals.find(RdxVal); It != TrackedVals.end())
25798 RdxVal = It->second;
25799 if (!Visited.
insert(RdxVal).second)
25803 if (!VLScalars.contains(RdxVal) &&
V.isVectorized(RdxVal)) {
25804 LocalExternallyUsedValues.insert(RdxVal);
25807 Value *OrigV = TrackedToOrig.at(RdxVal);
25809 VectorizedVals.lookup(OrigV) + At(SameValuesCounter, OrigV);
25810 if (
NumOps != ReducedValsToOps.
at(OrigV).size())
25811 LocalExternallyUsedValues.insert(RdxVal);
25814 if (!IsSupportedHorRdxIdentityOp)
25815 SameValuesCounter.
clear();
25816 for (
Value *RdxVal : VL)
25817 if (RequiredExtract.
contains(RdxVal))
25818 LocalExternallyUsedValues.insert(RdxVal);
25819 V.buildExternalUses(LocalExternallyUsedValues);
25823 if (
V.isReducedBitcastRoot())
25827 getReductionCost(
TTI, VL, IsCmpSelMinMax, RdxFMF, V, DT,
DL, TLI);
25830 <<
" for reduction\n");
25834 V.getORE()->emit([&]() {
25835 return OptimizationRemarkMissed(
SV_NAME,
"HorSLPNotBeneficial",
25836 ReducedValsToOps.
at(VL[0]).front())
25837 <<
"Vectorizing horizontal reduction is possible "
25838 <<
"but not beneficial with cost " <<
ore::NV(
"Cost",
Cost)
25839 <<
" and threshold "
25842 if (!AdjustReducedVals()) {
25843 V.analyzedReductionVals(VL);
25845 if (ReduxWidth > ReductionLimit &&
V.isTreeNotExtendable()) {
25848 *
TTI, VL.front()->getType(), ReduxWidth - 1);
25849 VF >= ReductionLimit;
25851 *
TTI, VL.front()->getType(), VF - 1)) {
25853 V.getCanonicalGraphSize() !=
V.getTreeSize())
25856 IgnoredCandidates.
insert(std::make_pair(
Offset + Idx, VF));
25863 LLVM_DEBUG(
dbgs() <<
"SLP: Vectorizing horizontal reduction at cost:"
25864 <<
Cost <<
". (HorRdx)\n");
25865 V.getORE()->emit([&]() {
25866 return OptimizationRemark(
SV_NAME,
"VectorizedHorizontalReduction",
25867 ReducedValsToOps.
at(VL[0]).front())
25868 <<
"Vectorized horizontal reduction with cost "
25869 <<
ore::NV(
"Cost",
Cost) <<
" and with tree size "
25870 <<
ore::NV(
"TreeSize",
V.getTreeSize());
25879 if (IsCmpSelMinMax)
25880 InsertPt = GetCmpForMinMaxReduction(RdxRootInst);
25883 Value *VectorizedRoot =
V.vectorizeTree(
25884 LocalExternallyUsedValues, InsertPt, VectorValuesAndScales);
25887 for (
Value *RdxVal : Candidates) {
25888 Value *OrigVal = TrackedToOrig.at(RdxVal);
25889 Value *TransformedRdxVal = TrackedVals.at(OrigVal);
25890 if (TransformedRdxVal != RdxVal)
25891 TrackedToOrig.try_emplace(TransformedRdxVal, OrigVal);
25900 VectorizedRoot = Builder.
CreateFreeze(VectorizedRoot);
25903 if (OptReusedScalars && !SameScaleFactor) {
25904 VectorizedRoot = emitReusedOps(VectorizedRoot, Builder, V,
25905 SameValuesCounter, TrackedToOrig);
25908 Type *ScalarTy = VL.front()->getType();
25913 OptReusedScalars && SameScaleFactor
25914 ? SameValuesCounter.
front().second
25917 ?
V.isSignedMinBitwidthRootNode()
25921 for (
Value *RdxVal : VL) {
25922 Value *OrigV = TrackedToOrig.at(RdxVal);
25923 if (IsSupportedHorRdxIdentityOp) {
25924 VectorizedVals.try_emplace(OrigV, At(SameValuesCounter, OrigV));
25927 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
25928 if (!
V.isVectorized(RdxVal))
25929 RequiredExtract.
insert(RdxVal);
25933 ReduxWidth = NumReducedVals - Pos;
25934 if (ReduxWidth > 1)
25935 ReduxWidth = GetVectorFactor(NumReducedVals - Pos);
25936 AnyVectorized =
true;
25938 if (OptReusedScalars && !AnyVectorized) {
25939 for (
const std::pair<Value *, unsigned> &
P : SameValuesCounter) {
25940 Value *RdxVal = TrackedVals.at(
P.first);
25941 Value *RedVal = emitScaleForReusedOps(RdxVal, Builder,
P.second);
25942 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
25943 VectorizedVals.try_emplace(
P.first,
P.second);
25948 if (!VectorValuesAndScales.
empty())
25949 VectorizedTree = GetNewVectorizedTree(
25950 VectorizedTree, emitReduction(Builder, *
TTI, ReductionRoot->getType(),
25951 V.isReducedBitcastRoot()));
25953 if (!VectorizedTree) {
25954 if (!CheckForReusedReductionOps) {
25955 for (ReductionOpsType &RdxOps : ReductionOps)
25956 for (
Value *RdxOp : RdxOps)
25978 auto FixBoolLogicalOps =
25981 if (!AnyBoolLogicOp)
25983 if (isBoolLogicOp(RedOp1) && ((!InitStep &&
LHS == VectorizedTree) ||
25984 getRdxOperand(RedOp1, 0) ==
LHS ||
25987 bool NeedFreeze =
LHS != VectorizedTree;
25988 if (isBoolLogicOp(RedOp2) && ((!InitStep &&
RHS == VectorizedTree) ||
25989 getRdxOperand(RedOp2, 0) ==
RHS ||
25992 if ((InitStep ||
RHS != VectorizedTree) &&
25993 getRdxOperand(RedOp2, 0) ==
RHS &&
25994 ((isBoolLogicOp(RedOp1) &&
25995 getRdxOperand(RedOp1, 1) == RedOp2) ||
25999 return OpI && isBoolLogicOp(OpI) &&
26000 getRdxOperand(OpI, 1) == RedOp2;
26003 NeedFreeze =
false;
26017 unsigned Sz = InstVals.
size();
26019 for (
unsigned I = 0,
E = (Sz / 2) * 2;
I <
E;
I += 2) {
26022 Value *RdxVal1 = InstVals[
I].second;
26023 Value *StableRdxVal1 = RdxVal1;
26024 auto It1 = TrackedVals.find(RdxVal1);
26025 if (It1 != TrackedVals.end())
26026 StableRdxVal1 = It1->second;
26027 Value *RdxVal2 = InstVals[
I + 1].second;
26028 Value *StableRdxVal2 = RdxVal2;
26029 auto It2 = TrackedVals.find(RdxVal2);
26030 if (It2 != TrackedVals.end())
26031 StableRdxVal2 = It2->second;
26035 FixBoolLogicalOps(StableRdxVal1, StableRdxVal2, InstVals[
I].first,
26037 Value *ExtraRed = createOp(Builder, RdxKind, StableRdxVal1,
26038 StableRdxVal2,
"op.rdx", ReductionOps);
26039 ExtraReds[
I / 2] = std::make_pair(InstVals[
I].first, ExtraRed);
26042 ExtraReds[Sz / 2] = InstVals.
back();
26048 SmallPtrSet<Value *, 8> Visited;
26050 for (
Value *RdxVal : Candidates) {
26051 if (!Visited.
insert(RdxVal).second)
26053 unsigned NumOps = VectorizedVals.lookup(RdxVal);
26054 for (Instruction *RedOp :
26060 bool InitStep =
true;
26061 while (ExtraReductions.
size() > 1) {
26063 FinalGen(ExtraReductions, InitStep);
26064 ExtraReductions.
swap(NewReds);
26067 VectorizedTree = ExtraReductions.
front().second;
26069 ReductionRoot->replaceAllUsesWith(VectorizedTree);
26076 SmallPtrSet<Value *, 4> IgnoreSet;
26085 for (
auto *U :
Ignore->users()) {
26087 "All users must be either in the reduction ops list.");
26090 if (!
Ignore->use_empty()) {
26092 Ignore->replaceAllUsesWith(
P);
26095 V.removeInstructionsAndOperands(RdxOps, VectorValuesAndScales);
26097 return VectorizedTree;
26103 Value *createSingleOp(IRBuilderBase &Builder,
const TargetTransformInfo &
TTI,
26104 Value *Vec,
unsigned Scale,
bool IsSigned,
Type *DestTy,
26105 bool ReducedInTree) {
26107 if (ReducedInTree) {
26130 Rdx, emitReduction(Lane, Builder, &
TTI, DestTy),
I);
26133 Rdx = emitReduction(Vec, Builder, &
TTI, DestTy);
26135 if (Rdx->
getType() != DestTy)
26141 Rdx = emitScaleForReusedOps(Rdx, Builder, Scale);
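// getReductionCost compares the scalar reduction (one op per reduced value,
// with an fmuladd adjustment for fadd-of-fmul chains) against the vector form:
// an arithmetic or min/max vector reduction plus any extend/truncate needed
// when the root node was minimized to a narrower type. The returned cost is
// VectorCost - ScalarCost, so negative values mean the reduction is profitable.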
26148 bool IsCmpSelMinMax, FastMathFlags FMF,
26149 const BoUpSLP &R, DominatorTree &DT,
26150 const DataLayout &
DL,
26151 const TargetLibraryInfo &TLI) {
26153 Type *ScalarTy = ReducedVals.
front()->getType();
26154 unsigned ReduxWidth = ReducedVals.
size();
26155 FixedVectorType *VectorTy =
R.getReductionType();
26160 auto EvaluateScalarCost = [&](function_ref<
InstructionCost()> GenCostFn) {
26163 int Cnt = ReducedVals.
size();
26164 for (
Value *RdxVal : ReducedVals) {
26171 Cost += GenCostFn();
26175 for (User *U : RdxVal->
users()) {
26177 if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) {
26178 if (RdxKind == RecurKind::FAdd) {
26188 FMACost -= FMulCost;
26190 ScalarCost += FMACost;
26197 ScalarCost = InstructionCost::getInvalid();
26201 Cost += ScalarCost;
26203 Cost += GenCostFn();
26212 bool DoesRequireReductionOp = !AllConsts && VectorValuesAndScales.
empty();
26214 case RecurKind::Add:
26215 case RecurKind::Mul:
26216 case RecurKind::Or:
26217 case RecurKind::And:
26218 case RecurKind::Xor:
26219 case RecurKind::FAdd:
26220 case RecurKind::FMul: {
26223 if (DoesRequireReductionOp) {
26226 unsigned ScalarTyNumElements = VecTy->getNumElements();
26231 ReducedVals.size()),
26242 auto [RType, IsSigned] =
R.getRootNodeTypeWithNoCast().value_or(
26243 std::make_pair(RedTy,
true));
26244 if (RType == RedTy) {
26249 RdxOpcode, !IsSigned, RedTy,
26255 auto [RType, IsSigned] =
R.getRootNodeTypeWithNoCast().value_or(
26256 std::make_pair(RedTy,
true));
26259 if (RdxKind == RecurKind::FAdd) {
26264 for (
Value *RdxVal : ReducedVals) {
26270 FMF &= FPCI->getFastMathFlags();
26273 if (!
Ops.empty()) {
26278 IntrinsicCostAttributes ICA(Intrinsic::fmuladd, RVecTy,
26279 {RVecTy, RVecTy, RVecTy}, FMF);
26285 Instruction::FMul, RVecTy,
CostKind);
26287 <<
"Minus vector FMul cost: " << FMulCost <<
"\n");
26288 FMACost -= FMulCost;
26292 if (FMACost.isValid())
26293 VectorCost += FMACost;
26297 if (RType != RedTy) {
26298 unsigned Opcode = Instruction::Trunc;
26300 Opcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
26306 ScalarCost = EvaluateScalarCost([&]() {
26311 case RecurKind::FMax:
26312 case RecurKind::FMin:
26313 case RecurKind::FMaximum:
26314 case RecurKind::FMinimum:
26315 case RecurKind::SMax:
26316 case RecurKind::SMin:
26317 case RecurKind::UMax:
26318 case RecurKind::UMin: {
26321 if (DoesRequireReductionOp) {
26327 auto [RType, IsSigned] =
R.getRootNodeTypeWithNoCast().value_or(
26328 std::make_pair(RedTy,
true));
26330 IntrinsicCostAttributes ICA(Id, RVecTy, {RVecTy, RVecTy}, FMF);
26332 if (RType != RedTy) {
26333 unsigned Opcode = Instruction::Trunc;
26335 Opcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
26341 ScalarCost = EvaluateScalarCost([&]() {
26342 IntrinsicCostAttributes ICA(Id, ScalarTy, {ScalarTy, ScalarTy}, FMF);
26351 LLVM_DEBUG(
dbgs() <<
"SLP: Adding cost " << VectorCost - ScalarCost
26353 <<
" (It is a splitting reduction)\n");
26354 return VectorCost - ScalarCost;
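// This emitReduction overload combines the per-chunk vector values recorded in
// VectorValuesAndScales: chunks are merged with vector-width ops where the
// reduction kind allows it (resizing with shuffles when widths differ), i1
// add/fadd chunks get special handling, and the accumulated vector is finally
// collapsed with createSingleOp.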
26360 Value *emitReduction(IRBuilderBase &Builder,
const TargetTransformInfo &
TTI,
26361 Type *DestTy,
bool ReducedInTree) {
26362 Value *ReducedSubTree =
nullptr;
26364 auto CreateSingleOp = [&](
Value *Vec,
unsigned Scale,
bool IsSigned) {
26365 Value *Rdx = createSingleOp(Builder,
TTI, Vec, Scale, IsSigned, DestTy,
26367 if (ReducedSubTree)
26368 ReducedSubTree = createOp(Builder, RdxKind, ReducedSubTree, Rdx,
26369 "op.rdx", ReductionOps);
26371 ReducedSubTree = Rdx;
26373 if (VectorValuesAndScales.
size() == 1) {
26374 const auto &[Vec, Scale, IsSigned] = VectorValuesAndScales.
front();
26375 CreateSingleOp(Vec, Scale, IsSigned);
26376 return ReducedSubTree;
26380 Value *VecRes =
nullptr;
26381 bool VecResSignedness =
false;
26382 auto CreateVecOp = [&](
Value *Vec,
unsigned Cnt,
bool IsSigned) {
26388 case RecurKind::Add: {
26389 if (ScalarTy == Builder.
getInt1Ty() && ScalarTy != DestTy) {
26392 <<
". (HorRdx)\n");
26395 std::iota(std::next(
Mask.begin(), VF *
I),
26396 std::next(
Mask.begin(), VF * (
I + 1)), 0);
26397 ++NumVectorInstructions;
26408 LLVM_DEBUG(
dbgs() <<
"SLP: Add (to-mul) " << Cnt <<
"of " << Vec
26409 <<
". (HorRdx)\n");
26410 ++NumVectorInstructions;
26414 case RecurKind::Xor: {
26417 <<
"SLP: Xor " << Cnt <<
"of " << Vec <<
". (HorRdx)\n");
26422 case RecurKind::FAdd: {
26426 LLVM_DEBUG(
dbgs() <<
"SLP: FAdd (to-fmul) " << Cnt <<
"of " << Vec
26427 <<
". (HorRdx)\n");
26428 ++NumVectorInstructions;
26432 case RecurKind::And:
26433 case RecurKind::Or:
26434 case RecurKind::SMax:
26435 case RecurKind::SMin:
26436 case RecurKind::UMax:
26437 case RecurKind::UMin:
26438 case RecurKind::FMax:
26439 case RecurKind::FMin:
26440 case RecurKind::FMaximum:
26441 case RecurKind::FMinimum:
26444 case RecurKind::Sub:
26445 case RecurKind::AddChainWithSubs:
26446 case RecurKind::Mul:
26447 case RecurKind::FMul:
26448 case RecurKind::FMulAdd:
26449 case RecurKind::AnyOf:
26450 case RecurKind::FindFirstIVSMin:
26451 case RecurKind::FindFirstIVUMin:
26452 case RecurKind::FindLastIVSMax:
26453 case RecurKind::FindLastIVUMax:
26454 case RecurKind::FindLast:
26455 case RecurKind::FMaxNum:
26456 case RecurKind::FMinNum:
26457 case RecurKind::FMaximumNum:
26458 case RecurKind::FMinimumNum:
26459 case RecurKind::None:
26466 VecResSignedness = IsSigned;
26468 ++NumVectorInstructions;
26469      if (ScalarTy == Builder.getInt1Ty() && ScalarTy != DestTy &&
26475        std::iota(Mask.begin(), Mask.end(), 0);
26477      if (VecResVF < VecVF) {
26481      if (VecResVF != VecVF) {
26483        std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
26500      if (VecResVF < VecVF) {
26506      if (VecResVF != VecVF)
26508      Op = createOp(Builder, RdxKind, Op, Vec, "rdx.op", ReductionOps);
26509      if (VecResVF != VecVF)
26514    for (auto [Vec, Scale, IsSigned] : VectorValuesAndScales)
26515      CreateVecOp(Vec, Scale, IsSigned);
26516    CreateSingleOp(VecRes, 1, false);
26518 return ReducedSubTree;
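The CreateVecOp/CreateSingleOp path above leans on simple scalar identities once a reduced value is known to repeat Cnt times: an add of Cnt copies becomes a multiply by Cnt, while an xor of Cnt copies depends only on the parity of Cnt. A minimal standalone check of those identities, with plain integers and hypothetical helper names, is sketched below.

#include <cassert>
#include <cstdint>

static int64_t addRepeated(int64_t V, unsigned Cnt) {
  int64_t Sum = 0;
  for (unsigned I = 0; I < Cnt; ++I)
    Sum += V;   // Cnt additions of V ...
  return Sum;   // ... equal V * Cnt.
}

static int64_t xorRepeated(int64_t V, unsigned Cnt) {
  int64_t Acc = 0;
  for (unsigned I = 0; I < Cnt; ++I)
    Acc ^= V;   // identical pairs cancel ...
  return Acc;   // ... so only the parity of Cnt survives.
}

int main() {
  assert(addRepeated(7, 5) == 7 * 5);
  assert(xorRepeated(7, 4) == 0); // even count cancels completely
  assert(xorRepeated(7, 5) == 7); // odd count leaves a single copy
  return 0;
}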
26522  Value *emitReduction(Value *VectorizedValue, IRBuilderBase &Builder,
26523                       const TargetTransformInfo *TTI, Type *DestTy) {
26524    assert(VectorizedValue && "Need to have a vectorized tree node");
26525    assert(RdxKind != RecurKind::FMulAdd &&
26526           "A call to the llvm.fmuladd intrinsic is not handled yet");
26529    if (FTy->getScalarType() == Builder.getInt1Ty() &&
26530        RdxKind == RecurKind::Add &&
26535          VectorizedValue, Builder.getIntNTy(FTy->getNumElements()));
26536 ++NumVectorInstructions;
26539 ++NumVectorInstructions;
26544  Value *emitScaleForReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
26546    assert(IsSupportedHorRdxIdentityOp &&
26547           "The optimization of matched scalar identity horizontal reductions "
26548           "must be supported.");
26550      return VectorizedValue;
26552    case RecurKind::Add: {
26554      Value *Scale = ConstantInt::get(VectorizedValue->getType(), Cnt);
26556                        << VectorizedValue << ". (HorRdx)\n");
26557      return Builder.CreateMul(VectorizedValue, Scale);
26559    case RecurKind::Xor: {
26561      LLVM_DEBUG(dbgs() << "SLP: Xor " << Cnt << "of " << VectorizedValue
26562                        << ". (HorRdx)\n");
26565      return VectorizedValue;
26567    case RecurKind::FAdd: {
26569      Value *Scale = ConstantFP::get(VectorizedValue->getType(), Cnt);
26571                        << VectorizedValue << ". (HorRdx)\n");
26572      return Builder.CreateFMul(VectorizedValue, Scale);
26574 case RecurKind::And:
26575 case RecurKind::Or:
26576 case RecurKind::SMax:
26577 case RecurKind::SMin:
26578 case RecurKind::UMax:
26579 case RecurKind::UMin:
26580 case RecurKind::FMax:
26581 case RecurKind::FMin:
26582 case RecurKind::FMaximum:
26583 case RecurKind::FMinimum:
26585 return VectorizedValue;
26586 case RecurKind::Sub:
26587 case RecurKind::AddChainWithSubs:
26588 case RecurKind::Mul:
26589 case RecurKind::FMul:
26590 case RecurKind::FMulAdd:
26591 case RecurKind::AnyOf:
26592 case RecurKind::FindFirstIVSMin:
26593 case RecurKind::FindFirstIVUMin:
26594 case RecurKind::FindLastIVSMax:
26595 case RecurKind::FindLastIVUMax:
26596 case RecurKind::FindLast:
26597 case RecurKind::FMaxNum:
26598 case RecurKind::FMinNum:
26599 case RecurKind::FMaximumNum:
26600 case RecurKind::FMinimumNum:
26601 case RecurKind::None:
26610  emitReusedOps(Value *VectorizedValue, IRBuilderBase &Builder, BoUpSLP &R,
26611                const SmallMapVector<Value *, unsigned, 16> &SameValuesCounter,
26612                const DenseMap<Value *, Value *> &TrackedToOrig) {
26613    assert(IsSupportedHorRdxIdentityOp &&
26614           "The optimization of matched scalar identity horizontal reductions "
26615           "must be supported.");
26618    if (VTy->getElementType() != VL.front()->getType()) {
26622          R.isSignedMinBitwidthRootNode());
26625    case RecurKind::Add: {
26628      for (Value *V : VL) {
26629        unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
26630        Vals.push_back(ConstantInt::get(V->getType(), Cnt, false));
26634                        << VectorizedValue << ". (HorRdx)\n");
26635      return Builder.CreateMul(VectorizedValue, Scale);
26637    case RecurKind::And:
26638    case RecurKind::Or:
26641                        << ". (HorRdx)\n");
26642      return VectorizedValue;
26643 case RecurKind::SMax:
26644 case RecurKind::SMin:
26645 case RecurKind::UMax:
26646 case RecurKind::UMin:
26647 case RecurKind::FMax:
26648 case RecurKind::FMin:
26649 case RecurKind::FMaximum:
26650 case RecurKind::FMinimum:
26653                        << ". (HorRdx)\n");
26654      return VectorizedValue;
26655    case RecurKind::Xor: {
26660      SmallVector<int> Mask(
26663      std::iota(Mask.begin(), Mask.end(), 0);
26664      bool NeedShuffle = false;
26665      for (unsigned I = 0, VF = VL.size(); I < VF; ++I) {
26667        unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
26668        if (Cnt % 2 == 0) {
26670          NeedShuffle = true;
26676          dbgs() << "> of " << VectorizedValue << ". (HorRdx)\n");
26680            ConstantVector::getNullValue(VectorizedValue->getType()), Mask);
26681      return VectorizedValue;
26683    case RecurKind::FAdd: {
26686      for (Value *V : VL) {
26687        unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
26688        Vals.push_back(ConstantFP::get(V->getType(), Cnt));
26691      return Builder.CreateFMul(VectorizedValue, Scale);
26693 case RecurKind::Sub:
26694 case RecurKind::AddChainWithSubs:
26695 case RecurKind::Mul:
26696 case RecurKind::FMul:
26697 case RecurKind::FMulAdd:
26698 case RecurKind::AnyOf:
26699 case RecurKind::FindFirstIVSMin:
26700 case RecurKind::FindFirstIVUMin:
26701 case RecurKind::FindLastIVSMax:
26702 case RecurKind::FindLastIVUMax:
26703 case RecurKind::FindLast:
26704 case RecurKind::FMaxNum:
26705 case RecurKind::FMinNum:
26706 case RecurKind::FMaximumNum:
26707 case RecurKind::FMinimumNum:
26708 case RecurKind::None:
26718 return HorizontalReduction::getRdxKind(V);
26724 unsigned AggregateSize = 1;
26726  Type *CurrentType = IV->getType();
26729      for (auto *Elt : ST->elements())
26730 if (Elt != ST->getElementType(0))
26731 return std::nullopt;
26732 AggregateSize *= ST->getNumElements();
26733 CurrentType = ST->getElementType(0);
26735 AggregateSize *= AT->getNumElements();
26736 CurrentType = AT->getElementType();
26738 AggregateSize *= VT->getNumElements();
26739 return AggregateSize;
26741 return AggregateSize;
26743 return std::nullopt;
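getAggregateSize above walks a nested homogeneous aggregate and multiplies the element count at each level, giving the total number of scalar lanes a buildvector/buildaggregate chain can fill. A self-contained sketch of the same walk is shown below; it mirrors the logic for illustration and is not the pass itself.

#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/Support/Casting.h"
#include <optional>

using namespace llvm;

static std::optional<unsigned> aggregateSize(Type *Ty) {
  unsigned Size = 1;
  while (true) {
    if (auto *ST = dyn_cast<StructType>(Ty)) {
      for (Type *Elt : ST->elements())
        if (Elt != ST->getElementType(0))
          return std::nullopt; // heterogeneous structs are not supported
      Size *= ST->getNumElements();
      Ty = ST->getElementType(0);
    } else if (auto *AT = dyn_cast<ArrayType>(Ty)) {
      Size *= AT->getNumElements();
      Ty = AT->getElementType();
    } else if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
      return Size * VT->getNumElements();
    } else {
      return Size; // plain scalar element
    }
  }
}

int main() {
  LLVMContext Ctx;
  // [2 x <4 x float>] -> 2 * 4 = 8 scalar lanes.
  Type *Inner = FixedVectorType::get(Type::getFloatTy(Ctx), 4);
  Type *Agg = ArrayType::get(Inner, 2);
  return aggregateSize(Agg) == 8 ? 0 : 1;
}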
26752                                   unsigned OperandOffset, const BoUpSLP &R) {
26755    std::optional<unsigned> OperandIndex =
26757    if (!OperandIndex || R.isDeleted(LastInsertInst))
26761                            BuildVectorOpds, InsertElts, *OperandIndex, R);
26764      BuildVectorOpds[*OperandIndex] = InsertedOperand;
26765      InsertElts[*OperandIndex] = LastInsertInst;
26768  } while (LastInsertInst != nullptr &&
26795         "Expected insertelement or insertvalue instruction!");
26798         "Expected empty result vectors!");
26801  if (!AggregateSize)
26803  BuildVectorOpds.resize(*AggregateSize);
26804  InsertElts.resize(*AggregateSize);
26809  if (BuildVectorOpds.size() >= 2)
26827  auto DominatedReduxValue = [&](Value *R) {
26835  if (P->getIncomingBlock(0) == ParentBB) {
26837  } else if (P->getIncomingBlock(1) == ParentBB) {
26841  if (Rdx && DominatedReduxValue(Rdx))
26854  if (P->getIncomingBlock(0) == BBLatch) {
26856  } else if (P->getIncomingBlock(1) == BBLatch) {
26860  if (Rdx && DominatedReduxValue(Rdx))
26896         "Expected binop, select, or intrinsic for reduction matching");
26898      Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root));
26900      Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root) + 1);
26911  Value *Op0 = nullptr;
26912  Value *Op1 = nullptr;
26921  Value *B0 = nullptr, *B1 = nullptr;
26926bool SLPVectorizerPass::vectorizeHorReduction(
26927    PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
26928    SmallVectorImpl<WeakTrackingVH> &PostponedInsts) {
26937  auto SelectRoot = [&]() {
26939        HorizontalReduction::getRdxKind(Root) != RecurKind::None)
26956  std::queue<std::pair<Instruction *, unsigned>> Stack;
26957  Stack.emplace(SelectRoot(), 0);
26958  SmallPtrSet<Value *, 8> VisitedInstrs;
26961    if (R.isAnalyzedReductionRoot(Inst))
26966    if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI))
26968    return HorRdx.tryToReduce(R, *DL, TTI, *TLI, AC, *DT);
26970  auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) {
26971    if (TryOperandsAsNewSeeds && FutureSeed == Root) {
26983  while (!Stack.empty()) {
26986    std::tie(Inst, Level) = Stack.front();
26991    if (R.isDeleted(Inst))
26993    if (Value *VectorizedV = TryToReduce(Inst)) {
26997        Stack.emplace(I, Level);
27000    if (R.isDeleted(Inst))
27004    if (!TryAppendToPostponedInsts(Inst)) {
27015      if (VisitedInstrs.insert(Op).second)
27020          !R.isDeleted(I) && I->getParent() == BB)
27021        Stack.emplace(I, Level);
27026bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
27033  if ((I->getOpcode() == Instruction::FAdd ||
27034       I->getOpcode() == Instruction::FSub) &&
27044  if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P ||
27045      R.isDeleted(Op0) || R.isDeleted(Op1))
27055  if (A && B && B->hasOneUse()) {
27058    if (B0 && B0->getParent() == P && !R.isDeleted(B0))
27060    if (B1 && B1->getParent() == P && !R.isDeleted(B1))
27064  if (B && A && A->hasOneUse()) {
27067    if (A0 && A0->getParent() == P && !R.isDeleted(A0))
27069    if (A1 && A1->getParent() == P && !R.isDeleted(A1))
27073  auto TryToReduce = [this, &R, &TTI = *TTI](Instruction *Inst,
27077    Type *Ty = Inst->getType();
27081    if (!HorRdx.matchReductionForOperands())
27087        TTI.getScalarizationOverhead(
27090        TTI.getInstructionCost(Inst, CostKind);
27093 case RecurKind::Add:
27094 case RecurKind::Mul:
27095 case RecurKind::Or:
27096 case RecurKind::And:
27097 case RecurKind::Xor:
27098 case RecurKind::FAdd:
27099 case RecurKind::FMul: {
27102 FMF = FPCI->getFastMathFlags();
27103 RedCost = TTI.getArithmeticReductionCost(Inst->getOpcode(), VecTy, FMF,
27110 if (RedCost >= ScalarCost)
27113    return HorRdx.tryToReduce(R, *DL, &TTI, *TLI, AC, *DT) != nullptr;
27115  if (Candidates.size() == 1)
27116    return TryToReduce(I, {Op0, Op1}) || tryToVectorizeList({Op0, Op1}, R);
27119  std::optional<int> BestCandidate = R.findBestRootPair(Candidates);
27120  if (!BestCandidate)
27122  return (*BestCandidate == 0 &&
27123          TryToReduce(I, {Candidates[*BestCandidate].first,
27124                          Candidates[*BestCandidate].second})) ||
27125         tryToVectorizeList({Candidates[*BestCandidate].first,
27126                             Candidates[*BestCandidate].second},
27130bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Instruction *Root,
27131                                                 BasicBlock *BB, BoUpSLP &R) {
27133  bool Res = vectorizeHorReduction(P, Root, BB, R, PostponedInsts);
27134  Res |= tryToVectorize(PostponedInsts, R);
27141  for (Value *V : Insts)
27143      Res |= tryToVectorize(Inst, R);
27147bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
27150  if (!R.canMapToVector(IVI->getType()))
27153  SmallVector<Value *, 16> BuildVectorOpds;
27154  SmallVector<Value *, 16> BuildVectorInsts;
27158  if (MaxVFOnly && BuildVectorOpds.size() == 2) {
27159    R.getORE()->emit([&]() {
27160      return OptimizationRemarkMissed(SV_NAME, "NotPossible", IVI)
27161             << "Cannot SLP vectorize list: only 2 elements of buildvalue, "
27162                "trying reduction first.";
27166  LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
27168  return tryToVectorizeList(BuildVectorOpds, R, MaxVFOnly);
27171bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
27174  SmallVector<Value *, 16> BuildVectorInsts;
27175  SmallVector<Value *, 16> BuildVectorOpds;
27176  SmallVector<int> Mask;
27182  if (MaxVFOnly && BuildVectorInsts.size() == 2) {
27183    R.getORE()->emit([&]() {
27184      return OptimizationRemarkMissed(SV_NAME, "NotPossible", IEI)
27185             << "Cannot SLP vectorize list: only 2 elements of buildvector, "
27186                "trying reduction first.";
27190  LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n");
27191 return tryToVectorizeList(BuildVectorInsts, R, MaxVFOnly);
27194template <typename T>
27199                                    bool MaxVFOnly, BoUpSLP &R) {
27212    if (!I || R.isDeleted(I)) {
27216    auto *SameTypeIt = IncIt;
27219            AreCompatible(VL, *SameTypeIt))) {
27222      if (I && !R.isDeleted(I))
27227    unsigned NumElts = VL.size();
27228    LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at nodes ("
27229                      << NumElts << ")\n");
27239    if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL), MaxVFOnly)) {
27242      VL.swap(Candidates);
27243      Candidates.clear();
27251    auto GetMinNumElements = [&R](Value *V) {
27252      unsigned EltSize = R.getVectorElementSize(V);
27253      return std::max(2U, R.getMaxVecRegSize() / EltSize);
27255    if (NumElts < GetMinNumElements(*IncIt) &&
27256        (Candidates.empty() ||
27257         Candidates.front()->getType() == (*IncIt)->getType())) {
27265    if (Candidates.size() > 1 &&
27266        (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) {
27267      if (TryToVectorizeHelper(Candidates, false)) {
27270      } else if (MaxVFOnly) {
27273        for (auto *It = Candidates.begin(), *End = Candidates.end(); It != End;
27276          if (!I || R.isDeleted(I)) {
27280          auto *SameTypeIt = It;
27281          while (SameTypeIt != End &&
27284                  AreCompatible(*SameTypeIt, *It))) {
27287            if (I && !R.isDeleted(I))
27290          unsigned NumElts = VL.size();
27291          if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL),
27297        Candidates.clear();
27301 IncIt = SameTypeIt;
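tryToVectorizeSequence above is a generic driver: it assumes the incoming list has been sorted so that compatible values end up adjacent, then sweeps the list and hands each maximal compatible run to a vectorization callback, retrying leftovers with relaxed constraints. A simplified sketch of that grouping loop, using plain standard-library types and hypothetical names (no retry logic), is shown below.

#include <algorithm>
#include <cassert>
#include <functional>
#include <iterator>
#include <vector>

// Sort candidates so compatible values become adjacent, then try to vectorize
// each maximal run of length >= 2. Returns true if any run was "vectorized".
template <typename T>
bool vectorizeRuns(std::vector<T> &Incoming,
                   std::function<bool(const T &, const T &)> Less,
                   std::function<bool(const T &, const T &)> Compatible,
                   std::function<bool(const std::vector<T> &)> TryVectorize) {
  std::stable_sort(Incoming.begin(), Incoming.end(), Less);
  bool Changed = false;
  for (auto It = Incoming.begin(), E = Incoming.end(); It != E;) {
    auto RunEnd = std::next(It);
    while (RunEnd != E && Compatible(*It, *RunEnd))
      ++RunEnd;
    if (std::distance(It, RunEnd) > 1) // a single element cannot form a bundle
      Changed |= TryVectorize(std::vector<T>(It, RunEnd));
    It = RunEnd;
  }
  return Changed;
}

int main() {
  std::vector<int> Vals = {3, 1, 3, 2, 1};
  bool Changed = vectorizeRuns<int>(
      Vals, [](int A, int B) { return A < B; },
      [](int A, int B) { return A == B; }, // "compatible" here means equal
      [](const std::vector<int> &Run) { return Run.size() >= 2; });
  assert(Changed); // the two 1s and the two 3s each form a run
  return 0;
}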
27313template <bool IsCompatibility>
27318 "Expected valid element types only.");
27320 return IsCompatibility;
27323 if (CI1->getOperand(0)->getType()->getTypeID() <
27325 return !IsCompatibility;
27326 if (CI1->getOperand(0)->getType()->getTypeID() >
27329 if (CI1->getOperand(0)->getType()->getScalarSizeInBits() <
27331 return !IsCompatibility;
27332 if (CI1->getOperand(0)->getType()->getScalarSizeInBits() >
27341 if (BasePred1 < BasePred2)
27342 return !IsCompatibility;
27343 if (BasePred1 > BasePred2)
27346 bool CI1Preds = Pred1 == BasePred1;
27347 bool CI2Preds = Pred2 == BasePred1;
27348  for (int I = 0, E = CI1->getNumOperands(); I < E; ++I) {
27349    auto *Op1 = CI1->getOperand(CI1Preds ? I : E - I - 1);
27354 return !IsCompatibility;
27359 if (IsCompatibility) {
27360 if (I1->getParent() != I2->getParent())
27367      return NodeI2 != nullptr;
27370 assert((NodeI1 == NodeI2) ==
27372 "Different nodes should have different DFS numbers");
27373 if (NodeI1 != NodeI2)
27377 if (S && (IsCompatibility || !S.isAltShuffle()))
27379 if (IsCompatibility)
27381 if (I1->getOpcode() != I2->getOpcode())
27382 return I1->getOpcode() < I2->getOpcode();
27385 return IsCompatibility;
27388template <typename ItT>
27394    if (R.isDeleted(I))
27398      Changed |= vectorizeRootInstruction(nullptr, RootOp, BB, R);
27399    if (R.isDeleted(I))
27405    if (R.isDeleted(I))
27411  auto CompareSorter = [&](Value *V, Value *V2) {
27427  if (Vals.size() <= 1)
27430      Vals, CompareSorter, AreCompatibleCompares,
27433        bool ArePossiblyReducedInOtherBlock = any_of(Candidates, [](Value *V) {
27435 auto *Select = dyn_cast<SelectInst>(U);
27437 Select->getParent() != cast<Instruction>(V)->getParent();
27440 if (ArePossiblyReducedInOtherBlock)
27442 return tryToVectorizeList(Candidates, R, MaxVFOnly);
27448bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions,
27451 "This function only accepts Insert instructions");
27452  bool OpsChanged = false;
27454  for (auto *I : reverse(Instructions)) {
27460          vectorizeInsertValueInst(LastInsertValue, BB, R, true);
27463          vectorizeInsertElementInst(LastInsertElem, BB, R, true);
27466    if (R.isDeleted(I))
27468    OpsChanged |= vectorizeHorReduction(nullptr, I, BB, R, PostponedInsts);
27474        vectorizeInsertValueInst(LastInsertValue, BB, R, false);
27476 OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R,
27481 OpsChanged |= tryToVectorize(PostponedInsts, R);
27487bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
27489  SmallVector<Value *, 4> Incoming;
27490  SmallPtrSet<Value *, 16> VisitedInstrs;
27494  DenseMap<Value *, SmallVector<Value *, 4>> PHIToOpcodes;
27495  auto PHICompare = [this, &PHIToOpcodes](Value *V1, Value *V2) {
27498           "Expected vectorizable types only.");
27508        V2->getType()->getScalarSizeInBits())
27511        V2->getType()->getScalarSizeInBits())
27515    if (Opcodes1.size() < Opcodes2.size())
27517    if (Opcodes1.size() > Opcodes2.size())
27519    for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
27528          return NodeI2 != nullptr;
27531        assert((NodeI1 == NodeI2) ==
27533               "Different nodes should have different DFS numbers");
27534        if (NodeI1 != NodeI2)
27537        if (S && !S.isAltShuffle() && I1->getOpcode() == I2->getOpcode()) {
27553            DT->getNode(V1->getParent());
27555            DT->getNode(V2->getParent());
27557          return NodeI2 != nullptr;
27560        assert((NodeI1 == NodeI2) ==
27562               "Different nodes should have different DFS numbers");
27563        if (NodeI1 != NodeI2)
27565        return V1->comesBefore(V2);
27578          return *Id1 < *Id2;
27582        if (I1->getOpcode() == I2->getOpcode())
27584        return I1->getOpcode() < I2->getOpcode();
27607      auto ValID1 = Opcodes1[I]->getValueID();
27608      auto ValID2 = Opcodes2[I]->getValueID();
27609      if (ValID1 == ValID2)
27611      if (ValID1 < ValID2)
27613      if (ValID1 > ValID2)
27622      assert(U1 && U2 && "The only thing left should be undef & undef.");
27628 if (VL.empty() || V1 == VL.back())
27630 Value *V2 = VL.back();
27635    if (Opcodes1.size() != Opcodes2.size())
27637    for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
27643      if (R.isDeleted(I1) || R.isDeleted(I2))
27645      if (I1->getParent() != I2->getParent())
27653      if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID())
27659  bool HaveVectorizedPhiNodes = false;
27663    for (Instruction &I : *BB) {
27670      if (!VisitedInstrs.count(P) && !R.isDeleted(P) &&
27675    if (Incoming.size() <= 1)
27680    for (Value *V : Incoming) {
27681      SmallVectorImpl<Value *> &Opcodes =
27683      if (!Opcodes.empty())
27685      SmallVector<Value *, 4> Nodes(1, V);
27686      SmallPtrSet<Value *, 4> Visited;
27687      while (!Nodes.empty()) {
27691        for (Value *V : PHI->incoming_values()) {
27693            Nodes.push_back(PHI1);
27702        Incoming, PHICompare, AreCompatiblePHIs,
27704          return tryToVectorizeList(Candidates, R, MaxVFOnly);
27707    Changed |= HaveVectorizedPhiNodes;
27708    if (HaveVectorizedPhiNodes && any_of(PHIToOpcodes, [&](const auto &P) {
27710          return !PHI || R.isDeleted(PHI);
27712      PHIToOpcodes.clear();
27714  } while (HaveVectorizedPhiNodes);
27716  VisitedInstrs.clear();
27718 InstSetVector PostProcessInserts;
27719 SmallSetVector<CmpInst *, 8> PostProcessCmps;
27722  auto VectorizeInsertsAndCmps = [&](bool VectorizeCmps) {
27723    bool Changed = vectorizeInserts(PostProcessInserts, BB, R);
27724    if (VectorizeCmps) {
27726      PostProcessCmps.clear();
27728    PostProcessInserts.clear();
27734           return PostProcessCmps.contains(Cmp);
27736           PostProcessInserts.contains(I);
27742    return I->use_empty() &&
27752    if (R.isDeleted(&*It))
27755    if (!VisitedInstrs.insert(&*It).second) {
27756      if (HasNoUsers(&*It) &&
27757          VectorizeInsertsAndCmps(It->isTerminator())) {
27770      if (P->getNumIncomingValues() == 2) {
27773        if (Root && vectorizeRootInstruction(P, Root, BB, R)) {
27787        if (BB == P->getIncomingBlock(I) ||
27788            !DT->isReachableFromEntry(P->getIncomingBlock(I)))
27794            PI && !IsInPostProcessInstrs(PI)) {
27796              vectorizeRootInstruction(nullptr, PI, P->getIncomingBlock(I), R);
27798        if (Res && R.isDeleted(P)) {
27808    if (HasNoUsers(&*It)) {
27809      bool OpsChanged = false;
27820      TryToVectorizeRoot |= (I == Stores.end() || I->second.size() == 1) &&
27821                            SI->getValueOperand()->hasOneUse();
27823      if (TryToVectorizeRoot) {
27824        for (auto *V : It->operand_values()) {
27828              VI && !IsInPostProcessInstrs(VI))
27830            OpsChanged |= vectorizeRootInstruction(nullptr, VI, BB, R);
27837 VectorizeInsertsAndCmps(It->isTerminator());
27849 PostProcessInserts.insert(&*It);
27857bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
27859  for (auto &Entry : GEPs) {
27862    if (Entry.second.size() < 2)
27865    LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
27866                      << Entry.second.size() << ".\n");
27874      return !R.isDeleted(GEP);
27876    if (It == Entry.second.end())
27878    unsigned MaxVecRegSize = R.getMaxVecRegSize();
27879    unsigned EltSize = R.getVectorElementSize(*(*It)->idx_begin());
27880    if (MaxVecRegSize < EltSize)
27883    unsigned MaxElts = MaxVecRegSize / EltSize;
27884    for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) {
27885      auto Len = std::min<unsigned>(BE - BI, MaxElts);
27898      Candidates.remove_if([&R](Value *I) {
27908      for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) {
27909        auto *GEPI = GEPList[I];
27910        if (!Candidates.count(GEPI))
27912        const SCEV *SCEVI = SE->getSCEV(GEPList[I]);
27913        for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
27914          auto *GEPJ = GEPList[J];
27915          const SCEV *SCEVJ = SE->getSCEV(GEPList[J]);
27917            Candidates.remove(GEPI);
27918            Candidates.remove(GEPJ);
27919          } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
27920            Candidates.remove(GEPJ);
27927      if (Candidates.size() < 2)
27933      SmallVector<Value *, 16> Bundle(Candidates.size());
27934      auto BundleIndex = 0u;
27935      for (auto *V : Candidates) {
27937        auto *GEPIdx = GEP->idx_begin()->get();
27939 Bundle[BundleIndex++] = GEPIdx;
27951 Changed |= tryToVectorizeList(Bundle, R);
27957bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
27962  auto &&StoreSorter = [this](StoreInst *V, StoreInst *V2) {
27963    if (V->getValueOperand()->getType()->getTypeID() <
27966    if (V->getValueOperand()->getType()->getTypeID() >
27969    if (V->getPointerOperandType()->getTypeID() <
27970        V2->getPointerOperandType()->getTypeID())
27972    if (V->getPointerOperandType()->getTypeID() >
27973        V2->getPointerOperandType()->getTypeID())
27975    if (V->getValueOperand()->getType()->getScalarSizeInBits() <
27978    if (V->getValueOperand()->getType()->getScalarSizeInBits() >
27985      DomTreeNodeBase<llvm::BasicBlock> *NodeI1 = DT->getNode(I1->getParent());
27986      DomTreeNodeBase<llvm::BasicBlock> *NodeI2 = DT->getNode(I2->getParent());
27987      assert(NodeI1 && "Should only process reachable instructions");
27988      assert(NodeI2 && "Should only process reachable instructions");
27989      assert((NodeI1 == NodeI2) ==
27991             "Different nodes should have different DFS numbers");
27992      if (NodeI1 != NodeI2)
27994      return I1->getOpcode() < I2->getOpcode();
28000    return V->getValueOperand()->getValueID() <
28004    bool SameParent = true;
28010    StoreInst *V2 = VL.back();
28035      SameParent &= I1 && I2 && I1->getParent() == I2->getParent();
28037      for (auto [SI, V] : zip(VL, NewVL))
28038        V = SI->getValueOperand();
28039      NewVL.back() = V1->getValueOperand();
28040      InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
28041      InstructionsState S = Analysis.buildInstructionsState(
28049    return V1->getValueOperand()->getValueID() ==
28054  DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>> Attempted;
28055  for (auto &Pair : Stores) {
28056    if (Pair.second.size() < 2)
28060                      << Pair.second.size() << ".\n");
28069 Pair.second.rend());
28071 ReversedStores, StoreSorter, AreCompatibleStores,
28073 return vectorizeStores(Candidates, R, Attempted);
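vectorizeStoreChains above works from the Stores map: stores are bucketed by a key derived from their base pointer, each bucket with at least two stores is sorted with StoreSorter, and compatible runs are handed to vectorizeStores. A reduced sketch of that bucket-then-process shape with plain standard containers is given below; the names are illustrative only, not the pass's data structures.

#include <cassert>
#include <map>
#include <string>
#include <vector>

struct StoreDesc {
  std::string Base; // stand-in for the underlying base pointer
  int Offset;       // stand-in for the constant offset from that base
};

// Bucket stores by base, then process every bucket that has at least two
// entries; a real implementation would sort each bucket and form chains.
static unsigned processStoreBuckets(const std::vector<StoreDesc> &Stores) {
  std::map<std::string, std::vector<StoreDesc>> Buckets;
  for (const StoreDesc &S : Stores)
    Buckets[S.Base].push_back(S);
  unsigned Processed = 0;
  for (auto &Entry : Buckets) {
    if (Entry.second.size() < 2)
      continue; // nothing to form a chain from
    ++Processed;
  }
  return Processed;
}

int main() {
  std::vector<StoreDesc> Stores = {
      {"a", 0}, {"a", 4}, {"b", 0}, {"a", 8}, {"c", 0}, {"c", 4}};
  assert(processStoreBuckets(Stores) == 2); // buckets "a" and "c" qualify
  return 0;
}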
for(const MachineOperand &MO :llvm::drop_begin(OldMI.operands(), Desc.getNumOperands()))
static unsigned getIntrinsicID(const SDNode *N)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static bool isConstant(const MachineInstr &MI)
AMDGPU Register Bank Select
ReachingDefInfo InstSet InstSet & Ignore
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Function Alias Analysis Results
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
block Block Frequency Analysis
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
static cl::opt< IntrinsicCostStrategy > IntrinsicCost("intrinsic-cost-strategy", cl::desc("Costing strategy for intrinsic instructions"), cl::init(IntrinsicCostStrategy::InstructionCost), cl::values(clEnumValN(IntrinsicCostStrategy::InstructionCost, "instruction-cost", "Use TargetTransformInfo::getInstructionCost"), clEnumValN(IntrinsicCostStrategy::IntrinsicCost, "intrinsic-cost", "Use TargetTransformInfo::getIntrinsicInstrCost"), clEnumValN(IntrinsicCostStrategy::TypeBasedIntrinsicCost, "type-based-intrinsic-cost", "Calculate the intrinsic cost based only on argument types")))
static APInt getElementIndex(TypeSize ElemSize, APInt &Offset)
This file provides an implementation of debug counters.
#define DEBUG_COUNTER(VARNAME, COUNTERNAME, DESC)
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
static bool runImpl(Function &F, const TargetLowering &TLI, const LibcallLoweringInfo &Libcalls, AssumptionCache *AC)
This is the interface for a simple mod/ref and alias analysis over globals.
static Value * getCondition(Instruction *I)
static void setCondition(Instruction *I, Value *NewCond)
static const HTTPClientCleanup Cleanup
static Type * getIndexType(Value *In)
Module.h This file contains the declarations for the Module class.
This defines the Use class.
iv Induction Variable Users
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static bool isSplat(Value *V)
Return true if V is a splat of a value (which is used when multiplying a matrix with a scalar).
This file provides utility analysis objects describing memory locations.
MachineInstr unsigned OpIdx
uint64_t IntrinsicInst * II
static bool IsSelect(MachineInstr &MI)
This file defines the PriorityQueue class.
const SmallVectorImpl< MachineOperand > & Cond
static std::optional< OperandInfo > getOperandInfo(const MachineOperand &MO)
static bool isValid(const char C)
Returns true if C is a valid mangled character: <0-9a-zA-Z_>.
static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts, TargetTransformInfo *TTI, bool MustMatchOrInst)
static cl::opt< bool > RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden, cl::desc("Run the SLP vectorization passes"))
static bool isAlternateInstruction(Instruction *I, Instruction *MainOp, Instruction *AltOp, const TargetLibraryInfo &TLI)
Checks if the specified instruction I is an alternate operation for the given MainOp and AltOp instru...
static FixedVectorType * getWidenedType(Type *ScalarTy, unsigned VF)
static bool doesNotNeedToSchedule(ArrayRef< Value * > VL)
Checks if the specified array of instructions does not require scheduling.
static bool isVectorLikeInstWithConstOps(Value *V)
Checks if V is one of vector-like instructions, i.e.
static cl::opt< bool > SplitAlternateInstructions("slp-split-alternate-instructions", cl::init(true), cl::Hidden, cl::desc("Improve the code quality by splitting alternate instructions"))
static bool checkTreeSizes(ArrayRef< std::pair< unsigned, unsigned > > Sizes)
Checks if the quadratic mean deviation is less than 90% of the mean size.
static bool isRepeatedNonIdentityClusteredMask(ArrayRef< int > Mask, unsigned Sz)
Checks if the given mask is a "clustered" mask with the same clusters of size Sz, which are not ident...
static bool isMaskedLoadCompress(ArrayRef< Value * > VL, ArrayRef< Value * > PointerOps, ArrayRef< unsigned > Order, const TargetTransformInfo &TTI, const DataLayout &DL, ScalarEvolution &SE, AssumptionCache &AC, const DominatorTree &DT, const TargetLibraryInfo &TLI, const function_ref< bool(Value *)> AreAllUsersVectorized, bool &IsMasked, unsigned &InterleaveFactor, SmallVectorImpl< int > &CompressMask, VectorType *&LoadVecTy)
Checks if the VL can be transformed to a (masked)load + compress or (masked) interleaved load.
static const unsigned MaxPHINumOperands
Maximum allowed number of operands in the PHI nodes.
static bool isUsedOutsideBlock(Value *V)
Checks if the provided value does not require scheduling.
static cl::opt< bool > VectorizeCopyableElements("slp-copyable-elements", cl::init(true), cl::Hidden, cl::desc("Try to replace values with the idempotent instructions for " "better vectorization."))
Enables vectorization of copyable elements.
static cl::opt< int > MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static bool allSameOpcode(ArrayRef< Value * > VL)
static InstructionCost canConvertToFMA(ArrayRef< Value * > VL, const InstructionsState &S, DominatorTree &DT, const DataLayout &DL, TargetTransformInfo &TTI, const TargetLibraryInfo &TLI)
Check if we can convert fadd/fsub sequence to FMAD.
static cl::opt< unsigned > MaxProfitableLoadStride("slp-max-stride", cl::init(8), cl::Hidden, cl::desc("The maximum stride, considered to be profitable."))
static bool isCommutative(Instruction *I, Value *ValWithUses, bool IsCopyable=false)
static bool findBuildAggregate(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, const BoUpSLP &R)
Recognize construction of vectors like ra = insertelement <4 x float> poison, float s0,...
static bool clusterSortPtrAccesses(ArrayRef< Value * > VL, ArrayRef< BasicBlock * > BBs, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
static unsigned getNumElements(Type *Ty)
static SmallBitVector buildUseMask(int VF, ArrayRef< int > Mask, UseMask MaskArg)
Prepares a use bitset for the given mask either for the first argument or for the second.
static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0, Value *Op1, const TargetLibraryInfo &TLI)
Checks if the provided operands of 2 cmp instructions are compatible, i.e.
static void reorderScalars(SmallVectorImpl< Value * > &Scalars, ArrayRef< int > Mask)
Reorders the list of scalars in accordance with the given Mask.
static Value * createInsertVector(IRBuilderBase &Builder, Value *Vec, Value *V, unsigned Index, function_ref< Value *(Value *, Value *, ArrayRef< int >)> Generator={})
Creates subvector insert.
static void findBuildAggregateRec(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, unsigned OperandOffset, const BoUpSLP &R)
static unsigned getNumElems(unsigned Size, unsigned PartNumElems, unsigned Part)
Returns correct remaining number of elements, considering total amount Size, (power-of-2 number) of e...
static InstructionCost getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={})
Returns the cost of the shuffle instructions with the given Kind, vector type Tp and optional Mask.
static bool isSimple(Instruction *I)
static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns true if widened type of Ty elements with size Sz represents full vector type,...
static const int MinScheduleRegionSize
If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling regions to be handled.
static cl::opt< unsigned > MinProfitableStridedLoads("slp-min-strided-loads", cl::init(2), cl::Hidden, cl::desc("The minimum number of loads, which should be considered strided, " "if the stride is > 1 or is runtime value"))
static bool isFirstInsertElement(const InsertElementInst *IE1, const InsertElementInst *IE2)
Checks if the IE1 instructions is followed by IE2 instruction in the buildvector sequence.
static cl::opt< int > LookAheadMaxDepth("slp-max-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for operand reordering scores"))
static cl::opt< unsigned > MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden, cl::desc("Maximum SLP vectorization factor (0=unlimited)"))
static void reorderReuses(SmallVectorImpl< int > &Reuses, ArrayRef< int > Mask)
Reorders the given Reuses mask according to the given Mask.
static void combineOrders(MutableArrayRef< unsigned > Order, ArrayRef< unsigned > SecondaryOrder)
static const unsigned MaxMemDepDistance
static cl::opt< bool > ViewSLPTree("view-slp-tree", cl::Hidden, cl::desc("Display the SLP trees with Graphviz"))
static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, TargetLibraryInfo *TLI, const TargetTransformInfo *TTI)
static DebugLoc getDebugLocFromPHI(PHINode &PN)
static std::optional< unsigned > getExtractIndex(const Instruction *E)
static cl::opt< bool > VectorizeNonPowerOf2("slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden, cl::desc("Try to vectorize with non-power-of-2 number of elements."))
static cl::opt< unsigned > MinTreeSize("slp-min-tree-size", cl::init(3), cl::Hidden, cl::desc("Only vectorize small trees if they are fully vectorizable"))
static void reorderOrder(SmallVectorImpl< unsigned > &Order, ArrayRef< int > Mask, bool BottomOrder=false)
Reorders the given Order according to the given Mask.
static cl::opt< bool > ForceStridedLoads("slp-force-strided-loads", cl::init(false), cl::Hidden, cl::desc("Generate strided loads even if they are not " "profitable. Used for testing only."))
static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not less than Sz, which forms type,...
static bool isMainInstruction(Instruction *I, Instruction *MainOp, Instruction *AltOp, const TargetLibraryInfo &TLI)
Checks if the specified instruction I is an main operation for the given MainOp and AltOp instruction...
static T * performExtractsShuffleAction(MutableArrayRef< std::pair< T *, SmallVector< int > > > ShuffleMask, Value *Base, function_ref< unsigned(T *)> GetVF, function_ref< std::pair< T *, bool >(T *, ArrayRef< int >, bool)> ResizeAction, function_ref< T *(ArrayRef< int >, ArrayRef< T * >)> Action)
Does the analysis of the provided shuffle masks and performs the requested actions on the vectors wit...
static cl::opt< bool > ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions"))
static bool isConstant(Value *V)
static bool isSplat(ArrayRef< Value * > VL)
static cl::opt< int > SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, cl::desc("Only vectorize if you gain more than this " "number "))
static unsigned getPartNumElems(unsigned Size, unsigned NumParts)
Returns power-of-2 number of elements in a single register (part), given the total number of elements...
static bool allConstant(ArrayRef< Value * > VL)
static constexpr int UsesLimit
static std::optional< unsigned > getElementIndex(const Value *Inst, unsigned Offset=0)
static bool isReductionCandidate(Instruction *I)
\Returns true if I is a candidate instruction for reduction vectorization.
static unsigned getShufflevectorNumGroups(ArrayRef< Value * > VL)
static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI, const TargetLibraryInfo &TLI)
static cl::opt< bool > SLPSkipEarlyProfitabilityCheck("slp-skip-early-profitability-check", cl::init(false), cl::Hidden, cl::desc("When true, SLP vectorizer bypasses profitability checks based on " "heuristics and makes vectorization decision via cost modeling."))
static std::pair< size_t, size_t > generateKeySubkey(Value *V, const TargetLibraryInfo *TLI, function_ref< hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator, bool AllowAlternate)
Generates key/subkey pair for the given value to provide effective sorting of the values and better d...
static cl::opt< bool > ShouldStartVectorizeHorAtStore("slp-vectorize-hor-store", cl::init(false), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions feeding into a store"))
static unsigned getNumberOfPotentiallyCommutativeOps(Instruction *I)
static InstructionCost getScalarizationOverhead(const TargetTransformInfo &TTI, Type *ScalarTy, VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={})
This is similar to TargetTransformInfo::getScalarizationOverhead, but if ScalarTy is a FixedVectorTyp...
static bool isCommutableOperand(Instruction *I, Value *ValWithUses, unsigned Op, bool IsCopyable=false)
Checks if the operand is commutative.
static bool buildCompressMask(ArrayRef< Value * > PointerOps, ArrayRef< unsigned > Order, Type *ScalarTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< int > &CompressMask)
Builds compress-like mask for shuffles for the given PointerOps, ordered with Order.
static std::pair< InstructionCost, InstructionCost > getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, ArrayRef< Type * > ArgTys)
Calculates the costs of vectorized intrinsic (if possible) and vectorized function (if possible) call...
static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements, SmallVectorImpl< int > &Mask)
static cl::opt< bool > SLPReVec("slp-revec", cl::init(false), cl::Hidden, cl::desc("Enable vectorization for wider vector utilization"))
static bool isValidForAlternation(unsigned Opcode)
static bool areAllOperandsNonInsts(Value *V)
Checks if the provided value does not require scheduling.
static SmallVector< Constant * > replicateMask(ArrayRef< Constant * > Val, unsigned VF)
Replicates the given Val VF times.
static SmallVector< Type * > buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID, const unsigned VF, unsigned MinBW, const TargetTransformInfo *TTI)
Builds the arguments types vector for the given call instruction with the given ID for the specified ...
static cl::opt< int > RootLookAheadMaxDepth("slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for searching best rooting option"))
static const unsigned AliasedCheckLimit
static Type * getValueType(Value *V)
Returns the type of the given value/instruction V.
static std::string shortBundleName(ArrayRef< Value * > VL, int Idx=-1)
Print a short descriptor of the instruction bundle suitable for debug output.
static LLVM_DUMP_METHOD void dumpOrder(const BoUpSLP::OrdersType &Order)
static bool isValidElementType(Type *Ty)
Predicate for the element types that the SLP vectorizer supports.
static Instruction * getReductionInstr(const DominatorTree *DT, PHINode *P, BasicBlock *ParentBB, LoopInfo *LI)
Try and get a reduction instruction from a phi node.
static SmallVector< int > calculateShufflevectorMask(ArrayRef< Value * > VL)
static bool doesNotNeedToBeScheduled(Value *V)
Checks if the specified value does not require scheduling.
static bool allSameType(ArrayRef< Value * > VL)
static MemoryLocation getLocation(Instruction *I)
static bool allSameBlock(ArrayRef< Value * > VL)
static unsigned getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not greater than Sz, which forms type,...
static bool areTwoInsertFromSameBuildVector(InsertElementInst *VU, InsertElementInst *V, function_ref< Value *(InsertElementInst *)> GetBaseOperand)
Check if two insertelement instructions are from the same buildvector.
static bool arePointersCompatible(Value *Ptr1, Value *Ptr2, const TargetLibraryInfo &TLI, bool CompareOpcodes=true)
static std::pair< InstructionCost, InstructionCost > getGEPCosts(const TargetTransformInfo &TTI, ArrayRef< Value * > Ptrs, Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind, Type *ScalarTy, VectorType *VecTy)
Calculate the scalar and the vector costs from vectorizing set of GEPs.
static SmallBitVector isUndefVector(const Value *V, const SmallBitVector &UseMask={})
Checks if the given value is actually an undefined constant vector.
static Instruction * findInstructionWithOpcode(ArrayRef< Value * > VL, unsigned Opcode)
Find an instruction with a specific opcode in VL.
static const SCEV * calculateRtStride(ArrayRef< Value * > PointerOps, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices, SmallVectorImpl< int64_t > &Coeffs)
Checks if the provided list of pointers Pointers represents the strided pointers for type ElemTy.
static InstructionCost getExtractWithExtendCost(const TargetTransformInfo &TTI, unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput)
This is similar to TargetTransformInfo::getExtractWithExtendCost, but if Dst is a FixedVectorType,...
static InstructionsState getSameOpcode(ArrayRef< Value * > VL, const TargetLibraryInfo &TLI)
static cl::opt< int > ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden, cl::desc("Limit the size of the SLP scheduling region per block"))
Limits the size of scheduling regions in a block.
static std::pair< Instruction *, Instruction * > getMainAltOpsNoStateVL(ArrayRef< Value * > VL)
Returns main/alternate instructions for the given VL.
static Instruction * tryGetSecondaryReductionRoot(PHINode *Phi, Instruction *Root)
We could have an initial reduction that is not an add.
static RecurKind getRdxKind(Value *V)
Gets recurrence kind from the specified value.
static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1)
static void gatherPossiblyVectorizableLoads(const BoUpSLP &R, ArrayRef< Value * > VL, const DataLayout &DL, ScalarEvolution &SE, const TargetTransformInfo &TTI, SmallVectorImpl< SmallVector< std::pair< LoadInst *, int64_t > > > &GatheredLoads, bool AddNew=true)
Tries to find subvector of loads and builds new vector of only loads if can be profitable.
static cl::opt< int > MinVectorRegSizeOption("slp-min-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static std::optional< TargetTransformInfo::ShuffleKind > isFixedVectorShuffle(ArrayRef< Value * > VL, SmallVectorImpl< int > &Mask, AssumptionCache *AC)
Checks if the vector of instructions can be represented as a shuffle, like: x0 = extractelement <4 x ...
static bool tryToVectorizeSequence(SmallVectorImpl< T * > &Incoming, function_ref< bool(T *, T *)> Comparator, function_ref< bool(ArrayRef< T * >, T *)> AreCompatible, function_ref< bool(ArrayRef< T * >, bool)> TryToVectorizeHelper, bool MaxVFOnly, BoUpSLP &R)
static std::optional< unsigned > getAggregateSize(Instruction *InsertInst)
static unsigned getNumberOfParts(const TargetTransformInfo &TTI, VectorType *VecTy, const unsigned Limit=std::numeric_limits< unsigned >::max())
Returns number of parts, the type VecTy will be split at the codegen phase.
static std::optional< unsigned > getInsertExtractIndex(const Value *Inst, unsigned Offset)
static cl::opt< unsigned > RecursionMaxDepth("slp-recursion-max-depth", cl::init(12), cl::Hidden, cl::desc("Limit the recursion depth when building a vectorizable tree"))
static bool tryToFindDuplicates(SmallVectorImpl< Value * > &VL, SmallVectorImpl< int > &ReuseShuffleIndices, const TargetTransformInfo &TTI, const TargetLibraryInfo &TLI, const InstructionsState &S, const BoUpSLP::EdgeInfo &UserTreeIdx, bool TryPad=false)
Checks that every instruction appears once in the list and if not, packs them, building ReuseShuffleI...
static Align computeCommonAlignment(ArrayRef< Value * > VL)
Calculates minimal alignment as a common alignment.
static void addMask(SmallVectorImpl< int > &Mask, ArrayRef< int > SubMask, bool ExtendingManyInputs=false)
Shuffles Mask in accordance with the given SubMask.
static void fixupOrderingIndices(MutableArrayRef< unsigned > Order)
Order may have elements assigned special value (size) which is out of bounds.
static Value * createExtractVector(IRBuilderBase &Builder, Value *Vec, unsigned SubVecVF, unsigned Index)
Generates subvector extract using Generator or using default shuffle.
static cl::opt< bool > DisableTreeReorder("slp-disable-tree-reorder", cl::init(false), cl::Hidden, cl::desc("Disable tree reordering even if it is " "profitable. Used for testing only."))
static Instruction * getNonPhiOperand(Instruction *I, PHINode *Phi)
Returns the first operand of I that does not match Phi.
static InstructionCost getVectorInstrCost(const TargetTransformInfo &TTI, Type *ScalarTy, unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Scalar, ArrayRef< std::tuple< Value *, User *, int > > ScalarUserAndIdx)
This is similar to TargetTransformInfo::getVectorInstrCost, but if ScalarTy is a FixedVectorType,...
static SmallBitVector getAltInstrMask(ArrayRef< Value * > VL, Type *ScalarTy, unsigned Opcode0, unsigned Opcode1)
static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI, const DominatorTree &DT)
Compare two cmp instructions.
static void inversePermutation(ArrayRef< unsigned > Indices, SmallVectorImpl< int > &Mask)
static bool isReverseOrder(ArrayRef< unsigned > Order)
Check if Order represents reverse order.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
This file defines generic set operations that may be used on set's of different types,...
This file implements a set that has insertion order iteration characteristics.
This file implements the SmallBitVector class.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallString class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
static SymbolRef::Type getType(const Symbol *Sym)
static const int BlockSize
LocallyHashedType DenseMapInfo< LocallyHashedType >::Empty
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
static SmallVector< VPValue *, 4 > getOperands(ArrayRef< VPValue * > Values, unsigned OperandIndex)
static const uint32_t IV[8]
Merges shuffle masks and emits final shuffle instruction, if required.
InstructionCost finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &, function_ref< Value *(Value *, Value *, ArrayRef< int >)>)> Action={})
Finalize emission of the shuffles.
void add(const TreeEntry &E1, ArrayRef< int > Mask)
ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI, ArrayRef< Value * > VectorizedVals, BoUpSLP &R, SmallPtrSetImpl< Value * > &CheckedExtracts)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
std::optional< InstructionCost > needToDelay(const TreeEntry *, ArrayRef< SmallVector< const TreeEntry * > >) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
InstructionCost createFreeze(InstructionCost Cost)
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
void resetForSameNode()
Reset the builder to handle perfect diamond match.
void add(Value *V1, ArrayRef< int > Mask, bool ForExtracts=false)
Adds another one input vector and the mask for the shuffling.
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
Merges shuffle masks and emits final shuffle instruction, if required.
Value * createFreeze(Value *V)
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
void resetForSameNode()
Reset the builder to handle perfect diamond match.
void addOrdered(Value *V1, ArrayRef< unsigned > Order)
Adds another one input vector and the mask for the shuffling.
Value * finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &, function_ref< Value *(Value *, Value *, ArrayRef< int >)>)> Action={})
Finalize emission of the shuffles.
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
Adjusts extractelements after reusing them.
void add(Value *V1, ArrayRef< int > Mask, bool=false)
Adds another one input vector and the mask for the shuffling.
std::optional< Value * > needToDelay(const TreeEntry *E, ArrayRef< SmallVector< const TreeEntry * > > Deps) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
~ShuffleInstructionBuilder()
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Adds single input vector (in form of tree entry) and the mask for its shuffling.
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
Adds 2 input vectors (in form of tree entries) and the mask for their shuffling.
ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
A manager for alias analyses.
Class for arbitrary precision integers.
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
void clearBit(unsigned BitPosition)
Set a given bit to 0.
uint64_t getZExtValue() const
Get zero extended value.
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
LLVM_ABI APInt urem(const APInt &RHS) const
Unsigned remainder operation.
unsigned getBitWidth() const
Return the number of bits in the APInt.
bool ult(const APInt &RHS) const
Unsigned less than comparison.
bool isNegative() const
Determine sign of this APInt.
void clearAllBits()
Set every bit to 0.
void negate()
Negate this APInt in place.
unsigned logBase2() const
void setAllBits()
Set every bit to 1.
void setBits(unsigned loBit, unsigned hiBit)
Set the bits from loBit (inclusive) to hiBit (exclusive) to 1.
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
bool isOne() const
Determine if this is a value of 1.
static APInt getBitsSetFrom(unsigned numBits, unsigned loBit)
Constructs an APInt value that has a contiguous range of bits set.
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
PassT::Result * getCachedResult(IRUnitT &IR) const
Get the cached result of an analysis pass for a given IR unit.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
bool equals(ArrayRef RHS) const
equals - Check for element-wise equality.
const T & back() const
back - Get the last element.
ArrayRef< T > take_front(size_t N=1) const
Return a copy of *this with only the first N elements.
ArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
const T & front() const
front - Get the first element.
size_t size() const
size - Get the array size.
bool empty() const
empty - Check if the array is empty.
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
const T & consume_front()
consume_front() - Returns the first element and drops it from ArrayRef.
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
LLVM Basic Block Representation.
iterator begin()
Instruction iterator methods.
const Function * getParent() const
Return the enclosing method, or null if none.
InstListType::reverse_iterator reverse_iterator
InstListType::iterator iterator
Instruction iterators...
LLVM_ABI const_iterator getFirstNonPHIOrDbgOrAlloca() const
Returns an iterator to the first instruction in this block that is not a PHINode, a debug intrinsic,...
InstListType::const_reverse_iterator const_reverse_iterator
bool isEHPad() const
Return true if this basic block is an exception handling block.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Represents analyses that only rely on functions' control flow.
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
unsigned getBundleOperandsEndIndex() const
Return the index of the last bundle operand in the Use array.
LLVM_ABI void getOperandBundlesAsDefs(SmallVectorImpl< OperandBundleDef > &Defs) const
Return the list of operand bundles attached to this instruction as a vector of OperandBundleDefs.
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
bool hasIdenticalOperandBundleSchema(const CallBase &Other) const
Return true if Other has the same sequence of operand bundle tags with the same number of operands on...
unsigned getBundleOperandsStartIndex() const
Return the index of the first bundle operand in the Use array.
Value * getArgOperand(unsigned i) const
FunctionType * getFunctionType() const
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
unsigned arg_size() const
bool hasOperandBundles() const
Return true if this User has any operand bundles.
This class represents a function call, abstracting a target machine's calling convention.
This is the base class for all instructions that perform data casts.
This class is the base class for the comparison instructions.
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
@ ICMP_SLT
signed less than
@ ICMP_SLE
signed less or equal
@ ICMP_UGE
unsigned greater or equal
@ ICMP_UGT
unsigned greater than
@ ICMP_SGT
signed greater than
@ ICMP_ULT
unsigned less than
@ ICMP_SGE
signed greater or equal
@ ICMP_ULE
unsigned less or equal
Predicate getSwappedPredicate() const
For example, EQ->EQ, SLE->SGE, ULT->UGT, OEQ->OEQ, ULE->UGE, OLT->OGT, etc.
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Predicate getPredicate() const
Return the predicate for this instruction.
static LLVM_ABI Constant * getIntToPtr(Constant *C, Type *Ty, bool OnlyIfReduced=false)
static LLVM_ABI Constant * getBinOpIdentity(unsigned Opcode, Type *Ty, bool AllowRHSConstant=false, bool NSZ=false)
Return the identity constant for a binary opcode.
This is the shared class of boolean and integer constants.
static LLVM_ABI ConstantInt * getTrue(LLVMContext &Context)
static ConstantInt * getSigned(IntegerType *Ty, int64_t V, bool ImplicitTrunc=false)
Return a ConstantInt with the specified value for the specified type.
static LLVM_ABI ConstantInt * getFalse(LLVMContext &Context)
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
const APInt & getValue() const
Return the constant as an APInt value reference.
static LLVM_ABI Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
static LLVM_ABI Constant * get(ArrayRef< Constant * > V)
This is an important base class in LLVM.
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
static bool shouldExecute(CounterInfo &Counter)
static DebugLoc getUnknown()
An analysis that produces DemandedBits for a function.
ValueT & at(const_arg_type_t< KeyT > Val)
at - Return the entry for the specified key, or abort if no such entry exists.
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
iterator find(const_arg_type_t< KeyT > Val)
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
bool erase(const KeyT &Val)
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
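A minimal sketch of the DenseMap operations above; the key/value choice (a per-Value counter) is illustrative only.
#include "llvm/ADT/DenseMap.h"
#include "llvm/IR/Value.h"
using namespace llvm;

void countUse(DenseMap<Value *, unsigned> &Counts, Value *V) {
  auto [It, Inserted] = Counts.try_emplace(V, 1); // insert 1 only if V is absent
  if (!Inserted)
    ++It->second;                                 // already present: bump the count
  unsigned N = Counts.lookup(V);   // returns 0 for missing keys, never inserts
  bool Known = Counts.contains(V); // membership test without touching the value
  (void)N; (void)Known;
}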
Implements a dense probed hash-table based set.
Base class for the actual dominator tree node.
unsigned getDFSNumIn() const
getDFSNumIn/getDFSNumOut - These return the DFS visitation order for nodes in the dominator tree.
Analysis pass which computes a DominatorTree.
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
LLVM_ABI bool isReachableFromEntry(const Use &U) const
Provide an overload for a Use.
LLVM_ABI bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
static constexpr ElementCount getFixed(ScalarTy MinVal)
Convenience struct for specifying and reasoning about fast-math flags.
bool allowReassoc() const
Flag queries.
bool allowContract() const
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
ArrayRef< Type * > params() const
Type * getReturnType() const
An instruction for type-safe pointer arithmetic to access elements of arrays and structs.
For the node iterator we just need to turn the TreeEntry iterator into a TreeEntry* iterator so that ...
nodes_iterator operator++()
nodes_iterator(const ItTy &It2)
bool operator!=(const nodes_iterator &N2) const
Common base class shared among various IRBuilders.
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
IntegerType * getInt1Ty()
Fetch the type representing a single bit.
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Value * CreateFreeze(Value *V, const Twine &Name="")
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
LLVM_ABI Value * CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 2 operands which is mangled on the first type.
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
LLVM_ABI CallInst * CreateUnaryIntrinsic(Intrinsic::ID ID, Value *V, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 1 operand which is mangled on its type.
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
LLVM_ABI Value * CreateSelectWithUnknownProfile(Value *C, Value *True, Value *False, StringRef PassName, const Twine &Name="")
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Value * CreateFMul(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
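A minimal sketch of the IRBuilder calls listed above; BB, A and B are assumed to be an existing block and two i32 values from the surrounding function.
#include "llvm/IR/IRBuilder.h"
using namespace llvm;

Value *emitExample(BasicBlock *BB, Value *A, Value *B) {
  IRBuilder<> Builder(BB->getContext());
  Builder.SetInsertPoint(BB);                  // append new instructions to the end of BB
  Value *Sum = Builder.CreateBinOp(Instruction::Add, A, B, "sum");
  Value *Lt  = Builder.CreateICmp(CmpInst::ICMP_SLT, A, B, "lt");
  Value *Ext = Builder.CreateIntCast(Lt, A->getType(), /*isSigned=*/false, "ext");
  return Builder.CreateMul(Sum, Ext, "prod");
}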
This instruction inserts a single (scalar) element into a VectorType value.
VectorType * getType() const
Overload to return most specific vector type.
static InstructionCost getInvalid(CostType Val=0)
bool mayReadOrWriteMemory() const
Return true if this instruction may read or write memory.
LLVM_ABI bool mayWriteToMemory() const LLVM_READONLY
Return true if this instruction may modify memory.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
LLVM_ABI void moveAfter(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
LLVM_ABI bool isCommutative() const LLVM_READONLY
Return true if the instruction is commutative:
Instruction * user_back()
Specialize the methods defined in Value, as we know that an instruction can only be used by other ins...
LLVM_ABI bool comesBefore(const Instruction *Other) const
Given an instruction Other in the same basic block as this instruction, return true if this instructi...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
LLVM_ABI bool isIdenticalTo(const Instruction *I) const LLVM_READONLY
Return true if the specified instruction is exactly identical to the current one.
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
const SmallVectorImpl< Type * > & getArgTypes() const
An instruction for reading from memory.
Value * getPointerOperand()
Analysis pass that exposes the LoopInfo for a function.
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
Represents a single loop in the control flow graph.
This class implements a map that also provides access to all stored values in a deterministic order.
VectorType takeVector()
Clear the MapVector and return the underlying vector.
iterator find(const KeyT &Key)
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
ValueT lookup(const KeyT &Key) const
std::pair< KeyT, ValueT > & front()
Information for memory intrinsic cost model.
This is the common base class for memset/memcpy/memmove.
Representation for a specific memory location.
static LLVM_ABI MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
const Value * Ptr
The address of the start of the location.
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
T & front() const
front - Get the first element.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
BasicBlock * getIncomingBlock(unsigned i) const
Return incoming basic block number i.
Value * getIncomingValue(unsigned i) const
Return incoming value number i.
unsigned getNumIncomingValues() const
Return the number of incoming edges.
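A minimal sketch of the PHINode accessors above, walking every incoming edge of an assumed existing phi.
#include "llvm/IR/Instructions.h"
using namespace llvm;

void visitIncoming(PHINode *PN) {
  for (unsigned I = 0, E = PN->getNumIncomingValues(); I != E; ++I) {
    BasicBlock *Pred = PN->getIncomingBlock(I); // predecessor block of edge I
    Value *V = PN->getIncomingValue(I);         // value flowing in along edge I
    (void)Pred; (void)V;
  }
}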
Pass interface - Implemented by all 'passes'.
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
A discriminated union of two or more pointer types, with the discriminator in the low bit of the poin...
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
PriorityQueue - This class behaves like std::priority_queue and provides a few additional convenience...
unsigned getOpcode() const
static bool isIntMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is an integer min/max kind.
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
This class represents an analyzed expression in the program.
LLVM_ABI bool isZero() const
Return true if the expression is a constant zero.
LLVM_ABI bool isNonConstantNegative() const
Return true if the specified scev is negated, but not a constant.
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
LLVM_ABI const SCEV * getConstant(ConstantInt *V)
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
LLVM_ABI const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
LLVM_ABI const SCEV * getUDivExactExpr(const SCEV *LHS, const SCEV *RHS)
Get a canonical unsigned division expression, or something simpler if possible.
LLVM_ABI const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
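A minimal sketch combining the ScalarEvolution entry points above to test whether two pointers provably refer to the same address; SE, PtrA and PtrB are assumed to come from the enclosing pass.
#include "llvm/Analysis/ScalarEvolution.h"
using namespace llvm;

bool provablySameAddress(ScalarEvolution &SE, Value *PtrA, Value *PtrB) {
  const SCEV *Diff = SE.getMinusSCEV(SE.getSCEV(PtrB), SE.getSCEV(PtrA));
  return Diff->isZero();  // true only when SCEV can prove PtrB - PtrA == 0
}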
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
ArrayRef< value_type > getArrayRef() const
size_type size() const
Determine the number of elements in the SetVector.
const value_type & front() const
Return the first element of the SetVector.
void insert_range(Range &&R)
Vector takeVector()
Clear the SetVector and return the underlying vector.
bool contains(const_arg_type key) const
Check if the SetVector contains the given key.
void clear()
Completely clear the SetVector.
bool empty() const
Determine if the SetVector is empty or not.
bool insert(const value_type &X)
Insert a new element into the SetVector.
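A minimal sketch of the SetVector operations above; the int payload is illustrative, the point is the deduplicated, first-seen insertion order.
#include "llvm/ADT/SetVector.h"
using namespace llvm;

void setVectorDemo() {
  SetVector<int> Work;
  Work.insert(3);
  Work.insert(1);
  bool Added = Work.insert(3);     // false: 3 is already present, order unchanged
  // Work.getArrayRef() is {3, 1}: deterministic iteration order.
  bool HasOne = Work.contains(1);  // true
  Work.clear();                    // Work.empty() is now true
  (void)Added; (void)HasOne;
}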
static LLVM_ABI bool isZeroEltSplatMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses all elements with the same value as the first element of exa...
static LLVM_ABI bool isOneUseSingleSourceMask(ArrayRef< int > Mask, int VF)
Return true if this shuffle mask represents "clustered" mask of size VF, i.e.
static LLVM_ABI bool isDeInterleaveMaskOfFactor(ArrayRef< int > Mask, unsigned Factor, unsigned &Index)
Check if the mask is a DE-interleave mask of the given factor Factor like: <Index,...
static LLVM_ABI bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
static LLVM_ABI bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static LLVM_ABI bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
static LLVM_ABI bool isInsertSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &NumSubElts, int &Index)
Return true if this shuffle mask is an insert subvector mask.
static LLVM_ABI bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
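A minimal sketch showing how two of the static mask predicates above classify a concrete mask; the mask values are illustrative.
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

void classifyMask() {
  SmallVector<int, 4> Mask = {4, 5, 6, 7};  // reads the upper half of an 8-wide source
  int Index = 0;
  bool IsExtract =
      ShuffleVectorInst::isExtractSubvectorMask(Mask, /*NumSrcElts=*/8, Index);
  // IsExtract == true, Index == 4: elements [4, 8) form a contiguous subvector.
  bool IsIdentity = ShuffleVectorInst::isIdentityMask(Mask, /*NumSrcElts=*/8);
  // IsIdentity == false: the mask does not start at element 0.
  (void)IsExtract; (void)IsIdentity;
}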
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
int find_first() const
Returns the index of the first set bit, -1 if none of the bits are set.
bool test(unsigned Idx) const
int find_next(unsigned Prev) const
Returns the index of the next set bit following the "Prev" bit.
bool all() const
Returns true if all bits are set.
size_type size() const
Returns the number of bits in this bitvector.
bool any() const
Returns true if any bit is set.
size_type count() const
Returns the number of bits which are set.
bool none() const
Returns true if none of the bits are set.
Implements a dense probed hash-table based set with some number of buckets stored inline.
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
bool erase(PtrType Ptr)
Remove pointer from the set.
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
void insert_range(Range &&R)
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
A SetVector that performs no allocations if smaller than a certain size.
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
bool contains(const T &V) const
Check if the SmallSet contains the given element.
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void assign(size_type NumElts, ValueParamT Elt)
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
iterator erase(const_iterator CI)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void swap(SmallVectorImpl &RHS)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
Type * getPointerOperandType() const
Value * getValueOperand()
Value * getPointerOperand()
StringRef - Represent a constant reference to a string, i.e.
TargetFolder - Create constants with target dependent folding.
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
The instances of the Type class are immutable: once they are created, they are never changed.
LLVM_ABI bool isEmptyTy() const
Return true if this type is empty, that is, it has no elements or all of its elements are empty.
bool isVectorTy() const
True if this is an instance of VectorType.
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
bool isPointerTy() const
True if this is an instance of PointerType.
LLVM_ABI unsigned getStructNumElements() const
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
bool isSingleValueType() const
Return true if the type is a valid type for a register in codegen.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
LLVM_ABI Type * getWithNewType(Type *EltTy) const
Given a vector type, change the element type while keeping the old number of elements.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
LLVM_ABI void print(raw_ostream &O, bool IsForDebug=false, bool NoDetails=false) const
Print the current type.
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
bool isIntegerTy() const
True if this is an instance of IntegerType.
TypeID getTypeID() const
Return the type id for the type.
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
bool isVoidTy() const
Return true if this is 'void'.
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
A Use represents the edge between a Value definition and its users.
User(Type *ty, unsigned vty, AllocInfo AllocInfo)
LLVM_ABI bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
Value * getOperand(unsigned i) const
unsigned getNumOperands() const
iterator_range< value_op_iterator > operand_values()
The Vector Function Database.
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
user_iterator user_begin()
bool hasOneUse() const
Return true if there is exactly one use of this value.
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
LLVMContext & getContext() const
All values hold a context through their type.
iterator_range< user_iterator > users()
unsigned getValueID() const
Return an ID for the concrete type of this object.
bool hasUseList() const
Check if this Value has a use-list.
LLVM_ABI bool hasNUsesOrMore(unsigned N) const
Return true if this value has N uses or more.
LLVM_ABI bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
LLVM_ABI User * getUniqueUndroppableUser()
Return true if there is exactly one unique user of this value that cannot be dropped (that user can h...
LLVM_ABI unsigned getNumUses() const
This method computes the number of uses of this Value.
iterator_range< use_iterator > uses()
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Base class of all SIMD vector types.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
static LLVM_ABI bool isValidElementType(Type *ElemTy)
Return true if the specified type is valid as an element type.
Type * getElementType() const
std::pair< iterator, bool > insert(const ValueT &V)
iterator find(const_arg_type_t< ValueT > V)
void insert_range(Range &&R)
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
An efficient, type-erasing, non-owning reference to a callable.
An opaque object representing a hash code.
const ParentTy * getParent() const
self_iterator getIterator()
NodeTy * getNextNode()
Get the next node, or nullptr for the list tail.
SmallVector< BoUpSLP::EdgeInfo, 1 >::iterator I
iterator_adaptor_base()=default
A range adaptor for a pair of iterators.
This class implements an extremely fast bulk output stream that can only output to a stream.
raw_ostream & indent(unsigned NumSpaces)
indent - Insert 'NumSpaces' spaces.
A raw_ostream that writes to an std::string.
A raw_ostream that writes to an SmallVector or SmallString.
A helper class used for scoring candidates for two consecutive lanes.
static const int ScoreReversedLoads
Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
static const int ScoreFail
Score for failing to find a decent match.
static const int ScoreUndef
Matching with an undef is preferable to failing.
int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2, ArrayRef< Value * > MainAltOps) const
static const int ScoreConsecutiveExtracts
ExtractElementInst from same vector and consecutive indexes.
int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1, Instruction *U2, int CurrLevel, ArrayRef< Value * > MainAltOps) const
Go through the operands of LHS and RHS recursively until MaxLevel, and return the cumulative score.
static const int ScoreReversedExtracts
ExtractElementInst from same vector and reversed indices.
LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R, int NumLanes, int MaxLevel)
static const int ScoreSplat
Identical instructions (a.k.a. splat or broadcast).
static const int ScoreConstants
Constants.
static const int ScoreMaskedGatherCandidate
A load candidate for masked gather.
static const int ScoreSplatLoads
The same load multiple times.
static const int ScoreAllUserVectorized
Score if all users are vectorized.
static const int ScoreSameOpcode
Instructions with the same opcode.
static const int ScoreAltOpcodes
Instructions with alt opcodes (e.g, add + sub).
static const int ScoreConsecutiveLoads
Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
A helper data structure to hold the operands of a vector of instructions.
LLVM_DUMP_METHOD raw_ostream & print(raw_ostream &OS) const
static LLVM_DUMP_METHOD void dumpMode(ReorderingMode RMode)
Debug print.
LLVM_DUMP_METHOD void dump() const
Debug print.
static LLVM_DUMP_METHOD raw_ostream & printMode(ReorderingMode RMode, raw_ostream &OS)
friend raw_ostream & operator<<(raw_ostream &OS, ReorderingMode RMode)
ValueList getVL(unsigned OpIdx) const
Returns a value vector with the operands across all lanes for the operand at OpIdx.
static LLVM_DUMP_METHOD StringRef getModeStr(ReorderingMode RMode)
VLOperands(ArrayRef< Value * > RootVL, ArrayRef< ValueList > Operands, const InstructionsState &S, const BoUpSLP &R)
Initialize with all the operands of the instruction vector RootVL.
Bottom Up SLP Vectorizer.
static bool isIdentityOrder(ArrayRef< unsigned > Order)
Does this non-empty order represent an identity order?
bool isProfitableToReorder() const
Checks if it is profitable to reorder the current tree.
void eraseInstruction(Instruction *I)
Removes an instruction from its block and eventually deletes it.
bool isAnalyzedReductionRoot(Instruction *I) const
Checks if the instruction was already analyzed for being possible reduction root.
bool areAnalyzedReductionVals(ArrayRef< Value * > VL) const
Checks if the provided list of reduced values was checked already for vectorization.
LoadsState
Tracks the state we can represent the loads in the given sequence.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::ScheduleEntity &SE)
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::ScheduleBundle &Bundle)
void analyzedReductionVals(ArrayRef< Value * > VL)
Adds the list of reduced values to list of already checked values for the vectorization.
std::optional< OrdersType > findPartiallyOrderedLoads(const TreeEntry &TE)
Sort loads into increasing pointers offsets to allow greater clustering.
unsigned getMaxVecRegSize() const
OptimizationRemarkEmitter * getORE()
void reorderTopToBottom()
Reorders the current graph to the most profitable order starting from the root node to the leaf nodes...
std::optional< std::pair< Type *, bool > > getRootNodeTypeWithNoCast() const
Returns the type/is-signed info for the root node in the graph without casting.
unsigned getTreeSize() const
void reorderBottomToTop(bool IgnoreReorder=false)
Reorders the current graph to the most profitable order starting from leaves to the root.
InstructionCost getSpillCost()
bool isVectorized(const Value *V) const
Check if the value is vectorized in the tree.
bool isAnyGathered(const SmallDenseSet< Value * > &Vals) const
Checks if the given value is gathered in one of the nodes.
unsigned getCanonicalGraphSize() const
Returns the base graph size, before any transformations.
bool isStridedLoad(ArrayRef< Value * > PointerOps, Type *ScalarTy, Align Alignment, const int64_t Diff, const size_t Sz) const
Checks if strided loads can be generated out of VL loads with pointers PointerOps:
SmallVector< StoreInst *, 8 > StoreList
bool isLoadCombineCandidate(ArrayRef< Value * > Stores) const
Assume that a vector of stores of bitwise-or/shifted/zexted loaded values can be load combined in the...
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
ArrayRef< Value * > getRootNodeScalars() const
Return the scalars of the root node.
unsigned getMinVecRegSize() const
bool isLoadCombineReductionCandidate(RecurKind RdxKind) const
Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values can be load combined in the ...
unsigned getVectorElementSize(Value *V)
unsigned getMinVF(unsigned Sz) const
void clearReductionData()
Clear the list of the analyzed reduction root instructions.
LoadsState canVectorizeLoads(ArrayRef< Value * > VL, const Value *VL0, SmallVectorImpl< unsigned > &Order, SmallVectorImpl< Value * > &PointerOps, StridedPtrInfo &SPtrInfo, unsigned *BestVF=nullptr, bool TryRecursiveCheck=true) const
Checks if the given array of loads can be represented as a vectorized, scatter or just simple gather.
void computeMinimumValueSizes()
Compute the minimum type sizes required to represent the entries in a vectorizable tree.
FixedVectorType * getReductionType() const
Returns reduction type after minbitdth analysis.
SmallVector< unsigned, 4 > OrdersType
SmallVector< Instruction *, 16 > InstrList
void analyzedReductionRoot(Instruction *I)
Register given instruction as already analyzed for being possible reduction root.
bool isDeleted(Instruction *I) const
Checks if the instruction is marked for deletion.
InstructionCost getTreeCost(InstructionCost TreeCost, ArrayRef< Value * > VectorizedVals={}, InstructionCost ReductionCost=TTI::TCC_Free)
void transformNodes()
Transforms graph nodes to target specific representations, if profitable.
bool analyzeRtStrideCandidate(ArrayRef< Value * > PointerOps, Type *ScalarTy, Align CommonAlignment, SmallVectorImpl< unsigned > &SortedIndices, StridedPtrInfo &SPtrInfo) const
Return true if an array of scalar loads can be replaced with a strided load (with run-time stride).
bool isSignedMinBitwidthRootNode() const
Checks if the root graph node can be emitted with narrower bitwidth at codegen and returns it signedn...
void buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues={})
Builds external uses of the vectorized scalars, i.e.
bool isTreeTinyAndNotFullyVectorizable(bool ForReduction=false) const
void registerNonVectorizableLoads(ArrayRef< T * > VL)
Registers non-vectorizable sequence of loads.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::ScheduleData &SD)
unsigned canMapToVector(Type *T) const
Check if a homogeneous aggregate is isomorphic to some VectorType.
void deleteTree()
Clear the internal data structures that are created by 'buildTree'.
void buildTree(ArrayRef< Value * > Roots, const SmallDenseSet< Value * > &UserIgnoreLst)
Construct a vectorizable tree that starts at Roots, ignoring users for the purpose of scheduling and ...
SmallDenseSet< Value *, 4 > ExtraValueToDebugLocsMap
bool isTreeNotExtendable() const
Checks if the graph and all its subgraphs cannot be better vectorized.
bool areKnownNonVectorizableLoads(ArrayRef< T * > VL) const
Checks if the given loads sequence is known as not vectorizable.
SmallVector< Value *, 8 > ValueList
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB, const DataLayout *DL, OptimizationRemarkEmitter *ORE)
bool isReducedBitcastRoot() const
Returns the opcode of the root node, or 0, if the root node is gather.
bool isGathered(const Value *V) const
Checks if the given value is gathered in one of the nodes.
std::optional< OrdersType > findReusedOrderedScalars(const TreeEntry &TE, bool TopToBottom, bool IgnoreReorder)
Checks if the specified gather tree entry TE can be represented as a shuffled vector entry + (possibl...
bool analyzeConstantStrideCandidate(const ArrayRef< Value * > PointerOps, Type *ElemTy, Align Alignment, const SmallVectorImpl< unsigned > &SortedIndices, const int64_t Diff, Value *Ptr0, Value *PtrN, StridedPtrInfo &SPtrInfo) const
Return true if an array of scalar loads can be replaced with a strided load (with constant stride).
bool isNotScheduled(const Value *V) const
Checks if the specified value was not scheduled.
Value * vectorizeTree()
Vectorize the tree that starts with the elements in VL.
InstructionCost calculateTreeCostAndTrimNonProfitable(ArrayRef< Value * > VectorizedVals={})
Calculates the cost of the subtrees, trims non-profitable ones and returns final cost.
std::optional< int > findBestRootPair(ArrayRef< std::pair< Value *, Value * > > Candidates, int Limit=LookAheadHeuristics::ScoreFail) const
Evaluate each pair in Candidates and return the index into Candidates for the pair with the highest score...
std::optional< OrdersType > getReorderingData(const TreeEntry &TE, bool TopToBottom, bool IgnoreReorder)
Gets reordering data for the given tree entry.
SmallPtrSet< Value *, 16 > ValueSet
void optimizeGatherSequence()
Perform LICM and CSE on the newly generated gather sequences.
void removeInstructionsAndOperands(ArrayRef< T * > DeadVals, ArrayRef< std::tuple< Value *, unsigned, bool > > VectorValuesAndScales)
Remove instructions from the parent function and clear the operands of DeadVals instructions,...
Function * getVectorizedFunction(const VFShape &Shape) const
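A rough, hedged sketch of how the pass typically drives the BoUpSLP interface listed above for one set of roots. The build/reorder/cost/codegen ordering reflects the usual driver flow, but the exact cost plumbing and the SLPCostThreshold comparison are assumptions that vary between revisions; the sketch also assumes the file's own types and options are in scope.
void vectorizeRootsSketch(BoUpSLP &R, ArrayRef<Value *> Roots,
                          const SmallDenseSet<Value *> &UserIgnoreList) {
  R.buildTree(Roots, UserIgnoreList);         // build the vectorizable tree
  if (R.isTreeTinyAndNotFullyVectorizable())
    return;                                   // not worth costing
  R.reorderTopToBottom();                     // pick profitable operand orders
  R.reorderBottomToTop();
  R.buildExternalUses();                      // record scalars used outside the tree
  R.computeMinimumValueSizes();               // min-bitwidth analysis
  InstructionCost Cost =
      R.getTreeCost(R.calculateTreeCostAndTrimNonProfitable()); // assumed composition
  if (Cost < -SLPCostThreshold)               // assumed threshold check
    R.vectorizeTree();                        // emit the vector code
}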
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows using arbitrary numbers as calling convention identifiers.
@ C
The default llvm calling convention, compatible with C.
@ BasicBlock
Various leaf nodes.
bool isBitwiseLogicOp(unsigned Opcode)
Whether this is bitwise logic opcode.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
TwoOps_match< ValueOpTy, PointerOpTy, Instruction::Store > m_Store(const ValueOpTy &ValueOp, const PointerOpTy &PointerOp)
Matches StoreInst.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
ap_match< APInt > m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMaxNum(const Opnd0 &Op0, const Opnd1 &Op1)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
DisjointOr_match< LHS, RHS > m_DisjointOr(const LHS &L, const RHS &R)
auto match_fn(const Pattern &P)
A match functor that can be used as a UnaryPredicate in functional algorithms like all_of.
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMinimum(const Opnd0 &Op0, const Opnd1 &Op1)
MaxMin_match< ICmpInst, LHS, RHS, smin_pred_ty > m_SMin(const LHS &L, const RHS &R)
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMaximum(const Opnd0 &Op0, const Opnd1 &Op1)
BinaryOp_match< LHS, RHS, Instruction::FAdd > m_FAdd(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
MaxMin_match< ICmpInst, LHS, RHS, umax_pred_ty > m_UMax(const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
MaxMin_match< ICmpInst, LHS, RHS, smax_pred_ty > m_SMax(const LHS &L, const RHS &R)
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
auto m_Undef()
Match an arbitrary undef constant.
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMinNum(const Opnd0 &Op0, const Opnd1 &Op1)
BinaryOp_match< LHS, RHS, Instruction::Or > m_Or(const LHS &L, const RHS &R)
MaxMin_match< ICmpInst, LHS, RHS, umin_pred_ty > m_UMin(const LHS &L, const RHS &R)
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
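A minimal sketch of the PatternMatch helpers listed above applied to an arbitrary Value; the matched shapes (a shifted zero-extended load, and a select of two loads) are illustrative only.
#include "llvm/IR/PatternMatch.h"
using namespace llvm;
using namespace llvm::PatternMatch;

bool looksLikeLoadCombinePiece(Value *V) {
  Value *Loaded;
  const APInt *ShAmt = nullptr;
  // "shl (zext (load %p)), C"
  if (match(V, m_Shl(m_ZExt(m_Load(m_Value(Loaded))), m_APInt(ShAmt))))
    return true;
  // "select %c, (load %p), (load %q)"
  return match(V, m_Select(m_Value(), m_Load(m_Value()), m_Load(m_Value())));
}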
initializer< Ty > init(const Ty &Val)
unsigned combineHashValue(unsigned a, unsigned b)
Simplistic combination of 32-bit hash values into 32-bit hash values.
@ User
could "use" a pointer
DiagnosticInfoOptimizationBase::Argument NV
NodeAddr< CodeNode * > Code
friend class Instruction
Iterator for Instructions in a BasicBlock.
LLVM_ABI iterator begin() const
LLVM_ABI Instruction & front() const
A private "module" namespace for types and utilities used by this pass.
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
LLVM_ABI Value * createSimpleReduction(IRBuilderBase &B, Value *Src, RecurKind RdxKind)
Create a reduction of the given vector.
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
constexpr auto not_equal_to(T &&Arg)
Functor variant of std::not_equal_to that can be used as a UnaryPredicate in functional algorithms li...
void stable_sort(R &&Range)
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
void fill(R &&Range, T &&Value)
Provide wrappers to std::fill which take ranges instead of having to pass begin/end explicitly.
UnaryFunction for_each(R &&Range, UnaryFunction F)
Provide wrappers to std::for_each which take ranges instead of having to pass begin/end explicitly.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
hash_code hash_value(const FixedPointSemantics &Val)
LLVM_ABI Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
MaybeAlign getAlign(const CallInst &I, unsigned Index)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
LLVM_ABI bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
LLVM_ABI Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
@ LLVM_MARK_AS_BITMASK_ENUM
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
auto pred_end(const MachineBasicBlock *BB)
void set_intersect(S1Ty &S1, const S2Ty &S2)
set_intersect(A, B) - Compute A := A ^ B Identical to set_intersection, except that it works on set<>...
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
LLVM_ABI bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
LLVM_ABI void salvageDebugInfo(const MachineRegisterInfo &MRI, MachineInstr &MI)
Assuming the instruction MI is going to be deleted, attempt to salvage debug users of MI by writing t...
scope_exit(Callable) -> scope_exit< Callable >
constexpr from_range_t from_range
LLVM_ABI std::pair< Intrinsic::ID, bool > canConvertToMinOrMaxIntrinsic(ArrayRef< Value * > VL)
Check if the values in VL are select instructions that can be converted to a min or max (vector) intr...
auto dyn_cast_if_present(const Y &Val)
dyn_cast_if_present<X> - Functionally identical to dyn_cast, except that a null (or none in the case ...
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
bool set_is_subset(const S1Ty &S1, const S2Ty &S2)
set_is_subset(A, B) - Return true iff A in B
void interleaveComma(const Container &c, StreamT &os, UnaryFunctor each_fn)
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
auto cast_or_null(const Y &Val)
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value that is congruent to Skew modulo Align.
iterator_range< po_iterator< T > > post_order(const T &G)
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
LLVM_ABI bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr, bool UseVariableInfo=true, bool IgnoreUBImplyingAttrs=true)
Return true if the instruction does not have any effects besides calculating the result and does not ...
LLVM_ABI Instruction * propagateMetadata(Instruction *I, ArrayRef< Value * > VL)
Specifically, let Kinds = [MD_tbaa, MD_alias_scope, MD_noalias, MD_fpmath, MD_nontemporal,...
bool isa_and_nonnull(const Y &Val)
auto binary_search(R &&Range, T &&Value)
Provide wrappers to std::binary_search which take ranges instead of having to pass begin/end explicit...
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
constexpr auto equal_to(T &&Arg)
Functor variant of std::equal_to that can be used as a UnaryPredicate in functional algorithms like a...
bool isGather(IntrinsicInst *IntInst)
const Value * getPointerOperand(const Value *V)
A helper function that returns the pointer operand of a load, store or GEP instruction.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
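A minimal sketch with concrete values for some of the integer helpers referenced in this index (alignDown, isPowerOf2_64, bit_ceil, PowerOf2Ceil); the numbers are illustrative.
#include "llvm/ADT/bit.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>
using namespace llvm;

void integerHelpersDemo() {
  assert(alignDown(13u, 4u) == 12u);  // largest multiple of 4 that is <= 13
  assert(isPowerOf2_64(64));          // 64 is an exact power of two
  assert(bit_ceil(5u) == 8u);         // smallest power of two >= 5
  assert(PowerOf2Ceil(5) == 8);       // same idea, 64-bit helper
}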
LLVM_ABI bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &SQ, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
DomTreeNodeBase< BasicBlock > DomTreeNode
auto dyn_cast_or_null(const Y &Val)
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container:
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
constexpr bool has_single_bit(T Value) noexcept
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
LLVM_ABI bool isInstructionTriviallyDead(Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction is not used, and the instruction will return.
LLVM_ABI llvm::SmallVector< int, 16 > createStrideMask(unsigned Start, unsigned Stride, unsigned VF)
Create a stride shuffle mask.
auto reverse(ContainerTy &&C)
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
LLVM_ABI llvm::SmallVector< int, 16 > createReplicatedMask(unsigned ReplicationFactor, unsigned VF)
Create a mask with replicated elements.
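A minimal sketch of the two mask builders above, with the masks they produce for small illustrative arguments.
#include "llvm/Analysis/VectorUtils.h"
using namespace llvm;

void maskBuildersDemo() {
  // Picks every second lane starting at 0: {0, 2, 4, 6}.
  SmallVector<int, 16> Strided = createStrideMask(/*Start=*/0, /*Stride=*/2, /*VF=*/4);
  // Repeats each of 4 lanes twice: {0, 0, 1, 1, 2, 2, 3, 3}.
  SmallVector<int, 16> Replicated =
      createReplicatedMask(/*ReplicationFactor=*/2, /*VF=*/4);
  (void)Strided; (void)Replicated;
}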
auto find_if_not(R &&Range, UnaryPredicate P)
LLVM_ABI bool isSafeToLoadUnconditionally(Value *V, Align Alignment, const APInt &Size, const DataLayout &DL, Instruction *ScanFrom, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr)
Return true if we know that executing a load from this value cannot trap.
bool isa_and_present(const Y &Val)
isa_and_present<X> - Functionally identical to isa, except that a null value is accepted.
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
bool isPointerTy(const Type *T)
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
auto make_first_range(ContainerTy &&c)
Given a container of pairs, return a range over the first elements.
LLVM_ABI std::optional< int64_t > getPointersDiff(Type *ElemTyA, Value *PtrA, Type *ElemTyB, Value *PtrB, const DataLayout &DL, ScalarEvolution &SE, bool StrictCheck=false, bool CheckType=true)
Returns the distance between the pointers PtrA and PtrB iff they are compatible and it is possible to...
LLVM_ABI bool wouldInstructionBeTriviallyDead(const Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction would have no side effects if it was not used.
bool isModOrRefSet(const ModRefInfo MRI)
bool is_sorted(R &&Range, Compare C)
Wrapper function around std::is_sorted to check if elements in a range R are sorted with respect to a...
LLVM_ABI bool sortPtrAccesses(ArrayRef< Value * > VL, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
Attempt to sort the pointers in VL and return the sorted indices in SortedIndices,...
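A minimal sketch combining sortPtrAccesses and getPointersDiff from the entries above: sort a group of same-typed pointers, then measure the element distance spanned from the first to the last; DL and SE are assumed to come from the enclosing pass.
#include "llvm/Analysis/LoopAccessAnalysis.h"
using namespace llvm;

std::optional<int64_t> pointerSpanInElements(ArrayRef<Value *> Ptrs, Type *ElemTy,
                                             const DataLayout &DL,
                                             ScalarEvolution &SE) {
  SmallVector<unsigned> Order;
  if (!sortPtrAccesses(Ptrs, ElemTy, DL, SE, Order))
    return std::nullopt;  // the pointers could not be ordered relative to each other
  return getPointersDiff(ElemTy, Ptrs[Order.front()], ElemTy,
                         Ptrs[Order.back()], DL, SE);
}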
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
LLVM_ABI void propagateIRFlags(Value *I, ArrayRef< Value * > VL, Value *OpValue=nullptr, bool IncludeWrapFlags=true)
Get the intersection (logical and) of all of the potential IR flags of each scalar operation (VL) tha...
LLVM_ATTRIBUTE_VISIBILITY_DEFAULT AnalysisKey InnerAnalysisManagerProxy< AnalysisManagerT, IRUnitT, ExtraArgTs... >::Key
MutableArrayRef(T &OneElt) -> MutableArrayRef< T >
constexpr int PoisonMaskElem
iterator_range(Container &&) -> iterator_range< llvm::detail::IterOfRange< Container > >
@ Ref
The access may reference the value stored in memory.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
LLVM_ABI CmpInst::Predicate getMinMaxReductionPredicate(RecurKind RK)
Returns the comparison predicate used when expanding a min/max reduction.
RecurKind
These are the kinds of recurrences that we support.
@ Or
Bitwise or logical OR of integers.
LLVM_ABI bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic has a scalar operand.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
DWARFExpression::Operation Op
auto max_element(R &&Range)
Provide wrappers to std::max_element which take ranges instead of having to pass begin/end explicitly...
void ViewGraph(const GraphType &G, const Twine &Name, bool ShortNames=false, const Twine &Title="", GraphProgram::Name Program=GraphProgram::DOT)
ViewGraph - Emit a dot graph, run 'dot', run gv on the postscript file, then cleanup.
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI unsigned ComputeNumSignBits(const Value *Op, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Return the number of times the sign bit of the register is replicated into the other bits.
OutputIt copy(R &&Range, OutputIt Out)
auto make_second_range(ContainerTy &&c)
Given a container of pairs, return a range over the second elements.
constexpr unsigned BitWidth
LLVM_ABI bool isGuaranteedToTransferExecutionToSuccessor(const Instruction *I)
Return true if this function can prove that the instruction I will always transfer execution to one o...
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
auto pred_begin(const MachineBasicBlock *BB)
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
template class LLVM_TEMPLATE_ABI DomTreeNodeBase< BasicBlock >
bool equal(L &&LRange, R &&RRange)
Wrapper function around std::equal to detect if pair-wise elements between two ranges are the same.
LLVM_ABI bool isGuaranteedNotToBePoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Returns true if V cannot be poison, but may be undef.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
LLVM_ABI const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=MaxLookupSearchDepth)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
LLVM_ABI Constant * ConstantFoldIntegerCast(Constant *C, Type *DestTy, bool IsSigned, const DataLayout &DL)
Constant fold a zext, sext or trunc, depending on IsSigned and whether the DestTy is wider or narrowe...
LLVM_ABI bool isKnownNonNegative(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Returns true if the given value is known to be non-negative.
LLVM_ABI bool mayHaveNonDefUseDependency(const Instruction &I)
Returns true if the result or effects of the given instruction I depend on values not reachable through...
LLVM_ABI bool isTriviallyVectorizable(Intrinsic::ID ID)
Identify if the intrinsic is trivially vectorizable.
constexpr detail::IsaCheckPredicate< Types... > IsaPred
Function object wrapper for the llvm::isa type check.
LLVM_ABI bool isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic is overloaded on the type of the operand at index OpdI...
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Used to keep track of an operand bundle.
static LLVM_ABI void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
BoUpSLP::TreeEntry TreeEntry
std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R)
static std::string getNodeAttributes(const TreeEntry *Entry, const BoUpSLP *)
DOTGraphTraits(bool IsSimple=false)
DOTGraphTraits - Template class that can be specialized to customize how graphs are converted to 'dot...
DefaultDOTGraphTraits(bool simple=false)
DenseMapInfo< BoUpSLP::TreeEntry * > FirstInfo
static bool isEqual(const BoUpSLP::EdgeInfo &LHS, const BoUpSLP::EdgeInfo &RHS)
static BoUpSLP::EdgeInfo getEmptyKey()
DenseMapInfo< unsigned > SecondInfo
static unsigned getHashValue(const BoUpSLP::EdgeInfo &Val)
static BoUpSLP::EdgeInfo getTombstoneKey()
An information struct used to provide DenseMap with the various necessary components for a given valu...
Add the VectorizableTree to the index iterator to be able to return TreeEntry pointers.
ChildIteratorType(SmallVector< BoUpSLP::EdgeInfo, 1 >::iterator W, ContainerTy &VT)
ContainerTy & VectorizableTree
static ChildIteratorType child_end(NodeRef N)
static NodeRef getEntryNode(BoUpSLP &R)
static ChildIteratorType child_begin(NodeRef N)
static nodes_iterator nodes_begin(BoUpSLP *R)
TreeEntry * NodeRef
NodeRef has to be a pointer per the GraphWriter.
static unsigned size(BoUpSLP *R)
BoUpSLP::TreeEntry TreeEntry
static nodes_iterator nodes_end(BoUpSLP *R)
BoUpSLP::TreeEntry::VecTreeTy ContainerTy
Incoming for lane mask phi as machine instruction; incoming register Reg and incoming block Block are...
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
TargetTransformInfo * TTI
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
bool runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_, TargetLibraryInfo *TLI_, AAResults *AA_, LoopInfo *LI_, DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_, OptimizationRemarkEmitter *ORE_)
A MapVector that performs no allocations if smaller than a certain size.
static VFShape get(const FunctionType *FTy, ElementCount EC, bool HasGlobalPred)
Retrieve the basic vectorization shape of the function, where all parameters are mapped to VFParamKin...
Function object to check whether the first component of a container supported by std::get (like std::...
Function object to check whether the second component of a container supported by std::get (like std:...
This structure holds any data we need about the edges being traversed during buildTreeRec().
bool operator==(const EdgeInfo &Other) const
EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
unsigned EdgeIdx
The operand index of the use.
void dump(raw_ostream &OS) const
Debug print.
LLVM_DUMP_METHOD void dump() const
TreeEntry * UserTE
The user TreeEntry.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::EdgeInfo &EI)