#ifdef EXPENSIVE_CHECKS

using namespace std::placeholders;

#define SV_NAME "slp-vectorizer"
#define DEBUG_TYPE "SLP"
STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
116 "Controls which SLP graphs should be vectorized.");
120 cl::desc(
"Run the SLP vectorization passes"));
124 cl::desc(
"Enable vectorization for wider vector utilization"));
128 cl::desc(
"Only vectorize if you gain more than this "
133 cl::desc(
"When true, SLP vectorizer bypasses profitability checks based on "
134 "heuristics and makes vectorization decision via cost modeling."));
138 cl::desc(
"Attempt to vectorize horizontal reductions"));
143 "Attempt to vectorize horizontal reductions feeding into a store"));
147 cl::desc(
"Improve the code quality by splitting alternate instructions"));
151 cl::desc(
"Attempt to vectorize for this register size in bits"));
155 cl::desc(
"Maximum SLP vectorization factor (0=unlimited)"));
163 cl::desc(
"Limit the size of the SLP scheduling region per block"));
167 cl::desc(
"Attempt to vectorize for this register size in bits"));
171 cl::desc(
"Limit the recursion depth when building a vectorizable tree"));
175 cl::desc(
"Only vectorize small trees if they are fully vectorizable"));
181 cl::desc(
"The maximum look-ahead depth for operand reordering scores"));
190 cl::desc(
"The maximum look-ahead depth for searching best rooting option"));
194 cl::desc(
"The minimum number of loads, which should be considered strided, "
195 "if the stride is > 1 or is runtime value"));
199 cl::desc(
"The maximum stride, considered to be profitable."));
203 cl::desc(
"Disable tree reordering even if it is "
204 "profitable. Used for testing only."));
208 cl::desc(
"Generate strided loads even if they are not "
209 "profitable. Used for testing only."));
213 cl::desc(
"Display the SLP trees with Graphviz"));
217 cl::desc(
"Try to vectorize with non-power-of-2 number of elements."));
222 cl::desc(
"Try to replace values with the idempotent instructions for "
223 "better vectorization."));
    Ty = Ty->getScalarType();
         !Ty->isPPC_FP128Ty();

    return SI->getValueOperand()->getType();
    return CI->getOperand(0)->getType();
    return IE->getOperand(1)->getType();

         "ScalableVectorType is not supported.");
    return VecTy->getNumElements();
                                           Type *Ty, unsigned Sz) {
  if (NumParts == 0 || NumParts >= Sz)

  if (NumParts == 0 || NumParts >= Sz)
  return (Sz / RegVF) * RegVF;

                           I * VecTyNumElements, VecTyNumElements)))
                  : Mask[I] * VecTyNumElements + J;
  unsigned SVNumElements =
      cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
  unsigned ShuffleMaskSize = SV->getShuffleMask().size();
  if (SVNumElements % ShuffleMaskSize != 0)
  unsigned GroupSize = SVNumElements / ShuffleMaskSize;
  if (GroupSize == 0 || (VL.size() % GroupSize) != 0)
  unsigned NumGroup = 0;
  for (size_t I = 0, E = VL.size(); I != E; I += GroupSize) {
    Value *Src = SV->getOperand(0);
    if (SV->getOperand(0) != Src)
    if (!SV->isExtractSubvectorMask(Index))
    ExpectedIndex.set(Index / ShuffleMaskSize);
  if (!ExpectedIndex.all())
  assert(NumGroup == (VL.size() / GroupSize) && "Unexpected number of groups");
  unsigned SVNumElements =
      cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
  unsigned AccumulateLength = 0;
  for (Value *V : VL) {
    for (int M : SV->getShuffleMask())
                                 : AccumulateLength + M);
    AccumulateLength += SVNumElements;
  return std::min<unsigned>(PartNumElems, Size - Part * PartNumElems);

  OS << "Idx: " << Idx << ", ";
  OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]";
      if (BB != II->getParent())

  Value *FirstNonUndef = nullptr;
  for (Value *V : VL) {
    if (!FirstNonUndef) {
    if (V != FirstNonUndef)
  return FirstNonUndef != nullptr;
    return Cmp->isCommutative();
    return BO->isCommutative() ||
           (BO->getOpcode() == Instruction::Sub &&
                  if (match(U.getUser(),
                            m_ICmp(Pred, m_Specific(U.get()), m_Zero())) &&
                      (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE))
                  return match(U.getUser(),
                               m_Intrinsic<Intrinsic::abs>(
                                   m_Specific(U.get()), m_ConstantInt(Flag))) &&
                         (!cast<Instruction>(U.get())->hasNoSignedWrap() ||
           (BO->getOpcode() == Instruction::FSub &&
                  return match(U.getUser(),
                               m_Intrinsic<Intrinsic::fabs>(m_Specific(U.get())));
  return I->isCommutative();
    constexpr unsigned IntrinsicNumOperands = 2;
    return IntrinsicNumOperands;
  return I->getNumOperands();

  static_assert(std::is_same_v<T, InsertElementInst> ||
                    std::is_same_v<T, ExtractElementInst>,
    if (CI->getValue().uge(VT->getNumElements()))
    Index *= VT->getNumElements();
    Index += CI->getZExtValue();
  Type *CurrentType = IV->getType();
  for (unsigned I : IV->indices()) {
      Index *= ST->getNumElements();
      CurrentType = ST->getElementType(I);
      Index *= AT->getNumElements();
      CurrentType = AT->getElementType();
  return std::all_of(It, VL.end(), [&](Value *V) {
    if (auto *CI = dyn_cast<CmpInst>(V))
      return BasePred == CI->getPredicate();
    if (auto *I = dyn_cast<Instruction>(V))
      return I->getOpcode() == Opcode;
    return isa<PoisonValue>(V);
      if (MaskArg == UseMask::UndefsAsMask)
      if (MaskArg == UseMask::FirstArg && Value < VF)
        UseMask.reset(Value);
      else if (MaskArg == UseMask::SecondArg && Value >= VF)
        UseMask.reset(Value - VF);
template <bool IsPoisonOnly = false>
  using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
  if (!UseMask.empty()) {
      if (*Idx < UseMask.size() && !UseMask.test(*Idx))
  for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
    if (Constant *Elem = C->getAggregateElement(I))
          (UseMask.empty() || (I < UseMask.size() && !UseMask.test(I))))
static std::optional<TargetTransformInfo::ShuffleKind>
      std::accumulate(VL.begin(), VL.end(), 0u, [](unsigned S, Value *V) {
        auto *EI = dyn_cast<ExtractElementInst>(V);
        auto *VTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
        return std::max(S, VTy->getNumElements());
  Value *Vec1 = nullptr;
  Value *Vec2 = nullptr;
    Value *Vec = EE->getVectorOperand();
  ShuffleMode CommonShuffleMode = Unknown;
  for (unsigned I = 0, E = VL.size(); I < E; ++I) {
    auto *Vec = EI->getVectorOperand();
    if (Idx->getValue().uge(Size))
    unsigned IntIdx = Idx->getValue().getZExtValue();
    if (!Vec1 || Vec1 == Vec) {
    } else if (!Vec2 || Vec2 == Vec) {
    if (CommonShuffleMode == Permute)
    if (Mask[I] % Size != I) {
      CommonShuffleMode = Permute;
    CommonShuffleMode = Select;
  if (CommonShuffleMode == Select && Vec2)
  unsigned Opcode = E->getOpcode();
  assert((Opcode == Instruction::ExtractElement ||
          Opcode == Instruction::ExtractValue) &&
         "Expected extractelement or extractvalue instruction.");
  if (Opcode == Instruction::ExtractElement) {
      return CI->getZExtValue();
  if (EI->getNumIndices() != 1)
  return *EI->idx_begin();
bool isValidForAlternation(unsigned Opcode) {
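// A sketch of the idea behind the helper below: several integer binops with a
// constant operand can stand in for one another, e.g. (illustrative IR)
//   %a = shl i32 %x, 1   ; equivalent to mul i32 %x, 2
//   %b = add i32 %y, 0   ; identity, convertible to sub/or/xor with 0
// The class tracks the set of still-compatible opcodes as a bitmask so a
// whole bundle can be vectorized under one main/alternate opcode.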
class BinOpSameOpcodeHelper {
  using MaskType = std::uint_fast16_t;
  constexpr static std::initializer_list<unsigned> SupportedOp = {
      Instruction::Add, Instruction::Sub, Instruction::Mul, Instruction::Shl,
      Instruction::AShr, Instruction::And, Instruction::Or, Instruction::Xor};
    MainOpBIT = 0b100000000,
  static std::pair<ConstantInt *, unsigned>
  isBinOpWithConstantInt(const Instruction *I) {
    unsigned Opcode = I->getOpcode();
    if (Opcode == Instruction::Sub || Opcode == Instruction::Shl ||
        Opcode == Instruction::AShr)
  struct InterchangeableInfo {
    MaskType Mask = MainOpBIT | XorBIT | OrBIT | AndBIT | SubBIT | AddBIT |
                    MulBIT | AShrBIT | ShlBIT;
    MaskType SeenBefore = 0;
    InterchangeableInfo(const Instruction *I) : I(I) {}
    bool trySet(MaskType OpcodeInMaskForm, MaskType InterchangeableMask) {
      if (Mask & InterchangeableMask) {
        SeenBefore |= OpcodeInMaskForm;
        Mask &= InterchangeableMask;
    bool equal(unsigned Opcode) {
      return Opcode == I->getOpcode() && trySet(MainOpBIT, MainOpBIT);
      MaskType Candidate = Mask & SeenBefore;
      if (Candidate & MainOpBIT)
        return I->getOpcode();
      if (Candidate & ShlBIT)
        return Instruction::Shl;
      if (Candidate & AShrBIT)
        return Instruction::AShr;
      if (Candidate & MulBIT)
        return Instruction::Mul;
      if (Candidate & AddBIT)
        return Instruction::Add;
      if (Candidate & SubBIT)
        return Instruction::Sub;
      if (Candidate & AndBIT)
        return Instruction::And;
      if (Candidate & OrBIT)
        return Instruction::Or;
      if (Candidate & XorBIT)
        return Instruction::Xor;
    bool hasCandidateOpcode(unsigned Opcode) const {
      MaskType Candidate = Mask & SeenBefore;
      case Instruction::Shl:
        return Candidate & ShlBIT;
      case Instruction::AShr:
        return Candidate & AShrBIT;
      case Instruction::Mul:
        return Candidate & MulBIT;
      case Instruction::Add:
        return Candidate & AddBIT;
      case Instruction::Sub:
        return Candidate & SubBIT;
      case Instruction::And:
        return Candidate & AndBIT;
      case Instruction::Or:
        return Candidate & OrBIT;
      case Instruction::Xor:
        return Candidate & XorBIT;
      case Instruction::LShr:
      case Instruction::FAdd:
      case Instruction::FSub:
      case Instruction::FMul:
      case Instruction::SDiv:
      case Instruction::UDiv:
      case Instruction::FDiv:
      case Instruction::SRem:
      case Instruction::URem:
      case Instruction::FRem:
      unsigned FromOpcode = I->getOpcode();
      if (FromOpcode == ToOpcode)
      auto [CI, Pos] = isBinOpWithConstantInt(I);
      const APInt &FromCIValue = CI->getValue();
      unsigned FromCIValueBitWidth = FromCIValue.getBitWidth();
      switch (FromOpcode) {
      case Instruction::Shl:
        if (ToOpcode == Instruction::Mul) {
          assert(FromCIValue.isZero() && "Cannot convert the instruction.");
          ToCIValue = ToOpcode == Instruction::And
                          : APInt::getZero(FromCIValueBitWidth);
      case Instruction::Mul:
        if (ToOpcode == Instruction::Shl) {
          ToCIValue = APInt(FromCIValueBitWidth, FromCIValue.logBase2());
          assert(FromCIValue.isOne() && "Cannot convert the instruction.");
          ToCIValue = ToOpcode == Instruction::And
                          : APInt::getZero(FromCIValueBitWidth);
      case Instruction::Add:
      case Instruction::Sub:
        if (FromCIValue.isZero()) {
                 "Cannot convert the instruction.");
          ToCIValue = FromCIValue;
      case Instruction::And:
        ToCIValue = ToOpcode == Instruction::Mul
                        : APInt::getZero(FromCIValueBitWidth);
        assert(FromCIValue.isZero() && "Cannot convert the instruction.");
      Value *LHS = I->getOperand(1 - Pos);
          ConstantInt::get(I->getOperand(Pos)->getType(), ToCIValue);
          ((FromOpcode == Instruction::Add || FromOpcode == Instruction::Or ||
            FromOpcode == Instruction::Xor) &&
           ToOpcode == Instruction::Sub))
  InterchangeableInfo MainOp;
  InterchangeableInfo AltOp;
  bool isValidForAlternation(const Instruction *I) const {
    return ::isValidForAlternation(MainOp.I->getOpcode()) &&
           ::isValidForAlternation(I->getOpcode());
  bool initializeAltOp(const Instruction *I) {
    if (!isValidForAlternation(I))
  BinOpSameOpcodeHelper(const Instruction *MainOp,
                        const Instruction *AltOp = nullptr)
      : MainOp(MainOp), AltOp(AltOp) {
  bool add(const Instruction *I) {
           "BinOpSameOpcodeHelper only accepts BinaryOperator.");
    unsigned Opcode = I->getOpcode();
    MaskType OpcodeInMaskForm;
    case Instruction::Shl:
      OpcodeInMaskForm = ShlBIT;
    case Instruction::AShr:
      OpcodeInMaskForm = AShrBIT;
    case Instruction::Mul:
      OpcodeInMaskForm = MulBIT;
    case Instruction::Add:
      OpcodeInMaskForm = AddBIT;
    case Instruction::Sub:
      OpcodeInMaskForm = SubBIT;
    case Instruction::And:
      OpcodeInMaskForm = AndBIT;
    case Instruction::Or:
      OpcodeInMaskForm = OrBIT;
    case Instruction::Xor:
      OpcodeInMaskForm = XorBIT;
      return MainOp.equal(Opcode) ||
             (initializeAltOp(I) && AltOp.equal(Opcode));
    MaskType InterchangeableMask = OpcodeInMaskForm;
    ConstantInt *CI = isBinOpWithConstantInt(I).first;
      constexpr MaskType CanBeAll =
          XorBIT | OrBIT | AndBIT | SubBIT | AddBIT | MulBIT | AShrBIT | ShlBIT;
      const APInt &CIValue = CI->getValue();
      case Instruction::Shl:
        InterchangeableMask = CIValue.isZero() ? CanBeAll : MulBIT | ShlBIT;
      case Instruction::Mul:
        if (CIValue.isOne()) {
          InterchangeableMask = CanBeAll;
        InterchangeableMask = MulBIT | ShlBIT;
      case Instruction::Add:
      case Instruction::Sub:
        InterchangeableMask = CIValue.isZero() ? CanBeAll : SubBIT | AddBIT;
      case Instruction::And:
        InterchangeableMask = CanBeAll;
      case Instruction::Xor:
        InterchangeableMask = XorBIT | OrBIT | AndBIT | SubBIT | AddBIT;
        InterchangeableMask = CanBeAll;
    return MainOp.trySet(OpcodeInMaskForm, InterchangeableMask) ||
           (initializeAltOp(I) &&
            AltOp.trySet(OpcodeInMaskForm, InterchangeableMask));
  unsigned getMainOpcode() const { return MainOp.getOpcode(); }
  bool hasCandidateOpcode(unsigned Opcode) const {
    return MainOp.hasCandidateOpcode(Opcode);
  bool hasAltOp() const { return AltOp.I; }
  unsigned getAltOpcode() const {
    return hasAltOp() ? AltOp.getOpcode() : getMainOpcode();
    return MainOp.getOperand(I);
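// InstructionsState describes a bundle of scalars by a main and an alternate
// instruction: MainOp carries the representative opcode, and AltOp differs
// from it only for alternate shuffles (e.g. a mixed add/sub sequence). Both
// pointers being non-null is exactly what valid() checks.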
class InstructionsState {
  bool HasCopyables = false;
    assert(valid() && "InstructionsState is invalid.");
    assert(valid() && "InstructionsState is invalid.");
  unsigned getOpcode() const { return getMainOp()->getOpcode(); }
  unsigned getAltOpcode() const { return getAltOp()->getOpcode(); }
  bool isAltShuffle() const { return getMainOp() != getAltOp(); }
  Instruction *getMatchingMainOpOrAltOp(Instruction *I) const {
    assert(MainOp && "MainOp cannot be nullptr.");
    if (I->getOpcode() == MainOp->getOpcode())
    assert(AltOp && "AltOp cannot be nullptr.");
    if (I->getOpcode() == AltOp->getOpcode())
    if (!I->isBinaryOp())
    BinOpSameOpcodeHelper Converter(MainOp);
    if (isAltShuffle() && !Converter.hasCandidateOpcode(MainOp->getOpcode())) {
      BinOpSameOpcodeHelper AltConverter(AltOp);
      if (AltConverter.add(I) && AltConverter.add(AltOp) &&
          AltConverter.hasCandidateOpcode(AltOp->getOpcode()))
    if (Converter.hasAltOp() && !isAltShuffle())
    return Converter.hasAltOp() ? AltOp : MainOp;
  bool isShiftOp() const {
    return getMainOp()->isShift() && getAltOp()->isShift();
    return getMainOp()->isBitwiseLogicOp() && getAltOp()->isBitwiseLogicOp();
  bool isMulDivLikeOp() const {
    constexpr std::array<unsigned, 8> MulDiv = {
        Instruction::Mul, Instruction::FMul, Instruction::SDiv,
        Instruction::UDiv, Instruction::FDiv, Instruction::SRem,
        Instruction::URem, Instruction::FRem};
  bool isAddSubLikeOp() const {
    constexpr std::array<unsigned, 4> AddSub = {
        Instruction::Add, Instruction::Sub, Instruction::FAdd,
  bool isCmpOp() const {
    return (getOpcode() == Instruction::ICmp ||
  bool valid() const { return MainOp && AltOp; }
  explicit operator bool() const { return valid(); }
  InstructionsState() = delete;
  InstructionsState(Instruction *MainOp, Instruction *AltOp,
                    bool HasCopyables = false)
      : MainOp(MainOp), AltOp(AltOp), HasCopyables(HasCopyables) {}
  static InstructionsState invalid() { return {nullptr, nullptr}; }
  bool isCopyableElement(Value *V) const {
    assert(valid() && "InstructionsState is invalid.");
    if (isAltShuffle() || getOpcode() == Instruction::GetElementPtr)
    if (I->getParent() != MainOp->getParent() &&
    if (I->getOpcode() == MainOp->getOpcode())
    if (!I->isBinaryOp())
    BinOpSameOpcodeHelper Converter(MainOp);
  bool isNonSchedulable(Value *V) const {
    assert(valid() && "InstructionsState is invalid.");
    if (getMainOp() == V)
    if (isCopyableElement(V)) {
      auto IsNonSchedulableCopyableElement = [this](Value *V) {
        return !I || isa<PHINode>(I) || I->getParent() != MainOp->getParent() ||
                !MainOp->comesBefore(I));
      return IsNonSchedulableCopyableElement(V);
  bool areInstructionsWithCopyableElements() const {
    assert(valid() && "InstructionsState is invalid.");
    return HasCopyables;
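// convertTo (below) rewrites instruction I into the main/alt form chosen by
// the state, returning the selected representative together with the operand
// list to use; binary ops may need their constant operand converted, as done
// by BinOpSameOpcodeHelper above.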
std::pair<Instruction *, SmallVector<Value *>>
  Instruction *SelectedOp = S.getMatchingMainOpOrAltOp(I);
  assert(SelectedOp && "Cannot convert the instruction.");
  if (I->isBinaryOp()) {
    return std::make_pair(SelectedOp, Converter.getOperand(SelectedOp));
  for (Value *V : VL) {
    if (Inst->getOpcode() == Opcode)
      BaseOp0 == Op0 || BaseOp1 == Op1 ||
         "Assessing comparisons of different types?");
  return (BasePred == Pred &&
         (BasePred == SwappedPred &&
    return InstructionsState::invalid();
    return InstructionsState::invalid();
      (VL.size() == 2 && InstCnt < 2))
    return InstructionsState::invalid();
  unsigned AltOpcode = Opcode;
  BinOpSameOpcodeHelper BinOpHelper(MainOp);
  bool SwappedPredsCompatible = IsCmpOp && [&]() {
    UniquePreds.insert(BasePred);
    UniqueNonSwappedPreds.insert(BasePred);
    for (Value *V : VL) {
      UniqueNonSwappedPreds.insert(CurrentPred);
      if (!UniquePreds.contains(CurrentPred) &&
          !UniquePreds.contains(SwappedCurrentPred))
        UniquePreds.insert(CurrentPred);
    return UniqueNonSwappedPreds.size() > 2 && UniquePreds.size() == 2;
    return InstructionsState::invalid();
  bool AnyPoison = InstCnt != VL.size();
    if (AnyPoison && (I->isIntDivRem() || I->isFPDivRem() || isa<CallInst>(I)))
      return InstructionsState::invalid();
    unsigned InstOpcode = I->getOpcode();
      if (BinOpHelper.add(I))
      Value *Op1 = I->getOperand(0);
      if (InstOpcode == Opcode || InstOpcode == AltOpcode)
      if (Opcode == AltOpcode) {
        assert(isValidForAlternation(Opcode) &&
               isValidForAlternation(InstOpcode) &&
               "Cast isn't safe for alternation, logic needs to be updated!");
        AltOpcode = InstOpcode;
      Type *Ty0 = BaseInst->getOperand(0)->getType();
      Type *Ty1 = Inst->getOperand(0)->getType();
        assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
        assert(InstOpcode == AltOpcode &&
               "Alternate instructions are only supported by BinaryOperator "
        if ((VL.size() == 2 || SwappedPredsCompatible) &&
            (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
        if (MainOp != AltOp) {
        } else if (BasePred != CurrentPred) {
                 isValidForAlternation(InstOpcode) &&
                 "CmpInst isn't safe for alternation, logic needs to be updated!");
          if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
              AltPred == CurrentPred || AltPred == SwappedCurrentPred)
    } else if (InstOpcode == Opcode) {
      assert(InstOpcode == AltOpcode &&
             "Alternate instructions are only supported by BinaryOperator and "
        if (Gep->getNumOperands() != 2 ||
          return InstructionsState::invalid();
          return InstructionsState::invalid();
        if (!LI->isSimple() || !BaseLI->isSimple())
          return InstructionsState::invalid();
          return InstructionsState::invalid();
        if (Call->hasOperandBundles() &&
            !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
                        Call->op_begin() + Call->getBundleOperandsEndIndex(),
          return InstructionsState::invalid();
          return InstructionsState::invalid();
          if (Mappings.size() != BaseMappings.size() ||
              Mappings.front().ISA != BaseMappings.front().ISA ||
              Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
              Mappings.front().VectorName != BaseMappings.front().VectorName ||
              Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
              Mappings.front().Shape.Parameters !=
                  BaseMappings.front().Shape.Parameters)
            return InstructionsState::invalid();
    return InstructionsState::invalid();
  assert(MainOp && "Cannot find MainOp with Opcode from BinOpHelper.");
  assert(AltOp && "Cannot find AltOp with Opcode from BinOpHelper.");
         "Incorrect implementation of allSameOpcode.");
  InstructionsState S(MainOp, AltOp);
         "Invalid InstructionsState.");
  return all_of(VL, [&](Value *V) { return V->getType() == Ty; });
  unsigned Opcode = UserInst->getOpcode();
  case Instruction::Load: {
  case Instruction::Store: {
    return (SI->getPointerOperand() == Scalar);
  case Instruction::Call: {
      return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index(), TTI) &&
             Arg.value().get() == Scalar;
    return LI->isSimple();
    return SI->isSimple();
    return !MI->isVolatile();
                    bool ExtendingManyInputs = false) {
  if (SubMask.empty())
         (!ExtendingManyInputs || SubMask.size() > Mask.size() ||
         "SubMask with many inputs support must be larger than the mask.");
    Mask.append(SubMask.begin(), SubMask.end());
  int TermValue = std::min(Mask.size(), SubMask.size());
  for (int I = 0, E = SubMask.size(); I < E; ++I) {
        (!ExtendingManyInputs &&
         (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)))
    NewMask[I] = Mask[SubMask[I]];
  const size_t Sz = Order.size();
  for (unsigned I = 0; I < Sz; ++I) {
      UnusedIndices.reset(Order[I]);
      MaskedIndices.set(I);
  if (MaskedIndices.none())
         "Non-synced masked/available indices.");
    assert(Idx >= 0 && "Indices must be synced.");
                          unsigned Opcode0, unsigned Opcode1) {
      OpcodeMask.set(Lane * ScalarTyNumElements,
                     Lane * ScalarTyNumElements + ScalarTyNumElements);
         "Expected scalar constants.");
    std::fill_n(NewVal.begin() + I * VF, VF, V);
  const unsigned E = Indices.size();
  for (unsigned I = 0; I < E; ++I)
    Mask[Indices[I]] = I;
  assert(!Mask.empty() && "Expected non-empty mask.");
  for (unsigned I = 0, E = Prev.size(); I < E; ++I)
    Scalars[Mask[I]] = Prev[I];
    auto *IO = dyn_cast<Instruction>(V);
    return isa<PHINode>(IO) || IO->getParent() != I->getParent();
  return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(UsesLimit) &&
    auto *IU = dyn_cast<Instruction>(U);
    return IU->getParent() != I->getParent() || isa<PHINode>(IU);
  return !VL.empty() &&
  return NumParts > 0 && NumParts < Sz && has_single_bit(Sz / NumParts) &&
    const unsigned Limit = std::numeric_limits<unsigned>::max()) {
  unsigned NumParts = TTI.getNumberOfParts(VecTy);
  if (NumParts == 0 || NumParts >= Limit)
  if (NumParts >= Sz || Sz % NumParts != 0 ||
  class ScheduleEntity;
  class ScheduleCopyableData;
  class ScheduleBundle;

  struct StridedPtrInfo {
    Value *StrideVal = nullptr;
    const SCEV *StrideSCEV = nullptr;
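  // StridedPtrInfo (above) records the stride of a pointer sequence, either
  // as an IR value or as a SCEV expression, for later emission of strided
  // loads.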
      : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li), DT(Dt),
        AC(AC), DB(DB), DL(DL), ORE(ORE),
    MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
      ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales = {});
    assert(!VectorizableTree.empty() && "No graph to get the first node from");
    return VectorizableTree.front()->Scalars;
    const TreeEntry &Root = *VectorizableTree.front();
    if (Root.State != TreeEntry::Vectorize || Root.isAltShuffle() ||
        !Root.Scalars.front()->getType()->isIntegerTy())
      return std::nullopt;
    auto It = MinBWs.find(&Root);
    if (It != MinBWs.end())
    if (Root.getOpcode() == Instruction::ZExt ||
        Root.getOpcode() == Instruction::SExt)
      return std::make_pair(cast<CastInst>(Root.getMainOp())->getSrcTy(),
                            Root.getOpcode() == Instruction::SExt);
    return std::nullopt;
    return MinBWs.at(VectorizableTree.front().get()).second;
    if (ReductionBitWidth == 0 ||
        !VectorizableTree.front()->Scalars.front()->getType()->isIntegerTy() ||
        ReductionBitWidth >=
            DL->getTypeSizeInBits(
                VectorizableTree.front()->Scalars.front()->getType()))
          VectorizableTree.front()->Scalars.front()->getType(),
          VectorizableTree.front()->getVectorFactor());
          VectorizableTree.front()->Scalars.front()->getContext(),
          VectorizableTree.front()->getVectorFactor());
    VectorizableTree.clear();
    ScalarToTreeEntries.clear();
    OperandsToTreeEntry.clear();
    ScalarsInSplitNodes.clear();
    NonScheduledFirst.clear();
    EntryToLastInstruction.clear();
    LoadEntriesToVectorize.clear();
    IsGraphTransformMode = false;
    GatheredLoadsEntriesFirst.reset();
    CompressEntryToData.clear();
    ExternalUses.clear();
    ExternalUsesAsOriginalScalar.clear();
    ExternalUsesWithNonUsers.clear();
    for (auto &Iter : BlocksSchedules) {
      BlockScheduling *BS = Iter.second.get();
    ReductionBitWidth = 0;
    CastMaxMinBWSizes.reset();
    ExtraBitWidthNodes.clear();
    InstrElementSize.clear();
    UserIgnoreList = nullptr;
    PostponedGathers.clear();
    ValueToGatherNodes.clear();
    TreeEntryToStridedPtrInfoMap.clear();
    assert(!Order.empty() && "expected non-empty order");
    const unsigned Sz = Order.size();
      return P.value() == P.index() || P.value() == Sz;
                               bool IgnoreReorder);
  std::optional<OrdersType>
    return MaxVecRegSize;
    return MinVecRegSize;
    unsigned MaxVF = MaxVFOption.getNumOccurrences() ?
        MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode);
    return MaxVF ? MaxVF : UINT_MAX;
                             Align Alignment, const int64_t Diff,
                             const size_t Sz) const;
                        Value *Ptr0, Value *PtrN,
                        StridedPtrInfo &SPtrInfo) const;
                          Align CommonAlignment,
                          StridedPtrInfo &SPtrInfo) const;
                      StridedPtrInfo &SPtrInfo,
                      unsigned *BestVF = nullptr,
                      bool TryRecursiveCheck = true) const;
    ListOfKnonwnNonVectorizableLoads.insert(hash_value(VL));
  template <typename T>
    return ListOfKnonwnNonVectorizableLoads.contains(hash_value(VL));
      OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
         << " EdgeIdx:" << EdgeIdx << "}";
      : TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes),
        MaxLevel(MaxLevel) {}
      auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) {
        auto AllUsersVectorized = [U1, U2, this](Value *V) {
            return U == U1 || U == U2 || R.isVectorized(U);
        return AllUsersVectorized(V1) && AllUsersVectorized(V2);
      if (R.TTI->isLegalBroadcastLoad(V1->getType(),
          ((int)V1->getNumUses() == NumLanes ||
           AllUsersAreInternal(V1, V2)))
    auto CheckSameEntryOrFail = [&]() {
          any_of(TEs2, [&](TreeEntry *E) { return Set.contains(E); }))
      if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
        return CheckSameEntryOrFail();
          LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
          LI2->getPointerOperand(), DL, SE, true);
      if (!Dist || *Dist == 0) {
            R.TTI->isLegalMaskedGather(
        return CheckSameEntryOrFail();
      if (std::abs(*Dist) > NumLanes / 2)
      Value *EV2 = nullptr;
      int Dist = Idx2 - Idx1;
      if (std::abs(Dist) == 0)
      if (std::abs(Dist) > NumLanes / 2)
      return CheckSameEntryOrFail();
      if (I1->getParent() != I2->getParent())
        return CheckSameEntryOrFail();
          (S.getMainOp()->getNumOperands() <= 2 || !MainAltOps.empty() ||
           !S.isAltShuffle()) &&
              S.getMainOp()->getNumOperands();
    return CheckSameEntryOrFail();
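    // Look-ahead scoring: a pair of candidate operands is scored shallowly
    // first, then the score is boosted by recursively scoring their own
    // operand pairs up to MaxLevel, so orderings that keep matching use-def
    // chains together win.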
    int ShallowScoreAtThisLevel =
    if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
        (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
                           ShallowScoreAtThisLevel))
      return ShallowScoreAtThisLevel;
    assert(I1 && I2 && "Should have early exited.");
    for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
         OpIdx1 != NumOperands1; ++OpIdx1) {
      int MaxTmpScore = 0;
      unsigned MaxOpIdx2 = 0;
      bool FoundBest = false;
              ? I2->getNumOperands()
              : std::min(I2->getNumOperands(), OpIdx1 + 1);
      assert(FromIdx <= ToIdx && "Bad index");
      for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
        if (Op2Used.count(OpIdx2))
                                I1, I2, CurrLevel + 1, {});
            TmpScore > MaxTmpScore) {
          MaxTmpScore = TmpScore;
        Op2Used.insert(MaxOpIdx2);
        ShallowScoreAtThisLevel += MaxTmpScore;
    return ShallowScoreAtThisLevel;
  struct OperandData {
    OperandData() = default;
    OperandData(Value *V, bool APO, bool IsUsed)
        : V(V), APO(APO), IsUsed(IsUsed) {}
    bool IsUsed = false;
  enum class ReorderingMode {
  unsigned ArgSize = 0;
  const Loop *L = nullptr;
  OperandData &getData(unsigned OpIdx, unsigned Lane) {
    return OpsVec[OpIdx][Lane];
  const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
    return OpsVec[OpIdx][Lane];
    for (unsigned OpIdx = 0, NumOperands = getNumOperands();
      for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
        OpsVec[OpIdx][Lane].IsUsed = false;
  void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
    std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
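  // Splat score heuristic (below): reusing a value already present in other
  // lanes of the same operand is rewarded; the bit_ceil/bit_floor arithmetic
  // turns the unique-value counts with and without the candidate into a
  // score delta.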
  int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx,
    Value *IdxLaneV = getData(Idx, Lane).V;
    unsigned UniquesCount = Uniques.size();
    auto IdxIt = Uniques.find(IdxLaneV);
    unsigned UniquesCntWithIdxLaneV =
        IdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
    auto OpIdxIt = Uniques.find(OpIdxLaneV);
    unsigned UniquesCntWithOpIdxLaneV =
        OpIdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
    if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
    return std::min(bit_ceil(UniquesCntWithOpIdxLaneV) -
                        UniquesCntWithOpIdxLaneV,
                    UniquesCntWithOpIdxLaneV -
           ((IdxIt != Uniques.end() && UsedLanes.test(IdxIt->second))
                ? UniquesCntWithIdxLaneV - bit_floor(UniquesCntWithIdxLaneV)
                : bit_ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
  int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
    Value *IdxLaneV = getData(Idx, Lane).V;
    return R.areAllUsersVectorized(IdxLaneI)
  static const int ScoreScaleFactor = 10;
                        int Lane, unsigned OpIdx, unsigned Idx,
      int SplatScore = getSplatScore(Lane, OpIdx, Idx, UsedLanes);
      if (Score <= -SplatScore) {
        Score += SplatScore;
      Score *= ScoreScaleFactor;
      Score += getExternalUseScore(Lane, OpIdx, Idx);
  std::optional<unsigned>
  getBestOperand(unsigned OpIdx, int Lane, int LastLane,
    unsigned NumOperands = getNumOperands();
    Value *OpLastLane = getData(OpIdx, LastLane).V;
    ReorderingMode RMode = ReorderingModes[OpIdx];
    if (RMode == ReorderingMode::Failed)
      return std::nullopt;
    bool OpIdxAPO = getData(OpIdx, Lane).APO;
      std::optional<unsigned> Idx;
        BestScoresPerLanes.try_emplace(std::make_pair(OpIdx, Lane), 0)
    bool IsUsed = RMode == ReorderingMode::Splat ||
                  RMode == ReorderingMode::Constant ||
                  RMode == ReorderingMode::Load;
    for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
      OperandData &OpData = getData(Idx, Lane);
      bool OpAPO = OpData.APO;
      if (OpAPO != OpIdxAPO)
      case ReorderingMode::Load:
      case ReorderingMode::Opcode: {
        bool LeftToRight = Lane > LastLane;
        Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
        Value *OpRight = (LeftToRight) ? Op : OpLastLane;
        int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
                                      OpIdx, Idx, IsUsed, UsedLanes);
        if (Score > static_cast<int>(BestOp.Score) ||
            (Score > 0 && Score == static_cast<int>(BestOp.Score) &&
          BestOp.Score = Score;
          BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
      case ReorderingMode::Constant:
            (!BestOp.Score && L && L->isLoopInvariant(Op))) {
          BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
      case ReorderingMode::Splat:
        IsUsed = Op == OpLastLane;
        if (Op == OpLastLane) {
          BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
      case ReorderingMode::Failed:
      getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
    return std::nullopt;
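  // Lane selection heuristic (below): lanes are compared by how many of
  // their operands sit in constrained (APO) positions and by how many share
  // opcode and parent block; the least-constrained lane becomes the
  // reordering anchor.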
  unsigned getBestLaneToStartReordering() const {
    unsigned Min = UINT_MAX;
    unsigned SameOpNumber = 0;
    for (int I = getNumLanes(); I > 0; --I) {
      unsigned Lane = I - 1;
      OperandsOrderData NumFreeOpsHash =
          getMaxNumOperandsThatCanBeReordered(Lane);
      if (NumFreeOpsHash.NumOfAPOs < Min) {
        Min = NumFreeOpsHash.NumOfAPOs;
        SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
        HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
      } else if (NumFreeOpsHash.NumOfAPOs == Min &&
                 NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
        SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
        HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
      } else if (NumFreeOpsHash.NumOfAPOs == Min &&
                 NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
        auto [It, Inserted] =
            HashMap.try_emplace(NumFreeOpsHash.Hash, 1, Lane);
    unsigned BestLane = 0;
    unsigned CntMin = UINT_MAX;
      if (Data.second.first < CntMin) {
        CntMin = Data.second.first;
        BestLane = Data.second.second;
  struct OperandsOrderData {
    unsigned NumOfAPOs = UINT_MAX;
    unsigned NumOpsWithSameOpcodeParent = 0;
  OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
    unsigned CntTrue = 0;
    unsigned NumOperands = getNumOperands();
    bool AllUndefs = true;
    unsigned NumOpsWithSameOpcodeParent = 0;
      const OperandData &OpData = getData(OpIdx, Lane);
          I->getParent() != Parent) {
        if (NumOpsWithSameOpcodeParent == 0) {
          NumOpsWithSameOpcodeParent = 1;
        Parent = I->getParent();
        --NumOpsWithSameOpcodeParent;
        ++NumOpsWithSameOpcodeParent;
    OperandsOrderData Data;
    Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
    Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
                      const InstructionsState &S) {
        return VL.size() == getNumLanes();
        "Expected same number of lanes");
    assert(S.valid() && "InstructionsState is invalid.");
    OpsVec.resize(ArgSize);
    unsigned NumLanes = VL.size();
    for (OperandDataVec &Ops : OpsVec)
      Ops.resize(NumLanes);
          OpsVec[OpIdx][Lane] = {Operands[OpIdx][Lane], true, false};
      bool IsInverseOperation = false;
      if (S.isCopyableElement(VL[Lane])) {
        assert(I && "Expected instruction");
        auto [SelectedOp, Ops] = convertTo(I, S);
        bool APO = (OpIdx == 0) ? false : IsInverseOperation;
        OpsVec[OpIdx][Lane] = {Operands[OpIdx][Lane], APO, false};
  unsigned getNumOperands() const { return ArgSize; }
  unsigned getNumLanes() const { return OpsVec[0].size(); }
  Value *getValue(unsigned OpIdx, unsigned Lane) const {
    return getData(OpIdx, Lane).V;
  bool empty() const { return OpsVec.empty(); }
  void clear() { OpsVec.clear(); }
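  // shouldBroadcast (below) checks whether the value at (OpIdx, Lane) also
  // appears (or has a cheap loop-invariant equivalent) in other lanes of
  // some operand, in which case treating the operand as a splat is
  // preferable.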
  bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
           "Op is expected to be getValue(OpIdx, Lane).");
    bool OpAPO = getData(OpIdx, Lane).APO;
    bool IsInvariant = L && L->isLoopInvariant(Op);
    for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
      bool FoundCandidate = false;
      for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
        OperandData &Data = getData(OpI, Ln);
        if (Data.APO != OpAPO || Data.IsUsed)
        Value *OpILane = getValue(OpI, Lane);
             L->isLoopInvariant(Data.V))) {
          FoundCandidate = true;
      if (!FoundCandidate)
    return getNumLanes() == 2 || Cnt > 1;
           "Op is expected to be getValue(OpIdx, Lane).");
    bool OpAPO = getData(OpIdx, Lane).APO;
    for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
        const OperandData &Data = getData(OpI, Ln);
        if (Data.APO != OpAPO || Data.IsUsed)
        Value *OpILn = getValue(OpI, Ln);
        return (L && L->isLoopInvariant(OpILn)) ||
                  const InstructionsState &S, const BoUpSLP &R)
      : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R),
        L(R.LI->getLoopFor(S.getMainOp()->getParent())) {
    appendOperands(RootVL, Operands, S);
           "Expected same num of lanes across all operands");
    for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
      OpVL[Lane] = OpsVec[OpIdx][Lane].V;
    unsigned NumOperands = getNumOperands();
    unsigned NumLanes = getNumLanes();
    unsigned FirstLane = getBestLaneToStartReordering();
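    // Reordering proceeds as an expanding wave: starting from FirstLane,
    // lanes are visited at increasing Distance in both directions, and each
    // lane picks the operand with the best look-ahead score against the
    // previously processed lane.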
      if (shouldBroadcast(OpLane0, OpIdx, FirstLane) ||
          !canBeVectorized(OpILane0, OpIdx, FirstLane))
        ReorderingModes[OpIdx] = ReorderingMode::Splat;
        ReorderingModes[OpIdx] = ReorderingMode::Load;
        ReorderingModes[OpIdx] = ReorderingMode::Opcode;
        ReorderingModes[OpIdx] = ReorderingMode::Constant;
        ReorderingModes[OpIdx] = ReorderingMode::Splat;
    auto &&SkipReordering = [this]() {
      for (const OperandData &Data : Op0)
           ArrayRef(OpsVec).slice(1, getNumOperands() - 1)) {
        if (any_of(Op, [&UniqueValues](const OperandData &Data) {
      return UniqueValues.size() != 2 &&
                          UniqueValues.size());
    if (SkipReordering())
    bool StrategyFailed = false;
    for (unsigned I = 0; I < NumOperands; ++I)
      MainAltOps[I].push_back(getData(I, FirstLane).V);
    UsedLanes.set(FirstLane);
    for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
      for (int Direction : {+1, -1}) {
        int Lane = FirstLane + Direction * Distance;
        if (Lane < 0 || Lane >= (int)NumLanes)
        UsedLanes.set(Lane);
        int LastLane = Lane - Direction;
        assert(LastLane >= 0 && LastLane < (int)NumLanes &&
          std::optional<unsigned> BestIdx =
              getBestOperand(OpIdx, Lane, LastLane, ReorderingModes,
                             MainAltOps[OpIdx], UsedLanes);
            swap(OpIdx, *BestIdx, Lane);
            StrategyFailed = true;
            OperandData &AltOp = getData(OpIdx, Lane);
            InstructionsState OpS =
            if (OpS && OpS.isAltShuffle())
    if (!StrategyFailed)
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    case ReorderingMode::Load:
    case ReorderingMode::Opcode:
    case ReorderingMode::Constant:
    case ReorderingMode::Splat:
    case ReorderingMode::Failed:
    const unsigned Indent = 2;
    for (const OperandDataVec &OpDataVec : OpsVec) {
      OS << "Operand " << Cnt++ << "\n";
      for (const OperandData &OpData : OpDataVec) {
        OS.indent(Indent) << "{";
        if (Value *V = OpData.V)
        OS << ", APO:" << OpData.APO << "}\n";
  int BestScore = Limit;
  std::optional<int> Index;
  for (int I : seq<int>(0, Candidates.size())) {
                                               Candidates[I].second,
    if (Score > BestScore) {
    DeletedInstructions.insert(I);
  template <typename T>
      ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales) {
    for (T *V : DeadVals) {
    for (T *V : DeadVals) {
      if (!V || !Processed.insert(V).second)
      for (Use &U : I->operands()) {
            OpI && !DeletedInstructions.contains(OpI) && OpI->hasOneUser() &&
            (Entries.empty() || none_of(Entries, [&](const TreeEntry *Entry) {
               return Entry->VectorizedValue == OpI;
      I->dropAllReferences();
    for (T *V : DeadVals) {
      if (!I->getParent())
                             cast<Instruction>(U.getUser()));
             "trying to erase instruction with users.");
      I->removeFromParent();
    while (!DeadInsts.empty()) {
      if (!VI || !VI->getParent())
             "Live instruction found in dead worklist!");
      assert(VI->use_empty() && "Instructions with uses are not dead.");
      for (Use &OpU : VI->operands()) {
        Value *OpV = OpU.get();
          if (!DeletedInstructions.contains(OpI) &&
              (!OpI->getType()->isVectorTy() ||
               none_of(VectorValuesAndScales,
                       [&](const std::tuple<Value *, unsigned, bool> &V) {
                         return std::get<0>(V) == OpI;
      VI->removeFromParent();
      SE->forgetValue(VI);
    return AnalyzedReductionsRoots.count(I);
    AnalyzedReductionsRoots.insert(I);
    return AnalyzedReductionVals.contains(hash_value(VL));
    AnalyzedReductionVals.insert(hash_value(VL));
    AnalyzedReductionsRoots.clear();
    AnalyzedReductionVals.clear();
    AnalyzedMinBWVals.clear();
    return MustGather.contains(V);
    return NonScheduledFirst.contains(V);
    assert(V && "V cannot be nullptr.");
    return ScalarToTreeEntries.contains(V);
  bool collectValuesToDemote(
      const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
      bool &IsProfitableToDemote, bool IsTruncRoot) const;
  void buildReorderableOperands(
  void reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const;
  bool areAllUsersVectorized(
  const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const;
  TreeEntry *getOperandEntry(TreeEntry *E, unsigned Idx) {
    return const_cast<TreeEntry *>(
        getOperandEntry(const_cast<const TreeEntry *>(E), Idx));
  Instruction *getRootEntryInstruction(const TreeEntry &Entry) const;
  getCastContextHint(const TreeEntry &TE) const;
                             const InstructionsState &LocalState,
                             unsigned InterleaveFactor = 0);
                     bool ResizeAllowed = false) const;
  Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx);
  template <typename BVTy, typename ResTy, typename... Args>
  ResTy processBuildVector(const TreeEntry *E, Type *ScalarTy, Args &...Params);
  Value *createBuildVector(const TreeEntry *E, Type *ScalarTy);
  Instruction &getLastInstructionInBundle(const TreeEntry *E);
  std::optional<TargetTransformInfo::ShuffleKind>
                                    unsigned NumParts) const;
  std::optional<TargetTransformInfo::ShuffleKind>
  isGatherShuffledSingleRegisterEntry(
  isGatherShuffledEntry(
      unsigned NumParts, bool ForOrder = false);
                             Type *ScalarTy) const;
  void setInsertPointAfterBundle(const TreeEntry *E);
  bool isFullyVectorizableTinyTree(bool ForReduction) const;
  void tryToVectorizeGatheredLoads(
      std::tuple<BasicBlock *, Value *, Type *>,
  collectUserStores(const BoUpSLP::TreeEntry *TE) const;
  findExternalStoreUsersReorderIndices(TreeEntry *TE) const;
  void reorderGatherNode(TreeEntry &TE);
    TreeEntry(VecTreeTy &Container) : Container(Container) {}
      if (State == TreeEntry::SplitVectorize)
      SmallVector<int> Mask;
    SmallVector<int> getSplitMask() const {
      assert(State == TreeEntry::SplitVectorize && !ReorderIndices.empty() &&
             "Expected only split vectorize node.");
      unsigned CommonVF = std::max<unsigned>(
          CombinedEntriesWithIndices.back().second,
          Scalars.size() - CombinedEntriesWithIndices.back().second);
      for (auto [Idx, I] : enumerate(ReorderIndices))
            Idx + (Idx >= CombinedEntriesWithIndices.back().second
                       ? CommonVF - CombinedEntriesWithIndices.back().second
    void reorderSplitNode(unsigned Idx, ArrayRef<int> Mask,
                          ArrayRef<int> MaskOrder);
      if (Mask.size() != VL.size() && VL.size() == Scalars.size())
        return std::equal(VL.begin(), VL.end(), Scalars.begin());
                      [Scalars](Value *V, int Idx) {
                        return (isa<UndefValue>(V) &&
                                Idx == PoisonMaskElem) ||
                               (Idx != PoisonMaskElem && V == Scalars[Idx]);
      if (!ReorderIndices.empty()) {
        SmallVector<int> Mask;
        if (VL.size() == Scalars.size())
          return IsSame(Scalars, Mask);
        if (VL.size() == ReuseShuffleIndices.size()) {
          return IsSame(Scalars, Mask);
      return IsSame(Scalars, ReuseShuffleIndices);
    bool hasEqualOperands(const TreeEntry &TE) const {
      if (TE.getNumOperands() != getNumOperands())
      SmallBitVector Used(getNumOperands());
      for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {
        unsigned PrevCount = Used.count();
        for (unsigned K = 0; K < E; ++K) {
          if (getOperand(K) == TE.getOperand(I)) {
        if (PrevCount == Used.count())
    unsigned getVectorFactor() const {
      if (!ReuseShuffleIndices.empty())
        return ReuseShuffleIndices.size();
      return Scalars.size();
    bool isGather() const { return State == NeedToGather; }
    WeakTrackingVH VectorizedValue = nullptr;
    enum CombinedOpcode {
      MinMax = Instruction::OtherOpsEnd + 1,
    CombinedOpcode CombinedOp = NotCombinedOp;
    SmallVector<int, 4> ReuseShuffleIndices;
    SmallVector<unsigned, 4> ReorderIndices;
    VecTreeTy &Container;
    EdgeInfo UserTreeIndex;
    SmallPtrSet<const Value *, 4> CopyableElements;
    InstructionsState S = InstructionsState::invalid();
    unsigned InterleaveFactor = 0;
    bool DoesNotNeedToSchedule = false;
      if (Operands.size() < OpIdx + 1)
        Operands.resize(OpIdx + 1);
             "Number of operands is greater than the number of scalars.");
    unsigned getInterleaveFactor() const { return InterleaveFactor; }
    void setInterleave(unsigned Factor) { InterleaveFactor = Factor; }
    void setDoesNotNeedToSchedule() { DoesNotNeedToSchedule = true; }
    bool doesNotNeedToSchedule() const { return DoesNotNeedToSchedule; }
        setOperand(I, Operands[I]);
    void reorderOperands(ArrayRef<int> Mask) {
      return Operands[OpIdx];
      return Operands[OpIdx];
    unsigned getNumOperands() const { return Operands.size(); }
    Value *getSingleOperand(unsigned OpIdx) const {
      return Operands[OpIdx][0];
    bool isAltShuffle() const { return S.isAltShuffle(); }
    Instruction *getMatchingMainOpOrAltOp(Instruction *I) const {
      return S.getMatchingMainOpOrAltOp(I);
      if (I && getMatchingMainOpOrAltOp(I))
      return S.getMainOp();
    void setOperations(const InstructionsState &S) {
      assert(S && "InstructionsState is invalid.");
    Instruction *getMainOp() const { return S.getMainOp(); }
    Instruction *getAltOp() const { return S.getAltOp(); }
    unsigned getOpcode() const { return S.getOpcode(); }
    unsigned getAltOpcode() const { return S.getAltOpcode(); }
    bool hasState() const { return S.valid(); }
    void addCopyableElement(Value *V) {
      assert(S.isCopyableElement(V) && "Not a copyable element.");
      CopyableElements.insert(V);
    bool isCopyableElement(Value *V) const {
      return CopyableElements.contains(V);
    bool hasCopyableElements() const { return !CopyableElements.empty(); }
    const InstructionsState &getOperations() const { return S; }
    unsigned findLaneForValue(Value *V) const {
      unsigned FoundLane = getVectorFactor();
      for (auto *It = find(Scalars, V), *End = Scalars.end(); It != End;
           std::advance(It, 1)) {
        FoundLane = std::distance(Scalars.begin(), It);
        assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
        if (!ReorderIndices.empty())
          FoundLane = ReorderIndices[FoundLane];
        assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
        if (ReuseShuffleIndices.empty())
        if (auto *RIt = find(ReuseShuffleIndices, FoundLane);
            RIt != ReuseShuffleIndices.end()) {
          FoundLane = std::distance(ReuseShuffleIndices.begin(), RIt);
      assert(FoundLane < getVectorFactor() && "Unable to find given value.");
    buildAltOpShuffleMask(const function_ref<bool(Instruction *)> IsAltOp,
                          SmallVectorImpl<int> &Mask,
                          SmallVectorImpl<Value *> *OpScalars = nullptr,
                          SmallVectorImpl<Value *> *AltScalars = nullptr) const;
    bool isNonPowOf2Vec() const {
      return IsNonPowerOf2;
    hasNonWholeRegisterOrNonPowerOf2Vec(const TargetTransformInfo &TTI) const {
      assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) &&
             "Reshuffling not supported with non-power-of-2 vectors yet.");
      return IsNonPowerOf2;
    Value *getOrdered(unsigned Idx) const {
      assert(isGather() && "Must be used only for buildvectors/gathers.");
      if (ReorderIndices.empty())
        return Scalars[Idx];
      SmallVector<int> Mask;
      return Scalars[Mask[Idx]];
      dbgs() << Idx << ".\n";
      for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
        dbgs() << "Operand " << OpI << ":\n";
        for (const Value *V : Operands[OpI])
      dbgs() << "Scalars: \n";
      for (Value *V : Scalars)
      dbgs() << "State: ";
      if (S && hasCopyableElements())
        dbgs() << "[[Copyable]] ";
        if (InterleaveFactor > 0) {
          dbgs() << "Vectorize with interleave factor " << InterleaveFactor
          dbgs() << "Vectorize\n";
      case ScatterVectorize:
        dbgs() << "ScatterVectorize\n";
      case StridedVectorize:
        dbgs() << "StridedVectorize\n";
      case CompressVectorize:
        dbgs() << "CompressVectorize\n";
        dbgs() << "NeedToGather\n";
      case CombinedVectorize:
        dbgs() << "CombinedVectorize\n";
      case SplitVectorize:
        dbgs() << "SplitVectorize\n";
        dbgs() << "MainOp: " << *S.getMainOp() << "\n";
        dbgs() << "AltOp: " << *S.getAltOp() << "\n";
        dbgs() << "MainOp: NULL\n";
        dbgs() << "AltOp: NULL\n";
      dbgs() << "VectorizedValue: ";
      if (VectorizedValue)
        dbgs() << *VectorizedValue << "\n";
      dbgs() << "ReuseShuffleIndices: ";
      if (ReuseShuffleIndices.empty())
        for (int ReuseIdx : ReuseShuffleIndices)
          dbgs() << ReuseIdx << ", ";
      dbgs() << "ReorderIndices: ";
      for (unsigned ReorderIdx : ReorderIndices)
        dbgs() << ReorderIdx << ", ";
      dbgs() << "UserTreeIndex: ";
        dbgs() << UserTreeIndex;
        dbgs() << "<invalid>";
      if (!CombinedEntriesWithIndices.empty()) {
        dbgs() << "Combined entries: ";
          dbgs() << "Entry index " << P.first << " with offset " << P.second;
                     StringRef Banner) const {
    dbgs() << "SLP: " << Banner << ":\n";
    dbgs() << "SLP: Costs:\n";
    dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n";
    dbgs() << "SLP: VectorCost = " << VecCost << "\n";
    dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n";
    dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = "
           << ReuseShuffleCost + VecCost - ScalarCost << "\n";
                          const InstructionsState &S,
                          ArrayRef<int> ReuseShuffleIndices = {}) {
    auto Invalid = ScheduleBundle::invalid();
    return newTreeEntry(VL, Invalid, S, UserTreeIdx, ReuseShuffleIndices);
                          const InstructionsState &S,
                          ArrayRef<int> ReuseShuffleIndices = {},
                          ArrayRef<unsigned> ReorderIndices = {},
                          unsigned InterleaveFactor = 0) {
    TreeEntry::EntryState EntryState =
        Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
    TreeEntry *E = newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
                                ReuseShuffleIndices, ReorderIndices);
    if (E && InterleaveFactor > 0)
      E->setInterleave(InterleaveFactor);
                          TreeEntry::EntryState EntryState,
                          ScheduleBundle &Bundle, const InstructionsState &S,
                          ArrayRef<int> ReuseShuffleIndices = {},
                          ArrayRef<unsigned> ReorderIndices = {}) {
    assert(((!Bundle && (EntryState == TreeEntry::NeedToGather ||
                         EntryState == TreeEntry::SplitVectorize)) ||
            (Bundle && EntryState != TreeEntry::NeedToGather &&
             EntryState != TreeEntry::SplitVectorize)) &&
           "Need to vectorize gather entry?");
    if (GatheredLoadsEntriesFirst.has_value() &&
        EntryState == TreeEntry::NeedToGather && S &&
        S.getOpcode() == Instruction::Load && UserTreeIdx.EdgeIdx == UINT_MAX &&
        !UserTreeIdx.UserTE)
    VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
    TreeEntry *Last = VectorizableTree.back().get();
    Last->Idx = VectorizableTree.size() - 1;
    Last->State = EntryState;
    if (UserTreeIdx.UserTE)
      OperandsToTreeEntry.try_emplace(
          std::make_pair(UserTreeIdx.UserTE, UserTreeIdx.EdgeIdx), Last);
            ReuseShuffleIndices.empty()) &&
           "Reshuffling scalars not yet supported for nodes with padding");
    Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
                                     ReuseShuffleIndices.end());
    if (ReorderIndices.empty()) {
      Last->setOperations(S);
      Last->Scalars.assign(VL.size(), nullptr);
                [VL](unsigned Idx) -> Value * {
                  if (Idx >= VL.size())
                    return UndefValue::get(VL.front()->getType());
      Last->setOperations(S);
      Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
    if (EntryState == TreeEntry::SplitVectorize) {
      assert(S && "Split nodes must have operations.");
      Last->setOperations(S);
      SmallPtrSet<Value *, 4> Processed;
      for (Value *V : VL) {
        auto It = ScalarsInSplitNodes.find(V);
        if (It == ScalarsInSplitNodes.end()) {
          ScalarsInSplitNodes.try_emplace(V).first->getSecond().push_back(Last);
          (void)Processed.insert(V);
        } else if (Processed.insert(V).second) {
                 "Value already associated with the node.");
          It->getSecond().push_back(Last);
    } else if (!Last->isGather()) {
          (!S.areInstructionsWithCopyableElements() &&
          all_of(VL, [&](Value *V) { return S.isNonSchedulable(V); }))
        Last->setDoesNotNeedToSchedule();
      SmallPtrSet<Value *, 4> Processed;
      for (Value *V : VL) {
        if (S.isCopyableElement(V)) {
          Last->addCopyableElement(V);
        auto It = ScalarToTreeEntries.find(V);
        if (It == ScalarToTreeEntries.end()) {
          ScalarToTreeEntries.try_emplace(V).first->getSecond().push_back(Last);
          (void)Processed.insert(V);
        } else if (Processed.insert(V).second) {
                 "Value already associated with the node.");
          It->getSecond().push_back(Last);
      assert((!Bundle.getBundle().empty() || Last->doesNotNeedToSchedule()) &&
             "Bundle and VL out of sync");
      if (!Bundle.getBundle().empty()) {
#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
        auto *BundleMember = Bundle.getBundle().begin();
        SmallPtrSet<Value *, 4> Processed;
        for (Value *V : VL) {
          if (S.isNonSchedulable(V) || !Processed.insert(V).second)
        assert(BundleMember == Bundle.getBundle().end() &&
               "Bundle and VL out of sync");
        Bundle.setTreeEntry(Last);
      bool AllConstsOrCasts = true;
      for (Value *V : VL) {
        if (S && S.areInstructionsWithCopyableElements() &&
            S.isCopyableElement(V))
          Last->addCopyableElement(V);
        AllConstsOrCasts &= I && I->getType()->isIntegerTy();
        if (UserTreeIdx.EdgeIdx != UINT_MAX || !UserTreeIdx.UserTE ||
            !UserTreeIdx.UserTE->isGather())
          ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last);
      if (AllConstsOrCasts)
            std::make_pair(std::numeric_limits<unsigned>::max(), 1);
      MustGather.insert_range(VL);
    if (UserTreeIdx.UserTE)
      Last->UserTreeIndex = UserTreeIdx;
  TreeEntry::VecTreeTy VectorizableTree;
    for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
      VectorizableTree[Id]->dump();
    assert(V && "V cannot be nullptr.");
    auto It = ScalarToTreeEntries.find(V);
    if (It == ScalarToTreeEntries.end())
    return It->getSecond();
    assert(V && "V cannot be nullptr.");
    auto It = ScalarsInSplitNodes.find(V);
    if (It == ScalarsInSplitNodes.end())
    return It->getSecond();
                              bool SameVF = false) const {
    assert(V && "V cannot be nullptr.");
    for (TreeEntry *TE : ScalarToTreeEntries.lookup(V))
      if ((!SameVF || TE->getVectorFactor() == VL.size()) && TE->isSame(VL))
  bool areAltOperandsProfitable(const InstructionsState &S,
  class ScalarsVectorizationLegality {
    InstructionsState S;
    bool TryToFindDuplicates;
    bool TrySplitVectorize;
    ScalarsVectorizationLegality(InstructionsState S, bool IsLegal,
                                 bool TryToFindDuplicates = true,
                                 bool TrySplitVectorize = false)
        : S(S), IsLegal(IsLegal), TryToFindDuplicates(TryToFindDuplicates),
          TrySplitVectorize(TrySplitVectorize) {
      assert((!IsLegal || (S.valid() && TryToFindDuplicates)) &&
             "Inconsistent state");
    const InstructionsState &getInstructionsState() const { return S; }
    bool isLegal() const { return IsLegal; }
    bool tryToFindDuplicates() const { return TryToFindDuplicates; }
    bool trySplitVectorize() const { return TrySplitVectorize; }
  ScalarsVectorizationLegality
                              bool TryCopyableElementsVectorization) const;
  TreeEntry::EntryState getScalarsVectorizationState(
      bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
      SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo);
  SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarToTreeEntries;
  SmallDenseMap<std::pair<const TreeEntry *, unsigned>, TreeEntry *>
      OperandsToTreeEntry;
  SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarsInSplitNodes;
  SmallDenseMap<Value *, unsigned> InstrElementSize;
  SmallDenseMap<const TreeEntry *, WeakTrackingVH> EntryToLastInstruction;
  SetVector<const TreeEntry *> PostponedGathers;
  using ValueToGatherNodesMap =
      DenseMap<Value *, SmallSetVector<const TreeEntry *, 4>>;
  ValueToGatherNodesMap ValueToGatherNodes;
  SetVector<unsigned> LoadEntriesToVectorize;
  bool IsGraphTransformMode = false;
  std::optional<unsigned> GatheredLoadsEntriesFirst;
  SmallDenseMap<const TreeEntry *,
                std::tuple<SmallVector<int>, VectorType *, unsigned, bool>>
      CompressEntryToData;
  struct ExternalUser {
    ExternalUser(Value *S, llvm::User *U, const TreeEntry &E, unsigned L)
        : Scalar(S), User(U), E(E), Lane(L) {}
    Value *Scalar = nullptr;
    llvm::User *User = nullptr;
  using UserList = SmallVector<ExternalUser, 16>;
4647 bool isAliased(
const MemoryLocation &Loc1, Instruction *Inst1,
4648 Instruction *Inst2) {
4651 AliasCacheKey
Key = std::make_pair(Inst1, Inst2);
4652 auto Res = AliasCache.try_emplace(
Key);
4654 return Res.first->second;
4655 bool Aliased =
isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1));
4657 Res.first->getSecond() = Aliased;
4661 using AliasCacheKey = std::pair<Instruction *, Instruction *>;
4665 SmallDenseMap<AliasCacheKey, bool> AliasCache;
4670 BatchAAResults BatchAA;
4677 DenseSet<Instruction *> DeletedInstructions;
4680 SmallPtrSet<Instruction *, 16> AnalyzedReductionsRoots;
4683 DenseSet<size_t> AnalyzedReductionVals;
4687 DenseSet<Value *> AnalyzedMinBWVals;
4693 UserList ExternalUses;
4697 SmallPtrSet<Value *, 4> ExternalUsesAsOriginalScalar;
4701 SmallPtrSet<Value *, 4> ExternalUsesWithNonUsers;
4704 SmallPtrSet<const Value *, 32> EphValues;
4708 SetVector<Instruction *> GatherShuffleExtractSeq;
4711 DenseSet<BasicBlock *> CSEBlocks;
4714 DenseSet<size_t> ListOfKnonwnNonVectorizableLoads;
4721 class ScheduleEntity {
4722 friend class ScheduleBundle;
4723 friend class ScheduleData;
4724 friend class ScheduleCopyableData;
4727 enum class Kind { ScheduleData, ScheduleBundle, ScheduleCopyableData };
4728 Kind getKind()
const {
return K; }
4729 ScheduleEntity(Kind K) : K(K) {}
4733 int SchedulingPriority = 0;
4736 bool IsScheduled =
false;
4738 const Kind K = Kind::ScheduleData;
4741 ScheduleEntity() =
delete;
4743 void setSchedulingPriority(
int Priority) { SchedulingPriority = Priority; }
4744 int getSchedulingPriority()
const {
return SchedulingPriority; }
4745 bool isReady()
const {
4747 return SD->isReady();
4749 return CD->isReady();
4755 bool hasValidDependencies()
const {
4757 return SD->hasValidDependencies();
4759 return CD->hasValidDependencies();
4763 int getUnscheduledDeps()
const {
4765 return SD->getUnscheduledDeps();
4767 return CD->getUnscheduledDeps();
4771 int incrementUnscheduledDeps(
int Incr) {
4773 return SD->incrementUnscheduledDeps(Incr);
4777 int getDependencies()
const {
4779 return SD->getDependencies();
4785 return SD->getInst();
4790 bool isScheduled()
const {
return IsScheduled; }
4791 void setScheduled(
bool Scheduled) { IsScheduled = Scheduled; }
4793 static bool classof(
const ScheduleEntity *) {
return true; }
4795#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4796 void dump(raw_ostream &OS)
const {
4798 return SD->dump(OS);
4800 return CD->dump(OS);
4811#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4813 const BoUpSLP::ScheduleEntity &SE) {
4823 class ScheduleData final :
public ScheduleEntity {
4827 enum { InvalidDeps = -1 };
4829 ScheduleData() : ScheduleEntity(Kind::ScheduleData) {}
4830 static bool classof(
const ScheduleEntity *Entity) {
4831 return Entity->getKind() == Kind::ScheduleData;
4834 void init(
int BlockSchedulingRegionID, Instruction *
I) {
4835 NextLoadStore =
nullptr;
4836 IsScheduled =
false;
4837 SchedulingRegionID = BlockSchedulingRegionID;
4838 clearDependencies();
4844 if (hasValidDependencies()) {
4845 assert(UnscheduledDeps <= Dependencies &&
"invariant");
4847 assert(UnscheduledDeps == Dependencies &&
"invariant");
4851 assert(hasValidDependencies() && UnscheduledDeps == 0 &&
4852 "unexpected scheduled state");
4859 bool hasValidDependencies()
const {
return Dependencies != InvalidDeps; }
4863 bool isReady()
const {
return UnscheduledDeps == 0 && !IsScheduled; }
4868 int incrementUnscheduledDeps(
int Incr) {
4869 assert(hasValidDependencies() &&
4870 "increment of unscheduled deps would be meaningless");
4871 UnscheduledDeps += Incr;
4872 return UnscheduledDeps;
4877 void resetUnscheduledDeps() { UnscheduledDeps = Dependencies; }
4880 void clearDependencies() {
4881 clearDirectDependencies();
4882 MemoryDependencies.clear();
4883 ControlDependencies.clear();
4890 void clearDirectDependencies() {
4891 Dependencies = InvalidDeps;
4892 resetUnscheduledDeps();
4893 IsScheduled =
false;
4897 int getUnscheduledDeps()
const {
return UnscheduledDeps; }
4899 int getDependencies()
const {
return Dependencies; }
4901 void initDependencies() { Dependencies = 0; }
4903 void incDependencies() { Dependencies++; }
4906 int getSchedulingRegionID()
const {
return SchedulingRegionID; }
4913 return MemoryDependencies;
4916 void addMemoryDependency(ScheduleData *Dep) {
4917 MemoryDependencies.push_back(Dep);
4921 return ControlDependencies;
4924 void addControlDependency(ScheduleData *Dep) {
4925 ControlDependencies.push_back(Dep);
4928 ScheduleData *getNextLoadStore()
const {
return NextLoadStore; }
4929 void setNextLoadStore(ScheduleData *
Next) { NextLoadStore =
Next; }
4931 void dump(raw_ostream &OS)
const { OS << *Inst; }
4943 ScheduleData *NextLoadStore =
nullptr;
4947 SmallVector<ScheduleData *> MemoryDependencies;
4953 SmallVector<ScheduleData *> ControlDependencies;
4957 int SchedulingRegionID = 0;
4963 int Dependencies = InvalidDeps;
4969 int UnscheduledDeps = InvalidDeps;
4974 const BoUpSLP::ScheduleData &SD) {
4980 class ScheduleBundle final :
public ScheduleEntity {
4984 bool IsValid =
true;
4986 TreeEntry *TE =
nullptr;
4987 ScheduleBundle(
bool IsValid)
4988 : ScheduleEntity(Kind::ScheduleBundle), IsValid(IsValid) {}
4991 ScheduleBundle() : ScheduleEntity(Kind::ScheduleBundle) {}
4992 static bool classof(
const ScheduleEntity *Entity) {
4993 return Entity->getKind() == Kind::ScheduleBundle;
4998 for (
const ScheduleEntity *SD : Bundle) {
4999 if (SD->hasValidDependencies()) {
5000 assert(SD->getUnscheduledDeps() <= SD->getDependencies() &&
5003 assert(SD->getUnscheduledDeps() == SD->getDependencies() &&
5007 if (isScheduled()) {
5008 assert(SD->hasValidDependencies() && SD->getUnscheduledDeps() == 0 &&
5009 "unexpected scheduled state");
5015 int unscheduledDepsInBundle()
const {
5016 assert(*
this &&
"bundle must not be empty");
5018 for (
const ScheduleEntity *BundleMember : Bundle) {
5019 if (BundleMember->getUnscheduledDeps() == ScheduleData::InvalidDeps)
5020 return ScheduleData::InvalidDeps;
5021 Sum += BundleMember->getUnscheduledDeps();
5029 bool hasValidDependencies()
const {
5030 return all_of(Bundle, [](
const ScheduleEntity *SD) {
5031 return SD->hasValidDependencies();
5037 bool isReady()
const {
5038 assert(*
this &&
"bundle must not be empty");
5039 return unscheduledDepsInBundle() == 0 && !isScheduled();
5047 void add(ScheduleEntity *SD) { Bundle.push_back(SD); }
5050 void setTreeEntry(TreeEntry *TE) { this->TE = TE; }
5051 TreeEntry *getTreeEntry()
const {
return TE; }
5053 static ScheduleBundle invalid() {
return {
false}; }
5055 operator bool()
const {
return IsValid; }
5058 void dump(raw_ostream &OS)
const {
5067 OS << *SD->getInst();
5081 const BoUpSLP::ScheduleBundle &Bundle) {
5092 class ScheduleCopyableData final :
public ScheduleEntity {
5099 int SchedulingRegionID = 0;
5101 ScheduleBundle &Bundle;
5104 ScheduleCopyableData(
int BlockSchedulingRegionID,
Instruction *
I,
5105 const EdgeInfo &EI, ScheduleBundle &Bundle)
5106 : ScheduleEntity(Kind::ScheduleCopyableData), Inst(
I), EI(EI),
5107 SchedulingRegionID(BlockSchedulingRegionID), Bundle(Bundle) {}
5108 static bool classof(
const ScheduleEntity *Entity) {
5109 return Entity->getKind() == Kind::ScheduleCopyableData;
5114 if (hasValidDependencies()) {
5115 assert(UnscheduledDeps <= Dependencies &&
"invariant");
5117 assert(UnscheduledDeps == Dependencies &&
"invariant");
5121 assert(hasValidDependencies() && UnscheduledDeps == 0 &&
5122 "unexpected scheduled state");
5129 bool hasValidDependencies()
const {
5130 return Dependencies != ScheduleData::InvalidDeps;
5135 bool isReady()
const {
return UnscheduledDeps == 0 && !IsScheduled; }
5140 int incrementUnscheduledDeps(
int Incr) {
5141 assert(hasValidDependencies() &&
5142 "increment of unscheduled deps would be meaningless");
5143 UnscheduledDeps += Incr;
5144 assert(UnscheduledDeps >= 0 &&
"invariant");
5145 return UnscheduledDeps;
5150 void resetUnscheduledDeps() { UnscheduledDeps = Dependencies; }
5153 int getUnscheduledDeps()
const {
return UnscheduledDeps; }
5155 int getDependencies()
const {
return Dependencies; }
5157 void initDependencies() { Dependencies = 0; }
5159 void incDependencies() { Dependencies++; }
5162 int getSchedulingRegionID()
const {
return SchedulingRegionID; }
5168 void clearDependencies() {
5169 Dependencies = ScheduleData::InvalidDeps;
5170 UnscheduledDeps = ScheduleData::InvalidDeps;
5171 IsScheduled =
false;
5175 const EdgeInfo &getEdgeInfo()
const {
return EI; }
5178 ScheduleBundle &getBundle() {
return Bundle; }
5179 const ScheduleBundle &getBundle()
const {
return Bundle; }
5181#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
5182 void dump(raw_ostream &OS)
const { OS <<
"[Copyable]" << *getInst(); }
5193 int Dependencies = ScheduleData::InvalidDeps;
5199 int UnscheduledDeps = ScheduleData::InvalidDeps;
5229 struct BlockScheduling {
5231 : BB(BB), ChunkSize(BB->
size()), ChunkPos(ChunkSize) {}
5234 ScheduledBundles.clear();
5235 ScheduledBundlesList.
clear();
5236 ScheduleCopyableDataMap.clear();
5237 ScheduleCopyableDataMapByInst.clear();
5238 ScheduleCopyableDataMapByInstUser.clear();
5239 ScheduleCopyableDataMapByUsers.clear();
5241 ScheduleStart =
nullptr;
5242 ScheduleEnd =
nullptr;
5243 FirstLoadStoreInRegion =
nullptr;
5244 LastLoadStoreInRegion =
nullptr;
5245 RegionHasStackSave =
false;
5249 ScheduleRegionSizeLimit -= ScheduleRegionSize;
5252 ScheduleRegionSize = 0;
5256 ++SchedulingRegionID;
5262 if (BB !=
I->getParent())
5265 ScheduleData *SD = ScheduleDataMap.lookup(
I);
5266 if (SD && isInSchedulingRegion(*SD))
5271 ScheduleData *getScheduleData(
Value *V) {
5277 ScheduleCopyableData *getScheduleCopyableData(
const EdgeInfo &EI,
5278 const Value *V)
const {
5279 if (ScheduleCopyableDataMap.empty())
5281 auto It = ScheduleCopyableDataMap.find(std::make_pair(EI, V));
5282 if (It == ScheduleCopyableDataMap.end())
5284 ScheduleCopyableData *SD = It->getSecond().get();
5285 if (!isInSchedulingRegion(*SD))
5293 getScheduleCopyableData(
const Value *User,
unsigned OperandIdx,
5295 if (ScheduleCopyableDataMapByInstUser.empty())
5297 const auto It = ScheduleCopyableDataMapByInstUser.find(
5298 std::make_pair(std::make_pair(User, OperandIdx), V));
5299 if (It == ScheduleCopyableDataMapByInstUser.end())
5302 for (ScheduleCopyableData *SD : It->getSecond()) {
5303 if (isInSchedulingRegion(*SD))
5317 bool areAllOperandsReplacedByCopyableData(Instruction *User,
5321 if (ScheduleCopyableDataMap.empty())
5323 SmallDenseMap<TreeEntry *, unsigned> PotentiallyReorderedEntriesCount;
5324 SmallDenseMap<const TreeEntry *, unsigned> OrderedEntriesCount;
5325 for (
const Use &U :
User->operands()) {
5329 if (Entries.
empty())
5333 for (TreeEntry *TE : Entries) {
5339 bool IsCommutativeUser =
5344 OrderedEntriesCount.
try_emplace(TE, 0).first->getSecond();
5345 EdgeInfo EI(TE,
U.getOperandNo());
5346 if (!getScheduleCopyableData(EI,
Op))
5352 ++PotentiallyReorderedEntriesCount.
try_emplace(TE, 0)
5353 .first->getSecond();
5356 if (PotentiallyReorderedEntriesCount.
empty())
5357 return all_of(OrderedEntriesCount,
5358 [&](
const std::pair<const TreeEntry *, unsigned> &
P) {
5362 for (
auto &
P : PotentiallyReorderedEntriesCount) {
5363 auto *It =
find(
P.first->Scalars, User);
5364 assert(It !=
P.first->Scalars.end() &&
"User is not in the tree entry");
5365 int Lane = std::distance(
P.first->Scalars.begin(), It);
5366 assert(Lane >= 0 &&
"Lane is not found");
5368 Lane =
P.first->ReorderIndices[Lane];
5369 assert(Lane <
static_cast<int>(
P.first->Scalars.size()) &&
5370 "Couldn't find extract lane");
5371 for (
unsigned OpIdx :
5373 P.first->getMainOp()))) {
5374 if (
P.first->getOperand(
OpIdx)[Lane] ==
Op &&
5375 getScheduleCopyableData(EdgeInfo(
P.first,
OpIdx),
Op))
5379 return all_of(PotentiallyReorderedEntriesCount,
5380 [&](
const std::pair<const TreeEntry *, unsigned> &
P) {
5381 return P.second ==
NumOps - 1;
5383 all_of(OrderedEntriesCount,
5384 [&](
const std::pair<const TreeEntry *, unsigned> &
P) {
5390 getScheduleCopyableData(
const Instruction *
I)
const {
5391 if (ScheduleCopyableDataMapByInst.empty())
5393 const auto It = ScheduleCopyableDataMapByInst.find(
I);
5394 if (It == ScheduleCopyableDataMapByInst.end())
5397 for (ScheduleCopyableData *SD : It->getSecond()) {
5398 if (isInSchedulingRegion(*SD))
5405 getScheduleCopyableDataUsers(
const Instruction *User)
const {
5406 if (ScheduleCopyableDataMapByUsers.empty())
5408 const auto It = ScheduleCopyableDataMapByUsers.find(User);
5409 if (It == ScheduleCopyableDataMapByUsers.end())
5412 for (ScheduleCopyableData *SD : It->getSecond()) {
5413 if (isInSchedulingRegion(*SD))
5419 ScheduleCopyableData &addScheduleCopyableData(
const EdgeInfo &EI,
5421 int SchedulingRegionID,
5422 ScheduleBundle &Bundle) {
5423 assert(!getScheduleCopyableData(EI,
I) &&
"already in the map");
5424 ScheduleCopyableData *CD =
5425 ScheduleCopyableDataMap
5426 .try_emplace(std::make_pair(EI,
I),
5427 std::make_unique<ScheduleCopyableData>(
5428 SchedulingRegionID,
I, EI, Bundle))
5431 ScheduleCopyableDataMapByInst[
I].push_back(CD);
5435 assert(It !=
Op.end() &&
"Lane not set");
5436 SmallPtrSet<Instruction *, 4> Visited;
5438 int Lane = std::distance(
Op.begin(), It);
5439 assert(Lane >= 0 &&
"Lane not set");
5441 !EI.UserTE->ReorderIndices.empty())
5442 Lane = EI.UserTE->ReorderIndices[Lane];
5443 assert(Lane <
static_cast<int>(EI.UserTE->Scalars.size()) &&
5444 "Couldn't find extract lane");
5446 if (!Visited.
insert(In).second) {
5450 ScheduleCopyableDataMapByInstUser
5451 .try_emplace(std::make_pair(std::make_pair(In, EI.EdgeIdx),
I))
5454 ScheduleCopyableDataMapByUsers.try_emplace(
I)
5461 EdgeInfo UserEI = EI.UserTE->UserTreeIndex;
5462 if (ScheduleCopyableData *UserCD =
5463 getScheduleCopyableData(UserEI, In))
5464 ScheduleCopyableDataMapByUsers[
I].remove(UserCD);
5467 }
while (It !=
Op.end());
5469 ScheduleCopyableDataMapByUsers.try_emplace(
I).first->getSecond().insert(
5479 auto It = ScheduledBundles.find(
I);
5480 if (It == ScheduledBundles.end())
5482 return It->getSecond();
5486 bool isInSchedulingRegion(
const ScheduleEntity &SD)
const {
5488 return Data->getSchedulingRegionID() == SchedulingRegionID;
5490 return CD->getSchedulingRegionID() == SchedulingRegionID;
5492 [&](
const ScheduleEntity *BundleMember) {
5493 return isInSchedulingRegion(*BundleMember);
5499 template <
typename ReadyListType>
5500 void schedule(
const BoUpSLP &R,
const InstructionsState &S,
5501 const EdgeInfo &EI, ScheduleEntity *
Data,
5502 ReadyListType &ReadyList) {
5503 auto ProcessBundleMember = [&](ScheduleEntity *BundleMember,
5508 auto DecrUnsched = [&](
auto *
Data,
bool IsControl =
false) {
5509 if ((IsControl ||
Data->hasValidDependencies()) &&
5510 Data->incrementUnscheduledDeps(-1) == 0) {
5517 CopyableBundle.
push_back(&CD->getBundle());
5518 Bundles = CopyableBundle;
5520 Bundles = getScheduleBundles(
Data->getInst());
5522 if (!Bundles.
empty()) {
5523 for (ScheduleBundle *Bundle : Bundles) {
5524 if (Bundle->unscheduledDepsInBundle() == 0) {
5525 assert(!Bundle->isScheduled() &&
5526 "already scheduled bundle gets ready");
5527 ReadyList.insert(Bundle);
5529 <<
"SLP: gets ready: " << *Bundle <<
"\n");
5535 "already scheduled bundle gets ready");
5537 "Expected non-copyable data");
5538 ReadyList.insert(
Data);
5545 if (!ScheduleCopyableDataMap.empty()) {
5547 getScheduleCopyableData(User,
OpIdx,
I);
5548 for (ScheduleCopyableData *CD : CopyableData)
5549 DecrUnsched(CD,
false);
5550 if (!CopyableData.empty())
5553 if (ScheduleData *OpSD = getScheduleData(
I))
5554 DecrUnsched(OpSD,
false);
5560 if (!Bundles.empty()) {
5561 auto *
In = BundleMember->getInst();
5563 SmallDenseMap<const Instruction *, unsigned> OperandsUses;
5564 unsigned TotalOpCount = 0;
5567 TotalOpCount = OperandsUses[
In] = 1;
5569 for (
const Use &U :
In->operands()) {
5572 ++Res.first->getSecond();
5579 auto DecrUnschedForInst =
5581 SmallDenseSet<std::pair<const ScheduleEntity *, unsigned>>
5583 if (!ScheduleCopyableDataMap.empty()) {
5584 const EdgeInfo EI = {UserTE,
OpIdx};
5585 if (ScheduleCopyableData *CD =
5586 getScheduleCopyableData(EI,
I)) {
5587 if (!Checked.insert(std::make_pair(CD,
OpIdx)).second)
5589 DecrUnsched(CD,
false);
5593 auto It = OperandsUses.
find(
I);
5594 assert(It != OperandsUses.
end() &&
"Operand not found");
5595 if (It->second > 0) {
5597 assert(TotalOpCount > 0 &&
"No more operands to decrement");
5599 if (ScheduleData *OpSD = getScheduleData(
I)) {
5600 if (!Checked.insert(std::make_pair(OpSD,
OpIdx)).second)
5602 DecrUnsched(OpSD,
false);
5607 for (ScheduleBundle *Bundle : Bundles) {
5608 if (ScheduleCopyableDataMap.empty() && TotalOpCount == 0)
5610 SmallPtrSet<Value *, 4> ParentsUniqueUsers;
5613 auto *It =
find(Bundle->getTreeEntry()->Scalars, In);
5614 SmallDenseSet<std::pair<const ScheduleEntity *, unsigned>> Checked;
5617 std::distance(Bundle->getTreeEntry()->Scalars.begin(), It);
5618 assert(Lane >= 0 &&
"Lane not set");
5620 !Bundle->getTreeEntry()->ReorderIndices.empty())
5621 Lane = Bundle->getTreeEntry()->ReorderIndices[Lane];
5622 assert(Lane <
static_cast<int>(
5623 Bundle->getTreeEntry()->Scalars.size()) &&
5624 "Couldn't find extract lane");
5634 In->getNumOperands() ==
5635 Bundle->getTreeEntry()->getNumOperands() ||
5636 Bundle->getTreeEntry()->isCopyableElement(In)) &&
5637 "Missed TreeEntry operands?");
5639 bool IsNonSchedulableWithParentPhiNode =
5640 Bundle->getTreeEntry()->doesNotNeedToSchedule() &&
5641 Bundle->getTreeEntry()->UserTreeIndex &&
5642 Bundle->getTreeEntry()->UserTreeIndex.UserTE->hasState() &&
5643 Bundle->getTreeEntry()->UserTreeIndex.UserTE->getOpcode() ==
5647 if (IsNonSchedulableWithParentPhiNode) {
5648 const TreeEntry *ParentTE =
5649 Bundle->getTreeEntry()->UserTreeIndex.UserTE;
5651 if (!ParentsUniqueUsers.
insert(User).second)
5655 for (
unsigned OpIdx :
5658 Bundle->getTreeEntry()->getOperand(
OpIdx)[Lane])) {
5661 DecrUnschedForInst(
I, Bundle->getTreeEntry(),
OpIdx, Checked);
5664 if (!IsNonSchedulableWithParentPhiNode)
5666 It = std::find(std::next(It),
5667 Bundle->getTreeEntry()->Scalars.end(), In);
5668 }
while (It != Bundle->getTreeEntry()->Scalars.end());
5673 for (Use &U : BundleMember->getInst()->operands()) {
5676 <<
"SLP: check for readiness (def): " << *
I <<
"\n");
5677 DecrUnschedForInst(BundleMember->getInst(),
U.getOperandNo(),
I);
5685 SmallPtrSet<const ScheduleData *, 4> VisitedMemory;
5686 for (ScheduleData *MemoryDep : SD->getMemoryDependencies()) {
5687 if (!VisitedMemory.
insert(MemoryDep).second)
5692 << *MemoryDep <<
"\n");
5693 DecrUnsched(MemoryDep);
5696 SmallPtrSet<const ScheduleData *, 4> VisitedControl;
5697 for (ScheduleData *Dep : SD->getControlDependencies()) {
5698 if (!VisitedControl.
insert(Dep).second)
5703 <<
"SLP: check for readiness (ctrl): " << *Dep <<
"\n");
5704 DecrUnsched(Dep,
true);
5708 SD->setScheduled(
true);
5713 if (
R.isVectorized(In)) {
5715 for (TreeEntry *TE : Entries) {
5717 In->getNumOperands() !=
TE->getNumOperands())
5720 PseudoBundles.
emplace_back(std::make_unique<ScheduleBundle>());
5721 BundlePtr->setTreeEntry(TE);
5726 ProcessBundleMember(SD, Bundles);
5729 Bundle.setScheduled(
true);
5731 auto AreAllBundlesScheduled =
5732 [&](
const ScheduleEntity *SD,
5736 return !SDBundles.empty() &&
5737 all_of(SDBundles, [&](
const ScheduleBundle *SDBundle) {
5738 return SDBundle->isScheduled();
5741 for (ScheduleEntity *SD : Bundle.getBundle()) {
5744 SDBundles = getScheduleBundles(SD->getInst());
5745 if (AreAllBundlesScheduled(SD, SDBundles)) {
5746 SD->setScheduled(
true);
5759 assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
5760 ScheduleStart->comesBefore(ScheduleEnd) &&
5761 "Not a valid scheduling region?");
5763 for (
auto *
I = ScheduleStart;
I != ScheduleEnd;
I =
I->getNextNode()) {
5765 if (!Bundles.
empty()) {
5766 for (ScheduleBundle *Bundle : Bundles) {
5767 assert(isInSchedulingRegion(*Bundle) &&
5768 "primary schedule data not in window?");
5773 auto *SD = getScheduleData(
I);
5776 assert(isInSchedulingRegion(*SD) &&
5777 "primary schedule data not in window?");
5782 [](
const ScheduleEntity *Bundle) {
5783 return Bundle->isReady();
5785 "item in ready list not ready?");
5789 template <
typename ReadyListType>
5790 void initialFillReadyList(ReadyListType &ReadyList) {
5791 SmallPtrSet<ScheduleBundle *, 16> Visited;
5792 for (
auto *
I = ScheduleStart;
I != ScheduleEnd;
I =
I->getNextNode()) {
5793 ScheduleData *SD = getScheduleData(
I);
5794 if (SD && SD->hasValidDependencies() && SD->isReady()) {
5797 for (ScheduleBundle *Bundle : Bundles) {
5798 if (!Visited.
insert(Bundle).second)
5800 if (Bundle->hasValidDependencies() && Bundle->isReady()) {
5801 ReadyList.insert(Bundle);
5803 << *Bundle <<
"\n");
5808 ReadyList.insert(SD);
5810 <<
"SLP: initially in ready list: " << *SD <<
"\n");
5821 const InstructionsState &S,
const EdgeInfo &EI);
5828 std::optional<ScheduleBundle *>
5830 const InstructionsState &S,
const EdgeInfo &EI);
5833 ScheduleData *allocateScheduleDataChunks();
5837 bool extendSchedulingRegion(
Value *V,
const InstructionsState &S);
5841 void initScheduleData(Instruction *FromI, Instruction *ToI,
5842 ScheduleData *PrevLoadStore,
5843 ScheduleData *NextLoadStore);
5847 void calculateDependencies(ScheduleBundle &Bundle,
bool InsertInReadyList,
5852 void resetSchedule();
5869 SmallDenseMap<Instruction *, ScheduleData *> ScheduleDataMap;
5873 SmallDenseMap<std::pair<EdgeInfo, const Value *>,
5874 std::unique_ptr<ScheduleCopyableData>>
5875 ScheduleCopyableDataMap;
5881 SmallDenseMap<const Instruction *, SmallVector<ScheduleCopyableData *>>
5882 ScheduleCopyableDataMapByInst;
5888 SmallDenseMap<std::pair<std::pair<const Value *, unsigned>,
const Value *>,
5890 ScheduleCopyableDataMapByInstUser;
5910 SmallSetVector<ScheduleCopyableData *, 4>>
5911 ScheduleCopyableDataMapByUsers;
5914 SmallDenseMap<Instruction *, SmallVector<ScheduleBundle *>>
5920 SetVector<ScheduleEntity *> ReadyInsts;
5930 ScheduleData *FirstLoadStoreInRegion =
nullptr;
5934 ScheduleData *LastLoadStoreInRegion =
nullptr;
5939 bool RegionHasStackSave =
false;
5942 int ScheduleRegionSize = 0;
5951 int SchedulingRegionID = 1;
5955 MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
5959 void scheduleBlock(
const BoUpSLP &R, BlockScheduling *BS);
5962 const SmallDenseSet<Value *> *UserIgnoreList =
nullptr;
5966 struct OrdersTypeDenseMapInfo {
5979 static unsigned getHashValue(
const OrdersType &V) {
5990 ScalarEvolution *SE;
5991 TargetTransformInfo *TTI;
5992 TargetLibraryInfo *TLI;
5995 AssumptionCache *AC;
5997 const DataLayout *DL;
5998 OptimizationRemarkEmitter *ORE;
6000 unsigned MaxVecRegSize;
6001 unsigned MinVecRegSize;
6004 IRBuilder<TargetFolder> Builder;
6011 DenseMap<const TreeEntry *, std::pair<uint64_t, bool>> MinBWs;
6016 unsigned ReductionBitWidth = 0;
6019 unsigned BaseGraphSize = 1;
6023 std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;
6027 DenseSet<unsigned> ExtraBitWidthNodes;
6037 SecondInfo::getEmptyKey());
6042 SecondInfo::getTombstoneKey());
6047 SecondInfo::getHashValue(Val.
EdgeIdx));
6068 ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
6079 return R.VectorizableTree[0].get();
6083 return {&
N->UserTreeIndex,
N->Container};
6087 return {&
N->UserTreeIndex + 1,
N->Container};
6114 static unsigned size(
BoUpSLP *R) {
return R->VectorizableTree.size(); }
6125 OS << Entry->Idx <<
".\n";
6128 for (
auto *V : Entry->Scalars) {
6130 if (
llvm::any_of(R->ExternalUses, [&](
const BoUpSLP::ExternalUser &EU) {
6131 return EU.Scalar == V;
6141 if (Entry->isGather())
6143 if (Entry->State == TreeEntry::ScatterVectorize ||
6144 Entry->State == TreeEntry::StridedVectorize ||
6145 Entry->State == TreeEntry::CompressVectorize)
6146 return "color=blue";
6155 for (
auto *
I : DeletedInstructions) {
6156 if (!
I->getParent()) {
6161 I->insertBefore(F->getEntryBlock(),
6162 F->getEntryBlock().getFirstNonPHIIt());
6164 I->insertBefore(F->getEntryBlock().getTerminator()->getIterator());
6167 for (
Use &U :
I->operands()) {
6169 if (
Op && !DeletedInstructions.count(
Op) &&
Op->hasOneUser() &&
6173 I->dropAllReferences();
6175 for (
auto *
I : DeletedInstructions) {
6177 "trying to erase instruction with users.");
6178 I->eraseFromParent();
6184#ifdef EXPENSIVE_CHECKS
6195 assert(!Mask.empty() && Reuses.
size() == Mask.size() &&
6196 "Expected non-empty mask.");
6199 for (
unsigned I = 0,
E = Prev.
size();
I <
E; ++
I)
6201 Reuses[Mask[
I]] = Prev[
I];
6209 bool BottomOrder =
false) {
6210 assert(!Mask.empty() &&
"Expected non-empty mask.");
6211 unsigned Sz = Mask.size();
6214 if (Order.
empty()) {
6216 std::iota(PrevOrder.
begin(), PrevOrder.
end(), 0);
6218 PrevOrder.
swap(Order);
6221 for (
unsigned I = 0;
I < Sz; ++
I)
6223 Order[
I] = PrevOrder[Mask[
I]];
6225 return Data.value() == Sz ||
Data.index() ==
Data.value();
6234 if (Order.
empty()) {
6236 std::iota(MaskOrder.
begin(), MaskOrder.
end(), 0);
6246 for (
unsigned I = 0;
I < Sz; ++
I)
6248 Order[MaskOrder[
I]] =
I;
6252std::optional<BoUpSLP::OrdersType>
6254 bool TopToBottom,
bool IgnoreReorder) {
6255 assert(TE.isGather() &&
"Expected gather node only.");
6259 Type *ScalarTy = GatheredScalars.
front()->getType();
6260 size_t NumScalars = GatheredScalars.
size();
6262 return std::nullopt;
6269 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
6271 isGatherShuffledEntry(&TE, GatheredScalars, Mask, Entries, NumParts,
6274 if (GatherShuffles.
empty() && ExtractShuffles.
empty())
6275 return std::nullopt;
6276 OrdersType CurrentOrder(NumScalars, NumScalars);
6277 if (GatherShuffles.
size() == 1 &&
6279 Entries.
front().front()->isSame(TE.Scalars)) {
6283 return std::nullopt;
6285 if (Entries.
front().front()->UserTreeIndex.UserTE ==
6286 TE.UserTreeIndex.UserTE)
6287 return std::nullopt;
6290 if (!IgnoreReorder && Entries.
front().front()->Idx == 0)
6291 return std::nullopt;
6294 if (!Entries.
front().front()->ReuseShuffleIndices.empty() &&
6295 TE.getVectorFactor() == 2 && Mask.size() == 2 &&
6298 return P.value() % 2 != static_cast<int>(P.index()) % 2;
6300 return std::nullopt;
6304 std::iota(CurrentOrder.
begin(), CurrentOrder.
end(), 0);
6305 return CurrentOrder;
6309 return all_of(Mask, [&](
int I) {
6316 if ((ExtractShuffles.
empty() && IsSplatMask(Mask) &&
6317 (Entries.
size() != 1 ||
6318 Entries.
front().front()->ReorderIndices.empty())) ||
6319 (GatherShuffles.
empty() && IsSplatMask(ExtractMask)))
6320 return std::nullopt;
6326 if (ShuffledSubMasks.
test(
I))
6328 const int VF = GetVF(
I);
6334 if (
any_of(Slice, [&](
unsigned I) {
return I != NumScalars; })) {
6336 ShuffledSubMasks.
set(
I);
6340 int FirstMin = INT_MAX;
6341 int SecondVecFound =
false;
6343 int Idx = Mask[
I * PartSz + K];
6345 Value *V = GatheredScalars[
I * PartSz + K];
6347 SecondVecFound =
true;
6356 SecondVecFound =
true;
6360 FirstMin = (FirstMin / PartSz) * PartSz;
6362 if (SecondVecFound) {
6364 ShuffledSubMasks.
set(
I);
6368 int Idx = Mask[
I * PartSz + K];
6372 if (Idx >= PartSz) {
6373 SecondVecFound =
true;
6376 if (CurrentOrder[
I * PartSz + Idx] >
6377 static_cast<unsigned>(
I * PartSz + K) &&
6378 CurrentOrder[
I * PartSz + Idx] !=
6379 static_cast<unsigned>(
I * PartSz + Idx))
6380 CurrentOrder[
I * PartSz + Idx] =
I * PartSz + K;
6383 if (SecondVecFound) {
6385 ShuffledSubMasks.
set(
I);
6391 if (!ExtractShuffles.
empty())
6392 TransformMaskToOrder(
6393 CurrentOrder, ExtractMask, PartSz, NumParts, [&](
unsigned I) {
6394 if (!ExtractShuffles[
I])
6397 unsigned Sz =
getNumElems(TE.getVectorFactor(), PartSz,
I);
6399 int K =
I * PartSz + Idx;
6402 if (!TE.ReuseShuffleIndices.empty())
6403 K = TE.ReuseShuffleIndices[K];
6406 if (!TE.ReorderIndices.empty())
6407 K = std::distance(TE.ReorderIndices.begin(),
6408 find(TE.ReorderIndices, K));
6414 .getKnownMinValue());
6419 if (GatherShuffles.
size() == 1 && NumParts != 1) {
6420 if (ShuffledSubMasks.
any())
6421 return std::nullopt;
6422 PartSz = NumScalars;
6425 if (!Entries.
empty())
6426 TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](
unsigned I) {
6427 if (!GatherShuffles[
I])
6429 return std::max(Entries[
I].front()->getVectorFactor(),
6430 Entries[
I].back()->getVectorFactor());
6432 unsigned NumUndefs =
count(CurrentOrder, NumScalars);
6433 if (ShuffledSubMasks.
all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2))
6434 return std::nullopt;
6435 return std::move(CurrentOrder);
6440 bool CompareOpcodes =
true) {
6446 return (!GEP1 || GEP1->getNumOperands() == 2) &&
6447 (!GEP2 || GEP2->getNumOperands() == 2) &&
6448 (((!GEP1 ||
isConstant(GEP1->getOperand(1))) &&
6449 (!GEP2 ||
isConstant(GEP2->getOperand(1)))) ||
6452 getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)));
6456template <
typename T>
6461 return CommonAlignment;
6467 "Order is empty. Please check it before using isReverseOrder.");
6468 unsigned Sz = Order.
size();
6470 return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
6481 const SCEV *PtrSCEVLowest =
nullptr;
6482 const SCEV *PtrSCEVHighest =
nullptr;
6490 if (!PtrSCEVLowest && !PtrSCEVHighest) {
6491 PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
6498 PtrSCEVLowest = PtrSCEV;
6505 PtrSCEVHighest = PtrSCEV;
6513 int Size =
DL.getTypeStoreSize(ElemTy);
6514 auto TryGetStride = [&](
const SCEV *Dist,
6515 const SCEV *Multiplier) ->
const SCEV * {
6517 if (M->getOperand(0) == Multiplier)
6518 return M->getOperand(1);
6519 if (M->getOperand(1) == Multiplier)
6520 return M->getOperand(0);
6523 if (Multiplier == Dist)
6528 const SCEV *Stride =
nullptr;
6529 if (
Size != 1 || SCEVs.
size() > 2) {
6531 Stride = TryGetStride(Dist, Sz);
6539 using DistOrdPair = std::pair<int64_t, int>;
6541 std::set<DistOrdPair,
decltype(Compare)> Offsets(Compare);
6543 bool IsConsecutive =
true;
6544 for (
const SCEV *PtrSCEV : SCEVs) {
6546 if (PtrSCEV != PtrSCEVLowest) {
6548 const SCEV *Coeff = TryGetStride(Diff, Stride);
6558 Dist = SC->getAPInt().getZExtValue();
6563 auto Res = Offsets.emplace(Dist, Cnt);
6567 IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
6570 if (Offsets.size() != SCEVs.
size())
6572 SortedIndices.
clear();
6573 if (!IsConsecutive) {
6577 for (
const std::pair<int64_t, int> &Pair : Offsets) {
6578 SortedIndices[Cnt] = Pair.second;
6585static std::pair<InstructionCost, InstructionCost>
6588 Type *ScalarTy, VectorType *VecTy);
6606 int NumSrcElts = Tp->getElementCount().getKnownMinValue();
6609 Mask, NumSrcElts, NumSubElts, Index)) {
6610 if (Index + NumSubElts > NumSrcElts &&
6611 Index + NumSrcElts <=
static_cast<int>(
Mask.size()))
6628 "ScalableVectorType is not supported.");
6631 "Incorrect usage.");
6636 unsigned ScalarTyNumElements = VecTy->getNumElements();
6639 if (!DemandedElts[
I])
6643 I * ScalarTyNumElements, VecTy);
6646 I * ScalarTyNumElements, VecTy);
6659 ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) {
6660 if (Opcode == Instruction::ExtractElement) {
6666 Index * VecTy->getNumElements(), VecTy);
6669 return TTI.getVectorInstrCost(Opcode, Val,
CostKind, Index, Scalar,
6682 getWidenedType(VecTy->getElementType(), ScalarTy->getNumElements());
6684 Index * ScalarTy->getNumElements(), SubTp) +
6688 return TTI.getExtractWithExtendCost(Opcode, Dst, VecTy, Index,
CostKind);
6704 auto *Begin = std::next(
Mask.begin(), Index);
6705 std::iota(Begin, std::next(Begin, SubVecVF), 0);
6709 std::iota(
Mask.begin(),
Mask.end(), 0);
6710 std::iota(std::next(
Mask.begin(), Index),
6711 std::next(
Mask.begin(), Index + SubVecVF), VecVF);
6713 return Generator(Vec, V, Mask);
6716 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), SubVecVF), 0);
6724 unsigned SubVecVF,
unsigned Index) {
6726 std::iota(Mask.begin(), Mask.end(), Index);
6727 return Builder.CreateShuffleVector(Vec, Mask);
6737 const unsigned Sz = PointerOps.
size();
6740 CompressMask[0] = 0;
6742 std::optional<unsigned> Stride = 0;
6746 std::optional<int64_t> OptPos =
6748 if (!OptPos || OptPos > std::numeric_limits<unsigned>::max())
6750 unsigned Pos =
static_cast<unsigned>(*OptPos);
6751 CompressMask[
I] = Pos;
6758 if (Pos != *Stride *
I)
6761 return Stride.has_value();
6774 InterleaveFactor = 0;
6776 const size_t Sz = VL.
size();
6784 if (AreAllUsersVectorized(V))
6787 TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy,
CostKind,
6788 Mask.empty() ?
I : Mask[
I]);
6791 if (ExtractCost <= ScalarCost)
6796 if (Order.
empty()) {
6797 Ptr0 = PointerOps.
front();
6798 PtrN = PointerOps.
back();
6800 Ptr0 = PointerOps[Order.
front()];
6801 PtrN = PointerOps[Order.
back()];
6803 std::optional<int64_t> Diff =
6807 const size_t MaxRegSize =
6811 if (*Diff / Sz >= MaxRegSize / 8)
6815 Align CommonAlignment = LI->getAlign();
6817 Ptr0, LoadVecTy, CommonAlignment,
DL,
6820 if (IsMasked && !
TTI.isLegalMaskedLoad(LoadVecTy, CommonAlignment,
6821 LI->getPointerAddressSpace()))
6827 assert(CompressMask.
size() >= 2 &&
"At least two elements are required");
6831 auto [ScalarGEPCost, VectorGEPCost] =
6833 Instruction::GetElementPtr,
CostKind, ScalarTy, LoadVecTy);
6851 TTI.getMaskedMemoryOpCost(Instruction::Load, LoadVecTy, CommonAlignment,
6852 LI->getPointerAddressSpace(),
CostKind);
6855 TTI.getMemoryOpCost(Instruction::Load, LoadVecTy, CommonAlignment,
6856 LI->getPointerAddressSpace(),
CostKind);
6858 if (IsStrided && !IsMasked && Order.
empty()) {
6865 AlignedLoadVecTy = LoadVecTy;
6866 if (
TTI.isLegalInterleavedAccessType(AlignedLoadVecTy, CompressMask[1],
6868 LI->getPointerAddressSpace())) {
6870 VectorGEPCost +
TTI.getInterleavedMemoryOpCost(
6871 Instruction::Load, AlignedLoadVecTy,
6872 CompressMask[1], {}, CommonAlignment,
6873 LI->getPointerAddressSpace(),
CostKind, IsMasked);
6874 if (InterleavedCost < GatherCost) {
6875 InterleaveFactor = CompressMask[1];
6876 LoadVecTy = AlignedLoadVecTy;
6883 if (!Order.
empty()) {
6886 NewMask[
I] = CompressMask[Mask[
I]];
6888 CompressMask.
swap(NewMask);
6890 InstructionCost TotalVecCost = VectorGEPCost + LoadCost + CompressCost;
6891 return TotalVecCost < GatherCost;
6904 unsigned InterleaveFactor;
6908 AreAllUsersVectorized, IsMasked, InterleaveFactor,
6909 CompressMask, LoadVecTy);
6926 Align Alignment,
const int64_t Diff,
6927 const size_t Sz)
const {
6928 if (Diff % (Sz - 1) != 0)
6932 auto IsAnyPointerUsedOutGraph =
any_of(PointerOps, [&](
Value *V) {
6934 return !isVectorized(U) && !MustGather.contains(U);
6938 const uint64_t AbsoluteDiff = std::abs(Diff);
6940 if (IsAnyPointerUsedOutGraph ||
6941 (AbsoluteDiff > Sz &&
6944 AbsoluteDiff % Sz == 0 &&
has_single_bit(AbsoluteDiff / Sz)))) ||
6945 Diff == -(
static_cast<int64_t
>(Sz) - 1)) {
6946 int64_t Stride = Diff /
static_cast<int64_t
>(Sz - 1);
6947 if (Diff != Stride *
static_cast<int64_t
>(Sz - 1))
6949 if (!TTI->isLegalStridedLoadStore(VecTy, Alignment))
6959 Value *Ptr0,
Value *PtrN, StridedPtrInfo &SPtrInfo)
const {
6960 const size_t Sz = PointerOps.
size();
6961 if (!
isStridedLoad(PointerOps, ScalarTy, Alignment, Diff, Sz))
6964 int64_t Stride = Diff /
static_cast<int64_t
>(Sz - 1);
6973 else if (
Ptr != Ptr0)
6977 if (((Dist / Stride) * Stride) != Dist || !Dists.
insert(Dist).second)
6980 if (Dists.
size() == Sz) {
6981 Type *StrideTy = DL->getIndexType(Ptr0->
getType());
6982 SPtrInfo.StrideVal = ConstantInt::get(StrideTy, Stride);
6992 StridedPtrInfo &SPtrInfo)
const {
6993 const unsigned Sz = PointerOps.
size();
6995 if (Sz <= MinProfitableStridedLoads || !TTI->isTypeLegal(StridedLoadTy) ||
6996 !TTI->isLegalStridedLoadStore(StridedLoadTy, CommonAlignment))
6998 if (
const SCEV *Stride =
7001 SPtrInfo.StrideSCEV = Stride;
7010 unsigned *BestVF,
bool TryRecursiveCheck)
const {
7023 if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy))
7029 const size_t Sz = VL.
size();
7031 auto *POIter = PointerOps.
begin();
7032 for (
Value *V : VL) {
7034 if (!L || !L->isSimple())
7036 *POIter = L->getPointerOperand();
7042 bool IsSorted =
sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, Order);
7051 if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
7052 TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
7063 if (Order.
empty()) {
7064 Ptr0 = PointerOps.
front();
7065 PtrN = PointerOps.
back();
7067 Ptr0 = PointerOps[Order.
front()];
7068 PtrN = PointerOps[Order.
back()];
7070 std::optional<int64_t> Diff =
7073 if (
static_cast<uint64_t>(*Diff) == Sz - 1)
7076 *TLI, [&](
Value *V) {
7077 return areAllUsersVectorized(
7085 *Diff, Ptr0, PtrN, SPtrInfo))
7088 if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
7089 TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
7094 auto CheckForShuffledLoads = [&, &TTI = *TTI](
Align CommonAlignment,
7096 bool ProfitableGatherPointers) {
7101 auto [ScalarGEPCost, VectorGEPCost] =
7103 Instruction::GetElementPtr,
CostKind, ScalarTy, VecTy);
7107 Type *PtrScalarTy = PointerOps.
front()->getType()->getScalarType();
7109 if (
static_cast<unsigned>(
count_if(
7128 return C + TTI.getInstructionCost(
7134 TTI.getGatherScatterOpCost(
7136 false, CommonAlignment,
CostKind) +
7137 (ProfitableGatherPointers ? 0 : VectorGEPCost);
7145 constexpr unsigned ListLimit = 4;
7146 if (!TryRecursiveCheck || VL.
size() < ListLimit)
7155 unsigned Sz = DL->getTypeSizeInBits(ScalarTy);
7165 for (
unsigned Cnt = 0, End = VL.
size(); Cnt + VF <= End; Cnt += VF) {
7170 PointerOps, SPtrInfo, BestVF,
7178 DemandedElts.
setBits(Cnt, Cnt + VF);
7194 if (!DemandedElts.
isZero()) {
7200 if (DemandedElts[Idx])
7211 LI0->getPointerOperand(),
7212 Instruction::GetElementPtr,
CostKind, ScalarTy,
7216 if (
static_cast<unsigned>(
7218 PointerOps.
size() - 1 ||
7237 TTI.getMemoryOpCost(Instruction::Load, SubVecTy, LI0->getAlign(),
7238 LI0->getPointerAddressSpace(),
CostKind,
7243 VecLdCost += TTI.getStridedMemoryOpCost(Instruction::Load, SubVecTy,
7244 LI0->getPointerOperand(),
7250 VecLdCost += TTI.getMaskedMemoryOpCost(
7251 Instruction::Load, SubVecTy, CommonAlignment,
7252 LI0->getPointerAddressSpace(),
CostKind) +
7258 VecLdCost += TTI.getGatherScatterOpCost(Instruction::Load, SubVecTy,
7259 LI0->getPointerOperand(),
7270 ShuffleMask[Idx] = Idx / VF ==
I ? VL.
size() + Idx % VF : Idx;
7279 if (MaskedGatherCost >= VecLdCost &&
7292 bool ProfitableGatherPointers =
7293 L && Sz > 2 &&
static_cast<unsigned>(
count_if(PointerOps, [L](
Value *V) {
7294 return L->isLoopInvariant(V);
7296 if (ProfitableGatherPointers ||
all_of(PointerOps, [](
Value *
P) {
7299 (
GEP &&
GEP->getNumOperands() == 2 &&
7307 if (!TryRecursiveCheck || !CheckForShuffledLoads(CommonAlignment, BestVF,
7308 ProfitableGatherPointers))
7320 all_of(VL, [](
const Value *V) {
return V->getType()->isPointerTy(); }) &&
7321 "Expected list of pointer operands.");
7326 std::pair<BasicBlock *, Value *>,
7330 .try_emplace(std::make_pair(
7334 SortedIndices.
clear();
7336 auto Key = std::make_pair(BBs[Cnt + 1],
7338 bool Found =
any_of(Bases.try_emplace(
Key).first->second,
7340 std::optional<int64_t> Diff =
7341 getPointersDiff(ElemTy, std::get<0>(Base.front()),
7342 ElemTy, Ptr, DL, SE,
7347 Base.emplace_back(Ptr, *Diff, Cnt + 1);
7353 if (Bases.size() > VL.
size() / 2 - 1)
7357 Bases.find(
Key)->second.emplace_back().emplace_back(
Ptr, 0, Cnt + 1);
7361 if (Bases.size() == VL.
size())
7364 if (Bases.size() == 1 && (Bases.front().second.size() == 1 ||
7365 Bases.front().second.size() == VL.
size()))
7370 auto ComparePointers = [](
Value *Ptr1,
Value *Ptr2) {
7379 FirstPointers.
insert(P1);
7380 SecondPointers.
insert(P2);
7386 "Unable to find matching root.");
7389 for (
auto &
Base : Bases) {
7390 for (
auto &Vec :
Base.second) {
7391 if (Vec.size() > 1) {
7393 int64_t InitialOffset = std::get<1>(Vec[0]);
7394 bool AnyConsecutive =
7396 return std::get<1>(
P.value()) ==
7397 int64_t(
P.index()) + InitialOffset;
7401 if (!AnyConsecutive)
7406 return ComparePointers(std::get<0>(V1.front()), std::get<0>(V2.front()));
7410 for (
auto &
T : Bases)
7411 for (
const auto &Vec :
T.second)
7412 for (
const auto &
P : Vec)
7416 "Expected SortedIndices to be the size of VL");
7420std::optional<BoUpSLP::OrdersType>
7422 assert(TE.isGather() &&
"Expected gather node only.");
7423 Type *ScalarTy = TE.Scalars[0]->getType();
7426 Ptrs.
reserve(TE.Scalars.size());
7428 BBs.
reserve(TE.Scalars.size());
7429 for (
Value *V : TE.Scalars) {
7431 if (!L || !L->isSimple())
7432 return std::nullopt;
7438 if (!LoadEntriesToVectorize.contains(TE.Idx) &&
7440 return std::move(Order);
7441 return std::nullopt;
7452 if (VU->
getType() != V->getType())
7455 if (!VU->
hasOneUse() && !V->hasOneUse())
7461 if (Idx1 == std::nullopt || Idx2 == std::nullopt)
7468 bool IsReusedIdx =
false;
7470 if (IE2 == VU && !IE1)
7472 if (IE1 == V && !IE2)
7473 return V->hasOneUse();
7474 if (IE1 && IE1 != V) {
7476 IsReusedIdx |= ReusedIdx.
test(Idx1);
7477 ReusedIdx.
set(Idx1);
7478 if ((IE1 != VU && !IE1->
hasOneUse()) || IsReusedIdx)
7483 if (IE2 && IE2 != VU) {
7485 IsReusedIdx |= ReusedIdx.
test(Idx2);
7486 ReusedIdx.
set(Idx2);
7487 if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
7492 }
while (!IsReusedIdx && (IE1 || IE2));
7500 const TargetLibraryInfo &TLI);
7502std::optional<BoUpSLP::OrdersType>
7504 bool IgnoreReorder) {
7507 if (!TE.ReuseShuffleIndices.empty()) {
7509 assert(!TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI) &&
7510 "Reshuffling scalars not yet supported for nodes with padding");
7513 return std::nullopt;
7521 unsigned Sz = TE.Scalars.size();
7522 if (TE.isGather()) {
7523 if (std::optional<OrdersType> CurrentOrder =
7528 ::addMask(Mask, TE.ReuseShuffleIndices);
7529 OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor());
7530 unsigned Sz = TE.Scalars.size();
7531 for (
int K = 0, E = TE.getVectorFactor() / Sz; K < E; ++K) {
7534 Res[Idx + K * Sz] =
I + K * Sz;
7536 return std::move(Res);
7539 if (Sz == 2 && TE.getVectorFactor() == 4 &&
7541 2 * TE.getVectorFactor())) == 1)
7542 return std::nullopt;
7543 if (TE.ReuseShuffleIndices.size() % Sz != 0)
7544 return std::nullopt;
7548 if (TE.ReorderIndices.empty())
7549 std::iota(ReorderMask.
begin(), ReorderMask.
end(), 0);
7552 ::addMask(ReorderMask, TE.ReuseShuffleIndices);
7553 unsigned VF = ReorderMask.
size();
7557 for (
unsigned I = 0;
I < VF;
I += Sz) {
7559 unsigned UndefCnt = 0;
7560 unsigned Limit = std::min(Sz, VF -
I);
7569 Val >=
static_cast<int>(NumParts) || UsedVals.
test(Val) ||
7571 return std::nullopt;
7573 for (
unsigned K = 0; K < NumParts; ++K) {
7574 unsigned Idx = Val + Sz * K;
7575 if (Idx < VF &&
I + K < VF)
7576 ResOrder[Idx] =
I + K;
7579 return std::move(ResOrder);
7581 unsigned VF = TE.getVectorFactor();
7584 TE.ReuseShuffleIndices.end());
7585 if (TE.hasState() && TE.getOpcode() == Instruction::ExtractElement &&
7587 if (isa<PoisonValue>(V))
7589 std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V));
7590 return Idx && *Idx < Sz;
7592 assert(!TE.isAltShuffle() &&
"Alternate instructions are only supported "
7593 "by BinaryOperator and CastInst.");
7595 if (TE.ReorderIndices.empty())
7596 std::iota(ReorderMask.
begin(), ReorderMask.
end(), 0);
7599 for (
unsigned I = 0;
I < VF; ++
I) {
7600 int &Idx = ReusedMask[
I];
7603 Value *V = TE.Scalars[ReorderMask[Idx]];
7605 Idx = std::distance(ReorderMask.
begin(),
find(ReorderMask, *EI));
7611 std::iota(ResOrder.
begin(), ResOrder.
end(), 0);
7612 auto *It = ResOrder.
begin();
7613 for (
unsigned K = 0; K < VF; K += Sz) {
7617 std::iota(SubMask.
begin(), SubMask.
end(), 0);
7619 transform(CurrentOrder, It, [K](
unsigned Pos) {
return Pos + K; });
7620 std::advance(It, Sz);
7623 return Data.index() ==
Data.value();
7625 return std::nullopt;
7626 return std::move(ResOrder);
7628 if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
7629 (!TE.UserTreeIndex || !TE.UserTreeIndex.UserTE->hasState() ||
7631 (TE.ReorderIndices.empty() ||
isReverseOrder(TE.ReorderIndices)))
7632 return std::nullopt;
7633 if (TE.State == TreeEntry::SplitVectorize ||
7634 ((TE.State == TreeEntry::Vectorize ||
7635 TE.State == TreeEntry::StridedVectorize ||
7636 TE.State == TreeEntry::CompressVectorize) &&
7639 assert((TE.State == TreeEntry::SplitVectorize || !TE.isAltShuffle()) &&
7640 "Alternate instructions are only supported by "
7641 "BinaryOperator and CastInst.");
7642 return TE.ReorderIndices;
7644 if (!TopToBottom && IgnoreReorder && TE.State == TreeEntry::Vectorize &&
7645 TE.isAltShuffle()) {
7646 assert(TE.ReuseShuffleIndices.empty() &&
7647 "ReuseShuffleIndices should be "
7648 "empty for alternate instructions.");
7650 TE.buildAltOpShuffleMask(
7652 assert(TE.getMatchingMainOpOrAltOp(
I) &&
7653 "Unexpected main/alternate opcode");
7657 const int VF = TE.getVectorFactor();
7662 ResOrder[Mask[
I] % VF] =
I;
7664 return std::move(ResOrder);
7666 if (!TE.ReorderIndices.empty())
7667 return TE.ReorderIndices;
7668 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
7669 if (!TE.ReorderIndices.empty())
7670 return TE.ReorderIndices;
7673 for (
auto [
I, V] :
zip(UserBVHead, TE.Scalars)) {
7681 while (
II &&
II->hasOneUse() &&
II->getParent() == BB) {
7689 assert(BB1 != BB2 &&
"Expected different basic blocks.");
7690 if (!DT->isReachableFromEntry(BB1))
7692 if (!DT->isReachableFromEntry(BB2))
7694 auto *NodeA = DT->getNode(BB1);
7695 auto *NodeB = DT->getNode(BB2);
7696 assert(NodeA &&
"Should only process reachable instructions");
7697 assert(NodeB &&
"Should only process reachable instructions");
7698 assert((NodeA == NodeB) ==
7699 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
7700 "Different nodes should have different DFS numbers");
7701 return NodeA->getDFSNumIn() < NodeB->getDFSNumIn();
7703 auto PHICompare = [&](
unsigned I1,
unsigned I2) {
7704 Value *V1 = TE.Scalars[I1];
7705 Value *V2 = TE.Scalars[I2];
7718 if (FirstUserOfPhi1->getParent() != FirstUserOfPhi2->getParent())
7719 return CompareByBasicBlocks(FirstUserOfPhi1->getParent(),
7720 FirstUserOfPhi2->getParent());
7730 if (UserBVHead[I1] && !UserBVHead[I2])
7732 if (!UserBVHead[I1])
7734 if (UserBVHead[I1] == UserBVHead[I2])
7737 return CompareByBasicBlocks(UserBVHead[I1]->
getParent(),
7739 return UserBVHead[I1]->comesBefore(UserBVHead[I2]);
7752 if (EE1->getOperand(0) == EE2->getOperand(0))
7754 if (!Inst1 && Inst2)
7756 if (Inst1 && Inst2) {
7764 "Expected either instructions or arguments vector operands.");
7765 return P1->getArgNo() < P2->getArgNo();
7770 std::iota(Phis.
begin(), Phis.
end(), 0);
7773 return std::nullopt;
7774 return std::move(Phis);
7776 if (TE.isGather() &&
7777 (!TE.hasState() || !TE.isAltShuffle() ||
7778 ScalarsInSplitNodes.contains(TE.getMainOp())) &&
7782 if (((TE.hasState() && TE.getOpcode() == Instruction::ExtractElement) ||
7786 auto *EE = dyn_cast<ExtractElementInst>(V);
7787 return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
7793 canReuseExtract(TE.Scalars, CurrentOrder,
true);
7794 if (Reuse || !CurrentOrder.
empty())
7795 return std::move(CurrentOrder);
7803 int Sz = TE.Scalars.size();
7807 if (It == TE.Scalars.begin())
7810 if (It != TE.Scalars.end()) {
7812 unsigned Idx = std::distance(TE.Scalars.begin(), It);
7827 if (InsertFirstCost + PermuteCost < InsertIdxCost) {
7830 return std::move(Order);
7835 return std::nullopt;
7836 if (TE.Scalars.size() >= 3)
7841 if (TE.hasState() && TE.getOpcode() == Instruction::Load) {
7843 StridedPtrInfo SPtrInfo;
7846 CurrentOrder, PointerOps, SPtrInfo);
7849 return std::move(CurrentOrder);
7854 if (std::optional<OrdersType> CurrentOrder =
7856 return CurrentOrder;
7858 return std::nullopt;
7868 for (
unsigned I = Sz,
E = Mask.size();
I <
E;
I += Sz) {
7870 if (Cluster != FirstCluster)
7876void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask)
const {
7879 const unsigned Sz =
TE.Scalars.size();
7881 if (!
TE.isGather() ||
7886 SmallVector<int> NewMask;
7888 addMask(NewMask,
TE.ReuseShuffleIndices);
7890 TE.ReorderIndices.clear();
7892 ArrayRef<int> Slice =
ArrayRef(NewMask).slice(0, Sz);
7893 SmallVector<unsigned> NewOrder(Slice);
7897 for (
auto *It =
TE.ReuseShuffleIndices.begin(),
7898 *End =
TE.ReuseShuffleIndices.end();
7899 It != End; std::advance(It, Sz))
7900 std::iota(It, std::next(It, Sz), 0);
7906 "Expected same size of orders");
7907 size_t Sz = Order.
size();
7910 if (Order[Idx] != Sz)
7911 UsedIndices.
set(Order[Idx]);
7913 if (SecondaryOrder.
empty()) {
7915 if (Order[Idx] == Sz && !UsedIndices.
test(Idx))
7919 if (SecondaryOrder[Idx] != Sz && Order[Idx] == Sz &&
7920 !UsedIndices.
test(SecondaryOrder[Idx]))
7921 Order[Idx] = SecondaryOrder[Idx];
7929 constexpr unsigned TinyVF = 2;
7930 constexpr unsigned TinyTree = 10;
7931 constexpr unsigned PhiOpsLimit = 12;
7932 constexpr unsigned GatherLoadsLimit = 2;
7933 if (VectorizableTree.size() <= TinyTree)
7935 if (VectorizableTree.front()->hasState() &&
7936 !VectorizableTree.front()->isGather() &&
7937 (VectorizableTree.front()->getOpcode() == Instruction::Store ||
7938 VectorizableTree.front()->getOpcode() == Instruction::PHI ||
7939 (VectorizableTree.front()->getVectorFactor() <= TinyVF &&
7940 (VectorizableTree.front()->getOpcode() == Instruction::PtrToInt ||
7941 VectorizableTree.front()->getOpcode() == Instruction::ICmp))) &&
7942 VectorizableTree.front()->ReorderIndices.empty()) {
7946 if (VectorizableTree.front()->hasState() &&
7947 VectorizableTree.front()->getOpcode() == Instruction::PHI &&
7948 VectorizableTree.front()->Scalars.size() == TinyVF &&
7949 VectorizableTree.front()->getNumOperands() > PhiOpsLimit)
7952 if (VectorizableTree.front()->hasState() &&
7953 VectorizableTree.front()->getOpcode() == Instruction::Store &&
7954 VectorizableTree.front()->ReorderIndices.empty()) {
7955 const unsigned ReorderedSplitsCnt =
7956 count_if(VectorizableTree, [&](
const std::unique_ptr<TreeEntry> &TE) {
7957 return TE->State == TreeEntry::SplitVectorize &&
7958 !TE->ReorderIndices.empty() && TE->UserTreeIndex.UserTE &&
7959 TE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
7962 if (ReorderedSplitsCnt <= 1 &&
7964 VectorizableTree, [&](
const std::unique_ptr<TreeEntry> &TE) {
7965 return ((!TE->isGather() &&
7966 (TE->ReorderIndices.empty() ||
7967 (TE->UserTreeIndex.UserTE &&
7968 TE->UserTreeIndex.UserTE->State ==
7969 TreeEntry::Vectorize &&
7970 !TE->UserTreeIndex.UserTE->ReuseShuffleIndices
7972 (TE->isGather() && TE->ReorderIndices.empty() &&
7973 (!TE->hasState() || TE->isAltShuffle() ||
7974 TE->getOpcode() == Instruction::Load ||
7975 TE->getOpcode() == Instruction::ZExt ||
7976 TE->getOpcode() == Instruction::SExt))) &&
7977 (VectorizableTree.front()->getVectorFactor() > TinyVF ||
7978 !TE->isGather() ||
none_of(TE->Scalars, [&](
Value *V) {
7979 return !isConstant(V) && isVectorized(V);
7981 })) >= VectorizableTree.size() - ReorderedSplitsCnt)
7984 bool HasPhis =
false;
7985 bool HasLoad =
true;
7986 unsigned GatherLoads = 0;
7987 for (
const std::unique_ptr<TreeEntry> &TE :
7988 ArrayRef(VectorizableTree).drop_front()) {
7989 if (TE->State == TreeEntry::SplitVectorize)
7991 if (!TE->hasState()) {
7995 if (VectorizableTree.front()->Scalars.size() == TinyVF &&
8000 if (TE->getOpcode() == Instruction::Load && TE->ReorderIndices.empty()) {
8001 if (!TE->isGather()) {
8008 if (GatherLoads >= GatherLoadsLimit)
8011 if (TE->getOpcode() == Instruction::GetElementPtr ||
8014 if (TE->getOpcode() != Instruction::PHI &&
8015 (!TE->hasCopyableElements() ||
8017 TE->Scalars.size() / 2))
8019 if (VectorizableTree.front()->Scalars.size() == TinyVF &&
8020 TE->getNumOperands() > PhiOpsLimit)
8029void BoUpSLP::TreeEntry::reorderSplitNode(
unsigned Idx,
ArrayRef<int> Mask,
8031 assert(State == TreeEntry::SplitVectorize &&
"Expected split user node.");
8034 std::iota(NewMask.
begin(), NewMask.
end(), 0);
8035 std::iota(NewMaskOrder.begin(), NewMaskOrder.end(), 0);
8038 copy(MaskOrder, NewMaskOrder.begin());
8040 assert(Idx == 1 &&
"Expected either 0 or 1 index.");
8041 unsigned Offset = CombinedEntriesWithIndices.
back().second;
8050 ReorderIndices.clear();
8069 ExternalUserReorderMap;
8073 for_each(VectorizableTree, [&, &TTIRef = *TTI](
8074 const std::unique_ptr<TreeEntry> &TE) {
8077 findExternalStoreUsersReorderIndices(TE.get());
8078 if (!ExternalUserReorderIndices.
empty()) {
8079 VFToOrderedEntries[TE->getVectorFactor()].
insert(TE.get());
8081 std::move(ExternalUserReorderIndices));
8087 if (TE->hasState() && TE->isAltShuffle() &&
8088 TE->State != TreeEntry::SplitVectorize) {
8089 Type *ScalarTy = TE->Scalars[0]->getType();
8091 unsigned Opcode0 = TE->getOpcode();
8092 unsigned Opcode1 = TE->getAltOpcode();
8096 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
8097 VFToOrderedEntries[TE->getVectorFactor()].
insert(TE.get());
8103 bool IgnoreReorder =
8104 !UserIgnoreList && VectorizableTree.front()->hasState() &&
8105 (VectorizableTree.front()->
getOpcode() == Instruction::InsertElement ||
8106 VectorizableTree.front()->getOpcode() == Instruction::Store);
8107 if (std::optional<OrdersType> CurrentOrder =
8117 const TreeEntry *UserTE = TE.get();
8119 if (!UserTE->UserTreeIndex)
8121 if (UserTE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
8122 UserTE->UserTreeIndex.UserTE->isAltShuffle() &&
8123 UserTE->UserTreeIndex.UserTE->Idx != 0)
8125 UserTE = UserTE->UserTreeIndex.UserTE;
8128 VFToOrderedEntries[TE->getVectorFactor()].
insert(TE.get());
8129 if (!(TE->State == TreeEntry::Vectorize ||
8130 TE->State == TreeEntry::StridedVectorize ||
8131 TE->State == TreeEntry::SplitVectorize ||
8132 TE->State == TreeEntry::CompressVectorize) ||
8133 !TE->ReuseShuffleIndices.empty())
8134 GathersToOrders.
try_emplace(TE.get(), *CurrentOrder);
8135 if (TE->State == TreeEntry::Vectorize &&
8136 TE->getOpcode() == Instruction::PHI)
8137 PhisToOrders.
try_emplace(TE.get(), *CurrentOrder);
8142 for (
unsigned VF = VectorizableTree.front()->getVectorFactor();
8143 !VFToOrderedEntries.
empty() && VF > 1; VF -= 2 - (VF & 1U)) {
8144 auto It = VFToOrderedEntries.
find(VF);
8145 if (It == VFToOrderedEntries.
end())
8159 for (
const TreeEntry *OpTE : OrderedEntries) {
8162 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.
count(OpTE) &&
8163 OpTE->State != TreeEntry::SplitVectorize)
8166 const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
8168 if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty()) {
8169 auto It = GathersToOrders.find(OpTE);
8170 if (It != GathersToOrders.end())
8173 if (OpTE->hasState() && OpTE->isAltShuffle()) {
8174 auto It = AltShufflesToOrders.find(OpTE);
8175 if (It != AltShufflesToOrders.end())
8178 if (OpTE->State == TreeEntry::Vectorize &&
8179 OpTE->getOpcode() == Instruction::PHI) {
8180 auto It = PhisToOrders.
find(OpTE);
8181 if (It != PhisToOrders.
end())
8184 return OpTE->ReorderIndices;
8187 auto It = ExternalUserReorderMap.
find(OpTE);
8188 if (It != ExternalUserReorderMap.
end()) {
8189 const auto &ExternalUserReorderIndices = It->second;
8193 if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
8194 OrdersUses.try_emplace(
OrdersType(), 0).first->second +=
8195 ExternalUserReorderIndices.size();
8197 for (
const OrdersType &ExtOrder : ExternalUserReorderIndices)
8198 ++OrdersUses.try_emplace(ExtOrder, 0).first->second;
8205 if (OpTE->State == TreeEntry::Vectorize &&
8206 OpTE->getOpcode() == Instruction::Store && !Order.
empty()) {
8207 assert(!OpTE->isAltShuffle() &&
8208 "Alternate instructions are only supported by BinaryOperator "
8212 unsigned E = Order.
size();
8215 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
8218 ++OrdersUses.try_emplace(CurrentOrder, 0).first->second;
8220 ++OrdersUses.try_emplace(Order, 0).first->second;
8223 if (OrdersUses.empty())
8226 unsigned IdentityCnt = 0;
8227 unsigned FilledIdentityCnt = 0;
8229 for (
auto &Pair : OrdersUses) {
8231 if (!Pair.first.empty())
8232 FilledIdentityCnt += Pair.second;
8233 IdentityCnt += Pair.second;
8238 unsigned Cnt = IdentityCnt;
8239 for (
auto &Pair : OrdersUses) {
8243 if (Cnt < Pair.second ||
8244 (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt &&
8245 Cnt == Pair.second && !BestOrder.
empty() &&
8248 BestOrder = Pair.first;
8261 unsigned E = BestOrder.
size();
8263 return I < E ? static_cast<int>(I) : PoisonMaskElem;
8266 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
8268 if (TE->Scalars.size() != VF) {
8269 if (TE->ReuseShuffleIndices.size() == VF) {
8270 assert(TE->State != TreeEntry::SplitVectorize &&
8271 "Split vectorized not expected.");
8276 (!TE->UserTreeIndex ||
8277 TE->UserTreeIndex.UserTE->Scalars.size() == VF ||
8278 TE->UserTreeIndex.UserTE->Scalars.size() == TE->Scalars.size() ||
8279 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize) &&
8280 "All users must be of VF size.");
8287 if (TE->UserTreeIndex && TE->UserTreeIndex.UserTE->hasState() &&
8293 reorderNodeWithReuses(*TE, Mask);
8295 if (TE->UserTreeIndex &&
8296 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize)
8297 TE->UserTreeIndex.UserTE->reorderSplitNode(
8298 TE->UserTreeIndex.EdgeIdx, Mask, MaskOrder);
8302 if ((TE->State == TreeEntry::SplitVectorize &&
8303 TE->ReuseShuffleIndices.empty()) ||
8304 ((TE->State == TreeEntry::Vectorize ||
8305 TE->State == TreeEntry::StridedVectorize ||
8306 TE->State == TreeEntry::CompressVectorize) &&
8311 (!TE->isAltShuffle() || (TE->State == TreeEntry::SplitVectorize &&
8312 TE->ReuseShuffleIndices.empty())) &&
8313 "Alternate instructions are only supported by BinaryOperator "
8319 TE->reorderOperands(Mask);
8322 TE->reorderOperands(Mask);
8323 assert(TE->ReorderIndices.empty() &&
8324 "Expected empty reorder sequence.");
8327 if (!TE->ReuseShuffleIndices.empty()) {
8334 addMask(NewReuses, TE->ReuseShuffleIndices);
8335 TE->ReuseShuffleIndices.swap(NewReuses);
8336 } else if (TE->UserTreeIndex &&
8337 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize)
8339 TE->UserTreeIndex.UserTE->reorderSplitNode(TE->UserTreeIndex.EdgeIdx,
8345void BoUpSLP::buildReorderableOperands(
8346 TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
8350 if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) {
8351 return OpData.first == I &&
8352 (OpData.second->State == TreeEntry::Vectorize ||
8353 OpData.second->State == TreeEntry::StridedVectorize ||
8354 OpData.second->State == TreeEntry::CompressVectorize ||
8355 OpData.second->State == TreeEntry::SplitVectorize);
8359 if (UserTE->hasState()) {
8360 if (UserTE->getOpcode() == Instruction::ExtractElement ||
8361 UserTE->getOpcode() == Instruction::ExtractValue)
8363 if (UserTE->getOpcode() == Instruction::InsertElement && I == 0)
8365 if (UserTE->getOpcode() == Instruction::Store &&
8366 UserTE->State == TreeEntry::Vectorize && I == 1)
8368 if (UserTE->getOpcode() == Instruction::Load &&
8369 (UserTE->State == TreeEntry::Vectorize ||
8370 UserTE->State == TreeEntry::StridedVectorize ||
8371 UserTE->State == TreeEntry::CompressVectorize))
8374 TreeEntry *TE = getOperandEntry(UserTE, I);
8375 assert(TE && "Expected operand entry.");
8376 if (!TE->isGather()) {
8379 Edges.emplace_back(I, TE);
8385 if (TE->State == TreeEntry::ScatterVectorize &&
8386 TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
8390 if (ReorderableGathers.contains(TE))
8396 struct TreeEntryCompare {
8397 bool operator()(const TreeEntry *LHS, const TreeEntry *RHS) const {
8398 if (LHS->UserTreeIndex && RHS->UserTreeIndex)
8399 return LHS->UserTreeIndex.UserTE->Idx < RHS->UserTreeIndex.UserTE->Idx;
8400 return LHS->Idx < RHS->Idx;
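// Illustrative sketch (not part of the pass): how a comparator like
// TreeEntryCompare above drives a std::priority_queue so that entries whose
// user nodes were created earlier are popped together. All names below are
// hypothetical stand-ins for the real TreeEntry fields.
//
//   #include <queue>
//   #include <vector>
//
//   namespace example {
//   struct Node {
//     unsigned Idx;     // creation index of this node
//     const Node *User; // user node, may be null
//   };
//
//   struct NodeCompare {
//     // std::priority_queue pops the *largest* element, so "less" here
//     // means "pops later"; comparing user indices groups operands of the
//     // same user node next to each other in the processing order.
//     bool operator()(const Node *LHS, const Node *RHS) const {
//       if (LHS->User && RHS->User)
//         return LHS->User->Idx < RHS->User->Idx;
//       return LHS->Idx < RHS->Idx;
//     }
//   };
//
//   using NodeQueue =
//       std::priority_queue<Node *, std::vector<Node *>, NodeCompare>;
//   } // namespace example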
8409 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
8410 if (TE->State != TreeEntry::Vectorize &&
8411 TE->State != TreeEntry::StridedVectorize &&
8412 TE->State != TreeEntry::CompressVectorize &&
8413 TE->State != TreeEntry::SplitVectorize)
8414 NonVectorized.insert(TE.get());
8415 if (std::optional<OrdersType> CurrentOrder =
8417 Queue.push(TE.get());
8418 if (!(TE->State == TreeEntry::Vectorize ||
8419 TE->State == TreeEntry::StridedVectorize ||
8420 TE->State == TreeEntry::CompressVectorize ||
8421 TE->State == TreeEntry::SplitVectorize) ||
8422 !TE->ReuseShuffleIndices.empty())
8423 GathersToOrders.insert(TE.get());
8432 while (!Queue.empty()) {
8434 std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>> Users;
8435 TreeEntry *TE = Queue.top();
8436 const TreeEntry *UserTE = TE->UserTreeIndex.UserTE;
8439 while (!Queue.empty()) {
8441 if (!UserTE || UserTE != TE->UserTreeIndex.UserTE)
8446 for (TreeEntry *TE : OrderedOps) {
8447 if (!(TE->State == TreeEntry::Vectorize ||
8448 TE->State == TreeEntry::StridedVectorize ||
8449 TE->State == TreeEntry::CompressVectorize ||
8450 TE->State == TreeEntry::SplitVectorize ||
8451 (TE->isGather() && GathersToOrders.contains(TE))) ||
8452 !TE->UserTreeIndex || !TE->ReuseShuffleIndices.empty() ||
8453 !Visited.insert(TE).second)
8457 Users.first = TE->UserTreeIndex.UserTE;
8458 Users.second.emplace_back(TE->UserTreeIndex.EdgeIdx, TE);
8462 if (Data.first->State == TreeEntry::SplitVectorize) {
8464 Data.second.size() <= 2 &&
8465 "Expected not greater than 2 operands for split vectorize node.");
8467 [](const auto &Op) { return !Op.second->UserTreeIndex; }))
8470 assert(Data.first->CombinedEntriesWithIndices.size() == 2 &&
8471 "Expected exactly 2 entries.");
8472 for (const auto &P : Data.first->CombinedEntriesWithIndices) {
8473 TreeEntry &OpTE = *VectorizableTree[P.first];
8475 if (Order.empty() || !OpTE.ReuseShuffleIndices.empty()) {
8476 if (!OpTE.isGather() && OpTE.ReuseShuffleIndices.empty())
8478 const auto BestOrder =
8487 const unsigned E = Order.size();
8490 return I < E ? static_cast<int>(I) : PoisonMaskElem;
8492 Data.first->reorderSplitNode(P.second ? 1 : 0, Mask, MaskOrder);
8494 if (!OpTE.ReorderIndices.empty()) {
8495 OpTE.ReorderIndices.clear();
8496 } else if (!OpTE.ReuseShuffleIndices.empty()) {
8499 assert(OpTE.isGather() && "Expected only gather/buildvector node.");
8503 if (Data.first->ReuseShuffleIndices.empty() &&
8504 !Data.first->ReorderIndices.empty()) {
8507 Queue.push(Data.first);
8513 buildReorderableOperands(Data.first, Data.second, NonVectorized,
8525 for (const auto &Op : Data.second) {
8526 TreeEntry *OpTE = Op.second;
8527 if (!VisitedOps.insert(OpTE).second)
8529 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
8531 const auto Order = [&]() -> const OrdersType {
8532 if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty())
8536 return OpTE->ReorderIndices;
8540 if (Order.size() == 1)
8546 Value *Root = OpTE->hasState()
8549 auto GetSameNodesUsers = [&](Value *Root) {
8551 for (const TreeEntry *TE : ValueToGatherNodes.lookup(Root)) {
8552 if (TE != OpTE && TE->UserTreeIndex &&
8553 TE->getVectorFactor() == OpTE->getVectorFactor() &&
8554 TE->Scalars.size() == OpTE->Scalars.size() &&
8555 ((TE->ReorderIndices.empty() && OpTE->isSame(TE->Scalars)) ||
8556 (OpTE->ReorderIndices.empty() && TE->isSame(OpTE->Scalars))))
8557 Res.insert(TE->UserTreeIndex.UserTE);
8559 for (const TreeEntry *TE : getTreeEntries(Root)) {
8560 if (TE != OpTE && TE->UserTreeIndex &&
8561 TE->getVectorFactor() == OpTE->getVectorFactor() &&
8562 TE->Scalars.size() == OpTE->Scalars.size() &&
8563 ((TE->ReorderIndices.empty() && OpTE->isSame(TE->Scalars)) ||
8564 (OpTE->ReorderIndices.empty() && TE->isSame(OpTE->Scalars))))
8565 Res.insert(TE->UserTreeIndex.UserTE);
8569 auto GetNumOperands = [](const TreeEntry *TE) {
8570 if (TE->State == TreeEntry::SplitVectorize)
8571 return TE->getNumOperands();
8573 return CI->arg_size();
8574 return TE->getNumOperands();
8576 auto NodeShouldBeReorderedWithOperands = [&, TTI = TTI](
8577 const TreeEntry *TE) {
8585 const TreeEntry *Op = getOperandEntry(TE, Idx);
8586 if (Op->isGather() && Op->hasState()) {
8587 const TreeEntry *VecOp =
8588 getSameValuesTreeEntry(Op->getMainOp(), Op->Scalars);
8592 if (Op->ReorderIndices.empty() && Op->ReuseShuffleIndices.empty())
8599 if (!RevisitedOps.insert(UTE).second)
8601 return UTE == Data.first || !UTE->ReorderIndices.empty() ||
8602 !UTE->ReuseShuffleIndices.empty() ||
8603 (UTE->UserTreeIndex &&
8604 UTE->UserTreeIndex.UserTE == Data.first) ||
8605 (Data.first->UserTreeIndex &&
8606 Data.first->UserTreeIndex.UserTE == UTE) ||
8607 (IgnoreReorder && UTE->UserTreeIndex &&
8608 UTE->UserTreeIndex.UserTE->Idx == 0) ||
8609 NodeShouldBeReorderedWithOperands(UTE);
8612 for (TreeEntry *UTE : Users) {
8620 const TreeEntry *Op = getOperandEntry(UTE, Idx);
8622 Queue.push(const_cast<TreeEntry *>(Op));
8627 Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
8628 return P.second == OpTE;
8631 if (OpTE->State == TreeEntry::Vectorize &&
8632 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
8633 assert(!OpTE->isAltShuffle() &&
8634 "Alternate instructions are only supported by BinaryOperator "
8638 unsigned E = Order.size();
8641 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
8644 OrdersUses.try_emplace(CurrentOrder, 0).first->second += NumOps;
8646 OrdersUses.try_emplace(Order, 0).first->second += NumOps;
8648 auto Res = OrdersUses.try_emplace(OrdersType(), 0);
8649 const auto AllowsReordering = [&](const TreeEntry *TE) {
8650 if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
8651 (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
8652 (IgnoreReorder && TE->Idx == 0))
8654 if (TE->isGather()) {
8664 if (OpTE->UserTreeIndex) {
8665 TreeEntry *UserTE = OpTE->UserTreeIndex.UserTE;
8666 if (!VisitedUsers.insert(UserTE).second)
8671 if (AllowsReordering(UserTE))
8679 if (static_cast<unsigned>(count_if(
8680 Ops, [UserTE, &AllowsReordering](
8681 const std::pair<unsigned, TreeEntry *> &Op) {
8682 return AllowsReordering(Op.second) &&
8683 Op.second->UserTreeIndex.UserTE == UserTE;
8684 })) <= Ops.size() / 2)
8685 ++Res.first->second;
8688 if (OrdersUses.empty()) {
8693 unsigned IdentityCnt = 0;
8694 unsigned VF = Data.second.front().second->getVectorFactor();
8696 for (auto &Pair : OrdersUses) {
8698 IdentityCnt += Pair.second;
8703 unsigned Cnt = IdentityCnt;
8704 for (auto &Pair : OrdersUses) {
8708 if (Cnt < Pair.second) {
8710 BestOrder = Pair.first;
8727 unsigned E = BestOrder.size();
8729 return I < E ? static_cast<int>(I) : PoisonMaskElem;
8731 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) {
8732 TreeEntry *TE = Op.second;
8733 if (!VisitedOps.insert(TE).second)
8735 if (TE->ReuseShuffleIndices.size() == BestOrder.size()) {
8736 reorderNodeWithReuses(*TE, Mask);
8740 if (TE->State != TreeEntry::Vectorize &&
8741 TE->State != TreeEntry::StridedVectorize &&
8742 TE->State != TreeEntry::CompressVectorize &&
8743 TE->State != TreeEntry::SplitVectorize &&
8744 (TE->State != TreeEntry::ScatterVectorize ||
8745 TE->ReorderIndices.empty()))
8747 assert((BestOrder.size() == TE->ReorderIndices.size() ||
8748 TE->ReorderIndices.empty()) &&
8749 "Non-matching sizes of user/operand entries.");
8751 if (IgnoreReorder && TE == VectorizableTree.front().get())
8752 IgnoreReorder = false;
8755 for (TreeEntry *Gather : GatherOps) {
8757 "Unexpected reordering of gathers.");
8758 if (!Gather->ReuseShuffleIndices.empty()) {
8768 auto IsNotProfitableAltCodeNode = [](const TreeEntry &TE) {
8769 return TE.isAltShuffle() &&
8770 (!TE.ReuseShuffleIndices.empty() || TE.getVectorFactor() == 2 ||
8771 TE.ReorderIndices.empty());
8773 if (Data.first->State != TreeEntry::Vectorize ||
8775 Data.first->getMainOp()) ||
8776 IsNotProfitableAltCodeNode(*Data.first))
8777 Data.first->reorderOperands(Mask);
8779 IsNotProfitableAltCodeNode(*Data.first) ||
8780 Data.first->State == TreeEntry::StridedVectorize ||
8781 Data.first->State == TreeEntry::CompressVectorize) {
8785 if (Data.first->ReuseShuffleIndices.empty() &&
8786 !Data.first->ReorderIndices.empty() &&
8787 !IsNotProfitableAltCodeNode(*Data.first)) {
8790 Queue.push(Data.first);
8798 if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
8799 VectorizableTree.front()->ReuseShuffleIndices.empty())
8800 VectorizableTree.front()->ReorderIndices.clear();
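// Illustrative sketch (not part of the pass): the voting scheme used above.
// Each candidate order gets a use count; an empty order stands for the
// identity permutation, and the identity wins ties so that no shuffle is
// emitted unless a non-identity order is strictly more popular.
// Hypothetical, simplified types.
//
//   #include <map>
//   #include <vector>
//
//   namespace example {
//   using OrdersTy = std::vector<unsigned>; // empty == identity order
//
//   inline OrdersTy pickBestOrder(const std::map<OrdersTy, unsigned> &Uses) {
//     OrdersTy Best; // identity by default
//     unsigned BestCnt = Uses.count(OrdersTy()) ? Uses.at(OrdersTy()) : 0;
//     for (const auto &[Order, Cnt] : Uses)
//       if (Cnt > BestCnt) { // strict '>' keeps identity on ties
//         Best = Order;
//         BestCnt = Cnt;
//       }
//     return Best;
//   }
//   } // namespace example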
8803Instruction *BoUpSLP::getRootEntryInstruction(const TreeEntry &Entry) const {
8804 if (Entry.hasState() &&
8805 (Entry.getOpcode() == Instruction::Store ||
8806 Entry.getOpcode() == Instruction::Load) &&
8807 Entry.State == TreeEntry::StridedVectorize &&
8808 !Entry.ReorderIndices.empty() && isReverseOrder(Entry.ReorderIndices))
8815 const size_t NumVectScalars = ScalarToTreeEntries.size() + 1;
8818 for (auto &TEPtr : VectorizableTree) {
8819 TreeEntry *Entry = TEPtr.get();
8822 if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize)
8826 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
8827 Value *Scalar = Entry->Scalars[Lane];
8832 auto It = ScalarToExtUses.find(Scalar);
8833 if (It != ScalarToExtUses.end() && !ExternalUses[It->second].User)
8836 if (Scalar->hasNUsesOrMore(NumVectScalars)) {
8837 unsigned FoundLane = Entry->findLaneForValue(Scalar);
8838 LLVM_DEBUG(dbgs() << "SLP: Need to extract from lane " << FoundLane
8839 << " from " << *Scalar << "for many users.\n");
8840 It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
8841 ExternalUses.emplace_back(Scalar, nullptr, *Entry, FoundLane);
8842 ExternalUsesWithNonUsers.insert(Scalar);
8847 const auto ExtI = ExternallyUsedValues.find(Scalar);
8848 if (ExtI != ExternallyUsedValues.end()) {
8849 unsigned FoundLane = Entry->findLaneForValue(Scalar);
8850 LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
8851 << FoundLane << " from " << *Scalar << ".\n");
8852 ScalarToExtUses.try_emplace(Scalar, ExternalUses.size());
8853 ExternalUses.emplace_back(Scalar, nullptr, *Entry, FoundLane);
8864 if (UserIgnoreList && UserIgnoreList->contains(UserInst))
8869 !UseEntries.empty()) {
8873 if (!((Scalar->getType()->getScalarType()->isPointerTy() &&
8876 all_of(UseEntries, [&](TreeEntry *UseEntry) {
8877 return UseEntry->State == TreeEntry::ScatterVectorize ||
8879 Scalar, getRootEntryInstruction(*UseEntry), TLI,
8882 LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
8885 [](TreeEntry *UseEntry) {
8886 return UseEntry->isGather();
8892 if (It != ScalarToExtUses.end()) {
8893 ExternalUses[It->second].User = nullptr;
8898 if (U && Scalar->hasNUsesOrMore(UsesLimit))
8900 unsigned FoundLane = Entry->findLaneForValue(Scalar);
8902 << " from lane " << FoundLane << " from " << *Scalar
8904 It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
8905 ExternalUses.emplace_back(Scalar, U, *Entry, FoundLane);
8906 ExternalUsesWithNonUsers.insert(Scalar);
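// Illustrative sketch (not part of the pass): the external-use bookkeeping
// built above. A scalar that lives inside the vectorized tree but is still
// used by an out-of-tree instruction must later be extracted from the
// vector at its lane; one record per (scalar, user) pair is enough.
// Hypothetical, simplified structures.
//
//   #include <vector>
//
//   namespace example {
//   struct ExternalUse {
//     int Scalar;    // stand-in for the vectorized scalar value
//     int User;      // stand-in for the out-of-tree user (-1: many/unknown)
//     unsigned Lane; // lane to extractelement from
//   };
//
//   inline void addExternalUse(std::vector<ExternalUse> &Uses, int Scalar,
//                              int User, unsigned Lane) {
//     // The real code also deduplicates via a map from scalar to the
//     // recorded use index, and clears User when too many users exist.
//     Uses.push_back({Scalar, User, Lane});
//   }
//   } // namespace example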
8915BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
8919 for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) {
8920 Value *V = TE->Scalars[Lane];
8933 if (SI == nullptr || !SI->isSimple() || SI->getFunction() != F ||
8942 auto &StoresVec = PtrToStoresMap[{SI->getParent(),
8943 SI->getValueOperand()->getType(), Ptr}];
8946 if (StoresVec.size() > Lane)
8948 if (!StoresVec.empty()) {
8950 SI->getValueOperand()->getType(), SI->getPointerOperand(),
8951 SI->getValueOperand()->getType(),
8952 StoresVec.front()->getPointerOperand(), *DL, *SE,
8958 StoresVec.push_back(SI);
8963 for (auto &P : PtrToStoresMap) {
8978 StoreInst *S0 = StoresVec[0];
8983 StoreInst *SI = StoresVec[Idx];
8984 std::optional<int64_t> Diff =
8986 SI->getPointerOperand(), *DL, *SE,
8992 if (StoreOffsetVec.size() != StoresVec.size())
8994 sort(StoreOffsetVec, llvm::less_first());
8996 int64_t PrevDist = 0;
8997 for (const auto &P : StoreOffsetVec) {
8998 if (Idx > 0 && P.first != PrevDist + 1)
9006 ReorderIndices.assign(StoresVec.size(), 0);
9007 bool IsIdentity = true;
9009 ReorderIndices[P.second] = I;
9010 IsIdentity &= P.second == I;
9016 ReorderIndices.clear();
9023 for (unsigned Idx : Order)
9024 dbgs() << Idx << ", ";
9030BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const {
9031 unsigned NumLanes = TE->Scalars.size();
9044 if (StoresVec.size() != NumLanes)
9049 if (!canFormVector(StoresVec, ReorderIndices))
9054 ExternalReorderIndices.push_back(ReorderIndices);
9056 return ExternalReorderIndices;
9062 assert(TreeEntryToStridedPtrInfoMap.empty() &&
9063 "TreeEntryToStridedPtrInfoMap is not cleared");
9064 UserIgnoreList = &UserIgnoreLst;
9067 buildTreeRec(Roots, 0, EdgeInfo());
9072 assert(TreeEntryToStridedPtrInfoMap.empty() &&
9073 "TreeEntryToStridedPtrInfoMap is not cleared");
9076 buildTreeRec(Roots, 0, EdgeInfo());
9085 bool AddNew = true) {
9093 for (Value *V : VL) {
9097 if (R.isDeleted(LI) || R.isVectorized(LI) || !LI->isSimple())
9099 bool IsFound = false;
9100 for (auto [Map, Data] : zip(ClusteredDistToLoad, ClusteredLoads)) {
9101 assert(LI->getParent() == Data.front().first->getParent() &&
9102 LI->getType() == Data.front().first->getType() &&
9106 "Expected loads with the same type, same parent and same "
9107 "underlying pointer.");
9109 LI->getType(), LI->getPointerOperand(), Data.front().first->getType(),
9110 Data.front().first->getPointerOperand(), DL, SE,
9114 auto It = Map.find(*Dist);
9115 if (It != Map.end() && It->second != LI)
9117 if (It == Map.end()) {
9118 Data.emplace_back(LI, *Dist);
9119 Map.try_emplace(*Dist, LI);
9129 auto FindMatchingLoads =
9134 int64_t &Offset, unsigned &Start) {
9136 return GatheredLoads.end();
9145 std::optional<int64_t> Dist =
9147 Data.front().first->getType(),
9148 Data.front().first->getPointerOperand(), DL, SE,
9154 for (std::pair<LoadInst *, int64_t> P : Data) {
9160 unsigned NumUniques = 0;
9161 for (auto [Cnt, Pair] : enumerate(Loads)) {
9162 bool Used = DataLoads.contains(Pair.first);
9163 if (!Used && !DataDists.contains(*Dist + Pair.second)) {
9167 Repeated.insert(Cnt);
9170 if (NumUniques > 0 &&
9171 (Loads.size() == NumUniques ||
9172 (Loads.size() - NumUniques >= 2 &&
9173 Loads.size() - NumUniques >= Loads.size() / 2 &&
9179 return std::next(GatheredLoads.begin(), Idx);
9183 return GatheredLoads.end();
9185 for (ArrayRef<std::pair<LoadInst *, int64_t>> Data : ClusteredLoads) {
9189 auto *It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated,
9191 while (It != GatheredLoads.end()) {
9192 assert(!LocalToAdd.empty() && "Expected some elements to add.");
9193 for (unsigned Idx : LocalToAdd)
9196 It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated, Offset,
9200 return !ToAdd.contains(Idx) && !Repeated.contains(Idx);
9207 Loads.push_back(Data[Idx]);
9213 GatheredLoads, [&](ArrayRef<std::pair<LoadInst *, int64_t>> PD) {
9214 return PD.front().first->getParent() == LI->getParent() &&
9215 PD.front().first->getType() == LI->getType();
9217 while (It != GatheredLoads.end()) {
9220 std::next(It), GatheredLoads.end(),
9221 [&](ArrayRef<std::pair<LoadInst *, int64_t>> PD) {
9222 return PD.front().first->getParent() == LI->getParent() &&
9223 PD.front().first->getType() == LI->getType();
9227 GatheredLoads.emplace_back().append(Data.begin(), Data.end());
9228 AddNewLoads(GatheredLoads.emplace_back());
9233void BoUpSLP::tryToVectorizeGatheredLoads(
9234 const SmallMapVector<
9235 std::tuple<BasicBlock *, Value *, Type *>,
9238 GatheredLoadsEntriesFirst = VectorizableTree.size();
9241 LoadEntriesToVectorize.size());
9242 for (auto [Idx, Set] : zip(LoadEntriesToVectorize, LoadSetsToVectorize))
9243 Set.insert_range(VectorizableTree[Idx]->Scalars);
9246 auto LoadSorter = [](const std::pair<LoadInst *, int64_t> &L1,
9247 const std::pair<LoadInst *, int64_t> &L2) {
9248 return L1.second > L2.second;
9255 auto *Ty = getWidenedType(Loads.front()->getType(), Loads.size());
9256 return TTI->isLegalMaskedGather(Ty, Alignment) &&
9257 !TTI->forceScalarizeMaskedGather(Ty, Alignment);
9262 SmallVectorImpl<LoadInst *> &NonVectorized,
9263 bool Final, unsigned MaxVF) {
9265 unsigned StartIdx = 0;
9266 SmallVector<int> CandidateVFs;
9270 *TTI, Loads.front()->getType(), MaxVF);
9272 *TTI, Loads.front()->getType(), NumElts - 1)) {
9278 if (Final && CandidateVFs.empty())
9281 unsigned BestVF = Final ? CandidateVFs.back() : 0;
9282 for (unsigned NumElts : CandidateVFs) {
9283 if (Final && NumElts > BestVF)
9285 SmallVector<unsigned> MaskedGatherVectorized;
9286 for (unsigned Cnt = StartIdx, E = Loads.size(); Cnt < E;
9290 if (VectorizedLoads.count(Slice.front()) ||
9291 VectorizedLoads.count(Slice.back()) ||
9297 bool AllowToVectorize = false;
9300 bool IsLegalBroadcastLoad = TTI->isLegalBroadcastLoad(
9303 for (LoadInst *LI : Slice) {
9305 if (LI->hasOneUse())
9311 if (static_cast<unsigned int>(std::distance(
9312 LI->user_begin(), LI->user_end())) != LI->getNumUses())
9314 if (!IsLegalBroadcastLoad)
9318 for (User *U : LI->users()) {
9321 for (const TreeEntry *UTE : getTreeEntries(U)) {
9322 for (int I : seq<int>(UTE->getNumOperands())) {
9324 return V == LI || isa<PoisonValue>(V);
9334 AllowToVectorize = CheckIfAllowed(Slice);
9338 any_of(ValueToGatherNodes.at(Slice.front()),
9339 [=](const TreeEntry *TE) {
9340 return TE->Scalars.size() == 2 &&
9341 ((TE->Scalars.front() == Slice.front() &&
9342 TE->Scalars.back() == Slice.back()) ||
9343 (TE->Scalars.front() == Slice.back() &&
9344 TE->Scalars.back() == Slice.front()));
9349 if (AllowToVectorize) {
9354 reinterpret_cast<Value *const *>(Slice.begin()), Slice.size());
9355 StridedPtrInfo SPtrInfo;
9357 PointerOps, SPtrInfo, &BestVF);
9359 (BestVF > 1 && static_cast<unsigned>(NumElts) == 2 * BestVF)) {
9361 if (MaskedGatherVectorized.empty() ||
9362 Cnt >= MaskedGatherVectorized.back() + NumElts)
9367 Results.emplace_back(Values, LS);
9368 VectorizedLoads.insert_range(Slice);
9371 if (Cnt == StartIdx)
9372 StartIdx += NumElts;
9375 if (StartIdx >= Loads.size())
9379 if (!MaskedGatherVectorized.empty() &&
9380 Cnt < MaskedGatherVectorized.back() + NumElts)
9386 if (!AllowToVectorize || BestVF == 0)
9390 for (unsigned Cnt : MaskedGatherVectorized) {
9392 Cnt, std::min<unsigned>(NumElts, Loads.size() - Cnt));
9396 VectorizedLoads.insert_range(Slice);
9398 if (Cnt == StartIdx)
9399 StartIdx += NumElts;
9402 for (LoadInst *LI : Loads) {
9403 if (!VectorizedLoads.contains(LI))
9404 NonVectorized.push_back(LI);
9408 auto ProcessGatheredLoads =
9411 bool Final = false) {
9413 for (ArrayRef<std::pair<LoadInst *, int64_t>> LoadsDists :
9415 if (LoadsDists.size() <= 1) {
9416 NonVectorized.push_back(LoadsDists.back().first);
9424 unsigned MaxConsecutiveDistance = 0;
9425 unsigned CurrentConsecutiveDist = 1;
9426 int64_t LastDist = LocalLoadsDists.front().second;
9427 bool AllowMaskedGather = IsMaskedGatherSupported(OriginalLoads);
9428 for (const std::pair<LoadInst *, int64_t> &L : LocalLoadsDists) {
9431 assert(LastDist >= L.second &&
9432 "Expected first distance always not less than second");
9433 if (static_cast<uint64_t>(LastDist - L.second) ==
9434 CurrentConsecutiveDist) {
9435 ++CurrentConsecutiveDist;
9436 MaxConsecutiveDistance =
9437 std::max(MaxConsecutiveDistance, CurrentConsecutiveDist);
9441 if (!AllowMaskedGather && CurrentConsecutiveDist == 1 &&
9444 CurrentConsecutiveDist = 1;
9445 LastDist = L.second;
9448 if (Loads.size() <= 1)
9450 if (AllowMaskedGather)
9451 MaxConsecutiveDistance = Loads.size();
9452 else if (MaxConsecutiveDistance < 2)
9457 GetVectorizedRanges(Loads, VectorizedLoads, SortedNonVectorized,
9458 Final, MaxConsecutiveDistance);
9460 OriginalLoads.size() == Loads.size() &&
9461 MaxConsecutiveDistance == Loads.size() &&
9466 VectorizedLoads.clear();
9470 GetVectorizedRanges(OriginalLoads, VectorizedLoads,
9471 UnsortedNonVectorized, Final,
9472 OriginalLoads.size());
9473 if (SortedNonVectorized.size() >= UnsortedNonVectorized.size()) {
9474 SortedNonVectorized.swap(UnsortedNonVectorized);
9475 Results.swap(UnsortedResults);
9480 << Slice.size() << ")\n");
9482 for (Value *L : Slice)
9490 unsigned MaxVF = Slice.size();
9491 unsigned UserMaxVF = 0;
9492 unsigned InterleaveFactor = 0;
9497 std::optional<unsigned> InterleavedLoadsDistance = 0;
9499 std::optional<unsigned> CommonVF = 0;
9500 DenseMap<const TreeEntry *, unsigned> EntryToPosition;
9501 SmallPtrSet<const TreeEntry *, 8> DeinterleavedNodes;
9502 for (auto [Idx, V] : enumerate(Slice)) {
9503 for (const TreeEntry *E : ValueToGatherNodes.at(V)) {
9504 UserMaxVF = std::max<unsigned>(UserMaxVF, E->Scalars.size());
9507 UserMaxVF = std::max<unsigned>(UserMaxVF, Idx - Pos + 1);
9509 if (*CommonVF == 0) {
9510 CommonVF = E->Scalars.size();
9513 if (*CommonVF != E->Scalars.size())
9517 if (Pos != Idx && InterleavedLoadsDistance) {
9520 if (isa<Constant>(V))
9522 if (isVectorized(V))
9524 const auto &Nodes = ValueToGatherNodes.at(V);
9525 return (Nodes.size() != 1 || !Nodes.contains(E)) &&
9526 !is_contained(Slice, V);
9528 InterleavedLoadsDistance.reset();
9532 if (*InterleavedLoadsDistance == 0) {
9533 InterleavedLoadsDistance = Idx - Pos;
9536 if ((Idx - Pos) % *InterleavedLoadsDistance != 0 ||
9537 (Idx - Pos) / *InterleavedLoadsDistance < Order)
9538 InterleavedLoadsDistance.reset();
9539 Order = (Idx - Pos) / InterleavedLoadsDistance.value_or(1);
9543 DeinterleavedNodes.clear();
9545 if (InterleavedLoadsDistance.value_or(0) > 1 &&
9546 CommonVF.value_or(0) != 0) {
9547 InterleaveFactor = bit_ceil(*InterleavedLoadsDistance);
9548 unsigned VF = *CommonVF;
9551 StridedPtrInfo SPtrInfo;
9553 if (InterleaveFactor <= Slice.size() &&
9554 TTI.isLegalInterleavedAccessType(
9562 UserMaxVF = InterleaveFactor * VF;
9564 InterleaveFactor = 0;
9569 unsigned ConsecutiveNodesSize = 0;
9570 if (!LoadEntriesToVectorize.empty() && InterleaveFactor == 0 &&
9571 any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
9572 [&, Slice = Slice](const auto &P) {
9574 return std::get<1>(P).contains(V);
9576 if (It == Slice.end())
9578 const TreeEntry &TE = *VectorizableTree[std::get<0>(P)];
9583 StridedPtrInfo SPtrInfo;
9585 VL, VL.front(), Order, PointerOps, SPtrInfo);
9589 ConsecutiveNodesSize += VL.size();
9590 size_t Start = std::distance(Slice.begin(), It);
9591 size_t Sz = Slice.size() - Start;
9592 return Sz < VL.size() ||
9593 Slice.slice(Start, VL.size()) != VL;
9598 if (InterleaveFactor == 0 &&
9600 [&, Slice = Slice](unsigned Idx) {
9602 SmallVector<Value *> PointerOps;
9603 StridedPtrInfo SPtrInfo;
9604 return canVectorizeLoads(
9605 Slice.slice(Idx * UserMaxVF, UserMaxVF),
9606 Slice[Idx * UserMaxVF], Order, PointerOps,
9607 SPtrInfo) == LoadsState::ScatterVectorize;
9610 if (Slice.size() != ConsecutiveNodesSize)
9611 MaxVF = std::min<unsigned>(MaxVF, UserMaxVF);
9613 for (unsigned VF = MaxVF; VF >= 2; VF /= 2) {
9614 bool IsVectorized = true;
9615 for (unsigned I = 0, E = Slice.size(); I < E; I += VF) {
9617 Slice.slice(I, std::min(VF, E - I));
9622 if (any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
9623 [&](const auto &P) {
9625 VectorizableTree[std::get<0>(P)]
9630 unsigned Sz = VectorizableTree.size();
9631 buildTreeRec(SubSlice, 0, EdgeInfo(), InterleaveFactor);
9632 if (Sz == VectorizableTree.size()) {
9633 IsVectorized = false;
9636 if (InterleaveFactor > 0) {
9637 VF = 2 * (MaxVF / InterleaveFactor);
9638 InterleaveFactor = 0;
9647 NonVectorized.append(SortedNonVectorized);
9649 return NonVectorized;
9651 for (const auto &GLs : GatheredLoads) {
9652 const auto &Ref = GLs.second;
9654 if (!Ref.empty() && !NonVectorized.empty() &&
9656 Ref.begin(), Ref.end(), 0u,
9657 [](unsigned S, ArrayRef<std::pair<LoadInst *, int64_t>> LoadsDists)
9658 -> unsigned { return S + LoadsDists.size(); }) !=
9659 NonVectorized.size() &&
9660 IsMaskedGatherSupported(NonVectorized)) {
9663 for (LoadInst *LI : NonVectorized) {
9671 (void)ProcessGatheredLoads(FinalGatheredLoads, true);
9675 for (unsigned Idx : LoadEntriesToVectorize) {
9676 const TreeEntry &E = *VectorizableTree[Idx];
9679 if (!E.ReorderIndices.empty()) {
9682 SmallVector<int> ReorderMask;
9686 buildTreeRec(GatheredScalars, 0, EdgeInfo());
9690 if (static_cast<unsigned>(*GatheredLoadsEntriesFirst) ==
9691 VectorizableTree.size())
9692 GatheredLoadsEntriesFirst.reset();
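// Illustrative sketch (not part of the pass): clustering loads by constant
// pointer distance from a cluster head, as the gathered-loads logic above
// does via getPointersDiff(). Distances are in elements; a load with no
// computable constant distance starts a new cluster, and a distance that
// is already occupied by a different load is rejected. Hypothetical IDs
// stand in for LoadInst pointers.
//
//   #include <cstdint>
//   #include <map>
//   #include <optional>
//   #include <vector>
//
//   namespace example {
//   struct Cluster {
//     std::vector<std::pair<int, int64_t>> Loads; // (load id, distance)
//     std::map<int64_t, int> DistToLoad;          // reject duplicate slots
//   };
//
//   inline bool tryAddToCluster(Cluster &C, int LoadId,
//                               std::optional<int64_t> Dist) {
//     if (!Dist)
//       return false; // unknown distance: belongs to another cluster
//     auto [It, Inserted] = C.DistToLoad.try_emplace(*Dist, LoadId);
//     if (!Inserted)
//       return It->second == LoadId; // slot taken by another load
//     C.Loads.emplace_back(LoadId, *Dist);
//     return true;
//   }
//   } // namespace example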
9702 bool AllowAlternate) {
9725 isValidForAlternation(I->getOpcode())) {
9737 std::pair<size_t, size_t> OpVals =
9745 if (CI->isCommutative())
9767 SubKey = hash_value(Gep->getPointerOperand());
9779 return std::make_pair(Key, SubKey);
9785 Instruction *AltOp, const TargetLibraryInfo &TLI);
9787bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S,
9789 Type *ScalarTy = S.getMainOp()->getType();
9790 unsigned Opcode0 = S.getOpcode();
9791 unsigned Opcode1 = S.getAltOpcode();
9792 SmallBitVector OpcodeMask(getAltInstrMask(VL, ScalarTy, Opcode0, Opcode1));
9795 Opcode1, OpcodeMask))
9798 for (unsigned I : seq<unsigned>(S.getMainOp()->getNumOperands())) {
9801 for (Value *V : VL) {
9803 Operands.back().push_back(
9810 if (Operands.size() == 2) {
9814 Candidates[0] = std::make_pair(Operands[0][I], Operands[0][I + 1]);
9815 Candidates[1] = std::make_pair(Operands[0][I], Operands[1][I + 1]);
9816 Candidates[2] = std::make_pair(Operands[1][I], Operands[0][I + 1]);
9818 switch (Res.value_or(0)) {
9822 std::swap(Operands[0][I + 1], Operands[1][I + 1]);
9832 DenseSet<unsigned> UniqueOpcodes;
9833 constexpr unsigned NumAltInsts = 3;
9834 unsigned NonInstCnt = 0;
9837 unsigned UndefCnt = 0;
9839 unsigned ExtraShuffleInsts = 0;
9842 if (Operands.size() == 2) {
9844 if (Operands.front() == Operands.back()) {
9848 return is_contained(Operands.back(), V);
9851 ++ExtraShuffleInsts;
9854 const Loop *L = LI->getLoopFor(S.getMainOp()->getParent());
9866 DenseMap<Value *, unsigned> Uniques;
9876 if (!Res.second && Res.first->second == 1)
9877 ++ExtraShuffleInsts;
9878 ++Res.first->getSecond();
9880 UniqueOpcodes.insert(I->getOpcode());
9881 else if (Res.second)
9884 return none_of(Uniques, [&](const auto &P) {
9885 return P.first->hasNUsesOrMore(P.second + 1) &&
9886 none_of(P.first->users(), [&](User *U) {
9887 return isVectorized(U) || Uniques.contains(U);
9896 (UndefCnt < (VL.size() - 1) * S.getMainOp()->getNumOperands() &&
9897 (UniqueOpcodes.size() + NonInstCnt + ExtraShuffleInsts +
9898 NumAltInsts) < S.getMainOp()->getNumOperands() * VL.size());
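// Illustrative sketch (not part of the pass): the alternate-opcode bitmask
// consulted above. Bit I is set when lane I executes the alternate opcode;
// that is exactly the shuffle selector later needed to blend the two
// vectorized halves of an alt-shuffle node. Simplified, hypothetical types.
//
//   #include <vector>
//
//   namespace example {
//   inline std::vector<bool>
//   buildAltMask(const std::vector<unsigned> &LaneOpcodes, unsigned MainOpc,
//                unsigned AltOpc) {
//     std::vector<bool> Mask(LaneOpcodes.size(), false);
//     for (size_t I = 0; I < LaneOpcodes.size(); ++I)
//       Mask[I] = LaneOpcodes[I] == AltOpc && AltOpc != MainOpc;
//     return Mask;
//   }
//   } // namespace example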
9905 const unsigned VF, unsigned MinBW,
9928static std::pair<InstructionCost, InstructionCost>
9948 FMF = FPCI->getFastMathFlags();
9951 LibCost.isValid() ? LibCost : ScalarLimit);
9961BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
9963 bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
9964 SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo) {
9966 "Expected instructions with same/alternate opcodes only.");
9968 unsigned ShuffleOrOp =
9969 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
9971 switch (ShuffleOrOp) {
9972 case Instruction::PHI: {
9975 return TreeEntry::NeedToGather;
9977 for (Value *V : VL) {
9981 for (Value *Incoming : PHI->incoming_values()) {
9983 if (Term && Term->isTerminator()) {
9985 << "SLP: Need to swizzle PHINodes (terminator use).\n");
9986 return TreeEntry::NeedToGather;
9991 return TreeEntry::Vectorize;
9993 case Instruction::ExtractElement:
10000 return TreeEntry::NeedToGather;
10002 case Instruction::ExtractValue: {
10003 bool Reuse = canReuseExtract(VL, CurrentOrder);
10007 return TreeEntry::NeedToGather;
10008 if (Reuse || !CurrentOrder.empty())
10009 return TreeEntry::Vectorize;
10011 return TreeEntry::NeedToGather;
10013 case Instruction::InsertElement: {
10017 for (Value *V : VL) {
10019 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement/poison vector.\n");
10020 return TreeEntry::NeedToGather;
10024 "Non-constant or undef index?");
10028 return !SourceVectors.contains(V);
10031 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
10032 "different source vectors.\n");
10033 return TreeEntry::NeedToGather;
10038 return SourceVectors.contains(V) && !V->hasOneUse();
10041 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
10042 "multiple uses.\n");
10043 return TreeEntry::NeedToGather;
10046 return TreeEntry::Vectorize;
10048 case Instruction::Load: {
10055 auto IsGatheredNode = [&]() {
10056 if (!GatheredLoadsEntriesFirst)
10061 return any_of(getTreeEntries(V), [&](const TreeEntry *TE) {
10062 return TE->Idx >= *GatheredLoadsEntriesFirst;
10068 return TreeEntry::Vectorize;
10070 if (!IsGraphTransformMode && !VectorizableTree.empty()) {
10072 LoadEntriesToVectorize.insert(VectorizableTree.size());
10073 return TreeEntry::NeedToGather;
10075 return IsGatheredNode() ? TreeEntry::NeedToGather
10076 : TreeEntry::CompressVectorize;
10078 if (!IsGraphTransformMode && !VectorizableTree.empty()) {
10080 LoadEntriesToVectorize.insert(VectorizableTree.size());
10081 return TreeEntry::NeedToGather;
10083 return IsGatheredNode() ? TreeEntry::NeedToGather
10084 : TreeEntry::ScatterVectorize;
10086 if (!IsGraphTransformMode && VectorizableTree.size() > 1) {
10088 LoadEntriesToVectorize.insert(VectorizableTree.size());
10089 return TreeEntry::NeedToGather;
10091 return IsGatheredNode() ? TreeEntry::NeedToGather
10092 : TreeEntry::StridedVectorize;
10096 if (DL->getTypeSizeInBits(ScalarTy) !=
10097 DL->getTypeAllocSizeInBits(ScalarTy))
10098 LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
10101 return !LI || !LI->isSimple();
10105 LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
10108 return TreeEntry::NeedToGather;
10112 case Instruction::ZExt:
10113 case Instruction::SExt:
10114 case Instruction::FPToUI:
10115 case Instruction::FPToSI:
10116 case Instruction::FPExt:
10117 case Instruction::PtrToInt:
10118 case Instruction::IntToPtr:
10119 case Instruction::SIToFP:
10120 case Instruction::UIToFP:
10121 case Instruction::Trunc:
10122 case Instruction::FPTrunc:
10123 case Instruction::BitCast: {
10125 for (Value *V : VL) {
10131 dbgs() << "SLP: Gathering casts with different src types.\n");
10132 return TreeEntry::NeedToGather;
10135 return TreeEntry::Vectorize;
10137 case Instruction::ICmp:
10138 case Instruction::FCmp: {
10143 for (Value *V : VL) {
10147 if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
10148 Cmp->getOperand(0)->getType() != ComparedTy) {
10149 LLVM_DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
10150 return TreeEntry::NeedToGather;
10153 return TreeEntry::Vectorize;
10155 case Instruction::Select:
10156 case Instruction::FNeg:
10157 case Instruction::Add:
10158 case Instruction::FAdd:
10159 case Instruction::Sub:
10160 case Instruction::FSub:
10161 case Instruction::Mul:
10162 case Instruction::FMul:
10163 case Instruction::UDiv:
10164 case Instruction::SDiv:
10165 case Instruction::FDiv:
10166 case Instruction::URem:
10167 case Instruction::SRem:
10168 case Instruction::FRem:
10169 case Instruction::Shl:
10170 case Instruction::LShr:
10171 case Instruction::AShr:
10172 case Instruction::And:
10173 case Instruction::Or:
10174 case Instruction::Xor:
10175 case Instruction::Freeze:
10176 if (S.getMainOp()->getType()->isFloatingPointTy() &&
10177 TTI->isFPVectorizationPotentiallyUnsafe() &&
10178 any_of(VL, [](Value *V) {
10179 return I && I->isBinaryOp() && !I->isFast();
10181 return TreeEntry::NeedToGather;
10182 return TreeEntry::Vectorize;
10183 case Instruction::GetElementPtr: {
10185 for (Value *V : VL) {
10189 if (I->getNumOperands() != 2) {
10190 LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
10191 return TreeEntry::NeedToGather;
10198 for (Value *V : VL) {
10202 Type *CurTy = GEP->getSourceElementType();
10203 if (Ty0 != CurTy) {
10204 LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
10205 return TreeEntry::NeedToGather;
10211 for (Value *V : VL) {
10215 auto *Op = I->getOperand(1);
10217 (Op->getType() != Ty1 &&
10219 Op->getType()->getScalarSizeInBits() >
10220 DL->getIndexSizeInBits(
10221 V->getType()->getPointerAddressSpace())))) {
10223 dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
10224 return TreeEntry::NeedToGather;
10228 return TreeEntry::Vectorize;
10230 case Instruction::Store: {
10232 llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
10235 if (DL->getTypeSizeInBits(ScalarTy) !=
10236 DL->getTypeAllocSizeInBits(ScalarTy)) {
10237 LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n");
10238 return TreeEntry::NeedToGather;
10242 for (Value *V : VL) {
10244 if (!SI->isSimple()) {
10246 return TreeEntry::NeedToGather;
10255 if (CurrentOrder.empty()) {
10256 Ptr0 = PointerOps.front();
10257 PtrN = PointerOps.back();
10259 Ptr0 = PointerOps[CurrentOrder.front()];
10260 PtrN = PointerOps[CurrentOrder.back()];
10262 std::optional<int64_t> Dist =
10265 if (static_cast<uint64_t>(*Dist) == VL.size() - 1)
10266 return TreeEntry::Vectorize;
10270 return TreeEntry::NeedToGather;
10272 case Instruction::Call: {
10273 if (S.getMainOp()->getType()->isFloatingPointTy() &&
10274 TTI->isFPVectorizationPotentiallyUnsafe() &&
10275 any_of(VL, [](Value *V) {
10276 return I && !I->isFast();
10278 return TreeEntry::NeedToGather;
10288 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
10292 return TreeEntry::NeedToGather;
10295 unsigned NumArgs = CI->arg_size();
10297 for (unsigned J = 0; J != NumArgs; ++J)
10300 for (Value *V : VL) {
10305 VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) ||
10307 LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V
10309 return TreeEntry::NeedToGather;
10313 for (unsigned J = 0; J != NumArgs; ++J) {
10316 if (ScalarArgs[J] != A1J) {
10318 << "SLP: mismatched arguments in call:" << *CI
10319 << " argument " << ScalarArgs[J] << "!=" << A1J << "\n");
10320 return TreeEntry::NeedToGather;
10329 LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI
10330 << "!=" << *V << '\n');
10331 return TreeEntry::NeedToGather;
10336 auto *VecTy = getWidenedType(S.getMainOp()->getType(), VL.size());
10338 if (!VecCallCosts.first.isValid() && !VecCallCosts.second.isValid())
10339 return TreeEntry::NeedToGather;
10341 return TreeEntry::Vectorize;
10343 case Instruction::ShuffleVector: {
10344 if (!S.isAltShuffle()) {
10347 return TreeEntry::Vectorize;
10350 LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
10351 return TreeEntry::NeedToGather;
10356 << "SLP: ShuffleVector not vectorized, operands are buildvector and "
10357 "the whole alt sequence is not profitable.\n");
10358 return TreeEntry::NeedToGather;
10361 return TreeEntry::Vectorize;
10365 return TreeEntry::NeedToGather;
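// Illustrative sketch (not part of the pass): the consecutiveness test used
// for the Store case above. If the element distance between the first and
// last pointer (after applying the current order) equals VL.size() - 1, the
// accesses cover a dense slice and can become one wide access. Hypothetical
// standalone version.
//
//   #include <cstdint>
//   #include <optional>
//
//   namespace example {
//   inline bool isDenseSlice(std::optional<int64_t> FirstToLastDist,
//                            size_t NumElements) {
//     return FirstToLastDist &&
//            static_cast<uint64_t>(*FirstToLastDist) == NumElements - 1;
//   }
//   } // namespace example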
10374 PHINode *Main = nullptr;
10379 PHIHandler() = delete;
10381 : DT(DT), Main(Main), Phis(Phis),
10382 Operands(Main->getNumIncomingValues(),
10384 void buildOperands() {
10385 constexpr unsigned FastLimit = 4;
10394 for (auto [Idx, V] : enumerate(Phis)) {
10398 "Expected isa instruction or poison value.");
10399 Operands[I][Idx] = V;
10402 if (P->getIncomingBlock(I) == InBB)
10403 Operands[I][Idx] = P->getIncomingValue(I);
10405 Operands[I][Idx] = P->getIncomingValueForBlock(InBB);
10410 SmallMapVector<BasicBlock *, SmallVector<unsigned>, 4>
10420 for (auto [Idx, V] : enumerate(Phis)) {
10423 Operands[I][Idx] = V;
10432 Operands[I][Idx] = P->getIncomingValue(I);
10435 auto *It = Blocks.find(InBB);
10436 if (It == Blocks.end())
10438 Operands[It->second.front()][Idx] = P->getIncomingValue(I);
10441 for (const auto &P : Blocks) {
10442 ArrayRef<unsigned> IncomingValues = P.second;
10443 if (IncomingValues.size() <= 1)
10446 for (unsigned I : IncomingValues) {
10448 [&](const auto &Data) {
10449 return !Data.value() ||
10450 Data.value() == Operands[BasicI][Data.index()];
10452 "Expected empty operands list.");
10453 Operands[I] = Operands[BasicI];
10466static std::pair<Instruction *, Instruction *>
10470 for (Value *V : VL) {
10480 if (MainOp->getOpcode() == I->getOpcode()) {
10499 "Expected different main and alt instructions.");
10500 return std::make_pair(MainOp, AltOp);
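// Illustrative sketch (not part of the pass): splitting a bundle into a
// main and an alternate opcode, in the spirit of the helper above. The
// first instruction fixes the main opcode; the first lane with a different
// opcode fixes the alternate one; more than two distinct opcodes fail.
// Hypothetical simplified representation.
//
//   #include <optional>
//   #include <utility>
//   #include <vector>
//
//   namespace example {
//   inline std::optional<std::pair<unsigned, unsigned>>
//   pickMainAndAlt(const std::vector<unsigned> &LaneOpcodes) {
//     if (LaneOpcodes.empty())
//       return std::nullopt;
//     unsigned Main = LaneOpcodes.front(), Alt = Main;
//     for (unsigned Opc : LaneOpcodes) {
//       if (Opc == Main || Opc == Alt)
//         continue;
//       if (Alt == Main) {
//         Alt = Opc; // first divergent opcode becomes the alternate
//         continue;
//       }
//       return std::nullopt; // more than two distinct opcodes
//     }
//     return std::make_pair(Main, Alt);
//   }
//   } // namespace example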
10513 const InstructionsState &S,
10515 bool TryPad = false) {
10519 for (Value *V : VL) {
10535 size_t NumUniqueScalarValues = UniqueValues.size();
10538 if (NumUniqueScalarValues == VL.size() &&
10540 ReuseShuffleIndices.clear();
10545 if ((UserTreeIdx.UserTE &&
10546 UserTreeIdx.UserTE->hasNonWholeRegisterOrNonPowerOf2Vec(TTI)) ||
10548 LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
10549 "for nodes with padding.\n");
10550 ReuseShuffleIndices.clear();
10555 if (NumUniqueScalarValues <= 1 || !IsFullVectors ||
10559 if (TryPad && UniquePositions.size() > 1 && NumUniqueScalarValues > 1 &&
10560 S.getMainOp()->isSafeToRemove() &&
10561 (S.areInstructionsWithCopyableElements() ||
10565 TTI, UniqueValues.front()->getType(), UniqueValues.size());
10566 PWSz = std::min<unsigned>(PWSz, VL.size());
10567 if (PWSz == VL.size()) {
10571 ReuseShuffleIndices.clear();
10575 UniqueValues.end());
10576 PaddedUniqueValues.append(
10577 PWSz - UniqueValues.size(),
10581 if ((!S.areInstructionsWithCopyableElements() &&
10583 (S.areInstructionsWithCopyableElements() && S.isMulDivLikeOp() &&
10584 (S.getMainOp()->isIntDivRem() || S.getMainOp()->isFPDivRem() ||
10587 ReuseShuffleIndices.clear();
10590 VL = std::move(PaddedUniqueValues);
10595 ReuseShuffleIndices.clear();
10598 VL = std::move(UniqueValues);
10603 const InstructionsState &LocalState,
10604 SmallVectorImpl<Value *> &Op1,
10605 SmallVectorImpl<Value *> &Op2,
10607 constexpr unsigned SmallNodeSize = 4;
10608 if (VL.size() <= SmallNodeSize || TTI->preferAlternateOpcodeVectorization() ||
10613 LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *LocalState.getMainOp()
10615 for (TreeEntry *E : getSplitTreeEntries(LocalState.getMainOp())) {
10616 if (E->isSame(VL)) {
10618 << *LocalState.getMainOp() << ".\n");
10630 ReorderIndices.assign(VL.size(), VL.size());
10631 SmallBitVector Op1Indices(VL.size());
10636 Op1Indices.set(Idx);
10639 if ((LocalState.getAltOpcode() != LocalState.getOpcode() &&
10642 (LocalState.getAltOpcode() == LocalState.getOpcode() &&
10644 LocalState.getAltOp(), *TLI))) {
10646 Op1Indices.set(Idx);
10653 unsigned Opcode0 = LocalState.getOpcode();
10654 unsigned Opcode1 = LocalState.getAltOpcode();
10655 SmallBitVector OpcodeMask(getAltInstrMask(VL, ScalarTy, Opcode0, Opcode1));
10660 if (UOp1.size() <= 1 || UOp2.size() <= 1 ||
10661 TTI->isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask) ||
10666 unsigned Op1Cnt = 0, Op2Cnt = Op1.size();
10668 if (Op1Indices.test(Idx)) {
10669 ReorderIndices[Op1Cnt] = Idx;
10672 ReorderIndices[Op2Cnt] = Idx;
10677 ReorderIndices.clear();
10678 SmallVector<int> Mask;
10679 if (!ReorderIndices.empty())
10681 unsigned NumParts = TTI->getNumberOfParts(VecTy);
10686 if (NumParts >= VL.size())
10691 FixedVectorType *SubVecTy =
10695 if (!LocalState.isCmpOp() && NumParts <= 1 &&
10696 (Mask.empty() || InsertCost >= NewShuffleCost))
10698 if ((LocalState.getMainOp()->isBinaryOp() &&
10699 LocalState.getAltOp()->isBinaryOp() &&
10700 (LocalState.isShiftOp() || LocalState.isBitwiseLogicOp() ||
10701 LocalState.isAddSubLikeOp() || LocalState.isMulDivLikeOp())) ||
10702 (LocalState.getMainOp()->isCast() && LocalState.getAltOp()->isCast()) ||
10703 (LocalState.getMainOp()->isUnaryOp() &&
10704 LocalState.getAltOp()->isUnaryOp())) {
10706 TTI->getArithmeticInstrCost(Opcode0, VecTy, Kind) +
10707 TTI->getArithmeticInstrCost(Opcode1, VecTy, Kind);
10712 OriginalMask[Idx] = Idx + (Op1Indices.test(Idx) ? 0 : VL.size());
10716 VecTy, OriginalMask, Kind);
10718 TTI->getArithmeticInstrCost(Opcode0, Op1VecTy, Kind) +
10719 TTI->getArithmeticInstrCost(Opcode1, Op2VecTy, Kind);
10721 NewVecOpsCost + InsertCost +
10722 (!VectorizableTree.empty() && VectorizableTree.front()->hasState() &&
10723 VectorizableTree.front()->getOpcode() == Instruction::Store
10727 if (NewCost >= OriginalCost)
10737class InstructionsCompatibilityAnalysis {
10739 const DataLayout &DL;
10740 const TargetTransformInfo &TTI;
10741 const TargetLibraryInfo &TLI;
10742 unsigned MainOpcode = 0;
10747 static bool isSupportedOpcode(const unsigned Opcode) {
10748 return Opcode == Instruction::Add || Opcode == Instruction::LShr ||
10749 Opcode == Instruction::Shl || Opcode == Instruction::SDiv ||
10750 Opcode == Instruction::UDiv || Opcode == Instruction::And ||
10751 Opcode == Instruction::Or || Opcode == Instruction::Xor;
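// Illustrative sketch (not part of the pass): identity ("idempotent")
// operands that make a copyable element behave as a no-op for the opcodes
// accepted by isSupportedOpcode above: x + 0, x << 0, x >> 0, x / 1,
// x & ~0, x | 0, x ^ 0. Hypothetical simplified mapping.
//
//   #include <cstdint>
//   #include <optional>
//
//   namespace example {
//   enum class Opc { Add, Shl, LShr, SDiv, UDiv, And, Or, Xor };
//
//   inline std::optional<int64_t> identityOperand(Opc O) {
//     switch (O) {
//     case Opc::Add:
//     case Opc::Shl:
//     case Opc::LShr:
//     case Opc::Or:
//     case Opc::Xor:
//       return 0; // x op 0 == x
//     case Opc::SDiv:
//     case Opc::UDiv:
//       return 1; // x / 1 == x
//     case Opc::And:
//       return -1; // x & all-ones == x
//     }
//     return std::nullopt;
//   }
//   } // namespace example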
10761 auto IsSupportedInstruction = [&](Instruction *I, bool AnyUndef) {
10762 if (AnyUndef && (I->isIntDivRem() || I->isFPDivRem() || isa<CallInst>(I)))
10764 return I && isSupportedOpcode(I->getOpcode()) &&
10769 SmallDenseSet<Value *, 8> Operands;
10770 SmallMapVector<unsigned, SmallVector<Instruction *>, 4> Candidates;
10771 bool AnyUndef = false;
10772 for (Value *V : VL) {
10780 if (Candidates.empty()) {
10781 Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
10783 Operands.insert(I->op_begin(), I->op_end());
10786 if (Parent == I->getParent()) {
10787 Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
10788 Operands.insert(I->op_begin(), I->op_end());
10791 auto *NodeA = DT.getNode(Parent);
10792 auto *NodeB = DT.getNode(I->getParent());
10793 assert(NodeA && "Should only process reachable instructions");
10794 assert(NodeB && "Should only process reachable instructions");
10795 assert((NodeA == NodeB) ==
10796 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
10797 "Different nodes should have different DFS numbers");
10798 if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn()) {
10799 Candidates.clear();
10800 Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
10803 Operands.insert(I->op_begin(), I->op_end());
10806 unsigned BestOpcodeNum = 0;
10808 for (const auto &P : Candidates) {
10809 if (P.second.size() < BestOpcodeNum)
10811 for (Instruction *I : P.second) {
10812 if (IsSupportedInstruction(I, AnyUndef) && !Operands.contains(I)) {
10814 BestOpcodeNum = P.second.size();
10824 return I && I->getParent() == MainOp->getParent() &&
10837 Value *selectBestIdempotentValue() const {
10838 assert(isSupportedOpcode(MainOpcode) && "Unsupported opcode");
10849 if (!S.isCopyableElement(V))
10851 assert(isSupportedOpcode(MainOpcode) && "Unsupported opcode");
10852 return {V, selectBestIdempotentValue()};
10858 SmallVectorImpl<BoUpSLP::ValueList> &Operands) const {
10860 unsigned ShuffleOrOp =
10861 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
10864 switch (ShuffleOrOp) {
10865 case Instruction::PHI: {
10869 PHIHandler Handler(DT, PH, VL);
10870 Handler.buildOperands();
10871 Operands.assign(PH->getNumOperands(), {});
10873 Operands[I].assign(Handler.getOperands(I).begin(),
10874 Handler.getOperands(I).end());
10877 case Instruction::ExtractValue:
10878 case Instruction::ExtractElement:
10883 case Instruction::InsertElement:
10891 case Instruction::Load:
10895 for (auto [V, Op] : zip(VL, Operands.back())) {
10899 Op = LI->getPointerOperand();
10902 case Instruction::ZExt:
10903 case Instruction::SExt:
10904 case Instruction::FPToUI:
10905 case Instruction::FPToSI:
10906 case Instruction::FPExt:
10907 case Instruction::PtrToInt:
10908 case Instruction::IntToPtr:
10909 case Instruction::SIToFP:
10910 case Instruction::UIToFP:
10911 case Instruction::Trunc:
10912 case Instruction::FPTrunc:
10913 case Instruction::BitCast:
10914 case Instruction::ICmp:
10915 case Instruction::FCmp:
10916 case Instruction::Select:
10917 case Instruction::FNeg:
10918 case Instruction::Add:
10919 case Instruction::FAdd:
10920 case Instruction::Sub:
10921 case Instruction::FSub:
10922 case Instruction::Mul:
10923 case Instruction::FMul:
10924 case Instruction::UDiv:
10925 case Instruction::SDiv:
10926 case Instruction::FDiv:
10927 case Instruction::URem:
10928 case Instruction::SRem:
10929 case Instruction::FRem:
10930 case Instruction::Shl:
10931 case Instruction::LShr:
10932 case Instruction::AShr:
10933 case Instruction::And:
10934 case Instruction::Or:
10935 case Instruction::Xor:
10936 case Instruction::Freeze:
10937 case Instruction::Store:
10938 case Instruction::ShuffleVector:
10947 auto [Op, ConvertedOps] = convertTo(I, S);
10952 case Instruction::GetElementPtr: {
10959 const unsigned IndexIdx = 1;
10965 return !GEP || VL0Ty == GEP->getOperand(IndexIdx)->getType();
10969 ->getPointerOperandType()
10970 ->getScalarType());
10974 Operands[0][Idx] = V;
10975 Operands[1][Idx] = ConstantInt::getNullValue(Ty);
10978 Operands[0][Idx] = GEP->getPointerOperand();
10979 auto *Op = GEP->getOperand(IndexIdx);
10982 CI, Ty, CI->getValue().isSignBitSet(), DL)
10987 case Instruction::Call: {
10994 for (Value *V : VL) {
10996 Ops.push_back(I ? I->getOperand(Idx)
11009 InstructionsCompatibilityAnalysis(DominatorTree &DT, const DataLayout &DL,
11010 const TargetTransformInfo &TTI,
11011 const TargetLibraryInfo &TLI)
11016 bool TryCopyableElementsVectorization,
11017 bool WithProfitabilityCheck = false,
11018 bool SkipSameCodeCheck = false) {
11019 InstructionsState S = (SkipSameCodeCheck || !allSameBlock(VL))
11020 ? InstructionsState::invalid()
11026 findAndSetMainInstruction(VL, R);
11028 return InstructionsState::invalid();
11029 S = InstructionsState(MainOp, MainOp, true);
11030 if (!WithProfitabilityCheck)
11034 auto BuildCandidates =
11035 [](SmallVectorImpl<std::pair<Value *, Value *>> &Candidates, Value *V1,
11041 if (I1 && I2 && I1->getOpcode() == I2->getOpcode() &&
11042 I1->getParent() != I2->getParent())
11046 if (VL.size() == 2) {
11049 BuildCandidates(Candidates1, Operands[0][0], Operands[0][1]);
11050 BuildCandidates(Candidates2, Operands[1][0], Operands[1][1]);
11051 bool Res = !Candidates1.empty() && !Candidates2.empty() &&
11052 R.findBestRootPair(Candidates1) &&
11053 R.findBestRootPair(Candidates2);
11055 Candidates1.clear();
11056 Candidates2.clear();
11057 BuildCandidates(Candidates1, Operands[0][0], Operands[1][1]);
11058 BuildCandidates(Candidates2, Operands[1][0], Operands[0][1]);
11059 Res = !Candidates1.empty() && !Candidates2.empty() &&
11060 R.findBestRootPair(Candidates1) &&
11061 R.findBestRootPair(Candidates2);
11064 return InstructionsState::invalid();
11068 FixedVectorType *VecTy =
11070 switch (MainOpcode) {
11071 case Instruction::Add:
11072 case Instruction::LShr:
11073 case Instruction::Shl:
11074 case Instruction::SDiv:
11075 case Instruction::UDiv:
11076 case Instruction::And:
11077 case Instruction::Or:
11078 case Instruction::Xor:
11084 if (VectorCost > ScalarCost)
11085 return InstructionsState::invalid();
11088 assert(Operands.size() == 2 && "Unexpected number of operands!");
11089 unsigned CopyableNum =
11090 count_if(VL, [&](Value *V) { return S.isCopyableElement(V); });
11091 if (CopyableNum < VL.size() / 2)
11094 const unsigned Limit = VL.size() / 24;
11095 if ((CopyableNum >= VL.size() - Limit ||
11096 (CopyableNum >= VL.size() - 1 && VL.size() > 4) ||
11101 return InstructionsState::invalid();
11105 for (auto &Ops : Operands) {
11120 return InstructionsState::invalid();
11126 constexpr unsigned Limit = 4;
11127 if (Operands.front().size() >= Limit) {
11128 SmallDenseMap<const Value *, unsigned> Counters;
11136 return C.second == 1;
11142 InstructionsCompatibilityAnalysis Analysis(DT, DL, TTI, TLI);
11143 InstructionsState OpS = Analysis.buildInstructionsState(
11145 if (!OpS || (OpS.getOpcode() == Instruction::PHI && !allSameBlock(Ops)))
11147 unsigned CopyableNum =
11149 return CopyableNum <= VL.size() / 2;
11151 if (!CheckOperand(Operands.front()))
11152 return InstructionsState::invalid();
11159 assert(S && "Invalid state!");
11161 if (S.areInstructionsWithCopyableElements()) {
11162 MainOp = S.getMainOp();
11163 MainOpcode = S.getOpcode();
11168 for (auto [OperandIdx, Operand] : enumerate(OperandsForValue))
11169 Operands[OperandIdx][Idx] = Operand;
11172 buildOriginalOperands(S, VL, Operands);
11179BoUpSLP::ScalarsVectorizationLegality BoUpSLP::getScalarsVectorizationLegality(
11181 bool TryCopyableElementsVectorization) const {
11184 InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
11185 InstructionsState S = Analysis.buildInstructionsState(
11186 VL, *this, TryCopyableElementsVectorization,
11187 true, TryCopyableElementsVectorization);
11195 return ScalarsVectorizationLegality(S, false,
11201 LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.getMainOp() << ".\n");
11202 for (TreeEntry *E : getTreeEntries(S.getMainOp())) {
11203 if (E->isSame(VL)) {
11204 LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.getMainOp()
11206 return ScalarsVectorizationLegality(S, false);
11211 (S.getOpcode() == Instruction::PHI && isa<PHINode>(V) &&
11212 LI->getLoopFor(S.getMainOp()->getParent()) &&
11216 return ScalarsVectorizationLegality(S, false);
11225 !(S && !S.isAltShuffle() && VL.size() >= 4 &&
11232 LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
11233 return ScalarsVectorizationLegality(S, false);
11237 if (S && S.getOpcode() == Instruction::ExtractElement &&
11240 LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n");
11241 return ScalarsVectorizationLegality(S, false);
11248 return ScalarsVectorizationLegality(S, false,
11258 if (!S || !S.isAltShuffle() || VL.size() > 2)
11266 SmallVector<unsigned, 8> InstsCount;
11267 for (Value *V : VL) {
11270 return isa<Instruction>(Op) || isVectorLikeInstWithConstOps(Op);
11273 bool IsCommutative =
11275 if ((IsCommutative &&
11276 std::accumulate(InstsCount.begin(), InstsCount.end(), 0) < 2) ||
11278 all_of(InstsCount, [](unsigned ICnt) { return ICnt < 2; })))
11280 assert(VL.size() == 2 && "Expected only 2 alternate op instructions.");
11284 for (int Op : seq<int>(S.getMainOp()->getNumOperands()))
11286 I2->getOperand(Op));
11287 if (static_cast<unsigned>(count_if(
11288 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
11290 })) >= S.getMainOp()->getNumOperands() / 2)
11292 if (S.getMainOp()->getNumOperands() > 2)
11294 if (IsCommutative) {
11296 Candidates.clear();
11297 for (int Op = 0, E = S.getMainOp()->getNumOperands(); Op < E; ++Op)
11299 I2->getOperand((Op + 1) % E));
11301 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
11308 SmallVector<unsigned> SortedIndices;
11310 bool IsScatterVectorizeUserTE =
11311 UserTreeIdx.UserTE &&
11312 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
11313 bool AreAllSameBlock = S.valid();
11314 bool AreScatterAllGEPSameBlock =
11327 sortPtrAccesses(VL, UserTreeIdx.UserTE->getMainOp()->getType(), *DL, *SE,
11329 bool AreAllSameInsts = AreAllSameBlock || AreScatterAllGEPSameBlock;
11335 NotProfitableForVectorization(VL)) {
11337 LLVM_DEBUG(dbgs() << "SLP: Try split and if failed, gathering due to "
11338 "C,S,B,O, small shuffle. \n";
11342 return ScalarsVectorizationLegality(S, false,
11346 LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n";
11350 return ScalarsVectorizationLegality(S, false);
11354 if (S && !EphValues.empty()) {
11355 for (Value *V : VL) {
11356 if (EphValues.count(V)) {
11358 << ") is ephemeral.\n");
11360 return ScalarsVectorizationLegality(S, false,
11372 if (S && S.isAltShuffle()) {
11373 auto GetNumVectorizedExtracted = [&]() {
11379 all_of(I->operands(), [&](const Use &U) {
11380 return isa<ExtractElementInst>(U.get());
11385 else if (!I->hasOneUser() && !areAllUsersVectorized(I, UserIgnoreList))
11388 return std::make_pair(Vectorized, Extracted);
11390 auto [Vectorized, Extracted] = GetNumVectorizedExtracted();
11392 bool PreferScalarize = !Vectorized.isAllOnes() && VL.size() == 2;
11393 if (!Vectorized.isAllOnes() && !PreferScalarize) {
11396 Type *ScalarTy = VL.front()->getType();
11401 false, true, Kind);
11403 *TTI, ScalarTy, VecTy, Vectorized,
11404 true, false, Kind, false);
11405 PreferScalarize = VectorizeCostEstimate > ScalarizeCostEstimate;
11407 if (PreferScalarize) {
11408 LLVM_DEBUG(dbgs() << "SLP: The instructions are in tree and alternate "
11409 "node is not profitable.\n");
11410 return ScalarsVectorizationLegality(S, false);
11415 if (UserIgnoreList && !UserIgnoreList->empty()) {
11416 for (Value *V : VL) {
11417 if (UserIgnoreList->contains(V)) {
11418 LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
11419 return ScalarsVectorizationLegality(S, false);
11426 if (!AreAllSameBlock && AreScatterAllGEPSameBlock) {
11427 assert(VL.front()->getType()->isPointerTy() &&
11429 "Expected pointers only.");
11432 assert(It != VL.end() && "Expected at least one GEP.");
11443 !DT->isReachableFromEntry(BB))) {
11449 return ScalarsVectorizationLegality(S, false);
11451 return ScalarsVectorizationLegality(S, true);
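// Illustrative sketch (not part of the pass): how a caller typically
// consumes the legality result computed above — a strict attempt first,
// then a retry allowing copyable elements, then a fall back to a gather
// node. Hypothetical simplified interface.
//
//   namespace example {
//   enum class Action { Vectorize, Split, Gather };
//
//   struct Legality {
//     bool IsLegal;
//     bool TrySplit;
//   };
//
//   inline Action decide(Legality Strict, Legality WithCopyables) {
//     if (Strict.IsLegal)
//       return Action::Vectorize;
//     if (Strict.TrySplit)
//       return Action::Split; // attempt a SplitVectorize node
//     if (WithCopyables.IsLegal)
//       return Action::Vectorize;
//     return Action::Gather;
//   }
//   } // namespace example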
11456 unsigned InterleaveFactor) {
11459 SmallVector<int> ReuseShuffleIndices;
11463 auto TrySplitNode = [&](const InstructionsState &LocalState) {
11466 if (!canBuildSplitNode(VL, LocalState, Op1, Op2, ReorderIndices))
11469 auto Invalid = ScheduleBundle::invalid();
11470 auto *TE = newTreeEntry(VL, TreeEntry::SplitVectorize, Invalid, LocalState,
11471 UserTreeIdx, {}, ReorderIndices);
11476 getSameValuesTreeEntry(S.getMainOp(), Op, true))) {
11478 TE->CombinedEntriesWithIndices.emplace_back(VectorizableTree.size(),
11479 Idx == 0 ? 0 : Op1.size());
11480 (void)newTreeEntry(Op, TreeEntry::NeedToGather, Invalid, S, {TE, Idx});
11482 TE->CombinedEntriesWithIndices.emplace_back(VectorizableTree.size(),
11483 Idx == 0 ? 0 : Op1.size());
11493 bool AreConsts = false;
11494 for (Value *V : VL) {
11506 if (AreOnlyConstsWithPHIs(VL)) {
11507 LLVM_DEBUG(dbgs() << "SLP: Gathering due to all constants and PHIs.\n");
11508 newGatherTreeEntry(VL, InstructionsState::invalid(), UserTreeIdx);
11512 ScalarsVectorizationLegality Legality = getScalarsVectorizationLegality(
11513 VL, Depth, UserTreeIdx, false);
11514 InstructionsState S = Legality.getInstructionsState();
11515 if (!Legality.isLegal()) {
11516 if (Legality.trySplitVectorize()) {
11519 if (MainOp && AltOp && TrySplitNode(InstructionsState(MainOp, AltOp)))
11523 Legality = getScalarsVectorizationLegality(
11524 VL, Depth, UserTreeIdx, true);
11525 if (!Legality.isLegal()) {
11526 if (Legality.tryToFindDuplicates())
11530 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
11533 S = Legality.getInstructionsState();
11537 if (S.isAltShuffle() && TrySplitNode(S))
11543 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
11548 bool IsScatterVectorizeUserTE =
11549 UserTreeIdx.UserTE &&
11550 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
11553 StridedPtrInfo SPtrInfo;
11554 TreeEntry::EntryState State = getScalarsVectorizationState(
11555 S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps, SPtrInfo);
11556 if (State == TreeEntry::NeedToGather) {
11557 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
11563 auto &BSRef = BlocksSchedules[BB];
11565 BSRef = std::make_unique<BlockScheduling>(BB);
11567 BlockScheduling &BS = *BSRef;
11570 std::optional<ScheduleBundle *> BundlePtr =
11571 BS.tryScheduleBundle(UniqueValues.getArrayRef(), this, S, UserTreeIdx);
11572#ifdef EXPENSIVE_CHECKS
11576 if (!BundlePtr || (*BundlePtr && !*BundlePtr.value())) {
11577 LLVM_DEBUG(
dbgs() <<
"SLP: We are not able to schedule this bundle!\n");
11579 if (S.isAltShuffle() && ReuseShuffleIndices.
empty() && TrySplitNode(S))
11581 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
11582 NonScheduledFirst.insert(VL.front());
11583 if (S.getOpcode() == Instruction::Load &&
11584 BS.ScheduleRegionSize < BS.ScheduleRegionSizeLimit)
11588 InstructionsCompatibilityAnalysis
Analysis(*DT, *DL, *TTI, *TLI);
11590 ScheduleBundle
Empty;
11591 ScheduleBundle &Bundle = BundlePtr.value() ? *BundlePtr.value() :
Empty;
11592 LLVM_DEBUG(
dbgs() <<
"SLP: We are able to schedule this bundle.\n");
11594 unsigned ShuffleOrOp =
11595 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.
getOpcode();
11596 auto CreateOperandNodes = [&](TreeEntry *
TE,
const auto &Operands) {
11598 SmallVector<unsigned> PHIOps;
11604 if ((!S || S.getOpcode() != Instruction::PHI) || S.isAltShuffle())
11609 for (
unsigned I : PHIOps)
11610 buildTreeRec(Operands[
I],
Depth + 1, {
TE,
I});
11612 switch (ShuffleOrOp) {
11613 case Instruction::PHI: {
11615 newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
11619 TE->setOperands(Operands);
11620 CreateOperandNodes(TE, Operands);
11623 case Instruction::ExtractValue:
11624 case Instruction::ExtractElement: {
11625 if (CurrentOrder.empty()) {
11626 LLVM_DEBUG(
dbgs() <<
"SLP: Reusing or shuffling extract sequence.\n");
11629 dbgs() <<
"SLP: Reusing or shuffling of reordered extract sequence "
11631 for (
unsigned Idx : CurrentOrder)
11632 dbgs() <<
" " << Idx;
11639 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
11640 ReuseShuffleIndices, CurrentOrder);
11642 "(ExtractValueInst/ExtractElementInst).\n";
11646 TE->setOperands(Operands);
11649 case Instruction::InsertElement: {
11650 assert(ReuseShuffleIndices.
empty() &&
"All inserts should be unique");
11652 auto OrdCompare = [](
const std::pair<int, int> &
P1,
11653 const std::pair<int, int> &P2) {
11654 return P1.first > P2.first;
11657 decltype(OrdCompare)>
11658 Indices(OrdCompare);
11659 for (
int I = 0,
E = VL.size();
I <
E; ++
I) {
11661 Indices.emplace(Idx,
I);
11663 OrdersType CurrentOrder(VL.size(), VL.size());
11664 bool IsIdentity =
true;
11665 for (
int I = 0,
E = VL.size();
I <
E; ++
I) {
11666 CurrentOrder[Indices.top().second] =
I;
11667 IsIdentity &= Indices.top().second ==
I;
11671 CurrentOrder.clear();
11672 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
11674 LLVM_DEBUG(
dbgs() <<
"SLP: added a new TreeEntry (InsertElementInst).\n";
11677 TE->setOperands(Operands);
11678 buildTreeRec(
TE->getOperand(1),
Depth + 1, {TE, 1});
11681 case Instruction::Load: {
11688 TreeEntry *
TE =
nullptr;
11691 case TreeEntry::Vectorize:
11692 TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
11693 ReuseShuffleIndices, CurrentOrder, InterleaveFactor);
11694 if (CurrentOrder.empty())
11695 LLVM_DEBUG(
dbgs() <<
"SLP: added a new TreeEntry (LoadInst).\n";
11699 <<
"SLP: added a new TreeEntry (jumbled LoadInst).\n";
11702 case TreeEntry::CompressVectorize:
11704 TE = newTreeEntry(VL, TreeEntry::CompressVectorize, Bundle, S,
11705 UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
11708 <<
"SLP: added a new TreeEntry (masked LoadInst + compress).\n";
11711 case TreeEntry::StridedVectorize:
11713 TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
11714 UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
11715 TreeEntryToStridedPtrInfoMap[
TE] = SPtrInfo;
11716 LLVM_DEBUG(
dbgs() <<
"SLP: added a new TreeEntry (strided LoadInst).\n";
11719 case TreeEntry::ScatterVectorize:
11721 TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
11722 UserTreeIdx, ReuseShuffleIndices);
11725 <<
"SLP: added a new TreeEntry (non-consecutive LoadInst).\n";
11728 case TreeEntry::CombinedVectorize:
11729 case TreeEntry::SplitVectorize:
11730 case TreeEntry::NeedToGather:
11733 if (!CurrentOrder.empty() && State != TreeEntry::ScatterVectorize) {
11734 assert(Operands.
size() == 1 &&
"Expected a single operand only");
11735 SmallVector<int>
Mask;
11739 TE->setOperands(Operands);
11740 if (State == TreeEntry::ScatterVectorize)
11741 buildTreeRec(PointerOps,
Depth + 1, {
TE, 0});
11744 case Instruction::ZExt:
11745 case Instruction::SExt:
11746 case Instruction::FPToUI:
11747 case Instruction::FPToSI:
11748 case Instruction::FPExt:
11749 case Instruction::PtrToInt:
11750 case Instruction::IntToPtr:
11751 case Instruction::SIToFP:
11752 case Instruction::UIToFP:
11753 case Instruction::Trunc:
11754 case Instruction::FPTrunc:
11755 case Instruction::BitCast: {
11756 auto [PrevMaxBW, PrevMinBW] = CastMaxMinBWSizes.value_or(
11757 std::make_pair(std::numeric_limits<unsigned>::min(),
11758 std::numeric_limits<unsigned>::max()));
11759 if (ShuffleOrOp == Instruction::ZExt ||
11760 ShuffleOrOp == Instruction::SExt) {
11761 CastMaxMinBWSizes = std::make_pair(
11762 std::max<unsigned>(DL->getTypeSizeInBits(VL0->
getType()),
11764 std::min<unsigned>(
11767 }
else if (ShuffleOrOp == Instruction::Trunc) {
11768 CastMaxMinBWSizes = std::make_pair(
11769 std::max<unsigned>(
11772 std::min<unsigned>(DL->getTypeSizeInBits(VL0->
getType()),
11775 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
11776 ReuseShuffleIndices);
11777 LLVM_DEBUG(
dbgs() <<
"SLP: added a new TreeEntry (CastInst).\n";
11780 TE->setOperands(Operands);
11782 buildTreeRec(
TE->getOperand(
I),
Depth + 1, {TE, I});
11783 if (ShuffleOrOp == Instruction::Trunc) {
11784 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
11785 }
else if (ShuffleOrOp == Instruction::SIToFP ||
11786 ShuffleOrOp == Instruction::UIToFP) {
11787 unsigned NumSignBits =
11790 APInt
Mask = DB->getDemandedBits(OpI);
11791 NumSignBits = std::max(NumSignBits,
Mask.countl_zero());
11793 if (NumSignBits * 2 >=
11795 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
11799 case Instruction::ICmp:
11800 case Instruction::FCmp: {
11803 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
11804 ReuseShuffleIndices);
11813 "Commutative Predicate mismatch");
11816 Operands.
back() =
Ops.getVL(1);
11823 if (
Cmp->getPredicate() != P0)
11827 TE->setOperands(Operands);
11828 buildTreeRec(Operands.
front(),
Depth + 1, {TE, 0});
11829 buildTreeRec(Operands.
back(),
Depth + 1, {TE, 1});
11830 if (ShuffleOrOp == Instruction::ICmp) {
11831 unsigned NumSignBits0 =
11833 if (NumSignBits0 * 2 >=
11835 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
11836 unsigned NumSignBits1 =
11838 if (NumSignBits1 * 2 >=
11840 ExtraBitWidthNodes.insert(getOperandEntry(TE, 1)->Idx);
11844 case Instruction::Select:
11845 case Instruction::FNeg:
11846 case Instruction::Add:
11847 case Instruction::FAdd:
11848 case Instruction::Sub:
11849 case Instruction::FSub:
11850 case Instruction::Mul:
11851 case Instruction::FMul:
11852 case Instruction::UDiv:
11853 case Instruction::SDiv:
11854 case Instruction::FDiv:
11855 case Instruction::URem:
11856 case Instruction::SRem:
11857 case Instruction::FRem:
11858 case Instruction::Shl:
11859 case Instruction::LShr:
11860 case Instruction::AShr:
11861 case Instruction::And:
11862 case Instruction::Or:
11863 case Instruction::Xor:
11864 case Instruction::Freeze: {
11865 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
11866 ReuseShuffleIndices);
11868 dbgs() <<
"SLP: added a new TreeEntry "
11869 "(SelectInst/UnaryOperator/BinaryOperator/FreezeInst).\n";
11875 Operands[0] =
Ops.getVL(0);
11876 Operands[1] =
Ops.getVL(1);
11878 TE->setOperands(Operands);
11880 buildTreeRec(
TE->getOperand(
I),
Depth + 1, {TE, I});
11883 case Instruction::GetElementPtr: {
11884 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
11885 ReuseShuffleIndices);
11886 LLVM_DEBUG(
dbgs() <<
"SLP: added a new TreeEntry (GetElementPtrInst).\n";
11888 TE->setOperands(Operands);
11891 buildTreeRec(Operands[
I],
Depth + 1, {
TE,
I});
11894 case Instruction::Store: {
11895 bool Consecutive = CurrentOrder.empty();
11898 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
11899 ReuseShuffleIndices, CurrentOrder);
11901 LLVM_DEBUG(
dbgs() <<
"SLP: added a new TreeEntry (StoreInst).\n";
11905 dbgs() <<
"SLP: added a new TreeEntry (jumbled StoreInst).\n";
11907 TE->setOperands(Operands);
11908 buildTreeRec(
TE->getOperand(0),
Depth + 1, {TE, 0});
11911 case Instruction::Call: {
11917 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
11918 ReuseShuffleIndices);
11919 LLVM_DEBUG(
dbgs() <<
"SLP: added a new TreeEntry (CallInst).\n";
11924 Operands[0] =
Ops.getVL(0);
11925 Operands[1] =
Ops.getVL(1);
11927 TE->setOperands(Operands);
11933 buildTreeRec(
TE->getOperand(
I),
Depth + 1, {TE, I});
11937 case Instruction::ShuffleVector: {
11938 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
11939 ReuseShuffleIndices);
11940 if (S.isAltShuffle()) {
11941 LLVM_DEBUG(
dbgs() <<
"SLP: added a new TreeEntry (isAltShuffle).\n";
11946 dbgs() <<
"SLP: added a new TreeEntry (ShuffleVectorInst).\n";
11960 "Expected different main/alternate predicates.");
11976 TE->setOperands(Operands);
11977 buildTreeRec(Operands.
front(),
Depth + 1, {TE, 0});
11978 buildTreeRec(Operands.
back(),
Depth + 1, {TE, 1});
11985 Operands[0] =
Ops.getVL(0);
11986 Operands[1] =
Ops.getVL(1);
11988 TE->setOperands(Operands);
11990 buildTreeRec(
TE->getOperand(
I),
Depth + 1, {TE, I});
12008 for (
const auto *Ty : ST->elements())
12009 if (Ty != *ST->element_begin())
12011 N *= ST->getNumElements();
12012 EltTy = *ST->element_begin();
12014 N *= AT->getNumElements();
12015 EltTy = AT->getElementType();
12018 N *= VT->getNumElements();
12019 EltTy = VT->getElementType();
12025 size_t VTSize = DL->getTypeStoreSizeInBits(
getWidenedType(EltTy,
N));
12026 if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
12027 VTSize != DL->getTypeStoreSizeInBits(
T))
12034 bool ResizeAllowed)
const {
12036 assert(It != VL.
end() &&
"Expected at least one extract instruction.");
12043 Value *Vec = E0->getOperand(0);
12045 CurrentOrder.
clear();
12049 if (E0->getOpcode() == Instruction::ExtractValue) {
12061 unsigned E = VL.
size();
12062 if (!ResizeAllowed && NElts !=
E)
12065 unsigned MinIdx = NElts, MaxIdx = 0;
12070 if (Inst->getOperand(0) != Vec)
12078 const unsigned ExtIdx = *Idx;
12079 if (ExtIdx >= NElts)
12081 Indices[
I] = ExtIdx;
12082 if (MinIdx > ExtIdx)
12084 if (MaxIdx < ExtIdx)
12087 if (MaxIdx - MinIdx + 1 >
E)
12089 if (MaxIdx + 1 <=
E)
12093 bool ShouldKeepOrder =
true;
12100 for (
unsigned I = 0;
I <
E; ++
I) {
12103 const unsigned ExtIdx = Indices[
I] - MinIdx;
12104 if (CurrentOrder[ExtIdx] !=
E) {
12105 CurrentOrder.
clear();
12108 ShouldKeepOrder &= ExtIdx ==
I;
12109 CurrentOrder[ExtIdx] =
I;
12111 if (ShouldKeepOrder)
12112 CurrentOrder.
clear();
12114 return ShouldKeepOrder;
12117bool BoUpSLP::areAllUsersVectorized(
12118 Instruction *
I,
const SmallDenseSet<Value *> *VectorizedVals)
const {
12119 return (
I->hasOneUse() && (!VectorizedVals || VectorizedVals->
contains(
I))) ||
12120 all_of(
I->users(), [
this](User *U) {
12121 return isVectorized(U) || isVectorLikeInstWithConstOps(U) ||
12122 (isa<ExtractElementInst>(U) && MustGather.contains(U));
12126void BoUpSLP::TreeEntry::buildAltOpShuffleMask(
12127 const function_ref<
bool(Instruction *)> IsAltOp, SmallVectorImpl<int> &Mask,
12128 SmallVectorImpl<Value *> *OpScalars,
12129 SmallVectorImpl<Value *> *AltScalars)
const {
12130 unsigned Sz = Scalars.size();
12132 SmallVector<int> OrderMask;
12133 if (!ReorderIndices.empty())
12135 for (
unsigned I = 0;
I < Sz; ++
I) {
12137 if (!ReorderIndices.empty())
12138 Idx = OrderMask[
I];
12142 if (IsAltOp(OpInst)) {
12143 Mask[
I] = Sz + Idx;
12152 if (!ReuseShuffleIndices.
empty()) {
12154 transform(ReuseShuffleIndices, NewMask.
begin(), [&Mask](
int Idx) {
12155 return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem;
12157 Mask.swap(NewMask);
12164 return InstructionsState(MainOp, AltOp).getMatchingMainOpOrAltOp(
I) == MainOp;
12174 assert(MainP != AltP &&
"Expected different main/alternate predicates.");
12183 assert((MainP ==
P || AltP ==
P || MainP == SwappedP || AltP == SwappedP) &&
12184 "CmpInst expected to match either main or alternate predicate or "
12186 return MainP !=
P && MainP != SwappedP;
12188 return InstructionsState(MainOp, AltOp).getMatchingMainOpOrAltOp(
I) == AltOp;
12193 const auto *Op0 =
Ops.front();
12206 return CI->getValue().isPowerOf2();
12212 return CI->getValue().isNegatedPowerOf2();
12217 if (IsConstant && IsUniform)
12219 else if (IsConstant)
12221 else if (IsUniform)
12233class BaseShuffleAnalysis {
12235 Type *ScalarTy =
nullptr;
12237 BaseShuffleAnalysis(
Type *ScalarTy) : ScalarTy(ScalarTy) {}
12245 unsigned getVF(
Value *V)
const {
12246 assert(V &&
"V cannot be nullptr");
12248 "V does not have FixedVectorType");
12249 assert(ScalarTy &&
"ScalarTy cannot be nullptr");
12251 unsigned VNumElements =
12253 assert(VNumElements > ScalarTyNumElements &&
12254 "the number of elements of V is not large enough");
12255 assert(VNumElements % ScalarTyNumElements == 0 &&
12256 "the number of elements of V is not a vectorized value");
12257 return VNumElements / ScalarTyNumElements;
12263 static bool isIdentityMask(ArrayRef<int> Mask,
const FixedVectorType *VecTy,
12265 int Limit =
Mask.size();
12277 if (Limit % VF == 0 &&
all_of(
seq<int>(0, Limit / VF), [=](
int Idx) {
12278 ArrayRef<int> Slice =
Mask.slice(Idx * VF, VF);
12291 static void combineMasks(
unsigned LocalVF, SmallVectorImpl<int> &Mask,
12292 ArrayRef<int> ExtMask) {
12293 unsigned VF =
Mask.size();
12295 for (
int I = 0, Sz = ExtMask.
size();
I < Sz; ++
I) {
12298 int MaskedIdx =
Mask[ExtMask[
I] % VF];
12302 Mask.swap(NewMask);
12338 static bool peekThroughShuffles(
Value *&V, SmallVectorImpl<int> &Mask,
12339 bool SinglePermute) {
12341 ShuffleVectorInst *IdentityOp =
nullptr;
12342 SmallVector<int> IdentityMask;
12351 if (isIdentityMask(Mask, SVTy,
false)) {
12352 if (!IdentityOp || !SinglePermute ||
12353 (isIdentityMask(Mask, SVTy,
true) &&
12355 IdentityMask.
size()))) {
12360 IdentityMask.
assign(Mask);
12380 if (SV->isZeroEltSplat()) {
12382 IdentityMask.
assign(Mask);
12384 int LocalVF =
Mask.size();
12387 LocalVF = SVOpTy->getNumElements();
12391 static_cast<unsigned>(
I) >= SV->getShuffleMask().size())
12393 ExtMask[Idx] = SV->getMaskValue(
I);
12403 if (!IsOp1Undef && !IsOp2Undef) {
12405 for (
int &
I : Mask) {
12408 if (SV->getMaskValue(
I % SV->getShuffleMask().size()) ==
12414 SmallVector<int> ShuffleMask(SV->getShuffleMask());
12415 combineMasks(LocalVF, ShuffleMask, Mask);
12416 Mask.swap(ShuffleMask);
12418 Op = SV->getOperand(0);
12420 Op = SV->getOperand(1);
12423 !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute) ||
12428 "Expected masks of same sizes.");
12433 Mask.swap(IdentityMask);
12435 return SinglePermute &&
12438 (Shuffle &&
Mask.size() == Shuffle->getShuffleMask().size() &&
12439 Shuffle->isZeroEltSplat() &&
12443 Shuffle->getShuffleMask()[
P.index()] == 0;
12456 template <
typename T,
typename ShuffleBuilderTy>
12457 static T createShuffle(
Value *V1,
Value *V2, ArrayRef<int> Mask,
12458 ShuffleBuilderTy &Builder,
Type *ScalarTy) {
12459 assert(V1 &&
"Expected at least one vector value.");
12461 SmallVector<int> NewMask(Mask);
12462 if (ScalarTyNumElements != 1) {
12468 Builder.resizeToMatch(V1, V2);
12469 int VF =
Mask.size();
12471 VF = FTy->getNumElements();
12482 for (
int I = 0,
E =
Mask.size();
I <
E; ++
I) {
12484 CombinedMask1[
I] =
Mask[
I];
12486 CombinedMask2[
I] =
Mask[
I] - VF;
12493 (void)peekThroughShuffles(Op1, CombinedMask1,
false);
12494 (void)peekThroughShuffles(Op2, CombinedMask2,
false);
12500 for (
auto [Idx,
I] :
enumerate(CombinedMask1)) {
12503 ExtMask1[Idx] = SV1->getMaskValue(
I);
12507 ->getNumElements(),
12508 ExtMask1, UseMask::SecondArg);
12509 SmallVector<int> ExtMask2(CombinedMask2.size(),
PoisonMaskElem);
12510 for (
auto [Idx,
I] :
enumerate(CombinedMask2)) {
12513 ExtMask2[Idx] = SV2->getMaskValue(
I);
12517 ->getNumElements(),
12518 ExtMask2, UseMask::SecondArg);
12519 if (SV1->getOperand(0)->getType() ==
12520 SV2->getOperand(0)->getType() &&
12521 SV1->getOperand(0)->getType() != SV1->getType() &&
12524 Op1 = SV1->getOperand(0);
12525 Op2 = SV2->getOperand(0);
12526 SmallVector<int> ShuffleMask1(SV1->getShuffleMask());
12527 int LocalVF = ShuffleMask1.size();
12529 LocalVF = FTy->getNumElements();
12530 combineMasks(LocalVF, ShuffleMask1, CombinedMask1);
12531 CombinedMask1.swap(ShuffleMask1);
12532 SmallVector<int> ShuffleMask2(SV2->getShuffleMask());
12533 LocalVF = ShuffleMask2.size();
12535 LocalVF = FTy->getNumElements();
12536 combineMasks(LocalVF, ShuffleMask2, CombinedMask2);
12537 CombinedMask2.swap(ShuffleMask2);
12540 }
while (PrevOp1 != Op1 || PrevOp2 != Op2);
12541 Builder.resizeToMatch(Op1, Op2);
12543 ->getElementCount()
12544 .getKnownMinValue(),
12546 ->getElementCount()
12547 .getKnownMinValue());
12548 for (
int I = 0,
E =
Mask.size();
I <
E; ++
I) {
12551 "Expected undefined mask element");
12552 CombinedMask1[
I] = CombinedMask2[
I] + (Op1 == Op2 ? 0 : VF);
12561 return Builder.createIdentity(Op1);
12562 return Builder.createShuffleVector(
12567 return Builder.createPoison(
12569 bool IsIdentity = peekThroughShuffles(V1, NewMask,
true);
12570 assert(V1 &&
"Expected non-null value after looking through shuffles.");
12573 return Builder.createShuffleVector(V1, NewMask);
12574 return Builder.createIdentity(V1);
12580 ArrayRef<int> Mask) {
12589static std::pair<InstructionCost, InstructionCost>
12600 if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
12609 ScalarCost =
TTI.getPointersChainCost(
12610 Ptrs, BasePtr, TTI::PointersChainInfo::getUnitStride(), ScalarTy,
12614 for (
Value *V : Ptrs) {
12615 if (V == BasePtr) {
12624 if (!
Ptr || !
Ptr->hasOneUse())
12628 if (PtrsRetainedInVecCode.
size() == Ptrs.size()) {
12633 VecCost =
TTI.getPointersChainCost(PtrsRetainedInVecCode, BasePtr,
12634 TTI::PointersChainInfo::getKnownStride(),
12644 [](
const Value *V) {
12646 return Ptr && !
Ptr->hasAllConstantIndices();
12648 ? TTI::PointersChainInfo::getUnknownStride()
12649 : TTI::PointersChainInfo::getKnownStride();
12652 TTI.getPointersChainCost(Ptrs, BasePtr, PtrsInfo, ScalarTy,
CostKind);
12656 if (It != Ptrs.
end())
12661 VecCost =
TTI.getGEPCost(BaseGEP->getSourceElementType(),
12662 BaseGEP->getPointerOperand(), Indices, VecTy,
12667 return std::make_pair(ScalarCost, VecCost);
12670void BoUpSLP::reorderGatherNode(TreeEntry &TE) {
12671 assert(
TE.isGather() &&
TE.ReorderIndices.empty() &&
12672 "Expected gather node without reordering.");
12674 SmallSet<size_t, 2> LoadKeyUsed;
12678 if (
TE.Scalars.size() == 2 || (
TE.hasState() && !
TE.isAltShuffle()) ||
12683 return VectorizableTree[Idx]->isSame(TE.Scalars);
12687 auto GenerateLoadsSubkey = [&](
size_t Key, LoadInst *LI) {
12692 auto LIt = LoadsMap.
find(std::make_pair(
Key,
Ptr));
12693 if (LIt != LoadsMap.
end()) {
12694 for (LoadInst *RLI : LIt->second) {
12696 LI->
getType(), LI->getPointerOperand(), *DL, *SE,
12700 for (LoadInst *RLI : LIt->second) {
12702 LI->getPointerOperand(), *TLI)) {
12707 if (LIt->second.size() > 2) {
12709 hash_value(LIt->second.back()->getPointerOperand());
12718 MapVector<size_t, MapVector<size_t, SmallVector<Value *>>> SortedValues;
12719 SmallDenseMap<Value *, SmallVector<unsigned>, 8> KeyToIndex;
12720 bool IsOrdered =
true;
12721 unsigned NumInstructions = 0;
12725 size_t Key = 1, Idx = 1;
12733 auto &Container = SortedValues[
Key];
12734 if (IsOrdered && !KeyToIndex.
contains(V) &&
12737 ((Container.contains(Idx) &&
12738 KeyToIndex.
at(Container[Idx].back()).back() !=
I - 1) ||
12739 (!Container.empty() && !Container.contains(Idx) &&
12740 KeyToIndex.
at(Container.back().second.back()).back() !=
I - 1)))
12742 auto &KTI = KeyToIndex[
V];
12744 Container[Idx].push_back(V);
12749 if (!IsOrdered && NumInstructions > 1) {
12751 TE.ReorderIndices.resize(
TE.Scalars.size(),
TE.Scalars.size());
12752 for (
const auto &
D : SortedValues) {
12753 for (
const auto &
P :
D.second) {
12755 for (
Value *V :
P.second) {
12756 ArrayRef<unsigned> Indices = KeyToIndex.
at(V);
12757 for (
auto [K, Idx] :
enumerate(Indices)) {
12758 TE.ReorderIndices[Cnt +
K] = Idx;
12759 TE.Scalars[Cnt +
K] =
V;
12761 Sz += Indices.
size();
12762 Cnt += Indices.
size();
12766 *TTI,
TE.Scalars.front()->getType(), Sz);
12770 }
else if (!
P.second.empty() &&
isConstant(
P.second.front())) {
12778 if (!
TE.ReuseShuffleIndices.empty() ||
TE.ReorderIndices.empty())
12783 auto *ScalarTy =
TE.Scalars.front()->getType();
12785 for (
auto [Idx, Sz] : SubVectors) {
12792 int Sz =
TE.Scalars.size();
12793 SmallVector<int> ReorderMask(
TE.ReorderIndices.begin(),
12794 TE.ReorderIndices.end());
12800 ReorderMask[
I] =
I +
TE.ReorderIndices.size();
12804 any_of(ReorderMask, [&](
int I) {
return I >= Sz; })
12807 VecTy, ReorderMask);
12813 DemandedElts.clearBit(
I);
12815 ReorderMask[
I] =
I;
12817 ReorderMask[
I] =
I + Sz;
12823 if (!DemandedElts.isAllOnes())
12825 if (
Cost >= BVCost) {
12826 SmallVector<int>
Mask(
TE.ReorderIndices.begin(),
TE.ReorderIndices.end());
12828 TE.ReorderIndices.clear();
12835 const InstructionsState &S,
12841 return V->getType()->getScalarType()->isFloatingPointTy();
12843 "Can only convert to FMA for floating point types");
12844 assert(S.isAddSubLikeOp() &&
"Can only convert to FMA for add/sub");
12849 for (
Value *V : VL) {
12853 if (S.isCopyableElement(
I))
12855 Instruction *MatchingI = S.getMatchingMainOpOrAltOp(
I);
12856 if (S.getMainOp() != MatchingI && S.getAltOp() != MatchingI)
12859 FMF &= FPCI->getFastMathFlags();
12863 if (!CheckForContractable(VL))
12866 InstructionsCompatibilityAnalysis
Analysis(DT,
DL,
TTI, TLI);
12873 if (OpS.isAltShuffle() || OpS.getOpcode() != Instruction::FMul)
12875 if (!CheckForContractable(Operands.
front()))
12883 for (
Value *V : VL) {
12887 if (!S.isCopyableElement(
I))
12889 FMF &= FPCI->getFastMathFlags();
12890 FMulPlusFAddCost +=
TTI.getInstructionCost(
I,
CostKind);
12893 for (
auto [V,
Op] :
zip(VL, Operands.
front())) {
12894 if (S.isCopyableElement(V))
12897 if (!
I || !
I->hasOneUse() || OpS.isCopyableElement(
I)) {
12899 FMACost +=
TTI.getInstructionCost(OpI,
CostKind);
12906 FMF &= FPCI->getFastMathFlags();
12907 FMulPlusFAddCost +=
TTI.getInstructionCost(
I,
CostKind);
12917 BaseGraphSize = VectorizableTree.size();
12919 class GraphTransformModeRAAI {
12920 bool &SavedIsGraphTransformMode;
12923 GraphTransformModeRAAI(
bool &IsGraphTransformMode)
12924 : SavedIsGraphTransformMode(IsGraphTransformMode) {
12925 IsGraphTransformMode =
true;
12927 ~GraphTransformModeRAAI() { SavedIsGraphTransformMode =
false; }
12928 } TransformContext(IsGraphTransformMode);
12937 const InstructionsState &S) {
12941 I2->getOperand(
Op));
12943 Candidates, [
this](
ArrayRef<std::pair<Value *, Value *>> Cand) {
12945 [](
const std::pair<Value *, Value *> &
P) {
12955 TreeEntry &E = *VectorizableTree[Idx];
12957 reorderGatherNode(E);
12962 constexpr unsigned VFLimit = 16;
12963 bool ForceLoadGather =
12964 count_if(VectorizableTree, [&](
const std::unique_ptr<TreeEntry> &TE) {
12965 return TE->isGather() && TE->hasState() &&
12966 TE->getOpcode() == Instruction::Load &&
12967 TE->getVectorFactor() < VFLimit;
12973 return TE->isSame(VL) ||
all_of(VL, [&](
Value *V) {
12982 auto CheckForSameVectorNodes = [&](
const TreeEntry &E) {
12983 if (E.hasState()) {
12985 !TEs.
empty() &&
any_of(TEs, [&](
const TreeEntry *TE) {
12986 return AreReusedScalars(TE, E.Scalars, [&](
Value *V) {
12987 ArrayRef<TreeEntry *> VTEs = getTreeEntries(V);
12988 return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
12989 return is_contained(TEs, TE);
12996 !TEs.
empty() &&
any_of(TEs, [&](
const TreeEntry *TE) {
12997 return AreReusedScalars(TE, E.Scalars, [&](
Value *V) {
12998 ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V);
12999 return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
13000 return is_contained(TEs, TE);
13008 if (It != E.Scalars.end()) {
13010 !TEs.empty() &&
any_of(TEs, [&](
const TreeEntry *TE) {
13011 return AreReusedScalars(TE, E.Scalars, [&](
Value *V) {
13012 ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V);
13013 return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
13014 return is_contained(TEs, TE);
13024 for (
unsigned Idx :
seq<unsigned>(BaseGraphSize)) {
13025 TreeEntry &
E = *VectorizableTree[Idx];
13026 if (
E.isGather()) {
13029 unsigned MinVF =
getMinVF(2 * Sz);
13032 if (VL.
size() <= 2 || LoadEntriesToVectorize.contains(Idx) ||
13033 !(!
E.hasState() ||
E.getOpcode() == Instruction::Load ||
13039 if (ForceLoadGather &&
E.hasState() &&
E.getOpcode() == Instruction::Load)
13042 if (CheckForSameVectorNodes(
E))
13046 unsigned StartIdx = 0;
13047 unsigned End = VL.
size();
13049 *TTI, VL.
front()->getType(), VL.
size() - 1);
13051 *TTI, VL.
front()->getType(), VF - 1)) {
13052 if (StartIdx + VF > End)
13055 bool AllStrided =
true;
13056 for (
unsigned Cnt = StartIdx; Cnt + VF <= End; Cnt += VF) {
13061 !getSameValuesTreeEntry(Slice.
front(), Slice,
true))
13068 bool IsSplat =
isSplat(Slice);
13069 bool IsTwoRegisterSplat =
true;
13070 if (IsSplat && VF == 2) {
13073 IsTwoRegisterSplat = NumRegs2VF == 2;
13075 if (Slices.
empty() || !IsSplat || !IsTwoRegisterSplat ||
13083 (S.getOpcode() == Instruction::Load &&
13085 (S.getOpcode() != Instruction::Load &&
13091 if ((!UserIgnoreList ||
E.Idx != 0) &&
13092 TTI->getInstructionCost(S.getMainOp(),
CostKind) <
13101 if (S.getOpcode() == Instruction::Load) {
13104 StridedPtrInfo SPtrInfo;
13106 PointerOps, SPtrInfo);
13117 if (UserIgnoreList &&
E.Idx == 0)
13122 }
else if (S.getOpcode() == Instruction::ExtractElement ||
13123 (TTI->getInstructionCost(S.getMainOp(),
CostKind) <
13125 !CheckOperandsProfitability(
13142 if (VF == 2 && AllStrided && Slices.
size() > 2)
13144 auto AddCombinedNode = [&](
unsigned Idx,
unsigned Cnt,
unsigned Sz) {
13145 E.CombinedEntriesWithIndices.emplace_back(Idx, Cnt);
13146 if (StartIdx == Cnt)
13147 StartIdx = Cnt + Sz;
13148 if (End == Cnt + Sz)
13151 for (
auto [Cnt, Sz] : Slices) {
13153 const TreeEntry *SameTE =
nullptr;
13155 It != Slice.
end()) {
13157 SameTE = getSameValuesTreeEntry(*It, Slice);
13159 unsigned PrevSize = VectorizableTree.size();
13160 [[maybe_unused]]
unsigned PrevEntriesSize =
13161 LoadEntriesToVectorize.size();
13162 buildTreeRec(Slice, 0,
EdgeInfo(&
E, UINT_MAX));
13163 if (PrevSize + 1 == VectorizableTree.size() && !SameTE &&
13164 VectorizableTree[PrevSize]->isGather() &&
13165 VectorizableTree[PrevSize]->hasState() &&
13166 VectorizableTree[PrevSize]->getOpcode() !=
13167 Instruction::ExtractElement &&
13169 if (UserIgnoreList &&
E.Idx == 0 && VF == 2)
13171 VectorizableTree.pop_back();
13172 assert(PrevEntriesSize == LoadEntriesToVectorize.size() &&
13173 "LoadEntriesToVectorize expected to remain the same");
13176 AddCombinedNode(PrevSize, Cnt, Sz);
13180 if (
E.CombinedEntriesWithIndices.empty() && !
E.ReorderIndices.empty()) {
13181 SmallVector<int>
Mask(
E.ReorderIndices.begin(),
E.ReorderIndices.end());
13183 E.ReorderIndices.clear();
13188 switch (
E.getOpcode()) {
13189 case Instruction::Load: {
13192 if (
E.State != TreeEntry::Vectorize)
13194 Type *ScalarTy =
E.getMainOp()->getType();
13200 TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
13201 SmallVector<int>
Mask;
13205 TTI->getMemoryOpCost(Instruction::Load, VecTy, BaseLI->getAlign(),
13206 BaseLI->getPointerAddressSpace(),
CostKind,
13210 Instruction::Load, VecTy, BaseLI->getPointerOperand(),
13211 false, CommonAlignment,
CostKind, BaseLI);
13216 ->getPointerOperand()
13218 StridedPtrInfo SPtrInfo;
13219 SPtrInfo.StrideVal = ConstantInt::get(StrideTy, 1);
13220 SPtrInfo.Ty = VecTy;
13221 TreeEntryToStridedPtrInfoMap[&
E] = SPtrInfo;
13222 E.State = TreeEntry::StridedVectorize;
13227 case Instruction::Store: {
13235 TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
13236 SmallVector<int>
Mask;
13240 TTI->getMemoryOpCost(Instruction::Store, VecTy, BaseSI->getAlign(),
13241 BaseSI->getPointerAddressSpace(),
CostKind,
13245 Instruction::Store, VecTy, BaseSI->getPointerOperand(),
13246 false, CommonAlignment,
CostKind, BaseSI);
13247 if (StridedCost < OriginalVecCost)
13250 E.State = TreeEntry::StridedVectorize;
13251 }
else if (!
E.ReorderIndices.empty()) {
13253 auto IsInterleaveMask = [&, &TTI = *TTI](ArrayRef<int>
Mask) {
13255 assert(
Mask.size() > 1 &&
"Expected mask greater than 1 element.");
13256 if (
Mask.size() < 4)
13260 Mask, Factor, VecTy->getElementCount().getFixedValue()) &&
13261 TTI.isLegalInterleavedAccessType(
13262 VecTy, Factor, BaseSI->getAlign(),
13263 BaseSI->getPointerAddressSpace()))
13269 SmallVector<int>
Mask(
E.ReorderIndices.begin(),
E.ReorderIndices.end());
13270 unsigned InterleaveFactor = IsInterleaveMask(Mask);
13271 if (InterleaveFactor != 0)
13272 E.setInterleave(InterleaveFactor);
13276 case Instruction::Select: {
13277 if (
E.State != TreeEntry::Vectorize)
13283 E.CombinedOp = TreeEntry::MinMax;
13284 TreeEntry *CondEntry = getOperandEntry(&
E, 0);
13285 if (SelectOnly && CondEntry->UserTreeIndex &&
13286 CondEntry->State == TreeEntry::Vectorize) {
13288 CondEntry->State = TreeEntry::CombinedVectorize;
13292 case Instruction::FSub:
13293 case Instruction::FAdd: {
13295 if (
E.State != TreeEntry::Vectorize ||
13296 !
E.getOperations().isAddSubLikeOp())
13302 E.CombinedOp = TreeEntry::FMulAdd;
13303 TreeEntry *FMulEntry = getOperandEntry(&
E, 0);
13304 if (FMulEntry->UserTreeIndex &&
13305 FMulEntry->State == TreeEntry::Vectorize) {
13307 FMulEntry->State = TreeEntry::CombinedVectorize;
13316 if (LoadEntriesToVectorize.empty()) {
13318 if (VectorizableTree.size() <= 1 && VectorizableTree.front()->hasState() &&
13319 VectorizableTree.front()->getOpcode() == Instruction::Load)
13322 constexpr unsigned SmallTree = 3;
13323 constexpr unsigned SmallVF = 2;
13324 if ((VectorizableTree.size() <= SmallTree &&
13325 VectorizableTree.front()->Scalars.size() == SmallVF) ||
13326 (VectorizableTree.size() <= 2 && UserIgnoreList))
13329 if (VectorizableTree.front()->isNonPowOf2Vec() &&
13333 [](
const std::unique_ptr<TreeEntry> &TE) {
13334 return TE->isGather() &&
TE->hasState() &&
13335 TE->getOpcode() == Instruction::Load &&
13343 SmallMapVector<std::tuple<BasicBlock *, Value *, Type *>,
13347 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
13348 TreeEntry &
E = *
TE;
13349 if (
E.isGather() &&
13350 ((
E.hasState() &&
E.getOpcode() == Instruction::Load) ||
13351 (!
E.hasState() &&
any_of(
E.Scalars,
13353 return isa<LoadInst>(V) &&
13354 !isVectorized(V) &&
13355 !isDeleted(cast<Instruction>(V));
13358 for (
Value *V :
E.Scalars) {
13365 *
this, V, *DL, *SE, *TTI,
13366 GatheredLoads[std::make_tuple(
13374 if (!GatheredLoads.
empty())
13375 tryToVectorizeGatheredLoads(GatheredLoads);
13385 bool IsFinalized =
false;
13398 bool SameNodesEstimated =
true;
13401 if (Ty->getScalarType()->isPointerTy()) {
13405 DL.getTypeStoreSizeInBits(Ty->getScalarType()))),
13406 Ty->getScalarType());
13424 assert(It != VL.
end() &&
"Expected at least one non-undef value.");
13427 count(VL, *It) > 1 &&
13429 if (!NeedShuffle) {
13432 return TTI.getShuffleCost(
13437 return TTI.getVectorInstrCost(Instruction::InsertElement, VecTy,
13438 CostKind, std::distance(VL.
begin(), It),
13444 return isa<PoisonValue>(V) ? PoisonMaskElem : 0;
13447 TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, 0,
13451 VecTy, ShuffleMask, CostKind,
13455 return GatherCost +
13458 : R.getGatherCost(Gathers, !Root && VL.
equals(Gathers),
13466 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
13467 unsigned NumParts) {
13468 assert(VL.
size() > NumParts &&
"Unexpected scalarized shuffle.");
13470 std::accumulate(VL.
begin(), VL.
end(), 0, [](
unsigned Sz,
Value *V) {
13471 auto *EE = dyn_cast<ExtractElementInst>(V);
13474 auto *VecTy = dyn_cast<FixedVectorType>(EE->getVectorOperandType());
13477 return std::max(Sz, VecTy->getNumElements());
13484 -> std::optional<TTI::ShuffleKind> {
13485 if (NumElts <= EltsPerVector)
13486 return std::nullopt;
13488 alignDown(std::accumulate(Mask.begin(), Mask.end(), INT_MAX,
13490 if (I == PoisonMaskElem)
13492 return std::min(S, I);
13495 int OffsetReg1 = OffsetReg0;
13499 int FirstRegId = -1;
13500 Indices.assign(1, OffsetReg0);
13504 int Idx =
I - OffsetReg0;
13506 (Idx / NumElts) * NumParts + (Idx % NumElts) / EltsPerVector;
13507 if (FirstRegId < 0)
13508 FirstRegId = RegId;
13509 RegIndices.
insert(RegId);
13510 if (RegIndices.
size() > 2)
13511 return std::nullopt;
13512 if (RegIndices.
size() == 2) {
13514 if (Indices.
size() == 1) {
13517 std::next(Mask.begin(), Pos), Mask.end(), INT_MAX,
13518 [&](
int S,
int I) {
13519 if (I == PoisonMaskElem)
13521 int RegId = ((I - OffsetReg0) / NumElts) * NumParts +
13522 ((I - OffsetReg0) % NumElts) / EltsPerVector;
13523 if (RegId == FirstRegId)
13525 return std::min(S, I);
13528 unsigned Index = OffsetReg1 % NumElts;
13529 Indices.push_back(Index);
13530 SubVecSizes.push_back(std::min(NumElts - Index, EltsPerVector));
13532 Idx =
I - OffsetReg1;
13534 I = (Idx % NumElts) % EltsPerVector +
13535 (RegId == FirstRegId ? 0 : EltsPerVector);
13537 return ShuffleKind;
13545 if (!ShuffleKinds[Part])
13548 Part * EltsPerVector,
getNumElems(Mask.size(), EltsPerVector, Part));
13553 std::optional<TTI::ShuffleKind> RegShuffleKind =
13554 CheckPerRegistersShuffle(SubMask, Indices, SubVecSizes);
13555 if (!RegShuffleKind) {
13558 MaskSlice, std::max<unsigned>(NumElts, MaskSlice.
size())))
13571 *R.TTI, VL.
front()->getType(),
alignTo(NumElts, EltsPerVector));
13572 for (
const auto [Idx, SubVecSize] :
zip(Indices, SubVecSizes)) {
13573 assert((Idx + SubVecSize) <= BaseVF &&
13574 "SK_ExtractSubvector index out of range");
13584 TTI, *ShuffleKinds[Part],
getWidenedType(ScalarTy, NumElts), SubMask);
13585 if (OriginalCost < Cost)
13586 Cost = OriginalCost;
13593 void estimateNodesPermuteCost(
const TreeEntry &E1,
const TreeEntry *E2,
13595 unsigned SliceSize) {
13596 if (SameNodesEstimated) {
13602 if ((InVectors.size() == 2 &&
13606 unsigned Limit =
getNumElems(Mask.size(), SliceSize, Part);
13609 "Expected all poisoned elements.");
13611 copy(SubMask, std::next(CommonMask.begin(), SliceSize * Part));
13616 Cost += createShuffle(InVectors.front(),
13617 InVectors.size() == 1 ?
nullptr : InVectors.back(),
13619 transformMaskAfterShuffle(CommonMask, CommonMask);
13620 }
else if (InVectors.size() == 2) {
13621 Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
13622 transformMaskAfterShuffle(CommonMask, CommonMask);
13624 SameNodesEstimated =
false;
13625 if (!E2 && InVectors.size() == 1) {
13626 unsigned VF = E1.getVectorFactor();
13628 VF = std::max(VF, getVF(V1));
13631 VF = std::max(VF, E->getVectorFactor());
13633 for (
unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
13635 CommonMask[Idx] = Mask[Idx] + VF;
13636 Cost += createShuffle(InVectors.front(), &E1, CommonMask);
13637 transformMaskAfterShuffle(CommonMask, CommonMask);
13639 auto P = InVectors.front();
13640 Cost += createShuffle(&E1, E2, Mask);
13641 unsigned VF = Mask.size();
13647 VF = std::max(VF, E->getVectorFactor());
13649 for (
unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
13651 CommonMask[Idx] = Idx + (InVectors.empty() ? 0 : VF);
13652 Cost += createShuffle(
P, InVectors.front(), CommonMask);
13653 transformMaskAfterShuffle(CommonMask, CommonMask);
13657 class ShuffleCostBuilder {
13660 static bool isEmptyOrIdentity(
ArrayRef<int> Mask,
unsigned VF) {
13662 return Mask.empty() ||
13663 (VF == Mask.size() &&
13671 ~ShuffleCostBuilder() =
default;
13677 if (isEmptyOrIdentity(Mask, VF))
13686 if (isEmptyOrIdentity(Mask, VF))
13695 void resizeToMatch(
Value *&,
Value *&)
const {}
13705 ShuffleCostBuilder Builder(TTI);
13708 unsigned CommonVF = Mask.size();
13710 auto GetNodeMinBWAffectedCost = [&](
const TreeEntry &E,
13714 Type *EScalarTy = E.Scalars.front()->getType();
13715 bool IsSigned =
true;
13716 if (
auto It = R.MinBWs.find(&E); It != R.MinBWs.end()) {
13718 IsSigned = It->second.second;
13720 if (EScalarTy != ScalarTy) {
13721 unsigned CastOpcode = Instruction::Trunc;
13722 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
13723 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
13725 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
13726 return TTI.getCastInstrCost(CastOpcode,
getWidenedType(ScalarTy, VF),
13736 Type *EScalarTy = VecTy->getElementType();
13737 if (EScalarTy != ScalarTy) {
13739 unsigned CastOpcode = Instruction::Trunc;
13740 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
13741 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
13743 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
13744 return TTI.getCastInstrCost(
13750 if (!V1 && !V2 && !P2.
isNull()) {
13753 unsigned VF = E->getVectorFactor();
13755 CommonVF = std::max(VF, E2->getVectorFactor());
13758 return Idx < 2 * static_cast<int>(CommonVF);
13760 "All elements in mask must be less than 2 * CommonVF.");
13761 if (E->Scalars.size() == E2->Scalars.size()) {
13765 for (
int &Idx : CommonMask) {
13768 if (Idx <
static_cast<int>(CommonVF) && !EMask.
empty())
13770 else if (Idx >=
static_cast<int>(CommonVF))
13771 Idx = (E2Mask.
empty() ? Idx - CommonVF : E2Mask[Idx - CommonVF]) +
13775 CommonVF = E->Scalars.size();
13776 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF) +
13777 GetNodeMinBWAffectedCost(*E2, CommonVF);
13779 ExtraCost += GetNodeMinBWAffectedCost(*E, E->getVectorFactor()) +
13780 GetNodeMinBWAffectedCost(*E2, E2->getVectorFactor());
13783 V2 = getAllOnesValue(*R.DL,
getWidenedType(ScalarTy, CommonVF));
13784 }
else if (!V1 && P2.
isNull()) {
13787 unsigned VF = E->getVectorFactor();
13791 [=](
int Idx) {
return Idx < static_cast<int>(CommonVF); }) &&
13792 "All elements in mask must be less than CommonVF.");
13793 if (E->Scalars.size() == Mask.size() && VF != Mask.size()) {
13795 assert(!EMask.
empty() &&
"Expected non-empty common mask.");
13796 for (
int &Idx : CommonMask) {
13800 CommonVF = E->Scalars.size();
13801 }
else if (
unsigned Factor = E->getInterleaveFactor();
13802 Factor > 0 && E->Scalars.size() != Mask.size() &&
13806 std::iota(CommonMask.begin(), CommonMask.end(), 0);
13808 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF);
13811 if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() &&
13812 CommonVF == CommonMask.size() &&
13814 [](
const auto &&
P) {
13816 static_cast<unsigned>(
P.value()) !=
P.index();
13824 }
else if (V1 && P2.
isNull()) {
13826 ExtraCost += GetValueMinBWAffectedCost(V1);
13827 CommonVF = getVF(V1);
13830 [=](
int Idx) {
return Idx < static_cast<int>(CommonVF); }) &&
13831 "All elements in mask must be less than CommonVF.");
13832 }
else if (V1 && !V2) {
13834 unsigned VF = getVF(V1);
13836 CommonVF = std::max(VF, E2->getVectorFactor());
13839 return Idx < 2 * static_cast<int>(CommonVF);
13841 "All elements in mask must be less than 2 * CommonVF.");
13842 if (E2->Scalars.size() == VF && VF != CommonVF) {
13844 assert(!E2Mask.
empty() &&
"Expected non-empty common mask.");
13845 for (
int &Idx : CommonMask) {
13848 if (Idx >=
static_cast<int>(CommonVF))
13849 Idx = E2Mask[Idx - CommonVF] + VF;
13853 ExtraCost += GetValueMinBWAffectedCost(V1);
13855 ExtraCost += GetNodeMinBWAffectedCost(
13856 *E2, std::min(CommonVF, E2->getVectorFactor()));
13857 V2 = getAllOnesValue(*R.DL,
getWidenedType(ScalarTy, CommonVF));
13858 }
else if (!V1 && V2) {
13860 unsigned VF = getVF(V2);
13862 CommonVF = std::max(VF, E1->getVectorFactor());
13865 return Idx < 2 * static_cast<int>(CommonVF);
13867 "All elements in mask must be less than 2 * CommonVF.");
13868 if (E1->Scalars.size() == VF && VF != CommonVF) {
13870 assert(!E1Mask.
empty() &&
"Expected non-empty common mask.");
13871 for (
int &Idx : CommonMask) {
13874 if (Idx >=
static_cast<int>(CommonVF))
13875 Idx = E1Mask[Idx - CommonVF] + VF;
13881 ExtraCost += GetNodeMinBWAffectedCost(
13882 *E1, std::min(CommonVF, E1->getVectorFactor()));
13884 ExtraCost += GetValueMinBWAffectedCost(V2);
13885 V2 = getAllOnesValue(*R.DL,
getWidenedType(ScalarTy, CommonVF));
13887 assert(V1 && V2 &&
"Expected both vectors.");
13888 unsigned VF = getVF(V1);
13889 CommonVF = std::max(VF, getVF(V2));
13892 return Idx < 2 * static_cast<int>(CommonVF);
13894 "All elements in mask must be less than 2 * CommonVF.");
13896 GetValueMinBWAffectedCost(V1) + GetValueMinBWAffectedCost(V2);
13899 V2 = getAllOnesValue(*R.DL,
getWidenedType(ScalarTy, CommonVF));
13904 V2 = getAllOnesValue(*R.DL,
getWidenedType(ScalarTy, CommonVF));
13907 InVectors.front() =
13909 if (InVectors.size() == 2)
13910 InVectors.pop_back();
13911 return ExtraCost + BaseShuffleAnalysis::createShuffle<InstructionCost>(
13912 V1, V2, CommonMask, Builder, ScalarTy);
13919 : BaseShuffleAnalysis(ScalarTy), TTI(TTI),
13920 VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()), R(R),
13921 CheckedExtracts(CheckedExtracts) {}
13923 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
13924 unsigned NumParts,
bool &UseVecBaseAsInput) {
13925 UseVecBaseAsInput =
false;
13928 Value *VecBase =
nullptr;
13930 if (!E->ReorderIndices.empty()) {
13932 E->ReorderIndices.end());
13937 bool PrevNodeFound =
any_of(
13938 ArrayRef(R.VectorizableTree).take_front(E->Idx),
13939 [&](
const std::unique_ptr<TreeEntry> &TE) {
13940 return ((TE->hasState() && !TE->isAltShuffle() &&
13941 TE->getOpcode() == Instruction::ExtractElement) ||
13943 all_of(enumerate(TE->Scalars), [&](auto &&Data) {
13944 return VL.size() > Data.index() &&
13945 (Mask[Data.index()] == PoisonMaskElem ||
13946 isa<UndefValue>(VL[Data.index()]) ||
13947 Data.value() == VL[Data.index()]);
13955 ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
13969 VecBase = EE->getVectorOperand();
13970 UniqueBases.
insert(VecBase);
13972 if (!CheckedExtracts.
insert(V).second ||
13976 return isa<GetElementPtrInst>(U) &&
13977 !R.areAllUsersVectorized(cast<Instruction>(U),
13985 unsigned Idx = *EEIdx;
13987 if (EE->hasOneUse() || !PrevNodeFound) {
13993 Cost -= TTI.getExtractWithExtendCost(
13994 Ext->getOpcode(), Ext->getType(), EE->getVectorOperandType(),
13997 Cost += TTI.getCastInstrCost(
13998 Ext->getOpcode(), Ext->getType(), EE->getType(),
14003 APInt &DemandedElts =
14004 VectorOpsToExtracts
14007 .first->getSecond();
14008 DemandedElts.
setBit(Idx);
14011 for (
const auto &[Vec, DemandedElts] : VectorOpsToExtracts)
14013 DemandedElts,
false,
14021 if (!PrevNodeFound)
14022 Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts);
14025 transformMaskAfterShuffle(CommonMask, CommonMask);
14026 SameNodesEstimated =
false;
14027 if (NumParts != 1 && UniqueBases.
size() != 1) {
14028 UseVecBaseAsInput =
true;
14036 std::optional<InstructionCost>
14040 return std::nullopt;
14044 IsFinalized =
false;
14045 CommonMask.clear();
14048 VectorizedVals.clear();
14049 SameNodesEstimated =
true;
14055 return Idx < static_cast<int>(E1.getVectorFactor());
14057 "Expected single vector shuffle mask.");
14061 if (InVectors.empty()) {
14062 CommonMask.assign(Mask.begin(), Mask.end());
14063 InVectors.assign({&E1, &E2});
14066 assert(!CommonMask.empty() &&
"Expected non-empty common mask.");
14072 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
14073 estimateNodesPermuteCost(E1, &E2, Mask, Part, SliceSize);
14076 if (InVectors.empty()) {
14077 CommonMask.assign(Mask.begin(), Mask.end());
14078 InVectors.assign(1, &E1);
14081 assert(!CommonMask.empty() &&
"Expected non-empty common mask.");
14087 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
14088 estimateNodesPermuteCost(E1,
nullptr, Mask, Part, SliceSize);
14089 if (!SameNodesEstimated && InVectors.size() == 1)
14090 InVectors.emplace_back(&E1);
14096 assert(InVectors.size() == 1 &&
14103 ->getOrdered(
P.index()));
14104 return EI->getVectorOperand() == V1 ||
14105 EI->getVectorOperand() == V2;
14107 "Expected extractelement vectors.");
14111 if (InVectors.empty()) {
14112 assert(CommonMask.empty() && !ForExtracts &&
14113 "Expected empty input mask/vectors.");
14114 CommonMask.assign(Mask.begin(), Mask.end());
14115 InVectors.assign(1, V1);
14121 !CommonMask.empty() &&
14125 ->getOrdered(
P.index());
14127 return P.value() == Mask[
P.index()] ||
14132 return EI->getVectorOperand() == V1;
14134 "Expected only tree entry for extractelement vectors.");
14137 assert(!InVectors.empty() && !CommonMask.empty() &&
14138 "Expected only tree entries from extracts/reused buildvectors.");
14139 unsigned VF = getVF(V1);
14140 if (InVectors.size() == 2) {
14141 Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
14142 transformMaskAfterShuffle(CommonMask, CommonMask);
14143 VF = std::max<unsigned>(VF, CommonMask.size());
14144 }
else if (
const auto *InTE =
14145 InVectors.front().dyn_cast<
const TreeEntry *>()) {
14146 VF = std::max(VF, InTE->getVectorFactor());
14150 ->getNumElements());
14152 InVectors.push_back(V1);
14153 for (
unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
14155 CommonMask[Idx] = Mask[Idx] + VF;
14158 Value *Root =
nullptr) {
14159 Cost += getBuildVectorCost(VL, Root);
14163 unsigned VF = VL.
size();
14165 VF = std::min(VF, MaskVF);
14166 Type *VLScalarTy = VL.
front()->getType();
14190 getAllOnesValue(*R.DL, ScalarTy->getScalarType()));
14196 ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
14201 IsFinalized =
true;
14204 if (InVectors.
size() == 2)
14205 Cost += createShuffle(Vec, InVectors.
back(), CommonMask);
14207 Cost += createShuffle(Vec,
nullptr, CommonMask);
14208 transformMaskAfterShuffle(CommonMask, CommonMask);
14210 "Expected vector length for the final value before action.");
14213 Cost += createShuffle(V1, V2, Mask);
14216 InVectors.
front() = V;
14218 if (!SubVectors.empty()) {
14220 if (InVectors.
size() == 2)
14221 Cost += createShuffle(Vec, InVectors.
back(), CommonMask);
14223 Cost += createShuffle(Vec,
nullptr, CommonMask);
14224 transformMaskAfterShuffle(CommonMask, CommonMask);
14226 if (!SubVectorsMask.
empty()) {
14228 "Expected same size of masks for subvectors and common mask.");
14230 copy(SubVectorsMask, SVMask.begin());
14231 for (
auto [I1, I2] :
zip(SVMask, CommonMask)) {
14234 I1 = I2 + CommonMask.
size();
14241 for (
auto [
E, Idx] : SubVectors) {
14242 Type *EScalarTy =
E->Scalars.front()->getType();
14243 bool IsSigned =
true;
14244 if (
auto It =
R.MinBWs.find(
E); It !=
R.MinBWs.end()) {
14247 IsSigned = It->second.second;
14249 if (ScalarTy != EScalarTy) {
14250 unsigned CastOpcode = Instruction::Trunc;
14251 unsigned DstSz =
R.DL->getTypeSizeInBits(ScalarTy);
14252 unsigned SrcSz =
R.DL->getTypeSizeInBits(EScalarTy);
14254 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
14255 Cost += TTI.getCastInstrCost(
14264 if (!CommonMask.
empty()) {
14265 std::iota(std::next(CommonMask.
begin(), Idx),
14266 std::next(CommonMask.
begin(), Idx +
E->getVectorFactor()),
14272 if (!ExtMask.
empty()) {
14273 if (CommonMask.
empty()) {
14277 for (
int I = 0, Sz = ExtMask.
size();
I < Sz; ++
I) {
14280 NewMask[
I] = CommonMask[ExtMask[
I]];
14282 CommonMask.
swap(NewMask);
14285 if (CommonMask.
empty()) {
14286 assert(InVectors.
size() == 1 &&
"Expected only one vector with no mask");
14290 createShuffle(InVectors.
front(),
14291 InVectors.
size() == 2 ? InVectors.
back() :
nullptr,
14296 assert((IsFinalized || CommonMask.empty()) &&
14297 "Shuffle construction must be finalized.");
14301const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(
const TreeEntry *E,
14302 unsigned Idx)
const {
14303 TreeEntry *
Op = OperandsToTreeEntry.
at({E, Idx});
14304 assert(
Op->isSame(E->getOperand(Idx)) &&
"Operands mismatch!");
14309 if (TE.State == TreeEntry::ScatterVectorize ||
14310 TE.State == TreeEntry::StridedVectorize)
14312 if (TE.State == TreeEntry::CompressVectorize)
14314 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::Load &&
14315 !TE.isAltShuffle()) {
14316 if (TE.ReorderIndices.empty())
14328 SmallPtrSetImpl<Value *> &CheckedExtracts) {
14338 auto It = MinBWs.find(
E);
14339 Type *OrigScalarTy = ScalarTy;
14340 if (It != MinBWs.end()) {
14347 unsigned EntryVF =
E->getVectorFactor();
14350 if (
E->isGather()) {
14356 ScalarTy = VL.
front()->getType();
14357 return processBuildVector<ShuffleCostEstimator, InstructionCost>(
14358 E, ScalarTy, *TTI, VectorizedVals, *
this, CheckedExtracts);
14360 if (
E->State == TreeEntry::SplitVectorize) {
14361 assert(
E->CombinedEntriesWithIndices.size() == 2 &&
14362 "Expected exactly 2 combined entries.");
14363 assert(
E->ReuseShuffleIndices.empty() &&
"Expected empty reuses mask.");
14365 if (
E->ReorderIndices.empty()) {
14368 E->CombinedEntriesWithIndices.back().second,
14371 VectorizableTree[
E->CombinedEntriesWithIndices.back().first]
14372 ->getVectorFactor()));
14374 unsigned CommonVF =
14375 std::max(VectorizableTree[
E->CombinedEntriesWithIndices.front().first]
14376 ->getVectorFactor(),
14377 VectorizableTree[
E->CombinedEntriesWithIndices.back().first]
14378 ->getVectorFactor());
14383 LLVM_DEBUG(dumpTreeCosts(
E, 0, VectorCost, 0,
"Calculated costs for Tree"));
14387 SmallVector<int>
Mask;
14388 if (!
E->ReorderIndices.empty() &&
E->State != TreeEntry::CompressVectorize &&
14389 (
E->State != TreeEntry::StridedVectorize ||
14391 SmallVector<int> NewMask;
14392 if (
E->getOpcode() == Instruction::Store) {
14394 NewMask.
resize(
E->ReorderIndices.size());
14401 if (!
E->ReuseShuffleIndices.empty())
14406 assert((
E->State == TreeEntry::Vectorize ||
14407 E->State == TreeEntry::ScatterVectorize ||
14408 E->State == TreeEntry::StridedVectorize ||
14409 E->State == TreeEntry::CompressVectorize) &&
14410 "Unhandled state");
14413 (
E->getOpcode() == Instruction::GetElementPtr &&
14414 E->getMainOp()->getType()->isPointerTy()) ||
14415 E->hasCopyableElements()) &&
14418 unsigned ShuffleOrOp =
14419 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector :
E->
getOpcode();
14420 if (
E->CombinedOp != TreeEntry::NotCombinedOp)
14421 ShuffleOrOp =
E->CombinedOp;
14422 SmallSetVector<Value *, 16> UniqueValues(VL.
begin(), VL.
end());
14423 const unsigned Sz = UniqueValues.size();
14424 SmallBitVector UsedScalars(Sz,
false);
14425 for (
unsigned I = 0;
I < Sz; ++
I) {
14427 !
E->isCopyableElement(UniqueValues[
I]) &&
14428 getTreeEntries(UniqueValues[
I]).
front() ==
E)
14430 UsedScalars.set(
I);
14432 auto GetCastContextHint = [&](
Value *
V) {
14434 return getCastContextHint(*OpTEs.front());
14435 InstructionsState SrcState =
getSameOpcode(
E->getOperand(0), *TLI);
14436 if (SrcState && SrcState.getOpcode() == Instruction::Load &&
14437 !SrcState.isAltShuffle())
14450 ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0);
14452 for (
unsigned I = 0;
I < Sz; ++
I) {
14453 if (UsedScalars.test(
I))
14455 ScalarCost += ScalarEltCost(
I);
14464 (
E->getOpcode() != Instruction::Load ||
E->UserTreeIndex)) {
14466 if (!EI.UserTE->hasState() ||
14467 EI.UserTE->getOpcode() != Instruction::Select ||
14469 auto UserBWIt = MinBWs.find(EI.UserTE);
14470 Type *UserScalarTy =
14471 (EI.UserTE->isGather() ||
14472 EI.UserTE->State == TreeEntry::SplitVectorize)
14473 ? EI.UserTE->Scalars.front()->getType()
14474 : EI.UserTE->getOperand(EI.EdgeIdx).front()->getType();
14475 if (UserBWIt != MinBWs.end())
14477 UserBWIt->second.first);
14478 if (ScalarTy != UserScalarTy) {
14479 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
14480 unsigned SrcBWSz = DL->getTypeSizeInBits(UserScalarTy);
14481 unsigned VecOpcode;
14483 if (BWSz > SrcBWSz)
14484 VecOpcode = Instruction::Trunc;
14487 It->second.second ? Instruction::SExt : Instruction::ZExt;
14489 VecCost += TTI->getCastInstrCost(VecOpcode, UserVecTy, VecTy, CCH,
14494 LLVM_DEBUG(dumpTreeCosts(
E, CommonCost, VecCost - CommonCost,
14495 ScalarCost,
"Calculated costs for Tree"));
14496 return VecCost - ScalarCost;
14501 assert((
E->State == TreeEntry::Vectorize ||
14502 E->State == TreeEntry::StridedVectorize ||
14503 E->State == TreeEntry::CompressVectorize) &&
14504 "Entry state expected to be Vectorize, StridedVectorize or "
14505 "MaskedLoadCompressVectorize here.");
14509 *TTI, Ptrs, BasePtr,
E->getOpcode(),
CostKind, OrigScalarTy, VecTy);
14510 LLVM_DEBUG(dumpTreeCosts(
E, 0, VecCost, ScalarCost,
14511 "Calculated GEPs cost for Tree"));
14513 return VecCost - ScalarCost;
14520 Type *CanonicalType = Ty;
14526 IntrinsicCostAttributes CostAttrs(MinMaxID, CanonicalType,
14527 {CanonicalType, CanonicalType});
14529 TTI->getIntrinsicInstrCost(CostAttrs,
CostKind);
14532 if (VI && SelectOnly) {
14534 "Expected only for scalar type.");
14537 CI->
getOpcode(), Ty, Builder.getInt1Ty(), CI->getPredicate(),
14538 CostKind, {TTI::OK_AnyValue, TTI::OP_None},
14539 {TTI::OK_AnyValue, TTI::OP_None}, CI);
14543 auto GetFMulAddCost = [&, &TTI = *TTI](
const InstructionsState &S,
14548 switch (ShuffleOrOp) {
14549 case Instruction::PHI: {
14552 SmallPtrSet<const TreeEntry *, 4> CountedOps;
14553 for (
Value *V : UniqueValues) {
14558 ValueList Operands(
PHI->getNumIncomingValues(),
nullptr);
14559 for (
unsigned I = 0,
N =
PHI->getNumIncomingValues();
I <
N; ++
I) {
14563 if (
const TreeEntry *OpTE =
14564 getSameValuesTreeEntry(Operands.
front(), Operands))
14565 if (CountedOps.
insert(OpTE).second &&
14566 !OpTE->ReuseShuffleIndices.empty())
14567 ScalarCost +=
TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() -
14568 OpTE->Scalars.size());
14571 return CommonCost - ScalarCost;
14573 case Instruction::ExtractValue:
14574 case Instruction::ExtractElement: {
14575 APInt DemandedElts;
14577 auto GetScalarCost = [&](
unsigned Idx) {
14583 if (ShuffleOrOp == Instruction::ExtractElement) {
14585 SrcVecTy = EE->getVectorOperandType();
14588 Type *AggregateTy = EV->getAggregateOperand()->getType();
14591 NumElts = ATy->getNumElements();
14597 if (
I->hasOneUse()) {
14607 Cost -= TTI->getCastInstrCost(
14608 Ext->getOpcode(),
Ext->getType(),
I->getType(),
14613 if (DemandedElts.
isZero())
14619 return CommonCost - (DemandedElts.
isZero()
14621 : TTI.getScalarizationOverhead(
14622 SrcVecTy, DemandedElts,
false,
14625 return GetCostDiff(GetScalarCost, GetVectorCost);
14627 case Instruction::InsertElement: {
14628 assert(
E->ReuseShuffleIndices.empty() &&
14629 "Unique insertelements only are expected.");
14631 unsigned const NumElts = SrcVecTy->getNumElements();
14632 unsigned const NumScalars = VL.
size();
14638 unsigned OffsetEnd = OffsetBeg;
14639 InsertMask[OffsetBeg] = 0;
14642 if (OffsetBeg > Idx)
14644 else if (OffsetEnd < Idx)
14646 InsertMask[Idx] =
I + 1;
14649 if (NumOfParts > 0 && NumOfParts < NumElts)
14650 VecScalarsSz =
PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
14651 unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
14653 unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
14654 unsigned InsertVecSz = std::min<unsigned>(
14656 ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * VecScalarsSz);
14657 bool IsWholeSubvector =
14658 OffsetBeg ==
Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
14662 if (OffsetBeg + InsertVecSz > VecSz) {
14665 InsertVecSz = VecSz;
14670 SmallVector<int>
Mask;
14671 if (!
E->ReorderIndices.empty()) {
14676 std::iota(
Mask.begin(), std::next(
Mask.begin(), InsertVecSz), 0);
14678 bool IsIdentity =
true;
14680 Mask.swap(PrevMask);
14681 for (
unsigned I = 0;
I < NumScalars; ++
I) {
14683 DemandedElts.
setBit(InsertIdx);
14684 IsIdentity &= InsertIdx - OffsetBeg ==
I;
14685 Mask[InsertIdx - OffsetBeg] =
I;
14687 assert(
Offset < NumElts &&
"Failed to find vector index offset");
14701 InsertVecTy, Mask);
14703 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
14709 SmallBitVector InMask =
14711 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask));
14712 if (!InMask.
all() && NumScalars != NumElts && !IsWholeSubvector) {
14713 if (InsertVecSz != VecSz) {
14718 for (
unsigned I = 0, End = OffsetBeg -
Offset;
I < End; ++
I)
14720 for (
unsigned I = OffsetBeg -
Offset, End = OffsetEnd -
Offset;
14724 for (
unsigned I = OffsetEnd + 1 -
Offset;
I < VecSz; ++
I)
14733 case Instruction::ZExt:
14734 case Instruction::SExt:
14735 case Instruction::FPToUI:
14736 case Instruction::FPToSI:
14737 case Instruction::FPExt:
14738 case Instruction::PtrToInt:
14739 case Instruction::IntToPtr:
14740 case Instruction::SIToFP:
14741 case Instruction::UIToFP:
14742 case Instruction::Trunc:
14743 case Instruction::FPTrunc:
14744 case Instruction::BitCast: {
14745 auto SrcIt = MinBWs.find(getOperandEntry(
E, 0));
14748 unsigned Opcode = ShuffleOrOp;
14749 unsigned VecOpcode = Opcode;
14751 (SrcIt != MinBWs.end() || It != MinBWs.end())) {
14753 unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy->
getScalarType());
14754 if (SrcIt != MinBWs.end()) {
14755 SrcBWSz = SrcIt->second.first;
14761 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->
getScalarType());
14762 if (BWSz == SrcBWSz) {
14763 VecOpcode = Instruction::BitCast;
14764 }
else if (BWSz < SrcBWSz) {
14765 VecOpcode = Instruction::Trunc;
14766 }
else if (It != MinBWs.end()) {
14767 assert(BWSz > SrcBWSz &&
"Invalid cast!");
14768 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
14769 }
else if (SrcIt != MinBWs.end()) {
14770 assert(BWSz > SrcBWSz &&
"Invalid cast!");
14772 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
14774 }
else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
14775 !SrcIt->second.second) {
14776 VecOpcode = Instruction::UIToFP;
14779 assert(Idx == 0 &&
"Expected 0 index only");
14780 return TTI->getCastInstrCost(Opcode, VL0->
getType(),
14787 if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
14789 auto *
VI = VL0->
getOpcode() == Opcode ? VL0 :
nullptr;
14792 bool IsArithmeticExtendedReduction =
14793 E->Idx == 0 && UserIgnoreList &&
14796 return is_contained({Instruction::Add, Instruction::FAdd,
14797 Instruction::Mul, Instruction::FMul,
14798 Instruction::And, Instruction::Or,
14802 if (IsArithmeticExtendedReduction &&
14803 (VecOpcode == Instruction::ZExt || VecOpcode == Instruction::SExt))
14805 return CommonCost +
14806 TTI->getCastInstrCost(VecOpcode, VecTy, SrcVecTy, CCH,
CostKind,
14807 VecOpcode == Opcode ? VI :
nullptr);
14809 return GetCostDiff(GetScalarCost, GetVectorCost);
14811 case Instruction::FCmp:
14812 case Instruction::ICmp:
14813 case Instruction::Select: {
14814 CmpPredicate VecPred, SwappedVecPred;
14817 match(VL0, MatchCmp))
14823 auto GetScalarCost = [&](
unsigned Idx) {
14833 !
match(VI, MatchCmp)) ||
14841 E->getOpcode(), OrigScalarTy, Builder.getInt1Ty(), CurrentPred,
14842 CostKind, getOperandInfo(
VI->getOperand(0)),
14843 getOperandInfo(
VI->getOperand(1)), VI);
14854 TTI->getCmpSelInstrCost(
E->getOpcode(), VecTy, MaskTy, VecPred,
14855 CostKind, getOperandInfo(
E->getOperand(0)),
14856 getOperandInfo(
E->getOperand(1)), VL0);
14860 unsigned CondNumElements = CondType->getNumElements();
14862 assert(VecTyNumElements >= CondNumElements &&
14863 VecTyNumElements % CondNumElements == 0 &&
14864 "Cannot vectorize Instruction::Select");
14865 if (CondNumElements != VecTyNumElements) {
14874 return VecCost + CommonCost;
14876 return GetCostDiff(GetScalarCost, GetVectorCost);
14878 case TreeEntry::MinMax: {
14879 auto GetScalarCost = [&](
unsigned Idx) {
14880 return GetMinMaxCost(OrigScalarTy);
14884 return VecCost + CommonCost;
14886 return GetCostDiff(GetScalarCost, GetVectorCost);
14888 case TreeEntry::FMulAdd: {
14889 auto GetScalarCost = [&](
unsigned Idx) {
14892 return GetFMulAddCost(
E->getOperations(),
14898 for (
Value *V :
E->Scalars) {
14900 FMF &= FPCI->getFastMathFlags();
14902 FMF &= FPCIOp->getFastMathFlags();
14905 IntrinsicCostAttributes ICA(Intrinsic::fmuladd, VecTy,
14906 {VecTy, VecTy, VecTy}, FMF);
14908 return VecCost + CommonCost;
14910 return GetCostDiff(GetScalarCost, GetVectorCost);
14912 case Instruction::FNeg:
14913 case Instruction::Add:
14914 case Instruction::FAdd:
14915 case Instruction::Sub:
14916 case Instruction::FSub:
14917 case Instruction::Mul:
14918 case Instruction::FMul:
14919 case Instruction::UDiv:
14920 case Instruction::SDiv:
14921 case Instruction::FDiv:
14922 case Instruction::URem:
14923 case Instruction::SRem:
14924 case Instruction::FRem:
14925 case Instruction::Shl:
14926 case Instruction::LShr:
14927 case Instruction::AShr:
14928 case Instruction::And:
14929 case Instruction::Or:
14930 case Instruction::Xor: {
14931 auto GetScalarCost = [&](
unsigned Idx) {
14938 Value *Op1 =
E->getOperand(0)[Idx];
14940 SmallVector<const Value *, 2> Operands(1, Op1);
14944 Op2 =
E->getOperand(1)[Idx];
14950 ShuffleOrOp, OrigScalarTy,
CostKind, Op1Info, Op2Info, Operands);
14952 I && (ShuffleOrOp == Instruction::FAdd ||
14953 ShuffleOrOp == Instruction::FSub)) {
14961 if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
14966 return CI && CI->getValue().countr_one() >= It->second.first;
14974 return TTI->getArithmeticInstrCost(ShuffleOrOp, VecTy,
CostKind, Op1Info,
14975 Op2Info, {},
nullptr, TLI) +
14978 return GetCostDiff(GetScalarCost, GetVectorCost);
14980 case Instruction::GetElementPtr: {
14981 return CommonCost + GetGEPCostDiff(VL, VL0);
14983 case Instruction::Load: {
14984 auto GetScalarCost = [&](
unsigned Idx) {
14986 return TTI->getMemoryOpCost(Instruction::Load, OrigScalarTy,
14987 VI->getAlign(),
VI->getPointerAddressSpace(),
14993 switch (
E->State) {
14994 case TreeEntry::Vectorize:
14995 if (
unsigned Factor =
E->getInterleaveFactor()) {
14996 VecLdCost = TTI->getInterleavedMemoryOpCost(
14997 Instruction::Load, VecTy, Factor, {}, LI0->getAlign(),
14998 LI0->getPointerAddressSpace(),
CostKind);
15001 VecLdCost = TTI->getMemoryOpCost(
15002 Instruction::Load, VecTy, LI0->getAlign(),
15006 case TreeEntry::StridedVectorize: {
15007 Align CommonAlignment =
15009 VecLdCost = TTI->getStridedMemoryOpCost(
15010 Instruction::Load, VecTy, LI0->getPointerOperand(),
15011 false, CommonAlignment,
CostKind);
15014 case TreeEntry::CompressVectorize: {
15016 unsigned InterleaveFactor;
15017 SmallVector<int> CompressMask;
15020 if (!
E->ReorderIndices.empty()) {
15021 SmallVector<int>
Mask(
E->ReorderIndices.begin(),
15022 E->ReorderIndices.end());
15029 Scalars, PointerOps,
E->ReorderIndices, *TTI, *DL, *SE, *AC, *DT,
15030 *TLI, [](
Value *) { return true; }, IsMasked, InterleaveFactor,
15031 CompressMask, LoadVecTy);
15032 assert(IsVectorized &&
"Failed to vectorize load");
15033 CompressEntryToData.try_emplace(
E, CompressMask, LoadVecTy,
15034 InterleaveFactor, IsMasked);
15035 Align CommonAlignment = LI0->getAlign();
15036 if (InterleaveFactor) {
15037 VecLdCost = TTI->getInterleavedMemoryOpCost(
15038 Instruction::Load, LoadVecTy, InterleaveFactor, {},
15039 CommonAlignment, LI0->getPointerAddressSpace(),
CostKind);
15040 }
else if (IsMasked) {
15041 VecLdCost = TTI->getMaskedMemoryOpCost(
15042 Instruction::Load, LoadVecTy, CommonAlignment,
15043 LI0->getPointerAddressSpace(),
CostKind);
15046 LoadVecTy, CompressMask,
CostKind);
15048 VecLdCost = TTI->getMemoryOpCost(
15049 Instruction::Load, LoadVecTy, CommonAlignment,
15053 LoadVecTy, CompressMask,
CostKind);
15057 case TreeEntry::ScatterVectorize: {
15058 Align CommonAlignment =
15060 VecLdCost = TTI->getGatherScatterOpCost(
15061 Instruction::Load, VecTy, LI0->getPointerOperand(),
15062 false, CommonAlignment,
CostKind);
15065 case TreeEntry::CombinedVectorize:
15066 case TreeEntry::SplitVectorize:
15067 case TreeEntry::NeedToGather:
15070 return VecLdCost + CommonCost;
15076 if (
E->State == TreeEntry::ScatterVectorize)
15083 return Cost + GetGEPCostDiff(PointerOps, LI0->getPointerOperand());
15085 case Instruction::Store: {
15086 bool IsReorder = !
E->ReorderIndices.empty();
15087 auto GetScalarCost = [=](
unsigned Idx) {
15090 return TTI->getMemoryOpCost(Instruction::Store, OrigScalarTy,
15091 VI->getAlign(),
VI->getPointerAddressSpace(),
15099 if (
E->State == TreeEntry::StridedVectorize) {
15100 Align CommonAlignment =
15102 VecStCost = TTI->getStridedMemoryOpCost(
15103 Instruction::Store, VecTy, BaseSI->getPointerOperand(),
15104 false, CommonAlignment,
CostKind);
15106 assert(
E->State == TreeEntry::Vectorize &&
15107 "Expected either strided or consecutive stores.");
15108 if (
unsigned Factor =
E->getInterleaveFactor()) {
15109 assert(
E->ReuseShuffleIndices.empty() && !
E->ReorderIndices.empty() &&
15110 "No reused shuffles expected");
15112 VecStCost = TTI->getInterleavedMemoryOpCost(
15113 Instruction::Store, VecTy, Factor, {}, BaseSI->getAlign(),
15114 BaseSI->getPointerAddressSpace(),
CostKind);
15117 VecStCost = TTI->getMemoryOpCost(
15118 Instruction::Store, VecTy, BaseSI->getAlign(),
15119 BaseSI->getPointerAddressSpace(),
CostKind, OpInfo);
15122 return VecStCost + CommonCost;
15126 unsigned Idx = IsReorder ?
E->ReorderIndices[
I] :
I;
15130 return GetCostDiff(GetScalarCost, GetVectorCost) +
15131 GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
15133 case Instruction::Call: {
15134 auto GetScalarCost = [&](
unsigned Idx) {
15138 IntrinsicCostAttributes CostAttrs(
ID, *CI, 1);
15139 return TTI->getIntrinsicInstrCost(CostAttrs,
CostKind);
15149 CI,
ID, VecTy->getNumElements(),
15150 It != MinBWs.end() ? It->second.first : 0, TTI);
15152 return std::min(VecCallCosts.first, VecCallCosts.second) + CommonCost;
15154 return GetCostDiff(GetScalarCost, GetVectorCost);
15156 case Instruction::ShuffleVector: {
15164 "Invalid Shuffle Vector Operand");
15167 auto TryFindNodeWithEqualOperands = [=]() {
15168 for (
const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
15171 if (
TE->hasState() &&
TE->isAltShuffle() &&
15172 ((
TE->getOpcode() ==
E->getOpcode() &&
15173 TE->getAltOpcode() ==
E->getAltOpcode()) ||
15174 (
TE->getOpcode() ==
E->getAltOpcode() &&
15175 TE->getAltOpcode() ==
E->getOpcode())) &&
15176 TE->hasEqualOperands(*
E))
15181 auto GetScalarCost = [&](
unsigned Idx) {
15186 assert(
E->getMatchingMainOpOrAltOp(VI) &&
15187 "Unexpected main/alternate opcode");
15189 return TTI->getInstructionCost(VI,
CostKind);
15197 if (TryFindNodeWithEqualOperands()) {
15199 dbgs() <<
"SLP: diamond match for alternate node found.\n";
15206 TTIRef.getArithmeticInstrCost(
E->getOpcode(), VecTy,
CostKind);
15208 TTIRef.getArithmeticInstrCost(
E->getAltOpcode(), VecTy,
CostKind);
15211 VecCost = TTIRef.getCmpSelInstrCost(
15212 E->getOpcode(), VecTy, MaskTy, CI0->getPredicate(),
CostKind,
15213 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
15215 VecCost += TTIRef.getCmpSelInstrCost(
15216 E->getOpcode(), VecTy, MaskTy,
15218 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
15221 Type *SrcSclTy =
E->getMainOp()->getOperand(0)->getType();
15224 auto SrcIt = MinBWs.find(getOperandEntry(
E, 0));
15225 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
15227 DL->getTypeSizeInBits(
E->getMainOp()->getOperand(0)->getType());
15228 if (SrcIt != MinBWs.end()) {
15229 SrcBWSz = SrcIt->second.first;
15233 if (BWSz <= SrcBWSz) {
15234 if (BWSz < SrcBWSz)
15236 TTIRef.getCastInstrCost(Instruction::Trunc, VecTy, SrcTy,
15240 <<
"SLP: alternate extension, which should be truncated.\n";
15246 VecCost = TTIRef.getCastInstrCost(
E->getOpcode(), VecTy, SrcTy,
15249 TTIRef.getCastInstrCost(
E->getAltOpcode(), VecTy, SrcTy,
15252 SmallVector<int>
Mask;
15253 E->buildAltOpShuffleMask(
15254 [&](Instruction *
I) {
15255 assert(
E->getMatchingMainOpOrAltOp(
I) &&
15256 "Unexpected main/alternate opcode");
15267 unsigned Opcode0 =
E->getOpcode();
15268 unsigned Opcode1 =
E->getAltOpcode();
15269 SmallBitVector OpcodeMask(
15273 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
15275 VecTy, Opcode0, Opcode1, OpcodeMask,
CostKind);
15276 return AltVecCost < VecCost ? AltVecCost : VecCost;
15282 return GetCostDiff(
15287 "Not supported shufflevector usage.");
15289 unsigned SVNumElements =
15291 ->getNumElements();
15292 unsigned GroupSize = SVNumElements / SV->getShuffleMask().size();
15293 for (
size_t I = 0, End = VL.
size();
I != End;
I += GroupSize) {
15298 "Not supported shufflevector usage.");
15301 [[maybe_unused]]
bool IsExtractSubvectorMask =
15302 SV->isExtractSubvectorMask(Index);
15303 assert(IsExtractSubvectorMask &&
15304 "Not supported shufflevector usage.");
15305 if (NextIndex != Index)
15307 NextIndex += SV->getShuffleMask().size();
15310 return ::getShuffleCost(
15316 return GetCostDiff(GetScalarCost, GetVectorCost);
15318 case Instruction::Freeze:
15325bool BoUpSLP::isFullyVectorizableTinyTree(
bool ForReduction)
const {
15327 << VectorizableTree.size() <<
" is fully vectorizable .\n");
15329 auto &&AreVectorizableGathers = [
this](
const TreeEntry *
TE,
unsigned Limit) {
15330 SmallVector<int>
Mask;
15331 return TE->isGather() &&
15333 [
this](
Value *V) { return EphValues.contains(V); }) &&
15335 TE->Scalars.size() < Limit ||
15336 (((
TE->hasState() &&
15337 TE->getOpcode() == Instruction::ExtractElement) ||
15340 (
TE->hasState() &&
TE->getOpcode() == Instruction::Load &&
15341 !
TE->isAltShuffle()) ||
15346 if (VectorizableTree.size() == 1 &&
15347 (VectorizableTree[0]->State == TreeEntry::Vectorize ||
15348 VectorizableTree[0]->State == TreeEntry::StridedVectorize ||
15349 VectorizableTree[0]->State == TreeEntry::CompressVectorize ||
15351 AreVectorizableGathers(VectorizableTree[0].
get(),
15352 VectorizableTree[0]->Scalars.size()) &&
15353 VectorizableTree[0]->getVectorFactor() > 2)))
15356 if (VectorizableTree.size() != 2)
15363 if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
15364 AreVectorizableGathers(VectorizableTree[1].
get(),
15365 VectorizableTree[0]->Scalars.size()))
15369 if (VectorizableTree[0]->
isGather() ||
15370 (VectorizableTree[1]->
isGather() &&
15371 VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
15372 VectorizableTree[0]->State != TreeEntry::StridedVectorize &&
15373 VectorizableTree[0]->State != TreeEntry::CompressVectorize))
15381 bool MustMatchOrInst) {
15385 Value *ZextLoad = Root;
15386 const APInt *ShAmtC;
15387 bool FoundOr =
false;
15391 ShAmtC->
urem(8) == 0))) {
15393 ZextLoad = BinOp->getOperand(0);
15394 if (BinOp->getOpcode() == Instruction::Or)
15399 if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root ||
15406 Type *SrcTy = Load->getType();
15407 unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts;
15413 LLVM_DEBUG(
dbgs() <<
"SLP: Assume load combining for tree starting at "
15423 unsigned NumElts = VectorizableTree[0]->Scalars.size();
15424 Value *FirstReduced = VectorizableTree[0]->Scalars[0];
15432 unsigned NumElts = Stores.
size();
15433 for (
Value *Scalar : Stores) {
15447 if (VectorizableTree.empty()) {
15448 assert(ExternalUses.empty() &&
"We shouldn't have any external users");
15454 if (VectorizableTree.size() == 2 &&
15456 VectorizableTree[1]->isGather() &&
15457 (VectorizableTree[1]->getVectorFactor() <= 2 ||
15458 !(
isSplat(VectorizableTree[1]->Scalars) ||
15466 constexpr int Limit = 4;
15468 !VectorizableTree.empty() &&
15469 all_of(VectorizableTree, [&](
const std::unique_ptr<TreeEntry> &TE) {
15470 return (TE->isGather() &&
15471 (!TE->hasState() ||
15472 TE->getOpcode() != Instruction::ExtractElement) &&
15474 (TE->hasState() && TE->getOpcode() == Instruction::PHI);
15481 VectorizableTree.size() <= Limit &&
15482 all_of(VectorizableTree,
15483 [&](
const std::unique_ptr<TreeEntry> &TE) {
15484 return (TE->isGather() &&
15485 (!TE->hasState() ||
15486 TE->getOpcode() != Instruction::ExtractElement) &&
15490 (TE->getOpcode() == Instruction::InsertElement ||
15491 (TE->getOpcode() == Instruction::PHI &&
15493 return isa<PoisonValue>(V) || MustGather.contains(V);
15496 any_of(VectorizableTree, [&](
const std::unique_ptr<TreeEntry> &TE) {
15497 return TE->State == TreeEntry::Vectorize &&
15498 TE->getOpcode() == Instruction::PHI;
15505 unsigned NumGathers = 0;
15506 constexpr int LimitTreeSize = 36;
15508 all_of(VectorizableTree,
15509 [&](
const std::unique_ptr<TreeEntry> &TE) {
15510 if (!TE->isGather() && TE->hasState() &&
15511 (TE->getOpcode() == Instruction::Load ||
15512 TE->getOpcode() == Instruction::Store)) {
15516 if (TE->isGather())
15518 return TE->State == TreeEntry::SplitVectorize ||
15519 (TE->Idx == 0 && TE->Scalars.size() == 2 &&
15520 TE->hasState() && TE->getOpcode() == Instruction::ICmp &&
15521 VectorizableTree.size() > LimitTreeSize) ||
15525 (TE->getOpcode() == Instruction::PHI ||
15526 (TE->hasCopyableElements() &&
15529 TE->Scalars.size() / 2) ||
15530 ((!TE->ReuseShuffleIndices.empty() ||
15531 !TE->ReorderIndices.empty() || TE->isAltShuffle()) &&
15532 TE->Scalars.size() == 2)));
15534 (StoreLoadNodes.
empty() ||
15535 (VectorizableTree.size() > LimitTreeSize * StoreLoadNodes.
size() &&
15536 (NumGathers > 0 ||
none_of(StoreLoadNodes, [&](
const TreeEntry *TE) {
15537 return TE->getOpcode() == Instruction::Store ||
15539 return !isa<LoadInst>(V) ||
15540 areAllUsersVectorized(cast<Instruction>(V));
15548 VectorizableTree.front()->State == TreeEntry::SplitVectorize &&
15549 VectorizableTree.size() >= Limit &&
15551 [&](
const std::unique_ptr<TreeEntry> &TE) {
15552 return !TE->isGather() && TE->UserTreeIndex.UserTE &&
15553 TE->UserTreeIndex.UserTE->Idx == 0;
15560 VectorizableTree.size() > 2 &&
15561 VectorizableTree.front()->State == TreeEntry::Vectorize &&
15562 VectorizableTree.front()->getOpcode() == Instruction::InsertElement &&
15563 VectorizableTree[1]->State == TreeEntry::Vectorize &&
15564 VectorizableTree[1]->getOpcode() == Instruction::PHI &&
15566 ArrayRef(VectorizableTree).drop_front(2),
15567 [&](
const std::unique_ptr<TreeEntry> &TE) {
return TE->isGather(); }))
15577 if (isFullyVectorizableTinyTree(ForReduction))
15582 bool IsAllowedSingleBVNode =
15583 VectorizableTree.
size() > 1 ||
15584 (VectorizableTree.size() == 1 && VectorizableTree.front()->hasState() &&
15585 !VectorizableTree.front()->isAltShuffle() &&
15586 VectorizableTree.front()->getOpcode() != Instruction::PHI &&
15587 VectorizableTree.front()->getOpcode() != Instruction::GetElementPtr &&
15589 if (
any_of(VectorizableTree, [&](
const std::unique_ptr<TreeEntry> &TE) {
15590 return TE->isGather() &&
all_of(TE->Scalars, [&](
Value *V) {
15591 return isa<ExtractElementInst, Constant>(V) ||
15592 (IsAllowedSingleBVNode &&
15593 !V->hasNUsesOrMore(UsesLimit) &&
15594 any_of(V->users(), IsaPred<InsertElementInst>));
15599 if (VectorizableTree.back()->isGather() &&
15600 VectorizableTree.back()->hasState() &&
15601 VectorizableTree.back()->isAltShuffle() &&
15602 VectorizableTree.back()->getVectorFactor() > 2 &&
15604 !VectorizableTree.back()->Scalars.front()->getType()->isVectorTy() &&
15605 TTI->getScalarizationOverhead(
15606 getWidenedType(VectorizableTree.back()->Scalars.front()->getType(),
15607 VectorizableTree.back()->getVectorFactor()),
15620 constexpr unsigned SmallTree = 3;
15621 if (VectorizableTree.front()->isNonPowOf2Vec() &&
15624 [](
const std::unique_ptr<TreeEntry> &TE) {
15625 return TE->isGather() && TE->hasState() &&
15626 TE->getOpcode() == Instruction::Load &&
15634 TreeEntry &E = *VectorizableTree[Idx];
15635 if (E.State == TreeEntry::SplitVectorize)
15639 if ((E.hasState() && E.getOpcode() != Instruction::Load) ||
15658 const TreeEntry *Root = VectorizableTree.front().get();
15659 if (Root->isGather())
15667 for (
const auto &TEPtr : VectorizableTree) {
15668 if (!TEPtr->isGather()) {
15669 Instruction *LastInst = &getLastInstructionInBundle(TEPtr.get());
15670 EntriesToLastInstruction.
try_emplace(TEPtr.get(), LastInst);
15671 LastInstructions.
insert(LastInst);
15673 if (TEPtr->UserTreeIndex)
15674 EntriesToOperands[TEPtr->UserTreeIndex.UserTE].push_back(TEPtr.get());
15681 if (
II->isAssumeLikeIntrinsic())
15688 return IntrCost < CallCost;
15695 CheckedInstructions;
15696 unsigned Budget = 0;
15697 const unsigned BudgetLimit =
15702 "Expected instructions in same block.");
15703 if (
auto It = CheckedInstructions.
find(
Last);
15704 It != CheckedInstructions.
end()) {
15705 const Instruction *Checked = It->second.getPointer();
15707 return It->second.getInt() != 0;
15713 ++
First->getIterator().getReverse(),
15715 Last->getIterator().getReverse();
15717 while (InstIt != PrevInstIt && Budget <= BudgetLimit) {
15723 for (
const Instruction *LastInst : LastInstsInRange)
15724 CheckedInstructions.
try_emplace(LastInst, &*PrevInstIt, 0);
15727 if (LastInstructions.
contains(&*PrevInstIt))
15728 LastInstsInRange.
push_back(&*PrevInstIt);
15733 for (
const Instruction *LastInst : LastInstsInRange)
15735 LastInst, PrevInstIt == InstIt ?
First : &*PrevInstIt,
15736 Budget <= BudgetLimit ? 1 : 0);
15737 return Budget <= BudgetLimit;
15739 auto AddCosts = [&](
const TreeEntry *
Op) {
15740 Type *ScalarTy =
Op->Scalars.front()->getType();
15741 auto It = MinBWs.find(
Op);
15742 if (It != MinBWs.end())
15745 Cost += TTI->getCostOfKeepingLiveOverCall(VecTy);
15748 Cost -=
Op->Scalars.size() * TTI->getCostOfKeepingLiveOverCall(ScalarTy);
15755 ParentOpParentToPreds;
15758 auto Key = std::make_pair(Root, OpParent);
15759 if (
auto It = ParentOpParentToPreds.
find(
Key);
15760 It != ParentOpParentToPreds.
end())
15772 for (
const auto &KeyPair : ParentsPairsToAdd) {
15774 "Should not have been added before.");
15778 while (!Worklist.
empty()) {
15780 if (BB == OpParent || !Visited.
insert(BB).second)
15782 auto Pair = std::make_pair(BB, OpParent);
15783 if (
auto It = ParentOpParentToPreds.
find(Pair);
15784 It != ParentOpParentToPreds.
end()) {
15788 ParentsPairsToAdd.
insert(Pair);
15793 if (Budget > BudgetLimit)
15805 while (!LiveEntries.
empty()) {
15808 if (Operands.
empty())
15810 Instruction *LastInst = EntriesToLastInstruction.
at(Entry);
15812 for (
const TreeEntry *
Op : Operands) {
15813 if (!
Op->isGather())
15815 if (Entry->State == TreeEntry::SplitVectorize ||
15816 (Entry->getOpcode() != Instruction::PHI &&
Op->isGather()) ||
15822 Pred = Phi->getIncomingBlock(
Op->UserTreeIndex.EdgeIdx);
15825 if (
Op->isGather()) {
15826 assert(Entry->getOpcode() == Instruction::PHI &&
15827 "Expected phi node only.");
15829 ->getIncomingBlock(
Op->UserTreeIndex.EdgeIdx);
15831 for (
Value *V :
Op->Scalars) {
15842 OpLastInst = EntriesToLastInstruction.
at(
Op);
15846 if (OpParent == Parent) {
15847 if (Entry->getOpcode() == Instruction::PHI) {
15848 if (!CheckForNonVecCallsInSameBlock(LastInst, OpLastInst))
15852 if (!CheckForNonVecCallsInSameBlock(OpLastInst, LastInst))
15858 if (Entry->getOpcode() != Instruction::PHI &&
15859 !CheckForNonVecCallsInSameBlock(
15860 &*LastInst->
getParent()->getFirstNonPHIOrDbgOrAlloca(),
15866 if (!CheckForNonVecCallsInSameBlock(OpLastInst,
15872 if (!CheckPredecessors(Parent, Pred, OpParent)) {
15888 const auto *I1 = IE1;
15889 const auto *I2 = IE2;
15901 if (I1 && (I1 == IE1 || I1->hasOneUse()) &&
15904 if (I2 && ((I2 == IE2 || I2->
hasOneUse())) &&
15907 }
while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2));
15914struct ValueSelect {
15915 template <
typename U>
15916 static std::enable_if_t<std::is_same_v<Value *, U>,
Value *>
get(
Value *V) {
15919 template <
typename U>
15920 static std::enable_if_t<!std::is_same_v<Value *, U>,
U>
get(
Value *) {
15938template <
typename T>
15944 assert(!ShuffleMask.empty() &&
"Empty list of shuffles for inserts.");
15946 auto VMIt = std::next(ShuffleMask.begin());
15949 buildUseMask(Mask.size(), Mask, UseMask::UndefsAsMask);
15951 if (!IsBaseUndef.
all()) {
15953 std::pair<T *, bool> Res =
15954 ResizeAction(ShuffleMask.begin()->first, Mask,
false);
15956 for (
unsigned Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) {
15960 Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF;
15962 [[maybe_unused]]
auto *V = ValueSelect::get<T *>(
Base);
15963 assert((!V || GetVF(V) == Mask.size()) &&
15964 "Expected base vector of VF number of elements.");
15965 Prev = Action(Mask, {
nullptr, Res.first});
15966 }
else if (ShuffleMask.size() == 1) {
15969 std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask,
15975 Prev = Action(Mask, {ShuffleMask.begin()->first});
15979 unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);
15980 unsigned Vec2VF = GetVF(VMIt->first);
15981 if (Vec1VF == Vec2VF) {
15985 for (
unsigned I = 0, VF = Mask.size();
I < VF; ++
I) {
15988 Mask[
I] = SecMask[
I] + Vec1VF;
15991 Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
15994 std::pair<T *, bool> Res1 = ResizeAction(ShuffleMask.begin()->first, Mask,
15996 std::pair<T *, bool> Res2 =
15997 ResizeAction(VMIt->first, VMIt->second,
false);
15999 for (
unsigned I = 0, VF = Mask.size();
I < VF; ++
I) {
16006 Mask[
I] = (Res2.second ?
I : SecMask[
I]) + VF;
16009 Prev = Action(Mask, {Res1.first, Res2.first});
16011 VMIt = std::next(VMIt);
16013 [[maybe_unused]]
bool IsBaseNotUndef = !IsBaseUndef.
all();
16015 for (
auto E = ShuffleMask.end(); VMIt !=
E; ++VMIt) {
16017 std::pair<T *, bool> Res =
16018 ResizeAction(VMIt->first, VMIt->second,
false);
16020 for (
unsigned I = 0, VF = Mask.size();
I < VF; ++
I) {
16023 "Multiple uses of scalars.");
16024 Mask[
I] = (Res.second ?
I : SecMask[
I]) + VF;
16029 Prev = Action(Mask, {Prev, Res.first});
16037template <
typename T>
struct ShuffledInsertData {
16041 MapVector<T, SmallVector<int>> ValueMasks;
16049 << VectorizableTree.size() <<
".\n");
16052 for (
unsigned I = 0, E = VectorizableTree.size();
I < E; ++
I) {
16053 TreeEntry &TE = *VectorizableTree[
I];
16056 if (TE.State == TreeEntry::CombinedVectorize) {
16058 dbgs() <<
"SLP: Skipping cost for combined node that starts with "
16059 << *TE.Scalars[0] <<
".\n";
16060 TE.dump();
dbgs() <<
"SLP: Current total cost = " <<
Cost <<
"\n");
16063 if (TE.hasState() &&
16064 (TE.isGather() || TE.State == TreeEntry::SplitVectorize)) {
16065 if (
const TreeEntry *E =
16066 getSameValuesTreeEntry(TE.getMainOp(), TE.Scalars);
16067 E && E->getVectorFactor() == TE.getVectorFactor()) {
16072 <<
"SLP: Current total cost = " <<
Cost <<
"\n");
16079 assert((!TE.isGather() || TE.Idx == 0 || TE.UserTreeIndex) &&
16080 "Expected gather nodes with users only.");
16086 <<
"SLP: Current total cost = " <<
Cost <<
"\n");
16090 none_of(ExternalUses, [](
const ExternalUser &EU) {
16101 std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
16108 for (ExternalUser &EU : ExternalUses) {
16109 ScalarUserAndIdx.
emplace_back(EU.Scalar, EU.User, EU.Lane);
16112 for (ExternalUser &EU : ExternalUses) {
16113 LLVM_DEBUG(
dbgs() <<
"SLP: Computing cost for external use of TreeEntry "
16114 << EU.E.Idx <<
" in lane " << EU.Lane <<
"\n");
16116 else dbgs() <<
" User: nullptr\n");
16117 LLVM_DEBUG(
dbgs() <<
" Use: " << EU.Scalar->getNameOrAsOperand() <<
"\n");
16122 if (EphValues.count(EU.User))
16126 if (!CheckedScalarUser.
insert(std::make_pair(EU.Scalar, EU.User)).second ||
16128 CheckedScalarUser.
contains(std::make_pair(EU.Scalar,
nullptr))))
16136 (!DT->isReachableFromEntry(UserParent) || UserParent->isEHPad() ||
16142 !ExtractCostCalculated.
insert(EU.Scalar).second)
16155 if (!UsedInserts.
insert(VU).second)
16159 const TreeEntry *ScalarTE = &EU.E;
16162 [
this, VU](
const ShuffledInsertData<const TreeEntry *> &
Data) {
16167 Value *Op0 =
II->getOperand(0);
16174 if (It == ShuffledInserts.
end()) {
16176 Data.InsertElements.emplace_back(VU);
16178 VecId = ShuffledInserts.
size() - 1;
16179 auto It = MinBWs.find(ScalarTE);
16180 if (It != MinBWs.end() &&
16182 .
insert(std::make_pair(ScalarTE, FTy->getElementType()))
16184 unsigned BWSz = It->second.first;
16185 unsigned DstBWSz = DL->getTypeSizeInBits(FTy->getElementType());
16186 unsigned VecOpcode;
16187 if (DstBWSz < BWSz)
16188 VecOpcode = Instruction::Trunc;
16191 It->second.second ? Instruction::SExt : Instruction::ZExt;
16196 FTy->getNumElements()),
16199 <<
" for extending externally used vector with "
16200 "non-equal minimum bitwidth.\n");
16205 It->InsertElements.front() = VU;
16206 VecId = std::distance(ShuffledInserts.
begin(), It);
16208 int InIdx = *InsertIdx;
16210 ShuffledInserts[VecId].ValueMasks[ScalarTE];
16213 Mask[InIdx] = EU.Lane;
16214 DemandedElts[VecId].setBit(InIdx);
16225 auto *ScalarTy = EU.Scalar->getType();
16226 const unsigned BundleWidth = EU.E.getVectorFactor();
16227 assert(EU.Lane < BundleWidth &&
"Extracted lane out of bounds.");
16229 const TreeEntry *Entry = &EU.E;
16230 auto It = MinBWs.find(Entry);
16231 if (It != MinBWs.end()) {
16236 ? Instruction::ZExt
16237 : Instruction::SExt;
16242 << ExtraCost <<
"\n");
16246 CostKind, EU.Lane, EU.Scalar, ScalarUserAndIdx);
16247 LLVM_DEBUG(
dbgs() <<
" ExtractElement cost for " << *ScalarTy <<
" from "
16248 << *VecTy <<
": " << ExtraCost <<
"\n");
16251 if (Entry->Idx != 0 || Entry->getOpcode() == Instruction::GetElementPtr ||
16252 Entry->getOpcode() == Instruction::Load) {
16254 auto IsPhiInLoop = [&](
const ExternalUser &U) {
16257 const Loop *L = LI->getLoopFor(Phi->getParent());
16258 return L && (Phi->getParent() ==
I->getParent() ||
16259 L == LI->getLoopFor(
I->getParent()));
16263 if (!ValueToExtUses) {
16264 ValueToExtUses.emplace();
16265 for (
const auto &
P :
enumerate(ExternalUses)) {
16267 if (IsPhiInLoop(
P.value()))
16270 ValueToExtUses->try_emplace(
P.value().Scalar,
P.index());
16277 auto OperandIsScalar = [&](
Value *V) {
16283 return !EE->hasOneUse() || !MustGather.contains(EE);
16286 return ValueToExtUses->contains(V);
16288 bool CanBeUsedAsScalar =
all_of(Inst->operands(), OperandIsScalar);
16289 bool CanBeUsedAsScalarCast =
false;
16292 Op &&
all_of(
Op->operands(), OperandIsScalar)) {
16297 if (ScalarCost + OpCost <= ExtraCost) {
16298 CanBeUsedAsScalar = CanBeUsedAsScalarCast =
true;
16299 ScalarCost += OpCost;
16303 if (CanBeUsedAsScalar) {
16304 bool KeepScalar = ScalarCost <= ExtraCost;
16308 bool IsProfitablePHIUser =
16310 VectorizableTree.front()->Scalars.size() > 2)) &&
16311 VectorizableTree.front()->hasState() &&
16312 VectorizableTree.front()->getOpcode() == Instruction::PHI &&
16316 auto *PHIUser = dyn_cast<PHINode>(U);
16317 return (!PHIUser ||
16318 PHIUser->getParent() !=
16320 VectorizableTree.front()->getMainOp())
16325 return ValueToExtUses->contains(V);
16327 if (IsProfitablePHIUser) {
16331 (!GatheredLoadsEntriesFirst.has_value() ||
16332 Entry->Idx < *GatheredLoadsEntriesFirst)) {
16333 unsigned ScalarUsesCount =
count_if(Entry->Scalars, [&](
Value *V) {
16334 return ValueToExtUses->contains(V);
16336 auto It = ExtractsCount.
find(Entry);
16337 if (It != ExtractsCount.
end()) {
16338 assert(ScalarUsesCount >= It->getSecond().size() &&
16339 "Expected total number of external uses not less than "
16340 "number of scalar uses.");
16341 ScalarUsesCount -= It->getSecond().size();
16346 KeepScalar = ScalarUsesCount <= 1 || !
has_single_bit(ScalarUsesCount);
16349 ExternalUsesAsOriginalScalar.insert(EU.Scalar);
16350 for (
Value *V : Inst->operands()) {
16351 auto It = ValueToExtUses->find(V);
16352 if (It != ValueToExtUses->end()) {
16354 ExternalUses[It->second].User =
nullptr;
16357 ExtraCost = ScalarCost;
16358 if (!IsPhiInLoop(EU))
16359 ExtractsCount[Entry].
insert(Inst);
16360 if (CanBeUsedAsScalarCast) {
16361 ScalarOpsFromCasts.
insert(Inst->getOperand(0));
16365 for (
Value *V : IOp->operands()) {
16366 auto It = ValueToExtUses->find(V);
16367 if (It != ValueToExtUses->end()) {
16369 ExternalUses[It->second].User =
nullptr;
16378 ExtractCost += ExtraCost;
16382 for (
Value *V : ScalarOpsFromCasts) {
16383 ExternalUsesAsOriginalScalar.insert(V);
16385 ExternalUses.emplace_back(V,
nullptr, *TEs.front(),
16386 TEs.front()->findLaneForValue(V));
16390 if (!VectorizedVals.
empty()) {
16391 const TreeEntry &Root = *VectorizableTree.front();
16392 auto BWIt = MinBWs.find(&Root);
16393 if (BWIt != MinBWs.end()) {
16394 Type *DstTy = Root.Scalars.front()->getType();
16395 unsigned OriginalSz = DL->getTypeSizeInBits(DstTy->
getScalarType());
16397 ReductionBitWidth == 0 ? BWIt->second.first : ReductionBitWidth;
16398 if (OriginalSz != SrcSz) {
16399 unsigned Opcode = Instruction::Trunc;
16400 if (OriginalSz > SrcSz)
16401 Opcode = BWIt->second.second ? Instruction::SExt : Instruction::ZExt;
16407 Cost += TTI->getCastInstrCost(Opcode, DstTy, SrcTy,
16414 Cost += ExtractCost;
16416 bool ForSingleMask) {
16418 unsigned VF = Mask.size();
16419 unsigned VecVF = TE->getVectorFactor();
16420 bool HasLargeIndex =
16421 any_of(Mask, [VF](
int Idx) {
return Idx >=
static_cast<int>(VF); });
16422 if ((VF != VecVF && HasLargeIndex) ||
16425 if (HasLargeIndex) {
16427 std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)),
16433 dbgs() <<
"SLP: Adding cost " <<
C
16434 <<
" for final shuffle of insertelement external users.\n";
16435 TE->dump();
dbgs() <<
"SLP: Current total cost = " <<
Cost <<
"\n");
16437 return std::make_pair(TE,
true);
16440 if (!ForSingleMask) {
16442 for (
unsigned I = 0;
I < VF; ++
I) {
16444 ResizeMask[Mask[
I]] = Mask[
I];
16451 dbgs() <<
"SLP: Adding cost " <<
C
16452 <<
" for final shuffle of insertelement external users.\n";
16453 TE->dump();
dbgs() <<
"SLP: Current total cost = " <<
Cost <<
"\n");
16458 return std::make_pair(TE,
false);
16461 for (
int I = 0, E = ShuffledInserts.
size();
I < E; ++
I) {
16462 Value *
Base = ShuffledInserts[
I].InsertElements.
front()->getOperand(0);
16463 auto Vector = ShuffledInserts[
I].ValueMasks.takeVector();
16467 assert((TEs.size() == 1 || TEs.size() == 2) &&
16468 "Expected exactly 1 or 2 tree entries.");
16469 if (TEs.size() == 1) {
16471 VF = TEs.front()->getVectorFactor();
16472 auto *FTy =
getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
16476 (
Data.index() < VF &&
16477 static_cast<int>(
Data.index()) ==
Data.value());
16482 <<
" for final shuffle of insertelement "
16483 "external users.\n";
16484 TEs.front()->
dump();
16485 dbgs() <<
"SLP: Current total cost = " <<
Cost <<
"\n");
16491 TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor())
16492 VF = TEs.front()->getVectorFactor();
16496 auto *FTy =
getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
16500 <<
" for final shuffle of vector node and external "
16501 "insertelement users.\n";
16502 if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump();
16503 dbgs() <<
"SLP: Current total cost = " <<
Cost <<
"\n");
16511 [](
const TreeEntry *E) { return E->getVectorFactor(); }, ResizeToVF,
16512 EstimateShufflesCost);
16515 ShuffledInserts[
I].InsertElements.
front()->getType()),
16518 Cost -= InsertCost;
16522 if (ReductionBitWidth != 0) {
16523 assert(UserIgnoreList &&
"Expected reduction tree.");
16524 const TreeEntry &E = *VectorizableTree.front();
16525 auto It = MinBWs.find(&E);
16526 if (It != MinBWs.end() && It->second.first != ReductionBitWidth) {
16527 unsigned SrcSize = It->second.first;
16528 unsigned DstSize = ReductionBitWidth;
16529 unsigned Opcode = Instruction::Trunc;
16530 if (SrcSize < DstSize) {
16531 bool IsArithmeticExtendedReduction =
16534 return is_contained({Instruction::Add, Instruction::FAdd,
16535 Instruction::Mul, Instruction::FMul,
16536 Instruction::And, Instruction::Or,
16540 if (IsArithmeticExtendedReduction)
16542 Instruction::BitCast;
16544 Opcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
16546 if (Opcode != Instruction::BitCast) {
16548 getWidenedType(Builder.getIntNTy(SrcSize), E.getVectorFactor());
16550 getWidenedType(Builder.getIntNTy(DstSize), E.getVectorFactor());
16553 switch (E.getOpcode()) {
16554 case Instruction::SExt:
16555 case Instruction::ZExt:
16556 case Instruction::Trunc: {
16557 const TreeEntry *OpTE = getOperandEntry(&E, 0);
16558 CCH = getCastContextHint(*OpTE);
16564 CastCost += TTI->getCastInstrCost(Opcode, DstVecTy, SrcVecTy, CCH,
16568 <<
" for final resize for reduction from " << SrcVecTy
16569 <<
" to " << DstVecTy <<
"\n";
16570 dbgs() <<
"SLP: Current total cost = " <<
Cost <<
"\n");
16575 std::optional<InstructionCost> SpillCost;
16578 Cost += *SpillCost;
16584 OS <<
"SLP: Spill Cost = ";
16589 OS <<
".\nSLP: Extract Cost = " << ExtractCost <<
".\n"
16590 <<
"SLP: Total Cost = " <<
Cost <<
".\n";
16594 ViewGraph(
this,
"SLP" + F->getName(),
false, Str);
16605std::optional<TTI::ShuffleKind>
16606BoUpSLP::tryToGatherSingleRegisterExtractElements(
16612 for (
int I = 0, E = VL.
size();
I < E; ++
I) {
16628 if (Idx >= VecTy->getNumElements()) {
16632 SmallBitVector ExtractMask(VecTy->getNumElements(),
true);
16633 ExtractMask.reset(*Idx);
16638 VectorOpToIdx[EI->getVectorOperand()].push_back(
I);
16643 stable_sort(Vectors, [](
const auto &P1,
const auto &P2) {
16644 return P1.second.size() > P2.second.size();
16647 const int UndefSz = UndefVectorExtracts.
size();
16648 unsigned SingleMax = 0;
16649 unsigned PairMax = 0;
16650 if (!Vectors.
empty()) {
16651 SingleMax = Vectors.
front().second.size() + UndefSz;
16652 if (Vectors.
size() > 1) {
16653 auto *ItNext = std::next(Vectors.
begin());
16654 PairMax = SingleMax + ItNext->second.size();
16657 if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
16658 return std::nullopt;
16664 if (SingleMax >= PairMax && SingleMax) {
16665 for (
int Idx : Vectors.
front().second)
16666 std::swap(GatheredExtracts[Idx], VL[Idx]);
16667 }
else if (!Vectors.
empty()) {
16668 for (
unsigned Idx : {0, 1})
16669 for (
int Idx : Vectors[Idx].second)
16670 std::swap(GatheredExtracts[Idx], VL[Idx]);
16673 for (
int Idx : UndefVectorExtracts)
16674 std::swap(GatheredExtracts[Idx], VL[Idx]);
16677 std::optional<TTI::ShuffleKind> Res =
16683 return std::nullopt;
16687 for (
int I = 0,
E = GatheredExtracts.size();
I <
E; ++
I) {
16708BoUpSLP::tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
16709 SmallVectorImpl<int> &Mask,
16710 unsigned NumParts)
const {
16711 assert(NumParts > 0 &&
"NumParts expected be greater than or equal to 1.");
16720 SmallVector<int> SubMask;
16721 std::optional<TTI::ShuffleKind> Res =
16722 tryToGatherSingleRegisterExtractElements(SubVL, SubMask);
16723 ShufflesRes[Part] = Res;
16724 copy(SubMask, std::next(
Mask.begin(), Part * SliceSize));
16726 if (
none_of(ShufflesRes, [](
const std::optional<TTI::ShuffleKind> &Res) {
16727 return Res.has_value();
16729 ShufflesRes.clear();
16730 return ShufflesRes;
16733std::optional<TargetTransformInfo::ShuffleKind>
16734BoUpSLP::isGatherShuffledSingleRegisterEntry(
16736 SmallVectorImpl<const TreeEntry *> &Entries,
unsigned Part,
bool ForOrder) {
16740 auto GetUserEntry = [&](
const TreeEntry *
TE) {
16741 while (
TE->UserTreeIndex &&
TE->UserTreeIndex.EdgeIdx == UINT_MAX)
16742 TE =
TE->UserTreeIndex.UserTE;
16743 if (TE == VectorizableTree.front().get())
16744 return EdgeInfo(
const_cast<TreeEntry *
>(TE), 0);
16745 return TE->UserTreeIndex;
16747 auto HasGatherUser = [&](
const TreeEntry *
TE) {
16748 while (
TE->Idx != 0 &&
TE->UserTreeIndex) {
16749 if (
TE->UserTreeIndex.EdgeIdx == UINT_MAX)
16751 TE =
TE->UserTreeIndex.UserTE;
16755 const EdgeInfo TEUseEI = GetUserEntry(TE);
16757 return std::nullopt;
16758 const Instruction *TEInsertPt = &getLastInstructionInBundle(TEUseEI.UserTE);
16763 TEUseEI.UserTE->hasState() ? TEUseEI.UserTE->getMainOp() :
nullptr);
16764 PHI && TEUseEI.UserTE->State != TreeEntry::SplitVectorize) {
16765 TEInsertBlock =
PHI->getIncomingBlock(TEUseEI.EdgeIdx);
16768 TEInsertBlock = TEInsertPt->
getParent();
16770 if (!DT->isReachableFromEntry(TEInsertBlock))
16771 return std::nullopt;
16772 auto *NodeUI = DT->getNode(TEInsertBlock);
16773 assert(NodeUI &&
"Should only process reachable instructions");
16775 auto CheckOrdering = [&](
const Instruction *InsertPt) {
16788 const BasicBlock *InsertBlock = InsertPt->getParent();
16789 auto *NodeEUI = DT->getNode(InsertBlock);
16792 assert((NodeUI == NodeEUI) ==
16793 (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) &&
16794 "Different nodes should have different DFS numbers");
16796 if (TEInsertPt->
getParent() != InsertBlock &&
16797 (DT->dominates(NodeUI, NodeEUI) || !DT->dominates(NodeEUI, NodeUI)))
16799 if (TEInsertPt->
getParent() == InsertBlock &&
16812 SmallDenseMap<Value *, int> UsedValuesEntry;
16813 SmallPtrSet<const Value *, 16> VisitedValue;
16814 auto CheckAndUseSameNode = [&](
const TreeEntry *TEPtr) {
16816 if ((TEPtr->getVectorFactor() != VL.
size() &&
16817 TEPtr->Scalars.size() != VL.
size()) ||
16818 (!TEPtr->isSame(VL) && !TEPtr->isSame(
TE->Scalars)))
16822 for (
Value *V : VL) {
16829 auto CheckParentNodes = [&](
const TreeEntry *User1,
const TreeEntry *User2,
16830 unsigned EdgeIdx) {
16831 const TreeEntry *Ptr1 = User1;
16832 const TreeEntry *Ptr2 = User2;
16833 SmallDenseMap<const TreeEntry *, unsigned> PtrToIdx;
16836 EdgeIdx = Ptr2->UserTreeIndex.EdgeIdx;
16837 Ptr2 = Ptr2->UserTreeIndex.UserTE;
16840 unsigned Idx = Ptr1->UserTreeIndex.EdgeIdx;
16841 Ptr1 = Ptr1->UserTreeIndex.UserTE;
16842 if (
auto It = PtrToIdx.
find(Ptr1); It != PtrToIdx.
end())
16843 return Idx < It->second;
16847 for (
Value *V : VL) {
16851 SmallPtrSet<const TreeEntry *, 4> VToTEs;
16852 for (
const TreeEntry *TEPtr : ValueToGatherNodes.lookup(V)) {
16853 if (TEPtr == TE || TEPtr->Idx == 0)
16856 [&](
Value *V) { return GatheredScalars.contains(V); }) &&
16857 "Must contain at least single gathered value.");
16858 assert(TEPtr->UserTreeIndex &&
16859 "Expected only single user of a gather node.");
16860 const EdgeInfo &UseEI = TEPtr->UserTreeIndex;
16862 PHINode *UserPHI = (UseEI.UserTE->State != TreeEntry::SplitVectorize &&
16863 UseEI.UserTE->hasState())
16868 : &getLastInstructionInBundle(UseEI.UserTE);
16869 if (TEInsertPt == InsertPt) {
16871 if (TEUseEI.UserTE->State == TreeEntry::Vectorize &&
16872 (TEUseEI.UserTE->getOpcode() != Instruction::PHI ||
16873 TEUseEI.UserTE->isAltShuffle()) &&
16875 if (UseEI.UserTE->State != TreeEntry::Vectorize ||
16876 (UseEI.UserTE->hasState() &&
16877 UseEI.UserTE->getOpcode() == Instruction::PHI &&
16878 !UseEI.UserTE->isAltShuffle()) ||
16887 (TEUseEI.UserTE != UseEI.UserTE || TEUseEI.EdgeIdx < UseEI.EdgeIdx))
16890 if (TEUseEI.UserTE->State == TreeEntry::Vectorize &&
16891 TEUseEI.UserTE->getOpcode() == Instruction::PHI &&
16892 UseEI.UserTE->State == TreeEntry::Vectorize &&
16893 UseEI.UserTE->getOpcode() == Instruction::PHI &&
16894 TEUseEI.UserTE != UseEI.UserTE)
16899 if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx)
16903 if (TEUseEI.UserTE != UseEI.UserTE &&
16904 (TEUseEI.UserTE->Idx < UseEI.UserTE->Idx ||
16905 HasGatherUser(TEUseEI.UserTE)))
16908 if (CheckParentNodes(TEUseEI.UserTE, UseEI.UserTE, UseEI.EdgeIdx))
16912 if (!TEUseEI.UserTE->isGather() && !UserPHI &&
16913 TEUseEI.UserTE->doesNotNeedToSchedule() !=
16914 UseEI.UserTE->doesNotNeedToSchedule() &&
16919 if ((TEInsertBlock != InsertPt->
getParent() ||
16920 TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) &&
16921 (!CheckOrdering(InsertPt) ||
16922 (UseEI.UserTE->hasCopyableElements() &&
16927 if (CheckAndUseSameNode(TEPtr))
16933 VTEs, [&](
const TreeEntry *MTE) {
return MTE != TEUseEI.UserTE; });
16934 if (It != VTEs.end()) {
16935 const TreeEntry *VTE = *It;
16936 if (
none_of(
TE->CombinedEntriesWithIndices,
16937 [&](
const auto &
P) { return P.first == VTE->Idx; })) {
16938 Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
16939 if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
16943 if (CheckAndUseSameNode(VTE))
16949 const TreeEntry *VTE = VTEs.front();
16950 if (ForOrder && VTE->Idx < GatheredLoadsEntriesFirst.value_or(0) &&
16951 VTEs.size() > 1 && VTE->State != TreeEntry::Vectorize) {
16952 VTEs = VTEs.drop_front();
16954 const auto *MIt =
find_if(VTEs, [](
const TreeEntry *MTE) {
16955 return MTE->State == TreeEntry::Vectorize;
16957 if (MIt == VTEs.end())
16961 if (
none_of(
TE->CombinedEntriesWithIndices,
16962 [&](
const auto &
P) { return P.first == VTE->Idx; })) {
16963 Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
16964 if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
16968 if (CheckAndUseSameNode(VTE))
16972 if (VToTEs.
empty())
16974 if (UsedTEs.
empty()) {
16982 SmallPtrSet<const TreeEntry *, 4> SavedVToTEs(VToTEs);
16984 for (SmallPtrSet<const TreeEntry *, 4> &Set : UsedTEs) {
16988 if (!VToTEs.
empty()) {
16994 VToTEs = SavedVToTEs;
16999 if (Idx == UsedTEs.
size()) {
17003 if (UsedTEs.
size() == 2)
17005 UsedTEs.push_back(SavedVToTEs);
17006 Idx = UsedTEs.
size() - 1;
17012 if (UsedTEs.
empty()) {
17014 return std::nullopt;
17018 if (UsedTEs.
size() == 1) {
17021 UsedTEs.front().
end());
17022 sort(FirstEntries, [](
const TreeEntry *TE1,
const TreeEntry *TE2) {
17023 return TE1->Idx < TE2->Idx;
17026 auto *It =
find_if(FirstEntries, [=](
const TreeEntry *EntryPtr) {
17027 return EntryPtr->isSame(VL) || EntryPtr->isSame(
TE->Scalars);
17029 if (It != FirstEntries.end() &&
17030 ((*It)->getVectorFactor() == VL.size() ||
17031 ((*It)->getVectorFactor() ==
TE->Scalars.size() &&
17032 TE->ReuseShuffleIndices.size() == VL.size() &&
17033 (*It)->isSame(
TE->Scalars)))) {
17035 if ((*It)->getVectorFactor() == VL.size()) {
17036 std::iota(std::next(
Mask.begin(), Part * VL.size()),
17037 std::next(
Mask.begin(), (Part + 1) * VL.size()), 0);
17039 SmallVector<int> CommonMask =
TE->getCommonMask();
17050 Entries.
push_back(FirstEntries.front());
17052 for (
auto &
P : UsedValuesEntry)
17054 VF = FirstEntries.front()->getVectorFactor();
17057 assert(UsedTEs.
size() == 2 &&
"Expected at max 2 permuted entries.");
17059 DenseMap<int, const TreeEntry *> VFToTE;
17060 for (
const TreeEntry *TE : UsedTEs.front()) {
17061 unsigned VF =
TE->getVectorFactor();
17062 auto It = VFToTE.
find(VF);
17063 if (It != VFToTE.
end()) {
17064 if (It->second->Idx >
TE->Idx)
17065 It->getSecond() =
TE;
17072 UsedTEs.back().
end());
17073 sort(SecondEntries, [](
const TreeEntry *TE1,
const TreeEntry *TE2) {
17074 return TE1->Idx < TE2->Idx;
17076 for (
const TreeEntry *TE : SecondEntries) {
17077 auto It = VFToTE.
find(
TE->getVectorFactor());
17078 if (It != VFToTE.
end()) {
17087 if (Entries.
empty()) {
17089 UsedTEs.front(), [](
const TreeEntry *TE1,
const TreeEntry *TE2) {
17090 return TE1->Idx < TE2->Idx;
17092 Entries.
push_back(SecondEntries.front());
17093 VF = std::max(Entries.
front()->getVectorFactor(),
17094 Entries.
back()->getVectorFactor());
17096 VF = Entries.
front()->getVectorFactor();
17099 for (
const TreeEntry *
E : Entries)
17103 for (
auto &
P : UsedValuesEntry) {
17105 if (ValuesToEntries[Idx].
contains(
P.first)) {
17115 auto AreCompatiblePHIs = [&](
Value *
V,
Value *V1) {
17122 for (
int I = 0,
E =
PHI->getNumIncomingValues();
I <
E; ++
I) {
17124 Value *In1 = PHI1->getIncomingValue(
I);
17139 auto MightBeIgnored = [=](
Value *
V) {
17143 !areAllUsersVectorized(
I, UserIgnoreList) &&
isSimple(
I);
17148 auto NeighborMightBeIgnored = [&](
Value *
V,
int Idx) {
17149 Value *V1 = VL[Idx];
17150 bool UsedInSameVTE =
false;
17151 auto It = UsedValuesEntry.find(V1);
17152 if (It != UsedValuesEntry.end())
17153 UsedInSameVTE = It->second == UsedValuesEntry.find(V)->second;
17154 return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE &&
17161 SmallBitVector UsedIdxs(Entries.size());
17163 for (
int I = 0,
E = VL.size();
I <
E; ++
I) {
17165 auto It = UsedValuesEntry.find(V);
17166 if (It == UsedValuesEntry.end())
17172 ((
I > 0 && NeighborMightBeIgnored(V,
I - 1)) ||
17173 (
I !=
E - 1 && NeighborMightBeIgnored(V,
I + 1)))))
17175 unsigned Idx = It->second;
17182 for (
unsigned I = 0, Sz = Entries.size();
I < Sz; ++
I) {
17183 if (!UsedIdxs.test(
I))
17189 for (std::pair<unsigned, int> &Pair : EntryLanes)
17190 if (Pair.first ==
I)
17191 Pair.first = TempEntries.
size();
17194 Entries.swap(TempEntries);
17195 if (EntryLanes.size() == Entries.size() &&
17197 .slice(Part * VL.size(),
17198 std::min<int>(VL.size(),
TE->Scalars.size())))) {
17204 return std::nullopt;
17207 bool IsIdentity = Entries.size() == 1;
17210 for (
const std::pair<unsigned, int> &Pair : EntryLanes) {
17211 unsigned Idx = Part * VL.size() + Pair.second;
17214 (ForOrder ? std::distance(
17215 Entries[Pair.first]->Scalars.begin(),
17216 find(Entries[Pair.first]->Scalars, VL[Pair.second]))
17217 : Entries[Pair.first]->findLaneForValue(VL[Pair.second]));
17218 IsIdentity &=
Mask[Idx] == Pair.second;
17220 if (ForOrder || IsIdentity || Entries.empty()) {
17221 switch (Entries.size()) {
17223 if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
17227 if (EntryLanes.size() > 2 || VL.size() <= 2)
17234 (EntryLanes.size() > Entries.size() || VL.size() <= 2)) {
17236 SmallVector<int> SubMask(std::next(
Mask.begin(), Part * VL.size()),
17237 std::next(
Mask.begin(), (Part + 1) * VL.size()));
17238 int MinElement = SubMask.
front(), MaxElement = SubMask.
front();
17239 for (
int Idx : SubMask) {
17247 assert(MaxElement >= 0 && MinElement >= 0 &&
17248 MaxElement % VF >= MinElement % VF &&
17249 "Expected at least single element.");
17250 unsigned NewVF = std::max<unsigned>(
17252 (MaxElement % VF) -
17253 (MinElement % VF) + 1));
17255 for (
int &Idx : SubMask) {
17258 Idx = ((Idx % VF) - (((MinElement % VF) / NewVF) * NewVF)) % NewVF +
17259 (Idx >=
static_cast<int>(VF) ? NewVF : 0);
17267 auto *MaskVecTy =
getWidenedType(VL.front()->getType(), SubMask.size());
17268 auto GetShuffleCost = [&,
17269 &TTI = *TTI](ArrayRef<int>
Mask,
17272 if (Entries.size() == 1 && Entries.front()->getInterleaveFactor() > 0 &&
17274 Mask, Entries.front()->getInterleaveFactor()))
17276 return ::getShuffleCost(TTI,
17281 InstructionCost ShuffleCost = GetShuffleCost(SubMask, Entries, VecTy);
17283 SmallVector<int> FirstMask(SubMask.begin(), SubMask.end());
17284 if (Entries.size() == 1 || !Entries[0]->isGather()) {
17285 FirstShuffleCost = ShuffleCost;
17289 bool IsIdentity =
true;
17290 for (
auto [
I, Idx] :
enumerate(FirstMask)) {
17291 if (Idx >=
static_cast<int>(NewVF)) {
17296 IsIdentity &=
static_cast<int>(
I) == Idx;
17300 FirstShuffleCost = GetShuffleCost(FirstMask, Entries.front(), VecTy);
17302 *TTI, VL.front()->getType(), MaskVecTy, DemandedElts,
true,
17306 SmallVector<int> SecondMask(SubMask.begin(), SubMask.end());
17307 if (Entries.size() == 1 || !Entries[1]->isGather()) {
17308 SecondShuffleCost = ShuffleCost;
17312 bool IsIdentity =
true;
17313 for (
auto [
I, Idx] :
enumerate(SecondMask)) {
17314 if (Idx <
static_cast<int>(NewVF) && Idx >= 0) {
17320 IsIdentity &=
static_cast<int>(
I) == Idx;
17325 SecondShuffleCost = GetShuffleCost(SecondMask, Entries[1], VecTy);
17327 *TTI, VL.front()->getType(), MaskVecTy, DemandedElts,
true,
17335 *TTI, VL.front()->getType(), MaskVecTy, DemandedElts,
true,
17337 const TreeEntry *BestEntry =
nullptr;
17338 if (FirstShuffleCost < ShuffleCost) {
17339 std::for_each(std::next(
Mask.begin(), Part * VL.size()),
17340 std::next(
Mask.begin(), (Part + 1) * VL.size()),
17342 if (Idx >= static_cast<int>(VF))
17343 Idx = PoisonMaskElem;
17345 BestEntry = Entries.front();
17346 ShuffleCost = FirstShuffleCost;
17348 if (SecondShuffleCost < ShuffleCost) {
17349 std::for_each(std::next(
Mask.begin(), Part * VL.size()),
17350 std::next(
Mask.begin(), (Part + 1) * VL.size()),
17352 if (Idx < static_cast<int>(VF))
17353 Idx = PoisonMaskElem;
17357 BestEntry = Entries[1];
17358 ShuffleCost = SecondShuffleCost;
17360 if (BuildVectorCost >= ShuffleCost) {
17363 Entries.push_back(BestEntry);
17371 std::fill(std::next(
Mask.begin(), Part * VL.size()),
17373 return std::nullopt;
17377BoUpSLP::isGatherShuffledEntry(
17381 assert(NumParts > 0 && NumParts < VL.
size() &&
17382 "Expected positive number of registers.");
17385 if (TE == VectorizableTree.front().get() &&
17386 (!GatheredLoadsEntriesFirst.has_value() ||
17388 [](
const std::unique_ptr<TreeEntry> &TE) {
17389 return !
TE->isGather();
17394 if (
TE->hasNonWholeRegisterOrNonPowerOf2Vec(*TTI))
17397 assert((
TE->UserTreeIndex || TE == VectorizableTree.front().get()) &&
17398 "Expected only single user of the gather node.");
17400 "Number of scalars must be divisible by NumParts.");
17401 if (
TE->UserTreeIndex &&
TE->UserTreeIndex.UserTE->isGather() &&
17402 TE->UserTreeIndex.EdgeIdx == UINT_MAX &&
17404 (
TE->hasState() &&
TE->getOpcode() == Instruction::ExtractElement) ||
17407 getSameValuesTreeEntry(
TE->getMainOp(),
TE->Scalars))))
17414 SmallVectorImpl<const TreeEntry *> &SubEntries = Entries.emplace_back();
17415 std::optional<TTI::ShuffleKind> SubRes =
17416 isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part,
17419 SubEntries.
clear();
17422 SubEntries.
front()->getVectorFactor() == VL.
size() &&
17423 (SubEntries.
front()->isSame(
TE->Scalars) ||
17424 SubEntries.
front()->isSame(VL))) {
17426 LocalSubEntries.
swap(SubEntries);
17429 std::iota(
Mask.begin(),
Mask.end(), 0);
17431 for (
int I = 0, Sz = VL.
size();
I < Sz; ++
I)
17434 Entries.emplace_back(1, LocalSubEntries.
front());
17440 [](
const std::optional<TTI::ShuffleKind> &SK) {
return !SK; })) {
17448 Type *ScalarTy)
const {
17449 const unsigned VF = VL.
size();
17457 auto EstimateInsertCost = [&](
unsigned I,
Value *
V) {
17459 if (
V->getType() != ScalarTy)
17460 Cost += TTI->getCastInstrCost(Instruction::Trunc, ScalarTy,
V->getType(),
17464 std::iota(ConstantShuffleMask.begin(), ConstantShuffleMask.end(), 0);
17471 ConstantShuffleMask[
I] =
I + VF;
17474 EstimateInsertCost(
I, V);
17477 bool IsAnyNonUndefConst =
17480 if (!ForPoisonSrc && IsAnyNonUndefConst) {
17482 ConstantShuffleMask);
17486 if (!DemandedElements.
isZero())
17490 ForPoisonSrc && !IsAnyNonUndefConst, VL);
Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
  auto It = EntryToLastInstruction.find(E);
  if (It != EntryToLastInstruction.end())
    return *It->second;

  // ...
  Instruction *Front = nullptr;
  unsigned Opcode = 0;
  if (E->hasState()) {
    Front = E->getMainOp();
    Opcode = E->getOpcode();
  }
  // ...
  BasicBlock *BB = Front->getParent();
  assert(((GatheredLoadsEntriesFirst.has_value() && Opcode == Instruction::Load &&
           E->isGather() && E->Idx < *GatheredLoadsEntriesFirst) ||
          E->State == TreeEntry::SplitVectorize || E->hasCopyableElements() ||
          all_of(E->Scalars,
                 [=](Value *V) -> bool {
                   if (Opcode == Instruction::GetElementPtr &&
                       !isa<GetElementPtrInst>(V))
                     return true;
                   auto *I = dyn_cast<Instruction>(V);
                   return !I || !E->getMatchingMainOpOrAltOp(I) ||
                          I->getParent() == BB ||
                          isVectorLikeInstWithConstOps(I);
                 })) &&
         "Expected gathered loads or GEPs or instructions from same basic "
         "block.");

  auto FindLastInst = [&]() {
    Instruction *LastInst = Front;
    for (Value *V : E->Scalars) {
      auto *I = dyn_cast<Instruction>(V);
      if (!I)
        continue;
      if (E->isCopyableElement(I))
        continue;
      if (LastInst->getParent() == I->getParent()) {
        if (LastInst->comesBefore(I))
          LastInst = I;
        continue;
      }
      assert(((Opcode == Instruction::GetElementPtr &&
               !isa<GetElementPtrInst>(I)) ||
              E->State == TreeEntry::SplitVectorize ||
              // ...
              (GatheredLoadsEntriesFirst.has_value() &&
               Opcode == Instruction::Load && E->isGather() &&
               E->Idx < *GatheredLoadsEntriesFirst)) &&
             "Expected vector-like or non-GEP in GEP node insts only.");
      if (!DT->isReachableFromEntry(LastInst->getParent())) {
        LastInst = I;
        continue;
      }
      if (!DT->isReachableFromEntry(I->getParent()))
        continue;
      auto *NodeA = DT->getNode(LastInst->getParent());
      auto *NodeB = DT->getNode(I->getParent());
      assert(NodeA && "Should only process reachable instructions");
      assert(NodeB && "Should only process reachable instructions");
      assert((NodeA == NodeB) ==
                 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
             "Different nodes should have different DFS numbers");
      if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn())
        LastInst = I;
    }
    return LastInst;
  };

  auto FindFirstInst = [&]() {
    Instruction *FirstInst = Front;
    for (Value *V : E->Scalars) {
      auto *I = dyn_cast<Instruction>(V);
      if (!I)
        continue;
      if (E->isCopyableElement(I))
        continue;
      if (FirstInst->getParent() == I->getParent()) {
        if (I->comesBefore(FirstInst))
          FirstInst = I;
        continue;
      }
      assert(((Opcode == Instruction::GetElementPtr &&
               !isa<GetElementPtrInst>(I)) ||
              // ...
              ) &&
             "Expected vector-like or non-GEP in GEP node insts only.");
      if (!DT->isReachableFromEntry(FirstInst->getParent())) {
        FirstInst = I;
        continue;
      }
      if (!DT->isReachableFromEntry(I->getParent()))
        continue;
      auto *NodeA = DT->getNode(FirstInst->getParent());
      auto *NodeB = DT->getNode(I->getParent());
      assert(NodeA && "Should only process reachable instructions");
      assert(NodeB && "Should only process reachable instructions");
      assert((NodeA == NodeB) ==
                 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
             "Different nodes should have different DFS numbers");
      if (NodeA->getDFSNumIn() > NodeB->getDFSNumIn())
        FirstInst = I;
    }
    return FirstInst;
  };

  Instruction *Res = nullptr;
  if (E->State == TreeEntry::SplitVectorize) {
    Res = FindLastInst();
    // ...
    for (auto *E : Entries) {
      // ...
      Instruction *I = &getLastInstructionInBundle(E);
      // ...
    }
    EntryToLastInstruction.try_emplace(E, Res);
    return *Res;
  }

  if (GatheredLoadsEntriesFirst.has_value() &&
      E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
      Opcode == Instruction::Load) {
    Res = FindFirstInst();
    EntryToLastInstruction.try_emplace(E, Res);
    return *Res;
  }

  auto FindScheduleBundle = [&](const TreeEntry *E) -> const ScheduleBundle * {
    // ...
    const auto *It = BlocksSchedules.find(BB);
    if (It == BlocksSchedules.end())
      return nullptr;
    for (Value *V : E->Scalars) {
      // ...
      if (Bundles.empty())
        continue;
      const auto *It = find_if(
          Bundles, [&](ScheduleBundle *B) { return B->getTreeEntry() == E; });
      if (It != Bundles.end())
        return *It;
    }
    return nullptr;
  };

  const ScheduleBundle *Bundle = FindScheduleBundle(E);
  if (!E->isGather() && !Bundle) {
    if ((Opcode == Instruction::GetElementPtr &&
         any_of(E->Scalars,
                [](Value *V) {
                  return !isa<GetElementPtrInst>(V) && isa<Instruction>(V);
                })) ||
        all_of(E->Scalars,
               [&](Value *V) {
                 return isa<PoisonValue>(V) ||
                        (E->Idx == 0 && isa<InsertElementInst>(V)) ||
                        E->isCopyableElement(V) ||
                        (!isVectorLikeInstWithConstOps(V) &&
                         isUsedOutsideBlock(V));
               }) ||
        (!E->doesNotNeedToSchedule() ||
         all_of(E->Scalars,
                [&](Value *V) {
                  if (!isa<Instruction>(V) ||
                      (E->hasCopyableElements() && E->isCopyableElement(V)))
                    return true;
                  return !areAllOperandsNonInsts(V);
                }) ||
         all_of(E->Scalars, [&](Value *V) {
           if (!isa<Instruction>(V) ||
               (E->hasCopyableElements() && E->isCopyableElement(V)))
             return true;
           return MustGather.contains(V);
         })))
      Res = FindLastInst();
    else
      Res = FindFirstInst();
    EntryToLastInstruction.try_emplace(E, Res);
    return *Res;
  }

  if (Bundle) {
    assert(!E->isGather() && "Gathered instructions should not be scheduled");
    Res = Bundle->getBundle().back()->getInst();
    EntryToLastInstruction.try_emplace(E, Res);
    return *Res;
  }

  // ...
  Res = FindLastInst();
  assert(Res && "Failed to find last instruction in bundle");
  EntryToLastInstruction.try_emplace(E, Res);
  return *Res;
}
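// Positions the IR builder relative to the bundle computed above: either at
// the last bundle instruction itself (non-schedulable bundles, gathered
// loads, PHIs) or right after it, and takes the debug location from the
// bundle's main operation.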
void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
  auto *Front = E->getMainOp();
  Instruction *LastInst = &getLastInstructionInBundle(E);
  assert(LastInst && "Failed to find last instruction in bundle");
  BasicBlock::iterator LastInstIt = LastInst->getIterator();
  // ...
  if (isa<PHINode>(LastInst))
    LastInstIt = LastInst->getParent()->getFirstNonPHIIt();
  if (LastInstIt != LastInst->getParent()->end() &&
      LastInstIt->getParent()->isLandingPad())
    LastInstIt = std::next(LastInstIt);
  if (isa<PHINode>(LastInst) ||
      (!E->isGather() && E->State != TreeEntry::SplitVectorize &&
       (E->doesNotNeedToSchedule() ||
        (E->hasCopyableElements() && !E->isCopyableElement(LastInst) &&
         /* ... */ true))) ||
      (GatheredLoadsEntriesFirst.has_value() &&
       E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
       E->getOpcode() == Instruction::Load)) {
    Builder.SetInsertPoint(LastInst->getParent(), LastInstIt);
  } else {
    Builder.SetInsertPoint(LastInst->getParent(),
                           std::next(LastInst->getIterator()));
  }
  Builder.SetCurrentDebugLocation(Front->getDebugLoc());
}
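// Materializes a gather sequence: scalars are written into a vector via
// insertelement instructions. Scalars defined in the insertion block's
// predecessors or inside the current loop are postponed (PostponedIndices /
// PostponedInsts) and inserted last, so loop-invariant inserts stay hoistable.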
Value *BoUpSLP::gather(
    ArrayRef<Value *> VL, Value *Root, Type *ScalarTy,
    function_ref<Value *(Value *, Value *, ArrayRef<int>)> CreateShuffle) {
  // ...
  SmallVector<std::pair<Value *, unsigned>> PostponedInsts;
  SmallSet<int, 4> PostponedIndices;
  Loop *L = LI->getLoopFor(Builder.GetInsertBlock());
  auto CheckPredecessor = [](BasicBlock *InstBB, BasicBlock *InsertBB) {
    SmallPtrSet<BasicBlock *, 4> Visited;
    while (InsertBB && InsertBB != InstBB && Visited.insert(InsertBB).second)
      InsertBB = InsertBB->getSinglePredecessor();
    return InsertBB && InsertBB == InstBB;
  };
  for (int I = 0, E = VL.size(); I < E; ++I) {
    if (auto *Inst = dyn_cast<Instruction>(VL[I]))
      if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) ||
           // ...
           (L && (!Root || L->isLoopInvariant(Root)) && L->contains(Inst))) &&
          PostponedIndices.insert(I).second)
        PostponedInsts.emplace_back(VL[I], I);
  }

  auto &&CreateInsertElement = [this](Value *Vec, Value *V, unsigned Pos,
                                      Type *Ty) {
    Value *Scalar = V;
    if (Scalar->getType() != Ty) {
      // ...
      Scalar = Builder.CreateIntCast(Scalar, Ty, /*isSigned=*/ /* ... */ false);
    }
    // ...
    Vec = Builder.CreateInsertElement(Vec, Scalar, Builder.getInt32(Pos));
    auto *InsElt = dyn_cast<InsertElementInst>(Vec);
    if (!InsElt)
      return Vec;
    GatherShuffleExtractSeq.insert(InsElt);
    // ...
    User *UserOp = nullptr;
    // ...
    if (V->getType()->isVectorTy()) {
      if (auto *SV = dyn_cast<ShuffleVectorInst>(/* ... */ InsElt);
          SV && SV->getOperand(0) != V && SV->getOperand(1) != V) {
        auto FindOperand = [](Value *Vec, Value *V) -> Instruction * {
          if (auto *SV = dyn_cast<ShuffleVectorInst>(Vec)) {
            if (SV->getOperand(0) == V)
              return SV;
            if (SV->getOperand(1) == V)
              return SV;
          }
          return nullptr;
        };
        // ...
        if (Instruction *User = FindOperand(SV->getOperand(0), V))
          UserOp = User;
        else if (Instruction *User = FindOperand(SV->getOperand(1), V))
          UserOp = User;
        assert(UserOp && "Failed to find shufflevector, caused by resize.");
      }
    }
    if (UserOp) {
      if (ArrayRef<TreeEntry *> Entries = getTreeEntries(V);
          !Entries.empty()) {
        unsigned FoundLane = Entries.front()->findLaneForValue(V);
        ExternalUses.emplace_back(V, UserOp, *Entries.front(), FoundLane);
      }
    }
    return Vec;
  };

  // ...
  SmallVector<int> NonConsts;
  SmallVector<int> Mask(VL.size());
  std::iota(Mask.begin(), Mask.end(), 0);
  Value *OriginalRoot = Root;
  if (auto *SV = dyn_cast_or_null<ShuffleVectorInst>(Root);
      SV && /* ... */
      SV->getOperand(0)->getType() == VecTy) {
    Root = SV->getOperand(0);
    Mask.assign(SV->getShuffleMask().begin(), SV->getShuffleMask().end());
  }
  Value *Vec = /* ... */ Root;
  for (int I = 0, E = VL.size(); I < E; ++I) {
    // ... (skip non-constant and postponed lanes here)
    Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
  }
  if (/* ... */ false)
    Vec = OriginalRoot;
  else {
    Vec = CreateShuffle(Root, Vec, Mask);
    if (auto *OI = dyn_cast<Instruction>(OriginalRoot);
        OI && OI->use_empty() &&
        none_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
          return TE->VectorizedValue == OI;
        }))
      eraseInstruction(OI);
  }
  for (int I : NonConsts)
    Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
  for (const std::pair<Value *, unsigned> &Pair : PostponedInsts)
    Vec = CreateInsertElement(Vec, Pair.first, Pair.second, ScalarTy);
  return Vec;
}
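// ShuffleInstructionBuilder is the IR-emitting counterpart of the shuffle
// analysis used by the cost model: it accumulates up to two input vectors
// plus a combined mask (InVectors / CommonMask), folding consecutive
// shuffles together, until finalize() emits the final shufflevector chain.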
class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
  bool IsFinalized = false;
  SmallVector<int> CommonMask;
  SmallVector<Value *, 2> InVectors;
  IRBuilderBase &Builder;
  BoUpSLP &R;

  class ShuffleIRBuilder {
    IRBuilderBase &Builder;
    SetVector<Instruction *> &GatherShuffleExtractSeq;
    DenseSet<BasicBlock *> &CSEBlocks;
    const DataLayout &DL;

  public:
    ShuffleIRBuilder(IRBuilderBase &Builder,
                     SetVector<Instruction *> &GatherShuffleExtractSeq,
                     DenseSet<BasicBlock *> &CSEBlocks, const DataLayout &DL)
        : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq),
          CSEBlocks(CSEBlocks), DL(DL) {}
    ~ShuffleIRBuilder() = default;

    Value *createShuffleVector(Value *V1, Value *V2, ArrayRef<int> Mask) {
      if (V1->getType() != V2->getType()) {
        assert(V1->getType()->isIntOrIntVectorTy() &&
               V2->getType()->isIntOrIntVectorTy() &&
               "Expected integer vector types only.");
        if (cast<VectorType>(V2->getType())
                ->getElementType()
                ->getIntegerBitWidth() < cast<VectorType>(V1->getType())
                                             ->getElementType()
                                             ->getIntegerBitWidth())
          V2 = Builder.CreateIntCast(
              V2, V1->getType(), !isKnownNonNegative(V2, SimplifyQuery(DL)));
        else
          V1 = Builder.CreateIntCast(
              V1, V2->getType(), !isKnownNonNegative(V1, SimplifyQuery(DL)));
      }
      Value *Vec = Builder.CreateShuffleVector(V1, V2, Mask);
      if (auto *I = dyn_cast<Instruction>(Vec)) {
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
      return Vec;
    }

    Value *createShuffleVector(Value *V1, ArrayRef<int> Mask) {
      unsigned VF = Mask.size();
      // ...
      Value *Vec = Builder.CreateShuffleVector(V1, Mask);
      if (auto *I = dyn_cast<Instruction>(Vec)) {
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
      return Vec;
    }

    Value *createIdentity(Value *V) { return V; }
    Value *createPoison(Type *Ty, unsigned VF) {
      return PoisonValue::get(getWidenedType(Ty, VF));
    }

    void resizeToMatch(Value *&V1, Value *&V2) {
      if (V1->getType() == V2->getType())
        return;
      int V1VF = cast<FixedVectorType>(V1->getType())->getNumElements();
      int V2VF = cast<FixedVectorType>(V2->getType())->getNumElements();
      int VF = std::max(V1VF, V2VF);
      int MinVF = std::min(V1VF, V2VF);
      SmallVector<int> IdentityMask(VF, PoisonMaskElem);
      std::iota(IdentityMask.begin(), std::next(IdentityMask.begin(), MinVF),
                0);
      Value *&Op = MinVF == V1VF ? V1 : V2;
      Op = Builder.CreateShuffleVector(Op, IdentityMask);
      if (auto *I = dyn_cast<Instruction>(Op)) {
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
    }
  };

  Value *createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask) {
    assert(V1 && "Expected at least one vector value.");
    ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
                                    R.CSEBlocks, *R.DL);
    return BaseShuffleAnalysis::createShuffle<Value *>(V1, V2, Mask,
                                                       ShuffleBuilder,
                                                       ScalarTy);
  }

  Value *castToScalarTyElem(Value *V,
                            std::optional<bool> IsSigned = std::nullopt) {
    auto *VecTy = cast<VectorType>(V->getType());
    if (VecTy->getElementType() == ScalarTy->getScalarType())
      return V;
    return Builder.CreateIntCast(
        V, VectorType::get(ScalarTy->getScalarType(), VecTy->getElementCount()),
        IsSigned.value_or(/* ... */ true));
  }

  Value *getVectorizedValue(const TreeEntry &E) {
    Value *Vec = E.VectorizedValue;
    // ...
    return castToScalarTyElem(Vec, any_of(E.Scalars, [&](Value *V) {
                                return !isa<PoisonValue>(V) &&
                                       !isKnownNonNegative(
                                           V, SimplifyQuery(*R.DL));
                              }));
  }

public:
  ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
      : BaseShuffleAnalysis(ScalarTy), Builder(Builder), R(R) {}

  Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
                        ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
                        unsigned NumParts, bool &UseVecBaseAsInput) {
    UseVecBaseAsInput = false;
    SmallPtrSet<Value *, 4> UniqueBases;
    Value *VecBase = nullptr;
    SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
    if (!E->ReorderIndices.empty()) {
      SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
                                   E->ReorderIndices.end());
      reorderScalars(VL, ReorderMask);
    }
    for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
      if (Mask[I] == PoisonMaskElem)
        continue;
      auto *EI = cast<ExtractElementInst>(VL[I]);
      VecBase = EI->getVectorOperand();
      if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecBase); !TEs.empty())
        VecBase = TEs.front()->VectorizedValue;
      assert(VecBase && "Expected vectorized value.");
      UniqueBases.insert(VecBase);
      // ...
      if (!EI->hasOneUse() || R.ExternalUsesAsOriginalScalar.contains(EI) ||
          (NumParts != 1 && count(VL, EI) > 1) ||
          any_of(EI->users(), [&](User *U) {
            ArrayRef<TreeEntry *> UTEs = R.getTreeEntries(U);
            return UTEs.empty() || UTEs.size() > 1 ||
                   (isa<GetElementPtrInst>(U) &&
                    !R.areAllUsersVectorized(cast<Instruction>(U))) ||
                   count_if(R.VectorizableTree,
                            [&](const std::unique_ptr<TreeEntry> &TE) {
                              return TE->UserTreeIndex.UserTE ==
                                     UTEs.front() /* ... */;
                            }) != 1;
          }) ||
          is_contained(VL, EI))
        continue;
      R.eraseInstruction(EI);
    }
    if (NumParts == 1 || UniqueBases.size() == 1) {
      assert(VecBase && "Expected vectorized value.");
      return castToScalarTyElem(VecBase);
    }
    UseVecBaseAsInput = true;
    // ...
    Value *Vec = nullptr;
    // ...
    constexpr int MaxBases = 2;
    SmallVector<Value *, MaxBases> Bases(MaxBases);
    for (unsigned Part : seq<unsigned>(NumParts)) {
      // ...
      auto VLMask = zip(SubVL, SubMask);
      const unsigned VF = std::accumulate(
          VLMask.begin(), VLMask.end(), 0U, [&](unsigned S, const auto &D) {
            if (std::get<1>(D) == PoisonMaskElem)
              return S;
            Value *VecOp =
                cast<ExtractElementInst>(std::get<0>(D))->getVectorOperand();
            if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecOp);
                !TEs.empty())
              VecOp = TEs.front()->VectorizedValue;
            assert(VecOp && "Expected vectorized value.");
            const unsigned Size =
                cast<FixedVectorType>(VecOp->getType())->getNumElements();
            return std::max(S, Size);
          });
      for (const auto [V, I] : VLMask) {
        if (I == PoisonMaskElem)
          continue;
        Value *VecOp = cast<ExtractElementInst>(V)->getVectorOperand();
        if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecOp); !TEs.empty())
          VecOp = TEs.front()->VectorizedValue;
        assert(VecOp && "Expected vectorized value.");
        VecOp = castToScalarTyElem(VecOp);
        Bases[I / VF] = VecOp;
      }
      if (!Bases.front())
        continue;
      Value *SubVec;
      if (Bases.back()) {
        SubVec = createShuffle(Bases.front(), Bases.back(), SubMask);
        TransformToIdentity(SubMask);
      } else {
        SubVec = Bases.front();
      }
      if (!Vec) {
        Vec = SubVec;
        assert((Part == 0 ||
                all_of(seq<unsigned>(0, Part),
                       [&](unsigned P) {
                         ArrayRef<int> SubMask =
                             Mask.slice(P * SliceSize,
                                        getNumElems(Mask.size(), SliceSize, P));
                         return all_of(SubMask, [](int Idx) {
                           return Idx == PoisonMaskElem;
                         });
                       })) &&
               "Expected first part or all previous parts masked.");
        copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
      } else {
        unsigned NewVF =
            cast<FixedVectorType>(Vec->getType())->getNumElements();
        unsigned SubVecVF =
            cast<FixedVectorType>(SubVec->getType())->getNumElements();
        NewVF = std::max(NewVF, SubVecVF);
        // ...
        for (int &Idx : SubMask)
          if (Idx != PoisonMaskElem)
            Idx += NewVF;
        copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
        Vec = createShuffle(Vec, SubVec, VecMask);
        TransformToIdentity(VecMask);
      }
    }
    return Vec;
  }

  std::optional<Value *>
  needToDelay(const TreeEntry *E,
              ArrayRef<SmallVector<const TreeEntry *>> Deps) const {
    if (all_of(Deps, [](ArrayRef<const TreeEntry *> TEs) {
          return all_of(
              TEs, [](const TreeEntry *TE) { return TE->VectorizedValue; });
        }))
      return std::nullopt;
    auto *ResVecTy = getWidenedType(ScalarTy, E->getVectorFactor());
    return Builder.CreateAlignedLoad(
        ResVecTy,
        PoisonValue::get(PointerType::getUnqual(ScalarTy->getContext())),
        MaybeAlign());
  }

  void resetForSameNode() {
    IsFinalized = false;
    CommonMask.clear();
    InVectors.clear();
    // ...
  }

  void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
    Value *V1 = getVectorizedValue(E1);
    Value *V2 = getVectorizedValue(E2);
    add(V1, V2, Mask);
  }

  void add(const TreeEntry &E1, ArrayRef<int> Mask) {
    Value *V1 = getVectorizedValue(E1);
    add(V1, Mask);
  }

  void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
    assert(V1 && V2 && !Mask.empty() && "Expected non-empty input vectors.");
    assert(isa<FixedVectorType>(V1->getType()) &&
           isa<FixedVectorType>(V2->getType()) &&
           "castToScalarTyElem expects V1 and V2 to be FixedVectorType");
    V1 = castToScalarTyElem(V1);
    V2 = castToScalarTyElem(V2);
    if (InVectors.empty()) {
      InVectors.push_back(V1);
      InVectors.push_back(V2);
      CommonMask.assign(Mask.begin(), Mask.end());
      return;
    }
    Value *Vec = InVectors.front();
    if (InVectors.size() == 2) {
      Vec = createShuffle(Vec, InVectors.back(), CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    } else if (cast<FixedVectorType>(Vec->getType())->getNumElements() !=
               CommonMask.size()) {
      Vec = createShuffle(Vec, nullptr, CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    }
    V1 = createShuffle(V1, V2, Mask);
    unsigned VF = std::max(getVF(V1), getVF(Vec));
    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
      if (Mask[Idx] != PoisonMaskElem)
        CommonMask[Idx] = Idx + VF;
    InVectors.front() = Vec;
    if (InVectors.size() == 2)
      InVectors.back() = V1;
    else
      InVectors.push_back(V1);
  }

  void add(Value *V1, ArrayRef<int> Mask, bool = false) {
    assert(isa<FixedVectorType>(V1->getType()) &&
           "castToScalarTyElem expects V1 to be FixedVectorType");
    V1 = castToScalarTyElem(V1);
    if (InVectors.empty()) {
      InVectors.push_back(V1);
      CommonMask.assign(Mask.begin(), Mask.end());
      return;
    }
    const auto *It = find(InVectors, V1);
    if (It == InVectors.end()) {
      if (InVectors.size() == 2 ||
          InVectors.front()->getType() != V1->getType()) {
        Value *V = InVectors.front();
        if (InVectors.size() == 2) {
          V = createShuffle(InVectors.front(), InVectors.back(), CommonMask);
          transformMaskAfterShuffle(CommonMask, CommonMask);
        } else if (cast<FixedVectorType>(V->getType())->getNumElements() !=
                   CommonMask.size()) {
          V = createShuffle(InVectors.front(), nullptr, CommonMask);
          transformMaskAfterShuffle(CommonMask, CommonMask);
        }
        unsigned VF = std::max(CommonMask.size(), Mask.size());
        for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
          if (CommonMask[Idx] == PoisonMaskElem)
            CommonMask[Idx] = V->getType() != V1->getType()
                                  ? Idx + Sz
                                  : Mask[Idx] + getVF(V1);
        if (V->getType() != V1->getType())
          V1 = createShuffle(V1, nullptr, Mask);
        InVectors.front() = V;
        if (InVectors.size() == 2)
          InVectors.back() = V1;
        return;
      }
      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
        if (CommonMask[Idx] == PoisonMaskElem && Mask[Idx] != PoisonMaskElem)
          CommonMask[Idx] = Mask[Idx] + getVF(InVectors.front());
      InVectors.push_back(V1);
      return;
    }
    unsigned VF = 0;
    for (Value *V : InVectors)
      VF = std::max(VF, getVF(V));
    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
      if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
        CommonMask[Idx] = Mask[Idx] + (It == InVectors.begin() ? 0 : VF);
  }

  Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
                Value *Root = nullptr) {
    return R.gather(VL, Root, ScalarTy,
                    [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
                      return createShuffle(V1, V2, Mask);
                    });
  }

  Value *createFreeze(Value *V) { return Builder.CreateFreeze(V); }

  Value *
  finalize(ArrayRef<int> ExtMask,
           ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
           ArrayRef<int> SubVectorsMask, unsigned VF = 0,
           function_ref<void(Value *&, SmallVectorImpl<int> &,
                             function_ref<Value *(Value *, Value *,
                                                  ArrayRef<int>)>)>
               Action = {}) {
    IsFinalized = true;
    if (Action) {
      Value *Vec = InVectors.front();
      if (InVectors.size() == 2) {
        Vec = createShuffle(Vec, InVectors.back(), CommonMask);
        InVectors.pop_back();
      } else {
        Vec = createShuffle(Vec, nullptr, CommonMask);
      }
      transformMaskAfterShuffle(CommonMask, CommonMask);
      assert(VF > 0 &&
             "Expected vector length for the final value before action.");
      unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
      if (VecVF < VF) {
        SmallVector<int> ResizeMask(VF, PoisonMaskElem);
        std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
        Vec = createShuffle(Vec, nullptr, ResizeMask);
      }
      Action(Vec, CommonMask, [this](Value *V1, Value *V2, ArrayRef<int> Mask) {
        return createShuffle(V1, V2, Mask);
      });
      InVectors.front() = Vec;
    }
    if (!SubVectors.empty()) {
      Value *Vec = InVectors.front();
      if (InVectors.size() == 2) {
        Vec = createShuffle(Vec, InVectors.back(), CommonMask);
        InVectors.pop_back();
      } else {
        Vec = createShuffle(Vec, nullptr, CommonMask);
      }
      transformMaskAfterShuffle(CommonMask, CommonMask);
      auto CreateSubVectors = [&](Value *Vec,
                                  SmallVectorImpl<int> &CommonMask) {
        for (auto [E, Idx] : SubVectors) {
          Value *V = getVectorizedValue(*E);
          // ...
          Type *OrigScalarTy = ScalarTy;
          // ...
          Vec = createInsertVector(
              Builder, Vec, V, InsertionIndex,
              std::bind(&ShuffleInstructionBuilder::createShuffle, this, _1,
                        _2, _3));
          ScalarTy = OrigScalarTy;
          if (!CommonMask.empty()) {
            std::iota(std::next(CommonMask.begin(), Idx),
                      std::next(CommonMask.begin(),
                                Idx + E->getVectorFactor()),
                      Idx);
          }
        }
        return Vec;
      };
      if (SubVectorsMask.empty()) {
        Vec = CreateSubVectors(Vec, CommonMask);
      } else {
        SmallVector<int> SVMask(CommonMask.size(), PoisonMaskElem);
        copy(SubVectorsMask, SVMask.begin());
        for (auto [I1, I2] : zip(SVMask, CommonMask)) {
          if (I2 != PoisonMaskElem)
            I1 = I2 + CommonMask.size();
        }
        Value *InsertVec =
            CreateSubVectors(PoisonValue::get(Vec->getType()), CommonMask);
        Vec = createShuffle(InsertVec, Vec, SVMask);
        transformMaskAfterShuffle(CommonMask, SVMask);
      }
      InVectors.front() = Vec;
    }
    if (!ExtMask.empty()) {
      if (CommonMask.empty()) {
        CommonMask.assign(ExtMask.begin(), ExtMask.end());
      } else {
        SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
        for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
          if (ExtMask[I] == PoisonMaskElem)
            continue;
          NewMask[I] = CommonMask[ExtMask[I]];
        }
        CommonMask.swap(NewMask);
      }
    }
    if (CommonMask.empty()) {
      assert(InVectors.size() == 1 && "Expected only one vector with no mask");
      return InVectors.front();
    }
    if (InVectors.size() == 2)
      return createShuffle(InVectors.front(), InVectors.back(), CommonMask);
    return createShuffle(InVectors.front(), nullptr, CommonMask);
  }

  ~ShuffleInstructionBuilder() {
    assert((IsFinalized || CommonMask.empty()) &&
           "Shuffle construction must be finalized.");
  }
};
Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx) {
  // ...
}
template <typename BVTy, typename ResTy, typename... Args>
ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
                                  Args &...Params) {
  assert(E->isGather() && "Expected gather node.");
  unsigned VF = E->getVectorFactor();

  bool NeedFreeze = false;
  SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end());
  // ...
  for (auto [EIdx, Idx] : E->CombinedEntriesWithIndices)
    for_each(MutableArrayRef(GatheredScalars)
                 .slice(Idx, VectorizableTree[EIdx]->getVectorFactor()),
             [&](Value *&V) { V = PoisonValue::get(V->getType()); });
  SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
      E->CombinedEntriesWithIndices.size());
  transform(E->CombinedEntriesWithIndices, SubVectors.begin(),
            [&](const auto &P) {
              return std::make_pair(VectorizableTree[P.first].get(), P.second);
            });
  // ...
  SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
                               E->ReorderIndices.end());
  if (!ReorderMask.empty())
    reorderScalars(GatheredScalars, ReorderMask);
  SmallVector<int> SubVectorsMask;
  // ...
  if (!SubVectors.empty() && !SubVectorsMask.empty()) {
    for (unsigned I : seq<unsigned>(GatheredScalars.size()))
      if (E->Scalars[I] == GatheredScalars[ReorderMask[I]])
        SubVectorsMask[ReorderMask[I]] = PoisonMaskElem;
  } else {
    SubVectorsMask.clear();
  }

  auto FindReusedSplat = [&](MutableArrayRef<int> Mask, unsigned InputVF,
                             unsigned I, unsigned SliceSize,
                             bool IsNotPoisonous) {
    if (!isSplat(E->Scalars) || none_of(E->Scalars, [](Value *V) {
          return isa<UndefValue>(V) && !isa<PoisonValue>(V);
        }))
      return false;
    TreeEntry *UserTE = E->UserTreeIndex.UserTE;
    unsigned EdgeIdx = E->UserTreeIndex.EdgeIdx;
    if (UserTE->getNumOperands() != 2)
      return false;
    if (!IsNotPoisonous) {
      auto *It =
          find_if(ArrayRef(VectorizableTree).drop_front(UserTE->Idx + 1),
                  [=](const std::unique_ptr<TreeEntry> &TE) {
                    return TE->UserTreeIndex.UserTE == UserTE &&
                           TE->UserTreeIndex.EdgeIdx != EdgeIdx;
                  });
      if (It == VectorizableTree.end())
        return false;
      SmallVector<Value *> GS((*It)->Scalars.begin(), (*It)->Scalars.end());
      if (!(*It)->ReorderIndices.empty()) {
        inversePermutation((*It)->ReorderIndices, ReorderMask);
        reorderScalars(GS, ReorderMask);
      }
      if (!all_of(zip(GatheredScalars, GS), [&](const auto &P) {
            Value *V0 = std::get<0>(P);
            Value *V1 = std::get<1>(P);
            return /* ... */ !isa<UndefValue>(V0) || isa<PoisonValue>(V0) ||
                   is_contained(E->Scalars, V1);
          }))
        return false;
    }
    int Idx;
    if ((Mask.size() < InputVF &&
         ShuffleVectorInst::isExtractSubvectorMask(Mask, InputVF, Idx) &&
         Idx == 0) ||
        (Mask.size() == InputVF &&
         ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))) {
      std::iota(
          std::next(Mask.begin(), I * SliceSize),
          std::next(Mask.begin(),
                    I * SliceSize + getNumElems(Mask.size(), SliceSize, I)),
          0);
    } else {
      unsigned IVal =
          *find_if_not(Mask, [](int Idx) { return Idx == PoisonMaskElem; });
      std::fill(
          std::next(Mask.begin(), I * SliceSize),
          std::next(Mask.begin(),
                    I * SliceSize + getNumElems(Mask.size(), SliceSize, I)),
          IVal);
    }
    return true;
  };

  BVTy ShuffleBuilder(ScalarTy, Params...);
  ResTy Res = ResTy();
  SmallVector<int> Mask;
  SmallVector<int> ExtractMask(GatheredScalars.size(), PoisonMaskElem);
  SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles;
  Value *ExtractVecBase = nullptr;
  bool UseVecBaseAsInput = false;
  SmallVector<std::optional<TTI::ShuffleKind>> GatherShuffles;
  SmallVector<SmallVector<const TreeEntry *>> Entries;
  Type *OrigScalarTy = GatheredScalars.front()->getType();
  auto *VecTy = getWidenedType(ScalarTy, GatheredScalars.size());
  unsigned NumParts = ::getNumberOfParts(*TTI, VecTy, GatheredScalars.size());
  // ...
  bool Resized = false;
  ExtractShuffles =
      tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
  if (!ExtractShuffles.empty()) {
    SmallVector<const TreeEntry *> ExtractEntries;
    for (auto [Idx, I] : enumerate(ExtractMask)) {
      if (I == PoisonMaskElem)
        continue;
      if (ArrayRef<TreeEntry *> TEs = /* ... */ getTreeEntries(E->Scalars[Idx]);
          !TEs.empty())
        ExtractEntries.append(TEs.begin(), TEs.end());
    }
    if (std::optional<ResTy> Delayed =
            ShuffleBuilder.needToDelay(E, ExtractEntries)) {
      // ...
      PostponedGathers.insert(E);
      return *Delayed;
    }
    if (Value *VecBase = ShuffleBuilder.adjustExtracts(
            E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) {
      ExtractVecBase = VecBase;
      if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
        if (VF == VecBaseTy->getNumElements() &&
            GatheredScalars.size() != VF) {
          Resized = true;
          GatheredScalars.append(VF - GatheredScalars.size(),
                                 PoisonValue::get(OrigScalarTy));
        }
    }
  }
  if (!ExtractShuffles.empty() || !E->hasState() ||
      E->getOpcode() != Instruction::Load ||
      (((E->hasState() && E->getOpcode() == Instruction::Load) ||
        any_of(E->Scalars, IsaPred<LoadInst>)) &&
       any_of(E->Scalars, [this](Value *V) {
         return isa<LoadInst>(V) && isVectorized(V);
       })) ||
      (E->hasState() && E->isAltShuffle()) ||
      all_of(E->Scalars, [this](Value *V) { return isVectorized(V); }) ||
      isSplat(E->Scalars) ||
      (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) {
    GatherShuffles =
        isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts);
  }
  if (!GatherShuffles.empty()) {
    if (std::optional<ResTy> Delayed =
            ShuffleBuilder.needToDelay(E, Entries)) {
      // ...
      PostponedGathers.insert(E);
      return *Delayed;
    }
    if (GatherShuffles.size() == 1 &&
        *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
        Entries.front().front()->isSame(E->Scalars)) {
      LLVM_DEBUG(dbgs() << "SLP: perfect diamond match for gather bundle "
                        /* ... */ << ".\n");
      Mask.resize(E->Scalars.size());
      const TreeEntry *FrontTE = Entries.front().front();
      if (FrontTE->ReorderIndices.empty() &&
          ((FrontTE->ReuseShuffleIndices.empty() &&
            E->Scalars.size() == FrontTE->Scalars.size()) ||
           (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) {
        std::iota(Mask.begin(), Mask.end(), 0);
      } else {
        for (auto [I, V] : enumerate(E->Scalars)) {
          if (isa<PoisonValue>(V)) {
            Mask[I] = PoisonMaskElem;
            continue;
          }
          Mask[I] = FrontTE->findLaneForValue(V);
        }
      }
      ShuffleBuilder.resetForSameNode();
      ShuffleBuilder.add(*FrontTE, Mask);
      Res = ShuffleBuilder.finalize(E->getCommonMask(), {}, {});
      return Res;
    }
    if (GatheredScalars.size() != VF &&
        any_of(Entries, [&](ArrayRef<const TreeEntry *> TEs) {
          return any_of(TEs, [&](const TreeEntry *TE) {
            return TE->getVectorFactor() == VF;
          });
        })) {
      Resized = true;
      GatheredScalars.append(VF - GatheredScalars.size(),
                             PoisonValue::get(OrigScalarTy));
    }
    for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
      if (Mask[I] != PoisonMaskElem)
        GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
    }
  }

  auto TryPackScalars = [&](SmallVectorImpl<Value *> &Scalars,
                            SmallVectorImpl<int> &ReuseMask,
                            bool IsRootPoison) {
    bool IsSplat = IsRootPoison && isSplat(Scalars) &&
                   (Scalars.size() > 2 || Scalars.front() == Scalars.back());
    // ...
    SmallVector<int> UndefPos;
    DenseMap<Value *, unsigned> UniquePositions;
    int NumNonConsts = 0;
    int SinglePos = 0;
    for (auto [I, V] : enumerate(Scalars)) {
      if (isa<UndefValue>(V)) {
        if (!isa<PoisonValue>(V)) {
          ReuseMask[I] = I;
          UndefPos.push_back(I);
        }
        continue;
      }
      if (isConstant(V)) {
        ReuseMask[I] = I;
        continue;
      }
      ++NumNonConsts;
      SinglePos = I;
      Value *OrigV = V;
      Scalars[I] = PoisonValue::get(OrigScalarTy);
      if (IsSplat) {
        Scalars.front() = OrigV;
        ReuseMask[I] = 0;
      } else {
        const auto Res = UniquePositions.try_emplace(OrigV, I);
        Scalars[Res.first->second] = OrigV;
        ReuseMask[I] = Res.first->second;
      }
    }
    if (NumNonConsts == 1) {
      if (IsSplat) {
        ReuseMask.assign(VF, PoisonMaskElem);
        std::swap(Scalars.front(), Scalars[SinglePos]);
        if (!UndefPos.empty() && UndefPos.front() == 0)
          Scalars.front() = UndefValue::get(OrigScalarTy);
      }
      ReuseMask[SinglePos] = SinglePos;
    } else if (!UndefPos.empty() && IsSplat) {
      auto *It = find_if(Scalars, [this, E](Value *V) {
        return !isa<UndefValue>(V) &&
               (isVectorized(V) || isGuaranteedNotToBePoison(V, AC) ||
                (E->UserTreeIndex &&
                 any_of(V->uses(), [E](const Use &U) {
                   return E->UserTreeIndex.EdgeIdx != U.getOperandNo() &&
                          is_contained(E->UserTreeIndex.UserTE->Scalars,
                                       U.getUser());
                 })));
      });
      if (It != Scalars.end()) {
        int Pos = std::distance(Scalars.begin(), It);
        for (int I : UndefPos) {
          ReuseMask[I] = Pos;
          if (I != Pos)
            Scalars[I] = PoisonValue::get(OrigScalarTy);
        }
      } else {
        for (int I : UndefPos) {
          ReuseMask[I] = PoisonMaskElem;
          if (isa<UndefValue>(Scalars[I]))
            Scalars[I] = PoisonValue::get(OrigScalarTy);
        }
        NeedFreeze = true;
      }
    }
  };

  if (!ExtractShuffles.empty() || !GatherShuffles.empty()) {
    bool IsNonPoisoned = true;
    bool IsUsedInExpr = true;
    Value *Vec1 = nullptr;
    if (!ExtractShuffles.empty()) {
      Value *Vec2 = nullptr;
      for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
        if (!Mask.empty() && Mask[I] != PoisonMaskElem)
          ExtractMask[I] = PoisonMaskElem;
      }
      if (UseVecBaseAsInput) {
        Vec1 = ExtractVecBase;
      } else {
        for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
          if (ExtractMask[I] == PoisonMaskElem)
            continue;
          auto *EI = cast<ExtractElementInst>(/* ... */ E->Scalars[I]);
          Value *VecOp = EI->getVectorOperand();
          if (ArrayRef<TreeEntry *> TEs = getTreeEntries(VecOp);
              !TEs.empty() && TEs.front()->VectorizedValue)
            VecOp = TEs.front()->VectorizedValue;
          if (!Vec1) {
            Vec1 = VecOp;
          } else if (Vec1 != VecOp) {
            assert((!Vec2 || Vec2 == VecOp) &&
                   "Expected only 1 or 2 vectors shuffle.");
            Vec2 = VecOp;
          }
        }
      }
      if (Vec2) {
        IsUsedInExpr = false;
        IsNonPoisoned &= isGuaranteedNotToBePoison(Vec1, AC) &&
                         isGuaranteedNotToBePoison(Vec2, AC);
        ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
      } else if (Vec1) {
        bool IsNotPoisonedVec = isGuaranteedNotToBePoison(Vec1, AC);
        IsUsedInExpr &= FindReusedSplat(
            ExtractMask,
            cast<FixedVectorType>(Vec1->getType())->getNumElements(), 0,
            ExtractMask.size(), IsNotPoisonedVec);
        ShuffleBuilder.add(Vec1, ExtractMask, /*ForExtracts=*/true);
        IsNonPoisoned &= IsNotPoisonedVec;
      } else {
        IsUsedInExpr = false;
        ShuffleBuilder.add(PoisonValue::get(VecTy), ExtractMask,
                           /*ForExtracts=*/true);
      }
    }
    if (!GatherShuffles.empty()) {
      unsigned SliceSize = getPartNumElems(E->Scalars.size(), NumParts);
      SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
      for (const auto [I, TEs] : enumerate(Entries)) {
        if (TEs.empty()) {
          assert(!GatherShuffles[I] &&
                 "No shuffles with empty entries list expected.");
          continue;
        }
        assert((TEs.size() == 1 || TEs.size() == 2) &&
               "Expected shuffle of 1 or 2 entries.");
        unsigned Limit = getNumElems(Mask.size(), SliceSize, I);
        auto SubMask = ArrayRef(Mask).slice(I * SliceSize, Limit);
        VecMask.assign(VecMask.size(), PoisonMaskElem);
        copy(SubMask, std::next(VecMask.begin(), I * SliceSize));
        if (TEs.size() == 1) {
          bool IsNotPoisonedVec =
              TEs.front()->VectorizedValue
                  ? isGuaranteedNotToBePoison(TEs.front()->VectorizedValue, AC)
                  : true;
          IsUsedInExpr &=
              FindReusedSplat(VecMask, TEs.front()->getVectorFactor(), I,
                              SliceSize, IsNotPoisonedVec);
          ShuffleBuilder.add(*TEs.front(), VecMask);
          IsNonPoisoned &= IsNotPoisonedVec;
        } else {
          IsUsedInExpr = false;
          ShuffleBuilder.add(*TEs.front(), *TEs.back(), VecMask);
          if (TEs.front()->VectorizedValue && TEs.back()->VectorizedValue)
            IsNonPoisoned &=
                isGuaranteedNotToBePoison(TEs.front()->VectorizedValue, AC) &&
                isGuaranteedNotToBePoison(TEs.back()->VectorizedValue, AC);
        }
      }
    }
    int EMSz = ExtractMask.size();
    int MSz = Mask.size();
    bool IsSingleShuffle = ExtractShuffles.empty() || GatherShuffles.empty();
    bool IsIdentityShuffle =
        ((UseVecBaseAsInput ||
          all_of(ExtractShuffles,
                 [](const std::optional<TTI::ShuffleKind> &SK) {
                   return SK.value_or(TTI::SK_PermuteTwoSrc) ==
                          TTI::SK_PermuteSingleSrc;
                 })) &&
         none_of(ExtractMask, [&](int I) { return I >= EMSz; }) &&
         ShuffleVectorInst::isIdentityMask(ExtractMask, EMSz)) ||
        (!GatherShuffles.empty() &&
         all_of(GatherShuffles,
                [](const std::optional<TTI::ShuffleKind> &SK) {
                  return SK.value_or(TTI::SK_PermuteTwoSrc) ==
                         TTI::SK_PermuteSingleSrc;
                }) &&
         none_of(Mask, [&](int I) { return I >= MSz; }) &&
         ShuffleVectorInst::isIdentityMask(Mask, MSz));
    bool EnoughConstsForShuffle =
        IsSingleShuffle &&
        // ...
        (!IsIdentityShuffle ||
         (GatheredScalars.size() == 2 &&
          any_of(GatheredScalars,
                 [](Value *V) { return !isa<UndefValue>(V); })) ||
         count_if(GatheredScalars, [](Value *V) {
           return isa<Constant>(V) && !isa<UndefValue>(V);
         }) > 1);
    SmallVector<Value *> NonConstants(GatheredScalars);
    for (int I = 0, Sz = GatheredScalars.size(); I < Sz; ++I) {
      if (EnoughConstsForShuffle && isa<Constant>(GatheredScalars[I]))
        NonConstants[I] = PoisonValue::get(OrigScalarTy);
      else
        GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
    }
    if (!all_of(GatheredScalars, IsaPred<PoisonValue>)) {
      SmallVector<int> BVMask(GatheredScalars.size(), PoisonMaskElem);
      TryPackScalars(GatheredScalars, BVMask, /*IsRootPoison=*/true);
      Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.size());
      ShuffleBuilder.add(BV, BVMask);
    }
    if (all_of(NonConstants, [=](Value *V) {
          return isa<PoisonValue>(V) ||
                 (IsSingleShuffle &&
                  ((IsIdentityShuffle && IsNonPoisoned) || IsUsedInExpr) &&
                  isa<UndefValue>(V));
        }))
      Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
                                    SubVectorsMask);
    else
      Res = ShuffleBuilder.finalize(
          E->ReuseShuffleIndices, SubVectors, SubVectorsMask, E->Scalars.size(),
          [&](Value *&Vec, SmallVectorImpl<int> &Mask, auto CreateShuffle) {
            bool IsSplat = isSplat(NonConstants);
            SmallVector<int> BVMask(Mask.size(), PoisonMaskElem);
            TryPackScalars(NonConstants, BVMask, /*IsRootPoison=*/false);
            auto CheckIfSplatIsProfitable = [&]() {
              constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
              Value *V = *find_if_not(NonConstants, IsaPred<UndefValue>);
              if (isa<ExtractElementInst>(V) || isVectorized(V))
                return false;
              InstructionCost SplatCost = TTI->getVectorInstrCost(
                  Instruction::InsertElement, VecTy, CostKind, 0,
                  PoisonValue::get(VecTy), V);
              SmallVector<int> NewMask(Mask.begin(), Mask.end());
              for (auto [Idx, I] : enumerate(BVMask))
                if (I != PoisonMaskElem)
                  NewMask[Idx] = Mask.size();
              SplatCost += ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy,
                                            NewMask, CostKind);
              InstructionCost BVCost = TTI->getVectorInstrCost(
                  Instruction::InsertElement, VecTy, CostKind,
                  *find_if(Mask, [](int I) { return I != PoisonMaskElem; }),
                  Vec, V);
              if (count(BVMask, PoisonMaskElem) <
                  static_cast<int>(BVMask.size() - 1)) {
                SmallVector<int> NewMask(Mask.begin(), Mask.end());
                for (auto [Idx, I] : enumerate(BVMask))
                  if (I != PoisonMaskElem)
                    NewMask[Idx] = I;
                BVCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
                                           VecTy, NewMask, CostKind);
              }
              return SplatCost <= BVCost;
            };
            if (!IsSplat || Mask.size() <= 2 || !CheckIfSplatIsProfitable()) {
              for (auto [Idx, I] : enumerate(BVMask))
                if (I != PoisonMaskElem)
                  Mask[Idx] = I;
              Vec = ShuffleBuilder.gather(NonConstants, Mask.size(), Vec);
            } else {
              // ...
              Value *BV = ShuffleBuilder.gather(Values, BVMask.size());
              SmallVector<int> SplatMask(BVMask.size(), PoisonMaskElem);
              transform(BVMask, SplatMask.begin(), [](int I) {
                return I == PoisonMaskElem ? PoisonMaskElem : 0;
              });
              if (!ShuffleVectorInst::isIdentityMask(SplatMask, VF))
                BV = CreateShuffle(BV, nullptr, SplatMask);
              for (auto [Idx, I] : enumerate(BVMask))
                if (I != PoisonMaskElem)
                  Mask[Idx] = BVMask.size() + Idx;
              Vec = CreateShuffle(Vec, BV, Mask);
            }
          });
  } else if (!allConstant(GatheredScalars)) {
    SmallVector<int> ReuseMask(GatheredScalars.size(), PoisonMaskElem);
    TryPackScalars(GatheredScalars, ReuseMask, /*IsRootPoison=*/true);
    Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.size());
    ShuffleBuilder.add(BV, ReuseMask);
    Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
                                  SubVectorsMask);
  } else {
    SmallVector<int> Mask(GatheredScalars.size(), PoisonMaskElem);
    for (auto [I, V] : enumerate(GatheredScalars)) {
      if (!isa<PoisonValue>(V))
        Mask[I] = I;
    }
    Value *BV = ShuffleBuilder.gather(GatheredScalars);
    ShuffleBuilder.add(BV, Mask);
    Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
                                  SubVectorsMask);
  }
  if (NeedFreeze)
    Res = ShuffleBuilder.createFreeze(Res);
  return Res;
}
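// createBuildVector instantiates processBuildVector with the IR-emitting
// ShuffleInstructionBuilder; the cost model reuses the same template with a
// cost-estimating builder, so emission and costing stay in sync.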
Value *BoUpSLP::createBuildVector(const TreeEntry *E, Type *ScalarTy) {
  for (auto [EIdx, _] : E->CombinedEntriesWithIndices)
    (void)vectorizeTree(VectorizableTree[EIdx].get());
  return processBuildVector<ShuffleInstructionBuilder, Value *>(E, ScalarTy,
                                                                Builder, *this);
}

// ... (helper elided)
//   for (Value *V : VL)
//     ...
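// vectorizeTree(TreeEntry *) emits code for a single tree entry. Gather and
// split nodes are handled up front; every other node dispatches on the
// (possibly alternate) opcode in the switch below, caches its result in
// E->VectorizedValue and applies FinalShuffle for reuse/reorder masks.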
Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
  IRBuilderBase::InsertPointGuard Guard(Builder);

  Value *V = E->Scalars.front();
  Type *ScalarTy = V->getType();
  // ...
  auto It = MinBWs.find(E);
  if (It != MinBWs.end()) {
    // ...
  }
  auto *VecTy = getWidenedType(ScalarTy, E->Scalars.size());
  // ...
  if (E->VectorizedValue)
    return E->VectorizedValue;

  if (E->isGather()) {
    // ...
    if (E->hasState() && E->Idx == 0 && !UserIgnoreList)
      setInsertPointAfterBundle(E);
    Value *Vec = createBuildVector(E, ScalarTy);
    E->VectorizedValue = Vec;
    return Vec;
  }
  if (E->State == TreeEntry::SplitVectorize) {
    assert(E->CombinedEntriesWithIndices.size() == 2 &&
           "Expected exactly 2 combined entries.");
    setInsertPointAfterBundle(E);
    TreeEntry &OpTE1 =
        *VectorizableTree[E->CombinedEntriesWithIndices.front().first];
    assert(OpTE1.isSame(
               ArrayRef(E->Scalars).take_front(OpTE1.getVectorFactor())) &&
           "Expected same first part of scalars.");
    Value *Op1 = vectorizeTree(&OpTE1);
    TreeEntry &OpTE2 =
        *VectorizableTree[E->CombinedEntriesWithIndices.back().first];
    assert(
        OpTE2.isSame(ArrayRef(E->Scalars).take_back(OpTE2.getVectorFactor())) &&
        "Expected same second part of scalars.");
    Value *Op2 = vectorizeTree(&OpTE2);
    auto GetOperandSignedness = [&](const TreeEntry *OpE) {
      bool IsSigned = false;
      auto It = MinBWs.find(OpE);
      if (It != MinBWs.end())
        IsSigned = It->second.second;
      else
        IsSigned = any_of(OpE->Scalars, [&](Value *R) {
          if (isa<PoisonValue>(R))
            return false;
          return !isKnownNonNegative(R, SimplifyQuery(*DL));
        });
      return IsSigned;
    };
    if (cast<VectorType>(Op1->getType())->getElementType() != ScalarTy) {
      // ...
      Op1 = Builder.CreateIntCast(
          Op1, getWidenedType(ScalarTy, OpTE1.getVectorFactor()),
          GetOperandSignedness(&OpTE1));
    }
    if (cast<VectorType>(Op2->getType())->getElementType() != ScalarTy) {
      // ...
      Op2 = Builder.CreateIntCast(
          Op2, getWidenedType(ScalarTy, OpTE2.getVectorFactor()),
          GetOperandSignedness(&OpTE2));
    }
    if (E->ReorderIndices.empty()) {
      SmallVector<int> Mask(E->getVectorFactor(), PoisonMaskElem);
      std::iota(
          Mask.begin(),
          std::next(Mask.begin(), E->CombinedEntriesWithIndices.back().second),
          0);
      unsigned ScalarTyNumElements = getNumElements(ScalarTy);
      if (ScalarTyNumElements != 1) {
        // ...
        Value *Vec = Builder.CreateShuffleVector(Op1, Mask);
        Vec = createInsertVector(Builder, Vec, Op2,
                                 E->CombinedEntriesWithIndices.back().second *
                                     ScalarTyNumElements);
        E->VectorizedValue = Vec;
        return Vec;
      }
      unsigned CommonVF =
          std::max(OpTE1.getVectorFactor(), OpTE2.getVectorFactor());
      if (getNumElements(Op1->getType()) != CommonVF) {
        SmallVector<int> Mask(CommonVF, PoisonMaskElem);
        std::iota(Mask.begin(),
                  std::next(Mask.begin(), OpTE1.getVectorFactor()), 0);
        Op1 = Builder.CreateShuffleVector(Op1, Mask);
      }
      if (getNumElements(Op2->getType()) != CommonVF) {
        SmallVector<int> Mask(CommonVF, PoisonMaskElem);
        std::iota(Mask.begin(),
                  std::next(Mask.begin(), OpTE2.getVectorFactor()), 0);
        Op2 = Builder.CreateShuffleVector(Op2, Mask);
      }
    }
    Value *Vec = Builder.CreateShuffleVector(Op1, Op2, E->getSplitMask());
    E->VectorizedValue = Vec;
    return Vec;
  }

  bool IsReverseOrder =
      !E->ReorderIndices.empty() && isReverseOrder(E->ReorderIndices);
  auto FinalShuffle = [&](Value *V, const TreeEntry *E) {
    ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, *this);
    if (E->getOpcode() == Instruction::Store &&
        E->State == TreeEntry::Vectorize) {
      ArrayRef<int> Mask =
          ArrayRef(reinterpret_cast<const int *>(E->ReorderIndices.begin()),
                   E->ReorderIndices.size());
      ShuffleBuilder.add(V, Mask);
    } else if ((E->State == TreeEntry::StridedVectorize && IsReverseOrder) ||
               E->State == TreeEntry::CompressVectorize) {
      ShuffleBuilder.addOrdered(V, {});
    } else {
      ShuffleBuilder.addOrdered(V, E->ReorderIndices);
    }
    SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
        E->CombinedEntriesWithIndices.size());
    transform(
        E->CombinedEntriesWithIndices, SubVectors.begin(), [&](const auto &P) {
          return std::make_pair(VectorizableTree[P.first].get(), P.second);
        });
    assert(
        (E->CombinedEntriesWithIndices.empty() || E->ReorderIndices.empty()) &&
        "Expected either combined subnodes or reordering");
    return ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors, {});
  };

  assert(!E->isGather() && "Unhandled state");
  unsigned ShuffleOrOp =
      E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
  Instruction *VL0 = E->getMainOp();
  auto GetOperandSignedness = [&](unsigned Idx) {
    const TreeEntry *OpE = getOperandEntry(E, Idx);
    bool IsSigned = false;
    auto It = MinBWs.find(OpE);
    if (It != MinBWs.end())
      IsSigned = It->second.second;
    else
      IsSigned = any_of(OpE->Scalars, [&](Value *R) {
        if (isa<PoisonValue>(R))
          return false;
        return !isKnownNonNegative(R, SimplifyQuery(*DL));
      });
    return IsSigned;
  };
  switch (ShuffleOrOp) {
  case Instruction::PHI: {
    assert((E->ReorderIndices.empty() || !E->ReuseShuffleIndices.empty() ||
            E != VectorizableTree.front().get() || E->UserTreeIndex) &&
           "PHI reordering is free.");
    auto *PH = cast<PHINode>(VL0);
    Builder.SetInsertPoint(PH->getParent(),
                           PH->getParent()->getFirstNonPHIIt());
    Builder.SetCurrentDebugLocation(PH->getDebugLoc());
    PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
    Value *V = NewPhi;

    // ...
    Builder.SetInsertPoint(PH->getParent(),
                           PH->getParent()->getFirstInsertionPt());
    Builder.SetCurrentDebugLocation(PH->getDebugLoc());

    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    // ...

    SmallPtrSet<BasicBlock *, 4> VisitedBBs;

    for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
      BasicBlock *IBB = PH->getIncomingBlock(I);

      if (NewPhi->getNumIncomingValues() == PH->getNumIncomingValues()) {
        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
        return V;
      }

      if (!VisitedBBs.insert(IBB).second) {
        Value *VecOp = NewPhi->getIncomingValueForBlock(IBB);
        NewPhi->addIncoming(VecOp, IBB);
        TreeEntry *OpTE = getOperandEntry(E, I);
        assert(!OpTE->VectorizedValue && "Expected no vectorized value.");
        OpTE->VectorizedValue = VecOp;
        continue;
      }

      Builder.SetInsertPoint(IBB->getTerminator());
      Builder.SetCurrentDebugLocation(PH->getDebugLoc());
      Value *Vec = vectorizeOperand(E, I);
      if (VecTy != Vec->getType()) {
        assert((It != MinBWs.end() || getOperandEntry(E, I)->isGather() ||
                MinBWs.contains(getOperandEntry(E, I))) &&
               "Expected item in MinBWs.");
        Vec = Builder.CreateIntCast(Vec, VecTy, GetOperandSignedness(I));
      }
      NewPhi->addIncoming(Vec, IBB);
    }

    assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
           "Invalid number of incoming values");
    assert(E->VectorizedValue && "Expected vectorized value.");
    return E->VectorizedValue;
  }
  case Instruction::ExtractElement: {
    Value *V = E->getSingleOperand(0);
    setInsertPointAfterBundle(E);
    V = FinalShuffle(V, E);
    E->VectorizedValue = V;
    return V;
  }
  case Instruction::ExtractValue: {
    auto *LI = cast<LoadInst>(E->getSingleOperand(0));
    Builder.SetInsertPoint(LI);
    Value *Ptr = LI->getPointerOperand();
    LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign());
    Value *NewV = ::propagateMetadata(V, E->Scalars);
    NewV = FinalShuffle(NewV, E);
    E->VectorizedValue = NewV;
    return NewV;
  }
  case Instruction::InsertElement: {
    assert(E->ReuseShuffleIndices.empty() && "All inserts should be unique");
    if (const TreeEntry *OpE = getOperandEntry(E, 1);
        OpE && !OpE->isGather() && OpE->hasState() &&
        !OpE->hasCopyableElements())
      Builder.SetInsertPoint(cast<Instruction>(E->Scalars.back()));
    else
      setInsertPointAfterBundle(E);
    Value *V = vectorizeOperand(E, 1);
    ArrayRef<Value *> Op = E->getOperand(1);
    Type *ScalarTy = Op.front()->getType();
    if (cast<VectorType>(V->getType())->getElementType() != ScalarTy) {
      // ...
      std::pair<unsigned, bool> Res = MinBWs.lookup(getOperandEntry(E, 1));
      assert(Res.first > 0 && "Expected item in MinBWs.");
      V = Builder.CreateIntCast(
          V,
          getWidenedType(ScalarTy,
                         cast<FixedVectorType>(V->getType())
                             ->getNumElements()),
          Res.second);
    }

    auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
      return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
    }));
    const unsigned NumElts =
        cast<FixedVectorType>(FirstInsert->getType())->getNumElements();
    const unsigned NumScalars = E->Scalars.size();

    unsigned Offset = *getElementIndex(VL0);
    assert(Offset < NumElts && "Failed to find vector index offset");

    SmallVector<int> Mask;
    if (!E->ReorderIndices.empty()) {
      inversePermutation(E->ReorderIndices, Mask);
      Mask.append(NumElts - NumScalars, PoisonMaskElem);
    } else {
      Mask.assign(NumElts, PoisonMaskElem);
      std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0);
    }
    bool IsIdentity = true;
    SmallVector<int> PrevMask(NumElts, PoisonMaskElem);
    Mask.swap(PrevMask);
    for (unsigned I = 0; I < NumScalars; ++I) {
      Value *Scalar = E->Scalars[PrevMask[I]];
      unsigned InsertIdx = *getElementIndex(Scalar);
      IsIdentity &= InsertIdx - Offset == I;
      Mask[InsertIdx - Offset] = I;
    }
    if (!IsIdentity || NumElts != NumScalars) {
      Value *V2 = nullptr;
      bool IsVNonPoisonous =
          !isConstant(V) && isGuaranteedNotToBePoison(V, AC);
      SmallVector<int> InsertMask(Mask);
      if (NumElts != NumScalars && Offset == 0) {
        InsertElementInst *Ins = cast<InsertElementInst>(VL0);
        do {
          std::optional<unsigned> InsertIdx = getElementIndex(Ins);
          if (!InsertIdx)
            break;
          if (InsertMask[*InsertIdx] == PoisonMaskElem)
            InsertMask[*InsertIdx] = *InsertIdx;
          if (!Ins->hasOneUse())
            break;
          Ins = dyn_cast_or_null<InsertElementInst>(
              Ins->getUniqueUndroppableUser());
        } while (Ins);
        SmallBitVector UseMask =
            buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
        SmallBitVector IsFirstPoison =
            isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
        SmallBitVector IsFirstUndef =
            isUndefVector(FirstInsert->getOperand(0), UseMask);
        if (!IsFirstPoison.all()) {
          unsigned Idx = 0;
          for (unsigned I = 0; I < NumElts; I++) {
            if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I) &&
                IsFirstUndef.test(I)) {
              if (IsVNonPoisonous) {
                InsertMask[I] = I < NumScalars ? I : 0;
                continue;
              }
              if (!V2)
                V2 = UndefValue::get(V->getType());
              if (Idx >= NumScalars)
                Idx = NumScalars - 1;
              InsertMask[I] = NumScalars + Idx;
              ++Idx;
            } else if (InsertMask[I] != PoisonMaskElem &&
                       Mask[I] == PoisonMaskElem) {
              InsertMask[I] = PoisonMaskElem;
            }
          }
        } else {
          InsertMask = Mask;
        }
      }
      if (!V2)
        V2 = PoisonValue::get(V->getType());
      V = Builder.CreateShuffleVector(V, V2, InsertMask);
      if (auto *I = dyn_cast<Instruction>(V)) {
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
    }

    SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
    for (unsigned I = 0; I < NumElts; I++) {
      if (Mask[I] != PoisonMaskElem)
        InsertMask[Offset + I] = I;
    }
    SmallBitVector UseMask =
        buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
    SmallBitVector IsFirstUndef =
        isUndefVector(FirstInsert->getOperand(0), UseMask);
    if ((!IsIdentity || Offset != 0 || !IsFirstUndef.all()) &&
        NumElts != NumScalars) {
      if (IsFirstUndef.all()) {
        if (!ShuffleVectorInst::isIdentityMask(InsertMask, NumElts)) {
          SmallBitVector IsFirstPoison =
              isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
          if (!IsFirstPoison.all()) {
            for (unsigned I = 0; I < NumElts; I++) {
              if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I))
                InsertMask[I] = I + NumElts;
            }
          }
          V = Builder.CreateShuffleVector(
              V,
              IsFirstPoison.all() ? PoisonValue::get(V->getType())
                                  : FirstInsert->getOperand(0),
              InsertMask, cast<Instruction>(E->Scalars.back())->getName());
          if (auto *I = dyn_cast<Instruction>(V)) {
            GatherShuffleExtractSeq.insert(I);
            CSEBlocks.insert(I->getParent());
          }
        }
      } else {
        SmallBitVector IsFirstPoison =
            isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
        for (unsigned I = 0; I < NumElts; I++) {
          if (InsertMask[I] == PoisonMaskElem)
            InsertMask[I] = IsFirstPoison.test(I) ? PoisonMaskElem : I;
          else
            InsertMask[I] += NumElts;
        }
        V = Builder.CreateShuffleVector(
            FirstInsert->getOperand(0), V, InsertMask,
            cast<Instruction>(E->Scalars.back())->getName());
        if (auto *I = dyn_cast<Instruction>(V)) {
          GatherShuffleExtractSeq.insert(I);
          CSEBlocks.insert(I->getParent());
        }
      }
    }

    ++NumVectorInstructions;
    E->VectorizedValue = V;
    return V;
  }
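  // For casts the vector opcode may differ from the scalar one (VecOpcode)
  // when operands were demoted via MinBWs: equal bit widths degenerate to a
  // bitcast, narrowing to a trunc, and widening to sext/zext depending on the
  // recorded signedness.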
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    setInsertPointAfterBundle(E);

    Value *InVec = vectorizeOperand(E, 0);
    auto VecOpcode = ShuffleOrOp;
    Type *SrcScalarTy = cast<VectorType>(InVec->getType())->getElementType();
    auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
    if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
        (SrcIt != MinBWs.end() || It != MinBWs.end() ||
         SrcScalarTy != VL0->getOperand(0)->getType()->getScalarType())) {
      // Check if the values are candidates to demote.
      unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy);
      if (SrcIt != MinBWs.end())
        SrcBWSz = SrcIt->second.first;
      unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType());
      if (BWSz == SrcBWSz) {
        VecOpcode = Instruction::BitCast;
      } else if (BWSz < SrcBWSz) {
        VecOpcode = Instruction::Trunc;
      } else if (It != MinBWs.end()) {
        assert(BWSz > SrcBWSz && "Invalid cast!");
        VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
      } else if (SrcIt != MinBWs.end()) {
        assert(BWSz > SrcBWSz && "Invalid cast!");
        VecOpcode =
            SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
      }
    } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
               !SrcIt->second.second) {
      VecOpcode = Instruction::UIToFP;
    }
    Value *V =
        (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
            ? InVec
            : Builder.CreateCast(static_cast<Instruction::CastOps>(VecOpcode),
                                 InVec, VecTy);
    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::FCmp:
  case Instruction::ICmp: {
    setInsertPointAfterBundle(E);

    Value *L = vectorizeOperand(E, 0);
    Value *R = vectorizeOperand(E, 1);
    if (L->getType() != R->getType()) {
      assert((getOperandEntry(E, 0)->isGather() ||
              getOperandEntry(E, 1)->isGather() ||
              MinBWs.contains(getOperandEntry(E, 0)) ||
              MinBWs.contains(getOperandEntry(E, 1))) &&
             "Expected item in MinBWs.");
      if (cast<VectorType>(L->getType())
              ->getElementType()
              ->getIntegerBitWidth() < cast<VectorType>(R->getType())
                                           ->getElementType()
                                           ->getIntegerBitWidth()) {
        Type *CastTy = R->getType();
        L = Builder.CreateIntCast(L, CastTy, GetOperandSignedness(0));
      } else {
        Type *CastTy = L->getType();
        R = Builder.CreateIntCast(R, CastTy, GetOperandSignedness(1));
      }
    }
    CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
    Value *V = Builder.CreateCmp(P0, L, R);
    propagateIRFlags(V, E->Scalars, VL0);
    if (auto *ICmp = dyn_cast<ICmpInst>(V); ICmp && It == MinBWs.end())
      ICmp->setSameSign(/*B=*/false);
    // ...
    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::Select: {
    setInsertPointAfterBundle(E);

    Value *Cond = vectorizeOperand(E, 0);
    Value *True = vectorizeOperand(E, 1);
    Value *False = vectorizeOperand(E, 2);
    if (True->getType() != VecTy || False->getType() != VecTy) {
      assert((It != MinBWs.end() || getOperandEntry(E, 1)->isGather() ||
              getOperandEntry(E, 2)->isGather() ||
              MinBWs.contains(getOperandEntry(E, 1)) ||
              MinBWs.contains(getOperandEntry(E, 2))) &&
             "Expected item in MinBWs.");
      if (True->getType() != VecTy)
        True = Builder.CreateIntCast(True, VecTy, GetOperandSignedness(1));
      if (False->getType() != VecTy)
        False = Builder.CreateIntCast(False, VecTy, GetOperandSignedness(2));
    }

    unsigned CondNumElements = getNumElements(Cond->getType());
    unsigned TrueNumElements = getNumElements(True->getType());
    assert(TrueNumElements >= CondNumElements &&
           TrueNumElements % CondNumElements == 0 &&
           "Cannot vectorize Instruction::Select");
    assert(TrueNumElements == getNumElements(False->getType()) &&
           "Cannot vectorize Instruction::Select");
    if (CondNumElements != TrueNumElements) {
      Cond = Builder.CreateShuffleVector(
          Cond, createReplicatedMask(TrueNumElements / CondNumElements,
                                     CondNumElements));
    }
    assert(getNumElements(Cond->getType()) == TrueNumElements &&
           "Cannot vectorize Instruction::Select");
    Value *V = Builder.CreateSelectWithUnknownProfile(Cond, True, False,
                                                      DEBUG_TYPE);
    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::FNeg: {
    setInsertPointAfterBundle(E);

    Value *Op = vectorizeOperand(E, 0);

    Value *V = Builder.CreateUnOp(
        static_cast<Instruction::UnaryOps>(E->getOpcode()), Op);
    propagateIRFlags(V, E->Scalars, VL0);
    if (auto *I = dyn_cast<Instruction>(V))
      V = ::propagateMetadata(I, E->Scalars);

    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;

    return V;
  }
  case Instruction::Freeze: {
    setInsertPointAfterBundle(E);

    Value *Op = vectorizeOperand(E, 0);

    if (Op->getType() != VecTy) {
      assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
              MinBWs.contains(getOperandEntry(E, 0))) &&
             "Expected item in MinBWs.");
      Op = Builder.CreateIntCast(Op, VecTy, GetOperandSignedness(0));
    }
    Value *V = Builder.CreateFreeze(Op);
    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;

    return V;
  }
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    setInsertPointAfterBundle(E);

    Value *LHS = vectorizeOperand(E, 0);
    Value *RHS = vectorizeOperand(E, 1);
    if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
      for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
        ArrayRef<Value *> Ops = E->getOperand(I);
        if (all_of(Ops, [&](Value *Op) {
              auto *CI = dyn_cast<ConstantInt>(Op);
              return CI && CI->getValue().countr_one() >= It->second.first;
            })) {
          Value *V = FinalShuffle(I == 0 ? RHS : LHS, E);
          E->VectorizedValue = V;
          ++NumVectorInstructions;
          return V;
        }
      }
    }
    if (LHS->getType() != VecTy || RHS->getType() != VecTy) {
      assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
              getOperandEntry(E, 1)->isGather() ||
              MinBWs.contains(getOperandEntry(E, 0)) ||
              MinBWs.contains(getOperandEntry(E, 1))) &&
             "Expected item in MinBWs.");
      if (LHS->getType() != VecTy)
        LHS = Builder.CreateIntCast(LHS, VecTy, GetOperandSignedness(0));
      if (RHS->getType() != VecTy)
        RHS = Builder.CreateIntCast(RHS, VecTy, GetOperandSignedness(1));
    }

    Value *V = Builder.CreateBinOp(
        static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
    propagateIRFlags(V, E->Scalars, VL0, It == MinBWs.end());
    if (auto *I = dyn_cast<Instruction>(V)) {
      V = ::propagateMetadata(I, E->Scalars);
      if (!MinBWs.contains(E) && ShuffleOrOp == Instruction::Sub &&
          any_of(E->Scalars, [](Value *V) {
            return isa<PoisonValue>(V) || isCommutative(cast<Instruction>(V));
          }))
        I->setHasNoUnsignedWrap(/*b=*/false);
    }

    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;

    return V;
  }
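  // Loads come in four flavors below: plain consecutive loads
  // (CreateAlignedLoad), compressing loads behind a mask (CompressVectorize),
  // strided loads emitted through the llvm.experimental.vp.strided.load
  // intrinsic, and masked gathers for arbitrary pointers (ScatterVectorize).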
  case Instruction::Load: {
    // Loads are inserted at the head of the tree because we don't want to
    // sink them all the way down past store instructions.
    setInsertPointAfterBundle(E);

    LoadInst *LI = cast<LoadInst>(VL0);
    Instruction *NewLI;
    FixedVectorType *StridedLoadTy = nullptr;
    Value *PO = LI->getPointerOperand();
    if (E->State == TreeEntry::Vectorize) {
      NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign());
    } else if (E->State == TreeEntry::CompressVectorize) {
      auto [CompressMask, LoadVecTy, InterleaveFactor, IsMasked] =
          CompressEntryToData.at(E);
      Align CommonAlignment = LI->getAlign();
      if (IsMasked) {
        // ...
        for (int I : CompressMask)
          /* mark the loaded lanes in the mask */;
        // ...
        MaskValues = replicateMask(MaskValues, VecTy->getNumElements());
        // ...
        NewLI = Builder.CreateMaskedLoad(LoadVecTy, PO, CommonAlignment,
                                         /* ... */ MaskValue);
      } else {
        NewLI = Builder.CreateAlignedLoad(LoadVecTy, PO, CommonAlignment);
      }
      // ... (apply CompressMask to the wide load)
    } else if (E->State == TreeEntry::StridedVectorize) {
      // ...
      PO = IsReverseOrder ? PtrN : Ptr0;
      Type *StrideTy = DL->getIndexType(PO->getType());
      Value *StrideVal;
      const StridedPtrInfo &SPtrInfo = TreeEntryToStridedPtrInfoMap.at(E);
      StridedLoadTy = SPtrInfo.Ty;
      assert(StridedLoadTy && "Missing StridedPoinerInfo for tree entry.");
      unsigned StridedLoadEC =
          StridedLoadTy->getElementCount().getKnownMinValue();
      Value *Stride = SPtrInfo.StrideVal;
      if (!Stride) {
        const SCEV *StrideSCEV = SPtrInfo.StrideSCEV;
        assert(StrideSCEV && "Neither StrideVal nor StrideSCEV were set.");
        SCEVExpander Expander(*SE, *DL, "strided-load-vec");
        Stride = Expander.expandCodeFor(StrideSCEV, StrideSCEV->getType(),
                                        &*Builder.GetInsertPoint());
      }
      Value *NewStride =
          Builder.CreateIntCast(Stride, StrideTy, /*isSigned=*/true);
      StrideVal = Builder.CreateMul(
          NewStride,
          ConstantInt::get(StrideTy,
                           (IsReverseOrder ? -1 : 1) *
                               static_cast<int>(
                                   DL->getTypeAllocSize(ScalarTy))));
      // ...
      auto *Inst = Builder.CreateIntrinsic(
          Intrinsic::experimental_vp_strided_load,
          {StridedLoadTy, PO->getType(), StrideTy},
          {PO, StrideVal,
           Builder.getAllOnesMask(StridedLoadTy->getElementCount()),
           Builder.getInt32(StridedLoadEC)});
      Inst->addParamAttr(
          /*ArgNo=*/0,
          Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
      NewLI = Inst;
    } else {
      assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
      Value *VecPtr = vectorizeOperand(E, 0);
      // ...
      if (isa<FixedVectorType>(ScalarTy)) {
        unsigned ScalarTyNumElements =
            cast<FixedVectorType>(ScalarTy)->getNumElements();
        unsigned VecTyNumElements =
            cast<FixedVectorType>(VecTy)->getNumElements();
        assert(VecTyNumElements % ScalarTyNumElements == 0 &&
               "Cannot expand getelementptr.");
        unsigned VF = VecTyNumElements / ScalarTyNumElements;
        SmallVector<Constant *> Indices(VecTyNumElements);
        transform(seq(VecTyNumElements), Indices.begin(), [=](unsigned I) {
          return Builder.getInt64(I % ScalarTyNumElements);
        });
        VecPtr = Builder.CreateGEP(
            VecTy->getElementType(),
            Builder.CreateShuffleVector(
                VecPtr, createReplicatedMask(ScalarTyNumElements, VF)),
            ConstantVector::get(Indices));
      }
      // ...
      NewLI = Builder.CreateMaskedGather(VecTy, VecPtr, CommonAlignment);
    }
    Value *V = E->State == TreeEntry::CompressVectorize
                   ? NewLI
                   : ::propagateMetadata(NewLI, E->Scalars);
    // ...
    V = FinalShuffle(V, E);
    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
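  // Stores mirror the load handling: consecutive stores become one aligned
  // vector store, strided ones go through llvm.experimental.vp.strided.store
  // with a negative element stride when the order is reversed.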
  case Instruction::Store: {
    auto *SI = cast<StoreInst>(VL0);
    setInsertPointAfterBundle(E);

    Value *VecValue = vectorizeOperand(E, 0);
    if (VecValue->getType() != VecTy)
      VecValue =
          Builder.CreateIntCast(VecValue, VecTy, GetOperandSignedness(0));
    VecValue = FinalShuffle(VecValue, E);

    Value *Ptr = SI->getPointerOperand();
    Instruction *ST;
    if (E->State == TreeEntry::Vectorize) {
      ST = Builder.CreateAlignedStore(VecValue, Ptr, SI->getAlign());
    } else {
      assert(E->State == TreeEntry::StridedVectorize &&
             "Expected either strided or consecutive stores.");
      if (!E->ReorderIndices.empty()) {
        SI = cast<StoreInst>(E->Scalars[E->ReorderIndices.front()]);
        Ptr = SI->getPointerOperand();
      }
      Align CommonAlignment = computeCommonAlignment<StoreInst>(E->Scalars);
      Type *StrideTy = DL->getIndexType(SI->getPointerOperandType());
      auto *Inst = Builder.CreateIntrinsic(
          Intrinsic::experimental_vp_strided_store,
          {VecTy, Ptr->getType(), StrideTy},
          {VecValue, Ptr,
           ConstantInt::get(
               StrideTy, -static_cast<int>(DL->getTypeAllocSize(ScalarTy))),
           Builder.getAllOnesMask(VecTy->getElementCount()),
           Builder.getInt32(E->Scalars.size())});
      Inst->addParamAttr(
          /*ArgNo=*/1,
          Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
      ST = Inst;
    }

    Value *V = ::propagateMetadata(ST, E->Scalars);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::GetElementPtr: {
    auto *GEP0 = cast<GetElementPtrInst>(VL0);
    setInsertPointAfterBundle(E);

    Value *Op0 = vectorizeOperand(E, 0);
    SmallVector<Value *> OpVecs;
    for (int J = 1, N = GEP0->getNumOperands(); J < N; ++J) {
      Value *OpVec = vectorizeOperand(E, J);
      OpVecs.push_back(OpVec);
    }

    Value *V = Builder.CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs);
    if (Instruction *I = dyn_cast<GetElementPtrInst>(V)) {
      SmallVector<Value *> GEPs;
      for (Value *V : E->Scalars) {
        if (isa<GetElementPtrInst>(V))
          GEPs.push_back(V);
      }
      V = ::propagateMetadata(I, GEPs);
    }

    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;

    return V;
  }
  case Instruction::Call: {
    CallInst *CI = cast<CallInst>(VL0);
    setInsertPointAfterBundle(E);

    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);

    SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(
        CI, ID, VecTy->getNumElements(),
        It != MinBWs.end() ? It->second.first : 0, TTI);
    auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
    bool UseIntrinsic = ID != Intrinsic::not_intrinsic &&
                        VecCallCosts.first <= VecCallCosts.second;

    Value *ScalarArg = nullptr;
    SmallVector<Value *> OpVecs;
    auto *CEI = cast<CallInst>(VL0);
    for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
      // ...
      if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(ID, I, TTI)) {
        ScalarArg = CEI->getArgOperand(I);
        // ...
        if (ID == Intrinsic::abs && It != MinBWs.end() &&
            It->second.first < DL->getTypeSizeInBits(CEI->getType()))
          ScalarArg = Builder.getFalse();
        OpVecs.push_back(ScalarArg);
        continue;
      }

      Value *OpVec = vectorizeOperand(E, I);
      ScalarArg = CEI->getArgOperand(I);
      if (cast<VectorType>(OpVec->getType())->getElementType() !=
              ScalarArg->getType()->getScalarType() &&
          It == MinBWs.end()) {
        auto *CastTy =
            getWidenedType(ScalarArg->getType(), VecTy->getNumElements());
        OpVec = Builder.CreateIntCast(OpVec, CastTy, GetOperandSignedness(I));
      } else if (It != MinBWs.end()) {
        OpVec = Builder.CreateIntCast(OpVec, VecTy, GetOperandSignedness(I));
      }
      OpVecs.push_back(OpVec);
      // ...
    }

    Function *CF;
    if (!UseIntrinsic) {
      VFShape Shape =
          VFShape::get(CI->getFunctionType(),
                       ElementCount::getFixed(VecTy->getNumElements()),
                       /*HasGlobalPred=*/false);
      CF = VFDatabase(*CI).getVectorizedFunction(Shape);
    } else {
      // ...
    }

    SmallVector<OperandBundleDef, 1> OpBundles;
    CI->getOperandBundlesAsDefs(OpBundles);
    Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);

    propagateIRFlags(V, E->Scalars, VL0);
    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::ShuffleVector: {
    Value *V;
    if (SLPReVec && !E->isAltShuffle()) {
      setInsertPointAfterBundle(E);
      Value *Src = vectorizeOperand(E, 0);
      SmallVector<int> ThisMask(calculateShufflevectorMask(E->Scalars));
      if (auto *SVSrc = dyn_cast<ShuffleVectorInst>(Src)) {
        SmallVector<int> NewMask(ThisMask.size());
        transform(ThisMask, NewMask.begin(), [&SVSrc](int Mask) {
          return SVSrc->getShuffleMask()[Mask];
        });
        V = Builder.CreateShuffleVector(SVSrc->getOperand(0),
                                        SVSrc->getOperand(1), NewMask);
      } else {
        V = Builder.CreateShuffleVector(Src, ThisMask);
      }
      propagateIRFlags(V, E->Scalars, VL0);
      if (auto *I = dyn_cast<Instruction>(V))
        V = ::propagateMetadata(I, E->Scalars);
      V = FinalShuffle(V, E);
    } else {
      assert(E->isAltShuffle() &&
             ((Instruction::isBinaryOp(E->getOpcode()) &&
               Instruction::isBinaryOp(E->getAltOpcode())) ||
              (Instruction::isCast(E->getOpcode()) &&
               Instruction::isCast(E->getAltOpcode())) ||
              (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
             "Invalid Shuffle Vector Operand");

      Value *LHS = nullptr, *RHS = nullptr;
      if (Instruction::isBinaryOp(E->getOpcode()) || isa<CmpInst>(VL0)) {
        setInsertPointAfterBundle(E);
        LHS = vectorizeOperand(E, 0);
        RHS = vectorizeOperand(E, 1);
      } else {
        setInsertPointAfterBundle(E);
        LHS = vectorizeOperand(E, 0);
      }
      if (LHS && RHS &&
          ((Instruction::isBinaryOp(E->getOpcode()) &&
            (LHS->getType() != VecTy || RHS->getType() != VecTy)) ||
           (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()))) {
        assert((It != MinBWs.end() ||
                getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
                getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
                MinBWs.contains(getOperandEntry(E, 0)) ||
                MinBWs.contains(getOperandEntry(E, 1))) &&
               "Expected item in MinBWs.");
        Type *CastTy = VecTy;
        if (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()) {
          if (cast<VectorType>(LHS->getType())
                  ->getElementType()
                  ->getIntegerBitWidth() < cast<VectorType>(RHS->getType())
                                               ->getElementType()
                                               ->getIntegerBitWidth())
            CastTy = RHS->getType();
          else
            CastTy = LHS->getType();
        }
        if (LHS->getType() != CastTy)
          LHS = Builder.CreateIntCast(LHS, CastTy, GetOperandSignedness(0));
        if (RHS->getType() != CastTy)
          RHS = Builder.CreateIntCast(RHS, CastTy, GetOperandSignedness(1));
      }

      Value *V0, *V1;
      if (Instruction::isBinaryOp(E->getOpcode())) {
        V0 = Builder.CreateBinOp(
            static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
        V1 = Builder.CreateBinOp(
            static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS);
      } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
        V0 = Builder.CreateCmp(CI0->getPredicate(), LHS, RHS);
        auto *AltCI = cast<CmpInst>(E->getAltOp());
        CmpInst::Predicate AltPred = AltCI->getPredicate();
        V1 = Builder.CreateCmp(AltPred, LHS, RHS);
      } else {
        if (LHS->getType()->isIntOrIntVectorTy() && ScalarTy->isIntegerTy()) {
          unsigned SrcBWSz = DL->getTypeSizeInBits(
              cast<VectorType>(LHS->getType())->getElementType());
          unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
          if (BWSz <= SrcBWSz) {
            if (BWSz < SrcBWSz)
              LHS = Builder.CreateIntCast(LHS, VecTy, It->second.first);
            assert(LHS->getType() == VecTy &&
                   "Expected same type as operand.");
            if (auto *I = dyn_cast<Instruction>(LHS))
              LHS = ::propagateMetadata(I, E->Scalars);
            LHS = FinalShuffle(LHS, E);
            E->VectorizedValue = LHS;
            ++NumVectorInstructions;
            return LHS;
          }
        }
        V0 = Builder.CreateCast(
            static_cast<Instruction::CastOps>(E->getOpcode()), LHS, VecTy);
        V1 = Builder.CreateCast(
            static_cast<Instruction::CastOps>(E->getAltOpcode()), LHS, VecTy);
      }
      for (Value *V : {V0, V1}) {
        if (auto *I = dyn_cast<Instruction>(V)) {
          GatherShuffleExtractSeq.insert(I);
          CSEBlocks.insert(I->getParent());
        }
      }

      SmallVector<Value *> OpScalars, AltScalars;
      SmallVector<int> Mask;
      E->buildAltOpShuffleMask(
          [E, this](Instruction *I) {
            assert(E->getMatchingMainOpOrAltOp(I) &&
                   "Unexpected main/alternate opcode");
            return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
                                          *TLI);
          },
          Mask, &OpScalars, &AltScalars);

      propagateIRFlags(V0, OpScalars, E->getMainOp(), It == MinBWs.end());
      propagateIRFlags(V1, AltScalars, E->getAltOp(), It == MinBWs.end());
      auto DropNuwFlag = [&](Value *Vec, unsigned Opcode) {
        if (auto *I = dyn_cast<Instruction>(Vec);
            I && Opcode == Instruction::Sub && !MinBWs.contains(E) &&
            any_of(E->Scalars, [](Value *V) {
              if (isa<PoisonValue>(V))
                return false;
              auto *IV = cast<Instruction>(V);
              return IV->getOpcode() == Instruction::Sub && isCommutative(IV);
            }))
          I->setHasNoUnsignedWrap(/*b=*/false);
      };
      DropNuwFlag(V0, E->getOpcode());
      DropNuwFlag(V1, E->getAltOpcode());

      V = Builder.CreateShuffleVector(V0, V1, Mask);
      if (auto *I = dyn_cast<Instruction>(V)) {
        V = ::propagateMetadata(I, E->Scalars);
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
    }

    E->VectorizedValue = V;
    ++NumVectorInstructions;

    return V;
  }
  default:
    llvm_unreachable("unknown inst");
  }
  return nullptr;
}
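// The outer vectorizeTree() runs the whole emission pipeline: schedule each
// block, pre-compute insertion points, emit gathered loads and postponed
// gather nodes, extract externally used scalars (reusing and widening
// extracts where possible), rebuild insertelement chains for shuffled
// inserts, and finally erase the now-dead scalar instructions.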
Value *BoUpSLP::vectorizeTree(
    // ...
    ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales) {
  // ...
  EntryToLastInstruction.clear();
  // ...
  for (auto &BSIter : BlocksSchedules)
    scheduleBlock(*this, BSIter.second.get());
  // ...
  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    if (TE->isGather())
      continue;
    (void)getLastInstructionInBundle(TE.get());
  }

  if (ReductionRoot)
    Builder.SetInsertPoint(ReductionRoot->getParent(),
                           ReductionRoot->getIterator());
  else
    Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());

  // ...
  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    if (TE->isGather() && !TE->VectorizedValue && TE->UserTreeIndex.UserTE &&
        TE->UserTreeIndex.UserTE->hasState() &&
        TE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
        (TE->UserTreeIndex.UserTE->getOpcode() != Instruction::PHI ||
         TE->UserTreeIndex.UserTE->isAltShuffle()) &&
        !TE->UserTreeIndex.UserTE->hasCopyableElements() &&
        all_of(TE->UserTreeIndex.UserTE->Scalars,
               [](Value *V) { return isUsedOutsideBlock(V); })) {
      Instruction &LastInst =
          getLastInstructionInBundle(TE->UserTreeIndex.UserTE);
      // ...
    }
  }
  // ...
  for (auto &Entry : GatherEntries) {
    // ...
    Builder.SetInsertPoint(Entry.second);
    Builder.SetCurrentDebugLocation(Entry.second->getDebugLoc());
    // ...
  }
  // ...
  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    if (GatheredLoadsEntriesFirst.has_value() &&
        TE->Idx >= *GatheredLoadsEntriesFirst && !TE->VectorizedValue &&
        (!TE->isGather() || TE->UserTreeIndex)) {
      assert((TE->UserTreeIndex ||
              (TE->getOpcode() == Instruction::Load && !TE->isGather())) &&
             "Expected gathered load node.");
      (void)vectorizeTree(TE.get());
    }
  }
  // ...
  for (const TreeEntry *E : PostponedNodes) {
    auto *TE = const_cast<TreeEntry *>(E);
    // ...
    auto *PrevVec = cast<Instruction>(TE->VectorizedValue);
    TE->VectorizedValue = nullptr;
    auto *UserI =
        cast<Instruction>(TE->UserTreeIndex.UserTE->VectorizedValue);
    // ...
    if (isa<PHINode>(UserI) ||
        (TE->UserTreeIndex.UserTE->hasState() &&
         TE->UserTreeIndex.UserTE->getOpcode() == Instruction::PHI)) {
      // ...
      if (UI->comesBefore(InsertPt))
        InsertPt = UI;
      // ...
      Builder.SetInsertPoint(InsertPt);
    } else {
      Builder.SetInsertPoint(PrevVec);
    }
    Builder.SetCurrentDebugLocation(UserI->getDebugLoc());
    Value *Vec = vectorizeTree(TE);
    if (auto *VecI = dyn_cast<Instruction>(Vec);
        VecI && VecI->getParent() == Builder.GetInsertBlock() &&
        Builder.GetInsertPoint()->comesBefore(VecI))
      VecI->moveBeforePreserving(*Builder.GetInsertBlock(),
                                 Builder.GetInsertPoint());
    if (Vec->getType() != PrevVec->getType()) {
      assert(Vec->getType()->isIntOrIntVectorTy() &&
             PrevVec->getType()->isIntOrIntVectorTy() &&
             "Expected integer vector types only.");
      std::optional<bool> IsSigned;
      for (Value *V : TE->Scalars) {
        // ...
        for (const TreeEntry *MNTE : getTreeEntries(V)) {
          auto It = MinBWs.find(MNTE);
          if (It != MinBWs.end()) {
            IsSigned = IsSigned.value_or(false) || It->second.second;
            if (*IsSigned)
              break;
          }
        }
        if (IsSigned.value_or(false))
          break;
        for (const TreeEntry *BVE : ValueToGatherNodes.lookup(V)) {
          auto It = MinBWs.find(BVE);
          if (It != MinBWs.end()) {
            IsSigned = IsSigned.value_or(false) || It->second.second;
            if (*IsSigned)
              break;
          }
        }
        if (IsSigned.value_or(false))
          break;
        if (auto *EE = dyn_cast<ExtractElementInst>(V)) {
          IsSigned =
              IsSigned.value_or(false) ||
              !isKnownNonNegative(EE->getVectorOperand(), SimplifyQuery(*DL));
          continue;
        }
        if (IsSigned.value_or(false))
          break;
      }
      if (IsSigned.value_or(false)) {
        // ...
        auto It = MinBWs.find(TE->UserTreeIndex.UserTE);
        if (It != MinBWs.end())
          IsSigned = It->second.second;
      }
      assert(IsSigned &&
             "Expected user node or perfect diamond match in MinBWs.");
      Vec = Builder.CreateIntCast(Vec, PrevVec->getType(), *IsSigned);
    }
    PrevVec->replaceAllUsesWith(Vec);
    PostponedValues.try_emplace(Vec).first->second.push_back(TE);
    // ...
    auto It = PostponedValues.find(PrevVec);
    if (It != PostponedValues.end()) {
      for (TreeEntry *VTE : It->getSecond())
        VTE->VectorizedValue = Vec;
    }
    eraseInstruction(PrevVec);
  }
  // ...
  for (const auto &ExternalUse : ExternalUses) {
    Value *Scalar = ExternalUse.Scalar;
    llvm::User *User = ExternalUse.User;

    // ...
    const TreeEntry *E = &ExternalUse.E;
    assert(E && "Invalid scalar");
    assert(!E->isGather() && "Extracting from a gather list");
    if (E->getOpcode() == Instruction::GetElementPtr &&
        !isa<GetElementPtrInst>(Scalar))
      continue;

    Value *Vec = E->VectorizedValue;
    assert(Vec && "Can't find vectorizable value");

    Value *Lane = Builder.getInt32(ExternalUse.Lane);
    auto ExtractAndExtendIfNeeded = [&](Value *Vec) {
      if (Scalar->getType() != Vec->getType()) {
        Value *Ex = nullptr;
        Value *ExV = nullptr;
        auto *Inst = dyn_cast<Instruction>(Scalar);
        bool ReplaceInst = Inst && ExternalUsesAsOriginalScalar.contains(Inst);
        auto It = ScalarToEEs.find(Scalar);
        if (It != ScalarToEEs.end()) {
          // ...
          auto EEIt = It->second.find(ReplaceInst ? Inst->getParent()
                                                  : Builder.GetInsertBlock());
          if (EEIt != It->second.end()) {
            Value *PrevV = EEIt->second.first;
            if (auto *I = dyn_cast<Instruction>(PrevV);
                I && !ReplaceInst &&
                Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() &&
                Builder.GetInsertPoint()->comesBefore(I)) {
              I->moveBefore(*Builder.GetInsertPoint()->getParent(),
                            Builder.GetInsertPoint());
              // ...
            }
            Ex = PrevV;
            ExV = EEIt->second.second ? EEIt->second.second : Ex;
          }
        }
        if (!Ex) {
          // ...
          if (ReplaceInst) {
            if (auto *EE = dyn_cast<ExtractElementInst>(Inst)) {
              IgnoredExtracts.insert(EE);
              Ex = EE;
            } else {
              auto *CloneInst = Inst->clone();
              CloneInst->insertBefore(Inst->getIterator());
              if (Inst->hasName())
                CloneInst->takeName(Inst);
              Ex = CloneInst;
            }
          } else if (auto *ES = dyn_cast<ExtractElementInst>(Scalar);
                     ES && isa<Instruction>(Vec)) {
            Value *V = ES->getVectorOperand();
            auto *IVec = cast<Instruction>(Vec);
            if (ArrayRef<TreeEntry *> ETEs = getTreeEntries(V); !ETEs.empty())
              V = ETEs.front()->VectorizedValue;
            if (auto *IV = dyn_cast<Instruction>(V);
                !IV || IV == Vec || IV->getParent() != IVec->getParent() ||
                IV->comesBefore(IVec))
              Ex = Builder.CreateExtractElement(V, ES->getIndexOperand());
            else
              Ex = Builder.CreateExtractElement(Vec, Lane);
          } else if (auto *VecTy =
                         dyn_cast<FixedVectorType>(Scalar->getType())) {
            unsigned VecTyNumElements = VecTy->getNumElements();
            // ...
            Ex = createExtractVector(Builder, Vec, VecTyNumElements,
                                     ExternalUse.Lane * VecTyNumElements);
          } else {
            Ex = Builder.CreateExtractElement(Vec, Lane);
          }
          // ...
          ExV = Ex;
          if (Scalar->getType() != Ex->getType())
            ExV = Builder.CreateIntCast(Ex, Scalar->getType(),
                                        MinBWs.find(E)->second.second);
          auto *I = dyn_cast<Instruction>(Ex);
          ScalarToEEs[Scalar].try_emplace(I ? I->getParent()
                                            : &F->getEntryBlock(),
                                          std::make_pair(Ex, ExV));
        }
        // ...
        if (auto *ExI = dyn_cast<Instruction>(Ex);
            ExI && !isa<PHINode>(ExI)) {
          GatherShuffleExtractSeq.insert(ExI);
          CSEBlocks.insert(ExI->getParent());
        }
        return ExV;
      }
      assert(isa<FixedVectorType>(Scalar->getType()) &&
             isa<InsertElementInst>(Scalar) &&
             "In-tree scalar of vector type is not insertelement?");
      // ...
      return Vec;
    };
    // ...
    if (!User) {
      if (!ScalarsWithNullptrUser.insert(Scalar).second)
        continue;
      assert(
          (ExternallyUsedValues.count(Scalar) ||
           ExternalUsesWithNonUsers.count(Scalar) ||
           ExternalUsesAsOriginalScalar.contains(Scalar) ||
           any_of(
               Scalar->users(),
               [&](llvm::User *U) {
                 if (ExternalUsesAsOriginalScalar.contains(U))
                   return true;
                 ArrayRef<TreeEntry *> UseEntries = getTreeEntries(U);
                 return !UseEntries.empty() &&
                        (E->State == TreeEntry::Vectorize ||
                         E->State == TreeEntry::StridedVectorize ||
                         E->State == TreeEntry::CompressVectorize) &&
                        any_of(UseEntries, [&, TTI = TTI](TreeEntry *UseEntry) {
                          return (UseEntry->State == TreeEntry::Vectorize ||
                                  UseEntry->State ==
                                      TreeEntry::StridedVectorize ||
                                  UseEntry->State ==
                                      TreeEntry::CompressVectorize) &&
                                 doesInTreeUserNeedToExtract(
                                     Scalar, getRootEntryInstruction(*UseEntry),
                                     TLI, TTI);
                        });
               })) &&
          "Scalar with nullptr User must be registered in "
          "ExternallyUsedValues map or remain as scalar in vectorized "
          "instructions");
      if (auto *VecI = dyn_cast<Instruction>(Vec)) {
        if (auto *PHI = dyn_cast<PHINode>(VecI)) {
          if (PHI->getParent()->isLandingPad())
            Builder.SetInsertPoint(
                PHI->getParent(),
                std::next(
                    PHI->getParent()->getLandingPadInst()->getIterator()));
          else
            Builder.SetInsertPoint(PHI->getParent(),
                                   PHI->getParent()->getFirstNonPHIIt());
        } else {
          Builder.SetInsertPoint(VecI->getParent(),
                                 std::next(VecI->getIterator()));
        }
      } else {
        Builder.SetInsertPoint(&F->getEntryBlock(),
                               F->getEntryBlock().begin());
      }
      Value *NewInst = ExtractAndExtendIfNeeded(Vec);
      if (Scalar != NewInst) {
        assert((!isa<ExtractElementInst>(Scalar) ||
                !IgnoredExtracts.contains(cast<ExtractElementInst>(Scalar))) &&
               "Extractelements should not be replaced.");
        Scalar->replaceAllUsesWith(NewInst);
      }
      continue;
    }

    if (auto *VU = dyn_cast<InsertElementInst>(User);
        VU && VU->getOperand(1) == Scalar) {
      // ...
      if (auto *FTy = dyn_cast<FixedVectorType>(User->getType())) {
        if (!UsedInserts.insert(VU).second)
          continue;
        auto BWIt = MinBWs.find(E);
        if (BWIt != MinBWs.end() && Vec->getType() != VU->getType()) {
          auto *ScalarTy = FTy->getElementType();
          auto Key = std::make_pair(Vec, ScalarTy);
          auto VecIt = VectorCasts.find(Key);
          if (VecIt == VectorCasts.end()) {
            IRBuilderBase::InsertPointGuard Guard(Builder);
            if (auto *IVec = dyn_cast<PHINode>(Vec)) {
              if (IVec->getParent()->isLandingPad())
                Builder.SetInsertPoint(IVec->getParent(),
                                       std::next(IVec->getParent()
                                                     ->getLandingPadInst()
                                                     ->getIterator()));
              else
                Builder.SetInsertPoint(
                    IVec->getParent()->getFirstNonPHIOrDbgOrLifetime());
            } else if (auto *IVec = dyn_cast<Instruction>(Vec)) {
              Builder.SetInsertPoint(IVec->getNextNode());
            }
            Vec = Builder.CreateIntCast(
                Vec,
                getWidenedType(
                    ScalarTy,
                    cast<FixedVectorType>(Vec->getType())->getNumElements()),
                BWIt->second.second);
            VectorCasts.try_emplace(Key, Vec);
          } else {
            Vec = VecIt->second;
          }
        }

        std::optional<unsigned> InsertIdx = getElementIndex(VU);
        if (InsertIdx) {
          auto *It = find_if(
              ShuffledInserts, [VU](const ShuffledInsertData<Value *> &Data) {
                // ...
                return areTwoInsertFromSameBuildVector(
                    VU, Data.InsertElements.front(),
                    [](InsertElementInst *II) { return II->getOperand(0); });
              });
          unsigned Idx = *InsertIdx;
          if (It == ShuffledInserts.end()) {
            (void)ShuffledInserts.emplace_back();
            It = std::next(ShuffledInserts.begin(),
                           ShuffledInserts.size() - 1);
          }
          SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];
          if (Mask.empty())
            Mask.assign(FTy->getNumElements(), PoisonMaskElem);
          Mask[Idx] = ExternalUse.Lane;
          It->InsertElements.push_back(cast<InsertElementInst>(User));
          continue;
        }
      }
    }

    if (auto *VecI = dyn_cast<Instruction>(Vec)) {
      if (PHINode *PH = dyn_cast<PHINode>(User)) {
        for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
          if (PH->getIncomingValue(I) == Scalar) {
            Instruction *IncomingTerminator =
                PH->getIncomingBlock(I)->getTerminator();
            if (isa<CatchSwitchInst>(IncomingTerminator)) {
              Builder.SetInsertPoint(VecI->getParent(),
                                     std::next(VecI->getIterator()));
            } else {
              Builder.SetInsertPoint(PH->getIncomingBlock(I)->getTerminator());
            }
            Value *NewInst = ExtractAndExtendIfNeeded(Vec);
            PH->setOperand(I, NewInst);
          }
        }
      } else {
        Builder.SetInsertPoint(cast<Instruction>(User));
        Value *NewInst = ExtractAndExtendIfNeeded(Vec);
        User->replaceUsesOfWith(Scalar, NewInst);
      }
    } else {
      Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
      Value *NewInst = ExtractAndExtendIfNeeded(Vec);
      User->replaceUsesOfWith(Scalar, NewInst);
    }
    // ...
  }
  auto CreateShuffle = [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
    SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
    SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
    int VF = cast<FixedVectorType>(V1->getType())->getNumElements();
    for (int I = 0, E = Mask.size(); I < E; ++I) {
      if (Mask[I] < VF)
        CombinedMask1[I] = Mask[I];
      else
        CombinedMask2[I] = Mask[I] - VF;
    }
    ShuffleInstructionBuilder ShuffleBuilder(
        cast<VectorType>(V1->getType())->getElementType(), Builder, *this);
    ShuffleBuilder.add(V1, CombinedMask1);
    if (V2)
      ShuffleBuilder.add(V2, CombinedMask2);
    return ShuffleBuilder.finalize({}, {}, {});
  };

  auto &&ResizeToVF = [&CreateShuffle](Value *Vec, ArrayRef<int> Mask,
                                       bool ForSingleMask) {
    unsigned VF = Mask.size();
    unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
    if (VF != VecVF) {
      if (any_of(Mask,
                 [VF](int Idx) { return Idx >= static_cast<int>(VF); })) {
        Vec = CreateShuffle(Vec, nullptr, Mask);
        return std::make_pair(Vec, true);
      }
      if (!ForSingleMask) {
        SmallVector<int> ResizeMask(VF, PoisonMaskElem);
        for (unsigned I = 0; I < VF; ++I) {
          if (Mask[I] != PoisonMaskElem)
            ResizeMask[Mask[I]] = Mask[I];
        }
        Vec = CreateShuffle(Vec, nullptr, ResizeMask);
      }
    }
    return std::make_pair(Vec, false);
  };
  // Perform shuffling of the vectorize tree entries for better handling of
  // external extracts.
  for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
    // Find the first and the last instruction in the list of insertelements.
    // ...
    InsertElementInst *FirstInsert = ShuffledInserts[I].InsertElements.front();
    InsertElementInst *LastInsert = ShuffledInserts[I].InsertElements.back();
    Builder.SetInsertPoint(LastInsert);
    auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
    Value *NewInst = performExtractsShuffleAction<Value>(
        MutableArrayRef(Vector.data(), Vector.size()),
        FirstInsert->getOperand(0),
        [](Value *Vec) {
          return cast<VectorType>(Vec->getType())
              ->getElementCount()
              .getKnownMinValue();
        },
        ResizeToVF,
        [FirstInsert, &CreateShuffle](ArrayRef<int> Mask,
                                      ArrayRef<Value *> Vals) {
          assert((Vals.size() == 1 || Vals.size() == 2) &&
                 "Expected exactly 1 or 2 input values.");
          if (Vals.size() == 1) {
            // Do not create a shuffle if the mask is a simple identity,
            // non-resizing mask.
            if (Mask.size() != cast<FixedVectorType>(Vals.front()->getType())
                                   ->getNumElements() ||
                !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
              return CreateShuffle(Vals.front(), nullptr, Mask);
            return Vals.front();
          }
          return CreateShuffle(Vals.front() ? Vals.front()
                                            : FirstInsert->getOperand(0),
                               Vals.back(), Mask);
        });
    auto It = ShuffledInserts[I].InsertElements.rbegin();
    // Rebuild the buildvector chain.
    InsertElementInst *II = nullptr;
    if (It != ShuffledInserts[I].InsertElements.rend())
      II = *It;
    SmallVector<Instruction *> Inserts;
    while (It != ShuffledInserts[I].InsertElements.rend()) {
      assert(II && "Must be an insertelement instruction.");
      if (*It == II)
        ++It;
      else
        Inserts.push_back(cast<Instruction>(II));
      II = dyn_cast<InsertElementInst>(II->getOperand(0));
    }
    for (Instruction *II : reverse(Inserts)) {
      II->replaceUsesOfWith(II->getOperand(0), NewInst);
      if (auto *NewI = dyn_cast<Instruction>(NewInst))
        if (II->getParent() == NewI->getParent() && II->comesBefore(NewI))
          II->moveAfter(NewI);
      NewInst = II;
    }
    LastInsert->replaceAllUsesWith(NewInst);
    for (InsertElementInst *IE : reverse(ShuffledInserts[I].InsertElements)) {
      IE->replaceUsesOfWith(IE->getOperand(0),
                            PoisonValue::get(IE->getOperand(0)->getType()));
      IE->replaceUsesOfWith(IE->getOperand(1),
                            PoisonValue::get(IE->getOperand(1)->getType()));
      eraseInstruction(IE);
    }
    CSEBlocks.insert(LastInsert->getParent());
  }
  SmallVector<Instruction *> RemovedInsts;
  // For each vectorized value:
  for (auto &TEPtr : VectorizableTree) {
    TreeEntry *Entry = TEPtr.get();

    // No need to handle users of gathered values.
    if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize)
      continue;

    assert(Entry->VectorizedValue && "Can't find vectorizable value");

    // For each lane:
    for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
      Value *Scalar = Entry->Scalars[Lane];

      if (Entry->getOpcode() == Instruction::GetElementPtr &&
          !isa<GetElementPtrInst>(Scalar))
        continue;
      if (auto *EE = dyn_cast<ExtractElementInst>(Scalar);
          EE && IgnoredExtracts.contains(EE))
        continue;
      if (isa<PoisonValue>(Scalar))
        continue;
#ifndef NDEBUG
      Type *Ty = Scalar->getType();
      if (!Ty->isVoidTy()) {
        for (User *U : Scalar->users()) {
          LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");

          // It is legal to delete users in the ignorelist.
          assert((isVectorized(U) ||
                  (UserIgnoreList && UserIgnoreList->contains(U)) ||
                  (isa_and_nonnull<Instruction>(U) &&
                   isDeleted(cast<Instruction>(U)))) &&
                 "Deleting out-of-tree value");
        }
      }
#endif
      LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
      auto *I = cast<Instruction>(Scalar);
      RemovedInsts.push_back(I);
    }
  }

  // Merge the DIAssignIDs from the about-to-be-deleted instructions into the
  // new vector instruction.
  if (auto *V = dyn_cast<Instruction>(VectorizableTree[0]->VectorizedValue))
    V->mergeDIAssignID(RemovedInsts);

  // Clear up reduction references, if any.
  if (UserIgnoreList) {
    for (Instruction *I : RemovedInsts) {
      const TreeEntry *IE = getTreeEntries(I).front();
      if (IE->Idx != 0 &&
          !(VectorizableTree.front()->isGather() && IE->UserTreeIndex &&
            (ValueToGatherNodes.lookup(I).contains(
                 VectorizableTree.front().get()) ||
             (IE->UserTreeIndex.UserTE == VectorizableTree.front().get() &&
              IE->UserTreeIndex.EdgeIdx == UINT_MAX))) &&
          !(VectorizableTree.front()->State == TreeEntry::SplitVectorize &&
            IE->UserTreeIndex &&
            is_contained(VectorizableTree.front()->Scalars, I)) &&
          !(GatheredLoadsEntriesFirst.has_value() &&
            IE->Idx >= *GatheredLoadsEntriesFirst &&
            VectorizableTree.front()->isGather() &&
            is_contained(VectorizableTree.front()->Scalars, I)) &&
          !(!VectorizableTree.front()->isGather() &&
            VectorizableTree.front()->isCopyableElement(I)))
        continue;
      SmallVector<SelectInst *> LogicalOpSelects;
      I->replaceUsesWithIf(PoisonValue::get(I->getType()), [&](Use &U) {
        // Do not replace condition of the logical op in form select <cond>.
        bool IsPoisoningLogicalOp = isa<SelectInst>(U.getUser()) &&
                                    (match(U.getUser(), m_LogicalAnd()) ||
                                     match(U.getUser(), m_LogicalOr())) &&
                                    U.getOperandNo() == 0;
        if (IsPoisoningLogicalOp) {
          LogicalOpSelects.push_back(cast<SelectInst>(U.getUser()));
          return false;
        }
        return UserIgnoreList->contains(U.getUser());
      });
      // Replace conditions of the poisoning logical ops with a non-poison
      // constant value.
      for (SelectInst *SI : LogicalOpSelects)
        SI->setCondition(
            Constant::getNullValue(SI->getCondition()->getType()));
    }
  }
  Builder.ClearInsertionPoint();
  InstrElementSize.clear();

  const TreeEntry &RootTE = *VectorizableTree.front();
  Value *Vec = RootTE.VectorizedValue;
  if (auto It = MinBWs.find(&RootTE); ReductionBitWidth != 0 &&
                                      It != MinBWs.end() &&
                                      ReductionBitWidth != It->second.first) {
    IRBuilder<>::InsertPointGuard Guard(Builder);
    Builder.SetInsertPoint(ReductionRoot->getParent(),
                           ReductionRoot->getIterator());
    Vec = Builder.CreateIntCast(
        Vec,
        VectorType::get(Builder.getIntNTy(ReductionBitWidth),
                        cast<VectorType>(Vec->getType())->getElementCount()),
        It->second.second);
  }
  return Vec;
}
void BoUpSLP::optimizeGatherSequence() {
  LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherShuffleExtractSeq.size()
                    << " gather sequences instructions.\n");
  // LICM InsertElementInst sequences.
  for (Instruction *I : GatherShuffleExtractSeq) {
    if (isDeleted(I))
      continue;

    // Check if this block is inside a loop.
    Loop *L = LI->getLoopFor(I->getParent());
    if (!L)
      continue;

    // Check if it has a preheader.
    BasicBlock *PreHeader = L->getLoopPreheader();
    if (!PreHeader)
      continue;

    // If the vector or the element that we insert into it are instructions
    // that are defined in this basic block then we can't hoist this
    // instruction.
    if (any_of(I->operands(), [L](Value *V) {
          auto *OpI = dyn_cast<Instruction>(V);
          return OpI && L->contains(OpI);
        }))
      continue;

    // We can hoist this instruction. Move it to the pre-header.
    I->moveBefore(PreHeader->getTerminator()->getIterator());
    CSEBlocks.insert(PreHeader);
  }

  // Make a list of all reachable blocks in our CSE queue.
  SmallVector<const DomTreeNode *, 8> CSEWorkList;
  CSEWorkList.reserve(CSEBlocks.size());
  for (BasicBlock *BB : CSEBlocks)
    if (DomTreeNode *N = DT->getNode(BB)) {
      assert(DT->isReachableFromEntry(N));
      CSEWorkList.push_back(N);
    }

  // Sort blocks by domination, so we visit a block after all blocks
  // dominating it are visited.
  llvm::sort(CSEWorkList, [](const DomTreeNode *A, const DomTreeNode *B) {
    assert((A == B) == (A->getDFSNumIn() == B->getDFSNumIn()) &&
           "Different nodes should have different DFS numbers");
    return A->getDFSNumIn() < B->getDFSNumIn();
  });

  // Less-defined shuffles can be replaced by more-defined copies. Between two
  // shuffles one is less defined if it has the same vector operands and its
  // mask indices are the same as in the first one or undefs.
  auto &&IsIdenticalOrLessDefined = [TTI = TTI](Instruction *I1,
                                                Instruction *I2,
                                                SmallVectorImpl<int> &NewMask) {
    if (I1->getType() != I2->getType())
      return false;
    auto *SI1 = dyn_cast<ShuffleVectorInst>(I1);
    auto *SI2 = dyn_cast<ShuffleVectorInst>(I2);
    if (!SI1 || !SI2)
      return I1->isIdenticalTo(I2);
    if (SI1->isIdenticalTo(SI2))
      return true;
    for (int I = 0, E = SI1->getNumOperands(); I < E; ++I)
      if (SI1->getOperand(I) != SI2->getOperand(I))
        return false;
    // Check if the second instruction is more defined than the first one.
    NewMask.assign(SI2->getShuffleMask().begin(), SI2->getShuffleMask().end());
    ArrayRef<int> SM1 = SI1->getShuffleMask();
    // Count trailing undefs in the mask to check the final number of used
    // registers.
    unsigned LastUndefsCnt = 0;
    for (int I = 0, E = NewMask.size(); I < E; ++I) {
      if (SM1[I] == PoisonMaskElem)
        ++LastUndefsCnt;
      else
        LastUndefsCnt = 0;
      if (NewMask[I] != PoisonMaskElem && SM1[I] != PoisonMaskElem &&
          NewMask[I] != SM1[I])
        return false;
      if (NewMask[I] == PoisonMaskElem)
        NewMask[I] = SM1[I];
    }
    // Check if the last undefs actually change the final number of used
    // vector registers.
    return SM1.size() - LastUndefsCnt > 1 &&
           TTI->getNumberOfParts(SI1->getType()) ==
               TTI->getNumberOfParts(
                   getWidenedType(SI1->getType()->getElementType(),
                                  SM1.size() - LastUndefsCnt));
  };
  // Perform O(N^2) search over the gather/shuffle sequences and merge
  // identical instructions.
  SmallVector<Instruction *, 16> Visited;
  for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
    assert(*I &&
           (I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
           "Worklist not sorted properly!");
    BasicBlock *BB = (*I)->getBlock();
    // For all instructions in blocks containing gather sequences:
    for (Instruction &In : llvm::make_early_inc_range(*BB)) {
      if (isDeleted(&In))
        continue;
      if (!isa<InsertElementInst, ExtractElementInst, ShuffleVectorInst>(
              &In) &&
          !GatherShuffleExtractSeq.contains(&In))
        continue;

      // Check if we can replace this instruction with any of the visited
      // instructions.
      bool Replaced = false;
      for (Instruction *&V : Visited) {
        SmallVector<int> NewMask;
        if (IsIdenticalOrLessDefined(&In, V, NewMask) &&
            DT->dominates(V->getParent(), In.getParent())) {
          In.replaceAllUsesWith(V);
          eraseInstruction(&In);
          if (auto *SI = dyn_cast<ShuffleVectorInst>(V))
            if (!NewMask.empty())
              SI->setShuffleMask(NewMask);
          Replaced = true;
          break;
        }
        if (isa<ShuffleVectorInst>(In) && isa<ShuffleVectorInst>(V) &&
            GatherShuffleExtractSeq.contains(V) &&
            IsIdenticalOrLessDefined(V, &In, NewMask) &&
            DT->dominates(In.getParent(), V->getParent())) {
          In.moveAfter(V);
          V->replaceAllUsesWith(&In);
          eraseInstruction(V);
          if (auto *SI = dyn_cast<ShuffleVectorInst>(&In))
            if (!NewMask.empty())
              SI->setShuffleMask(NewMask);
          V = &In;
          Replaced = true;
          break;
        }
      }
      if (!Replaced) {
        assert(!is_contained(Visited, &In));
        Visited.push_back(&In);
      }
    }
  }
  CSEBlocks.clear();
  GatherShuffleExtractSeq.clear();
}
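
/// Builds a scheduling bundle for \p VL: copyable elements get a fresh
/// ScheduleCopyableData entry, while regular members reuse their existing
/// per-instruction ScheduleData and are registered in ScheduledBundles.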
BoUpSLP::ScheduleBundle &BoUpSLP::BlockScheduling::buildBundle(
    ArrayRef<Value *> VL, const InstructionsState &S, const EdgeInfo &EI) {
  auto &BundlePtr =
      ScheduledBundlesList.emplace_back(std::make_unique<ScheduleBundle>());
  for (Value *V : VL) {
    if (S.isNonSchedulable(V))
      continue;
    auto *I = cast<Instruction>(V);
    if (S.isCopyableElement(V)) {
      // Add a copy of the instruction to the bundle.
      ScheduleCopyableData &SD =
          addScheduleCopyableData(EI, I, SchedulingRegionID, *BundlePtr);
      BundlePtr->add(&SD);
      continue;
    }
    ScheduleData *BundleMember = getScheduleData(V);
    assert(BundleMember && "no ScheduleData for bundle member "
                           "(maybe not in same basic block)");
    // Group the instructions to a bundle.
    BundlePtr->add(BundleMember);
    ScheduledBundles.try_emplace(I).first->getSecond().push_back(
        BundlePtr.get());
  }
  assert(BundlePtr && *BundlePtr && "Failed to find schedule bundle");
  return *BundlePtr;
}
std::optional<BoUpSLP::ScheduleBundle *>
BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
                                            const InstructionsState &S,
                                            const EdgeInfo &EI) {
  // ...
  if (S.areInstructionsWithCopyableElements() && EI && EI.UserTE->hasState() &&
      EI.UserTE->doesNotNeedToSchedule() &&
      EI.UserTE->getOpcode() != Instruction::PHI &&
      any_of(VL, [&](Value *V) {
        auto *I = dyn_cast<Instruction>(V);
        if (!I || I->hasOneUser())
          return false;
        for (User *U : I->users()) {
          auto *UI = cast<Instruction>(U);
          if (isa<BinaryOperator>(UI))
            return true;
        }
        return false;
      }))
    return std::nullopt;
  if (S.areInstructionsWithCopyableElements() && EI && EI.UserTE->hasState() &&
      EI.UserTE->hasCopyableElements() &&
      EI.UserTE->getMainOp()->getParent() == S.getMainOp()->getParent() &&
      all_of(VL, [&](Value *V) {
        if (S.isCopyableElement(V))
          return true;
        // ...
      }))
    return std::nullopt;
  bool HasCopyables = S.areInstructionsWithCopyableElements();
  if ((!HasCopyables && doesNotNeedToSchedule(VL)) ||
      all_of(VL, [&](Value *V) { return S.isNonSchedulable(V); })) {
    // If all operands were replaced by copyable data, drop the dependencies
    // of the original instructions.
    SmallVector<ScheduleData *> ControlDependentMembers;
    for (Value *V : VL) {
      auto *I = dyn_cast<Instruction>(V);
      if (!I || (HasCopyables && S.isCopyableElement(V)))
        continue;
      SmallDenseMap<std::pair<Instruction *, Value *>, unsigned> UserOpToNumOps;
      for (const Use &U : I->operands()) {
        unsigned &NumOps =
            UserOpToNumOps.try_emplace(std::make_pair(I, U.get()), 0)
                .first->getSecond();
        ++NumOps;
        if (auto *Op = dyn_cast<Instruction>(U.get());
            Op && areAllOperandsReplacedByCopyableData(I, Op, *SLP, NumOps)) {
          if (ScheduleData *OpSD = getScheduleData(Op);
              OpSD && OpSD->hasValidDependencies()) {
            OpSD->clearDirectDependencies();
            if (RegionHasStackSave ||
                !isGuaranteedToTransferExecutionToSuccessor(OpSD->getInst()))
              ControlDependentMembers.push_back(OpSD);
          }
        }
      }
    }
    if (!ControlDependentMembers.empty()) {
      ScheduleBundle Invalid = ScheduleBundle::invalid();
      calculateDependencies(Invalid, /*InsertInReadyList=*/true, SLP,
                            ControlDependentMembers);
    }
    return nullptr;
  }

  // Initialize the instruction bundle.
  Instruction *OldScheduleEnd = ScheduleEnd;
  LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.getMainOp() << "\n");
  auto TryScheduleBundleImpl = [=](bool ReSchedule, ScheduleBundle &Bundle) {
    // The scheduling region got new instructions at the lower end (or it is a
    // new region for the first bundle). This makes it necessary to
    // recalculate all dependencies.
    SmallVector<ScheduleData *> ControlDependentMembers;
    auto CheckIfNeedToClearDeps = [&](ScheduleBundle &Bundle) {
      SmallDenseMap<std::pair<Instruction *, Value *>, unsigned> UserOpToNumOps;
      for (ScheduleEntity *SE : Bundle.getBundle()) {
        if (auto *CD = dyn_cast<ScheduleCopyableData>(SE)) {
          if (ScheduleData *BundleMember = getScheduleData(CD->getInst());
              BundleMember && BundleMember->hasValidDependencies()) {
            BundleMember->clearDirectDependencies();
            if (RegionHasStackSave ||
                !isGuaranteedToTransferExecutionToSuccessor(
                    BundleMember->getInst()))
              ControlDependentMembers.push_back(BundleMember);
          }
          continue;
        }
        auto *SD = cast<ScheduleData>(SE);
        if (SD->hasValidDependencies() &&
            (!S.areInstructionsWithCopyableElements() ||
             !S.isCopyableElement(SD->getInst())) &&
            !getScheduleCopyableData(SD->getInst()).empty() && EI.UserTE &&
            EI.UserTE->hasState() &&
            (!EI.UserTE->hasCopyableElements() ||
             !EI.UserTE->isCopyableElement(SD->getInst())))
          SD->clearDirectDependencies();
        for (const Use &U : SD->getInst()->operands()) {
          unsigned &NumOps =
              UserOpToNumOps
                  .try_emplace(std::make_pair(SD->getInst(), U.get()), 0)
                  .first->getSecond();
          ++NumOps;
          if (auto *Op = dyn_cast<Instruction>(U.get());
              Op && areAllOperandsReplacedByCopyableData(SD->getInst(), Op,
                                                         *SLP, NumOps)) {
            if (ScheduleData *OpSD = getScheduleData(Op);
                OpSD && OpSD->hasValidDependencies()) {
              OpSD->clearDirectDependencies();
              if (RegionHasStackSave ||
                  !isGuaranteedToTransferExecutionToSuccessor(OpSD->getInst()))
                ControlDependentMembers.push_back(OpSD);
            }
          }
        }
      }
    };
    if (OldScheduleEnd && ScheduleEnd != OldScheduleEnd) {
      for_each(ScheduleDataMap, [&](auto &P) {
        if (BB != P.first->getParent())
          return;
        ScheduleData *SD = P.second;
        if (isInSchedulingRegion(*SD))
          SD->clearDependencies();
      });
      for_each(ScheduleCopyableDataMapByInst, [&](auto &P) {
        for_each(P.second, [&](ScheduleCopyableData *SD) {
          if (isInSchedulingRegion(*SD))
            SD->clearDependencies();
        });
      });
      ReSchedule = true;
    }
    if (Bundle && !Bundle.getBundle().empty()) {
      if (S.areInstructionsWithCopyableElements() ||
          !ScheduleCopyableDataMap.empty())
        CheckIfNeedToClearDeps(Bundle);
      LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << Bundle
                        << " in block " << BB->getName() << "\n");
      calculateDependencies(Bundle, /*InsertInReadyList=*/!ReSchedule, SLP,
                            ControlDependentMembers);
    } else if (!ControlDependentMembers.empty()) {
      ScheduleBundle Invalid = ScheduleBundle::invalid();
      calculateDependencies(Invalid, /*InsertInReadyList=*/!ReSchedule, SLP,
                            ControlDependentMembers);
    }

    if (ReSchedule) {
      resetSchedule();
      initialFillReadyList(ReadyInsts);
    }

    // Now try to schedule the new bundle or (if no bundle) just calculate
    // dependencies. As soon as the bundle is "ready" it means that there are
    // no cyclic dependencies and we can schedule it. Note that it is
    // important that we don't "schedule" the bundle yet.
    while (((!Bundle && ReSchedule) || (Bundle && !Bundle.isReady())) &&
           !ReadyInsts.empty()) {
      ScheduleEntity *Picked = ReadyInsts.pop_back_val();
      assert(Picked->isReady() && "must be ready to schedule");
      schedule(*SLP, S, EI, Picked, ReadyInsts);
      if (Picked == &Bundle)
        break;
    }
  };

  // Make sure that the scheduling region contains all instructions of the
  // bundle.
  for (Value *V : VL) {
    if (S.isNonSchedulable(V))
      continue;
    if (!extendSchedulingRegion(V, S)) {
      // If the scheduling region got new instructions at the lower end (or it
      // is a new region for the first bundle), it makes it necessary to
      // recalculate all dependencies. Otherwise the compiler may crash trying
      // to incorrectly calculate dependencies and emit instructions in the
      // wrong order at the actual scheduling.
      ScheduleBundle Invalid = ScheduleBundle::invalid();
      TryScheduleBundleImpl(/*ReSchedule=*/false, Invalid);
      return std::nullopt;
    }
  }
  bool ReSchedule = false;
  for (Value *V : VL) {
    if (S.isNonSchedulable(V))
      continue;
    SmallVector<ScheduleCopyableData *> CopyableData =
        getScheduleCopyableData(V);
    if (!CopyableData.empty()) {
      for (ScheduleCopyableData *SD : CopyableData)
        ReadyInsts.remove(SD);
    }
    ScheduleData *BundleMember = getScheduleData(V);
    assert((BundleMember || S.isCopyableElement(V)) &&
           "no ScheduleData for bundle member (maybe not in same basic block)");
    if (!BundleMember)
      continue;

    // Make sure we don't leave the pieces of the bundle in the ready list
    // when the whole bundle might not be ready.
    ReadyInsts.remove(BundleMember);
    if (ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(V);
        !Bundles.empty()) {
      for (ScheduleBundle *B : Bundles)
        ReadyInsts.remove(B);
    }

    if (!S.isCopyableElement(V) && !BundleMember->isScheduled())
      continue;
    // A bundle member was scheduled as a single instruction before and now
    // needs to be scheduled as part of the bundle. We just get rid of the
    // existing schedule.
    LLVM_DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
                      << " was already scheduled\n");
    ReSchedule = true;
  }

  ScheduleBundle &Bundle = buildBundle(VL, S, EI);
  TryScheduleBundleImpl(ReSchedule, Bundle);
  if (!Bundle.isReady()) {
    for (ScheduleEntity *BD : Bundle.getBundle()) {
      // ...
      if (BD->isReady()) {
        ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(BD->getInst());
        if (Bundles.empty()) {
          ReadyInsts.insert(BD);
          continue;
        }
        for (ScheduleBundle *B : Bundles)
          if (B->isReady())
            ReadyInsts.insert(B);
      }
    }
    ScheduledBundlesList.pop_back();
    SmallVector<ScheduleData *> ControlDependentMembers;
    for (Value *V : VL) {
      if (S.isNonSchedulable(V))
        continue;
      auto *I = cast<Instruction>(V);
      if (S.isCopyableElement(I)) {
        // Remove the copyable data from the scheduling region and restore
        // previous state, if the instruction was replaced by copyable data.
        auto KV = std::make_pair(EI, I);
        assert(ScheduleCopyableDataMap.contains(KV) &&
               "no ScheduleCopyableData for copyable element");
        ScheduleCopyableData *SD =
            ScheduleCopyableDataMapByInst.find(I)->getSecond().pop_back_val();
        ScheduleCopyableDataMapByUsers[I].remove(SD);
        if (EI.UserTE) {
          ArrayRef<Value *> Op = EI.UserTE->getOperand(EI.EdgeIdx);
          const auto *It = find(Op, I);
          assert(It != Op.end() && "Lane not set");
          SmallPtrSet<Instruction *, 4> Visited;
          do {
            int Lane = std::distance(Op.begin(), It);
            assert(Lane >= 0 && "Lane not set");
            if (EI.UserTE->State != TreeEntry::SplitVectorize &&
                !EI.UserTE->ReorderIndices.empty())
              Lane = EI.UserTE->ReorderIndices[Lane];
            assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
                   "Couldn't find extract lane");
            auto *In = cast<Instruction>(EI.UserTE->Scalars[Lane]);
            if (!Visited.insert(In).second) {
              It = find(std::next(It), Op.end(), I);
              continue;
            }
            ScheduleCopyableDataMapByInstUser
                [std::make_pair(std::make_pair(In, EI.EdgeIdx), I)]
                    .pop_back();
            It = find(std::next(It), Op.end(), I);
          } while (It != Op.end());
          // ...
          if (ScheduleCopyableData *UserCD =
                  getScheduleCopyableData(UserEI, I))
            ScheduleCopyableDataMapByUsers[I].insert(UserCD);
        }
        if (ScheduleCopyableDataMapByUsers[I].empty())
          ScheduleCopyableDataMapByUsers.erase(I);
        ScheduleCopyableDataMap.erase(KV);
        // Restore dependencies for the instruction, if it was replaced by
        // the copyable data.
        if (ScheduleData *OpSD = getScheduleData(I);
            OpSD && OpSD->hasValidDependencies()) {
          OpSD->clearDirectDependencies();
          if (RegionHasStackSave ||
              !isGuaranteedToTransferExecutionToSuccessor(OpSD->getInst()))
            ControlDependentMembers.push_back(OpSD);
        }
        continue;
      }
      ScheduledBundles.find(I)->getSecond().pop_back();
    }
    if (!ControlDependentMembers.empty()) {
      ScheduleBundle Invalid = ScheduleBundle::invalid();
      calculateDependencies(Invalid, /*InsertInReadyList=*/false, SLP,
                            ControlDependentMembers);
    }
    return std::nullopt;
  }
  return &Bundle;
}
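
/// Hands out ScheduleData objects from chunked storage, growing the storage
/// on demand so that previously handed-out pointers stay stable.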
BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
  // Allocate a new ScheduleData for the instruction.
  if (ChunkPos >= ChunkSize) {
    ScheduleDataChunks.push_back(std::make_unique<ScheduleData[]>(ChunkSize));
    ChunkPos = 0;
  }
  return &(ScheduleDataChunks.back()[ChunkPos++]);
}
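
/// Extends the scheduling region so that it covers \p V, searching upwards
/// and downwards from the current region simultaneously and giving up once
/// ScheduleRegionSizeLimit is exceeded.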
bool BoUpSLP::BlockScheduling::extendSchedulingRegion(
    Value *V, const InstructionsState &S) {
  Instruction *I = dyn_cast<Instruction>(V);
  assert(I && "bundle member must be an instruction");
  if (getScheduleData(I))
    return true;
  if (!ScheduleStart) {
    // It's the first instruction in the new region.
    initScheduleData(I, I->getNextNode(), nullptr, nullptr);
    ScheduleStart = I;
    ScheduleEnd = I->getNextNode();
    assert(ScheduleEnd && "tried to vectorize a terminator?");
    LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
    return true;
  }
  // Search up and down at the same time, because we don't know if the new
  // instruction is above or below the existing scheduling region. Ignore
  // debug info (and other "assume-like" intrinsics) so they are not counted
  // towards the scheduling region size limit.
  BasicBlock::reverse_iterator UpIter =
      ++ScheduleStart->getIterator().getReverse();
  BasicBlock::reverse_iterator UpperEnd = BB->rend();
  BasicBlock::iterator DownIter = ScheduleEnd->getIterator();
  BasicBlock::iterator LowerEnd = BB->end();
  auto IsAssumeLikeIntr = [](const Instruction &I) {
    if (auto *II = dyn_cast<IntrinsicInst>(&I))
      return II->isAssumeLikeIntrinsic();
    return false;
  };
  UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
  DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
  while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter != I &&
         &*DownIter != I) {
    if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
      LLVM_DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n");
      return false;
    }
    ++UpIter;
    ++DownIter;
    UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
    DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
  }
  if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter == I)) {
    assert(I->getParent() == ScheduleStart->getParent() &&
           "Instruction is in wrong basic block.");
    initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
    ScheduleStart = I;
    LLVM_DEBUG(dbgs() << "SLP: extend schedule region start to " << *I
                      << "\n");
    return true;
  }
  assert((UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter == I)) &&
         "Expected to reach top of the basic block or instruction down the "
         "lower end.");
  assert(I->getParent() == ScheduleEnd->getParent() &&
         "Instruction is in wrong basic block.");
  initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
                   nullptr);
  ScheduleEnd = I->getNextNode();
  assert(ScheduleEnd && "tried to vectorize a terminator?");
  LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n");
  return true;
}
void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
                                                Instruction *ToI,
                                                ScheduleData *PrevLoadStore,
                                                ScheduleData *NextLoadStore) {
  ScheduleData *CurrentLoadStore = PrevLoadStore;
  for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
    // ...
    ScheduleData *SD = ScheduleDataMap.lookup(I);
    if (!SD) {
      SD = allocateScheduleDataChunks();
      ScheduleDataMap[I] = SD;
    }
    assert(!isInSchedulingRegion(*SD) &&
           "new ScheduleData already in scheduling region");
    SD->init(SchedulingRegionID, I);

    if (I->mayReadOrWriteMemory() &&
        (!isa<IntrinsicInst>(I) ||
         (cast<IntrinsicInst>(I)->getIntrinsicID() != Intrinsic::sideeffect &&
          cast<IntrinsicInst>(I)->getIntrinsicID() !=
              Intrinsic::pseudoprobe))) {
      // Update the linked list of memory accessing instructions.
      if (CurrentLoadStore) {
        CurrentLoadStore->setNextLoadStore(SD);
      } else {
        FirstLoadStoreInRegion = SD;
      }
      CurrentLoadStore = SD;
    }

    if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
        match(I, m_Intrinsic<Intrinsic::stackrestore>()))
      RegionHasStackSave = true;
  }
  if (NextLoadStore) {
    if (CurrentLoadStore)
      CurrentLoadStore->setNextLoadStore(NextLoadStore);
  } else {
    LastLoadStoreInRegion = CurrentLoadStore;
  }
}
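
/// (Re-)computes def-use, control, and memory dependencies for the members
/// of \p Bundle (plus any extra control-dependent members), propagating the
/// updates through a worklist and optionally inserting entities that become
/// ready into the ready list.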
void BoUpSLP::BlockScheduling::calculateDependencies(
    ScheduleBundle &Bundle, bool InsertInReadyList, BoUpSLP *SLP,
    ArrayRef<ScheduleData *> ControlDeps) {
  SmallVector<ScheduleEntity *> WorkList;
  auto ProcessNode = [&](ScheduleEntity *SE) {
    if (auto *CD = dyn_cast<ScheduleCopyableData>(SE)) {
      if (CD->hasValidDependencies())
        return;
      CD->initDependencies();
      CD->resetUnscheduledDeps();
      const EdgeInfo &EI = CD->getEdgeInfo();
      if (EI.UserTE) {
        ArrayRef<Value *> Op = EI.UserTE->getOperand(EI.EdgeIdx);
        const auto *It = find(Op, CD->getInst());
        assert(It != Op.end() && "Lane not set");
        SmallPtrSet<Instruction *, 4> Visited;
        do {
          int Lane = std::distance(Op.begin(), It);
          assert(Lane >= 0 && "Lane not set");
          if (EI.UserTE->State != TreeEntry::SplitVectorize &&
              !EI.UserTE->ReorderIndices.empty())
            Lane = EI.UserTE->ReorderIndices[Lane];
          assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
                 "Couldn't find extract lane");
          auto *In = cast<Instruction>(EI.UserTE->Scalars[Lane]);
          if (EI.UserTE->isCopyableElement(In)) {
            // The user is itself a copyable element; depend on its copyable
            // data instead of the original instruction.
            if (ScheduleCopyableData *UseSD =
                    getScheduleCopyableData(EI.UserTE->UserTreeIndex, In)) {
              CD->incDependencies();
              if (!UseSD->isScheduled())
                CD->incrementUnscheduledDeps(1);
              if (!UseSD->hasValidDependencies() ||
                  (InsertInReadyList && UseSD->isReady()))
                WorkList.push_back(UseSD);
            }
          } else if (Visited.insert(In).second) {
            if (ScheduleData *UseSD = getScheduleData(In)) {
              CD->incDependencies();
              if (!UseSD->isScheduled())
                CD->incrementUnscheduledDeps(1);
              if (!UseSD->hasValidDependencies() ||
                  (InsertInReadyList && UseSD->isReady()))
                WorkList.push_back(UseSD);
            }
          }
          It = find(std::next(It), Op.end(), CD->getInst());
        } while (It != Op.end());
      }
      if (CD->isReady() && CD->getDependencies() == 0 &&
          (EI.UserTE->hasState() &&
           (EI.UserTE->getMainOp()->getParent() !=
                CD->getInst()->getParent() ||
            (EI.UserTE->getMainOp()->hasNUsesOrMore(UsesLimit) ||
             any_of(EI.UserTE->getMainOp()->users(), [&](User *U) {
               auto *IU = dyn_cast<Instruction>(U);
               if (!IU)
                 return true;
               return IU->getParent() == EI.UserTE->getMainOp()->getParent();
             }))))) {
        // Add an extra dependency to prevent the copyable data from being
        // scheduled as a standalone node.
        CD->incDependencies();
        CD->incrementUnscheduledDeps(1);
      }
      return;
    }
    auto *BundleMember = cast<ScheduleData>(SE);
    if (BundleMember->hasValidDependencies())
      return;
    LLVM_DEBUG(dbgs() << "SLP:       update deps of " << *BundleMember
                      << "\n");
    BundleMember->initDependencies();
    BundleMember->resetUnscheduledDeps();
    // Handle def-use chain dependencies.
    SmallDenseMap<Value *, unsigned> UserToNumOps;
    for (User *U : BundleMember->getInst()->users()) {
      unsigned &NumOps = UserToNumOps.try_emplace(U, 0).first->getSecond();
      ++NumOps;
      if (ScheduleData *UseSD = getScheduleData(U)) {
        // Skip the user if all its operands were replaced by copyable data.
        if (areAllOperandsReplacedByCopyableData(
                cast<Instruction>(U), BundleMember->getInst(), *SLP, NumOps))
          continue;
        BundleMember->incDependencies();
        if (!UseSD->isScheduled())
          BundleMember->incrementUnscheduledDeps(1);
        if (!UseSD->hasValidDependencies() ||
            (InsertInReadyList && UseSD->isReady()))
          WorkList.push_back(UseSD);
      }
    }
    for (ScheduleCopyableData *UseSD :
         getScheduleCopyableDataUsers(BundleMember->getInst())) {
      BundleMember->incDependencies();
      if (!UseSD->isScheduled())
        BundleMember->incrementUnscheduledDeps(1);
      if (!UseSD->hasValidDependencies() ||
          (InsertInReadyList && UseSD->isReady()))
        WorkList.push_back(UseSD);
    }

    SmallPtrSet<const Instruction *, 4> Visited;
    auto MakeControlDependent = [&](Instruction *I) {
      if (!Visited.insert(I).second)
        return;
      auto *DepDest = getScheduleData(I);
      assert(DepDest && "must be in schedule window");
      DepDest->addControlDependency(BundleMember);
      BundleMember->incDependencies();
      if (!DepDest->isScheduled())
        BundleMember->incrementUnscheduledDeps(1);
      if (!DepDest->hasValidDependencies() ||
          (InsertInReadyList && DepDest->isReady()))
        WorkList.push_back(DepDest);
    };

    // Any instruction which isn't safe to speculate at the beginning of the
    // block is control-dependent on any early exit or non-willreturn call
    // which precedes it.
    if (!isGuaranteedToTransferExecutionToSuccessor(BundleMember->getInst())) {
      for (Instruction *I = BundleMember->getInst()->getNextNode();
           I != ScheduleEnd; I = I->getNextNode()) {
        // ...
        MakeControlDependent(I);
        if (!isGuaranteedToTransferExecutionToSuccessor(I))
          break;
      }
    }

    if (RegionHasStackSave) {
      // If we have an inalloca alloca instruction, it needs to be scheduled
      // after any preceding stacksave. We also need to prevent any alloca
      // from reordering above a preceding stackrestore.
      if (match(BundleMember->getInst(),
                m_Intrinsic<Intrinsic::stacksave>()) ||
          match(BundleMember->getInst(),
                m_Intrinsic<Intrinsic::stackrestore>())) {
        for (Instruction *I = BundleMember->getInst()->getNextNode();
             I != ScheduleEnd; I = I->getNextNode()) {
          // ...
          MakeControlDependent(I);
        }
      }

      // We also need to prevent allocas and loads/stores from moving below a
      // stacksave or a stackrestore.
      if (isa<AllocaInst>(BundleMember->getInst()) ||
          BundleMember->getInst()->mayReadOrWriteMemory()) {
        for (Instruction *I = BundleMember->getInst()->getNextNode();
             I != ScheduleEnd; I = I->getNextNode()) {
          if (!match(I, m_Intrinsic<Intrinsic::stacksave>()) &&
              !match(I, m_Intrinsic<Intrinsic::stackrestore>()))
            continue;
          MakeControlDependent(I);
        }
      }
    }

    // Handle the memory dependencies (if any).
    ScheduleData *NextLoadStore = BundleMember->getNextLoadStore();
    if (!NextLoadStore)
      return;
    Instruction *SrcInst = BundleMember->getInst();
    assert(SrcInst->mayReadOrWriteMemory() &&
           "NextLoadStore list for non memory effecting bundle?");
    MemoryLocation SrcLoc = getLocation(SrcInst);
    bool SrcMayWrite = SrcInst->mayWriteToMemory();
    unsigned NumAliased = 0;
    unsigned DistToSrc = 1;
    bool IsNonSimpleSrc = !SrcLoc.Ptr || !isSimple(SrcInst);

    for (ScheduleData *DepDest = NextLoadStore; DepDest;
         DepDest = DepDest->getNextLoadStore()) {
      assert(isInSchedulingRegion(*DepDest) && "Expected to be in region");
      // ...
      if (DistToSrc >= MaxMemDepDistance ||
          ((SrcMayWrite || DepDest->getInst()->mayWriteToMemory()) &&
           (IsNonSimpleSrc || NumAliased >= AliasedCheckLimit ||
            SLP->isAliased(SrcLoc, SrcInst, DepDest->getInst())))) {
        // We increment the counter only if the locations are aliased
        // (instead of counting all alias checks). This gives a better
        // balance between reduced runtime and accurate dependencies.
        ++NumAliased;

        DepDest->addMemoryDependency(BundleMember);
        BundleMember->incDependencies();
        if (!DepDest->isScheduled())
          BundleMember->incrementUnscheduledDeps(1);
        if (!DepDest->hasValidDependencies() ||
            (InsertInReadyList && DepDest->isReady()))
          WorkList.push_back(DepDest);
      }
      ++DistToSrc;
    }
  };

  assert((Bundle || !ControlDeps.empty()) &&
         "expected at least one instruction to schedule");
  if (Bundle)
    WorkList.push_back(Bundle.getBundle().front());
  WorkList.append(ControlDeps.begin(), ControlDeps.end());
  SmallPtrSet<ScheduleBundle *, 16> Visited;
  while (!WorkList.empty()) {
    ScheduleEntity *SE = WorkList.pop_back_val();
    SmallVector<ScheduleBundle *> CopyableBundle;
    ArrayRef<ScheduleBundle *> Bundles;
    if (auto *CD = dyn_cast<ScheduleCopyableData>(SE)) {
      CopyableBundle.push_back(&CD->getBundle());
      Bundles = CopyableBundle;
    } else {
      auto *SD = cast<ScheduleData>(SE);
      Bundles = getScheduleBundles(SD->getInst());
    }
    if (Bundles.empty()) {
      auto *SD = cast<ScheduleData>(SE);
      if (!SD->hasValidDependencies())
        ProcessNode(SD);
      if (InsertInReadyList && SD->isReady()) {
        ReadyInsts.insert(SD);
        LLVM_DEBUG(dbgs() << "SLP:     gets ready on update: " << *SD
                          << "\n");
      }
      continue;
    }
    for (ScheduleBundle *Bundle : Bundles) {
      if (Bundle->hasValidDependencies() || !Visited.insert(Bundle).second)
        continue;
      assert(isInSchedulingRegion(*Bundle) &&
             "ScheduleData not in scheduling region");
      for_each(Bundle->getBundle(), ProcessNode);
    }
    if (InsertInReadyList && SE->isReady()) {
      for (ScheduleBundle *Bundle : Bundles) {
        assert(isInSchedulingRegion(*Bundle) &&
               "ScheduleData not in scheduling region");
        if (!Bundle->isReady())
          continue;
        ReadyInsts.insert(Bundle);
      }
    }
  }
}
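
/// Clears the scheduled flag and the unscheduled-dependency counters of
/// every entity in the current scheduling region so the region can be
/// rescheduled from scratch.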
void BoUpSLP::BlockScheduling::resetSchedule() {
  assert(ScheduleStart &&
         "tried to reset schedule on block which has not been scheduled");
  for_each(ScheduleDataMap, [&](auto &P) {
    if (BB != P.first->getParent())
      return;
    ScheduleData *SD = P.second;
    if (isInSchedulingRegion(*SD)) {
      SD->setScheduled(/*Scheduled=*/false);
      SD->resetUnscheduledDeps();
    }
  });
  for_each(ScheduleCopyableDataMapByInst, [&](auto &P) {
    for_each(P.second, [&](ScheduleCopyableData *SD) {
      if (isInSchedulingRegion(*SD)) {
        SD->setScheduled(false);
        SD->resetUnscheduledDeps();
      }
    });
  });
  for_each(ScheduledBundles, [&](auto &P) {
    for_each(P.second, [&](ScheduleBundle *Bundle) {
      if (isInSchedulingRegion(*Bundle))
        Bundle->setScheduled(false);
    });
  });
  for (auto &P : ScheduleCopyableDataMap) {
    if (isInSchedulingRegion(*P.second)) {
      P.second->setScheduled(/*Scheduled=*/false);
      P.second->resetUnscheduledDeps();
    }
  }
  ReadyInsts.clear();
}
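
/// Performs the actual scheduling of a block: assigns scheduling priorities
/// in original instruction order, fills the ready list, and moves the picked
/// instructions into place bottom-up, starting from ScheduleEnd.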
void BoUpSLP::scheduleBlock(const BoUpSLP &R, BlockScheduling *BS) {
  if (!BS->ScheduleStart)
    return;

  LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");

  // A key point - if we got here, pre-scheduling was able to find a valid
  // scheduling of the sub-graph of the scheduling window which consists of
  // all vector bundles and their transitive users. As such, we do not need
  // to reschedule anything *outside of* that subgraph.

  BS->resetSchedule();

  // For the real scheduling we use a more sophisticated ready-list: it is
  // sorted by the original instruction location. This lets the final
  // schedule be as close as possible to the original instruction order.
  struct ScheduleDataCompare {
    bool operator()(const ScheduleEntity *SD1,
                    const ScheduleEntity *SD2) const {
      return SD2->getSchedulingPriority() < SD1->getSchedulingPriority();
    }
  };
  std::set<ScheduleEntity *, ScheduleDataCompare> ReadyInsts;

  // Ensure that all dependency data is updated (for nodes in the sub-graph)
  // and fill the ready-list with initial instructions.
  int Idx = 0;
  for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
       I = I->getNextNode()) {
    ArrayRef<ScheduleBundle *> Bundles = BS->getScheduleBundles(I);
    if (!Bundles.empty()) {
      for (ScheduleBundle *Bundle : Bundles) {
        Bundle->setSchedulingPriority(Idx++);
        if (!Bundle->hasValidDependencies())
          BS->calculateDependencies(*Bundle, /*InsertInReadyList=*/false,
                                    this);
      }
      SmallVector<ScheduleCopyableData *> SDs = BS->getScheduleCopyableData(I);
      for (ScheduleCopyableData *SD : reverse(SDs)) {
        ScheduleBundle &Bundle = SD->getBundle();
        Bundle.setSchedulingPriority(Idx++);
        if (!Bundle.hasValidDependencies())
          BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this);
      }
      continue;
    }
    SmallVector<ScheduleCopyableData *> CopyableData =
        BS->getScheduleCopyableDataUsers(I);
    if (ScheduleData *SD = BS->getScheduleData(I)) {
      [[maybe_unused]] ArrayRef<TreeEntry *> SDTEs = getTreeEntries(I);
      assert((isVectorized(I) || SDTEs.empty() ||
              SDTEs.front()->doesNotNeedToSchedule() ||
              doesNotNeedToBeScheduled(I)) &&
             "scheduler and vectorizer bundle mismatch");
      SD->setSchedulingPriority(Idx++);
      if (!SD->hasValidDependencies() &&
          (!CopyableData.empty() ||
           any_of(R.ValueToGatherNodes.lookup(I), [&](const TreeEntry *TE) {
             assert(TE->isGather() && "expected gather node");
             return TE->hasState() && TE->hasCopyableElements() &&
                    TE->isCopyableElement(I);
           }))) {
        // Need dependencies for these nodes to correctly handle copyable
        // elements, even if they are not vectorized themselves.
        ScheduleBundle Bundle;
        Bundle.add(SD);
        BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this);
      }
    }
    for (ScheduleCopyableData *SD : reverse(CopyableData)) {
      ScheduleBundle &Bundle = SD->getBundle();
      Bundle.setSchedulingPriority(Idx++);
      if (!Bundle.hasValidDependencies())
        BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this);
    }
  }
  BS->initialFillReadyList(ReadyInsts);

  Instruction *LastScheduledInst = BS->ScheduleEnd;

  // Do the "real" scheduling.
  SmallPtrSet<Instruction *, 16> Scheduled;
  while (!ReadyInsts.empty()) {
    auto *Picked = *ReadyInsts.begin();
    ReadyInsts.erase(ReadyInsts.begin());

    // Move the scheduled instruction(s) to their dedicated places, if not
    // there yet.
    if (auto *Bundle = dyn_cast<ScheduleBundle>(Picked)) {
      for (const ScheduleEntity *BundleMember : Bundle->getBundle()) {
        Instruction *PickedInst = BundleMember->getInst();
        bool IsCopyable =
            Bundle->getTreeEntry()->isCopyableElement(PickedInst);
        if ((IsCopyable && BS->getScheduleData(PickedInst)) ||
            (!IsCopyable && !Scheduled.insert(PickedInst).second))
          continue;
        if (PickedInst->getNextNode() != LastScheduledInst)
          PickedInst->moveBefore(LastScheduledInst->getIterator());
        LastScheduledInst = PickedInst;
      }
      EntryToLastInstruction.try_emplace(Bundle->getTreeEntry(),
                                         LastScheduledInst);
    } else {
      auto *SD = cast<ScheduleData>(Picked);
      Instruction *PickedInst = SD->getInst();
      if (PickedInst->getNextNode() != LastScheduledInst)
        PickedInst->moveBefore(LastScheduledInst->getIterator());
      LastScheduledInst = PickedInst;
    }
    auto Invalid = InstructionsState::invalid();
    BS->schedule(*this, Invalid, EdgeInfo(), Picked, ReadyInsts);
  }

  // Check that we didn't break any of our invariants.
#ifdef EXPENSIVE_CHECKS
  BS->verify();
#endif

#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
  // Check that all schedulable entities got scheduled.
  for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
       I = I->getNextNode()) {
    ArrayRef<ScheduleBundle *> Bundles = BS->getScheduleBundles(I);
    assert(all_of(Bundles,
                  [](const ScheduleBundle *Bundle) {
                    return Bundle->isScheduled();
                  }) &&
           "must be scheduled at this point");
  }
#endif

  // Avoid duplicate scheduling of the block.
  BS->ScheduleStart = nullptr;
}
unsigned BoUpSLP::getVectorElementSize(Value *V) {
  // If V is a store, just return the width of the stored value.
  if (auto *Store = dyn_cast<StoreInst>(V))
    return DL->getTypeSizeInBits(Store->getValueOperand()->getType());

  // ...
  auto E = InstrElementSize.find(V);
  if (E != InstrElementSize.end())
    return E->second;

  // If V is not a store, we can traverse the expression tree to find loads
  // that feed it. The type of the loaded value may indicate a more suitable
  // width than V's type. We want to base the vector element size on the
  // width of memory operations where possible.
  SmallVector<std::pair<Instruction *, BasicBlock *>, 16> Worklist;
  SmallPtrSet<Instruction *, 16> Visited;
  if (auto *I = dyn_cast<Instruction>(V)) {
    Worklist.emplace_back(I, I->getParent());
    Visited.insert(I);
  }

  // Traverse the expression tree in bottom-up order looking for loads. If we
  // encounter an instruction we don't yet handle, we give up.
  unsigned Width = 0;
  Value *FirstNonBool = nullptr;
  while (!Worklist.empty()) {
    auto [I, Parent] = Worklist.pop_back_val();

    // We should only be looking at scalar instructions here. If the current
    // instruction has a vector type, skip.
    auto *Ty = I->getType();
    if (isa<VectorType>(Ty))
      continue;
    if (Ty != Builder.getInt1Ty() && !FirstNonBool)
      FirstNonBool = I;

    // If the current instruction is a load, update MaxWidth to reflect the
    // width of the loaded value.
    if (isa<LoadInst, ExtractElementInst, ExtractValueInst>(I))
      Width = std::max<unsigned>(Width, DL->getTypeSizeInBits(Ty));

    // Otherwise, we need to visit the operands of the instruction. We only
    // handle the interesting cases from buildTree here.
    else if (isa<PHINode, CastInst, GetElementPtrInst, CmpInst, SelectInst,
                 BinaryOperator, UnaryOperator>(I)) {
      for (Use &U : I->operands()) {
        if (auto *J = dyn_cast<Instruction>(U.get()))
          if (Visited.insert(J).second &&
              (isa<PHINode>(I) || J->getParent() == Parent)) {
            Worklist.emplace_back(J, J->getParent());
            continue;
          }
        if (!FirstNonBool && U.get()->getType() != Builder.getInt1Ty())
          FirstNonBool = U.get();
      }
    } else {
      break;
    }
  }

  // If we didn't encounter a memory access in the expression tree, or if we
  // gave up for some reason, just return the width of V. Otherwise, return
  // the maximum width we found.
  if (!Width) {
    if (V->getType() == Builder.getInt1Ty() && FirstNonBool)
      V = FirstNonBool;
    Width = DL->getTypeSizeInBits(V->getType());
  }

  for (Instruction *I : Visited)
    InstrElementSize[I] = Width;

  return Width;
}
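
/// Walks \p E and its operand entries to find values whose bit width can be
/// narrowed to \p BitWidth, recording demotable tree-entry indices in
/// \p ToDemote; returns false when demotion would be unsafe or unprofitable.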
bool BoUpSLP::collectValuesToDemote(
    const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
    SmallVectorImpl<unsigned> &ToDemote, DenseSet<const TreeEntry *> &Visited,
    const SmallDenseSet<unsigned, 8> &NodesToKeepBWs, unsigned &MaxDepthLevel,
    bool &IsProfitableToDemote, bool IsTruncRoot) const {
  // We can always demote constants.
  if (all_of(E.Scalars, IsaPred<Constant>))
    return true;

  unsigned OrigBitWidth =
      DL->getTypeSizeInBits(E.Scalars.front()->getType()->getScalarType());
  if (OrigBitWidth == BitWidth) {
    MaxDepthLevel = 1;
    return true;
  }

  // Check if the node was analyzed already and must keep its original
  // bitwidth.
  if (NodesToKeepBWs.contains(E.Idx))
    return false;

  // ...
  bool IsSignedNode = any_of(E.Scalars, [&](Value *R) {
    if (isa<PoisonValue>(R))
      return false;
    return !isKnownNonNegative(R, SimplifyQuery(*DL));
  });
  auto IsPotentiallyTruncated = [&](Value *V, unsigned &BitWidth) -> bool {
    if (isa<PoisonValue>(V))
      return true;
    if (getTreeEntries(V).size() > 1)
      return false;
    // ...
    bool IsSignedVal = !isKnownNonNegative(V, SimplifyQuery(*DL));
    if ((!IsSignedNode || IsSignedVal) && OrigBitWidth > BitWidth) {
      APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
      if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)))
        return true;
    }
    unsigned NumSignBits = ComputeNumSignBits(V, *DL, AC, nullptr, DT);
    unsigned BitWidth1 = OrigBitWidth - NumSignBits;
    if (IsSignedVal)
      ++BitWidth1;
    APInt Mask = DB->getDemandedBits(cast<Instruction>(V));
    unsigned BitWidth2 =
        std::max<unsigned>(1, Mask.getBitWidth() - Mask.countl_zero());
    while (!IsSignedNode && BitWidth2 < OrigBitWidth) {
      APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth2 - 1);
      if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)))
        break;
      BitWidth2 *= 2;
    }
    BitWidth1 = std::min(BitWidth1, BitWidth2);
    BitWidth = std::min(BitWidth, BitWidth1);
    return BitWidth > 0 && OrigBitWidth >= (BitWidth * 2);
  };
  auto FinalAnalysis = [&, TTI = TTI]() {
    if (!IsProfitableToDemote)
      return false;
    bool Res = all_of(
        E.Scalars, std::bind(IsPotentiallyTruncated, _1, std::ref(BitWidth)));
    // Demote gathers.
    if (Res && E.isGather()) {
      if (E.hasState()) {
        if (const TreeEntry *SameTE =
                getSameValuesTreeEntry(E.getMainOp(), E.Scalars))
          if (collectValuesToDemote(*SameTE, IsProfitableToDemoteRoot,
                                    BitWidth, ToDemote, Visited,
                                    NodesToKeepBWs, MaxDepthLevel,
                                    IsProfitableToDemote, IsTruncRoot))
            return true;
      }
      // Check if we can demote the extractelements the gather is built from.
      SmallPtrSet<Value *, 4> UniqueBases;
      for (Value *V : E.Scalars) {
        auto *EE = dyn_cast<ExtractElementInst>(V);
        if (!EE)
          return Res;
        UniqueBases.insert(EE->getVectorOperand());
      }
      const unsigned VF = E.Scalars.size();
      Type *OrigScalarTy = E.Scalars.front()->getType();
      if (UniqueBases.size() <= 2 ||
          // ...
          ::getNumberOfParts(*TTI, getWidenedType(OrigScalarTy, VF)) ==
              ::getNumberOfParts(
                  *TTI,
                  getWidenedType(
                      IntegerType::get(OrigScalarTy->getContext(), BitWidth),
                      VF)))
        ToDemote.push_back(E.Idx);
    }
    return Res;
  };
  if (E.isGather() || !Visited.insert(&E).second ||
      any_of(E.Scalars, [&](Value *V) {
        return !isa<Constant>(V) && all_of(V->users(), [&](User *U) {
          return isa<InsertElementInst>(U) && !isVectorized(U);
        });
      }))
    return FinalAnalysis();

  if (any_of(E.Scalars, [&](Value *V) {
        return !isa<Constant>(V) && !all_of(V->users(), [=](User *U) {
          return isVectorized(U) ||
                 (E.Idx == 0 && UserIgnoreList &&
                  UserIgnoreList->contains(U)) ||
                 (!isa<CmpInst>(U) && U->getType()->isSized() &&
                  !U->getType()->isScalableTy() &&
                  DL->getTypeSizeInBits(U->getType()) <= BitWidth);
        }) && !IsPotentiallyTruncated(V, BitWidth);
      }))
    return false;

  auto ProcessOperands = [&](ArrayRef<const TreeEntry *> Operands,
                             bool &NeedToExit) {
    NeedToExit = false;
    unsigned InitLevel = MaxDepthLevel;
    for (const TreeEntry *Op : Operands) {
      unsigned Level = InitLevel;
      if (!collectValuesToDemote(*Op, IsProfitableToDemoteRoot, BitWidth,
                                 ToDemote, Visited, NodesToKeepBWs, Level,
                                 IsProfitableToDemote, IsTruncRoot)) {
        if (!IsProfitableToDemote)
          return false;
        NeedToExit = true;
        if (!FinalAnalysis())
          return false;
        continue;
      }
      MaxDepthLevel = std::max(MaxDepthLevel, Level);
    }
    return true;
  };
  auto AttemptCheckBitwidth =
      [&](function_ref<bool(unsigned, unsigned)> Checker, bool &NeedToExit) {
        // Try all bitwidths < OrigBitWidth.
        NeedToExit = false;
        unsigned BestFailBitwidth = 0;
        for (; BitWidth < OrigBitWidth; BitWidth *= 2) {
          if (Checker(BitWidth, OrigBitWidth))
            return true;
          if (BestFailBitwidth == 0 && FinalAnalysis())
            BestFailBitwidth = BitWidth;
        }
        if (BestFailBitwidth == 0) {
          BitWidth = OrigBitWidth;
          return false;
        }
        MaxDepthLevel = 1;
        BitWidth = BestFailBitwidth;
        NeedToExit = true;
        return true;
      };
  auto TryProcessInstruction =
      [&](unsigned &BitWidth, ArrayRef<const TreeEntry *> Operands = {},
          function_ref<bool(unsigned, unsigned)> Checker = {}) {
        if (Operands.empty()) {
          if (!IsTruncRoot)
            MaxDepthLevel = 1;
          for (Value *V : E.Scalars)
            (void)IsPotentiallyTruncated(V, BitWidth);
        } else {
          // Several vectorized uses? Check if we can truncate it, otherwise -
          // exit.
          if (any_of(E.Scalars, [&](Value *V) {
                return !V->hasOneUse() && !IsPotentiallyTruncated(V, BitWidth);
              }))
            return false;
          bool NeedToExit = false;
          if (Checker && !AttemptCheckBitwidth(Checker, NeedToExit))
            return false;
          if (NeedToExit)
            return true;
          if (!ProcessOperands(Operands, NeedToExit))
            return false;
          if (NeedToExit)
            return true;
        }

        ++MaxDepthLevel;
        // Record the entry that we can demote.
        ToDemote.push_back(E.Idx);
        return IsProfitableToDemote;
      };
  if (E.State == TreeEntry::SplitVectorize)
    return TryProcessInstruction(
        BitWidth,
        {VectorizableTree[E.CombinedEntriesWithIndices.front().first].get(),
         VectorizableTree[E.CombinedEntriesWithIndices.back().first].get()});

  if (E.isAltShuffle()) {
    // Some alternate opcodes depend on the full original bit width; be
    // conservative and bail out for those.
    auto IsDangerousOpcode = [](unsigned Opcode) {
      switch (Opcode) {
      case Instruction::Shl:
      case Instruction::AShr:
      case Instruction::LShr:
      case Instruction::UDiv:
      case Instruction::SDiv:
      case Instruction::URem:
      case Instruction::SRem:
        return true;
      default:
        break;
      }
      return false;
    };
    if (IsDangerousOpcode(E.getAltOpcode()))
      return FinalAnalysis();
  }

  switch (E.getOpcode()) {

  // We can always demote truncations and extensions. Since truncations can
  // seed additional demotion, we save the truncated value.
  case Instruction::Trunc:
    if (IsProfitableToDemoteRoot)
      IsProfitableToDemote = true;
    return TryProcessInstruction(BitWidth);
  case Instruction::ZExt:
  case Instruction::SExt:
    if (E.UserTreeIndex.UserTE && E.UserTreeIndex.UserTE->hasState() &&
        E.UserTreeIndex.UserTE->getOpcode() == Instruction::BitCast &&
        E.UserTreeIndex.UserTE->getMainOp()->getType()->isFPOrFPVectorTy())
      return false;
    IsProfitableToDemote = true;
    return TryProcessInstruction(BitWidth);

  // We can demote certain binary operations if we can demote both of their
  // operands.
  case Instruction::Add:
  case Instruction::Sub:
  case Instruction::Mul:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)});
  }
  case Instruction::Freeze:
    return TryProcessInstruction(BitWidth, getOperandEntry(&E, 0));
  case Instruction::Shl: {
    // If we are truncating the result of the SHL and the shift amount is
    // small enough to fit into the shifted bitwidth, then we can demote.
    auto ShlChecker = [&](unsigned BitWidth, unsigned) {
      return all_of(E.Scalars, [&](Value *V) {
        if (isa<PoisonValue>(V))
          return true;
        if (E.isCopyableElement(V))
          return true;
        auto *I = cast<Instruction>(V);
        KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
        return AmtKnownBits.getMaxValue().ult(BitWidth);
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
        ShlChecker);
  }
  case Instruction::LShr: {
    // We can truncate a logical shr to a smaller lshr iff we know that the
    // bits we would otherwise be shifting in are already zeros.
    auto LShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(E.Scalars, [&](Value *V) {
        if (isa<PoisonValue>(V))
          return true;
        APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
        if (E.isCopyableElement(V))
          return MaskedValueIsZero(V, ShiftedBits, SimplifyQuery(*DL));
        auto *I = cast<Instruction>(V);
        KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
        return AmtKnownBits.getMaxValue().ult(BitWidth) &&
               MaskedValueIsZero(I->getOperand(0), ShiftedBits,
                                 SimplifyQuery(*DL));
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
        LShrChecker);
  }
  case Instruction::AShr: {
    // We can truncate an arithmetic shr to a smaller ashr iff we know that
    // all the bits from the sign bit of the original type down to the sign
    // bit of the truncated type are equal.
    auto AShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(E.Scalars, [&](Value *V) {
        if (isa<PoisonValue>(V))
          return true;
        auto *I = cast<Instruction>(V);
        KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
        unsigned ShiftedBits = OrigBitWidth - BitWidth;
        return AmtKnownBits.getMaxValue().ult(BitWidth) &&
               ShiftedBits <
                   ComputeNumSignBits(I->getOperand(0), *DL, AC, nullptr, DT);
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
        AShrChecker);
  }
  case Instruction::UDiv:
  case Instruction::URem: {
    // UDiv and URem can be truncated if all the truncated bits are zero.
    auto Checker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(E.Scalars, [&](Value *V) {
        APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
        if (E.hasCopyableElements() && E.isCopyableElement(V))
          return MaskedValueIsZero(V, Mask, SimplifyQuery(*DL));
        auto *I = cast<Instruction>(V);
        return MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)) &&
               MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, Checker);
  }

  // We can demote selects if we can demote their true and false values.
  case Instruction::Select: {
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 1), getOperandEntry(&E, 2)});
  }

  // We can demote phis if we can demote all their incoming operands.
  case Instruction::PHI: {
    const unsigned NumOps = E.getNumOperands();
    SmallVector<const TreeEntry *> Ops(NumOps);
    transform(seq<unsigned>(0, NumOps), Ops.begin(),
              [&](unsigned Idx) { return getOperandEntry(&E, Idx); });
    return TryProcessInstruction(BitWidth, Ops);
  }

  case Instruction::Call: {
    auto *IC = dyn_cast<IntrinsicInst>(E.getMainOp());
    if (!IC)
      break;
    Intrinsic::ID ID = getVectorIntrinsicIDForCall(IC, TLI);
    if (ID != Intrinsic::abs && ID != Intrinsic::smin &&
        ID != Intrinsic::smax && ID != Intrinsic::umin &&
        ID != Intrinsic::umax)
      break;
    SmallVector<const TreeEntry *, 2> Operands(1, getOperandEntry(&E, 0));
    function_ref<bool(unsigned, unsigned)> CallChecker;
    auto CompChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(E.Scalars, [&](Value *V) {
        auto *I = cast<Instruction>(V);
        if (ID == Intrinsic::umin || ID == Intrinsic::umax) {
          APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
          return MaskedValueIsZero(I->getOperand(0), Mask,
                                   SimplifyQuery(*DL)) &&
                 MaskedValueIsZero(I->getOperand(1), Mask,
                                   SimplifyQuery(*DL));
        }
        assert((ID == Intrinsic::smin || ID == Intrinsic::smax) &&
               "Expected min/max intrinsics only.");
        unsigned SignBits = OrigBitWidth - BitWidth;
        APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
        unsigned Op0SignBits =
            ComputeNumSignBits(I->getOperand(0), *DL, AC, nullptr, DT);
        unsigned Op1SignBits =
            ComputeNumSignBits(I->getOperand(1), *DL, AC, nullptr, DT);
        return SignBits <= Op0SignBits &&
               ((SignBits != Op0SignBits &&
                 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
                MaskedValueIsZero(I->getOperand(0), Mask,
                                  SimplifyQuery(*DL))) &&
               SignBits <= Op1SignBits &&
               ((SignBits != Op1SignBits &&
                 !isKnownNonNegative(I->getOperand(1), SimplifyQuery(*DL))) ||
                MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL)));
      });
    };
    auto AbsChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(E.Scalars, [&](Value *V) {
        auto *I = cast<Instruction>(V);
        unsigned SignBits = OrigBitWidth - BitWidth;
        APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
        unsigned Op0SignBits =
            ComputeNumSignBits(I->getOperand(0), *DL, AC, nullptr, DT);
        return SignBits <= Op0SignBits &&
               ((SignBits != Op0SignBits &&
                 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
                MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)));
      });
    };
    if (ID != Intrinsic::abs) {
      Operands.push_back(getOperandEntry(&E, 1));
      CallChecker = CompChecker;
    } else {
      CallChecker = AbsChecker;
    }
    InstructionCost BestCost =
        std::numeric_limits<InstructionCost::CostType>::max();
    unsigned BestBitWidth = BitWidth;
    unsigned VF = E.Scalars.size();
    // Choose the best bitwidth based on cost estimations.
    auto Checker = [&](unsigned BitWidth, unsigned) {
      // ...
      InstructionCost Cost = getVectorCallCost(
          IC, getWidenedType(IntegerType::get(IC->getContext(),
                                              PowerOf2Ceil(BitWidth)),
                             VF));
      if (Cost < BestCost) {
        BestCost = Cost;
        BestBitWidth = BitWidth;
        return false;
      }
      return true;
    };
    [[maybe_unused]] bool NeedToExit;
    (void)AttemptCheckBitwidth(Checker, NeedToExit);
    BitWidth = BestBitWidth;
    return TryProcessInstruction(BitWidth, Operands, CallChecker);
  }

  // Otherwise, conservatively give up.
  default:
    break;
  }
  MaxDepthLevel = 1;
  return FinalAnalysis();
}
void BoUpSLP::computeMinimumValueSizes() {
  // We only attempt to truncate integer expressions.
  bool IsStoreOrInsertElt =
      VectorizableTree.front()->hasState() &&
      (VectorizableTree.front()->getOpcode() == Instruction::Store ||
       VectorizableTree.front()->getOpcode() == Instruction::InsertElement);
  if ((IsStoreOrInsertElt || UserIgnoreList) &&
      ExtraBitWidthNodes.size() <= 1 &&
      (!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 ||
       CastMaxMinBWSizes->first / CastMaxMinBWSizes->second <= 2))
    return;

  unsigned NodeIdx = 0;
  if (IsStoreOrInsertElt && !VectorizableTree.front()->isGather())
    NodeIdx = 1;

  // Ensure the roots of the vectorizable tree don't form a cycle.
  assert((VectorizableTree[NodeIdx]->isGather() || NodeIdx != 0 ||
          !VectorizableTree[NodeIdx]->UserTreeIndex) &&
         "Unexpected tree is graph.");

  // The first value node for a store/insertelement may be sext/zext/trunc.
  // Skip it, resize to the final type.
  bool IsTruncRoot = false;
  bool IsProfitableToDemoteRoot = !IsStoreOrInsertElt;
  SmallVector<unsigned> RootDemotes;
  SmallDenseSet<unsigned, 8> NodesToKeepBWs;
  if (NodeIdx != 0 &&
      VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
      VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
    assert(IsStoreOrInsertElt &&
           "Expected store/insertelement seeded graph.");
    IsTruncRoot = true;
    RootDemotes.push_back(NodeIdx);
    IsProfitableToDemoteRoot = true;
    ++NodeIdx;
  }

  // Analyzed the reduction already and not profitable - exit.
  if (AnalyzedMinBWVals.contains(VectorizableTree[NodeIdx]->Scalars.front()))
    return;

  SmallVector<unsigned> ToDemote;
  auto ComputeMaxBitWidth =
      [&](const TreeEntry &E, bool IsTopRoot, bool IsProfitableToDemoteRoot,
          unsigned Limit, bool IsTruncRoot, bool IsSignedCmp) -> unsigned {
    ToDemote.clear();
    // Check if the root is trunc and the next node is gather/buildvector.
    // Then keep trunc in scalars, which is free in most cases.
    if (E.isGather() && IsTruncRoot && E.UserTreeIndex &&
        !NodesToKeepBWs.contains(E.Idx) &&
        E.Idx > (IsStoreOrInsertElt ? 2u : 1u) &&
        all_of(E.Scalars, [&](Value *V) {
          return V->hasOneUse() || isa<Constant>(V) ||
                 (!V->hasNUsesOrMore(UsesLimit) &&
                  none_of(V->users(), [&](User *U) {
                    ArrayRef<TreeEntry *> UseEntries = getTreeEntries(U);
                    const TreeEntry *UserTE = E.UserTreeIndex.UserTE;
                    if (UseEntries.empty() || is_contained(UseEntries, UserTE))
                      return false;
                    if (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
                             SelectInst>(U) ||
                        isa<SIToFPInst, UIToFPInst>(U) ||
                        (UserTE->hasState() &&
                         (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
                               SelectInst>(UserTE->getMainOp()) ||
                          isa<SIToFPInst, UIToFPInst>(UserTE->getMainOp()))))
                      return true;
                    unsigned UserTESz = DL->getTypeSizeInBits(
                        UserTE->Scalars.front()->getType());
                    if (all_of(UseEntries, [&](const TreeEntry *TE) {
                          auto It = MinBWs.find(TE);
                          return It != MinBWs.end() &&
                                 It->second.first > UserTESz;
                        }))
                      return true;
                    return DL->getTypeSizeInBits(U->getType()) > UserTESz;
                  }));
        })) {
      ToDemote.push_back(E.Idx);
      const TreeEntry *UserTE = E.UserTreeIndex.UserTE;
      auto It = MinBWs.find(UserTE);
      if (It != MinBWs.end())
        return It->second.first;
      unsigned MaxBitWidth =
          DL->getTypeSizeInBits(UserTE->Scalars.front()->getType());
      MaxBitWidth = bit_ceil(MaxBitWidth);
      if (MaxBitWidth < 8 && MaxBitWidth > 1)
        MaxBitWidth = 8;
      return MaxBitWidth;
    }

    if (!E.hasState())
      return 0u;

    unsigned VF = E.getVectorFactor();
    Type *ScalarTy = E.Scalars.front()->getType();
    auto *TreeRootIT = dyn_cast<IntegerType>(ScalarTy->getScalarType());
    if (!TreeRootIT)
      return 0u;

    if (any_of(E.Scalars,
               [&](Value *V) { return AnalyzedMinBWVals.contains(V); }))
      return 0u;

    // The maximum bit width required to represent all the values that can be
    // demoted without loss of precision. It would be safe to truncate the
    // roots of the expression to this width.
    unsigned MaxBitWidth = 1u;

    // True if the roots can be zero-extended back to their original type,
    // rather than sign-extended. We know that if the leading bits are not
    // demanded, we can safely zero-extend, so we initialize IsKnownPositive
    // to true.
    bool IsKnownPositive = !IsSignedCmp && all_of(E.Scalars, [&](Value *R) {
      if (isa<PoisonValue>(R))
        return true;
      KnownBits Known = computeKnownBits(R, *DL);
      return Known.isNonNegative();
    });

    if (!IsKnownPositive && !IsTopRoot && E.UserTreeIndex &&
        E.UserTreeIndex.UserTE->hasState() &&
        E.UserTreeIndex.UserTE->getOpcode() == Instruction::UIToFP)
      MaxBitWidth =
          std::min(DL->getTypeSizeInBits(
                       E.UserTreeIndex.UserTE->Scalars.front()->getType()),
                   DL->getTypeSizeInBits(ScalarTy));

    // We first check if all the bits of the roots are demanded. If they're
    // not, we can truncate the roots to this narrower type.
    for (Value *Root : E.Scalars) {
      if (isa<PoisonValue>(Root))
        continue;
      unsigned NumSignBits = ComputeNumSignBits(Root, *DL, AC, nullptr, DT);
      TypeSize NumTypeBits =
          DL->getTypeSizeInBits(Root->getType()->getScalarType());
      unsigned BitWidth1 = NumTypeBits - NumSignBits;
      // If we can't prove that the sign bit is zero, we must add one to the
      // maximum bit width to account for the unknown sign bit. This preserves
      // the existing sign bit so we can safely sign-extend the root back to
      // the original type. Otherwise, if we know the sign bit is zero, we
      // will zero-extend the root instead.
      if (!IsKnownPositive)
        ++BitWidth1;

      auto *I = dyn_cast<Instruction>(Root);
      if (!I) {
        MaxBitWidth = std::max(BitWidth1, MaxBitWidth);
        continue;
      }
      APInt Mask = DB->getDemandedBits(I);
      unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
      MaxBitWidth =
          std::max<unsigned>(std::min(BitWidth1, BitWidth2), MaxBitWidth);
    }

    if (MaxBitWidth < 8 && MaxBitWidth > 1)
      MaxBitWidth = 8;

    // If the original type is large, do not demote when the number of vector
    // parts stays the same.
    unsigned NumParts =
        ::getNumberOfParts(*TTI, getWidenedType(TreeRootIT, VF));
    if (NumParts > 1 &&
        NumParts ==
            ::getNumberOfParts(
                *TTI, getWidenedType(IntegerType::get(F->getContext(),
                                                      bit_ceil(MaxBitWidth)),
                                     VF)))
      return 0u;

    unsigned Opcode = E.getOpcode();
    bool IsProfitableToDemote = Opcode == Instruction::Trunc ||
                                Opcode == Instruction::SExt ||
                                Opcode == Instruction::ZExt || NumParts > 1;
    // Conservatively determine if we can actually truncate the roots of the
    // expression. Collect the values that can be demoted in ToDemote.
    DenseSet<const TreeEntry *> Visited;
    unsigned MaxDepthLevel = IsTruncRoot ? Limit : 1;
    bool NeedToDemote = IsProfitableToDemote;

    if (!collectValuesToDemote(E, IsProfitableToDemoteRoot, MaxBitWidth,
                               ToDemote, Visited, NodesToKeepBWs,
                               MaxDepthLevel, NeedToDemote, IsTruncRoot) ||
        (MaxDepthLevel <= Limit &&
         !(((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
            (!IsTopRoot || !(IsStoreOrInsertElt || UserIgnoreList) ||
             DL->getTypeSizeInBits(TreeRootIT) /
                     DL->getTypeSizeInBits(
                         E.getMainOp()->getOperand(0)->getType()) >
                 2)))))
      return 0u;

    // Round MaxBitWidth up to the next power-of-two.
    MaxBitWidth = bit_ceil(MaxBitWidth);

    return MaxBitWidth;
  };

  // If we can truncate the root, we must collect additional values that might
  // be demoted as a result. That is, those seeded by truncations we will
  // modify. Add reduction ops sizes, if any.
  if (UserIgnoreList &&
      isa<IntegerType>(
          VectorizableTree.front()->Scalars.front()->getType())) {
    // Convert vector_reduce_add(ZExt(<n x i1>)) to
    // ZExtOrTrunc(ctpop(bitcast <n x i1> to in)).
    if (all_of(*UserIgnoreList,
               [](Value *V) {
                 return isa<PoisonValue>(V) ||
                        match(V, m_Add(m_Value(), m_Value()));
               }) &&
        VectorizableTree.front()->State == TreeEntry::Vectorize &&
        VectorizableTree.front()->getOpcode() == Instruction::ZExt &&
        cast<CastInst>(VectorizableTree.front()->getMainOp())->getSrcTy() ==
            Builder.getInt1Ty()) {
      ReductionBitWidth = 1;
    } else {
      for (Value *V : *UserIgnoreList) {
        if (isa<PoisonValue>(V))
          continue;
        unsigned NumSignBits = ComputeNumSignBits(V, *DL, AC, nullptr, DT);
        TypeSize NumTypeBits = DL->getTypeSizeInBits(V->getType());
        unsigned BitWidth1 = NumTypeBits - NumSignBits;
        if (!isKnownNonNegative(V, SimplifyQuery(*DL)))
          ++BitWidth1;
        unsigned BitWidth2 = BitWidth1;
        // ...
        if (auto *I = dyn_cast<Instruction>(V)) {
          APInt Mask = DB->getDemandedBits(I);
          BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
        }
        ReductionBitWidth =
            std::max(std::min(BitWidth1, BitWidth2), ReductionBitWidth);
      }
      if (ReductionBitWidth < 8 && ReductionBitWidth > 1)
        ReductionBitWidth = 8;

      ReductionBitWidth = bit_ceil(ReductionBitWidth);
    }
  }
  bool IsTopRoot = NodeIdx == 0;
  while (NodeIdx < VectorizableTree.size() &&
         VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
         VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
    RootDemotes.push_back(NodeIdx);
    ++NodeIdx;
    IsTruncRoot = true;
  }
  bool IsSignedCmp = false;
  if (UserIgnoreList &&
      all_of(*UserIgnoreList, [](Value *V) {
        return match(V, m_SMin(m_Value(), m_Value())) ||
               match(V, m_SMax(m_Value(), m_Value()));
      }))
    IsSignedCmp = true;
  while (NodeIdx < VectorizableTree.size()) {
    ArrayRef<Value *> TreeRoot = VectorizableTree[NodeIdx]->Scalars;
    unsigned Limit = 2;
    if (IsTopRoot &&
        ReductionBitWidth ==
            DL->getTypeSizeInBits(
                VectorizableTree.front()->Scalars.front()->getType()))
      Limit = 3;
    unsigned MaxBitWidth = ComputeMaxBitWidth(
        *VectorizableTree[NodeIdx], IsTopRoot, IsProfitableToDemoteRoot,
        Limit, IsTruncRoot, IsSignedCmp);
    if (ReductionBitWidth != 0 && (IsTopRoot || !RootDemotes.empty())) {
      if (MaxBitWidth != 0 && ReductionBitWidth < MaxBitWidth)
        ReductionBitWidth = bit_ceil(MaxBitWidth);
      else if (MaxBitWidth == 0)
        ReductionBitWidth = 0;
    }

    for (unsigned Idx : RootDemotes) {
      if (all_of(VectorizableTree[Idx]->Scalars, [&](Value *V) {
            uint32_t OrigBitWidth =
                DL->getTypeSizeInBits(V->getType()->getScalarType());
            if (OrigBitWidth > MaxBitWidth) {
              APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, MaxBitWidth);
              return MaskedValueIsZero(V, Mask, SimplifyQuery(*DL));
            }
            return false;
          }))
        ToDemote.push_back(Idx);
    }
    RootDemotes.clear();
    IsTopRoot = false;
    IsProfitableToDemoteRoot = true;

    if (ExtraBitWidthNodes.empty()) {
      NodeIdx = VectorizableTree.size();
    } else {
      unsigned NewIdx = 0;
      do {
        NewIdx = *ExtraBitWidthNodes.begin();
        ExtraBitWidthNodes.erase(ExtraBitWidthNodes.begin());
      } while (NewIdx <= NodeIdx && !ExtraBitWidthNodes.empty());
      NodeIdx = NewIdx;
      IsTruncRoot =
          NodeIdx < VectorizableTree.size() &&
          VectorizableTree[NodeIdx]->UserTreeIndex &&
          VectorizableTree[NodeIdx]->UserTreeIndex.EdgeIdx == 0 &&
          VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->hasState() &&
          VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->getOpcode() ==
              Instruction::Trunc &&
          !VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->isAltShuffle();
      IsSignedCmp =
          NodeIdx < VectorizableTree.size() &&
          VectorizableTree[NodeIdx]->UserTreeIndex &&
          VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->hasState() &&
          VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->getOpcode() ==
              Instruction::ICmp &&
          any_of(VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->Scalars,
                 [&](Value *V) {
                   auto *IC = dyn_cast<ICmpInst>(V);
                   return IC && (IC->isSigned() ||
                                 !isKnownNonNegative(IC->getOperand(0),
                                                     SimplifyQuery(*DL)) ||
                                 !isKnownNonNegative(IC->getOperand(1),
                                                     SimplifyQuery(*DL)));
                 });
    }

    // If the maximum bit width we compute is less than the width of the
    // roots' type, we can proceed with the narrowing. Otherwise, do nothing.
    if (MaxBitWidth == 0 ||
        MaxBitWidth >=
            cast<IntegerType>(TreeRoot.front()->getType()->getScalarType())
                ->getBitWidth()) {
      if (UserIgnoreList)
        AnalyzedMinBWVals.insert_range(TreeRoot);
      NodesToKeepBWs.insert(ToDemote.begin(), ToDemote.end());
      continue;
    }

    // Finally, map the values we can demote to the maximum bit width we
    // computed.
    for (unsigned Idx : ToDemote) {
      TreeEntry *TE = VectorizableTree[Idx].get();
      if (MinBWs.contains(TE))
        continue;
      bool IsSigned = any_of(TE->Scalars, [&](Value *R) {
        if (isa<PoisonValue>(R))
          return false;
        return !isKnownNonNegative(R, SimplifyQuery(*DL));
      });
      MinBWs.try_emplace(TE, MaxBitWidth, IsSigned);
    }
  }
}
bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
                                TargetTransformInfo *TTI_,
                                TargetLibraryInfo *TLI_, AAResults *AA_,
                                LoopInfo *LI_, DominatorTree *DT_,
                                AssumptionCache *AC_, DemandedBits *DB_,
                                OptimizationRemarkEmitter *ORE_) {
  SE = SE_;
  TTI = TTI_;
  TLI = TLI_;
  AA = AA_;
  LI = LI_;
  DT = DT_;
  AC = AC_;
  DB = DB_;
  DL = &F.getDataLayout();

  Stores.clear();
  GEPs.clear();
  bool Changed = false;

  // If the target claims to have no vector registers, don't attempt
  // vectorization.
  if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true))) {
    LLVM_DEBUG(
        dbgs()
        << "SLP: Didn't find any vector registers for target, abort.\n");
    return false;
  }

  // Don't vectorize when the attribute NoImplicitFloat is used.
  if (F.hasFnAttribute(Attribute::NoImplicitFloat))
    return false;

  LLVM_DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");

  // Use the bottom-up SLP vectorizer to construct chains that start with
  // store instructions.
  BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_);

  // Update DFS numbers now so that we can use them for ordering.
  DT->updateDFSNumbers();

  // Scan the blocks in the function in post order.
  for (auto *BB : post_order(&F.getEntryBlock())) {
    // ...
    R.clearReductionData();
    collectSeedInstructions(BB);

    // Vectorize trees that end at stores.
    if (!Stores.empty()) {
      LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
                        << " underlying objects.\n");
      Changed |= vectorizeStoreChains(R);
    }

    // Vectorize trees that end at reductions.
    Changed |= vectorizeChainsInBlock(BB, R);

    // Vectorize the index computations of getelementptr instructions. This
    // is primarily intended to catch gather-like idioms ending at
    // non-consecutive loads.
    if (!GEPs.empty()) {
      LLVM_DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size()
                        << " underlying objects.\n");
      Changed |= vectorizeGEPIndices(BB, R);
    }
  }

  if (Changed) {
    R.optimizeGatherSequence();
    LLVM_DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
  }
  return Changed;
}
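
/// Attempts to vectorize the store chain \p Chain (reported in debug output
/// as stores at offset \p Idx). Returns std::nullopt when the chain cannot
/// even be scheduled, otherwise whether a profitable tree was vectorized;
/// the canonical graph size is reported through \p Size.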
std::optional<bool>
SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
                                       unsigned Idx, unsigned MinVF,
                                       unsigned &Size) {
  const unsigned Sz = R.getVectorElementSize(Chain[0]);
  unsigned VF = Chain.size();

  if (!has_single_bit(Sz) ||
      !hasFullVectorsOrPowerOf2(
          *TTI, cast<StoreInst>(Chain.front())->getValueOperand()->getType(),
          VF) ||
      VF < 2 || VF < MinVF) {
    // Check if vectorizing with a non-power-of-2 VF should be considered. At
    // the moment, only consider cases where VF + 1 is a power-of-2, i.e.
    // almost all vector lanes are used.
    if (!VectorizeNonPowerOf2 || (VF < MinVF && VF + 1 != MinVF))
      return false;
  }

  LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx
                    << "\n");

  SetVector<Value *> ValOps;
  for (Value *V : Chain)
    ValOps.insert(cast<StoreInst>(V)->getValueOperand());
  // Operands are not same/alt opcodes or non-power-of-2 uniques - exit.
  InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
  InstructionsState S = Analysis.buildInstructionsState(
      ValOps.getArrayRef(), R, /*TryCopyableElementsVectorization=*/true);
  bool IsAllowedSize =
      hasFullVectorsOrPowerOf2(*TTI, ValOps.front()->getType(),
                               ValOps.size()) ||
      (VectorizeNonPowerOf2 && has_single_bit(ValOps.size() + 1));
  if ((!IsAllowedSize && S && S.getOpcode() != Instruction::Load &&
       (!S.getMainOp()->isSafeToRemove() ||
        any_of(ValOps.getArrayRef(),
               [&](Value *V) {
                 return !isa<ExtractElementInst>(V) &&
                        (V->getNumUses() > Chain.size() ||
                         any_of(V->users(), [&](User *U) {
                           return !Stores.contains(U);
                         }));
               }))) ||
      (ValOps.size() > Chain.size() / 2 && !S)) {
    Size = (!IsAllowedSize && S) ? 1 : 2;
    return false;
  }
  if (R.isLoadCombineCandidate(Chain))
    return true;
  R.buildTree(Chain);
  // Check if tree tiny and store itself or its value is not vectorized.
  if (R.isTreeTinyAndNotFullyVectorizable()) {
    if (R.isGathered(Chain.front()) ||
        R.isNotScheduled(cast<StoreInst>(Chain.front())->getValueOperand()))
      return std::nullopt;
    Size = R.getCanonicalGraphSize();
    return false;
  }
  if (R.isProfitableToReorder()) {
    R.reorderTopToBottom();
    R.reorderBottomToTop();
  }
  R.transformNodes();
  R.buildExternalUses();

  R.computeMinimumValueSizes();

  Size = R.getCanonicalGraphSize();
  if (S && S.getOpcode() == Instruction::Load)
    Size = 2; // cut off masked gather small trees
  InstructionCost Cost = R.getTreeCost();

  LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF=" << VF
                    << "\n");
  if (Cost < -SLPCostThreshold) {
    LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost = " << Cost << "\n");

    using namespace ore;

    R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",
                                        cast<StoreInst>(Chain[0]))
                     << "Stores SLP vectorized with cost " << NV("Cost", Cost)
                     << " and with tree size "
                     << NV("TreeSize", R.getTreeSize()));

    R.vectorizeTree();
    return true;
  }

  return false;
}
/// Checks if the quadratic mean deviation is less than 90% of the mean size.
static bool checkTreeSizes(ArrayRef<std::pair<unsigned, unsigned>> Sizes,
                           bool First) {
  unsigned Num = 0;
  uint64_t Sum = std::accumulate(
      Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
      [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
        unsigned Size = First ? Val.first : Val.second;
        if (Size == 1)
          return V;
        ++Num;
        return V + Size;
      });
  if (Num == 0)
    return true;
  uint64_t Mean = Sum / Num;
  if (Mean == 0)
    return true;
  uint64_t Dev = std::accumulate(
                     Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
                     [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
                       unsigned P = First ? Val.first : Val.second;
                       if (P == 1)
                         return V;
                       return V + (P - Mean) * (P - Mean);
                     }) /
                 Num;
  return Dev * 96 / (Mean * Mean) == 0;
}
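
/// Tracks a group of related stores whose pointers differ from a common base
/// store by known constant offsets, keyed by that distance.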
class RelatedStoreInsts {
public:
  RelatedStoreInsts(unsigned BaseInstrIdx, ArrayRef<StoreInst *> AllStores)
      : AllStores(AllStores) {
    reset(BaseInstrIdx);
  }

  void reset(unsigned NewBaseInstr) {
    assert(NewBaseInstr < AllStores.size() &&
           "Instruction index out of bounds");
    BaseInstrIdx = NewBaseInstr;
    Instrs.clear();
    insertOrLookup(NewBaseInstr, 0);
  }

  /// Tries to insert \p InstrIdx as the store with a pointer distance of
  /// \p PtrDist. Does nothing if there is already a store with that
  /// \p PtrDist. \returns The previously associated instruction index, or
  /// std::nullopt.
  std::optional<unsigned> insertOrLookup(unsigned InstrIdx, int64_t PtrDist) {
    auto [It, Inserted] = Instrs.emplace(PtrDist, InstrIdx);
    return Inserted ? std::nullopt : std::make_optional(It->second);
  }

  using DistToInstMap = std::map<int64_t, unsigned>;
  const DistToInstMap &getStores() const { return Instrs; }

  /// If \p SI is related to this group of stores, return the distance of its
  /// pointer operand to the one of the group's base instruction.
  std::optional<int64_t> getPointerDiff(StoreInst &SI, const DataLayout &DL,
                                        ScalarEvolution &SE) const {
    StoreInst &BaseStore = *AllStores[BaseInstrIdx];
    return getPointersDiff(
        BaseStore.getValueOperand()->getType(), BaseStore.getPointerOperand(),
        SI.getValueOperand()->getType(), SI.getPointerOperand(), DL, SE,
        /*StrictCheck=*/true);
  }

  /// Recompute the pointer distances to be based on \p NewBaseInstIdx.
  /// Stores whose index is less than \p MinSafeIdx will be dropped.
  void rebase(unsigned MinSafeIdx, unsigned NewBaseInstIdx,
              int64_t DistFromCurBase) {
    DistToInstMap PrevSet = std::move(Instrs);
    reset(NewBaseInstIdx);

    // Re-insert stores that come after MinSafeIdx to try to vectorize them
    // again. Their distance needs to be updated.
    for (auto [Dist, InstIdx] : PrevSet) {
      if (InstIdx >= MinSafeIdx)
        insertOrLookup(InstIdx, Dist - DistFromCurBase);
    }
  }

  /// Remove all the stores that have been vectorized from this group.
  void clearVectorizedStores(const BoUpSLP::ValueSet &VectorizedStores) {
    DistToInstMap::reverse_iterator LastVectorizedStore = find_if(
        reverse(Instrs), [&](const std::pair<int64_t, unsigned> &DistAndIdx) {
          return VectorizedStores.contains(AllStores[DistAndIdx.second]);
        });

    // Get a forward iterator pointing after the last vectorized store and
    // erase all stores before it so we don't try to vectorize them again.
    DistToInstMap::iterator VectorizedStoresEnd = LastVectorizedStore.base();
    Instrs.erase(Instrs.begin(), VectorizedStoresEnd);
  }

private:
  /// The index of the base instruction, i.e. the one with pointer distance 0.
  unsigned BaseInstrIdx;

  /// Maps a pointer distance from \p BaseInstrIdx to a store index.
  DistToInstMap Instrs;

  /// Reference to all the stores in the BB being analyzed.
  ArrayRef<StoreInst *> AllStores;
};
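
/// Groups the collected stores into sequences with consecutive addresses and
/// repeatedly tries to vectorize each sequence, probing several vector
/// factors before giving up on a range.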
bool SLPVectorizerPass::vectorizeStores(
    ArrayRef<StoreInst *> Stores, BoUpSLP &R,
    SmallVectorImpl<RelatedStoreInsts> &SortedStores) {
  // We may run into multiple chains that merge into a single chain. We mark
  // the stores that we vectorized so that we don't visit the same store
  // twice.
  BoUpSLP::ValueSet VectorizedStores;
  bool Changed = false;

  // Stores the ranges (keyed by their first/last stores and size) that were
  // already tried, so unsuccessful attempts are not repeated.
  DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>> Visited;

  auto TryToVectorize = [&](const RelatedStoreInsts::DistToInstMap &StoreSeq) {
    int64_t PrevDist = -1;
    BoUpSLP::ValueList Operands;
    // Collect the chain into a list.
    for (auto [Idx, Data] : enumerate(StoreSeq)) {
      auto &[Dist, InstIdx] = Data;
      if (Operands.empty() || Dist - PrevDist == 1) {
        Operands.push_back(Stores[InstIdx]);
        PrevDist = Dist;
        if (Idx != StoreSeq.size() - 1)
          continue;
      }
      // ...
      if (Operands.size() <= 1 ||
          !Visited
               .insert({Operands.front(),
                        cast<StoreInst>(Operands.front())->getValueOperand(),
                        Operands.back(),
                        cast<StoreInst>(Operands.back())->getValueOperand(),
                        Operands.size()})
               .second)
        continue;

      unsigned MaxVecRegSize = R.getMaxVecRegSize();
      unsigned EltSize = R.getVectorElementSize(Operands[0]);
      unsigned MaxElts = llvm::bit_floor(MaxVecRegSize / EltSize);

      unsigned MaxVF =
          std::min(R.getMaximumVF(EltSize, Instruction::Store), MaxElts);
      auto *Store = cast<StoreInst>(Operands[0]);
      Type *StoreTy = Store->getValueOperand()->getType();
      Type *ValueTy = StoreTy;
      if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand()))
        ValueTy = Trunc->getSrcTy();
      Type *StoreScalarTy = StoreTy->getScalarType();
      unsigned MinVF = PowerOf2Ceil(TTI->getStoreMinimumVF(
          R.getMinVF(DL->getTypeStoreSizeInBits(StoreScalarTy)), StoreScalarTy,
          ValueTy->getScalarType()));
      MinVF = std::max<unsigned>(2, MinVF);

      if (MaxVF < MinVF) {
        LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF ("
                          << MaxVF << ") < "
                          << "MinVF (" << MinVF << ")\n");
        continue;
      }
      unsigned NonPowerOf2VF = 0;
      if (VectorizeNonPowerOf2) {
        // First try vectorizing with a non-power-of-2 VF. At the moment,
        // only consider cases where VF + 1 is a power-of-2, i.e. almost all
        // vector lanes are used.
        unsigned CandVF = std::clamp<unsigned>(Operands.size(), MinVF, MaxVF);
        if (has_single_bit(CandVF + 1)) {
          NonPowerOf2VF = CandVF;
          assert(NonPowerOf2VF != MaxVF &&
                 "Non-power-of-2 VF should not be equal to MaxVF");
        }
      }

      // MaxRegVF represents the number of instructions (scalar, or vector in
      // the case of revec) that can be vectorized to naturally fit in a
      // vector register.
      unsigned MaxRegVF = MaxVF;

      MaxVF = std::min<unsigned>(MaxVF, bit_floor(Operands.size()));
      if (MaxVF < MinVF) {
        LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF ("
                          << MaxVF << ") < "
                          << "MinVF (" << MinVF << ")\n");
        continue;
      }

      SmallVector<unsigned> CandidateVFs;
      for (unsigned VF = std::max(MaxVF, NonPowerOf2VF); VF >= MinVF;
           VF = divideCeil(VF, 2))
        CandidateVFs.push_back(VF);

      unsigned End = Operands.size();
      unsigned Repeat = 0;
      constexpr unsigned MaxAttempts = 4;
      OwningArrayRef<std::pair<unsigned, unsigned>> RangeSizes(
          Operands.size());
      for (std::pair<unsigned, unsigned> &P : RangeSizes)
        P.first = P.second = 1;
      DenseMap<Value *, std::pair<unsigned, unsigned>> NonSchedulable;
      auto IsNotVectorized = [](bool First,
                                const std::pair<unsigned, unsigned> &P) {
        return First ? P.first > 0 : P.second > 0;
      };
      auto IsVectorized = [](bool First,
                             const std::pair<unsigned, unsigned> &P) {
        return First ? P.first == 0 : P.second == 0;
      };
      auto VFIsProfitable = [](bool First, unsigned Size,
                               const std::pair<unsigned, unsigned> &P) {
        return First ? Size >= P.first : Size >= P.second;
      };
      auto FirstSizeSame = [](unsigned Size,
                              const std::pair<unsigned, unsigned> &P) {
        return Size == P.first;
      };
23205 bool RepeatChanged =
false;
23206 bool AnyProfitableGraph =
false;
23207 for (
unsigned VF : CandidateVFs) {
23208 AnyProfitableGraph =
false;
23209 unsigned FirstUnvecStore =
23210 std::distance(RangeSizes.begin(),
23211 find_if(RangeSizes, std::bind(IsNotVectorized,
23212 VF >= MaxRegVF, _1)));
23216 while (FirstUnvecStore < End) {
23217 unsigned FirstVecStore = std::distance(
23218 RangeSizes.begin(),
23219 find_if(RangeSizes.drop_front(FirstUnvecStore),
23220 std::bind(IsVectorized, VF >= MaxRegVF, _1)));
23221 unsigned MaxSliceEnd = FirstVecStore >= End ? End : FirstVecStore;
23222 for (
unsigned SliceStartIdx = FirstUnvecStore;
23223 SliceStartIdx + VF <= MaxSliceEnd;) {
23234 ->getValueOperand()
23237 ->getValueOperand()
23240 "Expected all operands of same type.");
23241 if (!NonSchedulable.
empty()) {
23242 auto [NonSchedSizeMax, NonSchedSizeMin] =
23244 if (NonSchedSizeMax > 0 && NonSchedSizeMin <= VF) {
23247 SliceStartIdx += NonSchedSizeMax;
23252 std::optional<bool> Res =
23253 vectorizeStoreChain(Slice, R, SliceStartIdx, MinVF, TreeSize);
23259 .first->getSecond()
23267 AnyProfitableGraph = RepeatChanged =
Changed =
true;
23270 for (std::pair<unsigned, unsigned> &
P :
23271 RangeSizes.slice(SliceStartIdx, VF))
23272 P.first =
P.second = 0;
23273 if (SliceStartIdx < FirstUnvecStore + MinVF) {
23274 for (std::pair<unsigned, unsigned> &
P : RangeSizes.slice(
23275 FirstUnvecStore, SliceStartIdx - FirstUnvecStore))
23276 P.first =
P.second = 0;
23277 FirstUnvecStore = SliceStartIdx + VF;
23279 if (SliceStartIdx > MaxSliceEnd - VF - MinVF) {
23280 for (std::pair<unsigned, unsigned> &
P :
23281 RangeSizes.slice(SliceStartIdx + VF,
23282 MaxSliceEnd - (SliceStartIdx + VF)))
23283 P.first =
P.second = 0;
23284 if (MaxSliceEnd == End)
23285 End = SliceStartIdx;
23286 MaxSliceEnd = SliceStartIdx;
23288 SliceStartIdx += VF;
23291 if (VF > 2 && Res &&
23292 !
all_of(RangeSizes.slice(SliceStartIdx, VF),
23293 std::bind(VFIsProfitable, VF >= MaxRegVF, TreeSize,
23295 SliceStartIdx += VF;
23300 if (VF > MaxRegVF && TreeSize > 1 &&
23301 all_of(RangeSizes.slice(SliceStartIdx, VF),
23302 std::bind(FirstSizeSame, TreeSize, _1))) {
23303 SliceStartIdx += VF;
23304 while (SliceStartIdx != MaxSliceEnd &&
23305 RangeSizes[SliceStartIdx].first == TreeSize)
23309 if (TreeSize > 1) {
23310 for (std::pair<unsigned, unsigned> &
P :
23311 RangeSizes.slice(SliceStartIdx, VF)) {
23312 if (VF >= MaxRegVF)
23313 P.second = std::max(
P.second, TreeSize);
23315 P.first = std::max(
P.first, TreeSize);
23319 AnyProfitableGraph =
true;
23321 if (FirstUnvecStore >= End)
23323 if (MaxSliceEnd - FirstUnvecStore < VF &&
23324 MaxSliceEnd - FirstUnvecStore >= MinVF)
23325 AnyProfitableGraph =
true;
23326 FirstUnvecStore = std::distance(
23327 RangeSizes.begin(),
23328 find_if(RangeSizes.drop_front(MaxSliceEnd),
23329 std::bind(IsNotVectorized, VF >= MaxRegVF, _1)));
23331 if (!AnyProfitableGraph && VF >= MaxRegVF &&
has_single_bit(VF))
23335 if (
all_of(RangeSizes, [](
const std::pair<unsigned, unsigned> &
P) {
23336 return P.first == 0 &&
P.second == 0;
23340 if (Repeat >= MaxAttempts ||
23341 (Repeat > 1 && (RepeatChanged || !AnyProfitableGraph)))
23343 constexpr unsigned StoresLimit = 64;
23344 const unsigned MaxTotalNum = std::min<unsigned>(
23346 static_cast<unsigned>(
23349 RangeSizes.begin(),
23350 find_if(RangeSizes, std::bind(IsNotVectorized,
true, _1))) +
23352 unsigned VF =
bit_ceil(CandidateVFs.front()) * 2;
23355 CandidateVFs.clear();
23357 CandidateVFs.push_back(Limit);
23358 if (VF > MaxTotalNum || VF >= StoresLimit)
23360 for (std::pair<unsigned, unsigned> &
P : RangeSizes) {
23362 P.first = std::max(
P.second,
P.first);
23366 CandidateVFs.push_back(VF);
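      // Illustrative note (assumption, not from the original source):
      // RangeSizes keeps one (first, second) pair per candidate store;
      // 'first' tracks the best tree size seen for in-register VFs, 'second'
      // the one for VFs above MaxRegVF, and (0, 0) marks a store as already
      // vectorized. For example, with 8 stores and CandidateVFs = {4, 2}, a
      // successful VF=4 attempt at index 0 zeroes entries [0, 4) so the VF=2
      // pass only reconsiders the remaining tail.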
  // Appends store (Idx, SI) to an existing sequence of related stores, or
  // starts a new sequence if no existing one is at a known pointer distance.
  auto FillStoresSet = [&](unsigned Idx, StoreInst *SI) {
    std::optional<int64_t> PtrDist;
    auto *RelatedStores = find_if(
        SortedStores, [&PtrDist, SI, this](const RelatedStoreInsts &StoreSeq) {
          PtrDist = StoreSeq.getPointerDiff(*SI, *DL, *SE);
          return PtrDist.has_value();
        });
    // We did not find a comparable store, start a new sequence.
    if (RelatedStores == SortedStores.end()) {
      // ...
      return;
    }
    // If we saw a store at this distance before, vectorize what we have and
    // rebase the sequence past the previous occurrence.
    if (std::optional<unsigned> PrevInst =
            RelatedStores->insertOrLookup(Idx, *PtrDist)) {
      TryToVectorize(RelatedStores->getStores());
      RelatedStores->clearVectorizedStores(VectorizedStores);
      RelatedStores->rebase(*PrevInst + 1, /* ... */);
    }
  };

  Type *PrevValTy = nullptr;
  for (auto [I, SI] : enumerate(Stores)) {
    if (R.isDeleted(SI))
      continue;
    if (!PrevValTy)
      PrevValTy = SI->getValueOperand()->getType();
    // When the type of the stored value changes, flush the sequences.
    if (PrevValTy != SI->getValueOperand()->getType()) {
      for (RelatedStoreInsts &StoreSeq : SortedStores)
        TryToVectorize(StoreSeq.getStores());
      SortedStores.clear();
      PrevValTy = SI->getValueOperand()->getType();
    }
    FillStoresSet(I, SI);
  }

  // Final vectorization attempt.
  for (RelatedStoreInsts &StoreSeq : SortedStores)
    TryToVectorize(StoreSeq.getStores());
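  // Illustrative example (assumed IR, not from the original source): the
  // stores below form one RelatedStoreInsts group with pointer distances
  // {0, 1, 2, 3} from the first store, so TryToVectorize can attempt a
  // single <4 x i32> store:
  //
  //   store i32 %a, ptr %p
  //   %p1 = getelementptr inbounds i32, ptr %p, i64 1
  //   store i32 %b, ptr %p1
  //   %p2 = getelementptr inbounds i32, ptr %p, i64 2
  //   store i32 %c, ptr %p2
  //   %p3 = getelementptr inbounds i32, ptr %p, i64 3
  //   store i32 %d, ptr %p3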
void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
  // Visit the store and getelementptr instructions in BB and organize them
  // according to the underlying object of their pointer operands.
  for (Instruction &I : *BB) {
    // Ignore store instructions that are volatile or have a pointer operand
    // that doesn't point to a scalar type.
    if (auto *SI = dyn_cast<StoreInst>(&I)) {
      if (!SI->isSimple())
        continue;
      // ...
    }
    // Ignore getelementptr instructions that have more than one index, a
    // constant index, or a pointer operand that doesn't point to a scalar
    // type.
    if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
      if (GEP->getNumIndices() != 1)
        continue;
      Value *Idx = GEP->idx_begin()->get();
      // ...
      if (GEP->getType()->isVectorTy())
        continue;
      // ...
    }
  }
}
bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
                                           bool MaxVFOnly) {
  LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = "
                    << VL.size() << ".\n");
  // ...
  for (Value *V : VL) {
    Type *Ty = V->getType();
    if (!isValidElementType(Ty)) {
      // ...
      R.getORE()->emit([&]() {
        std::string TypeStr;
        llvm::raw_string_ostream OS(TypeStr);
        OS << *Ty;
        return OptimizationRemarkMissed(SV_NAME, "UnsupportedType", I0)
               << "Cannot SLP vectorize list: type "
               << TypeStr + " is unsupported by vectorizer";
      });
      return false;
    }
  }

  unsigned Sz = R.getVectorElementSize(I0);
  unsigned MinVF = R.getMinVF(Sz);
  unsigned MaxVF = std::max<unsigned>(/* ... */, MinVF);
  MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
  if (MaxVF < 2) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0)
             << "Cannot SLP vectorize list: vectorization factor "
             << "less than 2 is not supported";
    });
    return false;
  }

  bool Changed = false;
  bool CandidateFound = false;
  // ...
  unsigned NextInst = 0, MaxInst = VL.size();
  for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF;
       /* ... halve VF ... */) {
    // No actual vectorization should happen if the number of parts is the
    // same as the vector factor itself.
    if (TTI->getNumberOfParts(VecTy) == VF)
      continue;
    for (unsigned I = NextInst; I < MaxInst; ++I) {
      unsigned ActualVF = std::min(MaxInst - I, VF);
      // ...
      if (MaxVFOnly && ActualVF < MaxVF)
        break;
      if ((VF > MinVF && ActualVF < VF) || (VF == MinVF && ActualVF < 2))
        break;
      // Collect the next ActualVF candidates, skipping deleted instructions.
      unsigned Idx = 0;
      for (Value *V : VL.drop_front(I)) {
        if (auto *Inst = dyn_cast<Instruction>(V);
            !Inst || !R.isDeleted(Inst)) {
          // ...
          ++Idx;
          if (Idx == ActualVF)
            break;
        }
      }
      // Not enough candidates - try the next position.
      if (Idx != ActualVF)
        continue;

      LLVM_DEBUG(dbgs() << "SLP: Analyzing " << ActualVF << " operations "
                        << "\n");
      // ...
      if (R.isTreeTinyAndNotFullyVectorizable())
        continue;
      if (R.isProfitableToReorder()) {
        R.reorderTopToBottom();
        // ...
      }
      R.transformNodes();
      R.buildExternalUses();
      // ...
      R.computeMinimumValueSizes();
      // ...
      CandidateFound = true;
      MinCost = std::min(MinCost, Cost);
      LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
                        << " for VF=" << ActualVF << "\n");
      if (Cost < -SLPCostThreshold) {
        // ...
        R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList",
                                            cast<Instruction>(Ops[0]))
                         << "SLP vectorized with cost " << ore::NV("Cost", Cost)
                         << " and with tree size "
                         << ore::NV("TreeSize", R.getTreeSize()));
        // ... (vectorize the tree and advance past the bundle)
        Changed = true;
      }
    }
  }

  if (!Changed && CandidateFound) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "NotBeneficial", I0)
             << "List vectorization was possible but not beneficial with cost "
             << ore::NV("Cost", MinCost) << " >= "
             << ore::NV("Treshold", -SLPCostThreshold);
    });
  } else if (!Changed) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "NotPossible", I0)
             << "Cannot SLP vectorize list: vectorization was impossible"
             << " with available vectorization factors";
    });
  }
  return Changed;
}
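// Illustrative note (assumption, not from the original source): for
// VL = {%a0, %a1, %a2, %a3} of type i32 with MaxVF = 4 and MinVF = 2, the
// loop above first tries one bundle of 4 and, only if that is not profitable,
// retries with bundles of 2; the "NotBeneficial" remark then reports the
// minimum cost seen across all attempted vectorization factors.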
class HorizontalReduction {
  using ReductionOpsType = SmallVector<Value *, 16>;
  using ReductionOpsListType = SmallVector<ReductionOpsType, 2>;
  ReductionOpsListType ReductionOps;
  // ...
  SmallDenseMap<Value *, SmallVector<Instruction *>, 16> ReducedValsToOps;
  WeakTrackingVH ReductionRoot;
  // ...
  /// Checks if the optimization of original scalar identity operations on
  /// matched horizontal reductions is enabled and allowed.
  bool IsSupportedHorRdxIdentityOp = false;

  // ...
  static bool isCmpSelMinMax(Instruction *I) {
    // ... (matches the cmp + select min/max idiom)
  }

  // And/or are potentially poison-safe logical patterns like:
  //   select x, y, false
  //   select x, true, y
  static bool isBoolLogicOp(Instruction *I) {
    // ...
  }

  /// Checks if instruction is associative and can be vectorized.
  static bool isVectorizable(RecurKind Kind, Instruction *I,
                             bool TwoElementReduction = false) {
    if (Kind == RecurKind::None)
      return false;
    // ...
    // No need to check for associativity of a two-element reduction.
    if (TwoElementReduction)
      return true;
    // ...
    if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
      // FP min/max are associative except for NaN and -0.0.
      return I->getFastMathFlags().noNaNs();
    }
    if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum)
      return true;
    return I->isAssociative();
  }

  static Value *getRdxOperand(Instruction *I, unsigned Index) {
    // For a select-based min/max the second value operand is operand 2 of
    // the select. (Guard lost in the source; see the call sites.)
    if (/* ... */)
      return I->getOperand(2);
    return I->getOperand(Index);
  }

  /// Creates reduction operation with the current opcode.
  static Value *createOp(IRBuilderBase &Builder, RecurKind Kind, Value *LHS,
                         Value *RHS, const Twine &Name, bool UseSelect) {
    switch (Kind) {
    case RecurKind::Or: {
      // ... (poison-safe 'or' via select when UseSelect)
    }
    case RecurKind::And: {
      // ... (poison-safe 'and' via select when UseSelect)
    }
    case RecurKind::Add:
    case RecurKind::Mul:
    case RecurKind::Xor:
    case RecurKind::FAdd:
    case RecurKind::FMul: {
      // ... (plain binary operator)
    }
    case RecurKind::SMax:
    case RecurKind::SMin:
    case RecurKind::UMax:
    case RecurKind::UMin:
      // ... (cmp + select, or the matching integer min/max intrinsic)
    case RecurKind::FMax:
    case RecurKind::FMin:
    case RecurKind::FMaximum:
    case RecurKind::FMinimum:
    case RecurKind::FMaximumNum:
    case RecurKind::FMinimumNum: {
      // ... (the matching FP min/max intrinsic)
    }
    default:
      // ...
    }
  }

  /// Creates reduction operation with the current opcode with the IR flags
  /// from \p ReductionOps.
  static Value *createOp(IRBuilderBase &Builder, RecurKind RdxKind, Value *LHS,
                         Value *RHS, const Twine &Name,
                         const ReductionOpsListType &ReductionOps) {
    bool UseSelect = ReductionOps.size() == 2 ||
                     // Logical or/and.
                     (ReductionOps.size() == 1 && /* ... */);
    assert((!UseSelect || ReductionOps.size() != 2 || /* ... */) &&
           "Expected cmp + select pairs for reduction");
    Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name, UseSelect);
    // ... (propagate the IR flags from ReductionOps onto Op)
    return Op;
  }
  /// Gets recurrence kind for the given value.
  static RecurKind getRdxKind(Value *V) {
    // Matched in order (the pattern matchers guarding each return are not
    // recoverable here): integer add/mul, and/or/xor, fadd/fmul, the
    // fmax/fmin/fmaximum/fminimum and smax/smin/umax/umin intrinsics, and
    // finally the cmp + select min/max idioms (yielding SMax/SMin/UMax/UMin).
    // Unmatched values and malformed cmp + select pairs yield
    // RecurKind::None.
    // ...
    return RecurKind::None;
  }
  /// Gets the index of the first operand.
  static unsigned getFirstOperandIndex(Instruction *I) {
    return isCmpSelMinMax(I) ? 1 : 0;
  }

  /// Total number of operands in the reduction operation.
  static unsigned getNumberOfOperands(Instruction *I) {
    return isCmpSelMinMax(I) ? 3 : 2;
  }

  /// Checks if the instruction is in basic block \p BB.
  /// For a cmp + select min/max reduction check that both the compare and the
  /// select are in \p BB.
  static bool hasSameParent(Instruction *I, BasicBlock *BB) {
    if (isCmpSelMinMax(I) || isBoolLogicOp(I)) {
      // ...
      return Sel->getParent() == BB && Cmp && Cmp->getParent() == BB;
    }
    return I->getParent() == BB;
  }

  /// Expected number of uses for reduction operations/reduced values.
  static bool hasRequiredNumberOfUses(bool IsCmpSelMinMax, Instruction *I) {
    if (IsCmpSelMinMax) {
      // SelectInst must be used twice while the condition op must have a
      // single use only.
      if (auto *Sel = dyn_cast<SelectInst>(I))
        return Sel->hasNUses(2) && Sel->getCondition()->hasOneUse();
      return I->hasNUses(2);
    }
    // Arithmetic reduction operation must be used once only.
    return I->hasOneUse();
  }

  /// Initializes the list of reduction operations.
  void initReductionOps(Instruction *I) {
    if (isCmpSelMinMax(I))
      ReductionOps.assign(2, ReductionOpsType());
    else
      ReductionOps.assign(1, ReductionOpsType());
  }

  /// Add all reduction operations for the reduction instruction \p I.
  void addReductionOps(Instruction *I) {
    if (isCmpSelMinMax(I)) {
      // ... (the compare goes into ReductionOps[0])
      ReductionOps[1].emplace_back(I);
    } else {
      ReductionOps[0].emplace_back(I);
    }
  }

  static bool isGoodForReduction(ArrayRef<Value *> Data) {
    int Sz = Data.size();
    // ...
  }

public:
  // ...
  HorizontalReduction(Instruction *I, ArrayRef<Value *> Ops)
      : ReductionRoot(I), ReductionLimit(2) {
    RdxKind = HorizontalReduction::getRdxKind(I);
    ReductionOps.emplace_back().push_back(I);
    // ...
    for (Value *V : Ops)
      ReducedValsToOps[V].push_back(I);
  }

  bool matchReductionForOperands() const {
    // Analyze "regular" integer/FP types for reductions - no target-specific
    // types or pointers.
    assert(ReductionRoot && "Reduction root is not set!");
    // ...
    return Ops.size() == 2;
  }
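  // Illustrative example (assumed IR, not from the original source): a
  // cmp + select min/max reduction step matched by isCmpSelMinMax():
  //
  //   %c = icmp slt i32 %x, %y
  //   %m = select i1 %c, i32 %x, i32 %y   ; smin(%x, %y)
  //
  // Here getFirstOperandIndex() == 1 and getNumberOfOperands() == 3, so only
  // the two value operands of the select are treated as reduction edges.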
  bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root,
                                 ScalarEvolution &SE, const DataLayout &DL,
                                 const TargetLibraryInfo &TLI) {
    RdxKind = HorizontalReduction::getRdxKind(Root);
    if (!isVectorizable(RdxKind, Root))
      return false;
    // ... (for a select root, its condition must have a single use:)
    //   if (!Sel->getCondition()->hasOneUse())
    //     return false;
    ReductionRoot = Root;
    // ...
    bool IsCmpSelMinMax = isCmpSelMinMax(Root);
    SmallVector<std::pair<Instruction *, unsigned>> Worklist(
        1, std::make_pair(Root, 0));
    // Checks the operands of \p TreeN: reduction operations continue the
    // traversal, everything else becomes a possible reduced value.
    auto CheckOperands = [&](Instruction *TreeN,
                             SmallVectorImpl<Value *> &PossibleReducedVals,
                             SmallVectorImpl<Instruction *> &ReductionOps,
                             unsigned Level) {
      for (int I : reverse(seq<int>(getFirstOperandIndex(TreeN),
                                    getNumberOfOperands(TreeN)))) {
        Value *EdgeVal = getRdxOperand(TreeN, I);
        ReducedValsToOps[EdgeVal].push_back(TreeN);
        // ...
        if (/* ... EdgeInst is not a matching reduction op ... */ ||
            IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
            !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
            !isVectorizable(RdxKind, EdgeInst) ||
            (R.isAnalyzedReductionRoot(EdgeInst) && /* ... */)) {
          PossibleReducedVals.push_back(EdgeVal);
          continue;
        }
        ReductionOps.push_back(EdgeInst);
      }
    };
    // Group the reduced values by keys, so that equal/related values are
    // counted together and can be scaled instead of widened.
    SmallMapVector<
        size_t, SmallMapVector<size_t, SmallMapVector<Value *, unsigned, 2>, 2>,
        8>
        PossibleReducedVals;
    initReductionOps(Root);
    // ...
    SmallSet<size_t, 2> LoadKeyUsed;

    auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
      // ...
      auto LIt = LoadsMap.find(std::make_pair(Key, Ptr));
      if (LIt != LoadsMap.end()) {
        for (LoadInst *RLI : LIt->second) {
          // ... (prefer a related load with a known constant distance)
        }
        for (LoadInst *RLI : LIt->second) {
          // ... (fall back to loads sharing the pointer operand)
        }
        if (LIt->second.size() > 2) {
          // ...
          hash_value(LIt->second.back()->getPointerOperand());
          // ...
        }
      }
      // ...
      /* LoadsMap entry */.first->second.push_back(LI);
      return Key;
    };

    while (!Worklist.empty()) {
      auto [TreeN, Level] = Worklist.pop_back_val();
      SmallVector<Value *> PossibleRedVals;
      SmallVector<Instruction *> PossibleReductionOps;
      CheckOperands(TreeN, PossibleRedVals, PossibleReductionOps, Level);
      addReductionOps(TreeN);
      // ...
      for (Value *V : PossibleRedVals) {
        // ... (compute Key/Idx for V, using GenerateLoadsSubkey for loads)
        ++PossibleReducedVals[Key][Idx].try_emplace(V, 0).first->second;
      }
      for (Instruction *I : reverse(PossibleReductionOps))
        Worklist.emplace_back(I, I->getParent() == BB ? 0 : Level + 1);
    }
    auto PossibleReducedValsVect = PossibleReducedVals.takeVector();
    // Sort values by the total number of values kinds to start the reduction
    // from the longest possible reduced values sequences.
    for (auto &PossibleReducedVals : PossibleReducedValsVect) {
      auto PossibleRedVals = PossibleReducedVals.second.takeVector();
      SmallVector<SmallVector<Value *>> PossibleRedValsVect;
      for (auto &Slice : PossibleRedVals) {
        PossibleRedValsVect.emplace_back();
        auto RedValsVect = Slice.second.takeVector();
        // ...
        for (const std::pair<Value *, unsigned> &Data : RedValsVect)
          PossibleRedValsVect.back().append(Data.second, Data.first);
      }
      stable_sort(PossibleRedValsVect, [](const auto &P1, const auto &P2) {
        return P1.size() > P2.size();
      });
      // ... (append each group; loads that are not good for reduction are
      //      split unless consecutive:)
      //   } else if (!isGoodForReduction(Data)) {
      //     if (!LI || !LastLI || ...)
      //       ...
      //   }
      ReducedVals.back().append(Data.rbegin(), Data.rend());
    }
    // Sort the reduced values by number of same/alternate opcode and/or type.
    stable_sort(ReducedVals, [](ArrayRef<Value *> P1, ArrayRef<Value *> P2) {
      return P1.size() > P2.size();
    });
    return true;
  }
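  // Illustrative example (assumed IR, not from the original source): for the
  // chain
  //
  //   %s0 = add i32 %a, %b
  //   %s1 = add i32 %s0, %c
  //   %s2 = add i32 %s1, %d   ; reduction root
  //
  // matchAssociativeReduction() gathers ReducedVals = {%a, %b, %c, %d} and
  // records %s0/%s1/%s2 in ReductionOps; repeated values are counted via the
  // nested PossibleReducedVals map so splats can be scaled instead of
  // widened.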
  /// Attempt to vectorize the tree found by matchAssociativeReduction.
  Value *tryToReduce(BoUpSLP &V, const DataLayout &DL,
                     TargetTransformInfo *TTI, const TargetLibraryInfo &TLI,
                     AssumptionCache *AC, DominatorTree &DT) {
    constexpr unsigned RegMaxNumber = 4;
    constexpr unsigned RedValsMaxNumber = 128;
    // If there are a sufficient number of reduction values, reduce
    // to a nearby power-of-2. We can safely generate oversized
    // vectors and rely on the backend to split them to legal sizes.
    if (unsigned NumReducedVals = std::accumulate(
            ReducedVals.begin(), ReducedVals.end(), 0,
            [](unsigned Num, ArrayRef<Value *> Vals) -> unsigned {
              if (!isGoodForReduction(Vals))
                return Num;
              return Num + Vals.size();
            });
        NumReducedVals < ReductionLimit && /* ... */) {
      for (ReductionOpsType &RdxOps : ReductionOps)
        for (Value *RdxOp : RdxOps)
          V.analyzedReductionRoot(cast<Instruction>(RdxOp));
      return nullptr;
    }

    IRBuilder<TargetFolder> Builder(ReductionRoot->getContext(),
                                    TargetFolder(DL));
    Builder.SetInsertPoint(cast<Instruction>(ReductionRoot));

    // Track the reduced values in case they are replaced by extractelement
    // because of the vectorization.
    DenseMap<Value *, WeakTrackingVH> TrackedVals(ReducedVals.size() *
                                                  ReducedVals.front().size());
    // ...
    // The compare instruction of a min/max is the insertion point for new
    // instructions and may be replaced with a new compare instruction.
    auto &&GetCmpForMinMaxReduction = [](Instruction *RdxRootInst) {
      assert(isa<SelectInst>(RdxRootInst) &&
             "Expected min/max reduction to have select root instruction");
      Value *ScalarCond = cast<SelectInst>(RdxRootInst)->getCondition();
      assert(isa<Instruction>(ScalarCond) &&
             "Expected min/max reduction to have compare condition");
      return cast<Instruction>(ScalarCond);
    };

    bool AnyBoolLogicOp = any_of(ReductionOps.back(), [](Value *V) {
      return isBoolLogicOp(cast<Instruction>(V));
    });

    // Return new VectorizedTree, based on the previous value.
    auto GetNewVectorizedTree = [&](Value *VectorizedTree, Value *Res) {
      if (VectorizedTree) {
        // Update the final value in the reduction.
        // ...
        if (AnyBoolLogicOp) {
          auto It = ReducedValsToOps.find(VectorizedTree);
          auto It1 = ReducedValsToOps.find(Res);
          if ((It == ReducedValsToOps.end() &&
               It1 == ReducedValsToOps.end()) ||
              /* ... VectorizedTree is known not to be poison ... */ ||
              (It != ReducedValsToOps.end() &&
               any_of(It->getSecond(), [&](Instruction *I) {
                 return isBoolLogicOp(I) &&
                        getRdxOperand(I, 0) == VectorizedTree;
               }))) {
            ; // The LHS is safe as-is.
          } else if (/* ... Res is known not to be poison ... */ ||
                     (It1 != ReducedValsToOps.end() &&
                      any_of(It1->getSecond(), [&](Instruction *I) {
                        return isBoolLogicOp(I) &&
                               getRdxOperand(I, 0) == Res;
                      }))) {
            std::swap(VectorizedTree, Res);
          } else {
            VectorizedTree = Builder.CreateFreeze(VectorizedTree);
          }
        }
        return createOp(Builder, RdxKind, VectorizedTree, Res, "op.rdx",
                        ReductionOps);
      }
      // Initialize the final value in the reduction.
      return Res;
    };

    // Track the non-reduction operations to avoid extra vectorization
    // attempts.
    SmallDenseSet<Value *> IgnoreList(ReductionOps.size() *
                                      ReductionOps.front().size());
    for (ReductionOpsType &RdxOps : ReductionOps)
      for (Value *RdxOp : RdxOps) {
        // ...
        IgnoreList.insert(RdxOp);
      }
    // Intersect the fast-math flags from all reduction operations.
    FastMathFlags RdxFMF;
    RdxFMF.set();
    for (Value *U : IgnoreList)
      if (auto *FPMO = dyn_cast<FPMathOperator>(U))
        RdxFMF &= FPMO->getFastMathFlags();
    // ...
    for (Value *V : Candidates)
      TrackedVals.try_emplace(V, V);

    auto At = [](SmallMapVector<Value *, unsigned, 16> &MV,
                 Value *V) -> unsigned & {
      auto *It = MV.find(V);
      assert(It != MV.end() && "Unable to find given key.");
      return It->second;
    };

    DenseMap<Value *, unsigned> VectorizedVals(ReducedVals.size());
    // ...
    SmallPtrSet<Value *, 4> RequiredExtract;
    WeakTrackingVH VectorizedTree = nullptr;
    bool CheckForReusedReductionOps = false;
    // Try to vectorize elements based on their type.
    // ...
    for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) {
      ArrayRef<Value *> OrigReducedVals = ReducedVals[I];
      InstructionsState S = States[I];
      SmallVector<Value *> Candidates;
      DenseMap<Value *, Value *> TrackedToOrig(2 * OrigReducedVals.size());
      for (Value *ReducedVal : OrigReducedVals) {
        Value *RdxVal = TrackedVals.at(ReducedVal);
        // Skip values that no longer match the expected state, e.g. the
        // instruction was replaced during a previous attempt:
        //   (!S || !S.getMatchingMainOpOrAltOp(Inst)) || ...
        // ...
        Candidates.push_back(RdxVal);
        TrackedToOrig.try_emplace(RdxVal, ReducedVal);
      }
      bool ShuffledExtracts = false;
      // Try to handle shuffled extractelements.
      if (S && S.getOpcode() == Instruction::ExtractElement &&
          !S.isAltShuffle() && I + 1 < E) {
        SmallVector<Value *> CommonCandidates(Candidates);
        for (Value *RV : ReducedVals[I + 1]) {
          Value *RdxVal = TrackedVals.at(RV);
          // ...
          CommonCandidates.push_back(RdxVal);
          TrackedToOrig.try_emplace(RdxVal, RV);
        }
        SmallVector<int> Mask;
        if (/* ... the combined extracts form a fixed vector shuffle ... */) {
          ++I;
          Candidates.swap(CommonCandidates);
          ShuffledExtracts = true;
        }
      }

      // Emit code for constant values.
      if (/* ... all candidates are constant ... */) {
        Value *OrigV = TrackedToOrig.at(Candidates.front());
        ++VectorizedVals.try_emplace(OrigV).first->getSecond();
        for (Value *VC : ArrayRef(Candidates).drop_front()) {
          Res = createOp(Builder, RdxKind, Res, VC, "const.rdx",
                         ReductionOps);
          Value *OrigV = TrackedToOrig.at(VC);
          ++VectorizedVals.try_emplace(OrigV).first->getSecond();
          if (auto *ResI = dyn_cast<Instruction>(Res))
            V.analyzedReductionRoot(ResI);
        }
        VectorizedTree = GetNewVectorizedTree(VectorizedTree, Res);
        continue;
      }

      unsigned NumReducedVals = Candidates.size();
      if (NumReducedVals < ReductionLimit &&
          (NumReducedVals < 2 || !isSplat(Candidates)))
        continue;

      // Check if we support repeated scalar values processing (optimization
      // of original scalar identity operations on matched horizontal
      // reductions).
      IsSupportedHorRdxIdentityOp = RdxKind != RecurKind::Mul &&
                                    RdxKind != RecurKind::FMul &&
                                    RdxKind != RecurKind::FMulAdd;
      // ...
      SmallMapVector<Value *, unsigned, 16> SameValuesCounter;
      if (IsSupportedHorRdxIdentityOp)
        for (Value *V : Candidates) {
          Value *OrigV = TrackedToOrig.at(V);
          ++SameValuesCounter.try_emplace(OrigV).first->second;
        }
      // ...
      // Each reduced value is multiplied by its scale factor; if the factor
      // is the same for all values, the scale can be applied once after the
      // final reduction instead.
      bool SameScaleFactor = false;
      bool OptReusedScalars = IsSupportedHorRdxIdentityOp &&
                              SameValuesCounter.size() != Candidates.size();
      if (OptReusedScalars) {
        SameScaleFactor =
            (RdxKind == RecurKind::Add || RdxKind == RecurKind::FAdd ||
             RdxKind == RecurKind::Xor) &&
            all_of(drop_begin(SameValuesCounter),
                   [&SameValuesCounter](
                       const std::pair<Value *, unsigned> &P) {
                     return P.second == SameValuesCounter.front().second;
                   });
        Candidates.resize(SameValuesCounter.size());
        transform(SameValuesCounter, Candidates.begin(),
                  [&](const auto &P) { return TrackedVals.at(P.first); });
        NumReducedVals = Candidates.size();
        // Have a reduction of the same element.
        if (NumReducedVals == 1) {
          Value *OrigV = TrackedToOrig.at(Candidates.front());
          unsigned Cnt = At(SameValuesCounter, OrigV);
          Value *RedVal =
              emitScaleForReusedOps(Candidates.front(), Builder, Cnt);
          VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
          VectorizedVals.try_emplace(OrigV, Cnt);
          ExternallyUsedValues.insert(OrigV);
          continue;
        }
      }
      unsigned MaxVecRegSize = V.getMaxVecRegSize();
      unsigned EltSize = V.getVectorElementSize(Candidates[0]);
      const unsigned MaxElts = std::clamp<unsigned>(
          /* ... */, RedValsMaxNumber, RegMaxNumber * RedValsMaxNumber);

      unsigned ReduxWidth = NumReducedVals;
      auto GetVectorFactor = [&, &TTI = *TTI](unsigned ReduxWidth) {
        unsigned NumParts, NumRegs;
        Type *ScalarTy = Candidates.front()->getType();
        // ... (compute NumParts/NumRegs for the widened type)
        while (NumParts > NumRegs) {
          assert(ReduxWidth > 0 && "ReduxWidth is unexpectedly 0.");
          ReduxWidth = bit_floor(ReduxWidth - 1);
          // ... (recompute NumParts/NumRegs)
        }
        if (NumParts > NumRegs / 2)
          ReduxWidth = bit_floor(ReduxWidth);
        return ReduxWidth;
      };
      // ...
      ReduxWidth = GetVectorFactor(ReduxWidth);
      ReduxWidth = std::min(ReduxWidth, MaxElts);

      unsigned Start = 0;
      unsigned Pos = Start;
      // Restarts vectorization attempt with lower vector factor.
      unsigned PrevReduxWidth = ReduxWidth;
      bool CheckForReusedReductionOpsLocal = false;
      auto AdjustReducedVals = [&](bool IgnoreVL = false) {
        bool IsAnyRedOpGathered = !IgnoreVL && V.isAnyGathered(IgnoreList);
        if (!CheckForReusedReductionOpsLocal &&
            PrevReduxWidth == ReduxWidth) {
          // Check if any of the reduction ops are gathered. If so, worth
          // trying again with less number of reduction ops.
          CheckForReusedReductionOpsLocal |= IsAnyRedOpGathered;
        }
        ++Pos;
        if (Pos < NumReducedVals - ReduxWidth + 1)
          return IsAnyRedOpGathered;
        Pos = Start;
        --ReduxWidth;
        if (ReduxWidth > 1)
          ReduxWidth = GetVectorFactor(ReduxWidth);
        return IsAnyRedOpGathered;
      };
      bool AnyVectorized = false;
      SmallDenseSet<std::pair<unsigned, unsigned>, 8> IgnoredCandidates;
      while (Pos < NumReducedVals - ReduxWidth + 1 &&
             ReduxWidth >= ReductionLimit) {
        // Dependency in tree of the reduction ops - drop this attempt, try
        // later.
        if (CheckForReusedReductionOpsLocal &&
            PrevReduxWidth != ReduxWidth && Start == 0) {
          CheckForReusedReductionOps = true;
          break;
        }
        PrevReduxWidth = ReduxWidth;
        ArrayRef<Value *> VL(std::next(Candidates.begin(), Pos), ReduxWidth);
        // Been analyzed already - skip.
        if (IgnoredCandidates.contains(std::make_pair(Pos, ReduxWidth)) ||
            (!has_single_bit(ReduxWidth) &&
             (IgnoredCandidates.contains(
                  std::make_pair(Pos, bit_floor(ReduxWidth))) ||
              IgnoredCandidates.contains(
                  std::make_pair(Pos + (ReduxWidth - bit_floor(ReduxWidth)),
                                 bit_floor(ReduxWidth))))) ||
            V.areAnalyzedReductionVals(VL)) {
          (void)AdjustReducedVals(/*IgnoreVL=*/true);
          continue;
        }
        // Early exit if any of the reduction values were deleted during
        // previous vectorization attempts.
        if (any_of(VL, [&V](Value *RedVal) {
              auto *RedValI = dyn_cast<Instruction>(RedVal);
              return RedValI && V.isDeleted(RedValI);
            }))
          break;
        V.buildTree(VL, IgnoreList);
        if (V.isTreeTinyAndNotFullyVectorizable(/*ForReduction=*/true)) {
          if (!AdjustReducedVals())
            V.analyzedReductionVals(VL);
          continue;
        }
        if (V.isLoadCombineReductionCandidate(RdxKind)) {
          if (!AdjustReducedVals())
            V.analyzedReductionVals(VL);
          continue;
        }
        V.reorderTopToBottom();
        // No need to reorder the root node at all.
        V.reorderBottomToTop(/*IgnoreReorder=*/
                             /* ... */ ||
                             VL.front()->getType()->isIntOrIntVectorTy() ||
                             ReductionLimit > 2);
        // Keep extracted other reduction values, if they are used in the
        // vectorization trees.
        BoUpSLP::ExtraValueToDebugLocsMap LocalExternallyUsedValues(
            ExternallyUsedValues);
        LocalExternallyUsedValues.insert(ReductionRoot);
        for (unsigned Cnt = 0, Sz = ReducedVals.size(); Cnt < Sz; ++Cnt) {
          if (Cnt == I || (ShuffledExtracts && Cnt == I - 1))
            continue;
          for (Value *V : ReducedVals[Cnt])
            if (isa<Instruction>(V))
              LocalExternallyUsedValues.insert(TrackedVals[V]);
        }
        if (!IsSupportedHorRdxIdentityOp) {
          // Number of uses of the candidates in the vector of values.
          assert(SameValuesCounter.empty() &&
                 "Reused values counter map is not empty");
          for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
            if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
              continue;
            Value *V = Candidates[Cnt];
            Value *OrigV = TrackedToOrig.at(V);
            ++SameValuesCounter.try_emplace(OrigV).first->second;
          }
        }
        V.transformNodes();
        // Gather externally used values.
        SmallPtrSet<Value *, 4> Visited;
        for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
          if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
            continue;
          Value *RdxVal = Candidates[Cnt];
          if (auto It = TrackedVals.find(RdxVal); It != TrackedVals.end())
            RdxVal = It->second;
          if (!Visited.insert(RdxVal).second)
            continue;
          // Check if the scalar was vectorized as part of the vectorization
          // tree but not the top node.
          if (!VLScalars.contains(RdxVal) && V.isVectorized(RdxVal)) {
            LocalExternallyUsedValues.insert(RdxVal);
            continue;
          }
          Value *OrigV = TrackedToOrig.at(RdxVal);
          unsigned NumOps =
              VectorizedVals.lookup(OrigV) + At(SameValuesCounter, OrigV);
          if (NumOps != ReducedValsToOps.at(OrigV).size())
            LocalExternallyUsedValues.insert(RdxVal);
        }
        // Do not need the list of reused scalars in regular mode anymore.
        if (!IsSupportedHorRdxIdentityOp)
          SameValuesCounter.clear();
        for (Value *RdxVal : VL)
          if (RequiredExtract.contains(RdxVal))
            LocalExternallyUsedValues.insert(RdxVal);
        V.buildExternalUses(LocalExternallyUsedValues);

        V.computeMinimumValueSizes();

        // Estimate cost.
        InstructionCost ReductionCost =
            getReductionCost(TTI, VL, IsCmpSelMinMax, RdxFMF, V, DT, DL, TLI);
        InstructionCost Cost = /* tree cost + */ ReductionCost;
        LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
                          << " for reduction\n");
        // ...
        if (Cost >= -SLPCostThreshold) {
          V.getORE()->emit([&]() {
            return OptimizationRemarkMissed(
                       SV_NAME, "HorSLPNotBeneficial",
                       ReducedValsToOps.at(VL[0]).front())
                   << "Vectorizing horizontal reduction is possible "
                   << "but not beneficial with cost " << ore::NV("Cost", Cost)
                   << " and threshold "
                   << ore::NV("Threshold", -SLPCostThreshold);
          });
          if (!AdjustReducedVals()) {
            V.analyzedReductionVals(VL);
            // Add subvectors of VL to the list of the analyzed values.
            if (ReduxWidth > ReductionLimit && V.isTreeNotExtendable()) {
              for (unsigned VF = getFloorFullVectorNumberOfElements(
                       *TTI, VL.front()->getType(), ReduxWidth - 1);
                   VF >= ReductionLimit;
                   VF = getFloorFullVectorNumberOfElements(
                       *TTI, VL.front()->getType(), VF - 1)) {
                if (has_single_bit(VF) &&
                    V.getCanonicalGraphSize() != V.getTreeSize())
                  continue;
                for (unsigned Idx : seq<unsigned>(ReduxWidth - VF))
                  IgnoredCandidates.insert(std::make_pair(Offset + Idx, VF));
              }
            }
          }
          continue;
        }
        LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:"
                          << Cost << ". (HorRdx)\n");
        V.getORE()->emit([&]() {
          return OptimizationRemark(SV_NAME, "VectorizedHorizontalReduction",
                                    ReducedValsToOps.at(VL[0]).front())
                 << "Vectorized horizontal reduction with cost "
                 << ore::NV("Cost", Cost) << " and with tree size "
                 << ore::NV("TreeSize", V.getTreeSize());
        });

        // ...
        // Emit a reduction. If the root is a select (min/max idiom), the
        // insert point is the compare condition of that select.
        Instruction *RdxRootInst = cast<Instruction>(ReductionRoot);
        Instruction *InsertPt = RdxRootInst;
        if (IsCmpSelMinMax)
          InsertPt = GetCmpForMinMaxReduction(RdxRootInst);

        // Vectorize a tree.
        Value *VectorizedRoot = V.vectorizeTree(
            LocalExternallyUsedValues, InsertPt, VectorValuesAndScales);
        // Update TrackedToOrig mapping, since the tracked values might be
        // updated.
        for (Value *RdxVal : Candidates) {
          Value *OrigVal = TrackedToOrig.at(RdxVal);
          Value *TransformedRdxVal = TrackedVals.at(OrigVal);
          if (TransformedRdxVal != RdxVal)
            TrackedToOrig.try_emplace(TransformedRdxVal, OrigVal);
        }

        Builder.SetInsertPoint(InsertPt);

        // To prevent poison from leaking across what used to be sequential,
        // safe, scalar boolean logic operations, the reduction operand must
        // be frozen.
        if (AnyBoolLogicOp && /* ... VectorizedRoot may be poison ... */)
          VectorizedRoot = Builder.CreateFreeze(VectorizedRoot);

        // Emit code to correctly handle reused reduced values, if required.
        if (OptReusedScalars && !SameScaleFactor) {
          VectorizedRoot = emitReusedOps(VectorizedRoot, Builder, V,
                                         SameValuesCounter, TrackedToOrig);
        }

        Type *ScalarTy = VL.front()->getType();
        // ... (record the vectorized value with its scale and signedness:
        //      OptReusedScalars && SameScaleFactor
        //          ? SameValuesCounter.front().second
        //          : 1,
        //      ... ? V.isSignedMinBitwidthRootNode() : ...)

        // Count vectorized reduced values to exclude them from the final
        // reduction.
        for (Value *RdxVal : VL) {
          Value *OrigV = TrackedToOrig.at(RdxVal);
          if (IsSupportedHorRdxIdentityOp) {
            VectorizedVals.try_emplace(OrigV, At(SameValuesCounter, OrigV));
            continue;
          }
          ++VectorizedVals.try_emplace(OrigV).first->getSecond();
          if (!V.isVectorized(RdxVal))
            RequiredExtract.insert(RdxVal);
        }
        Pos += ReduxWidth;
        Start = Pos;
        ReduxWidth = NumReducedVals - Pos;
        if (ReduxWidth > 1)
          ReduxWidth = GetVectorFactor(NumReducedVals - Pos);
        AnyVectorized = true;
      }
      if (OptReusedScalars && !AnyVectorized) {
        for (const std::pair<Value *, unsigned> &P : SameValuesCounter) {
          Value *RdxVal = TrackedVals.at(P.first);
          Value *RedVal = emitScaleForReusedOps(RdxVal, Builder, P.second);
          VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
          VectorizedVals.try_emplace(P.first, P.second);
        }
      }
    }
    if (!VectorValuesAndScales.empty())
      VectorizedTree = GetNewVectorizedTree(
          VectorizedTree,
          emitReduction(Builder, *TTI, ReductionRoot->getType()));
    if (!VectorizedTree) {
      if (!CheckForReusedReductionOps) {
        for (ReductionOpsType &RdxOps : ReductionOps)
          for (Value *RdxOp : RdxOps)
            V.analyzedReductionRoot(cast<Instruction>(RdxOp));
      }
      return nullptr;
    }

    // Reorder operands of bool logical op in the natural order to avoid
    // possible problem with poison propagation. If not possible to reorder
    // (both operands are originally RHS), emit an extra freeze instruction
    // for the LHS operand.
    auto FixBoolLogicalOps = [&, VectorizedTree](Value *&LHS, Value *&RHS,
                                                 Instruction *RedOp1,
                                                 Instruction *RedOp2,
                                                 bool InitStep) {
      if (!AnyBoolLogicOp)
        return;
      if (isBoolLogicOp(RedOp1) && ((!InitStep && LHS == VectorizedTree) ||
                                    getRdxOperand(RedOp1, 0) == LHS ||
                                    /* ... LHS is not poison ... */))
        return;
      if (isBoolLogicOp(RedOp2) && ((!InitStep && RHS == VectorizedTree) ||
                                    getRdxOperand(RedOp2, 0) == RHS ||
                                    /* ... RHS is not poison ... */)) {
        std::swap(LHS, RHS);
        return;
      }
      if (LHS != VectorizedTree)
        LHS = Builder.CreateFreeze(LHS);
    };
    // Finish the reduction: need to add not-vectorized reduction values and
    // extra arguments, avoiding dependencies between the scalar remainders.
    auto FinalGen = [&](ArrayRef<std::pair<Instruction *, Value *>> InstVals,
                        bool InitStep) {
      unsigned Sz = InstVals.size();
      SmallVector<std::pair<Instruction *, Value *>> ExtraReds(Sz / 2 +
                                                               Sz % 2);
      for (unsigned I = 0, E = (Sz / 2) * 2; I < E; I += 2) {
        // ...
        Value *RdxVal1 = InstVals[I].second;
        Value *StableRdxVal1 = RdxVal1;
        auto It1 = TrackedVals.find(RdxVal1);
        if (It1 != TrackedVals.end())
          StableRdxVal1 = It1->second;
        Value *RdxVal2 = InstVals[I + 1].second;
        Value *StableRdxVal2 = RdxVal2;
        auto It2 = TrackedVals.find(RdxVal2);
        if (It2 != TrackedVals.end())
          StableRdxVal2 = It2->second;
        // ...
        FixBoolLogicalOps(StableRdxVal1, StableRdxVal2, InstVals[I].first,
                          InstVals[I + 1].first, InitStep);
        Value *ExtraRed = createOp(Builder, RdxKind, StableRdxVal1,
                                   StableRdxVal2, "op.rdx", ReductionOps);
        ExtraReds[I / 2] = std::make_pair(InstVals[I].first, ExtraRed);
      }
      if (Sz % 2 == 1)
        ExtraReds[Sz / 2] = InstVals.back();
      return ExtraReds;
    };
    SmallVector<std::pair<Instruction *, Value *>> ExtraReductions;
    // ...
    SmallPtrSet<Value *, 8> Visited;
    for (Value *RdxVal : Candidates) {
      if (!Visited.insert(RdxVal).second)
        continue;
      unsigned NumOps = VectorizedVals.lookup(RdxVal);
      for (Instruction *RedOp :
           ArrayRef(ReducedValsToOps.at(RdxVal)).drop_back(NumOps))
        ExtraReductions.emplace_back(RedOp, RdxVal);
    }
    // Iterate through all not-vectorized reduction values/extra arguments.
    bool InitStep = true;
    while (ExtraReductions.size() > 1) {
      SmallVector<std::pair<Instruction *, Value *>> NewReds =
          FinalGen(ExtraReductions, InitStep);
      ExtraReductions.swap(NewReds);
      InitStep = false;
    }
    VectorizedTree = ExtraReductions.front().second;

    ReductionRoot->replaceAllUsesWith(VectorizedTree);

    // The original scalar reduction is expected to have no remaining uses
    // outside the reduction operations themselves.
    SmallPtrSet<Value *, 4> IgnoreSet;
    // ...
    for (auto *U : Ignore->users()) {
      assert(IgnoreSet.count(U) &&
             "All users must be either in the reduction ops list.");
    }
    if (!Ignore->use_empty()) {
      Value *P = PoisonValue::get(Ignore->getType());
      Ignore->replaceAllUsesWith(P);
    }
    // ...
    V.removeInstructionsAndOperands(RdxOps, VectorValuesAndScales);
    // ...
    return VectorizedTree;
  }
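  // Illustrative note (assumption, not from the original source): on success
  // tryToReduce() replaces the scalar chain with something like
  //
  //   %v   = load <4 x i32>, ptr %p
  //   %rdx = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %v)
  //   %res = add i32 %rdx, %leftover   ; "op.rdx" for a non-vectorized tail
  //
  // and the original root's uses are replaced with the final "op.rdx" value.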
  /// Creates the reduction from the given vector value \p Vec, scaled by
  /// \p Scale.
  Value *createSingleOp(IRBuilderBase &Builder,
                        const TargetTransformInfo &TTI, Value *Vec,
                        unsigned Scale, bool IsSigned, Type *DestTy) {
    Value *Rdx;
    // ... (for a vector that represents several lanes' worth of subtrees,
    //      reduce each lane group and combine:)
    //   Rdx = createOp(Builder, RdxKind, Rdx,
    //                  emitReduction(Lane, Builder, &TTI, DestTy), ...);
    // otherwise reduce the whole vector at once:
    Rdx = emitReduction(Vec, Builder, &TTI, DestTy);
    if (Rdx->getType() != DestTy)
      Rdx = Builder.CreateIntCast(Rdx, DestTy, IsSigned);
    // Improved analysis for add/fadd/xor reductions with the same scale
    // factor for bigger bases: emit the scale only once.
    if (Scale > 1)
      Rdx = emitScaleForReusedOps(Rdx, Builder, Scale);
    return Rdx;
  }
  /// Calculate the cost of a reduction.
  InstructionCost getReductionCost(TargetTransformInfo *TTI,
                                   ArrayRef<Value *> ReducedVals,
                                   bool IsCmpSelMinMax, FastMathFlags FMF,
                                   const BoUpSLP &R, DominatorTree &DT,
                                   const DataLayout &DL,
                                   const TargetLibraryInfo &TLI) {
    // ...
    Type *ScalarTy = ReducedVals.front()->getType();
    unsigned ReduxWidth = ReducedVals.size();
    FixedVectorType *VectorTy = R.getReductionType();
    InstructionCost VectorCost = 0, ScalarCost;
    // ...
    auto EvaluateScalarCost = [&](function_ref<InstructionCost()> GenCostFn) {
      InstructionCost Cost = 0;
      // Scalar cost is repeated for N-1 elements.
      int Cnt = ReducedVals.size();
      for (Value *RdxVal : ReducedVals) {
        if (Cnt == 1)
          break;
        --Cnt;
        // ... (values with too many uses are costed via GenCostFn:
        //      Cost += GenCostFn();)
        InstructionCost ScalarCost = 0;
        for (User *U : RdxVal->users()) {
          auto *RdxOp = cast<Instruction>(U);
          if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) {
            if (RdxKind == RecurKind::FAdd) {
              // An fadd of an fmul may fuse into fma, so the fmul is free:
              //   FMACost -= FMulCost;
              //   ScalarCost += FMACost;
              // ...
            }
            // ...
            continue;
          }
          ScalarCost = InstructionCost::getInvalid();
          break;
        }
        if (ScalarCost.isValid())
          Cost += ScalarCost;
        else
          Cost += GenCostFn();
      }
      return Cost;
    };
    // If the values are already combined into vectors by previous steps, no
    // final reduction op over scalars is required.
    bool DoesRequireReductionOp = !AllConsts && VectorValuesAndScales.empty();
    switch (RdxKind) {
    case RecurKind::Add:
    case RecurKind::Mul:
    case RecurKind::Or:
    case RecurKind::And:
    case RecurKind::Xor:
    case RecurKind::FAdd:
    case RecurKind::FMul: {
      // ...
      if (DoesRequireReductionOp) {
        if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
          unsigned ScalarTyNumElements = VecTy->getNumElements();
          // ... (sum of per-lane reduction costs over
          //      getWidenedType(ScalarTy, ReducedVals.size()))
        } else {
          // ... (a single arithmetic reduction cost)
        }
      } else {
        Type *RedTy = ScalarTy;
        auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
            std::make_pair(RedTy, true));
        if (RType == RedTy) {
          // ... (plain vector arithmetic op cost)
        } else {
          // ... (extended reduction cost:
          //      RdxOpcode, !IsSigned, RedTy, ...)
        }
        if (RdxKind == RecurKind::FAdd) {
          // Estimate fmuladd fusion for fadd reductions of fmuls.
          SmallVector<Value *> Ops;
          for (Value *RdxVal : ReducedVals) {
            // ...
            if (auto *FPCI = dyn_cast<FPMathOperator>(RdxVal))
              FMF &= FPCI->getFastMathFlags();
            // ...
          }
          if (!Ops.empty()) {
            // ...
            IntrinsicCostAttributes ICA(Intrinsic::fmuladd, RVecTy,
                                        {RVecTy, RVecTy, RVecTy}, FMF);
            // ...
            InstructionCost FMulCost = TTI->getArithmeticInstrCost(
                Instruction::FMul, RVecTy, CostKind);
            LLVM_DEBUG(dbgs() << "Minus vector FMul cost: " << FMulCost
                              << "\n");
            FMACost -= FMulCost;
            // ...
            if (FMACost.isValid())
              VectorCost += FMACost;
          }
        }
        // Account for the cast back to the reduced type, if needed.
        if (RType != RedTy) {
          unsigned Opcode = Instruction::Trunc;
          if (RedTy->getScalarSizeInBits() > RType->getScalarSizeInBits())
            Opcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
          // ... (add the cast cost)
        }
      }
      ScalarCost = EvaluateScalarCost([&]() {
        // ... (scalar arithmetic op cost)
      });
      break;
    }
    case RecurKind::FMax:
    case RecurKind::FMin:
    case RecurKind::FMaximum:
    case RecurKind::FMinimum:
    case RecurKind::SMax:
    case RecurKind::SMin:
    case RecurKind::UMax:
    case RecurKind::UMin: {
      // ...
      if (DoesRequireReductionOp) {
        // ... (min/max reduction cost)
      } else {
        auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
            std::make_pair(RedTy, true));
        // ...
        IntrinsicCostAttributes ICA(Id, RVecTy, {RVecTy, RVecTy}, FMF);
        // ...
        if (RType != RedTy) {
          unsigned Opcode = Instruction::Trunc;
          if (RedTy->getScalarSizeInBits() > RType->getScalarSizeInBits())
            Opcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
          // ... (add the cast cost)
        }
      }
      ScalarCost = EvaluateScalarCost([&]() {
        IntrinsicCostAttributes ICA(Id, ScalarTy, {ScalarTy, ScalarTy}, FMF);
        // ...
      });
      break;
    }
    default:
      // ...
    }

    LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost
                      << /* ... */ " (It is a splitting reduction)\n");
    return VectorCost - ScalarCost;
  }
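  // Illustrative note (assumption, not from the original source): the
  // returned value is the usual SLP delta cost. For a 4-wide integer add
  // reduction, ScalarCost models three scalar adds (N - 1 for N reduced
  // values) while VectorCost models one llvm.vector.reduce.add over
  // <4 x i32>; vectorization proceeds only if the delta clears the
  // -slp-threshold cutoff.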
  /// Emit a horizontal reduction of the vectorized values.
  Value *emitReduction(IRBuilderBase &Builder,
                       const TargetTransformInfo &TTI, Type *DestTy) {
    Value *ReducedSubTree = nullptr;
    auto CreateSingleOp = [&](Value *Vec, unsigned Scale, bool IsSigned) {
      Value *Rdx = createSingleOp(Builder, TTI, Vec, Scale, IsSigned, DestTy);
      if (ReducedSubTree)
        ReducedSubTree = createOp(Builder, RdxKind, ReducedSubTree, Rdx,
                                  "op.rdx", ReductionOps);
      else
        ReducedSubTree = Rdx;
    };
    if (VectorValuesAndScales.size() == 1) {
      const auto &[Vec, Scale, IsSigned] = VectorValuesAndScales.front();
      CreateSingleOp(Vec, Scale, IsSigned);
      return ReducedSubTree;
    }
    // Combine the vectors first with suitable vector ops, then reduce the
    // final vector once.
    Value *VecRes = nullptr;
    bool VecResSignedness = false;
    auto CreateVecOp = [&](Value *Vec, unsigned Cnt, bool IsSigned) {
      // ...
      switch (RdxKind) {
      case RecurKind::Add: {
        if (ScalarTy == Builder.getInt1Ty() && ScalarTy != DestTy) {
          // Bit-packed i1 adds are reduced via ctpop after the final
          // combine; replicate the vector here instead of multiplying.
          LLVM_DEBUG(dbgs() << "SLP: Add (to-extend) " << Cnt << "of " << Vec
                            << ". (HorRdx)\n");
          // ... (build the replication mask:)
          //   std::iota(std::next(Mask.begin(), VF * I),
          //             std::next(Mask.begin(), VF * (I + 1)), 0);
          ++NumVectorInstructions;
        } else if (Cnt > 1) {
          // V + V + ... + V -> V * Cnt
          LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << "of " << Vec
                            << ". (HorRdx)\n");
          ++NumVectorInstructions;
          // ...
        }
        break;
      }
      case RecurKind::Xor: {
        // V ^ V ^ ... ^ V: an even repeat count folds to 0, an odd one to V;
        // only the odd case reaches here, so nothing to emit.
        LLVM_DEBUG(dbgs() << "SLP: Xor " << Cnt << "of " << Vec
                          << ". (HorRdx)\n");
        break;
      }
      case RecurKind::FAdd: {
        if (Cnt > 1) {
          // V + V + ... + V -> V * Cnt
          LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << "of " << Vec
                            << ". (HorRdx)\n");
          ++NumVectorInstructions;
          // ...
        }
        break;
      }
      case RecurKind::And:
      case RecurKind::Or:
      case RecurKind::SMax:
      case RecurKind::SMin:
      case RecurKind::UMax:
      case RecurKind::UMin:
      case RecurKind::FMax:
      case RecurKind::FMin:
      case RecurKind::FMaximum:
      case RecurKind::FMinimum:
        // No changes required for these kinds.
        break;
      case RecurKind::Sub:
      case RecurKind::AddChainWithSubs:
      case RecurKind::Mul:
      case RecurKind::FMul:
      case RecurKind::FMulAdd:
      case RecurKind::AnyOf:
      case RecurKind::FindFirstIVSMin:
      case RecurKind::FindFirstIVUMin:
      case RecurKind::FindLastIVSMax:
      case RecurKind::FindLastIVUMax:
      case RecurKind::FMaxNum:
      case RecurKind::FMinNum:
      case RecurKind::FMaximumNum:
      case RecurKind::FMinimumNum:
      case RecurKind::None:
        llvm_unreachable("Unexpected reduction kind for repeated scalar.");
      }
      if (!VecRes) {
        VecRes = Vec;
        VecResSignedness = IsSigned;
        return;
      }
      ++NumVectorInstructions;
      if (ScalarTy == Builder.getInt1Ty() && ScalarTy != DestTy &&
          /* ... the operands were widened differently ... */) {
        // ... (bring both sides to a common element type first)
        std::iota(Mask.begin(), Mask.end(), 0);
        // ...
      }
      unsigned VecResVF = getNumElements(VecRes->getType());
      unsigned VecVF = getNumElements(Vec->getType());
      // Pad the narrower of the two vectors so both have the same width.
      if (VecResVF < VecVF) {
        // ...
      }
      if (VecResVF != VecVF) {
        SmallVector<int> ResizeMask(/* ... */, PoisonMaskElem);
        std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
        // ...
      }
      // ...
      if (VecResVF < VecVF) {
        // ... (operate on the common prefix only)
      }
      // ...
      Op = createOp(Builder, RdxKind, Op, Vec, "rdx.op", ReductionOps);
      if (VecResVF != VecVF) {
        // ... (insert the result back into the wider vector)
      }
      // ...
    };
    for (auto [Vec, Scale, IsSigned] : VectorValuesAndScales)
      CreateVecOp(Vec, Scale, IsSigned);
    CreateSingleOp(VecRes, /*Scale=*/1, /*IsSigned=*/false);
    return ReducedSubTree;
  }
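  // Illustrative note (assumption, not from the original source): given two
  // vectorized subtrees <4 x i32> %v0 and %v1 of an add reduction, the code
  // above prefers
  //
  //   %sum = add <4 x i32> %v0, %v1            ; "rdx.op"
  //   %r   = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %sum)
  //
  // over two separate reduce calls joined by a scalar add.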
  /// Emit a horizontal reduction of the vectorized value.
  Value *emitReduction(Value *VectorizedValue, IRBuilderBase &Builder,
                       const TargetTransformInfo *TTI, Type *DestTy) {
    assert(VectorizedValue && "Need to have a vectorized tree node");
    assert(RdxKind != RecurKind::FMulAdd &&
           "A call to the llvm.fmuladd intrinsic is not handled yet");

    auto *FTy = cast<FixedVectorType>(VectorizedValue->getType());
    if (FTy->getScalarType() == Builder.getInt1Ty() &&
        RdxKind == RecurKind::Add &&
        DestTy->getScalarType() != FTy->getScalarType()) {
      // Convert vector_reduce_add(ZExt(<n x i1>)) to
      // ZExtOrTrunc(ctpop(bitcast <n x i1> to in)).
      Value *V = Builder.CreateBitCast(
          VectorizedValue, Builder.getIntNTy(FTy->getNumElements()));
      ++NumVectorInstructions;
      return Builder.CreateUnaryIntrinsic(Intrinsic::ctpop, V);
    }
    ++NumVectorInstructions;
    return createSimpleReduction(Builder, VectorizedValue, RdxKind);
  }
  /// Emits optimized code for the unique scalar value reused \p Cnt times.
  Value *emitScaleForReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
                               unsigned Cnt) {
    assert(IsSupportedHorRdxIdentityOp &&
           "The optimization of matched scalar identity horizontal reductions "
           "must be supported.");
    if (Cnt == 1)
      return VectorizedValue;
    switch (RdxKind) {
    case RecurKind::Add: {
      // res = mul vv, n
      Value *Scale = ConstantInt::get(VectorizedValue->getType(), Cnt);
      LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << "of "
                        << VectorizedValue << ". (HorRdx)\n");
      return Builder.CreateMul(VectorizedValue, Scale);
    }
    case RecurKind::Xor: {
      // res = n % 2 ? 0 : vv
      LLVM_DEBUG(dbgs() << "SLP: Xor " << Cnt << "of " << VectorizedValue
                        << ". (HorRdx)\n");
      if (Cnt % 2 == 0)
        return Constant::getNullValue(VectorizedValue->getType());
      return VectorizedValue;
    }
    case RecurKind::FAdd: {
      // res = fmul v, n
      Value *Scale = ConstantFP::get(VectorizedValue->getType(), Cnt);
      LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << "of "
                        << VectorizedValue << ". (HorRdx)\n");
      return Builder.CreateFMul(VectorizedValue, Scale);
    }
    case RecurKind::And:
    case RecurKind::Or:
    case RecurKind::SMax:
    case RecurKind::SMin:
    case RecurKind::UMax:
    case RecurKind::UMin:
    case RecurKind::FMax:
    case RecurKind::FMin:
    case RecurKind::FMaximum:
    case RecurKind::FMinimum:
      // res = vv
      return VectorizedValue;
    case RecurKind::Sub:
    case RecurKind::AddChainWithSubs:
    case RecurKind::Mul:
    case RecurKind::FMul:
    case RecurKind::FMulAdd:
    case RecurKind::AnyOf:
    case RecurKind::FindFirstIVSMin:
    case RecurKind::FindFirstIVUMin:
    case RecurKind::FindLastIVSMax:
    case RecurKind::FindLastIVUMax:
    case RecurKind::FMaxNum:
    case RecurKind::FMinNum:
    case RecurKind::FMaximumNum:
    case RecurKind::FMinimumNum:
    case RecurKind::None:
      llvm_unreachable("Unexpected reduction kind for repeated scalar.");
    }
    return nullptr;
  }
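  // Illustrative note (not from the original source): for x + x + x the
  // reduction of the deduplicated value is scaled as 'mul x, 3'; for
  // x ^ x ^ x ^ x the even repeat count folds to 0, so no scaling
  // instruction is emitted at all.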
  /// Emits the actual operation for the scalar identity values found during
  /// horizontal reduction analysis.
  Value *
  emitReusedOps(Value *VectorizedValue, IRBuilderBase &Builder, BoUpSLP &R,
                const SmallMapVector<Value *, unsigned, 16> &SameValuesCounter,
                const DenseMap<Value *, Value *> &TrackedToOrig) {
    assert(IsSupportedHorRdxIdentityOp &&
           "The optimization of matched scalar identity horizontal reductions "
           "must be supported.");
    ArrayRef<Value *> VL = R.getRootNodeScalars();
    auto *VTy = cast<FixedVectorType>(VectorizedValue->getType());
    if (VTy->getElementType() != VL.front()->getType()) {
      VectorizedValue = Builder.CreateIntCast(
          VectorizedValue,
          getWidenedType(VL.front()->getType(), VTy->getNumElements()),
          R.isSignedMinBitwidthRootNode());
    }
    switch (RdxKind) {
    case RecurKind::Add: {
      // root = mul prev_root, <1, 1, n, 1>
      SmallVector<Constant *> Vals;
      for (Value *V : VL) {
        unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
        Vals.push_back(ConstantInt::get(V->getType(), Cnt,
                                        /*IsSigned=*/false));
      }
      auto *Scale = ConstantVector::get(Vals);
      LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Scale << "of "
                        << VectorizedValue << ". (HorRdx)\n");
      return Builder.CreateMul(VectorizedValue, Scale);
    }
    case RecurKind::And:
    case RecurKind::Or:
      // No need for multiple and/or(s) of the same value.
      LLVM_DEBUG(dbgs() << "SLP: And/or of same " << VectorizedValue
                        << ". (HorRdx)\n");
      return VectorizedValue;
    case RecurKind::SMax:
    case RecurKind::SMin:
    case RecurKind::UMax:
    case RecurKind::UMin:
    case RecurKind::FMax:
    case RecurKind::FMin:
    case RecurKind::FMaximum:
    case RecurKind::FMinimum:
      // No need for multiple min/max(s) of the same value.
      LLVM_DEBUG(dbgs() << "SLP: Max/min of same " << VectorizedValue
                        << ". (HorRdx)\n");
      return VectorizedValue;
    case RecurKind::Xor: {
      // Replace values with even number of repeats with 0, since
      // x xor x == 0.
      // root = shuffle prev_root, zeroinitializer,
      //        <0, 1, 2, vf, 4, vf, 5, 7>
      SmallVector<int> Mask(
          cast<FixedVectorType>(VectorizedValue->getType())->getNumElements(),
          PoisonMaskElem);
      std::iota(Mask.begin(), Mask.end(), 0);
      bool NeedShuffle = false;
      for (unsigned I = 0, VF = VL.size(); I < VF; ++I) {
        Value *V = VL[I];
        unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
        if (Cnt % 2 == 0) {
          Mask[I] = VF;
          NeedShuffle = true;
        }
      }
      LLVM_DEBUG(dbgs() << "SLP: Xor <"; for (int I : Mask) dbgs() << I << " ";
                 dbgs() << "> of " << VectorizedValue << ". (HorRdx)\n");
      if (NeedShuffle)
        VectorizedValue = Builder.CreateShuffleVector(
            VectorizedValue,
            ConstantVector::getNullValue(VectorizedValue->getType()), Mask);
      return VectorizedValue;
    }
    case RecurKind::FAdd: {
      // root = fmul prev_root, <1.0, 1.0, n.0, 1.0>
      SmallVector<Constant *> Vals;
      for (Value *V : VL) {
        unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
        Vals.push_back(ConstantFP::get(V->getType(), Cnt));
      }
      auto *Scale = ConstantVector::get(Vals);
      return Builder.CreateFMul(VectorizedValue, Scale);
    }
    case RecurKind::Sub:
    case RecurKind::AddChainWithSubs:
    case RecurKind::Mul:
    case RecurKind::FMul:
    case RecurKind::FMulAdd:
    case RecurKind::AnyOf:
    case RecurKind::FindFirstIVSMin:
    case RecurKind::FindFirstIVUMin:
    case RecurKind::FindLastIVSMax:
    case RecurKind::FindLastIVUMax:
    case RecurKind::FMaxNum:
    case RecurKind::FMinNum:
    case RecurKind::FMaximumNum:
    case RecurKind::FMinimumNum:
    case RecurKind::None:
      llvm_unreachable("Unexpected reduction kind for reused scalars.");
    }
    return nullptr;
  }
};

/// Gets recurrence kind of the reduction value.
static RecurKind getRdxKind(Value *V) {
  return HorizontalReduction::getRdxKind(V);
}
static std::optional<unsigned> getAggregateSize(Instruction *InsertInst) {
  unsigned AggregateSize = 1;
  auto *IV = cast<InsertValueInst>(InsertInst);
  Type *CurrentType = IV->getType();
  do {
    if (auto *ST = dyn_cast<StructType>(CurrentType)) {
      // Only homogeneous structs are supported.
      for (auto *Elt : ST->elements())
        if (Elt != ST->getElementType(0))
          return std::nullopt;
      AggregateSize *= ST->getNumElements();
      CurrentType = ST->getElementType(0);
    } else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) {
      AggregateSize *= AT->getNumElements();
      CurrentType = AT->getElementType();
    } else if (auto *VT = dyn_cast<FixedVectorType>(CurrentType)) {
      AggregateSize *= VT->getNumElements();
      return AggregateSize;
    } else if (CurrentType->isSingleValueType()) {
      return AggregateSize;
    } else {
      return std::nullopt;
    }
  } while (true);
}

static void findBuildAggregate_rec(Instruction *LastInsertInst,
                                   TargetTransformInfo *TTI,
                                   SmallVectorImpl<Value *> &BuildVectorOpds,
                                   SmallVectorImpl<Value *> &InsertElts,
                                   unsigned OperandOffset, const BoUpSLP &R) {
  do {
    Value *InsertedOperand = LastInsertInst->getOperand(1);
    std::optional<unsigned> OperandIndex =
        getElementIndex(LastInsertInst, OperandOffset);
    if (!OperandIndex || R.isDeleted(LastInsertInst))
      return;
    if (isa<InsertElementInst, InsertValueInst>(InsertedOperand)) {
      findBuildAggregate_rec(cast<Instruction>(InsertedOperand), TTI,
                             BuildVectorOpds, InsertElts, *OperandIndex, R);
    } else {
      BuildVectorOpds[*OperandIndex] = InsertedOperand;
      InsertElts[*OperandIndex] = LastInsertInst;
    }
    LastInsertInst = dyn_cast<Instruction>(LastInsertInst->getOperand(0));
  } while (LastInsertInst != nullptr &&
           isa<InsertValueInst, InsertElementInst>(LastInsertInst) &&
           LastInsertInst->hasOneUse());
}

static bool findBuildAggregate(Instruction *LastInsertInst,
                               TargetTransformInfo *TTI,
                               SmallVectorImpl<Value *> &BuildVectorOpds,
                               SmallVectorImpl<Value *> &InsertElts,
                               const BoUpSLP &R) {
  assert((isa<InsertElementInst>(LastInsertInst) ||
          isa<InsertValueInst>(LastInsertInst)) &&
         "Expected insertelement or insertvalue instruction!");
  assert((BuildVectorOpds.empty() && InsertElts.empty()) &&
         "Expected empty result vectors!");
  std::optional<unsigned> AggregateSize = getAggregateSize(LastInsertInst);
  if (!AggregateSize)
    return false;
  BuildVectorOpds.resize(*AggregateSize);
  InsertElts.resize(*AggregateSize);
  findBuildAggregate_rec(LastInsertInst, TTI, BuildVectorOpds, InsertElts, 0,
                         R);
  // ... (drop the unfilled slots)
  if (BuildVectorOpds.size() >= 2)
    return true;
  return false;
}
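// Illustrative example (assumed IR, not from the original source): a chain
// recognized by findBuildAggregate(), yielding BuildVectorOpds =
// {%s0, %s1, %s2, %s3}:
//
//   %i0 = insertelement <4 x float> poison, float %s0, i32 0
//   %i1 = insertelement <4 x float> %i0, float %s1, i32 1
//   %i2 = insertelement <4 x float> %i1, float %s2, i32 2
//   %i3 = insertelement <4 x float> %i2, float %s3, i32 3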
/// Try and get a reduction instruction from a phi node.
static Instruction *getReductionInstr(const DominatorTree *DT, PHINode *P,
                                      BasicBlock *ParentBB, LoopInfo *LI) {
  // There are situations where the reduction value is not dominated by the
  // reduction phi. Vectorizing such cases has been reported to cause
  // miscompiles.
  auto DominatedReduxValue = [&](Value *R) {
    return isa<Instruction>(R) &&
           DT->dominates(P->getParent(), cast<Instruction>(R)->getParent());
  };

  Instruction *Rdx = nullptr;

  // Return the incoming value if it comes from the same BB as the phi node.
  if (P->getIncomingBlock(0) == ParentBB) {
    Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
  } else if (P->getIncomingBlock(1) == ParentBB) {
    Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
  }

  if (Rdx && DominatedReduxValue(Rdx))
    return Rdx;

  // Otherwise, check whether we have a loop latch to look at.
  // ...

  // There is a loop latch, return the incoming value if it comes from that.
  // This reduction pattern occasionally turns up.
  if (P->getIncomingBlock(0) == BBLatch) {
    Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
  } else if (P->getIncomingBlock(1) == BBLatch) {
    Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
  }

  if (Rdx && DominatedReduxValue(Rdx))
    return Rdx;

  return nullptr;
}

// ... (reduction-root matching helpers; only partially recoverable here:)
//   assert(... &&
//          "Expected binop, select, or intrinsic for reduction matching");
//   LHS = Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root));
//   RHS = Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root)
//                          + 1);
//   Value *Op0 = nullptr;
//   Value *Op1 = nullptr;
//   ...
//   Value *B0 = nullptr, *B1 = nullptr;
// ...
bool SLPVectorizerPass::vectorizeHorReduction(
    PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
    SmallVectorImpl<WeakTrackingVH> &PostponedInsts) {
  // ...
  auto SelectRoot = [&]() {
    // ... (for min/max reductions rooted at a select, start from the cmp)
    return Root;
  };

  // Start analysis starting from Root instruction. If horizontal reduction
  // is found, try to vectorize it. If it is not a horizontal reduction or
  // vectorization is not possible or not effective, and the currently
  // analyzed instruction is a binary operation, try to vectorize the
  // operands, using pre-order DFS traversal order.
  std::queue<std::pair<Instruction *, unsigned>> Stack;
  Stack.emplace(SelectRoot(), 0);
  SmallPtrSet<Value *, 8> VisitedInstrs;
  bool Res = false;
  auto TryToReduce = [this, &R](Instruction *Inst) -> Value * {
    if (R.isAnalyzedReductionRoot(Inst))
      return nullptr;
    // ...
    HorizontalReduction HorRdx;
    if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI))
      return nullptr;
    return HorRdx.tryToReduce(R, *DL, TTI, *TLI, AC, *DT);
  };
  auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) {
    if (TryOperandsAsNewSeeds && FutureSeed == Root) {
      // ...
    }
    // ...
  };

  while (!Stack.empty()) {
    Instruction *Inst;
    unsigned Level;
    std::tie(Inst, Level) = Stack.front();
    Stack.pop();
    // ...
    if (R.isDeleted(Inst))
      continue;
    if (Value *VectorizedV = TryToReduce(Inst)) {
      Res = true;
      if (auto *I = dyn_cast<Instruction>(VectorizedV)) {
        // Try to find another reduction.
        Stack.emplace(I, Level);
        continue;
      }
      if (R.isDeleted(Inst))
        continue;
    } else {
      // We could not vectorize `Inst`, so try to use it as a future seed.
      if (!TryAppendToPostponedInsts(Inst)) {
        // ...
        continue;
      }
    }

    // Try to vectorize the operands.
    for (Value *Op : Inst->operands())
      if (VisitedInstrs.insert(Op).second)
        if (auto *I = dyn_cast<Instruction>(Op);
            I && !isa<PHINode>(I) && !R.isDeleted(I) && I->getParent() == BB)
          Stack.emplace(I, Level);
  }
  return Res;
}
bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
  // ...
  // Skip fadd/fsub that may be part of a (potential) FMA chain.
  if ((I->getOpcode() == Instruction::FAdd ||
       I->getOpcode() == Instruction::FSub) &&
      /* ... canConvertToFMA(...) ... */)
    return false;

  // Vectorize in the current basic block only.
  // ...
  if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P ||
      R.isDeleted(Op0) || R.isDeleted(Op1))
    return false;

  // First collect all possible candidate pairs.
  SmallVector<std::pair<Value *, Value *>, 4> Candidates;
  Candidates.emplace_back(Op0, Op1);

  auto *A = dyn_cast<BinaryOperator>(Op0);
  auto *B = dyn_cast<BinaryOperator>(Op1);
  // Try to skip B.
  if (A && B && B->hasOneUse()) {
    auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
    auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
    if (B0 && B0->getParent() == P && !R.isDeleted(B0))
      Candidates.emplace_back(A, B0);
    if (B1 && B1->getParent() == P && !R.isDeleted(B1))
      Candidates.emplace_back(A, B1);
  }
  // Try to skip A.
  if (B && A && A->hasOneUse()) {
    auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
    auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
    if (A0 && A0->getParent() == P && !R.isDeleted(A0))
      Candidates.emplace_back(A0, B);
    if (A1 && A1->getParent() == P && !R.isDeleted(A1))
      Candidates.emplace_back(A1, B);
  }

  auto TryToReduce = [this, &R, &TTI = *TTI](Instruction *Inst,
                                             ArrayRef<Value *> Ops) {
    // ...
    Type *Ty = Inst->getType();
    // ...
    HorizontalReduction HorRdx(Inst, Ops);
    if (!HorRdx.matchReductionForOperands())
      return false;
    // Estimate the cost of the two-element reduction versus the scalar op.
    InstructionCost ScalarCost =
        TTI.getScalarizationOverhead(/* ... */) +
        TTI.getInstructionCost(Inst, CostKind);
    InstructionCost RedCost;
    FastMathFlags FMF;
    if (auto *FPCI = dyn_cast<FPMathOperator>(Inst))
      FMF = FPCI->getFastMathFlags();
    RedCost = TTI.getArithmeticReductionCost(Inst->getOpcode(), VecTy, FMF,
                                             CostKind);
    if (RedCost >= ScalarCost)
      return false;

    return HorRdx.tryToReduce(R, *DL, &TTI, *TLI, AC, *DT) != nullptr;
  };
  if (Candidates.size() == 1)
    return TryToReduce(I, {Op0, Op1}) || tryToVectorizeList({Op0, Op1}, R);

  // We have multiple options. Try to pick the single best.
  std::optional<int> BestCandidate = R.findBestRootPair(Candidates);
  if (!BestCandidate)
    return false;
  return (*BestCandidate == 0 &&
          TryToReduce(I, {Candidates[*BestCandidate].first,
                          Candidates[*BestCandidate].second})) ||
         tryToVectorizeList({Candidates[*BestCandidate].first,
                             Candidates[*BestCandidate].second},
                            R);
}
P, Instruction *Root,
25848 BasicBlock *BB,
BoUpSLP &R) {
25850 bool Res = vectorizeHorReduction(
P, Root, BB, R, PostponedInsts);
25851 Res |= tryToVectorize(PostponedInsts, R);
25858 for (
Value *V : Insts)
25860 Res |= tryToVectorize(Inst, R);
bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
                                                 BasicBlock *BB, BoUpSLP &R,
                                                 bool MaxVFOnly) {
  if (!R.canMapToVector(IVI->getType()))
    return false;

  SmallVector<Value *, 16> BuildVectorOpds;
  SmallVector<Value *, 16> BuildVectorInsts;
  if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, BuildVectorInsts, R))
    return false;
  if (MaxVFOnly && BuildVectorOpds.size() == 2) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "NotPossible", IVI)
             << "Cannot SLP vectorize list: only 2 elements of buildvalue, "
                "trying reduction first.";
    });
    return false;
  }
  LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
  // Aggregate value is unlikely to be processed in vector register.
  return tryToVectorizeList(BuildVectorOpds, R, MaxVFOnly);
}

bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
                                                   BasicBlock *BB, BoUpSLP &R,
                                                   bool MaxVFOnly) {
  SmallVector<Value *, 16> BuildVectorInsts;
  SmallVector<Value *, 16> BuildVectorOpds;
  SmallVector<int> Mask;
  if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts, R)
      /* ... || the operands already form an extract-based shuffle ... */)
    return false;
  if (MaxVFOnly && BuildVectorInsts.size() == 2) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "NotPossible", IEI)
             << "Cannot SLP vectorize list: only 2 elements of buildvector, "
                "trying reduction first.";
    });
    return false;
  }
  LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n");
  return tryToVectorizeList(BuildVectorInsts, R, MaxVFOnly);
}
template <typename T>
static bool tryToVectorizeSequence(
    SmallVectorImpl<T *> &Incoming, function_ref<bool(T *, T *)> Comparator,
    function_ref<bool(ArrayRef<T *>, T *)> AreCompatible,
    function_ref<bool(ArrayRef<T *>, bool)> TryToVectorizeHelper,
    bool MaxVFOnly, BoUpSLP &R) {
  bool Changed = false;
  // Sort by type, parent, operands.
  stable_sort(Incoming, Comparator);

  // Try to vectorize elements based on their type.
  SmallVector<T *> Candidates;
  SmallVector<T *> VL;
  for (auto *IncIt = Incoming.begin(), *E = Incoming.end(); IncIt != E;
       VL.clear()) {
    // Look for the next elements with the same type, parent and operand
    // kinds.
    auto *I = dyn_cast<Instruction>(*IncIt);
    if (!I || R.isDeleted(I)) {
      ++IncIt;
      continue;
    }
    auto *SameTypeIt = IncIt;
    while (SameTypeIt != E && (!isa<Instruction>(*SameTypeIt) ||
                               R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
                               AreCompatible(VL, *SameTypeIt))) {
      auto *I = dyn_cast<Instruction>(*SameTypeIt);
      ++SameTypeIt;
      if (I && !R.isDeleted(I))
        VL.push_back(cast<T>(I));
    }

    // Try to vectorize them.
    unsigned NumElts = VL.size();
    LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at nodes ("
                      << NumElts << ")\n");
    // Attempts: (1) bundles of same/alternate opcodes at the maximal register
    // size; (2) leftover same-typed instructions; (3) a final small-vector
    // pass over the remaining candidates.
    if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL), MaxVFOnly)) {
      // Success: start over because instructions might have been changed.
      Changed = true;
      VL.swap(Candidates);
      Candidates.clear();
      for (T *V : VL)
        if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
          Candidates.push_back(V);
    } else {
      /// \returns the minimum number of elements that we will attempt to
      /// vectorize.
      auto GetMinNumElements = [&R](Value *V) {
        unsigned EltSize = R.getVectorElementSize(V);
        return std::max(2U, R.getMaxVecRegSize() / EltSize);
      };
      if (NumElts < GetMinNumElements(*IncIt) &&
          (Candidates.empty() ||
           Candidates.front()->getType() == (*IncIt)->getType())) {
        for (T *V : VL)
          if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
            Candidates.push_back(V);
      }
    }
    // Final attempt to vectorize instructions with the same types.
    if (Candidates.size() > 1 &&
        (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) {
      if (TryToVectorizeHelper(Candidates, /*MaxVFOnly=*/false)) {
        // Success: start over because instructions might have been changed.
        Changed = true;
      } else if (MaxVFOnly) {
        // Try to vectorize using small vectors.
        SmallVector<T *> VL;
        for (auto *It = Candidates.begin(), *End = Candidates.end(); It != End;
             VL.clear()) {
          auto *I = dyn_cast<Instruction>(*It);
          if (!I || R.isDeleted(I)) {
            ++It;
            continue;
          }
          auto *SameTypeIt = It;
          while (SameTypeIt != End &&
                 (!isa<Instruction>(*SameTypeIt) ||
                  R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
                  AreCompatible(*SameTypeIt, *It))) {
            auto *I = dyn_cast<Instruction>(*SameTypeIt);
            ++SameTypeIt;
            if (I && !R.isDeleted(I))
              VL.push_back(cast<T>(I));
          }
          unsigned NumElts = VL.size();
          if (NumElts > 1 &&
              TryToVectorizeHelper(ArrayRef(VL), /*MaxVFOnly=*/false))
            Changed = true;
          It = SameTypeIt;
        }
      }
      Candidates.clear();
    }

    // Start over at the next instruction of a different type (or the end).
    IncIt = SameTypeIt;
  }
  return Changed;
}
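// Illustrative note (assumption, not from the original source): the same
// helper drives PHIs, compares and stores, e.g.
//
//   tryToVectorizeSequence<StoreInst>(ReversedStores, StoreSorter,
//                                     AreCompatibleStores, TryToVectorize,
//                                     /*MaxVFOnly=*/false, R);
//
// Comparator must be a strict weak order consistent with AreCompatible, so
// that compatible elements end up adjacent after the stable_sort.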
/// Compare two cmp instructions. If IsCompatibility is true, function returns
/// true if 2 cmps are compatible (have equal predicate, same first/second
/// operand kinds), otherwise it works as a strict weak ordering.
template <bool IsCompatibility>
static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI,
                       const DominatorTree &DT) {
  assert(isValidElementType(V->getType()) &&
         isValidElementType(V2->getType()) &&
         "Expected valid element types only.");
  if (V == V2)
    return IsCompatibility;
  auto *CI1 = cast<CmpInst>(V);
  auto *CI2 = cast<CmpInst>(V2);
  if (CI1->getOperand(0)->getType()->getTypeID() <
      CI2->getOperand(0)->getType()->getTypeID())
    return !IsCompatibility;
  if (CI1->getOperand(0)->getType()->getTypeID() >
      CI2->getOperand(0)->getType()->getTypeID())
    return false;
  if (CI1->getOperand(0)->getType()->getScalarSizeInBits() <
      CI2->getOperand(0)->getType()->getScalarSizeInBits())
    return !IsCompatibility;
  if (CI1->getOperand(0)->getType()->getScalarSizeInBits() >
      CI2->getOperand(0)->getType()->getScalarSizeInBits())
    return false;
  // Compare swapped-to-canonical base predicates.
  // ...
  if (BasePred1 < BasePred2)
    return !IsCompatibility;
  if (BasePred1 > BasePred2)
    return false;
  // Compare operands.
  bool CI1Preds = Pred1 == BasePred1;
  bool CI2Preds = Pred2 == BasePred1;
  for (int I = 0, E = CI1->getNumOperands(); I < E; ++I) {
    auto *Op1 = CI1->getOperand(CI1Preds ? I : E - I - 1);
    // ... (Op2 is chosen symmetrically; equal operands are skipped, then
    //      ValueIDs are compared)
    if (/* ... */)
      return !IsCompatibility;
    if (auto *I1 = dyn_cast<Instruction>(Op1))
      if (auto *I2 = dyn_cast<Instruction>(Op2)) {
        if (IsCompatibility) {
          if (I1->getParent() != I2->getParent())
            return false;
        } else {
          // Try to compare nodes with same parent.
          DomTreeNodeBase<BasicBlock> *NodeI1 = DT.getNode(I1->getParent());
          DomTreeNodeBase<BasicBlock> *NodeI2 = DT.getNode(I2->getParent());
          if (!NodeI1)
            return NodeI2 != nullptr;
          if (!NodeI2)
            return false;
          assert((NodeI1 == NodeI2) ==
                     (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
                 "Different nodes should have different DFS numbers");
          if (NodeI1 != NodeI2)
            return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
        }
        InstructionsState S = getSameOpcode({I1, I2}, TLI);
        if (S && (IsCompatibility || !S.isAltShuffle()))
          continue;
        if (IsCompatibility)
          return false;
        if (I1->getOpcode() != I2->getOpcode())
          return I1->getOpcode() < I2->getOpcode();
      }
  }
  return IsCompatibility;
}
template <typename ItT>
bool SLPVectorizerPass::vectorizeCmpInsts(iterator_range<ItT> CmpInsts,
                                          BasicBlock *BB, BoUpSLP &R) {
  bool Changed = false;
  // Try to find reductions first.
  for (CmpInst *I : CmpInsts) {
    if (R.isDeleted(I))
      continue;
    for (Value *Op : I->operands())
      if (auto *RootOp = dyn_cast<Instruction>(Op)) {
        Changed |= vectorizeRootInstruction(nullptr, RootOp, BB, R);
        if (R.isDeleted(I))
          break;
      }
  }
  // Try to vectorize operands as vector bundles.
  for (CmpInst *I : CmpInsts) {
    if (R.isDeleted(I))
      continue;
    Changed |= tryToVectorize(I, R);
  }
  // Try to vectorize the list of compares.
  // ...
  SmallVector<Value *> Vals;
  for (Instruction *V : CmpInsts)
    if (!R.isDeleted(V) && isValidElementType(getValueType(V)))
      Vals.push_back(V);
  if (Vals.size() <= 1)
    return Changed;
  Changed |= tryToVectorizeSequence<Value>(
      Vals, CompareSorter, AreCompatibleCompares,
      [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
        // Exclude possible reductions from other blocks.
        bool ArePossiblyReducedInOtherBlock = any_of(Candidates, [](Value *V) {
          return any_of(V->users(), [V](User *U) {
            auto *Select = dyn_cast<SelectInst>(U);
            return Select &&
                   Select->getParent() != cast<Instruction>(V)->getParent();
          });
        });
        if (ArePossiblyReducedInOtherBlock)
          return false;
        return tryToVectorizeList(Candidates, R, MaxVFOnly);
      },
      /*MaxVFOnly=*/true, R);
  return Changed;
}
bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions,
                                         BasicBlock *BB, BoUpSLP &R) {
  assert(all_of(Instructions, IsaPred<InsertElementInst, InsertValueInst>) &&
         "This function only accepts Insert instructions");
  bool OpsChanged = false;
  SmallVector<WeakTrackingVH> PostponedInsts;
  for (auto *I : reverse(Instructions)) {
    // pass1 - try to match and vectorize a buildvector sequence for MaxVF
    // only.
    if (R.isDeleted(I))
      continue;
    if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
      OpsChanged |=
          vectorizeInsertValueInst(LastInsertValue, BB, R, /*MaxVFOnly=*/true);
    } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
      OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R,
                                               /*MaxVFOnly=*/true);
    }
    // pass2 - try to vectorize reductions only.
    if (R.isDeleted(I))
      continue;
    OpsChanged |= vectorizeHorReduction(nullptr, I, BB, R, PostponedInsts);
    if (R.isDeleted(I))
      continue;
    // pass3 - try to match and vectorize a buildvector sequence.
    if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
      OpsChanged |= vectorizeInsertValueInst(LastInsertValue, BB, R,
                                             /*MaxVFOnly=*/false);
    } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
      OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R,
                                               /*MaxVFOnly=*/false);
    }
  }
  // Now try to vectorize the postponed instructions.
  OpsChanged |= tryToVectorize(PostponedInsts, R);

  Instructions.clear();
  return OpsChanged;
}
bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
  bool Changed = false;
  SmallVector<Value *, 4> Incoming;
  SmallPtrSet<Value *, 16> VisitedInstrs;
  // Maps phi nodes to the non-phi nodes found in the use tree for each phi
  // node.
  DenseMap<Value *, SmallVector<Value *, 4>> PHIToOpcodes;
  auto PHICompare = [this, &PHIToOpcodes](Value *V1, Value *V2) {
    assert(isValidElementType(V1->getType()) &&
           isValidElementType(V2->getType()) &&
           "Expected vectorizable types only.");
    // Compare type IDs and bit widths first.
    // ...
    if (V1->getType()->getScalarSizeInBits() <
        V2->getType()->getScalarSizeInBits())
      return true;
    if (V1->getType()->getScalarSizeInBits() >
        V2->getType()->getScalarSizeInBits())
      return false;
    ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
    ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
    if (Opcodes1.size() < Opcodes2.size())
      return true;
    if (Opcodes1.size() > Opcodes2.size())
      return false;
    for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
      {
        // Instructions come first.
        auto *I1 = dyn_cast<Instruction>(Opcodes1[I]);
        auto *I2 = dyn_cast<Instruction>(Opcodes2[I]);
        if (I1 && I2) {
          DomTreeNodeBase<BasicBlock> *NodeI1 = DT->getNode(I1->getParent());
          DomTreeNodeBase<BasicBlock> *NodeI2 = DT->getNode(I2->getParent());
          if (!NodeI1)
            return NodeI2 != nullptr;
          if (!NodeI2)
            return false;
          assert((NodeI1 == NodeI2) ==
                     (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
                 "Different nodes should have different DFS numbers");
          if (NodeI1 != NodeI2)
            return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
          InstructionsState S = getSameOpcode({I1, I2}, *TLI);
          if (S && !S.isAltShuffle() &&
              I1->getOpcode() == I2->getOpcode())
            continue;
          return I1->getOpcode() < I2->getOpcode();
        }
        if (I1)
          return true;
        if (I2)
          return false;
      }
      {
        // Non-undef constants come next.
        // ... (intrinsic IDs break ties for calls:)
        //   return *Id1 < *Id2;
        // ... (then plain opcodes:)
        //   if (I1->getOpcode() == I2->getOpcode())
        //     continue;
        //   return I1->getOpcode() < I2->getOpcode();
      }
      auto ValID1 = Opcodes1[I]->getValueID();
      auto ValID2 = Opcodes2[I]->getValueID();
      if (ValID1 == ValID2)
        continue;
      if (ValID1 < ValID2)
        return true;
      if (ValID1 > ValID2)
        return false;
      // Undefs come last.
      assert(U1 && U2 && "The only thing left should be undef & undef.");
    }
    return false;
  };
  auto AreCompatiblePHIs = [&PHIToOpcodes, this, &R](Value *V1,
                                                     ArrayRef<Value *> VL) {
    if (VL.empty() || V1 == VL.back())
      return true;
    Value *V2 = VL.back();
    if (V1->getType() != V2->getType())
      return false;
    ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
    ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
    if (Opcodes1.size() != Opcodes2.size())
      return false;
    for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
      // Undefs are compatible with any other value.
      // ...
      if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))
        if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
          if (R.isDeleted(I1) || R.isDeleted(I2))
            return false;
          if (I1->getParent() != I2->getParent())
            return false;
          // ...
        }
      // ...
      if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID())
        return false;
    }
    return true;
  };
  bool HaveVectorizedPhiNodes = false;
  do {
    // Collect the incoming values from the PHIs.
    Incoming.clear();
    for (Instruction &I : *BB) {
      auto *P = dyn_cast<PHINode>(&I);
      if (!P)
        break;
      // No need to analyze deleted, vectorized and non-vectorizable
      // instructions.
      if (!VisitedInstrs.count(P) && !R.isDeleted(P) &&
          isValidElementType(P->getType()))
        Incoming.push_back(P);
    }

    if (Incoming.size() <= 1)
      break;

    // Find the corresponding non-phi nodes for better matching when trying
    // to build the tree.
    for (Value *V : Incoming) {
      SmallVectorImpl<Value *> &Opcodes =
          PHIToOpcodes.try_emplace(V).first->getSecond();
      if (!Opcodes.empty())
        continue;
      SmallVector<Value *, 4> Nodes(1, V);
      SmallPtrSet<Value *, 4> Visited;
      while (!Nodes.empty()) {
        auto *PHI = cast<PHINode>(Nodes.pop_back_val());
        if (!Visited.insert(PHI).second)
          continue;
        for (Value *V : PHI->incoming_values()) {
          if (auto *PHI1 = dyn_cast<PHINode>(V)) {
            Nodes.push_back(PHI1);
            continue;
          }
          Opcodes.emplace_back(V);
        }
      }
    }

    HaveVectorizedPhiNodes = tryToVectorizeSequence<Value>(
        Incoming, PHICompare, AreCompatiblePHIs,
        [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
          return tryToVectorizeList(Candidates, R, MaxVFOnly);
        },
        /*MaxVFOnly=*/true, R);
    Changed |= HaveVectorizedPhiNodes;
    if (HaveVectorizedPhiNodes && any_of(PHIToOpcodes, [&](const auto &P) {
          auto *PHI = dyn_cast<PHINode>(P.first);
          return !PHI || R.isDeleted(PHI);
        }))
      PHIToOpcodes.clear();
    VisitedInstrs.insert(Incoming.begin(), Incoming.end());
  } while (HaveVectorizedPhiNodes);
  VisitedInstrs.clear();

  InstSetVector PostProcessInserts;
  SmallSetVector<CmpInst *, 8> PostProcessCmps;
  // Vectorizes Inserts in PostProcessInserts and, if VectorizeCmps is true,
  // also vectorizes PostProcessCmps.
  auto VectorizeInsertsAndCmps = [&](bool VectorizeCmps) {
    bool Changed = vectorizeInserts(PostProcessInserts, BB, R);
    if (VectorizeCmps) {
      Changed |= vectorizeCmpInsts(reverse(PostProcessCmps), BB, R);
      PostProcessCmps.clear();
    }
    PostProcessInserts.clear();
    return Changed;
  };
  // Returns true if I is in PostProcessInserts or PostProcessCmps.
  auto IsInPostProcessInstrs = [&](Instruction *I) {
    if (auto *Cmp = dyn_cast<CmpInst>(I))
      return PostProcessCmps.contains(Cmp);
    return isa<InsertElementInst, InsertValueInst>(I) &&
           PostProcessInserts.contains(I);
  };
  // Returns true if the instruction I has no users.
  auto HasNoUsers = [](Instruction *I) {
    return I->use_empty() &&
           (I->getType()->isVoidTy() || isa<CallInst, InvokeInst>(I));
  };
  for (BasicBlock::iterator It = BB->begin(), E = BB->end(); It != E; ++It) {
    // ...
    if (R.isDeleted(&*It))
      continue;
    // We may go through BB multiple times so skip the one we have checked.
    if (!VisitedInstrs.insert(&*It).second) {
      if (HasNoUsers(&*It) &&
          VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator())) {
        // Restart, since some instructions were deleted and the iterator may
        // have been invalidated.
        Changed = true;
        It = BB->begin();
        E = BB->end();
      }
      continue;
    }

    // Try to vectorize reductions that use PHINodes.
    if (PHINode *P = dyn_cast<PHINode>(It)) {
      // Check that the PHI is a reduction PHI.
      if (P->getNumIncomingValues() == 2) {
        // Try to match and vectorize a horizontal reduction.
        Instruction *Root = getReductionInstr(DT, P, BB, LI);
        if (Root && vectorizeRootInstruction(P, Root, BB, R)) {
          Changed = true;
          It = BB->begin();
          E = BB->end();
          continue;
        }
      }
      // Try to vectorize the incoming values of the PHI, to catch reductions
      // that feed into PHIs.
      for (unsigned I : seq<unsigned>(P->getNumIncomingValues())) {
        // Skip if the incoming block is the current BB for now. Also, bypass
        // unreachable IR for efficiency and to avoid crashing.
        if (BB == P->getIncomingBlock(I) ||
            !DT->isReachableFromEntry(P->getIncomingBlock(I)))
          continue;
        // Postponed instructions should not be vectorized here, delay their
        // vectorization.
        if (auto *PI = dyn_cast<Instruction>(P->getIncomingValue(I));
            PI && !IsInPostProcessInstrs(PI)) {
          bool Res =
              vectorizeRootInstruction(nullptr, PI, P->getIncomingBlock(I), R);
          Changed |= Res;
          if (Res && R.isDeleted(P)) {
            It = BB->begin();
            E = BB->end();
            break;
          }
        }
      }
      continue;
    }

    if (HasNoUsers(&*It)) {
      bool OpsChanged = false;
      auto *SI = dyn_cast<StoreInst>(It);
      bool TryToVectorizeRoot = ShouldStartVectorizeHorAtStore || !SI;
      if (SI) {
        auto *I = Stores.find(getUnderlyingObject(SI->getPointerOperand()));
        // Try to vectorize chain in store, if this is the only store to the
        // address in the block.
        // ...
        TryToVectorizeRoot |= (I == Stores.end() || I->second.size() == 1) &&
                              SI->getValueOperand()->hasOneUse();
      }
      if (TryToVectorizeRoot) {
        for (auto *V : It->operand_values()) {
          // Postponed instructions should not be vectorized here, delay
          // their vectorization.
          if (auto *VI = dyn_cast<Instruction>(V);
              VI && !IsInPostProcessInstrs(VI))
            // Try to match and vectorize a horizontal reduction.
            OpsChanged |= vectorizeRootInstruction(nullptr, VI, BB, R);
        }
      }
      // Start vectorization of post-process list of instructions from the
      // top-tree instructions to try to vectorize as many instructions as
      // possible.
      OpsChanged |=
          VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator());
      if (OpsChanged) {
        // Restart, since some instructions were deleted and the iterator may
        // have been invalidated.
        Changed = true;
        It = BB->begin();
        E = BB->end();
        continue;
      }
    }

    if (isa<InsertElementInst, InsertValueInst>(It))
      PostProcessInserts.insert(&*It);
    else if (isa<CmpInst>(It))
      PostProcessCmps.insert(cast<CmpInst>(&*It));
  }

  return Changed;
}
bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
  bool Changed = false;
  for (auto &Entry : GEPs) {
    // If the getelementptr list has fewer than two elements, there's nothing
    // to do.
    if (Entry.second.size() < 2)
      continue;

    LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
                      << Entry.second.size() << ".\n");

    // Process the GEP list in chunks suitable for the target's supported
    // vector size. The maximum number of elements is based on the size of
    // the index expression.
    auto *It = find_if(Entry.second, [&](GetElementPtrInst *GEP) {
      return !R.isDeleted(GEP);
    });
    if (It == Entry.second.end())
      continue;
    unsigned MaxVecRegSize = R.getMaxVecRegSize();
    unsigned EltSize = R.getVectorElementSize(*(*It)->idx_begin());
    if (MaxVecRegSize < EltSize)
      continue;

    unsigned MaxElts = MaxVecRegSize / EltSize;
    for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) {
      auto Len = std::min<unsigned>(BE - BI, MaxElts);
      ArrayRef<GetElementPtrInst *> GEPList(&Entry.second[BI], Len);

      // Initialize a set of candidate getelementptrs, preserving program
      // order to minimize later reordering of any index loads.
      SetVector<Value *> Candidates(GEPList.begin(), GEPList.end());

      // Some of the candidates may have already been vectorized or had
      // their index folded to a constant; remove them.
      Candidates.remove_if([&R](Value *I) {
        return R.isDeleted(cast<Instruction>(I)) ||
               isa<Constant>(cast<GetElementPtrInst>(I)->idx_begin()->get());
      });

      // Remove from the set of candidates all pairs of getelementptrs with
      // constant differences. Such getelementptrs are likely not good
      // candidates for vectorization in a bottom-up phase since one can be
      // computed from the other.
      for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1;
           ++I) {
        auto *GEPI = GEPList[I];
        if (!Candidates.count(GEPI))
          continue;
        const SCEV *SCEVI = SE->getSCEV(GEPList[I]);
        for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
          auto *GEPJ = GEPList[J];
          const SCEV *SCEVJ = SE->getSCEV(GEPList[J]);
          if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
            Candidates.remove(GEPI);
            Candidates.remove(GEPJ);
          } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
            Candidates.remove(GEPJ);
          }
        }
      }

      // We break out of the above computation as soon as we know there are
      // fewer than two candidates remaining.
      if (Candidates.size() < 2)
        continue;

      // Add the single, non-constant index of each candidate to the bundle.
      SmallVector<Value *, 16> Bundle(Candidates.size());
      auto BundleIndex = 0u;
      for (auto *V : Candidates) {
        auto *GEP = cast<GetElementPtrInst>(V);
        auto *GEPIdx = GEP->idx_begin()->get();
        assert(GEP->getNumIndices() == 1 && !isa<Constant>(GEPIdx));
        Bundle[BundleIndex++] = GEPIdx;
      }

      // Try and vectorize the indices. We are currently only interested in
      // gather-like cases of the form:
      //
      //   ... = g[a[0] - b[0]] + g[a[1] - b[1]] + ...
      //
      // where the loads of "a", the loads of "b", and the subtractions can
      // be performed in parallel.
      Changed |= tryToVectorizeList(Bundle, R);
    }
  }
  return Changed;
}
bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
  bool Changed = false;
  // Sort by type, base pointers and values operand. Value operands must be
  // compatible (have the same opcode, same parent), otherwise it is
  // definitely not profitable to try to vectorize them.
  auto &&StoreSorter = [this](StoreInst *V, StoreInst *V2) {
    if (V->getValueOperand()->getType()->getTypeID() <
        V2->getValueOperand()->getType()->getTypeID())
      return true;
    if (V->getValueOperand()->getType()->getTypeID() >
        V2->getValueOperand()->getType()->getTypeID())
      return false;
    if (V->getPointerOperandType()->getTypeID() <
        V2->getPointerOperandType()->getTypeID())
      return true;
    if (V->getPointerOperandType()->getTypeID() >
        V2->getPointerOperandType()->getTypeID())
      return false;
    if (V->getValueOperand()->getType()->getScalarSizeInBits() <
        V2->getValueOperand()->getType()->getScalarSizeInBits())
      return true;
    if (V->getValueOperand()->getType()->getScalarSizeInBits() >
        V2->getValueOperand()->getType()->getScalarSizeInBits())
      return false;
    // UndefValues are compatible with all other values.
    if (auto *I1 = dyn_cast<Instruction>(V->getValueOperand()))
      if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
        DomTreeNodeBase<llvm::BasicBlock> *NodeI1 =
            DT->getNode(I1->getParent());
        DomTreeNodeBase<llvm::BasicBlock> *NodeI2 =
            DT->getNode(I2->getParent());
        assert(NodeI1 && "Should only process reachable instructions");
        assert(NodeI2 && "Should only process reachable instructions");
        assert((NodeI1 == NodeI2) ==
                   (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
               "Different nodes should have different DFS numbers");
        if (NodeI1 != NodeI2)
          return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
        return I1->getOpcode() < I2->getOpcode();
      }
    return V->getValueOperand()->getValueID() <
           V2->getValueOperand()->getValueID();
  };

  auto AreCompatibleStores = [this](ArrayRef<StoreInst *> VL,
                                    StoreInst *V1) {
    if (VL.empty() || V1 == VL.back())
      return true;
    bool SameParent = true;
    StoreInst *V2 = VL.back();
    // ... (check types/opcodes of the stored values and track whether all
    //      of them share a parent:)
    //   SameParent &= I1 && I2 && I1->getParent() == I2->getParent();
    if (SameParent) {
      SmallVector<Value *> NewVL(VL.size() + 1);
      for (auto [SI, V] : zip(VL, NewVL))
        V = SI->getValueOperand();
      NewVL.back() = V1->getValueOperand();
      InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
      InstructionsState S = Analysis.buildInstructionsState(NewVL, /* ... */);
      // ...
    }
    return V1->getValueOperand()->getValueID() ==
           V2->getValueOperand()->getValueID();
  };

  // Attempt to sort and vectorize each of the store-groups.
  DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>> Attempted;
  for (auto &Pair : Stores) {
    if (Pair.second.size() < 2)
      continue;

    LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
                      << Pair.second.size() << ".\n");

    // ...
    // Reverse stores to do bottom-to-top analysis. This is important if the
    // values are stored to the same addresses several times; in this case
    // the last store in the sequence should be used.
    SmallVector<StoreInst *> ReversedStores(Pair.second.rbegin(),
                                            Pair.second.rend());
    Changed |= tryToVectorizeSequence<StoreInst>(
        ReversedStores, StoreSorter, AreCompatibleStores,
        [&](ArrayRef<StoreInst *> Candidates, bool) {
          return vectorizeStores(Candidates, R, Attempted);
        },
        /*MaxVFOnly=*/false, R);
  }
  return Changed;
}
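// Illustrative note (assumption, not from the original source): the
// 'Attempted' set memoizes a key built from the first/last candidate stores
// and the chain size, so that repeated calls to vectorizeStores() within one
// run over the function do not re-analyze identical candidate chains.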
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static bool isConstant(const MachineInstr &MI)
AMDGPU Register Bank Select
ReachingDefInfo InstSet InstSet & Ignore
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Function Alias Analysis Results
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
block Block Frequency Analysis
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
static cl::opt< IntrinsicCostStrategy > IntrinsicCost("intrinsic-cost-strategy", cl::desc("Costing strategy for intrinsic instructions"), cl::init(IntrinsicCostStrategy::InstructionCost), cl::values(clEnumValN(IntrinsicCostStrategy::InstructionCost, "instruction-cost", "Use TargetTransformInfo::getInstructionCost"), clEnumValN(IntrinsicCostStrategy::IntrinsicCost, "intrinsic-cost", "Use TargetTransformInfo::getIntrinsicInstrCost"), clEnumValN(IntrinsicCostStrategy::TypeBasedIntrinsicCost, "type-based-intrinsic-cost", "Calculate the intrinsic cost based only on argument types")))
static APInt getElementIndex(TypeSize ElemSize, APInt &Offset)
This file provides an implementation of debug counters.
#define DEBUG_COUNTER(VARNAME, COUNTERNAME, DESC)
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
static bool runImpl(Function &F, const TargetLowering &TLI, AssumptionCache *AC)
This is the interface for a simple mod/ref and alias analysis over globals.
static Value * getCondition(Instruction *I)
static void setCondition(Instruction *I, Value *NewCond)
static const HTTPClientCleanup Cleanup
static Type * getIndexType(Value *In)
Module.h This file contains the declarations for the Module class.
This defines the Use class.
iv Induction Variable Users
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static bool isSplat(Value *V)
Return true if V is a splat of a value (which is used when multiplying a matrix with a scalar).
This file provides utility analysis objects describing memory locations.
MachineInstr unsigned OpIdx
uint64_t IntrinsicInst * II
static bool IsSelect(MachineInstr &MI)
This file defines the PriorityQueue class.
const SmallVectorImpl< MachineOperand > & Cond
static std::optional< OperandInfo > getOperandInfo(const MachineOperand &MO)
static bool isValid(const char C)
Returns true if C is a valid mangled character: <0-9a-zA-Z_>.
static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts, TargetTransformInfo *TTI, bool MustMatchOrInst)
static cl::opt< bool > RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden, cl::desc("Run the SLP vectorization passes"))
static bool isAlternateInstruction(Instruction *I, Instruction *MainOp, Instruction *AltOp, const TargetLibraryInfo &TLI)
Checks if the specified instruction I is an alternate operation for the given MainOp and AltOp instructions.
static FixedVectorType * getWidenedType(Type *ScalarTy, unsigned VF)
static bool isVectorLikeInstWithConstOps(Value *V)
Checks if V is one of vector-like instructions, i.e. undef, insertelement/extractelement with constant indices for fixed vector type or extractvalue instruction.
static cl::opt< bool > SplitAlternateInstructions("slp-split-alternate-instructions", cl::init(true), cl::Hidden, cl::desc("Improve the code quality by splitting alternate instructions"))
static bool isRepeatedNonIdentityClusteredMask(ArrayRef< int > Mask, unsigned Sz)
Checks if the given mask is a "clustered" mask with the same clusters of size Sz, which are not identity submasks.
static bool isMaskedLoadCompress(ArrayRef< Value * > VL, ArrayRef< Value * > PointerOps, ArrayRef< unsigned > Order, const TargetTransformInfo &TTI, const DataLayout &DL, ScalarEvolution &SE, AssumptionCache &AC, const DominatorTree &DT, const TargetLibraryInfo &TLI, const function_ref< bool(Value *)> AreAllUsersVectorized, bool &IsMasked, unsigned &InterleaveFactor, SmallVectorImpl< int > &CompressMask, VectorType *&LoadVecTy)
Checks if the VL can be transformed to a (masked) load + compress or a (masked) interleaved load.
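As an illustration (not taken from the source): if only lanes {0, 2, 3, 7} of a contiguous memory region are needed, the whole region can be read with one (possibly masked) wide load and the live lanes then packed together by a compress-style shuffle mask such as <0, 2, 3, 7>.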
static const unsigned MaxPHINumOperands
Maximum allowed number of operands in the PHI nodes.
static cl::opt< bool > VectorizeCopyableElements("slp-copyable-elements", cl::init(true), cl::Hidden, cl::desc("Try to replace values with the idempotent instructions for " "better vectorization."))
Enables vectorization of copyable elements.
static cl::opt< int > MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static bool isCommutative(Instruction *I, Value *ValWithUses)
static bool allSameOpcode(ArrayRef< Value * > VL)
static InstructionCost canConvertToFMA(ArrayRef< Value * > VL, const InstructionsState &S, DominatorTree &DT, const DataLayout &DL, TargetTransformInfo &TTI, const TargetLibraryInfo &TLI)
Check if we can convert a fadd/fsub sequence to FMA.
static cl::opt< unsigned > MaxProfitableLoadStride("slp-max-stride", cl::init(8), cl::Hidden, cl::desc("The maximum stride, considered to be profitable."))
static bool findBuildAggregate(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, const BoUpSLP &R)
Recognize construction of vectors like:
  %ra = insertelement <4 x float> poison, float %s0, i32 0
  %rb = insertelement <4 x float> %ra, float %s1, i32 1
  %rc = insertelement <4 x float> %rb, float %s2, i32 2
  %rd = insertelement <4 x float> %rc, float %s3, i32 3
starting from the last insertelement or insertvalue instruction.
static bool clusterSortPtrAccesses(ArrayRef< Value * > VL, ArrayRef< BasicBlock * > BBs, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
static unsigned getNumElements(Type *Ty)
static SmallBitVector buildUseMask(int VF, ArrayRef< int > Mask, UseMask MaskArg)
Prepares a use bitset for the given mask either for the first argument or for the second.
static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0, Value *Op1, const TargetLibraryInfo &TLI)
Checks if the provided operands of 2 cmp instructions are compatible, i.e.
static Value * createInsertVector(IRBuilderBase &Builder, Value *Vec, Value *V, unsigned Index, function_ref< Value *(Value *, Value *, ArrayRef< int >)> Generator={})
Creates subvector insert.
static void findBuildAggregateRec(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, unsigned OperandOffset, const BoUpSLP &R)
static unsigned getNumElems(unsigned Size, unsigned PartNumElems, unsigned Part)
Returns the correct remaining number of elements, considering the total amount Size, the (power-of-2) number of elements in a single register PartNumElems, and the current register (part) Part.
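Worked example (illustrative): with Size = 10 and PartNumElems = 4, parts 0 and 1 each hold 4 elements and part 2 holds the remaining 10 - 2*4 = 2.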
static InstructionCost getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={})
Returns the cost of the shuffle instructions with the given Kind, vector type Tp and optional Mask.
static bool isSimple(Instruction *I)
static const int MinScheduleRegionSize
If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling regions to be handled.
static cl::opt< unsigned > MinProfitableStridedLoads("slp-min-strided-loads", cl::init(2), cl::Hidden, cl::desc("The minimum number of loads, which should be considered strided, " "if the stride is > 1 or is runtime value"))
static bool isFirstInsertElement(const InsertElementInst *IE1, const InsertElementInst *IE2)
Checks if the IE1 instruction is followed by the IE2 instruction in the buildvector sequence.
static cl::opt< int > LookAheadMaxDepth("slp-max-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for operand reordering scores"))
static cl::opt< unsigned > MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden, cl::desc("Maximum SLP vectorization factor (0=unlimited)"))
static void reorderReuses(SmallVectorImpl< int > &Reuses, ArrayRef< int > Mask)
Reorders the given Reuses mask according to the given Mask.
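A minimal standalone sketch of the scatter-style reordering this describes, assuming -1 marks a poison lane; the function name and the use of std::vector are illustrative, not the file's actual code:

  #include <cstddef>
  #include <vector>

  // Scatter each old reuse index to the position that Mask assigns it.
  std::vector<int> reorderReusesSketch(const std::vector<int> &Reuses,
                                       const std::vector<int> &Mask) {
    std::vector<int> Result(Reuses.size(), -1); // -1 = poison lane
    for (std::size_t I = 0; I < Mask.size(); ++I)
      if (Mask[I] != -1)
        Result[Mask[I]] = Reuses[I];
    return Result;
  }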
static void combineOrders(MutableArrayRef< unsigned > Order, ArrayRef< unsigned > SecondaryOrder)
static const unsigned MaxMemDepDistance
static cl::opt< bool > ViewSLPTree("view-slp-tree", cl::Hidden, cl::desc("Display the SLP trees with Graphviz"))
static const SCEV * calculateRtStride(ArrayRef< Value * > PointerOps, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
Checks if the provided list of pointers Pointers represents the strided pointers for type ElemTy.
static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, TargetLibraryInfo *TLI, const TargetTransformInfo *TTI)
static DebugLoc getDebugLocFromPHI(PHINode &PN)
static std::optional< unsigned > getExtractIndex(const Instruction *E)
static cl::opt< bool > VectorizeNonPowerOf2("slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden, cl::desc("Try to vectorize with non-power-of-2 number of elements."))
static cl::opt< unsigned > MinTreeSize("slp-min-tree-size", cl::init(3), cl::Hidden, cl::desc("Only vectorize small trees if they are fully vectorizable"))
static void reorderOrder(SmallVectorImpl< unsigned > &Order, ArrayRef< int > Mask, bool BottomOrder=false)
Reorders the given Order according to the given Mask.
static cl::opt< bool > ForceStridedLoads("slp-force-strided-loads", cl::init(false), cl::Hidden, cl::desc("Generate strided loads even if they are not " "profitable. Used for testing only."))
static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not less than Sz, that forms a type which TTI splits into whole vector types during legalization.
static bool isMainInstruction(Instruction *I, Instruction *MainOp, Instruction *AltOp, const TargetLibraryInfo &TLI)
Checks if the specified instruction I is a main operation for the given MainOp and AltOp instructions.
static T * performExtractsShuffleAction(MutableArrayRef< std::pair< T *, SmallVector< int > > > ShuffleMask, Value *Base, function_ref< unsigned(T *)> GetVF, function_ref< std::pair< T *, bool >(T *, ArrayRef< int >, bool)> ResizeAction, function_ref< T *(ArrayRef< int >, ArrayRef< T * >)> Action)
Does the analysis of the provided shuffle masks and performs the requested actions on the vectors wit...
static cl::opt< bool > ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions"))
static bool isConstant(Value *V)
static bool isSplat(ArrayRef< Value * > VL)
static cl::opt< int > SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, cl::desc("Only vectorize if you gain more than this " "number "))
static unsigned getPartNumElems(unsigned Size, unsigned NumParts)
Returns the power-of-2 number of elements in a single register (part), given the total number of elements Size and the number of registers (parts) NumParts.
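For instance (assuming the power-of-2 rounding the description implies): Size = 24 split into NumParts = 4 gives ceil(24/4) = 6 elements per part, rounded up to the power-of-2 value 8.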
static bool allConstant(ArrayRef< Value * > VL)
static constexpr int UsesLimit
static std::optional< unsigned > getElementIndex(const Value *Inst, unsigned Offset=0)
static bool isReductionCandidate(Instruction *I)
Returns true if I is a candidate instruction for reduction vectorization.
static bool checkTreeSizes(ArrayRef< std::pair< unsigned, unsigned > > Sizes, bool First)
Checks if the quadratic mean deviation is less than 90% of the mean size.
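Spelled out as a formula (an interpretation of the stated criterion, not a quote from the source), for tree sizes S_1, ..., S_n with mean \bar S:

  \sqrt{\frac{1}{n}\sum_{i=1}^{n}(S_i - \bar S)^2} < 0.9\,\bar S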
static unsigned getShufflevectorNumGroups(ArrayRef< Value * > VL)
static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI, const TargetLibraryInfo &TLI)
static cl::opt< bool > SLPSkipEarlyProfitabilityCheck("slp-skip-early-profitability-check", cl::init(false), cl::Hidden, cl::desc("When true, SLP vectorizer bypasses profitability checks based on " "heuristics and makes vectorization decision via cost modeling."))
static std::pair< size_t, size_t > generateKeySubkey(Value *V, const TargetLibraryInfo *TLI, function_ref< hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator, bool AllowAlternate)
Generates key/subkey pair for the given value to provide effective sorting of the values and better detection of the vectorizable values sequences.
static cl::opt< bool > ShouldStartVectorizeHorAtStore("slp-vectorize-hor-store", cl::init(false), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions feeding into a store"))
static unsigned getNumberOfPotentiallyCommutativeOps(Instruction *I)
static InstructionCost getScalarizationOverhead(const TargetTransformInfo &TTI, Type *ScalarTy, VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={})
This is similar to TargetTransformInfo::getScalarizationOverhead, but if ScalarTy is a FixedVectorTyp...
static bool buildCompressMask(ArrayRef< Value * > PointerOps, ArrayRef< unsigned > Order, Type *ScalarTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< int > &CompressMask)
Builds compress-like mask for shuffles for the given PointerOps, ordered with Order.
static std::pair< InstructionCost, InstructionCost > getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, ArrayRef< Type * > ArgTys)
Calculates the costs of vectorized intrinsic (if possible) and vectorized function (if possible) calls.
static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements, SmallVectorImpl< int > &Mask)
static cl::opt< bool > SLPReVec("slp-revec", cl::init(false), cl::Hidden, cl::desc("Enable vectorization for wider vector utilization"))
static SmallVector< Constant * > replicateMask(ArrayRef< Constant * > Val, unsigned VF)
Replicates the given Val VF times.
static SmallVector< Type * > buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID, const unsigned VF, unsigned MinBW, const TargetTransformInfo *TTI)
Builds the argument types vector for the given call instruction with the given ID for the specified vector factor VF.
static cl::opt< int > RootLookAheadMaxDepth("slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for searching best rooting option"))
static const unsigned AliasedCheckLimit
static Type * getValueType(Value *V)
Returns the type of the given value/instruction V.
static std::string shortBundleName(ArrayRef< Value * > VL, int Idx=-1)
Print a short descriptor of the instruction bundle suitable for debug output.
static LLVM_DUMP_METHOD void dumpOrder(const BoUpSLP::OrdersType &Order)
static bool isValidElementType(Type *Ty)
Predicate for the element types that the SLP vectorizer supports.
static Instruction * getReductionInstr(const DominatorTree *DT, PHINode *P, BasicBlock *ParentBB, LoopInfo *LI)
Try and get a reduction instruction from a phi node.
static SmallVector< int > calculateShufflevectorMask(ArrayRef< Value * > VL)
static bool allSameType(ArrayRef< Value * > VL)
static MemoryLocation getLocation(Instruction *I)
static bool allSameBlock(ArrayRef< Value * > VL)
static unsigned getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not greater than Sz, that forms a type which TTI splits into whole vector types during legalization.
static bool areTwoInsertFromSameBuildVector(InsertElementInst *VU, InsertElementInst *V, function_ref< Value *(InsertElementInst *)> GetBaseOperand)
Check if two insertelement instructions are from the same buildvector.
static bool arePointersCompatible(Value *Ptr1, Value *Ptr2, const TargetLibraryInfo &TLI, bool CompareOpcodes=true)
static std::pair< InstructionCost, InstructionCost > getGEPCosts(const TargetTransformInfo &TTI, ArrayRef< Value * > Ptrs, Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind, Type *ScalarTy, VectorType *VecTy)
Calculate the scalar and the vector costs from vectorizing a set of GEPs.
static SmallBitVector isUndefVector(const Value *V, const SmallBitVector &UseMask={})
Checks if the given value is actually an undefined constant vector.
static Instruction * findInstructionWithOpcode(ArrayRef< Value * > VL, unsigned Opcode)
Find an instruction with a specific opcode in VL.
static InstructionCost getExtractWithExtendCost(const TargetTransformInfo &TTI, unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput)
This is similar to TargetTransformInfo::getExtractWithExtendCost, but if Dst is a FixedVectorType,...
static InstructionsState getSameOpcode(ArrayRef< Value * > VL, const TargetLibraryInfo &TLI)
static cl::opt< int > ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden, cl::desc("Limit the size of the SLP scheduling region per block"))
Limits the size of scheduling regions in a block.
static std::pair< Instruction *, Instruction * > getMainAltOpsNoStateVL(ArrayRef< Value * > VL)
Returns main/alternate instructions for the given VL.
static Instruction * tryGetSecondaryReductionRoot(PHINode *Phi, Instruction *Root)
We could have an initial reduction that is not an add.
static RecurKind getRdxKind(Value *V)
Gets recurrence kind from the specified value.
static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1)
static void gatherPossiblyVectorizableLoads(const BoUpSLP &R, ArrayRef< Value * > VL, const DataLayout &DL, ScalarEvolution &SE, const TargetTransformInfo &TTI, SmallVectorImpl< SmallVector< std::pair< LoadInst *, int64_t > > > &GatheredLoads, bool AddNew=true)
Tries to find a subvector of loads and builds a new vector of only loads, if it can be profitable.
static cl::opt< int > MinVectorRegSizeOption("slp-min-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static std::optional< TargetTransformInfo::ShuffleKind > isFixedVectorShuffle(ArrayRef< Value * > VL, SmallVectorImpl< int > &Mask, AssumptionCache *AC)
Checks if the vector of instructions can be represented as a shuffle, like: x0 = extractelement <4 x ...
static bool tryToVectorizeSequence(SmallVectorImpl< T * > &Incoming, function_ref< bool(T *, T *)> Comparator, function_ref< bool(ArrayRef< T * >, T *)> AreCompatible, function_ref< bool(ArrayRef< T * >, bool)> TryToVectorizeHelper, bool MaxVFOnly, BoUpSLP &R)
static std::optional< unsigned > getAggregateSize(Instruction *InsertInst)
static std::optional< unsigned > getInsertExtractIndex(const Value *Inst, unsigned Offset)
static cl::opt< unsigned > RecursionMaxDepth("slp-recursion-max-depth", cl::init(12), cl::Hidden, cl::desc("Limit the recursion depth when building a vectorizable tree"))
static bool tryToFindDuplicates(SmallVectorImpl< Value * > &VL, SmallVectorImpl< int > &ReuseShuffleIndices, const TargetTransformInfo &TTI, const TargetLibraryInfo &TLI, const InstructionsState &S, const BoUpSLP::EdgeInfo &UserTreeIdx, bool TryPad=false)
Checks that every instruction appears once in the list and if not, packs them, building the ReuseShuffleIndices mask.
static Align computeCommonAlignment(ArrayRef< Value * > VL)
Calculates minimal alignment as a common alignment.
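For example (illustrative): loads aligned to 16, 8, and 4 bytes have a common alignment of 4, the minimum of the three.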
static void addMask(SmallVectorImpl< int > &Mask, ArrayRef< int > SubMask, bool ExtendingManyInputs=false)
Shuffles Mask in accordance with the given SubMask.
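A rough sketch of the mask composition this implies, assuming -1 denotes a poison lane and that SubMask indexes into the existing Mask; the names and std::vector are illustrative, not the file's actual code:

  #include <cstddef>
  #include <utility>
  #include <vector>

  // Compose: select through SubMask first, then through the existing Mask.
  void addMaskSketch(std::vector<int> &Mask, const std::vector<int> &SubMask) {
    std::vector<int> Composed(SubMask.size(), -1); // -1 = poison lane
    for (std::size_t I = 0; I < SubMask.size(); ++I)
      if (SubMask[I] != -1)
        Composed[I] = Mask.empty() ? SubMask[I] : Mask[SubMask[I]];
    Mask = std::move(Composed);
  }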
static void fixupOrderingIndices(MutableArrayRef< unsigned > Order)
Order may have elements assigned the special value (size), which is out of bounds.
static Value * createExtractVector(IRBuilderBase &Builder, Value *Vec, unsigned SubVecVF, unsigned Index)
Generates subvector extract using Generator or using default shuffle.
static cl::opt< bool > DisableTreeReorder("slp-disable-tree-reorder", cl::init(false), cl::Hidden, cl::desc("Disable tree reordering even if it is " "profitable. Used for testing only."))
static Instruction * getNonPhiOperand(Instruction *I, PHINode *Phi)
Returns the first operand of I that does not match Phi.
static InstructionCost getVectorInstrCost(const TargetTransformInfo &TTI, Type *ScalarTy, unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Scalar, ArrayRef< std::tuple< Value *, User *, int > > ScalarUserAndIdx)
This is similar to TargetTransformInfo::getVectorInstrCost, but if ScalarTy is a FixedVectorType,...
static SmallBitVector getAltInstrMask(ArrayRef< Value * > VL, Type *ScalarTy, unsigned Opcode0, unsigned Opcode1)
static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI, const DominatorTree &DT)
Compare two cmp instructions.
static bool isReverseOrder(ArrayRef< unsigned > Order)
Check if Order represents reverse order.
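E.g., for four lanes the reverse order is <3, 2, 1, 0> (an illustration, not from the source).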
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
This file defines generic set operations that may be used on set's of different types,...
This file implements a set that has insertion order iteration characteristics.
This file implements the SmallBitVector class.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallString class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Merges shuffle masks and emits final shuffle instruction, if required.
ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI, ArrayRef< Value * > VectorizedVals, BoUpSLP &R, SmallPtrSetImpl< Value * > &CheckedExtracts)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
void resetForSameNode()
Reset the builder to handle perfect diamond match.
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
std::optional< InstructionCost > needToDelay(const TreeEntry *, ArrayRef< SmallVector< const TreeEntry * > >) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
InstructionCost finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &, function_ref< Value *(Value *, Value *, ArrayRef< int >)>)> Action={})
Finalize emission of the shuffles.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
InstructionCost createFreeze(InstructionCost Cost)
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
void add(Value *V1, ArrayRef< int > Mask, bool ForExtracts=false)
Adds another input vector and the mask for the shuffling.
Merges shuffle masks and emits final shuffle instruction, if required.
void add(Value *V1, ArrayRef< int > Mask, bool=false)
Adds another input vector and the mask for the shuffling.
void addOrdered(Value *V1, ArrayRef< unsigned > Order)
Adds another input vector and the mask for the shuffling.
std::optional< Value * > needToDelay(const TreeEntry *E, ArrayRef< SmallVector< const TreeEntry * > > Deps) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Adds single input vector (in form of tree entry) and the mask for its shuffling.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
Value * createFreeze(Value *V)
ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
Adds 2 input vectors (in form of tree entries) and the mask for their shuffling.
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
Adjusts extractelements after reusing them.
~ShuffleInstructionBuilder()
void resetForSameNode()
Reset the builder to handle perfect diamond match.
Value * finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &, function_ref< Value *(Value *, Value *, ArrayRef< int >)>)> Action={})
Finalize emission of the shuffles.
A manager for alias analyses.
Class for arbitrary precision integers.
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
void clearBit(unsigned BitPosition)
Set a given bit to 0.
uint64_t getZExtValue() const
Get zero extended value.
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
LLVM_ABI APInt urem(const APInt &RHS) const
Unsigned remainder operation.
unsigned getBitWidth() const
Return the number of bits in the APInt.
bool ult(const APInt &RHS) const
Unsigned less than comparison.
void clearAllBits()
Set every bit to 0.
void negate()
Negate this APInt in place.
unsigned logBase2() const
void setAllBits()
Set every bit to 1.
void setBits(unsigned loBit, unsigned hiBit)
Set the bits from loBit (inclusive) to hiBit (exclusive) to 1.
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
bool isOne() const
Determine if this is a value of 1.
static APInt getBitsSetFrom(unsigned numBits, unsigned loBit)
Constructs an APInt value that has a contiguous range of bits set.
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
PassT::Result * getCachedResult(IRUnitT &IR) const
Get the cached result of an analysis pass for a given IR unit.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory), i.e. a start pointer and a length.
bool equals(ArrayRef RHS) const
equals - Check for element-wise equality.
const T & back() const
back - Get the last element.
ArrayRef< T > take_front(size_t N=1) const
Return a copy of *this with only the first N elements.
ArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
const T & front() const
front - Get the first element.
size_t size() const
size - Get the array size.
bool empty() const
empty - Check if the array is empty.
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
const T & consume_front()
consume_front() - Returns the first element and drops it from ArrayRef.
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
static LLVM_ABI Attribute getWithAlignment(LLVMContext &Context, Align Alignment)
Return a uniquified Attribute object that has the specific alignment set.
LLVM Basic Block Representation.
iterator begin()
Instruction iterator methods.
const Function * getParent() const
Return the enclosing method, or null if none.
InstListType::reverse_iterator reverse_iterator
InstListType::iterator iterator
Instruction iterators...
LLVM_ABI const_iterator getFirstNonPHIOrDbgOrAlloca() const
Returns an iterator to the first instruction in this block that is not a PHINode, a debug intrinsic,...
InstListType::const_reverse_iterator const_reverse_iterator
bool isEHPad() const
Return true if this basic block is an exception handling block.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well formed.
Represents analyses that only rely on functions' control flow.
Base class for all callable instructions (InvokeInst and CallInst). Holds everything related to calling a function.
unsigned getBundleOperandsEndIndex() const
Return the index of the last bundle operand in the Use array.
LLVM_ABI void getOperandBundlesAsDefs(SmallVectorImpl< OperandBundleDef > &Defs) const
Return the list of operand bundles attached to this instruction as a vector of OperandBundleDefs.
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signature does not match the call signature.
bool hasIdenticalOperandBundleSchema(const CallBase &Other) const
Return true if Other has the same sequence of operand bundle tags with the same number of operands on...
unsigned getBundleOperandsStartIndex() const
Return the index of the first bundle operand in the Use array.
Value * getArgOperand(unsigned i) const
FunctionType * getFunctionType() const
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
unsigned arg_size() const
bool hasOperandBundles() const
Return true if this User has any operand bundles.
This class represents a function call, abstracting a target machine's calling convention.
This is the base class for all instructions that perform data casts.
This class is the base class for the comparison instructions.
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
@ ICMP_SLT
signed less than
@ ICMP_SLE
signed less or equal
@ ICMP_UGE
unsigned greater or equal
@ ICMP_UGT
unsigned greater than
@ ICMP_SGT
signed greater than
@ ICMP_ULT
unsigned less than
@ ICMP_SGE
signed greater or equal
@ ICMP_ULE
unsigned less or equal
Predicate getSwappedPredicate() const
For example, EQ->EQ, SLE->SGE, ULT->UGT, OEQ->OEQ, ULE->UGE, OLT->OGT, etc.
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Predicate getPredicate() const
Return the predicate for this instruction.
static LLVM_ABI Constant * getIntToPtr(Constant *C, Type *Ty, bool OnlyIfReduced=false)
static LLVM_ABI Constant * getBinOpIdentity(unsigned Opcode, Type *Ty, bool AllowRHSConstant=false, bool NSZ=false)
Return the identity constant for a binary opcode.
This is the shared class of boolean and integer constants.
static LLVM_ABI ConstantInt * getTrue(LLVMContext &Context)
static LLVM_ABI ConstantInt * getFalse(LLVMContext &Context)
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate for its type.
const APInt & getValue() const
Return the constant as an APInt value reference.
static LLVM_ABI Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
static LLVM_ABI Constant * get(ArrayRef< Constant * > V)
This is an important base class in LLVM.
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string and methods for querying it.
static bool shouldExecute(unsigned CounterName)
static DebugLoc getUnknown()
An analysis that produces DemandedBits for a function.
LLVM_ABI APInt getDemandedBits(Instruction *I)
Return the bits demanded from instruction I.
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exists.
iterator find(const_arg_type_t< KeyT > Val)
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
bool erase(const KeyT &Val)
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
const ValueT & at(const_arg_type_t< KeyT > Val) const
at - Return the entry for the specified key, or abort if no such entry exists.
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Implements a dense probed hash-table based set.
Base class for the actual dominator tree node.
unsigned getDFSNumIn() const
getDFSNumIn/getDFSNumOut - These return the DFS visitation order for nodes in the dominator tree.
Analysis pass which computes a DominatorTree.
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
LLVM_ABI bool isReachableFromEntry(const Use &U) const
Provide an overload for a Use.
LLVM_ABI bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
static constexpr ElementCount getFixed(ScalarTy MinVal)
Convenience struct for specifying and reasoning about fast-math flags.
bool allowReassoc() const
Flag queries.
bool allowContract() const
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
ArrayRef< Type * > params() const
Type * getReturnType() const
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
For the node iterator we just need to turn the TreeEntry iterator into a TreeEntry* iterator so that ...
nodes_iterator operator++()
nodes_iterator(const ItTy &It2)
bool operator!=(const nodes_iterator &N2) const
Common base class shared among various IRBuilders.
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
IntegerType * getInt1Ty()
Fetch the type representing a single bit.
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Value * CreateFreeze(Value *V, const Twine &Name="")
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
LLVM_ABI Value * CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 2 operands which is mangled on the first type.
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
LLVM_ABI CallInst * CreateUnaryIntrinsic(Intrinsic::ID ID, Value *V, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 1 operand which is mangled on its type.
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
LLVM_ABI Value * CreateSelectWithUnknownProfile(Value *C, Value *True, Value *False, StringRef PassName, const Twine &Name="")
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Value * CreateFMul(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
This instruction inserts a single (scalar) element into a VectorType value.
VectorType * getType() const
Overload to return most specific vector type.
static InstructionCost getInvalid(CostType Val=0)
bool mayReadOrWriteMemory() const
Return true if this instruction may read or write memory.
LLVM_ABI bool mayWriteToMemory() const LLVM_READONLY
Return true if this instruction may modify memory.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
LLVM_ABI void moveAfter(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
LLVM_ABI bool isCommutative() const LLVM_READONLY
Return true if the instruction is commutative:
LLVM_ABI bool comesBefore(const Instruction *Other) const
Given an instruction Other in the same basic block as this instruction, return true if this instruction comes before Other.
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
LLVM_ABI bool isIdenticalTo(const Instruction *I) const LLVM_READONLY
Return true if the specified instruction is exactly identical to the current one.
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
const SmallVectorImpl< Type * > & getArgTypes() const
An instruction for reading from memory.
Value * getPointerOperand()
Analysis pass that exposes the LoopInfo for a function.
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
Represents a single loop in the control flow graph.
This class implements a map that also provides access to all stored values in a deterministic order.
VectorType takeVector()
Clear the MapVector and return the underlying vector.
iterator find(const KeyT &Key)
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
ValueT lookup(const KeyT &Key) const
std::pair< KeyT, ValueT > & front()
This is the common base class for memset/memcpy/memmove.
Representation for a specific memory location.
static LLVM_ABI MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
const Value * Ptr
The address of the start of the location.
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memory), i.e. a start pointer and a length.
T & front() const
front - Get the first element.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Value * getIncomingValueForBlock(const BasicBlock *BB) const
BasicBlock * getIncomingBlock(unsigned i) const
Return incoming basic block number i.
unsigned getNumIncomingValues() const
Return the number of incoming edges.
Pass interface - Implemented by all 'passes'.
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address space zero).
A discriminated union of two or more pointer types, with the discriminator in the low bit of the poin...
bool isNull() const
Test if the pointer held in the union is null, regardless of which type it is.
T dyn_cast() const
Returns the current pointer if it is of the specified pointer type, otherwise returns null.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
PriorityQueue - This class behaves like std::priority_queue and provides a few additional convenience...
unsigned getOpcode() const
static bool isIntMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is an integer min/max kind.
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
This class represents an analyzed expression in the program.
LLVM_ABI bool isZero() const
Return true if the expression is a constant zero.
LLVM_ABI bool isNonConstantNegative() const
Return true if the specified scev is negated, but not a constant.
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
LLVM_ABI const SCEV * getConstant(ConstantInt *V)
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
LLVM_ABI const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
LLVM_ABI const SCEV * getUDivExactExpr(const SCEV *LHS, const SCEV *RHS)
Get a canonical unsigned division expression, or something simpler if possible.
LLVM_ABI const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
ArrayRef< value_type > getArrayRef() const
size_type size() const
Determine the number of elements in the SetVector.
const value_type & front() const
Return the first element of the SetVector.
void insert_range(Range &&R)
Vector takeVector()
Clear the SetVector and return the underlying vector.
void clear()
Completely clear the SetVector.
bool empty() const
Determine if the SetVector is empty or not.
bool insert(const value_type &X)
Insert a new element into the SetVector.
bool contains(const key_type &key) const
Check if the SetVector contains the given key.
static LLVM_ABI bool isZeroEltSplatMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses all elements with the same value as the first element of exactly one source vector.
static LLVM_ABI bool isOneUseSingleSourceMask(ArrayRef< int > Mask, int VF)
Return true if this shuffle mask represents "clustered" mask of size VF, i.e.
static LLVM_ABI bool isDeInterleaveMaskOfFactor(ArrayRef< int > Mask, unsigned Factor, unsigned &Index)
Check if the mask is a DE-interleave mask of the given factor Factor like: <Index,...
static LLVM_ABI bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossings.
static LLVM_ABI bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static LLVM_ABI bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
static LLVM_ABI bool isInsertSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &NumSubElts, int &Index)
Return true if this shuffle mask is an insert subvector mask.
static LLVM_ABI bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is small.
int find_first() const
Returns the index of the first set bit, -1 if none of the bits are set.
bool test(unsigned Idx) const
int find_next(unsigned Prev) const
Returns the index of the next set bit following the "Prev" bit.
bool all() const
Returns true if all bits are set.
size_type size() const
Returns the number of bits in this bitvector.
bool any() const
Returns true if any bit is set.
size_type count() const
Returns the number of bits which are set.
bool none() const
Returns true if none of the bits are set.
Implements a dense probed hash-table based set with some number of buckets stored inline.
A templated base class for SmallPtrSet which provides the typesafe interface that is common across all SmallPtrSet instances.
bool erase(PtrType Ptr)
Remove pointer from the set.
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
void insert_range(Range &&R)
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
A SetVector that performs no allocations if smaller than a certain size.
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less than N).
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
bool contains(const T &V) const
Check if the SmallSet contains the given element.
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better as a string (e.g. operator+ etc).
This class consists of common code factored out of the SmallVector class to reduce code duplication based on the SmallVector 'N' template parameter.
void assign(size_type NumElts, ValueParamT Elt)
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
iterator erase(const_iterator CI)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void swap(SmallVectorImpl &RHS)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
Type * getPointerOperandType() const
Value * getValueOperand()
Value * getPointerOperand()
StringRef - Represent a constant reference to a string, i.e. a character array and a length, which need not be null terminated.
TargetFolder - Create constants with target dependent folding.
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
The instances of the Type class are immutable: once they are created, they are never changed.
LLVM_ABI bool isEmptyTy() const
Return true if this type is empty, that is, it has no elements or all of its elements are empty.
bool isVectorTy() const
True if this is an instance of VectorType.
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
bool isPointerTy() const
True if this is an instance of PointerType.
LLVM_ABI unsigned getStructNumElements() const
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
bool isSingleValueType() const
Return true if the type is a valid type for a register in codegen.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
LLVM_ABI Type * getWithNewType(Type *EltTy) const
Given vector type, change the element type, whilst keeping the old number of elements.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
LLVM_ABI void print(raw_ostream &O, bool IsForDebug=false, bool NoDetails=false) const
Print the current type.
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
bool isIntegerTy() const
True if this is an instance of IntegerType.
TypeID getTypeID() const
Return the type id for the type.
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
bool isVoidTy() const
Return true if this is 'void'.
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
A Use represents the edge between a Value definition and its users.
User(Type *ty, unsigned vty, AllocInfo AllocInfo)
LLVM_ABI bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
Value * getOperand(unsigned i) const
unsigned getNumOperands() const
iterator_range< value_op_iterator > operand_values()
The Vector Function Database.
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
user_iterator user_begin()
bool hasOneUse() const
Return true if there is exactly one use of this value.
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
iterator_range< user_iterator > users()
unsigned getValueID() const
Return an ID for the concrete type of this object.
LLVM_ABI bool hasNUsesOrMore(unsigned N) const
Return true if this value has N uses or more.
LLVM_ABI bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
LLVM_ABI LLVMContext & getContext() const
All values hold a context through their type.
LLVM_ABI unsigned getNumUses() const
This method computes the number of uses of this Value.
iterator_range< use_iterator > uses()
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Base class of all SIMD vector types.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector.
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
static LLVM_ABI bool isValidElementType(Type *ElemTy)
Return true if the specified type is valid as a element type.
Type * getElementType() const
std::pair< iterator, bool > insert(const ValueT &V)
iterator find(const_arg_type_t< ValueT > V)
void insert_range(Range &&R)
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
An efficient, type-erasing, non-owning reference to a callable.
An opaque object representing a hash code.
const ParentTy * getParent() const
self_iterator getIterator()
NodeTy * getNextNode()
Get the next node, or nullptr for the list tail.
SmallVector< BoUpSLP::EdgeInfo, 1 >::iterator I
iterator_adaptor_base()=default
This class implements an extremely fast bulk output stream that can only output to a stream.
raw_ostream & indent(unsigned NumSpaces)
indent - Insert 'NumSpaces' spaces.
A raw_ostream that writes to an std::string.
A raw_ostream that writes to an SmallVector or SmallString.
A helper class used for scoring candidates for two consecutive lanes.
static const int ScoreConsecutiveExtracts
ExtractElementInst from same vector and consecutive indexes.
int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2, ArrayRef< Value * > MainAltOps) const
static const int ScoreAllUserVectorized
Score if all users are vectorized.
static const int ScoreSameOpcode
Instructions with the same opcode.
static const int ScoreUndef
Matching with an undef is preferable to failing.
int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1, Instruction *U2, int CurrLevel, ArrayRef< Value * > MainAltOps) const
Go through the operands of LHS and RHS recursively until MaxLevel, and return the cumulative score.
static const int ScoreFail
Score for failing to find a decent match.
static const int ScoreMaskedGatherCandidate
A load candidate for masked gather.
static const int ScoreSplat
Identical instructions (a.k.a. splat or broadcast).
LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R, int NumLanes, int MaxLevel)
static const int ScoreSplatLoads
The same load multiple times.
static const int ScoreReversedLoads
Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
static const int ScoreConstants
Constants.
static const int ScoreAltOpcodes
Instructions with alt opcodes (e.g, add + sub).
static const int ScoreConsecutiveLoads
Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
static const int ScoreReversedExtracts
ExtractElementInst from same vector and reversed indices.
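As an illustration of how these score constants interact (not from the source): when reordering operands, pairing load(A[i]) with load(A[i+1]) in adjacent lanes earns ScoreConsecutiveLoads, which the look-ahead logic prefers over a pair that merely shares an opcode (ScoreSameOpcode) or one that fails to match at all (ScoreFail).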
A helper data structure to hold the operands of a vector of instructions.
ValueList getVL(unsigned OpIdx) const
Returns a value vector with the operands across all lanes for the operand at OpIdx.
static LLVM_DUMP_METHOD StringRef getModeStr(ReorderingMode RMode)
static LLVM_DUMP_METHOD void dumpMode(ReorderingMode RMode)
Debug print.
LLVM_DUMP_METHOD void dump() const
Debug print.
friend raw_ostream & operator<<(raw_ostream &OS, ReorderingMode RMode)
static LLVM_DUMP_METHOD raw_ostream & printMode(ReorderingMode RMode, raw_ostream &OS)
LLVM_DUMP_METHOD raw_ostream & print(raw_ostream &OS) const
VLOperands(ArrayRef< Value * > RootVL, ArrayRef< ValueList > Operands, const InstructionsState &S, const BoUpSLP &R)
Initialize with all the operands of the instruction vector RootVL.
Bottom Up SLP Vectorizer.
bool isProfitableToReorder() const
Checks if it is profitable to reorder the current tree.
SmallVector< unsigned, 4 > OrdersType
std::optional< std::pair< Type *, bool > > getRootNodeTypeWithNoCast() const
Returns the type/is-signed info for the root node in the graph without casting.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::ScheduleEntity &SE)
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::ScheduleBundle &Bundle)
std::optional< OrdersType > findPartiallyOrderedLoads(const TreeEntry &TE)
Sort loads into increasing pointers offsets to allow greater clustering.
LoadsState
Tracks the state we can represent the loads in the given sequence.
void reorderTopToBottom()
Reorders the current graph to the most profitable order starting from the root node to the leaf nodes.
void reorderBottomToTop(bool IgnoreReorder=false)
Reorders the current graph to the most profitable order starting from leaves to the root.
InstructionCost getSpillCost()
void registerNonVectorizableLoads(ArrayRef< T * > VL)
Registers non-vectorizable sequence of loads.
SmallVector< StoreInst *, 8 > StoreList
InstructionCost getTreeCost(ArrayRef< Value * > VectorizedVals={}, InstructionCost ReductionCost=TTI::TCC_Free)
unsigned getTreeSize() const
bool isStridedLoad(ArrayRef< Value * > PointerOps, Type *ScalarTy, Align Alignment, const int64_t Diff, const size_t Sz) const
Checks if strided loads can be generated out of VL loads with pointers PointerOps:
bool areKnownNonVectorizableLoads(ArrayRef< T * > VL) const
Checks if the given loads sequence is known as not vectorizable.
unsigned getCanonicalGraphSize() const
Returns the base graph size, before any transformations.
bool areAnalyzedReductionVals(ArrayRef< Value * > VL) const
Checks if the provided list of reduced values was checked already for vectorization.
bool isLoadCombineCandidate(ArrayRef< Value * > Stores) const
Assume that a vector of stores of bitwise-or/shifted/zexted loaded values can be load combined in the backend.
void analyzedReductionVals(ArrayRef< Value * > VL)
Adds the list of reduced values to list of already checked values for the vectorization.
bool isLoadCombineReductionCandidate(RecurKind RdxKind) const
Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values can be load combined in the backend.
unsigned getVectorElementSize(Value *V)
SmallVector< Instruction *, 16 > InstrList
bool isSignedMinBitwidthRootNode() const
Checks if the root graph node can be emitted with narrower bitwidth at codegen and returns its signedness, if so.
void analyzedReductionRoot(Instruction *I)
Register given instruction as already analyzed for being possible reduction root.
ArrayRef< Value * > getRootNodeScalars() const
Return the scalars of the root node.
LoadsState canVectorizeLoads(ArrayRef< Value * > VL, const Value *VL0, SmallVectorImpl< unsigned > &Order, SmallVectorImpl< Value * > &PointerOps, StridedPtrInfo &SPtrInfo, unsigned *BestVF=nullptr, bool TryRecursiveCheck=true) const
Checks if the given array of loads can be represented as a vectorized, scatter or just simple gather.
void computeMinimumValueSizes()
Compute the minimum type sizes required to represent the entries in a vectorizable tree.
void deleteTree()
Clear the internal data structures that are created by 'buildTree'.
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
SmallPtrSet< Value *, 16 > ValueSet
SmallDenseSet< Value *, 4 > ExtraValueToDebugLocsMap
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB, const DataLayout *DL, OptimizationRemarkEmitter *ORE)
bool isNotScheduled(const Value *V) const
Checks if the specified value was not schedule.
void transformNodes()
Transforms graph nodes to target specific representations, if profitable.
bool isDeleted(Instruction *I) const
Checks if the instruction is marked for deletion.
bool analyzeRtStrideCandidate(ArrayRef< Value * > PointerOps, Type *ScalarTy, Align CommonAlignment, SmallVectorImpl< unsigned > &SortedIndices, StridedPtrInfo &SPtrInfo) const
Return true if an array of scalar loads can be replaced with a strided load (with run-time stride).
void buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues={})
Builds external uses of the vectorized scalars, i.e. the list of vectorized scalars to be extracted, their lanes and their scalar users.
bool isVectorized(const Value *V) const
Check if the value is vectorized in the tree.
bool isTreeTinyAndNotFullyVectorizable(bool ForReduction=false) const
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::ScheduleData &SD)
unsigned canMapToVector(Type *T) const
Check if homogeneous aggregate is isomorphic to some VectorType.
unsigned getMinVF(unsigned Sz) const
bool isAnalyzedReductionRoot(Instruction *I) const
Checks if the instruction was already analyzed for being possible reduction root.
void eraseInstruction(Instruction *I)
Removes an instruction from its block and eventually deletes it.
OptimizationRemarkEmitter * getORE()
bool isAnyGathered(const SmallDenseSet< Value * > &Vals) const
Checks if the given value is gathered in one of the nodes.
SmallVector< Value *, 8 > ValueList
void removeInstructionsAndOperands(ArrayRef< T * > DeadVals, ArrayRef< std::tuple< Value *, unsigned, bool > > VectorValuesAndScales)
Remove instructions from the parent function and clear the operands of DeadVals instructions,...
static bool isIdentityOrder(ArrayRef< unsigned > Order)
Does this non-empty order represent an identity order?
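For example, <0, 1, 2, 3> is an identity order, while <1, 0, 2, 3> is not.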
void buildTree(ArrayRef< Value * > Roots, const SmallDenseSet< Value * > &UserIgnoreLst)
Construct a vectorizable tree that starts at Roots, ignoring users for the purpose of scheduling and ...
bool isTreeNotExtendable() const
Checks if the graph and all its subgraphs cannot be better vectorized.
FixedVectorType * getReductionType() const
Returns reduction type after minbitdth analysis.
unsigned getMaxVecRegSize() const
std::optional< OrdersType > findReusedOrderedScalars(const TreeEntry &TE, bool TopToBottom, bool IgnoreReorder)
Checks if the specified gather tree entry TE can be represented as a shuffled vector entry + (possibly) permutation with other gathers.
bool isGathered(const Value *V) const
Checks if the given value is gathered in one of the nodes.
bool analyzeConstantStrideCandidate(const ArrayRef< Value * > PointerOps, Type *ElemTy, Align Alignment, const SmallVectorImpl< unsigned > &SortedIndices, const int64_t Diff, Value *Ptr0, Value *PtrN, StridedPtrInfo &SPtrInfo) const
Return true if an array of scalar loads can be replaced with a strided load (with constant stride).
unsigned getMinVecRegSize() const
Value * vectorizeTree()
Vectorize the tree that starts with the elements in VL.
std::optional< int > findBestRootPair(ArrayRef< std::pair< Value *, Value * > > Candidates, int Limit=LookAheadHeuristics::ScoreFail) const
Evaluate each pair in Candidates and return an index into Candidates for the pair which has the highest score...
std::optional< OrdersType > getReorderingData(const TreeEntry &TE, bool TopToBottom, bool IgnoreReorder)
Gets reordering data for the given tree entry.
void clearReductionData()
Clear the list of the analyzed reduction root instructions.
void optimizeGatherSequence()
Perform LICM and CSE on the newly generated gather sequences.
Function * getVectorizedFunction(const VFShape &Shape) const
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
@ C
The default llvm calling convention, compatible with C.
@ BasicBlock
Various leaf nodes.
bool isBitwiseLogicOp(unsigned Opcode)
Whether this is a bitwise logic opcode.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
TwoOps_match< ValueOpTy, PointerOpTy, Instruction::Store > m_Store(const ValueOpTy &ValueOp, const PointerOpTy &PointerOp)
Matches StoreInst.
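A hedged sketch (helper name hypothetical) of composing m_Store with other matchers to recognize a store whose value operand is an integer add:

  #include "llvm/IR/PatternMatch.h"
  using namespace llvm;
  using namespace llvm::PatternMatch;

  // True if I stores the result of an add; A, B and Ptr capture the
  // add operands and the store's pointer operand.
  static bool isStoreOfAdd(Instruction *I, Value *&A, Value *&B, Value *&Ptr) {
    return match(I, m_Store(m_Add(m_Value(A), m_Value(B)), m_Value(Ptr)));
  }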
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
ap_match< APInt > m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMaxNum(const Opnd0 &Op0, const Opnd1 &Op1)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
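Following the pattern in the description, a short sketch that detects a call to llvm.fabs and captures its operand (V assumed to be a Value*):

  Value *X;
  if (match(V, m_Intrinsic<Intrinsic::fabs>(m_Value(X)))) {
    // V is a call to llvm.fabs; X is its argument.
  }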
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMinimum(const Opnd0 &Op0, const Opnd1 &Op1)
MaxMin_match< ICmpInst, LHS, RHS, smin_pred_ty > m_SMin(const LHS &L, const RHS &R)
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMaximum(const Opnd0 &Op0, const Opnd1 &Op1)
BinaryOp_match< LHS, RHS, Instruction::FAdd > m_FAdd(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
MaxMin_match< ICmpInst, LHS, RHS, umax_pred_ty > m_UMax(const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
MaxMin_match< ICmpInst, LHS, RHS, smax_pred_ty > m_SMax(const LHS &L, const RHS &R)
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
auto m_Undef()
Match an arbitrary undef constant.
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMinNum(const Opnd0 &Op0, const Opnd1 &Op1)
BinaryOp_match< LHS, RHS, Instruction::Or > m_Or(const LHS &L, const RHS &R)
MatchFunctor< Val, Pattern > match_fn(const Pattern &P)
A match functor that can be used as a UnaryPredicate in functional algorithms like all_of.
MaxMin_match< ICmpInst, LHS, RHS, umin_pred_ty > m_UMin(const LHS &L, const RHS &R)
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
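A brief sketch: m_CombineOr tries the left pattern first and falls back to the right one, generalizing fused combinators such as m_ZExtOrSExt above.

  Value *Src;
  // Matches either zext(Src) or sext(Src).
  if (match(V, m_CombineOr(m_ZExt(m_Value(Src)), m_SExt(m_Value(Src))))) {
    // Src is the narrow source value.
  }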
initializer< Ty > init(const Ty &Val)
unsigned combineHashValue(unsigned a, unsigned b)
Simplistic combination of 32-bit hash values into 32-bit hash values.
@ User
could "use" a pointer
DiagnosticInfoOptimizationBase::Argument NV
friend class Instruction
Iterator for Instructions in a BasicBlock.
LLVM_ABI iterator begin() const
LLVM_ABI Instruction & front() const
A private "module" namespace for types and utilities used by this pass.
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
LLVM_ABI Value * createSimpleReduction(IRBuilderBase &B, Value *Src, RecurKind RdxKind)
Create a reduction of the given vector.
static bool doesNotNeedToBeScheduled(Value *V)
Checks if the specified value does not require scheduling.
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iterable types.
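A small lockstep-iteration sketch (Xs and Ys hypothetical integer containers); zip stops at the end of the shorter range:

  #include "llvm/ADT/STLExtras.h"

  int Dot = 0;
  for (auto [A, B] : llvm::zip(Xs, Ys))
    Dot += A * B;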
void stable_sort(R &&Range)
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
void fill(R &&Range, T &&Value)
Provide wrappers to std::fill which take ranges instead of having to pass begin/end explicitly.
UnaryFunction for_each(R &&Range, UnaryFunction F)
Provide wrappers to std::for_each which take ranges instead of having to pass begin/end explicitly.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
hash_code hash_value(const FixedPointSemantics &Val)
LLVM_ABI Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
MaybeAlign getAlign(const CallInst &I, unsigned Index)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
LLVM_ABI bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
LLVM_ABI Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
static void reorderScalars(SmallVectorImpl< Value * > &Scalars, ArrayRef< int > Mask)
Reorders the list of scalars in accordance with the given Mask.
detail::scope_exit< std::decay_t< Callable > > make_scope_exit(Callable &&F)
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
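A hedged sketch collecting the indices of load instructions in a scalar list VL (names assumed from the surrounding pass):

  SmallVector<unsigned> LoadIndices;
  for (auto [Idx, V] : llvm::enumerate(VL))
    if (isa<LoadInst>(V))
      LoadIndices.push_back(Idx);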
auto pred_end(const MachineBasicBlock *BB)
void set_intersect(S1Ty &S1, const S2Ty &S2)
set_intersect(A, B) - Compute A := A ^ B. Identical to set_intersection, except that it works on set<>...
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
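The canonical test-and-cast idiom; dyn_cast returns nullptr on a type mismatch:

  if (auto *LI = dyn_cast<LoadInst>(V)) {
    // V is a LoadInst; LI is typed accordingly.
    (void)LI->getPointerOperand();
  }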
LLVM_ABI bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
LLVM_ABI void salvageDebugInfo(const MachineRegisterInfo &MRI, MachineInstr &MI)
Assuming the instruction MI is going to be deleted, attempt to salvage debug users of MI by writing t...
constexpr from_range_t from_range
static bool isUsedOutsideBlock(Value *V)
Checks if the provided value does not require scheduling.
LLVM_ABI std::pair< Intrinsic::ID, bool > canConvertToMinOrMaxIntrinsic(ArrayRef< Value * > VL)
Check if the values in VL are select instructions that can be converted to a min or max (vector) intrinsic.
auto dyn_cast_if_present(const Y &Val)
dyn_cast_if_present<X> - Functionally identical to dyn_cast, except that a null (or none in the case ...
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
bool set_is_subset(const S1Ty &S1, const S2Ty &S2)
set_is_subset(A, B) - Return true iff A is a subset of B.
void interleaveComma(const Container &c, StreamT &os, UnaryFunctor each_fn)
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting iteration.
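The classic use, sketched here with isInstructionTriviallyDead from this list: erasing instructions while walking a block, which would otherwise invalidate the iterator.

  void deleteDeadInsts(BasicBlock &BB) { // hypothetical helper
    for (Instruction &I : llvm::make_early_inc_range(BB))
      if (isInstructionTriviallyDead(&I))
        I.eraseFromParent();
  }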
auto cast_or_null(const Y &Val)
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value and is Skew mod Align.
iterator_range< po_iterator< T > > post_order(const T &G)
LLVM_ABI bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr, bool UseVariableInfo=true, bool IgnoreUBImplyingAttrs=true)
Return true if the instruction does not have any effects besides calculating the result and does not ...
LLVM_ABI Instruction * propagateMetadata(Instruction *I, ArrayRef< Value * > VL)
Specifically, let Kinds = [MD_tbaa, MD_alias_scope, MD_noalias, MD_fpmath, MD_nontemporal,...
bool isa_and_nonnull(const Y &Val)
auto binary_search(R &&Range, T &&Value)
Provide wrappers to std::binary_search which take ranges instead of having to pass begin/end explicitly.
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
bool isGather(IntrinsicInst *IntInst)
const Value * getPointerOperand(const Value *V)
A helper function that returns the pointer operand of a load, store or GEP instruction.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
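A quick illustration (bit_ceil above is the matching bit-manipulation primitive):

  #include "llvm/Support/MathExtras.h"

  // PowerOf2Ceil(5) == 8, PowerOf2Ceil(8) == 8, PowerOf2Ceil(0) == 0.
  uint64_t PaddedVF = llvm::PowerOf2Ceil(NumElts); // NumElts hypothetical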
LLVM_ABI bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &SQ, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
DomTreeNodeBase< BasicBlock > DomTreeNode
auto dyn_cast_or_null(const Y &Val)
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container:
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
constexpr bool has_single_bit(T Value) noexcept
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
LLVM_ABI bool isInstructionTriviallyDead(Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction is not used, and the instruction has no side effects.
LLVM_ABI llvm::SmallVector< int, 16 > createStrideMask(unsigned Start, unsigned Stride, unsigned VF)
Create a stride shuffle mask.
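A hedged illustration of the produced mask, which selects Start + I * Stride for each of the VF output lanes:

  // Selects elements 0, 2, 4 and 6 of a wider vector.
  SmallVector<int, 16> Mask =
      llvm::createStrideMask(/*Start=*/0, /*Stride=*/2, /*VF=*/4);
  // Mask == {0, 2, 4, 6}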
auto reverse(ContainerTy &&C)
static void inversePermutation(ArrayRef< unsigned > Indices, SmallVectorImpl< int > &Mask)
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
LLVM_ABI llvm::SmallVector< int, 16 > createReplicatedMask(unsigned ReplicationFactor, unsigned VF)
Create a mask with replicated elements.
auto find_if_not(R &&Range, UnaryPredicate P)
LLVM_ABI bool isSafeToLoadUnconditionally(Value *V, Align Alignment, const APInt &Size, const DataLayout &DL, Instruction *ScanFrom, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr)
Return true if we know that executing a load from this value cannot trap.
bool isa_and_present(const Y &Val)
isa_and_present<X> - Functionally identical to isa, except that a null value is accepted.
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns true if widened type of Ty elements with size Sz represents full vector type,...
bool isPointerTy(const Type *T)
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
auto make_first_range(ContainerTy &&c)
Given a container of pairs, return a range over the first elements.
LLVM_ABI std::optional< int64_t > getPointersDiff(Type *ElemTyA, Value *PtrA, Type *ElemTyB, Value *PtrB, const DataLayout &DL, ScalarEvolution &SE, bool StrictCheck=false, bool CheckType=true)
Returns the distance between the pointers PtrA and PtrB iff they are compatible and it is possible to...
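A hedged sketch (DL and SE assumed in scope) of the consecutive-pointer check that load vectorization relies on:

  std::optional<int64_t> Diff =
      llvm::getPointersDiff(ElemTy, PtrA, ElemTy, PtrB, DL, SE);
  if (Diff && *Diff == 1) {
    // PtrB points exactly one ElemTy element past PtrA.
  }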
LLVM_ABI bool wouldInstructionBeTriviallyDead(const Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction would have no side effects if it was not used.
bool isModOrRefSet(const ModRefInfo MRI)
bool is_sorted(R &&Range, Compare C)
Wrapper function around std::is_sorted to check if elements in a range R are sorted with respect to a comparator C.
LLVM_ABI bool sortPtrAccesses(ArrayRef< Value * > VL, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
Attempt to sort the pointers in VL and return the sorted indices in SortedIndices,...
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type arguments.
LLVM_ABI void propagateIRFlags(Value *I, ArrayRef< Value * > VL, Value *OpValue=nullptr, bool IncludeWrapFlags=true)
Get the intersection (logical and) of all of the potential IR flags of each scalar operation (VL) tha...
LLVM_ATTRIBUTE_VISIBILITY_DEFAULT AnalysisKey InnerAnalysisManagerProxy< AnalysisManagerT, IRUnitT, ExtraArgTs... >::Key
MutableArrayRef(T &OneElt) -> MutableArrayRef< T >
constexpr int PoisonMaskElem
iterator_range(Container &&) -> iterator_range< llvm::detail::IterOfRange< Container > >
@ Ref
The access may reference the value stored in memory.
@ LLVM_MARK_AS_BITMASK_ENUM
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
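For example, computing how many register-sized parts of RegVF lanes are needed for Sz scalars (names hypothetical):

  // divideCeil(7, 3) == 3.
  unsigned NumParts = llvm::divideCeil(Sz, RegVF);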
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
LLVM_ABI CmpInst::Predicate getMinMaxReductionPredicate(RecurKind RK)
Returns the comparison predicate used when expanding a min/max reduction.
RecurKind
These are the kinds of recurrences that we support.
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ Xor
Bitwise or logical XOR of integers.
@ And
Bitwise or logical AND of integers.
LLVM_ABI bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic has a scalar operand.
static bool areAllOperandsNonInsts(Value *V)
Checks if the provided value does not require scheduling.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the given range.
auto max_element(R &&Range)
Provide wrappers to std::max_element which take ranges instead of having to pass begin/end explicitly...
void ViewGraph(const GraphType &G, const Twine &Name, bool ShortNames=false, const Twine &Title="", GraphProgram::Name Program=GraphProgram::DOT)
ViewGraph - Emit a dot graph, run 'dot', run gv on the postscript file, then cleanup.
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI unsigned ComputeNumSignBits(const Value *Op, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Return the number of times the sign bit of the register is replicated into the other bits.
OutputIt copy(R &&Range, OutputIt Out)
auto make_second_range(ContainerTy &&c)
Given a container of pairs, return a range over the second elements.
static bool doesNotNeedToSchedule(ArrayRef< Value * > VL)
Checks if the specified array of instructions does not require scheduling.
constexpr unsigned BitWidth
LLVM_ABI bool isGuaranteedToTransferExecutionToSuccessor(const Instruction *I)
Return true if this function can prove that the instruction I will always transfer execution to one of its successors.
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given predicate occurs in a range.
static unsigned getNumberOfParts(const TargetTransformInfo &TTI, VectorType *VecTy, const unsigned Limit=std::numeric_limits< unsigned >::max())
Returns the number of parts the type VecTy will be split into at the codegen phase.
auto pred_begin(const MachineBasicBlock *BB)
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
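A short sketch building an identity shuffle mask with seq (requires llvm/ADT/Sequence.h):

  SmallVector<int> Mask;
  for (unsigned I : llvm::seq<unsigned>(0, VF)) // visits 0 .. VF-1
    Mask.push_back(I);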
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
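A hedged sketch in the spirit of the EdgeInfo DenseMapInfo entries below, hashing the (UserTE, EdgeIdx) pair (helper name hypothetical):

  #include "llvm/ADT/Hashing.h"

  unsigned hashEdgeInfo(const BoUpSLP::EdgeInfo &EI) {
    return llvm::hash_combine(EI.UserTE, EI.EdgeIdx);
  }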
template class LLVM_TEMPLATE_ABI DomTreeNodeBase< BasicBlock >
bool equal(L &&LRange, R &&RRange)
Wrapper function around std::equal to detect if pair-wise elements between two ranges are the same.
LLVM_ABI bool isGuaranteedNotToBePoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Returns true if V cannot be poison, but may be undef.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
LLVM_ABI const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=MaxLookupSearchDepth)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
LLVM_ABI Constant * ConstantFoldIntegerCast(Constant *C, Type *DestTy, bool IsSigned, const DataLayout &DL)
Constant fold a zext, sext or trunc, depending on IsSigned and whether the DestTy is wider or narrower...
LLVM_ABI bool isKnownNonNegative(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Returns true if the given value is known to be non-negative.
LLVM_ABI bool mayHaveNonDefUseDependency(const Instruction &I)
Returns true if the result or effects of the given instruction I depend on values not reachable through...
LLVM_ABI bool isTriviallyVectorizable(Intrinsic::ID ID)
Identify if the intrinsic is trivially vectorizable.
constexpr detail::IsaCheckPredicate< Types... > IsaPred
Function object wrapper for the llvm::isa type check.
LLVM_ABI bool isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic is overloaded on the type of the operand at index OpdIdx.
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Used to keep track of an operand bundle.
static LLVM_ABI void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
BoUpSLP::TreeEntry TreeEntry
std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R)
static std::string getNodeAttributes(const TreeEntry *Entry, const BoUpSLP *)
DOTGraphTraits(bool IsSimple=false)
DOTGraphTraits - Template class that can be specialized to customize how graphs are converted to 'dot' graphs.
DefaultDOTGraphTraits(bool simple=false)
DenseMapInfo< BoUpSLP::TreeEntry * > FirstInfo
static bool isEqual(const BoUpSLP::EdgeInfo &LHS, const BoUpSLP::EdgeInfo &RHS)
static BoUpSLP::EdgeInfo getEmptyKey()
DenseMapInfo< unsigned > SecondInfo
static unsigned getHashValue(const BoUpSLP::EdgeInfo &Val)
static BoUpSLP::EdgeInfo getTombstoneKey()
An information struct used to provide DenseMap with the various necessary components for a given value type.
Add the VectorizableTree to the index iterator to be able to return TreeEntry pointers.
ChildIteratorType(SmallVector< BoUpSLP::EdgeInfo, 1 >::iterator W, ContainerTy &VT)
ContainerTy & VectorizableTree
static ChildIteratorType child_end(NodeRef N)
static NodeRef getEntryNode(BoUpSLP &R)
static ChildIteratorType child_begin(NodeRef N)
static nodes_iterator nodes_begin(BoUpSLP *R)
TreeEntry * NodeRef
NodeRef has to be a pointer per the GraphWriter.
static unsigned size(BoUpSLP *R)
BoUpSLP::TreeEntry TreeEntry
static nodes_iterator nodes_end(BoUpSLP *R)
BoUpSLP::TreeEntry::VecTreeTy ContainerTy
Incoming for lane mask phi as machine instruction, incoming register Reg and incoming block Block are...
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
TargetTransformInfo * TTI
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
bool runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_, TargetLibraryInfo *TLI_, AAResults *AA_, LoopInfo *LI_, DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_, OptimizationRemarkEmitter *ORE_)
A MapVector that performs no allocations if smaller than a certain size.
static VFShape get(const FunctionType *FTy, ElementCount EC, bool HasGlobalPred)
Retrieve the basic vectorization shape of the function, where all parameters are mapped to VFParamKin...
Function object to check whether the first component of a container supported by std::get (like std::...
Function object to check whether the second component of a container supported by std::get (like std:...
This structure holds any data we need about the edges being traversed during buildTreeRec().
unsigned EdgeIdx
The operand index of the use.
EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
LLVM_DUMP_METHOD void dump() const
TreeEntry * UserTE
The user TreeEntry.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::EdgeInfo &EI)
void dump(raw_ostream &OS) const
Debug print.
bool operator==(const EdgeInfo &Other) const