#ifdef EXPENSIVE_CHECKS

using namespace std::placeholders;

#define SV_NAME "slp-vectorizer"
#define DEBUG_TYPE "SLP"
STATISTIC(NumVectorInstructions,
          "Number of vector instructions generated");
116 "Controls which SLP graphs should be vectorized.");
120 cl::desc(
"Run the SLP vectorization passes"));
124 cl::desc(
"Enable vectorization for wider vector utilization"));
128 cl::desc(
"Only vectorize if you gain more than this "
133 cl::desc(
"When true, SLP vectorizer bypasses profitability checks based on "
134 "heuristics and makes vectorization decision via cost modeling."));
138 cl::desc(
"Attempt to vectorize horizontal reductions"));
143 "Attempt to vectorize horizontal reductions feeding into a store"));
147 cl::desc(
"Improve the code quality by splitting alternate instructions"));
151 cl::desc(
"Attempt to vectorize for this register size in bits"));
155 cl::desc(
"Maximum SLP vectorization factor (0=unlimited)"));
163 cl::desc(
"Limit the size of the SLP scheduling region per block"));
167 cl::desc(
"Attempt to vectorize for this register size in bits"));
171 cl::desc(
"Limit the recursion depth when building a vectorizable tree"));
175 cl::desc(
"Only vectorize small trees if they are fully vectorizable"));
181 cl::desc(
"The maximum look-ahead depth for operand reordering scores"));
190 cl::desc(
"The maximum look-ahead depth for searching best rooting option"));
194 cl::desc(
"The minimum number of loads, which should be considered strided, "
195 "if the stride is > 1 or is runtime value"));
199 cl::desc(
"The maximum stride, considered to be profitable."));
203 cl::desc(
"Disable tree reordering even if it is "
204 "profitable. Used for testing only."));
208 cl::desc(
"Generate strided loads even if they are not "
209 "profitable. Used for testing only."));
213 cl::desc(
"Display the SLP trees with Graphviz"));
217 cl::desc(
"Try to vectorize with non-power-of-2 number of elements."));
222 cl::desc(
"Try to replace values with the idempotent instructions for "
223 "better vectorization."));
  Ty = Ty->getScalarType();
         !Ty->isPPC_FP128Ty();

    return SI->getValueOperand()->getType();
    return CI->getOperand(0)->getType();
    return IE->getOperand(1)->getType();

         "ScalableVectorType is not supported.");
    return VecTy->getNumElements();
                                   Type *Ty, unsigned Sz) {
  if (NumParts == 0 || NumParts >= Sz)

  if (NumParts == 0 || NumParts >= Sz)
  return (Sz / RegVF) * RegVF;

                             I * VecTyNumElements, VecTyNumElements)))
                : Mask[I] * VecTyNumElements + J;
  unsigned SVNumElements =
      cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
  unsigned ShuffleMaskSize = SV->getShuffleMask().size();
  if (SVNumElements % ShuffleMaskSize != 0)

  unsigned GroupSize = SVNumElements / ShuffleMaskSize;
  if (GroupSize == 0 || (VL.size() % GroupSize) != 0)

  unsigned NumGroup = 0;
  for (size_t I = 0, E = VL.size(); I != E; I += GroupSize) {
    Value *Src = SV->getOperand(0);
    if (SV->getOperand(0) != Src)

    if (!SV->isExtractSubvectorMask(Index))

    ExpectedIndex.set(Index / ShuffleMaskSize);
  }
  if (!ExpectedIndex.all())

  assert(NumGroup == (VL.size() / GroupSize) &&
         "Unexpected number of groups");
  unsigned SVNumElements =
      cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
  unsigned AccumulateLength = 0;
  for (Value *V : VL) {
    for (int M : SV->getShuffleMask())
                           : AccumulateLength + M);
    AccumulateLength += SVNumElements;
  }
  return std::min<unsigned>(PartNumElems, Size - Part * PartNumElems);

  OS << "Idx: " << Idx << ", ";
  OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]";
      if (BB != II->getParent())

  Value *FirstNonUndef = nullptr;
  for (Value *V : VL) {
    if (!FirstNonUndef) {

    if (V != FirstNonUndef)
  }
  return FirstNonUndef != nullptr;
    return Cmp->isCommutative();
    return BO->isCommutative() ||
           (BO->getOpcode() == Instruction::Sub &&
              if (match(U.getUser(),
                        m_ICmp(Pred, m_Specific(U.get()), m_Zero())) &&
                  (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE))

              return match(U.getUser(),
                           m_Intrinsic<Intrinsic::abs>(
                               m_Specific(U.get()), m_ConstantInt(Flag))) &&
                     (!cast<Instruction>(U.get())->hasNoSignedWrap() ||
           (BO->getOpcode() == Instruction::FSub &&
              return match(U.getUser(),
                           m_Intrinsic<Intrinsic::fabs>(m_Specific(U.get())));
  return I->isCommutative();
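// Illustrative note (assumed rationale of the Sub/FSub special cases above):
// `sub %x, %y` consumed only by `icmp eq/ne ..., 0` or by `llvm.abs` behaves
// commutatively, because (x - y == 0) <=> (y - x == 0) and |x - y| == |y - x|;
// the same reasoning applies to fsub feeding llvm.fabs.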
    constexpr unsigned IntrinsicNumOperands = 2;
    return IntrinsicNumOperands;
  return I->getNumOperands();

  static_assert(std::is_same_v<T, InsertElementInst> ||
                    std::is_same_v<T, ExtractElementInst>,

    if (CI->getValue().uge(VT->getNumElements()))

    Index *= VT->getNumElements();
    Index += CI->getZExtValue();
  Type *CurrentType = IV->getType();
  for (unsigned I : IV->indices()) {
      Index *= ST->getNumElements();
      CurrentType = ST->getElementType(I);
      Index *= AT->getNumElements();
      CurrentType = AT->getElementType();
  }
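// Worked example (assumed flattening rule from the loop above): for an
// aggregate {[2 x i32], [2 x i32]} and indices {1, 0}, the flattened index is
// (0 * 2 + 1) * 2 + 0 = 2, i.e. element 2 of the 4 leaf scalars.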
  return std::all_of(It, VL.end(), [&](Value *V) {
    if (auto *CI = dyn_cast<CmpInst>(V))
      return BasePred == CI->getPredicate();
    if (auto *I = dyn_cast<Instruction>(V))
      return I->getOpcode() == Opcode;
    return isa<PoisonValue>(V);
  });
  if (MaskArg == UseMask::UndefsAsMask)

  if (MaskArg == UseMask::FirstArg && Value < VF)
    UseMask.reset(Value);
  else if (MaskArg == UseMask::SecondArg && Value >= VF)
    UseMask.reset(Value - VF);

template <bool IsPoisonOnly = false>
  using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
  if (!UseMask.empty()) {
      if (*Idx < UseMask.size() && !UseMask.test(*Idx))

    for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
      if (Constant *Elem = C->getAggregateElement(I))
          (UseMask.empty() || (I < UseMask.size() && !UseMask.test(I))))
static std::optional<TargetTransformInfo::ShuffleKind>
      std::accumulate(VL.begin(), VL.end(), 0u, [](unsigned S, Value *V) {
        auto *EI = dyn_cast<ExtractElementInst>(V);
        auto *VTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
        return std::max(S, VTy->getNumElements());
      });
  Value *Vec1 = nullptr;
  Value *Vec2 = nullptr;
    Value *Vec = EE->getVectorOperand();
  ShuffleMode CommonShuffleMode = Unknown;
  for (unsigned I = 0, E = VL.size(); I < E; ++I) {
    auto *Vec = EI->getVectorOperand();
    if (Idx->getValue().uge(Size))

    unsigned IntIdx = Idx->getValue().getZExtValue();
    if (!Vec1 || Vec1 == Vec) {
    } else if (!Vec2 || Vec2 == Vec) {
    }
    if (CommonShuffleMode == Permute)

    if (Mask[I] % Size != I) {
      CommonShuffleMode = Permute;
    }
    CommonShuffleMode = Select;
  }
  if (CommonShuffleMode == Select && Vec2)
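// Illustrative note (assumed from the classification above): a group of
// extracts such as {v1[0], v2[1], v1[2], v2[3]} keeps every lane at its own
// index, so it forms a TTI::SK_Select of two sources; any lane landing at a
// different index degrades the group to a two-source permute.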
  unsigned Opcode = E->getOpcode();
  assert((Opcode == Instruction::ExtractElement ||
          Opcode == Instruction::ExtractValue) &&
         "Expected extractelement or extractvalue instruction.");
  if (Opcode == Instruction::ExtractElement) {
    return CI->getZExtValue();
  }
  if (EI->getNumIndices() != 1)

  return *EI->idx_begin();
bool isValidForAlternation(unsigned Opcode) {

class BinOpSameOpcodeHelper {
  using MaskType = std::uint_fast16_t;
  constexpr static std::initializer_list<unsigned> SupportedOp = {
      Instruction::Add, Instruction::Sub, Instruction::Mul, Instruction::Shl,
      Instruction::AShr, Instruction::And, Instruction::Or, Instruction::Xor};
    MainOpBIT = 0b100000000,

  static std::pair<ConstantInt *, unsigned>
  isBinOpWithConstantInt(const Instruction *I) {
    unsigned Opcode = I->getOpcode();
    if (Opcode == Instruction::Sub || Opcode == Instruction::Shl ||
        Opcode == Instruction::AShr)
  struct InterchangeableInfo {
    MaskType Mask = MainOpBIT | XorBIT | OrBIT | AndBIT | SubBIT | AddBIT |
                    MulBIT | AShrBIT | ShlBIT;
    MaskType SeenBefore = 0;
    InterchangeableInfo(const Instruction *I) : I(I) {}
    bool trySet(MaskType OpcodeInMaskForm, MaskType InterchangeableMask) {
      if (Mask & InterchangeableMask) {
        SeenBefore |= OpcodeInMaskForm;
        Mask &= InterchangeableMask;
    bool equal(unsigned Opcode) {
      return Opcode == I->getOpcode() && trySet(MainOpBIT, MainOpBIT);
    }
    unsigned getOpcode() const {
      MaskType Candidate = Mask & SeenBefore;
      if (Candidate & MainOpBIT)
        return I->getOpcode();
      if (Candidate & ShlBIT)
        return Instruction::Shl;
      if (Candidate & AShrBIT)
        return Instruction::AShr;
      if (Candidate & MulBIT)
        return Instruction::Mul;
      if (Candidate & AddBIT)
        return Instruction::Add;
      if (Candidate & SubBIT)
        return Instruction::Sub;
      if (Candidate & AndBIT)
        return Instruction::And;
      if (Candidate & OrBIT)
        return Instruction::Or;
      if (Candidate & XorBIT)
        return Instruction::Xor;
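    // Illustrative note (assumed semantics of the bitmask above): each
    // supported opcode owns one bit, and the chain returns the first opcode
    // still present in Mask & SeenBefore, preferring the original MainOp,
    // then Shl, AShr, Mul, Add, Sub, And, Or, Xor in that fixed order.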
    bool hasCandidateOpcode(unsigned Opcode) const {
      MaskType Candidate = Mask & SeenBefore;
      switch (Opcode) {
      case Instruction::Shl:
        return Candidate & ShlBIT;
      case Instruction::AShr:
        return Candidate & AShrBIT;
      case Instruction::Mul:
        return Candidate & MulBIT;
      case Instruction::Add:
        return Candidate & AddBIT;
      case Instruction::Sub:
        return Candidate & SubBIT;
      case Instruction::And:
        return Candidate & AndBIT;
      case Instruction::Or:
        return Candidate & OrBIT;
      case Instruction::Xor:
        return Candidate & XorBIT;
      case Instruction::LShr:
      case Instruction::FAdd:
      case Instruction::FSub:
      case Instruction::FMul:
      case Instruction::SDiv:
      case Instruction::UDiv:
      case Instruction::FDiv:
      case Instruction::SRem:
      case Instruction::URem:
      case Instruction::FRem:
    unsigned FromOpcode = I->getOpcode();
    if (FromOpcode == ToOpcode)

    auto [CI, Pos] = isBinOpWithConstantInt(I);
    const APInt &FromCIValue = CI->getValue();
    unsigned FromCIValueBitWidth = FromCIValue.getBitWidth();
    switch (FromOpcode) {
    case Instruction::Shl:
      if (ToOpcode == Instruction::Mul) {
      assert(FromCIValue.isZero() && "Cannot convert the instruction.");
      ToCIValue = ToOpcode == Instruction::And
                      : APInt::getZero(FromCIValueBitWidth);
    case Instruction::Mul:
      if (ToOpcode == Instruction::Shl) {
        ToCIValue = APInt(FromCIValueBitWidth, FromCIValue.logBase2());
      assert(FromCIValue.isOne() && "Cannot convert the instruction.");
      ToCIValue = ToOpcode == Instruction::And
                      : APInt::getZero(FromCIValueBitWidth);
    case Instruction::Add:
    case Instruction::Sub:
      if (FromCIValue.isZero()) {
             "Cannot convert the instruction.");
      ToCIValue = FromCIValue;
    case Instruction::And:
      ToCIValue = ToOpcode == Instruction::Mul
                      : APInt::getZero(FromCIValueBitWidth);
      assert(FromCIValue.isZero() && "Cannot convert the instruction.");
    }
    Value *LHS = I->getOperand(1 - Pos);
        ConstantInt::get(I->getOperand(Pos)->getType(), ToCIValue);
        ((FromOpcode == Instruction::Add || FromOpcode == Instruction::Or ||
          FromOpcode == Instruction::Xor) &&
         ToOpcode == Instruction::Sub))
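    // Worked example (assumed from the conversion table above): `mul %x, 1`
    // can be rewritten as `shl %x, 0` (log2(1) == 0), `add %x, 0`, or
    // `and %x, -1`; the ToCIValue logic picks the constant that preserves the
    // value under the target opcode.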
  InterchangeableInfo MainOp;
  InterchangeableInfo AltOp;
  bool isValidForAlternation(const Instruction *I) const {
    return ::isValidForAlternation(MainOp.I->getOpcode()) &&
           ::isValidForAlternation(I->getOpcode());
  }
  bool initializeAltOp(const Instruction *I) {
    if (!isValidForAlternation(I))

  BinOpSameOpcodeHelper(const Instruction *MainOp,
                        const Instruction *AltOp = nullptr)
      : MainOp(MainOp), AltOp(AltOp) {
  bool add(const Instruction *I) {
           "BinOpSameOpcodeHelper only accepts BinaryOperator.");
    unsigned Opcode = I->getOpcode();
    MaskType OpcodeInMaskForm;
    switch (Opcode) {
    case Instruction::Shl:
      OpcodeInMaskForm = ShlBIT;
    case Instruction::AShr:
      OpcodeInMaskForm = AShrBIT;
    case Instruction::Mul:
      OpcodeInMaskForm = MulBIT;
    case Instruction::Add:
      OpcodeInMaskForm = AddBIT;
    case Instruction::Sub:
      OpcodeInMaskForm = SubBIT;
    case Instruction::And:
      OpcodeInMaskForm = AndBIT;
    case Instruction::Or:
      OpcodeInMaskForm = OrBIT;
    case Instruction::Xor:
      OpcodeInMaskForm = XorBIT;
      return MainOp.equal(Opcode) ||
             (initializeAltOp(I) && AltOp.equal(Opcode));
    }
    MaskType InterchangeableMask = OpcodeInMaskForm;
    ConstantInt *CI = isBinOpWithConstantInt(I).first;
      constexpr MaskType CanBeAll =
          XorBIT | OrBIT | AndBIT | SubBIT | AddBIT | MulBIT | AShrBIT | ShlBIT;
      const APInt &CIValue = CI->getValue();
      case Instruction::Shl:
        InterchangeableMask = CIValue.isZero() ? CanBeAll : MulBIT | ShlBIT;
      case Instruction::Mul:
        if (CIValue.isOne()) {
          InterchangeableMask = CanBeAll;
        }
        InterchangeableMask = MulBIT | ShlBIT;
      case Instruction::Add:
      case Instruction::Sub:
        InterchangeableMask = CIValue.isZero() ? CanBeAll : SubBIT | AddBIT;
      case Instruction::And:
        InterchangeableMask = CanBeAll;
      case Instruction::Xor:
        InterchangeableMask = XorBIT | OrBIT | AndBIT | SubBIT | AddBIT;
        InterchangeableMask = CanBeAll;
    return MainOp.trySet(OpcodeInMaskForm, InterchangeableMask) ||
           (initializeAltOp(I) &&
            AltOp.trySet(OpcodeInMaskForm, InterchangeableMask));
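    // Illustrative note (assumed semantics): `shl %x, 0` is a no-op, so its
    // InterchangeableMask is CanBeAll and it can stand in for any supported
    // opcode; `shl %x, 3`, by contrast, is only interchangeable with
    // `mul %x, 8`, hence MulBIT | ShlBIT.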
  unsigned getMainOpcode() const { return MainOp.getOpcode(); }
  bool hasCandidateOpcode(unsigned Opcode) const {
    return MainOp.hasCandidateOpcode(Opcode);
  }
  bool hasAltOp() const { return AltOp.I; }
  unsigned getAltOpcode() const {
    return hasAltOp() ? AltOp.getOpcode() : getMainOpcode();
  }
    return MainOp.getOperand(I);
class InstructionsState {
  bool HasCopyables = false;

    assert(valid() && "InstructionsState is invalid.");
    assert(valid() && "InstructionsState is invalid.");

  unsigned getOpcode() const { return getMainOp()->getOpcode(); }
  unsigned getAltOpcode() const { return getAltOp()->getOpcode(); }
  bool isAltShuffle() const { return getMainOp() != getAltOp(); }
  Instruction *getMatchingMainOpOrAltOp(Instruction *I) const {
    assert(MainOp && "MainOp cannot be nullptr.");
    if (I->getOpcode() == MainOp->getOpcode())

    assert(AltOp && "AltOp cannot be nullptr.");
    if (I->getOpcode() == AltOp->getOpcode())

    if (!I->isBinaryOp())

    BinOpSameOpcodeHelper Converter(MainOp);
    if (isAltShuffle() && !Converter.hasCandidateOpcode(MainOp->getOpcode())) {
      BinOpSameOpcodeHelper AltConverter(AltOp);
      if (AltConverter.add(I) && AltConverter.add(AltOp) &&
          AltConverter.hasCandidateOpcode(AltOp->getOpcode()))
    }
    if (Converter.hasAltOp() && !isAltShuffle())

    return Converter.hasAltOp() ? AltOp : MainOp;
  bool isShiftOp() const {
    return getMainOp()->isShift() && getAltOp()->isShift();
    return getMainOp()->isBitwiseLogicOp() && getAltOp()->isBitwiseLogicOp();

  bool isMulDivLikeOp() const {
    constexpr std::array<unsigned, 8> MulDiv = {
        Instruction::Mul, Instruction::FMul, Instruction::SDiv,
        Instruction::UDiv, Instruction::FDiv, Instruction::SRem,
        Instruction::URem, Instruction::FRem};

  bool isAddSubLikeOp() const {
    constexpr std::array<unsigned, 4> AddSub = {
        Instruction::Add, Instruction::Sub, Instruction::FAdd,

  bool isCmpOp() const {
    return (getOpcode() == Instruction::ICmp ||

  bool valid() const { return MainOp && AltOp; }
  explicit operator bool() const { return valid(); }

  InstructionsState() = delete;
  InstructionsState(Instruction *MainOp, Instruction *AltOp,
                    bool HasCopyables = false)
      : MainOp(MainOp), AltOp(AltOp), HasCopyables(HasCopyables) {}
  static InstructionsState invalid() { return {nullptr, nullptr}; }
  bool isCopyableElement(Value *V) const {
    assert(valid() && "InstructionsState is invalid.");
    if (isAltShuffle() || getOpcode() == Instruction::GetElementPtr)

    if (I->getParent() != MainOp->getParent() &&
    if (I->getOpcode() == MainOp->getOpcode())

    if (!I->isBinaryOp())

    BinOpSameOpcodeHelper Converter(MainOp);

  bool isNonSchedulable(Value *V) const {
    assert(valid() && "InstructionsState is invalid.");
    if (getMainOp() == V)

    if (isCopyableElement(V)) {
      auto IsNonSchedulableCopyableElement = [this](Value *V) {
        return !I || isa<PHINode>(I) ||
               I->getParent() != MainOp->getParent() ||
                !MainOp->comesBefore(I));
      };
      return IsNonSchedulableCopyableElement(V);
    }

  bool areInstructionsWithCopyableElements() const {
    assert(valid() && "InstructionsState is invalid.");
    return HasCopyables;
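  // Illustrative note on "copyable" elements (assumed semantics): in a bundle
  // like {add %a, %b; %c}, the plain value %c can be modeled as `add %c, 0`,
  // i.e. copied into the vectorized form without scheduling a real scalar
  // instruction for it.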
std::pair<Instruction *, SmallVector<Value *>>
  Instruction *SelectedOp = S.getMatchingMainOpOrAltOp(I);
  assert(SelectedOp && "Cannot convert the instruction.");
  if (I->isBinaryOp()) {
    return std::make_pair(SelectedOp, Converter.getOperand(SelectedOp));
  }
  for (Value *V : VL) {
    if (Inst->getOpcode() == Opcode)

      BaseOp0 == Op0 || BaseOp1 == Op1 ||
         "Assessing comparisons of different types?");
  return (BasePred == Pred &&
         (BasePred == SwappedPred &&
    return InstructionsState::invalid();
    return InstructionsState::invalid();
      (VL.size() == 2 && InstCnt < 2))
    return InstructionsState::invalid();

  unsigned AltOpcode = Opcode;
  BinOpSameOpcodeHelper BinOpHelper(MainOp);
  bool SwappedPredsCompatible = IsCmpOp && [&]() {
    UniquePreds.insert(BasePred);
    UniqueNonSwappedPreds.insert(BasePred);
    for (Value *V : VL) {
      UniqueNonSwappedPreds.insert(CurrentPred);
      if (!UniquePreds.contains(CurrentPred) &&
          !UniquePreds.contains(SwappedCurrentPred))
        UniquePreds.insert(CurrentPred);
    }
    return UniqueNonSwappedPreds.size() > 2 && UniquePreds.size() == 2;
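  // Illustrative note (assumed intent of the check above): predicates such as
  // {icmp sgt, icmp slt} count as "swapped-compatible" because slt with its
  // operands exchanged is sgt, so two unique predicates can still describe
  // the whole bundle.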
    return InstructionsState::invalid();

  bool AnyPoison = InstCnt != VL.size();
    if (AnyPoison && (I->isIntDivRem() || I->isFPDivRem() || isa<CallInst>(I)))
      return InstructionsState::invalid();
    unsigned InstOpcode = I->getOpcode();
    if (BinOpHelper.add(I))

    Value *Op1 = I->getOperand(0);
    if (InstOpcode == Opcode || InstOpcode == AltOpcode)

    if (Opcode == AltOpcode) {
      assert(isValidForAlternation(Opcode) &&
             isValidForAlternation(InstOpcode) &&
             "Cast isn't safe for alternation, logic needs to be updated!");
      AltOpcode = InstOpcode;
    }
    Type *Ty0 = BaseInst->getOperand(0)->getType();
    Type *Ty1 = Inst->getOperand(0)->getType();
      assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
      assert(InstOpcode == AltOpcode &&
             "Alternate instructions are only supported by BinaryOperator "
      if ((VL.size() == 2 || SwappedPredsCompatible) &&
          (BasePred == CurrentPred || BasePred == SwappedCurrentPred))

      if (MainOp != AltOp) {
      } else if (BasePred != CurrentPred) {
        assert(isValidForAlternation(InstOpcode) &&
               "CmpInst isn't safe for alternation, logic needs to be updated!");
      }
      if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
          AltPred == CurrentPred || AltPred == SwappedCurrentPred)
    } else if (InstOpcode == Opcode) {
      assert(InstOpcode == AltOpcode &&
             "Alternate instructions are only supported by BinaryOperator and "
      if (Gep->getNumOperands() != 2 ||
        return InstructionsState::invalid();
        return InstructionsState::invalid();
      if (!LI->isSimple() || !BaseLI->isSimple())
        return InstructionsState::invalid();
        return InstructionsState::invalid();
      if (Call->hasOperandBundles() &&
          !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
                      Call->op_begin() + Call->getBundleOperandsEndIndex(),
        return InstructionsState::invalid();
        return InstructionsState::invalid();
      if (Mappings.size() != BaseMappings.size() ||
          Mappings.front().ISA != BaseMappings.front().ISA ||
          Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
          Mappings.front().VectorName != BaseMappings.front().VectorName ||
          Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
          Mappings.front().Shape.Parameters !=
              BaseMappings.front().Shape.Parameters)
        return InstructionsState::invalid();
    }
    return InstructionsState::invalid();

  assert(MainOp && "Cannot find MainOp with Opcode from BinOpHelper.");
  assert(AltOp && "Cannot find AltOp with Opcode from BinOpHelper.");
         "Incorrect implementation of allSameOpcode.");
  InstructionsState S(MainOp, AltOp);
         "Invalid InstructionsState.");
  return all_of(VL, [&](Value *V) { return V->getType() == Ty; });

  unsigned Opcode = UserInst->getOpcode();
  case Instruction::Load: {
  case Instruction::Store: {
    return (SI->getPointerOperand() == Scalar);
  case Instruction::Call: {
    return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index(), TTI) &&
           Arg.value().get() == Scalar;

    return LI->isSimple();
    return SI->isSimple();
    return !MI->isVolatile();
                    bool ExtendingManyInputs = false) {
  if (SubMask.empty())

      (!ExtendingManyInputs || SubMask.size() > Mask.size() ||
       "SubMask with many inputs support must be larger than the mask.");

    Mask.append(SubMask.begin(), SubMask.end());

  int TermValue = std::min(Mask.size(), SubMask.size());
  for (int I = 0, E = SubMask.size(); I < E; ++I) {
        (!ExtendingManyInputs &&
         (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)))

    NewMask[I] = Mask[SubMask[I]];
  }
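// Worked example (assumed semantics of the composition above): with
// Mask = {1, 0, 3, 2} and SubMask = {2, 3}, NewMask[I] = Mask[SubMask[I]]
// yields {3, 2}: the submask selects which already-permuted lanes survive.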
  const size_t Sz = Order.size();
  for (unsigned I = 0; I < Sz; ++I) {
      UnusedIndices.reset(Order[I]);
      MaskedIndices.set(I);
  }
  if (MaskedIndices.none())

         "Non-synced masked/available indices.");
    assert(Idx >= 0 && "Indices must be synced.");
                           unsigned Opcode0, unsigned Opcode1) {
      OpcodeMask.set(Lane * ScalarTyNumElements,
                     Lane * ScalarTyNumElements + ScalarTyNumElements);

         "Expected scalar constants.");
    std::fill_n(NewVal.begin() + I * VF, VF, V);

  const unsigned E = Indices.size();
  for (unsigned I = 0; I < E; ++I)
    Mask[Indices[I]] = I;
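// Worked example (assumed): Indices = {2, 0, 1} produces Mask = {1, 2, 0};
// Mask[OldPos] = NewPos inverts the reordering described by Indices.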
  assert(!Mask.empty() && "Expected non-empty mask.");
  for (unsigned I = 0, E = Prev.size(); I < E; ++I)
    Scalars[Mask[I]] = Prev[I];
    auto *IO = dyn_cast<Instruction>(V);
    return isa<PHINode>(IO) || IO->getParent() != I->getParent();

  return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(UsesLimit) &&
           auto *IU = dyn_cast<Instruction>(U);
           return IU->getParent() != I->getParent() || isa<PHINode>(IU);

  return !VL.empty() &&
  return NumParts > 0 && NumParts < Sz && has_single_bit(Sz / NumParts) &&

                   const unsigned Limit = std::numeric_limits<unsigned>::max()) {
  unsigned NumParts = TTI.getNumberOfParts(VecTy);
  if (NumParts == 0 || NumParts >= Limit)

  if (NumParts >= Sz || Sz % NumParts != 0 ||
  class ScheduleEntity;
  class ScheduleCopyableData;
  class ScheduleBundle;

  struct StridedPtrInfo {
    Value *StrideVal = nullptr;
    const SCEV *StrideSCEV = nullptr;
  };

      : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li), DT(Dt),
        AC(AC), DB(DB), DL(DL), ORE(ORE),
    MinVecRegSize = TTI->getMinVectorRegisterBitWidth();

      ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales = {});
    assert(!VectorizableTree.empty() && "No graph to get the first node from");
    return VectorizableTree.front()->Scalars;

    const TreeEntry &Root = *VectorizableTree.front();
    if (Root.State != TreeEntry::Vectorize || Root.isAltShuffle() ||
        !Root.Scalars.front()->getType()->isIntegerTy())
      return std::nullopt;
    auto It = MinBWs.find(&Root);
    if (It != MinBWs.end())

    if (Root.getOpcode() == Instruction::ZExt ||
        Root.getOpcode() == Instruction::SExt)
      return std::make_pair(cast<CastInst>(Root.getMainOp())->getSrcTy(),
                            Root.getOpcode() == Instruction::SExt);
    return std::nullopt;

    return MinBWs.at(VectorizableTree.front().get()).second;

    if (ReductionBitWidth == 0 ||
        !VectorizableTree.front()->Scalars.front()->getType()->isIntegerTy() ||
        ReductionBitWidth >=
            DL->getTypeSizeInBits(
                VectorizableTree.front()->Scalars.front()->getType()))
          VectorizableTree.front()->Scalars.front()->getType(),
          VectorizableTree.front()->getVectorFactor());
          VectorizableTree.front()->Scalars.front()->getContext(),
          VectorizableTree.front()->getVectorFactor());
    VectorizableTree.clear();
    ScalarToTreeEntries.clear();
    OperandsToTreeEntry.clear();
    ScalarsInSplitNodes.clear();
    NonScheduledFirst.clear();
    EntryToLastInstruction.clear();
    LoadEntriesToVectorize.clear();
    IsGraphTransformMode = false;
    GatheredLoadsEntriesFirst.reset();
    CompressEntryToData.clear();
    ExternalUses.clear();
    ExternalUsesAsOriginalScalar.clear();
    ExternalUsesWithNonUsers.clear();
    for (auto &Iter : BlocksSchedules) {
      BlockScheduling *BS = Iter.second.get();
    }
    ReductionBitWidth = 0;
    CastMaxMinBWSizes.reset();
    ExtraBitWidthNodes.clear();
    InstrElementSize.clear();
    UserIgnoreList = nullptr;
    PostponedGathers.clear();
    ValueToGatherNodes.clear();
    TreeEntryToStridedPtrInfoMap.clear();
    assert(!Order.empty() && "expected non-empty order");
    const unsigned Sz = Order.size();
      return P.value() == P.index() || P.value() == Sz;

                              bool IgnoreReorder);

  std::optional<OrdersType>

    return MaxVecRegSize;
    return MinVecRegSize;

    unsigned MaxVF = MaxVFOption.getNumOccurrences()
                         ? MaxVFOption
                         : TTI->getMaximumVF(ElemWidth, Opcode);
    return MaxVF ? MaxVF : UINT_MAX;
                             Align Alignment, const int64_t Diff,
                             const size_t Sz) const;

                   Value *Ptr0, Value *PtrN, StridedPtrInfo &SPtrInfo) const;

                        Align CommonAlignment,
                        StridedPtrInfo &SPtrInfo) const;

                       StridedPtrInfo &SPtrInfo,
                       unsigned *BestVF = nullptr,
                       bool TryRecursiveCheck = true) const;

    ListOfKnonwnNonVectorizableLoads.insert(hash_value(VL));

  template <typename T>
    return ListOfKnonwnNonVectorizableLoads.contains(hash_value(VL));

      OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
         << " EdgeIdx:" << EdgeIdx << "}";
        : TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes),
          MaxLevel(MaxLevel) {}

      auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) {
        auto AllUsersVectorized = [U1, U2, this](Value *V) {
            return U == U1 || U == U2 || R.isVectorized(U);
        };
        return AllUsersVectorized(V1) && AllUsersVectorized(V2);
      };
      if (R.TTI->isLegalBroadcastLoad(V1->getType(),
          ((int)V1->getNumUses() == NumLanes ||
           AllUsersAreInternal(V1, V2)))

      auto CheckSameEntryOrFail = [&]() {
            any_of(TEs2, [&](TreeEntry *E) { return Set.contains(E); }))
      };

        if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
          return CheckSameEntryOrFail();

            LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
            LI2->getPointerOperand(), DL, SE, true);
        if (!Dist || *Dist == 0) {
              R.TTI->isLegalMaskedGather(
          return CheckSameEntryOrFail();
        }
        if (std::abs(*Dist) > NumLanes / 2)

      Value *EV2 = nullptr;
        int Dist = Idx2 - Idx1;
        if (std::abs(Dist) == 0)
        if (std::abs(Dist) > NumLanes / 2)
      return CheckSameEntryOrFail();

        if (I1->getParent() != I2->getParent())
          return CheckSameEntryOrFail();
            (S.getMainOp()->getNumOperands() <= 2 || !MainAltOps.empty() ||
             !S.isAltShuffle()) &&
            S.getMainOp()->getNumOperands();
      return CheckSameEntryOrFail();
2637 assert(I1 && I2 &&
"Should have early exited.");
2644 for (
unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
2645 OpIdx1 != NumOperands1; ++OpIdx1) {
2647 int MaxTmpScore = 0;
2648 unsigned MaxOpIdx2 = 0;
2649 bool FoundBest =
false;
2653 ? I2->getNumOperands()
2654 : std::min(I2->getNumOperands(), OpIdx1 + 1);
2655 assert(FromIdx <= ToIdx &&
"Bad index");
2656 for (
unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
2658 if (Op2Used.
count(OpIdx2))
2663 I1, I2, CurrLevel + 1, {});
2666 TmpScore > MaxTmpScore) {
2667 MaxTmpScore = TmpScore;
2674 Op2Used.
insert(MaxOpIdx2);
2675 ShallowScoreAtThisLevel += MaxTmpScore;
2678 return ShallowScoreAtThisLevel;
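    // Illustrative note (assumed behavior of the recursion above): a pair's
    // score is its shallow score plus the best pairwise scores of its
    // operands, explored up to MaxLevel; e.g. two adds fed by consecutive
    // loads outscore two adds with unrelated operands.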
    struct OperandData {
      OperandData() = default;
      OperandData(Value *V, bool APO, bool IsUsed)
          : V(V), APO(APO), IsUsed(IsUsed) {}
      bool IsUsed = false;
    };

    enum class ReorderingMode {

    unsigned ArgSize = 0;
    const Loop *L = nullptr;

    OperandData &getData(unsigned OpIdx, unsigned Lane) {
      return OpsVec[OpIdx][Lane];
    }
    const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
      return OpsVec[OpIdx][Lane];
    }

      for (unsigned OpIdx = 0, NumOperands = getNumOperands();
        for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
          OpsVec[OpIdx][Lane].IsUsed = false;

    void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
      std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
    }
    int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx,
      Value *IdxLaneV = getData(Idx, Lane).V;
      unsigned UniquesCount = Uniques.size();
      auto IdxIt = Uniques.find(IdxLaneV);
      unsigned UniquesCntWithIdxLaneV =
          IdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
      auto OpIdxIt = Uniques.find(OpIdxLaneV);
      unsigned UniquesCntWithOpIdxLaneV =
          OpIdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
      if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)

      return std::min(bit_ceil(UniquesCntWithOpIdxLaneV) -
                          UniquesCntWithOpIdxLaneV,
                      UniquesCntWithOpIdxLaneV -
             ((IdxIt != Uniques.end() && UsedLanes.test(IdxIt->second))
                  ? UniquesCntWithIdxLaneV - bit_floor(UniquesCntWithIdxLaneV)
                  : bit_ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
    int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
      Value *IdxLaneV = getData(Idx, Lane).V;
      return R.areAllUsersVectorized(IdxLaneI)

    static const int ScoreScaleFactor = 10;

                          int Lane, unsigned OpIdx, unsigned Idx,
        int SplatScore = getSplatScore(Lane, OpIdx, Idx, UsedLanes);
        if (Score <= -SplatScore) {
          Score += SplatScore;
        }
        Score *= ScoreScaleFactor;
        Score += getExternalUseScore(Lane, OpIdx, Idx);
    std::optional<unsigned>
    getBestOperand(unsigned OpIdx, int Lane, int LastLane,
      unsigned NumOperands = getNumOperands();
      Value *OpLastLane = getData(OpIdx, LastLane).V;
      ReorderingMode RMode = ReorderingModes[OpIdx];
      if (RMode == ReorderingMode::Failed)
        return std::nullopt;

      bool OpIdxAPO = getData(OpIdx, Lane).APO;
      std::optional<unsigned> Idx;
          BestScoresPerLanes.try_emplace(std::make_pair(OpIdx, Lane), 0)
      bool IsUsed = RMode == ReorderingMode::Splat ||
                    RMode == ReorderingMode::Constant ||
                    RMode == ReorderingMode::Load;
      for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
        OperandData &OpData = getData(Idx, Lane);
        bool OpAPO = OpData.APO;
        if (OpAPO != OpIdxAPO)

        case ReorderingMode::Load:
        case ReorderingMode::Opcode: {
          bool LeftToRight = Lane > LastLane;
          Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
          Value *OpRight = (LeftToRight) ? Op : OpLastLane;
          int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
                                        OpIdx, Idx, IsUsed, UsedLanes);
          if (Score > static_cast<int>(BestOp.Score) ||
              (Score > 0 && Score == static_cast<int>(BestOp.Score) &&
            BestOp.Score = Score;
            BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
          }
        }
        case ReorderingMode::Constant:
            (!BestOp.Score && L && L->isLoopInvariant(Op))) {
            BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
        case ReorderingMode::Splat:
          IsUsed = Op == OpLastLane;
          if (Op == OpLastLane) {
            BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
        case ReorderingMode::Failed:
      }
        getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
      return std::nullopt;
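    // Illustrative note (assumed strategy of getBestOperand above): for each
    // lane it tries every operand slot with a matching APO flag and keeps the
    // one whose look-ahead score against the previous lane's choice is
    // highest, e.g. preferring the operand that forms a consecutive load pair.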
    unsigned getBestLaneToStartReordering() const {
      unsigned Min = UINT_MAX;
      unsigned SameOpNumber = 0;
      for (int I = getNumLanes(); I > 0; --I) {
        unsigned Lane = I - 1;
        OperandsOrderData NumFreeOpsHash =
            getMaxNumOperandsThatCanBeReordered(Lane);
        if (NumFreeOpsHash.NumOfAPOs < Min) {
          Min = NumFreeOpsHash.NumOfAPOs;
          SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
          HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
        } else if (NumFreeOpsHash.NumOfAPOs == Min &&
                   NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
          SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
          HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
        } else if (NumFreeOpsHash.NumOfAPOs == Min &&
                   NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
          auto [It, Inserted] =
              HashMap.try_emplace(NumFreeOpsHash.Hash, 1, Lane);
        }
      }
      unsigned BestLane = 0;
      unsigned CntMin = UINT_MAX;
        if (Data.second.first < CntMin) {
          CntMin = Data.second.first;
          BestLane = Data.second.second;
        }
    struct OperandsOrderData {
      unsigned NumOfAPOs = UINT_MAX;
      unsigned NumOpsWithSameOpcodeParent = 0;
    };

    OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
      unsigned CntTrue = 0;
      unsigned NumOperands = getNumOperands();
      bool AllUndefs = true;
      unsigned NumOpsWithSameOpcodeParent = 0;
        const OperandData &OpData = getData(OpIdx, Lane);
            I->getParent() != Parent) {
          if (NumOpsWithSameOpcodeParent == 0) {
            NumOpsWithSameOpcodeParent = 1;
          }
          Parent = I->getParent();
          --NumOpsWithSameOpcodeParent;
        ++NumOpsWithSameOpcodeParent;
      OperandsOrderData Data;
      Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
      Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
                        const InstructionsState &S) {
        return VL.size() == getNumLanes();
             "Expected same number of lanes");
      assert(S.valid() && "InstructionsState is invalid.");
      OpsVec.resize(ArgSize);
      unsigned NumLanes = VL.size();
      for (OperandDataVec &Ops : OpsVec)
        Ops.resize(NumLanes);

          OpsVec[OpIdx][Lane] = {Operands[OpIdx][Lane], true, false};
        bool IsInverseOperation = false;
        if (S.isCopyableElement(VL[Lane])) {
          assert(I && "Expected instruction");
          auto [SelectedOp, Ops] = convertTo(I, S);
        }
          bool APO = (OpIdx == 0) ? false : IsInverseOperation;
          OpsVec[OpIdx][Lane] = {Operands[OpIdx][Lane], APO, false};
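      // Illustrative note (assumed from the APO computation above): operand 0
      // never counts as inverse, while later operands inherit
      // IsInverseOperation, e.g. the right-hand side of a sub in an add/sub
      // bundle; reordering later only swaps operands whose APO flags match.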
    unsigned getNumOperands() const { return ArgSize; }
    unsigned getNumLanes() const { return OpsVec[0].size(); }
    Value *getValue(unsigned OpIdx, unsigned Lane) const {
      return getData(OpIdx, Lane).V;
    }
    bool empty() const { return OpsVec.empty(); }
    void clear() { OpsVec.clear(); }

    bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
             "Op is expected to be getValue(OpIdx, Lane).");
      bool OpAPO = getData(OpIdx, Lane).APO;
      bool IsInvariant = L && L->isLoopInvariant(Op);
      for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
        bool FoundCandidate = false;
        for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
          OperandData &Data = getData(OpI, Ln);
          if (Data.APO != OpAPO || Data.IsUsed)

          Value *OpILane = getValue(OpI, Lane);
              L->isLoopInvariant(Data.V))) {
            FoundCandidate = true;
          }
        }
        if (!FoundCandidate)
      }
      return getNumLanes() == 2 || Cnt > 1;
3263 "Op is expected to be getValue(OpIdx, Lane).");
3264 bool OpAPO = getData(
OpIdx, Lane).APO;
3265 for (
unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
3269 const OperandData &
Data = getData(OpI, Ln);
3270 if (
Data.APO != OpAPO ||
Data.IsUsed)
3272 Value *OpILn = getValue(OpI, Ln);
3273 return (L && L->isLoopInvariant(OpILn)) ||
3285 const InstructionsState &S,
const BoUpSLP &R)
3286 : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R),
3287 L(R.LI->getLoopFor(S.getMainOp()->
getParent())) {
3289 appendOperands(RootVL, Operands, S);
3297 "Expected same num of lanes across all operands");
3298 for (
unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
3299 OpVL[Lane] = OpsVec[
OpIdx][Lane].V;
      unsigned NumOperands = getNumOperands();
      unsigned NumLanes = getNumLanes();
      unsigned FirstLane = getBestLaneToStartReordering();

        if (shouldBroadcast(OpLane0, OpIdx, FirstLane) ||
            !canBeVectorized(OpILane0, OpIdx, FirstLane))
          ReorderingModes[OpIdx] = ReorderingMode::Splat;
          ReorderingModes[OpIdx] = ReorderingMode::Load;
          ReorderingModes[OpIdx] = ReorderingMode::Opcode;
          ReorderingModes[OpIdx] = ReorderingMode::Constant;
          ReorderingModes[OpIdx] = ReorderingMode::Splat;

      auto &&SkipReordering = [this]() {
        for (const OperandData &Data : Op0)
             ArrayRef(OpsVec).slice(1, getNumOperands() - 1)) {
          if (any_of(Op, [&UniqueValues](const OperandData &Data) {
        }
        return UniqueValues.size() != 2 &&
                       UniqueValues.size());
      };
      if (SkipReordering())

      bool StrategyFailed = false;
      for (unsigned I = 0; I < NumOperands; ++I)
        MainAltOps[I].push_back(getData(I, FirstLane).V);

      UsedLanes.set(FirstLane);
      for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
        for (int Direction : {+1, -1}) {
          int Lane = FirstLane + Direction * Distance;
          if (Lane < 0 || Lane >= (int)NumLanes)

          UsedLanes.set(Lane);
          int LastLane = Lane - Direction;
          assert(LastLane >= 0 && LastLane < (int)NumLanes &&
            std::optional<unsigned> BestIdx =
                getBestOperand(OpIdx, Lane, LastLane, ReorderingModes,
                               MainAltOps[OpIdx], UsedLanes);
              swap(OpIdx, *BestIdx, Lane);
              StrategyFailed = true;
            OperandData &AltOp = getData(OpIdx, Lane);
            InstructionsState OpS =
            if (OpS && OpS.isAltShuffle())
        }
      }
      if (!StrategyFailed)
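      // Illustrative note (assumed flow of reorder() above): starting from
      // the most constrained lane, lanes are visited outward in both
      // directions, and each operand column is greedily permuted so that lane
      // N pairs well with lane N - Direction; the new order is kept only if
      // every per-lane choice succeeds.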
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
      case ReorderingMode::Load:
      case ReorderingMode::Opcode:
      case ReorderingMode::Constant:
      case ReorderingMode::Splat:
      case ReorderingMode::Failed:

      const unsigned Indent = 2;
      for (const OperandDataVec &OpDataVec : OpsVec) {
        OS << "Operand " << Cnt++ << "\n";
        for (const OperandData &OpData : OpDataVec) {
          OS.indent(Indent) << "{";
          if (Value *V = OpData.V)
          OS << ", APO:" << OpData.APO << "}\n";
        }
      }
    int BestScore = Limit;
    std::optional<int> Index;
    for (int I : seq<int>(0, Candidates.size())) {
          Candidates[I].second,
      if (Score > BestScore) {
    }

    DeletedInstructions.insert(I);

  template <typename T>
      ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales) {
    for (T *V : DeadVals) {
    for (T *V : DeadVals) {
      if (!V || !Processed.insert(V).second)

      for (Use &U : I->operands()) {
            OpI && !DeletedInstructions.contains(OpI) && OpI->hasOneUser() &&
            (Entries.empty() || none_of(Entries, [&](const TreeEntry *Entry) {
              return Entry->VectorizedValue == OpI;
      }
      I->dropAllReferences();
    }
    for (T *V : DeadVals) {
      if (!I->getParent())
          cast<Instruction>(U.getUser()));
             "trying to erase instruction with users.");
      I->removeFromParent();
    }
    while (!DeadInsts.empty()) {
      if (!VI || !VI->getParent())
             "Live instruction found in dead worklist!");
      assert(VI->use_empty() && "Instructions with uses are not dead.");
      for (Use &OpU : VI->operands()) {
        Value *OpV = OpU.get();
          if (!DeletedInstructions.contains(OpI) &&
              (!OpI->getType()->isVectorTy() ||
               none_of(VectorValuesAndScales,
                       [&](const std::tuple<Value *, unsigned, bool> &V) {
                         return std::get<0>(V) == OpI;
      }
      VI->removeFromParent();
      SE->forgetValue(VI);
    }

    return AnalyzedReductionsRoots.count(I);
    AnalyzedReductionsRoots.insert(I);
    return AnalyzedReductionVals.contains(hash_value(VL));
    AnalyzedReductionVals.insert(hash_value(VL));
    AnalyzedReductionsRoots.clear();
    AnalyzedReductionVals.clear();
    AnalyzedMinBWVals.clear();
    return MustGather.contains(V);
    return NonScheduledFirst.contains(V);

    assert(V && "V cannot be nullptr.");
    return ScalarToTreeEntries.contains(V);

  bool collectValuesToDemote(
      const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
      bool &IsProfitableToDemote, bool IsTruncRoot) const;
  void buildReorderableOperands(

  void reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const;

  bool areAllUsersVectorized(

  const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const;
  TreeEntry *getOperandEntry(TreeEntry *E, unsigned Idx) {
    return const_cast<TreeEntry *>(
        getOperandEntry(const_cast<const TreeEntry *>(E), Idx));
  }

  Instruction *getRootEntryInstruction(const TreeEntry &Entry) const;

  getCastContextHint(const TreeEntry &TE) const;

                          const InstructionsState &LocalState,
                          unsigned InterleaveFactor = 0);

                 bool ResizeAllowed = false) const;

  Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx);

  template <typename BVTy, typename ResTy, typename... Args>
  ResTy processBuildVector(const TreeEntry *E, Type *ScalarTy, Args &...Params);

  Value *createBuildVector(const TreeEntry *E, Type *ScalarTy);

  Instruction &getLastInstructionInBundle(const TreeEntry *E);

  std::optional<TargetTransformInfo::ShuffleKind>
                                   unsigned NumParts) const;

  std::optional<TargetTransformInfo::ShuffleKind>
  isGatherShuffledSingleRegisterEntry(

  isGatherShuffledEntry(
      unsigned NumParts, bool ForOrder = false);

             Type *ScalarTy) const;

  void setInsertPointAfterBundle(const TreeEntry *E);

  bool isFullyVectorizableTinyTree(bool ForReduction) const;

  void tryToVectorizeGatheredLoads(
      std::tuple<BasicBlock *, Value *, Type *>,

  collectUserStores(const BoUpSLP::TreeEntry *TE) const;

  findExternalStoreUsersReorderIndices(TreeEntry *TE) const;

  void reorderGatherNode(TreeEntry &TE);
    TreeEntry(VecTreeTy &Container) : Container(Container) {}

      if (State == TreeEntry::SplitVectorize)
      SmallVector<int> Mask;

    SmallVector<int> getSplitMask() const {
      assert(State == TreeEntry::SplitVectorize && !ReorderIndices.empty() &&
             "Expected only split vectorize node.");
      unsigned CommonVF = std::max<unsigned>(
          CombinedEntriesWithIndices.back().second,
          Scalars.size() - CombinedEntriesWithIndices.back().second);
      for (auto [Idx, I] : enumerate(ReorderIndices))
            Idx + (Idx >= CombinedEntriesWithIndices.back().second
                       ? CommonVF - CombinedEntriesWithIndices.back().second

    void reorderSplitNode(unsigned Idx, ArrayRef<int> Mask,
                          ArrayRef<int> MaskOrder);
      if (Mask.size() != VL.size() && VL.size() == Scalars.size())
        return std::equal(VL.begin(), VL.end(), Scalars.begin());
                    [Scalars](Value *V, int Idx) {
                      return (isa<UndefValue>(V) &&
                              Idx == PoisonMaskElem) ||
                             (Idx != PoisonMaskElem && V == Scalars[Idx]);
      if (!ReorderIndices.empty()) {
        SmallVector<int> Mask;
        if (VL.size() == Scalars.size())
          return IsSame(Scalars, Mask);
        if (VL.size() == ReuseShuffleIndices.size()) {
          return IsSame(Scalars, Mask);
        }
      }
      return IsSame(Scalars, ReuseShuffleIndices);
    bool hasEqualOperands(const TreeEntry &TE) const {
      if (TE.getNumOperands() != getNumOperands())

      SmallBitVector Used(getNumOperands());
      for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {
        unsigned PrevCount = Used.count();
        for (unsigned K = 0; K < E; ++K) {
          if (getOperand(K) == TE.getOperand(I)) {
          }
        }
        if (PrevCount == Used.count())
      }

    unsigned getVectorFactor() const {
      if (!ReuseShuffleIndices.empty())
        return ReuseShuffleIndices.size();
      return Scalars.size();
    }

    bool isGather() const { return State == NeedToGather; }
    WeakTrackingVH VectorizedValue = nullptr;

    enum CombinedOpcode {
      MinMax = Instruction::OtherOpsEnd + 1,
    };
    CombinedOpcode CombinedOp = NotCombinedOp;

    SmallVector<int, 4> ReuseShuffleIndices;
    SmallVector<unsigned, 4> ReorderIndices;
    VecTreeTy &Container;
    EdgeInfo UserTreeIndex;
    SmallPtrSet<const Value *, 4> CopyableElements;
    InstructionsState S = InstructionsState::invalid();
    unsigned InterleaveFactor = 0;
    bool DoesNotNeedToSchedule = false;

      if (Operands.size() < OpIdx + 1)
        Operands.resize(OpIdx + 1);
             "Number of operands is greater than the number of scalars.");

    unsigned getInterleaveFactor() const { return InterleaveFactor; }
    void setInterleave(unsigned Factor) { InterleaveFactor = Factor; }
    void setDoesNotNeedToSchedule() { DoesNotNeedToSchedule = true; }
    bool doesNotNeedToSchedule() const { return DoesNotNeedToSchedule; }

        setOperand(I, Operands[I]);
    void reorderOperands(ArrayRef<int> Mask) {
      return Operands[OpIdx];
      return Operands[OpIdx];

    unsigned getNumOperands() const { return Operands.size(); }

    Value *getSingleOperand(unsigned OpIdx) const {
      return Operands[OpIdx][0];
    }

    bool isAltShuffle() const { return S.isAltShuffle(); }

    Instruction *getMatchingMainOpOrAltOp(Instruction *I) const {
      return S.getMatchingMainOpOrAltOp(I);
      if (I && getMatchingMainOpOrAltOp(I))
      return S.getMainOp();

    void setOperations(const InstructionsState &S) {
      assert(S && "InstructionsState is invalid.");

    Instruction *getMainOp() const { return S.getMainOp(); }
    Instruction *getAltOp() const { return S.getAltOp(); }
    unsigned getOpcode() const { return S.getOpcode(); }
    unsigned getAltOpcode() const { return S.getAltOpcode(); }
    bool hasState() const { return S.valid(); }

    void addCopyableElement(Value *V) {
      assert(S.isCopyableElement(V) && "Not a copyable element.");
      CopyableElements.insert(V);
    }
    bool isCopyableElement(Value *V) const {
      return CopyableElements.contains(V);
    }
    bool hasCopyableElements() const { return !CopyableElements.empty(); }
    const InstructionsState &getOperations() const { return S; }
    unsigned findLaneForValue(Value *V) const {
      unsigned FoundLane = getVectorFactor();
      for (auto *It = find(Scalars, V), *End = Scalars.end(); It != End;
           std::advance(It, 1)) {
        FoundLane = std::distance(Scalars.begin(), It);
        assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
        if (!ReorderIndices.empty())
          FoundLane = ReorderIndices[FoundLane];
        assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
        if (ReuseShuffleIndices.empty())

        if (auto *RIt = find(ReuseShuffleIndices, FoundLane);
            RIt != ReuseShuffleIndices.end()) {
          FoundLane = std::distance(ReuseShuffleIndices.begin(), RIt);
        }
      }
      assert(FoundLane < getVectorFactor() && "Unable to find given value.");

    buildAltOpShuffleMask(const function_ref<bool(Instruction *)> IsAltOp,
                          SmallVectorImpl<int> &Mask,
                          SmallVectorImpl<Value *> *OpScalars = nullptr,
                          SmallVectorImpl<Value *> *AltScalars = nullptr) const;
    bool isNonPowOf2Vec() const {
      return IsNonPowerOf2;
    }

    hasNonWholeRegisterOrNonPowerOf2Vec(const TargetTransformInfo &TTI) const {
      assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) &&
             "Reshuffling not supported with non-power-of-2 vectors yet.");
      return IsNonPowerOf2;
    }

    Value *getOrdered(unsigned Idx) const {
      assert(isGather() && "Must be used only for buildvectors/gathers.");
      if (ReorderIndices.empty())
        return Scalars[Idx];
      SmallVector<int> Mask;
      return Scalars[Mask[Idx]];
    }
      dbgs() << Idx << ".\n";
      for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
        dbgs() << "Operand " << OpI << ":\n";
        for (const Value *V : Operands[OpI])
      }
      dbgs() << "Scalars: \n";
      for (Value *V : Scalars)
      dbgs() << "State: ";
      if (S && hasCopyableElements())
        dbgs() << "[[Copyable]] ";
        if (InterleaveFactor > 0) {
          dbgs() << "Vectorize with interleave factor " << InterleaveFactor
        } else {
          dbgs() << "Vectorize\n";
        }
      case ScatterVectorize:
        dbgs() << "ScatterVectorize\n";
      case StridedVectorize:
        dbgs() << "StridedVectorize\n";
      case CompressVectorize:
        dbgs() << "CompressVectorize\n";
        dbgs() << "NeedToGather\n";
      case CombinedVectorize:
        dbgs() << "CombinedVectorize\n";
      case SplitVectorize:
        dbgs() << "SplitVectorize\n";
        dbgs() << "MainOp: " << *S.getMainOp() << "\n";
        dbgs() << "AltOp: " << *S.getAltOp() << "\n";
        dbgs() << "MainOp: NULL\n";
        dbgs() << "AltOp: NULL\n";
      dbgs() << "VectorizedValue: ";
      if (VectorizedValue)
        dbgs() << *VectorizedValue << "\n";
      dbgs() << "ReuseShuffleIndices: ";
      if (ReuseShuffleIndices.empty())
      for (int ReuseIdx : ReuseShuffleIndices)
        dbgs() << ReuseIdx << ", ";
      dbgs() << "ReorderIndices: ";
      for (unsigned ReorderIdx : ReorderIndices)
        dbgs() << ReorderIdx << ", ";
      dbgs() << "UserTreeIndex: ";
        dbgs() << UserTreeIndex;
        dbgs() << "<invalid>";
      if (!CombinedEntriesWithIndices.empty()) {
        dbgs() << "Combined entries: ";
          dbgs() << "Entry index " << P.first << " with offset " << P.second;
      }
                       StringRef Banner) const {
      dbgs() << "SLP: " << Banner << ":\n";
      dbgs() << "SLP: Costs:\n";
      dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n";
      dbgs() << "SLP: VectorCost = " << VecCost << "\n";
      dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n";
      dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = "
             << ReuseShuffleCost + VecCost - ScalarCost << "\n";
                          const InstructionsState &S,
                          ArrayRef<int> ReuseShuffleIndices = {}) {
    auto Invalid = ScheduleBundle::invalid();
    return newTreeEntry(VL, Invalid, S, UserTreeIdx, ReuseShuffleIndices);

                          const InstructionsState &S,
                          ArrayRef<int> ReuseShuffleIndices = {},
                          ArrayRef<unsigned> ReorderIndices = {},
                          unsigned InterleaveFactor = 0) {
    TreeEntry::EntryState EntryState =
        Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
    TreeEntry *E = newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
                                ReuseShuffleIndices, ReorderIndices);
    if (E && InterleaveFactor > 0)
      E->setInterleave(InterleaveFactor);

                          TreeEntry::EntryState EntryState,
                          ScheduleBundle &Bundle, const InstructionsState &S,
                          ArrayRef<int> ReuseShuffleIndices = {},
                          ArrayRef<unsigned> ReorderIndices = {}) {
    assert(((!Bundle && (EntryState == TreeEntry::NeedToGather ||
                         EntryState == TreeEntry::SplitVectorize)) ||
            (Bundle && EntryState != TreeEntry::NeedToGather &&
             EntryState != TreeEntry::SplitVectorize)) &&
           "Need to vectorize gather entry?");
    if (GatheredLoadsEntriesFirst.has_value() &&
        EntryState == TreeEntry::NeedToGather && S &&
        S.getOpcode() == Instruction::Load && UserTreeIdx.EdgeIdx == UINT_MAX &&
        !UserTreeIdx.UserTE)

    VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
    TreeEntry *Last = VectorizableTree.back().get();
    Last->Idx = VectorizableTree.size() - 1;
    Last->State = EntryState;
    if (UserTreeIdx.UserTE)
      OperandsToTreeEntry.try_emplace(
          std::make_pair(UserTreeIdx.UserTE, UserTreeIdx.EdgeIdx), Last);

            ReuseShuffleIndices.empty()) &&
           "Reshuffling scalars not yet supported for nodes with padding");
    Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
                                     ReuseShuffleIndices.end());
    if (ReorderIndices.empty()) {
      Last->setOperations(S);
    } else {
      Last->Scalars.assign(VL.size(), nullptr);
                [VL](unsigned Idx) -> Value * {
                  if (Idx >= VL.size())
                    return UndefValue::get(VL.front()->getType());
      Last->setOperations(S);
      Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
    }
    if (EntryState == TreeEntry::SplitVectorize) {
      assert(S && "Split nodes must have operations.");
      Last->setOperations(S);
      SmallPtrSet<Value *, 4> Processed;
      for (Value *V : VL) {
        auto It = ScalarsInSplitNodes.find(V);
        if (It == ScalarsInSplitNodes.end()) {
          ScalarsInSplitNodes.try_emplace(V).first->getSecond().push_back(Last);
          (void)Processed.insert(V);
        } else if (Processed.insert(V).second) {
                 "Value already associated with the node.");
          It->getSecond().push_back(Last);
        }
      }
    } else if (!Last->isGather()) {
          (!S.areInstructionsWithCopyableElements() &&
           all_of(VL, [&](Value *V) { return S.isNonSchedulable(V); }))
        Last->setDoesNotNeedToSchedule();
      SmallPtrSet<Value *, 4> Processed;
      for (Value *V : VL) {
        if (S.isCopyableElement(V)) {
          Last->addCopyableElement(V);
        }
        auto It = ScalarToTreeEntries.find(V);
        if (It == ScalarToTreeEntries.end()) {
          ScalarToTreeEntries.try_emplace(V).first->getSecond().push_back(Last);
          (void)Processed.insert(V);
        } else if (Processed.insert(V).second) {
                 "Value already associated with the node.");
          It->getSecond().push_back(Last);
        }
      }
      assert((!Bundle.getBundle().empty() || Last->doesNotNeedToSchedule()) &&
             "Bundle and VL out of sync");
      if (!Bundle.getBundle().empty()) {
#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
        auto *BundleMember = Bundle.getBundle().begin();
        SmallPtrSet<Value *, 4> Processed;
        for (Value *V : VL) {
          if (S.isNonSchedulable(V) || !Processed.insert(V).second)
        }
        assert(BundleMember == Bundle.getBundle().end() &&
               "Bundle and VL out of sync");
#endif
        Bundle.setTreeEntry(Last);
      }
    } else {
      bool AllConstsOrCasts = true;
      for (Value *V : VL) {
        if (S && S.areInstructionsWithCopyableElements() &&
            S.isCopyableElement(V))
          Last->addCopyableElement(V);
        AllConstsOrCasts &= I && I->getType()->isIntegerTy();
        if (UserTreeIdx.EdgeIdx != UINT_MAX || !UserTreeIdx.UserTE ||
            !UserTreeIdx.UserTE->isGather())
          ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last);
      }
      if (AllConstsOrCasts)
            std::make_pair(std::numeric_limits<unsigned>::max(), 1);
      MustGather.insert_range(VL);
    }
    if (UserTreeIdx.UserTE)
      Last->UserTreeIndex = UserTreeIdx;
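    // Illustrative note (assumed bookkeeping above): every scalar of a new
    // tree entry is registered in ScalarToTreeEntries (or ScalarsInSplitNodes
    // for split nodes), so a later bundle containing the same value can reuse
    // the entry it was vectorized in instead of gathering it again.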
  TreeEntry::VecTreeTy VectorizableTree;

    for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
      VectorizableTree[Id]->dump();
    }

    assert(V && "V cannot be nullptr.");
    auto It = ScalarToTreeEntries.find(V);
    if (It == ScalarToTreeEntries.end())

    return It->getSecond();

    assert(V && "V cannot be nullptr.");
    auto It = ScalarsInSplitNodes.find(V);
    if (It == ScalarsInSplitNodes.end())

    return It->getSecond();

                           bool SameVF = false) const {
    assert(V && "V cannot be nullptr.");
    for (TreeEntry *TE : ScalarToTreeEntries.lookup(V))
      if ((!SameVF || TE->getVectorFactor() == VL.size()) && TE->isSame(VL))
  bool areAltOperandsProfitable(const InstructionsState &S,

  class ScalarsVectorizationLegality {
    InstructionsState S;
    bool TryToFindDuplicates;
    bool TrySplitVectorize;

    ScalarsVectorizationLegality(InstructionsState S, bool IsLegal,
                                 bool TryToFindDuplicates = true,
                                 bool TrySplitVectorize = false)
        : S(S), IsLegal(IsLegal), TryToFindDuplicates(TryToFindDuplicates),
          TrySplitVectorize(TrySplitVectorize) {
      assert((!IsLegal || (S.valid() && TryToFindDuplicates)) &&
             "Inconsistent state");
    }
    const InstructionsState &getInstructionsState() const { return S; };
    bool isLegal() const { return IsLegal; }
    bool tryToFindDuplicates() const { return TryToFindDuplicates; }
    bool trySplitVectorize() const { return TrySplitVectorize; }
  };

  ScalarsVectorizationLegality
                              bool TryCopyableElementsVectorization) const;

  TreeEntry::EntryState getScalarsVectorizationState(
      bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
      SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo);
  SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarToTreeEntries;

  SmallDenseMap<std::pair<const TreeEntry *, unsigned>, TreeEntry *>
      OperandsToTreeEntry;

  SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarsInSplitNodes;

  SmallDenseMap<Value *, unsigned> InstrElementSize;

  SmallDenseMap<const TreeEntry *, WeakTrackingVH> EntryToLastInstruction;

  SetVector<const TreeEntry *> PostponedGathers;

  using ValueToGatherNodesMap =
      DenseMap<Value *, SmallSetVector<const TreeEntry *, 4>>;
  ValueToGatherNodesMap ValueToGatherNodes;

  SetVector<unsigned> LoadEntriesToVectorize;

  bool IsGraphTransformMode = false;

  std::optional<unsigned> GatheredLoadsEntriesFirst;

  SmallDenseMap<const TreeEntry *,
                std::tuple<SmallVector<int>, VectorType *, unsigned, bool>>
      CompressEntryToData;
4625 struct ExternalUser {
4626 ExternalUser(
Value *S, llvm::User *U,
const TreeEntry &E,
unsigned L)
4627 : Scalar(S), User(
U), E(E), Lane(
L) {}
4630 Value *Scalar =
nullptr;
4633 llvm::User *User =
nullptr;
4641 using UserList = SmallVector<ExternalUser, 16>;
4647 bool isAliased(
const MemoryLocation &Loc1, Instruction *Inst1,
4648 Instruction *Inst2) {
4651 AliasCacheKey
Key = std::make_pair(Inst1, Inst2);
4652 auto Res = AliasCache.try_emplace(
Key);
4654 return Res.first->second;
4655 bool Aliased =
isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1));
4657 Res.first->getSecond() = Aliased;
4661 using AliasCacheKey = std::pair<Instruction *, Instruction *>;
4665 SmallDenseMap<AliasCacheKey, bool> AliasCache;
4670 BatchAAResults BatchAA;
4677 DenseSet<Instruction *> DeletedInstructions;
4680 SmallPtrSet<Instruction *, 16> AnalyzedReductionsRoots;
4683 DenseSet<size_t> AnalyzedReductionVals;
4687 DenseSet<Value *> AnalyzedMinBWVals;
4693 UserList ExternalUses;
4697 SmallPtrSet<Value *, 4> ExternalUsesAsOriginalScalar;
4701 SmallPtrSet<Value *, 4> ExternalUsesWithNonUsers;
4704 SmallPtrSet<const Value *, 32> EphValues;
4708 SetVector<Instruction *> GatherShuffleExtractSeq;
4711 DenseSet<BasicBlock *> CSEBlocks;
4714 DenseSet<size_t> ListOfKnonwnNonVectorizableLoads;
4721 class ScheduleEntity {
4722 friend class ScheduleBundle;
4723 friend class ScheduleData;
4724 friend class ScheduleCopyableData;
4727 enum class Kind { ScheduleData, ScheduleBundle, ScheduleCopyableData };
4728 Kind getKind()
const {
return K; }
4729 ScheduleEntity(Kind K) : K(K) {}
4733 int SchedulingPriority = 0;
4736 bool IsScheduled =
false;
4738 const Kind K = Kind::ScheduleData;
4741 ScheduleEntity() =
delete;
4743 void setSchedulingPriority(
int Priority) { SchedulingPriority = Priority; }
4744 int getSchedulingPriority()
const {
return SchedulingPriority; }
4745 bool isReady()
const {
4747 return SD->isReady();
4749 return CD->isReady();
4755 bool hasValidDependencies()
const {
4757 return SD->hasValidDependencies();
4759 return CD->hasValidDependencies();
4763 int getUnscheduledDeps()
const {
4765 return SD->getUnscheduledDeps();
4767 return CD->getUnscheduledDeps();
4771 int incrementUnscheduledDeps(
int Incr) {
4773 return SD->incrementUnscheduledDeps(Incr);
4777 int getDependencies()
const {
4779 return SD->getDependencies();
4785 return SD->getInst();
4790 bool isScheduled()
const {
return IsScheduled; }
4791 void setScheduled(
bool Scheduled) { IsScheduled = Scheduled; }
4793 static bool classof(
const ScheduleEntity *) {
return true; }
4795#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4796 void dump(raw_ostream &OS)
const {
4798 return SD->dump(OS);
4800 return CD->dump(OS);
4811#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4813 const BoUpSLP::ScheduleEntity &SE) {
4823 class ScheduleData final :
public ScheduleEntity {
4827 enum { InvalidDeps = -1 };
4829 ScheduleData() : ScheduleEntity(Kind::ScheduleData) {}
4830 static bool classof(
const ScheduleEntity *Entity) {
4831 return Entity->getKind() == Kind::ScheduleData;
4834 void init(
int BlockSchedulingRegionID, Instruction *
I) {
4835 NextLoadStore =
nullptr;
4836 IsScheduled =
false;
4837 SchedulingRegionID = BlockSchedulingRegionID;
4838 clearDependencies();
4844 if (hasValidDependencies()) {
4845 assert(UnscheduledDeps <= Dependencies &&
"invariant");
4847 assert(UnscheduledDeps == Dependencies &&
"invariant");
4851 assert(hasValidDependencies() && UnscheduledDeps == 0 &&
4852 "unexpected scheduled state");
4859 bool hasValidDependencies()
const {
return Dependencies != InvalidDeps; }
4863 bool isReady()
const {
return UnscheduledDeps == 0 && !IsScheduled; }
4868 int incrementUnscheduledDeps(
int Incr) {
4869 assert(hasValidDependencies() &&
4870 "increment of unscheduled deps would be meaningless");
4871 UnscheduledDeps += Incr;
4872 return UnscheduledDeps;
4877 void resetUnscheduledDeps() { UnscheduledDeps = Dependencies; }
4880 void clearDependencies() {
4881 clearDirectDependencies();
4882 MemoryDependencies.clear();
4883 ControlDependencies.clear();
4890 void clearDirectDependencies() {
4891 Dependencies = InvalidDeps;
4892 resetUnscheduledDeps();
4893 IsScheduled =
false;
4897 int getUnscheduledDeps()
const {
return UnscheduledDeps; }
4899 int getDependencies()
const {
return Dependencies; }
4901 void initDependencies() { Dependencies = 0; }
4903 void incDependencies() { Dependencies++; }
4906 int getSchedulingRegionID()
const {
return SchedulingRegionID; }
4913 return MemoryDependencies;
4916 void addMemoryDependency(ScheduleData *Dep) {
4917 MemoryDependencies.push_back(Dep);
4921 return ControlDependencies;
4924 void addControlDependency(ScheduleData *Dep) {
4925 ControlDependencies.push_back(Dep);
4928 ScheduleData *getNextLoadStore()
const {
return NextLoadStore; }
4929 void setNextLoadStore(ScheduleData *
Next) { NextLoadStore =
Next; }
4931 void dump(raw_ostream &OS)
const { OS << *Inst; }
4943 ScheduleData *NextLoadStore =
nullptr;
4947 SmallVector<ScheduleData *> MemoryDependencies;
4953 SmallVector<ScheduleData *> ControlDependencies;
4957 int SchedulingRegionID = 0;
4963 int Dependencies = InvalidDeps;
4969 int UnscheduledDeps = InvalidDeps;
4974 const BoUpSLP::ScheduleData &SD) {
4980 class ScheduleBundle final :
public ScheduleEntity {
4984 bool IsValid =
true;
4986 TreeEntry *TE =
nullptr;
4987 ScheduleBundle(
bool IsValid)
4988 : ScheduleEntity(Kind::ScheduleBundle), IsValid(IsValid) {}
4991 ScheduleBundle() : ScheduleEntity(Kind::ScheduleBundle) {}
4992 static bool classof(
const ScheduleEntity *Entity) {
4993 return Entity->getKind() == Kind::ScheduleBundle;
4998 for (
const ScheduleEntity *SD : Bundle) {
4999 if (SD->hasValidDependencies()) {
5000 assert(SD->getUnscheduledDeps() <= SD->getDependencies() &&
5003 assert(SD->getUnscheduledDeps() == SD->getDependencies() &&
5007 if (isScheduled()) {
5008 assert(SD->hasValidDependencies() && SD->getUnscheduledDeps() == 0 &&
5009 "unexpected scheduled state");
5015 int unscheduledDepsInBundle()
const {
5016 assert(*
this &&
"bundle must not be empty");
5018 for (
const ScheduleEntity *BundleMember : Bundle) {
5019 if (BundleMember->getUnscheduledDeps() == ScheduleData::InvalidDeps)
5020 return ScheduleData::InvalidDeps;
5021 Sum += BundleMember->getUnscheduledDeps();
5029 bool hasValidDependencies()
const {
5030 return all_of(Bundle, [](
const ScheduleEntity *SD) {
5031 return SD->hasValidDependencies();
5037 bool isReady()
const {
5038 assert(*
this &&
"bundle must not be empty");
5039 return unscheduledDepsInBundle() == 0 && !isScheduled();
5047 void add(ScheduleEntity *SD) { Bundle.push_back(SD); }
5050 void setTreeEntry(TreeEntry *TE) { this->TE = TE; }
5051 TreeEntry *getTreeEntry()
const {
return TE; }
5053 static ScheduleBundle invalid() {
return {
false}; }
5055 operator bool()
const {
return IsValid; }
5058 void dump(raw_ostream &OS)
const {
5067 OS << *SD->getInst();
5081 const BoUpSLP::ScheduleBundle &Bundle) {
5092 class ScheduleCopyableData final :
public ScheduleEntity {
5099 int SchedulingRegionID = 0;
5101 ScheduleBundle &Bundle;
5104 ScheduleCopyableData(
int BlockSchedulingRegionID,
Instruction *
I,
5105 const EdgeInfo &EI, ScheduleBundle &Bundle)
5106 : ScheduleEntity(Kind::ScheduleCopyableData), Inst(
I), EI(EI),
5107 SchedulingRegionID(BlockSchedulingRegionID), Bundle(Bundle) {}
5108 static bool classof(
const ScheduleEntity *Entity) {
5109 return Entity->getKind() == Kind::ScheduleCopyableData;
5114 if (hasValidDependencies()) {
5115 assert(UnscheduledDeps <= Dependencies &&
"invariant");
5117 assert(UnscheduledDeps == Dependencies &&
"invariant");
5121 assert(hasValidDependencies() && UnscheduledDeps == 0 &&
5122 "unexpected scheduled state");
5129 bool hasValidDependencies()
const {
5130 return Dependencies != ScheduleData::InvalidDeps;
5135 bool isReady()
const {
return UnscheduledDeps == 0 && !IsScheduled; }
5140 int incrementUnscheduledDeps(
int Incr) {
5141 assert(hasValidDependencies() &&
5142 "increment of unscheduled deps would be meaningless");
5143 UnscheduledDeps += Incr;
5144 assert(UnscheduledDeps >= 0 &&
"invariant");
5145 return UnscheduledDeps;
5150 void resetUnscheduledDeps() { UnscheduledDeps = Dependencies; }
5153 int getUnscheduledDeps()
const {
return UnscheduledDeps; }
5155 int getDependencies()
const {
return Dependencies; }
5157 void initDependencies() { Dependencies = 0; }
5159 void incDependencies() { Dependencies++; }
5162 int getSchedulingRegionID()
const {
return SchedulingRegionID; }
5168 void clearDependencies() {
5169 Dependencies = ScheduleData::InvalidDeps;
5170 UnscheduledDeps = ScheduleData::InvalidDeps;
5171 IsScheduled =
false;
5175 const EdgeInfo &getEdgeInfo()
const {
return EI; }
5178 ScheduleBundle &getBundle() {
return Bundle; }
5179 const ScheduleBundle &getBundle()
const {
return Bundle; }
5181#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
5182 void dump(raw_ostream &OS)
const { OS <<
"[Copyable]" << *getInst(); }
5193 int Dependencies = ScheduleData::InvalidDeps;
5199 int UnscheduledDeps = ScheduleData::InvalidDeps;
5229 struct BlockScheduling {
5231 : BB(BB), ChunkSize(BB->
size()), ChunkPos(ChunkSize) {}
5234 ScheduledBundles.clear();
5235 ScheduledBundlesList.
clear();
5236 ScheduleCopyableDataMap.clear();
5237 ScheduleCopyableDataMapByInst.clear();
5238 ScheduleCopyableDataMapByInstUser.clear();
5239 ScheduleCopyableDataMapByUsers.clear();
5241 ScheduleStart =
nullptr;
5242 ScheduleEnd =
nullptr;
5243 FirstLoadStoreInRegion =
nullptr;
5244 LastLoadStoreInRegion =
nullptr;
5245 RegionHasStackSave =
false;
5249 ScheduleRegionSizeLimit -= ScheduleRegionSize;
5252 ScheduleRegionSize = 0;
5256 ++SchedulingRegionID;
5262 if (BB !=
I->getParent())
5265 ScheduleData *SD = ScheduleDataMap.lookup(
I);
5266 if (SD && isInSchedulingRegion(*SD))
5271 ScheduleData *getScheduleData(
Value *V) {
5277 ScheduleCopyableData *getScheduleCopyableData(
const EdgeInfo &EI,
5278 const Value *V)
const {
5279 if (ScheduleCopyableDataMap.empty())
5281 auto It = ScheduleCopyableDataMap.find(std::make_pair(EI, V));
5282 if (It == ScheduleCopyableDataMap.end())
5284 ScheduleCopyableData *SD = It->getSecond().get();
5285 if (!isInSchedulingRegion(*SD))
5293 getScheduleCopyableData(
const Value *User,
unsigned OperandIdx,
5295 if (ScheduleCopyableDataMapByInstUser.empty())
5297 const auto It = ScheduleCopyableDataMapByInstUser.find(
5298 std::make_pair(std::make_pair(User, OperandIdx), V));
5299 if (It == ScheduleCopyableDataMapByInstUser.end())
5302 for (ScheduleCopyableData *SD : It->getSecond()) {
5303 if (isInSchedulingRegion(*SD))
5317 bool areAllOperandsReplacedByCopyableData(Instruction *User,
5321 if (ScheduleCopyableDataMap.empty())
5323 SmallDenseMap<TreeEntry *, unsigned> PotentiallyReorderedEntriesCount;
5324 SmallDenseMap<const TreeEntry *, unsigned> OrderedEntriesCount;
5325 for (
const Use &U :
User->operands()) {
5329 if (Entries.
empty())
5333 for (TreeEntry *TE : Entries) {
5339 bool IsCommutativeUser =
5344 OrderedEntriesCount.
try_emplace(TE, 0).first->getSecond();
5345 EdgeInfo EI(TE,
U.getOperandNo());
5346 if (!getScheduleCopyableData(EI,
Op))
5352 ++PotentiallyReorderedEntriesCount.
try_emplace(TE, 0)
5353 .first->getSecond();
5356 if (PotentiallyReorderedEntriesCount.
empty())
5357 return all_of(OrderedEntriesCount,
5358 [&](
const std::pair<const TreeEntry *, unsigned> &
P) {
5362 for (
auto &
P : PotentiallyReorderedEntriesCount) {
5363 auto *It =
find(
P.first->Scalars, User);
5364 assert(It !=
P.first->Scalars.end() &&
"User is not in the tree entry");
5365 int Lane = std::distance(
P.first->Scalars.begin(), It);
5366 assert(Lane >= 0 &&
"Lane is not found");
5368 Lane =
P.first->ReorderIndices[Lane];
5369 assert(Lane <
static_cast<int>(
P.first->Scalars.size()) &&
5370 "Couldn't find extract lane");
5371 SmallVector<unsigned> OpIndices;
5372 for (
unsigned OpIdx :
5374 P.first->getMainOp()))) {
5375 if (
P.first->getOperand(
OpIdx)[Lane] ==
Op &&
5376 getScheduleCopyableData(EdgeInfo(
P.first,
OpIdx),
Op))
5380 return all_of(PotentiallyReorderedEntriesCount,
5381 [&](
const std::pair<const TreeEntry *, unsigned> &
P) {
5382 return P.second ==
NumOps - 1;
5384 all_of(OrderedEntriesCount,
5385 [&](
const std::pair<const TreeEntry *, unsigned> &
P) {
5391 getScheduleCopyableData(
const Instruction *
I)
const {
5392 if (ScheduleCopyableDataMapByInst.empty())
5394 const auto It = ScheduleCopyableDataMapByInst.find(
I);
5395 if (It == ScheduleCopyableDataMapByInst.end())
5398 for (ScheduleCopyableData *SD : It->getSecond()) {
5399 if (isInSchedulingRegion(*SD))
5406 getScheduleCopyableDataUsers(
const Instruction *User)
const {
5407 if (ScheduleCopyableDataMapByUsers.empty())
5409 const auto It = ScheduleCopyableDataMapByUsers.find(User);
5410 if (It == ScheduleCopyableDataMapByUsers.end())
5413 for (ScheduleCopyableData *SD : It->getSecond()) {
5414 if (isInSchedulingRegion(*SD))
5420 ScheduleCopyableData &addScheduleCopyableData(
const EdgeInfo &EI,
5422 int SchedulingRegionID,
5423 ScheduleBundle &Bundle) {
5424 assert(!getScheduleCopyableData(EI,
I) &&
"already in the map");
5425 ScheduleCopyableData *CD =
5426 ScheduleCopyableDataMap
5427 .try_emplace(std::make_pair(EI,
I),
5428 std::make_unique<ScheduleCopyableData>(
5429 SchedulingRegionID,
I, EI, Bundle))
5432 ScheduleCopyableDataMapByInst[
I].push_back(CD);
5436 assert(It !=
Op.end() &&
"Lane not set");
5437 SmallPtrSet<Instruction *, 4> Visited;
5439 int Lane = std::distance(
Op.begin(), It);
5440 assert(Lane >= 0 &&
"Lane not set");
5442 !EI.UserTE->ReorderIndices.empty())
5443 Lane = EI.UserTE->ReorderIndices[Lane];
5444 assert(Lane <
static_cast<int>(EI.UserTE->Scalars.size()) &&
5445 "Couldn't find extract lane");
5447 if (!Visited.
insert(In).second) {
5451 ScheduleCopyableDataMapByInstUser
5452 .try_emplace(std::make_pair(std::make_pair(In, EI.EdgeIdx),
I))
5455 ScheduleCopyableDataMapByUsers.try_emplace(
I)
5462 EdgeInfo UserEI = EI.UserTE->UserTreeIndex;
5463 if (ScheduleCopyableData *UserCD =
5464 getScheduleCopyableData(UserEI, In))
5465 ScheduleCopyableDataMapByUsers[
I].remove(UserCD);
5468 }
while (It !=
Op.end());
5470 ScheduleCopyableDataMapByUsers.try_emplace(
I).first->getSecond().insert(
5480 auto It = ScheduledBundles.find(
I);
5481 if (It == ScheduledBundles.end())
5483 return It->getSecond();
5487 bool isInSchedulingRegion(
const ScheduleEntity &SD)
const {
5489 return Data->getSchedulingRegionID() == SchedulingRegionID;
5491 return CD->getSchedulingRegionID() == SchedulingRegionID;
5493 [&](
const ScheduleEntity *BundleMember) {
5494 return isInSchedulingRegion(*BundleMember);
5500 template <
typename ReadyListType>
5501 void schedule(
const BoUpSLP &R,
const InstructionsState &S,
5502 const EdgeInfo &EI, ScheduleEntity *
Data,
5503 ReadyListType &ReadyList) {
5504 auto ProcessBundleMember = [&](ScheduleEntity *BundleMember,
5509 auto DecrUnsched = [&](
auto *
Data,
bool IsControl =
false) {
5510 if ((IsControl ||
Data->hasValidDependencies()) &&
5511 Data->incrementUnscheduledDeps(-1) == 0) {
5518 CopyableBundle.
push_back(&CD->getBundle());
5519 Bundles = CopyableBundle;
5521 Bundles = getScheduleBundles(
Data->getInst());
5523 if (!Bundles.
empty()) {
5524 for (ScheduleBundle *Bundle : Bundles) {
5525 if (Bundle->unscheduledDepsInBundle() == 0) {
5526 assert(!Bundle->isScheduled() &&
5527 "already scheduled bundle gets ready");
5528 ReadyList.insert(Bundle);
5530 <<
"SLP: gets ready: " << *Bundle <<
"\n");
5536 "already scheduled bundle gets ready");
5538 "Expected non-copyable data");
5539 ReadyList.insert(
Data);
5546 if (!ScheduleCopyableDataMap.empty()) {
5548 getScheduleCopyableData(User,
OpIdx,
I);
5549 for (ScheduleCopyableData *CD : CopyableData)
5550 DecrUnsched(CD,
false);
5551 if (!CopyableData.empty())
5554 if (ScheduleData *OpSD = getScheduleData(
I))
5555 DecrUnsched(OpSD,
false);
5561 if (!Bundles.empty()) {
5562 auto *
In = BundleMember->getInst();
5564 SmallDenseMap<const Instruction *, unsigned> OperandsUses;
5565 unsigned TotalOpCount = 0;
5568 TotalOpCount = OperandsUses[
In] = 1;
5570 for (
const Use &U :
In->operands()) {
5573 ++Res.first->getSecond();
5580 auto DecrUnschedForInst =
5582 SmallDenseSet<std::pair<const ScheduleEntity *, unsigned>>
5584 if (!ScheduleCopyableDataMap.empty()) {
5585 const EdgeInfo EI = {UserTE,
OpIdx};
5586 if (ScheduleCopyableData *CD =
5587 getScheduleCopyableData(EI,
I)) {
5588 if (!Checked.insert(std::make_pair(CD,
OpIdx)).second)
5590 DecrUnsched(CD,
false);
5594 auto It = OperandsUses.
find(
I);
5595 assert(It != OperandsUses.
end() &&
"Operand not found");
5596 if (It->second > 0) {
5598 assert(TotalOpCount > 0 &&
"No more operands to decrement");
5600 if (ScheduleData *OpSD = getScheduleData(
I)) {
5601 if (!Checked.insert(std::make_pair(OpSD,
OpIdx)).second)
5603 DecrUnsched(OpSD,
false);
5608 for (ScheduleBundle *Bundle : Bundles) {
5609 if (ScheduleCopyableDataMap.empty() && TotalOpCount == 0)
5613 auto *It =
find(Bundle->getTreeEntry()->Scalars, In);
5614 SmallDenseSet<std::pair<const ScheduleEntity *, unsigned>> Checked;
5617 std::distance(Bundle->getTreeEntry()->Scalars.begin(), It);
5618 assert(Lane >= 0 &&
"Lane not set");
5620 !Bundle->getTreeEntry()->ReorderIndices.empty())
5621 Lane = Bundle->getTreeEntry()->ReorderIndices[Lane];
5622 assert(Lane <
static_cast<int>(
5623 Bundle->getTreeEntry()->Scalars.size()) &&
5624 "Couldn't find extract lane");
5634 In->getNumOperands() ==
5635 Bundle->getTreeEntry()->getNumOperands() ||
5636 Bundle->getTreeEntry()->isCopyableElement(In)) &&
5637 "Missed TreeEntry operands?");
5639 for (
unsigned OpIdx :
5642 Bundle->getTreeEntry()->getOperand(
OpIdx)[Lane])) {
5645 DecrUnschedForInst(
I, Bundle->getTreeEntry(),
OpIdx, Checked);
5648 if (!Bundle->getTreeEntry()->doesNotNeedToSchedule())
5650 It = std::find(std::next(It),
5651 Bundle->getTreeEntry()->Scalars.end(), In);
5652 }
while (It != Bundle->getTreeEntry()->Scalars.end());
5657 for (Use &U : BundleMember->getInst()->operands()) {
5660 <<
"SLP: check for readiness (def): " << *
I <<
"\n");
5661 DecrUnschedForInst(BundleMember->getInst(),
U.getOperandNo(),
I);
5669 SmallPtrSet<const ScheduleData *, 4> VisitedMemory;
5670 for (ScheduleData *MemoryDep : SD->getMemoryDependencies()) {
5671 if (!VisitedMemory.
insert(MemoryDep).second)
5676 << *MemoryDep <<
"\n");
5677 DecrUnsched(MemoryDep);
5680 SmallPtrSet<const ScheduleData *, 4> VisitedControl;
5681 for (ScheduleData *Dep : SD->getControlDependencies()) {
5682 if (!VisitedControl.
insert(Dep).second)
5687 <<
"SLP: check for readiness (ctrl): " << *Dep <<
"\n");
5688 DecrUnsched(Dep,
true);
5692 SD->setScheduled(
true);
5697 if (
R.isVectorized(In)) {
5699 for (TreeEntry *TE : Entries) {
5701 In->getNumOperands() !=
TE->getNumOperands())
5704 PseudoBundles.
emplace_back(std::make_unique<ScheduleBundle>());
5705 BundlePtr->setTreeEntry(TE);
5710 ProcessBundleMember(SD, Bundles);
5713 Bundle.setScheduled(
true);
5715 auto AreAllBundlesScheduled =
5716 [&](
const ScheduleEntity *SD,
5720 return !SDBundles.empty() &&
5721 all_of(SDBundles, [&](
const ScheduleBundle *SDBundle) {
5722 return SDBundle->isScheduled();
5725 for (ScheduleEntity *SD : Bundle.getBundle()) {
5728 SDBundles = getScheduleBundles(SD->getInst());
5729 if (AreAllBundlesScheduled(SD, SDBundles)) {
5730 SD->setScheduled(
true);
5743 assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
5744 ScheduleStart->comesBefore(ScheduleEnd) &&
5745 "Not a valid scheduling region?");
5747 for (
auto *
I = ScheduleStart;
I != ScheduleEnd;
I =
I->getNextNode()) {
5749 if (!Bundles.
empty()) {
5750 for (ScheduleBundle *Bundle : Bundles) {
5751 assert(isInSchedulingRegion(*Bundle) &&
5752 "primary schedule data not in window?");
5757 auto *SD = getScheduleData(
I);
5760 assert(isInSchedulingRegion(*SD) &&
5761 "primary schedule data not in window?");
5766 [](
const ScheduleEntity *Bundle) {
5767 return Bundle->isReady();
5769 "item in ready list not ready?");
5773 template <
typename ReadyListType>
5774 void initialFillReadyList(ReadyListType &ReadyList) {
5775 SmallPtrSet<ScheduleBundle *, 16> Visited;
5776 for (
auto *
I = ScheduleStart;
I != ScheduleEnd;
I =
I->getNextNode()) {
5777 ScheduleData *SD = getScheduleData(
I);
5778 if (SD && SD->hasValidDependencies() && SD->isReady()) {
5781 for (ScheduleBundle *Bundle : Bundles) {
5782 if (!Visited.
insert(Bundle).second)
5784 if (Bundle->hasValidDependencies() && Bundle->isReady()) {
5785 ReadyList.insert(Bundle);
5787 << *Bundle <<
"\n");
5792 ReadyList.insert(SD);
5794 <<
"SLP: initially in ready list: " << *SD <<
"\n");
5805 const InstructionsState &S,
const EdgeInfo &EI);
5812 std::optional<ScheduleBundle *>
5814 const InstructionsState &S,
const EdgeInfo &EI);
5817 ScheduleData *allocateScheduleDataChunks();
5821 bool extendSchedulingRegion(
Value *V,
const InstructionsState &S);
5825 void initScheduleData(Instruction *FromI, Instruction *ToI,
5826 ScheduleData *PrevLoadStore,
5827 ScheduleData *NextLoadStore);
5831 void calculateDependencies(ScheduleBundle &Bundle,
bool InsertInReadyList,
5836 void resetSchedule();
5853 SmallDenseMap<Instruction *, ScheduleData *> ScheduleDataMap;
5857 SmallDenseMap<std::pair<EdgeInfo, const Value *>,
5858 std::unique_ptr<ScheduleCopyableData>>
5859 ScheduleCopyableDataMap;
5865 SmallDenseMap<const Instruction *, SmallVector<ScheduleCopyableData *>>
5866 ScheduleCopyableDataMapByInst;
5872 SmallDenseMap<std::pair<std::pair<const Value *, unsigned>,
const Value *>,
5874 ScheduleCopyableDataMapByInstUser;
5894 SmallSetVector<ScheduleCopyableData *, 4>>
5895 ScheduleCopyableDataMapByUsers;
5898 SmallDenseMap<Instruction *, SmallVector<ScheduleBundle *>>
5904 SetVector<ScheduleEntity *> ReadyInsts;
5914 ScheduleData *FirstLoadStoreInRegion =
nullptr;
5918 ScheduleData *LastLoadStoreInRegion =
nullptr;
5923 bool RegionHasStackSave =
false;
5926 int ScheduleRegionSize = 0;
5935 int SchedulingRegionID = 1;
5939 MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
5943 void scheduleBlock(
const BoUpSLP &R, BlockScheduling *BS);
5946 const SmallDenseSet<Value *> *UserIgnoreList =
nullptr;
5950 struct OrdersTypeDenseMapInfo {
5963 static unsigned getHashValue(
const OrdersType &V) {
5974 ScalarEvolution *SE;
5975 TargetTransformInfo *TTI;
5976 TargetLibraryInfo *TLI;
5979 AssumptionCache *AC;
5981 const DataLayout *DL;
5982 OptimizationRemarkEmitter *ORE;
5984 unsigned MaxVecRegSize;
5985 unsigned MinVecRegSize;
5988 IRBuilder<TargetFolder> Builder;
5995 DenseMap<const TreeEntry *, std::pair<uint64_t, bool>> MinBWs;
6000 unsigned ReductionBitWidth = 0;
6003 unsigned BaseGraphSize = 1;
6007 std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;
6011 DenseSet<unsigned> ExtraBitWidthNodes;
6021 SecondInfo::getEmptyKey());
6026 SecondInfo::getTombstoneKey());
6031 SecondInfo::getHashValue(Val.
EdgeIdx));
6052 ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
6063 return R.VectorizableTree[0].get();
6067 return {&
N->UserTreeIndex,
N->Container};
6071 return {&
N->UserTreeIndex + 1,
N->Container};
6098 static unsigned size(
BoUpSLP *R) {
return R->VectorizableTree.size(); }
6109 OS << Entry->Idx <<
".\n";
6112 for (
auto *V : Entry->Scalars) {
6114 if (
llvm::any_of(R->ExternalUses, [&](
const BoUpSLP::ExternalUser &EU) {
6115 return EU.Scalar == V;
6125 if (Entry->isGather())
6127 if (Entry->State == TreeEntry::ScatterVectorize ||
6128 Entry->State == TreeEntry::StridedVectorize ||
6129 Entry->State == TreeEntry::CompressVectorize)
6130 return "color=blue";
6139 for (
auto *
I : DeletedInstructions) {
6140 if (!
I->getParent()) {
6145 I->insertBefore(F->getEntryBlock(),
6146 F->getEntryBlock().getFirstNonPHIIt());
6148 I->insertBefore(F->getEntryBlock().getTerminator()->getIterator());
6151 for (
Use &U :
I->operands()) {
6153 if (
Op && !DeletedInstructions.count(
Op) &&
Op->hasOneUser() &&
6157 I->dropAllReferences();
6159 for (
auto *
I : DeletedInstructions) {
6161 "trying to erase instruction with users.");
6162 I->eraseFromParent();
6168#ifdef EXPENSIVE_CHECKS
6179 assert(!Mask.empty() && Reuses.
size() == Mask.size() &&
6180 "Expected non-empty mask.");
6183 for (
unsigned I = 0,
E = Prev.
size();
I <
E; ++
I)
6185 Reuses[Mask[
I]] = Prev[
I];
6193 bool BottomOrder =
false) {
6194 assert(!Mask.empty() &&
"Expected non-empty mask.");
6195 unsigned Sz = Mask.size();
6198 if (Order.
empty()) {
6200 std::iota(PrevOrder.
begin(), PrevOrder.
end(), 0);
6202 PrevOrder.
swap(Order);
6205 for (
unsigned I = 0;
I < Sz; ++
I)
6207 Order[
I] = PrevOrder[Mask[
I]];
6209 return Data.value() == Sz ||
Data.index() ==
Data.value();
6218 if (Order.
empty()) {
6220 std::iota(MaskOrder.
begin(), MaskOrder.
end(), 0);
6230 for (
unsigned I = 0;
I < Sz; ++
I)
6232 Order[MaskOrder[
I]] =
I;
6236std::optional<BoUpSLP::OrdersType>
6238 bool TopToBottom,
bool IgnoreReorder) {
6239 assert(TE.isGather() &&
"Expected gather node only.");
6243 Type *ScalarTy = GatheredScalars.
front()->getType();
6244 size_t NumScalars = GatheredScalars.
size();
6246 return std::nullopt;
6253 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
6255 isGatherShuffledEntry(&TE, GatheredScalars, Mask, Entries, NumParts,
6258 if (GatherShuffles.
empty() && ExtractShuffles.
empty())
6259 return std::nullopt;
6260 OrdersType CurrentOrder(NumScalars, NumScalars);
6261 if (GatherShuffles.
size() == 1 &&
6263 Entries.
front().front()->isSame(TE.Scalars)) {
6267 return std::nullopt;
6269 if (Entries.
front().front()->UserTreeIndex.UserTE ==
6270 TE.UserTreeIndex.UserTE)
6271 return std::nullopt;
6274 if (!IgnoreReorder && Entries.
front().front()->Idx == 0)
6275 return std::nullopt;
6278 if (!Entries.
front().front()->ReuseShuffleIndices.empty() &&
6279 TE.getVectorFactor() == 2 && Mask.size() == 2 &&
6282 return P.value() % 2 != static_cast<int>(P.index()) % 2;
6284 return std::nullopt;
6288 std::iota(CurrentOrder.
begin(), CurrentOrder.
end(), 0);
6289 return CurrentOrder;
6293 return all_of(Mask, [&](
int I) {
6300 if ((ExtractShuffles.
empty() && IsSplatMask(Mask) &&
6301 (Entries.
size() != 1 ||
6302 Entries.
front().front()->ReorderIndices.empty())) ||
6303 (GatherShuffles.
empty() && IsSplatMask(ExtractMask)))
6304 return std::nullopt;
6310 if (ShuffledSubMasks.
test(
I))
6312 const int VF = GetVF(
I);
6318 if (
any_of(Slice, [&](
unsigned I) {
return I != NumScalars; })) {
6320 ShuffledSubMasks.
set(
I);
6324 int FirstMin = INT_MAX;
6325 int SecondVecFound =
false;
6327 int Idx = Mask[
I * PartSz + K];
6329 Value *V = GatheredScalars[
I * PartSz + K];
6331 SecondVecFound =
true;
6340 SecondVecFound =
true;
6344 FirstMin = (FirstMin / PartSz) * PartSz;
6346 if (SecondVecFound) {
6348 ShuffledSubMasks.
set(
I);
6352 int Idx = Mask[
I * PartSz + K];
6356 if (Idx >= PartSz) {
6357 SecondVecFound =
true;
6360 if (CurrentOrder[
I * PartSz + Idx] >
6361 static_cast<unsigned>(
I * PartSz + K) &&
6362 CurrentOrder[
I * PartSz + Idx] !=
6363 static_cast<unsigned>(
I * PartSz + Idx))
6364 CurrentOrder[
I * PartSz + Idx] =
I * PartSz + K;
6367 if (SecondVecFound) {
6369 ShuffledSubMasks.
set(
I);
6375 if (!ExtractShuffles.
empty())
6376 TransformMaskToOrder(
6377 CurrentOrder, ExtractMask, PartSz, NumParts, [&](
unsigned I) {
6378 if (!ExtractShuffles[
I])
6381 unsigned Sz =
getNumElems(TE.getVectorFactor(), PartSz,
I);
6383 int K =
I * PartSz + Idx;
6386 if (!TE.ReuseShuffleIndices.empty())
6387 K = TE.ReuseShuffleIndices[K];
6390 if (!TE.ReorderIndices.empty())
6391 K = std::distance(TE.ReorderIndices.begin(),
6392 find(TE.ReorderIndices, K));
6398 .getKnownMinValue());
6403 if (GatherShuffles.
size() == 1 && NumParts != 1) {
6404 if (ShuffledSubMasks.
any())
6405 return std::nullopt;
6406 PartSz = NumScalars;
6409 if (!Entries.
empty())
6410 TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](
unsigned I) {
6411 if (!GatherShuffles[
I])
6413 return std::max(Entries[
I].front()->getVectorFactor(),
6414 Entries[
I].back()->getVectorFactor());
6416 unsigned NumUndefs =
count(CurrentOrder, NumScalars);
6417 if (ShuffledSubMasks.
all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2))
6418 return std::nullopt;
6419 return std::move(CurrentOrder);
6424 bool CompareOpcodes =
true) {
6430 return (!GEP1 || GEP1->getNumOperands() == 2) &&
6431 (!GEP2 || GEP2->getNumOperands() == 2) &&
6432 (((!GEP1 ||
isConstant(GEP1->getOperand(1))) &&
6433 (!GEP2 ||
isConstant(GEP2->getOperand(1)))) ||
6436 getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)));
6440template <
typename T>
6445 return CommonAlignment;
6451 "Order is empty. Please check it before using isReverseOrder.");
6452 unsigned Sz = Order.
size();
6454 return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
6465 const SCEV *PtrSCEVLowest =
nullptr;
6466 const SCEV *PtrSCEVHighest =
nullptr;
6474 if (!PtrSCEVLowest && !PtrSCEVHighest) {
6475 PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
6482 PtrSCEVLowest = PtrSCEV;
6489 PtrSCEVHighest = PtrSCEV;
6497 int Size =
DL.getTypeStoreSize(ElemTy);
6498 auto TryGetStride = [&](
const SCEV *Dist,
6499 const SCEV *Multiplier) ->
const SCEV * {
6501 if (M->getOperand(0) == Multiplier)
6502 return M->getOperand(1);
6503 if (M->getOperand(1) == Multiplier)
6504 return M->getOperand(0);
6507 if (Multiplier == Dist)
6512 const SCEV *Stride =
nullptr;
6513 if (
Size != 1 || SCEVs.
size() > 2) {
6515 Stride = TryGetStride(Dist, Sz);
6523 using DistOrdPair = std::pair<int64_t, int>;
6525 std::set<DistOrdPair,
decltype(Compare)> Offsets(Compare);
6527 bool IsConsecutive =
true;
6528 for (
const SCEV *PtrSCEV : SCEVs) {
6530 if (PtrSCEV != PtrSCEVLowest) {
6532 const SCEV *Coeff = TryGetStride(Diff, Stride);
6542 Dist = SC->getAPInt().getZExtValue();
6547 auto Res = Offsets.emplace(Dist, Cnt);
6551 IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
6554 if (Offsets.size() != SCEVs.
size())
6556 SortedIndices.
clear();
6557 if (!IsConsecutive) {
6561 for (
const std::pair<int64_t, int> &Pair : Offsets) {
6562 SortedIndices[Cnt] = Pair.second;
6569static std::pair<InstructionCost, InstructionCost>
6572 Type *ScalarTy, VectorType *VecTy);
6590 int NumSrcElts = Tp->getElementCount().getKnownMinValue();
6593 Mask, NumSrcElts, NumSubElts, Index)) {
6594 if (Index + NumSubElts > NumSrcElts &&
6595 Index + NumSrcElts <=
static_cast<int>(
Mask.size()))
6612 "ScalableVectorType is not supported.");
6615 "Incorrect usage.");
6620 unsigned ScalarTyNumElements = VecTy->getNumElements();
6623 if (!DemandedElts[
I])
6627 I * ScalarTyNumElements, VecTy);
6630 I * ScalarTyNumElements, VecTy);
6643 ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) {
6644 if (Opcode == Instruction::ExtractElement) {
6650 Index * VecTy->getNumElements(), VecTy);
6653 return TTI.getVectorInstrCost(Opcode, Val,
CostKind, Index, Scalar,
6666 getWidenedType(VecTy->getElementType(), ScalarTy->getNumElements());
6668 Index * ScalarTy->getNumElements(), SubTp) +
6672 return TTI.getExtractWithExtendCost(Opcode, Dst, VecTy, Index,
CostKind);
6688 auto *Begin = std::next(
Mask.begin(), Index);
6689 std::iota(Begin, std::next(Begin, SubVecVF), 0);
6693 std::iota(
Mask.begin(),
Mask.end(), 0);
6694 std::iota(std::next(
Mask.begin(), Index),
6695 std::next(
Mask.begin(), Index + SubVecVF), VecVF);
6697 return Generator(Vec, V, Mask);
6700 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), SubVecVF), 0);
6708 unsigned SubVecVF,
unsigned Index) {
6710 std::iota(Mask.begin(), Mask.end(), Index);
6711 return Builder.CreateShuffleVector(Vec, Mask);
6721 const unsigned Sz = PointerOps.
size();
6724 CompressMask[0] = 0;
6726 std::optional<unsigned> Stride = 0;
6730 std::optional<int64_t> OptPos =
6732 if (!OptPos || OptPos > std::numeric_limits<unsigned>::max())
6734 unsigned Pos =
static_cast<unsigned>(*OptPos);
6735 CompressMask[
I] = Pos;
6742 if (Pos != *Stride *
I)
6745 return Stride.has_value();
6758 InterleaveFactor = 0;
6760 const size_t Sz = VL.
size();
6768 if (AreAllUsersVectorized(V))
6771 TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy,
CostKind,
6772 Mask.empty() ?
I : Mask[
I]);
6775 if (ExtractCost <= ScalarCost)
6780 if (Order.
empty()) {
6781 Ptr0 = PointerOps.
front();
6782 PtrN = PointerOps.
back();
6784 Ptr0 = PointerOps[Order.
front()];
6785 PtrN = PointerOps[Order.
back()];
6787 std::optional<int64_t> Diff =
6791 const size_t MaxRegSize =
6795 if (*Diff / Sz >= MaxRegSize / 8)
6799 Align CommonAlignment = LI->getAlign();
6801 Ptr0, LoadVecTy, CommonAlignment,
DL,
6804 if (IsMasked && !
TTI.isLegalMaskedLoad(LoadVecTy, CommonAlignment,
6805 LI->getPointerAddressSpace()))
6811 assert(CompressMask.
size() >= 2 &&
"At least two elements are required");
6815 auto [ScalarGEPCost, VectorGEPCost] =
6817 Instruction::GetElementPtr,
CostKind, ScalarTy, LoadVecTy);
6835 TTI.getMaskedMemoryOpCost(Instruction::Load, LoadVecTy, CommonAlignment,
6836 LI->getPointerAddressSpace(),
CostKind);
6839 TTI.getMemoryOpCost(Instruction::Load, LoadVecTy, CommonAlignment,
6840 LI->getPointerAddressSpace(),
CostKind);
6842 if (IsStrided && !IsMasked && Order.
empty()) {
6849 AlignedLoadVecTy = LoadVecTy;
6850 if (
TTI.isLegalInterleavedAccessType(AlignedLoadVecTy, CompressMask[1],
6852 LI->getPointerAddressSpace())) {
6854 VectorGEPCost +
TTI.getInterleavedMemoryOpCost(
6855 Instruction::Load, AlignedLoadVecTy,
6856 CompressMask[1], {}, CommonAlignment,
6857 LI->getPointerAddressSpace(),
CostKind, IsMasked);
6858 if (InterleavedCost < GatherCost) {
6859 InterleaveFactor = CompressMask[1];
6860 LoadVecTy = AlignedLoadVecTy;
6867 if (!Order.
empty()) {
6870 NewMask[
I] = CompressMask[Mask[
I]];
6872 CompressMask.
swap(NewMask);
6874 InstructionCost TotalVecCost = VectorGEPCost + LoadCost + CompressCost;
6875 return TotalVecCost < GatherCost;
6888 unsigned InterleaveFactor;
6892 AreAllUsersVectorized, IsMasked, InterleaveFactor,
6893 CompressMask, LoadVecTy);
6910 Align Alignment,
const int64_t Diff,
6911 const size_t Sz)
const {
6912 if (Diff % (Sz - 1) != 0)
6916 auto IsAnyPointerUsedOutGraph =
any_of(PointerOps, [&](
Value *V) {
6918 return !isVectorized(U) && !MustGather.contains(U);
6922 const uint64_t AbsoluteDiff = std::abs(Diff);
6924 if (IsAnyPointerUsedOutGraph ||
6925 (AbsoluteDiff > Sz &&
6928 AbsoluteDiff % Sz == 0 &&
has_single_bit(AbsoluteDiff / Sz)))) ||
6929 Diff == -(
static_cast<int64_t
>(Sz) - 1)) {
6930 int64_t Stride = Diff /
static_cast<int64_t
>(Sz - 1);
6931 if (Diff != Stride *
static_cast<int64_t
>(Sz - 1))
6933 if (!TTI->isLegalStridedLoadStore(VecTy, Alignment))
6943 Value *Ptr0,
Value *PtrN, StridedPtrInfo &SPtrInfo)
const {
6944 const size_t Sz = PointerOps.
size();
6945 if (!
isStridedLoad(PointerOps, ScalarTy, Alignment, Diff, Sz))
6948 int64_t Stride = Diff /
static_cast<int64_t
>(Sz - 1);
6957 else if (
Ptr != Ptr0)
6961 if (((Dist / Stride) * Stride) != Dist || !Dists.
insert(Dist).second)
6964 if (Dists.
size() == Sz) {
6965 Type *StrideTy = DL->getIndexType(Ptr0->
getType());
6966 SPtrInfo.StrideVal = ConstantInt::get(StrideTy, Stride);
6976 StridedPtrInfo &SPtrInfo)
const {
6977 const unsigned Sz = PointerOps.
size();
6979 if (Sz <= MinProfitableStridedLoads || !TTI->isTypeLegal(StridedLoadTy) ||
6980 !TTI->isLegalStridedLoadStore(StridedLoadTy, CommonAlignment))
6982 if (
const SCEV *Stride =
6985 SPtrInfo.StrideSCEV = Stride;
6994 unsigned *BestVF,
bool TryRecursiveCheck)
const {
7007 if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy))
7013 const size_t Sz = VL.
size();
7015 auto *POIter = PointerOps.
begin();
7016 for (
Value *V : VL) {
7018 if (!L || !L->isSimple())
7020 *POIter = L->getPointerOperand();
7026 bool IsSorted =
sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, Order);
7035 if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
7036 TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
7047 if (Order.
empty()) {
7048 Ptr0 = PointerOps.
front();
7049 PtrN = PointerOps.
back();
7051 Ptr0 = PointerOps[Order.
front()];
7052 PtrN = PointerOps[Order.
back()];
7054 std::optional<int64_t> Diff =
7057 if (
static_cast<uint64_t>(*Diff) == Sz - 1)
7060 *TLI, [&](
Value *V) {
7061 return areAllUsersVectorized(
7069 *Diff, Ptr0, PtrN, SPtrInfo))
7072 if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
7073 TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
7078 auto CheckForShuffledLoads = [&, &TTI = *TTI](
Align CommonAlignment,
7080 bool ProfitableGatherPointers) {
7085 auto [ScalarGEPCost, VectorGEPCost] =
7087 Instruction::GetElementPtr,
CostKind, ScalarTy, VecTy);
7091 Type *PtrScalarTy = PointerOps.
front()->getType()->getScalarType();
7093 if (
static_cast<unsigned>(
count_if(
7112 return C + TTI.getInstructionCost(
7118 TTI.getGatherScatterOpCost(
7120 false, CommonAlignment,
CostKind) +
7121 (ProfitableGatherPointers ? 0 : VectorGEPCost);
7129 constexpr unsigned ListLimit = 4;
7130 if (!TryRecursiveCheck || VL.
size() < ListLimit)
7139 unsigned Sz = DL->getTypeSizeInBits(ScalarTy);
7149 for (
unsigned Cnt = 0, End = VL.
size(); Cnt + VF <= End; Cnt += VF) {
7154 PointerOps, SPtrInfo, BestVF,
7162 DemandedElts.
setBits(Cnt, Cnt + VF);
7178 if (!DemandedElts.
isZero()) {
7184 if (DemandedElts[Idx])
7195 LI0->getPointerOperand(),
7196 Instruction::GetElementPtr,
CostKind, ScalarTy,
7200 if (
static_cast<unsigned>(
7202 PointerOps.
size() - 1 ||
7221 TTI.getMemoryOpCost(Instruction::Load, SubVecTy, LI0->getAlign(),
7222 LI0->getPointerAddressSpace(),
CostKind,
7227 VecLdCost += TTI.getStridedMemoryOpCost(Instruction::Load, SubVecTy,
7228 LI0->getPointerOperand(),
7234 VecLdCost += TTI.getMaskedMemoryOpCost(
7235 Instruction::Load, SubVecTy, CommonAlignment,
7236 LI0->getPointerAddressSpace(),
CostKind) +
7242 VecLdCost += TTI.getGatherScatterOpCost(Instruction::Load, SubVecTy,
7243 LI0->getPointerOperand(),
7254 ShuffleMask[Idx] = Idx / VF ==
I ? VL.
size() + Idx % VF : Idx;
7263 if (MaskedGatherCost >= VecLdCost &&
7276 bool ProfitableGatherPointers =
7277 L && Sz > 2 &&
static_cast<unsigned>(
count_if(PointerOps, [L](
Value *V) {
7278 return L->isLoopInvariant(V);
7280 if (ProfitableGatherPointers ||
all_of(PointerOps, [](
Value *
P) {
7283 (
GEP &&
GEP->getNumOperands() == 2 &&
7291 if (!TryRecursiveCheck || !CheckForShuffledLoads(CommonAlignment, BestVF,
7292 ProfitableGatherPointers))
7304 all_of(VL, [](
const Value *V) {
return V->getType()->isPointerTy(); }) &&
7305 "Expected list of pointer operands.");
7310 std::pair<BasicBlock *, Value *>,
7314 .try_emplace(std::make_pair(
7318 SortedIndices.
clear();
7320 auto Key = std::make_pair(BBs[Cnt + 1],
7322 bool Found =
any_of(Bases.try_emplace(
Key).first->second,
7324 std::optional<int64_t> Diff =
7325 getPointersDiff(ElemTy, std::get<0>(Base.front()),
7326 ElemTy, Ptr, DL, SE,
7331 Base.emplace_back(Ptr, *Diff, Cnt + 1);
7337 if (Bases.size() > VL.
size() / 2 - 1)
7341 Bases.find(
Key)->second.emplace_back().emplace_back(
Ptr, 0, Cnt + 1);
7345 if (Bases.size() == VL.
size())
7348 if (Bases.size() == 1 && (Bases.front().second.size() == 1 ||
7349 Bases.front().second.size() == VL.
size()))
7354 auto ComparePointers = [](
Value *Ptr1,
Value *Ptr2) {
7363 FirstPointers.
insert(P1);
7364 SecondPointers.
insert(P2);
7370 "Unable to find matching root.");
7373 for (
auto &
Base : Bases) {
7374 for (
auto &Vec :
Base.second) {
7375 if (Vec.size() > 1) {
7377 int64_t InitialOffset = std::get<1>(Vec[0]);
7378 bool AnyConsecutive =
7380 return std::get<1>(
P.value()) ==
7381 int64_t(
P.index()) + InitialOffset;
7385 if (!AnyConsecutive)
7390 return ComparePointers(std::get<0>(V1.front()), std::get<0>(V2.front()));
7394 for (
auto &
T : Bases)
7395 for (
const auto &Vec :
T.second)
7396 for (
const auto &
P : Vec)
7400 "Expected SortedIndices to be the size of VL");
7404std::optional<BoUpSLP::OrdersType>
7406 assert(TE.isGather() &&
"Expected gather node only.");
7407 Type *ScalarTy = TE.Scalars[0]->getType();
7410 Ptrs.
reserve(TE.Scalars.size());
7412 BBs.
reserve(TE.Scalars.size());
7413 for (
Value *V : TE.Scalars) {
7415 if (!L || !L->isSimple())
7416 return std::nullopt;
7422 if (!LoadEntriesToVectorize.contains(TE.Idx) &&
7424 return std::move(Order);
7425 return std::nullopt;
7436 if (VU->
getType() != V->getType())
7439 if (!VU->
hasOneUse() && !V->hasOneUse())
7445 if (Idx1 == std::nullopt || Idx2 == std::nullopt)
7452 bool IsReusedIdx =
false;
7454 if (IE2 == VU && !IE1)
7456 if (IE1 == V && !IE2)
7457 return V->hasOneUse();
7458 if (IE1 && IE1 != V) {
7460 IsReusedIdx |= ReusedIdx.
test(Idx1);
7461 ReusedIdx.
set(Idx1);
7462 if ((IE1 != VU && !IE1->
hasOneUse()) || IsReusedIdx)
7467 if (IE2 && IE2 != VU) {
7469 IsReusedIdx |= ReusedIdx.
test(Idx2);
7470 ReusedIdx.
set(Idx2);
7471 if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
7476 }
while (!IsReusedIdx && (IE1 || IE2));
7484 const TargetLibraryInfo &TLI);
7486std::optional<BoUpSLP::OrdersType>
7488 bool IgnoreReorder) {
7491 if (!TE.ReuseShuffleIndices.empty()) {
7493 assert(!TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI) &&
7494 "Reshuffling scalars not yet supported for nodes with padding");
7497 return std::nullopt;
7505 unsigned Sz = TE.Scalars.size();
7506 if (TE.isGather()) {
7507 if (std::optional<OrdersType> CurrentOrder =
7512 ::addMask(Mask, TE.ReuseShuffleIndices);
7513 OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor());
7514 unsigned Sz = TE.Scalars.size();
7515 for (
int K = 0, E = TE.getVectorFactor() / Sz; K < E; ++K) {
7518 Res[Idx + K * Sz] =
I + K * Sz;
7520 return std::move(Res);
7523 if (Sz == 2 && TE.getVectorFactor() == 4 &&
7525 2 * TE.getVectorFactor())) == 1)
7526 return std::nullopt;
7527 if (TE.ReuseShuffleIndices.size() % Sz != 0)
7528 return std::nullopt;
7532 if (TE.ReorderIndices.empty())
7533 std::iota(ReorderMask.
begin(), ReorderMask.
end(), 0);
7536 ::addMask(ReorderMask, TE.ReuseShuffleIndices);
7537 unsigned VF = ReorderMask.
size();
7541 for (
unsigned I = 0;
I < VF;
I += Sz) {
7543 unsigned UndefCnt = 0;
7544 unsigned Limit = std::min(Sz, VF -
I);
7553 Val >=
static_cast<int>(NumParts) || UsedVals.
test(Val) ||
7555 return std::nullopt;
7557 for (
unsigned K = 0; K < NumParts; ++K) {
7558 unsigned Idx = Val + Sz * K;
7559 if (Idx < VF &&
I + K < VF)
7560 ResOrder[Idx] =
I + K;
7563 return std::move(ResOrder);
7565 unsigned VF = TE.getVectorFactor();
7568 TE.ReuseShuffleIndices.end());
7569 if (TE.hasState() && TE.getOpcode() == Instruction::ExtractElement &&
7571 if (isa<PoisonValue>(V))
7573 std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V));
7574 return Idx && *Idx < Sz;
7576 assert(!TE.isAltShuffle() &&
"Alternate instructions are only supported "
7577 "by BinaryOperator and CastInst.");
7579 if (TE.ReorderIndices.empty())
7580 std::iota(ReorderMask.
begin(), ReorderMask.
end(), 0);
7583 for (
unsigned I = 0;
I < VF; ++
I) {
7584 int &Idx = ReusedMask[
I];
7587 Value *V = TE.Scalars[ReorderMask[Idx]];
7589 Idx = std::distance(ReorderMask.
begin(),
find(ReorderMask, *EI));
7595 std::iota(ResOrder.
begin(), ResOrder.
end(), 0);
7596 auto *It = ResOrder.
begin();
7597 for (
unsigned K = 0; K < VF; K += Sz) {
7601 std::iota(SubMask.
begin(), SubMask.
end(), 0);
7603 transform(CurrentOrder, It, [K](
unsigned Pos) {
return Pos + K; });
7604 std::advance(It, Sz);
7607 return Data.index() ==
Data.value();
7609 return std::nullopt;
7610 return std::move(ResOrder);
7612 if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
7613 (!TE.UserTreeIndex || !TE.UserTreeIndex.UserTE->hasState() ||
7615 (TE.ReorderIndices.empty() ||
isReverseOrder(TE.ReorderIndices)))
7616 return std::nullopt;
7617 if (TE.State == TreeEntry::SplitVectorize ||
7618 ((TE.State == TreeEntry::Vectorize ||
7619 TE.State == TreeEntry::StridedVectorize ||
7620 TE.State == TreeEntry::CompressVectorize) &&
7623 assert((TE.State == TreeEntry::SplitVectorize || !TE.isAltShuffle()) &&
7624 "Alternate instructions are only supported by "
7625 "BinaryOperator and CastInst.");
7626 return TE.ReorderIndices;
7628 if (!TopToBottom && IgnoreReorder && TE.State == TreeEntry::Vectorize &&
7629 TE.isAltShuffle()) {
7630 assert(TE.ReuseShuffleIndices.empty() &&
7631 "ReuseShuffleIndices should be "
7632 "empty for alternate instructions.");
7634 TE.buildAltOpShuffleMask(
7636 assert(TE.getMatchingMainOpOrAltOp(
I) &&
7637 "Unexpected main/alternate opcode");
7641 const int VF = TE.getVectorFactor();
7646 ResOrder[Mask[
I] % VF] =
I;
7648 return std::move(ResOrder);
7650 if (!TE.ReorderIndices.empty())
7651 return TE.ReorderIndices;
7652 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
7653 if (!TE.ReorderIndices.empty())
7654 return TE.ReorderIndices;
7657 for (
auto [
I, V] :
zip(UserBVHead, TE.Scalars)) {
7665 while (
II &&
II->hasOneUse() &&
II->getParent() == BB) {
7673 assert(BB1 != BB2 &&
"Expected different basic blocks.");
7674 if (!DT->isReachableFromEntry(BB1))
7676 if (!DT->isReachableFromEntry(BB2))
7678 auto *NodeA = DT->getNode(BB1);
7679 auto *NodeB = DT->getNode(BB2);
7680 assert(NodeA &&
"Should only process reachable instructions");
7681 assert(NodeB &&
"Should only process reachable instructions");
7682 assert((NodeA == NodeB) ==
7683 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
7684 "Different nodes should have different DFS numbers");
7685 return NodeA->getDFSNumIn() < NodeB->getDFSNumIn();
7687 auto PHICompare = [&](
unsigned I1,
unsigned I2) {
7688 Value *V1 = TE.Scalars[I1];
7689 Value *V2 = TE.Scalars[I2];
7702 if (FirstUserOfPhi1->getParent() != FirstUserOfPhi2->getParent())
7703 return CompareByBasicBlocks(FirstUserOfPhi1->getParent(),
7704 FirstUserOfPhi2->getParent());
7714 if (UserBVHead[I1] && !UserBVHead[I2])
7716 if (!UserBVHead[I1])
7718 if (UserBVHead[I1] == UserBVHead[I2])
7721 return CompareByBasicBlocks(UserBVHead[I1]->
getParent(),
7723 return UserBVHead[I1]->comesBefore(UserBVHead[I2]);
7736 if (EE1->getOperand(0) == EE2->getOperand(0))
7738 if (!Inst1 && Inst2)
7740 if (Inst1 && Inst2) {
7748 "Expected either instructions or arguments vector operands.");
7749 return P1->getArgNo() < P2->getArgNo();
7754 std::iota(Phis.
begin(), Phis.
end(), 0);
7757 return std::nullopt;
7758 return std::move(Phis);
7760 if (TE.isGather() &&
7761 (!TE.hasState() || !TE.isAltShuffle() ||
7762 ScalarsInSplitNodes.contains(TE.getMainOp())) &&
7766 if (((TE.hasState() && TE.getOpcode() == Instruction::ExtractElement) ||
7770 auto *EE = dyn_cast<ExtractElementInst>(V);
7771 return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
7777 canReuseExtract(TE.Scalars, CurrentOrder,
true);
7778 if (Reuse || !CurrentOrder.
empty())
7779 return std::move(CurrentOrder);
7787 int Sz = TE.Scalars.size();
7791 if (It == TE.Scalars.begin())
7794 if (It != TE.Scalars.end()) {
7796 unsigned Idx = std::distance(TE.Scalars.begin(), It);
7811 if (InsertFirstCost + PermuteCost < InsertIdxCost) {
7814 return std::move(Order);
7819 return std::nullopt;
7820 if (TE.Scalars.size() >= 3)
7825 if (TE.hasState() && TE.getOpcode() == Instruction::Load) {
7827 StridedPtrInfo SPtrInfo;
7830 CurrentOrder, PointerOps, SPtrInfo);
7833 return std::move(CurrentOrder);
7838 if (std::optional<OrdersType> CurrentOrder =
7840 return CurrentOrder;
7842 return std::nullopt;
7852 for (
unsigned I = Sz,
E = Mask.size();
I <
E;
I += Sz) {
7854 if (Cluster != FirstCluster)
7860void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask)
const {
7863 const unsigned Sz =
TE.Scalars.size();
7865 if (!
TE.isGather() ||
7870 SmallVector<int> NewMask;
7872 addMask(NewMask,
TE.ReuseShuffleIndices);
7874 TE.ReorderIndices.clear();
7876 ArrayRef<int> Slice =
ArrayRef(NewMask).slice(0, Sz);
7877 SmallVector<unsigned> NewOrder(Slice);
7881 for (
auto *It =
TE.ReuseShuffleIndices.begin(),
7882 *End =
TE.ReuseShuffleIndices.end();
7883 It != End; std::advance(It, Sz))
7884 std::iota(It, std::next(It, Sz), 0);
7890 "Expected same size of orders");
7891 size_t Sz = Order.
size();
7894 if (Order[Idx] != Sz)
7895 UsedIndices.
set(Order[Idx]);
7897 if (SecondaryOrder.
empty()) {
7899 if (Order[Idx] == Sz && !UsedIndices.
test(Idx))
7903 if (SecondaryOrder[Idx] != Sz && Order[Idx] == Sz &&
7904 !UsedIndices.
test(SecondaryOrder[Idx]))
7905 Order[Idx] = SecondaryOrder[Idx];
7913 constexpr unsigned TinyVF = 2;
7914 constexpr unsigned TinyTree = 10;
7915 constexpr unsigned PhiOpsLimit = 12;
7916 constexpr unsigned GatherLoadsLimit = 2;
7917 if (VectorizableTree.size() <= TinyTree)
7919 if (VectorizableTree.front()->hasState() &&
7920 !VectorizableTree.front()->isGather() &&
7921 (VectorizableTree.front()->getOpcode() == Instruction::Store ||
7922 VectorizableTree.front()->getOpcode() == Instruction::PHI ||
7923 (VectorizableTree.front()->getVectorFactor() <= TinyVF &&
7924 (VectorizableTree.front()->getOpcode() == Instruction::PtrToInt ||
7925 VectorizableTree.front()->getOpcode() == Instruction::ICmp))) &&
7926 VectorizableTree.front()->ReorderIndices.empty()) {
7930 if (VectorizableTree.front()->hasState() &&
7931 VectorizableTree.front()->getOpcode() == Instruction::PHI &&
7932 VectorizableTree.front()->Scalars.size() == TinyVF &&
7933 VectorizableTree.front()->getNumOperands() > PhiOpsLimit)
7936 if (VectorizableTree.front()->hasState() &&
7937 VectorizableTree.front()->getOpcode() == Instruction::Store &&
7938 VectorizableTree.front()->ReorderIndices.empty()) {
7939 const unsigned ReorderedSplitsCnt =
7940 count_if(VectorizableTree, [&](
const std::unique_ptr<TreeEntry> &TE) {
7941 return TE->State == TreeEntry::SplitVectorize &&
7942 !TE->ReorderIndices.empty() && TE->UserTreeIndex.UserTE &&
7943 TE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
7946 if (ReorderedSplitsCnt <= 1 &&
7948 VectorizableTree, [&](
const std::unique_ptr<TreeEntry> &TE) {
7949 return ((!TE->isGather() &&
7950 (TE->ReorderIndices.empty() ||
7951 (TE->UserTreeIndex.UserTE &&
7952 TE->UserTreeIndex.UserTE->State ==
7953 TreeEntry::Vectorize &&
7954 !TE->UserTreeIndex.UserTE->ReuseShuffleIndices
7956 (TE->isGather() && TE->ReorderIndices.empty() &&
7957 (!TE->hasState() || TE->isAltShuffle() ||
7958 TE->getOpcode() == Instruction::Load ||
7959 TE->getOpcode() == Instruction::ZExt ||
7960 TE->getOpcode() == Instruction::SExt))) &&
7961 (VectorizableTree.front()->getVectorFactor() > TinyVF ||
7962 !TE->isGather() ||
none_of(TE->Scalars, [&](
Value *V) {
7963 return !isConstant(V) && isVectorized(V);
7965 })) >= VectorizableTree.size() - ReorderedSplitsCnt)
7968 bool HasPhis =
false;
7969 bool HasLoad =
true;
7970 unsigned GatherLoads = 0;
7971 for (
const std::unique_ptr<TreeEntry> &TE :
7972 ArrayRef(VectorizableTree).drop_front()) {
7973 if (TE->State == TreeEntry::SplitVectorize)
7975 if (!TE->hasState()) {
7979 if (VectorizableTree.front()->Scalars.size() == TinyVF &&
7984 if (TE->getOpcode() == Instruction::Load && TE->ReorderIndices.empty()) {
7985 if (!TE->isGather()) {
7992 if (GatherLoads >= GatherLoadsLimit)
7995 if (TE->getOpcode() == Instruction::GetElementPtr ||
7998 if (TE->getOpcode() != Instruction::PHI &&
7999 (!TE->hasCopyableElements() ||
8001 TE->Scalars.size() / 2))
8003 if (VectorizableTree.front()->Scalars.size() == TinyVF &&
8004 TE->getNumOperands() > PhiOpsLimit)
8013void BoUpSLP::TreeEntry::reorderSplitNode(
unsigned Idx,
ArrayRef<int> Mask,
8015 assert(State == TreeEntry::SplitVectorize &&
"Expected split user node.");
8018 std::iota(NewMask.
begin(), NewMask.
end(), 0);
8019 std::iota(NewMaskOrder.begin(), NewMaskOrder.end(), 0);
8022 copy(MaskOrder, NewMaskOrder.begin());
8024 assert(Idx == 1 &&
"Expected either 0 or 1 index.");
8025 unsigned Offset = CombinedEntriesWithIndices.
back().second;
8034 ReorderIndices.clear();
8053 ExternalUserReorderMap;
8057 for_each(VectorizableTree, [&, &TTIRef = *TTI](
8058 const std::unique_ptr<TreeEntry> &TE) {
8061 findExternalStoreUsersReorderIndices(TE.get());
8062 if (!ExternalUserReorderIndices.
empty()) {
8063 VFToOrderedEntries[TE->getVectorFactor()].
insert(TE.get());
8065 std::move(ExternalUserReorderIndices));
8071 if (TE->hasState() && TE->isAltShuffle() &&
8072 TE->State != TreeEntry::SplitVectorize) {
8073 Type *ScalarTy = TE->Scalars[0]->getType();
8075 unsigned Opcode0 = TE->getOpcode();
8076 unsigned Opcode1 = TE->getAltOpcode();
8080 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
8081 VFToOrderedEntries[TE->getVectorFactor()].
insert(TE.get());
8087 bool IgnoreReorder =
8088 !UserIgnoreList && VectorizableTree.front()->hasState() &&
8089 (VectorizableTree.front()->
getOpcode() == Instruction::InsertElement ||
8090 VectorizableTree.front()->getOpcode() == Instruction::Store);
8091 if (std::optional<OrdersType> CurrentOrder =
8101 const TreeEntry *UserTE = TE.get();
8103 if (!UserTE->UserTreeIndex)
8105 if (UserTE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
8106 UserTE->UserTreeIndex.UserTE->isAltShuffle() &&
8107 UserTE->UserTreeIndex.UserTE->Idx != 0)
8109 UserTE = UserTE->UserTreeIndex.UserTE;
8112 VFToOrderedEntries[TE->getVectorFactor()].
insert(TE.get());
8113 if (!(TE->State == TreeEntry::Vectorize ||
8114 TE->State == TreeEntry::StridedVectorize ||
8115 TE->State == TreeEntry::SplitVectorize ||
8116 TE->State == TreeEntry::CompressVectorize) ||
          !TE->ReuseShuffleIndices.empty())
        GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
      if (TE->State == TreeEntry::Vectorize &&
          TE->getOpcode() == Instruction::PHI)
        PhisToOrders.try_emplace(TE.get(), *CurrentOrder);
  for (unsigned VF = VectorizableTree.front()->getVectorFactor();
       !VFToOrderedEntries.empty() && VF > 1; VF -= 2 - (VF & 1U)) {
    auto It = VFToOrderedEntries.find(VF);
    if (It == VFToOrderedEntries.end())
    for (const TreeEntry *OpTE : OrderedEntries) {
      if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE) &&
          OpTE->State != TreeEntry::SplitVectorize)
      const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
        if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty()) {
          auto It = GathersToOrders.find(OpTE);
          if (It != GathersToOrders.end())
        if (OpTE->hasState() && OpTE->isAltShuffle()) {
          auto It = AltShufflesToOrders.find(OpTE);
          if (It != AltShufflesToOrders.end())
        if (OpTE->State == TreeEntry::Vectorize &&
            OpTE->getOpcode() == Instruction::PHI) {
          auto It = PhisToOrders.find(OpTE);
          if (It != PhisToOrders.end())
        return OpTE->ReorderIndices;
      auto It = ExternalUserReorderMap.find(OpTE);
      if (It != ExternalUserReorderMap.end()) {
        const auto &ExternalUserReorderIndices = It->second;
        if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
          OrdersUses.try_emplace(OrdersType(), 0).first->second +=
              ExternalUserReorderIndices.size();
          for (const OrdersType &ExtOrder : ExternalUserReorderIndices)
            ++OrdersUses.try_emplace(ExtOrder, 0).first->second;
      if (OpTE->State == TreeEntry::Vectorize &&
          OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
        assert(!OpTE->isAltShuffle() &&
               "Alternate instructions are only supported by BinaryOperator "
        unsigned E = Order.size();
          return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
        ++OrdersUses.try_emplace(CurrentOrder, 0).first->second;
        ++OrdersUses.try_emplace(Order, 0).first->second;
    if (OrdersUses.empty())
    unsigned IdentityCnt = 0;
    unsigned FilledIdentityCnt = 0;
    for (auto &Pair : OrdersUses) {
      if (!Pair.first.empty())
        FilledIdentityCnt += Pair.second;
      IdentityCnt += Pair.second;
    unsigned Cnt = IdentityCnt;
    for (auto &Pair : OrdersUses) {
      if (Cnt < Pair.second ||
          (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt &&
           Cnt == Pair.second && !BestOrder.empty() &&
        BestOrder = Pair.first;
    unsigned E = BestOrder.size();
      return I < E ? static_cast<int>(I) : PoisonMaskElem;
    for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
      if (TE->Scalars.size() != VF) {
        if (TE->ReuseShuffleIndices.size() == VF) {
          assert(TE->State != TreeEntry::SplitVectorize &&
                 "Split vectorized not expected.");
              (!TE->UserTreeIndex ||
               TE->UserTreeIndex.UserTE->Scalars.size() == VF ||
               TE->UserTreeIndex.UserTE->Scalars.size() == TE->Scalars.size() ||
               TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize) &&
              "All users must be of VF size.");
        if (TE->UserTreeIndex && TE->UserTreeIndex.UserTE->hasState() &&
          reorderNodeWithReuses(*TE, Mask);
        if (TE->UserTreeIndex &&
            TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize)
          TE->UserTreeIndex.UserTE->reorderSplitNode(
              TE->UserTreeIndex.EdgeIdx, Mask, MaskOrder);
      if ((TE->State == TreeEntry::SplitVectorize &&
           TE->ReuseShuffleIndices.empty()) ||
          ((TE->State == TreeEntry::Vectorize ||
            TE->State == TreeEntry::StridedVectorize ||
            TE->State == TreeEntry::CompressVectorize) &&
            (!TE->isAltShuffle() || (TE->State == TreeEntry::SplitVectorize &&
                                     TE->ReuseShuffleIndices.empty())) &&
            "Alternate instructions are only supported by BinaryOperator "
        TE->reorderOperands(Mask);
        TE->reorderOperands(Mask);
        assert(TE->ReorderIndices.empty() &&
               "Expected empty reorder sequence.");
        if (!TE->ReuseShuffleIndices.empty()) {
          addMask(NewReuses, TE->ReuseShuffleIndices);
          TE->ReuseShuffleIndices.swap(NewReuses);
        } else if (TE->UserTreeIndex &&
                   TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize)
          TE->UserTreeIndex.UserTE->reorderSplitNode(TE->UserTreeIndex.EdgeIdx,
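/// Builds the list of operand tree entries of \p UserTE that are candidates
/// for reordering: vectorized operands are collected into \p Edges, while
/// gather operands are only considered if they were registered as
/// reorderable. (Summary based on the visible body below.)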
void BoUpSLP::buildReorderableOperands(
    TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
    if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) {
          return OpData.first == I &&
                 (OpData.second->State == TreeEntry::Vectorize ||
                  OpData.second->State == TreeEntry::StridedVectorize ||
                  OpData.second->State == TreeEntry::CompressVectorize ||
                  OpData.second->State == TreeEntry::SplitVectorize);
    if (UserTE->hasState()) {
      if (UserTE->getOpcode() == Instruction::ExtractElement ||
          UserTE->getOpcode() == Instruction::ExtractValue)
      if (UserTE->getOpcode() == Instruction::InsertElement && I == 0)
      if (UserTE->getOpcode() == Instruction::Store &&
          UserTE->State == TreeEntry::Vectorize && I == 1)
      if (UserTE->getOpcode() == Instruction::Load &&
          (UserTE->State == TreeEntry::Vectorize ||
           UserTE->State == TreeEntry::StridedVectorize ||
           UserTE->State == TreeEntry::CompressVectorize))
    TreeEntry *TE = getOperandEntry(UserTE, I);
    assert(TE && "Expected operand entry.");
    if (!TE->isGather()) {
      Edges.emplace_back(I, TE);
      if (TE->State == TreeEntry::ScatterVectorize &&
          TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
    if (ReorderableGathers.contains(TE))
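/// Bottom-to-top reordering works off a priority queue: operand entries are
/// compared by the index of their user entry, so all operands of the same
/// user are popped together and their orders are reconciled in one step.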
  struct TreeEntryCompare {
    bool operator()(const TreeEntry *LHS, const TreeEntry *RHS) const {
      if (LHS->UserTreeIndex && RHS->UserTreeIndex)
        return LHS->UserTreeIndex.UserTE->Idx < RHS->UserTreeIndex.UserTE->Idx;
      return LHS->Idx < RHS->Idx;
  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    if (TE->State != TreeEntry::Vectorize &&
        TE->State != TreeEntry::StridedVectorize &&
        TE->State != TreeEntry::CompressVectorize &&
        TE->State != TreeEntry::SplitVectorize)
      NonVectorized.insert(TE.get());
    if (std::optional<OrdersType> CurrentOrder =
      Queue.push(TE.get());
      if (!(TE->State == TreeEntry::Vectorize ||
            TE->State == TreeEntry::StridedVectorize ||
            TE->State == TreeEntry::CompressVectorize ||
            TE->State == TreeEntry::SplitVectorize) ||
          !TE->ReuseShuffleIndices.empty())
        GathersToOrders.insert(TE.get());
  while (!Queue.empty()) {
    std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>> Users;
    TreeEntry *TE = Queue.top();
    const TreeEntry *UserTE = TE->UserTreeIndex.UserTE;
    while (!Queue.empty()) {
      if (!UserTE || UserTE != TE->UserTreeIndex.UserTE)
    for (TreeEntry *TE : OrderedOps) {
      if (!(TE->State == TreeEntry::Vectorize ||
            TE->State == TreeEntry::StridedVectorize ||
            TE->State == TreeEntry::CompressVectorize ||
            TE->State == TreeEntry::SplitVectorize ||
            (TE->isGather() && GathersToOrders.contains(TE))) ||
          !TE->UserTreeIndex || !TE->ReuseShuffleIndices.empty() ||
          !Visited.insert(TE).second)
      Users.first = TE->UserTreeIndex.UserTE;
      Users.second.emplace_back(TE->UserTreeIndex.EdgeIdx, TE);
    if (Data.first->State == TreeEntry::SplitVectorize) {
             Data.second.size() <= 2 &&
             "Expected not greater than 2 operands for split vectorize node.");
                 [](const auto &Op) { return !Op.second->UserTreeIndex; }))
      assert(Data.first->CombinedEntriesWithIndices.size() == 2 &&
             "Expected exactly 2 entries.");
      for (const auto &P : Data.first->CombinedEntriesWithIndices) {
        TreeEntry &OpTE = *VectorizableTree[P.first];
        if (Order.empty() || !OpTE.ReuseShuffleIndices.empty()) {
          if (!OpTE.isGather() && OpTE.ReuseShuffleIndices.empty())
          const auto BestOrder =
        const unsigned E = Order.size();
          return I < E ? static_cast<int>(I) : PoisonMaskElem;
        Data.first->reorderSplitNode(P.second ? 1 : 0, Mask, MaskOrder);
        if (!OpTE.ReorderIndices.empty()) {
          OpTE.ReorderIndices.clear();
        } else if (!OpTE.ReuseShuffleIndices.empty()) {
          assert(OpTE.isGather() && "Expected only gather/buildvector node.");
      if (Data.first->ReuseShuffleIndices.empty() &&
          !Data.first->ReorderIndices.empty()) {
        Queue.push(Data.first);
    buildReorderableOperands(Data.first, Data.second, NonVectorized,
    for (const auto &Op : Data.second) {
      TreeEntry *OpTE = Op.second;
      if (!VisitedOps.insert(OpTE).second)
      if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
      const auto Order = [&]() -> const OrdersType {
        if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty())
        return OpTE->ReorderIndices;
      if (Order.size() == 1)
      Value *Root = OpTE->hasState()
      auto GetSameNodesUsers = [&](Value *Root) {
        for (const TreeEntry *TE : ValueToGatherNodes.lookup(Root)) {
          if (TE != OpTE && TE->UserTreeIndex &&
              TE->getVectorFactor() == OpTE->getVectorFactor() &&
              TE->Scalars.size() == OpTE->Scalars.size() &&
              ((TE->ReorderIndices.empty() && OpTE->isSame(TE->Scalars)) ||
               (OpTE->ReorderIndices.empty() && TE->isSame(OpTE->Scalars))))
            Res.insert(TE->UserTreeIndex.UserTE);
        for (const TreeEntry *TE : getTreeEntries(Root)) {
          if (TE != OpTE && TE->UserTreeIndex &&
              TE->getVectorFactor() == OpTE->getVectorFactor() &&
              TE->Scalars.size() == OpTE->Scalars.size() &&
              ((TE->ReorderIndices.empty() && OpTE->isSame(TE->Scalars)) ||
               (OpTE->ReorderIndices.empty() && TE->isSame(OpTE->Scalars))))
            Res.insert(TE->UserTreeIndex.UserTE);
      auto GetNumOperands = [](const TreeEntry *TE) {
        if (TE->State == TreeEntry::SplitVectorize)
          return TE->getNumOperands();
          return CI->arg_size();
        return TE->getNumOperands();
      auto NodeShouldBeReorderedWithOperands = [&, TTI = TTI](
                                                   const TreeEntry *TE) {
          const TreeEntry *Op = getOperandEntry(TE, Idx);
          if (Op->isGather() && Op->hasState()) {
            const TreeEntry *VecOp =
                getSameValuesTreeEntry(Op->getMainOp(), Op->Scalars);
          if (Op->ReorderIndices.empty() && Op->ReuseShuffleIndices.empty())
        if (!RevisitedOps.insert(UTE).second)
        return UTE == Data.first || !UTE->ReorderIndices.empty() ||
               !UTE->ReuseShuffleIndices.empty() ||
               (UTE->UserTreeIndex &&
                UTE->UserTreeIndex.UserTE == Data.first) ||
               (Data.first->UserTreeIndex &&
                Data.first->UserTreeIndex.UserTE == UTE) ||
               (IgnoreReorder && UTE->UserTreeIndex &&
                UTE->UserTreeIndex.UserTE->Idx == 0) ||
               NodeShouldBeReorderedWithOperands(UTE);
      for (TreeEntry *UTE : Users) {
          const TreeEntry *Op = getOperandEntry(UTE, Idx);
          Queue.push(const_cast<TreeEntry *>(Op));
          Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
            return P.second == OpTE;
      if (OpTE->State == TreeEntry::Vectorize &&
          OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
        assert(!OpTE->isAltShuffle() &&
               "Alternate instructions are only supported by BinaryOperator "
        unsigned E = Order.size();
          return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
        OrdersUses.try_emplace(CurrentOrder, 0).first->second += NumOps;
        OrdersUses.try_emplace(Order, 0).first->second += NumOps;
      auto Res = OrdersUses.try_emplace(OrdersType(), 0);
      const auto AllowsReordering = [&](const TreeEntry *TE) {
        if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
            (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
            (IgnoreReorder && TE->Idx == 0))
        if (TE->isGather()) {
      if (OpTE->UserTreeIndex) {
        TreeEntry *UserTE = OpTE->UserTreeIndex.UserTE;
        if (!VisitedUsers.insert(UserTE).second)
        if (AllowsReordering(UserTE))
        if (static_cast<unsigned>(count_if(
                Ops, [UserTE, &AllowsReordering](
                         const std::pair<unsigned, TreeEntry *> &Op) {
                  return AllowsReordering(Op.second) &&
                         Op.second->UserTreeIndex.UserTE == UserTE;
                })) <= Ops.size() / 2)
          ++Res.first->second;
    if (OrdersUses.empty()) {
    unsigned IdentityCnt = 0;
    unsigned VF = Data.second.front().second->getVectorFactor();
    for (auto &Pair : OrdersUses) {
      IdentityCnt += Pair.second;
    unsigned Cnt = IdentityCnt;
    for (auto &Pair : OrdersUses) {
      if (Cnt < Pair.second) {
        BestOrder = Pair.first;
    unsigned E = BestOrder.size();
      return I < E ? static_cast<int>(I) : PoisonMaskElem;
    for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) {
      TreeEntry *TE = Op.second;
      if (!VisitedOps.insert(TE).second)
      if (TE->ReuseShuffleIndices.size() == BestOrder.size()) {
        reorderNodeWithReuses(*TE, Mask);
      if (TE->State != TreeEntry::Vectorize &&
          TE->State != TreeEntry::StridedVectorize &&
          TE->State != TreeEntry::CompressVectorize &&
          TE->State != TreeEntry::SplitVectorize &&
          (TE->State != TreeEntry::ScatterVectorize ||
           TE->ReorderIndices.empty()))
      assert((BestOrder.size() == TE->ReorderIndices.size() ||
              TE->ReorderIndices.empty()) &&
             "Non-matching sizes of user/operand entries.");
      if (IgnoreReorder && TE == VectorizableTree.front().get())
        IgnoreReorder = false;
    for (TreeEntry *Gather : GatherOps) {
             "Unexpected reordering of gathers.");
      if (!Gather->ReuseShuffleIndices.empty()) {
    auto IsNotProfitableAltCodeNode = [](const TreeEntry &TE) {
      return TE.isAltShuffle() &&
             (!TE.ReuseShuffleIndices.empty() || TE.getVectorFactor() == 2 ||
              TE.ReorderIndices.empty());
    if (Data.first->State != TreeEntry::Vectorize ||
            Data.first->getMainOp()) ||
        IsNotProfitableAltCodeNode(*Data.first))
      Data.first->reorderOperands(Mask);
        IsNotProfitableAltCodeNode(*Data.first) ||
        Data.first->State == TreeEntry::StridedVectorize ||
        Data.first->State == TreeEntry::CompressVectorize) {
      if (Data.first->ReuseShuffleIndices.empty() &&
          !Data.first->ReorderIndices.empty() &&
          !IsNotProfitableAltCodeNode(*Data.first)) {
        Queue.push(Data.first);
  if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
      VectorizableTree.front()->ReuseShuffleIndices.empty())
    VectorizableTree.front()->ReorderIndices.clear();
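/// Picks the instruction that anchors \p Entry for pointer and dependence
/// queries; strided load/store entries with a reversed reorder sequence are
/// special-cased, as the condition below shows (the selected result is
/// elided in this listing).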
Instruction *BoUpSLP::getRootEntryInstruction(const TreeEntry &Entry) const {
  if (Entry.hasState() &&
      (Entry.getOpcode() == Instruction::Store ||
       Entry.getOpcode() == Instruction::Load) &&
      Entry.State == TreeEntry::StridedVectorize &&
      !Entry.ReorderIndices.empty() && isReverseOrder(Entry.ReorderIndices))
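/// Walks every vectorized entry and records scalars that are used outside
/// the vectorized tree (or by enough users that an extract is unavoidable)
/// in ExternalUses, so extractelement instructions can be emitted for them
/// during codegen.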
  const size_t NumVectScalars = ScalarToTreeEntries.size() + 1;
  for (auto &TEPtr : VectorizableTree) {
    TreeEntry *Entry = TEPtr.get();
    if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize)
    for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
      Value *Scalar = Entry->Scalars[Lane];
      auto It = ScalarToExtUses.find(Scalar);
      if (It != ScalarToExtUses.end() && !ExternalUses[It->second].User)
      if (Scalar->hasNUsesOrMore(NumVectScalars)) {
        unsigned FoundLane = Entry->findLaneForValue(Scalar);
        LLVM_DEBUG(dbgs() << "SLP: Need to extract from lane " << FoundLane
                          << " from " << *Scalar << "for many users.\n");
        It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
        ExternalUses.emplace_back(Scalar, nullptr, *Entry, FoundLane);
        ExternalUsesWithNonUsers.insert(Scalar);
      const auto ExtI = ExternallyUsedValues.find(Scalar);
      if (ExtI != ExternallyUsedValues.end()) {
        unsigned FoundLane = Entry->findLaneForValue(Scalar);
        LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
                          << FoundLane << " from " << *Scalar << ".\n");
        ScalarToExtUses.try_emplace(Scalar, ExternalUses.size());
        ExternalUses.emplace_back(Scalar, nullptr, *Entry, FoundLane);
      if (UserIgnoreList && UserIgnoreList->contains(UserInst))
          !UseEntries.empty()) {
        if (!((Scalar->getType()->getScalarType()->isPointerTy() &&
              all_of(UseEntries, [&](TreeEntry *UseEntry) {
                return UseEntry->State == TreeEntry::ScatterVectorize ||
                           Scalar, getRootEntryInstruction(*UseEntry), TLI,
          LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
                   [](TreeEntry *UseEntry) {
                     return UseEntry->isGather();
      if (It != ScalarToExtUses.end()) {
        ExternalUses[It->second].User = nullptr;
      if (U && Scalar->hasNUsesOrMore(UsesLimit))
      unsigned FoundLane = Entry->findLaneForValue(Scalar);
                 << " from lane " << FoundLane << " from " << *Scalar
      It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
      ExternalUses.emplace_back(Scalar, U, *Entry, FoundLane);
      ExternalUsesWithNonUsers.insert(Scalar);
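/// Collects simple stores fed by the scalars of \p TE, grouped by parent
/// block, stored value type and underlying pointer, so each group can later
/// be checked for forming a consecutive, vectorizable sequence of stores.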
BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
  for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) {
    Value *V = TE->Scalars[Lane];
      if (SI == nullptr || !SI->isSimple() || SI->getFunction() != F ||
      auto &StoresVec = PtrToStoresMap[{SI->getParent(),
                                        SI->getValueOperand()->getType(), Ptr}];
      if (StoresVec.size() > Lane)
      if (!StoresVec.empty()) {
            SI->getValueOperand()->getType(), SI->getPointerOperand(),
            SI->getValueOperand()->getType(),
            StoresVec.front()->getPointerOperand(), *DL, *SE,
      StoresVec.push_back(SI);
  for (auto &P : PtrToStoresMap) {
  StoreInst *S0 = StoresVec[0];
    StoreInst *SI = StoresVec[Idx];
    std::optional<int64_t> Diff =
        SI->getPointerOperand(), *DL, *SE,
  if (StoreOffsetVec.size() != StoresVec.size())
  sort(StoreOffsetVec, llvm::less_first());
  int64_t PrevDist = 0;
  for (const auto &P : StoreOffsetVec) {
    if (Idx > 0 && P.first != PrevDist + 1)
  ReorderIndices.assign(StoresVec.size(), 0);
  bool IsIdentity = true;
    ReorderIndices[P.second] = I;
    IsIdentity &= P.second == I;
    ReorderIndices.clear();
  for (unsigned Idx : Order)
    dbgs() << Idx << ", ";
BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const {
  unsigned NumLanes = TE->Scalars.size();
    if (StoresVec.size() != NumLanes)
    if (!canFormVector(StoresVec, ReorderIndices))
    ExternalReorderIndices.push_back(ReorderIndices);
  return ExternalReorderIndices;
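/// Entry points for building the vectorizable tree from the given roots; the
/// first overload additionally registers the externally used values that
/// should be ignored during tree construction.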
  assert(TreeEntryToStridedPtrInfoMap.empty() &&
         "TreeEntryToStridedPtrInfoMap is not cleared");
  UserIgnoreList = &UserIgnoreLst;
  buildTreeRec(Roots, 0, EdgeInfo());
  assert(TreeEntryToStridedPtrInfoMap.empty() &&
         "TreeEntryToStridedPtrInfoMap is not cleared");
  buildTreeRec(Roots, 0, EdgeInfo());
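/// Clusters simple loads by parent block, type and constant pointer distance
/// so that related loads land in a common group, which can later be merged
/// with existing groups or vectorized on its own.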
                                           bool AddNew = true) {
  for (Value *V : VL) {
    if (R.isDeleted(LI) || R.isVectorized(LI) || !LI->isSimple())
    bool IsFound = false;
    for (auto [Map, Data] : zip(ClusteredDistToLoad, ClusteredLoads)) {
      assert(LI->getParent() == Data.front().first->getParent() &&
             LI->getType() == Data.front().first->getType() &&
             "Expected loads with the same type, same parent and same "
             "underlying pointer.");
          LI->getType(), LI->getPointerOperand(), Data.front().first->getType(),
          Data.front().first->getPointerOperand(), DL, SE,
      auto It = Map.find(*Dist);
      if (It != Map.end() && It->second != LI)
      if (It == Map.end()) {
        Data.emplace_back(LI, *Dist);
        Map.try_emplace(*Dist, LI);
  auto FindMatchingLoads =
          int64_t &Offset, unsigned &Start) {
        return GatheredLoads.end();
        std::optional<int64_t> Dist =
            Data.front().first->getType(),
            Data.front().first->getPointerOperand(), DL, SE,
        for (std::pair<LoadInst *, int64_t> P : Data) {
        unsigned NumUniques = 0;
        for (auto [Cnt, Pair] : enumerate(Loads)) {
          bool Used = DataLoads.contains(Pair.first);
          if (!Used && !DataDists.contains(*Dist + Pair.second)) {
            Repeated.insert(Cnt);
        if (NumUniques > 0 &&
            (Loads.size() == NumUniques ||
             (Loads.size() - NumUniques >= 2 &&
              Loads.size() - NumUniques >= Loads.size() / 2 &&
          return std::next(GatheredLoads.begin(), Idx);
        return GatheredLoads.end();
  for (ArrayRef<std::pair<LoadInst *, int64_t>> Data : ClusteredLoads) {
    auto *It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated,
    while (It != GatheredLoads.end()) {
      assert(!LocalToAdd.empty() && "Expected some elements to add.");
      for (unsigned Idx : LocalToAdd)
      It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated, Offset,
        return !ToAdd.contains(Idx) && !Repeated.contains(Idx);
        Loads.push_back(Data[Idx]);
          GatheredLoads, [&](ArrayRef<std::pair<LoadInst *, int64_t>> PD) {
            return PD.front().first->getParent() == LI->getParent() &&
                   PD.front().first->getType() == LI->getType();
      while (It != GatheredLoads.end()) {
            std::next(It), GatheredLoads.end(),
            [&](ArrayRef<std::pair<LoadInst *, int64_t>> PD) {
              return PD.front().first->getParent() == LI->getParent() &&
                     PD.front().first->getType() == LI->getType();
        GatheredLoads.emplace_back().append(Data.begin(), Data.end());
      AddNewLoads(GatheredLoads.emplace_back());
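/// Attempts to build vector load trees for previously gathered loads: sorts
/// the loads in each group by distance, then tries consecutive, strided,
/// interleaved and masked-gather forms at decreasing vector factors, and
/// finally re-attempts the remaining non-vectorized loads.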
void BoUpSLP::tryToVectorizeGatheredLoads(
    const SmallMapVector<
        std::tuple<BasicBlock *, Value *, Type *>,
  GatheredLoadsEntriesFirst = VectorizableTree.size();
                              LoadEntriesToVectorize.size());
  for (auto [Idx, Set] : zip(LoadEntriesToVectorize, LoadSetsToVectorize))
    Set.insert_range(VectorizableTree[Idx]->Scalars);
  auto LoadSorter = [](const std::pair<LoadInst *, int64_t> &L1,
                       const std::pair<LoadInst *, int64_t> &L2) {
    return L1.second > L2.second;
    auto *Ty = getWidenedType(Loads.front()->getType(), Loads.size());
    return TTI->isLegalMaskedGather(Ty, Alignment) &&
           !TTI->forceScalarizeMaskedGather(Ty, Alignment);
          SmallVectorImpl<LoadInst *> &NonVectorized, bool Final,
          unsigned MaxVF) {
    unsigned StartIdx = 0;
    SmallVector<int> CandidateVFs;
        *TTI, Loads.front()->getType(), MaxVF);
        *TTI, Loads.front()->getType(), NumElts - 1)) {
    if (Final && CandidateVFs.empty())
    unsigned BestVF = Final ? CandidateVFs.back() : 0;
    for (unsigned NumElts : CandidateVFs) {
      if (Final && NumElts > BestVF)
      SmallVector<unsigned> MaskedGatherVectorized;
      for (unsigned Cnt = StartIdx, E = Loads.size(); Cnt < E;
        if (VectorizedLoads.count(Slice.front()) ||
            VectorizedLoads.count(Slice.back()) ||
        bool AllowToVectorize = false;
        bool IsLegalBroadcastLoad = TTI->isLegalBroadcastLoad(
          for (LoadInst *LI : Slice) {
            if (LI->hasOneUse())
            if (static_cast<unsigned int>(std::distance(
                    LI->user_begin(), LI->user_end())) != LI->getNumUses())
            if (!IsLegalBroadcastLoad)
            for (User *U : LI->users()) {
              for (const TreeEntry *UTE : getTreeEntries(U)) {
                for (int I : seq<int>(UTE->getNumOperands())) {
                      return V == LI || isa<PoisonValue>(V);
          AllowToVectorize = CheckIfAllowed(Slice);
              any_of(ValueToGatherNodes.at(Slice.front()),
                     [=](const TreeEntry *TE) {
                       return TE->Scalars.size() == 2 &&
                              ((TE->Scalars.front() == Slice.front() &&
                                TE->Scalars.back() == Slice.back()) ||
                               (TE->Scalars.front() == Slice.back() &&
                                TE->Scalars.back() == Slice.front()));
        if (AllowToVectorize) {
              reinterpret_cast<Value *const *>(Slice.begin()), Slice.size());
          StridedPtrInfo SPtrInfo;
              PointerOps, SPtrInfo, &BestVF);
              (BestVF > 1 && static_cast<unsigned>(NumElts) == 2 * BestVF)) {
            if (MaskedGatherVectorized.empty() ||
                Cnt >= MaskedGatherVectorized.back() + NumElts)
            Results.emplace_back(Values, LS);
            VectorizedLoads.insert_range(Slice);
            if (Cnt == StartIdx)
              StartIdx += NumElts;
            if (StartIdx >= Loads.size())
          if (!MaskedGatherVectorized.empty() &&
              Cnt < MaskedGatherVectorized.back() + NumElts)
        if (!AllowToVectorize || BestVF == 0)
      for (unsigned Cnt : MaskedGatherVectorized) {
            Cnt, std::min<unsigned>(NumElts, Loads.size() - Cnt));
        VectorizedLoads.insert_range(Slice);
        if (Cnt == StartIdx)
          StartIdx += NumElts;
    for (LoadInst *LI : Loads) {
      if (!VectorizedLoads.contains(LI))
        NonVectorized.push_back(LI);
  auto ProcessGatheredLoads =
          bool Final = false) {
        for (ArrayRef<std::pair<LoadInst *, int64_t>> LoadsDists :
          if (LoadsDists.size() <= 1) {
            NonVectorized.push_back(LoadsDists.back().first);
          unsigned MaxConsecutiveDistance = 0;
          unsigned CurrentConsecutiveDist = 1;
          int64_t LastDist = LocalLoadsDists.front().second;
          bool AllowMaskedGather = IsMaskedGatherSupported(OriginalLoads);
          for (const std::pair<LoadInst *, int64_t> &L : LocalLoadsDists) {
            assert(LastDist >= L.second &&
                   "Expected first distance always not less than second");
            if (static_cast<uint64_t>(LastDist - L.second) ==
                CurrentConsecutiveDist) {
              ++CurrentConsecutiveDist;
              MaxConsecutiveDistance =
                  std::max(MaxConsecutiveDistance, CurrentConsecutiveDist);
            if (!AllowMaskedGather && CurrentConsecutiveDist == 1 &&
            CurrentConsecutiveDist = 1;
            LastDist = L.second;
          if (Loads.size() <= 1)
          if (AllowMaskedGather)
            MaxConsecutiveDistance = Loads.size();
          else if (MaxConsecutiveDistance < 2)
              GetVectorizedRanges(Loads, VectorizedLoads, SortedNonVectorized,
                                  Final, MaxConsecutiveDistance);
              OriginalLoads.size() == Loads.size() &&
              MaxConsecutiveDistance == Loads.size() &&
            VectorizedLoads.clear();
                GetVectorizedRanges(OriginalLoads, VectorizedLoads,
                                    UnsortedNonVectorized, Final,
                                    OriginalLoads.size());
            if (SortedNonVectorized.size() >= UnsortedNonVectorized.size()) {
              SortedNonVectorized.swap(UnsortedNonVectorized);
              Results.swap(UnsortedResults);
                              << Slice.size() << ")\n");
            for (Value *L : Slice)
            unsigned MaxVF = Slice.size();
            unsigned UserMaxVF = 0;
            unsigned InterleaveFactor = 0;
              std::optional<unsigned> InterleavedLoadsDistance = 0;
              std::optional<unsigned> CommonVF = 0;
              DenseMap<const TreeEntry *, unsigned> EntryToPosition;
              SmallPtrSet<const TreeEntry *, 8> DeinterleavedNodes;
              for (auto [Idx, V] : enumerate(Slice)) {
                for (const TreeEntry *E : ValueToGatherNodes.at(V)) {
                  UserMaxVF = std::max<unsigned>(UserMaxVF, E->Scalars.size());
                  UserMaxVF = std::max<unsigned>(UserMaxVF, Idx - Pos + 1);
                  if (*CommonVF == 0) {
                    CommonVF = E->Scalars.size();
                  if (*CommonVF != E->Scalars.size())
                  if (Pos != Idx && InterleavedLoadsDistance) {
                        if (isa<Constant>(V))
                        if (isVectorized(V))
                        const auto &Nodes = ValueToGatherNodes.at(V);
                        return (Nodes.size() != 1 || !Nodes.contains(E)) &&
                               !is_contained(Slice, V);
                      InterleavedLoadsDistance.reset();
                    if (*InterleavedLoadsDistance == 0) {
                      InterleavedLoadsDistance = Idx - Pos;
                    if ((Idx - Pos) % *InterleavedLoadsDistance != 0 ||
                        (Idx - Pos) / *InterleavedLoadsDistance < Order)
                      InterleavedLoadsDistance.reset();
                    Order = (Idx - Pos) / InterleavedLoadsDistance.value_or(1);
              DeinterleavedNodes.clear();
              if (InterleavedLoadsDistance.value_or(0) > 1 &&
                  CommonVF.value_or(0) != 0) {
                InterleaveFactor = bit_ceil(*InterleavedLoadsDistance);
                unsigned VF = *CommonVF;
                StridedPtrInfo SPtrInfo;
                if (InterleaveFactor <= Slice.size() &&
                    TTI.isLegalInterleavedAccessType(
                  UserMaxVF = InterleaveFactor * VF;
                  InterleaveFactor = 0;
            unsigned ConsecutiveNodesSize = 0;
            if (!LoadEntriesToVectorize.empty() && InterleaveFactor == 0 &&
                any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
                       [&, Slice = Slice](const auto &P) {
                           return std::get<1>(P).contains(V);
                         if (It == Slice.end())
                         const TreeEntry &TE =
                             *VectorizableTree[std::get<0>(P)];
                         StridedPtrInfo SPtrInfo;
                             VL, VL.front(), Order, PointerOps, SPtrInfo);
                         ConsecutiveNodesSize += VL.size();
                         size_t Start = std::distance(Slice.begin(), It);
                         size_t Sz = Slice.size() - Start;
                         return Sz < VL.size() ||
                                Slice.slice(Start, VL.size()) != VL;
            if (InterleaveFactor == 0 &&
                       [&, Slice = Slice](unsigned Idx) {
                         SmallVector<Value *> PointerOps;
                         StridedPtrInfo SPtrInfo;
                         return canVectorizeLoads(
                                    Slice.slice(Idx * UserMaxVF, UserMaxVF),
                                    Slice[Idx * UserMaxVF], Order, PointerOps,
                                    SPtrInfo) == LoadsState::ScatterVectorize;
            if (Slice.size() != ConsecutiveNodesSize)
              MaxVF = std::min<unsigned>(MaxVF, UserMaxVF);
            for (unsigned VF = MaxVF; VF >= 2; VF /= 2) {
              bool IsVectorized = true;
              for (unsigned I = 0, E = Slice.size(); I < E; I += VF) {
                    Slice.slice(I, std::min(VF, E - I));
                if (any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
                           [&](const auto &P) {
                               VectorizableTree[std::get<0>(P)]
                unsigned Sz = VectorizableTree.size();
                buildTreeRec(SubSlice, 0, EdgeInfo(), InterleaveFactor);
                if (Sz == VectorizableTree.size()) {
                  IsVectorized = false;
                  if (InterleaveFactor > 0) {
                    VF = 2 * (MaxVF / InterleaveFactor);
                    InterleaveFactor = 0;
        NonVectorized.append(SortedNonVectorized);
        return NonVectorized;
  for (const auto &GLs : GatheredLoads) {
    const auto &Ref = GLs.second;
    if (!Ref.empty() && !NonVectorized.empty() &&
            Ref.begin(), Ref.end(), 0u,
            [](unsigned S, ArrayRef<std::pair<LoadInst *, int64_t>> LoadsDists)
                -> unsigned { return S + LoadsDists.size(); }) !=
            NonVectorized.size() &&
        IsMaskedGatherSupported(NonVectorized)) {
      for (LoadInst *LI : NonVectorized) {
      (void)ProcessGatheredLoads(FinalGatheredLoads, true);
  for (unsigned Idx : LoadEntriesToVectorize) {
    const TreeEntry &E = *VectorizableTree[Idx];
    if (!E.ReorderIndices.empty()) {
      SmallVector<int> ReorderMask;
    buildTreeRec(GatheredScalars, 0, EdgeInfo());
  if (static_cast<unsigned>(*GatheredLoadsEntriesFirst) ==
      VectorizableTree.size())
    GatheredLoadsEntriesFirst.reset();
                                     bool AllowAlternate) {
      isValidForAlternation(I->getOpcode())) {
    std::pair<size_t, size_t> OpVals =
        if (CI->isCommutative())
          SubKey = hash_value(Gep->getPointerOperand());
  return std::make_pair(Key, SubKey);
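/// Estimates whether an alternate-opcode (two-opcode shuffle) node is worth
/// forming by inspecting its operands: many unique operand instructions,
/// undefs or extra shuffles count against profitability.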
                                       Instruction *AltOp,
                                       const TargetLibraryInfo &TLI);
bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S,
  Type *ScalarTy = S.getMainOp()->getType();
  unsigned Opcode0 = S.getOpcode();
  unsigned Opcode1 = S.getAltOpcode();
  SmallBitVector OpcodeMask(getAltInstrMask(VL, ScalarTy, Opcode0, Opcode1));
                            Opcode1, OpcodeMask))
  for (unsigned I : seq<unsigned>(S.getMainOp()->getNumOperands())) {
    for (Value *V : VL) {
      Operands.back().push_back(
  if (Operands.size() == 2) {
      Candidates[0] = std::make_pair(Operands[0][I], Operands[0][I + 1]);
      Candidates[1] = std::make_pair(Operands[0][I], Operands[1][I + 1]);
      Candidates[2] = std::make_pair(Operands[1][I], Operands[0][I + 1]);
      switch (Res.value_or(0)) {
        std::swap(Operands[0][I + 1], Operands[1][I + 1]);
  DenseSet<unsigned> UniqueOpcodes;
  constexpr unsigned NumAltInsts = 3;
  unsigned NonInstCnt = 0;
  unsigned UndefCnt = 0;
  unsigned ExtraShuffleInsts = 0;
  if (Operands.size() == 2) {
    if (Operands.front() == Operands.back()) {
          return is_contained(Operands.back(), V);
      ++ExtraShuffleInsts;
  const Loop *L = LI->getLoopFor(S.getMainOp()->getParent());
  DenseMap<Value *, unsigned> Uniques;
      if (!Res.second && Res.first->second == 1)
        ++ExtraShuffleInsts;
      ++Res.first->getSecond();
        UniqueOpcodes.insert(I->getOpcode());
      else if (Res.second)
  return none_of(Uniques, [&](const auto &P) {
    return P.first->hasNUsesOrMore(P.second + 1) &&
           none_of(P.first->users(), [&](User *U) {
             return isVectorized(U) || Uniques.contains(U);
         (UndefCnt < (VL.size() - 1) * S.getMainOp()->getNumOperands() &&
          (UniqueOpcodes.size() + NonInstCnt + ExtraShuffleInsts +
           NumAltInsts) < S.getMainOp()->getNumOperands() * VL.size());
                            const unsigned VF, unsigned MinBW,
static std::pair<InstructionCost, InstructionCost>
    FMF = FPCI->getFastMathFlags();
                            LibCost.isValid() ? LibCost : ScalarLimit);
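/// Decides, per opcode, how a bundle of scalars can be vectorized: a plain
/// vector node, one of the load-specific states (strided, compressed,
/// scattered), or a gather when predicates, operand types, pointers or call
/// shapes do not match across the bundle.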
BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
    bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
    SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo) {
         "Expected instructions with same/alternate opcodes only.");
  unsigned ShuffleOrOp =
      S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
  switch (ShuffleOrOp) {
  case Instruction::PHI: {
      return TreeEntry::NeedToGather;
    for (Value *V : VL) {
      for (Value *Incoming : PHI->incoming_values()) {
        if (Term && Term->isTerminator()) {
                     << "SLP: Need to swizzle PHINodes (terminator use).\n");
          return TreeEntry::NeedToGather;
    return TreeEntry::Vectorize;
  case Instruction::ExtractElement:
      return TreeEntry::NeedToGather;
  case Instruction::ExtractValue: {
    bool Reuse = canReuseExtract(VL, CurrentOrder);
      return TreeEntry::NeedToGather;
    if (Reuse || !CurrentOrder.empty())
      return TreeEntry::Vectorize;
    return TreeEntry::NeedToGather;
  case Instruction::InsertElement: {
    for (Value *V : VL) {
        LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement/poison vector.\n");
        return TreeEntry::NeedToGather;
           "Non-constant or undef index?");
          return !SourceVectors.contains(V);
      LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
                           "different source vectors.\n");
      return TreeEntry::NeedToGather;
          return SourceVectors.contains(V) && !V->hasOneUse();
      LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
                           "multiple uses.\n");
      return TreeEntry::NeedToGather;
    return TreeEntry::Vectorize;
  case Instruction::Load: {
    auto IsGatheredNode = [&]() {
      if (!GatheredLoadsEntriesFirst)
      return any_of(getTreeEntries(V), [&](const TreeEntry *TE) {
        return TE->Idx >= *GatheredLoadsEntriesFirst;
      return TreeEntry::Vectorize;
      if (!IsGraphTransformMode && !VectorizableTree.empty()) {
        LoadEntriesToVectorize.insert(VectorizableTree.size());
        return TreeEntry::NeedToGather;
      return IsGatheredNode() ? TreeEntry::NeedToGather
                              : TreeEntry::CompressVectorize;
      if (!IsGraphTransformMode && !VectorizableTree.empty()) {
        LoadEntriesToVectorize.insert(VectorizableTree.size());
        return TreeEntry::NeedToGather;
      return IsGatheredNode() ? TreeEntry::NeedToGather
                              : TreeEntry::ScatterVectorize;
      if (!IsGraphTransformMode && VectorizableTree.size() > 1) {
        LoadEntriesToVectorize.insert(VectorizableTree.size());
        return TreeEntry::NeedToGather;
      return IsGatheredNode() ? TreeEntry::NeedToGather
                              : TreeEntry::StridedVectorize;
      if (DL->getTypeSizeInBits(ScalarTy) !=
          DL->getTypeAllocSizeInBits(ScalarTy))
        LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
          return !LI || !LI->isSimple();
        LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
    return TreeEntry::NeedToGather;
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    for (Value *V : VL) {
            dbgs() << "SLP: Gathering casts with different src types.\n");
        return TreeEntry::NeedToGather;
    return TreeEntry::Vectorize;
  case Instruction::ICmp:
  case Instruction::FCmp: {
    for (Value *V : VL) {
      if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
          Cmp->getOperand(0)->getType() != ComparedTy) {
        LLVM_DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
        return TreeEntry::NeedToGather;
    return TreeEntry::Vectorize;
  case Instruction::Select:
  case Instruction::FNeg:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::Freeze:
    if (S.getMainOp()->getType()->isFloatingPointTy() &&
        TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) {
          return I && I->isBinaryOp() && !I->isFast();
      return TreeEntry::NeedToGather;
    return TreeEntry::Vectorize;
  case Instruction::GetElementPtr: {
    for (Value *V : VL) {
      if (I->getNumOperands() != 2) {
        LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
        return TreeEntry::NeedToGather;
    for (Value *V : VL) {
      Type *CurTy = GEP->getSourceElementType();
      if (Ty0 != CurTy) {
        LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
        return TreeEntry::NeedToGather;
    for (Value *V : VL) {
      auto *Op = I->getOperand(1);
          (Op->getType() != Ty1 &&
             Op->getType()->getScalarSizeInBits() >
                 DL->getIndexSizeInBits(
                     V->getType()->getPointerAddressSpace())))) {
            dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
        return TreeEntry::NeedToGather;
    return TreeEntry::Vectorize;
  case Instruction::Store: {
    llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
    if (DL->getTypeSizeInBits(ScalarTy) !=
        DL->getTypeAllocSizeInBits(ScalarTy)) {
      LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n");
      return TreeEntry::NeedToGather;
    for (Value *V : VL) {
      if (!SI->isSimple()) {
        return TreeEntry::NeedToGather;
      if (CurrentOrder.empty()) {
        Ptr0 = PointerOps.front();
        PtrN = PointerOps.back();
        Ptr0 = PointerOps[CurrentOrder.front()];
        PtrN = PointerOps[CurrentOrder.back()];
      std::optional<int64_t> Dist =
      if (static_cast<uint64_t>(*Dist) == VL.size() - 1)
        return TreeEntry::Vectorize;
    return TreeEntry::NeedToGather;
  case Instruction::Call: {
    if (S.getMainOp()->getType()->isFloatingPointTy() &&
        TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) {
          return I && !I->isFast();
      return TreeEntry::NeedToGather;
    Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
      return TreeEntry::NeedToGather;
    unsigned NumArgs = CI->arg_size();
    for (unsigned J = 0; J != NumArgs; ++J)
    for (Value *V : VL) {
           VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) ||
        LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V
        return TreeEntry::NeedToGather;
      for (unsigned J = 0; J != NumArgs; ++J) {
          if (ScalarArgs[J] != A1J) {
                       << "SLP: mismatched arguments in call:" << *CI
                       << " argument " << ScalarArgs[J] << "!=" << A1J << "\n");
            return TreeEntry::NeedToGather;
        LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI
                          << "!=" << *V << '\n');
        return TreeEntry::NeedToGather;
    auto *VecTy = getWidenedType(S.getMainOp()->getType(), VL.size());
    if (!VecCallCosts.first.isValid() && !VecCallCosts.second.isValid())
      return TreeEntry::NeedToGather;
    return TreeEntry::Vectorize;
  case Instruction::ShuffleVector: {
    if (!S.isAltShuffle()) {
        return TreeEntry::Vectorize;
      LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
      return TreeEntry::NeedToGather;
          << "SLP: ShuffleVector not vectorized, operands are buildvector and "
             "the whole alt sequence is not profitable.\n");
      return TreeEntry::NeedToGather;
    return TreeEntry::Vectorize;
    return TreeEntry::NeedToGather;
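/// Helper that gathers incoming values for a bundle of phis per incoming
/// block, with a fast path for few incoming values and a slower path that
/// keeps operands consistent for blocks with several incoming edges.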
  PHINode *Main = nullptr;
  PHIHandler() = delete;
      : DT(DT), Main(Main), Phis(Phis),
        Operands(Main->getNumIncomingValues(),
  void buildOperands() {
    constexpr unsigned FastLimit = 4;
      for (auto [Idx, V] : enumerate(Phis)) {
               "Expected isa instruction or poison value.");
          Operands[I][Idx] = V;
          if (P->getIncomingBlock(I) == InBB)
            Operands[I][Idx] = P->getIncomingValue(I);
            Operands[I][Idx] = P->getIncomingValueForBlock(InBB);
    SmallMapVector<BasicBlock *, SmallVector<unsigned>, 4>
    for (auto [Idx, V] : enumerate(Phis)) {
        Operands[I][Idx] = V;
          Operands[I][Idx] = P->getIncomingValue(I);
        auto *It = Blocks.find(InBB);
        if (It == Blocks.end())
        Operands[It->second.front()][Idx] = P->getIncomingValue(I);
    for (const auto &P : Blocks) {
      ArrayRef<unsigned> IncomingValues = P.second;
      if (IncomingValues.size() <= 1)
      for (unsigned I : IncomingValues) {
               [&](const auto &Data) {
                 return !Data.value() ||
                        Data.value() == Operands[BasicI][Data.index()];
               "Expected empty operands list.");
        Operands[I] = Operands[BasicI];
static std::pair<Instruction *, Instruction *>
  for (Value *V : VL) {
    if (MainOp->getOpcode() == I->getOpcode()) {
         "Expected different main and alt instructions.");
  return std::make_pair(MainOp, AltOp);
                              const InstructionsState &S,
                              bool TryPad = false) {
  for (Value *V : VL) {
  size_t NumUniqueScalarValues = UniqueValues.size();
  if (NumUniqueScalarValues == VL.size() &&
    ReuseShuffleIndices.clear();
    if ((UserTreeIdx.UserTE &&
         UserTreeIdx.UserTE->hasNonWholeRegisterOrNonPowerOf2Vec(TTI)) ||
      LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
                           "for nodes with padding.\n");
      ReuseShuffleIndices.clear();
    if (NumUniqueScalarValues <= 1 || !IsFullVectors ||
      if (TryPad && UniquePositions.size() > 1 && NumUniqueScalarValues > 1 &&
          S.getMainOp()->isSafeToRemove() &&
          (S.areInstructionsWithCopyableElements() ||
            TTI, UniqueValues.front()->getType(), UniqueValues.size());
        PWSz = std::min<unsigned>(PWSz, VL.size());
        if (PWSz == VL.size()) {
          ReuseShuffleIndices.clear();
                                                UniqueValues.end());
          PaddedUniqueValues.append(
              PWSz - UniqueValues.size(),
          if ((!S.areInstructionsWithCopyableElements() &&
              (S.areInstructionsWithCopyableElements() && S.isMulDivLikeOp() &&
               (S.getMainOp()->isIntDivRem() || S.getMainOp()->isFPDivRem() ||
            ReuseShuffleIndices.clear();
          VL = std::move(PaddedUniqueValues);
      ReuseShuffleIndices.clear();
    VL = std::move(UniqueValues);
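/// Checks whether an alternate-opcode bundle should instead be split into
/// two single-opcode subnodes: the scalars are partitioned by opcode and the
/// cost of the split layout (including the reconnecting shuffle/insert) is
/// compared against the original alternate node.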
                              const InstructionsState &LocalState,
                              SmallVectorImpl<Value *> &Op1,
                              SmallVectorImpl<Value *> &Op2,
  constexpr unsigned SmallNodeSize = 4;
  if (VL.size() <= SmallNodeSize || TTI->preferAlternateOpcodeVectorization() ||
  LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *LocalState.getMainOp()
  for (TreeEntry *E : getSplitTreeEntries(LocalState.getMainOp())) {
    if (E->isSame(VL)) {
                 << *LocalState.getMainOp() << ".\n");
  ReorderIndices.assign(VL.size(), VL.size());
  SmallBitVector Op1Indices(VL.size());
      Op1Indices.set(Idx);
    if ((LocalState.getAltOpcode() != LocalState.getOpcode() &&
        (LocalState.getAltOpcode() == LocalState.getOpcode() &&
                               LocalState.getAltOp(), *TLI))) {
      Op1Indices.set(Idx);
  unsigned Opcode0 = LocalState.getOpcode();
  unsigned Opcode1 = LocalState.getAltOpcode();
  SmallBitVector OpcodeMask(getAltInstrMask(VL, ScalarTy, Opcode0, Opcode1));
  if (UOp1.size() <= 1 || UOp2.size() <= 1 ||
      TTI->isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask) ||
  unsigned Op1Cnt = 0, Op2Cnt = Op1.size();
    if (Op1Indices.test(Idx)) {
      ReorderIndices[Op1Cnt] = Idx;
      ReorderIndices[Op2Cnt] = Idx;
    ReorderIndices.clear();
  SmallVector<int> Mask;
  if (!ReorderIndices.empty())
  unsigned NumParts = TTI->getNumberOfParts(VecTy);
  if (NumParts >= VL.size())
  FixedVectorType *SubVecTy =
  if (!LocalState.isCmpOp() && NumParts <= 1 &&
      (Mask.empty() || InsertCost >= NewShuffleCost))
  if ((LocalState.getMainOp()->isBinaryOp() &&
       LocalState.getAltOp()->isBinaryOp() &&
       (LocalState.isShiftOp() || LocalState.isBitwiseLogicOp() ||
        LocalState.isAddSubLikeOp() || LocalState.isMulDivLikeOp())) ||
      (LocalState.getMainOp()->isCast() && LocalState.getAltOp()->isCast()) ||
      (LocalState.getMainOp()->isUnaryOp() &&
       LocalState.getAltOp()->isUnaryOp())) {
        TTI->getArithmeticInstrCost(Opcode0, VecTy, Kind) +
        TTI->getArithmeticInstrCost(Opcode1, VecTy, Kind);
      OriginalMask[Idx] = Idx + (Op1Indices.test(Idx) ? 0 : VL.size());
                         VecTy, OriginalMask, Kind);
        TTI->getArithmeticInstrCost(Opcode0, Op1VecTy, Kind) +
        TTI->getArithmeticInstrCost(Opcode1, Op2VecTy, Kind);
        NewVecOpsCost + InsertCost +
        (!VectorizableTree.empty() && VectorizableTree.front()->hasState() &&
         VectorizableTree.front()->getOpcode() == Instruction::Store
    if (NewCost >= OriginalCost)
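/// Analyzes bundles for "copyable" vectorization: it searches for a dominant
/// main opcode, models the non-matching scalars as idempotent instructions
/// of that opcode (such as adding zero), and builds the operand lists for
/// the resulting nodes, rejecting unprofitable states.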
class InstructionsCompatibilityAnalysis {
  const DataLayout &DL;
  const TargetTransformInfo &TTI;
  const TargetLibraryInfo &TLI;
  unsigned MainOpcode = 0;
  static bool isSupportedOpcode(const unsigned Opcode) {
    return Opcode == Instruction::Add || Opcode == Instruction::LShr ||
           Opcode == Instruction::Shl || Opcode == Instruction::SDiv ||
           Opcode == Instruction::UDiv || Opcode == Instruction::And ||
           Opcode == Instruction::Or || Opcode == Instruction::Xor;
    auto IsSupportedInstruction = [&](Instruction *I, bool AnyUndef) {
      if (AnyUndef && (I->isIntDivRem() || I->isFPDivRem() || isa<CallInst>(I)))
      return I && isSupportedOpcode(I->getOpcode()) &&
    SmallDenseSet<Value *, 8> Operands;
    SmallMapVector<unsigned, SmallVector<Instruction *>, 4> Candidates;
    bool AnyUndef = false;
    for (Value *V : VL) {
      if (Candidates.empty()) {
        Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
        Operands.insert(I->op_begin(), I->op_end());
      if (Parent == I->getParent()) {
        Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
        Operands.insert(I->op_begin(), I->op_end());
      auto *NodeA = DT.getNode(Parent);
      auto *NodeB = DT.getNode(I->getParent());
      assert(NodeA && "Should only process reachable instructions");
      assert(NodeB && "Should only process reachable instructions");
      assert((NodeA == NodeB) ==
                 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
             "Different nodes should have different DFS numbers");
      if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn()) {
        Candidates.clear();
        Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
        Operands.insert(I->op_begin(), I->op_end());
    unsigned BestOpcodeNum = 0;
    for (const auto &P : Candidates) {
      if (P.second.size() < BestOpcodeNum)
      for (Instruction *I : P.second) {
        if (IsSupportedInstruction(I, AnyUndef) && !Operands.contains(I)) {
          BestOpcodeNum = P.second.size();
      return I && I->getParent() == MainOp->getParent() &&
  Value *selectBestIdempotentValue() const {
    assert(isSupportedOpcode(MainOpcode) && "Unsupported opcode");
    if (!S.isCopyableElement(V))
    assert(isSupportedOpcode(MainOpcode) && "Unsupported opcode");
    return {V, selectBestIdempotentValue()};
                             SmallVectorImpl<BoUpSLP::ValueList> &Operands) const {
    unsigned ShuffleOrOp =
        S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
    switch (ShuffleOrOp) {
    case Instruction::PHI: {
      PHIHandler Handler(DT, PH, VL);
      Handler.buildOperands();
      Operands.assign(PH->getNumOperands(), {});
        Operands[I].assign(Handler.getOperands(I).begin(),
                           Handler.getOperands(I).end());
    case Instruction::ExtractValue:
    case Instruction::ExtractElement:
    case Instruction::InsertElement:
    case Instruction::Load:
      for (auto [V, Op] : zip(VL, Operands.back())) {
          Op = LI->getPointerOperand();
    case Instruction::ZExt:
    case Instruction::SExt:
    case Instruction::FPToUI:
    case Instruction::FPToSI:
    case Instruction::FPExt:
    case Instruction::PtrToInt:
    case Instruction::IntToPtr:
    case Instruction::SIToFP:
    case Instruction::UIToFP:
    case Instruction::Trunc:
    case Instruction::FPTrunc:
    case Instruction::BitCast:
    case Instruction::ICmp:
    case Instruction::FCmp:
    case Instruction::Select:
    case Instruction::FNeg:
    case Instruction::Add:
    case Instruction::FAdd:
    case Instruction::Sub:
    case Instruction::FSub:
    case Instruction::Mul:
    case Instruction::FMul:
    case Instruction::UDiv:
    case Instruction::SDiv:
    case Instruction::FDiv:
    case Instruction::URem:
    case Instruction::SRem:
    case Instruction::FRem:
    case Instruction::Shl:
    case Instruction::LShr:
    case Instruction::AShr:
    case Instruction::And:
    case Instruction::Or:
    case Instruction::Xor:
    case Instruction::Freeze:
    case Instruction::Store:
    case Instruction::ShuffleVector:
        auto [Op, ConvertedOps] = convertTo(I, S);
    case Instruction::GetElementPtr: {
      const unsigned IndexIdx = 1;
        return !GEP || VL0Ty == GEP->getOperand(IndexIdx)->getType();
              : DL.getIndexType(cast<GetElementPtrInst>(VL0)
                                    ->getPointerOperandType()
                                    ->getScalarType());
          Operands[0][Idx] = V;
          Operands[1][Idx] = ConstantInt::getNullValue(Ty);
        Operands[0][Idx] = GEP->getPointerOperand();
        auto *Op = GEP->getOperand(IndexIdx);
                    CI, Ty, CI->getValue().isSignBitSet(), DL)
    case Instruction::Call: {
      for (Value *V : VL) {
        Ops.push_back(I ? I->getOperand(Idx)
  InstructionsCompatibilityAnalysis(DominatorTree &DT, const DataLayout &DL,
                                    const TargetTransformInfo &TTI,
                                    const TargetLibraryInfo &TLI)
      bool TryCopyableElementsVectorization,
      bool WithProfitabilityCheck = false,
      bool SkipSameCodeCheck = false) {
    InstructionsState S = (SkipSameCodeCheck || !allSameBlock(VL))
                              ? InstructionsState::invalid()
      findAndSetMainInstruction(VL, R);
      return InstructionsState::invalid();
    S = InstructionsState(MainOp, MainOp, true);
    if (!WithProfitabilityCheck)
    auto BuildCandidates =
        [](SmallVectorImpl<std::pair<Value *, Value *>> &Candidates, Value *V1,
          if (I1 && I2 && I1->getOpcode() == I2->getOpcode() &&
              I1->getParent() != I2->getParent())
    if (VL.size() == 2) {
      BuildCandidates(Candidates1, Operands[0][0], Operands[0][1]);
      BuildCandidates(Candidates2, Operands[1][0], Operands[1][1]);
      bool Res = !Candidates1.empty() && !Candidates2.empty() &&
                 R.findBestRootPair(Candidates1) &&
                 R.findBestRootPair(Candidates2);
        Candidates1.clear();
        Candidates2.clear();
        BuildCandidates(Candidates1, Operands[0][0], Operands[1][1]);
        BuildCandidates(Candidates2, Operands[1][0], Operands[0][1]);
        Res = !Candidates1.empty() && !Candidates2.empty() &&
              R.findBestRootPair(Candidates1) &&
              R.findBestRootPair(Candidates2);
        return InstructionsState::invalid();
    FixedVectorType *VecTy =
    switch (MainOpcode) {
    case Instruction::Add:
    case Instruction::LShr:
    case Instruction::Shl:
    case Instruction::SDiv:
    case Instruction::UDiv:
    case Instruction::And:
    case Instruction::Or:
    case Instruction::Xor:
    if (VectorCost > ScalarCost)
      return InstructionsState::invalid();
    assert(Operands.size() == 2 && "Unexpected number of operands!");
    unsigned CopyableNum =
        count_if(VL, [&](Value *V) { return S.isCopyableElement(V); });
    if (CopyableNum < VL.size() / 2)
    const unsigned Limit = VL.size() / 24;
    if ((CopyableNum >= VL.size() - Limit ||
         (CopyableNum >= VL.size() - 1 && VL.size() > 4) ||
      return InstructionsState::invalid();
    for (auto &Ops : Operands) {
        return InstructionsState::invalid();
      constexpr unsigned Limit = 4;
      if (Operands.front().size() >= Limit) {
        SmallDenseMap<const Value *, unsigned> Counters;
          return C.second == 1;
      InstructionsCompatibilityAnalysis Analysis(DT, DL, TTI, TLI);
      InstructionsState OpS = Analysis.buildInstructionsState(
      if (!OpS || (OpS.getOpcode() == Instruction::PHI && !allSameBlock(Ops)))
      unsigned CopyableNum =
      return CopyableNum <= VL.size() / 2;
    if (!CheckOperand(Operands.front()))
      return InstructionsState::invalid();
    assert(S && "Invalid state!");
    if (S.areInstructionsWithCopyableElements()) {
      MainOp = S.getMainOp();
      MainOpcode = S.getOpcode();
      for (auto [OperandIdx, Operand] : enumerate(OperandsForValue))
        Operands[OperandIdx][Idx] = Operand;
    buildOriginalOperands(S, VL, Operands);
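/// Performs the legality checks that precede node construction: perfect
/// diamond merges, maximum recursion depth, scalable vector types, ephemeral
/// values, already-gathered scalars and unprofitable alternate nodes all
/// force a gather (or a split attempt) instead of a vectorized entry.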
BoUpSLP::ScalarsVectorizationLegality BoUpSLP::getScalarsVectorizationLegality(
    bool TryCopyableElementsVectorization) const {
  InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
  InstructionsState S = Analysis.buildInstructionsState(
      VL, *this, TryCopyableElementsVectorization,
      true, TryCopyableElementsVectorization);
    return ScalarsVectorizationLegality(S, false,
  LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.getMainOp() << ".\n");
  for (TreeEntry *E : getTreeEntries(S.getMainOp())) {
    if (E->isSame(VL)) {
      LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.getMainOp()
      return ScalarsVectorizationLegality(S, false);
      (S.getOpcode() == Instruction::PHI && isa<PHINode>(V) &&
       LI->getLoopFor(S.getMainOp()->getParent()) &&
    return ScalarsVectorizationLegality(S, false);
      !(S && !S.isAltShuffle() && VL.size() >= 4 &&
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
    return ScalarsVectorizationLegality(S, false);
  if (S && S.getOpcode() == Instruction::ExtractElement &&
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n");
    return ScalarsVectorizationLegality(S, false);
    return ScalarsVectorizationLegality(S, false,
  if (!S || !S.isAltShuffle() || VL.size() > 2)
    SmallVector<unsigned, 8> InstsCount;
    for (Value *V : VL) {
        return isa<Instruction>(Op) || isVectorLikeInstWithConstOps(Op);
    bool IsCommutative =
    if ((IsCommutative &&
         std::accumulate(InstsCount.begin(), InstsCount.end(), 0) < 2) ||
         all_of(InstsCount, [](unsigned ICnt) { return ICnt < 2; })))
    assert(VL.size() == 2 && "Expected only 2 alternate op instructions.");
    for (int Op : seq<int>(S.getMainOp()->getNumOperands()))
                              I2->getOperand(Op));
    if (static_cast<unsigned>(count_if(
            Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
            })) >= S.getMainOp()->getNumOperands() / 2)
    if (S.getMainOp()->getNumOperands() > 2)
    if (IsCommutative) {
      Candidates.clear();
      for (int Op = 0, E = S.getMainOp()->getNumOperands(); Op < E; ++Op)
                                I2->getOperand((Op + 1) % E));
          Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
  SmallVector<unsigned> SortedIndices;
  bool IsScatterVectorizeUserTE =
      UserTreeIdx.UserTE &&
      UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
  bool AreAllSameBlock = S.valid();
  bool AreScatterAllGEPSameBlock =
       sortPtrAccesses(VL, UserTreeIdx.UserTE->getMainOp()->getType(), *DL, *SE,
  bool AreAllSameInsts = AreAllSameBlock || AreScatterAllGEPSameBlock;
       NotProfitableForVectorization(VL)) {
    LLVM_DEBUG(dbgs() << "SLP: Try split and if failed, gathering due to "
                         "C,S,B,O, small shuffle. \n";
      return ScalarsVectorizationLegality(S, false,
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n";
    return ScalarsVectorizationLegality(S, false);
  if (S && !EphValues.empty()) {
    for (Value *V : VL) {
      if (EphValues.count(V)) {
                   << ") is ephemeral.\n");
        return ScalarsVectorizationLegality(S, false,
  if (S && S.isAltShuffle()) {
    auto GetNumVectorizedExtracted = [&]() {
          all_of(I->operands(), [&](const Use &U) {
            return isa<ExtractElementInst>(U.get());
        else if (!I->hasOneUser() && !areAllUsersVectorized(I, UserIgnoreList))
      return std::make_pair(Vectorized, Extracted);
    auto [Vectorized, Extracted] = GetNumVectorizedExtracted();
    bool PreferScalarize = !Vectorized.isAllOnes() && VL.size() == 2;
    if (!Vectorized.isAllOnes() && !PreferScalarize) {
      Type *ScalarTy = VL.front()->getType();
          false, true, Kind);
          *TTI, ScalarTy, VecTy, Vectorized,
          true, false, Kind, false);
      PreferScalarize = VectorizeCostEstimate > ScalarizeCostEstimate;
    if (PreferScalarize) {
      LLVM_DEBUG(dbgs() << "SLP: The instructions are in tree and alternate "
                           "node is not profitable.\n");
      return ScalarsVectorizationLegality(S, false);
  if (UserIgnoreList && !UserIgnoreList->empty()) {
    for (Value *V : VL) {
      if (UserIgnoreList->contains(V)) {
        LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
        return ScalarsVectorizationLegality(S, false);
  if (!AreAllSameBlock && AreScatterAllGEPSameBlock) {
    assert(VL.front()->getType()->isPointerTy() &&
           "Expected pointers only.");
    assert(It != VL.end() && "Expected at least one GEP.");
        !DT->isReachableFromEntry(BB))) {
      return ScalarsVectorizationLegality(S, false);
  return ScalarsVectorizationLegality(S, true);
11441 unsigned InterleaveFactor) {
11444 SmallVector<int> ReuseShuffleIndices;
11448 auto TrySplitNode = [&](
const InstructionsState &LocalState) {
11451 if (!canBuildSplitNode(VL, LocalState, Op1, Op2, ReorderIndices))
11454 auto Invalid = ScheduleBundle::invalid();
11455 auto *
TE = newTreeEntry(VL, TreeEntry::SplitVectorize,
Invalid, LocalState,
11456 UserTreeIdx, {}, ReorderIndices);
11461 getSameValuesTreeEntry(S.getMainOp(),
Op,
true))) {
11463 TE->CombinedEntriesWithIndices.emplace_back(VectorizableTree.size(),
11464 Idx == 0 ? 0 : Op1.
size());
11465 (void)newTreeEntry(
Op, TreeEntry::NeedToGather,
Invalid, S, {
TE, Idx});
11467 TE->CombinedEntriesWithIndices.emplace_back(VectorizableTree.size(),
11468 Idx == 0 ? 0 : Op1.
size());
  bool AreConsts = false;
  for (Value *V : VL) {
    // ...
  }
  // Gather if the bundle is only constants and PHI nodes.
  if (AreOnlyConstsWithPHIs(VL)) {
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to all constants and PHIs.\n");
    newGatherTreeEntry(VL, InstructionsState::invalid(), UserTreeIdx);
    return;
  }

  ScalarsVectorizationLegality Legality = getScalarsVectorizationLegality(
      VL, Depth, UserTreeIdx, /*TryCopyableElementsVectorization=*/false);
  InstructionsState S = Legality.getInstructionsState();
  if (!Legality.isLegal()) {
    if (Legality.trySplitVectorize()) {
      // ...
      if (MainOp && AltOp && TrySplitNode(InstructionsState(MainOp, AltOp)))
        return;
    }
    Legality = getScalarsVectorizationLegality(
        VL, Depth, UserTreeIdx, /*TryCopyableElementsVectorization=*/true);
    if (!Legality.isLegal()) {
      if (Legality.tryToFindDuplicates()) {
        // ... (compute ReuseShuffleIndices for the duplicated scalars)
      }
      newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
      return;
    }
    S = Legality.getInstructionsState();
  }

  // FIXME: investigate if profitable to split on all states.
  if (S.isAltShuffle() && TrySplitNode(S))
    return;
  // ...
  if (/* ... */) {
    newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
    return;
  }
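  // Illustrative note (simplified): the ladder above tries plain
  // vectorization first, then a SplitVectorize node, then copyable-element
  // vectorization, and only gathers as the last resort. E.g. a mixed bundle
  //   VL = { add a0,b0 ; sub a1,b1 }
  // that is rejected as a single node may still succeed as a split node with
  // one {add} half and one {sub} half.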
  bool IsScatterVectorizeUserTE =
      UserTreeIdx.UserTE &&
      UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
  OrdersType CurrentOrder;
  SmallVector<Value *> PointerOps;
  StridedPtrInfo SPtrInfo;
  TreeEntry::EntryState State = getScalarsVectorizationState(
      S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps, SPtrInfo);
  if (State == TreeEntry::NeedToGather) {
    newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
    return;
  }

  Instruction *VL0 = S.getMainOp();
  BasicBlock *BB = VL0->getParent();
  auto &BSRef = BlocksSchedules[BB];
  if (!BSRef)
    BSRef = std::make_unique<BlockScheduling>(BB);
  BlockScheduling &BS = *BSRef;

  std::optional<ScheduleBundle *> BundlePtr =
      BS.tryScheduleBundle(UniqueValues.getArrayRef(), this, S, UserTreeIdx);
#ifdef EXPENSIVE_CHECKS
  // ...
#endif
  if (!BundlePtr || (*BundlePtr && !*BundlePtr.value())) {
    LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
    // Last chance to try to vectorize alternate node.
    if (S.isAltShuffle() && ReuseShuffleIndices.empty() && TrySplitNode(S))
      return;
    newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
    NonScheduledFirst.insert(VL.front());
    if (S.getOpcode() == Instruction::Load &&
        BS.ScheduleRegionSize < BS.ScheduleRegionSizeLimit)
      registerNonVectorizableLoads(VL);
    return;
  }
  InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
  SmallVector<ValueList> Operands = Analysis.buildOperands(S, VL);
  ScheduleBundle Empty;
  ScheduleBundle &Bundle = BundlePtr.value() ? *BundlePtr.value() : Empty;
  LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");

  unsigned ShuffleOrOp =
      S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
  auto CreateOperandNodes = [&](TreeEntry *TE, const auto &Operands) {
    // Postpone PHI nodes creation.
    SmallVector<unsigned> PHIOps;
    for (unsigned I : seq<unsigned>(Operands.size())) {
      ArrayRef<Value *> Op = Operands[I];
      if (Op.empty())
        continue;
      InstructionsState S = getSameOpcode(Op, *TLI);
      if ((!S || S.getOpcode() != Instruction::PHI) || S.isAltShuffle())
        buildTreeRec(Op, Depth + 1, {TE, I});
      else
        PHIOps.push_back(I);
    }
    for (unsigned I : PHIOps)
      buildTreeRec(Operands[I], Depth + 1, {TE, I});
  };
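  // Illustrative note: CreateOperandNodes builds non-PHI operand subtrees
  // first and postpones PHI operands (collected in PHIOps). This keeps the
  // recursion over loop-carried PHI cycles from re-entering the same bundle
  // before the rest of the graph for this node exists.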
  switch (ShuffleOrOp) {
  case Instruction::PHI: {
    TreeEntry *TE =
        newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
    // ...
    TE->setOperands(Operands);
    CreateOperandNodes(TE, Operands);
    return;
  }
  case Instruction::ExtractValue:
  case Instruction::ExtractElement: {
    if (CurrentOrder.empty()) {
      LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
    } else {
      LLVM_DEBUG({
        dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
                  "with order";
        for (unsigned Idx : CurrentOrder)
          dbgs() << " " << Idx;
        dbgs() << "\n";
      });
    }
    TreeEntry *TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,
                                 ReuseShuffleIndices, CurrentOrder);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry "
                         "(ExtractValueInst/ExtractElementInst).\n";
               TE->dump());
    TE->setOperands(Operands);
    return;
  }
  case Instruction::InsertElement: {
    assert(ReuseShuffleIndices.empty() && "All inserts should be unique");

    auto OrdCompare = [](const std::pair<int, int> &P1,
                         const std::pair<int, int> &P2) {
      return P1.first > P2.first;
    };
    PriorityQueue<std::pair<int, int>, SmallVector<std::pair<int, int>>,
                  decltype(OrdCompare)>
        Indices(OrdCompare);
    for (int I = 0, E = VL.size(); I < E; ++I) {
      unsigned Idx = *getElementIndex(VL[I]);
      Indices.emplace(Idx, I);
    }
    OrdersType CurrentOrder(VL.size(), VL.size());
    bool IsIdentity = true;
    for (int I = 0, E = VL.size(); I < E; ++I) {
      CurrentOrder[Indices.top().second] = I;
      IsIdentity &= Indices.top().second == I;
      Indices.pop();
    }
    if (IsIdentity)
      CurrentOrder.clear();
    TreeEntry *TE =
        newTreeEntry(VL, Bundle, S, UserTreeIdx, {}, CurrentOrder);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (InsertElementInst).\n";
               TE->dump());
    TE->setOperands(Operands);
    buildTreeRec(TE->getOperand(1), Depth + 1, {TE, 1});
    return;
  }
  case Instruction::Load: {
    // The vectorization state computed above decides how the loads are
    // emitted: consecutive, compressed, strided or scattered.
    TreeEntry *TE = nullptr;
    switch (State) {
    case TreeEntry::Vectorize:
      TE = newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices,
                        CurrentOrder, InterleaveFactor);
      if (CurrentOrder.empty())
        LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (LoadInst).\n";
                   TE->dump());
      else
        LLVM_DEBUG(dbgs()
                       << "SLP: added a new TreeEntry (jumbled LoadInst).\n";
                   TE->dump());
      break;
    case TreeEntry::CompressVectorize:
      TE = newTreeEntry(VL, TreeEntry::CompressVectorize, Bundle, S,
                        UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
      LLVM_DEBUG(dbgs()
                     << "SLP: added a new TreeEntry (masked LoadInst + compress).\n";
                 TE->dump());
      break;
    case TreeEntry::StridedVectorize:
      TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
                        UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
      TreeEntryToStridedPtrInfoMap[TE] = SPtrInfo;
      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (strided LoadInst).\n";
                 TE->dump());
      break;
    case TreeEntry::ScatterVectorize:
      TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
                        UserTreeIdx, ReuseShuffleIndices);
      LLVM_DEBUG(dbgs()
                     << "SLP: added a new TreeEntry (non-consecutive LoadInst).\n";
                 TE->dump());
      break;
    case TreeEntry::CombinedVectorize:
    case TreeEntry::SplitVectorize:
    case TreeEntry::NeedToGather:
      llvm_unreachable("Unexpected loads state.");
    }
    if (!CurrentOrder.empty() && State != TreeEntry::ScatterVectorize) {
      assert(Operands.size() == 1 && "Expected a single operand only");
      SmallVector<int> Mask;
      // ... (reorder the pointer operands according to CurrentOrder)
    }
    TE->setOperands(Operands);
    if (State == TreeEntry::ScatterVectorize)
      buildTreeRec(PointerOps, Depth + 1, {TE, 0});
    return;
  }
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    auto [PrevMaxBW, PrevMinBW] = CastMaxMinBWSizes.value_or(
        std::make_pair(std::numeric_limits<unsigned>::min(),
                       std::numeric_limits<unsigned>::max()));
    if (ShuffleOrOp == Instruction::ZExt ||
        ShuffleOrOp == Instruction::SExt) {
      CastMaxMinBWSizes = std::make_pair(
          std::max<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
                             PrevMaxBW),
          std::min<unsigned>(
              DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
              PrevMinBW));
    } else if (ShuffleOrOp == Instruction::Trunc) {
      CastMaxMinBWSizes = std::make_pair(
          std::max<unsigned>(
              DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
              PrevMaxBW),
          std::min<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
                             PrevMinBW));
    }
    TreeEntry *TE =
        newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CastInst).\n";
               TE->dump());
    TE->setOperands(Operands);
    for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
      buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
    if (ShuffleOrOp == Instruction::Trunc) {
      ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
    } else if (ShuffleOrOp == Instruction::SIToFP ||
               ShuffleOrOp == Instruction::UIToFP) {
      unsigned NumSignBits =
          ComputeNumSignBits(VL0->getOperand(0), *DL, AC, nullptr, DT);
      if (auto *OpI = dyn_cast<Instruction>(VL0->getOperand(0))) {
        APInt Mask = DB->getDemandedBits(OpI);
        NumSignBits = std::max(NumSignBits, Mask.countl_zero());
      }
      if (NumSignBits * 2 >=
          DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
        ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
    }
    return;
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    // Check that all of the compares have the same predicate.
    CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
    TreeEntry *TE =
        newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
    // ...
    assert(/* ... */ && "Commutative Predicate mismatch");
    Operands.front() = Ops.getVL(0);
    Operands.back() = Ops.getVL(1);
    // ...
    for (Value *V : VL) {
      auto *Cmp = cast<CmpInst>(V);
      if (Cmp->getPredicate() != P0) {
        // ... (swap this compare's operand lanes so both sides line up)
      }
    }
    TE->setOperands(Operands);
    buildTreeRec(Operands.front(), Depth + 1, {TE, 0});
    buildTreeRec(Operands.back(), Depth + 1, {TE, 1});
    if (ShuffleOrOp == Instruction::ICmp) {
      unsigned NumSignBits0 =
          ComputeNumSignBits(VL0->getOperand(0), *DL, AC, nullptr, DT);
      if (NumSignBits0 * 2 >=
          DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
        ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
      unsigned NumSignBits1 =
          ComputeNumSignBits(VL0->getOperand(1), *DL, AC, nullptr, DT);
      if (NumSignBits1 * 2 >=
          DL->getTypeSizeInBits(VL0->getOperand(1)->getType()))
        ExtraBitWidthNodes.insert(getOperandEntry(TE, 1)->Idx);
    }
    return;
  }
  case Instruction::Select:
  case Instruction::FNeg:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::Freeze: {
    TreeEntry *TE =
        newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry "
                         "(SelectInst/UnaryOperator/BinaryOperator/FreezeInst).\n";
               TE->dump());
    if (/* ... */) {
      // ...
      Operands[0] = Ops.getVL(0);
      Operands[1] = Ops.getVL(1);
    }
    TE->setOperands(Operands);
    for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
      buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
    return;
  }
  case Instruction::GetElementPtr: {
    TreeEntry *TE =
        newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (GetElementPtrInst).\n";
               TE->dump());
    TE->setOperands(Operands);
    for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
      buildTreeRec(Operands[I], Depth + 1, {TE, I});
    return;
  }
  case Instruction::Store: {
    bool Consecutive = CurrentOrder.empty();
    // ...
    TreeEntry *TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,
                                 ReuseShuffleIndices, CurrentOrder);
    if (Consecutive)
      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (StoreInst).\n";
                 TE->dump());
    else
      LLVM_DEBUG(dbgs()
                     << "SLP: added a new TreeEntry (jumbled StoreInst).\n";
                 TE->dump());
    TE->setOperands(Operands);
    buildTreeRec(TE->getOperand(0), Depth + 1, {TE, 0});
    return;
  }
  case Instruction::Call: {
    // Check that the calls are all to the same vectorizable intrinsic or
    // library function.
    // ...
    TreeEntry *TE =
        newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CallInst).\n";
               TE->dump());
    if (/* ... */) {
      Operands[0] = Ops.getVL(0);
      Operands[1] = Ops.getVL(1);
    }
    TE->setOperands(Operands);
    for (unsigned I : seq<unsigned>(VL0->getNumOperands())) {
      // ... (skip scalar intrinsic arguments that stay scalar)
      buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
    }
    return;
  }
  case Instruction::ShuffleVector: {
    TreeEntry *TE =
        newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
    if (S.isAltShuffle()) {
      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (isAltShuffle).\n";
                 TE->dump());
    } else {
      LLVM_DEBUG(dbgs()
                     << "SLP: added a new TreeEntry (ShuffleVectorInst).\n";
                 TE->dump());
    }
    if (/* ... */) {
      assert(/* ... */ &&
             "Expected different main/alternate predicates.");
      // ...
      TE->setOperands(Operands);
      buildTreeRec(Operands.front(), Depth + 1, {TE, 0});
      buildTreeRec(Operands.back(), Depth + 1, {TE, 1});
      return;
    }
    if (/* ... */) {
      Operands[0] = Ops.getVL(0);
      Operands[1] = Ops.getVL(1);
    }
    TE->setOperands(Operands);
    for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
      buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
    return;
  }
  default:
    break;
  }
  llvm_unreachable("Unexpected vectorization of the instructions.");
}
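// Illustrative example (simplified): for a bundle of two consecutive adds
//   VL = { %x0 = add i32 %a0, %b0 ; %x1 = add i32 %a1, %b1 }
// the binary-operator case above creates one Vectorize TreeEntry for
// {%x0, %x1} and recurses via buildTreeRec({%a0, %a1}, Depth + 1, {TE, 0})
// and buildTreeRec({%b0, %b1}, Depth + 1, {TE, 1}), so each operand bundle
// becomes a child node (vectorized or gathered) of this entry.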
unsigned BoUpSLP::canMapToVector(Type *T) const {
  unsigned N = 1;
  Type *EltTy = T;
  while (isa<StructType, ArrayType, FixedVectorType>(EltTy)) {
    if (auto *ST = dyn_cast<StructType>(EltTy)) {
      // Check that struct is homogeneous.
      for (const auto *Ty : ST->elements())
        if (Ty != *ST->element_begin())
          return 0;
      N *= ST->getNumElements();
      EltTy = *ST->element_begin();
    } else if (auto *AT = dyn_cast<ArrayType>(EltTy)) {
      N *= AT->getNumElements();
      EltTy = AT->getElementType();
    } else {
      auto *VT = cast<FixedVectorType>(EltTy);
      N *= VT->getNumElements();
      EltTy = VT->getElementType();
    }
  }
  if (!isValidElementType(EltTy))
    return 0;
  size_t VTSize = DL->getTypeStoreSizeInBits(getWidenedType(EltTy, N));
  if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
      VTSize != DL->getTypeStoreSizeInBits(T))
    return 0;
  return N;
}
bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL,
                              SmallVectorImpl<unsigned> &CurrentOrder,
                              bool ResizeAllowed) const {
  const auto *It = find_if(VL, IsaPred<ExtractElementInst, ExtractValueInst>);
  assert(It != VL.end() && "Expected at least one extract instruction.");
  auto *E0 = cast<Instruction>(*It);
  // Check if all of the extracts come from the same vector and from the
  // same position.
  Value *Vec = E0->getOperand(0);

  CurrentOrder.clear();

  // We have to extract from a vector/aggregate with the same number of
  // elements.
  unsigned NElts;
  if (E0->getOpcode() == Instruction::ExtractValue) {
    NElts = canMapToVector(Vec->getType());
    if (!NElts)
      return false;
    // ...
  } else {
    NElts = cast<FixedVectorType>(Vec->getType())->getNumElements();
  }

  unsigned E = VL.size();
  if (!ResizeAllowed && NElts != E)
    return false;
  SmallVector<int> Indices(E, PoisonMaskElem);
  unsigned MinIdx = NElts, MaxIdx = 0;
  for (auto [I, V] : enumerate(VL)) {
    auto *Inst = dyn_cast<Instruction>(V);
    if (!Inst)
      continue;
    if (Inst->getOperand(0) != Vec)
      return false;
    std::optional<unsigned> Idx = getExtractIndex(Inst);
    if (!Idx)
      return false;
    const unsigned ExtIdx = *Idx;
    if (ExtIdx >= NElts)
      continue;
    Indices[I] = ExtIdx;
    if (MinIdx > ExtIdx)
      MinIdx = ExtIdx;
    if (MaxIdx < ExtIdx)
      MaxIdx = ExtIdx;
  }
  if (MaxIdx - MinIdx + 1 > E)
    return false;
  if (MaxIdx + 1 <= E)
    MinIdx = 0;

  // Check that all of the indices extract from the correct offset. Assign
  // the initial value E to each slot so reuse of an index is detectable.
  bool ShouldKeepOrder = true;
  CurrentOrder.assign(E, E);
  for (unsigned I = 0; I < E; ++I) {
    if (Indices[I] == PoisonMaskElem)
      continue;
    const unsigned ExtIdx = Indices[I] - MinIdx;
    if (CurrentOrder[ExtIdx] != E) {
      CurrentOrder.clear();
      return false;
    }
    ShouldKeepOrder &= ExtIdx == I;
    CurrentOrder[ExtIdx] = I;
  }
  if (ShouldKeepOrder)
    CurrentOrder.clear();

  return ShouldKeepOrder;
}
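// Illustrative example (simplified): for
//   VL = { extractelement %v, 2 ; extractelement %v, 3 ;
//          extractelement %v, 4 ; extractelement %v, 5 }
// with NElts == 8, MinIdx becomes 2 and the rebased indices {0,1,2,3} are
// already in order, so ShouldKeepOrder stays true, CurrentOrder is cleared
// and the extracts can be served by reusing a subvector of %v instead of a
// per-element gather.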
bool BoUpSLP::areAllUsersVectorized(
    Instruction *I, const SmallDenseSet<Value *> *VectorizedVals) const {
  return (I->hasOneUse() &&
          (!VectorizedVals || VectorizedVals->contains(I))) ||
         all_of(I->users(), [this](User *U) {
           return isVectorized(U) || isVectorLikeInstWithConstOps(U) ||
                  (isa<ExtractElementInst>(U) && MustGather.contains(U));
         });
}
void BoUpSLP::TreeEntry::buildAltOpShuffleMask(
    const function_ref<bool(Instruction *)> IsAltOp, SmallVectorImpl<int> &Mask,
    SmallVectorImpl<Value *> *OpScalars,
    SmallVectorImpl<Value *> *AltScalars) const {
  unsigned Sz = Scalars.size();
  Mask.assign(Sz, PoisonMaskElem);
  SmallVector<int> OrderMask;
  if (!ReorderIndices.empty())
    inversePermutation(ReorderIndices, OrderMask);
  for (unsigned I = 0; I < Sz; ++I) {
    unsigned Idx = I;
    if (!ReorderIndices.empty())
      Idx = OrderMask[I];
    if (isa<PoisonValue>(Scalars[Idx]))
      continue;
    auto *OpInst = cast<Instruction>(Scalars[Idx]);
    if (IsAltOp(OpInst)) {
      Mask[I] = Sz + Idx;
      if (AltScalars)
        AltScalars->push_back(OpInst);
    } else {
      Mask[I] = Idx;
      if (OpScalars)
        OpScalars->push_back(OpInst);
    }
  }
  if (!ReuseShuffleIndices.empty()) {
    SmallVector<int> NewMask(ReuseShuffleIndices.size(), PoisonMaskElem);
    transform(ReuseShuffleIndices, NewMask.begin(), [&Mask](int Idx) {
      return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem;
    });
    Mask.swap(NewMask);
  }
}

// ...
  return InstructionsState(MainOp, AltOp).getMatchingMainOpOrAltOp(I) ==
         MainOp;
// ...
  assert(MainP != AltP && "Expected different main/alternate predicates.");
  // ...
  assert((MainP == P || AltP == P || MainP == SwappedP || AltP == SwappedP) &&
         "CmpInst expected to match either main or alternate predicate or "
         "their swap.");
  return MainP != P && MainP != SwappedP;
  // ...
  return InstructionsState(MainOp, AltOp).getMatchingMainOpOrAltOp(I) == AltOp;
}

TTI::OperandValueInfo BoUpSLP::getOperandInfo(ArrayRef<Value *> Ops) {
  const auto *Op0 = Ops.front();
  // ...
  const bool IsPowerOfTwo = all_of(Ops, [](Value *V) {
    auto *CI = dyn_cast<ConstantInt>(V);
    return CI && CI->getValue().isPowerOf2();
  });
  const bool IsNegatedPowerOfTwo = all_of(Ops, [](Value *V) {
    auto *CI = dyn_cast<ConstantInt>(V);
    return CI && CI->getValue().isNegatedPowerOf2();
  });
  TTI::OperandValueKind VK = TTI::OK_AnyValue;
  if (IsConstant && IsUniform)
    VK = TTI::OK_UniformConstantValue;
  else if (IsConstant)
    VK = TTI::OK_NonUniformConstantValue;
  else if (IsUniform)
    VK = TTI::OK_UniformValue;
  // ...
}
class BaseShuffleAnalysis {
protected:
  Type *ScalarTy = nullptr;

  BaseShuffleAnalysis(Type *ScalarTy) : ScalarTy(ScalarTy) {}

  /// V is expected to be a vectorized value. With REVEC, the scalar type may
  /// itself be a small vector, so the effective VF is
  /// VNumElements / ScalarTyNumElements.
  unsigned getVF(Value *V) const {
    assert(V && "V cannot be nullptr");
    assert(isa<FixedVectorType>(V->getType()) &&
           "V does not have FixedVectorType");
    assert(ScalarTy && "ScalarTy cannot be nullptr");
    unsigned ScalarTyNumElements = getNumElements(ScalarTy);
    unsigned VNumElements =
        cast<FixedVectorType>(V->getType())->getNumElements();
    assert(VNumElements > ScalarTyNumElements &&
           "the number of elements of V is not large enough");
    assert(VNumElements % ScalarTyNumElements == 0 &&
           "the number of elements of V is not a vectorized value");
    return VNumElements / ScalarTyNumElements;
  }
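  // Illustrative example (simplified): with REVEC, if ScalarTy is
  // <2 x float> and V has type <8 x float>, getVF(V) returns 8 / 2 == 4,
  // i.e. V packs four "scalars" that are themselves small vectors.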
  /// Checks if the mask is an identity mask.
  static bool isIdentityMask(ArrayRef<int> Mask, const FixedVectorType *VecTy,
                             bool IsStrict) {
    int Limit = Mask.size();
    int VF = VecTy->getNumElements();
    // ...
    // All VF-sized submasks are identity (e.g.
    // <poison,poison,0,poison,poison,1> for VF 2).
    if (Limit % VF == 0 &&
        all_of(seq<int>(0, Limit / VF), [=](int Idx) {
          ArrayRef<int> Slice = Mask.slice(Idx * VF, VF);
          return all_of(Slice, [](int I) { return I == PoisonMaskElem; }) ||
                 ShuffleVectorInst::isIdentityMask(Slice, VF);
        }))
      return true;
    return false;
  }
  /// Combines two shuffle masks: ExtMask selects elements of Mask, and the
  /// result is written back into Mask.
  static void combineMasks(unsigned LocalVF, SmallVectorImpl<int> &Mask,
                           ArrayRef<int> ExtMask) {
    unsigned VF = Mask.size();
    SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
    for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
      if (ExtMask[I] == PoisonMaskElem)
        continue;
      int MaskedIdx = Mask[ExtMask[I] % VF];
      NewMask[I] =
          MaskedIdx == PoisonMaskElem ? PoisonMaskElem : MaskedIdx % LocalVF;
    }
    Mask.swap(NewMask);
  }
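  // Illustrative example (simplified): with LocalVF == 4,
  // Mask == {0, 2, poison, 3} and ExtMask == {1, 0, 3, 2}, each ExtMask
  // element indexes into Mask, so the combined result is
  // {Mask[1], Mask[0], Mask[3], Mask[2]} == {2, 0, 3, poison};
  // poison in either input propagates to the output.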
  /// Looks through shuffles, trying to reduce the final number of shuffles in
  /// the code.
  static bool peekThroughShuffles(Value *&V, SmallVectorImpl<int> &Mask,
                                  bool SinglePermute) {
    Value *Op = V;
    ShuffleVectorInst *IdentityOp = nullptr;
    SmallVector<int> IdentityMask;
    while (auto *SV = dyn_cast<ShuffleVectorInst>(Op)) {
      auto *SVTy = dyn_cast<FixedVectorType>(SV->getType());
      if (!SVTy)
        break;
      // Remember the identity or broadcast mask, if it is not a resizing
      // shuffle; it is used as the fallback for the final shuffle.
      if (isIdentityMask(Mask, SVTy, /*IsStrict=*/false)) {
        if (!IdentityOp || !SinglePermute ||
            (isIdentityMask(Mask, SVTy, /*IsStrict=*/true) &&
             /* ... */ IdentityMask.size())) {
          IdentityOp = SV;
          IdentityMask.assign(Mask);
        }
      }
      if (SV->isZeroEltSplat()) {
        IdentityOp = SV;
        IdentityMask.assign(Mask);
      }
      int LocalVF = Mask.size();
      if (auto *SVOpTy =
              dyn_cast<FixedVectorType>(SV->getOperand(0)->getType()))
        LocalVF = SVOpTy->getNumElements();
      SmallVector<int> ExtMask(Mask.size(), PoisonMaskElem);
      for (auto [Idx, I] : enumerate(Mask)) {
        if (I == PoisonMaskElem ||
            static_cast<unsigned>(I) >= SV->getShuffleMask().size())
          continue;
        ExtMask[Idx] = SV->getMaskValue(I);
      }
      bool IsOp1Undef = /* ... */;
      bool IsOp2Undef = /* ... */;
      if (!IsOp1Undef && !IsOp2Undef) {
        // Both operands are used - mark the poisoned lanes and stop here.
        for (int &I : Mask) {
          if (I == PoisonMaskElem)
            continue;
          if (SV->getMaskValue(I % SV->getShuffleMask().size()) ==
              PoisonMaskElem)
            I = PoisonMaskElem;
        }
        break;
      }
      SmallVector<int> ShuffleMask(SV->getShuffleMask());
      combineMasks(LocalVF, ShuffleMask, Mask);
      Mask.swap(ShuffleMask);
      if (IsOp2Undef)
        Op = SV->getOperand(0);
      else
        Op = SV->getOperand(1);
    }
    auto *OpTy = dyn_cast<FixedVectorType>(Op->getType());
    if (!OpTy || !isIdentityMask(Mask, OpTy, SinglePermute) ||
        /* ... */) {
      if (IdentityOp) {
        V = IdentityOp;
        assert(Mask.size() == IdentityMask.size() &&
               "Expected masks of same sizes.");
        Mask.swap(IdentityMask);
        auto *Shuffle = dyn_cast<ShuffleVectorInst>(V);
        return SinglePermute &&
               (isIdentityMask(Mask, cast<FixedVectorType>(V->getType()),
                               /*IsStrict=*/false) ||
                (Shuffle &&
                 Mask.size() == Shuffle->getShuffleMask().size() &&
                 Shuffle->isZeroEltSplat() &&
                 all_of(enumerate(Mask), [&](const auto &P) {
                   return P.value() == PoisonMaskElem ||
                          Shuffle->getShuffleMask()[P.index()] == 0;
                 })));
      }
      V = Op;
      return false;
    }
    V = Op;
    return true;
  }
  /// Smart shuffle instruction emission: walks through shuffle trees and
  /// tries to find the best matching vector for the actual shuffle
  /// instruction.
  template <typename T, typename ShuffleBuilderTy>
  static T createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask,
                         ShuffleBuilderTy &Builder, Type *ScalarTy) {
    assert(V1 && "Expected at least one vector value.");
    SmallVector<int> NewMask(Mask);
    unsigned ScalarTyNumElements = getNumElements(ScalarTy);
    if (ScalarTyNumElements != 1) {
      // ... (REVEC: widen the mask to per-element granularity)
    }
    if (V2)
      Builder.resizeToMatch(V1, V2);
    int VF = Mask.size();
    if (auto *FTy = dyn_cast<FixedVectorType>(V1->getType()))
      VF = FTy->getNumElements();
    if (V2 && /* ... */) {
      // Peek through shuffles on both operands.
      Value *Op1 = V1;
      Value *Op2 = V2;
      SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
      SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
      for (int I = 0, E = Mask.size(); I < E; ++I) {
        if (Mask[I] < VF)
          CombinedMask1[I] = Mask[I];
        else
          CombinedMask2[I] = Mask[I] - VF;
      }
      Value *PrevOp1;
      Value *PrevOp2;
      do {
        PrevOp1 = Op1;
        PrevOp2 = Op2;
        (void)peekThroughShuffles(Op1, CombinedMask1, /*SinglePermute=*/false);
        (void)peekThroughShuffles(Op2, CombinedMask2, /*SinglePermute=*/false);
        // Check if we have 2 resizing shuffles - need to peek through the
        // operands again.
        if (auto *SV1 = dyn_cast<ShuffleVectorInst>(Op1))
          if (auto *SV2 = dyn_cast<ShuffleVectorInst>(Op2)) {
            SmallVector<int> ExtMask1(Mask.size(), PoisonMaskElem);
            for (auto [Idx, I] : enumerate(CombinedMask1)) {
              if (I == PoisonMaskElem)
                continue;
              ExtMask1[Idx] = SV1->getMaskValue(I);
            }
            SmallBitVector UseMask1 = buildUseMask(
                cast<FixedVectorType>(SV1->getOperand(1)->getType())
                    ->getNumElements(),
                ExtMask1, UseMask::SecondArg);
            SmallVector<int> ExtMask2(CombinedMask2.size(), PoisonMaskElem);
            for (auto [Idx, I] : enumerate(CombinedMask2)) {
              if (I == PoisonMaskElem)
                continue;
              ExtMask2[Idx] = SV2->getMaskValue(I);
            }
            SmallBitVector UseMask2 = buildUseMask(
                cast<FixedVectorType>(SV2->getOperand(1)->getType())
                    ->getNumElements(),
                ExtMask2, UseMask::SecondArg);
            if (SV1->getOperand(0)->getType() ==
                    SV2->getOperand(0)->getType() &&
                SV1->getOperand(0)->getType() != SV1->getType() &&
                isUndefVector(SV1->getOperand(1), UseMask1).all() &&
                isUndefVector(SV2->getOperand(1), UseMask2).all()) {
              Op1 = SV1->getOperand(0);
              Op2 = SV2->getOperand(0);
              SmallVector<int> ShuffleMask1(SV1->getShuffleMask());
              int LocalVF = ShuffleMask1.size();
              if (auto *FTy = dyn_cast<FixedVectorType>(Op1->getType()))
                LocalVF = FTy->getNumElements();
              combineMasks(LocalVF, ShuffleMask1, CombinedMask1);
              CombinedMask1.swap(ShuffleMask1);
              SmallVector<int> ShuffleMask2(SV2->getShuffleMask());
              LocalVF = ShuffleMask2.size();
              if (auto *FTy = dyn_cast<FixedVectorType>(Op2->getType()))
                LocalVF = FTy->getNumElements();
              combineMasks(LocalVF, ShuffleMask2, CombinedMask2);
              CombinedMask2.swap(ShuffleMask2);
            }
          }
      } while (PrevOp1 != Op1 || PrevOp2 != Op2);
      Builder.resizeToMatch(Op1, Op2);
      VF = std::max(cast<VectorType>(Op1->getType())
                        ->getElementCount()
                        .getKnownMinValue(),
                    cast<VectorType>(Op2->getType())
                        ->getElementCount()
                        .getKnownMinValue());
      for (int I = 0, E = Mask.size(); I < E; ++I) {
        if (CombinedMask2[I] != PoisonMaskElem) {
          assert(CombinedMask1[I] == PoisonMaskElem &&
                 "Expected undefined mask element");
          CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF);
        }
      }
      if (Op1 == Op2 && /* ... */)
        return Builder.createIdentity(Op1);
      return Builder.createShuffleVector(
          Op1, Op1 == Op2 ? PoisonValue::get(Op1->getType()) : Op2,
          CombinedMask1);
    }
    if (isa<PoisonValue>(V1))
      return Builder.createPoison(
          cast<VectorType>(V1->getType())->getElementType(), Mask.size());
    bool IsIdentity = peekThroughShuffles(V1, NewMask, /*SinglePermute=*/true);
    assert(V1 && "Expected non-null value after looking through shuffles.");
    if (!IsIdentity)
      return Builder.createShuffleVector(V1, NewMask);
    return Builder.createIdentity(V1);
  }
};

static /* ... */ (/* ... */, ArrayRef<int> Mask) {
  // ... (helper elided by extraction)
}
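// Illustrative note: createShuffle repeatedly calls peekThroughShuffles on
// both inputs until they stop changing, e.g.
//   shuffle(shuffle(x, poison, {1,0}), poison, {1,0})
// collapses to the identity of x, so the builder can emit x directly instead
// of a chain of permutes.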
static std::pair<InstructionCost, InstructionCost>
getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
            Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
            Type *ScalarTy, VectorType *VecTy) {
  InstructionCost ScalarCost = 0;
  InstructionCost VecCost = 0;
  if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
    // Ptrs are the pointer arguments of loads/stores vectorized as plain wide
    // loads/stores.
    ScalarCost = TTI.getPointersChainCost(
        Ptrs, BasePtr, TTI::PointersChainInfo::getUnitStride(), ScalarTy,
        CostKind);
    SmallVector<const Value *> PtrsRetainedInVecCode;
    for (Value *V : Ptrs) {
      if (V == BasePtr) {
        PtrsRetainedInVecCode.push_back(V);
        continue;
      }
      auto *Ptr = dyn_cast<GetElementPtrInst>(V);
      // For simplicity, assume a non-GEP pointer stays in the vectorized code;
      // its cost is considered free anyway.
      if (!Ptr || !Ptr->hasOneUse())
        PtrsRetainedInVecCode.push_back(V);
    }
    if (PtrsRetainedInVecCode.size() == Ptrs.size()) {
      // If all pointers stay in the vectorized code there are no savings.
      return std::make_pair(TTI::TCC_Free, TTI::TCC_Free);
    }
    VecCost = TTI.getPointersChainCost(PtrsRetainedInVecCode, BasePtr,
                                       TTI::PointersChainInfo::getKnownStride(),
                                       VecTy, CostKind);
  } else {
    // Ptrs form a regular vectorization tree node (pointer arguments of
    // scattered loads).
    TTI::PointersChainInfo PtrsInfo =
        all_of(Ptrs,
               [](const Value *V) {
                 auto *Ptr = dyn_cast<GetElementPtrInst>(V);
                 return Ptr && !Ptr->hasAllConstantIndices();
               })
            ? TTI::PointersChainInfo::getUnknownStride()
            : TTI::PointersChainInfo::getKnownStride();
    ScalarCost =
        TTI.getPointersChainCost(Ptrs, BasePtr, PtrsInfo, ScalarTy, CostKind);
    auto *BaseGEP = dyn_cast<GEPOperator>(BasePtr);
    if (!BaseGEP) {
      auto *It = find_if(Ptrs, IsaPred<GEPOperator>);
      if (It != Ptrs.end())
        BaseGEP = cast<GEPOperator>(*It);
    }
    if (BaseGEP) {
      SmallVector<const Value *> Indices(BaseGEP->indices());
      VecCost = TTI.getGEPCost(BaseGEP->getSourceElementType(),
                               BaseGEP->getPointerOperand(), Indices, VecTy,
                               CostKind);
    }
  }
  return std::make_pair(ScalarCost, VecCost);
}
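// Illustrative usage (simplified): callers compare the two halves of the
// returned pair, e.g.
//   auto [ScalarGEPCost, VectorGEPCost] =
//       getGEPCosts(TTI, PointerOps, PointerOps.front(), Instruction::Load,
//                   CostKind, ScalarTy, VecTy);
// and fold the difference into the scalar-vs-vector cost of the memory
// operation that consumes the pointers.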
void BoUpSLP::reorderGatherNode(TreeEntry &TE) {
  assert(TE.isGather() && TE.ReorderIndices.empty() &&
         "Expected gather node without reordering.");
  DenseMap<std::pair<size_t, Value *>, SmallVector<LoadInst *>> LoadsMap;
  SmallSet<size_t, 2> LoadKeyUsed;

  // Do not reorder nodes that are small (just 2 elements), all-constant or
  // already of a single opcode.
  if (TE.Scalars.size() == 2 || (TE.hasState() && !TE.isAltShuffle()) ||
      /* ... */
      any_of(seq<unsigned>(TE.Idx), [&](unsigned Idx) {
        return VectorizableTree[Idx]->isSame(TE.Scalars);
      }))
    return;

  auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
    Value *Ptr =
        getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth);
    if (LoadKeyUsed.contains(Key)) {
      auto LIt = LoadsMap.find(std::make_pair(Key, Ptr));
      if (LIt != LoadsMap.end()) {
        for (LoadInst *RLI : LIt->second) {
          if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
                              LI->getType(), LI->getPointerOperand(), *DL, *SE,
                              /*StrictCheck=*/true))
            return hash_value(RLI->getPointerOperand());
        }
        for (LoadInst *RLI : LIt->second) {
          if (arePointersCompatible(RLI->getPointerOperand(),
                                    LI->getPointerOperand(), *TLI))
            return hash_value(RLI->getPointerOperand());
        }
        if (LIt->second.size() > 2)
          return hash_value(LIt->second.back()->getPointerOperand());
      }
    }
    LoadKeyUsed.insert(Key);
    LoadsMap.try_emplace(std::make_pair(Key, Ptr)).first->second.push_back(LI);
    return hash_value(LI->getPointerOperand());
  };
  MapVector<size_t, MapVector<size_t, SmallVector<Value *>>> SortedValues;
  SmallDenseMap<Value *, SmallVector<unsigned>, 8> KeyToIndex;
  bool IsOrdered = true;
  unsigned NumInstructions = 0;
  // Try to "cluster" scalar instructions, to be able to build extra vectorized
  // nodes.
  for (auto [I, V] : enumerate(TE.Scalars)) {
    size_t Key = 1, Idx = 1;
    if (/* ... */) {
      // ... (compute (Key, Idx) for the instruction, counting instructions)
      ++NumInstructions;
    }
    auto &Container = SortedValues[Key];
    if (IsOrdered && !KeyToIndex.contains(V) &&
        /* ... */
        ((Container.contains(Idx) &&
          KeyToIndex.at(Container[Idx].back()).back() != I - 1) ||
         (!Container.empty() && !Container.contains(Idx) &&
          KeyToIndex.at(Container.back().second.back()).back() != I - 1)))
      IsOrdered = false;
    auto &KTI = KeyToIndex[V];
    if (KTI.empty())
      Container[Idx].push_back(V);
    KTI.push_back(I);
  }
  SmallVector<std::pair<unsigned, unsigned>> SubVectors;
  APInt DemandedElts = APInt::getAllOnes(TE.Scalars.size());
  if (!IsOrdered && NumInstructions > 1) {
    unsigned Cnt = 0;
    TE.ReorderIndices.resize(TE.Scalars.size(), TE.Scalars.size());
    for (const auto &D : SortedValues) {
      for (const auto &P : D.second) {
        unsigned Sz = 0;
        for (Value *V : P.second) {
          ArrayRef<unsigned> Indices = KeyToIndex.at(V);
          for (auto [K, Idx] : enumerate(Indices)) {
            TE.ReorderIndices[Cnt + K] = Idx;
            TE.Scalars[Cnt + K] = V;
          }
          Sz += Indices.size();
          Cnt += Indices.size();
        }
        if (Sz > 1 && isa<Instruction>(P.second.front())) {
          const unsigned SubVF = getFloorFullVectorNumberOfElements(
              *TTI, TE.Scalars.front()->getType(), Sz);
          SubVectors.emplace_back(Cnt - Sz, SubVF);
          for (unsigned I : seq<unsigned>(Cnt - Sz, Cnt - Sz + SubVF))
            DemandedElts.clearBit(I);
        } else if (!P.second.empty() && isConstant(P.second.front())) {
          for (unsigned I : seq<unsigned>(Cnt - Sz, Cnt))
            DemandedElts.clearBit(I);
        }
      }
    }
  }
  // Reuses always require shuffles, so consider them as profitable.
  if (!TE.ReuseShuffleIndices.empty() || TE.ReorderIndices.empty())
    return;
  // Do simple cost estimation.
  constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  InstructionCost Cost = 0;
  auto *ScalarTy = TE.Scalars.front()->getType();
  auto *VecTy = getWidenedType(ScalarTy, TE.Scalars.size());
  for (auto [Idx, Sz] : SubVectors) {
    Cost += ::getShuffleCost(*TTI, TTI::SK_InsertSubvector, VecTy, {},
                             CostKind, Idx, getWidenedType(ScalarTy, Sz));
  }
  int Sz = TE.Scalars.size();
  SmallVector<int> ReorderMask(TE.ReorderIndices.begin(),
                               TE.ReorderIndices.end());
  for (unsigned I : seq<unsigned>(Sz)) {
    Value *V = TE.getOrdered(I);
    if (isa<PoisonValue>(V))
      ReorderMask[I] = PoisonMaskElem;
    else if (isConstant(V) || DemandedElts[I])
      ReorderMask[I] = I + TE.ReorderIndices.size();
  }
  Cost += ::getShuffleCost(*TTI,
                           any_of(ReorderMask, [&](int I) { return I >= Sz; })
                               ? TTI::SK_PermuteTwoSrc
                               : TTI::SK_PermuteSingleSrc,
                           VecTy, ReorderMask);
  DemandedElts = APInt::getAllOnes(VecTy->getNumElements());
  std::fill(ReorderMask.begin(), ReorderMask.end(), PoisonMaskElem);
  for (unsigned I : seq<unsigned>(Sz)) {
    Value *V = TE.getOrdered(I);
    if (isConstant(V)) {
      DemandedElts.clearBit(I);
      if (!isa<PoisonValue>(V))
        ReorderMask[I] = I;
    } else {
      ReorderMask[I] = I + Sz;
    }
  }
  InstructionCost BVCost = TTI->getScalarizationOverhead(
      VecTy, DemandedElts, /*Insert=*/true, /*Extract=*/false, CostKind);
  if (!DemandedElts.isAllOnes())
    BVCost += ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy, ReorderMask);
  if (Cost >= BVCost) {
    SmallVector<int> Mask(TE.ReorderIndices.begin(), TE.ReorderIndices.end());
    reorderScalars(TE.Scalars, Mask);
    TE.ReorderIndices.clear();
  }
}
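// Illustrative note: the clustering above groups gathered scalars so that
// repeated values and related loads become contiguous, e.g. a gather of
// {a, b, a, b} can be reordered to {a, a, b, b}; the cost check then compares
// the reordered form (subvector inserts plus one permute) against a plain
// buildvector and keeps whichever is cheaper.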
static InstructionCost canConvertToFMA(ArrayRef<Value *> VL,
                                       const InstructionsState &S,
                                       DominatorTree &DT, const DataLayout &DL,
                                       TargetTransformInfo &TTI,
                                       const TargetLibraryInfo &TLI) {
  assert(all_of(VL,
                [](Value *V) {
                  return V->getType()->getScalarType()->isFloatingPointTy();
                }) &&
         "Can only convert to FMA for floating point types");
  assert(S.isAddSubLikeOp() && "Can only convert to FMA for add/sub");

  auto CheckForContractable = [&](ArrayRef<Value *> VL) {
    FastMathFlags FMF;
    FMF.set();
    for (Value *V : VL) {
      auto *I = dyn_cast<Instruction>(V);
      if (!I)
        continue;
      if (S.isCopyableElement(I))
        continue;
      Instruction *MatchingI = S.getMatchingMainOpOrAltOp(I);
      if (S.getMainOp() != MatchingI && S.getAltOp() != MatchingI)
        continue;
      if (auto *FPCI = dyn_cast<FPMathOperator>(I))
        FMF &= FPCI->getFastMathFlags();
    }
    return FMF.allowContract();
  };
  if (!CheckForContractable(VL))
    return InstructionCost::getInvalid();
  // The fmul also should be contractable.
  InstructionsCompatibilityAnalysis Analysis(DT, DL, TTI, TLI);
  SmallVector<BoUpSLP::ValueList> Operands = Analysis.buildOperands(S, VL);

  InstructionsState OpS = getSameOpcode(Operands.front(), TLI);
  if (!OpS || OpS.isAltShuffle() || OpS.getOpcode() != Instruction::FMul)
    return InstructionCost::getInvalid();
  if (!CheckForContractable(Operands.front()))
    return InstructionCost::getInvalid();
  // Compare the costs.
  InstructionCost FMulPlusFAddCost = 0;
  InstructionCost FMACost = 0;
  constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  FastMathFlags FMF;
  FMF.set();
  for (Value *V : VL) {
    auto *I = dyn_cast<Instruction>(V);
    if (!I)
      continue;
    if (!S.isCopyableElement(I))
      if (auto *FPCI = dyn_cast<FPMathOperator>(I))
        FMF &= FPCI->getFastMathFlags();
    FMulPlusFAddCost += TTI.getInstructionCost(I, CostKind);
  }
  for (auto [V, Op] : zip(VL, Operands.front())) {
    if (S.isCopyableElement(V))
      continue;
    auto *I = dyn_cast<Instruction>(Op);
    if (!I || !I->hasOneUse() || OpS.isCopyableElement(I)) {
      if (auto *OpI = dyn_cast<Instruction>(Op))
        FMACost += TTI.getInstructionCost(OpI, CostKind);
      continue;
    }
    if (auto *FPCI = dyn_cast<FPMathOperator>(I))
      FMF &= FPCI->getFastMathFlags();
    FMulPlusFAddCost += TTI.getInstructionCost(I, CostKind);
  }
  // ...
}
void BoUpSLP::transformNodes() {
  constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  BaseGraphSize = VectorizableTree.size();
  // Turn graph transforming mode on and off, when done.
  class GraphTransformModeRAAI {
    bool &SavedIsGraphTransformMode;

  public:
    GraphTransformModeRAAI(bool &IsGraphTransformMode)
        : SavedIsGraphTransformMode(IsGraphTransformMode) {
      IsGraphTransformMode = true;
    }
    ~GraphTransformModeRAAI() { SavedIsGraphTransformMode = false; }
  } TransformContext(IsGraphTransformMode);

  // Operands are profitable if they are: at least one constant, splats, or
  // result in a good vectorization opportunity.
  auto CheckOperandsProfitability = [this](Instruction *I1, Instruction *I2,
                                           const InstructionsState &S) {
    SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
    for (unsigned Op : seq<unsigned>(S.getMainOp()->getNumOperands()))
      Candidates.emplace_back().emplace_back(I1->getOperand(Op),
                                             I2->getOperand(Op));
    return all_of(
        Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
          return all_of(Cand,
                        [](const std::pair<Value *, Value *> &P) {
                          return isa<Constant>(P.first) ||
                                 isa<Constant>(P.second) ||
                                 P.first == P.second;
                        }) ||
                 /* ... */;
        });
  };

  // Try to reorder gather nodes for better vectorization opportunities.
  for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
    TreeEntry &E = *VectorizableTree[Idx];
    if (E.isGather())
      reorderGatherNode(E);
  }

  // If there are many small gathered load nodes, avoid combining them with
  // buildvectors.
  constexpr unsigned VFLimit = 16;
  bool ForceLoadGather =
      count_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
        return TE->isGather() && TE->hasState() &&
               TE->getOpcode() == Instruction::Load &&
               TE->getVectorFactor() < VFLimit;
      }) > 2;

  // Checks if the scalars are used in another node.
  auto AreReusedScalars = [&](const TreeEntry *TE, ArrayRef<Value *> VL,
                              function_ref<bool(Value *)> CheckContainer) {
    return TE->isSame(VL) || all_of(VL, [&](Value *V) {
             // ...
             return CheckContainer(V);
           });
  };
  auto CheckForSameVectorNodes = [&](const TreeEntry &E) {
    if (E.hasState()) {
      if (ArrayRef<TreeEntry *> TEs = getTreeEntries(E.getMainOp());
          !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
            return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
              ArrayRef<TreeEntry *> VTEs = getTreeEntries(V);
              return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
                return is_contained(TEs, TE);
              });
            });
          }))
        return true;
      if (ArrayRef<TreeEntry *> TEs = getSplitTreeEntries(E.getMainOp());
          !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
            return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
              ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V);
              return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
                return is_contained(TEs, TE);
              });
            });
          }))
        return true;
    } else {
      // Check if the gather node is a full copy of a split node.
      auto *It = find_if(E.Scalars, IsaPred<Instruction>);
      if (It != E.Scalars.end()) {
        if (ArrayRef<TreeEntry *> TEs = getSplitTreeEntries(*It);
            !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
              return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
                ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V);
                return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
                  return is_contained(TEs, TE);
                });
              });
            }))
          return true;
      }
    }
    return false;
  };
  // The tree may grow here, so iterate only up to the initial size.
  for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
    TreeEntry &E = *VectorizableTree[Idx];
    if (E.isGather()) {
      ArrayRef<Value *> VL = E.Scalars;
      const unsigned Sz = getVectorElementSize(VL.front());
      unsigned MinVF = getMinVF(2 * Sz);
      // Do not try partial vectorization for small nodes (<= 2), nodes with
      // the same opcode and same parent block, or all constants.
      if (VL.size() <= 2 || LoadEntriesToVectorize.contains(Idx) ||
          !(!E.hasState() || E.getOpcode() == Instruction::Load ||
            /* ... */) ||
          allConstant(VL) || isSplat(VL))
        continue;
      if (ForceLoadGather && E.hasState() &&
          E.getOpcode() == Instruction::Load)
        continue;
      // Check if the node is a copy of other vector nodes.
      if (CheckForSameVectorNodes(E))
        continue;
      // Try to find vectorizable sequences and transform them into a series
      // of insertvector instructions.
      unsigned StartIdx = 0;
      unsigned End = VL.size();
      for (unsigned VF = getFloorFullVectorNumberOfElements(
               *TTI, VL.front()->getType(), VL.size() - 1);
           VF >= MinVF; VF = getFloorFullVectorNumberOfElements(
                            *TTI, VL.front()->getType(), VF - 1)) {
        if (StartIdx + VF > End)
          continue;
        SmallVector<std::pair<unsigned, unsigned>> Slices;
        bool AllStrided = true;
        for (unsigned Cnt = StartIdx; Cnt + VF <= End; Cnt += VF) {
          ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
          // If any instruction is vectorized already - do not try again,
          // unless an existing node fully matches the slice.
          if (isVectorized(Slice.front()) &&
              !getSameValuesTreeEntry(Slice.front(), Slice, /*SameVF=*/true))
            continue;
          // Do not try to vectorize small splats (less than a vector register
          // and with only a single non-undef element).
          bool IsSplat = isSplat(Slice);
          bool IsTwoRegisterSplat = true;
          if (IsSplat && VF == 2) {
            unsigned NumRegs2VF = /* ... */;
            IsTwoRegisterSplat = NumRegs2VF == 2;
          }
          if (Slices.empty() || !IsSplat || !IsTwoRegisterSplat ||
              /* ... */) {
            InstructionsState S = getSameOpcode(Slice, *TLI);
            if (!S || S.isAltShuffle() ||
                (S.getOpcode() == Instruction::Load &&
                 /* ... */) ||
                (S.getOpcode() != Instruction::Load &&
                 /* ... */))
              continue;
            if (VF == 2) {
              // Try to vectorize reduced values or if all users are
              // vectorized.
              if ((!UserIgnoreList || E.Idx != 0) &&
                  TTI->getInstructionCost(S.getMainOp(), CostKind) <
                      TTI::TCC_Expensive &&
                  /* ... */)
                continue;
              if (S.getOpcode() == Instruction::Load) {
                OrdersType Order;
                SmallVector<Value *> PointerOps;
                StridedPtrInfo SPtrInfo;
                LoadsState Res = canVectorizeLoads(Slice, Slice.front(), Order,
                                                   PointerOps, SPtrInfo);
                AllStrided &= /* ... */;
                // Do not vectorize gathers; mark non-vectorizable reductions
                // if the root node scalars are being analyzed.
                if (/* ... */) {
                  if (UserIgnoreList && E.Idx == 0)
                    analyzedReductionVals(Slice);
                  continue;
                }
              } else if (S.getOpcode() == Instruction::ExtractElement ||
                         (TTI->getInstructionCost(S.getMainOp(), CostKind) <
                              TTI::TCC_Expensive &&
                          !CheckOperandsProfitability(/* ... */))) {
                // Do not vectorize extractelements (handled effectively
                // already) or non-profitable instructions.
                continue;
              }
            }
          }
          Slices.emplace_back(Cnt, Slice.size());
        }
        if (VF == 2 && AllStrided && Slices.size() > 2)
          continue;
        auto AddCombinedNode = [&](unsigned Idx, unsigned Cnt, unsigned Sz) {
          E.CombinedEntriesWithIndices.emplace_back(Idx, Cnt);
          if (StartIdx == Cnt)
            StartIdx = Cnt + Sz;
          if (End == Cnt + Sz)
            End = Cnt;
        };
        for (auto [Cnt, Sz] : Slices) {
          ArrayRef<Value *> Slice = VL.slice(Cnt, Sz);
          const TreeEntry *SameTE = nullptr;
          if (const auto *It = find_if(Slice, IsaPred<Instruction>);
              It != Slice.end())
            SameTE = getSameValuesTreeEntry(*It, Slice);
          unsigned PrevSize = VectorizableTree.size();
          [[maybe_unused]] unsigned PrevEntriesSize =
              LoadEntriesToVectorize.size();
          buildTreeRec(Slice, 0, EdgeInfo(&E, UINT_MAX));
          if (PrevSize + 1 == VectorizableTree.size() && !SameTE &&
              VectorizableTree[PrevSize]->isGather() &&
              VectorizableTree[PrevSize]->hasState() &&
              VectorizableTree[PrevSize]->getOpcode() !=
                  Instruction::ExtractElement &&
              !isSplat(Slice)) {
            if (UserIgnoreList && E.Idx == 0 && VF == 2)
              analyzedReductionVals(Slice);
            VectorizableTree.pop_back();
            assert(PrevEntriesSize == LoadEntriesToVectorize.size() &&
                   "LoadEntriesToVectorize expected to remain the same");
            continue;
          }
          AddCombinedNode(PrevSize, Cnt, Sz);
        }
      }
    }
    // If reordering is still required and no combined entries were created,
    // reorder the scalars directly.
    if (E.CombinedEntriesWithIndices.empty() && !E.ReorderIndices.empty()) {
      SmallVector<int> Mask(E.ReorderIndices.begin(), E.ReorderIndices.end());
      reorderScalars(E.Scalars, Mask);
      E.ReorderIndices.clear();
    }
  }
    switch (E.getOpcode()) {
    case Instruction::Load: {
      // No need to handle masked gather loads, only strided candidates.
      if (E.State != TreeEntry::Vectorize)
        break;
      Type *ScalarTy = E.getMainOp()->getType();
      auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
      Align CommonAlignment = computeCommonAlignment<LoadInst>(E.Scalars);
      // Check if profitable to represent consecutive load + reverse as
      // strided load with stride -1.
      if (!E.ReorderIndices.empty() && isReverseOrder(E.ReorderIndices) &&
          TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
        SmallVector<int> Mask;
        inversePermutation(E.ReorderIndices, Mask);
        auto *BaseLI = cast<LoadInst>(E.Scalars.back());
        InstructionCost OriginalVecCost =
            TTI->getMemoryOpCost(Instruction::Load, VecTy, BaseLI->getAlign(),
                                 BaseLI->getPointerAddressSpace(), CostKind,
                                 TTI::OperandValueInfo()) +
            ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
        InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
            Instruction::Load, VecTy, BaseLI->getPointerOperand(),
            /*VariableMask=*/false, CommonAlignment, CostKind, BaseLI);
        if (StridedCost < OriginalVecCost) {
          // Strided load beats consecutive load + reverse - transform the
          // node to a strided load.
          Type *StrideTy = DL->getIndexType(cast<LoadInst>(E.Scalars.front())
                                                ->getPointerOperand()
                                                ->getType());
          StridedPtrInfo SPtrInfo;
          SPtrInfo.StrideVal = ConstantInt::get(StrideTy, 1);
          SPtrInfo.Ty = VecTy;
          TreeEntryToStridedPtrInfoMap[&E] = SPtrInfo;
          E.State = TreeEntry::StridedVectorize;
        }
      }
      break;
    }
    case Instruction::Store: {
      Type *ScalarTy =
          cast<StoreInst>(E.getMainOp())->getValueOperand()->getType();
      auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
      Align CommonAlignment = computeCommonAlignment<StoreInst>(E.Scalars);
      // Check if profitable to represent consecutive store + reverse as
      // strided store with stride -1.
      if (!E.ReorderIndices.empty() && isReverseOrder(E.ReorderIndices) &&
          TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
        SmallVector<int> Mask;
        inversePermutation(E.ReorderIndices, Mask);
        auto *BaseSI = cast<StoreInst>(E.Scalars.back());
        InstructionCost OriginalVecCost =
            TTI->getMemoryOpCost(Instruction::Store, VecTy, BaseSI->getAlign(),
                                 BaseSI->getPointerAddressSpace(), CostKind,
                                 TTI::OperandValueInfo()) +
            ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
        InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
            Instruction::Store, VecTy, BaseSI->getPointerOperand(),
            /*VariableMask=*/false, CommonAlignment, CostKind, BaseSI);
        if (StridedCost < OriginalVecCost)
          // Strided store beats reverse + consecutive store - transform the
          // node to a strided store.
          E.State = TreeEntry::StridedVectorize;
      } else if (!E.ReorderIndices.empty()) {
        // Check for interleaved stores.
        auto IsInterleaveMask = [&, &TTI = *TTI](ArrayRef<int> Mask) {
          auto *BaseSI = cast<StoreInst>(E.Scalars.front());
          assert(Mask.size() > 1 && "Expected mask greater than 1 element.");
          if (Mask.size() < 4)
            return 0u;
          for (unsigned Factor : seq<unsigned>(2, Mask.size() / 2 + 1)) {
            if (ShuffleVectorInst::isInterleaveMask(
                    Mask, Factor, VecTy->getElementCount().getFixedValue()) &&
                TTI.isLegalInterleavedAccessType(
                    VecTy, Factor, BaseSI->getAlign(),
                    BaseSI->getPointerAddressSpace()))
              return Factor;
          }
          return 0u;
        };
        SmallVector<int> Mask(E.ReorderIndices.begin(),
                              E.ReorderIndices.end());
        unsigned InterleaveFactor = IsInterleaveMask(Mask);
        if (InterleaveFactor != 0)
          E.setInterleave(InterleaveFactor);
      }
      break;
    }
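      // Illustrative example (simplified): for 8 stores whose ReorderIndices
      // produce Mask == {0, 4, 1, 5, 2, 6, 3, 7},
      // isInterleaveMask(Mask, /*Factor=*/2, 8) holds, so the node is marked
      // interleaved with factor 2 and may be emitted as an interleaved store
      // group when the target reports it as legal.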
    case Instruction::Select: {
      if (E.State != TreeEntry::Vectorize)
        break;
      auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(E.Scalars);
      if (MinMaxID == Intrinsic::not_intrinsic)
        break;
      // This node is a minmax node.
      E.CombinedOp = TreeEntry::MinMax;
      TreeEntry *CondEntry = getOperandEntry(&E, 0);
      if (SelectOnly && CondEntry->UserTreeIndex &&
          CondEntry->State == TreeEntry::Vectorize) {
        // The condition node is part of the combined minmax node.
        CondEntry->State = TreeEntry::CombinedVectorize;
      }
      break;
    }
    case Instruction::FSub:
    case Instruction::FAdd: {
      // Check if possible to convert (a * b) + c to fma.
      if (E.State != TreeEntry::Vectorize ||
          !E.getOperations().isAddSubLikeOp())
        break;
      // ...
      // This node is a fmuladd node.
      E.CombinedOp = TreeEntry::FMulAdd;
      TreeEntry *FMulEntry = getOperandEntry(&E, 0);
      if (FMulEntry->UserTreeIndex &&
          FMulEntry->State == TreeEntry::Vectorize) {
        // The FMul node is part of the combined fmuladd node.
        FMulEntry->State = TreeEntry::CombinedVectorize;
      }
      break;
    }
    default:
      break;
    }
  }
  if (LoadEntriesToVectorize.empty()) {
    // Single load node - exit.
    if (VectorizableTree.size() <= 1 && VectorizableTree.front()->hasState() &&
        VectorizableTree.front()->getOpcode() == Instruction::Load)
      return;
    // Small graph with small VF - exit.
    constexpr unsigned SmallTree = 3;
    constexpr unsigned SmallVF = 2;
    if ((VectorizableTree.size() <= SmallTree &&
         VectorizableTree.front()->Scalars.size() == SmallVF) ||
        (VectorizableTree.size() <= 2 && UserIgnoreList))
      return;

    if (VectorizableTree.front()->isNonPowOf2Vec() &&
        /* ... */
        count_if(/* ... */,
                 [](const std::unique_ptr<TreeEntry> &TE) {
                   return TE->isGather() && TE->hasState() &&
                          TE->getOpcode() == Instruction::Load &&
                          /* ... */;
                 }) == 1)
      return;
  }

  // A list of loads to be gathered during the vectorization process. We can
  // try to vectorize them at the end, if profitable.
  SmallMapVector<std::tuple<BasicBlock *, Value *, Type *>,
                 SmallVector<SmallVector<std::pair<LoadInst *, int>>>, 8>
      GatheredLoads;

  for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    TreeEntry &E = *TE;
    if (E.isGather() &&
        ((E.hasState() && E.getOpcode() == Instruction::Load) ||
         (!E.hasState() && any_of(E.Scalars,
                                  [&](Value *V) {
                                    return isa<LoadInst>(V) &&
                                           !isVectorized(V) &&
                                           !isDeleted(cast<Instruction>(V));
                                  }))) &&
        !isSplat(E.Scalars)) {
      for (Value *V : E.Scalars) {
        auto *LI = dyn_cast<LoadInst>(V);
        if (!LI)
          continue;
        // ...
        gatherPossiblyVectorizableLoads(
            *this, V, *DL, *SE, *TTI,
            GatheredLoads[std::make_tuple(
                LI->getParent(),
                getUnderlyingObject(LI->getPointerOperand(),
                                    RecursionMaxDepth),
                LI->getType())]);
      }
    }
  }
  // Try to vectorize gathered loads if this is not just a gather of loads.
  if (!GatheredLoads.empty())
    tryToVectorizeGatheredLoads(GatheredLoads);
}
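// Illustrative note: remaining gathered loads collected above are keyed by
// (parent block, underlying base pointer, load type) so that loads scattered
// across different gather nodes can be re-clustered and retried as real
// vector loads by tryToVectorizeGatheredLoads() instead of staying as
// per-element gathers.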
/// Merges shuffle masks and computes the cost of the final shuffle
/// instruction(s), if required. Supports shuffling of 2 input vectors.
class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
  bool IsFinalized = false;
  SmallVector<int> CommonMask;
  SmallVector<PointerUnion<Value *, const TreeEntry *>, 2> InVectors;
  const TargetTransformInfo &TTI;
  InstructionCost Cost = 0;
  SmallDenseSet<Value *> VectorizedVals;
  BoUpSLP &R;
  SmallPtrSetImpl<Value *> &CheckedExtracts;
  constexpr static TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  /// While set, still trying to estimate the cost for the same nodes, so the
  /// actual cost estimation (virtual shuffle emission) can be delayed.
  bool SameNodesEstimated = true;

  static Constant *getAllOnesValue(const DataLayout &DL, Type *Ty) {
    if (Ty->getScalarType()->isPointerTy()) {
      Constant *Res = ConstantExpr::getIntToPtr(
          ConstantInt::getAllOnesValue(IntegerType::get(
              Ty->getContext(),
              DL.getTypeStoreSizeInBits(Ty->getScalarType()))),
          Ty->getScalarType());
      // ...
      return Res;
    }
    return Constant::getAllOnesValue(Ty);
  }

  InstructionCost getBuildVectorCost(ArrayRef<Value *> VL, Value *Root) {
    // ...
    const auto *It = find_if_not(VL, IsaPred<UndefValue>);
    assert(It != VL.end() && "Expected at least one non-undef value.");
    bool NeedShuffle = count(VL, *It) > 1 && /* ... */;
    if (!NeedShuffle) {
      // ...
      return TTI.getShuffleCost(/* ... */);
      // or, for a single insert:
      return TTI.getVectorInstrCost(Instruction::InsertElement, VecTy,
                                    CostKind, std::distance(VL.begin(), It),
                                    PoisonValue::get(VecTy), *It);
    }
    SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
    transform(VL, ShuffleMask.begin(), [](Value *V) {
      return isa<PoisonValue>(V) ? PoisonMaskElem : 0;
    });
    InstructionCost InsertCost =
        TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, 0,
                               PoisonValue::get(VecTy), *It);
    GatherCost = InsertCost + ::getShuffleCost(TTI, TTI::SK_Broadcast,
                                               VecTy, ShuffleMask, CostKind,
                                               /* ... */);
    // ...
    return GatherCost +
           (all_of(Gathers, IsaPred<UndefValue>)
                ? TTI::TCC_Free
                : R.getGatherCost(Gathers, !Root && VL.equals(Gathers),
                                  ScalarTy));
  }

  /// Compute the cost of creating a vector containing the extracted values
  /// from \p VL.
  InstructionCost
  computeExtractCost(ArrayRef<Value *> VL, ArrayRef<int> Mask,
                     ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
                     unsigned NumParts) {
    assert(VL.size() > NumParts && "Unexpected scalarized shuffle.");
    unsigned NumElts =
        std::accumulate(VL.begin(), VL.end(), 0, [](unsigned Sz, Value *V) {
          auto *EE = dyn_cast<ExtractElementInst>(V);
          if (!EE)
            return Sz;
          auto *VecTy = dyn_cast<FixedVectorType>(EE->getVectorOperandType());
          if (!VecTy)
            return Sz;
          return std::max(Sz, VecTy->getNumElements());
        });
    // Checks whether a single-register sub-shuffle suffices for this part.
    auto CheckPerRegistersShuffle = [&](MutableArrayRef<int> Mask,
                                        SmallVectorImpl<unsigned> &Indices,
                                        SmallVectorImpl<unsigned> &SubVecSizes)
        -> std::optional<TTI::ShuffleKind> {
      if (NumElts <= EltsPerVector)
        return std::nullopt;
      int OffsetReg0 =
          alignDown(std::accumulate(Mask.begin(), Mask.end(), INT_MAX,
                                    [](int S, int I) {
                                      if (I == PoisonMaskElem)
                                        return S;
                                      return std::min(S, I);
                                    }),
                    EltsPerVector);
      int OffsetReg1 = OffsetReg0;
      TTI::ShuffleKind ShuffleKind = TTI::SK_PermuteSingleSrc;
      int FirstRegId = -1;
      Indices.assign(1, OffsetReg0);
      SmallSet<int, 4> RegIndices;
      for (int &I : Mask) {
        if (I == PoisonMaskElem)
          continue;
        int Idx = I - OffsetReg0;
        int RegId =
            (Idx / NumElts) * NumParts + (Idx % NumElts) / EltsPerVector;
        if (FirstRegId < 0)
          FirstRegId = RegId;
        RegIndices.insert(RegId);
        if (RegIndices.size() > 2)
          return std::nullopt;
        if (RegIndices.size() == 2) {
          ShuffleKind = TTI::SK_PermuteTwoSrc;
          if (Indices.size() == 1) {
            OffsetReg1 = alignDown(
                std::accumulate(
                    std::next(Mask.begin(), /* ... */), Mask.end(), INT_MAX,
                    [&](int S, int I) {
                      if (I == PoisonMaskElem)
                        return S;
                      int RegId = ((I - OffsetReg0) / NumElts) * NumParts +
                                  ((I - OffsetReg0) % NumElts) / EltsPerVector;
                      if (RegId == FirstRegId)
                        return S;
                      return std::min(S, I);
                    }),
                EltsPerVector);
            unsigned Index = OffsetReg1 % NumElts;
            Indices.push_back(Index);
            SubVecSizes.push_back(std::min(NumElts - Index, EltsPerVector));
          }
          Idx = I - OffsetReg1;
        }
        I = (Idx % NumElts) % EltsPerVector +
            (RegId == FirstRegId ? 0 : EltsPerVector);
      }
      return ShuffleKind;
    };
    InstructionCost Cost = 0;
    // Process extracts in blocks of EltsPerVector to check if the source
    // vector operand can be reused directly.
    for (unsigned Part : seq<unsigned>(NumParts)) {
      if (!ShuffleKinds[Part])
        continue;
      ArrayRef<int> MaskSlice = Mask.slice(
          Part * EltsPerVector, getNumElems(Mask.size(), EltsPerVector, Part));
      SmallVector<int> SubMask(EltsPerVector, PoisonMaskElem);
      copy(MaskSlice, SubMask.begin());
      SmallVector<unsigned, 2> Indices;
      SmallVector<unsigned, 2> SubVecSizes;
      std::optional<TTI::ShuffleKind> RegShuffleKind =
          CheckPerRegistersShuffle(SubMask, Indices, SubVecSizes);
      if (!RegShuffleKind) {
        if (/* ... */ !ShuffleVectorInst::isIdentityMask(
                MaskSlice, std::max<unsigned>(NumElts, MaskSlice.size())))
          Cost += ::getShuffleCost(TTI, *ShuffleKinds[Part],
                                   getWidenedType(ScalarTy, NumElts),
                                   MaskSlice);
        continue;
      }
      // ...
      const unsigned BaseVF = getFullVectorNumberOfElements(
          *R.TTI, VL.front()->getType(), alignTo(NumElts, EltsPerVector));
      for (const auto [Idx, SubVecSize] : zip(Indices, SubVecSizes)) {
        assert((Idx + SubVecSize) <= BaseVF &&
               "SK_ExtractSubvector index out of range");
        // ...
      }
      // Second attempt: check if a plain permute is estimated cheaper than
      // the subvector extract.
      InstructionCost OriginalCost = ::getShuffleCost(
          TTI, *ShuffleKinds[Part], getWidenedType(ScalarTy, NumElts),
          MaskSlice);
      if (OriginalCost < Cost)
        Cost = OriginalCost;
    }
    return Cost;
  }
  /// Estimates the permutation cost between two (possibly identical) nodes,
  /// delaying emission while the same nodes keep being reshuffled.
  void estimateNodesPermuteCost(const TreeEntry &E1, const TreeEntry *E2,
                                ArrayRef<int> Mask, unsigned Part,
                                unsigned SliceSize) {
    if (SameNodesEstimated) {
      // If the cost of reshuffling E1 and E2 was already requested, merge
      // this sub-mask into CommonMask instead of estimating it twice.
      if ((InVectors.size() == 2 &&
           /* ... */) ||
          (!E2 && /* ... */)) {
        unsigned Limit = getNumElems(Mask.size(), SliceSize, Part);
        assert(all_of(ArrayRef(CommonMask).slice(Part * SliceSize, Limit),
                      [](int Idx) { return Idx == PoisonMaskElem; }) &&
               "Expected all poisoned elements.");
        ArrayRef<int> SubMask = ArrayRef(Mask).slice(Part * SliceSize, Limit);
        copy(SubMask, std::next(CommonMask.begin(), SliceSize * Part));
        return;
      }
      // Found non-matching nodes - estimate the cost for the matched ones and
      // transform the mask.
      Cost += createShuffle(InVectors.front(),
                            InVectors.size() == 1 ? nullptr : InVectors.back(),
                            CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    } else if (InVectors.size() == 2) {
      Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    }
    SameNodesEstimated = false;
    if (!E2 && InVectors.size() == 1) {
      unsigned VF = E1.getVectorFactor();
      if (Value *V1 = InVectors.front().dyn_cast<Value *>()) {
        VF = std::max(VF, getVF(V1));
      } else {
        const auto *E = cast<const TreeEntry *>(InVectors.front());
        VF = std::max(VF, E->getVectorFactor());
      }
      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
        if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
          CommonMask[Idx] = Mask[Idx] + VF;
      Cost += createShuffle(InVectors.front(), &E1, CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    } else {
      auto P = InVectors.front();
      Cost += createShuffle(&E1, E2, Mask);
      unsigned VF = Mask.size();
      if (Value *V1 = P.dyn_cast<Value *>()) {
        VF = std::max(VF, getNumElements(V1->getType()));
      } else {
        const auto *E = cast<const TreeEntry *>(P);
        VF = std::max(VF, E->getVectorFactor());
      }
      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
        if (Mask[Idx] != PoisonMaskElem)
          CommonMask[Idx] = Idx + (InVectors.empty() ? 0 : VF);
      Cost += createShuffle(P, InVectors.front(), CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    }
  }
  class ShuffleCostBuilder {
    const TargetTransformInfo &TTI;

    static bool isEmptyOrIdentity(ArrayRef<int> Mask, unsigned VF) {
      int Index = -1;
      return Mask.empty() ||
             (VF == Mask.size() &&
              ShuffleVectorInst::isIdentityMask(Mask, VF)) ||
             (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) &&
              Index == 0);
    }

  public:
    ShuffleCostBuilder(const TargetTransformInfo &TTI) : TTI(TTI) {}
    ~ShuffleCostBuilder() = default;
    InstructionCost createShuffleVector(Value *V1, Value *,
                                        ArrayRef<int> Mask) const {
      // Empty mask or identity mask are free.
      unsigned VF = /* ... */;
      if (isEmptyOrIdentity(Mask, VF))
        return TTI::TCC_Free;
      return ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc,
                              cast<VectorType>(V1->getType()), Mask);
    }
    InstructionCost createShuffleVector(Value *V1, ArrayRef<int> Mask) const {
      // Empty mask or identity mask are free.
      unsigned VF = /* ... */;
      if (isEmptyOrIdentity(Mask, VF))
        return TTI::TCC_Free;
      return ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc,
                              cast<VectorType>(V1->getType()), Mask);
    }
    InstructionCost createIdentity(Value *) const { return TTI::TCC_Free; }
    InstructionCost createPoison(Type *, unsigned) const {
      return TTI::TCC_Free;
    }
    void resizeToMatch(Value *&, Value *&) const {}
  };
  InstructionCost
  createShuffle(const PointerUnion<Value *, const TreeEntry *> &P1,
                const PointerUnion<Value *, const TreeEntry *> &P2,
                ArrayRef<int> Mask) {
    ShuffleCostBuilder Builder(TTI);
    SmallVector<int> CommonMask(Mask);
    Value *V1 = P1.dyn_cast<Value *>(), *V2 = P2.dyn_cast<Value *>();
    unsigned CommonVF = Mask.size();
    InstructionCost ExtraCost = 0;
    auto GetNodeMinBWAffectedCost = [&](const TreeEntry &E,
                                        unsigned VF) -> InstructionCost {
      // ...
      Type *EScalarTy = E.Scalars.front()->getType();
      bool IsSigned = true;
      if (auto It = R.MinBWs.find(&E); It != R.MinBWs.end()) {
        EScalarTy = IntegerType::get(EScalarTy->getContext(), It->second.first);
        IsSigned = It->second.second;
      }
      if (EScalarTy != ScalarTy) {
        unsigned CastOpcode = Instruction::Trunc;
        unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
        unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
        if (DstSz > SrcSz)
          CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
        return TTI.getCastInstrCost(CastOpcode, getWidenedType(ScalarTy, VF),
                                    getWidenedType(EScalarTy, VF),
                                    TTI::CastContextHint::None, CostKind);
      }
      return TTI::TCC_Free;
    };
    auto GetValueMinBWAffectedCost = [&](const Value *V) -> InstructionCost {
      // ...
      auto *VecTy = cast<VectorType>(V->getType());
      Type *EScalarTy = VecTy->getElementType();
      if (EScalarTy != ScalarTy) {
        bool IsSigned = /* ... */;
        unsigned CastOpcode = Instruction::Trunc;
        unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
        unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
        if (DstSz > SrcSz)
          CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
        return TTI.getCastInstrCost(
            CastOpcode, VectorType::get(ScalarTy, VecTy->getElementCount()),
            VecTy, TTI::CastContextHint::None, CostKind);
      }
      return TTI::TCC_Free;
    };
    if (!V1 && !V2 && !P2.isNull()) {
      // Shuffle 2 entry nodes.
      const TreeEntry *E = cast<const TreeEntry *>(P1);
      unsigned VF = E->getVectorFactor();
      const TreeEntry *E2 = cast<const TreeEntry *>(P2);
      CommonVF = std::max(VF, E2->getVectorFactor());
      assert(all_of(Mask,
                    [=](int Idx) {
                      return Idx < 2 * static_cast<int>(CommonVF);
                    }) &&
             "All elements in mask must be less than 2 * CommonVF.");
      if (E->Scalars.size() == E2->Scalars.size()) {
        SmallVector<int> EMask = E->getCommonMask();
        SmallVector<int> E2Mask = E2->getCommonMask();
        if (!EMask.empty() || !E2Mask.empty()) {
          for (int &Idx : CommonMask) {
            if (Idx == PoisonMaskElem)
              continue;
            if (Idx < static_cast<int>(CommonVF) && !EMask.empty())
              Idx = EMask[Idx];
            else if (Idx >= static_cast<int>(CommonVF))
              Idx = (E2Mask.empty() ? Idx - CommonVF
                                    : E2Mask[Idx - CommonVF]) +
                    E->Scalars.size();
          }
        }
        CommonVF = E->Scalars.size();
        ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF) +
                     GetNodeMinBWAffectedCost(*E2, CommonVF);
      } else {
        ExtraCost += GetNodeMinBWAffectedCost(*E, E->getVectorFactor()) +
                     GetNodeMinBWAffectedCost(*E2, E2->getVectorFactor());
      }
      V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
      V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
    } else if (!V1 && P2.isNull()) {
      // Shuffle single entry node.
      const TreeEntry *E = cast<const TreeEntry *>(P1);
      unsigned VF = E->getVectorFactor();
      CommonVF = VF;
      assert(
          all_of(Mask,
                 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
          "All elements in mask must be less than CommonVF.");
      if (E->Scalars.size() == Mask.size() && VF != Mask.size()) {
        SmallVector<int> EMask = E->getCommonMask();
        assert(!EMask.empty() && "Expected non-empty common mask.");
        for (int &Idx : CommonMask) {
          if (Idx != PoisonMaskElem)
            Idx = EMask[Idx];
        }
        CommonVF = E->Scalars.size();
      } else if (unsigned Factor = E->getInterleaveFactor();
                 Factor > 0 && E->Scalars.size() != Mask.size() &&
                 /* ... */) {
        // Deinterleaved nodes are free.
        std::iota(CommonMask.begin(), CommonMask.end(), 0);
      }
      ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF);
      V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
      if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() &&
          CommonVF == CommonMask.size() &&
          any_of(enumerate(CommonMask),
                 [](const auto &&P) {
                   return P.value() != PoisonMaskElem &&
                          static_cast<unsigned>(P.value()) != P.index();
                 })) {
        // ...
      }
    } else if (V1 && P2.isNull()) {
      // Shuffle single vector.
      ExtraCost += GetValueMinBWAffectedCost(V1);
      CommonVF = getVF(V1);
      assert(
          all_of(Mask,
                 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
          "All elements in mask must be less than CommonVF.");
    } else if (V1 && !V2) {
      // Shuffle vector and tree node.
      unsigned VF = getVF(V1);
      const TreeEntry *E2 = cast<const TreeEntry *>(P2);
      CommonVF = std::max(VF, E2->getVectorFactor());
      assert(all_of(Mask,
                    [=](int Idx) {
                      return Idx < 2 * static_cast<int>(CommonVF);
                    }) &&
             "All elements in mask must be less than 2 * CommonVF.");
      if (E2->Scalars.size() == VF && VF != CommonVF) {
        SmallVector<int> E2Mask = E2->getCommonMask();
        assert(!E2Mask.empty() && "Expected non-empty common mask.");
        for (int &Idx : CommonMask) {
          if (Idx == PoisonMaskElem)
            continue;
          if (Idx >= static_cast<int>(CommonVF))
            Idx = E2Mask[Idx - CommonVF] + VF;
        }
        CommonVF = VF;
      }
      ExtraCost += GetValueMinBWAffectedCost(V1);
      V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
      ExtraCost += GetNodeMinBWAffectedCost(
          *E2, std::min(CommonVF, E2->getVectorFactor()));
      V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
    } else if (!V1 && V2) {
      // Shuffle tree node and vector.
      unsigned VF = getVF(V2);
      const TreeEntry *E1 = cast<const TreeEntry *>(P1);
      CommonVF = std::max(VF, E1->getVectorFactor());
      assert(all_of(Mask,
                    [=](int Idx) {
                      return Idx < 2 * static_cast<int>(CommonVF);
                    }) &&
             "All elements in mask must be less than 2 * CommonVF.");
      if (E1->Scalars.size() == VF && VF != CommonVF) {
        SmallVector<int> E1Mask = E1->getCommonMask();
        assert(!E1Mask.empty() && "Expected non-empty common mask.");
        for (int &Idx : CommonMask) {
          if (Idx == PoisonMaskElem)
            continue;
          if (Idx >= static_cast<int>(CommonVF))
            Idx = E1Mask[Idx - CommonVF] + VF;
          else
            Idx = E1Mask[Idx];
        }
        CommonVF = VF;
      }
      ExtraCost += GetNodeMinBWAffectedCost(
          *E1, std::min(CommonVF, E1->getVectorFactor()));
      V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
      ExtraCost += GetValueMinBWAffectedCost(V2);
      V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
    } else {
      assert(V1 && V2 && "Expected both vectors.");
      unsigned VF = getVF(V1);
      CommonVF = std::max(VF, getVF(V2));
      assert(all_of(Mask,
                    [=](int Idx) {
                      return Idx < 2 * static_cast<int>(CommonVF);
                    }) &&
             "All elements in mask must be less than 2 * CommonVF.");
      ExtraCost +=
          GetValueMinBWAffectedCost(V1) + GetValueMinBWAffectedCost(V2);
      if (V1->getType() != V2->getType()) {
        V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
        V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
      } else {
        if (cast<VectorType>(V1->getType())->getElementType() != ScalarTy)
          V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
        if (cast<VectorType>(V2->getType())->getElementType() != ScalarTy)
          V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
      }
    }
    InVectors.front() =
        Constant::getNullValue(getWidenedType(ScalarTy, CommonMask.size()));
    if (InVectors.size() == 2)
      InVectors.pop_back();
    return ExtraCost + BaseShuffleAnalysis::createShuffle<InstructionCost>(
                           V1, V2, CommonMask, Builder, ScalarTy);
  }
public:
  ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI,
                       ArrayRef<Value *> VectorizedVals, BoUpSLP &R,
                       SmallPtrSetImpl<Value *> &CheckedExtracts)
      : BaseShuffleAnalysis(ScalarTy), TTI(TTI),
        VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()), R(R),
        CheckedExtracts(CheckedExtracts) {}

  Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
                        ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
                        unsigned NumParts, bool &UseVecBaseAsInput) {
    UseVecBaseAsInput = false;
    if (Mask.empty())
      return nullptr;
    Value *VecBase = nullptr;
    ArrayRef<Value *> VL = E->Scalars;
    if (!E->ReorderIndices.empty()) {
      SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
                                   E->ReorderIndices.end());
      // ...
    }
    // Check if it can be considered reused if the same extractelements were
    // vectorized already.
    bool PrevNodeFound = any_of(
        ArrayRef(R.VectorizableTree).take_front(E->Idx),
        [&](const std::unique_ptr<TreeEntry> &TE) {
          return ((TE->hasState() && !TE->isAltShuffle() &&
                   TE->getOpcode() == Instruction::ExtractElement) ||
                  TE->isGather()) &&
                 all_of(enumerate(TE->Scalars), [&](auto &&Data) {
                   return VL.size() > Data.index() &&
                          (Mask[Data.index()] == PoisonMaskElem ||
                           isa<UndefValue>(VL[Data.index()]) ||
                           Data.value() == VL[Data.index()]);
                 });
        });
    SmallPtrSet<Value *, 4> UniqueBases;
    unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
    SmallDenseMap<Value *, APInt, 4> VectorOpsToExtracts;
    for (unsigned Part : seq<unsigned>(NumParts)) {
      unsigned Limit = getNumElems(VL.size(), SliceSize, Part);
      ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
      for (auto [I, V] : enumerate(VL.slice(Part * SliceSize, Limit))) {
        // Ignore non-extractelement scalars.
        if (isa<UndefValue>(V) ||
            (!SubMask.empty() && SubMask[I] == PoisonMaskElem))
          continue;
        // If all users of an instruction will be vectorized and the
        // instruction itself is not, this extractelement becomes dead and
        // its cost can be removed from the final cost of the tree.
        auto *EE = cast<ExtractElementInst>(V);
        VecBase = EE->getVectorOperand();
        UniqueBases.insert(VecBase);
        if (!CheckedExtracts.insert(V).second ||
            !R.areAllUsersVectorized(cast<Instruction>(V), &VectorizedVals) ||
            any_of(EE->users(),
                   [&](User *U) {
                     return isa<GetElementPtrInst>(U) &&
                            !R.areAllUsersVectorized(cast<Instruction>(U),
                                                     &VectorizedVals);
                   }) ||
            /* ... */)
          continue;
        std::optional<unsigned> EEIdx = getExtractIndex(EE);
        if (!EEIdx)
          continue;
        unsigned Idx = *EEIdx;
        // Take credit for the instruction that will become dead.
        if (EE->hasOneUse() || !PrevNodeFound) {
          Instruction *Ext = EE->user_back();
          if (isa<SExtInst, ZExtInst>(Ext) &&
              all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
            // Use getExtractWithExtendCost() for the extract/ext pair, and
            // add back the cost of the s|zext that is subtracted separately.
            Cost -= TTI.getExtractWithExtendCost(
                Ext->getOpcode(), Ext->getType(), EE->getVectorOperandType(),
                Idx, CostKind);
            Cost += TTI.getCastInstrCost(
                Ext->getOpcode(), Ext->getType(), EE->getType(),
                TTI::getCastContextHint(Ext), CostKind, Ext);
            continue;
          }
        }
        APInt &DemandedElts =
            VectorOpsToExtracts
                .try_emplace(VecBase,
                             APInt::getZero(getNumElements(VecBase->getType())))
                .first->getSecond();
        DemandedElts.setBit(Idx);
      }
    }
    for (const auto &[Vec, DemandedElts] : VectorOpsToExtracts)
      Cost -= TTI.getScalarizationOverhead(cast<VectorType>(Vec->getType()),
                                           DemandedElts, /*Insert=*/false,
                                           /*Extract=*/true, CostKind);
    // Check whether the gather of extractelements can be represented as just
    // a shuffle of one or two of the source vectors.
    if (!PrevNodeFound)
      Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts);
    // ...
    transformMaskAfterShuffle(CommonMask, CommonMask);
    SameNodesEstimated = false;
    if (NumParts != 1 && UniqueBases.size() != 1) {
      UseVecBaseAsInput = true;
      // ...
    }
    return VecBase;
  }
  std::optional<InstructionCost>
  needToDelay(const TreeEntry *,
              ArrayRef<SmallVector<const TreeEntry *>>) const {
    // No need to delay the cost estimation during analysis.
    return std::nullopt;
  }
  void resetForSameNode() {
    IsFinalized = false;
    CommonMask.clear();
    InVectors.clear();
    Cost = 0;
    VectorizedVals.clear();
    SameNodesEstimated = true;
  }
  void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
    if (&E1 == &E2) {
      assert(all_of(Mask,
                    [&](int Idx) {
                      return Idx < static_cast<int>(E1.getVectorFactor());
                    }) &&
             "Expected single vector shuffle mask.");
      add(E1, Mask);
      return;
    }
    if (InVectors.empty()) {
      CommonMask.assign(Mask.begin(), Mask.end());
      InVectors.assign({&E1, &E2});
      return;
    }
    assert(!CommonMask.empty() && "Expected non-empty common mask.");
    // ...
    unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
    const auto *It =
        find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
    unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
    estimateNodesPermuteCost(E1, &E2, Mask, Part, SliceSize);
  }
  void add(const TreeEntry &E1, ArrayRef<int> Mask) {
    if (InVectors.empty()) {
      CommonMask.assign(Mask.begin(), Mask.end());
      InVectors.assign(1, &E1);
      return;
    }
    assert(!CommonMask.empty() && "Expected non-empty common mask.");
    // ...
    unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
    const auto *It =
        find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
    unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
    estimateNodesPermuteCost(E1, nullptr, Mask, Part, SliceSize);
    if (!SameNodesEstimated && InVectors.size() == 1)
      InVectors.emplace_back(&E1);
  }
  /// Adds 2 input vectors and the mask for their shuffling.
  void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
    // May come only for shuffling of 2 vectors with extractelements, already
    // handled in adjustExtracts.
    assert(InVectors.size() == 1 &&
           all_of(enumerate(CommonMask),
                  [&](auto P) {
                    if (P.value() == PoisonMaskElem)
                      return Mask[P.index()] == PoisonMaskElem;
                    auto *EI = cast<ExtractElementInst>(
                        cast<const TreeEntry *>(InVectors.front())
                            ->getOrdered(P.index()));
                    return EI->getVectorOperand() == V1 ||
                           EI->getVectorOperand() == V2;
                  }) &&
           "Expected extractelement vectors.");
  }
  /// Adds another one input vector and the mask for the shuffling.
  void add(Value *V1, ArrayRef<int> Mask, bool ForExtracts = false) {
    if (InVectors.empty()) {
      assert(CommonMask.empty() && !ForExtracts &&
             "Expected empty input mask/vectors.");
      CommonMask.assign(Mask.begin(), Mask.end());
      InVectors.assign(1, V1);
      return;
    }
    if (ForExtracts) {
      // No need to add vectors here, they were already handled in
      // adjustExtracts.
      assert(InVectors.size() == 1 &&
             !CommonMask.empty() &&
             all_of(enumerate(CommonMask),
                    [&](auto P) {
                      Value *Scalar =
                          cast<const TreeEntry *>(InVectors.front())
                              ->getOrdered(P.index());
                      if (P.value() == PoisonMaskElem)
                        return P.value() == Mask[P.index()] ||
                               isa<UndefValue>(Scalar);
                      auto *EI = cast<ExtractElementInst>(Scalar);
                      return EI->getVectorOperand() == V1;
                    }) &&
             "Expected only tree entry for extractelement vectors.");
      return;
    }
    assert(!InVectors.empty() && !CommonMask.empty() &&
           "Expected only tree entries from extracts/reused buildvectors.");
    unsigned VF = getVF(V1);
    if (InVectors.size() == 2) {
      Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
      VF = std::max<unsigned>(VF, CommonMask.size());
    } else if (const auto *InTE =
                   InVectors.front().dyn_cast<const TreeEntry *>()) {
      VF = std::max(VF, InTE->getVectorFactor());
    } else {
      VF = std::max(VF, getNumElements(
                            cast<Value *>(InVectors.front())->getType()));
    }
    InVectors.push_back(V1);
    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
      if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
        CommonMask[Idx] = Mask[Idx] + VF;
  }
  Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
                Value *Root = nullptr) {
    Cost += getBuildVectorCost(VL, Root);
    if (!Root) {
      // FIXME: need a way to avoid use of getNullValue here.
      SmallVector<Constant *> Vals;
      unsigned VF = VL.size();
      if (MaskVF != 0)
        VF = std::min(VF, MaskVF);
      Type *VLScalarTy = VL.front()->getType();
      for (Value *V : VL.take_front(VF)) {
        // ...
      }
      return ConstantVector::get(Vals);
    }
    return ConstantVector::getSplat(
        ElementCount::getFixed(
            cast<FixedVectorType>(Root->getType())->getNumElements()),
        getAllOnesValue(*R.DL, ScalarTy->getScalarType()));
  }
  InstructionCost createFreeze(InstructionCost Cost) { return Cost; }
  /// Finalize emission of the shuffles.
  InstructionCost
  finalize(ArrayRef<int> ExtMask,
           ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
           ArrayRef<int> SubVectorsMask, unsigned VF = 0,
           function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
    IsFinalized = true;
    if (Action) {
      const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
      if (InVectors.size() == 2)
        Cost += createShuffle(Vec, InVectors.back(), CommonMask);
      else
        Cost += createShuffle(Vec, nullptr, CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
      assert(VF > 0 &&
             "Expected vector length for the final value before action.");
      Value *V = cast<Value *>(Vec);
      Action(V, CommonMask);
      InVectors.front() = V;
    }
    if (!SubVectors.empty()) {
      const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
      if (InVectors.size() == 2)
        Cost += createShuffle(Vec, InVectors.back(), CommonMask);
      else
        Cost += createShuffle(Vec, nullptr, CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
      // Add subvectors permutation cost.
      if (!SubVectorsMask.empty()) {
        assert(SubVectorsMask.size() <= CommonMask.size() &&
               "Expected same size of masks for subvectors and common mask.");
        SmallVector<int> SVMask(CommonMask.size(), PoisonMaskElem);
        copy(SubVectorsMask, SVMask.begin());
        for (auto [I1, I2] : zip(SVMask, CommonMask)) {
          if (I2 != PoisonMaskElem) {
            assert(I1 == PoisonMaskElem && "Expected unused subvectors mask");
            I1 = I2 + CommonMask.size();
          }
        }
        Cost += ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc,
                                 getWidenedType(ScalarTy, CommonMask.size()),
                                 SVMask, CostKind);
      }
      for (auto [E, Idx] : SubVectors) {
        Type *EScalarTy = E->Scalars.front()->getType();
        bool IsSigned = true;
        if (auto It = R.MinBWs.find(E); It != R.MinBWs.end()) {
          EScalarTy =
              IntegerType::get(EScalarTy->getContext(), It->second.first);
          IsSigned = It->second.second;
        }
        if (ScalarTy != EScalarTy) {
          unsigned CastOpcode = Instruction::Trunc;
          unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
          unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
          if (DstSz > SrcSz)
            CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
          Cost += TTI.getCastInstrCost(
              CastOpcode, getWidenedType(ScalarTy, E->getVectorFactor()),
              getWidenedType(EScalarTy, E->getVectorFactor()),
              TTI::CastContextHint::Normal, CostKind);
        }
        Cost += ::getShuffleCost(
            TTI, TTI::SK_InsertSubvector,
            getWidenedType(ScalarTy, CommonMask.size()), {}, CostKind, Idx,
            getWidenedType(ScalarTy, E->getVectorFactor()));
        if (!CommonMask.empty()) {
          std::iota(std::next(CommonMask.begin(), Idx),
                    std::next(CommonMask.begin(), Idx + E->getVectorFactor()),
                    Idx);
        }
      }
    }
    if (!ExtMask.empty()) {
      if (CommonMask.empty()) {
        CommonMask.assign(ExtMask.begin(), ExtMask.end());
      } else {
        SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
        for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
          if (ExtMask[I] == PoisonMaskElem)
            continue;
          NewMask[I] = CommonMask[ExtMask[I]];
        }
        CommonMask.swap(NewMask);
      }
    }
    if (CommonMask.empty()) {
      assert(InVectors.size() == 1 && "Expected only one vector with no mask");
      return Cost;
    }
    return Cost +
           createShuffle(InVectors.front(),
                         InVectors.size() == 2 ? InVectors.back() : nullptr,
                         CommonMask);
  }

  ~ShuffleCostEstimator() {
    assert((IsFinalized || CommonMask.empty()) &&
           "Shuffle construction must be finalized.");
  }
};
const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E,
                                                   unsigned Idx) const {
  TreeEntry *Op = OperandsToTreeEntry.at({E, Idx});
  assert(Op->isSame(E->getOperand(Idx)) && "Operands mismatch!");
  return Op;
}

TTI::CastContextHint BoUpSLP::getCastContextHint(const TreeEntry &TE) const {
  if (TE.State == TreeEntry::ScatterVectorize ||
      TE.State == TreeEntry::StridedVectorize)
    return TTI::CastContextHint::GatherScatter;
  if (TE.State == TreeEntry::CompressVectorize)
    return TTI::CastContextHint::Masked;
  if (TE.State == TreeEntry::Vectorize &&
      TE.getOpcode() == Instruction::Load && !TE.isAltShuffle()) {
    if (TE.ReorderIndices.empty())
      return TTI::CastContextHint::Normal;
    return TTI::CastContextHint::Reversed;
  }
  return TTI::CastContextHint::None;
}
InstructionCost
BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
                      SmallPtrSetImpl<Value *> &CheckedExtracts) {
  ArrayRef<Value *> VL = E->Scalars;

  Type *ScalarTy = getValueType(VL[0]);
  // ...
  constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;

  // If a smaller type was computed for the expression, update VecTy so that
  // the costs are accurate.
  auto It = MinBWs.find(E);
  Type *OrigScalarTy = ScalarTy;
  if (It != MinBWs.end()) {
    // ...
  }
  auto *VecTy = getWidenedType(ScalarTy, VL.size());
  unsigned EntryVF = E->getVectorFactor();
  auto *FinalVecTy = getWidenedType(ScalarTy, EntryVF);

  if (E->isGather()) {
    // ...
    if (isa<CmpInst>(VL.front()))
      ScalarTy = VL.front()->getType();
    return processBuildVector<ShuffleCostEstimator, InstructionCost>(
        E, ScalarTy, *TTI, VectorizedVals, *this, CheckedExtracts);
  }
  if (E->State == TreeEntry::SplitVectorize) {
    assert(E->CombinedEntriesWithIndices.size() == 2 &&
           "Expected exactly 2 combined entries.");
    assert(E->ReuseShuffleIndices.empty() && "Expected empty reuses mask.");
    InstructionCost VectorCost = 0;
    if (E->ReorderIndices.empty()) {
      VectorCost = ::getShuffleCost(
          *TTI, TTI::SK_InsertSubvector, FinalVecTy, {}, CostKind,
          E->CombinedEntriesWithIndices.back().second,
          getWidenedType(
              ScalarTy,
              VectorizableTree[E->CombinedEntriesWithIndices.back().first]
                  ->getVectorFactor()));
    } else {
      unsigned CommonVF =
          std::max(VectorizableTree[E->CombinedEntriesWithIndices.front().first]
                       ->getVectorFactor(),
                   VectorizableTree[E->CombinedEntriesWithIndices.back().first]
                       ->getVectorFactor());
      VectorCost = ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc,
                                    getWidenedType(ScalarTy, CommonVF),
                                    /* ... */);
    }
    LLVM_DEBUG(dumpTreeCosts(E, 0, VectorCost, 0, "Calculated costs for Tree"));
    return VectorCost;
  }
  InstructionCost CommonCost = 0;
  SmallVector<int> Mask;
  if (!E->ReorderIndices.empty() && E->State != TreeEntry::CompressVectorize &&
      (E->State != TreeEntry::StridedVectorize ||
       /* ... */)) {
    SmallVector<int> NewMask;
    if (E->getOpcode() == Instruction::Store) {
      // For stores the order is actually a mask.
      NewMask.resize(E->ReorderIndices.size());
      // ...
    } else {
      inversePermutation(E->ReorderIndices, NewMask);
    }
    ::addMask(Mask, NewMask);
  }
  if (!E->ReuseShuffleIndices.empty())
    ::addMask(Mask, E->ReuseShuffleIndices);
  // ...
  assert((E->State == TreeEntry::Vectorize ||
          E->State == TreeEntry::ScatterVectorize ||
          E->State == TreeEntry::StridedVectorize ||
          E->State == TreeEntry::CompressVectorize) &&
         "Unhandled state");
  assert((E->getOpcode() ||
          (E->getOpcode() == Instruction::GetElementPtr &&
           E->getMainOp()->getType()->isPointerTy()) ||
          E->hasCopyableElements()) &&
         "Invalid VL");
  Instruction *VL0 = E->getMainOp();
  unsigned ShuffleOrOp =
      E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
  if (E->CombinedOp != TreeEntry::NotCombinedOp)
    ShuffleOrOp = E->CombinedOp;
  SmallSetVector<Value *, 16> UniqueValues(VL.begin(), VL.end());
  const unsigned Sz = UniqueValues.size();
  SmallBitVector UsedScalars(Sz, false);
  for (unsigned I = 0; I < Sz; ++I) {
    if (isa<Instruction>(UniqueValues[I]) &&
        !E->isCopyableElement(UniqueValues[I]) &&
        getTreeEntries(UniqueValues[I]).front() == E)
      continue;
    UsedScalars.set(I);
  }
  auto GetCastContextHint = [&](Value *V) {
    if (ArrayRef<TreeEntry *> OpTEs = getTreeEntries(V); OpTEs.size() == 1)
      return getCastContextHint(*OpTEs.front());
    InstructionsState SrcState = getSameOpcode(E->getOperand(0), *TLI);
    if (SrcState && SrcState.getOpcode() == Instruction::Load &&
        !SrcState.isAltShuffle())
      return TTI::CastContextHint::GatherScatter;
    return TTI::CastContextHint::None;
  };
  auto GetCostDiff =
      [=](function_ref<InstructionCost(unsigned)> ScalarEltCost,
          function_ref<InstructionCost(InstructionCost)> VectorCost) {
        // Calculate the cost of this instruction.
        InstructionCost ScalarCost = 0;
        if (isa<CastInst, CallInst>(VL0)) {
          // For some instructions there is no need to compute the cost for
          // each lane; the single-instruction cost times the number of scalar
          // instructions suffices.
          ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0);
        } else {
          for (unsigned I = 0; I < Sz; ++I) {
            if (UsedScalars.test(I))
              continue;
            ScalarCost += ScalarEltCost(I);
          }
        }

        InstructionCost VecCost = VectorCost(CommonCost);
        // Check if the current node must be resized when the parent node is
        // not.
        if (It != MinBWs.end() &&
            (E->getOpcode() != Instruction::Load || E->UserTreeIndex)) {
          const EdgeInfo &EI = E->UserTreeIndex;
          if (!EI.UserTE->hasState() ||
              EI.UserTE->getOpcode() != Instruction::Select ||
              EI.EdgeIdx != 0) {
            auto UserBWIt = MinBWs.find(EI.UserTE);
            Type *UserScalarTy =
                (EI.UserTE->isGather() ||
                 EI.UserTE->State == TreeEntry::SplitVectorize)
                    ? EI.UserTE->Scalars.front()->getType()
                    : EI.UserTE->getOperand(EI.EdgeIdx).front()->getType();
            if (UserBWIt != MinBWs.end())
              UserScalarTy = IntegerType::get(ScalarTy->getContext(),
                                              UserBWIt->second.first);
            if (ScalarTy != UserScalarTy) {
              unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
              unsigned SrcBWSz = DL->getTypeSizeInBits(UserScalarTy);
              unsigned VecOpcode;
              auto *UserVecTy = getWidenedType(UserScalarTy, E->Scalars.size());
              if (BWSz > SrcBWSz)
                VecOpcode = Instruction::Trunc;
              else
                VecOpcode =
                    It->second.second ? Instruction::SExt : Instruction::ZExt;
              TTI::CastContextHint CCH = GetCastContextHint(VL0);
              VecCost += TTI->getCastInstrCost(VecOpcode, UserVecTy, VecTy, CCH,
                                               CostKind);
            }
          }
        }
        LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost - CommonCost,
                                 ScalarCost, "Calculated costs for Tree"));
        return VecCost - ScalarCost;
      };
  // Calculate the cost difference from vectorizing a set of GEPs; a negative
  // value means vectorizing is profitable.
  auto GetGEPCostDiff = [=](ArrayRef<Value *> Ptrs, Value *BasePtr) {
    assert((E->State == TreeEntry::Vectorize ||
            E->State == TreeEntry::StridedVectorize ||
            E->State == TreeEntry::CompressVectorize) &&
           "Entry state expected to be Vectorize, StridedVectorize or "
           "MaskedLoadCompressVectorize here.");
    InstructionCost ScalarCost = 0;
    InstructionCost VecCost = 0;
    std::tie(ScalarCost, VecCost) = getGEPCosts(
        *TTI, Ptrs, BasePtr, E->getOpcode(), CostKind, OrigScalarTy, VecTy);
    LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost,
                             "Calculated GEPs cost for Tree"));
    return VecCost - ScalarCost;
  };

  auto GetMinMaxCost = [&](Type *Ty, Instruction *VI = nullptr) {
    auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(/* ... */);
    if (MinMaxID == Intrinsic::not_intrinsic)
      return InstructionCost::getInvalid();
    Type *CanonicalType = Ty;
    if (CanonicalType->isPtrOrPtrVectorTy())
      CanonicalType = CanonicalType->getWithNewType(IntegerType::get(
          CanonicalType->getContext(),
          DL->getTypeSizeInBits(CanonicalType->getScalarType())));

    IntrinsicCostAttributes CostAttrs(MinMaxID, CanonicalType,
                                      {CanonicalType, CanonicalType});
    InstructionCost IntrinsicCost =
        TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
    // If the selects are the only uses of the compares, they will be dead, so
    // remove their cost.
    if (VI && SelectOnly) {
      assert(/* ... */ && "Expected only for scalar type.");
      auto *CI = cast<CmpInst>(VI->getOperand(0));
      IntrinsicCost -= TTI->getCmpSelInstrCost(
          CI->getOpcode(), Ty, Builder.getInt1Ty(), CI->getPredicate(),
          CostKind, {TTI::OK_AnyValue, TTI::OP_None},
          {TTI::OK_AnyValue, TTI::OP_None}, CI);
    }
    return IntrinsicCost;
  };
  auto GetFMulAddCost = [&, &TTI = *TTI](const InstructionsState &S,
                                         Instruction *I) {
    // ... (checks whether (a * b) + c can be contracted into an fma)
  };
  switch (ShuffleOrOp) {
  case Instruction::PHI: {
    // Count reused scalars.
    InstructionCost ScalarCost = 0;
    SmallPtrSet<const TreeEntry *, 4> CountedOps;
    for (Value *V : UniqueValues) {
      auto *PHI = dyn_cast<PHINode>(V);
      if (!PHI)
        continue;

      ValueList Operands(PHI->getNumIncomingValues(), nullptr);
      for (unsigned I = 0, N = PHI->getNumIncomingValues(); I < N; ++I)
        Operands[I] = PHI->getIncomingValue(I);
      if (const TreeEntry *OpTE =
              getSameValuesTreeEntry(Operands.front(), Operands))
        if (CountedOps.insert(OpTE).second &&
            !OpTE->ReuseShuffleIndices.empty())
          ScalarCost += TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() -
                                          OpTE->Scalars.size());
    }

    return CommonCost - ScalarCost;
  }
  case Instruction::ExtractValue:
  case Instruction::ExtractElement: {
    APInt DemandedElts;
    VectorType *SrcVecTy = nullptr;
    auto GetScalarCost = [&](unsigned Idx) {
      auto *I = dyn_cast<Instruction>(UniqueValues[Idx]);
      if (!I)
        return InstructionCost(TTI::TCC_Free);
      if (ShuffleOrOp == Instruction::ExtractElement) {
        auto *EE = cast<ExtractElementInst>(I);
        SrcVecTy = EE->getVectorOperandType();
      } else {
        auto *EV = cast<ExtractValueInst>(I);
        Type *AggregateTy = EV->getAggregateOperand()->getType();
        unsigned NumElts;
        if (auto *ATy = dyn_cast<ArrayType>(AggregateTy))
          NumElts = ATy->getNumElements();
        else
          NumElts = AggregateTy->getStructNumElements();
        SrcVecTy = getWidenedType(OrigScalarTy, NumElts);
      }
      if (I->hasOneUse()) {
        Instruction *Ext = I->user_back();
        if (isa<SExtInst, ZExtInst>(Ext) &&
            all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
          // Use getExtractWithExtendCost() for the extractelement/ext pair,
          // subtracting the cost of the s|zext which is counted separately.
          InstructionCost Cost = /* ... */;
          Cost -= TTI->getCastInstrCost(
              Ext->getOpcode(), Ext->getType(), I->getType(),
              TTI::getCastContextHint(Ext), CostKind, Ext);
          return Cost;
        }
      }
      if (DemandedElts.isZero())
        DemandedElts = APInt::getZero(getNumElements(SrcVecTy));
      DemandedElts.setBit(*getExtractIndex(I));
      return InstructionCost(TTI::TCC_Free);
    };
    auto GetVectorCost = [&, &TTI = *TTI](InstructionCost CommonCost) {
      return CommonCost - (DemandedElts.isZero()
                               ? TTI::TCC_Free
                               : TTI.getScalarizationOverhead(
                                     SrcVecTy, DemandedElts, /*Insert=*/false,
                                     /*Extract=*/true, CostKind));
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::InsertElement: {
    assert(E->ReuseShuffleIndices.empty() &&
           "Unique insertelements only are expected.");
    auto *SrcVecTy = cast<FixedVectorType>(ScalarTy);
    unsigned const NumElts = SrcVecTy->getNumElements();
    unsigned const NumScalars = VL.size();

    unsigned NumOfParts = TTI->getNumberOfParts(SrcVecTy);

    SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
    unsigned OffsetBeg = *getElementIndex(VL.front());
    unsigned OffsetEnd = OffsetBeg;
    InsertMask[OffsetBeg] = 0;
    for (auto [I, V] : enumerate(VL.drop_front())) {
      unsigned Idx = *getElementIndex(V);
      if (OffsetBeg > Idx)
        OffsetBeg = Idx;
      else if (OffsetEnd < Idx)
        OffsetEnd = Idx;
      InsertMask[Idx] = I + 1;
    }
    unsigned VecScalarsSz = PowerOf2Ceil(NumElts);
    if (NumOfParts > 0 && NumOfParts < NumElts)
      VecScalarsSz = PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
    unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
                     VecScalarsSz;
    unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
    unsigned InsertVecSz = std::min<unsigned>(
        PowerOf2Ceil(OffsetEnd - OffsetBeg + 1),
        ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * VecScalarsSz);
    bool IsWholeSubvector =
        OffsetBeg == Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
    // Check if we can safely insert a subvector. If it is not possible, just
    // generate a whole-size vector and shuffle the source and the new
    // subvector.
    if (OffsetBeg + InsertVecSz > VecSz) {
      // ...
      InsertVecSz = VecSz;
    }

    APInt DemandedElts = APInt::getZero(NumElts);
    SmallVector<int> Mask;
    if (!E->ReorderIndices.empty()) {
      inversePermutation(E->ReorderIndices, Mask);
      Mask.append(InsertVecSz - Mask.size(), PoisonMaskElem);
    } else {
      Mask.assign(VecSz, PoisonMaskElem);
      std::iota(Mask.begin(), std::next(Mask.begin(), InsertVecSz), 0);
    }
    bool IsIdentity = true;
    SmallVector<int> PrevMask(InsertVecSz, PoisonMaskElem);
    Mask.swap(PrevMask);
    for (unsigned I = 0; I < NumScalars; ++I) {
      unsigned InsertIdx = *getElementIndex(VL[PrevMask[I]]);
      DemandedElts.setBit(InsertIdx);
      IsIdentity &= InsertIdx - OffsetBeg == I;
      Mask[InsertIdx - OffsetBeg] = I;
    }
    assert(Offset < NumElts && "Failed to find vector index offset");

    // ...
    auto *InsertVecTy = getWidenedType(ScalarTy, InsertVecSz);
    if (!IsIdentity)
      Cost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
                               InsertVecTy, Mask);
    auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
      return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
    }));
    // Second cost - permutation with subvector, if some elements come from
    // the initial vector or when inserting a subvector.
    SmallBitVector InMask =
        isUndefVector(FirstInsert->getOperand(0),
                      buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask));
    if (!InMask.all() && NumScalars != NumElts && !IsWholeSubvector) {
      if (InsertVecSz != VecSz) {
        // ...
      }
      // ...
      for (unsigned I = 0, End = OffsetBeg - Offset; I < End; ++I) {
        /* ... */
      }
      for (unsigned I = OffsetBeg - Offset, End = OffsetEnd - Offset;
           I <= End; ++I) {
        /* ... */
      }
      for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I) {
        /* ... */
      }
      // ...
    }
    return Cost;
  }
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
    // ...
    unsigned Opcode = ShuffleOrOp;
    unsigned VecOpcode = Opcode;
    if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
        (SrcIt != MinBWs.end() || It != MinBWs.end())) {
      // Check if the values are candidates to demote.
      unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy->getScalarType());
      if (SrcIt != MinBWs.end()) {
        SrcBWSz = SrcIt->second.first;
        // ...
      }
      unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType());
      if (BWSz == SrcBWSz) {
        VecOpcode = Instruction::BitCast;
      } else if (BWSz < SrcBWSz) {
        VecOpcode = Instruction::Trunc;
      } else if (It != MinBWs.end()) {
        assert(BWSz > SrcBWSz && "Invalid cast!");
        VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
      } else if (SrcIt != MinBWs.end()) {
        assert(BWSz > SrcBWSz && "Invalid cast!");
        VecOpcode =
            SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
      }
    } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
               !SrcIt->second.second) {
      VecOpcode = Instruction::UIToFP;
    }
    auto GetScalarCost = [&](unsigned Idx) -> InstructionCost {
      assert(Idx == 0 && "Expected 0 index only");
      return TTI->getCastInstrCost(Opcode, VL0->getType(),
                                   VL0->getOperand(0)->getType(),
                                   TTI::getCastContextHint(VL0), CostKind,
                                   VL0);
    };
    auto GetVectorCost = [=](InstructionCost CommonCost) {
      // Do not count the cost if the cast is a noop bitcast caused only by
      // the minimum bitwidth analysis.
      if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
        return CommonCost;
      auto *VI = VL0->getOpcode() == Opcode ? VL0 : nullptr;
      // ...
      bool IsArithmeticExtendedReduction =
          E->Idx == 0 && UserIgnoreList &&
          all_of(*UserIgnoreList, [](Value *V) {
            auto *I = cast<Instruction>(V);
            return is_contained({Instruction::Add, Instruction::FAdd,
                                 Instruction::Mul, Instruction::FMul,
                                 Instruction::And, Instruction::Or,
                                 Instruction::Xor},
                                I->getOpcode());
          });
      if (IsArithmeticExtendedReduction &&
          (VecOpcode == Instruction::ZExt || VecOpcode == Instruction::SExt))
        return CommonCost;
      return CommonCost +
             TTI->getCastInstrCost(VecOpcode, VecTy, SrcVecTy, CCH, CostKind,
                                   VecOpcode == Opcode ? VI : nullptr);
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
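  // Compares/selects: each scalar is costed with its own predicate, while the
  // vector compare uses the common (possibly swapped) predicate of the bundle.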
  case Instruction::FCmp:
  case Instruction::ICmp:
  case Instruction::Select: {
    CmpPredicate VecPred, SwappedVecPred;
    auto MatchCmp = m_Cmp(VecPred, m_Value(), m_Value());
    if (match(VL0, m_Select(MatchCmp, m_Value(), m_Value())) ||
        match(VL0, MatchCmp))
      SwappedVecPred = CmpInst::getSwappedPredicate(VecPred);
    // ...
    auto GetScalarCost = [&](unsigned Idx) {
      auto *VI = cast<Instruction>(UniqueValues[Idx]);
      // Scalars not matching the common compare pattern fall back to the
      // swapped predicate.
      // ...
      return TTI->getCmpSelInstrCost(
          E->getOpcode(), OrigScalarTy, Builder.getInt1Ty(), CurrentPred,
          CostKind, getOperandInfo(VI->getOperand(0)),
          getOperandInfo(VI->getOperand(1)), VI);
    };
    auto GetVectorCost = [&](InstructionCost CommonCost) {
      auto *MaskTy = getWidenedType(Builder.getInt1Ty(), VL.size());
      InstructionCost VecCost =
          TTI->getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy, VecPred,
                                  CostKind, getOperandInfo(E->getOperand(0)),
                                  getOperandInfo(E->getOperand(1)), VL0);
      if (auto *SI = dyn_cast<SelectInst>(VL0)) {
        auto *CondType =
            getWidenedType(SI->getCondition()->getType(), VL.size());
        unsigned CondNumElements = CondType->getNumElements();
        unsigned VecTyNumElements = getNumElements(VecTy);
        assert(VecTyNumElements >= CondNumElements &&
               VecTyNumElements % CondNumElements == 0 &&
               "Cannot vectorize Instruction::Select");
        if (CondNumElements != VecTyNumElements) {
          // The condition has to be replicated to match the wider result.
          // ...
        }
      }
      return VecCost + CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case TreeEntry::MinMax: {
    auto GetScalarCost = [&](unsigned Idx) {
      return GetMinMaxCost(OrigScalarTy);
    };
    auto GetVectorCost = [&](InstructionCost CommonCost) {
      InstructionCost VecCost = GetMinMaxCost(VecTy);
      return VecCost + CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case TreeEntry::FMulAdd: {
    auto GetScalarCost = [&](unsigned Idx) {
      // ...
      return GetFMulAddCost(E->getOperations(),
                            cast<Instruction>(UniqueValues[Idx]));
    };
    auto GetVectorCost = [&](InstructionCost CommonCost) {
      FastMathFlags FMF;
      // Take the minimal set of fast-math flags of the fused operations.
      for (Value *V : E->Scalars) {
        if (auto *FPCI = dyn_cast<FPMathOperator>(V)) {
          FMF &= FPCI->getFastMathFlags();
          if (auto *FPCIOp = dyn_cast<FPMathOperator>(FPCI->getOperand(0)))
            FMF &= FPCIOp->getFastMathFlags();
        }
      }
      IntrinsicCostAttributes ICA(Intrinsic::fmuladd, VecTy,
                                  {VecTy, VecTy, VecTy}, FMF);
      InstructionCost VecCost = TTI->getIntrinsicInstrCost(ICA, CostKind);
      return VecCost + CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::FNeg:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    auto GetScalarCost = [&](unsigned Idx) {
      // ...
      Value *Op1 = E->getOperand(0)[Idx];
      Value *Op2;
      SmallVector<const Value *, 2> Operands(1, Op1);
      if (ShuffleOrOp == Instruction::FNeg) {
        // ...
      } else {
        Op2 = E->getOperand(1)[Idx];
        Operands.push_back(Op2);
      }
      // ...
      InstructionCost ScalarCost = TTI->getArithmeticInstrCost(
          ShuffleOrOp, OrigScalarTy, CostKind, Op1Info, Op2Info, Operands);
      if (auto *I = dyn_cast<Instruction>(UniqueValues[Idx]);
          I && (ShuffleOrOp == Instruction::FAdd ||
                ShuffleOrOp == Instruction::FSub)) {
        // ... (account for a possible fmuladd contraction)
      }
      return ScalarCost;
    };
    auto GetVectorCost = [=](InstructionCost CommonCost) {
      if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
        // The "and" is free after demotion if every scalar is masked with
        // enough trailing ones:
        //   return CI && CI->getValue().countr_one() >= It->second.first;
        // ...
      }
      return TTI->getArithmeticInstrCost(ShuffleOrOp, VecTy, CostKind,
                                         Op1Info, Op2Info, {}, nullptr, TLI) +
             CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
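  // Memory operations: the vector cost depends on the entry state -
  // consecutive (possibly interleaved), strided, compressed or scattered
  // accesses - plus the cost difference of the vectorized pointer operands.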
  case Instruction::GetElementPtr: {
    return CommonCost + GetGEPCostDiff(VL, VL0);
  }
  case Instruction::Load: {
    auto GetScalarCost = [&](unsigned Idx) {
      auto *VI = cast<LoadInst>(UniqueValues[Idx]);
      return TTI->getMemoryOpCost(Instruction::Load, OrigScalarTy,
                                  VI->getAlign(),
                                  VI->getPointerAddressSpace(), CostKind,
                                  TTI::OperandValueInfo(), VI);
    };
    auto *LI0 = cast<LoadInst>(VL0);
    auto GetVectorCost = [&](InstructionCost CommonCost) {
      InstructionCost VecLdCost;
      switch (E->State) {
      case TreeEntry::Vectorize:
        if (unsigned Factor = E->getInterleaveFactor()) {
          VecLdCost = TTI->getInterleavedMemoryOpCost(
              Instruction::Load, VecTy, Factor, {}, LI0->getAlign(),
              LI0->getPointerAddressSpace(), CostKind);
        } else {
          VecLdCost = TTI->getMemoryOpCost(
              Instruction::Load, VecTy, LI0->getAlign(),
              LI0->getPointerAddressSpace(), CostKind,
              TTI::OperandValueInfo());
        }
        break;
      case TreeEntry::StridedVectorize: {
        Align CommonAlignment =
            computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
        VecLdCost = TTI->getStridedMemoryOpCost(
            Instruction::Load, VecTy, LI0->getPointerOperand(),
            /*VariableMask=*/false, CommonAlignment, CostKind);
        break;
      }
      case TreeEntry::CompressVectorize: {
        bool IsMasked;
        unsigned InterleaveFactor;
        SmallVector<int> CompressMask;
        VectorType *LoadVecTy;
        SmallVector<Value *> Scalars(E->Scalars.begin(), E->Scalars.end());
        if (!E->ReorderIndices.empty()) {
          SmallVector<int> Mask(E->ReorderIndices.begin(),
                                E->ReorderIndices.end());
          reorderScalars(Scalars, Mask);
        }
        SmallVector<Value *> PointerOps(Scalars.size());
        for (auto [I, V] : enumerate(Scalars))
          PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
        [[maybe_unused]] bool IsVectorized = isMaskedLoadCompress(
            Scalars, PointerOps, E->ReorderIndices, *TTI, *DL, *SE, *AC, *DT,
            *TLI, [](Value *) { return true; }, IsMasked, InterleaveFactor,
            CompressMask, LoadVecTy);
        assert(IsVectorized && "Failed to vectorize load");
        CompressEntryToData.try_emplace(E, CompressMask, LoadVecTy,
                                        InterleaveFactor, IsMasked);
        Align CommonAlignment = LI0->getAlign();
        if (InterleaveFactor) {
          VecLdCost = TTI->getInterleavedMemoryOpCost(
              Instruction::Load, LoadVecTy, InterleaveFactor, {},
              CommonAlignment, LI0->getPointerAddressSpace(), CostKind);
        } else if (IsMasked) {
          VecLdCost = TTI->getMaskedMemoryOpCost(
              Instruction::Load, LoadVecTy, CommonAlignment,
              LI0->getPointerAddressSpace(), CostKind);
          VecLdCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
                                        LoadVecTy, CompressMask, CostKind);
        } else {
          VecLdCost = TTI->getMemoryOpCost(
              Instruction::Load, LoadVecTy, CommonAlignment,
              LI0->getPointerAddressSpace(), CostKind);
          VecLdCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
                                        LoadVecTy, CompressMask, CostKind);
        }
        break;
      }
      case TreeEntry::ScatterVectorize: {
        Align CommonAlignment =
            computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
        VecLdCost = TTI->getGatherScatterOpCost(
            Instruction::Load, VecTy, LI0->getPointerOperand(),
            /*VariableMask=*/false, CommonAlignment, CostKind);
        break;
      }
      case TreeEntry::CombinedVectorize:
      case TreeEntry::SplitVectorize:
      case TreeEntry::NeedToGather:
        llvm_unreachable("Unexpected vectorization state.");
      }
      return VecLdCost + CommonCost;
    };

    InstructionCost Cost = GetCostDiff(GetScalarCost, GetVectorCost);
    // If this node generates a masked gather load, it is not a terminal node,
    // hence the address operand cost is estimated separately.
    if (E->State == TreeEntry::ScatterVectorize)
      return Cost;

    // Estimate the cost of the GEPs, since this node is a terminator.
    SmallVector<Value *> PointerOps(VL.size());
    for (auto [I, V] : enumerate(VL))
      if (auto *LI = dyn_cast<LoadInst>(V))
        PointerOps[I] = LI->getPointerOperand();
    return Cost + GetGEPCostDiff(PointerOps, LI0->getPointerOperand());
  }
  case Instruction::Store: {
    bool IsReorder = !E->ReorderIndices.empty();
    auto GetScalarCost = [=](unsigned Idx) {
      auto *VI = cast<StoreInst>(VL[Idx]);
      // ...
      return TTI->getMemoryOpCost(Instruction::Store, OrigScalarTy,
                                  VI->getAlign(),
                                  VI->getPointerAddressSpace(), CostKind,
                                  OpInfo, VI);
    };
    auto *BaseSI =
        cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
    auto GetVectorCost = [=](InstructionCost CommonCost) {
      // We know that we can merge the stores. Calculate the cost.
      InstructionCost VecStCost;
      if (E->State == TreeEntry::StridedVectorize) {
        Align CommonAlignment =
            computeCommonAlignment<StoreInst>(UniqueValues.getArrayRef());
        VecStCost = TTI->getStridedMemoryOpCost(
            Instruction::Store, VecTy, BaseSI->getPointerOperand(),
            /*VariableMask=*/false, CommonAlignment, CostKind);
      } else {
        assert(E->State == TreeEntry::Vectorize &&
               "Expected either strided or consecutive stores.");
        if (unsigned Factor = E->getInterleaveFactor()) {
          assert(E->ReuseShuffleIndices.empty() &&
                 !E->ReorderIndices.empty() && "No reused shuffles expected");
          CommonCost = 0;
          VecStCost = TTI->getInterleavedMemoryOpCost(
              Instruction::Store, VecTy, Factor, {}, BaseSI->getAlign(),
              BaseSI->getPointerAddressSpace(), CostKind);
        } else {
          TTI::OperandValueInfo OpInfo = getOperandInfo(E->getOperand(0));
          VecStCost = TTI->getMemoryOpCost(
              Instruction::Store, VecTy, BaseSI->getAlign(),
              BaseSI->getPointerAddressSpace(), CostKind, OpInfo);
        }
      }
      return VecStCost + CommonCost;
    };
    SmallVector<Value *> PointerOps(VL.size());
    for (auto [I, V] : enumerate(VL)) {
      unsigned Idx = IsReorder ? E->ReorderIndices[I] : I;
      PointerOps[Idx] = cast<StoreInst>(V)->getPointerOperand();
    }
    return GetCostDiff(GetScalarCost, GetVectorCost) +
           GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
  }
  case Instruction::Call: {
    auto GetScalarCost = [&](unsigned Idx) {
      auto *CI = cast<CallInst>(UniqueValues[Idx]);
      // ...
      IntrinsicCostAttributes CostAttrs(ID, *CI, 1);
      return TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
    };
    auto GetVectorCost = [=](InstructionCost CommonCost) {
      auto *CI = cast<CallInst>(VL0);
      Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
      SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(
          CI, ID, VecTy->getNumElements(),
          It != MinBWs.end() ? It->second.first : 0, TTI);
      auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
      return std::min(VecCallCosts.first, VecCallCosts.second) + CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
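  // Alternate-opcode nodes: both vector opcodes are costed plus the blending
  // shuffle; a "diamond match" with an already vectorized node with the same
  // operands makes the vector operations themselves free.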
  case Instruction::ShuffleVector: {
    assert(E->isAltShuffle() &&
           ((Instruction::isBinaryOp(E->getOpcode()) &&
             Instruction::isBinaryOp(E->getAltOpcode())) ||
            (Instruction::isCast(E->getOpcode()) &&
             Instruction::isCast(E->getAltOpcode())) ||
            (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
           "Invalid Shuffle Vector Operand");
    // Try to find the previous shuffle node with the same operands and same
    // main/alternate ops.
    auto TryFindNodeWithEqualOperands = [=]() {
      for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
        if (TE.get() == E)
          break;
        if (TE->hasState() && TE->isAltShuffle() &&
            ((TE->getOpcode() == E->getOpcode() &&
              TE->getAltOpcode() == E->getAltOpcode()) ||
             (TE->getOpcode() == E->getAltOpcode() &&
              TE->getAltOpcode() == E->getOpcode())) &&
            TE->hasEqualOperands(*E))
          return true;
      }
      return false;
    };
    auto GetScalarCost = [&](unsigned Idx) {
      // ...
      auto *VI = cast<Instruction>(UniqueValues[Idx]);
      assert(E->getMatchingMainOpOrAltOp(VI) &&
             "Unexpected main/alternate opcode");
      return TTI->getInstructionCost(VI, CostKind);
    };
    // CommonCost is cleared since the final shuffle cost is included in the
    // vector cost.
    auto GetVectorCost = [&, &TTIRef = *TTI](InstructionCost) {
      InstructionCost VecCost = 0;
      if (TryFindNodeWithEqualOperands()) {
        LLVM_DEBUG({
          dbgs() << "SLP: diamond match for alternate node found.\n";
          E->dump();
        });
        // No need to add new vector costs here since we're going to reuse
        // same main/alternate vector ops, just do different shuffling.
      } else if (Instruction::isBinaryOp(E->getOpcode())) {
        VecCost =
            TTIRef.getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
        VecCost +=
            TTIRef.getArithmeticInstrCost(E->getAltOpcode(), VecTy, CostKind);
      } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
        auto *MaskTy = getWidenedType(Builder.getInt1Ty(), VL.size());
        VecCost = TTIRef.getCmpSelInstrCost(
            E->getOpcode(), VecTy, MaskTy, CI0->getPredicate(), CostKind,
            {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
            VL0);
        VecCost += TTIRef.getCmpSelInstrCost(
            E->getOpcode(), VecTy, MaskTy,
            cast<CmpInst>(E->getAltOp())->getPredicate(), CostKind,
            {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
            E->getAltOp());
      } else {
        Type *SrcSclTy = E->getMainOp()->getOperand(0)->getType();
        auto *SrcTy = getWidenedType(SrcSclTy, VL.size());
        if (SrcSclTy->isIntegerTy() && ScalarTy->isIntegerTy()) {
          auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
          unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
          unsigned SrcBWSz =
              DL->getTypeSizeInBits(E->getMainOp()->getOperand(0)->getType());
          if (SrcIt != MinBWs.end()) {
            SrcBWSz = SrcIt->second.first;
            SrcSclTy = IntegerType::get(SrcSclTy->getContext(), SrcBWSz);
            SrcTy = getWidenedType(SrcSclTy, VL.size());
          }
          if (BWSz <= SrcBWSz) {
            if (BWSz < SrcBWSz)
              VecCost = TTIRef.getCastInstrCost(
                  Instruction::Trunc, VecTy, SrcTy, TTI::CastContextHint::None,
                  CostKind);
            LLVM_DEBUG({
              dbgs()
                  << "SLP: alternate extension, which should be truncated.\n";
              E->dump();
            });
            return VecCost;
          }
        }
        VecCost = TTIRef.getCastInstrCost(E->getOpcode(), VecTy, SrcTy,
                                          TTI::CastContextHint::None,
                                          CostKind);
        VecCost +=
            TTIRef.getCastInstrCost(E->getAltOpcode(), VecTy, SrcTy,
                                    TTI::CastContextHint::None, CostKind);
      }
      SmallVector<int> Mask;
      E->buildAltOpShuffleMask(
          [&](Instruction *I) {
            assert(E->getMatchingMainOpOrAltOp(I) &&
                   "Unexpected main/alternate opcode");
            return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
                                          *TLI);
          },
          Mask);
      // ...
      // Patterns like [fadd,fsub] can be combined into a single instruction
      // on some targets (e.g. x86), so the order of the alternate ops
      // matters.
      unsigned Opcode0 = E->getOpcode();
      unsigned Opcode1 = E->getAltOpcode();
      SmallBitVector OpcodeMask(getAltInstrMask(E->Scalars, Opcode0, Opcode1));
      // If this pattern is supported by the target, consider its cost.
      if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
        InstructionCost AltVecCost = TTIRef.getAltInstrCost(
            VecTy, Opcode0, Opcode1, OpcodeMask, CostKind);
        return AltVecCost < VecCost ? AltVecCost : VecCost;
      }
      return VecCost;
    };
    if (SLPReVec && !E->isAltShuffle())
      return GetCostDiff(
          GetScalarCost, [&](InstructionCost) -> InstructionCost {
            // If the group uses its mask in order, the shufflevector is
            // eliminated by instcombine and the cost is 0.
            assert(isa<ShuffleVectorInst>(VL.front()) &&
                   "Not supported shufflevector usage.");
            auto *SV = cast<ShuffleVectorInst>(VL.front());
            unsigned SVNumElements =
                cast<FixedVectorType>(SV->getOperand(0)->getType())
                    ->getNumElements();
            unsigned GroupSize = SVNumElements / SV->getShuffleMask().size();
            int NextIndex = 0;
            for (size_t I = 0, End = VL.size(); I != End; I += GroupSize) {
              assert(isa<ShuffleVectorInst>(VL[I]) &&
                     "Not supported shufflevector usage.");
              auto *SV = cast<ShuffleVectorInst>(VL[I]);
              int Index;
              [[maybe_unused]] bool IsExtractSubvectorMask =
                  SV->isExtractSubvectorMask(Index);
              assert(IsExtractSubvectorMask &&
                     "Not supported shufflevector usage.");
              if (NextIndex != Index) {
                /* ... */
              }
              NextIndex += SV->getShuffleMask().size();
            }
            return ::getShuffleCost(
                *TTI, TTI::SK_PermuteSingleSrc, VecTy,
                calculateShufflevectorMask(E->Scalars));
          });
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::Freeze:
    return CommonCost;
  default:
    llvm_unreachable("Unknown instruction");
  }
}
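/// A tiny tree (height 1 or 2) is fully vectorizable if its root is a real
/// vectorized node and the optional second node is a gather of constants,
/// splats, extractelements or loads.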
bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
  LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
                    << VectorizableTree.size()
                    << " is fully vectorizable .\n");

  auto &&AreVectorizableGathers = [this](const TreeEntry *TE,
                                         unsigned Limit) {
    SmallVector<int> Mask;
    return TE->isGather() &&
           !any_of(TE->Scalars,
                   [this](Value *V) { return EphValues.contains(V); }) &&
           (allConstant(TE->Scalars) || isSplat(TE->Scalars) ||
            TE->Scalars.size() < Limit ||
            (((TE->hasState() &&
               TE->getOpcode() == Instruction::ExtractElement) ||
              all_of(TE->Scalars, IsaPred<ExtractElementInst, UndefValue>)) &&
             isFixedVectorShuffle(TE->Scalars, Mask, AC)) ||
            (TE->hasState() && TE->getOpcode() == Instruction::Load &&
             !TE->isAltShuffle()) ||
            all_of(TE->Scalars, IsaPred<LoadInst, InsertElementInst>));
  };

  // We only handle trees of heights 1 and 2.
  if (VectorizableTree.size() == 1 &&
      (VectorizableTree[0]->State == TreeEntry::Vectorize ||
       VectorizableTree[0]->State == TreeEntry::StridedVectorize ||
       VectorizableTree[0]->State == TreeEntry::CompressVectorize ||
       (ForReduction &&
        AreVectorizableGathers(VectorizableTree[0].get(),
                               VectorizableTree[0]->Scalars.size()) &&
        VectorizableTree[0]->getVectorFactor() > 2)))
    return true;

  if (VectorizableTree.size() != 2)
    return false;

  // Handle splat and all-constants stores. Also try to vectorize tiny trees
  // with the second gather node if it matches the first.
  if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
      AreVectorizableGathers(VectorizableTree[1].get(),
                             VectorizableTree[0]->Scalars.size()))
    return true;

  if (VectorizableTree[0]->isGather() ||
      (VectorizableTree[1]->isGather() &&
       VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
       VectorizableTree[0]->State != TreeEntry::StridedVectorize &&
       VectorizableTree[0]->State != TreeEntry::CompressVectorize))
    return false;

  return true;
}
static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts,
                                       TargetTransformInfo *TTI,
                                       bool MustMatchOrInst) {
  // Look past the root to find a source value, following operand 0 of any
  // 'or' and peeking through optional shifts-left by a multiple of 8 bits.
  Value *ZextLoad = Root;
  const APInt *ShAmtC;
  bool FoundOr = false;
  while (!isa<ConstantExpr>(ZextLoad) &&
         (match(ZextLoad, m_Or(m_Value(), m_Value())) ||
          (match(ZextLoad, m_Shl(m_Value(), m_APInt(ShAmtC))) &&
           ShAmtC->urem(8) == 0))) {
    auto *BinOp = cast<BinaryOperator>(ZextLoad);
    ZextLoad = BinOp->getOperand(0);
    if (BinOp->getOpcode() == Instruction::Or)
      FoundOr = true;
  }
  // Check if the input is an extended load of the required or/shift
  // expression.
  Value *Load;
  if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root ||
      !match(ZextLoad, m_ZExt(m_Value(Load))) || !isa<LoadInst>(Load))
    return false;

  // Require that the total load bit width is a legal integer type.
  Type *SrcTy = Load->getType();
  unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts;
  if (!TTI->isTypeLegal(IntegerType::get(Root->getContext(), LoadBitWidth)))
    return false;

  // Everything matched - assume the whole sequence folds via load combining.
  LLVM_DEBUG(dbgs() << "SLP: Assume load combining for tree starting at "
                    << *(cast<Instruction>(Root)) << "\n");
  return true;
}

bool BoUpSLP::isLoadCombineReductionCandidate(RecurKind RdxKind) const {
  if (RdxKind != RecurKind::Or)
    return false;
  unsigned NumElts = VectorizableTree[0]->Scalars.size();
  Value *FirstReduced = VectorizableTree[0]->Scalars[0];
  return isLoadCombineCandidateImpl(FirstReduced, NumElts, TTI,
                                    /*MustMatchOrInst=*/false);
}

bool BoUpSLP::isLoadCombineCandidate(ArrayRef<Value *> Stores) const {
  // Peek through a final sequence of stores and check if all operations are
  // likely to be load-combined.
  unsigned NumElts = Stores.size();
  for (Value *Scalar : Stores) {
    Value *X;
    if (!match(Scalar, m_Store(m_Value(X), m_Value())) ||
        !isLoadCombineCandidateImpl(X, NumElts, TTI, /*MustMatchOrInst=*/true))
      return false;
  }
  return true;
}
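/// Heuristics that bail out of vectorization for trees which are "tiny and
/// not fully vectorizable": mostly-gather trees, PHI/insertelement-only
/// trees, and trees whose only profitable nodes are buildvectors.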
bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
  if (VectorizableTree.empty()) {
    assert(ExternalUses.empty() && "We shouldn't have any external users");
    return true;
  }

  // No need to vectorize a small tree of a vectorized node plus a gathered
  // buildvector.
  if (VectorizableTree.size() == 2 &&
      // ...
      VectorizableTree[1]->isGather() &&
      (VectorizableTree[1]->getVectorFactor() <= 2 ||
       !(isSplat(VectorizableTree[1]->Scalars) ||
         allConstant(VectorizableTree[1]->Scalars))))
    return true;

  // If the graph includes only PHI nodes and gathers, it is definitely not
  // profitable for the vectorization, skip it (if the cost threshold is
  // default).
  constexpr int Limit = 4;
  if (!ForReduction && !SLPCostThreshold.getNumOccurrences() &&
      !VectorizableTree.empty() &&
      all_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
        return (TE->isGather() &&
                (!TE->hasState() ||
                 TE->getOpcode() != Instruction::ExtractElement) &&
                count_if(TE->Scalars, IsaPred<ExtractElementInst>) <= Limit) ||
               (TE->hasState() && TE->getOpcode() == Instruction::PHI);
      }))
    return true;

  // Do not vectorize small trees of phis and insertelements, if all the
  // vector scalars are also gathered.
  if (VectorizableTree.size() <= Limit &&
      all_of(VectorizableTree,
             [&](const std::unique_ptr<TreeEntry> &TE) {
               return (TE->isGather() &&
                       (!TE->hasState() ||
                        TE->getOpcode() != Instruction::ExtractElement)) ||
                      (TE->hasState() &&
                       (TE->getOpcode() == Instruction::InsertElement ||
                        (TE->getOpcode() == Instruction::PHI &&
                         all_of(TE->Scalars, [&](Value *V) {
                           return isa<PoisonValue>(V) ||
                                  MustGather.contains(V);
                         }))));
             }) &&
      any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
        return TE->State == TreeEntry::Vectorize &&
               TE->getOpcode() == Instruction::PHI;
      }))
    return true;

  // Number of gather nodes and load/store nodes in the tree.
  unsigned NumGathers = 0;
  constexpr int LimitTreeSize = 36;
  SmallVector<const TreeEntry *> StoreLoadNodes;
  if (!VectorizableTree.empty() &&
      all_of(VectorizableTree,
             [&](const std::unique_ptr<TreeEntry> &TE) {
               if (!TE->isGather() && TE->hasState() &&
                   (TE->getOpcode() == Instruction::Load ||
                    TE->getOpcode() == Instruction::Store)) {
                 StoreLoadNodes.push_back(TE.get());
                 return true;
               }
               if (TE->isGather())
                 ++NumGathers;
               return TE->State == TreeEntry::SplitVectorize ||
                      (TE->Idx == 0 && TE->Scalars.size() == 2 &&
                       TE->hasState() &&
                       TE->getOpcode() == Instruction::ICmp &&
                       VectorizableTree.size() > LimitTreeSize) ||
                      (TE->hasState() &&
                       (TE->getOpcode() == Instruction::PHI ||
                        (TE->hasCopyableElements() &&
                         /* ... */
                             TE->Scalars.size() / 2) ||
                        ((!TE->ReuseShuffleIndices.empty() ||
                          !TE->ReorderIndices.empty() ||
                          TE->isAltShuffle()) &&
                         TE->Scalars.size() == 2)));
             }) &&
      (StoreLoadNodes.empty() ||
       (VectorizableTree.size() > LimitTreeSize * StoreLoadNodes.size() &&
        (NumGathers > 0 ||
         none_of(StoreLoadNodes, [&](const TreeEntry *TE) {
           return TE->getOpcode() == Instruction::Store ||
                  all_of(TE->Scalars, [&](Value *V) {
                    return !isa<LoadInst>(V) ||
                           areAllUsersVectorized(cast<Instruction>(V));
                  });
         })))))
    return true;

  // ...
  if (VectorizableTree.front()->State == TreeEntry::SplitVectorize &&
      VectorizableTree.size() >= Limit &&
      count_if(ArrayRef(VectorizableTree).drop_front(),
               [&](const std::unique_ptr<TreeEntry> &TE) {
                 return !TE->isGather() && TE->UserTreeIndex.UserTE &&
                        TE->UserTreeIndex.UserTE->Idx == 0;
               }) == 2)
    return true;

  // ...
  if (VectorizableTree.size() > 2 &&
      VectorizableTree.front()->State == TreeEntry::Vectorize &&
      VectorizableTree.front()->getOpcode() == Instruction::InsertElement &&
      VectorizableTree[1]->State == TreeEntry::Vectorize &&
      VectorizableTree[1]->getOpcode() == Instruction::PHI &&
      all_of(ArrayRef(VectorizableTree).drop_front(2),
             [&](const std::unique_ptr<TreeEntry> &TE) {
               return TE->isGather();
             }))
    return true;

  // We can vectorize the tree if its size is greater than or equal to the
  // minimum size specified by the MinTreeSize command line option.
  if (VectorizableTree.size() >= MinTreeSize)
    return false;

  // If we have a tiny tree (a tree whose size is less than MinTreeSize), we
  // can vectorize it if we can prove it fully vectorizable.
  if (isFullyVectorizableTinyTree(ForReduction))
    return false;

  // Check if any of the gather nodes forms an insertelement buildvector
  // somewhere.
  bool IsAllowedSingleBVNode =
      VectorizableTree.size() > 1 ||
      (VectorizableTree.size() == 1 && VectorizableTree.front()->hasState() &&
       !VectorizableTree.front()->isAltShuffle() &&
       VectorizableTree.front()->getOpcode() != Instruction::PHI &&
       VectorizableTree.front()->getOpcode() != Instruction::GetElementPtr &&
       allSameBlock(VectorizableTree.front()->Scalars));
  if (any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
        return TE->isGather() && all_of(TE->Scalars, [&](Value *V) {
                 return isa<ExtractElementInst, Constant>(V) ||
                        (IsAllowedSingleBVNode &&
                         !V->hasNUsesOrMore(UsesLimit) &&
                         any_of(V->users(), IsaPred<InsertElementInst>));
               });
      }))
    return false;

  if (VectorizableTree.back()->isGather() &&
      VectorizableTree.back()->hasState() &&
      VectorizableTree.back()->isAltShuffle() &&
      VectorizableTree.back()->getVectorFactor() > 2 &&
      allSameBlock(VectorizableTree.back()->Scalars) &&
      !VectorizableTree.back()->Scalars.front()->getType()->isVectorTy() &&
      TTI->getScalarizationOverhead(
          getWidenedType(VectorizableTree.back()->Scalars.front()->getType(),
                         VectorizableTree.back()->getVectorFactor()),
          APInt::getAllOnes(VectorizableTree.back()->getVectorFactor()),
          /*Insert=*/true, /*Extract=*/false,
          TTI::TCK_RecipThroughput) > -SLPCostThreshold)
    return true;

  // Small graph with non-power-of-2 root and a single out-of-block gathered
  // load is still worth trying.
  constexpr unsigned SmallTree = 3;
  if (VectorizableTree.front()->isNonPowOf2Vec() &&
      getCanonicalGraphSize() <= SmallTree &&
      count_if(ArrayRef(VectorizableTree).drop_front(getCanonicalGraphSize()),
               [](const std::unique_ptr<TreeEntry> &TE) {
                 return TE->isGather() && TE->hasState() &&
                        TE->getOpcode() == Instruction::Load &&
                        !allSameBlock(TE->Scalars);
               }) == 1)
    return true;
  // Check the remaining (load) entries for profitability.
  for (unsigned Idx : seq<unsigned>(VectorizableTree.size())) {
    TreeEntry &E = *VectorizableTree[Idx];
    if (E.State == TreeEntry::SplitVectorize)
      continue;
    if ((E.hasState() && E.getOpcode() != Instruction::Load) /* ... */)
      continue;
    // ...
  }
  // ...
}
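// Spill-cost estimation: walk the tree from the root, track which vectorized
// entries are live across calls that are neither free intrinsics nor
// vectorized, and charge getCostOfKeepingLiveOverCall for them.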
std::optional<InstructionCost> BoUpSLP::getSpillCost() {
  InstructionCost Cost = 0;
  const TreeEntry *Root = VectorizableTree.front().get();
  if (Root->isGather())
    return std::nullopt;

  SmallDenseMap<const TreeEntry *, SmallVector<const TreeEntry *>>
      EntriesToOperands;
  SmallDenseMap<const TreeEntry *, Instruction *> EntriesToLastInstruction;
  SmallPtrSet<const Instruction *, 8> LastInstructions;
  for (const auto &TEPtr : VectorizableTree) {
    if (!TEPtr->isGather()) {
      Instruction *LastInst = &getLastInstructionInBundle(TEPtr.get());
      EntriesToLastInstruction.try_emplace(TEPtr.get(), LastInst);
      LastInstructions.insert(LastInst);
    }
    if (TEPtr->UserTreeIndex)
      EntriesToOperands[TEPtr->UserTreeIndex.UserTE].push_back(TEPtr.get());
  }

  auto NoCallIntrinsic = [this](const Instruction *I) {
    const auto *II = dyn_cast<IntrinsicInst>(I);
    if (!II)
      return false;
    if (II->isAssumeLikeIntrinsic())
      return true;
    // Compare the cost of the intrinsic against an actual call.
    // ...
    return IntrCost < CallCost;
  };

  // Maps the last instruction of an entry to the last checked instruction in
  // its block and the result of the check: 1 if there are no calls in
  // between, 0 otherwise.
  DenseMap<const Instruction *, PointerIntPair<const Instruction *, 1>>
      CheckedInstructions;
  unsigned Budget = 0;
  const unsigned BudgetLimit =
      ScheduleRegionSizeBudget / VectorizableTree.size();
  auto CheckForNonVecCallsInSameBlock = [&](Instruction *First,
                                            const Instruction *Last) {
    assert(First->getParent() == Last->getParent() &&
           "Expected instructions in same block.");
    if (auto It = CheckedInstructions.find(Last);
        It != CheckedInstructions.end()) {
      const Instruction *Checked = It->second.getPointer();
      if (Checked == First || Checked->comesBefore(First))
        return It->second.getInt() != 0;
      // ...
    }
    SmallVector<const Instruction *> LastInstsInRange(1, Last);
    auto InstIt = ++First->getIterator().getReverse(),
         PrevInstIt = Last->getIterator().getReverse();
    while (InstIt != PrevInstIt && Budget <= BudgetLimit) {
      // Vectorized calls, represented as vector intrinsics, do not impact
      // spill cost; debug information does not either.
      if (const auto *CB = dyn_cast<CallBase>(&*PrevInstIt);
          CB && !NoCallIntrinsic(CB) /* ... */) {
        for (const Instruction *LastInst : LastInstsInRange)
          CheckedInstructions.try_emplace(LastInst, &*PrevInstIt, 0);
        return false;
      }
      if (LastInstructions.contains(&*PrevInstIt))
        LastInstsInRange.push_back(&*PrevInstIt);
      ++PrevInstIt;
      ++Budget;
    }
    for (const Instruction *LastInst : LastInstsInRange)
      CheckedInstructions.try_emplace(
          LastInst, PrevInstIt == InstIt ? First : &*PrevInstIt,
          Budget <= BudgetLimit ? 1 : 0);
    return Budget <= BudgetLimit;
  };
  auto AddCosts = [&](const TreeEntry *Op) {
    Type *ScalarTy = Op->Scalars.front()->getType();
    auto It = MinBWs.find(Op);
    if (It != MinBWs.end())
      ScalarTy = IntegerType::get(ScalarTy->getContext(), It->second.first);
    auto *VecTy = getWidenedType(ScalarTy, Op->getVectorFactor());
    Cost += TTI->getCostOfKeepingLiveOverCall(VecTy);
    // ...
    Cost -= Op->Scalars.size() * TTI->getCostOfKeepingLiveOverCall(ScalarTy);
  };
  // Memoized predecessor walks between the entry block and the operand block.
  SmallDenseMap<std::pair<const BasicBlock *, const BasicBlock *>, bool>
      ParentOpParentToPreds;
  auto CheckPredecessors = [&](BasicBlock *Root, BasicBlock *Pred,
                               BasicBlock *OpParent) {
    auto Key = std::make_pair(Root, OpParent);
    if (auto It = ParentOpParentToPreds.find(Key);
        It != ParentOpParentToPreds.end())
      return It->second;
    // ...
    SmallDenseSet<std::pair<const BasicBlock *, const BasicBlock *>>
        ParentsPairsToAdd;
    // ...
    for (const auto &KeyPair : ParentsPairsToAdd) {
      assert(!ParentOpParentToPreds.contains(KeyPair) &&
             "Should not have been added before.");
      // ...
    }
    SmallPtrSet<const BasicBlock *, 8> Visited;
    SmallVector<BasicBlock *> Worklist(1, Pred ? Pred : Root);
    while (!Worklist.empty()) {
      BasicBlock *BB = Worklist.pop_back_val();
      if (BB == OpParent || !Visited.insert(BB).second)
        continue;
      auto Pair = std::make_pair(BB, OpParent);
      if (auto It = ParentOpParentToPreds.find(Pair);
          It != ParentOpParentToPreds.end()) {
        // ...
        ParentsPairsToAdd.insert(Pair);
        continue;
      }
      if (Budget > BudgetLimit)
        return false;
      // ...
    }
    return true;
  };
  SmallVector<const TreeEntry *> LiveEntries(1, Root);
  while (!LiveEntries.empty()) {
    const TreeEntry *Entry = LiveEntries.pop_back_val();
    SmallVector<const TreeEntry *> Operands = EntriesToOperands.lookup(Entry);
    if (Operands.empty())
      continue;
    Instruction *LastInst = EntriesToLastInstruction.at(Entry);
    BasicBlock *Parent = LastInst->getParent();
    for (const TreeEntry *Op : Operands) {
      if (!Op->isGather())
        LiveEntries.push_back(Op);
      if (Entry->State == TreeEntry::SplitVectorize ||
          (Entry->getOpcode() != Instruction::PHI && Op->isGather()) ||
          /* ... */ false)
        continue;
      BasicBlock *Pred = nullptr;
      if (auto *Phi = dyn_cast<PHINode>(Entry->getMainOp()))
        Pred = Phi->getIncomingBlock(Op->UserTreeIndex.EdgeIdx);
      BasicBlock *OpParent;
      Instruction *OpLastInst;
      if (Op->isGather()) {
        assert(Entry->getOpcode() == Instruction::PHI &&
               "Expected phi node only.");
        OpParent = cast<PHINode>(Entry->getMainOp())
                       ->getIncomingBlock(Op->UserTreeIndex.EdgeIdx);
        OpLastInst = OpParent->getTerminator();
        for (Value *V : Op->Scalars) {
          auto *Inst = dyn_cast<Instruction>(V);
          if (!Inst)
            continue;
          if (isVectorized(V)) {
            OpParent = Inst->getParent();
            OpLastInst = Inst;
            break;
          }
        }
      } else {
        OpLastInst = EntriesToLastInstruction.at(Op);
        OpParent = OpLastInst->getParent();
      }
      // Check the call instructions within the same basic block.
      if (OpParent == Parent) {
        if (Entry->getOpcode() == Instruction::PHI) {
          if (!CheckForNonVecCallsInSameBlock(LastInst, OpLastInst))
            AddCosts(Op);
          continue;
        }
        if (!CheckForNonVecCallsInSameBlock(OpLastInst, LastInst))
          AddCosts(Op);
        continue;
      }
      // Check for calls in between blocks:
      // 1. from the head of the entry's block to the entry's instruction;
      if (Entry->getOpcode() != Instruction::PHI &&
          !CheckForNonVecCallsInSameBlock(
              &*LastInst->getParent()->getFirstNonPHIOrDbgOrAlloca(),
              LastInst)) {
        AddCosts(Op);
        continue;
      }
      // 2. from the operand's instruction to the end of its block;
      if (!CheckForNonVecCallsInSameBlock(OpLastInst,
                                          OpParent->getTerminator())) {
        AddCosts(Op);
        continue;
      }
      // 3. through the predecessors between the two blocks.
      if (!CheckPredecessors(Parent, Pred, OpParent)) {
        AddCosts(Op);
        continue;
      }
    }
  }

  return Cost;
}

/// Checks if the \p IE1 instruction is followed by \p IE2 instruction in the
/// buildvector sequence.
static bool isFirstInsertElement(const InsertElementInst *IE1,
                                 const InsertElementInst *IE2) {
  // ...
  const auto *I1 = IE1;
  const auto *I2 = IE2;
  const InsertElementInst *PrevI1;
  const InsertElementInst *PrevI2;
  // ...
  do {
    // ...
    PrevI1 = I1;
    PrevI2 = I2;
    if (I1 && (I1 == IE1 || I1->hasOneUse()) &&
        getElementIndex(I1).value_or(Idx2) != Idx2)
      I1 = dyn_cast<InsertElementInst>(I1->getOperand(0));
    if (I2 && ((I2 == IE2 || I2->hasOneUse())) &&
        getElementIndex(I2).value_or(Idx1) != Idx1)
      I2 = dyn_cast<InsertElementInst>(I2->getOperand(0));
  } while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2));
  llvm_unreachable("Two different buildvectors not expected.");
}
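// Helpers used when estimating (and later emitting) the final shuffles for
// external insertelement users of the vectorized values.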
namespace {
struct ValueSelect {
  template <typename U>
  static std::enable_if_t<std::is_same_v<Value *, U>, Value *> get(Value *V) {
    return V;
  }
  template <typename U>
  static std::enable_if_t<!std::is_same_v<Value *, U>, U> get(Value *) {
    return U();
  }
};
} // namespace

/// Does the analysis of the provided shuffle masks and performs the requested
/// actions on the vectors with the given shuffle masks.
template <typename T>
static T *performExtractsShuffleAction(
    MutableArrayRef<std::pair<T *, SmallVector<int>>> ShuffleMask, Value *Base,
    function_ref<unsigned(T *)> GetVF,
    function_ref<std::pair<T *, bool>(T *, ArrayRef<int>, bool)> ResizeAction,
    function_ref<T *(ArrayRef<int>, ArrayRef<T *>)> Action) {
  assert(!ShuffleMask.empty() && "Empty list of shuffles for inserts.");
  SmallVector<int> Mask(ShuffleMask.begin()->second);
  auto VMIt = std::next(ShuffleMask.begin());
  T *Prev = nullptr;
  SmallBitVector UseMask =
      buildUseMask(Mask.size(), Mask, UseMask::UndefsAsMask);
  SmallBitVector IsBaseUndef = isUndefVector(Base, UseMask);
  if (!IsBaseUndef.all()) {
    // Base is not undef, need to combine it with the next subvectors.
    std::pair<T *, bool> Res =
        ResizeAction(ShuffleMask.begin()->first, Mask, /*ForSingleMask=*/false);
    for (unsigned Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) {
      if (Mask[Idx] == PoisonMaskElem)
        Mask[Idx] = Idx;
      else
        Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF;
    }
    [[maybe_unused]] auto *V = ValueSelect::get<T *>(Base);
    assert((!V || GetVF(V) == Mask.size()) &&
           "Expected base vector of VF number of elements.");
    Prev = Action(Mask, {nullptr, Res.first});
  } else if (ShuffleMask.size() == 1) {
    // Base is undef and only 1 vector is shuffled - perform the action only
    // for that single vector, if the mask is not the identity mask.
    std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask,
                                            /*ForSingleMask=*/true);
    if (Res.second)
      Prev = Res.first;
    else
      Prev = Action(Mask, {ShuffleMask.begin()->first});
  } else {
    // Base is undef and at least 2 input vectors are shuffled - do a 2-vector
    // shuffle step by step.
    unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);
    unsigned Vec2VF = GetVF(VMIt->first);
    if (Vec1VF == Vec2VF) {
      // No need to resize the input vectors.
      ArrayRef<int> SecMask = VMIt->second;
      for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
        if (SecMask[I] != PoisonMaskElem)
          Mask[I] = SecMask[I] + Vec1VF;
      }
      Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
    } else {
      // Vectors of different sizes - resize and reshuffle.
      std::pair<T *, bool> Res1 = ResizeAction(ShuffleMask.begin()->first,
                                               Mask, /*ForSingleMask=*/false);
      std::pair<T *, bool> Res2 =
          ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
      ArrayRef<int> SecMask = VMIt->second;
      for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
        // ...
        if (SecMask[I] != PoisonMaskElem)
          Mask[I] = (Res2.second ? I : SecMask[I]) + VF;
      }
      Prev = Action(Mask, {Res1.first, Res2.first});
    }
    VMIt = std::next(VMIt);
  }
  [[maybe_unused]] bool IsBaseNotUndef = !IsBaseUndef.all();
  // Perform the requested actions for the remaining masks/vectors.
  for (auto E = ShuffleMask.end(); VMIt != E; ++VMIt) {
    std::pair<T *, bool> Res =
        ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
    ArrayRef<int> SecMask = VMIt->second;
    for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
      if (SecMask[I] != PoisonMaskElem) {
        assert((Mask[I] == PoisonMaskElem || IsBaseNotUndef) &&
               "Multiple uses of scalars.");
        Mask[I] = (Res.second ? I : SecMask[I]) + VF;
      } else if (Mask[I] != PoisonMaskElem) {
        Mask[I] = I;
      }
    }
    Prev = Action(Mask, {Prev, Res.first});
  }
  return Prev;
}

namespace {
/// Data type for handling buildvector sequences with the reused scalars from
/// other tree entries.
template <typename T> struct ShuffledInsertData {
  /// List of insertelements to be replaced by shuffles.
  SmallVector<InsertElementInst *> InsertElements;
  /// The parent vectors and shuffle mask for the given list of inserts.
  MapVector<T, SmallVector<int>> ValueMasks;
};
} // namespace
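/// getTreeCost sums the per-node costs, the cost of extracting externally
/// used scalars (or keeping them as scalars when that is cheaper), the final
/// shuffles for external buildvector users, and the spill cost.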
InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
                                     InstructionCost ReductionCost) {
  InstructionCost Cost = ReductionCost;
  LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
                    << VectorizableTree.size() << ".\n");

  SmallPtrSet<Value *, 4> CheckedExtracts;
  for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
    TreeEntry &TE = *VectorizableTree[I];
    // No need to count the cost for combined entries, they are combined and
    // just skipped.
    if (TE.State == TreeEntry::CombinedVectorize) {
      LLVM_DEBUG(
          dbgs() << "SLP: Skipping cost for combined node that starts with "
                 << *TE.Scalars[0] << ".\n";
          TE.dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
      continue;
    }
    if (TE.hasState() &&
        (TE.isGather() || TE.State == TreeEntry::SplitVectorize)) {
      if (const TreeEntry *E =
              getSameValuesTreeEntry(TE.getMainOp(), TE.Scalars);
          E && E->getVectorFactor() == TE.getVectorFactor()) {
        // ...
        LLVM_DEBUG(dbgs() << "SLP: Current total cost = " << Cost << "\n");
        continue;
      }
    }
    assert((!TE.isGather() || TE.Idx == 0 || TE.UserTreeIndex) &&
           "Expected gather nodes with users only.");

    InstructionCost C = getEntryCost(&TE, VectorizedVals, CheckedExtracts);
    Cost += C;
    LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle.\n"
                      << "SLP: Current total cost = " << Cost << "\n");
  }
  if (Cost >= -SLPCostThreshold &&
      none_of(ExternalUses, [](const ExternalUser &EU) {
        return isa_and_nonnull<InsertElementInst>(EU.User);
      }))
    return Cost;

  InstructionCost ExtractCost = 0;
  SmallPtrSet<Value *, 4> ExtractCostCalculated;
  SmallVector<ShuffledInsertData<const TreeEntry *>> ShuffledInserts;
  SmallVector<APInt> DemandedElts;
  SmallDenseSet<Value *, 4> UsedInserts;
  DenseSet<std::pair<const TreeEntry *, Type *>> VectorCasts;
  std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
  DenseMap<const TreeEntry *, DenseSet<Value *>> ExtractsCount;
  SmallPtrSet<Value *, 4> ScalarOpsFromCasts;
  // Keep track of {Scalar, Index, User} tuples: on some targets this lets the
  // extract be folded into the user instruction for free.
  SmallVector<std::tuple<Value *, User *, int>, 4> ScalarUserAndIdx;
  for (ExternalUser &EU : ExternalUses) {
    ScalarUserAndIdx.emplace_back(EU.Scalar, EU.User, EU.Lane);
  }
  SmallDenseSet<std::pair<Value *, User *>, 8> CheckedScalarUser;
  for (ExternalUser &EU : ExternalUses) {
    LLVM_DEBUG(dbgs() << "SLP: Computing cost for external use of TreeEntry "
                      << EU.E.Idx << " in lane " << EU.Lane << "\n");
    LLVM_DEBUG(if (EU.User) dbgs() << " User: " << *EU.User << "\n";
               else dbgs() << " User: nullptr\n");
    LLVM_DEBUG(dbgs() << " Use: " << EU.Scalar->getNameOrAsOperand() << "\n");

    // Uses by ephemeral values are free (because the ephemeral value will be
    // removed prior to code generation, and so will the extraction).
    if (EphValues.count(EU.User))
      continue;

    // Check if the scalar/user pair was already counted.
    if (!CheckedScalarUser.insert(std::make_pair(EU.Scalar, EU.User)).second ||
        (EU.User &&
         CheckedScalarUser.contains(std::make_pair(EU.Scalar, nullptr))))
      continue;

    // No extract cost for users in unreachable blocks or EH pads.
    if (auto *UserInst = dyn_cast_or_null<Instruction>(EU.User)) {
      BasicBlock *UserParent = UserInst->getParent();
      if (!DT->isReachableFromEntry(UserParent) || UserParent->isEHPad()
          /* ... */)
        continue;
    }

    // Vector "scalars" and already counted scalars are free.
    if (isa<FixedVectorType>(EU.Scalar->getType()) ||
        !ExtractCostCalculated.insert(EU.Scalar).second)
      continue;

    // If found user is an insertelement, do not calculate the extract cost
    // but try to detect it as a buildvector sequence to be shuffled.
    if (auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User)) {
      if (!UsedInserts.insert(VU).second)
        continue;
      std::optional<unsigned> InsertIdx = getElementIndex(VU);
      if (InsertIdx) {
        const TreeEntry *ScalarTE = &EU.E;
        auto *It = find_if(
            ShuffledInserts,
            [this, VU](const ShuffledInsertData<const TreeEntry *> &Data) {
              return areTwoInsertFromSameBuildVector(
                  VU, Data.InsertElements.front(),
                  [](InsertElementInst *II) -> Value * {
                    Value *Op0 = II->getOperand(0);
                    // ...
                    return Op0;
                  });
            });
        int VecId = -1;
        auto *FTy = cast<FixedVectorType>(VU->getType());
        if (It == ShuffledInserts.end()) {
          auto &Data = ShuffledInserts.emplace_back();
          Data.InsertElements.emplace_back(VU);
          DemandedElts.push_back(APInt::getZero(FTy->getNumElements()));
          VecId = ShuffledInserts.size() - 1;
          auto It = MinBWs.find(ScalarTE);
          if (It != MinBWs.end() &&
              VectorCasts
                  .insert(std::make_pair(ScalarTE, FTy->getElementType()))
                  .second) {
            unsigned BWSz = It->second.first;
            unsigned DstBWSz = DL->getTypeSizeInBits(FTy->getElementType());
            unsigned VecOpcode;
            if (DstBWSz < BWSz)
              VecOpcode = Instruction::Trunc;
            else
              VecOpcode =
                  It->second.second ? Instruction::SExt : Instruction::ZExt;
            InstructionCost C = TTI->getCastInstrCost(
                VecOpcode, FTy,
                getWidenedType(IntegerType::get(FTy->getContext(), BWSz),
                               FTy->getNumElements()),
                TTI::CastContextHint::None, CostKind);
            LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
                              << " for extending externally used vector with "
                                 "non-equal minimum bitwidth.\n");
            Cost += C;
          }
        } else {
          if (isFirstInsertElement(VU, It->InsertElements.front()))
            It->InsertElements.front() = VU;
          VecId = std::distance(ShuffledInserts.begin(), It);
        }
        int InIdx = *InsertIdx;
        SmallVectorImpl<int> &Mask =
            ShuffledInserts[VecId].ValueMasks[ScalarTE];
        if (Mask.empty())
          Mask.assign(FTy->getNumElements(), PoisonMaskElem);
        Mask[InIdx] = EU.Lane;
        DemandedElts[VecId].setBit(InIdx);
        continue;
      }
    }

    // If the tree is rewritten in a smaller type, account for the extract and
    // the extension of the extracted value back to the original type.
    InstructionCost ExtraCost = TTI::TCC_Free;
    auto *ScalarTy = EU.Scalar->getType();
    const unsigned BundleWidth = EU.E.getVectorFactor();
    assert(EU.Lane < BundleWidth && "Extracted lane out of bounds.");
    auto *VecTy = getWidenedType(ScalarTy, BundleWidth);
    const TreeEntry *Entry = &EU.E;
    auto It = MinBWs.find(Entry);
    if (It != MinBWs.end()) {
      auto *MinTy = IntegerType::get(F->getContext(), It->second.first);
      unsigned Extend = !It->second.second ? Instruction::ZExt
                                           : Instruction::SExt;
      VecTy = getWidenedType(MinTy, BundleWidth);
      ExtraCost =
          TTI->getExtractWithExtendCost(Extend, ScalarTy, VecTy, EU.Lane);
      LLVM_DEBUG(dbgs() << " ExtractExtend cost: " << ExtraCost << "\n");
    } else {
      ExtraCost = TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy,
                                          CostKind, EU.Lane, EU.Scalar,
                                          ScalarUserAndIdx);
      LLVM_DEBUG(dbgs() << " ExtractElement cost for " << *ScalarTy << " from "
                        << *VecTy << ": " << ExtraCost << "\n");
    }
    // Leave the scalar instructions as is if they are cheaper than extracts.
    if (Entry->Idx != 0 || Entry->getOpcode() == Instruction::GetElementPtr ||
        Entry->getOpcode() == Instruction::Load) {
      // Checks if the user of the external scalar is a phi in a loop body.
      auto IsPhiInLoop = [&](const ExternalUser &U) {
        if (auto *Phi = dyn_cast_if_present<PHINode>(U.User)) {
          auto *I = cast<Instruction>(U.Scalar);
          const Loop *L = LI->getLoopFor(Phi->getParent());
          return L && (Phi->getParent() == I->getParent() ||
                       L == LI->getLoopFor(I->getParent()));
        }
        return false;
      };
      if (!ValueToExtUses) {
        ValueToExtUses.emplace();
        for (const auto &P : enumerate(ExternalUses)) {
          // Ignore phis in loops.
          if (IsPhiInLoop(P.value()))
            continue;
          ValueToExtUses->try_emplace(P.value().Scalar, P.index());
        }
      }
      // Can use the original instruction if no operands are vectorized or
      // they are already marked as externally used.
      auto *Inst = cast<Instruction>(EU.Scalar);
      InstructionCost ScalarCost = TTI->getInstructionCost(Inst, CostKind);
      auto OperandIsScalar = [&](Value *V) {
        if (!isVectorized(V)) {
          // Some extractelements might not be vectorized, but transformed
          // into a shuffle and removed from the function; consider it here.
          if (auto *EE = dyn_cast<ExtractElementInst>(V))
            return !EE->hasOneUse() || !MustGather.contains(EE);
          return true;
        }
        return ValueToExtUses->contains(V);
      };
      bool CanBeUsedAsScalar = all_of(Inst->operands(), OperandIsScalar);
      bool CanBeUsedAsScalarCast = false;
      if (auto *CI = dyn_cast<CastInst>(Inst); CI && CanBeUsedAsScalar) {
        if (auto *Op = dyn_cast<Instruction>(CI->getOperand(0));
            Op && all_of(Op->operands(), OperandIsScalar)) {
          InstructionCost OpCost =
              (isVectorized(Op) && !ValueToExtUses->contains(Op))
                  ? TTI->getInstructionCost(Op, CostKind)
                  : 0;
          if (ScalarCost + OpCost <= ExtraCost) {
            CanBeUsedAsScalar = CanBeUsedAsScalarCast = true;
            ScalarCost += OpCost;
          }
        }
      }
      if (CanBeUsedAsScalar) {
        bool KeepScalar = ScalarCost <= ExtraCost;
        // Prefer extracting the scalar for the root PHI-based tree, if it is
        // profitable.
        bool IsProfitablePHIUser =
            (KeepScalar || (ScalarCost - ExtraCost <= TTI::TCC_Basic &&
                            VectorizableTree.front()->Scalars.size() > 2)) &&
            VectorizableTree.front()->hasState() &&
            VectorizableTree.front()->getOpcode() == Instruction::PHI &&
            !Inst->hasNUsesOrMore(UsesLimit) &&
            none_of(Inst->users(),
                    [&](User *U) {
                      auto *PHIUser = dyn_cast<PHINode>(U);
                      return (!PHIUser ||
                              PHIUser->getParent() !=
                                  cast<Instruction>(
                                      VectorizableTree.front()->getMainOp())
                                      ->getParent()) &&
                             !isVectorized(U);
                    }) &&
            count_if(Entry->Scalars, [&](Value *V) {
              return ValueToExtUses->contains(V);
            }) <= 2;
        if (IsProfitablePHIUser) {
          KeepScalar = true;
        } else if (KeepScalar && ScalarCost != TTI::TCC_Free &&
                   ExtraCost - ScalarCost <= TTI::TCC_Basic &&
                   (!GatheredLoadsEntriesFirst.has_value() ||
                    Entry->Idx < *GatheredLoadsEntriesFirst)) {
          unsigned ScalarUsesCount = count_if(Entry->Scalars, [&](Value *V) {
            return ValueToExtUses->contains(V);
          });
          auto It = ExtractsCount.find(Entry);
          if (It != ExtractsCount.end()) {
            assert(ScalarUsesCount >= It->getSecond().size() &&
                   "Expected total number of external uses not less than "
                   "number of scalar uses.");
            ScalarUsesCount -= It->getSecond().size();
          }
          // Keep the original scalar if the number of externally used
          // instructions in the same entry is not a power of 2; it may help
          // to do some extra vectorization.
          KeepScalar =
              ScalarUsesCount <= 1 || !has_single_bit(ScalarUsesCount);
        }
        if (KeepScalar) {
          ExternalUsesAsOriginalScalar.insert(EU.Scalar);
          for (Value *V : Inst->operands()) {
            auto It = ValueToExtUses->find(V);
            if (It != ValueToExtUses->end()) {
              // Replace all uses to avoid compiler crash.
              ExternalUses[It->second].User = nullptr;
            }
          }
          ExtraCost = ScalarCost;
          if (!IsPhiInLoop(EU))
            ExtractsCount[Entry].insert(Inst);
          if (CanBeUsedAsScalarCast) {
            ScalarOpsFromCasts.insert(Inst->getOperand(0));
            // Update the users of the operands of the cast operand to avoid
            // compiler crash.
            if (auto *IOp = dyn_cast<Instruction>(Inst->getOperand(0))) {
              for (Value *V : IOp->operands()) {
                auto It = ValueToExtUses->find(V);
                if (It != ValueToExtUses->end()) {
                  // ...
                  ExternalUses[It->second].User = nullptr;
                }
              }
            }
          }
        }
      }
    }
    ExtractCost += ExtraCost;
  }
  // Insert externally used values from casts as original scalars.
  for (Value *V : ScalarOpsFromCasts) {
    ExternalUsesAsOriginalScalar.insert(V);
    if (ArrayRef<TreeEntry *> TEs = getTreeEntries(V); !TEs.empty())
      ExternalUses.emplace_back(V, nullptr, *TEs.front(),
                                TEs.front()->findLaneForValue(V));
  }
  // Add reduced value cost, if resized.
  if (!VectorizedVals.empty()) {
    const TreeEntry &Root = *VectorizableTree.front();
    auto BWIt = MinBWs.find(&Root);
    if (BWIt != MinBWs.end()) {
      Type *DstTy = Root.Scalars.front()->getType();
      unsigned OriginalSz = DL->getTypeSizeInBits(DstTy->getScalarType());
      unsigned SrcSz =
          ReductionBitWidth == 0 ? BWIt->second.first : ReductionBitWidth;
      if (OriginalSz != SrcSz) {
        unsigned Opcode = Instruction::Trunc;
        if (OriginalSz > SrcSz)
          Opcode =
              BWIt->second.second ? Instruction::SExt : Instruction::ZExt;
        Type *SrcTy = Builder.getIntNTy(SrcSz);
        // ...
        Cost += TTI->getCastInstrCost(Opcode, DstTy, SrcTy,
                                      TTI::CastContextHint::None, CostKind);
      }
    }
  }

  Cost += ExtractCost;
  auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE, ArrayRef<int> Mask,
                                    bool ForSingleMask) {
    InstructionCost C = 0;
    unsigned VF = Mask.size();
    unsigned VecVF = TE->getVectorFactor();
    bool HasLargeIndex =
        any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); });
    if ((VF != VecVF && HasLargeIndex) ||
        !ShuffleVectorInst::isIdentityMask(Mask, VF)) {
      if (HasLargeIndex) {
        SmallVector<int> OrigMask(VecVF, PoisonMaskElem);
        std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)),
                  OrigMask.begin());
        C = ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
                             getWidenedType(TE->getMainOp()->getType(), VecVF),
                             OrigMask);
        LLVM_DEBUG(
            dbgs() << "SLP: Adding cost " << C
                   << " for final shuffle of insertelement external users.\n";
            TE->dump();
            dbgs() << "SLP: Current total cost = " << Cost << "\n");
        Cost += C;
        return std::make_pair(TE, true);
      }
      // ...
    }
    if (!ForSingleMask) {
      SmallVector<int> ResizeMask(VF, PoisonMaskElem);
      for (unsigned I = 0; I < VF; ++I) {
        if (Mask[I] != PoisonMaskElem)
          ResizeMask[Mask[I]] = Mask[I];
      }
      // ...
      LLVM_DEBUG(
          dbgs() << "SLP: Adding cost " << C
                 << " for final shuffle of insertelement external users.\n";
          TE->dump();
          dbgs() << "SLP: Current total cost = " << Cost << "\n");
    }
    return std::make_pair(TE, false);
  };
  // Calculate the cost of the reshuffled vectors, if any.
  for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
    Value *Base = ShuffledInserts[I].InsertElements.front()->getOperand(0);
    auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
    unsigned VF = 0;
    auto EstimateShufflesCost = [&](ArrayRef<int> Mask,
                                    ArrayRef<const TreeEntry *> TEs) {
      assert((TEs.size() == 1 || TEs.size() == 2) &&
             "Expected exactly 1 or 2 tree entries.");
      if (TEs.size() == 1) {
        if (VF == 0)
          VF = TEs.front()->getVectorFactor();
        auto *FTy =
            getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
        if (!ShuffleVectorInst::isIdentityMask(Mask, VF) &&
            !all_of(enumerate(Mask), [=](const auto &Data) {
              return Data.value() == PoisonMaskElem ||
                     (Data.index() < VF &&
                      static_cast<int>(Data.index()) == Data.value());
            })) {
          InstructionCost C =
              ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, FTy, Mask);
          LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
                            << " for final shuffle of insertelement "
                               "external users.\n";
                     TEs.front()->dump();
                     dbgs() << "SLP: Current total cost = " << Cost << "\n");
          Cost += C;
        }
      } else {
        if (VF == 0) {
          if (TEs.front() &&
              TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor())
            VF = TEs.front()->getVectorFactor();
          else
            VF = Mask.size();
        }
        auto *FTy =
            getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
        InstructionCost C =
            ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, FTy, Mask);
        LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
                          << " for final shuffle of vector node and external "
                             "insertelement users.\n";
                   if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump();
                   dbgs() << "SLP: Current total cost = " << Cost << "\n");
        Cost += C;
      }
      VF = Mask.size();
      return TEs.back();
    };
    (void)performExtractsShuffleAction<const TreeEntry>(
        MutableArrayRef(Vector.data(), Vector.size()), Base,
        [](const TreeEntry *E) { return E->getVectorFactor(); }, ResizeToVF,
        EstimateShufflesCost);
    InstructionCost InsertCost = TTI->getScalarizationOverhead(
        cast<FixedVectorType>(
            ShuffledInserts[I].InsertElements.front()->getType()),
        DemandedElts[I], /*Insert=*/true, /*Extract=*/false, CostKind);
    Cost -= InsertCost;
  }
  // Add the cost for the reduced value resize, if required.
  if (ReductionBitWidth != 0) {
    assert(UserIgnoreList && "Expected reduction tree.");
    const TreeEntry &E = *VectorizableTree.front();
    auto It = MinBWs.find(&E);
    if (It != MinBWs.end() && It->second.first != ReductionBitWidth) {
      unsigned SrcSize = It->second.first;
      unsigned DstSize = ReductionBitWidth;
      unsigned Opcode = Instruction::Trunc;
      if (SrcSize < DstSize) {
        bool IsArithmeticExtendedReduction =
            all_of(*UserIgnoreList, [](Value *V) {
              auto *I = cast<Instruction>(V);
              return is_contained({Instruction::Add, Instruction::FAdd,
                                   Instruction::Mul, Instruction::FMul,
                                   Instruction::And, Instruction::Or,
                                   Instruction::Xor},
                                  I->getOpcode());
            });
        if (IsArithmeticExtendedReduction)
          Opcode =
              Instruction::BitCast; // Handled by getExtendedReductionCost.
        else
          Opcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
      }
      if (Opcode != Instruction::BitCast) {
        auto *SrcVecTy =
            getWidenedType(Builder.getIntNTy(SrcSize), E.getVectorFactor());
        auto *DstVecTy =
            getWidenedType(Builder.getIntNTy(DstSize), E.getVectorFactor());
        TTI::CastContextHint CCH = TTI::CastContextHint::None;
        InstructionCost CastCost;
        switch (E.getOpcode()) {
        case Instruction::SExt:
        case Instruction::ZExt:
        case Instruction::Trunc: {
          const TreeEntry *OpTE = getOperandEntry(&E, 0);
          CCH = getCastContextHint(*OpTE);
          break;
        }
        default:
          break;
        }
        CastCost += TTI->getCastInstrCost(Opcode, DstVecTy, SrcVecTy, CCH,
                                          TTI::TCK_RecipThroughput);
        Cost += CastCost;
        LLVM_DEBUG(dbgs() << "SLP: Adding cost " << CastCost
                          << " for final resize for reduction from "
                          << SrcVecTy << " to " << DstVecTy << "\n";
                   dbgs() << "SLP: Current total cost = " << Cost << "\n");
      }
    }
  }
  std::optional<InstructionCost> SpillCost = getSpillCost();
  if (SpillCost)
    Cost += *SpillCost;

#ifndef NDEBUG
  SmallString<256> Str;
  {
    raw_svector_ostream OS(Str);
    OS << "SLP: Spill Cost = ";
    // ...
    OS << ".\nSLP: Extract Cost = " << ExtractCost << ".\n"
       << "SLP: Total Cost = " << Cost << ".\n";
  }
  LLVM_DEBUG(dbgs() << Str);
  if (ViewSLPTree)
    ViewGraph(this, "SLP" + F->getName(), false, Str);
#endif

  return Cost;
}
std::optional<TTI::ShuffleKind>
BoUpSLP::tryToGatherSingleRegisterExtractElements(
    MutableArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) const {
  // Scan the list of gathered scalars for extractelements that can be
  // represented as shuffles.
  MapVector<Value *, SmallVector<int>> VectorOpToIdx;
  SmallVector<int> UndefVectorExtracts;
  for (int I = 0, E = VL.size(); I < E; ++I) {
    auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
    if (!EI)
      continue;
    // ...
    std::optional<unsigned> Idx = getExtractIndex(EI);
    // Undefined or out-of-bounds index produces an undef extract.
    if (!Idx || *Idx >= VecTy->getNumElements()) {
      UndefVectorExtracts.push_back(I);
      continue;
    }
    SmallBitVector ExtractMask(VecTy->getNumElements(), true);
    ExtractMask.reset(*Idx);
    // ...
    VectorOpToIdx[EI->getVectorOperand()].push_back(I);
  }
  // Sort the vector operands by the maximum number of uses in
  // extractelements.
  SmallVector<std::pair<Value *, SmallVector<int>>> Vectors =
      VectorOpToIdx.takeVector();
  stable_sort(Vectors, [](const auto &P1, const auto &P2) {
    return P1.second.size() > P2.second.size();
  });
  // Find the best pair of the vectors or a single vector.
  const int UndefSz = UndefVectorExtracts.size();
  unsigned SingleMax = 0;
  unsigned PairMax = 0;
  if (!Vectors.empty()) {
    SingleMax = Vectors.front().second.size() + UndefSz;
    if (Vectors.size() > 1) {
      auto *ItNext = std::next(Vectors.begin());
      PairMax = SingleMax + ItNext->second.size();
    }
  }
  if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
    return std::nullopt;
  // Check if it is better to perform a shuffle of 2 vectors or just of a
  // single vector.
  SmallVector<Value *> SavedVL(VL.begin(), VL.end());
  SmallVector<Value *> GatheredExtracts(
      VL.size(), PoisonValue::get(VL.front()->getType()));
  if (SingleMax >= PairMax && SingleMax) {
    for (int Idx : Vectors.front().second)
      std::swap(GatheredExtracts[Idx], VL[Idx]);
  } else if (!Vectors.empty()) {
    for (unsigned Idx : {0, 1})
      for (int Idx : Vectors[Idx].second)
        std::swap(GatheredExtracts[Idx], VL[Idx]);
  }
  // Add extracts from undefs too.
  for (int Idx : UndefVectorExtracts)
    std::swap(GatheredExtracts[Idx], VL[Idx]);
  // Check that the gather of extractelements can be represented as a shuffle
  // of the single/two vectors the scalars are extracted from.
  std::optional<TTI::ShuffleKind> Res =
      isFixedVectorShuffle(GatheredExtracts, Mask, AC);
  if (!Res || all_of(Mask, [](int Idx) { return Idx == PoisonMaskElem; })) {
    // Restore the original VL if the attempt was not successful.
    copy(SavedVL, VL.begin());
    return std::nullopt;
  }
  // Restore unused scalars from the mask, if some of the extractelements
  // were not selected for shuffle.
  for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) {
    // ...
  }
  return Res;
}
SmallVector<std::optional<TTI::ShuffleKind>>
BoUpSLP::tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
                                    SmallVectorImpl<int> &Mask,
                                    unsigned NumParts) const {
  assert(NumParts > 0 && "NumParts expected be greater than or equal to 1.");
  SmallVector<std::optional<TTI::ShuffleKind>> ShufflesRes(NumParts);
  Mask.assign(VL.size(), PoisonMaskElem);
  unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
  for (unsigned Part : seq<unsigned>(NumParts)) {
    // Scan the list of gathered scalars for extractelements that can be
    // represented as shuffles.
    MutableArrayRef<Value *> SubVL = MutableArrayRef(VL).slice(
        Part * SliceSize, getNumElems(VL.size(), SliceSize, Part));
    SmallVector<int> SubMask;
    std::optional<TTI::ShuffleKind> Res =
        tryToGatherSingleRegisterExtractElements(SubVL, SubMask);
    ShufflesRes[Part] = Res;
    copy(SubMask, std::next(Mask.begin(), Part * SliceSize));
  }
  if (none_of(ShufflesRes, [](const std::optional<TTI::ShuffleKind> &Res) {
        return Res.has_value();
      }))
    ShufflesRes.clear();
  return ShufflesRes;
}
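// Checks whether a single-register part of a gather node can be emitted as a
// shuffle of at most two existing tree entries instead of a buildvector;
// dominance/ordering checks make sure the source entries are materialized
// before the gather's insertion point.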
std::optional<TargetTransformInfo::ShuffleKind>
BoUpSLP::isGatherShuffledSingleRegisterEntry(
    const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
    SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part,
    bool ForOrder) {
  Entries.clear();
  auto GetUserEntry = [&](const TreeEntry *TE) {
    while (TE->UserTreeIndex && TE->UserTreeIndex.EdgeIdx == UINT_MAX)
      TE = TE->UserTreeIndex.UserTE;
    if (TE == VectorizableTree.front().get())
      return EdgeInfo(const_cast<TreeEntry *>(TE), 0);
    return TE->UserTreeIndex;
  };
  auto HasGatherUser = [&](const TreeEntry *TE) {
    while (TE->Idx != 0 && TE->UserTreeIndex) {
      if (TE->UserTreeIndex.EdgeIdx == UINT_MAX)
        return true;
      TE = TE->UserTreeIndex.UserTE;
    }
    return false;
  };
  const EdgeInfo TEUseEI = GetUserEntry(TE);
  if (!TEUseEI)
    return std::nullopt;
  const Instruction *TEInsertPt = &getLastInstructionInBundle(TEUseEI.UserTE);
  const BasicBlock *TEInsertBlock = nullptr;
  // The main node of PHI entries keeps the correct order of operands and
  // incoming blocks.
  if (auto *PHI = dyn_cast_or_null<PHINode>(
          TEUseEI.UserTE->hasState() ? TEUseEI.UserTE->getMainOp() : nullptr);
      PHI && TEUseEI.UserTE->State != TreeEntry::SplitVectorize) {
    TEInsertBlock = PHI->getIncomingBlock(TEUseEI.EdgeIdx);
    TEInsertPt = TEInsertBlock->getTerminator();
  } else {
    TEInsertBlock = TEInsertPt->getParent();
  }
  if (!DT->isReachableFromEntry(TEInsertBlock))
    return std::nullopt;
  auto *NodeUI = DT->getNode(TEInsertBlock);
  assert(NodeUI && "Should only process reachable instructions");
  SmallPtrSet<Value *, 4> GatheredScalars(VL.begin(), VL.end());
  auto CheckOrdering = [&](const Instruction *InsertPt) {
    // Returns true if the insertion point of the vector code for TE dominates
    // the insertion point of the other entry sharing scalars with TE.
    const BasicBlock *InsertBlock = InsertPt->getParent();
    auto *NodeEUI = DT->getNode(InsertBlock);
    if (!NodeEUI)
      return false;
    assert((NodeUI == NodeEUI) ==
               (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) &&
           "Different nodes should have different DFS numbers");
    // Check the order of the gather nodes' users.
    if (TEInsertPt->getParent() != InsertBlock &&
        (DT->dominates(NodeUI, NodeEUI) || !DT->dominates(NodeEUI, NodeUI)))
      return false;
    if (TEInsertPt->getParent() == InsertBlock &&
        TEInsertPt->comesBefore(InsertPt))
      return false;
    return true;
  };
  // Find all tree entries used by the gathered values. If no common entries
  // are found - this is not a shuffle.
  SmallVector<SmallPtrSet<const TreeEntry *, 4>> UsedTEs;
  SmallDenseMap<Value *, int> UsedValuesEntry;
  SmallPtrSet<const Value *, 16> VisitedValue;
  auto CheckAndUseSameNode = [&](const TreeEntry *TEPtr) {
    // Whole-node match - reuse it as-is.
    if ((TEPtr->getVectorFactor() != VL.size() &&
         TEPtr->Scalars.size() != VL.size()) ||
        (!TEPtr->isSame(VL) && !TEPtr->isSame(TE->Scalars)))
      return false;
    Entries.clear();
    Entries.push_back(TEPtr);
    for (Value *V : VL) {
      if (isConstant(V))
        continue;
      UsedValuesEntry.try_emplace(V, 0);
    }
    return true;
  };
  auto CheckParentNodes = [&](const TreeEntry *User1, const TreeEntry *User2,
                              unsigned EdgeIdx) {
    const TreeEntry *Ptr1 = User1;
    const TreeEntry *Ptr2 = User2;
    SmallDenseMap<const TreeEntry *, unsigned> PtrToIdx;
    while (Ptr2) {
      PtrToIdx.try_emplace(Ptr2, EdgeIdx);
      EdgeIdx = Ptr2->UserTreeIndex.EdgeIdx;
      Ptr2 = Ptr2->UserTreeIndex.UserTE;
    }
    while (Ptr1) {
      unsigned Idx = Ptr1->UserTreeIndex.EdgeIdx;
      Ptr1 = Ptr1->UserTreeIndex.UserTE;
      if (auto It = PtrToIdx.find(Ptr1); It != PtrToIdx.end())
        return Idx < It->second;
    }
    return false;
  };
  for (Value *V : VL) {
    if (isConstant(V) || !VisitedValue.insert(V).second)
      continue;
    // Build a list of tree entries where V is used.
    SmallPtrSet<const TreeEntry *, 4> VToTEs;
    for (const TreeEntry *TEPtr : ValueToGatherNodes.lookup(V)) {
      if (TEPtr == TE || TEPtr->Idx == 0)
        continue;
      assert(any_of(TEPtr->Scalars,
                    [&](Value *V) { return GatheredScalars.contains(V); }) &&
             "Must contain at least single gathered value.");
      assert(TEPtr->UserTreeIndex &&
             "Expected only single user of a gather node.");
      const EdgeInfo &UseEI = TEPtr->UserTreeIndex;

      PHINode *UserPHI = (UseEI.UserTE->State != TreeEntry::SplitVectorize &&
                          UseEI.UserTE->hasState())
                             ? dyn_cast<PHINode>(UseEI.UserTE->getMainOp())
                             : nullptr;
      const Instruction *InsertPt =
          UserPHI ? UserPHI->getIncomingBlock(UseEI.EdgeIdx)->getTerminator()
                  : &getLastInstructionInBundle(UseEI.UserTE);
      if (TEInsertPt == InsertPt) {
        // If the schedulable insertion point is used by multiple entries -
        // just bail out, no known ordering at this point, available only
        // after the final vectorization.
        if (TEUseEI.UserTE->State == TreeEntry::Vectorize &&
            (TEUseEI.UserTE->getOpcode() != Instruction::PHI ||
             TEUseEI.UserTE->isAltShuffle()) &&
            all_of(TEUseEI.UserTE->Scalars, isUsedOutsideBlock)) {
          if (UseEI.UserTE->State != TreeEntry::Vectorize ||
              (UseEI.UserTE->hasState() &&
               UseEI.UserTE->getOpcode() == Instruction::PHI &&
               !UseEI.UserTE->isAltShuffle()) ||
              !all_of(UseEI.UserTE->Scalars, isUsedOutsideBlock))
            continue;
        }
        // If the user instruction is used in several different vectorized
        // nodes - make it depend on the index.
        if (TEUseEI.UserTE != UseEI.UserTE ||
            TEUseEI.EdgeIdx < UseEI.EdgeIdx)
          continue;
      }
      // Do not build a graph dependency between two different PHI nodes.
      if (TEUseEI.UserTE->State == TreeEntry::Vectorize &&
          TEUseEI.UserTE->getOpcode() == Instruction::PHI &&
          UseEI.UserTE->State == TreeEntry::Vectorize &&
          UseEI.UserTE->getOpcode() == Instruction::PHI &&
          TEUseEI.UserTE != UseEI.UserTE)
        continue;
      // Check if the user node of TE comes after the user node of TEPtr,
      // otherwise TEPtr depends on TE.
      if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx)
        continue;
      if (TEUseEI.UserTE != UseEI.UserTE &&
          (TEUseEI.UserTE->Idx < UseEI.UserTE->Idx ||
           HasGatherUser(TEUseEI.UserTE)))
        continue;
      if (CheckParentNodes(TEUseEI.UserTE, UseEI.UserTE, UseEI.EdgeIdx))
        continue;
      // ...
      if (!TEUseEI.UserTE->isGather() && !UserPHI &&
          TEUseEI.UserTE->doesNotNeedToSchedule() !=
              UseEI.UserTE->doesNotNeedToSchedule() &&
          /* ... */ false)
        continue;
      if ((TEInsertBlock != InsertPt->getParent() ||
           TEUseEI.EdgeIdx < UseEI.EdgeIdx ||
           TEUseEI.UserTE != UseEI.UserTE) &&
          !CheckOrdering(InsertPt))
        continue;
      // The node is reused - exit.
      if (CheckAndUseSameNode(TEPtr))
        break;
      VToTEs.insert(TEPtr);
    }
    if (ArrayRef<TreeEntry *> VTEs = getTreeEntries(V); !VTEs.empty()) {
      const auto *It = find_if(
          VTEs, [&](const TreeEntry *MTE) { return MTE != TEUseEI.UserTE; });
      if (It != VTEs.end()) {
        const TreeEntry *VTE = *It;
        if (none_of(TE->CombinedEntriesWithIndices,
                    [&](const auto &P) { return P.first == VTE->Idx; })) {
          Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
          if (&LastBundleInst == TEInsertPt ||
              !CheckOrdering(&LastBundleInst))
            continue;
        }
        // The node is reused - exit.
        if (CheckAndUseSameNode(VTE))
          break;
        VToTEs.insert(VTE);
      }
      // ...
      const TreeEntry *VTE = VTEs.front();
      if (ForOrder && VTE->Idx < GatheredLoadsEntriesFirst.value_or(0) &&
          VTEs.size() > 1 && VTE->State != TreeEntry::Vectorize) {
        VTEs = VTEs.drop_front();
        // Iterate through all vectorized nodes.
        const auto *MIt = find_if(VTEs, [](const TreeEntry *MTE) {
          return MTE->State == TreeEntry::Vectorize;
        });
        if (MIt == VTEs.end())
          continue;
        VTE = *MIt;
      }
      if (none_of(TE->CombinedEntriesWithIndices,
                  [&](const auto &P) { return P.first == VTE->Idx; })) {
        Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
        if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
          continue;
      }
      // The node is reused - exit.
      if (CheckAndUseSameNode(VTE))
        break;
      VToTEs.insert(VTE);
    }
    if (VToTEs.empty())
      continue;
    if (UsedTEs.empty()) {
      // The first iteration, just insert the list of nodes to the vector.
      UsedTEs.push_back(VToTEs);
      UsedValuesEntry.try_emplace(V, 0);
    } else {
      // Need to check if there are any previously used tree nodes which use
      // V. If there are no such nodes, consider that we have another input
      // vector.
      SmallPtrSet<const TreeEntry *, 4> SavedVToTEs(VToTEs);
      unsigned Idx = 0;
      for (SmallPtrSet<const TreeEntry *, 4> &Set : UsedTEs) {
        // Do we have a non-empty intersection of entries?
        set_intersect(VToTEs, Set);
        if (!VToTEs.empty()) {
          // Yes, write the new subset and continue with the next scalar.
          Set.swap(VToTEs);
          break;
        }
        VToTEs = SavedVToTEs;
        ++Idx;
      }
      // No non-empty intersection found - need to add a second set of
      // possible source vectors.
      if (Idx == UsedTEs.size()) {
        // If the number of input vectors is greater than 2 - not a
        // permutation, fall back to the regular gather.
        if (UsedTEs.size() == 2)
          continue;
        UsedTEs.push_back(SavedVToTEs);
        Idx = UsedTEs.size() - 1;
      }
      UsedValuesEntry.try_emplace(V, Idx);
    }
  }

  if (UsedTEs.empty()) {
    Entries.clear();
    return std::nullopt;
  }
  unsigned VF = 0;
  if (UsedTEs.size() == 1) {
    // Keep the order to avoid non-determinism.
    SmallVector<const TreeEntry *> FirstEntries(UsedTEs.front().begin(),
                                                UsedTEs.front().end());
    sort(FirstEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
      return TE1->Idx < TE2->Idx;
    });
    // Try to find the perfect match in another gather node at first.
    auto *It = find_if(FirstEntries, [=](const TreeEntry *EntryPtr) {
      return EntryPtr->isSame(VL) || EntryPtr->isSame(TE->Scalars);
    });
    if (It != FirstEntries.end() &&
        ((*It)->getVectorFactor() == VL.size() ||
         ((*It)->getVectorFactor() == TE->Scalars.size() &&
          TE->ReuseShuffleIndices.size() == VL.size() &&
          (*It)->isSame(TE->Scalars)))) {
      Entries.push_back(*It);
      if ((*It)->getVectorFactor() == VL.size()) {
        std::iota(std::next(Mask.begin(), Part * VL.size()),
                  std::next(Mask.begin(), (Part + 1) * VL.size()), 0);
      } else {
        SmallVector<int> CommonMask = TE->getCommonMask();
        copy(CommonMask, Mask.begin());
      }
      // ...
      return TargetTransformInfo::SK_PermuteSingleSrc;
    }
    // No perfect match, just shuffle, so choose the first tree node from the
    // tree.
    Entries.push_back(FirstEntries.front());
    for (auto &P : UsedValuesEntry)
      P.second = 0;
    VF = FirstEntries.front()->getVectorFactor();
  } else {
    // Try to find nodes with the same vector factor.
    assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries.");
    // Keep the order of tree nodes to avoid non-determinism.
    DenseMap<int, const TreeEntry *> VFToTE;
    for (const TreeEntry *TE : UsedTEs.front()) {
      unsigned VF = TE->getVectorFactor();
      auto It = VFToTE.find(VF);
      if (It != VFToTE.end()) {
        if (It->second->Idx > TE->Idx)
          It->getSecond() = TE;
        continue;
      }
      VFToTE.try_emplace(VF, TE);
    }
    SmallVector<const TreeEntry *> SecondEntries(UsedTEs.back().begin(),
                                                 UsedTEs.back().end());
    sort(SecondEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
      return TE1->Idx < TE2->Idx;
    });
    for (const TreeEntry *TE : SecondEntries) {
      auto It = VFToTE.find(TE->getVectorFactor());
      if (It != VFToTE.end()) {
        VF = It->first;
        Entries.push_back(It->second);
        Entries.push_back(TE);
        break;
      }
    }
    // No 2 source vectors with the same vector factor - just choose 2 with
    // the max index.
    if (Entries.empty()) {
      Entries.push_back(*llvm::max_element(
          UsedTEs.front(), [](const TreeEntry *TE1, const TreeEntry *TE2) {
            return TE1->Idx < TE2->Idx;
          }));
      Entries.push_back(SecondEntries.front());
      VF = std::max(Entries.front()->getVectorFactor(),
                    Entries.back()->getVectorFactor());
    } else {
      VF = Entries.front()->getVectorFactor();
    }
  }
  // Remap used values to the actually selected entries.
  SmallVector<SmallPtrSet<Value *, 4>> ValuesToEntries(Entries.size());
  for (const TreeEntry *E : Entries) {
    /* ... populate ValuesToEntries with E->Scalars ... */
  }
  for (auto &P : UsedValuesEntry) {
    /* ... if (ValuesToEntries[Idx].contains(P.first)) remap P.second ... */
  }
  bool IsSplatOrUndefs = isSplat(VL) || all_of(VL, IsaPred<UndefValue>);
  // Checks if the 2 PHIs are compatible in terms of high possibility to be
  // vectorized.
  auto AreCompatiblePHIs = [&](Value *V, Value *V1) {
    auto *PHI = cast<PHINode>(V);
    auto *PHI1 = cast<PHINode>(V1);
    // Check that all incoming values are compatible/from the same parent (if
    // they are instructions).
    for (int I = 0, E = PHI->getNumIncomingValues(); I < E; ++I) {
      Value *In = PHI->getIncomingValue(I);
      Value *In1 = PHI1->getIncomingValue(I);
      if (isConstant(In) && isConstant(In1))
        continue;
      if (!getSameOpcode({In, In1}, *TLI))
        return false;
      if (cast<Instruction>(In)->getParent() !=
          cast<Instruction>(In1)->getParent())
        return false;
    }
    return true;
  };
  // Check if the value can be ignored during analysis of shuffled gathers.
  // We suppose it is better to ignore instructions which do not form splats,
  // are not vectorized/not extractelements (these are handled by the
  // extractelements processing) or may form a vector node in the future.
  auto MightBeIgnored = [=](Value *V) {
    auto *I = dyn_cast<Instruction>(V);
    return I && !IsSplatOrUndefs && !isVectorized(I) &&
           !isVectorLikeInstWithConstOps(I) &&
           !areAllUsersVectorized(I, UserIgnoreList) && isSimple(I);
  };
  // Check that the neighbor instruction may form a full vector node with the
  // current instruction V. It is possible if they have same/alternate opcode
  // and the same parent basic block.
  auto NeighborMightBeIgnored = [&](Value *V, int Idx) {
    Value *V1 = VL[Idx];
    bool UsedInSameVTE = false;
    auto It = UsedValuesEntry.find(V1);
    if (It != UsedValuesEntry.end())
      UsedInSameVTE = It->second == UsedValuesEntry.find(V)->second;
    return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE &&
           getSameOpcode({V, V1}, *TLI) &&
           cast<Instruction>(V)->getParent() ==
               cast<Instruction>(V1)->getParent() &&
           (!isa<PHINode>(V1) || AreCompatiblePHIs(V, V1));
  };
  // Build a shuffle mask for better cost estimation and vector emission.
  SmallBitVector UsedIdxs(Entries.size());
  SmallVector<std::pair<unsigned, int>> EntryLanes;
  for (int I = 0, E = VL.size(); I < E; ++I) {
    Value *V = VL[I];
    auto It = UsedValuesEntry.find(V);
    if (It == UsedValuesEntry.end())
      continue;
    // Do not try to shuffle scalars, if they are constants, or instructions
    // that can be vectorized as a result of the following vector build
    // vectorization.
    if (isConstant(V) || (MightBeIgnored(V) &&
                          ((I > 0 && NeighborMightBeIgnored(V, I - 1)) ||
                           (I != E - 1 && NeighborMightBeIgnored(V, I + 1)))))
      continue;
    unsigned Idx = It->second;
    EntryLanes.emplace_back(Idx, I);
    UsedIdxs.set(Idx);
  }
  // Iterate through all shuffled scalars and select the entries that can be
  // used for the final shuffle.
  SmallVector<const TreeEntry *> TempEntries;
  for (unsigned I = 0, Sz = Entries.size(); I < Sz; ++I) {
    if (!UsedIdxs.test(I))
      continue;
    // Fix the entry number for the given scalar. If it is the first entry,
    // set Pair.first to 0, otherwise to 1 (at max 2 nodes are selected).
    for (std::pair<unsigned, int> &Pair : EntryLanes)
      if (Pair.first == I)
        Pair.first = TempEntries.size();
    TempEntries.push_back(Entries[I]);
  }
  Entries.swap(TempEntries);
  if (EntryLanes.size() == Entries.size() &&
      !VL.equals(ArrayRef(TE->Scalars)
                     .slice(Part * VL.size(),
                            std::min<int>(VL.size(), TE->Scalars.size())))) {
    // We may have 1 or 2 entries only. If the number of scalars is equal to
    // the number of entries, the analysis is not very profitable - cut off
    // this case.
    Entries.clear();
    return std::nullopt;
  }
  // Build the final mask and check for the identity shuffle, if possible.
  bool IsIdentity = Entries.size() == 1;
  // Pair.first is the offset to the vector, while Pair.second is the index
  // of the scalar in the list.
  for (const std::pair<unsigned, int> &Pair : EntryLanes) {
    unsigned Idx = Part * VL.size() + Pair.second;
    Mask[Idx] =
        Pair.first * VF +
        (ForOrder ? std::distance(
                        Entries[Pair.first]->Scalars.begin(),
                        find(Entries[Pair.first]->Scalars, VL[Pair.second]))
                  : Entries[Pair.first]->findLaneForValue(VL[Pair.second]));
    IsIdentity &= Mask[Idx] == Pair.second;
  }
  if (ForOrder || IsIdentity || Entries.empty()) {
    switch (Entries.size()) {
    case 1:
      if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
        return TargetTransformInfo::SK_PermuteSingleSrc;
      break;
    case 2:
      if (EntryLanes.size() > 2 || VL.size() <= 2)
        return TargetTransformInfo::SK_PermuteTwoSrc;
      break;
    default:
      break;
    }
  } else if (!isa<VectorType>(VL.front()->getType()) &&
             (EntryLanes.size() > Entries.size() || VL.size() <= 2)) {
    // Do the cost estimation if the shuffle is beneficial compared to a
    // buildvector.
    SmallVector<int> SubMask(std::next(Mask.begin(), Part * VL.size()),
                             std::next(Mask.begin(), (Part + 1) * VL.size()));
    int MinElement = SubMask.front(), MaxElement = SubMask.front();
    for (int Idx : SubMask) {
      if (Idx == PoisonMaskElem)
        continue;
      if (MinElement == PoisonMaskElem || MinElement % VF > Idx % VF)
        MinElement = Idx;
      if (MaxElement == PoisonMaskElem || MaxElement % VF < Idx % VF)
        MaxElement = Idx;
    }
    assert(MaxElement >= 0 && MinElement >= 0 &&
           MaxElement % VF >= MinElement % VF &&
           "Expected at least single element.");
    unsigned NewVF = std::max<unsigned>(
        VL.size(), getFullVectorNumberOfElements(*TTI, VL.front()->getType(),
                                                 (MaxElement % VF) -
                                                     (MinElement % VF) + 1));
    if (NewVF < VF) {
      for (int &Idx : SubMask) {
        if (Idx == PoisonMaskElem)
          continue;
        Idx = ((Idx % VF) - (((MinElement % VF) / NewVF) * NewVF)) % NewVF +
              (Idx >= static_cast<int>(VF) ? NewVF : 0);
      }
      VF = NewVF;
    }

    constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
    auto *VecTy = getWidenedType(VL.front()->getType(), VF);
    auto *MaskVecTy = getWidenedType(VL.front()->getType(), SubMask.size());
    auto GetShuffleCost = [&,
                           &TTI = *TTI](ArrayRef<int> Mask,
                                        ArrayRef<const TreeEntry *> Entries,
                                        VectorType *VecTy) -> InstructionCost {
      if (Entries.size() == 1 && Entries.front()->getInterleaveFactor() > 0 &&
          ShuffleVectorInst::isDeInterleaveMaskOfFactor(
              Mask, Entries.front()->getInterleaveFactor()))
        return TTI::TCC_Free;
      return ::getShuffleCost(TTI,
                              Entries.size() > 1 ? TTI::SK_PermuteTwoSrc
                                                 : TTI::SK_PermuteSingleSrc,
                              VecTy, Mask, CostKind);
    };
    InstructionCost ShuffleCost = GetShuffleCost(SubMask, Entries, VecTy);
    InstructionCost FirstShuffleCost = 0;
    SmallVector<int> FirstMask(SubMask.begin(), SubMask.end());
    if (Entries.size() == 1 || !Entries[0]->isGather()) {
      FirstShuffleCost = ShuffleCost;
    } else {
      // Transform the mask to include only the first entry.
      APInt DemandedElts = APInt::getAllOnes(SubMask.size());
      bool IsIdentity = true;
      for (auto [I, Idx] : enumerate(FirstMask)) {
        if (Idx >= static_cast<int>(NewVF)) {
          Idx = PoisonMaskElem;
        } else {
          DemandedElts.clearBit(I);
          IsIdentity &= static_cast<int>(I) == Idx;
        }
      }
      if (!IsIdentity)
        FirstShuffleCost = GetShuffleCost(FirstMask, Entries.front(), VecTy);
      FirstShuffleCost += ::getScalarizationOverhead(
          *TTI, VL.front()->getType(), MaskVecTy, DemandedElts,
          /*Insert=*/true, /*Extract=*/false, CostKind);
    }
    InstructionCost SecondShuffleCost = 0;
    SmallVector<int> SecondMask(SubMask.begin(), SubMask.end());
    if (Entries.size() == 1 || !Entries[1]->isGather()) {
      SecondShuffleCost = ShuffleCost;
    } else {
      // Transform the mask to include only the second entry.
      APInt DemandedElts = APInt::getAllOnes(SubMask.size());
      bool IsIdentity = true;
      for (auto [I, Idx] : enumerate(SecondMask)) {
        if (Idx < static_cast<int>(NewVF) && Idx >= 0) {
          Idx = PoisonMaskElem;
        } else {
          DemandedElts.clearBit(I);
          IsIdentity &= static_cast<int>(I) == Idx;
        }
      }
      if (!IsIdentity)
        SecondShuffleCost = GetShuffleCost(SecondMask, Entries[1], VecTy);
      SecondShuffleCost += ::getScalarizationOverhead(
          *TTI, VL.front()->getType(), MaskVecTy, DemandedElts,
          /*Insert=*/true, /*Extract=*/false, CostKind);
    }
    APInt DemandedElts = APInt::getAllOnes(SubMask.size());
    // ...
    InstructionCost BuildVectorCost = ::getScalarizationOverhead(
        *TTI, VL.front()->getType(), MaskVecTy, DemandedElts,
        /*Insert=*/true, /*Extract=*/false, CostKind);
    const TreeEntry *BestEntry = nullptr;
    if (FirstShuffleCost < ShuffleCost) {
      std::for_each(std::next(Mask.begin(), Part * VL.size()),
                    std::next(Mask.begin(), (Part + 1) * VL.size()),
                    [&](int &Idx) {
                      if (Idx >= static_cast<int>(VF))
                        Idx = PoisonMaskElem;
                    });
      BestEntry = Entries.front();
      ShuffleCost = FirstShuffleCost;
    }
    if (SecondShuffleCost < ShuffleCost) {
      std::for_each(std::next(Mask.begin(), Part * VL.size()),
                    std::next(Mask.begin(), (Part + 1) * VL.size()),
                    [&](int &Idx) {
                      if (Idx < static_cast<int>(VF))
                        Idx = PoisonMaskElem;
                      else
                        Idx -= VF;
                    });
      BestEntry = Entries[1];
      ShuffleCost = SecondShuffleCost;
    }
    if (BuildVectorCost >= ShuffleCost) {
      if (BestEntry) {
        Entries.clear();
        Entries.push_back(BestEntry);
      }
      return Entries.size() > 1 ? TargetTransformInfo::SK_PermuteTwoSrc
                                : TargetTransformInfo::SK_PermuteSingleSrc;
    }
  }
  Entries.clear();
  // Clear the corresponding mask elements.
  std::fill(std::next(Mask.begin(), Part * VL.size()),
            std::next(Mask.begin(), (Part + 1) * VL.size()), PoisonMaskElem);
  return std::nullopt;
}
SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
BoUpSLP::isGatherShuffledEntry(
    const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
    SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries,
    unsigned NumParts, bool ForOrder) {
  assert(NumParts > 0 && NumParts < VL.size() &&
         "Expected positive number of registers.");
  Entries.clear();
  // No need to check for the topmost gather node.
  if (TE == VectorizableTree.front().get() &&
      (!GatheredLoadsEntriesFirst.has_value() ||
       none_of(ArrayRef(VectorizableTree).drop_front(),
               [](const std::unique_ptr<TreeEntry> &TE) {
                 return !TE->isGather();
               })))
    return {};
  // FIXME: gathering for non-power-of-2 (non-whole-register) nodes is not
  // implemented yet.
  if (TE->hasNonWholeRegisterOrNonPowerOf2Vec(*TTI))
    return {};
  Mask.assign(VL.size(), PoisonMaskElem);
  assert((TE->UserTreeIndex || TE == VectorizableTree.front().get()) &&
         "Expected only single user of the gather node.");
  assert(VL.size() % NumParts == 0 &&
         "Number of scalars must be divisible by NumParts.");
  if (TE->UserTreeIndex && TE->UserTreeIndex.UserTE->isGather() &&
      TE->UserTreeIndex.EdgeIdx == UINT_MAX &&
      (TE->Idx == 0 ||
       (TE->hasState() && TE->getOpcode() == Instruction::ExtractElement) ||
       isSplat(TE->Scalars) ||
       getSameValuesTreeEntry(TE->getMainOp(), TE->Scalars)))
    return {};
  unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
  SmallVector<std::optional<TTI::ShuffleKind>> Res;
  for (unsigned Part : seq<unsigned>(NumParts)) {
    ArrayRef<Value *> SubVL =
        VL.slice(Part * SliceSize, getNumElems(VL.size(), SliceSize, Part));
    SmallVectorImpl<const TreeEntry *> &SubEntries = Entries.emplace_back();
    std::optional<TTI::ShuffleKind> SubRes =
        isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part,
                                            ForOrder);
    if (!SubRes)
      SubEntries.clear();
    Res.push_back(SubRes);
    if (SubEntries.size() == 1 && *SubRes == TTI::SK_PermuteSingleSrc &&
        SubEntries.front()->getVectorFactor() == VL.size() &&
        (SubEntries.front()->isSame(TE->Scalars) ||
         SubEntries.front()->isSame(VL))) {
      SmallVector<const TreeEntry *> LocalSubEntries;
      LocalSubEntries.swap(SubEntries);
      Entries.clear();
      Res.clear();
      std::iota(Mask.begin(), Mask.end(), 0);
      // Clear undef scalars.
      for (int I = 0, Sz = VL.size(); I < Sz; ++I)
        if (isa<PoisonValue>(VL[I]))
          Mask[I] = PoisonMaskElem;
      Entries.emplace_back(1, LocalSubEntries.front());
      Res.push_back(TTI::SK_PermuteSingleSrc);
      return Res;
    }
  }
  if (all_of(Res,
             [](const std::optional<TTI::ShuffleKind> &SK) { return !SK; })) {
    Entries.clear();
    return {};
  }
  return Res;
}
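// Cost of building a vector from scalars: casts for type-mismatched values,
// insertion overhead for the non-constant lanes, and a blend with the
// constant part of the vector, if any.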
InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
                                       Type *ScalarTy) const {
  const unsigned VF = VL.size();
  auto *VecTy = getWidenedType(ScalarTy, VF);
  APInt DemandedElements = APInt::getZero(VF);
  // ...
  InstructionCost Cost;
  auto EstimateInsertCost = [&](unsigned I, Value *V) {
    if (V->getType() != ScalarTy)
      Cost += TTI->getCastInstrCost(Instruction::Trunc, ScalarTy,
                                    V->getType(), TTI::CastContextHint::None,
                                    CostKind);
    DemandedElements.setBit(I);
  };
  SmallVector<int> ConstantShuffleMask(VF, PoisonMaskElem);
  std::iota(ConstantShuffleMask.begin(), ConstantShuffleMask.end(), 0);
  for (auto [I, V] : enumerate(VL)) {
    // Constant lanes come from the second (constant) vector of the blend.
    if (isConstant(V)) {
      ConstantShuffleMask[I] = I + VF;
      continue;
    }
    EstimateInsertCost(I, V);
  }
  bool IsAnyNonUndefConst = any_of(
      VL, [](Value *V) { return isConstant(V) && !isa<UndefValue>(V); });
  if (!ForPoisonSrc && IsAnyNonUndefConst) {
    Cost += ::getShuffleCost(*TTI, TargetTransformInfo::SK_PermuteTwoSrc,
                             VecTy, ConstantShuffleMask);
    // ...
  }
  if (!DemandedElements.isZero())
    Cost += getScalarizationOverhead(VecTy, DemandedElements,
                                     /*Insert=*/true, /*Extract=*/false,
                                     CostKind,
                                     ForPoisonSrc && !IsAnyNonUndefConst, VL);
  return Cost;
}
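// Finds the instruction after which the vectorized code for a tree entry has
// to be emitted: normally the last member of the scheduled bundle, otherwise
// the first/last bundle instruction, depending on the entry kind.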
Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
  auto It = EntryToLastInstruction.find(E);
  if (It != EntryToLastInstruction.end())
    return *It->second;

  // Get the basic block this bundle is in. All instructions in the bundle
  // should be in this block (except for extractelement-like instructions
  // with constant indices, gathered loads and copyables).
  Instruction *Front = nullptr;
  unsigned Opcode = 0;
  if (E->hasState()) {
    Front = E->getMainOp();
    Opcode = E->getOpcode();
  } else {
    Front = cast<Instruction>(*find_if(E->Scalars, IsaPred<Instruction>));
  }
  BasicBlock *BB = Front->getParent();
  assert(((GatheredLoadsEntriesFirst.has_value() &&
           Opcode == Instruction::Load && E->isGather() &&
           E->Idx < *GatheredLoadsEntriesFirst) ||
          E->State == TreeEntry::SplitVectorize || E->hasCopyableElements() ||
          all_of(E->Scalars,
                 [=](Value *V) -> bool {
                   if (Opcode == Instruction::GetElementPtr &&
                       !isa<GetElementPtrInst>(V))
                     return true;
                   auto *I = dyn_cast<Instruction>(V);
                   return !I || !E->getMatchingMainOpOrAltOp(I) ||
                          I->getParent() == BB ||
                          isVectorLikeInstWithConstOps(I);
                 })) &&
         "Expected gathered loads or GEPs or instructions from same basic "
         "block.");

  auto FindLastInst = [&]() {
    Instruction *LastInst = Front;
    for (Value *V : E->Scalars) {
      auto *I = dyn_cast<Instruction>(V);
      if (!I)
        continue;
      if (E->isCopyableElement(I))
        continue;
      if (LastInst->getParent() == I->getParent()) {
        if (LastInst->comesBefore(I))
          LastInst = I;
        continue;
      }
      assert(((Opcode == Instruction::GetElementPtr &&
               !isa<GetElementPtrInst>(I)) ||
              E->State == TreeEntry::SplitVectorize ||
              (isVectorLikeInstWithConstOps(LastInst) &&
               isVectorLikeInstWithConstOps(I)) ||
              (GatheredLoadsEntriesFirst.has_value() &&
               Opcode == Instruction::Load && E->isGather() &&
               E->Idx < *GatheredLoadsEntriesFirst)) &&
             "Expected vector-like or non-GEP in GEP node insts only.");
      if (!DT->isReachableFromEntry(LastInst->getParent())) {
        LastInst = I;
        continue;
      }
      if (!DT->isReachableFromEntry(I->getParent()))
        continue;
      auto *NodeA = DT->getNode(LastInst->getParent());
      auto *NodeB = DT->getNode(I->getParent());
      assert(NodeA && "Should only process reachable instructions");
      assert(NodeB && "Should only process reachable instructions");
      assert((NodeA == NodeB) ==
                 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
             "Different nodes should have different DFS numbers");
      if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn())
        LastInst = I;
    }
    return LastInst;
  };

  auto FindFirstInst = [&]() {
    Instruction *FirstInst = Front;
    for (Value *V : E->Scalars) {
      auto *I = dyn_cast<Instruction>(V);
      if (!I)
        continue;
      if (E->isCopyableElement(I))
        continue;
      if (FirstInst->getParent() == I->getParent()) {
        if (I->comesBefore(FirstInst))
          FirstInst = I;
        continue;
      }
      assert(((Opcode == Instruction::GetElementPtr &&
               !isa<GetElementPtrInst>(I)) ||
              (isVectorLikeInstWithConstOps(FirstInst) &&
               isVectorLikeInstWithConstOps(I))) &&
             "Expected vector-like or non-GEP in GEP node insts only.");
      if (!DT->isReachableFromEntry(FirstInst->getParent())) {
        FirstInst = I;
        continue;
      }
      if (!DT->isReachableFromEntry(I->getParent()))
        continue;
      auto *NodeA = DT->getNode(FirstInst->getParent());
      auto *NodeB = DT->getNode(I->getParent());
      assert(NodeA && "Should only process reachable instructions");
      assert(NodeB && "Should only process reachable instructions");
      assert((NodeA == NodeB) ==
                 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
             "Different nodes should have different DFS numbers");
      if (NodeA->getDFSNumIn() > NodeB->getDFSNumIn())
        FirstInst = I;
    }
    return FirstInst;
  };
17589 if (
E->State == TreeEntry::SplitVectorize) {
17590 Res = FindLastInst();
17592 for (
auto *
E : Entries) {
17595 I = &getLastInstructionInBundle(
E);
17600 EntryToLastInstruction.try_emplace(
E, Res);
17605 if (GatheredLoadsEntriesFirst.has_value() &&
17606 E->Idx >= *GatheredLoadsEntriesFirst && !
E->isGather() &&
17607 Opcode == Instruction::Load) {
17608 Res = FindFirstInst();
17609 EntryToLastInstruction.try_emplace(
E, Res);
17615 auto FindScheduleBundle = [&](
const TreeEntry *
E) ->
const ScheduleBundle * {
17619 const auto *It = BlocksSchedules.find(BB);
17620 if (It == BlocksSchedules.end())
17622 for (
Value *V :
E->Scalars) {
17628 if (Bundles.
empty())
17631 Bundles, [&](ScheduleBundle *
B) {
return B->getTreeEntry() ==
E; });
17632 if (It != Bundles.
end())
17637 const ScheduleBundle *Bundle = FindScheduleBundle(
E);
17638 if (!
E->isGather() && !Bundle) {
17639 if ((Opcode == Instruction::GetElementPtr &&
17642 return !isa<GetElementPtrInst>(V) && isa<Instruction>(V);
17645 return isa<PoisonValue>(V) ||
17646 (E->Idx == 0 && isa<InsertElementInst>(V)) ||
17647 E->isCopyableElement(V) ||
17648 (!isVectorLikeInstWithConstOps(V) && isUsedOutsideBlock(V));
17650 Res = FindLastInst();
17652 Res = FindFirstInst();
17653 EntryToLastInstruction.try_emplace(
E, Res);
17662 assert(!
E->isGather() &&
"Gathered instructions should not be scheduled");
17663 Res = Bundle->getBundle().back()->getInst();
17664 EntryToLastInstruction.try_emplace(
E, Res);
17687 Res = FindLastInst();
17688 assert(Res &&
"Failed to find last instruction in bundle");
17689 EntryToLastInstruction.try_emplace(
E, Res);
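/// Positions the IR builder right after the last instruction of bundle \p E,
/// skipping PHIs and landing pads, so that the emitted vector code follows
/// all of the bundle's scalar operands.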
void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
  auto *Front = E->getMainOp();
  Instruction *LastInst = &getLastInstructionInBundle(E);
  assert(LastInst && "Failed to find last instruction in bundle");
  BasicBlock::iterator LastInstIt = LastInst->getIterator();
  // If the instruction is PHI, set the insert point after all the PHIs.
  bool IsPHI = isa<PHINode>(LastInst);
  if (IsPHI)
    LastInstIt = LastInst->getParent()->getFirstNonPHIIt();
  if (LastInstIt != LastInst->getParent()->end() &&
      LastInstIt->getParent()->isLandingPad())
    LastInstIt = std::next(LastInstIt);
  if (IsPHI ||
      (!E->isGather() && E->State != TreeEntry::SplitVectorize &&
       (E->doesNotNeedToSchedule() ||
        (E->hasCopyableElements() && !E->isCopyableElement(LastInst) &&
         /*...*/ true))) ||
      (GatheredLoadsEntriesFirst.has_value() &&
       E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
       E->getOpcode() == Instruction::Load)) {
    Builder.SetInsertPoint(LastInst->getParent(), LastInstIt);
  } else {
    // Set the insertion point after the last instruction in the bundle.
    Builder.SetInsertPoint(LastInst->getParent(),
                           std::next(LastInst->getIterator()));
  }
  Builder.SetCurrentDebugLocation(Front->getDebugLoc());
}
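/// Emits a buildvector for the scalars in \p VL as a chain of insertelement
/// instructions. Values defined inside the enclosing loop, or past the
/// current insertion point, are postponed and inserted last so that the
/// loop-invariant prefix of the buildvector remains hoistable.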
Value *BoUpSLP::gather(
    ArrayRef<Value *> VL, Value *Root, Type *ScalarTy,
    function_ref<Value *(Value *, Value *, ArrayRef<int>)> CreateShuffle) {
  // ...
  SmallVector<std::pair<Value *, unsigned>> PostponedInsts;
  SmallSet<int, 4> PostponedIndices;
  Loop *L = LI->getLoopFor(Builder.GetInsertBlock());
  auto CheckPredecessor = [](BasicBlock *InstBB, BasicBlock *InsertBB) {
    SmallPtrSet<BasicBlock *, 4> Visited;
    while (InsertBB && InsertBB != InstBB && Visited.insert(InsertBB).second)
      InsertBB = InsertBB->getSinglePredecessor();
    return InsertBB && InsertBB == InstBB;
  };
  for (int I = 0, E = VL.size(); I < E; ++I) {
    if (auto *Inst = dyn_cast<Instruction>(VL[I]))
      if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) ||
           /*...*/
           (L && (!Root || L->isLoopInvariant(Root)) && L->contains(Inst))) &&
          PostponedIndices.insert(I).second)
        PostponedInsts.emplace_back(Inst, I);
  }

  auto &&CreateInsertElement = [this](Value *Vec, Value *V, unsigned Pos,
                                      Type *Ty) {
    Value *Scalar = V;
    if (Scalar->getType() != Ty) {
      // ...
      Scalar = Builder.CreateIntCast(
          Scalar, Ty, !isKnownNonNegative(Scalar, SimplifyQuery(*DL)));
    }
    // ...
    Vec = Builder.CreateInsertElement(Vec, Scalar, Builder.getInt32(Pos));
    auto *InsElt = dyn_cast<InsertElementInst>(Vec);
    if (!InsElt)
      return Vec;
    GatherShuffleExtractSeq.insert(InsElt);
    // ...
    User *UserOp = nullptr;
    // ...
    if (V->getType()->isVectorTy()) {
      if (auto *SV = dyn_cast<ShuffleVectorInst>(InsElt->getOperand(0));
          SV && SV->getOperand(0) != V && SV->getOperand(1) != V) {
        // Find shufflevector, caused by resize.
        auto FindOperand = [](Value *Vec, Value *V) -> Instruction * {
          if (auto *SV = dyn_cast<ShuffleVectorInst>(Vec)) {
            if (SV->getOperand(0) == V)
              return SV;
            if (SV->getOperand(1) == V)
              return SV;
          }
          return nullptr;
        };
        if (Instruction *User = FindOperand(SV->getOperand(0), V))
          UserOp = User;
        else if (Instruction *User = FindOperand(SV->getOperand(1), V))
          UserOp = User;
        assert(UserOp && "Failed to find shufflevector, caused by resize.");
      }
    }
    if (UserOp) {
      unsigned FoundLane = Entries.front()->findLaneForValue(V);
      ExternalUses.emplace_back(V, UserOp, *Entries.front(), FoundLane);
    }
    return Vec;
  };
  auto *VecTy = getWidenedType(ScalarTy, VL.size());
  Value *Vec = PoisonValue::get(VecTy);
  SmallVector<int> NonConsts;
  SmallVector<int> Mask(VL.size());
  std::iota(Mask.begin(), Mask.end(), 0);
  Value *OriginalRoot = Root;
  if (auto *SV = dyn_cast_or_null<ShuffleVectorInst>(Root);
      SV && isa<PoisonValue>(SV->getOperand(1)) &&
      SV->getOperand(0)->getType() == VecTy) {
    Root = SV->getOperand(0);
    Mask.assign(SV->getShuffleMask().begin(), SV->getShuffleMask().end());
  }
  // Insert constant values at first.
  for (int I = 0, E = VL.size(); I < E; ++I) {
    if (PostponedIndices.contains(I))
      continue;
    if (!isConstant(VL[I])) {
      NonConsts.push_back(I);
      continue;
    }
    // ...
    Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
  }
  if (isa<PoisonValue>(Vec)) {
    Vec = OriginalRoot;
  } else {
    Vec = CreateShuffle(Root, Vec, Mask);
    if (auto *OI = dyn_cast<Instruction>(OriginalRoot);
        OI && OI->use_empty() &&
        none_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
          return TE->VectorizedValue == OI;
        }))
      eraseInstruction(OI);
  }
  // Insert non-constant values.
  for (int I : NonConsts)
    Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
  // Append instructions, which are/may be part of the loop, in the end to make
  // it possible to hoist non-loop-based instructions.
  for (const std::pair<Value *, unsigned> &Pair : PostponedInsts)
    Vec = CreateInsertElement(Vec, Pair.first, Pair.second, ScalarTy);

  return Vec;
}
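/// Merges shuffle masks and emits the final shuffle instruction(s), if
/// required. Supports shuffling of up to 2 input vectors and postpones the
/// actual IR emission until all shuffled values are known (lazy emission).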
class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
  bool IsFinalized = false;
  SmallVector<int> CommonMask;
  SmallVector<Value *, 2> InVectors;
  IRBuilderBase &Builder;
  BoUpSLP &R;

  /// IR builder, used by BaseShuffleAnalysis::createShuffle to emit the
  /// actual shuffle instructions.
  class ShuffleIRBuilder {
    IRBuilderBase &Builder;
    /// Holds all of the created shuffles and inserts for further CSE.
    SetVector<Instruction *> &GatherShuffleExtractSeq;
    /// Blocks containing the created instructions, for CSE.
    DenseSet<BasicBlock *> &CSEBlocks;
    const DataLayout &DL;

  public:
    ShuffleIRBuilder(IRBuilderBase &Builder,
                     SetVector<Instruction *> &GatherShuffleExtractSeq,
                     DenseSet<BasicBlock *> &CSEBlocks, const DataLayout &DL)
        : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq),
          CSEBlocks(CSEBlocks), DL(DL) {}
    ~ShuffleIRBuilder() = default;
    /// Creates a shuffle of 2 vectors, casting to a common integer element
    /// type first, if needed.
    Value *createShuffleVector(Value *V1, Value *V2, ArrayRef<int> Mask) {
      if (V1->getType() != V2->getType()) {
        assert(V1->getType()->isIntOrIntVectorTy() &&
               V2->getType()->isIntOrIntVectorTy() &&
               "Expected integer vector types only.");
        if (V1->getType()->getScalarType()->getIntegerBitWidth() <
            V2->getType()->getScalarType()->getIntegerBitWidth())
          V2 = Builder.CreateIntCast(
              V2, V1->getType(), !isKnownNonNegative(V2, SimplifyQuery(DL)));
        else
          V1 = Builder.CreateIntCast(
              V1, V2->getType(), !isKnownNonNegative(V1, SimplifyQuery(DL)));
      }
      Value *Vec = Builder.CreateShuffleVector(V1, V2, Mask);
      if (auto *I = dyn_cast<Instruction>(Vec)) {
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
      return Vec;
    }
    /// Creates a permutation of a single vector.
    Value *createShuffleVector(Value *V1, ArrayRef<int> Mask) {
      unsigned VF = Mask.size();
      // ...
      Value *Vec = Builder.CreateShuffleVector(V1, Mask);
      if (auto *I = dyn_cast<Instruction>(Vec)) {
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
      return Vec;
    }
    Value *createIdentity(Value *V) { return V; }
    Value *createPoison(Type *Ty, unsigned VF) {
      return PoisonValue::get(getWidenedType(Ty, VF));
    }
    /// Resizes 2 input vectors to match their sizes, if they are not equal
    /// yet. The smaller vector is resized to the size of the larger one.
    void resizeToMatch(Value *&V1, Value *&V2) {
      if (V1->getType() == V2->getType())
        return;
      int V1VF = cast<FixedVectorType>(V1->getType())->getNumElements();
      int V2VF = cast<FixedVectorType>(V2->getType())->getNumElements();
      int VF = std::max(V1VF, V2VF);
      int MinVF = std::min(V1VF, V2VF);
      SmallVector<int> IdentityMask(VF, PoisonMaskElem);
      std::iota(IdentityMask.begin(), std::next(IdentityMask.begin(), MinVF),
                0);
      Value *&Op = MinVF == V1VF ? V1 : V2;
      Op = Builder.CreateShuffleVector(Op, IdentityMask);
      if (auto *I = dyn_cast<Instruction>(Op)) {
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
    }
  };

  /// Smart shuffle instruction emission, walks through the shuffle trees and
  /// tries to find the best matching vector for the actual shuffle.
  Value *createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask) {
    assert(V1 && "Expected at least one vector value.");
    ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
                                    R.CSEBlocks, *R.DL);
    return BaseShuffleAnalysis::createShuffle<Value *>(V1, V2, Mask,
                                                       ShuffleBuilder,
                                                       ScalarTy);
  }

  /// Cast value \p V to the vector type with the same number of elements, but
  /// with the base type \p ScalarTy.
  Value *castToScalarTyElem(Value *V,
                            std::optional<bool> IsSigned = std::nullopt) {
    auto *VecTy = cast<VectorType>(V->getType());
    if (VecTy->getElementType() == ScalarTy->getScalarType())
      return V;
    return Builder.CreateIntCast(
        V,
        VectorType::get(ScalarTy->getScalarType(), VecTy->getElementCount()),
        IsSigned.value_or(!isKnownNonNegative(V, SimplifyQuery(*R.DL))));
  }

  Value *getVectorizedValue(const TreeEntry &E) {
    Value *Vec = E.VectorizedValue;
    // ...
    return castToScalarTyElem(Vec, any_of(E.Scalars, [&](Value *V) {
                                return !isa<PoisonValue>(V) &&
                                       !isKnownNonNegative(
                                           V, SimplifyQuery(*R.DL));
                              }));
  }

public:
  ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
      : BaseShuffleAnalysis(ScalarTy), Builder(Builder), R(R) {}

  /// Adjusts extractelements after reusing them.
  Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
                        ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
                        unsigned NumParts, bool &UseVecBaseAsInput) {
    UseVecBaseAsInput = false;
    SmallPtrSet<Value *, 4> UniqueBases;
    Value *VecBase = nullptr;
    SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
    if (!E->ReorderIndices.empty()) {
      SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
                                   E->ReorderIndices.end());
      reorderScalars(VL, ReorderMask);
    }
    for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
      if (Mask[I] == PoisonMaskElem)
        continue;
      auto *EI = cast<ExtractElementInst>(VL[I]);
      VecBase = EI->getVectorOperand();
      if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecBase); !TEs.empty())
        VecBase = TEs.front()->VectorizedValue;
      assert(VecBase && "Expected vectorized value.");
      UniqueBases.insert(VecBase);
      // If the only one use, used by a vectorized node - keep the extract.
      if (!EI->hasOneUse() || R.ExternalUsesAsOriginalScalar.contains(EI) ||
          (NumParts != 1 && count(VL, EI) > 1) ||
          any_of(EI->users(), [&](User *U) {
            ArrayRef<TreeEntry *> UTEs = R.getTreeEntries(U);
            return UTEs.empty() || UTEs.size() > 1 ||
                   (isa<GetElementPtrInst>(U) &&
                    !R.areAllUsersVectorized(cast<Instruction>(U))) ||
                   count_if(R.VectorizableTree,
                            [&](const std::unique_ptr<TreeEntry> &TE) {
                              return TE->UserTreeIndex.UserTE ==
                                         UTEs.front() &&
                                     is_contained(VL, EI);
                            }) != 1;
          }))
        continue;
      R.eraseInstruction(EI);
    }
    if (NumParts == 1 || UniqueBases.size() == 1) {
      assert(VecBase && "Expected vectorized value.");
      return castToScalarTyElem(VecBase);
    }
    UseVecBaseAsInput = true;
    // ...
    // Perform multi-register shuffle of the vector of extractelements.
    Value *Vec = nullptr;
    SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
    unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
    for (unsigned Part : seq<unsigned>(NumParts)) {
      unsigned Limit = getNumElems(VL.size(), SliceSize, Part);
      ArrayRef<Value *> SubVL = ArrayRef(VL).slice(Part * SliceSize, Limit);
      MutableArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
      constexpr int MaxBases = 2;
      SmallVector<Value *, MaxBases> Bases(MaxBases);
      auto VLMask = zip(SubVL, SubMask);
      const unsigned VF = std::accumulate(
          VLMask.begin(), VLMask.end(), 0U, [&](unsigned S, const auto &D) {
            if (std::get<1>(D) == PoisonMaskElem)
              return S;
            Value *VecOp =
                cast<ExtractElementInst>(std::get<0>(D))->getVectorOperand();
            if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecOp);
                !TEs.empty())
              VecOp = TEs.front()->VectorizedValue;
            assert(VecOp && "Expected vectorized value.");
            const unsigned Size =
                cast<FixedVectorType>(VecOp->getType())->getNumElements();
            return std::max(S, Size);
          });
      for (const auto [V, I] : VLMask) {
        if (I == PoisonMaskElem)
          continue;
        Value *VecOp = cast<ExtractElementInst>(V)->getVectorOperand();
        if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecOp); !TEs.empty())
          VecOp = TEs.front()->VectorizedValue;
        assert(VecOp && "Expected vectorized value.");
        VecOp = castToScalarTyElem(VecOp);
        Bases[I / VF] = VecOp;
      }
      if (!Bases.front())
        continue;
      Value *SubVec;
      if (Bases.back()) {
        SubVec = createShuffle(Bases.front(), Bases.back(), SubMask);
        TransformToIdentity(SubMask);
      } else {
        SubVec = Bases.front();
      }
      if (!Vec) {
        Vec = SubVec;
        assert((Part == 0 ||
                all_of(seq<unsigned>(0, Part),
                       [&](unsigned P) {
                         ArrayRef<int> SubMask = Mask.slice(
                             P * SliceSize,
                             getNumElems(Mask.size(), SliceSize, P));
                         return all_of(SubMask, [](int Idx) {
                           return Idx == PoisonMaskElem;
                         });
                       })) &&
               "Expected first part or all previous parts masked.");
        copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
      } else {
        unsigned NewVF =
            cast<FixedVectorType>(Vec->getType())->getNumElements();
        if (Vec->getType() != SubVec->getType()) {
          unsigned SubVecVF =
              cast<FixedVectorType>(SubVec->getType())->getNumElements();
          NewVF = std::max(NewVF, SubVecVF);
        }
        // Adjust SubMask.
        for (int &Idx : SubMask)
          if (Idx != PoisonMaskElem)
            Idx += NewVF;
        copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
        Vec = createShuffle(Vec, SubVec, VecMask);
        TransformToIdentity(VecMask);
      }
    }
    copy(VecMask, Mask.begin());
    return Vec;
  }

  /// Checks if the specified entry \p E needs to be delayed because of its
  /// dependency nodes.
  std::optional<Value *>
  needToDelay(const TreeEntry *E,
              ArrayRef<SmallVector<const TreeEntry *>> Deps) const {
    // No need to delay emission if all deps are ready.
    if (all_of(Deps, [](ArrayRef<const TreeEntry *> TEs) {
          return all_of(
              TEs, [](const TreeEntry *TE) { return TE->VectorizedValue; });
        }))
      return std::nullopt;
    // Postpone gather emission, will be emitted after the end of the
    // process to keep correct order.
    auto *ResVecTy = getWidenedType(ScalarTy, E->getVectorFactor());
    return Builder.CreateAlignedLoad(
        ResVecTy,
        PoisonValue::get(PointerType::getUnqual(ScalarTy->getContext())),
        MaybeAlign());
  }

  /// Reset the builder to handle brand new set of vectors (same node).
  void resetForSameNode() {
    IsFinalized = false;
    CommonMask.clear();
    InVectors.clear();
  }

  /// Adds 2 input vectors (in form of tree entries) and the mask for their
  /// shuffling.
  void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
    Value *V1 = getVectorizedValue(E1);
    Value *V2 = getVectorizedValue(E2);
    add(V1, V2, Mask);
  }
  /// Adds single input vector (in form of tree entry) and the mask for its
  /// shuffling.
  void add(const TreeEntry &E1, ArrayRef<int> Mask) {
    Value *V1 = getVectorizedValue(E1);
    add(V1, Mask);
  }
  /// Adds 2 input vectors and the mask for their shuffling.
  void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
    assert(V1 && V2 && !Mask.empty() && "Expected non-empty input vectors.");
    assert(isa<FixedVectorType>(V1->getType()) &&
           isa<FixedVectorType>(V2->getType()) &&
           "castToScalarTyElem expects V1 and V2 to be FixedVectorType");
    V1 = castToScalarTyElem(V1);
    V2 = castToScalarTyElem(V2);
    if (InVectors.empty()) {
      InVectors.push_back(V1);
      InVectors.push_back(V2);
      CommonMask.assign(Mask.begin(), Mask.end());
      return;
    }
    Value *Vec = InVectors.front();
    if (InVectors.size() == 2) {
      Vec = createShuffle(Vec, InVectors.back(), CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    } else if (cast<FixedVectorType>(Vec->getType())->getNumElements() !=
               Mask.size()) {
      Vec = createShuffle(Vec, nullptr, CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    }
    V1 = createShuffle(V1, V2, Mask);
    unsigned VF = std::max(getVF(V1), getVF(Vec));
    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
      if (Mask[Idx] != PoisonMaskElem)
        CommonMask[Idx] = Idx + VF;
    InVectors.front() = Vec;
    if (InVectors.size() == 2)
      InVectors.back() = V1;
    else
      InVectors.push_back(V1);
  }
  /// Adds another one input vector and the mask for the shuffling.
  void add(Value *V1, ArrayRef<int> Mask, bool ForExtracts = false) {
    assert(isa<FixedVectorType>(V1->getType()) &&
           "castToScalarTyElem expects V1 to be FixedVectorType");
    V1 = castToScalarTyElem(V1);
    if (InVectors.empty()) {
      InVectors.push_back(V1);
      CommonMask.assign(Mask.begin(), Mask.end());
      return;
    }
    const auto *It = find(InVectors, V1);
    if (It == InVectors.end()) {
      if (InVectors.size() == 2 ||
          InVectors.front()->getType() != V1->getType()) {
        Value *V = InVectors.front();
        if (InVectors.size() == 2) {
          V = createShuffle(InVectors.front(), InVectors.back(), CommonMask);
          transformMaskAfterShuffle(CommonMask, CommonMask);
        } else if (cast<FixedVectorType>(V->getType())->getNumElements() !=
                   CommonMask.size()) {
          V = createShuffle(InVectors.front(), nullptr, CommonMask);
          transformMaskAfterShuffle(CommonMask, CommonMask);
        }
        unsigned VF = std::max(CommonMask.size(), Mask.size());
        for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
          if (CommonMask[Idx] == PoisonMaskElem && Mask[Idx] != PoisonMaskElem)
            CommonMask[Idx] = V->getType() != V1->getType()
                                  ? Idx + VF
                                  : Mask[Idx] + getVF(V1);
        if (V->getType() != V1->getType())
          V1 = createShuffle(V1, nullptr, Mask);
        InVectors.front() = V;
        if (InVectors.size() == 2)
          InVectors.back() = V1;
        else
          InVectors.push_back(V1);
        return;
      }
      // Check if the second vector is required if the used elements are
      // already used from the first one.
      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
        if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem) {
          InVectors.push_back(V1);
          break;
        }
    }
    unsigned VF = 0;
    for (Value *V : InVectors)
      VF = std::max(VF, getVF(V));
    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
      if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
        CommonMask[Idx] = Mask[Idx] + (It == InVectors.begin() ? 0 : VF);
  }
  /// Adds another one input vector and the inverted order mask.
  void addOrdered(Value *V1, ArrayRef<unsigned> Order) {
    SmallVector<int> NewMask;
    inversePermutation(Order, NewMask);
    add(V1, NewMask);
  }
  Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
                Value *Root = nullptr) {
    return R.gather(VL, Root, ScalarTy,
                    [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
                      return createShuffle(V1, V2, Mask);
                    });
  }
  Value *createFreeze(Value *V) { return Builder.CreateFreeze(V); }
  /// Finalize emission of the shuffles.
  /// \param Action the action (if any) to be performed before final applying
  /// of the \p ExtMask mask.
  Value *finalize(
      ArrayRef<int> ExtMask,
      ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
      ArrayRef<int> SubVectorsMask, unsigned VF = 0,
      function_ref<void(Value *&, SmallVectorImpl<int> &,
                        function_ref<Value *(Value *, Value *, ArrayRef<int>)>)>
          Action = {}) {
    IsFinalized = true;
    if (Action) {
      Value *Vec = InVectors.front();
      if (InVectors.size() == 2) {
        Vec = createShuffle(Vec, InVectors.back(), CommonMask);
        InVectors.pop_back();
      } else {
        Vec = createShuffle(Vec, nullptr, CommonMask);
      }
      transformMaskAfterShuffle(CommonMask, CommonMask);
      assert(VF > 0 &&
             "Expected vector length for the final value before action.");
      unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
      if (VecVF < VF) {
        SmallVector<int> ResizeMask(VF, PoisonMaskElem);
        std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
        Vec = createShuffle(Vec, nullptr, ResizeMask);
      }
      Action(Vec, CommonMask,
             [this](Value *V1, Value *V2, ArrayRef<int> Mask) {
               return createShuffle(V1, V2, Mask);
             });
      InVectors.front() = Vec;
    }
    if (!SubVectors.empty()) {
      Value *Vec = InVectors.front();
      if (InVectors.size() == 2) {
        Vec = createShuffle(Vec, InVectors.back(), CommonMask);
        InVectors.pop_back();
      } else {
        Vec = createShuffle(Vec, nullptr, CommonMask);
      }
      transformMaskAfterShuffle(CommonMask, CommonMask);
      auto CreateSubVectors = [&](Value *Vec,
                                  SmallVectorImpl<int> &CommonMask) {
        for (auto [E, Idx] : SubVectors) {
          Value *V = getVectorizedValue(*E);
          // ...
          const unsigned InsertionIndex = Idx * getNumElements(ScalarTy);
          Type *OrigScalarTy = ScalarTy;
          Vec = createInsertVector(
              Builder, Vec, V, InsertionIndex,
              std::bind(&ShuffleInstructionBuilder::createShuffle, this, _1,
                        _2, _3));
          ScalarTy = OrigScalarTy;
          if (!CommonMask.empty()) {
            std::iota(
                std::next(CommonMask.begin(), Idx),
                std::next(CommonMask.begin(), Idx + E->getVectorFactor()),
                Idx);
          }
        }
        return Vec;
      };
      if (SubVectorsMask.empty()) {
        Vec = CreateSubVectors(Vec, CommonMask);
      } else {
        SmallVector<int> SVMask(CommonMask.size(), PoisonMaskElem);
        copy(SubVectorsMask, SVMask.begin());
        for (auto [I1, I2] : zip(SVMask, CommonMask)) {
          if (I2 != PoisonMaskElem) {
            assert(I1 == PoisonMaskElem && "Expected unused subvectors mask");
            I1 = I2 + CommonMask.size();
          }
        }
        Value *InsertVec =
            CreateSubVectors(PoisonValue::get(Vec->getType()), CommonMask);
        Vec = createShuffle(InsertVec, Vec, SVMask);
        transformMaskAfterShuffle(CommonMask, SVMask);
      }
      InVectors.front() = Vec;
    }

    if (!ExtMask.empty()) {
      if (CommonMask.empty()) {
        CommonMask.assign(ExtMask.begin(), ExtMask.end());
      } else {
        SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
        for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
          if (ExtMask[I] == PoisonMaskElem)
            continue;
          NewMask[I] = CommonMask[ExtMask[I]];
        }
        CommonMask.swap(NewMask);
      }
    }
    if (CommonMask.empty()) {
      assert(InVectors.size() == 1 && "Expected only one vector with no mask");
      return InVectors.front();
    }
    if (InVectors.size() == 2)
      return createShuffle(InVectors.front(), InVectors.back(), CommonMask);
    return createShuffle(InVectors.front(), nullptr, CommonMask);
  }

  ~ShuffleInstructionBuilder() {
    assert((IsFinalized || CommonMask.empty()) &&
           "Shuffle construction must be finalized.");
  }
};
Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx) {
  // ...
}
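/// Common driver for building the value of a gather node \p E. The \p BVTy
/// template parameter abstracts the shuffle builder, so the same logic is
/// shared by cost estimation and actual IR emission: the node is built from
/// reused extractelements, shuffles of already-vectorized entries, or a plain
/// buildvector, whichever combination is available.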
template <typename BVTy, typename ResTy, typename... Args>
ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
                                  Args &...Params) {
  assert(E->isGather() && "Expected gather node.");
  unsigned VF = E->getVectorFactor();
  bool NeedFreeze = false;
  SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end());
  // ...
  for (auto [EIdx, Idx] : E->CombinedEntriesWithIndices)
    fill(MutableArrayRef(GatheredScalars)
             .slice(Idx, VectorizableTree[EIdx]->getVectorFactor()),
         PoisonValue::get(ScalarTy));
  SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
      E->CombinedEntriesWithIndices.size());
  transform(E->CombinedEntriesWithIndices, SubVectors.begin(),
            [&](const auto &P) {
              return std::make_pair(VectorizableTree[P.first].get(), P.second);
            });
  // ...
  SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
                               E->ReorderIndices.end());
  if (!ReorderMask.empty())
    reorderScalars(GatheredScalars, ReorderMask);
  SmallVector<Value *> StoredGS(GatheredScalars);
  SmallVector<int> SubVectorsMask;
  // ...
  // Drop subvector lanes that are already covered by the reordered scalars.
  if (!SubVectors.empty() && !SubVectorsMask.empty()) {
    for (unsigned I : seq<unsigned>(GatheredScalars.size()))
      if (E->Scalars[I] == GatheredScalars[ReorderMask[I]])
        SubVectorsMask[ReorderMask[I]] = PoisonMaskElem;
    if (all_of(SubVectorsMask, [](int Idx) { return Idx == PoisonMaskElem; }))
      SubVectorsMask.clear();
  }
  auto FindReusedSplat = [&](MutableArrayRef<int> Mask, unsigned InputVF,
                             unsigned I, unsigned SliceSize,
                             bool IsNotPoisonous) {
    if (!isSplat(E->Scalars) || none_of(E->Scalars, [](Value *V) {
          return isa<UndefValue>(V) && !isa<PoisonValue>(V);
        }))
      return false;
    TreeEntry *UserTE = E->UserTreeIndex.UserTE;
    unsigned EdgeIdx = E->UserTreeIndex.EdgeIdx;
    if (UserTE->getNumOperands() != 2)
      return false;
    if (!IsNotPoisonous) {
      auto *It =
          find_if(ArrayRef(VectorizableTree).drop_front(UserTE->Idx + 1),
                  [=](const std::unique_ptr<TreeEntry> &TE) {
                    return TE->UserTreeIndex.UserTE == UserTE &&
                           TE->UserTreeIndex.EdgeIdx != EdgeIdx;
                  });
      if (It == VectorizableTree.end())
        return false;
      SmallVector<Value *> GS((*It)->Scalars.begin(), (*It)->Scalars.end());
      if (!(*It)->ReorderIndices.empty()) {
        inversePermutation((*It)->ReorderIndices, ReorderMask);
        reorderScalars(GS, ReorderMask);
      }
      if (!all_of(zip(GatheredScalars, GS), [&](const auto &P) {
            Value *V0 = std::get<0>(P);
            Value *V1 = std::get<1>(P);
            return !isa<UndefValue>(V0) || isa<PoisonValue>(V0) ||
                   (isa<UndefValue>(V0) && !isa<PoisonValue>(V1));
          }))
        return false;
    }
    unsigned Idx;
    if ((Mask.size() < InputVF &&
         ShuffleVectorInst::isExtractSubvectorMask(Mask, InputVF, Idx) &&
         Idx == 0) ||
        (Mask.size() == InputVF &&
         ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))) {
      std::iota(
          std::next(Mask.begin(), I * SliceSize),
          std::next(Mask.begin(),
                    I * SliceSize + getNumElems(Mask.size(), SliceSize, I)),
          0);
    } else {
      unsigned IVal =
          *find_if_not(Mask, [](int Idx) { return Idx == PoisonMaskElem; });
      std::fill(
          std::next(Mask.begin(), I * SliceSize),
          std::next(Mask.begin(),
                    I * SliceSize + getNumElems(Mask.size(), SliceSize, I)),
          IVal);
    }
    return true;
  };
  BVTy ShuffleBuilder(ScalarTy, Params...);
  ResTy Res = ResTy();
  SmallVector<int> Mask;
  SmallVector<int> ExtractMask(GatheredScalars.size(), PoisonMaskElem);
  SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles;
  Value *ExtractVecBase = nullptr;
  bool UseVecBaseAsInput = false;
  SmallVector<std::optional<TTI::ShuffleKind>> GatherShuffles;
  SmallVector<SmallVector<const TreeEntry *>> Entries;
  Type *OrigScalarTy = GatheredScalars.front()->getType();
  auto *VecTy = getWidenedType(ScalarTy, GatheredScalars.size());
  unsigned NumParts = ::getNumberOfParts(*TTI, VecTy, GatheredScalars.size());
  bool Resized = false;
  // Check for gathered extracts.
  if (!all_of(GatheredScalars, IsaPred<UndefValue>)) {
    ExtractShuffles =
        tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
    if (!ExtractShuffles.empty()) {
      SmallVector<const TreeEntry *> ExtractEntries;
      for (auto [Idx, I] : enumerate(ExtractMask)) {
        if (I == PoisonMaskElem)
          continue;
        if (ArrayRef<TreeEntry *> TEs = getTreeEntries(
                cast<ExtractElementInst>(StoredGS[Idx])->getVectorOperand());
            !TEs.empty())
          ExtractEntries.append(TEs.begin(), TEs.end());
      }
      if (std::optional<ResTy> Delayed =
              ShuffleBuilder.needToDelay(E, ExtractEntries)) {
        // Delay emission of gathers which are not ready yet.
        PostponedGathers.insert(E);
        // Postpone gather emission, will be emitted after the end of the
        // process to keep correct order.
        return *Delayed;
      }
      if (Value *VecBase = ShuffleBuilder.adjustExtracts(
              E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) {
        ExtractVecBase = VecBase;
        if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
          if (VF == VecBaseTy->getNumElements() &&
              GatheredScalars.size() != VF) {
            Resized = true;
            GatheredScalars.append(VF - GatheredScalars.size(),
                                   PoisonValue::get(OrigScalarTy));
          }
      }
    }
    // Gather extracts after we check for full matched gathers only.
    if (!ExtractShuffles.empty() || !E->hasState() ||
        E->getOpcode() != Instruction::Load ||
        (((E->hasState() && E->getOpcode() == Instruction::Load) ||
          any_of(E->Scalars, IsaPred<LoadInst>)) &&
         any_of(E->Scalars,
                [this](Value *V) {
                  return isa<LoadInst>(V) && isVectorized(V);
                })) ||
        (E->hasState() && E->isAltShuffle()) ||
        all_of(E->Scalars, [this](Value *V) { return isVectorized(V); }) ||
        isSplat(E->Scalars) ||
        (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) {
      GatherShuffles =
          isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts);
    }
    if (!GatherShuffles.empty()) {
      if (std::optional<ResTy> Delayed =
              ShuffleBuilder.needToDelay(E, Entries)) {
        // Delay emission of gathers which are not ready yet.
        PostponedGathers.insert(E);
        // Postpone gather emission, will be emitted after the end of the
        // process to keep correct order.
        return *Delayed;
      }
      if (GatherShuffles.size() == 1 &&
          *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
          Entries.front().front()->isSame(E->Scalars)) {
        // Perfectly matched entry found - just reuse it.
        LLVM_DEBUG(dbgs() << "SLP: perfect diamond match for gather bundle "
                          << shortBundleName(E->Scalars) << ".\n");
        // Restore the mask for previous partially matched values.
        Mask.resize(E->Scalars.size());
        const TreeEntry *FrontTE = Entries.front().front();
        if (FrontTE->ReorderIndices.empty() &&
            ((FrontTE->ReuseShuffleIndices.empty() &&
              E->Scalars.size() == FrontTE->Scalars.size()) ||
             (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) {
          std::iota(Mask.begin(), Mask.end(), 0);
        } else {
          for (auto [I, V] : enumerate(E->Scalars)) {
            if (isa<PoisonValue>(V)) {
              Mask[I] = PoisonMaskElem;
              continue;
            }
            Mask[I] = FrontTE->findLaneForValue(V);
          }
        }
        ShuffleBuilder.resetForSameNode();
        ShuffleBuilder.add(*FrontTE, Mask);
        Res = ShuffleBuilder.finalize(E->getCommonMask(), {}, {});
        return Res;
      }
      if (!Resized) {
        if (GatheredScalars.size() != VF &&
            any_of(Entries, [&](ArrayRef<const TreeEntry *> TEs) {
              return any_of(TEs, [&](const TreeEntry *TE) {
                return TE->getVectorFactor() == VF;
              });
            }))
          GatheredScalars.append(VF - GatheredScalars.size(),
                                 PoisonValue::get(OrigScalarTy));
      }
      // Remove shuffled elements from the list of gathers.
      for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
        if (Mask[I] != PoisonMaskElem)
          GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
      }
    }
  }
  auto TryPackScalars = [&](SmallVectorImpl<Value *> &Scalars,
                            SmallVectorImpl<int> &ReuseMask,
                            bool IsRootPoison) {
    // For splats we can emit broadcasts instead of gathers, so try to find
    // such sequences.
    bool IsSplat = IsRootPoison && isSplat(Scalars) &&
                   (Scalars.size() > 2 || Scalars.front() == Scalars.back());
    Scalars.append(VF - Scalars.size(), PoisonValue::get(OrigScalarTy));
    SmallVector<int> UndefPos;
    DenseMap<Value *, unsigned> UniquePositions;
    // Gather unique non-const values and all constant values.
    // For repeated values, just shuffle them.
    int NumNonConsts = 0;
    int SinglePos = 0;
    for (auto [I, V] : enumerate(Scalars)) {
      if (isa<UndefValue>(V)) {
        if (!isa<PoisonValue>(V)) {
          ReuseMask[I] = I;
          UndefPos.push_back(I);
        }
        continue;
      }
      if (isConstant(V)) {
        ReuseMask[I] = I;
        continue;
      }
      ++NumNonConsts;
      SinglePos = I;
      Value *OrigV = V;
      Scalars[I] = PoisonValue::get(OrigScalarTy);
      if (IsSplat) {
        Scalars.front() = OrigV;
        ReuseMask[I] = 0;
      } else {
        const auto Res = UniquePositions.try_emplace(OrigV, I);
        Scalars[Res.first->second] = OrigV;
        ReuseMask[I] = Res.first->second;
      }
    }
    if (NumNonConsts == 1) {
      // Restore single insert element.
      if (IsSplat) {
        ReuseMask.assign(VF, PoisonMaskElem);
        std::swap(Scalars.front(), Scalars[SinglePos]);
        if (!UndefPos.empty() && UndefPos.front() == 0)
          Scalars.front() = UndefValue::get(OrigScalarTy);
      }
      ReuseMask[SinglePos] = SinglePos;
    } else if (!UndefPos.empty() && IsSplat) {
      // For undef values, try to replace them with the simple broadcast.
      // We can do it if the broadcasted value is guaranteed to be
      // non-poisonous, or by freezing the incoming scalar value first.
      auto *It = find_if(Scalars, [this, E](Value *V) {
        return !isa<UndefValue>(V) &&
               (isVectorized(V) || isGuaranteedNotToBePoison(V, AC) ||
                (E->UserTreeIndex &&
                 any_of(V->uses(), [E](const Use &U) {
                   // Check if the value already used in the same operation in
                   // one of the nodes already.
                   return E->UserTreeIndex.EdgeIdx != U.getOperandNo() &&
                          is_contained(E->UserTreeIndex.UserTE->Scalars,
                                       U.getUser());
                 })));
      });
      if (It != Scalars.end()) {
        // Replace undefs by the non-poisoned scalars and emit broadcast.
        int Pos = std::distance(Scalars.begin(), It);
        for (int I : UndefPos) {
          // Set the undef position to the non-poisoned scalar.
          ReuseMask[I] = Pos;
          // Replace the undef by the poison, in the mask it is replaced by
          // non-poisoned scalar already.
          if (I != Pos)
            Scalars[I] = PoisonValue::get(OrigScalarTy);
        }
      } else {
        // Replace undefs by the poisons, emit broadcast and then emit
        // freeze.
        for (int I : UndefPos) {
          ReuseMask[I] = PoisonMaskElem;
          if (isa<UndefValue>(Scalars[I]))
            Scalars[I] = PoisonValue::get(OrigScalarTy);
        }
        NeedFreeze = true;
      }
    }
  };
  if (!ExtractShuffles.empty() || !GatherShuffles.empty()) {
    bool IsNonPoisoned = true;
    bool IsUsedInExpr = true;
    Value *Vec1 = nullptr;
    if (!ExtractShuffles.empty()) {
      // Gather of extractelements can be represented as just a shuffle of
      // a single/two vectors the scalars are extracted from.
      // Find input vectors.
      Value *Vec2 = nullptr;
      for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
        if (!Mask.empty() && Mask[I] != PoisonMaskElem)
          ExtractMask[I] = PoisonMaskElem;
      }
      if (UseVecBaseAsInput) {
        Vec1 = ExtractVecBase;
      } else {
        for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
          if (ExtractMask[I] == PoisonMaskElem)
            continue;
          auto *EI = cast<ExtractElementInst>(StoredGS[I]);
          Value *VecOp = EI->getVectorOperand();
          if (ArrayRef<TreeEntry *> TEs = getTreeEntries(VecOp);
              !TEs.empty() && TEs.front()->VectorizedValue)
            VecOp = TEs.front()->VectorizedValue;
          if (!Vec1) {
            Vec1 = VecOp;
          } else if (Vec1 != VecOp) {
            assert((!Vec2 || Vec2 == VecOp) &&
                   "Expected only 1 or 2 vectors shuffle.");
            Vec2 = VecOp;
          }
        }
      }
      if (Vec2) {
        IsUsedInExpr = false;
        IsNonPoisoned &= isGuaranteedNotToBePoison(Vec1, AC) &&
                         isGuaranteedNotToBePoison(Vec2, AC);
        ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
      } else if (Vec1) {
        bool IsNotPoisonedVec = isGuaranteedNotToBePoison(Vec1, AC);
        IsUsedInExpr &= FindReusedSplat(
            ExtractMask,
            cast<FixedVectorType>(Vec1->getType())->getNumElements(), 0,
            ExtractMask.size(), IsNotPoisonedVec);
        ShuffleBuilder.add(Vec1, ExtractMask, /*ForExtracts=*/true);
        IsNonPoisoned &= IsNotPoisonedVec;
      } else {
        IsUsedInExpr = false;
        ShuffleBuilder.add(PoisonValue::get(VecTy), ExtractMask,
                           /*ForExtracts=*/true);
      }
    }
    if (!GatherShuffles.empty()) {
      unsigned SliceSize = getPartNumElems(E->Scalars.size(), NumParts);
      SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
      for (const auto [I, TEs] : enumerate(Entries)) {
        if (TEs.empty()) {
          assert(!GatherShuffles[I] &&
                 "No shuffles with empty entries list expected.");
          continue;
        }
        assert((TEs.size() == 1 || TEs.size() == 2) &&
               "Expected shuffle of 1 or 2 entries.");
        unsigned Limit = getNumElems(Mask.size(), SliceSize, I);
        auto SubMask = ArrayRef(Mask).slice(I * SliceSize, Limit);
        VecMask.assign(VecMask.size(), PoisonMaskElem);
        copy(SubMask, std::next(VecMask.begin(), I * SliceSize));
        if (TEs.size() == 1) {
          bool IsNotPoisonedVec =
              TEs.front()->VectorizedValue
                  ? isGuaranteedNotToBePoison(TEs.front()->VectorizedValue, AC)
                  : true;
          IsUsedInExpr &=
              FindReusedSplat(VecMask, TEs.front()->getVectorFactor(), I,
                              SliceSize, IsNotPoisonedVec);
          ShuffleBuilder.add(*TEs.front(), VecMask);
          IsNonPoisoned &= IsNotPoisonedVec;
        } else {
          IsUsedInExpr = false;
          ShuffleBuilder.add(*TEs.front(), *TEs.back(), VecMask);
          if (TEs.front()->VectorizedValue && TEs.back()->VectorizedValue)
            IsNonPoisoned &=
                isGuaranteedNotToBePoison(TEs.front()->VectorizedValue, AC) &&
                isGuaranteedNotToBePoison(TEs.back()->VectorizedValue, AC);
        }
      }
    }
    // Try to figure out the best way to combine values: build a shuffle and
    // insert elements or just build several shuffles.
    // Insert non-constant scalars.
    SmallVector<Value *> NonConstants(GatheredScalars);
    int EMSz = ExtractMask.size();
    int MSz = Mask.size();
    // Try to build a constant vector and shuffle with it only if currently we
    // have a single permutation and more than 1 scalar constants.
    bool IsSingleShuffle = ExtractShuffles.empty() || GatherShuffles.empty();
    bool IsIdentityShuffle =
        ((UseVecBaseAsInput ||
          all_of(ExtractShuffles,
                 [](const std::optional<TTI::ShuffleKind> &SK) {
                   return SK.value_or(TTI::SK_PermuteTwoSrc) ==
                          TTI::SK_PermuteSingleSrc;
                 })) &&
         none_of(ExtractMask, [&](int I) { return I >= EMSz; }) &&
         ShuffleVectorInst::isIdentityMask(ExtractMask, EMSz)) ||
        (!GatherShuffles.empty() &&
         all_of(GatherShuffles,
                [](const std::optional<TTI::ShuffleKind> &SK) {
                  return SK.value_or(TTI::SK_PermuteTwoSrc) ==
                         TTI::SK_PermuteSingleSrc;
                }) &&
         none_of(Mask, [&](int I) { return I >= MSz; }) &&
         ShuffleVectorInst::isIdentityMask(Mask, MSz));
    bool EnoughConstsForShuffle =
        IsSingleShuffle &&
        (none_of(GatheredScalars,
                 [](Value *V) {
                   return isa<UndefValue>(V) && !isa<PoisonValue>(V);
                 }) ||
         any_of(GatheredScalars,
                [](Value *V) {
                  return isa<Constant>(V) && !isa<UndefValue>(V);
                })) &&
        (!IsIdentityShuffle ||
         (GatheredScalars.size() == 2 &&
          any_of(GatheredScalars,
                 [](Value *V) { return !isa<UndefValue>(V); })) ||
         count_if(GatheredScalars, [](Value *V) {
           return isa<Constant>(V) && !isa<PoisonValue>(V);
         }) > 1);
    // NonConstants array contains just non-constant values, GatheredScalars
    // contains only constants to build the final vector and then shuffle.
    for (int I = 0, Sz = GatheredScalars.size(); I < Sz; ++I) {
      if (EnoughConstsForShuffle && isa<Constant>(GatheredScalars[I]))
        NonConstants[I] = PoisonValue::get(OrigScalarTy);
      else
        GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
    }
    // Generate constants for the final shuffle and build a mask for them.
    if (!all_of(GatheredScalars, IsaPred<PoisonValue>)) {
      SmallVector<int> BVMask(GatheredScalars.size(), PoisonMaskElem);
      TryPackScalars(GatheredScalars, BVMask, /*IsRootPoison=*/true);
      Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.size());
      ShuffleBuilder.add(BV, BVMask);
    }
    if (all_of(NonConstants, [=](Value *V) {
          return isa<PoisonValue>(V) ||
                 (IsSingleShuffle &&
                  ((IsIdentityShuffle && IsNonPoisoned) || IsUsedInExpr) &&
                  isa<UndefValue>(V));
        }))
      Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
                                    SubVectorsMask);
    else
      Res = ShuffleBuilder.finalize(
          E->ReuseShuffleIndices, SubVectors, SubVectorsMask,
          E->Scalars.size(),
          [&](Value *&Vec, SmallVectorImpl<int> &Mask, auto CreateShuffle) {
            bool IsSplat = isSplat(NonConstants);
            SmallVector<int> BVMask(Mask.size(), PoisonMaskElem);
            TryPackScalars(NonConstants, BVMask, /*IsRootPoison=*/false);
            auto CheckIfSplatIsProfitable = [&]() {
              // Estimate the cost of splatting + shuffle and compare with
              // insert + shuffle.
              constexpr TTI::TargetCostKind CostKind =
                  TTI::TCK_RecipThroughput;
              Value *V = *find_if_not(NonConstants, IsaPred<UndefValue>);
              if (isa<ExtractElementInst>(V) || isVectorized(V))
                return false;
              InstructionCost SplatCost = TTI->getVectorInstrCost(
                  Instruction::InsertElement, VecTy, CostKind, /*Index=*/0,
                  PoisonValue::get(VecTy), V);
              SmallVector<int> NewMask(Mask.begin(), Mask.end());
              for (auto [Idx, I] : enumerate(BVMask))
                if (I != PoisonMaskElem)
                  NewMask[Idx] = Mask.size();
              SplatCost += ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy,
                                            NewMask, CostKind);
              InstructionCost BVCost = TTI->getVectorInstrCost(
                  Instruction::InsertElement, VecTy, CostKind,
                  *find_if(Mask, [](int I) { return I != PoisonMaskElem; }),
                  Vec, V);
              // Shuffle required?
              if (count(BVMask, PoisonMaskElem) <
                  static_cast<int>(BVMask.size() - 1)) {
                SmallVector<int> NewMask(Mask.begin(), Mask.end());
                for (auto [Idx, I] : enumerate(BVMask))
                  if (I != PoisonMaskElem)
                    NewMask[Idx] = I;
                BVCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
                                           VecTy, NewMask, CostKind);
              }
              return SplatCost <= BVCost;
            };
            if (!IsSplat || Mask.size() <= 2 || !CheckIfSplatIsProfitable()) {
              for (auto [Idx, I] : enumerate(BVMask))
                if (I != PoisonMaskElem)
                  Mask[Idx] = I;
              Vec = ShuffleBuilder.gather(NonConstants, Mask.size(), Vec);
            } else {
              Value *V = *find_if_not(NonConstants, IsaPred<UndefValue>);
              SmallVector<Value *> Values(NonConstants.size(),
                                          PoisonValue::get(OrigScalarTy));
              Values.front() = V;
              Value *BV = ShuffleBuilder.gather(Values, BVMask.size());
              SmallVector<int> SplatMask(BVMask.size(), PoisonMaskElem);
              transform(BVMask, SplatMask.begin(), [](int I) {
                return I == PoisonMaskElem ? PoisonMaskElem : 0;
              });
              if (!ShuffleVectorInst::isIdentityMask(SplatMask, VF))
                BV = CreateShuffle(BV, nullptr, SplatMask);
              for (auto [Idx, I] : enumerate(BVMask))
                if (I != PoisonMaskElem)
                  Mask[Idx] = BVMask.size() + Idx;
              Vec = CreateShuffle(Vec, BV, Mask);
            }
          });
  } else if (!allConstant(GatheredScalars)) {
    // Gather unique scalars and all constants.
    SmallVector<int> ReuseMask(GatheredScalars.size(), PoisonMaskElem);
    TryPackScalars(GatheredScalars, ReuseMask, /*IsRootPoison=*/true);
    Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.size());
    ShuffleBuilder.add(BV, ReuseMask);
    Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
                                  SubVectorsMask);
  } else {
    // Gather all constants.
    SmallVector<int> Mask(GatheredScalars.size(), PoisonMaskElem);
    for (auto [I, V] : enumerate(GatheredScalars)) {
      if (!isa<PoisonValue>(V))
        Mask[I] = I;
    }
    Value *BV = ShuffleBuilder.gather(GatheredScalars);
    ShuffleBuilder.add(BV, Mask);
    Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
                                  SubVectorsMask);
  }

  if (NeedFreeze)
    Res = ShuffleBuilder.createFreeze(Res);
  return Res;
}
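/// Emits the IR for a gather node by instantiating processBuildVector with
/// the IR-emitting ShuffleInstructionBuilder, after making sure all combined
/// sub-entries have been vectorized first.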
Value *BoUpSLP::createBuildVector(const TreeEntry *E, Type *ScalarTy) {
  for (auto [EIdx, _] : E->CombinedEntriesWithIndices)
    (void)vectorizeTree(VectorizableTree[EIdx].get());
  return processBuildVector<ShuffleInstructionBuilder, Value *>(E, ScalarTy,
                                                                Builder,
                                                                *this);
}
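/// Emits the vectorized value for tree entry \p E. Gather and split nodes are
/// handled up front; for vectorizable nodes the code is generated per opcode
/// in the switch below, followed by a FinalShuffle that applies reordering
/// and reuse masks.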
/// \returns \p I after propagating metadata from \p VL only for instructions
/// in \p VL.
static Instruction *propagateMetadata(Instruction *Inst, ArrayRef<Value *> VL) {
  if (VL.empty())
    return Inst;
  SmallVector<Value *> Insts;
  for (Value *V : VL)
    if (isa<Instruction>(V))
      Insts.push_back(V);
  return llvm::propagateMetadata(Inst, Insts);
}

Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
  IRBuilderBase::InsertPointGuard Guard(Builder);

  Value *V = E->Scalars.front();
  Type *ScalarTy = V->getType();
  // ...
  auto It = MinBWs.find(E);
  if (It != MinBWs.end()) {
    auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy);
    ScalarTy = IntegerType::get(F->getContext(), It->second.first);
    if (VecTy)
      ScalarTy = getWidenedType(ScalarTy, VecTy->getNumElements());
  }
  auto *VecTy = getWidenedType(ScalarTy, E->Scalars.size());
  if (E->VectorizedValue)
    return E->VectorizedValue;

  if (E->isGather()) {
    // ...
    if (E->hasState() && E->Idx == 0 && !UserIgnoreList)
      setInsertPointAfterBundle(E);
    Value *Vec = createBuildVector(E, ScalarTy);
    E->VectorizedValue = Vec;
    return Vec;
  }
  if (E->State == TreeEntry::SplitVectorize) {
    assert(E->CombinedEntriesWithIndices.size() == 2 &&
           "Expected exactly 2 combined entries.");
    setInsertPointAfterBundle(E);
    TreeEntry &OpTE1 =
        *VectorizableTree[E->CombinedEntriesWithIndices.front().first];
    assert(OpTE1.isSame(
               ArrayRef(E->Scalars).take_front(OpTE1.getVectorFactor())) &&
           "Expected same first part of scalars.");
    Value *Op1 = vectorizeTree(&OpTE1);
    TreeEntry &OpTE2 =
        *VectorizableTree[E->CombinedEntriesWithIndices.back().first];
    assert(
        OpTE2.isSame(ArrayRef(E->Scalars).take_back(OpTE2.getVectorFactor())) &&
        "Expected same second part of scalars.");
    Value *Op2 = vectorizeTree(&OpTE2);
    auto GetOperandSignedness = [&](const TreeEntry *OpE) {
      bool IsSigned = false;
      auto It = MinBWs.find(OpE);
      if (It != MinBWs.end())
        IsSigned = It->second.second;
      else
        IsSigned = any_of(OpE->Scalars, [&](Value *R) {
          if (isa<PoisonValue>(R))
            return false;
          return !isKnownNonNegative(R, SimplifyQuery(*DL));
        });
      return IsSigned;
    };
    if (cast<VectorType>(Op1->getType())->getElementType() != ScalarTy) {
      // ...
      Op1 = Builder.CreateIntCast(
          Op1, getWidenedType(ScalarTy, OpTE1.getVectorFactor()),
          GetOperandSignedness(&OpTE1));
    }
    if (cast<VectorType>(Op2->getType())->getElementType() != ScalarTy) {
      // ...
      Op2 = Builder.CreateIntCast(
          Op2, getWidenedType(ScalarTy, OpTE2.getVectorFactor()),
          GetOperandSignedness(&OpTE2));
    }
    if (E->ReorderIndices.empty()) {
      SmallVector<int> Mask(E->getVectorFactor(), PoisonMaskElem);
      std::iota(
          Mask.begin(),
          std::next(Mask.begin(), E->CombinedEntriesWithIndices.back().second),
          0);
      unsigned ScalarTyNumElements = getNumElements(ScalarTy);
      if (ScalarTyNumElements != 1) {
        // ...
      }
      Value *Vec = Builder.CreateShuffleVector(Op1, Mask);
      Vec = createInsertVector(Builder, Vec, Op2,
                               E->CombinedEntriesWithIndices.back().second *
                                   ScalarTyNumElements);
      E->VectorizedValue = Vec;
      return Vec;
    }
    unsigned CommonVF =
        std::max(OpTE1.getVectorFactor(), OpTE2.getVectorFactor());
    if (OpTE1.getVectorFactor() != CommonVF) {
      SmallVector<int> Mask(CommonVF, PoisonMaskElem);
      std::iota(Mask.begin(), std::next(Mask.begin(), OpTE1.getVectorFactor()),
                0);
      Op1 = Builder.CreateShuffleVector(Op1, Mask);
    }
    if (OpTE2.getVectorFactor() != CommonVF) {
      SmallVector<int> Mask(CommonVF, PoisonMaskElem);
      std::iota(Mask.begin(), std::next(Mask.begin(), OpTE2.getVectorFactor()),
                0);
      Op2 = Builder.CreateShuffleVector(Op2, Mask);
    }
    Value *Vec = Builder.CreateShuffleVector(Op1, Op2, E->getSplitMask());
    E->VectorizedValue = Vec;
    return Vec;
  }

  bool IsReverseOrder =
      !E->ReorderIndices.empty() && isReverseOrder(E->ReorderIndices);
  auto FinalShuffle = [&](Value *V, const TreeEntry *E) {
    ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, *this);
    if (E->getOpcode() == Instruction::Store &&
        E->State == TreeEntry::Vectorize) {
      ArrayRef<int> Mask =
          ArrayRef(reinterpret_cast<const int *>(E->ReorderIndices.begin()),
                   E->ReorderIndices.size());
      ShuffleBuilder.add(V, Mask);
    } else if ((E->State == TreeEntry::StridedVectorize && IsReverseOrder) ||
               E->State == TreeEntry::CompressVectorize) {
      ShuffleBuilder.addOrdered(V, {});
    } else {
      ShuffleBuilder.addOrdered(V, E->ReorderIndices);
    }
    SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
        E->CombinedEntriesWithIndices.size());
    transform(
        E->CombinedEntriesWithIndices, SubVectors.begin(), [&](const auto &P) {
          return std::make_pair(VectorizableTree[P.first].get(), P.second);
        });
    assert(
        (E->CombinedEntriesWithIndices.empty() || E->ReorderIndices.empty()) &&
        "Expected either combined subnodes or reordering");
    return ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors, {});
  };

  assert(!E->isGather() && "Unhandled state");
  unsigned ShuffleOrOp =
      E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
  Instruction *VL0 = E->getMainOp();
  auto GetOperandSignedness = [&](unsigned Idx) {
    const TreeEntry *OpE = getOperandEntry(E, Idx);
    bool IsSigned = false;
    auto It = MinBWs.find(OpE);
    if (It != MinBWs.end())
      IsSigned = It->second.second;
    else
      IsSigned = any_of(OpE->Scalars, [&](Value *R) {
        if (isa<PoisonValue>(R))
          return false;
        return !isKnownNonNegative(R, SimplifyQuery(*DL));
      });
    return IsSigned;
  };
  switch (ShuffleOrOp) {
  case Instruction::PHI: {
    assert((E->ReorderIndices.empty() || !E->ReuseShuffleIndices.empty() ||
            E != VectorizableTree.front().get() || E->UserTreeIndex) &&
           "PHI reordering is free.");
    auto *PH = cast<PHINode>(VL0);
    Builder.SetInsertPoint(PH->getParent(),
                           PH->getParent()->getFirstNonPHIIt());
    Builder.SetCurrentDebugLocation(PH->getDebugLoc());
    PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
    Value *V = NewPhi;

    // Adjust insertion point once all PHI's have been generated.
    Builder.SetInsertPoint(PH->getParent(),
                           PH->getParent()->getFirstInsertionPt());
    Builder.SetCurrentDebugLocation(PH->getDebugLoc());

    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    // ...

    // PHINodes may have multiple entries from the same block. We want to
    // visit every block once.
    SmallPtrSet<BasicBlock *, 4> VisitedBBs;

    for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
      BasicBlock *IBB = PH->getIncomingBlock(I);

      // Stop emission if all incoming values are generated.
      if (NewPhi->getNumIncomingValues() == PH->getNumIncomingValues()) {
        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
        return V;
      }

      if (!VisitedBBs.insert(IBB).second) {
        Value *VecOp = NewPhi->getIncomingValueForBlock(IBB);
        NewPhi->addIncoming(VecOp, IBB);
        TreeEntry *OpTE = getOperandEntry(E, I);
        assert(!OpTE->VectorizedValue && "Expected no vectorized value.");
        OpTE->VectorizedValue = VecOp;
        continue;
      }

      Builder.SetInsertPoint(IBB->getTerminator());
      Builder.SetCurrentDebugLocation(PH->getDebugLoc());
      Value *Vec = vectorizeOperand(E, I);
      if (VecTy != Vec->getType()) {
        assert((It != MinBWs.end() || getOperandEntry(E, I)->isGather() ||
                MinBWs.contains(getOperandEntry(E, I))) &&
               "Expected item in MinBWs.");
        Vec = Builder.CreateIntCast(Vec, VecTy, GetOperandSignedness(I));
      }
      NewPhi->addIncoming(Vec, IBB);
    }

    assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
           "Invalid number of incoming values");
    assert(E->VectorizedValue && "Expected vectorized value.");
    return E->VectorizedValue;
  }
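  // ExtractElement/ExtractValue nodes reuse the source vector directly: the
  // scalars were extracted from a vector (or a wide load), so no new vector
  // computation is required, only the final shuffle.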
  case Instruction::ExtractElement: {
    Value *V = E->getSingleOperand(0);
    // ...
    setInsertPointAfterBundle(E);
    V = FinalShuffle(V, E);
    E->VectorizedValue = V;
    return V;
  }
  case Instruction::ExtractValue: {
    auto *LI = cast<LoadInst>(E->getSingleOperand(0));
    Builder.SetInsertPoint(LI);
    Value *Ptr = LI->getPointerOperand();
    LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign());
    Value *NewV = ::propagateMetadata(V, E->Scalars);
    NewV = FinalShuffle(NewV, E);
    E->VectorizedValue = NewV;
    return NewV;
  }
  case Instruction::InsertElement: {
    assert(E->ReuseShuffleIndices.empty() && "All inserts should be unique");
    if (const TreeEntry *OpE = getOperandEntry(E, 1);
        OpE && !OpE->isGather() && OpE->hasState() &&
        !OpE->hasCopyableElements())
      Builder.SetInsertPoint(cast<Instruction>(E->Scalars.back()));
    else
      setInsertPointAfterBundle(E);
    Value *V = vectorizeOperand(E, 1);
    ArrayRef<Value *> Op = E->getOperand(1);
    Type *ScalarTy = Op.front()->getType();
    if (cast<VectorType>(V->getType())->getElementType() != ScalarTy) {
      assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
      std::pair<unsigned, bool> Res = MinBWs.lookup(getOperandEntry(E, 1));
      assert(Res.first > 0 && "Expected item in MinBWs.");
      V = Builder.CreateIntCast(
          V, getWidenedType(ScalarTy, getNumElements(V->getType())),
          Res.second);
    }

    // Create InsertVector shuffle if necessary.
    auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
      return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
    }));
    const unsigned NumElts =
        cast<FixedVectorType>(FirstInsert->getType())->getNumElements();
    const unsigned NumScalars = E->Scalars.size();

    unsigned Offset = *getElementIndex(VL0);
    assert(Offset < NumElts && "Failed to find vector index offset");

    // Create shuffle to resize vector.
    SmallVector<int> Mask;
    if (!E->ReorderIndices.empty()) {
      inversePermutation(E->ReorderIndices, Mask);
      Mask.append(NumElts - NumScalars, PoisonMaskElem);
    } else {
      Mask.assign(NumElts, PoisonMaskElem);
      std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0);
    }
    // Create InsertVector shuffle if necessary.
    bool IsIdentity = true;
    SmallVector<int> PrevMask(NumElts, PoisonMaskElem);
    Mask.swap(PrevMask);
    for (unsigned I = 0; I < NumScalars; ++I) {
      Value *Scalar = E->Scalars[PrevMask[I]];
      unsigned InsertIdx = *getElementIndex(Scalar);
      IsIdentity &= InsertIdx - Offset == I;
      Mask[InsertIdx - Offset] = I;
    }
    if (!IsIdentity || NumElts != NumScalars) {
      Value *V2 = nullptr;
      bool IsVNonPoisonous =
          !isConstant(V) && isGuaranteedNotToBePoison(V, AC);
      SmallVector<int> InsertMask(Mask);
      if (NumElts != NumScalars && Offset == 0) {
        // Follow all insert element instructions from the current buildvector
        // sequence.
        InsertElementInst *Ins = cast<InsertElementInst>(VL0);
        do {
          std::optional<unsigned> InsertIdx = getElementIndex(Ins);
          if (!InsertIdx)
            break;
          if (InsertMask[*InsertIdx] == PoisonMaskElem)
            InsertMask[*InsertIdx] = *InsertIdx;
          if (!Ins->hasOneUse())
            break;
          Ins = dyn_cast_or_null<InsertElementInst>(
              Ins->getUniqueUndroppableUser());
        } while (Ins);
        SmallBitVector UseMask =
            buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
        SmallBitVector IsFirstPoison =
            isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
        SmallBitVector IsFirstUndef =
            isUndefVector(FirstInsert->getOperand(0), UseMask);
        if (!IsFirstPoison.all()) {
          unsigned Idx = 0;
          for (unsigned I = 0; I < NumElts; I++) {
            if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I) &&
                IsFirstUndef.test(I)) {
              if (IsVNonPoisonous) {
                InsertMask[I] = I < NumScalars ? I : 0;
                continue;
              }
              if (!V2)
                V2 = UndefValue::get(V->getType());
              if (Idx >= NumScalars)
                Idx = NumScalars - 1;
              InsertMask[I] = NumScalars + Idx;
              ++Idx;
            } else if (InsertMask[I] != PoisonMaskElem &&
                       Mask[I] == PoisonMaskElem) {
              InsertMask[I] = PoisonMaskElem;
            }
          }
        } else {
          InsertMask = Mask;
        }
      }
      if (!V2)
        V2 = PoisonValue::get(V->getType());
      V = Builder.CreateShuffleVector(V, V2, InsertMask);
      if (auto *I = dyn_cast<Instruction>(V)) {
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
    }

    SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
    for (unsigned I = 0; I < NumElts; I++) {
      if (Mask[I] != PoisonMaskElem)
        InsertMask[Offset + I] = I;
    }
    SmallBitVector UseMask =
        buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
    SmallBitVector IsFirstUndef =
        isUndefVector(FirstInsert->getOperand(0), UseMask);
    if ((!IsIdentity || Offset != 0 || !IsFirstUndef.all()) &&
        NumElts != NumScalars) {
      if (IsFirstUndef.all()) {
        if (!ShuffleVectorInst::isIdentityMask(InsertMask, NumElts)) {
          SmallBitVector IsFirstPoison =
              isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
          if (!IsFirstPoison.all()) {
            for (unsigned I = 0; I < NumElts; I++) {
              if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I))
                InsertMask[I] = I + NumElts;
            }
          }
          V = Builder.CreateShuffleVector(
              V,
              IsFirstPoison.all() ? PoisonValue::get(V->getType())
                                  : FirstInsert->getOperand(0),
              InsertMask, cast<Instruction>(E->Scalars.back())->getName());
          if (auto *I = dyn_cast<Instruction>(V)) {
            GatherShuffleExtractSeq.insert(I);
            CSEBlocks.insert(I->getParent());
          }
        }
      } else {
        SmallBitVector IsFirstPoison =
            isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
        for (unsigned I = 0; I < NumElts; I++) {
          if (InsertMask[I] == PoisonMaskElem)
            InsertMask[I] = IsFirstPoison.test(I) ? PoisonMaskElem : I;
          else
            InsertMask[I] += NumElts;
        }
        V = Builder.CreateShuffleVector(
            FirstInsert->getOperand(0), V, InsertMask,
            cast<Instruction>(E->Scalars.back())->getName());
        if (auto *I = dyn_cast<Instruction>(V)) {
          GatherShuffleExtractSeq.insert(I);
          CSEBlocks.insert(I->getParent());
        }
      }
    }

    ++NumVectorInstructions;
    E->VectorizedValue = V;
    return V;
  }
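  // For casts, the opcode may be adjusted when minimum bitwidth analysis
  // (MinBWs) demoted the source or destination type: the cast can degrade to
  // a bitcast, a trunc, or a sext/zext chosen by the known signedness.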
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    setInsertPointAfterBundle(E);

    Value *InVec = vectorizeOperand(E, 0);
    auto *CI = cast<CastInst>(VL0);
    Instruction::CastOps VecOpcode = CI->getOpcode();
    Type *SrcScalarTy = cast<VectorType>(InVec->getType())->getElementType();
    auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
    if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
        (SrcIt != MinBWs.end() || It != MinBWs.end() ||
         SrcScalarTy != CI->getOperand(0)->getType()->getScalarType())) {
      // Check if the values are candidates to demote.
      unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy);
      if (SrcIt != MinBWs.end())
        SrcBWSz = SrcIt->second.first;
      unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType());
      if (BWSz == SrcBWSz) {
        VecOpcode = Instruction::BitCast;
      } else if (BWSz < SrcBWSz) {
        VecOpcode = Instruction::Trunc;
      } else if (It != MinBWs.end()) {
        assert(BWSz > SrcBWSz && "Invalid cast!");
        VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
      } else if (SrcIt != MinBWs.end()) {
        assert(BWSz > SrcBWSz && "Invalid cast!");
        VecOpcode =
            SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
      }
    } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
               !SrcIt->second.second) {
      VecOpcode = Instruction::UIToFP;
    }
    Value *V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
                   ? InVec
                   : Builder.CreateCast(VecOpcode, InVec, VecTy);
    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::FCmp:
  case Instruction::ICmp: {
    setInsertPointAfterBundle(E);

    Value *L = vectorizeOperand(E, 0);
    Value *R = vectorizeOperand(E, 1);
    if (L->getType() != R->getType()) {
      assert((getOperandEntry(E, 0)->isGather() ||
              getOperandEntry(E, 1)->isGather() ||
              MinBWs.contains(getOperandEntry(E, 0)) ||
              MinBWs.contains(getOperandEntry(E, 1))) &&
             "Expected item in MinBWs.");
      if (cast<VectorType>(L->getType())
              ->getElementType()
              ->getIntegerBitWidth() < cast<VectorType>(R->getType())
                                           ->getElementType()
                                           ->getIntegerBitWidth()) {
        Type *CastTy = R->getType();
        L = Builder.CreateIntCast(L, CastTy, GetOperandSignedness(0));
      } else {
        Type *CastTy = L->getType();
        R = Builder.CreateIntCast(R, CastTy, GetOperandSignedness(1));
      }
    }

    CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
    Value *V = Builder.CreateCmp(P0, L, R);
    propagateIRFlags(V, E->Scalars, VL0);
    if (auto *ICmp = dyn_cast<ICmpInst>(V); ICmp && It == MinBWs.end())
      ICmp->setSameSign(/*B=*/false);
    // Do not cast for cmps.
    VecTy = cast<FixedVectorType>(V->getType());
    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::Select: {
    setInsertPointAfterBundle(E);

    Value *Cond = vectorizeOperand(E, 0);
    Value *True = vectorizeOperand(E, 1);
    Value *False = vectorizeOperand(E, 2);
    if (True->getType() != VecTy || False->getType() != VecTy) {
      assert((It != MinBWs.end() || getOperandEntry(E, 1)->isGather() ||
              getOperandEntry(E, 2)->isGather() ||
              MinBWs.contains(getOperandEntry(E, 1)) ||
              MinBWs.contains(getOperandEntry(E, 2))) &&
             "Expected item in MinBWs.");
      if (True->getType() != VecTy)
        True = Builder.CreateIntCast(True, VecTy, GetOperandSignedness(1));
      if (False->getType() != VecTy)
        False = Builder.CreateIntCast(False, VecTy, GetOperandSignedness(2));
    }

    unsigned CondNumElements = getNumElements(Cond->getType());
    unsigned TrueNumElements = getNumElements(True->getType());
    assert(TrueNumElements >= CondNumElements &&
           TrueNumElements % CondNumElements == 0 &&
           "Cannot vectorize Instruction::Select");
    assert(TrueNumElements == getNumElements(False->getType()) &&
           "Cannot vectorize Instruction::Select");
    if (CondNumElements != TrueNumElements) {
      // When the return type is i1 but the source is a fixed vector type, we
      // need to duplicate the condition value.
      Cond = Builder.CreateShuffleVector(
          Cond, createReplicatedMask(TrueNumElements / CondNumElements,
                                     CondNumElements));
    }
    assert(getNumElements(Cond->getType()) == TrueNumElements &&
           "Cannot vectorize Instruction::Select");
    Value *V = Builder.CreateSelectWithUnknownProfile(Cond, True, False,
                                                      DEBUG_TYPE);
    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::FNeg: {
    setInsertPointAfterBundle(E);

    Value *Op = vectorizeOperand(E, 0);

    Value *V = Builder.CreateUnOp(
        static_cast<Instruction::UnaryOps>(E->getOpcode()), Op);
    propagateIRFlags(V, E->Scalars, VL0);
    if (auto *I = dyn_cast<Instruction>(V))
      V = ::propagateMetadata(I, E->Scalars);

    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;

    return V;
  }
  case Instruction::Freeze: {
    setInsertPointAfterBundle(E);

    Value *Op = vectorizeOperand(E, 0);

    if (Op->getType() != VecTy) {
      assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
              MinBWs.contains(getOperandEntry(E, 0))) &&
             "Expected item in MinBWs.");
      Op = Builder.CreateIntCast(Op, VecTy, GetOperandSignedness(0));
    }
    Value *V = Builder.CreateFreeze(Op);
    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;

    return V;
  }
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    setInsertPointAfterBundle(E);

    Value *LHS = vectorizeOperand(E, 0);
    Value *RHS = vectorizeOperand(E, 1);
    if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
      for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
        ArrayRef<Value *> Ops = E->getOperand(I);
        if (all_of(Ops, [&](Value *Op) {
              auto *CI = dyn_cast<ConstantInt>(Op);
              return CI && CI->getValue().countr_one() >= It->second.first;
            })) {
          Value *V = FinalShuffle(I == 0 ? RHS : LHS, E);
          E->VectorizedValue = V;
          ++NumVectorInstructions;
          return V;
        }
      }
    }
    if (LHS->getType() != VecTy || RHS->getType() != VecTy) {
      assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
              getOperandEntry(E, 1)->isGather() ||
              MinBWs.contains(getOperandEntry(E, 0)) ||
              MinBWs.contains(getOperandEntry(E, 1))) &&
             "Expected item in MinBWs.");
      if (LHS->getType() != VecTy)
        LHS = Builder.CreateIntCast(LHS, VecTy, GetOperandSignedness(0));
      if (RHS->getType() != VecTy)
        RHS = Builder.CreateIntCast(RHS, VecTy, GetOperandSignedness(1));
    }

    Value *V = Builder.CreateBinOp(
        static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
    propagateIRFlags(V, E->Scalars, VL0, It == MinBWs.end());
    if (auto *I = dyn_cast<Instruction>(V)) {
      V = ::propagateMetadata(I, E->Scalars);
      // Drop nuw flags for abs(sub(commutative), true).
      if (!MinBWs.contains(E) && ShuffleOrOp == Instruction::Sub &&
          any_of(E->Scalars, [](Value *V) {
            return isa<PoisonValue>(V) || isCommutative(cast<Instruction>(V));
          }))
        I->setHasNoUnsignedWrap(/*b=*/false);
    }

    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
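  // Loads are emitted in one of four forms, matching the node state: a plain
  // wide load (Vectorize), a (masked) load plus compressing shuffle
  // (CompressVectorize), an experimental_vp_strided_load intrinsic
  // (StridedVectorize), or a masked gather (ScatterVectorize).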
    return V;
  }
  case Instruction::Load: {
    // Loads are inserted at the head of the tree because we don't want to
    // sink them all the way down past store instructions.
    setInsertPointAfterBundle(E);

    LoadInst *LI = cast<LoadInst>(VL0);
    Instruction *NewLI;
    FixedVectorType *StridedLoadTy = nullptr;
    Value *PO = LI->getPointerOperand();
    if (E->State == TreeEntry::Vectorize) {
      NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign());
    } else if (E->State == TreeEntry::CompressVectorize) {
      auto [CompressMask, LoadVecTy, InterleaveFactor, IsMasked] =
          CompressEntryToData.at(E);
      Align CommonAlignment = LI->getAlign();
      if (IsMasked) {
        // ...
        SmallVector<Constant *> MaskValues(
            getNumElements(LoadVecTy) / getNumElements(LI->getType()),
            ConstantInt::getFalse(VecTy->getContext()));
        for (int I : CompressMask)
          MaskValues[I] = ConstantInt::getTrue(VecTy->getContext());
        if (auto *VecTy = dyn_cast<FixedVectorType>(LI->getType()))
          MaskValues = replicateMask(MaskValues, VecTy->getNumElements());
        Constant *MaskValue = ConstantVector::get(MaskValues);
        NewLI = Builder.CreateMaskedLoad(LoadVecTy, PO, CommonAlignment,
                                         MaskValue);
      } else {
        NewLI = Builder.CreateAlignedLoad(LoadVecTy, PO, CommonAlignment);
      }
      // ...
    } else if (E->State == TreeEntry::StridedVectorize) {
      Value *Ptr0 = cast<LoadInst>(E->Scalars.front())->getPointerOperand();
      Value *PtrN = cast<LoadInst>(E->Scalars.back())->getPointerOperand();
      PO = IsReverseOrder ? PtrN : Ptr0;
      Type *StrideTy = DL->getIndexType(PO->getType());
      Value *StrideVal;
      const StridedPtrInfo &SPtrInfo = TreeEntryToStridedPtrInfoMap.at(E);
      StridedLoadTy = SPtrInfo.Ty;
      assert(StridedLoadTy && "Missing StridedPointerInfo for tree entry.");
      unsigned StridedLoadEC =
          StridedLoadTy->getElementCount().getKnownMinValue();
      Value *Stride = SPtrInfo.StrideVal;
      if (!Stride) {
        const SCEV *StrideSCEV = SPtrInfo.StrideSCEV;
        assert(StrideSCEV && "Neither StrideVal nor StrideSCEV were set.");
        SCEVExpander Expander(*SE, *DL, "strided-load-vec");
        Stride = Expander.expandCodeFor(StrideSCEV, StrideSCEV->getType(),
                                        &*Builder.GetInsertPoint());
      }
      Value *NewStride =
          Builder.CreateIntCast(Stride, StrideTy, /*isSigned=*/true);
      StrideVal = Builder.CreateMul(
          NewStride,
          ConstantInt::get(StrideTy, (IsReverseOrder ? -1 : 1) *
                                         static_cast<int>(
                                             DL->getTypeAllocSize(ScalarTy))));
      Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
      auto *Inst = Builder.CreateIntrinsic(
          Intrinsic::experimental_vp_strided_load,
          {StridedLoadTy, PO->getType(), StrideTy},
          {PO, StrideVal,
           Builder.getAllOnesMask(ElementCount::getFixed(StridedLoadEC)),
           Builder.getInt32(StridedLoadEC)});
      Inst->addParamAttr(
          /*ArgNo=*/0,
          Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
      NewLI = Inst;
    } else {
      assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
      Value *VecPtr = vectorizeOperand(E, 0);
      // Use the minimum alignment of the gathered loads.
      Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
      if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
        unsigned ScalarTyNumElements = VecTy->getNumElements();
        unsigned VecTyNumElements =
            cast<FixedVectorType>(VecPtr->getType())->getNumElements();
        assert(VecTyNumElements % ScalarTyNumElements == 0 &&
               "Cannot expand getelementptr.");
        unsigned VF = VecTyNumElements / ScalarTyNumElements;
        SmallVector<Constant *> Indices(VecTyNumElements);
        transform(seq(VecTyNumElements), Indices.begin(), [=](unsigned I) {
          return Builder.getInt64(I % ScalarTyNumElements);
        });
        VecPtr = Builder.CreateGEP(
            VecTy->getElementType(),
            Builder.CreateShuffleVector(
                VecPtr, createReplicatedMask(ScalarTyNumElements, VF)),
            ConstantVector::get(Indices));
      }
      NewLI = Builder.CreateMaskedGather(VecTy, VecPtr, CommonAlignment);
    }
    Value *V = E->State == TreeEntry::CompressVectorize
                   ? NewLI
                   : ::propagateMetadata(NewLI, E->Scalars);
    // ...
    V = FinalShuffle(V, E);
    E->VectorizedValue = V;
    ++NumVectorInstructions;
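  // Stores mirror the load lowering: consecutive stores become a single wide
  // store, strided stores use the experimental_vp_strided_store intrinsic
  // with a negative stride for reversed order.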
    return V;
  }
  case Instruction::Store: {
    auto *SI = cast<StoreInst>(VL0);

    setInsertPointAfterBundle(E);

    Value *VecValue = vectorizeOperand(E, 0);
    if (VecValue->getType() != VecTy)
      VecValue =
          Builder.CreateIntCast(VecValue, VecTy, GetOperandSignedness(0));
    VecValue = FinalShuffle(VecValue, E);

    Value *Ptr = SI->getPointerOperand();
    Instruction *ST;
    if (E->State == TreeEntry::Vectorize) {
      ST = Builder.CreateAlignedStore(VecValue, Ptr, SI->getAlign());
    } else {
      assert(E->State == TreeEntry::StridedVectorize &&
             "Expected either strided or consecutive stores.");
      if (!E->ReorderIndices.empty()) {
        SI = cast<StoreInst>(E->Scalars[E->ReorderIndices.front()]);
        Ptr = SI->getPointerOperand();
      }
      Align CommonAlignment = computeCommonAlignment<StoreInst>(E->Scalars);
      Type *StrideTy = DL->getIndexType(SI->getPointerOperandType());
      auto *Inst = Builder.CreateIntrinsic(
          Intrinsic::experimental_vp_strided_store,
          {VecTy, Ptr->getType(), StrideTy},
          {VecValue, Ptr,
           ConstantInt::get(
               StrideTy, -static_cast<int>(DL->getTypeAllocSize(ScalarTy))),
           Builder.getAllOnesMask(VecTy->getElementCount()),
           Builder.getInt32(E->Scalars.size())});
      Inst->addParamAttr(
          /*ArgNo=*/1,
          Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
      ST = Inst;
    }

    Value *V = ::propagateMetadata(ST, E->Scalars);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
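  // GEPs are vectorized operand-by-operand; metadata is propagated only from
  // the scalar GEPs, since non-GEP values may be mixed into the node.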
    return V;
  }
  case Instruction::GetElementPtr: {
    auto *GEP0 = cast<GetElementPtrInst>(VL0);
    setInsertPointAfterBundle(E);

    Value *Op0 = vectorizeOperand(E, 0);
    SmallVector<Value *> OpVecs;
    for (int J = 1, N = GEP0->getNumOperands(); J < N; ++J) {
      Value *OpVec = vectorizeOperand(E, J);
      OpVecs.push_back(OpVec);
    }

    Value *V = Builder.CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs);
    if (Instruction *I = dyn_cast<GetElementPtrInst>(V)) {
      SmallVector<Value *> GEPs;
      for (Value *V : E->Scalars) {
        if (isa<GetElementPtrInst>(V))
          GEPs.push_back(V);
      }
      V = ::propagateMetadata(I, GEPs);
    }

    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
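  // Calls are vectorized either as a target intrinsic or via a vector
  // library function from VFDatabase, whichever the cost model prefers.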
    return V;
  }
  case Instruction::Call: {
    CallInst *CI = cast<CallInst>(VL0);
    setInsertPointAfterBundle(E);

    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);

    SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(
        CI, ID, VecTy->getNumElements(),
        It != MinBWs.end() ? It->second.first : 0, TTI);
    auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
    bool UseIntrinsic = ID != Intrinsic::not_intrinsic &&
                        VecCallCosts.first <= VecCallCosts.second;

    Value *ScalarArg = nullptr;
    SmallVector<Value *> OpVecs;
    SmallVector<Type *, 2> TysForDecl;
    // ...
    auto *CEI = cast<CallInst>(VL0);
    for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
      // Some intrinsics have scalar arguments. This argument should not be
      // vectorized.
      if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(ID, I, TTI)) {
        ScalarArg = CEI->getArgOperand(I);
        // If we decided to reduce the bitwidth of an abs intrinsic, its
        // second argument must be set to false (do not return poison if the
        // value is signed min).
        if (ID == Intrinsic::abs && It != MinBWs.end() &&
            It->second.first < DL->getTypeSizeInBits(CEI->getType()))
          ScalarArg = Builder.getFalse();
        OpVecs.push_back(ScalarArg);
        if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I, TTI))
          TysForDecl.push_back(ScalarArg->getType());
        continue;
      }

      Value *OpVec = vectorizeOperand(E, I);
      ScalarArg = CEI->getArgOperand(I);
      if (cast<VectorType>(OpVec->getType())->getElementType() !=
              ScalarArg->getType()->getScalarType() &&
          It == MinBWs.end()) {
        auto *CastTy =
            getWidenedType(ScalarArg->getType(), VecTy->getNumElements());
        OpVec = Builder.CreateIntCast(OpVec, CastTy, GetOperandSignedness(I));
      } else if (It != MinBWs.end()) {
        OpVec = Builder.CreateIntCast(OpVec, VecTy, GetOperandSignedness(I));
      }
      OpVecs.push_back(OpVec);
      if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, I, TTI))
        TysForDecl.push_back(OpVec->getType());
    }

    Function *CF;
    if (!UseIntrinsic) {
      VFShape Shape =
          VFShape::get(CI->getFunctionType(),
                       ElementCount::getFixed(VecTy->getNumElements()),
                       /*HasGlobalPred=*/false);
      CF = VFDatabase(*CI).getVectorizedFunction(Shape);
    } else {
      // ...
      CF = Intrinsic::getOrInsertDeclaration(F->getParent(), ID, TysForDecl);
    }

    SmallVector<OperandBundleDef, 1> OpBundles;
    CI->getOperandBundlesAsDefs(OpBundles);
    Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);

    propagateIRFlags(V, E->Scalars, VL0);
    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
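  // ShuffleVector covers both REVEC shufflevector nodes and alternate opcode
  // nodes: both operations are materialized over the full width and then
  // blended with a mask built by buildAltOpShuffleMask.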
    return V;
  }
  case Instruction::ShuffleVector: {
    Value *V;
    if (SLPReVec && !E->isAltShuffle()) {
      setInsertPointAfterBundle(E);
      Value *Src = vectorizeOperand(E, 0);
      SmallVector<int> ThisMask(calculateShufflevectorMask(E->Scalars));
      if (auto *SVSrc = dyn_cast<ShuffleVectorInst>(Src)) {
        SmallVector<int> NewMask(ThisMask.size());
        transform(ThisMask, NewMask.begin(), [&SVSrc](int Mask) {
          return SVSrc->getShuffleMask()[Mask];
        });
        V = Builder.CreateShuffleVector(SVSrc->getOperand(0),
                                        SVSrc->getOperand(1), NewMask);
      } else {
        V = Builder.CreateShuffleVector(Src, ThisMask);
      }
      propagateIRFlags(V, E->Scalars, VL0);
      if (auto *I = dyn_cast<Instruction>(V))
        V = ::propagateMetadata(I, E->Scalars);
      V = FinalShuffle(V, E);
    } else {
      assert(E->isAltShuffle() &&
             ((Instruction::isBinaryOp(E->getOpcode()) &&
               Instruction::isBinaryOp(E->getAltOpcode())) ||
              (Instruction::isCast(E->getOpcode()) &&
               Instruction::isCast(E->getAltOpcode())) ||
              (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
             "Invalid Shuffle Vector Operand");

      Value *LHS = nullptr, *RHS = nullptr;
      if (Instruction::isBinaryOp(E->getOpcode()) || isa<CmpInst>(VL0)) {
        setInsertPointAfterBundle(E);
        LHS = vectorizeOperand(E, 0);
        RHS = vectorizeOperand(E, 1);
      } else {
        setInsertPointAfterBundle(E);
        LHS = vectorizeOperand(E, 0);
      }
      if (LHS && RHS &&
          ((Instruction::isBinaryOp(E->getOpcode()) &&
            (LHS->getType() != VecTy || RHS->getType() != VecTy)) ||
           (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()))) {
        assert((It != MinBWs.end() ||
                getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
                getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
                MinBWs.contains(getOperandEntry(E, 0)) ||
                MinBWs.contains(getOperandEntry(E, 1))) &&
               "Expected item in MinBWs.");
        Type *CastTy = VecTy;
        if (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()) {
          if (cast<VectorType>(LHS->getType())
                  ->getElementType()
                  ->getIntegerBitWidth() < cast<VectorType>(RHS->getType())
                                               ->getElementType()
                                               ->getIntegerBitWidth())
            CastTy = RHS->getType();
          else
            CastTy = LHS->getType();
        }
        if (LHS->getType() != CastTy)
          LHS = Builder.CreateIntCast(LHS, CastTy, GetOperandSignedness(0));
        if (RHS->getType() != CastTy)
          RHS = Builder.CreateIntCast(RHS, CastTy, GetOperandSignedness(1));
      }

      Value *V0, *V1;
      if (Instruction::isBinaryOp(E->getOpcode())) {
        V0 = Builder.CreateBinOp(
            static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
        V1 = Builder.CreateBinOp(
            static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS);
      } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
        V0 = Builder.CreateCmp(CI0->getPredicate(), LHS, RHS);
        auto *AltCI = cast<CmpInst>(E->getAltOp());
        CmpInst::Predicate AltPred = AltCI->getPredicate();
        V1 = Builder.CreateCmp(AltPred, LHS, RHS);
      } else {
        if (LHS->getType()->isIntOrIntVectorTy() && ScalarTy->isIntegerTy()) {
          unsigned SrcBWSz = DL->getTypeSizeInBits(
              cast<VectorType>(LHS->getType())->getElementType());
          unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
          if (BWSz <= SrcBWSz) {
            if (BWSz < SrcBWSz)
              LHS = Builder.CreateIntCast(LHS, VecTy, It->second.first);
            assert(LHS->getType() == VecTy &&
                   "Expected same type as operand.");
            if (auto *I = dyn_cast<Instruction>(LHS))
              LHS = ::propagateMetadata(I, E->Scalars);
            LHS = FinalShuffle(LHS, E);
            E->VectorizedValue = LHS;
            ++NumVectorInstructions;
            return LHS;
          }
        }
        V0 = Builder.CreateCast(
            static_cast<Instruction::CastOps>(E->getOpcode()), LHS, VecTy);
        V1 = Builder.CreateCast(
            static_cast<Instruction::CastOps>(E->getAltOpcode()), LHS, VecTy);
      }
      // Add V0 and V1 to later analysis to try to find and remove matching
      // instruction, if any.
      for (Value *V : {V0, V1}) {
        if (auto *I = dyn_cast<Instruction>(V)) {
          GatherShuffleExtractSeq.insert(I);
          CSEBlocks.insert(I->getParent());
        }
      }

      // Create shuffle to take alternate operations from the vector.
      // Also, gather up main and alt scalar ops to propagate IR flags to
      // each vector operation.
      ValueList OpScalars, AltScalars;
      SmallVector<int> Mask;
      E->buildAltOpShuffleMask(
          [E, this](Instruction *I) {
            assert(E->getMatchingMainOpOrAltOp(I) &&
                   "Unexpected main/alternate opcode");
            return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
                                          *TLI);
          },
          Mask, &OpScalars, &AltScalars);

      propagateIRFlags(V0, OpScalars, E->getMainOp(), It == MinBWs.end());
      propagateIRFlags(V1, AltScalars, E->getAltOp(), It == MinBWs.end());
      auto DropNuwFlag = [&](Value *Vec, unsigned Opcode) {
        // Drop nuw flags for abs(sub(commutative), true).
        if (auto *I = dyn_cast<Instruction>(Vec);
            I && Opcode == Instruction::Sub && !MinBWs.contains(E) &&
            any_of(E->Scalars, [](Value *V) {
              if (isa<PoisonValue>(V))
                return false;
              auto *IV = cast<Instruction>(V);
              return IV->getOpcode() == Instruction::Sub && isCommutative(IV);
            }))
          I->setHasNoUnsignedWrap(/*b=*/false);
      };
      DropNuwFlag(V0, E->getOpcode());
      DropNuwFlag(V1, E->getAltOpcode());

      V = Builder.CreateShuffleVector(V0, V1, Mask);
      if (auto *I = dyn_cast<Instruction>(V)) {
        V = ::propagateMetadata(I, E->Scalars);
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
    }

    E->VectorizedValue = V;
    ++NumVectorInstructions;

    return V;
  }
  default:
    llvm_unreachable("unknown inst");
  }
  return nullptr;
}
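/// Top-level code emission: schedules all blocks, emits vectorized values for
/// every tree entry (gathered loads and postponed gathers in a fixed order),
/// extracts externally used scalars from the vector values, and finally
/// erases the now-dead scalar instructions.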
20086 ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales) {
20089 EntryToLastInstruction.clear();
20091 for (
auto &BSIter : BlocksSchedules)
20092 scheduleBlock(*
this, BSIter.second.get());
20095 for (
const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
20096 if (TE->isGather())
20098 (void)getLastInstructionInBundle(TE.get());
20102 Builder.SetInsertPoint(ReductionRoot->
getParent(),
20105 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
20109 for (
const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
20110 if (TE->isGather() && !TE->VectorizedValue && TE->UserTreeIndex.UserTE &&
20111 TE->UserTreeIndex.UserTE->hasState() &&
20112 TE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
20113 (TE->UserTreeIndex.UserTE->getOpcode() != Instruction::PHI ||
20114 TE->UserTreeIndex.UserTE->isAltShuffle()) &&
20115 !TE->UserTreeIndex.UserTE->hasCopyableElements() &&
20116 all_of(TE->UserTreeIndex.UserTE->Scalars,
20117 [](
Value *V) { return isUsedOutsideBlock(V); })) {
20119 getLastInstructionInBundle(TE->UserTreeIndex.UserTE);
20123 for (
auto &Entry : GatherEntries) {
20125 Builder.SetInsertPoint(Entry.second);
20126 Builder.SetCurrentDebugLocation(Entry.second->getDebugLoc());
20131 for (
const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
20132 if (GatheredLoadsEntriesFirst.has_value() &&
20133 TE->Idx >= *GatheredLoadsEntriesFirst && !TE->VectorizedValue &&
20134 (!TE->isGather() || TE->UserTreeIndex)) {
20135 assert((TE->UserTreeIndex ||
20136 (TE->getOpcode() == Instruction::Load && !TE->isGather())) &&
20137 "Expected gathered load node.");
20146 for (
const TreeEntry *E : PostponedNodes) {
20147 auto *TE =
const_cast<TreeEntry *
>(E);
20149 TE->VectorizedValue =
nullptr;
20160 (TE->UserTreeIndex.UserTE->hasState() &&
20161 TE->UserTreeIndex.UserTE->getOpcode() == Instruction::PHI)) {
20170 if (UI->comesBefore(InsertPt))
20173 Builder.SetInsertPoint(InsertPt);
20175 Builder.SetInsertPoint(PrevVec);
20177 Builder.SetCurrentDebugLocation(UserI->getDebugLoc());
20180 VecI && VecI->getParent() == Builder.GetInsertBlock() &&
20181 Builder.GetInsertPoint()->comesBefore(VecI))
20182 VecI->moveBeforePreserving(*Builder.GetInsertBlock(),
20183 Builder.GetInsertPoint());
20184 if (Vec->
getType() != PrevVec->getType()) {
20186 PrevVec->getType()->isIntOrIntVectorTy() &&
20187 "Expected integer vector types only.");
20188 std::optional<bool> IsSigned;
20189 for (
Value *V : TE->Scalars) {
20191 for (
const TreeEntry *MNTE : getTreeEntries(V)) {
20192 auto It = MinBWs.find(MNTE);
20193 if (It != MinBWs.end()) {
20194 IsSigned = IsSigned.value_or(
false) || It->second.second;
20199 if (IsSigned.value_or(
false))
20202 for (
const TreeEntry *BVE : ValueToGatherNodes.lookup(V)) {
20203 auto It = MinBWs.find(BVE);
20204 if (It != MinBWs.end()) {
20205 IsSigned = IsSigned.value_or(
false) || It->second.second;
20210 if (IsSigned.value_or(
false))
20214 IsSigned.value_or(
false) ||
20218 if (IsSigned.value_or(
false))
20222 if (IsSigned.value_or(
false)) {
20224 auto It = MinBWs.find(TE->UserTreeIndex.UserTE);
20225 if (It != MinBWs.end())
20226 IsSigned = It->second.second;
20229 "Expected user node or perfect diamond match in MinBWs.");
20230 Vec = Builder.CreateIntCast(Vec, PrevVec->
getType(), *IsSigned);
20232 PrevVec->replaceAllUsesWith(Vec);
20233 PostponedValues.
try_emplace(Vec).first->second.push_back(TE);
20236 auto It = PostponedValues.
find(PrevVec);
20237 if (It != PostponedValues.
end()) {
20238 for (TreeEntry *VTE : It->getSecond())
20239 VTE->VectorizedValue = Vec;
20259 for (
const auto &ExternalUse : ExternalUses) {
20260 Value *Scalar = ExternalUse.Scalar;
20267 const TreeEntry *E = &ExternalUse.E;
20268 assert(E &&
"Invalid scalar");
20269 assert(!E->isGather() &&
"Extracting from a gather list");
20271 if (E->getOpcode() == Instruction::GetElementPtr &&
20275 Value *Vec = E->VectorizedValue;
20276 assert(Vec &&
"Can't find vectorizable value");
20278 Value *Lane = Builder.getInt32(ExternalUse.Lane);
20279 auto ExtractAndExtendIfNeeded = [&](
Value *Vec) {
20280 if (Scalar->getType() != Vec->
getType()) {
20281 Value *Ex =
nullptr;
20282 Value *ExV =
nullptr;
20284 bool ReplaceInst = Inst && ExternalUsesAsOriginalScalar.contains(Inst);
20285 auto It = ScalarToEEs.
find(Scalar);
20286 if (It != ScalarToEEs.
end()) {
20289 auto EEIt = It->second.find(ReplaceInst ? Inst->getParent()
20290 : Builder.GetInsertBlock());
20291 if (EEIt != It->second.end()) {
20292 Value *PrevV = EEIt->second.first;
20294 I && !ReplaceInst &&
20295 Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() &&
20296 Builder.GetInsertPoint()->comesBefore(
I)) {
20297 I->moveBefore(*Builder.GetInsertPoint()->getParent(),
20298 Builder.GetInsertPoint());
20303 ExV = EEIt->second.second ? EEIt->second.second : Ex;
20312 IgnoredExtracts.
insert(EE);
20315 auto *CloneInst = Inst->clone();
20316 CloneInst->insertBefore(Inst->getIterator());
20317 if (Inst->hasName())
20318 CloneInst->takeName(Inst);
20323 Value *V = ES->getVectorOperand();
20326 V = ETEs.front()->VectorizedValue;
20328 !
IV ||
IV == Vec ||
IV->getParent() != IVec->getParent() ||
20329 IV->comesBefore(IVec))
20330 Ex = Builder.CreateExtractElement(V, ES->getIndexOperand());
20332 Ex = Builder.CreateExtractElement(Vec, Lane);
20333 }
else if (
auto *VecTy =
20336 unsigned VecTyNumElements = VecTy->getNumElements();
20341 ExternalUse.Lane * VecTyNumElements);
20343 Ex = Builder.CreateExtractElement(Vec, Lane);
20348 if (Scalar->getType() != Ex->
getType())
20349 ExV = Builder.CreateIntCast(
20354 : &F->getEntryBlock(),
20355 std::make_pair(Ex, ExV));
20361 GatherShuffleExtractSeq.insert(ExI);
20362 CSEBlocks.insert(ExI->getParent());
20368 "In-tree scalar of vector type is not insertelement?");
20377 if (!ScalarsWithNullptrUser.
insert(Scalar).second)
20380 (ExternallyUsedValues.
count(Scalar) ||
20381 ExternalUsesWithNonUsers.count(Scalar) ||
20382 ExternalUsesAsOriginalScalar.contains(Scalar) ||
20386 if (ExternalUsesAsOriginalScalar.contains(U))
20388 ArrayRef<TreeEntry *> UseEntries = getTreeEntries(U);
20389 return !UseEntries.empty() &&
20390 (E->State == TreeEntry::Vectorize ||
20391 E->State == TreeEntry::StridedVectorize ||
20392 E->State == TreeEntry::CompressVectorize) &&
20393 any_of(UseEntries, [&, TTI = TTI](TreeEntry *UseEntry) {
20394 return (UseEntry->State == TreeEntry::Vectorize ||
20396 TreeEntry::StridedVectorize ||
20398 TreeEntry::CompressVectorize) &&
20399 doesInTreeUserNeedToExtract(
20400 Scalar, getRootEntryInstruction(*UseEntry),
20404 "Scalar with nullptr User must be registered in "
20405 "ExternallyUsedValues map or remain as scalar in vectorized "
20409 if (
PHI->getParent()->isLandingPad())
20410 Builder.SetInsertPoint(
20413 PHI->getParent()->getLandingPadInst()->getIterator()));
20415 Builder.SetInsertPoint(
PHI->getParent(),
20416 PHI->getParent()->getFirstNonPHIIt());
20418 Builder.SetInsertPoint(VecI->getParent(),
20419 std::next(VecI->getIterator()));
20422 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
20424 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
20426 if (Scalar != NewInst) {
20429 "Extractelements should not be replaced.");
20430 Scalar->replaceAllUsesWith(NewInst);
20440 if (!UsedInserts.
insert(VU).second)
20443 auto BWIt = MinBWs.find(E);
20445 auto *ScalarTy = FTy->getElementType();
20446 auto Key = std::make_pair(Vec, ScalarTy);
20447 auto VecIt = VectorCasts.
find(
Key);
20448 if (VecIt == VectorCasts.
end()) {
20451 if (IVec->getParent()->isLandingPad())
20452 Builder.SetInsertPoint(IVec->getParent(),
20453 std::next(IVec->getParent()
20454 ->getLandingPadInst()
20457 Builder.SetInsertPoint(
20458 IVec->getParent()->getFirstNonPHIOrDbgOrLifetime());
20460 Builder.SetInsertPoint(IVec->getNextNode());
20462 Vec = Builder.CreateIntCast(
20467 BWIt->second.second);
20470 Vec = VecIt->second;
20477 ShuffledInserts, [VU](
const ShuffledInsertData<Value *> &
Data) {
20484 unsigned Idx = *InsertIdx;
20485 if (It == ShuffledInserts.
end()) {
20487 It = std::next(ShuffledInserts.
begin(),
20488 ShuffledInserts.
size() - 1);
20493 Mask[Idx] = ExternalUse.Lane;
20505 for (
unsigned I :
seq<unsigned>(0, PH->getNumIncomingValues())) {
20506 if (PH->getIncomingValue(
I) == Scalar) {
20508 PH->getIncomingBlock(
I)->getTerminator();
20510 Builder.SetInsertPoint(VecI->getParent(),
20511 std::next(VecI->getIterator()));
20513 Builder.SetInsertPoint(PH->getIncomingBlock(
I)->getTerminator());
20515 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
20516 PH->setOperand(
I, NewInst);
20521 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
20525 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
20526 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
20537 for (
int I = 0, E = Mask.size();
I < E; ++
I) {
20539 CombinedMask1[
I] = Mask[
I];
20541 CombinedMask2[
I] = Mask[
I] - VF;
20543 ShuffleInstructionBuilder ShuffleBuilder(
20545 ShuffleBuilder.add(V1, CombinedMask1);
20547 ShuffleBuilder.add(V2, CombinedMask2);
20548 return ShuffleBuilder.finalize({}, {}, {});
20551 auto &&ResizeToVF = [&CreateShuffle](
Value *Vec, ArrayRef<int>
Mask,
20552 bool ForSingleMask) {
20553 unsigned VF =
Mask.size();
20556 if (
any_of(Mask, [VF](
int Idx) {
return Idx >=
static_cast<int>(VF); })) {
20557 Vec = CreateShuffle(Vec,
nullptr, Mask);
20558 return std::make_pair(Vec,
true);
20560 if (!ForSingleMask) {
20562 for (
unsigned I = 0;
I < VF; ++
I) {
20566 Vec = CreateShuffle(Vec,
nullptr, ResizeMask);
20570 return std::make_pair(Vec,
false);
  // Perform shuffling of the vectorize tree entries for better handling of
  // external extracts.
  for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
    // Find the first and the last instruction in the list of insertelements.
    sort(ShuffledInserts[I].InsertElements, isFirstInsertElement);
    InsertElementInst *FirstInsert = ShuffledInserts[I].InsertElements.front();
    InsertElementInst *LastInsert = ShuffledInserts[I].InsertElements.back();
    Builder.SetInsertPoint(LastInsert);
    auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
    Value *NewInst = performExtractsShuffleAction<Value>(
        MutableArrayRef(Vector.data(), Vector.size()),
        FirstInsert->getOperand(0),
        [](Value *Vec) {
          return cast<VectorType>(Vec->getType())
              ->getElementCount()
              .getKnownMinValue();
        },
        ResizeToVF,
        [FirstInsert, &CreateShuffle](ArrayRef<int> Mask,
                                      ArrayRef<Value *> Vals) {
          assert((Vals.size() == 1 || Vals.size() == 2) &&
                 "Expected exactly 1 or 2 input values.");
          if (Vals.size() == 1) {
            // Do not create shuffle if the mask is a simple identity
            // non-resizing mask.
            if (Mask.size() != cast<FixedVectorType>(Vals.front()->getType())
                                   ->getNumElements() ||
                !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
              return CreateShuffle(Vals.front(), nullptr, Mask);
            return Vals.front();
          }
          return CreateShuffle(Vals.front() ? Vals.front()
                                            : FirstInsert->getOperand(0),
                               Vals.back(), Mask);
        });
    auto It = ShuffledInserts[I].InsertElements.rbegin();
    // Rebuild buildvector chain.
    InsertElementInst *II = nullptr;
    if (It != ShuffledInserts[I].InsertElements.rend())
      II = *It;
    SmallVector<Instruction *> Inserts;
    while (It != ShuffledInserts[I].InsertElements.rend()) {
      assert(II && "Must be an insertelement instruction.");
      if (*It == II)
        ++It;
      else
        Inserts.push_back(cast<Instruction>(II));
      II = dyn_cast<InsertElementInst>(II->getOperand(0));
    }
    for (Instruction *II : reverse(Inserts)) {
      II->replaceUsesOfWith(II->getOperand(0), NewInst);
      if (auto *NewI = dyn_cast<Instruction>(NewInst))
        if (II->getParent() == NewI->getParent() && II->comesBefore(NewI))
          II->moveAfter(NewI);
      NewInst = II;
    }
    LastInsert->replaceAllUsesWith(NewInst);
    for (InsertElementInst *IE : reverse(ShuffledInserts[I].InsertElements)) {
      IE->replaceUsesOfWith(IE->getOperand(0),
                            PoisonValue::get(IE->getOperand(0)->getType()));
      IE->replaceUsesOfWith(IE->getOperand(1),
                            PoisonValue::get(IE->getOperand(1)->getType()));
      eraseInstruction(IE);
    }
    CSEBlocks.insert(LastInsert->getParent());
  }
  // For each vectorized value:
  for (auto &TEPtr : VectorizableTree) {
    TreeEntry *Entry = TEPtr.get();

    // No need to handle users of gathered values.
    if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize)
      continue;

    assert(Entry->VectorizedValue && "Can't find vectorizable value");

    // For each lane:
    for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
      Value *Scalar = Entry->Scalars[Lane];

      if (Entry->getOpcode() == Instruction::GetElementPtr &&
          !isa<GetElementPtrInst>(Scalar))
        continue;
      if (auto *EE = dyn_cast<ExtractElementInst>(Scalar);
          EE && IgnoredExtracts.contains(EE))
        continue;
      if (isa<PoisonValue>(Scalar))
        continue;
#ifndef NDEBUG
      Type *Ty = Scalar->getType();
      if (!Ty->isVoidTy()) {
        for (User *U : Scalar->users()) {
          LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
          // It is legal to delete users in the ignorelist.
          assert((isVectorized(U) ||
                  (UserIgnoreList && UserIgnoreList->contains(U)) ||
                  (isa_and_nonnull<Instruction>(U) &&
                   isDeleted(cast<Instruction>(U)))) &&
                 "Deleting out-of-tree value");
        }
      }
#endif
      LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
      auto *I = cast<Instruction>(Scalar);
      RemovedInsts.push_back(I);
    }
  }

  // Merge the DIAssignIDs from the about-to-be-deleted instructions into the
  // new vector instruction.
  if (auto *V = dyn_cast<Instruction>(VectorizableTree[0]->VectorizedValue))
    V->mergeDIAssignID(RemovedInsts);

  // Clear up reduction references, if any.
  if (UserIgnoreList) {
    for (Instruction *I : RemovedInsts) {
      const TreeEntry *IE = getTreeEntries(I).front();
      if (IE->Idx != 0 &&
          !(VectorizableTree.front()->isGather() && IE->UserTreeIndex &&
            (ValueToGatherNodes.lookup(I).contains(
                 VectorizableTree.front().get()) ||
             (IE->UserTreeIndex.UserTE == VectorizableTree.front().get() &&
              IE->UserTreeIndex.EdgeIdx == UINT_MAX))) &&
          !(VectorizableTree.front()->State == TreeEntry::SplitVectorize &&
            IE->UserTreeIndex &&
            IE->UserTreeIndex.UserTE == VectorizableTree.front().get()) &&
          !(GatheredLoadsEntriesFirst.has_value() &&
            IE->Idx >= *GatheredLoadsEntriesFirst &&
            VectorizableTree.front()->isGather() &&
            is_contained(VectorizableTree.front()->Scalars, I)) &&
          !(!VectorizableTree.front()->isGather() &&
            VectorizableTree.front()->isCopyableElement(I)))
        continue;
      SmallVector<SelectInst *> LogicalOpSelects;
      I->replaceUsesWithIf(PoisonValue::get(I->getType()), [&](Use &U) {
        // Do not replace condition of the logical op in form select <cond>.
        bool IsPoisoningLogicalOp = isa<SelectInst>(U.getUser()) &&
                                    (match(U.getUser(), m_LogicalAnd()) ||
                                     match(U.getUser(), m_LogicalOr())) &&
                                    U.getOperandNo() == 0;
        if (IsPoisoningLogicalOp) {
          LogicalOpSelects.push_back(cast<SelectInst>(U.getUser()));
          return false;
        }
        return UserIgnoreList->contains(U.getUser());
      });
      // Replace conditions of the poisoning logical ops with the non-poison
      // constant value.
      for (SelectInst *SI : LogicalOpSelects)
        SI->setCondition(
            Constant::getNullValue(SI->getCondition()->getType()));
    }
  }
  Builder.ClearInsertionPoint();
  InstrElementSize.clear();

  const TreeEntry &RootTE = *VectorizableTree.front();
  Value *Vec = RootTE.VectorizedValue;
  if (auto It = MinBWs.find(&RootTE);
      ReductionBitWidth != 0 && It != MinBWs.end() &&
      ReductionBitWidth != It->second.first) {
    IRBuilder<>::InsertPointGuard Guard(Builder);
    Builder.SetInsertPoint(ReductionRoot->getParent(),
                           ReductionRoot->getIterator());
    Vec = Builder.CreateIntCast(
        Vec,
        VectorType::get(Builder.getIntNTy(ReductionBitWidth),
                        cast<VectorType>(Vec->getType())->getElementCount()),
        It->second.second);
  }
  return Vec;
}
void BoUpSLP::optimizeGatherSequence() {
  LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherShuffleExtractSeq.size()
                    << " gather sequences instructions.\n");
  // LICM InsertElementInst sequences.
  for (Instruction *I : GatherShuffleExtractSeq) {
    if (isDeleted(I))
      continue;

    // Check if this block is inside a loop.
    Loop *L = LI->getLoopFor(I->getParent());
    if (!L)
      continue;

    // Check if it has a preheader.
    BasicBlock *PreHeader = L->getLoopPreheader();
    if (!PreHeader)
      continue;

    // If the vector or the element that we insert into it are instructions
    // that are defined in this basic block then we can't hoist this
    // instruction.
    if (any_of(I->operands(), [L](Value *V) {
          auto *OpI = dyn_cast<Instruction>(V);
          return OpI && L->contains(OpI);
        }))
      continue;

    // We can hoist this instruction. Move it to the pre-header.
    I->moveBefore(PreHeader->getTerminator()->getIterator());
    CSEBlocks.insert(PreHeader);
  }

  // Make a list of all reachable blocks in our CSE queue.
  SmallVector<const DomTreeNode *, 8> CSEWorkList;
  CSEWorkList.reserve(CSEBlocks.size());
  for (BasicBlock *BB : CSEBlocks)
    if (DomTreeNode *N = DT->getNode(BB)) {
      assert(DT->isReachableFromEntry(N));
      CSEWorkList.push_back(N);
    }

  // Sort blocks by domination. This ensures we visit a block after all blocks
  // dominating it are visited.
  llvm::sort(CSEWorkList, [](const DomTreeNode *A, const DomTreeNode *B) {
    assert((A == B) == (A->getDFSNumIn() == B->getDFSNumIn()) &&
           "Different nodes should have different DFS numbers");
    return A->getDFSNumIn() < B->getDFSNumIn();
  });

  // Less defined shuffles can be replaced by the more defined copies.
  // Between two shuffles one is less defined if it has the same vector
  // operands and its mask indices are the same as in the first one or undefs.
  auto &&IsIdenticalOrLessDefined = [TTI = TTI](Instruction *I1,
                                                Instruction *I2,
                                                SmallVectorImpl<int> &NewMask) {
    if (I1->getType() != I2->getType())
      return false;
    auto *SI1 = dyn_cast<ShuffleVectorInst>(I1);
    auto *SI2 = dyn_cast<ShuffleVectorInst>(I2);
    if (!SI1 || !SI2)
      return I1->isIdenticalTo(I2);
    if (SI1->isIdenticalTo(SI2))
      return true;
    for (int I = 0, E = SI1->getNumOperands(); I < E; ++I)
      if (SI1->getOperand(I) != SI2->getOperand(I))
        return false;
    // Check if the second instruction is more defined than the first one.
    NewMask.assign(SI2->getShuffleMask().begin(), SI2->getShuffleMask().end());
    ArrayRef<int> SM1 = SI1->getShuffleMask();
    // Count trailing undefs in the mask to check the final number of used
    // registers.
    unsigned LastUndefsCnt = 0;
    for (int I = 0, E = NewMask.size(); I < E; ++I) {
      if (SM1[I] == PoisonMaskElem)
        ++LastUndefsCnt;
      else
        LastUndefsCnt = 0;
      if (NewMask[I] != PoisonMaskElem && SM1[I] != PoisonMaskElem &&
          NewMask[I] != SM1[I])
        return false;
      if (NewMask[I] == PoisonMaskElem)
        NewMask[I] = SM1[I];
    }
    // Check if the last undefs actually change the final number of used
    // vector registers.
    return SM1.size() - LastUndefsCnt > 1 &&
           TTI->getNumberOfParts(SI1->getType()) ==
               TTI->getNumberOfParts(
                   getWidenedType(SI1->getType()->getElementType(),
                                  SM1.size() - LastUndefsCnt));
  };
  // Perform O(N^2) search over the gather/shuffle sequences and merge
  // identical instructions.
  SmallVector<Instruction *, 16> Visited;
  for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
    assert(*I &&
           (I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
           "Worklist not sorted properly!");
    BasicBlock *BB = (*I)->getBlock();
    // For all instructions in blocks containing gather sequences:
    for (Instruction &In : llvm::make_early_inc_range(*BB)) {
      if (isDeleted(&In))
        continue;
      if (!isa<InsertElementInst, ExtractElementInst, ShuffleVectorInst>(&In) &&
          !GatherShuffleExtractSeq.contains(&In))
        continue;

      // Check if we can replace this instruction with any of the visited
      // instructions.
      bool Replaced = false;
      for (Instruction *&V : Visited) {
        SmallVector<int> NewMask;
        if (IsIdenticalOrLessDefined(&In, V, NewMask) &&
            DT->dominates(V->getParent(), In.getParent())) {
          In.replaceAllUsesWith(V);
          eraseInstruction(&In);
          if (auto *SI = dyn_cast<ShuffleVectorInst>(V))
            if (!NewMask.empty())
              SI->setShuffleMask(NewMask);
          Replaced = true;
          break;
        }
        if (isa<ShuffleVectorInst>(In) && isa<ShuffleVectorInst>(V) &&
            GatherShuffleExtractSeq.contains(V) &&
            IsIdenticalOrLessDefined(V, &In, NewMask) &&
            DT->dominates(In.getParent(), V->getParent())) {
          In.moveAfter(V);
          V->replaceAllUsesWith(&In);
          eraseInstruction(V);
          if (auto *SI = dyn_cast<ShuffleVectorInst>(&In))
            if (!NewMask.empty())
              SI->setShuffleMask(NewMask);
          V = &In;
          Replaced = true;
          break;
        }
      }
      if (!Replaced) {
        assert(!is_contained(Visited, &In));
        Visited.push_back(&In);
      }
    }
  }
  CSEBlocks.clear();
  GatherShuffleExtractSeq.clear();
}
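
// Illustrative note (a sketch, not taken from a test): the CSE above merges
// gather sequences where one shuffle is "less defined" than another. Given
//   %g1 = shufflevector <2 x i32> %v, <2 x i32> poison, <2 x i32> <i32 0, i32 1>
//   %g2 = shufflevector <2 x i32> %v, <2 x i32> poison, <2 x i32> <i32 0, i32 poison>
// %g2 agrees with %g1 on every non-poison lane, so all uses of %g2 can be
// rewired to %g1 and %g2 erased, provided %g1 dominates %g2.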
BoUpSLP::ScheduleBundle &BoUpSLP::BlockScheduling::buildBundle(
    ArrayRef<Value *> VL, const InstructionsState &S, const EdgeInfo &EI) {
  auto &BundlePtr =
      ScheduledBundlesList.emplace_back(std::make_unique<ScheduleBundle>());
  for (Value *V : VL) {
    if (S.isNonSchedulable(V))
      continue;
    auto *I = cast<Instruction>(V);
    if (S.isCopyableElement(V)) {
      // Add a copy of the instruction to the bundle.
      ScheduleCopyableData &SD =
          addScheduleCopyableData(EI, I, SchedulingRegionID, *BundlePtr);
      BundlePtr->add(&SD);
      continue;
    }
    ScheduleData *BundleMember = getScheduleData(V);
    assert(BundleMember && "no ScheduleData for bundle member "
                           "(maybe not in same basic block)");
    BundlePtr->add(BundleMember);
    ScheduledBundles.try_emplace(I).first->getSecond().push_back(
        BundlePtr.get());
  }
  assert(BundlePtr && *BundlePtr && "Failed to find schedule bundle");
  return *BundlePtr;
}
20933std::optional<BoUpSLP::ScheduleBundle *>
20935 const InstructionsState &S,
20942 bool HasCopyables = S.areInstructionsWithCopyableElements();
20944 all_of(VL, [&](
Value *V) {
return S.isNonSchedulable(V); }))) {
20948 SmallVector<ScheduleData *> ControlDependentMembers;
20949 for (
Value *V : VL) {
20951 if (!
I || (HasCopyables && S.isCopyableElement(V)))
20953 SmallDenseMap<std::pair<Instruction *, Value *>,
unsigned> UserOpToNumOps;
20954 for (
const Use &U :
I->operands()) {
20957 .first->getSecond();
20960 Op && areAllOperandsReplacedByCopyableData(
I,
Op, *SLP,
NumOps)) {
20961 if (ScheduleData *OpSD = getScheduleData(
Op);
20962 OpSD && OpSD->hasValidDependencies()) {
20963 OpSD->clearDirectDependencies();
20964 if (RegionHasStackSave ||
20966 ControlDependentMembers.
push_back(OpSD);
20971 if (!ControlDependentMembers.
empty()) {
20972 ScheduleBundle
Invalid = ScheduleBundle::invalid();
20973 calculateDependencies(
Invalid,
true, SLP,
20974 ControlDependentMembers);
20981 LLVM_DEBUG(
dbgs() <<
"SLP: bundle: " << *S.getMainOp() <<
"\n");
20983 auto TryScheduleBundleImpl = [=](
bool ReSchedule, ScheduleBundle &Bundle) {
20986 SmallVector<ScheduleData *> ControlDependentMembers;
20987 auto CheckIfNeedToClearDeps = [&](ScheduleBundle &Bundle) {
20988 SmallDenseMap<std::pair<Instruction *, Value *>,
unsigned> UserOpToNumOps;
20989 for (ScheduleEntity *SE : Bundle.getBundle()) {
20991 if (ScheduleData *BundleMember = getScheduleData(SD->getInst());
20992 BundleMember && BundleMember->hasValidDependencies()) {
20993 BundleMember->clearDirectDependencies();
20994 if (RegionHasStackSave ||
20996 BundleMember->getInst()))
20997 ControlDependentMembers.
push_back(BundleMember);
21002 if (SD->hasValidDependencies() &&
21003 (!S.areInstructionsWithCopyableElements() ||
21004 !S.isCopyableElement(SD->getInst())) &&
21005 !getScheduleCopyableData(SD->getInst()).empty() && EI.UserTE &&
21006 EI.UserTE->hasState() &&
21007 (!EI.UserTE->hasCopyableElements() ||
21008 !EI.UserTE->isCopyableElement(SD->getInst())))
21009 SD->clearDirectDependencies();
21010 for (
const Use &U : SD->getInst()->operands()) {
21013 .
try_emplace(std::make_pair(SD->getInst(),
U.get()), 0)
21014 .first->getSecond();
21017 Op && areAllOperandsReplacedByCopyableData(SD->getInst(),
Op,
21019 if (ScheduleData *OpSD = getScheduleData(
Op);
21020 OpSD && OpSD->hasValidDependencies()) {
21021 OpSD->clearDirectDependencies();
21022 if (RegionHasStackSave ||
21024 ControlDependentMembers.
push_back(OpSD);
21035 if (OldScheduleEnd && ScheduleEnd != OldScheduleEnd) {
21036 for_each(ScheduleDataMap, [&](
auto &
P) {
21037 if (BB !=
P.first->getParent())
21039 ScheduleData *SD =
P.second;
21040 if (isInSchedulingRegion(*SD))
21041 SD->clearDependencies();
21043 for_each(ScheduleCopyableDataMapByInst, [&](
auto &
P) {
21044 for_each(
P.second, [&](ScheduleCopyableData *SD) {
21045 if (isInSchedulingRegion(*SD))
21046 SD->clearDependencies();
21053 if (Bundle && !Bundle.getBundle().empty()) {
21054 if (S.areInstructionsWithCopyableElements() ||
21055 !ScheduleCopyableDataMap.empty())
21056 CheckIfNeedToClearDeps(Bundle);
21057 LLVM_DEBUG(
dbgs() <<
"SLP: try schedule bundle " << Bundle <<
" in block "
21059 calculateDependencies(Bundle, !ReSchedule, SLP,
21060 ControlDependentMembers);
21061 }
else if (!ControlDependentMembers.
empty()) {
21062 ScheduleBundle
Invalid = ScheduleBundle::invalid();
21063 calculateDependencies(
Invalid, !ReSchedule, SLP,
21064 ControlDependentMembers);
21069 initialFillReadyList(ReadyInsts);
21076 while (((!Bundle && ReSchedule) || (Bundle && !Bundle.isReady())) &&
21077 !ReadyInsts.empty()) {
21078 ScheduleEntity *Picked = ReadyInsts.pop_back_val();
21079 assert(Picked->isReady() &&
"must be ready to schedule");
21080 schedule(*SLP, S, EI, Picked, ReadyInsts);
21081 if (Picked == &Bundle)
21088 for (
Value *V : VL) {
21089 if (S.isNonSchedulable(V))
21091 if (!extendSchedulingRegion(V, S)) {
21098 ScheduleBundle
Invalid = ScheduleBundle::invalid();
21099 TryScheduleBundleImpl(
false,
Invalid);
21100 return std::nullopt;
21104 bool ReSchedule =
false;
21105 for (
Value *V : VL) {
21106 if (S.isNonSchedulable(V))
21110 if (!CopyableData.
empty()) {
21111 for (ScheduleCopyableData *SD : CopyableData)
21112 ReadyInsts.remove(SD);
21114 ScheduleData *BundleMember = getScheduleData(V);
21115 assert((BundleMember || S.isCopyableElement(V)) &&
21116 "no ScheduleData for bundle member (maybe not in same basic block)");
21122 ReadyInsts.remove(BundleMember);
21124 !Bundles.
empty()) {
21125 for (ScheduleBundle *
B : Bundles)
21126 ReadyInsts.remove(
B);
21129 if (!S.isCopyableElement(V) && !BundleMember->isScheduled())
21136 LLVM_DEBUG(
dbgs() <<
"SLP: reset schedule because " << *BundleMember
21137 <<
" was already scheduled\n");
21141 ScheduleBundle &Bundle = buildBundle(VL, S, EI);
21142 TryScheduleBundleImpl(ReSchedule, Bundle);
21143 if (!Bundle.isReady()) {
21144 for (ScheduleEntity *BD : Bundle.getBundle()) {
21148 if (BD->isReady()) {
21150 if (Bundles.
empty()) {
21151 ReadyInsts.insert(BD);
21154 for (ScheduleBundle *
B : Bundles)
21156 ReadyInsts.insert(
B);
21159 ScheduledBundlesList.pop_back();
21160 SmallVector<ScheduleData *> ControlDependentMembers;
21161 SmallPtrSet<Instruction *, 4> Visited;
21162 for (
Value *V : VL) {
21163 if (S.isNonSchedulable(V))
21166 if (S.isCopyableElement(
I)) {
21169 auto KV = std::make_pair(EI,
I);
21170 assert(ScheduleCopyableDataMap.contains(KV) &&
21171 "no ScheduleCopyableData for copyable element");
21172 ScheduleCopyableData *SD =
21173 ScheduleCopyableDataMapByInst.find(
I)->getSecond().pop_back_val();
21174 ScheduleCopyableDataMapByUsers[
I].remove(SD);
21177 const auto *It =
find(
Op,
I);
21178 assert(It !=
Op.end() &&
"Lane not set");
21179 SmallPtrSet<Instruction *, 4> Visited;
21181 int Lane = std::distance(
Op.begin(), It);
21182 assert(Lane >= 0 &&
"Lane not set");
21184 !EI.UserTE->ReorderIndices.empty())
21185 Lane = EI.UserTE->ReorderIndices[Lane];
21186 assert(Lane <
static_cast<int>(EI.UserTE->Scalars.size()) &&
21187 "Couldn't find extract lane");
21189 if (!Visited.
insert(In).second) {
21193 ScheduleCopyableDataMapByInstUser
21194 [std::make_pair(std::make_pair(In, EI.EdgeIdx),
I)]
21197 }
while (It !=
Op.end());
21199 if (ScheduleCopyableData *UserCD = getScheduleCopyableData(UserEI,
I))
21200 ScheduleCopyableDataMapByUsers[
I].insert(UserCD);
21202 if (ScheduleCopyableDataMapByUsers[
I].
empty())
21203 ScheduleCopyableDataMapByUsers.erase(
I);
21204 ScheduleCopyableDataMap.erase(KV);
21206 if (ScheduleData *OpSD = getScheduleData(
I);
21207 OpSD && OpSD->hasValidDependencies()) {
21208 OpSD->clearDirectDependencies();
21209 if (RegionHasStackSave ||
21211 ControlDependentMembers.
push_back(OpSD);
21215 ScheduledBundles.find(
I)->getSecond().pop_back();
21217 if (!ControlDependentMembers.
empty()) {
21218 ScheduleBundle
Invalid = ScheduleBundle::invalid();
21219 calculateDependencies(
Invalid,
false, SLP,
21220 ControlDependentMembers);
21222 return std::nullopt;
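
// Rough intuition for the bail-out path above (illustrative): a bundle
// succeeds only if it can become ready without cyclic dependencies. For
//   %a = add i32 %x, 1
//   %b = add i32 %a, 2
// bundling {%a, %b} would make the bundle its own transitive operand, so it
// never becomes ready; the bookkeeping is undone and std::nullopt returned.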
BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
  // Allocate a new ScheduleData for the instruction.
  if (ChunkPos >= ChunkSize) {
    ScheduleDataChunks.push_back(std::make_unique<ScheduleData[]>(ChunkSize));
    ChunkPos = 0;
  }
  return &(ScheduleDataChunks.back()[ChunkPos++]);
}
bool BoUpSLP::BlockScheduling::extendSchedulingRegion(
    Value *V, const InstructionsState &S) {
  Instruction *I = dyn_cast<Instruction>(V);
  assert(I && "bundle member must be an instruction");
  if (getScheduleData(I))
    return true;
  if (!ScheduleStart) {
    // It's the first instruction in the new region.
    initScheduleData(I, I->getNextNode(), nullptr, nullptr);
    ScheduleStart = I;
    ScheduleEnd = I->getNextNode();
    assert(ScheduleEnd && "tried to vectorize a terminator?");
    LLVM_DEBUG(dbgs() << "SLP:  initialize schedule region to " << *I << "\n");
    return true;
  }
  // Search up and down at the same time, because we don't know if the new
  // instruction is above or below the existing scheduling region. Ignore
  // debug info (and other "AssumeLike" intrinsics) so that's not counted as
  // part of the scheduling region size.
  BasicBlock::reverse_iterator UpIter =
      ++ScheduleStart->getIterator().getReverse();
  BasicBlock::reverse_iterator UpperEnd = BB->rend();
  BasicBlock::iterator DownIter = ScheduleEnd->getIterator();
  BasicBlock::iterator LowerEnd = BB->end();
  auto IsAssumeLikeIntr = [](const Instruction &I) {
    if (auto *II = dyn_cast<IntrinsicInst>(&I))
      return II->isAssumeLikeIntrinsic();
    return false;
  };
  UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
  DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
  while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter != I &&
         &*DownIter != I) {
    if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
      LLVM_DEBUG(dbgs() << "SLP:  exceeded schedule region size limit\n");
      return false;
    }
    ++UpIter;
    ++DownIter;
    UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
    DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
  }
  if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter == I)) {
    assert(I->getParent() == ScheduleStart->getParent() &&
           "Instruction is in wrong basic block.");
    initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
    ScheduleStart = I;
    LLVM_DEBUG(dbgs() << "SLP:  extend schedule region start to " << *I
                      << "\n");
    return true;
  }
  assert((UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter == I)) &&
         "Expected to reach top of the basic block or instruction down the "
         "lower end.");
  assert(I->getParent() == ScheduleEnd->getParent() &&
         "Instruction is in wrong basic block.");
  initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
                   nullptr);
  ScheduleEnd = I->getNextNode();
  assert(ScheduleEnd && "tried to vectorize a terminator?");
  LLVM_DEBUG(dbgs() << "SLP:  extend schedule region end to " << *I << "\n");
  return true;
}
void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
                                                Instruction *ToI,
                                                ScheduleData *PrevLoadStore,
                                                ScheduleData *NextLoadStore) {
  ScheduleData *CurrentLoadStore = PrevLoadStore;
  for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
    // No need to allocate data for non-schedulable instructions.
    if (doesNotNeedToBeScheduled(I))
      continue;
    ScheduleData *SD = ScheduleDataMap.lookup(I);
    if (!SD) {
      SD = allocateScheduleDataChunks();
      ScheduleDataMap[I] = SD;
    }
    assert(!isInSchedulingRegion(*SD) &&
           "new ScheduleData already in scheduling region");
    SD->init(SchedulingRegionID, I);

    if (I->mayReadOrWriteMemory() &&
        (!isa<IntrinsicInst>(I) ||
         (cast<IntrinsicInst>(I)->getIntrinsicID() != Intrinsic::sideeffect &&
          cast<IntrinsicInst>(I)->getIntrinsicID() !=
              Intrinsic::pseudoprobe))) {
      // Update the linked list of memory accessing instructions.
      if (CurrentLoadStore) {
        CurrentLoadStore->setNextLoadStore(SD);
      } else {
        FirstLoadStoreInRegion = SD;
      }
      CurrentLoadStore = SD;
    }

    if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
        match(I, m_Intrinsic<Intrinsic::stackrestore>()))
      RegionHasStackSave = true;
  }
  if (NextLoadStore) {
    if (CurrentLoadStore)
      CurrentLoadStore->setNextLoadStore(NextLoadStore);
  } else {
    LastLoadStoreInRegion = CurrentLoadStore;
  }
}
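
// Note: besides allocating ScheduleData, the loop above threads all
// memory-accessing instructions into a singly linked list
// (FirstLoadStoreInRegion -> ... -> LastLoadStoreInRegion) via
// setNextLoadStore(), which calculateDependencies() later walks to add memory
// dependencies without rescanning the whole block.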
void BoUpSLP::BlockScheduling::calculateDependencies(
    ScheduleBundle &Bundle, bool InsertInReadyList, BoUpSLP *SLP,
    ArrayRef<ScheduleData *> ControlDeps) {
  SmallVector<ScheduleEntity *> WorkList;
  auto ProcessNode = [&](ScheduleEntity *SE) {
    if (auto *CD = dyn_cast<ScheduleCopyableData>(SE)) {
      if (CD->hasValidDependencies())
        return;
      LLVM_DEBUG(dbgs() << "SLP:       update deps of " << *CD << "\n");
      CD->initDependencies();
      CD->resetUnscheduledDeps();
      const EdgeInfo &EI = CD->getEdgeInfo();
      if (EI.UserTE) {
        ArrayRef<Value *> Op = EI.UserTE->getOperand(EI.EdgeIdx);
        const auto *It = find(Op, CD->getInst());
        assert(It != Op.end() && "Lane not set");
        SmallPtrSet<Instruction *, 4> Visited;
        do {
          int Lane = std::distance(Op.begin(), It);
          assert(Lane >= 0 && "Lane not set");
          if (EI.UserTE->State != TreeEntry::SplitVectorize &&
              !EI.UserTE->ReorderIndices.empty())
            Lane = EI.UserTE->ReorderIndices[Lane];
          assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
                 "Couldn't find extract lane");
          auto *In = cast<Instruction>(EI.UserTE->Scalars[Lane]);
          if (EI.UserTE->isCopyableElement(In)) {
            if (ScheduleCopyableData *UseSD =
                    getScheduleCopyableData(EI.UserTE->UserTreeIndex, In)) {
              CD->incDependencies();
              if (!UseSD->isScheduled())
                CD->incrementUnscheduledDeps(1);
              if (!UseSD->hasValidDependencies() ||
                  (InsertInReadyList && UseSD->isReady()))
                WorkList.push_back(UseSD);
            }
          } else if (Visited.insert(In).second) {
            if (ScheduleData *UseSD = getScheduleData(In)) {
              CD->incDependencies();
              if (!UseSD->isScheduled())
                CD->incrementUnscheduledDeps(1);
              if (!UseSD->hasValidDependencies() ||
                  (InsertInReadyList && UseSD->isReady()))
                WorkList.push_back(UseSD);
            }
          }
          It = std::find(std::next(It), Op.end(), CD->getInst());
        } while (It != Op.end());
      }
      if (CD->isReady() && CD->getDependencies() == 0 &&
          (EI.UserTE->hasState() &&
           (EI.UserTE->getMainOp()->getParent() !=
                CD->getInst()->getParent() ||
            (EI.UserTE->getMainOp()->hasNUsesOrMore(UsesLimit) ||
             any_of(EI.UserTE->getMainOp()->users(), [&](User *U) {
               auto *IU = dyn_cast<Instruction>(U);
               if (!IU)
                 return true;
               return IU->getParent() == EI.UserTE->getMainOp()->getParent();
             }))))) {
        // If no user dependencies and the main op is in another block - make
        // the copyable data depend on itself to be scheduled correctly.
        CD->incDependencies();
        CD->incrementUnscheduledDeps(1);
      }
      return;
    }
    auto *BundleMember = cast<ScheduleData>(SE);
    if (BundleMember->hasValidDependencies())
      return;
    LLVM_DEBUG(dbgs() << "SLP:       update deps of " << *BundleMember
                      << "\n");
    BundleMember->initDependencies();
    BundleMember->resetUnscheduledDeps();
    // Handle def-use chain dependencies.
    SmallDenseMap<Value *, unsigned> UserToNumOps;
    for (User *U : BundleMember->getInst()->users()) {
      if (ScheduleData *UseSD = getScheduleData(U)) {
        unsigned &NumOps = UserToNumOps.try_emplace(U, 0).first->getSecond();
        ++NumOps;
        // Skip, if all the operands are replaced by copyable data.
        if (areAllOperandsReplacedByCopyableData(cast<Instruction>(U),
                                                 BundleMember->getInst(), *SLP,
                                                 NumOps))
          continue;
        BundleMember->incDependencies();
        if (!UseSD->isScheduled())
          BundleMember->incrementUnscheduledDeps(1);
        if (!UseSD->hasValidDependencies() ||
            (InsertInReadyList && UseSD->isReady()))
          WorkList.push_back(UseSD);
      }
    }
    for (ScheduleCopyableData *UseSD :
         getScheduleCopyableDataUsers(BundleMember->getInst())) {
      BundleMember->incDependencies();
      if (!UseSD->isScheduled())
        BundleMember->incrementUnscheduledDeps(1);
      if (!UseSD->hasValidDependencies() ||
          (InsertInReadyList && UseSD->isReady()))
        WorkList.push_back(UseSD);
    }

    SmallPtrSet<const Instruction *, 4> Visited;
    auto MakeControlDependent = [&](Instruction *I) {
      if (!Visited.insert(I).second)
        return;
      auto *DepDest = getScheduleData(I);
      assert(DepDest && "must be in schedule window");
      DepDest->addControlDependency(BundleMember);
      BundleMember->incDependencies();
      if (!DepDest->isScheduled())
        BundleMember->incrementUnscheduledDeps(1);
      if (!DepDest->hasValidDependencies() ||
          (InsertInReadyList && DepDest->isReady()))
        WorkList.push_back(DepDest);
    };

    // Any instruction which isn't safe to speculate at the beginning of the
    // block is control dependent on any early exit or non-willreturn call
    // which proceeds it.
    if (!isGuaranteedToTransferExecutionToSuccessor(BundleMember->getInst())) {
      for (Instruction *I = BundleMember->getInst()->getNextNode();
           I != ScheduleEnd; I = I->getNextNode()) {
        if (isSafeToSpeculativelyExecute(I))
          continue;
        MakeControlDependent(I);
        if (!isGuaranteedToTransferExecutionToSuccessor(I))
          // Everything past here must be control dependent on I.
          break;
      }
    }

    if (RegionHasStackSave) {
      // If we have an inalloca alloca instruction, it needs to be scheduled
      // after any preceding stacksave. We also need to prevent any alloca
      // from reordering above a preceding stackrestore.
      if (match(BundleMember->getInst(),
                m_Intrinsic<Intrinsic::stacksave>()) ||
          match(BundleMember->getInst(),
                m_Intrinsic<Intrinsic::stackrestore>())) {
        for (Instruction *I = BundleMember->getInst()->getNextNode();
             I != ScheduleEnd; I = I->getNextNode()) {
          if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
              match(I, m_Intrinsic<Intrinsic::stackrestore>()))
            break;
          if (!isa<AllocaInst>(I))
            continue;
          MakeControlDependent(I);
        }
      }

      // We also need to prevent allocas and loads/stores from moving below a
      // stacksave or a stackrestore.
      if (isa<AllocaInst>(BundleMember->getInst()) ||
          BundleMember->getInst()->mayReadOrWriteMemory()) {
        for (Instruction *I = BundleMember->getInst()->getNextNode();
             I != ScheduleEnd; I = I->getNextNode()) {
          if (!match(I, m_Intrinsic<Intrinsic::stacksave>()) &&
              !match(I, m_Intrinsic<Intrinsic::stackrestore>()))
            continue;
          MakeControlDependent(I);
          break;
        }
      }
    }

    // Handle the memory dependencies (if any).
    ScheduleData *NextLoadStore = BundleMember->getNextLoadStore();
    if (!NextLoadStore)
      return;
    Instruction *SrcInst = BundleMember->getInst();
    assert(SrcInst->mayReadOrWriteMemory() &&
           "NextLoadStore list for non memory effecting bundle?");
    MemoryLocation SrcLoc = getLocation(SrcInst);
    bool SrcMayWrite = SrcInst->mayWriteToMemory();
    unsigned NumAliased = 0;
    unsigned DistToSrc = 1;
    bool IsNonSimpleSrc = !SrcLoc.Ptr || !isSimple(SrcInst);

    for (ScheduleData *DepDest = NextLoadStore; DepDest;
         DepDest = DepDest->getNextLoadStore()) {
      assert(isInSchedulingRegion(*DepDest) && "Expected to be in region");

      // We have two limits to reduce the complexity:
      // 1) AliasedCheckLimit: it's a small limit to reduce calls to
      //    SLP->isAliased (which is the expensive part in this loop).
      // 2) MaxMemDepDistance: it's for very large blocks and it aborts
      //    the whole loop (this is not too expensive).
      if (DistToSrc >= MaxMemDepDistance ||
          ((SrcMayWrite || DepDest->getInst()->mayWriteToMemory()) &&
           (IsNonSimpleSrc || NumAliased >= AliasedCheckLimit ||
            SLP->isAliased(SrcLoc, SrcInst, DepDest->getInst())))) {
        // We increment the counter only if the locations are aliased
        // (instead of counting all alias checks). This gives a better
        // balance between reduced runtime and accurate dependencies.
        ++NumAliased;

        DepDest->addMemoryDependency(BundleMember);
        BundleMember->incDependencies();
        if (!DepDest->isScheduled())
          BundleMember->incrementUnscheduledDeps(1);
        if (!DepDest->hasValidDependencies() ||
            (InsertInReadyList && DepDest->isReady()))
          WorkList.push_back(DepDest);
      }
      ++DistToSrc;
    }
  };

  assert((Bundle || !ControlDeps.empty()) &&
         "expected at least one instruction to schedule");
  if (Bundle)
    WorkList.push_back(Bundle.getBundle().front());
  WorkList.append(ControlDeps.begin(), ControlDeps.end());
  SmallPtrSet<ScheduleBundle *, 16> Visited;
  while (!WorkList.empty()) {
    ScheduleEntity *SE = WorkList.pop_back_val();
    SmallVector<ScheduleBundle *> CopyableBundle;
    ArrayRef<ScheduleBundle *> Bundles;
    if (auto *CD = dyn_cast<ScheduleCopyableData>(SE)) {
      CopyableBundle.push_back(&CD->getBundle());
      Bundles = CopyableBundle;
    } else {
      Bundles = getScheduleBundles(cast<ScheduleData>(SE)->getInst());
    }
    if (Bundles.empty()) {
      auto *SD = cast<ScheduleData>(SE);
      if (!SD->hasValidDependencies())
        ProcessNode(SD);
      if (InsertInReadyList && SD->isReady()) {
        ReadyInsts.insert(SD);
        LLVM_DEBUG(dbgs() << "SLP:     gets ready on update: " << *SD << "\n");
      }
      continue;
    }
    for (ScheduleBundle *Bundle : Bundles) {
      if (Bundle->hasValidDependencies() || !Visited.insert(Bundle).second)
        continue;
      assert(isInSchedulingRegion(*Bundle) &&
             "ScheduleData not in scheduling region");
      for_each(Bundle->getBundle(), ProcessNode);
    }
    if (InsertInReadyList && SE->isReady()) {
      for (ScheduleBundle *Bundle : Bundles) {
        assert(isInSchedulingRegion(*Bundle) &&
               "ScheduleData not in scheduling region");
        if (!Bundle->isReady())
          continue;
        ReadyInsts.insert(Bundle);
        LLVM_DEBUG(dbgs() << "SLP:     gets ready on update: " << *Bundle
                          << "\n");
      }
    }
  }
}
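
// Worked micro-example (illustrative only): for
//   store i32 %x, ptr %p
//   %y = load i32, ptr %q
// with %p and %q possibly aliasing, the store's ScheduleData reaches the load
// through the next-load-store chain built in initScheduleData(),
// SLP->isAliased() reports a possible conflict, and addMemoryDependency()
// makes the load depend on the store, so scheduling can never hoist the load
// above it.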
void BoUpSLP::BlockScheduling::resetSchedule() {
  assert(ScheduleStart &&
         "tried to reset schedule on block which has not been scheduled");
  for_each(ScheduleDataMap, [&](auto &P) {
    if (BB != P.first->getParent())
      return;
    ScheduleData *SD = P.second;
    if (isInSchedulingRegion(*SD)) {
      SD->setScheduled(/*Scheduled=*/false);
      SD->resetUnscheduledDeps();
    }
  });
  for_each(ScheduleCopyableDataMapByInst, [&](auto &P) {
    for_each(P.second, [&](ScheduleCopyableData *SD) {
      if (isInSchedulingRegion(*SD)) {
        SD->setScheduled(false);
        SD->resetUnscheduledDeps();
      }
    });
  });
  for_each(ScheduledBundles, [&](auto &P) {
    for_each(P.second, [&](ScheduleBundle *Bundle) {
      if (isInSchedulingRegion(*Bundle))
        Bundle->setScheduled(false);
    });
  });
  for (auto &P : ScheduleCopyableDataMap) {
    if (isInSchedulingRegion(*P.second)) {
      P.second->setScheduled(false);
      P.second->resetUnscheduledDeps();
    }
  }
  ReadyInsts.clear();
}
void BoUpSLP::scheduleBlock(const BoUpSLP &R, BlockScheduling *BS) {
  if (!BS->ScheduleStart)
    return;

  LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");

  // A key point - if we got here, pre-scheduling was able to find a valid
  // scheduling of the sub-graph of the scheduling window which consists of
  // all vector bundles and their transitive users. As such, we do not need to
  // reschedule anything *outside of* that subgraph.
  BS->resetSchedule();

  // For the real scheduling we use a more sophisticated ready-list: it is
  // sorted by the original instruction location. This lets the final schedule
  // be as close as possible to the original instruction order.
  struct ScheduleDataCompare {
    bool operator()(const ScheduleEntity *SD1,
                    const ScheduleEntity *SD2) const {
      return SD2->getSchedulingPriority() < SD1->getSchedulingPriority();
    }
  };
  std::set<ScheduleEntity *, ScheduleDataCompare> ReadyInsts;

  // Ensure that all dependency data is updated (for nodes in the sub-graph)
  // and fill the ready-list with initial instructions.
  int Idx = 0;
  for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
       I = I->getNextNode()) {
    ArrayRef<ScheduleBundle *> Bundles = BS->getScheduleBundles(I);
    if (!Bundles.empty()) {
      for (ScheduleBundle *Bundle : Bundles) {
        Bundle->setSchedulingPriority(Idx++);
        if (!Bundle->hasValidDependencies())
          BS->calculateDependencies(*Bundle, /*InsertInReadyList=*/false,
                                    this);
      }
      SmallVector<ScheduleCopyableData *> SDs = BS->getScheduleCopyableData(I);
      for (ScheduleCopyableData *SD : reverse(SDs)) {
        ScheduleBundle &Bundle = SD->getBundle();
        Bundle.setSchedulingPriority(Idx++);
        if (!Bundle.hasValidDependencies())
          BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this);
      }
      continue;
    }
    SmallVector<ScheduleCopyableData *> CopyableData =
        BS->getScheduleCopyableDataUsers(I);
    if (ScheduleData *SD = BS->getScheduleData(I)) {
      [[maybe_unused]] ArrayRef<TreeEntry *> SDTEs = getTreeEntries(I);
      assert((isVectorLikeInstWithConstOps(SD->getInst()) || SDTEs.empty() ||
              SDTEs.front()->doesNotNeedToSchedule() ||
              doesNotNeedToBeScheduled(SD->getInst())) &&
             "scheduler and vectorizer bundle mismatch");
      SD->setSchedulingPriority(Idx++);
      if (!SD->hasValidDependencies() &&
          (!CopyableData.empty() ||
           any_of(R.ValueToGatherNodes.lookup(I), [&](const TreeEntry *TE) {
             assert(TE->isGather() && "expected gather node");
             return TE->hasState() && TE->hasCopyableElements() &&
                    TE->isCopyableElement(I);
           }))) {
        // Need to update the deps for the copyable element to avoid incorrect
        // scheduling.
        ScheduleBundle Bundle;
        Bundle.add(SD);
        BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this);
      }
    }
    for (ScheduleCopyableData *SD : reverse(CopyableData)) {
      ScheduleBundle &Bundle = SD->getBundle();
      Bundle.setSchedulingPriority(Idx++);
      if (!Bundle.hasValidDependencies())
        BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this);
    }
  }
  BS->initialFillReadyList(ReadyInsts);

  Instruction *LastScheduledInst = BS->ScheduleEnd;

  // Do the "real" scheduling.
  SmallPtrSet<Instruction *, 16> Scheduled;
  while (!ReadyInsts.empty()) {
    auto *Picked = *ReadyInsts.begin();
    ReadyInsts.erase(ReadyInsts.begin());

    // Move the scheduled instruction(s) to their dedicated places, if not
    // there yet.
    if (auto *Bundle = dyn_cast<ScheduleBundle>(Picked)) {
      for (const ScheduleEntity *BundleMember : Bundle->getBundle()) {
        Instruction *PickedInst = BundleMember->getInst();
        bool IsCopyable =
            Bundle->getTreeEntry()->isCopyableElement(PickedInst);
        if ((IsCopyable && BS->getScheduleData(PickedInst)) ||
            (!IsCopyable && !Scheduled.insert(PickedInst).second))
          continue;
        if (PickedInst->getNextNode() != LastScheduledInst)
          PickedInst->moveBefore(LastScheduledInst->getIterator());
        LastScheduledInst = PickedInst;
      }
      EntryToLastInstruction.try_emplace(Bundle->getTreeEntry(),
                                         LastScheduledInst);
    } else {
      auto *SD = cast<ScheduleData>(Picked);
      Instruction *PickedInst = SD->getInst();
      if (PickedInst->getNextNode() != LastScheduledInst)
        PickedInst->moveBefore(LastScheduledInst->getIterator());
      LastScheduledInst = PickedInst;
    }
    auto Invalid = InstructionsState::invalid();
    BS->schedule(*this, Invalid, EdgeInfo(), Picked, ReadyInsts);
  }

  // Check that we didn't break any of our invariants.
#ifdef EXPENSIVE_CHECKS
  BS->verify();
#endif

#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
  // Check that all schedulable entities got scheduled.
  for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
       I = I->getNextNode()) {
    ArrayRef<ScheduleBundle *> Bundles = BS->getScheduleBundles(I);
    assert(all_of(Bundles,
                  [](const ScheduleBundle *Bundle) {
                    return Bundle->isScheduled();
                  }) &&
           "must be scheduled at this point");
  }
#endif

  // Avoid duplicate scheduling of the block.
  BS->ScheduleStart = nullptr;
}
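
// The priority scheme above simply numbers entities in original instruction
// order (Idx++), and ScheduleDataCompare orders the ready set by that number,
// so ties are broken toward source order and the final schedule stays as
// close as possible to the original code. If changing this order causes a
// correctness issue, a dependence edge is missing in the schedule graph.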
21818 return DL->getTypeSizeInBits(Store->getValueOperand()->getType());
21823 auto E = InstrElementSize.find(V);
21824 if (E != InstrElementSize.end())
21841 Value *FirstNonBool =
nullptr;
21842 while (!Worklist.
empty()) {
21847 auto *Ty =
I->getType();
21850 if (Ty != Builder.getInt1Ty() && !FirstNonBool)
21858 Width = std::max<unsigned>(Width, DL->getTypeSizeInBits(Ty));
21866 for (
Use &U :
I->operands()) {
21868 if (Visited.
insert(J).second &&
21874 FirstNonBool = U.get();
21885 if (V->getType() == Builder.getInt1Ty() && FirstNonBool)
21887 Width = DL->getTypeSizeInBits(V->getType());
21891 InstrElementSize[
I] = Width;
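
// Example of the intent (illustrative, not from a test): for
//   %l = load i16, ptr %p
//   %e = sext i16 %l to i64
//   %u = add i64 %e, 1
// asking for the element size of %u walks down to the i16 load and prefers
// 16 bits, since the width of the memory operation is a better basis for
// picking a vectorization factor than %u's own i64 type.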
bool BoUpSLP::collectValuesToDemote(
    const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
    SmallVectorImpl<unsigned> &ToDemote, DenseSet<const TreeEntry *> &Visited,
    const SmallDenseSet<unsigned, 8> &NodesToKeepBWs, unsigned &MaxDepthLevel,
    bool &IsProfitableToDemote, bool IsTruncRoot) const {
  // We can always demote constants.
  if (all_of(E.Scalars, IsaPred<Constant>))
    return true;

  unsigned OrigBitWidth =
      DL->getTypeSizeInBits(E.Scalars.front()->getType()->getScalarType());
  if (OrigBitWidth == BitWidth) {
    MaxDepthLevel = 1;
    return true;
  }

  // Check if the node was analyzed already and must be kept in the original
  // bitwidth.
  if (NodesToKeepBWs.contains(E.Idx))
    return false;

  bool IsSignedNode = any_of(E.Scalars, [&](Value *R) {
    if (isa<PoisonValue>(R))
      return false;
    return !isKnownNonNegative(R, SimplifyQuery(*DL));
  });
  auto IsPotentiallyTruncated = [&](Value *V, unsigned &BitWidth) -> bool {
    if (isa<PoisonValue>(V))
      return true;
    if (getTreeEntries(V).size() > 1)
      return false;
    bool IsSignedVal = !isKnownNonNegative(V, SimplifyQuery(*DL));
    unsigned NumSignBits = ComputeNumSignBits(V, *DL, AC, nullptr, DT);
    if ((!IsSignedNode || IsSignedVal) && OrigBitWidth > BitWidth) {
      APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
      if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)))
        return true;
    }
    unsigned BitWidth1 = OrigBitWidth - NumSignBits;
    if (IsSignedNode)
      ++BitWidth1;
    if (auto *I = dyn_cast<Instruction>(V)) {
      APInt Mask = DB->getDemandedBits(I);
      unsigned BitWidth2 =
          std::max<unsigned>(1, Mask.getBitWidth() - Mask.countl_zero());
      while (!IsSignedNode && BitWidth2 < OrigBitWidth) {
        APInt Mask2 = APInt::getBitsSetFrom(OrigBitWidth, BitWidth2 - 1);
        if (MaskedValueIsZero(V, Mask2, SimplifyQuery(*DL)))
          break;
        BitWidth2 *= 2;
      }
      BitWidth1 = std::min(BitWidth1, BitWidth2);
    }
    BitWidth = std::max(BitWidth, BitWidth1);
    return BitWidth > 0 && OrigBitWidth / BitWidth > 1;
  };
  using namespace std::placeholders;
  auto FinalAnalysis = [&, TTI = TTI]() {
    if (!IsProfitableToDemote)
      return false;
    bool Res = all_of(
        E.Scalars, std::bind(IsPotentiallyTruncated, _1, std::ref(BitWidth)));
    // Demote gathers.
    if (Res && E.isGather()) {
      if (E.hasState()) {
        if (const TreeEntry *SameTE =
                getSameValuesTreeEntry(E.getMainOp(), E.Scalars))
          if (collectValuesToDemote(*SameTE, IsProfitableToDemoteRoot,
                                    BitWidth, ToDemote, Visited,
                                    NodesToKeepBWs, MaxDepthLevel,
                                    IsProfitableToDemote, IsTruncRoot)) {
            ToDemote.push_back(E.Idx);
            return true;
          }
      }
      // Check possible extractelement instructions bases and final vector
      // length.
      SmallPtrSet<Value *, 4> UniqueBases;
      for (Value *V : E.Scalars) {
        auto *EE = dyn_cast<ExtractElementInst>(V);
        if (!EE)
          continue;
        UniqueBases.insert(EE->getVectorOperand());
      }
      const unsigned VF = E.Scalars.size();
      Type *OrigScalarTy = E.Scalars.front()->getType();
      if (UniqueBases.size() <= 2 ||
          ::getNumberOfParts(*TTI, getWidenedType(OrigScalarTy, VF)) ==
              ::getNumberOfParts(
                  *TTI,
                  getWidenedType(
                      IntegerType::get(OrigScalarTy->getContext(), BitWidth),
                      VF)))
        ToDemote.push_back(E.Idx);
    }
    return Res;
  };
  if (E.isGather() || !Visited.insert(&E).second ||
      any_of(E.Scalars, [&](Value *V) {
        return !isa<Constant>(V) && all_of(V->users(), [&](User *U) {
          return isa<InsertElementInst>(U) && !isVectorized(U);
        });
      }))
    return FinalAnalysis();

  if (any_of(E.Scalars, [&](Value *V) {
        return !isa<Constant>(V) && !all_of(V->users(), [=](User *U) {
          return isVectorized(U) ||
                 (E.Idx == 0 && UserIgnoreList &&
                  UserIgnoreList->contains(U)) ||
                 (!isa<CmpInst>(U) && U->getType()->isSized() &&
                  !U->getType()->isScalableTy() &&
                  DL->getTypeSizeInBits(U->getType()) <= BitWidth);
        }) && !IsPotentiallyTruncated(V, BitWidth);
      }))
    return false;

  auto ProcessOperands = [&](ArrayRef<const TreeEntry *> Operands,
                             bool &NeedToExit) {
    NeedToExit = false;
    unsigned InitLevel = MaxDepthLevel;
    for (const TreeEntry *Op : Operands) {
      unsigned Level = InitLevel;
      if (!collectValuesToDemote(*Op, IsProfitableToDemoteRoot, BitWidth,
                                 ToDemote, Visited, NodesToKeepBWs, Level,
                                 IsProfitableToDemote, IsTruncRoot)) {
        if (!IsProfitableToDemote)
          return false;
        NeedToExit = true;
        if (!FinalAnalysis())
          return false;
        continue;
      }
      MaxDepthLevel = std::max(MaxDepthLevel, Level);
    }
    return true;
  };
  auto AttemptCheckBitwidth =
      [&](function_ref<bool(unsigned, unsigned)> Checker, bool &NeedToExit) {
        // Try all bitwidths < OrigBitWidth.
        NeedToExit = false;
        unsigned BestFailBitwidth = 0;
        for (; BitWidth < OrigBitWidth; BitWidth *= 2) {
          if (Checker(BitWidth, OrigBitWidth))
            return true;
          if (BestFailBitwidth == 0 && FinalAnalysis())
            BestFailBitwidth = BitWidth;
        }
        if (BestFailBitwidth == 0) {
          BitWidth = OrigBitWidth;
          return false;
        }
        MaxDepthLevel = 1;
        BitWidth = BestFailBitwidth;
        NeedToExit = true;
        return true;
      };
  auto TryProcessInstruction =
      [&](unsigned &BitWidth, ArrayRef<const TreeEntry *> Operands = {},
          function_ref<bool(unsigned, unsigned)> Checker = {}) {
        if (Operands.empty()) {
          if (!IsTruncRoot)
            MaxDepthLevel = 1;
          for (Value *V : E.Scalars)
            (void)IsPotentiallyTruncated(V, BitWidth);
        } else {
          // Several vectorized uses? Check if we can truncate it, otherwise -
          // exit.
          if (E.UserTreeIndex && any_of(E.Scalars, [&](Value *V) {
                return !V->hasOneUse() && !IsPotentiallyTruncated(V, BitWidth);
              }))
            return false;
          bool NeedToExit = false;
          if (Checker && !AttemptCheckBitwidth(Checker, NeedToExit))
            return false;
          if (NeedToExit)
            return true;
          if (!ProcessOperands(Operands, NeedToExit))
            return false;
          if (NeedToExit)
            return true;
        }

        ++MaxDepthLevel;
        // Record the entry that we can demote.
        ToDemote.push_back(E.Idx);
        return IsProfitableToDemote;
      };

  if (E.State == TreeEntry::SplitVectorize)
    return TryProcessInstruction(
        BitWidth,
        {VectorizableTree[E.CombinedEntriesWithIndices.front().first].get(),
         VectorizableTree[E.CombinedEntriesWithIndices.back().first].get()});

  switch (E.getOpcode()) {

  // We can always demote truncations and extensions. Since truncations can
  // seed additional demotion, we save the truncated value.
  case Instruction::Trunc:
    if (IsProfitableToDemoteRoot)
      IsProfitableToDemote = true;
    return TryProcessInstruction(BitWidth);
  case Instruction::ZExt:
  case Instruction::SExt:
    if (E.UserTreeIndex.UserTE && E.UserTreeIndex.UserTE->hasState() &&
        E.UserTreeIndex.UserTE->getOpcode() == Instruction::BitCast &&
        E.UserTreeIndex.UserTE->getMainOp()->getType()->isFPOrFPVectorTy())
      return false;
    IsProfitableToDemote = true;
    return TryProcessInstruction(BitWidth);

  // We can demote certain binary operations if we can demote both of their
  // operands.
  case Instruction::Add:
  case Instruction::Sub:
  case Instruction::Mul:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)});
  }
  case Instruction::Freeze:
    return TryProcessInstruction(BitWidth, getOperandEntry(&E, 0));
  case Instruction::Shl: {
    // If we are truncating the result of the SHL, and if it's a shift of a
    // constant amount, we can always perform a SHL in a smaller type.
    auto ShlChecker = [&](unsigned BitWidth, unsigned) {
      return all_of(E.Scalars, [&](Value *V) {
        if (isa<PoisonValue>(V))
          return true;
        if (E.isCopyableElement(V))
          return true;
        auto *I = cast<Instruction>(V);
        KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
        return AmtKnownBits.getMaxValue().ult(BitWidth);
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
        ShlChecker);
  }
  case Instruction::LShr: {
    // We can truncate a logical shr to a smaller lshr iff we know that the
    // bits we would otherwise be shifting in are already zeros.
    auto LShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(E.Scalars, [&](Value *V) {
        if (isa<PoisonValue>(V))
          return true;
        APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
        if (E.isCopyableElement(V))
          return MaskedValueIsZero(V, ShiftedBits, SimplifyQuery(*DL));
        auto *I = cast<Instruction>(V);
        KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
        return AmtKnownBits.getMaxValue().ult(BitWidth) &&
               MaskedValueIsZero(I->getOperand(0), ShiftedBits,
                                 SimplifyQuery(*DL));
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
        LShrChecker);
  }
  case Instruction::AShr: {
    // We can truncate an arithmetic shr to a smaller ashr iff we know that all
    // the bits from the sign bit of the original type down to the sign bit of
    // the truncated type are identical.
    auto AShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(E.Scalars, [&](Value *V) {
        if (isa<PoisonValue>(V))
          return true;
        auto *I = cast<Instruction>(V);
        KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
        unsigned ShiftedBits = OrigBitWidth - BitWidth;
        return AmtKnownBits.getMaxValue().ult(BitWidth) &&
               ShiftedBits <
                   ComputeNumSignBits(I->getOperand(0), *DL, AC, nullptr, DT);
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
        AShrChecker);
  }
  case Instruction::UDiv:
  case Instruction::URem: {
    // UDiv and URem can be truncated if all the truncated bits are zero.
    auto Checker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
      return all_of(E.Scalars, [&](Value *V) {
        APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
        if (E.hasCopyableElements() && E.isCopyableElement(V))
          return MaskedValueIsZero(V, Mask, SimplifyQuery(*DL));
        auto *I = cast<Instruction>(V);
        return MaskedValueIsZero(I->getOperand(0), Mask,
                                 SimplifyQuery(*DL)) &&
               MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, Checker);
  }

  // We can demote selects if we can demote their true and false values.
  case Instruction::Select: {
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 1), getOperandEntry(&E, 2)});
  }

  // We can demote phis if we can demote all their incoming operands.
  case Instruction::PHI: {
    const unsigned NumOps = E.getNumOperands();
    SmallVector<const TreeEntry *> Ops(NumOps);
    transform(seq<unsigned>(0, NumOps), Ops.begin(),
              [&](unsigned Idx) { return getOperandEntry(&E, Idx); });
    return TryProcessInstruction(BitWidth, Ops);
  }

  case Instruction::Call: {
    auto *IC = dyn_cast<IntrinsicInst>(E.getMainOp());
    if (!IC)
      break;
    Intrinsic::ID ID = getVectorIntrinsicIDForCall(IC, TLI);
    if (ID != Intrinsic::abs && ID != Intrinsic::smin &&
        ID != Intrinsic::smax && ID != Intrinsic::umin &&
        ID != Intrinsic::umax)
      break;
    SmallVector<const TreeEntry *, 2> Operands(1, getOperandEntry(&E, 0));
    function_ref<bool(unsigned, unsigned)> CallChecker;
    auto CompChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
      return all_of(E.Scalars, [&](Value *V) {
        auto *I = cast<Instruction>(V);
        if (ID == Intrinsic::umin || ID == Intrinsic::umax) {
          APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
          return MaskedValueIsZero(I->getOperand(0), Mask,
                                   SimplifyQuery(*DL)) &&
                 MaskedValueIsZero(I->getOperand(1), Mask,
                                   SimplifyQuery(*DL));
        }
        assert((ID == Intrinsic::smin || ID == Intrinsic::smax) &&
               "Expected min/max intrinsics only.");
        unsigned SignBits = OrigBitWidth - BitWidth;
        APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
        unsigned Op0SignBits =
            ComputeNumSignBits(I->getOperand(0), *DL, AC, nullptr, DT);
        unsigned Op1SignBits =
            ComputeNumSignBits(I->getOperand(1), *DL, AC, nullptr, DT);
        return SignBits <= Op0SignBits &&
               ((SignBits != Op0SignBits &&
                 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
                MaskedValueIsZero(I->getOperand(0), Mask,
                                  SimplifyQuery(*DL))) &&
               SignBits <= Op1SignBits &&
               ((SignBits != Op1SignBits &&
                 !isKnownNonNegative(I->getOperand(1), SimplifyQuery(*DL))) ||
                MaskedValueIsZero(I->getOperand(1), Mask,
                                  SimplifyQuery(*DL)));
      });
    };
    auto AbsChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
      return all_of(E.Scalars, [&](Value *V) {
        auto *I = cast<Instruction>(V);
        unsigned SignBits = OrigBitWidth - BitWidth;
        APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
        unsigned Op0SignBits =
            ComputeNumSignBits(I->getOperand(0), *DL, AC, nullptr, DT);
        return SignBits <= Op0SignBits &&
               ((SignBits != Op0SignBits &&
                 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
                MaskedValueIsZero(I->getOperand(0), Mask,
                                  SimplifyQuery(*DL)));
      });
    };
    if (ID != Intrinsic::abs) {
      Operands.push_back(getOperandEntry(&E, 1));
      CallChecker = CompChecker;
    } else {
      CallChecker = AbsChecker;
    }
    InstructionCost BestCost =
        std::numeric_limits<InstructionCost::CostType>::max();
    unsigned BestBitWidth = BitWidth;
    unsigned VF = E.Scalars.size();
    // Choose the best bitwidth based on cost estimations.
    auto Checker = [&](unsigned BitWidth, unsigned) {
      unsigned MinBW = PowerOf2Ceil(BitWidth);
      SmallVector<Type *> ArgTys =
          buildIntrinsicArgTypes(IC, ID, VF, MinBW, TTI);
      auto VecCallCosts = getVectorCallCosts(
          IC, getWidenedType(IntegerType::get(IC->getContext(), MinBW), VF),
          TTI, TLI, ArgTys);
      InstructionCost Cost = std::min(VecCallCosts.first, VecCallCosts.second);
      if (Cost < BestCost) {
        BestCost = Cost;
        BestBitWidth = BitWidth;
      }
      return false;
    };
    [[maybe_unused]] bool NeedToExit;
    (void)AttemptCheckBitwidth(Checker, NeedToExit);
    BitWidth = BestBitWidth;
    return TryProcessInstruction(BitWidth, Operands, CallChecker);
  }

  // Otherwise, conservatively give up.
  default:
    break;
  }
  MaxDepthLevel = 1;
  return FinalAnalysis();
}
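
// Demotion in a nutshell (illustrative sketch): in
//   %a = zext i8 %x to i32
//   %b = zext i8 %y to i32
//   %c = add i32 %a, %b
//   %t = trunc i32 %c to i8
// only 8 meaningful bits flow through the add, so the expression can be
// demoted and vectorized as <N x i8>, with the per-opcode checkers above
// (ShlChecker, LShrChecker, ...) proving that no demanded bits are lost.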
void BoUpSLP::computeMinimumValueSizes() {
  // We only attempt to truncate integer expressions.
  bool IsStoreOrInsertElt =
      VectorizableTree.front()->hasState() &&
      (VectorizableTree.front()->getOpcode() == Instruction::Store ||
       VectorizableTree.front()->getOpcode() == Instruction::InsertElement);
  if ((IsStoreOrInsertElt || UserIgnoreList) &&
      ExtraBitWidthNodes.size() <= 1 &&
      (!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 ||
       CastMaxMinBWSizes->first / CastMaxMinBWSizes->second <= 2))
    return;

  unsigned NodeIdx = 0;
  if (IsStoreOrInsertElt && !VectorizableTree.front()->isGather())
    NodeIdx = 1;

  // Ensure the roots of the vectorizable tree don't form a cycle.
  assert((VectorizableTree[NodeIdx]->isGather() || NodeIdx != 0 ||
          !VectorizableTree[NodeIdx]->UserTreeIndex) &&
         "Unexpected tree is graph.");

  // The first value node for store/insertelement is sext/zext/trunc? Skip it,
  // resize to the final type.
  bool IsTruncRoot = false;
  bool IsProfitableToDemoteRoot = !IsStoreOrInsertElt;
  SmallVector<unsigned> RootDemotes;
  SmallDenseSet<unsigned, 8> NodesToKeepBWs;
  if (NodeIdx != 0 &&
      VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
      VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
    assert(IsStoreOrInsertElt && "Expected store/insertelement seeded graph.");
    IsTruncRoot = true;
    RootDemotes.push_back(NodeIdx);
    IsProfitableToDemoteRoot = true;
    ++NodeIdx;
  }

  // Analyzed the reduction already and not profitable - exit.
  if (AnalyzedMinBWVals.contains(VectorizableTree[NodeIdx]->Scalars.front()))
    return;

  SmallVector<unsigned> ToDemote;
  auto ComputeMaxBitWidth =
      [&](const TreeEntry &E, bool IsTopRoot, bool IsProfitableToDemoteRoot,
          unsigned Limit, bool IsTruncRoot, bool IsSignedCmp) -> unsigned {
    ToDemote.clear();
    // Check if the root is trunc and the next node is gather/buildvector,
    // then keep trunc in scalars, which is free in most cases.
    if (E.isGather() && IsTruncRoot && E.UserTreeIndex &&
        !NodesToKeepBWs.contains(E.Idx) &&
        E.Idx > (IsStoreOrInsertElt ? 2u : 1u) &&
        all_of(E.Scalars, [&](Value *V) {
          return V->hasOneUse() || isa<Constant>(V) ||
                 (!V->hasNUsesOrMore(UsesLimit) &&
                  none_of(V->users(), [&](User *U) {
                    ArrayRef<TreeEntry *> TEs = getTreeEntries(U);
                    const TreeEntry *UserTE = E.UserTreeIndex.UserTE;
                    if (TEs.empty() || is_contained(TEs, UserTE))
                      return false;
                    if (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
                             SelectInst>(U) ||
                        isa<SIToFPInst, UIToFPInst>(U) ||
                        (UserTE->hasState() &&
                         (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
                               SelectInst>(UserTE->getMainOp()) ||
                          isa<SIToFPInst, UIToFPInst>(UserTE->getMainOp()))))
                      return true;
                    unsigned UserTESz = DL->getTypeSizeInBits(
                        UserTE->Scalars.front()->getType());
                    if (all_of(TEs, [&](const TreeEntry *TE) {
                          auto It = MinBWs.find(TE);
                          return It != MinBWs.end() &&
                                 It->second.first > UserTESz;
                        }))
                      return true;
                    return DL->getTypeSizeInBits(U->getType()) > UserTESz;
                  }));
        })) {
      ToDemote.push_back(E.Idx);
      const TreeEntry *UserTE = E.UserTreeIndex.UserTE;
      auto It = MinBWs.find(UserTE);
      if (It != MinBWs.end())
        return It->second.first;
      unsigned MaxBitWidth =
          DL->getTypeSizeInBits(UserTE->Scalars.front()->getType());
      MaxBitWidth = bit_ceil(MaxBitWidth);
      if (MaxBitWidth < 8 && MaxBitWidth > 1)
        MaxBitWidth = 8;
      return MaxBitWidth;
    }

    if (!E.hasState())
      return 0u;

    unsigned VF = E.getVectorFactor();
    Type *ScalarTy = E.Scalars.front()->getType();
    auto *TreeRootIT = dyn_cast<IntegerType>(ScalarTy->getScalarType());
    if (!TreeRootIT)
      return 0u;

    if (any_of(E.Scalars,
               [&](Value *V) { return AnalyzedMinBWVals.contains(V); }))
      return 0u;

    unsigned NumParts =
        ::getNumberOfParts(*TTI, getWidenedType(TreeRootIT, VF));

    // The maximum bit width required to represent all the values that can be
    // demoted without loss of precision.
    unsigned MaxBitWidth = 1u;

    // True if the roots can be zero-extended back to their original type,
    // rather than sign-extended. We know that if the leading bits are not
    // demanded, we can safely zero-extend, so we initialize IsKnownPositive
    // to true.
    bool IsKnownPositive = !IsSignedCmp && all_of(E.Scalars, [&](Value *R) {
      if (isa<PoisonValue>(R))
        return true;
      KnownBits Known = computeKnownBits(R, *DL);
      return Known.isNonNegative();
    });

    if (!IsKnownPositive && !IsTopRoot && E.UserTreeIndex &&
        E.UserTreeIndex.UserTE->hasState() &&
        E.UserTreeIndex.UserTE->getOpcode() == Instruction::UIToFP)
      MaxBitWidth =
          std::min(DL->getTypeSizeInBits(
                       E.UserTreeIndex.UserTE->Scalars.front()->getType()),
                   DL->getTypeSizeInBits(ScalarTy));

    // We first check if all the bits of the roots are demanded. If they're
    // not, we can truncate the roots to this narrower type.
    for (Value *Root : E.Scalars) {
      if (isa<PoisonValue>(Root))
        continue;
      unsigned NumSignBits = ComputeNumSignBits(Root, *DL, AC, nullptr, DT);
      TypeSize NumTypeBits =
          DL->getTypeSizeInBits(Root->getType()->getScalarType());
      unsigned BitWidth1 = NumTypeBits - NumSignBits;
      // If we can't prove that the sign bit is zero, we must add one to the
      // maximum bit width to account for the unknown sign bit. This preserves
      // the existing sign bit so we can safely sign-extend the root back to
      // the original type.
      if (!IsKnownPositive)
        ++BitWidth1;

      auto *I = dyn_cast<Instruction>(Root);
      if (!I) {
        MaxBitWidth = std::max(BitWidth1, MaxBitWidth);
        continue;
      }
      APInt Mask = DB->getDemandedBits(I);
      unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
      MaxBitWidth =
          std::max<unsigned>(std::min(BitWidth1, BitWidth2), MaxBitWidth);
    }

    if (MaxBitWidth < 8 && MaxBitWidth > 1)
      MaxBitWidth = 8;

    // If the original type is large, but the reduced type does not improve
    // the number of registers used - ignore it.
    if (NumParts > 1 &&
        NumParts ==
            ::getNumberOfParts(
                *TTI, getWidenedType(IntegerType::get(F->getContext(),
                                                      bit_ceil(MaxBitWidth)),
                                     VF)))
      return 0u;

    unsigned Opcode = E.getOpcode();
    bool IsProfitableToDemote = Opcode == Instruction::Trunc ||
                                Opcode == Instruction::SExt ||
                                Opcode == Instruction::ZExt || NumParts > 1;
    // Conservatively determine if we can actually truncate the roots of the
    // expression. Collect the values that can be demoted in ToDemote.
    DenseSet<const TreeEntry *> Visited;
    unsigned MaxDepthLevel = IsTruncRoot ? Limit : 1;
    bool NeedToDemote = IsProfitableToDemote;

    if (!collectValuesToDemote(E, IsProfitableToDemoteRoot, MaxBitWidth,
                               ToDemote, Visited, NodesToKeepBWs,
                               MaxDepthLevel, NeedToDemote, IsTruncRoot) ||
        (MaxDepthLevel <= Limit &&
         !(((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
            (!IsTopRoot || !(IsStoreOrInsertElt || UserIgnoreList) ||
             DL->getTypeSizeInBits(TreeRootIT) /
                     DL->getTypeSizeInBits(
                         E.getMainOp()->getOperand(0)->getType()) >
                 2)))))
      return 0u;
    // Round MaxBitWidth up to the next power-of-two.
    MaxBitWidth = bit_ceil(MaxBitWidth);

    return MaxBitWidth;
  };

  // Add reduction ops sizes, if any.
  if (UserIgnoreList &&
      isa<IntegerType>(
          VectorizableTree.front()->Scalars.front()->getType())) {
    // Convert vector_reduce_add(ZExt(<n x i1>)) to
    // ZExtOrTrunc(ctpop(bitcast <n x i1> to in)).
    if (all_of(*UserIgnoreList,
               [](Value *V) {
                 return isa<PoisonValue>(V) ||
                        cast<Instruction>(V)->getOpcode() == Instruction::Add;
               }) &&
        VectorizableTree.front()->State == TreeEntry::Vectorize &&
        VectorizableTree.front()->getOpcode() == Instruction::ZExt &&
        cast<CastInst>(VectorizableTree.front()->getMainOp())->getSrcTy() ==
            Builder.getInt1Ty()) {
      ReductionBitWidth = 1;
    } else {
      for (Value *V : *UserIgnoreList) {
        if (isa<PoisonValue>(V))
          continue;
        unsigned NumSignBits = ComputeNumSignBits(V, *DL, AC, nullptr, DT);
        TypeSize NumTypeBits = DL->getTypeSizeInBits(V->getType());
        unsigned BitWidth1 = NumTypeBits - NumSignBits;
        if (!isKnownNonNegative(V, SimplifyQuery(*DL)))
          ++BitWidth1;
        unsigned BitWidth2 = BitWidth1;
        if (!RecurrenceDescriptor::isIntMinMaxRecurrenceKind(
                ::getRdxKind(V))) {
          APInt Mask = DB->getDemandedBits(cast<Instruction>(V));
          BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
        }
        ReductionBitWidth =
            std::max(std::min(BitWidth1, BitWidth2), ReductionBitWidth);
      }
      if (ReductionBitWidth < 8 && ReductionBitWidth > 1)
        ReductionBitWidth = 8;

      ReductionBitWidth = bit_ceil(ReductionBitWidth);
    }
  }
  bool IsTopRoot = NodeIdx == 0;
  while (NodeIdx < VectorizableTree.size() &&
         VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
         VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
    RootDemotes.push_back(NodeIdx);
    ++NodeIdx;
    IsTruncRoot = true;
  }
  bool IsSignedCmp = false;
  if (UserIgnoreList && all_of(*UserIgnoreList, [](Value *V) {
        return match(V, m_SMin(m_Value(), m_Value())) ||
               match(V, m_SMax(m_Value(), m_Value()));
      }))
    IsSignedCmp = true;
  while (NodeIdx < VectorizableTree.size()) {
    ArrayRef<Value *> TreeRoot = VectorizableTree[NodeIdx]->Scalars;
    unsigned Limit = 2;
    if (IsTopRoot &&
        ReductionBitWidth ==
            DL->getTypeSizeInBits(
                VectorizableTree.front()->Scalars.front()->getType()))
      Limit = 3;
    unsigned MaxBitWidth = ComputeMaxBitWidth(
        *VectorizableTree[NodeIdx], IsTopRoot, IsProfitableToDemoteRoot, Limit,
        IsTruncRoot, IsSignedCmp);
    if (ReductionBitWidth != 0 && (IsTopRoot || !RootDemotes.empty())) {
      if (MaxBitWidth != 0 && ReductionBitWidth < MaxBitWidth)
        ReductionBitWidth = bit_ceil(MaxBitWidth);
      else if (MaxBitWidth == 0)
        ReductionBitWidth = 0;
    }

    for (unsigned Idx : RootDemotes) {
      if (all_of(VectorizableTree[Idx]->Scalars, [&](Value *V) {
            uint32_t OrigBitWidth =
                DL->getTypeSizeInBits(V->getType()->getScalarType());
            if (OrigBitWidth > MaxBitWidth) {
              APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, MaxBitWidth);
              return MaskedValueIsZero(V, Mask, SimplifyQuery(*DL));
            }
            return false;
          }))
        ToDemote.push_back(Idx);
    }
    RootDemotes.clear();
    IsTopRoot = false;
    IsProfitableToDemoteRoot = true;

    if (ExtraBitWidthNodes.empty()) {
      NodeIdx = VectorizableTree.size();
    } else {
      unsigned NewIdx = 0;
      do {
        NewIdx = *ExtraBitWidthNodes.begin();
        ExtraBitWidthNodes.erase(ExtraBitWidthNodes.begin());
      } while (NewIdx <= NodeIdx && !ExtraBitWidthNodes.empty());
      NodeIdx = NewIdx;
      IsTruncRoot =
          NodeIdx < VectorizableTree.size() &&
          VectorizableTree[NodeIdx]->UserTreeIndex &&
          VectorizableTree[NodeIdx]->UserTreeIndex.EdgeIdx == 0 &&
          VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->hasState() &&
          VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->getOpcode() ==
              Instruction::Trunc &&
          !VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->isAltShuffle();
      IsSignedCmp =
          NodeIdx < VectorizableTree.size() &&
          VectorizableTree[NodeIdx]->UserTreeIndex &&
          VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->hasState() &&
          VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->getOpcode() ==
              Instruction::ICmp &&
          any_of(VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->Scalars,
                 [&](Value *V) {
                   auto *IC = dyn_cast<ICmpInst>(V);
                   return IC && (IC->isSigned() ||
                                 !isKnownNonNegative(IC->getOperand(0),
                                                     SimplifyQuery(*DL)) ||
                                 !isKnownNonNegative(IC->getOperand(1),
                                                     SimplifyQuery(*DL)));
                 });
    }

    // If the maximum bit width we compute is less than the width of the
    // roots' type, we can proceed with the narrowing. Otherwise, do nothing.
    if (MaxBitWidth == 0 ||
        MaxBitWidth >=
            cast<IntegerType>(TreeRoot.front()->getType()->getScalarType())
                ->getBitWidth()) {
      if (UserIgnoreList)
        AnalyzedMinBWVals.insert_range(TreeRoot);
      NodesToKeepBWs.insert(ToDemote.begin(), ToDemote.end());
      continue;
    }

    // Finally, map the values we can demote to the maximum bit width we
    // computed.
    for (unsigned Idx : ToDemote) {
      TreeEntry *TE = VectorizableTree[Idx].get();
      if (MinBWs.contains(TE))
        continue;
      bool IsSigned = any_of(TE->Scalars, [&](Value *R) {
        if (isa<PoisonValue>(R))
          return false;
        return !isKnownNonNegative(R, SimplifyQuery(*DL));
      });
      MinBWs.try_emplace(TE, MaxBitWidth, IsSigned);
    }
  }
}
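
// Net effect (sketch): MinBWs maps each demotable tree entry to a (narrower
// bit width, signedness) pair that codegen consults when emitting vector
// instructions, and ReductionBitWidth narrows reductions; e.g. a
// vector_reduce_add over zext'ed i1 values can be rewritten via ctpop of the
// bitcast mask, which is why that pattern is special-cased to width 1 above.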
22719 DL = &
F.getDataLayout();
22727 if (!
TTI->getNumberOfRegisters(
TTI->getRegisterClassForType(
true))) {
22729 dbgs() <<
"SLP: Didn't find any vector registers for target, abort.\n");
22734 if (
F.hasFnAttribute(Attribute::NoImplicitFloat))
22737 LLVM_DEBUG(
dbgs() <<
"SLP: Analyzing blocks in " <<
F.getName() <<
".\n");
22741 BoUpSLP R(&
F,
SE,
TTI,
TLI,
AA,
LI,
DT,
AC,
DB,
DL, ORE_);
22747 DT->updateDFSNumbers();
22750 for (
auto *BB :
post_order(&
F.getEntryBlock())) {
22755 R.clearReductionData();
22756 collectSeedInstructions(BB);
22759 if (!Stores.empty()) {
22761 <<
" underlying objects.\n");
22762 Changed |= vectorizeStoreChains(R);
22766 Changed |= vectorizeChainsInBlock(BB, R);
22771 if (!GEPs.empty()) {
22773 <<
" underlying objects.\n");
22774 Changed |= vectorizeGEPIndices(BB, R);
22779 R.optimizeGatherSequence();
std::optional<bool>
SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
                                       unsigned Idx, unsigned MinVF,
                                       unsigned &Size) {
  Size = 0;
  const unsigned Sz = R.getVectorElementSize(Chain[0]);
  unsigned VF = Chain.size();

  if (!has_single_bit(Sz) ||
      !hasFullVectorsOrPowerOf2(
          *TTI, cast<StoreInst>(Chain.front())->getValueOperand()->getType(),
          VF) ||
      VF < 2 || VF < MinVF) {
    // Check if vectorizing with a non-power-of-2 VF should be considered. At
    // the moment, only consider cases where VF + 1 is a power-of-2, i.e.
    // almost all vector lanes are used.
    if (!VectorizeNonPowerOf2 || (VF < MinVF && VF + 1 != MinVF))
      return false;
  }

  LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx
                    << "\n");

  SmallSetVector<Value *, 4> ValOps;
  for (Value *V : Chain)
    ValOps.insert(cast<StoreInst>(V)->getValueOperand());
  // Operands are not same/alt opcodes or non-power-of-2 uniques - exit.
  InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
  InstructionsState S = Analysis.buildInstructionsState(
      ValOps.getArrayRef(), R, /*TryCopyableElementsVectorization=*/true);
  bool IsAllowedSize =
      hasFullVectorsOrPowerOf2(*TTI, ValOps.front()->getType(),
                               ValOps.size()) ||
      (VectorizeNonPowerOf2 && has_single_bit(ValOps.size() + 1));
  if ((!IsAllowedSize && S && S.getOpcode() != Instruction::Load &&
       (!S.getMainOp()->isSafeToRemove() ||
        any_of(ValOps.getArrayRef(),
               [&](Value *V) {
                 return !isa<ExtractElementInst>(V) &&
                        (V->getNumUses() > Chain.size() ||
                         any_of(V->users(), [&](User *U) {
                           return !Stores.contains(U);
                         }));
               }))) ||
      (ValOps.size() > Chain.size() / 2 && !S)) {
    Size = (!IsAllowedSize && S) ? 1 : 2;
    return false;
  }
  if (R.isLoadCombineCandidate(Chain))
    return true;
  R.buildTree(Chain);
  // Check if tree tiny and store itself or its value is not vectorized.
  if (R.isTreeTinyAndNotFullyVectorizable()) {
    if (R.isGathered(Chain.front()) ||
        R.isNotScheduled(cast<StoreInst>(Chain.front())->getValueOperand()))
      return std::nullopt;
    Size = R.getCanonicalGraphSize();
    return false;
  }
  if (R.isProfitableToReorder()) {
    R.reorderTopToBottom();
    R.reorderBottomToTop();
  }
  R.transformNodes();
  R.buildExternalUses();

  R.computeMinimumValueSizes();

  Size = R.getCanonicalGraphSize();
  if (S && S.getOpcode() == Instruction::Load)
    Size = 2; // cut off masked gather small trees
  InstructionCost Cost = R.getTreeCost();

  LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF=" << VF
                    << "\n");
  if (Cost < -SLPCostThreshold) {
    LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost = " << Cost << "\n");

    using namespace ore;

    R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",
                                        cast<StoreInst>(Chain[0]))
                     << "Stores SLP vectorized with cost " << NV("Cost", Cost)
                     << " and with tree size "
                     << NV("TreeSize", R.getTreeSize()));

    R.vectorizeTree();
    return true;
  }

  return false;
}

/// Checks if the quadratic mean deviation is less than 90% of the mean size.
static bool checkTreeSizes(ArrayRef<std::pair<unsigned, unsigned>> Sizes,
                           bool First) {
  unsigned Num = 0;
  uint64_t Sum = std::accumulate(
      Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
      [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
        unsigned Size = First ? Val.first : Val.second;
        if (Size == 1)
          return V;
        ++Num;
        return V + Size;
      });
  if (Num == 0)
    return true;
  uint64_t Mean = Sum / Num;
  if (Mean == 0)
    return true;
  uint64_t Dev = std::accumulate(
                     Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
                     [&](uint64_t V,
                         const std::pair<unsigned, unsigned> &Val) {
                       unsigned P = First ? Val.first : Val.second;
                       if (P == 1)
                         return V;
                       return V + (P - Mean) * (P - Mean);
                     }) /
                 Num;
  return Dev * 96 / (Mean * Mean) == 0;
}
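
// The guard above accepts a set of tree sizes only when the relative variance
// is tiny: with Mean = Sum / Num and Dev = sum((P - Mean)^2) / Num, requiring
// Dev * 96 / (Mean * Mean) == 0 is an integer-arithmetic way of asking that
// the coefficient of variation sqrt(Dev) / Mean stays roughly below 10%
// (1 / sqrt(96) is approximately 0.102).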
namespace {
/// A group of stores that we'll try to bundle together using vector ops.
/// They are ordered using the signed distance of their address operand to the
/// address of this group's BaseInstr.
class RelatedStoreInsts {
public:
  RelatedStoreInsts(unsigned BaseInstrIdx, ArrayRef<StoreInst *> AllStores)
      : AllStores(AllStores) {
    reset(BaseInstrIdx);
  }

  void reset(unsigned NewBaseInstr) {
    assert(NewBaseInstr < AllStores.size() &&
           "Instruction index out of bounds");
    BaseInstrIdx = NewBaseInstr;
    Instrs.clear();
    insertOrLookup(NewBaseInstr, 0);
  }

  /// Tries to insert \p InstrIdx as the store with a pointer distance of
  /// \p PtrDist. Does nothing if there is already a store with that
  /// \p PtrDist.
  /// \returns The previously associated instruction index, or std::nullopt.
  std::optional<unsigned> insertOrLookup(unsigned InstrIdx,
                                         int64_t PtrDist) {
    auto [It, Inserted] = Instrs.emplace(PtrDist, InstrIdx);
    return Inserted ? std::nullopt : std::make_optional(It->second);
  }

  using DistToInstMap = std::map<int64_t, unsigned>;
  const DistToInstMap &getStores() const { return Instrs; }

  /// If \p SI is related to this group of stores, return the distance of its
  /// pointer operand to the one of the group's BaseInstr.
  std::optional<int64_t> getPointerDiff(StoreInst &SI, const DataLayout &DL,
                                        ScalarEvolution &SE) const {
    StoreInst &BaseStore = *AllStores[BaseInstrIdx];
    return getPointersDiff(
        BaseStore.getValueOperand()->getType(), BaseStore.getPointerOperand(),
        SI.getValueOperand()->getType(), SI.getPointerOperand(), DL, SE,
        /*StrictCheck=*/true);
  }

  /// Recompute the pointer distances to be based on \p NewBaseInstIdx.
  /// Stores whose index is less than \p MinSafeIdx will be dropped.
  void rebase(unsigned MinSafeIdx, unsigned NewBaseInstIdx,
              int64_t DistFromCurBase) {
    DistToInstMap PrevSet = std::move(Instrs);
    reset(NewBaseInstIdx);

    // Re-insert stores that come after MinSafeIdx to try and vectorize them
    // again. Their distance will be "rebased" to use NewBaseInstIdx as
    // reference.
    for (auto [Dist, InstIdx] : PrevSet) {
      if (InstIdx >= MinSafeIdx)
        insertOrLookup(InstIdx, Dist - DistFromCurBase);
    }
  }

  /// Remove all stores that have been vectorized from this group.
  void clearVectorizedStores(
      const DenseSet<Instruction *> &VectorizedStores) {
    DistToInstMap::reverse_iterator LastVectorizedStore = find_if(
        reverse(Instrs), [&](const std::pair<int64_t, unsigned> &DistAndIdx) {
          return VectorizedStores.contains(AllStores[DistAndIdx.second]);
        });

    // Get a forward iterator pointing after the last vectorized store and
    // erase all stores before it so we don't try to vectorize them again.
    DistToInstMap::iterator VectorizedStoresEnd = LastVectorizedStore.base();
    Instrs.erase(Instrs.begin(), VectorizedStoresEnd);
  }

private:
  /// The index of the Base instruction, i.e. the one with a 0 pointer
  /// distance.
  unsigned BaseInstrIdx;

  /// Maps a pointer distance from \p BaseInstrIdx to an instruction index.
  DistToInstMap Instrs;

  /// Reference to all the stores in the BB being analyzed.
  ArrayRef<StoreInst *> AllStores;
};
} // end anonymous namespace
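
// Usage sketch (hypothetical stores): with a base store to p[0], later stores
// to p[1] and p[2] are recorded in Instrs as {0 -> base, 1 -> idx1,
// 2 -> idx2}. If a second store lands on an already-present distance,
// insertOrLookup() returns the old index, which the caller below uses as a
// signal to vectorize the current run and rebase() the group onto the newer
// store.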
bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
                                        BoUpSLP &R) {
  // We may run into multiple chains that merge into a single chain. We mark
  // the stores that we vectorized so that we don't visit the same store
  // twice.
  DenseSet<Instruction *> VectorizedStores;
  bool Changed = false;

  SmallVector<RelatedStoreInsts> SortedStores;
  // Stores the pairs of (first_store, first_value, last_store, last_value,
  // size) of ranges that were already tried, so unsuccessful attempts are not
  // repeated.
  DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>>
      TriedSequences;
  auto TryToVectorize = [&](const RelatedStoreInsts::DistToInstMap &StoreSeq) {
    int64_t PrevDist = -1;
    BoUpSLP::ValueList Operands;
    // Collect the chain into a list.
    for (auto [Idx, Data] : enumerate(StoreSeq)) {
      auto &[Dist, InstIdx] = Data;
      if (Operands.empty() || Dist - PrevDist == 1) {
        Operands.push_back(Stores[InstIdx]);
        PrevDist = Dist;
        if (Idx != StoreSeq.size() - 1)
          continue;
      }
      auto E = make_scope_exit([&, &Dist = Dist, &InstIdx = InstIdx]() {
        Operands.clear();
        Operands.push_back(Stores[InstIdx]);
        PrevDist = Dist;
      });

      if (Operands.size() <= 1 ||
          !TriedSequences
               .insert({Operands.front(),
                        cast<StoreInst>(Operands.front())->getValueOperand(),
                        Operands.back(),
                        cast<StoreInst>(Operands.back())->getValueOperand(),
                        Operands.size()})
               .second)
        continue;

      unsigned MaxVecRegSize = R.getMaxVecRegSize();
      unsigned EltSize = R.getVectorElementSize(Operands[0]);
      unsigned MaxElts = llvm::bit_floor(MaxVecRegSize / EltSize);

      unsigned MaxVF =
          std::min(R.getMaximumVF(EltSize, Instruction::Store), MaxElts);
      auto *Store = cast<StoreInst>(Operands[0]);
      Type *StoreTy = Store->getValueOperand()->getType();
      Type *ValueTy = StoreTy;
      if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand()))
        ValueTy = Trunc->getSrcTy();
      Type *StoreScalarTy = StoreTy->getScalarType();
      unsigned MinVF = std::max<unsigned>(
          2, PowerOf2Ceil(TTI->getStoreMinimumVF(
                 R.getMinVF(DL->getTypeStoreSizeInBits(StoreScalarTy)),
                 StoreScalarTy, ValueTy->getScalarType())));
      MinVF = std::max<unsigned>(2, MinVF);

      if (MaxVF < MinVF) {
        LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF ("
                          << MaxVF << ") < "
                          << "MinVF (" << MinVF << ")\n");
        continue;
      }

      unsigned NonPowerOf2VF = 0;
      if (VectorizeNonPowerOf2) {
        // First try vectorizing with a non-power-of-2 VF. At the moment, only
        // consider cases where VF + 1 is a power-of-2, i.e. almost all vector
        // lanes are used.
        unsigned CandVF =
            std::clamp<unsigned>(Operands.size(), MinVF, MaxVF);
        if (has_single_bit(CandVF + 1)) {
          NonPowerOf2VF = CandVF;
          assert(NonPowerOf2VF != MaxVF &&
                 "Non-power-of-2 VF should not be equal to MaxVF");
        }
      }

      // MaxRegVF represents the number of instructions that can be vectorized
      // to naturally fit in a vector register.
      unsigned MaxRegVF = MaxVF;
      MaxVF = std::min<unsigned>(MaxVF, bit_floor(Operands.size()));
      if (MaxVF < MinVF) {
        LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF ("
                          << MaxVF << ") < "
                          << "MinVF (" << MinVF << ")\n");
        continue;
      }

      SmallVector<unsigned> CandidateVFs;
      for (unsigned VF = std::max(MaxVF, NonPowerOf2VF); VF >= MinVF;
           VF = divideCeil(VF, 2))
        CandidateVFs.push_back(VF);

      unsigned End = Operands.size();
      unsigned Repeat = 0;
      constexpr unsigned MaxAttempts = 4;
      OwningArrayRef<std::pair<unsigned, unsigned>> RangeSizes(
          Operands.size());
      for (std::pair<unsigned, unsigned> &P : RangeSizes)
        P.first = P.second = 1;
      DenseMap<Value *, std::pair<unsigned, unsigned>> NonSchedulable;
      auto IsNotVectorized = [](bool First,
                                const std::pair<unsigned, unsigned> &P) {
        return First ? P.first > 0 : P.second > 0;
      };
      auto IsVectorized = [](bool First,
                             const std::pair<unsigned, unsigned> &P) {
        return First ? P.first == 0 : P.second == 0;
      };
      auto VFIsProfitable = [](bool First, unsigned Size,
                               const std::pair<unsigned, unsigned> &P) {
        return First ? Size >= P.first : Size >= P.second;
      };
      auto FirstSizeSame = [](unsigned Size,
                              const std::pair<unsigned, unsigned> &P) {
        return Size == P.first;
      };
      while (true) {
        ++Repeat;
        bool RepeatChanged = false;
        bool AnyProfitableGraph = false;
        for (unsigned VF : CandidateVFs) {
          AnyProfitableGraph = false;
          unsigned FirstUnvecStore = std::distance(
              RangeSizes.begin(),
              find_if(RangeSizes,
                      std::bind(IsNotVectorized, VF >= MaxRegVF, _1)));

          // Form slices of consecutive stores and try to vectorize them.
          while (FirstUnvecStore < End) {
            unsigned FirstVecStore = std::distance(
                RangeSizes.begin(),
                find_if(RangeSizes.drop_front(FirstUnvecStore),
                        std::bind(IsVectorized, VF >= MaxRegVF, _1)));
            unsigned MaxSliceEnd =
                FirstVecStore >= End ? End : FirstVecStore;
            for (unsigned SliceStartIdx = FirstUnvecStore;
                 SliceStartIdx + VF <= MaxSliceEnd;) {
              ArrayRef<Value *> Slice =
                  ArrayRef(Operands).slice(SliceStartIdx, VF);
              assert(all_of(Slice,
                            [&](Value *V) {
                              return cast<StoreInst>(V)
                                         ->getValueOperand()
                                         ->getType() ==
                                     cast<StoreInst>(Slice.front())
                                         ->getValueOperand()
                                         ->getType();
                            }) &&
                     "Expected all operands of same type.");
              if (!NonSchedulable.empty()) {
                auto [NonSchedSizeMax, NonSchedSizeMin] =
                    NonSchedulable.lookup(Slice.front());
                if (NonSchedSizeMax > 0 && NonSchedSizeMin <= VF) {
                  // VF is too ambitious. Try a smaller slice first.
                  SliceStartIdx += NonSchedSizeMax;
                  continue;
                }
              }
              unsigned TreeSize;
              std::optional<bool> Res =
                  vectorizeStoreChain(Slice, R, SliceStartIdx, MinVF,
                                      TreeSize);
              if (!Res) {
                // Update the very first non-schedulable index.
                NonSchedulable
                    .try_emplace(Slice.front(), std::make_pair(VF, VF))
                    .first->getSecond()
                    .second = VF;
              } else if (*Res) {
                // Mark the vectorized stores so that we don't vectorize them
                // again.
                VectorizedStores.insert(Slice.begin(), Slice.end());
                AnyProfitableGraph = RepeatChanged = Changed = true;
                // If we vectorized the initial block, no need to try to
                // vectorize it again.
                for (std::pair<unsigned, unsigned> &P :
                     RangeSizes.slice(SliceStartIdx, VF))
                  P.first = P.second = 0;
                if (SliceStartIdx < FirstUnvecStore + MinVF) {
                  for (std::pair<unsigned, unsigned> &P : RangeSizes.slice(
                           FirstUnvecStore, SliceStartIdx - FirstUnvecStore))
                    P.first = P.second = 0;
                  FirstUnvecStore = SliceStartIdx + VF;
                }
                if (SliceStartIdx > MaxSliceEnd - VF - MinVF) {
                  for (std::pair<unsigned, unsigned> &P :
                       RangeSizes.slice(SliceStartIdx + VF,
                                        MaxSliceEnd - (SliceStartIdx + VF)))
                    P.first = P.second = 0;
                  if (MaxSliceEnd == End)
                    End = SliceStartIdx;
                  MaxSliceEnd = SliceStartIdx;
                }
                SliceStartIdx += VF;
                continue;
              }
              if (VF > 2 && Res &&
                  !all_of(RangeSizes.slice(SliceStartIdx, VF),
                          std::bind(VFIsProfitable, VF >= MaxRegVF, TreeSize,
                                    _1))) {
                SliceStartIdx += VF;
                continue;
              }
              // Check for very big VFs that we're not rebuilding the same
              // trees, just with a larger number of elements.
              if (VF > MaxRegVF && TreeSize > 1 &&
                  all_of(RangeSizes.slice(SliceStartIdx, VF),
                         std::bind(FirstSizeSame, TreeSize, _1))) {
                SliceStartIdx += VF;
                while (SliceStartIdx != MaxSliceEnd &&
                       RangeSizes[SliceStartIdx].first == TreeSize)
                  ++SliceStartIdx;
                continue;
              }
              if (TreeSize > 1) {
                for (std::pair<unsigned, unsigned> &P :
                     RangeSizes.slice(SliceStartIdx, VF)) {
                  if (VF >= MaxRegVF)
                    P.second = std::max(P.second, TreeSize);
                  else
                    P.first = std::max(P.first, TreeSize);
                }
              }
              ++SliceStartIdx;
              AnyProfitableGraph = true;
            }
            if (FirstUnvecStore >= End)
              break;
            if (MaxSliceEnd - FirstUnvecStore < VF &&
                MaxSliceEnd - FirstUnvecStore >= MinVF)
              AnyProfitableGraph = true;
            FirstUnvecStore = std::distance(
                RangeSizes.begin(),
                find_if(RangeSizes.drop_front(MaxSliceEnd),
                        std::bind(IsNotVectorized, VF >= MaxRegVF, _1)));
          }
          if (!AnyProfitableGraph && VF >= MaxRegVF && has_single_bit(VF))
            break;
        }
        // All values vectorized - exit.
        if (all_of(RangeSizes, [](const std::pair<unsigned, unsigned> &P) {
              return P.first == 0 && P.second == 0;
            }))
          break;
        // Check if tried all attempts or no need for the last attempts at
        // all.
        if (Repeat >= MaxAttempts ||
            (Repeat > 1 && (RepeatChanged || !AnyProfitableGraph)))
          break;
        constexpr unsigned StoresLimit = 64;
        const unsigned MaxTotalNum = std::min<unsigned>(
            Operands.size(),
            static_cast<unsigned>(
                End -
                std::distance(
                    RangeSizes.begin(),
                    find_if(RangeSizes,
                            std::bind(IsNotVectorized, true, _1))) +
                1));
        unsigned VF = bit_ceil(CandidateVFs.front()) * 2;
        unsigned Limit =
            getFloorFullVectorNumberOfElements(*TTI, StoreTy, MaxTotalNum);
        CandidateVFs.clear();
        if (bit_floor(Limit) == VF)
          CandidateVFs.push_back(Limit);
        if (VF > MaxTotalNum || VF >= StoresLimit)
          break;
        for (std::pair<unsigned, unsigned> &P : RangeSizes) {
          if (P.first != 0)
            P.first = std::max(P.second, P.first);
        }
        // Last attempt to vectorize the max number of elements, if all
        // previous attempts were unsuccessful because of the cost issues.
        CandidateVFs.push_back(VF);
      }
    }
  };

  auto FillStoresSet = [&](unsigned Idx, StoreInst *SI) {
    std::optional<int64_t> PtrDist;
    auto *RelatedStores = find_if(
        SortedStores,
        [&PtrDist, SI, this](const RelatedStoreInsts &StoreSeq) {
          PtrDist = StoreSeq.getPointerDiff(*SI, *DL, *SE);
          return PtrDist.has_value();
        });

    // We did not find a comparable store, start a new group.
    if (RelatedStores == SortedStores.end()) {
      SortedStores.emplace_back(Idx, Stores);
      return;
    }

    // If there is already a store in the group with the same PtrDiff, try to
    // vectorize the existing instructions before adding the current store.
    if (std::optional<unsigned> PrevInst =
            RelatedStores->insertOrLookup(Idx, *PtrDist)) {
      TryToVectorize(RelatedStores->getStores());
      RelatedStores->clearVectorizedStores(VectorizedStores);
      RelatedStores->rebase(/*MinSafeIdx=*/*PrevInst + 1,
                            /*NewBaseInstIdx=*/Idx,
                            /*DistFromCurBase=*/*PtrDist);
    }
  };
  Type *PrevValTy = nullptr;
  for (auto [I, SI] : enumerate(Stores)) {
    if (R.isDeleted(SI))
      continue;
    if (!PrevValTy)
      PrevValTy = SI->getValueOperand()->getType();
    // Check that we do not try to vectorize stores of different types.
    if (PrevValTy != SI->getValueOperand()->getType()) {
      for (RelatedStoreInsts &StoreSeq : SortedStores)
        TryToVectorize(StoreSeq.getStores());
      SortedStores.clear();
      PrevValTy = SI->getValueOperand()->getType();
    }
    FillStoresSet(I, SI);
  }

  // Final vectorization attempt.
  for (RelatedStoreInsts &StoreSeq : SortedStores)
    TryToVectorize(StoreSeq.getStores());

  return Changed;
}
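
// Slicing strategy above, in short: for each candidate VF (from the largest
// register-sized factor down to MinVF, plus an optional VF where VF + 1 is a
// power of two), runs of stores with consecutive pointer distances are cut
// into slices of VF elements. RangeSizes remembers per-store tree sizes so
// later, smaller VFs can skip slices already proven unprofitable, and up to
// MaxAttempts (4) rounds retry with doubled VFs.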
void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
  // Initialize the collections. We will make a single pass over the block.
  Stores.clear();
  GEPs.clear();

  // Visit the store and getelementptr instructions in BB and organize them in
  // Stores and GEPs according to the underlying objects of their pointer
  // operands.
  for (Instruction &I : *BB) {
    // Ignore store instructions that are volatile or have a pointer operand
    // that doesn't point to a scalar type.
    if (auto *SI = dyn_cast<StoreInst>(&I)) {
      if (!SI->isSimple())
        continue;
      if (!isValidElementType(SI->getValueOperand()->getType()))
        continue;
      Stores[getUnderlyingObject(SI->getPointerOperand())].push_back(SI);
    }

    // Ignore getelementptr instructions that have more than one index, a
    // constant index, or a pointer operand that doesn't point to a scalar
    // type.
    else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
      if (GEP->getNumIndices() != 1)
        continue;
      Value *Idx = GEP->idx_begin()->get();
      if (isa<Constant>(Idx))
        continue;
      if (!isValidElementType(Idx->getType()))
        continue;
      if (GEP->getType()->isVectorTy())
        continue;
      GEPs[GEP->getPointerOperand()].push_back(GEP);
    }
  }
}
bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
                                           bool MaxVFOnly) {
  LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = "
                    << VL.size() << ".\n");
  // ...
  for (Value *V : VL) {
    Type *Ty = V->getType();
    if (!isValidElementType(Ty)) {
      // NOTE: the following will give user internal llvm type name, which may
      // not be useful.
      R.getORE()->emit([&]() {
        std::string TypeStr;
        llvm::raw_string_ostream OS(TypeStr);
        OS << *Ty;
        return OptimizationRemarkMissed(SV_NAME, "UnsupportedType", I0)
               << "Cannot SLP vectorize list: type "
               << TypeStr + " is unsupported by vectorizer";
      });
      return false;
    }
  }

  unsigned Sz = R.getVectorElementSize(I0);
  unsigned MinVF = R.getMinVF(Sz);
  unsigned MaxVF = std::max<unsigned>(llvm::bit_floor(VL.size()), MinVF);
  MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
  if (MaxVF < 2) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0)
             << "Cannot SLP vectorize list: vectorization factor "
             << "less than 2 is not supported";
    });
    return false;
  }

  bool Changed = false;
  bool CandidateFound = false;
  InstructionCost MinCost = SLPCostThreshold.getValue();
  // ...
  unsigned NextInst = 0, MaxInst = VL.size();
  for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF;
       /* step elided */) {
    // No actual vectorization should happen, if number of parts is the same
    // as provided vectorization factor (i.e. the scalar type is used for
    // vector code during codegen).
    auto *VecTy = getWidenedType(ScalarTy, VF);
    if (TTI->getNumberOfParts(VecTy) == VF)
      continue;
    for (unsigned I = NextInst; I < MaxInst; ++I) {
      unsigned ActualVF = std::min(MaxInst - I, VF);
      // ...
      if (MaxVFOnly && ActualVF < MaxVF)
        break;
      if ((VF > MinVF && ActualVF < VF) || (VF == MinVF && ActualVF < 2))
        break;
      SmallVector<Value *> Ops(ActualVF, nullptr);
      unsigned Idx = 0;
      for (Value *V : VL.drop_front(I)) {
        // Check that a previous iteration of this loop did not delete the
        // Value.
        if (auto *Inst = dyn_cast<Instruction>(V);
            !Inst || !R.isDeleted(Inst)) {
          Ops[Idx] = V;
          ++Idx;
          if (Idx == ActualVF)
            break;
        }
      }
      // Not enough vectorizable instructions - exit.
      if (Idx != ActualVF)
        break;

      LLVM_DEBUG(dbgs() << "SLP: Analyzing " << ActualVF << " operations "
                        << "\n");
      R.buildTree(Ops);
      if (R.isTreeTinyAndNotFullyVectorizable())
        continue;
      if (R.isProfitableToReorder()) {
        R.reorderTopToBottom();
        // ...
      }
      R.transformNodes();
      R.buildExternalUses();
      R.computeMinimumValueSizes();
      InstructionCost Cost = R.getTreeCost();
      CandidateFound = true;
      MinCost = std::min(MinCost, Cost);
      LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
                        << " for VF=" << ActualVF << "\n");
      if (Cost < -SLPCostThreshold) {
        R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList",
                                            cast<Instruction>(Ops[0]))
                         << "SLP vectorized with cost " << ore::NV("Cost", Cost)
                         << " and with tree size "
                         << ore::NV("TreeSize", R.getTreeSize()));
        R.vectorizeTree();
        // Move to the next bundle.
        I += VF - 1;
        NextInst = I + 1;
        Changed = true;
      }
    }
  }

  if (!Changed && CandidateFound) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "NotBeneficial", I0)
             << "List vectorization was possible but not beneficial with cost "
             << ore::NV("Cost", MinCost) << " >= "
             << ore::NV("Threshold", -SLPCostThreshold);
    });
  } else if (!Changed) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "NotPossible", I0)
             << "Cannot SLP vectorize list: vectorization was impossible"
             << " with available vectorization factors";
    });
  }
  return Changed;
}
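// Commentary (added): the HorizontalReduction helper below matches flattened
// reduction chains. A minimal example of the input pattern (an add reduction
// of four values) is:
//   %a01  = add i32 %a0, %a1
//   %a012 = add i32 %a01, %a2
//   %sum  = add i32 %a012, %a3
// which, once matched, can be emitted as a <4 x i32> gather followed by a
// single llvm.vector.reduce.add.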
namespace {
/// Models a horizontal reduction and drives its vectorization.
class HorizontalReduction {
  using ReductionOpsType = SmallVector<Value *, 16>;
  /// Cmp + select min/max reductions keep two lists (cmps and selects).
  using ReductionOpsListType = SmallVector<ReductionOpsType, 2>;
  ReductionOpsListType ReductionOps;
  /// List of possibly reduced values, grouped for better vectorization.
  SmallVector<SmallVector<Value *>> ReducedVals;
  /// Maps reduced value to the corresponding reduction operations.
  SmallDenseMap<Value *, SmallVector<Instruction *>, 16> ReducedValsToOps;
  WeakTrackingVH ReductionRoot;
  /// The kind of the reduction operation.
  RecurKind RdxKind = RecurKind::None;
  /// Vectorized slices of the reduction: (vector value, scale factor,
  /// signedness), combined into a scalar by emitReduction().
  SmallVector<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales;
  /// The minimum number of the reduced values.
  const unsigned ReductionLimit = 4;
  /// Checks if the optimization of original scalar identity operations on
  /// matched horizontal reductions is enabled and allowed.
  bool IsSupportedHorRdxIdentityOp = false;

  /// Checks if the instruction is a cmp + select min/max idiom.
  static bool isCmpSelMinMax(Instruction *I) {
    return match(I, m_Select(m_Cmp(), m_Value(), m_Value())) &&
           RecurrenceDescriptor::isMinMaxRecurrenceKind(getRdxKind(I));
  }

  // And/or are potentially poison-safe logical patterns like:
  //   select x, y, false
  //   select x, true, y
  static bool isBoolLogicOp(Instruction *I) {
    return isa<SelectInst>(I) &&
           (match(I, m_LogicalAnd()) || match(I, m_LogicalOr()));
  }

  /// Checks if instruction is associative and can be vectorized.
  static bool isVectorizable(RecurKind Kind, Instruction *I,
                             bool TwoElementReduction = false) {
    if (Kind == RecurKind::None)
      return false;
    // Integer ops that map to select instructions or intrinsics are fine.
    if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind) ||
        isBoolLogicOp(I))
      return true;
    // No need to check for associativity if there are only 2 elements.
    if (TwoElementReduction)
      return true;
    if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
      // FP min/max are associative except for NaN and -0.0.
      return I->getFastMathFlags().noNaNs();
    }
    if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum)
      return true;
    return I->isAssociative();
  }

  static Value *getRdxOperand(Instruction *I, unsigned Index) {
    // Poison-safe 'or' takes the form: select X, true, Y. To make that work
    // with the normal operand processing, we skip the 'true' value operand.
    if (match(I, m_LogicalOr(m_Value(), m_Value())) && Index == 1)
      return I->getOperand(2);
    return I->getOperand(Index);
  }

  /// Creates reduction operation with the current opcode.
  static Value *createOp(IRBuilderBase &Builder, RecurKind Kind, Value *LHS,
                         Value *RHS, const Twine &Name, bool UseSelect) {
    switch (Kind) {
    case RecurKind::Or: {
      if (UseSelect &&
          LHS->getType() == CmpInst::makeCmpResultType(LHS->getType()))
        return Builder.CreateSelect(LHS, Builder.getTrue(), RHS, Name);
      unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
      return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
                                 Name);
    }
    case RecurKind::And: {
      if (UseSelect &&
          LHS->getType() == CmpInst::makeCmpResultType(LHS->getType()))
        return Builder.CreateSelect(LHS, RHS, Builder.getFalse(), Name);
      unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
      return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
                                 Name);
    }
    case RecurKind::Add:
    case RecurKind::Mul:
    case RecurKind::Xor:
    case RecurKind::FAdd:
    case RecurKind::FMul: {
      unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
      return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
                                 Name);
    }
    case RecurKind::SMax:
    case RecurKind::SMin:
    case RecurKind::UMax:
    case RecurKind::UMin:
      if (UseSelect) {
        CmpInst::Predicate Pred = llvm::getMinMaxReductionPredicate(Kind);
        Value *Cmp = Builder.CreateICmp(Pred, LHS, RHS, Name);
        return Builder.CreateSelect(Cmp, LHS, RHS, Name);
      }
      [[fallthrough]];
    case RecurKind::FMax:
    case RecurKind::FMin:
    case RecurKind::FMaximum:
    case RecurKind::FMinimum:
    case RecurKind::FMaximumNum:
    case RecurKind::FMinimumNum: {
      Intrinsic::ID Id = llvm::getMinMaxReductionIntrinsicOp(Kind);
      return Builder.CreateBinaryIntrinsic(Id, LHS, RHS);
    }
    default:
      llvm_unreachable("Unknown reduction operation.");
    }
  }

  /// Creates reduction operation with the current opcode, propagating IR
  /// flags from \p ReductionOps.
  static Value *createOp(IRBuilderBase &Builder, RecurKind RdxKind, Value *LHS,
                         Value *RHS, const Twine &Name,
                         const ReductionOpsListType &ReductionOps) {
    bool UseSelect = ReductionOps.size() == 2 ||
                     // Logical or/and.
                     (ReductionOps.size() == 1 &&
                      any_of(ReductionOps.front(), IsaPred<SelectInst>));
    assert((!UseSelect || ReductionOps.size() != 2 ||
            isa<SelectInst>(ReductionOps[1][0])) &&
           "Expected cmp + select pairs for reduction");
    Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name, UseSelect);
    // ...
    return Op;
  }
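  // Commentary (added): for i1 reductions createOp can use the poison-safe
  // select forms instead of plain binary ops, e.g.
  //   or  -> select i1 %lhs, i1 true, i1 %rhs
  //   and -> select i1 %lhs, i1 %rhs, i1 false
  // so that the vectorized form preserves the short-circuit poison semantics
  // of the scalar chain.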
  static RecurKind getRdxKind(Value *V) {
    auto *I = dyn_cast<Instruction>(V);
    if (!I)
      return RecurKind::None;
    if (match(I, m_Add(m_Value(), m_Value())))
      return RecurKind::Add;
    if (match(I, m_Mul(m_Value(), m_Value())))
      return RecurKind::Mul;
    if (match(I, m_And(m_Value(), m_Value())) ||
        match(I, m_LogicalAnd(m_Value(), m_Value())))
      return RecurKind::And;
    if (match(I, m_Or(m_Value(), m_Value())) ||
        match(I, m_LogicalOr(m_Value(), m_Value())))
      return RecurKind::Or;
    if (match(I, m_Xor(m_Value(), m_Value())))
      return RecurKind::Xor;
    if (match(I, m_FAdd(m_Value(), m_Value())))
      return RecurKind::FAdd;
    if (match(I, m_FMul(m_Value(), m_Value())))
      return RecurKind::FMul;
    if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(), m_Value())))
      return RecurKind::FMax;
    if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(), m_Value())))
      return RecurKind::FMin;
    if (match(I, m_Intrinsic<Intrinsic::maximum>(m_Value(), m_Value())))
      return RecurKind::FMaximum;
    if (match(I, m_Intrinsic<Intrinsic::minimum>(m_Value(), m_Value())))
      return RecurKind::FMinimum;
    if (match(I, m_SMax(m_Value(), m_Value())))
      return RecurKind::SMax;
    if (match(I, m_SMin(m_Value(), m_Value())))
      return RecurKind::SMin;
    if (match(I, m_UMax(m_Value(), m_Value())))
      return RecurKind::UMax;
    if (match(I, m_UMin(m_Value(), m_Value())))
      return RecurKind::UMin;
    if (auto *Select = dyn_cast<SelectInst>(I)) {
      // Try harder: look for min/max patterns hidden behind casts around the
      // compare operands; bail out (RecurKind::None) whenever the shapes of
      // the select and compare operands do not line up.
      CmpPredicate Pred;
      // ... several structural checks, each returning RecurKind::None on
      // mismatch ...
      if (Pred == CmpInst::ICMP_SGT || Pred == CmpInst::ICMP_SGE)
        return RecurKind::SMax;
      if (Pred == CmpInst::ICMP_SLT || Pred == CmpInst::ICMP_SLE)
        return RecurKind::SMin;
      if (Pred == CmpInst::ICMP_UGT || Pred == CmpInst::ICMP_UGE)
        return RecurKind::UMax;
      if (Pred == CmpInst::ICMP_ULT || Pred == CmpInst::ICMP_ULE)
        return RecurKind::UMin;
      return RecurKind::None;
    }
    return RecurKind::None;
  }
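  // Commentary (added): the cmp+select min/max idiom recognized above looks
  // like
  //   %c = icmp sgt i32 %a, %b
  //   %m = select i1 %c, i32 %a, i32 %b   ; smax(%a, %b)
  // which is why min/max reduction ops are modeled with three operands
  // (condition, true value, false value) and operand indices starting at 1,
  // as encoded by the two helpers below.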
  /// Get the index of the first reduction operand.
  static unsigned getFirstOperandIndex(Instruction *I) {
    return isCmpSelMinMax(I) ? 1 : 0;
  }

  /// Total number of operands in the reduction operation.
  static unsigned getNumberOfOperands(Instruction *I) {
    return isCmpSelMinMax(I) ? 3 : 2;
  }

  /// Checks if the instruction is in basic block \p BB. For a cmp+sel min/max
  /// reduction check that both ops are in \p BB.
  static bool hasSameParent(Instruction *I, BasicBlock *BB) {
    if (isCmpSelMinMax(I) || isBoolLogicOp(I)) {
      auto *Sel = cast<SelectInst>(I);
      auto *Cmp = dyn_cast<Instruction>(Sel->getCondition());
      return Sel->getParent() == BB && Cmp && Cmp->getParent() == BB;
    }
    return I->getParent() == BB;
  }

  /// Expected number of uses for reduction operations/reduced values.
  static bool hasRequiredNumberOfUses(bool IsCmpSelMinMax, Instruction *I) {
    if (IsCmpSelMinMax) {
      // The select must be used twice while the condition must have a single
      // use only.
      if (auto *Sel = dyn_cast<SelectInst>(I))
        return Sel->hasNUses(2) && Sel->getCondition()->hasOneUse();
      return I->hasNUses(2);
    }
    // An arithmetic reduction operation must be used once only.
    return I->hasOneUse();
  }

  /// Initializes the list of reduction operations.
  void initReductionOps(Instruction *I) {
    if (isCmpSelMinMax(I))
      ReductionOps.assign(2, ReductionOpsType());
    else
      ReductionOps.assign(1, ReductionOpsType());
  }

  /// Add all reduction operations for the reduction instruction \p I.
  void addReductionOps(Instruction *I) {
    if (isCmpSelMinMax(I)) {
      ReductionOps[0].emplace_back(cast<SelectInst>(I)->getCondition());
      ReductionOps[1].emplace_back(I);
    } else {
      ReductionOps[0].emplace_back(I);
    }
  }

  static bool isGoodForReduction(ArrayRef<Value *> Data) {
    int Sz = Data.size();
    // ...
  }

public:
  HorizontalReduction() = default;
  HorizontalReduction(Instruction *I, ArrayRef<Value *> Ops)
      : ReductionRoot(I), ReductionLimit(2) {
    RdxKind = HorizontalReduction::getRdxKind(I);
    ReductionOps.emplace_back().push_back(I);
    ReducedVals.emplace_back().assign(Ops.begin(), Ops.end());
    for (Value *V : Ops)
      ReducedValsToOps[V].push_back(I);
  }

  bool matchReductionForOperands() const {
    // Analyze "regular" integer/FP types for reductions - no target-specific
    // types or pointers.
    assert(ReductionRoot && "Reduction root is not set!");
    // ...
    ArrayRef<Value *> Ops = ReducedVals.front();
    return Ops.size() == 2;
  }
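  // Commentary (added): matchReductionForOperands handles only the degenerate
  // two-element case (a single binop treated as a 2-wide reduction), while
  // matchAssociativeReduction below walks a whole operand tree, classifying
  // each edge either as another reduction op or as a reduced value.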
  bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root,
                                 ScalarEvolution &SE, const DataLayout &DL,
                                 const TargetLibraryInfo &TLI) {
    RdxKind = HorizontalReduction::getRdxKind(Root);
    if (!isVectorizable(RdxKind, Root))
      return false;
    // ...
    // Though the ultimate reduction may have multiple uses, its condition
    // must have only a single use.
    if (auto *Sel = dyn_cast<SelectInst>(Root))
      if (!Sel->getCondition()->hasOneUse())
        return false;

    ReductionRoot = Root;

    // Iterate through all the operands of the possible reduction tree and
    // gather all the reduced values.
    BasicBlock *BB = Root->getParent();
    bool IsCmpSelMinMax = isCmpSelMinMax(Root);
    SmallVector<std::pair<Instruction *, unsigned>> Worklist(
        1, std::make_pair(Root, 0));
    // Checks if the operands of the \p TreeN instruction are also reduction
    // operations or should be treated as reduced values.
    auto CheckOperands = [&](Instruction *TreeN,
                             SmallVectorImpl<Value *> &PossibleReducedVals,
                             SmallVectorImpl<Instruction *> &ReductionOps,
                             unsigned Level) {
      for (int I : reverse(seq<int>(getFirstOperandIndex(TreeN),
                                    getNumberOfOperands(TreeN)))) {
        Value *EdgeVal = getRdxOperand(TreeN, I);
        ReducedValsToOps[EdgeVal].push_back(TreeN);
        auto *EdgeInst = dyn_cast<Instruction>(EdgeVal);
        // If the edge is not an instruction, differs from the main reduction
        // opcode or has too many uses - treat it as a reduced value.
        if (!EdgeInst || Level > RecursionMaxDepth ||
            getRdxKind(EdgeInst) != RdxKind ||
            IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
            !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
            !isVectorizable(RdxKind, EdgeInst) ||
            (R.isAnalyzedReductionRoot(EdgeInst) &&
             isa<BinaryOperator>(EdgeInst))) {
          PossibleReducedVals.push_back(EdgeVal);
          continue;
        }
        ReductionOps.push_back(EdgeInst);
      }
    };
    // Try to regroup reduced values so that it gets more profitable to try to
    // reduce them. Values are grouped by their value ids, loads - by their
    // pointer operands.
    SmallMapVector<
        size_t, SmallMapVector<size_t, SmallMapVector<Value *, unsigned, 2>, 2>,
        8>
        PossibleReducedVals;
    initReductionOps(Root);
    DenseMap<std::pair<size_t, Value *>, SmallVector<LoadInst *>> LoadsMap;
    SmallSet<size_t, 2> LoadKeyUsed;

    auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
      Value *Ptr = getUnderlyingObject(LI->getPointerOperand());
      auto LIt = LoadsMap.find(std::make_pair(Key, Ptr));
      if (LIt != LoadsMap.end()) {
        for (LoadInst *RLI : LIt->second) {
          if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
                              LI->getType(), LI->getPointerOperand(), DL, SE,
                              /*StrictCheck=*/true))
            return hash_value(RLI->getPointerOperand());
        }
        for (LoadInst *RLI : LIt->second) {
          if (arePointersCompatible(RLI->getPointerOperand(),
                                    LI->getPointerOperand(), TLI))
            return hash_value(RLI->getPointerOperand());
        }
        if (LIt->second.size() > 2)
          return hash_value(LIt->second.back()->getPointerOperand());
      }
      LoadKeyUsed.insert(Key);
      LoadsMap.try_emplace(std::make_pair(Key, Ptr))
          .first->second.push_back(LI);
      return hash_value(LI->getPointerOperand());
    };

    while (!Worklist.empty()) {
      auto [TreeN, Level] = Worklist.pop_back_val();
      SmallVector<Value *> PossibleRedVals;
      SmallVector<Instruction *> PossibleReductionOps;
      CheckOperands(TreeN, PossibleRedVals, PossibleReductionOps, Level);
      addReductionOps(TreeN);
      // Add reduction values. The values are sorted for better vectorization
      // results.
      for (Value *V : PossibleRedVals) {
        size_t Key, Idx;
        std::tie(Key, Idx) =
            generateKeySubkey(V, &TLI, GenerateLoadsSubkey,
                              /*AllowAlternate=*/false);
        ++PossibleReducedVals[Key][Idx].try_emplace(V, 0).first->second;
      }
      for (Instruction *I : reverse(PossibleReductionOps))
        Worklist.emplace_back(I, I->getParent() == BB ? 0 : Level + 1);
    }
    auto PossibleReducedValsVect = PossibleReducedVals.takeVector();
    // Sort values by the total number of value kinds to start the reduction
    // from the longest possible reduced values sequences.
    for (auto &PossibleReducedVals : PossibleReducedValsVect) {
      auto PossibleRedVals = PossibleReducedVals.second.takeVector();
      SmallVector<SmallVector<Value *>> PossibleRedValsVect;
      for (auto &Slice : PossibleRedVals) {
        PossibleRedValsVect.emplace_back();
        auto RedValsVect = Slice.second.takeVector();
        stable_sort(RedValsVect, llvm::less_second());
        for (const std::pair<Value *, unsigned> &Data : RedValsVect)
          PossibleRedValsVect.back().append(Data.second, Data.first);
      }
      stable_sort(PossibleRedValsVect, [](const auto &P1, const auto &P2) {
        return P1.size() > P2.size();
      });
      int NewIdx = -1;
      for (ArrayRef<Value *> Data : PossibleRedValsVect) {
        if (NewIdx < 0) {
          NewIdx = ReducedVals.size();
          ReducedVals.emplace_back();
        } else if (!isGoodForReduction(Data)) {
          auto *LI = dyn_cast<LoadInst>(Data.front());
          auto *LastLI = dyn_cast<LoadInst>(ReducedVals[NewIdx].front());
          if (!LI || !LastLI ||
              getUnderlyingObject(LI->getPointerOperand()) !=
                  getUnderlyingObject(LastLI->getPointerOperand())) {
            NewIdx = ReducedVals.size();
            ReducedVals.emplace_back();
          }
        }
        ReducedVals[NewIdx].append(Data.rbegin(), Data.rend());
      }
    }
    // Sort the reduced value groups, longest first.
    stable_sort(ReducedVals, [](ArrayRef<Value *> P1, ArrayRef<Value *> P2) {
      return P1.size() > P2.size();
    });
    return true;
  }
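  // Commentary (added): after matching, ReducedVals holds the leaves grouped
  // by value kind (loads additionally by pointer base), longest group first.
  // For (((x0 + x1) + l0) + l1) with loads l0, l1 from the same base, this
  // yields the groups {l0, l1} and {x0, x1}, letting tryToReduce below try
  // the load group first.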
  /// Attempt to vectorize the tree found by matchAssociativeReduction.
  Value *tryToReduce(BoUpSLP &V, const DataLayout &DL, TargetTransformInfo *TTI,
                     const TargetLibraryInfo &TLI, AssumptionCache *AC,
                     DominatorTree &DT) {
    constexpr unsigned RegMaxNumber = 4;
    constexpr unsigned RedValsMaxNumber = 128;
    // If there are not enough profitable reduced values, mark the roots as
    // analyzed and bail out.
    if (unsigned NumReducedVals = std::accumulate(
            ReducedVals.begin(), ReducedVals.end(), 0,
            [](unsigned Num, ArrayRef<Value *> Vals) -> unsigned {
              if (!isGoodForReduction(Vals))
                return Num;
              return Num + Vals.size();
            });
        NumReducedVals < ReductionLimit /* && ... */) {
      for (ReductionOpsType &RdxOps : ReductionOps)
        for (Value *RdxOp : RdxOps)
          V.analyzedReductionRoot(cast<Instruction>(RdxOp));
      return nullptr;
    }

    IRBuilder<TargetFolder> Builder(ReductionRoot->getContext(),
                                    TargetFolder(DL));
    Builder.SetInsertPoint(cast<Instruction>(ReductionRoot));

    // Track the reduced values in case they are replaced by extractelement
    // because of the vectorization.
    DenseMap<Value *, WeakTrackingVH> TrackedVals(ReducedVals.size() *
                                                  ReducedVals.front().size());
    // Values used outside the matched reduction operations stay live.
    BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues;
    // The compare instruction of a min/max is the insertion point for new
    // instructions and may be replaced with a new compare instruction.
    auto &&GetCmpForMinMaxReduction = [](Instruction *RdxRootInst) {
      assert(isa<SelectInst>(RdxRootInst) &&
             "Expected min/max reduction to have select root instruction");
      Value *ScalarCond = cast<SelectInst>(RdxRootInst)->getCondition();
      assert(isa<Instruction>(ScalarCond) &&
             "Expected min/max reduction to have compare condition");
      return cast<Instruction>(ScalarCond);
    };

    bool AnyBoolLogicOp = any_of(ReductionOps.back(), [](Value *V) {
      return isBoolLogicOp(cast<Instruction>(V));
    });
    // Return new VectorizedTree, based on the previous value.
    auto GetNewVectorizedTree = [&](Value *VectorizedTree, Value *Res) {
      if (VectorizedTree) {
        // Update the final value in the reduction.
        Builder.SetCurrentDebugLocation(
            cast<Instruction>(ReductionOps.front().front())->getDebugLoc());
        if (AnyBoolLogicOp) {
          auto It = ReducedValsToOps.find(VectorizedTree);
          auto It1 = ReducedValsToOps.find(Res);
          if ((It == ReducedValsToOps.end() &&
               It1 == ReducedValsToOps.end()) ||
              isGuaranteedNotToBePoison(VectorizedTree, AC) ||
              (It != ReducedValsToOps.end() &&
               any_of(It->getSecond(), [&](Instruction *I) {
                 return isBoolLogicOp(I) &&
                        getRdxOperand(I, 0) == VectorizedTree;
               }))) {
            ; // The LHS is already poison-safe.
          } else if (isGuaranteedNotToBePoison(Res, AC) ||
                     (It1 != ReducedValsToOps.end() &&
                      any_of(It1->getSecond(), [&](Instruction *I) {
                        return isBoolLogicOp(I) &&
                               getRdxOperand(I, 0) == Res;
                      }))) {
            std::swap(VectorizedTree, Res);
          } else {
            VectorizedTree = Builder.CreateFreeze(VectorizedTree);
          }
        }
        return createOp(Builder, RdxKind, VectorizedTree, Res, "op.rdx",
                        ReductionOps);
      }
      // Initialize the final value in the reduction.
      return Res;
    };

    SmallDenseSet<Value *> IgnoreList(ReductionOps.size() *
                                      ReductionOps.front().size());
    for (ReductionOpsType &RdxOps : ReductionOps)
      for (Value *RdxOp : RdxOps) {
        if (!RdxOp)
          continue;
        IgnoreList.insert(RdxOp);
      }
    // Intersect the fast-math flags from all reduction operations.
    FastMathFlags RdxFMF;
    RdxFMF.set();
    for (Value *U : IgnoreList)
      if (auto *FPMO = dyn_cast<FPMathOperator>(U))
        RdxFMF &= FPMO->getFastMathFlags();

    for (ArrayRef<Value *> Candidates : ReducedVals)
      for (Value *V : Candidates)
        TrackedVals.try_emplace(V, V);

    auto At = [](SmallMapVector<Value *, unsigned, 16> &MV,
                 Value *V) -> unsigned & {
      auto *It = MV.find(V);
      assert(It != MV.end() && "Unable to find given key.");
      return It->second;
    };

    DenseMap<Value *, unsigned> VectorizedVals(ReducedVals.size());
    // List of values that were reduced in other trees as part of gather nodes
    // and assigned to be extracted at the end.
    SmallPtrSet<Value *, 4> RequiredExtract;
    WeakTrackingVH VectorizedTree = nullptr;
    bool CheckForReusedReductionOps = false;
    // Pre-compute the instructions state for each group of reduced values.
    SmallVector<InstructionsState> States;
    for (ArrayRef<Value *> RV : ReducedVals)
      States.push_back(getSameOpcode(RV, TLI));
    for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) {
      ArrayRef<Value *> OrigReducedVals = ReducedVals[I];
      InstructionsState S = States[I];
      SmallVector<Value *> Candidates;
      Candidates.reserve(2 * OrigReducedVals.size());
      DenseMap<Value *, Value *> TrackedToOrig(2 * OrigReducedVals.size());
      for (Value *ReducedVal : OrigReducedVals) {
        Value *RdxVal = TrackedVals.at(ReducedVal);
        // Check if the reduction value was not overridden by an
        // extractelement because of vectorization and exclude it if it is not
        // compatible with the other values.
        auto *Inst = dyn_cast<Instruction>(RdxVal);
        if ((Inst && isVectorLikeInstWithConstOps(Inst) &&
             (!S || !S.getMatchingMainOpOrAltOp(Inst))) ||
            (S && !Inst))
          continue;
        Candidates.push_back(RdxVal);
        TrackedToOrig.try_emplace(RdxVal, ReducedVal);
      }
      bool ShuffledExtracts = false;
      // Try to handle shuffled extractelements.
      if (S && S.getOpcode() == Instruction::ExtractElement &&
          !S.isAltShuffle() && I + 1 < E) {
        SmallVector<Value *> CommonCandidates(Candidates);
        for (Value *RV : ReducedVals[I + 1]) {
          Value *RdxVal = TrackedVals.at(RV);
          auto *Inst = dyn_cast<ExtractElementInst>(RdxVal);
          if (!Inst)
            continue;
          CommonCandidates.push_back(RdxVal);
          TrackedToOrig.try_emplace(RdxVal, RV);
        }
        SmallVector<int> Mask;
        if (isFixedVectorShuffle(CommonCandidates, Mask, AC)) {
          ++I;
          Candidates.swap(CommonCandidates);
          ShuffledExtracts = true;
        }
      }

      // Emit code for constant values.
      if (Candidates.size() > 1 && allConstant(Candidates)) {
        Value *Res = Candidates.front();
        Value *OrigV = TrackedToOrig.at(Candidates.front());
        ++VectorizedVals.try_emplace(OrigV).first->getSecond();
        for (Value *VC : ArrayRef(Candidates).drop_front()) {
          Res = createOp(Builder, RdxKind, Res, VC, "const.rdx", ReductionOps);
          Value *OrigV = TrackedToOrig.at(VC);
          ++VectorizedVals.try_emplace(OrigV).first->getSecond();
          if (auto *ResI = dyn_cast<Instruction>(Res))
            V.analyzedReductionRoot(ResI);
        }
        VectorizedTree = GetNewVectorizedTree(VectorizedTree, Res);
        continue;
      }

      unsigned NumReducedVals = Candidates.size();
      if (NumReducedVals < ReductionLimit &&
          (NumReducedVals < 2 || !isSplat(Candidates)))
        continue;

      // Check if we support repeated scalar values processing (optimization
      // of original scalar identity operations on matched horizontal
      // reductions).
      IsSupportedHorRdxIdentityOp = RdxKind != RecurKind::Mul &&
                                    RdxKind != RecurKind::FMul &&
                                    RdxKind != RecurKind::FMulAdd;
      // Gather same values.
      SmallMapVector<Value *, unsigned, 16> SameValuesCounter;
      if (IsSupportedHorRdxIdentityOp)
        for (Value *V : Candidates) {
          Value *OrigV = TrackedToOrig.at(V);
          ++SameValuesCounter.try_emplace(OrigV).first->second;
        }
      // Check if the reduced values all repeat the same number of times; in
      // that case a single scalar scale of the final result suffices.
      bool SameScaleFactor = false;
      bool OptReusedScalars = IsSupportedHorRdxIdentityOp &&
                              SameValuesCounter.size() != Candidates.size();
      if (OptReusedScalars) {
        SameScaleFactor =
            (RdxKind == RecurKind::Add || RdxKind == RecurKind::FAdd ||
             RdxKind == RecurKind::Xor) &&
            all_of(drop_begin(SameValuesCounter),
                   [&SameValuesCounter](
                       const std::pair<Value *, unsigned> &P) {
                     return P.second == SameValuesCounter.front().second;
                   });
        Candidates.resize(SameValuesCounter.size());
        transform(SameValuesCounter, Candidates.begin(),
                  [&](const auto &P) { return TrackedVals.at(P.first); });
        NumReducedVals = Candidates.size();
        // Have a reduction of the same element.
        if (NumReducedVals == 1) {
          Value *OrigV = TrackedToOrig.at(Candidates.front());
          unsigned Cnt = At(SameValuesCounter, OrigV);
          Value *RedVal =
              emitScaleForReusedOps(Candidates.front(), Builder, Cnt);
          VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
          VectorizedVals.try_emplace(OrigV, Cnt);
          ExternallyUsedValues.insert(OrigV);
          continue;
        }
      }
      unsigned MaxVecRegSize = V.getMaxVecRegSize();
      unsigned EltSize = V.getVectorElementSize(Candidates[0]);
      const unsigned MaxElts = std::clamp<unsigned>(
          llvm::bit_floor(MaxVecRegSize / EltSize), RedValsMaxNumber,
          RegMaxNumber * RedValsMaxNumber);

      unsigned ReduxWidth = NumReducedVals;
      auto GetVectorFactor = [&, &TTI = *TTI](unsigned ReduxWidth) {
        unsigned NumParts, NumRegs;
        Type *ScalarTy = Candidates.front()->getType();
        ReduxWidth =
            getFloorFullVectorNumberOfElements(TTI, ScalarTy, ReduxWidth);
        VectorType *Tp = getWidenedType(ScalarTy, ReduxWidth);
        NumParts = TTI.getNumberOfParts(Tp);
        NumRegs =
            TTI.getNumberOfRegisters(TTI.getRegisterClassForType(true, Tp));
        while (NumParts > NumRegs) {
          assert(ReduxWidth > 0 && "ReduxWidth is unexpectedly 0.");
          ReduxWidth = bit_floor(ReduxWidth - 1);
          Tp = getWidenedType(ScalarTy, ReduxWidth);
          NumParts = TTI.getNumberOfParts(Tp);
          NumRegs =
              TTI.getNumberOfRegisters(TTI.getRegisterClassForType(true, Tp));
        }
        if (NumParts > NumRegs / 2)
          ReduxWidth = bit_floor(ReduxWidth);
        return ReduxWidth;
      };
      ReduxWidth = GetVectorFactor(ReduxWidth);
      ReduxWidth = std::min(ReduxWidth, MaxElts);

      unsigned Start = 0;
      unsigned Pos = Start;
      // Restarts vectorization attempt with a lower vector factor.
      unsigned PrevReduxWidth = ReduxWidth;
      bool CheckForReusedReductionOpsLocal = false;
      auto AdjustReducedVals = [&](bool IgnoreVL = false) {
        bool IsAnyRedOpGathered = !IgnoreVL && V.isAnyGathered(IgnoreList);
        if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) {
          // Check if any of the reduction ops are gathered. If so, worth
          // trying again with less number of reduction ops.
          CheckForReusedReductionOpsLocal |= IsAnyRedOpGathered;
        }
        ++Pos;
        if (Pos < NumReducedVals - ReduxWidth + 1)
          return IsAnyRedOpGathered;
        Pos = Start;
        --ReduxWidth;
        if (ReduxWidth > 1)
          ReduxWidth = GetVectorFactor(ReduxWidth);
        return IsAnyRedOpGathered;
      };
      bool AnyVectorized = false;
      SmallDenseSet<std::pair<unsigned, unsigned>, 8> IgnoredCandidates;
      while (Pos < NumReducedVals - ReduxWidth + 1 &&
             ReduxWidth >= ReductionLimit) {
        // Dependency in the tree of the reduction ops - drop this attempt,
        // try later.
        if (CheckForReusedReductionOpsLocal && PrevReduxWidth != ReduxWidth &&
            Start == 0) {
          CheckForReusedReductionOps = true;
          break;
        }
        PrevReduxWidth = ReduxWidth;
        ArrayRef<Value *> VL(std::next(Candidates.begin(), Pos), ReduxWidth);
        // Been analyzed already - skip.
        if (IgnoredCandidates.contains(std::make_pair(Pos, ReduxWidth)) ||
            (!has_single_bit(ReduxWidth) &&
             (IgnoredCandidates.contains(
                  std::make_pair(Pos, bit_floor(ReduxWidth))) ||
              IgnoredCandidates.contains(
                  std::make_pair(Pos + (ReduxWidth - bit_floor(ReduxWidth)),
                                 bit_floor(ReduxWidth))))) ||
            V.areAnalyzedReductionVals(VL)) {
          (void)AdjustReducedVals(/*IgnoreVL=*/true);
          continue;
        }
        // Early exit if any of the reduction values were deleted during
        // previous vectorization attempts.
        if (any_of(VL, [&V](Value *RedVal) {
              auto *RedValI = dyn_cast<Instruction>(RedVal);
              return RedValI && V.isDeleted(RedValI);
            }))
          break;
        V.buildTree(VL, IgnoreList);
        if (V.isTreeTinyAndNotFullyVectorizable(/*ForReduction=*/true)) {
          if (!AdjustReducedVals())
            V.analyzedReductionVals(VL);
          continue;
        }
        if (V.isLoadCombineReductionCandidate(RdxKind)) {
          if (!AdjustReducedVals())
            V.analyzedReductionVals(VL);
          continue;
        }
        V.reorderTopToBottom();
        // No need to reorder the root node at all.
        V.reorderBottomToTop(
            /*IgnoreReorder=*/VL.front()->getType()->isIntOrIntVectorTy() ||
            ReductionLimit > 2);
        // Keep extracted other reduction values, if they are used in the
        // vectorization trees.
        BoUpSLP::ExtraValueToDebugLocsMap LocalExternallyUsedValues(
            ExternallyUsedValues);
        // The reduction root is used as the insertion point for new
        // instructions, so set it as externally used to prevent it from being
        // deleted.
        LocalExternallyUsedValues.insert(ReductionRoot);
        for (unsigned Cnt = 0, Sz = ReducedVals.size(); Cnt < Sz; ++Cnt) {
          if (Cnt == I || (ShuffledExtracts && Cnt == I - 1))
            continue;
          for (Value *V : ReducedVals[Cnt])
            if (isa<Instruction>(V))
              LocalExternallyUsedValues.insert(TrackedVals[V]);
        }
        if (!IsSupportedHorRdxIdentityOp) {
          // Number of uses of the candidates in the vector of values.
          assert(SameValuesCounter.empty() &&
                 "Reused values counter map is not empty");
          for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
            if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
              continue;
            Value *V = Candidates[Cnt];
            Value *OrigV = TrackedToOrig.at(V);
            ++SameValuesCounter.try_emplace(OrigV).first->second;
          }
        }
        V.transformNodes();
        SmallPtrSet<Value *, 4> VLScalars(VL.begin(), VL.end());
        // Gather externally used values.
        SmallPtrSet<Value *, 4> Visited;
        for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
          if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
            continue;
          Value *RdxVal = Candidates[Cnt];
          if (auto It = TrackedVals.find(RdxVal); It != TrackedVals.end())
            RdxVal = It->second;
          if (!Visited.insert(RdxVal).second)
            continue;
          // Check if the scalar was vectorized as part of the vectorization
          // tree but not the top node.
          if (!VLScalars.contains(RdxVal) && V.isVectorized(RdxVal)) {
            LocalExternallyUsedValues.insert(RdxVal);
            continue;
          }
          Value *OrigV = TrackedToOrig.at(RdxVal);
          unsigned NumOps =
              VectorizedVals.lookup(OrigV) + At(SameValuesCounter, OrigV);
          if (NumOps != ReducedValsToOps.at(OrigV).size())
            LocalExternallyUsedValues.insert(RdxVal);
        }
        // The list of reused scalars is not needed in regular mode anymore.
        if (!IsSupportedHorRdxIdentityOp)
          SameValuesCounter.clear();
        for (Value *RdxVal : VL)
          if (RequiredExtract.contains(RdxVal))
            LocalExternallyUsedValues.insert(RdxVal);
        V.buildExternalUses(LocalExternallyUsedValues);
        V.computeMinimumValueSizes();

        // Estimate cost.
        InstructionCost ReductionCost =
            getReductionCost(TTI, VL, IsCmpSelMinMax, RdxFMF, V, DT, DL, TLI);
        InstructionCost Cost = V.getTreeCost(VL) + ReductionCost;
        LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
                          << " for reduction\n");
        if (!Cost.isValid())
          return nullptr;
        if (Cost >= -SLPCostThreshold) {
          V.getORE()->emit([&]() {
            return OptimizationRemarkMissed(SV_NAME, "HorSLPNotBeneficial",
                                            ReducedValsToOps.at(VL[0]).front())
                   << "Vectorizing horizontal reduction is possible "
                   << "but not beneficial with cost " << ore::NV("Cost", Cost)
                   << " and threshold "
                   << ore::NV("Threshold", -SLPCostThreshold);
          });
          if (!AdjustReducedVals()) {
            V.analyzedReductionVals(VL);
            unsigned Offset = Pos == Start ? Pos : Pos - 1;
            if (ReduxWidth > ReductionLimit && V.isTreeNotExtendable()) {
              // Add subvectors of VL to the list of the analyzed values.
              for (unsigned VF = getFloorFullVectorNumberOfElements(
                       *TTI, VL.front()->getType(), ReduxWidth - 1);
                   VF >= ReductionLimit;
                   VF = getFloorFullVectorNumberOfElements(
                       *TTI, VL.front()->getType(), VF - 1)) {
                if (has_single_bit(VF) &&
                    V.getCanonicalGraphSize() != V.getTreeSize())
                  continue;
                for (unsigned Idx : seq<unsigned>(ReduxWidth - VF))
                  IgnoredCandidates.insert(std::make_pair(Offset + Idx, VF));
              }
            }
          }
          continue;
        }

        LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:"
                          << Cost << ". (HorRdx)\n");
        V.getORE()->emit([&]() {
          return OptimizationRemark(SV_NAME, "VectorizedHorizontalReduction",
                                    ReducedValsToOps.at(VL[0]).front())
                 << "Vectorized horizontal reduction with cost "
                 << ore::NV("Cost", Cost) << " and with tree size "
                 << ore::NV("TreeSize", V.getTreeSize());
        });

        Builder.setFastMathFlags(RdxFMF);

        // Emit a reduction. If the root is a min/max select, the insert
        // point is the compare condition of that select.
        Instruction *RdxRootInst = cast<Instruction>(ReductionRoot);
        Instruction *InsertPt = RdxRootInst;
        if (IsCmpSelMinMax)
          InsertPt = GetCmpForMinMaxReduction(RdxRootInst);

        // Vectorize a tree.
        Value *VectorizedRoot = V.vectorizeTree(
            LocalExternallyUsedValues, InsertPt, VectorValuesAndScales);
        // Update TrackedToOrig mapping, since the tracked values might have
        // been updated.
        for (Value *RdxVal : Candidates) {
          Value *OrigVal = TrackedToOrig.at(RdxVal);
          Value *TransformedRdxVal = TrackedVals.at(OrigVal);
          if (TransformedRdxVal != RdxVal)
            TrackedToOrig.try_emplace(TransformedRdxVal, OrigVal);
        }

        Builder.SetInsertPoint(InsertPt);

        // To prevent poison from leaking across what used to be sequential,
        // safe, scalar boolean logic operations, the reduction operand must
        // be frozen.
        if (AnyBoolLogicOp && !isGuaranteedNotToBePoison(VectorizedRoot, AC))
          VectorizedRoot = Builder.CreateFreeze(VectorizedRoot);

        // Emit code to correctly handle reused reduced values, if required.
        if (OptReusedScalars && !SameScaleFactor)
          VectorizedRoot = emitReusedOps(VectorizedRoot, Builder, V,
                                         SameValuesCounter, TrackedToOrig);

        Type *ScalarTy = VL.front()->getType();
        // If every scalar repeats the same number of times, the scale can be
        // applied once to the final scalar instead of per lane.
        unsigned Scale = OptReusedScalars && SameScaleFactor
                             ? SameValuesCounter.front().second
                             : 1;
        bool IsSigned = ScalarTy != VectorizedRoot->getType()
                            ? V.isSignedMinBitwidthRootNode()
                            : false;
        VectorValuesAndScales.emplace_back(VectorizedRoot, Scale, IsSigned);

        // Count vectorized reduced values to exclude them from the final
        // reduction.
        for (Value *RdxVal : VL) {
          Value *OrigV = TrackedToOrig.at(RdxVal);
          if (IsSupportedHorRdxIdentityOp) {
            VectorizedVals.try_emplace(OrigV, At(SameValuesCounter, OrigV));
            continue;
          }
          ++VectorizedVals.try_emplace(OrigV).first->getSecond();
          if (!V.isVectorized(RdxVal))
            RequiredExtract.insert(RdxVal);
        }
        Pos += ReduxWidth;
        Start = Pos;
        ReduxWidth = NumReducedVals - Pos;
        if (ReduxWidth > 1)
          ReduxWidth = GetVectorFactor(NumReducedVals - Pos);
        AnyVectorized = true;
      }
      if (OptReusedScalars && !AnyVectorized) {
        for (const std::pair<Value *, unsigned> &P : SameValuesCounter) {
          Value *RdxVal = TrackedVals.at(P.first);
          Value *RedVal = emitScaleForReusedOps(RdxVal, Builder, P.second);
          VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
          VectorizedVals.try_emplace(P.first, P.second);
        }
      }
    }
    if (!VectorValuesAndScales.empty())
      VectorizedTree = GetNewVectorizedTree(
          VectorizedTree,
          emitReduction(Builder, *TTI, ReductionRoot->getType()));

    if (!VectorizedTree) {
      if (!CheckForReusedReductionOps) {
        // Mark the reduction ops as analyzed so that the same roots are not
        // repeatedly revisited.
        for (ReductionOpsType &RdxOps : ReductionOps)
          for (Value *RdxOp : RdxOps)
            V.analyzedReductionRoot(cast<Instruction>(RdxOp));
      }
      return nullptr;
    }

    // Reorder operands of bool logical ops into their natural order to avoid
    // possible poison propagation problems. If reordering is impossible (both
    // operands were originally RHS operands), emit an extra freeze for the
    // LHS operand, e.g. for original code like:
    //   RedOp1 = select i1 ?, i1 LHS, i1 false
    //   RedOp2 = select i1 RHS, i1 ?, i1 false
    auto FixBoolLogicalOps = [&](Value *&LHS, Value *&RHS,
                                 Instruction *RedOp1, Instruction *RedOp2,
                                 bool InitStep) {
      if (!AnyBoolLogicOp)
        return;
      if (isBoolLogicOp(RedOp1) && ((!InitStep && LHS == VectorizedTree) ||
                                    getRdxOperand(RedOp1, 0) == LHS ||
                                    isGuaranteedNotToBePoison(LHS, AC)))
        return;
      if (isBoolLogicOp(RedOp2) && ((!InitStep && RHS == VectorizedTree) ||
                                    getRdxOperand(RedOp2, 0) == RHS ||
                                    isGuaranteedNotToBePoison(RHS, AC))) {
        std::swap(LHS, RHS);
        return;
      }
      if (LHS != VectorizedTree)
        LHS = Builder.CreateFreeze(LHS);
    };
    // Finish the reduction: fold in not-vectorized reduced values, pairing
    // them to avoid dependencies between the scalar remainders.
    auto FinalGen = [&](ArrayRef<std::pair<Instruction *, Value *>> InstVals,
                        bool InitStep) {
      unsigned Sz = InstVals.size();
      SmallVector<std::pair<Instruction *, Value *>> ExtraReds(Sz / 2 +
                                                               Sz % 2);
      for (unsigned I = 0, E = (Sz / 2) * 2; I < E; I += 2) {
        Instruction *RedOp = InstVals[I + 1].first;
        Builder.SetCurrentDebugLocation(RedOp->getDebugLoc());
        Value *RdxVal1 = InstVals[I].second;
        Value *StableRdxVal1 = RdxVal1;
        auto It1 = TrackedVals.find(RdxVal1);
        if (It1 != TrackedVals.end())
          StableRdxVal1 = It1->second;
        Value *RdxVal2 = InstVals[I + 1].second;
        Value *StableRdxVal2 = RdxVal2;
        auto It2 = TrackedVals.find(RdxVal2);
        if (It2 != TrackedVals.end())
          StableRdxVal2 = It2->second;
        // Prevent poison from leaking across the former scalar bool chain.
        FixBoolLogicalOps(StableRdxVal1, StableRdxVal2, InstVals[I].first,
                          RedOp, InitStep);
        Value *ExtraRed = createOp(Builder, RdxKind, StableRdxVal1,
                                   StableRdxVal2, "op.rdx", ReductionOps);
        ExtraReds[I / 2] = std::make_pair(InstVals[I].first, ExtraRed);
      }
      if (Sz % 2 == 1)
        ExtraReds[Sz / 2] = InstVals.back();
      return ExtraReds;
    };
    SmallVector<std::pair<Instruction *, Value *>> ExtraReductions;
    ExtraReductions.emplace_back(cast<Instruction>(ReductionRoot),
                                 VectorizedTree);
    SmallPtrSet<Value *, 8> Visited;
    for (ArrayRef<Value *> Candidates : ReducedVals) {
      for (Value *RdxVal : Candidates) {
        if (!Visited.insert(RdxVal).second)
          continue;
        unsigned NumOps = VectorizedVals.lookup(RdxVal);
        for (Instruction *RedOp :
             ArrayRef(ReducedValsToOps.at(RdxVal)).drop_back(NumOps))
          ExtraReductions.emplace_back(RedOp, RdxVal);
      }
    }
    // Iterate through all not-vectorized reduction values/extra arguments.
    bool InitStep = true;
    while (ExtraReductions.size() > 1) {
      SmallVector<std::pair<Instruction *, Value *>> NewReds =
          FinalGen(ExtraReductions, InitStep);
      ExtraReductions.swap(NewReds);
      InitStep = false;
    }
    VectorizedTree = ExtraReductions.front().second;

    ReductionRoot->replaceAllUsesWith(VectorizedTree);

    // The original scalar reduction is expected to have no remaining uses
    // outside the reduction operations; replace the remaining uses with
    // poison and remove the dead ops.
#ifndef NDEBUG
    SmallPtrSet<Value *, 4> IgnoreSet;
    for (ArrayRef<Value *> RdxOps : ReductionOps)
      IgnoreSet.insert(RdxOps.begin(), RdxOps.end());
#endif
    for (ArrayRef<Value *> RdxOps : ReductionOps) {
      for (Value *Ignore : RdxOps) {
        if (!Ignore)
          continue;
#ifndef NDEBUG
        for (auto *U : Ignore->users()) {
          assert(IgnoreSet.count(U) &&
                 "All users must be either in the reduction ops list.");
        }
#endif
        if (!Ignore->use_empty()) {
          Value *P = PoisonValue::get(Ignore->getType());
          Ignore->replaceAllUsesWith(P);
        }
      }
      V.removeInstructionsAndOperands(RdxOps, VectorValuesAndScales);
    }
    return VectorizedTree;
  }
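  // Commentary (added): tryToReduce records one vector value per successfully
  // vectorized slice in VectorValuesAndScales; the helpers below
  // (createSingleOp / emitReduction) then fold those vectors into a single
  // scalar, while any scalars that never made it into a tree are chained back
  // in with plain "op.rdx" binary ops.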
  /// Reduce one vectorized slice to a scalar, scaling and sign-extending as
  /// needed.
  Value *createSingleOp(IRBuilderBase &Builder, const TargetTransformInfo &TTI,
                        Value *Vec, unsigned Scale, bool IsSigned,
                        Type *DestTy) {
    Value *Rdx;
    if (auto *VecTy = dyn_cast<FixedVectorType>(DestTy)) {
      // Reducing a vector of vectors: emit one reduction per lane group and
      // reassemble the destination vector.
      unsigned DestVF = VecTy->getNumElements();
      unsigned VF = getNumElements(Vec->getType()) / DestVF;
      Rdx = PoisonValue::get(VecTy);
      for (unsigned I : seq<unsigned>(DestVF)) {
        Value *Lane = Builder.CreateExtractVector(
            getWidenedType(VecTy->getElementType(), VF), Vec,
            Builder.getInt64(I * VF));
        Rdx = Builder.CreateInsertElement(
            Rdx, emitReduction(Lane, Builder, &TTI, DestTy), I);
      }
    } else {
      Rdx = emitReduction(Vec, Builder, &TTI, DestTy);
    }
    if (Rdx->getType() != DestTy)
      Rdx = Builder.CreateIntCast(Rdx, DestTy, IsSigned);
    // With the same scale factor for all reduced values, emit a single scalar
    // scale op instead of scaling each lane.
    if (Scale > 1)
      Rdx = emitScaleForReusedOps(Rdx, Builder, Scale);
    return Rdx;
  }
  /// Calculate the cost of a reduction.
  InstructionCost getReductionCost(TargetTransformInfo *TTI,
                                   ArrayRef<Value *> ReducedVals,
                                   bool IsCmpSelMinMax, FastMathFlags FMF,
                                   const BoUpSLP &R, DominatorTree &DT,
                                   const DataLayout &DL,
                                   const TargetLibraryInfo &TLI) {
    TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
    Type *ScalarTy = ReducedVals.front()->getType();
    unsigned ReduxWidth = ReducedVals.size();
    FixedVectorType *VectorTy = R.getReductionType();
    InstructionCost VectorCost = 0, ScalarCost;
    // If all of the reduced values are constant, the vector cost is 0, since
    // the reduction value can be calculated at compile time.
    bool AllConsts = allConstant(ReducedVals);
    auto EvaluateScalarCost = [&](function_ref<InstructionCost()> GenCostFn) {
      InstructionCost Cost = 0;
      // Scalar cost is repeated for N-1 elements.
      int Cnt = ReducedVals.size();
      for (Value *RdxVal : ReducedVals) {
        if (Cnt == 1)
          break;
        --Cnt;
        if (RdxVal->hasNUsesOrMore(IsCmpSelMinMax ? 3 : 2)) {
          Cost += GenCostFn();
          continue;
        }
        InstructionCost ScalarCost = 0;
        for (User *U : RdxVal->users()) {
          auto *RdxOp = cast<Instruction>(U);
          if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) {
            if (RdxKind == RecurKind::FAdd) {
              // An fadd fed by a one-use fmul may later be fused into an fma:
              // cost the pair as fma minus the standalone fmul.
              if (auto *FMul = dyn_cast<Instruction>(RdxVal);
                  FMul && FMul->getOpcode() == Instruction::FMul &&
                  FMul->hasOneUse()) {
                IntrinsicCostAttributes ICA(Intrinsic::fmuladd, ScalarTy,
                                            {ScalarTy, ScalarTy, ScalarTy});
                InstructionCost FMACost =
                    TTI->getIntrinsicInstrCost(ICA, CostKind);
                InstructionCost FMulCost = TTI->getArithmeticInstrCost(
                    Instruction::FMul, ScalarTy, CostKind);
                FMACost -= FMulCost;
                ScalarCost += FMACost;
                continue;
              }
            }
            ScalarCost += TTI->getInstructionCost(RdxOp, CostKind);
            continue;
          }
          ScalarCost = InstructionCost::getInvalid();
          break;
        }
        if (ScalarCost.isValid())
          Cost += ScalarCost;
        else
          Cost += GenCostFn();
      }
      return Cost;
    };
    // A reduction op is only required when the reduced values have not
    // already been combined into vector values with scales.
    bool DoesRequireReductionOp = !AllConsts && VectorValuesAndScales.empty();
    switch (RdxKind) {
    case RecurKind::Add:
    case RecurKind::Mul:
    case RecurKind::Or:
    case RecurKind::And:
    case RecurKind::Xor:
    case RecurKind::FAdd:
    case RecurKind::FMul: {
      unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(RdxKind);
      if (!AllConsts) {
        if (DoesRequireReductionOp) {
          if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
            // Revectorization: the "scalars" are themselves short vectors;
            // reduce each lane group separately.
            unsigned ScalarTyNumElements = VecTy->getNumElements();
            for (unsigned I : seq<unsigned>(ReducedVals.size()))
              VectorCost += getShuffleCost(
                  *TTI, TTI::SK_PermuteSingleSrc, VectorTy,
                  createStrideMask(I, ScalarTyNumElements,
                                   ReducedVals.size()),
                  CostKind);
            // ...
          } else {
            Type *RedTy = VectorTy->getElementType();
            auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
                std::make_pair(RedTy, true));
            if (RType == RedTy) {
              VectorCost = TTI->getArithmeticReductionCost(RdxOpcode, VectorTy,
                                                           FMF, CostKind);
            } else {
              VectorCost = TTI->getExtendedReductionCost(
                  RdxOpcode, !IsSigned, RedTy,
                  getWidenedType(RType, ReduxWidth), FMF, CostKind);
            }
          }
        } else {
          Type *RedTy = VectorTy->getElementType();
          auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
              std::make_pair(RedTy, true));
          VectorType *RVecTy = getWidenedType(RType, ReduxWidth);
          InstructionCost FMACost = InstructionCost::getInvalid();
          if (RdxKind == RecurKind::FAdd) {
            // Check if all reduced values feed this fadd chain through
            // one-use fmuls, in which case the whole chain may become fmas.
            SmallVector<Value *> Ops;
            for (Value *RdxVal : ReducedVals) {
              if (!RdxVal->hasOneUse()) {
                Ops.clear();
                break;
              }
              if (auto *FPCI = dyn_cast<FPMathOperator>(RdxVal))
                FMF &= FPCI->getFastMathFlags();
              Ops.push_back(RdxVal);
            }
            if (!Ops.empty()) {
              IntrinsicCostAttributes ICA(Intrinsic::fmuladd, RVecTy,
                                          {RVecTy, RVecTy, RVecTy}, FMF);
              FMACost = TTI->getIntrinsicInstrCost(ICA, CostKind);
              InstructionCost FMulCost = TTI->getArithmeticInstrCost(
                  Instruction::FMul, RVecTy, CostKind);
              LLVM_DEBUG(dbgs() << "FMA cost: " << FMACost
                                << "Minus vector FMul cost: " << FMulCost
                                << "\n");
              FMACost -= FMulCost;
            }
          }
          VectorCost +=
              TTI->getArithmeticInstrCost(RdxOpcode, RVecTy, CostKind);
          if (FMACost.isValid())
            VectorCost += FMACost;
          if (RType != RedTy) {
            unsigned Opcode = Instruction::Trunc;
            if (RedTy->getScalarSizeInBits() > RType->getScalarSizeInBits())
              Opcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
            VectorCost +=
                TTI->getCastInstrCost(Opcode, VectorTy, RVecTy,
                                      TTI::CastContextHint::None, CostKind);
          }
        }
      }
      ScalarCost = EvaluateScalarCost([&]() {
        return TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy, CostKind);
      });
      break;
    }
    case RecurKind::FMax:
    case RecurKind::FMin:
    case RecurKind::FMaximum:
    case RecurKind::FMinimum:
    case RecurKind::SMax:
    case RecurKind::SMin:
    case RecurKind::UMax:
    case RecurKind::UMin: {
      Intrinsic::ID Id = llvm::getMinMaxReductionIntrinsicOp(RdxKind);
      if (!AllConsts) {
        if (DoesRequireReductionOp) {
          VectorCost =
              TTI->getMinMaxReductionCost(Id, VectorTy, FMF, CostKind);
        } else {
          // The previous reduction already exists; account a plain vector
          // min/max plus a possible final cast.
          Type *RedTy = VectorTy->getElementType();
          auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
              std::make_pair(RedTy, true));
          VectorType *RVecTy = getWidenedType(RType, ReduxWidth);
          IntrinsicCostAttributes ICA(Id, RVecTy, {RVecTy, RVecTy}, FMF);
          VectorCost += TTI->getIntrinsicInstrCost(ICA, CostKind);
          if (RType != RedTy) {
            unsigned Opcode = Instruction::Trunc;
            if (RedTy->getScalarSizeInBits() > RType->getScalarSizeInBits())
              Opcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
            VectorCost +=
                TTI->getCastInstrCost(Opcode, VectorTy, RVecTy,
                                      TTI::CastContextHint::None, CostKind);
          }
        }
      }
      ScalarCost = EvaluateScalarCost([&]() {
        IntrinsicCostAttributes ICA(Id, ScalarTy, {ScalarTy, ScalarTy}, FMF);
        return TTI->getIntrinsicInstrCost(ICA, CostKind);
      });
      break;
    }
    default:
      llvm_unreachable("Expected arithmetic or min/max reduction operation");
    }

    LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost
                      << " (It is a splitting reduction)\n");
    return VectorCost - ScalarCost;
  }
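  // Commentary (added; illustrative arithmetic, not from the original
  // source): the result is reported as VectorCost - ScalarCost, so a negative
  // value means the vector form is cheaper. For example, with a scalar add
  // cost of 1, an 8-wide chain costs 7 (N-1 ops), and if the target reports
  // llvm.vector.reduce.add at cost 2, the reduction is a clear win
  // (2 - 7 = -5).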
  /// Emit the final reduction over the accumulated vector values and scales.
  Value *emitReduction(IRBuilderBase &Builder, const TargetTransformInfo &TTI,
                       Type *DestTy) {
    Value *ReducedSubTree = nullptr;
    // Creates a reduction and combines it with the previous one.
    auto CreateSingleOp = [&](Value *Vec, unsigned Scale, bool IsSigned) {
      Value *Rdx = createSingleOp(Builder, TTI, Vec, Scale, IsSigned, DestTy);
      if (ReducedSubTree)
        ReducedSubTree = createOp(Builder, RdxKind, ReducedSubTree, Rdx,
                                  "op.rdx", ReductionOps);
      else
        ReducedSubTree = Rdx;
    };
    if (VectorValuesAndScales.size() == 1) {
      const auto &[Vec, Scale, IsSigned] = VectorValuesAndScales.front();
      CreateSingleOp(Vec, Scale, IsSigned);
      return ReducedSubTree;
    }
    // Scale Vec by Cnt and combine it with the previous value in VecRes.
    Value *VecRes = nullptr;
    bool VecResSignedness = false;
    auto CreateVecOp = [&](Value *Vec, unsigned Cnt, bool IsSigned) {
      Type *ScalarTy = cast<VectorType>(Vec->getType())->getElementType();
      switch (RdxKind) {
      case RecurKind::Add: {
        if (ScalarTy == Builder.getInt1Ty() && ScalarTy != DestTy) {
          // Replicate the boolean vector Cnt times instead of multiplying.
          unsigned VF = getNumElements(Vec->getType());
          LLVM_DEBUG(dbgs() << "SLP: Add (to-shuffle) " << Cnt << "of " << Vec
                            << ". (HorRdx)\n");
          SmallVector<int> Mask(Cnt * VF, PoisonMaskElem);
          for (unsigned I : seq<unsigned>(Cnt))
            std::iota(std::next(Mask.begin(), VF * I),
                      std::next(Mask.begin(), VF * (I + 1)), 0);
          ++NumVectorInstructions;
          Vec = Builder.CreateShuffleVector(Vec, Mask);
          break;
        }
        // res = mul vv, n
        Value *Scale = ConstantVector::getSplat(
            ElementCount::getFixed(getNumElements(Vec->getType())),
            ConstantInt::get(ScalarTy, Cnt));
        LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << "of " << Vec
                          << ". (HorRdx)\n");
        ++NumVectorInstructions;
        Vec = Builder.CreateMul(Vec, Scale);
        break;
      }
      case RecurKind::Xor: {
        // res = n % 2 ? 0 : vv
        LLVM_DEBUG(dbgs() << "SLP: Xor " << Cnt << "of " << Vec
                          << ". (HorRdx)\n");
        if (Cnt % 2 == 0)
          Vec = Constant::getNullValue(Vec->getType());
        break;
      }
      case RecurKind::FAdd: {
        // res = fmul v, n
        Value *Scale = ConstantVector::getSplat(
            ElementCount::getFixed(getNumElements(Vec->getType())),
            ConstantFP::get(ScalarTy, Cnt));
        LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << "of " << Vec
                          << ". (HorRdx)\n");
        ++NumVectorInstructions;
        Vec = Builder.CreateFMul(Vec, Scale);
        break;
      }
      case RecurKind::And:
      case RecurKind::Or:
      case RecurKind::SMax:
      case RecurKind::SMin:
      case RecurKind::UMax:
      case RecurKind::UMin:
      case RecurKind::FMax:
      case RecurKind::FMin:
      case RecurKind::FMaximum:
      case RecurKind::FMinimum:
        // res = vv
        break;
      case RecurKind::Sub:
      case RecurKind::AddChainWithSubs:
      case RecurKind::Mul:
      case RecurKind::FMul:
      case RecurKind::FMulAdd:
      case RecurKind::AnyOf:
      case RecurKind::FindFirstIVSMin:
      case RecurKind::FindFirstIVUMin:
      case RecurKind::FindLastIVSMax:
      case RecurKind::FindLastIVUMax:
      case RecurKind::FMaxNum:
      case RecurKind::FMinNum:
      case RecurKind::FMaximumNum:
      case RecurKind::FMinimumNum:
      case RecurKind::None:
        llvm_unreachable("Unexpected reduction kind for repeated scalar.");
      }
      if (!VecRes) {
        VecRes = Vec;
        VecResSignedness = IsSigned;
        return;
      }
      ++NumVectorInstructions;
      if (ScalarTy == Builder.getInt1Ty() && ScalarTy != DestTy &&
          VecRes->getType() != Vec->getType()) {
        SmallVector<int> Mask(getNumElements(VecRes->getType()),
                              PoisonMaskElem);
        std::iota(Mask.begin(), Mask.end(), 0);
        // ...
      }
      unsigned VecResVF = getNumElements(VecRes->getType());
      unsigned VecVF = getNumElements(Vec->getType());
      // If the two vectors differ in width, widen the narrower one with
      // poison lanes before combining and shrink the result back afterwards.
      if (VecResVF < VecVF) {
        // ...
      }
      if (VecResVF != VecVF) {
        SmallVector<int> ResizeMask(std::max(VecResVF, VecVF), PoisonMaskElem);
        std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
        // ...
      }
      Value *Op = VecRes;
      if (VecResVF != VecVF) {
        // Resize one of the operands (elided).
      }
      Op = createOp(Builder, RdxKind, Op, Vec, "rdx.op", ReductionOps);
      if (VecResVF != VecVF) {
        // Shrink the combined value back to the narrower width (elided).
      }
      VecRes = Op;
    };
    for (auto [Vec, Scale, IsSigned] : VectorValuesAndScales)
      CreateVecOp(Vec, Scale, IsSigned);
    CreateSingleOp(VecRes, /*Scale=*/1, /*IsSigned=*/false);

    return ReducedSubTree;
  }
  /// Emit a horizontal reduction of the vectorized value.
  Value *emitReduction(Value *VectorizedValue, IRBuilderBase &Builder,
                       const TargetTransformInfo *TTI, Type *DestTy) {
    assert(VectorizedValue && "Need to have a vectorized tree node");
    assert(RdxKind != RecurKind::FMulAdd &&
           "A call to the llvm.fmuladd intrinsic is not handled yet");

    auto *FTy = cast<FixedVectorType>(VectorizedValue->getType());
    if (FTy->getScalarType() == Builder.getInt1Ty() &&
        RdxKind == RecurKind::Add &&
        DestTy->getScalarType() != FTy->getScalarType()) {
      // Convert vector_reduce_add(ZExt(<n x i1>)) to
      // ZExtOrTrunc(ctpop(bitcast <n x i1> to iN)).
      Value *V = Builder.CreateBitCast(
          VectorizedValue, Builder.getIntNTy(FTy->getNumElements()));
      ++NumVectorInstructions;
      return Builder.CreateUnaryIntrinsic(Intrinsic::ctpop, V);
    }
    ++NumVectorInstructions;
    return createSimpleReduction(Builder, VectorizedValue, RdxKind);
  }
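  // Commentary (added): the i1 special case above turns an add reduction of a
  // boolean vector into a population count, e.g.
  //   %int = bitcast <8 x i1> %v to i8
  //   %cnt = call i8 @llvm.ctpop.i8(i8 %int)
  // which avoids materializing the zero-extended vector entirely.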
  /// Emits optimized code for a unique scalar value reused \p Cnt times.
  Value *emitScaleForReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
                               unsigned Cnt) {
    assert(IsSupportedHorRdxIdentityOp &&
           "The optimization of matched scalar identity horizontal reductions "
           "must be supported.");
    if (Cnt == 1)
      return VectorizedValue;
    switch (RdxKind) {
    case RecurKind::Add: {
      // res = mul vv, n
      Value *Scale = ConstantInt::get(VectorizedValue->getType(), Cnt);
      LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << "of "
                        << VectorizedValue << ". (HorRdx)\n");
      return Builder.CreateMul(VectorizedValue, Scale);
    }
    case RecurKind::Xor: {
      // res = n % 2 ? 0 : vv
      LLVM_DEBUG(dbgs() << "SLP: Xor " << Cnt << "of " << VectorizedValue
                        << ". (HorRdx)\n");
      if (Cnt % 2 == 0)
        return Constant::getNullValue(VectorizedValue->getType());
      return VectorizedValue;
    }
    case RecurKind::FAdd: {
      // res = fmul v, n
      Value *Scale = ConstantFP::get(VectorizedValue->getType(), Cnt);
      LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << "of "
                        << VectorizedValue << ". (HorRdx)\n");
      return Builder.CreateFMul(VectorizedValue, Scale);
    }
    case RecurKind::And:
    case RecurKind::Or:
    case RecurKind::SMax:
    case RecurKind::SMin:
    case RecurKind::UMax:
    case RecurKind::UMin:
    case RecurKind::FMax:
    case RecurKind::FMin:
    case RecurKind::FMaximum:
    case RecurKind::FMinimum:
      // res = vv
      return VectorizedValue;
    case RecurKind::Sub:
    case RecurKind::AddChainWithSubs:
    case RecurKind::Mul:
    case RecurKind::FMul:
    case RecurKind::FMulAdd:
    case RecurKind::AnyOf:
    case RecurKind::FindFirstIVSMin:
    case RecurKind::FindFirstIVUMin:
    case RecurKind::FindLastIVSMax:
    case RecurKind::FindLastIVUMax:
    case RecurKind::FMaxNum:
    case RecurKind::FMinNum:
    case RecurKind::FMaximumNum:
    case RecurKind::FMinimumNum:
    case RecurKind::None:
      llvm_unreachable("Unexpected reduction kind for repeated scalar.");
    }
    return nullptr;
  }
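  // Commentary (added): emitScaleForReusedOps handles a single value repeated
  // Cnt times: add becomes one mul (x+x+x -> x*3), fadd one fmul, and xor
  // folds to 0 for even Cnt since x^x == 0. emitReusedOps below generalizes
  // this to a per-lane scale vector when different lanes repeat a different
  // number of times.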
  /// Emits the actual operation for the scalar identity values, found during
  /// horizontal reduction analysis.
  Value *
  emitReusedOps(Value *VectorizedValue, IRBuilderBase &Builder, BoUpSLP &R,
                const SmallMapVector<Value *, unsigned, 16> &SameValuesCounter,
                const DenseMap<Value *, Value *> &TrackedToOrig) {
    assert(IsSupportedHorRdxIdentityOp &&
           "The optimization of matched scalar identity horizontal reductions "
           "must be supported.");
    ArrayRef<Value *> VL = R.getRootNodeScalars();
    auto *VTy = cast<FixedVectorType>(VectorizedValue->getType());
    if (VTy->getElementType() != VL.front()->getType()) {
      VectorizedValue = Builder.CreateIntCast(
          VectorizedValue,
          getWidenedType(VL.front()->getType(), VTy->getNumElements()),
          R.isSignedMinBitwidthRootNode());
    }
    switch (RdxKind) {
    case RecurKind::Add: {
      // root = mul prev_root, <1, 1, n, 1>
      SmallVector<Constant *> Vals;
      for (Value *V : VL) {
        unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
        Vals.push_back(ConstantInt::get(V->getType(), Cnt,
                                        /*IsSigned=*/false));
      }
      auto *Scale = ConstantVector::get(Vals);
      LLVM_DEBUG(dbgs() << "SLP: Add (to-mul): " << Scale << "of "
                        << VectorizedValue << ". (HorRdx)\n");
      return Builder.CreateMul(VectorizedValue, Scale);
    }
    case RecurKind::And:
    case RecurKind::Or:
      // No need for multiple or/and(s).
      LLVM_DEBUG(dbgs() << "SLP: And/or of same " << VectorizedValue
                        << ". (HorRdx)\n");
      return VectorizedValue;
    case RecurKind::SMax:
    case RecurKind::SMin:
    case RecurKind::UMax:
    case RecurKind::UMin:
    case RecurKind::FMax:
    case RecurKind::FMin:
    case RecurKind::FMaximum:
    case RecurKind::FMinimum:
      // No need for multiple min/max(s) of the same value.
      LLVM_DEBUG(dbgs() << "SLP: Max/min of same " << VectorizedValue
                        << ". (HorRdx)\n");
      return VectorizedValue;
    case RecurKind::Xor: {
      // Replace values with even number of repeats with 0, since
      // x xor x = 0.
      // root = shuffle prev_root, zeroinitializer, <0, 1, 2, vf, 4, vf, 5, 6,
      // 7>, if the 4th and 6th elements have an even number of repeats.
      SmallVector<int> Mask(
          cast<FixedVectorType>(VectorizedValue->getType())->getNumElements(),
          PoisonMaskElem);
      std::iota(Mask.begin(), Mask.end(), 0);
      bool NeedShuffle = false;
      for (unsigned I = 0, VF = VL.size(); I < VF; ++I) {
        Value *V = VL[I];
        unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
        if (Cnt % 2 == 0) {
          Mask[I] = VF;
          NeedShuffle = true;
        }
      }
      LLVM_DEBUG(dbgs() << "SLP: Xor <"; for (int I : Mask) dbgs() << I << " ";
                 dbgs() << "> of " << VectorizedValue << ". (HorRdx)\n");
      if (NeedShuffle)
        VectorizedValue = Builder.CreateShuffleVector(
            VectorizedValue,
            ConstantVector::getNullValue(VectorizedValue->getType()), Mask);
      return VectorizedValue;
    }
    case RecurKind::FAdd: {
      // root = fmul prev_root, <1.0, 1.0, n.0, 1.0>
      SmallVector<Constant *> Vals;
      for (Value *V : VL) {
        unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
        Vals.push_back(ConstantFP::get(V->getType(), Cnt));
      }
      auto *Scale = ConstantVector::get(Vals);
      return Builder.CreateFMul(VectorizedValue, Scale);
    }
    case RecurKind::Sub:
    case RecurKind::AddChainWithSubs:
    case RecurKind::Mul:
    case RecurKind::FMul:
    case RecurKind::FMulAdd:
    case RecurKind::AnyOf:
    case RecurKind::FindFirstIVSMin:
    case RecurKind::FindFirstIVUMin:
    case RecurKind::FindLastIVSMax:
    case RecurKind::FindLastIVUMax:
    case RecurKind::FMaxNum:
    case RecurKind::FMinNum:
    case RecurKind::FMaximumNum:
    case RecurKind::FMinimumNum:
    case RecurKind::None:
      llvm_unreachable("Unexpected reduction kind for reused scalars.");
    }
    return nullptr;
  }
};
} // end anonymous namespace
/// Gets the recurrence kind from the specified value.
static RecurKind getRdxKind(Value *V) {
  return HorizontalReduction::getRdxKind(V);
}

static std::optional<unsigned> getAggregateSize(Instruction *InsertInst) {
  if (auto *IE = dyn_cast<InsertElementInst>(InsertInst))
    return cast<FixedVectorType>(IE->getType())->getNumElements();

  unsigned AggregateSize = 1;
  auto *IV = cast<InsertValueInst>(InsertInst);
  Type *CurrentType = IV->getType();
  do {
    if (auto *ST = dyn_cast<StructType>(CurrentType)) {
      for (auto *Elt : ST->elements())
        if (Elt != ST->getElementType(0)) // check homogeneity
          return std::nullopt;
      AggregateSize *= ST->getNumElements();
      CurrentType = ST->getElementType(0);
    } else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) {
      AggregateSize *= AT->getNumElements();
      CurrentType = AT->getElementType();
    } else if (auto *VT = dyn_cast<FixedVectorType>(CurrentType)) {
      AggregateSize *= VT->getNumElements();
      return AggregateSize;
    } else if (CurrentType->isSingleValueType()) {
      return AggregateSize;
    } else {
      return std::nullopt;
    }
  } while (true);
}
static void findBuildAggregateRec(Instruction *LastInsertInst,
                                  TargetTransformInfo *TTI,
                                  SmallVectorImpl<Value *> &BuildVectorOpds,
                                  SmallVectorImpl<Value *> &InsertElts,
                                  unsigned OperandOffset, const BoUpSLP &R) {
  do {
    Value *InsertedOperand = LastInsertInst->getOperand(1);
    std::optional<unsigned> OperandIndex =
        getElementIndex(LastInsertInst, OperandOffset);
    if (!OperandIndex || R.isDeleted(LastInsertInst))
      return;
    if (isa<InsertElementInst, InsertValueInst>(InsertedOperand)) {
      findBuildAggregateRec(cast<Instruction>(InsertedOperand), TTI,
                            BuildVectorOpds, InsertElts, *OperandIndex, R);
    } else {
      BuildVectorOpds[*OperandIndex] = InsertedOperand;
      InsertElts[*OperandIndex] = LastInsertInst;
    }
    LastInsertInst = dyn_cast<Instruction>(LastInsertInst->getOperand(0));
  } while (LastInsertInst != nullptr &&
           isa<InsertValueInst, InsertElementInst>(LastInsertInst) &&
           LastInsertInst->hasOneUse());
}

static bool findBuildAggregate(Instruction *LastInsertInst,
                               TargetTransformInfo *TTI,
                               SmallVectorImpl<Value *> &BuildVectorOpds,
                               SmallVectorImpl<Value *> &InsertElts,
                               const BoUpSLP &R) {
  assert((isa<InsertElementInst>(LastInsertInst) ||
          isa<InsertValueInst>(LastInsertInst)) &&
         "Expected insertelement or insertvalue instruction!");
  assert((BuildVectorOpds.empty() && InsertElts.empty()) &&
         "Expected empty result vectors!");

  std::optional<unsigned> AggregateSize = getAggregateSize(LastInsertInst);
  if (!AggregateSize)
    return false;
  BuildVectorOpds.resize(*AggregateSize);
  InsertElts.resize(*AggregateSize);

  findBuildAggregateRec(LastInsertInst, TTI, BuildVectorOpds, InsertElts, 0,
                        R);
  llvm::erase(BuildVectorOpds, nullptr);
  llvm::erase(InsertElts, nullptr);
  return BuildVectorOpds.size() >= 2;
}

static Instruction *getReductionInstr(const DominatorTree *DT, PHINode *P,
                                      BasicBlock *ParentBB, LoopInfo *LI) {
  // TODO: Currently, we only handle PHI nodes with two incoming values.
  auto DominatedReduxValue = [&](Value *R) {
    return isa<Instruction>(R) &&
           DT->dominates(P->getParent(), cast<Instruction>(R)->getParent());
  };

  Instruction *Rdx = nullptr;

  // Return the incoming value if it comes from the same BB as the phi node.
  if (P->getIncomingBlock(0) == ParentBB) {
    Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
  } else if (P->getIncomingBlock(1) == ParentBB) {
    Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
  }

  if (Rdx && DominatedReduxValue(Rdx))
    return Rdx;

  // Otherwise, check whether we have a loop latch to look at.
  Loop *BBLoop = LI->getLoopFor(ParentBB);
  if (!BBLoop)
    return nullptr;
  BasicBlock *BBLatch = BBLoop->getLoopLatch();
  if (!BBLatch)
    return nullptr;

  // There is a loop latch, return the incoming value if it comes from that.
  // This reduction pattern occasionally turns up.
  if (P->getIncomingBlock(0) == BBLatch) {
    Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
  } else if (P->getIncomingBlock(1) == BBLatch) {
    Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
  }

  if (Rdx && DominatedReduxValue(Rdx))
    return Rdx;

  return nullptr;
}

/// We could have an initial reduction that is not an add, e.g.
///   r *= v1 + v2 + v3 + v4
/// In such a case start looking for a tree rooted in the first '+'.
/// \returns the new root if found, which may be nullptr if not an
/// instruction.
static Instruction *tryGetSecondaryReductionRoot(PHINode *Phi,
                                                 Instruction *Root) {
  assert((isa<BinaryOperator>(Root) || isa<SelectInst>(Root) ||
          isa<IntrinsicInst>(Root)) &&
         "Expected binop, select, or intrinsic for reduction matching");
  Value *LHS =
      Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root));
  Value *RHS =
      Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root) + 1);
  if (LHS == Phi)
    return dyn_cast<Instruction>(RHS);
  if (RHS == Phi)
    return dyn_cast<Instruction>(LHS);
  return nullptr;
}

/// \returns the first operand of \p I that does not match \p Phi. If the
/// operand is not an instruction it returns nullptr.
static Instruction *getNonPhiOperand(Instruction *I, PHINode *Phi) {
  Value *Op0 = nullptr;
  Value *Op1 = nullptr;
  if (!matchRdxBop(I, Op0, Op1))
    return nullptr;
  return dyn_cast<Instruction>(Op0 == Phi ? Op1 : Op0);
}

/// \returns true if \p I is a candidate instruction for reduction
/// vectorization.
static bool isReductionCandidate(Instruction *I) {
  bool IsSelect = match(I, m_Select(m_Value(), m_Value(), m_Value()));
  Value *B0 = nullptr, *B1 = nullptr;
  bool IsBinop = matchRdxBop(I, B0, B1);
  return IsBinop || IsSelect;
}
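// Commentary (added): vectorizeHorReduction below performs a breadth-first
// walk (a std::queue-based "Stack") from the selected root: each visited
// instruction is first tried as a reduction root, and only on failure are its
// operands enqueued as future seeds, bounded by RecursionMaxDepth.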
bool SLPVectorizerPass::vectorizeHorReduction(
    PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
    SmallVectorImpl<WeakTrackingVH> &PostponedInsts) {
  if (!ShouldVectorizeHor)
    return false;
  bool TryOperandsAsNewSeeds = P && isa<BinaryOperator>(Root);

  if (Root->getParent() != BB || isa<PHINode>(Root))
    return false;

  // If we can find a secondary reduction root, use that instead.
  auto SelectRoot = [&]() {
    if (TryOperandsAsNewSeeds && isReductionCandidate(Root) &&
        HorizontalReduction::getRdxKind(Root) != RecurKind::None)
      if (Instruction *NewRoot = tryGetSecondaryReductionRoot(P, Root))
        return NewRoot;
    return Root;
  };

  // Start analysis starting from Root instruction. If a horizontal reduction
  // is found, try to vectorize it. If it is not a horizontal reduction or
  // vectorization is not possible or not effective, and the currently
  // analyzed instruction is a binary operation, try to vectorize the
  // operands, using pre-order DFS traversal order.
  std::queue<std::pair<Instruction *, unsigned>> Stack;
  Stack.emplace(SelectRoot(), 0);
  SmallPtrSet<Value *, 8> VisitedInstrs;
  bool Res = false;
  auto TryToReduce = [this, &R](Instruction *Inst) -> Value * {
    if (R.isAnalyzedReductionRoot(Inst))
      return nullptr;
    if (!isReductionCandidate(Inst))
      return nullptr;
    HorizontalReduction HorRdx;
    if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI))
      return nullptr;
    return HorRdx.tryToReduce(R, *DL, TTI, *TLI, AC, *DT);
  };
  auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) {
    if (TryOperandsAsNewSeeds && FutureSeed == Root) {
      FutureSeed = getNonPhiOperand(Root, P);
      if (!FutureSeed)
        return false;
    }
    // Do not collect CmpInst or InsertElementInst/InsertValueInst as their
    // analysis is done separately.
    if (!isa<CmpInst, InsertElementInst, InsertValueInst>(FutureSeed))
      PostponedInsts.push_back(FutureSeed);
    return true;
  };

  while (!Stack.empty()) {
    Instruction *Inst;
    unsigned Level;
    std::tie(Inst, Level) = Stack.front();
    Stack.pop();
    // Do not try to analyze an instruction that has already been vectorized.
    // This may happen when we vectorize instruction operands on a previous
    // iteration while the stack was populated before that happened.
    if (R.isDeleted(Inst))
      continue;
    if (Value *VectorizedV = TryToReduce(Inst)) {
      Res = true;
      if (auto *I = dyn_cast<Instruction>(VectorizedV)) {
        // Try to find another reduction.
        Stack.emplace(I, Level);
        continue;
      }
      if (R.isDeleted(Inst))
        continue;
    } else {
      // We could not vectorize `Inst`, so try to use it as a future seed.
      if (!TryAppendToPostponedInsts(Inst)) {
        assert(Stack.empty() && "Expected empty stack");
        break;
      }
    }

    // Try to vectorize operands. Continue analysis for instructions from the
    // same basic block only to save compile time.
    if (++Level < RecursionMaxDepth)
      for (auto *Op : Inst->operand_values())
        if (VisitedInstrs.insert(Op).second)
          if (auto *I = dyn_cast<Instruction>(Op))
            // Do not try to vectorize CmpInst operands; this is done
            // separately.
            if (!isa<PHINode, CmpInst, InsertElementInst, InsertValueInst>(
                    I) &&
                !R.isDeleted(I) && I->getParent() == BB)
              Stack.emplace(I, Level);
  }
  return Res;
}
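// Commentary (added): tryToVectorize(Instruction *, ...) below handles the
// tiny two-operand case: for a root like %r = add i32 %a, %b it first tries
// to treat the pair {%a, %b} as a 2-wide reduction and otherwise falls back
// to tryToVectorizeList on the best candidate operand pair.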
bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
  if (!I)
    return false;

  if (!isa<BinaryOperator, CmpInst>(I) || isa<VectorType>(I->getType()))
    return false;
  // Skip potential FMA candidates.
  if ((I->getOpcode() == Instruction::FAdd ||
       I->getOpcode() == Instruction::FSub) &&
      canConvertToFMA(I, getSameOpcode(I, *TLI), *DT, *DL, *TTI, *TLI)
          .isValid())
    return false;

  Value *P = I->getParent();

  // Vectorize in the current basic block only.
  auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
  auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
  if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P ||
      R.isDeleted(Op0) || R.isDeleted(Op1))
    return false;

  // First collect all possible candidates.
  SmallVector<std::pair<Value *, Value *>, 4> Candidates;
  Candidates.emplace_back(Op0, Op1);

  auto *A = dyn_cast<BinaryOperator>(Op0);
  auto *B = dyn_cast<BinaryOperator>(Op1);
  // Try to skip B.
  if (A && B && B->hasOneUse()) {
    auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
    auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
    if (B0 && B0->getParent() == P && !R.isDeleted(B0))
      Candidates.emplace_back(A, B0);
    if (B1 && B1->getParent() == P && !R.isDeleted(B1))
      Candidates.emplace_back(A, B1);
  }
  // Try to skip A.
  if (B && A && A->hasOneUse()) {
    auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
    auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
    if (A0 && A0->getParent() == P && !R.isDeleted(A0))
      Candidates.emplace_back(A0, B);
    if (A1 && A1->getParent() == P && !R.isDeleted(A1))
      Candidates.emplace_back(A1, B);
  }

  auto TryToReduce = [this, &R, &TTI = *TTI](Instruction *Inst,
                                             ArrayRef<Value *> Ops) {
    if (!isReductionCandidate(Inst))
      return false;
    Type *Ty = Inst->getType();
    if (!isValidElementType(Ty) || Ty->isPointerTy())
      return false;
    HorizontalReduction HorRdx(Inst, Ops);
    if (!HorRdx.matchReductionForOperands())
      return false;
    // Check the cost of the operations.
    VectorType *VecTy = getWidenedType(Ty, Ops.size());
    constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
    InstructionCost ScalarCost =
        TTI.getScalarizationOverhead(
            VecTy, APInt::getAllOnes(getNumElements(VecTy)),
            /*Insert=*/false, /*Extract=*/true, CostKind) +
        TTI.getInstructionCost(Inst, CostKind);
    InstructionCost RedCost;
    switch (::getRdxKind(Inst)) {
    case RecurKind::Add:
    case RecurKind::Mul:
    case RecurKind::Or:
    case RecurKind::And:
    case RecurKind::Xor:
    case RecurKind::FAdd:
    case RecurKind::FMul: {
      FastMathFlags FMF;
      if (auto *FPCI = dyn_cast<FPMathOperator>(Inst))
        FMF = FPCI->getFastMathFlags();
      RedCost = TTI.getArithmeticReductionCost(Inst->getOpcode(), VecTy, FMF,
                                               CostKind);
      break;
    }
    default:
      return false;
    }
    if (RedCost >= ScalarCost)
      return false;

    return HorRdx.tryToReduce(R, *DL, &TTI, *TLI, AC, *DT) != nullptr;
  };
  if (Candidates.size() == 1)
    return TryToReduce(I, {Op0, Op1}) || tryToVectorizeList({Op0, Op1}, R);

  // We have multiple options. Try to pick the single best.
  std::optional<int> BestCandidate = R.findBestRootPair(Candidates);
  if (!BestCandidate)
    return false;
  return (*BestCandidate == 0 &&
          TryToReduce(I, {Candidates[*BestCandidate].first,
                          Candidates[*BestCandidate].second})) ||
         tryToVectorizeList({Candidates[*BestCandidate].first,
                             Candidates[*BestCandidate].second},
                            R);
}
bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Instruction *Root,
                                                 BasicBlock *BB, BoUpSLP &R) {
  SmallVector<WeakTrackingVH> PostponedInsts;
  bool Res = vectorizeHorReduction(P, Root, BB, R, PostponedInsts);
  Res |= tryToVectorize(PostponedInsts, R);
  return Res;
}

bool SLPVectorizerPass::tryToVectorize(ArrayRef<WeakTrackingVH> Insts,
                                       BoUpSLP &R) {
  bool Res = false;
  for (Value *V : Insts)
    if (auto *Inst = dyn_cast<Instruction>(V); Inst && !R.isDeleted(Inst))
      Res |= tryToVectorize(Inst, R);
  return Res;
}
bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
                                                 BasicBlock *BB, BoUpSLP &R,
                                                 bool MaxVFOnly) {
  if (!R.canMapToVector(IVI->getType()))
    return false;

  SmallVector<Value *, 16> BuildVectorOpds;
  SmallVector<Value *, 16> BuildVectorInsts;
  if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, BuildVectorInsts, R))
    return false;

  if (MaxVFOnly && BuildVectorOpds.size() == 2) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "NotPossible", IVI)
             << "Cannot SLP vectorize list: only 2 elements of buildvalue, "
                "trying reduction first.";
    });
    return false;
  }
  LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
  // Aggregate value is unlikely to be processed in a vector register.
  return tryToVectorizeList(BuildVectorOpds, R, MaxVFOnly);
}
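// Commentary (added): the buildvector pattern matched by findBuildAggregate
// is a one-use insert chain such as
//   %v0 = insertelement <4 x float> poison, float %s0, i32 0
//   %v1 = insertelement <4 x float> %v0,   float %s1, i32 1
//   ...
// whose scalar operands become the candidate list for vectorization.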
bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
                                                   BasicBlock *BB, BoUpSLP &R,
                                                   bool MaxVFOnly) {
  SmallVector<Value *, 16> BuildVectorInsts;
  SmallVector<Value *, 16> BuildVectorOpds;
  SmallVector<int> Mask;
  if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts, R) ||
      (all_of(BuildVectorOpds, IsaPred<ExtractElementInst, UndefValue>) &&
       isFixedVectorShuffle(BuildVectorOpds, Mask, AC)))
    return false;

  if (MaxVFOnly && BuildVectorInsts.size() == 2) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "NotPossible", IEI)
             << "Cannot SLP vectorize list: only 2 elements of buildvector, "
                "trying reduction first.";
    });
    return false;
  }
  LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n");
  return tryToVectorizeList(BuildVectorInsts, R, MaxVFOnly);
}
template <typename T>
static bool tryToVectorizeSequence(
    SmallVectorImpl<T *> &Incoming, function_ref<bool(T *, T *)> Comparator,
    function_ref<bool(ArrayRef<T *>, T *)> AreCompatible,
    function_ref<bool(ArrayRef<T *>, bool)> TryToVectorizeHelper,
    bool MaxVFOnly, BoUpSLP &R) {
  bool Changed = false;
  // Sort by type, parent, operands.
  stable_sort(Incoming, Comparator);

  // Try to vectorize elements based on their type.
  SmallVector<T *> Candidates;
  SmallVector<T *> VL;
  for (auto *IncIt = Incoming.begin(), *E = Incoming.end(); IncIt != E;
       VL.clear()) {
    // Look for the next elements with the same type, parent and operand
    // kinds.
    auto *I = dyn_cast<Instruction>(*IncIt);
    if (!I || R.isDeleted(I)) {
      ++IncIt;
      continue;
    }
    auto *SameTypeIt = IncIt;
    while (SameTypeIt != E &&
           (!isa<Instruction>(*SameTypeIt) ||
            R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
            AreCompatible(VL, *SameTypeIt))) {
      auto *I = dyn_cast<Instruction>(*SameTypeIt);
      ++SameTypeIt;
      if (I && !R.isDeleted(I))
        VL.push_back(cast<T>(I));
    }

    // Try to vectorize them.
    unsigned NumElts = VL.size();
    LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at nodes ("
                      << NumElts << ")\n");
    // The vectorization is a 3-state attempt:
    // 1. Try to vectorize instructions with the same/alternate opcodes with
    // the size of the maximal register at first.
    // 2. Try to vectorize the remaining instructions with the same type, if
    // possible. This may result in better vectorization results than just
    // trying instructions with the same/alternate opcodes.
    // 3. Final attempt to try to vectorize all instructions with the
    // same/alternate ops only; this may result in some extra final
    // vectorization.
    if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL), MaxVFOnly)) {
      // Success: start over because instructions might have been changed.
      Changed = true;
      VL.swap(Candidates);
      Candidates.clear();
      for (T *V : VL)
        if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
          Candidates.push_back(V);
    } else {
      /// \returns the minimum number of elements that we will attempt to
      /// vectorize.
      auto GetMinNumElements = [&R](Value *V) {
        unsigned EltSize = R.getVectorElementSize(V);
        return std::max(2U, R.getMaxVecRegSize() / EltSize);
      };
      if (NumElts < GetMinNumElements(*IncIt) &&
          (Candidates.empty() ||
           Candidates.front()->getType() == (*IncIt)->getType()))
        Candidates.append(VL.begin(), VL.end());
    }
    // Final attempt to vectorize instructions with the same types.
    if (Candidates.size() > 1 &&
        (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) {
      if (TryToVectorizeHelper(Candidates, /*MaxVFOnly=*/false)) {
        // Success: start over because instructions might have been changed.
        Changed = true;
      } else if (MaxVFOnly) {
        // Try to vectorize using small vectors.
        SmallVector<T *> VL;
        for (auto *It = Candidates.begin(), *End = Candidates.end(); It != End;
             VL.clear()) {
          auto *I = dyn_cast<Instruction>(*It);
          if (!I || R.isDeleted(I)) {
            ++It;
            continue;
          }
          auto *SameTypeIt = It;
          while (SameTypeIt != End &&
                 (!isa<Instruction>(*SameTypeIt) ||
                  R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
                  AreCompatible(*SameTypeIt, *It))) {
            auto *I = dyn_cast<Instruction>(*SameTypeIt);
            ++SameTypeIt;
            if (I && !R.isDeleted(I))
              VL.push_back(cast<T>(I));
          }
          unsigned NumElts = VL.size();
          if (NumElts > 1 &&
              TryToVectorizeHelper(ArrayRef(VL), /*MaxVFOnly=*/false))
            Changed = true;
          It = SameTypeIt;
        }
      }
      Candidates.clear();
    }

    // Start over at the next instruction of a different type (or the end).
    IncIt = SameTypeIt;
  }
  return Changed;
}
/// Compare two cmp instructions. If IsCompatibility is true, the function
/// returns true if the two cmps are compatible (have the same predicate and
/// same/compatible first operand types); otherwise it implements a strict
/// weak ordering for sorting.
template <bool IsCompatibility>
static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI,
                       const DominatorTree &DT) {
  assert(isValidElementType(V->getType()) &&
         isValidElementType(V2->getType()) &&
         "Expected valid element types only.");
  if (V == V2)
    return IsCompatibility;
  auto *CI1 = cast<CmpInst>(V);
  auto *CI2 = cast<CmpInst>(V2);
  if (CI1->getOperand(0)->getType()->getTypeID() <
      CI2->getOperand(0)->getType()->getTypeID())
    return !IsCompatibility;
  if (CI1->getOperand(0)->getType()->getTypeID() >
      CI2->getOperand(0)->getType()->getTypeID())
    return false;
  if (CI1->getOperand(0)->getType()->getScalarSizeInBits() <
      CI2->getOperand(0)->getType()->getScalarSizeInBits())
    return !IsCompatibility;
  if (CI1->getOperand(0)->getType()->getScalarSizeInBits() >
      CI2->getOperand(0)->getType()->getScalarSizeInBits())
    return false;
  CmpInst::Predicate Pred1 = CI1->getPredicate();
  CmpInst::Predicate Pred2 = CI2->getPredicate();
  CmpInst::Predicate SwapPred1 = CmpInst::getSwappedPredicate(Pred1);
  CmpInst::Predicate SwapPred2 = CmpInst::getSwappedPredicate(Pred2);
  CmpInst::Predicate BasePred1 = std::min(Pred1, SwapPred1);
  CmpInst::Predicate BasePred2 = std::min(Pred2, SwapPred2);
  if (BasePred1 < BasePred2)
    return !IsCompatibility;
  if (BasePred1 > BasePred2)
    return false;
  // Compare operands.
  bool CI1Preds = Pred1 == BasePred1;
  bool CI2Preds = Pred2 == BasePred1;
  for (int I = 0, E = CI1->getNumOperands(); I < E; ++I) {
    auto *Op1 = CI1->getOperand(CI1Preds ? I : E - I - 1);
    auto *Op2 = CI2->getOperand(CI2Preds ? I : E - I - 1);
    if (Op1 == Op2)
      continue;
    if (Op1->getValueID() < Op2->getValueID())
      return !IsCompatibility;
    if (Op1->getValueID() > Op2->getValueID())
      return false;
    if (auto *I1 = dyn_cast<Instruction>(Op1))
      if (auto *I2 = dyn_cast<Instruction>(Op2)) {
        if (IsCompatibility) {
          if (I1->getParent() != I2->getParent())
            return false;
        } else {
          // Try to compare nodes with the same parent.
          DomTreeNodeBase<BasicBlock> *NodeI1 = DT.getNode(I1->getParent());
          DomTreeNodeBase<BasicBlock> *NodeI2 = DT.getNode(I2->getParent());
          if (!NodeI1)
            return NodeI2 != nullptr;
          if (!NodeI2)
            return false;
          assert((NodeI1 == NodeI2) ==
                     (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
                 "Different nodes should have different DFS numbers");
          if (NodeI1 != NodeI2)
            return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
        }
        InstructionsState S = getSameOpcode({I1, I2}, TLI);
        if (S && (IsCompatibility || !S.isAltShuffle()))
          continue;
        if (IsCompatibility)
          return false;
        if (I1->getOpcode() != I2->getOpcode())
          return I1->getOpcode() < I2->getOpcode();
      }
  }
  return IsCompatibility;
}
template <typename ItT>
bool SLPVectorizerPass::vectorizeCmpInsts(iterator_range<ItT> CmpInsts,
                                          BasicBlock *BB, BoUpSLP &R) {
  bool Changed = false;
  // Try to find reductions first.
  for (CmpInst *I : CmpInsts) {
    if (R.isDeleted(I))
      continue;
    for (Value *Op : I->operands())
      if (auto *RootOp = dyn_cast<Instruction>(Op)) {
        Changed |= vectorizeRootInstruction(nullptr, RootOp, BB, R);
        if (R.isDeleted(I))
          break;
      }
  }
  // Try to vectorize operands as vector bundles.
  for (CmpInst *I : CmpInsts) {
    if (R.isDeleted(I))
      continue;
    Changed |= tryToVectorize(I, R);
  }
  // Try to vectorize the list of compares. Sort by type, compare predicate,
  // etc.
  auto CompareSorter = [&](Value *V, Value *V2) {
    if (V == V2)
      return false;
    return compareCmp<false>(V, V2, *TLI, *DT);
  };

  auto AreCompatibleCompares = [&](Value *V1, Value *V2) {
    if (V1 == V2)
      return true;
    return compareCmp<true>(V1, V2, *TLI, *DT);
  };

  SmallVector<Value *> Vals;
  for (Instruction *V : CmpInsts)
    if (!R.isDeleted(V) && isValidElementType(getValueType(V)))
      Vals.push_back(V);
  if (Vals.size() <= 1)
    return Changed;
  Changed |= tryToVectorizeSequence<Value>(
      Vals, CompareSorter, AreCompatibleCompares,
      [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
        // Exclude possible reductions from other blocks.
        bool ArePossiblyReducedInOtherBlock = any_of(Candidates, [](Value *V) {
          return any_of(V->users(), [V](User *U) {
            auto *Select = dyn_cast<SelectInst>(U);
            return Select &&
                   Select->getParent() != cast<Instruction>(V)->getParent();
          });
        });
        if (ArePossiblyReducedInOtherBlock)
          return false;
        return tryToVectorizeList(Candidates, R, MaxVFOnly);
      },
      /*MaxVFOnly=*/true, R);
  return Changed;
}
bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions,
                                         BasicBlock *BB, BoUpSLP &R) {
  assert(all_of(Instructions, IsaPred<InsertElementInst, InsertValueInst>) &&
         "This function only accepts Insert instructions");
  bool OpsChanged = false;
  SmallVector<WeakTrackingVH> PostponedInsts;
  for (auto *I : reverse(Instructions)) {
    // pass1 - try to match and vectorize a buildvector sequence for MaxVF
    // only.
    if (R.isDeleted(I))
      continue;
    if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
      OpsChanged |=
          vectorizeInsertValueInst(LastInsertValue, BB, R, /*MaxVFOnly=*/true);
    } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
      OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R,
                                               /*MaxVFOnly=*/true);
    }
    // pass2 - try to vectorize reductions only.
    if (R.isDeleted(I))
      continue;
    OpsChanged |= vectorizeHorReduction(nullptr, I, BB, R, PostponedInsts);
    if (R.isDeleted(I) || isa<CmpInst>(I))
      continue;
    // pass3 - try to match and vectorize a buildvector sequence.
    if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
      OpsChanged |= vectorizeInsertValueInst(LastInsertValue, BB, R,
                                             /*MaxVFOnly=*/false);
    } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
      OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R,
                                               /*MaxVFOnly=*/false);
    }
  }
  // Now try to vectorize the postponed instructions.
  OpsChanged |= tryToVectorize(PostponedInsts, R);

  Instructions.clear();
  return OpsChanged;
}
bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
  bool Changed = false;
  SmallVector<Value *, 4> Incoming;
  SmallPtrSet<Value *, 16> VisitedInstrs;
  // Maps phi nodes to the non-phi nodes found in the use tree for each phi
  // node. Allows better identification of the chains that can be vectorized
  // in the best way.
  DenseMap<Value *, SmallVector<Value *, 4>> PHIToOpcodes;
  auto PHICompare = [this, &PHIToOpcodes](Value *V1, Value *V2) {
    assert(isValidElementType(V1->getType()) &&
           isValidElementType(V2->getType()) &&
           "Expected vectorizable types only.");
    // It is fine to compare type IDs here, since we expect only vectorizable
    // types, like ints, floats and pointers; we don't care about other types.
    if (V1->getType()->getTypeID() < V2->getType()->getTypeID())
      return true;
    if (V1->getType()->getTypeID() > V2->getType()->getTypeID())
      return false;
    if (V1->getType()->getScalarSizeInBits() <
        V2->getType()->getScalarSizeInBits())
      return true;
    if (V1->getType()->getScalarSizeInBits() >
        V2->getType()->getScalarSizeInBits())
      return false;
    ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
    ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
    if (Opcodes1.size() < Opcodes2.size())
      return true;
    if (Opcodes1.size() > Opcodes2.size())
      return false;
    for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
      {
        // Instructions come first.
        auto *I1 = dyn_cast<Instruction>(Opcodes1[I]);
        auto *I2 = dyn_cast<Instruction>(Opcodes2[I]);
        if (I1 && I2) {
          DomTreeNodeBase<BasicBlock> *NodeI1 = DT->getNode(I1->getParent());
          DomTreeNodeBase<BasicBlock> *NodeI2 = DT->getNode(I2->getParent());
          if (!NodeI1)
            return NodeI2 != nullptr;
          if (!NodeI2)
            return false;
          assert((NodeI1 == NodeI2) ==
                     (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
                 "Different nodes should have different DFS numbers");
          if (NodeI1 != NodeI2)
            return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
          InstructionsState S = getSameOpcode({I1, I2}, *TLI);
          if (S && !S.isAltShuffle() &&
              I1->getOpcode() == I2->getOpcode()) {
            // Fall back to the program order of the parent phis.
            DomTreeNodeBase<BasicBlock> *NodeP1 =
                DT->getNode(cast<Instruction>(V1)->getParent());
            DomTreeNodeBase<BasicBlock> *NodeP2 =
                DT->getNode(cast<Instruction>(V2)->getParent());
            if (!NodeP1)
              return NodeP2 != nullptr;
            if (!NodeP2)
              return false;
            assert((NodeP1 == NodeP2) ==
                       (NodeP1->getDFSNumIn() == NodeP2->getDFSNumIn()) &&
                   "Different nodes should have different DFS numbers");
            if (NodeP1 != NodeP2)
              return NodeP1->getDFSNumIn() < NodeP2->getDFSNumIn();
            return cast<Instruction>(V1)->comesBefore(
                cast<Instruction>(V2));
          }
          if (auto *CI1 = dyn_cast<IntrinsicInst>(I1))
            if (auto *CI2 = dyn_cast<IntrinsicInst>(I2)) {
              std::optional<Intrinsic::ID> Id1 = CI1->getIntrinsicID();
              std::optional<Intrinsic::ID> Id2 = CI2->getIntrinsicID();
              if (*Id1 != *Id2)
                return *Id1 < *Id2;
            }
          if (I1->getOpcode() == I2->getOpcode())
            continue;
          return I1->getOpcode() < I2->getOpcode();
        }
        if (I1)
          return true;
        if (I2)
          return false;
      }
      {
        // Non-undef constants come next.
        bool C1 = isa<Constant>(Opcodes1[I]) && !isa<UndefValue>(Opcodes1[I]);
        bool C2 = isa<Constant>(Opcodes2[I]) && !isa<UndefValue>(Opcodes2[I]);
        if (C1 && C2)
          continue;
        if (C1)
          return true;
        if (C2)
          return false;
      }
      bool U1 = isa<UndefValue>(Opcodes1[I]);
      bool U2 = isa<UndefValue>(Opcodes2[I]);
      {
        // Non-constant non-instructions come next.
        if (!U1 && !U2) {
          auto ValID1 = Opcodes1[I]->getValueID();
          auto ValID2 = Opcodes2[I]->getValueID();
          if (ValID1 == ValID2)
            continue;
          if (ValID1 < ValID2)
            return true;
          if (ValID1 > ValID2)
            return false;
        }
        if (!U1)
          return true;
        if (!U2)
          return false;
      }
      // Undefs come last.
      assert(U1 && U2 && "The only thing left should be undef & undef.");
    }
    return false;
  };
  auto AreCompatiblePHIs = [&PHIToOpcodes, this, &R](ArrayRef<Value *> VL,
                                                     Value *V1) {
    if (VL.empty() || V1 == VL.back())
      return true;
    Value *V2 = VL.back();
    if (V1->getType() != V2->getType())
      return false;
    ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
    ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
    if (Opcodes1.size() != Opcodes2.size())
      return false;
    for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
      // Undefs are compatible with any other value.
      if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I]))
        continue;
      if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))
        if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
          if (R.isDeleted(I1) || R.isDeleted(I2))
            return false;
          if (I1->getParent() != I2->getParent())
            return false;
          if (getSameOpcode({I1, I2}, *TLI))
            continue;
          return false;
        }
      if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I]))
        continue;
      if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID())
        return false;
    }
    return true;
  };

  bool HaveVectorizedPhiNodes = false;
  do {
    // Collect the incoming values from the PHIs.
    Incoming.clear();
    for (Instruction &I : *BB) {
      auto *P = dyn_cast<PHINode>(&I);
      if (!P || P->getNumIncomingValues() > MaxPHINumOperands)
        break;

      // No need to analyze deleted, vectorized and non-vectorizable
      // instructions.
      if (!VisitedInstrs.count(P) && !R.isDeleted(P) &&
          isValidElementType(P->getType()))
        Incoming.push_back(P);
    }

    if (Incoming.size() <= 1)
      break;

    // Find the corresponding non-phi nodes for better matching when trying to
    // build the tree.
    for (Value *V : Incoming) {
      SmallVectorImpl<Value *> &Opcodes =
          PHIToOpcodes.try_emplace(V).first->getSecond();
      if (!Opcodes.empty())
        continue;
      SmallVector<Value *, 4> Nodes(1, V);
      SmallPtrSet<Value *, 4> Visited;
      while (!Nodes.empty()) {
        auto *PHI = cast<PHINode>(Nodes.pop_back_val());
        if (!Visited.insert(PHI).second)
          continue;
        for (Value *V : PHI->incoming_values()) {
          if (auto *PHI1 = dyn_cast<PHINode>(V)) {
            Nodes.push_back(PHI1);
            continue;
          }
          Opcodes.emplace_back(V);
        }
      }
    }

    HaveVectorizedPhiNodes = tryToVectorizeSequence<Value>(
        Incoming, PHICompare, AreCompatiblePHIs,
        [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
          return tryToVectorizeList(Candidates, R, MaxVFOnly);
        },
        /*MaxVFOnly=*/true, R);
    Changed |= HaveVectorizedPhiNodes;
    if (HaveVectorizedPhiNodes && any_of(PHIToOpcodes, [&](const auto &P) {
          auto *PHI = dyn_cast<PHINode>(P.first);
          return !PHI || R.isDeleted(PHI);
        }))
      PHIToOpcodes.clear();
    VisitedInstrs.insert(Incoming.begin(), Incoming.end());
  } while (HaveVectorizedPhiNodes);

  VisitedInstrs.clear();
  InstSetVector PostProcessInserts;
  SmallSetVector<CmpInst *, 8> PostProcessCmps;
  // Vectorizes Inserts in PostProcessInserts and, if VectorizeCmps is true,
  // also vectorizes PostProcessCmps.
  auto VectorizeInsertsAndCmps = [&](bool VectorizeCmps) {
    bool Changed = vectorizeInserts(PostProcessInserts, BB, R);
    if (VectorizeCmps) {
      Changed |= vectorizeCmpInsts(reverse(PostProcessCmps), BB, R);
      PostProcessCmps.clear();
    }
    PostProcessInserts.clear();
    return Changed;
  };
  // Returns true if `I` is in PostProcessInserts or PostProcessCmps.
  auto IsInPostProcessInstrs = [&](Instruction *I) {
    if (auto *Cmp = dyn_cast<CmpInst>(I))
      return PostProcessCmps.contains(Cmp);
    return isa<InsertElementInst, InsertValueInst>(I) &&
           PostProcessInserts.contains(I);
  };
  // Returns true if the instruction `I` has no users - it can be trivially
  // removed or used only in analysis.
  auto HasNoUsers = [](Instruction *I) {
    return I->use_empty() &&
           (I->getType()->isVoidTy() || isa<CallInst, InvokeInst>(I));
  };
  for (BasicBlock::iterator It = BB->begin(), E = BB->end(); It != E; ++It) {
    // Skip instructions marked for deletion.
    if (R.isDeleted(&*It))
      continue;
    // We may go through BB multiple times, so skip the ones we have checked.
    if (!VisitedInstrs.insert(&*It).second) {
      if (HasNoUsers(&*It) &&
          VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator())) {
        // Start over since some instructions are deleted and the iterator may
        // have become invalid.
        Changed = true;
        It = BB->begin();
        E = BB->end();
      }
      continue;
    }

    // Try to vectorize reductions that use PHINodes.
    if (PHINode *P = dyn_cast<PHINode>(It)) {
      // Check that the PHI is a reduction PHI.
      if (P->getNumIncomingValues() == 2) {
        // Try to match and vectorize a horizontal reduction.
        Instruction *Root = getReductionInstr(DT, P, BB, LI);
        if (Root && vectorizeRootInstruction(P, Root, BB, R)) {
          Changed = true;
          It = BB->begin();
          E = BB->end();
          continue;
        }
      }
      // Try to vectorize the incoming values of the PHI, to catch reductions
      // that feed into PHIs.
      for (unsigned I : seq<unsigned>(P->getNumIncomingValues())) {
        // Skip if the incoming block is the current BB for now. Also bypass
        // unreachable IR for efficiency and to avoid crashing.
        if (BB == P->getIncomingBlock(I) ||
            !DT->isReachableFromEntry(P->getIncomingBlock(I)))
          continue;

        // Postponed instructions should not be vectorized here; delay their
        // vectorization.
        if (auto *PI = dyn_cast<Instruction>(P->getIncomingValue(I));
            PI && !IsInPostProcessInstrs(PI)) {
          bool Res = vectorizeRootInstruction(nullptr, PI,
                                              P->getIncomingBlock(I), R);
          Changed |= Res;
          if (Res && R.isDeleted(P)) {
            It = BB->begin();
            E = BB->end();
            break;
          }
        }
      }
      continue;
    }

    if (HasNoUsers(&*It)) {
      bool OpsChanged = false;
      auto *SI = dyn_cast<StoreInst>(It);
      bool TryToVectorizeRoot = ShouldStartVectorizeHorAtStore || !SI;
      if (SI) {
        auto *I = Stores.find(getUnderlyingObject(SI->getPointerOperand()));
        // Try to vectorize the chain in the store, if this is the only store
        // to the address in the block.
        TryToVectorizeRoot |= (I == Stores.end() || I->second.size() == 1) &&
                              SI->getValueOperand()->hasOneUse();
      }
      if (TryToVectorizeRoot) {
        for (auto *V : It->operand_values()) {
          // Postponed instructions should not be vectorized here; delay their
          // vectorization.
          if (auto *VI = dyn_cast<Instruction>(V);
              VI && !IsInPostProcessInstrs(VI))
            // Try to match and vectorize a horizontal reduction.
            OpsChanged |= vectorizeRootInstruction(nullptr, VI, BB, R);
        }
      }
      // Start vectorization of the post-process list of instructions from the
      // top-tree instructions, to vectorize as many instructions as possible.
      OpsChanged |=
          VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator());
      if (OpsChanged) {
        // Start over since some instructions are deleted and the iterator may
        // have become invalid.
        Changed = true;
        It = BB->begin();
        E = BB->end();
        continue;
      }
    }

    if (isa<InsertElementInst, InsertValueInst>(It))
      PostProcessInserts.insert(&*It);
    else if (auto *Cmp = dyn_cast<CmpInst>(It))
      PostProcessCmps.insert(Cmp);
  }

  return Changed;
}
bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
  auto Changed = false;
  for (auto &Entry : GEPs) {
    // If the getelementptr list has fewer than two elements, there's nothing
    // to do.
    if (Entry.second.size() < 2)
      continue;

    LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
                      << Entry.second.size() << ".\n");

    // Process the GEP list in chunks suitable for the target's supported
    // vector size. We are trying to vectorize the index computations, so the
    // maximum number of elements is based on the size of the index
    // expression, not the size of the GEP itself.
    auto *It = find_if(Entry.second, [&](GetElementPtrInst *GEP) {
      return !R.isDeleted(GEP);
    });
    if (It == Entry.second.end())
      continue;
    unsigned MaxVecRegSize = R.getMaxVecRegSize();
    unsigned EltSize = R.getVectorElementSize(*(*It)->idx_begin());
    if (MaxVecRegSize < EltSize)
      continue;

    unsigned MaxElts = MaxVecRegSize / EltSize;
    for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) {
      auto Len = std::min<unsigned>(BE - BI, MaxElts);
      ArrayRef<GetElementPtrInst *> GEPList(&Entry.second[BI], Len);

      // Initialize a set of candidate getelementptrs. Note that we use a
      // SetVector here to preserve program order.
      SetVector<Value *> Candidates(GEPList.begin(), GEPList.end());

      // Some of the candidates may have already been vectorized after we
      // initially collected them, or their index may have been optimized to a
      // constant. If so, remove them from the set of candidates.
      Candidates.remove_if([&R](Value *I) {
        return R.isDeleted(cast<Instruction>(I)) ||
               isa<Constant>(cast<GetElementPtrInst>(I)->idx_begin()->get());
      });

      // Remove from the set of candidates all pairs of getelementptrs with
      // constant differences. Such getelementptrs are likely not good
      // candidates for vectorization in a bottom-up phase since one can be
      // computed from the other. We also ensure all candidate getelementptr
      // indices are unique.
      for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1;
           ++I) {
        auto *GEPI = GEPList[I];
        if (!Candidates.count(GEPI))
          continue;
        const SCEV *SCEVI = SE->getSCEV(GEPList[I]);
        for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
          auto *GEPJ = GEPList[J];
          const SCEV *SCEVJ = SE->getSCEV(GEPList[J]);
          if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
            Candidates.remove(GEPI);
            Candidates.remove(GEPJ);
          } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
            Candidates.remove(GEPJ);
          }
        }
      }

      // We break out of the above computation as soon as we know there are
      // fewer than two candidates remaining.
      if (Candidates.size() < 2)
        continue;

      // Add the single, non-constant index of each candidate to the bundle.
      SmallVector<Value *, 16> Bundle(Candidates.size());
      auto BundleIndex = 0u;
      for (auto *V : Candidates) {
        auto *GEP = cast<GetElementPtrInst>(V);
        auto *GEPIdx = GEP->idx_begin()->get();
        assert(GEP->getNumIndices() == 1 && !isa<Constant>(GEPIdx));
        Bundle[BundleIndex++] = GEPIdx;
      }

      // Try and vectorize the indices. We are currently only interested in
      // gather-like cases of the form:
      //   ... = g[a[0] - b[0]] + g[a[1] - b[1]] + ...
      // where the loads of "a", the loads of "b", and the subtractions can be
      // performed in parallel.
      Changed |= tryToVectorizeList(Bundle, R);
    }
  }
  return Changed;
}
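// Commentary (added): vectorizeStoreChains below groups stores by underlying
// pointer object (the Stores map), sorts each group with StoreSorter, and
// feeds maximal runs of compatible stores to vectorizeStores in reverse
// program order so that bottom-up trees see their operands first.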
bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
  bool Changed = false;
  // Sort by type, base pointers and value operands. Value operands must be
  // compatible (same opcode, same parent), otherwise it is definitely not
  // profitable to try to vectorize them.
  auto &&StoreSorter = [this](StoreInst *V, StoreInst *V2) {
    if (V->getValueOperand()->getType()->getTypeID() <
        V2->getValueOperand()->getType()->getTypeID())
      return true;
    if (V->getValueOperand()->getType()->getTypeID() >
        V2->getValueOperand()->getType()->getTypeID())
      return false;
    if (V->getPointerOperandType()->getTypeID() <
        V2->getPointerOperandType()->getTypeID())
      return true;
    if (V->getPointerOperandType()->getTypeID() >
        V2->getPointerOperandType()->getTypeID())
      return false;
    if (V->getValueOperand()->getType()->getScalarSizeInBits() <
        V2->getValueOperand()->getType()->getScalarSizeInBits())
      return true;
    if (V->getValueOperand()->getType()->getScalarSizeInBits() >
        V2->getValueOperand()->getType()->getScalarSizeInBits())
      return false;
    if (auto *I1 = dyn_cast<Instruction>(V->getValueOperand()))
      if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
        DomTreeNodeBase<llvm::BasicBlock> *NodeI1 =
            DT->getNode(I1->getParent());
        DomTreeNodeBase<llvm::BasicBlock> *NodeI2 =
            DT->getNode(I2->getParent());
        assert(NodeI1 && "Should only process reachable instructions");
        assert(NodeI2 && "Should only process reachable instructions");
        assert((NodeI1 == NodeI2) ==
                   (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
               "Different nodes should have different DFS numbers");
        if (NodeI1 != NodeI2)
          return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
        return I1->getOpcode() < I2->getOpcode();
      }
    return V->getValueOperand()->getValueID() <
           V2->getValueOperand()->getValueID();
  };

  auto AreCompatibleStores = [this, &R](ArrayRef<StoreInst *> VL,
                                        StoreInst *V1) {
    if (VL.empty() || V1 == VL.back())
      return true;
    bool SameParent = true;
    StoreInst *V2 = VL.back();
    if (V1->getValueOperand()->getType() != V2->getValueOperand()->getType())
      return false;
    if (V1->getPointerOperandType() != V2->getPointerOperandType())
      return false;
    // ...
    auto *I1 = dyn_cast<Instruction>(V1->getValueOperand());
    auto *I2 = dyn_cast<Instruction>(V2->getValueOperand());
    SameParent &= I1 && I2 && I1->getParent() == I2->getParent();
    if (SameParent) {
      SmallVector<Value *> NewVL(VL.size() + 1);
      for (auto [SI, V] : zip(VL, NewVL))
        V = SI->getValueOperand();
      NewVL.back() = V1->getValueOperand();
      InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
      InstructionsState S = Analysis.buildInstructionsState(NewVL, R);
      if (S)
        return true;
    }
    return V1->getValueOperand()->getValueID() ==
           V2->getValueOperand()->getValueID();
  };

  // Attempt to sort and vectorize each of the store groups.
  DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>> Attempted;
  for (auto &Pair : Stores) {
    if (Pair.second.size() < 2)
      continue;

    LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
                      << Pair.second.size() << ".\n");

    // Reverse stores to do bottom-to-top analysis. This is important if the
    // values are stores to the same addresses several times, in which case
    // the order of the stores (reversed) must be followed to meet the memory
    // dependencies.
    SmallVector<StoreInst *> ReversedStores(Pair.second.rbegin(),
                                            Pair.second.rend());
    Changed |= tryToVectorizeSequence<StoreInst>(
        ReversedStores, StoreSorter, AreCompatibleStores,
        [&](ArrayRef<StoreInst *> Candidates, bool) {
          return vectorizeStores(Candidates, R, Attempted);
        },
        /*MaxVFOnly=*/false, R);
  }
  return Changed;
}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static bool isConstant(const MachineInstr &MI)
AMDGPU Register Bank Select
ReachingDefInfo InstSet InstSet & Ignore
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Function Alias Analysis Results
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
block Block Frequency Analysis
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
static cl::opt< IntrinsicCostStrategy > IntrinsicCost("intrinsic-cost-strategy", cl::desc("Costing strategy for intrinsic instructions"), cl::init(IntrinsicCostStrategy::InstructionCost), cl::values(clEnumValN(IntrinsicCostStrategy::InstructionCost, "instruction-cost", "Use TargetTransformInfo::getInstructionCost"), clEnumValN(IntrinsicCostStrategy::IntrinsicCost, "intrinsic-cost", "Use TargetTransformInfo::getIntrinsicInstrCost"), clEnumValN(IntrinsicCostStrategy::TypeBasedIntrinsicCost, "type-based-intrinsic-cost", "Calculate the intrinsic cost based only on argument types")))
static APInt getElementIndex(TypeSize ElemSize, APInt &Offset)
This file provides an implementation of debug counters.
#define DEBUG_COUNTER(VARNAME, COUNTERNAME, DESC)
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
static bool runImpl(Function &F, const TargetLowering &TLI, AssumptionCache *AC)
This is the interface for a simple mod/ref and alias analysis over globals.
static Value * getCondition(Instruction *I)
static void setCondition(Instruction *I, Value *NewCond)
static const HTTPClientCleanup Cleanup
Module.h This file contains the declarations for the Module class.
This defines the Use class.
iv Induction Variable Users
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static bool isSplat(Value *V)
Return true if V is a splat of a value (which is used when multiplying a matrix with a scalar).
This file provides utility analysis objects describing memory locations.
MachineInstr unsigned OpIdx
uint64_t IntrinsicInst * II
static bool IsSelect(MachineInstr &MI)
This file defines the PriorityQueue class.
const SmallVectorImpl< MachineOperand > & Cond
static std::optional< OperandInfo > getOperandInfo(const MachineOperand &MO)
static bool isValid(const char C)
Returns true if C is a valid mangled character: <0-9a-zA-Z_>.
static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts, TargetTransformInfo *TTI, bool MustMatchOrInst)
static cl::opt< bool > RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden, cl::desc("Run the SLP vectorization passes"))
static bool isAlternateInstruction(Instruction *I, Instruction *MainOp, Instruction *AltOp, const TargetLibraryInfo &TLI)
Checks if the specified instruction I is an alternate operation for the given MainOp and AltOp instru...
static FixedVectorType * getWidenedType(Type *ScalarTy, unsigned VF)
static bool isVectorLikeInstWithConstOps(Value *V)
Checks if V is one of vector-like instructions, i.e.
static cl::opt< bool > SplitAlternateInstructions("slp-split-alternate-instructions", cl::init(true), cl::Hidden, cl::desc("Improve the code quality by splitting alternate instructions"))
static bool isRepeatedNonIdentityClusteredMask(ArrayRef< int > Mask, unsigned Sz)
Checks if the given mask is a "clustered" mask with the same clusters of size Sz, which are not ident...
static bool isMaskedLoadCompress(ArrayRef< Value * > VL, ArrayRef< Value * > PointerOps, ArrayRef< unsigned > Order, const TargetTransformInfo &TTI, const DataLayout &DL, ScalarEvolution &SE, AssumptionCache &AC, const DominatorTree &DT, const TargetLibraryInfo &TLI, const function_ref< bool(Value *)> AreAllUsersVectorized, bool &IsMasked, unsigned &InterleaveFactor, SmallVectorImpl< int > &CompressMask, VectorType *&LoadVecTy)
Checks if the VL can be transformed to a (masked)load + compress or (masked) interleaved load.
static const unsigned MaxPHINumOperands
Maximum allowed number of operands in the PHI nodes.
static cl::opt< bool > VectorizeCopyableElements("slp-copyable-elements", cl::init(true), cl::Hidden, cl::desc("Try to replace values with the idempotent instructions for " "better vectorization."))
Enables vectorization of copyable elements.
static cl::opt< int > MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static bool isCommutative(Instruction *I, Value *ValWithUses)
static bool allSameOpcode(ArrayRef< Value * > VL)
static InstructionCost canConvertToFMA(ArrayRef< Value * > VL, const InstructionsState &S, DominatorTree &DT, const DataLayout &DL, TargetTransformInfo &TTI, const TargetLibraryInfo &TLI)
Check if we can convert fadd/fsub sequence to FMAD.
static cl::opt< unsigned > MaxProfitableLoadStride("slp-max-stride", cl::init(8), cl::Hidden, cl::desc("The maximum stride, considered to be profitable."))
static bool findBuildAggregate(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, const BoUpSLP &R)
Recognize construction of vectors like ra = insertelement <4 x float> poison, float s0,...
static bool clusterSortPtrAccesses(ArrayRef< Value * > VL, ArrayRef< BasicBlock * > BBs, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
static unsigned getNumElements(Type *Ty)
static SmallBitVector buildUseMask(int VF, ArrayRef< int > Mask, UseMask MaskArg)
Prepares a use bitset for the given mask either for the first argument or for the second.
static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0, Value *Op1, const TargetLibraryInfo &TLI)
Checks if the provided operands of 2 cmp instructions are compatible, i.e.
static Value * createInsertVector(IRBuilderBase &Builder, Value *Vec, Value *V, unsigned Index, function_ref< Value *(Value *, Value *, ArrayRef< int >)> Generator={})
Creates subvector insert.
static void findBuildAggregateRec(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, unsigned OperandOffset, const BoUpSLP &R)
static unsigned getNumElems(unsigned Size, unsigned PartNumElems, unsigned Part)
Returns correct remaining number of elements, considering total amount Size, (power-of-2 number) of e...
static InstructionCost getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={})
Returns the cost of the shuffle instructions with the given Kind, vector type Tp and optional Mask.
static bool isSimple(Instruction *I)
static const int MinScheduleRegionSize
If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling regions to be handled.
static cl::opt< unsigned > MinProfitableStridedLoads("slp-min-strided-loads", cl::init(2), cl::Hidden, cl::desc("The minimum number of loads, which should be considered strided, " "if the stride is > 1 or is runtime value"))
static bool isFirstInsertElement(const InsertElementInst *IE1, const InsertElementInst *IE2)
Checks if the IE1 instruction is followed by the IE2 instruction in the buildvector sequence.
static cl::opt< int > LookAheadMaxDepth("slp-max-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for operand reordering scores"))
static cl::opt< unsigned > MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden, cl::desc("Maximum SLP vectorization factor (0=unlimited)"))
static void reorderReuses(SmallVectorImpl< int > &Reuses, ArrayRef< int > Mask)
Reorders the given Reuses mask according to the given Mask.
static void combineOrders(MutableArrayRef< unsigned > Order, ArrayRef< unsigned > SecondaryOrder)
static const unsigned MaxMemDepDistance
static cl::opt< bool > ViewSLPTree("view-slp-tree", cl::Hidden, cl::desc("Display the SLP trees with Graphviz"))
static const SCEV * calculateRtStride(ArrayRef< Value * > PointerOps, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
Checks if the provided list of pointers PointerOps represents the strided pointers for type ElemTy.
static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, TargetLibraryInfo *TLI, const TargetTransformInfo *TTI)
static DebugLoc getDebugLocFromPHI(PHINode &PN)
static std::optional< unsigned > getExtractIndex(const Instruction *E)
static cl::opt< bool > VectorizeNonPowerOf2("slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden, cl::desc("Try to vectorize with non-power-of-2 number of elements."))
static cl::opt< unsigned > MinTreeSize("slp-min-tree-size", cl::init(3), cl::Hidden, cl::desc("Only vectorize small trees if they are fully vectorizable"))
static void reorderOrder(SmallVectorImpl< unsigned > &Order, ArrayRef< int > Mask, bool BottomOrder=false)
Reorders the given Order according to the given Mask.
static cl::opt< bool > ForceStridedLoads("slp-force-strided-loads", cl::init(false), cl::Hidden, cl::desc("Generate strided loads even if they are not " "profitable. Used for testing only."))
static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not less than Sz, which forms type,...
static bool isMainInstruction(Instruction *I, Instruction *MainOp, Instruction *AltOp, const TargetLibraryInfo &TLI)
Checks if the specified instruction I is a main operation for the given MainOp and AltOp instruction...
static T * performExtractsShuffleAction(MutableArrayRef< std::pair< T *, SmallVector< int > > > ShuffleMask, Value *Base, function_ref< unsigned(T *)> GetVF, function_ref< std::pair< T *, bool >(T *, ArrayRef< int >, bool)> ResizeAction, function_ref< T *(ArrayRef< int >, ArrayRef< T * >)> Action)
Does the analysis of the provided shuffle masks and performs the requested actions on the vectors wit...
static cl::opt< bool > ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions"))
static bool isConstant(Value *V)
static bool isSplat(ArrayRef< Value * > VL)
static cl::opt< int > SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, cl::desc("Only vectorize if you gain more than this " "number "))
static unsigned getPartNumElems(unsigned Size, unsigned NumParts)
Returns power-of-2 number of elements in a single register (part), given the total number of elements...
static bool allConstant(ArrayRef< Value * > VL)
static constexpr int UsesLimit
static std::optional< unsigned > getElementIndex(const Value *Inst, unsigned Offset=0)
static bool isReductionCandidate(Instruction *I)
Returns true if I is a candidate instruction for reduction vectorization.
static bool checkTreeSizes(ArrayRef< std::pair< unsigned, unsigned > > Sizes, bool First)
Checks if the quadratic mean deviation is less than 90% of the mean size.
static unsigned getShufflevectorNumGroups(ArrayRef< Value * > VL)
static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI, const TargetLibraryInfo &TLI)
static cl::opt< bool > SLPSkipEarlyProfitabilityCheck("slp-skip-early-profitability-check", cl::init(false), cl::Hidden, cl::desc("When true, SLP vectorizer bypasses profitability checks based on " "heuristics and makes vectorization decision via cost modeling."))
static std::pair< size_t, size_t > generateKeySubkey(Value *V, const TargetLibraryInfo *TLI, function_ref< hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator, bool AllowAlternate)
Generates key/subkey pair for the given value to provide effective sorting of the values and better d...
static cl::opt< bool > ShouldStartVectorizeHorAtStore("slp-vectorize-hor-store", cl::init(false), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions feeding into a store"))
static unsigned getNumberOfPotentiallyCommutativeOps(Instruction *I)
static InstructionCost getScalarizationOverhead(const TargetTransformInfo &TTI, Type *ScalarTy, VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={})
This is similar to TargetTransformInfo::getScalarizationOverhead, but if ScalarTy is a FixedVectorTyp...
static bool buildCompressMask(ArrayRef< Value * > PointerOps, ArrayRef< unsigned > Order, Type *ScalarTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< int > &CompressMask)
Builds compress-like mask for shuffles for the given PointerOps, ordered with Order.
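A hypothetical, simplified illustration of the idea, assuming the element offsets are already sorted; the helper name is invented for this sketch and is not the in-tree implementation:

  #include <vector>
  // Given per-scalar element offsets from the first pointer, the compress
  // mask records which lane of the wide (masked) load feeds each scalar.
  std::vector<int> compressMaskFromOffsets(const std::vector<int> &Offsets) {
    std::vector<int> Mask(Offsets.size());
    for (size_t I = 0; I != Offsets.size(); ++I)
      Mask[I] = Offsets[I] - Offsets.front(); // lane of scalar I in the load
    return Mask;
  }
  // E.g. offsets {0, 2, 3} give mask {0, 2, 3}: one 4-wide load covers all
  // three scalars, with lanes 0, 2 and 3 live and lane 1 masked out.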
static std::pair< InstructionCost, InstructionCost > getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, ArrayRef< Type * > ArgTys)
Calculates the costs of vectorized intrinsic (if possible) and vectorized function (if possible) call...
static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements, SmallVectorImpl< int > &Mask)
static cl::opt< bool > SLPReVec("slp-revec", cl::init(false), cl::Hidden, cl::desc("Enable vectorization for wider vector utilization"))
static SmallVector< Constant * > replicateMask(ArrayRef< Constant * > Val, unsigned VF)
Replicates the given Val VF times.
static SmallVector< Type * > buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID, const unsigned VF, unsigned MinBW, const TargetTransformInfo *TTI)
Builds the arguments types vector for the given call instruction with the given ID for the specified ...
static cl::opt< int > RootLookAheadMaxDepth("slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for searching best rooting option"))
static const unsigned AliasedCheckLimit
static Type * getValueType(Value *V)
Returns the type of the given value/instruction V.
static std::string shortBundleName(ArrayRef< Value * > VL, int Idx=-1)
Print a short descriptor of the instruction bundle suitable for debug output.
static LLVM_DUMP_METHOD void dumpOrder(const BoUpSLP::OrdersType &Order)
static bool isValidElementType(Type *Ty)
Predicate for the element types that the SLP vectorizer supports.
static Instruction * getReductionInstr(const DominatorTree *DT, PHINode *P, BasicBlock *ParentBB, LoopInfo *LI)
Try and get a reduction instruction from a phi node.
static SmallVector< int > calculateShufflevectorMask(ArrayRef< Value * > VL)
static bool allSameType(ArrayRef< Value * > VL)
static MemoryLocation getLocation(Instruction *I)
static bool allSameBlock(ArrayRef< Value * > VL)
static unsigned getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not greater than Sz, which forms type,...
static bool areTwoInsertFromSameBuildVector(InsertElementInst *VU, InsertElementInst *V, function_ref< Value *(InsertElementInst *)> GetBaseOperand)
Check if two insertelement instructions are from the same buildvector.
static bool arePointersCompatible(Value *Ptr1, Value *Ptr2, const TargetLibraryInfo &TLI, bool CompareOpcodes=true)
static std::pair< InstructionCost, InstructionCost > getGEPCosts(const TargetTransformInfo &TTI, ArrayRef< Value * > Ptrs, Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind, Type *ScalarTy, VectorType *VecTy)
Calculate the scalar and the vector costs from vectorizing set of GEPs.
static SmallBitVector isUndefVector(const Value *V, const SmallBitVector &UseMask={})
Checks if the given value is actually an undefined constant vector.
static Instruction * findInstructionWithOpcode(ArrayRef< Value * > VL, unsigned Opcode)
Find an instruction with a specific opcode in VL.
static InstructionCost getExtractWithExtendCost(const TargetTransformInfo &TTI, unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput)
This is similar to TargetTransformInfo::getExtractWithExtendCost, but if Dst is a FixedVectorType,...
static InstructionsState getSameOpcode(ArrayRef< Value * > VL, const TargetLibraryInfo &TLI)
static cl::opt< int > ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden, cl::desc("Limit the size of the SLP scheduling region per block"))
Limits the size of scheduling regions in a block.
static std::pair< Instruction *, Instruction * > getMainAltOpsNoStateVL(ArrayRef< Value * > VL)
Returns main/alternate instructions for the given VL.
static Instruction * tryGetSecondaryReductionRoot(PHINode *Phi, Instruction *Root)
We could have an initial reduction that is not an add.
static RecurKind getRdxKind(Value *V)
Gets recurrence kind from the specified value.
static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1)
static void gatherPossiblyVectorizableLoads(const BoUpSLP &R, ArrayRef< Value * > VL, const DataLayout &DL, ScalarEvolution &SE, const TargetTransformInfo &TTI, SmallVectorImpl< SmallVector< std::pair< LoadInst *, int64_t > > > &GatheredLoads, bool AddNew=true)
Tries to find a subvector of loads and builds a new vector of loads only, if it can be profitable.
static cl::opt< int > MinVectorRegSizeOption("slp-min-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static std::optional< TargetTransformInfo::ShuffleKind > isFixedVectorShuffle(ArrayRef< Value * > VL, SmallVectorImpl< int > &Mask, AssumptionCache *AC)
Checks if the vector of instructions can be represented as a shuffle, like: x0 = extractelement <4 x ...
static bool tryToVectorizeSequence(SmallVectorImpl< T * > &Incoming, function_ref< bool(T *, T *)> Comparator, function_ref< bool(ArrayRef< T * >, T *)> AreCompatible, function_ref< bool(ArrayRef< T * >, bool)> TryToVectorizeHelper, bool MaxVFOnly, BoUpSLP &R)
static std::optional< unsigned > getAggregateSize(Instruction *InsertInst)
static std::optional< unsigned > getInsertExtractIndex(const Value *Inst, unsigned Offset)
static cl::opt< unsigned > RecursionMaxDepth("slp-recursion-max-depth", cl::init(12), cl::Hidden, cl::desc("Limit the recursion depth when building a vectorizable tree"))
static bool tryToFindDuplicates(SmallVectorImpl< Value * > &VL, SmallVectorImpl< int > &ReuseShuffleIndices, const TargetTransformInfo &TTI, const TargetLibraryInfo &TLI, const InstructionsState &S, const BoUpSLP::EdgeInfo &UserTreeIdx, bool TryPad=false)
Checks that every instruction appears once in the list and if not, packs them, building ReuseShuffleI...
static Align computeCommonAlignment(ArrayRef< Value * > VL)
Calculates minimal alignment as a common alignment.
static void addMask(SmallVectorImpl< int > &Mask, ArrayRef< int > SubMask, bool ExtendingManyInputs=false)
Shuffles Mask in accordance with the given SubMask.
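A minimal standalone sketch of this kind of mask composition, assuming the usual LLVM convention that -1 marks a poison (undefined) lane; the helper is invented for illustration and is not the in-tree code:

  #include <vector>
  // Each SubMask element selects an element of the existing Mask;
  // poison lanes propagate.
  std::vector<int> composeMasks(const std::vector<int> &Mask,
                                const std::vector<int> &SubMask) {
    std::vector<int> NewMask(SubMask.size());
    for (size_t I = 0; I != SubMask.size(); ++I)
      NewMask[I] = SubMask[I] < 0 ? -1 : Mask[SubMask[I]];
    return NewMask;
  }
  // E.g. Mask = {2, 0, 1} and SubMask = {1, -1, 2} yield {0, -1, 1}.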
static void fixupOrderingIndices(MutableArrayRef< unsigned > Order)
Order may have elements assigned the special value (size), which is out of bounds.
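A plausible sketch of such a fixup (hypothetical helper, not the in-tree code): placeholder slots holding the out-of-bounds value are assigned the still-unused indices, restoring a valid permutation.

  #include <vector>
  void fixupOrder(std::vector<unsigned> &Order) {
    const unsigned Sz = Order.size();
    std::vector<bool> Used(Sz, false);
    for (unsigned Idx : Order)
      if (Idx < Sz)
        Used[Idx] = true;
    unsigned Free = 0;
    for (unsigned &Idx : Order)
      if (Idx == Sz) {             // placeholder slot
        while (Used[Free]) ++Free; // next unused index
        Idx = Free;
        Used[Free] = true;
      }
  }
  // E.g. {1, 4, 0, 4} (Sz == 4) becomes {1, 2, 0, 3}.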
static Value * createExtractVector(IRBuilderBase &Builder, Value *Vec, unsigned SubVecVF, unsigned Index)
Generates a subvector extract using a default shuffle.
static cl::opt< bool > DisableTreeReorder("slp-disable-tree-reorder", cl::init(false), cl::Hidden, cl::desc("Disable tree reordering even if it is " "profitable. Used for testing only."))
static Instruction * getNonPhiOperand(Instruction *I, PHINode *Phi)
Returns the first operand of I that does not match Phi.
static InstructionCost getVectorInstrCost(const TargetTransformInfo &TTI, Type *ScalarTy, unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Scalar, ArrayRef< std::tuple< Value *, User *, int > > ScalarUserAndIdx)
This is similar to TargetTransformInfo::getVectorInstrCost, but if ScalarTy is a FixedVectorType,...
static SmallBitVector getAltInstrMask(ArrayRef< Value * > VL, Type *ScalarTy, unsigned Opcode0, unsigned Opcode1)
static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI, const DominatorTree &DT)
Compare two cmp instructions.
static bool isReverseOrder(ArrayRef< unsigned > Order)
Check if Order represents reverse order.
#define STATISTIC(VARNAME, DESC)
Merges shuffle masks and emits final shuffle instruction, if required.
ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI, ArrayRef< Value * > VectorizedVals, BoUpSLP &R, SmallPtrSetImpl< Value * > &CheckedExtracts)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
void resetForSameNode()
Reset the builder to handle perfect diamond match.
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
std::optional< InstructionCost > needToDelay(const TreeEntry *, ArrayRef< SmallVector< const TreeEntry * > >) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
InstructionCost finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &, function_ref< Value *(Value *, Value *, ArrayRef< int >)>)> Action={})
Finalize emission of the shuffles.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
InstructionCost createFreeze(InstructionCost Cost)
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
void add(Value *V1, ArrayRef< int > Mask, bool ForExtracts=false)
Adds one more input vector and the mask for its shuffling.
Merges shuffle masks and emits final shuffle instruction, if required.
void add(Value *V1, ArrayRef< int > Mask, bool=false)
Adds one more input vector and the mask for its shuffling.
void addOrdered(Value *V1, ArrayRef< unsigned > Order)
Adds one more input vector and the order for its shuffling.
std::optional< Value * > needToDelay(const TreeEntry *E, ArrayRef< SmallVector< const TreeEntry * > > Deps) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Adds a single input vector (in the form of a tree entry) and the mask for its shuffling.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
Value * createFreeze(Value *V)
ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
Adds 2 input vectors (in the form of tree entries) and the mask for their shuffling.
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
Adjusts extractelements after reusing them.
~ShuffleInstructionBuilder()
void resetForSameNode()
Reset the builder to handle perfect diamond match.
Value * finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &, function_ref< Value *(Value *, Value *, ArrayRef< int >)>)> Action={})
Finalize emission of the shuffles.
A manager for alias analyses.
Class for arbitrary precision integers.
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
void clearBit(unsigned BitPosition)
Set a given bit to 0.
uint64_t getZExtValue() const
Get zero extended value.
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
LLVM_ABI APInt urem(const APInt &RHS) const
Unsigned remainder operation.
unsigned getBitWidth() const
Return the number of bits in the APInt.
bool ult(const APInt &RHS) const
Unsigned less than comparison.
void clearAllBits()
Set every bit to 0.
void negate()
Negate this APInt in place.
unsigned logBase2() const
void setAllBits()
Set every bit to 1.
void setBits(unsigned loBit, unsigned hiBit)
Set the bits from loBit (inclusive) to hiBit (exclusive) to 1.
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
bool isOne() const
Determine if this is a value of 1.
static APInt getBitsSetFrom(unsigned numBits, unsigned loBit)
Constructs an APInt value that has a contiguous range of bits set.
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
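A short usage sketch of the APInt calls listed above (standalone example, values chosen arbitrarily):

  #include "llvm/ADT/APInt.h"
  using namespace llvm;

  void apintDemo() {
    APInt DemandedElts = APInt::getZero(8); // 8 bits, all clear
    DemandedElts.setBits(2, 5);             // set bits 2..4 -> 0b00011100
    bool Pow2 = DemandedElts.isPowerOf2();  // false: three bits are set
    APInt One = APInt::getOneBitSet(8, 3);  // 0b00001000
    (void)Pow2; (void)One;
  }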
PassT::Result * getCachedResult(IRUnitT &IR) const
Get the cached result of an analysis pass for a given IR unit.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
bool equals(ArrayRef RHS) const
equals - Check for element-wise equality.
const T & back() const
back - Get the last element.
ArrayRef< T > take_front(size_t N=1) const
Return a copy of *this with only the first N elements.
ArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
const T & front() const
front - Get the first element.
size_t size() const
size - Get the array size.
bool empty() const
empty - Check if the array is empty.
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
const T & consume_front()
consume_front() - Returns the first element and drops it from ArrayRef.
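A small standalone illustration of the slicing API above:

  #include "llvm/ADT/ArrayRef.h"
  using namespace llvm;

  void arrayRefDemo() {
    int Data[] = {10, 20, 30, 40};
    ArrayRef<int> VL(Data);
    ArrayRef<int> Head = VL.take_front(2); // {10, 20}
    ArrayRef<int> Tail = VL.drop_front();  // {20, 30, 40}
    ArrayRef<int> Mid = VL.slice(1, 2);    // {20, 30}
    (void)Head; (void)Tail; (void)Mid;
  }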
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
static LLVM_ABI Attribute getWithAlignment(LLVMContext &Context, Align Alignment)
Return a uniquified Attribute object that has the specific alignment set.
LLVM Basic Block Representation.
iterator begin()
Instruction iterator methods.
const Function * getParent() const
Return the enclosing method, or null if none.
InstListType::reverse_iterator reverse_iterator
InstListType::iterator iterator
Instruction iterators...
LLVM_ABI const_iterator getFirstNonPHIOrDbgOrAlloca() const
Returns an iterator to the first instruction in this block that is not a PHINode, a debug intrinsic,...
InstListType::const_reverse_iterator const_reverse_iterator
bool isEHPad() const
Return true if this basic block is an exception handling block.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Represents analyses that only rely on functions' control flow.
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
unsigned getBundleOperandsEndIndex() const
Return the index of the last bundle operand in the Use array.
LLVM_ABI void getOperandBundlesAsDefs(SmallVectorImpl< OperandBundleDef > &Defs) const
Return the list of operand bundles attached to this instruction as a vector of OperandBundleDefs.
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
bool hasIdenticalOperandBundleSchema(const CallBase &Other) const
Return true if Other has the same sequence of operand bundle tags with the same number of operands on...
unsigned getBundleOperandsStartIndex() const
Return the index of the first bundle operand in the Use array.
Value * getArgOperand(unsigned i) const
FunctionType * getFunctionType() const
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
unsigned arg_size() const
bool hasOperandBundles() const
Return true if this User has any operand bundles.
This class represents a function call, abstracting a target machine's calling convention.
This is the base class for all instructions that perform data casts.
This class is the base class for the comparison instructions.
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
@ ICMP_SLT
signed less than
@ ICMP_SLE
signed less or equal
@ ICMP_UGE
unsigned greater or equal
@ ICMP_UGT
unsigned greater than
@ ICMP_SGT
signed greater than
@ ICMP_ULT
unsigned less than
@ ICMP_SGE
signed greater or equal
@ ICMP_ULE
unsigned less or equal
Predicate getSwappedPredicate() const
For example, EQ->EQ, SLE->SGE, ULT->UGT, OEQ->OEQ, ULE->UGE, OLT->OGT, etc.
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Predicate getPredicate() const
Return the predicate for this instruction.
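A hedged sketch of how the swapped-predicate query is typically used when matching compares with exchanged operands; the real isCmpSameOrSwapped above also checks the operands themselves, and this helper is invented for illustration:

  #include "llvm/IR/Instructions.h"
  using namespace llvm;

  bool predicateSameOrSwapped(const CmpInst *A, const CmpInst *B) {
    return A->getPredicate() == B->getPredicate() ||
           A->getPredicate() == B->getSwappedPredicate();
  }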
static LLVM_ABI Constant * getIntToPtr(Constant *C, Type *Ty, bool OnlyIfReduced=false)
static LLVM_ABI Constant * getBinOpIdentity(unsigned Opcode, Type *Ty, bool AllowRHSConstant=false, bool NSZ=false)
Return the identity constant for a binary opcode.
This is the shared class of boolean and integer constants.
static LLVM_ABI ConstantInt * getTrue(LLVMContext &Context)
static LLVM_ABI ConstantInt * getFalse(LLVMContext &Context)
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
const APInt & getValue() const
Return the constant as an APInt value reference.
static LLVM_ABI Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
static LLVM_ABI Constant * get(ArrayRef< Constant * > V)
This is an important base class in LLVM.
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
static bool shouldExecute(unsigned CounterName)
static DebugLoc getUnknown()
An analysis that produces DemandedBits for a function.
LLVM_ABI APInt getDemandedBits(Instruction *I)
Return the bits demanded from instruction I.
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
iterator find(const_arg_type_t< KeyT > Val)
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
bool erase(const KeyT &Val)
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
const ValueT & at(const_arg_type_t< KeyT > Val) const
at - Return the entry for the specified key, or abort if no such entry exists.
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Implements a dense probed hash-table based set.
Base class for the actual dominator tree node.
unsigned getDFSNumIn() const
getDFSNumIn/getDFSNumOut - These return the DFS visitation order for nodes in the dominator tree.
Analysis pass which computes a DominatorTree.
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
LLVM_ABI bool isReachableFromEntry(const Use &U) const
Provide an overload for a Use.
LLVM_ABI bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
static constexpr ElementCount getFixed(ScalarTy MinVal)
Convenience struct for specifying and reasoning about fast-math flags.
bool allowReassoc() const
Flag queries.
bool allowContract() const
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
ArrayRef< Type * > params() const
Type * getReturnType() const
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
For the node iterator we just need to turn the TreeEntry iterator into a TreeEntry* iterator so that ...
nodes_iterator operator++()
nodes_iterator(const ItTy &It2)
bool operator!=(const nodes_iterator &N2) const
Common base class shared among various IRBuilders.
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
IntegerType * getInt1Ty()
Fetch the type representing a single bit.
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Value * CreateFreeze(Value *V, const Twine &Name="")
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
LLVM_ABI Value * CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 2 operands which is mangled on the first type.
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
LLVM_ABI CallInst * CreateUnaryIntrinsic(Intrinsic::ID ID, Value *V, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 1 operand which is mangled on its type.
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
LLVM_ABI Value * CreateSelectWithUnknownProfile(Value *C, Value *True, Value *False, StringRef PassName, const Twine &Name="")
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Value * CreateFMul(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
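A minimal illustration of the shuffle-emission API above (hypothetical helper; uses the mask-taking CreateShuffleVector overload, where the second input is implicitly poison):

  #include "llvm/IR/IRBuilder.h"
  using namespace llvm;

  // Reverse the four lanes of a vector value.
  Value *emitReverse(IRBuilderBase &Builder, Value *Vec) {
    return Builder.CreateShuffleVector(Vec, ArrayRef<int>{3, 2, 1, 0}, "rev");
  }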
This instruction inserts a single (scalar) element into a VectorType value.
VectorType * getType() const
Overload to return most specific vector type.
static InstructionCost getInvalid(CostType Val=0)
bool mayReadOrWriteMemory() const
Return true if this instruction may read or write memory.
LLVM_ABI bool mayWriteToMemory() const LLVM_READONLY
Return true if this instruction may modify memory.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
LLVM_ABI void moveAfter(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
LLVM_ABI bool isCommutative() const LLVM_READONLY
Return true if the instruction is commutative:
LLVM_ABI bool comesBefore(const Instruction *Other) const
Given an instruction Other in the same basic block as this instruction, return true if this instructi...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
LLVM_ABI bool isIdenticalTo(const Instruction *I) const LLVM_READONLY
Return true if the specified instruction is exactly identical to the current one.
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
const SmallVectorImpl< Type * > & getArgTypes() const
An instruction for reading from memory.
Value * getPointerOperand()
Analysis pass that exposes the LoopInfo for a function.
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
Represents a single loop in the control flow graph.
This class implements a map that also provides access to all stored values in a deterministic order.
VectorType takeVector()
Clear the MapVector and return the underlying vector.
iterator find(const KeyT &Key)
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
ValueT lookup(const KeyT &Key) const
std::pair< KeyT, ValueT > & front()
This is the common base class for memset/memcpy/memmove.
Representation for a specific memory location.
static LLVM_ABI MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
const Value * Ptr
The address of the start of the location.
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
T & front() const
front - Get the first element.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Value * getIncomingValueForBlock(const BasicBlock *BB) const
BasicBlock * getIncomingBlock(unsigned i) const
Return incoming basic block number i.
unsigned getNumIncomingValues() const
Return the number of incoming edges.
Pass interface - Implemented by all 'passes'.
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
A discriminated union of two or more pointer types, with the discriminator in the low bit of the poin...
bool isNull() const
Test if the pointer held in the union is null, regardless of which type it is.
T dyn_cast() const
Returns the current pointer if it is of the specified pointer type, otherwise returns null.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
PriorityQueue - This class behaves like std::priority_queue and provides a few additional convenience...
unsigned getOpcode() const
static bool isIntMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is an integer min/max kind.
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
This class represents an analyzed expression in the program.
LLVM_ABI bool isZero() const
Return true if the expression is a constant zero.
LLVM_ABI bool isNonConstantNegative() const
Return true if the specified scev is negated, but not a constant.
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
LLVM_ABI const SCEV * getConstant(ConstantInt *V)
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
LLVM_ABI const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
LLVM_ABI const SCEV * getUDivExactExpr(const SCEV *LHS, const SCEV *RHS)
Get a canonical unsigned division expression, or something simpler if possible.
LLVM_ABI const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
ArrayRef< value_type > getArrayRef() const
size_type size() const
Determine the number of elements in the SetVector.
const value_type & front() const
Return the first element of the SetVector.
void insert_range(Range &&R)
Vector takeVector()
Clear the SetVector and return the underlying vector.
void clear()
Completely clear the SetVector.
bool empty() const
Determine if the SetVector is empty or not.
bool insert(const value_type &X)
Insert a new element into the SetVector.
bool contains(const key_type &key) const
Check if the SetVector contains the given key.
static LLVM_ABI bool isZeroEltSplatMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses all elements with the same value as the first element of exa...
static LLVM_ABI bool isOneUseSingleSourceMask(ArrayRef< int > Mask, int VF)
Return true if this shuffle mask represents "clustered" mask of size VF, i.e.
static LLVM_ABI bool isDeInterleaveMaskOfFactor(ArrayRef< int > Mask, unsigned Factor, unsigned &Index)
Check if the mask is a DE-interleave mask of the given factor Factor like: <Index,...
static LLVM_ABI bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
static LLVM_ABI bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static LLVM_ABI bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
static LLVM_ABI bool isInsertSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &NumSubElts, int &Index)
Return true if this shuffle mask is an insert subvector mask.
static LLVM_ABI bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
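A standalone sketch of one of these mask queries (values chosen arbitrarily):

  #include "llvm/IR/Instructions.h"
  using namespace llvm;

  void maskDemo() {
    int Extract[] = {4, 5, 6, 7};
    int Index = 0;
    bool IsExtract = ShuffleVectorInst::isExtractSubvectorMask(
        Extract, /*NumSrcElts=*/8, Index);
    // IsExtract == true, Index == 4: lanes 4..7 of one 8-wide source.
    (void)IsExtract;
  }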
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
int find_first() const
Returns the index of the first set bit, -1 if none of the bits are set.
bool test(unsigned Idx) const
int find_next(unsigned Prev) const
Returns the index of the next set bit following the "Prev" bit.
bool all() const
Returns true if all bits are set.
size_type size() const
Returns the number of bits in this bitvector.
bool any() const
Returns true if any bit is set.
size_type count() const
Returns the number of bits which are set.
bool none() const
Returns true if none of the bits are set.
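A short sketch of the usual idiom for walking the set bits with find_first/find_next:

  #include "llvm/ADT/SmallBitVector.h"
  using namespace llvm;

  unsigned visitSetBits(const SmallBitVector &Bits) {
    unsigned N = 0;
    for (int I = Bits.find_first(); I != -1; I = Bits.find_next(I))
      ++N; // I is the index of each set bit, in increasing order
    return N; // equals Bits.count()
  }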
Implements a dense probed hash-table based set with some number of buckets stored inline.
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
bool erase(PtrType Ptr)
Remove pointer from the set.
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
void insert_range(Range &&R)
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
A SetVector that performs no allocations if smaller than a certain size.
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
bool contains(const T &V) const
Check if the SmallSet contains the given element.
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void assign(size_type NumElts, ValueParamT Elt)
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
iterator erase(const_iterator CI)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void swap(SmallVectorImpl &RHS)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
Type * getPointerOperandType() const
Value * getValueOperand()
Value * getPointerOperand()
StringRef - Represent a constant reference to a string, i.e.
TargetFolder - Create constants with target dependent folding.
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
The instances of the Type class are immutable: once they are created, they are never changed.
LLVM_ABI bool isEmptyTy() const
Return true if this type is empty, that is, it has no elements or all of its elements are empty.
bool isVectorTy() const
True if this is an instance of VectorType.
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
bool isPointerTy() const
True if this is an instance of PointerType.
LLVM_ABI unsigned getStructNumElements() const
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
bool isSingleValueType() const
Return true if the type is a valid type for a register in codegen.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
LLVM_ABI Type * getWithNewType(Type *EltTy) const
Given vector type, change the element type, whilst keeping the old number of elements.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
LLVM_ABI void print(raw_ostream &O, bool IsForDebug=false, bool NoDetails=false) const
Print the current type.
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
bool isIntegerTy() const
True if this is an instance of IntegerType.
TypeID getTypeID() const
Return the type id for the type.
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
bool isVoidTy() const
Return true if this is 'void'.
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
A Use represents the edge between a Value definition and its users.
LLVM_ABI bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
User(Type *ty, unsigned vty, AllocInfo AllocInfo)
Value * getOperand(unsigned i) const
unsigned getNumOperands() const
iterator_range< value_op_iterator > operand_values()
The Vector Function Database.
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
user_iterator user_begin()
bool hasOneUse() const
Return true if there is exactly one use of this value.
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
iterator_range< user_iterator > users()
unsigned getValueID() const
Return an ID for the concrete type of this object.
LLVM_ABI bool hasNUsesOrMore(unsigned N) const
Return true if this value has N uses or more.
LLVM_ABI bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
LLVM_ABI LLVMContext & getContext() const
All values hold a context through their type.
LLVM_ABI unsigned getNumUses() const
This method computes the number of uses of this Value.
iterator_range< use_iterator > uses()
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Base class of all SIMD vector types.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
static LLVM_ABI bool isValidElementType(Type *ElemTy)
Return true if the specified type is valid as a element type.
Type * getElementType() const
std::pair< iterator, bool > insert(const ValueT &V)
iterator find(const_arg_type_t< ValueT > V)
void insert_range(Range &&R)
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
An efficient, type-erasing, non-owning reference to a callable.
An opaque object representing a hash code.
const ParentTy * getParent() const
self_iterator getIterator()
NodeTy * getNextNode()
Get the next node, or nullptr for the list tail.
SmallVector< BoUpSLP::EdgeInfo, 1 >::iterator I
iterator_adaptor_base()=default
This class implements an extremely fast bulk output stream that can only output to a stream.
raw_ostream & indent(unsigned NumSpaces)
indent - Insert 'NumSpaces' spaces.
A raw_ostream that writes to an std::string.
A raw_ostream that writes to an SmallVector or SmallString.
A helper class used for scoring candidates for two consecutive lanes.
static const int ScoreConsecutiveExtracts
ExtractElementInst from same vector and consecutive indexes.
int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2, ArrayRef< Value * > MainAltOps) const
static const int ScoreAllUserVectorized
Score if all users are vectorized.
static const int ScoreSameOpcode
Instructions with the same opcode.
static const int ScoreUndef
Matching with an undef is preferable to failing.
int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1, Instruction *U2, int CurrLevel, ArrayRef< Value * > MainAltOps) const
Go through the operands of LHS and RHS recursively until MaxLevel, and return the cumulative score.
static const int ScoreFail
Score for failing to find a decent match.
static const int ScoreMaskedGatherCandidate
A load candidate for masked gather.
static const int ScoreSplat
Identical instructions (a.k.a. splat or broadcast).
LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R, int NumLanes, int MaxLevel)
static const int ScoreSplatLoads
The same load multiple times.
static const int ScoreReversedLoads
Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
static const int ScoreConstants
Constants.
static const int ScoreAltOpcodes
Instructions with alt opcodes (e.g, add + sub).
static const int ScoreConsecutiveLoads
Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
static const int ScoreReversedExtracts
ExtractElementInst from same vector and reversed indices.
A helper data structure to hold the operands of a vector of instructions.
ValueList getVL(unsigned OpIdx) const
Returns a value vector with the operands across all lanes for the operand at OpIdx.
static LLVM_DUMP_METHOD StringRef getModeStr(ReorderingMode RMode)
static LLVM_DUMP_METHOD void dumpMode(ReorderingMode RMode)
Debug print.
LLVM_DUMP_METHOD void dump() const
Debug print.
friend raw_ostream & operator<<(raw_ostream &OS, ReorderingMode RMode)
static LLVM_DUMP_METHOD raw_ostream & printMode(ReorderingMode RMode, raw_ostream &OS)
LLVM_DUMP_METHOD raw_ostream & print(raw_ostream &OS) const
VLOperands(ArrayRef< Value * > RootVL, ArrayRef< ValueList > Operands, const InstructionsState &S, const BoUpSLP &R)
Initialize with all the operands of the instruction vector RootVL.
Bottom Up SLP Vectorizer.
bool isProfitableToReorder() const
Checks if it is profitable to reorder the current tree.
SmallVector< unsigned, 4 > OrdersType
std::optional< std::pair< Type *, bool > > getRootNodeTypeWithNoCast() const
Returns the type/is-signed info for the root node in the graph without casting.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::ScheduleEntity &SE)
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::ScheduleBundle &Bundle)
std::optional< OrdersType > findPartiallyOrderedLoads(const TreeEntry &TE)
Sort loads into increasing pointer offsets to allow greater clustering.
LoadsState
Tracks the state in which the loads of the given sequence can be represented.
void reorderTopToBottom()
Reorders the current graph to the most profitable order starting from the root node to the leaf nodes...
void reorderBottomToTop(bool IgnoreReorder=false)
Reorders the current graph to the most profitable order starting from leaves to the root.
InstructionCost getSpillCost()
void registerNonVectorizableLoads(ArrayRef< T * > VL)
Registers non-vectorizable sequence of loads.
SmallVector< StoreInst *, 8 > StoreList
InstructionCost getTreeCost(ArrayRef< Value * > VectorizedVals={}, InstructionCost ReductionCost=TTI::TCC_Free)
unsigned getTreeSize() const
bool isStridedLoad(ArrayRef< Value * > PointerOps, Type *ScalarTy, Align Alignment, const int64_t Diff, const size_t Sz) const
Checks if strided loads can be generated out of VL loads with pointers PointerOps:
bool areKnownNonVectorizableLoads(ArrayRef< T * > VL) const
Checks if the given loads sequence is known as not vectorizable.
unsigned getCanonicalGraphSize() const
Returns the base graph size, before any transformations.
bool areAnalyzedReductionVals(ArrayRef< Value * > VL) const
Checks if the provided list of reduced values was checked already for vectorization.
bool isLoadCombineCandidate(ArrayRef< Value * > Stores) const
Assume that a vector of stores of bitwise-or/shifted/zexted loaded values can be load combined in the...
void analyzedReductionVals(ArrayRef< Value * > VL)
Adds the list of reduced values to list of already checked values for the vectorization.
bool isLoadCombineReductionCandidate(RecurKind RdxKind) const
Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values can be load combined in the ...
unsigned getVectorElementSize(Value *V)
SmallVector< Instruction *, 16 > InstrList
bool isSignedMinBitwidthRootNode() const
Checks if the root graph node can be emitted with narrower bitwidth at codegen and returns it signedn...
void analyzedReductionRoot(Instruction *I)
Register given instruction as already analyzed for being possible reduction root.
ArrayRef< Value * > getRootNodeScalars() const
Return the scalars of the root node.
LoadsState canVectorizeLoads(ArrayRef< Value * > VL, const Value *VL0, SmallVectorImpl< unsigned > &Order, SmallVectorImpl< Value * > &PointerOps, StridedPtrInfo &SPtrInfo, unsigned *BestVF=nullptr, bool TryRecursiveCheck=true) const
Checks if the given array of loads can be represented as a vectorized, scatter or just simple gather.
void computeMinimumValueSizes()
Compute the minimum type sizes required to represent the entries in a vectorizable tree.
void deleteTree()
Clear the internal data structures that are created by 'buildTree'.
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
SmallPtrSet< Value *, 16 > ValueSet
SmallDenseSet< Value *, 4 > ExtraValueToDebugLocsMap
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB, const DataLayout *DL, OptimizationRemarkEmitter *ORE)
bool isNotScheduled(const Value *V) const
Checks if the specified value was not scheduled.
void transformNodes()
Transforms graph nodes to target specific representations, if profitable.
bool isDeleted(Instruction *I) const
Checks if the instruction is marked for deletion.
bool analyzeRtStrideCandidate(ArrayRef< Value * > PointerOps, Type *ScalarTy, Align CommonAlignment, SmallVectorImpl< unsigned > &SortedIndices, StridedPtrInfo &SPtrInfo) const
Return true if an array of scalar loads can be replaced with a strided load (with run-time stride).
void buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues={})
Builds external uses of the vectorized scalars, i.e.
bool isVectorized(const Value *V) const
Check if the value is vectorized in the tree.
bool isTreeTinyAndNotFullyVectorizable(bool ForReduction=false) const
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::ScheduleData &SD)
unsigned canMapToVector(Type *T) const
Check if homogeneous aggregate is isomorphic to some VectorType.
unsigned getMinVF(unsigned Sz) const
bool isAnalyzedReductionRoot(Instruction *I) const
Checks if the instruction was already analyzed for being possible reduction root.
void eraseInstruction(Instruction *I)
Removes an instruction from its block and eventually deletes it.
OptimizationRemarkEmitter * getORE()
bool isAnyGathered(const SmallDenseSet< Value * > &Vals) const
Checks if the given value is gathered in one of the nodes.
SmallVector< Value *, 8 > ValueList
void removeInstructionsAndOperands(ArrayRef< T * > DeadVals, ArrayRef< std::tuple< Value *, unsigned, bool > > VectorValuesAndScales)
Remove instructions from the parent function and clear the operands of DeadVals instructions,...
static bool isIdentityOrder(ArrayRef< unsigned > Order)
Does this non-empty order represent an identity order?
void buildTree(ArrayRef< Value * > Roots, const SmallDenseSet< Value * > &UserIgnoreLst)
Construct a vectorizable tree that starts at Roots, ignoring users for the purpose of scheduling and ...
bool isTreeNotExtendable() const
Checks if the graph and all its subgraphs cannot be better vectorized.
FixedVectorType * getReductionType() const
Returns the reduction type after min-bitwidth analysis.
unsigned getMaxVecRegSize() const
std::optional< OrdersType > findReusedOrderedScalars(const TreeEntry &TE, bool TopToBottom, bool IgnoreReorder)
Checks if the specified gather tree entry TE can be represented as a shuffled vector entry + (possibl...
bool isGathered(const Value *V) const
Checks if the given value is gathered in one of the nodes.
bool analyzeConstantStrideCandidate(const ArrayRef< Value * > PointerOps, Type *ElemTy, Align Alignment, const SmallVectorImpl< unsigned > &SortedIndices, const int64_t Diff, Value *Ptr0, Value *PtrN, StridedPtrInfo &SPtrInfo) const
Return true if an array of scalar loads can be replaced with a strided load (with constant stride).
unsigned getMinVecRegSize() const
Value * vectorizeTree()
Vectorize the tree that starts with the elements in VL.
std::optional< int > findBestRootPair(ArrayRef< std::pair< Value *, Value * > > Candidates, int Limit=LookAheadHeuristics::ScoreFail) const
Evaluate each pair in Candidates and return index into Candidates for a pair which have highest score...
std::optional< OrdersType > getReorderingData(const TreeEntry &TE, bool TopToBottom, bool IgnoreReorder)
Gets reordering data for the given tree entry.
void clearReductionData()
Clear the list of the analyzed reduction root instructions.
void optimizeGatherSequence()
Perform LICM and CSE on the newly generated gather sequences.
Function * getVectorizedFunction(const VFShape &Shape) const
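Taken together, the BoUpSLP interface above is driven roughly as follows; this is a simplified sketch of the control flow, not the exact in-tree driver (the real pass also emits optimization remarks, schedules bundles, and handles several special cases):

  // Simplified sketch of driving BoUpSLP over a candidate value list.
  bool tryVectorize(BoUpSLP &R, ArrayRef<Value *> VL,
                    const SmallDenseSet<Value *> &Ignore) {
    R.buildTree(VL, Ignore);            // build the vectorizable tree
    if (R.isTreeTinyAndNotFullyVectorizable())
      return false;
    R.reorderTopToBottom();             // find a profitable lane order
    R.reorderBottomToTop();
    R.transformNodes();                 // target-specific node rewrites
    R.buildExternalUses();              // scalars used outside the tree
    R.computeMinimumValueSizes();       // minbitwidth analysis
    InstructionCost Cost = R.getTreeCost();
    if (!(Cost < -SLPCostThreshold))    // vectorize only if profitable
      return false;
    R.vectorizeTree();                  // emit the vector code
    return true;
  }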
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
bool isBitwiseLogicOp(unsigned Opcode)
Whether this is bitwise logic opcode.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
TwoOps_match< ValueOpTy, PointerOpTy, Instruction::Store > m_Store(const ValueOpTy &ValueOp, const PointerOpTy &PointerOp)
Matches StoreInst.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
ap_match< APInt > m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMaxNum(const Opnd0 &Op0, const Opnd1 &Op1)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMinimum(const Opnd0 &Op0, const Opnd1 &Op1)
MaxMin_match< ICmpInst, LHS, RHS, smin_pred_ty > m_SMin(const LHS &L, const RHS &R)
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMaximum(const Opnd0 &Op0, const Opnd1 &Op1)
BinaryOp_match< LHS, RHS, Instruction::FAdd > m_FAdd(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
MaxMin_match< ICmpInst, LHS, RHS, umax_pred_ty > m_UMax(const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignores it.
MaxMin_match< ICmpInst, LHS, RHS, smax_pred_ty > m_SMax(const LHS &L, const RHS &R)
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
auto m_Undef()
Match an arbitrary undef constant.
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMinNum(const Opnd0 &Op0, const Opnd1 &Op1)
BinaryOp_match< LHS, RHS, Instruction::Or > m_Or(const LHS &L, const RHS &R)
MatchFunctor< Val, Pattern > match_fn(const Pattern &P)
A match functor that can be used as a UnaryPredicate in functional algorithms like all_of.
MaxMin_match< ICmpInst, LHS, RHS, umin_pred_ty > m_UMin(const LHS &L, const RHS &R)
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
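Taken together, these matchers compose into declarative IR queries. A minimal sketch (the helper name matchMulOfSum and the captured X/Y are illustrative, not part of the API):

#include "llvm/IR/PatternMatch.h"
using namespace llvm;
using namespace llvm::PatternMatch;

// True iff V is (X + C) * Y for some ConstantInt C; captures X and Y.
static bool matchMulOfSum(Value *V, Value *&X, Value *&Y) {
  return match(V, m_Mul(m_Add(m_Value(X), m_ConstantInt()), m_Value(Y)));
}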
initializer< Ty > init(const Ty &Val)
unsigned combineHashValue(unsigned a, unsigned b)
Simplistic combination of 32-bit hash values into 32-bit hash values.
@ User
could "use" a pointer
DiagnosticInfoOptimizationBase::Argument NV
friend class Instruction
Iterator for Instructions in a BasicBlock.
LLVM_ABI iterator begin() const
LLVM_ABI Instruction & front() const
A private "module" namespace for types and utilities used by this pass.
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
LLVM_ABI Value * createSimpleReduction(IRBuilderBase &B, Value *Src, RecurKind RdxKind)
Create a reduction of the given vector.
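A hedged usage sketch, assuming an in-scope IRBuilderBase Builder and a vector value Vec:

// Sum all lanes of Vec into one scalar via an add reduction.
Value *Sum = createSimpleReduction(Builder, Vec, RecurKind::Add);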
static bool doesNotNeedToBeScheduled(Value *V)
Checks if the specified value does not require scheduling.
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iterable types.
void stable_sort(R &&Range)
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
void fill(R &&Range, T &&Value)
Provide wrappers to std::fill which take ranges instead of having to pass begin/end explicitly.
UnaryFunction for_each(R &&Range, UnaryFunction F)
Provide wrappers to std::for_each which take ranges instead of having to pass begin/end explicitly.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
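For example, a sketch using the range form (VL and BB are illustrative names):

// True when every value in VL is an Instruction placed in block BB.
bool AllInBB = llvm::all_of(VL, [BB](Value *V) {
  auto *I = dyn_cast<Instruction>(V);
  return I && I->getParent() == BB;
});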
hash_code hash_value(const FixedPointSemantics &Val)
LLVM_ABI Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
MaybeAlign getAlign(const CallInst &I, unsigned Index)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
LLVM_ABI bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
LLVM_ABI Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
static void reorderScalars(SmallVectorImpl< Value * > &Scalars, ArrayRef< int > Mask)
Reorders the list of scalars in accordance with the given Mask.
detail::scope_exit< std::decay_t< Callable > > make_scope_exit(Callable &&F)
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
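A short sketch pairing each element with its index (Scalars and Mask are illustrative):

for (auto [Idx, V] : llvm::enumerate(Scalars))
  Mask[Idx] = isa<UndefValue>(V) ? PoisonMaskElem : (int)Idx;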
auto pred_end(const MachineBasicBlock *BB)
void set_intersect(S1Ty &S1, const S2Ty &S2)
set_intersect(A, B) - Compute A := A ^ B. Identical to set_intersection, except that it works on set<>...
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
LLVM_ABI bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
LLVM_ABI void salvageDebugInfo(const MachineRegisterInfo &MRI, MachineInstr &MI)
Assuming the instruction MI is going to be deleted, attempt to salvage debug users of MI by writing t...
constexpr from_range_t from_range
static bool isUsedOutsideBlock(Value *V)
Checks if the provided value does not require scheduling.
LLVM_ABI std::pair< Intrinsic::ID, bool > canConvertToMinOrMaxIntrinsic(ArrayRef< Value * > VL)
Check if the values in VL are select instructions that can be converted to a min or max (vector) intr...
auto dyn_cast_if_present(const Y &Val)
dyn_cast_if_present<X> - Functionally identical to dyn_cast, except that a null (or none in the case ...
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
bool set_is_subset(const S1Ty &S1, const S2Ty &S2)
set_is_subset(A, B) - Return true iff A in B
void interleaveComma(const Container &c, StreamT &os, UnaryFunctor each_fn)
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
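The early-increment form makes erase-while-iterating safe; a sketch over a hypothetical block BB:

for (Instruction &I : llvm::make_early_inc_range(*BB))
  if (isInstructionTriviallyDead(&I))
    I.eraseFromParent(); // Safe: the iterator already advanced past I.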
auto cast_or_null(const Y &Val)
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value that is congruent to Skew modulo Align.
iterator_range< po_iterator< T > > post_order(const T &G)
LLVM_ABI bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr, bool UseVariableInfo=true, bool IgnoreUBImplyingAttrs=true)
Return true if the instruction does not have any effects besides calculating the result and does not ...
LLVM_ABI Instruction * propagateMetadata(Instruction *I, ArrayRef< Value * > VL)
Specifically, let Kinds = [MD_tbaa, MD_alias_scope, MD_noalias, MD_fpmath, MD_nontemporal,...
bool isa_and_nonnull(const Y &Val)
auto binary_search(R &&Range, T &&Value)
Provide wrappers to std::binary_search which take ranges instead of having to pass begin/end explicit...
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
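Together with bit_floor below, a quick illustration:

unsigned Ceil = llvm::bit_ceil(5u);   // 8: smallest power of two >= 5.
unsigned Floor = llvm::bit_floor(5u); // 4: largest power of two <= 5.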
bool isGather(IntrinsicInst *IntInst)
const Value * getPointerOperand(const Value *V)
A helper function that returns the pointer operand of a load, store or GEP instruction.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
LLVM_ABI bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &SQ, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
DomTreeNodeBase< BasicBlock > DomTreeNode
auto dyn_cast_or_null(const Y &Val)
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container:
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
constexpr bool has_single_bit(T Value) noexcept
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
LLVM_ABI bool isInstructionTriviallyDead(Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction is not used, and the instruction has no side effects.
LLVM_ABI llvm::SmallVector< int, 16 > createStrideMask(unsigned Start, unsigned Stride, unsigned VF)
Create a stride shuffle mask.
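A sketch of the mask produced for Start=1, Stride=2, VF=4:

SmallVector<int, 16> Mask = createStrideMask(/*Start=*/1, /*Stride=*/2, /*VF=*/4);
// Mask == {1, 3, 5, 7}: lanes Start, Start+Stride, ..., Start+Stride*(VF-1).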
auto reverse(ContainerTy &&C)
static void inversePermutation(ArrayRef< unsigned > Indices, SmallVectorImpl< int > &Mask)
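This file-local helper builds the mask inverting a permutation, so that Mask[Indices[I]] = I. An illustrative call:

SmallVector<int> Mask;
inversePermutation({2, 0, 1}, Mask); // Mask == {1, 2, 0}.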
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
LLVM_ABI llvm::SmallVector< int, 16 > createReplicatedMask(unsigned ReplicationFactor, unsigned VF)
Create a mask with replicated elements.
auto find_if_not(R &&Range, UnaryPredicate P)
LLVM_ABI bool isSafeToLoadUnconditionally(Value *V, Align Alignment, const APInt &Size, const DataLayout &DL, Instruction *ScanFrom, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr)
Return true if we know that executing a load from this value cannot trap.
bool isa_and_present(const Y &Val)
isa_and_present<X> - Functionally identical to isa, except that a null value is accepted.
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns true if widened type of Ty elements with size Sz represents full vector type,...
bool isPointerTy(const Type *T)
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
auto make_first_range(ContainerTy &&c)
Given a container of pairs, return a range over the first elements.
LLVM_ABI std::optional< int64_t > getPointersDiff(Type *ElemTyA, Value *PtrA, Type *ElemTyB, Value *PtrB, const DataLayout &DL, ScalarEvolution &SE, bool StrictCheck=false, bool CheckType=true)
Returns the distance between the pointers PtrA and PtrB iff they are compatible and it is possible to...
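A hedged sketch checking for consecutive accesses (ElemTy, Ptr0, Ptr1, DL, and SE are assumed to be in scope):

std::optional<int64_t> Diff =
    getPointersDiff(ElemTy, Ptr0, ElemTy, Ptr1, DL, SE, /*StrictCheck=*/true);
if (Diff && *Diff == 1) {
  // Ptr1 addresses the element immediately after Ptr0.
}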
LLVM_ABI bool wouldInstructionBeTriviallyDead(const Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction would have no side effects if it was not used.
bool isModOrRefSet(const ModRefInfo MRI)
bool is_sorted(R &&Range, Compare C)
Wrapper function around std::is_sorted to check if elements in a range R are sorted with respect to a...
LLVM_ABI bool sortPtrAccesses(ArrayRef< Value * > VL, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
Attempt to sort the pointers in VL and return the sorted indices in SortedIndices,...
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
LLVM_ABI void propagateIRFlags(Value *I, ArrayRef< Value * > VL, Value *OpValue=nullptr, bool IncludeWrapFlags=true)
Get the intersection (logical and) of all of the potential IR flags of each scalar operation (VL) tha...
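A sketch, assuming VecOp is the newly created vector instruction for the scalars in VL:

// Keep only the nuw/nsw/fast-math flags common to every scalar in VL.
propagateIRFlags(VecOp, VL);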
LLVM_ATTRIBUTE_VISIBILITY_DEFAULT AnalysisKey InnerAnalysisManagerProxy< AnalysisManagerT, IRUnitT, ExtraArgTs... >::Key
MutableArrayRef(T &OneElt) -> MutableArrayRef< T >
constexpr int PoisonMaskElem
iterator_range(Container &&) -> iterator_range< llvm::detail::IterOfRange< Container > >
@ Ref
The access may reference the value stored in memory.
@ LLVM_MARK_AS_BITMASK_ENUM
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
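A quick worked example of the rounding:

static_assert(llvm::divideCeil(7u, 4u) == 2u); // ceil(7/4) == 2.
static_assert(llvm::divideCeil(8u, 4u) == 2u); // Exact division stays exact.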
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
LLVM_ABI CmpInst::Predicate getMinMaxReductionPredicate(RecurKind RK)
Returns the comparison predicate used when expanding a min/max reduction.
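For example, a signed-min reduction expands through a signed-less-than compare:

CmpInst::Predicate Pred = getMinMaxReductionPredicate(RecurKind::SMin);
// Pred == CmpInst::ICMP_SLT; each reduction step becomes icmp slt + select.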
RecurKind
These are the kinds of recurrences that we support.
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ Xor
Bitwise or logical XOR of integers.
@ And
Bitwise or logical AND of integers.
LLVM_ABI bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic has a scalar operand.
static bool areAllOperandsNonInsts(Value *V)
Checks if the provided value does not require scheduling.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
DWARFExpression::Operation Op
auto max_element(R &&Range)
Provide wrappers to std::max_element which take ranges instead of having to pass begin/end explicitly...
void ViewGraph(const GraphType &G, const Twine &Name, bool ShortNames=false, const Twine &Title="", GraphProgram::Name Program=GraphProgram::DOT)
ViewGraph - Emit a dot graph, run 'dot', run gv on the postscript file, then clean up.
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI unsigned ComputeNumSignBits(const Value *Op, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Return the number of times the sign bit of the register is replicated into the other bits.
OutputIt copy(R &&Range, OutputIt Out)
auto make_second_range(ContainerTy &&c)
Given a container of pairs, return a range over the second elements.
static bool doesNotNeedToSchedule(ArrayRef< Value * > VL)
Checks if the specified array of instructions does not require scheduling.
constexpr unsigned BitWidth
LLVM_ABI bool isGuaranteedToTransferExecutionToSuccessor(const Instruction *I)
Return true if this function can prove that the instruction I will always transfer execution to one o...
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
static unsigned getNumberOfParts(const TargetTransformInfo &TTI, VectorType *VecTy, const unsigned Limit=std::numeric_limits< unsigned >::max())
Returns the number of parts the type VecTy will be split into at the codegen phase.
auto pred_begin(const MachineBasicBlock *BB)
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
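A short sketch (Mask is an illustrative SmallVector<int>):

for (unsigned I : llvm::seq<unsigned>(0, 4))
  Mask.push_back(I); // Visits 0, 1, 2, 3; the end bound 4 is excluded.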
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
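A sketch folding two fields into one code, much as a DenseMapInfo specialization like the EdgeInfo one below might (EI is illustrative):

llvm::hash_code H = llvm::hash_combine(EI.UserTE, EI.EdgeIdx);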
template class LLVM_TEMPLATE_ABI DomTreeNodeBase< BasicBlock >
bool equal(L &&LRange, R &&RRange)
Wrapper function around std::equal to detect if pair-wise elements between two ranges are the same.
LLVM_ABI bool isGuaranteedNotToBePoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Returns true if V cannot be poison, but may be undef.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
LLVM_ABI const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=MaxLookupSearchDepth)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
LLVM_ABI Constant * ConstantFoldIntegerCast(Constant *C, Type *DestTy, bool IsSigned, const DataLayout &DL)
Constant fold a zext, sext or trunc, depending on IsSigned and whether the DestTy is wider or narrowe...
LLVM_ABI bool isKnownNonNegative(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Returns true if the given value is known to be non-negative.
LLVM_ABI bool mayHaveNonDefUseDependency(const Instruction &I)
Returns true if the result or effects of the given instruction I depend on values not reachable through...
LLVM_ABI bool isTriviallyVectorizable(Intrinsic::ID ID)
Identify if the intrinsic is trivially vectorizable.
constexpr detail::IsaCheckPredicate< Types... > IsaPred
Function object wrapper for the llvm::isa type check.
LLVM_ABI bool isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic is overloaded on the type of the operand at index OpdI...
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Used to keep track of an operand bundle.
static LLVM_ABI void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
BoUpSLP::TreeEntry TreeEntry
std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R)
static std::string getNodeAttributes(const TreeEntry *Entry, const BoUpSLP *)
DOTGraphTraits(bool IsSimple=false)
DOTGraphTraits - Template class that can be specialized to customize how graphs are converted to 'dot...
DefaultDOTGraphTraits(bool simple=false)
DenseMapInfo< BoUpSLP::TreeEntry * > FirstInfo
static bool isEqual(const BoUpSLP::EdgeInfo &LHS, const BoUpSLP::EdgeInfo &RHS)
static BoUpSLP::EdgeInfo getEmptyKey()
DenseMapInfo< unsigned > SecondInfo
static unsigned getHashValue(const BoUpSLP::EdgeInfo &Val)
static BoUpSLP::EdgeInfo getTombstoneKey()
An information struct used to provide DenseMap with the various necessary components for a given valu...
Add the VectorizableTree to the index iterator to be able to return TreeEntry pointers.
ChildIteratorType(SmallVector< BoUpSLP::EdgeInfo, 1 >::iterator W, ContainerTy &VT)
ContainerTy & VectorizableTree
static ChildIteratorType child_end(NodeRef N)
static NodeRef getEntryNode(BoUpSLP &R)
static ChildIteratorType child_begin(NodeRef N)
static nodes_iterator nodes_begin(BoUpSLP *R)
TreeEntry * NodeRef
NodeRef has to be a pointer per the GraphWriter.
static unsigned size(BoUpSLP *R)
BoUpSLP::TreeEntry TreeEntry
static nodes_iterator nodes_end(BoUpSLP *R)
BoUpSLP::TreeEntry::VecTreeTy ContainerTy
Incoming for lane mask phi as machine instruction; incoming register Reg and incoming block Block are...
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
TargetTransformInfo * TTI
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
bool runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_, TargetLibraryInfo *TLI_, AAResults *AA_, LoopInfo *LI_, DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_, OptimizationRemarkEmitter *ORE_)
A MapVector that performs no allocations if smaller than a certain size.
static VFShape get(const FunctionType *FTy, ElementCount EC, bool HasGlobalPred)
Retrieve the basic vectorization shape of the function, where all parameters are mapped to VFParamKin...
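A hedged sketch querying a 4-lane, unpredicated vector variant of a call CI (the surrounding VFDatabase usage is assumed):

VFShape Shape = VFShape::get(CI.getFunctionType(),
                             ElementCount::getFixed(4),
                             /*HasGlobalPred=*/false);
Function *VecF = VFDatabase(CI).getVectorizedFunction(Shape);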
Function object to check whether the first component of a container supported by std::get (like std::...
Function object to check whether the second component of a container supported by std::get (like std:...
This structure holds any data we need about the edges being traversed during buildTreeRec().
unsigned EdgeIdx
The operand index of the use.
EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
LLVM_DUMP_METHOD void dump() const
TreeEntry * UserTE
The user TreeEntry.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::EdgeInfo &EI)
void dump(raw_ostream &OS) const
Debug print.
bool operator==(const EdgeInfo &Other) const
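A minimal sketch of how such an edge is described (UserTE is a hypothetical TreeEntry*; BoUpSLP is internal to this pass, so the snippet is purely illustrative):

BoUpSLP::EdgeInfo EI(UserTE, /*EdgeIdx=*/1); // Child feeds operand 1 of UserTE.
assert(EI.UserTE == UserTE && EI.EdgeIdx == 1 && "unexpected edge");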