#ifdef EXPENSIVE_CHECKS
// ...
#endif

using namespace std::placeholders;

#define SV_NAME "slp-vectorizer"
#define DEBUG_TYPE "SLP"

STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
116 "Controls which SLP graphs should be vectorized.");
120 cl::desc(
"Run the SLP vectorization passes"));
124 cl::desc(
"Enable vectorization for wider vector utilization"));
128 cl::desc(
"Only vectorize if you gain more than this "
133 cl::desc(
"When true, SLP vectorizer bypasses profitability checks based on "
134 "heuristics and makes vectorization decision via cost modeling."));
138 cl::desc(
"Attempt to vectorize horizontal reductions"));
143 "Attempt to vectorize horizontal reductions feeding into a store"));
147 cl::desc(
"Improve the code quality by splitting alternate instructions"));
151 cl::desc(
"Attempt to vectorize for this register size in bits"));
155 cl::desc(
"Maximum SLP vectorization factor (0=unlimited)"));
163 cl::desc(
"Limit the size of the SLP scheduling region per block"));
167 cl::desc(
"Attempt to vectorize for this register size in bits"));
171 cl::desc(
"Limit the recursion depth when building a vectorizable tree"));
175 cl::desc(
"Only vectorize small trees if they are fully vectorizable"));
181 cl::desc(
"The maximum look-ahead depth for operand reordering scores"));
190 cl::desc(
"The maximum look-ahead depth for searching best rooting option"));
194 cl::desc(
"The minimum number of loads, which should be considered strided, "
195 "if the stride is > 1 or is runtime value"));
199 cl::desc(
"The maximum stride, considered to be profitable."));
static cl::opt<bool> DisableTreeReorder(
    "slp-disable-tree-reorder", cl::init(false), cl::Hidden,
    cl::desc("Disable tree reordering even if it is "
             "profitable. Used for testing only."));

static cl::opt<bool> ForceStridedLoads(
    "slp-force-strided-loads", cl::init(false), cl::Hidden,
    cl::desc("Generate strided loads even if they are not "
             "profitable. Used for testing only."));
static cl::opt<bool>
    ViewSLPTree("view-slp-tree", cl::Hidden,
                cl::desc("Display the SLP trees with Graphviz"));

static cl::opt<bool> VectorizeNonPowerOf2(
    "slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden,
    cl::desc("Try to vectorize with non-power-of-2 number of elements."));

static cl::opt<bool> VectorizeCopyableElements(
    "slp-copyable-elements", cl::init(true), cl::Hidden,
    cl::desc("Try to replace values with the idempotent instructions for "
             "better vectorization."));
// Limit the number of alias checks. The limit is chosen so that it has no
// negative effect on the llvm benchmarks.
static bool isValidElementType(Type *Ty) {
  // TODO: Support ScalableVectorType.
  if (SLPReVec && isa<FixedVectorType>(Ty))
    Ty = Ty->getScalarType();
  return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
         !Ty->isPPC_FP128Ty();
}

/// Returns the type of the given value/instruction \p V.
static Type *getValueType(Value *V) {
  if (auto *SI = dyn_cast<StoreInst>(V))
    return SI->getValueOperand()->getType();
  if (auto *CI = dyn_cast<CmpInst>(V))
    return CI->getOperand(0)->getType();
  if (auto *IE = dyn_cast<InsertElementInst>(V))
    return IE->getOperand(1)->getType();
  return V->getType();
}

/// \returns the number of elements for Ty.
static unsigned getNumElements(Type *Ty) {
  assert(!isa<ScalableVectorType>(Ty) &&
         "ScalableVectorType is not supported.");
  if (auto *VecTy = dyn_cast<FixedVectorType>(Ty))
    return VecTy->getNumElements();
  return 1;
}
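// Illustrative note (not from the original source): getValueType() peels the
// "interesting" operand type rather than the instruction's own type, e.g. for
// `store i32 %x, ptr %p` it yields i32, and for `icmp eq i32 %a, %b` it
// yields i32 rather than the i1 result type.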
/// Returns the number of elements of the given type \p Ty, not less than \p
/// Sz, which forms type, which splits by \p TTI into whole vector types during
/// legalization.
static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI,
                                              Type *Ty, unsigned Sz) {
  if (!isValidElementType(Ty))
    return bit_ceil(Sz);
  // Find the number of elements, which forms full vectors.
  const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
  if (NumParts == 0 || NumParts >= Sz)
    return bit_ceil(Sz);
  return bit_ceil(divideCeil(Sz, NumParts)) * NumParts;
}

/// Returns the number of elements of the given type \p Ty, not greater than \p
/// Sz, which forms type, which splits by \p TTI into whole vector types during
/// legalization.
static unsigned
getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty,
                                   unsigned Sz) {
  if (!isValidElementType(Ty))
    return bit_floor(Sz);
  const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
  if (NumParts == 0 || NumParts >= Sz)
    return bit_floor(Sz);
  unsigned RegVF = bit_ceil(divideCeil(Sz, NumParts));
  if (RegVF > Sz)
    return bit_floor(Sz);
  return (Sz / RegVF) * RegVF;
}
static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements,
                                                   SmallVectorImpl<int> &Mask) {
  // The ShuffleBuilder implementation use shufflevector to splat an "element".
  // But the element have different meaning for SLP (scalar) and REVEC
  // (vector). We need to expand Mask into masks which shufflevector can use
  // directly.
  SmallVector<int> NewMask(Mask.size() * VecTyNumElements);
  for (unsigned I : seq<unsigned>(Mask.size()))
    for (auto [J, MaskV] : enumerate(MutableArrayRef(NewMask).slice(
             I * VecTyNumElements, VecTyNumElements)))
      MaskV = Mask[I] == PoisonMaskElem ? PoisonMaskElem
                                        : Mask[I] * VecTyNumElements + J;
  Mask.swap(NewMask);
}
/// \returns the number of groups of shufflevector
/// A group has the following features
/// 1. All of value in a group are shufflevector.
/// 2. The mask of all shufflevector is isExtractSubvectorMask.
/// 3. The mask of all shufflevector uses all of the elements of the source.
static unsigned getShufflevectorNumGroups(ArrayRef<Value *> VL) {
  if (VL.empty())
    return 0;
  if (!all_of(VL, IsaPred<ShuffleVectorInst>))
    return 0;
  auto *SV = cast<ShuffleVectorInst>(VL.front());
  unsigned SVNumElements =
      cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
  unsigned ShuffleMaskSize = SV->getShuffleMask().size();
  if (SVNumElements % ShuffleMaskSize != 0)
    return 0;
  unsigned GroupSize = SVNumElements / ShuffleMaskSize;
  if (GroupSize == 0 || (VL.size() % GroupSize) != 0)
    return 0;
  unsigned NumGroup = 0;
  for (size_t I = 0, E = VL.size(); I != E; I += GroupSize) {
    auto *SV = cast<ShuffleVectorInst>(VL[I]);
    Value *Src = SV->getOperand(0);
    ArrayRef<Value *> Group = VL.slice(I, GroupSize);
    SmallBitVector ExpectedIndex(GroupSize);
    if (!all_of(Group, [&](Value *V) {
          auto *SV = cast<ShuffleVectorInst>(V);
          // From the same source.
          if (SV->getOperand(0) != Src)
            return false;
          int Index;
          if (!SV->isExtractSubvectorMask(Index))
            return false;
          ExpectedIndex.set(Index / ShuffleMaskSize);
          return true;
        }))
      return 0;
    if (!ExpectedIndex.all())
      return 0;
    ++NumGroup;
  }
  assert(NumGroup == (VL.size() / GroupSize) && "Unexpected number of groups");
  return NumGroup;
}
/// \returns a shufflevector mask which is used to vectorize shufflevectors.
static SmallVector<int> calculateShufflevectorMask(ArrayRef<Value *> VL) {
  assert(getShufflevectorNumGroups(VL) && "Not supported shufflevector usage.");
  auto *SV = cast<ShuffleVectorInst>(VL.front());
  unsigned SVNumElements =
      cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
  SmallVector<int> Mask;
  unsigned AccumulateLength = 0;
  for (Value *V : VL) {
    auto *SV = cast<ShuffleVectorInst>(V);
    for (int M : SV->getShuffleMask())
      Mask.push_back(M == PoisonMaskElem ? PoisonMaskElem
                                         : AccumulateLength + M);
    AccumulateLength += SVNumElements;
  }
  return Mask;
}
/// \returns the number of elements actually occupying the \p Part part of the
/// whole size \p Size, split into parts of \p PartNumElems elements each.
static unsigned getNumElems(unsigned Size, unsigned PartNumElems,
                            unsigned Part) {
  return std::min<unsigned>(PartNumElems, Size - Part * PartNumElems);
}
#if !defined(NDEBUG)
/// Print a short descriptor of the instruction bundle suitable for debug
/// output.
static std::string shortBundleName(ArrayRef<Value *> VL, int Idx = -1) {
  std::string Result;
  raw_string_ostream OS(Result);
  if (Idx >= 0)
    OS << "Idx: " << Idx << ", ";
  OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]";
  return Result;
}
#endif
/// \returns true if all of the instructions in \p VL are in the same block or
/// false otherwise.
static bool allSameBlock(ArrayRef<Value *> VL) {
  auto *It = find_if(VL, IsaPred<Instruction>);
  if (It == VL.end())
    return false;
  BasicBlock *BB = cast<Instruction>(*It)->getParent();
  for (Value *V : VL) {
    if (isa<PoisonValue>(V))
      continue;
    auto *II = dyn_cast<Instruction>(V);
    if (!II)
      return false;
    if (BB != II->getParent())
      return false;
  }
  return true;
}

/// \returns True if all of the values in \p VL are identical or some of them
/// are UndefValue.
static bool isSplat(ArrayRef<Value *> VL) {
  Value *FirstNonUndef = nullptr;
  for (Value *V : VL) {
    if (isa<UndefValue>(V))
      continue;
    if (!FirstNonUndef) {
      FirstNonUndef = V;
      continue;
    }
    if (V != FirstNonUndef)
      return false;
  }
  return FirstNonUndef != nullptr;
}
/// \returns True if \p I is commutative, handles CmpInst and BinaryOperator.
static bool isCommutative(Instruction *I, Instruction *ValWithUses,
                          bool IsCopyable = false) {
  if (auto *Cmp = dyn_cast<CmpInst>(I))
    return Cmp->isCommutative();
  if (auto *BO = dyn_cast<BinaryOperator>(I))
    return BO->isCommutative() ||
           (BO->getOpcode() == Instruction::Sub &&
            !ValWithUses->hasNUsesOrMore(UsesLimit) &&
            all_of(
                ValWithUses->uses(),
                [&](const Use &U) {
                  // Commutative, if icmp eq/ne sub, 0
                  CmpPredicate Pred;
                  if (match(U.getUser(),
                            m_ICmp(Pred, m_Specific(U.get()), m_Zero())) &&
                      (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE))
                    return true;
                  // Commutative, if abs(sub nsw, true) or abs(sub, false).
                  ConstantInt *Flag;
                  auto *I = dyn_cast<BinaryOperator>(U.get());
                  return match(U.getUser(),
                               m_Intrinsic<Intrinsic::abs>(
                                   m_Specific(U.get()), m_ConstantInt(Flag))) &&
                         ((!IsCopyable && I && !I->hasNoSignedWrap()) ||
                          Flag->isOne());
                })) ||
           (BO->getOpcode() == Instruction::FSub &&
            !ValWithUses->hasNUsesOrMore(UsesLimit) &&
            all_of(ValWithUses->uses(), [](const Use &U) {
              return match(U.getUser(),
                           m_Intrinsic<Intrinsic::fabs>(m_Specific(U.get())));
            }));
  return I->isCommutative();
}
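// Illustrative example (not from the original source): a subtraction such as
//   %d = sub i32 %a, %b
// whose only user is `icmp eq i32 %d, 0` can be treated as commutative here,
// because (a - b) == 0 and (b - a) == 0 are equivalent; the same holds for a
// `sub` consumed only by llvm.abs, and an `fsub` consumed only by llvm.fabs.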
  // IntrinsicInst::isCommutative returns true if swapping the first "two"
  // arguments to the intrinsic produces the same result, so only the first two
  // operands are treated as potentially commutative.
  if (isa<IntrinsicInst>(I)) {
    constexpr unsigned IntrinsicNumOperands = 2;
    return IntrinsicNumOperands;
  }
  return I->getNumOperands();
}
template <typename T>
static std::optional<unsigned> getInsertExtractIndex(const Value *Inst,
                                                     unsigned Offset) {
  static_assert(std::is_same_v<T, InsertElementInst> ||
                    std::is_same_v<T, ExtractElementInst>,
                "unsupported T");
  // ...
}

/// \returns inserting or extracting index of InsertElement, ExtractElement or
/// InsertValue instruction, using Offset as base offset for index.
static std::optional<unsigned> getElementIndex(const Value *Inst,
                                               unsigned Offset = 0) {
  int Index = Offset;
  if (const auto *IE = dyn_cast<InsertElementInst>(Inst)) {
    const auto *VT = dyn_cast<FixedVectorType>(IE->getType());
    if (!VT)
      return std::nullopt;
    const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
    if (!CI)
      return std::nullopt;
    if (CI->getValue().uge(VT->getNumElements()))
      return std::nullopt;
    Index *= VT->getNumElements();
    Index += CI->getZExtValue();
    return Index;
  }
  if (const auto *IV = dyn_cast<InsertValueInst>(Inst)) {
    Type *CurrentType = IV->getType();
    for (unsigned I : IV->indices()) {
      if (const auto *ST = dyn_cast<StructType>(CurrentType)) {
        Index *= ST->getNumElements();
        CurrentType = ST->getElementType(I);
      } else if (const auto *AT = dyn_cast<ArrayType>(CurrentType)) {
        Index *= AT->getNumElements();
        CurrentType = AT->getElementType();
      } else {
        return std::nullopt;
      }
      Index += I;
    }
    return Index;
  }
  return std::nullopt;
}
/// Checks if all values in \p VL share the same opcode (allowing the same
/// predicate for compares and poison values).
static bool allSameOpcode(ArrayRef<Value *> VL) {
  auto *It = find_if(VL, IsaPred<Instruction>);
  if (It == VL.end())
    return true;
  Instruction *MainOp = cast<Instruction>(*It);
  unsigned Opcode = MainOp->getOpcode();
  CmpInst::Predicate BasePred = CmpInst::BAD_ICMP_PREDICATE;
  if (auto *CI = dyn_cast<CmpInst>(MainOp))
    BasePred = CI->getPredicate();
  return std::all_of(It, VL.end(), [&](Value *V) {
    if (auto *CI = dyn_cast<CmpInst>(V))
      return BasePred == CI->getPredicate();
    if (auto *I = dyn_cast<Instruction>(V))
      return I->getOpcode() == Opcode;
    return isa<PoisonValue>(V);
  });
}
/// Prepares a use bitset for the given mask either for the first argument or
/// the second.
static SmallBitVector buildUseMask(int VF, ArrayRef<int> Mask,
                                   UseMask MaskArg) {
  SmallBitVector UseMask(VF, true);
  for (auto [Idx, Value] : enumerate(Mask)) {
    if (Value == PoisonMaskElem) {
      if (MaskArg == UseMask::UndefsAsMask)
        UseMask.reset(Idx);
      continue;
    }
    if (MaskArg == UseMask::FirstArg && Value < VF)
      UseMask.reset(Value);
    else if (MaskArg == UseMask::SecondArg && Value >= VF)
      UseMask.reset(Value - VF);
  }
  return UseMask;
}
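// Worked example (illustrative, not from the original source): with VF = 4,
// Mask = {0, 2, PoisonMaskElem, 5} and MaskArg = UseMask::FirstArg, the bits
// for elements 0 and 2 of the first argument get reset (they are covered by
// the mask), element 5 belongs to the second argument and is skipped, so the
// resulting use mask has only bits 1 and 3 set.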
/// Checks if the given value is actually an undefined constant vector.
/// Also, if the \p UseMask is not empty, tries to check if the non-masked
/// elements actually mask the insertelement buildvector, if any.
template <bool IsPoisonOnly = false>
static SmallBitVector isUndefVector(const Value *V,
                                    const SmallBitVector &UseMask = {}) {
  SmallBitVector Res(UseMask.empty() ? 1 : UseMask.size(), true);
  using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
  if (isa<T>(V))
    return Res;
  auto *VecTy = dyn_cast<FixedVectorType>(V->getType());
  if (!VecTy)
    return Res.reset();
  auto *C = dyn_cast<Constant>(V);
  if (!C) {
    if (!UseMask.empty()) {
      const Value *Base = V;
      while (auto *II = dyn_cast<InsertElementInst>(Base)) {
        Base = II->getOperand(0);
        if (isa<T>(II->getOperand(1)))
          continue;
        std::optional<unsigned> Idx = getElementIndex(II);
        if (!Idx) {
          Res.reset();
          return Res;
        }
        if (*Idx < UseMask.size() && !UseMask.test(*Idx))
          Res.reset(*Idx);
      }
      // TODO: Add analysis for shuffles here too.
      if (V == Base)
        Res.reset();
      else
        Res &= isUndefVector<IsPoisonOnly>(Base, UseMask);
    } else {
      Res.reset();
    }
    return Res;
  }
  for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
    if (Constant *Elem = C->getAggregateElement(I))
      if (!isa<T>(Elem) &&
          (UseMask.empty() || (I < UseMask.size() && !UseMask.test(I))))
        Res.reset(I);
  }
  return Res;
}
/// Checks if the vector of instructions can be represented as a shuffle, like:
/// a sequence of extractelement instructions from at most two source vectors
/// forms either a select/blend or a permutation.
static std::optional<TargetTransformInfo::ShuffleKind>
isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
                     AssumptionCache *AC) {
  const auto *It = find_if(VL, IsaPred<ExtractElementInst>);
  if (It == VL.end())
    return std::nullopt;
  unsigned Size =
      std::accumulate(VL.begin(), VL.end(), 0u, [](unsigned S, Value *V) {
        auto *EI = dyn_cast<ExtractElementInst>(V);
        if (!EI)
          return S;
        auto *VTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
        if (!VTy)
          return S;
        return std::max(S, VTy->getNumElements());
      });

  Value *Vec1 = nullptr;
  Value *Vec2 = nullptr;
  bool HasNonUndefVec = any_of(VL, [&](Value *V) {
    auto *EE = dyn_cast<ExtractElementInst>(V);
    if (!EE)
      return false;
    Value *Vec = EE->getVectorOperand();
    if (isa<UndefValue>(Vec))
      return false;
    return isGuaranteedNotToBePoison(Vec, AC);
  });
  enum ShuffleMode { Unknown, Select, Permute };
  ShuffleMode CommonShuffleMode = Unknown;
  Mask.assign(VL.size(), PoisonMaskElem);
  for (unsigned I = 0, E = VL.size(); I < E; ++I) {
    // Undef can be represented as an undef element in a vector.
    if (isa<UndefValue>(VL[I]))
      continue;
    auto *EI = cast<ExtractElementInst>(VL[I]);
    if (isa<ScalableVectorType>(EI->getVectorOperandType()))
      return std::nullopt;
    auto *Vec = EI->getVectorOperand();
    // We can extractelement from undef or poison vector.
    if (isUndefVector</*IsPoisonOnly=*/true>(Vec).all())
      continue;
    // All vector operands must have the same number of vector elements.
    if (isa<UndefValue>(Vec)) {
      Mask[I] = I;
    } else {
      if (cast<FixedVectorType>(Vec->getType())->getNumElements() != Size)
        return std::nullopt;
      if (isa<UndefValue>(EI->getIndexOperand()))
        continue;
      auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
      if (!Idx)
        return std::nullopt;
      // Undefined behavior if Idx is negative or >= Size.
      if (Idx->getValue().uge(Size))
        continue;
      unsigned IntIdx = Idx->getValue().getZExtValue();
      Mask[I] = IntIdx;
    }
    if (isUndefVector(Vec).all() && HasNonUndefVec)
      continue;
    // For correct shuffling we have to have at most 2 different vector operands
    // in all extractelement instructions.
    if (!Vec1 || Vec1 == Vec) {
      Vec1 = Vec;
    } else if (!Vec2 || Vec2 == Vec) {
      Vec2 = Vec;
      Mask[I] += Size;
    } else {
      return std::nullopt;
    }
    if (CommonShuffleMode == Permute)
      continue;
    // If the extract index is not the same as the operation number, it is a
    // permutation.
    if (Mask[I] % Size != I) {
      CommonShuffleMode = Permute;
      continue;
    }
    CommonShuffleMode = Select;
  }
  // If we're not crossing lanes in different vectors, consider it as blending.
  if (CommonShuffleMode == Select && Vec2)
    return TargetTransformInfo::SK_Select;
  // If Vec2 was never used, we have a permutation of a single vector,
  // otherwise we have permutation of 2 vectors.
  return Vec2 ? TargetTransformInfo::SK_PermuteTwoSrc
              : TargetTransformInfo::SK_PermuteSingleSrc;
}
/// \returns the extracted index of the given extractelement or extractvalue
/// instruction \p E, if it is a constant single index.
static std::optional<unsigned> getExtractIndex(const Instruction *E) {
  unsigned Opcode = E->getOpcode();
  assert((Opcode == Instruction::ExtractElement ||
          Opcode == Instruction::ExtractValue) &&
         "Expected extractelement or extractvalue instruction.");
  if (Opcode == Instruction::ExtractElement) {
    auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
    if (!CI)
      return std::nullopt;
    return CI->getZExtValue();
  }
  auto *EI = cast<ExtractValueInst>(E);
  if (EI->getNumIndices() != 1)
    return std::nullopt;
  return *EI->idx_begin();
}
namespace {
/// Helper class that determines whether a list of binary operators with
/// different opcodes can still be vectorized as a single opcode, by rewriting
/// operands of "interchangeable" instructions.
class BinOpSameOpcodeHelper {
  using MaskType = std::uint_fast16_t;
  /// Sorted in the ascending order of the interchangeability.
  constexpr static std::initializer_list<unsigned> SupportedOp = {
      Instruction::Add, Instruction::Sub, Instruction::Mul, Instruction::Shl,
      Instruction::AShr, Instruction::And, Instruction::Or, Instruction::Xor};
  enum : MaskType {
    ShlBIT = 0b1,
    AShrBIT = 0b10,
    MulBIT = 0b100,
    AddBIT = 0b1000,
    SubBIT = 0b10000,
    AndBIT = 0b100000,
    OrBIT = 0b1000000,
    XorBIT = 0b10000000,
    MainOpBIT = 0b100000000,
  };
  /// Return a non-nullptr if either operand of I is a ConstantInt.
  /// The second return value represents the operand position of the
  /// ConstantInt.
  static std::pair<ConstantInt *, unsigned>
  isBinOpWithConstantInt(const Instruction *I) {
    unsigned Opcode = I->getOpcode();
    auto *BinOp = cast<BinaryOperator>(I);
    if (auto *CI = dyn_cast<ConstantInt>(BinOp->getOperand(1)))
      return {CI, 1};
    // Non-commutative opcodes cannot have their constant on the left side.
    if (Opcode == Instruction::Sub || Opcode == Instruction::Shl ||
        Opcode == Instruction::AShr)
      return {nullptr, 0};
    if (auto *CI = dyn_cast<ConstantInt>(BinOp->getOperand(0)))
      return {CI, 0};
    return {nullptr, 0};
  }
  struct InterchangeableInfo {
    const Instruction *I = nullptr;
    /// The bits represent the opcodes that I can be converted to.
    MaskType Mask = MainOpBIT | XorBIT | OrBIT | AndBIT | SubBIT | AddBIT |
                    MulBIT | AShrBIT | ShlBIT;
    /// We cannot create an interchangeable instruction that does not exist in
    /// VL. SeenBefore tracks the opcodes seen so far.
    MaskType SeenBefore = 0;
    InterchangeableInfo(const Instruction *I) : I(I) {}
    /// Narrow the candidate mask; returns false if the intersection is empty,
    /// which allows BinOpSameOpcodeHelper to fall back to an alternate
    /// instruction.
    bool trySet(MaskType OpcodeInMaskForm, MaskType InterchangeableMask) {
      if (Mask & InterchangeableMask) {
        SeenBefore |= OpcodeInMaskForm;
        Mask &= InterchangeableMask;
        return true;
      }
      return false;
    }
    bool equal(unsigned Opcode) {
      return Opcode == I->getOpcode() && trySet(MainOpBIT, MainOpBIT);
    }
    unsigned getOpcode() const {
      MaskType Candidate = Mask & SeenBefore;
      if (Candidate & MainOpBIT)
        return I->getOpcode();
      if (Candidate & ShlBIT)
        return Instruction::Shl;
      if (Candidate & AShrBIT)
        return Instruction::AShr;
      if (Candidate & MulBIT)
        return Instruction::Mul;
      if (Candidate & AddBIT)
        return Instruction::Add;
      if (Candidate & SubBIT)
        return Instruction::Sub;
      if (Candidate & AndBIT)
        return Instruction::And;
      if (Candidate & OrBIT)
        return Instruction::Or;
      if (Candidate & XorBIT)
        return Instruction::Xor;
      llvm_unreachable("Cannot find interchangeable instruction.");
    }
    /// Checks if the \p Opcode is still a candidate after the narrowing.
    bool hasCandidateOpcode(unsigned Opcode) const {
      MaskType Candidate = Mask & SeenBefore;
      switch (Opcode) {
      case Instruction::Shl:
        return Candidate & ShlBIT;
      case Instruction::AShr:
        return Candidate & AShrBIT;
      case Instruction::Mul:
        return Candidate & MulBIT;
      case Instruction::Add:
        return Candidate & AddBIT;
      case Instruction::Sub:
        return Candidate & SubBIT;
      case Instruction::And:
        return Candidate & AndBIT;
      case Instruction::Or:
        return Candidate & OrBIT;
      case Instruction::Xor:
        return Candidate & XorBIT;
      case Instruction::LShr:
      case Instruction::FAdd:
      case Instruction::FSub:
      case Instruction::FMul:
      case Instruction::SDiv:
      case Instruction::UDiv:
      case Instruction::FDiv:
      case Instruction::SRem:
      case Instruction::URem:
      case Instruction::FRem:
        return false;
      default:
        break;
      }
      llvm_unreachable("Cannot find interchangeable instruction.");
    }
    /// Builds the operand list of I rewritten as the \p To opcode, converting
    /// the constant operand accordingly.
    SmallVector<Value *> getOperand(const Instruction *To) const {
      unsigned ToOpcode = To->getOpcode();
      unsigned FromOpcode = I->getOpcode();
      if (FromOpcode == ToOpcode)
        return SmallVector<Value *>(I->operands());
      auto [CI, Pos] = isBinOpWithConstantInt(I);
      const APInt &FromCIValue = CI->getValue();
      unsigned FromCIValueBitWidth = FromCIValue.getBitWidth();
      APInt ToCIValue;
      switch (FromOpcode) {
      case Instruction::Shl:
        if (ToOpcode == Instruction::Mul) {
          // x << C  ==>  x * (1 << C)
          ToCIValue = APInt::getOneBitSet(FromCIValueBitWidth,
                                          FromCIValue.getZExtValue());
        } else {
          assert(FromCIValue.isZero() && "Cannot convert the instruction.");
          ToCIValue = ToOpcode == Instruction::And
                          ? APInt::getAllOnes(FromCIValueBitWidth)
                          : APInt::getZero(FromCIValueBitWidth);
        }
        break;
      case Instruction::Mul:
        if (ToOpcode == Instruction::Shl) {
          // x * (2**C)  ==>  x << C
          ToCIValue = APInt(FromCIValueBitWidth, FromCIValue.logBase2());
        } else {
          assert(FromCIValue.isOne() && "Cannot convert the instruction.");
          ToCIValue = ToOpcode == Instruction::And
                          ? APInt::getAllOnes(FromCIValueBitWidth)
                          : APInt::getZero(FromCIValueBitWidth);
        }
        break;
      case Instruction::Add:
      case Instruction::Sub:
        if (FromCIValue.isZero()) {
          ToCIValue = APInt::getZero(FromCIValueBitWidth);
        } else {
          assert(is_contained({Instruction::Add, Instruction::Sub}, ToOpcode) &&
                 "Cannot convert the instruction.");
          ToCIValue = FromCIValue;
          ToCIValue.negate();
        }
        break;
      case Instruction::And:
        ToCIValue = ToOpcode == Instruction::Mul
                        ? APInt::getOneBitSet(FromCIValueBitWidth, 0)
                        : APInt::getZero(FromCIValueBitWidth);
        break;
      default:
        assert(FromCIValue.isZero() && "Cannot convert the instruction.");
        ToCIValue = APInt::getZero(FromCIValueBitWidth);
        break;
      }
      Value *LHS = I->getOperand(1 - Pos);
      Constant *RHS =
          ConstantInt::get(I->getOperand(Pos)->getType(), ToCIValue);
      if (Pos == 1 ||
          ((FromOpcode == Instruction::Add || FromOpcode == Instruction::Or ||
            FromOpcode == Instruction::Xor) &&
           ToOpcode == Instruction::Sub))
        return SmallVector<Value *>({LHS, RHS});
      return SmallVector<Value *>({RHS, LHS});
    }
  };
  InterchangeableInfo MainOp;
  InterchangeableInfo AltOp;
  bool isValidForAlternation(unsigned Opcode) const {
    return ::isValidForAlternation(MainOp.I->getOpcode()) &&
           ::isValidForAlternation(Opcode);
  }
  bool initializeAltOp(const Instruction *I) {
    if (AltOp.I)
      return true;
    if (!isValidForAlternation(I->getOpcode()))
      return false;
    AltOp.I = I;
    return true;
  }

public:
  BinOpSameOpcodeHelper(const Instruction *MainOp,
                        const Instruction *AltOp = nullptr)
      : MainOp(MainOp), AltOp(AltOp) {}
  bool add(const Instruction *I) {
    assert(isa<BinaryOperator>(I) &&
           "BinOpSameOpcodeHelper only accepts BinaryOperator.");
    unsigned Opcode = I->getOpcode();
    MaskType OpcodeInMaskForm;
    switch (Opcode) {
    case Instruction::Shl:
      OpcodeInMaskForm = ShlBIT;
      break;
    case Instruction::AShr:
      OpcodeInMaskForm = AShrBIT;
      break;
    case Instruction::Mul:
      OpcodeInMaskForm = MulBIT;
      break;
    case Instruction::Add:
      OpcodeInMaskForm = AddBIT;
      break;
    case Instruction::Sub:
      OpcodeInMaskForm = SubBIT;
      break;
    case Instruction::And:
      OpcodeInMaskForm = AndBIT;
      break;
    case Instruction::Or:
      OpcodeInMaskForm = OrBIT;
      break;
    case Instruction::Xor:
      OpcodeInMaskForm = XorBIT;
      break;
    default:
      return MainOp.equal(Opcode) ||
             (initializeAltOp(I) && AltOp.equal(Opcode));
    }
    MaskType InterchangeableMask = OpcodeInMaskForm;
    ConstantInt *CI = isBinOpWithConstantInt(I).first;
    if (CI) {
      constexpr MaskType CanBeAll =
          XorBIT | OrBIT | AndBIT | SubBIT | AddBIT | MulBIT | AShrBIT | ShlBIT;
      const APInt &CIValue = CI->getValue();
      switch (Opcode) {
      case Instruction::Shl:
        InterchangeableMask = CIValue.isZero() ? CanBeAll : MulBIT | ShlBIT;
        break;
      case Instruction::Mul:
        if (CIValue.isOne()) {
          InterchangeableMask = CanBeAll;
          break;
        }
        if (CIValue.isPowerOf2())
          InterchangeableMask = MulBIT | ShlBIT;
        break;
      case Instruction::Add:
      case Instruction::Sub:
        InterchangeableMask = CIValue.isZero() ? CanBeAll : SubBIT | AddBIT;
        break;
      case Instruction::Xor:
        if (CIValue.isZero())
          InterchangeableMask = XorBIT | OrBIT | AndBIT | SubBIT | AddBIT;
        break;
      default:
        if (CIValue.isZero())
          InterchangeableMask = CanBeAll;
        break;
      }
    }
    return MainOp.trySet(OpcodeInMaskForm, InterchangeableMask) ||
           (initializeAltOp(I) &&
            AltOp.trySet(OpcodeInMaskForm, InterchangeableMask));
  }
  unsigned getMainOpcode() const { return MainOp.getOpcode(); }
  bool hasCandidateOpcode(unsigned Opcode) const {
    return MainOp.hasCandidateOpcode(Opcode);
  }
  bool hasAltOp() const { return AltOp.I; }
  unsigned getAltOpcode() const {
    return hasAltOp() ? AltOp.getOpcode() : getMainOpcode();
  }
  SmallVector<Value *> getOperand(const Instruction *I) const {
    return MainOp.getOperand(I);
  }
};
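// Illustrative sketch (not part of the original file): how the
// interchangeable-opcode mask narrows. Given VL = { x << 1, y * 4 }, adding
// `x << 1` keeps {Mul, Shl} (shift by a non-zero constant), and adding
// `y * 4` also keeps {Mul, Shl} (multiply by a power of two), so both lanes
// can be emitted with a single opcode:
//   BinOpSameOpcodeHelper Helper(ShlInst);
//   Helper.add(ShlInst);                    // Mask &= MulBIT | ShlBIT
//   Helper.add(MulInst);                    // Mask stays MulBIT | ShlBIT
//   unsigned Opc = Helper.getMainOpcode();  // picks a common opcode
// getOperand() then rewrites the constant, e.g. `x << 1` becomes `x * 2`.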
/// Main data required for vectorization of instructions.
class InstructionsState {
  /// The main/alternate instruction. MainOp is also VL0.
  Instruction *MainOp = nullptr;
  Instruction *AltOp = nullptr;
  /// Wether the instruction state contains copyable instructions.
  bool HasCopyables = false;

public:
  Instruction *getMainOp() const {
    assert(valid() && "InstructionsState is invalid.");
    return MainOp;
  }

  Instruction *getAltOp() const {
    assert(valid() && "InstructionsState is invalid.");
    return AltOp;
  }

  /// The main opcode for the list of instructions.
  unsigned getOpcode() const { return getMainOp()->getOpcode(); }

  /// The alternate opcode for the list of instructions.
  unsigned getAltOpcode() const { return getAltOp()->getOpcode(); }

  /// Some of the instructions in the list have alternate opcodes.
  bool isAltShuffle() const { return getMainOp() != getAltOp(); }
  /// Returns the matching main or alternate operation for \p I, or nullptr if
  /// \p I matches neither (even after interchangeable-opcode conversion).
  Instruction *getMatchingMainOpOrAltOp(Instruction *I) const {
    assert(MainOp && "MainOp cannot be nullptr.");
    if (I->getOpcode() == MainOp->getOpcode())
      return MainOp;
    assert(AltOp && "AltOp cannot be nullptr.");
    if (I->getOpcode() == AltOp->getOpcode())
      return AltOp;
    if (!I->isBinaryOp())
      return nullptr;
    BinOpSameOpcodeHelper Converter(MainOp);
    if (!Converter.add(I) || !Converter.add(MainOp))
      return nullptr;
    if (isAltShuffle() &&
        !Converter.hasCandidateOpcode(MainOp->getOpcode())) {
      BinOpSameOpcodeHelper AltConverter(AltOp);
      if (AltConverter.add(I) && AltConverter.add(AltOp) &&
          AltConverter.hasCandidateOpcode(AltOp->getOpcode()))
        return AltOp;
    }
    if (Converter.hasAltOp() && !isAltShuffle())
      return nullptr;
    return Converter.hasAltOp() ? AltOp : MainOp;
  }
  /// Checks if main/alternate instructions are shift operations.
  bool isShiftOp() const {
    return getMainOp()->isShift() && getAltOp()->isShift();
  }

  /// Checks if main/alternate instructions are bitwise logic operations.
  bool isBitwiseLogicOp() const {
    return getMainOp()->isBitwiseLogicOp() && getAltOp()->isBitwiseLogicOp();
  }

  /// Checks if main/alternate instructions are mul/div/rem-like operations.
  bool isMulDivLikeOp() const {
    constexpr std::array<unsigned, 8> MulDiv = {
        Instruction::Mul,  Instruction::FMul, Instruction::SDiv,
        Instruction::UDiv, Instruction::FDiv, Instruction::SRem,
        Instruction::URem, Instruction::FRem};
    return is_contained(MulDiv, getOpcode()) &&
           is_contained(MulDiv, getAltOpcode());
  }

  /// Checks if main/alternate instructions are add/sub-like operations.
  bool isAddSubLikeOp() const {
    constexpr std::array<unsigned, 4> AddSub = {
        Instruction::Add, Instruction::Sub, Instruction::FAdd,
        Instruction::FSub};
    return is_contained(AddSub, getOpcode()) &&
           is_contained(AddSub, getAltOpcode());
  }

  /// Checks if main/alternate instructions are cmp operations.
  bool isCmpOp() const {
    return (getOpcode() == Instruction::ICmp ||
            getOpcode() == Instruction::FCmp) &&
           getAltOpcode() == getOpcode();
  }

  /// Checks if the current state is valid, i.e. has non-null MainOp.
  bool valid() const { return MainOp && AltOp; }

  explicit operator bool() const { return valid(); }
  InstructionsState() = delete;
  InstructionsState(Instruction *MainOp, Instruction *AltOp,
                    bool HasCopyables = false)
      : MainOp(MainOp), AltOp(AltOp), HasCopyables(HasCopyables) {}
  static InstructionsState invalid() { return {nullptr, nullptr}; }
  /// Checks if the value is a copyable element, i.e. it does not match the
  /// main opcode but can be modeled as an idempotent instruction with it.
  bool isCopyableElement(Value *V) const {
    assert(valid() && "InstructionsState is invalid.");
    if (!HasCopyables)
      return false;
    if (isAltShuffle() || getOpcode() == Instruction::GetElementPtr)
      return false;
    auto *I = dyn_cast<Instruction>(V);
    if (!I)
      return !isa<PoisonValue>(V);
    if (I->getParent() != MainOp->getParent() &&
        (!isVectorLikeInstWithConstOps(I) ||
         !isVectorLikeInstWithConstOps(MainOp)))
      return true;
    if (I->getOpcode() == MainOp->getOpcode())
      return false;
    if (!I->isBinaryOp())
      return true;
    BinOpSameOpcodeHelper Converter(MainOp);
    return !Converter.add(I) || !Converter.add(MainOp) ||
           Converter.hasAltOp() ||
           !Converter.hasCandidateOpcode(MainOp->getOpcode());
  }
  /// Checks if the value is non-schedulable.
  bool isNonSchedulable(Value *V) const {
    assert(valid() && "InstructionsState is invalid.");
    auto *I = dyn_cast<Instruction>(V);
    if (!HasCopyables)
      return !I || isa<PHINode>(I) || isVectorLikeInstWithConstOps(I) ||
             doesNotNeedToBeScheduled(V);
    // MainOp for copyables always must be scheduled to correctly identify
    // non-schedulable copyables.
    if (getMainOp() == V)
      return false;
    if (isCopyableElement(V)) {
      auto IsNonSchedulableCopyableElement = [this](Value *V) {
        auto *I = dyn_cast<Instruction>(V);
        return !I || isa<PHINode>(I) ||
               I->getParent() != MainOp->getParent() ||
               (doesNotNeedToBeScheduled(I) &&
                // If the copyable instruction comes after MainOp
                // (non-schedulable, but used in the block) - cannot
                // vectorize it, will possibly generate use before def.
                !MainOp->comesBefore(I));
      };
      return IsNonSchedulableCopyableElement(V);
    }
    return !I || isa<PHINode>(I) || isVectorLikeInstWithConstOps(I) ||
           doesNotNeedToBeScheduled(V);
  }
  /// Checks if the state represents instructions with copyable elements.
  bool areInstructionsWithCopyableElements() const {
    assert(valid() && "InstructionsState is invalid.");
    return HasCopyables;
  }
};
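// Illustrative note (not from the original source): "copyable" modeling lets
// a lane that does not match the main opcode still join the bundle by
// wrapping it in an idempotent instruction. For example, with
// VL = { add %a, %b; %c } and MainOp `add`, the plain value %c can be
// vectorized as the copyable element `add %c, 0`, which computes %c.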
std::pair<Instruction *, SmallVector<Value *>>
convertTo(Instruction *I, const InstructionsState &S) {
  Instruction *SelectedOp = S.getMatchingMainOpOrAltOp(I);
  assert(SelectedOp && "Cannot convert the instruction.");
  if (I->isBinaryOp()) {
    BinOpSameOpcodeHelper Converter(I);
    return std::make_pair(SelectedOp, Converter.getOperand(SelectedOp));
  }
  return std::make_pair(SelectedOp, SmallVector<Value *>(I->operands()));
}
/// Find an instruction with the given \p Opcode in \p VL, if any.
static Instruction *findInstructionWithOpcode(ArrayRef<Value *> VL,
                                              unsigned Opcode) {
  for (Value *V : VL) {
    if (isa<PoisonValue>(V))
      continue;
    auto *Inst = cast<Instruction>(V);
    if (Inst->getOpcode() == Opcode)
      return Inst;
  }
  return nullptr;
}

/// Checks if the provided operands of 2 cmp instructions are compatible, i.e.
/// compatible instructions or constants, etc.
static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0,
                                Value *Op1, const TargetLibraryInfo &TLI) {
  return (isConstant(BaseOp0) && isConstant(Op0)) ||
         (isConstant(BaseOp1) && isConstant(Op1)) ||
         BaseOp0 == Op0 || BaseOp1 == Op1 ||
         getSameOpcode({BaseOp0, Op0}, TLI).valid() ||
         getSameOpcode({BaseOp1, Op1}, TLI).valid();
}

/// \returns true if the predicates of \p BaseCI and \p CI match directly or
/// after swapping the operands of \p CI.
static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI,
                               const TargetLibraryInfo &TLI) {
  assert(BaseCI->getOperand(0)->getType() == CI->getOperand(0)->getType() &&
         "Assessing comparisons of different types?");
  CmpInst::Predicate BasePred = BaseCI->getPredicate();
  CmpInst::Predicate Pred = CI->getPredicate();
  CmpInst::Predicate SwappedPred = CmpInst::getSwappedPredicate(Pred);

  Value *BaseOp0 = BaseCI->getOperand(0);
  Value *BaseOp1 = BaseCI->getOperand(1);
  Value *Op0 = CI->getOperand(0);
  Value *Op1 = CI->getOperand(1);

  return (BasePred == Pred &&
          areCompatibleCmpOps(BaseOp0, BaseOp1, Op0, Op1, TLI)) ||
         (BasePred == SwappedPred &&
          areCompatibleCmpOps(BaseOp0, BaseOp1, Op1, Op0, TLI));
}

/// \returns analysis of the Instructions in \p VL described in
/// InstructionsState, the Opcode that we suppose the whole list
/// could be vectorized even if its structure is diverse.
static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
                                       const TargetLibraryInfo &TLI) {
  // Make sure these are all Instructions.
  if (!all_of(VL, IsaPred<Instruction, PoisonValue>))
    return InstructionsState::invalid();
  auto *It = find_if(VL, IsaPred<Instruction>);
  if (It == VL.end())
    return InstructionsState::invalid();
  unsigned InstCnt = count_if(VL, IsaPred<Instruction>);
  if ((VL.size() > 2 && !isa<PHINode>(*It) && InstCnt < VL.size() / 2) ||
      (VL.size() == 2 && InstCnt < 2))
    return InstructionsState::invalid();
  Instruction *MainOp = cast<Instruction>(*It);
  Instruction *AltOp = MainOp;
  unsigned Opcode = MainOp->getOpcode();
  unsigned AltOpcode = Opcode;
  bool IsCmpOp = isa<CmpInst>(MainOp);
  CmpInst::Predicate BasePred = IsCmpOp ? cast<CmpInst>(MainOp)->getPredicate()
                                        : CmpInst::BAD_ICMP_PREDICATE;

  BinOpSameOpcodeHelper BinOpHelper(MainOp);
  bool SwappedPredsCompatible = IsCmpOp && [&]() {
    SetVector<unsigned> UniquePreds, UniqueNonSwappedPreds;
    UniquePreds.insert(BasePred);
    UniqueNonSwappedPreds.insert(BasePred);
    for (Value *V : VL) {
      auto *I = dyn_cast<CmpInst>(V);
      if (!I)
        return false;
      CmpInst::Predicate CurrentPred = I->getPredicate();
      CmpInst::Predicate SwappedCurrentPred =
          CmpInst::getSwappedPredicate(CurrentPred);
      UniqueNonSwappedPreds.insert(CurrentPred);
      if (!UniquePreds.contains(CurrentPred) &&
          !UniquePreds.contains(SwappedCurrentPred))
        UniquePreds.insert(CurrentPred);
    }
    // Total number of predicates > 2, but if consider swapped predicates
    // compatible only 2, consider swappable predicates as compatible opcodes,
    // not alternate.
    return UniqueNonSwappedPreds.size() > 2 && UniquePreds.size() == 2;
  }();
  // Check for one alternate opcode from another BinaryOperator.
  // TODO - generalize to support all operators (types, calls etc.).
  Intrinsic::ID BaseID = 0;
  SmallVector<VFInfo> BaseMappings;
  if (auto *CallBase = dyn_cast<CallInst>(MainOp)) {
    BaseID = getVectorIntrinsicIDForCall(CallBase, &TLI);
    BaseMappings = VFDatabase(*CallBase).getMappings(*CallBase);
    if (!isTriviallyVectorizable(BaseID) && BaseMappings.empty())
      return InstructionsState::invalid();
  }
  bool AnyPoison = InstCnt != VL.size();
  for (Value *V : iterator_range(It, VL.end())) {
    auto *I = dyn_cast<Instruction>(V);
    if (!I)
      continue;

    // Cannot combine poison and divisions.
    // TODO: do some smart analysis of the CallInsts to exclude divide-like
    // intrinsics/functions only.
    if (AnyPoison && (I->isIntDivRem() || I->isFPDivRem() || isa<CallInst>(I)))
      return InstructionsState::invalid();
    unsigned InstOpcode = I->getOpcode();
    if (MainOp->isBinaryOp() && isa<BinaryOperator>(I)) {
      if (BinOpHelper.add(I))
        continue;
    } else if (isa<CastInst>(MainOp) && isa<CastInst>(I)) {
      Value *Op0 = MainOp->getOperand(0);
      Value *Op1 = I->getOperand(0);
      if (Op0->getType() == Op1->getType()) {
        if (InstOpcode == Opcode || InstOpcode == AltOpcode)
          continue;
        if (Opcode == AltOpcode) {
          assert(isValidForAlternation(Opcode) &&
                 isValidForAlternation(InstOpcode) &&
                 "Cast isn't safe for alternation, logic needs to be updated!");
          AltOpcode = InstOpcode;
          AltOp = I;
          continue;
        }
      }
    } else if (auto *Inst = dyn_cast<CmpInst>(I); Inst && IsCmpOp) {
      auto *BaseInst = cast<CmpInst>(MainOp);
      Type *Ty0 = BaseInst->getOperand(0)->getType();
      Type *Ty1 = Inst->getOperand(0)->getType();
      if (Ty0 == Ty1) {
        assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
        assert(InstOpcode == AltOpcode &&
               "Alternate instructions are only supported by BinaryOperator "
               "and CastInst.");
        // Check for compatible operands. If the corresponding operands are not
        // compatible - need to perform alternate vectorization.
        CmpInst::Predicate CurrentPred = Inst->getPredicate();
        CmpInst::Predicate SwappedCurrentPred =
            CmpInst::getSwappedPredicate(CurrentPred);

        if ((VL.size() == 2 || SwappedPredsCompatible) &&
            (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
          continue;

        if (isCmpSameOrSwapped(BaseInst, Inst, TLI))
          continue;
        auto *AltInst = cast<CmpInst>(AltOp);
        if (MainOp != AltOp) {
          if (isCmpSameOrSwapped(AltInst, Inst, TLI))
            continue;
        } else if (BasePred != CurrentPred) {
          assert(
              isValidForAlternation(InstOpcode) &&
              "CmpInst isn't safe for alternation, logic needs to be updated!");
          AltOp = I;
          continue;
        }
        CmpInst::Predicate AltPred = AltInst->getPredicate();
        if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
            AltPred == CurrentPred || AltPred == SwappedCurrentPred)
          continue;
      }
    } else if (InstOpcode == Opcode) {
      assert(InstOpcode == AltOpcode &&
             "Alternate instructions are only supported by BinaryOperator and "
             "CastInst.");
      if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
        if (Gep->getNumOperands() != 2 ||
            Gep->getOperand(0)->getType() != MainOp->getOperand(0)->getType())
          return InstructionsState::invalid();
      } else if (auto *EI = dyn_cast<ExtractElementInst>(I)) {
        if (!isVectorLikeInstWithConstOps(EI))
          return InstructionsState::invalid();
      } else if (auto *LI = dyn_cast<LoadInst>(I)) {
        auto *BaseLI = cast<LoadInst>(MainOp);
        if (!LI->isSimple() || !BaseLI->isSimple())
          return InstructionsState::invalid();
      } else if (auto *Call = dyn_cast<CallInst>(I)) {
        auto *CallBase = cast<CallInst>(MainOp);
        if (Call->getCalledFunction() != CallBase->getCalledFunction())
          return InstructionsState::invalid();
        if (Call->hasOperandBundles() &&
            (!CallBase->hasOperandBundles() ||
             !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
                         Call->op_begin() + Call->getBundleOperandsEndIndex(),
                         CallBase->op_begin() +
                             CallBase->getBundleOperandsStartIndex())))
          return InstructionsState::invalid();
        Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, &TLI);
        if (ID != BaseID)
          return InstructionsState::invalid();
        if (!ID) {
          SmallVector<VFInfo> Mappings = VFDatabase(*Call).getMappings(*Call);
          if (Mappings.size() != BaseMappings.size() ||
              Mappings.front().ISA != BaseMappings.front().ISA ||
              Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
              Mappings.front().VectorName != BaseMappings.front().VectorName ||
              Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
              Mappings.front().Shape.Parameters !=
                  BaseMappings.front().Shape.Parameters)
            return InstructionsState::invalid();
        }
      }
      continue;
    }
    return InstructionsState::invalid();
  }

  if (MainOp->isBinaryOp()) {
    MainOp = findInstructionWithOpcode(VL, BinOpHelper.getMainOpcode());
    assert(MainOp && "Cannot find MainOp with Opcode from BinOpHelper.");
    AltOp = findInstructionWithOpcode(VL, BinOpHelper.getAltOpcode());
    assert(MainOp && "Cannot find AltOp with Opcode from BinOpHelper.");
  }
  assert((MainOp == AltOp || !allSameOpcode(VL)) &&
         "Incorrect implementation of allSameOpcode.");
  InstructionsState S(MainOp, AltOp);
  assert(S.valid() && "Invalid InstructionsState.");
  return S;
}
/// \returns true if all of the values in \p VL have the same type or false
/// otherwise.
static bool allSameType(ArrayRef<Value *> VL) {
  Type *Ty = VL.front()->getType();
  return all_of(VL, [&](Value *V) { return V->getType() == Ty; });
}
/// \returns True if in-tree use also needs extract. This refers to
/// possible scalar operand in vectorized instruction.
static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
                                        TargetLibraryInfo *TLI,
                                        const TargetTransformInfo *TTI) {
  if (!UserInst)
    return false;
  unsigned Opcode = UserInst->getOpcode();
  switch (Opcode) {
  case Instruction::Load: {
    LoadInst *LI = cast<LoadInst>(UserInst);
    return (LI->getPointerOperand() == Scalar);
  }
  case Instruction::Store: {
    StoreInst *SI = cast<StoreInst>(UserInst);
    return (SI->getPointerOperand() == Scalar);
  }
  case Instruction::Call: {
    CallInst *CI = cast<CallInst>(UserInst);
    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
    return any_of(enumerate(CI->args()), [&](auto &&Arg) {
      return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index(), TTI) &&
             Arg.value().get() == Scalar;
    });
  }
  default:
    return false;
  }
}
/// \returns True if the instruction is not a volatile or atomic load/store.
static bool isSimple(Instruction *I) {
  if (LoadInst *LI = dyn_cast<LoadInst>(I))
    return LI->isSimple();
  if (StoreInst *SI = dyn_cast<StoreInst>(I))
    return SI->isSimple();
  if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
    return !MI->isVolatile();
  return true;
}
/// Shuffles \p Mask in accordance with the given \p SubMask.
/// \param ExtendingManyInputs Supports reshuffling of the mask with not only
/// one but two input vectors (with VF being the second vector size), the
/// <VF, VF + K> values are replaced with just <K> values.
static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask,
                    bool ExtendingManyInputs = false) {
  if (SubMask.empty())
    return;
  assert(
      (!ExtendingManyInputs || SubMask.size() > Mask.size() ||
       // Check if input scalars were extended to match the size of other node.
       (SubMask.size() == Mask.size() && Mask.back() == PoisonMaskElem)) &&
      "SubMask with many inputs support must be larger than the mask.");
  if (Mask.empty()) {
    Mask.append(SubMask.begin(), SubMask.end());
    return;
  }
  SmallVector<int> NewMask(SubMask.size(), PoisonMaskElem);
  int TermValue = std::min(Mask.size(), SubMask.size());
  for (int I = 0, E = SubMask.size(); I < E; ++I) {
    if (SubMask[I] == PoisonMaskElem ||
        (!ExtendingManyInputs &&
         (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)))
      continue;
    NewMask[I] = Mask[SubMask[I]];
  }
  Mask.swap(NewMask);
}
/// Order may have elements assigned special value (size) which is out of
/// bounds. Such indices only appear on places which correspond to undef
/// values. The first loop finds all unused indices and the second loop nest
/// assigns these indices to the positions of the undef values.
static void fixupOrderingIndices(MutableArrayRef<unsigned> Order) {
  const size_t Sz = Order.size();
  SmallBitVector UnusedIndices(Sz, /*t=*/true);
  SmallBitVector MaskedIndices(Sz);
  for (unsigned I = 0; I < Sz; ++I) {
    if (Order[I] < Sz)
      UnusedIndices.reset(Order[I]);
    else
      MaskedIndices.set(I);
  }
  if (MaskedIndices.none())
    return;
  assert(UnusedIndices.count() == MaskedIndices.count() &&
         "Non-synced masked/available indices.");
  int Idx = UnusedIndices.find_first();
  int MIdx = MaskedIndices.find_first();
  while (MIdx >= 0) {
    assert(Idx >= 0 && "Indices must be synced.");
    Order[MIdx] = Idx;
    Idx = UnusedIndices.find_next(Idx);
    MIdx = MaskedIndices.find_next(MIdx);
  }
}
/// \returns a bitset for selecting opcodes. false for Opcode0 and true for
/// Opcode1.
static SmallBitVector getAltInstrMask(ArrayRef<Value *> VL, Type *ScalarTy,
                                      unsigned Opcode0, unsigned Opcode1) {
  unsigned ScalarTyNumElements = getNumElements(ScalarTy);
  SmallBitVector OpcodeMask(VL.size() * ScalarTyNumElements, false);
  for (unsigned Lane : seq<unsigned>(VL.size())) {
    if (isa<PoisonValue>(VL[Lane]))
      continue;
    if (cast<Instruction>(VL[Lane])->getOpcode() == Opcode1)
      OpcodeMask.set(Lane * ScalarTyNumElements,
                     Lane * ScalarTyNumElements + ScalarTyNumElements);
  }
  return OpcodeMask;
}

// (REVEC constant-widening helper; surrounding code elided in extraction)
//   assert(isa<Constant>(V) && "Expected scalar constants.");
//   std::fill_n(NewVal.begin() + I * VF, VF, V);
namespace llvm {

static void inversePermutation(ArrayRef<unsigned> Indices,
                               SmallVectorImpl<int> &Mask) {
  Mask.clear();
  const unsigned E = Indices.size();
  Mask.resize(E, PoisonMaskElem);
  for (unsigned I = 0; I < E; ++I)
    Mask[Indices[I]] = I;
}
/// Reorders the list of scalars in accordance with the given \p Mask.
static void reorderScalars(SmallVectorImpl<Value *> &Scalars,
                           ArrayRef<int> Mask) {
  assert(!Mask.empty() && "Expected non-empty mask.");
  SmallVector<Value *> Prev(Scalars.size(),
                            PoisonValue::get(Scalars.front()->getType()));
  Prev.swap(Scalars);
  for (unsigned I = 0, E = Prev.size(); I < E; ++I)
    if (Mask[I] != PoisonMaskElem)
      Scalars[Mask[I]] = Prev[I];
}
/// Checks if the provided value does not require scheduling. It does not
/// require scheduling if all operands and all users do not need to be
/// scheduled in the current basic block.
static bool areAllOperandsNonInsts(Value *V) {
  auto *I = dyn_cast<Instruction>(V);
  if (!I)
    return true;
  return !mayHaveNonDefUseDependency(*I) &&
         all_of(I->operands(), [I](Value *V) {
           auto *IO = dyn_cast<Instruction>(V);
           if (!IO)
             return true;
           return isa<PHINode>(IO) || IO->getParent() != I->getParent();
         });
}

/// Checks if the provided value does not require scheduling. It does not
/// require scheduling if all users are phi nodes or instructions from
/// different blocks.
static bool isUsedOutsideBlock(Value *V) {
  auto *I = dyn_cast<Instruction>(V);
  if (!I)
    return true;
  // Limits the number of uses to save compile time.
  return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(UsesLimit) &&
         all_of(I->users(), [I](User *U) {
           auto *IU = dyn_cast<Instruction>(U);
           if (!IU)
             return true;
           return IU->getParent() != I->getParent() || isa<PHINode>(IU);
         });
}
/// Checks if the specified array of instructions does not require scheduling.
static bool doesNotNeedToSchedule(ArrayRef<Value *> VL) {
  return !VL.empty() &&
         (all_of(VL, isUsedOutsideBlock) || all_of(VL, areAllOperandsNonInsts));
}

/// Returns true if widened type of \p Ty elements with size \p Sz represents
/// full vector type, i.e. adding extra element results in extra parts upon
/// type legalization.
static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty,
                                     unsigned Sz) {
  if (!isValidElementType(Ty))
    return false;
  if (has_single_bit(Sz))
    return true;
  const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
  return NumParts > 0 && NumParts < Sz && has_single_bit(Sz / NumParts) &&
         Sz % NumParts == 0;
}

/// Returns number of parts, the type \p VecTy will be split at the codegen
/// phase. If the type is going to be scalarized or does not use whole
/// registers, returns 1.
static unsigned
getNumberOfParts(const TargetTransformInfo &TTI, VectorType *VecTy,
                 const unsigned Limit = std::numeric_limits<unsigned>::max()) {
  unsigned NumParts = TTI.getNumberOfParts(VecTy);
  if (NumParts == 0 || NumParts >= Limit)
    return 1;
  unsigned Sz = getNumElements(VecTy);
  if (NumParts >= Sz || Sz % NumParts != 0 ||
      !hasFullVectorsOrPowerOf2(TTI, VecTy->getElementType(), Sz / NumParts))
    return 1;
  return NumParts;
}
namespace slpvectorizer {

/// Bottom Up SLP Vectorizer.
class BoUpSLP {
  struct TreeEntry;
  class ScheduleEntity;
  class ScheduleData;
  class ScheduleCopyableData;
  class ScheduleBundle;

public:
  /// Describes the stride of a strided-load candidate: either as a concrete
  /// Value or as a SCEV expression.
  struct StridedPtrInfo {
    Value *StrideVal = nullptr;
    const SCEV *StrideSCEV = nullptr;
  };
  BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
          TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li,
          DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
          const DataLayout *DL, OptimizationRemarkEmitter *ORE)
      : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li), DT(Dt),
        AC(AC), DB(DB), DL(DL), ORE(ORE) {
    CodeMetrics::collectEphemeralValues(F, AC, EphValues);
    // Use the vector register size specified by the target unless overridden
    // by cl::opt.
    // ...
    MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
  }

  /// Vectorize the tree that starts with the elements in \p VL.
  /// \returns the vectorized root.
  Value *vectorizeTree(
      /* ... */
      ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales = {});

  /// Construct a vectorizable tree that starts at \p Roots.
  void buildTree(ArrayRef<Value *> Roots,
                 const SmallDenseSet<Value *> &UserIgnoreLst);
  /// Return the scalars of the root node.
  ArrayRef<Value *> getRootNodeScalars() const {
    assert(!VectorizableTree.empty() && "No graph to get the first node from");
    return VectorizableTree.front()->Scalars;
  }

  /// Returns the type/is-signed info for the root node in the graph without
  /// casting.
  std::optional<std::pair<Type *, bool>> getRootNodeTypeWithNoCast() const {
    const TreeEntry &Root = *VectorizableTree.front();
    if (Root.State != TreeEntry::Vectorize || Root.isAltShuffle() ||
        !Root.Scalars.front()->getType()->isIntegerTy())
      return std::nullopt;
    auto It = MinBWs.find(&Root);
    if (It != MinBWs.end())
      return std::make_pair(IntegerType::get(Root.Scalars.front()->getContext(),
                                             It->second.first),
                            It->second.second);
    if (Root.getOpcode() == Instruction::ZExt ||
        Root.getOpcode() == Instruction::SExt)
      return std::make_pair(cast<CastInst>(Root.getMainOp())->getSrcTy(),
                            Root.getOpcode() == Instruction::SExt);
    return std::nullopt;
  }

  /// Checks if the root graph node can be emitted with narrower bitwidth at
  /// codegen and returns it signedness, if so.
  bool isSignedMinBitwidthRootNode() const {
    return MinBWs.at(VectorizableTree.front().get()).second;
  }

  /// Returns reduction type after minbitdth analysis.
  FixedVectorType *getReductionType() const {
    if (ReductionBitWidth == 0 ||
        !VectorizableTree.front()->Scalars.front()->getType()->isIntegerTy() ||
        ReductionBitWidth >=
            DL->getTypeSizeInBits(
                VectorizableTree.front()->Scalars.front()->getType()))
      return getWidenedType(
          VectorizableTree.front()->Scalars.front()->getType(),
          VectorizableTree.front()->getVectorFactor());
    return getWidenedType(
        IntegerType::get(
            VectorizableTree.front()->Scalars.front()->getContext(),
            ReductionBitWidth),
        VectorizableTree.front()->getVectorFactor());
  }
  /// Clear the internal data structures that are created by 'buildTree'.
  void deleteTree() {
    VectorizableTree.clear();
    ScalarToTreeEntries.clear();
    OperandsToTreeEntry.clear();
    ScalarsInSplitNodes.clear();
    MustGather.clear();
    NonScheduledFirst.clear();
    EntryToLastInstruction.clear();
    LastInstructionToPos.clear();
    LoadEntriesToVectorize.clear();
    IsGraphTransformMode = false;
    GatheredLoadsEntriesFirst.reset();
    CompressEntryToData.clear();
    ExternalUses.clear();
    ExternalUsesAsOriginalScalar.clear();
    ExternalUsesWithNonUsers.clear();
    for (auto &Iter : BlocksSchedules) {
      BlockScheduling *BS = Iter.second.get();
      BS->clear();
    }
    MinBWs.clear();
    ReductionBitWidth = 0;
    CastMaxMinBWSizes.reset();
    ExtraBitWidthNodes.clear();
    InstrElementSize.clear();
    UserIgnoreList = nullptr;
    PostponedGathers.clear();
    ValueToGatherNodes.clear();
    TreeEntryToStridedPtrInfoMap.clear();
  }
  /// Check if the order is "identity", i.e. each element stays in place (or
  /// is the special out-of-bounds value).
  bool isIdentityOrder(ArrayRef<unsigned> Order) const {
    assert(!Order.empty() && "expected non-empty order");
    const unsigned Sz = Order.size();
    return all_of(enumerate(Order), [&](const auto &P) {
      return P.value() == P.index() || P.value() == Sz;
    });
  }

  // (reordering-related declarations; heads elided in extraction)
  //   ... bool IgnoreReorder);
  //   std::optional<OrdersType> ...
  unsigned getMaxVecRegSize() const { return MaxVecRegSize; }

  unsigned getMinVecRegSize() const { return MinVecRegSize; }

  unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
    unsigned MaxVF = MaxVFOption.getNumOccurrences()
                         ? MaxVFOption
                         : TTI->getMaximumVF(ElemWidth, Opcode);
    return MaxVF ? MaxVF : UINT_MAX;
  }
  /// Checks if strided loads can be generated out of the loads with the given
  /// pointers.
  bool isStridedLoad(ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
                     Align Alignment, const int64_t Diff,
                     const size_t Sz) const;

  /// Checks if loads with constant-distance pointers can be vectorized as
  /// strided loads; fills \p SPtrInfo on success.
  bool analyzeConstantStrideCandidate(/* ... */ Value *Ptr0, Value *PtrN,
                                      StridedPtrInfo &SPtrInfo) const;

  /// Checks if loads with a run-time stride can be vectorized as strided
  /// loads; fills \p SPtrInfo on success.
  bool analyzeRtStrideCandidate(/* ... */ Align CommonAlignment,
                                StridedPtrInfo &SPtrInfo) const;

  /// Checks if the given array of loads can be represented as a vectorized,
  /// scatter or just simple gather.
  LoadsState canVectorizeLoads(/* ... */ StridedPtrInfo &SPtrInfo,
                               unsigned *BestVF = nullptr,
                               bool TryRecursiveCheck = true) const;
  /// Registers non-vectorizable sequence of loads.
  template <typename T> void registerNonVectorizableLoads(ArrayRef<T *> VL) {
    ListOfKnonwnNonVectorizableLoads.insert(hash_value(VL));
  }

  /// Checks if the given loads sequence is known as not vectorizable.
  template <typename T>
  bool areKnownNonVectorizableLoads(ArrayRef<T *> VL) const {
    return ListOfKnonwnNonVectorizableLoads.contains(hash_value(VL));
  }
    /// Debug print.
    void dump(raw_ostream &OS) const {
      OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
         << " EdgeIdx:" << EdgeIdx << "}";
    }
    LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL,
                        ScalarEvolution &SE, const BoUpSLP &R, int NumLanes,
                        int MaxLevel)
        : TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes),
          MaxLevel(MaxLevel) {}
      // Checks whether broadcasting V1 is profitable: either all its users are
      // internal to the candidate bundle, or the number of uses matches the
      // number of lanes.
      auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) {
        // A broadcast of a load can be cheaper on some targets.
        // ...
        auto AllUsersVectorized = [U1, U2, this](Value *V) {
          return llvm::all_of(V->users(), [U1, U2, this](Value *U) {
            return U == U1 || U == U2 || R.isVectorized(U);
          });
        };
        return AllUsersVectorized(V1) && AllUsersVectorized(V2);
      };
      // A broadcast of a load can be cheaper on some targets.
      if (R.TTI->isLegalBroadcastLoad(V1->getType(),
                                      ElementCount::getFixed(NumLanes)) &&
          ((int)V1->getNumUses() == NumLanes ||
           AllUsersAreInternal(V1, V2)))
        return LookAheadHeuristics::ScoreSplatLoads;

      auto CheckSameEntryOrFail = [&]() {
        ArrayRef<TreeEntry *> TEs1 = R.getTreeEntries(V1);
        ArrayRef<TreeEntry *> TEs2 = R.getTreeEntries(V2);
        SmallPtrSet<TreeEntry *, 4> Set(TEs1.begin(), TEs1.end());
        if (!TEs1.empty() && !TEs2.empty() &&
            any_of(TEs2, [&](TreeEntry *E) { return Set.contains(E); }))
          return LookAheadHeuristics::ScoreSameOpcode;
        return LookAheadHeuristics::ScoreFail;
      };

      auto *LI1 = dyn_cast<LoadInst>(V1);
      auto *LI2 = dyn_cast<LoadInst>(V2);
      if (LI1 && LI2) {
        if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
            !LI2->isSimple())
          return CheckSameEntryOrFail();

        std::optional<int> Dist = getPointersDiff(
            LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
            LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
        if (!Dist || *Dist == 0) {
          if (getUnderlyingObject(LI1->getPointerOperand()) ==
                  getUnderlyingObject(LI2->getPointerOperand()) &&
              R.TTI->isLegalMaskedGather(
                  getWidenedType(LI1->getType(), NumLanes), LI1->getAlign()))
            return LookAheadHeuristics::ScoreMaskedGatherCandidate;
          return CheckSameEntryOrFail();
        }
        // The distance is too large - still may be profitable to use masked
        // loads/gathers.
        if (std::abs(*Dist) > NumLanes / 2)
          return LookAheadHeuristics::ScoreMaskedGatherCandidate;
        // This still will detect consecutive loads, but we might have "holes"
        // in some cases.
        return (*Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveLoads
                           : LookAheadHeuristics::ScoreReversedLoads;
      }

      // Extracts from the same vector with close indices score well.
      auto *EV1 = dyn_cast<ExtractElementInst>(V1);
      Value *EV2 = nullptr;
      if (auto *EE2 = dyn_cast<ExtractElementInst>(V2))
        EV2 = EE2->getVectorOperand();
      if (EV1 && EV2 && EV1->getVectorOperand() == EV2) {
        std::optional<unsigned> Idx1 = getExtractIndex(EV1);
        std::optional<unsigned> Idx2 =
            getExtractIndex(cast<Instruction>(V2));
        if (Idx1 && Idx2) {
          int Dist = *Idx2 - *Idx1;
          if (std::abs(Dist) == 0)
            return LookAheadHeuristics::ScoreSplat;
          if (std::abs(Dist) > NumLanes / 2)
            return LookAheadHeuristics::ScoreSameOpcode;
          return (Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveExtracts
                            : LookAheadHeuristics::ScoreReversedExtracts;
        }
        return CheckSameEntryOrFail();
      }

      auto *I1 = dyn_cast<Instruction>(V1);
      auto *I2 = dyn_cast<Instruction>(V2);
      if (I1 && I2) {
        if (I1->getParent() != I2->getParent())
          return CheckSameEntryOrFail();
        SmallVector<Value *, 4> Ops(MainAltOps);
        Ops.push_back(I1);
        Ops.push_back(I2);
        InstructionsState S = getSameOpcode(Ops, TLI);
        // Note: Only consider instructions with <= 2 operands to avoid
        // complexity explosion.
        if (S &&
            (S.getMainOp()->getNumOperands() <= 2 || !MainAltOps.empty() ||
             !S.isAltShuffle()) &&
            all_of(Ops, [&S](Value *V) {
              return isa<PoisonValue>(V) ||
                     cast<Instruction>(V)->getNumOperands() ==
                         S.getMainOp()->getNumOperands();
            }))
          return S.isAltShuffle() ? LookAheadHeuristics::ScoreAltOpcodes
                                  : LookAheadHeuristics::ScoreSameOpcode;
      }

      if (I1 && isa<PoisonValue>(V2))
        return LookAheadHeuristics::ScoreSameOpcode;

      if (isa<UndefValue>(V2))
        return LookAheadHeuristics::ScoreUndef;

      return CheckSameEntryOrFail();
    }
    /// Go through the operands of \p LHS and \p RHS recursively until
    /// MaxLevel, and return the cummulative score.
    int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1,
                           Instruction *U2, int CurrLevel,
                           ArrayRef<Value *> MainAltOps) const {
      // Get the shallow score of V1 and V2.
      int ShallowScoreAtThisLevel =
          getShallowScore(LHS, RHS, U1, U2, MainAltOps);

      // If reached MaxLevel,
      //  or if V1 and V2 are not instructions,
      //  or if they are SPLAT,
      //  or if they are not consecutive,
      //  or if profitable to vectorize loads or extractelements, early return
      //  the current cost.
      auto *I1 = dyn_cast<Instruction>(LHS);
      auto *I2 = dyn_cast<Instruction>(RHS);
      if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
          ShallowScoreAtThisLevel == LookAheadHeuristics::ScoreFail ||
          (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||
            (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
            (isa<ExtractElementInst>(I1) && isa<ExtractElementInst>(I2))) &&
           ShallowScoreAtThisLevel))
        return ShallowScoreAtThisLevel;
      assert(I1 && I2 && "Should have early exited.");

      // Contains the I2 operand indexes that got matched with I1 operands.
      SmallSet<unsigned, 4> Op2Used;

      // Recursion towards the operands of I1 and I2. We are trying all
      // possible operand pairs, and keeping track of the best score.
      for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
           OpIdx1 != NumOperands1; ++OpIdx1) {
        // Try to pair op1I with the best operand of I2.
        int MaxTmpScore = 0;
        unsigned MaxOpIdx2 = 0;
        bool FoundBest = false;
        // If I2 is commutative try all combinations.
        unsigned FromIdx = isCommutative(I2, I2) ? 0 : OpIdx1;
        unsigned ToIdx = isCommutative(I2, I2)
                             ? I2->getNumOperands()
                             : std::min(I2->getNumOperands(), OpIdx1 + 1);
        assert(FromIdx <= ToIdx && "Bad index");
        for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
          // Skip operands already paired with OpIdx1.
          if (Op2Used.count(OpIdx2))
            continue;
          // Recursively calculate the cost at each level.
          int TmpScore =
              getScoreAtLevelRec(I1->getOperand(OpIdx1),
                                 I2->getOperand(OpIdx2),
                                 I1, I2, CurrLevel + 1, {});
          // Look for the best score.
          if (TmpScore > LookAheadHeuristics::ScoreFail &&
              TmpScore > MaxTmpScore) {
            MaxTmpScore = TmpScore;
            MaxOpIdx2 = OpIdx2;
            FoundBest = true;
          }
        }
        if (FoundBest) {
          // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
          Op2Used.insert(MaxOpIdx2);
          ShallowScoreAtThisLevel += MaxTmpScore;
        }
      }
      return ShallowScoreAtThisLevel;
    }
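    // Illustrative example (not from the original source): comparing
    //   LHS = add %a, load(%p)     RHS = add %b, load(%p + 4)
    // at level 1 scores "same opcode" for the two adds, then recurses: the
    // two loads at level 2 are consecutive in memory and add
    // ScoreConsecutiveLoads, so this pairing beats an alternative whose
    // operands do not match up.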
    struct OperandData {
      OperandData() = default;
      OperandData(Value *V, bool APO, bool IsUsed)
          : V(V), APO(APO), IsUsed(IsUsed) {}
      /// The operand value.
      Value *V = nullptr;
      /// TreeEntries only allow a single opcode, or an alternate sequence of
      /// them (+, -). Therefore, we can safely use a boolean value for the
      /// operand APO (Alternate Predicate Opcode).
      bool APO = false;
      /// Helper data for the reordering function.
      bool IsUsed = false;
    };

    /// During operand reordering, we are trying to select the operand at lane
    /// that matches best with the operand at the neighboring lane. Each
    /// operand index has its own matching strategy.
    enum class ReorderingMode {
      Load,     ///< Matching loads to consecutive memory addresses
      Opcode,   ///< Matching instructions based on opcode (same or alternate)
      Constant, ///< Matching constants
      Splat,    ///< Matching the same instruction multiple times (broadcast)
      Failed,   ///< We failed to create a vectorizable group
    };

    using OperandDataVec = SmallVector<OperandData, 2>;

    /// A vector of operand vectors indexed by operand index and lane.
    SmallVector<OperandDataVec, 4> OpsVec;
    /// When VL[0] is IntrinsicInst, ArgSize is CallBase::arg_size. When VL[0]
    /// is not IntrinsicInst, ArgSize is User::getNumOperands.
    unsigned ArgSize = 0;

    const TargetLibraryInfo &TLI;
    const DataLayout &DL;
    ScalarEvolution &SE;
    const BoUpSLP &R;
    const Loop *L = nullptr;

    /// \returns the operand data at \p OpIdx and \p Lane.
    OperandData &getData(unsigned OpIdx, unsigned Lane) {
      return OpsVec[OpIdx][Lane];
    }

    /// \returns the operand data at \p OpIdx and \p Lane. Const version.
    const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
      return OpsVec[OpIdx][Lane];
    }

    /// Clears the used flag for all entries.
    void clearUsed() {
      for (unsigned OpIdx = 0, NumOperands = getNumOperands();
           OpIdx != NumOperands; ++OpIdx)
        for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
             ++Lane)
          OpsVec[OpIdx][Lane].IsUsed = false;
    }

    /// Swap the operand at \p OpIdx1 with the operand at \p OpIdx2 in \p Lane.
    void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
      std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
    }
    /// \param Lane lane of the operands under analysis.
    /// \param OpIdx operand index in \p Lane lane we're looking the best
    /// candidate for.
    /// \param Idx operand index of the current candidate value.
    /// \returns The additional score due to possible broadcasting of the
    /// elements in the lane.
    int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx,
                      const SmallBitVector &UsedLanes) const {
      Value *IdxLaneV = getData(Idx, Lane).V;
      if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V ||
          isa<ExtractElementInst>(IdxLaneV))
        return 0;
      SmallDenseMap<Value *, unsigned, 4> Uniques;
      for (unsigned Ln : seq<unsigned>(getNumLanes())) {
        if (Ln == Lane)
          continue;
        Value *OpIdxLnV = getData(OpIdx, Ln).V;
        if (!isa<Instruction>(OpIdxLnV))
          return 0;
        Uniques.try_emplace(OpIdxLnV, Ln);
      }
      unsigned UniquesCount = Uniques.size();
      auto IdxIt = Uniques.find(IdxLaneV);
      unsigned UniquesCntWithIdxLaneV =
          IdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
      Value *OpIdxLaneV = getData(OpIdx, Lane).V;
      auto OpIdxIt = Uniques.find(OpIdxLaneV);
      unsigned UniquesCntWithOpIdxLaneV =
          OpIdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
      if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
        return 0;
      return std::min(bit_ceil(UniquesCntWithOpIdxLaneV) -
                          UniquesCntWithOpIdxLaneV,
                      UniquesCntWithOpIdxLaneV -
                          bit_floor(UniquesCntWithOpIdxLaneV)) -
             ((IdxIt != Uniques.end() && UsedLanes.test(IdxIt->second))
                  ? UniquesCntWithIdxLaneV - bit_floor(UniquesCntWithIdxLaneV)
                  : bit_ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
    }
    /// \returns The additional score for the scalar which users are all
    /// vectorized.
    int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
      Value *IdxLaneV = getData(Idx, Lane).V;
      Value *OpIdxLaneV = getData(OpIdx, Lane).V;
      // Do not care about number of uses for vector-like instructions
      // (extractelement/extractvalue with constant indices), they are extracts
      // themselves and already externally used. Vectorization of such
      // instructions does not add extra extractelement instruction, just may
      // remove it.
      if (isVectorLikeInstWithConstOps(IdxLaneV) &&
          isVectorLikeInstWithConstOps(OpIdxLaneV))
        return LookAheadHeuristics::ScoreAllUserVectorized;
      auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV);
      if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV))
        return 0;
      return R.areAllUsersVectorized(IdxLaneI)
                 ? LookAheadHeuristics::ScoreAllUserVectorized
                 : 0;
    }

    /// Score scaling factor for fully compatible instructions but with
    /// different number of external uses. Scores a greater score for
    /// instructions with less external uses.
    static const int ScoreScaleFactor = 10;

    /// \Returns the look-ahead score, which tells us how much the sub-trees
    /// rooted at \p LHS and \p RHS match, the more they match the higher the
    /// score.
    int getLookAheadScore(Value *LHS, Value *RHS, ArrayRef<Value *> MainAltOps,
                          int Lane, unsigned OpIdx, unsigned Idx, bool &IsUsed,
                          const SmallBitVector &UsedLanes) {
      LookAheadHeuristics LookAhead(TLI, DL, SE, R, getNumLanes(),
                                    LookAheadMaxDepth);
      int Score =
          LookAhead.getScoreAtLevelRec(LHS, RHS, /*U1=*/nullptr,
                                       /*U2=*/nullptr,
                                       /*CurrLevel=*/1, MainAltOps);
      if (Score) {
        int SplatScore = getSplatScore(Lane, OpIdx, Idx, UsedLanes);
        if (Score <= -SplatScore) {
          // Failed score.
          Score = 0;
        } else {
          Score += SplatScore;
          // Scale score to see the difference between different operands
          // and similar operands but all vectorized/not all vectorized
          // uses. It does not affect actual vectorization decisions.
          Score *= ScoreScaleFactor;
          Score += getExternalUseScore(Lane, OpIdx, Idx);
          IsUsed = true;
        }
      }
      return Score;
    }
    // Search all operands in Ops[*][Lane] for the one that matches best
    // Ops[OpIdx][LastLane] and return its operand index.
    // If no good match can be found, return std::nullopt.
    std::optional<unsigned>
    getBestOperand(unsigned OpIdx, int Lane, int LastLane,
                   ArrayRef<ReorderingMode> ReorderingModes,
                   ArrayRef<Value *> MainAltOps,
                   const SmallBitVector &UsedLanes) {
      unsigned NumOperands = getNumOperands();

      // The operand of the previous lane at OpIdx.
      Value *OpLastLane = getData(OpIdx, LastLane).V;

      // Our strategy mode for OpIdx.
      ReorderingMode RMode = ReorderingModes[OpIdx];
      if (RMode == ReorderingMode::Failed)
        return std::nullopt;

      // The linearized opcode of the operand at OpIdx, Lane.
      bool OpIdxAPO = getData(OpIdx, Lane).APO;

      // The best operand index and its score.
      struct BestOpData {
        std::optional<unsigned> Idx;
        unsigned Score = 0;
      } BestOp;
      BestOp.Score =
          BestScoresPerLanes.try_emplace(std::make_pair(OpIdx, Lane), 0)
              .first->second;

      // Track if the operand must be marked as used. If the operand is set to
      // Score 1 explicitly, we may want to reestimate the operands again on
      // the following iterations.
      bool IsUsed = RMode == ReorderingMode::Splat ||
                    RMode == ReorderingMode::Constant ||
                    RMode == ReorderingMode::Load;
      // Iterate through all unused operands and look for the best.
      for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
        // Get the operand at Idx and Lane.
        OperandData &OpData = getData(Idx, Lane);
        Value *Op = OpData.V;
        bool OpAPO = OpData.APO;

        // Skip already selected operands.
        if (OpData.IsUsed)
          continue;

        // Skip if we are trying to move the operand to a position with a
        // different opcode in the linearized tree form. This would break the
        // semantics.
        if (OpAPO != OpIdxAPO)
          continue;

        // Look for an operand that matches the current mode.
        switch (RMode) {
        case ReorderingMode::Load:
        case ReorderingMode::Opcode: {
          bool LeftToRight = Lane > LastLane;
          Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
          Value *OpRight = (LeftToRight) ? Op : OpLastLane;
          int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
                                        OpIdx, Idx, IsUsed, UsedLanes);
          if (Score > static_cast<int>(BestOp.Score) ||
              (Score > 0 && Score == static_cast<int>(BestOp.Score) &&
               Idx == OpIdx)) {
            BestOp.Idx = Idx;
            BestOp.Score = Score;
            BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
          }
          break;
        }
        case ReorderingMode::Constant:
          if (isa<Constant>(Op) ||
              (!BestOp.Score && L && L->isLoopInvariant(Op))) {
            BestOp.Idx = Idx;
            if (isa<Constant>(Op)) {
              BestOp.Score = LookAheadHeuristics::ScoreConstants;
              BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
                  LookAheadHeuristics::ScoreConstants;
            }
          }
          break;
        case ReorderingMode::Splat:
          if (Op == OpLastLane || (!BestOp.Score && isa<Constant>(Op))) {
            IsUsed = Op == OpLastLane;
            if (Op == OpLastLane) {
              BestOp.Score = LookAheadHeuristics::ScoreSplat;
              BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
                  LookAheadHeuristics::ScoreSplat;
            }
            BestOp.Idx = Idx;
          }
          break;
        case ReorderingMode::Failed:
          llvm_unreachable("Not expected Failed reordering mode.");
        }
      }

      if (BestOp.Idx) {
        getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
        return BestOp.Idx;
      }
      // If we could not find a good match return std::nullopt.
      return std::nullopt;
    }
    /// Helper for reorder().
    /// \returns the lane that we should start reordering from. This is the one
    /// which has the least number of operands that can freely move about or
    /// less profitable because it already has the most optimal set of
    /// operands.
    unsigned getBestLaneToStartReordering() const {
      unsigned Min = UINT_MAX;
      unsigned SameOpNumber = 0;
      // std::pair<unsigned, unsigned> is used to implement a simple voting
      // algorithm and choose the lane with the least number of operands that
      // can freely move about or less profitable because it already has the
      // most optimal set of operands. The first unsigned is a counter for
      // voting, the second unsigned is the counter of lanes with instructions
      // with same/alternate opcodes and same parent basic block.
      MapVector<unsigned, std::pair<unsigned, unsigned>> HashMap;
      // Try to be closer to the original results, if we have multiple lanes
      // with same cost. If 2 lanes have the same cost, use the one with the
      // highest index.
      for (int I = getNumLanes(); I > 0; --I) {
        unsigned Lane = I - 1;
        OperandsOrderData NumFreeOpsHash =
            getMaxNumOperandsThatCanBeReordered(Lane);
        // Compare the number of operands that can move and choose the one with
        // the least number.
        if (NumFreeOpsHash.NumOfAPOs < Min) {
          Min = NumFreeOpsHash.NumOfAPOs;
          SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
          HashMap.clear();
          HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
        } else if (NumFreeOpsHash.NumOfAPOs == Min &&
                   NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
          // Select the most optimal lane in terms of number of operands that
          // should be moved around.
          SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
          HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
        } else if (NumFreeOpsHash.NumOfAPOs == Min &&
                   NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
          auto [It, Inserted] =
              HashMap.try_emplace(NumFreeOpsHash.Hash, 1, Lane);
          if (!Inserted)
            ++It->second.first;
        }
      }
      // Select the lane with the minimum counter.
      unsigned BestLane = 0;
      unsigned CntMin = UINT_MAX;
      for (const auto &Data : reverse(HashMap)) {
        if (Data.second.first < CntMin) {
          CntMin = Data.second.first;
          BestLane = Data.second.second;
        }
      }
      return BestLane;
    }
    /// Data structure that helps to reorder operands.
    struct OperandsOrderData {
      /// The best number of operands with the same APOs, which can be
      /// reordered.
      unsigned NumOfAPOs = UINT_MAX;
      /// Number of operands with the same/alternate instruction opcode and
      /// parent.
      unsigned NumOpsWithSameOpcodeParent = 0;
      /// Hash for the actual operands ordering.
      unsigned Hash = 0;
    };

    /// \returns the maximum number of operands that are allowed to be
    /// reordered for \p Lane and the number of compatible instructions (with
    /// the same parent/opcode). This is used as a heuristic for selecting the
    /// first lane to start operand reordering.
    OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
      unsigned CntTrue = 0;
      unsigned NumOperands = getNumOperands();
      // Operands with the same APO can be reordered. We therefore need to
      // count how many operands have the same APO.
      bool AllUndefs = true;
      unsigned NumOpsWithSameOpcodeParent = 0;
      Instruction *OpcodeI = nullptr;
      BasicBlock *Parent = nullptr;
      unsigned Hash = 0;
      for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
        const OperandData &OpData = getData(OpIdx, Lane);
        if (OpData.APO)
          ++CntTrue;
        // Use Boyer-Moore majority voting for finding the majority opcode and
        // the number of times it occurs.
        if (auto *I = dyn_cast<Instruction>(OpData.V)) {
          if (!OpcodeI || !getSameOpcode({OpcodeI, I}, TLI) ||
              I->getParent() != Parent) {
            if (NumOpsWithSameOpcodeParent == 0) {
              NumOpsWithSameOpcodeParent = 1;
              OpcodeI = I;
              Parent = I->getParent();
            } else {
              --NumOpsWithSameOpcodeParent;
            }
          } else {
            ++NumOpsWithSameOpcodeParent;
          }
        }
        Hash = hash_combine(
            Hash, hash_value((OpIdx + 1) * (OpData.V->getValueID() + 1)));
        AllUndefs = AllUndefs && isa<UndefValue>(OpData.V);
      }
      if (AllUndefs)
        return {};
      OperandsOrderData Data;
      Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
      Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
      Data.Hash = Hash;
      return Data;
    }
    /// Go through the instructions in VL and append their operands.
    void appendOperands(ArrayRef<Value *> VL, ArrayRef<ValueList> Operands,
                        const InstructionsState &S) {
      assert(!VL.empty() && "Bad VL");
      assert((empty() || VL.size() == getNumLanes()) &&
             "Expected same number of lanes");
      assert(S.valid() && "InstructionsState is invalid.");
      constexpr unsigned IntrinsicNumOperands = 2;
      Instruction *MainOp = S.getMainOp();
      ArgSize = isa<IntrinsicInst>(MainOp) ? IntrinsicNumOperands
                                           : MainOp->getNumOperands();
      OpsVec.resize(ArgSize);
      unsigned NumLanes = VL.size();
      for (OperandDataVec &Ops : OpsVec)
        Ops.resize(NumLanes);
      for (unsigned Lane : seq<unsigned>(NumLanes)) {
        // Our tree has just 3 nodes: the root and two operands.
        // It is therefore trivial to get the APO. We only need to check the
        // opcode of V and whether the operand at OpIdx is the LHS or RHS
        // operand of the parent instruction.
        if (isa<PoisonValue>(VL[Lane])) {
          for (unsigned OpIdx : seq<unsigned>(ArgSize))
            OpsVec[OpIdx][Lane] = {Operands[OpIdx][Lane], true, false};
          continue;
        }
        bool IsInverseOperation = false;
        if (S.isCopyableElement(VL[Lane])) {
          // The value is a copyable element.
          IsInverseOperation =
              !isCommutative(MainOp, cast<Instruction>(VL[Lane]),
                             /*IsCopyable=*/true);
        } else {
          auto *I = dyn_cast<Instruction>(VL[Lane]);
          assert(I && "Expected instruction");
          auto [SelectedOp, Ops] = convertTo(I, S);
          // We cannot check commutativity by the converted instruction
          // (SelectedOp) alone because isCommutative also examines def-use
          // relationships.
          IsInverseOperation = !isCommutative(SelectedOp, I);
        }
        for (unsigned OpIdx = 0; OpIdx < ArgSize; ++OpIdx) {
          bool APO = (OpIdx == 0) ? false : IsInverseOperation;
          OpsVec[OpIdx][Lane] = {Operands[OpIdx][Lane], APO, false};
        }
      }
    }

    /// \returns the number of operands.
    unsigned getNumOperands() const { return ArgSize; }
    /// \returns the number of lanes.
    unsigned getNumLanes() const { return OpsVec[0].size(); }

    /// \returns the operand value at \p OpIdx and \p Lane.
    Value *getValue(unsigned OpIdx, unsigned Lane) const {
      return getData(OpIdx, Lane).V;
    }

    /// \returns true if the data structure is empty.
    bool empty() const { return OpsVec.empty(); }

    /// Clears the data.
    void clear() { OpsVec.clear(); }
    /// \returns true if the operand \p Op of lane \p Lane should rather be
    /// broadcast than matched across lanes.
    bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
      assert(Op == getValue(OpIdx, Lane) &&
             "Op is expected to be getValue(OpIdx, Lane).");
      // Small number of loads - try load matching.
      if (isa<LoadInst>(Op) && getNumLanes() == 2 && getNumOperands() == 2)
        return false;
      bool OpAPO = getData(OpIdx, Lane).APO;
      bool IsInvariant = L && L->isLoopInvariant(Op);
      unsigned Cnt = 0;
      for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
        if (Ln == Lane)
          continue;
        // This is set to true if we found a candidate for broadcast at Lane.
        bool FoundCandidate = false;
        for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
          OperandData &Data = getData(OpI, Ln);
          if (Data.APO != OpAPO || Data.IsUsed)
            continue;
          Value *OpILane = getValue(OpI, Lane);
          bool IsConstantOp = isa<Constant>(OpILane);
          // Consider the broadcast candidate if:
          // 1. Same value is found in one of the operands.
          if (Data.V == Op ||
              // 2. The operand in the given lane is not constant but there is
              // a constant operand in another lane (which can be moved to the
              // given lane). In this case we can represent it as a simple
              // permutation of constant and broadcast.
              (!IsConstantOp && isa<Constant>(Data.V)) ||
              // 3. The operand in the current lane is loop invariant (can be
              // hoisted out) and another operand is also a loop invariant
              // (though not a constant). In this case the whole vector can be
              // hoisted out.
              (IsInvariant && !isa<Constant>(Data.V) && L &&
               L->isLoopInvariant(Data.V))) {
            FoundCandidate = true;
            Data.IsUsed = Data.V == Op;
            if (Data.V == Op)
              ++Cnt;
            break;
          }
        }
        if (!FoundCandidate)
          return false;
      }
      return getNumLanes() == 2 || Cnt > 1;
    }
3261 "Op is expected to be getValue(OpIdx, Lane).");
3262 bool OpAPO = getData(
OpIdx, Lane).APO;
3263 for (
unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
3267 const OperandData &
Data = getData(OpI, Ln);
3268 if (
Data.APO != OpAPO ||
Data.IsUsed)
3270 Value *OpILn = getValue(OpI, Ln);
3271 return (L && L->isLoopInvariant(OpILn)) ||
  public:
    /// Initialize with all the operands of the instruction vector \p RootVL.
    VLOperands(ArrayRef<Value *> RootVL, ArrayRef<ValueList> Operands,
               const InstructionsState &S, const BoUpSLP &R)
        : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R),
          L(R.LI->getLoopFor(S.getMainOp()->getParent())) {
      // Append all the operands of RootVL.
      appendOperands(RootVL, Operands, S);
    }

    /// \Returns a value vector with the operands across all lanes for the
    /// opearnd at \p OpIdx.
    ValueList getVL(unsigned OpIdx) const {
      ValueList OpVL(OpsVec[OpIdx].size());
      assert(OpsVec[OpIdx].size() == getNumLanes() &&
             "Expected same num of lanes across all operands");
      for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
        OpVL[Lane] = OpsVec[OpIdx][Lane].V;
      return OpVL;
    }
    // Performs operand reordering for 2 or more operands.
    // The original operands are in OrigOps[OpIdx][Lane].
    // The reordered operands are returned in Ops[OpIdx][Lane].
    void reorder() {
      unsigned NumOperands = getNumOperands();
      unsigned NumLanes = getNumLanes();
      // Each operand has its own mode. We are using this mode to help us
      // select the instructions for each lane, so that they match best with
      // the ones we have selected so far.
      SmallVector<ReorderingMode, 2> ReorderingModes(NumOperands);

      // This is a greedy single-pass algorithm. We are going over each lane
      // once and deciding on the best order right away with no back-tracking.
      // However, in order to increase its effectiveness, we start with the
      // lane that has operands that can move the least.
      unsigned FirstLane = getBestLaneToStartReordering();

      // Initialize the modes.
      for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
        Value *OpLane0 = getValue(OpIdx, FirstLane);
        // Keep track if we have instructions with all the same opcode on one
        // side.
        if (auto *OpILane0 = dyn_cast<Instruction>(OpLane0)) {
          // Check if OpLane0 should be broadcast.
          if (shouldBroadcast(OpLane0, OpIdx, FirstLane) ||
              !canBeVectorized(OpILane0, OpIdx, FirstLane))
            ReorderingModes[OpIdx] = ReorderingMode::Splat;
          else if (isa<LoadInst>(OpILane0))
            ReorderingModes[OpIdx] = ReorderingMode::Load;
          else
            ReorderingModes[OpIdx] = ReorderingMode::Opcode;
        } else if (isa<Constant>(OpLane0)) {
          ReorderingModes[OpIdx] = ReorderingMode::Constant;
        } else {
          // Our best hope is a Splat. It may save some cost in some cases.
          ReorderingModes[OpIdx] = ReorderingMode::Splat;
        }
      }

      // Check that we don't have same operands. No need to reorder if operands
      // are just perfect diamond or shuffled diamond match.
      auto &&SkipReordering = [this]() {
        SmallPtrSet<Value *, 4> UniqueValues;
        ArrayRef<OperandData> Op0 = OpsVec.front();
        for (const OperandData &Data : Op0)
          UniqueValues.insert(Data.V);
        for (ArrayRef<OperandData> Op :
             ArrayRef(OpsVec).slice(1, getNumOperands() - 1)) {
          if (any_of(Op, [&UniqueValues](const OperandData &Data) {
                return !UniqueValues.contains(Data.V);
              }))
            return false;
        }
        // TODO: Check if we can remove a check for non-power-2 number of
        // scalars after full support of non-power-2 vectorization.
        return UniqueValues.size() != 2 &&
               hasFullVectorsOrPowerOf2(*R.TTI, Op0.front().V->getType(),
                                        UniqueValues.size());
      };

      if (SkipReordering())
        return;
      bool StrategyFailed = false;
      // Mark all operand data as free to use.
      clearUsed();
      // We keep the original operand order for the FirstLane, so reorder the
      // rest of the lanes. We are visiting the nodes in a circular fashion,
      // using FirstLane as the center point and increasing the radius
      // distance.
      SmallVector<SmallVector<Value *, 2>> MainAltOps(NumOperands);
      for (unsigned I = 0; I < NumOperands; ++I)
        MainAltOps[I].push_back(getData(I, FirstLane).V);

      SmallBitVector UsedLanes(NumLanes);
      UsedLanes.set(FirstLane);
      for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
        // Visit the lane on the right and then the lane on the left.
        for (int Direction : {+1, -1}) {
          int Lane = FirstLane + Direction * Distance;
          if (Lane < 0 || Lane >= (int)NumLanes)
            continue;
          UsedLanes.set(Lane);
          int LastLane = Lane - Direction;
          assert(LastLane >= 0 && LastLane < (int)NumLanes &&
                 "Out of bounds");
          // Look for a good match for each operand.
          for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
            // Search for the operand that matches SortedOps[OpIdx][Lane-1].
            std::optional<unsigned> BestIdx =
                getBestOperand(OpIdx, Lane, LastLane, ReorderingModes,
                               MainAltOps[OpIdx], UsedLanes);
            // By not selecting a value, we allow the operands that follow to
            // select a better matching value. We will get a non-null value in
            // the next run of getBestOperand().
            if (BestIdx) {
              // Swap the current operand with the one returned by
              // getBestOperand().
              swap(OpIdx, *BestIdx, Lane);
            } else {
              // Enable the second pass.
              StrategyFailed = true;
            }
            // Try to get the alternate opcode and follow it during analysis.
            if (MainAltOps[OpIdx].size() != 2) {
              OperandData &AltOp = getData(OpIdx, Lane);
              InstructionsState OpS =
                  getSameOpcode({MainAltOps[OpIdx].front(), AltOp.V}, TLI);
              if (OpS && OpS.isAltShuffle())
                MainAltOps[OpIdx].push_back(AltOp.V);
            }
          }
        }
      }
      // Check if the operand reordering failed for any lane; if so a second
      // pass may be run.
      if (!StrategyFailed)
        return;
      // ...
    }
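    // Illustrative example (not from the original source): with 4 lanes and
    // FirstLane == 2, the circular visiting order is lane 3 (Distance 1,
    // Direction +1), lane 1 (Distance 1, Direction -1), then lane 0
    // (Distance 2, Direction -1; lane 4 is out of range and skipped), so each
    // lane is matched against its already-ordered neighbor.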
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    LLVM_DUMP_METHOD static StringRef getModeStr(ReorderingMode RMode) {
      switch (RMode) {
      case ReorderingMode::Load:
        return "Load";
      case ReorderingMode::Opcode:
        return "Opcode";
      case ReorderingMode::Constant:
        return "Constant";
      case ReorderingMode::Splat:
        return "Splat";
      case ReorderingMode::Failed:
        return "Failed";
      }
      llvm_unreachable("Unimplemented Reordering Type");
    }

    /// Debug print.
    LLVM_DUMP_METHOD raw_ostream &print(raw_ostream &OS) const {
      const unsigned Indent = 2;
      unsigned Cnt = 0;
      for (const OperandDataVec &OpDataVec : OpsVec) {
        OS << "Operand " << Cnt++ << "\n";
        for (const OperandData &OpData : OpDataVec) {
          OS.indent(Indent) << "{";
          if (Value *V = OpData.V)
            OS << *V;
          else
            OS << "null";
          OS << ", APO:" << OpData.APO << "}\n";
        }
        OS << "\n";
      }
      return OS;
    }
#endif
  };
  /// Evaluate each pair in \p Candidates and return index into \p Candidates
  /// for a pair which have highest score deemed to have best chance to form
  /// root of profitable tree to vectorize. Return std::nullopt if no candidate
  /// scored above the \p Limit.
  std::optional<int>
  findBestRootPair(ArrayRef<std::pair<Value *, Value *>> Candidates,
                   int Limit = LookAheadHeuristics::ScoreFail) const {
    LookAheadHeuristics LookAhead(*TLI, *DL, *SE, *this, /*NumLanes=*/2,
                                  RootLookAheadMaxDepth);
    int BestScore = Limit;
    std::optional<int> Index;
    for (int I : seq<int>(0, Candidates.size())) {
      int Score = LookAhead.getScoreAtLevelRec(Candidates[I].first,
                                               Candidates[I].second,
                                               /*U1=*/nullptr, /*U2=*/nullptr,
                                               /*CurrLevel=*/1, {});
      if (Score > BestScore) {
        BestScore = Score;
        Index = I;
      }
    }
    return Index;
  }

  /// Checks if the instruction is marked for deletion.
  bool isDeleted(Instruction *I) const { return DeletedInstructions.count(I); }

  /// Removes an instruction from its block and eventually deletes it.
  /// It's like Instruction::eraseFromParent() except that the actual deletion
  /// is delayed until BoUpSLP is destructed.
  void eraseInstruction(Instruction *I) {
    DeletedInstructions.insert(I);
  }
  /// Remove instructions from the parent function and clear the operands of
  /// \p DeadVals instructions, marking for deletion trivially dead operands.
  template <typename T>
  void removeInstructionsAndOperands(
      ArrayRef<T *> DeadVals,
      ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales) {
    SmallVector<WeakTrackingVH> DeadInsts;
    for (T *V : DeadVals) {
      auto *I = cast<Instruction>(V);
      DeletedInstructions.insert(I);
    }
    DenseSet<Value *> Processed;
    for (T *V : DeadVals) {
      if (!V || !Processed.insert(V).second)
        continue;
      auto *I = cast<Instruction>(V);
      salvageDebugInfo(*I);
      const SmallVector<TreeEntry *> Entries = getTreeEntries(I);
      for (Use &U : I->operands()) {
        if (auto *OpI = dyn_cast_if_present<Instruction>(U.get());
            OpI && !DeletedInstructions.contains(OpI) && OpI->hasOneUser() &&
            wouldInstructionBeTriviallyDead(OpI, TLI) &&
            (Entries.empty() || none_of(Entries, [&](const TreeEntry *Entry) {
               return Entry->VectorizedValue == OpI;
             })))
          DeadInsts.push_back(OpI);
      }
      I->dropAllReferences();
    }
    for (T *V : DeadVals) {
      auto *I = cast<Instruction>(V);
      if (!I->getParent())
        continue;
      assert((I->use_empty() ||
              all_of(I->uses(),
                     [&](Use &U) {
                       return isDeleted(cast<Instruction>(U.getUser()));
                     })) &&
             "trying to erase instruction with users.");
      I->removeFromParent();
      SE->forgetValue(I);
    }
    // Process the dead instruction list until empty.
    while (!DeadInsts.empty()) {
      Value *V = DeadInsts.pop_back_val();
      Instruction *VI = cast_or_null<Instruction>(V);
      if (!VI || !VI->getParent())
        continue;
      assert(isInstructionTriviallyDead(VI, TLI) &&
             "Live instruction found in dead worklist!");
      assert(VI->use_empty() && "Instructions with uses are not dead.");

      // Don't lose the debug info while deleting the instructions.
      salvageDebugInfo(*VI);

      // Null out all of the instruction's operands to see if any operand
      // becomes dead as we go.
      for (Use &OpU : VI->operands()) {
        Value *OpV = OpU.get();
        if (!OpV)
          continue;
        OpU.set(nullptr);

        if (!OpV->use_empty())
          continue;

        // If the operand is an instruction that became dead as we nulled out
        // the operand, and if it is 'trivially' dead, delete it in a future
        // loop iteration.
        if (auto *OpI = dyn_cast<Instruction>(OpV))
          if (!DeletedInstructions.contains(OpI) &&
              (!OpI->getType()->isVectorTy() ||
               none_of(VectorValuesAndScales,
                       [&](const std::tuple<Value *, unsigned, bool> &V) {
                         return std::get<0>(V) == OpI;
                       })) &&
              isInstructionTriviallyDead(OpI, TLI))
            DeadInsts.push_back(OpI);
      }

      VI->removeFromParent();
      DeletedInstructions.insert(VI);
      SE->forgetValue(VI);
    }
  }
  /// Checks if the instruction was already analyzed for being possible
  /// reduction root.
  bool isAnalyzedReductionRoot(Instruction *I) const {
    return AnalyzedReductionsRoots.count(I);
  }
  /// Register given instruction as already analyzed for being possible
  /// reduction root.
  void analyzedReductionRoot(Instruction *I) {
    AnalyzedReductionsRoots.insert(I);
  }
  /// Checks if the provided list of reduced values was checked already for
  /// vectorization.
  bool areAnalyzedReductionVals(ArrayRef<Value *> VL) const {
    return AnalyzedReductionVals.contains(hash_value(VL));
  }
  /// Register given values as already analyzed for vectorization.
  void analyzedReductionVals(ArrayRef<Value *> VL) {
    AnalyzedReductionVals.insert(hash_value(VL));
  }
  /// Clear the list of the analyzed reduction root instructions.
  void clearReductionData() {
    AnalyzedReductionsRoots.clear();
    AnalyzedReductionVals.clear();
    AnalyzedMinBWVals.clear();
  }
  /// Checks if the given value is gathered in one of the nodes.
  bool isGathered(const Value *V) const {
    return MustGather.contains(V);
  }
  /// Checks if the specified value was not scheduled.
  bool isNotScheduled(const Value *V) const {
    return NonScheduledFirst.contains(V);
  }
  /// Check if the value is vectorized in the tree.
  bool isVectorized(Value *V) const {
    assert(V && "V cannot be nullptr.");
    return ScalarToTreeEntries.contains(V);
  }
  /// Checks if the specified values can be demoted to a smaller bit width.
  bool collectValuesToDemote(
      const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
      /* ... */
      bool &IsProfitableToDemote, bool IsTruncRoot) const;

  /// Builds the list of reorderable operands on the edges of the graph.
  void buildReorderableOperands(/* ... */);

  /// Reorders the node with reuses.
  void reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const;

  /// Checks if all users of \p I are part of the vectorization tree.
  bool areAllUsersVectorized(
      Instruction *I,
      const SmallDenseSet<Value *> *VectorizedVals = nullptr) const;

  /// Return information about the vector formed for the specified operand of
  /// the given entry.
  const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const;
  TreeEntry *getOperandEntry(TreeEntry *E, unsigned Idx) {
    return const_cast<TreeEntry *>(
        getOperandEntry(const_cast<const TreeEntry *>(E), Idx));
  }

  /// \returns the "root" instruction of the given tree entry.
  Instruction *getRootEntryInstruction(const TreeEntry &Entry) const;

  /// \returns Cast context for the given graph node.
  TargetTransformInfo::CastContextHint
  getCastContextHint(const TreeEntry &TE) const;

  // (cost-modeling declarations; heads elided in extraction)
  //   ... const InstructionsState &LocalState, ...
  //   ... unsigned InterleaveFactor = 0);
  //   ... bool ResizeAllowed = false) const;

  /// Vectorize a single entry in the tree, the \p NodeIdx-th operand of \p E.
  Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx);

  /// Create a new vector from a list of scalar values.
  template <typename BVTy, typename ResTy, typename... Args>
  ResTy processBuildVector(const TreeEntry *E, Type *ScalarTy, Args &...Params);

  /// Create a new vector from a list of scalar values.
  Value *createBuildVector(const TreeEntry *E, Type *ScalarTy);

  /// \returns the last instruction in the bundle of the given entry.
  Instruction &getLastInstructionInBundle(const TreeEntry *E);

  /// Checks if the gathered \p VL can be represented as a single-register
  /// shuffle of previous tree entries.
  std::optional<TargetTransformInfo::ShuffleKind>
  isGatherShuffledSingleRegisterEntry(
      const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
      SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part,
      bool ForOrder);

  /// Checks if the gathered \p VL can be represented as (multi-register)
  /// shuffle(s) of previous tree entries.
  SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
  isGatherShuffledEntry(
      const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
      SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries,
      unsigned NumParts, bool ForOrder = false);

  /// \returns the cost of gathering (inserting) the values in \p VL into a
  /// vector.
  InstructionCost getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
                                Type *ScalarTy) const;

  /// Set the Builder insert point to one after the last instruction in the
  /// bundle.
  void setInsertPointAfterBundle(const TreeEntry *E);

  /// \returns whether the VectorizableTree is fully vectorizable and will
  /// be beneficial even the tree height is tiny.
  bool isFullyVectorizableTinyTree(bool ForReduction) const;

  /// Run through the list of all gathered loads in the graph and try to find
  /// vector loads/masked gathers instead of regular gathers.
  void tryToVectorizeGatheredLoads(
      const SmallMapVector<std::tuple<BasicBlock *, Value *, Type *>,
                           SmallVector<SmallVector<std::pair<LoadInst *, int>>>,
                           8> &GatheredLoads);

  /// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over the
  /// users of \p TE and collects the stores.
  SmallVector<SmallVector<StoreInst *>>
  collectUserStores(const BoUpSLP::TreeEntry *TE) const;

  /// Iterates through the users of \p TE, looking for scalar stores that can
  /// be potentially vectorized in a future SLP-tree.
  SmallVector<OrdersType>
  findExternalStoreUsersReorderIndices(TreeEntry *TE) const;

  /// Tries to reorder the gathering node for better vectorization
  /// opportunities.
  void reorderGatherNode(TreeEntry &TE);
  struct TreeEntry {
    using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
    TreeEntry(VecTreeTy &Container) : Container(Container) {}

    /// \returns Common mask for reorder indices and reused scalars.
    SmallVector<int> getCommonMask() const {
      if (State == TreeEntry::SplitVectorize)
        return {};
      SmallVector<int> Mask;
      inversePermutation(ReorderIndices, Mask);
      ::addMask(Mask, ReuseShuffleIndices);
      return Mask;
    }

    /// \returns The mask for split nodes.
    SmallVector<int> getSplitMask() const {
      assert(State == TreeEntry::SplitVectorize && !ReorderIndices.empty() &&
             "Expected only split vectorize node.");
      SmallVector<int> Mask(getVectorFactor(), PoisonMaskElem);
      unsigned CommonVF = std::max<unsigned>(
          CombinedEntriesWithIndices.back().second,
          Scalars.size() - CombinedEntriesWithIndices.back().second);
      for (auto [Idx, I] : enumerate(ReorderIndices))
        Mask[I] =
            Idx + (Idx >= CombinedEntriesWithIndices.back().second
                       ? CommonVF - CombinedEntriesWithIndices.back().second
                       : 0);
      return Mask;
    }

    /// Updates (reorders) SplitVectorize node according to the given mask \p
    /// Mask and order \p MaskOrder.
    void reorderSplitNode(unsigned Idx, ArrayRef<int> Mask,
                          ArrayRef<int> MaskOrder);

    /// \returns true if the scalars in VL are equal to this entry.
    bool isSame(ArrayRef<Value *> VL) const {
      auto &&IsSame = [VL](ArrayRef<Value *> Scalars, ArrayRef<int> Mask) {
        if (Mask.size() != VL.size() && VL.size() == Scalars.size())
          return std::equal(VL.begin(), VL.end(), Scalars.begin());
        return VL.size() == Mask.size() &&
               std::equal(VL.begin(), VL.end(), Mask.begin(),
                          [Scalars](Value *V, int Idx) {
                            return (isa<UndefValue>(V) &&
                                    Idx == PoisonMaskElem) ||
                                   (Idx != PoisonMaskElem && V == Scalars[Idx]);
                          });
      };
      if (!ReorderIndices.empty()) {
        // TODO: implement matching with the reordered nodes.
        SmallVector<int> Mask;
        inversePermutation(ReorderIndices, Mask);
        if (VL.size() == Scalars.size())
          return IsSame(Scalars, Mask);
        if (VL.size() == ReuseShuffleIndices.size()) {
          ::addMask(Mask, ReuseShuffleIndices);
          return IsSame(Scalars, Mask);
        }
        return false;
      }
      return IsSame(Scalars, ReuseShuffleIndices);
    }
    /// \returns true if current entry has same operands as \p TE.
    bool hasEqualOperands(const TreeEntry &TE) const {
      if (TE.getNumOperands() != getNumOperands())
        return false;
      SmallBitVector Used(getNumOperands());
      for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {
        unsigned PrevCount = Used.count();
        for (unsigned K = 0; K < E; ++K) {
          if (Used.test(K))
            continue;
          if (getOperand(K) == TE.getOperand(I)) {
            Used.set(K);
            break;
          }
        }
        // Check if we actually found the matching operand.
        if (PrevCount == Used.count())
          return false;
      }
      return true;
    }

    /// \returns The number of the lanes in the node, counting reused scalars.
    unsigned getVectorFactor() const {
      if (!ReuseShuffleIndices.empty())
        return ReuseShuffleIndices.size();
      return Scalars.size();
    }

    /// Checks if the current node is a gather node.
    bool isGather() const { return State == NeedToGather; }
    /// A vector of scalars.
    ValueList Scalars;

    /// The Scalars are vectorized into this value. It is initialized to Null.
    WeakTrackingVH VectorizedValue = nullptr;

    /// Do we need to gather this sequence or vectorize it
    /// (either with vector instruction or with scatter/gather
    /// intrinsics for store/load)?
    enum EntryState {
      Vectorize,
      ScatterVectorize,
      StridedVectorize,
      CompressVectorize,
      NeedToGather,
      CombinedVectorize,
      SplitVectorize,
    };
    EntryState State;

    /// List of combined opcodes supported by the vectorizer.
    enum CombinedOpcode {
      NotCombinedOp = -1,
      MinMax = Instruction::OtherOpsEnd + 1,
    };
    CombinedOpcode CombinedOp = NotCombinedOp;

    /// Does this sequence require some shuffling?
    SmallVector<int, 4> ReuseShuffleIndices;

    /// Does this entry require reordering?
    SmallVector<unsigned, 4> ReorderIndices;

    /// Points back to the VectorizableTree.
    VecTreeTy &Container;

    /// The TreeEntry index containing the user of this entry.
    EdgeInfo UserTreeIndex;

    /// The index of this treeEntry in VectorizableTree.
    unsigned Idx = 0;

    /// For split nodes, the subnodes and insertion indices.
    SmallVector<std::pair<unsigned, unsigned>, 2> CombinedEntriesWithIndices;

  private:
    /// The operands of each instruction in each lane Operands[op_index][lane].
    SmallVector<ValueList, 2> Operands;

    /// Copyable elements of the entry node.
    SmallPtrSet<const Value *, 4> CopyableElements;

    /// MainOp and AltOp are recorded inside. S should be obtained from
    /// newTreeEntry.
    InstructionsState S = InstructionsState::invalid();

    /// Interleaving factor for interleaved loads Vectorize nodes.
    unsigned InterleaveFactor = 0;

    /// True if the node does not require scheduling.
    bool DoesNotNeedToSchedule = false;
  public:
    /// Set this bundle's \p OpIdx'th operand to \p OpVL.
    void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) {
      if (Operands.size() < OpIdx + 1)
        Operands.resize(OpIdx + 1);
      assert(Operands[OpIdx].empty() && "Already resized?");
      assert(OpVL.size() <= Scalars.size() &&
             "Number of operands is greater than the number of scalars.");
      Operands[OpIdx].resize(OpVL.size());
      copy(OpVL, Operands[OpIdx].begin());
    }

    /// \returns the interleaving factor of the node.
    unsigned getInterleaveFactor() const { return InterleaveFactor; }
    /// Sets the interleaving factor.
    void setInterleave(unsigned Factor) { InterleaveFactor = Factor; }

    /// Marks the node as one that does not require scheduling.
    void setDoesNotNeedToSchedule() { DoesNotNeedToSchedule = true; }
    /// Returns true if the node is marked as one that does not require
    /// scheduling.
    bool doesNotNeedToSchedule() const { return DoesNotNeedToSchedule; }

    /// Set this bundle's operands from \p Operands.
    void setOperands(ArrayRef<ValueList> Operands) {
      for (unsigned I : seq<unsigned>(Operands.size()))
        setOperand(I, Operands[I]);
    }

    /// Reorders operands of the node to the given mask \p Mask.
    void reorderOperands(ArrayRef<int> Mask) {
      for (ValueList &Operand : Operands)
        reorderScalars(Operand, Mask);
    }

    /// \returns the \p OpIdx operand of this TreeEntry.
    ValueList &getOperand(unsigned OpIdx) {
      assert(OpIdx < Operands.size() && "Off bounds");
      return Operands[OpIdx];
    }

    /// \returns the \p OpIdx operand of this TreeEntry.
    ArrayRef<Value *> getOperand(unsigned OpIdx) const {
      assert(OpIdx < Operands.size() && "Off bounds");
      return Operands[OpIdx];
    }

    /// \returns the number of operands.
    unsigned getNumOperands() const { return Operands.size(); }

    /// \returns the scalar operand of the \p OpIdx operand of this TreeEntry.
    Value *getSingleOperand(unsigned OpIdx) const {
      assert(OpIdx < Operands.size() && "Off bounds");
      assert(!Operands[OpIdx].empty() && "No operand available");
      return Operands[OpIdx][0];
    }
    /// Some of the instructions in the list have alternate opcodes.
    bool isAltShuffle() const { return S.isAltShuffle(); }

    Instruction *getMatchingMainOpOrAltOp(Instruction *I) const {
      return S.getMatchingMainOpOrAltOp(I);
    }

    /// Chooses the correct key for scheduling data. If \p Op has the same (or
    /// alternate) opcode as the main operation, the key is \p Op. Otherwise
    /// the key is the main operation.
    Value *isOneOf(Value *Op) const {
      auto *I = dyn_cast<Instruction>(Op);
      if (I && getMatchingMainOpOrAltOp(I))
        return Op;
      return S.getMainOp();
    }

    void setOperations(const InstructionsState &S) {
      assert(S && "InstructionsState is invalid.");
      this->S = S;
    }

    Instruction *getMainOp() const { return S.getMainOp(); }

    Instruction *getAltOp() const { return S.getAltOp(); }

    /// The main opcode for the list of instructions.
    unsigned getOpcode() const { return S.getOpcode(); }

    /// The alternate opcode for the list of instructions.
    unsigned getAltOpcode() const { return S.getAltOpcode(); }

    /// Returns true if the state of the instructions is valid.
    bool hasState() const { return S.valid(); }

    /// Add \p V to the list of copyable elements.
    void addCopyableElement(Value *V) {
      assert(S.isCopyableElement(V) && "Not a copyable element.");
      CopyableElements.insert(V);
    }

    /// Returns true if \p V is a copyable element.
    bool isCopyableElement(Value *V) const {
      return CopyableElements.contains(V);
    }

    /// Returns true if any scalar in the list is a copyable element.
    bool hasCopyableElements() const { return !CopyableElements.empty(); }

    /// Returns the state of the operations.
    const InstructionsState &getOperations() const { return S; }
    /// When ReuseShuffleIndices is empty it just returns position of \p V
    /// within vector of Scalars. Otherwise, try to remap on its reuse index.
    unsigned findLaneForValue(Value *V) const {
      unsigned FoundLane = getVectorFactor();
      for (auto *It = find(Scalars, V), *End = Scalars.end(); It != End;
           std::advance(It, 1)) {
        if (*It != V)
          continue;
        FoundLane = std::distance(Scalars.begin(), It);
        assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
        if (!ReorderIndices.empty())
          FoundLane = ReorderIndices[FoundLane];
        assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
        if (ReuseShuffleIndices.empty())
          break;
        if (auto *RIt = find(ReuseShuffleIndices, FoundLane);
            RIt != ReuseShuffleIndices.end()) {
          FoundLane = std::distance(ReuseShuffleIndices.begin(), RIt);
          break;
        }
      }
      assert(FoundLane < getVectorFactor() && "Unable to find given value.");
      return FoundLane;
    }
    /// Build a shuffle mask for graph entry which represents a merge of main
    /// and alternate operations.
    void
    buildAltOpShuffleMask(const function_ref<bool(Instruction *)> IsAltOp,
                          SmallVectorImpl<int> &Mask,
                          SmallVectorImpl<Value *> *OpScalars = nullptr,
                          SmallVectorImpl<Value *> *AltScalars = nullptr) const;

    /// Return true if this is a non-power-of-2 node.
    bool isNonPowOf2Vec() const {
      bool IsNonPowerOf2 = !has_single_bit(Scalars.size());
      return IsNonPowerOf2;
    }

    /// Return true if this is a node, which tries to vectorize number of
    /// elements, forming whole vectors.
    bool
    hasNonWholeRegisterOrNonPowerOf2Vec(const TargetTransformInfo &TTI) const {
      bool IsNonPowerOf2 = !hasFullVectorsOrPowerOf2(
          TTI, getValueType(Scalars.front()), Scalars.size());
      assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) &&
             "Reshuffling not supported with non-power-of-2 vectors yet.");
      return IsNonPowerOf2;
    }

    Value *getOrdered(unsigned Idx) const {
      assert(isGather() && "Must be used only for buildvectors/gathers.");
      if (ReorderIndices.empty())
        return Scalars[Idx];
      SmallVector<int> Mask;
      inversePermutation(ReorderIndices, Mask);
      return Scalars[Mask[Idx]];
    }
#ifndef NDEBUG
    /// Debug printer.
    LLVM_DUMP_METHOD void dump() const {
      dbgs() << Idx << ".\n";
      for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
        dbgs() << "Operand " << OpI << ":\n";
        for (const Value *V : Operands[OpI])
          dbgs().indent(2) << *V << "\n";
      }
      dbgs() << "Scalars: \n";
      for (Value *V : Scalars)
        dbgs().indent(2) << *V << "\n";
      dbgs() << "State: ";
      if (S && hasCopyableElements())
        dbgs() << "[[Copyable]] ";
      switch (State) {
      case Vectorize:
        if (InterleaveFactor > 0) {
          dbgs() << "Vectorize with interleave factor " << InterleaveFactor
                 << "\n";
        } else {
          dbgs() << "Vectorize\n";
        }
        break;
      case ScatterVectorize:
        dbgs() << "ScatterVectorize\n";
        break;
      case StridedVectorize:
        dbgs() << "StridedVectorize\n";
        break;
      case CompressVectorize:
        dbgs() << "CompressVectorize\n";
        break;
      case NeedToGather:
        dbgs() << "NeedToGather\n";
        break;
      case CombinedVectorize:
        dbgs() << "CombinedVectorize\n";
        break;
      case SplitVectorize:
        dbgs() << "SplitVectorize\n";
        break;
      }
      if (S) {
        dbgs() << "MainOp: " << *S.getMainOp() << "\n";
        dbgs() << "AltOp: " << *S.getAltOp() << "\n";
      } else {
        dbgs() << "MainOp: NULL\n";
        dbgs() << "AltOp: NULL\n";
      }
      dbgs() << "VectorizedValue: ";
      if (VectorizedValue)
        dbgs() << *VectorizedValue << "\n";
      else
        dbgs() << "NULL\n";
      dbgs() << "ReuseShuffleIndices: ";
      if (ReuseShuffleIndices.empty())
        dbgs() << "Empty";
      else
        for (int ReuseIdx : ReuseShuffleIndices)
          dbgs() << ReuseIdx << ", ";
      dbgs() << "\n";
      dbgs() << "ReorderIndices: ";
      for (unsigned ReorderIdx : ReorderIndices)
        dbgs() << ReorderIdx << ", ";
      dbgs() << "\n";
      dbgs() << "UserTreeIndex: ";
      if (UserTreeIndex)
        dbgs() << UserTreeIndex;
      else
        dbgs() << "<invalid>";
      dbgs() << "\n";
      if (!CombinedEntriesWithIndices.empty()) {
        dbgs() << "Combined entries: ";
        interleaveComma(CombinedEntriesWithIndices, dbgs(), [&](const auto &P) {
          dbgs() << "Entry index " << P.first << " with offset " << P.second;
        });
        dbgs() << "\n";
      }
    }
#endif
  };
#ifndef NDEBUG
  void dumpTreeCosts(const TreeEntry *E, InstructionCost ReuseShuffleCost,
                     InstructionCost VecCost, InstructionCost ScalarCost,
                     StringRef Banner) const {
    dbgs() << "SLP: " << Banner << ":\n";
    E->dump();
    dbgs() << "SLP: Costs:\n";
    dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n";
    dbgs() << "SLP: VectorCost = " << VecCost << "\n";
    dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n";
    dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = "
           << ReuseShuffleCost + VecCost - ScalarCost << "\n";
  }
#endif
  /// Create a new VectorizableTree entry.
  TreeEntry *newTreeEntry(ArrayRef<Value *> VL, const InstructionsState &S,
                          const EdgeInfo &UserTreeIdx,
                          ArrayRef<int> ReuseShuffleIndices = {}) {
    auto Invalid = ScheduleBundle::invalid();
    return newTreeEntry(VL, Invalid, S, UserTreeIdx, ReuseShuffleIndices);
  }

  TreeEntry *newTreeEntry(ArrayRef<Value *> VL, ScheduleBundle &Bundle,
                          const InstructionsState &S,
                          const EdgeInfo &UserTreeIdx,
                          ArrayRef<int> ReuseShuffleIndices = {},
                          ArrayRef<unsigned> ReorderIndices = {},
                          unsigned InterleaveFactor = 0) {
    TreeEntry::EntryState EntryState =
        Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
    TreeEntry *E = newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
                                ReuseShuffleIndices, ReorderIndices);
    if (E && InterleaveFactor > 0)
      E->setInterleave(InterleaveFactor);
    return E;
  }

  TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
                          TreeEntry::EntryState EntryState,
                          ScheduleBundle &Bundle, const InstructionsState &S,
                          const EdgeInfo &UserTreeIdx,
                          ArrayRef<int> ReuseShuffleIndices = {},
                          ArrayRef<unsigned> ReorderIndices = {}) {
    assert(((!Bundle && (EntryState == TreeEntry::NeedToGather ||
                         EntryState == TreeEntry::SplitVectorize)) ||
            (Bundle && EntryState != TreeEntry::NeedToGather &&
             EntryState != TreeEntry::SplitVectorize)) &&
           "Need to vectorize gather entry?");
    // Gathered loads still gathered? Do not create entry, use the original
    // one.
    if (GatheredLoadsEntriesFirst.has_value() &&
        EntryState == TreeEntry::NeedToGather && S &&
        S.getOpcode() == Instruction::Load && UserTreeIdx.EdgeIdx == UINT_MAX &&
        !UserTreeIdx.UserTE)
      return nullptr;
    VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
    TreeEntry *Last = VectorizableTree.back().get();
    Last->Idx = VectorizableTree.size() - 1;
    Last->State = EntryState;
    if (UserTreeIdx.UserTE)
      OperandsToTreeEntry.try_emplace(
          std::make_pair(UserTreeIdx.UserTE, UserTreeIdx.EdgeIdx), Last);
    // FIXME: Remove once support for ReuseShuffleIndices has been implemented
    // for non-power-of-two vectors.
    assert((hasFullVectorsOrPowerOf2(*TTI, getValueType(VL.front()),
                                     VL.size()) ||
            ReuseShuffleIndices.empty()) &&
           "Reshuffling scalars not yet supported for nodes with padding");
    Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
                                     ReuseShuffleIndices.end());
    if (ReorderIndices.empty()) {
      Last->Scalars.assign(VL.begin(), VL.end());
      if (S)
        Last->setOperations(S);
    } else {
      // Reorder scalars and build final mask.
      Last->Scalars.assign(VL.size(), nullptr);
      transform(ReorderIndices, Last->Scalars.begin(),
                [VL](unsigned Idx) -> Value * {
                  if (Idx >= VL.size())
                    return UndefValue::get(VL.front()->getType());
                  return VL[Idx];
                });
      InstructionsState S = getSameOpcode(Last->Scalars, *TLI);
      if (S)
        Last->setOperations(S);
      Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
    }
    if (EntryState == TreeEntry::SplitVectorize) {
      assert(S && "Split nodes must have operations.");
      Last->setOperations(S);
      SmallPtrSet<Value *, 4> Processed;
      for (Value *V : VL) {
        auto *I = dyn_cast<Instruction>(V);
        if (!I)
          continue;
        auto It = ScalarsInSplitNodes.find(V);
        if (It == ScalarsInSplitNodes.end()) {
          ScalarsInSplitNodes.try_emplace(V).first->getSecond().push_back(
              Last);
          (void)Processed.insert(V);
        } else if (Processed.insert(V).second) {
          assert(!is_contained(It->getSecond(), Last) &&
                 "Value already associated with the node.");
          It->getSecond().push_back(Last);
        }
      }
    } else if (!Last->isGather()) {
      if (doesNotNeedToSchedule(VL) ||
          (!S.areInstructionsWithCopyableElements() &&
           isVectorLikeInstWithConstOps(S.getMainOp())) ||
          all_of(VL, [&](Value *V) { return S.isNonSchedulable(V); }))
        Last->setDoesNotNeedToSchedule();
      SmallPtrSet<Value *, 4> Processed;
      for (Value *V : VL) {
        if (isa<PoisonValue>(V))
          continue;
        if (S.isCopyableElement(V)) {
          Last->addCopyableElement(V);
          continue;
        }
        auto It = ScalarToTreeEntries.find(V);
        if (It == ScalarToTreeEntries.end()) {
          ScalarToTreeEntries.try_emplace(V).first->getSecond().push_back(
              Last);
          (void)Processed.insert(V);
        } else if (Processed.insert(V).second) {
          assert(!is_contained(It->getSecond(), Last) &&
                 "Value already associated with the node.");
          It->getSecond().push_back(Last);
        }
      }
      assert((!Bundle.getBundle().empty() || Last->doesNotNeedToSchedule()) &&
             "Bundle and VL out of sync");
      if (!Bundle.getBundle().empty()) {
#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
        auto *BundleMember = Bundle.getBundle().begin();
        SmallPtrSet<Value *, 4> Processed;
        for (Value *V : VL) {
          if (S.isNonSchedulable(V) || !Processed.insert(V).second)
            continue;
          ++BundleMember;
        }
        assert(BundleMember == Bundle.getBundle().end() &&
               "Bundle and VL out of sync");
#endif
        Bundle.setTreeEntry(Last);
      }
    } else {
      // Build a map for gathered scalars to the nodes where they are used.
      bool AllConstsOrCasts = true;
      for (Value *V : VL) {
        if (S && S.areInstructionsWithCopyableElements() &&
            S.isCopyableElement(V))
          Last->addCopyableElement(V);
        if (!isConstant(V)) {
          auto *I = dyn_cast<CastInst>(V);
          AllConstsOrCasts &= I && I->getType()->isIntegerTy();
          if (UserTreeIdx.EdgeIdx != UINT_MAX || !UserTreeIdx.UserTE ||
              !UserTreeIdx.UserTE->isGather())
            ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last);
        }
      }
      if (AllConstsOrCasts)
        CastMaxMinBWSizes =
            std::make_pair(std::numeric_limits<unsigned>::max(), 1);
      MustGather.insert_range(VL);
    }

    if (UserTreeIdx.UserTE)
      Last->UserTreeIndex = UserTreeIdx;
    return Last;
  }
  /// -- Vectorization State --
  /// Holds all of the tree entries.
  TreeEntry::VecTreeTy VectorizableTree;

#ifndef NDEBUG
  /// Debug printer.
  LLVM_DUMP_METHOD void dumpVectorizableTree() const {
    for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
      VectorizableTree[Id]->dump();
      dbgs() << "\n";
    }
  }
#endif

  /// Get list of vector entries, associated with the value \p V.
  ArrayRef<TreeEntry *> getTreeEntries(const Value *V) const {
    assert(V && "V cannot be nullptr.");
    auto It = ScalarToTreeEntries.find(V);
    if (It == ScalarToTreeEntries.end())
      return {};
    return It->getSecond();
  }

  /// Get list of split vector entries, associated with the value \p V.
  ArrayRef<TreeEntry *> getSplitTreeEntries(const Value *V) const {
    assert(V && "V cannot be nullptr.");
    auto It = ScalarsInSplitNodes.find(V);
    if (It == ScalarsInSplitNodes.end())
      return {};
    return It->getSecond();
  }

  /// Returns first vector node for value \p V, matching values \p VL.
  TreeEntry *getSameValuesTreeEntry(Value *V, ArrayRef<Value *> VL,
                                    bool SameVF = false) const {
    assert(V && "V cannot be nullptr.");
    for (TreeEntry *TE : ScalarToTreeEntries.lookup(V))
      if ((!SameVF || TE->getVectorFactor() == VL.size()) && TE->isSame(VL))
        return TE;
    return nullptr;
  }
  /// Check that the operand node of alternate node does not generate
  /// more alternate nodes.
  bool areAltOperandsProfitable(const InstructionsState &S,
                                ArrayRef<Value *> VL) const;

  /// Contains all the outputs of legality analysis for a list of values.
  class ScalarsVectorizationLegality {
    InstructionsState S;
    bool IsLegal;
    bool TryToFindDuplicates;
    bool TrySplitVectorize;

  public:
    ScalarsVectorizationLegality(InstructionsState S, bool IsLegal,
                                 bool TryToFindDuplicates = true,
                                 bool TrySplitVectorize = false)
        : S(S), IsLegal(IsLegal), TryToFindDuplicates(TryToFindDuplicates),
          TrySplitVectorize(TrySplitVectorize) {
      assert((!IsLegal || (S.valid() && TryToFindDuplicates)) &&
             "Inconsistent state");
    }
    const InstructionsState &getInstructionsState() const { return S; };
    bool isLegal() const { return IsLegal; }
    bool tryToFindDuplicates() const { return TryToFindDuplicates; }
    bool trySplitVectorize() const { return TrySplitVectorize; }
  };

  /// Checks if the specified list of the instructions/values can be vectorized
  /// in general.
  ScalarsVectorizationLegality
  getScalarsVectorizationLegality(ArrayRef<Value *> VL, unsigned Depth,
                                  const EdgeInfo &UserTreeIdx,
                                  bool TryCopyableElementsVectorization) const;

  /// Checks if the specified list of the instructions/values can be vectorized
  /// and fills required data before actual scheduling of the instructions.
  TreeEntry::EntryState getScalarsVectorizationState(
      const InstructionsState &S, ArrayRef<Value *> VL,
      bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
      SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo);
  /// Maps a specific scalar to its tree entry(ies).
  SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarToTreeEntries;

  /// Maps the operand index and entry to the corresponding tree entry.
  SmallDenseMap<std::pair<const TreeEntry *, unsigned>, TreeEntry *>
      OperandsToTreeEntry;

  /// Scalars, used in split vectorize nodes.
  SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarsInSplitNodes;

  /// Maps a value to the proposed vectorizable size.
  SmallDenseMap<Value *, unsigned> InstrElementSize;

  /// A map between the vectorized entries and the last instructions in the
  /// bundles.
  SmallDenseMap<const TreeEntry *, WeakTrackingVH> EntryToLastInstruction;

  /// A map between the last instruction in the bundle and its position.
  SmallDenseMap<const Instruction *, Instruction *> LastInstructionToPos;

  /// List of gather nodes, depending on other gather/vector nodes, which
  /// should be emitted after the vector instruction emission process to
  /// correctly handle order of the vector instructions and shuffles.
  SetVector<const TreeEntry *> PostponedGathers;

  using ValueToGatherNodesMap =
      DenseMap<Value *, SmallSetVector<const TreeEntry *, 4>>;
  ValueToGatherNodesMap ValueToGatherNodes;

  /// The list of entries, to be vectorized as gathered loads.
  SetVector<unsigned> LoadEntriesToVectorize;

  /// The graph-transformation mode flag.
  bool IsGraphTransformMode = false;

  /// The index of the first gathered-load entry in the VectorizableTree.
  std::optional<unsigned> GatheredLoadsEntriesFirst;

  /// Per-entry data for CompressVectorize (masked load with compress) nodes.
  SmallDenseMap<const TreeEntry *,
                std::tuple<SmallVector<int>, VectorType *, unsigned, bool>>
      CompressEntryToData;
4627 struct ExternalUser {
4628 ExternalUser(
Value *S, llvm::User *U,
const TreeEntry &E,
unsigned L)
4629 : Scalar(S), User(
U), E(E), Lane(
L) {}
4632 Value *Scalar =
nullptr;
4635 llvm::User *User =
nullptr;
4643 using UserList = SmallVector<ExternalUser, 16>;
4649 bool isAliased(
const MemoryLocation &Loc1, Instruction *Inst1,
4650 Instruction *Inst2) {
4653 AliasCacheKey
Key = std::make_pair(Inst1, Inst2);
4654 auto Res = AliasCache.try_emplace(
Key);
4656 return Res.first->second;
4657 bool Aliased =
isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1));
4659 Res.first->getSecond() = Aliased;
4663 using AliasCacheKey = std::pair<Instruction *, Instruction *>;
4667 SmallDenseMap<AliasCacheKey, bool> AliasCache;
4672 BatchAAResults BatchAA;
4679 DenseSet<Instruction *> DeletedInstructions;
4682 SmallPtrSet<Instruction *, 16> AnalyzedReductionsRoots;
4685 DenseSet<size_t> AnalyzedReductionVals;
4689 DenseSet<Value *> AnalyzedMinBWVals;
4695 UserList ExternalUses;
4699 SmallPtrSet<Value *, 4> ExternalUsesAsOriginalScalar;
4703 SmallPtrSet<Value *, 4> ExternalUsesWithNonUsers;
4706 SmallPtrSet<const Value *, 32> EphValues;
4710 SetVector<Instruction *> GatherShuffleExtractSeq;
4713 DenseSet<BasicBlock *> CSEBlocks;
4716 DenseSet<size_t> ListOfKnonwnNonVectorizableLoads;
4723 class ScheduleEntity {
4724 friend class ScheduleBundle;
4725 friend class ScheduleData;
4726 friend class ScheduleCopyableData;
4729 enum class Kind { ScheduleData, ScheduleBundle, ScheduleCopyableData };
4730 Kind getKind()
const {
return K; }
4731 ScheduleEntity(Kind K) : K(K) {}
4735 int SchedulingPriority = 0;
4738 bool IsScheduled =
false;
4740 const Kind K = Kind::ScheduleData;
4743 ScheduleEntity() =
delete;
4745 void setSchedulingPriority(
int Priority) { SchedulingPriority = Priority; }
4746 int getSchedulingPriority()
const {
return SchedulingPriority; }
4747 bool isReady()
const {
4749 return SD->isReady();
4751 return CD->isReady();
4757 bool hasValidDependencies()
const {
4759 return SD->hasValidDependencies();
4761 return CD->hasValidDependencies();
4765 int getUnscheduledDeps()
const {
4767 return SD->getUnscheduledDeps();
4769 return CD->getUnscheduledDeps();
4773 int incrementUnscheduledDeps(
int Incr) {
4775 return SD->incrementUnscheduledDeps(Incr);
4779 int getDependencies()
const {
4781 return SD->getDependencies();
4787 return SD->getInst();
4792 bool isScheduled()
const {
return IsScheduled; }
4793 void setScheduled(
bool Scheduled) { IsScheduled = Scheduled; }
4795 static bool classof(
const ScheduleEntity *) {
return true; }
4797#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4798 void dump(raw_ostream &OS)
const {
4800 return SD->dump(OS);
4802 return CD->dump(OS);
4813#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4815 const BoUpSLP::ScheduleEntity &SE) {
4825 class ScheduleData final :
public ScheduleEntity {
4829 enum { InvalidDeps = -1 };
4831 ScheduleData() : ScheduleEntity(Kind::ScheduleData) {}
4832 static bool classof(
const ScheduleEntity *Entity) {
4833 return Entity->getKind() == Kind::ScheduleData;
4836 void init(
int BlockSchedulingRegionID, Instruction *
I) {
4837 NextLoadStore =
nullptr;
4838 IsScheduled =
false;
4839 SchedulingRegionID = BlockSchedulingRegionID;
4840 clearDependencies();
4846 if (hasValidDependencies()) {
4847 assert(UnscheduledDeps <= Dependencies &&
"invariant");
4849 assert(UnscheduledDeps == Dependencies &&
"invariant");
4853 assert(hasValidDependencies() && UnscheduledDeps == 0 &&
4854 "unexpected scheduled state");
4861 bool hasValidDependencies()
const {
return Dependencies != InvalidDeps; }
4865 bool isReady()
const {
return UnscheduledDeps == 0 && !IsScheduled; }
4870 int incrementUnscheduledDeps(
int Incr) {
4871 assert(hasValidDependencies() &&
4872 "increment of unscheduled deps would be meaningless");
4873 UnscheduledDeps += Incr;
4874 assert(UnscheduledDeps >= 0 &&
4875 "Expected valid number of unscheduled deps");
4876 return UnscheduledDeps;
4881 void resetUnscheduledDeps() { UnscheduledDeps = Dependencies; }
4884 void clearDependencies() {
4885 clearDirectDependencies();
4886 MemoryDependencies.clear();
4887 ControlDependencies.clear();
4894 void clearDirectDependencies() {
4895 Dependencies = InvalidDeps;
4896 resetUnscheduledDeps();
4897 IsScheduled =
false;
4901 int getUnscheduledDeps()
const {
return UnscheduledDeps; }
4903 int getDependencies()
const {
return Dependencies; }
4905 void initDependencies() { Dependencies = 0; }
4907 void incDependencies() { Dependencies++; }
4910 int getSchedulingRegionID()
const {
return SchedulingRegionID; }
4917 return MemoryDependencies;
4920 void addMemoryDependency(ScheduleData *Dep) {
4921 MemoryDependencies.push_back(Dep);
4925 return ControlDependencies;
4928 void addControlDependency(ScheduleData *Dep) {
4929 ControlDependencies.push_back(Dep);
4932 ScheduleData *getNextLoadStore()
const {
return NextLoadStore; }
4933 void setNextLoadStore(ScheduleData *
Next) { NextLoadStore =
Next; }
4935 void dump(raw_ostream &OS)
const { OS << *Inst; }
4947 ScheduleData *NextLoadStore =
nullptr;
4951 SmallVector<ScheduleData *> MemoryDependencies;
4957 SmallVector<ScheduleData *> ControlDependencies;
4961 int SchedulingRegionID = 0;
4967 int Dependencies = InvalidDeps;
4973 int UnscheduledDeps = InvalidDeps;
4978 const BoUpSLP::ScheduleData &SD) {
4984 class ScheduleBundle final :
public ScheduleEntity {
4988 bool IsValid =
true;
4990 TreeEntry *TE =
nullptr;
4991 ScheduleBundle(
bool IsValid)
4992 : ScheduleEntity(Kind::ScheduleBundle), IsValid(IsValid) {}
4995 ScheduleBundle() : ScheduleEntity(Kind::ScheduleBundle) {}
4996 static bool classof(
const ScheduleEntity *Entity) {
4997 return Entity->getKind() == Kind::ScheduleBundle;
5002 for (
const ScheduleEntity *SD : Bundle) {
5003 if (SD->hasValidDependencies()) {
5004 assert(SD->getUnscheduledDeps() <= SD->getDependencies() &&
5007 assert(SD->getUnscheduledDeps() == SD->getDependencies() &&
5011 if (isScheduled()) {
5012 assert(SD->hasValidDependencies() && SD->getUnscheduledDeps() == 0 &&
5013 "unexpected scheduled state");
5019 int unscheduledDepsInBundle()
const {
5020 assert(*
this &&
"bundle must not be empty");
5022 for (
const ScheduleEntity *BundleMember : Bundle) {
5023 if (BundleMember->getUnscheduledDeps() == ScheduleData::InvalidDeps)
5024 return ScheduleData::InvalidDeps;
5025 Sum += BundleMember->getUnscheduledDeps();
5033 bool hasValidDependencies()
const {
5034 return all_of(Bundle, [](
const ScheduleEntity *SD) {
5035 return SD->hasValidDependencies();
5041 bool isReady()
const {
5042 assert(*
this &&
"bundle must not be empty");
5043 return unscheduledDepsInBundle() == 0 && !isScheduled();
5051 void add(ScheduleEntity *SD) { Bundle.push_back(SD); }
5054 void setTreeEntry(TreeEntry *TE) { this->TE = TE; }
5055 TreeEntry *getTreeEntry()
const {
return TE; }
5057 static ScheduleBundle invalid() {
return {
false}; }
5059 operator bool()
const {
return IsValid; }
5062 void dump(raw_ostream &OS)
const {
5071 OS << *SD->getInst();
5085 const BoUpSLP::ScheduleBundle &Bundle) {
5096 class ScheduleCopyableData final :
public ScheduleEntity {
5103 int SchedulingRegionID = 0;
5105 ScheduleBundle &Bundle;
5108 ScheduleCopyableData(
int BlockSchedulingRegionID,
Instruction *
I,
5109 const EdgeInfo &EI, ScheduleBundle &Bundle)
5110 : ScheduleEntity(Kind::ScheduleCopyableData), Inst(
I), EI(EI),
5111 SchedulingRegionID(BlockSchedulingRegionID), Bundle(Bundle) {}
5112 static bool classof(
const ScheduleEntity *Entity) {
5113 return Entity->getKind() == Kind::ScheduleCopyableData;
5118 if (hasValidDependencies()) {
5119 assert(UnscheduledDeps <= Dependencies &&
"invariant");
5121 assert(UnscheduledDeps == Dependencies &&
"invariant");
5125 assert(hasValidDependencies() && UnscheduledDeps == 0 &&
5126 "unexpected scheduled state");
5133 bool hasValidDependencies()
const {
5134 return Dependencies != ScheduleData::InvalidDeps;
5139 bool isReady()
const {
return UnscheduledDeps == 0 && !IsScheduled; }
5144 int incrementUnscheduledDeps(
int Incr) {
5145 assert(hasValidDependencies() &&
5146 "increment of unscheduled deps would be meaningless");
5147 UnscheduledDeps += Incr;
5148 assert(UnscheduledDeps >= 0 &&
"invariant");
5149 return UnscheduledDeps;
5154 void resetUnscheduledDeps() { UnscheduledDeps = Dependencies; }
5157 int getUnscheduledDeps()
const {
return UnscheduledDeps; }
5159 int getDependencies()
const {
return Dependencies; }
5161 void initDependencies() { Dependencies = 0; }
5163 void incDependencies() { Dependencies++; }
5166 int getSchedulingRegionID()
const {
return SchedulingRegionID; }
5172 void clearDependencies() {
5173 Dependencies = ScheduleData::InvalidDeps;
5174 UnscheduledDeps = ScheduleData::InvalidDeps;
5175 IsScheduled =
false;
5179 const EdgeInfo &getEdgeInfo()
const {
return EI; }
5182 ScheduleBundle &getBundle() {
return Bundle; }
5183 const ScheduleBundle &getBundle()
const {
return Bundle; }
5185#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
5186 void dump(raw_ostream &OS)
const { OS <<
"[Copyable]" << *getInst(); }
5197 int Dependencies = ScheduleData::InvalidDeps;
5203 int UnscheduledDeps = ScheduleData::InvalidDeps;
5233 struct BlockScheduling {
5235 : BB(BB), ChunkSize(BB->
size()), ChunkPos(ChunkSize) {}
5238 ScheduledBundles.clear();
5239 ScheduledBundlesList.
clear();
5240 ScheduleCopyableDataMap.clear();
5241 ScheduleCopyableDataMapByInst.clear();
5242 ScheduleCopyableDataMapByInstUser.clear();
5243 ScheduleCopyableDataMapByUsers.clear();
5245 ScheduleStart =
nullptr;
5246 ScheduleEnd =
nullptr;
5247 FirstLoadStoreInRegion =
nullptr;
5248 LastLoadStoreInRegion =
nullptr;
5249 RegionHasStackSave =
false;
5253 ScheduleRegionSizeLimit -= ScheduleRegionSize;
5256 ScheduleRegionSize = 0;
5260 ++SchedulingRegionID;
5263 ScheduleData *getScheduleData(Instruction *
I) {
5266 if (BB !=
I->getParent())
5269 ScheduleData *SD = ScheduleDataMap.lookup(
I);
5270 if (SD && isInSchedulingRegion(*SD))
5275 ScheduleData *getScheduleData(
Value *V) {
5281 ScheduleCopyableData *getScheduleCopyableData(
const EdgeInfo &EI,
5282 const Value *V)
const {
5283 if (ScheduleCopyableDataMap.empty())
5285 auto It = ScheduleCopyableDataMap.find(std::make_pair(EI, V));
5286 if (It == ScheduleCopyableDataMap.end())
5288 ScheduleCopyableData *SD = It->getSecond().get();
5289 if (!isInSchedulingRegion(*SD))
5297 getScheduleCopyableData(
const Value *User,
unsigned OperandIdx,
5299 if (ScheduleCopyableDataMapByInstUser.empty())
5301 const auto It = ScheduleCopyableDataMapByInstUser.find(
5302 std::make_pair(std::make_pair(User, OperandIdx), V));
5303 if (It == ScheduleCopyableDataMapByInstUser.end())
5306 for (ScheduleCopyableData *SD : It->getSecond()) {
5307 if (isInSchedulingRegion(*SD))
5321 bool areAllOperandsReplacedByCopyableData(Instruction *User,
5325 if (ScheduleCopyableDataMap.empty())
5327 SmallDenseMap<TreeEntry *, unsigned> PotentiallyReorderedEntriesCount;
5328 SmallDenseMap<const TreeEntry *, unsigned> OrderedEntriesCount;
5329 for (
const Use &U :
User->operands()) {
5333 if (Entries.
empty())
5337 for (TreeEntry *TE : Entries) {
5339 bool IsNonSchedulableWithParentPhiNode =
5340 TE->doesNotNeedToSchedule() &&
TE->UserTreeIndex &&
5341 TE->UserTreeIndex.UserTE->hasState() &&
5342 TE->UserTreeIndex.UserTE->State != TreeEntry::SplitVectorize &&
5343 TE->UserTreeIndex.UserTE->getOpcode() == Instruction::PHI;
5346 if (IsNonSchedulableWithParentPhiNode) {
5347 SmallPtrSet<Value *, 4> ParentsUniqueUsers;
5348 const TreeEntry *ParentTE =
TE->UserTreeIndex.UserTE;
5349 for (
Value *V : ParentTE->Scalars) {
5353 if (ParentsUniqueUsers.
insert(
PHI).second &&
5366 bool IsCommutativeUser =
5371 OrderedEntriesCount.
try_emplace(TE, 0).first->getSecond();
5372 EdgeInfo EI(TE,
U.getOperandNo());
5373 if (!getScheduleCopyableData(EI,
Op))
5379 PotentiallyReorderedEntriesCount.
try_emplace(TE, 0)
5380 .first->getSecond() += Inc;
5383 if (PotentiallyReorderedEntriesCount.
empty())
5384 return all_of(OrderedEntriesCount,
5385 [&](
const std::pair<const TreeEntry *, unsigned> &
P) {
5389 for (
auto &
P : PotentiallyReorderedEntriesCount) {
5390 SmallPtrSet<Value *, 4> ParentsUniqueUsers;
5391 bool IsNonSchedulableWithParentPhiNode =
5392 P.first->doesNotNeedToSchedule() &&
P.first->UserTreeIndex &&
5393 P.first->UserTreeIndex.UserTE->hasState() &&
5394 P.first->UserTreeIndex.UserTE->State != TreeEntry::SplitVectorize &&
5395 P.first->UserTreeIndex.UserTE->getOpcode() == Instruction::PHI;
5396 auto *It =
find(
P.first->Scalars, User);
5398 assert(It !=
P.first->Scalars.end() &&
5399 "User is not in the tree entry");
5400 int Lane = std::distance(
P.first->Scalars.begin(), It);
5401 assert(Lane >= 0 &&
"Lane is not found");
5403 Lane =
P.first->ReorderIndices[Lane];
5404 assert(Lane <
static_cast<int>(
P.first->Scalars.size()) &&
5405 "Couldn't find extract lane");
5408 if (IsNonSchedulableWithParentPhiNode) {
5409 const TreeEntry *ParentTE =
P.first->UserTreeIndex.UserTE;
5411 if (!ParentsUniqueUsers.
insert(User).second) {
5417 for (
unsigned OpIdx :
5419 P.first->getMainOp()))) {
5420 if (
P.first->getOperand(
OpIdx)[Lane] ==
Op &&
5421 getScheduleCopyableData(EdgeInfo(
P.first,
OpIdx),
Op))
5425 if (!IsNonSchedulableWithParentPhiNode)
5428 }
while (It !=
P.first->Scalars.end());
5430 return all_of(PotentiallyReorderedEntriesCount,
5431 [&](
const std::pair<const TreeEntry *, unsigned> &
P) {
5432 return P.second ==
NumOps - 1;
5434 all_of(OrderedEntriesCount,
5435 [&](
const std::pair<const TreeEntry *, unsigned> &
P) {
5441 getScheduleCopyableData(
const Instruction *
I)
const {
5442 if (ScheduleCopyableDataMapByInst.empty())
5444 const auto It = ScheduleCopyableDataMapByInst.find(
I);
5445 if (It == ScheduleCopyableDataMapByInst.end())
5448 for (ScheduleCopyableData *SD : It->getSecond()) {
5449 if (isInSchedulingRegion(*SD))
5456 getScheduleCopyableDataUsers(
const Instruction *User)
const {
5457 if (ScheduleCopyableDataMapByUsers.empty())
5459 const auto It = ScheduleCopyableDataMapByUsers.find(User);
5460 if (It == ScheduleCopyableDataMapByUsers.end())
5463 for (ScheduleCopyableData *SD : It->getSecond()) {
5464 if (isInSchedulingRegion(*SD))
5470 ScheduleCopyableData &addScheduleCopyableData(
const EdgeInfo &EI,
5472 int SchedulingRegionID,
5473 ScheduleBundle &Bundle) {
5474 assert(!getScheduleCopyableData(EI,
I) &&
"already in the map");
5475 ScheduleCopyableData *CD =
5476 ScheduleCopyableDataMap
5477 .try_emplace(std::make_pair(EI,
I),
5478 std::make_unique<ScheduleCopyableData>(
5479 SchedulingRegionID,
I, EI, Bundle))
5482 ScheduleCopyableDataMapByInst[
I].push_back(CD);
5486 assert(It !=
Op.end() &&
"Lane not set");
5487 SmallPtrSet<Instruction *, 4> Visited;
5489 int Lane = std::distance(
Op.begin(), It);
5490 assert(Lane >= 0 &&
"Lane not set");
5492 !EI.UserTE->ReorderIndices.empty())
5493 Lane = EI.UserTE->ReorderIndices[Lane];
5494 assert(Lane <
static_cast<int>(EI.UserTE->Scalars.size()) &&
5495 "Couldn't find extract lane");
5497 if (!Visited.
insert(In).second) {
5501 ScheduleCopyableDataMapByInstUser
5502 .try_emplace(std::make_pair(std::make_pair(In, EI.EdgeIdx),
I))
5505 ScheduleCopyableDataMapByUsers.try_emplace(
I)
5512 EdgeInfo UserEI = EI.UserTE->UserTreeIndex;
5513 if (ScheduleCopyableData *UserCD =
5514 getScheduleCopyableData(UserEI, In))
5515 ScheduleCopyableDataMapByUsers[
I].remove(UserCD);
5518 }
while (It !=
Op.end());
5520 ScheduleCopyableDataMapByUsers.try_emplace(
I).first->getSecond().insert(
5530 auto It = ScheduledBundles.find(
I);
5531 if (It == ScheduledBundles.end())
5533 return It->getSecond();
5537 bool isInSchedulingRegion(
const ScheduleEntity &SD)
const {
5539 return Data->getSchedulingRegionID() == SchedulingRegionID;
5541 return CD->getSchedulingRegionID() == SchedulingRegionID;
5543 [&](
const ScheduleEntity *BundleMember) {
5544 return isInSchedulingRegion(*BundleMember);
5550 template <
typename ReadyListType>
5551 void schedule(
const BoUpSLP &R,
const InstructionsState &S,
5552 const EdgeInfo &EI, ScheduleEntity *
Data,
5553 ReadyListType &ReadyList) {
5554 auto ProcessBundleMember = [&](ScheduleEntity *BundleMember,
5559 auto DecrUnsched = [&](
auto *
Data,
bool IsControl =
false) {
5560 if ((IsControl ||
Data->hasValidDependencies()) &&
5561 Data->incrementUnscheduledDeps(-1) == 0) {
5568 CopyableBundle.
push_back(&CD->getBundle());
5569 Bundles = CopyableBundle;
5571 Bundles = getScheduleBundles(
Data->getInst());
5573 if (!Bundles.
empty()) {
5574 for (ScheduleBundle *Bundle : Bundles) {
5575 if (Bundle->unscheduledDepsInBundle() == 0) {
5576 assert(!Bundle->isScheduled() &&
5577 "already scheduled bundle gets ready");
5578 ReadyList.insert(Bundle);
5580 <<
"SLP: gets ready: " << *Bundle <<
"\n");
5586 "already scheduled bundle gets ready");
5588 "Expected non-copyable data");
5589 ReadyList.insert(
Data);
5596 if (!ScheduleCopyableDataMap.empty()) {
5598 getScheduleCopyableData(User,
OpIdx,
I);
5599 for (ScheduleCopyableData *CD : CopyableData)
5600 DecrUnsched(CD,
false);
5601 if (!CopyableData.empty())
5604 if (ScheduleData *OpSD = getScheduleData(
I))
5605 DecrUnsched(OpSD,
false);
5611 if (!Bundles.empty()) {
5612 auto *
In = BundleMember->getInst();
5614 SmallDenseMap<const Instruction *, unsigned> OperandsUses;
5615 unsigned TotalOpCount = 0;
5618 TotalOpCount = OperandsUses[
In] = 1;
5620 for (
const Use &U :
In->operands()) {
5623 ++Res.first->getSecond();
5630 auto DecrUnschedForInst =
5632 SmallDenseSet<std::pair<const ScheduleEntity *, unsigned>>
5634 if (!ScheduleCopyableDataMap.empty()) {
5635 const EdgeInfo EI = {UserTE,
OpIdx};
5636 if (ScheduleCopyableData *CD =
5637 getScheduleCopyableData(EI,
I)) {
5638 if (!Checked.insert(std::make_pair(CD,
OpIdx)).second)
5640 DecrUnsched(CD,
false);
5644 auto It = OperandsUses.
find(
I);
5645 assert(It != OperandsUses.
end() &&
"Operand not found");
5646 if (It->second > 0) {
5648 assert(TotalOpCount > 0 &&
"No more operands to decrement");
5650 if (ScheduleData *OpSD = getScheduleData(
I)) {
5651 if (!Checked.insert(std::make_pair(OpSD,
OpIdx)).second)
5653 DecrUnsched(OpSD,
false);
5658 for (ScheduleBundle *Bundle : Bundles) {
5659 if (ScheduleCopyableDataMap.empty() && TotalOpCount == 0)
5661 SmallPtrSet<Value *, 4> ParentsUniqueUsers;
5664 auto *It =
find(Bundle->getTreeEntry()->Scalars, In);
5665 SmallDenseSet<std::pair<const ScheduleEntity *, unsigned>> Checked;
5666 bool IsNonSchedulableWithParentPhiNode =
5667 Bundle->getTreeEntry()->doesNotNeedToSchedule() &&
5668 Bundle->getTreeEntry()->UserTreeIndex &&
5669 Bundle->getTreeEntry()->UserTreeIndex.UserTE->hasState() &&
5670 Bundle->getTreeEntry()->UserTreeIndex.UserTE->State !=
5671 TreeEntry::SplitVectorize &&
5672 Bundle->getTreeEntry()->UserTreeIndex.UserTE->getOpcode() ==
5676 std::distance(Bundle->getTreeEntry()->Scalars.begin(), It);
5677 assert(Lane >= 0 &&
"Lane not set");
5679 !Bundle->getTreeEntry()->ReorderIndices.empty())
5680 Lane = Bundle->getTreeEntry()->ReorderIndices[Lane];
5681 assert(Lane <
static_cast<int>(
5682 Bundle->getTreeEntry()->Scalars.size()) &&
5683 "Couldn't find extract lane");
5693 In->getNumOperands() ==
5694 Bundle->getTreeEntry()->getNumOperands() ||
5695 Bundle->getTreeEntry()->isCopyableElement(In)) &&
5696 "Missed TreeEntry operands?");
5700 if (IsNonSchedulableWithParentPhiNode) {
5701 const TreeEntry *ParentTE =
5702 Bundle->getTreeEntry()->UserTreeIndex.UserTE;
5704 if (!ParentsUniqueUsers.
insert(User).second) {
5705 It = std::find(std::next(It),
5706 Bundle->getTreeEntry()->Scalars.end(), In);
5711 for (
unsigned OpIdx :
5714 Bundle->getTreeEntry()->getOperand(
OpIdx)[Lane])) {
5717 DecrUnschedForInst(
I, Bundle->getTreeEntry(),
OpIdx, Checked);
5720 if (!IsNonSchedulableWithParentPhiNode)
5722 It = std::find(std::next(It),
5723 Bundle->getTreeEntry()->Scalars.end(), In);
5724 }
while (It != Bundle->getTreeEntry()->Scalars.end());
5729 for (Use &U : BundleMember->getInst()->operands()) {
5732 <<
"SLP: check for readiness (def): " << *
I <<
"\n");
5733 DecrUnschedForInst(BundleMember->getInst(),
U.getOperandNo(),
I);
5741 SmallPtrSet<const ScheduleData *, 4> VisitedMemory;
5742 for (ScheduleData *MemoryDep : SD->getMemoryDependencies()) {
5743 if (!VisitedMemory.
insert(MemoryDep).second)
5748 << *MemoryDep <<
"\n");
5749 DecrUnsched(MemoryDep);
5752 SmallPtrSet<const ScheduleData *, 4> VisitedControl;
5753 for (ScheduleData *Dep : SD->getControlDependencies()) {
5754 if (!VisitedControl.
insert(Dep).second)
5759 <<
"SLP: check for readiness (ctrl): " << *Dep <<
"\n");
5760 DecrUnsched(Dep,
true);
5764 SD->setScheduled(
true);
5769 if (
R.isVectorized(In)) {
5771 for (TreeEntry *TE : Entries) {
5773 In->getNumOperands() !=
TE->getNumOperands())
5776 PseudoBundles.
emplace_back(std::make_unique<ScheduleBundle>());
5777 BundlePtr->setTreeEntry(TE);
5782 ProcessBundleMember(SD, Bundles);
5785 Bundle.setScheduled(
true);
5787 auto AreAllBundlesScheduled =
5788 [&](
const ScheduleEntity *SD,
5792 return !SDBundles.empty() &&
5793 all_of(SDBundles, [&](
const ScheduleBundle *SDBundle) {
5794 return SDBundle->isScheduled();
5797 for (ScheduleEntity *SD : Bundle.getBundle()) {
5800 SDBundles = getScheduleBundles(SD->getInst());
5801 if (AreAllBundlesScheduled(SD, SDBundles)) {
5802 SD->setScheduled(
true);
5815 assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
5816 ScheduleStart->comesBefore(ScheduleEnd) &&
5817 "Not a valid scheduling region?");
5819 for (
auto *
I = ScheduleStart;
I != ScheduleEnd;
I =
I->getNextNode()) {
5821 if (!Bundles.
empty()) {
5822 for (ScheduleBundle *Bundle : Bundles) {
5823 assert(isInSchedulingRegion(*Bundle) &&
5824 "primary schedule data not in window?");
5829 auto *SD = getScheduleData(
I);
5832 assert(isInSchedulingRegion(*SD) &&
5833 "primary schedule data not in window?");
5838 [](
const ScheduleEntity *Bundle) {
5839 return Bundle->isReady();
5841 "item in ready list not ready?");
5845 template <
typename ReadyListType>
5846 void initialFillReadyList(ReadyListType &ReadyList) {
5847 SmallPtrSet<ScheduleBundle *, 16> Visited;
5848 for (
auto *
I = ScheduleStart;
I != ScheduleEnd;
I =
I->getNextNode()) {
5849 ScheduleData *SD = getScheduleData(
I);
5850 if (SD && SD->hasValidDependencies() && SD->isReady()) {
5853 for (ScheduleBundle *Bundle : Bundles) {
5854 if (!Visited.
insert(Bundle).second)
5856 if (Bundle->hasValidDependencies() && Bundle->isReady()) {
5857 ReadyList.insert(Bundle);
5859 << *Bundle <<
"\n");
5864 ReadyList.insert(SD);
5866 <<
"SLP: initially in ready list: " << *SD <<
"\n");
5877 const InstructionsState &S,
const EdgeInfo &EI);
5884 std::optional<ScheduleBundle *>
5886 const InstructionsState &S,
const EdgeInfo &EI);
5889 ScheduleData *allocateScheduleDataChunks();
5893 bool extendSchedulingRegion(
Value *V,
const InstructionsState &S);
5897 void initScheduleData(Instruction *FromI, Instruction *ToI,
5898 ScheduleData *PrevLoadStore,
5899 ScheduleData *NextLoadStore);
5903 void calculateDependencies(ScheduleBundle &Bundle,
bool InsertInReadyList,
5908 void resetSchedule();
5925 SmallDenseMap<Instruction *, ScheduleData *> ScheduleDataMap;
5929 SmallDenseMap<std::pair<EdgeInfo, const Value *>,
5930 std::unique_ptr<ScheduleCopyableData>>
5931 ScheduleCopyableDataMap;
5937 SmallDenseMap<const Instruction *, SmallVector<ScheduleCopyableData *>>
5938 ScheduleCopyableDataMapByInst;
5944 SmallDenseMap<std::pair<std::pair<const Value *, unsigned>,
const Value *>,
5946 ScheduleCopyableDataMapByInstUser;
5966 SmallSetVector<ScheduleCopyableData *, 4>>
5967 ScheduleCopyableDataMapByUsers;
5970 SmallDenseMap<Instruction *, SmallVector<ScheduleBundle *>>
5976 SetVector<ScheduleEntity *> ReadyInsts;
5986 ScheduleData *FirstLoadStoreInRegion =
nullptr;
5990 ScheduleData *LastLoadStoreInRegion =
nullptr;
5995 bool RegionHasStackSave =
false;
5998 int ScheduleRegionSize = 0;
6007 int SchedulingRegionID = 1;
6011 MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
6015 void scheduleBlock(
const BoUpSLP &R, BlockScheduling *BS);
6018 const SmallDenseSet<Value *> *UserIgnoreList =
nullptr;
6022 struct OrdersTypeDenseMapInfo {
6035 static unsigned getHashValue(
const OrdersType &V) {
6046 ScalarEvolution *SE;
6047 TargetTransformInfo *TTI;
6048 TargetLibraryInfo *TLI;
6051 AssumptionCache *AC;
6053 const DataLayout *DL;
6054 OptimizationRemarkEmitter *ORE;
6056 unsigned MaxVecRegSize;
6057 unsigned MinVecRegSize;
6060 IRBuilder<TargetFolder> Builder;
6067 DenseMap<const TreeEntry *, std::pair<uint64_t, bool>> MinBWs;
6072 unsigned ReductionBitWidth = 0;
6075 unsigned BaseGraphSize = 1;
6079 std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;
6083 DenseSet<unsigned> ExtraBitWidthNodes;
6091 SecondInfo::getEmptyKey());
6096 SecondInfo::getTombstoneKey());
6101 SecondInfo::getHashValue(Val.
EdgeIdx));
6122 ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
6133 return R.VectorizableTree[0].get();
6137 return {&
N->UserTreeIndex,
N->Container};
6141 return {&
N->UserTreeIndex + 1,
N->Container};
6168 static unsigned size(
BoUpSLP *R) {
return R->VectorizableTree.size(); }
6180 OS << Entry->Idx <<
".\n";
6183 for (
auto *V : Entry->Scalars) {
6185 if (
llvm::any_of(R->ExternalUses, [&](
const BoUpSLP::ExternalUser &EU) {
6186 return EU.Scalar == V;
6196 if (Entry->isGather())
6198 if (Entry->State == TreeEntry::ScatterVectorize ||
6199 Entry->State == TreeEntry::StridedVectorize ||
6200 Entry->State == TreeEntry::CompressVectorize)
6201 return "color=blue";
6208 for (
auto *
I : DeletedInstructions) {
6209 if (!
I->getParent()) {
6214 I->insertBefore(F->getEntryBlock(),
6215 F->getEntryBlock().getFirstNonPHIIt());
6217 I->insertBefore(F->getEntryBlock().getTerminator()->getIterator());
6220 for (
Use &U :
I->operands()) {
6222 if (
Op && !DeletedInstructions.count(
Op) &&
Op->hasOneUser() &&
6226 I->dropAllReferences();
6228 for (
auto *
I : DeletedInstructions) {
6230 "trying to erase instruction with users.");
6231 I->eraseFromParent();
6237#ifdef EXPENSIVE_CHECKS
6248 assert(!Mask.empty() && Reuses.
size() == Mask.size() &&
6249 "Expected non-empty mask.");
6252 for (
unsigned I = 0,
E = Prev.
size();
I <
E; ++
I)
6254 Reuses[Mask[
I]] = Prev[
I];
6262 bool BottomOrder =
false) {
6263 assert(!Mask.empty() &&
"Expected non-empty mask.");
6264 unsigned Sz = Mask.size();
6267 if (Order.
empty()) {
6269 std::iota(PrevOrder.
begin(), PrevOrder.
end(), 0);
6271 PrevOrder.
swap(Order);
6274 for (
unsigned I = 0;
I < Sz; ++
I)
6276 Order[
I] = PrevOrder[Mask[
I]];
6278 return Data.value() == Sz ||
Data.index() ==
Data.value();
6287 if (Order.
empty()) {
6289 std::iota(MaskOrder.
begin(), MaskOrder.
end(), 0);
6299 for (
unsigned I = 0;
I < Sz; ++
I)
6301 Order[MaskOrder[
I]] =
I;
6305std::optional<BoUpSLP::OrdersType>
6307 bool TopToBottom,
bool IgnoreReorder) {
6308 assert(TE.isGather() &&
"Expected gather node only.");
6312 Type *ScalarTy = GatheredScalars.
front()->getType();
6313 size_t NumScalars = GatheredScalars.
size();
6315 return std::nullopt;
6322 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
6324 isGatherShuffledEntry(&TE, GatheredScalars, Mask, Entries, NumParts,
6327 if (GatherShuffles.
empty() && ExtractShuffles.
empty())
6328 return std::nullopt;
6329 OrdersType CurrentOrder(NumScalars, NumScalars);
6330 if (GatherShuffles.
size() == 1 &&
6332 Entries.
front().front()->isSame(TE.Scalars)) {
6336 return std::nullopt;
6338 if (Entries.
front().front()->UserTreeIndex.UserTE ==
6339 TE.UserTreeIndex.UserTE)
6340 return std::nullopt;
6343 if (!IgnoreReorder && Entries.
front().front()->Idx == 0)
6344 return std::nullopt;
6347 if (!Entries.
front().front()->ReuseShuffleIndices.empty() &&
6348 TE.getVectorFactor() == 2 && Mask.size() == 2 &&
6351 return P.value() % 2 != static_cast<int>(P.index()) % 2;
6353 return std::nullopt;
6357 std::iota(CurrentOrder.
begin(), CurrentOrder.
end(), 0);
6358 return CurrentOrder;
6362 return all_of(Mask, [&](
int I) {
6369 if ((ExtractShuffles.
empty() && IsSplatMask(Mask) &&
6370 (Entries.
size() != 1 ||
6371 Entries.
front().front()->ReorderIndices.empty())) ||
6372 (GatherShuffles.
empty() && IsSplatMask(ExtractMask)))
6373 return std::nullopt;
6379 if (ShuffledSubMasks.
test(
I))
6381 const int VF = GetVF(
I);
6387 if (
any_of(Slice, [&](
unsigned I) {
return I != NumScalars; })) {
6389 ShuffledSubMasks.
set(
I);
6393 int FirstMin = INT_MAX;
6394 int SecondVecFound =
false;
6396 int Idx = Mask[
I * PartSz + K];
6398 Value *V = GatheredScalars[
I * PartSz + K];
6400 SecondVecFound =
true;
6409 SecondVecFound =
true;
6413 FirstMin = (FirstMin / PartSz) * PartSz;
6415 if (SecondVecFound) {
6417 ShuffledSubMasks.
set(
I);
6421 int Idx = Mask[
I * PartSz + K];
6425 if (Idx >= PartSz) {
6426 SecondVecFound =
true;
6429 if (CurrentOrder[
I * PartSz + Idx] >
6430 static_cast<unsigned>(
I * PartSz + K) &&
6431 CurrentOrder[
I * PartSz + Idx] !=
6432 static_cast<unsigned>(
I * PartSz + Idx))
6433 CurrentOrder[
I * PartSz + Idx] =
I * PartSz + K;
6436 if (SecondVecFound) {
6438 ShuffledSubMasks.
set(
I);
6444 if (!ExtractShuffles.
empty())
6445 TransformMaskToOrder(
6446 CurrentOrder, ExtractMask, PartSz, NumParts, [&](
unsigned I) {
6447 if (!ExtractShuffles[
I])
6450 unsigned Sz =
getNumElems(TE.getVectorFactor(), PartSz,
I);
6452 int K =
I * PartSz + Idx;
6455 if (!TE.ReuseShuffleIndices.empty())
6456 K = TE.ReuseShuffleIndices[K];
6459 if (!TE.ReorderIndices.empty())
6460 K = std::distance(TE.ReorderIndices.begin(),
6461 find(TE.ReorderIndices, K));
6467 .getKnownMinValue());
6472 if (GatherShuffles.
size() == 1 && NumParts != 1) {
6473 if (ShuffledSubMasks.
any())
6474 return std::nullopt;
6475 PartSz = NumScalars;
6478 if (!Entries.
empty())
6479 TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](
unsigned I) {
6480 if (!GatherShuffles[
I])
6482 return std::max(Entries[
I].front()->getVectorFactor(),
6483 Entries[
I].back()->getVectorFactor());
6485 unsigned NumUndefs =
count(CurrentOrder, NumScalars);
6486 if (ShuffledSubMasks.
all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2))
6487 return std::nullopt;
6488 return std::move(CurrentOrder);
6493 bool CompareOpcodes =
true) {
6499 return (!GEP1 || GEP1->getNumOperands() == 2) &&
6500 (!GEP2 || GEP2->getNumOperands() == 2) &&
6501 (((!GEP1 ||
isConstant(GEP1->getOperand(1))) &&
6502 (!GEP2 ||
isConstant(GEP2->getOperand(1)))) ||
6505 getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)));
6509template <
typename T>
6514 return CommonAlignment;
6520 "Order is empty. Please check it before using isReverseOrder.");
6521 unsigned Sz = Order.
size();
6523 return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
6534 const SCEV *PtrSCEVLowest =
nullptr;
6535 const SCEV *PtrSCEVHighest =
nullptr;
6538 for (
Value *Ptr : PointerOps) {
6543 if (!PtrSCEVLowest && !PtrSCEVHighest) {
6544 PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
6551 PtrSCEVLowest = PtrSCEV;
6558 PtrSCEVHighest = PtrSCEV;
6566 int Size =
DL.getTypeStoreSize(ElemTy);
6567 auto TryGetStride = [&](
const SCEV *Dist,
6568 const SCEV *Multiplier) ->
const SCEV * {
6570 if (M->getOperand(0) == Multiplier)
6571 return M->getOperand(1);
6572 if (M->getOperand(1) == Multiplier)
6573 return M->getOperand(0);
6576 if (Multiplier == Dist)
6581 const SCEV *Stride =
nullptr;
6582 if (
Size != 1 || SCEVs.
size() > 2) {
6584 Stride = TryGetStride(Dist, Sz);
6592 using DistOrdPair = std::pair<int64_t, int>;
6594 std::set<DistOrdPair,
decltype(Compare)> Offsets(Compare);
6596 bool IsConsecutive =
true;
6597 for (
const SCEV *PtrSCEV : SCEVs) {
6599 if (PtrSCEV != PtrSCEVLowest) {
6601 const SCEV *Coeff = TryGetStride(Diff, Stride);
6611 Dist = SC->getAPInt().getZExtValue();
6616 auto Res = Offsets.emplace(Dist, Cnt);
6620 IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
6623 if (Offsets.size() != SCEVs.
size())
6625 SortedIndices.
clear();
6626 if (!IsConsecutive) {
6630 for (
const std::pair<int64_t, int> &Pair : Offsets) {
6631 SortedIndices[Cnt] = Pair.second;
6638static std::pair<InstructionCost, InstructionCost>
6657 return TTI.getShuffleCost(Kind, DstTy, Tp, Mask,
CostKind, Index, SubTp,
6659 int NumSrcElts = Tp->getElementCount().getKnownMinValue();
6662 Mask, NumSrcElts, NumSubElts, Index)) {
6663 if (Index + NumSubElts > NumSrcElts &&
6664 Index + NumSrcElts <=
static_cast<int>(
Mask.size()))
6668 return TTI.getShuffleCost(Kind, DstTy, Tp, Mask,
CostKind, Index, SubTp,
6681 "ScalableVectorType is not supported.");
6684 "Incorrect usage.");
6689 unsigned ScalarTyNumElements = VecTy->getNumElements();
6692 if (!DemandedElts[
I])
6696 I * ScalarTyNumElements, VecTy);
6699 I * ScalarTyNumElements, VecTy);
6703 return TTI.getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
6712 ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) {
6713 if (Opcode == Instruction::ExtractElement) {
6719 Index * VecTy->getNumElements(), VecTy);
6722 return TTI.getVectorInstrCost(Opcode, Val,
CostKind, Index, Scalar,
6735 getWidenedType(VecTy->getElementType(), ScalarTy->getNumElements());
6737 Index * ScalarTy->getNumElements(), SubTp) +
6741 return TTI.getExtractWithExtendCost(Opcode, Dst, VecTy, Index,
CostKind);
6757 auto *Begin = std::next(
Mask.begin(), Index);
6758 std::iota(Begin, std::next(Begin, SubVecVF), 0);
6759 Vec = Builder.CreateShuffleVector(V, Mask);
6762 std::iota(
Mask.begin(),
Mask.end(), 0);
6763 std::iota(std::next(
Mask.begin(), Index),
6764 std::next(
Mask.begin(), Index + SubVecVF), VecVF);
6766 return Generator(Vec, V, Mask);
6769 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), SubVecVF), 0);
6770 V = Builder.CreateShuffleVector(V, ResizeMask);
6772 return Builder.CreateShuffleVector(Vec, V, Mask);
6777 unsigned SubVecVF,
unsigned Index) {
6779 std::iota(Mask.begin(), Mask.end(), Index);
6780 return Builder.CreateShuffleVector(Vec, Mask);
6790 const unsigned Sz = PointerOps.
size();
6793 CompressMask[0] = 0;
6795 std::optional<unsigned> Stride = 0;
6798 Value *Ptr = Order.
empty() ? PointerOps[
I] : PointerOps[Order[
I]];
6799 std::optional<int64_t> OptPos =
6801 if (!OptPos || OptPos > std::numeric_limits<unsigned>::max())
6803 unsigned Pos =
static_cast<unsigned>(*OptPos);
6804 CompressMask[
I] = Pos;
6811 if (Pos != *Stride *
I)
6814 return Stride.has_value();
6827 InterleaveFactor = 0;
6829 const size_t Sz = VL.
size();
6837 if (AreAllUsersVectorized(V))
6840 TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy,
CostKind,
6841 Mask.empty() ?
I : Mask[
I]);
6844 if (ExtractCost <= ScalarCost)
6849 if (Order.
empty()) {
6850 Ptr0 = PointerOps.
front();
6851 PtrN = PointerOps.
back();
6853 Ptr0 = PointerOps[Order.
front()];
6854 PtrN = PointerOps[Order.
back()];
6856 std::optional<int64_t> Diff =
6860 const size_t MaxRegSize =
6864 if (*Diff / Sz >= MaxRegSize / 8)
6868 Align CommonAlignment = LI->getAlign();
6870 Ptr0, LoadVecTy, CommonAlignment,
DL,
6873 if (IsMasked && !
TTI.isLegalMaskedLoad(LoadVecTy, CommonAlignment,
6874 LI->getPointerAddressSpace()))
6880 assert(CompressMask.
size() >= 2 &&
"At least two elements are required");
6884 auto [ScalarGEPCost, VectorGEPCost] =
6886 Instruction::GetElementPtr,
CostKind, ScalarTy, LoadVecTy);
6903 LoadCost =
TTI.getMaskedMemoryOpCost({Intrinsic::masked_load, LoadVecTy,
6905 LI->getPointerAddressSpace()},
6909 TTI.getMemoryOpCost(Instruction::Load, LoadVecTy, CommonAlignment,
6910 LI->getPointerAddressSpace(),
CostKind);
6912 if (IsStrided && !IsMasked && Order.
empty()) {
6919 AlignedLoadVecTy = LoadVecTy;
6920 if (
TTI.isLegalInterleavedAccessType(AlignedLoadVecTy, CompressMask[1],
6922 LI->getPointerAddressSpace())) {
6924 VectorGEPCost +
TTI.getInterleavedMemoryOpCost(
6925 Instruction::Load, AlignedLoadVecTy,
6926 CompressMask[1], {}, CommonAlignment,
6927 LI->getPointerAddressSpace(),
CostKind, IsMasked);
6928 if (InterleavedCost < GatherCost) {
6929 InterleaveFactor = CompressMask[1];
6930 LoadVecTy = AlignedLoadVecTy;
6937 if (!Order.
empty()) {
6940 NewMask[
I] = CompressMask[Mask[
I]];
6942 CompressMask.
swap(NewMask);
6944 InstructionCost TotalVecCost = VectorGEPCost + LoadCost + CompressCost;
6945 return TotalVecCost < GatherCost;
6958 unsigned InterleaveFactor;
6962 AreAllUsersVectorized, IsMasked, InterleaveFactor,
6963 CompressMask, LoadVecTy);
6980 Align Alignment,
const int64_t Diff,
6981 const size_t Sz)
const {
6982 if (Diff % (Sz - 1) != 0)
6986 auto IsAnyPointerUsedOutGraph =
any_of(PointerOps, [&](
Value *V) {
6988 return !isVectorized(U) && !MustGather.contains(U);
6992 const uint64_t AbsoluteDiff = std::abs(Diff);
6994 if (IsAnyPointerUsedOutGraph ||
6995 (AbsoluteDiff > Sz &&
6998 AbsoluteDiff % Sz == 0 &&
has_single_bit(AbsoluteDiff / Sz)))) ||
6999 Diff == -(
static_cast<int64_t
>(Sz) - 1)) {
7000 int64_t Stride = Diff /
static_cast<int64_t
>(Sz - 1);
7001 if (Diff != Stride *
static_cast<int64_t
>(Sz - 1))
7003 if (!TTI->isLegalStridedLoadStore(VecTy, Alignment))
7013 Value *Ptr0,
Value *PtrN, StridedPtrInfo &SPtrInfo)
const {
7014 const size_t Sz = PointerOps.
size();
7019 SortedIndices.
empty() ? PointerOps[
I] : PointerOps[SortedIndices[
I]];
7020 SortedOffsetsFromBase[
I] =
7038 int64_t StrideWithinGroup =
7039 SortedOffsetsFromBase[1] - SortedOffsetsFromBase[0];
7042 auto IsEndOfGroupIndex = [=, &SortedOffsetsFromBase](
unsigned Idx) {
7043 return SortedOffsetsFromBase[Idx] - SortedOffsetsFromBase[Idx - 1] !=
7048 unsigned GroupSize = FoundIt != Indices.end() ? *FoundIt : Sz;
7050 unsigned VecSz = Sz;
7051 Type *NewScalarTy = ScalarTy;
7055 bool NeedsWidening = Sz != GroupSize;
7056 if (NeedsWidening) {
7057 if (Sz % GroupSize != 0)
7060 if (StrideWithinGroup != 1)
7062 VecSz = Sz / GroupSize;
7065 DL->getTypeSizeInBits(ScalarTy).getFixedValue() * GroupSize);
7068 if (!
isStridedLoad(PointerOps, NewScalarTy, Alignment, Diff, VecSz))
7071 int64_t StrideIntVal = StrideWithinGroup;
7072 if (NeedsWidening) {
7075 unsigned CurrentGroupStartIdx = GroupSize;
7076 int64_t StrideBetweenGroups =
7077 SortedOffsetsFromBase[GroupSize] - SortedOffsetsFromBase[0];
7078 StrideIntVal = StrideBetweenGroups;
7079 for (; CurrentGroupStartIdx < Sz; CurrentGroupStartIdx += GroupSize) {
7080 if (SortedOffsetsFromBase[CurrentGroupStartIdx] -
7081 SortedOffsetsFromBase[CurrentGroupStartIdx - GroupSize] !=
7082 StrideBetweenGroups)
7086 auto CheckGroup = [=](
const unsigned StartIdx) ->
bool {
7089 unsigned GroupEndIdx = FoundIt != Indices.end() ? *FoundIt : Sz;
7090 return GroupEndIdx - StartIdx == GroupSize;
7092 for (
unsigned I = 0;
I < Sz;
I += GroupSize) {
7098 Type *StrideTy = DL->getIndexType(Ptr0->
getType());
7099 SPtrInfo.StrideVal = ConstantInt::get(StrideTy, StrideIntVal);
7107 StridedPtrInfo &SPtrInfo)
const {
7108 const unsigned Sz = PointerOps.
size();
7110 if (Sz <= MinProfitableStridedLoads || !TTI->isTypeLegal(StridedLoadTy) ||
7111 !TTI->isLegalStridedLoadStore(StridedLoadTy, CommonAlignment))
7113 if (
const SCEV *Stride =
7116 SPtrInfo.StrideSCEV = Stride;
7125 unsigned *BestVF,
bool TryRecursiveCheck)
const {
7138 if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy))
7144 const size_t Sz = VL.
size();
7146 auto *POIter = PointerOps.
begin();
7147 for (
Value *V : VL) {
7149 if (!L || !L->isSimple())
7151 *POIter = L->getPointerOperand();
7157 bool IsSorted =
sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, Order);
7166 if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
7167 TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
7178 if (Order.
empty()) {
7179 Ptr0 = PointerOps.
front();
7180 PtrN = PointerOps.
back();
7182 Ptr0 = PointerOps[Order.
front()];
7183 PtrN = PointerOps[Order.
back()];
7185 std::optional<int64_t> Diff =
7188 if (
static_cast<uint64_t>(*Diff) == Sz - 1)
7191 *TLI, [&](
Value *V) {
7192 return areAllUsersVectorized(
7200 *Diff, Ptr0, PtrN, SPtrInfo))
7203 if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
7204 TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
7209 auto CheckForShuffledLoads = [&, &TTI = *TTI](
Align CommonAlignment,
7211 bool ProfitableGatherPointers) {
7216 auto [ScalarGEPCost, VectorGEPCost] =
7218 Instruction::GetElementPtr,
CostKind, ScalarTy, VecTy);
7222 Type *PtrScalarTy = PointerOps.
front()->getType()->getScalarType();
7224 if (
static_cast<unsigned>(
count_if(
7243 return C + TTI.getInstructionCost(
7249 TTI.getGatherScatterOpCost(
7251 false, CommonAlignment,
CostKind) +
7252 (ProfitableGatherPointers ? 0 : VectorGEPCost);
7260 constexpr unsigned ListLimit = 4;
7261 if (!TryRecursiveCheck || VL.
size() < ListLimit)
7270 unsigned Sz = DL->getTypeSizeInBits(ScalarTy);
7280 for (
unsigned Cnt = 0, End = VL.
size(); Cnt + VF <= End; Cnt += VF) {
7285 PointerOps, SPtrInfo, BestVF,
7293 DemandedElts.
setBits(Cnt, Cnt + VF);
7309 if (!DemandedElts.
isZero()) {
7315 if (DemandedElts[Idx])
7326 LI0->getPointerOperand(),
7327 Instruction::GetElementPtr,
CostKind, ScalarTy,
7331 if (
static_cast<unsigned>(
7333 PointerOps.
size() - 1 ||
7352 TTI.getMemoryOpCost(Instruction::Load, SubVecTy, LI0->getAlign(),
7353 LI0->getPointerAddressSpace(),
CostKind,
7358 VecLdCost += TTI.getStridedMemoryOpCost(Instruction::Load, SubVecTy,
7359 LI0->getPointerOperand(),
7365 VecLdCost += TTI.getMaskedMemoryOpCost(
7366 {Intrinsic::masked_load, SubVecTy, CommonAlignment,
7367 LI0->getPointerAddressSpace()},
7374 VecLdCost += TTI.getGatherScatterOpCost(Instruction::Load, SubVecTy,
7375 LI0->getPointerOperand(),
7386 ShuffleMask[Idx] = Idx / VF ==
I ? VL.
size() + Idx % VF : Idx;
7395 if (MaskedGatherCost >= VecLdCost &&
7408 bool ProfitableGatherPointers =
7409 L && Sz > 2 &&
static_cast<unsigned>(
count_if(PointerOps, [L](
Value *V) {
7410 return L->isLoopInvariant(V);
7412 if (ProfitableGatherPointers ||
all_of(PointerOps, [](
Value *
P) {
7415 (
GEP &&
GEP->getNumOperands() == 2 &&
7423 if (!TryRecursiveCheck || !CheckForShuffledLoads(CommonAlignment, BestVF,
7424 ProfitableGatherPointers))
7436 all_of(VL, [](
const Value *V) {
return V->getType()->isPointerTy(); }) &&
7437 "Expected list of pointer operands.");
7442 std::pair<BasicBlock *, Value *>,
7446 .try_emplace(std::make_pair(
7450 SortedIndices.
clear();
7452 auto Key = std::make_pair(BBs[Cnt + 1],
7454 bool Found =
any_of(Bases.try_emplace(
Key).first->second,
7455 [&, &Cnt = Cnt, &Ptr = Ptr](
auto &
Base) {
7456 std::optional<int64_t> Diff =
7457 getPointersDiff(ElemTy, std::get<0>(Base.front()),
7458 ElemTy, Ptr, DL, SE,
7463 Base.emplace_back(Ptr, *Diff, Cnt + 1);
7469 if (Bases.size() > VL.
size() / 2 - 1)
7473 Bases.find(
Key)->second.emplace_back().emplace_back(Ptr, 0, Cnt + 1);
7477 if (Bases.size() == VL.
size())
7480 if (Bases.size() == 1 && (Bases.front().second.size() == 1 ||
7481 Bases.front().second.size() == VL.
size()))
7486 auto ComparePointers = [](
Value *Ptr1,
Value *Ptr2) {
7495 FirstPointers.
insert(P1);
7496 SecondPointers.
insert(P2);
7502 "Unable to find matching root.");
7505 for (
auto &
Base : Bases) {
7506 for (
auto &Vec :
Base.second) {
7507 if (Vec.size() > 1) {
7509 int64_t InitialOffset = std::get<1>(Vec[0]);
7510 bool AnyConsecutive =
7512 return std::get<1>(
P.value()) ==
7513 int64_t(
P.index()) + InitialOffset;
7517 if (!AnyConsecutive)
7522 return ComparePointers(std::get<0>(V1.front()), std::get<0>(V2.front()));
7526 for (
auto &
T : Bases)
7527 for (
const auto &Vec :
T.second)
7528 for (
const auto &
P : Vec)
7532 "Expected SortedIndices to be the size of VL");
7536std::optional<BoUpSLP::OrdersType>
7538 assert(TE.isGather() &&
"Expected gather node only.");
7539 Type *ScalarTy = TE.Scalars[0]->getType();
7542 Ptrs.
reserve(TE.Scalars.size());
7544 BBs.
reserve(TE.Scalars.size());
7545 for (
Value *V : TE.Scalars) {
7547 if (!L || !L->isSimple())
7548 return std::nullopt;
7554 if (!LoadEntriesToVectorize.contains(TE.Idx) &&
7556 return std::move(Order);
7557 return std::nullopt;
7568 if (VU->
getType() != V->getType())
7571 if (!VU->
hasOneUse() && !V->hasOneUse())
7577 if (Idx1 == std::nullopt || Idx2 == std::nullopt)
7584 bool IsReusedIdx =
false;
7586 if (IE2 == VU && !IE1)
7588 if (IE1 == V && !IE2)
7589 return V->hasOneUse();
7590 if (IE1 && IE1 != V) {
7592 IsReusedIdx |= ReusedIdx.
test(Idx1);
7593 ReusedIdx.
set(Idx1);
7594 if ((IE1 != VU && !IE1->
hasOneUse()) || IsReusedIdx)
7599 if (IE2 && IE2 != VU) {
7601 IsReusedIdx |= ReusedIdx.
test(Idx2);
7602 ReusedIdx.
set(Idx2);
7603 if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
7608 }
while (!IsReusedIdx && (IE1 || IE2));
7618std::optional<BoUpSLP::OrdersType>
7620 bool IgnoreReorder) {
7623 if (!TE.ReuseShuffleIndices.empty()) {
7625 assert(!TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI) &&
7626 "Reshuffling scalars not yet supported for nodes with padding");
7629 return std::nullopt;
7637 unsigned Sz = TE.Scalars.size();
7638 if (TE.isGather()) {
7639 if (std::optional<OrdersType> CurrentOrder =
7644 ::addMask(Mask, TE.ReuseShuffleIndices);
7645 OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor());
7646 unsigned Sz = TE.Scalars.size();
7647 for (
int K = 0, E = TE.getVectorFactor() / Sz; K < E; ++K) {
7650 Res[Idx + K * Sz] =
I + K * Sz;
7652 return std::move(Res);
7655 if (Sz == 2 && TE.getVectorFactor() == 4 &&
7657 2 * TE.getVectorFactor())) == 1)
7658 return std::nullopt;
7659 if (TE.ReuseShuffleIndices.size() % Sz != 0)
7660 return std::nullopt;
7664 if (TE.ReorderIndices.empty())
7665 std::iota(ReorderMask.
begin(), ReorderMask.
end(), 0);
7668 ::addMask(ReorderMask, TE.ReuseShuffleIndices);
7669 unsigned VF = ReorderMask.
size();
7673 for (
unsigned I = 0;
I < VF;
I += Sz) {
7675 unsigned UndefCnt = 0;
7676 unsigned Limit = std::min(Sz, VF -
I);
7685 Val >=
static_cast<int>(NumParts) || UsedVals.
test(Val) ||
7687 return std::nullopt;
7689 for (
unsigned K = 0; K < NumParts; ++K) {
7690 unsigned Idx = Val + Sz * K;
7691 if (Idx < VF &&
I + K < VF)
7692 ResOrder[Idx] =
I + K;
7695 return std::move(ResOrder);
7697 unsigned VF = TE.getVectorFactor();
7700 TE.ReuseShuffleIndices.end());
7701 if (TE.hasState() && TE.getOpcode() == Instruction::ExtractElement &&
7703 if (isa<PoisonValue>(V))
7705 std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V));
7706 return Idx && *Idx < Sz;
7708 assert(!TE.isAltShuffle() &&
"Alternate instructions are only supported "
7709 "by BinaryOperator and CastInst.");
7711 if (TE.ReorderIndices.empty())
7712 std::iota(ReorderMask.
begin(), ReorderMask.
end(), 0);
7715 for (
unsigned I = 0;
I < VF; ++
I) {
7716 int &Idx = ReusedMask[
I];
7719 Value *V = TE.Scalars[ReorderMask[Idx]];
7721 Idx = std::distance(ReorderMask.
begin(),
find(ReorderMask, *EI));
7727 std::iota(ResOrder.
begin(), ResOrder.
end(), 0);
7728 auto *It = ResOrder.
begin();
7729 for (
unsigned K = 0; K < VF; K += Sz) {
7733 std::iota(SubMask.
begin(), SubMask.
end(), 0);
7735 transform(CurrentOrder, It, [K](
unsigned Pos) {
return Pos + K; });
7736 std::advance(It, Sz);
7739 return Data.index() ==
Data.value();
7741 return std::nullopt;
7742 return std::move(ResOrder);
7744 if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
7745 (!TE.UserTreeIndex || !TE.UserTreeIndex.UserTE->hasState() ||
7747 (TE.ReorderIndices.empty() ||
isReverseOrder(TE.ReorderIndices)))
7748 return std::nullopt;
7749 if (TE.State == TreeEntry::SplitVectorize ||
7750 ((TE.State == TreeEntry::Vectorize ||
7751 TE.State == TreeEntry::StridedVectorize ||
7752 TE.State == TreeEntry::CompressVectorize) &&
7755 assert((TE.State == TreeEntry::SplitVectorize || !TE.isAltShuffle()) &&
7756 "Alternate instructions are only supported by "
7757 "BinaryOperator and CastInst.");
7758 return TE.ReorderIndices;
7760 if (!TopToBottom && IgnoreReorder && TE.State == TreeEntry::Vectorize &&
7761 TE.isAltShuffle()) {
7762 assert(TE.ReuseShuffleIndices.empty() &&
7763 "ReuseShuffleIndices should be "
7764 "empty for alternate instructions.");
7766 TE.buildAltOpShuffleMask(
7768 assert(TE.getMatchingMainOpOrAltOp(
I) &&
7769 "Unexpected main/alternate opcode");
7773 const int VF = TE.getVectorFactor();
7778 ResOrder[Mask[
I] % VF] =
I;
7780 return std::move(ResOrder);
7782 if (!TE.ReorderIndices.empty())
7783 return TE.ReorderIndices;
7784 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
7785 if (!TE.ReorderIndices.empty())
7786 return TE.ReorderIndices;
7789 for (
auto [
I, V] :
zip(UserBVHead, TE.Scalars)) {
7797 while (
II &&
II->hasOneUse() &&
II->getParent() == BB) {
7805 assert(BB1 != BB2 &&
"Expected different basic blocks.");
7806 if (!DT->isReachableFromEntry(BB1))
7808 if (!DT->isReachableFromEntry(BB2))
7810 auto *NodeA = DT->getNode(BB1);
7811 auto *NodeB = DT->getNode(BB2);
7812 assert(NodeA &&
"Should only process reachable instructions");
7813 assert(NodeB &&
"Should only process reachable instructions");
7814 assert((NodeA == NodeB) ==
7815 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
7816 "Different nodes should have different DFS numbers");
7817 return NodeA->getDFSNumIn() < NodeB->getDFSNumIn();
7819 auto PHICompare = [&](
unsigned I1,
unsigned I2) {
7820 Value *V1 = TE.Scalars[I1];
7821 Value *V2 = TE.Scalars[I2];
7834 if (FirstUserOfPhi1->getParent() != FirstUserOfPhi2->getParent())
7835 return CompareByBasicBlocks(FirstUserOfPhi1->getParent(),
7836 FirstUserOfPhi2->getParent());
7846 if (UserBVHead[I1] && !UserBVHead[I2])
7848 if (!UserBVHead[I1])
7850 if (UserBVHead[I1] == UserBVHead[I2])
7853 return CompareByBasicBlocks(UserBVHead[I1]->
getParent(),
7855 return UserBVHead[I1]->comesBefore(UserBVHead[I2]);
7868 if (EE1->getOperand(0) == EE2->getOperand(0))
7870 if (!Inst1 && Inst2)
7872 if (Inst1 && Inst2) {
7880 "Expected either instructions or arguments vector operands.");
7881 return P1->getArgNo() < P2->getArgNo();
7886 std::iota(Phis.
begin(), Phis.
end(), 0);
7889 return std::nullopt;
7890 return std::move(Phis);
7892 if (TE.isGather() &&
7893 (!TE.hasState() || !TE.isAltShuffle() ||
7894 ScalarsInSplitNodes.contains(TE.getMainOp())) &&
7898 if (((TE.hasState() && TE.getOpcode() == Instruction::ExtractElement) ||
7902 auto *EE = dyn_cast<ExtractElementInst>(V);
7903 return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
7909 canReuseExtract(TE.Scalars, CurrentOrder,
true);
7910 if (Reuse || !CurrentOrder.
empty())
7911 return std::move(CurrentOrder);
7919 int Sz = TE.Scalars.size();
7923 if (It == TE.Scalars.begin())
7926 if (It != TE.Scalars.end()) {
7928 unsigned Idx = std::distance(TE.Scalars.begin(), It);
7943 if (InsertFirstCost + PermuteCost < InsertIdxCost) {
7946 return std::move(Order);
7951 return std::nullopt;
7952 if (TE.Scalars.size() >= 3)
7957 if (TE.hasState() && TE.getOpcode() == Instruction::Load) {
7959 StridedPtrInfo SPtrInfo;
7962 CurrentOrder, PointerOps, SPtrInfo);
7965 return std::move(CurrentOrder);
7970 if (std::optional<OrdersType> CurrentOrder =
7972 return CurrentOrder;
7974 return std::nullopt;
7984 for (
unsigned I = Sz,
E = Mask.size();
I <
E;
I += Sz) {
7986 if (Cluster != FirstCluster)
7992void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE,
ArrayRef<int> Mask)
const {
7995 const unsigned Sz =
TE.Scalars.size();
7997 if (!
TE.isGather() ||
8004 addMask(NewMask,
TE.ReuseShuffleIndices);
8006 TE.ReorderIndices.clear();
8013 for (
auto *It =
TE.ReuseShuffleIndices.begin(),
8014 *End =
TE.ReuseShuffleIndices.end();
8015 It != End; std::advance(It, Sz))
8016 std::iota(It, std::next(It, Sz), 0);
8022 "Expected same size of orders");
8023 size_t Sz = Order.
size();
8026 if (Order[Idx] != Sz)
8027 UsedIndices.
set(Order[Idx]);
8029 if (SecondaryOrder.
empty()) {
8031 if (Order[Idx] == Sz && !UsedIndices.
test(Idx))
8035 if (SecondaryOrder[Idx] != Sz && Order[Idx] == Sz &&
8036 !UsedIndices.
test(SecondaryOrder[Idx]))
8037 Order[Idx] = SecondaryOrder[Idx];
8045 constexpr unsigned TinyVF = 2;
8046 constexpr unsigned TinyTree = 10;
8047 constexpr unsigned PhiOpsLimit = 12;
8048 constexpr unsigned GatherLoadsLimit = 2;
8049 if (VectorizableTree.size() <= TinyTree)
8051 if (VectorizableTree.front()->hasState() &&
8052 !VectorizableTree.front()->isGather() &&
8053 (VectorizableTree.front()->getOpcode() == Instruction::Store ||
8054 VectorizableTree.front()->getOpcode() == Instruction::PHI ||
8055 (VectorizableTree.front()->getVectorFactor() <= TinyVF &&
8056 (VectorizableTree.front()->getOpcode() == Instruction::PtrToInt ||
8057 VectorizableTree.front()->getOpcode() == Instruction::ICmp))) &&
8058 VectorizableTree.front()->ReorderIndices.empty()) {
8062 if (VectorizableTree.front()->hasState() &&
8063 VectorizableTree.front()->getOpcode() == Instruction::PHI &&
8064 VectorizableTree.front()->Scalars.size() == TinyVF &&
8065 VectorizableTree.front()->getNumOperands() > PhiOpsLimit)
8068 if (VectorizableTree.front()->hasState() &&
8069 VectorizableTree.front()->getOpcode() == Instruction::Store &&
8070 VectorizableTree.front()->ReorderIndices.empty()) {
8071 const unsigned ReorderedSplitsCnt =
8072 count_if(VectorizableTree, [&](
const std::unique_ptr<TreeEntry> &TE) {
8073 return TE->State == TreeEntry::SplitVectorize &&
8074 !TE->ReorderIndices.empty() && TE->UserTreeIndex.UserTE &&
8075 TE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
8078 if (ReorderedSplitsCnt <= 1 &&
8080 VectorizableTree, [&](
const std::unique_ptr<TreeEntry> &TE) {
8081 return ((!TE->isGather() &&
8082 (TE->ReorderIndices.empty() ||
8083 (TE->UserTreeIndex.UserTE &&
8084 TE->UserTreeIndex.UserTE->State ==
8085 TreeEntry::Vectorize &&
8086 !TE->UserTreeIndex.UserTE->ReuseShuffleIndices
8088 (TE->isGather() && TE->ReorderIndices.empty() &&
8089 (!TE->hasState() || TE->isAltShuffle() ||
8090 TE->getOpcode() == Instruction::Load ||
8091 TE->getOpcode() == Instruction::ZExt ||
8092 TE->getOpcode() == Instruction::SExt))) &&
8093 (VectorizableTree.front()->getVectorFactor() > TinyVF ||
8094 !TE->isGather() ||
none_of(TE->Scalars, [&](
Value *V) {
8095 return !isConstant(V) && isVectorized(V);
8097 })) >= VectorizableTree.size() - ReorderedSplitsCnt)
8100 bool HasPhis =
false;
8101 bool HasLoad =
true;
8102 unsigned GatherLoads = 0;
8103 for (
const std::unique_ptr<TreeEntry> &TE :
8104 ArrayRef(VectorizableTree).drop_front()) {
8105 if (TE->State == TreeEntry::SplitVectorize)
8107 if (!TE->hasState()) {
8111 if (VectorizableTree.front()->Scalars.size() == TinyVF &&
8116 if (TE->getOpcode() == Instruction::Load && TE->ReorderIndices.empty()) {
8117 if (!TE->isGather()) {
8124 if (GatherLoads >= GatherLoadsLimit)
8127 if (TE->getOpcode() == Instruction::GetElementPtr ||
8130 if (TE->getOpcode() != Instruction::PHI &&
8131 (!TE->hasCopyableElements() ||
8133 TE->Scalars.size() / 2))
8135 if (VectorizableTree.front()->Scalars.size() == TinyVF &&
8136 TE->getNumOperands() > PhiOpsLimit)
8145void BoUpSLP::TreeEntry::reorderSplitNode(
unsigned Idx,
ArrayRef<int> Mask,
8147 assert(State == TreeEntry::SplitVectorize &&
"Expected split user node.");
8150 std::iota(NewMask.
begin(), NewMask.
end(), 0);
8151 std::iota(NewMaskOrder.begin(), NewMaskOrder.end(), 0);
8154 copy(MaskOrder, NewMaskOrder.begin());
8156 assert(Idx == 1 &&
"Expected either 0 or 1 index.");
8157 unsigned Offset = CombinedEntriesWithIndices.
back().second;
8166 ReorderIndices.clear();
      ExternalUserReorderMap;
  for_each(VectorizableTree, [&, &TTIRef = *TTI](
                                 const std::unique_ptr<TreeEntry> &TE) {
        findExternalStoreUsersReorderIndices(TE.get());
    if (!ExternalUserReorderIndices.empty()) {
      VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
                                      std::move(ExternalUserReorderIndices));
    if (TE->hasState() && TE->isAltShuffle() &&
        TE->State != TreeEntry::SplitVectorize) {
      Type *ScalarTy = TE->Scalars[0]->getType();
      unsigned Opcode0 = TE->getOpcode();
      unsigned Opcode1 = TE->getAltOpcode();
      if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
        VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
    bool IgnoreReorder =
        !UserIgnoreList && VectorizableTree.front()->hasState() &&
        (VectorizableTree.front()->getOpcode() == Instruction::InsertElement ||
         VectorizableTree.front()->getOpcode() == Instruction::Store);
    if (std::optional<OrdersType> CurrentOrder =
      const TreeEntry *UserTE = TE.get();
        if (!UserTE->UserTreeIndex)
        if (UserTE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
            UserTE->UserTreeIndex.UserTE->isAltShuffle() &&
            UserTE->UserTreeIndex.UserTE->Idx != 0)
        UserTE = UserTE->UserTreeIndex.UserTE;
      VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
      if (!(TE->State == TreeEntry::Vectorize ||
            TE->State == TreeEntry::StridedVectorize ||
            TE->State == TreeEntry::SplitVectorize ||
            TE->State == TreeEntry::CompressVectorize) ||
          !TE->ReuseShuffleIndices.empty())
        GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
      if (TE->State == TreeEntry::Vectorize &&
          TE->getOpcode() == Instruction::PHI)
        PhisToOrders.try_emplace(TE.get(), *CurrentOrder);
  for (unsigned VF = VectorizableTree.front()->getVectorFactor();
       !VFToOrderedEntries.empty() && VF > 1; VF -= 2 - (VF & 1U)) {
    auto It = VFToOrderedEntries.find(VF);
    if (It == VFToOrderedEntries.end())
    for (const TreeEntry *OpTE : OrderedEntries) {
      if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE) &&
          OpTE->State != TreeEntry::SplitVectorize)
      const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
        if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty()) {
          auto It = GathersToOrders.find(OpTE);
          if (It != GathersToOrders.end())
        if (OpTE->hasState() && OpTE->isAltShuffle()) {
          auto It = AltShufflesToOrders.find(OpTE);
          if (It != AltShufflesToOrders.end())
        if (OpTE->State == TreeEntry::Vectorize &&
            OpTE->getOpcode() == Instruction::PHI) {
          auto It = PhisToOrders.find(OpTE);
          if (It != PhisToOrders.end())
        return OpTE->ReorderIndices;
      auto It = ExternalUserReorderMap.find(OpTE);
      if (It != ExternalUserReorderMap.end()) {
        const auto &ExternalUserReorderIndices = It->second;
        if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
          OrdersUses.try_emplace(OrdersType(), 0).first->second +=
              ExternalUserReorderIndices.size();
          for (const OrdersType &ExtOrder : ExternalUserReorderIndices)
            ++OrdersUses.try_emplace(ExtOrder, 0).first->second;
      if (OpTE->State == TreeEntry::Vectorize &&
          OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
        assert(!OpTE->isAltShuffle() &&
               "Alternate instructions are only supported by BinaryOperator "
        unsigned E = Order.size();
          return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
        ++OrdersUses.try_emplace(CurrentOrder, 0).first->second;
        ++OrdersUses.try_emplace(Order, 0).first->second;
    if (OrdersUses.empty())
    unsigned IdentityCnt = 0;
    unsigned FilledIdentityCnt = 0;
    for (auto &Pair : OrdersUses) {
      if (!Pair.first.empty())
        FilledIdentityCnt += Pair.second;
      IdentityCnt += Pair.second;
    unsigned Cnt = IdentityCnt;
    for (auto &Pair : OrdersUses) {
      if (Cnt < Pair.second ||
          (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt &&
           Cnt == Pair.second && !BestOrder.empty() &&
        BestOrder = Pair.first;
    unsigned E = BestOrder.size();
      return I < E ? static_cast<int>(I) : PoisonMaskElem;
    for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
      if (TE->Scalars.size() != VF) {
        if (TE->ReuseShuffleIndices.size() == VF) {
          assert(TE->State != TreeEntry::SplitVectorize &&
                 "Split vectorized not expected.");
              (!TE->UserTreeIndex ||
               TE->UserTreeIndex.UserTE->Scalars.size() == VF ||
               TE->UserTreeIndex.UserTE->Scalars.size() == TE->Scalars.size() ||
               TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize) &&
              "All users must be of VF size.");
          if (TE->UserTreeIndex && TE->UserTreeIndex.UserTE->hasState() &&
          reorderNodeWithReuses(*TE, Mask);
          if (TE->UserTreeIndex &&
              TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize)
            TE->UserTreeIndex.UserTE->reorderSplitNode(
                TE->UserTreeIndex.EdgeIdx, Mask, MaskOrder);
      if ((TE->State == TreeEntry::SplitVectorize &&
           TE->ReuseShuffleIndices.empty()) ||
          ((TE->State == TreeEntry::Vectorize ||
            TE->State == TreeEntry::StridedVectorize ||
            TE->State == TreeEntry::CompressVectorize) &&
            (!TE->isAltShuffle() || (TE->State == TreeEntry::SplitVectorize &&
                                     TE->ReuseShuffleIndices.empty())) &&
            "Alternate instructions are only supported by BinaryOperator "
        TE->reorderOperands(Mask);
        TE->reorderOperands(Mask);
        assert(TE->ReorderIndices.empty() &&
               "Expected empty reorder sequence.");
        if (!TE->ReuseShuffleIndices.empty()) {
          addMask(NewReuses, TE->ReuseShuffleIndices);
          TE->ReuseShuffleIndices.swap(NewReuses);
        } else if (TE->UserTreeIndex &&
                   TE->UserTreeIndex.UserTE->State ==
                       TreeEntry::SplitVectorize)
          TE->UserTreeIndex.UserTE->reorderSplitNode(TE->UserTreeIndex.EdgeIdx,
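/// Gathers the reorderable operands of \p UserTE into \p Edges as
/// (operand index, operand entry) pairs, skipping operands whose order is
/// fixed by the user opcode and taking gathers only if they were previously
/// marked as reorderable.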
void BoUpSLP::buildReorderableOperands(
    TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
    if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) {
          return OpData.first == I &&
                 (OpData.second->State == TreeEntry::Vectorize ||
                  OpData.second->State == TreeEntry::StridedVectorize ||
                  OpData.second->State == TreeEntry::CompressVectorize ||
                  OpData.second->State == TreeEntry::SplitVectorize);
    if (UserTE->hasState()) {
      if (UserTE->getOpcode() == Instruction::ExtractElement ||
          UserTE->getOpcode() == Instruction::ExtractValue)
      if (UserTE->getOpcode() == Instruction::InsertElement && I == 0)
      if (UserTE->getOpcode() == Instruction::Store &&
          UserTE->State == TreeEntry::Vectorize && I == 1)
      if (UserTE->getOpcode() == Instruction::Load &&
          (UserTE->State == TreeEntry::Vectorize ||
           UserTE->State == TreeEntry::StridedVectorize ||
           UserTE->State == TreeEntry::CompressVectorize))
    TreeEntry *TE = getOperandEntry(UserTE, I);
    assert(TE && "Expected operand entry.");
    if (!TE->isGather()) {
      Edges.emplace_back(I, TE);
    if (TE->State == TreeEntry::ScatterVectorize &&
        TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
    if (ReorderableGathers.contains(TE))
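  // Orders tree entries by the index of their user entry (falling back to
  // their own index) so that operands of the same user are popped from the
  // priority queue together.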
  struct TreeEntryCompare {
    bool operator()(const TreeEntry *LHS, const TreeEntry *RHS) const {
      if (LHS->UserTreeIndex && RHS->UserTreeIndex)
        return LHS->UserTreeIndex.UserTE->Idx < RHS->UserTreeIndex.UserTE->Idx;
      return LHS->Idx < RHS->Idx;
  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    if (TE->State != TreeEntry::Vectorize &&
        TE->State != TreeEntry::StridedVectorize &&
        TE->State != TreeEntry::CompressVectorize &&
        TE->State != TreeEntry::SplitVectorize)
      NonVectorized.insert(TE.get());
    if (std::optional<OrdersType> CurrentOrder =
      Queue.push(TE.get());
      if (!(TE->State == TreeEntry::Vectorize ||
            TE->State == TreeEntry::StridedVectorize ||
            TE->State == TreeEntry::CompressVectorize ||
            TE->State == TreeEntry::SplitVectorize) ||
          !TE->ReuseShuffleIndices.empty())
        GathersToOrders.insert(TE.get());
  while (!Queue.empty()) {
    std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>>
        Users;
    TreeEntry *TE = Queue.top();
    const TreeEntry *UserTE = TE->UserTreeIndex.UserTE;
    while (!Queue.empty()) {
      if (!UserTE || UserTE != TE->UserTreeIndex.UserTE)
    for (TreeEntry *TE : OrderedOps) {
      if (!(TE->State == TreeEntry::Vectorize ||
            TE->State == TreeEntry::StridedVectorize ||
            TE->State == TreeEntry::CompressVectorize ||
            TE->State == TreeEntry::SplitVectorize ||
            (TE->isGather() && GathersToOrders.contains(TE))) ||
          !TE->UserTreeIndex || !TE->ReuseShuffleIndices.empty() ||
          !Visited.insert(TE).second)
      Users.first = TE->UserTreeIndex.UserTE;
      Users.second.emplace_back(TE->UserTreeIndex.EdgeIdx, TE);
    if (Data.first->State == TreeEntry::SplitVectorize) {
             Data.second.size() <= 2 &&
             "Expected not greater than 2 operands for split vectorize node.");
                 [](const auto &Op) { return !Op.second->UserTreeIndex; }))
      assert(Data.first->CombinedEntriesWithIndices.size() == 2 &&
             "Expected exactly 2 entries.");
      for (const auto &P : Data.first->CombinedEntriesWithIndices) {
        TreeEntry &OpTE = *VectorizableTree[P.first];
        if (Order.empty() || !OpTE.ReuseShuffleIndices.empty()) {
          if (!OpTE.isGather() && OpTE.ReuseShuffleIndices.empty())
        const auto BestOrder =
        const unsigned E = Order.size();
          return I < E ? static_cast<int>(I) : PoisonMaskElem;
        Data.first->reorderSplitNode(P.second ? 1 : 0, Mask, MaskOrder);
        if (!OpTE.ReorderIndices.empty()) {
          OpTE.ReorderIndices.clear();
        } else if (!OpTE.ReuseShuffleIndices.empty()) {
          assert(OpTE.isGather() && "Expected only gather/buildvector node.");
      if (Data.first->ReuseShuffleIndices.empty() &&
          !Data.first->ReorderIndices.empty()) {
        Queue.push(Data.first);
    buildReorderableOperands(Data.first, Data.second, NonVectorized,
    for (const auto &Op : Data.second) {
      TreeEntry *OpTE = Op.second;
      if (!VisitedOps.insert(OpTE).second)
      if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
      const auto Order = [&]() -> const OrdersType {
        if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty())
        return OpTE->ReorderIndices;
      if (Order.size() == 1)
      Value *Root = OpTE->hasState()
      auto GetSameNodesUsers = [&](Value *Root) {
        for (const TreeEntry *TE : ValueToGatherNodes.lookup(Root)) {
          if (TE != OpTE && TE->UserTreeIndex &&
              TE->getVectorFactor() == OpTE->getVectorFactor() &&
              TE->Scalars.size() == OpTE->Scalars.size() &&
              ((TE->ReorderIndices.empty() && OpTE->isSame(TE->Scalars)) ||
               (OpTE->ReorderIndices.empty() && TE->isSame(OpTE->Scalars))))
            Res.insert(TE->UserTreeIndex.UserTE);
        for (const TreeEntry *TE : getTreeEntries(Root)) {
          if (TE != OpTE && TE->UserTreeIndex &&
              TE->getVectorFactor() == OpTE->getVectorFactor() &&
              TE->Scalars.size() == OpTE->Scalars.size() &&
              ((TE->ReorderIndices.empty() && OpTE->isSame(TE->Scalars)) ||
               (OpTE->ReorderIndices.empty() && TE->isSame(OpTE->Scalars))))
            Res.insert(TE->UserTreeIndex.UserTE);
      auto GetNumOperands = [](const TreeEntry *TE) {
        if (TE->State == TreeEntry::SplitVectorize)
          return TE->getNumOperands();
          return CI->arg_size();
        return TE->getNumOperands();
      auto NodeShouldBeReorderedWithOperands = [&, TTI = TTI](
                                                   const TreeEntry *TE) {
          const TreeEntry *Op = getOperandEntry(TE, Idx);
          if (Op->isGather() && Op->hasState()) {
            const TreeEntry *VecOp =
                getSameValuesTreeEntry(Op->getMainOp(), Op->Scalars);
          if (Op->ReorderIndices.empty() && Op->ReuseShuffleIndices.empty())
        if (!RevisitedOps.insert(UTE).second)
        return UTE == Data.first || !UTE->ReorderIndices.empty() ||
               !UTE->ReuseShuffleIndices.empty() ||
               (UTE->UserTreeIndex &&
                UTE->UserTreeIndex.UserTE == Data.first) ||
               (Data.first->UserTreeIndex &&
                Data.first->UserTreeIndex.UserTE == UTE) ||
               (IgnoreReorder && UTE->UserTreeIndex &&
                UTE->UserTreeIndex.UserTE->Idx == 0) ||
               NodeShouldBeReorderedWithOperands(UTE);
      for (TreeEntry *UTE : Users) {
          const TreeEntry *Op = getOperandEntry(UTE, Idx);
          Queue.push(const_cast<TreeEntry *>(Op));
          Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
            return P.second == OpTE;
      if (OpTE->State == TreeEntry::Vectorize &&
          OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
        assert(!OpTE->isAltShuffle() &&
               "Alternate instructions are only supported by BinaryOperator "
        unsigned E = Order.size();
          return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
        OrdersUses.try_emplace(CurrentOrder, 0).first->second += NumOps;
        OrdersUses.try_emplace(Order, 0).first->second += NumOps;
      auto Res = OrdersUses.try_emplace(OrdersType(), 0);
      const auto AllowsReordering = [&](const TreeEntry *TE) {
        if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
            (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
            (IgnoreReorder && TE->Idx == 0))
        if (TE->isGather()) {
      if (OpTE->UserTreeIndex) {
        TreeEntry *UserTE = OpTE->UserTreeIndex.UserTE;
        if (!VisitedUsers.insert(UserTE).second)
        if (AllowsReordering(UserTE))
        if (static_cast<unsigned>(count_if(
                Ops, [UserTE, &AllowsReordering](
                         const std::pair<unsigned, TreeEntry *> &Op) {
                  return AllowsReordering(Op.second) &&
                         Op.second->UserTreeIndex.UserTE == UserTE;
                })) <= Ops.size() / 2)
          ++Res.first->second;
    if (OrdersUses.empty()) {
    unsigned IdentityCnt = 0;
    unsigned VF = Data.second.front().second->getVectorFactor();
    for (auto &Pair : OrdersUses) {
      IdentityCnt += Pair.second;
    unsigned Cnt = IdentityCnt;
    for (auto &Pair : OrdersUses) {
      if (Cnt < Pair.second) {
        BestOrder = Pair.first;
    unsigned E = BestOrder.size();
      return I < E ? static_cast<int>(I) : PoisonMaskElem;
    for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) {
      TreeEntry *TE = Op.second;
      if (!VisitedOps.insert(TE).second)
      if (TE->ReuseShuffleIndices.size() == BestOrder.size()) {
        reorderNodeWithReuses(*TE, Mask);
      if (TE->State != TreeEntry::Vectorize &&
          TE->State != TreeEntry::StridedVectorize &&
          TE->State != TreeEntry::CompressVectorize &&
          TE->State != TreeEntry::SplitVectorize &&
          (TE->State != TreeEntry::ScatterVectorize ||
           TE->ReorderIndices.empty()))
      assert((BestOrder.size() == TE->ReorderIndices.size() ||
              TE->ReorderIndices.empty()) &&
             "Non-matching sizes of user/operand entries.");
      if (IgnoreReorder && TE == VectorizableTree.front().get())
        IgnoreReorder = false;
    for (TreeEntry *Gather : GatherOps) {
             "Unexpected reordering of gathers.");
      if (!Gather->ReuseShuffleIndices.empty()) {
    auto IsNotProfitableAltCodeNode = [](const TreeEntry &TE) {
      return TE.isAltShuffle() &&
             (!TE.ReuseShuffleIndices.empty() || TE.getVectorFactor() == 2 ||
              TE.ReorderIndices.empty());
    if (Data.first->State != TreeEntry::Vectorize ||
            Data.first->getMainOp()) ||
        IsNotProfitableAltCodeNode(*Data.first))
      Data.first->reorderOperands(Mask);
        IsNotProfitableAltCodeNode(*Data.first) ||
        Data.first->State == TreeEntry::StridedVectorize ||
        Data.first->State == TreeEntry::CompressVectorize) {
      if (Data.first->ReuseShuffleIndices.empty() &&
          !Data.first->ReorderIndices.empty() &&
          !IsNotProfitableAltCodeNode(*Data.first)) {
        Queue.push(Data.first);
  if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
      VectorizableTree.front()->ReuseShuffleIndices.empty())
    VectorizableTree.front()->ReorderIndices.clear();
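/// Returns the instruction to anchor \p Entry at in the IR; reversed strided
/// load/store entries are special-cased since their access order is the
/// inverse of the scalar order.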
Instruction *BoUpSLP::getRootEntryInstruction(const TreeEntry &Entry) const {
  if (Entry.hasState() &&
      (Entry.getOpcode() == Instruction::Store ||
       Entry.getOpcode() == Instruction::Load) &&
      Entry.State == TreeEntry::StridedVectorize &&
      !Entry.ReorderIndices.empty() && isReverseOrder(Entry.ReorderIndices))
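// Walk all vectorized entries and record scalars that have users outside the
// tree (or too many users to inspect individually) as external uses that must
// be extracted from the vectorized value.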
  const size_t NumVectScalars = ScalarToTreeEntries.size() + 1;
  for (auto &TEPtr : VectorizableTree) {
    TreeEntry *Entry = TEPtr.get();
    if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize)
    for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
      Value *Scalar = Entry->Scalars[Lane];
      auto It = ScalarToExtUses.find(Scalar);
      if (It != ScalarToExtUses.end() && !ExternalUses[It->second].User)
      if (Scalar->hasNUsesOrMore(NumVectScalars)) {
        unsigned FoundLane = Entry->findLaneForValue(Scalar);
        LLVM_DEBUG(dbgs() << "SLP: Need to extract from lane " << FoundLane
                          << " from " << *Scalar << " for many users.\n");
        It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
        ExternalUses.emplace_back(Scalar, nullptr, *Entry, FoundLane);
        ExternalUsesWithNonUsers.insert(Scalar);
      const auto ExtI = ExternallyUsedValues.find(Scalar);
      if (ExtI != ExternallyUsedValues.end()) {
        unsigned FoundLane = Entry->findLaneForValue(Scalar);
        LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
                          << FoundLane << " from " << *Scalar << ".\n");
        ScalarToExtUses.try_emplace(Scalar, ExternalUses.size());
        ExternalUses.emplace_back(Scalar, nullptr, *Entry, FoundLane);
        if (UserIgnoreList && UserIgnoreList->contains(UserInst))
            !UseEntries.empty()) {
          if (!((Scalar->getType()->getScalarType()->isPointerTy() &&
                all_of(UseEntries, [&](TreeEntry *UseEntry) {
                  return UseEntry->State == TreeEntry::ScatterVectorize ||
                             Scalar, getRootEntryInstruction(*UseEntry), TLI,
            LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
                       [](TreeEntry *UseEntry) {
                         return UseEntry->isGather();
        if (It != ScalarToExtUses.end()) {
          ExternalUses[It->second].User = nullptr;
        if (U && Scalar->hasNUsesOrMore(UsesLimit))
        unsigned FoundLane = Entry->findLaneForValue(Scalar);
                          << " from lane " << FoundLane << " from " << *Scalar
        It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
        ExternalUses.emplace_back(Scalar, U, *Entry, FoundLane);
        ExternalUsesWithNonUsers.insert(Scalar);
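/// Collects the stores that use scalars of \p TE, bucketed by basic block,
/// stored type and underlying pointer, keeping at most one store per lane.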
BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
  for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) {
    Value *V = TE->Scalars[Lane];
      if (SI == nullptr || !SI->isSimple() || SI->getFunction() != F ||
      auto &StoresVec = PtrToStoresMap[{SI->getParent(),
                                        SI->getValueOperand()->getType(), Ptr}];
      if (StoresVec.size() > Lane)
      if (!StoresVec.empty()) {
            SI->getValueOperand()->getType(), SI->getPointerOperand(),
            SI->getValueOperand()->getType(),
            StoresVec.front()->getPointerOperand(), *DL, *SE,
      StoresVec.push_back(SI);
  for (auto &P : PtrToStoresMap) {
  StoreInst *S0 = StoresVec[0];
    StoreInst *SI = StoresVec[Idx];
    std::optional<int64_t> Diff =
        SI->getPointerOperand(), *DL, *SE,
  if (StoreOffsetVec.size() != StoresVec.size())
  sort(StoreOffsetVec, llvm::less_first());
  int64_t PrevDist = 0;
  for (const auto &P : StoreOffsetVec) {
    if (Idx > 0 && P.first != PrevDist + 1)
  ReorderIndices.assign(StoresVec.size(), 0);
  bool IsIdentity = true;
    ReorderIndices[P.second] = I;
    IsIdentity &= P.second == I;
    ReorderIndices.clear();
  for (unsigned Idx : Order)
    dbgs() << Idx << ", ";
BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const {
  unsigned NumLanes = TE->Scalars.size();
    if (StoresVec.size() != NumLanes)
    if (!canFormVector(StoresVec, ReorderIndices))
    ExternalReorderIndices.push_back(ReorderIndices);
  return ExternalReorderIndices;
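// Entry points for tree construction: reset the per-tree state and build the
// vectorizable tree recursively from the root values.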
  assert(TreeEntryToStridedPtrInfoMap.empty() &&
         "TreeEntryToStridedPtrInfoMap is not cleared");
  UserIgnoreList = &UserIgnoreLst;
  buildTreeRec(Roots, 0, EdgeInfo());
  assert(TreeEntryToStridedPtrInfoMap.empty() &&
         "TreeEntryToStridedPtrInfoMap is not cleared");
  buildTreeRec(Roots, 0, EdgeInfo());
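// Clusters scalar loads by parent block, type and constant pointer distance
// so that related loads can later be re-examined for vectorization.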
                bool AddNew = true) {
  for (Value *V : VL) {
    if (R.isDeleted(LI) || R.isVectorized(LI) || !LI->isSimple())
    bool IsFound = false;
    for (auto [Map, Data] : zip(ClusteredDistToLoad, ClusteredLoads)) {
      assert(LI->getParent() == Data.front().first->getParent() &&
             LI->getType() == Data.front().first->getType() &&
             "Expected loads with the same type, same parent and same "
             "underlying pointer.");
          LI->getType(), LI->getPointerOperand(), Data.front().first->getType(),
          Data.front().first->getPointerOperand(), DL, SE,
      auto It = Map.find(*Dist);
      if (It != Map.end() && It->second != LI)
      if (It == Map.end()) {
        Data.emplace_back(LI, *Dist);
        Map.try_emplace(*Dist, LI);
  auto FindMatchingLoads =
          int64_t &Offset, unsigned &Start) {
          return GatheredLoads.end();
        std::optional<int64_t> Dist =
                Data.front().first->getType(),
                Data.front().first->getPointerOperand(), DL, SE,
          for (std::pair<LoadInst *, int64_t> P : Data) {
          unsigned NumUniques = 0;
          for (auto [Cnt, Pair] : enumerate(Loads)) {
            bool Used = DataLoads.contains(Pair.first);
            if (!Used && !DataDists.contains(*Dist + Pair.second)) {
              Repeated.insert(Cnt);
          if (NumUniques > 0 &&
              (Loads.size() == NumUniques ||
               (Loads.size() - NumUniques >= 2 &&
                Loads.size() - NumUniques >= Loads.size() / 2 &&
            return std::next(GatheredLoads.begin(), Idx);
        return GatheredLoads.end();
  for (ArrayRef<std::pair<LoadInst *, int64_t>> Data : ClusteredLoads) {
    auto *It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated,
    while (It != GatheredLoads.end()) {
      assert(!LocalToAdd.empty() && "Expected some elements to add.");
      for (unsigned Idx : LocalToAdd)
      It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated, Offset,
        return !ToAdd.contains(Idx) && !Repeated.contains(Idx);
        Loads.push_back(Data[Idx]);
          GatheredLoads, [&](ArrayRef<std::pair<LoadInst *, int64_t>> PD) {
            return PD.front().first->getParent() == LI->getParent() &&
                   PD.front().first->getType() == LI->getType();
      while (It != GatheredLoads.end()) {
            std::next(It), GatheredLoads.end(),
            [&](ArrayRef<std::pair<LoadInst *, int64_t>> PD) {
              return PD.front().first->getParent() == LI->getParent() &&
                     PD.front().first->getType() == LI->getType();
        GatheredLoads.emplace_back().append(Data.begin(), Data.end());
      AddNewLoads(GatheredLoads.emplace_back());
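// Tries to turn previously gathered loads into vector tree nodes, preferring
// consecutive slices and falling back to masked-gather or strided forms where
// the target supports them.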
void BoUpSLP::tryToVectorizeGatheredLoads(
    const SmallMapVector<
        std::tuple<BasicBlock *, Value *, Type *>,
  GatheredLoadsEntriesFirst = VectorizableTree.size();
      LoadEntriesToVectorize.size());
  for (auto [Idx, Set] : zip(LoadEntriesToVectorize, LoadSetsToVectorize))
    Set.insert_range(VectorizableTree[Idx]->Scalars);
  auto LoadSorter = [](const std::pair<LoadInst *, int64_t> &L1,
                       const std::pair<LoadInst *, int64_t> &L2) {
    return L1.second > L2.second;
    auto *Ty = getWidenedType(Loads.front()->getType(), Loads.size());
    return TTI->isLegalMaskedGather(Ty, Alignment) &&
           !TTI->forceScalarizeMaskedGather(Ty, Alignment);
          SmallVectorImpl<LoadInst *> &NonVectorized,
          bool Final, unsigned MaxVF) {
    unsigned StartIdx = 0;
    SmallVector<int> CandidateVFs;
        *TTI, Loads.front()->getType(), MaxVF);
             *TTI, Loads.front()->getType(), NumElts - 1)) {
    if (Final && CandidateVFs.empty())
    unsigned BestVF = Final ? CandidateVFs.back() : 0;
    for (unsigned NumElts : CandidateVFs) {
      if (Final && NumElts > BestVF)
      SmallVector<unsigned> MaskedGatherVectorized;
      for (unsigned Cnt = StartIdx, E = Loads.size(); Cnt < E;
        if (VectorizedLoads.count(Slice.front()) ||
            VectorizedLoads.count(Slice.back()) ||
        bool AllowToVectorize = false;
        bool IsLegalBroadcastLoad = TTI->isLegalBroadcastLoad(
          for (LoadInst *LI : Slice) {
            if (LI->hasOneUse())
            if (static_cast<unsigned int>(std::distance(
                    LI->user_begin(), LI->user_end())) != LI->getNumUses())
            if (!IsLegalBroadcastLoad)
            for (User *U : LI->users()) {
              for (const TreeEntry *UTE : getTreeEntries(U)) {
                for (int I : seq<int>(UTE->getNumOperands())) {
                      return V == LI || isa<PoisonValue>(V);
          AllowToVectorize = CheckIfAllowed(Slice);
              any_of(ValueToGatherNodes.at(Slice.front()),
                     [=](const TreeEntry *TE) {
                       return TE->Scalars.size() == 2 &&
                              ((TE->Scalars.front() == Slice.front() &&
                                TE->Scalars.back() == Slice.back()) ||
                               (TE->Scalars.front() == Slice.back() &&
                                TE->Scalars.back() == Slice.front()));
        if (AllowToVectorize) {
              reinterpret_cast<Value *const *>(Slice.begin()), Slice.size());
          StridedPtrInfo SPtrInfo;
                                 PointerOps, SPtrInfo, &BestVF);
              (BestVF > 1 && static_cast<unsigned>(NumElts) == 2 * BestVF)) {
            if (MaskedGatherVectorized.empty() ||
                Cnt >= MaskedGatherVectorized.back() + NumElts)
          Results.emplace_back(Values, LS);
          VectorizedLoads.insert_range(Slice);
          if (Cnt == StartIdx)
            StartIdx += NumElts;
          if (StartIdx >= Loads.size())
        if (!MaskedGatherVectorized.empty() &&
            Cnt < MaskedGatherVectorized.back() + NumElts)
      if (!AllowToVectorize || BestVF == 0)
      for (unsigned Cnt : MaskedGatherVectorized) {
            Cnt, std::min<unsigned>(NumElts, Loads.size() - Cnt));
        VectorizedLoads.insert_range(Slice);
        if (Cnt == StartIdx)
          StartIdx += NumElts;
    for (LoadInst *LI : Loads) {
      if (!VectorizedLoads.contains(LI))
        NonVectorized.push_back(LI);
  auto ProcessGatheredLoads =
          bool Final = false) {
    for (ArrayRef<std::pair<LoadInst *, int64_t>> LoadsDists :
      if (LoadsDists.size() <= 1) {
        NonVectorized.push_back(LoadsDists.back().first);
      unsigned MaxConsecutiveDistance = 0;
      unsigned CurrentConsecutiveDist = 1;
      int64_t LastDist = LocalLoadsDists.front().second;
      bool AllowMaskedGather = IsMaskedGatherSupported(OriginalLoads);
      for (const std::pair<LoadInst *, int64_t> &L : LocalLoadsDists) {
        assert(LastDist >= L.second &&
               "Expected first distance always not less than second");
        if (static_cast<uint64_t>(LastDist - L.second) ==
            CurrentConsecutiveDist) {
          ++CurrentConsecutiveDist;
          MaxConsecutiveDistance =
              std::max(MaxConsecutiveDistance, CurrentConsecutiveDist);
        if (!AllowMaskedGather && CurrentConsecutiveDist == 1 &&
        CurrentConsecutiveDist = 1;
        LastDist = L.second;
      if (Loads.size() <= 1)
      if (AllowMaskedGather)
        MaxConsecutiveDistance = Loads.size();
      else if (MaxConsecutiveDistance < 2)
          GetVectorizedRanges(Loads, VectorizedLoads, SortedNonVectorized,
                              Final, MaxConsecutiveDistance);
          OriginalLoads.size() == Loads.size() &&
          MaxConsecutiveDistance == Loads.size() &&
        VectorizedLoads.clear();
            GetVectorizedRanges(OriginalLoads, VectorizedLoads,
                                UnsortedNonVectorized, Final,
                                OriginalLoads.size());
        if (SortedNonVectorized.size() >= UnsortedNonVectorized.size()) {
          SortedNonVectorized.swap(UnsortedNonVectorized);
          Results.swap(UnsortedResults);
                          << Slice.size() << ")\n");
        for (Value *L : Slice)
        unsigned MaxVF = Slice.size();
        unsigned UserMaxVF = 0;
        unsigned InterleaveFactor = 0;
          std::optional<unsigned> InterleavedLoadsDistance = 0;
          std::optional<unsigned> CommonVF = 0;
          DenseMap<const TreeEntry *, unsigned> EntryToPosition;
          SmallPtrSet<const TreeEntry *, 8> DeinterleavedNodes;
          for (auto [Idx, V] : enumerate(Slice)) {
            for (const TreeEntry *E : ValueToGatherNodes.at(V)) {
              UserMaxVF = std::max<unsigned>(UserMaxVF, E->Scalars.size());
                UserMaxVF = std::max<unsigned>(UserMaxVF, Idx - Pos + 1);
              if (*CommonVF == 0) {
                CommonVF = E->Scalars.size();
              if (*CommonVF != E->Scalars.size())
              if (Pos != Idx && InterleavedLoadsDistance) {
                      if (isa<Constant>(V))
                      if (isVectorized(V))
                      const auto &Nodes = ValueToGatherNodes.at(V);
                      return (Nodes.size() != 1 || !Nodes.contains(E)) &&
                             !is_contained(Slice, V);
                  InterleavedLoadsDistance.reset();
                if (*InterleavedLoadsDistance == 0) {
                  InterleavedLoadsDistance = Idx - Pos;
                if ((Idx - Pos) % *InterleavedLoadsDistance != 0 ||
                    (Idx - Pos) / *InterleavedLoadsDistance < Order)
                  InterleavedLoadsDistance.reset();
                Order = (Idx - Pos) / InterleavedLoadsDistance.value_or(1);
          DeinterleavedNodes.clear();
          if (InterleavedLoadsDistance.value_or(0) > 1 &&
              CommonVF.value_or(0) != 0) {
            InterleaveFactor = bit_ceil(*InterleavedLoadsDistance);
            unsigned VF = *CommonVF;
            StridedPtrInfo SPtrInfo;
            if (InterleaveFactor <= Slice.size() &&
                TTI.isLegalInterleavedAccessType(
              UserMaxVF = InterleaveFactor * VF;
              InterleaveFactor = 0;
        unsigned ConsecutiveNodesSize = 0;
        if (!LoadEntriesToVectorize.empty() && InterleaveFactor == 0 &&
            any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
                   [&, Slice = Slice](const auto &P) {
                           return std::get<1>(P).contains(V);
                     if (It == Slice.end())
                     const TreeEntry &TE =
                         *VectorizableTree[std::get<0>(P)];
                     StridedPtrInfo SPtrInfo;
                         VL, VL.front(), Order, PointerOps, SPtrInfo);
                     ConsecutiveNodesSize += VL.size();
                     size_t Start = std::distance(Slice.begin(), It);
                     size_t Sz = Slice.size() - Start;
                     return Sz < VL.size() ||
                            Slice.slice(Start, VL.size()) != VL;
        if (InterleaveFactor == 0 &&
                   [&, Slice = Slice](unsigned Idx) {
                     SmallVector<Value *> PointerOps;
                     StridedPtrInfo SPtrInfo;
                     return canVectorizeLoads(
                                Slice.slice(Idx * UserMaxVF, UserMaxVF),
                                Slice[Idx * UserMaxVF], Order, PointerOps,
                                SPtrInfo) == LoadsState::ScatterVectorize;
        if (Slice.size() != ConsecutiveNodesSize)
          MaxVF = std::min<unsigned>(MaxVF, UserMaxVF);
        for (unsigned VF = MaxVF; VF >= 2; VF /= 2) {
          bool IsVectorized = true;
          for (unsigned I = 0, E = Slice.size(); I < E; I += VF) {
                Slice.slice(I, std::min(VF, E - I));
            if (any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
                       [&](const auto &P) {
                             VectorizableTree[std::get<0>(P)]
            unsigned Sz = VectorizableTree.size();
            buildTreeRec(SubSlice, 0, EdgeInfo(), InterleaveFactor);
            if (Sz == VectorizableTree.size()) {
              IsVectorized = false;
          if (InterleaveFactor > 0) {
            VF = 2 * (MaxVF / InterleaveFactor);
            InterleaveFactor = 0;
    NonVectorized.append(SortedNonVectorized);
    return NonVectorized;
  for (const auto &GLs : GatheredLoads) {
    const auto &Ref = GLs.second;
    if (!Ref.empty() && !NonVectorized.empty() &&
            Ref.begin(), Ref.end(), 0u,
            [](unsigned S, ArrayRef<std::pair<LoadInst *, int64_t>> LoadsDists)
                -> unsigned { return S + LoadsDists.size(); }) !=
            NonVectorized.size() &&
        IsMaskedGatherSupported(NonVectorized)) {
      for (LoadInst *LI : NonVectorized) {
      (void)ProcessGatheredLoads(FinalGatheredLoads, true);
  for (unsigned Idx : LoadEntriesToVectorize) {
    const TreeEntry &E = *VectorizableTree[Idx];
    if (!E.ReorderIndices.empty()) {
      SmallVector<int> ReorderMask;
    buildTreeRec(GatheredScalars, 0, EdgeInfo());
  if (static_cast<unsigned>(*GatheredLoadsEntriesFirst) ==
      VectorizableTree.size())
    GatheredLoadsEntriesFirst.reset();
                                bool AllowAlternate) {
    std::pair<size_t, size_t> OpVals =
      if (CI->isCommutative())
        SubKey = hash_value(Gep->getPointerOperand());
  return std::make_pair(Key, SubKey);
                           Instruction *AltOp, const TargetLibraryInfo &TLI);
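/// Estimates whether an alternate-opcode node is worth keeping by counting
/// the unique opcodes, undefs and extra shuffles its operands would require.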
bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S,
  Type *ScalarTy = S.getMainOp()->getType();
  unsigned Opcode0 = S.getOpcode();
  unsigned Opcode1 = S.getAltOpcode();
  SmallBitVector OpcodeMask(getAltInstrMask(VL, ScalarTy, Opcode0, Opcode1));
                                     Opcode1, OpcodeMask))
  for (unsigned I : seq<unsigned>(S.getMainOp()->getNumOperands())) {
    for (Value *V : VL) {
      Operands.back().push_back(
  if (Operands.size() == 2) {
      Candidates[0] = std::make_pair(Operands[0][I], Operands[0][I + 1]);
      Candidates[1] = std::make_pair(Operands[0][I], Operands[1][I + 1]);
      Candidates[2] = std::make_pair(Operands[1][I], Operands[0][I + 1]);
      switch (Res.value_or(0)) {
        std::swap(Operands[0][I + 1], Operands[1][I + 1]);
  DenseSet<unsigned> UniqueOpcodes;
  constexpr unsigned NumAltInsts = 3;
  unsigned NonInstCnt = 0;
  unsigned UndefCnt = 0;
  unsigned ExtraShuffleInsts = 0;
  if (Operands.size() == 2) {
    if (Operands.front() == Operands.back()) {
          return is_contained(Operands.back(), V);
      ++ExtraShuffleInsts;
  const Loop *L = LI->getLoopFor(S.getMainOp()->getParent());
  DenseMap<Value *, unsigned> Uniques;
      if (!Res.second && Res.first->second == 1)
        ++ExtraShuffleInsts;
      ++Res.first->getSecond();
        UniqueOpcodes.insert(I->getOpcode());
      else if (Res.second)
  return none_of(Uniques, [&](const auto &P) {
    return P.first->hasNUsesOrMore(P.second + 1) &&
           none_of(P.first->users(), [&](User *U) {
             return isVectorized(U) || Uniques.contains(U);
         (UndefCnt < (VL.size() - 1) * S.getMainOp()->getNumOperands() &&
          (UniqueOpcodes.size() + NonInstCnt + ExtraShuffleInsts +
           NumAltInsts) < S.getMainOp()->getNumOperands() * VL.size());
                                 const unsigned VF, unsigned MinBW,
static std::pair<InstructionCost, InstructionCost>
    FMF = FPCI->getFastMathFlags();
                   LibCost.isValid() ? LibCost : ScalarLimit);
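/// Classifies a bundle of scalars: returns the tree entry state (Vectorize,
/// StridedVectorize, NeedToGather, ...) chosen by opcode-specific legality
/// checks.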
BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
    bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
    SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo) {
         "Expected instructions with same/alternate opcodes only.");
  unsigned ShuffleOrOp =
      S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
  switch (ShuffleOrOp) {
  case Instruction::PHI: {
      return TreeEntry::NeedToGather;
    for (Value *V : VL) {
      for (Value *Incoming : PHI->incoming_values()) {
        if (Term && Term->isTerminator()) {
                     << "SLP: Need to swizzle PHINodes (terminator use).\n");
          return TreeEntry::NeedToGather;
    return TreeEntry::Vectorize;
  case Instruction::ExtractElement:
      return TreeEntry::NeedToGather;
  case Instruction::ExtractValue: {
    bool Reuse = canReuseExtract(VL, CurrentOrder);
      return TreeEntry::NeedToGather;
    if (Reuse || !CurrentOrder.empty())
      return TreeEntry::Vectorize;
    return TreeEntry::NeedToGather;
  case Instruction::InsertElement: {
    for (Value *V : VL) {
        LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement/poison vector.\n");
        return TreeEntry::NeedToGather;
           "Non-constant or undef index?");
          return !SourceVectors.contains(V);
      LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
                           "different source vectors.\n");
      return TreeEntry::NeedToGather;
          return SourceVectors.contains(V) && !V->hasOneUse();
      LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
                           "multiple uses.\n");
      return TreeEntry::NeedToGather;
    return TreeEntry::Vectorize;
  case Instruction::Load: {
    auto IsGatheredNode = [&]() {
      if (!GatheredLoadsEntriesFirst)
        return any_of(getTreeEntries(V), [&](const TreeEntry *TE) {
          return TE->Idx >= *GatheredLoadsEntriesFirst;
      return TreeEntry::Vectorize;
      if (!IsGraphTransformMode && !VectorizableTree.empty()) {
        LoadEntriesToVectorize.insert(VectorizableTree.size());
        return TreeEntry::NeedToGather;
      return IsGatheredNode() ? TreeEntry::NeedToGather
                              : TreeEntry::CompressVectorize;
      if (!IsGraphTransformMode && !VectorizableTree.empty()) {
        LoadEntriesToVectorize.insert(VectorizableTree.size());
        return TreeEntry::NeedToGather;
      return IsGatheredNode() ? TreeEntry::NeedToGather
                              : TreeEntry::ScatterVectorize;
      if (!IsGraphTransformMode && VectorizableTree.size() > 1) {
        LoadEntriesToVectorize.insert(VectorizableTree.size());
        return TreeEntry::NeedToGather;
      return IsGatheredNode() ? TreeEntry::NeedToGather
                              : TreeEntry::StridedVectorize;
      if (DL->getTypeSizeInBits(ScalarTy) !=
          DL->getTypeAllocSizeInBits(ScalarTy))
        LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
            return !LI || !LI->isSimple();
        LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
      return TreeEntry::NeedToGather;
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    for (Value *V : VL) {
            dbgs() << "SLP: Gathering casts with different src types.\n");
        return TreeEntry::NeedToGather;
    return TreeEntry::Vectorize;
  case Instruction::ICmp:
  case Instruction::FCmp: {
    for (Value *V : VL) {
      if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
          Cmp->getOperand(0)->getType() != ComparedTy) {
        LLVM_DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
        return TreeEntry::NeedToGather;
    return TreeEntry::Vectorize;
  case Instruction::Select:
  case Instruction::FNeg:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::Freeze:
    if (S.getMainOp()->getType()->isFloatingPointTy() &&
        TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) {
          return I && I->isBinaryOp() && !I->isFast();
      return TreeEntry::NeedToGather;
    return TreeEntry::Vectorize;
  case Instruction::GetElementPtr: {
    for (Value *V : VL) {
      if (I->getNumOperands() != 2) {
        LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
        return TreeEntry::NeedToGather;
    for (Value *V : VL) {
      Type *CurTy = GEP->getSourceElementType();
      if (Ty0 != CurTy) {
        LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
        return TreeEntry::NeedToGather;
    for (Value *V : VL) {
      auto *Op = I->getOperand(1);
          (Op->getType() != Ty1 &&
           Op->getType()->getScalarSizeInBits() >
               DL->getIndexSizeInBits(
                   V->getType()->getPointerAddressSpace())))) {
            dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
        return TreeEntry::NeedToGather;
    return TreeEntry::Vectorize;
  case Instruction::Store: {
    llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
    if (DL->getTypeSizeInBits(ScalarTy) !=
        DL->getTypeAllocSizeInBits(ScalarTy)) {
      LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n");
      return TreeEntry::NeedToGather;
    for (Value *V : VL) {
      if (!SI->isSimple()) {
        return TreeEntry::NeedToGather;
      if (CurrentOrder.empty()) {
        Ptr0 = PointerOps.front();
        PtrN = PointerOps.back();
        Ptr0 = PointerOps[CurrentOrder.front()];
        PtrN = PointerOps[CurrentOrder.back()];
      std::optional<int64_t> Dist =
      if (static_cast<uint64_t>(*Dist) == VL.size() - 1)
        return TreeEntry::Vectorize;
    return TreeEntry::NeedToGather;
  case Instruction::Call: {
    if (S.getMainOp()->getType()->isFloatingPointTy() &&
        TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) {
          return I && !I->isFast();
      return TreeEntry::NeedToGather;
    Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
      return TreeEntry::NeedToGather;
    unsigned NumArgs = CI->arg_size();
    for (unsigned J = 0; J != NumArgs; ++J)
    for (Value *V : VL) {
           VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) ||
        LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V
        return TreeEntry::NeedToGather;
      for (unsigned J = 0; J != NumArgs; ++J) {
          if (ScalarArgs[J] != A1J) {
                       << "SLP: mismatched arguments in call:" << *CI
                       << " argument " << ScalarArgs[J] << "!=" << A1J << "\n");
            return TreeEntry::NeedToGather;
        LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI
                          << "!=" << *V << '\n');
        return TreeEntry::NeedToGather;
    auto *VecTy = getWidenedType(S.getMainOp()->getType(), VL.size());
    if (!VecCallCosts.first.isValid() && !VecCallCosts.second.isValid())
      return TreeEntry::NeedToGather;
    return TreeEntry::Vectorize;
  case Instruction::ShuffleVector: {
    if (!S.isAltShuffle()) {
      return TreeEntry::Vectorize;
      LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
      return TreeEntry::NeedToGather;
          << "SLP: ShuffleVector not vectorized, operands are buildvector and "
             "the whole alt sequence is not profitable.\n");
      return TreeEntry::NeedToGather;
    return TreeEntry::Vectorize;
    return TreeEntry::NeedToGather;
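// Helper that gathers the incoming values of a group of PHI nodes per
// incoming block so that operand vectors line up lane by lane.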
  PHINode *Main = nullptr;
  PHIHandler() = delete;
      : DT(DT), Main(Main), Phis(Phis),
        Operands(Main->getNumIncomingValues(),
  void buildOperands() {
    constexpr unsigned FastLimit = 4;
      for (auto [Idx, V] : enumerate(Phis)) {
               "Expected isa instruction or poison value.");
          Operands[I][Idx] = V;
          if (P->getIncomingBlock(I) == InBB)
            Operands[I][Idx] = P->getIncomingValue(I);
          Operands[I][Idx] = P->getIncomingValueForBlock(InBB);
    SmallMapVector<BasicBlock *, SmallVector<unsigned>, 4>
    for (auto [Idx, V] : enumerate(Phis)) {
        Operands[I][Idx] = V;
          Operands[I][Idx] = P->getIncomingValue(I);
        auto *It = Blocks.find(InBB);
        if (It == Blocks.end())
        Operands[It->second.front()][Idx] = P->getIncomingValue(I);
    for (const auto &P : Blocks) {
      ArrayRef<unsigned> IncomingValues = P.second;
      if (IncomingValues.size() <= 1)
      for (unsigned I : IncomingValues) {
                      [&](const auto &Data) {
                        return !Data.value() ||
                               Data.value() == Operands[BasicI][Data.index()];
               "Expected empty operands list.");
        Operands[I] = Operands[BasicI];
static std::pair<Instruction *, Instruction *>
  for (Value *V : VL) {
    if (MainOp->getOpcode() == I->getOpcode()) {
         "Expected different main and alt instructions.");
  return std::make_pair(MainOp, AltOp);
                                  const InstructionsState &S,
                                  bool TryPad = false) {
  for (Value *V : VL) {
  size_t NumUniqueScalarValues = UniqueValues.size();
  if (NumUniqueScalarValues == VL.size() &&
    ReuseShuffleIndices.clear();
    if ((UserTreeIdx.UserTE &&
         UserTreeIdx.UserTE->hasNonWholeRegisterOrNonPowerOf2Vec(TTI)) ||
      LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
                           "for nodes with padding.\n");
      ReuseShuffleIndices.clear();
    if (NumUniqueScalarValues <= 1 || !IsFullVectors ||
      if (TryPad && UniquePositions.size() > 1 && NumUniqueScalarValues > 1 &&
          S.getMainOp()->isSafeToRemove() &&
          (S.areInstructionsWithCopyableElements() ||
            TTI, UniqueValues.front()->getType(), UniqueValues.size());
        PWSz = std::min<unsigned>(PWSz, VL.size());
        if (PWSz == VL.size()) {
          ReuseShuffleIndices.clear();
                                                 UniqueValues.end());
          PaddedUniqueValues.append(
              PWSz - UniqueValues.size(),
          if ((!S.areInstructionsWithCopyableElements() &&
              (S.areInstructionsWithCopyableElements() && S.isMulDivLikeOp() &&
               (S.getMainOp()->isIntDivRem() || S.getMainOp()->isFPDivRem() ||
            ReuseShuffleIndices.clear();
          VL = std::move(PaddedUniqueValues);
      ReuseShuffleIndices.clear();
      VL = std::move(UniqueValues);
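// Checks whether the bundle can be split into two sub-nodes by main/alternate
// opcode and whether that split is cheaper than a single alternate node.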
                               const InstructionsState &LocalState,
                               SmallVectorImpl<Value *> &Op1,
                               SmallVectorImpl<Value *> &Op2,
  constexpr unsigned SmallNodeSize = 4;
  if (VL.size() <= SmallNodeSize ||
      TTI->preferAlternateOpcodeVectorization() ||
  LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *LocalState.getMainOp()
  for (TreeEntry *E : getSplitTreeEntries(LocalState.getMainOp())) {
    if (E->isSame(VL)) {
                        << *LocalState.getMainOp() << ".\n");
  ReorderIndices.assign(VL.size(), VL.size());
  SmallBitVector Op1Indices(VL.size());
      Op1Indices.set(Idx);
    if ((LocalState.getAltOpcode() != LocalState.getOpcode() &&
        (LocalState.getAltOpcode() == LocalState.getOpcode() &&
                             LocalState.getAltOp(), *TLI))) {
      Op1Indices.set(Idx);
  unsigned Opcode0 = LocalState.getOpcode();
  unsigned Opcode1 = LocalState.getAltOpcode();
  SmallBitVector OpcodeMask(getAltInstrMask(VL, ScalarTy, Opcode0, Opcode1));
  if (UOp1.size() <= 1 || UOp2.size() <= 1 ||
      TTI->isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask) ||
  unsigned Op1Cnt = 0, Op2Cnt = Op1.size();
    if (Op1Indices.test(Idx)) {
      ReorderIndices[Op1Cnt] = Idx;
      ReorderIndices[Op2Cnt] = Idx;
    ReorderIndices.clear();
  SmallVector<int> Mask;
  if (!ReorderIndices.empty())
  unsigned NumParts = TTI->getNumberOfParts(VecTy);
  if (NumParts >= VL.size())
  FixedVectorType *SubVecTy =
  if (!LocalState.isCmpOp() && NumParts <= 1 &&
      (Mask.empty() || InsertCost >= NewShuffleCost))
  if ((LocalState.getMainOp()->isBinaryOp() &&
       LocalState.getAltOp()->isBinaryOp() &&
       (LocalState.isShiftOp() || LocalState.isBitwiseLogicOp() ||
        LocalState.isAddSubLikeOp() || LocalState.isMulDivLikeOp())) ||
      (LocalState.getMainOp()->isCast() && LocalState.getAltOp()->isCast()) ||
      (LocalState.getMainOp()->isUnaryOp() &&
       LocalState.getAltOp()->isUnaryOp())) {
        TTI->getArithmeticInstrCost(Opcode0, VecTy, Kind) +
        TTI->getArithmeticInstrCost(Opcode1, VecTy, Kind);
      OriginalMask[Idx] = Idx + (Op1Indices.test(Idx) ? 0 : VL.size());
                                    VecTy, OriginalMask, Kind);
        TTI->getArithmeticInstrCost(Opcode0, Op1VecTy, Kind) +
        TTI->getArithmeticInstrCost(Opcode1, Op2VecTy, Kind);
        NewVecOpsCost + InsertCost +
        (!VectorizableTree.empty() && VectorizableTree.front()->hasState() &&
         VectorizableTree.front()->getOpcode() == Instruction::Store
    if (NewCost >= OriginalCost)
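/// Analyzes a bundle for a common "main" instruction and builds per-operand
/// value lists; copyable elements are modeled through the opcode's identity
/// value so that mixed bundles can still form a single node.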
class InstructionsCompatibilityAnalysis {
  const DataLayout &DL;
  const TargetTransformInfo &TTI;
  const TargetLibraryInfo &TLI;
  unsigned MainOpcode = 0;
  static bool isSupportedOpcode(const unsigned Opcode) {
    return Opcode == Instruction::Add || Opcode == Instruction::Sub ||
           Opcode == Instruction::LShr || Opcode == Instruction::Shl ||
           Opcode == Instruction::SDiv || Opcode == Instruction::UDiv ||
           Opcode == Instruction::And || Opcode == Instruction::Or ||
           Opcode == Instruction::Xor;
    auto IsSupportedInstruction = [&](Instruction *I, bool AnyUndef) {
      if (AnyUndef && (I->isIntDivRem() || I->isFPDivRem() || isa<CallInst>(I)))
      return I && isSupportedOpcode(I->getOpcode()) &&
    SmallDenseSet<Value *, 8> Operands;
    SmallMapVector<unsigned, SmallVector<Instruction *>, 4> Candidates;
    bool AnyUndef = false;
    for (Value *V : VL) {
      if (Candidates.empty()) {
        Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
        Operands.insert(I->op_begin(), I->op_end());
      if (Parent == I->getParent()) {
        Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
        Operands.insert(I->op_begin(), I->op_end());
      auto *NodeA = DT.getNode(Parent);
      auto *NodeB = DT.getNode(I->getParent());
      assert(NodeA && "Should only process reachable instructions");
      assert(NodeB && "Should only process reachable instructions");
      assert((NodeA == NodeB) ==
                 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
             "Different nodes should have different DFS numbers");
      if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn()) {
        Candidates.clear();
        Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
        Operands.insert(I->op_begin(), I->op_end());
    unsigned BestOpcodeNum = 0;
    for (const auto &P : Candidates) {
      if (P.second.size() < BestOpcodeNum)
                 [&](Instruction *I) { return Operands.contains(I); }))
      for (Instruction *I : P.second) {
        if (IsSupportedInstruction(I, AnyUndef)) {
          BestOpcodeNum = P.second.size();
      return I && I->getParent() == MainOp->getParent() &&
  Value *selectBestIdempotentValue() const {
    assert(isSupportedOpcode(MainOpcode) && "Unsupported opcode");
    if (!S.isCopyableElement(V))
    assert(isSupportedOpcode(MainOpcode) && "Unsupported opcode");
    return {V, selectBestIdempotentValue()};
                       SmallVectorImpl<BoUpSLP::ValueList> &Operands) const {
    unsigned ShuffleOrOp =
        S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
    switch (ShuffleOrOp) {
    case Instruction::PHI: {
      PHIHandler Handler(DT, PH, VL);
      Handler.buildOperands();
      Operands.assign(PH->getNumOperands(), {});
        Operands[I].assign(Handler.getOperands(I).begin(),
                           Handler.getOperands(I).end());
    case Instruction::ExtractValue:
    case Instruction::ExtractElement:
    case Instruction::InsertElement:
    case Instruction::Load:
      for (auto [V, Op] : zip(VL, Operands.back())) {
          Op = LI->getPointerOperand();
    case Instruction::ZExt:
    case Instruction::SExt:
    case Instruction::FPToUI:
    case Instruction::FPToSI:
    case Instruction::FPExt:
    case Instruction::PtrToInt:
    case Instruction::IntToPtr:
    case Instruction::SIToFP:
    case Instruction::UIToFP:
    case Instruction::Trunc:
    case Instruction::FPTrunc:
    case Instruction::BitCast:
    case Instruction::ICmp:
    case Instruction::FCmp:
    case Instruction::Select:
    case Instruction::FNeg:
    case Instruction::Add:
    case Instruction::FAdd:
    case Instruction::Sub:
    case Instruction::FSub:
    case Instruction::Mul:
    case Instruction::FMul:
    case Instruction::UDiv:
    case Instruction::SDiv:
    case Instruction::FDiv:
    case Instruction::URem:
    case Instruction::SRem:
    case Instruction::FRem:
    case Instruction::Shl:
    case Instruction::LShr:
    case Instruction::AShr:
    case Instruction::And:
    case Instruction::Or:
    case Instruction::Xor:
    case Instruction::Freeze:
    case Instruction::Store:
    case Instruction::ShuffleVector:
        auto [Op, ConvertedOps] = convertTo(I, S);
    case Instruction::GetElementPtr: {
      const unsigned IndexIdx = 1;
        return !GEP || VL0Ty == GEP->getOperand(IndexIdx)->getType();
                                 ->getPointerOperandType()
                                 ->getScalarType());
          Operands[0][Idx] = V;
          Operands[1][Idx] = ConstantInt::getNullValue(Ty);
        Operands[0][Idx] = GEP->getPointerOperand();
        auto *Op = GEP->getOperand(IndexIdx);
                               CI, Ty, CI->getValue().isSignBitSet(), DL)
    case Instruction::Call: {
      for (Value *V : VL) {
          Ops.push_back(I ? I->getOperand(Idx)
  InstructionsCompatibilityAnalysis(DominatorTree &DT, const DataLayout &DL,
                                    const TargetTransformInfo &TTI,
                                    const TargetLibraryInfo &TLI)
                                          bool TryCopyableElementsVectorization,
                                          bool WithProfitabilityCheck = false,
                                          bool SkipSameCodeCheck = false) {
    InstructionsState S = (SkipSameCodeCheck || !allSameBlock(VL))
                              ? InstructionsState::invalid()
      findAndSetMainInstruction(VL, R);
      return InstructionsState::invalid();
    S = InstructionsState(MainOp, MainOp, true);
    if (!WithProfitabilityCheck)
    auto BuildCandidates =
        [](SmallVectorImpl<std::pair<Value *, Value *>> &Candidates, Value *V1,
          if (I1 && I2 && I1->getOpcode() == I2->getOpcode() &&
              I1->getParent() != I2->getParent())
    if (VL.size() == 2) {
      BuildCandidates(Candidates1, Operands[0][0], Operands[0][1]);
      BuildCandidates(Candidates2, Operands[1][0], Operands[1][1]);
      bool Res = !Candidates1.empty() && !Candidates2.empty() &&
                 R.findBestRootPair(Candidates1) &&
                 R.findBestRootPair(Candidates2);
        Candidates1.clear();
        Candidates2.clear();
        BuildCandidates(Candidates1, Operands[0][0], Operands[1][1]);
        BuildCandidates(Candidates2, Operands[1][0], Operands[0][1]);
        Res = !Candidates1.empty() && !Candidates2.empty() &&
              R.findBestRootPair(Candidates1) &&
              R.findBestRootPair(Candidates2);
        return InstructionsState::invalid();
      FixedVectorType *VecTy =
      switch (MainOpcode) {
      case Instruction::Add:
      case Instruction::Sub:
      case Instruction::LShr:
      case Instruction::Shl:
      case Instruction::SDiv:
      case Instruction::UDiv:
      case Instruction::And:
      case Instruction::Or:
      case Instruction::Xor:
      if (VectorCost > ScalarCost)
        return InstructionsState::invalid();
    assert(Operands.size() == 2 && "Unexpected number of operands!");
    unsigned CopyableNum =
        count_if(VL, [&](Value *V) { return S.isCopyableElement(V); });
    if (CopyableNum < VL.size() / 2)
    const unsigned Limit = VL.size() / 24;
    if ((CopyableNum >= VL.size() - Limit ||
         (CopyableNum >= VL.size() - 1 && VL.size() > 4) ||
      return InstructionsState::invalid();
    for (auto &Ops : Operands) {
        return InstructionsState::invalid();
      constexpr unsigned Limit = 4;
      if (Operands.front().size() >= Limit) {
        SmallDenseMap<const Value *, unsigned> Counters;
              return C.second == 1;
      InstructionsCompatibilityAnalysis Analysis(DT, DL, TTI, TLI);
      InstructionsState OpS = Analysis.buildInstructionsState(
      if (!OpS || (OpS.getOpcode() == Instruction::PHI && !allSameBlock(Ops)))
      unsigned CopyableNum =
      return CopyableNum <= VL.size() / 2;
    if (!CheckOperand(Operands.front()))
      return InstructionsState::invalid();
    assert(S && "Invalid state!");
    if (S.areInstructionsWithCopyableElements()) {
      MainOp = S.getMainOp();
      MainOpcode = S.getOpcode();
        for (auto [OperandIdx, Operand] : enumerate(OperandsForValue))
          Operands[OperandIdx][Idx] = Operand;
      buildOriginalOperands(S, VL, Operands);
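/// Runs the generic legality and early profitability checks for a bundle
/// before a tree node is built: instruction-state validity, recursion depth,
/// already-vectorized "diamond" matches, ephemeral values and the like.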
BoUpSLP::ScalarsVectorizationLegality BoUpSLP::getScalarsVectorizationLegality(
    bool TryCopyableElementsVectorization) const {
  InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
  InstructionsState S = Analysis.buildInstructionsState(
      VL, *this, TryCopyableElementsVectorization,
      true, TryCopyableElementsVectorization);
    return ScalarsVectorizationLegality(S, false,
    LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.getMainOp() << ".\n");
    for (TreeEntry *E : getTreeEntries(S.getMainOp())) {
      if (E->isSame(VL)) {
        LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.getMainOp()
        return ScalarsVectorizationLegality(S, false);
        (S.getOpcode() == Instruction::PHI && isa<PHINode>(V) &&
         LI->getLoopFor(S.getMainOp()->getParent()) &&
    return ScalarsVectorizationLegality(S, false);
      !(S && !S.isAltShuffle() && VL.size() >= 4 &&
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
    return ScalarsVectorizationLegality(S, false);
  if (S && S.getOpcode() == Instruction::ExtractElement &&
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n");
    return ScalarsVectorizationLegality(S, false);
    return ScalarsVectorizationLegality(S, false,
    if (!S || !S.isAltShuffle() || VL.size() > 2)
    SmallVector<unsigned, 8> InstsCount;
    for (Value *V : VL) {
        return isa<Instruction>(Op) || isVectorLikeInstWithConstOps(Op);
    bool IsCommutative =
    if ((IsCommutative &&
         std::accumulate(InstsCount.begin(), InstsCount.end(), 0) < 2) ||
         all_of(InstsCount, [](unsigned ICnt) { return ICnt < 2; })))
    assert(VL.size() == 2 && "Expected only 2 alternate op instructions.");
    for (int Op : seq<int>(S.getMainOp()->getNumOperands()))
                              I2->getOperand(Op));
    if (static_cast<unsigned>(count_if(
            Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
            })) >= S.getMainOp()->getNumOperands() / 2)
    if (S.getMainOp()->getNumOperands() > 2)
    if (IsCommutative) {
      Candidates.clear();
      for (int Op = 0, E = S.getMainOp()->getNumOperands(); Op < E; ++Op)
                                I2->getOperand((Op + 1) % E));
          Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
  SmallVector<unsigned> SortedIndices;
  bool IsScatterVectorizeUserTE =
      UserTreeIdx.UserTE &&
      UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
  bool AreAllSameBlock = S.valid();
  bool AreScatterAllGEPSameBlock =
       sortPtrAccesses(VL, UserTreeIdx.UserTE->getMainOp()->getType(), *DL, *SE,
  bool AreAllSameInsts = AreAllSameBlock || AreScatterAllGEPSameBlock;
       NotProfitableForVectorization(VL)) {
      LLVM_DEBUG(dbgs() << "SLP: Try split and if failed, gathering due to "
                           "C,S,B,O, small shuffle. \n";
      return ScalarsVectorizationLegality(S, false,
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n";
    return ScalarsVectorizationLegality(S, false);
11476 if (S && !EphValues.empty()) {
11477 for (
Value *V : VL) {
11478 if (EphValues.count(V)) {
11480 <<
") is ephemeral.\n");
11482 return ScalarsVectorizationLegality(S,
false,
11494 if (S && S.isAltShuffle()) {
11495 auto GetNumVectorizedExtracted = [&]() {
11501 all_of(
I->operands(), [&](
const Use &U) {
11502 return isa<ExtractElementInst>(U.get());
11507 else if (!
I->hasOneUser() && !areAllUsersVectorized(
I, UserIgnoreList))
11510 return std::make_pair(Vectorized, Extracted);
11512 auto [Vectorized, Extracted] = GetNumVectorizedExtracted();
11514 bool PreferScalarize = !Vectorized.
isAllOnes() && VL.size() == 2;
11515 if (!Vectorized.
isAllOnes() && !PreferScalarize) {
11518 Type *ScalarTy = VL.front()->getType();
11523 false,
true, Kind);
11525 *TTI, ScalarTy, VecTy, Vectorized,
11526 true,
false, Kind,
false);
11527 PreferScalarize = VectorizeCostEstimate > ScalarizeCostEstimate;
11529 if (PreferScalarize) {
11530 LLVM_DEBUG(
dbgs() <<
"SLP: The instructions are in tree and alternate "
11531 "node is not profitable.\n");
11532 return ScalarsVectorizationLegality(S,
false);
11537 if (UserIgnoreList && !UserIgnoreList->empty()) {
11538 for (
Value *V : VL) {
11539 if (UserIgnoreList->contains(V)) {
11540 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering due to gathered scalar.\n");
11541 return ScalarsVectorizationLegality(S,
false);
11548 if (!AreAllSameBlock && AreScatterAllGEPSameBlock) {
11549 assert(VL.front()->getType()->isPointerTy() &&
11551 "Expected pointers only.");
11554 assert(It != VL.end() &&
"Expected at least one GEP.");
11565 !DT->isReachableFromEntry(BB))) {
11571 return ScalarsVectorizationLegality(S,
false);
11573 return ScalarsVectorizationLegality(S,
true);
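// Worked example (illustrative only) for NotProfitableForVectorization above:
// for a 2-lane alternate bundle such as
//   VL = { a + b, a - b }
// both lanes share the operand pair {a, b}, enough operand pairs match, and
// the bundle stays legal to vectorize. For
//   VL = { a + b, c - d }
// no operand pair matches, the two lanes are effectively unrelated scalars,
// and the bundle is gathered instead.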
void BoUpSLP::buildTreeRec(ArrayRef<Value *> VL, unsigned Depth,
                           const EdgeInfo &UserTreeIdx,
                           unsigned InterleaveFactor) {
  // ...
  SmallVector<int> ReuseShuffleIndices;
  // ...
  // Tries to build a split node: the bundle is divided into two halves that
  // are vectorized separately and combined with a blend.
  auto TrySplitNode = [&](const InstructionsState &LocalState) {
    SmallVector<Value *> Op1, Op2;
    OrdersType ReorderIndices;
    if (!canBuildSplitNode(VL, LocalState, Op1, Op2, ReorderIndices))
      return false;
    // ...
    auto Invalid = ScheduleBundle::invalid();
    auto *TE = newTreeEntry(VL, TreeEntry::SplitVectorize, Invalid, LocalState,
                            UserTreeIdx, {}, ReorderIndices);
    // ...
    auto AddNode = [&](ArrayRef<Value *> Op, unsigned Idx) {
      InstructionsState S = getSameOpcode(Op, *TLI);
      if (S && (/* ... */
                getSameValuesTreeEntry(S.getMainOp(), Op, /*SameVF=*/true))) {
        // The half duplicates an existing node - add it as a gather child.
        TE->CombinedEntriesWithIndices.emplace_back(VectorizableTree.size(),
                                                    Idx == 0 ? 0 : Op1.size());
        (void)newTreeEntry(Op, TreeEntry::NeedToGather, Invalid, S, {TE, Idx});
      } else {
        TE->CombinedEntriesWithIndices.emplace_back(VectorizableTree.size(),
                                                    Idx == 0 ? 0 : Op1.size());
        buildTreeRec(Op, Depth, {TE, Idx});
      }
    };
    AddNode(Op1, 0);
    AddNode(Op2, 1);
    return true;
  };

  auto AreOnlyConstsWithPHIs = [](ArrayRef<Value *> VL) {
    bool AreConsts = false;
    for (Value *V : VL) {
      // ...
    }
    // ...
  };
  if (AreOnlyConstsWithPHIs(VL)) {
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to all constants and PHIs.\n");
    newGatherTreeEntry(VL, InstructionsState::invalid(), UserTreeIdx);
    return;
  }

  ScalarsVectorizationLegality Legality = getScalarsVectorizationLegality(
      VL, Depth, UserTreeIdx, /*TryCopyableElementsVectorization=*/false);
  InstructionsState S = Legality.getInstructionsState();
  if (!Legality.isLegal()) {
    if (Legality.trySplitVectorize()) {
      // ...
      if (MainOp && AltOp && TrySplitNode(InstructionsState(MainOp, AltOp)))
        return;
    }
    // Retry, this time allowing copyable elements.
    Legality = getScalarsVectorizationLegality(
        VL, Depth, UserTreeIdx, /*TryCopyableElementsVectorization=*/true);
    if (!Legality.isLegal()) {
      if (Legality.tryToFindDuplicates()) {
        // ...
      }
      newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
      return;
    }
    S = Legality.getInstructionsState();
  }

  // Prefer a split node over an alternate-opcode node where possible.
  if (S.isAltShuffle() && TrySplitNode(S))
    return;
  // ...
  if (/* ... */) {
    newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
    return;
  }

  bool IsScatterVectorizeUserTE =
      UserTreeIdx.UserTE &&
      UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
  OrdersType CurrentOrder;
  SmallVector<Value *> PointerOps;
  StridedPtrInfo SPtrInfo;
  TreeEntry::EntryState State = getScalarsVectorizationState(
      S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps, SPtrInfo);
  if (State == TreeEntry::NeedToGather) {
    newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
    return;
  }

  Instruction *VL0 = S.getMainOp();
  BasicBlock *BB = VL0->getParent();
  auto &BSRef = BlocksSchedules[BB];
  if (!BSRef)
    BSRef = std::make_unique<BlockScheduling>(BB);
  BlockScheduling &BS = *BSRef;
  // ...
  std::optional<ScheduleBundle *> BundlePtr =
      BS.tryScheduleBundle(UniqueValues.getArrayRef(), this, S, UserTreeIdx);
#ifdef EXPENSIVE_CHECKS
  // ...
#endif
  if (!BundlePtr || (*BundlePtr && !*BundlePtr.value())) {
    LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
    // Last chance to try to vectorize an alternate node as a split node.
    if (S.isAltShuffle() && ReuseShuffleIndices.empty() && TrySplitNode(S))
      return;
    newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
    NonScheduledFirst.insert(VL.front());
    if (S.getOpcode() == Instruction::Load &&
        BS.ScheduleRegionSize < BS.ScheduleRegionSizeLimit) {
      // ...
    }
    return;
  }
  InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
  SmallVector<ValueList> Operands = Analysis.buildOperands(S, VL);
  ScheduleBundle Empty;
  ScheduleBundle &Bundle = BundlePtr.value() ? *BundlePtr.value() : Empty;
  LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");

  unsigned ShuffleOrOp =
      S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
  auto CreateOperandNodes = [&](TreeEntry *TE, const auto &Operands) {
    // Postpone PHI operand nodes so cyclic dependencies are built last.
    SmallVector<unsigned> PHIOps;
    for (unsigned I : seq<unsigned>(Operands.size())) {
      ArrayRef<Value *> Op = Operands[I];
      if (Op.empty())
        continue;
      InstructionsState S = getSameOpcode(Op, *TLI);
      if ((!S || S.getOpcode() != Instruction::PHI) || S.isAltShuffle())
        buildTreeRec(Op, Depth + 1, {TE, I});
      else
        PHIOps.push_back(I);
    }
    for (unsigned I : PHIOps)
      buildTreeRec(Operands[I], Depth + 1, {TE, I});
  };
  switch (ShuffleOrOp) {
  case Instruction::PHI: {
    TreeEntry *TE =
        newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (PHINode).\n";
               TE->dump());
    // ...
    TE->setOperands(Operands);
    CreateOperandNodes(TE, Operands);
    return;
  }
  case Instruction::ExtractValue:
  case Instruction::ExtractElement: {
    if (CurrentOrder.empty()) {
      LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
    } else {
      LLVM_DEBUG({
        dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
                  "with order";
        for (unsigned Idx : CurrentOrder)
          dbgs() << " " << Idx;
        dbgs() << "\n";
      });
    }
    // ...
    TreeEntry *TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,
                                 ReuseShuffleIndices, CurrentOrder);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry "
                         "(ExtractValueInst/ExtractElementInst).\n";
               TE->dump());
    TE->setOperands(Operands);
    return;
  }
  case Instruction::InsertElement: {
    assert(ReuseShuffleIndices.empty() && "All inserts should be unique");
    auto OrdCompare = [](const std::pair<int, int> &P1,
                         const std::pair<int, int> &P2) {
      return P1.first > P2.first;
    };
    std::priority_queue<std::pair<int, int>,
                        SmallVector<std::pair<int, int>>, decltype(OrdCompare)>
        Indices(OrdCompare);
    for (int I = 0, E = VL.size(); I < E; ++I) {
      unsigned Idx = *getElementIndex(VL[I]);
      Indices.emplace(Idx, I);
    }
    OrdersType CurrentOrder(VL.size(), VL.size());
    bool IsIdentity = true;
    for (int I = 0, E = VL.size(); I < E; ++I) {
      CurrentOrder[Indices.top().second] = I;
      IsIdentity &= Indices.top().second == I;
      Indices.pop();
    }
    if (IsIdentity)
      CurrentOrder.clear();
    TreeEntry *TE = newTreeEntry(VL, Bundle, S, UserTreeIdx, {}, CurrentOrder);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (InsertElementInst).\n";
               TE->dump());
    TE->setOperands(Operands);
    buildTreeRec(TE->getOperand(1), Depth + 1, {TE, 1});
    return;
  }
  case Instruction::Load: {
    // ...
    TreeEntry *TE = nullptr;
    // ...
    switch (State) {
    case TreeEntry::Vectorize:
      TE = newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices,
                        CurrentOrder, InterleaveFactor);
      if (CurrentOrder.empty())
        LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (LoadInst).\n";
                   TE->dump());
      else
        LLVM_DEBUG(dbgs()
                       << "SLP: added a new TreeEntry (jumbled LoadInst).\n";
                   TE->dump());
      break;
    case TreeEntry::CompressVectorize:
      TE = newTreeEntry(VL, TreeEntry::CompressVectorize, Bundle, S,
                        UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
      LLVM_DEBUG(
          dbgs() << "SLP: added a new TreeEntry (masked LoadInst + compress).\n";
          TE->dump());
      break;
    case TreeEntry::StridedVectorize:
      TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
                        UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
      TreeEntryToStridedPtrInfoMap[TE] = SPtrInfo;
      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (strided LoadInst).\n";
                 TE->dump());
      break;
    case TreeEntry::ScatterVectorize:
      TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
                        UserTreeIdx, ReuseShuffleIndices);
      LLVM_DEBUG(
          dbgs() << "SLP: added a new TreeEntry (non-consecutive LoadInst).\n";
          TE->dump());
      break;
    case TreeEntry::CombinedVectorize:
    case TreeEntry::SplitVectorize:
    case TreeEntry::NeedToGather:
      llvm_unreachable("Unexpected loads state.");
    }
    if (!CurrentOrder.empty() && State != TreeEntry::ScatterVectorize) {
      assert(Operands.size() == 1 && "Expected a single operand only");
      SmallVector<int> Mask;
      inversePermutation(CurrentOrder, Mask);
      reorderScalars(Operands.front(), Mask);
    }
    TE->setOperands(Operands);
    if (State == TreeEntry::ScatterVectorize)
      buildTreeRec(PointerOps, Depth + 1, {TE, 0});
    return;
  }
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    auto [PrevMaxBW, PrevMinBW] = CastMaxMinBWSizes.value_or(
        std::make_pair(std::numeric_limits<unsigned>::min(),
                       std::numeric_limits<unsigned>::max()));
    if (ShuffleOrOp == Instruction::ZExt ||
        ShuffleOrOp == Instruction::SExt) {
      CastMaxMinBWSizes = std::make_pair(
          std::max<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
                             PrevMaxBW),
          std::min<unsigned>(
              DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
              PrevMinBW));
    } else if (ShuffleOrOp == Instruction::Trunc) {
      CastMaxMinBWSizes = std::make_pair(
          std::max<unsigned>(
              DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
              PrevMaxBW),
          std::min<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
                             PrevMinBW));
    }
    TreeEntry *TE =
        newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CastInst).\n";
               TE->dump());
    TE->setOperands(Operands);
    for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
      buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
    if (ShuffleOrOp == Instruction::Trunc) {
      ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
    } else if (ShuffleOrOp == Instruction::SIToFP ||
               ShuffleOrOp == Instruction::UIToFP) {
      unsigned NumSignBits =
          ComputeNumSignBits(VL0->getOperand(0), *DL, AC, nullptr, DT);
      if (auto *OpI = dyn_cast<Instruction>(VL0->getOperand(0))) {
        APInt Mask = DB->getDemandedBits(OpI);
        NumSignBits = std::max(NumSignBits, Mask.countl_zero());
      }
      if (NumSignBits * 2 >=
          DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
        ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
    }
    return;
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    // Check that all of the compares have the same predicate.
    CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
    TreeEntry *TE =
        newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CmpInst).\n";
               TE->dump());
    // ...
    assert(/* ... */ "Commutative Predicate mismatch");
    // ...
    Operands.back() = Ops.getVL(1);
    // Swap the operands of lanes whose predicate is swapped w.r.t. P0.
    for (/* each lane */) {
      // ...
      if (Cmp->getPredicate() != P0) {
        // ...
      }
    }
    TE->setOperands(Operands);
    buildTreeRec(Operands.front(), Depth + 1, {TE, 0});
    buildTreeRec(Operands.back(), Depth + 1, {TE, 1});
    if (ShuffleOrOp == Instruction::ICmp) {
      unsigned NumSignBits0 =
          ComputeNumSignBits(VL0->getOperand(0), *DL, AC, nullptr, DT);
      if (NumSignBits0 * 2 >=
          DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
        ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
      unsigned NumSignBits1 =
          ComputeNumSignBits(VL0->getOperand(1), *DL, AC, nullptr, DT);
      if (NumSignBits1 * 2 >=
          DL->getTypeSizeInBits(VL0->getOperand(1)->getType()))
        ExtraBitWidthNodes.insert(getOperandEntry(TE, 1)->Idx);
    }
    return;
  }
  case Instruction::Select:
  case Instruction::FNeg:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::Freeze: {
    TreeEntry *TE =
        newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry "
                         "(SelectInst/UnaryOperator/BinaryOperator/FreezeInst).\n";
               TE->dump());
    if (/* commutative */) {
      // ...
      Operands[0] = Ops.getVL(0);
      Operands[1] = Ops.getVL(1);
    }
    TE->setOperands(Operands);
    for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
      buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
    return;
  }
  case Instruction::GetElementPtr: {
    TreeEntry *TE =
        newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (GetElementPtrInst).\n";
               TE->dump());
    TE->setOperands(Operands);
    for (unsigned I = 0, E = Operands.size(); I < E; ++I)
      buildTreeRec(Operands[I], Depth + 1, {TE, I});
    return;
  }
  case Instruction::Store: {
    bool Consecutive = CurrentOrder.empty();
    // ...
    TreeEntry *TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,
                                 ReuseShuffleIndices, CurrentOrder);
    if (Consecutive)
      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (StoreInst).\n";
                 TE->dump());
    else
      LLVM_DEBUG(dbgs()
                     << "SLP: added a new TreeEntry (jumbled StoreInst).\n";
                 TE->dump());
    TE->setOperands(Operands);
    buildTreeRec(TE->getOperand(0), Depth + 1, {TE, 0});
    return;
  }
  case Instruction::Call: {
    // ...
    TreeEntry *TE =
        newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CallInst).\n";
               TE->dump());
    if (/* commutative intrinsic */) {
      // ...
      Operands[0] = Ops.getVL(0);
      Operands[1] = Ops.getVL(1);
    }
    TE->setOperands(Operands);
    for (unsigned I : seq<unsigned>(Operands.size()))
      buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
    return;
  }
  case Instruction::ShuffleVector: {
    TreeEntry *TE =
        newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
    if (S.isAltShuffle()) {
      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (isAltShuffle).\n";
                 TE->dump());
    } else {
      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (ShuffleVectorInst).\n";
                 TE->dump());
    }
    // Reorder operands if reordering makes compare lanes consistent with
    // their main/alternate predicates.
    if (isa<CmpInst>(VL0)) {
      // ...
      assert(/* ... */ "Expected different main/alternate predicates.");
      // ...
      TE->setOperands(Operands);
      buildTreeRec(Operands.front(), Depth + 1, {TE, 0});
      buildTreeRec(Operands.back(), Depth + 1, {TE, 1});
      return;
    }
    if (/* commutative */) {
      // ...
      Operands[0] = Ops.getVL(0);
      Operands[1] = Ops.getVL(1);
    }
    TE->setOperands(Operands);
    for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
      buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
    return;
  }
  default:
    break;
  }
  // ...
}
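// Illustrative sketch (not from the original source) of what buildTreeRec
// produces for a simple consecutive pattern, assuming 4 lanes:
//   a[i] = x[i] + y[i], i = 0..3
// becomes a Store node whose single operand is an Add node, whose two
// operands are Load nodes for x[0..3] and y[0..3]; each case above creates
// one TreeEntry and recurses into its operands via buildTreeRec(..., {TE, I}).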
unsigned BoUpSLP::canMapToVector(Type *T) const {
  // Walk homogeneous aggregates down to their scalar element type,
  // accumulating the total number of scalar elements in N.
  unsigned N = 1;
  Type *EltTy = T;
  // ...
  if (auto *ST = dyn_cast<StructType>(EltTy)) {
    // Check that struct is homogeneous.
    for (const auto *Ty : ST->elements())
      if (Ty != *ST->element_begin())
        return 0;
    N *= ST->getNumElements();
    EltTy = *ST->element_begin();
  } else if (auto *AT = dyn_cast<ArrayType>(EltTy)) {
    N *= AT->getNumElements();
    EltTy = AT->getElementType();
  } else if (auto *VT = dyn_cast<FixedVectorType>(EltTy)) {
    N *= VT->getNumElements();
    EltTy = VT->getElementType();
  }
  // ...
  size_t VTSize = DL->getTypeStoreSizeInBits(getWidenedType(EltTy, N));
  if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
      VTSize != DL->getTypeStoreSizeInBits(T))
    return 0;
  return N;
}
bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL,
                              SmallVectorImpl<unsigned> &CurrentOrder,
                              bool ResizeAllowed) const {
  const auto *It = find_if(VL, IsaPred<ExtractElementInst, ExtractValueInst>);
  assert(It != VL.end() && "Expected at least one extract instruction.");
  auto *E0 = cast<Instruction>(*It);
  // Check if all of the extracts come from the same vector and from the
  // same position.
  Value *Vec = E0->getOperand(0);
  CurrentOrder.clear();
  // We have to extract from a vector/aggregate with the same number of
  // elements.
  unsigned NElts;
  if (E0->getOpcode() == Instruction::ExtractValue) {
    // ...
  } else {
    NElts = cast<FixedVectorType>(Vec->getType())->getNumElements();
  }
  unsigned E = VL.size();
  if (!ResizeAllowed && NElts != E)
    return false;
  SmallVector<int> Indices(E, PoisonMaskElem);
  unsigned MinIdx = NElts, MaxIdx = 0;
  for (auto [I, V] : enumerate(VL)) {
    auto *Inst = dyn_cast<Instruction>(V);
    if (!Inst)
      continue;
    if (Inst->getOperand(0) != Vec)
      return false;
    // ...
    std::optional<unsigned> Idx = getExtractIndex(Inst);
    if (!Idx)
      return false;
    const unsigned ExtIdx = *Idx;
    if (ExtIdx >= NElts)
      continue;
    Indices[I] = ExtIdx;
    if (MinIdx > ExtIdx)
      MinIdx = ExtIdx;
    if (MaxIdx < ExtIdx)
      MaxIdx = ExtIdx;
  }
  if (MaxIdx - MinIdx + 1 > E)
    return false;
  if (MaxIdx + 1 <= E)
    MinIdx = 0;
  // Check that all of the indices extract from the correct offset.
  bool ShouldKeepOrder = true;
  // Assign to all items the initial value E so we can detect duplicate
  // extract indices.
  CurrentOrder.assign(E, E);
  for (unsigned I = 0; I < E; ++I) {
    if (Indices[I] == PoisonMaskElem)
      continue;
    const unsigned ExtIdx = Indices[I] - MinIdx;
    if (CurrentOrder[ExtIdx] != E) {
      CurrentOrder.clear();
      return false;
    }
    ShouldKeepOrder &= ExtIdx == I;
    CurrentOrder[ExtIdx] = I;
  }
  if (ShouldKeepOrder)
    CurrentOrder.clear();
  return ShouldKeepOrder;
}
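// Worked example (illustrative only): for extracts from one 4-element vector
//   VL = { v[1], v[0], v[3], v[2] }
// the loop records Indices = {1, 0, 3, 2}; since this is a duplicate-free
// permutation of 0..3, the extracts are reusable, CurrentOrder is set to
// {1, 0, 3, 2}, and the function returns false (the order must be applied).
// For the identity order { v[0], v[1], v[2], v[3] } it returns true and
// clears CurrentOrder.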
bool BoUpSLP::areAllUsersVectorized(
    Instruction *I, const SmallDenseSet<Value *> *VectorizedVals) const {
  return (I->hasOneUse() &&
          (!VectorizedVals || VectorizedVals->contains(I))) ||
         all_of(I->users(), [this](User *U) {
           return isVectorized(U) || isVectorLikeInstWithConstOps(U) ||
                  (isa<ExtractElementInst>(U) && MustGather.contains(U));
         });
}
void BoUpSLP::TreeEntry::buildAltOpShuffleMask(
    const function_ref<bool(Instruction *)> IsAltOp, SmallVectorImpl<int> &Mask,
    SmallVectorImpl<Value *> *OpScalars,
    SmallVectorImpl<Value *> *AltScalars) const {
  unsigned Sz = Scalars.size();
  Mask.assign(Sz, PoisonMaskElem);
  SmallVector<int> OrderMask;
  if (!ReorderIndices.empty())
    inversePermutation(ReorderIndices, OrderMask);
  for (unsigned I = 0; I < Sz; ++I) {
    unsigned Idx = I;
    if (!ReorderIndices.empty())
      Idx = OrderMask[I];
    // ...
    auto *OpInst = cast<Instruction>(Scalars[Idx]);
    if (IsAltOp(OpInst)) {
      Mask[I] = Sz + Idx;
      if (AltScalars)
        AltScalars->push_back(OpInst);
    } else {
      Mask[I] = Idx;
      if (OpScalars)
        OpScalars->push_back(OpInst);
    }
  }
  if (!ReuseShuffleIndices.empty()) {
    SmallVector<int> NewMask(ReuseShuffleIndices.size(), PoisonMaskElem);
    transform(ReuseShuffleIndices, NewMask.begin(), [&Mask](int Idx) {
      return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem;
    });
    Mask.swap(NewMask);
  }
}
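// Worked example (illustrative only): for an alternate add/sub node with
//   Scalars = { a0+b0, a1-b1, a2+b2, a3-b3 }  (Sz = 4)
// and IsAltOp selecting the subs, the resulting blend mask is
//   Mask = { 0, 4+1, 2, 4+3 } = { 0, 5, 2, 7 }
// i.e. even lanes come from the vectorized add, odd lanes from the
// vectorized sub; ReorderIndices/ReuseShuffleIndices, when present, permute
// this mask accordingly.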
// ...
  return InstructionsState(MainOp, AltOp).getMatchingMainOpOrAltOp(I) == MainOp;
}

// ... (CmpInst main/alternate predicate matching)
  assert(MainP != AltP && "Expected different main/alternate predicates.");
  // ...
  assert((MainP == P || AltP == P || MainP == SwappedP || AltP == SwappedP) &&
         "CmpInst expected to match either main or alternate predicate or "
         "their swap.");
  // ...
  return MainP != P && MainP != SwappedP;
}

// ... (an instruction belongs to the alternate opcode if it matches AltOp)
  return InstructionsState(MainOp, AltOp).getMatchingMainOpOrAltOp(I) == AltOp;
}
static TTI::OperandValueInfo getOperandInfo(ArrayRef<Value *> Ops) {
  const auto *Op0 = Ops.front();
  const bool IsConstant = all_of(Ops, [](Value *V) {
    // ...
  });
  const bool IsUniform = all_of(Ops, [=](Value *V) { return V == Op0; });
  const bool IsPowerOfTwo = all_of(Ops, [](Value *V) {
    // ...
    return CI->getValue().isPowerOf2();
  });
  const bool IsNegatedPowerOfTwo = all_of(Ops, [](Value *V) {
    // ...
    return CI->getValue().isNegatedPowerOf2();
  });
  // ...
  TTI::OperandValueKind VK = TTI::OK_AnyValue;
  if (IsConstant && IsUniform)
    VK = TTI::OK_UniformConstantValue;
  else if (IsConstant)
    VK = TTI::OK_NonUniformConstantValue;
  else if (IsUniform)
    VK = TTI::OK_UniformValue;
  // ...
}
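// Illustrative examples (not from the original source) of the classification
// above, assuming integer operand bundles:
//   { 4, 4, 4, 4 }  -> constant + uniform      -> OK_UniformConstantValue
//   { 1, 2, 4, 8 }  -> non-uniform constants, each a power of two
//   { x, x, x, x }  -> uniform non-constant    -> OK_UniformValue
// TTI uses these kinds/properties to price, e.g., a division by a uniform
// power-of-two constant far cheaper than one by arbitrary divisors.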
class BaseShuffleAnalysis {
protected:
  Type *ScalarTy = nullptr;

  BaseShuffleAnalysis(Type *ScalarTy) : ScalarTy(ScalarTy) {}

  /// V is expected to be a vectorized value. Returns its vectorization factor
  /// in terms of ScalarTy elements.
  unsigned getVF(Value *V) const {
    assert(V && "V cannot be nullptr");
    assert(isa<FixedVectorType>(V->getType()) &&
           "V does not have FixedVectorType");
    assert(ScalarTy && "ScalarTy cannot be nullptr");
    unsigned ScalarTyNumElements = getNumElements(ScalarTy);
    unsigned VNumElements =
        cast<FixedVectorType>(V->getType())->getNumElements();
    assert(VNumElements > ScalarTyNumElements &&
           "the number of elements of V is not large enough");
    assert(VNumElements % ScalarTyNumElements == 0 &&
           "the number of elements of V is not a vectorized value");
    return VNumElements / ScalarTyNumElements;
  }

  /// Checks if the mask is an identity mask.
  static bool isIdentityMask(ArrayRef<int> Mask, const FixedVectorType *VecTy,
                             bool IsStrict) {
    int Limit = Mask.size();
    int VF = VecTy->getNumElements();
    // ...
    // All VF-sized submasks are identity (e.g.
    // <poison,poison,0,1,2,3,poison,poison> for VF 4).
    if (Limit % VF == 0 && all_of(seq<int>(0, Limit / VF), [=](int Idx) {
          ArrayRef<int> Slice = Mask.slice(Idx * VF, VF);
          // ...
        }))
      return true;
    // ...
  }

  /// Combines two shuffle masks into one applied to the source vector of
  /// LocalVF elements.
  static void combineMasks(unsigned LocalVF, SmallVectorImpl<int> &Mask,
                           ArrayRef<int> ExtMask) {
    unsigned VF = Mask.size();
    SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
    for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
      if (ExtMask[I] == PoisonMaskElem)
        continue;
      int MaskedIdx = Mask[ExtMask[I] % VF];
      NewMask[I] = MaskedIdx == PoisonMaskElem ? PoisonMaskElem
                                               : MaskedIdx % LocalVF;
    }
    Mask.swap(NewMask);
  }

  /// Looks through shuffles trying to reduce the final mask to an identity
  /// or splat; returns true if no extra shuffle is required.
  static bool peekThroughShuffles(Value *&V, SmallVectorImpl<int> &Mask,
                                  bool SinglePermute) {
    Value *Op = V;
    ShuffleVectorInst *IdentityOp = nullptr;
    SmallVector<int> IdentityMask;
    while (auto *SV = dyn_cast<ShuffleVectorInst>(Op)) {
      // ...
      // Remember identity candidates.
      if (isIdentityMask(Mask, SVTy, /*IsStrict=*/false)) {
        if (!IdentityOp || !SinglePermute ||
            (isIdentityMask(Mask, SVTy, /*IsStrict=*/true) &&
             /* ... */ IdentityMask.size())) {
          IdentityOp = SV;
          // ...
          IdentityMask.assign(Mask);
        }
      }
      // Remember zero-element splats.
      if (SV->isZeroEltSplat()) {
        IdentityOp = SV;
        IdentityMask.assign(Mask);
      }
      int LocalVF = Mask.size();
      if (auto *SVOpTy =
              dyn_cast<FixedVectorType>(SV->getOperand(0)->getType()))
        LocalVF = SVOpTy->getNumElements();
      // Translate the current mask through this shuffle's mask.
      SmallVector<int> ExtMask(Mask.size(), PoisonMaskElem);
      for (auto [Idx, I] : enumerate(Mask)) {
        if (I == PoisonMaskElem ||
            static_cast<unsigned>(I) >= SV->getShuffleMask().size())
          continue;
        ExtMask[Idx] = SV->getMaskValue(I);
      }
      bool IsOp1Undef = /* ... */;
      bool IsOp2Undef = /* ... */;
      if (!IsOp1Undef && !IsOp2Undef) {
        // The shuffle uses both operands: only propagate poisoned lanes.
        for (int &I : Mask) {
          if (I == PoisonMaskElem)
            continue;
          if (SV->getMaskValue(I % SV->getShuffleMask().size()) ==
              PoisonMaskElem)
            I = PoisonMaskElem;
        }
        break;
      }
      SmallVector<int> ShuffleMask(SV->getShuffleMask());
      combineMasks(LocalVF, ShuffleMask, Mask);
      Mask.swap(ShuffleMask);
      if (IsOp2Undef)
        Op = SV->getOperand(0);
      else
        Op = SV->getOperand(1);
    }
    auto *OpTy = dyn_cast<FixedVectorType>(Op->getType());
    if (!OpTy || !isIdentityMask(Mask, OpTy, SinglePermute) ||
        /* ... */) {
      if (IdentityOp) {
        V = IdentityOp;
        assert(Mask.size() == IdentityMask.size() &&
               "Expected masks of same sizes.");
        Mask.swap(IdentityMask);
        auto *Shuffle = dyn_cast<ShuffleVectorInst>(V);
        return SinglePermute &&
               (isIdentityMask(Mask, cast<FixedVectorType>(V->getType()),
                               /*IsStrict=*/false) ||
                (Shuffle && Mask.size() == Shuffle->getShuffleMask().size() &&
                 Shuffle->isZeroEltSplat() &&
                 all_of(enumerate(Mask), [&](const auto &P) {
                   return P.value() == PoisonMaskElem ||
                          Shuffle->getShuffleMask()[P.index()] == 0;
                 })));
      }
      V = Op;
      return false;
    }
    V = Op;
    return true;
  }

  /// Smart shuffle instruction emission, walks through shuffle trees and
  /// tries to find the best matching vector for the actual shuffle.
  template <typename T, typename ShuffleBuilderTy>
  static T createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask,
                         ShuffleBuilderTy &Builder, Type *ScalarTy) {
    assert(V1 && "Expected at least one vector value.");
    unsigned ScalarTyNumElements = getNumElements(ScalarTy);
    SmallVector<int> NewMask(Mask);
    if (ScalarTyNumElements != 1) {
      // ...
    }
    if (V2) {
      Builder.resizeToMatch(V1, V2);
      int VF = Mask.size();
      if (auto *FTy = dyn_cast<FixedVectorType>(V1->getType()))
        VF = FTy->getNumElements();
      // Split the mask per input and peek through shuffles of both operands.
      SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
      SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
      for (int I = 0, E = Mask.size(); I < E; ++I) {
        if (Mask[I] < VF)
          CombinedMask1[I] = Mask[I];
        else
          CombinedMask2[I] = Mask[I] - VF;
      }
      Value *Op1 = V1;
      Value *Op2 = V2;
      (void)peekThroughShuffles(Op1, CombinedMask1, /*SinglePermute=*/false);
      (void)peekThroughShuffles(Op2, CombinedMask2, /*SinglePermute=*/false);
      Value *PrevOp1;
      Value *PrevOp2;
      do {
        PrevOp1 = Op1;
        PrevOp2 = Op2;
        auto *SV1 = dyn_cast<ShuffleVectorInst>(Op1);
        auto *SV2 = dyn_cast<ShuffleVectorInst>(Op2);
        if (!SV1 || !SV2)
          break;
        SmallVector<int> ExtMask1(Mask.size(), PoisonMaskElem);
        for (auto [Idx, I] : enumerate(CombinedMask1)) {
          if (I == PoisonMaskElem)
            continue;
          ExtMask1[Idx] = SV1->getMaskValue(I);
        }
        SmallBitVector UseMask1 = buildUseMask(
            cast<FixedVectorType>(SV1->getOperand(1)->getType())
                ->getNumElements(),
            ExtMask1, UseMask::SecondArg);
        SmallVector<int> ExtMask2(CombinedMask2.size(), PoisonMaskElem);
        for (auto [Idx, I] : enumerate(CombinedMask2)) {
          if (I == PoisonMaskElem)
            continue;
          ExtMask2[Idx] = SV2->getMaskValue(I);
        }
        SmallBitVector UseMask2 = buildUseMask(
            cast<FixedVectorType>(SV2->getOperand(1)->getType())
                ->getNumElements(),
            ExtMask2, UseMask::SecondArg);
        if (SV1->getOperand(0)->getType() == SV2->getOperand(0)->getType() &&
            SV1->getOperand(0)->getType() != SV1->getType() &&
            UseMask1.none() && UseMask2.none()) {
          Op1 = SV1->getOperand(0);
          Op2 = SV2->getOperand(0);
          SmallVector<int> ShuffleMask1(SV1->getShuffleMask());
          int LocalVF = ShuffleMask1.size();
          if (auto *FTy = dyn_cast<FixedVectorType>(Op1->getType()))
            LocalVF = FTy->getNumElements();
          combineMasks(LocalVF, ShuffleMask1, CombinedMask1);
          CombinedMask1.swap(ShuffleMask1);
          SmallVector<int> ShuffleMask2(SV2->getShuffleMask());
          LocalVF = ShuffleMask2.size();
          if (auto *FTy = dyn_cast<FixedVectorType>(Op2->getType()))
            LocalVF = FTy->getNumElements();
          combineMasks(LocalVF, ShuffleMask2, CombinedMask2);
          CombinedMask2.swap(ShuffleMask2);
        }
      } while (PrevOp1 != Op1 || PrevOp2 != Op2);
      Builder.resizeToMatch(Op1, Op2);
      unsigned CombinedVF = std::max(cast<VectorType>(Op1->getType())
                                         ->getElementCount()
                                         .getKnownMinValue(),
                                     cast<VectorType>(Op2->getType())
                                         ->getElementCount()
                                         .getKnownMinValue());
      for (int I = 0, E = Mask.size(); I < E; ++I) {
        if (CombinedMask2[I] != PoisonMaskElem) {
          assert(CombinedMask1[I] == PoisonMaskElem &&
                 "Expected undefined mask element");
          CombinedMask1[I] =
              CombinedMask2[I] + (Op1 == Op2 ? 0 : CombinedVF);
        }
      }
      if (Op1 == Op2 && /* mask is identity */)
        return Builder.createIdentity(Op1);
      return Builder.createShuffleVector(
          Op1, Op1 == Op2 ? PoisonValue::get(Op1->getType()) : Op2,
          CombinedMask1);
    }
    if (isa<PoisonValue>(V1))
      return Builder.createPoison(
          cast<VectorType>(V1->getType())->getElementType(), Mask.size());
    bool IsIdentity = peekThroughShuffles(V1, NewMask, /*SinglePermute=*/true);
    assert(V1 && "Expected non-null value after looking through shuffles.");
    if (!IsIdentity)
      return Builder.createShuffleVector(V1, NewMask);
    return Builder.createIdentity(V1);
  }

  /// Transforms mask \p CommonMask per given \p Mask to make proper set after
  /// shuffle emission.
  static void transformMaskAfterShuffle(SmallVectorImpl<int> &CommonMask,
                                        ArrayRef<int> Mask) {
    for (unsigned I = 0, Sz = CommonMask.size(); I < Sz; ++I)
      if (Mask[I] != PoisonMaskElem)
        CommonMask[I] = I;
  }
};
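// Illustrative example (not from the original source) of the folding that
// peekThroughShuffles/combineMasks perform: given
//   %s1 = shufflevector %v, poison, <1, 0, 3, 2>
//   %s2 = shufflevector %s1, poison, <1, 0, 3, 2>
// combining the two masks yields the identity <0, 1, 2, 3>, so the chain
// collapses back to %v and no shuffle has to be costed or emitted.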
static std::pair<InstructionCost, InstructionCost>
getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
            Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
            Type *ScalarTy, VectorType *VecTy) {
  InstructionCost ScalarCost = 0;
  InstructionCost VecCost = 0;
  if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
    // Case: pointer arguments of loads/stores vectorized as plain wide
    // loads/stores.
    ScalarCost = TTI.getPointersChainCost(
        Ptrs, BasePtr, TTI::PointersChainInfo::getUnitStride(), ScalarTy,
        CostKind);
    SmallVector<const Value *> PtrsRetainedInVecCode;
    for (Value *V : Ptrs) {
      if (V == BasePtr) {
        PtrsRetainedInVecCode.push_back(V);
        continue;
      }
      // ...
    }
    if (PtrsRetainedInVecCode.size() == Ptrs.size()) {
      // If all pointers stay in vectorized code then there is no chain
      // anymore.
      return std::make_pair(TTI::TCC_Free, TTI::TCC_Free);
    }
    VecCost = TTI.getPointersChainCost(PtrsRetainedInVecCode, BasePtr,
                                       TTI::PointersChainInfo::getKnownStride(),
                                       VecTy, CostKind);
  } else {
    // Case: Ptrs form a regular vectorization tree node (pointer arguments of
    // scattered loads).
    TTI::PointersChainInfo PtrsInfo =
        all_of(Ptrs,
               [](const Value *V) {
                 auto *Ptr = dyn_cast<GetElementPtrInst>(V);
                 return Ptr && !Ptr->hasAllConstantIndices();
               })
            ? TTI::PointersChainInfo::getUnknownStride()
            : TTI::PointersChainInfo::getKnownStride();
    ScalarCost =
        TTI.getPointersChainCost(Ptrs, BasePtr, PtrsInfo, ScalarTy, CostKind);
    auto *BaseGEP = dyn_cast<GEPOperator>(BasePtr);
    if (!BaseGEP) {
      auto *It = find_if(Ptrs, IsaPred<GEPOperator>);
      if (It != Ptrs.end())
        BaseGEP = cast<GEPOperator>(*It);
    }
    if (BaseGEP) {
      SmallVector<const Value *> Indices(BaseGEP->indices());
      VecCost = TTI.getGEPCost(BaseGEP->getSourceElementType(),
                               BaseGEP->getPointerOperand(), Indices, VecTy,
                               CostKind);
    }
  }
  return std::make_pair(ScalarCost, VecCost);
}
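// Sketch of the intended comparison (illustrative only): for 4 loads off the
// same base pointer, the scalar side pays for a chain of 4 address GEPs,
// while the vectorized side pays either for the pointers that remain scalar
// (wide load/store case) or for a single vector GEP feeding a gather/scatter;
// callers subtract the pair (ScalarCost, VecCost) to bias the node cost.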
void BoUpSLP::reorderGatherNode(TreeEntry &TE) {
  assert(TE.isGather() && TE.ReorderIndices.empty() &&
         "Expected gather node without reordering.");
  DenseMap<std::pair<size_t, Value *>, SmallVector<LoadInst *>> LoadsMap;
  SmallSet<size_t, 2> LoadKeyUsed;
  // Do not reorder nodes that are too small or already handled elsewhere.
  if (TE.Scalars.size() == 2 || (TE.hasState() && !TE.isAltShuffle()) ||
      /* ... */ [&](unsigned Idx) {
        return VectorizableTree[Idx]->isSame(TE.Scalars);
      })
    return;
  // ...
  auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
    // ...
    auto LIt = LoadsMap.find(std::make_pair(Key, Ptr));
    if (LIt != LoadsMap.end()) {
      for (LoadInst *RLI : LIt->second) {
        if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
                            LI->getType(), LI->getPointerOperand(), *DL, *SE,
                            /*StrictCheck=*/true))
          return hash_value(RLI->getPointerOperand());
      }
      for (LoadInst *RLI : LIt->second) {
        if (arePointersCompatible(RLI->getPointerOperand(),
                                  LI->getPointerOperand(), *TLI)) {
          // ...
        }
      }
      if (LIt->second.size() > 2) {
        // ...
        hash_value(LIt->second.back()->getPointerOperand());
      }
    }
    // ...
    LoadsMap.try_emplace(std::make_pair(Key, Ptr)).first->second.push_back(LI);
    return hash_value(LI->getPointerOperand());
  };
  MapVector<size_t, MapVector<size_t, SmallVector<Value *>>> SortedValues;
  SmallDenseMap<Value *, SmallVector<unsigned>, 8> KeyToIndex;
  bool IsOrdered = true;
  unsigned NumInstructions = 0;
  // Try to "cluster" the scalars by key/subkey.
  for (auto [I, V] : enumerate(TE.Scalars)) {
    size_t Key = 1, Idx = 1;
    if (/* ... */) {
      // ... (generate Key/Idx via GenerateLoadsSubkey; ++NumInstructions)
    }
    auto &Container = SortedValues[Key];
    if (IsOrdered && !KeyToIndex.contains(V) &&
        /* ... */
        ((Container.contains(Idx) &&
          KeyToIndex.at(Container[Idx].back()).back() != I - 1) ||
         (!Container.empty() && !Container.contains(Idx) &&
          KeyToIndex.at(Container.back().second.back()).back() != I - 1)))
      IsOrdered = false;
    auto &KTI = KeyToIndex[V];
    if (KTI.empty())
      Container[Idx].push_back(V);
    KTI.push_back(I);
  }
  // ...
  if (!IsOrdered && NumInstructions > 1) {
    unsigned Cnt = 0;
    TE.ReorderIndices.resize(TE.Scalars.size(), TE.Scalars.size());
    for (const auto &D : SortedValues) {
      for (const auto &P : D.second) {
        unsigned Sz = 0;
        // ...
        for (Value *V : P.second) {
          ArrayRef<unsigned> Indices = KeyToIndex.at(V);
          for (auto [K, Idx] : enumerate(Indices)) {
            TE.ReorderIndices[Cnt + K] = Idx;
            TE.Scalars[Cnt + K] = V;
          }
          Sz += Indices.size();
          Cnt += Indices.size();
        }
        if (/* ... */ hasFullVectorsOrPowerOf2(
                *TTI, TE.Scalars.front()->getType(), Sz)) {
          // ...
        } else if (!P.second.empty() && isConstant(P.second.front())) {
          // ...
        }
      }
    }
  }
  // Keep the reorder only if it actually pays off against a buildvector.
  if (!TE.ReuseShuffleIndices.empty() || TE.ReorderIndices.empty())
    return;
  auto *ScalarTy = TE.Scalars.front()->getType();
  // ...
  for (auto [Idx, Sz] : SubVectors) {
    // ...
  }
  int Sz = TE.Scalars.size();
  SmallVector<int> ReorderMask(TE.ReorderIndices.begin(),
                               TE.ReorderIndices.end());
  // ...
  for (unsigned I : seq<unsigned>(Sz))
    if (isa<PoisonValue>(TE.Scalars[I]))
      ReorderMask[I] = I + TE.ReorderIndices.size();
  // ...
  InstructionCost Cost = ::getShuffleCost(
      *TTI,
      any_of(ReorderMask, [&](int I) { return I >= Sz; })
          ? TTI::SK_PermuteTwoSrc
          : TTI::SK_PermuteSingleSrc,
      VecTy, ReorderMask);
  APInt DemandedElts = APInt::getAllOnes(Sz);
  for (unsigned I : seq<unsigned>(Sz)) {
    if (/* ... */) {
      DemandedElts.clearBit(I);
      // ...
      ReorderMask[I] = I;
    } else {
      ReorderMask[I] = I + Sz;
    }
  }
  // ...
  if (!DemandedElts.isAllOnes())
    BVCost += ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy, ReorderMask);
  if (Cost >= BVCost) {
    SmallVector<int> Mask(TE.ReorderIndices.begin(), TE.ReorderIndices.end());
    reorderScalars(TE.Scalars, Mask);
    TE.ReorderIndices.clear();
  }
}
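// Worked example (illustrative only): a gather node with
//   Scalars = { x0, y0, x1, y1 }
// where x0/x1 and y0/y1 load from two different base pointers is regrouped
// by key into { x0, x1, y0, y1 } with
//   TE.ReorderIndices = { 0, 2, 1, 3 }
// so each half can become a short consecutive run; the reorder is kept only
// if the estimated shuffle cost does not exceed the build-vector cost
// computed above.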
static InstructionCost canConvertToFMA(ArrayRef<Value *> VL,
                                       const InstructionsState &S,
                                       /* ... */) {
  assert(all_of(VL,
                [](Value *V) {
                  return V->getType()->getScalarType()->isFloatingPointTy();
                }) &&
         "Can only convert to FMA for floating point types");
  assert(S.isAddSubLikeOp() && "Can only convert to FMA for add/sub");

  auto CheckForContractable = [&](ArrayRef<Value *> VL) {
    FastMathFlags FMF;
    FMF.set();
    for (Value *V : VL) {
      auto *I = dyn_cast<Instruction>(V);
      if (!I)
        continue;
      if (S.isCopyableElement(I))
        continue;
      Instruction *MatchingI = S.getMatchingMainOpOrAltOp(I);
      if (S.getMainOp() != MatchingI && S.getAltOp() != MatchingI)
        continue;
      if (auto *FPCI = dyn_cast<FPMathOperator>(I))
        FMF &= FPCI->getFastMathFlags();
    }
    return FMF.allowContract();
  };
  if (!CheckForContractable(VL))
    return InstructionCost::getInvalid();
  // The fmul operands must be contractable as well.
  InstructionsCompatibilityAnalysis Analysis(DT, DL, TTI, TLI);
  // ...
  if (OpS.isAltShuffle() || OpS.getOpcode() != Instruction::FMul)
    return InstructionCost::getInvalid();
  if (!CheckForContractable(Operands.front()))
    return InstructionCost::getInvalid();
  // Compare the cost of separate fmul + fadd against the fused form.
  InstructionCost FMulPlusFAddCost = 0;
  InstructionCost FMACost = 0;
  // ...
  FastMathFlags FMF;
  FMF.set();
  for (Value *V : VL) {
    auto *I = dyn_cast<Instruction>(V);
    if (!I)
      continue;
    if (!S.isCopyableElement(I))
      if (auto *FPCI = dyn_cast<FPMathOperator>(I))
        FMF &= FPCI->getFastMathFlags();
    FMulPlusFAddCost += TTI.getInstructionCost(I, CostKind);
  }
  // ...
  for (auto [V, Op] : zip(VL, Operands.front())) {
    if (S.isCopyableElement(V))
      continue;
    auto *I = dyn_cast<Instruction>(Op);
    if (!I || !I->hasOneUse() || OpS.isCopyableElement(I)) {
      if (auto *OpI = dyn_cast<Instruction>(Op))
        FMACost += TTI.getInstructionCost(OpI, CostKind);
      continue;
    }
    if (auto *FPCI = dyn_cast<FPMathOperator>(I))
      FMF &= FPCI->getFastMathFlags();
    FMulPlusFAddCost += TTI.getInstructionCost(I, CostKind);
  }
  // ... (return the FMA cost if cheaper than FMulPlusFAddCost)
}
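// Illustrative IR (not from the original source) of the contraction being
// costed here: with fast-math flags permitting contraction,
//   %m = fmul fast float %a, %b
//   %r = fadd fast float %m, %c
// can be priced as one fused multiply-add (e.g. llvm.fmuladd) instead of a
// separate fmul + fadd. Note the fast-math flags of all lanes are
// intersected (FMF &= ...), so fusion is only assumed when every lane
// allows it.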
void BoUpSLP::transformNodes() {
  // ...
  BaseGraphSize = VectorizableTree.size();
  // Turn graph transforming mode on and off, when done.
  class GraphTransformModeRAAI {
    bool &SavedIsGraphTransformMode;

  public:
    GraphTransformModeRAAI(bool &IsGraphTransformMode)
        : SavedIsGraphTransformMode(IsGraphTransformMode) {
      IsGraphTransformMode = true;
    }
    ~GraphTransformModeRAAI() { SavedIsGraphTransformMode = false; }
  } TransformContext(IsGraphTransformMode);
  // ...
  auto CheckOperandsProfitability = [this](Instruction *I1, Instruction *I2,
                                           const InstructionsState &S) {
    SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
    for (unsigned Op : seq<unsigned>(S.getMainOp()->getNumOperands()))
      Candidates.emplace_back().emplace_back(I1->getOperand(Op),
                                             I2->getOperand(Op));
    return all_of(
        Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
          return all_of(Cand,
                        [](const std::pair<Value *, Value *> &P) {
                          // ...
                        }) ||
                 /* ... */;
        });
  };

  // Try to reorder gather nodes for better vectorization opportunities.
  for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
    TreeEntry &E = *VectorizableTree[Idx];
    if (E.isGather())
      reorderGatherNode(E);
  }
  // ...
  constexpr unsigned VFLimit = 16;
  bool ForceLoadGather =
      count_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
        return TE->isGather() && TE->hasState() &&
               TE->getOpcode() == Instruction::Load &&
               TE->getVectorFactor() < VFLimit;
      }) /* ... */;
  // Checks if the scalars are used in another node.
  auto AreReusedScalars = [&](const TreeEntry *TE, ArrayRef<Value *> VL,
                              function_ref<bool(Value *)> CheckContainer) {
    return TE->isSame(VL) || all_of(VL, [&](Value *V) {
             // ...
           });
  };
  auto CheckForSameVectorNodes = [&](const TreeEntry &E) {
    if (E.hasState()) {
      if (ArrayRef<TreeEntry *> TEs = getTreeEntries(E.getMainOp());
          !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
            return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
              ArrayRef<TreeEntry *> VTEs = getTreeEntries(V);
              return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
                return is_contained(TEs, TE);
              });
            });
          }))
        return true;
      if (ArrayRef<TreeEntry *> TEs = getSplitTreeEntries(E.getMainOp());
          !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
            return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
              ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V);
              return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
                return is_contained(TEs, TE);
              });
            });
          }))
        return true;
    } else {
      // ...
      if (It != E.Scalars.end()) {
        if (ArrayRef<TreeEntry *> TEs = getSplitTreeEntries(*It);
            !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
              return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
                ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V);
                return !VTEs.empty() &&
                       any_of(VTEs, [&](const TreeEntry *TE) {
                         return is_contained(TEs, TE);
                       });
              });
            }))
          return true;
      }
    }
    return false;
  };
  // The tree may grow here, so iterate only up to BaseGraphSize.
  for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
    TreeEntry &E = *VectorizableTree[Idx];
    if (E.isGather()) {
      ArrayRef<Value *> VL = E.Scalars;
      const unsigned Sz = getVectorElementSize(VL.front());
      unsigned MinVF = getMinVF(2 * Sz);
      // Do not try partial vectorization for small nodes (<= 2), nodes with
      // the same opcode and same parent block or all constants.
      if (VL.size() <= 2 || LoadEntriesToVectorize.contains(Idx) ||
          !(!E.hasState() || E.getOpcode() == Instruction::Load ||
            /* ... */))
        continue;
      // ...
      if (ForceLoadGather && E.hasState() &&
          E.getOpcode() == Instruction::Load)
        continue;
      // Check if the node is a copy of other vector nodes.
      if (CheckForSameVectorNodes(E))
        continue;
      // Try to find vectorizable sub-sequences.
      unsigned StartIdx = 0;
      unsigned End = VL.size();
      for (unsigned VF = getFloorFullVectorNumberOfElements(
               *TTI, VL.front()->getType(), VL.size() - 1);
           VF >= MinVF; VF = getFloorFullVectorNumberOfElements(
                            *TTI, VL.front()->getType(), VF - 1)) {
        if (StartIdx + VF > End)
          continue;
        SmallVector<std::pair<unsigned, unsigned>> Slices;
        bool AllStrided = true;
        for (unsigned Cnt = StartIdx; Cnt + VF <= End; Cnt += VF) {
          ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
          // If any instruction is vectorized already - do not try again.
          if (/* ... */
              !getSameValuesTreeEntry(Slice.front(), Slice, /*SameVF=*/true))
            continue;
          InstructionsState S = getSameOpcode(Slice, *TLI);
          // ...
          bool IsSplat = isSplat(Slice);
          bool IsTwoRegisterSplat = true;
          if (IsSplat && VF == 2) {
            // ...
            IsTwoRegisterSplat = NumRegs2VF == 2;
          }
          if (Slices.empty() || !IsSplat || !IsTwoRegisterSplat ||
              /* ... */
              (S.getOpcode() == Instruction::Load &&
               /* ... */) ||
              (S.getOpcode() != Instruction::Load &&
               /* ... */)) {
            // ...
            if ((!UserIgnoreList || E.Idx != 0) &&
                TTI->getInstructionCost(S.getMainOp(), CostKind) <
                    TTI::TCC_Expensive) {
              // ...
            }
            if (S.getOpcode() == Instruction::Load) {
              OrdersType Order;
              SmallVector<Value *> PointerOps;
              StridedPtrInfo SPtrInfo;
              LoadsState Res = canVectorizeLoads(Slice, Slice.front(), Order,
                                                 PointerOps, SPtrInfo);
              // ...
              if (UserIgnoreList && E.Idx == 0)
                continue;
              // ...
            } else if (S.getOpcode() == Instruction::ExtractElement ||
                       (TTI->getInstructionCost(S.getMainOp(), CostKind) <
                            TTI::TCC_Expensive &&
                        !CheckOperandsProfitability(
                            /* ... */))) {
              // Do not vectorize extractelements or cheap scalars with
              // unprofitable operands.
              continue;
            }
          }
          Slices.emplace_back(Cnt, Slice.size());
        }
        if (VF == 2 && AllStrided && Slices.size() > 2)
          continue;
        auto AddCombinedNode = [&](unsigned Idx, unsigned Cnt, unsigned Sz) {
          E.CombinedEntriesWithIndices.emplace_back(Idx, Cnt);
          if (StartIdx == Cnt)
            StartIdx = Cnt + Sz;
          if (End == Cnt + Sz)
            End = Cnt;
        };
        for (auto [Cnt, Sz] : Slices) {
          ArrayRef<Value *> Slice = VL.slice(Cnt, Sz);
          const TreeEntry *SameTE = nullptr;
          if (const auto *It = find_if(Slice, IsaPred<Instruction>);
              It != Slice.end()) {
            // ...
            SameTE = getSameValuesTreeEntry(*It, Slice);
          }
          unsigned PrevSize = VectorizableTree.size();
          [[maybe_unused]] unsigned PrevEntriesSize =
              LoadEntriesToVectorize.size();
          buildTreeRec(Slice, 0, EdgeInfo(&E, UINT_MAX));
          if (PrevSize + 1 == VectorizableTree.size() && !SameTE &&
              VectorizableTree[PrevSize]->isGather() &&
              VectorizableTree[PrevSize]->hasState() &&
              VectorizableTree[PrevSize]->getOpcode() !=
                  Instruction::ExtractElement &&
              !isSplat(Slice)) {
            if (UserIgnoreList && E.Idx == 0 && VF == 2)
              analyzedReductionVals(Slice);
            VectorizableTree.pop_back();
            assert(PrevEntriesSize == LoadEntriesToVectorize.size() &&
                   "LoadEntriesToVectorize expected to remain the same");
            continue;
          }
          AddCombinedNode(PrevSize, Cnt, Sz);
        }
      }
    }
    // ...
    if (E.CombinedEntriesWithIndices.empty() && !E.ReorderIndices.empty()) {
      SmallVector<int> Mask(E.ReorderIndices.begin(), E.ReorderIndices.end());
      reorderScalars(E.Scalars, Mask);
      E.ReorderIndices.clear();
    }
    if (!E.hasState())
      continue;
    switch (E.getOpcode()) {
    case Instruction::Load: {
      // No need to handle masked gather vectorizations, do not reorder.
      if (E.State != TreeEntry::Vectorize)
        break;
      Type *ScalarTy = E.getMainOp()->getType();
      auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
      Align CommonAlignment = computeCommonAlignment<LoadInst>(E.Scalars);
      // Check if profitable to represent consecutive load + reverse as
      // strided load.
      if (/* ... */
          TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
        SmallVector<int> Mask;
        inversePermutation(E.ReorderIndices, Mask);
        auto *BaseLI = cast<LoadInst>(E.Scalars.back());
        InstructionCost OriginalVecCost =
            TTI->getMemoryOpCost(Instruction::Load, VecTy, BaseLI->getAlign(),
                                 BaseLI->getPointerAddressSpace(), CostKind,
                                 TTI::OperandValueInfo()) +
            ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
        InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
            Instruction::Load, VecTy, BaseLI->getPointerOperand(),
            /*VariableMask=*/false, CommonAlignment, CostKind, BaseLI);
        if (StridedCost < OriginalVecCost /* ... */) {
          // Strided load is more profitable than the consecutive load +
          // shuffle - transform the node to strided load.
          Type *StrideTy = DL->getIndexType(cast<LoadInst>(E.Scalars.front())
                                                ->getPointerOperand()
                                                ->getType());
          StridedPtrInfo SPtrInfo;
          SPtrInfo.StrideVal = ConstantInt::get(StrideTy, 1);
          SPtrInfo.Ty = VecTy;
          TreeEntryToStridedPtrInfoMap[&E] = SPtrInfo;
          E.State = TreeEntry::StridedVectorize;
        }
      }
      break;
    }
    case Instruction::Store: {
      Type *ScalarTy =
          cast<StoreInst>(E.getMainOp())->getValueOperand()->getType();
      auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
      Align CommonAlignment = computeCommonAlignment<StoreInst>(E.Scalars);
      // Check if profitable to represent consecutive store + reverse as
      // strided store.
      if (/* ... */
          TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
        SmallVector<int> Mask;
        inversePermutation(E.ReorderIndices, Mask);
        auto *BaseSI = cast<StoreInst>(E.Scalars.back());
        InstructionCost OriginalVecCost =
            TTI->getMemoryOpCost(Instruction::Store, VecTy, BaseSI->getAlign(),
                                 BaseSI->getPointerAddressSpace(), CostKind,
                                 TTI::OperandValueInfo()) +
            ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
        InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
            Instruction::Store, VecTy, BaseSI->getPointerOperand(),
            /*VariableMask=*/false, CommonAlignment, CostKind, BaseSI);
        if (StridedCost < OriginalVecCost)
          // Strided store is more profitable than reverse + consecutive
          // store - transform the node to strided store.
          E.State = TreeEntry::StridedVectorize;
      } else if (!E.ReorderIndices.empty()) {
        // Check for interleaved stores.
        auto IsInterleaveMask = [&, &TTI = *TTI](ArrayRef<int> Mask) {
          auto *BaseSI = cast<StoreInst>(E.Scalars.front());
          assert(Mask.size() > 1 && "Expected mask greater than 1 element.");
          if (Mask.size() < 4)
            return 0u;
          for (unsigned Factor : seq<unsigned>(2, Mask.size() / 2 + 1))
            if (ShuffleVectorInst::isInterleaveMask(
                    Mask, Factor, VecTy->getElementCount().getFixedValue()) &&
                TTI.isLegalInterleavedAccessType(
                    VecTy, Factor, BaseSI->getAlign(),
                    BaseSI->getPointerAddressSpace()))
              return Factor;
          return 0u;
        };
        SmallVector<int> Mask(E.ReorderIndices.begin(),
                              E.ReorderIndices.end());
        unsigned InterleaveFactor = IsInterleaveMask(Mask);
        if (InterleaveFactor != 0)
          E.setInterleave(InterleaveFactor);
      }
      break;
    }
    case Instruction::Select: {
      if (E.State != TreeEntry::Vectorize)
        break;
      // ...
      E.CombinedOp = TreeEntry::MinMax;
      TreeEntry *CondEntry = getOperandEntry(&E, 0);
      if (SelectOnly && CondEntry->UserTreeIndex &&
          CondEntry->State == TreeEntry::Vectorize) {
        // The condition node is part of the combined minmax node.
        CondEntry->State = TreeEntry::CombinedVectorize;
      }
      break;
    }
    case Instruction::FSub:
    case Instruction::FAdd: {
      // Check if possible to convert (a*b)+c to fma.
      if (E.State != TreeEntry::Vectorize ||
          !E.getOperations().isAddSubLikeOp())
        break;
      // ...
      E.CombinedOp = TreeEntry::FMulAdd;
      TreeEntry *FMulEntry = getOperandEntry(&E, 0);
      if (FMulEntry->UserTreeIndex &&
          FMulEntry->State == TreeEntry::Vectorize) {
        // The FMul node is part of the combined fmuladd node.
        FMulEntry->State = TreeEntry::CombinedVectorize;
      }
      break;
    }
    default:
      break;
    }
  }

  if (LoadEntriesToVectorize.empty()) {
    // Single load node - exit.
    if (VectorizableTree.size() <= 1 &&
        VectorizableTree.front()->hasState() &&
        VectorizableTree.front()->getOpcode() == Instruction::Load)
      return;
    // Small graph with small VF - exit.
    constexpr unsigned SmallTree = 3;
    constexpr unsigned SmallVF = 2;
    if ((VectorizableTree.size() <= SmallTree &&
         VectorizableTree.front()->Scalars.size() == SmallVF) ||
        (VectorizableTree.size() <= 2 && UserIgnoreList))
      return;

    if (VectorizableTree.front()->isNonPowOf2Vec() &&
        /* ... */
        !all_of(/* ... */,
                [](const std::unique_ptr<TreeEntry> &TE) {
                  return TE->isGather() && TE->hasState() &&
                         TE->getOpcode() == Instruction::Load &&
                         /* ... */;
                }))
      return;
  }

  // A list of loads to be gathered during the vectorization process. We can
  // try to vectorize them at the end, if profitable.
  SmallMapVector<std::tuple<BasicBlock *, Value *, Type *>,
                 SmallVector<SmallVector<std::pair<LoadInst *, int>>>, 8>
      GatheredLoads;

  for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    TreeEntry &E = *TE;
    if (E.isGather() &&
        ((E.hasState() && E.getOpcode() == Instruction::Load) ||
         (!E.hasState() && any_of(E.Scalars,
                                  [&](Value *V) {
                                    return isa<LoadInst>(V) &&
                                           !isVectorized(V) &&
                                           !isDeleted(cast<Instruction>(V));
                                  }))) /* ... */) {
      for (Value *V : E.Scalars) {
        auto *LI = dyn_cast<LoadInst>(V);
        if (!LI)
          continue;
        // ...
        gatherPossiblyVectorizableLoads(
            *this, V, *DL, *SE, *TTI,
            GatheredLoads[std::make_tuple(
                LI->getParent(),
                getUnderlyingObject(LI->getPointerOperand(),
                                    RecursionMaxDepth),
                LI->getType())]);
      }
    }
  }
  // Try to vectorize gathered loads if this is not just a gather of loads.
  if (!GatheredLoads.empty())
    tryToVectorizeGatheredLoads(GatheredLoads);
}
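// Worked example (illustrative only) for the strided-store rewrite above:
// stores to a[0], a[2], a[4], a[6] form a bundle whose reordered pointers
// advance by a constant stride of 2 elements; if
// TTI->isLegalStridedLoadStore(...) holds and the strided cost beats the
// plain vector-store + shuffle cost, the entry switches to
// TreeEntry::StridedVectorize. Similarly, a store reorder mask such as
//   { 0, 4, 1, 5, 2, 6, 3, 7 }
// (pairing the two halves lane by lane) can be recognized as an interleave
// mask with factor 2 and recorded via E.setInterleave(2).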
/// Merges shuffle masks and emits the final shuffle cost. The shuffles are
/// accumulated lazily: actual cost is added only if the resulting shuffle is
/// non-trivial.
class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
  bool IsFinalized = false;
  SmallVector<int> CommonMask;
  SmallVector<PointerUnion<Value *, const TreeEntry *>, 2> InVectors;
  const TargetTransformInfo &TTI;
  InstructionCost Cost = 0;
  SmallDenseSet<Value *> VectorizedVals;
  BoUpSLP &R;
  SmallPtrSetImpl<Value *> &CheckedExtracts;
  constexpr static TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  /// While set, the same nodes are being re-added and their shuffle cost is
  /// deferred into CommonMask instead of being estimated twice.
  bool SameNodesEstimated = true;

  static Constant *getAllOnesValue(const DataLayout &DL, Type *Ty) {
    if (Ty->getScalarType()->isPointerTy()) {
      // Build an all-ones pointer-sized value cast to the pointer type.
      Constant *Res = ConstantExpr::getIntToPtr(
          ConstantInt::get(
              IntegerType::get(Ty->getContext(),
                               DL.getTypeStoreSizeInBits(Ty->getScalarType())),
              APInt::getAllOnes(
                  DL.getTypeStoreSizeInBits(Ty->getScalarType()))),
          Ty->getScalarType());
      // ...
      return Res;
    }
    return Constant::getAllOnesValue(Ty);
  }

  InstructionCost getBuildVectorCost(ArrayRef<Value *> VL, Value *Root) {
    // ...
    const auto *It = find_if_not(VL, IsaPred<UndefValue>);
    assert(It != VL.end() && "Expected at least one non-undef value.");
    // Add broadcast for non-identity shuffle only.
    bool NeedShuffle =
        count(VL, *It) > 1 &&
        /* ... */;
    if (!NeedShuffle) {
      if (isa<FixedVectorType>(ScalarTy)) {
        // ...
        return TTI.getShuffleCost(
            TTI::SK_InsertSubvector, VecTy, /* ... */);
      }
      return TTI.getVectorInstrCost(Instruction::InsertElement, VecTy,
                                    CostKind, std::distance(VL.begin(), It),
                                    PoisonValue::get(VecTy), *It);
    }
    SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
    transform(VL, ShuffleMask.begin(), [](Value *V) {
      return isa<PoisonValue>(V) ? PoisonMaskElem : 0;
    });
    InstructionCost InsertCost =
        TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, 0,
                               PoisonValue::get(VecTy), *It);
    return InsertCost + ::getShuffleCost(TTI, TTI::SK_Broadcast,
                                         VecTy, ShuffleMask, CostKind,
                                         /* ... */);
  }

  InstructionCost getGatherCost(ArrayRef<Value *> Gathers, Value *Root,
                                /* ... */) {
    // ...
    return GatherCost +
           /* ... */
           R.getGatherCost(Gathers, !Root && VL.equals(Gathers),
                           /* ... */);
  }

  /// Computes the cost of per-register shuffles for the extracted values.
  InstructionCost
  computeExtractCost(ArrayRef<Value *> VL, ArrayRef<int> Mask,
                     ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
                     unsigned NumParts) {
    assert(VL.size() > NumParts && "Unexpected scalarized shuffle.");
    unsigned NumElts =
        std::accumulate(VL.begin(), VL.end(), 0, [](unsigned Sz, Value *V) {
          auto *EE = dyn_cast<ExtractElementInst>(V);
          if (!EE)
            return Sz;
          auto *VecTy = dyn_cast<FixedVectorType>(EE->getVectorOperandType());
          if (!VecTy)
            return Sz;
          return std::max(Sz, VecTy->getNumElements());
        });
    // ...
    auto CheckPerRegistersShuffle =
        [&](MutableArrayRef<int> Mask, SmallVectorImpl<unsigned> &Indices,
            SmallVectorImpl<unsigned> &SubVecSizes)
        -> std::optional<TTI::ShuffleKind> {
      if (NumElts <= EltsPerVector)
        return std::nullopt;
      int OffsetReg0 =
          alignDown(std::accumulate(Mask.begin(), Mask.end(), INT_MAX,
                                    [](int S, int I) {
                                      if (I == PoisonMaskElem)
                                        return S;
                                      return std::min(S, I);
                                    }),
                    EltsPerVector);
      int OffsetReg1 = OffsetReg0;
      // ...
      int FirstRegId = -1;
      Indices.assign(1, OffsetReg0);
      SmallDenseSet<int, 4> RegIndices;
      TTI::ShuffleKind ShuffleKind = TTI::SK_PermuteSingleSrc;
      for (int &I : Mask) {
        if (I == PoisonMaskElem)
          continue;
        int Idx = I - OffsetReg0;
        int RegId =
            (Idx / NumElts) * NumParts + (Idx % NumElts) / EltsPerVector;
        if (FirstRegId < 0)
          FirstRegId = RegId;
        RegIndices.insert(RegId);
        if (RegIndices.size() > 2)
          return std::nullopt;
        if (RegIndices.size() == 2) {
          ShuffleKind = TTI::SK_PermuteTwoSrc;
          if (Indices.size() == 1) {
            OffsetReg1 = alignDown(
                std::accumulate(
                    std::next(Mask.begin(), Pos), Mask.end(), INT_MAX,
                    [&](int S, int I) {
                      if (I == PoisonMaskElem)
                        return S;
                      int RegId = ((I - OffsetReg0) / NumElts) * NumParts +
                                  ((I - OffsetReg0) % NumElts) / EltsPerVector;
                      if (RegId == FirstRegId)
                        return S;
                      return std::min(S, I);
                    }),
                EltsPerVector);
            unsigned Index = OffsetReg1 % NumElts;
            Indices.push_back(Index);
            SubVecSizes.push_back(std::min(NumElts - Index, EltsPerVector));
          }
          Idx = I - OffsetReg1;
        }
        I = (Idx % NumElts) % EltsPerVector +
            (RegId == FirstRegId ? 0 : EltsPerVector);
      }
      return ShuffleKind;
    };
    InstructionCost Cost = 0;
    for (unsigned Part : seq<unsigned>(NumParts)) {
      if (!ShuffleKinds[Part])
        continue;
      ArrayRef<int> MaskSlice = Mask.slice(
          Part * EltsPerVector, getNumElems(Mask.size(), EltsPerVector, Part));
      // ...
      std::optional<TTI::ShuffleKind> RegShuffleKind =
          CheckPerRegistersShuffle(SubMask, Indices, SubVecSizes);
      if (!RegShuffleKind) {
        if (/* ... */
            !ShuffleVectorInst::isIdentityMask(
                MaskSlice, std::max<unsigned>(NumElts, MaskSlice.size())))
          Cost +=
              ::getShuffleCost(TTI, *ShuffleKinds[Part],
                               getWidenedType(ScalarTy, NumElts), MaskSlice);
        continue;
      }
      // ...
      const unsigned BaseVF = getFullVectorNumberOfElements(
          *R.TTI, VL.front()->getType(), alignTo(NumElts, EltsPerVector));
      for (const auto [Idx, SubVecSize] : zip(Indices, SubVecSizes)) {
        assert((Idx + SubVecSize) <= BaseVF &&
               "SK_ExtractSubvector index out of range");
        // ...
      }
      // Second attempt to check if the extracts can be rotated as a whole.
      InstructionCost OriginalCost = ::getShuffleCost(
          TTI, *ShuffleKinds[Part], getWidenedType(ScalarTy, NumElts), SubMask);
      if (OriginalCost < Cost)
        Cost = OriginalCost;
    }
    return Cost;
  }

  /// Adds the cost of reshuffling of the entries \p E1 and \p E2 (if present)
  /// into the common mask.
  void estimateNodesPermuteCost(const TreeEntry &E1, const TreeEntry *E2,
                                ArrayRef<int> Mask, unsigned Part,
                                unsigned SliceSize) {
    if (SameNodesEstimated) {
      // Delay the cost estimation if the same nodes are reshuffled: merge the
      // sub-mask into CommonMask to avoid double counting.
      if ((InVectors.size() == 2 &&
           /* ... */) ||
          /* ... */) {
        unsigned Limit = getNumElems(Mask.size(), SliceSize, Part);
        // ...
        assert(/* ... */ "Expected all poisoned elements.");
        ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
        copy(SubMask, std::next(CommonMask.begin(), SliceSize * Part));
        return;
      }
      // Found non-matching nodes - need to estimate the cost for the matched
      // inputs and transform the mask.
      Cost += createShuffle(InVectors.front(),
                            InVectors.size() == 1 ? nullptr : InVectors.back(),
                            CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    } else if (InVectors.size() == 2) {
      Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    }
    SameNodesEstimated = false;
    if (!E2 && InVectors.size() == 1) {
      unsigned VF = E1.getVectorFactor();
      if (Value *V1 = InVectors.front().dyn_cast<Value *>()) {
        VF = std::max(VF, getVF(V1));
      } else {
        const auto *E = cast<const TreeEntry *>(InVectors.front());
        VF = std::max(VF, E->getVectorFactor());
      }
      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
        if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
          CommonMask[Idx] = Mask[Idx] + VF;
      Cost += createShuffle(InVectors.front(), &E1, CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    } else {
      auto P = InVectors.front();
      Cost += createShuffle(&E1, E2, Mask);
      unsigned VF = Mask.size();
      if (Value *V1 = P.dyn_cast<Value *>()) {
        VF = std::max(VF, getVF(V1));
      } else {
        const auto *E = cast<const TreeEntry *>(P);
        VF = std::max(VF, E->getVectorFactor());
      }
      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
        if (Mask[Idx] != PoisonMaskElem)
          CommonMask[Idx] = Idx + (InVectors.empty() ? 0 : VF);
      Cost += createShuffle(P, InVectors.front(), CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    }
  }

  class ShuffleCostBuilder {
    const TargetTransformInfo &TTI;

    static bool isEmptyOrIdentity(ArrayRef<int> Mask, unsigned VF) {
      int Index = -1;
      return Mask.empty() ||
             (VF == Mask.size() &&
              ShuffleVectorInst::isIdentityMask(Mask, VF)) ||
             (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) &&
              Index == 0);
    }

  public:
    ShuffleCostBuilder(const TargetTransformInfo &TTI) : TTI(TTI) {}
    ~ShuffleCostBuilder() = default;
    InstructionCost createShuffleVector(Value *V1, Value *,
                                        ArrayRef<int> Mask) const {
      // Empty mask or identity mask are free.
      unsigned VF = cast<VectorType>(V1->getType())
                        ->getElementCount()
                        .getKnownMinValue();
      if (isEmptyOrIdentity(Mask, VF))
        return TTI::TCC_Free;
      return ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc,
                              cast<VectorType>(V1->getType()), Mask);
    }
    InstructionCost createShuffleVector(Value *V1, ArrayRef<int> Mask) const {
      // Empty mask or identity mask are free.
      unsigned VF = cast<VectorType>(V1->getType())
                        ->getElementCount()
                        .getKnownMinValue();
      if (isEmptyOrIdentity(Mask, VF))
        return TTI::TCC_Free;
      return ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc,
                              cast<VectorType>(V1->getType()), Mask);
    }
    InstructionCost createIdentity(Value *) const { return TTI::TCC_Free; }
    InstructionCost createPoison(Type *, unsigned) const {
      return TTI::TCC_Free;
    }
    void resizeToMatch(Value *&, Value *&) const {}
  };

  /// Smart shuffle cost emission over any mix of values and tree entries.
  InstructionCost
  createShuffle(const PointerUnion<Value *, const TreeEntry *> &P1,
                const PointerUnion<Value *, const TreeEntry *> &P2,
                ArrayRef<int> Mask) {
    ShuffleCostBuilder Builder(TTI);
    SmallVector<int> CommonMask(Mask);
    Value *V1 = P1.dyn_cast<Value *>(), *V2 = P2.dyn_cast<Value *>();
    unsigned CommonVF = Mask.size();
    InstructionCost ExtraCost = 0;
    auto GetNodeMinBWAffectedCost = [&](const TreeEntry &E,
                                        unsigned VF) -> InstructionCost {
      // ...
      Type *EScalarTy = E.Scalars.front()->getType();
      bool IsSigned = true;
      if (auto It = R.MinBWs.find(&E); It != R.MinBWs.end()) {
        EScalarTy = IntegerType::get(EScalarTy->getContext(),
                                     It->second.first);
        IsSigned = It->second.second;
      }
      if (EScalarTy != ScalarTy) {
        unsigned CastOpcode = Instruction::Trunc;
        unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
        unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
        if (DstSz > SrcSz)
          CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
        return TTI.getCastInstrCost(CastOpcode, getWidenedType(ScalarTy, VF),
                                    getWidenedType(EScalarTy, VF),
                                    TTI::CastContextHint::None, CostKind);
      }
      return TTI::TCC_Free;
    };
    auto GetValueMinBWAffectedCost = [&](const Value *V) -> InstructionCost {
      // ...
      auto *VecTy = cast<VectorType>(V->getType());
      Type *EScalarTy = VecTy->getElementType();
      if (EScalarTy != ScalarTy) {
        bool IsSigned = /* ... */;
        unsigned CastOpcode = Instruction::Trunc;
        unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
        unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
        if (DstSz > SrcSz)
          CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
        return TTI.getCastInstrCost(
            CastOpcode, VectorType::get(ScalarTy, VecTy->getElementCount()),
            VecTy, TTI::CastContextHint::None, CostKind);
      }
      return TTI::TCC_Free;
    };
    if (!V1 && !V2 && !P2.isNull()) {
      // Shuffle 2 entry nodes.
      const TreeEntry *E = cast<const TreeEntry *>(P1);
      unsigned VF = E->getVectorFactor();
      const TreeEntry *E2 = cast<const TreeEntry *>(P2);
      CommonVF = std::max(VF, E2->getVectorFactor());
      assert(all_of(Mask,
                    [=](int Idx) {
                      return Idx < 2 * static_cast<int>(CommonVF);
                    }) &&
             "All elements in mask must be less than 2 * CommonVF.");
      if (E->Scalars.size() == E2->Scalars.size()) {
        SmallVector<int> EMask = E->getCommonMask();
        SmallVector<int> E2Mask = E2->getCommonMask();
        if (!EMask.empty() || !E2Mask.empty()) {
          for (int &Idx : CommonMask) {
            if (Idx == PoisonMaskElem)
              continue;
            if (Idx < static_cast<int>(CommonVF) && !EMask.empty())
              Idx = EMask[Idx];
            else if (Idx >= static_cast<int>(CommonVF))
              Idx = (E2Mask.empty() ? Idx - CommonVF
                                    : E2Mask[Idx - CommonVF]) +
                    E->Scalars.size();
          }
        }
        CommonVF = E->Scalars.size();
        ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF) +
                     GetNodeMinBWAffectedCost(*E2, CommonVF);
      } else {
        ExtraCost += GetNodeMinBWAffectedCost(*E, E->getVectorFactor()) +
                     GetNodeMinBWAffectedCost(*E2, E2->getVectorFactor());
      }
      V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
      V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
    } else if (!V1 && P2.isNull()) {
      // Shuffle a single entry node.
      const TreeEntry *E = cast<const TreeEntry *>(P1);
      unsigned VF = E->getVectorFactor();
      CommonVF = VF;
      assert(
          all_of(Mask,
                 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
          "All elements in mask must be less than CommonVF.");
      if (E->Scalars.size() == Mask.size() && VF != Mask.size()) {
        SmallVector<int> EMask = E->getCommonMask();
        assert(!EMask.empty() && "Expected non-empty common mask.");
        for (int &Idx : CommonMask) {
          if (Idx != PoisonMaskElem)
            Idx = EMask[Idx];
        }
        CommonVF = E->Scalars.size();
      } else if (unsigned Factor = E->getInterleaveFactor();
                 Factor > 0 && E->Scalars.size() != Mask.size() &&
                 ShuffleVectorInst::isDeInterleaveMaskOfFactor(CommonMask,
                                                               Factor)) {
        // Deinterleaved nodes are free.
        std::iota(CommonMask.begin(), CommonMask.end(), 0);
      }
      ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF);
      V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
      // Not an identity/broadcast? Check if the original vector is better.
      if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() &&
          CommonVF == CommonMask.size() &&
          any_of(enumerate(CommonMask),
                 [](const auto &&P) {
                   return P.value() != PoisonMaskElem &&
                          static_cast<unsigned>(P.value()) != P.index();
                 }) &&
          /* ... */) {
        // ...
      }
    } else if (V1 && P2.isNull()) {
      // Shuffle a single vector.
      ExtraCost += GetValueMinBWAffectedCost(V1);
      CommonVF = getVF(V1);
      assert(
          all_of(Mask,
                 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
          "All elements in mask must be less than CommonVF.");
    } else if (V1 && !V2) {
      // Shuffle vector and tree node.
      unsigned VF = getVF(V1);
      const TreeEntry *E2 = cast<const TreeEntry *>(P2);
      CommonVF = std::max(VF, E2->getVectorFactor());
      assert(all_of(Mask,
                    [=](int Idx) {
                      return Idx < 2 * static_cast<int>(CommonVF);
                    }) &&
             "All elements in mask must be less than 2 * CommonVF.");
      if (E2->Scalars.size() == VF && VF != CommonVF) {
        SmallVector<int> E2Mask = E2->getCommonMask();
        assert(!E2Mask.empty() && "Expected non-empty common mask.");
        for (int &Idx : CommonMask) {
          if (Idx == PoisonMaskElem)
            continue;
          if (Idx >= static_cast<int>(CommonVF))
            Idx = E2Mask[Idx - CommonVF] + VF;
        }
        CommonVF = VF;
      }
      ExtraCost += GetValueMinBWAffectedCost(V1);
      V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
      ExtraCost += GetNodeMinBWAffectedCost(
          *E2, std::min(CommonVF, E2->getVectorFactor()));
      V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
    } else if (!V1 && V2) {
      // Shuffle tree node and vector.
      unsigned VF = getVF(V2);
      const TreeEntry *E1 = cast<const TreeEntry *>(P1);
      CommonVF = std::max(VF, E1->getVectorFactor());
      assert(all_of(Mask,
                    [=](int Idx) {
                      return Idx < 2 * static_cast<int>(CommonVF);
                    }) &&
             "All elements in mask must be less than 2 * CommonVF.");
      if (E1->Scalars.size() == VF && VF != CommonVF) {
        SmallVector<int> E1Mask = E1->getCommonMask();
        assert(!E1Mask.empty() && "Expected non-empty common mask.");
        for (int &Idx : CommonMask) {
          if (Idx == PoisonMaskElem)
            continue;
          if (Idx >= static_cast<int>(CommonVF))
            Idx = E1Mask[Idx - CommonVF] + VF;
          else
            Idx = E1Mask[Idx];
        }
        CommonVF = VF;
      }
      ExtraCost += GetNodeMinBWAffectedCost(
          *E1, std::min(CommonVF, E1->getVectorFactor()));
      V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
      ExtraCost += GetValueMinBWAffectedCost(V2);
      V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
    } else {
      assert(V1 && V2 && "Expected both vectors.");
      unsigned VF = getVF(V1);
      CommonVF = std::max(VF, getVF(V2));
      assert(all_of(Mask,
                    [=](int Idx) {
                      return Idx < 2 * static_cast<int>(CommonVF);
                    }) &&
             "All elements in mask must be less than 2 * CommonVF.");
      ExtraCost +=
          GetValueMinBWAffectedCost(V1) + GetValueMinBWAffectedCost(V2);
      if (V1->getType() != V2->getType()) {
        V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
        V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
      } else {
        // ...
        V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
      }
    }
    InVectors.front() =
        Constant::getNullValue(getWidenedType(ScalarTy, CommonMask.size()));
    if (InVectors.size() == 2)
      InVectors.pop_back();
    return ExtraCost + BaseShuffleAnalysis::createShuffle<InstructionCost>(
                           V1, V2, CommonMask, Builder, ScalarTy);
  }

public:
  ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI,
                       ArrayRef<Value *> VectorizedVals, BoUpSLP &R,
                       SmallPtrSetImpl<Value *> &CheckedExtracts)
      : BaseShuffleAnalysis(ScalarTy), TTI(TTI),
        VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()), R(R),
        CheckedExtracts(CheckedExtracts) {}

  Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
                        ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
                        unsigned NumParts, bool &UseVecBaseAsInput) {
    UseVecBaseAsInput = false;
    if (Mask.empty())
      return nullptr;
    Value *VecBase = nullptr;
    SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
    if (!E->ReorderIndices.empty()) {
      SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
                                   E->ReorderIndices.end());
      reorderScalars(VL, ReorderMask);
    }
    // Check if it can be considered reused if the same extractelements were
    // vectorized already.
    bool PrevNodeFound = any_of(
        ArrayRef(R.VectorizableTree).take_front(E->Idx),
        [&](const std::unique_ptr<TreeEntry> &TE) {
          return ((TE->hasState() && !TE->isAltShuffle() &&
                   TE->getOpcode() == Instruction::ExtractElement) ||
                  TE->isGather()) &&
                 all_of(enumerate(TE->Scalars), [&](auto &&Data) {
                   return VL.size() > Data.index() &&
                          (Mask[Data.index()] == PoisonMaskElem ||
                           isa<UndefValue>(VL[Data.index()]) ||
                           Data.value() == VL[Data.index()]);
                 });
        });
    SmallPtrSet<Value *, 4> UniqueBases;
    unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
    SmallDenseMap<Value *, APInt, 4> VectorOpsToExtracts;
    for (unsigned Part : seq<unsigned>(NumParts)) {
      unsigned Limit = getNumElems(VL.size(), SliceSize, Part);
      ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
      for (auto [I, V] :
           enumerate(ArrayRef(VL).slice(Part * SliceSize, Limit))) {
        // Ignore non-extractelement scalars.
        if (isa<UndefValue>(V) ||
            (!SubMask.empty() && SubMask[I] == PoisonMaskElem))
          continue;
        auto *EE = cast<ExtractElementInst>(V);
        VecBase = EE->getVectorOperand();
        UniqueBases.insert(VecBase);
        // If all users of the extract are vectorized, it will become dead:
        // skip accounting it.
        if (!CheckedExtracts.insert(V).second ||
            !R.areAllUsersVectorized(cast<Instruction>(V), &VectorizedVals) ||
            any_of(EE->users(),
                   [&](User *U) {
                     return isa<GetElementPtrInst>(U) &&
                            !R.areAllUsersVectorized(cast<Instruction>(U),
                                                     &VectorizedVals);
                   }) ||
            /* ... */)
          continue;
        std::optional<unsigned> EEIdx = getExtractIndex(EE);
        if (!EEIdx)
          continue;
        unsigned Idx = *EEIdx;
        // Take credit for the instruction that will become dead.
        if (EE->hasOneUse() || !PrevNodeFound) {
          // ...
          // Use getExtractWithExtendCost() for extractelement/ext pairs.
          Cost -= TTI.getExtractWithExtendCost(
              Ext->getOpcode(), Ext->getType(), EE->getVectorOperandType(),
              Idx, CostKind);
          // Add back the cost of s|zext which is subtracted separately.
          Cost += TTI.getCastInstrCost(
              Ext->getOpcode(), Ext->getType(), EE->getType(),
              TTI::getCastContextHint(Ext), CostKind, Ext);
          continue;
        }
        APInt &DemandedElts =
            VectorOpsToExtracts
                .try_emplace(VecBase,
                             APInt::getZero(getNumElements(VecBase->getType())))
                .first->getSecond();
        DemandedElts.setBit(Idx);
      }
    }
    for (const auto &[Vec, DemandedElts] : VectorOpsToExtracts)
      Cost -= TTI.getScalarizationOverhead(cast<VectorType>(Vec->getType()),
                                           DemandedElts, /*Insert=*/false,
                                           /*Extract=*/true, CostKind);
    // Check that the gather of extractelements can be represented as just a
    // shuffle of one/two input vectors.
    if (!PrevNodeFound)
      Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts);
    InVectors.assign(1, E);
    CommonMask.assign(Mask.begin(), Mask.end());
    transformMaskAfterShuffle(CommonMask, CommonMask);
    SameNodesEstimated = false;
    if (NumParts != 1 && UniqueBases.size() != 1) {
      UseVecBaseAsInput = true;
      VecBase =
          Constant::getNullValue(getWidenedType(ScalarTy, CommonMask.size()));
    }
    return VecBase;
  }

  std::optional<InstructionCost>
  needToDelay(const TreeEntry *,
              ArrayRef<SmallVector<const TreeEntry *>>) const {
    // No need to delay the cost estimation during analysis.
    return std::nullopt;
  }

  void resetForSameNode() {
    IsFinalized = false;
    CommonMask.clear();
    InVectors.clear();
    Cost = 0;
    VectorizedVals.clear();
    SameNodesEstimated = true;
  }

  void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
    if (&E1 == &E2) {
      assert(all_of(Mask,
                    [&](int Idx) {
                      return Idx < static_cast<int>(E1.getVectorFactor());
                    }) &&
             "Expected single vector shuffle mask.");
      add(E1, Mask);
      return;
    }
    if (InVectors.empty()) {
      CommonMask.assign(Mask.begin(), Mask.end());
      InVectors.assign({&E1, &E2});
      return;
    }
    assert(!CommonMask.empty() && "Expected non-empty common mask.");
    // ...
    unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
    estimateNodesPermuteCost(E1, &E2, Mask, Part, SliceSize);
  }
  void add(const TreeEntry &E1, ArrayRef<int> Mask) {
    if (InVectors.empty()) {
      CommonMask.assign(Mask.begin(), Mask.end());
      InVectors.assign(1, &E1);
      return;
    }
    assert(!CommonMask.empty() && "Expected non-empty common mask.");
    // ...
    unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
    estimateNodesPermuteCost(E1, nullptr, Mask, Part, SliceSize);
    if (!SameNodesEstimated && InVectors.size() == 1)
      InVectors.emplace_back(&E1);
  }
  /// Adds another input vector (in the form of an actual value).
  void add(Value *V1, ArrayRef<int> Mask, bool ForExtracts = false) {
    if (InVectors.empty()) {
      assert(CommonMask.empty() && !ForExtracts &&
             "Expected empty input mask/vectors.");
      CommonMask.assign(Mask.begin(), Mask.end());
      InVectors.assign(1, V1);
      return;
    }
    if (ForExtracts) {
      // No need to add vectors here, already handled them in adjustExtracts.
      assert(InVectors.size() == 1 &&
             isa<const TreeEntry *>(InVectors.front()) &&
             !CommonMask.empty() &&
             all_of(enumerate(CommonMask),
                    [&](auto P) {
                      Value *Scalar =
                          cast<const TreeEntry *>(InVectors.front())
                              ->getOrdered(P.index());
                      if (P.value() == PoisonMaskElem)
                        return P.value() == Mask[P.index()] ||
                               isa<UndefValue>(Scalar);
                      auto *EI = cast<ExtractElementInst>(Scalar);
                      return EI->getVectorOperand() == V1;
                    }) &&
             "Expected only tree entry for extractelement vectors.");
      return;
    }
    assert(!InVectors.empty() && !CommonMask.empty() &&
           "Expected only tree entries from extracts/reused buildvectors.");
    unsigned VF = getVF(V1);
    if (InVectors.size() == 2) {
      Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
      VF = std::max<unsigned>(VF, CommonMask.size());
    } else if (const auto *InTE =
                   InVectors.front().dyn_cast<const TreeEntry *>()) {
      VF = std::max(VF, InTE->getVectorFactor());
    } else {
      VF = std::max(VF, cast<FixedVectorType>(
                            cast<Value *>(InVectors.front())->getType())
                            ->getNumElements());
    }
    InVectors.push_back(V1);
    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
      if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
        CommonMask[Idx] = Mask[Idx] + VF;
  }
  Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
                Value *Root = nullptr) {
    Cost += getBuildVectorCost(VL, Root);
    if (!Root) {
      // FIXME: Need to find a way to avoid use of getNullValue here.
      SmallVector<Constant *> Vals;
      unsigned VF = VL.size();
      if (MaskVF != 0)
        VF = std::min(VF, MaskVF);
      Type *VLScalarTy = VL.front()->getType();
      // ... (build a dummy constant vector of VF elements)
      return ConstantVector::get(Vals);
    }
    return ConstantVector::getSplat(
        ElementCount::getFixed(
            cast<FixedVectorType>(Root->getType())->getNumElements()),
        getAllOnesValue(*R.DL, ScalarTy->getScalarType()));
  }
  InstructionCost createFreeze(InstructionCost Cost) { return Cost; }
  /// Finalize emission of the shuffles.
  InstructionCost
  finalize(ArrayRef<int> ExtMask,
           ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
           ArrayRef<int> SubVectorsMask, unsigned VF = 0,
           function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
    IsFinalized = true;
    if (Action) {
      const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
      if (InVectors.size() == 2)
        Cost += createShuffle(Vec, InVectors.back(), CommonMask);
      else
        Cost += createShuffle(Vec, nullptr, CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
      assert(VF > 0 &&
             "Expected vector length for the final value before action.");
      Value *V = cast<Value *>(Vec);
      Action(V, CommonMask);
      InVectors.front() = V;
    }
    if (!SubVectors.empty()) {
      const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
      if (InVectors.size() == 2)
        Cost += createShuffle(Vec, InVectors.back(), CommonMask);
      else
        Cost += createShuffle(Vec, nullptr, CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
      // Add subvectors permutation cost.
      if (!SubVectorsMask.empty()) {
        assert(SubVectorsMask.size() <= CommonMask.size() &&
               "Expected same size of masks for subvectors and common mask.");
        SmallVector<int> SVMask(CommonMask.size(), PoisonMaskElem);
        copy(SubVectorsMask, SVMask.begin());
        for (auto [I1, I2] : zip(SVMask, CommonMask)) {
          if (I2 != PoisonMaskElem) {
            assert(I1 == PoisonMaskElem && "Expected unused subvectors mask");
            I1 = I2 + CommonMask.size();
          }
        }
        Cost += ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc,
                                 getWidenedType(ScalarTy, CommonMask.size()),
                                 SVMask, CostKind);
      }
      for (auto [E, Idx] : SubVectors) {
        Type *EScalarTy = E->Scalars.front()->getType();
        bool IsSigned = true;
        if (auto It = R.MinBWs.find(E); It != R.MinBWs.end()) {
          EScalarTy =
              IntegerType::get(EScalarTy->getContext(), It->second.first);
          IsSigned = It->second.second;
        }
        if (ScalarTy != EScalarTy) {
          unsigned CastOpcode = Instruction::Trunc;
          unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
          unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
          if (DstSz > SrcSz)
            CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
          Cost += TTI.getCastInstrCost(
              CastOpcode, getWidenedType(ScalarTy, E->getVectorFactor()),
              getWidenedType(EScalarTy, E->getVectorFactor()),
              TTI::CastContextHint::Normal, CostKind);
        }
        // ...
        if (!CommonMask.empty()) {
          std::iota(std::next(CommonMask.begin(), Idx),
                    std::next(CommonMask.begin(), Idx + E->getVectorFactor()),
                    Idx);
        }
      }
    }
    if (!ExtMask.empty()) {
      if (CommonMask.empty()) {
        CommonMask.assign(ExtMask.begin(), ExtMask.end());
      } else {
        SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
        for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
          if (ExtMask[I] == PoisonMaskElem)
            continue;
          NewMask[I] = CommonMask[ExtMask[I]];
        }
        CommonMask.swap(NewMask);
      }
    }
    if (CommonMask.empty()) {
      assert(InVectors.size() == 1 && "Expected only one vector with no mask");
      return Cost;
    }
    return Cost +
           createShuffle(InVectors.front(),
                         InVectors.size() == 2 ? InVectors.back() : nullptr,
                         CommonMask);
  }

  ~ShuffleCostEstimator() {
    assert((IsFinalized || CommonMask.empty()) &&
           "Shuffle construction must be finalized.");
  }
};
const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E,
                                                   unsigned Idx) const {
  TreeEntry *Op = OperandsToTreeEntry.at({E, Idx});
  assert(Op->isSame(E->getOperand(Idx)) && "Operands mismatch!");
  return Op;
}
TTI::CastContextHint BoUpSLP::getCastContextHint(const TreeEntry &TE) const {
  if (TE.State == TreeEntry::ScatterVectorize ||
      TE.State == TreeEntry::StridedVectorize)
    return TTI::CastContextHint::GatherScatter;
  if (TE.State == TreeEntry::CompressVectorize)
    return TTI::CastContextHint::Masked;
  if (TE.State == TreeEntry::Vectorize &&
      TE.getOpcode() == Instruction::Load && !TE.isAltShuffle()) {
    if (TE.ReorderIndices.empty())
      return TTI::CastContextHint::Normal;
    SmallVector<int> Mask;
    inversePermutation(TE.ReorderIndices, Mask);
    if (ShuffleVectorInst::isReverseMask(Mask, Mask.size()))
      return TTI::CastContextHint::Reversed;
  }
  return TTI::CastContextHint::None;
}
14450 SmallPtrSetImpl<Value *> &CheckedExtracts) {
14455 return InstructionCost::getInvalid();
14460 auto It = MinBWs.find(
E);
14461 Type *OrigScalarTy = ScalarTy;
14462 if (It != MinBWs.end()) {
14469 unsigned EntryVF =
E->getVectorFactor();
14472 if (
E->isGather()) {
14476 return InstructionCost::getInvalid();
14478 ScalarTy = VL.
front()->getType();
14479 return processBuildVector<ShuffleCostEstimator, InstructionCost>(
14480 E, ScalarTy, *TTI, VectorizedVals, *
this, CheckedExtracts);
14482 if (
E->State == TreeEntry::SplitVectorize) {
14483 assert(
E->CombinedEntriesWithIndices.size() == 2 &&
14484 "Expected exactly 2 combined entries.");
14485 assert(
E->ReuseShuffleIndices.empty() &&
"Expected empty reuses mask.");
14487 if (
E->ReorderIndices.empty()) {
14490 E->CombinedEntriesWithIndices.back().second,
14493 VectorizableTree[
E->CombinedEntriesWithIndices.back().first]
14494 ->getVectorFactor()));
14496 unsigned CommonVF =
14497 std::max(VectorizableTree[
E->CombinedEntriesWithIndices.front().first]
14498 ->getVectorFactor(),
14499 VectorizableTree[
E->CombinedEntriesWithIndices.back().first]
14500 ->getVectorFactor());
14505 LLVM_DEBUG(dumpTreeCosts(
E, 0, VectorCost, 0,
"Calculated costs for Tree"));
14509 SmallVector<int>
Mask;
14510 if (!
E->ReorderIndices.empty() &&
E->State != TreeEntry::CompressVectorize &&
14511 (
E->State != TreeEntry::StridedVectorize ||
14513 SmallVector<int> NewMask;
14514 if (
E->getOpcode() == Instruction::Store) {
14516 NewMask.
resize(
E->ReorderIndices.size());
14523 if (!
E->ReuseShuffleIndices.empty())
14528 assert((
E->State == TreeEntry::Vectorize ||
14529 E->State == TreeEntry::ScatterVectorize ||
14530 E->State == TreeEntry::StridedVectorize ||
14531 E->State == TreeEntry::CompressVectorize) &&
14532 "Unhandled state");
14535 (
E->getOpcode() == Instruction::GetElementPtr &&
14536 E->getMainOp()->getType()->isPointerTy()) ||
14537 E->hasCopyableElements()) &&
14540 unsigned ShuffleOrOp =
14541 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector :
E->
getOpcode();
14542 if (
E->CombinedOp != TreeEntry::NotCombinedOp)
14543 ShuffleOrOp =
E->CombinedOp;
14544 SmallSetVector<Value *, 16> UniqueValues(VL.
begin(), VL.
end());
  const unsigned Sz = UniqueValues.size();
  SmallBitVector UsedScalars(Sz, false);
  for (unsigned I = 0; I < Sz; ++I) {
    if (/* ... */ !E->isCopyableElement(UniqueValues[I]) &&
        getTreeEntries(UniqueValues[I]).front() == E)
      UsedScalars.set(I);
  }
  auto GetCastContextHint = [&](Value *V) {
    // ...
      return getCastContextHint(*OpTEs.front());
    InstructionsState SrcState = getSameOpcode(E->getOperand(0), *TLI);
    if (SrcState && SrcState.getOpcode() == Instruction::Load &&
        !SrcState.isAltShuffle())
      // ...
  };
  // ...
    ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0);
  // ...
    for (unsigned I = 0; I < Sz; ++I) {
      if (UsedScalars.test(I))
        continue;
      ScalarCost += ScalarEltCost(I);
    }
  // ...
    if (It != MinBWs.end() && !UnaryInstruction::isCast(E->getOpcode()) &&
        (E->getOpcode() != Instruction::Load || E->UserTreeIndex)) {
      // ...
      if (!EI.UserTE->hasState() ||
          EI.UserTE->getOpcode() != Instruction::Select ||
          // ...
        auto UserBWIt = MinBWs.find(EI.UserTE);
        Type *UserScalarTy =
            (EI.UserTE->isGather() ||
             EI.UserTE->State == TreeEntry::SplitVectorize)
                ? EI.UserTE->Scalars.front()->getType()
                : EI.UserTE->getOperand(EI.EdgeIdx).front()->getType();
        if (UserBWIt != MinBWs.end())
          // ... UserBWIt->second.first);
        if (ScalarTy != UserScalarTy) {
          unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
          unsigned SrcBWSz = DL->getTypeSizeInBits(UserScalarTy);
          unsigned VecOpcode;
          // ...
          if (BWSz > SrcBWSz)
            VecOpcode = Instruction::Trunc;
          else
            VecOpcode =
                It->second.second ? Instruction::SExt : Instruction::ZExt;
          // ...
          VecCost += TTI->getCastInstrCost(VecOpcode, UserVecTy, VecTy, CCH,
                                           CostKind);
        }
      }
    }
    LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost - CommonCost, ScalarCost,
                             "Calculated costs for Tree"));
    return VecCost - ScalarCost;
  };
  // ...
    assert((E->State == TreeEntry::Vectorize ||
            E->State == TreeEntry::StridedVectorize ||
            E->State == TreeEntry::CompressVectorize) &&
           "Entry state expected to be Vectorize, StridedVectorize or "
           "MaskedLoadCompressVectorize here.");
    // ...
        *TTI, Ptrs, BasePtr, E->getOpcode(), CostKind, OrigScalarTy, VecTy);
    LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost,
                             "Calculated GEPs cost for Tree"));
    return VecCost - ScalarCost;
  };
  // ...
      return InstructionCost::getInvalid();
    Type *CanonicalType = Ty;
    // ...
    IntrinsicCostAttributes CostAttrs(MinMaxID, CanonicalType,
                                      {CanonicalType, CanonicalType});
    // ...
        TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
    // ...
    if (VI && SelectOnly) {
      // ... "Expected only for scalar type.");
      // ...
          CI->getOpcode(), Ty, Builder.getInt1Ty(), CI->getPredicate(),
          CostKind, {TTI::OK_AnyValue, TTI::OP_None},
          {TTI::OK_AnyValue, TTI::OP_None}, CI);
    }
    // ...
  };
  auto GetFMulAddCost = [&, &TTI = *TTI](const InstructionsState &S,
                                         /* ... */) {
    // ...
  };
  switch (ShuffleOrOp) {
  case Instruction::PHI: {
    // ...
    SmallPtrSet<const TreeEntry *, 4> CountedOps;
    for (Value *V : UniqueValues) {
      // ...
      ValueList Operands(PHI->getNumIncomingValues(), nullptr);
      for (unsigned I = 0, N = PHI->getNumIncomingValues(); I < N; ++I) {
        // ...
      }
      if (const TreeEntry *OpTE =
              getSameValuesTreeEntry(Operands.front(), Operands))
        if (CountedOps.insert(OpTE).second &&
            !OpTE->ReuseShuffleIndices.empty())
          ScalarCost += TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() -
                                          OpTE->Scalars.size());
    }
    return CommonCost - ScalarCost;
  }
  case Instruction::ExtractValue:
  case Instruction::ExtractElement: {
    APInt DemandedElts;
    // ...
    auto GetScalarCost = [&](unsigned Idx) {
      // ...
      if (ShuffleOrOp == Instruction::ExtractElement) {
        // ...
        SrcVecTy = EE->getVectorOperandType();
      } else {
        // ...
        Type *AggregateTy = EV->getAggregateOperand()->getType();
        // ...
          NumElts = ATy->getNumElements();
        // ...
      }
      // ...
      if (I->hasOneUse()) {
        // ...
        Cost -= TTI->getCastInstrCost(Ext->getOpcode(), Ext->getType(),
                                      I->getType(), /* ... */);
        // ...
      }
      // ...
    };
    // ...
      if (DemandedElts.isZero())
        // ...
      return CommonCost - (DemandedElts.isZero()
                               ? /* ... */
                               : TTI.getScalarizationOverhead(
                                     SrcVecTy, DemandedElts, false,
                                     /* ... */));
    // ...
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::InsertElement: {
    assert(E->ReuseShuffleIndices.empty() &&
           "Unique insertelements only are expected.");
    // ...
    unsigned const NumElts = SrcVecTy->getNumElements();
    unsigned const NumScalars = VL.size();
    // ...
    unsigned OffsetEnd = OffsetBeg;
    InsertMask[OffsetBeg] = 0;
    // ...
      if (OffsetBeg > Idx)
        // ...
      else if (OffsetEnd < Idx)
        // ...
      InsertMask[Idx] = I + 1;
    // ...
    if (NumOfParts > 0 && NumOfParts < NumElts)
      VecScalarsSz = PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
    unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
                     VecScalarsSz;
    unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
    unsigned InsertVecSz = std::min<unsigned>(
        /* ... */,
        ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * VecScalarsSz);
    bool IsWholeSubvector =
        OffsetBeg == Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
    // ...
    if (OffsetBeg + InsertVecSz > VecSz) {
      // ...
      InsertVecSz = VecSz;
    }
    // ...
    SmallVector<int> Mask;
    if (!E->ReorderIndices.empty()) {
      // ...
    } else {
      std::iota(Mask.begin(), std::next(Mask.begin(), InsertVecSz), 0);
    }
    bool IsIdentity = true;
    // ...
    Mask.swap(PrevMask);
    for (unsigned I = 0; I < NumScalars; ++I) {
      // ...
      DemandedElts.setBit(InsertIdx);
      IsIdentity &= InsertIdx - OffsetBeg == I;
      Mask[InsertIdx - OffsetBeg] = I;
    }
    assert(Offset < NumElts && "Failed to find vector index offset");
    // ...
        InsertVecTy, Mask);
    // ...
      return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
    // ...
    SmallBitVector InMask =
        isUndefVector(/* ... */,
                      buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask));
    if (!InMask.all() && NumScalars != NumElts && !IsWholeSubvector) {
      if (InsertVecSz != VecSz) {
        // ...
      }
      // ...
      for (unsigned I = 0, End = OffsetBeg - Offset; I < End; ++I)
        // ...
      for (unsigned I = OffsetBeg - Offset, End = OffsetEnd - Offset;
           /* ... */)
        // ...
      for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I)
        // ...
    }
    // ...
  }
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
    // ...
    unsigned Opcode = ShuffleOrOp;
    unsigned VecOpcode = Opcode;
    if (/* ... */ (SrcIt != MinBWs.end() || It != MinBWs.end())) {
      // ...
      unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy->getScalarType());
      if (SrcIt != MinBWs.end()) {
        SrcBWSz = SrcIt->second.first;
        // ...
      }
      unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType());
      if (BWSz == SrcBWSz) {
        VecOpcode = Instruction::BitCast;
      } else if (BWSz < SrcBWSz) {
        VecOpcode = Instruction::Trunc;
      } else if (It != MinBWs.end()) {
        assert(BWSz > SrcBWSz && "Invalid cast!");
        VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
      } else if (SrcIt != MinBWs.end()) {
        assert(BWSz > SrcBWSz && "Invalid cast!");
        VecOpcode =
            SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
      }
    } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
               !SrcIt->second.second) {
      VecOpcode = Instruction::UIToFP;
    }
    auto GetScalarCost = [&](unsigned Idx) {
      assert(Idx == 0 && "Expected 0 index only");
      return TTI->getCastInstrCost(Opcode, VL0->getType(),
                                   /* ... */);
    };
    auto GetVectorCost = [=](InstructionCost CommonCost) {
      // ...
      if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
        return CommonCost;
      auto *VI = VL0->getOpcode() == Opcode ? VL0 : nullptr;
      bool IsArithmeticExtendedReduction =
          E->Idx == 0 && UserIgnoreList &&
          // ...
            return is_contained({Instruction::Add, Instruction::FAdd,
                                 Instruction::Mul, Instruction::FMul,
                                 Instruction::And, Instruction::Or,
                                 /* ... */});
          // ...
      if (IsArithmeticExtendedReduction &&
          (VecOpcode == Instruction::ZExt || VecOpcode == Instruction::SExt))
        // ...
      return CommonCost +
             TTI->getCastInstrCost(VecOpcode, VecTy, SrcVecTy, CCH, CostKind,
                                   VecOpcode == Opcode ? VI : nullptr);
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::FCmp:
  case Instruction::ICmp:
  case Instruction::Select: {
    CmpPredicate VecPred, SwappedVecPred;
    // ...
    if (/* ... */ match(VL0, MatchCmp))
      // ...
    auto GetScalarCost = [&](unsigned Idx) {
      // ...
      if ((/* ... */ !match(VI, MatchCmp)) ||
          // ...
      return TTI->getCmpSelInstrCost(
          E->getOpcode(), OrigScalarTy, Builder.getInt1Ty(), CurrentPred,
          CostKind, getOperandInfo(VI->getOperand(0)),
          getOperandInfo(VI->getOperand(1)), VI);
    };
    auto GetVectorCost = [&](InstructionCost CommonCost) {
      // ...
          TTI->getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy, VecPred,
                                  CostKind, getOperandInfo(E->getOperand(0)),
                                  getOperandInfo(E->getOperand(1)), VL0);
      // ...
        unsigned CondNumElements = CondType->getNumElements();
        // ...
        assert(VecTyNumElements >= CondNumElements &&
               VecTyNumElements % CondNumElements == 0 &&
               "Cannot vectorize Instruction::Select");
        if (CondNumElements != VecTyNumElements) {
          // ...
        }
      // ...
      return VecCost + CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case TreeEntry::MinMax: {
    auto GetScalarCost = [&](unsigned Idx) {
      return GetMinMaxCost(OrigScalarTy);
    };
    auto GetVectorCost = [&](InstructionCost CommonCost) {
      // ...
      return VecCost + CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case TreeEntry::FMulAdd: {
    auto GetScalarCost = [&](unsigned Idx) {
      // ...
      return GetFMulAddCost(E->getOperations(), /* ... */);
    };
    auto GetVectorCost = [&](InstructionCost CommonCost) {
      // ...
      for (Value *V : E->Scalars) {
        // ...
          FMF &= FPCI->getFastMathFlags();
        // ...
          FMF &= FPCIOp->getFastMathFlags();
      }
      IntrinsicCostAttributes ICA(Intrinsic::fmuladd, VecTy,
                                  {VecTy, VecTy, VecTy}, FMF);
      // ...
      return VecCost + CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::FNeg:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    auto GetScalarCost = [&](unsigned Idx) {
      // ...
      Value *Op1 = E->getOperand(0)[Idx];
      // ...
      SmallVector<const Value *, 2> Operands(1, Op1);
      // ...
        Op2 = E->getOperand(1)[Idx];
      // ...
          ShuffleOrOp, OrigScalarTy, CostKind, Op1Info, Op2Info, Operands);
      if (/* ... */ I && (ShuffleOrOp == Instruction::FAdd ||
                          ShuffleOrOp == Instruction::FSub)) {
        // ...
      }
      // ...
    };
    auto GetVectorCost = [=](InstructionCost CommonCost) {
      // ...
      if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
        // ...
          return CI && CI->getValue().countr_one() >= It->second.first;
        // ...
      }
      // ...
      return TTI->getArithmeticInstrCost(ShuffleOrOp, VecTy, CostKind, Op1Info,
                                         Op2Info, {}, nullptr, TLI) +
             CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::GetElementPtr: {
    return CommonCost + GetGEPCostDiff(VL, VL0);
  }
  case Instruction::Load: {
    auto GetScalarCost = [&](unsigned Idx) {
      // ...
      return TTI->getMemoryOpCost(Instruction::Load, OrigScalarTy,
                                  VI->getAlign(), VI->getPointerAddressSpace(),
                                  /* ... */);
    };
    // ...
    auto GetVectorCost = [&](InstructionCost CommonCost) {
      InstructionCost VecLdCost;
      switch (E->State) {
      case TreeEntry::Vectorize:
        if (unsigned Factor = E->getInterleaveFactor()) {
          VecLdCost = TTI->getInterleavedMemoryOpCost(
              Instruction::Load, VecTy, Factor, {}, LI0->getAlign(),
              LI0->getPointerAddressSpace(), CostKind);
        } else {
          VecLdCost = TTI->getMemoryOpCost(
              Instruction::Load, VecTy, LI0->getAlign(),
              /* ... */);
        }
        break;
      case TreeEntry::StridedVectorize: {
        const StridedPtrInfo &SPtrInfo = TreeEntryToStridedPtrInfoMap.at(E);
        FixedVectorType *StridedLoadTy = SPtrInfo.Ty;
        assert(StridedLoadTy && "Missing StridedPoinerInfo for tree entry.");
        Align CommonAlignment = /* ... */;
        VecLdCost = TTI->getStridedMemoryOpCost(
            Instruction::Load, StridedLoadTy, LI0->getPointerOperand(),
            /*VariableMask=*/false, CommonAlignment, CostKind);
        if (StridedLoadTy != VecTy)
          VecLdCost +=
              TTI->getCastInstrCost(Instruction::BitCast, VecTy, StridedLoadTy,
                                    /* ... */);
        break;
      }
      case TreeEntry::CompressVectorize: {
        // ...
        unsigned InterleaveFactor;
        SmallVector<int> CompressMask;
        // ...
        if (!E->ReorderIndices.empty()) {
          SmallVector<int> Mask(E->ReorderIndices.begin(),
                                E->ReorderIndices.end());
          // ...
        }
        // ...
            Scalars, PointerOps, E->ReorderIndices, *TTI, *DL, *SE, *AC, *DT,
            *TLI, [](Value *) { return true; }, IsMasked, InterleaveFactor,
            CompressMask, LoadVecTy);
        assert(IsVectorized && "Failed to vectorize load");
        CompressEntryToData.try_emplace(E, CompressMask, LoadVecTy,
                                        InterleaveFactor, IsMasked);
        Align CommonAlignment = LI0->getAlign();
        if (InterleaveFactor) {
          VecLdCost = TTI->getInterleavedMemoryOpCost(
              Instruction::Load, LoadVecTy, InterleaveFactor, {},
              CommonAlignment, LI0->getPointerAddressSpace(), CostKind);
        } else if (IsMasked) {
          VecLdCost = TTI->getMaskedMemoryOpCost(
              {Intrinsic::masked_load, LoadVecTy, CommonAlignment,
               LI0->getPointerAddressSpace()},
              /* ... */);
          // ...
              LoadVecTy, CompressMask, CostKind);
        } else {
          VecLdCost = TTI->getMemoryOpCost(
              Instruction::Load, LoadVecTy, CommonAlignment,
              /* ... */);
          // ...
              LoadVecTy, CompressMask, CostKind);
        }
        break;
      }
      case TreeEntry::ScatterVectorize: {
        Align CommonAlignment = /* ... */;
        VecLdCost = TTI->getGatherScatterOpCost(
            Instruction::Load, VecTy, LI0->getPointerOperand(),
            /*VariableMask=*/false, CommonAlignment, CostKind);
        break;
      }
      case TreeEntry::CombinedVectorize:
      case TreeEntry::SplitVectorize:
      case TreeEntry::NeedToGather:
        // ...
      }
      return VecLdCost + CommonCost;
    };
    // ...
    if (E->State == TreeEntry::ScatterVectorize)
      return Cost;
    // ...
    return Cost + GetGEPCostDiff(PointerOps, LI0->getPointerOperand());
  }
  case Instruction::Store: {
    bool IsReorder = !E->ReorderIndices.empty();
    auto GetScalarCost = [=](unsigned Idx) {
      // ...
      return TTI->getMemoryOpCost(Instruction::Store, OrigScalarTy,
                                  VI->getAlign(), VI->getPointerAddressSpace(),
                                  /* ... */);
    };
    // ...
    auto GetVectorCost = [=](InstructionCost CommonCost) {
      InstructionCost VecStCost;
      if (E->State == TreeEntry::StridedVectorize) {
        Align CommonAlignment = /* ... */;
        VecStCost = TTI->getStridedMemoryOpCost(
            Instruction::Store, VecTy, BaseSI->getPointerOperand(),
            /*VariableMask=*/false, CommonAlignment, CostKind);
      } else {
        assert(E->State == TreeEntry::Vectorize &&
               "Expected either strided or consecutive stores.");
        if (unsigned Factor = E->getInterleaveFactor()) {
          assert(E->ReuseShuffleIndices.empty() && !E->ReorderIndices.empty() &&
                 "No reused shuffles expected");
          // ...
          VecStCost = TTI->getInterleavedMemoryOpCost(
              Instruction::Store, VecTy, Factor, {}, BaseSI->getAlign(),
              BaseSI->getPointerAddressSpace(), CostKind);
        } else {
          // ...
          VecStCost = TTI->getMemoryOpCost(
              Instruction::Store, VecTy, BaseSI->getAlign(),
              BaseSI->getPointerAddressSpace(), CostKind, OpInfo);
        }
      }
      return VecStCost + CommonCost;
    };
    // ...
      unsigned Idx = IsReorder ? E->ReorderIndices[I] : I;
    // ...
    return GetCostDiff(GetScalarCost, GetVectorCost) +
           GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
  }
  case Instruction::Call: {
    auto GetScalarCost = [&](unsigned Idx) {
      // ...
      IntrinsicCostAttributes CostAttrs(ID, *CI, 1);
      return TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
    };
    auto GetVectorCost = [=](InstructionCost CommonCost) {
      // ...
          CI, ID, VecTy->getNumElements(),
          It != MinBWs.end() ? It->second.first : 0, TTI);
      // ...
      return std::min(VecCallCosts.first, VecCallCosts.second) + CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::ShuffleVector: {
    // ...
           "Invalid Shuffle Vector Operand");
    // ...
    auto TryFindNodeWithEqualOperands = [=]() {
      for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
        // ...
        if (TE->hasState() && TE->isAltShuffle() &&
            ((TE->getOpcode() == E->getOpcode() &&
              TE->getAltOpcode() == E->getAltOpcode()) ||
             (TE->getOpcode() == E->getAltOpcode() &&
              TE->getAltOpcode() == E->getOpcode())) &&
            TE->hasEqualOperands(*E))
          return true;
      }
      return false;
    };
    auto GetScalarCost = [&](unsigned Idx) {
      // ...
      assert(E->getMatchingMainOpOrAltOp(VI) &&
             "Unexpected main/alternate opcode");
      // ...
      return TTI->getInstructionCost(VI, CostKind);
    };
    // ...
    if (TryFindNodeWithEqualOperands()) {
      LLVM_DEBUG({
        dbgs() << "SLP: diamond match for alternate node found.\n";
        // ...
      });
      // ...
    }
    // ...
        TTIRef.getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
    // ...
        TTIRef.getArithmeticInstrCost(E->getAltOpcode(), VecTy, CostKind);
    // ...
      VecCost = TTIRef.getCmpSelInstrCost(
          E->getOpcode(), VecTy, MaskTy, CI0->getPredicate(), CostKind,
          {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
          /* ... */);
      VecCost += TTIRef.getCmpSelInstrCost(
          E->getOpcode(), VecTy, MaskTy, /* ... */,
          {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
          /* ... */);
    // ...
      Type *SrcSclTy = E->getMainOp()->getOperand(0)->getType();
      // ...
      auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
      unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
      unsigned SrcBWSz =
          DL->getTypeSizeInBits(E->getMainOp()->getOperand(0)->getType());
      if (SrcIt != MinBWs.end()) {
        SrcBWSz = SrcIt->second.first;
        // ...
      }
      if (BWSz <= SrcBWSz) {
        if (BWSz < SrcBWSz)
          // ...
              TTIRef.getCastInstrCost(Instruction::Trunc, VecTy, SrcTy,
                                      /* ... */);
        LLVM_DEBUG(
            dbgs()
            << "SLP: alternate extension, which should be truncated.\n";
            /* ... */);
        // ...
      }
      VecCost = TTIRef.getCastInstrCost(E->getOpcode(), VecTy, SrcTy,
                                        /* ... */);
      VecCost +=
          TTIRef.getCastInstrCost(E->getAltOpcode(), VecTy, SrcTy,
                                  /* ... */);
    // ...
    SmallVector<int> Mask;
    E->buildAltOpShuffleMask(
        [&](Instruction *I) {
          assert(E->getMatchingMainOpOrAltOp(I) &&
                 "Unexpected main/alternate opcode");
          // ...
        },
        Mask);
    // ...
    unsigned Opcode0 = E->getOpcode();
    unsigned Opcode1 = E->getAltOpcode();
    SmallBitVector OpcodeMask(
        /* ... */);
    // ...
    if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
      // ...
          VecTy, Opcode0, Opcode1, OpcodeMask, CostKind);
      return AltVecCost < VecCost ? AltVecCost : VecCost;
    }
    // ...
    return GetCostDiff(
        // ...
               "Not supported shufflevector usage.");
        // ...
        unsigned SVNumElements =
            /* ... */
                ->getNumElements();
        unsigned GroupSize = SVNumElements / SV->getShuffleMask().size();
        for (size_t I = 0, End = VL.size(); I != End; I += GroupSize) {
          // ...
                 "Not supported shufflevector usage.");
          // ...
          [[maybe_unused]] bool IsExtractSubvectorMask =
              SV->isExtractSubvectorMask(Index);
          assert(IsExtractSubvectorMask &&
                 "Not supported shufflevector usage.");
          if (NextIndex != Index)
            // ...
          NextIndex += SV->getShuffleMask().size();
        }
        return ::getShuffleCost(
            /* ... */);
    // ...
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::Freeze:
    // ...
  }
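/// Returns true when the tree is one of the small shapes that is only worth
/// vectorizing if it can be vectorized completely: a single vectorizable node
/// (Vectorize/StridedVectorize/CompressVectorize), or a two-node tree whose
/// second node is an acceptable gather (splat, extractelements, or loads).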
bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
  LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
                    << VectorizableTree.size() << " is fully vectorizable .\n");
  auto &&AreVectorizableGathers = [this](const TreeEntry *TE, unsigned Limit) {
    SmallVector<int> Mask;
    return TE->isGather() &&
           // ...
               [this](Value *V) { return EphValues.contains(V); }) &&
           // ...
            (TE->Scalars.size() < Limit ||
             ((TE->hasState() &&
               TE->getOpcode() == Instruction::ExtractElement) ||
              // ...
              (TE->hasState() && TE->getOpcode() == Instruction::Load &&
               !TE->isAltShuffle()) ||
              /* ... */));
  };
  // ...
  if (VectorizableTree.size() == 1 &&
      (VectorizableTree[0]->State == TreeEntry::Vectorize ||
       VectorizableTree[0]->State == TreeEntry::StridedVectorize ||
       VectorizableTree[0]->State == TreeEntry::CompressVectorize ||
       // ...
        AreVectorizableGathers(VectorizableTree[0].get(),
                               VectorizableTree[0]->Scalars.size()) &&
        VectorizableTree[0]->getVectorFactor() > 2)))
    return true;

  if (VectorizableTree.size() != 2)
    return false;
  // ...
  if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
      AreVectorizableGathers(VectorizableTree[1].get(),
                             VectorizableTree[0]->Scalars.size()))
    return true;
  // ...
  if (VectorizableTree[0]->isGather() ||
      (VectorizableTree[1]->isGather() &&
       VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
       VectorizableTree[0]->State != TreeEntry::StridedVectorize &&
       VectorizableTree[0]->State != TreeEntry::CompressVectorize))
    return false;

  return true;
}
static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts,
                                       TargetTransformInfo *TTI,
                                       bool MustMatchOrInst) {
  // ...
  Value *ZextLoad = Root;
  const APInt *ShAmtC;
  bool FoundOr = false;
  while (/* ... */ ShAmtC->urem(8) == 0))) {
    auto *BinOp = cast<BinaryOperator>(ZextLoad);
    ZextLoad = BinOp->getOperand(0);
    if (BinOp->getOpcode() == Instruction::Or)
      FoundOr = true;
  }
  // ...
  if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root ||
      /* ... */)
    return false;
  // ...
  Type *SrcTy = Load->getType();
  unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts;
  // ...
  LLVM_DEBUG(dbgs() << "SLP: Assume load combining for tree starting at "
                    /* ... */);
  return true;
}

// ...
  unsigned NumElts = VectorizableTree[0]->Scalars.size();
  Value *FirstReduced = VectorizableTree[0]->Scalars[0];
// ...
  unsigned NumElts = Stores.size();
  for (Value *Scalar : Stores) {
    // ...
  }
  if (VectorizableTree.empty()) {
    assert(ExternalUses.empty() && "We shouldn't have any external users");
    // ...
  }
  // ...
  if (VectorizableTree.size() == 2 &&
      // ...
      VectorizableTree[1]->isGather() &&
      (VectorizableTree[1]->getVectorFactor() <= 2 ||
       !(isSplat(VectorizableTree[1]->Scalars) ||
         /* ... */)))
    return true;
  // ...
  constexpr int Limit = 4;
  if (/* ... */ !VectorizableTree.empty() &&
      all_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
        return (TE->isGather() &&
                (!TE->hasState() ||
                 TE->getOpcode() != Instruction::ExtractElement) &&
                /* ... */) ||
               (TE->hasState() && TE->getOpcode() == Instruction::PHI);
      }))
    return true;
  // ...
  if (/* ... */ VectorizableTree.size() <= Limit &&
      all_of(VectorizableTree,
             [&](const std::unique_ptr<TreeEntry> &TE) {
               return (TE->isGather() &&
                       (!TE->hasState() ||
                        TE->getOpcode() != Instruction::ExtractElement) &&
                       /* ... */) ||
                      (TE->getOpcode() == Instruction::InsertElement ||
                       (TE->getOpcode() == Instruction::PHI &&
                        /* ... */
                          return isa<PoisonValue>(V) || MustGather.contains(V);
                        /* ... */));
             }) &&
      any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
        return TE->State == TreeEntry::Vectorize &&
               TE->getOpcode() == Instruction::PHI;
      }))
    return true;
  // ...
  unsigned NumGathers = 0;
  constexpr int LimitTreeSize = 36;
  if (/* ... */
      all_of(VectorizableTree,
             [&](const std::unique_ptr<TreeEntry> &TE) {
               if (!TE->isGather() && TE->hasState() &&
                   (TE->getOpcode() == Instruction::Load ||
                    TE->getOpcode() == Instruction::Store)) {
                 // ...
               }
               if (TE->isGather())
                 ++NumGathers;
               return TE->State == TreeEntry::SplitVectorize ||
                      (TE->Idx == 0 && TE->Scalars.size() == 2 &&
                       TE->hasState() && TE->getOpcode() == Instruction::ICmp &&
                       VectorizableTree.size() > LimitTreeSize) ||
                      // ...
                      (TE->getOpcode() == Instruction::PHI ||
                       (TE->hasCopyableElements() &&
                        // ...
                            TE->Scalars.size() / 2) ||
                       ((!TE->ReuseShuffleIndices.empty() ||
                         !TE->ReorderIndices.empty() || TE->isAltShuffle()) &&
                        TE->Scalars.size() == 2));
             }) &&
      (StoreLoadNodes.empty() ||
       (VectorizableTree.size() > LimitTreeSize * StoreLoadNodes.size() &&
        (NumGathers > 0 || none_of(StoreLoadNodes, [&](const TreeEntry *TE) {
           return TE->getOpcode() == Instruction::Store ||
                  // ...
                    return !isa<LoadInst>(V) ||
                           areAllUsersVectorized(cast<Instruction>(V));
                  // ...
         })))))
    return true;
  // ...
  if (/* ... */
      VectorizableTree.front()->State == TreeEntry::SplitVectorize &&
      VectorizableTree.size() >= Limit &&
      // ...
          [&](const std::unique_ptr<TreeEntry> &TE) {
            return !TE->isGather() && TE->UserTreeIndex.UserTE &&
                   TE->UserTreeIndex.UserTE->Idx == 0;
          }))
    return true;
  // ...
  if (VectorizableTree.size() > 2 &&
      VectorizableTree.front()->State == TreeEntry::Vectorize &&
      VectorizableTree.front()->getOpcode() == Instruction::InsertElement &&
      VectorizableTree[1]->State == TreeEntry::Vectorize &&
      VectorizableTree[1]->getOpcode() == Instruction::PHI &&
      all_of(ArrayRef(VectorizableTree).drop_front(2),
             [&](const std::unique_ptr<TreeEntry> &TE) {
               return TE->isGather();
             }))
    return true;
  // ...
  if (isFullyVectorizableTinyTree(ForReduction))
    return false;
  // ...
  bool IsAllowedSingleBVNode =
      VectorizableTree.size() > 1 ||
      (VectorizableTree.size() == 1 && VectorizableTree.front()->hasState() &&
       !VectorizableTree.front()->isAltShuffle() &&
       VectorizableTree.front()->getOpcode() != Instruction::PHI &&
       VectorizableTree.front()->getOpcode() != Instruction::GetElementPtr &&
       /* ... */);
  if (any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
        return TE->isGather() && all_of(TE->Scalars, [&](Value *V) {
                 return isa<ExtractElementInst, Constant>(V) ||
                        (IsAllowedSingleBVNode &&
                         !V->hasNUsesOrMore(UsesLimit) &&
                         any_of(V->users(), IsaPred<InsertElementInst>));
               });
      }))
    return true;
  // ...
  if (VectorizableTree.back()->isGather() &&
      VectorizableTree.back()->hasState() &&
      VectorizableTree.back()->isAltShuffle() &&
      VectorizableTree.back()->getVectorFactor() > 2 &&
      // ...
      !VectorizableTree.back()->Scalars.front()->getType()->isVectorTy() &&
      TTI->getScalarizationOverhead(
          getWidenedType(VectorizableTree.back()->Scalars.front()->getType(),
                         VectorizableTree.back()->getVectorFactor()),
          /* ... */))
    return true;
  // ...
  constexpr unsigned SmallTree = 3;
  if (VectorizableTree.front()->isNonPowOf2Vec() &&
      // ...
          [](const std::unique_ptr<TreeEntry> &TE) {
            return TE->isGather() && TE->hasState() &&
                   TE->getOpcode() == Instruction::Load &&
                   /* ... */;
          }))
    return true;
  // ...
    TreeEntry &E = *VectorizableTree[Idx];
    if (E.State == TreeEntry::SplitVectorize)
      // ...
    if ((E.hasState() && E.getOpcode() != Instruction::Load) ||
        // ...
  const TreeEntry *Root = VectorizableTree.front().get();
  if (Root->isGather())
    // ...
  for (const auto &TEPtr : VectorizableTree) {
    if (!TEPtr->isGather()) {
      Instruction *LastInst = &getLastInstructionInBundle(TEPtr.get());
      EntriesToLastInstruction.try_emplace(TEPtr.get(), LastInst);
      LastInstructions.insert(LastInst);
    }
    if (TEPtr->UserTreeIndex)
      EntriesToOperands[TEPtr->UserTreeIndex.UserTE].push_back(TEPtr.get());
  }
  // ...
    if (II->isAssumeLikeIntrinsic())
      // ...
    return IntrCost < CallCost;
  // ...
      CheckedInstructions;
  unsigned Budget = 0;
  const unsigned BudgetLimit =
      /* ... */;
  auto CheckForNonVecCallsInSameBlock = [&](Instruction *First,
                                            Instruction *Last) {
    assert(/* ... */ "Expected instructions in same block.");
    if (auto It = CheckedInstructions.find(Last);
        It != CheckedInstructions.end()) {
      const Instruction *Checked = It->second.getPointer();
      // ...
        return It->second.getInt() != 0;
      // ...
    }
    // ...
        ++First->getIterator().getReverse(),
        // ...
        Last->getIterator().getReverse();
    // ...
    while (InstIt != PrevInstIt && Budget <= BudgetLimit) {
      // ...
        for (const Instruction *LastInst : LastInstsInRange)
          CheckedInstructions.try_emplace(LastInst, &*PrevInstIt, 0);
      // ...
      if (LastInstructions.contains(&*PrevInstIt))
        LastInstsInRange.push_back(&*PrevInstIt);
      // ...
    }
    for (const Instruction *LastInst : LastInstsInRange)
      CheckedInstructions.try_emplace(
          LastInst, PrevInstIt == InstIt ? First : &*PrevInstIt,
          Budget <= BudgetLimit ? 1 : 0);
    return Budget <= BudgetLimit;
  };
  auto AddCosts = [&](const TreeEntry *Op) {
    Type *ScalarTy = Op->Scalars.front()->getType();
    auto It = MinBWs.find(Op);
    if (It != MinBWs.end())
      // ...
    Cost += TTI->getCostOfKeepingLiveOverCall(VecTy);
    // ...
    Cost -= Op->Scalars.size() * TTI->getCostOfKeepingLiveOverCall(ScalarTy);
  };
  // ...
      ParentOpParentToPreds;
  auto CheckPredecessors = [&](BasicBlock *Root, BasicBlock *Pred,
                               BasicBlock *OpParent) {
    auto Key = std::make_pair(Root, OpParent);
    if (auto It = ParentOpParentToPreds.find(Key);
        It != ParentOpParentToPreds.end())
      // ...
    // ...
    for (const auto &KeyPair : ParentsPairsToAdd) {
      assert(/* ... */ "Should not have been added before.");
      // ...
    }
    // ...
    while (!Worklist.empty()) {
      // ...
      if (BB == OpParent || !Visited.insert(BB).second)
        continue;
      auto Pair = std::make_pair(BB, OpParent);
      if (auto It = ParentOpParentToPreds.find(Pair);
          It != ParentOpParentToPreds.end()) {
        // ...
        ParentsPairsToAdd.insert(Pair);
        // ...
      }
      if (Budget > BudgetLimit)
        // ...
    }
    // ...
  };
  // ...
  while (!LiveEntries.empty()) {
    // ...
    if (Operands.empty())
      continue;
    Instruction *LastInst = EntriesToLastInstruction.at(Entry);
    // ...
    for (const TreeEntry *Op : Operands) {
      if (!Op->isGather())
        // ...
      if (Entry->State == TreeEntry::SplitVectorize ||
          (Entry->getOpcode() != Instruction::PHI && Op->isGather()) ||
          // ...
        Pred = Phi->getIncomingBlock(Op->UserTreeIndex.EdgeIdx);
      // ...
      if (Op->isGather()) {
        assert(Entry->getOpcode() == Instruction::PHI &&
               "Expected phi node only.");
        // ...
            ->getIncomingBlock(Op->UserTreeIndex.EdgeIdx);
        // ...
        for (Value *V : Op->Scalars) {
          // ...
        }
        // ...
      } else {
        OpLastInst = EntriesToLastInstruction.at(Op);
      }
      // ...
      if (OpParent == Parent) {
        if (Entry->getOpcode() == Instruction::PHI) {
          if (!CheckForNonVecCallsInSameBlock(LastInst, OpLastInst))
            // ...
          continue;
        }
        if (!CheckForNonVecCallsInSameBlock(OpLastInst, LastInst))
          // ...
        continue;
      }
      // ...
      if (Entry->getOpcode() != Instruction::PHI &&
          !CheckForNonVecCallsInSameBlock(
              &*LastInst->getParent()->getFirstNonPHIOrDbgOrAlloca(),
              /* ... */))
        // ...
      if (!CheckForNonVecCallsInSameBlock(OpLastInst,
                                          /* ... */))
        // ...
      if (!CheckPredecessors(Parent, Pred, OpParent)) {
        // ...
      }
    }
  }
  const auto *I1 = IE1;
  const auto *I2 = IE2;
  // ...
  do {
    // ...
    if (I1 && (I1 == IE1 || I1->hasOneUse()) &&
        // ...
    if (I2 && ((I2 == IE2 || I2->hasOneUse())) &&
        // ...
  } while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2));
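
/// Helper that yields the value itself when instantiated for Value * and a
/// default-constructed U otherwise, letting cost estimation and codegen share
/// the same shuffle-action machinery below.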
struct ValueSelect {
  template <typename U>
  static std::enable_if_t<std::is_same_v<Value *, U>, Value *> get(Value *V) {
    return V;
  }
  template <typename U>
  static std::enable_if_t<!std::is_same_v<Value *, U>, U> get(Value *) {
    return U();
  }
};
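
/// Performs the requested actions over a list of (vector, mask) pairs feeding
/// insertelement users: folds the base vector in when it is not undef,
/// resizes inputs whose vector factors differ via ResizeAction, and chains
/// two-input shuffles step by step through the Action callback.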
template <typename T>
static T *performExtractsShuffleAction(
    MutableArrayRef<std::pair<T *, SmallVector<int>>> ShuffleMask, Value *Base,
    function_ref<unsigned(T *)> GetVF,
    function_ref<std::pair<T *, bool>(T *, MutableArrayRef<int>, bool)>
        ResizeAction,
    function_ref<T *(MutableArrayRef<int>, ArrayRef<T *>)> Action) {
  assert(!ShuffleMask.empty() && "Empty list of shuffles for inserts.");
  // ...
  auto VMIt = std::next(ShuffleMask.begin());
  // ...
      buildUseMask(Mask.size(), Mask, UseMask::UndefsAsMask);
  // ...
  if (!IsBaseUndef.all()) {
    // ...
    std::pair<T *, bool> Res =
        ResizeAction(ShuffleMask.begin()->first, Mask, /*ForSingleMask=*/false);
    // ...
    for (unsigned Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) {
      // ...
        Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF;
    }
    [[maybe_unused]] auto *V = ValueSelect::get<T *>(Base);
    assert((!V || GetVF(V) == Mask.size()) &&
           "Expected base vector of VF number of elements.");
    Prev = Action(Mask, {nullptr, Res.first});
  } else if (ShuffleMask.size() == 1) {
    // ...
    std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask,
                                            /*ForSingleMask=*/true);
    // ...
    Prev = Action(Mask, {ShuffleMask.begin()->first});
  } else {
    // ...
    unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);
    unsigned Vec2VF = GetVF(VMIt->first);
    if (Vec1VF == Vec2VF) {
      // ...
      for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
        // ...
          Mask[I] = SecMask[I] + Vec1VF;
      }
      Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
    } else {
      // ...
      std::pair<T *, bool> Res1 = ResizeAction(ShuffleMask.begin()->first, Mask,
                                               /*ForSingleMask=*/false);
      std::pair<T *, bool> Res2 =
          ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
      // ...
      for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
        // ...
          Mask[I] = (Res2.second ? I : SecMask[I]) + VF;
      }
      Prev = Action(Mask, {Res1.first, Res2.first});
    }
    VMIt = std::next(VMIt);
  }
  [[maybe_unused]] bool IsBaseNotUndef = !IsBaseUndef.all();
  // ...
  for (auto E = ShuffleMask.end(); VMIt != E; ++VMIt) {
    // ...
    std::pair<T *, bool> Res =
        ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
    // ...
    for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
      // ...
             "Multiple uses of scalars.");
      Mask[I] = (Res.second ? I : SecMask[I]) + VF;
      // ...
    }
    Prev = Action(Mask, {Prev, Res.first});
  }
  return Prev;
}

template <typename T> struct ShuffledInsertData {
  // ...
  MapVector<T, SmallVector<int>> ValueMasks;
};
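
/// Sums the cost of every vectorizable tree entry, then accounts for the
/// extra cost of extracting externally used scalars (or keeping them scalar
/// when that is cheaper), shuffles for external insertelement users, the
/// final resize for reductions with minimized bitwidth, and the spill cost of
/// vector values live across calls.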
  LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
                    << VectorizableTree.size() << ".\n");
  // ...
  for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
    TreeEntry &TE = *VectorizableTree[I];
    // ...
    if (TE.State == TreeEntry::CombinedVectorize) {
      LLVM_DEBUG(
          dbgs() << "SLP: Skipping cost for combined node that starts with "
                 << *TE.Scalars[0] << ".\n";
          TE.dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
      continue;
    }
    if (TE.hasState() &&
        (TE.isGather() || TE.State == TreeEntry::SplitVectorize)) {
      if (const TreeEntry *E =
              getSameValuesTreeEntry(TE.getMainOp(), TE.Scalars);
          E && E->getVectorFactor() == TE.getVectorFactor()) {
        // ...
        LLVM_DEBUG(dbgs() << "SLP: Current total cost = " << Cost << "\n");
        continue;
      }
    }
    // ...
    assert((!TE.isGather() || TE.Idx == 0 || TE.UserTreeIndex) &&
           "Expected gather nodes with users only.");
    // ...
    LLVM_DEBUG(dbgs() << "SLP: Current total cost = " << Cost << "\n");
  }
  // ...
      none_of(ExternalUses, [](const ExternalUser &EU) {
        // ...
      });
  // ...
  std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
  // ...
  for (ExternalUser &EU : ExternalUses) {
    ScalarUserAndIdx.emplace_back(EU.Scalar, EU.User, EU.Lane);
  }
  for (ExternalUser &EU : ExternalUses) {
    LLVM_DEBUG(dbgs() << "SLP: Computing cost for external use of TreeEntry "
                      << EU.E.Idx << " in lane " << EU.Lane << "\n");
    LLVM_DEBUG(if (EU.User) /* ... */;
               else dbgs() << " User: nullptr\n");
    LLVM_DEBUG(dbgs() << " Use: " << EU.Scalar->getNameOrAsOperand() << "\n");
    // ...
    if (EphValues.count(EU.User))
      continue;
    // ...
    if (!CheckedScalarUser.insert(std::make_pair(EU.Scalar, EU.User)).second ||
        /* ... */
        CheckedScalarUser.contains(std::make_pair(EU.Scalar, nullptr)))
      continue;
    // ...
        (!DT->isReachableFromEntry(UserParent) || UserParent->isEHPad() ||
         // ...
        !ExtractCostCalculated.insert(EU.Scalar).second)
      continue;
    // ...
      if (!UsedInserts.insert(VU).second)
        continue;
      // ...
      const TreeEntry *ScalarTE = &EU.E;
      // ...
          [this, VU](const ShuffledInsertData<const TreeEntry *> &Data) {
            // ...
            Value *Op0 = II->getOperand(0);
            // ...
          });
      if (It == ShuffledInserts.end()) {
        // ...
        Data.InsertElements.emplace_back(VU);
        // ...
        VecId = ShuffledInserts.size() - 1;
        auto It = MinBWs.find(ScalarTE);
        if (It != MinBWs.end() &&
            /* ... */
                .insert(std::make_pair(ScalarTE, FTy->getElementType()))
                /* ... */) {
          unsigned BWSz = It->second.first;
          unsigned DstBWSz = DL->getTypeSizeInBits(FTy->getElementType());
          unsigned VecOpcode;
          if (DstBWSz < BWSz)
            VecOpcode = Instruction::Trunc;
          else
            VecOpcode =
                It->second.second ? Instruction::SExt : Instruction::ZExt;
          // ...
                             FTy->getNumElements()),
          // ...
          LLVM_DEBUG(dbgs() /* ... */
                     << " for extending externally used vector with "
                        "non-equal minimum bitwidth.\n");
        }
      } else {
        // ...
        It->InsertElements.front() = VU;
        VecId = std::distance(ShuffledInserts.begin(), It);
      }
      int InIdx = *InsertIdx;
      SmallVectorImpl<int> &Mask =
          ShuffledInserts[VecId].ValueMasks[ScalarTE];
      // ...
      Mask[InIdx] = EU.Lane;
      DemandedElts[VecId].setBit(InIdx);
      // ...
    // ...
    auto *ScalarTy = EU.Scalar->getType();
    const unsigned BundleWidth = EU.E.getVectorFactor();
    assert(EU.Lane < BundleWidth && "Extracted lane out of bounds.");
    // ...
    const TreeEntry *Entry = &EU.E;
    auto It = MinBWs.find(Entry);
    if (It != MinBWs.end()) {
      // ...
                       ? Instruction::ZExt
                       : Instruction::SExt;
      // ...
      LLVM_DEBUG(dbgs() /* ... */ << ExtraCost << "\n");
    } else {
      ExtraCost = TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy,
                                          CostKind, EU.Lane, EU.Scalar,
                                          ScalarUserAndIdx);
      LLVM_DEBUG(dbgs() << " ExtractElement cost for " << *ScalarTy << " from "
                        << *VecTy << ": " << ExtraCost << "\n");
    }
    if (Entry->Idx != 0 || Entry->getOpcode() == Instruction::GetElementPtr ||
        Entry->getOpcode() == Instruction::Load) {
      // ...
      auto IsPhiInLoop = [&](const ExternalUser &U) {
        // ...
        const Loop *L = LI->getLoopFor(Phi->getParent());
        return L && (Phi->getParent() == I->getParent() ||
                     L == LI->getLoopFor(I->getParent()));
        // ...
      };
      if (!ValueToExtUses) {
        ValueToExtUses.emplace();
        for (const auto &P : enumerate(ExternalUses)) {
          // ...
          if (IsPhiInLoop(P.value()))
            continue;
          ValueToExtUses->try_emplace(P.value().Scalar, P.index());
        }
      }
      // ...
      auto OperandIsScalar = [&](Value *V) {
        // ...
          return !EE->hasOneUse() || !MustGather.contains(EE);
        // ...
        return ValueToExtUses->contains(V);
      };
      bool CanBeUsedAsScalar = all_of(Inst->operands(), OperandIsScalar);
      bool CanBeUsedAsScalarCast = false;
      if (/* ... */ Op && all_of(Op->operands(), OperandIsScalar)) {
        // ...
        if (ScalarCost + OpCost <= ExtraCost) {
          CanBeUsedAsScalar = CanBeUsedAsScalarCast = true;
          ScalarCost += OpCost;
        }
      }
      if (CanBeUsedAsScalar) {
        bool KeepScalar = ScalarCost <= ExtraCost;
        // ...
        bool IsProfitablePHIUser =
            (/* ... */
             VectorizableTree.front()->Scalars.size() > 2)) &&
            VectorizableTree.front()->hasState() &&
            VectorizableTree.front()->getOpcode() == Instruction::PHI &&
            // ...
              auto *PHIUser = dyn_cast<PHINode>(U);
              return (!PHIUser ||
                      PHIUser->getParent() !=
                          /* ... */
                              VectorizableTree.front()->getMainOp())
                      /* ... */);
            // ...
              return ValueToExtUses->contains(V);
            // ...
        if (IsProfitablePHIUser) {
          // ...
        } else if (/* ... */
                   (!GatheredLoadsEntriesFirst.has_value() ||
                    Entry->Idx < *GatheredLoadsEntriesFirst)) {
          unsigned ScalarUsesCount = count_if(Entry->Scalars, [&](Value *V) {
            return ValueToExtUses->contains(V);
          });
          auto It = ExtractsCount.find(Entry);
          if (It != ExtractsCount.end()) {
            assert(ScalarUsesCount >= It->getSecond().size() &&
                   "Expected total number of external uses not less than "
                   "number of scalar uses.");
            ScalarUsesCount -= It->getSecond().size();
          }
          // ...
          KeepScalar = ScalarUsesCount <= 1 || !has_single_bit(ScalarUsesCount);
        }
        if (KeepScalar) {
          ExternalUsesAsOriginalScalar.insert(EU.Scalar);
          for (Value *V : Inst->operands()) {
            auto It = ValueToExtUses->find(V);
            if (It != ValueToExtUses->end()) {
              // ...
              ExternalUses[It->second].User = nullptr;
            }
          }
          ExtraCost = ScalarCost;
          if (!IsPhiInLoop(EU))
            ExtractsCount[Entry].insert(Inst);
          if (CanBeUsedAsScalarCast) {
            ScalarOpsFromCasts.insert(Inst->getOperand(0));
            // ...
              for (Value *V : IOp->operands()) {
                auto It = ValueToExtUses->find(V);
                if (It != ValueToExtUses->end()) {
                  // ...
                  ExternalUses[It->second].User = nullptr;
                }
              }
            // ...
          }
        }
      }
    }
    ExtractCost += ExtraCost;
  }
  // ...
  for (Value *V : ScalarOpsFromCasts) {
    ExternalUsesAsOriginalScalar.insert(V);
    // ...
      ExternalUses.emplace_back(V, nullptr, *TEs.front(),
                                TEs.front()->findLaneForValue(V));
  }
  if (!VectorizedVals.empty()) {
    const TreeEntry &Root = *VectorizableTree.front();
    auto BWIt = MinBWs.find(&Root);
    if (BWIt != MinBWs.end()) {
      Type *DstTy = Root.Scalars.front()->getType();
      unsigned OriginalSz = DL->getTypeSizeInBits(DstTy->getScalarType());
      unsigned SrcSz =
          ReductionBitWidth == 0 ? BWIt->second.first : ReductionBitWidth;
      if (OriginalSz != SrcSz) {
        unsigned Opcode = Instruction::Trunc;
        if (OriginalSz > SrcSz)
          Opcode = BWIt->second.second ? Instruction::SExt : Instruction::ZExt;
        // ...
        Cost += TTI->getCastInstrCost(Opcode, DstTy, SrcTy,
                                      /* ... */);
      }
    }
  }

  Cost += ExtractCost;
  auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE, ArrayRef<int> Mask,
                                    bool ForSingleMask) {
    // ...
    unsigned VF = Mask.size();
    unsigned VecVF = TE->getVectorFactor();
    bool HasLargeIndex =
        any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); });
    if ((VF != VecVF && HasLargeIndex) ||
        /* ... */) {
      if (HasLargeIndex) {
        // ...
        std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)),
                  /* ... */);
        // ...
        LLVM_DEBUG(
            dbgs() << "SLP: Adding cost " << C
                   << " for final shuffle of insertelement external users.\n";
            TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
        // ...
        return std::make_pair(TE, true);
      }
      // ...
      if (!ForSingleMask) {
        // ...
        for (unsigned I = 0; I < VF; ++I) {
          // ...
            ResizeMask[Mask[I]] = Mask[I];
        }
        // ...
        LLVM_DEBUG(
            dbgs() << "SLP: Adding cost " << C
                   << " for final shuffle of insertelement external users.\n";
            TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
      }
      // ...
    }
    return std::make_pair(TE, false);
  };
  for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
    Value *Base = ShuffledInserts[I].InsertElements.front()->getOperand(0);
    auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
    // ...
      assert((TEs.size() == 1 || TEs.size() == 2) &&
             "Expected exactly 1 or 2 tree entries.");
      if (TEs.size() == 1) {
        // ...
        VF = TEs.front()->getVectorFactor();
        auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
        // ...
            (Data.index() < VF &&
             static_cast<int>(Data.index()) == Data.value());
        // ...
        LLVM_DEBUG(
            dbgs() /* ... */ << " for final shuffle of insertelement "
                                "external users.\n";
            TEs.front()->dump();
            dbgs() << "SLP: Current total cost = " << Cost << "\n");
        // ...
      } else {
        // ...
        if (/* ... */
            TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor())
          VF = TEs.front()->getVectorFactor();
        // ...
        auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
        // ...
        LLVM_DEBUG(
            dbgs() /* ... */
                   << " for final shuffle of vector node and external "
                      "insertelement users.\n";
            if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump();
            dbgs() << "SLP: Current total cost = " << Cost << "\n");
        // ...
      }
      // ...
    // ...
        [](const TreeEntry *E) { return E->getVectorFactor(); }, ResizeToVF,
        EstimateShufflesCost);
    // ...
            ShuffledInserts[I].InsertElements.front()->getType()),
        /* ... */);
    Cost -= InsertCost;
  }
  if (ReductionBitWidth != 0) {
    assert(UserIgnoreList && "Expected reduction tree.");
    const TreeEntry &E = *VectorizableTree.front();
    auto It = MinBWs.find(&E);
    if (It != MinBWs.end() && It->second.first != ReductionBitWidth) {
      unsigned SrcSize = It->second.first;
      unsigned DstSize = ReductionBitWidth;
      unsigned Opcode = Instruction::Trunc;
      if (SrcSize < DstSize) {
        bool IsArithmeticExtendedReduction =
            /* ... */
              return is_contained({Instruction::Add, Instruction::FAdd,
                                   Instruction::Mul, Instruction::FMul,
                                   Instruction::And, Instruction::Or,
                                   /* ... */});
            /* ... */;
        if (IsArithmeticExtendedReduction)
          Opcode = Instruction::BitCast;
        else
          Opcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
      }
      if (Opcode != Instruction::BitCast) {
        auto *SrcVecTy =
            getWidenedType(Builder.getIntNTy(SrcSize), E.getVectorFactor());
        auto *DstVecTy =
            getWidenedType(Builder.getIntNTy(DstSize), E.getVectorFactor());
        // ...
        switch (E.getOpcode()) {
        case Instruction::SExt:
        case Instruction::ZExt:
        case Instruction::Trunc: {
          const TreeEntry *OpTE = getOperandEntry(&E, 0);
          CCH = getCastContextHint(*OpTE);
          break;
        }
        // ...
        }
        CastCost += TTI->getCastInstrCost(Opcode, DstVecTy, SrcVecTy, CCH,
                                          CostKind);
        LLVM_DEBUG(
            dbgs() /* ... */ << " for final resize for reduction from "
                   << SrcVecTy << " to " << DstVecTy << "\n";
            dbgs() << "SLP: Current total cost = " << Cost << "\n");
        // ...
      }
    }
  }
  // ...
  std::optional<InstructionCost> SpillCost;
  // ...
    Cost += *SpillCost;
  // ...
  OS << "SLP: Spill Cost = ";
  // ...
  OS << ".\nSLP: Extract Cost = " << ExtractCost << ".\n"
     << "SLP: Total Cost = " << Cost << ".\n";
  // ...
    ViewGraph(this, "SLP" + F->getName(), false, Str);
  // ...
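
/// Tries to represent a single register-wide slice of a gather as a shuffle
/// of at most two existing vectors feeding extractelement users; on success
/// the matching positions in VL are swapped into the gathered-extracts list
/// and the resulting shuffle kind is returned.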
std::optional<TTI::ShuffleKind>
BoUpSLP::tryToGatherSingleRegisterExtractElements(
    MutableArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) const {
  // ...
  for (int I = 0, E = VL.size(); I < E; ++I) {
    // ...
    if (Idx >= VecTy->getNumElements()) {
      // ...
    }
    SmallBitVector ExtractMask(VecTy->getNumElements(), true);
    ExtractMask.reset(*Idx);
    // ...
    VectorOpToIdx[EI->getVectorOperand()].push_back(I);
  }
  // ...
  stable_sort(Vectors, [](const auto &P1, const auto &P2) {
    return P1.second.size() > P2.second.size();
  });
  // ...
  const int UndefSz = UndefVectorExtracts.size();
  unsigned SingleMax = 0;
  unsigned PairMax = 0;
  if (!Vectors.empty()) {
    SingleMax = Vectors.front().second.size() + UndefSz;
    if (Vectors.size() > 1) {
      auto *ItNext = std::next(Vectors.begin());
      PairMax = SingleMax + ItNext->second.size();
    }
  }
  if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
    return std::nullopt;
  // ...
  if (SingleMax >= PairMax && SingleMax) {
    for (int Idx : Vectors.front().second)
      std::swap(GatheredExtracts[Idx], VL[Idx]);
  } else if (!Vectors.empty()) {
    for (unsigned Idx : {0, 1})
      for (int Idx : Vectors[Idx].second)
        std::swap(GatheredExtracts[Idx], VL[Idx]);
  }
  // ...
  for (int Idx : UndefVectorExtracts)
    std::swap(GatheredExtracts[Idx], VL[Idx]);
  // ...
  std::optional<TTI::ShuffleKind> Res =
      /* ... */;
  if (!Res)
    return std::nullopt;
  // ...
  for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) {
    // ...
  }
  // ...
}
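
/// Splits VL into NumParts register-sized slices and runs the
/// single-register extractelement matching on each slice, collecting the
/// per-part shuffle kinds; the result is cleared entirely when no part
/// matches.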
SmallVector<std::optional<TTI::ShuffleKind>>
BoUpSLP::tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
                                    SmallVectorImpl<int> &Mask,
                                    unsigned NumParts) const {
  assert(NumParts > 0 && "NumParts expected be greater than or equal to 1.");
  // ...
    SmallVector<int> SubMask;
    std::optional<TTI::ShuffleKind> Res =
        tryToGatherSingleRegisterExtractElements(SubVL, SubMask);
    ShufflesRes[Part] = Res;
    copy(SubMask, std::next(Mask.begin(), Part * SliceSize));
  // ...
  if (none_of(ShufflesRes, [](const std::optional<TTI::ShuffleKind> &Res) {
        return Res.has_value();
      }))
    ShufflesRes.clear();
  return ShufflesRes;
}
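
/// Checks whether one register-wide part of a gather node can instead be
/// obtained as a one- or two-source shuffle of already-built tree entries,
/// respecting dominance and the scheduling order of the candidate entries.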
std::optional<TargetTransformInfo::ShuffleKind>
BoUpSLP::isGatherShuffledSingleRegisterEntry(
    const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
    SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part, bool ForOrder) {
  // ...
  auto GetUserEntry = [&](const TreeEntry *TE) {
    while (TE->UserTreeIndex && TE->UserTreeIndex.EdgeIdx == UINT_MAX)
      TE = TE->UserTreeIndex.UserTE;
    if (TE == VectorizableTree.front().get())
      return EdgeInfo(const_cast<TreeEntry *>(TE), 0);
    return TE->UserTreeIndex;
  };
  auto HasGatherUser = [&](const TreeEntry *TE) {
    while (TE->Idx != 0 && TE->UserTreeIndex) {
      if (TE->UserTreeIndex.EdgeIdx == UINT_MAX)
        return true;
      TE = TE->UserTreeIndex.UserTE;
    }
    return false;
  };
  const EdgeInfo TEUseEI = GetUserEntry(TE);
  if (!TEUseEI)
    return std::nullopt;
  const Instruction *TEInsertPt = &getLastInstructionInBundle(TEUseEI.UserTE);
  // ...
  if (auto *PHI = dyn_cast_or_null<PHINode>(
          TEUseEI.UserTE->hasState() ? TEUseEI.UserTE->getMainOp() : nullptr);
      PHI && TEUseEI.UserTE->State != TreeEntry::SplitVectorize) {
    TEInsertBlock = PHI->getIncomingBlock(TEUseEI.EdgeIdx);
    // ...
  } else {
    TEInsertBlock = TEInsertPt->getParent();
  }
  if (!DT->isReachableFromEntry(TEInsertBlock))
    return std::nullopt;
  auto *NodeUI = DT->getNode(TEInsertBlock);
  assert(NodeUI && "Should only process reachable instructions");
  // ...
  auto CheckOrdering = [&](const Instruction *InsertPt) {
    // ...
    const BasicBlock *InsertBlock = InsertPt->getParent();
    auto *NodeEUI = DT->getNode(InsertBlock);
    // ...
    assert((NodeUI == NodeEUI) ==
               (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) &&
           "Different nodes should have different DFS numbers");
    // ...
    if (TEInsertPt->getParent() != InsertBlock &&
        (DT->dominates(NodeUI, NodeEUI) || !DT->dominates(NodeEUI, NodeUI)))
      return false;
    if (TEInsertPt->getParent() == InsertBlock &&
        // ...
    return true;
  };
  SmallDenseMap<Value *, int> UsedValuesEntry;
  SmallPtrSet<const Value *, 16> VisitedValue;
  auto CheckAndUseSameNode = [&](const TreeEntry *TEPtr) {
    // ...
    if ((TEPtr->getVectorFactor() != VL.size() &&
         TEPtr->Scalars.size() != VL.size()) ||
        (!TEPtr->isSame(VL) && !TEPtr->isSame(TE->Scalars)))
      return false;
    // ...
    for (Value *V : VL) {
      // ...
    }
    return true;
  };
  auto CheckParentNodes = [&](const TreeEntry *User1, const TreeEntry *User2,
                              unsigned EdgeIdx) {
    const TreeEntry *Ptr1 = User1;
    const TreeEntry *Ptr2 = User2;
    SmallDenseMap<const TreeEntry *, unsigned> PtrToIdx;
    while (Ptr2) {
      // ...
      EdgeIdx = Ptr2->UserTreeIndex.EdgeIdx;
      Ptr2 = Ptr2->UserTreeIndex.UserTE;
    }
    while (Ptr1) {
      unsigned Idx = Ptr1->UserTreeIndex.EdgeIdx;
      Ptr1 = Ptr1->UserTreeIndex.UserTE;
      if (auto It = PtrToIdx.find(Ptr1); It != PtrToIdx.end())
        return Idx < It->second;
    }
    return false;
  };
  auto CheckNonSchedulableOrdering = [&](const TreeEntry *E,
                                         const Instruction *InsertPt) {
    return TEUseEI && TEUseEI.UserTE && TEUseEI.UserTE->hasCopyableElements() &&
           !TEUseEI.UserTE->isCopyableElement(/* ... */) &&
           // ...
           InsertPt->getNextNode() == TEInsertPt &&
           (!E->hasCopyableElements() || !E->isCopyableElement(InsertPt) ||
            /* ... */);
  };
  for (Value *V : VL) {
    // ...
    SmallPtrSet<const TreeEntry *, 4> VToTEs;
    for (const TreeEntry *TEPtr : ValueToGatherNodes.lookup(V)) {
      if (TEPtr == TE || TEPtr->Idx == 0)
        continue;
      assert(any_of(TEPtr->Scalars,
                    [&](Value *V) { return GatheredScalars.contains(V); }) &&
             "Must contain at least single gathered value.");
      assert(TEPtr->UserTreeIndex &&
             "Expected only single user of a gather node.");
      const EdgeInfo &UseEI = TEPtr->UserTreeIndex;
      // ...
      PHINode *UserPHI = (UseEI.UserTE->State != TreeEntry::SplitVectorize &&
                          UseEI.UserTE->hasState())
                             ? /* ... */
                             : nullptr;
      const Instruction *InsertPt =
          UserPHI ? /* ... */
                  : &getLastInstructionInBundle(UseEI.UserTE);
      if (TEInsertPt == InsertPt) {
        // ...
        if (TEUseEI.UserTE->State == TreeEntry::Vectorize &&
            (TEUseEI.UserTE->getOpcode() != Instruction::PHI ||
             TEUseEI.UserTE->isAltShuffle()) &&
            // ...
          if (UseEI.UserTE->State != TreeEntry::Vectorize ||
              (UseEI.UserTE->hasState() &&
               UseEI.UserTE->getOpcode() == Instruction::PHI &&
               !UseEI.UserTE->isAltShuffle()) ||
              // ...
            continue;
        // ...
            (TEUseEI.UserTE != UseEI.UserTE || TEUseEI.EdgeIdx < UseEI.EdgeIdx))
          continue;
        // ...
        if (TEUseEI.UserTE->State == TreeEntry::Vectorize &&
            TEUseEI.UserTE->getOpcode() == Instruction::PHI &&
            UseEI.UserTE->State == TreeEntry::Vectorize &&
            UseEI.UserTE->getOpcode() == Instruction::PHI &&
            TEUseEI.UserTE != UseEI.UserTE)
          continue;
        // ...
        if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx)
          continue;
        // ...
        if (TEUseEI.UserTE != UseEI.UserTE &&
            (TEUseEI.UserTE->Idx < UseEI.UserTE->Idx ||
             HasGatherUser(TEUseEI.UserTE)))
          continue;
        // ...
        if (CheckParentNodes(TEUseEI.UserTE, UseEI.UserTE, UseEI.EdgeIdx))
          continue;
      }
      // ...
      if (!TEUseEI.UserTE->isGather() && !UserPHI &&
          TEUseEI.UserTE->doesNotNeedToSchedule() !=
              UseEI.UserTE->doesNotNeedToSchedule() &&
          // ...
      if ((TEInsertBlock != InsertPt->getParent() ||
           TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) &&
          (!CheckOrdering(InsertPt) ||
           (UseEI.UserTE->hasCopyableElements() &&
            /* ... */)))
        continue;
      // ...
      if (CheckAndUseSameNode(TEPtr))
        // ...
      if (CheckNonSchedulableOrdering(UseEI.UserTE, InsertPt))
        continue;
      // ...
    }
    // ...
      auto It = find_if(
          VTEs, [&](const TreeEntry *MTE) { return MTE != TEUseEI.UserTE; });
      if (It != VTEs.end()) {
        const TreeEntry *VTE = *It;
        if (none_of(TE->CombinedEntriesWithIndices,
                    [&](const auto &P) { return P.first == VTE->Idx; })) {
          Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
          if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
            continue;
        }
        if (CheckAndUseSameNode(VTE))
          // ...
      }
    // ...
      const TreeEntry *VTE = VTEs.front();
      if (ForOrder && VTE->Idx < GatheredLoadsEntriesFirst.value_or(0) &&
          VTEs.size() > 1 && VTE->State != TreeEntry::Vectorize) {
        VTEs = VTEs.drop_front();
        // ...
        const auto *MIt = find_if(VTEs, [](const TreeEntry *MTE) {
          return MTE->State == TreeEntry::Vectorize;
        });
        if (MIt == VTEs.end())
          // ...
      }
      if (none_of(TE->CombinedEntriesWithIndices,
                  [&](const auto &P) { return P.first == VTE->Idx; })) {
        Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
        if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst) ||
            CheckNonSchedulableOrdering(VTE, &LastBundleInst))
          continue;
      }
      if (CheckAndUseSameNode(VTE))
        // ...
    // ...
    if (VToTEs.empty())
      continue;
    if (UsedTEs.empty()) {
      // ...
    } else {
      // ...
      SmallPtrSet<const TreeEntry *, 4> SavedVToTEs(VToTEs);
      // ...
      for (SmallPtrSet<const TreeEntry *, 4> &Set : UsedTEs) {
        // ...
        if (!VToTEs.empty()) {
          // ...
          break;
        }
        VToTEs = SavedVToTEs;
        // ...
      }
      // ...
      if (Idx == UsedTEs.size()) {
        // ...
        if (UsedTEs.size() == 2)
          continue;
        UsedTEs.push_back(SavedVToTEs);
        Idx = UsedTEs.size() - 1;
      }
      // ...
    }
  }

  if (UsedTEs.empty()) {
    // ...
    return std::nullopt;
  }
  if (UsedTEs.size() == 1) {
    // ...
    SmallVector<const TreeEntry *> FirstEntries(UsedTEs.front().begin(),
                                                UsedTEs.front().end());
    sort(FirstEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
      return TE1->Idx < TE2->Idx;
    });
    // ...
    auto *It = find_if(FirstEntries, [=](const TreeEntry *EntryPtr) {
      return EntryPtr->isSame(VL) || EntryPtr->isSame(TE->Scalars);
    });
    if (It != FirstEntries.end() &&
        ((*It)->getVectorFactor() == VL.size() ||
         ((*It)->getVectorFactor() == TE->Scalars.size() &&
          TE->ReuseShuffleIndices.size() == VL.size() &&
          (*It)->isSame(TE->Scalars)))) {
      // ...
      if ((*It)->getVectorFactor() == VL.size()) {
        std::iota(std::next(Mask.begin(), Part * VL.size()),
                  std::next(Mask.begin(), (Part + 1) * VL.size()), 0);
      } else {
        SmallVector<int> CommonMask = TE->getCommonMask();
        // ...
      }
      // ...
    }
    // ...
    Entries.push_back(FirstEntries.front());
    // ...
    for (auto &P : UsedValuesEntry)
      // ...
    VF = FirstEntries.front()->getVectorFactor();
  } else {
    // ...
    assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries.");
    // ...
    DenseMap<int, const TreeEntry *> VFToTE;
    for (const TreeEntry *TE : UsedTEs.front()) {
      unsigned VF = TE->getVectorFactor();
      auto It = VFToTE.find(VF);
      if (It != VFToTE.end()) {
        if (It->second->Idx > TE->Idx)
          It->getSecond() = TE;
        continue;
      }
      // ...
    }
    // ...
    SmallVector<const TreeEntry *> SecondEntries(UsedTEs.back().begin(),
                                                 UsedTEs.back().end());
    sort(SecondEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
      return TE1->Idx < TE2->Idx;
    });
    for (const TreeEntry *TE : SecondEntries) {
      auto It = VFToTE.find(TE->getVectorFactor());
      if (It != VFToTE.end()) {
        // ...
      }
    }
    // ...
    if (Entries.empty()) {
      // ...
          UsedTEs.front(), [](const TreeEntry *TE1, const TreeEntry *TE2) {
            return TE1->Idx < TE2->Idx;
          });
      Entries.push_back(SecondEntries.front());
      VF = std::max(Entries.front()->getVectorFactor(),
                    Entries.back()->getVectorFactor());
    } else {
      VF = Entries.front()->getVectorFactor();
    }
    // ...
  }
  // ...
  for (const TreeEntry *E : Entries)
    // ...
  for (auto &P : UsedValuesEntry) {
    // ...
    if (ValuesToEntries[Idx].contains(P.first)) {
      // ...
    }
  }
  auto AreCompatiblePHIs = [&](Value *V, Value *V1) {
    // ...
    for (int I = 0, E = PHI->getNumIncomingValues(); I < E; ++I) {
      // ...
      Value *In1 = PHI1->getIncomingValue(I);
      // ...
    }
    return true;
  };
  // ...
  auto MightBeIgnored = [=](Value *V) {
    // ...
    return /* ... */ !areAllUsersVectorized(I, UserIgnoreList) && isSimple(I);
  };
  // ...
  auto NeighborMightBeIgnored = [&](Value *V, int Idx) {
    Value *V1 = VL[Idx];
    bool UsedInSameVTE = false;
    auto It = UsedValuesEntry.find(V1);
    if (It != UsedValuesEntry.end())
      UsedInSameVTE = It->second == UsedValuesEntry.find(V)->second;
    return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE &&
           /* ... */;
  };
  // ...
  SmallBitVector UsedIdxs(Entries.size());
  // ...
  for (int I = 0, E = VL.size(); I < E; ++I) {
    // ...
    auto It = UsedValuesEntry.find(V);
    if (It == UsedValuesEntry.end())
      continue;
    // ...
        ((I > 0 && NeighborMightBeIgnored(V, I - 1)) ||
         (I != E - 1 && NeighborMightBeIgnored(V, I + 1)))))
      continue;
    unsigned Idx = It->second;
    // ...
  }
  // ...
  for (unsigned I = 0, Sz = Entries.size(); I < Sz; ++I) {
    if (!UsedIdxs.test(I))
      continue;
    // ...
    for (std::pair<unsigned, int> &Pair : EntryLanes)
      if (Pair.first == I)
        Pair.first = TempEntries.size();
    // ...
  }
  Entries.swap(TempEntries);
  if (EntryLanes.size() == Entries.size() &&
      // ...
          .slice(Part * VL.size(),
                 std::min<int>(VL.size(), TE->Scalars.size())))) {
    // ...
    return std::nullopt;
  }
  bool IsIdentity = Entries.size() == 1;
  // ...
  for (const std::pair<unsigned, int> &Pair : EntryLanes) {
    unsigned Idx = Part * VL.size() + Pair.second;
    // ...
    Mask[Idx] =
        (ForOrder ? std::distance(
                        Entries[Pair.first]->Scalars.begin(),
                        find(Entries[Pair.first]->Scalars, VL[Pair.second]))
                  : Entries[Pair.first]->findLaneForValue(VL[Pair.second]));
    IsIdentity &= Mask[Idx] == Pair.second;
  }
  if (ForOrder || IsIdentity || Entries.empty()) {
    switch (Entries.size()) {
    case 1:
      if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
        return TargetTransformInfo::SK_PermuteSingleSrc;
      break;
    case 2:
      if (EntryLanes.size() > 2 || VL.size() <= 2)
        return TargetTransformInfo::SK_PermuteTwoSrc;
      break;
    default:
      break;
    }
  } else if (/* ... */
             (EntryLanes.size() > Entries.size() || VL.size() <= 2)) {
    // ...
    SmallVector<int> SubMask(std::next(Mask.begin(), Part * VL.size()),
                             std::next(Mask.begin(), (Part + 1) * VL.size()));
    int MinElement = SubMask.front(), MaxElement = SubMask.front();
    for (int Idx : SubMask) {
      // ...
    }
    assert(MaxElement >= 0 && MinElement >= 0 &&
           MaxElement % VF >= MinElement % VF &&
           "Expected at least single element.");
    unsigned NewVF = std::max<unsigned>(
        /* ... */,
        (MaxElement % VF) - (MinElement % VF) + 1);
    // ...
    for (int &Idx : SubMask) {
      // ...
      Idx = ((Idx % VF) - (((MinElement % VF) / NewVF) * NewVF)) % NewVF +
            (Idx >= static_cast<int>(VF) ? NewVF : 0);
    }
    // ...
    auto *MaskVecTy = getWidenedType(VL.front()->getType(), SubMask.size());
    auto GetShuffleCost = [&,
                           &TTI = *TTI](ArrayRef<int> Mask,
                                        /* ... */) {
      if (Entries.size() == 1 && Entries.front()->getInterleaveFactor() > 0 &&
          /* ... */(Mask, Entries.front()->getInterleaveFactor()))
        return TTI::TCC_Free;
      return ::getShuffleCost(TTI,
                              /* ... */);
    };
    InstructionCost ShuffleCost = GetShuffleCost(SubMask, Entries, VecTy);
    // ...
    SmallVector<int> FirstMask(SubMask.begin(), SubMask.end());
    if (Entries.size() == 1 || !Entries[0]->isGather()) {
      FirstShuffleCost = ShuffleCost;
    } else {
      // ...
      bool IsIdentity = true;
      for (auto [I, Idx] : enumerate(FirstMask)) {
        if (Idx >= static_cast<int>(NewVF)) {
          // ...
        } else {
          IsIdentity &= static_cast<int>(I) == Idx;
        }
      }
      FirstShuffleCost = GetShuffleCost(FirstMask, Entries.front(), VecTy);
      // ...
          *TTI, VL.front()->getType(), MaskVecTy, DemandedElts, /*Insert=*/true,
          /* ... */);
    }
    SmallVector<int> SecondMask(SubMask.begin(), SubMask.end());
    if (Entries.size() == 1 || !Entries[1]->isGather()) {
      SecondShuffleCost = ShuffleCost;
    } else {
      // ...
      bool IsIdentity = true;
      for (auto [I, Idx] : enumerate(SecondMask)) {
        if (Idx < static_cast<int>(NewVF) && Idx >= 0) {
          // ...
        } else {
          IsIdentity &= static_cast<int>(I) == Idx;
        }
      }
      SecondShuffleCost = GetShuffleCost(SecondMask, Entries[1], VecTy);
      // ...
          *TTI, VL.front()->getType(), MaskVecTy, DemandedElts, /*Insert=*/true,
          /* ... */);
    }
    // ...
        *TTI, VL.front()->getType(), MaskVecTy, DemandedElts, /*Insert=*/true,
        /* ... */);
    const TreeEntry *BestEntry = nullptr;
    if (FirstShuffleCost < ShuffleCost) {
      std::for_each(std::next(Mask.begin(), Part * VL.size()),
                    std::next(Mask.begin(), (Part + 1) * VL.size()),
                    [&](int &Idx) {
                      if (Idx >= static_cast<int>(VF))
                        Idx = PoisonMaskElem;
                    });
      BestEntry = Entries.front();
      ShuffleCost = FirstShuffleCost;
    }
    if (SecondShuffleCost < ShuffleCost) {
      std::for_each(std::next(Mask.begin(), Part * VL.size()),
                    std::next(Mask.begin(), (Part + 1) * VL.size()),
                    [&](int &Idx) {
                      if (Idx < static_cast<int>(VF))
                        Idx = PoisonMaskElem;
                    });
      BestEntry = Entries[1];
      ShuffleCost = SecondShuffleCost;
    }
    if (BuildVectorCost >= ShuffleCost) {
      // ...
        Entries.push_back(BestEntry);
      // ...
    }
  }
  // ...
  std::fill(std::next(Mask.begin(), Part * VL.size()),
            /* ... */);
  return std::nullopt;
}
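
/// Per-register driver for the single-register check above: splits the
/// gather node into NumParts slices, computes a shuffle kind and the source
/// tree entries for each slice, and returns an empty result when no slice
/// can reuse an existing entry.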
BoUpSLP::isGatherShuffledEntry(
    // ...
  assert(NumParts > 0 && NumParts < VL.size() &&
         "Expected positive number of registers.");
  // ...
  if (TE == VectorizableTree.front().get() &&
      (!GatheredLoadsEntriesFirst.has_value() ||
       none_of(// ...
               [](const std::unique_ptr<TreeEntry> &TE) {
                 return !TE->isGather();
               })))
    // ...
  if (TE->hasNonWholeRegisterOrNonPowerOf2Vec(*TTI))
    // ...
  assert((TE->UserTreeIndex || TE == VectorizableTree.front().get()) &&
         "Expected only single user of the gather node.");
  // ...
         "Number of scalars must be divisible by NumParts.");
  if (TE->UserTreeIndex && TE->UserTreeIndex.UserTE->isGather() &&
      TE->UserTreeIndex.EdgeIdx == UINT_MAX &&
      // ...
      ((TE->hasState() && TE->getOpcode() == Instruction::ExtractElement) ||
       // ...
       getSameValuesTreeEntry(TE->getMainOp(), TE->Scalars))))
    // ...
  SmallVectorImpl<const TreeEntry *> &SubEntries = Entries.emplace_back();
  std::optional<TTI::ShuffleKind> SubRes =
      isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part,
                                          // ...
  if (!SubRes)
    SubEntries.clear();
  // ...
  if (SubEntries.front()->getVectorFactor() == VL.size() &&
      (SubEntries.front()->isSame(TE->Scalars) ||
       SubEntries.front()->isSame(VL))) {
    // ...
    LocalSubEntries.swap(SubEntries);
    // ...
    std::iota(Mask.begin(), Mask.end(), 0);
    // ...
    for (int I = 0, Sz = VL.size(); I < Sz; ++I)
      // ...
    Entries.emplace_back(1, LocalSubEntries.front());
    // ...
  }
  if (all_of(// ...
             [](const std::optional<TTI::ShuffleKind> &SK) { return !SK; })) {
    // ...
  }
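// Gather-cost estimation (below): each non-constant scalar is costed as an
// insertelement (plus a Trunc cast when its type was shrunk to ScalarTy),
// and any non-undef constant part is folded in through ConstantShuffleMask
// as a blend with a constant vector.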
                                       Type *ScalarTy) const {
  const unsigned VF = VL.size();
  // ...
  auto EstimateInsertCost = [&](unsigned I, Value *V) {
    // ...
    if (V->getType() != ScalarTy)
      Cost += TTI->getCastInstrCost(Instruction::Trunc, ScalarTy,
                                    V->getType(),
                                    // ...
  };
  std::iota(ConstantShuffleMask.begin(), ConstantShuffleMask.end(), 0);
  // ...
      ConstantShuffleMask[I] = I + VF;
    // ...
      EstimateInsertCost(I, V);
  // ...
  bool IsAnyNonUndefConst =
      // ...
  if (!ForPoisonSrc && IsAnyNonUndefConst) {
    // ...
        ConstantShuffleMask);
    // ...
  }
  if (!DemandedElements.isZero())
    // ...
      ForPoisonSrc && !IsAnyNonUndefConst, VL);
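// getLastInstructionInBundle returns the instruction after which the vector
// code for entry E may safely be emitted; results are memoized in
// EntryToLastInstruction. When the bundle's scalars span several blocks,
// dominator-tree DFS numbers are used to pick the latest position (or the
// earliest one, via FindFirstInst, for gathered loads).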
Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
  auto It = EntryToLastInstruction.find(E);
  if (It != EntryToLastInstruction.end())
    // ...
  if (E->hasState()) {
    Front = E->getMainOp();
    Opcode = E->getOpcode();
    // ...
  }
  // ...
  assert(((GatheredLoadsEntriesFirst.has_value() &&
           Opcode == Instruction::Load && E->isGather() &&
           E->Idx < *GatheredLoadsEntriesFirst) ||
          E->State == TreeEntry::SplitVectorize || E->hasCopyableElements() ||
          // ...
          [=](Value *V) -> bool {
            if (Opcode == Instruction::GetElementPtr &&
                !isa<GetElementPtrInst>(V))
              // ...
            auto *I = dyn_cast<Instruction>(V);
            return !I || !E->getMatchingMainOpOrAltOp(I) ||
                   I->getParent() == BB || isVectorLikeInstWithConstOps(I);
          })) &&
         "Expected gathered loads or GEPs or instructions from same basic "
         // ...
  auto FindLastInst = [&]() {
    // ...
    for (Value *V : E->Scalars) {
      // ...
      if (E->isCopyableElement(I))
        // ...
      if (LastInst->getParent() == I->getParent()) {
        // ...
      }
      assert(((Opcode == Instruction::GetElementPtr &&
               // ...
              E->State == TreeEntry::SplitVectorize ||
              // ...
              (GatheredLoadsEntriesFirst.has_value() &&
               Opcode == Instruction::Load && E->isGather() &&
               E->Idx < *GatheredLoadsEntriesFirst)) &&
             "Expected vector-like or non-GEP in GEP node insts only.");
      if (!DT->isReachableFromEntry(LastInst->getParent())) {
        // ...
      }
      if (!DT->isReachableFromEntry(I->getParent()))
        // ...
      auto *NodeA = DT->getNode(LastInst->getParent());
      auto *NodeB = DT->getNode(I->getParent());
      assert(NodeA && "Should only process reachable instructions");
      assert(NodeB && "Should only process reachable instructions");
      assert((NodeA == NodeB) ==
                 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
             "Different nodes should have different DFS numbers");
      if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn())
        // ...
    }
    // ...
  };
  auto FindFirstInst = [&]() {
    // ...
    for (Value *V : E->Scalars) {
      // ...
      if (E->isCopyableElement(I))
        // ...
      if (FirstInst->getParent() == I->getParent()) {
        if (I->comesBefore(FirstInst))
          // ...
      }
      assert(((Opcode == Instruction::GetElementPtr &&
               // ...
             "Expected vector-like or non-GEP in GEP node insts only.");
      if (!DT->isReachableFromEntry(FirstInst->getParent())) {
        // ...
      }
      if (!DT->isReachableFromEntry(I->getParent()))
        // ...
      auto *NodeA = DT->getNode(FirstInst->getParent());
      auto *NodeB = DT->getNode(I->getParent());
      assert(NodeA && "Should only process reachable instructions");
      assert(NodeB && "Should only process reachable instructions");
      assert((NodeA == NodeB) ==
                 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
             "Different nodes should have different DFS numbers");
      if (NodeA->getDFSNumIn() > NodeB->getDFSNumIn())
        // ...
    }
    // ...
  };
  if (E->State == TreeEntry::SplitVectorize) {
    Res = FindLastInst();
    // ...
    for (auto *E : Entries) {
      // ...
      I = &getLastInstructionInBundle(E);
      // ...
    }
    // ...
    EntryToLastInstruction.try_emplace(E, Res);
    // ...
  }
  // ...
  if (GatheredLoadsEntriesFirst.has_value() &&
      E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
      Opcode == Instruction::Load) {
    Res = FindFirstInst();
    EntryToLastInstruction.try_emplace(E, Res);
    // ...
  }
  // ...
  auto FindScheduleBundle = [&](const TreeEntry *E) -> const ScheduleBundle * {
    // ...
    const auto *It = BlocksSchedules.find(BB);
    if (It == BlocksSchedules.end())
      // ...
    for (Value *V : E->Scalars) {
      // ...
      if (Bundles.empty())
        // ...
      const auto *It = find_if(
          Bundles, [&](ScheduleBundle *B) { return B->getTreeEntry() == E; });
      if (It != Bundles.end())
        // ...
    }
    // ...
  };
  const ScheduleBundle *Bundle = FindScheduleBundle(E);
  if (!E->isGather() && !Bundle) {
    if ((Opcode == Instruction::GetElementPtr &&
         // ...
           return !isa<GetElementPtrInst>(V) && isa<Instruction>(V);
         // ...
           return isa<PoisonValue>(V) ||
                  (E->Idx == 0 && isa<InsertElementInst>(V)) ||
                  E->isCopyableElement(V) ||
                  (!isVectorLikeInstWithConstOps(V) &&
                   isUsedOutsideBlock(V));
         // ...
        (!E->doesNotNeedToSchedule() ||
         // ...
           if (!isa<Instruction>(V) ||
               (E->hasCopyableElements() && E->isCopyableElement(V)))
             // ...
           return !areAllOperandsNonInsts(V);
         // ...
           if (!isa<Instruction>(V) ||
               (E->hasCopyableElements() && E->isCopyableElement(V)))
             // ...
           return MustGather.contains(V);
         // ...
      Res = FindLastInst();
    else
      Res = FindFirstInst();
    EntryToLastInstruction.try_emplace(E, Res);
    // ...
  }
  // ...
  assert(!E->isGather() && "Gathered instructions should not be scheduled");
  Res = Bundle->getBundle().back()->getInst();
  EntryToLastInstruction.try_emplace(E, Res);
  // ...
  Res = FindLastInst();
  assert(Res && "Failed to find last instruction in bundle");
  EntryToLastInstruction.try_emplace(E, Res);
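// setInsertPointAfterBundle positions the IR builder right past the bundle's
// last instruction (skipping PHIs and landing pads) before any vector code
// for E is generated.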
void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
  auto *Front = E->getMainOp();
  Instruction *LastInst = &getLastInstructionInBundle(E);
  assert(LastInst && "Failed to find last instruction in bundle");
  // ...
    LastInstIt = LastInst->getParent()->getFirstNonPHIIt();
  if (LastInstIt != LastInst->getParent()->end() &&
      LastInstIt->getParent()->isLandingPad())
    LastInstIt = std::next(LastInstIt);
  // ...
  if ((!E->isGather() && E->State != TreeEntry::SplitVectorize &&
       (E->doesNotNeedToSchedule() ||
        (E->hasCopyableElements() && !E->isCopyableElement(LastInst) &&
         // ...
      (GatheredLoadsEntriesFirst.has_value() &&
       E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
       E->getOpcode() == Instruction::Load)) {
    Builder.SetInsertPoint(LastInst->getParent(), LastInstIt);
  } else {
    // ...
    Builder.SetInsertPoint(
        // ...
  }
  if (Instruction *Res = LastInstructionToPos.lookup(LastInst)) {
    // ...
  }
  // ...
  Res = Builder.CreateAlignedLoad(Builder.getPtrTy(),
                                  // ...
  LastInstructionToPos.try_emplace(LastInst, Res);
  // ...
  Builder.SetCurrentDebugLocation(Front->getDebugLoc());
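// gather() materializes VL as a chain of insertelement instructions,
// postponing scalars whose definitions are not yet available at the current
// insertion point (e.g. instructions defined inside an enclosing loop or in
// a successor block); postponed scalars are inserted last.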
Value *BoUpSLP::gather(
    ArrayRef<Value *> VL, Value *Root, Type *ScalarTy,
    function_ref<Value *(Value *, Value *, ArrayRef<int>)> CreateShuffle) {
  // ...
  SmallSet<int, 4> PostponedIndices;
  Loop *L = LI->getLoopFor(Builder.GetInsertBlock());
  // ...
    SmallPtrSet<BasicBlock *, 4> Visited;
    while (InsertBB && InsertBB != InstBB && Visited.insert(InsertBB).second)
      InsertBB = InsertBB->getSinglePredecessor();
    return InsertBB && InsertBB == InstBB;
  };
  for (int I = 0, E = VL.size(); I < E; ++I) {
    // ...
    if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) ||
         (L && (!Root || L->isLoopInvariant(Root)) && L->contains(Inst))) &&
        PostponedIndices.insert(I).second)
      // ...
  }
  auto &&CreateInsertElement = [this](Value *Vec, Value *V, unsigned Pos,
                                      // ...
    if (Scalar->getType() != Ty) {
      // ...
      Scalar = Builder.CreateIntCast(
          // ...
    }
    Vec = Builder.CreateInsertElement(Vec, Scalar, Builder.getInt32(Pos));
    // ...
    GatherShuffleExtractSeq.insert(InsElt);
    // ...
    User *UserOp = nullptr;
    // ...
    if (V->getType()->isVectorTy()) {
      // ...
          SV && SV->getOperand(0) != V && SV->getOperand(1) != V) {
        auto FindOperand = [](Value *Vec, Value *V) -> Instruction * {
          // ...
          if (SV->getOperand(0) == V)
            // ...
          if (SV->getOperand(1) == V)
            // ...
        };
        if (Instruction *User = FindOperand(SV->getOperand(0), V))
          // ...
        else if (Instruction *User = FindOperand(SV->getOperand(1), V))
          // ...
               "Failed to find shufflevector, caused by resize.");
      }
    }
    // ...
    unsigned FoundLane = Entries.front()->findLaneForValue(V);
    ExternalUses.emplace_back(V, UserOp, *Entries.front(), FoundLane);
    // ...
  };
  SmallVector<int> NonConsts;
  // ...
  std::iota(Mask.begin(), Mask.end(), 0);
  Value *OriginalRoot = Root;
  // ...
      SV->getOperand(0)->getType() == VecTy) {
    Root = SV->getOperand(0);
    Mask.assign(SV->getShuffleMask().begin(), SV->getShuffleMask().end());
  }
  for (int I = 0, E = VL.size(); I < E; ++I) {
    // ...
    Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
    // ...
  }
  // ...
    Vec = OriginalRoot;
  // ...
    Vec = CreateShuffle(Root, Vec, Mask);
    if (auto *OI = dyn_cast<Instruction>(OriginalRoot);
        OI && OI->use_empty() &&
        none_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
          return TE->VectorizedValue == OI;
        }))
      // ...
  for (int I : NonConsts)
    Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
  // ...
  for (const std::pair<Value *, unsigned> &Pair : PostponedInsts)
    Vec = CreateInsertElement(Vec, Pair.first, Pair.second, ScalarTy);
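// ShuffleInstructionBuilder: the IR-emitting counterpart of the cost-model
// shuffle analysis. Its inner ShuffleIRBuilder wraps IRBuilder so that every
// shuffle it creates is registered in GatherShuffleExtractSeq and CSEBlocks
// for later common-subexpression elimination.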
  bool IsFinalized = false;
  // ...
  class ShuffleIRBuilder {
    // ...
        : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq),
          CSEBlocks(CSEBlocks), DL(DL) {}
    ~ShuffleIRBuilder() = default;
    // ...
             "Expected integer vector types only.");
      // ...
              ->getIntegerBitWidth())
        V2 = Builder.CreateIntCast(
            // ...
        V1 = Builder.CreateIntCast(
            // ...
      Value *Vec = Builder.CreateShuffleVector(V1, V2, Mask);
      // ...
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      // ...
      unsigned VF = Mask.size();
      // ...
      Value *Vec = Builder.CreateShuffleVector(V1, Mask);
      // ...
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      // ...
    Value *createIdentity(Value *V) { return V; }
    Value *createPoison(Type *Ty, unsigned VF) {
      // ...
    }
    void resizeToMatch(Value *&V1, Value *&V2) {
      // ...
      int VF = std::max(V1VF, V2VF);
      int MinVF = std::min(V1VF, V2VF);
      // ...
      std::iota(IdentityMask.begin(), std::next(IdentityMask.begin(), MinVF),
                // ...
      Value *&Op = MinVF == V1VF ? V1 : V2;
      Op = Builder.CreateShuffleVector(Op, IdentityMask);
      // ...
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      // ...
    }
  };
  // ...
    assert(V1 && "Expected at least one vector value.");
    ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
                                    R.CSEBlocks, *R.DL);
    return BaseShuffleAnalysis::createShuffle<Value *>(V1, V2, Mask,
                                                       ShuffleBuilder,
                                                       ScalarTy);
  }
  // ...
                              std::optional<bool> IsSigned = std::nullopt) {
    // ...
    if (VecTy->getElementType() == ScalarTy->getScalarType())
      // ...
    return Builder.CreateIntCast(
        V,
        VectorType::get(ScalarTy->getScalarType(), VecTy->getElementCount()),
        // ...
  }
  // ...
  Value *getVectorizedValue(const TreeEntry &E) {
    Value *Vec = E.VectorizedValue;
    // ...
    return castToScalarTyElem(Vec, any_of(E.Scalars, [&](Value *V) {
                                return !isa<PoisonValue>(V) &&
                                       !isKnownNonNegative(
                                           V, SimplifyQuery(*R.DL));
                              }));
  }
  // ...
      : BaseShuffleAnalysis(ScalarTy), Builder(Builder), R(R) {}
  // ...
                       ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
                       unsigned NumParts, bool &UseVecBaseAsInput) {
    UseVecBaseAsInput = false;
    // ...
    Value *VecBase = nullptr;
    // ...
    if (!E->ReorderIndices.empty()) {
      // ...
                            E->ReorderIndices.end());
      // ...
    }
    for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
      // ...
      VecBase = EI->getVectorOperand();
      // ...
        VecBase = TEs.front()->VectorizedValue;
      assert(VecBase && "Expected vectorized value.");
      UniqueBases.insert(VecBase);
      // ...
      if (!EI->hasOneUse() || R.ExternalUsesAsOriginalScalar.contains(EI) ||
          (NumParts != 1 && count(VL, EI) > 1) ||
          // ...
            ArrayRef<TreeEntry *> UTEs = R.getTreeEntries(U);
            return UTEs.empty() || UTEs.size() > 1 ||
                   (isa<GetElementPtrInst>(U) &&
                    !R.areAllUsersVectorized(cast<Instruction>(U))) ||
                   // ...
                   count_if(R.VectorizableTree,
                            [&](const std::unique_ptr<TreeEntry> &TE) {
                              return TE->UserTreeIndex.UserTE ==
                                     // ...
                   is_contained(VL, EI);
          // ...
      R.eraseInstruction(EI);
    }
    if (NumParts == 1 || UniqueBases.size() == 1) {
      assert(VecBase && "Expected vectorized value.");
      return castToScalarTyElem(VecBase);
    }
    UseVecBaseAsInput = true;
    // ...
    Value *Vec = nullptr;
    // ...
      constexpr int MaxBases = 2;
      // ...
      auto VLMask = zip(SubVL, SubMask);
      const unsigned VF = std::accumulate(
          VLMask.begin(), VLMask.end(), 0U, [&](unsigned S, const auto &D) {
            if (std::get<1>(D) == PoisonMaskElem)
              // ...
                cast<ExtractElementInst>(std::get<0>(D))->getVectorOperand();
            if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecOp);
                // ...
              VecOp = TEs.front()->VectorizedValue;
            assert(VecOp && "Expected vectorized value.");
            const unsigned Size =
                cast<FixedVectorType>(VecOp->getType())->getNumElements();
            return std::max(S, Size);
          });
      for (const auto [V, I] : VLMask) {
        // ...
          VecOp = TEs.front()->VectorizedValue;
        assert(VecOp && "Expected vectorized value.");
        VecOp = castToScalarTyElem(VecOp);
        Bases[I / VF] = VecOp;
      }
      if (!Bases.front())
        // ...
      if (Bases.back()) {
        SubVec = createShuffle(Bases.front(), Bases.back(), SubMask);
        TransformToIdentity(SubMask);
      } else {
        SubVec = Bases.front();
      }
      // ...
        ArrayRef<int> SubMask =
            Mask.slice(P * SliceSize,
                       // ...
        return all_of(SubMask, [](int Idx) {
          // ...
               "Expected first part or all previous parts masked.");
        copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
        // ...
        unsigned SubVecVF =
            // ...
        NewVF = std::max(NewVF, SubVecVF);
        // ...
        for (int &Idx : SubMask)
          // ...
        copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
        Vec = createShuffle(Vec, SubVec, VecMask);
        TransformToIdentity(VecMask);
      // ...
  }
  std::optional<Value *>
  // ...
        TEs, [](const TreeEntry *TE) { return TE->VectorizedValue; });
      // ...
      return std::nullopt;
    // ...
    auto *ResVecTy = getWidenedType(ScalarTy, E->getVectorFactor());
    return Builder.CreateAlignedLoad(
        // ...
  }
  // ...
    IsFinalized = false;
    CommonMask.clear();
    // ...
    Value *V1 = getVectorizedValue(E1);
    Value *V2 = getVectorizedValue(E2);
    // ...
    Value *V1 = getVectorizedValue(E1);
    // ...
    assert(V1 && V2 && !Mask.empty() && "Expected non-empty input vectors.");
    // ...
           "castToScalarTyElem expects V1 and V2 to be FixedVectorType");
    V1 = castToScalarTyElem(V1);
    V2 = castToScalarTyElem(V2);
    if (InVectors.empty()) {
      InVectors.push_back(V1);
      InVectors.push_back(V2);
      CommonMask.assign(Mask.begin(), Mask.end());
      // ...
    }
    Value *Vec = InVectors.front();
    if (InVectors.size() == 2) {
      Vec = createShuffle(Vec, InVectors.back(), CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    // ...
      Vec = createShuffle(Vec, nullptr, CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    }
    V1 = createShuffle(V1, V2, Mask);
    unsigned VF = std::max(getVF(V1), getVF(Vec));
    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
      // ...
        CommonMask[Idx] = Idx + VF;
    InVectors.front() = Vec;
    if (InVectors.size() == 2)
      InVectors.back() = V1;
    else
      InVectors.push_back(V1);
  }
  // ...
           "castToScalarTyElem expects V1 to be FixedVectorType");
    V1 = castToScalarTyElem(V1);
    if (InVectors.empty()) {
      InVectors.push_back(V1);
      CommonMask.assign(Mask.begin(), Mask.end());
      // ...
    }
    const auto *It = find(InVectors, V1);
    if (It == InVectors.end()) {
      if (InVectors.size() == 2 ||
          InVectors.front()->getType() != V1->getType()) {
        Value *V = InVectors.front();
        if (InVectors.size() == 2) {
          V = createShuffle(InVectors.front(), InVectors.back(), CommonMask);
          transformMaskAfterShuffle(CommonMask, CommonMask);
        // ...
                   CommonMask.size()) {
          V = createShuffle(InVectors.front(), nullptr, CommonMask);
          transformMaskAfterShuffle(CommonMask, CommonMask);
        }
        unsigned VF = std::max(CommonMask.size(), Mask.size());
        for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
          // ...
            CommonMask[Idx] = V->getType() != V1->getType()
                                  // ...
                                  : Mask[Idx] + getVF(V1);
        if (V->getType() != V1->getType())
          V1 = createShuffle(V1, nullptr, Mask);
        InVectors.front() = V;
        if (InVectors.size() == 2)
          InVectors.back() = V1;
        // ...
      }
      // ...
      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
        // ...
      InVectors.push_back(V1);
      // ...
    }
    // ...
    for (Value *V : InVectors)
      VF = std::max(VF, getVF(V));
    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
      // ...
        CommonMask[Idx] = Mask[Idx] + (It == InVectors.begin() ? 0 : VF);
  }
  // ...
  Value *gather(ArrayRef<Value *> VL, // ...
                Value *Root = nullptr) {
    return R.gather(VL, Root, ScalarTy,
                    [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
                      return createShuffle(V1, V2, Mask);
                    });
  }
  // ...
  Value *finalize(ArrayRef<int> ExtMask,
                  ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
                  // ...
    IsFinalized = true;
    // ...
      if (InVectors.size() == 2) {
        Vec = createShuffle(Vec, InVectors.back(), CommonMask);
      // ...
        Vec = createShuffle(Vec, nullptr, CommonMask);
      // ...
      transformMaskAfterShuffle(CommonMask, CommonMask);
      // ...
             "Expected vector length for the final value before action.");
      // ...
        std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
        Vec = createShuffle(Vec, nullptr, ResizeMask);
      // ...
      Action(Vec, CommonMask, [this](Value *V1, Value *V2, ArrayRef<int> Mask) {
        return createShuffle(V1, V2, Mask);
      });
      InVectors.front() = Vec;
    // ...
    if (!SubVectors.empty()) {
      // ...
      if (InVectors.size() == 2) {
        Vec = createShuffle(Vec, InVectors.back(), CommonMask);
      // ...
        Vec = createShuffle(Vec, nullptr, CommonMask);
      // ...
      transformMaskAfterShuffle(CommonMask, CommonMask);
      auto CreateSubVectors = [&](Value *Vec,
                                  SmallVectorImpl<int> &CommonMask) {
        for (auto [E, Idx] : SubVectors) {
          Value *V = getVectorizedValue(*E);
          // ...
          Type *OrigScalarTy = ScalarTy;
          // ...
              Builder, Vec, V, InsertionIndex,
              std::bind(&ShuffleInstructionBuilder::createShuffle, this, _1,
                        _2,
                        // ...
          ScalarTy = OrigScalarTy;
          if (!CommonMask.empty()) {
            std::iota(std::next(CommonMask.begin(), Idx),
                      std::next(CommonMask.begin(),
                                Idx + E->getVectorFactor()),
                      // ...
          }
        }
        // ...
      };
      if (SubVectorsMask.empty()) {
        Vec = CreateSubVectors(Vec, CommonMask);
      } else {
        // ...
        copy(SubVectorsMask, SVMask.begin());
        for (auto [I1, I2] : zip(SVMask, CommonMask)) {
          // ...
            I1 = I2 + CommonMask.size();
        }
        // ...
        Vec = createShuffle(InsertVec, Vec, SVMask);
        transformMaskAfterShuffle(CommonMask, SVMask);
      }
      InVectors.front() = Vec;
    }
    if (!ExtMask.empty()) {
      if (CommonMask.empty()) {
        // ...
      } else {
        // ...
        for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
          // ...
          NewMask[I] = CommonMask[ExtMask[I]];
        }
        CommonMask.swap(NewMask);
      }
    }
    // ...
    if (CommonMask.empty()) {
      assert(InVectors.size() == 1 && "Expected only one vector with no mask");
      return InVectors.front();
    }
    if (InVectors.size() == 2)
      return createShuffle(InVectors.front(), InVectors.back(), CommonMask);
    return createShuffle(InVectors.front(), nullptr, CommonMask);
  }
  // ...
    assert((IsFinalized || CommonMask.empty()) &&
           "Shuffle construction must be finalized.");
Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx) {
  // ...
}
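// processBuildVector is parameterized over the builder type (here it is
// instantiated with the IR-emitting ShuffleInstructionBuilder; a cost-model
// counterpart presumably exists elsewhere in the file). It first tries to
// reuse extractelement sources, then shuffles of matching tree entries, and
// only falls back to a plain gather of the remaining scalars.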
template <typename BVTy, typename ResTy, typename... Args>
ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
                                  // ...
  assert(E->isGather() && "Expected gather node.");
  unsigned VF = E->getVectorFactor();
  // ...
  bool NeedFreeze = false;
  // ...
  for (auto [EIdx, Idx] : E->CombinedEntriesWithIndices)
    // ...
        .slice(Idx, VectorizableTree[EIdx]->getVectorFactor()),
    // ...
      E->CombinedEntriesWithIndices.size());
  transform(E->CombinedEntriesWithIndices, SubVectors.begin(),
            [&](const auto &P) {
              return std::make_pair(VectorizableTree[P.first].get(), P.second);
            });
  // ...
                                E->ReorderIndices.end());
  if (!ReorderMask.empty())
    // ...
  if (!SubVectors.empty() && !SubVectorsMask.empty()) {
    // ...
      if (E->Scalars[I] == GatheredScalars[ReorderMask[I]])
        // ...
      SubVectorsMask.clear();
  }
  // ...
                              unsigned I, unsigned SliceSize,
                              bool IsNotPoisonous) {
    // ...
      return isa<UndefValue>(V) && !isa<PoisonValue>(V);
    // ...
    TreeEntry *UserTE = E->UserTreeIndex.UserTE;
    unsigned EdgeIdx = E->UserTreeIndex.EdgeIdx;
    if (UserTE->getNumOperands() != 2)
      // ...
    if (!IsNotPoisonous) {
      auto *It =
          find_if(ArrayRef(VectorizableTree).drop_front(UserTE->Idx + 1),
                  [=](const std::unique_ptr<TreeEntry> &TE) {
                    return TE->UserTreeIndex.UserTE == UserTE &&
                           TE->UserTreeIndex.EdgeIdx != EdgeIdx;
                  });
      if (It == VectorizableTree.end())
        // ...
      if (!(*It)->ReorderIndices.empty()) {
        // ...
      }
      if (!all_of(zip(GatheredScalars, GS), [&](const auto &P) {
            Value *V0 = std::get<0>(P);
            Value *V1 = std::get<1>(P);
            // ...
          }))
        // ...
    }
    // ...
    if ((Mask.size() < InputVF &&
         // ...
        (Mask.size() == InputVF &&
         // ...
              std::next(Mask.begin(), I * SliceSize),
              std::next(Mask.begin(),
                        // ...
              std::next(Mask.begin(), I * SliceSize),
              std::next(Mask.begin(),
                        // ...
  };
  BVTy ShuffleBuilder(ScalarTy, Params...);
  ResTy Res = ResTy();
  // ...
  Value *ExtractVecBase = nullptr;
  bool UseVecBaseAsInput = false;
  // ...
  Type *OrigScalarTy = GatheredScalars.front()->getType();
  // ...
  bool Resized = false;
  // ...
      tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
  if (!ExtractShuffles.empty()) {
    // ...
    for (auto [Idx, I] : enumerate(ExtractMask)) {
      // ...
      ExtractEntries.append(TEs.begin(), TEs.end());
    }
    if (std::optional<ResTy> Delayed =
            ShuffleBuilder.needToDelay(E, ExtractEntries)) {
      // ...
      PostponedGathers.insert(E);
      // ...
    }
    if (Value *VecBase = ShuffleBuilder.adjustExtracts(
            E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) {
      ExtractVecBase = VecBase;
      // ...
      if (VF == VecBaseTy->getNumElements() &&
          GatheredScalars.size() != VF) {
        // ...
        GatheredScalars.append(VF - GatheredScalars.size(),
                               // ...
      }
    }
  }
  if (!ExtractShuffles.empty() || !E->hasState() ||
      E->getOpcode() != Instruction::Load ||
      (((E->hasState() && E->getOpcode() == Instruction::Load) ||
        // ...
          return isa<LoadInst>(V) && isVectorized(V);
        // ...
      (E->hasState() && E->isAltShuffle()) ||
      all_of(E->Scalars, [this](Value *V) { return isVectorized(V); }) ||
      // ...
      (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) {
    // ...
        isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts);
  }
  if (!GatherShuffles.empty()) {
    if (std::optional<ResTy> Delayed =
            ShuffleBuilder.needToDelay(E, Entries)) {
      // ...
      PostponedGathers.insert(E);
      // ...
    }
    if (GatherShuffles.size() == 1 &&
        // ...
        Entries.front().front()->isSame(E->Scalars)) {
      // ...
      LLVM_DEBUG(dbgs() << "SLP: perfect diamond match for gather bundle "
                        // ...
      Mask.resize(E->Scalars.size());
      const TreeEntry *FrontTE = Entries.front().front();
      if (FrontTE->ReorderIndices.empty() &&
          ((FrontTE->ReuseShuffleIndices.empty() &&
            E->Scalars.size() == FrontTE->Scalars.size()) ||
           (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) {
        std::iota(Mask.begin(), Mask.end(), 0);
      } else {
        // ...
          Mask[I] = FrontTE->findLaneForValue(V);
        // ...
      }
      ShuffleBuilder.resetForSameNode();
      ShuffleBuilder.add(*FrontTE, Mask);
      // ...
      Res = ShuffleBuilder.finalize(E->getCommonMask(), {}, {});
      // ...
    }
    if (GatheredScalars.size() != VF &&
        // ...
          return any_of(TEs, [&](const TreeEntry *TE) {
            return TE->getVectorFactor() == VF;
          });
        // ...
      GatheredScalars.append(VF - GatheredScalars.size(),
                             // ...
    }
    // ...
    for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
      // ...
    }
  }
  // ...
                           bool IsRootPoison) {
    // ...
    bool IsSplat = IsRootPoison && isSplat(Scalars) &&
                   // ...
    int NumNonConsts = 0;
    // ...
      Scalars.front() = OrigV;
      // ...
      const auto Res = UniquePositions.try_emplace(OrigV, I);
      Scalars[Res.first->second] = OrigV;
      ReuseMask[I] = Res.first->second;
    // ...
    if (NumNonConsts == 1) {
      // ...
      if (!UndefPos.empty() && UndefPos.front() == 0)
        // ...
      ReuseMask[SinglePos] = SinglePos;
    } else if (!UndefPos.empty() && IsSplat) {
      // ...
          (E->UserTreeIndex &&
           any_of(V->uses(), [E](const Use &U) {
             // ...
             return E->UserTreeIndex.EdgeIdx != U.getOperandNo() &&
                    is_contained(E->UserTreeIndex.UserTE->Scalars,
                                 // ...
           }));
      if (It != Scalars.end()) {
        // ...
        int Pos = std::distance(Scalars.begin(), It);
        for (int I : UndefPos) {
          // ...
          ReuseMask[I] = Pos;
        }
      } else {
        // ...
        for (int I : UndefPos) {
          // ...
        }
      }
    }
  };
  if (!ExtractShuffles.empty() || !GatherShuffles.empty()) {
    bool IsNonPoisoned = true;
    bool IsUsedInExpr = true;
    Value *Vec1 = nullptr;
    if (!ExtractShuffles.empty()) {
      // ...
      Value *Vec2 = nullptr;
      for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
        // ...
      }
      if (UseVecBaseAsInput) {
        Vec1 = ExtractVecBase;
      } else {
        for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
          // ...
          Value *VecOp = EI->getVectorOperand();
          // ...
              !TEs.empty() && TEs.front()->VectorizedValue)
            VecOp = TEs.front()->VectorizedValue;
          // ...
          } else if (Vec1 != VecOp) {
            assert((!Vec2 || Vec2 == VecOp) &&
                   "Expected only 1 or 2 vectors shuffle.");
            // ...
          }
        }
      }
      // ...
        IsUsedInExpr = false;
        // ...
        ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
      // ...
        IsUsedInExpr &= FindReusedSplat(
            // ...
            ExtractMask.size(), IsNotPoisonedVec);
        ShuffleBuilder.add(Vec1, ExtractMask, /*ForExtracts=*/true);
        IsNonPoisoned &= IsNotPoisonedVec;
      // ...
        IsUsedInExpr = false;
        // ...
    }
    if (!GatherShuffles.empty()) {
      unsigned SliceSize =
          // ...
      for (const auto [I, TEs] : enumerate(Entries)) {
        // ...
               "No shuffles with empty entries list expected.");
        // ...
        assert((TEs.size() == 1 || TEs.size() == 2) &&
               "Expected shuffle of 1 or 2 entries.");
        unsigned Limit = getNumElems(Mask.size(), SliceSize, I);
        // ...
        copy(SubMask, std::next(VecMask.begin(), I * SliceSize));
        if (TEs.size() == 1) {
          bool IsNotPoisonedVec =
              TEs.front()->VectorizedValue
                  // ...
          FindReusedSplat(VecMask, TEs.front()->getVectorFactor(), I,
                          SliceSize, IsNotPoisonedVec);
          ShuffleBuilder.add(*TEs.front(), VecMask);
          IsNonPoisoned &= IsNotPoisonedVec;
        } else {
          IsUsedInExpr = false;
          ShuffleBuilder.add(*TEs.front(), *TEs.back(), VecMask);
          if (TEs.front()->VectorizedValue && TEs.back()->VectorizedValue)
            // ...
        }
      }
    }
    // ...
    int EMSz = ExtractMask.size();
    int MSz = Mask.size();
    // ...
    bool IsSingleShuffle = ExtractShuffles.empty() || GatherShuffles.empty();
    bool IsIdentityShuffle =
        ((UseVecBaseAsInput ||
          // ...
              [](const std::optional<TTI::ShuffleKind> &SK) {
                // ...
          none_of(ExtractMask, [&](int I) { return I >= EMSz; })) &&
         // ...
         (!GatherShuffles.empty() &&
          // ...
              [](const std::optional<TTI::ShuffleKind> &SK) {
                // ...
          none_of(Mask, [&](int I) { return I >= MSz; }) &&
          // ...
    bool EnoughConstsForShuffle =
        // ...
        (!IsIdentityShuffle ||
         (GatheredScalars.size() == 2 &&
          // ...
    for (int I = 0, Sz = GatheredScalars.size(); I < Sz; ++I) {
      if (EnoughConstsForShuffle && isa<Constant>(GatheredScalars[I]))
        // ...
    }
    // ...
      TryPackScalars(GatheredScalars, BVMask, /*IsRootPoison=*/true);
      Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.size());
      ShuffleBuilder.add(BV, BVMask);
    // ...
    if ((IsSingleShuffle && ((IsIdentityShuffle &&
        // ...
      Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
                                    // ...
    else
      Res = ShuffleBuilder.finalize(
          E->ReuseShuffleIndices, SubVectors, SubVectorsMask,
          E->Scalars.size(),
          [&](Value *&Vec, SmallVectorImpl<int> &Mask, auto CreateShuffle) {
            bool IsSplat = isSplat(NonConstants);
            SmallVector<int> BVMask(Mask.size(), PoisonMaskElem);
            TryPackScalars(NonConstants, BVMask, /*IsRootPoison=*/false);
            auto CheckIfSplatIsProfitable = [&]() {
              // ...
              constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
              Value *V = *find_if_not(NonConstants, IsaPred<UndefValue>);
              if (isa<ExtractElementInst>(V) || isVectorized(V))
                // ...
              InstructionCost SplatCost = TTI->getVectorInstrCost(
                  Instruction::InsertElement, VecTy, CostKind, /*Index=*/0,
                  PoisonValue::get(VecTy), V);
              SmallVector<int> NewMask(Mask.begin(), Mask.end());
              for (auto [Idx, I] : enumerate(BVMask))
                if (I != PoisonMaskElem)
                  NewMask[Idx] = Mask.size();
              SplatCost += ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy,
                                            NewMask, CostKind);
              InstructionCost BVCost = TTI->getVectorInstrCost(
                  Instruction::InsertElement, VecTy, CostKind,
                  *find_if(Mask, [](int I) { return I != PoisonMaskElem; }),
                  // ...
              if (count(BVMask, PoisonMaskElem) <
                  static_cast<int>(BVMask.size() - 1)) {
                SmallVector<int> NewMask(Mask.begin(), Mask.end());
                for (auto [Idx, I] : enumerate(BVMask))
                  if (I != PoisonMaskElem)
                    // ...
                BVCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
                                           VecTy, NewMask, CostKind);
              }
              return SplatCost <= BVCost;
            };
            if (!IsSplat || Mask.size() <= 2 || !CheckIfSplatIsProfitable()) {
              // ...
              Vec = ShuffleBuilder.gather(NonConstants, Mask.size(), Vec);
              // ...
            } else {
              // ...
              Value *BV = ShuffleBuilder.gather(Values, BVMask.size());
              // ...
                return I == PoisonMaskElem ? PoisonMaskElem : 0;
              // ...
              BV = CreateShuffle(BV, nullptr, SplatMask);
              // ...
                  Mask[Idx] = BVMask.size() + Idx;
              Vec = CreateShuffle(Vec, BV, Mask);
            }
          });
  } else if (/* all scalars constant or reused */ true) {
    // ...
    TryPackScalars(GatheredScalars, ReuseMask, /*IsRootPoison=*/true);
    Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.size());
    ShuffleBuilder.add(BV, ReuseMask);
    Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
                                  // ...
  } else {
    // ...
    for (auto [I, V] : enumerate(GatheredScalars)) {
      // ...
    }
    Value *BV = ShuffleBuilder.gather(GatheredScalars);
    ShuffleBuilder.add(BV, Mask);
    Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
                                  // ...
  }
  if (NeedFreeze)
    Res = ShuffleBuilder.createFreeze(Res);
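// createBuildVector instantiates processBuildVector with the IR-emitting
// ShuffleInstructionBuilder to produce the actual gather/shuffle sequence
// for a gather node.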
Value *BoUpSLP::createBuildVector(const TreeEntry *E, Type *ScalarTy) {
  for (auto [EIdx, _] : E->CombinedEntriesWithIndices)
    // ...
  return processBuildVector<ShuffleInstructionBuilder, Value *>(E, ScalarTy,
                                                                // ...
}
// ...
  for (Value *V : VL)
    // ...
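// vectorizeTree(TreeEntry *E): per-node code generation. Gather nodes go
// through createBuildVector; SplitVectorize nodes concatenate their two
// combined sub-entries with shuffles; everything else dispatches on the
// (possibly alternate) opcode in the switch below, caching the result in
// E->VectorizedValue.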
  IRBuilderBase::InsertPointGuard Guard(Builder);
  // ...
  Value *V = E->Scalars.front();
  Type *ScalarTy = V->getType();
  // ...
  auto It = MinBWs.find(E);
  if (It != MinBWs.end()) {
    // ...
  }
  // ...
  if (E->VectorizedValue)
    return E->VectorizedValue;
  // ...
  if (E->isGather()) {
    // ...
    if (E->hasState() && E->Idx == 0 && !UserIgnoreList)
      setInsertPointAfterBundle(E);
    Value *Vec = createBuildVector(E, ScalarTy);
    E->VectorizedValue = Vec;
    // ...
  }
  if (E->State == TreeEntry::SplitVectorize) {
    assert(E->CombinedEntriesWithIndices.size() == 2 &&
           "Expected exactly 2 combined entries.");
    setInsertPointAfterBundle(E);
    // ...
        *VectorizableTree[E->CombinedEntriesWithIndices.front().first];
    assert(OpTE1.isSame(
               ArrayRef(E->Scalars).take_front(OpTE1.getVectorFactor())) &&
           "Expected same first part of scalars.");
    // ...
        *VectorizableTree[E->CombinedEntriesWithIndices.back().first];
    assert(OpTE2.isSame(
               ArrayRef(E->Scalars).take_back(OpTE2.getVectorFactor())) &&
           "Expected same second part of scalars.");
    auto GetOperandSignedness = [&](const TreeEntry *OpE) {
      bool IsSigned = false;
      auto It = MinBWs.find(OpE);
      if (It != MinBWs.end())
        IsSigned = It->second.second;
      // ...
        if (isa<PoisonValue>(V))
          // ...
        return !isKnownNonNegative(R, SimplifyQuery(*DL));
      // ...
    };
    // ...
      Op1 = Builder.CreateIntCast(
          // ...
          GetOperandSignedness(&OpTE1));
    // ...
      Op2 = Builder.CreateIntCast(
          // ...
          GetOperandSignedness(&OpTE2));
    // ...
    if (E->ReorderIndices.empty()) {
      // ...
      std::next(Mask.begin(), E->CombinedEntriesWithIndices.back().second),
      // ...
      if (ScalarTyNumElements != 1) {
        // ...
      }
      Value *Vec = Builder.CreateShuffleVector(Op1, Mask);
      // ...
          E->CombinedEntriesWithIndices.back().second * ScalarTyNumElements);
      E->VectorizedValue = Vec;
      // ...
    }
    unsigned CommonVF =
        std::max(OpTE1.getVectorFactor(), OpTE2.getVectorFactor());
    // ...
      std::iota(Mask.begin(), std::next(Mask.begin(), OpTE1.getVectorFactor()),
                // ...
      Op1 = Builder.CreateShuffleVector(Op1, Mask);
    // ...
      std::iota(Mask.begin(), std::next(Mask.begin(), OpTE2.getVectorFactor()),
                // ...
      Op2 = Builder.CreateShuffleVector(Op2, Mask);
    // ...
    Value *Vec = Builder.CreateShuffleVector(Op1, Op2, E->getSplitMask());
    E->VectorizedValue = Vec;
    // ...
  }
  bool IsReverseOrder =
      // ...
  auto FinalShuffle = [&](Value *V, const TreeEntry *E) {
    // ...
    if (E->getOpcode() == Instruction::Store &&
        E->State == TreeEntry::Vectorize) {
      ArrayRef<int> Mask =
          ArrayRef(reinterpret_cast<const int *>(E->ReorderIndices.begin()),
                   E->ReorderIndices.size());
      ShuffleBuilder.add(V, Mask);
    } else if ((E->State == TreeEntry::StridedVectorize && IsReverseOrder) ||
               E->State == TreeEntry::CompressVectorize) {
      ShuffleBuilder.addOrdered(V, {});
    } else {
      ShuffleBuilder.addOrdered(V, E->ReorderIndices);
    }
    // ...
        E->CombinedEntriesWithIndices.size());
    transform(
        E->CombinedEntriesWithIndices, SubVectors.begin(), [&](const auto &P) {
          return std::make_pair(VectorizableTree[P.first].get(), P.second);
        });
    assert(
        (E->CombinedEntriesWithIndices.empty() || E->ReorderIndices.empty()) &&
        "Expected either combined subnodes or reordering");
    return ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors, {});
  };
  assert(!E->isGather() && "Unhandled state");
  unsigned ShuffleOrOp =
      E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
  // ...
  auto GetOperandSignedness = [&](unsigned Idx) {
    const TreeEntry *OpE = getOperandEntry(E, Idx);
    bool IsSigned = false;
    auto It = MinBWs.find(OpE);
    if (It != MinBWs.end())
      IsSigned = It->second.second;
    // ...
      if (isa<PoisonValue>(V))
        // ...
      return !isKnownNonNegative(R, SimplifyQuery(*DL));
    // ...
  };
  switch (ShuffleOrOp) {
  case Instruction::PHI: {
    assert((E->ReorderIndices.empty() || !E->ReuseShuffleIndices.empty() ||
            E != VectorizableTree.front().get() || E->UserTreeIndex) &&
           "PHI reordering is free.");
    // ...
    Builder.SetInsertPoint(PH->getParent(),
                           PH->getParent()->getFirstNonPHIIt());
    // ...
    PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
    // ...
    Builder.SetInsertPoint(PH->getParent(),
                           PH->getParent()->getFirstInsertionPt());
    // ...
    V = FinalShuffle(V, E);
    E->VectorizedValue = V;
    // ...
    SmallPtrSet<BasicBlock *, 4> VisitedBBs;
    // ...
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      // ...
      if (!VisitedBBs.insert(IBB).second) {
        // ...
        TreeEntry *OpTE = getOperandEntry(E, I);
        assert(!OpTE->VectorizedValue && "Expected no vectorized value.");
        OpTE->VectorizedValue = VecOp;
        // ...
      }
      Value *Vec = vectorizeOperand(E, I);
      if (VecTy != Vec->getType()) {
        assert((// ...
                MinBWs.contains(getOperandEntry(E, I))) &&
               "Expected item in MinBWs.");
        Vec = Builder.CreateIntCast(Vec, VecTy, GetOperandSignedness(I));
      }
      // ...
           "Invalid number of incoming values");
    assert(E->VectorizedValue && "Expected vectorized value.");
    return E->VectorizedValue;
  }
  case Instruction::ExtractElement: {
    Value *V = E->getSingleOperand(0);
    setInsertPointAfterBundle(E);
    V = FinalShuffle(V, E);
    E->VectorizedValue = V;
    // ...
  }
  case Instruction::ExtractValue: {
    // ...
    Builder.SetInsertPoint(LI);
    Value *Ptr = LI->getPointerOperand();
    LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign());
    // ...
    NewV = FinalShuffle(NewV, E);
    E->VectorizedValue = NewV;
    // ...
  }
  case Instruction::InsertElement: {
    assert(E->ReuseShuffleIndices.empty() && "All inserts should be unique");
    if (const TreeEntry *OpE = getOperandEntry(E, 1);
        OpE && !OpE->isGather() && OpE->hasState() &&
        !OpE->hasCopyableElements())
      // ...
    else
      setInsertPointAfterBundle(E);
    Value *V = vectorizeOperand(E, 1);
    // ...
    Type *ScalarTy = Op.front()->getType();
    // ...
      std::pair<unsigned, bool> Res = MinBWs.lookup(getOperandEntry(E, 1));
      assert(Res.first > 0 && "Expected item in MinBWs.");
      V = Builder.CreateIntCast(
          // ...
    // ...
      return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
    // ...
    const unsigned NumElts =
        // ...
    const unsigned NumScalars = E->Scalars.size();
    // ...
    assert(Offset < NumElts && "Failed to find vector index offset");
    // ...
    SmallVector<int> Mask;
    if (!E->ReorderIndices.empty()) {
      // ...
    } else {
      std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0);
    }
    // ...
    bool IsIdentity = true;
    // ...
    Mask.swap(PrevMask);
    for (unsigned I = 0; I < NumScalars; ++I) {
      // ...
      IsIdentity &= InsertIdx - Offset == I;
    }
    // ...
    if (!IsIdentity || NumElts != NumScalars) {
      Value *V2 = nullptr;
      bool IsVNonPoisonous =
          // ...
      SmallVector<int> InsertMask(Mask);
      if (NumElts != NumScalars && Offset == 0) {
        // ...
        InsertMask[*InsertIdx] = *InsertIdx;
        if (!Ins->hasOneUse())
          // ...
            Ins->getUniqueUndroppableUser());
        // ...
        SmallBitVector UseMask =
            buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
        SmallBitVector IsFirstPoison =
            // ...
        SmallBitVector IsFirstUndef =
            // ...
        if (!IsFirstPoison.all()) {
          // ...
          for (unsigned I = 0; I < NumElts; I++) {
            // ...
                IsFirstUndef.test(I)) {
              if (IsVNonPoisonous) {
                InsertMask[I] = I < NumScalars ? I : 0;
                // ...
              }
              // ...
              if (Idx >= NumScalars)
                Idx = NumScalars - 1;
              InsertMask[I] = NumScalars + Idx;
            }
          }
        }
        // ...
        V = Builder.CreateShuffleVector(V, V2, InsertMask);
        // ...
          GatherShuffleExtractSeq.insert(I);
          CSEBlocks.insert(I->getParent());
        // ...
      }
      // ...
      for (unsigned I = 0; I < NumElts; I++) {
        // ...
      }
      SmallBitVector UseMask =
          buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
      SmallBitVector IsFirstUndef =
          // ...
      if ((!IsIdentity || Offset != 0 || !IsFirstUndef.all()) &&
          NumElts != NumScalars) {
        if (IsFirstUndef.all()) {
          // ...
          SmallBitVector IsFirstPoison =
              // ...
          if (!IsFirstPoison.all()) {
            for (unsigned I = 0; I < NumElts; I++) {
              // ...
                InsertMask[I] = I + NumElts;
            }
          }
          V = Builder.CreateShuffleVector(
              // ...
            GatherShuffleExtractSeq.insert(I);
            CSEBlocks.insert(I->getParent());
          // ...
        } else {
          SmallBitVector IsFirstPoison =
              // ...
          for (unsigned I = 0; I < NumElts; I++) {
            // ...
              InsertMask[I] += NumElts;
          }
          V = Builder.CreateShuffleVector(
              FirstInsert->getOperand(0), V, InsertMask,
              // ...
            GatherShuffleExtractSeq.insert(I);
            CSEBlocks.insert(I->getParent());
          // ...
        }
      }
    }
    ++NumVectorInstructions;
    E->VectorizedValue = V;
    // ...
  }
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    setInsertPointAfterBundle(E);
    Value *InVec = vectorizeOperand(E, 0);
    // ...
    auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
    // ...
        (SrcIt != MinBWs.end() || It != MinBWs.end() ||
         // ...
      unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy);
      if (SrcIt != MinBWs.end())
        SrcBWSz = SrcIt->second.first;
      unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType());
      if (BWSz == SrcBWSz) {
        VecOpcode = Instruction::BitCast;
      } else if (BWSz < SrcBWSz) {
        VecOpcode = Instruction::Trunc;
      } else if (It != MinBWs.end()) {
        assert(BWSz > SrcBWSz && "Invalid cast!");
        VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
      } else if (SrcIt != MinBWs.end()) {
        assert(BWSz > SrcBWSz && "Invalid cast!");
        VecOpcode =
            SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
      }
    } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
               !SrcIt->second.second) {
      VecOpcode = Instruction::UIToFP;
    }
    Value *V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
                   // ...
                   : Builder.CreateCast(VecOpcode, InVec, VecTy);
    V = FinalShuffle(V, E);
    E->VectorizedValue = V;
    ++NumVectorInstructions;
    // ...
  }
  case Instruction::FCmp:
  case Instruction::ICmp: {
    setInsertPointAfterBundle(E);
    Value *L = vectorizeOperand(E, 0);
    Value *R = vectorizeOperand(E, 1);
    if (L->getType() != R->getType()) {
      assert((// ...
              MinBWs.contains(getOperandEntry(E, 0)) ||
              MinBWs.contains(getOperandEntry(E, 1))) &&
             "Expected item in MinBWs.");
      // ...
              ->getIntegerBitWidth()) {
        Type *CastTy = R->getType();
        L = Builder.CreateIntCast(L, CastTy, GetOperandSignedness(0));
      } else {
        Type *CastTy = L->getType();
        R = Builder.CreateIntCast(R, CastTy, GetOperandSignedness(1));
      }
    }
    // ...
    Value *V = Builder.CreateCmp(P0, L, R);
    // ...
      ICmp->setSameSign(/*B=*/false);
    // ...
    V = FinalShuffle(V, E);
    E->VectorizedValue = V;
    ++NumVectorInstructions;
    // ...
  }
  case Instruction::Select: {
    setInsertPointAfterBundle(E);
    // ...
    Value *True = vectorizeOperand(E, 1);
    Value *False = vectorizeOperand(E, 2);
    // ...
    assert((// ...
            MinBWs.contains(getOperandEntry(E, 1)) ||
            MinBWs.contains(getOperandEntry(E, 2))) &&
           "Expected item in MinBWs.");
    if (True->getType() != VecTy)
      True = Builder.CreateIntCast(True, VecTy, GetOperandSignedness(1));
    if (False->getType() != VecTy)
      False = Builder.CreateIntCast(False, VecTy, GetOperandSignedness(2));
    // ...
    assert(TrueNumElements >= CondNumElements &&
           TrueNumElements % CondNumElements == 0 &&
           "Cannot vectorize Instruction::Select");
    // ...
           "Cannot vectorize Instruction::Select");
    if (CondNumElements != TrueNumElements) {
      // ...
      Cond = Builder.CreateShuffleVector(
          // ...
    }
    // ...
           "Cannot vectorize Instruction::Select");
    Value *V =
        Builder.CreateSelectWithUnknownProfile(Cond, True, False, DEBUG_TYPE);
    V = FinalShuffle(V, E);
    E->VectorizedValue = V;
    ++NumVectorInstructions;
    // ...
  }
  case Instruction::FNeg: {
    setInsertPointAfterBundle(E);
    Value *Op = vectorizeOperand(E, 0);
    Value *V = Builder.CreateUnOp(
        // ...
    V = FinalShuffle(V, E);
    E->VectorizedValue = V;
    ++NumVectorInstructions;
    // ...
  }
  case Instruction::Freeze: {
    setInsertPointAfterBundle(E);
    Value *Op = vectorizeOperand(E, 0);
    if (Op->getType() != VecTy) {
      assert((// ...
              MinBWs.contains(getOperandEntry(E, 0))) &&
             "Expected item in MinBWs.");
      Op = Builder.CreateIntCast(Op, VecTy, GetOperandSignedness(0));
    }
    Value *V = Builder.CreateFreeze(Op);
    V = FinalShuffle(V, E);
    E->VectorizedValue = V;
    ++NumVectorInstructions;
    // ...
  }
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    setInsertPointAfterBundle(E);
    // ...
    if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
      // ...
        return CI && CI->getValue().countr_one() >= It->second.first;
      // ...
        V = FinalShuffle(I == 0 ? RHS : LHS, E);
        E->VectorizedValue = V;
        ++NumVectorInstructions;
        // ...
    }
    // ...
    assert((// ...
            MinBWs.contains(getOperandEntry(E, 0)) ||
            MinBWs.contains(getOperandEntry(E, 1))) &&
           "Expected item in MinBWs.");
    // ...
      LHS = Builder.CreateIntCast(LHS, VecTy, GetOperandSignedness(0));
    // ...
      RHS = Builder.CreateIntCast(RHS, VecTy, GetOperandSignedness(1));
    // ...
    Value *V = Builder.CreateBinOp(
        // ...
    if (!MinBWs.contains(E) && ShuffleOrOp == Instruction::Sub &&
        // ...
          return isa<PoisonValue>(V) ||
                 (E->hasCopyableElements() && E->isCopyableElement(V)) ||
                 isCommutative(cast<Instruction>(V));
        // ...
        I->setHasNoUnsignedWrap(/*b=*/false);
    // ...
    V = FinalShuffle(V, E);
    E->VectorizedValue = V;
    ++NumVectorInstructions;
    // ...
  }
  case Instruction::Load: {
    // ...
    setInsertPointAfterBundle(E);
    // ...
    FixedVectorType *StridedLoadTy = nullptr;
    Value *PO = LI->getPointerOperand();
    if (E->State == TreeEntry::Vectorize) {
      NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign());
    } else if (E->State == TreeEntry::CompressVectorize) {
      auto [CompressMask, LoadVecTy, InterleaveFactor, IsMasked] =
          CompressEntryToData.at(E);
      Align CommonAlignment = LI->getAlign();
      // ...
      for (int I : CompressMask)
        // ...
      MaskValues = replicateMask(MaskValues, VecTy->getNumElements());
      // ...
      NewLI = Builder.CreateMaskedLoad(LoadVecTy, PO, CommonAlignment,
                                       // ...
      NewLI = Builder.CreateAlignedLoad(LoadVecTy, PO, CommonAlignment);
      // ...
    } else if (E->State == TreeEntry::StridedVectorize) {
      // ...
      PO = IsReverseOrder ? PtrN : Ptr0;
      Type *StrideTy = DL->getIndexType(PO->getType());
      // ...
      const StridedPtrInfo &SPtrInfo = TreeEntryToStridedPtrInfoMap.at(E);
      StridedLoadTy = SPtrInfo.Ty;
      assert(StridedLoadTy && "Missing StridedPoinerInfo for tree entry.");
      unsigned StridedLoadEC =
          // ...
      Value *Stride = SPtrInfo.StrideVal;
      // ...
        const SCEV *StrideSCEV = SPtrInfo.StrideSCEV;
        assert(StrideSCEV && "Neither StrideVal nor StrideSCEV were set.");
        SCEVExpander Expander(*SE, *DL, "strided-load-vec");
        Stride = Expander.expandCodeFor(StrideSCEV, StrideSCEV->getType(),
                                        &*Builder.GetInsertPoint());
      // ...
          Builder.CreateIntCast(Stride, StrideTy, /*isSigned=*/true);
      StrideVal = Builder.CreateMul(
          NewStride,
          ConstantInt::get(StrideTy,
                           (IsReverseOrder ? -1 : 1) *
                               // ...
                               DL->getTypeAllocSize(ScalarTy)));
      // ...
      auto *Inst = Builder.CreateIntrinsic(
          Intrinsic::experimental_vp_strided_load,
          {StridedLoadTy, PO->getType(), StrideTy},
          // ...
           Builder.getInt32(StridedLoadEC)});
      Inst->addParamAttr(
          // ...
          Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
      // ...
    } else {
      assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
      Value *VecPtr = vectorizeOperand(E, 0);
      // ...
      unsigned ScalarTyNumElements =
          // ...
      unsigned VecTyNumElements =
          // ...
      assert(VecTyNumElements % ScalarTyNumElements == 0 &&
             "Cannot expand getelementptr.");
      unsigned VF = VecTyNumElements / ScalarTyNumElements;
      // ...
        return Builder.getInt64(I % ScalarTyNumElements);
      // ...
      VecPtr = Builder.CreateGEP(
          VecTy->getElementType(),
          Builder.CreateShuffleVector(
              // ...
      NewLI = Builder.CreateMaskedGather(VecTy, VecPtr, CommonAlignment);
    }
    // ...
    Value *V = E->State == TreeEntry::CompressVectorize
                   // ...
    if (StridedLoadTy != VecTy)
      V = Builder.CreateBitOrPointerCast(V, VecTy);
    V = FinalShuffle(V, E);
    E->VectorizedValue = V;
    ++NumVectorInstructions;
    // ...
  }
  case Instruction::Store: {
    // ...
    setInsertPointAfterBundle(E);
    Value *VecValue = vectorizeOperand(E, 0);
    if (VecValue->getType() != VecTy)
      VecValue =
          Builder.CreateIntCast(VecValue, VecTy, GetOperandSignedness(0));
    VecValue = FinalShuffle(VecValue, E);
    Value *Ptr = SI->getPointerOperand();
    // ...
    if (E->State == TreeEntry::Vectorize) {
      ST = Builder.CreateAlignedStore(VecValue, Ptr, SI->getAlign());
    } else {
      assert(E->State == TreeEntry::StridedVectorize &&
             "Expected either strided or consecutive stores.");
      if (!E->ReorderIndices.empty()) {
        // ...
        Ptr = SI->getPointerOperand();
      }
      // ...
      Type *StrideTy = DL->getIndexType(SI->getPointerOperandType());
      auto *Inst = Builder.CreateIntrinsic(
          Intrinsic::experimental_vp_strided_store,
          {VecTy, Ptr->getType(), StrideTy},
          // ...
           ConstantInt::get(
               StrideTy, -static_cast<int>(DL->getTypeAllocSize(ScalarTy))),
           Builder.getAllOnesMask(VecTy->getElementCount()),
           Builder.getInt32(E->Scalars.size())});
      Inst->addParamAttr(
          // ...
          Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
      // ...
    }
    E->VectorizedValue = V;
    ++NumVectorInstructions;
    // ...
  }
  case Instruction::GetElementPtr: {
    // ...
    setInsertPointAfterBundle(E);
    Value *Op0 = vectorizeOperand(E, 0);
    // ...
    for (int J = 1, N = GEP0->getNumOperands(); J < N; ++J) {
      Value *OpVec = vectorizeOperand(E, J);
      // ...
    }
    Value *V = Builder.CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs);
    // ...
    for (Value *V : E->Scalars) {
      // ...
    }
    V = FinalShuffle(V, E);
    E->VectorizedValue = V;
    ++NumVectorInstructions;
    // ...
  }
  case Instruction::Call: {
    // ...
    setInsertPointAfterBundle(E);
    // ...
        CI, ID, VecTy->getNumElements(),
        It != MinBWs.end() ? It->second.first : 0, TTI);
    // ...
        VecCallCosts.first <= VecCallCosts.second;
    Value *ScalarArg = nullptr;
    // ...
      ScalarArg = CEI->getArgOperand(I);
      // ...
      if (ID == Intrinsic::abs && It != MinBWs.end() &&
          It->second.first < DL->getTypeSizeInBits(CEI->getType()))
        ScalarArg = Builder.getFalse();
      // ...
      Value *OpVec = vectorizeOperand(E, I);
      ScalarArg = CEI->getArgOperand(I);
      // ...
          It == MinBWs.end()) {
        // ...
        OpVec = Builder.CreateIntCast(OpVec, CastTy, GetOperandSignedness(I));
      } else if (It != MinBWs.end()) {
        OpVec = Builder.CreateIntCast(OpVec, VecTy, GetOperandSignedness(I));
      }
    // ...
    if (!UseIntrinsic) {
      // ...
      CF = VFDatabase(*CI).getVectorizedFunction(Shape);
      // ...
    }
    Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);
    // ...
    V = FinalShuffle(V, E);
    E->VectorizedValue = V;
    ++NumVectorInstructions;
    // ...
  }
  case Instruction::ShuffleVector: {
    // ...
      setInsertPointAfterBundle(E);
      Value *Src = vectorizeOperand(E, 0);
      // ...
      SmallVector<int> NewMask(ThisMask.size());
      // ...
        return SVSrc->getShuffleMask()[Mask];
      // ...
      V = Builder.CreateShuffleVector(SVSrc->getOperand(0),
                                      SVSrc->getOperand(1), NewMask);
      // ...
      V = Builder.CreateShuffleVector(Src, ThisMask);
      // ...
      V = FinalShuffle(V, E);
    // ...
           "Invalid Shuffle Vector Operand");
    // ...
      setInsertPointAfterBundle(E);
      LHS = vectorizeOperand(E, 0);
      RHS = vectorizeOperand(E, 1);
    // ...
      setInsertPointAfterBundle(E);
      LHS = vectorizeOperand(E, 0);
    // ...
    assert((It != MinBWs.end() ||
            getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
            getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
            MinBWs.contains(getOperandEntry(E, 0)) ||
            MinBWs.contains(getOperandEntry(E, 1))) &&
           "Expected item in MinBWs.");
    Type *CastTy = VecTy;
    // ...
            ->getIntegerBitWidth())
      // ...
      LHS = Builder.CreateIntCast(LHS, CastTy, GetOperandSignedness(0));
    // ...
      RHS = Builder.CreateIntCast(RHS, CastTy, GetOperandSignedness(1));
    // ...
      V0 = Builder.CreateBinOp(
          // ...
      V1 = Builder.CreateBinOp(
          // ...
      V0 = Builder.CreateCmp(CI0->getPredicate(), LHS, RHS);
      // ...
      V1 = Builder.CreateCmp(AltPred, LHS, RHS);
    // ...
      unsigned SrcBWSz = DL->getTypeSizeInBits(
          // ...
      unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
      if (BWSz <= SrcBWSz) {
        if (BWSz < SrcBWSz)
          LHS = Builder.CreateIntCast(LHS, VecTy, It->second.first);
        // ...
               "Expected same type as operand.");
        // ...
        E->VectorizedValue = LHS;
        ++NumVectorInstructions;
        // ...
      }
      // ...
      V0 = Builder.CreateCast(
          // ...
      V1 = Builder.CreateCast(
          // ...
    // ...
    for (Value *V : {V0, V1}) {
      // ...
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      // ...
    }
    // ...
    SmallVector<int> Mask;
    E->buildAltOpShuffleMask(
        [E, this](Instruction *I) {
          assert(E->getMatchingMainOpOrAltOp(I) &&
                 "Unexpected main/alternate opcode");
          // ...
        },
        Mask, &OpScalars, &AltScalars);
    // ...
    auto DropNuwFlag = [&](Value *Vec, unsigned Opcode) {
      // ...
          I && Opcode == Instruction::Sub && !MinBWs.contains(E) &&
          // ...
            if (isa<PoisonValue>(V))
              // ...
            if (E->hasCopyableElements() && E->isCopyableElement(V))
              // ...
            auto *IV = cast<Instruction>(V);
            return IV->getOpcode() == Instruction::Sub && isCommutative(IV);
          // ...
        I->setHasNoUnsignedWrap(/*b=*/false);
    };
    DropNuwFlag(V0, E->getOpcode());
    DropNuwFlag(V1, E->getAltOpcode());
    // ...
    V = Builder.CreateShuffleVector(V0, V1, Mask);
    // ...
      GatherShuffleExtractSeq.insert(I);
      CSEBlocks.insert(I->getParent());
    // ...
    E->VectorizedValue = V;
    ++NumVectorInstructions;
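// Top-level vectorizeTree: schedules every block, pre-computes bundle
// insertion points, re-emits postponed gather nodes (casting when operand
// bitwidths were shrunk via MinBWs), and finally rewrites external uses of
// vectorized scalars with extractelement instructions, reusing cached
// extracts per block where possible.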
20283 ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales) {
20286 EntryToLastInstruction.clear();
20288 for (
auto &BSIter : BlocksSchedules)
20289 scheduleBlock(*
this, BSIter.second.get());
20292 for (
const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
20293 if (TE->isGather())
20295 (void)getLastInstructionInBundle(TE.get());
20299 Builder.SetInsertPoint(ReductionRoot->
getParent(),
20302 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
20306 for (
const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
20307 if (TE->isGather() && !TE->VectorizedValue && TE->UserTreeIndex.UserTE &&
20308 TE->UserTreeIndex.UserTE->hasState() &&
20309 TE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
20310 (TE->UserTreeIndex.UserTE->getOpcode() != Instruction::PHI ||
20311 TE->UserTreeIndex.UserTE->isAltShuffle()) &&
20312 !TE->UserTreeIndex.UserTE->hasCopyableElements() &&
20313 all_of(TE->UserTreeIndex.UserTE->Scalars,
20314 [](
Value *V) { return isUsedOutsideBlock(V); })) {
20316 getLastInstructionInBundle(TE->UserTreeIndex.UserTE);
20320 for (
auto &Entry : GatherEntries) {
20322 Builder.SetInsertPoint(Entry.second);
20323 Builder.SetCurrentDebugLocation(Entry.second->getDebugLoc());
20328 for (
const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
20329 if (GatheredLoadsEntriesFirst.has_value() &&
20330 TE->Idx >= *GatheredLoadsEntriesFirst && !TE->VectorizedValue &&
20331 (!TE->isGather() || TE->UserTreeIndex)) {
20332 assert((TE->UserTreeIndex ||
20333 (TE->getOpcode() == Instruction::Load && !TE->isGather())) &&
20334 "Expected gathered load node.");
20343 for (
const TreeEntry *E : PostponedNodes) {
20344 auto *TE =
const_cast<TreeEntry *
>(E);
20346 TE->VectorizedValue =
nullptr;
20357 (TE->UserTreeIndex.UserTE->hasState() &&
20358 TE->UserTreeIndex.UserTE->getOpcode() == Instruction::PHI)) {
20367 if (UI->comesBefore(InsertPt))
20370 Builder.SetInsertPoint(InsertPt);
20372 Builder.SetInsertPoint(PrevVec);
20374 Builder.SetCurrentDebugLocation(UserI->getDebugLoc());
20377 VecI && VecI->getParent() == Builder.GetInsertBlock() &&
20378 Builder.GetInsertPoint()->comesBefore(VecI))
20379 VecI->moveBeforePreserving(*Builder.GetInsertBlock(),
20380 Builder.GetInsertPoint());
20381 if (Vec->
getType() != PrevVec->getType()) {
20383 PrevVec->getType()->isIntOrIntVectorTy() &&
20384 "Expected integer vector types only.");
20385 std::optional<bool> IsSigned;
20386 for (
Value *V : TE->Scalars) {
20388 for (
const TreeEntry *MNTE : getTreeEntries(V)) {
20389 auto It = MinBWs.find(MNTE);
20390 if (It != MinBWs.end()) {
20391 IsSigned = IsSigned.value_or(
false) || It->second.second;
20396 if (IsSigned.value_or(
false))
20399 for (
const TreeEntry *BVE : ValueToGatherNodes.lookup(V)) {
20400 auto It = MinBWs.find(BVE);
20401 if (It != MinBWs.end()) {
20402 IsSigned = IsSigned.value_or(
false) || It->second.second;
20407 if (IsSigned.value_or(
false))
20411 IsSigned.value_or(
false) ||
20415 if (IsSigned.value_or(
false))
20419 if (IsSigned.value_or(
false)) {
20421 auto It = MinBWs.find(TE->UserTreeIndex.UserTE);
20422 if (It != MinBWs.end())
20423 IsSigned = It->second.second;
20426 "Expected user node or perfect diamond match in MinBWs.");
20427 Vec = Builder.CreateIntCast(Vec, PrevVec->
getType(), *IsSigned);
20429 PrevVec->replaceAllUsesWith(Vec);
20430 PostponedValues.
try_emplace(Vec).first->second.push_back(TE);
20433 auto It = PostponedValues.
find(PrevVec);
20434 if (It != PostponedValues.
end()) {
20435 for (TreeEntry *VTE : It->getSecond())
20436 VTE->VectorizedValue = Vec;
  // Extract each externally used scalar from its lane in the vectorized
  // value and rewire the scalar uses.
  for (const auto &ExternalUse : ExternalUses) {
    Value *Scalar = ExternalUse.Scalar;
    // ...
    const TreeEntry *E = &ExternalUse.E;
    assert(E && "Invalid scalar");
    assert(!E->isGather() && "Extracting from a gather list");
    // ...
    if (E->getOpcode() == Instruction::GetElementPtr &&
        /* ... */)
      continue;
    Value *Vec = E->VectorizedValue;
    assert(Vec && "Can't find vectorizable value");

    Value *Lane = Builder.getInt32(ExternalUse.Lane);
    auto ExtractAndExtendIfNeeded = [&](Value *Vec) {
      if (Scalar->getType() != Vec->getType()) {
        Value *Ex = nullptr;
        Value *ExV = nullptr;
        // ...
        bool ReplaceInst = Inst && ExternalUsesAsOriginalScalar.contains(Inst);
        auto It = ScalarToEEs.find(Scalar);
        if (It != ScalarToEEs.end()) {
          // No need to emit many extracts, just move the only one in the
          // current block.
          auto EEIt = It->second.find(ReplaceInst ? Inst->getParent()
                                                  : Builder.GetInsertBlock());
          if (EEIt != It->second.end()) {
            Value *PrevV = EEIt->second.first;
            if (auto *I = dyn_cast<Instruction>(PrevV);
                I && !ReplaceInst &&
                Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() &&
                Builder.GetInsertPoint()->comesBefore(I)) {
              I->moveBefore(*Builder.GetInsertPoint()->getParent(),
                            Builder.GetInsertPoint());
              // ...
            }
            // ...
            ExV = EEIt->second.second ? EEIt->second.second : Ex;
          }
        }
        if (!Ex) {
          // "Reuse" the existing extract to improve final codegen.
          // ...
            IgnoredExtracts.insert(EE);
          // ...
            auto *CloneInst = Inst->clone();
            CloneInst->insertBefore(Inst->getIterator());
            if (Inst->hasName())
              CloneInst->takeName(Inst);
            // ...
            Value *V = ES->getVectorOperand();
            // ...
              V = ETEs.front()->VectorizedValue;
            if (!IV || IV == Vec || IV->getParent() != IVec->getParent() ||
                IV->comesBefore(IVec))
              Ex = Builder.CreateExtractElement(V, ES->getIndexOperand());
            else
              Ex = Builder.CreateExtractElement(Vec, Lane);
          } else if (auto *VecTy = /* ... */) {
            unsigned VecTyNumElements = VecTy->getNumElements();
            // ...
                ExternalUse.Lane * VecTyNumElements);
          } else {
            Ex = Builder.CreateExtractElement(Vec, Lane);
          }
          // If necessary, sign-/zero-extend to the original scalar type.
          if (Scalar->getType() != Ex->getType())
            ExV = Builder.CreateIntCast(/* ... */);
          // ...
                                 : &F->getEntryBlock(),
                             std::make_pair(Ex, ExV));
        }
        // ...
          GatherShuffleExtractSeq.insert(ExI);
          CSEBlocks.insert(ExI->getParent());
        // ...
      }
      // ...
             "In-tree scalar of vector type is not insertelement?");
      // ...
    };
    // If User == nullptr, the Scalar remains regardless of users.
    if (!ScalarsWithNullptrUser.insert(Scalar).second)
      continue;
    assert((ExternallyUsedValues.count(Scalar) ||
            ExternalUsesWithNonUsers.count(Scalar) ||
            ExternalUsesAsOriginalScalar.contains(Scalar) ||
            any_of(
                Scalar->users(),
                [&](User *U) {
                  if (ExternalUsesAsOriginalScalar.contains(U))
                    return true;
                  ArrayRef<TreeEntry *> UseEntries = getTreeEntries(U);
                  return !UseEntries.empty() &&
                         (E->State == TreeEntry::Vectorize ||
                          E->State == TreeEntry::StridedVectorize ||
                          E->State == TreeEntry::CompressVectorize) &&
                         any_of(UseEntries, [&, TTI = TTI](TreeEntry *UseEntry) {
                           return (UseEntry->State == TreeEntry::Vectorize ||
                                   UseEntry->State ==
                                       TreeEntry::StridedVectorize ||
                                   UseEntry->State ==
                                       TreeEntry::CompressVectorize) &&
                                  doesInTreeUserNeedToExtract(
                                      Scalar,
                                      getRootEntryInstruction(*UseEntry),
                                      TLI, TTI);
                         });
                })) &&
           "Scalar with nullptr User must be registered in "
           "ExternallyUsedValues map or remain as scalar in vectorized "
           "instructions");
    // ...
      if (auto *PHI = dyn_cast<PHINode>(VecI)) {
        if (PHI->getParent()->isLandingPad())
          Builder.SetInsertPoint(
              PHI->getParent(),
              std::next(
                  PHI->getParent()->getLandingPadInst()->getIterator()));
        else
          Builder.SetInsertPoint(PHI->getParent(),
                                 PHI->getParent()->getFirstNonPHIIt());
      } else {
        Builder.SetInsertPoint(VecI->getParent(),
                               std::next(VecI->getIterator()));
      }
    // ...
      Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
    Value *NewInst = ExtractAndExtendIfNeeded(Vec);
    // Required to update internally referenced instructions.
    if (Scalar != NewInst) {
      assert(/* ... */ "Extractelements should not be replaced.");
      Scalar->replaceAllUsesWith(NewInst);
    }
    // ...
    // ...
      if (!UsedInserts.insert(VU).second)
        continue;
      // Need to use the original vector, if the root is truncated.
      auto BWIt = MinBWs.find(E);
      if (BWIt != MinBWs.end() /* ... */) {
        // ...
        auto *ScalarTy = FTy->getElementType();
        auto Key = std::make_pair(Vec, ScalarTy);
        auto VecIt = VectorCasts.find(Key);
        if (VecIt == VectorCasts.end()) {
          // ...
          if (auto *IVec = dyn_cast<PHINode>(Vec)) {
            if (IVec->getParent()->isLandingPad())
              Builder.SetInsertPoint(IVec->getParent(),
                                     std::next(IVec->getParent()
                                                   ->getLandingPadInst()
                                                   ->getIterator()));
            else
              Builder.SetInsertPoint(
                  IVec->getParent()->getFirstNonPHIOrDbgOrLifetime());
          } else if (auto *IVec = dyn_cast<Instruction>(Vec)) {
            Builder.SetInsertPoint(IVec->getNextNode());
          }
          Vec = Builder.CreateIntCast(
              Vec, /* ... */,
              BWIt->second.second);
          // ...
        } else {
          Vec = VecIt->second;
        }
      }

      auto It = find_if(
          ShuffledInserts, [VU](const ShuffledInsertData<Value *> &Data) {
            // Checks if 2 insertelements are from the same buildvector.
            // ...
          });
      // ...
      unsigned Idx = *InsertIdx;
      if (It == ShuffledInserts.end()) {
        // ...
        It = std::next(ShuffledInserts.begin(), ShuffledInserts.size() - 1);
        // ...
      }
      // ...
      Mask[Idx] = ExternalUse.Lane;
      // ...

    // ...
    if (auto *PH = dyn_cast<PHINode>(User)) {
      for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
        if (PH->getIncomingValue(I) == Scalar) {
          // ...
              PH->getIncomingBlock(I)->getTerminator();
          // ...
            Builder.SetInsertPoint(VecI->getParent(),
                                   std::next(VecI->getIterator()));
          // ...
            Builder.SetInsertPoint(PH->getIncomingBlock(I)->getTerminator());
          // ...
          Value *NewInst = ExtractAndExtendIfNeeded(Vec);
          PH->setOperand(I, NewInst);
        }
      }
    } else {
      // ...
      Value *NewInst = ExtractAndExtendIfNeeded(Vec);
      // ...
    }
    // ...
      Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
      Value *NewInst = ExtractAndExtendIfNeeded(Vec);
    // ...

  // Create a new vector from the two input vectors' lanes according to the
  // combined shuffle mask.
  auto CreateShuffle = [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
    SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
    SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
    int VF = cast<FixedVectorType>(V1->getType())->getNumElements();
    for (int I = 0, E = Mask.size(); I < E; ++I) {
      if (Mask[I] < VF)
        CombinedMask1[I] = Mask[I];
      else
        CombinedMask2[I] = Mask[I] - VF;
    }
    ShuffleInstructionBuilder ShuffleBuilder(/* ... */);
    ShuffleBuilder.add(V1, CombinedMask1);
    if (V2)
      ShuffleBuilder.add(V2, CombinedMask2);
    return ShuffleBuilder.finalize({}, {}, {});
  };
  auto &&ResizeToVF = [&CreateShuffle](Value *Vec, ArrayRef<int> Mask,
                                       bool ForSingleMask) {
    unsigned VF = Mask.size();
    // ...
    if (any_of(Mask,
               [VF](int Idx) { return Idx >= static_cast<int>(VF); })) {
      Vec = CreateShuffle(Vec, nullptr, Mask);
      return std::make_pair(Vec, true);
    }
    if (!ForSingleMask) {
      SmallVector<int> ResizeMask(VF, PoisonMaskElem);
      for (unsigned I = 0; I < VF; ++I) {
        if (Mask[I] != PoisonMaskElem)
          ResizeMask[Mask[I]] = Mask[I];
      }
      Vec = CreateShuffle(Vec, nullptr, ResizeMask);
    }
    // ...
    return std::make_pair(Vec, false);
  };
  // Perform shuffling of the vectorize tree entries for better handling of
  // external extracts.
  for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
    InsertElementInst *FirstInsert = ShuffledInserts[I].InsertElements.front();
    InsertElementInst *LastInsert = ShuffledInserts[I].InsertElements.back();
    Builder.SetInsertPoint(LastInsert);
    auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
    // ...
          return cast<VectorType>(Vec->getType())
              ->getElementCount()
              .getKnownMinValue();
    // ...
        [FirstInsert, &CreateShuffle](ArrayRef<int> Mask,
                                      ArrayRef<Value *> Vals) {
          assert((Vals.size() == 1 || Vals.size() == 2) &&
                 "Expected exactly 1 or 2 input values.");
          if (Vals.size() == 1) {
            // Do not create shuffle if the mask is a simple identity
            // non-resizing mask.
            if (Mask.size() != cast<FixedVectorType>(Vals.front()->getType())
                                   ->getNumElements() ||
                !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
              return CreateShuffle(Vals.front(), nullptr, Mask);
            return Vals.front();
          }
          return CreateShuffle(Vals.front() ? Vals.front()
                                            : FirstInsert->getOperand(0),
                               Vals.back(), Mask);
        });
    auto It = ShuffledInserts[I].InsertElements.rbegin();
    // Rebuild the buildvector chain.
    InsertElementInst *II = nullptr;
    if (It != ShuffledInserts[I].InsertElements.rend())
      II = *It;
    // ...
    while (It != ShuffledInserts[I].InsertElements.rend()) {
      assert(II && "Must be an insertelement instruction.");
      // ...
    }
    for (Instruction *II : reverse(Inserts)) {
      II->replaceUsesOfWith(II->getOperand(0), NewInst);
      // ...
      if (II->getParent() == NewI->getParent() && II->comesBefore(NewI))
        II->moveAfter(NewI);
      // ...
    }
    for (InsertElementInst *IE : reverse(ShuffledInserts[I].InsertElements)) {
      IE->replaceUsesOfWith(IE->getOperand(0),
                            /* ... */);
      IE->replaceUsesOfWith(IE->getOperand(1),
                            /* ... */);
      // ...
    }
    CSEBlocks.insert(LastInsert->getParent());
  }
  // For each vectorized value:
  for (auto &TEPtr : VectorizableTree) {
    TreeEntry *Entry = TEPtr.get();

    // No need to handle users of gathered values.
    if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize)
      continue;

    assert(Entry->VectorizedValue && "Can't find vectorizable value");

    // For each lane:
    for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
      // ...
      if (Entry->getOpcode() == Instruction::GetElementPtr &&
          /* ... */)
        continue;
      // ...
          EE && IgnoredExtracts.contains(EE))
        continue;
      // ...
#ifndef NDEBUG
      for (User *U : Scalar->users()) {
        // ...
        assert((isVectorized(U) ||
                (UserIgnoreList && UserIgnoreList->contains(U)) ||
                /* ... */) &&
               "Deleting out-of-tree value");
      }
#endif
      LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
      // ...
    }
  }
  // ...
    V->mergeDIAssignID(RemovedInsts);
  // ...
  if (UserIgnoreList) {
    for (Instruction *I : RemovedInsts) {
      const TreeEntry *IE = getTreeEntries(I).front();
      if (IE->Idx != 0 &&
          !(VectorizableTree.front()->isGather() && IE->UserTreeIndex &&
            (ValueToGatherNodes.lookup(I).contains(
                 VectorizableTree.front().get()) ||
             (IE->UserTreeIndex.UserTE == VectorizableTree.front().get() &&
              IE->UserTreeIndex.EdgeIdx == UINT_MAX))) &&
          !(VectorizableTree.front()->State == TreeEntry::SplitVectorize &&
            IE->UserTreeIndex &&
            /* ... */) &&
          !(GatheredLoadsEntriesFirst.has_value() &&
            IE->Idx >= *GatheredLoadsEntriesFirst &&
            VectorizableTree.front()->isGather() &&
            /* ... */) &&
          !(!VectorizableTree.front()->isGather() &&
            VectorizableTree.front()->isCopyableElement(I)))
        continue;
      // ...
        bool IsPoisoningLogicalOp = isa<SelectInst>(U.getUser()) &&
                                    (match(U.getUser(), m_LogicalAnd()) ||
                                     match(U.getUser(), m_LogicalOr())) &&
                                    U.getOperandNo() == 0;
        if (IsPoisoningLogicalOp) {
          LogicalOpSelects.push_back(cast<SelectInst>(U.getUser()));
          return false;
        }
        return UserIgnoreList->contains(U.getUser());
      // ...
    }
  }
  // Cannot leave the poisoning conditions as-is after the scalars are
  // erased.
  for (SelectInst *SI : LogicalOpSelects)
    // ... (replace the poisoning condition)
  // ...
  Builder.ClearInsertionPoint();
  InstrElementSize.clear();
  const TreeEntry &RootTE = *VectorizableTree.front();
  Value *Vec = RootTE.VectorizedValue;
  if (auto It = MinBWs.find(&RootTE);
      ReductionBitWidth != 0 && It != MinBWs.end() &&
      ReductionBitWidth != It->second.first) {
    IRBuilder<>::InsertPointGuard Guard(Builder);
    Builder.SetInsertPoint(ReductionRoot->getParent(),
                           ReductionRoot->getIterator());
    Vec = Builder.CreateIntCast(
        Vec,
        VectorType::get(Builder.getIntNTy(ReductionBitWidth),
                        /* ... */),
        It->second.second);
  }
  // ...

void BoUpSLP::optimizeGatherSequence() {
  LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherShuffleExtractSeq.size()
                    << " gather sequences instructions.\n");
  // LICM the gather sequences: hoist loop-invariant inserts/shuffles into
  // the preheader.
  // ...
    Loop *L = LI->getLoopFor(I->getParent());
    if (!L)
      continue;
    // ...
    BasicBlock *PreHeader = L->getLoopPreheader();
    if (!PreHeader)
      continue;
    // If the vector or any of its operands are defined inside this loop, we
    // can't hoist it.
    if (any_of(I->operands(), [&](Value *V) {
          auto *OpI = dyn_cast<Instruction>(V);
          return OpI && L->contains(OpI);
        }))
      continue;
    // We can hoist this instruction. Move it to the pre-header.
    // ...
    CSEBlocks.insert(PreHeader);
  // ...

  // Make a list of all reachable blocks in our CSE queue.
  SmallVector<const DomTreeNode *, 8> CSEWorkList;
  CSEWorkList.reserve(CSEBlocks.size());
  for (BasicBlock *BB : CSEBlocks)
    if (DomTreeNode *N = DT->getNode(BB)) {
      assert(DT->isReachableFromEntry(N));
      CSEWorkList.push_back(N);
    }

  // Sort blocks by domination, so we visit a block after all blocks
  // dominating it are visited.
  llvm::sort(CSEWorkList, [](const DomTreeNode *A, const DomTreeNode *B) {
    assert((A == B) == (A->getDFSNumIn() == B->getDFSNumIn()) &&
           "Different nodes should have different DFS numbers");
    return A->getDFSNumIn() < B->getDFSNumIn();
  });
  // Less defined shuffles can be replaced by the more defined copies.
  // Between two shuffles one is less defined if it has the same vector
  // operands and its mask indices are the same as in the first one or
  // undefs.
  auto &&IsIdenticalOrLessDefined = [TTI = TTI](Instruction *I1,
                                                Instruction *I2,
                                                SmallVectorImpl<int> &NewMask) {
    if (I1->getType() != I2->getType())
      return false;
    // ...
    if (!SI1 || !SI2)
      return I1->isIdenticalTo(I2);
    if (SI1->isIdenticalTo(SI2))
      return true;
    for (int I = 0, E = SI1->getNumOperands(); I < E; ++I)
      if (SI1->getOperand(I) != SI2->getOperand(I))
        return false;
    // Check if the second instruction is more defined than the first one.
    NewMask.assign(SI2->getShuffleMask().begin(), SI2->getShuffleMask().end());
    // ...
    // Count trailing undefs in the mask to check the final number of used
    // registers.
    unsigned LastUndefsCnt = 0;
    for (int I = 0, E = NewMask.size(); I < E; ++I) {
      // ...
          NewMask[I] != SM1[I])
        return false;
      // ...
        NewMask[I] = SM1[I];
    }
    // Check if the last undefs actually change the final number of used
    // vector registers.
    return SM1.size() - LastUndefsCnt > 1 &&
           /* ... */
               SM1.size() - LastUndefsCnt));
  };
  // Perform O(N^2) search over the gather/shuffle sequences and merge
  // identical instructions.
  for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
    assert((I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
           "Worklist not sorted properly!");
    // ...
        !GatherShuffleExtractSeq.contains(&In))
      continue;

    // Check if we can replace this instruction with any of the visited
    // instructions.
    bool Replaced = false;
    // ...
      if (IsIdenticalOrLessDefined(&In, V, NewMask) &&
          DT->dominates(V->getParent(), In.getParent())) {
        In.replaceAllUsesWith(V);
        // ...
        if (!NewMask.empty())
          SI->setShuffleMask(NewMask);
        // ...
      }
      if (/* ... */ GatherShuffleExtractSeq.contains(V) &&
          IsIdenticalOrLessDefined(V, &In, NewMask) &&
          DT->dominates(In.getParent(), V->getParent())) {
        // ...
        V->replaceAllUsesWith(&In);
        // ...
        if (!NewMask.empty())
          SI->setShuffleMask(NewMask);
        // ...
      }
    // ...
    if (!Replaced)
      Visited.push_back(&In);
  }
  // ...
  GatherShuffleExtractSeq.clear();
}
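
// Note on the next routine: it groups the scheduled entities for the
// scalars in VL into a new ScheduleBundle. Copyable elements get dedicated
// ScheduleCopyableData nodes; regular members reuse the per-instruction
// ScheduleData.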
BoUpSLP::ScheduleBundle &BoUpSLP::BlockScheduling::buildBundle(
    ArrayRef<Value *> VL, const InstructionsState &S, const EdgeInfo &EI) {
  // ...
  ScheduledBundlesList.emplace_back(std::make_unique<ScheduleBundle>());
  // ...
  for (Value *V : VL) {
    if (S.isNonSchedulable(V))
      continue;
    auto *I = cast<Instruction>(V);
    if (S.isCopyableElement(V)) {
      // ...
      ScheduleCopyableData &SD =
          addScheduleCopyableData(EI, I, SchedulingRegionID, *BundlePtr);
      // ...
      BundlePtr->add(&SD);
      continue;
    }
    ScheduleData *BundleMember = getScheduleData(V);
    assert(BundleMember && "no ScheduleData for bundle member "
                           "(maybe not in same basic block)");
    // ...
    BundlePtr->add(BundleMember);
    ScheduledBundles.try_emplace(I).first->getSecond().push_back(
        BundlePtr.get());
  }
  assert(BundlePtr && *BundlePtr && "Failed to find schedule bundle");
  return *BundlePtr;
}
std::optional<BoUpSLP::ScheduleBundle *>
BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
                                            const InstructionsState &S,
                                            const EdgeInfo &EI) {
  // ...
  if (S.areInstructionsWithCopyableElements() && EI && EI.UserTE->hasState() &&
      EI.UserTE->doesNotNeedToSchedule() &&
      EI.UserTE->getOpcode() != Instruction::PHI &&
      any_of(VL, [&](Value *V) {
        auto *I = dyn_cast<Instruction>(V);
        if (!I || I->hasOneUser())
          return false;
        for (User *U : I->users()) {
          auto *UI = cast<Instruction>(U);
          if (isa<BinaryOperator>(UI))
            return true;
          // ...
        }
        return false;
      }))
    return std::nullopt;
  if (S.areInstructionsWithCopyableElements() && EI && EI.UserTE->hasState() &&
      EI.UserTE->hasCopyableElements() &&
      EI.UserTE->getMainOp()->getParent() == S.getMainOp()->getParent() &&
      any_of(VL, [&](Value *V) {
        if (S.isCopyableElement(V))
          /* ... */;
        // ...
      }))
    return std::nullopt;
  // ...
  if (S.areInstructionsWithCopyableElements() && any_of(VL, [&](Value *V) {
        // ...
      }))
    return std::nullopt;
  if (S.areInstructionsWithCopyableElements() && EI) {
    bool IsNonSchedulableWithParentPhiNode =
        EI.UserTE->doesNotNeedToSchedule() && EI.UserTE->UserTreeIndex &&
        EI.UserTE->UserTreeIndex.UserTE->hasState() &&
        EI.UserTE->UserTreeIndex.UserTE->State != TreeEntry::SplitVectorize &&
        EI.UserTE->UserTreeIndex.UserTE->getOpcode() == Instruction::PHI;
    if (IsNonSchedulableWithParentPhiNode) {
      SmallSet<std::pair<Value *, Value *>, 4> Values;
      for (const auto [Idx, V] :
           enumerate(EI.UserTE->UserTreeIndex.UserTE->Scalars)) {
        Value *Op = EI.UserTE->UserTreeIndex.UserTE->getOperand(
            EI.UserTE->UserTreeIndex.EdgeIdx)[Idx];
        // ...
        if (!Values.insert(std::make_pair(V, Op)).second)
          return std::nullopt;
      }
    }
  }
  bool HasCopyables = S.areInstructionsWithCopyableElements();
  if (/* ... */
      all_of(VL, [&](Value *V) { return S.isNonSchedulable(V); })) {
    // If all instructions do not require scheduling, still need to update
    // the dependencies of their operands, if they were previously replaced
    // by copyable elements.
    SmallVector<ScheduleData *> ControlDependentMembers;
    for (Value *V : VL) {
      auto *I = dyn_cast<Instruction>(V);
      if (!I || (HasCopyables && S.isCopyableElement(V)))
        continue;
      SmallDenseMap<std::pair<Instruction *, Value *>, unsigned> UserOpToNumOps;
      for (const Use &U : I->operands()) {
        unsigned &NumOps =
            UserOpToNumOps.try_emplace(std::make_pair(I, U.get()), 0)
                .first->getSecond();
        ++NumOps;
        if (auto *Op = dyn_cast<Instruction>(U.get());
            Op && areAllOperandsReplacedByCopyableData(I, Op, *SLP, NumOps)) {
          if (ScheduleData *OpSD = getScheduleData(Op);
              OpSD && OpSD->hasValidDependencies()) {
            OpSD->clearDirectDependencies();
            if (RegionHasStackSave ||
                /* ... */)
              ControlDependentMembers.push_back(OpSD);
          }
        }
      }
    }
    if (!ControlDependentMembers.empty()) {
      ScheduleBundle Invalid = ScheduleBundle::invalid();
      calculateDependencies(Invalid, /*InsertInReadyList=*/true, SLP,
                            ControlDependentMembers);
    }
    return nullptr;
  }

  // ...
  LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.getMainOp() << "\n");
  auto TryScheduleBundleImpl = [=](bool ReSchedule, ScheduleBundle &Bundle) {
    // ...
    SmallVector<ScheduleData *> ControlDependentMembers;
    auto CheckIfNeedToClearDeps = [&](ScheduleBundle &Bundle) {
      SmallDenseMap<std::pair<Instruction *, Value *>, unsigned> UserOpToNumOps;
      for (ScheduleEntity *SE : Bundle.getBundle()) {
        auto *SD = SE;
        if (ScheduleData *BundleMember = getScheduleData(SD->getInst());
            BundleMember && BundleMember->hasValidDependencies()) {
          BundleMember->clearDirectDependencies();
          if (RegionHasStackSave ||
              /* ... */)
            ControlDependentMembers.push_back(BundleMember);
        }
        // ...
        if (SD->hasValidDependencies() &&
            (!S.areInstructionsWithCopyableElements() ||
             !S.isCopyableElement(SD->getInst())) &&
            !getScheduleCopyableData(SD->getInst()).empty() && EI.UserTE &&
            EI.UserTE->hasState() &&
            (!EI.UserTE->hasCopyableElements() ||
             !EI.UserTE->isCopyableElement(SD->getInst())))
          SD->clearDirectDependencies();
        for (const Use &U : SD->getInst()->operands()) {
          unsigned &NumOps =
              UserOpToNumOps
                  .try_emplace(std::make_pair(SD->getInst(), U.get()), 0)
                  .first->getSecond();
          ++NumOps;
          if (auto *Op = dyn_cast<Instruction>(U.get());
              Op && areAllOperandsReplacedByCopyableData(SD->getInst(), Op,
                                                         *SLP, NumOps)) {
            if (ScheduleData *OpSD = getScheduleData(Op);
                OpSD && OpSD->hasValidDependencies()) {
              OpSD->clearDirectDependencies();
              if (RegionHasStackSave ||
                  /* ... */)
                ControlDependentMembers.push_back(OpSD);
            }
          }
        }
      }
    };
    // The scheduling region got new instructions at the lower end (or it is
    // a new region for the first bundle). This makes it necessary to
    // recalculate all dependencies.
    if (OldScheduleEnd && ScheduleEnd != OldScheduleEnd) {
      for_each(ScheduleDataMap, [&](auto &P) {
        if (BB != P.first->getParent())
          return;
        ScheduleData *SD = P.second;
        if (isInSchedulingRegion(*SD))
          SD->clearDependencies();
      });
      for_each(ScheduleCopyableDataMapByInst, [&](auto &P) {
        for_each(P.second, [&](ScheduleCopyableData *SD) {
          if (isInSchedulingRegion(*SD))
            SD->clearDependencies();
        });
      });
      // ...
    }
    if (Bundle && !Bundle.getBundle().empty()) {
      if (S.areInstructionsWithCopyableElements() ||
          !ScheduleCopyableDataMap.empty())
        CheckIfNeedToClearDeps(Bundle);
      LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << Bundle << " in block "
                        << BB->getName() << "\n");
      calculateDependencies(Bundle, /*InsertInReadyList=*/!ReSchedule, SLP,
                            ControlDependentMembers);
    } else if (!ControlDependentMembers.empty()) {
      ScheduleBundle Invalid = ScheduleBundle::invalid();
      calculateDependencies(Invalid, /*InsertInReadyList=*/!ReSchedule, SLP,
                            ControlDependentMembers);
    }
    if (ReSchedule) {
      resetSchedule();
      initialFillReadyList(ReadyInsts);
    }
    // Now try to schedule the new bundle or (if no bundle) just calculate
    // dependencies. As soon as the bundle is "ready" it means that there
    // are no cyclic dependencies and we can schedule it. Note that's
    // important that we don't "schedule" the bundle yet.
    while (((!Bundle && ReSchedule) || (Bundle && !Bundle.isReady())) &&
           !ReadyInsts.empty()) {
      ScheduleEntity *Picked = ReadyInsts.pop_back_val();
      assert(Picked->isReady() && "must be ready to schedule");
      schedule(*SLP, S, EI, Picked, ReadyInsts);
      if (Picked == &Bundle)
        break;
    }
  };
  // Make sure that the scheduling region contains all instructions of the
  // bundle.
  for (Value *V : VL) {
    if (S.isNonSchedulable(V))
      continue;
    if (!extendSchedulingRegion(V, S)) {
      // If the region failed to grow, the compiler may crash trying to
      // incorrectly calculate dependencies and emit instructions in the
      // wrong order at the actual scheduling, so recalculate and bail out.
      ScheduleBundle Invalid = ScheduleBundle::invalid();
      TryScheduleBundleImpl(/*ReSchedule=*/false, Invalid);
      return std::nullopt;
    }
  }

  bool ReSchedule = false;
  for (Value *V : VL) {
    if (S.isNonSchedulable(V))
      continue;
    // ...
    if (!CopyableData.empty()) {
      for (ScheduleCopyableData *SD : CopyableData)
        ReadyInsts.remove(SD);
    }
    ScheduleData *BundleMember = getScheduleData(V);
    assert((BundleMember || S.isCopyableElement(V)) &&
           "no ScheduleData for bundle member (maybe not in same basic block)");
    if (!BundleMember)
      continue;
    // Make sure we don't leave the pieces of the bundle in the ready list
    // when the whole bundle might not be ready.
    ReadyInsts.remove(BundleMember);
    if (ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(V);
        !Bundles.empty()) {
      for (ScheduleBundle *B : Bundles)
        ReadyInsts.remove(B);
      continue;
    }
    if (!S.isCopyableElement(V) && !BundleMember->isScheduled())
      continue;
    // A bundle member was scheduled as single instruction before and now
    // needs to be scheduled as part of the bundle. We just get rid of the
    // existing schedule.
    LLVM_DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
                      << " was already scheduled\n");
    ReSchedule = true;
  }
  ScheduleBundle &Bundle = buildBundle(VL, S, EI);
  TryScheduleBundleImpl(ReSchedule, Bundle);
  if (!Bundle.isReady()) {
    for (ScheduleEntity *BD : Bundle.getBundle()) {
      // ...
      if (BD->isReady()) {
        // ...
        if (Bundles.empty()) {
          ReadyInsts.insert(BD);
          continue;
        }
        for (ScheduleBundle *B : Bundles)
          if (B->isReady())
            ReadyInsts.insert(B);
      }
    }
    ScheduledBundlesList.pop_back();
    SmallVector<ScheduleData *> ControlDependentMembers;
    for (Value *V : VL) {
      if (S.isNonSchedulable(V))
        continue;
      auto *I = cast<Instruction>(V);
      if (S.isCopyableElement(I)) {
        // Remove the ScheduleCopyableData nodes registered for the failed
        // bundle.
        auto KV = std::make_pair(EI, I);
        assert(ScheduleCopyableDataMap.contains(KV) &&
               "no ScheduleCopyableData for copyable element");
        ScheduleCopyableData *SD =
            ScheduleCopyableDataMapByInst.find(I)->getSecond().pop_back_val();
        ScheduleCopyableDataMapByUsers[I].remove(SD);
        // ...
        const auto *It = find(Op, I);
        assert(It != Op.end() && "Lane not set");
        SmallPtrSet<Instruction *, 4> Visited;
        do {
          int Lane = std::distance(Op.begin(), It);
          assert(Lane >= 0 && "Lane not set");
          if (/* ... */ !EI.UserTE->ReorderIndices.empty())
            Lane = EI.UserTE->ReorderIndices[Lane];
          assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
                 "Couldn't find extract lane");
          // ...
          if (!Visited.insert(In).second) {
            // ...
            continue;
          }
          ScheduleCopyableDataMapByInstUser
              [std::make_pair(std::make_pair(In, EI.EdgeIdx), I)]
                  .remove(SD);
          // ...
        } while (It != Op.end());
        if (ScheduleCopyableData *UserCD = getScheduleCopyableData(UserEI, I))
          ScheduleCopyableDataMapByUsers[I].insert(UserCD);
        // ...
        if (ScheduleCopyableDataMapByUsers[I].empty())
          ScheduleCopyableDataMapByUsers.erase(I);
        ScheduleCopyableDataMap.erase(KV);
        // ...
        if (ScheduleData *OpSD = getScheduleData(I);
            OpSD && OpSD->hasValidDependencies()) {
          OpSD->clearDirectDependencies();
          if (RegionHasStackSave ||
              /* ... */)
            ControlDependentMembers.push_back(OpSD);
        }
        continue;
      }
      ScheduledBundles.find(I)->getSecond().pop_back();
    }
    if (!ControlDependentMembers.empty()) {
      ScheduleBundle Invalid = ScheduleBundle::invalid();
      calculateDependencies(Invalid, /*InsertInReadyList=*/false, SLP,
                            ControlDependentMembers);
    }
    return std::nullopt;
  }
  return &Bundle;
}
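
/// Allocates ScheduleData objects in chunks to avoid one heap allocation per
/// node; returns a fresh slot from the current chunk.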
BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
  // Allocate a new ScheduleData for the instruction.
  if (ChunkPos >= ChunkSize) {
    ScheduleDataChunks.push_back(std::make_unique<ScheduleData[]>(ChunkSize));
    ChunkPos = 0;
  }
  return &(ScheduleDataChunks.back()[ChunkPos++]);
}
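
/// Extends the scheduling region so that V is inside the region.
/// \returns true if the region size is within the limit.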
bool BoUpSLP::BlockScheduling::extendSchedulingRegion(
    Value *V, const InstructionsState &S) {
  auto *I = dyn_cast<Instruction>(V);
  assert(I && "bundle member must be an instruction");
  if (getScheduleData(I))
    return true;
  if (!ScheduleStart) {
    // It's the first instruction in the new region.
    initScheduleData(I, I->getNextNode(), nullptr, nullptr);
    ScheduleStart = I;
    ScheduleEnd = I->getNextNode();
    assert(ScheduleEnd && "tried to vectorize a terminator?");
    LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
    return true;
  }
  // Search up and down at the same time, because we don't know if the new
  // instruction is above or below the existing scheduling region. Ignore
  // debug info (and other "assume-like" intrinsics), so that's not counted
  // against the budget.
  BasicBlock::reverse_iterator UpIter =
      ++ScheduleStart->getIterator().getReverse();
  // ...
  auto IsAssumeLikeIntr = [](const Instruction &I) {
    if (auto *II = dyn_cast<IntrinsicInst>(&I))
      return II->isAssumeLikeIntrinsic();
    return false;
  };
  UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
  DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
  while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter != I &&
         &*DownIter != I) {
    if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
      LLVM_DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n");
      return false;
    }
    ++UpIter;
    ++DownIter;
    UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
    DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
  }
  if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter == I)) {
    assert(I->getParent() == ScheduleStart->getParent() &&
           "Instruction is in wrong basic block.");
    initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
    ScheduleStart = I;
    LLVM_DEBUG(dbgs() << "SLP: extend schedule region start to " << *I
                      << "\n");
    return true;
  }
  assert((UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter == I)) &&
         "Expected to reach top of the basic block or instruction down the "
         "lower end.");
  assert(I->getParent() == ScheduleEnd->getParent() &&
         "Instruction is in wrong basic block.");
  initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
                   nullptr);
  ScheduleEnd = I->getNextNode();
  assert(ScheduleEnd && "tried to vectorize a terminator?");
  LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n");
  return true;
}
void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
                                                Instruction *ToI,
                                                ScheduleData *PrevLoadStore,
                                                ScheduleData *NextLoadStore) {
  ScheduleData *CurrentLoadStore = PrevLoadStore;
  for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
    // ...
    ScheduleData *SD = ScheduleDataMap.lookup(I);
    if (!SD) {
      SD = allocateScheduleDataChunks();
      ScheduleDataMap[I] = SD;
    }
    assert(!isInSchedulingRegion(*SD) &&
           "new ScheduleData already in scheduling region");
    SD->init(SchedulingRegionID, I);
    // ...
    auto CanIgnoreLoad = [](const Instruction *I) {
      const auto *LI = dyn_cast<LoadInst>(I);
      return LI && LI->isSimple() &&
             LI->getMetadata(LLVMContext::MD_invariant_load);
    };
    if (I->mayReadOrWriteMemory() &&
        // ...
        !CanIgnoreLoad(I) &&
        (!isa<IntrinsicInst>(I) ||
         /* ... */
             Intrinsic::pseudoprobe))) {
      // Update the linked list of memory accessing instructions.
      if (CurrentLoadStore) {
        CurrentLoadStore->setNextLoadStore(SD);
      } else {
        FirstLoadStoreInRegion = SD;
      }
      CurrentLoadStore = SD;
    }
    if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
        match(I, m_Intrinsic<Intrinsic::stackrestore>()))
      RegionHasStackSave = true;
  }
  if (NextLoadStore) {
    if (CurrentLoadStore)
      CurrentLoadStore->setNextLoadStore(NextLoadStore);
  } else {
    LastLoadStoreInRegion = CurrentLoadStore;
  }
}
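
/// Updates the dependency information of a bundle and of all instructions/
/// bundles which depend on the original bundle (or the given extra control-
/// dependent members).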
void BoUpSLP::BlockScheduling::calculateDependencies(
    ScheduleBundle &Bundle, bool InsertInReadyList, BoUpSLP *SLP,
    ArrayRef<ScheduleData *> ControlDeps) {
  SmallVector<ScheduleEntity *> WorkList;
  auto ProcessNode = [&](ScheduleEntity *SE) {
    if (auto *CD = dyn_cast<ScheduleCopyableData>(SE)) {
      if (CD->hasValidDependencies())
        return;
      // ...
      CD->initDependencies();
      CD->resetUnscheduledDeps();
      const EdgeInfo &EI = CD->getEdgeInfo();
      // ...
      const auto *It = find(Op, CD->getInst());
      assert(It != Op.end() && "Lane not set");
      SmallPtrSet<Instruction *, 4> Visited;
      do {
        int Lane = std::distance(Op.begin(), It);
        assert(Lane >= 0 && "Lane not set");
        if (/* ... */ !EI.UserTE->ReorderIndices.empty())
          Lane = EI.UserTE->ReorderIndices[Lane];
        assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
               "Couldn't find extract lane");
        // ...
        if (EI.UserTE->isCopyableElement(In)) {
          // ...
          if (ScheduleCopyableData *UseSD =
                  getScheduleCopyableData(EI.UserTE->UserTreeIndex, In)) {
            CD->incDependencies();
            if (!UseSD->isScheduled())
              CD->incrementUnscheduledDeps(1);
            if (!UseSD->hasValidDependencies() ||
                (InsertInReadyList && UseSD->isReady()))
              WorkList.push_back(UseSD);
          }
        } else if (Visited.insert(In).second) {
          if (ScheduleData *UseSD = getScheduleData(In)) {
            CD->incDependencies();
            if (!UseSD->isScheduled())
              CD->incrementUnscheduledDeps(1);
            if (!UseSD->hasValidDependencies() ||
                (InsertInReadyList && UseSD->isReady()))
              WorkList.push_back(UseSD);
          }
        }
        // ...
      } while (It != Op.end());
      if (CD->isReady() && CD->getDependencies() == 0 &&
          (EI.UserTE->hasState() &&
           (EI.UserTE->getMainOp()->getParent() !=
                CD->getInst()->getParent() ||
            // ...
            (EI.UserTE->getMainOp()->hasNUsesOrMore(UsesLimit) ||
             any_of(EI.UserTE->getMainOp()->users(), [&](User *U) {
               auto *IU = dyn_cast<Instruction>(U);
               // ...
               return IU->getParent() == EI.UserTE->getMainOp()->getParent();
             }))))) {
        // ...
        CD->incDependencies();
        CD->incrementUnscheduledDeps(1);
        // ...
      }
      return;
    }
    auto *BundleMember = cast<ScheduleData>(SE);
    if (BundleMember->hasValidDependencies())
      return;
    LLVM_DEBUG(dbgs() << "SLP: update deps of " << *BundleMember << "\n");
    BundleMember->initDependencies();
    BundleMember->resetUnscheduledDeps();
    // Handle def-use chain dependencies.
    SmallDenseMap<Value *, unsigned> UserToNumOps;
    for (User *U : BundleMember->getInst()->users()) {
      // ...
      if (ScheduleData *UseSD = getScheduleData(U)) {
        // ...
        if (areAllOperandsReplacedByCopyableData(/* ... */))
          continue;
        BundleMember->incDependencies();
        if (!UseSD->isScheduled())
          BundleMember->incrementUnscheduledDeps(1);
        if (!UseSD->hasValidDependencies() ||
            (InsertInReadyList && UseSD->isReady()))
          WorkList.push_back(UseSD);
      }
    }
    for (ScheduleCopyableData *UseSD :
         getScheduleCopyableDataUsers(BundleMember->getInst())) {
      BundleMember->incDependencies();
      if (!UseSD->isScheduled())
        BundleMember->incrementUnscheduledDeps(1);
      if (!UseSD->hasValidDependencies() ||
          (InsertInReadyList && UseSD->isReady()))
        WorkList.push_back(UseSD);
    }

    SmallPtrSet<const Instruction *, 4> Visited;
    auto MakeControlDependent = [&](Instruction *I) {
      if (!Visited.insert(I).second)
        return;
      auto *DepDest = getScheduleData(I);
      assert(DepDest && "must be in schedule window");
      DepDest->addControlDependency(BundleMember);
      BundleMember->incDependencies();
      if (!DepDest->isScheduled())
        BundleMember->incrementUnscheduledDeps(1);
      if (!DepDest->hasValidDependencies() ||
          (InsertInReadyList && DepDest->isReady()))
        WorkList.push_back(DepDest);
    };
    // Any instruction which isn't safe to speculate at the beginning of the
    // block is control dependent on any early exit or non-willreturn call
    // which proceeds it.
    if (/* ... */) {
      for (Instruction *I = BundleMember->getInst()->getNextNode();
           I != ScheduleEnd; I = I->getNextNode()) {
        // ...
        MakeControlDependent(I);
        // ...
      }
    }
    // If we have an inalloca alloca instruction, it needs to be scheduled
    // after any preceding stacksave; likewise, stacksave/stackrestore must
    // not be reordered with the allocas they guard.
    if (RegionHasStackSave) {
      // ...
      if (match(BundleMember->getInst(),
                /* stacksave/stackrestore */)) {
        for (Instruction *I = BundleMember->getInst()->getNextNode();
             I != ScheduleEnd; I = I->getNextNode()) {
          // ...
          MakeControlDependent(I);
          // ...
        }
      }
      // ...
    }
    // ...
    if (/* ... */
        BundleMember->getInst()->mayReadOrWriteMemory()) {
      for (Instruction *I = BundleMember->getInst()->getNextNode();
           I != ScheduleEnd; I = I->getNextNode()) {
        // ...
        MakeControlDependent(I);
        // ...
      }
    }

    // Handle the memory dependencies (if any).
    ScheduleData *NextLoadStore = BundleMember->getNextLoadStore();
    if (!NextLoadStore)
      return;
    assert(/* ... */ "NextLoadStore list for non memory effecting bundle?");
    // ...
    unsigned NumAliased = 0;
    unsigned DistToSrc = 1;
    bool IsNonSimpleSrc = !SrcLoc.Ptr || !isSimple(SrcInst);
    for (ScheduleData *DepDest = NextLoadStore; DepDest;
         DepDest = DepDest->getNextLoadStore()) {
      assert(isInSchedulingRegion(*DepDest) && "Expected to be in region");
      // ...
      if (/* ... */
          ((SrcMayWrite || DepDest->getInst()->mayWriteToMemory()) &&
           (IsNonSimpleSrc ||
            SLP->isAliased(SrcLoc, SrcInst, DepDest->getInst())))) {
        // ...
        DepDest->addMemoryDependency(BundleMember);
        BundleMember->incDependencies();
        if (!DepDest->isScheduled())
          BundleMember->incrementUnscheduledDeps(1);
        if (!DepDest->hasValidDependencies() ||
            (InsertInReadyList && DepDest->isReady()))
          WorkList.push_back(DepDest);
      }
      // ...
    }
  };
21867 "expected at least one instruction to schedule");
21869 WorkList.
push_back(Bundle.getBundle().front());
21871 SmallPtrSet<ScheduleBundle *, 16> Visited;
21872 while (!WorkList.
empty()) {
21877 CopyableBundle.
push_back(&CD->getBundle());
21878 Bundles = CopyableBundle;
21880 Bundles = getScheduleBundles(SD->getInst());
21882 if (Bundles.
empty()) {
21883 if (!SD->hasValidDependencies())
21885 if (InsertInReadyList && SD->isReady()) {
21886 ReadyInsts.insert(SD);
21887 LLVM_DEBUG(
dbgs() <<
"SLP: gets ready on update: " << *SD <<
"\n");
21891 for (ScheduleBundle *Bundle : Bundles) {
21892 if (Bundle->hasValidDependencies() || !Visited.
insert(Bundle).second)
21894 assert(isInSchedulingRegion(*Bundle) &&
21895 "ScheduleData not in scheduling region");
21896 for_each(Bundle->getBundle(), ProcessNode);
21898 if (InsertInReadyList && SD->isReady()) {
21899 for (ScheduleBundle *Bundle : Bundles) {
21900 assert(isInSchedulingRegion(*Bundle) &&
21901 "ScheduleData not in scheduling region");
21902 if (!Bundle->isReady())
21904 ReadyInsts.insert(Bundle);
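
/// Resets the scheduled/unscheduled-deps state of every ScheduleData,
/// ScheduleCopyableData and ScheduleBundle in the region, so the block can
/// be rescheduled from a clean slate.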
void BoUpSLP::BlockScheduling::resetSchedule() {
  assert(ScheduleStart &&
         "tried to reset schedule on block which has not been scheduled");
  for_each(ScheduleDataMap, [&](auto &P) {
    if (BB != P.first->getParent())
      return;
    ScheduleData *SD = P.second;
    if (isInSchedulingRegion(*SD)) {
      SD->setScheduled(false);
      SD->resetUnscheduledDeps();
    }
  });
  for_each(ScheduleCopyableDataMapByInst, [&](auto &P) {
    for_each(P.second, [&](ScheduleCopyableData *SD) {
      if (isInSchedulingRegion(*SD)) {
        SD->setScheduled(false);
        SD->resetUnscheduledDeps();
      }
    });
  });
  for_each(ScheduledBundles, [&](auto &P) {
    for_each(P.second, [&](ScheduleBundle *Bundle) {
      if (isInSchedulingRegion(*Bundle))
        Bundle->setScheduled(false);
    });
  });
  for (auto &P : ScheduleCopyableDataMap) {
    if (isInSchedulingRegion(*P.second)) {
      P.second->setScheduled(false);
      P.second->resetUnscheduledDeps();
    }
  }
  ReadyInsts.clear();
}
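
/// Performs the "real" scheduling of a block: assigns scheduling priorities
/// in original instruction order, computes any missing dependencies, then
/// repeatedly pops the highest-priority ready entity and moves its
/// instructions into place.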
void BoUpSLP::scheduleBlock(const BoUpSLP &R, BlockScheduling *BS) {
  if (!BS->ScheduleStart)
    return;

  LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");

  // ...
  BS->resetSchedule();

  // For the real scheduling we use a more sophisticated ready-list: it is
  // sorted by the original instruction location. This lets the final
  // schedule be as close as possible to the original instruction order.
  struct ScheduleDataCompare {
    bool operator()(const ScheduleEntity *SD1,
                    const ScheduleEntity *SD2) const {
      return SD2->getSchedulingPriority() < SD1->getSchedulingPriority();
    }
  };
  std::set<ScheduleEntity *, ScheduleDataCompare> ReadyInsts;

  // Ensure that all dependency data is updated (for nodes in the sub-graph)
  // and fill the ready-list with initial instructions.
  int Idx = 0;
  for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
       I = I->getNextNode()) {
    ArrayRef<ScheduleBundle *> Bundles = BS->getScheduleBundles(I);
    if (!Bundles.empty()) {
      for (ScheduleBundle *Bundle : Bundles) {
        Bundle->setSchedulingPriority(Idx++);
        if (!Bundle->hasValidDependencies())
          BS->calculateDependencies(*Bundle, /*InsertInReadyList=*/false,
                                    this);
      }
      // ...
      for (ScheduleCopyableData *SD : reverse(SDs)) {
        ScheduleBundle &Bundle = SD->getBundle();
        Bundle.setSchedulingPriority(Idx++);
        if (!Bundle.hasValidDependencies())
          BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false,
                                    this);
      }
      continue;
    }
    ArrayRef<ScheduleCopyableData *> CopyableData =
        BS->getScheduleCopyableDataUsers(I);
    if (ScheduleData *SD = BS->getScheduleData(I)) {
      assert(/* ... */ SDTEs.front()->doesNotNeedToSchedule() ||
             /* ... */
             "scheduler and vectorizer bundle mismatch");
      SD->setSchedulingPriority(Idx++);
      if (!SD->hasValidDependencies() &&
          (!CopyableData.empty() ||
           any_of(R.ValueToGatherNodes.lookup(I), [&](const TreeEntry *TE) {
             assert(TE->isGather() && "expected gather node");
             return TE->hasState() && TE->hasCopyableElements() &&
                    TE->isCopyableElement(I);
           }))) {
        // ...
        ScheduleBundle Bundle;
        // ...
        BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this);
      }
    }
    for (ScheduleCopyableData *SD : reverse(CopyableData)) {
      ScheduleBundle &Bundle = SD->getBundle();
      Bundle.setSchedulingPriority(Idx++);
      if (!Bundle.hasValidDependencies())
        BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this);
    }
  }
  BS->initialFillReadyList(ReadyInsts);

  Instruction *LastScheduledInst = BS->ScheduleEnd;

  // Do the "real" scheduling.
  SmallPtrSet<Instruction *, 16> Scheduled;
  while (!ReadyInsts.empty()) {
    auto *Picked = *ReadyInsts.begin();
    ReadyInsts.erase(ReadyInsts.begin());

    // Move the scheduled instruction(s) to their dedicated places, if not
    // there yet.
    if (auto *Bundle = dyn_cast<ScheduleBundle>(Picked)) {
      for (const ScheduleEntity *BundleMember : Bundle->getBundle()) {
        Instruction *PickedInst = BundleMember->getInst();
        // ...
        bool IsCopyable =
            Bundle->getTreeEntry()->isCopyableElement(PickedInst);
        if ((IsCopyable && BS->getScheduleData(PickedInst)) ||
            (!IsCopyable && !Scheduled.insert(PickedInst).second))
          continue;
        if (PickedInst->getNextNode() != LastScheduledInst)
          // ... (move PickedInst right before LastScheduledInst)
        LastScheduledInst = PickedInst;
        // ...
        EntryToLastInstruction.try_emplace(Bundle->getTreeEntry(),
                                           LastScheduledInst);
      }
    } else {
      // ...
      if (PickedInst->getNextNode() != LastScheduledInst)
        // ... (move PickedInst right before LastScheduledInst)
      LastScheduledInst = PickedInst;
    }
    auto Invalid = InstructionsState::invalid();
    // ...
  }

#ifdef EXPENSIVE_CHECKS
  // ...
#endif
#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
  // Check that all schedulable entities got scheduled.
  for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
       I = I->getNextNode()) {
    // ...
    assert(all_of(/* ... */,
                  [](const ScheduleBundle *Bundle) {
                    return Bundle->isScheduled();
                  }) &&
           "must be scheduled at this point");
  }
#endif

  // Avoid duplicate scheduling of the block.
  BS->ScheduleStart = nullptr;
}
unsigned BoUpSLP::getVectorElementSize(Value *V) {
  // If V is a store, just return the width of the stored value.
  if (auto *Store = dyn_cast<StoreInst>(V))
    return DL->getTypeSizeInBits(Store->getValueOperand()->getType());
  // ...
  auto E = InstrElementSize.find(V);
  if (E != InstrElementSize.end())
    return E->second;
  // ...
  Value *FirstNonBool = nullptr;
  while (!Worklist.empty()) {
    // ...
    auto *Ty = I->getType();
    // ...
    if (Ty != Builder.getInt1Ty() && !FirstNonBool)
      FirstNonBool = I;
    // ...
    Width = std::max<unsigned>(Width, DL->getTypeSizeInBits(Ty));
    // Otherwise, we need to visit the operands of the instruction.
    for (Use &U : I->operands()) {
      // ...
      if (Visited.insert(J).second &&
          /* ... */)
        // ...
        FirstNonBool = U.get();
      // ...
    }
  }
  // ...
  if (V->getType() == Builder.getInt1Ty() && FirstNonBool)
    V = FirstNonBool;
  Width = DL->getTypeSizeInBits(V->getType());
  // ...
  InstrElementSize[I] = Width;
  return Width;
}
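
/// Collects the tree entries whose scalars can be demoted to a smaller
/// integer bit width without changing the result, recursing through the
/// operand entries and updating BitWidth/MaxDepthLevel along the way.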
bool BoUpSLP::collectValuesToDemote(
    const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
    /* ... */ unsigned &MaxDepthLevel,
    bool &IsProfitableToDemote, bool IsTruncRoot) const {
  // ...
  unsigned OrigBitWidth =
      DL->getTypeSizeInBits(E.Scalars.front()->getType()->getScalarType());
  // ...
  bool IsSignedNode = any_of(E.Scalars, [&](Value *R) {
    if (isa<PoisonValue>(R))
      return false;
    return !isKnownNonNegative(R, SimplifyQuery(*DL));
  });
  auto IsPotentiallyTruncated = [&](Value *V, unsigned &BitWidth) -> bool {
    // ...
    if (getTreeEntries(V).size() > 1)
      return false;
    // ...
    if ((!IsSignedNode || IsSignedVal) && OrigBitWidth > BitWidth) {
      // ...
      unsigned BitWidth1 = OrigBitWidth - NumSignBits;
      // ...
      APInt Mask = DB->getDemandedBits(I);
      unsigned BitWidth2 =
          std::max<unsigned>(1, Mask.getBitWidth() - Mask.countl_zero());
      while (!IsSignedNode && BitWidth2 < OrigBitWidth) {
        // ...
      }
      BitWidth1 = std::min(BitWidth1, BitWidth2);
      // ...
    }
    // ...
  };
  auto FinalAnalysis = [&, TTI = TTI]() {
    if (!IsProfitableToDemote)
      return false;
    bool Res = all_of(
        E.Scalars, std::bind(IsPotentiallyTruncated, _1, std::ref(BitWidth)));
    // Demoting gathers is always profitable.
    if (Res && E.isGather()) {
      if (E.hasState()) {
        // If the gather node is also vectorized as a same-values node, try
        // to demote that node too.
        if (const TreeEntry *SameTE =
                getSameValuesTreeEntry(E.getMainOp(), E.Scalars))
          if (collectValuesToDemote(*SameTE, IsProfitableToDemoteRoot,
                                    BitWidth, ToDemote, Visited,
                                    NodesToKeepBWs, MaxDepthLevel,
                                    IsProfitableToDemote, IsTruncRoot))
            // ...
      }
      // ...
    }
    return Res;
  };
  // Do not demote extractelements unless the bases are cheap to extract
  // from.
  // ...
    SmallPtrSet<Value *, 4> UniqueBases;
    for (Value *V : E.Scalars) {
      // ...
      UniqueBases.insert(EE->getVectorOperand());
    }
    const unsigned VF = E.Scalars.size();
    Type *OrigScalarTy = E.Scalars.front()->getType();
    if (UniqueBases.size() <= 2 ||
        /* ... */)
      // ...
  // ...
  if (E.isGather() || !Visited.insert(&E).second ||
      any_of(E.Scalars, [&](Value *V) {
        return !isa<Constant>(V) && all_of(V->users(), [&](User *U) {
          return isa<InsertElementInst>(U) && !isVectorized(U);
        });
      }))
    return FinalAnalysis();

  if (any_of(E.Scalars, [&](Value *V) {
        return !isa<Constant>(V) && !all_of(V->users(), [=](User *U) {
          return isVectorized(U) ||
                 (E.Idx == 0 && UserIgnoreList &&
                  UserIgnoreList->contains(U)) ||
                 (!isa<CmpInst>(U) && U->getType()->isSized() &&
                  !U->getType()->isScalableTy() &&
                  DL->getTypeSizeInBits(U->getType()) <= BitWidth);
        }) && !IsPotentiallyTruncated(V, BitWidth);
      }))
    return false;
  auto ProcessOperands = [&](ArrayRef<const TreeEntry *> Operands,
                             bool &NeedToExit) {
    NeedToExit = false;
    unsigned InitLevel = MaxDepthLevel;
    for (const TreeEntry *Op : Operands) {
      unsigned Level = InitLevel;
      if (!collectValuesToDemote(*Op, IsProfitableToDemoteRoot, BitWidth,
                                 ToDemote, Visited, NodesToKeepBWs, Level,
                                 IsProfitableToDemote, IsTruncRoot)) {
        if (!IsProfitableToDemote)
          return false;
        NeedToExit = true;
        if (!FinalAnalysis())
          return false;
        continue;
      }
      MaxDepthLevel = std::max(MaxDepthLevel, Level);
    }
    return true;
  };
  auto AttemptCheckBitwidth =
      [&](function_ref<bool(unsigned, unsigned)> Checker, bool &NeedToExit) {
        // Try all bitwidths < OrigBitWidth.
        NeedToExit = false;
        unsigned BestFailBitwidth = 0;
        for (; BitWidth < OrigBitWidth; BitWidth *= 2) {
          if (Checker(BitWidth, OrigBitWidth))
            return true;
          if (BestFailBitwidth == 0 && FinalAnalysis())
            BestFailBitwidth = BitWidth;
        }
        if (BestFailBitwidth == 0) {
          NeedToExit = true;
          return false;
        }
        // ...
        BitWidth = BestFailBitwidth;
        NeedToExit = true;
        return true;
      };
  auto TryProcessInstruction =
      [&](unsigned &BitWidth, ArrayRef<const TreeEntry *> Operands = {},
          function_ref<bool(unsigned, unsigned)> Checker = {}) {
        if (Operands.empty()) {
          // ...
          for (Value *V : E.Scalars)
            (void)IsPotentiallyTruncated(V, BitWidth);
        } else {
          // Several vectorized uses? Check if we can truncate it, otherwise
          // exit.
          if (any_of(E.Scalars, [&](Value *V) {
                return !V->hasOneUse() && !IsPotentiallyTruncated(V, BitWidth);
              }))
            return false;
          bool NeedToExit = false;
          if (Checker && !AttemptCheckBitwidth(Checker, NeedToExit))
            return false;
          if (NeedToExit)
            return true;
          if (!ProcessOperands(Operands, NeedToExit))
            return false;
          if (NeedToExit)
            return true;
        }

        ++MaxDepthLevel;
        // Record the entry that we can demote.
        ToDemote.push_back(E.Idx);
        return IsProfitableToDemote;
      };
  if (E.State == TreeEntry::SplitVectorize)
    return TryProcessInstruction(
        BitWidth,
        {VectorizableTree[E.CombinedEntriesWithIndices.front().first].get(),
         VectorizableTree[E.CombinedEntriesWithIndices.back().first].get()});

  if (E.isAltShuffle()) {
    // Some alternate opcodes cannot be safely narrowed together with the
    // main opcode; bail out conservatively for those.
    auto IsDangerousOpcode = [](unsigned Opcode) {
      switch (Opcode) {
      case Instruction::Shl:
      case Instruction::AShr:
      case Instruction::LShr:
      case Instruction::UDiv:
      case Instruction::SDiv:
      case Instruction::URem:
      case Instruction::SRem:
        return true;
      default:
        return false;
      }
    };
    if (IsDangerousOpcode(E.getAltOpcode()))
      return FinalAnalysis();
    // ...
  }

  switch (E.getOpcode()) {

  // We can always demote truncations and extensions. Since truncations can
  // seed additional demotion, we save the truncated value.
  case Instruction::Trunc:
    if (IsProfitableToDemoteRoot)
      IsProfitableToDemote = true;
    return TryProcessInstruction(BitWidth);
  case Instruction::ZExt:
  case Instruction::SExt:
    if (E.UserTreeIndex.UserTE && E.UserTreeIndex.UserTE->hasState() &&
        E.UserTreeIndex.UserTE->getOpcode() == Instruction::BitCast &&
        E.UserTreeIndex.UserTE->getMainOp()->getType()->isFPOrFPVectorTy())
      return false;
    IsProfitableToDemote = true;
    return TryProcessInstruction(BitWidth);

  // We can demote certain binary operations if we can demote both of their
  // operands.
  case Instruction::Add:
  case Instruction::Sub:
  case Instruction::Mul:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)});
  }
  case Instruction::Freeze:
    return TryProcessInstruction(BitWidth, getOperandEntry(&E, 0));
  case Instruction::Shl: {
    // If we are truncating the result of this SHL, and if it's a shift of
    // an amount known to be less than the narrower bit width, we can keep
    // the SHL in the smaller type.
    auto ShlChecker = [&](unsigned BitWidth, unsigned) {
      return all_of(E.Scalars, [&](Value *V) {
        if (isa<PoisonValue>(V))
          return true;
        if (E.isCopyableElement(V))
          return true;
        auto *I = cast<Instruction>(V);
        KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
        return AmtKnownBits.getMaxValue().ult(BitWidth);
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
        ShlChecker);
  }
  case Instruction::LShr: {
    // If this is a truncate of a logical shr, we can truncate it to a
    // smaller lshr iff we know that the bits we would otherwise be shifting
    // in are already zeros.
    auto LShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(E.Scalars, [&](Value *V) {
        if (isa<PoisonValue>(V))
          return true;
        APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
        if (E.isCopyableElement(V))
          return MaskedValueIsZero(V, ShiftedBits, SimplifyQuery(*DL));
        auto *I = cast<Instruction>(V);
        KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
        return AmtKnownBits.getMaxValue().ult(BitWidth) &&
               MaskedValueIsZero(I->getOperand(0), ShiftedBits,
                                 SimplifyQuery(*DL));
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
        LShrChecker);
  }
  case Instruction::AShr: {
    // If this is a truncate of an arithmetic shr, we can truncate it to a
    // smaller ashr iff we know that all the bits from the sign bit of the
    // original type and the sign bit of the truncate type are similar.
    auto AShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(E.Scalars, [&](Value *V) {
        if (isa<PoisonValue>(V))
          return true;
        auto *I = cast<Instruction>(V);
        KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
        unsigned ShiftedBits = OrigBitWidth - BitWidth;
        return AmtKnownBits.getMaxValue().ult(BitWidth) &&
               ShiftedBits <
                   ComputeNumSignBits(I->getOperand(0), *DL, AC, nullptr, DT);
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
        AShrChecker);
  }
  case Instruction::UDiv:
  case Instruction::URem: {
    // UDiv and URem can be truncated if all the truncated bits are zero.
    auto Checker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(E.Scalars, [&](Value *V) {
        APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
        if (E.hasCopyableElements() && E.isCopyableElement(V))
          return MaskedValueIsZero(V, Mask, SimplifyQuery(*DL));
        auto *I = cast<Instruction>(V);
        return MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)) &&
               MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, Checker);
  }

  // We can demote selects if we can demote their true and false values.
  case Instruction::Select: {
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 1), getOperandEntry(&E, 2)});
  }

  // We can demote phis if we can demote all their incoming operands.
  case Instruction::PHI: {
    const unsigned NumOps = E.getNumOperands();
    SmallVector<const TreeEntry *> Ops(NumOps);
    transform(seq<unsigned>(0, NumOps), Ops.begin(),
              [&](unsigned Idx) { return getOperandEntry(&E, Idx); });
    return TryProcessInstruction(BitWidth, Ops);
  }
  case Instruction::Call: {
    // ...
    if (ID != Intrinsic::abs && ID != Intrinsic::smin &&
        ID != Intrinsic::smax && ID != Intrinsic::umin && ID != Intrinsic::umax)
      break;
    // ...
    function_ref<bool(unsigned, unsigned)> CallChecker;
    auto CompChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(E.Scalars, [&](Value *V) {
        auto *I = cast<Instruction>(V);
        if (ID == Intrinsic::umin || ID == Intrinsic::umax) {
          APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
          return MaskedValueIsZero(I->getOperand(0), Mask,
                                   SimplifyQuery(*DL)) &&
                 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
        }
        assert((ID == Intrinsic::smin || ID == Intrinsic::smax) &&
               "Expected min/max intrinsics only.");
        unsigned SignBits = OrigBitWidth - BitWidth;
        // ...
        unsigned Op0SignBits =
            ComputeNumSignBits(I->getOperand(0), *DL, AC, nullptr, DT);
        unsigned Op1SignBits =
            ComputeNumSignBits(I->getOperand(1), *DL, AC, nullptr, DT);
        return SignBits <= Op0SignBits &&
               ((SignBits != Op0SignBits &&
                 /* ... */
                 SimplifyQuery(*DL))) &&
               SignBits <= Op1SignBits &&
               ((SignBits != Op1SignBits &&
                 /* ... */));
      });
    };
    auto AbsChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(E.Scalars, [&](Value *V) {
        auto *I = cast<Instruction>(V);
        unsigned SignBits = OrigBitWidth - BitWidth;
        APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
        unsigned Op0SignBits =
            ComputeNumSignBits(I->getOperand(0), *DL, AC, nullptr, DT);
        return SignBits <= Op0SignBits &&
               ((SignBits != Op0SignBits &&
                 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
                MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)));
      });
    };
    if (ID != Intrinsic::abs) {
      Operands.push_back(getOperandEntry(&E, 1));
      CallChecker = CompChecker;
    } else {
      CallChecker = AbsChecker;
    }
    InstructionCost BestCost =
        std::numeric_limits<InstructionCost::CostType>::max();
    // ...
    unsigned VF = E.Scalars.size();
    // Choose the best bitwidth based on cost estimations.
    auto Checker = [&](unsigned BitWidth, unsigned) {
      // ...
      if (Cost < BestCost) {
        BestCost = Cost;
        // ...
      }
      return false;
    };
    [[maybe_unused]] bool NeedToExit;
    (void)AttemptCheckBitwidth(Checker, NeedToExit);
    // ...
    return TryProcessInstruction(BitWidth, Operands, CallChecker);
  }

  // Otherwise, conservatively give up.
  default:
    break;
  }
  MaxDepthLevel = 1;
  return FinalAnalysis();
}
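
// computeMinimumValueSizes: analyzes the tree and, for each demotable
// sub-graph, records the minimal integer bit width the vectorized code can
// use (MinBWs), plus the bit width of the reduction, if any.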
void BoUpSLP::computeMinimumValueSizes() {
  // We only attempt to truncate integer expressions.
  bool IsStoreOrInsertElt =
      VectorizableTree.front()->hasState() &&
      (VectorizableTree.front()->getOpcode() == Instruction::Store ||
       VectorizableTree.front()->getOpcode() == Instruction::InsertElement);
  if ((IsStoreOrInsertElt || UserIgnoreList) &&
      ExtraBitWidthNodes.size() <= 1 &&
      (!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 ||
       CastMaxMinBWSizes->first / CastMaxMinBWSizes->second <= 2))
    return;

  unsigned NodeIdx = 0;
  if (IsStoreOrInsertElt && !VectorizableTree.front()->isGather())
    NodeIdx = 1;

  // Ensure the roots of the vectorizable tree don't form a cycle.
  assert((VectorizableTree[NodeIdx]->isGather() || NodeIdx != 0 ||
          !VectorizableTree[NodeIdx]->UserTreeIndex) &&
         "Unexpected tree is graph.");

  // The first value node for store/insertelement is sext/zext/trunc? Skip
  // it as well.
  bool IsTruncRoot = false;
  bool IsProfitableToDemoteRoot = !IsStoreOrInsertElt;
  // ...
  if (NodeIdx != 0 &&
      VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
      VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
    assert(IsStoreOrInsertElt && "Expected store/insertelement seeded graph.");
    IsTruncRoot = true;
    // ...
    IsProfitableToDemoteRoot = true;
    // ...
  }

  // Analyzed the reduction already and not profitable - exit.
  if (AnalyzedMinBWVals.contains(VectorizableTree[NodeIdx]->Scalars.front()))
    return;
  auto ComputeMaxBitWidth =
      [&](const TreeEntry &E, bool IsTopRoot, bool IsProfitableToDemoteRoot,
          unsigned Limit, bool IsTruncRoot, bool IsSignedCmp) -> unsigned {
    // Check if the root is trunc and the next node is gather/buildvector,
    // then keep trunc in scalars, which is free in most cases.
    if (E.isGather() && IsTruncRoot && E.UserTreeIndex &&
        !NodesToKeepBWs.contains(E.Idx) &&
        E.Idx > (IsStoreOrInsertElt ? 2u : 1u) &&
        all_of(E.Scalars, [&](Value *V) {
          return V->hasOneUse() || isa<Constant>(V) ||
                 (!V->hasNUsesOrMore(UsesLimit) &&
                  none_of(V->users(), [&](User *U) {
                    ArrayRef<TreeEntry *> TEs = getTreeEntries(U);
                    const TreeEntry *UserTE = E.UserTreeIndex.UserTE;
                    if (TEs.empty() || is_contained(TEs, UserTE))
                      return false;
                    if (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
                             SelectInst>(U) ||
                        isa<SIToFPInst, UIToFPInst>(U) ||
                        (UserTE->hasState() &&
                         (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
                               SelectInst>(UserTE->getMainOp()) ||
                          isa<SIToFPInst, UIToFPInst>(UserTE->getMainOp()))))
                      return true;
                    unsigned UserTESz = DL->getTypeSizeInBits(
                        UserTE->Scalars.front()->getType());
                    if (all_of(TEs, [&](const TreeEntry *TE) {
                          auto It = MinBWs.find(TE);
                          return It != MinBWs.end() &&
                                 It->second.first > UserTESz;
                        }))
                      return true;
                    return DL->getTypeSizeInBits(U->getType()) > UserTESz;
                  }));
        })) {
      ToDemote.push_back(E.Idx);
      const TreeEntry *UserTE = E.UserTreeIndex.UserTE;
      auto It = MinBWs.find(UserTE);
      if (It != MinBWs.end())
        return It->second.first;
      unsigned MaxBitWidth =
          DL->getTypeSizeInBits(UserTE->Scalars.front()->getType());
      MaxBitWidth = bit_ceil(MaxBitWidth);
      if (MaxBitWidth < 8 && MaxBitWidth > 1)
        MaxBitWidth = 8;
      return MaxBitWidth;
    }

    // ...
    unsigned VF = E.getVectorFactor();
    Type *ScalarTy = E.Scalars.front()->getType();
    // ...
    if (any_of(E.Scalars,
               [&](Value *V) { return AnalyzedMinBWVals.contains(V); }))
      return 0u;
    // ...

    unsigned MaxBitWidth = 1u;

    // For the rest of the cases, the roots must be non-negative to be
    // widened with a zero-extend rather than a sign-extend.
    bool IsKnownPositive = !IsSignedCmp && all_of(E.Scalars, [&](Value *R) {
      if (isa<PoisonValue>(R))
        return true;
      KnownBits Known = computeKnownBits(R, *DL);
      return Known.isNonNegative();
    });

    if (!IsKnownPositive && !IsTopRoot && E.UserTreeIndex &&
        E.UserTreeIndex.UserTE->hasState() &&
        E.UserTreeIndex.UserTE->getOpcode() == Instruction::UIToFP)
      MaxBitWidth =
          std::min(DL->getTypeSizeInBits(
                       E.UserTreeIndex.UserTE->Scalars.front()->getType()),
                   DL->getTypeSizeInBits(ScalarTy));

    // We first check if all the bits of the roots are demanded. If they're
    // not, we can truncate the roots to this narrower type.
    for (Value *Root : E.Scalars) {
      // ...
      unsigned BitWidth1 = NumTypeBits - NumSignBits;
      // ...
      if (!IsKnownPositive)
        ++BitWidth1;
      // ...
      MaxBitWidth = std::max(BitWidth1, MaxBitWidth);
      // ...
      APInt Mask = DB->getDemandedBits(I);
      unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
      MaxBitWidth =
          std::max<unsigned>(std::min(BitWidth1, BitWidth2), MaxBitWidth);
    }

    if (MaxBitWidth < 8 && MaxBitWidth > 1)
      MaxBitWidth = 8;
    // ...
    if (NumParts > 1 &&
        /* ... */)
      return 0u;

    unsigned Opcode = E.getOpcode();
    bool IsProfitableToDemote = Opcode == Instruction::Trunc ||
                                Opcode == Instruction::SExt ||
                                Opcode == Instruction::ZExt || NumParts > 1;
    // Conservatively determine if we can actually truncate the roots of the
    // expression.
    unsigned MaxDepthLevel = IsTruncRoot ? Limit : 1;
    bool NeedToDemote = IsProfitableToDemote;

    if (!collectValuesToDemote(E, IsProfitableToDemoteRoot, MaxBitWidth,
                               ToDemote, Visited, NodesToKeepBWs,
                               MaxDepthLevel, NeedToDemote, IsTruncRoot) ||
        (MaxDepthLevel <= Limit &&
         !(((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
            (!IsTopRoot || !(IsStoreOrInsertElt || UserIgnoreList) ||
             DL->getTypeSizeInBits(TreeRootIT) /
                     DL->getTypeSizeInBits(
                         E.getMainOp()->getOperand(0)->getType()) >
                 2)))))
      return 0u;
    // Round MaxBitWidth up to the next power-of-two.
    MaxBitWidth = bit_ceil(MaxBitWidth);
    return MaxBitWidth;
  };
  // Compute the bit width for the reduced values.
  if (UserIgnoreList &&
      isa<IntegerType>(
          VectorizableTree.front()->Scalars.front()->getType())) {
    // ...
    if (all_of(*UserIgnoreList,
               [](Value *V) {
                 // ...
               }) &&
        VectorizableTree.front()->State == TreeEntry::Vectorize &&
        VectorizableTree.front()->getOpcode() == Instruction::ZExt &&
        cast<CastInst>(VectorizableTree.front()->getMainOp())->getSrcTy() ==
            Builder.getInt1Ty()) {
      ReductionBitWidth = 1;
    } else {
      for (Value *V : *UserIgnoreList) {
        // ...
        TypeSize NumTypeBits = DL->getTypeSizeInBits(V->getType());
        unsigned BitWidth1 = NumTypeBits - NumSignBits;
        // ...
        unsigned BitWidth2 = BitWidth1;
        // ...
          BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
        // ...
        ReductionBitWidth =
            std::max(std::min(BitWidth1, BitWidth2), ReductionBitWidth);
      }
      if (ReductionBitWidth < 8 && ReductionBitWidth > 1)
        ReductionBitWidth = 8;
      ReductionBitWidth = bit_ceil(ReductionBitWidth);
    }
  }
  bool IsTopRoot = NodeIdx == 0;
  while (NodeIdx < VectorizableTree.size() &&
         VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
         VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
    RootDemotes.push_back(NodeIdx);
    ++NodeIdx;
    IsTruncRoot = true;
  }
  bool IsSignedCmp = false;
  if (UserIgnoreList &&
      /* ... */)
    IsSignedCmp = true;
  while (NodeIdx < VectorizableTree.size()) {
    // ...
    unsigned Limit = 2;
    if (IsTopRoot &&
        ReductionBitWidth ==
            DL->getTypeSizeInBits(
                VectorizableTree.front()->Scalars.front()->getType()))
      Limit = 3;
    unsigned MaxBitWidth = ComputeMaxBitWidth(
        *VectorizableTree[NodeIdx], IsTopRoot, IsProfitableToDemoteRoot, Limit,
        IsTruncRoot, IsSignedCmp);
    if (ReductionBitWidth != 0 && (IsTopRoot || !RootDemotes.empty())) {
      if (MaxBitWidth != 0 && ReductionBitWidth < MaxBitWidth)
        ReductionBitWidth = bit_ceil(MaxBitWidth);
      else if (MaxBitWidth == 0)
        ReductionBitWidth = 0;
    }

    for (unsigned Idx : RootDemotes) {
      if (all_of(VectorizableTree[Idx]->Scalars, [&](Value *V) {
            uint32_t OrigBitWidth =
                DL->getTypeSizeInBits(V->getType()->getScalarType());
            if (OrigBitWidth > MaxBitWidth) {
              // ...
            }
            // ...
          }))
        ToDemote.push_back(Idx);
    }
    RootDemotes.clear();
    IsTopRoot = false;
    IsProfitableToDemoteRoot = true;

    if (ExtraBitWidthNodes.empty()) {
      NodeIdx = VectorizableTree.size();
    } else {
      unsigned NewIdx = 0;
      do {
        NewIdx = *ExtraBitWidthNodes.begin();
        ExtraBitWidthNodes.erase(ExtraBitWidthNodes.begin());
      } while (NewIdx <= NodeIdx && !ExtraBitWidthNodes.empty());
      NodeIdx = NewIdx;
      IsTruncRoot =
          NodeIdx < VectorizableTree.size() &&
          VectorizableTree[NodeIdx]->UserTreeIndex &&
          VectorizableTree[NodeIdx]->UserTreeIndex.EdgeIdx == 0 &&
          VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->hasState() &&
          VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->getOpcode() ==
              Instruction::Trunc &&
          !VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->isAltShuffle();
      IsSignedCmp =
          NodeIdx < VectorizableTree.size() &&
          VectorizableTree[NodeIdx]->UserTreeIndex &&
          VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->hasState() &&
          VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->getOpcode() ==
              Instruction::ICmp &&
          any_of(VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->Scalars,
                 [&](Value *V) {
                   auto *IC = dyn_cast<ICmpInst>(V);
                   return IC && (IC->isSigned() ||
                                 !isKnownNonNegative(IC->getOperand(0),
                                                     SimplifyQuery(*DL)) ||
                                 !isKnownNonNegative(IC->getOperand(1),
                                                     SimplifyQuery(*DL)));
                 });
    }

    // If the maximum bit width we compute is zero or at least the width of
    // the roots' type, do nothing for this node.
    if (MaxBitWidth == 0 ||
        /* ... */) {
      if (UserIgnoreList)
        AnalyzedMinBWVals.insert_range(TreeRoot);
      continue;
    }

    // Finally, map the values we can demote to the maximum bit width we
    // computed.
    for (unsigned Idx : ToDemote) {
      TreeEntry *TE = VectorizableTree[Idx].get();
      if (MinBWs.contains(TE))
        continue;
      bool IsSigned = any_of(TE->Scalars, [&](Value *R) {
        if (isa<PoisonValue>(R))
          return false;
        return !isKnownNonNegative(R, SimplifyQuery(*DL));
      });
      MinBWs.try_emplace(TE, MaxBitWidth, IsSigned);
    }
  }
}
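
// SLPVectorizerPass entry point for a function: set up the analyses, then
// walk the blocks in post order, collecting seed instructions and running
// the store-chain, in-block, and GEP-index vectorization pipelines.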
  // ...
  DL = &F.getDataLayout();
  // ...
  // If the target claims to have no vector registers, don't attempt
  // vectorization.
  if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true))) {
    LLVM_DEBUG(
        dbgs() << "SLP: Didn't find any vector registers for target, abort.\n");
    return false;
  }

  // Don't vectorize when the attribute NoImplicitFloat is used.
  if (F.hasFnAttribute(Attribute::NoImplicitFloat))
    return false;

  LLVM_DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");

  // Use the bottom up slp vectorizer to construct chains that start with
  // store instructions.
  BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_);

  // ...
  DT->updateDFSNumbers();

  // Scan the blocks in the function in post order.
  for (auto *BB : post_order(&F.getEntryBlock())) {
    // ...
    R.clearReductionData();
    collectSeedInstructions(BB);

    // Vectorize trees that end at stores.
    if (!Stores.empty()) {
      LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
                        << " underlying objects.\n");
      Changed |= vectorizeStoreChains(R);
    }

    // Vectorize trees that end at reductions.
    Changed |= vectorizeChainsInBlock(BB, R);

    // Vectorize the index computations of getelementptr instructions. This
    // is primarily intended to catch gather-like idioms ending at
    // non-consecutive loads.
    if (!GEPs.empty()) {
      LLVM_DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size()
                        << " underlying objects.\n");
      Changed |= vectorizeGEPIndices(BB, R);
    }
  }

  if (Changed) {
    R.optimizeGatherSequence();
    // ...
  }
  return Changed;
}
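
/// Tries to vectorize a chain of consecutive stores starting at offset
/// \p Idx; reports the canonical tree size through \p Size when the chain
/// is rejected, so the caller can skip ahead.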
std::optional<bool>
SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
                                       unsigned Idx, unsigned MinVF,
                                       unsigned &Size) {
  // ...
  const unsigned Sz = R.getVectorElementSize(Chain[0]);
  unsigned VF = Chain.size();

  if (/* ... */ VF < 2 || VF < MinVF) {
    // ...
    return false;
  }

  LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx
                    << "\n");

  SmallSetVector<Value *, 4> ValOps;
  for (Value *V : Chain)
    ValOps.insert(cast<StoreInst>(V)->getValueOperand());
  // Operands are not same/alt opcodes or non-power-of-2 uniques - exit.
  InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
  InstructionsState S = Analysis.buildInstructionsState(
      /* ... */);
  // ...
  bool IsAllowedSize =
      /* ... */;
  if ((!IsAllowedSize && S && S.getOpcode() != Instruction::Load &&
       (!S.getMainOp()->isSafeToRemove() ||
        any_of(ValOps.getArrayRef(), [&](Value *V) {
          return !isa<ExtractElementInst>(V) &&
                 (V->getNumUses() > Chain.size() ||
                  any_of(V->users(), [&](User *U) {
                    return !Stores.contains(U);
                  }));
        }))) ||
      (ValOps.size() > Chain.size() / 2 && !S)) {
    Size = (!IsAllowedSize && S) ? 1 : 2;
    return false;
  }
  // ...
  if (R.isLoadCombineCandidate(Chain))
    return true;
  R.buildTree(Chain);
  // Check if tree tiny and store itself or its value is not vectorized.
  if (R.isTreeTinyAndNotFullyVectorizable()) {
    if (R.isGathered(Chain.front()) ||
        /* ... */)
      return std::nullopt;
    Size = R.getCanonicalGraphSize();
    return false;
  }
  if (R.isProfitableToReorder()) {
    R.reorderTopToBottom();
    R.reorderBottomToTop();
  }
  R.transformNodes();
  R.buildExternalUses();

  R.computeMinimumValueSizes();

  Size = R.getCanonicalGraphSize();
  if (S && S.getOpcode() == Instruction::Load)
    Size = 2; // cut off masked gather small trees
  // ...

    using namespace ore;

    R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",
                                        /* ... */)
                     << "Stores SLP vectorized with cost " << NV("Cost", Cost)
                     << " and with tree size "
                     << NV("TreeSize", R.getTreeSize()));
/// Checks whether the tree sizes in \p Sizes are uniform enough (the
/// quadratic mean deviation is small relative to the mean size).
static bool checkTreeSizes(ArrayRef<std::pair<unsigned, unsigned>> Sizes,
                           bool First) {
  // ...
  uint64_t Sum = std::accumulate(
      Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
      [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
        unsigned Size = First ? Val.first : Val.second;
        // ...
        return V + Size;
      });
  // ...
  uint64_t Dev = std::accumulate(
                     Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
                     [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
                       unsigned P = First ? Val.first : Val.second;
                       // ...
                       return V + (P - Mean) * (P - Mean);
                     }) /
                 Num;
  return Dev * 96 / (Mean * Mean) == 0;
}
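
/// A group of stores that share a common pointer base, kept sorted by the
/// distance of each store's address to the base store's address.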
class RelatedStoreInsts {
public:
  RelatedStoreInsts(unsigned BaseInstrIdx, ArrayRef<StoreInst *> AllStores)
      : AllStores(AllStores) {
    reset(BaseInstrIdx);
  }

  void reset(unsigned NewBaseInstr) {
    assert(NewBaseInstr < AllStores.size() &&
           "Instruction index out of bounds");
    BaseInstrIdx = NewBaseInstr;
    Instrs.clear();
    insertOrLookup(NewBaseInstr, 0);
  }

  /// Tries to insert \p InstrIdx as the store with a pointer distance of
  /// \p PtrDist. Does nothing if there is already a store with that
  /// distance. \returns the previously associated instruction index, or
  /// std::nullopt.
  std::optional<unsigned> insertOrLookup(unsigned InstrIdx, int64_t PtrDist) {
    auto [It, Inserted] = Instrs.emplace(PtrDist, InstrIdx);
    return Inserted ? std::nullopt : std::make_optional(It->second);
  }

  using DistToInstMap = std::map<int64_t, unsigned>;
  const DistToInstMap &getStores() const { return Instrs; }

  /// If \p SI is related to this group of stores, returns the distance of
  /// its pointer operand to the base store's pointer operand.
  std::optional<int64_t> getPointerDiff(StoreInst &SI, const DataLayout &DL,
                                        ScalarEvolution &SE) const {
    StoreInst &BaseStore = *AllStores[BaseInstrIdx];
    return getPointersDiff(
        /* ... */ SI.getValueOperand()->getType(), SI.getPointerOperand(),
        DL, SE, /* ... */);
  }

  /// Recomputes the pointer distances to be based on \p NewBaseInstIdx.
  /// Stores whose index is less than \p MinSafeIdx are dropped.
  void rebase(unsigned MinSafeIdx, unsigned NewBaseInstIdx,
              int64_t DistFromCurBase) {
    DistToInstMap PrevSet = std::move(Instrs);
    reset(NewBaseInstIdx);

    // Re-insert stores that come after MinSafeIdx to try to vectorize them
    // again; their distance is rebased onto NewBaseInstIdx.
    for (auto [Dist, InstIdx] : PrevSet) {
      if (InstIdx >= MinSafeIdx)
        insertOrLookup(InstIdx, Dist - DistFromCurBase);
    }
  }

  /// Removes all the stores up to (and including) the last vectorized one.
  void clearVectorizedStores(/* ... */ &VectorizedStores) {
    DistToInstMap::reverse_iterator LastVectorizedStore = find_if(
        reverse(Instrs), [&](const std::pair<int64_t, unsigned> &DistAndIdx) {
          return VectorizedStores.contains(AllStores[DistAndIdx.second]);
        });

    // Get a forward iterator pointing after the last vectorized store and
    // erase all stores before it.
    DistToInstMap::iterator VectorizedStoresEnd = LastVectorizedStore.base();
    Instrs.erase(Instrs.begin(), VectorizedStoresEnd);
  }

private:
  /// The index of the base instruction, i.e. the one with a pointer
  /// distance of 0.
  unsigned BaseInstrIdx;

  /// Maps a pointer distance from \p BaseInstrIdx to an instruction index.
  DistToInstMap Instrs;
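
// The next routine groups the collected stores by base-pointer distance and
// repeatedly tries to vectorize each candidate slice, shrinking the
// vectorization factor until a profitable tree is found.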
bool SLPVectorizerPass::vectorizeStores(
    ArrayRef<StoreInst *> Stores, BoUpSLP &R,
    DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>>
        &Visited) {
  // ...
  bool Changed = false;

  auto TryToVectorize = [&](const RelatedStoreInsts::DistToInstMap &StoreSeq) {
    int64_t PrevDist = -1;
    SmallVector<Value *> Operands;
    // Collect the chain into a list.
    for (auto [Idx, Data] : enumerate(StoreSeq)) {
      auto &[Dist, InstIdx] = Data;
      if (Operands.empty() || Dist - PrevDist == 1) {
        Operands.push_back(Stores[InstIdx]);
        PrevDist = Dist;
        if (Idx != StoreSeq.size() - 1)
          continue;
      }
      // ...
      if (Operands.size() <= 1 ||
          !Visited
               .insert({Operands.front(),
                        cast<StoreInst>(Operands.front())->getValueOperand(),
                        Operands.back(),
                        cast<StoreInst>(Operands.back())->getValueOperand(),
                        Operands.size()})
               .second)
        continue;

      unsigned MaxVecRegSize = R.getMaxVecRegSize();
      unsigned EltSize = R.getVectorElementSize(Operands[0]);
      // ...
      unsigned MaxVF =
          std::min(R.getMaximumVF(EltSize, Instruction::Store), MaxElts);
      // ...
      Type *StoreTy = Store->getValueOperand()->getType();
      Type *ValueTy = StoreTy;
      if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand()))
        ValueTy = Trunc->getSrcTy();
      // ...
      unsigned MinVF = /* ... */ (
          R.getMinVF(DL->getTypeStoreSizeInBits(StoreScalarTy)), StoreScalarTy,
          /* ... */);
      MinVF = std::max<unsigned>(2, MinVF);

      if (MaxVF < MinVF) {
        LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF ("
                          << MaxVF << ") < "
                          << "MinVF (" << MinVF << ")\n");
        continue;
      }

      unsigned NonPowerOf2VF = 0;
      // First try vectorizing with a non-power-of-2 VF. At the moment, only
      // consider cases where VF + 1 is a power-of-2, i.e. almost all vector
      // registers are used.
      unsigned CandVF = std::clamp<unsigned>(Operands.size(), MinVF, MaxVF);
      if (has_single_bit(CandVF + 1)) {
        NonPowerOf2VF = CandVF;
        assert(NonPowerOf2VF != MaxVF &&
               "Non-power-of-2 VF should not be equal to MaxVF");
      }
      // ...
      unsigned MaxRegVF = MaxVF;
      MaxVF = std::min<unsigned>(MaxVF, bit_floor(Operands.size()));
      if (MaxVF < MinVF) {
        LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF ("
                          << MaxVF << ") < "
                          << "MinVF (" << MinVF << ")\n");
        continue;
      }
23385 SmallVector<unsigned> CandidateVFs;
23386 for (
unsigned VF = std::max(MaxVF, NonPowerOf2VF); VF >= MinVF;
23390 unsigned End = Operands.
size();
23391 unsigned Repeat = 0;
23392 constexpr unsigned MaxAttempts = 4;
23393 OwningArrayRef<std::pair<unsigned, unsigned>> RangeSizes(Operands.
size());
23394 for (std::pair<unsigned, unsigned> &
P : RangeSizes)
23395 P.first =
P.second = 1;
23396 DenseMap<Value *, std::pair<unsigned, unsigned>> NonSchedulable;
23397 auto IsNotVectorized = [](
bool First,
23398 const std::pair<unsigned, unsigned> &
P) {
23399 return First ?
P.first > 0 :
P.second > 0;
23401 auto IsVectorized = [](
bool First,
23402 const std::pair<unsigned, unsigned> &
P) {
23403 return First ?
P.first == 0 :
P.second == 0;
23405 auto VFIsProfitable = [](
bool First,
unsigned Size,
23406 const std::pair<unsigned, unsigned> &
P) {
23409 auto FirstSizeSame = [](
unsigned Size,
23410 const std::pair<unsigned, unsigned> &
P) {
23411 return Size ==
P.first;
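// Note (added commentary, not in the original source): each RangeSizes entry
// keeps two counters per store, one consulted when VF >= MaxRegVF (the
// "First" flavor) and one for in-register VFs. A value of 0 marks the store
// as vectorized; a value > 1 records the tree size of the best failed
// attempt, which VFIsProfitable/FirstSizeSame use to skip hopeless slices.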
23415       bool RepeatChanged = false;
23416       bool AnyProfitableGraph = false;
23417       for (unsigned VF : CandidateVFs) {
23418         AnyProfitableGraph = false;
23419         unsigned FirstUnvecStore =
23420             std::distance(RangeSizes.begin(),
23421                           find_if(RangeSizes, std::bind(IsNotVectorized,
23422                                                         VF >= MaxRegVF, _1)));
23426         while (FirstUnvecStore < End) {
23427           unsigned FirstVecStore = std::distance(
23428               RangeSizes.begin(),
23429               find_if(RangeSizes.drop_front(FirstUnvecStore),
23430                       std::bind(IsVectorized, VF >= MaxRegVF, _1)));
23431           unsigned MaxSliceEnd = FirstVecStore >= End ? End : FirstVecStore;
23432           for (unsigned SliceStartIdx = FirstUnvecStore;
23433                SliceStartIdx + VF <= MaxSliceEnd;) {
23444                            ->getValueOperand()
23447                            ->getValueOperand()
23450                    "Expected all operands of same type.");
23451             if (!NonSchedulable.empty()) {
23452               auto [NonSchedSizeMax, NonSchedSizeMin] =
23454               if (NonSchedSizeMax > 0 && NonSchedSizeMin <= VF) {
23457                 SliceStartIdx += NonSchedSizeMax;
23462             std::optional<bool> Res =
23463                 vectorizeStoreChain(Slice, R, SliceStartIdx, MinVF, TreeSize);
23469                     .first->getSecond()
23477               AnyProfitableGraph = RepeatChanged = Changed = true;
23480               for (std::pair<unsigned, unsigned> &P :
23481                    RangeSizes.slice(SliceStartIdx, VF))
23482                 P.first = P.second = 0;
23483               if (SliceStartIdx < FirstUnvecStore + MinVF) {
23484                 for (std::pair<unsigned, unsigned> &P : RangeSizes.slice(
23485                          FirstUnvecStore, SliceStartIdx - FirstUnvecStore))
23486                   P.first = P.second = 0;
23487                 FirstUnvecStore = SliceStartIdx + VF;
23489               if (SliceStartIdx > MaxSliceEnd - VF - MinVF) {
23490                 for (std::pair<unsigned, unsigned> &P :
23491                      RangeSizes.slice(SliceStartIdx + VF,
23492                                       MaxSliceEnd - (SliceStartIdx + VF)))
23493                   P.first = P.second = 0;
23494                 if (MaxSliceEnd == End)
23495                   End = SliceStartIdx;
23496                 MaxSliceEnd = SliceStartIdx;
23498               SliceStartIdx += VF;
23501             if (VF > 2 && Res &&
23502                 !all_of(RangeSizes.slice(SliceStartIdx, VF),
23503                         std::bind(VFIsProfitable, VF >= MaxRegVF, TreeSize,
23505               SliceStartIdx += VF;
23510             if (VF > MaxRegVF && TreeSize > 1 &&
23511                 all_of(RangeSizes.slice(SliceStartIdx, VF),
23512                        std::bind(FirstSizeSame, TreeSize, _1))) {
23513               SliceStartIdx += VF;
23514               while (SliceStartIdx != MaxSliceEnd &&
23515                      RangeSizes[SliceStartIdx].first == TreeSize)
23519             if (TreeSize > 1) {
23520               for (std::pair<unsigned, unsigned> &P :
23521                    RangeSizes.slice(SliceStartIdx, VF)) {
23522                 if (VF >= MaxRegVF)
23523                   P.second = std::max(P.second, TreeSize);
23525                   P.first = std::max(P.first, TreeSize);
23529             AnyProfitableGraph = true;
23531           if (FirstUnvecStore >= End)
23533           if (MaxSliceEnd - FirstUnvecStore < VF &&
23534               MaxSliceEnd - FirstUnvecStore >= MinVF)
23535             AnyProfitableGraph = true;
23536           FirstUnvecStore = std::distance(
23537               RangeSizes.begin(),
23538               find_if(RangeSizes.drop_front(MaxSliceEnd),
23539                       std::bind(IsNotVectorized, VF >= MaxRegVF, _1)));
23541         if (!AnyProfitableGraph && VF >= MaxRegVF && has_single_bit(VF))
23545       if (all_of(RangeSizes, [](const std::pair<unsigned, unsigned> &P) {
23546             return P.first == 0 && P.second == 0;
23550       if (Repeat >= MaxAttempts ||
23551           (Repeat > 1 && (RepeatChanged || !AnyProfitableGraph)))
23553       constexpr unsigned StoresLimit = 64;
23554       const unsigned MaxTotalNum = std::min<unsigned>(
23556           static_cast<unsigned>(
23559               RangeSizes.begin(),
23560               find_if(RangeSizes, std::bind(IsNotVectorized, true, _1))) +
23562       unsigned VF = bit_ceil(CandidateVFs.front()) * 2;
23565       CandidateVFs.clear();
23567       CandidateVFs.push_back(Limit);
23568       if (VF > MaxTotalNum || VF >= StoresLimit)
23570       for (std::pair<unsigned, unsigned> &P : RangeSizes) {
23572         P.first = std::max(P.second, P.first);
23576       CandidateVFs.push_back(VF);
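// Note (added commentary, not in the original source): when a pass over all
// candidate VFs vectorizes nothing new, the loop above retries (up to
// MaxAttempts) with a doubled power-of-two VF, after folding the per-store
// tree-size hints (P.first = max(P.first, P.second)) so both VF flavors
// share what earlier attempts learned.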
23616   auto FillStoresSet = [&](unsigned Idx, StoreInst *SI) {
23617     std::optional<int64_t> PtrDist;
23618     auto *RelatedStores = find_if(
23619         SortedStores, [&PtrDist, SI, this](const RelatedStoreInsts &StoreSeq) {
23620           PtrDist = StoreSeq.getPointerDiff(*SI, *DL, *SE);
23621           return PtrDist.has_value();
23625     if (RelatedStores == SortedStores.end()) {
23633     if (std::optional<unsigned> PrevInst =
23634             RelatedStores->insertOrLookup(Idx, *PtrDist)) {
23635       TryToVectorize(RelatedStores->getStores());
23636       RelatedStores->clearVectorizedStores(VectorizedStores);
23637       RelatedStores->rebase(*PrevInst + 1,
23642   Type *PrevValTy = nullptr;
23644     if (R.isDeleted(SI))
23647       PrevValTy = SI->getValueOperand()->getType();
23649     if (PrevValTy != SI->getValueOperand()->getType()) {
23650       for (RelatedStoreInsts &StoreSeq : SortedStores)
23651         TryToVectorize(StoreSeq.getStores());
23652       SortedStores.clear();
23653       PrevValTy = SI->getValueOperand()->getType();
23655     FillStoresSet(I, SI);
23659   for (RelatedStoreInsts &StoreSeq : SortedStores)
23660     TryToVectorize(StoreSeq.getStores());
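// Note (added commentary, not in the original source): a store whose
// distance collides with an already-recorded store (insertOrLookup returns
// the previous index) means the same address is written twice; the sequence
// collected so far is flushed through TryToVectorize and the bucket is
// rebased past the older store, since reordering across the overwrite would
// be unsafe.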
23665 void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
23673   for (Instruction &I : *BB) {
23677       if (!SI->isSimple())
23688       if (GEP->getNumIndices() != 1)
23690       Value *Idx = GEP->idx_begin()->get();
23695       if (GEP->getType()->isVectorTy())
23707   LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = "
23708                     << VL.size() << ".\n");
23719   for (Value *V : VL) {
23720     Type *Ty = V->getType();
23724       R.getORE()->emit([&]() {
23725         std::string TypeStr;
23726         llvm::raw_string_ostream OS(TypeStr);
23728         return OptimizationRemarkMissed(SV_NAME, "UnsupportedType", I0)
23729                << "Cannot SLP vectorize list: type "
23730                << TypeStr + " is unsupported by vectorizer";
23737   unsigned Sz = R.getVectorElementSize(I0);
23738   unsigned MinVF = R.getMinVF(Sz);
23739   unsigned MaxVF = std::max<unsigned>(
23741   MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
23743     R.getORE()->emit([&]() {
23744       return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0)
23745              << "Cannot SLP vectorize list: vectorization factor "
23746              << "less than 2 is not supported";
23752   bool CandidateFound = false;
23755   unsigned NextInst = 0, MaxInst = VL.size();
23756   for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF;
23762     if (TTI->getNumberOfParts(VecTy) == VF)
23764     for (unsigned I = NextInst; I < MaxInst; ++I) {
23765       unsigned ActualVF = std::min(MaxInst - I, VF);
23770       if (MaxVFOnly && ActualVF < MaxVF)
23772       if ((VF > MinVF && ActualVF < VF) || (VF == MinVF && ActualVF < 2))
23777       for (Value *V : VL.drop_front(I)) {
23781             !Inst || !R.isDeleted(Inst)) {
23784         if (Idx == ActualVF)
23789       if (Idx != ActualVF)
23792       LLVM_DEBUG(dbgs() << "SLP: Analyzing " << ActualVF << " operations "
23796       if (R.isTreeTinyAndNotFullyVectorizable())
23798       if (R.isProfitableToReorder()) {
23799         R.reorderTopToBottom();
23802       R.transformNodes();
23803       R.buildExternalUses();
23805       R.computeMinimumValueSizes();
23807         CandidateFound = true;
23808         MinCost = std::min(MinCost, Cost);
23811                           << " for VF=" << ActualVF << "\n");
23814         R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList",
23816                          << "SLP vectorized with cost " << ore::NV("Cost", Cost)
23817                          << " and with tree size "
23818                          << ore::NV("TreeSize", R.getTreeSize()));
23829   if (!Changed && CandidateFound) {
23830     R.getORE()->emit([&]() {
23831       return OptimizationRemarkMissed(SV_NAME, "NotBeneficial", I0)
23832              << "List vectorization was possible but not beneficial with cost "
23833              << ore::NV("Cost", MinCost) << " >= "
23837     R.getORE()->emit([&]() {
23838       return OptimizationRemarkMissed(SV_NAME, "NotPossible", I0)
23839              << "Cannot SLP vectorize list: vectorization was impossible"
23840              << " with available vectorization factors";
23875   using ReductionOpsType = SmallVector<Value *, 16>;
23876   using ReductionOpsListType = SmallVector<ReductionOpsType, 2>;
23877   ReductionOpsListType ReductionOps;
23881   SmallDenseMap<Value *, SmallVector<Instruction *>, 16> ReducedValsToOps;
23882   WeakTrackingVH ReductionRoot;
23887   bool IsSupportedHorRdxIdentityOp = false;
23894   static bool isCmpSelMinMax(Instruction *I) {
23902   static bool isBoolLogicOp(Instruction *I) {
23908   static bool isVectorizable(RecurKind Kind, Instruction *I,
23909                              bool TwoElementReduction = false) {
23910     if (Kind == RecurKind::None)
23919     if (TwoElementReduction)
23922     if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
23926       return I->getFastMathFlags().noNaNs();
23929     if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum)
23932     return I->isAssociative();
23935   static Value *getRdxOperand(Instruction *I, unsigned Index) {
23941       return I->getOperand(2);
23942     return I->getOperand(Index);
23947                          Value *RHS, const Twine &Name, bool UseSelect) {
23951     case RecurKind::Or: {
23960     case RecurKind::And: {
23970     case RecurKind::Add:
23971     case RecurKind::Mul:
23972     case RecurKind::Xor:
23973     case RecurKind::FAdd:
23974     case RecurKind::FMul: {
23979     case RecurKind::SMax:
23980     case RecurKind::SMin:
23981     case RecurKind::UMax:
23982     case RecurKind::UMin:
23990     case RecurKind::FMax:
23991     case RecurKind::FMin:
23992     case RecurKind::FMaximum:
23993     case RecurKind::FMinimum:
23994     case RecurKind::FMaximumNum:
23995     case RecurKind::FMinimumNum: {
24008                          const ReductionOpsListType &ReductionOps) {
24009     bool UseSelect = ReductionOps.size() == 2 ||
24011                      (ReductionOps.size() == 1 &&
24013     assert((!UseSelect || ReductionOps.size() != 2 ||
24015            "Expected cmp + select pairs for reduction");
24016     Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name, UseSelect);
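// Note (added commentary, not in the original source): integer min/max
// reductions can appear either as min/max intrinsics or as cmp + select
// pairs; in the latter case ReductionOps holds two lists (the compares and
// the selects), which is why UseSelect is derived from ReductionOps.size().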
24034       return RecurKind::None;
24036       return RecurKind::Add;
24038       return RecurKind::Mul;
24041       return RecurKind::And;
24044       return RecurKind::Or;
24046       return RecurKind::Xor;
24048       return RecurKind::FAdd;
24050       return RecurKind::FMul;
24053       return RecurKind::FMax;
24055       return RecurKind::FMin;
24058       return RecurKind::FMaximum;
24060       return RecurKind::FMinimum;
24066       return RecurKind::SMax;
24068       return RecurKind::SMin;
24070       return RecurKind::UMax;
24072       return RecurKind::UMin;
24098       return RecurKind::None;
24102       return RecurKind::None;
24105       return RecurKind::None;
24109       return RecurKind::None;
24114       return RecurKind::None;
24117       return RecurKind::SMax;
24120       return RecurKind::SMin;
24123       return RecurKind::UMax;
24126       return RecurKind::UMin;
24129     return RecurKind::None;
24133   static unsigned getFirstOperandIndex(Instruction *I) {
24134     return isCmpSelMinMax(I) ? 1 : 0;
24139   static unsigned getNumberOfOperands(Instruction *I) {
24140     return isCmpSelMinMax(I) ? 3 : 2;
24145   static bool hasSameParent(Instruction *I, BasicBlock *BB) {
24146     if (isCmpSelMinMax(I) || isBoolLogicOp(I)) {
24149       return Sel->getParent() == BB && Cmp && Cmp->getParent() == BB;
24151     return I->getParent() == BB;
24155   static bool hasRequiredNumberOfUses(bool IsCmpSelMinMax, Instruction *I) {
24156     if (IsCmpSelMinMax) {
24160       return Sel->hasNUses(2) && Sel->getCondition()->hasOneUse();
24161     return I->hasNUses(2);
24169   void initReductionOps(Instruction *I) {
24170     if (isCmpSelMinMax(I))
24171       ReductionOps.assign(2, ReductionOpsType());
24173       ReductionOps.assign(1, ReductionOpsType());
24177   void addReductionOps(Instruction *I) {
24178     if (isCmpSelMinMax(I)) {
24180       ReductionOps[1].emplace_back(I);
24182       ReductionOps[0].emplace_back(I);
24187     int Sz = Data.size();
24196       : ReductionRoot(I), ReductionLimit(2) {
24197     RdxKind = HorizontalReduction::getRdxKind(I);
24198     ReductionOps.emplace_back().push_back(I);
24201       ReducedValsToOps[V].push_back(I);
24204   bool matchReductionForOperands() const {
24207     assert(ReductionRoot && "Reduction root is not set!");
24210     return Ops.size() == 2;
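// Note (added commentary, not in the original source): for a cmp+select
// min/max the reduced operands live in the select at indices 1 and 2, hence
// getFirstOperandIndex() == 1 and getNumberOfOperands() == 3; plain binary
// reduction ops use operands 0 and 1.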
24218   bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root,
24219                                  ScalarEvolution &SE, const DataLayout &DL,
24220                                  const TargetLibraryInfo &TLI) {
24221     RdxKind = HorizontalReduction::getRdxKind(Root);
24222     if (!isVectorizable(RdxKind, Root))
24234       if (!Sel->getCondition()->hasOneUse())
24237     ReductionRoot = Root;
24242     bool IsCmpSelMinMax = isCmpSelMinMax(Root);
24244         1, std::make_pair(Root, 0));
24249                              SmallVectorImpl<Value *> &PossibleReducedVals,
24250                              SmallVectorImpl<Instruction *> &ReductionOps,
24253                                          getNumberOfOperands(TreeN)))) {
24254         Value *EdgeVal = getRdxOperand(TreeN, I);
24255         ReducedValsToOps[EdgeVal].push_back(TreeN);
24263             IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
24264             !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
24265             !isVectorizable(RdxKind, EdgeInst) ||
24266             (R.isAnalyzedReductionRoot(EdgeInst) &&
24268           PossibleReducedVals.push_back(EdgeVal);
24271         ReductionOps.push_back(EdgeInst);
24280       size_t, SmallMapVector<size_t, SmallMapVector<Value *, unsigned, 2>, 2>,
24282       PossibleReducedVals;
24283     initReductionOps(Root);
24285     SmallSet<size_t, 2> LoadKeyUsed;
24287     auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
24292       auto LIt = LoadsMap.find(std::make_pair(Key, Ptr));
24293       if (LIt != LoadsMap.end()) {
24294         for (LoadInst *RLI : LIt->second) {
24300         for (LoadInst *RLI : LIt->second) {
24307         if (LIt->second.size() > 2) {
24309               hash_value(LIt->second.back()->getPointerOperand());
24315           .first->second.push_back(LI);
24319     while (!Worklist.empty()) {
24320       auto [TreeN, Level] = Worklist.pop_back_val();
24323       CheckOperands(TreeN, PossibleRedVals, PossibleReductionOps, Level);
24324       addReductionOps(TreeN);
24327       for (Value *V : PossibleRedVals) {
24331         ++PossibleReducedVals[Key][Idx].try_emplace(V, 0).first->second;
24333       for (Instruction *I : reverse(PossibleReductionOps))
24334         Worklist.emplace_back(I, I->getParent() == BB ? 0 : Level + 1);
24336     auto PossibleReducedValsVect = PossibleReducedVals.takeVector();
24339     for (auto &PossibleReducedVals : PossibleReducedValsVect) {
24340       auto PossibleRedVals = PossibleReducedVals.second.takeVector();
24342       for (auto &Slice : PossibleRedVals) {
24344         auto RedValsVect = Slice.second.takeVector();
24346         for (const std::pair<Value *, unsigned> &Data : RedValsVect)
24347           PossibleRedValsVect.back().append(Data.second, Data.first);
24349       stable_sort(PossibleRedValsVect, [](const auto &P1, const auto &P2) {
24350         return P1.size() > P2.size();
24357       } else if (!isGoodForReduction(Data)) {
24360         if (!LI || !LastLI ||
24365       ReducedVals.back().append(Data.rbegin(), Data.rend());
24371       return P1.size() > P2.size();
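// Note (added commentary, not in the original source):
// matchAssociativeReduction walks the tree of same-kind reduction ops
// rooted at Root; every operand that is itself a matching op goes on the
// worklist, anything else becomes a reduced value. The reduced values are
// then grouped by a hash key (loads get a pointer-based subkey) and the
// groups are sorted by size so the largest group is tried first.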
24377   Value *tryToReduce(BoUpSLP &V, const DataLayout &DL, TargetTransformInfo *TTI,
24378                      const TargetLibraryInfo &TLI, AssumptionCache *AC,
24379                      DominatorTree &DT) {
24380     constexpr unsigned RegMaxNumber = 4;
24381     constexpr unsigned RedValsMaxNumber = 128;
24385     if (unsigned NumReducedVals = std::accumulate(
24386             ReducedVals.begin(), ReducedVals.end(), 0,
24388               if (!isGoodForReduction(Vals))
24390               return Num + Vals.size();
24392         NumReducedVals < ReductionLimit &&
24396       for (ReductionOpsType &RdxOps : ReductionOps)
24397         for (Value *RdxOp : RdxOps)
24402     IRBuilder<TargetFolder> Builder(ReductionRoot->getContext(),
24408     DenseMap<Value *, WeakTrackingVH> TrackedVals(ReducedVals.size() *
24409                                                   ReducedVals.front().size());
24413     auto &&GetCmpForMinMaxReduction = [](Instruction *RdxRootInst) {
24415              "Expected min/max reduction to have select root instruction");
24418              "Expected min/max reduction to have compare condition");
24422     bool AnyBoolLogicOp = any_of(ReductionOps.back(), [](Value *V) {
24423       return isBoolLogicOp(cast<Instruction>(V));
24426     auto GetNewVectorizedTree = [&](Value *VectorizedTree, Value *Res) {
24427       if (VectorizedTree) {
24431         if (AnyBoolLogicOp) {
24432           auto It = ReducedValsToOps.find(VectorizedTree);
24433           auto It1 = ReducedValsToOps.find(Res);
24434           if ((It == ReducedValsToOps.end() && It1 == ReducedValsToOps.end()) ||
24436               (It != ReducedValsToOps.end() &&
24437                any_of(It->getSecond(), [&](Instruction *I) {
24438                  return isBoolLogicOp(I) &&
24439                         getRdxOperand(I, 0) == VectorizedTree;
24443               (It1 != ReducedValsToOps.end() &&
24444                any_of(It1->getSecond(), [&](Instruction *I) {
24445                  return isBoolLogicOp(I) && getRdxOperand(I, 0) == Res;
24449             VectorizedTree = Builder.CreateFreeze(VectorizedTree);
24453         return createOp(Builder, RdxKind, VectorizedTree, Res, "op.rdx",
24459     SmallDenseSet<Value *> IgnoreList(ReductionOps.size() *
24460                                       ReductionOps.front().size());
24461     for (ReductionOpsType &RdxOps : ReductionOps)
24462       for (Value *RdxOp : RdxOps) {
24465         IgnoreList.insert(RdxOp);
24468     FastMathFlags RdxFMF;
24470     for (Value *U : IgnoreList)
24472         RdxFMF &= FPMO->getFastMathFlags();
24478     for (Value *V : Candidates)
24479       TrackedVals.try_emplace(V, V);
24481     auto At = [](SmallMapVector<Value *, unsigned, 16> &MV,
24482                  Value *V) -> unsigned & {
24483       auto *It = MV.find(V);
24484       assert(It != MV.end() && "Unable to find given key.");
24488     DenseMap<Value *, unsigned> VectorizedVals(ReducedVals.size());
24491     SmallPtrSet<Value *, 4> RequiredExtract;
24492     WeakTrackingVH VectorizedTree = nullptr;
24493     bool CheckForReusedReductionOps = false;
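// Note (added commentary, not in the original source): for boolean and/or
// reductions the left-hand side is frozen before being combined, because
// reassociating the chain could otherwise let a poison/undef operand
// short-circuit differently than in the original code.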
24498     for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) {
24500       InstructionsState S = States[I];
24503       DenseMap<Value *, Value *> TrackedToOrig(2 * OrigReducedVals.size());
24504       for (Value *ReducedVal : OrigReducedVals) {
24505         Value *RdxVal = TrackedVals.at(ReducedVal);
24512             (!S || !S.getMatchingMainOpOrAltOp(Inst))) ||
24516         TrackedToOrig.try_emplace(RdxVal, ReducedVal);
24518       bool ShuffledExtracts = false;
24520       if (S && S.getOpcode() == Instruction::ExtractElement &&
24521           !S.isAltShuffle() && I + 1 < E) {
24523         for (Value *RV : ReducedVals[I + 1]) {
24524           Value *RdxVal = TrackedVals.at(RV);
24531           CommonCandidates.push_back(RdxVal);
24532           TrackedToOrig.try_emplace(RdxVal, RV);
24534         SmallVector<int> Mask;
24537           Candidates.swap(CommonCandidates);
24538           ShuffledExtracts = true;
24545           Value *OrigV = TrackedToOrig.at(Candidates.front());
24546           ++VectorizedVals.try_emplace(OrigV).first->getSecond();
24548           Res = createOp(Builder, RdxKind, Res, VC, "const.rdx", ReductionOps);
24549           Value *OrigV = TrackedToOrig.at(VC);
24550           ++VectorizedVals.try_emplace(OrigV).first->getSecond();
24552           V.analyzedReductionRoot(ResI);
24554         VectorizedTree = GetNewVectorizedTree(VectorizedTree, Res);
24558       unsigned NumReducedVals = Candidates.size();
24559       if (NumReducedVals < ReductionLimit &&
24560           (NumReducedVals < 2 || !isSplat(Candidates)))
24565       IsSupportedHorRdxIdentityOp = RdxKind != RecurKind::Mul &&
24566                                     RdxKind != RecurKind::FMul &&
24567                                     RdxKind != RecurKind::FMulAdd;
24569       SmallMapVector<Value *, unsigned, 16> SameValuesCounter;
24570       if (IsSupportedHorRdxIdentityOp)
24571         for (Value *V : Candidates) {
24572           Value *OrigV = TrackedToOrig.at(V);
24573           ++SameValuesCounter.try_emplace(OrigV).first->second;
24585       bool SameScaleFactor = false;
24586       bool OptReusedScalars = IsSupportedHorRdxIdentityOp &&
24587                               SameValuesCounter.size() != Candidates.size();
24589       if (OptReusedScalars) {
24591             (RdxKind == RecurKind::Add || RdxKind == RecurKind::FAdd ||
24592              RdxKind == RecurKind::Xor) &&
24594             [&SameValuesCounter](const std::pair<Value *, unsigned> &P) {
24595               return P.second == SameValuesCounter.front().second;
24597         Candidates.resize(SameValuesCounter.size());
24598         transform(SameValuesCounter, Candidates.begin(),
24599                   [&](const auto &P) { return TrackedVals.at(P.first); });
24600         NumReducedVals = Candidates.size();
24602       if (NumReducedVals == 1) {
24603         Value *OrigV = TrackedToOrig.at(Candidates.front());
24604         unsigned Cnt = At(SameValuesCounter, OrigV);
24606             emitScaleForReusedOps(Candidates.front(), Builder, Cnt);
24607         VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
24608         VectorizedVals.try_emplace(OrigV, Cnt);
24609         ExternallyUsedValues.insert(OrigV);
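// Note (added commentary, not in the original source): the reused-scalars
// path collapses repeated reduced values before vectorization. Example
// (illustrative): x + x + x + x reduces to a single candidate with count 4,
// which emitScaleForReusedOps turns into x * 4 instead of a vector
// reduction.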
24614       unsigned MaxVecRegSize = V.getMaxVecRegSize();
24615       unsigned EltSize = V.getVectorElementSize(Candidates[0]);
24616       const unsigned MaxElts = std::clamp<unsigned>(
24618           RegMaxNumber * RedValsMaxNumber);
24620       unsigned ReduxWidth = NumReducedVals;
24621       auto GetVectorFactor = [&, &TTI = *TTI](unsigned ReduxWidth) {
24622         unsigned NumParts, NumRegs;
24623         Type *ScalarTy = Candidates.front()->getType();
24630         while (NumParts > NumRegs) {
24631           assert(ReduxWidth > 0 && "ReduxWidth is unexpectedly 0.");
24632           ReduxWidth = bit_floor(ReduxWidth - 1);
24638         if (NumParts > NumRegs / 2)
24643       ReduxWidth = GetVectorFactor(ReduxWidth);
24644       ReduxWidth = std::min(ReduxWidth, MaxElts);
24646       unsigned Start = 0;
24647       unsigned Pos = Start;
24649       unsigned PrevReduxWidth = ReduxWidth;
24650       bool CheckForReusedReductionOpsLocal = false;
24651       auto AdjustReducedVals = [&](bool IgnoreVL = false) {
24652         bool IsAnyRedOpGathered = !IgnoreVL && V.isAnyGathered(IgnoreList);
24653         if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) {
24656         CheckForReusedReductionOpsLocal |= IsAnyRedOpGathered;
24659         if (Pos < NumReducedVals - ReduxWidth + 1)
24660           return IsAnyRedOpGathered;
24663         if (ReduxWidth > 1)
24664           ReduxWidth = GetVectorFactor(ReduxWidth);
24665         return IsAnyRedOpGathered;
24667       bool AnyVectorized = false;
24668       SmallDenseSet<std::pair<unsigned, unsigned>, 8> IgnoredCandidates;
24669       while (Pos < NumReducedVals - ReduxWidth + 1 &&
24670              ReduxWidth >= ReductionLimit) {
24673         if (CheckForReusedReductionOpsLocal && PrevReduxWidth != ReduxWidth &&
24675           CheckForReusedReductionOps = true;
24678         PrevReduxWidth = ReduxWidth;
24681         if (IgnoredCandidates.contains(std::make_pair(Pos, ReduxWidth)) ||
24684                 std::make_pair(Pos, bit_floor(ReduxWidth))) ||
24686                 std::make_pair(Pos + (ReduxWidth - bit_floor(ReduxWidth)),
24688             V.areAnalyzedReductionVals(VL)) {
24689           (void)AdjustReducedVals(true);
24696           return RedValI && V.isDeleted(RedValI);
24699         V.buildTree(VL, IgnoreList);
24700         if (V.isTreeTinyAndNotFullyVectorizable(true)) {
24701           if (!AdjustReducedVals())
24702             V.analyzedReductionVals(VL);
24705         if (V.isLoadCombineReductionCandidate(RdxKind)) {
24706           if (!AdjustReducedVals())
24707             V.analyzedReductionVals(VL);
24710         V.reorderTopToBottom();
24713             VL.front()->getType()->isIntOrIntVectorTy() ||
24714             ReductionLimit > 2);
24718                                                       ExternallyUsedValues);
24722         LocalExternallyUsedValues.insert(ReductionRoot);
24723         for (unsigned Cnt = 0, Sz = ReducedVals.size(); Cnt < Sz; ++Cnt) {
24724           if (Cnt == I || (ShuffledExtracts && Cnt == I - 1))
24726           for (Value *V : ReducedVals[Cnt])
24728               LocalExternallyUsedValues.insert(TrackedVals[V]);
24730         if (!IsSupportedHorRdxIdentityOp) {
24733                  "Reused values counter map is not empty");
24734           for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
24735             if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
24737             Value *V = Candidates[Cnt];
24738             Value *OrigV = TrackedToOrig.at(V);
24739             ++SameValuesCounter.try_emplace(OrigV).first->second;
24742         V.transformNodes();
24745         SmallPtrSet<Value *, 4> Visited;
24746         for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
24747           if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
24749           Value *RdxVal = Candidates[Cnt];
24750           if (auto It = TrackedVals.find(RdxVal); It != TrackedVals.end())
24751             RdxVal = It->second;
24752           if (!Visited.insert(RdxVal).second)
24756           if (!VLScalars.contains(RdxVal) && V.isVectorized(RdxVal)) {
24757             LocalExternallyUsedValues.insert(RdxVal);
24760           Value *OrigV = TrackedToOrig.at(RdxVal);
24762               VectorizedVals.lookup(OrigV) + At(SameValuesCounter, OrigV);
24763           if (NumOps != ReducedValsToOps.at(OrigV).size())
24764             LocalExternallyUsedValues.insert(RdxVal);
24767         if (!IsSupportedHorRdxIdentityOp)
24768           SameValuesCounter.clear();
24769         for (Value *RdxVal : VL)
24770           if (RequiredExtract.contains(RdxVal))
24771             LocalExternallyUsedValues.insert(RdxVal);
24772         V.buildExternalUses(LocalExternallyUsedValues);
24774         V.computeMinimumValueSizes();
24778             getReductionCost(TTI, VL, IsCmpSelMinMax, RdxFMF, V, DT, DL, TLI);
24781                           << " for reduction\n");
24785             V.getORE()->emit([&]() {
24786               return OptimizationRemarkMissed(SV_NAME, "HorSLPNotBeneficial",
24787                                               ReducedValsToOps.at(VL[0]).front())
24788                      << "Vectorizing horizontal reduction is possible "
24789                      << "but not beneficial with cost " << ore::NV("Cost", Cost)
24790                      << " and threshold "
24793           if (!AdjustReducedVals()) {
24794             V.analyzedReductionVals(VL);
24796             if (ReduxWidth > ReductionLimit && V.isTreeNotExtendable()) {
24799                        *TTI, VL.front()->getType(), ReduxWidth - 1);
24800                    VF >= ReductionLimit;
24802                        *TTI, VL.front()->getType(), VF - 1)) {
24804                     V.getCanonicalGraphSize() != V.getTreeSize())
24807                   IgnoredCandidates.insert(std::make_pair(Offset + Idx, VF));
24814         LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:"
24815                           << Cost << ". (HorRdx)\n");
24816         V.getORE()->emit([&]() {
24817           return OptimizationRemark(SV_NAME, "VectorizedHorizontalReduction",
24818                                     ReducedValsToOps.at(VL[0]).front())
24819                  << "Vectorized horizontal reduction with cost "
24820                  << ore::NV("Cost", Cost) << " and with tree size "
24821                  << ore::NV("TreeSize", V.getTreeSize());
24830         if (IsCmpSelMinMax)
24831           InsertPt = GetCmpForMinMaxReduction(RdxRootInst);
24834         Value *VectorizedRoot = V.vectorizeTree(
24835             LocalExternallyUsedValues, InsertPt, VectorValuesAndScales);
24838         for (Value *RdxVal : Candidates) {
24839           Value *OrigVal = TrackedToOrig.at(RdxVal);
24840           Value *TransformedRdxVal = TrackedVals.at(OrigVal);
24841           if (TransformedRdxVal != RdxVal)
24842             TrackedToOrig.try_emplace(TransformedRdxVal, OrigVal);
24851           VectorizedRoot = Builder.CreateFreeze(VectorizedRoot);
24854         if (OptReusedScalars && !SameScaleFactor) {
24855           VectorizedRoot = emitReusedOps(VectorizedRoot, Builder, V,
24856                                          SameValuesCounter, TrackedToOrig);
24859         Type *ScalarTy = VL.front()->getType();
24864             OptReusedScalars && SameScaleFactor
24865                 ? SameValuesCounter.front().second
24868                 ? V.isSignedMinBitwidthRootNode()
24872         for (Value *RdxVal : VL) {
24873           Value *OrigV = TrackedToOrig.at(RdxVal);
24874           if (IsSupportedHorRdxIdentityOp) {
24875             VectorizedVals.try_emplace(OrigV, At(SameValuesCounter, OrigV));
24878           ++VectorizedVals.try_emplace(OrigV).first->getSecond();
24879           if (!V.isVectorized(RdxVal))
24880             RequiredExtract.insert(RdxVal);
24884         ReduxWidth = NumReducedVals - Pos;
24885         if (ReduxWidth > 1)
24886           ReduxWidth = GetVectorFactor(NumReducedVals - Pos);
24887         AnyVectorized = true;
24889       if (OptReusedScalars && !AnyVectorized) {
24890         for (const std::pair<Value *, unsigned> &P : SameValuesCounter) {
24891           Value *RdxVal = TrackedVals.at(P.first);
24892           Value *RedVal = emitScaleForReusedOps(RdxVal, Builder, P.second);
24893           VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
24894           VectorizedVals.try_emplace(P.first, P.second);
24899     if (!VectorValuesAndScales.empty())
24900       VectorizedTree = GetNewVectorizedTree(
24902           emitReduction(Builder, *TTI, ReductionRoot->getType()));
24904     if (!VectorizedTree) {
24905       if (!CheckForReusedReductionOps) {
24906         for (ReductionOpsType &RdxOps : ReductionOps)
24907           for (Value *RdxOp : RdxOps)
24929     auto FixBoolLogicalOps =
24932       if (!AnyBoolLogicOp)
24934       if (isBoolLogicOp(RedOp1) && ((!InitStep && LHS == VectorizedTree) ||
24935                                     getRdxOperand(RedOp1, 0) == LHS ||
24938       if (isBoolLogicOp(RedOp2) && ((!InitStep && RHS == VectorizedTree) ||
24939                                     getRdxOperand(RedOp2, 0) == RHS ||
24944       if (LHS != VectorizedTree)
24952       unsigned Sz = InstVals.size();
24954       for (unsigned I = 0, E = (Sz / 2) * 2; I < E; I += 2) {
24957         Value *RdxVal1 = InstVals[I].second;
24958         Value *StableRdxVal1 = RdxVal1;
24959         auto It1 = TrackedVals.find(RdxVal1);
24960         if (It1 != TrackedVals.end())
24961           StableRdxVal1 = It1->second;
24962         Value *RdxVal2 = InstVals[I + 1].second;
24963         Value *StableRdxVal2 = RdxVal2;
24964         auto It2 = TrackedVals.find(RdxVal2);
24965         if (It2 != TrackedVals.end())
24966           StableRdxVal2 = It2->second;
24970         FixBoolLogicalOps(StableRdxVal1, StableRdxVal2, InstVals[I].first,
24972         Value *ExtraRed = createOp(Builder, RdxKind, StableRdxVal1,
24973                                    StableRdxVal2, "op.rdx", ReductionOps);
24974         ExtraReds[I / 2] = std::make_pair(InstVals[I].first, ExtraRed);
24977         ExtraReds[Sz / 2] = InstVals.back();
24983     SmallPtrSet<Value *, 8> Visited;
24985     for (Value *RdxVal : Candidates) {
24986       if (!Visited.insert(RdxVal).second)
24988       unsigned NumOps = VectorizedVals.lookup(RdxVal);
24989       for (Instruction *RedOp :
24995     bool InitStep = true;
24996     while (ExtraReductions.size() > 1) {
24998       FinalGen(ExtraReductions, InitStep);
24999       ExtraReductions.swap(NewReds);
25002     VectorizedTree = ExtraReductions.front().second;
25004     ReductionRoot->replaceAllUsesWith(VectorizedTree);
25011     SmallPtrSet<Value *, 4> IgnoreSet;
25020         for (auto *U : Ignore->users()) {
25022                  "All users must be either in the reduction ops list.");
25025         if (!Ignore->use_empty()) {
25027           Ignore->replaceAllUsesWith(P);
25030     V.removeInstructionsAndOperands(RdxOps, VectorValuesAndScales);
25032     return VectorizedTree;
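// Note (added commentary, not in the original source): any reduced values
// left over after the vectorized subtrees (plus values that must stay
// scalar) are stitched back in by FinalGen, which pairs up the remaining
// partial results into a balanced tree of "op.rdx" operations until a
// single value is left; that value finally replaces all uses of the
// reduction root.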
25038   Value *createSingleOp(IRBuilderBase &Builder, const TargetTransformInfo &TTI,
25039                         Value *Vec, unsigned Scale, bool IsSigned,
25063             Rdx, emitReduction(Lane, Builder, &TTI, DestTy), I);
25066       Rdx = emitReduction(Vec, Builder, &TTI, DestTy);
25068     if (Rdx->getType() != DestTy)
25074       Rdx = emitScaleForReusedOps(Rdx, Builder, Scale);
25081                                    bool IsCmpSelMinMax, FastMathFlags FMF,
25082                                    const BoUpSLP &R, DominatorTree &DT,
25083                                    const DataLayout &DL,
25084                                    const TargetLibraryInfo &TLI) {
25086     Type *ScalarTy = ReducedVals.front()->getType();
25087     unsigned ReduxWidth = ReducedVals.size();
25088     FixedVectorType *VectorTy = R.getReductionType();
25093     auto EvaluateScalarCost = [&](function_ref<InstructionCost()> GenCostFn) {
25096       int Cnt = ReducedVals.size();
25097       for (Value *RdxVal : ReducedVals) {
25102           Cost += GenCostFn();
25106         for (User *U : RdxVal->users()) {
25108           if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) {
25109             if (RdxKind == RecurKind::FAdd) {
25119                 FMACost -= FMulCost;
25121               ScalarCost += FMACost;
25128             ScalarCost = InstructionCost::getInvalid();
25132         Cost += ScalarCost;
25134         Cost += GenCostFn();
25143     bool DoesRequireReductionOp = !AllConsts && VectorValuesAndScales.empty();
25145     case RecurKind::Add:
25146     case RecurKind::Mul:
25147     case RecurKind::Or:
25148     case RecurKind::And:
25149     case RecurKind::Xor:
25150     case RecurKind::FAdd:
25151     case RecurKind::FMul: {
25154       if (DoesRequireReductionOp) {
25157           unsigned ScalarTyNumElements = VecTy->getNumElements();
25162                                        ReducedVals.size()),
25173           auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
25174               std::make_pair(RedTy, true));
25175           if (RType == RedTy) {
25180                 RdxOpcode, !IsSigned, RedTy,
25186         auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
25187             std::make_pair(RedTy, true));
25190         if (RdxKind == RecurKind::FAdd) {
25195           for (Value *RdxVal : ReducedVals) {
25201               FMF &= FPCI->getFastMathFlags();
25204           if (!Ops.empty()) {
25209             IntrinsicCostAttributes ICA(Intrinsic::fmuladd, RVecTy,
25210                                         {RVecTy, RVecTy, RVecTy}, FMF);
25216                 Instruction::FMul, RVecTy, CostKind);
25218                               << "Minus vector FMul cost: " << FMulCost << "\n");
25219             FMACost -= FMulCost;
25223         if (FMACost.isValid())
25224           VectorCost += FMACost;
25228       if (RType != RedTy) {
25229         unsigned Opcode = Instruction::Trunc;
25231           Opcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
25237       ScalarCost = EvaluateScalarCost([&]() {
25242     case RecurKind::FMax:
25243     case RecurKind::FMin:
25244     case RecurKind::FMaximum:
25245     case RecurKind::FMinimum:
25246     case RecurKind::SMax:
25247     case RecurKind::SMin:
25248     case RecurKind::UMax:
25249     case RecurKind::UMin: {
25252       if (DoesRequireReductionOp) {
25258         auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
25259             std::make_pair(RedTy, true));
25261         IntrinsicCostAttributes ICA(Id, RVecTy, {RVecTy, RVecTy}, FMF);
25263         if (RType != RedTy) {
25264           unsigned Opcode = Instruction::Trunc;
25266             Opcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
25272       ScalarCost = EvaluateScalarCost([&]() {
25273         IntrinsicCostAttributes ICA(Id, ScalarTy, {ScalarTy, ScalarTy}, FMF);
25282     LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost
25284                       << " (It is a splitting reduction)\n");
25285     return VectorCost - ScalarCost;
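// Note (added commentary, not in the original source): getReductionCost
// returns VectorCost - ScalarCost, so a negative value means the vector
// form is cheaper; the caller compares this delta against the SLP cost
// threshold before committing to the reduction.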
25291   Value *emitReduction(IRBuilderBase &Builder, const TargetTransformInfo &TTI,
25293     Value *ReducedSubTree = nullptr;
25295     auto CreateSingleOp = [&](Value *Vec, unsigned Scale, bool IsSigned) {
25296       Value *Rdx = createSingleOp(Builder, TTI, Vec, Scale, IsSigned, DestTy);
25297       if (ReducedSubTree)
25298         ReducedSubTree = createOp(Builder, RdxKind, ReducedSubTree, Rdx,
25299                                   "op.rdx", ReductionOps);
25301         ReducedSubTree = Rdx;
25303     if (VectorValuesAndScales.size() == 1) {
25304       const auto &[Vec, Scale, IsSigned] = VectorValuesAndScales.front();
25305       CreateSingleOp(Vec, Scale, IsSigned);
25306       return ReducedSubTree;
25310     Value *VecRes = nullptr;
25311     bool VecResSignedness = false;
25312     auto CreateVecOp = [&](Value *Vec, unsigned Cnt, bool IsSigned) {
25318       case RecurKind::Add: {
25319         if (ScalarTy == Builder.getInt1Ty() && ScalarTy != DestTy) {
25322                             << ". (HorRdx)\n");
25325           std::iota(std::next(Mask.begin(), VF * I),
25326                     std::next(Mask.begin(), VF * (I + 1)), 0);
25327           ++NumVectorInstructions;
25338         LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << "of " << Vec
25339                           << ". (HorRdx)\n");
25340         ++NumVectorInstructions;
25344       case RecurKind::Xor: {
25347                    << "SLP: Xor " << Cnt << "of " << Vec << ". (HorRdx)\n");
25352       case RecurKind::FAdd: {
25356         LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << "of " << Vec
25357                           << ". (HorRdx)\n");
25358         ++NumVectorInstructions;
25362       case RecurKind::And:
25363       case RecurKind::Or:
25364       case RecurKind::SMax:
25365       case RecurKind::SMin:
25366       case RecurKind::UMax:
25367       case RecurKind::UMin:
25368       case RecurKind::FMax:
25369       case RecurKind::FMin:
25370       case RecurKind::FMaximum:
25371       case RecurKind::FMinimum:
25374       case RecurKind::Sub:
25375       case RecurKind::AddChainWithSubs:
25376       case RecurKind::Mul:
25377       case RecurKind::FMul:
25378       case RecurKind::FMulAdd:
25379       case RecurKind::AnyOf:
25380       case RecurKind::FindFirstIVSMin:
25381       case RecurKind::FindFirstIVUMin:
25382       case RecurKind::FindLastIVSMax:
25383       case RecurKind::FindLastIVUMax:
25384       case RecurKind::FMaxNum:
25385       case RecurKind::FMinNum:
25386       case RecurKind::FMaximumNum:
25387       case RecurKind::FMinimumNum:
25388       case RecurKind::None:
25395         VecResSignedness = IsSigned;
25397       ++NumVectorInstructions;
25398       if (ScalarTy == Builder.getInt1Ty() && ScalarTy != DestTy &&
25404         std::iota(Mask.begin(), Mask.end(), 0);
25406         if (VecResVF < VecVF) {
25410         if (VecResVF != VecVF) {
25412           std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
25429         if (VecResVF < VecVF) {
25435         if (VecResVF != VecVF)
25437           Op = createOp(Builder, RdxKind, Op, Vec, "rdx.op", ReductionOps);
25438         if (VecResVF != VecVF)
25443     for (auto [Vec, Scale, IsSigned] : VectorValuesAndScales)
25444       CreateVecOp(Vec, Scale, IsSigned);
25445     CreateSingleOp(VecRes, 1, false);
25447     return ReducedSubTree;
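// Note (added commentary, not in the original source): with several
// (vector, scale) pairs the code first combines the vectors element-wise
// with the reduction operation (resizing the shorter operand via a shuffle
// when the widths differ) and only then emits one final horizontal
// reduction, rather than reducing each vector separately.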
25451   Value *emitReduction(Value *VectorizedValue, IRBuilderBase &Builder,
25452                        const TargetTransformInfo *TTI, Type *DestTy) {
25453     assert(VectorizedValue && "Need to have a vectorized tree node");
25454     assert(RdxKind != RecurKind::FMulAdd &&
25455            "A call to the llvm.fmuladd intrinsic is not handled yet");
25458     if (FTy->getScalarType() == Builder.getInt1Ty() &&
25459         RdxKind == RecurKind::Add &&
25464           VectorizedValue, Builder.getIntNTy(FTy->getNumElements()));
25465       ++NumVectorInstructions;
25468     ++NumVectorInstructions;
25473   Value *emitScaleForReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
25475     assert(IsSupportedHorRdxIdentityOp &&
25476            "The optimization of matched scalar identity horizontal reductions "
25477            "must be supported.");
25479       return VectorizedValue;
25481     case RecurKind::Add: {
25483       Value *Scale = ConstantInt::get(VectorizedValue->getType(), Cnt);
25485                         << VectorizedValue << ". (HorRdx)\n");
25486       return Builder.CreateMul(VectorizedValue, Scale);
25488     case RecurKind::Xor: {
25490       LLVM_DEBUG(dbgs() << "SLP: Xor " << Cnt << "of " << VectorizedValue
25491                         << ". (HorRdx)\n");
25494       return VectorizedValue;
25496     case RecurKind::FAdd: {
25498       Value *Scale = ConstantFP::get(VectorizedValue->getType(), Cnt);
25500                         << VectorizedValue << ". (HorRdx)\n");
25501       return Builder.CreateFMul(VectorizedValue, Scale);
25503     case RecurKind::And:
25504     case RecurKind::Or:
25505     case RecurKind::SMax:
25506     case RecurKind::SMin:
25507     case RecurKind::UMax:
25508     case RecurKind::UMin:
25509     case RecurKind::FMax:
25510     case RecurKind::FMin:
25511     case RecurKind::FMaximum:
25512     case RecurKind::FMinimum:
25514       return VectorizedValue;
25515     case RecurKind::Sub:
25516     case RecurKind::AddChainWithSubs:
25517     case RecurKind::Mul:
25518     case RecurKind::FMul:
25519     case RecurKind::FMulAdd:
25520     case RecurKind::AnyOf:
25521     case RecurKind::FindFirstIVSMin:
25522     case RecurKind::FindFirstIVUMin:
25523     case RecurKind::FindLastIVSMax:
25524     case RecurKind::FindLastIVUMax:
25525     case RecurKind::FMaxNum:
25526     case RecurKind::FMinNum:
25527     case RecurKind::FMaximumNum:
25528     case RecurKind::FMinimumNum:
25529     case RecurKind::None:
25538   emitReusedOps(Value *VectorizedValue, IRBuilderBase &Builder, BoUpSLP &R,
25539                 const SmallMapVector<Value *, unsigned, 16> &SameValuesCounter,
25540                 const DenseMap<Value *, Value *> &TrackedToOrig) {
25541     assert(IsSupportedHorRdxIdentityOp &&
25542            "The optimization of matched scalar identity horizontal reductions "
25543            "must be supported.");
25546     if (VTy->getElementType() != VL.front()->getType()) {
25550           R.isSignedMinBitwidthRootNode());
25553     case RecurKind::Add: {
25556       for (Value *V : VL) {
25557         unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
25558         Vals.push_back(ConstantInt::get(V->getType(), Cnt, false));
25562                         << VectorizedValue << ". (HorRdx)\n");
25563       return Builder.CreateMul(VectorizedValue, Scale);
25565     case RecurKind::And:
25566     case RecurKind::Or:
25569                         << ". (HorRdx)\n");
25570       return VectorizedValue;
25571     case RecurKind::SMax:
25572     case RecurKind::SMin:
25573     case RecurKind::UMax:
25574     case RecurKind::UMin:
25575     case RecurKind::FMax:
25576     case RecurKind::FMin:
25577     case RecurKind::FMaximum:
25578     case RecurKind::FMinimum:
25581                         << ". (HorRdx)\n");
25582       return VectorizedValue;
25583     case RecurKind::Xor: {
25588       SmallVector<int> Mask(
25591       std::iota(Mask.begin(), Mask.end(), 0);
25592       bool NeedShuffle = false;
25593       for (unsigned I = 0, VF = VL.size(); I < VF; ++I) {
25595         unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
25596         if (Cnt % 2 == 0) {
25598           NeedShuffle = true;
25604                  dbgs() << "> of " << VectorizedValue << ". (HorRdx)\n");
25608           ConstantVector::getNullValue(VectorizedValue->getType()), Mask);
25609       return VectorizedValue;
25611     case RecurKind::FAdd: {
25614       for (Value *V : VL) {
25615         unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
25616         Vals.push_back(ConstantFP::get(V->getType(), Cnt));
25619       return Builder.CreateFMul(VectorizedValue, Scale);
25621     case RecurKind::Sub:
25622     case RecurKind::AddChainWithSubs:
25623     case RecurKind::Mul:
25624     case RecurKind::FMul:
25625     case RecurKind::FMulAdd:
25626     case RecurKind::AnyOf:
25627     case RecurKind::FindFirstIVSMin:
25628     case RecurKind::FindFirstIVUMin:
25629     case RecurKind::FindLastIVSMax:
25630     case RecurKind::FindLastIVUMax:
25631     case RecurKind::FMaxNum:
25632     case RecurKind::FMinNum:
25633     case RecurKind::FMaximumNum:
25634     case RecurKind::FMinimumNum:
25635     case RecurKind::None:
25645   return HorizontalReduction::getRdxKind(V);
25651   unsigned AggregateSize = 1;
25653   Type *CurrentType = IV->getType();
25656       for (auto *Elt : ST->elements())
25657         if (Elt != ST->getElementType(0))
25658           return std::nullopt;
25659       AggregateSize *= ST->getNumElements();
25660       CurrentType = ST->getElementType(0);
25662       AggregateSize *= AT->getNumElements();
25663       CurrentType = AT->getElementType();
25665       AggregateSize *= VT->getNumElements();
25666       return AggregateSize;
25668       return AggregateSize;
25670   return std::nullopt;
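// Note (added commentary, not in the original source): getAggregateSize
// walks nested homogeneous structs/arrays down to an optional vector leaf
// and multiplies the element counts, e.g. [2 x [2 x <2 x float>]] yields
// 2 * 2 * 2 = 8 buildvector slots; a mixed-type struct returns std::nullopt.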
25679                                unsigned OperandOffset, const BoUpSLP &R) {
25682     std::optional<unsigned> OperandIndex =
25684     if (!OperandIndex || R.isDeleted(LastInsertInst))
25688                              BuildVectorOpds, InsertElts, *OperandIndex, R);
25691       BuildVectorOpds[*OperandIndex] = InsertedOperand;
25692       InsertElts[*OperandIndex] = LastInsertInst;
25695   } while (LastInsertInst != nullptr &&
25722          "Expected insertelement or insertvalue instruction!");
25725          "Expected empty result vectors!");
25728   if (!AggregateSize)
25730   BuildVectorOpds.resize(*AggregateSize);
25731   InsertElts.resize(*AggregateSize);
25736   if (BuildVectorOpds.size() >= 2)
25754   auto DominatedReduxValue = [&](Value *R) {
25762   if (P->getIncomingBlock(0) == ParentBB) {
25764   } else if (P->getIncomingBlock(1) == ParentBB) {
25768   if (Rdx && DominatedReduxValue(Rdx))
25781   if (P->getIncomingBlock(0) == BBLatch) {
25783   } else if (P->getIncomingBlock(1) == BBLatch) {
25787   if (Rdx && DominatedReduxValue(Rdx))
25823          "Expected binop, select, or intrinsic for reduction matching");
25825       Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root));
25827       Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root) + 1);
25838   Value *Op0 = nullptr;
25839   Value *Op1 = nullptr;
25848   Value *B0 = nullptr, *B1 = nullptr;
25853 bool SLPVectorizerPass::vectorizeHorReduction(
25854     PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
25855     SmallVectorImpl<WeakTrackingVH> &PostponedInsts) {
25864   auto SelectRoot = [&]() {
25866         HorizontalReduction::getRdxKind(Root) != RecurKind::None)
25883   std::queue<std::pair<Instruction *, unsigned>> Stack;
25884   Stack.emplace(SelectRoot(), 0);
25885   SmallPtrSet<Value *, 8> VisitedInstrs;
25888     if (R.isAnalyzedReductionRoot(Inst))
25893     if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI))
25895     return HorRdx.tryToReduce(R, *DL, TTI, *TLI, AC, *DT);
25897   auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) {
25898     if (TryOperandsAsNewSeeds && FutureSeed == Root) {
25910   while (!Stack.empty()) {
25913     std::tie(Inst, Level) = Stack.front();
25918     if (R.isDeleted(Inst))
25920     if (Value *VectorizedV = TryToReduce(Inst)) {
25924         Stack.emplace(I, Level);
25927       if (R.isDeleted(Inst))
25931       if (!TryAppendToPostponedInsts(Inst)) {
25942         if (VisitedInstrs.insert(Op).second)
25947             !R.isDeleted(I) && I->getParent() == BB)
25948           Stack.emplace(I, Level);
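// Note (added commentary, not in the original source): the Stack-driven loop
// is a breadth-first search seeded at the (possibly select-adjusted) root:
// when an instruction cannot be matched as a reduction, its operands are
// queued so nested reduction roots in the same block still get a chance.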
25953 bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
25960   if ((I->getOpcode() == Instruction::FAdd ||
25961        I->getOpcode() == Instruction::FSub) &&
25971   if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P ||
25972       R.isDeleted(Op0) || R.isDeleted(Op1))
25982   if (A && B && B->hasOneUse()) {
25985     if (B0 && B0->getParent() == P && !R.isDeleted(B0))
25987     if (B1 && B1->getParent() == P && !R.isDeleted(B1))
25991   if (B && A && A->hasOneUse()) {
25994     if (A0 && A0->getParent() == P && !R.isDeleted(A0))
25996     if (A1 && A1->getParent() == P && !R.isDeleted(A1))
26000   auto TryToReduce = [this, &R, &TTI = *TTI](Instruction *Inst,
26004     Type *Ty = Inst->getType();
26008     if (!HorRdx.matchReductionForOperands())
26014         TTI.getScalarizationOverhead(
26017         TTI.getInstructionCost(Inst, CostKind);
26020     case RecurKind::Add:
26021     case RecurKind::Mul:
26022     case RecurKind::Or:
26023     case RecurKind::And:
26024     case RecurKind::Xor:
26025     case RecurKind::FAdd:
26026     case RecurKind::FMul: {
26029         FMF = FPCI->getFastMathFlags();
26030       RedCost = TTI.getArithmeticReductionCost(Inst->getOpcode(), VecTy, FMF,
26037     if (RedCost >= ScalarCost)
26040     return HorRdx.tryToReduce(R, *DL, &TTI, *TLI, AC, *DT) != nullptr;
26042   if (Candidates.size() == 1)
26043     return TryToReduce(I, {Op0, Op1}) || tryToVectorizeList({Op0, Op1}, R);
26046   std::optional<int> BestCandidate = R.findBestRootPair(Candidates);
26047   if (!BestCandidate)
26049   return (*BestCandidate == 0 &&
26050           TryToReduce(I, {Candidates[*BestCandidate].first,
26051                           Candidates[*BestCandidate].second})) ||
26052          tryToVectorizeList({Candidates[*BestCandidate].first,
26053                              Candidates[*BestCandidate].second},
26057 bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Instruction *Root,
26058                                                  BasicBlock *BB, BoUpSLP &R) {
26060   bool Res = vectorizeHorReduction(P, Root, BB, R, PostponedInsts);
26061   Res |= tryToVectorize(PostponedInsts, R);
26068   for (Value *V : Insts)
26070     Res |= tryToVectorize(Inst, R);
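// Note (added commentary, not in the original source): for a candidate
// operand pair, TryToReduce first tries to treat the pair as a tiny
// two-element reduction (and only fires when the reduction cost beats the
// scalar cost plus extraction overhead); failing that, the pair is handed
// to tryToVectorizeList, so the cheaper of the two forms wins.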
26074 bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
26077   if (!R.canMapToVector(IVI->getType()))
26080   SmallVector<Value *, 16> BuildVectorOpds;
26081   SmallVector<Value *, 16> BuildVectorInsts;
26085   if (MaxVFOnly && BuildVectorOpds.size() == 2) {
26086     R.getORE()->emit([&]() {
26087       return OptimizationRemarkMissed(SV_NAME, "NotPossible", IVI)
26088              << "Cannot SLP vectorize list: only 2 elements of buildvalue, "
26089                 "trying reduction first.";
26093   LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
26095   return tryToVectorizeList(BuildVectorOpds, R, MaxVFOnly);
26098 bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
26101   SmallVector<Value *, 16> BuildVectorInsts;
26102   SmallVector<Value *, 16> BuildVectorOpds;
26103   SmallVector<int> Mask;
26109   if (MaxVFOnly && BuildVectorInsts.size() == 2) {
26110     R.getORE()->emit([&]() {
26111       return OptimizationRemarkMissed(SV_NAME, "NotPossible", IEI)
26112              << "Cannot SLP vectorize list: only 2 elements of buildvector, "
26113                 "trying reduction first.";
26117   LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n");
26118   return tryToVectorizeList(BuildVectorInsts, R, MaxVFOnly);
26121 template <typename T>
26126                                   bool MaxVFOnly, BoUpSLP &R) {
26139     if (!I || R.isDeleted(I)) {
26143     auto *SameTypeIt = IncIt;
26146             AreCompatible(VL, *SameTypeIt))) {
26149       if (I && !R.isDeleted(I))
26154     unsigned NumElts = VL.size();
26155     LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at nodes ("
26156                       << NumElts << ")\n");
26166     if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL), MaxVFOnly)) {
26169       VL.swap(Candidates);
26170       Candidates.clear();
26178       auto GetMinNumElements = [&R](Value *V) {
26179         unsigned EltSize = R.getVectorElementSize(V);
26180         return std::max(2U, R.getMaxVecRegSize() / EltSize);
26182       if (NumElts < GetMinNumElements(*IncIt) &&
26183           (Candidates.empty() ||
26184            Candidates.front()->getType() == (*IncIt)->getType())) {
26192     if (Candidates.size() > 1 &&
26193         (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) {
26194       if (TryToVectorizeHelper(Candidates, false)) {
26197       } else if (MaxVFOnly) {
26200         for (auto *It = Candidates.begin(), *End = Candidates.end(); It != End;
26203           if (!I || R.isDeleted(I)) {
26207           auto *SameTypeIt = It;
26208           while (SameTypeIt != End &&
26211                   AreCompatible(*SameTypeIt, *It))) {
26214             if (I && !R.isDeleted(I))
26217           unsigned NumElts = VL.size();
26218           if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL),
26224       Candidates.clear();
26228     IncIt = SameTypeIt;
26240 template <bool IsCompatibility>
26245          "Expected valid element types only.");
26247     return IsCompatibility;
26250   if (CI1->getOperand(0)->getType()->getTypeID() <
26252     return !IsCompatibility;
26253   if (CI1->getOperand(0)->getType()->getTypeID() >
26256   if (CI1->getOperand(0)->getType()->getScalarSizeInBits() <
26258     return !IsCompatibility;
26259   if (CI1->getOperand(0)->getType()->getScalarSizeInBits() >
26268   if (BasePred1 < BasePred2)
26269     return !IsCompatibility;
26270   if (BasePred1 > BasePred2)
26273   bool CI1Preds = Pred1 == BasePred1;
26274   bool CI2Preds = Pred2 == BasePred1;
26275   for (int I = 0, E = CI1->getNumOperands(); I < E; ++I) {
26276     auto *Op1 = CI1->getOperand(CI1Preds ? I : E - I - 1);
26281       return !IsCompatibility;
26286     if (IsCompatibility) {
26287       if (I1->getParent() != I2->getParent())
26294         return NodeI2 != nullptr;
26297       assert((NodeI1 == NodeI2) ==
26299              "Different nodes should have different DFS numbers");
26300       if (NodeI1 != NodeI2)
26304     if (S && (IsCompatibility || !S.isAltShuffle()))
26306     if (IsCompatibility)
26308     if (I1->getOpcode() != I2->getOpcode())
26309       return I1->getOpcode() < I2->getOpcode();
26312   return IsCompatibility;
26315 template <typename ItT>
26321     if (R.isDeleted(I))
26325       Changed |= vectorizeRootInstruction(nullptr, RootOp, BB, R);
26326       if (R.isDeleted(I))
26332     if (R.isDeleted(I))
26338   auto CompareSorter = [&](Value *V, Value *V2) {
26354     if (Vals.size() <= 1)
26357       Vals, CompareSorter, AreCompatibleCompares,
26360         bool ArePossiblyReducedInOtherBlock = any_of(Candidates, [](Value *V) {
26362           auto *Select = dyn_cast<SelectInst>(U);
26364                  Select->getParent() != cast<Instruction>(V)->getParent();
26367         if (ArePossiblyReducedInOtherBlock)
26369         return tryToVectorizeList(Candidates, R, MaxVFOnly);
26375 bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions,
26378          "This function only accepts Insert instructions");
26379   bool OpsChanged = false;
26381   for (auto *I : reverse(Instructions)) {
26387       vectorizeInsertValueInst(LastInsertValue, BB, R, true);
26390       vectorizeInsertElementInst(LastInsertElem, BB, R, true);
26393     if (R.isDeleted(I))
26395     OpsChanged |= vectorizeHorReduction(nullptr, I, BB, R, PostponedInsts);
26401         vectorizeInsertValueInst(LastInsertValue, BB, R, false);
26403     OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R,
26408   OpsChanged |= tryToVectorize(PostponedInsts, R);
26414 bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
26417   SmallPtrSet<Value *, 16> VisitedInstrs;
26421   DenseMap<Value *, SmallVector<Value *, 4>> PHIToOpcodes;
26422   auto PHICompare = [this, &PHIToOpcodes](Value *V1, Value *V2) {
26425            "Expected vectorizable types only.");
26435         V2->getType()->getScalarSizeInBits())
26438         V2->getType()->getScalarSizeInBits())
26442     if (Opcodes1.size() < Opcodes2.size())
26444     if (Opcodes1.size() > Opcodes2.size())
26446     for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
26455           return NodeI2 != nullptr;
26458         assert((NodeI1 == NodeI2) ==
26460                "Different nodes should have different DFS numbers");
26461         if (NodeI1 != NodeI2)
26464         if (S && !S.isAltShuffle() && I1->getOpcode() == I2->getOpcode()) {
26480             DT->getNode(V1->getParent());
26482             DT->getNode(V2->getParent());
26484           return NodeI2 != nullptr;
26487         assert((NodeI1 == NodeI2) ==
26489                "Different nodes should have different DFS numbers");
26490         if (NodeI1 != NodeI2)
26492         return V1->comesBefore(V2);
26505           return *Id1 < *Id2;
26509         if (I1->getOpcode() == I2->getOpcode())
26511         return I1->getOpcode() < I2->getOpcode();
26534       auto ValID1 = Opcodes1[I]->getValueID();
26535       auto ValID2 = Opcodes2[I]->getValueID();
26536       if (ValID1 == ValID2)
26538       if (ValID1 < ValID2)
26540       if (ValID1 > ValID2)
26549       assert(U1 && U2 && "The only thing left should be undef & undef.");
26555     if (VL.empty() || V1 == VL.back())
26557     Value *V2 = VL.back();
26562     if (Opcodes1.size() != Opcodes2.size())
26564     for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
26570       if (R.isDeleted(I1) || R.isDeleted(I2))
26572       if (I1->getParent() != I2->getParent())
26580       if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID())
26586   bool HaveVectorizedPhiNodes = false;
26590     for (Instruction &I : *BB) {
26597       if (!VisitedInstrs.count(P) && !R.isDeleted(P) &&
26602     if (Incoming.size() <= 1)
26607     for (Value *V : Incoming) {
26608       SmallVectorImpl<Value *> &Opcodes =
26610       if (!Opcodes.empty())
26613       SmallPtrSet<Value *, 4> Visited;
26614       while (!Nodes.empty()) {
26618         for (Value *V : PHI->incoming_values()) {
26620             Nodes.push_back(PHI1);
26629         Incoming, PHICompare, AreCompatiblePHIs,
26631           return tryToVectorizeList(Candidates, R, MaxVFOnly);
26634     Changed |= HaveVectorizedPhiNodes;
26635     if (HaveVectorizedPhiNodes && any_of(PHIToOpcodes, [&](const auto &P) {
26637           return !PHI || R.isDeleted(PHI);
26639       PHIToOpcodes.clear();
26641   } while (HaveVectorizedPhiNodes);
26643   VisitedInstrs.clear();
26645   InstSetVector PostProcessInserts;
26646   SmallSetVector<CmpInst *, 8> PostProcessCmps;
26649   auto VectorizeInsertsAndCmps = [&](bool VectorizeCmps) {
26650     bool Changed = vectorizeInserts(PostProcessInserts, BB, R);
26651     if (VectorizeCmps) {
26653       PostProcessCmps.clear();
26655     PostProcessInserts.clear();
26661     return PostProcessCmps.contains(Cmp);
26663            PostProcessInserts.contains(I);
26669     return I->use_empty() &&
26679     if (R.isDeleted(&*It))
26682     if (!VisitedInstrs.insert(&*It).second) {
26683       if (HasNoUsers(&*It) &&
26684           VectorizeInsertsAndCmps(It->isTerminator())) {
26697       if (P->getNumIncomingValues() == 2) {
26700         if (Root && vectorizeRootInstruction(P, Root, BB, R)) {
26714         if (BB == P->getIncomingBlock(I) ||
26715             !DT->isReachableFromEntry(P->getIncomingBlock(I)))
26721             PI && !IsInPostProcessInstrs(PI)) {
26723               vectorizeRootInstruction(nullptr, PI, P->getIncomingBlock(I), R);
26725           if (Res && R.isDeleted(P)) {
26735     if (HasNoUsers(&*It)) {
26736       bool OpsChanged = false;
26747         TryToVectorizeRoot |= (I == Stores.end() || I->second.size() == 1) &&
26748                               SI->getValueOperand()->hasOneUse();
26750       if (TryToVectorizeRoot) {
26751         for (auto *V : It->operand_values()) {
26755             VI && !IsInPostProcessInstrs(VI))
26757             OpsChanged |= vectorizeRootInstruction(nullptr, VI, BB, R);
26764           VectorizeInsertsAndCmps(It->isTerminator());
26776         PostProcessInserts.insert(&*It);
26784 bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
26786   for (auto &Entry : GEPs) {
26789     if (Entry.second.size() < 2)
26792     LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
26793                       << Entry.second.size() << ".\n");
26801       return !R.isDeleted(GEP);
26803     if (It == Entry.second.end())
26805     unsigned MaxVecRegSize = R.getMaxVecRegSize();
26806     unsigned EltSize = R.getVectorElementSize(*(*It)->idx_begin());
26807     if (MaxVecRegSize < EltSize)
26810     unsigned MaxElts = MaxVecRegSize / EltSize;
26811     for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) {
26812       auto Len = std::min<unsigned>(BE - BI, MaxElts);
26825       Candidates.remove_if([&R](Value *I) {
26835       for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) {
26836         auto *GEPI = GEPList[I];
26837         if (!Candidates.count(GEPI))
26839         const SCEV *SCEVI = SE->getSCEV(GEPList[I]);
26840         for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
26841           auto *GEPJ = GEPList[J];
26842           const SCEV *SCEVJ = SE->getSCEV(GEPList[J]);
26844             Candidates.remove(GEPI);
26845             Candidates.remove(GEPJ);
26846           } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
26847             Candidates.remove(GEPJ);
26854       if (Candidates.size() < 2)
26860       SmallVector<Value *, 16> Bundle(Candidates.size());
26861       auto BundleIndex = 0u;
26862       for (auto *V : Candidates) {
26864         auto *GEPIdx = GEP->idx_begin()->get();
26866         Bundle[BundleIndex++] = GEPIdx;
26878       Changed |= tryToVectorizeList(Bundle, R);
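// Note (added commentary, not in the original source): rather than
// vectorizing the GEPs themselves, this routine collects their single index
// operands into Bundle and feeds them to tryToVectorizeList; SCEV-equal
// GEPs and GEPs sharing the same index value are dropped first, since
// vectorizing identical indices would gain nothing.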
bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
  bool Changed = false;
  // Sort by type, base pointer and value operand so that compatible stores
  // become adjacent in the worklist.
  auto &&StoreSorter = [this](StoreInst *V, StoreInst *V2) {
    if (V->getValueOperand()->getType()->getTypeID() <
        V2->getValueOperand()->getType()->getTypeID())
      return true;
    if (V->getValueOperand()->getType()->getTypeID() >
        V2->getValueOperand()->getType()->getTypeID())
      return false;
    if (V->getPointerOperandType()->getTypeID() <
        V2->getPointerOperandType()->getTypeID())
      return true;
    if (V->getPointerOperandType()->getTypeID() >
        V2->getPointerOperandType()->getTypeID())
      return false;
    if (V->getValueOperand()->getType()->getScalarSizeInBits() <
        V2->getValueOperand()->getType()->getScalarSizeInBits())
      return true;
    if (V->getValueOperand()->getType()->getScalarSizeInBits() >
        V2->getValueOperand()->getType()->getScalarSizeInBits())
      return false;
    // ...
    if (auto *I1 = dyn_cast<Instruction>(V->getValueOperand()))
      if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
        DomTreeNodeBase<llvm::BasicBlock> *NodeI1 =
            DT->getNode(I1->getParent());
        DomTreeNodeBase<llvm::BasicBlock> *NodeI2 =
            DT->getNode(I2->getParent());
        assert(NodeI1 && "Should only process reachable instructions");
        assert(NodeI2 && "Should only process reachable instructions");
        assert((NodeI1 == NodeI2) ==
                   (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
               "Different nodes should have different DFS numbers");
        if (NodeI1 != NodeI2)
          return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
        return I1->getOpcode() < I2->getOpcode();
      }
    return V->getValueOperand()->getValueID() <
           V2->getValueOperand()->getValueID();
  };

  // Check that a new store V1 is compatible with the stores already
  // collected in VL.
  auto &&AreCompatibleStores = [&](ArrayRef<StoreInst *> VL, StoreInst *V1) {
    StoreInst *V2 = VL.back();
    // ...
    bool SameParent = true;
    // ...
    SameParent &= I1 && I2 && I1->getParent() == I2->getParent();
    // ...
    SmallVector<Value *> NewVL(VL.size() + 1);
    for (auto [SI, V] : zip(VL, NewVL))
      V = SI->getValueOperand();
    NewVL.back() = V1->getValueOperand();
    InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
    InstructionsState S = Analysis.buildInstructionsState(/* ... */);
    // ...
    return V1->getValueOperand()->getValueID() ==
           V2->getValueOperand()->getValueID();
  };

  // Attempt to sort and vectorize each of the store-groups.
  DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>> Attempted;
  for (auto &Pair : Stores) {
    if (Pair.second.size() < 2)
      continue;

    LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
                      << Pair.second.size() << ".\n");

    // Reverse the stores to do a bottom-to-top analysis.
    SmallVector<StoreInst *> ReversedStores(Pair.second.rbegin(),
                                            Pair.second.rend());
    Changed |= tryToVectorizeSequence<StoreInst>(
        ReversedStores, StoreSorter, AreCompatibleStores,
        [&](ArrayRef<StoreInst *> Candidates, bool) {
          return vectorizeStores(Candidates, R, Attempted);
        },
        /*MaxVFOnly=*/false, R);
  }
  return Changed;
}
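The routines above all funnel into tryToVectorizeSequence (declared further down in the index): sort the worklist so that compatible elements become neighbours, then sweep it and hand every maximal run of mutually compatible elements to a vectorization callback. A minimal, self-contained sketch of that sort-then-sweep pattern in plain C++ (illustrative names; not the LLVM implementation):

  #include <algorithm>
  #include <functional>
  #include <vector>

  // Sketch of the sort-then-sweep strategy: Comparator defines a total order
  // that clusters compatible items; AreCompatible tests run membership;
  // TryToVectorize consumes each run of length >= 2.
  template <typename T>
  bool vectorizeSequence(
      std::vector<T> &Worklist,
      std::function<bool(const T &, const T &)> Comparator,
      std::function<bool(const T &, const T &)> AreCompatible,
      std::function<bool(const std::vector<T> &)> TryToVectorize) {
    bool Changed = false;
    std::stable_sort(Worklist.begin(), Worklist.end(), Comparator);
    std::vector<T> Run;
    for (const T &Item : Worklist) {
      // A new incompatible item closes the current run.
      if (!Run.empty() && !AreCompatible(Run.back(), Item)) {
        if (Run.size() >= 2)
          Changed |= TryToVectorize(Run);
        Run.clear();
      }
      Run.push_back(Item);
    }
    if (Run.size() >= 2)
      Changed |= TryToVectorize(Run);
    return Changed;
  }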
static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts, TargetTransformInfo *TTI, bool MustMatchOrInst)
static cl::opt< bool > RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden, cl::desc("Run the SLP vectorization passes"))
static bool isAlternateInstruction(Instruction *I, Instruction *MainOp, Instruction *AltOp, const TargetLibraryInfo &TLI)
Checks if the specified instruction I is an alternate operation for the given MainOp and AltOp instru...
static FixedVectorType * getWidenedType(Type *ScalarTy, unsigned VF)
static bool doesNotNeedToSchedule(ArrayRef< Value * > VL)
Checks if the specified array of instructions does not require scheduling.
static bool isVectorLikeInstWithConstOps(Value *V)
Checks if V is one of vector-like instructions, i.e.
static cl::opt< bool > SplitAlternateInstructions("slp-split-alternate-instructions", cl::init(true), cl::Hidden, cl::desc("Improve the code quality by splitting alternate instructions"))
static bool isRepeatedNonIdentityClusteredMask(ArrayRef< int > Mask, unsigned Sz)
Checks if the given mask is a "clustered" mask with the same clusters of size Sz, which are not ident...
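As a rough illustration of the test described above, assuming "clustered" means every Sz-wide cluster repeats the first one, and the first cluster must not be the identity 0..Sz-1 (plain C++ sketch, not the in-tree code):

  #include <vector>

  bool isRepeatedNonIdentityClustered(const std::vector<int> &Mask,
                                      unsigned Sz) {
    if (Sz == 0 || Mask.size() % Sz != 0)
      return false;
    bool IsIdentity = true;
    for (unsigned I = 0; I < Sz; ++I)
      IsIdentity &= Mask[I] == (int)I; // is the first cluster the identity?
    if (IsIdentity)
      return false;                    // identity clusters do not count
    for (unsigned I = Sz; I < Mask.size(); ++I)
      if (Mask[I] != Mask[I % Sz])     // every cluster must repeat cluster 0
        return false;
    return true;
  }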
static bool isMaskedLoadCompress(ArrayRef< Value * > VL, ArrayRef< Value * > PointerOps, ArrayRef< unsigned > Order, const TargetTransformInfo &TTI, const DataLayout &DL, ScalarEvolution &SE, AssumptionCache &AC, const DominatorTree &DT, const TargetLibraryInfo &TLI, const function_ref< bool(Value *)> AreAllUsersVectorized, bool &IsMasked, unsigned &InterleaveFactor, SmallVectorImpl< int > &CompressMask, VectorType *&LoadVecTy)
Checks if the VL can be transformed to a (masked)load + compress or (masked) interleaved load.
static const unsigned MaxPHINumOperands
Maximum allowed number of operands in the PHI nodes.
static bool isUsedOutsideBlock(Value *V)
Checks if the provided value does not require scheduling.
static cl::opt< bool > VectorizeCopyableElements("slp-copyable-elements", cl::init(true), cl::Hidden, cl::desc("Try to replace values with the idempotent instructions for " "better vectorization."))
Enables vectorization of copyable elements.
static cl::opt< int > MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static bool allSameOpcode(ArrayRef< Value * > VL)
static InstructionCost canConvertToFMA(ArrayRef< Value * > VL, const InstructionsState &S, DominatorTree &DT, const DataLayout &DL, TargetTransformInfo &TTI, const TargetLibraryInfo &TLI)
Check if we can convert fadd/fsub sequence to FMAD.
static cl::opt< unsigned > MaxProfitableLoadStride("slp-max-stride", cl::init(8), cl::Hidden, cl::desc("The maximum stride, considered to be profitable."))
static bool isCommutative(Instruction *I, Value *ValWithUses, bool IsCopyable=false)
static bool findBuildAggregate(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, const BoUpSLP &R)
Recognize construction of vectors like ra = insertelement <4 x float> poison, float s0,...
static bool clusterSortPtrAccesses(ArrayRef< Value * > VL, ArrayRef< BasicBlock * > BBs, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
static unsigned getNumElements(Type *Ty)
static SmallBitVector buildUseMask(int VF, ArrayRef< int > Mask, UseMask MaskArg)
Prepares a use bitset for the given mask either for the first argument or for the second.
static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0, Value *Op1, const TargetLibraryInfo &TLI)
Checks if the provided operands of 2 cmp instructions are compatible, i.e.
static void reorderScalars(SmallVectorImpl< Value * > &Scalars, ArrayRef< int > Mask)
Reorders the list of scalars in accordance with the given Mask.
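For the mechanics of mask-driven reordering, here is a tiny sketch using the common gather convention Out[I] = In[Mask[I]], with -1 marking a poison lane (the in-tree helper may apply the mask in the inverse direction; this only illustrates the idea):

  #include <vector>

  std::vector<int> applyMask(const std::vector<int> &In,
                             const std::vector<int> &Mask) {
    std::vector<int> Out(Mask.size(), /*poison placeholder=*/0);
    for (size_t I = 0; I < Mask.size(); ++I)
      if (Mask[I] >= 0)            // -1 denotes an undefined lane
        Out[I] = In[Mask[I]];
    return Out;
  }
  // applyMask({10, 20, 30, 40}, {3, 2, 1, 0}) == {40, 30, 20, 10}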
static Value * createInsertVector(IRBuilderBase &Builder, Value *Vec, Value *V, unsigned Index, function_ref< Value *(Value *, Value *, ArrayRef< int >)> Generator={})
Creates a subvector insert.
static void findBuildAggregateRec(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, unsigned OperandOffset, const BoUpSLP &R)
static unsigned getNumElems(unsigned Size, unsigned PartNumElems, unsigned Part)
Returns correct remaining number of elements, considering total amount Size, (power-of-2 number) of e...
static InstructionCost getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={})
Returns the cost of the shuffle instructions with the given Kind, vector type Tp and optional Mask.
static bool isSimple(Instruction *I)
static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns true if widened type of Ty elements with size Sz represents full vector type,...
static const int MinScheduleRegionSize
If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling regions to be handled.
static cl::opt< unsigned > MinProfitableStridedLoads("slp-min-strided-loads", cl::init(2), cl::Hidden, cl::desc("The minimum number of loads, which should be considered strided, " "if the stride is > 1 or is runtime value"))
static bool isFirstInsertElement(const InsertElementInst *IE1, const InsertElementInst *IE2)
Checks if the IE1 instructions is followed by IE2 instruction in the buildvector sequence.
static cl::opt< int > LookAheadMaxDepth("slp-max-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for operand reordering scores"))
static cl::opt< unsigned > MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden, cl::desc("Maximum SLP vectorization factor (0=unlimited)"))
static void reorderReuses(SmallVectorImpl< int > &Reuses, ArrayRef< int > Mask)
Reorders the given Reuses mask according to the given Mask.
static void combineOrders(MutableArrayRef< unsigned > Order, ArrayRef< unsigned > SecondaryOrder)
static const unsigned MaxMemDepDistance
static cl::opt< bool > ViewSLPTree("view-slp-tree", cl::Hidden, cl::desc("Display the SLP trees with Graphviz"))
static const SCEV * calculateRtStride(ArrayRef< Value * > PointerOps, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
Checks if the provided list of pointers Pointers represents the strided pointers for type ElemTy.
static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, TargetLibraryInfo *TLI, const TargetTransformInfo *TTI)
static DebugLoc getDebugLocFromPHI(PHINode &PN)
static std::optional< unsigned > getExtractIndex(const Instruction *E)
static cl::opt< bool > VectorizeNonPowerOf2("slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden, cl::desc("Try to vectorize with non-power-of-2 number of elements."))
static cl::opt< unsigned > MinTreeSize("slp-min-tree-size", cl::init(3), cl::Hidden, cl::desc("Only vectorize small trees if they are fully vectorizable"))
static void reorderOrder(SmallVectorImpl< unsigned > &Order, ArrayRef< int > Mask, bool BottomOrder=false)
Reorders the given Order according to the given Mask.
static cl::opt< bool > ForceStridedLoads("slp-force-strided-loads", cl::init(false), cl::Hidden, cl::desc("Generate strided loads even if they are not " "profitable. Used for testing only."))
static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not less than Sz, which forms type,...
static bool isMainInstruction(Instruction *I, Instruction *MainOp, Instruction *AltOp, const TargetLibraryInfo &TLI)
Checks if the specified instruction I is a main operation for the given MainOp and AltOp instruction...
static T * performExtractsShuffleAction(MutableArrayRef< std::pair< T *, SmallVector< int > > > ShuffleMask, Value *Base, function_ref< unsigned(T *)> GetVF, function_ref< std::pair< T *, bool >(T *, ArrayRef< int >, bool)> ResizeAction, function_ref< T *(ArrayRef< int >, ArrayRef< T * >)> Action)
Does the analysis of the provided shuffle masks and performs the requested actions on the vectors wit...
static cl::opt< bool > ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions"))
static bool isConstant(Value *V)
static bool isSplat(ArrayRef< Value * > VL)
static cl::opt< int > SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, cl::desc("Only vectorize if you gain more than this " "number "))
static unsigned getPartNumElems(unsigned Size, unsigned NumParts)
Returns power-of-2 number of elements in a single register (part), given the total number of elements...
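A sketch of how such a part size can be computed: ceiling-divide the total element count by the number of parts, then round the result up to a power of two, capped at Size (assumed arithmetic matching the doc above, not the in-tree code):

  #include <cstdint>

  // Round up to the next power of two.
  uint64_t bitCeil(uint64_t X) {
    uint64_t P = 1;
    while (P < X)
      P <<= 1;
    return P;
  }

  // Elements per register part: ceil(Size / NumParts), rounded up to a
  // power of two, but never more than Size itself.
  unsigned partNumElems(unsigned Size, unsigned NumParts) {
    uint64_t Ceil = (Size + NumParts - 1) / NumParts;
    uint64_t Pow2 = bitCeil(Ceil);
    return (unsigned)(Pow2 < Size ? Pow2 : Size);
  }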
static bool allConstant(ArrayRef< Value * > VL)
static constexpr int UsesLimit
static std::optional< unsigned > getElementIndex(const Value *Inst, unsigned Offset=0)
static bool isReductionCandidate(Instruction *I)
Returns true if I is a candidate instruction for reduction vectorization.
static bool checkTreeSizes(ArrayRef< std::pair< unsigned, unsigned > > Sizes, bool First)
Checks if the quadratic mean deviation is less than 90% of the mean size.
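Spelled out, the criterion compares the root-mean-square deviation of the tree sizes against 90% of their mean:

  #include <cmath>
  #include <vector>

  // True if the RMS deviation of Sizes is below 90% of their mean,
  // i.e. the sizes are reasonably uniform.
  bool sizesAreUniform(const std::vector<double> &Sizes) {
    if (Sizes.empty())
      return true;
    double Mean = 0.0;
    for (double S : Sizes)
      Mean += S;
    Mean /= Sizes.size();
    double Var = 0.0;
    for (double S : Sizes)
      Var += (S - Mean) * (S - Mean);
    double Dev = std::sqrt(Var / Sizes.size());
    return Dev < 0.9 * Mean;
  }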
static unsigned getShufflevectorNumGroups(ArrayRef< Value * > VL)
static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI, const TargetLibraryInfo &TLI)
static cl::opt< bool > SLPSkipEarlyProfitabilityCheck("slp-skip-early-profitability-check", cl::init(false), cl::Hidden, cl::desc("When true, SLP vectorizer bypasses profitability checks based on " "heuristics and makes vectorization decision via cost modeling."))
static std::pair< size_t, size_t > generateKeySubkey(Value *V, const TargetLibraryInfo *TLI, function_ref< hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator, bool AllowAlternate)
Generates key/subkey pair for the given value to provide effective sorting of the values and better d...
static cl::opt< bool > ShouldStartVectorizeHorAtStore("slp-vectorize-hor-store", cl::init(false), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions feeding into a store"))
static unsigned getNumberOfPotentiallyCommutativeOps(Instruction *I)
static InstructionCost getScalarizationOverhead(const TargetTransformInfo &TTI, Type *ScalarTy, VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={})
This is similar to TargetTransformInfo::getScalarizationOverhead, but if ScalarTy is a FixedVectorTyp...
static bool buildCompressMask(ArrayRef< Value * > PointerOps, ArrayRef< unsigned > Order, Type *ScalarTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< int > &CompressMask)
Builds compress-like mask for shuffles for the given PointerOps, ordered with Order.
static std::pair< InstructionCost, InstructionCost > getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, ArrayRef< Type * > ArgTys)
Calculates the costs of vectorized intrinsic (if possible) and vectorized function (if possible) call...
static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements, SmallVectorImpl< int > &Mask)
static cl::opt< bool > SLPReVec("slp-revec", cl::init(false), cl::Hidden, cl::desc("Enable vectorization for wider vector utilization"))
static bool isValidForAlternation(unsigned Opcode)
static bool areAllOperandsNonInsts(Value *V)
Checks if the provided value does not require scheduling.
static SmallVector< Constant * > replicateMask(ArrayRef< Constant * > Val, unsigned VF)
Replicates the given Val VF times.
static SmallVector< Type * > buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID, const unsigned VF, unsigned MinBW, const TargetTransformInfo *TTI)
Builds the arguments types vector for the given call instruction with the given ID for the specified ...
static cl::opt< int > RootLookAheadMaxDepth("slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for searching best rooting option"))
static const unsigned AliasedCheckLimit
static Type * getValueType(Value *V)
Returns the type of the given value/instruction V.
static std::string shortBundleName(ArrayRef< Value * > VL, int Idx=-1)
Print a short descriptor of the instruction bundle suitable for debug output.
static LLVM_DUMP_METHOD void dumpOrder(const BoUpSLP::OrdersType &Order)
static bool isValidElementType(Type *Ty)
Predicate for the element types that the SLP vectorizer supports.
static Instruction * getReductionInstr(const DominatorTree *DT, PHINode *P, BasicBlock *ParentBB, LoopInfo *LI)
Try and get a reduction instruction from a phi node.
static SmallVector< int > calculateShufflevectorMask(ArrayRef< Value * > VL)
static bool doesNotNeedToBeScheduled(Value *V)
Checks if the specified value does not require scheduling.
static bool allSameType(ArrayRef< Value * > VL)
static MemoryLocation getLocation(Instruction *I)
static bool allSameBlock(ArrayRef< Value * > VL)
static unsigned getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not greater than Sz, which forms type,...
static bool areTwoInsertFromSameBuildVector(InsertElementInst *VU, InsertElementInst *V, function_ref< Value *(InsertElementInst *)> GetBaseOperand)
Check if two insertelement instructions are from the same buildvector.
static bool arePointersCompatible(Value *Ptr1, Value *Ptr2, const TargetLibraryInfo &TLI, bool CompareOpcodes=true)
static std::pair< InstructionCost, InstructionCost > getGEPCosts(const TargetTransformInfo &TTI, ArrayRef< Value * > Ptrs, Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind, Type *ScalarTy, VectorType *VecTy)
Calculate the scalar and the vector costs from vectorizing set of GEPs.
static SmallBitVector isUndefVector(const Value *V, const SmallBitVector &UseMask={})
Checks if the given value is actually an undefined constant vector.
static Instruction * findInstructionWithOpcode(ArrayRef< Value * > VL, unsigned Opcode)
Find an instruction with a specific opcode in VL.
static InstructionCost getExtractWithExtendCost(const TargetTransformInfo &TTI, unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput)
This is similar to TargetTransformInfo::getExtractWithExtendCost, but if Dst is a FixedVectorType,...
static InstructionsState getSameOpcode(ArrayRef< Value * > VL, const TargetLibraryInfo &TLI)
static cl::opt< int > ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden, cl::desc("Limit the size of the SLP scheduling region per block"))
Limits the size of scheduling regions in a block.
static std::pair< Instruction *, Instruction * > getMainAltOpsNoStateVL(ArrayRef< Value * > VL)
Returns main/alternate instructions for the given VL.
static Instruction * tryGetSecondaryReductionRoot(PHINode *Phi, Instruction *Root)
We could have an initial reduction that is not an add.
static RecurKind getRdxKind(Value *V)
Gets recurrence kind from the specified value.
static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1)
static void gatherPossiblyVectorizableLoads(const BoUpSLP &R, ArrayRef< Value * > VL, const DataLayout &DL, ScalarEvolution &SE, const TargetTransformInfo &TTI, SmallVectorImpl< SmallVector< std::pair< LoadInst *, int64_t > > > &GatheredLoads, bool AddNew=true)
Tries to find a subvector of loads and builds a new vector of only loads if it can be profitable.
static cl::opt< int > MinVectorRegSizeOption("slp-min-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static std::optional< TargetTransformInfo::ShuffleKind > isFixedVectorShuffle(ArrayRef< Value * > VL, SmallVectorImpl< int > &Mask, AssumptionCache *AC)
Checks if the vector of instructions can be represented as a shuffle, like: x0 = extractelement <4 x ...
static bool tryToVectorizeSequence(SmallVectorImpl< T * > &Incoming, function_ref< bool(T *, T *)> Comparator, function_ref< bool(ArrayRef< T * >, T *)> AreCompatible, function_ref< bool(ArrayRef< T * >, bool)> TryToVectorizeHelper, bool MaxVFOnly, BoUpSLP &R)
static std::optional< unsigned > getAggregateSize(Instruction *InsertInst)
static unsigned getNumberOfParts(const TargetTransformInfo &TTI, VectorType *VecTy, const unsigned Limit=std::numeric_limits< unsigned >::max())
Returns number of parts, the type VecTy will be split at the codegen phase.
static std::optional< unsigned > getInsertExtractIndex(const Value *Inst, unsigned Offset)
static cl::opt< unsigned > RecursionMaxDepth("slp-recursion-max-depth", cl::init(12), cl::Hidden, cl::desc("Limit the recursion depth when building a vectorizable tree"))
static bool tryToFindDuplicates(SmallVectorImpl< Value * > &VL, SmallVectorImpl< int > &ReuseShuffleIndices, const TargetTransformInfo &TTI, const TargetLibraryInfo &TLI, const InstructionsState &S, const BoUpSLP::EdgeInfo &UserTreeIdx, bool TryPad=false)
Checks that every instruction appears once in the list and if not, packs them, building ReuseShuffleI...
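The packing idea: keep each scalar once and record, per original lane, the index of its representative. A self-contained sketch of building such reuse indices (illustrative, not the in-tree helper):

  #include <map>
  #include <vector>

  // Packs Values down to their unique elements; the returned vector gives,
  // for each original lane, the position of its representative.
  std::vector<int> packWithReuseIndices(std::vector<int> &Values) {
    std::map<int, int> FirstPos; // value -> position in packed list
    std::vector<int> Packed;
    std::vector<int> ReuseIndices(Values.size());
    for (size_t I = 0; I < Values.size(); ++I) {
      auto [It, Inserted] =
          FirstPos.try_emplace(Values[I], (int)Packed.size());
      if (Inserted)
        Packed.push_back(Values[I]);
      ReuseIndices[I] = It->second;
    }
    Values = Packed;
    return ReuseIndices;
  }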
static Align computeCommonAlignment(ArrayRef< Value * > VL)
Calculates minimal alignment as a common alignment.
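Since a group of accesses is only guaranteed the weakest alignment among its members, the common alignment is simply the minimum over the group; sketched over plain byte alignments:

  #include <algorithm>
  #include <vector>

  // The common (guaranteed) alignment of a group of accesses is the
  // smallest individual alignment in the group.
  unsigned commonAlignment(const std::vector<unsigned> &Aligns) {
    if (Aligns.empty())
      return 1;
    return *std::min_element(Aligns.begin(), Aligns.end());
  }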
static void addMask(SmallVectorImpl< int > &Mask, ArrayRef< int > SubMask, bool ExtendingManyInputs=false)
Shuffles Mask in accordance with the given SubMask.
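Mask composition follows the usual rule NewMask[I] = Mask[SubMask[I]], with poison (-1) lanes propagated and an empty accumulated mask acting as identity; a sketch under those assumed conventions:

  #include <vector>

  // Compose Mask with SubMask: lane I of the result selects whatever
  // Mask[SubMask[I]] selected; -1 (poison) lanes stay poison. Assumes
  // sub-mask indices are in range of a non-empty Mask.
  std::vector<int> composeMasks(const std::vector<int> &Mask,
                                const std::vector<int> &SubMask) {
    std::vector<int> NewMask(SubMask.size(), -1);
    for (size_t I = 0; I < SubMask.size(); ++I)
      if (SubMask[I] >= 0)
        NewMask[I] = Mask.empty() ? SubMask[I] : Mask[SubMask[I]];
    return NewMask;
  }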
static void fixupOrderingIndices(MutableArrayRef< unsigned > Order)
Order may have elements assigned the special value (size), which is out of bounds.
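Such placeholder slots can be repaired by assigning each one an index that is not yet used, so the order becomes a proper permutation; an illustrative sketch:

  #include <vector>

  // Replace out-of-bounds placeholder entries (== Order.size()) with the
  // indices that are still unused, turning Order into a permutation.
  void fixupOrdering(std::vector<unsigned> &Order) {
    const unsigned Sz = Order.size();
    std::vector<bool> Used(Sz, false);
    for (unsigned V : Order)
      if (V < Sz)
        Used[V] = true;
    unsigned Next = 0;
    for (unsigned &V : Order)
      if (V >= Sz) {               // placeholder slot
        while (Used[Next])
          ++Next;
        V = Next;
        Used[Next] = true;
      }
  }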
static Value * createExtractVector(IRBuilderBase &Builder, Value *Vec, unsigned SubVecVF, unsigned Index)
Generates a subvector extract using a default shuffle.
static cl::opt< bool > DisableTreeReorder("slp-disable-tree-reorder", cl::init(false), cl::Hidden, cl::desc("Disable tree reordering even if it is " "profitable. Used for testing only."))
static Instruction * getNonPhiOperand(Instruction *I, PHINode *Phi)
Returns the first operand of I that does not match Phi.
static InstructionCost getVectorInstrCost(const TargetTransformInfo &TTI, Type *ScalarTy, unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Scalar, ArrayRef< std::tuple< Value *, User *, int > > ScalarUserAndIdx)
This is similar to TargetTransformInfo::getVectorInstrCost, but if ScalarTy is a FixedVectorType,...
static SmallBitVector getAltInstrMask(ArrayRef< Value * > VL, Type *ScalarTy, unsigned Opcode0, unsigned Opcode1)
static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI, const DominatorTree &DT)
Compare two cmp instructions.
static void inversePermutation(ArrayRef< unsigned > Indices, SmallVectorImpl< int > &Mask)
static bool isReverseOrder(ArrayRef< unsigned > Order)
Check if Order represents reverse order.
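Inverting a permutation swaps position and value: if Indices sends lane I to Indices[I], the inverse mask satisfies Mask[Indices[I]] = I; the reverse-order check is then a one-liner. A sketch:

  #include <vector>

  // Inverse of a permutation: Mask[Indices[I]] = I.
  std::vector<int> inversePerm(const std::vector<unsigned> &Indices) {
    std::vector<int> Mask(Indices.size(), -1);
    for (unsigned I = 0; I < Indices.size(); ++I)
      Mask[Indices[I]] = (int)I;
    return Mask;
  }

  // Order is a reverse order iff Order[I] == Size - 1 - I for all I.
  bool isReverse(const std::vector<unsigned> &Order) {
    const unsigned Sz = Order.size();
    for (unsigned I = 0; I < Sz; ++I)
      if (Order[I] != Sz - 1 - I)
        return false;
    return true;
  }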
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
This file defines generic set operations that may be used on set's of different types,...
This file implements a set that has insertion order iteration characteristics.
This file implements the SmallBitVector class.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallString class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Merges shuffle masks and emits final shuffle instruction, if required.
InstructionCost finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &, function_ref< Value *(Value *, Value *, ArrayRef< int >)>)> Action={})
Finalize emission of the shuffles.
void add(const TreeEntry &E1, ArrayRef< int > Mask)
ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI, ArrayRef< Value * > VectorizedVals, BoUpSLP &R, SmallPtrSetImpl< Value * > &CheckedExtracts)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
std::optional< InstructionCost > needToDelay(const TreeEntry *, ArrayRef< SmallVector< const TreeEntry * > >) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
InstructionCost createFreeze(InstructionCost Cost)
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
void resetForSameNode()
Reset the builder to handle perfect diamond match.
void add(Value *V1, ArrayRef< int > Mask, bool ForExtracts=false)
Adds one more input vector and the mask for its shuffling.
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
Merges shuffle masks and emits final shuffle instruction, if required.
Value * createFreeze(Value *V)
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
void resetForSameNode()
Reset the builder to handle perfect diamond match.
void addOrdered(Value *V1, ArrayRef< unsigned > Order)
Adds one more input vector and the mask for its shuffling.
Value * finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &, function_ref< Value *(Value *, Value *, ArrayRef< int >)>)> Action={})
Finalize emission of the shuffles.
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
Adjusts extractelements after reusing them.
void add(Value *V1, ArrayRef< int > Mask, bool=false)
Adds one more input vector and the mask for its shuffling.
std::optional< Value * > needToDelay(const TreeEntry *E, ArrayRef< SmallVector< const TreeEntry * > > Deps) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
~ShuffleInstructionBuilder()
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Adds single input vector (in form of tree entry) and the mask for its shuffling.
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
Adds 2 input vectors (in form of tree entries) and the mask for their shuffling.
ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
A manager for alias analyses.
Class for arbitrary precision integers.
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
void clearBit(unsigned BitPosition)
Set a given bit to 0.
uint64_t getZExtValue() const
Get zero extended value.
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
LLVM_ABI APInt urem(const APInt &RHS) const
Unsigned remainder operation.
unsigned getBitWidth() const
Return the number of bits in the APInt.
bool ult(const APInt &RHS) const
Unsigned less than comparison.
void clearAllBits()
Set every bit to 0.
void negate()
Negate this APInt in place.
unsigned logBase2() const
void setAllBits()
Set every bit to 1.
void setBits(unsigned loBit, unsigned hiBit)
Set the bits from loBit (inclusive) to hiBit (exclusive) to 1.
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
bool isOne() const
Determine if this is a value of 1.
static APInt getBitsSetFrom(unsigned numBits, unsigned loBit)
Constructs an APInt value that has a contiguous range of bits set.
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
PassT::Result * getCachedResult(IRUnitT &IR) const
Get the cached result of an analysis pass for a given IR unit.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
bool equals(ArrayRef RHS) const
equals - Check for element-wise equality.
const T & back() const
back - Get the last element.
ArrayRef< T > take_front(size_t N=1) const
Return a copy of *this with only the first N elements.
ArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
const T & front() const
front - Get the first element.
size_t size() const
size - Get the array size.
bool empty() const
empty - Check if the array is empty.
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
const T & consume_front()
consume_front() - Returns the first element and drops it from ArrayRef.
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
LLVM Basic Block Representation.
iterator begin()
Instruction iterator methods.
const Function * getParent() const
Return the enclosing method, or null if none.
InstListType::reverse_iterator reverse_iterator
InstListType::iterator iterator
Instruction iterators...
LLVM_ABI const_iterator getFirstNonPHIOrDbgOrAlloca() const
Returns an iterator to the first instruction in this block that is not a PHINode, a debug intrinsic,...
InstListType::const_reverse_iterator const_reverse_iterator
bool isEHPad() const
Return true if this basic block is an exception handling block.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Represents analyses that only rely on functions' control flow.
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
unsigned getBundleOperandsEndIndex() const
Return the index of the last bundle operand in the Use array.
LLVM_ABI void getOperandBundlesAsDefs(SmallVectorImpl< OperandBundleDef > &Defs) const
Return the list of operand bundles attached to this instruction as a vector of OperandBundleDefs.
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
bool hasIdenticalOperandBundleSchema(const CallBase &Other) const
Return true if Other has the same sequence of operand bundle tags with the same number of operands on...
unsigned getBundleOperandsStartIndex() const
Return the index of the first bundle operand in the Use array.
Value * getArgOperand(unsigned i) const
FunctionType * getFunctionType() const
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
unsigned arg_size() const
bool hasOperandBundles() const
Return true if this User has any operand bundles.
This class represents a function call, abstracting a target machine's calling convention.
This is the base class for all instructions that perform data casts.
This class is the base class for the comparison instructions.
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
@ ICMP_SLT
signed less than
@ ICMP_SLE
signed less or equal
@ ICMP_UGE
unsigned greater or equal
@ ICMP_UGT
unsigned greater than
@ ICMP_SGT
signed greater than
@ ICMP_ULT
unsigned less than
@ ICMP_SGE
signed greater or equal
@ ICMP_ULE
unsigned less or equal
Predicate getSwappedPredicate() const
For example, EQ->EQ, SLE->SGE, ULT->UGT, OEQ->OEQ, ULE->UGE, OLT->OGT, etc.
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Predicate getPredicate() const
Return the predicate for this instruction.
static LLVM_ABI Constant * getIntToPtr(Constant *C, Type *Ty, bool OnlyIfReduced=false)
static LLVM_ABI Constant * getBinOpIdentity(unsigned Opcode, Type *Ty, bool AllowRHSConstant=false, bool NSZ=false)
Return the identity constant for a binary opcode.
This is the shared class of boolean and integer constants.
static LLVM_ABI ConstantInt * getTrue(LLVMContext &Context)
static LLVM_ABI ConstantInt * getFalse(LLVMContext &Context)
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
const APInt & getValue() const
Return the constant as an APInt value reference.
static LLVM_ABI Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
static LLVM_ABI Constant * get(ArrayRef< Constant * > V)
This is an important base class in LLVM.
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
static bool shouldExecute(unsigned CounterName)
static DebugLoc getUnknown()
An analysis that produces DemandedBits for a function.
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
iterator find(const_arg_type_t< KeyT > Val)
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
bool erase(const KeyT &Val)
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
const ValueT & at(const_arg_type_t< KeyT > Val) const
at - Return the entry for the specified key, or abort if no such entry exists.
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Implements a dense probed hash-table based set.
Base class for the actual dominator tree node.
unsigned getDFSNumIn() const
getDFSNumIn/getDFSNumOut - These return the DFS visitation order for nodes in the dominator tree.
Analysis pass which computes a DominatorTree.
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
LLVM_ABI bool isReachableFromEntry(const Use &U) const
Provide an overload for a Use.
LLVM_ABI bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
static constexpr ElementCount getFixed(ScalarTy MinVal)
Convenience struct for specifying and reasoning about fast-math flags.
bool allowReassoc() const
Flag queries.
bool allowContract() const
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
ArrayRef< Type * > params() const
Type * getReturnType() const
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
For the node iterator we just need to turn the TreeEntry iterator into a TreeEntry* iterator so that ...
nodes_iterator operator++()
nodes_iterator(const ItTy &It2)
bool operator!=(const nodes_iterator &N2) const
Common base class shared among various IRBuilders.
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
IntegerType * getInt1Ty()
Fetch the type representing a single bit.
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Value * CreateFreeze(Value *V, const Twine &Name="")
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
LLVM_ABI Value * CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 2 operands which is mangled on the first type.
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
LLVM_ABI CallInst * CreateUnaryIntrinsic(Intrinsic::ID ID, Value *V, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 1 operand which is mangled on its type.
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
LLVM_ABI Value * CreateSelectWithUnknownProfile(Value *C, Value *True, Value *False, StringRef PassName, const Twine &Name="")
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Value * CreateFMul(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
This instruction inserts a single (scalar) element into a VectorType value.
VectorType * getType() const
Overload to return most specific vector type.
static InstructionCost getInvalid(CostType Val=0)
bool mayReadOrWriteMemory() const
Return true if this instruction may read or write memory.
LLVM_ABI bool mayWriteToMemory() const LLVM_READONLY
Return true if this instruction may modify memory.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
LLVM_ABI void moveAfter(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
LLVM_ABI bool isCommutative() const LLVM_READONLY
Return true if the instruction is commutative:
LLVM_ABI bool comesBefore(const Instruction *Other) const
Given an instruction Other in the same basic block as this instruction, return true if this instructi...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
LLVM_ABI bool isIdenticalTo(const Instruction *I) const LLVM_READONLY
Return true if the specified instruction is exactly identical to the current one.
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
const SmallVectorImpl< Type * > & getArgTypes() const
An instruction for reading from memory.
Value * getPointerOperand()
Analysis pass that exposes the LoopInfo for a function.
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
Represents a single loop in the control flow graph.
This class implements a map that also provides access to all stored values in a deterministic order.
VectorType takeVector()
Clear the MapVector and return the underlying vector.
iterator find(const KeyT &Key)
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
ValueT lookup(const KeyT &Key) const
std::pair< KeyT, ValueT > & front()
This is the common base class for memset/memcpy/memmove.
Representation for a specific memory location.
static LLVM_ABI MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
const Value * Ptr
The address of the start of the location.
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
T & front() const
front - Get the first element.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Value * getIncomingValueForBlock(const BasicBlock *BB) const
BasicBlock * getIncomingBlock(unsigned i) const
Return incoming basic block number i.
unsigned getNumIncomingValues() const
Return the number of incoming edges.
Pass interface - Implemented by all 'passes'.
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
A discriminated union of two or more pointer types, with the discriminator in the low bit of the poin...
bool isNull() const
Test if the pointer held in the union is null, regardless of which type it is.
T dyn_cast() const
Returns the current pointer if it is of the specified pointer type, otherwise returns null.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
PriorityQueue - This class behaves like std::priority_queue and provides a few additional convenience...
unsigned getOpcode() const
static bool isIntMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is an integer min/max kind.
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
This class represents an analyzed expression in the program.
LLVM_ABI bool isZero() const
Return true if the expression is a constant zero.
LLVM_ABI bool isNonConstantNegative() const
Return true if the specified scev is negated, but not a constant.
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
LLVM_ABI const SCEV * getConstant(ConstantInt *V)
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
LLVM_ABI const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
LLVM_ABI const SCEV * getUDivExactExpr(const SCEV *LHS, const SCEV *RHS)
Get a canonical unsigned division expression, or something simpler if possible.
LLVM_ABI const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
ArrayRef< value_type > getArrayRef() const
size_type size() const
Determine the number of elements in the SetVector.
const value_type & front() const
Return the first element of the SetVector.
void insert_range(Range &&R)
Vector takeVector()
Clear the SetVector and return the underlying vector.
void clear()
Completely clear the SetVector.
bool empty() const
Determine if the SetVector is empty or not.
bool insert(const value_type &X)
Insert a new element into the SetVector.
bool contains(const key_type &key) const
Check if the SetVector contains the given key.
static LLVM_ABI bool isZeroEltSplatMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses all elements with the same value as the first element of exa...
static LLVM_ABI bool isOneUseSingleSourceMask(ArrayRef< int > Mask, int VF)
Return true if this shuffle mask represents "clustered" mask of size VF, i.e.
static LLVM_ABI bool isDeInterleaveMaskOfFactor(ArrayRef< int > Mask, unsigned Factor, unsigned &Index)
Check if the mask is a DE-interleave mask of the given factor Factor like: <Index,...
static LLVM_ABI bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
static LLVM_ABI bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static LLVM_ABI bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
static LLVM_ABI bool isInsertSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &NumSubElts, int &Index)
Return true if this shuffle mask is an insert subvector mask.
static LLVM_ABI bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
int find_first() const
Returns the index of the first set bit, -1 if none of the bits are set.
bool test(unsigned Idx) const
int find_next(unsigned Prev) const
Returns the index of the next set bit following the "Prev" bit.
bool all() const
Returns true if all bits are set.
size_type size() const
Returns the number of bits in this bitvector.
bool any() const
Returns true if any bit is set.
size_type count() const
Returns the number of bits which are set.
bool none() const
Returns true if none of the bits are set.
Implements a dense probed hash-table based set with some number of buckets stored inline.
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
bool erase(PtrType Ptr)
Remove pointer from the set.
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
void insert_range(Range &&R)
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
A SetVector that performs no allocations if smaller than a certain size.
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
bool contains(const T &V) const
Check if the SmallSet contains the given element.
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void assign(size_type NumElts, ValueParamT Elt)
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
iterator erase(const_iterator CI)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void swap(SmallVectorImpl &RHS)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
Type * getPointerOperandType() const
Value * getValueOperand()
Value * getPointerOperand()
StringRef - Represent a constant reference to a string, i.e.
TargetFolder - Create constants with target dependent folding.
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
The instances of the Type class are immutable: once they are created, they are never changed.
LLVM_ABI bool isEmptyTy() const
Return true if this type is empty, that is, it has no elements or all of its elements are empty.
bool isVectorTy() const
True if this is an instance of VectorType.
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
bool isPointerTy() const
True if this is an instance of PointerType.
LLVM_ABI unsigned getStructNumElements() const
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
bool isSingleValueType() const
Return true if the type is a valid type for a register in codegen.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
LLVM_ABI Type * getWithNewType(Type *EltTy) const
Given vector type, change the element type, whilst keeping the old number of elements.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
LLVM_ABI void print(raw_ostream &O, bool IsForDebug=false, bool NoDetails=false) const
Print the current type.
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
bool isIntegerTy() const
True if this is an instance of IntegerType.
TypeID getTypeID() const
Return the type id for the type.
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
bool isVoidTy() const
Return true if this is 'void'.
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
A Use represents the edge between a Value definition and its users.
User(Type *ty, unsigned vty, AllocInfo AllocInfo)
LLVM_ABI bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
Value * getOperand(unsigned i) const
unsigned getNumOperands() const
iterator_range< value_op_iterator > operand_values()
The Vector Function Database.
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
user_iterator user_begin()
bool hasOneUse() const
Return true if there is exactly one use of this value.
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
iterator_range< user_iterator > users()
unsigned getValueID() const
Return an ID for the concrete type of this object.
LLVM_ABI bool hasNUsesOrMore(unsigned N) const
Return true if this value has N uses or more.
LLVM_ABI bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
LLVM_ABI LLVMContext & getContext() const
All values hold a context through their type.
LLVM_ABI unsigned getNumUses() const
This method computes the number of uses of this Value.
iterator_range< use_iterator > uses()
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Base class of all SIMD vector types.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
static LLVM_ABI bool isValidElementType(Type *ElemTy)
Return true if the specified type is valid as a element type.
Type * getElementType() const
std::pair< iterator, bool > insert(const ValueT &V)
iterator find(const_arg_type_t< ValueT > V)
void insert_range(Range &&R)
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
An efficient, type-erasing, non-owning reference to a callable.
An opaque object representing a hash code.
const ParentTy * getParent() const
self_iterator getIterator()
NodeTy * getNextNode()
Get the next node, or nullptr for the list tail.
SmallVector< BoUpSLP::EdgeInfo, 1 >::iterator I
iterator_adaptor_base()=default
A range adaptor for a pair of iterators.
This class implements an extremely fast bulk output stream that can only output to a stream.
raw_ostream & indent(unsigned NumSpaces)
indent - Insert 'NumSpaces' spaces.
A raw_ostream that writes to an std::string.
A raw_ostream that writes to an SmallVector or SmallString.
A helper class used for scoring candidates for two consecutive lanes.
static const int ScoreReversedLoads
Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
static const int ScoreFail
Score for failing to find a decent match.
static const int ScoreUndef
Matching with an undef is preferable to failing.
int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2, ArrayRef< Value * > MainAltOps) const
static const int ScoreConsecutiveExtracts
ExtractElementInst from same vector and consecutive indexes.
int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1, Instruction *U2, int CurrLevel, ArrayRef< Value * > MainAltOps) const
Go through the operands of LHS and RHS recursively until MaxLevel, and return the cumulative score.
static const int ScoreReversedExtracts
ExtractElementInst from same vector and reversed indices.
LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R, int NumLanes, int MaxLevel)
static const int ScoreSplat
Identical instructions (a.k.a. splat or broadcast).
static const int ScoreConstants
Constants.
static const int ScoreMaskedGatherCandidate
A load candidate for masked gather.
static const int ScoreSplatLoads
The same load multiple times.
static const int ScoreAllUserVectorized
Score if all users are vectorized.
static const int ScoreSameOpcode
Instructions with the same opcode.
static const int ScoreAltOpcodes
Instructions with alt opcodes (e.g, add + sub).
static const int ScoreConsecutiveLoads
Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
A helper data structure to hold the operands of a vector of instructions.
LLVM_DUMP_METHOD raw_ostream & print(raw_ostream &OS) const
static LLVM_DUMP_METHOD void dumpMode(ReorderingMode RMode)
Debug print.
LLVM_DUMP_METHOD void dump() const
Debug print.
static LLVM_DUMP_METHOD raw_ostream & printMode(ReorderingMode RMode, raw_ostream &OS)
friend raw_ostream & operator<<(raw_ostream &OS, ReorderingMode RMode)
ValueList getVL(unsigned OpIdx) const
Returns a value vector with the operands across all lanes for the operand at OpIdx.
static LLVM_DUMP_METHOD StringRef getModeStr(ReorderingMode RMode)
VLOperands(ArrayRef< Value * > RootVL, ArrayRef< ValueList > Operands, const InstructionsState &S, const BoUpSLP &R)
Initialize with all the operands of the instruction vector RootVL.
Bottom Up SLP Vectorizer.
static bool isIdentityOrder(ArrayRef< unsigned > Order)
Does this non-empty order represent an identity order?
bool isProfitableToReorder() const
Checks if it is profitable to reorder the current tree.
void eraseInstruction(Instruction *I)
Removes an instruction from its block and eventually deletes it.
bool isAnalyzedReductionRoot(Instruction *I) const
Checks if the instruction was already analyzed for being a possible reduction root.
bool areAnalyzedReductionVals(ArrayRef< Value * > VL) const
Checks if the provided list of reduced values was checked already for vectorization.
LoadsState
Tracks the way the loads in the given sequence can be represented.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::ScheduleEntity &SE)
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::ScheduleBundle &Bundle)
void analyzedReductionVals(ArrayRef< Value * > VL)
Adds the list of reduced values to the list of values already checked for vectorization.
std::optional< OrdersType > findPartiallyOrderedLoads(const TreeEntry &TE)
Sort loads into increasing pointer offsets to allow greater clustering.
unsigned getMaxVecRegSize() const
OptimizationRemarkEmitter * getORE()
void reorderTopToBottom()
Reorders the current graph to the most profitable order starting from the root node to the leaf nodes...
std::optional< std::pair< Type *, bool > > getRootNodeTypeWithNoCast() const
Returns the type/is-signed info for the root node in the graph without casting.
unsigned getTreeSize() const
void reorderBottomToTop(bool IgnoreReorder=false)
Reorders the current graph to the most profitable order starting from leaves to the root.
InstructionCost getSpillCost()
InstructionCost getTreeCost(ArrayRef< Value * > VectorizedVals={}, InstructionCost ReductionCost=TTI::TCC_Free)
bool isVectorized(const Value *V) const
Check if the value is vectorized in the tree.
bool isAnyGathered(const SmallDenseSet< Value * > &Vals) const
Checks if the given value is gathered in one of the nodes.
unsigned getCanonicalGraphSize() const
Returns the base graph size, before any transformations.
bool isStridedLoad(ArrayRef< Value * > PointerOps, Type *ScalarTy, Align Alignment, const int64_t Diff, const size_t Sz) const
Checks if strided loads can be generated out of VL loads with pointers PointerOps:
SmallVector< StoreInst *, 8 > StoreList
bool isLoadCombineCandidate(ArrayRef< Value * > Stores) const
Assume that a vector of stores of bitwise-or/shifted/zexted loaded values can be load combined in the...
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
ArrayRef< Value * > getRootNodeScalars() const
Return the scalars of the root node.
unsigned getMinVecRegSize() const
bool isLoadCombineReductionCandidate(RecurKind RdxKind) const
Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values can be load combined in the ...
unsigned getVectorElementSize(Value *V)
unsigned getMinVF(unsigned Sz) const
void clearReductionData()
Clear the list of the analyzed reduction root instructions.
LoadsState canVectorizeLoads(ArrayRef< Value * > VL, const Value *VL0, SmallVectorImpl< unsigned > &Order, SmallVectorImpl< Value * > &PointerOps, StridedPtrInfo &SPtrInfo, unsigned *BestVF=nullptr, bool TryRecursiveCheck=true) const
Checks if the given array of loads can be represented as a vectorized load, a scatter, or just a simple gather.
void computeMinimumValueSizes()
Compute the minimum type sizes required to represent the entries in a vectorizable tree.
FixedVectorType * getReductionType() const
Returns the reduction type after min-bitwidth analysis.
SmallVector< unsigned, 4 > OrdersType
SmallVector< Instruction *, 16 > InstrList
void analyzedReductionRoot(Instruction *I)
Registers the given instruction as already analyzed for being a possible reduction root.
bool isDeleted(Instruction *I) const
Checks if the instruction is marked for deletion.
void transformNodes()
Transforms graph nodes to target specific representations, if profitable.
bool analyzeRtStrideCandidate(ArrayRef< Value * > PointerOps, Type *ScalarTy, Align CommonAlignment, SmallVectorImpl< unsigned > &SortedIndices, StridedPtrInfo &SPtrInfo) const
Return true if an array of scalar loads can be replaced with a strided load (with run-time stride).
bool isSignedMinBitwidthRootNode() const
Checks if the root graph node can be emitted with a narrower bitwidth at codegen and returns its signedn...
void buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues={})
Builds external uses of the vectorized scalars, i.e.
bool isTreeTinyAndNotFullyVectorizable(bool ForReduction=false) const
void registerNonVectorizableLoads(ArrayRef< T * > VL)
Registers a non-vectorizable sequence of loads.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::ScheduleData &SD)
unsigned canMapToVector(Type *T) const
Check if a homogeneous aggregate is isomorphic to some VectorType.
void deleteTree()
Clear the internal data structures that are created by 'buildTree'.
void buildTree(ArrayRef< Value * > Roots, const SmallDenseSet< Value * > &UserIgnoreLst)
Construct a vectorizable tree that starts at Roots, ignoring users for the purpose of scheduling and ...
SmallDenseSet< Value *, 4 > ExtraValueToDebugLocsMap
bool isTreeNotExtendable() const
Checks if the graph and all its subgraphs cannot be better vectorized.
bool areKnownNonVectorizableLoads(ArrayRef< T * > VL) const
Checks if the given load sequence is known to be non-vectorizable.
SmallVector< Value *, 8 > ValueList
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB, const DataLayout *DL, OptimizationRemarkEmitter *ORE)
bool isGathered(const Value *V) const
Checks if the given value is gathered in one of the nodes.
std::optional< OrdersType > findReusedOrderedScalars(const TreeEntry &TE, bool TopToBottom, bool IgnoreReorder)
Checks if the specified gather tree entry TE can be represented as a shuffled vector entry + (possibl...
bool analyzeConstantStrideCandidate(const ArrayRef< Value * > PointerOps, Type *ElemTy, Align Alignment, const SmallVectorImpl< unsigned > &SortedIndices, const int64_t Diff, Value *Ptr0, Value *PtrN, StridedPtrInfo &SPtrInfo) const
Return true if an array of scalar loads can be replaced with a strided load (with constant stride).
bool isNotScheduled(const Value *V) const
Checks if the specified value was not scheduled.
Value * vectorizeTree()
Vectorize the tree that starts with the elements in VL.
std::optional< int > findBestRootPair(ArrayRef< std::pair< Value *, Value * > > Candidates, int Limit=LookAheadHeuristics::ScoreFail) const
Evaluate each pair in Candidates and return an index into Candidates for the pair which has the highest score...
std::optional< OrdersType > getReorderingData(const TreeEntry &TE, bool TopToBottom, bool IgnoreReorder)
Gets reordering data for the given tree entry.
SmallPtrSet< Value *, 16 > ValueSet
void optimizeGatherSequence()
Perform LICM and CSE on the newly generated gather sequences.
void removeInstructionsAndOperands(ArrayRef< T * > DeadVals, ArrayRef< std::tuple< Value *, unsigned, bool > > VectorValuesAndScales)
Remove instructions from the parent function and clear the operands of DeadVals instructions,...
Function * getVectorizedFunction(const VFShape &Shape) const
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
@ C
The default llvm calling convention, compatible with C.
@ BasicBlock
Various leaf nodes.
bool isBitwiseLogicOp(unsigned Opcode)
Whether this is a bitwise logic opcode.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
TwoOps_match< ValueOpTy, PointerOpTy, Instruction::Store > m_Store(const ValueOpTy &ValueOp, const PointerOpTy &PointerOp)
Matches StoreInst.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
ap_match< APInt > m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMaxNum(const Opnd0 &Op0, const Opnd1 &Op1)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMinimum(const Opnd0 &Op0, const Opnd1 &Op1)
MaxMin_match< ICmpInst, LHS, RHS, smin_pred_ty > m_SMin(const LHS &L, const RHS &R)
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMaximum(const Opnd0 &Op0, const Opnd1 &Op1)
BinaryOp_match< LHS, RHS, Instruction::FAdd > m_FAdd(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
MaxMin_match< ICmpInst, LHS, RHS, umax_pred_ty > m_UMax(const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignores it.
MaxMin_match< ICmpInst, LHS, RHS, smax_pred_ty > m_SMax(const LHS &L, const RHS &R)
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
auto m_Undef()
Match an arbitrary undef constant.
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMinNum(const Opnd0 &Op0, const Opnd1 &Op1)
BinaryOp_match< LHS, RHS, Instruction::Or > m_Or(const LHS &L, const RHS &R)
MatchFunctor< Val, Pattern > match_fn(const Pattern &P)
A match functor that can be used as a UnaryPredicate in functional algorithms like all_of.
MaxMin_match< ICmpInst, LHS, RHS, umin_pred_ty > m_UMin(const LHS &L, const RHS &R)
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
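As a hedged illustration of how these matchers compose, here is a standalone sketch (not code from this pass; the helper name isShlByConstOfLoad is hypothetical):

#include "llvm/IR/PatternMatch.h"
using namespace llvm;
using namespace llvm::PatternMatch;

// Matches (shl (load p), C) where C is any constant-int shift amount and
// the load has a single use; binds the loaded pointer and the APInt.
static bool isShlByConstOfLoad(Value *V) {
  Value *LoadedPtr;
  const APInt *ShAmt;
  return match(V, m_Shl(m_OneUse(m_Load(m_Value(LoadedPtr))),
                        m_APInt(ShAmt)));
}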
initializer< Ty > init(const Ty &Val)
unsigned combineHashValue(unsigned a, unsigned b)
Simplistic combination of 32-bit hash values into 32-bit hash values.
@ User
could "use" a pointer
DiagnosticInfoOptimizationBase::Argument NV
friend class Instruction
Iterator for Instructions in a BasicBlock.
LLVM_ABI iterator begin() const
LLVM_ABI Instruction & front() const
A private "module" namespace for types and utilities used by this pass.
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
LLVM_ABI Value * createSimpleReduction(IRBuilderBase &B, Value *Src, RecurKind RdxKind)
Create a reduction of the given vector.
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
void stable_sort(R &&Range)
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
void fill(R &&Range, T &&Value)
Provide wrappers to std::fill which take ranges instead of having to pass begin/end explicitly.
UnaryFunction for_each(R &&Range, UnaryFunction F)
Provide wrappers to std::for_each which take ranges instead of having to pass begin/end explicitly.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
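A small sketch of these range wrappers in the style used throughout this file (standalone; allLoadsSimple is a hypothetical helper):

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

// all_of/any_of/none_of take the range directly instead of begin()/end().
static bool allLoadsSimple(ArrayRef<Value *> VL) {
  return all_of(VL, [](Value *V) {
    auto *LI = dyn_cast<LoadInst>(V);
    return LI && LI->isSimple();
  });
}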
hash_code hash_value(const FixedPointSemantics &Val)
LLVM_ABI Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
MaybeAlign getAlign(const CallInst &I, unsigned Index)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
LLVM_ABI bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
LLVM_ABI Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
detail::scope_exit< std::decay_t< Callable > > make_scope_exit(Callable &&F)
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
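For example (a standalone sketch with made-up data):

#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
using namespace llvm;

static int iterDemo() {
  SmallVector<int, 4> A = {10, 20, 30};
  SmallVector<char, 4> B = {'x', 'y', 'z'};
  int Sum = 0;
  // enumerate pairs each element with its index: (0,10), (1,20), (2,30).
  for (auto [Idx, Val] : enumerate(A))
    Sum += static_cast<int>(Idx) * Val;
  // zip walks both ranges in lockstep: (10,'x'), (20,'y'), (30,'z').
  for (auto [AV, BV] : zip(A, B))
    Sum += AV + BV;
  return Sum;
}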
auto pred_end(const MachineBasicBlock *BB)
void set_intersect(S1Ty &S1, const S2Ty &S2)
set_intersect(A, B) - Compute A := A ^ B. Identical to set_intersection, except that it works on set<>...
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
LLVM_ABI bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
LLVM_ABI void salvageDebugInfo(const MachineRegisterInfo &MRI, MachineInstr &MI)
Assuming the instruction MI is going to be deleted, attempt to salvage debug users of MI by writing t...
constexpr from_range_t from_range
LLVM_ABI std::pair< Intrinsic::ID, bool > canConvertToMinOrMaxIntrinsic(ArrayRef< Value * > VL)
Check if the values in VL are select instructions that can be converted to a min or max (vector) intr...
auto dyn_cast_if_present(const Y &Val)
dyn_cast_if_present<X> - Functionally identical to dyn_cast, except that a null (or none in the case ...
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
bool set_is_subset(const S1Ty &S1, const S2Ty &S2)
set_is_subset(A, B) - Return true iff A is contained in B.
void interleaveComma(const Container &c, StreamT &os, UnaryFunctor each_fn)
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
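Typical use (a minimal standalone sketch; dropDeadInstrs is hypothetical):

#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;

// Erasing I would invalidate a plain iterator; the early-inc range has
// already advanced past I before the loop body runs, so erase is safe.
static void dropDeadInstrs(BasicBlock &BB) {
  for (Instruction &I : make_early_inc_range(BB))
    if (isInstructionTriviallyDead(&I))
      I.eraseFromParent();
}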
auto cast_or_null(const Y &Val)
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer that is less than or equal to Value and is congruent to Skew modulo Align.
iterator_range< po_iterator< T > > post_order(const T &G)
LLVM_ABI bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr, bool UseVariableInfo=true, bool IgnoreUBImplyingAttrs=true)
Return true if the instruction does not have any effects besides calculating the result and does not ...
LLVM_ABI Instruction * propagateMetadata(Instruction *I, ArrayRef< Value * > VL)
Specifically, let Kinds = [MD_tbaa, MD_alias_scope, MD_noalias, MD_fpmath, MD_nontemporal,...
bool isa_and_nonnull(const Y &Val)
auto binary_search(R &&Range, T &&Value)
Provide wrappers to std::binary_search which take ranges instead of having to pass begin/end explicit...
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
bool isGather(IntrinsicInst *IntInst)
const Value * getPointerOperand(const Value *V)
A helper function that returns the pointer operand of a load, store or GEP instruction.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
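Worked values for the power-of-two helpers listed in this index (a standalone sketch, not code from this pass):

#include "llvm/ADT/bit.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>

static void pow2Demo() {
  assert(llvm::bit_ceil(5u) == 8u);   // smallest power of two >= 5
  assert(llvm::bit_floor(5u) == 4u);  // largest power of two <= 5
  assert(llvm::has_single_bit(8u));   // exactly one bit set
  assert(llvm::PowerOf2Ceil(5) == 8); // uint64_t variant of bit_ceil
}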
LLVM_ABI bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &SQ, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
DomTreeNodeBase< BasicBlock > DomTreeNode
auto dyn_cast_or_null(const Y &Val)
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container:
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
constexpr bool has_single_bit(T Value) noexcept
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
LLVM_ABI bool isInstructionTriviallyDead(Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction is not used, and the instruction will return.
LLVM_ABI llvm::SmallVector< int, 16 > createStrideMask(unsigned Start, unsigned Stride, unsigned VF)
Create a stride shuffle mask.
auto reverse(ContainerTy &&C)
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
LLVM_ABI llvm::SmallVector< int, 16 > createReplicatedMask(unsigned ReplicationFactor, unsigned VF)
Create a mask with replicated elements.
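Per the documented semantics, a quick sketch of the shuffle masks these two builders produce (standalone; maskDemo is hypothetical):

#include "llvm/Analysis/VectorUtils.h"
using namespace llvm;

static void maskDemo() {
  // Stride mask <0, 2, 4, 6>: VF entries, starting at Start, step Stride.
  SmallVector<int, 16> Strided =
      createStrideMask(/*Start=*/0, /*Stride=*/2, /*VF=*/4);
  // Replicated mask <0, 0, 0, 1, 1, 1>: each of VF elements repeated
  // ReplicationFactor times.
  SmallVector<int, 16> Replicated =
      createReplicatedMask(/*ReplicationFactor=*/3, /*VF=*/2);
  (void)Strided;
  (void)Replicated;
}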
auto find_if_not(R &&Range, UnaryPredicate P)
LLVM_ABI bool isSafeToLoadUnconditionally(Value *V, Align Alignment, const APInt &Size, const DataLayout &DL, Instruction *ScanFrom, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr)
Return true if we know that executing a load from this value cannot trap.
bool isa_and_present(const Y &Val)
isa_and_present<X> - Functionally identical to isa, except that a null value is accepted.
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
bool isPointerTy(const Type *T)
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
auto make_first_range(ContainerTy &&c)
Given a container of pairs, return a range over the first elements.
LLVM_ABI std::optional< int64_t > getPointersDiff(Type *ElemTyA, Value *PtrA, Type *ElemTyB, Value *PtrB, const DataLayout &DL, ScalarEvolution &SE, bool StrictCheck=false, bool CheckType=true)
Returns the distance between the pointers PtrA and PtrB iff they are compatible and it is possible to...
LLVM_ABI bool wouldInstructionBeTriviallyDead(const Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction would have no side effects if it was not used.
bool isModOrRefSet(const ModRefInfo MRI)
bool is_sorted(R &&Range, Compare C)
Wrapper function around std::is_sorted to check if elements in a range R are sorted with respect to a...
LLVM_ABI bool sortPtrAccesses(ArrayRef< Value * > VL, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
Attempt to sort the pointers in VL and return the sorted indices in SortedIndices,...
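A hedged sketch of how the pointer utilities above combine to test for adjacent accesses (standalone; areConsecutive is hypothetical, and DL/SE are assumed to come from the usual analyses):

#include "llvm/Analysis/LoopAccessAnalysis.h"
#include <optional>
using namespace llvm;

// A distance of exactly one element of ElemTy means PtrB immediately
// follows PtrA, i.e. the two accesses are consecutive in memory.
static bool areConsecutive(Value *PtrA, Value *PtrB, Type *ElemTy,
                           const DataLayout &DL, ScalarEvolution &SE) {
  std::optional<int64_t> Diff = getPointersDiff(ElemTy, PtrA, ElemTy, PtrB,
                                                DL, SE, /*StrictCheck=*/true);
  return Diff && *Diff == 1;
}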
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
LLVM_ABI void propagateIRFlags(Value *I, ArrayRef< Value * > VL, Value *OpValue=nullptr, bool IncludeWrapFlags=true)
Get the intersection (logical and) of all of the potential IR flags of each scalar operation (VL) tha...
LLVM_ATTRIBUTE_VISIBILITY_DEFAULT AnalysisKey InnerAnalysisManagerProxy< AnalysisManagerT, IRUnitT, ExtraArgTs... >::Key
MutableArrayRef(T &OneElt) -> MutableArrayRef< T >
constexpr int PoisonMaskElem
iterator_range(Container &&) -> iterator_range< llvm::detail::IterOfRange< Container > >
@ Ref
The access may reference the value stored in memory.
@ LLVM_MARK_AS_BITMASK_ENUM
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
LLVM_ABI CmpInst::Predicate getMinMaxReductionPredicate(RecurKind RK)
Returns the comparison predicate used when expanding a min/max reduction.
RecurKind
These are the kinds of recurrences that we support.
@ Or
Bitwise or logical OR of integers.
LLVM_ABI bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic has a scalar operand.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
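Worked values for the integer-rounding helpers scattered through this index (alignDown, divideCeil, alignTo); a standalone sketch:

#include "llvm/Support/Alignment.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>

static void roundingDemo() {
  assert(llvm::divideCeil(10, 4) == 3);            // ceil(10 / 4)
  assert(llvm::alignDown(10, 4) == 8);             // round down to a multiple of 4
  assert(llvm::alignTo(10, llvm::Align(4)) == 12); // round up to a multiple of 4
}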
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
DWARFExpression::Operation Op
auto max_element(R &&Range)
Provide wrappers to std::max_element which take ranges instead of having to pass begin/end explicitly...
void ViewGraph(const GraphType &G, const Twine &Name, bool ShortNames=false, const Twine &Title="", GraphProgram::Name Program=GraphProgram::DOT)
ViewGraph - Emit a dot graph, run 'dot', run gv on the postscript file, then cleanup.
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI unsigned ComputeNumSignBits(const Value *Op, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Return the number of times the sign bit of the register is replicated into the other bits.
OutputIt copy(R &&Range, OutputIt Out)
auto make_second_range(ContainerTy &&c)
Given a container of pairs, return a range over the second elements.
constexpr unsigned BitWidth
LLVM_ABI bool isGuaranteedToTransferExecutionToSuccessor(const Instruction *I)
Return true if this function can prove that the instruction I will always transfer execution to one o...
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
auto pred_begin(const MachineBasicBlock *BB)
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
template class LLVM_TEMPLATE_ABI DomTreeNodeBase< BasicBlock >
bool equal(L &&LRange, R &&RRange)
Wrapper function around std::equal to detect if pair-wise elements between two ranges are the same.
LLVM_ABI bool isGuaranteedNotToBePoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Returns true if V cannot be poison, but may be undef.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
LLVM_ABI const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=MaxLookupSearchDepth)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
LLVM_ABI Constant * ConstantFoldIntegerCast(Constant *C, Type *DestTy, bool IsSigned, const DataLayout &DL)
Constant fold a zext, sext or trunc, depending on IsSigned and whether the DestTy is wider or narrowe...
LLVM_ABI bool isKnownNonNegative(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Returns true if the given value is known to be non-negative.
LLVM_ABI bool mayHaveNonDefUseDependency(const Instruction &I)
Returns true if the result or effects of the given instruction I depend on values not reachable through...
LLVM_ABI bool isTriviallyVectorizable(Intrinsic::ID ID)
Identify if the intrinsic is trivially vectorizable.
constexpr detail::IsaCheckPredicate< Types... > IsaPred
Function object wrapper for the llvm::isa type check.
LLVM_ABI bool isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic is overloaded on the type of the operand at index OpdI...
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Used to keep track of an operand bundle.
static LLVM_ABI void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
BoUpSLP::TreeEntry TreeEntry
std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R)
static std::string getNodeAttributes(const TreeEntry *Entry, const BoUpSLP *)
DOTGraphTraits(bool IsSimple=false)
DOTGraphTraits - Template class that can be specialized to customize how graphs are converted to 'dot...
DefaultDOTGraphTraits(bool simple=false)
DenseMapInfo< BoUpSLP::TreeEntry * > FirstInfo
static bool isEqual(const BoUpSLP::EdgeInfo &LHS, const BoUpSLP::EdgeInfo &RHS)
static BoUpSLP::EdgeInfo getEmptyKey()
DenseMapInfo< unsigned > SecondInfo
static unsigned getHashValue(const BoUpSLP::EdgeInfo &Val)
static BoUpSLP::EdgeInfo getTombstoneKey()
An information struct used to provide DenseMap with the various necessary components for a given valu...
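The EdgeInfo specialization above follows the standard recipe: delegate each field to an existing DenseMapInfo and fuse the hashes with combineHashValue. A minimal standalone sketch for a hypothetical key type:

#include "llvm/ADT/DenseMapInfo.h"

struct MyKey {
  void *Ptr;
  unsigned Idx;
};

template <> struct llvm::DenseMapInfo<MyKey> {
  using PtrInfo = DenseMapInfo<void *>;
  using IdxInfo = DenseMapInfo<unsigned>;
  static MyKey getEmptyKey() {
    return {PtrInfo::getEmptyKey(), IdxInfo::getEmptyKey()};
  }
  static MyKey getTombstoneKey() {
    return {PtrInfo::getTombstoneKey(), IdxInfo::getTombstoneKey()};
  }
  static unsigned getHashValue(const MyKey &K) {
    return detail::combineHashValue(PtrInfo::getHashValue(K.Ptr),
                                    IdxInfo::getHashValue(K.Idx));
  }
  static bool isEqual(const MyKey &L, const MyKey &R) {
    return L.Ptr == R.Ptr && L.Idx == R.Idx;
  }
};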
Add the VectorizableTree to the index iterator to be able to return TreeEntry pointers.
ChildIteratorType(SmallVector< BoUpSLP::EdgeInfo, 1 >::iterator W, ContainerTy &VT)
ContainerTy & VectorizableTree
static ChildIteratorType child_end(NodeRef N)
static NodeRef getEntryNode(BoUpSLP &R)
static ChildIteratorType child_begin(NodeRef N)
static nodes_iterator nodes_begin(BoUpSLP *R)
TreeEntry * NodeRef
NodeRef has to be a pointer per the GraphWriter.
static unsigned size(BoUpSLP *R)
BoUpSLP::TreeEntry TreeEntry
static nodes_iterator nodes_end(BoUpSLP *R)
BoUpSLP::TreeEntry::VecTreeTy ContainerTy
Incoming for lane mask phi as machine instruction, incoming register Reg and incoming block Block are...
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
TargetTransformInfo * TTI
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
bool runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_, TargetLibraryInfo *TLI_, AAResults *AA_, LoopInfo *LI_, DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_, OptimizationRemarkEmitter *ORE_)
A MapVector that performs no allocations if smaller than a certain size.
static VFShape get(const FunctionType *FTy, ElementCount EC, bool HasGlobalPred)
Retrieve the basic vectorization shape of the function, where all parameters are mapped to VFParamKin...
Function object to check whether the first component of a container supported by std::get (like std::...
Function object to check whether the second component of a container supported by std::get (like std:...
This structure holds any data we need about the edges being traversed during buildTreeRec().
bool operator==(const EdgeInfo &Other) const
EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
unsigned EdgeIdx
The operand index of the use.
void dump(raw_ostream &OS) const
Debug print.
LLVM_DUMP_METHOD void dump() const
TreeEntry * UserTE
The user TreeEntry.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::EdgeInfo &EI)