#ifdef EXPENSIVE_CHECKS
// ...
#endif // EXPENSIVE_CHECKS

using namespace std::placeholders;

#define SV_NAME "slp-vectorizer"
#define DEBUG_TYPE "SLP"

STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
117 "Controls which SLP graphs should be vectorized.");
121 cl::desc(
"Run the SLP vectorization passes"));
125 cl::desc(
"Enable vectorization for wider vector utilization"));
129 cl::desc(
"Only vectorize if you gain more than this "
134 cl::desc(
"When true, SLP vectorizer bypasses profitability checks based on "
135 "heuristics and makes vectorization decision via cost modeling."));
139 cl::desc(
"Attempt to vectorize horizontal reductions"));
144 "Attempt to vectorize horizontal reductions feeding into a store"));
148 cl::desc(
"Improve the code quality by splitting alternate instructions"));
152 cl::desc(
"Attempt to vectorize for this register size in bits"));
156 cl::desc(
"Maximum SLP vectorization factor (0=unlimited)"));
164 cl::desc(
"Limit the size of the SLP scheduling region per block"));
168 cl::desc(
"Attempt to vectorize for this register size in bits"));
172 cl::desc(
"Limit the recursion depth when building a vectorizable tree"));
176 cl::desc(
"Only vectorize small trees if they are fully vectorizable"));
182 cl::desc(
"The maximum look-ahead depth for operand reordering scores"));
191 cl::desc(
"The maximum look-ahead depth for searching best rooting option"));
195 cl::desc(
"The minimum number of loads, which should be considered strided, "
196 "if the stride is > 1 or is runtime value"));
200 cl::desc(
"The maximum stride, considered to be profitable."));
204 cl::desc(
"Disable tree reordering even if it is "
205 "profitable. Used for testing only."));
209 cl::desc(
"Generate strided loads even if they are not "
210 "profitable. Used for testing only."));
214 cl::desc(
"Display the SLP trees with Graphviz"));
218 cl::desc(
"Try to vectorize with non-power-of-2 number of elements."));
223 cl::desc(
"Try to replace values with the idempotent instructions for "
224 "better vectorization."));
  Ty = Ty->getScalarType();
  // ...
         !Ty->isPPC_FP128Ty();

// ...
    return SI->getValueOperand()->getType();
  // ...
    return CI->getOperand(0)->getType();
  // ...
    return IE->getOperand(1)->getType();

// ...
         "ScalableVectorType is not supported.");
  // ...
  return VecTy->getNumElements();
// ...
                                                  Type *Ty, unsigned Sz) {
  // ...
  if (NumParts == 0 || NumParts >= Sz)
    // ...
  if (NumParts == 0 || NumParts >= Sz)
    // ...
  return (Sz / RegVF) * RegVF;
// ...
                            I * VecTyNumElements, VecTyNumElements)))
              // ...
              : Mask[I] * VecTyNumElements + J;
  unsigned SVNumElements =
      /* ... */;
  unsigned ShuffleMaskSize = SV->getShuffleMask().size();
  if (SVNumElements % ShuffleMaskSize != 0)
    // ...
  unsigned GroupSize = SVNumElements / ShuffleMaskSize;
  if (GroupSize == 0 || (VL.size() % GroupSize) != 0)
    // ...
  unsigned NumGroup = 0;
  for (size_t I = 0, E = VL.size(); I != E; I += GroupSize) {
    // ...
    Value *Src = SV->getOperand(0);
    // ...
      if (SV->getOperand(0) != Src)
        // ...
      if (!SV->isExtractSubvectorMask(Index))
        // ...
      ExpectedIndex.set(Index / ShuffleMaskSize);
    // ...
  }
  if (!ExpectedIndex.all())
    // ...
  assert(NumGroup == (VL.size() / GroupSize) && "Unexpected number of groups");
  unsigned SVNumElements =
      /* ... */;
  // ...
  unsigned AccumulateLength = 0;
  for (Value *V : VL) {
    // ...
    for (int M : SV->getShuffleMask())
      // ...
          : AccumulateLength + M);
    AccumulateLength += SVNumElements;
  }
  return std::min<unsigned>(PartNumElems, Size - Part * PartNumElems);

// ...
  OS << "Idx: " << Idx << ", ";
  OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]";
    if (BB != II->getParent())
      // ...

// ...
  Value *FirstNonUndef = nullptr;
  for (Value *V : VL) {
    // ...
    if (!FirstNonUndef) {
      // ...
    }
    if (V != FirstNonUndef)
      // ...
  }
  return FirstNonUndef != nullptr;
// ...
                          bool IsCopyable = false) {
  // ...
    return Cmp->isCommutative();
  // ...
    return BO->isCommutative() ||
           (BO->getOpcode() == Instruction::Sub &&
            // ...
              if (match(U.getUser(),
                        m_ICmp(Pred, m_Specific(U.get()), m_Zero())) &&
                  (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE))
                // ...
              auto *I = dyn_cast<BinaryOperator>(U.get());
              return match(U.getUser(),
                           m_Intrinsic<Intrinsic::abs>(
                               m_Specific(U.get()), m_ConstantInt(Flag))) &&
                     ((!IsCopyable && I && !I->hasNoSignedWrap()) ||
                      // ...
           (BO->getOpcode() == Instruction::FSub &&
            // ...
              return match(U.getUser(),
                           m_Intrinsic<Intrinsic::fabs>(m_Specific(U.get())));
  // ...
  return I->isCommutative();
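
// An IR example of the special cases matched above (illustrative, not taken
// from the source): a sub whose only user is an equality compare against
// zero behaves commutatively, because
//   %d = sub i32 %a, %b
//   %c = icmp eq i32 %d, 0
// yields the same %c as "sub i32 %b, %a". The same reasoning applies to a
// sub feeding llvm.abs (under the no-signed-wrap conditions checked above)
// and to an fsub whose only user is llvm.fabs.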
    constexpr unsigned IntrinsicNumOperands = 2;
    return IntrinsicNumOperands;
  // ...
  return I->getNumOperands();

// ...
  static_assert(std::is_same_v<T, InsertElementInst> ||
                    std::is_same_v<T, ExtractElementInst>,
                // ...
    if (CI->getValue().uge(VT->getNumElements()))
      // ...
    Index *= VT->getNumElements();
    Index += CI->getZExtValue();
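
// Worked example of the index flattening above (hypothetical types): for an
// element at constant lane 2 of the second <4 x i32> sub-vector, the running
// Index of 1 is scaled by the 4 sub-vector lanes and the constant lane is
// added, giving 1 * 4 + 2 == 6.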
  Type *CurrentType = IV->getType();
  for (unsigned I : IV->indices()) {
    // ...
      Index *= ST->getNumElements();
      CurrentType = ST->getElementType(I);
    // ...
      Index *= AT->getNumElements();
      CurrentType = AT->getElementType();
    // ...
  }
  return std::all_of(It, VL.end(), [&](Value *V) {
    if (auto *CI = dyn_cast<CmpInst>(V))
      return BasePred == CI->getPredicate();
    if (auto *I = dyn_cast<Instruction>(V))
      return I->getOpcode() == Opcode;
    return isa<PoisonValue>(V);
  });
      if (MaskArg == UseMask::UndefsAsMask)
        // ...
      if (MaskArg == UseMask::FirstArg && Value < VF)
        UseMask.reset(Value);
      else if (MaskArg == UseMask::SecondArg && Value >= VF)
        UseMask.reset(Value - VF);
template <bool IsPoisonOnly = false>
// ...
  using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
  // ...
  if (!UseMask.empty()) {
    // ...
      if (*Idx < UseMask.size() && !UseMask.test(*Idx))
        // ...
  }
  // ...
  for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
    if (Constant *Elem = C->getAggregateElement(I))
      // ...
          (UseMask.empty() || (I < UseMask.size() && !UseMask.test(I))))
        // ...
  }
static std::optional<TargetTransformInfo::ShuffleKind>
// ...
      std::accumulate(VL.begin(), VL.end(), 0u, [](unsigned S, Value *V) {
        auto *EI = dyn_cast<ExtractElementInst>(V);
        // ...
        auto *VTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
        // ...
        return std::max(S, VTy->getNumElements());
      });
  Value *Vec1 = nullptr;
  Value *Vec2 = nullptr;
  // ...
    Value *Vec = EE->getVectorOperand();
    // ...
  ShuffleMode CommonShuffleMode = Unknown;
  // ...
  for (unsigned I = 0, E = VL.size(); I < E; ++I) {
    // ...
    auto *Vec = EI->getVectorOperand();
    // ...
    if (Idx->getValue().uge(Size))
      // ...
    unsigned IntIdx = Idx->getValue().getZExtValue();
    // ...
    if (!Vec1 || Vec1 == Vec) {
      // ...
    } else if (!Vec2 || Vec2 == Vec) {
      // ...
    }
    // ...
    if (CommonShuffleMode == Permute)
      // ...
    if (Mask[I] % Size != I) {
      CommonShuffleMode = Permute;
      // ...
    }
    CommonShuffleMode = Select;
  }
  // ...
  if (CommonShuffleMode == Select && Vec2)
    // ...
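
// Illustrative masks for the classification above, assuming Size == 4:
// extracting {v[0], v[1], v[2], v[3]} in order satisfies Mask[I] % Size == I
// in every lane, so the common mode stays Select (a per-lane blend of the
// source vectors); extracting {v[2], v[0], v[1], v[3]} pulls element 2 into
// lane 0, which violates the check and forces CommonShuffleMode = Permute.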
  unsigned Opcode = E->getOpcode();
  assert((Opcode == Instruction::ExtractElement ||
          Opcode == Instruction::ExtractValue) &&
         "Expected extractelement or extractvalue instruction.");
  if (Opcode == Instruction::ExtractElement) {
    // ...
    return CI->getZExtValue();
  }
  // ...
  if (EI->getNumIndices() != 1)
    // ...
  return *EI->idx_begin();
class BinOpSameOpcodeHelper {
  using MaskType = std::uint_fast16_t;
  // ...
  constexpr static std::initializer_list<unsigned> SupportedOp = {
      Instruction::Add, Instruction::Sub, Instruction::Mul, Instruction::Shl,
      Instruction::AShr, Instruction::And, Instruction::Or, Instruction::Xor};
  // ...
    MainOpBIT = 0b100000000,
  // ...
  static std::pair<ConstantInt *, unsigned>
  isBinOpWithConstantInt(const Instruction *I) {
    unsigned Opcode = I->getOpcode();
    // ...
    if (Opcode == Instruction::Sub || Opcode == Instruction::Shl ||
        Opcode == Instruction::AShr)
      // ...
  }

  struct InterchangeableInfo {
    // ...
    MaskType Mask = MainOpBIT | XorBIT | OrBIT | AndBIT | SubBIT | AddBIT |
                    MulBIT | AShrBIT | ShlBIT;
    // ...
    MaskType SeenBefore = 0;
    InterchangeableInfo(const Instruction *I) : I(I) {}
    // ...
    bool trySet(MaskType OpcodeInMaskForm, MaskType InterchangeableMask) {
      if (Mask & InterchangeableMask) {
        SeenBefore |= OpcodeInMaskForm;
        Mask &= InterchangeableMask;
        // ...
      }
      // ...
    }
    bool equal(unsigned Opcode) {
      return Opcode == I->getOpcode() && trySet(MainOpBIT, MainOpBIT);
    }
    // ...
      MaskType Candidate = Mask & SeenBefore;
      if (Candidate & MainOpBIT)
        return I->getOpcode();
      if (Candidate & ShlBIT)
        return Instruction::Shl;
      if (Candidate & AShrBIT)
        return Instruction::AShr;
      if (Candidate & MulBIT)
        return Instruction::Mul;
      if (Candidate & AddBIT)
        return Instruction::Add;
      if (Candidate & SubBIT)
        return Instruction::Sub;
      if (Candidate & AndBIT)
        return Instruction::And;
      if (Candidate & OrBIT)
        return Instruction::Or;
      if (Candidate & XorBIT)
        return Instruction::Xor;
      // ...
    bool hasCandidateOpcode(unsigned Opcode) const {
      MaskType Candidate = Mask & SeenBefore;
      // ...
      case Instruction::Shl:
        return Candidate & ShlBIT;
      case Instruction::AShr:
        return Candidate & AShrBIT;
      case Instruction::Mul:
        return Candidate & MulBIT;
      case Instruction::Add:
        return Candidate & AddBIT;
      case Instruction::Sub:
        return Candidate & SubBIT;
      case Instruction::And:
        return Candidate & AndBIT;
      case Instruction::Or:
        return Candidate & OrBIT;
      case Instruction::Xor:
        return Candidate & XorBIT;
      case Instruction::LShr:
      case Instruction::FAdd:
      case Instruction::FSub:
      case Instruction::FMul:
      case Instruction::SDiv:
      case Instruction::UDiv:
      case Instruction::FDiv:
      case Instruction::SRem:
      case Instruction::URem:
      case Instruction::FRem:
        // ...
    }
    // ...
      unsigned FromOpcode = I->getOpcode();
      if (FromOpcode == ToOpcode)
        // ...
      auto [CI, Pos] = isBinOpWithConstantInt(I);
      const APInt &FromCIValue = CI->getValue();
      unsigned FromCIValueBitWidth = FromCIValue.getBitWidth();
      // ...
      switch (FromOpcode) {
      case Instruction::Shl:
        if (ToOpcode == Instruction::Mul) {
          // ...
        }
        assert(FromCIValue.isZero() && "Cannot convert the instruction.");
        ToCIValue = ToOpcode == Instruction::And
                        ? APInt::getAllOnes(FromCIValueBitWidth)
                        : APInt::getZero(FromCIValueBitWidth);
        // ...
      case Instruction::Mul:
        // ...
        if (ToOpcode == Instruction::Shl) {
          ToCIValue = APInt(FromCIValueBitWidth, FromCIValue.logBase2());
          // ...
        }
        assert(FromCIValue.isOne() && "Cannot convert the instruction.");
        ToCIValue = ToOpcode == Instruction::And
                        ? APInt::getAllOnes(FromCIValueBitWidth)
                        : APInt::getZero(FromCIValueBitWidth);
        // ...
      case Instruction::Add:
      case Instruction::Sub:
        if (FromCIValue.isZero()) {
          // ...
        }
        // ...
               "Cannot convert the instruction.");
        ToCIValue = FromCIValue;
        // ...
      case Instruction::And:
        // ...
        ToCIValue = ToOpcode == Instruction::Mul
                        ? APInt(FromCIValueBitWidth, 1)
                        : APInt::getZero(FromCIValueBitWidth);
        // ...
        assert(FromCIValue.isZero() && "Cannot convert the instruction.");
        // ...
      }
      Value *LHS = I->getOperand(1 - Pos);
      // ...
          ConstantInt::get(I->getOperand(Pos)->getType(), ToCIValue);
      // ...
          ((FromOpcode == Instruction::Add || FromOpcode == Instruction::Or ||
            FromOpcode == Instruction::Xor) &&
           ToOpcode == Instruction::Sub))
        // ...

  InterchangeableInfo MainOp;
  InterchangeableInfo AltOp;
  // ...
    return ::isValidForAlternation(MainOp.I->getOpcode()) &&
           // ...
  bool initializeAltOp(const Instruction *I) {
    // ...
  }

  BinOpSameOpcodeHelper(const Instruction *MainOp,
                        const Instruction *AltOp = nullptr)
      : MainOp(MainOp), AltOp(AltOp) {
    // ...
  }
  bool add(const Instruction *I) {
    // ...
           "BinOpSameOpcodeHelper only accepts BinaryOperator.");
    unsigned Opcode = I->getOpcode();
    MaskType OpcodeInMaskForm;
    // ...
    case Instruction::Shl:
      OpcodeInMaskForm = ShlBIT;
      break;
    case Instruction::AShr:
      OpcodeInMaskForm = AShrBIT;
      break;
    case Instruction::Mul:
      OpcodeInMaskForm = MulBIT;
      break;
    case Instruction::Add:
      OpcodeInMaskForm = AddBIT;
      break;
    case Instruction::Sub:
      OpcodeInMaskForm = SubBIT;
      break;
    case Instruction::And:
      OpcodeInMaskForm = AndBIT;
      break;
    case Instruction::Or:
      OpcodeInMaskForm = OrBIT;
      break;
    case Instruction::Xor:
      OpcodeInMaskForm = XorBIT;
      break;
    default:
      return MainOp.equal(Opcode) ||
             (initializeAltOp(I) && AltOp.equal(Opcode));
    }
    MaskType InterchangeableMask = OpcodeInMaskForm;
    ConstantInt *CI = isBinOpWithConstantInt(I).first;
    // ...
      constexpr MaskType CanBeAll =
          XorBIT | OrBIT | AndBIT | SubBIT | AddBIT | MulBIT | AShrBIT | ShlBIT;
      const APInt &CIValue = CI->getValue();
      // ...
      case Instruction::Shl:
        // ...
        InterchangeableMask = CIValue.isZero() ? CanBeAll : MulBIT | ShlBIT;
        break;
      case Instruction::Mul:
        if (CIValue.isOne()) {
          InterchangeableMask = CanBeAll;
          break;
        }
        // ...
          InterchangeableMask = MulBIT | ShlBIT;
        break;
      case Instruction::Add:
      case Instruction::Sub:
        InterchangeableMask = CIValue.isZero() ? CanBeAll : SubBIT | AddBIT;
        break;
      case Instruction::And:
        // ...
          InterchangeableMask = CanBeAll;
        break;
      case Instruction::Xor:
        // ...
          InterchangeableMask = XorBIT | OrBIT | AndBIT | SubBIT | AddBIT;
        break;
      default:
        // ...
          InterchangeableMask = CanBeAll;
      }
    // ...
    return MainOp.trySet(OpcodeInMaskForm, InterchangeableMask) ||
           (initializeAltOp(I) &&
            AltOp.trySet(OpcodeInMaskForm, InterchangeableMask));
  }
  unsigned getMainOpcode() const { return MainOp.getOpcode(); }
  bool hasCandidateOpcode(unsigned Opcode) const {
    return MainOp.hasCandidateOpcode(Opcode);
  }
  bool hasAltOp() const { return AltOp.I; }
  unsigned getAltOpcode() const {
    return hasAltOp() ? AltOp.getOpcode() : getMainOpcode();
  }
  // ...
    return MainOp.getOperand(I);
};
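
// A small IR sketch of the interchangeability this helper models
// (illustrative, not from the source):
//   %a = shl i32 %x, 1   ; shift by constant 1
//   %b = mul i32 %y, 2   ; multiply by constant 2
// "shl %x, 1" can be rewritten as "mul %x, 2", and a mul by a power of two
// can be turned back into a shl via logBase2 above, so a bundle mixing the
// two opcodes may still be emitted with a single main opcode instead of an
// alternate-opcode shuffle.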
class InstructionsState {
  // ...
  bool HasCopyables = false;
  // ...
    assert(valid() && "InstructionsState is invalid.");
    // ...
    assert(valid() && "InstructionsState is invalid.");
    // ...
  unsigned getOpcode() const { return getMainOp()->getOpcode(); }
  unsigned getAltOpcode() const { return getAltOp()->getOpcode(); }
  bool isAltShuffle() const { return getMainOp() != getAltOp(); }
  // ...
  Instruction *getMatchingMainOpOrAltOp(Instruction *I) const {
    assert(MainOp && "MainOp cannot be nullptr.");
    if (I->getOpcode() == MainOp->getOpcode())
      // ...
    assert(AltOp && "AltOp cannot be nullptr.");
    if (I->getOpcode() == AltOp->getOpcode())
      // ...
    if (!I->isBinaryOp())
      // ...
    BinOpSameOpcodeHelper Converter(MainOp);
    // ...
    if (isAltShuffle() && !Converter.hasCandidateOpcode(MainOp->getOpcode())) {
      BinOpSameOpcodeHelper AltConverter(AltOp);
      if (AltConverter.add(I) && AltConverter.add(AltOp) &&
          AltConverter.hasCandidateOpcode(AltOp->getOpcode()))
        // ...
    }
    if (Converter.hasAltOp() && !isAltShuffle())
      // ...
    return Converter.hasAltOp() ? AltOp : MainOp;
  }

  bool isShiftOp() const {
    return getMainOp()->isShift() && getAltOp()->isShift();
  }
  // ...
    return getMainOp()->isBitwiseLogicOp() && getAltOp()->isBitwiseLogicOp();
  // ...
  bool isMulDivLikeOp() const {
    constexpr std::array<unsigned, 8> MulDiv = {
        Instruction::Mul,  Instruction::FMul, Instruction::SDiv,
        Instruction::UDiv, Instruction::FDiv, Instruction::SRem,
        Instruction::URem, Instruction::FRem};
    // ...
  }
  bool isAddSubLikeOp() const {
    constexpr std::array<unsigned, 4> AddSub = {
        Instruction::Add, Instruction::Sub, Instruction::FAdd,
        Instruction::FSub};
    // ...
  }
  bool isCmpOp() const {
    return (getOpcode() == Instruction::ICmp ||
            // ...
  }

  bool valid() const { return MainOp && AltOp; }
  explicit operator bool() const { return valid(); }

  InstructionsState() = delete;
  InstructionsState(Instruction *MainOp, Instruction *AltOp,
                    bool HasCopyables = false)
      : MainOp(MainOp), AltOp(AltOp), HasCopyables(HasCopyables) {}
  static InstructionsState invalid() { return {nullptr, nullptr}; }

  bool isCopyableElement(Value *V) const {
    assert(valid() && "InstructionsState is invalid.");
    // ...
    if (isAltShuffle() || getOpcode() == Instruction::GetElementPtr)
      // ...
    if (I->getParent() != MainOp->getParent() &&
        // ...
    if (I->getOpcode() == MainOp->getOpcode())
      // ...
    if (!I->isBinaryOp())
      // ...
    BinOpSameOpcodeHelper Converter(MainOp);
    // ...
  }

  bool isNonSchedulable(Value *V) const {
    assert(valid() && "InstructionsState is invalid.");
    // ...
    if (getMainOp() == V)
      // ...
    if (isCopyableElement(V)) {
      auto IsNonSchedulableCopyableElement = [this](Value *V) {
        // ...
        return !I || isa<PHINode>(I) ||
               I->getParent() != MainOp->getParent() ||
               // ...
                !MainOp->comesBefore(I));
      };
      return IsNonSchedulableCopyableElement(V);
    }
    // ...
  }

  bool areInstructionsWithCopyableElements() const {
    assert(valid() && "InstructionsState is invalid.");
    return HasCopyables;
  }
};
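
// Illustrative use of this state (the values are hypothetical): for the
// bundle [add %a, %b, sub %c, %d, add %e, %f, sub %g, %h], MainOp is the
// first add and AltOp the first sub, so getOpcode() == Add,
// getAltOpcode() == Sub, and isAltShuffle() is true; such a node is emitted
// as two vector instructions blended together with a shufflevector.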
std::pair<Instruction *, SmallVector<Value *>>
// ...
  Instruction *SelectedOp = S.getMatchingMainOpOrAltOp(I);
  assert(SelectedOp && "Cannot convert the instruction.");
  if (I->isBinaryOp()) {
    // ...
    return std::make_pair(SelectedOp, Converter.getOperand(SelectedOp));
  }
  for (Value *V : VL) {
    // ...
    if (Inst->getOpcode() == Opcode)
      // ...
  }

// ...
         BaseOp0 == Op0 || BaseOp1 == Op1 ||
         // ...

// ...
         "Assessing comparisons of different types?");
  // ...
  return (BasePred == Pred &&
          // ...
         (BasePred == SwappedPred &&
          // ...

// ...
    return InstructionsState::invalid();
  // ...
    return InstructionsState::invalid();
  // ...
      (VL.size() == 2 && InstCnt < 2))
    return InstructionsState::invalid();
  // ...
  unsigned AltOpcode = Opcode;
  // ...
  BinOpSameOpcodeHelper BinOpHelper(MainOp);
  bool SwappedPredsCompatible = IsCmpOp && [&]() {
    // ...
    UniquePreds.insert(BasePred);
    UniqueNonSwappedPreds.insert(BasePred);
    for (Value *V : VL) {
      // ...
      UniqueNonSwappedPreds.insert(CurrentPred);
      if (!UniquePreds.contains(CurrentPred) &&
          !UniquePreds.contains(SwappedCurrentPred))
        UniquePreds.insert(CurrentPred);
    }
    // ...
    return UniqueNonSwappedPreds.size() > 2 && UniquePreds.size() == 2;
  }();
  // ...
    return InstructionsState::invalid();
  // ...
  bool AnyPoison = InstCnt != VL.size();
  // ...
    if (AnyPoison && (I->isIntDivRem() || I->isFPDivRem() || isa<CallInst>(I)))
      return InstructionsState::invalid();
    unsigned InstOpcode = I->getOpcode();
    // ...
      if (BinOpHelper.add(I))
        // ...
      Value *Op1 = I->getOperand(0);
      // ...
      if (InstOpcode == Opcode || InstOpcode == AltOpcode)
        // ...
      if (Opcode == AltOpcode) {
        // ...
               "Cast isn't safe for alternation, logic needs to be updated!");
        AltOpcode = InstOpcode;
        // ...
      }
      // ...
      Type *Ty0 = BaseInst->getOperand(0)->getType();
      Type *Ty1 = Inst->getOperand(0)->getType();
      // ...
        assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
        assert(InstOpcode == AltOpcode &&
               "Alternate instructions are only supported by BinaryOperator "
               // ...
        if ((VL.size() == 2 || SwappedPredsCompatible) &&
            (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
          // ...
        if (MainOp != AltOp) {
          // ...
        } else if (BasePred != CurrentPred) {
          // ...
              "CmpInst isn't safe for alternation, logic needs to be updated!");
          // ...
        }
        // ...
        if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
            AltPred == CurrentPred || AltPred == SwappedCurrentPred)
          // ...
    } else if (InstOpcode == Opcode) {
      assert(InstOpcode == AltOpcode &&
             "Alternate instructions are only supported by BinaryOperator and "
             // ...
        if (Gep->getNumOperands() != 2 ||
            // ...
          return InstructionsState::invalid();
        // ...
          return InstructionsState::invalid();
        // ...
        if (!LI->isSimple() || !BaseLI->isSimple())
          return InstructionsState::invalid();
        // ...
          return InstructionsState::invalid();
        if (Call->hasOperandBundles() &&
            // ...
            !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
                        Call->op_begin() + Call->getBundleOperandsEndIndex(),
                        // ...
          return InstructionsState::invalid();
        // ...
          return InstructionsState::invalid();
        // ...
        if (Mappings.size() != BaseMappings.size() ||
            Mappings.front().ISA != BaseMappings.front().ISA ||
            Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
            Mappings.front().VectorName != BaseMappings.front().VectorName ||
            Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
            Mappings.front().Shape.Parameters !=
                BaseMappings.front().Shape.Parameters)
          return InstructionsState::invalid();
      // ...
      return InstructionsState::invalid();
    }
  // ...
  assert(MainOp && "Cannot find MainOp with Opcode from BinOpHelper.");
  // ...
  assert(AltOp && "Cannot find AltOp with Opcode from BinOpHelper.");
  // ...
         "Incorrect implementation of allSameOpcode.");
  InstructionsState S(MainOp, AltOp);
  // ...
         "Invalid InstructionsState.");

// ...
  return all_of(VL, [&](Value *V) { return V->getType() == Ty; });
  unsigned Opcode = UserInst->getOpcode();
  switch (Opcode) {
  case Instruction::Load: {
    // ...
  }
  case Instruction::Store: {
    // ...
    return (SI->getPointerOperand() == Scalar);
  }
  case Instruction::Call: {
    // ...
      return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index(), TTI) &&
             Arg.value().get() == Scalar;
    // ...
  }
  // ...
  }

// ...
    return LI->isSimple();
  // ...
    return SI->isSimple();
  // ...
    return !MI->isVolatile();
// ...
                    bool ExtendingManyInputs = false) {
  if (SubMask.empty())
    // ...
  assert(
      (!ExtendingManyInputs || SubMask.size() > Mask.size() ||
       // ...
      "SubMask with many inputs support must be larger than the mask.");
  // ...
    Mask.append(SubMask.begin(), SubMask.end());
  // ...
  int TermValue = std::min(Mask.size(), SubMask.size());
  for (int I = 0, E = SubMask.size(); I < E; ++I) {
    // ...
        (!ExtendingManyInputs &&
         (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)))
      // ...
    NewMask[I] = Mask[SubMask[I]];
  }
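
// Worked example of the composition above (hypothetical masks): with
//   Mask    = [3, 2, 1, 0] and SubMask = [1, 3, PoisonMaskElem, 0],
// NewMask[I] = Mask[SubMask[I]] yields [Mask[1], Mask[3], Poison, Mask[0]]
//            = [2, 0, Poison, 3].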
  const size_t Sz = Order.size();
  // ...
  for (unsigned I = 0; I < Sz; ++I) {
    // ...
      UnusedIndices.reset(Order[I]);
    // ...
      MaskedIndices.set(I);
  }
  if (MaskedIndices.none())
    // ...
  assert(/* ... */ "Non-synced masked/available indices.");
  // ...
    assert(Idx >= 0 && "Indices must be synced.");

// ...
                                unsigned Opcode0, unsigned Opcode1) {
  // ...
      OpcodeMask.set(Lane * ScalarTyNumElements,
                     Lane * ScalarTyNumElements + ScalarTyNumElements);
  // ...

// ...
         "Expected scalar constants.");
  // ...
    std::fill_n(NewVal.begin() + I * VF, VF, V);
  const unsigned E = Indices.size();
  // ...
  for (unsigned I = 0; I < E; ++I)
    Mask[Indices[I]] = I;
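
// Example: Indices = [2, 0, 1] (element I of the scalar order ends up at
// position Indices[I]) produces the inverse Mask = [1, 2, 0], since
// Mask[Indices[I]] = I for each I.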
  assert(!Mask.empty() && "Expected non-empty mask.");
  // ...
  for (unsigned I = 0, E = Prev.size(); I < E; ++I)
    // ...
      Scalars[Mask[I]] = Prev[I];
    auto *IO = dyn_cast<Instruction>(V);
    // ...
    return isa<PHINode>(IO) || IO->getParent() != I->getParent();

// ...
  return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(UsesLimit) &&
         // ...
           auto *IU = dyn_cast<Instruction>(U);
           // ...
           return IU->getParent() != I->getParent() || isa<PHINode>(IU);

// ...
  return !VL.empty() &&
         // ...

// ...
  return NumParts > 0 && NumParts < Sz && has_single_bit(Sz / NumParts) &&
         // ...

// ...
    const unsigned Limit = std::numeric_limits<unsigned>::max()) {
  unsigned NumParts = TTI.getNumberOfParts(VecTy);
  if (NumParts == 0 || NumParts >= Limit)
    // ...
  if (NumParts >= Sz || Sz % NumParts != 0 ||
      // ...
  class ScheduleEntity;
  // ...
  class ScheduleCopyableData;
  class ScheduleBundle;
  // ...
  struct StridedPtrInfo {
    Value *StrideVal = nullptr;
    const SCEV *StrideSCEV = nullptr;
  };
  // ...
      : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li), DT(Dt),
        AC(AC), DB(DB), DL(DL), ORE(ORE),
        // ...
    MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
  // ...
      ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales = {});
  // ...
                     const SmallDenseSet<Value *> &UserIgnoreLst);
    assert(!VectorizableTree.empty() && "No graph to get the first node from");
    return VectorizableTree.front()->Scalars;
  // ...
    const TreeEntry &Root = *VectorizableTree.front();
    if (Root.State != TreeEntry::Vectorize || Root.isAltShuffle() ||
        !Root.Scalars.front()->getType()->isIntegerTy())
      return std::nullopt;
    auto It = MinBWs.find(&Root);
    if (It != MinBWs.end())
      // ...
    if (Root.getOpcode() == Instruction::ZExt ||
        Root.getOpcode() == Instruction::SExt)
      return std::make_pair(cast<CastInst>(Root.getMainOp())->getSrcTy(),
                            Root.getOpcode() == Instruction::SExt);
    return std::nullopt;
  // ...
    return MinBWs.at(VectorizableTree.front().get()).second;
  // ...
    if (ReductionBitWidth == 0 ||
        !VectorizableTree.front()->Scalars.front()->getType()->isIntegerTy() ||
        ReductionBitWidth >=
            DL->getTypeSizeInBits(
                VectorizableTree.front()->Scalars.front()->getType()))
      // ...
          VectorizableTree.front()->Scalars.front()->getType(),
          VectorizableTree.front()->getVectorFactor());
    // ...
        VectorizableTree.front()->Scalars.front()->getContext(),
        // ...
        VectorizableTree.front()->getVectorFactor());
    VectorizableTree.clear();
    ScalarToTreeEntries.clear();
    OperandsToTreeEntry.clear();
    ScalarsInSplitNodes.clear();
    // ...
    NonScheduledFirst.clear();
    EntryToLastInstruction.clear();
    LastInstructionToPos.clear();
    LoadEntriesToVectorize.clear();
    IsGraphTransformMode = false;
    GatheredLoadsEntriesFirst.reset();
    CompressEntryToData.clear();
    ExternalUses.clear();
    ExternalUsesAsOriginalScalar.clear();
    ExternalUsesWithNonUsers.clear();
    for (auto &Iter : BlocksSchedules) {
      BlockScheduling *BS = Iter.second.get();
      // ...
    }
    // ...
    ReductionBitWidth = 0;
    // ...
    CastMaxMinBWSizes.reset();
    ExtraBitWidthNodes.clear();
    InstrElementSize.clear();
    UserIgnoreList = nullptr;
    PostponedGathers.clear();
    ValueToGatherNodes.clear();
    TreeEntryToStridedPtrInfoMap.clear();
    assert(!Order.empty() && "expected non-empty order");
    const unsigned Sz = Order.size();
    // ...
      return P.value() == P.index() || P.value() == Sz;
    // ...

  // ...
                      bool IgnoreReorder);

  // ...
  std::optional<OrdersType>
  // ...

  // ...
    return MaxVecRegSize;
  // ...
    return MinVecRegSize;

  // ...
    unsigned MaxVF = MaxVFOption.getNumOccurrences()
                         ? MaxVFOption
                         : TTI->getMaximumVF(ElemWidth, Opcode);
    return MaxVF ? MaxVF : UINT_MAX;
// ...
                                Align Alignment, const int64_t Diff,
                                const size_t Sz) const;

  // ...
                             Value *Ptr0, Value *PtrN,
                             StridedPtrInfo &SPtrInfo) const;

  // ...
                            Align CommonAlignment,
                            // ...
                            StridedPtrInfo &SPtrInfo) const;

  // ...
                        StridedPtrInfo &SPtrInfo,
                        unsigned *BestVF = nullptr,
                        bool TryRecursiveCheck = true) const;

  // ...
    ListOfKnownNonVectorizableLoads.insert(hash_value(VL));

  // ...
  template <typename T>
  // ...
    return ListOfKnownNonVectorizableLoads.contains(hash_value(VL));
      OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
         << " EdgeIdx:" << EdgeIdx << "}";
        : TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes),
          MaxLevel(MaxLevel) {}

    // ...
      auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) {
        // ...
        auto AllUsersVectorized = [U1, U2, this](Value *V) {
          // ...
            return U == U1 || U == U2 || R.isVectorized(U);
          // ...
        };
        return AllUsersVectorized(V1) && AllUsersVectorized(V2);
      };
      // ...
      if (R.TTI->isLegalBroadcastLoad(V1->getType(),
                                      // ...
          ((int)V1->getNumUses() == NumLanes ||
           AllUsersAreInternal(V1, V2)))
        // ...
      auto CheckSameEntryOrFail = [&]() {
        // ...
            any_of(TEs2, [&](TreeEntry *E) { return Set.contains(E); }))
          // ...
      };
      // ...
        if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
            // ...
          return CheckSameEntryOrFail();
        // ...
            LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
            LI2->getPointerOperand(), DL, SE, true);
        if (!Dist || *Dist == 0) {
          // ...
              R.TTI->isLegalMaskedGather(
                  // ...
          return CheckSameEntryOrFail();
        }
        // ...
        if (std::abs(*Dist) > NumLanes / 2)
          // ...
      // ...
      Value *EV2 = nullptr;
      // ...
        int Dist = Idx2 - Idx1;
        // ...
        if (std::abs(Dist) == 0)
          // ...
        if (std::abs(Dist) > NumLanes / 2)
          // ...
        return CheckSameEntryOrFail();
      // ...
        if (I1->getParent() != I2->getParent())
          return CheckSameEntryOrFail();
        // ...
            (S.getMainOp()->getNumOperands() <= 2 || !MainAltOps.empty() ||
             !S.isAltShuffle()) &&
            // ...
              S.getMainOp()->getNumOperands();
      // ...
      return CheckSameEntryOrFail();

    // ...
      int ShallowScoreAtThisLevel =
          // ...
      if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
          // ...
          (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
          // ...
              ShallowScoreAtThisLevel))
        return ShallowScoreAtThisLevel;
      assert(I1 && I2 && "Should have early exited.");
      // ...
      for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
           OpIdx1 != NumOperands1; ++OpIdx1) {
        // ...
        int MaxTmpScore = 0;
        unsigned MaxOpIdx2 = 0;
        bool FoundBest = false;
        // ...
                ? I2->getNumOperands()
                : std::min(I2->getNumOperands(), OpIdx1 + 1);
        assert(FromIdx <= ToIdx && "Bad index");
        for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
          // ...
          if (Op2Used.count(OpIdx2))
            // ...
              I1, I2, CurrLevel + 1, {});
          // ...
              TmpScore > MaxTmpScore) {
            MaxTmpScore = TmpScore;
            // ...
          }
        }
        // ...
          Op2Used.insert(MaxOpIdx2);
          ShallowScoreAtThisLevel += MaxTmpScore;
        // ...
      }
      return ShallowScoreAtThisLevel;
    struct OperandData {
      OperandData() = default;
      OperandData(Value *V, bool APO, bool IsUsed)
          : V(V), APO(APO), IsUsed(IsUsed) {}
      // ...
      bool IsUsed = false;
    };

    enum class ReorderingMode {
      // ...
    };

    // ...
    unsigned ArgSize = 0;
    // ...
    const Loop *L = nullptr;

    OperandData &getData(unsigned OpIdx, unsigned Lane) {
      return OpsVec[OpIdx][Lane];
    }

    const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
      return OpsVec[OpIdx][Lane];
    }

    // ...
      for (unsigned OpIdx = 0, NumOperands = getNumOperands();
           // ...
        for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
             // ...
          OpsVec[OpIdx][Lane].IsUsed = false;

    void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
      std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
    }
    int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx,
                      // ...
      Value *IdxLaneV = getData(Idx, Lane).V;
      // ...
      unsigned UniquesCount = Uniques.size();
      auto IdxIt = Uniques.find(IdxLaneV);
      unsigned UniquesCntWithIdxLaneV =
          IdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
      // ...
      auto OpIdxIt = Uniques.find(OpIdxLaneV);
      unsigned UniquesCntWithOpIdxLaneV =
          OpIdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
      if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
        // ...
      return std::min(bit_ceil(UniquesCntWithOpIdxLaneV) -
                          UniquesCntWithOpIdxLaneV,
                      UniquesCntWithOpIdxLaneV -
                          bit_floor(UniquesCntWithOpIdxLaneV)) -
             ((IdxIt != Uniques.end() && UsedLanes.test(IdxIt->second))
                  ? UniquesCntWithIdxLaneV - bit_floor(UniquesCntWithIdxLaneV)
                  : bit_ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
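
// Numeric sketch for one term of the score above: with 4 unique values the
// group already sits on a power-of-two boundary, so bit_ceil(4) - 4 == 0
// (no penalty), while a 5th distinct value costs bit_ceil(5) - 5 == 3;
// candidates that keep the unique count near a power of two are preferred.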
    int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
      Value *IdxLaneV = getData(Idx, Lane).V;
      // ...
      return R.areAllUsersVectorized(IdxLaneI)
                 // ...
    }

    static const int ScoreScaleFactor = 10;

    // ...
                          int Lane, unsigned OpIdx, unsigned Idx,
                          // ...
        int SplatScore = getSplatScore(Lane, OpIdx, Idx, UsedLanes);
        if (Score <= -SplatScore) {
          // ...
        }
        // ...
          Score += SplatScore;
        // ...
        Score *= ScoreScaleFactor;
        Score += getExternalUseScore(Lane, OpIdx, Idx);
      // ...
    std::optional<unsigned>
    getBestOperand(unsigned OpIdx, int Lane, int LastLane,
                   // ...
      unsigned NumOperands = getNumOperands();
      // ...
      Value *OpLastLane = getData(OpIdx, LastLane).V;
      // ...
      ReorderingMode RMode = ReorderingModes[OpIdx];
      if (RMode == ReorderingMode::Failed)
        return std::nullopt;
      // ...
      bool OpIdxAPO = getData(OpIdx, Lane).APO;
      // ...
      std::optional<unsigned> Idx;
      // ...
          BestScoresPerLanes.try_emplace(std::make_pair(OpIdx, Lane), 0)
          // ...
      bool IsUsed = RMode == ReorderingMode::Splat ||
                    RMode == ReorderingMode::Constant ||
                    RMode == ReorderingMode::Load;
      // ...
      for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
        // ...
        OperandData &OpData = getData(Idx, Lane);
        // ...
        bool OpAPO = OpData.APO;
        // ...
        if (OpAPO != OpIdxAPO)
          // ...
        switch (RMode) {
        case ReorderingMode::Load:
        case ReorderingMode::Opcode: {
          bool LeftToRight = Lane > LastLane;
          Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
          Value *OpRight = (LeftToRight) ? Op : OpLastLane;
          int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
                                        OpIdx, Idx, IsUsed, UsedLanes);
          if (Score > static_cast<int>(BestOp.Score) ||
              (Score > 0 && Score == static_cast<int>(BestOp.Score) &&
               // ...
            BestOp.Score = Score;
            BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
          }
          // ...
        }
        case ReorderingMode::Constant:
          // ...
              (!BestOp.Score && L && L->isLoopInvariant(Op))) {
            // ...
            BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
                // ...
          }
          // ...
        case ReorderingMode::Splat:
          // ...
          IsUsed = Op == OpLastLane;
          if (Op == OpLastLane) {
            // ...
            BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
                // ...
          }
          // ...
        case ReorderingMode::Failed:
          // ...
        }
      }
      // ...
        getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
      // ...
      return std::nullopt;
    unsigned getBestLaneToStartReordering() const {
      unsigned Min = UINT_MAX;
      unsigned SameOpNumber = 0;
      // ...
      for (int I = getNumLanes(); I > 0; --I) {
        unsigned Lane = I - 1;
        OperandsOrderData NumFreeOpsHash =
            getMaxNumOperandsThatCanBeReordered(Lane);
        // ...
        if (NumFreeOpsHash.NumOfAPOs < Min) {
          Min = NumFreeOpsHash.NumOfAPOs;
          SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
          // ...
          HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
        } else if (NumFreeOpsHash.NumOfAPOs == Min &&
                   NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
          // ...
          SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
          HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
        } else if (NumFreeOpsHash.NumOfAPOs == Min &&
                   NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
          auto [It, Inserted] =
              HashMap.try_emplace(NumFreeOpsHash.Hash, 1, Lane);
          // ...
        }
      }
      // ...
      unsigned BestLane = 0;
      unsigned CntMin = UINT_MAX;
      // ...
        if (Data.second.first < CntMin) {
          CntMin = Data.second.first;
          BestLane = Data.second.second;
        }
      // ...
    }
    struct OperandsOrderData {
      // ...
      unsigned NumOfAPOs = UINT_MAX;
      // ...
      unsigned NumOpsWithSameOpcodeParent = 0;
      // ...
    };

    // ...
    OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
      unsigned CntTrue = 0;
      unsigned NumOperands = getNumOperands();
      // ...
      bool AllUndefs = true;
      unsigned NumOpsWithSameOpcodeParent = 0;
      // ...
        const OperandData &OpData = getData(OpIdx, Lane);
        // ...
            I->getParent() != Parent) {
          if (NumOpsWithSameOpcodeParent == 0) {
            NumOpsWithSameOpcodeParent = 1;
            // ...
          }
          Parent = I->getParent();
          // ...
          --NumOpsWithSameOpcodeParent;
        // ...
          ++NumOpsWithSameOpcodeParent;
        // ...
      OperandsOrderData Data;
      Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
      Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
      // ...
    }
                        const InstructionsState &S) {
      // ...
        return VL.size() == getNumLanes();
      // ...
             "Expected same number of lanes");
      assert(S.valid() && "InstructionsState is invalid.");
      // ...
      OpsVec.resize(ArgSize);
      unsigned NumLanes = VL.size();
      for (OperandDataVec &Ops : OpsVec)
        Ops.resize(NumLanes);
      // ...
            OpsVec[OpIdx][Lane] = {Operands[OpIdx][Lane], true, false};
          // ...
        bool IsInverseOperation = false;
        if (S.isCopyableElement(VL[Lane])) {
          // ...
          IsInverseOperation =
              // ...
        } else {
          // ...
          assert(I && "Expected instruction");
          auto [SelectedOp, Ops] = convertTo(I, S);
          // ...
        }
        // ...
          bool APO = (OpIdx == 0) ? false : IsInverseOperation;
          OpsVec[OpIdx][Lane] = {Operands[OpIdx][Lane], APO, false};
        // ...
    unsigned getNumOperands() const { return ArgSize; }

    unsigned getNumLanes() const { return OpsVec[0].size(); }

    Value *getValue(unsigned OpIdx, unsigned Lane) const {
      return getData(OpIdx, Lane).V;
    }

    bool empty() const { return OpsVec.empty(); }

    void clear() { OpsVec.clear(); }
    bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
      // ...
             "Op is expected to be getValue(OpIdx, Lane).");
      // ...
      bool OpAPO = getData(OpIdx, Lane).APO;
      bool IsInvariant = L && L->isLoopInvariant(Op);
      // ...
      for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
        // ...
        bool FoundCandidate = false;
        for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
          OperandData &Data = getData(OpI, Ln);
          if (Data.APO != OpAPO || Data.IsUsed)
            // ...
          Value *OpILane = getValue(OpI, Lane);
          // ...
              L->isLoopInvariant(Data.V))) {
            FoundCandidate = true;
            // ...
          }
        }
        if (!FoundCandidate)
          // ...
      }
      return getNumLanes() == 2 || Cnt > 1;
    }
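
// Intuition for the result above (illustrative): broadcasting is chosen when
// the candidate value already appears in other lanes of some operand, i.e. a
// splat is likely cheaper than gathering distinct scalars; with exactly two
// lanes a single extra occurrence is already decisive.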
3262 "Op is expected to be getValue(OpIdx, Lane).");
3263 bool OpAPO = getData(
OpIdx, Lane).APO;
3264 for (
unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
3268 const OperandData &
Data = getData(OpI, Ln);
3269 if (
Data.APO != OpAPO ||
Data.IsUsed)
3271 Value *OpILn = getValue(OpI, Ln);
3272 return (L && L->isLoopInvariant(OpILn)) ||
    // ...
               const InstructionsState &S, const BoUpSLP &R)
        : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R),
          L(R.LI->getLoopFor(S.getMainOp()->getParent())) {
      // ...
      appendOperands(RootVL, Operands, S);
    }

    // ...
             "Expected same num of lanes across all operands");
      for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
        OpVL[Lane] = OpsVec[OpIdx][Lane].V;
      unsigned NumOperands = getNumOperands();
      unsigned NumLanes = getNumLanes();
      // ...
      unsigned FirstLane = getBestLaneToStartReordering();
      // ...
          if (shouldBroadcast(OpLane0, OpIdx, FirstLane) ||
              !canBeVectorized(OpILane0, OpIdx, FirstLane))
            ReorderingModes[OpIdx] = ReorderingMode::Splat;
          // ...
            ReorderingModes[OpIdx] = ReorderingMode::Load;
          // ...
            ReorderingModes[OpIdx] = ReorderingMode::Opcode;
        // ...
          ReorderingModes[OpIdx] = ReorderingMode::Constant;
        // ...
          ReorderingModes[OpIdx] = ReorderingMode::Splat;
        // ...

      // ...
      auto &&SkipReordering = [this]() {
        // ...
        for (const OperandData &Data : Op0)
          // ...
        for (ArrayRef<OperandData> Op :
             ArrayRef(OpsVec).slice(1, getNumOperands() - 1)) {
          if (any_of(Op, [&UniqueValues](const OperandData &Data) {
                // ...
              }))
            // ...
        }
        return UniqueValues.size() != 2 &&
               // ...
                   UniqueValues.size());
      };
      // ...
      if (SkipReordering())
        // ...
      bool StrategyFailed = false;
      // ...
      for (unsigned I = 0; I < NumOperands; ++I)
        MainAltOps[I].push_back(getData(I, FirstLane).V);
      // ...
      UsedLanes.set(FirstLane);
      for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
        // ...
        for (int Direction : {+1, -1}) {
          int Lane = FirstLane + Direction * Distance;
          if (Lane < 0 || Lane >= (int)NumLanes)
            // ...
          UsedLanes.set(Lane);
          int LastLane = Lane - Direction;
          assert(LastLane >= 0 && LastLane < (int)NumLanes &&
                 // ...
            std::optional<unsigned> BestIdx =
                getBestOperand(OpIdx, Lane, LastLane, ReorderingModes,
                               MainAltOps[OpIdx], UsedLanes);
            // ...
              swap(OpIdx, *BestIdx, Lane);
            // ...
              StrategyFailed = true;
            // ...
              OperandData &AltOp = getData(OpIdx, Lane);
              InstructionsState OpS =
                  // ...
              if (OpS && OpS.isAltShuffle())
                // ...
        }
      }
      // ...
      if (!StrategyFailed)
        // ...
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    // ...
      case ReorderingMode::Load:
        // ...
      case ReorderingMode::Opcode:
        // ...
      case ReorderingMode::Constant:
        // ...
      case ReorderingMode::Splat:
        // ...
      case ReorderingMode::Failed:
        // ...

    // ...
      const unsigned Indent = 2;
      // ...
      for (const OperandDataVec &OpDataVec : OpsVec) {
        OS << "Operand " << Cnt++ << "\n";
        for (const OperandData &OpData : OpDataVec) {
          OS.indent(Indent) << "{";
          if (Value *V = OpData.V)
            // ...
          OS << ", APO:" << OpData.APO << "}\n";
        }
      }
    int BestScore = Limit;
    std::optional<int> Index;
    for (int I : seq<int>(0, Candidates.size())) {
      // ...
                                               Candidates[I].second,
                                               // ...
      if (Score > BestScore) {
        // ...
    DeletedInstructions.insert(I);

  // ...
  template <typename T>
  // ...
      ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales) {
    // ...
    for (T *V : DeadVals) {
      // ...
    }
    for (T *V : DeadVals) {
      if (!V || !Processed.insert(V).second)
        // ...
      for (Use &U : I->operands()) {
        // ...
            OpI && !DeletedInstructions.contains(OpI) && OpI->hasOneUser() &&
            // ...
            (Entries.empty() || none_of(Entries, [&](const TreeEntry *Entry) {
               return Entry->VectorizedValue == OpI;
             })))
          // ...
      }
      I->dropAllReferences();
    }
    for (T *V : DeadVals) {
      // ...
      if (!I->getParent())
        // ...
          cast<Instruction>(U.getUser()));
      // ...
             "trying to erase instruction with users.");
      I->removeFromParent();
      // ...
    }
    while (!DeadInsts.empty()) {
      // ...
      if (!VI || !VI->getParent())
        // ...
             "Live instruction found in dead worklist!");
      assert(VI->use_empty() && "Instructions with uses are not dead.");
      // ...
      for (Use &OpU : VI->operands()) {
        Value *OpV = OpU.get();
        // ...
        if (!DeletedInstructions.contains(OpI) &&
            (!OpI->getType()->isVectorTy() ||
             none_of(VectorValuesAndScales,
                     [&](const std::tuple<Value *, unsigned, bool> &V) {
                       return std::get<0>(V) == OpI;
                     })))
          // ...
      }
      VI->removeFromParent();
      // ...
      SE->forgetValue(VI);
      // ...
    }
    return AnalyzedReductionsRoots.count(I);
  // ...
    AnalyzedReductionsRoots.insert(I);
  // ...
    return AnalyzedReductionVals.contains(hash_value(VL));
  // ...
    AnalyzedReductionVals.insert(hash_value(VL));
  // ...
    AnalyzedReductionsRoots.clear();
    AnalyzedReductionVals.clear();
    AnalyzedMinBWVals.clear();
  // ...
    return MustGather.contains(V);
  // ...
    return NonScheduledFirst.contains(V);
  // ...
    assert(V && "V cannot be nullptr.");
    return ScalarToTreeEntries.contains(V);
  bool collectValuesToDemote(
      const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
      // ...
      bool &IsProfitableToDemote, bool IsTruncRoot) const;

  // ...
  void buildReorderableOperands(
      // ...

  void reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const;

  bool areAllUsersVectorized(
      // ...

  const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const;
  TreeEntry *getOperandEntry(TreeEntry *E, unsigned Idx) {
    return const_cast<TreeEntry *>(
        getOperandEntry(const_cast<const TreeEntry *>(E), Idx));
  }

  Instruction *getRootEntryInstruction(const TreeEntry &Entry) const;

  // ...
  getCastContextHint(const TreeEntry &TE) const;

  // ...
                     const InstructionsState &LocalState,
                     // ...
                     unsigned InterleaveFactor = 0);

  // ...
                 bool ResizeAllowed = false) const;

  Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx);

  template <typename BVTy, typename ResTy, typename... Args>
  ResTy processBuildVector(const TreeEntry *E, Type *ScalarTy, Args &...Params);

  Value *createBuildVector(const TreeEntry *E, Type *ScalarTy);

  Instruction &getLastInstructionInBundle(const TreeEntry *E);

  std::optional<TargetTransformInfo::ShuffleKind>
  // ...
                                     unsigned NumParts) const;

  std::optional<TargetTransformInfo::ShuffleKind>
  isGatherShuffledSingleRegisterEntry(
      // ...

  // ...
  isGatherShuffledEntry(
      // ...
      unsigned NumParts, bool ForOrder = false);

  // ...
                   Type *ScalarTy) const;

  void setInsertPointAfterBundle(const TreeEntry *E);

  // ...
  bool isFullyVectorizableTinyTree(bool ForReduction) const;

  void tryToVectorizeGatheredLoads(
      // ...
      std::tuple<BasicBlock *, Value *, Type *>,
      // ...

  // ...
  collectUserStores(const BoUpSLP::TreeEntry *TE) const;

  // ...
  findExternalStoreUsersReorderIndices(TreeEntry *TE) const;

  // ...
  void reorderGatherNode(TreeEntry &TE);
    TreeEntry(VecTreeTy &Container) : Container(Container) {}

    // ...
    SmallVector<int> getCommonMask() const {
      if (State == TreeEntry::SplitVectorize)
        // ...
      SmallVector<int> Mask;
      // ...
    }

    SmallVector<int> getSplitMask() const {
      assert(State == TreeEntry::SplitVectorize && !ReorderIndices.empty() &&
             "Expected only split vectorize node.");
      // ...
      unsigned CommonVF = std::max<unsigned>(
          CombinedEntriesWithIndices.back().second,
          Scalars.size() - CombinedEntriesWithIndices.back().second);
      for (auto [Idx, I] : enumerate(ReorderIndices))
        // ...
            Idx + (Idx >= CombinedEntriesWithIndices.back().second
                       ? CommonVF - CombinedEntriesWithIndices.back().second
                       // ...
      // ...
    }

    void reorderSplitNode(unsigned Idx, ArrayRef<int> Mask,
                          ArrayRef<int> MaskOrder);

    // ...
      if (Mask.size() != VL.size() && VL.size() == Scalars.size())
        return std::equal(VL.begin(), VL.end(), Scalars.begin());
      // ...
                        [Scalars](Value *V, int Idx) {
                          return (isa<UndefValue>(V) &&
                                  Idx == PoisonMaskElem) ||
                                 (Idx != PoisonMaskElem && V == Scalars[Idx]);
                        });
      if (!ReorderIndices.empty()) {
        // ...
        SmallVector<int> Mask;
        // ...
        if (VL.size() == Scalars.size())
          return IsSame(Scalars, Mask);
        if (VL.size() == ReuseShuffleIndices.size()) {
          // ...
          return IsSame(Scalars, Mask);
        }
        // ...
      }
      return IsSame(Scalars, ReuseShuffleIndices);

    bool hasEqualOperands(const TreeEntry &TE) const {
      if (TE.getNumOperands() != getNumOperands())
        // ...
      SmallBitVector Used(getNumOperands());
      for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {
        unsigned PrevCount = Used.count();
        for (unsigned K = 0; K < E; ++K) {
          // ...
          if (getOperand(K) == TE.getOperand(I)) {
            // ...
          }
        }
        if (PrevCount == Used.count())
          // ...
      }
      // ...
    }

    unsigned getVectorFactor() const {
      if (!ReuseShuffleIndices.empty())
        return ReuseShuffleIndices.size();
      return Scalars.size();
    }

    bool isGather() const { return State == NeedToGather; }
    WeakTrackingVH VectorizedValue = nullptr;

    // ...
    enum CombinedOpcode {
      // ...
      MinMax = Instruction::OtherOpsEnd + 1,
    };
    CombinedOpcode CombinedOp = NotCombinedOp;

    SmallVector<int, 4> ReuseShuffleIndices;

    SmallVector<unsigned, 4> ReorderIndices;

    // ...
    VecTreeTy &Container;

    EdgeInfo UserTreeIndex;

    // ...
    SmallVector<ValueList, 2> Operands;

    SmallPtrSet<const Value *, 4> CopyableElements;

    // ...
    InstructionsState S = InstructionsState::invalid();

    unsigned InterleaveFactor = 0;

    bool DoesNotNeedToSchedule = false;

    // ...
      if (Operands.size() < OpIdx + 1)
        Operands.resize(OpIdx + 1);
      // ...
             "Number of operands is greater than the number of scalars.");
      // ...

    unsigned getInterleaveFactor() const { return InterleaveFactor; }
    void setInterleave(unsigned Factor) { InterleaveFactor = Factor; }

    void setDoesNotNeedToSchedule() { DoesNotNeedToSchedule = true; }
    bool doesNotNeedToSchedule() const { return DoesNotNeedToSchedule; }

    // ...
        setOperand(I, Operands[I]);

    void reorderOperands(ArrayRef<int> Mask) {
      // ...
    }

    // ...
      return Operands[OpIdx];
    // ...
      return Operands[OpIdx];

    unsigned getNumOperands() const { return Operands.size(); }

    Value *getSingleOperand(unsigned OpIdx) const {
      // ...
      return Operands[OpIdx][0];
    }

    bool isAltShuffle() const { return S.isAltShuffle(); }

    Instruction *getMatchingMainOpOrAltOp(Instruction *I) const {
      return S.getMatchingMainOpOrAltOp(I);
    }

    // ...
      if (I && getMatchingMainOpOrAltOp(I))
        // ...
      return S.getMainOp();

    void setOperations(const InstructionsState &S) {
      assert(S && "InstructionsState is invalid.");
      // ...
    }

    Instruction *getMainOp() const { return S.getMainOp(); }

    Instruction *getAltOp() const { return S.getAltOp(); }

    unsigned getOpcode() const { return S.getOpcode(); }

    unsigned getAltOpcode() const { return S.getAltOpcode(); }

    bool hasState() const { return S.valid(); }

    void addCopyableElement(Value *V) {
      assert(S.isCopyableElement(V) && "Not a copyable element.");
      CopyableElements.insert(V);
    }

    bool isCopyableElement(Value *V) const {
      return CopyableElements.contains(V);
    }

    bool hasCopyableElements() const { return !CopyableElements.empty(); }

    const InstructionsState &getOperations() const { return S; }

    unsigned findLaneForValue(Value *V) const {
      unsigned FoundLane = getVectorFactor();
      for (auto *It = find(Scalars, V), *End = Scalars.end(); It != End;
           std::advance(It, 1)) {
        // ...
        FoundLane = std::distance(Scalars.begin(), It);
        assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
        if (!ReorderIndices.empty())
          FoundLane = ReorderIndices[FoundLane];
        assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
        if (ReuseShuffleIndices.empty())
          // ...
        if (auto *RIt = find(ReuseShuffleIndices, FoundLane);
            RIt != ReuseShuffleIndices.end()) {
          FoundLane = std::distance(ReuseShuffleIndices.begin(), RIt);
          // ...
        }
      }
      assert(FoundLane < getVectorFactor() && "Unable to find given value.");
      // ...
    }
    // ...
    buildAltOpShuffleMask(const function_ref<bool(Instruction *)> IsAltOp,
                          SmallVectorImpl<int> &Mask,
                          SmallVectorImpl<Value *> *OpScalars = nullptr,
                          SmallVectorImpl<Value *> *AltScalars = nullptr) const;

    bool isNonPowOf2Vec() const {
      // ...
      return IsNonPowerOf2;
    }

    // ...
    hasNonWholeRegisterOrNonPowerOf2Vec(const TargetTransformInfo &TTI) const {
      // ...
      assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) &&
             "Reshuffling not supported with non-power-of-2 vectors yet.");
      return IsNonPowerOf2;
    }

    Value *getOrdered(unsigned Idx) const {
      assert(isGather() && "Must be used only for buildvectors/gathers.");
      if (ReorderIndices.empty())
        return Scalars[Idx];
      SmallVector<int> Mask;
      // ...
      return Scalars[Mask[Idx]];
    }

    // ...
      dbgs() << Idx << ".\n";
      for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
        dbgs() << "Operand " << OpI << ":\n";
        for (const Value *V : Operands[OpI])
          // ...
      }
      dbgs() << "Scalars: \n";
      for (Value *V : Scalars)
        // ...
      dbgs() << "State: ";
      if (S && hasCopyableElements())
        dbgs() << "[[Copyable]] ";
      // ...
        if (InterleaveFactor > 0) {
          dbgs() << "Vectorize with interleave factor " << InterleaveFactor
                 // ...
        } else {
          dbgs() << "Vectorize\n";
        }
        // ...
      case ScatterVectorize:
        dbgs() << "ScatterVectorize\n";
        // ...
      case StridedVectorize:
        dbgs() << "StridedVectorize\n";
        // ...
      case CompressVectorize:
        dbgs() << "CompressVectorize\n";
        // ...
        dbgs() << "NeedToGather\n";
        // ...
      case CombinedVectorize:
        dbgs() << "CombinedVectorize\n";
        // ...
      case SplitVectorize:
        dbgs() << "SplitVectorize\n";
        // ...
      }
      // ...
        dbgs() << "MainOp: " << *S.getMainOp() << "\n";
        dbgs() << "AltOp: " << *S.getAltOp() << "\n";
      // ...
        dbgs() << "MainOp: NULL\n";
        dbgs() << "AltOp: NULL\n";
      // ...
      dbgs() << "VectorizedValue: ";
      if (VectorizedValue)
        dbgs() << *VectorizedValue << "\n";
      // ...
      dbgs() << "ReuseShuffleIndices: ";
      if (ReuseShuffleIndices.empty())
        // ...
      for (int ReuseIdx : ReuseShuffleIndices)
        dbgs() << ReuseIdx << ", ";
      // ...
      dbgs() << "ReorderIndices: ";
      for (unsigned ReorderIdx : ReorderIndices)
        dbgs() << ReorderIdx << ", ";
      // ...
      dbgs() << "UserTreeIndex: ";
      // ...
        dbgs() << UserTreeIndex;
      // ...
        dbgs() << "<invalid>";
      // ...
      if (!CombinedEntriesWithIndices.empty()) {
        dbgs() << "Combined entries: ";
        // ...
          dbgs() << "Entry index " << P.first << " with offset " << P.second;
        // ...
      }

    // ...
              StringRef Banner) const {
      dbgs() << "SLP: " << Banner << ":\n";
      // ...
      dbgs() << "SLP: Costs:\n";
      dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n";
      dbgs() << "SLP: VectorCost = " << VecCost << "\n";
      dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n";
      dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = "
             << ReuseShuffleCost + VecCost - ScalarCost << "\n";
    }
  // ...
                            const InstructionsState &S,
                            // ...
                            ArrayRef<int> ReuseShuffleIndices = {}) {
    auto Invalid = ScheduleBundle::invalid();
    return newTreeEntry(VL, Invalid, S, UserTreeIdx, ReuseShuffleIndices);
  }

  // ...
                            const InstructionsState &S,
                            // ...
                            ArrayRef<int> ReuseShuffleIndices = {},
                            ArrayRef<unsigned> ReorderIndices = {},
                            unsigned InterleaveFactor = 0) {
    TreeEntry::EntryState EntryState =
        Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
    TreeEntry *E = newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
                                ReuseShuffleIndices, ReorderIndices);
    if (E && InterleaveFactor > 0)
      E->setInterleave(InterleaveFactor);
    // ...
  }

  // ...
                            TreeEntry::EntryState EntryState,
                            ScheduleBundle &Bundle, const InstructionsState &S,
                            // ...
                            ArrayRef<int> ReuseShuffleIndices = {},
                            ArrayRef<unsigned> ReorderIndices = {}) {
    assert(((!Bundle && (EntryState == TreeEntry::NeedToGather ||
                         EntryState == TreeEntry::SplitVectorize)) ||
            (Bundle && EntryState != TreeEntry::NeedToGather &&
             EntryState != TreeEntry::SplitVectorize)) &&
           "Need to vectorize gather entry?");
    // ...
    if (GatheredLoadsEntriesFirst.has_value() &&
        EntryState == TreeEntry::NeedToGather && S &&
        S.getOpcode() == Instruction::Load && UserTreeIdx.EdgeIdx == UINT_MAX &&
        !UserTreeIdx.UserTE)
      // ...
    VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
    TreeEntry *Last = VectorizableTree.back().get();
    Last->Idx = VectorizableTree.size() - 1;
    Last->State = EntryState;
    if (UserTreeIdx.UserTE)
      OperandsToTreeEntry.try_emplace(
          std::make_pair(UserTreeIdx.UserTE, UserTreeIdx.EdgeIdx), Last);
    // ...
            ReuseShuffleIndices.empty()) &&
           "Reshuffling scalars not yet supported for nodes with padding");
    Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
                                     ReuseShuffleIndices.end());
    if (ReorderIndices.empty()) {
      // ...
      Last->setOperations(S);
    } else {
      // ...
      Last->Scalars.assign(VL.size(), nullptr);
      // ...
                [VL](unsigned Idx) -> Value * {
                  if (Idx >= VL.size())
                    return UndefValue::get(VL.front()->getType());
                  // ...
                });
      // ...
      Last->setOperations(S);
      Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
    }
    if (EntryState == TreeEntry::SplitVectorize) {
      assert(S && "Split nodes must have operations.");
      Last->setOperations(S);
      SmallPtrSet<Value *, 4> Processed;
      for (Value *V : VL) {
        // ...
        auto It = ScalarsInSplitNodes.find(V);
        if (It == ScalarsInSplitNodes.end()) {
          ScalarsInSplitNodes.try_emplace(V).first->getSecond().push_back(Last);
          (void)Processed.insert(V);
        } else if (Processed.insert(V).second) {
          // ...
                 "Value already associated with the node.");
          It->getSecond().push_back(Last);
        }
      }
    } else if (!Last->isGather()) {
      // ...
      if ((!S.areInstructionsWithCopyableElements() &&
           // ...
          all_of(VL, [&](Value *V) { return S.isNonSchedulable(V); }))
        Last->setDoesNotNeedToSchedule();
      SmallPtrSet<Value *, 4> Processed;
      for (Value *V : VL) {
        // ...
        if (S.isCopyableElement(V)) {
          Last->addCopyableElement(V);
          // ...
        }
        auto It = ScalarToTreeEntries.find(V);
        if (It == ScalarToTreeEntries.end()) {
          ScalarToTreeEntries.try_emplace(V).first->getSecond().push_back(Last);
          (void)Processed.insert(V);
        } else if (Processed.insert(V).second) {
          // ...
                 "Value already associated with the node.");
          It->getSecond().push_back(Last);
        }
      }
      assert((!Bundle.getBundle().empty() || Last->doesNotNeedToSchedule()) &&
             "Bundle and VL out of sync");
      if (!Bundle.getBundle().empty()) {
#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
        auto *BundleMember = Bundle.getBundle().begin();
        SmallPtrSet<Value *, 4> Processed;
        for (Value *V : VL) {
          if (S.isNonSchedulable(V) || !Processed.insert(V).second)
            // ...
        }
        assert(BundleMember == Bundle.getBundle().end() &&
               "Bundle and VL out of sync");
#endif
        Bundle.setTreeEntry(Last);
      }
    } else {
      bool AllConstsOrCasts = true;
      for (Value *V : VL) {
        if (S && S.areInstructionsWithCopyableElements() &&
            S.isCopyableElement(V))
          Last->addCopyableElement(V);
        // ...
        AllConstsOrCasts &= I && I->getType()->isIntegerTy();
        if (UserTreeIdx.EdgeIdx != UINT_MAX || !UserTreeIdx.UserTE ||
            !UserTreeIdx.UserTE->isGather())
          ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last);
      }
      if (AllConstsOrCasts)
        // ...
            std::make_pair(std::numeric_limits<unsigned>::max(), 1);
      MustGather.insert_range(VL);
    }
    if (UserTreeIdx.UserTE)
      Last->UserTreeIndex = UserTreeIdx;
    // ...
  }
  TreeEntry::VecTreeTy VectorizableTree;

  // ...
    for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
      VectorizableTree[Id]->dump();
      // ...
    }

  // ...
    assert(V && "V cannot be nullptr.");
    auto It = ScalarToTreeEntries.find(V);
    if (It == ScalarToTreeEntries.end())
      // ...
    return It->getSecond();

  // ...
    assert(V && "V cannot be nullptr.");
    auto It = ScalarsInSplitNodes.find(V);
    if (It == ScalarsInSplitNodes.end())
      // ...
    return It->getSecond();

  // ...
                        bool SameVF = false) const {
    assert(V && "V cannot be nullptr.");
    for (TreeEntry *TE : ScalarToTreeEntries.lookup(V))
      if ((!SameVF || TE->getVectorFactor() == VL.size()) && TE->isSame(VL))
        // ...
  bool areAltOperandsProfitable(const InstructionsState &S,
                                // ...

  // ...
  class ScalarsVectorizationLegality {
    InstructionsState S;
    // ...
    bool TryToFindDuplicates;
    bool TrySplitVectorize;

  public:
    ScalarsVectorizationLegality(InstructionsState S, bool IsLegal,
                                 bool TryToFindDuplicates = true,
                                 bool TrySplitVectorize = false)
        : S(S), IsLegal(IsLegal), TryToFindDuplicates(TryToFindDuplicates),
          TrySplitVectorize(TrySplitVectorize) {
      assert((!IsLegal || (S.valid() && TryToFindDuplicates)) &&
             "Inconsistent state");
    }
    const InstructionsState &getInstructionsState() const { return S; }
    bool isLegal() const { return IsLegal; }
    bool tryToFindDuplicates() const { return TryToFindDuplicates; }
    bool trySplitVectorize() const { return TrySplitVectorize; }
  };

  // ...
  ScalarsVectorizationLegality
  // ...
                               bool TryCopyableElementsVectorization) const;

  TreeEntry::EntryState getScalarsVectorizationState(
      // ...
      bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
      SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo);
  /// Maps a specific scalar to its tree entries.
  SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarToTreeEntries;

  /// Maps the operand index and entry to the corresponding tree entry.
  SmallDenseMap<std::pair<const TreeEntry *, unsigned>, TreeEntry *>
      OperandsToTreeEntry;

  /// Scalars, used in split vectorize nodes.
  SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarsInSplitNodes;

  /// Maps a value to the proposed vectorized size.
  SmallDenseMap<Value *, unsigned> InstrElementSize;

  /// Maps a tree entry to the last instruction of the entry.
  SmallDenseMap<const TreeEntry *, WeakTrackingVH> EntryToLastInstruction;

  /// Maps an instruction to its cached position in the block.
  SmallDenseMap<const Instruction *, Instruction *> LastInstructionToPos;

  /// List of gather nodes, depending on other gather/vector nodes, which
  /// should be emitted after the vector instruction emission process to
  /// correctly handle the order of the vector instructions and shuffles.
  SetVector<const TreeEntry *> PostponedGathers;

  using ValueToGatherNodesMap =
      DenseMap<Value *, SmallSetVector<const TreeEntry *, 4>>;
  ValueToGatherNodesMap ValueToGatherNodes;

  /// The list of the load entries (node indices), which can be vectorized
  /// via strided or masked gather approach, but attempted to be represented
  /// as contiguous loads.
  SetVector<unsigned> LoadEntriesToVectorize;

  /// True if the graph nodes transforming mode is on.
  bool IsGraphTransformMode = false;

  /// The index of the first gathered load entry in the VectorizableTree.
  std::optional<unsigned> GatheredLoadsEntriesFirst;

  /// Maps compress load entries to their mask data (compress mask, load
  /// vector type, interleave factor, is-masked flag).
  SmallDenseMap<const TreeEntry *,
                std::tuple<SmallVector<int>, VectorType *, unsigned, bool>>
      CompressEntryToData;
  /// This POD struct describes one external user in the vectorized tree.
  struct ExternalUser {
    ExternalUser(Value *S, llvm::User *U, const TreeEntry &E, unsigned L)
        : Scalar(S), User(U), E(E), Lane(L) {}

    /// Which scalar in our function.
    Value *Scalar = nullptr;

    /// Which user that uses the scalar.
    llvm::User *User = nullptr;

    /// The tree entry the scalar belongs to.
    const TreeEntry &E;

    /// Which lane does the scalar belong to.
    unsigned Lane;
  };
  using UserList = SmallVector<ExternalUser, 16>;
  /// Checks if two instructions may access the same memory.
  ///
  /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
  /// is invariant in the calling loop.
  bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
                 Instruction *Inst2) {
    // First check if the result is already in the cache.
    AliasCacheKey Key = std::make_pair(Inst1, Inst2);
    auto Res = AliasCache.try_emplace(Key);
    if (!Res.second)
      return Res.first->second;
    bool Aliased = isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1));
    // Store the result in the cache.
    Res.first->getSecond() = Aliased;
    return Aliased;
  }

  using AliasCacheKey = std::pair<Instruction *, Instruction *>;

  /// Cache for alias results.
  /// TODO: consider moving this to the AliasAnalysis itself.
  SmallDenseMap<AliasCacheKey, bool> AliasCache;
  /// Batch alias analysis results, valid until the memory IR changes.
  BatchAAResults BatchAA;

  /// Temporary store for deleted instructions. Instructions will be deleted
  /// eventually when the BoUpSLP is destructed.
  DenseSet<Instruction *> DeletedInstructions;

  /// Set of the instruction, being analyzed already for reductions.
  SmallPtrSet<Instruction *, 16> AnalyzedReductionsRoots;

  /// Set of hashes for the list of reduction values already being analyzed.
  DenseSet<size_t> AnalyzedReductionVals;

  /// Values, already been analyzed for minimal bitwidth and found to be
  /// non-profitable.
  DenseSet<Value *> AnalyzedMinBWVals;

  /// A list of values that need to be extracted out of the tree.
  /// This list holds pairs of (Internal Scalar : External User). External
  /// User can be nullptr, it means that this Internal Scalar will be used
  /// later, after vectorization.
  UserList ExternalUses;

  /// A list of scalars to be extracted without specific user necessity.
  SmallPtrSet<Value *, 4> ExternalUsesAsOriginalScalar;

  /// A list of scalars with external non-user uses.
  SmallPtrSet<Value *, 4> ExternalUsesWithNonUsers;

  /// Values used only by @llvm.assume calls.
  SmallPtrSet<const Value *, 32> EphValues;

  /// Holds all of the instructions that we gathered, shuffle instructions
  /// and extractelements.
  SetVector<Instruction *> GatherShuffleExtractSeq;

  /// A list of blocks that we are going to CSE.
  DenseSet<BasicBlock *> CSEBlocks;

  /// List of hashes of vector of loads, which are known to be non
  /// vectorizable.
  DenseSet<size_t> ListOfKnonwnNonVectorizableLoads;
  /// Base class for the scheduling entities: single instructions, bundles and
  /// copyable data.
  class ScheduleEntity {
    friend class ScheduleBundle;
    friend class ScheduleData;
    friend class ScheduleCopyableData;

  protected:
    enum class Kind { ScheduleData, ScheduleBundle, ScheduleCopyableData };
    Kind getKind() const { return K; }
    ScheduleEntity(Kind K) : K(K) {}

  private:
    /// Used for getting a "good" final ordering of instructions.
    int SchedulingPriority = 0;
    /// True if this instruction (or bundle) is scheduled.
    bool IsScheduled = false;
    /// The kind of the ScheduleEntity.
    const Kind K = Kind::ScheduleData;

  public:
    ScheduleEntity() = delete;
    void setSchedulingPriority(int Priority) { SchedulingPriority = Priority; }
    int getSchedulingPriority() const { return SchedulingPriority; }
    bool isReady() const {
      if (auto *SD = dyn_cast<ScheduleData>(this))
        return SD->isReady();
      if (auto *CD = dyn_cast<ScheduleCopyableData>(this))
        return CD->isReady();
      return cast<ScheduleBundle>(this)->isReady();
    }
    /// Returns true if the dependency information has been calculated.
    bool hasValidDependencies() const {
      if (auto *SD = dyn_cast<ScheduleData>(this))
        return SD->hasValidDependencies();
      if (auto *CD = dyn_cast<ScheduleCopyableData>(this))
        return CD->hasValidDependencies();
      return cast<ScheduleBundle>(this)->hasValidDependencies();
    }
    /// Gets the number of unscheduled dependencies.
    int getUnscheduledDeps() const {
      if (auto *SD = dyn_cast<ScheduleData>(this))
        return SD->getUnscheduledDeps();
      return cast<ScheduleCopyableData>(this)->getUnscheduledDeps();
    }
    /// Increments the number of unscheduled dependencies.
    int incrementUnscheduledDeps(int Incr) {
      if (auto *SD = dyn_cast<ScheduleData>(this))
        return SD->incrementUnscheduledDeps(Incr);
      return cast<ScheduleCopyableData>(this)->incrementUnscheduledDeps(Incr);
    }
    /// Gets the number of dependencies.
    int getDependencies() const {
      if (auto *SD = dyn_cast<ScheduleData>(this))
        return SD->getDependencies();
      return cast<ScheduleCopyableData>(this)->getDependencies();
    }
    /// Gets the instruction.
    Instruction *getInst() const {
      if (auto *SD = dyn_cast<ScheduleData>(this))
        return SD->getInst();
      return cast<ScheduleCopyableData>(this)->getInst();
    }

    bool isScheduled() const { return IsScheduled; }
    void setScheduled(bool Scheduled) { IsScheduled = Scheduled; }

    static bool classof(const ScheduleEntity *) { return true; }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    void dump(raw_ostream &OS) const {
      if (const auto *SD = dyn_cast<ScheduleData>(this))
        return SD->dump(OS);
      if (const auto *CD = dyn_cast<ScheduleCopyableData>(this))
        return CD->dump(OS);
      cast<ScheduleBundle>(this)->dump(OS);
    }
    LLVM_DUMP_METHOD void dump() const { dump(dbgs()); }
#endif
  };

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  friend inline raw_ostream &operator<<(raw_ostream &OS,
                                        const BoUpSLP::ScheduleEntity &SE) {
    SE.dump(OS);
    return OS;
  }
#endif
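  // The scheduler models three kinds of entities behind this common base:
  //   ScheduleData         - one scalar instruction,
  //   ScheduleBundle       - a group of instructions scheduled together,
  //   ScheduleCopyableData - a per-edge "virtual copy" of an instruction.
  // Dispatch uses LLVM-style RTTI on the Kind tag. For example:
  //   if (auto *SD = dyn_cast<ScheduleData>(Entity))
  //     ...       // a single instruction
  //   else if (auto *Bundle = dyn_cast<ScheduleBundle>(Entity))
  //     ...       // a whole bundle
  // Note that ScheduleEntity::classof() returns true unconditionally, so a
  // cast<ScheduleEntity>(...) always succeeds; only the derived classes
  // discriminate on Kind.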
  /// Contains all scheduling relevant data for an instruction.
  /// A ScheduleData either represents a single instruction or a member of an
  /// instruction bundle (= a group of instructions which is combined into a
  /// vector instruction).
  class ScheduleData final : public ScheduleEntity {
  public:
    // The initial value for the dependency counters. It means that the
    // dependencies are not calculated yet.
    enum { InvalidDeps = -1 };

    ScheduleData() : ScheduleEntity(Kind::ScheduleData) {}
    static bool classof(const ScheduleEntity *Entity) {
      return Entity->getKind() == Kind::ScheduleData;
    }

    void init(int BlockSchedulingRegionID, Instruction *I) {
      NextLoadStore = nullptr;
      IsScheduled = false;
      SchedulingRegionID = BlockSchedulingRegionID;
      clearDependencies();
      Inst = I;
    }

    /// Verify basic self consistency properties.
    void verify() {
      if (hasValidDependencies()) {
        assert(UnscheduledDeps <= Dependencies && "invariant");
      } else {
        assert(UnscheduledDeps == Dependencies && "invariant");
      }
      if (IsScheduled) {
        assert(hasValidDependencies() && UnscheduledDeps == 0 &&
               "unexpected scheduled state");
      }
    }

    /// Returns true if the dependency information has been calculated.
    /// Note that depenendency validity can vary between instructions within
    /// a single bundle.
    bool hasValidDependencies() const { return Dependencies != InvalidDeps; }

    /// Returns true if it is ready for scheduling, i.e. it has no more
    /// unscheduled depending instructions/bundles.
    bool isReady() const { return UnscheduledDeps == 0 && !IsScheduled; }

    /// Modifies the number of unscheduled dependencies for this instruction,
    /// and returns the resulting number of unscheduled dependencies.
    int incrementUnscheduledDeps(int Incr) {
      assert(hasValidDependencies() &&
             "increment of unscheduled deps would be meaningless");
      UnscheduledDeps += Incr;
      assert(UnscheduledDeps >= 0 &&
             "Expected valid number of unscheduled deps");
      return UnscheduledDeps;
    }

    /// Sets the number of unscheduled dependencies to the number of
    /// dependencies.
    void resetUnscheduledDeps() { UnscheduledDeps = Dependencies; }

    /// Clears all dependency information.
    void clearDependencies() {
      clearDirectDependencies();
      MemoryDependencies.clear();
      ControlDependencies.clear();
    }

    /// Clears all direct (non-memory, non-control) dependency information.
    void clearDirectDependencies() {
      Dependencies = InvalidDeps;
      resetUnscheduledDeps();
      IsScheduled = false;
    }

    Instruction *getInst() const { return Inst; }

    int getUnscheduledDeps() const { return UnscheduledDeps; }
    int getDependencies() const { return Dependencies; }
    void initDependencies() { Dependencies = 0; }
    void incDependencies() { Dependencies++; }

    int getSchedulingRegionID() const { return SchedulingRegionID; }

    ArrayRef<ScheduleData *> getMemoryDependencies() const {
      return MemoryDependencies;
    }
    void addMemoryDependency(ScheduleData *Dep) {
      MemoryDependencies.push_back(Dep);
    }
    ArrayRef<ScheduleData *> getControlDependencies() const {
      return ControlDependencies;
    }
    void addControlDependency(ScheduleData *Dep) {
      ControlDependencies.push_back(Dep);
    }
    ScheduleData *getNextLoadStore() const { return NextLoadStore; }
    void setNextLoadStore(ScheduleData *Next) { NextLoadStore = Next; }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    void dump(raw_ostream &OS) const { OS << *Inst; }
    LLVM_DUMP_METHOD void dump() const { dump(dbgs()); }
#endif

  private:
    Instruction *Inst = nullptr;

    /// Points to the next instruction in the list of load/store instructions
    /// in the block (only valid for load/store instructions).
    ScheduleData *NextLoadStore = nullptr;

    /// The dependent memory instructions.
    /// This list is derived on demand in calculateDependencies().
    SmallVector<ScheduleData *> MemoryDependencies;

    /// List of instructions which this instruction could be control
    /// dependent on.
    SmallVector<ScheduleData *> ControlDependencies;

    /// This ScheduleData is in the current scheduling region if this matches
    /// the current SchedulingRegionID of BlockScheduling.
    int SchedulingRegionID = 0;

    /// The number of dependencies. Constitutes of the number of users of the
    /// instruction plus the number of dependent memory instructions (if any).
    /// This value is calculated on demand.
    int Dependencies = InvalidDeps;

    /// The number of dependencies minus the number of dependencies of
    /// scheduled instructions. While scheduling, this value decreases; as
    /// soon as it reaches zero, the instruction can be scheduled.
    int UnscheduledDeps = InvalidDeps;
  };

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  friend inline raw_ostream &operator<<(raw_ostream &OS,
                                        const BoUpSLP::ScheduleData &SD) {
    SD.dump(OS);
    return OS;
  }
#endif
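  // Dependency-counter lifecycle, as implemented above: both counters start
  // at InvalidDeps (-1), meaning "not computed yet". calculateDependencies()
  // calls initDependencies() to zero Dependencies, incDependencies() once
  // per discovered def-use/memory/control dependency, and
  // resetUnscheduledDeps() to copy Dependencies into UnscheduledDeps. As
  // each dependency gets scheduled, incrementUnscheduledDeps(-1) ticks the
  // counter down; when it reaches 0 and the node itself is not yet
  // scheduled, isReady() becomes true and the node may enter the ready list.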
  /// A bundle of instructions that are scheduled together.
  class ScheduleBundle final : public ScheduleEntity {
    /// The schedule data for the instructions in the bundle.
    SmallVector<ScheduleEntity *> Bundle;
    /// True if this bundle is valid.
    bool IsValid = true;
    /// The tree entry this bundle corresponds to.
    TreeEntry *TE = nullptr;
    ScheduleBundle(bool IsValid)
        : ScheduleEntity(Kind::ScheduleBundle), IsValid(IsValid) {}

  public:
    ScheduleBundle() : ScheduleEntity(Kind::ScheduleBundle) {}
    static bool classof(const ScheduleEntity *Entity) {
      return Entity->getKind() == Kind::ScheduleBundle;
    }

    /// Verify basic self consistency properties.
    void verify() const {
      for (const ScheduleEntity *SD : Bundle) {
        if (SD->hasValidDependencies()) {
          assert(SD->getUnscheduledDeps() <= SD->getDependencies() &&
                 "invariant");
        } else {
          assert(SD->getUnscheduledDeps() == SD->getDependencies() &&
                 "invariant");
        }
        if (isScheduled()) {
          assert(SD->hasValidDependencies() && SD->getUnscheduledDeps() == 0 &&
                 "unexpected scheduled state");
        }
      }
    }

    /// Returns the number of unscheduled dependencies in the bundle.
    int unscheduledDepsInBundle() const {
      assert(*this && "bundle must not be empty");
      int Sum = 0;
      for (const ScheduleEntity *BundleMember : Bundle) {
        if (BundleMember->getUnscheduledDeps() == ScheduleData::InvalidDeps)
          return ScheduleData::InvalidDeps;
        Sum += BundleMember->getUnscheduledDeps();
      }
      return Sum;
    }

    /// Returns true if the dependency information has been calculated for all
    /// members of the bundle.
    bool hasValidDependencies() const {
      return all_of(Bundle, [](const ScheduleEntity *SD) {
        return SD->hasValidDependencies();
      });
    }

    /// Returns true if it is ready for scheduling, i.e. it has no more
    /// unscheduled depending instructions/bundles.
    bool isReady() const {
      assert(*this && "bundle must not be empty");
      return unscheduledDepsInBundle() == 0 && !isScheduled();
    }

    /// Returns the list of the schedule data in the bundle.
    ArrayRef<ScheduleEntity *> getBundle() { return Bundle; }
    ArrayRef<const ScheduleEntity *> getBundle() const { return Bundle; }

    /// Adds an instruction to the bundle.
    void add(ScheduleEntity *SD) { Bundle.push_back(SD); }

    /// Sets/gets the associated tree entry.
    void setTreeEntry(TreeEntry *TE) { this->TE = TE; }
    TreeEntry *getTreeEntry() const { return TE; }

    static ScheduleBundle invalid() { return {false}; }

    operator bool() const { return IsValid; }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    void dump(raw_ostream &OS) const {
      if (!*this) {
        OS << "[]";
        return;
      }
      for (const ScheduleEntity *SD : getBundle())
        OS << *SD->getInst() << "; ";
    }
    LLVM_DUMP_METHOD void dump() const { dump(dbgs()); }
#endif
  };

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  friend inline raw_ostream &operator<<(raw_ostream &OS,
                                        const BoUpSLP::ScheduleBundle &Bundle) {
    Bundle.dump(OS);
    return OS;
  }
#endif
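  // For example, a bundle {A, B} where A has UnscheduledDeps == 1 and B has
  // UnscheduledDeps == 2 reports unscheduledDepsInBundle() == 3, so it is not
  // ready. If any member still has InvalidDeps (-1), the whole bundle reports
  // InvalidDeps: a bundle is only considered while every member has its
  // dependencies calculated.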
  /// Contains all scheduling relevant data for a copyable element of a
  /// bundle.
  class ScheduleCopyableData final : public ScheduleEntity {
    /// The copyable instruction itself.
    Instruction *Inst = nullptr;
    /// The edge in the tree, where this data is "located".
    const EdgeInfo EI;
    /// This ScheduleCopyableData is in the current scheduling region if this
    /// matches the current SchedulingRegionID of BlockScheduling.
    int SchedulingRegionID = 0;
    /// The bundle this data is part of.
    ScheduleBundle &Bundle;

  public:
    ScheduleCopyableData(int BlockSchedulingRegionID, Instruction *I,
                         const EdgeInfo &EI, ScheduleBundle &Bundle)
        : ScheduleEntity(Kind::ScheduleCopyableData), Inst(I), EI(EI),
          SchedulingRegionID(BlockSchedulingRegionID), Bundle(Bundle) {}
    static bool classof(const ScheduleEntity *Entity) {
      return Entity->getKind() == Kind::ScheduleCopyableData;
    }

    /// Verify basic self consistency properties.
    void verify() {
      if (hasValidDependencies()) {
        assert(UnscheduledDeps <= Dependencies && "invariant");
      } else {
        assert(UnscheduledDeps == Dependencies && "invariant");
      }
      if (IsScheduled) {
        assert(hasValidDependencies() && UnscheduledDeps == 0 &&
               "unexpected scheduled state");
      }
    }

    /// Returns true if the dependency information has been calculated.
    bool hasValidDependencies() const {
      return Dependencies != ScheduleData::InvalidDeps;
    }

    /// Returns true if it is ready for scheduling, i.e. it has no more
    /// unscheduled depending instructions/bundles.
    bool isReady() const { return UnscheduledDeps == 0 && !IsScheduled; }

    /// Modifies the number of unscheduled dependencies for this instruction,
    /// and returns the resulting number of unscheduled dependencies.
    int incrementUnscheduledDeps(int Incr) {
      assert(hasValidDependencies() &&
             "increment of unscheduled deps would be meaningless");
      UnscheduledDeps += Incr;
      assert(UnscheduledDeps >= 0 && "invariant");
      return UnscheduledDeps;
    }

    /// Sets the number of unscheduled dependencies to the number of
    /// dependencies.
    void resetUnscheduledDeps() { UnscheduledDeps = Dependencies; }

    int getUnscheduledDeps() const { return UnscheduledDeps; }
    int getDependencies() const { return Dependencies; }
    void initDependencies() { Dependencies = 0; }
    void incDependencies() { Dependencies++; }

    int getSchedulingRegionID() const { return SchedulingRegionID; }

    /// Clears all dependency information.
    void clearDependencies() {
      Dependencies = ScheduleData::InvalidDeps;
      UnscheduledDeps = ScheduleData::InvalidDeps;
      IsScheduled = false;
    }

    Instruction *getInst() const { return Inst; }
    const EdgeInfo &getEdgeInfo() const { return EI; }

    /// Returns the bundle this data is part of.
    ScheduleBundle &getBundle() { return Bundle; }
    const ScheduleBundle &getBundle() const { return Bundle; }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    void dump(raw_ostream &OS) const { OS << "[Copyable]" << *getInst(); }
    LLVM_DUMP_METHOD void dump() const { dump(dbgs()); }
#endif

  private:
    /// The number of dependencies.
    int Dependencies = ScheduleData::InvalidDeps;

    /// The number of dependencies minus the number of dependencies of
    /// scheduled instructions.
    int UnscheduledDeps = ScheduleData::InvalidDeps;
  };
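  // A copyable element is a scalar that participates in a bundle without
  // literally matching its opcode, by modeling an idempotent copy of it:
  // e.g., a plain value %x can join an `add` bundle as the equivalent
  // `%x + 0` (cf. the -slp-vectorize-copyable option above). Unlike
  // ScheduleData, which is unique per instruction, copyable data is keyed by
  // the (user tree entry, operand index) edge - see the EdgeInfo component in
  // the ScheduleCopyableDataMap key below - because the same instruction may
  // act as a different virtual copy on each edge where it is used.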
  /// Holds all the scheduling data for a basic block.
  struct BlockScheduling {
    BlockScheduling(BasicBlock *BB)
        : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}

    void clear() {
      ReadyInsts.clear();
      ScheduleDataMap.clear();
      ScheduledBundles.clear();
      ScheduledBundlesList.clear();
      ScheduleCopyableDataMap.clear();
      ScheduleCopyableDataMapByInst.clear();
      ScheduleCopyableDataMapByInstUser.clear();
      ScheduleCopyableDataMapByUsers.clear();
      ScheduleStart = nullptr;
      ScheduleEnd = nullptr;
      FirstLoadStoreInRegion = nullptr;
      LastLoadStoreInRegion = nullptr;
      RegionHasStackSave = false;

      // Reduce the maximum schedule region size by the size of the
      // previous scheduling run.
      ScheduleRegionSizeLimit -= ScheduleRegionSize;
      if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
        ScheduleRegionSizeLimit = MinScheduleRegionSize;
      ScheduleRegionSize = 0;

      // Make a new scheduling region, i.e. all existing ScheduleData is not
      // in the new region yet.
      ++SchedulingRegionID;
    }
    ScheduleData *getScheduleData(Instruction *I) {
      if (BB != I->getParent())
        // Avoid lookup if can't possibly be in map.
        return nullptr;
      ScheduleData *SD = ScheduleDataMap.lookup(I);
      if (SD && isInSchedulingRegion(*SD))
        return SD;
      return nullptr;
    }

    ScheduleData *getScheduleData(Value *V) {
      if (auto *I = dyn_cast<Instruction>(V))
        return getScheduleData(I);
      return nullptr;
    }

    /// Returns the copyable schedule data for the given edge and value.
    ScheduleCopyableData *getScheduleCopyableData(const EdgeInfo &EI,
                                                  const Value *V) const {
      if (ScheduleCopyableDataMap.empty())
        return nullptr;
      auto It = ScheduleCopyableDataMap.find(std::make_pair(EI, V));
      if (It == ScheduleCopyableDataMap.end())
        return nullptr;
      ScheduleCopyableData *SD = It->getSecond().get();
      if (!isInSchedulingRegion(*SD))
        return nullptr;
      return SD;
    }

    /// Returns all copyable schedule data for the given user/operand
    /// index/operand value.
    SmallVector<ScheduleCopyableData *>
    getScheduleCopyableData(const Value *User, unsigned OperandIdx,
                            const Value *V) const {
      if (ScheduleCopyableDataMapByInstUser.empty())
        return {};
      const auto It = ScheduleCopyableDataMapByInstUser.find(
          std::make_pair(std::make_pair(User, OperandIdx), V));
      if (It == ScheduleCopyableDataMapByInstUser.end())
        return {};
      SmallVector<ScheduleCopyableData *> Res;
      for (ScheduleCopyableData *SD : It->getSecond()) {
        if (isInSchedulingRegion(*SD))
          Res.push_back(SD);
      }
      return Res;
    }
    /// Checks if all uses of the given operand \p Op in the instruction
    /// \p User are replaced by copyable data in the related tree entries.
    bool areAllOperandsReplacedByCopyableData(Instruction *User,
                                              ArrayRef<TreeEntry *> Entries,
                                              Instruction *Op,
                                              unsigned NumOps) const {
      if (ScheduleCopyableDataMap.empty())
        return false;
      SmallDenseMap<TreeEntry *, unsigned> PotentiallyReorderedEntriesCount;
      SmallDenseMap<const TreeEntry *, unsigned> OrderedEntriesCount;
      if (Entries.empty())
        return false;
      for (const Use &U : User->operands()) {
        if (U.get() != Op)
          continue;
        for (TreeEntry *TE : Entries) {
          // Non-schedulable entries with a parent phi node may require
          // counting each unique phi user only once.
          bool IsNonSchedulableWithParentPhiNode =
              TE->doesNotNeedToSchedule() && TE->UserTreeIndex &&
              TE->UserTreeIndex.UserTE->hasState() &&
              TE->UserTreeIndex.UserTE->State != TreeEntry::SplitVectorize &&
              TE->UserTreeIndex.UserTE->getOpcode() == Instruction::PHI;
          unsigned Inc = 1;
          if (IsNonSchedulableWithParentPhiNode) {
            SmallPtrSet<Value *, 4> ParentsUniqueUsers;
            const TreeEntry *ParentTE = TE->UserTreeIndex.UserTE;
            Inc = 0;
            for (Value *V : ParentTE->Scalars) {
              auto *PHI = dyn_cast<PHINode>(V);
              if (!PHI)
                continue;
              if (ParentsUniqueUsers.insert(PHI).second &&
                  is_contained(PHI->incoming_values(), User))
                ++Inc;
            }
          }
          // For commutative users the copyable data may be registered on any
          // of the commutative operand positions - count such entries
          // separately and resolve the actual position below.
          bool IsCommutativeUser = isCommutative(User);
          if (!IsCommutativeUser) {
            unsigned &Count =
                OrderedEntriesCount.try_emplace(TE, 0).first->getSecond();
            EdgeInfo EI(TE, U.getOperandNo());
            if (!getScheduleCopyableData(EI, Op))
              continue;
            Count += Inc;
            continue;
          }
          PotentiallyReorderedEntriesCount.try_emplace(TE, 0)
              .first->getSecond() += Inc;
        }
      }
      if (PotentiallyReorderedEntriesCount.empty())
        return all_of(OrderedEntriesCount,
                      [&](const std::pair<const TreeEntry *, unsigned> &P) {
                        return P.second == NumOps;
                      });
      for (auto &P : PotentiallyReorderedEntriesCount) {
        SmallPtrSet<Value *, 4> ParentsUniqueUsers;
        bool IsNonSchedulableWithParentPhiNode =
            P.first->doesNotNeedToSchedule() && P.first->UserTreeIndex &&
            P.first->UserTreeIndex.UserTE->hasState() &&
            P.first->UserTreeIndex.UserTE->State != TreeEntry::SplitVectorize &&
            P.first->UserTreeIndex.UserTE->getOpcode() == Instruction::PHI;
        auto *It = find(P.first->Scalars, User);
        do {
          assert(It != P.first->Scalars.end() &&
                 "User is not in the tree entry");
          int Lane = std::distance(P.first->Scalars.begin(), It);
          assert(Lane >= 0 && "Lane is not found");
          if (!P.first->ReorderIndices.empty())
            Lane = P.first->ReorderIndices[Lane];
          assert(Lane < static_cast<int>(P.first->Scalars.size()) &&
                 "Couldn't find extract lane");
          // Process each unique parent phi user only once.
          if (IsNonSchedulableWithParentPhiNode &&
              !ParentsUniqueUsers.insert(User).second) {
            It = std::find(std::next(It), P.first->Scalars.end(), User);
            continue;
          }
          for (unsigned OpIdx :
               seq<unsigned>(::getNumberOfPotentiallyCommutativeOps(
                   P.first->getMainOp()))) {
            if (P.first->getOperand(OpIdx)[Lane] == Op &&
                getScheduleCopyableData(EdgeInfo(P.first, OpIdx), Op))
              --P.getSecond();
          }
          if (!IsNonSchedulableWithParentPhiNode)
            break;
          It = std::find(std::next(It), P.first->Scalars.end(), User);
        } while (It != P.first->Scalars.end());
      }
      return all_of(PotentiallyReorderedEntriesCount,
                    [&](const std::pair<const TreeEntry *, unsigned> &P) {
                      return P.second == NumOps - 1;
                    }) &&
             all_of(OrderedEntriesCount,
                    [&](const std::pair<const TreeEntry *, unsigned> &P) {
                      return P.second == NumOps;
                    });
    }
    /// Returns all copyable schedule data for the given instruction.
    SmallVector<ScheduleCopyableData *>
    getScheduleCopyableData(const Instruction *I) const {
      if (ScheduleCopyableDataMapByInst.empty())
        return {};
      const auto It = ScheduleCopyableDataMapByInst.find(I);
      if (It == ScheduleCopyableDataMapByInst.end())
        return {};
      SmallVector<ScheduleCopyableData *> Res;
      for (ScheduleCopyableData *SD : It->getSecond()) {
        if (isInSchedulingRegion(*SD))
          Res.push_back(SD);
      }
      return Res;
    }

    /// Returns all copyable schedule data, where the given instruction is a
    /// user.
    SmallVector<ScheduleCopyableData *>
    getScheduleCopyableDataUsers(const Instruction *User) const {
      if (ScheduleCopyableDataMapByUsers.empty())
        return {};
      const auto It = ScheduleCopyableDataMapByUsers.find(User);
      if (It == ScheduleCopyableDataMapByUsers.end())
        return {};
      SmallVector<ScheduleCopyableData *> Res;
      for (ScheduleCopyableData *SD : It->getSecond()) {
        if (isInSchedulingRegion(*SD))
          Res.push_back(SD);
      }
      return Res;
    }
    /// Allocates and registers the copyable data for the given edge and
    /// instruction.
    ScheduleCopyableData &addScheduleCopyableData(const EdgeInfo &EI,
                                                  Instruction *I,
                                                  int SchedulingRegionID,
                                                  ScheduleBundle &Bundle) {
      assert(!getScheduleCopyableData(EI, I) && "already in the map");
      ScheduleCopyableData *CD =
          ScheduleCopyableDataMap
              .try_emplace(std::make_pair(EI, I),
                           std::make_unique<ScheduleCopyableData>(
                               SchedulingRegionID, I, EI, Bundle))
              .first->getSecond()
              .get();
      ScheduleCopyableDataMapByInst[I].push_back(CD);
      if (EI.UserTE) {
        ArrayRef<Value *> Op = EI.UserTE->getOperand(EI.EdgeIdx);
        auto *It = find(Op, I);
        assert(It != Op.end() && "Lane not set");
        SmallPtrSet<Instruction *, 4> Visited;
        do {
          int Lane = std::distance(Op.begin(), It);
          assert(Lane >= 0 && "Lane not set");
          if (EI.UserTE->State != TreeEntry::SplitVectorize &&
              !EI.UserTE->ReorderIndices.empty())
            Lane = EI.UserTE->ReorderIndices[Lane];
          assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
                 "Couldn't find extract lane");
          auto *In = cast<Instruction>(EI.UserTE->Scalars[Lane]);
          if (!Visited.insert(In).second) {
            It = std::find(std::next(It), Op.end(), I);
            continue;
          }
          ScheduleCopyableDataMapByInstUser
              .try_emplace(std::make_pair(std::make_pair(In, EI.EdgeIdx), I))
              .first->getSecond()
              .push_back(CD);
          ScheduleCopyableDataMapByUsers.try_emplace(I)
              .first->getSecond()
              .insert(CD);
          // If the user itself is modeled as a copyable element, it is not a
          // real user of the original instruction - remove it from the users
          // list.
          if (EI.UserTE->UserTreeIndex) {
            EdgeInfo UserEI = EI.UserTE->UserTreeIndex;
            if (ScheduleCopyableData *UserCD =
                    getScheduleCopyableData(UserEI, In))
              ScheduleCopyableDataMapByUsers[I].remove(UserCD);
          }
          It = std::find(std::next(It), Op.end(), I);
        } while (It != Op.end());
      } else {
        ScheduleCopyableDataMapByUsers.try_emplace(I)
            .first->getSecond()
            .insert(CD);
      }
      return *CD;
    }
    ArrayRef<ScheduleBundle *> getScheduleBundles(Instruction *I) const {
      auto It = ScheduledBundles.find(I);
      if (It == ScheduledBundles.end())
        return {};
      return It->getSecond();
    }

    bool isInSchedulingRegion(const ScheduleEntity &SD) const {
      if (const auto *Data = dyn_cast<ScheduleData>(&SD))
        return Data->getSchedulingRegionID() == SchedulingRegionID;
      if (const auto *CD = dyn_cast<ScheduleCopyableData>(&SD))
        return CD->getSchedulingRegionID() == SchedulingRegionID;
      return all_of(cast<ScheduleBundle>(SD).getBundle(),
                    [&](const ScheduleEntity *BundleMember) {
                      return isInSchedulingRegion(*BundleMember);
                    });
    }
    /// Marks an instruction as scheduled and puts all dependent ready
    /// instructions into the ready-list.
    template <typename ReadyListType>
    void schedule(const BoUpSLP &R, const InstructionsState &S,
                  const EdgeInfo &EI, ScheduleEntity *Data,
                  ReadyListType &ReadyList) {
      auto ProcessBundleMember = [&](ScheduleEntity *BundleMember,
                                     ArrayRef<ScheduleBundle *> Bundles) {
        // Handle the def-use chain dependencies.

        // Decrement the unscheduled counter and insert to ready list if
        // ready.
        auto DecrUnsched = [&](auto *Data, bool IsControl = false) {
          if ((IsControl || Data->hasValidDependencies()) &&
              Data->incrementUnscheduledDeps(-1) == 0) {
            // There are no more unscheduled dependencies after decrementing,
            // so we can put the dependent instruction into the ready list.
            SmallVector<ScheduleBundle *, 1> CopyableBundle;
            ArrayRef<ScheduleBundle *> Bundles;
            if (auto *CD = dyn_cast<ScheduleCopyableData>(Data)) {
              CopyableBundle.push_back(&CD->getBundle());
              Bundles = CopyableBundle;
            } else {
              Bundles = getScheduleBundles(Data->getInst());
            }
            if (!Bundles.empty()) {
              for (ScheduleBundle *Bundle : Bundles) {
                if (Bundle->unscheduledDepsInBundle() == 0) {
                  assert(!Bundle->isScheduled() &&
                         "already scheduled bundle gets ready");
                  ReadyList.insert(Bundle);
                  LLVM_DEBUG(dbgs()
                             << "SLP: gets ready: " << *Bundle << "\n");
                }
              }
              return;
            }
            assert(!Data->isScheduled() &&
                   "already scheduled bundle gets ready");
            assert(isa<ScheduleData>(Data) && "Expected non-copyable data");
            ReadyList.insert(Data);
            LLVM_DEBUG(dbgs() << "SLP: gets ready: " << *Data << "\n");
          }
        };

        // Decrement the unscheduled counter for the copyable data, related
        // to the given user and operand index, or for the plain schedule
        // data otherwise.
        auto DecrUnschedForOperand = [&](const Instruction *User,
                                         unsigned OpIdx, Instruction *I) {
          if (!ScheduleCopyableDataMap.empty()) {
            SmallVector<ScheduleCopyableData *> CopyableData =
                getScheduleCopyableData(User, OpIdx, I);
            for (ScheduleCopyableData *CD : CopyableData)
              DecrUnsched(CD, /*IsControl=*/false);
            if (!CopyableData.empty())
              return;
          }
          if (ScheduleData *OpSD = getScheduleData(I))
            DecrUnsched(OpSD, /*IsControl=*/false);
        };

        if (!Bundles.empty()) {
          auto *In = BundleMember->getInst();
          // Count uses of each operand instruction to correctly handle
          // instructions, using the same operand several times.
          SmallDenseMap<const Instruction *, unsigned> OperandsUses;
          unsigned TotalOpCount = 0;
          if (isa<ScheduleCopyableData>(BundleMember)) {
            // Copyable data "uses" only the instruction itself.
            TotalOpCount = OperandsUses[In] = 1;
          } else {
            for (const Use &U : In->operands()) {
              if (auto *I = dyn_cast<Instruction>(U.get())) {
                auto Res = OperandsUses.try_emplace(I, 0);
                ++Res.first->getSecond();
                ++TotalOpCount;
              }
            }
          }
          // Decrement the unscheduled counter for the operand, unless it is
          // replaced by the copyable data.
          auto DecrUnschedForInst =
              [&](Instruction *I, TreeEntry *UserTE, unsigned OpIdx,
                  SmallDenseSet<std::pair<const ScheduleEntity *, unsigned>>
                      &Checked) {
                if (!ScheduleCopyableDataMap.empty()) {
                  const EdgeInfo EI = {UserTE, OpIdx};
                  if (ScheduleCopyableData *CD =
                          getScheduleCopyableData(EI, I)) {
                    if (!Checked.insert(std::make_pair(CD, OpIdx)).second)
                      return;
                    DecrUnsched(CD, /*IsControl=*/false);
                    return;
                  }
                }
                auto It = OperandsUses.find(I);
                assert(It != OperandsUses.end() && "Operand not found");
                if (It->second > 0) {
                  assert(TotalOpCount > 0 && "No more operands to decrement");
                  --It->getSecond();
                  --TotalOpCount;
                  if (ScheduleData *OpSD = getScheduleData(I)) {
                    if (!Checked.insert(std::make_pair(OpSD, OpIdx)).second)
                      return;
                    DecrUnsched(OpSD, /*IsControl=*/false);
                  }
                }
              };
          for (ScheduleBundle *Bundle : Bundles) {
            if (ScheduleCopyableDataMap.empty() && TotalOpCount == 0)
              break;
            // Need to search for the lane since the tree entry can be
            // reordered.
            SmallPtrSet<Value *, 4> ParentsUniqueUsers;
            auto *It = find(Bundle->getTreeEntry()->Scalars, In);
            SmallDenseSet<std::pair<const ScheduleEntity *, unsigned>> Checked;
            bool IsNonSchedulableWithParentPhiNode =
                Bundle->getTreeEntry()->doesNotNeedToSchedule() &&
                Bundle->getTreeEntry()->UserTreeIndex &&
                Bundle->getTreeEntry()->UserTreeIndex.UserTE->hasState() &&
                Bundle->getTreeEntry()->UserTreeIndex.UserTE->State !=
                    TreeEntry::SplitVectorize &&
                Bundle->getTreeEntry()->UserTreeIndex.UserTE->getOpcode() ==
                    Instruction::PHI;
            do {
              int Lane =
                  std::distance(Bundle->getTreeEntry()->Scalars.begin(), It);
              assert(Lane >= 0 && "Lane not set");
              if (Bundle->getTreeEntry()->State !=
                      TreeEntry::SplitVectorize &&
                  !Bundle->getTreeEntry()->ReorderIndices.empty())
                Lane = Bundle->getTreeEntry()->ReorderIndices[Lane];
              assert(Lane < static_cast<int>(
                                Bundle->getTreeEntry()->Scalars.size()) &&
                     "Couldn't find extract lane");
              assert((isa<ScheduleCopyableData>(BundleMember) ||
                      In->getNumOperands() ==
                          Bundle->getTreeEntry()->getNumOperands() ||
                      Bundle->getTreeEntry()->isCopyableElement(In)) &&
                     "Missed TreeEntry operands?");
              // For entries with parent phi nodes process each unique parent
              // user only once.
              if (IsNonSchedulableWithParentPhiNode) {
                const TreeEntry *ParentTE =
                    Bundle->getTreeEntry()->UserTreeIndex.UserTE;
                Value *User = ParentTE->Scalars[Lane];
                if (!ParentsUniqueUsers.insert(User).second) {
                  It = std::find(std::next(It),
                                 Bundle->getTreeEntry()->Scalars.end(), In);
                  continue;
                }
              }
              for (unsigned OpIdx :
                   seq<unsigned>(Bundle->getTreeEntry()->getNumOperands()))
                if (auto *I = dyn_cast<Instruction>(
                        Bundle->getTreeEntry()->getOperand(OpIdx)[Lane])) {
                  LLVM_DEBUG(dbgs() << "SLP:   check for readiness (def): "
                                    << *I << "\n");
                  DecrUnschedForInst(I, Bundle->getTreeEntry(), OpIdx,
                                     Checked);
                }
              if (!IsNonSchedulableWithParentPhiNode)
                break;
              It = std::find(std::next(It),
                             Bundle->getTreeEntry()->Scalars.end(), In);
            } while (It != Bundle->getTreeEntry()->Scalars.end());
          }
        } else {
          // If BundleMember is a stand-alone instruction, no operand
          // reordering has taken place, so we directly access its operands.
          for (Use &U : BundleMember->getInst()->operands())
            if (auto *I = dyn_cast<Instruction>(U.get())) {
              LLVM_DEBUG(dbgs()
                         << "SLP:   check for readiness (def): " << *I
                         << "\n");
              DecrUnschedForOperand(BundleMember->getInst(), U.getOperandNo(),
                                    I);
            }
        }
        // Handle the memory dependencies.
        auto *SD = dyn_cast<ScheduleData>(BundleMember);
        if (!SD)
          return;
        SmallPtrSet<const ScheduleData *, 4> VisitedMemory;
        for (ScheduleData *MemoryDep : SD->getMemoryDependencies()) {
          if (!VisitedMemory.insert(MemoryDep).second)
            continue;
          // There are no more unscheduled dependencies after decrementing,
          // so we can put the dependent instruction into the ready list.
          LLVM_DEBUG(dbgs() << "SLP:   check for readiness (mem): "
                            << *MemoryDep << "\n");
          DecrUnsched(MemoryDep);
        }
        // Handle the control dependencies.
        SmallPtrSet<const ScheduleData *, 4> VisitedControl;
        for (ScheduleData *Dep : SD->getControlDependencies()) {
          if (!VisitedControl.insert(Dep).second)
            continue;
          LLVM_DEBUG(dbgs()
                     << "SLP:   check for readiness (ctrl): " << *Dep << "\n");
          DecrUnsched(Dep, /*IsControl=*/true);
        }
      };
      if (auto *SD = dyn_cast<ScheduleData>(Data)) {
        SD->setScheduled(/*Scheduled=*/true);
        LLVM_DEBUG(dbgs() << "SLP:   schedule " << *SD << "\n");
        // If the instruction is also a member of vectorized tree entries,
        // build pseudo bundles for them to correctly process the operands.
        SmallVector<ScheduleBundle *> Bundles;
        SmallVector<std::unique_ptr<ScheduleBundle>> PseudoBundles;
        auto *In = SD->getInst();
        if (R.isVectorized(In)) {
          for (TreeEntry *TE : R.getTreeEntries(In)) {
            if (TE->isCopyableElement(In) ||
                In->getNumOperands() != TE->getNumOperands())
              continue;
            ScheduleBundle &BundlePtr = *PseudoBundles.emplace_back(
                std::make_unique<ScheduleBundle>());
            BundlePtr.setTreeEntry(TE);
            Bundles.push_back(&BundlePtr);
          }
        }
        ProcessBundleMember(SD, Bundles);
      } else {
        auto &Bundle = *cast<ScheduleBundle>(Data);
        Bundle.setScheduled(/*Scheduled=*/true);
        LLVM_DEBUG(dbgs() << "SLP:   schedule " << Bundle << "\n");
        auto AreAllBundlesScheduled =
            [&](const ScheduleEntity *SD,
                ArrayRef<ScheduleBundle *> SDBundles) {
              return !SDBundles.empty() &&
                     all_of(SDBundles, [&](const ScheduleBundle *SDBundle) {
                       return SDBundle->isScheduled();
                     });
            };
        for (ScheduleEntity *SD : Bundle.getBundle()) {
          SmallVector<ScheduleBundle *, 1> CopyableBundle;
          ArrayRef<ScheduleBundle *> SDBundles;
          if (auto *CD = dyn_cast<ScheduleCopyableData>(SD)) {
            CopyableBundle.push_back(&CD->getBundle());
            SDBundles = CopyableBundle;
          } else {
            SDBundles = getScheduleBundles(SD->getInst());
          }
          if (AreAllBundlesScheduled(SD, SDBundles)) {
            SD->setScheduled(/*Scheduled=*/true);
            ProcessBundleMember(SD, SDBundles);
          }
        }
      }
    }
    /// Verify basic self consistency properties of the data structure.
    void verify() {
      if (!ScheduleStart)
        return;

      assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
             ScheduleStart->comesBefore(ScheduleEnd) &&
             "Not a valid scheduling region?");

      for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
        ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(I);
        if (!Bundles.empty()) {
          for (ScheduleBundle *Bundle : Bundles) {
            assert(isInSchedulingRegion(*Bundle) &&
                   "primary schedule data not in window?");
            Bundle->verify();
          }
          continue;
        }
        auto *SD = getScheduleData(I);
        if (!SD)
          continue;
        assert(isInSchedulingRegion(*SD) &&
               "primary schedule data not in window?");
        SD->verify();
      }

      assert(all_of(ReadyInsts,
                    [](const ScheduleEntity *Bundle) {
                      return Bundle->isReady();
                    }) &&
             "item in ready list not ready?");
    }
    /// Put all instructions into the ReadyList which are ready for
    /// scheduling.
    template <typename ReadyListType>
    void initialFillReadyList(ReadyListType &ReadyList) {
      SmallPtrSet<ScheduleBundle *, 16> Visited;
      for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
        ScheduleData *SD = getScheduleData(I);
        if (SD && SD->hasValidDependencies() && SD->isReady()) {
          ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(I);
          if (!Bundles.empty()) {
            for (ScheduleBundle *Bundle : Bundles) {
              if (!Visited.insert(Bundle).second)
                continue;
              if (Bundle->hasValidDependencies() && Bundle->isReady()) {
                ReadyList.insert(Bundle);
                LLVM_DEBUG(dbgs() << "SLP:    initially in ready list: "
                                  << *Bundle << "\n");
              }
            }
            continue;
          }
          ReadyList.insert(SD);
          LLVM_DEBUG(dbgs()
                     << "SLP:    initially in ready list: " << *SD << "\n");
        }
      }
    }
    /// Build a bundle from the ScheduleData nodes corresponding to the
    /// scalar instruction for each lane.
    ScheduleBundle &buildBundle(ArrayRef<Value *> VL,
                                const InstructionsState &S,
                                const EdgeInfo &EI);

    /// Checks if a bundle of instructions can be scheduled, i.e. has no
    /// cyclic dependencies. This is only a dry-run, no instructions are
    /// actually moved at this stage.
    /// \returns the scheduling bundle. The returned Optional value is not
    /// std::nullopt if \p VL is allowed to be scheduled.
    std::optional<ScheduleBundle *>
    tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
                      const InstructionsState &S, const EdgeInfo &EI);

    /// Allocates schedule data chunk.
    ScheduleData *allocateScheduleDataChunks();

    /// Extends the scheduling region so that V is inside the region.
    /// \returns true if the region size is within the limit.
    bool extendSchedulingRegion(Value *V, const InstructionsState &S);

    /// Initialize the ScheduleData structures for new instructions in the
    /// scheduling region.
    void initScheduleData(Instruction *FromI, Instruction *ToI,
                          ScheduleData *PrevLoadStore,
                          ScheduleData *NextLoadStore);

    /// Updates the dependency information of a bundle and of all
    /// instructions/bundles which depend on the original bundle.
    void calculateDependencies(ScheduleBundle &Bundle, bool InsertInReadyList,
                               BoUpSLP *SLP);

    /// Sets all instructions in the scheduling region to un-scheduled.
    void resetSchedule();

    BasicBlock *BB;

    /// Simple memory allocation for ScheduleData.
    SmallVector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;

    /// The size of a ScheduleData array in ScheduleDataChunks.
    int ChunkSize;

    /// The allocator position in the current chunk, which is the last entry
    /// of ScheduleDataChunks.
    int ChunkPos;

    /// Attaches ScheduleData to Instruction.
    /// Note that the mapping survives during all vectorization iterations,
    /// i.e. ScheduleData structures are recycled.
    SmallDenseMap<Instruction *, ScheduleData *> ScheduleDataMap;

    /// Attaches ScheduleCopyableData to (EdgeInfo, Value) pairs.
    SmallDenseMap<std::pair<EdgeInfo, const Value *>,
                  std::unique_ptr<ScheduleCopyableData>>
        ScheduleCopyableDataMap;

    /// Represents the mapping between an instruction and all related
    /// ScheduleCopyableData.
    SmallDenseMap<const Instruction *, SmallVector<ScheduleCopyableData *>>
        ScheduleCopyableDataMapByInst;

    /// Represents the mapping between a (user, operand index) pair, an
    /// operand value and the related ScheduleCopyableData.
    SmallDenseMap<std::pair<std::pair<const Value *, unsigned>, const Value *>,
                  SmallVector<ScheduleCopyableData *>>
        ScheduleCopyableDataMapByInstUser;

    /// Represents the mapping between an instruction and all
    /// ScheduleCopyableData where it is a user.
    SmallDenseMap<const Value *,
                  SmallSetVector<ScheduleCopyableData *, 4>>
        ScheduleCopyableDataMapByUsers;

    /// Attaches ScheduleBundles to Instructions.
    SmallDenseMap<Instruction *, SmallVector<ScheduleBundle *>>
        ScheduledBundles;

    /// The list of the allocated ScheduleBundles.
    SmallVector<std::unique_ptr<ScheduleBundle>> ScheduledBundlesList;

    /// The ready-list for scheduling (only used for the dry-run).
    SetVector<ScheduleEntity *> ReadyInsts;

    /// The first instruction of the scheduling region.
    Instruction *ScheduleStart = nullptr;

    /// The first instruction _after_ the scheduling region.
    Instruction *ScheduleEnd = nullptr;

    /// The first memory accessing instruction in the scheduling region
    /// (can be null).
    ScheduleData *FirstLoadStoreInRegion = nullptr;

    /// The last memory accessing instruction in the scheduling region
    /// (can be null).
    ScheduleData *LastLoadStoreInRegion = nullptr;

    /// Is there an llvm.stacksave or llvm.stackrestore in the scheduling
    /// region? Used to optimize the dependence calculation for the common
    /// case where there isn't.
    bool RegionHasStackSave = false;

    /// The current size of the scheduling region.
    int ScheduleRegionSize = 0;

    /// The maximum size allowed for the scheduling region.
    int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget;

    /// The ID of the scheduling region. For a new vectorization iteration
    /// this is incremented, which "removes" all ScheduleData from the
    /// region.
    int SchedulingRegionID = 1;
  };
  /// Attaches the BlockScheduling structures to basic blocks.
  MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;

  /// Performs the "real" scheduling. Done before vectorization is actually
  /// performed in a basic block.
  void scheduleBlock(const BoUpSLP &R, BlockScheduling *BS);

  /// List of users to ignore during scheduling and that don't need
  /// extracting.
  const SmallDenseSet<Value *> *UserIgnoreList = nullptr;

  /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of
  /// sorted SmallVectors of unsigned.
  struct OrdersTypeDenseMapInfo {
    static OrdersType getEmptyKey() {
      OrdersType V;
      V.push_back(~1U);
      return V;
    }

    static OrdersType getTombstoneKey() {
      OrdersType V;
      V.push_back(~2U);
      return V;
    }

    static unsigned getHashValue(const OrdersType &V) {
      return static_cast<unsigned>(hash_combine_range(V));
    }

    static bool isEqual(const OrdersType &LHS, const OrdersType &RHS) {
      return LHS == RHS;
    }
  };

  // Analysis and block reference.
  Function *F;
  ScalarEvolution *SE;
  TargetTransformInfo *TTI;
  TargetLibraryInfo *TLI;
  LoopInfo *LI;
  DominatorTree *DT;
  AssumptionCache *AC;
  DemandedBits *DB;
  const DataLayout *DL;
  OptimizationRemarkEmitter *ORE;

  unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
  unsigned MinVecRegSize; // Set by cl::opt (default: 128).

  /// Instruction builder to construct the vectorized tree.
  IRBuilder<TargetFolder> Builder;

  /// A map of scalar integer values to the smallest bit width with which
  /// they can legally be represented. The values map to (width, signed)
  /// pairs, where "width" indicates the minimum bit width and "signed" is
  /// True if the value must be signed-extended, rather than zero-extended,
  /// back to its original width.
  DenseMap<const TreeEntry *, std::pair<uint64_t, bool>> MinBWs;

  /// Final size of the reduced vector, if the current graph represents the
  /// input for the reduction and it was possible to narrow the size of the
  /// reduction.
  unsigned ReductionBitWidth = 0;

  /// Canonical graph size before the transformations.
  unsigned BaseGraphSize = 1;

  /// If the tree contains any zext/sext/trunc nodes, contains max-min pair
  /// of the type sizes used in the tree.
  std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;

  /// Indices of the vectorized nodes, which are supposed to be the roots of
  /// the new bitwidth analysis attempt, like trunc, IToFP or ICmp.
  DenseSet<unsigned> ExtraBitWidthNodes;
};
template <> struct DenseMapInfo<BoUpSLP::EdgeInfo> {
  using FirstInfo = DenseMapInfo<BoUpSLP::TreeEntry *>;
  using SecondInfo = DenseMapInfo<unsigned>;

  static BoUpSLP::EdgeInfo getEmptyKey() {
    return BoUpSLP::EdgeInfo(FirstInfo::getEmptyKey(),
                             SecondInfo::getEmptyKey());
  }

  static BoUpSLP::EdgeInfo getTombstoneKey() {
    return BoUpSLP::EdgeInfo(FirstInfo::getTombstoneKey(),
                             SecondInfo::getTombstoneKey());
  }

  static unsigned getHashValue(const BoUpSLP::EdgeInfo &Val) {
    return detail::combineHashValue(FirstInfo::getHashValue(Val.UserTE),
                                    SecondInfo::getHashValue(Val.EdgeIdx));
  }

  static bool isEqual(const BoUpSLP::EdgeInfo &LHS,
                      const BoUpSLP::EdgeInfo &RHS) {
    return LHS.UserTE == RHS.UserTE && LHS.EdgeIdx == RHS.EdgeIdx;
  }
};

template <> struct GraphTraits<BoUpSLP *> {
  using TreeEntry = BoUpSLP::TreeEntry;

  /// NodeRef has to be a pointer per the GraphWriter API.
  using NodeRef = TreeEntry *;

  using ContainerTy = BoUpSLP::TreeEntry::VecTreeTy;

  /// Add the VectorizableTree to the iterator to be able to return
  /// TreeEntry pointers.
  struct ChildIteratorType
      : public iterator_adaptor_base<
            ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
    ContainerTy &VectorizableTree;

    ChildIteratorType(SmallVector<BoUpSLP::EdgeInfo, 1>::iterator W,
                      ContainerTy &VT)
        : ChildIteratorType::iterator_adaptor_base(W), VectorizableTree(VT) {}

    NodeRef operator*() { return I->UserTE; }
  };

  static NodeRef getEntryNode(BoUpSLP &R) {
    return R.VectorizableTree[0].get();
  }

  static ChildIteratorType child_begin(NodeRef N) {
    return {&N->UserTreeIndex, N->Container};
  }

  static ChildIteratorType child_end(NodeRef N) {
    return {&N->UserTreeIndex + 1, N->Container};
  }

  /// For the node iterator we just need to turn the TreeEntry iterator into
  /// a TreeEntry* iterator so that it dereferences to NodeRef.
  using nodes_iterator = pointer_iterator<ContainerTy::iterator>;

  static nodes_iterator nodes_begin(BoUpSLP *R) {
    return nodes_iterator(R->VectorizableTree.begin());
  }

  static nodes_iterator nodes_end(BoUpSLP *R) {
    return nodes_iterator(R->VectorizableTree.end());
  }

  /// Number of tree entries in the graph.
  static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
};
template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
  using TreeEntry = BoUpSLP::TreeEntry;

  DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}

  std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
    std::string Str;
    raw_string_ostream OS(Str);
    OS << Entry->Idx << ".\n";
    if (isSplat(Entry->Scalars))
      OS << "<splat> ";
    for (auto *V : Entry->Scalars) {
      OS << *V;
      if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) {
            return EU.Scalar == V;
          }))
        OS << " <extract>";
      OS << "\n";
    }
    return Str;
  }

  static std::string getNodeAttributes(const TreeEntry *Entry,
                                       const BoUpSLP *) {
    if (Entry->isGather())
      return "color=red";
    if (Entry->State == TreeEntry::ScatterVectorize ||
        Entry->State == TreeEntry::StridedVectorize ||
        Entry->State == TreeEntry::CompressVectorize)
      return "color=blue";
    return "";
  }
};
BoUpSLP::~BoUpSLP() {
  SmallVector<WeakTrackingVH> DeadInsts;
  for (auto *I : DeletedInstructions) {
    if (!I->getParent()) {
      // Temporarily insert instruction back to erase them from parent and
      // memory later.
      if (isa<PHINode>(I))
        // Phi nodes must be the very first instructions in the block.
        I->insertBefore(F->getEntryBlock(),
                        F->getEntryBlock().getFirstNonPHIIt());
      else
        I->insertBefore(F->getEntryBlock().getTerminator()->getIterator());
      continue;
    }
    for (Use &U : I->operands()) {
      auto *Op = dyn_cast<Instruction>(U.get());
      if (Op && !DeletedInstructions.count(Op) && Op->hasOneUser() &&
          wouldInstructionBeTriviallyDead(Op, TLI))
        DeadInsts.emplace_back(Op);
    }
    I->dropAllReferences();
  }
  for (auto *I : DeletedInstructions) {
    assert(I->use_empty() &&
           "trying to erase instruction with users.");
    I->eraseFromParent();
  }

  // Cleanup any dead scalar code feeding the vectorized instructions.
  RecursivelyDeleteTriviallyDeadInstructions(DeadInsts, TLI);
#ifdef EXPENSIVE_CHECKS
  // If we could guarantee that this call is not extremely slow, we could
  // remove the ifdef limitation (see PR47712).
  assert(!verifyFunction(*F, &dbgs()));
#endif
}

/// Reorders the given \p Reuses mask according to the given \p Mask. \p Reuses
/// contains the original mask for the scalars reused in the node.
static void reorderReuses(SmallVectorImpl<int> &Reuses, ArrayRef<int> Mask) {
  assert(!Mask.empty() && Reuses.size() == Mask.size() &&
         "Expected non-empty mask.");
  SmallVector<int> Prev(Reuses.begin(), Reuses.end());
  Prev.swap(Reuses);
  for (unsigned I = 0, E = Prev.size(); I < E; ++I)
    if (Mask[I] != PoisonMaskElem)
      Reuses[Mask[I]] = Prev[I];
}
/// Reorders the given \p Order according to the given \p Mask. \p Order is
/// the original order of the scalars. The procedure transforms the provided
/// order in accordance with the given \p Mask. If the resulting \p Order is
/// just an identity order, \p Order is cleared.
static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int> Mask,
                         bool BottomOrder = false) {
  assert(!Mask.empty() && "Expected non-empty mask.");
  unsigned Sz = Mask.size();
  if (BottomOrder) {
    SmallVector<unsigned> PrevOrder;
    if (Order.empty()) {
      PrevOrder.resize(Sz);
      std::iota(PrevOrder.begin(), PrevOrder.end(), 0);
    } else {
      PrevOrder.swap(Order);
    }
    Order.assign(Sz, Sz);
    for (unsigned I = 0; I < Sz; ++I)
      if (Mask[I] != PoisonMaskElem)
        Order[I] = PrevOrder[Mask[I]];
    if (all_of(enumerate(Order), [&](const auto &Data) {
          return Data.value() == Sz || Data.index() == Data.value();
        })) {
      Order.clear();
      return;
    }
    fixupOrderingIndices(Order);
    return;
  }
  SmallVector<int> MaskOrder;
  if (Order.empty()) {
    MaskOrder.resize(Sz);
    std::iota(MaskOrder.begin(), MaskOrder.end(), 0);
  } else {
    inversePermutation(Order, MaskOrder);
  }
  reorderReuses(MaskOrder, Mask);
  if (ShuffleVectorInst::isIdentityMask(MaskOrder, Sz)) {
    Order.clear();
    return;
  }
  Order.assign(Sz, Sz);
  for (unsigned I = 0; I < Sz; ++I)
    if (MaskOrder[I] != PoisonMaskElem)
      Order[MaskOrder[I]] = I;
  fixupOrderingIndices(Order);
}
std::optional<BoUpSLP::OrdersType>
BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE,
                                  bool TopToBottom, bool IgnoreReorder) {
  assert(TE.isGather() && "Expected gather node only.");
  // Try to find subvector extract/insert patterns and reorder only such
  // patterns.
  SmallVector<Value *> GatheredScalars(TE.Scalars.begin(), TE.Scalars.end());
  Type *ScalarTy = GatheredScalars.front()->getType();
  size_t NumScalars = GatheredScalars.size();
  if (!isValidElementType(ScalarTy))
    return std::nullopt;
  auto *VecTy = getWidenedType(ScalarTy, NumScalars);
  unsigned NumParts = ::getNumberOfParts(*TTI, VecTy, NumScalars);
  SmallVector<int> ExtractMask;
  SmallVector<int> Mask;
  SmallVector<SmallVector<const TreeEntry *>> Entries;
  SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
      ExtractShuffles =
          tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
  SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> GatherShuffles =
      isGatherShuffledEntry(&TE, GatheredScalars, Mask, Entries, NumParts,
                            /*ForOrder=*/true);
  // No shuffled operands - ignore.
  if (GatherShuffles.empty() && ExtractShuffles.empty())
    return std::nullopt;
  OrdersType CurrentOrder(NumScalars, NumScalars);
  if (GatherShuffles.size() == 1 &&
      *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
      Entries.front().front()->isSame(TE.Scalars)) {
    // Exclude nodes, which require shuffling of the whole graph.
    if (!TopToBottom)
      return std::nullopt;
    // Perfect match in the graph, will reuse the previously vectorized
    // node. Cost is 0.
    if (Entries.front().front()->UserTreeIndex.UserTE ==
        TE.UserTreeIndex.UserTE)
      return std::nullopt;
    // Exclude if the matched node is the root of the graph and reordering
    // of the root is not allowed.
    if (!IgnoreReorder && Entries.front().front()->Idx == 0)
      return std::nullopt;
    // Exclude "alternate" even/odd pairs, reusing such nodes requires full
    // permutation anyway.
    if (!Entries.front().front()->ReuseShuffleIndices.empty() &&
        TE.getVectorFactor() == 2 && Mask.size() == 2 &&
        any_of(enumerate(Mask), [](const auto &P) {
          return P.value() % 2 != static_cast<int>(P.index()) % 2;
        }))
      return std::nullopt;
    std::iota(CurrentOrder.begin(), CurrentOrder.end(), 0);
    return CurrentOrder;
  }
  auto IsSplatMask = [](ArrayRef<int> Mask) {
    int SingleElt = PoisonMaskElem;
    return all_of(Mask, [&](int I) {
      if (SingleElt == PoisonMaskElem && I != PoisonMaskElem)
        SingleElt = I;
      return I == PoisonMaskElem || I == SingleElt;
    });
  };
  // Exclusive broadcast mask - ignore.
  if ((ExtractShuffles.empty() && IsSplatMask(Mask) &&
       (Entries.size() != 1 ||
        Entries.front().front()->ReorderIndices.empty())) ||
      (GatherShuffles.empty() && IsSplatMask(ExtractMask)))
    return std::nullopt;
  SmallBitVector ShuffledSubMasks(NumParts);
  auto TransformMaskToOrder =
      [&](MutableArrayRef<unsigned> CurrentOrder, ArrayRef<int> Mask,
          int PartSz, int NumParts, function_ref<unsigned(unsigned)> GetVF) {
        for (int I : seq<int>(0, NumParts)) {
          if (ShuffledSubMasks.test(I))
            continue;
          const int VF = GetVF(I);
          if (VF == 0)
            continue;
          unsigned Limit = getNumElems(CurrentOrder.size(), PartSz, I);
          MutableArrayRef<unsigned> Slice =
              CurrentOrder.slice(I * PartSz, Limit);
          // Shuffle of at least 2 vectors - ignore.
          if (any_of(Slice, [&](unsigned I) { return I != NumScalars; })) {
            std::fill(Slice.begin(), Slice.end(), NumScalars);
            ShuffledSubMasks.set(I);
            continue;
          }
          // Try to include the whole distinct part of the vector, if any.
          int FirstMin = INT_MAX;
          int SecondVecFound = false;
          for (int K : seq<int>(Limit)) {
            int Idx = Mask[I * PartSz + K];
            if (Idx == PoisonMaskElem) {
              Value *V = GatheredScalars[I * PartSz + K];
              if (isConstant(V) && !isa<PoisonValue>(V)) {
                SecondVecFound = true;
                break;
              }
              continue;
            }
            if (Idx < VF) {
              if (FirstMin > Idx)
                FirstMin = Idx;
            } else {
              SecondVecFound = true;
              break;
            }
          }
          FirstMin = (FirstMin / PartSz) * PartSz;
          // Shuffle of at least 2 vectors - ignore.
          if (SecondVecFound) {
            std::fill(Slice.begin(), Slice.end(), NumScalars);
            ShuffledSubMasks.set(I);
            continue;
          }
          for (int K : seq<int>(Limit)) {
            int Idx = Mask[I * PartSz + K];
            if (Idx == PoisonMaskElem)
              continue;
            Idx -= FirstMin;
            if (Idx >= PartSz) {
              SecondVecFound = true;
              break;
            }
            if (CurrentOrder[I * PartSz + Idx] >
                    static_cast<unsigned>(I * PartSz + K) &&
                CurrentOrder[I * PartSz + Idx] !=
                    static_cast<unsigned>(I * PartSz + Idx))
              CurrentOrder[I * PartSz + Idx] = I * PartSz + K;
          }
          // Shuffle of at least 2 vectors - ignore.
          if (SecondVecFound) {
            std::fill(Slice.begin(), Slice.end(), NumScalars);
            ShuffledSubMasks.set(I);
            continue;
          }
        }
      };
  int PartSz = getPartNumElems(NumScalars, NumParts);
  if (!ExtractShuffles.empty())
    TransformMaskToOrder(
        CurrentOrder, ExtractMask, PartSz, NumParts, [&](unsigned I) {
          if (!ExtractShuffles[I])
            return 0U;
          unsigned VF = 0;
          unsigned Sz = getNumElems(TE.getVectorFactor(), PartSz, I);
          for (unsigned Idx : seq<unsigned>(Sz)) {
            int K = I * PartSz + Idx;
            if (ExtractMask[K] == PoisonMaskElem)
              continue;
            if (!TE.ReuseShuffleIndices.empty())
              K = TE.ReuseShuffleIndices[K];
            if (K == PoisonMaskElem)
              continue;
            if (!TE.ReorderIndices.empty())
              K = std::distance(TE.ReorderIndices.begin(),
                                find(TE.ReorderIndices, K));
            auto *EI = dyn_cast<ExtractElementInst>(TE.Scalars[K]);
            if (!EI)
              continue;
            VF = std::max(VF, cast<VectorType>(EI->getVectorOperandType())
                                  ->getElementCount()
                                  .getKnownMinValue());
          }
          return VF;
        });
  // Check special corner case - single shuffle of the same entry.
  if (GatherShuffles.size() == 1 && NumParts != 1) {
    if (ShuffledSubMasks.any())
      return std::nullopt;
    PartSz = NumScalars;
    NumParts = 1;
  }
  if (!Entries.empty())
    TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](unsigned I) {
      if (!GatherShuffles[I])
        return 0U;
      return std::max(Entries[I].front()->getVectorFactor(),
                      Entries[I].back()->getVectorFactor());
    });
  unsigned NumUndefs = count(CurrentOrder, NumScalars);
  if (ShuffledSubMasks.all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2))
    return std::nullopt;
  return std::move(CurrentOrder);
}
/// Checks if the provided pointers \p Ptr1 and \p Ptr2 are compatible, i.e.
/// either they are simple GEPs with a single constant-or-same-opcode index,
/// or opcode comparison is disabled.
static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
                                  const TargetLibraryInfo &TLI,
                                  bool CompareOpcodes = true) {
  if (getUnderlyingObject(Ptr1) != getUnderlyingObject(Ptr2))
    return false;
  auto *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
  auto *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
  return (!GEP1 || GEP1->getNumOperands() == 2) &&
         (!GEP2 || GEP2->getNumOperands() == 2) &&
         (((!GEP1 || isConstant(GEP1->getOperand(1))) &&
           (!GEP2 || isConstant(GEP2->getOperand(1)))) ||
          !CompareOpcodes ||
          (GEP1 && GEP2 &&
           getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)));
}

/// Calculates the minimal alignment as the common alignment.
template <typename T>
static Align computeCommonAlignment(ArrayRef<Value *> VL) {
  Align CommonAlignment = cast<T>(VL.front())->getAlign();
  for (Value *V : VL.drop_front())
    CommonAlignment = std::min(CommonAlignment, cast<T>(V)->getAlign());
  return CommonAlignment;
}

/// Check if \p Order represents reverse order.
static bool isReverseOrder(ArrayRef<unsigned> Order) {
  assert(!Order.empty() &&
         "Order is empty. Please check it before using isReverseOrder.");
  unsigned Sz = Order.size();
  return all_of(enumerate(Order), [&](const auto &Pair) {
    return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
  });
}
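// For example, Order = {3, 2, 1, 0} is a reverse order for Sz == 4. Unset
// positions (holding the sentinel value Sz) are treated as "don't care":
// {3, 2, 4, 0} is also accepted because index 2 is unset.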
/// Checks if the provided list of pointers \p PointerOps represents strided
/// pointers for the type \p ElemTy, where the stride is a run-time SCEV
/// value. \returns the stride SCEV on success, nullptr otherwise.
static const SCEV *calculateRtStride(ArrayRef<Value *> PointerOps,
                                     Type *ElemTy, const DataLayout &DL,
                                     ScalarEvolution &SE,
                                     SmallVectorImpl<unsigned> &SortedIndices) {
  SmallVector<const SCEV *> SCEVs;
  const SCEV *PtrSCEVLowest = nullptr;
  const SCEV *PtrSCEVHighest = nullptr;
  // Find the lower/upper pointers from the PointerOps (i.e. with lowest and
  // highest SCEV values).
  for (Value *Ptr : PointerOps) {
    const SCEV *PtrSCEV = SE.getSCEV(Ptr);
    if (!PtrSCEV)
      return nullptr;
    SCEVs.push_back(PtrSCEV);
    if (!PtrSCEVLowest && !PtrSCEVHighest) {
      PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
      continue;
    }
    const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
    if (isa<SCEVCouldNotCompute>(Diff))
      return nullptr;
    if (SE.isKnownNegative(Diff)) {
      PtrSCEVLowest = PtrSCEV;
      continue;
    }
    Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVHighest);
    if (isa<SCEVCouldNotCompute>(Diff))
      return nullptr;
    if (SE.isKnownPositive(Diff))
      PtrSCEVHighest = PtrSCEV;
  }
  // Dist = PtrSCEVHighest - PtrSCEVLowest.
  const SCEV *Dist = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEVLowest);
  if (isa<SCEVCouldNotCompute>(Dist))
    return nullptr;
  int Size = DL.getTypeStoreSize(ElemTy);
  auto TryGetStride = [&](const SCEV *Dist,
                          const SCEV *Multiplier) -> const SCEV * {
    if (const auto *M = dyn_cast<SCEVMulExpr>(Dist)) {
      if (M->getOperand(0) == Multiplier)
        return M->getOperand(1);
      if (M->getOperand(1) == Multiplier)
        return M->getOperand(0);
      return nullptr;
    }
    if (Multiplier == Dist)
      return SE.getConstant(Dist->getType(), 1);
    return nullptr;
  };
  // Stride_in_elements = Dist / (Size * (num_pointers - 1)).
  const SCEV *Stride = nullptr;
  if (Size != 1 || SCEVs.size() > 2) {
    const SCEV *Sz =
        SE.getConstant(Dist->getType(), Size * (SCEVs.size() - 1));
    Stride = TryGetStride(Dist, Sz);
    if (!Stride)
      return nullptr;
  }
  if (!Stride || isa<SCEVConstant>(Stride))
    return nullptr;
  // Iterate through all pointers and check if all distances are unique
  // multiples of Stride.
  using DistOrdPair = std::pair<int64_t, int>;
  auto Compare = llvm::less_first();
  std::set<DistOrdPair, decltype(Compare)> Offsets(Compare);
  int Cnt = 0;
  bool IsConsecutive = true;
  for (const SCEV *PtrSCEV : SCEVs) {
    int64_t Dist = 0;
    if (PtrSCEV != PtrSCEVLowest) {
      const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
      const SCEV *Coeff = TryGetStride(Diff, Stride);
      if (!Coeff)
        return nullptr;
      const auto *SC = dyn_cast<SCEVConstant>(Coeff);
      if (!SC || isa<SCEVCouldNotCompute>(SC))
        return nullptr;
      Dist = SC->getAPInt().getZExtValue();
    }
    // If the strides are not the same or repeated, we can't vectorize.
    if ((Dist / Size) * Size != Dist || (Dist / Size) >= (int64_t)SCEVs.size())
      return nullptr;
    auto Res = Offsets.emplace(Dist, Cnt);
    if (!Res.second)
      return nullptr;
    // Consecutive order if the inserted element is the last one.
    IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
    ++Cnt;
  }
  if (Offsets.size() != SCEVs.size())
    return nullptr;
  SortedIndices.clear();
  if (!IsConsecutive) {
    // Fill SortedIndices array only if it is non-consecutive.
    SortedIndices.resize(PointerOps.size(), 0);
    Cnt = 0;
    for (const std::pair<int64_t, int> &Pair : Offsets) {
      SortedIndices[Cnt] = Pair.second;
      ++Cnt;
    }
  }
  return Stride;
}
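// A rough illustration: for four i8 loads from %p, %p + %s, %p + 2*%s and
// %p + 3*%s (with %s a loop-invariant, non-constant value), Dist is the SCEV
// (3 * %s) and the multiplier Size * (num_pointers - 1) is 3, so TryGetStride
// peels the factor and returns %s as the run-time stride. Each pointer's
// distance from the lowest one is then verified to be a unique multiple of
// %s, and SortedIndices stays empty because the pointers are already in
// increasing order.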
static std::pair<InstructionCost, InstructionCost>
getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
            Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
            Type *ScalarTy, VectorType *VecTy);

/// Returns the cost of the shuffle instructions with the given \p Kind,
/// vector type \p Tp and optional \p Mask. Adds SLP-specific cost estimation
/// for the insert subvector pattern.
static InstructionCost
getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind,
               VectorType *DstTy, VectorType *Tp, ArrayRef<int> Mask = {},
               TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
               int Index = 0, VectorType *SubTp = nullptr,
               ArrayRef<const Value *> Args = {}) {
  if (Kind != TTI::SK_PermuteTwoSrc)
    return TTI.getShuffleCost(Kind, DstTy, Tp, Mask, CostKind, Index, SubTp,
                              Args);
  int NumSrcElts = Tp->getElementCount().getKnownMinValue();
  int NumSubElts;
  if (Mask.size() > 2 &&
      ShuffleVectorInst::isInsertSubvectorMask(Mask, NumSrcElts, NumSubElts,
                                               Index)) {
    if (Index + NumSubElts > NumSrcElts &&
        Index + NumSrcElts <= static_cast<int>(Mask.size()))
      return TTI.getShuffleCost(TTI::SK_InsertSubvector, DstTy, Tp, Mask,
                                CostKind, Index, SubTp, Args);
  }
  return TTI.getShuffleCost(Kind, DstTy, Tp, Mask, CostKind, Index, SubTp,
                            Args);
}

/// Calculates the scalarization overhead, correctly handling the REVEC case
/// where the "scalar" type is itself a fixed vector.
static InstructionCost
getScalarizationOverhead(const TargetTransformInfo &TTI, Type *ScalarTy,
                         VectorType *Ty, const APInt &DemandedElts,
                         bool Insert, bool Extract,
                         TTI::TargetCostKind CostKind,
                         bool ForPoisonSrc = true, ArrayRef<Value *> VL = {}) {
  assert(!isa<ScalableVectorType>(Ty) &&
         "ScalableVectorType is not supported.");
  assert(getNumElements(ScalarTy) * DemandedElts.getBitWidth() ==
             getNumElements(Ty) &&
         "Incorrect usage.");
  if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
    unsigned ScalarTyNumElements = VecTy->getNumElements();
    InstructionCost Cost = 0;
    for (unsigned I : seq<unsigned>(DemandedElts.getBitWidth())) {
      if (!DemandedElts[I])
        continue;
      if (Insert)
        Cost += getShuffleCost(TTI, TTI::SK_InsertSubvector, Ty, Ty, {},
                               CostKind, I * ScalarTyNumElements, VecTy);
      if (Extract)
        Cost += getShuffleCost(TTI, TTI::SK_ExtractSubvector, Ty, Ty, {},
                               CostKind, I * ScalarTyNumElements, VecTy);
    }
    return Cost;
  }
  return TTI.getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
                                      CostKind, ForPoisonSrc, VL);
}

/// Returns the cost of the vector instruction, correctly handling the REVEC
/// case for extractelement.
static InstructionCost getVectorInstrCost(
    const TargetTransformInfo &TTI, Type *ScalarTy, unsigned Opcode,
    Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Scalar,
    ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) {
  if (Opcode == Instruction::ExtractElement) {
    if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy))
      return getShuffleCost(TTI, TTI::SK_ExtractSubvector,
                            cast<VectorType>(Val), cast<VectorType>(Val), {},
                            CostKind, Index * VecTy->getNumElements(), VecTy);
  }
  return TTI.getVectorInstrCost(Opcode, Val, CostKind, Index, Scalar,
                                ScalarUserAndIdx);
}

/// Returns the cost of an extract-with-extend, correctly handling the REVEC
/// case where the destination is itself a fixed vector.
static InstructionCost
getExtractWithExtendCost(const TargetTransformInfo &TTI, unsigned Opcode,
                         Type *Dst, VectorType *VecTy, unsigned Index,
                         TTI::TargetCostKind CostKind) {
  if (auto *ScalarTy = dyn_cast<FixedVectorType>(Dst)) {
    auto *SubTp =
        getWidenedType(VecTy->getElementType(), ScalarTy->getNumElements());
    return getShuffleCost(TTI, TTI::SK_ExtractSubvector, VecTy, VecTy, {},
                          CostKind, Index * ScalarTy->getNumElements(),
                          SubTp) +
           TTI.getCastInstrCost(Opcode, Dst, SubTp, TTI::CastContextHint::None,
                                CostKind);
  }
  return TTI.getExtractWithExtendCost(Opcode, Dst, VecTy, Index, CostKind);
}
/// Correctly creates an insert_subvector, checking that the index is a
/// multiple of the subvector length. Otherwise, generates a shuffle using
/// \p Generator or a default shuffle.
static Value *createInsertVector(
    IRBuilderBase &Builder, Value *Vec, Value *V, unsigned Index,
    function_ref<Value *(Value *, Value *, ArrayRef<int>)> Generator = {}) {
  const unsigned SubVecVF = getNumElements(V->getType());
  const unsigned VecVF = getNumElements(Vec->getType());
  if (isa<PoisonValue>(Vec)) {
    // Insert into a poison vector by padding the subvector with poison.
    SmallVector<int> Mask(VecVF, PoisonMaskElem);
    auto *Begin = std::next(Mask.begin(), Index);
    std::iota(Begin, std::next(Begin, SubVecVF), 0);
    Vec = Builder.CreateShuffleVector(V, Mask);
    return Vec;
  }
  SmallVector<int> Mask(VecVF, PoisonMaskElem);
  std::iota(Mask.begin(), Mask.end(), 0);
  std::iota(std::next(Mask.begin(), Index),
            std::next(Mask.begin(), Index + SubVecVF), VecVF);
  if (Generator)
    return Generator(Vec, V, Mask);
  // 1. Resize V to the size of Vec.
  SmallVector<int> ResizeMask(VecVF, PoisonMaskElem);
  std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), SubVecVF), 0);
  V = Builder.CreateShuffleVector(V, ResizeMask);
  // 2. Insert V into Vec with a two-source shuffle.
  return Builder.CreateShuffleVector(Vec, V, Mask);
}

/// Correctly creates an extract_subvector, checking that the index is a
/// multiple of the subvector length. Otherwise, generates a shuffle using a
/// default mask.
static Value *createExtractVector(IRBuilderBase &Builder, Value *Vec,
                                  unsigned SubVecVF, unsigned Index) {
  SmallVector<int> Mask(SubVecVF, PoisonMaskElem);
  std::iota(Mask.begin(), Mask.end(), Index);
  return Builder.CreateShuffleVector(Vec, Mask);
}
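// For example, inserting a 2-element subvector at Index 1 into a 4-element
// vector builds the two-source mask <0, 4, 5, 3>: lanes 0 and 3 keep Vec's
// elements, lanes 1-2 take the (resized) subvector's elements 4 and 5.
// Extracting 2 elements at Index 2 simply uses the mask <2, 3>.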
/// Builds a compress-like mask for shuffles for the given \p PointerOps,
/// ordered with \p Order.
/// \returns true if the mask represents strided access, false otherwise.
static bool buildCompressMask(ArrayRef<Value *> PointerOps,
                              ArrayRef<unsigned> Order, Type *ScalarTy,
                              const DataLayout &DL, ScalarEvolution &SE,
                              SmallVectorImpl<int> &CompressMask) {
  const unsigned Sz = PointerOps.size();
  CompressMask.assign(Sz, PoisonMaskElem);
  // The first element is always set.
  CompressMask[0] = 0;
  Value *Ptr0 = Order.empty() ? PointerOps.front() : PointerOps[Order.front()];
  std::optional<unsigned> Stride = 0;
  for (unsigned I : seq<unsigned>(1, Sz)) {
    Value *Ptr = Order.empty() ? PointerOps[I] : PointerOps[Order[I]];
    std::optional<int64_t> OptPos =
        getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, DL, SE);
    if (!OptPos || OptPos > std::numeric_limits<unsigned>::max())
      return false;
    unsigned Pos = static_cast<unsigned>(*OptPos);
    CompressMask[I] = Pos;
    if (!Stride)
      continue;
    if (*Stride == 0 && Pos != 0)
      *Stride = Pos;
    if (Pos != *Stride * I)
      Stride.reset();
  }
  return Stride.has_value();
}
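// For example, pointers at element distances 0, 2, 4, 6 from the base yield
// CompressMask = {0, 2, 4, 6}; every position equals 2 * I, so the access is
// strided with stride 2 and the function returns true. Distances 0, 1, 3, 6
// yield CompressMask = {0, 1, 3, 6}: position 3 breaks the pattern at I == 2,
// so the stride is reset and the function returns false, although the mask
// can still drive a load + compress shuffle.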
/// Checks if the \p VL can be transformed to a (masked) load + compress or a
/// (masked) interleaved load.
static bool isMaskedLoadCompress(
    ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
    ArrayRef<unsigned> Order, const TargetTransformInfo &TTI,
    const DataLayout &DL, ScalarEvolution &SE, AssumptionCache &AC,
    const DominatorTree &DT, const TargetLibraryInfo &TLI,
    const function_ref<bool(Value *)> AreAllUsersVectorized, bool &IsMasked,
    unsigned &InterleaveFactor, SmallVectorImpl<int> &CompressMask,
    VectorType *&LoadVecTy) {
  InterleaveFactor = 0;
  Type *ScalarTy = VL.front()->getType();
  const size_t Sz = VL.size();
  auto *VecTy = getWidenedType(ScalarTy, Sz);
  constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  SmallVector<int> Mask;
  if (!Order.empty())
    inversePermutation(Order, Mask);
  // Check external uses.
  for (const auto [I, V] : enumerate(VL)) {
    if (AreAllUsersVectorized(V))
      continue;
    InstructionCost ExtractCost =
        TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind,
                               Mask.empty() ? I : Mask[I]);
    InstructionCost ScalarCost =
        TTI.getInstructionCost(cast<Instruction>(V), CostKind);
    if (ExtractCost <= ScalarCost)
      return false;
  }
  Value *Ptr0;
  Value *PtrN;
  if (Order.empty()) {
    Ptr0 = PointerOps.front();
    PtrN = PointerOps.back();
  } else {
    Ptr0 = PointerOps[Order.front()];
    PtrN = PointerOps[Order.back()];
  }
  std::optional<int64_t> Diff =
      getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, DL, SE);
  if (!Diff)
    return false;
  const size_t MaxRegSize =
      TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
          .getFixedValue();
  // Check for very large distances between pointers.
  if (*Diff / Sz >= MaxRegSize / 8)
    return false;
  LoadVecTy = getWidenedType(ScalarTy, *Diff + 1);
  auto *LI = cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()]);
  Align CommonAlignment = LI->getAlign();
  IsMasked = !isSafeToLoadUnconditionally(
      Ptr0, LoadVecTy, CommonAlignment, DL,
      cast<LoadInst>(Order.empty() ? VL.back() : VL[Order.back()]), &AC, &DT,
      &TLI);
  if (IsMasked && !TTI.isLegalMaskedLoad(LoadVecTy, CommonAlignment,
                                         LI->getPointerAddressSpace()))
    return false;
  bool IsStrided =
      buildCompressMask(PointerOps, Order, ScalarTy, DL, SE, CompressMask);
  assert(CompressMask.size() >= 2 && "At least two elements are required");
  // The cost of scalar loads and scalar GEPs.
  auto [ScalarGEPCost, VectorGEPCost] =
      getGEPCosts(TTI, PointerOps, PointerOps.front(),
                  Instruction::GetElementPtr, CostKind, ScalarTy, LoadVecTy);
  InstructionCost GatherCost =
      ScalarGEPCost +
      std::accumulate(VL.begin(), VL.end(), InstructionCost(),
                      [&](InstructionCost C, Value *V) {
                        return C + TTI.getInstructionCost(cast<Instruction>(V),
                                                          CostKind);
                      });
  InstructionCost LoadCost = 0;
  if (IsMasked)
    LoadCost = TTI.getMemIntrinsicInstrCost(
        MemIntrinsicCostAttributes(Intrinsic::masked_load, LoadVecTy,
                                   CommonAlignment,
                                   LI->getPointerAddressSpace()),
        CostKind);
  else
    LoadCost =
        TTI.getMemoryOpCost(Instruction::Load, LoadVecTy, CommonAlignment,
                            LI->getPointerAddressSpace(), CostKind);
  if (IsStrided && !IsMasked && Order.empty()) {
    // Check for a potential segmented (interleaved) load.
    auto *AlignedLoadVecTy = getWidenedType(
        ScalarTy, getFullVectorNumberOfElements(TTI, ScalarTy, *Diff + 1));
    if (!isSafeToLoadUnconditionally(Ptr0, AlignedLoadVecTy, CommonAlignment,
                                     DL, cast<LoadInst>(VL.back()), &AC, &DT,
                                     &TLI))
      AlignedLoadVecTy = LoadVecTy;
    if (TTI.isLegalInterleavedAccessType(AlignedLoadVecTy, CompressMask[1],
                                         CommonAlignment,
                                         LI->getPointerAddressSpace())) {
      InstructionCost InterleavedCost =
          VectorGEPCost + TTI.getInterleavedMemoryOpCost(
                              Instruction::Load, AlignedLoadVecTy,
                              CompressMask[1], {}, CommonAlignment,
                              LI->getPointerAddressSpace(), CostKind,
                              IsMasked);
      if (InterleavedCost < GatherCost) {
        InterleaveFactor = CompressMask[1];
        LoadVecTy = AlignedLoadVecTy;
        return true;
      }
    }
  }
  if (!Order.empty()) {
    SmallVector<int> NewMask(Sz, PoisonMaskElem);
    for (unsigned I : seq<unsigned>(Sz))
      NewMask[I] = CompressMask[Mask[I]];
    CompressMask.swap(NewMask);
  }
  InstructionCost CompressCost = ::getShuffleCost(
      TTI, TTI::SK_PermuteSingleSrc, VecTy, LoadVecTy, CompressMask, CostKind);
  InstructionCost TotalVecCost = VectorGEPCost + LoadCost + CompressCost;
  return TotalVecCost < GatherCost;
}

/// Checks if the \p VL can be transformed to a (masked) load + compress or a
/// (masked) interleaved load.
static bool
isMaskedLoadCompress(ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
                     ArrayRef<unsigned> Order, const TargetTransformInfo &TTI,
                     const DataLayout &DL, ScalarEvolution &SE,
                     AssumptionCache &AC, const DominatorTree &DT,
                     const TargetLibraryInfo &TLI,
                     const function_ref<bool(Value *)> AreAllUsersVectorized) {
  bool IsMasked;
  unsigned InterleaveFactor;
  SmallVector<int> CompressMask;
  VectorType *LoadVecTy;
  return isMaskedLoadCompress(VL, PointerOps, Order, TTI, DL, SE, AC, DT, TLI,
                              AreAllUsersVectorized, IsMasked,
                              InterleaveFactor, CompressMask, LoadVecTy);
}
/// Checks if strided loads can be generated out of the scalar loads.
bool BoUpSLP::isStridedLoad(ArrayRef<Value *> PointerOps, Type *ScalarTy,
                            Align Alignment, const int64_t Diff,
                            const size_t Sz) const {
  if (Diff % (Sz - 1) != 0)
    return false;
  // Try to generate a strided load node.
  auto IsAnyPointerUsedOutGraph = any_of(PointerOps, [&](Value *V) {
    return isa<Instruction>(V) && any_of(V->users(), [&](User *U) {
             return !isVectorized(U) && !MustGather.contains(U);
           });
  });
  const uint64_t AbsoluteDiff = std::abs(Diff);
  auto *VecTy = getWidenedType(ScalarTy, Sz);
  if (IsAnyPointerUsedOutGraph ||
      (AbsoluteDiff > Sz &&
       (Sz > MinProfitableStridedLoads ||
        (AbsoluteDiff <= MaxProfitableLoadStride * Sz &&
         AbsoluteDiff % Sz == 0 && has_single_bit(AbsoluteDiff / Sz)))) ||
      Diff == -(static_cast<int64_t>(Sz) - 1)) {
    int64_t Stride = Diff / static_cast<int64_t>(Sz - 1);
    if (Diff != Stride * static_cast<int64_t>(Sz - 1))
      return false;
    if (!TTI->isLegalStridedLoadStore(VecTy, Alignment))
      return false;
    return true;
  }
  return false;
}
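// For example, four loads whose sorted pointer distance Diff is -3 (i.e.
// consecutive but in reverse address order) match the
// Diff == -(Sz - 1) case and produce a strided access with Stride == -1,
// provided the target reports strided load/store as legal for the type.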
/// Checks if the pointers form a constant-stride access, possibly after
/// widening groups of consecutive loads into wider elements.
bool BoUpSLP::analyzeConstantStrideCandidate(
    ArrayRef<Value *> PointerOps, Type *ScalarTy, Align Alignment,
    const SmallVectorImpl<unsigned> &SortedIndices, const int64_t Diff,
    Value *Ptr0, Value *PtrN, StridedPtrInfo &SPtrInfo) const {
  const size_t Sz = PointerOps.size();
  SmallVector<int64_t> SortedOffsetsFromBase(Sz);
  // Go through `PointerOps` in sorted order and record each pointer's offset
  // from Ptr0.
  for (unsigned I : seq<unsigned>(Sz)) {
    Value *Ptr =
        SortedIndices.empty() ? PointerOps[I] : PointerOps[SortedIndices[I]];
    SortedOffsetsFromBase[I] =
        *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, *DL, *SE);
  }
  assert(SortedOffsetsFromBase.size() > 1 &&
         "Trying to generate strided load for less than 2 loads");
  // The loads may form groups of consecutive elements, with a constant
  // stride between group starts.
  int64_t StrideWithinGroup =
      SortedOffsetsFromBase[1] - SortedOffsetsFromBase[0];
  // Determine the size of the first group: find the first index at which the
  // within-group stride is broken.
  auto IsEndOfGroupIndex = [=, &SortedOffsetsFromBase](unsigned Idx) {
    return SortedOffsetsFromBase[Idx] - SortedOffsetsFromBase[Idx - 1] !=
           StrideWithinGroup;
  };
  auto Indices = seq<unsigned>(1, Sz);
  const auto *FoundIt = find_if(Indices, IsEndOfGroupIndex);
  unsigned GroupSize = FoundIt != Indices.end() ? *FoundIt : Sz;

  unsigned VecSz = Sz;
  Type *NewScalarTy = ScalarTy;
  // If the loads do not form a single group, try to widen each group of
  // consecutive loads into one wider element.
  bool NeedsWidening = Sz != GroupSize;
  if (NeedsWidening) {
    if (Sz % GroupSize != 0)
      return false;
    if (StrideWithinGroup != 1)
      return false;
    VecSz = Sz / GroupSize;
    NewScalarTy = Type::getIntNTy(
        SE->getContext(),
        DL->getTypeSizeInBits(ScalarTy).getFixedValue() * GroupSize);
  }
  if (!isStridedLoad(PointerOps, NewScalarTy, Alignment, Diff, VecSz))
    return false;

  int64_t StrideIntVal = StrideWithinGroup;
  if (NeedsWidening) {
    // Verify that the stride between group starts is constant.
    unsigned CurrentGroupStartIdx = GroupSize;
    int64_t StrideBetweenGroups =
        SortedOffsetsFromBase[GroupSize] - SortedOffsetsFromBase[0];
    StrideIntVal = StrideBetweenGroups;
    for (; CurrentGroupStartIdx < Sz; CurrentGroupStartIdx += GroupSize) {
      if (SortedOffsetsFromBase[CurrentGroupStartIdx] -
              SortedOffsetsFromBase[CurrentGroupStartIdx - GroupSize] !=
          StrideBetweenGroups)
        return false;
    }
    // Verify that each group has exactly GroupSize consecutive elements.
    auto CheckGroup = [=](const unsigned StartIdx) -> bool {
      auto Indices = seq<unsigned>(StartIdx + 1, Sz);
      const auto *FoundIt = find_if(Indices, IsEndOfGroupIndex);
      unsigned GroupEndIdx = FoundIt != Indices.end() ? *FoundIt : Sz;
      return GroupEndIdx - StartIdx == GroupSize;
    };
    for (unsigned I = 0; I < Sz; I += GroupSize) {
      if (!CheckGroup(I))
        return false;
    }
  }

  Type *StrideTy = DL->getIndexType(Ptr0->getType());
  SPtrInfo.StrideVal = ConstantInt::get(StrideTy, StrideIntVal);
  SPtrInfo.Ty = getWidenedType(NewScalarTy, VecSz);
  return true;
}
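// A rough illustration of the widening path: eight i32 loads at element
// offsets {0, 1, 100, 101, 200, 201, 300, 301} have StrideWithinGroup == 1
// and GroupSize == 2, so each pair is widened into one i64 element
// (VecSz == 4). The stride between group starts is the constant 100, so the
// result is modeled as a strided load of <4 x i64> with that stride.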
/// Checks if the pointers form a run-time strided access pattern.
bool BoUpSLP::analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps,
                                       Type *ScalarTy, Align CommonAlignment,
                                       SmallVectorImpl<unsigned> &SortedIndices,
                                       StridedPtrInfo &SPtrInfo) const {
  const unsigned Sz = PointerOps.size();
  FixedVectorType *StridedLoadTy = getWidenedType(ScalarTy, Sz);
  if (Sz <= MinProfitableStridedLoads || !TTI->isTypeLegal(StridedLoadTy) ||
      !TTI->isLegalStridedLoadStore(StridedLoadTy, CommonAlignment))
    return false;
  if (const SCEV *Stride =
          calculateRtStride(PointerOps, ScalarTy, *DL, *SE, SortedIndices)) {
    SPtrInfo.Ty = StridedLoadTy;
    SPtrInfo.StrideSCEV = Stride;
    return true;
  }
  return false;
}
BoUpSLP::LoadsState
BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
                           SmallVectorImpl<unsigned> &Order,
                           SmallVectorImpl<Value *> &PointerOps,
                           StridedPtrInfo &SPtrInfo, unsigned *BestVF,
                           bool TryRecursiveCheck) const {
  // Check that a vectorized load would load the same memory as a scalar
  // load. For example, we don't want to vectorize loads that are smaller
  // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
  // treats loading/storing it as an i8 struct. If we vectorize loads/stores
  // from such a struct, we read/write packed bits disagreeing with the
  // unvectorized version.
  Type *ScalarTy = VL0->getType();
  if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy))
    return LoadsState::Gather;

  // Make sure all loads in the bundle are simple - we can't vectorize
  // atomic or volatile loads.
  PointerOps.clear();
  const size_t Sz = VL.size();
  PointerOps.resize(Sz);
  auto *POIter = PointerOps.begin();
  for (Value *V : VL) {
    auto *L = dyn_cast<LoadInst>(V);
    if (!L || !L->isSimple())
      return LoadsState::Gather;
    *POIter = L->getPointerOperand();
    ++POIter;
  }

  Order.clear();
  // Check the order of pointer operands or that all pointers are the same.
  bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, Order);
  auto *VecTy = getWidenedType(ScalarTy, Sz);
  Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
  if (!IsSorted) {
    if (analyzeRtStrideCandidate(PointerOps, ScalarTy, CommonAlignment, Order,
                                 SPtrInfo))
      return LoadsState::StridedVectorize;

    if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
        TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
      return LoadsState::Gather;
  } else {
    Value *Ptr0;
    Value *PtrN;
    if (Order.empty()) {
      Ptr0 = PointerOps.front();
      PtrN = PointerOps.back();
    } else {
      Ptr0 = PointerOps[Order.front()];
      PtrN = PointerOps[Order.back()];
    }
    std::optional<int64_t> Diff =
        getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
    // Check that the sorted loads are consecutive.
    if (static_cast<uint64_t>(*Diff) == Sz - 1)
      return LoadsState::Vectorize;
    if (isMaskedLoadCompress(VL, PointerOps, Order, *TTI, *DL, *SE, *AC, *DT,
                             *TLI, [&](Value *V) {
                               return areAllUsersVectorized(
                                   cast<Instruction>(V), UserIgnoreList);
                             }))
      return LoadsState::CompressVectorize;
    // Simple check if not a strided access - clear order.
    if (analyzeConstantStrideCandidate(PointerOps, ScalarTy, CommonAlignment,
                                       Order, *Diff, Ptr0, PtrN, SPtrInfo))
      return LoadsState::StridedVectorize;
    if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
        TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
      return LoadsState::Gather;
  }
  // Correctly identify and compare the cost of loads + shuffles rather than
  // strided/masked gather loads. Returns true if vectorized + shuffles
  // representation is better than just gather.
  auto CheckForShuffledLoads = [&, &TTI = *TTI](Align CommonAlignment,
                                                unsigned *BestVF,
                                                bool ProfitableGatherPointers) {
    if (BestVF)
      *BestVF = 0;
    // Compare masked gather cost and loads + insert subvector costs.
    TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
    auto [ScalarGEPCost, VectorGEPCost] =
        getGEPCosts(TTI, PointerOps, PointerOps.front(),
                    Instruction::GetElementPtr, CostKind, ScalarTy, VecTy);
    // Estimate the cost of the masked gather GEP. If not a splat, roughly
    // estimate as a buildvector, otherwise estimate as splat.
    APInt DemandedElts = APInt::getAllOnes(Sz);
    Type *PtrScalarTy = PointerOps.front()->getType()->getScalarType();
    VectorType *PtrVecTy = getWidenedType(PtrScalarTy, Sz);
    if (static_cast<unsigned>(count_if(
            PointerOps, IsaPred<GetElementPtrInst>)) < PointerOps.size() - 1 ||
        any_of(PointerOps, [&](Value *V) {
          return getUnderlyingObject(V) !=
                 getUnderlyingObject(PointerOps.front());
        }))
      VectorGEPCost += getScalarizationOverhead(TTI, PtrScalarTy, PtrVecTy,
                                                DemandedElts, /*Insert=*/true,
                                                /*Extract=*/false, CostKind);
    // The cost of the scalar loads.
    InstructionCost ScalarLoadsCost =
        std::accumulate(VL.begin(), VL.end(), InstructionCost(),
                        [&](InstructionCost C, Value *V) {
                          return C + TTI.getInstructionCost(
                                         cast<Instruction>(V), CostKind);
                        }) +
        ScalarGEPCost;
    // The cost of the masked gather.
    InstructionCost MaskedGatherCost =
        TTI.getMemIntrinsicInstrCost(
            MemIntrinsicCostAttributes(
                Intrinsic::masked_gather, VecTy,
                cast<LoadInst>(VL0)->getPointerOperand(),
                /*VariableMask=*/false, CommonAlignment),
            CostKind) +
        (ProfitableGatherPointers ? 0 : VectorGEPCost);
    InstructionCost GatherCost = ScalarLoadsCost;
    constexpr unsigned ListLimit = 4;
    if (!TryRecursiveCheck || VL.size() < ListLimit)
      return MaskedGatherCost - GatherCost >= -SLPCostThreshold;

    // FIXME: The following code has not been updated for non-power-of-2
    // vectors (and not whole registers).
    unsigned Sz = DL->getTypeSizeInBits(ScalarTy);
    unsigned MinVF = getMinVF(2 * Sz);
    DemandedElts.clearAllBits();
    // Iterate through possible vectorization factors and check if vectorized
    // + shuffles is better than just gather.
    SmallVector<LoadsState> States;
    for (unsigned VF = VL.size() / 2; VF >= MinVF; VF /= 2) {
      States.clear();
      for (unsigned Cnt = 0, End = VL.size(); Cnt + VF <= End; Cnt += VF) {
        ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
        SmallVector<unsigned> Order;
        SmallVector<Value *> PointerOps;
        LoadsState LS =
            canVectorizeLoads(Slice, Slice.front(), Order, PointerOps,
                              SPtrInfo, BestVF,
                              /*TryRecursiveCheck=*/false);
        // Check that the sorted loads are consecutive.
        if (LS == LoadsState::Gather) {
          if (BestVF) {
            DemandedElts.setAllBits();
            break;
          }
          DemandedElts.setBits(Cnt, Cnt + VF);
          continue;
        }
        States.push_back(LS);
      }
      if (DemandedElts.isAllOnes())
        // All loads gathered - try smaller VF.
        continue;
      // Can be vectorized later as a series of loads/insertelements.
      InstructionCost VecLdCost = 0;
      if (!DemandedElts.isZero()) {
        VecLdCost = TTI.getScalarizationOverhead(VecTy, DemandedElts,
                                                 /*Insert=*/true,
                                                 /*Extract=*/false, CostKind) +
                    ScalarGEPCost;
        for (unsigned Idx : seq<unsigned>(VL.size()))
          if (DemandedElts[Idx])
            VecLdCost += TTI.getInstructionCost(cast<Instruction>(VL[Idx]),
                                                CostKind);
      }
      auto *SubVecTy = getWidenedType(ScalarTy, VF);
      for (auto [I, LS] : enumerate(States)) {
        auto *LI0 = cast<LoadInst>(VL[I * VF]);
        InstructionCost VectorGEPCost =
            getGEPCosts(TTI, ArrayRef(PointerOps).slice(I * VF, VF),
                        LI0->getPointerOperand(), Instruction::GetElementPtr,
                        CostKind, ScalarTy, SubVecTy)
                .second;
        switch (LS) {
        case LoadsState::Vectorize:
          VecLdCost += TTI.getMemoryOpCost(Instruction::Load, SubVecTy,
                                           LI0->getAlign(),
                                           LI0->getPointerAddressSpace(),
                                           CostKind) +
                       VectorGEPCost;
          break;
        case LoadsState::StridedVectorize:
          VecLdCost += TTI.getMemIntrinsicInstrCost(
                           MemIntrinsicCostAttributes(
                               Intrinsic::experimental_vp_strided_load,
                               SubVecTy, LI0->getPointerOperand(),
                               /*VariableMask=*/false, CommonAlignment),
                           CostKind) +
                       VectorGEPCost;
          break;
        case LoadsState::CompressVectorize:
          VecLdCost += TTI.getMemIntrinsicInstrCost(
                           MemIntrinsicCostAttributes(
                               Intrinsic::masked_load, SubVecTy,
                               CommonAlignment,
                               LI0->getPointerAddressSpace()),
                           CostKind) +
                       VectorGEPCost +
                       ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc,
                                        SubVecTy, SubVecTy, {}, CostKind);
          break;
        case LoadsState::ScatterVectorize:
          VecLdCost += TTI.getMemIntrinsicInstrCost(
                           MemIntrinsicCostAttributes(
                               Intrinsic::masked_gather, SubVecTy,
                               LI0->getPointerOperand(),
                               /*VariableMask=*/false, CommonAlignment),
                           CostKind) +
                       VectorGEPCost;
          break;
        case LoadsState::Gather:
          // Gathers are already calculated - ignore.
          continue;
        }
        SmallVector<int> ShuffleMask(VL.size());
        for (int Idx : seq<int>(0, VL.size()))
          ShuffleMask[Idx] = Idx / VF == I ? VL.size() + Idx % VF : Idx;
        if (I > 0)
          VecLdCost +=
              ::getShuffleCost(TTI, TTI::SK_InsertSubvector, VecTy, VecTy,
                               ShuffleMask, CostKind, I * VF, SubVecTy);
      }
      // If the masked gather cost is higher - better to vectorize, so
      // consider it as a gather node. It will be better estimated later.
      if (MaskedGatherCost >= VecLdCost &&
          VecLdCost - GatherCost < -SLPCostThreshold) {
        if (BestVF)
          *BestVF = VF;
        return true;
      }
    }
    return MaskedGatherCost - GatherCost >= -SLPCostThreshold;
  };
  // TODO: need to improve analysis of the pointers, if not all of them are
  // GEPs or have > 2 operands, we end up with a gather node, which just
  // increases the cost.
  Loop *L = LI->getLoopFor(cast<LoadInst>(VL0)->getParent());
  bool ProfitableGatherPointers =
      L && Sz > 2 && static_cast<unsigned>(count_if(PointerOps, [L](Value *V) {
                       return L->isLoopInvariant(V);
                     })) <= Sz / 2;
  if (ProfitableGatherPointers || all_of(PointerOps, [](Value *P) {
        auto *GEP = dyn_cast<GetElementPtrInst>(P);
        return (GEP && GEP->getNumOperands() == 2 &&
                isa<Constant, Instruction>(GEP->getOperand(1)));
      })) {
    // Check if potential masked gather can be represented as series of
    // loads + insert subvectors.
    if (!TryRecursiveCheck || !CheckForShuffledLoads(CommonAlignment, BestVF,
                                                     ProfitableGatherPointers))
      return LoadsState::ScatterVectorize;
  }

  return LoadsState::Gather;
}
static bool clusterSortPtrAccesses(ArrayRef<Value *> VL,
                                   ArrayRef<BasicBlock *> BBs, Type *ElemTy,
                                   const DataLayout &DL, ScalarEvolution &SE,
                                   SmallVectorImpl<unsigned> &SortedIndices) {
  assert(
      all_of(VL, [](const Value *V) { return V->getType()->isPointerTy(); }) &&
      "Expected list of pointer operands.");
  // Map from (block, base) pairs to a vector of (Ptr, Offset, OrigIdx)
  // tuples, which we insert each pointer into, then sort and return the
  // sorted indices with values next to one another.
  SmallMapVector<std::pair<BasicBlock *, Value *>,
                 SmallVector<SmallVector<std::tuple<Value *, int, unsigned>>>,
                 8>
      Bases;
  Bases
      .try_emplace(std::make_pair(
          BBs.front(), getUnderlyingObject(VL.front(), RecursionMaxDepth)))
      .first->second.emplace_back()
      .emplace_back(VL.front(), 0U, 0U);

  SortedIndices.clear();
  for (auto [Cnt, Ptr] : enumerate(VL.drop_front())) {
    auto Key = std::make_pair(BBs[Cnt + 1],
                              getUnderlyingObject(Ptr, RecursionMaxDepth));
    bool Found = any_of(Bases.try_emplace(Key).first->second,
                        [&, &Cnt = Cnt, &Ptr = Ptr](auto &Base) {
                          std::optional<int64_t> Diff =
                              getPointersDiff(ElemTy, std::get<0>(Base.front()),
                                              ElemTy, Ptr, DL, SE,
                                              /*StrictCheck=*/true);
                          if (!Diff)
                            return false;

                          Base.emplace_back(Ptr, *Diff, Cnt + 1);
                          return true;
                        });

    if (!Found) {
      // If we haven't found enough to usefully cluster, return early.
      if (Bases.size() > VL.size() / 2 - 1)
        return false;

      // Not found already - add a new Base.
      Bases.find(Key)->second.emplace_back().emplace_back(Ptr, 0, Cnt + 1);
    }
  }

  if (Bases.size() == VL.size())
    return false;

  // If there is a single base and it is either a single pointer or covers
  // the whole list, the order is already as good as it gets.
  if (Bases.size() == 1 && (Bases.front().second.size() == 1 ||
                            Bases.front().second.size() == VL.size()))
    return false;

  // For each base, sort the pointers by increasing offset and check they are
  // consecutive; then order the bases themselves by walking their
  // underlying-object chains.
  auto ComparePointers = [](Value *Ptr1, Value *Ptr2) {
    SmallPtrSet<Value *, 13> FirstPointers;
    SmallPtrSet<Value *, 13> SecondPointers;
    Value *P1 = Ptr1;
    Value *P2 = Ptr2;
    unsigned Depth = 0;
    while (!FirstPointers.contains(P2) && !SecondPointers.contains(P1)) {
      if (P1 == P2 || Depth > RecursionMaxDepth)
        return false;
      FirstPointers.insert(P1);
      SecondPointers.insert(P2);
      P1 = getUnderlyingObject(P1, /*MaxLookup=*/1);
      P2 = getUnderlyingObject(P2, /*MaxLookup=*/1);
      ++Depth;
    }
    assert((FirstPointers.contains(P2) || SecondPointers.contains(P1)) &&
           "Unable to find matching root.");
    return FirstPointers.contains(P2) && !SecondPointers.contains(P1);
  };

  for (auto &Base : Bases) {
    for (auto &Vec : Base.second) {
      if (Vec.size() > 1) {
        stable_sort(Vec, llvm::less_second());
        int64_t InitialOffset = std::get<1>(Vec[0]);
        bool AnyConsecutive =
            all_of(enumerate(Vec), [InitialOffset](const auto &P) {
              return std::get<1>(P.value()) ==
                     int64_t(P.index()) + InitialOffset;
            });
        // Fill SortedIndices array only if it looks worth-while to sort the
        // ptrs.
        if (!AnyConsecutive)
          return false;
      }
    }
    stable_sort(Base.second, [&](const auto &V1, const auto &V2) {
      return ComparePointers(std::get<0>(V1.front()), std::get<0>(V2.front()));
    });
  }

  for (auto &T : Bases)
    for (const auto &Vec : T.second)
      for (const auto &P : Vec)
        SortedIndices.push_back(std::get<2>(P));

  assert(SortedIndices.size() == VL.size() &&
         "Expected SortedIndices to be the size of VL");
  return true;
}
std::optional<BoUpSLP::OrdersType>
BoUpSLP::findPartiallyOrderedLoads(const BoUpSLP::TreeEntry &TE) {
  assert(TE.isGather() && "Expected gather node only.");
  Type *ScalarTy = TE.Scalars[0]->getType();

  SmallVector<Value *> Ptrs;
  Ptrs.reserve(TE.Scalars.size());
  SmallVector<BasicBlock *> BBs;
  BBs.reserve(TE.Scalars.size());
  for (Value *V : TE.Scalars) {
    auto *L = dyn_cast<LoadInst>(V);
    if (!L || !L->isSimple())
      return std::nullopt;
    Ptrs.push_back(L->getPointerOperand());
    BBs.push_back(L->getParent());
  }

  BoUpSLP::OrdersType Order;
  if (!LoadEntriesToVectorize.contains(TE.Idx) &&
      clusterSortPtrAccesses(Ptrs, BBs, ScalarTy, *DL, *SE, Order))
    return std::move(Order);
  return std::nullopt;
}
/// Check if two insertelement instructions are from the same buildvector.
static bool areTwoInsertFromSameBuildVector(
    InsertElementInst *VU, InsertElementInst *V,
    function_ref<Value *(InsertElementInst *)> GetBaseOperand) {
  // Instructions must be from the same basic block.
  if (VU->getParent() != V->getParent())
    return false;
  // Checks if 2 insertelements are from the same buildvector.
  if (VU->getType() != V->getType())
    return false;
  // Multiple used inserts are separate nodes.
  if (!VU->hasOneUse() && !V->hasOneUse())
    return false;
  auto *IE1 = VU;
  auto *IE2 = V;
  std::optional<unsigned> Idx1 = getElementIndex(IE1);
  std::optional<unsigned> Idx2 = getElementIndex(IE2);
  if (Idx1 == std::nullopt || Idx2 == std::nullopt)
    return false;
  // Go through the vector operand of insertelement instructions trying to
  // find either VU as the original vector for IE2 or V as the original
  // vector for IE1.
  SmallBitVector ReusedIdx(
      cast<FixedVectorType>(VU->getType())->getNumElements());
  bool IsReusedIdx = false;
  do {
    if (IE2 == VU && !IE1)
      return VU->hasOneUse();
    if (IE1 == V && !IE2)
      return V->hasOneUse();
    if (IE1 && IE1 != V) {
      unsigned Idx = getElementIndex(IE1).value_or(*Idx2);
      IsReusedIdx |= ReusedIdx.test(Idx1);
      ReusedIdx.set(Idx1);
      if ((IE1 != VU && !IE1->hasOneUse()) || IsReusedIdx)
        IE1 = nullptr;
      else
        IE1 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE1));
    }
    if (IE2 && IE2 != VU) {
      unsigned Idx = getElementIndex(IE2).value_or(*Idx1);
      IsReusedIdx |= ReusedIdx.test(Idx2);
      ReusedIdx.set(Idx2);
      if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
        IE2 = nullptr;
      else
        IE2 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE2));
    }
  } while (!IsReusedIdx && (IE1 || IE2));
  return false;
}
std::optional<BoUpSLP::OrdersType>
BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom,
                           bool IgnoreReorder) {
  // No need to reorder if need to shuffle reuses, still need to shuffle the
  // node.
  if (!TE.ReuseShuffleIndices.empty()) {
    // FIXME: Support ReuseShuffleIndices for non-power-of-two vectors.
    assert(!TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI) &&
           "Reshuffling scalars not yet supported for nodes with padding");
    if (isSplat(TE.Scalars))
      return std::nullopt;
    // Check if the reuse shuffle indices can be improved by reordering. For
    // this, check that the reuse mask is "clustered", i.e. each scalar value
    // is used once in each submask of size Sz.
    unsigned Sz = TE.Scalars.size();
    if (TE.isGather()) {
      if (std::optional<OrdersType> CurrentOrder =
              findReusedOrderedScalars(TE, TopToBottom, IgnoreReorder)) {
        SmallVector<int> Mask;
        fixupOrderingIndices(*CurrentOrder);
        inversePermutation(*CurrentOrder, Mask);
        ::addMask(Mask, TE.ReuseShuffleIndices);
        OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor());
        unsigned Sz = TE.Scalars.size();
        for (int K = 0, E = TE.getVectorFactor() / Sz; K < E; ++K) {
          for (auto [I, Idx] : enumerate(ArrayRef(Mask).slice(K * Sz, Sz)))
            if (Idx != PoisonMaskElem)
              Res[Idx + K * Sz] = I + K * Sz;
        }
        return std::move(Res);
      }
    }
    if (Sz == 2 && TE.getVectorFactor() == 4 &&
        ::getNumberOfParts(*TTI,
                           getWidenedType(TE.Scalars.front()->getType(),
                                          2 * TE.getVectorFactor())) == 1)
      return std::nullopt;
    if (TE.ReuseShuffleIndices.size() % Sz != 0)
      return std::nullopt;
    if (!ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
                                                     Sz)) {
      SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
      if (TE.ReorderIndices.empty())
        std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
      else
        inversePermutation(TE.ReorderIndices, ReorderMask);
      ::addMask(ReorderMask, TE.ReuseShuffleIndices);
      unsigned VF = ReorderMask.size();
      OrdersType ResOrder(VF, VF);
      unsigned NumParts = divideCeil(VF, Sz);
      SmallBitVector UsedVals(NumParts);
      for (unsigned I = 0; I < VF; I += Sz) {
        int Val = PoisonMaskElem;
        unsigned UndefCnt = 0;
        unsigned Limit = std::min(Sz, VF - I);
        if (any_of(ArrayRef(ReorderMask).slice(I, Limit),
                   [&](int Idx) {
                     if (Val == PoisonMaskElem && Idx != PoisonMaskElem)
                       Val = Idx;
                     if (Idx == PoisonMaskElem)
                       ++UndefCnt;
                     return Idx != PoisonMaskElem && Idx != Val;
                   }) ||
            Val >= static_cast<int>(NumParts) || UsedVals.test(Val) ||
            UndefCnt > Sz / 2)
          return std::nullopt;
        UsedVals.set(Val);
        for (unsigned K = 0; K < NumParts; ++K) {
          unsigned Idx = Val + Sz * K;
          if (Idx < VF && I + K < VF)
            ResOrder[Idx] = I + K;
        }
      }
      return std::move(ResOrder);
    }
    unsigned VF = TE.getVectorFactor();
    // Try build correct order for extractelement instructions.
    SmallVector<int> ReusedMask(TE.ReuseShuffleIndices.begin(),
                                TE.ReuseShuffleIndices.end());
    if (TE.hasState() && TE.getOpcode() == Instruction::ExtractElement &&
        all_of(TE.Scalars, [Sz](Value *V) {
          if (isa<PoisonValue>(V))
            return true;
          std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V));
          return Idx && *Idx < Sz;
        })) {
      assert(!TE.isAltShuffle() && "Alternate instructions are only supported "
                                   "by BinaryOperator and CastInst.");
      SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
      if (TE.ReorderIndices.empty())
        std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
      else
        inversePermutation(TE.ReorderIndices, ReorderMask);
      for (unsigned I = 0; I < VF; ++I) {
        int &Idx = ReusedMask[I];
        if (Idx == PoisonMaskElem)
          continue;
        Value *V = TE.Scalars[ReorderMask[Idx]];
        std::optional<unsigned> EI = getExtractIndex(cast<Instruction>(V));
        Idx = std::distance(ReorderMask.begin(), find(ReorderMask, *EI));
      }
    }
    // Build the order of the VF size, need to reorder reuses shuffles, they
    // are always of VF size.
    OrdersType ResOrder(VF);
    std::iota(ResOrder.begin(), ResOrder.end(), 0);
    auto *It = ResOrder.begin();
    for (unsigned K = 0; K < VF; K += Sz) {
      OrdersType CurrentOrder(TE.ReorderIndices);
      SmallVector<int> SubMask{ArrayRef(ReusedMask).slice(K, Sz)};
      if (SubMask.front() == PoisonMaskElem)
        std::iota(SubMask.begin(), SubMask.end(), 0);
      reorderOrder(CurrentOrder, SubMask);
      transform(CurrentOrder, It, [K](unsigned Pos) { return Pos + K; });
      std::advance(It, Sz);
    }
    if (TE.isGather() && all_of(enumerate(ResOrder), [](const auto &Data) {
          return Data.index() == Data.value();
        }))
      return std::nullopt; // No need to reorder.
    return std::move(ResOrder);
  }
  if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
      (!TE.UserTreeIndex || !TE.UserTreeIndex.UserTE->hasState() ||
       !Instruction::isBinaryOp(TE.UserTreeIndex.UserTE->getOpcode())) &&
      (TE.ReorderIndices.empty() || isReverseOrder(TE.ReorderIndices)))
    return std::nullopt;
  if (TE.State == TreeEntry::SplitVectorize ||
      ((TE.State == TreeEntry::Vectorize ||
        TE.State == TreeEntry::StridedVectorize ||
        TE.State == TreeEntry::CompressVectorize) &&
       (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
        (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp()))))) {
    assert((TE.State == TreeEntry::SplitVectorize || !TE.isAltShuffle()) &&
           "Alternate instructions are only supported by "
           "BinaryOperator and CastInst.");
    return TE.ReorderIndices;
  }
  if (!TopToBottom && IgnoreReorder && TE.State == TreeEntry::Vectorize &&
      TE.isAltShuffle()) {
    assert(TE.ReuseShuffleIndices.empty() &&
           "ReuseShuffleIndices should be "
           "empty for alternate instructions.");
    SmallVector<int> Mask;
    TE.buildAltOpShuffleMask(
        [&](Instruction *I) {
          assert(TE.getMatchingMainOpOrAltOp(I) &&
                 "Unexpected main/alternate opcode");
          return isAlternateInstruction(I, TE.getMainOp(), TE.getAltOp(),
                                        *TLI);
        },
        Mask);
    const int VF = TE.getVectorFactor();
    OrdersType ResOrder(VF, VF);
    for (int I : seq<int>(VF)) {
      if (Mask[I] != PoisonMaskElem)
        ResOrder[Mask[I] % VF] = I;
    }
    return std::move(ResOrder);
  }
  if (!TE.ReorderIndices.empty())
    return TE.ReorderIndices;
  if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
    if (!TE.ReorderIndices.empty())
      return TE.ReorderIndices;

    SmallVector<Instruction *> UserBVHead(TE.Scalars.size());
    for (auto [I, V] : zip(UserBVHead, TE.Scalars)) {
      if (!V->hasNUsesOrMore(1))
        continue;
      auto *II = dyn_cast<InsertElementInst>(*V->user_begin());
      if (!II)
        continue;
      Instruction *BVHead = nullptr;
      BasicBlock *BB = II->getParent();
      while (II && II->hasOneUse() && II->getParent() == BB) {
        BVHead = II;
        II = dyn_cast<InsertElementInst>(*II->user_begin());
      }
      I = BVHead;
    }

    auto CompareByBasicBlocks = [&](BasicBlock *BB1, BasicBlock *BB2) {
      assert(BB1 != BB2 && "Expected different basic blocks.");
      if (!DT->isReachableFromEntry(BB1))
        return false;
      if (!DT->isReachableFromEntry(BB2))
        return true;
      auto *NodeA = DT->getNode(BB1);
      auto *NodeB = DT->getNode(BB2);
      assert(NodeA && "Should only process reachable instructions");
      assert(NodeB && "Should only process reachable instructions");
      assert((NodeA == NodeB) ==
                 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
             "Different nodes should have different DFS numbers");
      return NodeA->getDFSNumIn() < NodeB->getDFSNumIn();
    };
    auto PHICompare = [&](unsigned I1, unsigned I2) {
      Value *V1 = TE.Scalars[I1];
      Value *V2 = TE.Scalars[I2];
      if (V1 == V2 || (V1->use_empty() && V2->use_empty()))
        return false;
      if (isa<PoisonValue>(V1))
        return true;
      if (isa<PoisonValue>(V2))
        return false;
      if (V1->use_empty())
        return false;
      if (V2->use_empty())
        return true;
      auto *FirstUserOfPhi1 = cast<Instruction>(*V1->user_begin());
      auto *FirstUserOfPhi2 = cast<Instruction>(*V2->user_begin());
      if (FirstUserOfPhi1->getParent() != FirstUserOfPhi2->getParent())
        return CompareByBasicBlocks(FirstUserOfPhi1->getParent(),
                                    FirstUserOfPhi2->getParent());
      if (auto *IE1 = dyn_cast<InsertElementInst>(FirstUserOfPhi1)) {
        auto *IE2 = dyn_cast<InsertElementInst>(FirstUserOfPhi2);
        if (!IE2)
          return true;
        if (UserBVHead[I1] && !UserBVHead[I2])
          return true;
        if (!UserBVHead[I1])
          return false;
        if (UserBVHead[I1] == UserBVHead[I2])
          return getElementIndex(IE1) < getElementIndex(IE2);
        if (UserBVHead[I1]->getParent() != UserBVHead[I2]->getParent())
          return CompareByBasicBlocks(UserBVHead[I1]->getParent(),
                                      UserBVHead[I2]->getParent());
        return UserBVHead[I1]->comesBefore(UserBVHead[I2]);
      }
      if (auto *EE1 = dyn_cast<ExtractElementInst>(FirstUserOfPhi1)) {
        auto *EE2 = dyn_cast<ExtractElementInst>(FirstUserOfPhi2);
        if (!EE2)
          return true;
        if (EE1->getOperand(0) == EE2->getOperand(0))
          return getElementIndex(EE1) < getElementIndex(EE2);
        auto *Inst1 = dyn_cast<Instruction>(EE1->getOperand(0));
        auto *Inst2 = dyn_cast<Instruction>(EE2->getOperand(0));
        if (!Inst1 && Inst2)
          return false;
        if (Inst1 && Inst2) {
          if (Inst1->getParent() != Inst2->getParent())
            return CompareByBasicBlocks(Inst1->getParent(),
                                        Inst2->getParent());
          return Inst1->comesBefore(Inst2);
        }
        auto *P1 = dyn_cast<Argument>(EE1->getOperand(0));
        auto *P2 = dyn_cast<Argument>(EE2->getOperand(0));
        assert(P1 && P2 &&
               "Expected either instructions or arguments vector operands.");
        return P1->getArgNo() < P2->getArgNo();
      }
      return false;
    };
    OrdersType Phis(TE.Scalars.size());
    std::iota(Phis.begin(), Phis.end(), 0);
    stable_sort(Phis, PHICompare);
    if (isIdentityOrder(Phis))
      return std::nullopt; // No need to reorder.
    return std::move(Phis);
  }
  if (TE.isGather() &&
      (!TE.hasState() || !TE.isAltShuffle() ||
       ScalarsInSplitNodes.contains(TE.getMainOp())) &&
      allSameType(TE.Scalars)) {
    // TODO: add analysis of other gather nodes with extractelement
    // instructions and other values/instructions, not only undefs.
    if (((TE.hasState() && TE.getOpcode() == Instruction::ExtractElement) ||
         all_of(TE.Scalars, IsaPred<UndefValue, ExtractElementInst>)) &&
        all_of(TE.Scalars, [](Value *V) {
          auto *EE = dyn_cast<ExtractElementInst>(V);
          return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
        })) {
      // Check that gather of extractelements can be represented as just a
      // shuffle of a single/two vectors the scalars are extracted from.
      OrdersType CurrentOrder;
      bool Reuse =
          canReuseExtract(TE.Scalars, CurrentOrder, /*ResizeAllowed=*/true);
      if (Reuse || !CurrentOrder.empty())
        return std::move(CurrentOrder);
    }
    // If the gather node is <undef, v, .., poison> and
    // insertelement poison, v, 0 [+ permute]
    // is cheaper than
    // insertelement poison, v, n - try to reorder.
    // If rotating the whole graph, exclude the permute cost, the whole graph
    // might be transformed.
    int Sz = TE.Scalars.size();
    if (isSplat(TE.Scalars) && !allConstant(TE.Scalars) &&
        count_if(TE.Scalars, IsaPred<UndefValue>) == Sz - 1) {
      const auto *It =
          find_if(TE.Scalars, [](Value *V) { return !isConstant(V); });
      if (It == TE.Scalars.begin())
        return OrdersType();
      auto *Ty = getWidenedType(TE.Scalars.front()->getType(), Sz);
      if (It != TE.Scalars.end()) {
        OrdersType Order(Sz, Sz);
        unsigned Idx = std::distance(TE.Scalars.begin(), It);
        Order[Idx] = 0;
        fixupOrderingIndices(Order);
        SmallVector<int> Mask;
        inversePermutation(Order, Mask);
        InstructionCost PermuteCost =
            TopToBottom
                ? 0
                : ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, Ty, Ty,
                                   Mask);
        InstructionCost InsertFirstCost = TTI->getVectorInstrCost(
            Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, 0,
            PoisonValue::get(Ty), *It);
        InstructionCost InsertIdxCost = TTI->getVectorInstrCost(
            Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, Idx,
            PoisonValue::get(Ty), *It);
        if (InsertFirstCost + PermuteCost < InsertIdxCost) {
          OrdersType Order(Sz, Sz);
          Order[Idx] = 0;
          return std::move(Order);
        }
      }
    }
    if (isSplat(TE.Scalars))
      return std::nullopt;
    if (TE.Scalars.size() >= 3)
      if (std::optional<OrdersType> Order = findPartiallyOrderedLoads(TE))
        return Order;
    // Check if can include the order of vectorized loads. For masked gathers
    // do extra analysis later, so include such nodes into a special list.
    if (TE.hasState() && TE.getOpcode() == Instruction::Load) {
      SmallVector<Value *> PointerOps;
      StridedPtrInfo SPtrInfo;
      OrdersType CurrentOrder;
      LoadsState Res =
          canVectorizeLoads(TE.Scalars, TE.Scalars.front(), CurrentOrder,
                            PointerOps, SPtrInfo);
      if (Res == LoadsState::Vectorize ||
          Res == LoadsState::StridedVectorize ||
          Res == LoadsState::CompressVectorize)
        return std::move(CurrentOrder);
    }
    // FIXME: Remove the non-power-of-two check once findReusedOrderedScalars
    // has been audited for correctness with non-power-of-two vectors.
    if (!VectorizeNonPowerOf2 && TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI))
      return std::nullopt;
    if (std::optional<OrdersType> CurrentOrder =
            findReusedOrderedScalars(TE, TopToBottom, IgnoreReorder))
      return CurrentOrder;
  }
  return std::nullopt;
}
/// Checks if the given mask is a "clustered" mask with the same clusters of
/// size \p Sz, which are not identity submasks.
static bool isRepeatedNonIdentityClusteredMask(ArrayRef<int> Mask,
                                               unsigned Sz) {
  ArrayRef<int> FirstCluster = Mask.slice(0, Sz);
  if (ShuffleVectorInst::isIdentityMask(FirstCluster, Sz))
    return false;
  for (unsigned I = Sz, E = Mask.size(); I < E; I += Sz) {
    ArrayRef<int> Cluster = Mask.slice(I, Sz);
    if (Cluster != FirstCluster)
      return false;
  }
  return true;
}

void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const {
  // Reorder reuses mask.
  reorderReuses(TE.ReuseShuffleIndices, Mask);
  const unsigned Sz = TE.Scalars.size();
  // For vectorized and non-clustered reuses no need to do anything else.
  if (!TE.isGather() ||
      !ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
                                                   Sz) ||
      !isRepeatedNonIdentityClusteredMask(TE.ReuseShuffleIndices, Sz))
    return;
  SmallVector<int> NewMask;
  inversePermutation(TE.ReorderIndices, NewMask);
  addMask(NewMask, TE.ReuseShuffleIndices);
  // Clear reorder since it is going to be applied to the new mask.
  TE.ReorderIndices.clear();
  // Try to improve gathered nodes with clustered reuses, if possible.
  ArrayRef<int> Slice = ArrayRef(NewMask).slice(0, Sz);
  SmallVector<unsigned> NewOrder(Slice);
  inversePermutation(NewOrder, NewMask);
  reorderScalars(TE.Scalars, NewMask);
  // Fill the reuses mask with the identity submasks.
  for (auto *It = TE.ReuseShuffleIndices.begin(),
            *End = TE.ReuseShuffleIndices.end();
       It != End; std::advance(It, Sz))
    std::iota(It, std::next(It, Sz), 0);
}
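// For example, with Sz == 4 the mask <1, 0, 3, 2, 1, 0, 3, 2> is a repeated
// non-identity clustered mask: both 4-element clusters are equal and the
// first one is not the identity. The mask <0, 1, 2, 3, 0, 1, 2, 3> is
// rejected because its first cluster is an identity submask, and
// <1, 0, 3, 2, 0, 1, 2, 3> is rejected because the clusters differ. Only in
// the accepted case is it profitable to reorder the gathered scalars once
// and reset the reuses mask to identity submasks.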
8030 "Expected same size of orders");
8031 size_t Sz = Order.
size();
8034 if (Order[Idx] != Sz)
8035 UsedIndices.
set(Order[Idx]);
8037 if (SecondaryOrder.
empty()) {
8039 if (Order[Idx] == Sz && !UsedIndices.
test(Idx))
8043 if (SecondaryOrder[Idx] != Sz && Order[Idx] == Sz &&
8044 !UsedIndices.
test(SecondaryOrder[Idx]))
8045 Order[Idx] = SecondaryOrder[Idx];
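// For example, with Sz == 4 (so 4 marks an unset slot), combining
// Order = {2, 4, 1, 4} with SecondaryOrder = {2, 0, 1, 3} fills slot 1 with
// 0 and slot 3 with 3 (both destinations unused so far), giving
// {2, 0, 1, 3}. With an empty SecondaryOrder the unset slots are filled with
// their own indices where that index is still unused: {2, 4, 1, 4} becomes
// {2, 4, 1, 3}, since destination 1 is already taken.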
bool BoUpSLP::isProfitableToReorder() const {
  constexpr unsigned TinyVF = 2;
  constexpr unsigned TinyTree = 10;
  constexpr unsigned PhiOpsLimit = 12;
  constexpr unsigned GatherLoadsLimit = 2;
  if (VectorizableTree.size() <= TinyTree)
    return true;
  if (VectorizableTree.front()->hasState() &&
      !VectorizableTree.front()->isGather() &&
      (VectorizableTree.front()->getOpcode() == Instruction::Store ||
       VectorizableTree.front()->getOpcode() == Instruction::PHI ||
       (VectorizableTree.front()->getVectorFactor() <= TinyVF &&
        (VectorizableTree.front()->getOpcode() == Instruction::PtrToInt ||
         VectorizableTree.front()->getOpcode() == Instruction::ICmp))) &&
      VectorizableTree.front()->ReorderIndices.empty()) {
    // Check if the tree has only single store and single (unordered) load
    // node, other nodes are phis or geps/binops, combined with phis, and/or
    // single gather load node.
    if (VectorizableTree.front()->hasState() &&
        VectorizableTree.front()->getOpcode() == Instruction::PHI &&
        VectorizableTree.front()->Scalars.size() == TinyVF &&
        VectorizableTree.front()->getNumOperands() > PhiOpsLimit)
      return false;
    // Check the tree: if it has only a single store node, better to keep it
    // as is; it is profitable to reorder only if the stores are reordered.
    if (VectorizableTree.front()->hasState() &&
        VectorizableTree.front()->getOpcode() == Instruction::Store &&
        VectorizableTree.front()->ReorderIndices.empty()) {
      const unsigned ReorderedSplitsCnt =
          count_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
            return TE->State == TreeEntry::SplitVectorize &&
                   !TE->ReorderIndices.empty() && TE->UserTreeIndex.UserTE &&
                   TE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize;
          });
      if (ReorderedSplitsCnt <= 1 &&
          static_cast<unsigned>(count_if(
              VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
                return ((!TE->isGather() &&
                         (TE->ReorderIndices.empty() ||
                          (TE->UserTreeIndex.UserTE &&
                           TE->UserTreeIndex.UserTE->State ==
                               TreeEntry::Vectorize &&
                           !TE->UserTreeIndex.UserTE->ReuseShuffleIndices
                                .empty()))) ||
                        (TE->isGather() && TE->ReorderIndices.empty() &&
                         (!TE->hasState() || TE->isAltShuffle() ||
                          TE->getOpcode() == Instruction::Load ||
                          TE->getOpcode() == Instruction::ZExt ||
                          TE->getOpcode() == Instruction::SExt))) &&
                       (VectorizableTree.front()->getVectorFactor() > TinyVF ||
                        !TE->isGather() ||
                        none_of(TE->Scalars, [&](Value *V) {
                          return !isConstant(V) && isVectorized(V);
                        }));
              })) >= VectorizableTree.size() - ReorderedSplitsCnt)
        return false;
    }
    bool HasPhis = false;
    bool HasLoad = true;
    unsigned GatherLoads = 0;
    for (const std::unique_ptr<TreeEntry> &TE :
         ArrayRef(VectorizableTree).drop_front()) {
      if (TE->State == TreeEntry::SplitVectorize)
        continue;
      if (!TE->hasState()) {
        if (all_of(TE->Scalars, IsaPred<Constant, PHINode>) ||
            all_of(TE->Scalars, IsaPred<BinaryOperator, PHINode>))
          continue;
        if (VectorizableTree.front()->Scalars.size() == TinyVF &&
            any_of(TE->Scalars, IsaPred<PHINode, GEPOperator>))
          return false;
        continue;
      }
      if (TE->getOpcode() == Instruction::Load && TE->ReorderIndices.empty()) {
        if (!TE->isGather()) {
          HasLoad = true;
          continue;
        }
        ++GatherLoads;
        if (GatherLoads >= GatherLoadsLimit)
          return false;
      }
      if (TE->getOpcode() == Instruction::GetElementPtr ||
          Instruction::isBinaryOp(TE->getOpcode()))
        continue;
      if (TE->getOpcode() != Instruction::PHI &&
          (!TE->hasCopyableElements() ||
           static_cast<unsigned>(count_if(TE->Scalars, IsaPred<PHINode>)) <
               TE->Scalars.size() / 2))
        return true;
      if (VectorizableTree.front()->Scalars.size() == TinyVF &&
          TE->getNumOperands() > PhiOpsLimit)
        return false;
      HasPhis = true;
    }
    return !HasPhis || HasLoad;
  }
  return true;
}
void BoUpSLP::TreeEntry::reorderSplitNode(unsigned Idx, ArrayRef<int> Mask,
                                          ArrayRef<int> MaskOrder) {
  assert(State == TreeEntry::SplitVectorize && "Expected split user node.");
  SmallVector<int> NewMask(getVectorFactor());
  SmallVector<int> NewMaskOrder(getVectorFactor());
  std::iota(NewMask.begin(), NewMask.end(), 0);
  std::iota(NewMaskOrder.begin(), NewMaskOrder.end(), 0);
  if (Idx == 0) {
    copy(Mask, NewMask.begin());
    copy(MaskOrder, NewMaskOrder.begin());
  } else {
    assert(Idx == 1 && "Expected either 0 or 1 index.");
    unsigned Offset = CombinedEntriesWithIndices.back().second;
    for (unsigned I : seq<unsigned>(Mask.size())) {
      NewMask[I + Offset] = Mask[I] + Offset;
      NewMaskOrder[I + Offset] = MaskOrder[I] + Offset;
    }
  }
  reorderScalars(Scalars, NewMask);
  reorderOrder(ReorderIndices, NewMaskOrder, /*BottomOrder=*/true);
  if (!ReorderIndices.empty() && BoUpSLP::isIdentityOrder(ReorderIndices))
    ReorderIndices.clear();
}
void BoUpSLP::reorderTopToBottom() {
  // Maps VF to the graph nodes.
  DenseMap<unsigned, SmallSetVector<TreeEntry *, 4>> VFToOrderedEntries;
  // Gather nodes which can be vectorized and need to handle their ordering.
  DenseMap<const TreeEntry *, OrdersType> GathersToOrders;
  // Phi nodes can have preferred ordering based on their result users.
  DenseMap<const TreeEntry *, OrdersType> PhisToOrders;
  // AltShuffles can also have a preferred ordering that leads to fewer
  // instructions, e.g., the addsub instruction on x86.
  DenseMap<const TreeEntry *, OrdersType> AltShufflesToOrders;
  // Maps a TreeEntry to the reorder indices of its external users.
  DenseMap<const TreeEntry *, SmallVector<OrdersType, 1>>
      ExternalUserReorderMap;
  // Find all reorderable nodes with the given VF.
  for_each(VectorizableTree, [&, &TTIRef = *TTI](
                                 const std::unique_ptr<TreeEntry> &TE) {
    // Look for external users that will probably be vectorized.
    SmallVector<OrdersType, 1> ExternalUserReorderIndices =
        findExternalStoreUsersReorderIndices(TE.get());
    if (!ExternalUserReorderIndices.empty()) {
      VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
      ExternalUserReorderMap.try_emplace(
          TE.get(), std::move(ExternalUserReorderIndices));
    }

    // Patterns like [fadd,fsub] can be combined into a single instruction on
    // some targets. Reordering them blocks the pattern, so take the order of
    // such nodes into account.
    if (TE->hasState() && TE->isAltShuffle() &&
        TE->State != TreeEntry::SplitVectorize) {
      Type *ScalarTy = TE->Scalars[0]->getType();
      VectorType *VecTy = getWidenedType(ScalarTy, TE->Scalars.size());
      unsigned Opcode0 = TE->getOpcode();
      unsigned Opcode1 = TE->getAltOpcode();
      SmallBitVector OpcodeMask(
          getAltInstrMask(TE->Scalars, ScalarTy, Opcode0, Opcode1));
      // If this pattern is supported by the target then consider its order.
      if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
        VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
        AltShufflesToOrders.try_emplace(TE.get(), OrdersType());
      }
    }
    bool IgnoreReorder =
        !UserIgnoreList && VectorizableTree.front()->hasState() &&
        (VectorizableTree.front()->getOpcode() == Instruction::InsertElement ||
         VectorizableTree.front()->getOpcode() == Instruction::Store);
    if (std::optional<OrdersType> CurrentOrder =
            getReorderingData(*TE, /*TopToBottom=*/true, IgnoreReorder)) {
      // Do not reorder nodes used in alternate opcode vectorization here, it
      // is better to do it during the bottom-to-top stage.
      const TreeEntry *UserTE = TE.get();
      while (UserTE) {
        if (!UserTE->UserTreeIndex)
          break;
        if (UserTE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
            UserTE->UserTreeIndex.UserTE->isAltShuffle() &&
            UserTE->UserTreeIndex.UserTE->Idx != 0)
          return;
        UserTE = UserTE->UserTreeIndex.UserTE;
      }
      VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
      if (!(TE->State == TreeEntry::Vectorize ||
            TE->State == TreeEntry::StridedVectorize ||
            TE->State == TreeEntry::SplitVectorize ||
            TE->State == TreeEntry::CompressVectorize) ||
          !TE->ReuseShuffleIndices.empty())
        GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
      if (TE->State == TreeEntry::Vectorize &&
          TE->getOpcode() == Instruction::PHI)
        PhisToOrders.try_emplace(TE.get(), *CurrentOrder);
    }
  });

  // Reorder the graph nodes according to their vectorization factor.
  for (unsigned VF = VectorizableTree.front()->getVectorFactor();
       !VFToOrderedEntries.empty() && VF > 1; VF -= 2 - (VF & 1U)) {
    auto It = VFToOrderedEntries.find(VF);
    if (It == VFToOrderedEntries.end())
      continue;
    // Try to find the most profitable order: the most used order wins, and
    // scalar elements in the nodes are reordered according to it.
    ArrayRef<TreeEntry *> OrderedEntries = It->second.getArrayRef();
    MapVector<OrdersType, unsigned> OrdersUses;
    for (const TreeEntry *OpTE : OrderedEntries) {
      // No need to reorder these nodes; just merge the reordering shuffle and
      // the reuse shuffle.
      if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE) &&
          OpTE->State != TreeEntry::SplitVectorize)
        continue;
      // Count number of order uses.
      const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
                           &PhisToOrders]() -> const OrdersType & {
        if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty()) {
          auto It = GathersToOrders.find(OpTE);
          if (It != GathersToOrders.end())
            return It->second;
        }
        if (OpTE->hasState() && OpTE->isAltShuffle()) {
          auto It = AltShufflesToOrders.find(OpTE);
          if (It != AltShufflesToOrders.end())
            return It->second;
        }
        if (OpTE->State == TreeEntry::Vectorize &&
            OpTE->getOpcode() == Instruction::PHI) {
          auto It = PhisToOrders.find(OpTE);
          if (It != PhisToOrders.end())
            return It->second;
        }
        return OpTE->ReorderIndices;
      }();
      // First consider the order of the external scalar users.
      auto It = ExternalUserReorderMap.find(OpTE);
      if (It != ExternalUserReorderMap.end()) {
        const auto &ExternalUserReorderIndices = It->second;
        // If the vector factor differs from the number of scalars, use the
        // natural order: this is a node with reused scalars but external uses.
        if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
          OrdersUses.try_emplace(OrdersType(), 0).first->second +=
              ExternalUserReorderIndices.size();
        } else {
          for (const OrdersType &ExtOrder : ExternalUserReorderIndices)
            ++OrdersUses.try_emplace(ExtOrder, 0).first->second;
        }
      }
      // Stores actually store the mask, not the order; need to invert.
      if (OpTE->State == TreeEntry::Vectorize &&
          OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
        assert(!OpTE->isAltShuffle() &&
               "Alternate instructions are only supported by BinaryOperator "
               "and CastInst.");
        SmallVector<int> Mask;
        inversePermutation(Order, Mask);
        unsigned E = Order.size();
        OrdersType CurrentOrder(E, E);
        transform(Mask, CurrentOrder.begin(), [E](int Idx) {
          return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
        });
        ++OrdersUses.try_emplace(CurrentOrder, 0).first->second;
      } else {
        ++OrdersUses.try_emplace(Order, 0).first->second;
      }
    }
    if (OrdersUses.empty())
      continue;
    // Choose the most used order.
    unsigned IdentityCnt = 0;
    unsigned FilledIdentityCnt = 0;
    OrdersType IdentityOrder(VF, VF);
    for (auto &Pair : OrdersUses) {
      if (Pair.first.empty() || isIdentityOrder(Pair.first)) {
        if (!Pair.first.empty())
          FilledIdentityCnt += Pair.second;
        IdentityCnt += Pair.second;
        combineOrders(IdentityOrder, Pair.first);
      }
    }
    MutableArrayRef<unsigned> BestOrder = IdentityOrder;
    unsigned Cnt = IdentityCnt;
    for (auto &Pair : OrdersUses) {
      // Prefer identity order. But if a filled identity was found (non-empty
      // order with the same number of elements), choose the non-identity
      // order, if any.
      if (Cnt < Pair.second ||
          (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt &&
           Cnt == Pair.second && !BestOrder.empty() &&
           isIdentityOrder(BestOrder))) {
        BestOrder = Pair.first;
        Cnt = Pair.second;
      } else {
        combineOrders(BestOrder, Pair.first);
      }
    }
    // Set order of the user node.
    if (isIdentityOrder(BestOrder))
      continue;
    fixupOrderingIndices(BestOrder);
    SmallVector<int> Mask;
    inversePermutation(BestOrder, Mask);
    SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
    unsigned E = BestOrder.size();
    transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
      return I < E ? static_cast<int>(I) : PoisonMaskElem;
    });
    // Do the actual reordering for the nodes with the given VF.
    for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
      if (TE->Scalars.size() != VF) {
        if (TE->ReuseShuffleIndices.size() == VF) {
          assert(TE->State != TreeEntry::SplitVectorize &&
                 "Split vectorized not expected.");
          // Reorder the reuses masks of operands with smaller VF so scalar
          // operands can still be matched during cost estimation.
          assert((!TE->UserTreeIndex ||
                  TE->UserTreeIndex.UserTE->Scalars.size() == VF ||
                  TE->UserTreeIndex.UserTE->Scalars.size() ==
                      TE->Scalars.size() ||
                  TE->UserTreeIndex.UserTE->State ==
                      TreeEntry::SplitVectorize) &&
                 "All users must be of VF size.");
          if (TE->UserTreeIndex && TE->UserTreeIndex.UserTE->hasState() &&
              isa<ShuffleVectorInst>(TE->UserTreeIndex.UserTE->getMainOp()))
            continue;
          // Update ordering of the operands with the smaller VF.
          reorderNodeWithReuses(*TE, Mask);
          // Update orders in user split vectorize nodes.
          if (TE->UserTreeIndex &&
              TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize)
            TE->UserTreeIndex.UserTE->reorderSplitNode(
                TE->UserTreeIndex.EdgeIdx, Mask, MaskOrder);
        }
        continue;
      }
      if ((TE->State == TreeEntry::SplitVectorize &&
           TE->ReuseShuffleIndices.empty()) ||
          ((TE->State == TreeEntry::Vectorize ||
            TE->State == TreeEntry::StridedVectorize ||
            TE->State == TreeEntry::CompressVectorize) &&
           isa<ExtractElementInst, ExtractValueInst, LoadInst, StoreInst,
               InsertElementInst>(TE->getMainOp()))) {
        assert(
            (!TE->isAltShuffle() || (TE->State == TreeEntry::SplitVectorize &&
                                     TE->ReuseShuffleIndices.empty())) &&
            "Alternate instructions are only supported by BinaryOperator "
            "and CastInst.");
        // Build correct orders for extract{element,value}, loads and stores.
        reorderOrder(TE->ReorderIndices, Mask);
        if (isa<InsertElementInst, StoreInst>(TE->getMainOp()))
          TE->reorderOperands(Mask);
      } else {
        // Reorder the node and its operands.
        TE->reorderOperands(Mask);
        assert(TE->ReorderIndices.empty() &&
               "Expected empty reorder sequence.");
        reorderScalars(TE->Scalars, Mask);
      }
      if (!TE->ReuseShuffleIndices.empty()) {
        // Apply reversed order to keep the original ordering of the reused
        // elements and avoid extra reorder indices shuffling.
        OrdersType CurrentOrder;
        reorderOrder(CurrentOrder, MaskOrder);
        SmallVector<int> NewReuses;
        inversePermutation(CurrentOrder, NewReuses);
        addMask(NewReuses, TE->ReuseShuffleIndices);
        TE->ReuseShuffleIndices.swap(NewReuses);
      } else if (TE->UserTreeIndex &&
                 TE->UserTreeIndex.UserTE->State ==
                     TreeEntry::SplitVectorize) {
        // Update orders in user split vectorize nodes.
        TE->UserTreeIndex.UserTE->reorderSplitNode(TE->UserTreeIndex.EdgeIdx,
                                                   Mask, MaskOrder);
      }
    }
  }
}
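// Collects the operand entries of UserTE that may still be reordered:
// already-vectorized operands go into Edges, reorderable gather operands into
// GatherOps. Operands that cannot be reordered (extract lanes, the
// insertelement vector operand, store pointers, load addresses) are skipped.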
void BoUpSLP::buildReorderableOperands(
    TreeEntry *UserTE,
    SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
    const SmallPtrSetImpl<const TreeEntry *> &ReorderableGathers,
    SmallVectorImpl<TreeEntry *> &GatherOps) {
  for (unsigned I : seq<unsigned>(UserTE->getNumOperands())) {
    if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) {
          return OpData.first == I &&
                 (OpData.second->State == TreeEntry::Vectorize ||
                  OpData.second->State == TreeEntry::StridedVectorize ||
                  OpData.second->State == TreeEntry::CompressVectorize ||
                  OpData.second->State == TreeEntry::SplitVectorize);
        }))
      continue;
    // Do not request operands that do not really exist as separate nodes.
    if (UserTE->hasState()) {
      if (UserTE->getOpcode() == Instruction::ExtractElement ||
          UserTE->getOpcode() == Instruction::ExtractValue)
        continue;
      if (UserTE->getOpcode() == Instruction::InsertElement && I == 0)
        continue;
      if (UserTE->getOpcode() == Instruction::Store &&
          UserTE->State == TreeEntry::Vectorize && I == 1)
        continue;
      if (UserTE->getOpcode() == Instruction::Load &&
          (UserTE->State == TreeEntry::Vectorize ||
           UserTE->State == TreeEntry::StridedVectorize ||
           UserTE->State == TreeEntry::CompressVectorize))
        continue;
    }
    TreeEntry *TE = getOperandEntry(UserTE, I);
    assert(TE && "Expected operand entry.");
    if (!TE->isGather()) {
      // Add the node to the list of the ordered nodes with the identity
      // order.
      Edges.emplace_back(I, TE);
      // Add ScatterVectorize nodes to the list of operands, where just
      // reordering of the scalars is required.
      if (TE->State == TreeEntry::ScatterVectorize &&
          TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
        GatherOps.push_back(TE);
      continue;
    }
    if (ReorderableGathers.contains(TE))
      GatherOps.push_back(TE);
  }
}
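// Bottom-to-top reordering pass: starting from the leaves, groups entries by
// their immediate user node, votes on the most frequently required order
// among the operands, and pushes the winning order up through the users so
// that shuffles sink toward the roots, where they are cheaper to combine.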
void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
  struct TreeEntryCompare {
    bool operator()(const TreeEntry *LHS, const TreeEntry *RHS) const {
      if (LHS->UserTreeIndex && RHS->UserTreeIndex)
        return LHS->UserTreeIndex.UserTE->Idx <
               RHS->UserTreeIndex.UserTE->Idx;
      return LHS->Idx < RHS->Idx;
    }
  };
  PriorityQueue<TreeEntry *, SmallVector<TreeEntry *>, TreeEntryCompare>
      Queue;
  // Find all reorderable leaf nodes: vectorized loads/extracts plus some
  // gathers whose order is recoverable.
  DenseSet<const TreeEntry *> GathersToOrders;
  SmallPtrSet<const TreeEntry *, 4> NonVectorized;
  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    if (TE->State != TreeEntry::Vectorize &&
        TE->State != TreeEntry::StridedVectorize &&
        TE->State != TreeEntry::CompressVectorize &&
        TE->State != TreeEntry::SplitVectorize)
      NonVectorized.insert(TE.get());
    if (std::optional<OrdersType> CurrentOrder =
            getReorderingData(*TE, /*TopToBottom=*/false, IgnoreReorder)) {
      Queue.push(TE.get());
      if (!(TE->State == TreeEntry::Vectorize ||
            TE->State == TreeEntry::StridedVectorize ||
            TE->State == TreeEntry::CompressVectorize ||
            TE->State == TreeEntry::SplitVectorize) ||
          !TE->ReuseShuffleIndices.empty())
        GathersToOrders.insert(TE.get());
    }
  }

  // Propagate the order to the graph nodes that use only reordered nodes.
  SmallPtrSet<const TreeEntry *, 4> Visited, RevisitedOps;
  while (!Queue.empty()) {
    std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>>
        Users;
    TreeEntry *TE = Queue.top();
    const TreeEntry *UserTE = TE->UserTreeIndex.UserTE;
    // Pop all entries sharing the same immediate user.
    SmallVector<TreeEntry *> OrderedOps;
    while (!Queue.empty()) {
      TE = Queue.top();
      if (!UserTE || UserTE != TE->UserTreeIndex.UserTE)
        break;
      Queue.pop();
      OrderedOps.push_back(TE);
    }
    if (OrderedOps.empty()) {
      Queue.pop();
      continue;
    }
    for (TreeEntry *TE : OrderedOps) {
      if (!(TE->State == TreeEntry::Vectorize ||
            TE->State == TreeEntry::StridedVectorize ||
            TE->State == TreeEntry::CompressVectorize ||
            TE->State == TreeEntry::SplitVectorize ||
            (TE->isGather() && GathersToOrders.contains(TE))) ||
          !TE->UserTreeIndex || !TE->ReuseShuffleIndices.empty() ||
          !Visited.insert(TE).second)
        continue;
      // Build a map between user nodes and their operands order to speed up
      // the search.
      Users.first = TE->UserTreeIndex.UserTE;
      Users.second.emplace_back(TE->UserTreeIndex.EdgeIdx, TE);
    }
    if (!Users.first)
      continue;
    auto &Data = Users;
    if (Data.first->State == TreeEntry::SplitVectorize) {
      assert(
          Data.second.size() <= 2 &&
          "Expected not greater than 2 operands for split vectorize node.");
      if (any_of(Data.second,
                 [](const auto &Op) { return !Op.second->UserTreeIndex; }))
        continue;
      // Update orders in user split vectorize nodes.
      assert(Data.first->CombinedEntriesWithIndices.size() == 2 &&
             "Expected exactly 2 entries.");
      for (const auto &P : Data.first->CombinedEntriesWithIndices) {
        TreeEntry &OpTE = *VectorizableTree[P.first];
        OrdersType Order = OpTE.ReorderIndices;
        if (Order.empty() || !OpTE.ReuseShuffleIndices.empty()) {
          if (!OpTE.isGather() && OpTE.ReuseShuffleIndices.empty())
            continue;
          const auto BestOrder =
              getReorderingData(OpTE, /*TopToBottom=*/false, IgnoreReorder);
          if (!BestOrder || BestOrder->empty())
            continue;
          Order = *BestOrder;
        }
        if (Order.empty())
          continue;
        const unsigned E = Order.size();
        SmallVector<int> MaskOrder(E, PoisonMaskElem);
        transform(Order, MaskOrder.begin(), [E](unsigned I) {
          return I < E ? static_cast<int>(I) : PoisonMaskElem;
        });
        SmallVector<int> Mask;
        inversePermutation(Order, Mask);
        Data.first->reorderSplitNode(P.second ? 1 : 0, Mask, MaskOrder);
        // Clear ordering of the operand node.
        if (!OpTE.ReorderIndices.empty()) {
          OpTE.ReorderIndices.clear();
        } else if (!OpTE.ReuseShuffleIndices.empty()) {
          reorderReuses(OpTE.ReuseShuffleIndices, Mask);
        } else {
          assert(OpTE.isGather() && "Expected only gather/buildvector node.");
          reorderScalars(OpTE.Scalars, Mask);
        }
      }
      if (Data.first->ReuseShuffleIndices.empty() &&
          !Data.first->ReorderIndices.empty()) {
        // Insert user node to the list to try to sink reordering deeper in
        // the graph.
        Queue.push(Data.first);
      }
      continue;
    }
    // Check that operands are used only in this user node.
    SmallVector<TreeEntry *> GatherOps;
    buildReorderableOperands(Data.first, Data.second, NonVectorized,
                             GatherOps);
    // All operands are reordered and used only in this node - propagate the
    // most used order to the user node.
    MapVector<OrdersType, unsigned> OrdersUses;
    SmallPtrSet<const TreeEntry *, 4> VisitedOps, VisitedUsers;
    for (const auto &Op : Data.second) {
      TreeEntry *OpTE = Op.second;
      if (!VisitedOps.insert(OpTE).second)
        continue;
      if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
        continue;
      const auto Order = [&]() -> const OrdersType {
        if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty())
          return getReorderingData(*OpTE, /*TopToBottom=*/false,
                                   IgnoreReorder)
              .value_or(OrdersType(1));
        return OpTE->ReorderIndices;
      }();
      // A partial order is skipped in favor of the user node order.
      if (Order.size() == 1)
        continue;

      Value *Root =
          OpTE->hasState() ? OpTE->getMainOp() : OpTE->Scalars.front();
      auto GetSameNodesUsers = [&](Value *Root) {
        SmallSetVector<TreeEntry *, 4> Res;
        for (const TreeEntry *TE : ValueToGatherNodes.lookup(Root)) {
          if (TE != OpTE && TE->UserTreeIndex &&
              TE->getVectorFactor() == OpTE->getVectorFactor() &&
              TE->Scalars.size() == OpTE->Scalars.size() &&
              ((TE->ReorderIndices.empty() && OpTE->isSame(TE->Scalars)) ||
               (OpTE->ReorderIndices.empty() && TE->isSame(OpTE->Scalars))))
            Res.insert(TE->UserTreeIndex.UserTE);
        }
        for (const TreeEntry *TE : getTreeEntries(Root)) {
          if (TE != OpTE && TE->UserTreeIndex &&
              TE->getVectorFactor() == OpTE->getVectorFactor() &&
              TE->Scalars.size() == OpTE->Scalars.size() &&
              ((TE->ReorderIndices.empty() && OpTE->isSame(TE->Scalars)) ||
               (OpTE->ReorderIndices.empty() && TE->isSame(OpTE->Scalars))))
            Res.insert(TE->UserTreeIndex.UserTE);
        }
        return Res.takeVector();
      };
      auto GetNumOperands = [](const TreeEntry *TE) -> unsigned {
        if (TE->State == TreeEntry::SplitVectorize)
          return TE->getNumOperands();
        if (auto *CI = dyn_cast<CallInst>(TE->getMainOp()))
          return CI->arg_size();
        return TE->getNumOperands();
      };
      auto NodeShouldBeReorderedWithOperands = [&](const TreeEntry *TE) {
        return all_of(seq<unsigned>(GetNumOperands(TE)), [&](unsigned Idx) {
          const TreeEntry *Op = getOperandEntry(TE, Idx);
          if (Op->isGather() && Op->hasState()) {
            const TreeEntry *VecOp =
                getSameValuesTreeEntry(Op->getMainOp(), Op->Scalars);
            if (VecOp)
              Op = VecOp;
          }
          return Op->ReorderIndices.empty() &&
                 Op->ReuseShuffleIndices.empty();
        });
      };
      SmallVector<TreeEntry *> SameUsers = GetSameNodesUsers(Root);
      if (any_of(SameUsers, [&](TreeEntry *UTE) {
            if (!RevisitedOps.insert(UTE).second)
              return true;
            return UTE == Data.first || !UTE->ReorderIndices.empty() ||
                   !UTE->ReuseShuffleIndices.empty() ||
                   (UTE->UserTreeIndex &&
                    UTE->UserTreeIndex.UserTE == Data.first) ||
                   (Data.first->UserTreeIndex &&
                    Data.first->UserTreeIndex.UserTE == UTE) ||
                   (IgnoreReorder && UTE->UserTreeIndex &&
                    UTE->UserTreeIndex.UserTE->Idx == 0) ||
                   !NodeShouldBeReorderedWithOperands(UTE);
          }))
        continue;
      for (TreeEntry *UTE : SameUsers) {
        // Schedule the operands of the matching user nodes as well.
        for (unsigned Idx : seq<unsigned>(GetNumOperands(UTE))) {
          const TreeEntry *Op = getOperandEntry(UTE, Idx);
          Queue.push(const_cast<TreeEntry *>(Op));
        }
      }
      unsigned NumOps = count_if(
          Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
            return P.second == OpTE;
          });
      // Stores actually store the mask, not the order; need to invert.
      if (OpTE->State == TreeEntry::Vectorize &&
          OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
        assert(!OpTE->isAltShuffle() &&
               "Alternate instructions are only supported by BinaryOperator "
               "and CastInst.");
        SmallVector<int> Mask;
        inversePermutation(Order, Mask);
        unsigned E = Order.size();
        OrdersType CurrentOrder(E, E);
        transform(Mask, CurrentOrder.begin(), [E](int Idx) {
          return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
        });
        OrdersUses.try_emplace(CurrentOrder, 0).first->second += NumOps;
      } else {
        OrdersUses.try_emplace(Order, 0).first->second += NumOps;
      }
      auto Res = OrdersUses.try_emplace(OrdersType(), 0);
      const auto AllowsReordering = [&](const TreeEntry *TE) {
        if (!TE->ReorderIndices.empty() ||
            !TE->ReuseShuffleIndices.empty() ||
            (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
            (IgnoreReorder && TE->Idx == 0))
          return true;
        if (TE->isGather()) {
          if (GathersToOrders.contains(TE))
            return !getReorderingData(*TE, /*TopToBottom=*/false,
                                      IgnoreReorder)
                        .value_or(OrdersType(1))
                        .empty();
          return true;
        }
        return false;
      };
      if (OpTE->UserTreeIndex) {
        TreeEntry *UserTE = OpTE->UserTreeIndex.UserTE;
        if (!VisitedUsers.insert(UserTE).second)
          continue;
        // May reorder the user node if it requires reordering, has reused
        // scalars, is an alternate op vectorize node or its operand nodes
        // require reordering.
        if (AllowsReordering(UserTE))
          continue;
        // Count the identity order if reordering would not propagate.
        ArrayRef<std::pair<unsigned, TreeEntry *>> Ops = Data.second;
        if (static_cast<unsigned>(count_if(
                Ops, [UserTE, &AllowsReordering](
                         const std::pair<unsigned, TreeEntry *> &Op) {
                  return AllowsReordering(Op.second) &&
                         Op.second->UserTreeIndex.UserTE == UserTE;
                })) <= Ops.size() / 2)
          ++Res.first->second;
      }
    }
    if (OrdersUses.empty()) {
      for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
        Visited.insert(Op.second);
      continue;
    }
    // Choose the most used order.
    unsigned IdentityCnt = 0;
    unsigned VF = Data.second.front().second->getVectorFactor();
    OrdersType IdentityOrder(VF, VF);
    for (auto &Pair : OrdersUses) {
      if (Pair.first.empty() || isIdentityOrder(Pair.first)) {
        IdentityCnt += Pair.second;
        combineOrders(IdentityOrder, Pair.first);
      }
    }
    MutableArrayRef<unsigned> BestOrder = IdentityOrder;
    unsigned Cnt = IdentityCnt;
    for (auto &Pair : OrdersUses) {
      if (Cnt < Pair.second) {
        combineOrders(Pair.first, BestOrder);
        BestOrder = Pair.first;
        Cnt = Pair.second;
      } else {
        combineOrders(BestOrder, Pair.first);
      }
    }
    // Set order of the user node.
    if (isIdentityOrder(BestOrder)) {
      for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
        Visited.insert(Op.second);
      continue;
    }
    fixupOrderingIndices(BestOrder);
    VisitedOps.clear();
    SmallVector<int> Mask;
    inversePermutation(BestOrder, Mask);
    SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
    unsigned E = BestOrder.size();
    transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
      return I < E ? static_cast<int>(I) : PoisonMaskElem;
    });
    for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) {
      TreeEntry *TE = Op.second;
      if (!VisitedOps.insert(TE).second)
        continue;
      if (TE->ReuseShuffleIndices.size() == BestOrder.size()) {
        reorderNodeWithReuses(*TE, Mask);
        continue;
      }
      // Gathers are processed separately.
      if (TE->State != TreeEntry::Vectorize &&
          TE->State != TreeEntry::StridedVectorize &&
          TE->State != TreeEntry::CompressVectorize &&
          TE->State != TreeEntry::SplitVectorize &&
          (TE->State != TreeEntry::ScatterVectorize ||
           TE->ReorderIndices.empty()))
        continue;
      assert((BestOrder.size() == TE->ReorderIndices.size() ||
              TE->ReorderIndices.empty()) &&
             "Non-matching sizes of user/operand entries.");
      reorderOrder(TE->ReorderIndices, Mask);
      if (IgnoreReorder && TE == VectorizableTree.front().get())
        IgnoreReorder = false;
    }
    // For gathers just need to reorder the scalars.
    for (TreeEntry *Gather : GatherOps) {
      assert(Gather->ReorderIndices.empty() &&
             "Unexpected reordering of gathers.");
      if (!Gather->ReuseShuffleIndices.empty()) {
        // Just reorder reuses indices.
        reorderReuses(Gather->ReuseShuffleIndices, Mask);
        continue;
      }
      reorderScalars(Gather->Scalars, Mask);
    }
    // Reorder operands of the user node and set the ordering for the user
    // node itself.
    auto IsNotProfitableAltCodeNode = [](const TreeEntry &TE) {
      return TE.isAltShuffle() &&
             (!TE.ReuseShuffleIndices.empty() || TE.getVectorFactor() == 2 ||
              TE.ReorderIndices.empty());
    };
    if (Data.first->State != TreeEntry::Vectorize ||
        !isa<ExtractElementInst, ExtractValueInst, LoadInst>(
            Data.first->getMainOp()) ||
        IsNotProfitableAltCodeNode(*Data.first))
      Data.first->reorderOperands(Mask);
    if (!isa<InsertElementInst, StoreInst>(Data.first->getMainOp()) ||
        IsNotProfitableAltCodeNode(*Data.first) ||
        Data.first->State == TreeEntry::StridedVectorize ||
        Data.first->State == TreeEntry::CompressVectorize) {
      reorderScalars(Data.first->Scalars, Mask);
      reorderOrder(Data.first->ReorderIndices, MaskOrder,
                   /*BottomOrder=*/true);
      if (Data.first->ReuseShuffleIndices.empty() &&
          !Data.first->ReorderIndices.empty() &&
          !IsNotProfitableAltCodeNode(*Data.first)) {
        // Insert user node to the list to try to sink reordering deeper in
        // the graph.
        Queue.push(Data.first);
      }
    }
  }
  // If the reordering is unnecessary, just remove the reorder.
  if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
      VectorizableTree.front()->ReuseShuffleIndices.empty())
    VectorizableTree.front()->ReorderIndices.clear();
}
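// Returns the instruction acting as the root memory access of the entry:
// for reversed strided loads/stores that is the scalar selected by the first
// reorder index, otherwise the first scalar of the bundle.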
Instruction *BoUpSLP::getRootEntryInstruction(const TreeEntry &Entry) const {
  if (Entry.hasState() &&
      (Entry.getOpcode() == Instruction::Store ||
       Entry.getOpcode() == Instruction::Load) &&
      Entry.State == TreeEntry::StridedVectorize &&
      !Entry.ReorderIndices.empty() && isReverseOrder(Entry.ReorderIndices))
    return dyn_cast<Instruction>(Entry.Scalars[Entry.ReorderIndices.front()]);
  return dyn_cast<Instruction>(Entry.Scalars.front());
}
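// Walks all vectorized entries and records every scalar that is also used
// outside the tree (or by too many users to track individually) in
// ExternalUses, so that extractelement instructions can be emitted for them
// during vectorization.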
void BoUpSLP::buildExternalUses(
    const ExtraValueToDebugLocsMap &ExternallyUsedValues) {
  DenseMap<Value *, unsigned> ScalarToExtUses;
  const size_t NumVectScalars = ScalarToTreeEntries.size() + 1;
  // Collect the values that we need to extract from the tree.
  for (auto &TEPtr : VectorizableTree) {
    TreeEntry *Entry = TEPtr.get();

    // No need to handle users of gathered values.
    if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize)
      continue;

    // For each lane:
    for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
      Value *Scalar = Entry->Scalars[Lane];
      if (!isa<Instruction>(Scalar))
        continue;
      // All uses must be replaced already? No need to do it again.
      auto It = ScalarToExtUses.find(Scalar);
      if (It != ScalarToExtUses.end() && !ExternalUses[It->second].User)
        continue;

      // The scalar has too many users to walk - extract unconditionally.
      if (Scalar->hasNUsesOrMore(NumVectScalars)) {
        unsigned FoundLane = Entry->findLaneForValue(Scalar);
        LLVM_DEBUG(dbgs() << "SLP: Need to extract from lane " << FoundLane
                          << " from " << *Scalar << "for many users.\n");
        It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
        ExternalUses.emplace_back(Scalar, nullptr, *Entry, FoundLane);
        ExternalUsesWithNonUsers.insert(Scalar);
        continue;
      }
      // Check if the scalar is externally used as an extra arg.
      const auto ExtI = ExternallyUsedValues.find(Scalar);
      if (ExtI != ExternallyUsedValues.end()) {
        unsigned FoundLane = Entry->findLaneForValue(Scalar);
        LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
                          << FoundLane << " from " << *Scalar << ".\n");
        ScalarToExtUses.try_emplace(Scalar, ExternalUses.size());
        ExternalUses.emplace_back(Scalar, nullptr, *Entry, FoundLane);
      }
      for (User *U : Scalar->users()) {
        auto *UserInst = dyn_cast<Instruction>(U);
        if (!UserInst || isDeleted(UserInst))
          continue;

        // Ignore users in the user ignore list.
        if (UserIgnoreList && UserIgnoreList->contains(UserInst))
          continue;

        // Skip in-tree scalars that become vector values.
        if (ArrayRef<TreeEntry *> UseEntries = getTreeEntries(U);
            !UseEntries.empty()) {
          // Some in-tree scalars will remain as scalars in vectorized
          // instructions. If that is the case, the one in FoundLane will be
          // used.
          if (!((Scalar->getType()->getScalarType()->isPointerTy() &&
                 isa<LoadInst, StoreInst>(UserInst)) ||
                isa<CallInst>(UserInst)) ||
              all_of(UseEntries, [&](TreeEntry *UseEntry) {
                return UseEntry->State == TreeEntry::ScatterVectorize ||
                       !doesInTreeUserNeedToExtract(
                           Scalar, getRootEntryInstruction(*UseEntry), TLI,
                           TTI);
              })) {
            LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
                              << ".\n");
            assert(none_of(UseEntries,
                           [](TreeEntry *UseEntry) {
                             return UseEntry->isGather();
                           }) &&
                   "Expected non-gather user node.");
            continue;
          }
        }
        if (It != ScalarToExtUses.end()) {
          ExternalUses[It->second].User = nullptr;
          break;
        }

        if (U && Scalar->hasNUsesOrMore(UsesLimit))
          U = nullptr;
        unsigned FoundLane = Entry->findLaneForValue(Scalar);
        LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *UserInst
                          << " from lane " << FoundLane << " from " << *Scalar
                          << ".\n");
        It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
        ExternalUses.emplace_back(Scalar, U, *Entry, FoundLane);
        ExternalUsesWithNonUsers.insert(Scalar);
        if (!U)
          break;
      }
    }
  }
}
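// Groups the external store users of a node's scalars by basic block, stored
// type and underlying pointer object, keeping at most one store per lane, so
// canFormVector can later check whether each group forms a consecutive,
// vectorizable chain.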
SmallVector<SmallVector<StoreInst *>>
BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
  SmallDenseMap<std::tuple<BasicBlock *, Type *, Value *>,
                SmallVector<StoreInst *>, 8>
      PtrToStoresMap;
  for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) {
    Value *V = TE->Scalars[Lane];
    // Don't iterate over the users of constant data.
    if (!isa<Instruction>(V))
      continue;
    // To save compilation time we don't visit if we have too many users.
    if (V->hasNUsesOrMore(UsesLimit))
      break;

    // Collect stores per pointer object.
    for (User *U : V->users()) {
      auto *SI = dyn_cast<StoreInst>(U);
      // Test whether we can handle the store. V might be a global, which
      // could be used in a different function.
      if (SI == nullptr || !SI->isSimple() || SI->getFunction() != F ||
          !isValidElementType(SI->getValueOperand()->getType()))
        continue;
      // Skip entry if already vectorized.
      if (isVectorized(U))
        continue;

      Value *Ptr =
          getUnderlyingObject(SI->getPointerOperand(), RecursionMaxDepth);
      auto &StoresVec = PtrToStoresMap[{SI->getParent(),
                                        SI->getValueOperand()->getType(),
                                        Ptr}];
      // For now just keep one store per pointer object per lane.
      if (StoresVec.size() > Lane)
        continue;
      if (!StoresVec.empty()) {
        std::optional<int64_t> Diff = getPointersDiff(
            SI->getValueOperand()->getType(), SI->getPointerOperand(),
            SI->getValueOperand()->getType(),
            StoresVec.front()->getPointerOperand(), *DL, *SE,
            /*StrictCheck=*/true);
        // We failed to compare the pointers so just abandon this store.
        if (!Diff)
          continue;
      }
      StoresVec.push_back(SI);
    }
  }
  SmallVector<SmallVector<StoreInst *>> Res(PtrToStoresMap.size());
  unsigned I = 0;
  for (auto &P : PtrToStoresMap) {
    Res[I].swap(P.second);
    ++I;
  }
  return Res;
}

bool BoUpSLP::canFormVector(ArrayRef<StoreInst *> StoresVec,
                            OrdersType &ReorderIndices) const {
  // We check whether the stores in StoresVec can form a vector by sorting
  // them and checking whether they are consecutive.

  // To avoid calling getPointersDiff() while sorting, create a vector of
  // pairs {offset from the first store, index} and sort it instead.
  SmallVector<std::pair<int64_t, unsigned>> StoreOffsetVec;
  StoreInst *S0 = StoresVec[0];
  StoreOffsetVec.emplace_back(0, 0);
  Type *S0Ty = S0->getValueOperand()->getType();
  Value *S0Ptr = S0->getPointerOperand();
  for (unsigned Idx : seq<unsigned>(1, StoresVec.size())) {
    StoreInst *SI = StoresVec[Idx];
    std::optional<int64_t> Diff =
        getPointersDiff(S0Ty, S0Ptr, SI->getValueOperand()->getType(),
                        SI->getPointerOperand(), *DL, *SE,
                        /*StrictCheck=*/true);
    if (!Diff)
      continue;
    StoreOffsetVec.emplace_back(*Diff, Idx);
  }

  // Check if the stores are consecutive.
  if (StoreOffsetVec.size() != StoresVec.size())
    return false;
  sort(StoreOffsetVec, llvm::less_first());
  unsigned Idx = 0;
  int64_t PrevDist = 0;
  for (const auto &P : StoreOffsetVec) {
    if (Idx > 0 && P.first != PrevDist + 1)
      return false;
    PrevDist = P.first;
    ++Idx;
  }

  // Calculate the shuffle indices according to the offsets in the sorted
  // StoreOffsetVec.
  ReorderIndices.assign(StoresVec.size(), 0);
  bool IsIdentity = true;
  for (auto [I, P] : enumerate(StoreOffsetVec)) {
    ReorderIndices[P.second] = I;
    IsIdentity &= P.second == I;
  }
  // Identity order (e.g., {0,1,2,3}) is modeled as an empty OrdersType in
  // reorderTopToBottom() and reorderBottomToTop(); follow the same
  // convention here.
  if (IsIdentity)
    ReorderIndices.clear();

  return true;
}

#ifndef NDEBUG
LLVM_DUMP_METHOD static void dumpOrder(const BoUpSLP::OrdersType &Order) {
  for (unsigned Idx : Order)
    dbgs() << Idx << ", ";
  dbgs() << "\n";
}
#endif
SmallVector<BoUpSLP::OrdersType, 1>
BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const {
  unsigned NumLanes = TE->Scalars.size();
  SmallVector<SmallVector<StoreInst *>> Stores = collectUserStores(TE);
  SmallVector<OrdersType, 1> ExternalReorderIndices;
  // Inspect the stores collected per pointer and keep the reorder indices of
  // every group that can form a vector.
  for (ArrayRef<StoreInst *> StoresVec : Stores) {
    // With fewer than NumLanes stores we cannot form a full vector.
    if (StoresVec.size() != NumLanes)
      continue;
    // Abandon the group if the stores are not consecutive.
    OrdersType ReorderIndices;
    if (!canFormVector(StoresVec, ReorderIndices))
      continue;
    ExternalReorderIndices.push_back(ReorderIndices);
  }
  return ExternalReorderIndices;
}
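// Entry points for building the vectorizable tree, with and without a list
// of externally used (reduction) values that must stay scalar.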
void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
                        const SmallDenseSet<Value *> &UserIgnoreLst) {
  deleteTree();
  assert(TreeEntryToStridedPtrInfoMap.empty() &&
         "TreeEntryToStridedPtrInfoMap is not cleared");
  UserIgnoreList = &UserIgnoreLst;
  if (!allSameType(Roots))
    return;
  buildTreeRec(Roots, 0, EdgeInfo());
}

void BoUpSLP::buildTree(ArrayRef<Value *> Roots) {
  deleteTree();
  assert(TreeEntryToStridedPtrInfoMap.empty() &&
         "TreeEntryToStridedPtrInfoMap is not cleared");
  if (!allSameType(Roots))
    return;
  buildTreeRec(Roots, 0, EdgeInfo());
}
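// Clusters the given loads by parent block, element type and constant
// pointer distance, producing groups of (load, offset) pairs that are
// candidates for later vectorization as a whole.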
static void gatherPossiblyVectorizableLoads(
    const BoUpSLP &R, ArrayRef<Value *> VL, const DataLayout &DL,
    ScalarEvolution &SE, const TargetTransformInfo &TTI,
    SmallVectorImpl<SmallVector<std::pair<LoadInst *, int64_t>>>
        &GatheredLoads,
    bool AddNew = true) {
  if (VL.empty())
    return;
  SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>> ClusteredLoads;
  SmallVector<DenseMap<int64_t, LoadInst *>> ClusteredDistToLoad;
  for (Value *V : VL) {
    auto *LI = dyn_cast<LoadInst>(V);
    if (!LI)
      continue;
    if (R.isDeleted(LI) || R.isVectorized(LI) || !LI->isSimple())
      continue;
    bool IsFound = false;
    for (auto [Map, Data] : zip(ClusteredDistToLoad, ClusteredLoads)) {
      assert(LI->getParent() == Data.front().first->getParent() &&
             LI->getType() == Data.front().first->getType() &&
             getUnderlyingObject(LI->getPointerOperand(),
                                 RecursionMaxDepth) ==
                 getUnderlyingObject(Data.front().first->getPointerOperand(),
                                     RecursionMaxDepth) &&
             "Expected loads with the same type, same parent and same "
             "underlying pointer.");
      std::optional<int64_t> Dist = getPointersDiff(
          LI->getType(), LI->getPointerOperand(),
          Data.front().first->getType(),
          Data.front().first->getPointerOperand(), DL, SE,
          /*StrictCheck=*/true);
      if (!Dist)
        continue;
      auto It = Map.find(*Dist);
      if (It != Map.end() && It->second != LI)
        continue;
      if (It == Map.end()) {
        Data.emplace_back(LI, *Dist);
        Map.try_emplace(*Dist, LI);
      }
      IsFound = true;
      break;
    }
    if (!IsFound) {
      ClusteredLoads.emplace_back().emplace_back(LI, 0);
      ClusteredDistToLoad.emplace_back().try_emplace(0, LI);
    }
  }
  auto FindMatchingLoads =
      [&](ArrayRef<std::pair<LoadInst *, int64_t>> Loads,
          SmallVectorImpl<SmallVector<std::pair<LoadInst *, int64_t>>>
              &GatheredLoads,
          SetVector<unsigned> &ToAdd, SetVector<unsigned> &Repeated,
          int64_t &Offset, unsigned &Start) {
        if (Loads.empty())
          return GatheredLoads.end();
        LoadInst *LI = Loads.front().first;
        for (auto [Idx, Data] : enumerate(GatheredLoads)) {
          if (Idx < Start)
            continue;
          ToAdd.clear();
          if (LI->getParent() != Data.front().first->getParent() ||
              LI->getType() != Data.front().first->getType())
            continue;
          std::optional<int64_t> Dist =
              getPointersDiff(LI->getType(), LI->getPointerOperand(),
                              Data.front().first->getType(),
                              Data.front().first->getPointerOperand(), DL, SE,
                              /*StrictCheck=*/true);
          if (!Dist)
            continue;
          SmallSet<int64_t, 4> DataDists;
          SmallPtrSet<LoadInst *, 4> DataLoads;
          for (std::pair<LoadInst *, int64_t> P : Data) {
            DataDists.insert(P.second);
            DataLoads.insert(P.first);
          }
          // Found matching gathered loads - check if all loads are unique or
          // can be effectively vectorized.
          unsigned NumUniques = 0;
          for (auto [Cnt, Pair] : enumerate(Loads)) {
            bool Used = DataLoads.contains(Pair.first);
            if (!Used && !DataDists.contains(*Dist + Pair.second)) {
              ++NumUniques;
              ToAdd.insert(Cnt);
            } else if (Used) {
              Repeated.insert(Cnt);
            }
          }
          if (NumUniques > 0 &&
              (Loads.size() == NumUniques ||
               (Loads.size() - NumUniques >= 2 &&
                Loads.size() - NumUniques >= Loads.size() / 2 &&
                (has_single_bit(Data.size() + NumUniques) ||
                 bit_ceil(Data.size()) <
                     bit_ceil(Data.size() + NumUniques))))) {
            Offset = *Dist;
            Start = Idx + 1;
            return std::next(GatheredLoads.begin(), Idx);
          }
        }
        ToAdd.clear();
        return GatheredLoads.end();
      };
  for (ArrayRef<std::pair<LoadInst *, int64_t>> Data : ClusteredLoads) {
    unsigned Start = 0;
    SetVector<unsigned> ToAdd, LocalToAdd, Repeated;
    int64_t Offset = 0;
    auto *It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated,
                                 Offset, Start);
    while (It != GatheredLoads.end()) {
      assert(!LocalToAdd.empty() && "Expected some elements to add.");
      for (unsigned Idx : LocalToAdd)
        It->emplace_back(Data[Idx].first, Data[Idx].second + Offset);
      ToAdd.insert(LocalToAdd.begin(), LocalToAdd.end());
      It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated,
                             Offset, Start);
    }
    if (any_of(seq<unsigned>(Data.size()), [&](unsigned Idx) {
          return !ToAdd.contains(Idx) && !Repeated.contains(Idx);
        })) {
      auto AddNewLoads =
          [&](SmallVectorImpl<std::pair<LoadInst *, int64_t>> &Loads) {
            for (unsigned Idx : seq<unsigned>(Data.size())) {
              if (ToAdd.contains(Idx) || Repeated.contains(Idx))
                continue;
              Loads.push_back(Data[Idx]);
            }
          };
      if (!AddNew) {
        LoadInst *LI = Data.front().first;
        It = find_if(
            GatheredLoads, [&](ArrayRef<std::pair<LoadInst *, int64_t>> PD) {
              return PD.front().first->getParent() == LI->getParent() &&
                     PD.front().first->getType() == LI->getType();
            });
        while (It != GatheredLoads.end()) {
          AddNewLoads(*It);
          It = std::find_if(
              std::next(It), GatheredLoads.end(),
              [&](ArrayRef<std::pair<LoadInst *, int64_t>> PD) {
                return PD.front().first->getParent() == LI->getParent() &&
                       PD.front().first->getType() == LI->getType();
              });
        }
        GatheredLoads.emplace_back().append(Data.begin(), Data.end());
        continue;
      }
      AddNewLoads(GatheredLoads.emplace_back());
    }
  }
}
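// Tries to turn the collected gathered loads into real vector load nodes:
// sorts each cluster by distance, carves out consecutive/strided/masked
// runs, checks profitability per candidate VF, and finally builds tree
// entries for the ranges that can be vectorized.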
void BoUpSLP::tryToVectorizeGatheredLoads(
    const SmallMapVector<
        std::tuple<BasicBlock *, Value *, Type *>,
        SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>>, 8>
        &GatheredLoads) {
  GatheredLoadsEntriesFirst = VectorizableTree.size();

  SmallVector<SmallPtrSet<const Value *, 4>> LoadSetsToVectorize(
      LoadEntriesToVectorize.size());
  for (auto [Idx, Set] : zip(LoadEntriesToVectorize, LoadSetsToVectorize))
    Set.insert_range(VectorizableTree[Idx]->Scalars);

  // Sort loads by distance.
  auto LoadSorter = [](const std::pair<LoadInst *, int64_t> &L1,
                       const std::pair<LoadInst *, int64_t> &L2) {
    return L1.second > L2.second;
  };

  auto IsMaskedGatherSupported = [&](ArrayRef<LoadInst *> Loads) {
    ArrayRef<Value *> Values(reinterpret_cast<Value *const *>(Loads.begin()),
                             Loads.size());
    Align Alignment = computeCommonAlignment<LoadInst>(Values);
    auto *Ty = getWidenedType(Loads.front()->getType(), Loads.size());
    return TTI->isLegalMaskedGather(Ty, Alignment) &&
           !TTI->forceScalarizeMaskedGather(Ty, Alignment);
  };

  auto GetVectorizedRanges =
      [this](ArrayRef<LoadInst *> Loads,
             SmallPtrSetImpl<const Value *> &VectorizedLoads,
             SmallVectorImpl<LoadInst *> &NonVectorized, bool Final,
             unsigned MaxVF) {
        SmallVector<std::pair<ArrayRef<Value *>, LoadsState>> Results;
        unsigned StartIdx = 0;
        SmallVector<int> CandidateVFs;
        if (VectorizeNonPowerOf2 && has_single_bit(MaxVF + 1))
          CandidateVFs.push_back(MaxVF);
        for (int NumElts = getFloorFullVectorNumberOfElements(
                 *TTI, Loads.front()->getType(), MaxVF);
             NumElts > 1; NumElts = getFloorFullVectorNumberOfElements(
                              *TTI, Loads.front()->getType(), NumElts - 1)) {
          CandidateVFs.push_back(NumElts);
          if (VectorizeNonPowerOf2 && NumElts > 2)
            CandidateVFs.push_back(NumElts - 1);
        }
        if (Final && CandidateVFs.empty())
          return Results;

        unsigned BestVF = Final ? CandidateVFs.back() : 0;
        for (unsigned NumElts : CandidateVFs) {
          if (Final && NumElts > BestVF)
            continue;
          SmallVector<unsigned> MaskedGatherVectorized;
          for (unsigned Cnt = StartIdx, E = Loads.size(); Cnt < E; ++Cnt) {
            ArrayRef<LoadInst *> Slice =
                ArrayRef(Loads).slice(Cnt, std::min(NumElts, E - Cnt));
            if (VectorizedLoads.count(Slice.front()) ||
                VectorizedLoads.count(Slice.back()) ||
                areKnownNonVectorizableLoads(Slice))
              continue;
            // Check if it is profitable to try vectorizing gathered loads.
            bool AllowToVectorize = false;
            // Check if this load is part of a legal broadcast pattern.
            bool IsLegalBroadcastLoad = TTI->isLegalBroadcastLoad(
                Slice.front()->getType(), ElementCount::getFixed(NumElts));
            auto CheckIfAllowed = [=](ArrayRef<LoadInst *> Slice) {
              for (LoadInst *LI : Slice) {
                // If single use/user - allow to vectorize.
                if (LI->hasOneUse())
                  continue;
                // 1. Check if the number of uses equals the number of
                // users. 2. All users are deleted. 3. The load broadcasts
                // are not allowed or the load is not broadcasted.
                if (static_cast<unsigned int>(
                        std::distance(LI->user_begin(), LI->user_end())) !=
                    LI->getNumUses())
                  return false;
                if (!IsLegalBroadcastLoad)
                  continue;
                if (LI->hasNUsesOrMore(UsesLimit))
                  return false;
                for (User *U : LI->users()) {
                  if (auto *UI = dyn_cast<Instruction>(U);
                      UI && isDeleted(UI))
                    continue;
                  for (const TreeEntry *UTE : getTreeEntries(U)) {
                    for (int I : seq<int>(UTE->getNumOperands())) {
                      if (all_of(UTE->getOperand(I), [LI](Value *V) {
                            return V == LI || isa<PoisonValue>(V);
                          }))
                        // Found legal broadcast - do not vectorize.
                        return false;
                    }
                  }
                }
              }
              return true;
            };
            AllowToVectorize = CheckIfAllowed(Slice);
            if (!AllowToVectorize &&
                any_of(ValueToGatherNodes.at(Slice.front()),
                       [=](const TreeEntry *TE) {
                         return TE->Scalars.size() == 2 &&
                                ((TE->Scalars.front() == Slice.front() &&
                                  TE->Scalars.back() == Slice.back()) ||
                                 (TE->Scalars.front() == Slice.back() &&
                                  TE->Scalars.back() == Slice.front()));
                       }))
              AllowToVectorize = true;
            if (AllowToVectorize) {
              SmallVector<Value *> PointerOps;
              OrdersType CurrentOrder;
              // Try to build vector load.
              ArrayRef<Value *> Values(
                  reinterpret_cast<Value *const *>(Slice.begin()),
                  Slice.size());
              StridedPtrInfo SPtrInfo;
              LoadsState LS =
                  canVectorizeLoads(Values, Slice.front(), CurrentOrder,
                                    PointerOps, SPtrInfo, &BestVF);
              if (LS != LoadsState::Gather ||
                  (BestVF > 1 &&
                   static_cast<unsigned>(NumElts) == 2 * BestVF)) {
                if (LS == LoadsState::ScatterVectorize) {
                  if (MaskedGatherVectorized.empty() ||
                      Cnt >= MaskedGatherVectorized.back() + NumElts)
                    MaskedGatherVectorized.push_back(Cnt);
                  continue;
                }
                if (LS != LoadsState::Gather) {
                  Results.emplace_back(Values, LS);
                  VectorizedLoads.insert_range(Slice);
                  // If we vectorized the initial block, no need to try it
                  // again.
                  if (Cnt == StartIdx)
                    StartIdx += NumElts;
                }
                // Check if the whole array was vectorized already - exit.
                if (StartIdx >= Loads.size())
                  break;
                // Erase last masked gather candidate, if another candidate
                // within the range was found to be better.
                if (!MaskedGatherVectorized.empty() &&
                    Cnt < MaskedGatherVectorized.back() + NumElts)
                  MaskedGatherVectorized.pop_back();
                Cnt += NumElts - 1;
                continue;
              }
            }
            if (!AllowToVectorize || BestVF == 0)
              registerNonVectorizableLoads(Slice);
          }
          // Mark masked gather candidates as vectorized, if any.
          for (unsigned Cnt : MaskedGatherVectorized) {
            ArrayRef<LoadInst *> Slice = ArrayRef(Loads).slice(
                Cnt, std::min<unsigned>(NumElts, Loads.size() - Cnt));
            ArrayRef<Value *> Values(
                reinterpret_cast<Value *const *>(Slice.begin()),
                Slice.size());
            Results.emplace_back(Values, LoadsState::ScatterVectorize);
            VectorizedLoads.insert_range(Slice);
            // If we vectorized the initial block, no need to try it again.
            if (Cnt == StartIdx)
              StartIdx += NumElts;
          }
        }
        for (LoadInst *LI : Loads) {
          if (!VectorizedLoads.contains(LI))
            NonVectorized.push_back(LI);
        }
        return Results;
      };
  auto ProcessGatheredLoads =
      [&, &TTI = *TTI](
          ArrayRef<SmallVector<std::pair<LoadInst *, int64_t>>>
              GatheredLoads,
          bool Final = false) {
        SmallVector<LoadInst *> NonVectorized;
        for (ArrayRef<std::pair<LoadInst *, int64_t>> LoadsDists :
             GatheredLoads) {
          if (LoadsDists.size() <= 1) {
            NonVectorized.push_back(LoadsDists.back().first);
            continue;
          }
          SmallVector<std::pair<LoadInst *, int64_t>> LocalLoadsDists(
              LoadsDists);
          SmallVector<LoadInst *> OriginalLoads(
              make_first_range(LoadsDists));
          stable_sort(LocalLoadsDists, LoadSorter);
          SmallVector<LoadInst *> Loads;
          unsigned MaxConsecutiveDistance = 0;
          unsigned CurrentConsecutiveDist = 1;
          int64_t LastDist = LocalLoadsDists.front().second;
          bool AllowMaskedGather = IsMaskedGatherSupported(OriginalLoads);
          for (const std::pair<LoadInst *, int64_t> &L : LocalLoadsDists) {
            if (isVectorized(L.first))
              continue;
            assert(LastDist >= L.second &&
                   "Expected first distance always not less than second");
            if (static_cast<uint64_t>(LastDist - L.second) ==
                CurrentConsecutiveDist) {
              ++CurrentConsecutiveDist;
              MaxConsecutiveDistance =
                  std::max(MaxConsecutiveDistance, CurrentConsecutiveDist);
              Loads.push_back(L.first);
              continue;
            }
            if (!AllowMaskedGather && CurrentConsecutiveDist == 1 &&
                !Loads.empty())
              Loads.pop_back();
            CurrentConsecutiveDist = 1;
            LastDist = L.second;
            Loads.push_back(L.first);
          }
          if (Loads.size() <= 1)
            continue;
          if (AllowMaskedGather)
            MaxConsecutiveDistance = Loads.size();
          else if (MaxConsecutiveDistance < 2)
            continue;
          BoUpSLP::ValueSet VectorizedLoads;
          SmallVector<LoadInst *> SortedNonVectorized;
          SmallVector<std::pair<ArrayRef<Value *>, LoadsState>> Results =
              GetVectorizedRanges(Loads, VectorizedLoads,
                                  SortedNonVectorized, Final,
                                  MaxConsecutiveDistance);
          if (!Results.empty() && !SortedNonVectorized.empty() &&
              OriginalLoads.size() == Loads.size() &&
              MaxConsecutiveDistance == Loads.size() &&
              all_of(Results,
                     [](const std::pair<ArrayRef<Value *>, LoadsState> &P) {
                       return P.second == LoadsState::ScatterVectorize;
                     })) {
            VectorizedLoads.clear();
            SmallVector<LoadInst *> UnsortedNonVectorized;
            SmallVector<std::pair<ArrayRef<Value *>, LoadsState>>
                UnsortedResults =
                    GetVectorizedRanges(OriginalLoads, VectorizedLoads,
                                        UnsortedNonVectorized, Final,
                                        OriginalLoads.size());
            if (SortedNonVectorized.size() >= UnsortedNonVectorized.size()) {
              SortedNonVectorized.swap(UnsortedNonVectorized);
              Results.swap(UnsortedResults);
            }
          }
          for (auto [Slice, _] : Results) {
            LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize gathered loads ("
                              << Slice.size() << ")\n");
            if (any_of(Slice, [&](Value *V) { return isVectorized(V); })) {
              for (Value *L : Slice)
                if (!isVectorized(L))
                  SortedNonVectorized.push_back(cast<LoadInst>(L));
              continue;
            }
            // Select the maximum VF as the maximum of the user gathered
            // nodes and the distance between scalar loads in these nodes.
            unsigned MaxVF = Slice.size();
            unsigned UserMaxVF = 0;
            unsigned InterleaveFactor = 0;
            if (MaxVF == 2) {
              UserMaxVF = MaxVF;
            } else {
              // Found distance between segments of the interleaved loads.
              std::optional<unsigned> InterleavedLoadsDistance = 0;
              unsigned Order = 0;
              std::optional<unsigned> CommonVF = 0;
              DenseMap<const TreeEntry *, unsigned> EntryToPosition;
              SmallPtrSet<const TreeEntry *, 8> DeinterleavedNodes;
              for (auto [Idx, V] : enumerate(Slice)) {
                for (const TreeEntry *E : ValueToGatherNodes.at(V)) {
                  UserMaxVF =
                      std::max<unsigned>(UserMaxVF, E->Scalars.size());
                  unsigned Pos =
                      EntryToPosition.try_emplace(E, Idx).first->second;
                  UserMaxVF = std::max<unsigned>(UserMaxVF, Idx - Pos + 1);
                  if (CommonVF) {
                    if (*CommonVF == 0) {
                      CommonVF = E->Scalars.size();
                      continue;
                    }
                    if (*CommonVF != E->Scalars.size())
                      CommonVF.reset();
                  }
                  // Check if the load is part of an interleaved load.
                  if (Pos != Idx && InterleavedLoadsDistance) {
                    if (!DeinterleavedNodes.contains(E) &&
                        any_of(E->Scalars, [&, Slice = Slice](Value *V) {
                          if (isa<Constant>(V))
                            return false;
                          if (isVectorized(V))
                            return true;
                          const auto &Nodes = ValueToGatherNodes.at(V);
                          return (Nodes.size() != 1 || !Nodes.contains(E)) &&
                                 !is_contained(Slice, V);
                        })) {
                      InterleavedLoadsDistance.reset();
                      continue;
                    }
                    DeinterleavedNodes.insert(E);
                    if (*InterleavedLoadsDistance == 0) {
                      InterleavedLoadsDistance = Idx - Pos;
                      continue;
                    }
                    if ((Idx - Pos) % *InterleavedLoadsDistance != 0 ||
                        (Idx - Pos) / *InterleavedLoadsDistance < Order)
                      InterleavedLoadsDistance.reset();
                    Order =
                        (Idx - Pos) / InterleavedLoadsDistance.value_or(1);
                  }
                }
              }
              DeinterleavedNodes.clear();
              // Check if the large load represents an interleaved load.
              if (InterleavedLoadsDistance.value_or(0) > 1 &&
                  CommonVF.value_or(0) != 0) {
                InterleaveFactor = bit_ceil(*InterleavedLoadsDistance);
                unsigned VF = *CommonVF;
                OrdersType Order;
                SmallVector<Value *> PointerOps;
                StridedPtrInfo SPtrInfo;
                // Segmented load detected - vectorize at maximum vector
                // factor.
                if (InterleaveFactor <= Slice.size() &&
                    TTI.isLegalInterleavedAccessType(
                        getWidenedType(Slice.front()->getType(), VF),
                        InterleaveFactor,
                        cast<LoadInst>(Slice.front())->getAlign(),
                        cast<LoadInst>(Slice.front())
                            ->getPointerAddressSpace()) &&
                    canVectorizeLoads(Slice, Slice.front(), Order,
                                      PointerOps,
                                      SPtrInfo) == LoadsState::Vectorize) {
                  UserMaxVF = InterleaveFactor * VF;
                } else {
                  InterleaveFactor = 0;
                }
              }
              // Cannot represent the loads as consecutive vectorizable
              // nodes - just exit.
              unsigned ConsecutiveNodesSize = 0;
              if (!LoadEntriesToVectorize.empty() && InterleaveFactor == 0 &&
                  any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
                         [&, Slice = Slice](const auto &P) {
                           const auto *It = find_if(Slice, [&](Value *V) {
                             return std::get<1>(P).contains(V);
                           });
                           if (It == Slice.end())
                             return false;
                           const TreeEntry &TE =
                               *VectorizableTree[std::get<0>(P)];
                           ArrayRef<Value *> VL = TE.Scalars;
                           OrdersType Order;
                           SmallVector<Value *> PointerOps;
                           StridedPtrInfo SPtrInfo;
                           LoadsState State = canVectorizeLoads(
                               VL, VL.front(), Order, PointerOps, SPtrInfo);
                           if (State != LoadsState::Vectorize &&
                               State != LoadsState::StridedVectorize)
                             return true;
                           ConsecutiveNodesSize += VL.size();
                           size_t Start = std::distance(Slice.begin(), It);
                           size_t Sz = Slice.size() - Start;
                           return Sz < VL.size() ||
                                  Slice.slice(Start, VL.size()) != VL;
                         }))
                continue;
              // Try to build long masked gather loads.
              UserMaxVF = bit_ceil(UserMaxVF);
              if (InterleaveFactor == 0 &&
                  any_of(seq<unsigned>(Slice.size() / UserMaxVF),
                         [&, Slice = Slice](unsigned Idx) {
                           OrdersType Order;
                           SmallVector<Value *> PointerOps;
                           StridedPtrInfo SPtrInfo;
                           return canVectorizeLoads(
                                      Slice.slice(Idx * UserMaxVF,
                                                  UserMaxVF),
                                      Slice[Idx * UserMaxVF], Order,
                                      PointerOps, SPtrInfo) ==
                                  LoadsState::ScatterVectorize;
                         }))
                UserMaxVF = MaxVF;
              if (Slice.size() != ConsecutiveNodesSize)
                MaxVF = std::min<unsigned>(MaxVF, UserMaxVF);
            }
            for (unsigned VF = MaxVF; VF >= 2; VF /= 2) {
              bool IsVectorized = true;
              for (unsigned I = 0, E = Slice.size(); I < E; I += VF) {
                ArrayRef<Value *> SubSlice =
                    Slice.slice(I, std::min(VF, E - I));
                if (any_of(SubSlice,
                           [&](Value *V) { return isVectorized(V); }))
                  continue;
                // Check if the subslice is a to-be-vectorized entry that is
                // not equal to this entry.
                if (any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
                           [&](const auto &P) {
                             return !VectorizableTree[std::get<0>(P)]
                                         ->isSame(SubSlice) &&
                                    set_is_subset(SubSlice, std::get<1>(P));
                           }))
                  continue;
                unsigned Sz = VectorizableTree.size();
                buildTreeRec(SubSlice, 0, EdgeInfo(), InterleaveFactor);
                if (Sz == VectorizableTree.size()) {
                  IsVectorized = false;
                  // Try non-interleaved vectorization with a smaller vector
                  // factor.
                  if (InterleaveFactor > 0) {
                    VF = 2 * (MaxVF / InterleaveFactor);
                    InterleaveFactor = 0;
                  }
                  continue;
                }
              }
              if (IsVectorized)
                break;
            }
          }
          NonVectorized.append(SortedNonVectorized);
        }
        return NonVectorized;
      };
  for (const auto &GLs : GatheredLoads) {
    const auto &Ref = GLs.second;
    SmallVector<LoadInst *> NonVectorized = ProcessGatheredLoads(Ref);
    if (!Ref.empty() && !NonVectorized.empty() &&
        std::accumulate(
            Ref.begin(), Ref.end(), 0u,
            [](unsigned S,
               ArrayRef<std::pair<LoadInst *, int64_t>> LoadsDists)
                -> unsigned { return S + LoadsDists.size(); }) !=
            NonVectorized.size() &&
        IsMaskedGatherSupported(NonVectorized)) {
      SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>>
          FinalGatheredLoads;
      for (LoadInst *LI : NonVectorized) {
        // Reinsert non-vectorized loads to the list of loads with the same
        // base pointers.
        gatherPossiblyVectorizableLoads(*this, LI, *DL, *SE, *TTI,
                                        FinalGatheredLoads,
                                        /*AddNew=*/false);
      }
      // Final attempt to vectorize non-vectorized loads.
      (void)ProcessGatheredLoads(FinalGatheredLoads, /*Final=*/true);
    }
  }
  // Try to vectorize postponed load entries, previously marked as gathered.
  for (unsigned Idx : LoadEntriesToVectorize) {
    const TreeEntry &E = *VectorizableTree[Idx];
    SmallVector<Value *> GatheredScalars(E.Scalars.begin(), E.Scalars.end());
    // Avoid reordering, if possible.
    if (!E.ReorderIndices.empty()) {
      // Build a mask out of the reorder indices and reorder the scalars per
      // this mask.
      SmallVector<int> ReorderMask;
      inversePermutation(E.ReorderIndices, ReorderMask);
      reorderScalars(GatheredScalars, ReorderMask);
    }
    buildTreeRec(GatheredScalars, 0, EdgeInfo());
  }
  // If no new entries were created, no gathered loads entries must be
  // handled.
  if (static_cast<unsigned>(*GatheredLoadsEntriesFirst) ==
      VectorizableTree.size())
    GatheredLoadsEntriesFirst.reset();
}
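// Computes a (Key, SubKey) hash pair used to bucket potentially compatible
// instructions cheaply: the key identifies the rough instruction class and
// the subkey refines it (e.g. by pointer operand for simple GEPs), so that
// alternate vectorization candidates can be grouped before a full check.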
static std::pair<size_t, size_t>
generateKeySubkey(Value *V, const TargetLibraryInfo *TLI,
                  function_ref<hash_code(size_t, LoadInst *)>
                      LoadsSubkeyGenerator,
                  bool AllowAlternate) {
  hash_code Key = hash_value(V->getValueID() + 2);
  hash_code SubKey = hash_value(0);
  if (auto *CI = dyn_cast<CmpInst>(V)) {
    // Hash commutative compares so that both operand orders land in the
    // same bucket.
    std::pair<size_t, size_t> OpVals(CI->getPredicate(),
                                     CI->getSwappedPredicate());
    if (CI->isCommutative())
      OpVals.second = OpVals.first;
    SubKey = hash_combine(SubKey, OpVals.first, OpVals.second);
  } else if (auto *Gep = dyn_cast<GetElementPtrInst>(V)) {
    SubKey = hash_value(Gep->getPointerOperand());
  }
  return std::make_pair(Key, SubKey);
}
static bool isAlternateInstruction(const Instruction *I,
                                   const Instruction *MainOp,
                                   const Instruction *AltOp,
                                   const TargetLibraryInfo &TLI);
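// Estimates whether vectorizing VL as an alternate-opcode node pays off:
// after normalizing operand order for commutative pairs, it counts unique
// opcodes, undefs and the extra shuffles the mixed node would need, and
// rejects the node when scalarization looks cheaper.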
bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S,
                                       ArrayRef<Value *> VL) const {
  Type *ScalarTy = S.getMainOp()->getType();
  unsigned Opcode0 = S.getOpcode();
  unsigned Opcode1 = S.getAltOpcode();
  SmallBitVector OpcodeMask(getAltInstrMask(VL, ScalarTy, Opcode0, Opcode1));
  // If this pattern is supported by the target then consider it profitable.
  if (TTI->isLegalAltInstr(getWidenedType(ScalarTy, VL.size()), Opcode0,
                           Opcode1, OpcodeMask))
    return true;
  SmallVector<ValueList> Operands;
  for (unsigned I : seq<unsigned>(S.getMainOp()->getNumOperands())) {
    Operands.emplace_back();
    // Prepare the operand vector.
    for (Value *V : VL) {
      if (isa<PoisonValue>(V)) {
        Operands.back().push_back(
            PoisonValue::get(S.getMainOp()->getOperand(I)->getType()));
        continue;
      }
      Operands.back().push_back(cast<Instruction>(V)->getOperand(I));
    }
  }
  if (Operands.size() == 2) {
    // Try to find the best operand candidates.
    for (unsigned I : seq<unsigned>(0, VL.size() - 1)) {
      SmallVector<std::pair<Value *, Value *>> Candidates(3);
      Candidates[0] = std::make_pair(Operands[0][I], Operands[0][I + 1]);
      Candidates[1] = std::make_pair(Operands[0][I], Operands[1][I + 1]);
      Candidates[2] = std::make_pair(Operands[1][I], Operands[0][I + 1]);
      std::optional<int> Res = findBestRootPair(Candidates);
      switch (Res.value_or(0)) {
      case 0:
        break;
      case 1:
        std::swap(Operands[0][I + 1], Operands[1][I + 1]);
        break;
      case 2:
        std::swap(Operands[0][I], Operands[1][I]);
        break;
      default:
        llvm_unreachable("Unexpected index.");
      }
    }
  }
  DenseSet<unsigned> UniqueOpcodes;
  constexpr unsigned NumAltInsts = 3;
  unsigned NonInstCnt = 0;
  // Estimate the number of instructions required for the vectorized node
  // and for the buildvector node.
  unsigned UndefCnt = 0;
  // Count the number of extra shuffles required for the vector nodes.
  unsigned ExtraShuffleInsts = 0;
  // Check that the operands do not contain the same values (either a
  // perfect diamond match or a shuffled match).
  if (Operands.size() == 2) {
    // Do not count the same operands twice.
    if (Operands.front() == Operands.back()) {
      Operands.erase(Operands.begin());
    } else if (!allConstant(Operands.front()) &&
               all_of(Operands.front(), [&](Value *V) {
                 return is_contained(Operands.back(), V);
               })) {
      Operands.erase(Operands.begin());
      ++ExtraShuffleInsts;
    }
  }
  const Loop *L = LI->getLoopFor(S.getMainOp()->getParent());
  // The operand set is vectorizable if each non-trivial operand value is
  // reused and has no outside users that would require extracts.
  auto HasOnlyInternalUses = [&](ArrayRef<Value *> Op) {
    DenseMap<Value *, unsigned> Uniques;
    for (Value *V : Op) {
      if (isa<Constant, ExtractElementInst>(V) || isVectorized(V) ||
          (L && L->isLoopInvariant(V))) {
        if (isa<UndefValue>(V))
          ++UndefCnt;
        continue;
      }
      auto Res = Uniques.try_emplace(V, 0);
      // Found first duplicate - need to add a shuffle.
      if (!Res.second && Res.first->second == 1)
        ++ExtraShuffleInsts;
      ++Res.first->getSecond();
      if (auto *I = dyn_cast<Instruction>(V))
        UniqueOpcodes.insert(I->getOpcode());
      else if (Res.second)
        ++NonInstCnt;
    }
    return none_of(Uniques, [&](const auto &P) {
      return P.first->hasNUsesOrMore(P.second + 1) &&
             none_of(P.first->users(), [&](User *U) {
               return isVectorized(U) || Uniques.contains(U);
             });
    });
  };
  return all_of(Operands, HasOnlyInternalUses) ||
         (UndefCnt < (VL.size() - 1) * S.getMainOp()->getNumOperands() &&
          (UniqueOpcodes.size() + NonInstCnt + ExtraShuffleInsts +
           NumAltInsts) < S.getMainOp()->getNumOperands() * VL.size());
}
static SmallVector<Type *>
buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID,
                       const unsigned VF, unsigned MinBW,
                       const TargetTransformInfo *TTI) {
  SmallVector<Type *> ArgTys;
  for (auto [Idx, Arg] : enumerate(CI->args())) {
    if (ID != Intrinsic::not_intrinsic &&
        isVectorIntrinsicWithScalarOpAtArg(ID, Idx, TTI)) {
      ArgTys.push_back(Arg->getType());
      continue;
    }
    if (MinBW > 0) {
      ArgTys.push_back(
          getWidenedType(IntegerType::get(CI->getContext(), MinBW), VF));
      continue;
    }
    ArgTys.push_back(getWidenedType(Arg->getType(), VF));
  }
  return ArgTys;
}

static std::pair<InstructionCost, InstructionCost>
getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
                   TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
                   ArrayRef<Type *> ArgTys) {
  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
  // Calculate the cost of the scalar and vector calls.
  FastMathFlags FMF;
  if (auto *FPCI = dyn_cast<FPMathOperator>(CI))
    FMF = FPCI->getFastMathFlags();
  IntrinsicCostAttributes CostAttrs(ID, VecTy, ArgTys, FMF);
  InstructionCost IntrinsicCost =
      TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput);
  auto Shape = VFShape::get(CI->getFunctionType(),
                            ElementCount::getFixed(VecTy->getNumElements()),
                            /*HasGlobalPred=*/false);
  Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
  InstructionCost LibCost = IntrinsicCost;
  if (!CI->isNoBuiltin() && VecFunc)
    // Prefer the vector library call cost when one exists.
    LibCost = TTI->getCallInstrCost(nullptr, VecTy, ArgTys,
                                    TTI::TCK_RecipThroughput);
  return {IntrinsicCost, LibCost.isValid() ? LibCost : IntrinsicCost};
}
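// Decides, per opcode, how a bundle of scalars should enter the tree:
// Vectorize, one of the specialized load states, or NeedToGather, logging
// the reason whenever gathering is forced.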
BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
    const InstructionsState &S, ArrayRef<Value *> VL,
    bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
    SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo) {
  assert(S.getMainOp() &&
         "Expected instructions with same/alternate opcodes only.");

  unsigned ShuffleOrOp =
      S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
  Instruction *VL0 = S.getMainOp();
  switch (ShuffleOrOp) {
  case Instruction::PHI: {
    // Too many operands - gather, most probably won't be vectorized.
    if (VL0->getNumOperands() > MaxPHINumOperands)
      return TreeEntry::NeedToGather;
    // Check for terminator values (e.g. invoke).
    for (Value *V : VL) {
      auto *PHI = dyn_cast<PHINode>(V);
      if (!PHI)
        continue;
      for (Value *Incoming : PHI->incoming_values()) {
        auto *Term = dyn_cast<Instruction>(Incoming);
        if (Term && Term->isTerminator()) {
          LLVM_DEBUG(dbgs()
                     << "SLP: Need to swizzle PHINodes (terminator use).\n");
          return TreeEntry::NeedToGather;
        }
      }
    }
    return TreeEntry::Vectorize;
  }
  case Instruction::ExtractElement:
    if (isa<ScalableVectorType>(
            cast<ExtractElementInst>(VL0)->getVectorOperandType()))
      return TreeEntry::NeedToGather;
    [[fallthrough]];
  case Instruction::ExtractValue: {
    bool Reuse = canReuseExtract(VL, CurrentOrder);
    if (!hasFullVectorsOrPowerOf2(*TTI, VL0->getType(), VL.size()))
      return TreeEntry::NeedToGather;
    if (Reuse || !CurrentOrder.empty())
      return TreeEntry::Vectorize;
    LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
    return TreeEntry::NeedToGather;
  }
  case Instruction::InsertElement: {
    // Check that we have a buildvector and not a shuffle of 2 or more
    // different vectors.
    ValueSet SourceVectors;
    for (Value *V : VL) {
      if (isa<PoisonValue>(V)) {
        LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement/poison vector.\n");
        return TreeEntry::NeedToGather;
      }
      SourceVectors.insert(cast<Instruction>(V)->getOperand(0));
      assert(getElementIndex(V) != std::nullopt &&
             "Non-constant or undef index?");
    }
    if (count_if(VL, [&SourceVectors](Value *V) {
          return !SourceVectors.contains(V);
        }) >= 2) {
      // Found a 2nd source vector - cancel.
      LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
                           "different source vectors.\n");
      return TreeEntry::NeedToGather;
    }
    if (any_of(VL, [&SourceVectors](Value *V) {
          // The last InsertElement can have multiple uses.
          return SourceVectors.contains(V) && !V->hasOneUse();
        })) {
      LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
                           "multiple uses.\n");
      return TreeEntry::NeedToGather;
    }
    return TreeEntry::Vectorize;
  }
  case Instruction::Load: {
    // Check that a vectorized load would load the same memory as a scalar
    // load. Loads already covered by gathered-load entries stay gathered.
    auto IsGatheredNode = [&]() {
      if (!GatheredLoadsEntriesFirst)
        return false;
      return any_of(VL, [&](Value *V) {
        if (isa<PoisonValue>(V))
          return false;
        return any_of(getTreeEntries(V), [&](const TreeEntry *TE) {
          return TE->Idx >= *GatheredLoadsEntriesFirst;
        });
      });
    };
    switch (canVectorizeLoads(VL, VL0, CurrentOrder, PointerOps, SPtrInfo)) {
    case LoadsState::Vectorize:
      return TreeEntry::Vectorize;
    case LoadsState::CompressVectorize:
      if (!IsGraphTransformMode && !VectorizableTree.empty()) {
        // Delay slow vectorized nodes for better vectorization attempts.
        LoadEntriesToVectorize.insert(VectorizableTree.size());
        return TreeEntry::NeedToGather;
      }
      return IsGatheredNode() ? TreeEntry::NeedToGather
                              : TreeEntry::CompressVectorize;
    case LoadsState::ScatterVectorize:
      if (!IsGraphTransformMode && !VectorizableTree.empty()) {
        // Delay slow vectorized nodes for better vectorization attempts.
        LoadEntriesToVectorize.insert(VectorizableTree.size());
        return TreeEntry::NeedToGather;
      }
      return IsGatheredNode() ? TreeEntry::NeedToGather
                              : TreeEntry::ScatterVectorize;
    case LoadsState::StridedVectorize:
      if (!IsGraphTransformMode && VectorizableTree.size() > 1) {
        // Delay slow vectorized nodes for better vectorization attempts.
        LoadEntriesToVectorize.insert(VectorizableTree.size());
        return TreeEntry::NeedToGather;
      }
      return IsGatheredNode() ? TreeEntry::NeedToGather
                              : TreeEntry::StridedVectorize;
    case LoadsState::Gather:
#ifndef NDEBUG
      {
        Type *ScalarTy = VL0->getType();
        if (DL->getTypeSizeInBits(ScalarTy) !=
            DL->getTypeAllocSizeInBits(ScalarTy))
          LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
        else if (any_of(VL, [](Value *V) {
                   auto *LI = dyn_cast<LoadInst>(V);
                   return !LI || !LI->isSimple();
                 }))
          LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
        else
          LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
      }
#endif // NDEBUG
      registerNonVectorizableLoads(VL);
      return TreeEntry::NeedToGather;
    }
    llvm_unreachable("Unexpected state of loads");
  }
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    Type *SrcTy = VL0->getOperand(0)->getType();
    for (Value *V : VL) {
      if (isa<PoisonValue>(V))
        continue;
      Type *Ty = cast<Instruction>(V)->getOperand(0)->getType();
      if (Ty != SrcTy || !isValidElementType(Ty)) {
        LLVM_DEBUG(
            dbgs() << "SLP: Gathering casts with different src types.\n");
        return TreeEntry::NeedToGather;
      }
    }
    return TreeEntry::Vectorize;
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    // Check that all of the compares have the same predicate.
    CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
    CmpInst::Predicate SwapP0 = CmpInst::getSwappedPredicate(P0);
    Type *ComparedTy = VL0->getOperand(0)->getType();
    for (Value *V : VL) {
      if (isa<PoisonValue>(V))
        continue;
      auto *Cmp = cast<CmpInst>(V);
      if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
          Cmp->getOperand(0)->getType() != ComparedTy) {
        LLVM_DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
        return TreeEntry::NeedToGather;
      }
    }
    return TreeEntry::Vectorize;
  }
  case Instruction::Select:
  case Instruction::FNeg:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::Freeze:
    if (S.getMainOp()->getType()->isFloatingPointTy() &&
        TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) {
          auto *I = dyn_cast<Instruction>(V);
          return I && I->isBinaryOp() && !I->isFast();
        }))
      return TreeEntry::NeedToGather;
    return TreeEntry::Vectorize;
  case Instruction::GetElementPtr: {
    // We don't combine GEPs with complicated (nested) indexing.
    for (Value *V : VL) {
      auto *I = dyn_cast<GetElementPtrInst>(V);
      if (!I)
        continue;
      if (I->getNumOperands() != 2) {
        LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
        return TreeEntry::NeedToGather;
      }
    }
    // We can't combine several GEPs into one vector if they operate on
    // different types.
    Type *Ty0 = cast<GEPOperator>(VL0)->getSourceElementType();
    for (Value *V : VL) {
      auto *GEP = dyn_cast<GEPOperator>(V);
      if (!GEP)
        continue;
      Type *CurTy = GEP->getSourceElementType();
      if (Ty0 != CurTy) {
        LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
        return TreeEntry::NeedToGather;
      }
    }
    // We don't combine GEPs with non-constant indexes.
    Type *Ty1 = VL0->getOperand(1)->getType();
    for (Value *V : VL) {
      auto *I = dyn_cast<GetElementPtrInst>(V);
      if (!I)
        continue;
      auto *Op = I->getOperand(1);
      if ((!IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
          (Op->getType() != Ty1 &&
           ((IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
            Op->getType()->getScalarSizeInBits() >
                DL->getIndexSizeInBits(
                    V->getType()->getPointerAddressSpace())))) {
        LLVM_DEBUG(
            dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
        return TreeEntry::NeedToGather;
      }
    }
    return TreeEntry::Vectorize;
  }
  case Instruction::Store: {
    // Check if the stores are consecutive or if we need to swizzle them.
    llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
    // Avoid types that are padded when being allocated as scalars, while
    // being packed together in a vector (such as i1).
    if (DL->getTypeSizeInBits(ScalarTy) !=
        DL->getTypeAllocSizeInBits(ScalarTy)) {
      LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n");
      return TreeEntry::NeedToGather;
    }
    // Make sure all stores in the bundle are simple - we can't vectorize
    // atomic or volatile stores.
    for (Value *V : VL) {
      auto *SI = cast<StoreInst>(V);
      if (!SI->isSimple()) {
        LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple stores.\n");
        return TreeEntry::NeedToGather;
      }
      PointerOps.push_back(SI->getPointerOperand());
    }
    // Check the order of the pointer operands.
    if (llvm::sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, CurrentOrder)) {
      Value *Ptr0;
      Value *PtrN;
      if (CurrentOrder.empty()) {
        Ptr0 = PointerOps.front();
        PtrN = PointerOps.back();
      } else {
        Ptr0 = PointerOps[CurrentOrder.front()];
        PtrN = PointerOps[CurrentOrder.back()];
      }
      std::optional<int64_t> Dist =
          getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
      // Check that the sorted pointer operands are consecutive.
      if (static_cast<uint64_t>(*Dist) == VL.size() - 1)
        return TreeEntry::Vectorize;
    }
    LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
    return TreeEntry::NeedToGather;
  }
  case Instruction::Call: {
    if (S.getMainOp()->getType()->isFloatingPointTy() &&
        TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) {
          auto *I = dyn_cast<Instruction>(V);
          return I && !I->isFast();
        }))
      return TreeEntry::NeedToGather;
    // Check if the calls are all to the same vectorizable intrinsic or
    // library function.
    CallInst *CI = cast<CallInst>(VL0);
    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
    VFShape Shape = VFShape::get(
        CI->getFunctionType(),
        ElementCount::getFixed(static_cast<unsigned int>(VL.size())),
        /*HasGlobalPred=*/false);
    Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
    if (!VecFunc && !isTriviallyVectorizable(ID)) {
      LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
      return TreeEntry::NeedToGather;
    }
    Function *F = CI->getCalledFunction();
    unsigned NumArgs = CI->arg_size();
    SmallVector<Value *, 4> ScalarArgs(NumArgs, nullptr);
    for (unsigned J = 0; J != NumArgs; ++J)
      if (isVectorIntrinsicWithScalarOpAtArg(ID, J, TTI))
        ScalarArgs[J] = CI->getArgOperand(J);
    for (Value *V : VL) {
      auto *CI2 = dyn_cast<CallInst>(V);
      if (!CI2 || CI2->getCalledFunction() != F ||
          getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
          (VecFunc &&
           VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) ||
          !CI->hasIdenticalOperandBundleSchema(*CI2)) {
        LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V
                          << "\n");
        return TreeEntry::NeedToGather;
      }
      // Some intrinsics have scalar arguments; these must stay scalar and
      // be identical across the bundle.
      for (unsigned J = 0; J != NumArgs; ++J) {
        if (!ScalarArgs[J])
          continue;
        Value *A1J = CI2->getArgOperand(J);
        if (ScalarArgs[J] != A1J) {
          LLVM_DEBUG(dbgs()
                     << "SLP: mismatched arguments in call:" << *CI
                     << " argument " << ScalarArgs[J] << "!=" << A1J << "\n");
          return TreeEntry::NeedToGather;
        }
      }
      // Verify that the bundle operands are identical between the calls.
      if (CI->hasOperandBundles() &&
          !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(),
                      CI->op_begin() + CI->getBundleOperandsEndIndex(),
                      CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
        LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:"
                          << *CI << "!=" << *V << '\n');
        return TreeEntry::NeedToGather;
      }
    }
    SmallVector<Type *> ArgTys =
        buildIntrinsicArgTypes(CI, ID, VL.size(), /*MinBW=*/0, TTI);
    auto *VecTy = getWidenedType(S.getMainOp()->getType(), VL.size());
    auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
    if (!VecCallCosts.first.isValid() && !VecCallCosts.second.isValid())
      return TreeEntry::NeedToGather;
    return TreeEntry::Vectorize;
  }
  case Instruction::ShuffleVector: {
    if (!S.isAltShuffle()) {
      // REVEC can support non-alternate shuffles.
      if (SLPReVec && getShufflevectorNumGroups(VL))
        return TreeEntry::Vectorize;
      // If this is not an alternate sequence of opcodes like add-sub, do
      // not vectorize this instruction.
      LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
      return TreeEntry::NeedToGather;
    }
    if (!SLPSkipEarlyProfitabilityCheck && !areAltOperandsProfitable(S, VL)) {
      LLVM_DEBUG(
          dbgs()
          << "SLP: ShuffleVector not vectorized, operands are buildvector and "
             "the whole alt sequence is not profitable.\n");
      return TreeEntry::NeedToGather;
    }
    return TreeEntry::Vectorize;
  }
  default:
    LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
    return TreeEntry::NeedToGather;
  }
}
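// Helper that gathers the incoming values of a bundle of phis per incoming
// block. The FastLimit path walks operands positionally; the slow path maps
// incoming blocks explicitly so phis with differently ordered predecessor
// lists still line up.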
namespace {
class PHIHandler {
  DominatorTree &DT;
  PHINode *Main = nullptr;
  SmallVector<Value *> Phis;
  SmallVector<SmallVector<Value *>> Operands;

public:
  PHIHandler() = delete;
  PHIHandler(DominatorTree &DT, PHINode *Main, ArrayRef<Value *> Phis)
      : DT(DT), Main(Main), Phis(Phis),
        Operands(Main->getNumIncomingValues(),
                 SmallVector<Value *>(Phis.size(), nullptr)) {}
  void buildOperands() {
    constexpr unsigned FastLimit = 4;
    if (Main->getNumIncomingValues() <= FastLimit) {
      for (unsigned I : seq<unsigned>(Main->getNumIncomingValues())) {
        BasicBlock *InBB = Main->getIncomingBlock(I);
        if (!DT.isReachableFromEntry(InBB)) {
          Operands[I].assign(Phis.size(), PoisonValue::get(Main->getType()));
          continue;
        }
        // Prepare the operand vector.
        for (auto [Idx, V] : enumerate(Phis)) {
          auto *P = dyn_cast<PHINode>(V);
          if (!P) {
            assert(isa<PoisonValue>(V) &&
                   "Expected isa instruction or poison value.");
            Operands[I][Idx] = V;
            continue;
          }
          if (P->getIncomingBlock(I) == InBB)
            Operands[I][Idx] = P->getIncomingValue(I);
          else
            Operands[I][Idx] = P->getIncomingValueForBlock(InBB);
        }
      }
      return;
    }
    SmallMapVector<BasicBlock *, SmallVector<unsigned>, 4> Blocks;
    for (unsigned I : seq<unsigned>(Main->getNumIncomingValues())) {
      BasicBlock *InBB = Main->getIncomingBlock(I);
      if (!DT.isReachableFromEntry(InBB)) {
        Operands[I].assign(Phis.size(), PoisonValue::get(Main->getType()));
        continue;
      }
      Blocks.try_emplace(InBB).first->second.push_back(I);
    }
    for (auto [Idx, V] : enumerate(Phis)) {
      auto *P = dyn_cast<PHINode>(V);
      if (!P) {
        assert(isa<PoisonValue>(V) &&
               "Expected isa instruction or poison value.");
        for (unsigned I : seq<unsigned>(Main->getNumIncomingValues()))
          Operands[I][Idx] = V;
        continue;
      }
      for (unsigned I : seq<unsigned>(P->getNumIncomingValues())) {
        BasicBlock *InBB = P->getIncomingBlock(I);
        if (InBB == Main->getIncomingBlock(I)) {
          Operands[I][Idx] = P->getIncomingValue(I);
          continue;
        }
        auto *It = Blocks.find(InBB);
        if (It == Blocks.end())
          continue;
        Operands[It->second.front()][Idx] = P->getIncomingValue(I);
      }
    }
    // Duplicate the operand lists for blocks with several incoming edges.
    for (const auto &P : Blocks) {
      ArrayRef<unsigned> IncomingValues = P.second;
      if (IncomingValues.size() <= 1)
        continue;
      unsigned BasicI = IncomingValues.front();
      for (unsigned I : IncomingValues.drop_front()) {
        assert(all_of(enumerate(Operands[I]),
                      [&](const auto &Data) {
                        return !Data.value() ||
                               Data.value() ==
                                   Operands[BasicI][Data.index()];
                      }) &&
               "Expected empty operands list.");
        Operands[I] = Operands[BasicI];
      }
    }
  }
  ArrayRef<Value *> getOperands(unsigned I) const { return Operands[I]; }
};
} // namespace
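// Picks the first two instructions with different opcodes in VL as the
// (main, alt) pair for a bundle that has no common instruction state.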
static std::pair<Instruction *, Instruction *>
getMainAltOpsNoStateVL(ArrayRef<Value *> VL) {
  Instruction *MainOp = nullptr;
  Instruction *AltOp = nullptr;
  for (Value *V : VL) {
    auto *I = dyn_cast<Instruction>(V);
    if (!I)
      continue;
    if (!MainOp)
      MainOp = I;
    else if (MainOp->getOpcode() == I->getOpcode())
      continue;
    else if (!AltOp)
      AltOp = I;
  }
  assert(MainOp && AltOp && MainOp != AltOp &&
         "Expected different main and alt instructions.");
  return std::make_pair(MainOp, AltOp);
}
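// Deduplicates the scalars of a prospective bundle: builds the reuse shuffle
// mask, optionally pads the unique scalars with poison up to a full register
// size, and rejects bundles where deduplication would leave too little to
// vectorize.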
static bool tryToFindDuplicates(SmallVectorImpl<Value *> &VL,
                                SmallVectorImpl<int> &ReuseShuffleIndices,
                                const TargetTransformInfo &TTI,
                                const TargetLibraryInfo &TLI,
                                const InstructionsState &S,
                                const BoUpSLP::EdgeInfo &UserTreeIdx,
                                bool TryPad = false) {
  // Check that every instruction appears once in this bundle.
  SmallVector<Value *> UniqueValues;
  SmallDenseMap<Value *, unsigned, 16> UniquePositions(VL.size());
  for (Value *V : VL) {
    if (isConstant(V)) {
      ReuseShuffleIndices.emplace_back(
          isa<PoisonValue>(V) ? PoisonMaskElem : UniqueValues.size());
      UniqueValues.emplace_back(V);
      continue;
    }
    auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
    ReuseShuffleIndices.emplace_back(Res.first->second);
    if (Res.second)
      UniqueValues.emplace_back(V);
  }
  size_t NumUniqueScalarValues = UniqueValues.size();
  bool IsFullVectors = hasFullVectorsOrPowerOf2(
      TTI, getValueType(UniqueValues.front()), NumUniqueScalarValues);
  if (NumUniqueScalarValues == VL.size() &&
      (VectorizeNonPowerOf2 || IsFullVectors)) {
    ReuseShuffleIndices.clear();
  } else {
    // FIXME: Reshuffling scalars is not supported yet for nodes with
    // padding.
    if ((UserTreeIdx.UserTE &&
         UserTreeIdx.UserTE->hasNonWholeRegisterOrNonPowerOf2Vec(TTI)) ||
        !hasFullVectorsOrPowerOf2(TTI, getValueType(VL.front()),
                                  VL.size())) {
      LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
                           "for nodes with padding.\n");
      ReuseShuffleIndices.clear();
      return false;
    }
    LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
    if (NumUniqueScalarValues <= 1 || !IsFullVectors ||
        (UniquePositions.size() == 1 && all_of(UniqueValues, [](Value *V) {
           return isa<UndefValue>(V) || !isConstant(V);
         }))) {
      if (TryPad && UniquePositions.size() > 1 && NumUniqueScalarValues > 1 &&
          S.getMainOp()->isSafeToRemove() &&
          (S.areInstructionsWithCopyableElements() ||
           all_of(UniqueValues, IsaPred<Instruction, PoisonValue>))) {
        // Find the number of elements that forms full vectors.
        unsigned PWSz = getFullVectorNumberOfElements(
            TTI, UniqueValues.front()->getType(), UniqueValues.size());
        PWSz = std::min<unsigned>(PWSz, VL.size());
        if (PWSz == VL.size()) {
          // Same size after deduplication - keep the initial VL.
          ReuseShuffleIndices.clear();
        } else {
          // Pad unique values with poison to grow the vector.
          SmallVector<Value *> PaddedUniqueValues(UniqueValues.begin(),
                                                  UniqueValues.end());
          PaddedUniqueValues.append(
              PWSz - UniqueValues.size(),
              PoisonValue::get(UniqueValues.front()->getType()));
          // Check that the padded operations are still valid for
          // vectorization (div/rem are not).
          if ((!S.areInstructionsWithCopyableElements() &&
               !getSameOpcode(PaddedUniqueValues, TLI).valid()) ||
              (S.areInstructionsWithCopyableElements() &&
               S.isMulDivLikeOp() &&
               (S.getMainOp()->isIntDivRem() ||
                S.getMainOp()->isFPDivRem() ||
                isa<CallInst>(S.getMainOp())))) {
            LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
            ReuseShuffleIndices.clear();
            return false;
          }
          VL = std::move(PaddedUniqueValues);
        }
        return true;
      }
      LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
      ReuseShuffleIndices.clear();
      return false;
    }
    VL = std::move(UniqueValues);
  }
  return true;
}
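// Checks whether an alternate-opcode bundle is better represented as a
// SplitVectorize node: partitions the scalars by (main, alt) opcode, builds
// the corresponding reorder mask, and compares the cost of two narrower
// nodes plus an insert against the single shuffled node.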
bool BoUpSLP::canBuildSplitNode(ArrayRef<Value *> VL,
                                const InstructionsState &LocalState,
                                SmallVectorImpl<Value *> &Op1,
                                SmallVectorImpl<Value *> &Op2,
                                OrdersType &ReorderIndices) const {
  constexpr unsigned SmallNodeSize = 4;
  if (VL.size() <= SmallNodeSize ||
      TTI->preferAlternateOpcodeVectorization() ||
      !SplitAlternateInstructions)
    return false;

  LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *LocalState.getMainOp()
                    << ".\n");
  // Check if this is a duplicate of another split entry.
  for (TreeEntry *E : getSplitTreeEntries(LocalState.getMainOp())) {
    if (E->isSame(VL)) {
      LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at "
                        << *LocalState.getMainOp() << ".\n");
      return false;
    }
  }

  ReorderIndices.assign(VL.size(), VL.size());
  SmallBitVector Op1Indices(VL.size());
  for (auto [Idx, V] : enumerate(VL)) {
    auto *I = dyn_cast<Instruction>(V);
    if (!I) {
      Op1.push_back(V);
      Op1Indices.set(Idx);
      continue;
    }
    if ((LocalState.getAltOpcode() != LocalState.getOpcode() &&
         I->getOpcode() == LocalState.getOpcode()) ||
        (LocalState.getAltOpcode() == LocalState.getOpcode() &&
         !isAlternateInstruction(I, LocalState.getMainOp(),
                                 LocalState.getAltOp(), *TLI))) {
      Op1.push_back(V);
      Op1Indices.set(Idx);
      continue;
    }
    Op2.push_back(V);
  }
  Type *ScalarTy = getValueType(VL.front());
  VectorType *VecTy = getWidenedType(ScalarTy, VL.size());
  unsigned Opcode0 = LocalState.getOpcode();
  unsigned Opcode1 = LocalState.getAltOpcode();
  SmallBitVector OpcodeMask(getAltInstrMask(VL, ScalarTy, Opcode0, Opcode1));
  // Enable split node only if the operands do not form a legal alternate
  // instruction (like X86 addsub).
  SmallPtrSet<Value *, 4> UOp1(Op1.begin(), Op1.end());
  SmallPtrSet<Value *, 4> UOp2(Op2.begin(), Op2.end());
  if (UOp1.size() <= 1 || UOp2.size() <= 1 ||
      TTI->isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask) ||
      !hasFullVectorsOrPowerOf2(*TTI, Op1.front()->getType(), Op1.size()) ||
      !hasFullVectorsOrPowerOf2(*TTI, Op2.front()->getType(), Op2.size()))
    return false;
  // Build the reorder indices: Op1 scalars first, then Op2 scalars.
  unsigned Op1Cnt = 0, Op2Cnt = Op1.size();
  for (unsigned Idx : seq<unsigned>(VL.size())) {
    if (Op1Indices.test(Idx)) {
      ReorderIndices[Op1Cnt] = Idx;
      ++Op1Cnt;
    } else {
      ReorderIndices[Op2Cnt] = Idx;
      ++Op2Cnt;
    }
  }
  if (isIdentityOrder(ReorderIndices))
    ReorderIndices.clear();
  SmallVector<int> Mask;
  if (!ReorderIndices.empty())
    inversePermutation(ReorderIndices, Mask);
  unsigned NumParts = TTI->getNumberOfParts(VecTy);
  VectorType *Op1VecTy = getWidenedType(ScalarTy, Op1.size());
  VectorType *Op2VecTy = getWidenedType(ScalarTy, Op2.size());
  // Not profitable if the node fits into a single register anyway.
  if (NumParts >= VL.size())
    return false;
  constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput;
  InstructionCost InsertCost = ::getShuffleCost(
      *TTI, TTI::SK_InsertSubvector, VecTy, {}, Kind, Op1.size(), Op2VecTy);
  FixedVectorType *SubVecTy =
      getWidenedType(ScalarTy, std::max(Op1.size(), Op2.size()));
  InstructionCost NewShuffleCost =
      ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, SubVecTy, Mask, Kind);
  if (!LocalState.isCmpOp() && NumParts <= 1 &&
      (Mask.empty() || InsertCost >= NewShuffleCost))
    return false;
  if ((LocalState.getMainOp()->isBinaryOp() &&
       LocalState.getAltOp()->isBinaryOp() &&
       (LocalState.isShiftOp() || LocalState.isBitwiseLogicOp() ||
        LocalState.isAddSubLikeOp() || LocalState.isMulDivLikeOp())) ||
      (LocalState.getMainOp()->isCast() &&
       LocalState.getAltOp()->isCast()) ||
      (LocalState.getMainOp()->isUnaryOp() &&
       LocalState.getAltOp()->isUnaryOp())) {
    // Cost of the vectorized alternate node vs the split nodes.
    InstructionCost OriginalVecOpsCost =
        TTI->getArithmeticInstrCost(Opcode0, VecTy, Kind) +
        TTI->getArithmeticInstrCost(Opcode1, VecTy, Kind);
    SmallVector<int> OriginalMask(VL.size(), PoisonMaskElem);
    for (unsigned Idx : seq<unsigned>(VL.size())) {
      if (isa<PoisonValue>(VL[Idx]))
        continue;
      OriginalMask[Idx] = Idx + (Op1Indices.test(Idx) ? 0 : VL.size());
    }
    InstructionCost OriginalCost =
        OriginalVecOpsCost + ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc,
                                              VecTy, OriginalMask, Kind);
    InstructionCost NewVecOpsCost =
        TTI->getArithmeticInstrCost(Opcode0, Op1VecTy, Kind) +
        TTI->getArithmeticInstrCost(Opcode1, Op2VecTy, Kind);
    InstructionCost NewCost =
        NewVecOpsCost + InsertCost +
        (!VectorizableTree.empty() && VectorizableTree.front()->hasState() &&
                 VectorizableTree.front()->getOpcode() == Instruction::Store
             ? NewShuffleCost
             : 0);
    // If not profitable to split - exit.
    if (NewCost >= OriginalCost)
      return false;
  }
  return true;
}
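// Analysis that builds the InstructionsState for a bundle, including the
// "copyable elements" mode where non-matching scalars are modeled as
// idempotent instructions of the main opcode (e.g. add %x, 0), plus the
// profitability checks for that mode and construction of operand lists.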
class InstructionsCompatibilityAnalysis {
  DominatorTree &DT;
  const DataLayout &DL;
  const TargetTransformInfo &TTI;
  const TargetLibraryInfo &TLI;
  unsigned MainOpcode = 0;
  Instruction *MainOp = nullptr;

  /// Opcodes which support the copyable-element representation.
  static bool isSupportedOpcode(const unsigned Opcode) {
    return Opcode == Instruction::Add || Opcode == Instruction::Sub ||
           Opcode == Instruction::LShr || Opcode == Instruction::Shl ||
           Opcode == Instruction::SDiv || Opcode == Instruction::UDiv ||
           Opcode == Instruction::And || Opcode == Instruction::Or ||
           Opcode == Instruction::Xor;
  }

  /// Identifies the best candidate instruction to represent the main opcode
  /// operation of the whole bundle.
  void findAndSetMainInstruction(ArrayRef<Value *> VL, const BoUpSLP &R) {
    BasicBlock *Parent = nullptr;
    auto IsSupportedInstruction = [&](Instruction *I, bool AnyUndef) {
      if (!I)
        return false;
      if (AnyUndef &&
          (I->isIntDivRem() || I->isFPDivRem() || isa<CallInst>(I)))
        return false;
      return isSupportedOpcode(I->getOpcode()) && !R.isDeleted(I);
    };
    // Gather candidate instructions per opcode, preferring the deepest
    // block (DFS-wise) so that all copyable operands dominate the bundle.
    SmallDenseSet<Value *, 8> Operands;
    SmallMapVector<unsigned, SmallVector<Instruction *>, 4> Candidates;
    bool AnyUndef = false;
    for (Value *V : VL) {
      auto *I = dyn_cast<Instruction>(V);
      if (!I) {
        AnyUndef |= isa<UndefValue>(V);
        continue;
      }
      if (Candidates.empty()) {
        Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
        Parent = I->getParent();
        Operands.insert(I->op_begin(), I->op_end());
        continue;
      }
      if (Parent == I->getParent()) {
        Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
        Operands.insert(I->op_begin(), I->op_end());
        continue;
      }
      auto *NodeA = DT.getNode(Parent);
      auto *NodeB = DT.getNode(I->getParent());
      assert(NodeA && "Should only process reachable instructions");
      assert(NodeB && "Should only process reachable instructions");
      assert((NodeA == NodeB) ==
                 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
             "Different nodes should have different DFS numbers");
      if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn()) {
        // Deeper block found - restart the candidate collection.
        Candidates.clear();
        Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
        Parent = I->getParent();
        Operands.clear();
        Operands.insert(I->op_begin(), I->op_end());
      }
    }
    unsigned BestOpcodeNum = 0;
    bool UsedOutside = false;
    for (const auto &P : Candidates) {
      bool PUsedOutside = any_of(P.second, [](Instruction *I) {
        return any_of(I->users(), [I](User *U) {
          auto *UI = dyn_cast<Instruction>(U);
          return UI && UI->getParent() != I->getParent();
        });
      });
      if (UsedOutside && !PUsedOutside)
        continue;
      if (!UsedOutside && PUsedOutside)
        BestOpcodeNum = 0;
      if (P.second.size() < BestOpcodeNum)
        continue;
      // Do not select an instruction as main if it feeds another candidate.
      if (!PUsedOutside && any_of(P.second, [&](Instruction *I) {
            return Operands.contains(I);
          }))
        continue;
      UsedOutside = PUsedOutside;
      for (Instruction *I : P.second) {
        if (IsSupportedInstruction(I, AnyUndef)) {
          MainOp = I;
          MainOpcode = I->getOpcode();
          BestOpcodeNum = P.second.size();
          break;
        }
      }
    }
    // All other instructions must live in the main op's block.
    if (MainOp && !all_of(VL, [&](Value *V) {
          auto *I = dyn_cast<Instruction>(V);
          return !I || I->getParent() == MainOp->getParent();
        })) {
      MainOp = nullptr;
      MainOpcode = 0;
    }
  }

  /// Returns the idempotent right-hand operand for the main opcode, i.e. a
  /// value V such that "x MainOpcode V == x".
  Value *selectBestIdempotentValue() const {
    assert(isSupportedOpcode(MainOpcode) && "Unsupported opcode");
    switch (MainOpcode) {
    case Instruction::Add:
    case Instruction::Sub:
    case Instruction::Or:
    case Instruction::Xor:
    case Instruction::LShr:
    case Instruction::Shl:
      return ConstantInt::getNullValue(MainOp->getOperand(1)->getType());
    case Instruction::SDiv:
    case Instruction::UDiv:
      return ConstantInt::get(MainOp->getOperand(1)->getType(), 1);
    case Instruction::And:
      return ConstantInt::getAllOnesValue(MainOp->getOperand(1)->getType());
    default:
      break;
    }
    llvm_unreachable("Unsupported opcode");
  }

  /// Builds the operand list for \p V: for a copyable element that is the
  /// value itself plus the idempotent constant.
  SmallVector<Value *> getOperands(const InstructionsState &S,
                                   Value *V) const {
    if (!S.isCopyableElement(V))
      return SmallVector<Value *>(cast<Instruction>(V)->operand_values());
    assert(isSupportedOpcode(MainOpcode) && "Unsupported opcode");
    return {V, selectBestIdempotentValue()};
  }

  void buildOriginalOperands(const InstructionsState &S, ArrayRef<Value *> VL,
                             SmallVectorImpl<BoUpSLP::ValueList> &Operands)
      const {
    unsigned ShuffleOrOp =
        S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector
                         : S.getOpcode();
    Instruction *VL0 = S.getMainOp();
    switch (ShuffleOrOp) {
    case Instruction::PHI: {
      auto *PH = cast<PHINode>(VL0);
      PHIHandler Handler(DT, PH, VL);
      Handler.buildOperands();
      Operands.assign(PH->getNumOperands(), {});
      for (unsigned I : seq<unsigned>(PH->getNumOperands()))
        Operands[I].assign(Handler.getOperands(I).begin(),
                           Handler.getOperands(I).end());
      return;
    }
    case Instruction::ExtractValue:
    case Instruction::ExtractElement:
      // Extracts keep the source vector as their single operand.
      Operands.assign(1, {VL.size(), VL0->getOperand(0)});
      return;
    case Instruction::InsertElement:
      Operands.assign(2, {VL.size(), nullptr});
      for (auto [Idx, V] : enumerate(VL)) {
        auto *IE = cast<InsertElementInst>(V);
        for (auto [OpIdx, Ops] : enumerate(Operands))
          Ops[Idx] = IE->getOperand(OpIdx);
      }
      return;
    case Instruction::Load:
      Operands.assign(
          1, {VL.size(),
              PoisonValue::get(
                  cast<LoadInst>(VL0)->getPointerOperandType())});
      for (auto [V, Op] : zip(VL, Operands.back())) {
        auto *LI = dyn_cast<LoadInst>(V);
        if (!LI)
          continue;
        Op = LI->getPointerOperand();
      }
      return;
    case Instruction::ZExt:
    case Instruction::SExt:
    case Instruction::FPToUI:
    case Instruction::FPToSI:
    case Instruction::FPExt:
    case Instruction::PtrToInt:
    case Instruction::IntToPtr:
    case Instruction::SIToFP:
    case Instruction::UIToFP:
    case Instruction::Trunc:
    case Instruction::FPTrunc:
    case Instruction::BitCast:
    case Instruction::ICmp:
    case Instruction::FCmp:
    case Instruction::Select:
    case Instruction::FNeg:
    case Instruction::Add:
    case Instruction::FAdd:
    case Instruction::Sub:
    case Instruction::FSub:
    case Instruction::Mul:
    case Instruction::FMul:
    case Instruction::UDiv:
    case Instruction::SDiv:
    case Instruction::FDiv:
    case Instruction::URem:
    case Instruction::SRem:
    case Instruction::FRem:
    case Instruction::Shl:
    case Instruction::LShr:
    case Instruction::AShr:
    case Instruction::And:
    case Instruction::Or:
    case Instruction::Xor:
    case Instruction::Freeze:
    case Instruction::Store:
    case Instruction::ShuffleVector:
      Operands.assign(VL0->getNumOperands(), {VL.size(), nullptr});
      for (auto [Idx, V] : enumerate(VL)) {
        auto *I = dyn_cast<Instruction>(V);
        if (!I) {
          for (auto [OpIdx, Ops] : enumerate(Operands))
            Ops[Idx] = PoisonValue::get(VL0->getOperand(OpIdx)->getType());
          continue;
        }
        auto [Op, ConvertedOps] = convertTo(I, S);
        for (auto [OpIdx, Ops] : enumerate(Operands))
          Ops[Idx] = ConvertedOps[OpIdx];
      }
      return;
    case Instruction::GetElementPtr: {
      Operands.assign(2, {VL.size(), nullptr});
      // Need to cast all indices to the same type before vectorization.
      const unsigned IndexIdx = 1;
      Type *VL0Ty = VL0->getOperand(IndexIdx)->getType();
      Type *Ty =
          all_of(VL,
                 [VL0Ty](Value *V) {
                   auto *GEP = dyn_cast<GetElementPtrInst>(V);
                   return !GEP ||
                          VL0Ty == GEP->getOperand(IndexIdx)->getType();
                 })
              ? VL0Ty
              : DL.getIndexType(cast<GetElementPtrInst>(VL0)
                                    ->getPointerOperandType()
                                    ->getScalarType());
      for (auto [Idx, V] : enumerate(VL)) {
        auto *GEP = dyn_cast<GetElementPtrInst>(V);
        if (!GEP) {
          Operands[0][Idx] = V;
          Operands[1][Idx] = ConstantInt::getNullValue(Ty);
          continue;
        }
        Operands[0][Idx] = GEP->getPointerOperand();
        auto *Op = GEP->getOperand(IndexIdx);
        auto *CI = dyn_cast<ConstantInt>(Op);
        Operands[1][Idx] =
            CI ? ConstantFoldIntegerCast(CI, Ty,
                                         CI->getValue().isSignBitSet(), DL)
               : Op;
      }
      return;
    }
    case Instruction::Call: {
      auto *CI = cast<CallInst>(VL0);
      Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, &TLI);
      for (unsigned Idx : seq<unsigned>(CI->arg_size())) {
        if (isVectorIntrinsicWithScalarOpAtArg(ID, Idx, &TTI))
          continue;
        auto &Ops = Operands.emplace_back();
        for (Value *V : VL) {
          auto *I = dyn_cast<Instruction>(V);
          Ops.push_back(I ? I->getOperand(Idx)
                          : PoisonValue::get(
                                VL0->getOperand(Idx)->getType()));
        }
      }
      return;
    }
    default:
      break;
    }
    llvm_unreachable("Unexpected vectorization of the instructions.");
  }

public:
  InstructionsCompatibilityAnalysis(DominatorTree &DT, const DataLayout &DL,
                                    const TargetTransformInfo &TTI,
                                    const TargetLibraryInfo &TLI)
      : DT(DT), DL(DL), TTI(TTI), TLI(TLI) {}

  InstructionsState
  buildInstructionsState(ArrayRef<Value *> VL, const BoUpSLP &R,
                         bool TryCopyableElementsVectorization,
                         bool WithProfitabilityCheck = false,
                         bool SkipSameCodeCheck = false) {
    InstructionsState S = (SkipSameCodeCheck || !allSameBlock(VL))
                              ? InstructionsState::invalid()
                              : getSameOpcode(VL, TLI);
    if (S)
      return S;
    if (!VectorizeCopyableElements || !TryCopyableElementsVectorization)
      return S;
    findAndSetMainInstruction(VL, R);
    if (!MainOp)
      return InstructionsState::invalid();
    S = InstructionsState(MainOp, MainOp, /*HasCopyables=*/true);
    if (!WithProfitabilityCheck)
      return S;
    // Check if it is profitable to vectorize the instruction bundle.
    SmallVector<BoUpSLP::ValueList> Operands = buildOperands(S, VL);
    auto BuildCandidates =
        [](SmallVectorImpl<std::pair<Value *, Value *>> &Candidates,
           Value *V1, Value *V2) {
          auto *I1 = dyn_cast<Instruction>(V1);
          auto *I2 = dyn_cast<Instruction>(V2);
          if (I1 && I2 && I1->getOpcode() == I2->getOpcode() &&
              I1->getParent() != I2->getParent())
            return;
          Candidates.emplace_back(V1, V2);
        };
    if (VL.size() == 2) {
      // Check if the operands allow better vectorization.
      SmallVector<std::pair<Value *, Value *>, 4> Candidates1, Candidates2;
      BuildCandidates(Candidates1, Operands[0][0], Operands[0][1]);
      BuildCandidates(Candidates2, Operands[1][0], Operands[1][1]);
      bool Res = !Candidates1.empty() && !Candidates2.empty() &&
                 R.findBestRootPair(Candidates1) &&
                 R.findBestRootPair(Candidates2);
      if (!Res && isCommutative(MainOp)) {
        Candidates1.clear();
        Candidates2.clear();
        BuildCandidates(Candidates1, Operands[0][0], Operands[1][1]);
        BuildCandidates(Candidates2, Operands[1][0], Operands[0][1]);
        Res = !Candidates1.empty() && !Candidates2.empty() &&
              R.findBestRootPair(Candidates1) &&
              R.findBestRootPair(Candidates2);
      }
      if (!Res)
        return InstructionsState::invalid();
    }
    // Compare the cost of the idempotent vector operation with the scalar
    // one.
    FixedVectorType *VecTy = getWidenedType(MainOp->getType(), VL.size());
    switch (MainOpcode) {
    case Instruction::Add:
    case Instruction::Sub:
    case Instruction::LShr:
    case Instruction::Shl:
    case Instruction::SDiv:
    case Instruction::UDiv:
    case Instruction::And:
    case Instruction::Or:
    case Instruction::Xor: {
      InstructionCost ScalarCost = TTI.getInstructionCost(
          MainOp, TTI::TCK_RecipThroughput);
      InstructionCost VectorCost = TTI.getArithmeticInstrCost(
          MainOpcode, VecTy, TTI::TCK_RecipThroughput);
      if (VectorCost > ScalarCost)
        return InstructionsState::invalid();
      break;
    }
    default:
      break;
    }
    assert(Operands.size() == 2 && "Unexpected number of operands!");
    unsigned CopyableNum =
        count_if(VL, [&](Value *V) { return S.isCopyableElement(V); });
    if (CopyableNum < VL.size() / 2)
      return S;
    // Too many phi copyables - exit.
    const unsigned Limit = VL.size() / 24;
    if ((CopyableNum >= VL.size() - Limit ||
         (CopyableNum >= VL.size() - 1 && VL.size() > 4) ||
         CopyableNum >= MaxPHINumOperands) &&
        all_of(VL, [&](Value *V) {
          return isa<PHINode>(V) || !S.isCopyableElement(V);
        }))
      return InstructionsState::invalid();
    // Division-like copyable operations cannot take padded operands that may
    // introduce immediate UB - exit.
    for (auto &Ops : Operands) {
      if (S.isMulDivLikeOp() && !allConstant(Ops) &&
          any_of(Ops, IsaPred<UndefValue, PoisonValue>))
        return InstructionsState::invalid();
    }
    // Check that the main operands are compatible enough to form a node.
    auto CheckOperand = [&](ArrayRef<Value *> Ops) {
      if (allConstant(Ops) || isSplat(Ops))
        return true;
      // Rarely repeated values limit profitability.
      constexpr unsigned Limit = 4;
      if (Ops.size() >= Limit) {
        SmallDenseMap<const Value *, unsigned> Counters;
        for (Value *V : Ops)
          ++Counters.try_emplace(V, 0).first->second;
        if (any_of(Counters,
                   [](const auto &C) { return C.second == 1; }))
          return false;
      }
      InstructionsCompatibilityAnalysis Analysis(DT, DL, TTI, TLI);
      InstructionsState OpS = Analysis.buildInstructionsState(
          Ops, R, /*TryCopyableElementsVectorization=*/true);
      if (!OpS ||
          (OpS.getOpcode() == Instruction::PHI && !allSameBlock(Ops)))
        return false;
      unsigned CopyableNum =
          count_if(Ops, [&](Value *V) { return OpS.isCopyableElement(V); });
      return CopyableNum <= VL.size() / 2;
    };
    if (!CheckOperand(Operands.front()))
      return InstructionsState::invalid();
    return S;
  }

  SmallVector<BoUpSLP::ValueList> buildOperands(const InstructionsState &S,
                                                ArrayRef<Value *> VL) {
    assert(S && "Invalid state!");
    SmallVector<BoUpSLP::ValueList> Operands;
    if (S.areInstructionsWithCopyableElements()) {
      MainOp = S.getMainOp();
      MainOpcode = S.getOpcode();
      Operands.assign(MainOp->getNumOperands(),
                      BoUpSLP::ValueList(VL.size(), nullptr));
      for (auto [Idx, V] : enumerate(VL)) {
        SmallVector<Value *> OperandsForValue = getOperands(S, V);
        for (auto [OperandIdx, Operand] : enumerate(OperandsForValue))
          Operands[OperandIdx][Idx] = Operand;
      }
    } else {
      buildOriginalOperands(S, VL, Operands);
    }
    return Operands;
  }
};
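// Gate before building a new tree node: rebuilds the instruction state
// (optionally in copyable mode), then runs the legality and profitability
// filters - same block, not ephemeral, not already vectorized, recursion
// depth not exhausted, not an unprofitable alternate node - and reports
// whether to try vectorizing or to gather.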
BoUpSLP::ScalarsVectorizationLegality BoUpSLP::getScalarsVectorizationLegality(
    bool TryCopyableElementsVectorization) const {
  InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
  InstructionsState S = Analysis.buildInstructionsState(
      VL, *this, TryCopyableElementsVectorization,
      true, TryCopyableElementsVectorization);
  bool AreScatterAllGEPSameBlock = false;
  SmallVector<unsigned> SortedIndices;
  bool IsScatterVectorizeUserTE =
      UserTreeIdx.UserTE &&
      UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
  AreScatterAllGEPSameBlock =
      *SE, SortedIndices));
  if (!AreScatterAllGEPSameBlock) {
    LLVM_DEBUG(dbgs() << "SLP: Try split and if failed, gathering due to "
                         "C,S,B,O, small shuffle.\n");
    return ScalarsVectorizationLegality(S, false,
  assert(It != VL.end() && "Expected at least one GEP.");
  assert(S && "Must be valid.");
  return ScalarsVectorizationLegality(S, false,
  BasicBlock *BB = S.getMainOp()->getParent();
      !DT->isReachableFromEntry(BB)) {
    return ScalarsVectorizationLegality(S, false);
  return ScalarsVectorizationLegality(S, false,
  if (S.getOpcode() == Instruction::ExtractElement &&
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n");
    return ScalarsVectorizationLegality(S, false);
      (S.isAltShuffle() || VL.size() < 4 ||
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
    return ScalarsVectorizationLegality(S, false);
  LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.getMainOp() << ".\n");
  for (TreeEntry *E : getTreeEntries(S.getMainOp())) {
    if (E->isSame(VL)) {
      LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.getMainOp()
      return ScalarsVectorizationLegality(S, false);
      (S.getOpcode() == Instruction::PHI && isa<PHINode>(V) &&
       LI->getLoopFor(S.getMainOp()->getParent()) &&
    return ScalarsVectorizationLegality(S, false);
  if (!S || !S.isAltShuffle() || VL.size() > 2)
  SmallVector<unsigned, 8> InstsCount;
  for (Value *V : VL) {
      return isa<Instruction>(Op) || isVectorLikeInstWithConstOps(Op);
  bool IsCommutative =
  if ((IsCommutative &&
       std::accumulate(InstsCount.begin(), InstsCount.end(), 0) < 2) ||
      all_of(InstsCount, [](unsigned ICnt) { return ICnt < 2; })))
  assert(VL.size() == 2 && "Expected only 2 alternate op instructions.");
  for (int Op : seq<int>(S.getMainOp()->getNumOperands()))
                            I2->getOperand(Op));
  if (static_cast<unsigned>(count_if(
          Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
          })) >= S.getMainOp()->getNumOperands() / 2)
  if (S.getMainOp()->getNumOperands() > 2)
  if (IsCommutative) {
    Candidates.clear();
    for (int Op = 0, E = S.getMainOp()->getNumOperands(); Op < E; ++Op)
                              I2->getOperand((Op + 1) % E));
        Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
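  // NotProfitableForVectorization: a 2-wide alternate-opcode bundle is only
  // worth keeping if its operands share instructions; when fewer than half of
  // the operand pairs are compatible (also checking shifted pairs for
  // commutative ops), the bundle is treated as unprofitable.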
  bool AreAllSameBlock = !AreScatterAllGEPSameBlock;
  bool AreAllSameInsts = AreAllSameBlock || AreScatterAllGEPSameBlock;
  if (!AreAllSameInsts || isSplat(VL) ||
      NotProfitableForVectorization(VL)) {
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle.\n");
    return ScalarsVectorizationLegality(S, false);
  if (!EphValues.empty()) {
    for (Value *V : VL) {
      if (EphValues.count(V)) {
                          << ") is ephemeral.\n");
        return ScalarsVectorizationLegality(S, false,
  if (S.isAltShuffle()) {
    auto GetNumVectorizedExtracted = [&]() {
          all_of(I->operands(), [&](const Use &U) {
            return isa<ExtractElementInst>(U.get());
        else if (!I->hasOneUser() && !areAllUsersVectorized(I, UserIgnoreList))
      return std::make_pair(Vectorized, Extracted);
    auto [Vectorized, Extracted] = GetNumVectorizedExtracted();
    bool PreferScalarize = !Vectorized.isAllOnes() && VL.size() == 2;
    if (!Vectorized.isAllOnes() && !PreferScalarize) {
      Type *ScalarTy = VL.front()->getType();
          false, true, Kind);
          *TTI, ScalarTy, VecTy, Vectorized,
          true, false, Kind, false);
      PreferScalarize = VectorizeCostEstimate > ScalarizeCostEstimate;
    if (PreferScalarize) {
      LLVM_DEBUG(dbgs() << "SLP: The instructions are in tree and alternate "
                           "node is not profitable.\n");
      return ScalarsVectorizationLegality(S, false);
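  // Alternate nodes whose lanes are partially extracted are costed both ways:
  // the scalarization estimate of the remaining lanes is compared against the
  // vectorization estimate, and the cheaper strategy wins.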
  if (UserIgnoreList && !UserIgnoreList->empty()) {
    for (Value *V : VL) {
      if (UserIgnoreList->contains(V)) {
        LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
        return ScalarsVectorizationLegality(S, false);
  return ScalarsVectorizationLegality(S, true);
}

                          unsigned InterleaveFactor) {
  SmallVector<int> ReuseShuffleIndices;
  auto TrySplitNode = [&](const InstructionsState &LocalState) {
    if (!canBuildSplitNode(VL, LocalState, Op1, Op2, ReorderIndices))
    auto Invalid = ScheduleBundle::invalid();
    auto *TE = newTreeEntry(VL, TreeEntry::SplitVectorize, Invalid, LocalState,
                            UserTreeIdx, {}, ReorderIndices);
            getSameValuesTreeEntry(S.getMainOp(), Op, true))) {
        TE->CombinedEntriesWithIndices.emplace_back(VectorizableTree.size(),
                                                    Idx == 0 ? 0 : Op1.size());
        (void)newTreeEntry(Op, TreeEntry::NeedToGather, Invalid, S, {TE, Idx});
      TE->CombinedEntriesWithIndices.emplace_back(VectorizableTree.size(),
                                                  Idx == 0 ? 0 : Op1.size());
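  // A SplitVectorize entry records both halves in CombinedEntriesWithIndices,
  // the second half starting at offset Op1.size(); halves that match an
  // existing tree entry are reused, the rest become gather children.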
  bool AreConsts = false;
  for (Value *V : VL) {
  if (AreOnlyConstsWithPHIs(VL)) {
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to all constants and PHIs.\n");
    newGatherTreeEntry(VL, InstructionsState::invalid(), UserTreeIdx);
  ScalarsVectorizationLegality Legality = getScalarsVectorizationLegality(
      VL, Depth, UserTreeIdx, false);
  InstructionsState S = Legality.getInstructionsState();
  if (!Legality.isLegal()) {
    if (Legality.trySplitVectorize()) {
      if (MainOp && AltOp && TrySplitNode(InstructionsState(MainOp, AltOp)))
    Legality = getScalarsVectorizationLegality(
        VL, Depth, UserTreeIdx, true);
    if (!Legality.isLegal()) {
      if (Legality.tryToFindDuplicates())
      newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
    S = Legality.getInstructionsState();
  if (S.isAltShuffle() && TrySplitNode(S))
    newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
  bool IsScatterVectorizeUserTE =
      UserTreeIdx.UserTE &&
      UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
  StridedPtrInfo SPtrInfo;
  TreeEntry::EntryState State = getScalarsVectorizationState(
      S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps, SPtrInfo);
  if (State == TreeEntry::NeedToGather) {
    newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
  auto &BSRef = BlocksSchedules[BB];
    BSRef = std::make_unique<BlockScheduling>(BB);
  BlockScheduling &BS = *BSRef;
  std::optional<ScheduleBundle *> BundlePtr =
      BS.tryScheduleBundle(UniqueValues.getArrayRef(), this, S, UserTreeIdx);
#ifdef EXPENSIVE_CHECKS
  if (!BundlePtr || (*BundlePtr && !*BundlePtr.value())) {
    LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
    if (S.isAltShuffle() && ReuseShuffleIndices.empty() && TrySplitNode(S))
    newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
    NonScheduledFirst.insert(VL.front());
    if (S.getOpcode() == Instruction::Load &&
        BS.ScheduleRegionSize < BS.ScheduleRegionSizeLimit)
  InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
  ScheduleBundle Empty;
  ScheduleBundle &Bundle = BundlePtr.value() ? *BundlePtr.value() : Empty;
  LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
  unsigned ShuffleOrOp =
      S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
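  // Alternate-opcode bundles are dispatched below as
  // Instruction::ShuffleVector so the switch handles exactly one logical
  // opcode per tree entry.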
  auto CreateOperandNodes = [&](TreeEntry *TE, const auto &Operands) {
    SmallVector<unsigned> PHIOps;
      if ((!S || S.getOpcode() != Instruction::PHI) || S.isAltShuffle())
    for (unsigned I : PHIOps)
      buildTreeRec(Operands[I], Depth + 1, {TE, I});
  switch (ShuffleOrOp) {
  case Instruction::PHI: {
    newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
    TE->setOperands(Operands);
    CreateOperandNodes(TE, Operands);
  case Instruction::ExtractValue:
  case Instruction::ExtractElement: {
    if (CurrentOrder.empty()) {
      LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
      dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
      for (unsigned Idx : CurrentOrder)
        dbgs() << " " << Idx;
    TreeEntry *TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,
                                 ReuseShuffleIndices, CurrentOrder);
               "(ExtractValueInst/ExtractElementInst).\n");
    TE->setOperands(Operands);
  case Instruction::InsertElement: {
    assert(ReuseShuffleIndices.empty() && "All inserts should be unique");
    auto OrdCompare = [](const std::pair<int, int> &P1,
                         const std::pair<int, int> &P2) {
      return P1.first > P2.first;
                        decltype(OrdCompare)>
        Indices(OrdCompare);
    for (int I = 0, E = VL.size(); I < E; ++I) {
      Indices.emplace(Idx, I);
    OrdersType CurrentOrder(VL.size(), VL.size());
    bool IsIdentity = true;
    for (int I = 0, E = VL.size(); I < E; ++I) {
      CurrentOrder[Indices.top().second] = I;
      IsIdentity &= Indices.top().second == I;
      CurrentOrder.clear();
    TreeEntry *TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (InsertElementInst).\n");
    TE->setOperands(Operands);
    buildTreeRec(TE->getOperand(1), Depth + 1, {TE, 1});
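  // Insert indices are ordered through a min-queue; an identity order is
  // dropped (CurrentOrder cleared) so that only genuinely shuffled inserts
  // carry a reorder mask.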
  case Instruction::Load: {
    TreeEntry *TE = nullptr;
    case TreeEntry::Vectorize:
      TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,
                        ReuseShuffleIndices, CurrentOrder, InterleaveFactor);
      if (CurrentOrder.empty())
        LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (LoadInst).\n");
      else
        LLVM_DEBUG(dbgs()
                   << "SLP: added a new TreeEntry (jumbled LoadInst).\n");
    case TreeEntry::CompressVectorize:
      TE = newTreeEntry(VL, TreeEntry::CompressVectorize, Bundle, S,
                        UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
      LLVM_DEBUG(
          dbgs() << "SLP: added a new TreeEntry (masked LoadInst + compress).\n");
    case TreeEntry::StridedVectorize:
      TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
                        UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
      TreeEntryToStridedPtrInfoMap[TE] = SPtrInfo;
      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (strided LoadInst).\n");
    case TreeEntry::ScatterVectorize:
      TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
                        UserTreeIdx, ReuseShuffleIndices);
      LLVM_DEBUG(
          dbgs() << "SLP: added a new TreeEntry (non-consecutive LoadInst).\n");
    case TreeEntry::CombinedVectorize:
    case TreeEntry::SplitVectorize:
    case TreeEntry::NeedToGather:
    if (!CurrentOrder.empty() && State != TreeEntry::ScatterVectorize) {
      assert(Operands.size() == 1 && "Expected a single operand only");
      SmallVector<int> Mask;
    TE->setOperands(Operands);
    if (State == TreeEntry::ScatterVectorize)
      buildTreeRec(PointerOps, Depth + 1, {TE, 0});
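  // Loads take one of four shapes: consecutive (Vectorize, optionally
  // interleaved), masked-load-plus-compress, strided (stride info saved in
  // TreeEntryToStridedPtrInfoMap), or scatter, whose PointerOps are
  // recursively built as operand 0.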
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    auto [PrevMaxBW, PrevMinBW] = CastMaxMinBWSizes.value_or(
        std::make_pair(std::numeric_limits<unsigned>::min(),
                       std::numeric_limits<unsigned>::max()));
    if (ShuffleOrOp == Instruction::ZExt ||
        ShuffleOrOp == Instruction::SExt) {
      CastMaxMinBWSizes = std::make_pair(
          std::max<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
          std::min<unsigned>(
    } else if (ShuffleOrOp == Instruction::Trunc) {
      CastMaxMinBWSizes = std::make_pair(
          std::max<unsigned>(
          std::min<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
    TreeEntry *TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,
                                 ReuseShuffleIndices);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CastInst).\n");
    TE->setOperands(Operands);
      buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
    if (ShuffleOrOp == Instruction::Trunc) {
      ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
    } else if (ShuffleOrOp == Instruction::SIToFP ||
               ShuffleOrOp == Instruction::UIToFP) {
      unsigned NumSignBits =
        APInt Mask = DB->getDemandedBits(OpI);
        NumSignBits = std::max(NumSignBits, Mask.countl_zero());
      if (NumSignBits * 2 >=
        ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
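  // Cast bundles feed the min-bitwidth analysis: zext/sext and trunc update
  // CastMaxMinBWSizes, while trunc and (s|u)itofp operands with enough known
  // sign/zero bits are queued in ExtraBitWidthNodes for later narrowing.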
  case Instruction::ICmp:
  case Instruction::FCmp: {
    TreeEntry *TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,
                                 ReuseShuffleIndices);
           "Commutative Predicate mismatch");
      Operands.back() = Ops.getVL(1);
      if (Cmp->getPredicate() != P0)
    TE->setOperands(Operands);
    buildTreeRec(Operands.front(), Depth + 1, {TE, 0});
    buildTreeRec(Operands.back(), Depth + 1, {TE, 1});
    if (ShuffleOrOp == Instruction::ICmp) {
      unsigned NumSignBits0 =
      if (NumSignBits0 * 2 >=
        ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
      unsigned NumSignBits1 =
      if (NumSignBits1 * 2 >=
        ExtraBitWidthNodes.insert(getOperandEntry(TE, 1)->Idx);
  case Instruction::Select:
  case Instruction::FNeg:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::Freeze: {
    TreeEntry *TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,
                                 ReuseShuffleIndices);
    LLVM_DEBUG(
        dbgs() << "SLP: added a new TreeEntry "
                  "(SelectInst/UnaryOperator/BinaryOperator/FreezeInst).\n");
      Operands[0] = Ops.getVL(0);
      Operands[1] = Ops.getVL(1);
    TE->setOperands(Operands);
      buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
  case Instruction::GetElementPtr: {
    TreeEntry *TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,
                                 ReuseShuffleIndices);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (GetElementPtrInst).\n");
    TE->setOperands(Operands);
      buildTreeRec(Operands[I], Depth + 1, {TE, I});
  case Instruction::Store: {
    bool Consecutive = CurrentOrder.empty();
    TreeEntry *TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,
                                 ReuseShuffleIndices, CurrentOrder);
      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (StoreInst).\n");
      LLVM_DEBUG(dbgs()
                 << "SLP: added a new TreeEntry (jumbled StoreInst).\n");
    TE->setOperands(Operands);
    buildTreeRec(TE->getOperand(0), Depth + 1, {TE, 0});
  case Instruction::Call: {
    TreeEntry *TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,
                                 ReuseShuffleIndices);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CallInst).\n");
      Operands[0] = Ops.getVL(0);
      Operands[1] = Ops.getVL(1);
    TE->setOperands(Operands);
      buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
  case Instruction::ShuffleVector: {
    TreeEntry *TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,
                                 ReuseShuffleIndices);
    if (S.isAltShuffle()) {
      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (isAltShuffle).\n");
      LLVM_DEBUG(dbgs()
                 << "SLP: added a new TreeEntry (ShuffleVectorInst).\n");
           "Expected different main/alternate predicates.");
      TE->setOperands(Operands);
      buildTreeRec(Operands.front(), Depth + 1, {TE, 0});
      buildTreeRec(Operands.back(), Depth + 1, {TE, 1});
      Operands[0] = Ops.getVL(0);
      Operands[1] = Ops.getVL(1);
    TE->setOperands(Operands);
      buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});

    for (const auto *Ty : ST->elements())
      if (Ty != *ST->element_begin())
    N *= ST->getNumElements();
    EltTy = *ST->element_begin();
    N *= AT->getNumElements();
    EltTy = AT->getElementType();
    N *= VT->getNumElements();
    EltTy = VT->getElementType();
  size_t VTSize = DL->getTypeStoreSizeInBits(getWidenedType(EltTy, N));
  if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
      VTSize != DL->getTypeStoreSizeInBits(T))
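// Homogeneous structs, arrays and vectors are flattened to (EltTy, N) above;
// the widened type is usable only if its store size matches the original
// aggregate exactly and lies within [MinVecRegSize, MaxVecRegSize].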
                              bool ResizeAllowed) const {
  assert(It != VL.end() && "Expected at least one extract instruction.");
  Value *Vec = E0->getOperand(0);
  CurrentOrder.clear();
  if (E0->getOpcode() == Instruction::ExtractValue) {
  unsigned E = VL.size();
  if (!ResizeAllowed && NElts != E)
  unsigned MinIdx = NElts, MaxIdx = 0;
    if (Inst->getOperand(0) != Vec)
    const unsigned ExtIdx = *Idx;
    if (ExtIdx >= NElts)
    Indices[I] = ExtIdx;
    if (MinIdx > ExtIdx)
    if (MaxIdx < ExtIdx)
  if (MaxIdx - MinIdx + 1 > E)
  if (MaxIdx + 1 <= E)
  bool ShouldKeepOrder = true;
  for (unsigned I = 0; I < E; ++I) {
    const unsigned ExtIdx = Indices[I] - MinIdx;
    if (CurrentOrder[ExtIdx] != E) {
      CurrentOrder.clear();
    ShouldKeepOrder &= ExtIdx == I;
    CurrentOrder[ExtIdx] = I;
  if (ShouldKeepOrder)
    CurrentOrder.clear();
  return ShouldKeepOrder;
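// Returns true when the extract indices are already in source order (the
// order vector is then cleared to mean "identity"); a non-empty CurrentOrder
// tells the caller a reordering shuffle is needed.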
bool BoUpSLP::areAllUsersVectorized(
    Instruction *I, const SmallDenseSet<Value *> *VectorizedVals) const {
  return (I->hasOneUse() && (!VectorizedVals || VectorizedVals->contains(I))) ||
         all_of(I->users(), [this](User *U) {
           return isVectorized(U) || isVectorLikeInstWithConstOps(U) ||
                  (isa<ExtractElementInst>(U) && MustGather.contains(U));

void BoUpSLP::TreeEntry::buildAltOpShuffleMask(
    const function_ref<bool(Instruction *)> IsAltOp, SmallVectorImpl<int> &Mask,
    SmallVectorImpl<Value *> *OpScalars,
    SmallVectorImpl<Value *> *AltScalars) const {
  unsigned Sz = Scalars.size();
  SmallVector<int> OrderMask;
  if (!ReorderIndices.empty())
  for (unsigned I = 0; I < Sz; ++I) {
    if (!ReorderIndices.empty())
      Idx = OrderMask[I];
    if (IsAltOp(OpInst)) {
      Mask[I] = Sz + Idx;
  if (!ReuseShuffleIndices.empty()) {
    transform(ReuseShuffleIndices, NewMask.begin(), [&Mask](int Idx) {
      return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem;
    Mask.swap(NewMask);

    return InstructionsState(MainOp, AltOp).getMatchingMainOpOrAltOp(I) ==
           MainOp;
    assert(MainP != AltP && "Expected different main/alternate predicates.");
    assert((MainP == P || AltP == P || MainP == SwappedP || AltP == SwappedP) &&
           "CmpInst expected to match either main or alternate predicate or "
    return MainP != P && MainP != SwappedP;
  return InstructionsState(MainOp, AltOp).getMatchingMainOpOrAltOp(I) == AltOp;

  const auto *Op0 = Ops.front();
      return CI->getValue().isPowerOf2();
      return CI->getValue().isNegatedPowerOf2();
  if (IsConstant && IsUniform)
  else if (IsConstant)
  else if (IsUniform)
class BaseShuffleAnalysis {
  Type *ScalarTy = nullptr;

  BaseShuffleAnalysis(Type *ScalarTy) : ScalarTy(ScalarTy) {}

  unsigned getVF(Value *V) const {
    assert(V && "V cannot be nullptr");
           "V does not have FixedVectorType");
    assert(ScalarTy && "ScalarTy cannot be nullptr");
    unsigned VNumElements =
    assert(VNumElements > ScalarTyNumElements &&
           "the number of elements of V is not large enough");
    assert(VNumElements % ScalarTyNumElements == 0 &&
           "the number of elements of V is not a vectorized value");
    return VNumElements / ScalarTyNumElements;

  static bool isIdentityMask(ArrayRef<int> Mask, const FixedVectorType *VecTy,
    int Limit = Mask.size();
    if (Limit % VF == 0 && all_of(seq<int>(0, Limit / VF), [=](int Idx) {
          ArrayRef<int> Slice = Mask.slice(Idx * VF, VF);

  static void combineMasks(unsigned LocalVF, SmallVectorImpl<int> &Mask,
                           ArrayRef<int> ExtMask) {
    unsigned VF = Mask.size();
    for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
      int MaskedIdx = Mask[ExtMask[I] % VF];
    Mask.swap(NewMask);
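  // combineMasks() composes two shuffles: each ExtMask element is remapped
  // through Mask (modulo the local VF), producing the mask of the fused
  // shuffle in place.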
  static bool peekThroughShuffles(Value *&V, SmallVectorImpl<int> &Mask,
                                  bool SinglePermute) {
    ShuffleVectorInst *IdentityOp = nullptr;
    SmallVector<int> IdentityMask;
      if (isIdentityMask(Mask, SVTy, false)) {
        if (!IdentityOp || !SinglePermute ||
            (isIdentityMask(Mask, SVTy, true) &&
                              IdentityMask.size()))) {
          IdentityMask.assign(Mask);
      if (SV->isZeroEltSplat()) {
        IdentityMask.assign(Mask);
      int LocalVF = Mask.size();
        LocalVF = SVOpTy->getNumElements();
            static_cast<unsigned>(I) >= SV->getShuffleMask().size())
          ExtMask[Idx] = SV->getMaskValue(I);
      if (!IsOp1Undef && !IsOp2Undef) {
        for (int &I : Mask) {
          if (SV->getMaskValue(I % SV->getShuffleMask().size()) ==
        SmallVector<int> ShuffleMask(SV->getShuffleMask());
        combineMasks(LocalVF, ShuffleMask, Mask);
        Mask.swap(ShuffleMask);
        Op = SV->getOperand(0);
        Op = SV->getOperand(1);
        !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute) ||
               "Expected masks of same sizes.");
        Mask.swap(IdentityMask);
      return SinglePermute &&
             (Shuffle && Mask.size() == Shuffle->getShuffleMask().size() &&
              Shuffle->isZeroEltSplat() &&
                Shuffle->getShuffleMask()[P.index()] == 0;

  template <typename T, typename ShuffleBuilderTy>
  static T createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask,
                         ShuffleBuilderTy &Builder, Type *ScalarTy) {
    assert(V1 && "Expected at least one vector value.");
    SmallVector<int> NewMask(Mask);
    if (ScalarTyNumElements != 1) {
      Builder.resizeToMatch(V1, V2);
      int VF = Mask.size();
        VF = FTy->getNumElements();
      for (int I = 0, E = Mask.size(); I < E; ++I) {
          CombinedMask1[I] = Mask[I];
          CombinedMask2[I] = Mask[I] - VF;
        (void)peekThroughShuffles(Op1, CombinedMask1, false);
        (void)peekThroughShuffles(Op2, CombinedMask2, false);
          for (auto [Idx, I] : enumerate(CombinedMask1)) {
            ExtMask1[Idx] = SV1->getMaskValue(I);
                  ->getNumElements(),
              ExtMask1, UseMask::SecondArg);
          SmallVector<int> ExtMask2(CombinedMask2.size(), PoisonMaskElem);
          for (auto [Idx, I] : enumerate(CombinedMask2)) {
            ExtMask2[Idx] = SV2->getMaskValue(I);
                  ->getNumElements(),
              ExtMask2, UseMask::SecondArg);
          if (SV1->getOperand(0)->getType() ==
                  SV2->getOperand(0)->getType() &&
              SV1->getOperand(0)->getType() != SV1->getType() &&
            Op1 = SV1->getOperand(0);
            Op2 = SV2->getOperand(0);
            SmallVector<int> ShuffleMask1(SV1->getShuffleMask());
            int LocalVF = ShuffleMask1.size();
              LocalVF = FTy->getNumElements();
            combineMasks(LocalVF, ShuffleMask1, CombinedMask1);
            CombinedMask1.swap(ShuffleMask1);
            SmallVector<int> ShuffleMask2(SV2->getShuffleMask());
            LocalVF = ShuffleMask2.size();
              LocalVF = FTy->getNumElements();
            combineMasks(LocalVF, ShuffleMask2, CombinedMask2);
            CombinedMask2.swap(ShuffleMask2);
      } while (PrevOp1 != Op1 || PrevOp2 != Op2);
      Builder.resizeToMatch(Op1, Op2);
                             ->getElementCount()
                             .getKnownMinValue(),
                             ->getElementCount()
                             .getKnownMinValue());
      for (int I = 0, E = Mask.size(); I < E; ++I) {
                 "Expected undefined mask element");
          CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF);
        return Builder.createIdentity(Op1);
      return Builder.createShuffleVector(
      return Builder.createPoison(
    bool IsIdentity = peekThroughShuffles(V1, NewMask, true);
    assert(V1 && "Expected non-null value after looking through shuffles.");
      return Builder.createShuffleVector(V1, NewMask);
    return Builder.createIdentity(V1);
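  // createShuffle() iterates peekThroughShuffles() on both operands until
  // they stop changing, folding nested shufflevectors into CombinedMask1/2;
  // identity and poison masks short-circuit to createIdentity/createPoison
  // instead of emitting a real shuffle.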
                                    ArrayRef<int> Mask) {

static std::pair<InstructionCost, InstructionCost>
  if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
    ScalarCost = TTI.getPointersChainCost(
        Ptrs, BasePtr, TTI::PointersChainInfo::getUnitStride(), ScalarTy,
    for (Value *V : Ptrs) {
      if (V == BasePtr) {
    if (PtrsRetainedInVecCode.size() == Ptrs.size()) {
    VecCost = TTI.getPointersChainCost(PtrsRetainedInVecCode, BasePtr,
                                       TTI::PointersChainInfo::getKnownStride(),
                   [](const Value *V) {
                     return Ptr && !Ptr->hasAllConstantIndices();
            ? TTI::PointersChainInfo::getUnknownStride()
            : TTI::PointersChainInfo::getKnownStride();
        TTI.getPointersChainCost(Ptrs, BasePtr, PtrsInfo, ScalarTy, CostKind);
    if (It != Ptrs.end())
      VecCost = TTI.getGEPCost(BaseGEP->getSourceElementType(),
                               BaseGEP->getPointerOperand(), Indices, VecTy,
  return std::make_pair(ScalarCost, VecCost);
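// The returned pair is (scalar GEP-chain cost, vectorized GEP cost):
// load/store pointer chains are priced with getPointersChainCost() using
// unit/known/unknown stride info, other GEP bundles with a single widened
// getGEPCost() call.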
void BoUpSLP::reorderGatherNode(TreeEntry &TE) {
  assert(TE.isGather() && TE.ReorderIndices.empty() &&
         "Expected gather node without reordering.");
  SmallSet<size_t, 2> LoadKeyUsed;
  if (TE.Scalars.size() == 2 || (TE.hasState() && !TE.isAltShuffle()) ||
        return VectorizableTree[Idx]->isSame(TE.Scalars);
  auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
    auto LIt = LoadsMap.find(std::make_pair(Key, Ptr));
    if (LIt != LoadsMap.end()) {
      for (LoadInst *RLI : LIt->second) {
                                  LI->getType(), LI->getPointerOperand(), *DL,
                                  *SE,
      for (LoadInst *RLI : LIt->second) {
                             LI->getPointerOperand(), *TLI)) {
      if (LIt->second.size() > 2) {
            hash_value(LIt->second.back()->getPointerOperand());
    LoadsMap.try_emplace(std::make_pair(Key, Ptr)).first->second.push_back(LI);
  MapVector<size_t, MapVector<size_t, SmallVector<Value *>>> SortedValues;
  SmallDenseMap<Value *, SmallVector<unsigned>, 8> KeyToIndex;
  bool IsOrdered = true;
  unsigned NumInstructions = 0;
    size_t Key = 1, Idx = 1;
    auto &Container = SortedValues[Key];
    if (IsOrdered && !KeyToIndex.contains(V) &&
        ((Container.contains(Idx) &&
          KeyToIndex.at(Container[Idx].back()).back() != I - 1) ||
         (!Container.empty() && !Container.contains(Idx) &&
          KeyToIndex.at(Container.back().second.back()).back() != I - 1)))
    auto &KTI = KeyToIndex[V];
    Container[Idx].push_back(V);
  if (!IsOrdered && NumInstructions > 1) {
    TE.ReorderIndices.resize(TE.Scalars.size(), TE.Scalars.size());
    for (const auto &D : SortedValues) {
      for (const auto &P : D.second) {
        for (Value *V : P.second) {
          ArrayRef<unsigned> Indices = KeyToIndex.at(V);
          for (auto [K, Idx] : enumerate(Indices)) {
            TE.ReorderIndices[Cnt + K] = Idx;
            TE.Scalars[Cnt + K] = V;
          Sz += Indices.size();
          Cnt += Indices.size();
              *TTI, TE.Scalars.front()->getType(), Sz);
        } else if (!P.second.empty() && isConstant(P.second.front())) {
  if (!TE.ReuseShuffleIndices.empty() || TE.ReorderIndices.empty())
  auto *ScalarTy = TE.Scalars.front()->getType();
  for (auto [Idx, Sz] : SubVectors) {
  int Sz = TE.Scalars.size();
  SmallVector<int> ReorderMask(TE.ReorderIndices.begin(),
                               TE.ReorderIndices.end());
      ReorderMask[I] = I + TE.ReorderIndices.size();
      any_of(ReorderMask, [&](int I) { return I >= Sz; })
          VecTy, ReorderMask);
      DemandedElts.clearBit(I);
      ReorderMask[I] = I;
      ReorderMask[I] = I + Sz;
  if (!DemandedElts.isAllOnes())
  if (Cost >= BVCost) {
    SmallVector<int> Mask(TE.ReorderIndices.begin(), TE.ReorderIndices.end());
    TE.ReorderIndices.clear();
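  // The regrouped order is kept only if the shuffle estimate beats the
  // build-vector estimate; otherwise ReorderIndices is dropped and the node
  // stays a plain gather.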
                               const InstructionsState &S,
      return V->getType()->getScalarType()->isFloatingPointTy();
         "Can only convert to FMA for floating point types");
  assert(S.isAddSubLikeOp() && "Can only convert to FMA for add/sub");
    for (Value *V : VL) {
      if (S.isCopyableElement(I))
      Instruction *MatchingI = S.getMatchingMainOpOrAltOp(I);
      if (S.getMainOp() != MatchingI && S.getAltOp() != MatchingI)
        FMF &= FPCI->getFastMathFlags();
  if (!CheckForContractable(VL))
  InstructionsCompatibilityAnalysis Analysis(DT, DL, TTI, TLI);
  if (OpS.isAltShuffle() || OpS.getOpcode() != Instruction::FMul)
  if (!CheckForContractable(Operands.front()))
  for (Value *V : VL) {
    if (!S.isCopyableElement(I))
      FMF &= FPCI->getFastMathFlags();
    FMulPlusFAddCost += TTI.getInstructionCost(I, CostKind);
  for (auto [V, Op] : zip(VL, Operands.front())) {
    if (S.isCopyableElement(V))
    if (!I || !I->hasOneUse() || OpS.isCopyableElement(I)) {
      FMACost += TTI.getInstructionCost(OpI, CostKind);
      FMF &= FPCI->getFastMathFlags();
    FMulPlusFAddCost += TTI.getInstructionCost(I, CostKind);
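// FMA contraction check: FMulPlusFAddCost (separate fmul feeding the
// fadd/fsub bundle) is compared against FMACost, accumulating fast-math
// flags from all non-copyable lanes; only single-use fmul operands qualify.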
  BaseGraphSize = VectorizableTree.size();
  class GraphTransformModeRAAI {
    bool &SavedIsGraphTransformMode;

    GraphTransformModeRAAI(bool &IsGraphTransformMode)
        : SavedIsGraphTransformMode(IsGraphTransformMode) {
      IsGraphTransformMode = true;
    ~GraphTransformModeRAAI() { SavedIsGraphTransformMode = false; }
  } TransformContext(IsGraphTransformMode);

                                       const InstructionsState &S) {
                              I2->getOperand(Op));
        Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
                 [](const std::pair<Value *, Value *> &P) {
    TreeEntry &E = *VectorizableTree[Idx];
      reorderGatherNode(E);
  constexpr unsigned VFLimit = 16;
  bool ForceLoadGather =
      count_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
        return TE->isGather() && TE->hasState() &&
               TE->getOpcode() == Instruction::Load &&
               TE->getVectorFactor() < VFLimit;
    return TE->isSame(VL) || all_of(VL, [&](Value *V) {
  auto CheckForSameVectorNodes = [&](const TreeEntry &E) {
    if (E.hasState()) {
          !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
            return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
              ArrayRef<TreeEntry *> VTEs = getTreeEntries(V);
              return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
                return is_contained(TEs, TE);
          !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
            return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
              ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V);
              return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
                return is_contained(TEs, TE);
      if (It != E.Scalars.end()) {
            !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
              return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
                ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V);
                return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
                  return is_contained(TEs, TE);
  for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
    TreeEntry &E = *VectorizableTree[Idx];
    if (E.isGather()) {
      unsigned MinVF = getMinVF(2 * Sz);
      if (VL.size() <= 2 || LoadEntriesToVectorize.contains(Idx) ||
          !(!E.hasState() || E.getOpcode() == Instruction::Load ||
      if (ForceLoadGather && E.hasState() &&
          E.getOpcode() == Instruction::Load)
      if (CheckForSameVectorNodes(E))
      unsigned StartIdx = 0;
      unsigned End = VL.size();
          *TTI, VL.front()->getType(), VL.size() - 1);
               *TTI, VL.front()->getType(), VF - 1)) {
        if (StartIdx + VF > End)
        bool AllStrided = true;
        for (unsigned Cnt = StartIdx; Cnt + VF <= End; Cnt += VF) {
              !getSameValuesTreeEntry(Slice.front(), Slice, true))
          bool IsSplat = isSplat(Slice);
          bool IsTwoRegisterSplat = true;
          if (IsSplat && VF == 2) {
            IsTwoRegisterSplat = NumRegs2VF == 2;
          if (Slices.empty() || !IsSplat || !IsTwoRegisterSplat ||
              (S.getOpcode() == Instruction::Load &&
              (S.getOpcode() != Instruction::Load &&
            if ((!UserIgnoreList || E.Idx != 0) &&
                TTI->getInstructionCost(S.getMainOp(), CostKind) <
            if (S.getOpcode() == Instruction::Load) {
              StridedPtrInfo SPtrInfo;
                                                PointerOps, SPtrInfo);
              if (UserIgnoreList && E.Idx == 0)
            } else if (S.getOpcode() == Instruction::ExtractElement ||
                       (TTI->getInstructionCost(S.getMainOp(), CostKind) <
                       !CheckOperandsProfitability(
        if (VF == 2 && AllStrided && Slices.size() > 2)
        auto AddCombinedNode = [&](unsigned Idx, unsigned Cnt, unsigned Sz) {
          E.CombinedEntriesWithIndices.emplace_back(Idx, Cnt);
          if (StartIdx == Cnt)
            StartIdx = Cnt + Sz;
          if (End == Cnt + Sz)
        for (auto [Cnt, Sz] : Slices) {
          const TreeEntry *SameTE = nullptr;
              It != Slice.end()) {
            SameTE = getSameValuesTreeEntry(*It, Slice);
          unsigned PrevSize = VectorizableTree.size();
          [[maybe_unused]] unsigned PrevEntriesSize =
              LoadEntriesToVectorize.size();
          buildTreeRec(Slice, 0, EdgeInfo(&E, UINT_MAX));
          if (PrevSize + 1 == VectorizableTree.size() && !SameTE &&
              VectorizableTree[PrevSize]->isGather() &&
              VectorizableTree[PrevSize]->hasState() &&
              VectorizableTree[PrevSize]->getOpcode() !=
                  Instruction::ExtractElement &&
            if (UserIgnoreList && E.Idx == 0 && VF == 2)
            VectorizableTree.pop_back();
            assert(PrevEntriesSize == LoadEntriesToVectorize.size() &&
                   "LoadEntriesToVectorize expected to remain the same");
          AddCombinedNode(PrevSize, Cnt, Sz);
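        // Each slice is retried through buildTreeRec(); if that only yields
        // one new gather entry (and no existing entry matched), the entry is
        // popped again, otherwise the slice is attached via AddCombinedNode().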
    if (E.CombinedEntriesWithIndices.empty() && !E.ReorderIndices.empty()) {
      SmallVector<int> Mask(E.ReorderIndices.begin(), E.ReorderIndices.end());
      E.ReorderIndices.clear();
    switch (E.getOpcode()) {
    case Instruction::Load: {
      if (E.State != TreeEntry::Vectorize)
      Type *ScalarTy = E.getMainOp()->getType();
          TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
        SmallVector<int> Mask;
            TTI->getMemoryOpCost(Instruction::Load, VecTy, BaseLI->getAlign(),
                                 BaseLI->getPointerAddressSpace(), CostKind,
            MemIntrinsicCostAttributes(Intrinsic::experimental_vp_strided_load,
                                       VecTy, BaseLI->getPointerOperand(),
                                       false, CommonAlignment,
                       ->getPointerOperand()
          StridedPtrInfo SPtrInfo;
          SPtrInfo.StrideVal = ConstantInt::get(StrideTy, 1);
          SPtrInfo.Ty = VecTy;
          TreeEntryToStridedPtrInfoMap[&E] = SPtrInfo;
          E.State = TreeEntry::StridedVectorize;
    case Instruction::Store: {
          TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
        SmallVector<int> Mask;
            TTI->getMemoryOpCost(Instruction::Store, VecTy, BaseSI->getAlign(),
                                 BaseSI->getPointerAddressSpace(), CostKind,
            MemIntrinsicCostAttributes(Intrinsic::experimental_vp_strided_store,
                                       VecTy, BaseSI->getPointerOperand(),
                                       false, CommonAlignment,
        if (StridedCost < OriginalVecCost)
          E.State = TreeEntry::StridedVectorize;
      } else if (!E.ReorderIndices.empty()) {
        auto IsInterleaveMask = [&, &TTI = *TTI](ArrayRef<int> Mask) {
          assert(Mask.size() > 1 && "Expected mask greater than 1 element.");
          if (Mask.size() < 4)
                    Mask, Factor, VecTy->getElementCount().getFixedValue()) &&
                TTI.isLegalInterleavedAccessType(
                    VecTy, Factor, BaseSI->getAlign(),
                    BaseSI->getPointerAddressSpace()))
        SmallVector<int> Mask(E.ReorderIndices.begin(),
                              E.ReorderIndices.end());
        unsigned InterleaveFactor = IsInterleaveMask(Mask);
        if (InterleaveFactor != 0)
          E.setInterleave(InterleaveFactor);
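      // Reordered stores are converted to a strided form when the strided
      // intrinsic is legal and cheaper, or tagged with an interleave factor
      // when the reorder mask forms a target-legal interleave pattern.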
    case Instruction::Select: {
      if (E.State != TreeEntry::Vectorize)
      E.CombinedOp = TreeEntry::MinMax;
      TreeEntry *CondEntry = getOperandEntry(&E, 0);
      if (SelectOnly && CondEntry->UserTreeIndex &&
          CondEntry->State == TreeEntry::Vectorize) {
        CondEntry->State = TreeEntry::CombinedVectorize;
    case Instruction::FSub:
    case Instruction::FAdd: {
      if (E.State != TreeEntry::Vectorize ||
          !E.getOperations().isAddSubLikeOp())
      E.CombinedOp = TreeEntry::FMulAdd;
      TreeEntry *FMulEntry = getOperandEntry(&E, 0);
      if (FMulEntry->UserTreeIndex &&
          FMulEntry->State == TreeEntry::Vectorize) {
        FMulEntry->State = TreeEntry::CombinedVectorize;
  if (LoadEntriesToVectorize.empty()) {
    if (VectorizableTree.size() <= 1 && VectorizableTree.front()->hasState() &&
        VectorizableTree.front()->getOpcode() == Instruction::Load)
    constexpr unsigned SmallTree = 3;
    constexpr unsigned SmallVF = 2;
    if ((VectorizableTree.size() <= SmallTree &&
         VectorizableTree.front()->Scalars.size() == SmallVF) ||
        (VectorizableTree.size() <= 2 && UserIgnoreList))
    if (VectorizableTree.front()->isNonPowOf2Vec() &&
                 [](const std::unique_ptr<TreeEntry> &TE) {
                   return TE->isGather() && TE->hasState() &&
                          TE->getOpcode() == Instruction::Load &&
  SmallMapVector<std::tuple<BasicBlock *, Value *, Type *>,
  for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    TreeEntry &E = *TE;
    if (E.isGather() &&
        ((E.hasState() && E.getOpcode() == Instruction::Load) ||
         (!E.hasState() && any_of(E.Scalars,
                                    return isa<LoadInst>(V) &&
                                           !isVectorized(V) &&
                                           !isDeleted(cast<Instruction>(V));
      for (Value *V : E.Scalars) {
                *this, V, *DL, *SE, *TTI,
                GatheredLoads[std::make_tuple(
  if (!GatheredLoads.empty())
    tryToVectorizeGatheredLoads(GatheredLoads);
  bool IsFinalized = false;
  bool SameNodesEstimated = true;
    if (Ty->getScalarType()->isPointerTy()) {
              DL.getTypeStoreSizeInBits(Ty->getScalarType()))),
          Ty->getScalarType());
    assert(It != VL.end() && "Expected at least one non-undef value.");
        count(VL, *It) > 1 &&
    if (!NeedShuffle) {
      return TTI.getShuffleCost(
      return TTI.getVectorInstrCost(Instruction::InsertElement, VecTy,
                                    CostKind, std::distance(VL.begin(), It),
      return isa<PoisonValue>(V) ? PoisonMaskElem : 0;
        TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, 0,
                       VecTy, ShuffleMask, CostKind,
    return GatherCost +
               : R.getGatherCost(Gathers, !Root && VL.equals(Gathers),

                                    ArrayRef<std::optional<TTI::ShuffleKind>>
                                        ShuffleKinds,
                                    unsigned NumParts) {
    assert(VL.size() > NumParts && "Unexpected scalarized shuffle.");
        std::accumulate(VL.begin(), VL.end(), 0, [](unsigned Sz, Value *V) {
          auto *EE = dyn_cast<ExtractElementInst>(V);
          auto *VecTy = dyn_cast<FixedVectorType>(EE->getVectorOperandType());
          return std::max(Sz, VecTy->getNumElements());
        -> std::optional<TTI::ShuffleKind> {
      if (NumElts <= EltsPerVector)
        return std::nullopt;
          alignDown(std::accumulate(Mask.begin(), Mask.end(), INT_MAX,
                      if (I == PoisonMaskElem)
                      return std::min(S, I);
      int OffsetReg1 = OffsetReg0;
      int FirstRegId = -1;
      Indices.assign(1, OffsetReg0);
        int Idx = I - OffsetReg0;
            (Idx / NumElts) * NumParts + (Idx % NumElts) / EltsPerVector;
        if (FirstRegId < 0)
          FirstRegId = RegId;
        RegIndices.insert(RegId);
        if (RegIndices.size() > 2)
          return std::nullopt;
        if (RegIndices.size() == 2) {
          if (Indices.size() == 1) {
                std::next(Mask.begin(), Pos), Mask.end(), INT_MAX,
                [&](int S, int I) {
                  if (I == PoisonMaskElem)
                  int RegId = ((I - OffsetReg0) / NumElts) * NumParts +
                              ((I - OffsetReg0) % NumElts) / EltsPerVector;
                  if (RegId == FirstRegId)
                  return std::min(S, I);
            unsigned Index = OffsetReg1 % NumElts;
            Indices.push_back(Index);
            SubVecSizes.push_back(std::min(NumElts - Index, EltsPerVector));
          Idx = I - OffsetReg1;
        I = (Idx % NumElts) % EltsPerVector +
            (RegId == FirstRegId ? 0 : EltsPerVector);
      return ShuffleKind;
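    // CheckPerRegistersShuffle normalizes one register-sized submask: it
    // succeeds only when the slice reads at most two source registers
    // (recording extract-subvector offsets in Indices/SubVecSizes) and
    // returns std::nullopt when a full-width shuffle must be priced instead.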
      if (!ShuffleKinds[Part])
          Part * EltsPerVector, getNumElems(Mask.size(), EltsPerVector, Part));
      std::optional<TTI::ShuffleKind> RegShuffleKind =
          CheckPerRegistersShuffle(SubMask, Indices, SubVecSizes);
      if (!RegShuffleKind) {
                MaskSlice, std::max<unsigned>(NumElts, MaskSlice.size())))
          *R.TTI, VL.front()->getType(), alignTo(NumElts, EltsPerVector));
      for (const auto [Idx, SubVecSize] : zip(Indices, SubVecSizes)) {
        assert((Idx + SubVecSize) <= BaseVF &&
               "SK_ExtractSubvector index out of range");
        TTI, *ShuffleKinds[Part], getWidenedType(ScalarTy, NumElts), SubMask);
    if (OriginalCost < Cost)
      Cost = OriginalCost;

  void estimateNodesPermuteCost(const TreeEntry &E1, const TreeEntry *E2,
                                unsigned SliceSize) {
    if (SameNodesEstimated) {
      if ((InVectors.size() == 2 &&
        unsigned Limit = getNumElems(Mask.size(), SliceSize, Part);
               "Expected all poisoned elements.");
        copy(SubMask, std::next(CommonMask.begin(), SliceSize * Part));
        Cost += createShuffle(InVectors.front(),
                              InVectors.size() == 1 ? nullptr
                                                    : InVectors.back(),
        transformMaskAfterShuffle(CommonMask, CommonMask);
      } else if (InVectors.size() == 2) {
        Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
        transformMaskAfterShuffle(CommonMask, CommonMask);
      SameNodesEstimated = false;
      if (!E2 && InVectors.size() == 1) {
        unsigned VF = E1.getVectorFactor();
          VF = std::max(VF, getVF(V1));
          VF = std::max(VF, E->getVectorFactor());
        for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
            CommonMask[Idx] = Mask[Idx] + VF;
        Cost += createShuffle(InVectors.front(), &E1, CommonMask);
        transformMaskAfterShuffle(CommonMask, CommonMask);
        auto P = InVectors.front();
        Cost += createShuffle(&E1, E2, Mask);
        unsigned VF = Mask.size();
          VF = std::max(VF, E->getVectorFactor());
        for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
            CommonMask[Idx] = Idx + (InVectors.empty() ? 0 : VF);
        Cost += createShuffle(P, InVectors.front(), CommonMask);
        transformMaskAfterShuffle(CommonMask, CommonMask);

  class ShuffleCostBuilder {
    static bool isEmptyOrIdentity(ArrayRef<int> Mask, unsigned VF) {
      return Mask.empty() ||
             (VF == Mask.size() &&
    ~ShuffleCostBuilder() = default;
      if (isEmptyOrIdentity(Mask, VF))
      if (isEmptyOrIdentity(Mask, VF))
    void resizeToMatch(Value *&, Value *&) const {}
    ShuffleCostBuilder Builder(TTI);
    unsigned CommonVF = Mask.size();
    auto GetNodeMinBWAffectedCost = [&](const TreeEntry &E,
      Type *EScalarTy = E.Scalars.front()->getType();
      bool IsSigned = true;
      if (auto It = R.MinBWs.find(&E); It != R.MinBWs.end()) {
        IsSigned = It->second.second;
      if (EScalarTy != ScalarTy) {
        unsigned CastOpcode = Instruction::Trunc;
        unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
        unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
          CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
        return TTI.getCastInstrCost(CastOpcode, getWidenedType(ScalarTy, VF),
        Type *EScalarTy = VecTy->getElementType();
        if (EScalarTy != ScalarTy) {
          unsigned CastOpcode = Instruction::Trunc;
          unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
          unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
            CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
          return TTI.getCastInstrCost(
    if (!V1 && !V2 && !P2.isNull()) {
      unsigned VF = E->getVectorFactor();
      CommonVF = std::max(VF, E2->getVectorFactor());
                      return Idx < 2 * static_cast<int>(CommonVF);
             "All elements in mask must be less than 2 * CommonVF.");
      if (E->Scalars.size() == E2->Scalars.size()) {
        for (int &Idx : CommonMask) {
          if (Idx < static_cast<int>(CommonVF) && !EMask.empty())
          else if (Idx >= static_cast<int>(CommonVF))
            Idx = (E2Mask.empty() ? Idx - CommonVF : E2Mask[Idx - CommonVF]) +
        CommonVF = E->Scalars.size();
        ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF) +
                     GetNodeMinBWAffectedCost(*E2, CommonVF);
        ExtraCost += GetNodeMinBWAffectedCost(*E, E->getVectorFactor()) +
                     GetNodeMinBWAffectedCost(*E2, E2->getVectorFactor());
      V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
    } else if (!V1 && P2.isNull()) {
      unsigned VF = E->getVectorFactor();
                    [=](int Idx) {
                      return Idx < static_cast<int>(CommonVF);
                    }) &&
             "All elements in mask must be less than CommonVF.");
      if (E->Scalars.size() == Mask.size() && VF != Mask.size()) {
        assert(!EMask.empty() && "Expected non-empty common mask.");
        for (int &Idx : CommonMask) {
        CommonVF = E->Scalars.size();
      } else if (unsigned Factor = E->getInterleaveFactor();
                 Factor > 0 && E->Scalars.size() != Mask.size() &&
        std::iota(CommonMask.begin(), CommonMask.end(), 0);
      ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF);
      if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() &&
          CommonVF == CommonMask.size() &&
                 [](const auto &&P) {
                     static_cast<unsigned>(P.value()) != P.index();
    } else if (V1 && P2.isNull()) {
      ExtraCost += GetValueMinBWAffectedCost(V1);
      CommonVF = getVF(V1);
                    [=](int Idx) {
                      return Idx < static_cast<int>(CommonVF);
                    }) &&
             "All elements in mask must be less than CommonVF.");
    } else if (V1 && !V2) {
      unsigned VF = getVF(V1);
      CommonVF = std::max(VF, E2->getVectorFactor());
                      return Idx < 2 * static_cast<int>(CommonVF);
             "All elements in mask must be less than 2 * CommonVF.");
      if (E2->Scalars.size() == VF && VF != CommonVF) {
        assert(!E2Mask.empty() && "Expected non-empty common mask.");
        for (int &Idx : CommonMask) {
          if (Idx >= static_cast<int>(CommonVF))
            Idx = E2Mask[Idx - CommonVF] + VF;
        ExtraCost += GetValueMinBWAffectedCost(V1);
      ExtraCost += GetNodeMinBWAffectedCost(
          *E2, std::min(CommonVF, E2->getVectorFactor()));
      V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
    } else if (!V1 && V2) {
      unsigned VF = getVF(V2);
      CommonVF = std::max(VF, E1->getVectorFactor());
                      return Idx < 2 * static_cast<int>(CommonVF);
             "All elements in mask must be less than 2 * CommonVF.");
      if (E1->Scalars.size() == VF && VF != CommonVF) {
        assert(!E1Mask.empty() && "Expected non-empty common mask.");
        for (int &Idx : CommonMask) {
          if (Idx >= static_cast<int>(CommonVF))
            Idx = E1Mask[Idx - CommonVF] + VF;
      ExtraCost += GetNodeMinBWAffectedCost(
          *E1, std::min(CommonVF, E1->getVectorFactor()));
      ExtraCost += GetValueMinBWAffectedCost(V2);
      V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
      assert(V1 && V2 && "Expected both vectors.");
      unsigned VF = getVF(V1);
      CommonVF = std::max(VF, getVF(V2));
                      return Idx < 2 * static_cast<int>(CommonVF);
             "All elements in mask must be less than 2 * CommonVF.");
          GetValueMinBWAffectedCost(V1) + GetValueMinBWAffectedCost(V2);
        V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
      V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
    InVectors.front() =
    if (InVectors.size() == 2)
      InVectors.pop_back();
    return ExtraCost + BaseShuffleAnalysis::createShuffle<InstructionCost>(
                           V1, V2, CommonMask, Builder, ScalarTy);
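    // Every branch above only normalizes CommonMask and substitutes missing
    // operands with all-ones placeholders of the common VF; the result is the
    // min-bitwidth cast ExtraCost plus the generic createShuffle() estimate.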
      : BaseShuffleAnalysis(ScalarTy), TTI(TTI),
        VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()), R(R),
        CheckedExtracts(CheckedExtracts) {}

                        ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
                        unsigned NumParts, bool &UseVecBaseAsInput) {
    UseVecBaseAsInput = false;
    Value *VecBase = nullptr;
    if (!E->ReorderIndices.empty()) {
                              E->ReorderIndices.end());
    bool PrevNodeFound = any_of(
        ArrayRef(R.VectorizableTree).take_front(E->Idx),
        [&](const std::unique_ptr<TreeEntry> &TE) {
          return ((TE->hasState() && !TE->isAltShuffle() &&
                   TE->getOpcode() == Instruction::ExtractElement) ||
                 all_of(enumerate(TE->Scalars), [&](auto &&Data) {
                   return VL.size() > Data.index() &&
                          (Mask[Data.index()] == PoisonMaskElem ||
                           isa<UndefValue>(VL[Data.index()]) ||
                           Data.value() == VL[Data.index()]);
      ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
          VecBase = EE->getVectorOperand();
          UniqueBases.insert(VecBase);
          if (!CheckedExtracts.insert(V).second ||
                return isa<GetElementPtrInst>(U) &&
                       !R.areAllUsersVectorized(cast<Instruction>(U),
          unsigned Idx = *EEIdx;
          if (EE->hasOneUse() || !PrevNodeFound) {
              Cost -= TTI.getExtractWithExtendCost(
              Cost += TTI.getCastInstrCost(
          APInt &DemandedElts =
              VectorOpsToExtracts
                  .first->getSecond();
          DemandedElts.setBit(Idx);
    for (const auto &[Vec, DemandedElts] : VectorOpsToExtracts)
                                              DemandedElts, false,
    if (!PrevNodeFound)
      Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts);
    transformMaskAfterShuffle(CommonMask, CommonMask);
    SameNodesEstimated = false;
    if (NumParts != 1 && UniqueBases.size() != 1) {
      UseVecBaseAsInput = true;

  std::optional<InstructionCost>
      return std::nullopt;
    IsFinalized = false;
    CommonMask.clear();
    VectorizedVals.clear();
    SameNodesEstimated = true;
                      return Idx < static_cast<int>(E1.getVectorFactor());
           "Expected single vector shuffle mask.");
    if (InVectors.empty()) {
      CommonMask.assign(Mask.begin(), Mask.end());
      InVectors.assign({&E1, &E2});
    assert(!CommonMask.empty() && "Expected non-empty common mask.");
    unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
    estimateNodesPermuteCost(E1, &E2, Mask, Part, SliceSize);
    if (InVectors.empty()) {
      CommonMask.assign(Mask.begin(), Mask.end());
      InVectors.assign(1, &E1);
    assert(!CommonMask.empty() && "Expected non-empty common mask.");
    unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
    estimateNodesPermuteCost(E1, nullptr, Mask, Part, SliceSize);
    if (!SameNodesEstimated && InVectors.size() == 1)
      InVectors.emplace_back(&E1);
    assert(InVectors.size() == 1 &&
                   ->getOrdered(P.index()));
             return EI->getVectorOperand() == V1 ||
                    EI->getVectorOperand() == V2;
           "Expected extractelement vectors.");
    if (InVectors.empty()) {
      assert(CommonMask.empty() && !ForExtracts &&
             "Expected empty input mask/vectors.");
      CommonMask.assign(Mask.begin(), Mask.end());
      InVectors.assign(1, V1);
           !CommonMask.empty() &&
                   ->getOrdered(P.index());
             return P.value() == Mask[P.index()] ||
             return EI->getVectorOperand() == V1;
           "Expected only tree entry for extractelement vectors.");
    assert(!InVectors.empty() && !CommonMask.empty() &&
           "Expected only tree entries from extracts/reused buildvectors.");
    unsigned VF = getVF(V1);
    if (InVectors.size() == 2) {
      Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
      VF = std::max<unsigned>(VF, CommonMask.size());
    } else if (const auto *InTE =
                   InVectors.front().dyn_cast<const TreeEntry *>()) {
      VF = std::max(VF, InTE->getVectorFactor());
                         ->getNumElements());
    InVectors.push_back(V1);
    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
        CommonMask[Idx] = Mask[Idx] + VF;
           Value *Root = nullptr) {
    Cost += getBuildVectorCost(VL, Root);
    unsigned VF = VL.size();
      VF = std::min(VF, MaskVF);
    Type *VLScalarTy = VL.front()->getType();
            getAllOnesValue(*R.DL, ScalarTy->getScalarType()));

           ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
    IsFinalized = true;
      if (InVectors.size() == 2)
        Cost += createShuffle(Vec, InVectors.back(), CommonMask);
        Cost += createShuffle(Vec, nullptr, CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
             "Expected vector length for the final value before action.");
      Cost += createShuffle(V1, V2, Mask);
      InVectors.front() = V;
    if (!SubVectors.empty()) {
      if (InVectors.size() == 2)
        Cost += createShuffle(Vec, InVectors.back(), CommonMask);
        Cost += createShuffle(Vec, nullptr, CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
      if (!SubVectorsMask.empty()) {
               "Expected same size of masks for subvectors and common mask.");
        copy(SubVectorsMask, SVMask.begin());
        for (auto [I1, I2] : zip(SVMask, CommonMask)) {
            I1 = I2 + CommonMask.size();
      for (auto [E, Idx] : SubVectors) {
        Type *EScalarTy = E->Scalars.front()->getType();
        bool IsSigned = true;
        if (auto It = R.MinBWs.find(E); It != R.MinBWs.end()) {
          IsSigned = It->second.second;
        if (ScalarTy != EScalarTy) {
          unsigned CastOpcode = Instruction::Trunc;
          unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
          unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
            CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
          Cost += TTI.getCastInstrCost(
        if (!CommonMask.empty()) {
          std::iota(std::next(CommonMask.begin(), Idx),
                    std::next(CommonMask.begin(), Idx + E->getVectorFactor()),
    if (!ExtMask.empty()) {
      if (CommonMask.empty()) {
        for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
          NewMask[I] = CommonMask[ExtMask[I]];
        CommonMask.swap(NewMask);
    if (CommonMask.empty()) {
      assert(InVectors.size() == 1 && "Expected only one vector with no mask");
        createShuffle(InVectors.front(),
                      InVectors.size() == 2 ? InVectors.back() : nullptr,
    assert((IsFinalized || CommonMask.empty()) &&
           "Shuffle construction must be finalized.");
const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E,
                                                   unsigned Idx) const {
  TreeEntry *Op = OperandsToTreeEntry.at({E, Idx});
  assert(Op->isSame(E->getOperand(Idx)) && "Operands mismatch!");

  if (TE.State == TreeEntry::ScatterVectorize ||
      TE.State == TreeEntry::StridedVectorize)
  if (TE.State == TreeEntry::CompressVectorize)
  if (TE.State == TreeEntry::Vectorize &&
      TE.getOpcode() == Instruction::Load && !TE.isAltShuffle()) {
    if (TE.ReorderIndices.empty())
    SmallVector<int> Mask;
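// The state checks above derive the cast-context hint for a load entry:
// scatter/strided entries behave like gather/scatter accesses, compressed
// loads like masked ones, and plain vectorized loads like consecutive
// accesses (possibly reversed, judging by the inverted reorder mask).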
                                     SmallPtrSetImpl<Value *> &CheckedExtracts) {
    return InstructionCost::getInvalid();
  auto It = MinBWs.find(E);
  Type *OrigScalarTy = ScalarTy;
  if (It != MinBWs.end()) {
  unsigned EntryVF = E->getVectorFactor();
  if (E->isGather()) {
      return InstructionCost::getInvalid();
    ScalarTy = VL.front()->getType();
    return processBuildVector<ShuffleCostEstimator, InstructionCost>(
        E, ScalarTy, *TTI, VectorizedVals, *this, CheckedExtracts);
  if (E->State == TreeEntry::SplitVectorize) {
    assert(E->CombinedEntriesWithIndices.size() == 2 &&
           "Expected exactly 2 combined entries.");
    assert(E->ReuseShuffleIndices.empty() && "Expected empty reuses mask.");
    if (E->ReorderIndices.empty()) {
          E->CombinedEntriesWithIndices.back().second,
              VectorizableTree[E->CombinedEntriesWithIndices.back().first]
                  ->getVectorFactor()));
      unsigned CommonVF =
          std::max(VectorizableTree[E->CombinedEntriesWithIndices.front().first]
                       ->getVectorFactor(),
                   VectorizableTree[E->CombinedEntriesWithIndices.back().first]
                       ->getVectorFactor());
    LLVM_DEBUG(dumpTreeCosts(E, 0, VectorCost, 0, "Calculated costs for Tree"));
  SmallVector<int> Mask;
  if (!E->ReorderIndices.empty() && E->State != TreeEntry::CompressVectorize &&
      (E->State != TreeEntry::StridedVectorize ||
    SmallVector<int> NewMask;
    if (E->getOpcode() == Instruction::Store) {
      NewMask.resize(E->ReorderIndices.size());
  if (!E->ReuseShuffleIndices.empty())
  assert((E->State == TreeEntry::Vectorize ||
          E->State == TreeEntry::ScatterVectorize ||
          E->State == TreeEntry::StridedVectorize ||
          E->State == TreeEntry::CompressVectorize) &&
         "Unhandled state");
          (E->getOpcode() == Instruction::GetElementPtr &&
           E->getMainOp()->getType()->isPointerTy()) ||
          E->hasCopyableElements()) &&
  unsigned ShuffleOrOp =
      E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
  if (E->CombinedOp != TreeEntry::NotCombinedOp)
    ShuffleOrOp = E->CombinedOp;
  SmallSetVector<Value *, 16> UniqueValues(VL.begin(), VL.end());
  const unsigned Sz = UniqueValues.size();
  SmallBitVector UsedScalars(Sz, false);
  for (unsigned I = 0; I < Sz; ++I) {
        !E->isCopyableElement(UniqueValues[I]) &&
        getTreeEntries(UniqueValues[I]).front() == E)
      UsedScalars.set(I);
  auto GetCastContextHint = [&](Value *V) {
      return getCastContextHint(*OpTEs.front());
    InstructionsState SrcState = getSameOpcode(E->getOperand(0), *TLI);
    if (SrcState && SrcState.getOpcode() == Instruction::Load &&
        !SrcState.isAltShuffle())
      ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0);
      for (unsigned I = 0; I < Sz; ++I) {
        if (UsedScalars.test(I))
        ScalarCost += ScalarEltCost(I);
    if (It != MinBWs.end() && !UnaryInstruction::isCast(E->getOpcode()) &&
        (E->getOpcode() != Instruction::Load || E->UserTreeIndex)) {
        if (!EI.UserTE->hasState() ||
            EI.UserTE->getOpcode() != Instruction::Select ||
      auto UserBWIt = MinBWs.find(EI.UserTE);
      Type *UserScalarTy =
          (EI.UserTE->isGather() ||
           EI.UserTE->State == TreeEntry::SplitVectorize)
              ? EI.UserTE->Scalars.front()->getType()
              : EI.UserTE->getOperand(EI.EdgeIdx).front()->getType();
      if (UserBWIt != MinBWs.end())
                                        UserBWIt->second.first);
      if (ScalarTy != UserScalarTy) {
        unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
        unsigned SrcBWSz = DL->getTypeSizeInBits(UserScalarTy);
        unsigned VecOpcode;
        if (BWSz > SrcBWSz)
          VecOpcode = Instruction::Trunc;
              It->second.second ? Instruction::SExt : Instruction::ZExt;
        VecCost += TTI->getCastInstrCost(VecOpcode, UserVecTy, VecTy, CCH,
    LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost - CommonCost, ScalarCost,
                             "Calculated costs for Tree"));
    return VecCost - ScalarCost;
  // Calculate cost difference from vectorizing set of GEPs.
  // Negative value means vectorizing is profitable.
  auto GetGEPCostDiff = [=](ArrayRef<Value *> Ptrs, Value *BasePtr) {
    assert((E->State == TreeEntry::Vectorize ||
            E->State == TreeEntry::StridedVectorize ||
            E->State == TreeEntry::CompressVectorize) &&
           "Entry state expected to be Vectorize, StridedVectorize or "
           "MaskedLoadCompressVectorize here.");
    // ...
    auto [ScalarCost, VecCost] = getGEPCosts(
        *TTI, Ptrs, BasePtr, E->getOpcode(), CostKind, OrigScalarTy, VecTy);
    LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost,
                             "Calculated GEPs cost for Tree"));
    return VecCost - ScalarCost;
  };

  auto GetMinMaxCost = [&](Type *Ty, Instruction *VI = nullptr) {
    auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(VI ? VI : VL0);
    if (MinMaxID == Intrinsic::not_intrinsic)
      return InstructionCost::getInvalid();
    Type *CanonicalType = Ty;
    // ...
    IntrinsicCostAttributes CostAttrs(MinMaxID, CanonicalType,
                                      {CanonicalType, CanonicalType});
    InstructionCost IntrinsicCost =
        TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
    // If the select is the only use of the compare, the compare is dead
    // after conversion to min/max, so subtract its cost.
    if (VI && SelectOnly) {
      auto *CI = cast<CmpInst>(cast<SelectInst>(VI)->getCondition());
      assert(/*...*/ && "Expected only for scalar type.");
      IntrinsicCost -= TTI->getCmpSelInstrCost(
          CI->getOpcode(), Ty, Builder.getInt1Ty(), CI->getPredicate(),
          CostKind, {TTI::OK_AnyValue, TTI::OP_None},
          {TTI::OK_AnyValue, TTI::OP_None}, CI);
    }
    return IntrinsicCost;
  };
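  // Per-opcode cost computation. Each case below defines a GetScalarCost
  // lambda (cost of the Idx-th scalar instruction) and a GetVectorCost lambda
  // (cost of the whole vectorized bundle, folding in CommonCost), and then
  // lets GetCostDiff combine the two.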
  auto GetFMulAddCost = [&, &TTI = *TTI](const InstructionsState &S,
                                         Instruction *I) {
    // ...
  };
  switch (ShuffleOrOp) {
  case Instruction::PHI: {
    // Count reused scalars: an operand entry reused via a shuffle mask saves
    // the extra scalar PHIs.
    InstructionCost ScalarCost = 0;
    SmallPtrSet<const TreeEntry *, 4> CountedOps;
    for (Value *V : UniqueValues) {
      auto *PHI = dyn_cast<PHINode>(V);
      if (!PHI)
        continue;
      ValueList Operands(PHI->getNumIncomingValues(), nullptr);
      for (unsigned I = 0, N = PHI->getNumIncomingValues(); I < N; ++I)
        Operands[I] = PHI->getIncomingValue(I);
      if (const TreeEntry *OpTE =
              getSameValuesTreeEntry(Operands.front(), Operands))
        if (CountedOps.insert(OpTE).second &&
            !OpTE->ReuseShuffleIndices.empty())
          ScalarCost += TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() -
                                          OpTE->Scalars.size());
    }
    return CommonCost - ScalarCost;
  }
  case Instruction::ExtractValue:
  case Instruction::ExtractElement: {
    APInt DemandedElts;
    // ...
    auto GetScalarCost = [&](unsigned Idx) {
      // ...
      if (ShuffleOrOp == Instruction::ExtractElement) {
        auto *EE = cast<ExtractElementInst>(I);
        SrcVecTy = EE->getVectorOperandType();
      } else {
        auto *EV = cast<ExtractValueInst>(I);
        Type *AggregateTy = EV->getAggregateOperand()->getType();
        unsigned NumElts;
        if (auto *ATy = dyn_cast<ArrayType>(AggregateTy))
          NumElts = ATy->getNumElements();
        // ...
      }
      // ...
      if (I->hasOneUse()) {
        // If the extract is folded into a cast user, subtract the cast cost.
        // ...
        Cost -= TTI->getCastInstrCost(/*...*/);
        // ...
      }
      // ...
    };
    auto GetVectorCost = [&](InstructionCost CommonCost) {
      // ...
      if (DemandedElts.isZero())
        /*...*/;
      return CommonCost - (DemandedElts.isZero()
                               ? /*...*/
                               : TTI.getScalarizationOverhead(
                                     SrcVecTy, DemandedElts, /*Insert=*/false,
                                     /*Extract=*/true, CostKind));
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::InsertElement: {
    assert(E->ReuseShuffleIndices.empty() &&
           "Unique insertelements only are expected.");
    auto *SrcVecTy = cast<FixedVectorType>(VL0->getType());
    unsigned const NumElts = SrcVecTy->getNumElements();
    unsigned const NumScalars = VL.size();
    // Find the range [OffsetBeg, OffsetEnd] of inserted lanes.
    // ...
    unsigned OffsetEnd = OffsetBeg;
    InsertMask[OffsetBeg] = 0;
    // ...
    if (OffsetBeg > Idx)
      OffsetBeg = Idx;
    else if (OffsetEnd < Idx)
      OffsetEnd = Idx;
    InsertMask[Idx] = I + 1;
    // ...
    if (NumOfParts > 0 && NumOfParts < NumElts)
      VecScalarsSz = PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
    unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
                     VecScalarsSz;
    unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
    unsigned InsertVecSz = std::min<unsigned>(
        /*...*/,
        ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * VecScalarsSz);
    bool IsWholeSubvector =
        OffsetBeg == Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
    // Check if we can safely insert a subvector. If not, use the full vector
    // size instead.
    if (OffsetBeg + InsertVecSz > VecSz) {
      // ...
      InsertVecSz = VecSz;
    }
    // ...
    SmallVector<int> Mask;
    if (!E->ReorderIndices.empty()) {
      // ...
    } else {
      std::iota(Mask.begin(), std::next(Mask.begin(), InsertVecSz), 0);
    }
    bool IsIdentity = true;
    // ...
    Mask.swap(PrevMask);
    for (unsigned I = 0; I < NumScalars; ++I) {
      // ...
      DemandedElts.setBit(InsertIdx);
      IsIdentity &= InsertIdx - OffsetBeg == I;
      Mask[InsertIdx - OffsetBeg] = I;
    }
    assert(Offset < NumElts && "Failed to find vector index offset");
    // ... (scalarization overhead for the demanded lanes, plus a permute of
    //      (InsertVecTy, Mask) when the inserts are not an identity pattern)
    // ...
    return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
    // ...
    SmallBitVector InMask =
        isUndefVector(/*...*/,
                      buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask));
    if (!InMask.all() && NumScalars != NumElts && !IsWholeSubvector) {
      if (InsertVecSz != VecSz) {
        // ...
      }
      // ...
      for (unsigned I = 0, End = OffsetBeg - Offset; I < End; ++I)
        Mask[I] = I;
      for (unsigned I = OffsetBeg - Offset, End = OffsetEnd - Offset;
           I <= End; ++I)
        /*...*/;
      for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I)
        Mask[I] = I;
      // ...
    }
    // ...
  }
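  // Casts: when minimum-bitwidth analysis (MinBWs) has demoted the source
  // and/or destination, the vector opcode may differ from the scalar one -
  // e.g. an sext from a demoted source becomes a trunc or a bitcast, and for
  // extended arithmetic reductions it disappears entirely.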
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
    // ...
    unsigned Opcode = ShuffleOrOp;
    unsigned VecOpcode = Opcode;
    if (/*...*/
        (SrcIt != MinBWs.end() || It != MinBWs.end())) {
      // Check if the values are candidates to demote.
      unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy->getScalarType());
      if (SrcIt != MinBWs.end()) {
        SrcBWSz = SrcIt->second.first;
        // ...
      }
      unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType());
      if (BWSz == SrcBWSz) {
        VecOpcode = Instruction::BitCast;
      } else if (BWSz < SrcBWSz) {
        VecOpcode = Instruction::Trunc;
      } else if (It != MinBWs.end()) {
        assert(BWSz > SrcBWSz && "Invalid cast!");
        VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
      } else if (SrcIt != MinBWs.end()) {
        assert(BWSz > SrcBWSz && "Invalid cast!");
        VecOpcode =
            SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
      }
    } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
               !SrcIt->second.second) {
      VecOpcode = Instruction::UIToFP;
    }
    auto GetScalarCost = [&](unsigned Idx) -> InstructionCost {
      assert(Idx == 0 && "Expected 0 index only");
      return TTI->getCastInstrCost(Opcode, VL0->getType(),
                                   /*...*/);
    };
    auto GetVectorCost = [=](InstructionCost CommonCost) {
      // Do not count cost here if minimum bitwidth is in effect and it is
      // just a bitcast (here it is a noop).
      if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
        return CommonCost;
      auto *VI = VL0->getOpcode() == Opcode ? VL0 : nullptr;
      // ...
      bool IsArithmeticExtendedReduction =
          E->Idx == 0 && UserIgnoreList &&
          all_of(*UserIgnoreList, [](Value *V) {
            auto *I = cast<Instruction>(V);
            return is_contained({Instruction::Add, Instruction::FAdd,
                                 Instruction::Mul, Instruction::FMul,
                                 Instruction::And, Instruction::Or,
                                 /*...*/},
                                I->getOpcode());
          });
      if (IsArithmeticExtendedReduction &&
          (VecOpcode == Instruction::ZExt || VecOpcode == Instruction::SExt))
        return CommonCost;
      return CommonCost +
             TTI->getCastInstrCost(VecOpcode, VecTy, SrcVecTy, CCH, CostKind,
                                   VecOpcode == Opcode ? VI : nullptr);
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::FCmp:
  case Instruction::ICmp:
  case Instruction::Select: {
    CmpPredicate VecPred, SwappedVecPred;
    // ...
    if (match(VL0, MatchCmp))
      /*...*/;
    // ...
    auto GetScalarCost = [&](unsigned Idx) {
      // ...
      if ((/*...*/ !match(VI, MatchCmp)) ||
          /*...*/)
        /*...*/;
      return TTI->getCmpSelInstrCost(
          E->getOpcode(), OrigScalarTy, Builder.getInt1Ty(), CurrentPred,
          CostKind, getOperandInfo(VI->getOperand(0)),
          getOperandInfo(VI->getOperand(1)), VI);
    };
    auto GetVectorCost = [&](InstructionCost CommonCost) {
      // ...
      InstructionCost VecCost =
          TTI->getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy, VecPred,
                                  CostKind, getOperandInfo(E->getOperand(0)),
                                  getOperandInfo(E->getOperand(1)), VL0);
      // ...
      unsigned CondNumElements = CondType->getNumElements();
      // ...
      assert(VecTyNumElements >= CondNumElements &&
             VecTyNumElements % CondNumElements == 0 &&
             "Cannot vectorize Instruction::Select");
      if (CondNumElements != VecTyNumElements) {
        // Duplicate the condition mask to match the wider vector factor.
        // ...
      }
      return VecCost + CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
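  // Combined entries (CombinedOp) stand for patterns matched across several
  // scalar instructions: cmp+select folded to a min/max intrinsic, or
  // fmul+fadd folded to fmuladd. Their vector cost is the intrinsic cost.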
  case TreeEntry::MinMax: {
    auto GetScalarCost = [&](unsigned Idx) {
      return GetMinMaxCost(OrigScalarTy);
    };
    auto GetVectorCost = [&](InstructionCost CommonCost) {
      InstructionCost VecCost = GetMinMaxCost(VecTy);
      return VecCost + CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case TreeEntry::FMulAdd: {
    auto GetScalarCost = [&](unsigned Idx) {
      // ...
      return GetFMulAddCost(E->getOperations(),
                            /*...*/);
    };
    auto GetVectorCost = [&](InstructionCost CommonCost) {
      FastMathFlags FMF;
      // Intersect the fast-math flags of all scalars and their fmul operands.
      for (Value *V : E->Scalars) {
        if (auto *FPCI = dyn_cast<FPMathOperator>(V))
          FMF &= FPCI->getFastMathFlags();
        if (auto *FPCIOp = dyn_cast<FPMathOperator>(/*...*/))
          FMF &= FPCIOp->getFastMathFlags();
      }
      IntrinsicCostAttributes ICA(Intrinsic::fmuladd, VecTy,
                                  {VecTy, VecTy, VecTy}, FMF);
      InstructionCost VecCost = TTI->getIntrinsicInstrCost(ICA, CostKind);
      return VecCost + CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::FNeg:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    auto GetScalarCost = [&](unsigned Idx) {
      // ...
      Value *Op1 = E->getOperand(0)[Idx];
      // ...
      SmallVector<const Value *, 2> Operands(1, Op1);
      if (/*...*/)
        Op2 = E->getOperand(1)[Idx];
      // ...
      InstructionCost ScalarCost = TTI->getArithmeticInstrCost(
          ShuffleOrOp, OrigScalarTy, CostKind, Op1Info, Op2Info, Operands);
      if (auto *I = dyn_cast<Instruction>(UniqueValues[Idx]);
          I && (ShuffleOrOp == Instruction::FAdd ||
                ShuffleOrOp == Instruction::FSub)) {
        // ...
      }
      return ScalarCost;
    };
    auto GetVectorCost = [=](InstructionCost CommonCost) {
      // An 'and' whose constant mask covers all the demoted bits is
      // effectively free after minimum-bitwidth demotion.
      if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
        // ...
        return CI && CI->getValue().countr_one() >= It->second.first;
        // ...
      }
      // ...
      return TTI->getArithmeticInstrCost(ShuffleOrOp, VecTy, CostKind, Op1Info,
                                         Op2Info, {}, nullptr, TLI) +
             CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
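  // Memory operations: the vector cost depends on the entry state -
  // consecutive (possibly interleaved), strided (vp.strided.load/store),
  // compressed (masked load plus a decompress shuffle), or gathered
  // (masked.gather).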
  case Instruction::GetElementPtr: {
    return CommonCost + GetGEPCostDiff(VL, VL0);
  }
  case Instruction::Load: {
    auto GetScalarCost = [&](unsigned Idx) {
      auto *VI = cast<LoadInst>(UniqueValues[Idx]);
      return TTI->getMemoryOpCost(Instruction::Load, OrigScalarTy,
                                  VI->getAlign(), VI->getPointerAddressSpace(),
                                  CostKind, TTI::OperandValueInfo(), VI);
    };
    auto *LI0 = cast<LoadInst>(VL0);
    auto GetVectorCost = [&](InstructionCost CommonCost) {
      InstructionCost VecLdCost;
      switch (E->State) {
      case TreeEntry::Vectorize:
        if (unsigned Factor = E->getInterleaveFactor()) {
          VecLdCost = TTI->getInterleavedMemoryOpCost(
              Instruction::Load, VecTy, Factor, {}, LI0->getAlign(),
              LI0->getPointerAddressSpace(), CostKind);
        } else {
          VecLdCost = TTI->getMemoryOpCost(
              Instruction::Load, VecTy, LI0->getAlign(),
              LI0->getPointerAddressSpace(), CostKind);
        }
        break;
      case TreeEntry::StridedVectorize: {
        const StridedPtrInfo &SPtrInfo = TreeEntryToStridedPtrInfoMap.at(E);
        FixedVectorType *StridedLoadTy = SPtrInfo.Ty;
        assert(StridedLoadTy && "Missing StridedPointerInfo for tree entry.");
        Align CommonAlignment = /*...*/;
        VecLdCost = TTI->getMemIntrinsicInstrCost(
            MemIntrinsicCostAttributes(Intrinsic::experimental_vp_strided_load,
                                       StridedLoadTy, LI0->getPointerOperand(),
                                       /*VariableMask=*/false, CommonAlignment),
            CostKind);
        if (StridedLoadTy != VecTy)
          VecLdCost +=
              TTI->getCastInstrCost(Instruction::BitCast, VecTy, StridedLoadTy,
                                    /*...*/, CostKind);
        break;
      }
      case TreeEntry::CompressVectorize: {
        bool IsMasked;
        unsigned InterleaveFactor;
        SmallVector<int> CompressMask;
        VectorType *LoadVecTy;
        // ...
        if (!E->ReorderIndices.empty()) {
          SmallVector<int> Mask(E->ReorderIndices.begin(),
                                E->ReorderIndices.end());
          // ...
        }
        // ...
        [[maybe_unused]] bool IsVectorized = isMaskedLoadCompress(
            Scalars, PointerOps, E->ReorderIndices, *TTI, *DL, *SE, *AC, *DT,
            *TLI, [](Value *) { return true; }, IsMasked, InterleaveFactor,
            CompressMask, LoadVecTy);
        assert(IsVectorized && "Failed to vectorize load");
        CompressEntryToData.try_emplace(E, CompressMask, LoadVecTy,
                                        InterleaveFactor, IsMasked);
        Align CommonAlignment = LI0->getAlign();
        if (InterleaveFactor) {
          VecLdCost = TTI->getInterleavedMemoryOpCost(
              Instruction::Load, LoadVecTy, InterleaveFactor, {},
              CommonAlignment, LI0->getPointerAddressSpace(), CostKind);
        } else if (IsMasked) {
          VecLdCost = TTI->getMemIntrinsicInstrCost(
              MemIntrinsicCostAttributes(Intrinsic::masked_load, LoadVecTy,
                                         /*...*/
                                         LI0->getPointerAddressSpace()),
              CostKind);
          VecLdCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
                                        LoadVecTy, CompressMask, CostKind);
        } else {
          VecLdCost = TTI->getMemoryOpCost(
              Instruction::Load, LoadVecTy, CommonAlignment,
              LI0->getPointerAddressSpace(), CostKind);
          VecLdCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
                                        LoadVecTy, CompressMask, CostKind);
        }
        break;
      }
      case TreeEntry::ScatterVectorize: {
        Align CommonAlignment = /*...*/;
        VecLdCost = TTI->getMemIntrinsicInstrCost(
            MemIntrinsicCostAttributes(Intrinsic::masked_gather, VecTy,
                                       LI0->getPointerOperand(),
                                       /*VariableMask=*/false, CommonAlignment),
            CostKind);
        break;
      }
      case TreeEntry::CombinedVectorize:
      case TreeEntry::SplitVectorize:
      case TreeEntry::NeedToGather:
        llvm_unreachable("Unexpected vectorization state.");
      }
      return VecLdCost + CommonCost;
    };
    InstructionCost Cost = GetCostDiff(GetScalarCost, GetVectorCost);
    // If this node generates a masked gather load, it is not a terminal node;
    // the address operand cost is estimated separately.
    if (E->State == TreeEntry::ScatterVectorize)
      return Cost;
    // Estimate the cost of the GEPs, since this tree node is a terminator.
    // ...
    return Cost + GetGEPCostDiff(PointerOps, LI0->getPointerOperand());
  }
  case Instruction::Store: {
    bool IsReorder = !E->ReorderIndices.empty();
    auto GetScalarCost = [=](unsigned Idx) {
      auto *VI = cast<StoreInst>(VL[Idx]);
      // ...
      return TTI->getMemoryOpCost(Instruction::Store, OrigScalarTy,
                                  VI->getAlign(), VI->getPointerAddressSpace(),
                                  CostKind, OpInfo, VI);
    };
    auto *BaseSI =
        cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
    auto GetVectorCost = [=](InstructionCost CommonCost) {
      // We know that we can merge the stores. Calculate the cost.
      InstructionCost VecStCost;
      if (E->State == TreeEntry::StridedVectorize) {
        Align CommonAlignment = /*...*/;
        VecStCost = TTI->getMemIntrinsicInstrCost(
            MemIntrinsicCostAttributes(
                Intrinsic::experimental_vp_strided_store, VecTy,
                BaseSI->getPointerOperand(),
                /*VariableMask=*/false, CommonAlignment),
            CostKind);
      } else {
        assert(E->State == TreeEntry::Vectorize &&
               "Expected either strided or consecutive stores.");
        if (unsigned Factor = E->getInterleaveFactor()) {
          assert(E->ReuseShuffleIndices.empty() &&
                 !E->ReorderIndices.empty() && "No reused shuffles expected");
          VecStCost = TTI->getInterleavedMemoryOpCost(
              Instruction::Store, VecTy, Factor, {}, BaseSI->getAlign(),
              BaseSI->getPointerAddressSpace(), CostKind);
        } else {
          TTI::OperandValueInfo OpInfo = getOperandInfo(E->getOperand(0));
          VecStCost = TTI->getMemoryOpCost(
              Instruction::Store, VecTy, BaseSI->getAlign(),
              BaseSI->getPointerAddressSpace(), CostKind, OpInfo);
        }
      }
      return VecStCost + CommonCost;
    };
    SmallVector<Value *> PointerOps(VL.size());
    for (auto [I, V] : enumerate(VL)) {
      unsigned Idx = IsReorder ? E->ReorderIndices[I] : I;
      PointerOps[Idx] = cast<StoreInst>(V)->getPointerOperand();
    }
    return GetCostDiff(GetScalarCost, GetVectorCost) +
           GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
  }
  case Instruction::Call: {
    auto GetScalarCost = [&](unsigned Idx) {
      auto *CI = cast<CallInst>(UniqueValues[Idx]);
      Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
      if (ID != Intrinsic::not_intrinsic) {
        IntrinsicCostAttributes CostAttrs(ID, *CI, 1);
        return TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
      }
      // ...
    };
    auto GetVectorCost = [=](InstructionCost CommonCost) {
      // ...
      auto VecCallCosts = getVectorCallCosts(
          CI, ID, VecTy->getNumElements(),
          It != MinBWs.end() ? It->second.first : 0, TTI);
      return std::min(VecCallCosts.first, VecCallCosts.second) + CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
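  // Alternate-opcode nodes: the bundle mixes two opcodes (e.g. add/sub) and
  // is emitted as two vector instructions blended by a shuffle, unless the
  // target reports a single legal "alt" instruction for the whole pattern.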
  case Instruction::ShuffleVector: {
    assert(/*...*/ &&
           "Invalid Shuffle Vector Operand");
    // Try to find a previous node with the same operands and the same
    // main/alternate opcodes: its vector ops can be reused directly.
    auto TryFindNodeWithEqualOperands = [=]() {
      for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
        if (TE.get() == E)
          break;
        if (TE->hasState() && TE->isAltShuffle() &&
            ((TE->getOpcode() == E->getOpcode() &&
              TE->getAltOpcode() == E->getAltOpcode()) ||
             (TE->getOpcode() == E->getAltOpcode() &&
              TE->getAltOpcode() == E->getOpcode())) &&
            TE->hasEqualOperands(*E))
          return true;
      }
      return false;
    };
    auto GetScalarCost = [&](unsigned Idx) {
      // ...
      assert(E->getMatchingMainOpOrAltOp(VI) &&
             "Unexpected main/alternate opcode");
      return TTI->getInstructionCost(VI, CostKind);
    };
    auto GetVectorCost = [&](InstructionCost CommonCost) {
      // ...
      if (TryFindNodeWithEqualOperands()) {
        LLVM_DEBUG({
          dbgs() << "SLP: diamond match for alternate node found.\n";
          E->dump();
        });
        // No extra vector cost: reuse the same main/alternate vector ops,
        // just with different shuffling.
      } else if (/*...*/) {
        VecCost =
            TTIRef.getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
        VecCost +=
            TTIRef.getArithmeticInstrCost(E->getAltOpcode(), VecTy, CostKind);
      } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
        VecCost = TTIRef.getCmpSelInstrCost(
            E->getOpcode(), VecTy, MaskTy, CI0->getPredicate(), CostKind,
            {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
            /*...*/);
        VecCost += TTIRef.getCmpSelInstrCost(
            E->getOpcode(), VecTy, MaskTy,
            /*...*/,
            {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
            /*...*/);
      } else {
        Type *SrcSclTy = E->getMainOp()->getOperand(0)->getType();
        // ...
        auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
        unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
        unsigned SrcBWSz =
            DL->getTypeSizeInBits(E->getMainOp()->getOperand(0)->getType());
        if (SrcIt != MinBWs.end()) {
          SrcBWSz = SrcIt->second.first;
          // ...
        }
        if (BWSz <= SrcBWSz) {
          if (BWSz < SrcBWSz)
            VecCost =
                TTIRef.getCastInstrCost(Instruction::Trunc, VecTy, SrcTy,
                                        TTI::CastContextHint::None, CostKind);
          LLVM_DEBUG({
            dbgs() << "SLP: alternate extension, which should be truncated.\n";
            E->dump();
          });
          return VecCost;
        }
        VecCost = TTIRef.getCastInstrCost(E->getOpcode(), VecTy, SrcTy,
                                          TTI::CastContextHint::None, CostKind);
        VecCost +=
            TTIRef.getCastInstrCost(E->getAltOpcode(), VecTy, SrcTy,
                                    TTI::CastContextHint::None, CostKind);
      }
      SmallVector<int> Mask;
      E->buildAltOpShuffleMask(
          [&](Instruction *I) {
            assert(E->getMatchingMainOpOrAltOp(I) &&
                   "Unexpected main/alternate opcode");
            return /*...*/;
          },
          Mask);
      // ...
      // Patterns like [fadd,fsub] can be combined into a single instruction
      // on some targets; if so, consider it legal "alt" cost instead.
      unsigned Opcode0 = E->getOpcode();
      unsigned Opcode1 = E->getAltOpcode();
      SmallBitVector OpcodeMask(
          getAltInstrMask(E->Scalars, ScalarTy, Opcode0, Opcode1));
      if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
        InstructionCost AltVecCost = TTIRef.getAltInstrCost(
            VecTy, Opcode0, Opcode1, OpcodeMask, CostKind);
        return AltVecCost < VecCost ? AltVecCost : VecCost;
      }
      // ...
      return VecCost;
    };
    if (SLPReVec && !E->isAltShuffle())
      return GetCostDiff(
          GetScalarCost, [&](InstructionCost) -> InstructionCost {
            // If a group uses mask in order, the shufflevector can be
            // eliminated by instcombine. Then the cost is 0.
            assert(isa<ShuffleVectorInst>(VL.front()) &&
                   "Not supported shufflevector usage.");
            auto *SV = cast<ShuffleVectorInst>(VL.front());
            unsigned SVNumElements =
                cast<FixedVectorType>(SV->getOperand(0)->getType())
                    ->getNumElements();
            unsigned GroupSize = SVNumElements / SV->getShuffleMask().size();
            for (size_t I = 0, End = VL.size(); I != End; I += GroupSize) {
              // ...
              assert(isa<ShuffleVectorInst>(V) &&
                     "Not supported shufflevector usage.");
              // ...
              [[maybe_unused]] bool IsExtractSubvectorMask =
                  SV->isExtractSubvectorMask(Index);
              assert(IsExtractSubvectorMask &&
                     "Not supported shufflevector usage.");
              if (NextIndex != Index)
                return ::getShuffleCost(/*...*/);
              NextIndex += SV->getShuffleMask().size();
            }
            return TTI::TCC_Free;
          });
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::Freeze:
    return CommonCost;
  default:
    llvm_unreachable("Unknown instruction");
  }
}
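// Checks whether a tree of height 1 or 2 is guaranteed to be profitable to
// vectorize whole, so the usual minimum-tree-size limit can be bypassed.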
bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
  LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
                    << VectorizableTree.size() << " is fully vectorizable .\n");

  auto &&AreVectorizableGathers = [this](const TreeEntry *TE, unsigned Limit) {
    SmallVector<int> Mask;
    return TE->isGather() &&
           !any_of(TE->Scalars,
                   [this](Value *V) { return EphValues.contains(V); }) &&
           (/*...*/
            TE->Scalars.size() < Limit ||
            (((TE->hasState() &&
               TE->getOpcode() == Instruction::ExtractElement) ||
              /*...*/) ||
             (TE->hasState() && TE->getOpcode() == Instruction::Load &&
              !TE->isAltShuffle()) ||
             /*...*/));
  };

  // We only handle trees of heights 1 and 2.
  if (VectorizableTree.size() == 1 &&
      (VectorizableTree[0]->State == TreeEntry::Vectorize ||
       VectorizableTree[0]->State == TreeEntry::StridedVectorize ||
       VectorizableTree[0]->State == TreeEntry::CompressVectorize ||
       (ForReduction &&
        AreVectorizableGathers(VectorizableTree[0].get(),
                               VectorizableTree[0]->Scalars.size()) &&
        VectorizableTree[0]->getVectorFactor() > 2)))
    return true;

  if (VectorizableTree.size() != 2)
    return false;

  // Handle splat and all-constants stores. Also try to vectorize tiny trees
  // with the second gather node if it can be vectorized.
  if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
      AreVectorizableGathers(VectorizableTree[1].get(),
                             VectorizableTree[0]->Scalars.size()))
    return true;

  // Gathering cost would be too much for tiny trees.
  if (VectorizableTree[0]->isGather() ||
      (VectorizableTree[1]->isGather() &&
       VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
       VectorizableTree[0]->State != TreeEntry::StridedVectorize &&
       VectorizableTree[0]->State != TreeEntry::CompressVectorize))
    return false;

  return true;
}
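// Load combining (or-chains of shifted, zero-extended loads) is better left
// to the backend, which can merge the pieces into a single wide load; the
// helpers below detect that pattern so SLP can back off.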
static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts,
                                       TargetTransformInfo *TTI,
                                       bool MustMatchOrInst) {
  // Look past the root to find a source value. Arbitrarily follow the path
  // through operand 0 of any 'or'. Also, peek through optional
  // shift-left-by-multiple-of-8-bits.
  Value *ZextLoad = Root;
  const APInt *ShAmtC;
  bool FoundOr = false;
  while (!isa<ConstantExpr>(ZextLoad) &&
         (match(ZextLoad, m_Or(m_Value(), m_Value())) ||
          (match(ZextLoad, m_Shl(m_Value(), m_APInt(ShAmtC))) &&
           ShAmtC->urem(8) == 0))) {
    auto *BinOp = cast<BinaryOperator>(ZextLoad);
    ZextLoad = BinOp->getOperand(0);
    if (BinOp->getOpcode() == Instruction::Or)
      FoundOr = true;
  }
  // Check if the input is an extended load of the required or/shift
  // expression.
  Value *Load;
  if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root ||
      !match(ZextLoad, m_ZExt(m_Value(Load))) || !isa<LoadInst>(Load))
    return false;

  // Require that the total load bit width is a legal integer type.
  // For example, <8 x i8> --> i64 is a legal integer on a 64-bit target.
  // But <16 x i8> --> i128 is not, so the backend probably can't reduce it.
  Type *SrcTy = Load->getType();
  unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts;
  if (!TTI->isTypeLegal(IntegerType::get(Root->getContext(), LoadBitWidth)))
    return false;

  // Everything matched - assume that we can fold the whole sequence using
  // load combining.
  LLVM_DEBUG(dbgs() << "SLP: Assume load combining for tree starting at "
                    << *(cast<Instruction>(Root)) << "\n");

  return true;
}

bool BoUpSLP::isLoadCombineReductionCandidate(RecurKind RdxKind) const {
  if (RdxKind != RecurKind::Or)
    return false;
  unsigned NumElts = VectorizableTree[0]->Scalars.size();
  Value *FirstReduced = VectorizableTree[0]->Scalars[0];
  return isLoadCombineCandidateImpl(FirstReduced, NumElts, TTI,
                                    /*MustMatchOrInst=*/false);
}

bool BoUpSLP::isLoadCombineCandidate(ArrayRef<Value *> Stores) const {
  unsigned NumElts = Stores.size();
  for (Value *Scalar : Stores) {
    Value *X;
    if (!match(Scalar, m_Store(m_Value(X), m_Value())) ||
        !isLoadCombineCandidateImpl(X, NumElts, TTI, /*MustMatchOrInst=*/true))
      return false;
  }
  return true;
}
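// A tree that is "tiny and not fully vectorizable" is rejected up front:
// gather overhead would dominate any win from the few vector operations.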
bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
  // No need to vectorize inserts of gathered values.
  if (VectorizableTree.empty()) {
    assert(ExternalUses.empty() && "We shouldn't have any external users");
    return true;
  }

  // No need to vectorize a small tree whose second node is just a gather.
  if (VectorizableTree.size() == 2 &&
      /*...*/ VectorizableTree[1]->isGather() &&
      (VectorizableTree[1]->getVectorFactor() <= 2 ||
       !(isSplat(VectorizableTree[1]->Scalars) ||
         /*...*/)))
    return true;

  // If the graph includes only PHI nodes and gathers, it is definitely not
  // profitable for the vectorization: the cost of vectorized PHI nodes is
  // almost always 0 + the cost of gathers.
  constexpr int Limit = 4;
  if (/*...*/ !VectorizableTree.empty() &&
      all_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
        return (TE->isGather() &&
                (!TE->hasState() ||
                 TE->getOpcode() != Instruction::ExtractElement) &&
                /*...*/) ||
               (TE->hasState() && TE->getOpcode() == Instruction::PHI);
      }))
    return true;

  // Do not vectorize small trees of phis + insertelements feeding mostly
  // gathered values.
  if (/*...*/ VectorizableTree.size() <= Limit &&
      all_of(VectorizableTree,
             [&](const std::unique_ptr<TreeEntry> &TE) {
               return (TE->isGather() &&
                       (!TE->hasState() ||
                        TE->getOpcode() != Instruction::ExtractElement) &&
                       /*...*/) ||
                      (TE->getOpcode() == Instruction::InsertElement ||
                       (TE->getOpcode() == Instruction::PHI &&
                        all_of(TE->Scalars, [&](Value *V) {
                          return isa<PoisonValue>(V) || MustGather.contains(V);
                        })));
             }) &&
      any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
        return TE->State == TreeEntry::Vectorize &&
               TE->getOpcode() == Instruction::PHI;
      }))
    return true;

  // Number of gather nodes and the limit on the tree size.
  unsigned NumGathers = 0;
  constexpr int LimitTreeSize = 36;
  SmallVector<const TreeEntry *> StoreLoadNodes;
  if (/*...*/
      all_of(VectorizableTree,
             [&](const std::unique_ptr<TreeEntry> &TE) {
               if (!TE->isGather() && TE->hasState() &&
                   (TE->getOpcode() == Instruction::Load ||
                    TE->getOpcode() == Instruction::Store)) {
                 StoreLoadNodes.push_back(TE.get());
                 return true;
               }
               if (TE->isGather())
                 ++NumGathers;
               return TE->State == TreeEntry::SplitVectorize ||
                      (TE->Idx == 0 && TE->Scalars.size() == 2 &&
                       TE->hasState() &&
                       TE->getOpcode() == Instruction::ICmp &&
                       VectorizableTree.size() > LimitTreeSize) ||
                      /*...*/
                      (TE->getOpcode() == Instruction::PHI ||
                       (TE->hasCopyableElements() &&
                        /*...*/ TE->Scalars.size() / 2) ||
                       ((!TE->ReuseShuffleIndices.empty() ||
                         !TE->ReorderIndices.empty() || TE->isAltShuffle()) &&
                        TE->Scalars.size() == 2));
             }) &&
      (StoreLoadNodes.empty() ||
       (VectorizableTree.size() > LimitTreeSize * StoreLoadNodes.size() &&
        (NumGathers > 0 ||
         none_of(StoreLoadNodes, [&](const TreeEntry *TE) {
           return TE->getOpcode() == Instruction::Store ||
                  all_of(TE->Scalars, [&](Value *V) {
                    return !isa<LoadInst>(V) ||
                           areAllUsersVectorized(cast<Instruction>(V));
                  });
         })))))
    return true;

  // Do not vectorize small split-vectorize trees with gather operands only.
  if (VectorizableTree.front()->State == TreeEntry::SplitVectorize &&
      VectorizableTree.size() >= Limit &&
      none_of(/*...*/,
              [&](const std::unique_ptr<TreeEntry> &TE) {
                return !TE->isGather() && TE->UserTreeIndex.UserTE &&
                       TE->UserTreeIndex.UserTE->Idx == 0;
              }))
    return true;

  // Do not vectorize insertelement + phi trees where everything else is
  // gathered.
  if (VectorizableTree.size() > 2 &&
      VectorizableTree.front()->State == TreeEntry::Vectorize &&
      VectorizableTree.front()->getOpcode() == Instruction::InsertElement &&
      VectorizableTree[1]->State == TreeEntry::Vectorize &&
      VectorizableTree[1]->getOpcode() == Instruction::PHI &&
      all_of(ArrayRef(VectorizableTree).drop_front(2),
             [&](const std::unique_ptr<TreeEntry> &TE) {
               return TE->isGather();
             }))
    return true;

  // If we have a tiny tree (a tree whose size is less than MinTreeSize), we
  // can vectorize it only if we can prove it fully vectorizable.
  if (isFullyVectorizableTinyTree(ForReduction))
    return false;

  // Check if any of the gather nodes forms an insertelement buildvector
  // somewhere.
  bool IsAllowedSingleBVNode =
      VectorizableTree.size() > 1 ||
      (VectorizableTree.size() == 1 && VectorizableTree.front()->hasState() &&
       !VectorizableTree.front()->isAltShuffle() &&
       VectorizableTree.front()->getOpcode() != Instruction::PHI &&
       VectorizableTree.front()->getOpcode() != Instruction::GetElementPtr &&
       /*...*/);
  if (any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
        return TE->isGather() && all_of(TE->Scalars, [&](Value *V) {
                 return isa<ExtractElementInst, Constant>(V) ||
                        (IsAllowedSingleBVNode &&
                         !V->hasNUsesOrMore(UsesLimit) &&
                         any_of(V->users(), IsaPred<InsertElementInst>));
               });
      }))
    return false;

  if (VectorizableTree.back()->isGather() &&
      VectorizableTree.back()->hasState() &&
      VectorizableTree.back()->isAltShuffle() &&
      VectorizableTree.back()->getVectorFactor() > 2 &&
      /*...*/
      !VectorizableTree.back()->Scalars.front()->getType()->isVectorTy() &&
      TTI->getScalarizationOverhead(
          getWidenedType(VectorizableTree.back()->Scalars.front()->getType(),
                         VectorizableTree.back()->getVectorFactor()),
          /*...*/) > -SLPCostThreshold)
    return true;

  // A non-power-of-2 buildvector root may still be worth keeping if its
  // gathered loads can form small vectorizable subtrees.
  constexpr unsigned SmallTree = 3;
  if (VectorizableTree.front()->isNonPowOf2Vec() &&
      /*...*/
      any_of(/*...*/, [](const std::unique_ptr<TreeEntry> &TE) {
        return TE->isGather() && TE->hasState() &&
               TE->getOpcode() == Instruction::Load &&
               /*...*/;
      }))
    return true;
  // ...
  // (elided: helper walking the remaining tree entries)
  // ...
    TreeEntry &E = *VectorizableTree[Idx];
    if (E.State == TreeEntry::SplitVectorize)
      /*...*/;
    // ...
    if ((E.hasState() && E.getOpcode() != Instruction::Load) ||
        /*...*/)
      return false;
  // ...
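// Spill-cost estimation: a vector value that stays live across a call
// typically has to be spilled and reloaded, which can erase the gain from
// vectorizing. The walk below tracks live tree entries between their last
// instructions and charges getCostOfKeepingLiveOverCall wherever a real call
// is crossed.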
InstructionCost BoUpSLP::getSpillCost() {
  InstructionCost Cost = 0;
  const TreeEntry *Root = VectorizableTree.front().get();
  if (Root->isGather())
    return Cost;
  // Map each vectorized entry to its last instruction and record the operands
  // of each entry, to be able to walk the tree top-down.
  // ...
  for (const auto &TEPtr : VectorizableTree) {
    if (!TEPtr->isGather()) {
      Instruction *LastInst = &getLastInstructionInBundle(TEPtr.get());
      EntriesToLastInstruction.try_emplace(TEPtr.get(), LastInst);
      LastInstructions.insert(LastInst);
    }
    if (TEPtr->UserTreeIndex)
      EntriesToOperands[TEPtr->UserTreeIndex.UserTE].push_back(TEPtr.get());
  }
  // Assume-like intrinsics and calls that lower to something cheaper than a
  // real call do not force a spill.
  auto NoCallIntrinsic = [this](const Instruction *I) {
    const auto *II = dyn_cast<IntrinsicInst>(I);
    if (!II)
      return false;
    if (II->isAssumeLikeIntrinsic())
      return true;
    // ...
    return IntrCost < CallCost;
  };
  // Walk back from Last to First within a block and check whether any real
  // call is crossed; cache per-instruction answers to keep the walk within
  // the budget.
  SmallDenseMap<const Instruction *, PointerIntPair<const Instruction *, 1>>
      CheckedInstructions;
  unsigned Budget = 0;
  const unsigned BudgetLimit = /*...*/;
  auto CheckForNonVecCallsInSameBlock = [&](Instruction *First,
                                            const Instruction *Last) {
    assert(First->getParent() == Last->getParent() &&
           "Expected instructions in same block.");
    if (auto It = CheckedInstructions.find(Last);
        It != CheckedInstructions.end()) {
      const Instruction *Checked = It->second.getPointer();
      if (/*...*/)
        return It->second.getInt() != 0;
      // ...
    }
    SmallVector<const Instruction *> LastInstsInRange;
    auto InstIt = ++First->getIterator().getReverse(),
         PrevInstIt = Last->getIterator().getReverse();
    while (InstIt != PrevInstIt && Budget <= BudgetLimit) {
      if (/*... a real call is found ...*/) {
        for (const Instruction *LastInst : LastInstsInRange)
          CheckedInstructions.try_emplace(LastInst, &*PrevInstIt, 0);
        return false;
      }
      if (LastInstructions.contains(&*PrevInstIt))
        LastInstsInRange.push_back(&*PrevInstIt);
      ++PrevInstIt;
      ++Budget;
    }
    for (const Instruction *LastInst : LastInstsInRange)
      CheckedInstructions.try_emplace(
          LastInst, PrevInstIt == InstIt ? First : &*PrevInstIt,
          Budget <= BudgetLimit ? 1 : 0);
    return Budget <= BudgetLimit;
  };
  auto AddCosts = [&](const TreeEntry *Op) {
    Type *ScalarTy = Op->Scalars.front()->getType();
    auto It = MinBWs.find(Op);
    if (It != MinBWs.end())
      ScalarTy = IntegerType::get(ScalarTy->getContext(), It->second.first);
    auto *VecTy = getWidenedType(ScalarTy, Op->getVectorFactor());
    Cost += TTI->getCostOfKeepingLiveOverCall(VecTy);
    if (!Op->isGather())
      Cost -= Op->Scalars.size() * TTI->getCostOfKeepingLiveOverCall(ScalarTy);
  };
  // Memoize per-(Root, OpParent) predecessor checks between blocks.
  SmallDenseMap<std::pair<const BasicBlock *, const BasicBlock *>, bool>
      ParentOpParentToPreds;
  auto CheckPredecessors = [&](BasicBlock *Root, BasicBlock *Pred,
                               BasicBlock *OpParent) {
    auto Key = std::make_pair(Root, OpParent);
    if (auto It = ParentOpParentToPreds.find(Key);
        It != ParentOpParentToPreds.end())
      return It->second;
    // ...
    SmallDenseSet<std::pair<const BasicBlock *, const BasicBlock *>>
        ParentsPairsToAdd;
    // ...
    for (const auto &KeyPair : ParentsPairsToAdd) {
      assert(!ParentOpParentToPreds.contains(KeyPair) &&
             "Should not have been added before.");
      // ...
    }
    // ...
    while (!Worklist.empty()) {
      BasicBlock *BB = Worklist.pop_back_val();
      if (BB == OpParent || !Visited.insert(BB).second)
        continue;
      auto Pair = std::make_pair(BB, OpParent);
      if (auto It = ParentOpParentToPreds.find(Pair);
          It != ParentOpParentToPreds.end()) {
        // ...
      }
      ParentsPairsToAdd.insert(Pair);
      // ...
      if (Budget > BudgetLimit)
        /*...*/;
      // ...
    }
    // ...
  };
  // Walk the tree from the root, adding spill costs where a vector value is
  // live across a call.
  SmallVector<const TreeEntry *> LiveEntries(1, Root);
  while (!LiveEntries.empty()) {
    const TreeEntry *Entry = LiveEntries.pop_back_val();
    SmallVector<const TreeEntry *> Operands = EntriesToOperands.lookup(Entry);
    if (Operands.empty())
      continue;
    Instruction *LastInst = EntriesToLastInstruction.at(Entry);
    BasicBlock *Parent = LastInst->getParent();
    for (const TreeEntry *Op : Operands) {
      if (!Op->isGather())
        LiveEntries.push_back(Op);
      if (Entry->State == TreeEntry::SplitVectorize ||
          (Entry->getOpcode() != Instruction::PHI && Op->isGather()) ||
          /*...*/)
        continue;
      BasicBlock *Pred = nullptr;
      if (auto *Phi = dyn_cast<PHINode>(Entry->getMainOp()))
        Pred = Phi->getIncomingBlock(Op->UserTreeIndex.EdgeIdx);
      BasicBlock *OpParent;
      Instruction *OpLastInst;
      if (Op->isGather()) {
        assert(Entry->getOpcode() == Instruction::PHI &&
               "Expected phi node only.");
        OpParent = cast<PHINode>(Entry->getMainOp())
                       ->getIncomingBlock(Op->UserTreeIndex.EdgeIdx);
        // ...
        for (Value *V : Op->Scalars) {
          // ...
        }
      } else {
        OpLastInst = EntriesToLastInstruction.at(Op);
        OpParent = OpLastInst->getParent();
      }
      // Check the call instructions within the same basic block.
      if (OpParent == Parent) {
        if (Entry->getOpcode() == Instruction::PHI) {
          if (!CheckForNonVecCallsInSameBlock(LastInst, OpLastInst))
            AddCosts(Op);
          continue;
        }
        if (!CheckForNonVecCallsInSameBlock(OpLastInst, LastInst))
          AddCosts(Op);
        continue;
      }
      // Check for calls in between blocks.
      if (Entry->getOpcode() != Instruction::PHI &&
          !CheckForNonVecCallsInSameBlock(
              &*LastInst->getParent()->getFirstNonPHIOrDbgOrAlloca(),
              LastInst)) {
        AddCosts(Op);
        continue;
      }
      if (!CheckForNonVecCallsInSameBlock(OpLastInst,
                                          OpParent->getTerminator())) {
        AddCosts(Op);
        continue;
      }
      if (!CheckPredecessors(Parent, Pred, OpParent)) {
        AddCosts(Op);
        continue;
      }
    }
  }
  return Cost;
}
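// Checks whether two insertelement instructions belong to the same
// buildvector sequence, by walking both insert chains upward in lockstep.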
  const auto *I1 = IE1;
  const auto *I2 = IE2;
  // ...
  do {
    // ...
    PrevI1 = I1;
    PrevI2 = I2;
    if (I1 && (I1 == IE1 || I1->hasOneUse()) &&
        /*...*/)
      I1 = dyn_cast<InsertElementInst>(I1->getOperand(0));
    if (I2 && ((I2 == IE2 || I2->hasOneUse())) &&
        /*...*/)
      I2 = dyn_cast<InsertElementInst>(I2->getOperand(0));
  } while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2));
  // ...
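// The helpers below post-process insertelement users of the vectorized tree:
// per-source shuffle masks are combined pairwise (resizing vectors whose
// factors differ via the ResizeAction callback), so the external inserts can
// be rewritten as at most a few two-source shuffles.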
namespace {
/// Returns incoming Value *, if the requested type is Value * too, or a
/// default value, otherwise.
struct ValueSelect {
  template <typename U>
  static std::enable_if_t<std::is_same_v<Value *, U>, Value *> get(Value *V) {
    return V;
  }
  template <typename U>
  static std::enable_if_t<!std::is_same_v<Value *, U>, U> get(Value *) {
    return U();
  }
};
} // namespace

/// Does the analysis of the provided shuffle masks and performs the requested
/// actions on the vectors with the given shuffle masks, processing them in
/// terms of the "actual" vector and combining each mask with the previous one.
template <typename T>
static T *performExtractsShuffleAction(
    MutableArrayRef<std::pair<T *, SmallVector<int>>> ShuffleMask, Value *Base,
    function_ref<unsigned(T *)> GetVF,
    function_ref<std::pair<T *, bool>(T *, ArrayRef<int>, bool)> ResizeAction,
    function_ref<T *(ArrayRef<int>, ArrayRef<T *>)> Action) {
  assert(!ShuffleMask.empty() && "Empty list of shuffles for inserts.");
  SmallVector<int> Mask(ShuffleMask.begin()->second);
  auto VMIt = std::next(ShuffleMask.begin());
  T *Prev = nullptr;
  SmallBitVector UseMask =
      buildUseMask(Mask.size(), Mask, UseMask::UndefsAsMask);
  SmallBitVector IsBaseUndef = isUndefVector(Base, UseMask);
  if (!IsBaseUndef.all()) {
    // Base is not undef, need to combine it with the next subvectors.
    std::pair<T *, bool> Res =
        ResizeAction(ShuffleMask.begin()->first, Mask, /*ForSingleMask=*/false);
    for (unsigned Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) {
      if (Mask[Idx] == PoisonMaskElem)
        Mask[Idx] = /*...*/;
      else
        Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF;
    }
    [[maybe_unused]] auto *V = ValueSelect::get<T *>(Base);
    assert((!V || GetVF(V) == Mask.size()) &&
           "Expected base vector of VF number of elements.");
    Prev = Action(Mask, {nullptr, Res.first});
  } else if (ShuffleMask.size() == 1) {
    // Base is undef and only 1 vector is shuffled - perform the action only
    // for the single vector, if the mask is not the identity mask.
    std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask,
                                            /*ForSingleMask=*/true);
    if (Res.second)
      // Identity mask is found.
      Prev = Res.first;
    else
      Prev = Action(Mask, {ShuffleMask.begin()->first});
  } else {
    // Base is undef and at least 2 input vectors are shuffled - perform
    // 2-vector shuffles step by step, combining masks between the steps.
    unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);
    unsigned Vec2VF = GetVF(VMIt->first);
    if (Vec1VF == Vec2VF) {
      // No need to resize the input vectors since they are of the same size,
      // they can be shuffled directly.
      ArrayRef<int> SecMask = VMIt->second;
      for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
        if (SecMask[I] != PoisonMaskElem) {
          // ...
          Mask[I] = SecMask[I] + Vec1VF;
        }
      }
      Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
    } else {
      // Vectors of different sizes - resize and reshuffle.
      std::pair<T *, bool> Res1 = ResizeAction(ShuffleMask.begin()->first, Mask,
                                               /*ForSingleMask=*/false);
      std::pair<T *, bool> Res2 =
          ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
      ArrayRef<int> SecMask = VMIt->second;
      for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
        if (Mask[I] != PoisonMaskElem) {
          Mask[I] = (Res1.second ? I : Mask[I]);
        } else if (SecMask[I] != PoisonMaskElem) {
          Mask[I] = (Res2.second ? I : SecMask[I]) + VF;
        }
      }
      Prev = Action(Mask, {Res1.first, Res2.first});
    }
    VMIt = std::next(VMIt);
  }
  [[maybe_unused]] bool IsBaseNotUndef = !IsBaseUndef.all();
  // Perform requested actions for the remaining masks/vectors.
  for (auto E = ShuffleMask.end(); VMIt != E; ++VMIt) {
    // Shuffle other input vectors, if any.
    std::pair<T *, bool> Res =
        ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
    ArrayRef<int> SecMask = VMIt->second;
    for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
      if (SecMask[I] != PoisonMaskElem) {
        assert((Mask[I] == PoisonMaskElem || IsBaseNotUndef) &&
               "Multiple uses of scalars.");
        Mask[I] = (Res.second ? I : SecMask[I]) + VF;
      } else if (Mask[I] != PoisonMaskElem) {
        Mask[I] = I;
      }
    }
    Prev = Action(Mask, {Prev, Res.first});
  }
  return Prev;
}

template <typename T> struct ShuffledInsertData {
  /// List of insertelements to be replaced by shuffles.
  SmallVector<InsertElementInst *> InsertElements;
  /// The parent vectors and shuffle mask for the given list of inserts.
  MapVector<T, SmallVector<int>> ValueMasks;
};
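// getTreeCost = sum of per-entry costs + the cost of extracting externally
// used scalars - the savings from rewriting insertelement users as shuffles,
// plus optional reduction-resize and spill costs. A negative total means
// vectorization is profitable.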
  LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
                    << VectorizableTree.size() << ".\n");

  SmallPtrSet<Value *, 4> CheckedExtracts;
  for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
    TreeEntry &TE = *VectorizableTree[I];
    // No need to count the cost for combined entries, they are combined and
    // just skip their cost.
    if (TE.State == TreeEntry::CombinedVectorize) {
      LLVM_DEBUG(
          dbgs() << "SLP: Skipping cost for combined node that starts with "
                 << *TE.Scalars[0] << ".\n";
          TE.dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
      continue;
    }
    if (TE.hasState() &&
        (TE.isGather() || TE.State == TreeEntry::SplitVectorize)) {
      if (const TreeEntry *E =
              getSameValuesTreeEntry(TE.getMainOp(), TE.Scalars);
          E && E->getVectorFactor() == TE.getVectorFactor()) {
        // Some gather nodes might be absolutely the same as some vectorized
        // nodes after reordering - no extra cost for them.
        LLVM_DEBUG(dbgs() << "SLP: Adding cost 0 for bundle.\n"
                          << "SLP: Current total cost = " << Cost << "\n");
        continue;
      }
    }

    assert((!TE.isGather() || TE.Idx == 0 || TE.UserTreeIndex) &&
           "Expected gather nodes with users only.");

    InstructionCost C = getEntryCost(&TE, VectorizedVals, CheckedExtracts);
    Cost += C;
    LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle.\n"
                      << "SLP: Current total cost = " << Cost << "\n");
  }
  if (Cost >= -SLPCostThreshold &&
      none_of(ExternalUses, [](const ExternalUser &EU) {
        return isa_and_nonnull<InsertElementInst>(EU.User);
      }))
    return Cost;

  SmallPtrSet<Value *, 16> ExtractCostCalculated;
  InstructionCost ExtractCost = 0;
  SmallVector<ShuffledInsertData<const TreeEntry *>> ShuffledInserts;
  SmallVector<APInt> DemandedElts;
  SmallDenseSet<Value *, 4> UsedInserts;
  DenseSet<std::pair<const TreeEntry *, Type *>> VectorCasts;
  std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
  DenseMap<const TreeEntry *, DenseSet<Value *>> ExtractsCount;
  SmallPtrSet<Value *, 4> ScalarOpsFromCasts;
  // Keep track of {Scalar, Index, User} tuples: this can help the backend
  // fold the extractelement into the user instruction for free.
  SmallVector<std::tuple<Value *, User *, int>, 4> ScalarUserAndIdx;
  for (ExternalUser &EU : ExternalUses) {
    ScalarUserAndIdx.emplace_back(EU.Scalar, EU.User, EU.Lane);
  }
  SmallDenseSet<std::pair<Value *, User *>, 8> CheckedScalarUser;
  for (ExternalUser &EU : ExternalUses) {
    LLVM_DEBUG(dbgs() << "SLP: Computing cost for external use of TreeEntry "
                      << EU.E.Idx << " in lane " << EU.Lane << "\n");
    LLVM_DEBUG(if (EU.User) dbgs() << "  User: " << *EU.User << "\n";
               else dbgs() << "  User: nullptr\n");
    LLVM_DEBUG(dbgs() << "  Use: " << EU.Scalar->getNameOrAsOperand() << "\n");

    // Uses by ephemeral values are free (the ephemeral value will be removed
    // prior to code generation, and so will the extraction).
    if (EphValues.count(EU.User))
      continue;

    // Check if the scalar for the given user (or all users) is accounted
    // already.
    if (!CheckedScalarUser.insert(std::make_pair(EU.Scalar, EU.User)).second ||
        (EU.User &&
         CheckedScalarUser.contains(std::make_pair(EU.Scalar, nullptr))))
      continue;

    // Skip users in unreachable blocks and already-counted scalars.
    if (/*...*/
        (!DT->isReachableFromEntry(UserParent) || UserParent->isEHPad() ||
         /*...*/) ||
        !ExtractCostCalculated.insert(EU.Scalar).second)
      continue;

    // If the found user is an insertelement, do not calculate the extract
    // cost but try to detect it as a final shuffled/identity match.
    if (auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User);
        VU && VU->getOperand(1) == EU.Scalar) {
      if (auto *FTy = dyn_cast<FixedVectorType>(VU->getType())) {
        if (!UsedInserts.insert(VU).second)
          continue;
        std::optional<unsigned> InsertIdx = getElementIndex(VU);
        if (InsertIdx) {
          const TreeEntry *ScalarTE = &EU.E;
          auto *It = find_if(
              ShuffledInserts,
              [this, VU](const ShuffledInsertData<const TreeEntry *> &Data) {
                // Checks if 2 insertelements are from the same buildvector.
                InsertElementInst *VecInsert = Data.InsertElements.front();
                return areTwoInsertFromSameBuildVector(
                    VU, VecInsert, [](InsertElementInst *II) -> Value * {
                      Value *Op0 = II->getOperand(0);
                      // ...
                      return Op0;
                    });
              });
          int VecId = -1;
          if (It == ShuffledInserts.end()) {
            auto &Data = ShuffledInserts.emplace_back();
            Data.InsertElements.emplace_back(VU);
            DemandedElts.push_back(APInt::getZero(FTy->getNumElements()));
            VecId = ShuffledInserts.size() - 1;
            auto It = MinBWs.find(ScalarTE);
            if (It != MinBWs.end() &&
                VectorCasts
                    .insert(std::make_pair(ScalarTE, FTy->getElementType()))
                    .second) {
              unsigned BWSz = It->second.first;
              unsigned DstBWSz = DL->getTypeSizeInBits(FTy->getElementType());
              unsigned VecOpcode;
              if (DstBWSz < BWSz)
                VecOpcode = Instruction::Trunc;
              else
                VecOpcode =
                    It->second.second ? Instruction::SExt : Instruction::ZExt;
              InstructionCost C = TTI->getCastInstrCost(
                  VecOpcode, FTy,
                  getWidenedType(IntegerType::get(FTy->getContext(), BWSz),
                                 FTy->getNumElements()),
                  TTI::CastContextHint::None, CostKind);
              LLVM_DEBUG(dbgs()
                         << "SLP: Adding cost " << C
                         << " for extending externally used vector with "
                            "non-equal minimum bitwidth.\n");
              Cost += C;
            }
          } else {
            if (isFirstInsertElement(VU, It->InsertElements.front()))
              It->InsertElements.front() = VU;
            VecId = std::distance(ShuffledInserts.begin(), It);
          }
          int InIdx = *InsertIdx;
          SmallVectorImpl<int> &Mask =
              ShuffledInserts[VecId].ValueMasks[ScalarTE];
          if (Mask.empty())
            Mask.assign(FTy->getNumElements(), PoisonMaskElem);
          Mask[InIdx] = EU.Lane;
          DemandedElts[VecId].setBit(InIdx);
          continue;
        }
      }
    }

    // If we plan to rewrite the tree in a smaller type, we will need to
    // extend the extracted value back to the original type; account for the
    // extract plus the added cost of that extend, if needed.
    InstructionCost ExtraCost = TTI::TCC_Free;
    auto *ScalarTy = EU.Scalar->getType();
    const unsigned BundleWidth = EU.E.getVectorFactor();
    assert(EU.Lane < BundleWidth && "Extracted lane out of bounds.");
    auto *VecTy = getWidenedType(ScalarTy, BundleWidth);
    const TreeEntry *Entry = &EU.E;
    auto It = MinBWs.find(Entry);
    if (It != MinBWs.end()) {
      Type *MinTy = IntegerType::get(F->getContext(), It->second.first);
      unsigned Extend = /*...*/ ? Instruction::ZExt : Instruction::SExt;
      VecTy = getWidenedType(MinTy, BundleWidth);
      ExtraCost = /*... extract + extend cost ...*/;
      LLVM_DEBUG(dbgs() << "  ExtractExtend cost: " << ExtraCost << "\n");
    } else {
      ExtraCost = /*... getVectorInstrCost(...,*/ VecTy,
                  CostKind, EU.Lane, EU.Scalar, ScalarUserAndIdx);
      LLVM_DEBUG(dbgs() << "  ExtractElement cost for " << *ScalarTy << " from "
                        << *VecTy << ": " << ExtraCost << "\n");
    }
    // Leave the scalar instructions as is if they are cheaper than extracts.
    if (Entry->Idx != 0 || Entry->getOpcode() == Instruction::GetElementPtr ||
        Entry->getOpcode() == Instruction::Load) {
      // Checks if the user of the external scalar is a phi in a loop body.
      auto IsPhiInLoop = [&](const ExternalUser &U) {
        if (auto *Phi = dyn_cast_if_present<PHINode>(U.User)) {
          auto *I = cast<Instruction>(U.Scalar);
          const Loop *L = LI->getLoopFor(Phi->getParent());
          return L && (Phi->getParent() == I->getParent() ||
                       L == LI->getLoopFor(I->getParent()));
        }
        return false;
      };
      if (!ValueToExtUses) {
        ValueToExtUses.emplace();
        for (const auto &P : enumerate(ExternalUses)) {
          // Ignore phis in loops.
          if (IsPhiInLoop(P.value()))
            continue;
          ValueToExtUses->try_emplace(P.value().Scalar, P.index());
        }
      }
      // Can use the original instruction, if no operands are vectorized or
      // they are marked as externally used already.
      if (auto *Inst = dyn_cast<Instruction>(EU.Scalar)) {
        InstructionCost ScalarCost = TTI->getInstructionCost(Inst, CostKind);
        auto OperandIsScalar = [&](Value *V) {
          if (!isVectorized(V)) {
            // Some extractelements might be not vectorized, but transformed
            // into a shuffle and removed from the function, consider it here.
            if (auto *EE = dyn_cast<ExtractElementInst>(V))
              return !EE->hasOneUse() || !MustGather.contains(EE);
            return true;
          }
          return ValueToExtUses->contains(V);
        };
        bool CanBeUsedAsScalar = all_of(Inst->operands(), OperandIsScalar);
        bool CanBeUsedAsScalarCast = false;
        if (auto *CI = dyn_cast<CastInst>(Inst); CI && !CanBeUsedAsScalar) {
          if (auto *Op = dyn_cast<Instruction>(CI->getOperand(0));
              Op && all_of(Op->operands(), OperandIsScalar)) {
            InstructionCost OpCost =
                /*...*/ TTI->getInstructionCost(Op, CostKind);
            if (ScalarCost + OpCost <= ExtraCost) {
              CanBeUsedAsScalar = CanBeUsedAsScalarCast = true;
              ScalarCost += OpCost;
            }
          }
        }
        if (CanBeUsedAsScalar) {
          bool KeepScalar = ScalarCost <= ExtraCost;
          // Keep the vector version only if the extra cost of the PHI user
          // is compensated elsewhere.
          bool IsProfitablePHIUser =
              (KeepScalar || (ScalarCost - ExtraCost <= TTI::TCC_Basic &&
                              VectorizableTree.front()->Scalars.size() > 2)) &&
              VectorizableTree.front()->hasState() &&
              VectorizableTree.front()->getOpcode() == Instruction::PHI &&
              /*...*/
              none_of(Inst->users(),
                      [&](User *U) {
                        auto *PHIUser = dyn_cast<PHINode>(U);
                        return (!PHIUser ||
                                PHIUser->getParent() !=
                                    cast<Instruction>(
                                        VectorizableTree.front()->getMainOp())
                                        ->getParent()) &&
                               /*...*/;
                      }) &&
              count_if(Entry->Scalars, [&](Value *V) {
                return ValueToExtUses->contains(V);
              }) <= 2;
          if (IsProfitablePHIUser) {
            KeepScalar = true;
          } else if (KeepScalar && ScalarCost != TTI::TCC_Free &&
                     ExtraCost - ScalarCost <= TTI::TCC_Basic &&
                     (!GatheredLoadsEntriesFirst.has_value() ||
                      Entry->Idx < *GatheredLoadsEntriesFirst)) {
            unsigned ScalarUsesCount = count_if(Entry->Scalars, [&](Value *V) {
              return ValueToExtUses->contains(V);
            });
            auto It = ExtractsCount.find(Entry);
            if (It != ExtractsCount.end()) {
              assert(ScalarUsesCount >= It->getSecond().size() &&
                     "Expected total number of external uses not less than "
                     "number of scalar uses.");
              ScalarUsesCount -= It->getSecond().size();
            }
            // Keep the original scalar unless the number of externally used
            // instructions in the same entry is a power of 2; this keeps
            // better vectorization of the rest of the tree.
            KeepScalar =
                ScalarUsesCount <= 1 || !has_single_bit(ScalarUsesCount);
          }
          if (KeepScalar) {
            ExternalUsesAsOriginalScalar.insert(EU.Scalar);
            for (Value *V : Inst->operands()) {
              auto It = ValueToExtUses->find(V);
              if (It != ValueToExtUses->end()) {
                // Replace all uses to avoid compiler crash.
                ExternalUses[It->second].User = nullptr;
              }
            }
            ExtraCost = ScalarCost;
            if (!IsPhiInLoop(EU))
              ExtractsCount[Entry].insert(Inst);
            if (CanBeUsedAsScalarCast) {
              ScalarOpsFromCasts.insert(Inst->getOperand(0));
              // Update the users of the operands of the cast operand to avoid
              // compiler crash.
              if (auto *IOp = dyn_cast<Instruction>(Inst->getOperand(0))) {
                for (Value *V : IOp->operands()) {
                  auto It = ValueToExtUses->find(V);
                  if (It != ValueToExtUses->end()) {
                    ExternalUses[It->second].User = nullptr;
                  }
                }
              }
            }
          }
        }
      }
    }

    ExtractCost += ExtraCost;
  }
  // Insert externally used values from casts as new external uses of their
  // scalar operands.
  for (Value *V : ScalarOpsFromCasts) {
    ExternalUsesAsOriginalScalar.insert(V);
    if (ArrayRef<TreeEntry *> TEs = getTreeEntries(V); !TEs.empty())
      ExternalUses.emplace_back(V, nullptr, *TEs.front(),
                                TEs.front()->findLaneForValue(V));
  }
  // Add the reduced value cost, if resized.
  if (!VectorizedVals.empty()) {
    const TreeEntry &Root = *VectorizableTree.front();
    auto BWIt = MinBWs.find(&Root);
    if (BWIt != MinBWs.end()) {
      Type *DstTy = Root.Scalars.front()->getType();
      unsigned OriginalSz = DL->getTypeSizeInBits(DstTy->getScalarType());
      unsigned SrcSz =
          ReductionBitWidth == 0 ? BWIt->second.first : ReductionBitWidth;
      if (OriginalSz != SrcSz) {
        unsigned Opcode = Instruction::Trunc;
        if (OriginalSz > SrcSz)
          Opcode = BWIt->second.second ? Instruction::SExt : Instruction::ZExt;
        Type *SrcTy = IntegerType::get(DstTy->getContext(), SrcSz);
        // ...
        Cost += TTI->getCastInstrCost(Opcode, DstTy, SrcTy,
                                      TTI::CastContextHint::None, CostKind);
      }
    }
  }
  Cost += ExtractCost;
  auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE, ArrayRef<int> Mask,
                                    bool ForSingleMask) {
    InstructionCost C = 0;
    unsigned VF = Mask.size();
    unsigned VecVF = TE->getVectorFactor();
    bool HasLargeIndex =
        any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); });
    if ((VF != VecVF && HasLargeIndex) ||
        /*...*/) {
      if (HasLargeIndex) {
        SmallVector<int> OrigMask(VecVF, PoisonMaskElem);
        std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)),
                  OrigMask.begin());
        C = /*... shuffle cost of OrigMask ...*/;
        LLVM_DEBUG(
            dbgs() << "SLP: Adding cost " << C
                   << " for final shuffle of insertelement external users.\n";
            TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
        Cost += C;
        return std::make_pair(TE, true);
      }
      if (!ForSingleMask) {
        SmallVector<int> ResizeMask(VF, PoisonMaskElem);
        for (unsigned I = 0; I < VF; ++I) {
          if (Mask[I] != PoisonMaskElem)
            ResizeMask[Mask[I]] = Mask[I];
        }
        // ...
        LLVM_DEBUG(
            dbgs() << "SLP: Adding cost " << C
                   << " for final shuffle of insertelement external users.\n";
            TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
        // ...
      }
    }
    return std::make_pair(TE, false);
  };
  // Calculate the cost of the reshuffled vectors, if any.
  for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
    Value *Base = ShuffledInserts[I].InsertElements.front()->getOperand(0);
    auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
    unsigned VF = 0;
    auto EstimateShufflesCost = [&](ArrayRef<int> Mask,
                                    ArrayRef<const TreeEntry *> TEs) {
      assert((TEs.size() == 1 || TEs.size() == 2) &&
             "Expected exactly 1 or 2 tree entries.");
      if (TEs.size() == 1) {
        if (VF == 0)
          VF = TEs.front()->getVectorFactor();
        auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
        if (!all_of(enumerate(Mask), [=](const auto &Data) {
              return Data.value() == PoisonMaskElem ||
                     (Data.index() < VF &&
                      static_cast<int>(Data.index()) == Data.value());
            })) {
          InstructionCost C =
              ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, FTy, Mask);
          LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
                            << " for final shuffle of insertelement "
                               "external users.\n";
                     TEs.front()->dump();
                     dbgs() << "SLP: Current total cost = " << Cost << "\n");
          Cost += C;
        }
      } else {
        if (VF == 0) {
          if (TEs.front() &&
              TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor())
            VF = TEs.front()->getVectorFactor();
          else
            VF = Mask.size();
        }
        auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
        InstructionCost C =
            ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, FTy, Mask);
        LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
                          << " for final shuffle of vector node and external "
                             "insertelement users.\n";
                   if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump();
                   dbgs() << "SLP: Current total cost = " << Cost << "\n");
        Cost += C;
      }
      VF = Mask.size();
      return TEs.back();
    };
    (void)performExtractsShuffleAction<const TreeEntry>(
        MutableArrayRef(Vector.data(), Vector.size()), Base,
        [](const TreeEntry *E) { return E->getVectorFactor(); }, ResizeToVF,
        EstimateShufflesCost);
    InstructionCost InsertCost = TTI->getScalarizationOverhead(
        cast<FixedVectorType>(
            ShuffledInserts[I].InsertElements.front()->getType()),
        DemandedElts[I], /*Insert=*/true, /*Extract=*/false, CostKind);
    Cost -= InsertCost;
  }
  // Add the cost for the reduced value resize (if required).
  if (ReductionBitWidth != 0) {
    assert(UserIgnoreList && "Expected reduction tree.");
    const TreeEntry &E = *VectorizableTree.front();
    auto It = MinBWs.find(&E);
    if (It != MinBWs.end() && It->second.first != ReductionBitWidth) {
      unsigned SrcSize = It->second.first;
      unsigned DstSize = ReductionBitWidth;
      unsigned Opcode = Instruction::Trunc;
      if (SrcSize < DstSize) {
        bool IsArithmeticExtendedReduction =
            all_of(*UserIgnoreList, [](Value *V) {
              auto *I = cast<Instruction>(V);
              return is_contained({Instruction::Add, Instruction::FAdd,
                                   Instruction::Mul, Instruction::FMul,
                                   Instruction::And, Instruction::Or,
                                   /*...*/},
                                  I->getOpcode());
            });
        if (IsArithmeticExtendedReduction)
          Opcode =
              Instruction::BitCast; // Handle it by getting a cost for bitcast.
        else
          Opcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
      }
      if (Opcode != Instruction::BitCast) {
        auto *SrcVecTy =
            getWidenedType(Builder.getIntNTy(SrcSize), E.getVectorFactor());
        auto *DstVecTy =
            getWidenedType(Builder.getIntNTy(DstSize), E.getVectorFactor());
        TTI::CastContextHint CCH = TTI::CastContextHint::None;
        InstructionCost CastCost;
        switch (E.getOpcode()) {
        case Instruction::SExt:
        case Instruction::ZExt:
        case Instruction::Trunc: {
          const TreeEntry *OpTE = getOperandEntry(&E, 0);
          CCH = getCastContextHint(*OpTE);
          break;
        }
        default:
          break;
        }
        CastCost += TTI->getCastInstrCost(Opcode, DstVecTy, SrcVecTy, CCH,
                                          CostKind);
        Cost += CastCost;
        LLVM_DEBUG(dbgs() << "SLP: Adding cost " << CastCost
                          << " for final resize for reduction from " << SrcVecTy
                          << " to " << DstVecTy << "\n";
                   dbgs() << "SLP: Current total cost = " << Cost << "\n");
      }
    }
  }

  std::optional<InstructionCost> SpillCost;
  if (/*...*/) {
    SpillCost = getSpillCost();
    Cost += *SpillCost;
  }

#ifndef NDEBUG
  SmallString<256> Str;
  {
    raw_svector_ostream OS(Str);
    OS << "SLP: Spill Cost = ";
    if (SpillCost)
      OS << *SpillCost;
    else
      OS << "<skipped>";
    OS << ".\nSLP: Extract Cost = " << ExtractCost << ".\n"
       << "SLP: Total Cost = " << Cost << ".\n";
  }
  LLVM_DEBUG(dbgs() << Str);
  if (ViewSLPTree)
    ViewGraph(this, "SLP" + F->getName(), false, Str);
#endif

  return Cost;
}
/// Tries to find extractelement instructions with constant indices from a
/// fixed vector type and gather them into a bunch, which highly likely can be
/// detected as a shuffle of 1 or 2 input vectors.
std::optional<TTI::ShuffleKind>
BoUpSLP::tryToGatherSingleRegisterExtractElements(
    MutableArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) const {
  // Scan the list of gathered scalars for extractelements that can be
  // represented as shuffles.
  MapVector<Value *, SmallVector<int>> VectorOpToIdx;
  SmallVector<int> UndefVectorExtracts;
  for (int I = 0, E = VL.size(); I < E; ++I) {
    auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
    if (!EI)
      continue;
    // ...
    if (Idx >= VecTy->getNumElements()) {
      UndefVectorExtracts.push_back(I);
      continue;
    }
    SmallBitVector ExtractMask(VecTy->getNumElements(), true);
    ExtractMask.reset(*Idx);
    // ...
    VectorOpToIdx[EI->getVectorOperand()].push_back(I);
  }
  // Sort the vector operands by the maximum number of uses in
  // extractelements.
  SmallVector<std::pair<Value *, SmallVector<int>>> Vectors =
      VectorOpToIdx.takeVector();
  stable_sort(Vectors, [](const auto &P1, const auto &P2) {
    return P1.second.size() > P2.second.size();
  });
  // Find the best pair of the vectors, or a single vector.
  const int UndefSz = UndefVectorExtracts.size();
  unsigned SingleMax = 0;
  unsigned PairMax = 0;
  if (!Vectors.empty()) {
    SingleMax = Vectors.front().second.size() + UndefSz;
    if (Vectors.size() > 1) {
      auto *ItNext = std::next(Vectors.begin());
      PairMax = SingleMax + ItNext->second.size();
    }
  }
  if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
    return std::nullopt;
  // Check if it is better to perform a shuffle of 2 vectors or just of a
  // single vector.
  SmallVector<Value *> SavedVL(VL.begin(), VL.end());
  SmallVector<Value *> GatheredExtracts(
      VL.size(), PoisonValue::get(VL.front()->getType()));
  if (SingleMax >= PairMax && SingleMax) {
    for (int Idx : Vectors.front().second)
      std::swap(GatheredExtracts[Idx], VL[Idx]);
  } else if (!Vectors.empty()) {
    for (unsigned Idx : {0, 1})
      for (int Idx : Vectors[Idx].second)
        std::swap(GatheredExtracts[Idx], VL[Idx]);
  }
  // Add extracts from undefs too.
  for (int Idx : UndefVectorExtracts)
    std::swap(GatheredExtracts[Idx], VL[Idx]);
  // Check that the gather of extractelements can be represented as just a
  // shuffle of a single/two vectors the scalars are extracted from.
  std::optional<TTI::ShuffleKind> Res =
      /*... isFixedVectorShuffle(GatheredExtracts, Mask, ...) ...*/;
  if (!Res /*...*/) {
    // TODO: try to check other subsets if possible.
    // Restore the original VL if the attempt was not successful.
    copy(SavedVL, VL.begin());
    return std::nullopt;
  }
  // Restore unused scalars from the mask, if some of the extractelements
  // were not selected for the shuffle.
  for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) {
    // ...
  }
  return Res;
}

/// Same as above, but the masks are kept per vector-register-sized part.
SmallVector<std::optional<TTI::ShuffleKind>>
BoUpSLP::tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
                                    SmallVectorImpl<int> &Mask,
                                    unsigned NumParts) const {
  assert(NumParts > 0 && "NumParts expected be greater than or equal to 1.");
  SmallVector<std::optional<TTI::ShuffleKind>> ShufflesRes(NumParts);
  Mask.assign(VL.size(), PoisonMaskElem);
  unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
  for (unsigned Part : seq<unsigned>(NumParts)) {
    // Scan the list of gathered scalars for extractelements that can be
    // represented as shuffles.
    MutableArrayRef<Value *> SubVL = MutableArrayRef(VL).slice(
        Part * SliceSize, getNumElems(VL.size(), SliceSize, Part));
    SmallVector<int> SubMask;
    std::optional<TTI::ShuffleKind> Res =
        tryToGatherSingleRegisterExtractElements(SubVL, SubMask);
    ShufflesRes[Part] = Res;
    copy(SubMask, std::next(Mask.begin(), Part * SliceSize));
  }
  if (none_of(ShufflesRes, [](const std::optional<TTI::ShuffleKind> &Res) {
        return Res.has_value();
      }))
    ShufflesRes.clear();
  return ShufflesRes;
}
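// isGatherShuffledSingleRegisterEntry checks whether the scalars of one
// register-sized slice of a gather node already live in one or two
// vectorized tree entries, so the gather can be emitted as a (possibly
// two-source) shuffle of those vectors instead of a scalar buildvector.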
std::optional<TargetTransformInfo::ShuffleKind>
BoUpSLP::isGatherShuffledSingleRegisterEntry(
    const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
    SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part,
    bool ForOrder) {
  Entries.clear();
  auto GetUserEntry = [&](const TreeEntry *TE) {
    while (TE->UserTreeIndex && TE->UserTreeIndex.EdgeIdx == UINT_MAX)
      TE = TE->UserTreeIndex.UserTE;
    if (TE == VectorizableTree.front().get())
      return EdgeInfo(const_cast<TreeEntry *>(TE), 0);
    return TE->UserTreeIndex;
  };
  auto HasGatherUser = [&](const TreeEntry *TE) {
    while (TE->Idx != 0 && TE->UserTreeIndex) {
      if (TE->UserTreeIndex.EdgeIdx == UINT_MAX)
        return true;
      TE = TE->UserTreeIndex.UserTE;
    }
    return false;
  };
  const EdgeInfo TEUseEI = GetUserEntry(TE);
  if (!TEUseEI)
    return std::nullopt;
  const Instruction *TEInsertPt = &getLastInstructionInBundle(TEUseEI.UserTE);
  const BasicBlock *TEInsertBlock = nullptr;
  // The main op of PHI entries keeps the correct order of operands/incoming
  // blocks.
  if (auto *PHI = dyn_cast_or_null<PHINode>(
          TEUseEI.UserTE->hasState() ? TEUseEI.UserTE->getMainOp() : nullptr);
      PHI && TEUseEI.UserTE->State != TreeEntry::SplitVectorize) {
    TEInsertBlock = PHI->getIncomingBlock(TEUseEI.EdgeIdx);
    TEInsertPt = TEInsertBlock->getTerminator();
  } else {
    TEInsertBlock = TEInsertPt->getParent();
  }
  if (!DT->isReachableFromEntry(TEInsertBlock))
    return std::nullopt;
  auto *NodeUI = DT->getNode(TEInsertBlock);
  assert(NodeUI && "Should only process reachable instructions");
  SmallPtrSet<Value *, 4> GatheredScalars(VL.begin(), VL.end());
  auto CheckOrdering = [&](const Instruction *InsertPt) {
    // Returns true if the insertion point of the vector code for TE dominates
    // InsertPt (where the candidate source entry materializes), making the
    // extract from it valid.
    // ...
    const BasicBlock *InsertBlock = InsertPt->getParent();
    auto *NodeEUI = DT->getNode(InsertBlock);
    if (!NodeEUI)
      return false;
    assert((NodeUI == NodeEUI) ==
               (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) &&
           "Different nodes should have different DFS numbers");
    // Check the order of the gather nodes' users.
    if (TEInsertPt->getParent() != InsertBlock &&
        (DT->dominates(NodeUI, NodeEUI) || !DT->dominates(NodeEUI, NodeUI)))
      return false;
    if (TEInsertPt->getParent() == InsertBlock &&
        /*...*/)
      return false;
    return true;
  };
  // Find all tree entries used by the gathered values. If no common entries
  // are found, this is not profitable.
  SmallVector<SmallPtrSet<const TreeEntry *, 4>> UsedTEs;
  SmallDenseMap<Value *, int> UsedValuesEntry;
  SmallPtrSet<const Value *, 16> VisitedValue;
  auto CheckAndUseSameNode = [&](const TreeEntry *TEPtr) {
    // The node is reused - exit.
    if ((TEPtr->getVectorFactor() != VL.size() &&
         TEPtr->Scalars.size() != VL.size()) ||
        (!TEPtr->isSame(VL) && !TEPtr->isSame(TE->Scalars)))
      return false;
    // ...
    for (Value *V : VL) {
      // ...
    }
    return true;
  };
  auto CheckParentNodes = [&](const TreeEntry *User1, const TreeEntry *User2,
                              unsigned EdgeIdx) {
    const TreeEntry *Ptr1 = User1;
    const TreeEntry *Ptr2 = User2;
    SmallDenseMap<const TreeEntry *, unsigned> PtrToIdx;
    while (Ptr2) {
      PtrToIdx.try_emplace(Ptr2, EdgeIdx);
      EdgeIdx = Ptr2->UserTreeIndex.EdgeIdx;
      Ptr2 = Ptr2->UserTreeIndex.UserTE;
    }
    while (Ptr1) {
      unsigned Idx = Ptr1->UserTreeIndex.EdgeIdx;
      Ptr1 = Ptr1->UserTreeIndex.UserTE;
      if (auto It = PtrToIdx.find(Ptr1); It != PtrToIdx.end())
        return Idx < It->second;
    }
    return false;
  };
  auto CheckNonSchedulableOrdering = [&](const TreeEntry *E,
                                         const Instruction *InsertPt) {
    return TEUseEI && TEUseEI.UserTE &&
           TEUseEI.UserTE->hasCopyableElements() &&
           !TEUseEI.UserTE->isCopyableElement(/*...*/) &&
           /*...*/
           InsertPt->getNextNode() == TEInsertPt &&
           (!E->hasCopyableElements() || !E->isCopyableElement(InsertPt) ||
            /*...*/);
  };
  for (Value *V : VL) {
    if (isConstant(V) || !VisitedValue.insert(V).second)
      continue;
    // Build a list of tree entries where V is used.
    SmallPtrSet<const TreeEntry *, 4> VToTEs;
    for (const TreeEntry *TEPtr : ValueToGatherNodes.lookup(V)) {
      if (TEPtr == TE || TEPtr->Idx == 0)
        continue;
      assert(any_of(TEPtr->Scalars,
                    [&](Value *V) { return GatheredScalars.contains(V); }) &&
             "Must contain at least single gathered value.");
      assert(TEPtr->UserTreeIndex &&
             "Expected only single user of a gather node.");
      const EdgeInfo &UseEI = TEPtr->UserTreeIndex;

      PHINode *UserPHI = (UseEI.UserTE->State != TreeEntry::SplitVectorize &&
                          UseEI.UserTE->hasState())
                             ? dyn_cast<PHINode>(UseEI.UserTE->getMainOp())
                             : nullptr;
      const Instruction *InsertPt =
          UserPHI ? UserPHI->getIncomingBlock(UseEI.EdgeIdx)->getTerminator()
                  : &getLastInstructionInBundle(UseEI.UserTE);
      if (TEInsertPt == InsertPt) {
        // If the schedulable insertion point is used by multiple entries -
        // just exit, no known ordering at this point, available only after
        // real scheduling.
        if (TEUseEI.UserTE->State == TreeEntry::Vectorize &&
            (TEUseEI.UserTE->getOpcode() != Instruction::PHI ||
             TEUseEI.UserTE->isAltShuffle()) &&
            /*...*/) {
          if (UseEI.UserTE->State != TreeEntry::Vectorize ||
              (UseEI.UserTE->hasState() &&
               UseEI.UserTE->getOpcode() == Instruction::PHI &&
               !UseEI.UserTE->isAltShuffle()) ||
              /*...*/)
            continue;
        }
        // If the users are the PHI nodes with the same incoming blocks - skip.
        if (/*...*/
            (TEUseEI.UserTE != UseEI.UserTE || TEUseEI.EdgeIdx < UseEI.EdgeIdx))
          continue;
        if (TEUseEI.UserTE->State == TreeEntry::Vectorize &&
            TEUseEI.UserTE->getOpcode() == Instruction::PHI &&
            UseEI.UserTE->State == TreeEntry::Vectorize &&
            UseEI.UserTE->getOpcode() == Instruction::PHI &&
            TEUseEI.UserTE != UseEI.UserTE)
          continue;
        // If 2 gathers are operands of the same entry, compare operand
        // indices and use the earlier one as the base.
        if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx)
          continue;
        // If the user instruction is used in several vectorized nodes - make
        // it depend on the index.
        if (TEUseEI.UserTE != UseEI.UserTE &&
            (TEUseEI.UserTE->Idx < UseEI.UserTE->Idx ||
             HasGatherUser(TEUseEI.UserTE)))
          continue;
        if (CheckParentNodes(TEUseEI.UserTE, UseEI.UserTE, UseEI.EdgeIdx))
          continue;
      }

      // Check if the user node of TE comes after the user node of TEPtr;
      // otherwise TEPtr depends on TE.
      if (!TEUseEI.UserTE->isGather() && !UserPHI &&
          TEUseEI.UserTE->doesNotNeedToSchedule() !=
              UseEI.UserTE->doesNotNeedToSchedule() &&
          /*...*/)
        continue;
      if ((TEInsertBlock != InsertPt->getParent() ||
           TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) &&
          (!CheckOrdering(InsertPt) ||
           (UseEI.UserTE->hasCopyableElements() &&
            /*...*/)))
        continue;
      // The node is reused - exit.
      if (CheckAndUseSameNode(TEPtr))
        return std::nullopt;
      if (CheckNonSchedulableOrdering(UseEI.UserTE, InsertPt))
        continue;
      VToTEs.insert(TEPtr);
    }
    if (ArrayRef<TreeEntry *> VTEs = getTreeEntries(V); !VTEs.empty()) {
      const auto *It = find_if(
          VTEs, [&](const TreeEntry *MTE) { return MTE != TEUseEI.UserTE; });
      if (It != VTEs.end()) {
        const TreeEntry *VTE = *It;
        if (none_of(TE->CombinedEntriesWithIndices,
                    [&](const auto &P) { return P.first == VTE->Idx; })) {
          Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
          if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
            continue;
        }
        // The node is reused - exit.
        if (CheckAndUseSameNode(VTE))
          return std::nullopt;
        VToTEs.insert(VTE);
      }
    }
    if (ArrayRef<TreeEntry *> VTEs = /*...*/; !VTEs.empty()) {
      const TreeEntry *VTE = VTEs.front();
      if (ForOrder && VTE->Idx < GatheredLoadsEntriesFirst.value_or(0) &&
          VTEs.size() > 1 && VTE->State != TreeEntry::Vectorize) {
        VTEs = VTEs.drop_front();
        // Iterate through all vectorized nodes.
        const auto *MIt = find_if(VTEs, [](const TreeEntry *MTE) {
          return MTE->State == TreeEntry::Vectorize;
        });
        if (MIt == VTEs.end())
          continue;
        VTE = *MIt;
      }
      if (none_of(TE->CombinedEntriesWithIndices,
                  [&](const auto &P) { return P.first == VTE->Idx; })) {
        Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
        if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst) ||
            CheckNonSchedulableOrdering(VTE, &LastBundleInst))
          continue;
      }
      // The node is reused - exit.
      if (CheckAndUseSameNode(VTE))
        return std::nullopt;
      VToTEs.insert(VTE);
    }
    if (VToTEs.empty())
      continue;
    if (UsedTEs.empty()) {
      // The first iteration, just insert the list of nodes to the vector.
      UsedTEs.push_back(VToTEs);
      UsedValuesEntry.try_emplace(V, 0);
    } else {
      // Need to check if there are any previously used tree nodes which use
      // V; if not, consider it another input vector.
      SmallPtrSet<const TreeEntry *, 4> SavedVToTEs(VToTEs);
      unsigned Idx = 0;
      for (SmallPtrSet<const TreeEntry *, 4> &Set : UsedTEs) {
        // Do we have a non-empty intersection of entries with V in it?
        set_intersect(VToTEs, Set);
        if (!VToTEs.empty()) {
          // Yes, write the new subset and continue with the next scalar.
          Set.swap(VToTEs);
          break;
        }
        VToTEs = SavedVToTEs;
        ++Idx;
      }
      // No non-empty intersection found - need a second set of possible
      // source vectors.
      if (Idx == UsedTEs.size()) {
        // More than 2 input vectors is not a permutation - fall back to a
        // regular gather.
        if (UsedTEs.size() == 2)
          continue;
        UsedTEs.push_back(SavedVToTEs);
        Idx = UsedTEs.size() - 1;
      }
      UsedValuesEntry.try_emplace(V, Idx);
    }
  }

  if (UsedTEs.empty()) {
    Entries.clear();
    return std::nullopt;
  }

  unsigned VF = 0;
  if (UsedTEs.size() == 1) {
    // Keep the order to avoid non-determinism.
    SmallVector<const TreeEntry *> FirstEntries(UsedTEs.front().begin(),
                                                UsedTEs.front().end());
    sort(FirstEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
      return TE1->Idx < TE2->Idx;
    });
    // Try to find the perfect match in another gather node first.
    auto *It = find_if(FirstEntries, [=](const TreeEntry *EntryPtr) {
      return EntryPtr->isSame(VL) || EntryPtr->isSame(TE->Scalars);
    });
    if (It != FirstEntries.end() &&
        ((*It)->getVectorFactor() == VL.size() ||
         ((*It)->getVectorFactor() == TE->Scalars.size() &&
          TE->ReuseShuffleIndices.size() == VL.size() &&
          (*It)->isSame(TE->Scalars)))) {
      Entries.push_back(*It);
      if ((*It)->getVectorFactor() == VL.size()) {
        std::iota(std::next(Mask.begin(), Part * VL.size()),
                  std::next(Mask.begin(), (Part + 1) * VL.size()), 0);
      } else {
        SmallVector<int> CommonMask = TE->getCommonMask();
        copy(CommonMask, Mask.begin());
      }
      // ...
      return TargetTransformInfo::SK_PermuteSingleSrc;
    }
    // No perfect match, just shuffle, so choose the first tree node.
    Entries.push_back(FirstEntries.front());
    // Update the mapping between values and the corresponding tree entries.
    for (auto &P : UsedValuesEntry)
      P.second = 0;
    VF = FirstEntries.front()->getVectorFactor();
  } else {
    // Try to find nodes with the same vector factor.
    assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries.");
    // ...
    DenseMap<int, const TreeEntry *> VFToTE;
    for (const TreeEntry *TE : UsedTEs.front()) {
      unsigned VF = TE->getVectorFactor();
      auto It = VFToTE.find(VF);
      if (It != VFToTE.end()) {
        if (It->second->Idx > TE->Idx)
          It->getSecond() = TE;
        continue;
      }
      VFToTE.try_emplace(VF, TE);
    }
    // Same, keep the order to avoid non-determinism.
    SmallVector<const TreeEntry *> SecondEntries(UsedTEs.back().begin(),
                                                 UsedTEs.back().end());
    sort(SecondEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
      return TE1->Idx < TE2->Idx;
    });
    for (const TreeEntry *TE : SecondEntries) {
      auto It = VFToTE.find(TE->getVectorFactor());
      if (It != VFToTE.end()) {
        VF = It->first;
        Entries.push_back(It->second);
        Entries.push_back(TE);
        break;
      }
    }
    // No 2 source vectors with the same vector factor - just choose 2 with
    // the max index.
    if (Entries.empty()) {
      Entries.push_back(/*...*/);
      Entries.push_back(SecondEntries.front());
      VF = std::max(Entries.front()->getVectorFactor(),
                    Entries.back()->getVectorFactor());
    } else {
      VF = Entries.front()->getVectorFactor();
    }
    // Update the mapping between values and the corresponding tree entries.
    SmallVector<SmallPtrSet<Value *, 4>> ValuesToEntries(Entries.size());
    for (const TreeEntry *E : Entries)
      ValuesToEntries[/*...*/].insert(E->Scalars.begin(), E->Scalars.end());
    for (auto &P : UsedValuesEntry) {
      unsigned Idx = 0;
      if (ValuesToEntries[Idx].contains(P.first)) {
        // ...
      }
      // ...
    }
  }

  // Checks if the 2 PHIs are compatible in terms of high possibility to be
  // vectorized.
  auto AreCompatiblePHIs = [&](Value *V, Value *V1) {
    auto *PHI = cast<PHINode>(V);
    auto *PHI1 = cast<PHINode>(V1);
    // Check that all incoming values are compatible/from the same parent (if
    // they are instructions).
    for (int I = 0, E = PHI->getNumIncomingValues(); I < E; ++I) {
      Value *In1 = PHI1->getIncomingValue(I);
      // ...
    }
    return true;
  };
  // Check if a value can be ignored during analysis of shuffled gathers:
  // prefer to ignore instructions that do not form splats and are not
  // vectorized/not extractelements, or that may form a vector node later.
  auto MightBeIgnored = [=](Value *V) {
    auto *I = dyn_cast<Instruction>(V);
    return I && /*...*/
           !areAllUsersVectorized(I, UserIgnoreList) && isSimple(I);
  };
  // Check if a neighbor instruction may form a full vector node with V: same
  // or alternate opcode and same parent basic block.
  auto NeighborMightBeIgnored = [&](Value *V, int Idx) {
    Value *V1 = VL[Idx];
    bool UsedInSameVTE = false;
    auto It = UsedValuesEntry.find(V1);
    if (It != UsedValuesEntry.end())
      UsedInSameVTE = It->second == UsedValuesEntry.find(V)->second;
    return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE &&
           /*...*/;
  };
  // Build a shuffle mask for better cost estimation and vector emission.
  SmallBitVector UsedIdxs(Entries.size());
  SmallVector<std::pair<unsigned, int>> EntryLanes;
  for (int I = 0, E = VL.size(); I < E; ++I) {
    Value *V = VL[I];
    auto It = UsedValuesEntry.find(V);
    if (It == UsedValuesEntry.end())
      continue;
    // Do not try to shuffle scalars that are constants, or instructions that
    // could be vectorized as part of a later buildvector.
    if (isConstant(V) ||
        (MightBeIgnored(V) &&
         ((I > 0 && NeighborMightBeIgnored(V, I - 1)) ||
          (I != E - 1 && NeighborMightBeIgnored(V, I + 1)))))
      continue;
    unsigned Idx = It->second;
    EntryLanes.emplace_back(Idx, I);
    UsedIdxs.set(Idx);
  }
  // Iterate through all shuffled scalars and select the entries that can be
  // used for the final shuffle.
  SmallVector<const TreeEntry *> TempEntries;
  for (unsigned I = 0, Sz = Entries.size(); I < Sz; ++I) {
    if (!UsedIdxs.test(I))
      continue;
    // Fix the entry number for the given scalar: first entry gets 0,
    // otherwise 1 (currently at most 2 nodes are selected).
    for (std::pair<unsigned, int> &Pair : EntryLanes)
      if (Pair.first == I)
        Pair.first = TempEntries.size();
    TempEntries.push_back(Entries[I]);
  }
  Entries.swap(TempEntries);
  if (EntryLanes.size() == Entries.size() &&
      !VL.equals(ArrayRef(TE->Scalars)
                     .slice(Part * VL.size(),
                            std::min<int>(VL.size(), TE->Scalars.size())))) {
    // The number of scalars equals the number of entries, so the analysis is
    // not very profitable; since VL is not the same as TE->Scalars, some
    // shuffles exist already - cut off the unprofitable case.
    Entries.clear();
    return std::nullopt;
  }
  // Build the final mask and check for the identity shuffle, if possible.
  bool IsIdentity = Entries.size() == 1;
  // Pair.first is the offset of the vector, Pair.second the index of the
  // scalar in the list.
  for (const std::pair<unsigned, int> &Pair : EntryLanes) {
    unsigned Idx = Part * VL.size() + Pair.second;
    Mask[Idx] =
        Pair.first * VF +
        (ForOrder ? std::distance(
                        Entries[Pair.first]->Scalars.begin(),
                        find(Entries[Pair.first]->Scalars, VL[Pair.second]))
                  : Entries[Pair.first]->findLaneForValue(VL[Pair.second]));
    IsIdentity &= Mask[Idx] == Pair.second;
  }
  if (ForOrder || IsIdentity || Entries.empty()) {
    switch (Entries.size()) {
    case 1:
      if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
        return TargetTransformInfo::SK_PermuteSingleSrc;
      break;
    case 2:
      if (EntryLanes.size() > 2 || VL.size() <= 2)
        return TargetTransformInfo::SK_PermuteTwoSrc;
      break;
    default:
      break;
    }
  } else if (!isa<VectorType>(VL.front()->getType()) &&
             (EntryLanes.size() > Entries.size() || VL.size() <= 2)) {
    // Do the cost estimation if a shuffle is more beneficial than a
    // buildvector.
    SmallVector<int> SubMask(std::next(Mask.begin(), Part * VL.size()),
                             std::next(Mask.begin(), (Part + 1) * VL.size()));
    int MinElement = SubMask.front(), MaxElement = SubMask.front();
    for (int Idx : SubMask) {
      if (Idx == PoisonMaskElem)
        continue;
      // ...
    }
    assert(MaxElement >= 0 && MinElement >= 0 &&
           MaxElement % VF >= MinElement % VF &&
           "Expected at least single element.");
    unsigned NewVF = std::max<unsigned>(
        VL.size(), /*...*/ ((MaxElement % VF) -
                            (MinElement % VF) + 1));
    if (NewVF < VF) {
      for (int &Idx : SubMask) {
        if (Idx == PoisonMaskElem)
          continue;
        Idx = ((Idx % VF) - (((MinElement % VF) / NewVF) * NewVF)) % NewVF +
              (Idx >= static_cast<int>(VF) ? NewVF : 0);
      }
    } else {
      NewVF = VF;
    }

    auto *VecTy = getWidenedType(VL.front()->getType(), NewVF);
    auto *MaskVecTy = getWidenedType(VL.front()->getType(), SubMask.size());
    auto GetShuffleCost = [&,
                           &TTI = *TTI](ArrayRef<int> Mask,
                                        ArrayRef<const TreeEntry *> Entries,
                                        VectorType *VecTy) -> InstructionCost {
      if (Entries.size() == 1 && Entries.front()->getInterleaveFactor() > 0 &&
          ShuffleVectorInst::isDeInterleaveMaskOfFactor(
              Mask, Entries.front()->getInterleaveFactor()))
        return TTI::TCC_Free;
      return ::getShuffleCost(TTI,
                              Entries.size() > 1 ? TTI::SK_PermuteTwoSrc
                                                 : TTI::SK_PermuteSingleSrc,
                              VecTy, Mask, CostKind);
    };
    InstructionCost ShuffleCost = GetShuffleCost(SubMask, Entries, VecTy);
    InstructionCost FirstShuffleCost = 0;
    SmallVector<int> FirstMask(SubMask.begin(), SubMask.end());
    if (Entries.size() == 1 || !Entries[0]->isGather()) {
      FirstShuffleCost = ShuffleCost;
    } else {
      // Transform the mask to include only the first entry.
      APInt DemandedElts = APInt::getAllOnes(SubMask.size());
      bool IsIdentity = true;
      for (auto [I, Idx] : enumerate(FirstMask)) {
        if (Idx >= static_cast<int>(NewVF)) {
          Idx = PoisonMaskElem;
        } else {
          DemandedElts.clearBit(I);
          if (Idx != PoisonMaskElem)
            IsIdentity &= static_cast<int>(I) == Idx;
        }
      }
      if (!IsIdentity)
        FirstShuffleCost = GetShuffleCost(FirstMask, Entries.front(), VecTy);
      FirstShuffleCost += getScalarizationOverhead(
          *TTI, VL.front()->getType(), MaskVecTy, DemandedElts, true,
          /*Extract=*/false, CostKind);
    }
    InstructionCost SecondShuffleCost = 0;
    SmallVector<int> SecondMask(SubMask.begin(), SubMask.end());
    if (Entries.size() == 1 || !Entries[1]->isGather()) {
      SecondShuffleCost = ShuffleCost;
    } else {
      // Transform the mask to include only the second entry.
      APInt DemandedElts = APInt::getAllOnes(SubMask.size());
      bool IsIdentity = true;
      for (auto [I, Idx] : enumerate(SecondMask)) {
        if (Idx < static_cast<int>(NewVF) && Idx >= 0) {
          Idx = PoisonMaskElem;
        } else {
          DemandedElts.clearBit(I);
          if (Idx != PoisonMaskElem) {
            Idx -= NewVF;
            IsIdentity &= static_cast<int>(I) == Idx;
          }
        }
      }
      if (!IsIdentity)
        SecondShuffleCost = GetShuffleCost(SecondMask, Entries[1], VecTy);
      SecondShuffleCost += getScalarizationOverhead(
          *TTI, VL.front()->getType(), MaskVecTy, DemandedElts, true,
          /*Extract=*/false, CostKind);
    }
    // ...
    InstructionCost BuildVectorCost = getScalarizationOverhead(
        *TTI, VL.front()->getType(), MaskVecTy, DemandedElts, true,
        /*Extract=*/false, CostKind);
    const TreeEntry *BestEntry = nullptr;
    if (FirstShuffleCost < ShuffleCost) {
      std::for_each(std::next(Mask.begin(), Part * VL.size()),
                    std::next(Mask.begin(), (Part + 1) * VL.size()),
                    [&](int &Idx) {
                      if (Idx >= static_cast<int>(VF))
                        Idx = PoisonMaskElem;
                    });
      BestEntry = Entries.front();
      ShuffleCost = FirstShuffleCost;
    }
    if (SecondShuffleCost < ShuffleCost) {
      std::for_each(std::next(Mask.begin(), Part * VL.size()),
                    std::next(Mask.begin(), (Part + 1) * VL.size()),
                    [&](int &Idx) {
                      if (Idx < static_cast<int>(VF))
                        Idx = PoisonMaskElem;
                      // ...
                    });
      BestEntry = Entries[1];
      ShuffleCost = SecondShuffleCost;
    }
    if (BuildVectorCost >= ShuffleCost) {
      if (BestEntry) {
        Entries.clear();
        Entries.push_back(BestEntry);
      }
      return Entries.size() > 1 ? TargetTransformInfo::SK_PermuteTwoSrc
                                : TargetTransformInfo::SK_PermuteSingleSrc;
    }
  }
  Entries.clear();
  // Clear the corresponding mask elements.
  std::fill(std::next(Mask.begin(), Part * VL.size()),
            std::next(Mask.begin(), (Part + 1) * VL.size()), PoisonMaskElem);
  return std::nullopt;
}
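// Per-register driver: the gather node is split into NumParts
// register-sized slices and each slice is matched independently; a perfect
// whole-node match short-circuits to a single identity shuffle.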
SmallVector<std::optional<TTI::ShuffleKind>>
BoUpSLP::isGatherShuffledEntry(
    const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
    SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries,
    unsigned NumParts, bool ForOrder) {
  assert(NumParts > 0 && NumParts < VL.size() &&
         "Expected positive number of registers.");
  // ...
  // No need to check for the topmost gather node.
  if (TE == VectorizableTree.front().get() &&
      (!GatheredLoadsEntriesFirst.has_value() ||
       none_of(/* ... */,
               [](const std::unique_ptr<TreeEntry> &TE) {
                 return !TE->isGather();
               })))
    return {};
  // ...
  if (TE->hasNonWholeRegisterOrNonPowerOf2Vec(*TTI))
    return {};
  // ...
  assert((TE->UserTreeIndex || TE == VectorizableTree.front().get()) &&
         "Expected only single user of the gather node.");
  assert(VL.size() % NumParts == 0 &&
         "Number of scalars must be divisible by NumParts.");
  if (TE->UserTreeIndex && TE->UserTreeIndex.UserTE->isGather() &&
      TE->UserTreeIndex.EdgeIdx == UINT_MAX &&
      /* ... */
      ((TE->hasState() && TE->getOpcode() == Instruction::ExtractElement) ||
       /* ... */
       getSameValuesTreeEntry(TE->getMainOp(), TE->Scalars)))
    return {};
  // ...
  for (/* each register part */) {
    // ...
    SmallVectorImpl<const TreeEntry *> &SubEntries = Entries.emplace_back();
    std::optional<TTI::ShuffleKind> SubRes =
        isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part,
                                            ForOrder);
    if (!SubRes)
      SubEntries.clear();
    // ...
    // Perfect whole-vector match: reuse the single matched entry directly.
    if (SubEntries.size() == 1 &&
        SubEntries.front()->getVectorFactor() == VL.size() &&
        (SubEntries.front()->isSame(TE->Scalars) ||
         SubEntries.front()->isSame(VL))) {
      SmallVector<const TreeEntry *> LocalSubEntries;
      LocalSubEntries.swap(SubEntries);
      Entries.clear();
      // ...
      std::iota(Mask.begin(), Mask.end(), 0);
      // Poison the undef lanes.
      for (int I = 0, Sz = VL.size(); I < Sz; ++I)
        // ...
      Entries.emplace_back(1, LocalSubEntries.front());
      // ...
    }
  }
  if (all_of(/* ... */,
             [](const std::optional<TTI::ShuffleKind> &SK) { return !SK; })) {
    Entries.clear();
    return {};
  }
  // ...
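// Computes the cost of building a vector out of the scalars in VL: per-lane
// insertion costs for the non-constant lanes (plus a truncation cast when a
// scalar is wider than ScalarTy) and, when some lanes are non-undef
// constants, one extra shuffle that blends in a constant vector.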
InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
                                       Type *ScalarTy) const {
  const unsigned VF = VL.size();
  // ...
  auto EstimateInsertCost = [&](unsigned I, Value *V) {
    // Account for the extra truncation if the scalar is wider than ScalarTy.
    if (V->getType() != ScalarTy)
      Cost += TTI->getCastInstrCost(Instruction::Trunc, ScalarTy,
                                    V->getType(), /* ... */);
    // ...
  };
  SmallVector<int> ConstantShuffleMask(VF);
  std::iota(ConstantShuffleMask.begin(), ConstantShuffleMask.end(), 0);
  for (/* each lane I, value V */) {
    // Constant lanes are taken from the second (constant) shuffle source.
    if (isConstant(V))
      ConstantShuffleMask[I] = I + VF;
    else
      EstimateInsertCost(I, V);
  }
  // ...
  bool IsAnyNonUndefConst =
      /* ... */;
  if (!ForPoisonSrc && IsAnyNonUndefConst) {
    Cost += /* shuffle with the constant vector, mask = */ (
        ConstantShuffleMask);
    // ...
  }
  if (!DemandedElements.isZero())
    Cost += /* scalarization overhead */ (
        /* ... */, ForPoisonSrc && !IsAnyNonUndefConst, VL);
  // ...
  return Cost;
}
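// Returns the instruction after which the vectorized code for tree entry E
// has to be emitted. The result is memoized in EntryToLastInstruction, since
// the same query is repeated for every user of the entry.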
Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
  auto It = EntryToLastInstruction.find(E);
  if (It != EntryToLastInstruction.end())
    return *It->second;
  // ...
  Instruction *Front;
  unsigned Opcode;
  if (E->hasState()) {
    Front = E->getMainOp();
    Opcode = E->getOpcode();
  } else {
    // ...
  }
  auto *BB = Front->getParent();
  assert(((GatheredLoadsEntriesFirst.has_value() &&
           Opcode == Instruction::Load && E->isGather() &&
           E->Idx < *GatheredLoadsEntriesFirst) ||
          E->State == TreeEntry::SplitVectorize ||
          E->hasCopyableElements() ||
          all_of(E->Scalars,
                 [=](Value *V) -> bool {
                   if (Opcode == Instruction::GetElementPtr &&
                       !isa<GetElementPtrInst>(V))
                     return true;
                   auto *I = dyn_cast<Instruction>(V);
                   return !I || !E->getMatchingMainOpOrAltOp(I) ||
                          I->getParent() == BB ||
                          isVectorLikeInstWithConstOps(I);
                 })) &&
         "Expected gathered loads or GEPs or instructions from same basic "
         "block.");

  auto FindLastInst = [&]() {
    Instruction *LastInst = Front;
    for (Value *V : E->Scalars) {
      auto *I = dyn_cast<Instruction>(V);
      if (!I)
        continue;
      if (E->isCopyableElement(I))
        continue;
      if (LastInst->getParent() == I->getParent()) {
        if (LastInst->comesBefore(I))
          LastInst = I;
        continue;
      }
      assert(((Opcode == Instruction::GetElementPtr &&
               /* ... */) ||
              E->State == TreeEntry::SplitVectorize ||
              /* ... */
              (GatheredLoadsEntriesFirst.has_value() &&
               Opcode == Instruction::Load && E->isGather() &&
               E->Idx < *GatheredLoadsEntriesFirst)) &&
             "Expected vector-like or non-GEP in GEP node insts only.");
      if (!DT->isReachableFromEntry(LastInst->getParent())) {
        LastInst = I;
        continue;
      }
      if (!DT->isReachableFromEntry(I->getParent()))
        continue;
      auto *NodeA = DT->getNode(LastInst->getParent());
      auto *NodeB = DT->getNode(I->getParent());
      assert(NodeA && "Should only process reachable instructions");
      assert(NodeB && "Should only process reachable instructions");
      assert((NodeA == NodeB) ==
                 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
             "Different nodes should have different DFS numbers");
      if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn())
        LastInst = I;
    }
    return LastInst;
  };

  auto FindFirstInst = [&]() {
    Instruction *FirstInst = Front;
    for (Value *V : E->Scalars) {
      auto *I = dyn_cast<Instruction>(V);
      if (!I)
        continue;
      if (E->isCopyableElement(I))
        continue;
      if (FirstInst->getParent() == I->getParent()) {
        if (I->comesBefore(FirstInst))
          FirstInst = I;
        continue;
      }
      assert(((Opcode == Instruction::GetElementPtr &&
               /* ... */) ||
              /* ... */) &&
             "Expected vector-like or non-GEP in GEP node insts only.");
      if (!DT->isReachableFromEntry(FirstInst->getParent())) {
        FirstInst = I;
        continue;
      }
      if (!DT->isReachableFromEntry(I->getParent()))
        continue;
      auto *NodeA = DT->getNode(FirstInst->getParent());
      auto *NodeB = DT->getNode(I->getParent());
      assert(NodeA && "Should only process reachable instructions");
      assert(NodeB && "Should only process reachable instructions");
      assert((NodeA == NodeB) ==
                 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
             "Different nodes should have different DFS numbers");
      if (NodeA->getDFSNumIn() > NodeB->getDFSNumIn())
        FirstInst = I;
    }
    return FirstInst;
  };

  Instruction *Res = nullptr;
  if (E->State == TreeEntry::SplitVectorize) {
    Res = FindLastInst();
    // Take the later of the combined entries' last instructions.
    for (auto *E : Entries) {
      // ...
      I = &getLastInstructionInBundle(E);
      // ...
    }
    EntryToLastInstruction.try_emplace(E, Res);
    return *Res;
  }

  // Gathered loads are emitted at the very beginning of their region.
  if (GatheredLoadsEntriesFirst.has_value() &&
      E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
      Opcode == Instruction::Load) {
    Res = FindFirstInst();
    EntryToLastInstruction.try_emplace(E, Res);
    return *Res;
  }

  auto FindScheduleBundle =
      [&](const TreeEntry *E) -> const ScheduleBundle * {
    // ...
    const auto *It = BlocksSchedules.find(BB);
    if (It == BlocksSchedules.end())
      return nullptr;
    for (Value *V : E->Scalars) {
      // ...
      if (Bundles.empty())
        continue;
      const auto *It = find_if(
          Bundles, [&](ScheduleBundle *B) { return B->getTreeEntry() == E; });
      if (It != Bundles.end())
        return *It;
    }
    return nullptr;
  };
  const ScheduleBundle *Bundle = FindScheduleBundle(E);
  if (!E->isGather() && !Bundle) {
    if ((Opcode == Instruction::GetElementPtr &&
         any_of(E->Scalars,
                [](Value *V) {
                  return !isa<GetElementPtrInst>(V) && isa<Instruction>(V);
                })) ||
        all_of(E->Scalars,
               [&](Value *V) {
                 return isa<PoisonValue>(V) ||
                        (E->Idx == 0 && isa<InsertElementInst>(V)) ||
                        E->isCopyableElement(V) ||
                        (!isVectorLikeInstWithConstOps(V) &&
                         isUsedOutsideBlock(V));
               }) ||
        (!E->doesNotNeedToSchedule() ||
         /* ... */
         all_of(E->Scalars, [&](Value *V) {
           if (!isa<Instruction>(V) ||
               (E->hasCopyableElements() && E->isCopyableElement(V)))
             return true;
           return !areAllOperandsNonInsts(V);
         }) ||
         all_of(E->Scalars, [&](Value *V) {
           if (!isa<Instruction>(V) ||
               (E->hasCopyableElements() && E->isCopyableElement(V)))
             return true;
           return MustGather.contains(V);
         })))
      Res = FindLastInst();
    else
      Res = FindFirstInst();
    EntryToLastInstruction.try_emplace(E, Res);
    return *Res;
  }

  // The scheduler knows the last instruction of a scheduled bundle.
  if (Bundle) {
    assert(!E->isGather() && "Gathered instructions should not be scheduled");
    Res = Bundle->getBundle().back()->getInst();
    EntryToLastInstruction.try_emplace(E, Res);
    return *Res;
  }
  // ...
  Res = FindLastInst();
  assert(Res && "Failed to find last instruction in bundle");
  EntryToLastInstruction.try_emplace(E, Res);
  return *Res;
}
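// Positions the IR builder right after the last scalar instruction of the
// bundle (or at the first safe point of the block for PHIs and landing pads),
// so every scalar operand of the bundle is defined before the insertion
// point.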
void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
  auto *Front = E->getMainOp();
  Instruction *LastInst = &getLastInstructionInBundle(E);
  assert(LastInst && "Failed to find last instruction in bundle");
  // ...
  // If the last instruction is a PHI, skip past all PHIs (and past the
  // landing pad instruction, if any).
  BasicBlock::iterator LastInstIt = LastInst->getIterator();
  if (isa<PHINode>(LastInst)) {
    LastInstIt = LastInst->getParent()->getFirstNonPHIIt();
    if (LastInstIt != LastInst->getParent()->end() &&
        LastInstIt->getParent()->isLandingPad())
      LastInstIt = std::next(LastInstIt);
  }
  if (/* ... */ ||
      (!E->isGather() && E->State != TreeEntry::SplitVectorize &&
       (E->doesNotNeedToSchedule() ||
        (E->hasCopyableElements() && !E->isCopyableElement(LastInst) &&
         /* ... */))) ||
      (GatheredLoadsEntriesFirst.has_value() &&
       E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
       E->getOpcode() == Instruction::Load)) {
    Builder.SetInsertPoint(LastInst->getParent(), LastInstIt);
  } else {
    // ...
    Builder.SetInsertPoint(
        /* ... */);
    // ...
    if (Instruction *Res = LastInstructionToPos.lookup(LastInst)) {
      // ...
    }
    // ...
    Res = Builder.CreateAlignedLoad(Builder.getPtrTy(), /* ... */);
    // ...
    LastInstructionToPos.try_emplace(LastInst, Res);
    // ...
  }
  Builder.SetCurrentDebugLocation(Front->getDebugLoc());
}
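// Emits a build vector for the scalars in VL by chaining insertelement
// instructions on top of Root (or poison). Constants are inserted first and
// can be folded into a single constant-vector shuffle; values that are not
// yet available at the insertion point (or are loop-variant while inserting
// outside the loop) are recorded in PostponedIndices and inserted last.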
Value *BoUpSLP::gather(
    ArrayRef<Value *> VL, Value *Root, Type *ScalarTy,
    function_ref<Value *(Value *, Value *, ArrayRef<int>)> CreateShuffle) {
  // ...
  SmallSet<int, 4> PostponedIndices;
  Loop *L = LI->getLoopFor(Builder.GetInsertBlock());
  auto CheckPredecessor = [](BasicBlock *InstBB, BasicBlock *InsertBB) {
    SmallPtrSet<BasicBlock *, 4> Visited;
    while (InsertBB && InsertBB != InstBB && Visited.insert(InsertBB).second)
      InsertBB = InsertBB->getSinglePredecessor();
    return InsertBB && InsertBB == InstBB;
  };
  for (int I = 0, E = VL.size(); I < E; ++I) {
    // ...
    if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) ||
         /* ... */
         (L && (!Root || L->isLoopInvariant(Root)) && L->contains(Inst))) &&
        PostponedIndices.insert(I).second)
      // ...
  }

  auto &&CreateInsertElement = [this](Value *Vec, Value *V, unsigned Pos,
                                      Type *Ty) {
    Value *Scalar = V;
    if (Scalar->getType() != Ty) {
      // ...
      Scalar = Builder.CreateIntCast(/* ... */);
      // ...
    }
    // ...
    Vec = Builder.CreateInsertElement(Vec, Scalar, Builder.getInt32(Pos));
    if (auto *InsElt = dyn_cast<InsertElementInst>(Vec)) {
      GatherShuffleExtractSeq.insert(InsElt);
      // ...
      // If the inserted value is itself vectorized, register the external
      // use so its extract is rewritten after vectorization.
      User *UserOp = nullptr;
      // ...
      if (V->getType()->isVectorTy()) {
        if (auto *SV = dyn_cast<ShuffleVectorInst>(/* ... */);
            SV && SV->getOperand(0) != V && SV->getOperand(1) != V) {
          // Find which resized operand chain leads to V.
          auto FindOperand = [](Value *Vec, Value *V) -> Instruction * {
            // ...
            if (SV->getOperand(0) == V)
              return /* ... */;
            if (SV->getOperand(1) == V)
              return /* ... */;
            return nullptr;
          };
          // ...
          if (Instruction *User = FindOperand(SV->getOperand(0), V))
            UserOp = User;
          else if (Instruction *User = FindOperand(SV->getOperand(1), V))
            UserOp = User;
          assert(UserOp && "Failed to find shufflevector, caused by resize.");
        }
        // ...
      }
      if (UserOp) {
        // ...
        unsigned FoundLane = Entries.front()->findLaneForValue(V);
        ExternalUses.emplace_back(V, UserOp, *Entries.front(), FoundLane);
      }
    }
    return Vec;
  };
  // ...
  SmallVector<int> NonConsts;
  SmallVector<int> Mask(VL.size());
  std::iota(Mask.begin(), Mask.end(), 0);
  Value *OriginalRoot = Root;
  if (auto *SV = dyn_cast_or_null<ShuffleVectorInst>(Root);
      SV && /* ... */
      SV->getOperand(0)->getType() == VecTy) {
    Root = SV->getOperand(0);
    Mask.assign(SV->getShuffleMask().begin(), SV->getShuffleMask().end());
  }
  // Insert constant values first; non-constants are recorded in NonConsts
  // and inserted after the shuffle so they are not shuffled away.
  for (int I = 0, E = VL.size(); I < E; ++I) {
    // ...
    Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
  }
  if (/* ... */)
    Vec = OriginalRoot;
  else
    Vec = CreateShuffle(Root, Vec, Mask);
  if (auto *OI = dyn_cast<Instruction>(OriginalRoot);
      OI && OI->use_empty() &&
      none_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
        return TE->VectorizedValue == OI;
      }))
    // ...
  for (int I : NonConsts)
    Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
  // Append the postponed (possibly loop-based) values at the end to make it
  // possible to hoist the non-loop-based instructions.
  for (const std::pair<Value *, unsigned> &Pair : PostponedInsts)
    Vec = CreateInsertElement(Vec, Pair.first, Pair.second, ScalarTy);
  return Vec;
}
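// IR-emission counterpart of the cost estimator: accumulates input vectors
// and a combined shuffle mask via add(), and materializes the minimal
// shufflevector sequence in finalize().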
class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
  // ...
  bool IsFinalized = false;
  // ...
  /// IR builder callback used by BaseShuffleAnalysis::createShuffle to emit
  /// the actual shuffle instructions and register them for CSE.
  class ShuffleIRBuilder {
    // ...
    ShuffleIRBuilder(/* ... */)
        : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq),
          CSEBlocks(CSEBlocks), DL(DL) {}
    ~ShuffleIRBuilder() = default;
    Value *createShuffleVector(Value *V1, Value *V2, ArrayRef<int> Mask) {
      assert(/* ... */ "Expected integer vector types only.");
      // Cast the narrower operand to the wider integer element type first.
      if (/* V1 narrower than V2 ... */->getIntegerBitWidth())
        V2 = Builder.CreateIntCast(/* ... */);
      else
        V1 = Builder.CreateIntCast(/* ... */);
      Value *Vec = Builder.CreateShuffleVector(V1, V2, Mask);
      if (auto *I = dyn_cast<Instruction>(Vec)) {
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
      return Vec;
    }
    Value *createShuffleVector(Value *V1, ArrayRef<int> Mask) {
      // ...
      unsigned VF = Mask.size();
      // ...
      Value *Vec = Builder.CreateShuffleVector(V1, Mask);
      if (auto *I = dyn_cast<Instruction>(Vec)) {
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
      return Vec;
    }
    Value *createIdentity(Value *V) { return V; }
    Value *createPoison(Type *Ty, unsigned VF) {
      return PoisonValue::get(/* widened vector of Ty with VF elems */);
    }
    /// Resizes the shorter of the two vectors to match the longer one by
    /// shuffling it with an identity mask padded with poison.
    void resizeToMatch(Value *&V1, Value *&V2) {
      // ...
      int VF = std::max(V1VF, V2VF);
      int MinVF = std::min(V1VF, V2VF);
      SmallVector<int> IdentityMask(VF, PoisonMaskElem);
      std::iota(IdentityMask.begin(), std::next(IdentityMask.begin(), MinVF),
                0);
      Value *&Op = MinVF == V1VF ? V1 : V2;
      Op = Builder.CreateShuffleVector(Op, IdentityMask);
      if (auto *I = dyn_cast<Instruction>(Op)) {
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
      // ...
    }
  };

  /// Smart shuffle instruction emission, walks through the shuffle trees and
  /// tries to find the best matching vector for the actual shuffle.
  Value *createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask) {
    assert(V1 && "Expected at least one vector value.");
    ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
                                    R.CSEBlocks, *R.DL);
    return BaseShuffleAnalysis::createShuffle<Value *>(
        V1, V2, Mask, ShuffleBuilder, ScalarTy);
  }

  /// Casts V to a vector with the same element count but with ScalarTy's
  /// scalar type as the element type.
  Value *castToScalarTyElem(Value *V,
                            std::optional<bool> IsSigned = std::nullopt) {
    // ...
    if (VecTy->getElementType() == ScalarTy->getScalarType())
      return V;
    return Builder.CreateIntCast(
        V,
        VectorType::get(ScalarTy->getScalarType(), VecTy->getElementCount()),
        /* ... */);
  }

  Value *getVectorizedValue(const TreeEntry &E) {
    Value *Vec = E.VectorizedValue;
    // ...
    return castToScalarTyElem(Vec, any_of(E.Scalars, [&](Value *V) {
      return !isa<PoisonValue>(V) &&
             !isKnownNonNegative(V, SimplifyQuery(*R.DL));
    }));
  }

public:
  ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
      : BaseShuffleAnalysis(ScalarTy), Builder(Builder), R(R) {}

  /// Adjusts extractelements after reusing them.
  Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
                        ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
                        unsigned NumParts, bool &UseVecBaseAsInput) {
    UseVecBaseAsInput = false;
    SmallPtrSet<Value *, 4> UniqueBases;
    Value *VecBase = nullptr;
    SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
    if (!E->ReorderIndices.empty()) {
      SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
                                   E->ReorderIndices.end());
      // ...
    }
    for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
      // ...
      VecBase = EI->getVectorOperand();
      // ...
      VecBase = TEs.front()->VectorizedValue;
      assert(VecBase && "Expected vectorized value.");
      UniqueBases.insert(VecBase);
      // If this is the only use, it is safe to erase the extractelement.
      if (!EI->hasOneUse() || R.ExternalUsesAsOriginalScalar.contains(EI) ||
          (NumParts != 1 && count(VL, EI) > 1) ||
          any_of(EI->users(), [&](User *U) {
            ArrayRef<TreeEntry *> UTEs = R.getTreeEntries(U);
            return UTEs.empty() || UTEs.size() > 1 ||
                   (isa<GetElementPtrInst>(U) &&
                    !R.areAllUsersVectorized(cast<Instruction>(U))) ||
                   count_if(R.VectorizableTree,
                            [&](const std::unique_ptr<TreeEntry> &TE) {
                              return TE->UserTreeIndex.UserTE ==
                                     /* ... */;
                            }) /* ... */ ||
                   is_contained(VL, EI);
          }))
        continue;
      R.eraseInstruction(EI);
    }
    if (NumParts == 1 || UniqueBases.size() == 1) {
      assert(VecBase && "Expected vectorized value.");
      return castToScalarTyElem(VecBase);
    }
    UseVecBaseAsInput = true;
    // ...
    // Perform multi-register shuffle of the vector operands of the
    // extractelements, at most two source bases per register part.
    Value *Vec = nullptr;
    SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
    for (/* each register part */) {
      // ...
      constexpr int MaxBases = 2;
      SmallVector<Value *, MaxBases> Bases(MaxBases);
      auto VLMask = zip(SubVL, SubMask);
      const unsigned VF = std::accumulate(
          VLMask.begin(), VLMask.end(), 0U, [&](unsigned S, const auto &D) {
            if (std::get<1>(D) == PoisonMaskElem)
              return S;
            Value *VecOp =
                cast<ExtractElementInst>(std::get<0>(D))->getVectorOperand();
            if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecOp);
                !TEs.empty())
              VecOp = TEs.front()->VectorizedValue;
            assert(VecOp && "Expected vectorized value.");
            const unsigned Size =
                cast<FixedVectorType>(VecOp->getType())->getNumElements();
            return std::max(S, Size);
          });
      for (const auto [V, I] : VLMask) {
        // ...
        VecOp = TEs.front()->VectorizedValue;
        assert(VecOp && "Expected vectorized value.");
        VecOp = castToScalarTyElem(VecOp);
        Bases[I / VF] = VecOp;
      }
      if (!Bases.front())
        continue;
      Value *SubVec;
      if (Bases.back()) {
        SubVec = createShuffle(Bases.front(), Bases.back(), SubMask);
        TransformToIdentity(SubMask);
      } else {
        SubVec = Bases.front();
      }
      if (!Vec) {
        Vec = SubVec;
        assert(/* ... */
                   all_of(SubMask, [](int Idx) {
                     return Idx == PoisonMaskElem;
                   }) /* ... */ &&
               "Expected first part or all previous parts masked.");
        copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
      } else {
        // Resize to the common VF and combine with the previous parts.
        unsigned SubVecVF =
            cast<FixedVectorType>(SubVec->getType())->getNumElements();
        NewVF = std::max(NewVF, SubVecVF);
        // ...
        for (int &Idx : SubMask)
          if (Idx != PoisonMaskElem)
            Idx += NewVF;
        copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
        Vec = createShuffle(Vec, SubVec, VecMask);
        TransformToIdentity(VecMask);
      }
    }
    // ...
    return Vec;
  }

  /// Checks if the specified entry can be delayed (its dependencies are not
  /// vectorized yet); if so, returns a stub value to be replaced later.
  std::optional<Value *>
  needToDelay(const TreeEntry *E,
              ArrayRef<SmallVector<const TreeEntry *>> Deps) const {
    if (/* all deps ready: */ all_of(
            TEs, [](const TreeEntry *TE) { return TE->VectorizedValue; }))
      return std::nullopt;
    // Postpone gather emission, will be emitted after the end of the
    // process to keep the correct order.
    auto *ResVecTy = getWidenedType(ScalarTy, E->getVectorFactor());
    return Builder.CreateAlignedLoad(
        ResVecTy, /* poison pointer placeholder */, /* ... */);
  }

  /// Reset the builder to correctly handle a repeated (same) node.
  void resetForSameNode() {
    IsFinalized = false;
    CommonMask.clear();
    // ...
  }

  void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
    Value *V1 = getVectorizedValue(E1);
    Value *V2 = getVectorizedValue(E2);
    add(V1, V2, Mask);
  }
  void add(const TreeEntry &E1, ArrayRef<int> Mask) {
    Value *V1 = getVectorizedValue(E1);
    add(V1, Mask);
  }
  /// Adds two input vectors and the mask for their shuffling.
  void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
    assert(V1 && V2 && !Mask.empty() && "Expected non-empty input vectors.");
    assert(/* ... */
           "castToScalarTyElem expects V1 and V2 to be FixedVectorType");
    V1 = castToScalarTyElem(V1);
    V2 = castToScalarTyElem(V2);
    if (InVectors.empty()) {
      InVectors.push_back(V1);
      InVectors.push_back(V2);
      CommonMask.assign(Mask.begin(), Mask.end());
      return;
    }
    Value *Vec = InVectors.front();
    if (InVectors.size() == 2) {
      Vec = createShuffle(Vec, InVectors.back(), CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    } else if (/* Vec narrower than CommonMask */) {
      Vec = createShuffle(Vec, nullptr, CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    }
    V1 = createShuffle(V1, V2, Mask);
    unsigned VF = std::max(getVF(V1), getVF(Vec));
    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
      if (/* ... */)
        CommonMask[Idx] = Idx + VF;
    InVectors.front() = Vec;
    if (InVectors.size() == 2)
      InVectors.back() = V1;
    else
      InVectors.push_back(V1);
  }
  /// Adds one input vector and the mask for its shuffling.
  void add(Value *V1, ArrayRef<int> Mask, bool /*ForExtracts*/ = false) {
    assert(/* ... */
           "castToScalarTyElem expects V1 to be FixedVectorType");
    V1 = castToScalarTyElem(V1);
    if (InVectors.empty()) {
      InVectors.push_back(V1);
      CommonMask.assign(Mask.begin(), Mask.end());
      return;
    }
    const auto *It = find(InVectors, V1);
    if (It == InVectors.end()) {
      if (InVectors.size() == 2 ||
          InVectors.front()->getType() != V1->getType()) {
        Value *V = InVectors.front();
        if (InVectors.size() == 2) {
          V = createShuffle(InVectors.front(), InVectors.back(), CommonMask);
          transformMaskAfterShuffle(CommonMask, CommonMask);
        } else if (/* V narrower than */ CommonMask.size()) {
          V = createShuffle(InVectors.front(), nullptr, CommonMask);
          transformMaskAfterShuffle(CommonMask, CommonMask);
        }
        unsigned VF = std::max(CommonMask.size(), Mask.size());
        for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
          if (/* ... */)
            CommonMask[Idx] = V->getType() != V1->getType()
                                  ? /* ... */
                                  : Mask[Idx] + getVF(V1);
        if (V->getType() != V1->getType())
          V1 = createShuffle(V1, nullptr, Mask);
        InVectors.front() = V;
        if (InVectors.size() == 2)
          InVectors.back() = V1;
        else
          InVectors.push_back(V1);
        return;
      }
      // Check if the second vector is required at all.
      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
        if (/* ... */) {
          InVectors.push_back(V1);
          break;
        }
    }
    unsigned VF = 0;
    for (Value *V : InVectors)
      VF = std::max(VF, getVF(V));
    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
      if (/* ... */)
        CommonMask[Idx] = Mask[Idx] + (It == InVectors.begin() ? 0 : VF);
  }
  // ...
  Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
                Value *Root = nullptr) {
    return R.gather(VL, Root, ScalarTy,
                    [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
                      return createShuffle(V1, V2, Mask);
                    });
  }
  // ...
  Value *
  finalize(ArrayRef<int> ExtMask,
           ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
           ArrayRef<int> SubVectorsMask, unsigned VF = 0,
           /* optional Action callback ... */) {
    IsFinalized = true;
    if (/* Action */) {
      Value *Vec = InVectors.front();
      if (InVectors.size() == 2) {
        Vec = createShuffle(Vec, InVectors.back(), CommonMask);
        // ...
      } else {
        Vec = createShuffle(Vec, nullptr, CommonMask);
      }
      transformMaskAfterShuffle(CommonMask, CommonMask);
      assert(/* VF != 0 && */
             "Expected vector length for the final value before action.");
      // Resize the result to VF before running the action, if required.
      SmallVector<int> ResizeMask(VF, PoisonMaskElem);
      std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
      Vec = createShuffle(Vec, nullptr, ResizeMask);
      Action(Vec, CommonMask,
             [this](Value *V1, Value *V2, ArrayRef<int> Mask) {
               return createShuffle(V1, V2, Mask);
             });
      InVectors.front() = Vec;
    }
    if (!SubVectors.empty()) {
      Value *Vec = InVectors.front();
      if (InVectors.size() == 2) {
        Vec = createShuffle(Vec, InVectors.back(), CommonMask);
        // ...
      } else {
        Vec = createShuffle(Vec, nullptr, CommonMask);
      }
      transformMaskAfterShuffle(CommonMask, CommonMask);
      auto CreateSubVectors = [&](Value *Vec,
                                  SmallVectorImpl<int> &CommonMask) {
        for (auto [E, Idx] : SubVectors) {
          Value *V = getVectorizedValue(*E);
          // ...
          Type *OrigScalarTy = ScalarTy;
          Vec = /* insert V as a subvector at InsertionIndex */ (
              Builder, Vec, V, InsertionIndex,
              std::bind(&ShuffleInstructionBuilder::createShuffle, this, _1,
                        _2, /* ... */));
          ScalarTy = OrigScalarTy;
          if (!CommonMask.empty()) {
            std::iota(std::next(CommonMask.begin(), Idx),
                      std::next(CommonMask.begin(),
                                Idx + E->getVectorFactor()),
                      /* ... */);
          }
        }
        return Vec;
      };
      if (SubVectorsMask.empty()) {
        Vec = CreateSubVectors(Vec, CommonMask);
      } else {
        SmallVector<int> SVMask(/* ... */);
        copy(SubVectorsMask, SVMask.begin());
        for (auto [I1, I2] : zip(SVMask, CommonMask)) {
          // ...
          I1 = I2 + CommonMask.size();
        }
        // ...
        Vec = createShuffle(InsertVec, Vec, SVMask);
        transformMaskAfterShuffle(CommonMask, SVMask);
      }
      InVectors.front() = Vec;
    }
    if (!ExtMask.empty()) {
      if (CommonMask.empty()) {
        CommonMask.assign(ExtMask.begin(), ExtMask.end());
      } else {
        SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
        for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
          // ...
          NewMask[I] = CommonMask[ExtMask[I]];
        }
        CommonMask.swap(NewMask);
      }
    }
    if (CommonMask.empty()) {
      assert(InVectors.size() == 1 &&
             "Expected only one vector with no mask");
      return InVectors.front();
    }
    if (InVectors.size() == 2)
      return createShuffle(InVectors.front(), InVectors.back(), CommonMask);
    return createShuffle(InVectors.front(), nullptr, CommonMask);
  }

  ~ShuffleInstructionBuilder() {
    assert((IsFinalized || CommonMask.empty()) &&
           "Shuffle construction must be finalized.");
  }
};
Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx) {
  // ...
}

template <typename BVTy, typename ResTy, typename... Args>
ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
                                  Args &...Params) {
  assert(E->isGather() && "Expected gather node.");
  unsigned VF = E->getVectorFactor();
  // ...
  bool NeedFreeze = false;
  SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end());
  // Clear values, to be replaced by insertvector instructions.
  for (auto [EIdx, Idx] : E->CombinedEntriesWithIndices)
    for_each(MutableArrayRef(GatheredScalars)
                 .slice(Idx, VectorizableTree[EIdx]->getVectorFactor()),
             /* ... */);
  SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
      E->CombinedEntriesWithIndices.size());
  transform(E->CombinedEntriesWithIndices, SubVectors.begin(),
            [&](const auto &P) {
              return std::make_pair(VectorizableTree[P.first].get(), P.second);
            });
  // ...
  SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
                               E->ReorderIndices.end());
  if (!ReorderMask.empty())
    reorderScalars(GatheredScalars, ReorderMask);
  SmallVector<int> SubVectorsMask;
  // ...
  if (!SubVectors.empty() && !SubVectorsMask.empty()) {
    for (/* each lane I */)
      if (E->Scalars[I] == GatheredScalars[ReorderMask[I]])
        SubVectorsMask[ReorderMask[I]] = PoisonMaskElem;
  } else {
    SubVectorsMask.clear();
  }
  // ...
  auto FindReusedSplat = [&](MutableArrayRef<int> Mask, unsigned InputVF,
                             unsigned I, unsigned SliceSize,
                             bool IsNotPoisonous) {
    if (!isSplat(E->Scalars) || none_of(E->Scalars, [](Value *V) {
          return isa<UndefValue>(V) && !isa<PoisonValue>(V);
        }))
      return false;
    TreeEntry *UserTE = E->UserTreeIndex.UserTE;
    unsigned EdgeIdx = E->UserTreeIndex.EdgeIdx;
    if (UserTE->getNumOperands() != 2)
      return false;
    if (!IsNotPoisonous) {
      auto *It =
          find_if(ArrayRef(VectorizableTree).drop_front(UserTE->Idx + 1),
                  [=](const std::unique_ptr<TreeEntry> &TE) {
                    return TE->UserTreeIndex.UserTE == UserTE &&
                           TE->UserTreeIndex.EdgeIdx != EdgeIdx;
                  });
      if (It == VectorizableTree.end())
        return false;
      // ...
      if (!(*It)->ReorderIndices.empty()) {
        // ...
      }
      if (!all_of(zip(GatheredScalars, GS), [&](const auto &P) {
            Value *V0 = std::get<0>(P);
            Value *V1 = std::get<1>(P);
            return /* ... */;
          }))
        return false;
    }
    if ((Mask.size() < InputVF &&
         /* ... */) ||
        (Mask.size() == InputVF &&
         /* identity mask check ... */)) {
      std::iota(std::next(Mask.begin(), I * SliceSize),
                std::next(Mask.begin(), /* ... */), 0);
    } else {
      // ...
      std::fill(std::next(Mask.begin(), I * SliceSize),
                std::next(Mask.begin(), /* ... */), /* ... */);
    }
    return true;
  };
  BVTy ShuffleBuilder(ScalarTy, Params...);
  ResTy Res = ResTy();
  SmallVector<int> Mask;
  SmallVector<int> ExtractMask(GatheredScalars.size(), PoisonMaskElem);
  SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles;
  Value *ExtractVecBase = nullptr;
  bool UseVecBaseAsInput = false;
  SmallVector<std::optional<TTI::ShuffleKind>> GatherShuffles;
  SmallVector<SmallVector<const TreeEntry *>> Entries;
  Type *OrigScalarTy = GatheredScalars.front()->getType();
  // ...
  bool Resized = false;
  // Check for gathered extracts.
  ExtractShuffles =
      tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
  if (!ExtractShuffles.empty()) {
    SmallVector<const TreeEntry *> ExtractEntries;
    for (auto [Idx, I] : enumerate(ExtractMask)) {
      // ...
      ExtractEntries.append(TEs.begin(), TEs.end());
    }
    if (std::optional<ResTy> Delayed =
            ShuffleBuilder.needToDelay(E, ExtractEntries)) {
      // Delay emission of gathers which are not ready yet.
      PostponedGathers.insert(E);
      return *Delayed;
    }
    if (Value *VecBase = ShuffleBuilder.adjustExtracts(
            E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) {
      ExtractVecBase = VecBase;
      // ...
      if (VF == VecBaseTy->getNumElements() &&
          GatheredScalars.size() != VF) {
        Resized = true;
        GatheredScalars.append(VF - GatheredScalars.size(),
                               PoisonValue::get(OrigScalarTy));
      }
    }
  }
  // Gather entry shuffles after we checked for full matched extracts only.
  if (!ExtractShuffles.empty() || !E->hasState() ||
      E->getOpcode() != Instruction::Load ||
      (((E->hasState() && E->getOpcode() == Instruction::Load) ||
        /* ... */) &&
       any_of(E->Scalars,
              [this](Value *V) {
                return isa<LoadInst>(V) && isVectorized(V);
              })) ||
      (E->hasState() && E->isAltShuffle()) ||
      all_of(E->Scalars, [this](Value *V) { return isVectorized(V); }) ||
      /* ... */
      (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) {
    GatherShuffles =
        isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts);
  }
  if (!GatherShuffles.empty()) {
    if (std::optional<ResTy> Delayed =
            ShuffleBuilder.needToDelay(E, Entries)) {
      // Delay emission of gathers which are not ready yet.
      PostponedGathers.insert(E);
      return *Delayed;
    }
    if (GatherShuffles.size() == 1 &&
        /* ... */
        Entries.front().front()->isSame(E->Scalars)) {
      // Perfect match in the graph, will reuse the previously vectorized
      // node. Cost is 0.
      LLVM_DEBUG(dbgs() << "SLP: perfect diamond match for gather bundle "
                        << /* ... */ << ".\n");
      Mask.resize(E->Scalars.size());
      const TreeEntry *FrontTE = Entries.front().front();
      if (FrontTE->ReorderIndices.empty() &&
          ((FrontTE->ReuseShuffleIndices.empty() &&
            E->Scalars.size() == FrontTE->Scalars.size()) ||
           (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) {
        std::iota(Mask.begin(), Mask.end(), 0);
      } else {
        for (/* each scalar V at lane I */) {
          // ...
          Mask[I] = FrontTE->findLaneForValue(V);
        }
      }
      // ...
      ShuffleBuilder.resetForSameNode();
      ShuffleBuilder.add(*FrontTE, Mask);
      Res = ShuffleBuilder.finalize(E->getCommonMask(), {}, {});
      return Res;
    }
    if (!Resized) {
      if (GatheredScalars.size() != VF &&
          any_of(Entries, [&](ArrayRef<const TreeEntry *> TEs) {
            return any_of(TEs, [&](const TreeEntry *TE) {
              return TE->getVectorFactor() == VF;
            });
          }))
        GatheredScalars.append(VF - GatheredScalars.size(),
                               PoisonValue::get(OrigScalarTy));
    }
    // Remove shuffled elements from the list of gathers.
    for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
      // ...
    }
  }
  auto TryPackScalars = [&](SmallVectorImpl<Value *> &Scalars,
                            SmallVectorImpl<int> &ReuseMask,
                            bool IsRootPoison) {
    // For splats we can emit broadcasts instead of gathers, so try to find
    // such scalars.
    bool IsSplat = IsRootPoison && isSplat(Scalars) &&
                   /* ... */;
    // ...
    int NumNonConsts = 0;
    // ...
    for (/* each scalar */) {
      // ...
      if (IsSplat) {
        Scalars.front() = OrigV;
        // ...
      } else {
        const auto Res = UniquePositions.try_emplace(OrigV, I);
        Scalars[Res.first->second] = OrigV;
        ReuseMask[I] = Res.first->second;
      }
    }
    if (NumNonConsts == 1) {
      // Restore a single insert element.
      // ...
      if (!UndefPos.empty() && UndefPos.front() == 0)
        // ...
      ReuseMask[SinglePos] = SinglePos;
    } else if (!UndefPos.empty() && IsSplat) {
      // For undef values, try to replace them with a simple broadcast. That
      // is possible if the broadcasted value is guaranteed non-poisonous,
      // otherwise the broadcast is followed by a freeze.
      auto *It = find_if(Scalars, [this, E](Value *V) {
        return !isa<UndefValue>(V) &&
               (/* ... */ ||
                (E->UserTreeIndex && any_of(V->uses(), [E](const Use &U) {
                   // Check if the value is already used in the same
                   // operation in one of the nodes.
                   return E->UserTreeIndex.EdgeIdx != U.getOperandNo() &&
                          is_contained(E->UserTreeIndex.UserTE->Scalars,
                                       U.getUser());
                 })));
      });
      if (It != Scalars.end()) {
        // Replace undefs by the non-poisoned scalars and emit a broadcast.
        int Pos = std::distance(Scalars.begin(), It);
        for (int I : UndefPos) {
          // ...
          ReuseMask[I] = Pos;
        }
      } else {
        // Replace undefs by poisons, emit broadcast and then freeze.
        for (int I : UndefPos) {
          // ...
        }
        NeedFreeze = true;
      }
    }
  };
  if (!ExtractShuffles.empty() || !GatherShuffles.empty()) {
    bool IsNonPoisoned = true;
    bool IsUsedInExpr = true;
    Value *Vec1 = nullptr;
    if (!ExtractShuffles.empty()) {
      // A gather of extractelements can be represented as a shuffle of one
      // or two source vectors.
      Value *Vec2 = nullptr;
      for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
        // ...
      }
      if (UseVecBaseAsInput) {
        Vec1 = ExtractVecBase;
      } else {
        for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
          // ...
          Value *VecOp = EI->getVectorOperand();
          if (/* ... */ !TEs.empty() && TEs.front()->VectorizedValue)
            VecOp = TEs.front()->VectorizedValue;
          if (!Vec1) {
            Vec1 = VecOp;
          } else if (Vec1 != VecOp) {
            assert((!Vec2 || Vec2 == VecOp) &&
                   "Expected only 1 or 2 vectors shuffle.");
            Vec2 = VecOp;
          }
        }
      }
      if (Vec2) {
        IsUsedInExpr = false;
        // ...
        ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
      } else if (Vec1) {
        // ...
        IsUsedInExpr &= FindReusedSplat(
            ExtractMask, /* ... */,
            ExtractMask.size(), IsNotPoisonedVec);
        ShuffleBuilder.add(Vec1, ExtractMask, /*ForExtracts=*/true);
        IsNonPoisoned &= IsNotPoisonedVec;
      } else {
        IsUsedInExpr = false;
        // ...
      }
    }
    if (!GatherShuffles.empty()) {
      unsigned SliceSize =
          /* ... */;
      for (const auto [I, TEs] : enumerate(Entries)) {
        if (TEs.empty()) {
          assert(/* ... */
                 "No shuffles with empty entries list expected.");
          continue;
        }
        assert((TEs.size() == 1 || TEs.size() == 2) &&
               "Expected shuffle of 1 or 2 entries.");
        unsigned Limit = getNumElems(Mask.size(), SliceSize, I);
        // ...
        copy(SubMask, std::next(VecMask.begin(), I * SliceSize));
        if (TEs.size() == 1) {
          bool IsNotPoisonedVec =
              TEs.front()->VectorizedValue
                  ? /* ... */
                  : true;
          IsUsedInExpr &=
              FindReusedSplat(VecMask, TEs.front()->getVectorFactor(), I,
                              SliceSize, IsNotPoisonedVec);
          ShuffleBuilder.add(*TEs.front(), VecMask);
          IsNonPoisoned &= IsNotPoisonedVec;
        } else {
          IsUsedInExpr = false;
          ShuffleBuilder.add(*TEs.front(), *TEs.back(), VecMask);
          if (TEs.front()->VectorizedValue && TEs.back()->VectorizedValue)
            // ...
        }
      }
    }
    // Decide whether to build a constant vector and shuffle with it, or to
    // insert the remaining scalars directly.
    int EMSz = ExtractMask.size();
    int MSz = Mask.size();
    // Try to build a constant vector and shuffle with it only if we
    // currently have a single permutation and more than one constant.
    bool IsSingleShuffle = ExtractShuffles.empty() || GatherShuffles.empty();
    bool IsIdentityShuffle =
        ((UseVecBaseAsInput ||
          all_of(ExtractShuffles,
                 [](const std::optional<TTI::ShuffleKind> &SK) {
                   return /* ... */;
                 })) &&
         none_of(ExtractMask, [&](int I) { return I >= EMSz; }) &&
         /* ... */) ||
        (!GatherShuffles.empty() &&
         all_of(GatherShuffles,
                [](const std::optional<TTI::ShuffleKind> &SK) {
                  return /* ... */;
                }) &&
         none_of(Mask, [&](int I) { return I >= MSz; }) &&
         /* ... */);
    bool EnoughConstsForShuffle =
        IsSingleShuffle &&
        /* ... */
        (!IsIdentityShuffle ||
         (GatheredScalars.size() == 2 &&
          /* ... */));
    // NonConstants keeps just the non-constant values; GatheredScalars keeps
    // only the constants for the final constant-vector shuffle.
    for (int I = 0, Sz = GatheredScalars.size(); I < Sz; ++I) {
      if (EnoughConstsForShuffle && isa<Constant>(GatheredScalars[I]))
        // ...
      else
        // ...
    }
    // Generate constants for the final shuffle and build a mask for them.
    if (/* ... */) {
      SmallVector<int> BVMask(GatheredScalars.size(), PoisonMaskElem);
      TryPackScalars(GatheredScalars, BVMask, /*IsRootPoison=*/true);
      Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.size());
      ShuffleBuilder.add(BV, BVMask);
    }
    if (/* all remaining scalars are poison/undef and ... */
        (IsSingleShuffle && ((IsIdentityShuffle &&
                              IsNonPoisoned) /* ... */)))
      Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
                                    SubVectorsMask);
    else
      Res = ShuffleBuilder.finalize(
          E->ReuseShuffleIndices, SubVectors, SubVectorsMask,
          E->Scalars.size(),
          [&](Value *&Vec, SmallVectorImpl<int> &Mask, auto CreateShuffle) {
            bool IsSplat = isSplat(NonConstants);
            SmallVector<int> BVMask(Mask.size(), PoisonMaskElem);
            TryPackScalars(NonConstants, BVMask, /*IsRootPoison=*/false);
            auto CheckIfSplatIsProfitable = [&]() {
              // Estimate the cost of splatting + two-source shuffle vs
              // insertelement + one-source shuffle.
              constexpr TTI::TargetCostKind CostKind =
                  TTI::TCK_RecipThroughput;
              Value *V = *find_if_not(NonConstants, IsaPred<UndefValue>);
              if (isa<ExtractElementInst>(V) || isVectorized(V))
                return false;
              InstructionCost SplatCost = TTI->getVectorInstrCost(
                  Instruction::InsertElement, VecTy, CostKind, /*Index=*/0,
                  PoisonValue::get(VecTy), V);
              SmallVector<int> NewMask(Mask.begin(), Mask.end());
              for (auto [Idx, I] : enumerate(BVMask))
                if (I != PoisonMaskElem)
                  NewMask[Idx] = Mask.size();
              SplatCost += ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc,
                                            VecTy, NewMask, CostKind);
              InstructionCost BVCost = TTI->getVectorInstrCost(
                  Instruction::InsertElement, VecTy, CostKind,
                  *find_if(Mask, [](int I) { return I != PoisonMaskElem; }),
                  /* ... */);
              // Shuffle required?
              if (count(BVMask, PoisonMaskElem) <
                  static_cast<int>(BVMask.size() - 1)) {
                SmallVector<int> NewMask(Mask.begin(), Mask.end());
                for (auto [Idx, I] : enumerate(BVMask))
                  if (I != PoisonMaskElem)
                    // ...
                BVCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
                                           VecTy, NewMask, CostKind);
              }
              return SplatCost <= BVCost;
            };
            if (!IsSplat || Mask.size() <= 2 || !CheckIfSplatIsProfitable()) {
              // ...
              Vec = ShuffleBuilder.gather(NonConstants, Mask.size(), Vec);
              // ...
            } else {
              // ...
              Value *BV = ShuffleBuilder.gather(Values, BVMask.size());
              SmallVector<int> SplatMask(/* ... */);
              transform(BVMask, SplatMask.begin(), [](int I) {
                return I == PoisonMaskElem ? PoisonMaskElem : 0;
              });
              // ...
              BV = CreateShuffle(BV, nullptr, SplatMask);
              for (/* each lane Idx */)
                if (BVMask[Idx] != PoisonMaskElem)
                  Mask[Idx] = BVMask.size() + Idx;
              Vec = CreateShuffle(Vec, BV, Mask);
              // ...
            }
          });
  } else if (/* ... */) {
    // Gather unique scalars and all constants.
    SmallVector<int> ReuseMask(GatheredScalars.size(), PoisonMaskElem);
    TryPackScalars(GatheredScalars, ReuseMask, /*IsRootPoison=*/true);
    Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.size());
    ShuffleBuilder.add(BV, ReuseMask);
    Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
                                  SubVectorsMask);
  } else {
    // Gather all constants.
    // ...
    for (auto [I, V] : enumerate(GatheredScalars)) {
      // ...
    }
    Value *BV = ShuffleBuilder.gather(GatheredScalars);
    ShuffleBuilder.add(BV, Mask);
    Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
                                  /* ... */);
  }

  if (NeedFreeze)
    Res = ShuffleBuilder.createFreeze(Res);
  return Res;
}
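// Thin wrapper instantiating processBuildVector with the IR-emitting
// ShuffleInstructionBuilder; combined subvector entries are vectorized first
// so their values are available to the builder.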
Value *BoUpSLP::createBuildVector(const TreeEntry *E, Type *ScalarTy) {
  for (auto [EIdx, _] : E->CombinedEntriesWithIndices)
    (void)vectorizeTree(VectorizableTree[EIdx].get());
  return processBuildVector<ShuffleInstructionBuilder, Value *>(E, ScalarTy,
                                                                /* ... */);
}

// ...
  for (Value *V : VL)
    // ...

Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
  IRBuilderBase::InsertPointGuard Guard(Builder);

  Value *V = E->Scalars.front();
  Type *ScalarTy = V->getType();
  // ...
  auto It = MinBWs.find(E);
  if (It != MinBWs.end()) {
    // ...
  }
  // ...
  if (E->VectorizedValue)
    return E->VectorizedValue;
  if (E->isGather()) {
    // ...
    if (E->hasState() && E->Idx == 0 && !UserIgnoreList)
      setInsertPointAfterBundle(E);
    Value *Vec = createBuildVector(E, ScalarTy);
    E->VectorizedValue = Vec;
    return Vec;
  }
  if (E->State == TreeEntry::SplitVectorize) {
    assert(E->CombinedEntriesWithIndices.size() == 2 &&
           "Expected exactly 2 combined entries.");
    setInsertPointAfterBundle(E);
    TreeEntry &OpTE1 =
        *VectorizableTree[E->CombinedEntriesWithIndices.front().first];
    assert(OpTE1.isSame(
               ArrayRef(E->Scalars).take_front(OpTE1.getVectorFactor())) &&
           "Expected same first part of scalars.");
    // ...
    TreeEntry &OpTE2 =
        *VectorizableTree[E->CombinedEntriesWithIndices.back().first];
    assert(
        OpTE2.isSame(
            ArrayRef(E->Scalars).take_back(OpTE2.getVectorFactor())) &&
        "Expected same second part of scalars.");
    // ...
    auto GetOperandSignedness = [&](const TreeEntry *OpE) {
      bool IsSigned = false;
      auto It = MinBWs.find(OpE);
      if (It != MinBWs.end())
        IsSigned = It->second.second;
      else
        IsSigned = any_of(OpE->Scalars, [&](Value *V) {
          if (isa<PoisonValue>(V))
            return false;
          return !isKnownNonNegative(R, SimplifyQuery(*DL));
        });
      return IsSigned;
    };
    // Cast both halves to the common minimized element type, if required.
    if (/* ... */)
      Op1 = Builder.CreateIntCast(
          /* ... */
          GetOperandSignedness(&OpTE1));
    if (/* ... */)
      Op2 = Builder.CreateIntCast(
          /* ... */
          GetOperandSignedness(&OpTE2));
    if (E->ReorderIndices.empty()) {
      SmallVector<int> Mask(E->getVectorFactor(), PoisonMaskElem);
      std::iota(
          Mask.begin(),
          std::next(Mask.begin(),
                    E->CombinedEntriesWithIndices.back().second),
          0);
      if (ScalarTyNumElements != 1) {
        // ...
      }
      Value *Vec = Builder.CreateShuffleVector(Op1, Mask);
      Vec = /* insert Op2 as subvector at */ (
          E->CombinedEntriesWithIndices.back().second * ScalarTyNumElements);
      E->VectorizedValue = Vec;
      return Vec;
    }
    unsigned CommonVF =
        std::max(OpTE1.getVectorFactor(), OpTE2.getVectorFactor());
    if (/* ... */) {
      SmallVector<int> Mask(CommonVF, PoisonMaskElem);
      std::iota(Mask.begin(),
                std::next(Mask.begin(), OpTE1.getVectorFactor()), 0);
      Op1 = Builder.CreateShuffleVector(Op1, Mask);
    }
    if (/* ... */) {
      SmallVector<int> Mask(CommonVF, PoisonMaskElem);
      std::iota(Mask.begin(),
                std::next(Mask.begin(), OpTE2.getVectorFactor()), 0);
      Op2 = Builder.CreateShuffleVector(Op2, Mask);
    }
    Value *Vec = Builder.CreateShuffleVector(Op1, Op2, E->getSplitMask());
    E->VectorizedValue = Vec;
    return Vec;
  }

  bool IsReverseOrder =
      /* ... */;
  auto FinalShuffle = [&](Value *V, const TreeEntry *E) {
    // ...
    if (E->getOpcode() == Instruction::Store &&
        E->State == TreeEntry::Vectorize) {
      ArrayRef<int> Mask =
          ArrayRef(reinterpret_cast<const int *>(E->ReorderIndices.begin()),
                   E->ReorderIndices.size());
      ShuffleBuilder.add(V, Mask);
    } else if ((E->State == TreeEntry::StridedVectorize && IsReverseOrder) ||
               E->State == TreeEntry::CompressVectorize) {
      ShuffleBuilder.addOrdered(V, {});
    } else {
      ShuffleBuilder.addOrdered(V, E->ReorderIndices);
    }
    SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
        E->CombinedEntriesWithIndices.size());
    transform(
        E->CombinedEntriesWithIndices, SubVectors.begin(),
        [&](const auto &P) {
          return std::make_pair(VectorizableTree[P.first].get(), P.second);
        });
    assert(
        (E->CombinedEntriesWithIndices.empty() ||
         E->ReorderIndices.empty()) &&
        "Expected either combined subnodes or reordering");
    return ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors, {});
  };

  assert(!E->isGather() && "Unhandled state");
  unsigned ShuffleOrOp =
      E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector
                        : E->getOpcode();
  auto GetOperandSignedness = [&](unsigned Idx) {
    const TreeEntry *OpE = getOperandEntry(E, Idx);
    bool IsSigned = false;
    auto It = MinBWs.find(OpE);
    if (It != MinBWs.end())
      IsSigned = It->second.second;
    else
      IsSigned = any_of(OpE->Scalars, [&](Value *V) {
        if (isa<PoisonValue>(V))
          return false;
        return !isKnownNonNegative(R, SimplifyQuery(*DL));
      });
    return IsSigned;
  };
  // ...
  switch (ShuffleOrOp) {
  case Instruction::PHI: {
    assert((E->ReorderIndices.empty() || !E->ReuseShuffleIndices.empty() ||
            E != VectorizableTree.front().get() || E->UserTreeIndex) &&
           "PHI reordering is free.");
    // ...
    Builder.SetInsertPoint(PH->getParent(),
                           PH->getParent()->getFirstNonPHIIt());
    // ...
    PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
    // ...
    Builder.SetInsertPoint(PH->getParent(),
                           PH->getParent()->getFirstInsertionPt());
    // ...
    V = FinalShuffle(V, E);
    E->VectorizedValue = V;
    // ...
    // PHIs have to be created before all other instructions; vectorize the
    // operand for each incoming block only once and reuse it for repeated
    // (diamond) incoming blocks.
    SmallPtrSet<BasicBlock *, 4> VisitedBBs;
    for (/* each incoming block IBB at index I */) {
      // ...
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      // ...
      if (!VisitedBBs.insert(IBB).second) {
        // Reuse the operand vectorized for the first occurrence of IBB.
        TreeEntry *OpTE = getOperandEntry(E, I);
        assert(!OpTE->VectorizedValue && "Expected no vectorized value.");
        OpTE->VectorizedValue = VecOp;
        // ...
      }
      // ...
      Value *Vec = vectorizeOperand(E, I);
      if (VecTy != Vec->getType()) {
        assert((It != MinBWs.end() || /* ... */
                MinBWs.contains(getOperandEntry(E, I))) &&
               "Expected item in MinBWs.");
        Vec = Builder.CreateIntCast(Vec, VecTy, GetOperandSignedness(I));
      }
      // ...
    }
    assert(/* ... */
           "Invalid number of incoming values");
    assert(E->VectorizedValue && "Expected vectorized value.");
    return E->VectorizedValue;
  }
  case Instruction::ExtractElement: {
    Value *V = E->getSingleOperand(0);
    setInsertPointAfterBundle(E);
    V = FinalShuffle(V, E);
    E->VectorizedValue = V;
    return V;
  }
  case Instruction::ExtractValue: {
    // ...
    Builder.SetInsertPoint(LI);
    Value *Ptr = LI->getPointerOperand();
    LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign());
    // ...
    NewV = FinalShuffle(NewV, E);
    E->VectorizedValue = NewV;
    return NewV;
  }
  case Instruction::InsertElement: {
    assert(E->ReuseShuffleIndices.empty() &&
           "All inserts should be unique");
    if (const TreeEntry *OpE = getOperandEntry(E, 1);
        OpE && !OpE->isGather() && OpE->hasState() &&
        !OpE->hasCopyableElements())
      // ...
    else
      setInsertPointAfterBundle(E);
    Value *V = vectorizeOperand(E, 1);
    ArrayRef<Value *> Op = E->getOperand(1);
    Type *ScalarTy = Op.front()->getType();
    if (/* the operand was minimized */) {
      // ...
      std::pair<unsigned, bool> Res = MinBWs.lookup(getOperandEntry(E, 1));
      assert(Res.first > 0 && "Expected item in MinBWs.");
      V = Builder.CreateIntCast(/* ... */);
    }
    // ...
    // Find the first insert of the buildvector sequence.
    auto *FirstInsert = /* ... */ find_if(E->Scalars, [E](Value *V) {
      return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
    });
    const unsigned NumElts =
        cast<FixedVectorType>(FirstInsert->getType())->getNumElements();
    const unsigned NumScalars = E->Scalars.size();
    // ...
    assert(Offset < NumElts && "Failed to find vector index offset");

    // Create a shuffle mask mapping the scalars to their insert positions.
    SmallVector<int> Mask;
    if (!E->ReorderIndices.empty()) {
      // ...
    } else {
      Mask.assign(NumElts, PoisonMaskElem);
      std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0);
    }
    bool IsIdentity = true;
    SmallVector<int> PrevMask(NumElts, PoisonMaskElem);
    Mask.swap(PrevMask);
    for (unsigned I = 0; I < NumScalars; ++I) {
      // ...
      IsIdentity &= InsertIdx - Offset == I;
      // ...
    }
    if (!IsIdentity || NumElts != NumScalars) {
      Value *V2 = nullptr;
      bool IsVNonPoisonous =
          /* ... */;
      SmallVector<int> InsertMask(Mask);
      if (NumElts != NumScalars && Offset == 0) {
        // ...
        InsertMask[*InsertIdx] = *InsertIdx;
        // ...
        SmallBitVector UseMask =
            buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
        SmallBitVector IsFirstPoison =
            /* ... */;
        SmallBitVector IsFirstUndef =
            /* ... */;
        if (!IsFirstPoison.all()) {
          // ...
          for (unsigned I = 0; I < NumElts; I++) {
            if (/* ... */
                IsFirstUndef.test(I)) {
              if (IsVNonPoisonous) {
                InsertMask[I] = I < NumScalars ? I : 0;
                continue;
              }
              // ...
              if (Idx >= NumScalars)
                Idx = NumScalars - 1;
              InsertMask[I] = NumScalars + Idx;
              // ...
            }
          }
          // ...
        }
        // ...
        V = Builder.CreateShuffleVector(V, V2, InsertMask);
        if (auto *I = dyn_cast<Instruction>(V)) {
          GatherShuffleExtractSeq.insert(I);
          CSEBlocks.insert(I->getParent());
        }
      }
      // ...
      for (unsigned I = 0; I < NumElts; I++) {
        // ...
      }
      SmallBitVector UseMask =
          buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
      SmallBitVector IsFirstUndef =
          /* ... */;
      if ((!IsIdentity || Offset != 0 || !IsFirstUndef.all()) &&
          NumElts != NumScalars) {
        if (IsFirstUndef.all()) {
          // ...
          SmallBitVector IsFirstPoison =
              /* ... */;
          if (!IsFirstPoison.all()) {
            for (unsigned I = 0; I < NumElts; I++) {
              // ...
              InsertMask[I] = I + NumElts;
              // ...
            }
          }
          V = Builder.CreateShuffleVector(
              /* ... */);
          if (auto *I = dyn_cast<Instruction>(V)) {
            GatherShuffleExtractSeq.insert(I);
            CSEBlocks.insert(I->getParent());
          }
        } else {
          SmallBitVector IsFirstPoison =
              /* ... */;
          for (unsigned I = 0; I < NumElts; I++) {
            // ...
            InsertMask[I] += NumElts;
          }
          V = Builder.CreateShuffleVector(
              FirstInsert->getOperand(0), V, InsertMask,
              /* ... */);
          if (auto *I = dyn_cast<Instruction>(V)) {
            GatherShuffleExtractSeq.insert(I);
            CSEBlocks.insert(I->getParent());
          }
        }
      }
    }
    ++NumVectorInstructions;
    E->VectorizedValue = V;
    return V;
  }
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    setInsertPointAfterBundle(E);

    Value *InVec = vectorizeOperand(E, 0);
    // ...
    // Adjust the cast opcode when either side of the cast was minimized:
    // same width becomes a bitcast, narrower a trunc, wider a sext/zext
    // depending on the recorded signedness.
    auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
    if (/* ... */
        (SrcIt != MinBWs.end() || It != MinBWs.end() ||
         /* ... */)) {
      // ...
      unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy);
      if (SrcIt != MinBWs.end())
        SrcBWSz = SrcIt->second.first;
      unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType());
      if (BWSz == SrcBWSz) {
        VecOpcode = Instruction::BitCast;
      } else if (BWSz < SrcBWSz) {
        VecOpcode = Instruction::Trunc;
      } else if (It != MinBWs.end()) {
        assert(BWSz > SrcBWSz && "Invalid cast!");
        VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
      } else if (SrcIt != MinBWs.end()) {
        assert(BWSz > SrcBWSz && "Invalid cast!");
        VecOpcode =
            SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
      }
    } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
               !SrcIt->second.second) {
      VecOpcode = Instruction::UIToFP;
    }
    Value *V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
                   ? InVec
                   : Builder.CreateCast(VecOpcode, InVec, VecTy);
    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::FCmp:
  case Instruction::ICmp: {
    setInsertPointAfterBundle(E);

    Value *L = vectorizeOperand(E, 0);
    Value *R = vectorizeOperand(E, 1);
    if (L->getType() != R->getType()) {
      assert((/* ... */
              MinBWs.contains(getOperandEntry(E, 0)) ||
              MinBWs.contains(getOperandEntry(E, 1))) &&
             "Expected item in MinBWs.");
      // Cast the narrower operand to the wider integer element type.
      if (/* L narrower than R */
              ->getIntegerBitWidth()) {
        Type *CastTy = R->getType();
        L = Builder.CreateIntCast(L, CastTy, GetOperandSignedness(0));
      } else {
        Type *CastTy = L->getType();
        R = Builder.CreateIntCast(R, CastTy, GetOperandSignedness(1));
      }
    }
    // ...
    Value *V = Builder.CreateCmp(P0, L, R);
    // ...
    if (auto *ICmp = dyn_cast<ICmpInst>(V); /* ... */)
      ICmp->setSameSign(/*B=*/false);
    // ...
    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::Select: {
    setInsertPointAfterBundle(E);

    // ...
    Value *True = vectorizeOperand(E, 1);
    Value *False = vectorizeOperand(E, 2);
    if (True->getType() != VecTy || False->getType() != VecTy) {
      assert((/* ... */
              MinBWs.contains(getOperandEntry(E, 1)) ||
              MinBWs.contains(getOperandEntry(E, 2))) &&
             "Expected item in MinBWs.");
      if (True->getType() != VecTy)
        True = Builder.CreateIntCast(True, VecTy, GetOperandSignedness(1));
      if (False->getType() != VecTy)
        False = Builder.CreateIntCast(False, VecTy, GetOperandSignedness(2));
    }
    // ...
    assert(TrueNumElements >= CondNumElements &&
           TrueNumElements % CondNumElements == 0 &&
           "Cannot vectorize Instruction::Select");
    assert(/* ... */
           "Cannot vectorize Instruction::Select");
    if (CondNumElements != TrueNumElements) {
      // When the condition is narrower than the operands, replicate it to
      // match their element count.
      Cond = Builder.CreateShuffleVector(
          /* ... */);
    }
    assert(/* ... */
           "Cannot vectorize Instruction::Select");
    Value *V = Builder.CreateSelectWithUnknownProfile(Cond, True, False,
                                                      DEBUG_TYPE);
    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::FNeg: {
    setInsertPointAfterBundle(E);

    Value *Op = vectorizeOperand(E, 0);

    Value *V = Builder.CreateUnOp(
        /* ... */);
    // ...
    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::Freeze: {
    setInsertPointAfterBundle(E);

    Value *Op = vectorizeOperand(E, 0);

    if (Op->getType() != VecTy) {
      assert((/* ... */
              MinBWs.contains(getOperandEntry(E, 0))) &&
             "Expected item in MinBWs.");
      Op = Builder.CreateIntCast(Op, VecTy, GetOperandSignedness(0));
    }
    Value *V = Builder.CreateFreeze(Op);
    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    setInsertPointAfterBundle(E);

    // ...
    // An "and" with a constant mask covering all significant (minimized)
    // bits is a no-op: just forward the other operand.
    if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
      // ...
      return CI && CI->getValue().countr_one() >= It->second.first;
      // ...
      V = FinalShuffle(I == 0 ? RHS : LHS, E);
      E->VectorizedValue = V;
      ++NumVectorInstructions;
      return V;
      // ...
    }
    if (/* operand types differ from VecTy */) {
      assert((/* ... */
              MinBWs.contains(getOperandEntry(E, 0)) ||
              MinBWs.contains(getOperandEntry(E, 1))) &&
             "Expected item in MinBWs.");
      // ...
      LHS = Builder.CreateIntCast(LHS, VecTy, GetOperandSignedness(0));
      // ...
      RHS = Builder.CreateIntCast(RHS, VecTy, GetOperandSignedness(1));
    }
    Value *V = Builder.CreateBinOp(
        /* ... */);
    // ...
    if (!MinBWs.contains(E) && ShuffleOrOp == Instruction::Sub &&
        any_of(E->Scalars, [&](Value *V) {
          return isa<PoisonValue>(V) ||
                 (E->hasCopyableElements() && E->isCopyableElement(V)) ||
                 isCommutative(cast<Instruction>(V));
        }))
      I->setHasNoUnsignedWrap(/*b=*/false);
    // ...
    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
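  // Loads are emitted according to the entry state: a plain wide load for
  // consecutive accesses (Vectorize), a (masked) wide load plus compressing
  // shuffle (CompressVectorize), an experimental_vp_strided_load intrinsic
  // (StridedVectorize), or a masked gather for arbitrary pointers
  // (ScatterVectorize).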
  case Instruction::Load: {
    // ...
    setInsertPointAfterBundle(E);
    // ...
    FixedVectorType *StridedLoadTy = nullptr;
    Value *PO = LI->getPointerOperand();
    if (E->State == TreeEntry::Vectorize) {
      NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign());
    } else if (E->State == TreeEntry::CompressVectorize) {
      auto [CompressMask, LoadVecTy, InterleaveFactor, IsMasked] =
          CompressEntryToData.at(E);
      Align CommonAlignment = LI->getAlign();
      // ...
      // Build the constant mask: a lane is live iff the compress mask
      // reads it.
      for (int I : CompressMask)
        MaskValues[I] = /* true */;
      // ...
      MaskValues = replicateMask(MaskValues, VecTy->getNumElements());
      // ...
      NewLI = Builder.CreateMaskedLoad(LoadVecTy, PO, CommonAlignment,
                                       /* ... */);
      // ...
      NewLI = Builder.CreateAlignedLoad(LoadVecTy, PO, CommonAlignment);
      // ...
    } else if (E->State == TreeEntry::StridedVectorize) {
      // ...
      PO = IsReverseOrder ? PtrN : Ptr0;
      Type *StrideTy = DL->getIndexType(PO->getType());
      // ...
      const StridedPtrInfo &SPtrInfo = TreeEntryToStridedPtrInfoMap.at(E);
      StridedLoadTy = SPtrInfo.Ty;
      assert(StridedLoadTy && "Missing StridedPointerInfo for tree entry.");
      unsigned StridedLoadEC =
          /* ... */;
      Value *Stride = SPtrInfo.StrideVal;
      if (!Stride) {
        const SCEV *StrideSCEV = SPtrInfo.StrideSCEV;
        assert(StrideSCEV && "Neither StrideVal nor StrideSCEV were set.");
        SCEVExpander Expander(*SE, *DL, "strided-load-vec");
        Stride = Expander.expandCodeFor(StrideSCEV, StrideSCEV->getType(),
                                        &*Builder.GetInsertPoint());
      }
      Value *NewStride =
          Builder.CreateIntCast(Stride, StrideTy, /*isSigned=*/true);
      StrideVal = Builder.CreateMul(
          NewStride, ConstantInt::get(
                         StrideTy, (IsReverseOrder ? -1 : 1) *
                                       static_cast<int>(
                                           DL->getTypeAllocSize(ScalarTy))));
      // ...
      auto *Inst = Builder.CreateIntrinsic(
          Intrinsic::experimental_vp_strided_load,
          {StridedLoadTy, PO->getType(), StrideTy},
          {PO, StrideVal, /* all-ones mask, */
           Builder.getInt32(StridedLoadEC)});
      Inst->addParamAttr(
          /*ArgNo=*/0,
          Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
      NewLI = Inst;
    } else {
      assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
      Value *VecPtr = vectorizeOperand(E, 0);
      // ...
      // If the scalar type is itself a small vector, expand the pointers to
      // per-element pointers first.
      unsigned ScalarTyNumElements =
          /* ... */;
      unsigned VecTyNumElements =
          /* ... */;
      assert(VecTyNumElements % ScalarTyNumElements == 0 &&
             "Cannot expand getelementptr.");
      unsigned VF = VecTyNumElements / ScalarTyNumElements;
      // ...
      return Builder.getInt64(I % ScalarTyNumElements);
      // ...
      VecPtr = Builder.CreateGEP(
          VecTy->getElementType(),
          Builder.CreateShuffleVector(
              /* ... */),
          /* ... */);
      // ...
      NewLI = Builder.CreateMaskedGather(VecTy, VecPtr, CommonAlignment);
    }
    Value *V = E->State == TreeEntry::CompressVectorize
                   ? /* ... */
                   : /* ... */;
    if (StridedLoadTy != VecTy)
      V = Builder.CreateBitOrPointerCast(V, VecTy);
    V = FinalShuffle(V, E);
    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
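  // Stores: consecutive stores become one wide aligned store; reversed
  // strided stores go through the experimental_vp_strided_store intrinsic
  // with a negative element stride.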
  case Instruction::Store: {
    // ...
    setInsertPointAfterBundle(E);

    Value *VecValue = vectorizeOperand(E, 0);
    if (VecValue->getType() != VecTy)
      VecValue =
          Builder.CreateIntCast(VecValue, VecTy, GetOperandSignedness(0));
    VecValue = FinalShuffle(VecValue, E);

    Value *Ptr = SI->getPointerOperand();
    // ...
    if (E->State == TreeEntry::Vectorize) {
      ST = Builder.CreateAlignedStore(VecValue, Ptr, SI->getAlign());
    } else {
      assert(E->State == TreeEntry::StridedVectorize &&
             "Expected either strided or consecutive stores.");
      if (!E->ReorderIndices.empty()) {
        // ...
        Ptr = SI->getPointerOperand();
      }
      // ...
      Type *StrideTy = DL->getIndexType(SI->getPointerOperandType());
      auto *Inst = Builder.CreateIntrinsic(
          Intrinsic::experimental_vp_strided_store,
          {VecTy, Ptr->getType(), StrideTy},
          {VecValue, Ptr,
           ConstantInt::get(
               StrideTy,
               -static_cast<int>(DL->getTypeAllocSize(ScalarTy))),
           Builder.getAllOnesMask(VecTy->getElementCount()),
           Builder.getInt32(E->Scalars.size())});
      Inst->addParamAttr(
          /* ... */,
          Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
      // ...
    }
    // ...
    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::GetElementPtr: {
    // ...
    setInsertPointAfterBundle(E);

    Value *Op0 = vectorizeOperand(E, 0);
    // ...
    for (int J = 1, N = GEP0->getNumOperands(); J < N; ++J) {
      Value *OpVec = vectorizeOperand(E, J);
      // ...
    }
    Value *V = Builder.CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs);
    // ...
    for (Value *V : E->Scalars) {
      // ...
    }
    // ...
    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
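  // Calls are vectorized either through a vector intrinsic or through a
  // vector library function, whichever the cost model found cheaper; scalar
  // operands of intrinsics are passed through unchanged.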
  case Instruction::Call: {
    // ...
    setInsertPointAfterBundle(E);
    // ...
    SmallVector<Type *> ArgTys = /* ... */(
        CI, ID, VecTy->getNumElements(),
        It != MinBWs.end() ? It->second.first : 0, TTI);
    // ...
    bool UseIntrinsic = /* ... */
                        VecCallCosts.first <= VecCallCosts.second;

    Value *ScalarArg = nullptr;
    // ...
    for (/* each argument I */) {
      // Some intrinsics take scalar arguments; do not vectorize those.
      if (/* ... */) {
        ScalarArg = CEI->getArgOperand(I);
        // ...
        if (ID == Intrinsic::abs && It != MinBWs.end() &&
            It->second.first < DL->getTypeSizeInBits(CEI->getType()))
          ScalarArg = Builder.getFalse();
        // ...
        continue;
      }
      Value *OpVec = vectorizeOperand(E, I);
      ScalarArg = CEI->getArgOperand(I);
      if (/* element types differ and */
          It == MinBWs.end()) {
        // ...
        OpVec = Builder.CreateIntCast(OpVec, CastTy, GetOperandSignedness(I));
      } else if (It != MinBWs.end()) {
        OpVec = Builder.CreateIntCast(OpVec, VecTy, GetOperandSignedness(I));
      }
      // ...
    }

    if (!UseIntrinsic) {
      // ...
      CF = VFDatabase(*CI).getVectorizedFunction(Shape);
      // ...
    }
    // ...
    Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);
    // ...
    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::ShuffleVector: {
    Value *V;
    if (/* plain (non-alternate) shuffle node */) {
      setInsertPointAfterBundle(E);
      Value *Src = vectorizeOperand(E, 0);
      // A shuffle of a shuffle folds into one shuffle with the composed
      // mask.
      if (auto *SVSrc = dyn_cast<ShuffleVectorInst>(Src)) {
        SmallVector<int> NewMask(ThisMask.size());
        transform(ThisMask, NewMask.begin(), [&SVSrc](int Mask) {
          return SVSrc->getShuffleMask()[Mask];
        });
        V = Builder.CreateShuffleVector(SVSrc->getOperand(0),
                                        SVSrc->getOperand(1), NewMask);
      } else {
        V = Builder.CreateShuffleVector(Src, ThisMask);
      }
      // ...
      V = FinalShuffle(V, E);
    } else {
      assert(/* alt-shuffle of two binops / two casts / two cmps ... */
             "Invalid Shuffle Vector Operand");
      Value *LHS = nullptr, *RHS = nullptr;
      if (/* binop or cmp */) {
        setInsertPointAfterBundle(E);
        LHS = vectorizeOperand(E, 0);
        RHS = vectorizeOperand(E, 1);
      } else {
        setInsertPointAfterBundle(E);
        LHS = vectorizeOperand(E, 0);
      }
      if (/* operand types need adjustment */) {
        assert((It != MinBWs.end() ||
                getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
                getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
                MinBWs.contains(getOperandEntry(E, 0)) ||
                MinBWs.contains(getOperandEntry(E, 1))) &&
               "Expected item in MinBWs.");
        Type *CastTy = VecTy;
        // For cmps, cast to the wider of the two operand types.
        if (/* ... L narrower than R ... */
                ->getIntegerBitWidth())
          // ...
        if (LHS->getType() != CastTy)
          LHS = Builder.CreateIntCast(LHS, CastTy, GetOperandSignedness(0));
        if (RHS && RHS->getType() != CastTy)
          RHS = Builder.CreateIntCast(RHS, CastTy, GetOperandSignedness(1));
      }
      Value *V0, *V1;
      if (/* binop alt-shuffle */) {
        V0 = Builder.CreateBinOp(
            /* main opcode */, LHS, RHS);
        V1 = Builder.CreateBinOp(
            /* alternate opcode */, LHS, RHS);
      } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
        V0 = Builder.CreateCmp(CI0->getPredicate(), LHS, RHS);
        // ...
        V1 = Builder.CreateCmp(AltPred, LHS, RHS);
      } else {
        // Alternate casts: if the result is not wider than the source,
        // a single (possibly truncating) cast suffices.
        unsigned SrcBWSz = DL->getTypeSizeInBits(
            /* source element type */);
        unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
        if (BWSz <= SrcBWSz) {
          if (BWSz < SrcBWSz)
            LHS = Builder.CreateIntCast(LHS, VecTy, It->second.first);
          assert(/* ... */
                 "Expected same type as operand.");
          // ...
          E->VectorizedValue = LHS;
          ++NumVectorInstructions;
          return LHS;
        }
        V0 = Builder.CreateCast(
            /* main cast opcode */, LHS, VecTy);
        V1 = Builder.CreateCast(
            /* alternate cast opcode */, LHS, VecTy);
      }
      // Add V0 and V1 to the later CSE analysis to find and remove matching
      // instructions, if any.
      for (Value *V : {V0, V1}) {
        if (auto *I = dyn_cast<Instruction>(V)) {
          GatherShuffleExtractSeq.insert(I);
          CSEBlocks.insert(I->getParent());
        }
      }

      // Create a shuffle picking the alternate operation lanes.
      SmallVector<int> Mask;
      E->buildAltOpShuffleMask(
          [E, this](Instruction *I) {
            assert(E->getMatchingMainOpOrAltOp(I) &&
                   "Unexpected main/alternate opcode");
            return /* ... */;
          },
          Mask, &OpScalars, &AltScalars);
      // ...
      auto DropNuwFlag = [&](Value *Vec, unsigned Opcode) {
        // ...
        if (I && Opcode == Instruction::Sub && !MinBWs.contains(E) &&
            any_of(E->Scalars, [&](Value *V) {
              if (isa<PoisonValue>(V))
                return false;
              if (E->hasCopyableElements() && E->isCopyableElement(V))
                return true;
              auto *IV = cast<Instruction>(V);
              return IV->getOpcode() == Instruction::Sub &&
                     isCommutative(IV);
            }))
          I->setHasNoUnsignedWrap(/*b=*/false);
      };
      DropNuwFlag(V0, E->getOpcode());
      DropNuwFlag(V1, E->getAltOpcode());
      // ...
      V = Builder.CreateShuffleVector(V0, V1, Mask);
      if (auto *I = dyn_cast<Instruction>(V)) {
        // ...
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
    }
    // ...
    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  default:
    llvm_unreachable("unknown inst");
  }
  return nullptr;
}
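// Entry point of code generation for the whole tree: schedules all blocks,
// emits the vectorized code for every tree entry (gathered loads first,
// postponed gather nodes last), and then rewrites the external uses of the
// scalars with extracts from the vector values.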
Value *BoUpSLP::vectorizeTree(
    /* ... */,
    ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales) {
  // ...
  // The entry-to-last-instruction cache can be invalidated by scheduling;
  // rebuild it below.
  EntryToLastInstruction.clear();
  // All blocks must be scheduled before any instructions are inserted.
  for (auto &BSIter : BlocksSchedules)
    scheduleBlock(*this, BSIter.second.get());
  // Cache last instructions for the nodes to avoid side effects, which may
  // appear during vectorization, like extra uses, etc.
  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    if (TE->isGather())
      continue;
    (void)getLastInstructionInBundle(TE.get());
  }

  if (ReductionRoot)
    Builder.SetInsertPoint(ReductionRoot->getParent(),
                           /* ... */);
  else
    Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());

  // Cache the insertion points also for gather nodes whose user entry's
  // scalars are all used outside their block.
  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    if (TE->isGather() && !TE->VectorizedValue && TE->UserTreeIndex.UserTE &&
        TE->UserTreeIndex.UserTE->hasState() &&
        TE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
        (TE->UserTreeIndex.UserTE->getOpcode() != Instruction::PHI ||
         TE->UserTreeIndex.UserTE->isAltShuffle()) &&
        !TE->UserTreeIndex.UserTE->hasCopyableElements() &&
        all_of(TE->UserTreeIndex.UserTE->Scalars,
               [](Value *V) { return isUsedOutsideBlock(V); })) {
      // ...
      getLastInstructionInBundle(TE->UserTreeIndex.UserTE);
      // ...
    }
  }
  for (auto &Entry : GatherEntries) {
    // ...
    Builder.SetInsertPoint(Entry.second);
    Builder.SetCurrentDebugLocation(Entry.second->getDebugLoc());
    // ...
  }
  // Emit gathered loads first to emit better code for the users of those
  // gathered loads.
  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    if (GatheredLoadsEntriesFirst.has_value() &&
        TE->Idx >= *GatheredLoadsEntriesFirst && !TE->VectorizedValue &&
        (!TE->isGather() || TE->UserTreeIndex)) {
      assert((TE->UserTreeIndex ||
              (TE->getOpcode() == Instruction::Load && !TE->isGather())) &&
             "Expected gathered load node.");
      // ...
    }
  }
  // ...
  // Emit the postponed gather nodes, now that their dependencies are
  // vectorized, and replace the stub values created by needToDelay().
  for (const TreeEntry *E : PostponedNodes) {
    auto *TE = const_cast<TreeEntry *>(E);
    // ...
    TE->VectorizedValue = nullptr;
    // ...
    // If the user is a PHI node, its vector code has to be inserted right
    // before the block terminator / the earliest user.
    if (/* ... */ ||
        (TE->UserTreeIndex.UserTE->hasState() &&
         TE->UserTreeIndex.UserTE->getOpcode() == Instruction::PHI)) {
      // ...
      if (UI->comesBefore(InsertPt))
        InsertPt = UI;
      // ...
      Builder.SetInsertPoint(InsertPt);
    } else {
      Builder.SetInsertPoint(PrevVec);
    }
    Builder.SetCurrentDebugLocation(UserI->getDebugLoc());
    // ...
    if (auto *VecI = dyn_cast<Instruction>(Vec);
        VecI && VecI->getParent() == Builder.GetInsertBlock() &&
        Builder.GetInsertPoint()->comesBefore(VecI))
      VecI->moveBeforePreserving(*Builder.GetInsertBlock(),
                                 Builder.GetInsertPoint());
    if (Vec->getType() != PrevVec->getType()) {
      assert(Vec->getType()->isIntOrIntVectorTy() &&
             PrevVec->getType()->isIntOrIntVectorTy() &&
             "Expected integer vector types only.");
      // Recover the signedness of the minimized value from any tree entry
      // or gather node containing it.
      std::optional<bool> IsSigned;
      for (Value *V : TE->Scalars) {
        // ...
        for (const TreeEntry *MNTE : getTreeEntries(V)) {
          auto It = MinBWs.find(MNTE);
          if (It != MinBWs.end()) {
            IsSigned = IsSigned.value_or(false) || It->second.second;
            // ...
          }
        }
        if (IsSigned.value_or(false))
          break;
        for (const TreeEntry *BVE : ValueToGatherNodes.lookup(V)) {
          auto It = MinBWs.find(BVE);
          if (It != MinBWs.end()) {
            IsSigned = IsSigned.value_or(false) || It->second.second;
            // ...
          }
        }
        if (IsSigned.value_or(false))
          break;
        // ...
        IsSigned =
            IsSigned.value_or(false) ||
            /* ... */;
        // ...
        if (IsSigned.value_or(false))
          break;
      }
      if (IsSigned.value_or(false)) {
        // Final attempt - check the user node of the gather node.
        auto It = MinBWs.find(TE->UserTreeIndex.UserTE);
        if (It != MinBWs.end())
          IsSigned = It->second.second;
      }
      assert(/* ... */
             "Expected user node or perfect diamond match in MinBWs.");
      Vec = Builder.CreateIntCast(Vec, PrevVec->getType(), *IsSigned);
    }
    PrevVec->replaceAllUsesWith(Vec);
    PostponedValues.try_emplace(Vec).first->second.push_back(TE);
    // Replace the stub vector node, if it was used before for one of the
    // buildvector nodes already.
    auto It = PostponedValues.find(PrevVec);
    if (It != PostponedValues.end()) {
      for (TreeEntry *VTE : It->getSecond())
        VTE->VectorizedValue = Vec;
    }
    // ...
  }
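  // Rewrite every externally used scalar with an extractelement from its
  // vectorized counterpart, caching the extracts per basic block in
  // ScalarToEEs and extending the result back to the original scalar type
  // when the entry was minimized in MinBWs.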
  for (const auto &ExternalUse : ExternalUses) {
    Value *Scalar = ExternalUse.Scalar;
    const TreeEntry *E = &ExternalUse.E;
    assert(E && "Invalid scalar");
    assert(!E->isGather() && "Extracting from a gather list");
    if (E->getOpcode() == Instruction::GetElementPtr &&
    Value *Vec = E->VectorizedValue;
    assert(Vec && "Can't find vectorizable value");
    Value *Lane = Builder.getInt32(ExternalUse.Lane);
    auto ExtractAndExtendIfNeeded = [&](Value *Vec) {
      if (Scalar->getType() != Vec->getType()) {
        Value *Ex = nullptr;
        Value *ExV = nullptr;
        bool ReplaceInst = Inst && ExternalUsesAsOriginalScalar.contains(Inst);
        auto It = ScalarToEEs.find(Scalar);
        if (It != ScalarToEEs.end()) {
          auto EEIt = It->second.find(ReplaceInst ? Inst->getParent()
                                                  : Builder.GetInsertBlock());
          if (EEIt != It->second.end()) {
            Value *PrevV = EEIt->second.first;
                I && !ReplaceInst &&
                Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() &&
                Builder.GetInsertPoint()->comesBefore(I)) {
              I->moveBefore(*Builder.GetInsertPoint()->getParent(),
                            Builder.GetInsertPoint());
            ExV = EEIt->second.second ? EEIt->second.second : Ex;
              IgnoredExtracts.insert(EE);
            auto *CloneInst = Inst->clone();
            CloneInst->insertBefore(Inst->getIterator());
            if (Inst->hasName())
              CloneInst->takeName(Inst);
            Value *V = ES->getVectorOperand();
              V = ETEs.front()->VectorizedValue;
            if (!IV || IV == Vec ||
                IV->getParent() != IVec->getParent() ||
                IV->comesBefore(IVec))
              Ex = Builder.CreateExtractElement(V, ES->getIndexOperand());
            Ex = Builder.CreateExtractElement(Vec, Lane);
          } else if (auto *VecTy =
            unsigned VecTyNumElements = VecTy->getNumElements();
                ExternalUse.Lane * VecTyNumElements);
            Ex = Builder.CreateExtractElement(Vec, Lane);
        if (Scalar->getType() != Ex->getType())
          ExV = Builder.CreateIntCast(
                               : &F->getEntryBlock(),
                           std::make_pair(Ex, ExV));
          GatherShuffleExtractSeq.insert(ExI);
          CSEBlocks.insert(ExI->getParent());
20587 "In-tree scalar of vector type is not insertelement?");
20596 if (!ScalarsWithNullptrUser.
insert(Scalar).second)
20599 (ExternallyUsedValues.
count(Scalar) ||
20600 ExternalUsesWithNonUsers.count(Scalar) ||
20601 ExternalUsesAsOriginalScalar.contains(Scalar) ||
20605 if (ExternalUsesAsOriginalScalar.contains(U))
20607 ArrayRef<TreeEntry *> UseEntries = getTreeEntries(U);
20608 return !UseEntries.empty() &&
20609 (E->State == TreeEntry::Vectorize ||
20610 E->State == TreeEntry::StridedVectorize ||
20611 E->State == TreeEntry::CompressVectorize) &&
20612 any_of(UseEntries, [&, TTI = TTI](TreeEntry *UseEntry) {
20613 return (UseEntry->State == TreeEntry::Vectorize ||
20615 TreeEntry::StridedVectorize ||
20617 TreeEntry::CompressVectorize) &&
20618 doesInTreeUserNeedToExtract(
20619 Scalar, getRootEntryInstruction(*UseEntry),
20623 "Scalar with nullptr User must be registered in "
20624 "ExternallyUsedValues map or remain as scalar in vectorized "
20628 if (
PHI->getParent()->isLandingPad())
20629 Builder.SetInsertPoint(
20632 PHI->getParent()->getLandingPadInst()->getIterator()));
20634 Builder.SetInsertPoint(
PHI->getParent(),
20635 PHI->getParent()->getFirstNonPHIIt());
20637 Builder.SetInsertPoint(VecI->getParent(),
20638 std::next(VecI->getIterator()));
20641 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
20643 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
20645 if (Scalar != NewInst) {
20648 "Extractelements should not be replaced.");
20649 Scalar->replaceAllUsesWith(NewInst);
        if (!UsedInserts.insert(VU).second)
          auto BWIt = MinBWs.find(E);
            auto *ScalarTy = FTy->getElementType();
            auto Key = std::make_pair(Vec, ScalarTy);
            auto VecIt = VectorCasts.find(Key);
            if (VecIt == VectorCasts.end()) {
                if (IVec->getParent()->isLandingPad())
                  Builder.SetInsertPoint(IVec->getParent(),
                                         std::next(IVec->getParent()
                                                       ->getLandingPadInst()
                  Builder.SetInsertPoint(
                      IVec->getParent()->getFirstNonPHIOrDbgOrLifetime());
                Builder.SetInsertPoint(IVec->getNextNode());
              Vec = Builder.CreateIntCast(
                  BWIt->second.second);
              Vec = VecIt->second;
            ShuffledInserts, [VU](const ShuffledInsertData<Value *> &Data) {
          unsigned Idx = *InsertIdx;
          if (It == ShuffledInserts.end()) {
            It = std::next(ShuffledInserts.begin(),
                           ShuffledInserts.size() - 1);
          Mask[Idx] = ExternalUse.Lane;
      for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
        if (PH->getIncomingValue(I) == Scalar) {
              PH->getIncomingBlock(I)->getTerminator();
            Builder.SetInsertPoint(VecI->getParent(),
                                   std::next(VecI->getIterator()));
            Builder.SetInsertPoint(PH->getIncomingBlock(I)->getTerminator());
          Value *NewInst = ExtractAndExtendIfNeeded(Vec);
          PH->setOperand(I, NewInst);
      Value *NewInst = ExtractAndExtendIfNeeded(Vec);
      Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
      Value *NewInst = ExtractAndExtendIfNeeded(Vec);
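  // Helper building a shuffle of up to two source vectors: mask indices below
  // VF select from the first source, indices at VF and above are rebased by
  // VF (CombinedMask2) and select from the second source.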
    for (int I = 0, E = Mask.size(); I < E; ++I) {
        CombinedMask1[I] = Mask[I];
        CombinedMask2[I] = Mask[I] - VF;
    ShuffleInstructionBuilder ShuffleBuilder(
    ShuffleBuilder.add(V1, CombinedMask1);
    ShuffleBuilder.add(V2, CombinedMask2);
    return ShuffleBuilder.finalize({}, {}, {});
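  // ResizeToVF normalizes a vector to the width implied by Mask. If the mask
  // references lanes at or past VF a shuffle is emitted; the boolean in the
  // returned pair appears to signal that the mask has already been applied
  // (inferred; parts of this listing are elided).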
  auto &&ResizeToVF = [&CreateShuffle](Value *Vec, ArrayRef<int> Mask,
                                       bool ForSingleMask) {
    unsigned VF = Mask.size();
    if (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); })) {
      Vec = CreateShuffle(Vec, nullptr, Mask);
      return std::make_pair(Vec, true);
    if (!ForSingleMask) {
      for (unsigned I = 0; I < VF; ++I) {
      Vec = CreateShuffle(Vec, nullptr, ResizeMask);
    return std::make_pair(Vec, false);
  for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
    InsertElementInst *FirstInsert = ShuffledInserts[I].InsertElements.front();
    InsertElementInst *LastInsert = ShuffledInserts[I].InsertElements.back();
    Builder.SetInsertPoint(LastInsert);
    auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
          return cast<VectorType>(Vec->getType())
              ->getElementCount()
              .getKnownMinValue();
        [FirstInsert, &CreateShuffle](ArrayRef<int> Mask,
          assert((Vals.size() == 1 || Vals.size() == 2) &&
                 "Expected exactly 1 or 2 input values.");
          if (Vals.size() == 1) {
            if (Mask.size() != cast<FixedVectorType>(Vals.front()->getType())
                                   ->getNumElements() ||
                !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
              return CreateShuffle(Vals.front(), nullptr, Mask);
            return Vals.front();
          return CreateShuffle(Vals.front() ? Vals.front()
                               Vals.back(), Mask);
    auto It = ShuffledInserts[I].InsertElements.rbegin();
    InsertElementInst *II = nullptr;
    if (It != ShuffledInserts[I].InsertElements.rend())
    while (It != ShuffledInserts[I].InsertElements.rend()) {
      assert(II && "Must be an insertelement instruction.");
      for (Instruction *II : reverse(Inserts)) {
        II->replaceUsesOfWith(II->getOperand(0), NewInst);
        if (II->getParent() == NewI->getParent() && II->comesBefore(NewI))
          II->moveAfter(NewI);
    for (InsertElementInst *IE : reverse(ShuffledInserts[I].InsertElements)) {
      IE->replaceUsesOfWith(IE->getOperand(0),
      IE->replaceUsesOfWith(IE->getOperand(1),
    CSEBlocks.insert(LastInsert->getParent());
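  // All vectorized scalars are now dead: walk every vectorized (non-gather)
  // tree entry and erase the original scalar instructions, taking care not to
  // delete out-of-tree values or extracts that were deliberately ignored.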
  for (auto &TEPtr : VectorizableTree) {
    TreeEntry *Entry = TEPtr.get();
    if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize)
    assert(Entry->VectorizedValue && "Can't find vectorizable value");
    for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
      if (Entry->getOpcode() == Instruction::GetElementPtr &&
          EE && IgnoredExtracts.contains(EE))
      for (User *U : Scalar->users()) {
          (UserIgnoreList && UserIgnoreList->contains(U)) ||
               "Deleting out-of-tree value");
      LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
    V->mergeDIAssignID(RemovedInsts);
  if (UserIgnoreList) {
    for (Instruction *I : RemovedInsts) {
      const TreeEntry *IE = getTreeEntries(I).front();
      if (IE->Idx != 0 &&
          !(VectorizableTree.front()->isGather() && IE->UserTreeIndex &&
            (ValueToGatherNodes.lookup(I).contains(
                 VectorizableTree.front().get()) ||
             (IE->UserTreeIndex.UserTE == VectorizableTree.front().get() &&
              IE->UserTreeIndex.EdgeIdx == UINT_MAX))) &&
          !(VectorizableTree.front()->State == TreeEntry::SplitVectorize &&
            IE->UserTreeIndex &&
          !(GatheredLoadsEntriesFirst.has_value() &&
            IE->Idx >= *GatheredLoadsEntriesFirst &&
            VectorizableTree.front()->isGather() &&
          !(!VectorizableTree.front()->isGather() &&
            VectorizableTree.front()->isCopyableElement(I)))
        bool IsPoisoningLogicalOp = isa<SelectInst>(U.getUser()) &&
                                    (match(U.getUser(), m_LogicalAnd()) ||
                                     match(U.getUser(), m_LogicalOr())) &&
                                    U.getOperandNo() == 0;
        if (IsPoisoningLogicalOp) {
          LogicalOpSelects.push_back(cast<SelectInst>(U.getUser()));
        return UserIgnoreList->contains(U.getUser());
  for (SelectInst *SI : LogicalOpSelects)
  Builder.ClearInsertionPoint();
  InstrElementSize.clear();
  const TreeEntry &RootTE = *VectorizableTree.front();
  Value *Vec = RootTE.VectorizedValue;
  if (auto It = MinBWs.find(&RootTE); ReductionBitWidth != 0 &&
                                      It != MinBWs.end() &&
                                      ReductionBitWidth != It->second.first) {
    IRBuilder<>::InsertPointGuard Guard(Builder);
    Builder.SetInsertPoint(ReductionRoot->getParent(),
                           ReductionRoot->getIterator());
    Vec = Builder.CreateIntCast(
        VectorType::get(Builder.getIntNTy(ReductionBitWidth),
        It->second.second);
  LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherShuffleExtractSeq.size()
                    << " gather sequences instructions.\n");
    Loop *L = LI->getLoopFor(I->getParent());
    BasicBlock *PreHeader = L->getLoopPreheader();
          auto *OpI = dyn_cast<Instruction>(V);
          return OpI && L->contains(OpI);
    CSEBlocks.insert(PreHeader);
  CSEWorkList.reserve(CSEBlocks.size());
    assert(DT->isReachableFromEntry(N));
    assert((A == B) == (A->getDFSNumIn() == B->getDFSNumIn()) &&
           "Different nodes should have different DFS numbers");
    return A->getDFSNumIn() < B->getDFSNumIn();
  auto &&IsIdenticalOrLessDefined = [TTI = TTI](Instruction *I1,
    if (I1->getType() != I2->getType())
      return I1->isIdenticalTo(I2);
    if (SI1->isIdenticalTo(SI2))
    for (int I = 0, E = SI1->getNumOperands(); I < E; ++I)
      if (SI1->getOperand(I) != SI2->getOperand(I))
    NewMask.assign(SI2->getShuffleMask().begin(), SI2->getShuffleMask().end());
    unsigned LastUndefsCnt = 0;
    for (int I = 0, E = NewMask.size(); I < E; ++I) {
          NewMask[I] != SM1[I])
        NewMask[I] = SM1[I];
    return SM1.size() - LastUndefsCnt > 1 &&
                                 SM1.size() - LastUndefsCnt));
  for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
    assert((I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
           "Worklist not sorted properly!");
          !GatherShuffleExtractSeq.contains(&In))
      bool Replaced = false;
        if (IsIdenticalOrLessDefined(&In, V, NewMask) &&
            DT->dominates(V->getParent(), In.getParent())) {
          In.replaceAllUsesWith(V);
          if (!NewMask.empty())
            SI->setShuffleMask(NewMask);
                   GatherShuffleExtractSeq.contains(V) &&
                   IsIdenticalOrLessDefined(V, &In, NewMask) &&
                   DT->dominates(In.getParent(), V->getParent())) {
          V->replaceAllUsesWith(&In);
          if (!NewMask.empty())
            SI->setShuffleMask(NewMask);
        Visited.push_back(&In);
  GatherShuffleExtractSeq.clear();
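// buildBundle groups the scalars in VL into a new ScheduleBundle, adding a
// ScheduleCopyableData member for copyable elements and the per-instruction
// ScheduleData member otherwise.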
BoUpSLP::ScheduleBundle &BoUpSLP::BlockScheduling::buildBundle(
  ScheduledBundlesList.emplace_back(std::make_unique<ScheduleBundle>());
  for (Value *V : VL) {
    if (S.isNonSchedulable(V))
    if (S.isCopyableElement(V)) {
      ScheduleCopyableData &SD =
          addScheduleCopyableData(EI, I, SchedulingRegionID, *BundlePtr);
      BundlePtr->add(&SD);
    ScheduleData *BundleMember = getScheduleData(V);
    assert(BundleMember && "no ScheduleData for bundle member "
                           "(maybe not in same basic block)");
    BundlePtr->add(BundleMember);
    ScheduledBundles.try_emplace(I).first->getSecond().push_back(
  assert(BundlePtr && *BundlePtr && "Failed to find schedule bundle");
std::optional<BoUpSLP::ScheduleBundle *>
    const InstructionsState &S,
  if (S.areInstructionsWithCopyableElements() && EI && EI.UserTE->hasState() &&
      EI.UserTE->doesNotNeedToSchedule() &&
      EI.UserTE->getOpcode() != Instruction::PHI &&
        auto *I = dyn_cast<Instruction>(V);
        if (!I || I->hasOneUser())
        for (User *U : I->users()) {
          auto *UI = cast<Instruction>(U);
          if (isa<BinaryOperator>(UI))
    return std::nullopt;
  if (S.areInstructionsWithCopyableElements() && EI && EI.UserTE->hasState() &&
      EI.UserTE->hasCopyableElements() &&
      EI.UserTE->getMainOp()->getParent() == S.getMainOp()->getParent() &&
        if (S.isCopyableElement(V))
    return std::nullopt;
  if (S.areInstructionsWithCopyableElements() && any_of(VL, [&](Value *V) {
    return std::nullopt;
  if (S.areInstructionsWithCopyableElements() && EI) {
    bool IsNonSchedulableWithParentPhiNode =
        EI.UserTE->doesNotNeedToSchedule() && EI.UserTE->UserTreeIndex &&
        EI.UserTE->UserTreeIndex.UserTE->hasState() &&
        EI.UserTE->UserTreeIndex.UserTE->State != TreeEntry::SplitVectorize &&
        EI.UserTE->UserTreeIndex.UserTE->getOpcode() == Instruction::PHI;
    if (IsNonSchedulableWithParentPhiNode) {
      SmallSet<std::pair<Value *, Value *>, 4> Values;
      for (const auto [Idx, V] :
           enumerate(EI.UserTE->UserTreeIndex.UserTE->Scalars)) {
        Value *Op = EI.UserTE->UserTreeIndex.UserTE->getOperand(
            EI.UserTE->UserTreeIndex.EdgeIdx)[Idx];
        if (!Values.insert(std::make_pair(V, Op)).second)
          return std::nullopt;
  bool HasCopyables = S.areInstructionsWithCopyableElements();
      all_of(VL, [&](Value *V) { return S.isNonSchedulable(V); }))) {
    SmallVector<ScheduleData *> ControlDependentMembers;
    for (Value *V : VL) {
      if (!I || (HasCopyables && S.isCopyableElement(V)))
      SmallDenseMap<std::pair<Instruction *, Value *>, unsigned> UserOpToNumOps;
      for (const Use &U : I->operands()) {
            .first->getSecond();
            Op && areAllOperandsReplacedByCopyableData(I, Op, *SLP, NumOps)) {
          if (ScheduleData *OpSD = getScheduleData(Op);
              OpSD && OpSD->hasValidDependencies()) {
            OpSD->clearDirectDependencies();
            if (RegionHasStackSave ||
              ControlDependentMembers.push_back(OpSD);
    if (!ControlDependentMembers.empty()) {
      ScheduleBundle Invalid = ScheduleBundle::invalid();
      calculateDependencies(Invalid, true, SLP,
                            ControlDependentMembers);
  LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.getMainOp() << "\n");
  auto TryScheduleBundleImpl = [=](bool ReSchedule, ScheduleBundle &Bundle) {
    SmallVector<ScheduleData *> ControlDependentMembers;
    auto CheckIfNeedToClearDeps = [&](ScheduleBundle &Bundle) {
      SmallDenseMap<std::pair<Instruction *, Value *>, unsigned> UserOpToNumOps;
      for (ScheduleEntity *SE : Bundle.getBundle()) {
        if (ScheduleData *BundleMember = getScheduleData(SD->getInst());
            BundleMember && BundleMember->hasValidDependencies()) {
          BundleMember->clearDirectDependencies();
          if (RegionHasStackSave ||
                  BundleMember->getInst()))
            ControlDependentMembers.push_back(BundleMember);
        if (SD->hasValidDependencies() &&
            (!S.areInstructionsWithCopyableElements() ||
             !S.isCopyableElement(SD->getInst())) &&
            !getScheduleCopyableData(SD->getInst()).empty() && EI.UserTE &&
            EI.UserTE->hasState() &&
            (!EI.UserTE->hasCopyableElements() ||
             !EI.UserTE->isCopyableElement(SD->getInst())))
          SD->clearDirectDependencies();
        for (const Use &U : SD->getInst()->operands()) {
                  .try_emplace(std::make_pair(SD->getInst(), U.get()), 0)
                  .first->getSecond();
              Op && areAllOperandsReplacedByCopyableData(SD->getInst(), Op,
            if (ScheduleData *OpSD = getScheduleData(Op);
                OpSD && OpSD->hasValidDependencies()) {
              OpSD->clearDirectDependencies();
              if (RegionHasStackSave ||
                ControlDependentMembers.push_back(OpSD);
    if (OldScheduleEnd && ScheduleEnd != OldScheduleEnd) {
      for_each(ScheduleDataMap, [&](auto &P) {
        if (BB != P.first->getParent())
        ScheduleData *SD = P.second;
        if (isInSchedulingRegion(*SD))
          SD->clearDependencies();
      for_each(ScheduleCopyableDataMapByInst, [&](auto &P) {
        for_each(P.second, [&](ScheduleCopyableData *SD) {
          if (isInSchedulingRegion(*SD))
            SD->clearDependencies();
    if (Bundle && !Bundle.getBundle().empty()) {
      if (S.areInstructionsWithCopyableElements() ||
          !ScheduleCopyableDataMap.empty())
        CheckIfNeedToClearDeps(Bundle);
      LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << Bundle << " in block "
      calculateDependencies(Bundle, !ReSchedule, SLP,
                            ControlDependentMembers);
    } else if (!ControlDependentMembers.empty()) {
      ScheduleBundle Invalid = ScheduleBundle::invalid();
      calculateDependencies(Invalid, !ReSchedule, SLP,
                            ControlDependentMembers);
      initialFillReadyList(ReadyInsts);
    while (((!Bundle && ReSchedule) || (Bundle && !Bundle.isReady())) &&
           !ReadyInsts.empty()) {
      ScheduleEntity *Picked = ReadyInsts.pop_back_val();
      assert(Picked->isReady() && "must be ready to schedule");
      schedule(*SLP, S, EI, Picked, ReadyInsts);
      if (Picked == &Bundle)
  for (Value *V : VL) {
    if (S.isNonSchedulable(V))
    if (!extendSchedulingRegion(V, S)) {
      ScheduleBundle Invalid = ScheduleBundle::invalid();
      TryScheduleBundleImpl(false, Invalid);
      return std::nullopt;
  bool ReSchedule = false;
  for (Value *V : VL) {
    if (S.isNonSchedulable(V))
    if (!CopyableData.empty()) {
      for (ScheduleCopyableData *SD : CopyableData)
        ReadyInsts.remove(SD);
    ScheduleData *BundleMember = getScheduleData(V);
    assert((BundleMember || S.isCopyableElement(V)) &&
           "no ScheduleData for bundle member (maybe not in same basic block)");
      ReadyInsts.remove(BundleMember);
        !Bundles.empty()) {
      for (ScheduleBundle *B : Bundles)
        ReadyInsts.remove(B);
    if (!S.isCopyableElement(V) && !BundleMember->isScheduled())
    LLVM_DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
                      << " was already scheduled\n");
  ScheduleBundle &Bundle = buildBundle(VL, S, EI);
  TryScheduleBundleImpl(ReSchedule, Bundle);
  if (!Bundle.isReady()) {
    for (ScheduleEntity *BD : Bundle.getBundle()) {
      if (BD->isReady()) {
        if (Bundles.empty()) {
          ReadyInsts.insert(BD);
        for (ScheduleBundle *B : Bundles)
          ReadyInsts.insert(B);
    ScheduledBundlesList.pop_back();
    SmallVector<ScheduleData *> ControlDependentMembers;
    for (Value *V : VL) {
      if (S.isNonSchedulable(V))
      if (S.isCopyableElement(I)) {
        auto KV = std::make_pair(EI, I);
        assert(ScheduleCopyableDataMap.contains(KV) &&
               "no ScheduleCopyableData for copyable element");
        ScheduleCopyableData *SD =
            ScheduleCopyableDataMapByInst.find(I)->getSecond().pop_back_val();
        ScheduleCopyableDataMapByUsers[I].remove(SD);
          const auto *It = find(Op, I);
          assert(It != Op.end() && "Lane not set");
          SmallPtrSet<Instruction *, 4> Visited;
            int Lane = std::distance(Op.begin(), It);
            assert(Lane >= 0 && "Lane not set");
                !EI.UserTE->ReorderIndices.empty())
              Lane = EI.UserTE->ReorderIndices[Lane];
            assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
                   "Couldn't find extract lane");
            if (!Visited.insert(In).second) {
            ScheduleCopyableDataMapByInstUser
                [std::make_pair(std::make_pair(In, EI.EdgeIdx), I)]
          } while (It != Op.end());
        if (ScheduleCopyableData *UserCD = getScheduleCopyableData(UserEI, I))
          ScheduleCopyableDataMapByUsers[I].insert(UserCD);
        if (ScheduleCopyableDataMapByUsers[I].empty())
          ScheduleCopyableDataMapByUsers.erase(I);
        ScheduleCopyableDataMap.erase(KV);
        if (ScheduleData *OpSD = getScheduleData(I);
            OpSD && OpSD->hasValidDependencies()) {
          OpSD->clearDirectDependencies();
          if (RegionHasStackSave ||
            ControlDependentMembers.push_back(OpSD);
      ScheduledBundles.find(I)->getSecond().pop_back();
    if (!ControlDependentMembers.empty()) {
      ScheduleBundle Invalid = ScheduleBundle::invalid();
      calculateDependencies(Invalid, false, SLP,
                            ControlDependentMembers);
    return std::nullopt;
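// ScheduleData objects are allocated in fixed-size chunks so that pointers to
// them stay stable; a new chunk is created only when the current one is full.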
BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
  if (ChunkPos >= ChunkSize) {
    ScheduleDataChunks.push_back(std::make_unique<ScheduleData[]>(ChunkSize));
  return &(ScheduleDataChunks.back()[ChunkPos++]);
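// Extend the scheduling region so that it contains the instruction for V,
// searching simultaneously up and down from the current region while skipping
// assume-like intrinsics; fails once ScheduleRegionSizeLimit is exceeded.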
bool BoUpSLP::BlockScheduling::extendSchedulingRegion(
    Value *V, const InstructionsState &S) {
  assert(I && "bundle member must be an instruction");
  if (getScheduleData(I))
  if (!ScheduleStart) {
    initScheduleData(I, I->getNextNode(), nullptr, nullptr);
    ScheduleEnd = I->getNextNode();
    assert(ScheduleEnd && "tried to vectorize a terminator?");
    LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
      ++ScheduleStart->getIterator().getReverse();
    return II->isAssumeLikeIntrinsic();
  UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
  DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
  while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter != I &&
    if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
      LLVM_DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n");
    UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
    DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
  if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter == I)) {
    assert(I->getParent() == ScheduleStart->getParent() &&
           "Instruction is in wrong basic block.");
    initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
  assert((UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter == I)) &&
         "Expected to reach top of the basic block or instruction down the "
  assert(I->getParent() == ScheduleEnd->getParent() &&
         "Instruction is in wrong basic block.");
  initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
  ScheduleEnd = I->getNextNode();
  assert(ScheduleEnd && "tried to vectorize a terminator?");
  LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n");
void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
                                                ScheduleData *PrevLoadStore,
                                                ScheduleData *NextLoadStore) {
  ScheduleData *CurrentLoadStore = PrevLoadStore;
    ScheduleData *SD = ScheduleDataMap.lookup(I);
      SD = allocateScheduleDataChunks();
      ScheduleDataMap[I] = SD;
    assert(!isInSchedulingRegion(*SD) &&
           "new ScheduleData already in scheduling region");
    SD->init(SchedulingRegionID, I);
      return LI && LI->isSimple() &&
             LI->getMetadata(LLVMContext::MD_invariant_load);
    if (I->mayReadOrWriteMemory() &&
        !CanIgnoreLoad(I) &&
            Intrinsic::pseudoprobe))) {
      if (CurrentLoadStore) {
        CurrentLoadStore->setNextLoadStore(SD);
        FirstLoadStoreInRegion = SD;
      CurrentLoadStore = SD;
      RegionHasStackSave = true;
  if (NextLoadStore) {
    if (CurrentLoadStore)
      CurrentLoadStore->setNextLoadStore(NextLoadStore);
    LastLoadStoreInRegion = CurrentLoadStore;
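// Compute def-use, control, and memory dependencies for the members of a
// bundle (including copyable data), walking a worklist so that nodes whose
// dependencies were invalidated get reprocessed; entities that become ready
// are optionally inserted into the ready list.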
void BoUpSLP::BlockScheduling::calculateDependencies(
    ScheduleBundle &Bundle, bool InsertInReadyList, BoUpSLP *SLP,
  SmallVector<ScheduleEntity *> WorkList;
  auto ProcessNode = [&](ScheduleEntity *SE) {
      if (CD->hasValidDependencies())
      CD->initDependencies();
      CD->resetUnscheduledDeps();
      const EdgeInfo &EI = CD->getEdgeInfo();
        const auto *It = find(Op, CD->getInst());
        assert(It != Op.end() && "Lane not set");
        SmallPtrSet<Instruction *, 4> Visited;
          int Lane = std::distance(Op.begin(), It);
          assert(Lane >= 0 && "Lane not set");
              !EI.UserTE->ReorderIndices.empty())
            Lane = EI.UserTE->ReorderIndices[Lane];
          assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
                 "Couldn't find extract lane");
          if (EI.UserTE->isCopyableElement(In)) {
            if (ScheduleCopyableData *UseSD =
                    getScheduleCopyableData(EI.UserTE->UserTreeIndex, In)) {
              CD->incDependencies();
              if (!UseSD->isScheduled())
                CD->incrementUnscheduledDeps(1);
              if (!UseSD->hasValidDependencies() ||
                  (InsertInReadyList && UseSD->isReady()))
          } else if (Visited.insert(In).second) {
            if (ScheduleData *UseSD = getScheduleData(In)) {
              CD->incDependencies();
              if (!UseSD->isScheduled())
                CD->incrementUnscheduledDeps(1);
              if (!UseSD->hasValidDependencies() ||
                  (InsertInReadyList && UseSD->isReady()))
        } while (It != Op.end());
      if (CD->isReady() && CD->getDependencies() == 0 &&
          (EI.UserTE->hasState() &&
           (EI.UserTE->getMainOp()->getParent() !=
                CD->getInst()->getParent() ||
            (EI.UserTE->getMainOp()->hasNUsesOrMore(UsesLimit) ||
             any_of(EI.UserTE->getMainOp()->users(), [&](User *U) {
               auto *IU = dyn_cast<Instruction>(U);
               return IU->getParent() == EI.UserTE->getMainOp()->getParent();
        CD->incDependencies();
        CD->incrementUnscheduledDeps(1);
      if (BundleMember->hasValidDependencies())
      LLVM_DEBUG(dbgs() << "SLP: update deps of " << *BundleMember << "\n");
      BundleMember->initDependencies();
      BundleMember->resetUnscheduledDeps();
      SmallDenseMap<Value *, unsigned> UserToNumOps;
      for (User *U : BundleMember->getInst()->users()) {
        if (ScheduleData *UseSD = getScheduleData(U)) {
          if (areAllOperandsReplacedByCopyableData(
            BundleMember->incDependencies();
            if (!UseSD->isScheduled())
              BundleMember->incrementUnscheduledDeps(1);
            if (!UseSD->hasValidDependencies() ||
                (InsertInReadyList && UseSD->isReady()))
      for (ScheduleCopyableData *UseSD :
           getScheduleCopyableDataUsers(BundleMember->getInst())) {
        BundleMember->incDependencies();
        if (!UseSD->isScheduled())
          BundleMember->incrementUnscheduledDeps(1);
        if (!UseSD->hasValidDependencies() ||
            (InsertInReadyList && UseSD->isReady()))
      SmallPtrSet<const Instruction *, 4> Visited;
        if (!Visited.insert(I).second)
        auto *DepDest = getScheduleData(I);
        assert(DepDest && "must be in schedule window");
        DepDest->addControlDependency(BundleMember);
        BundleMember->incDependencies();
        if (!DepDest->isScheduled())
          BundleMember->incrementUnscheduledDeps(1);
        if (!DepDest->hasValidDependencies() ||
            (InsertInReadyList && DepDest->isReady()))
      for (Instruction *I = BundleMember->getInst()->getNextNode();
           I != ScheduleEnd; I = I->getNextNode()) {
          MakeControlDependent(I);
      if (RegionHasStackSave) {
        match(BundleMember->getInst(),
        for (Instruction *I = BundleMember->getInst()->getNextNode();
             I != ScheduleEnd; I = I->getNextNode()) {
            MakeControlDependent(I);
          BundleMember->getInst()->mayReadOrWriteMemory()) {
        for (Instruction *I = BundleMember->getInst()->getNextNode();
             I != ScheduleEnd; I = I->getNextNode()) {
            MakeControlDependent(I);
      ScheduleData *NextLoadStore = BundleMember->getNextLoadStore();
      if (!NextLoadStore)
             "NextLoadStore list for non memory effecting bundle?");
      unsigned NumAliased = 0;
      unsigned DistToSrc = 1;
      bool IsNonSimpleSrc = !SrcLoc.Ptr || !isSimple(SrcInst);
      for (ScheduleData *DepDest = NextLoadStore; DepDest;
           DepDest = DepDest->getNextLoadStore()) {
        assert(isInSchedulingRegion(*DepDest) && "Expected to be in region");
            ((SrcMayWrite || DepDest->getInst()->mayWriteToMemory()) &&
             SLP->isAliased(SrcLoc, SrcInst, DepDest->getInst())))) {
          DepDest->addMemoryDependency(BundleMember);
          BundleMember->incDependencies();
          if (!DepDest->isScheduled())
            BundleMember->incrementUnscheduledDeps(1);
          if (!DepDest->hasValidDependencies() ||
              (InsertInReadyList && DepDest->isReady()))
21889 "expected at least one instruction to schedule");
21891 WorkList.
push_back(Bundle.getBundle().front());
21893 SmallPtrSet<ScheduleBundle *, 16> Visited;
21894 while (!WorkList.
empty()) {
21899 CopyableBundle.
push_back(&CD->getBundle());
21900 Bundles = CopyableBundle;
21902 Bundles = getScheduleBundles(SD->getInst());
21904 if (Bundles.
empty()) {
21905 if (!SD->hasValidDependencies())
21907 if (InsertInReadyList && SD->isReady()) {
21908 ReadyInsts.insert(SD);
21909 LLVM_DEBUG(
dbgs() <<
"SLP: gets ready on update: " << *SD <<
"\n");
21913 for (ScheduleBundle *Bundle : Bundles) {
21914 if (Bundle->hasValidDependencies() || !Visited.
insert(Bundle).second)
21916 assert(isInSchedulingRegion(*Bundle) &&
21917 "ScheduleData not in scheduling region");
21918 for_each(Bundle->getBundle(), ProcessNode);
21920 if (InsertInReadyList && SD->isReady()) {
21921 for (ScheduleBundle *Bundle : Bundles) {
21922 assert(isInSchedulingRegion(*Bundle) &&
21923 "ScheduleData not in scheduling region");
21924 if (!Bundle->isReady())
21926 ReadyInsts.insert(Bundle);
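// Re-mark everything in the region as unscheduled so that a fresh scheduling
// pass can run over the same region.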
void BoUpSLP::BlockScheduling::resetSchedule() {
         "tried to reset schedule on block which has not been scheduled");
  for_each(ScheduleDataMap, [&](auto &P) {
    if (BB != P.first->getParent())
    ScheduleData *SD = P.second;
    if (isInSchedulingRegion(*SD)) {
      SD->setScheduled(false);
      SD->resetUnscheduledDeps();
  for_each(ScheduleCopyableDataMapByInst, [&](auto &P) {
    for_each(P.second, [&](ScheduleCopyableData *SD) {
      if (isInSchedulingRegion(*SD)) {
        SD->setScheduled(false);
        SD->resetUnscheduledDeps();
  for_each(ScheduledBundles, [&](auto &P) {
    for_each(P.second, [&](ScheduleBundle *Bundle) {
      if (isInSchedulingRegion(*Bundle))
        Bundle->setScheduled(false);
  for (auto &P : ScheduleCopyableDataMap) {
    if (isInSchedulingRegion(*P.second)) {
      P.second->setScheduled(false);
      P.second->resetUnscheduledDeps();
  ReadyInsts.clear();
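// Schedule the region owned by BS: assign scheduling priorities in block
// order, compute any missing dependencies, then repeatedly pick the ready
// entity with the lowest priority and move its instructions into place.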
void BoUpSLP::scheduleBlock(const BoUpSLP &R, BlockScheduling *BS) {
  if (!BS->ScheduleStart)
  LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");
  BS->resetSchedule();
  struct ScheduleDataCompare {
    bool operator()(const ScheduleEntity *SD1,
                    const ScheduleEntity *SD2) const {
      return SD2->getSchedulingPriority() < SD1->getSchedulingPriority();
  std::set<ScheduleEntity *, ScheduleDataCompare> ReadyInsts;
  for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
       I = I->getNextNode()) {
    if (!Bundles.empty()) {
      for (ScheduleBundle *Bundle : Bundles) {
        Bundle->setSchedulingPriority(Idx++);
        if (!Bundle->hasValidDependencies())
          BS->calculateDependencies(*Bundle, false, this);
      for (ScheduleCopyableData *SD : reverse(SDs)) {
        ScheduleBundle &Bundle = SD->getBundle();
        Bundle.setSchedulingPriority(Idx++);
        if (!Bundle.hasValidDependencies())
          BS->calculateDependencies(Bundle, false, this);
        BS->getScheduleCopyableDataUsers(I);
    if (ScheduleData *SD = BS->getScheduleData(I)) {
          SDTEs.front()->doesNotNeedToSchedule() ||
          "scheduler and vectorizer bundle mismatch");
      SD->setSchedulingPriority(Idx++);
      if (!SD->hasValidDependencies() &&
          (!CopyableData.empty() ||
           any_of(R.ValueToGatherNodes.lookup(I), [&](const TreeEntry *TE) {
             assert(TE->isGather() && "expected gather node");
             return TE->hasState() && TE->hasCopyableElements() &&
                    TE->isCopyableElement(I);
        ScheduleBundle Bundle;
        BS->calculateDependencies(Bundle, false, this);
    for (ScheduleCopyableData *SD : reverse(CopyableData)) {
      ScheduleBundle &Bundle = SD->getBundle();
      Bundle.setSchedulingPriority(Idx++);
      if (!Bundle.hasValidDependencies())
        BS->calculateDependencies(Bundle, false, this);
  BS->initialFillReadyList(ReadyInsts);
  Instruction *LastScheduledInst = BS->ScheduleEnd;
  SmallPtrSet<Instruction *, 16> Scheduled;
  while (!ReadyInsts.empty()) {
    auto *Picked = *ReadyInsts.begin();
    ReadyInsts.erase(ReadyInsts.begin());
    for (const ScheduleEntity *BundleMember : Bundle->getBundle()) {
      Instruction *PickedInst = BundleMember->getInst();
      bool IsCopyable = Bundle->getTreeEntry()->isCopyableElement(PickedInst);
      if ((IsCopyable && BS->getScheduleData(PickedInst)) ||
          (!IsCopyable && !Scheduled.insert(PickedInst).second))
      if (PickedInst->getNextNode() != LastScheduledInst)
      LastScheduledInst = PickedInst;
      EntryToLastInstruction.try_emplace(Bundle->getTreeEntry(),
                                         LastScheduledInst);
      if (PickedInst->getNextNode() != LastScheduledInst)
      LastScheduledInst = PickedInst;
    auto Invalid = InstructionsState::invalid();
#ifdef EXPENSIVE_CHECKS
#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
  for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
       I = I->getNextNode()) {
      [](const ScheduleBundle *Bundle) {
        return Bundle->isScheduled();
      "must be scheduled at this point");
  BS->ScheduleStart = nullptr;
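// Conservatively determine the width of the scalar element a value would
// occupy in a vector by walking its expression tree and taking the widest
// non-i1 type seen; results are memoized in InstrElementSize.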
    return DL->getTypeSizeInBits(Store->getValueOperand()->getType());
  auto E = InstrElementSize.find(V);
  if (E != InstrElementSize.end())
  Value *FirstNonBool = nullptr;
  while (!Worklist.empty()) {
    auto *Ty = I->getType();
    if (Ty != Builder.getInt1Ty() && !FirstNonBool)
      Width = std::max<unsigned>(Width, DL->getTypeSizeInBits(Ty));
    for (Use &U : I->operands()) {
      if (Visited.insert(J).second &&
        FirstNonBool = U.get();
  if (V->getType() == Builder.getInt1Ty() && FirstNonBool)
  Width = DL->getTypeSizeInBits(V->getType());
  InstrElementSize[I] = Width;
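// collectValuesToDemote recursively checks whether a tree entry (and its
// operand entries) can be computed in a narrower bit width than the original
// type, recording demotable entries in ToDemote.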
bool BoUpSLP::collectValuesToDemote(
    const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
    bool &IsProfitableToDemote, bool IsTruncRoot) const {
  unsigned OrigBitWidth =
      DL->getTypeSizeInBits(E.Scalars.front()->getType()->getScalarType());
    if (isa<PoisonValue>(R))
    return !isKnownNonNegative(R, SimplifyQuery(*DL));
  auto IsPotentiallyTruncated = [&](Value *V, unsigned &BitWidth) -> bool {
    if (getTreeEntries(V).size() > 1)
    if ((!IsSignedNode || IsSignedVal) && OrigBitWidth > BitWidth) {
      unsigned BitWidth1 = OrigBitWidth - NumSignBits;
      APInt Mask = DB->getDemandedBits(I);
      unsigned BitWidth2 =
          std::max<unsigned>(1, Mask.getBitWidth() - Mask.countl_zero());
      while (!IsSignedNode && BitWidth2 < OrigBitWidth) {
      BitWidth1 = std::min(BitWidth1, BitWidth2);
  auto FinalAnalysis = [&, TTI = TTI]() {
    if (!IsProfitableToDemote)
        E.Scalars, std::bind(IsPotentiallyTruncated, _1, std::ref(BitWidth)));
    if (Res && E.isGather()) {
      if (E.hasState()) {
        if (const TreeEntry *SameTE =
                getSameValuesTreeEntry(E.getMainOp(), E.Scalars);
          if (collectValuesToDemote(*SameTE, IsProfitableToDemoteRoot, BitWidth,
                                    ToDemote, Visited, NodesToKeepBWs,
                                    MaxDepthLevel, IsProfitableToDemote,
      SmallPtrSet<Value *, 4> UniqueBases;
      for (Value *V : E.Scalars) {
          UniqueBases.insert(EE->getVectorOperand());
      const unsigned VF = E.Scalars.size();
      Type *OrigScalarTy = E.Scalars.front()->getType();
      if (UniqueBases.size() <= 2 ||
  if (E.isGather() || !Visited.insert(&E).second ||
        return !isa<Constant>(V) && all_of(V->users(), [&](User *U) {
          return isa<InsertElementInst>(U) && !isVectorized(U);
    return FinalAnalysis();
        return !isa<Constant>(V) && !all_of(V->users(), [=](User *U) {
          return isVectorized(U) ||
                 (E.Idx == 0 && UserIgnoreList &&
                  UserIgnoreList->contains(U)) ||
                 (!isa<CmpInst>(U) && U->getType()->isSized() &&
                  !U->getType()->isScalableTy() &&
                  DL->getTypeSizeInBits(U->getType()) <= BitWidth);
        }) && !IsPotentiallyTruncated(V, BitWidth);
                         bool &NeedToExit) {
    NeedToExit = false;
    unsigned InitLevel = MaxDepthLevel;
    for (const TreeEntry *Op : Operands) {
      unsigned Level = InitLevel;
      if (!collectValuesToDemote(*Op, IsProfitableToDemoteRoot, BitWidth,
                                 ToDemote, Visited, NodesToKeepBWs, Level,
                                 IsProfitableToDemote, IsTruncRoot)) {
        if (!IsProfitableToDemote)
        if (!FinalAnalysis())
      MaxDepthLevel = std::max(MaxDepthLevel, Level);
  auto AttemptCheckBitwidth =
      [&](function_ref<bool(unsigned, unsigned)> Checker, bool &NeedToExit) {
        NeedToExit = false;
        unsigned BestFailBitwidth = 0;
          if (Checker(BitWidth, OrigBitWidth))
          if (BestFailBitwidth == 0 && FinalAnalysis())
          if (BestFailBitwidth == 0) {
  auto TryProcessInstruction =
          function_ref<bool(unsigned, unsigned)> Checker = {}) {
        if (Operands.empty()) {
          for (Value *V : E.Scalars)
            (void)IsPotentiallyTruncated(V, BitWidth);
            return !V->hasOneUse() && !IsPotentiallyTruncated(V, BitWidth);
          bool NeedToExit = false;
          if (Checker && !AttemptCheckBitwidth(Checker, NeedToExit))
          if (!ProcessOperands(Operands, NeedToExit))
        return IsProfitableToDemote;
  if (E.State == TreeEntry::SplitVectorize)
    return TryProcessInstruction(
        {VectorizableTree[E.CombinedEntriesWithIndices.front().first].get(),
         VectorizableTree[E.CombinedEntriesWithIndices.back().first].get()});
  if (E.isAltShuffle()) {
    auto IsDangerousOpcode = [](unsigned Opcode) {
      case Instruction::Shl:
      case Instruction::AShr:
      case Instruction::LShr:
      case Instruction::UDiv:
      case Instruction::SDiv:
      case Instruction::URem:
      case Instruction::SRem:
    if (IsDangerousOpcode(E.getAltOpcode()))
      return FinalAnalysis();
  switch (E.getOpcode()) {
  case Instruction::Trunc:
    if (IsProfitableToDemoteRoot)
      IsProfitableToDemote = true;
    return TryProcessInstruction(BitWidth);
  case Instruction::ZExt:
  case Instruction::SExt:
    if (E.UserTreeIndex.UserTE && E.UserTreeIndex.UserTE->hasState() &&
        E.UserTreeIndex.UserTE->getOpcode() == Instruction::BitCast &&
        E.UserTreeIndex.UserTE->getMainOp()->getType()->isFPOrFPVectorTy())
    IsProfitableToDemote = true;
    return TryProcessInstruction(BitWidth);
  case Instruction::Add:
  case Instruction::Sub:
  case Instruction::Mul:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)});
  case Instruction::Freeze:
    return TryProcessInstruction(BitWidth, getOperandEntry(&E, 0));
  case Instruction::Shl: {
    auto ShlChecker = [&](unsigned BitWidth, unsigned) {
        if (isa<PoisonValue>(V))
        if (E.isCopyableElement(V))
        auto *I = cast<Instruction>(V);
        KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
        return AmtKnownBits.getMaxValue().ult(BitWidth);
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, ShlChecker);
  case Instruction::LShr: {
    auto LShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
        if (isa<PoisonValue>(V))
        APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
        if (E.isCopyableElement(V))
          return MaskedValueIsZero(V, ShiftedBits, SimplifyQuery(*DL));
        auto *I = cast<Instruction>(V);
        KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
        return AmtKnownBits.getMaxValue().ult(BitWidth) &&
               MaskedValueIsZero(I->getOperand(0), ShiftedBits,
                                 SimplifyQuery(*DL));
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
  case Instruction::AShr: {
    auto AShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
        if (isa<PoisonValue>(V))
        auto *I = cast<Instruction>(V);
        KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
        unsigned ShiftedBits = OrigBitWidth - BitWidth;
        return AmtKnownBits.getMaxValue().ult(BitWidth) &&
               ComputeNumSignBits(I->getOperand(0), *DL, AC, nullptr, DT);
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
  case Instruction::UDiv:
  case Instruction::URem: {
    auto Checker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
        APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
        if (E.hasCopyableElements() && E.isCopyableElement(V))
          return MaskedValueIsZero(V, Mask, SimplifyQuery(*DL));
        auto *I = cast<Instruction>(V);
        return MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)) &&
               MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, Checker);
  case Instruction::Select: {
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 1), getOperandEntry(&E, 2)});
  case Instruction::PHI: {
    const unsigned NumOps = E.getNumOperands();
        [&](unsigned Idx) { return getOperandEntry(&E, Idx); });
  case Instruction::Call: {
    if (ID != Intrinsic::abs && ID != Intrinsic::smin &&
        ID != Intrinsic::smax && ID != Intrinsic::umin && ID != Intrinsic::umax)
    function_ref<bool(unsigned, unsigned)> CallChecker;
    auto CompChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
        auto *I = cast<Instruction>(V);
        if (ID == Intrinsic::umin || ID == Intrinsic::umax) {
          APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
          return MaskedValueIsZero(I->getOperand(0), Mask,
                                   SimplifyQuery(*DL)) &&
                 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
        assert((ID == Intrinsic::smin || ID == Intrinsic::smax) &&
               "Expected min/max intrinsics only.");
        unsigned SignBits = OrigBitWidth - BitWidth;
        unsigned Op0SignBits =
        unsigned Op1SignBits =
        return SignBits <= Op0SignBits &&
               ((SignBits != Op0SignBits &&
                 SimplifyQuery(*DL))) &&
               SignBits <= Op1SignBits &&
               ((SignBits != Op1SignBits &&
    auto AbsChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
        auto *I = cast<Instruction>(V);
        unsigned SignBits = OrigBitWidth - BitWidth;
        APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
        unsigned Op0SignBits =
            ComputeNumSignBits(I->getOperand(0), *DL, AC, nullptr, DT);
        return SignBits <= Op0SignBits &&
               ((SignBits != Op0SignBits &&
                 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
                MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)));
    if (ID != Intrinsic::abs) {
      Operands.push_back(getOperandEntry(&E, 1));
      CallChecker = CompChecker;
      CallChecker = AbsChecker;
        std::numeric_limits<InstructionCost::CostType>::max();
    unsigned VF = E.Scalars.size();
    auto Checker = [&](unsigned BitWidth, unsigned) {
      if (Cost < BestCost) {
    [[maybe_unused]] bool NeedToExit;
    (void)AttemptCheckBitwidth(Checker, NeedToExit);
    return TryProcessInstruction(BitWidth, Operands, CallChecker);
  return FinalAnalysis();
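// Minimum-bit-width computation: starting from the root (store/insertelement
// seeded graphs skip the root itself), compute the maximal bit width each
// candidate subtree actually requires and record it in MinBWs, so that
// codegen can emit narrower vectors plus a final extension.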
  bool IsStoreOrInsertElt =
      VectorizableTree.front()->hasState() &&
      (VectorizableTree.front()->getOpcode() == Instruction::Store ||
       VectorizableTree.front()->getOpcode() == Instruction::InsertElement);
  if ((IsStoreOrInsertElt || UserIgnoreList) &&
      ExtraBitWidthNodes.size() <= 1 &&
      (!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 ||
       CastMaxMinBWSizes->first / CastMaxMinBWSizes->second <= 2))
  unsigned NodeIdx = 0;
  if (IsStoreOrInsertElt && !VectorizableTree.front()->isGather())
  assert((VectorizableTree[NodeIdx]->isGather() || NodeIdx != 0 ||
          !VectorizableTree[NodeIdx]->UserTreeIndex) &&
         "Unexpected tree is graph.");
  bool IsTruncRoot = false;
  bool IsProfitableToDemoteRoot = !IsStoreOrInsertElt;
  if (NodeIdx != 0 &&
      VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
      VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
    assert(IsStoreOrInsertElt && "Expected store/insertelement seeded graph.");
    IsTruncRoot = true;
    IsProfitableToDemoteRoot = true;
  if (AnalyzedMinBWVals.contains(VectorizableTree[NodeIdx]->Scalars.front()))
  auto ComputeMaxBitWidth =
      [&](const TreeEntry &E, bool IsTopRoot, bool IsProfitableToDemoteRoot,
          unsigned Limit, bool IsTruncRoot, bool IsSignedCmp) -> unsigned {
    if (E.isGather() && IsTruncRoot && E.UserTreeIndex &&
        !NodesToKeepBWs.contains(E.Idx) &&
        E.Idx > (IsStoreOrInsertElt ? 2u : 1u) &&
          return V->hasOneUse() || isa<Constant>(V) ||
                 (!V->hasNUsesOrMore(UsesLimit) &&
                  none_of(V->users(), [&](User *U) {
                    ArrayRef<TreeEntry *> TEs = getTreeEntries(U);
                    const TreeEntry *UserTE = E.UserTreeIndex.UserTE;
                    if (TEs.empty() || is_contained(TEs, UserTE))
                    if (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
                        isa<SIToFPInst, UIToFPInst>(U) ||
                        (UserTE->hasState() &&
                         (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
                               SelectInst>(UserTE->getMainOp()) ||
                          isa<SIToFPInst, UIToFPInst>(UserTE->getMainOp()))))
                    unsigned UserTESz = DL->getTypeSizeInBits(
                        UserTE->Scalars.front()->getType());
                    if (all_of(TEs, [&](const TreeEntry *TE) {
                          auto It = MinBWs.find(TE);
                          return It != MinBWs.end() &&
                                 It->second.first > UserTESz;
                    return DL->getTypeSizeInBits(U->getType()) > UserTESz;
      const TreeEntry *UserTE = E.UserTreeIndex.UserTE;
      auto It = MinBWs.find(UserTE);
      if (It != MinBWs.end())
        return It->second.first;
      unsigned MaxBitWidth =
          DL->getTypeSizeInBits(UserTE->Scalars.front()->getType());
      MaxBitWidth = bit_ceil(MaxBitWidth);
      if (MaxBitWidth < 8 && MaxBitWidth > 1)
      return MaxBitWidth;
    unsigned VF = E.getVectorFactor();
    Type *ScalarTy = E.Scalars.front()->getType();
        [&](Value *V) { return AnalyzedMinBWVals.contains(V); }))
    unsigned MaxBitWidth = 1u;
    bool IsKnownPositive = !IsSignedCmp && all_of(E.Scalars, [&](Value *R) {
      if (isa<PoisonValue>(R))
      KnownBits Known = computeKnownBits(R, *DL);
      return Known.isNonNegative();
    if (!IsKnownPositive && !IsTopRoot && E.UserTreeIndex &&
        E.UserTreeIndex.UserTE->hasState() &&
        E.UserTreeIndex.UserTE->getOpcode() == Instruction::UIToFP)
        std::min(DL->getTypeSizeInBits(
                     E.UserTreeIndex.UserTE->Scalars.front()->getType()),
                 DL->getTypeSizeInBits(ScalarTy));
    for (Value *Root : E.Scalars) {
      unsigned BitWidth1 = NumTypeBits - NumSignBits;
      if (!IsKnownPositive)
      MaxBitWidth = std::max(BitWidth1, MaxBitWidth);
      APInt Mask = DB->getDemandedBits(I);
      unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
          std::max<unsigned>(std::min(BitWidth1, BitWidth2), MaxBitWidth);
    if (MaxBitWidth < 8 && MaxBitWidth > 1)
    if (NumParts > 1 &&
    unsigned Opcode = E.getOpcode();
    bool IsProfitableToDemote = Opcode == Instruction::Trunc ||
                                Opcode == Instruction::SExt ||
                                Opcode == Instruction::ZExt || NumParts > 1;
    unsigned MaxDepthLevel = IsTruncRoot ? Limit : 1;
    bool NeedToDemote = IsProfitableToDemote;
    if (!collectValuesToDemote(E, IsProfitableToDemoteRoot, MaxBitWidth,
                               ToDemote, Visited, NodesToKeepBWs, MaxDepthLevel,
                               NeedToDemote, IsTruncRoot) ||
        (MaxDepthLevel <= Limit &&
         !(((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
            (!IsTopRoot || !(IsStoreOrInsertElt || UserIgnoreList) ||
             DL->getTypeSizeInBits(TreeRootIT) /
                     DL->getTypeSizeInBits(
                         E.getMainOp()->getOperand(0)->getType()) >
    MaxBitWidth = bit_ceil(MaxBitWidth);
    return MaxBitWidth;
  if (UserIgnoreList &&
    if (all_of(*UserIgnoreList,
        VectorizableTree.front()->State == TreeEntry::Vectorize &&
        VectorizableTree.front()->getOpcode() == Instruction::ZExt &&
        cast<CastInst>(VectorizableTree.front()->getMainOp())->getSrcTy() ==
            Builder.getInt1Ty()) {
      ReductionBitWidth = 1;
      for (Value *V : *UserIgnoreList) {
        TypeSize NumTypeBits = DL->getTypeSizeInBits(V->getType());
        unsigned BitWidth1 = NumTypeBits - NumSignBits;
        unsigned BitWidth2 = BitWidth1;
          BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
        ReductionBitWidth =
            std::max(std::min(BitWidth1, BitWidth2), ReductionBitWidth);
      if (ReductionBitWidth < 8 && ReductionBitWidth > 1)
        ReductionBitWidth = 8;
      ReductionBitWidth = bit_ceil(ReductionBitWidth);
  bool IsTopRoot = NodeIdx == 0;
  while (NodeIdx < VectorizableTree.size() &&
         VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
         VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
    RootDemotes.push_back(NodeIdx);
    IsTruncRoot = true;
  bool IsSignedCmp = false;
  if (UserIgnoreList &&
    IsSignedCmp = true;
  while (NodeIdx < VectorizableTree.size()) {
    unsigned Limit = 2;
        ReductionBitWidth ==
            DL->getTypeSizeInBits(
                VectorizableTree.front()->Scalars.front()->getType()))
    unsigned MaxBitWidth = ComputeMaxBitWidth(
        *VectorizableTree[NodeIdx], IsTopRoot, IsProfitableToDemoteRoot, Limit,
        IsTruncRoot, IsSignedCmp);
    if (ReductionBitWidth != 0 && (IsTopRoot || !RootDemotes.empty())) {
      if (MaxBitWidth != 0 && ReductionBitWidth < MaxBitWidth)
        ReductionBitWidth = bit_ceil(MaxBitWidth);
      else if (MaxBitWidth == 0)
        ReductionBitWidth = 0;
    for (unsigned Idx : RootDemotes) {
      if (all_of(VectorizableTree[Idx]->Scalars, [&](Value *V) {
            uint32_t OrigBitWidth =
                DL->getTypeSizeInBits(V->getType()->getScalarType());
            if (OrigBitWidth > MaxBitWidth) {
    RootDemotes.clear();
    IsProfitableToDemoteRoot = true;
    if (ExtraBitWidthNodes.empty()) {
      NodeIdx = VectorizableTree.size();
      unsigned NewIdx = 0;
        NewIdx = *ExtraBitWidthNodes.begin();
        ExtraBitWidthNodes.erase(ExtraBitWidthNodes.begin());
      } while (NewIdx <= NodeIdx && !ExtraBitWidthNodes.empty());
        NodeIdx < VectorizableTree.size() &&
        VectorizableTree[NodeIdx]->UserTreeIndex &&
        VectorizableTree[NodeIdx]->UserTreeIndex.EdgeIdx == 0 &&
        VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->hasState() &&
        VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->getOpcode() ==
            Instruction::Trunc &&
        !VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->isAltShuffle();
        NodeIdx < VectorizableTree.size() &&
        VectorizableTree[NodeIdx]->UserTreeIndex &&
        VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->hasState() &&
        VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->getOpcode() ==
            Instruction::ICmp &&
            VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->Scalars,
              auto *IC = dyn_cast<ICmpInst>(V);
              return IC && (IC->isSigned() ||
                            !isKnownNonNegative(IC->getOperand(0),
                                                SimplifyQuery(*DL)) ||
                            !isKnownNonNegative(IC->getOperand(1),
                                                SimplifyQuery(*DL)));
    if (MaxBitWidth == 0 ||
      if (UserIgnoreList)
        AnalyzedMinBWVals.insert_range(TreeRoot);
    for (unsigned Idx : ToDemote) {
      TreeEntry *TE = VectorizableTree[Idx].get();
      if (MinBWs.contains(TE))
        if (isa<PoisonValue>(R))
        return !isKnownNonNegative(R, SimplifyQuery(*DL));
      MinBWs.try_emplace(TE, MaxBitWidth, IsSigned);
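// Pass entry point: bail out early when the target reports no vector
// registers or the function is marked NoImplicitFloat, then scan the blocks
// in post order, vectorizing store chains, plain instruction chains and GEP
// index lists.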
  DL = &F.getDataLayout();
  if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true))) {
        dbgs() << "SLP: Didn't find any vector registers for target, abort.\n");
  if (F.hasFnAttribute(Attribute::NoImplicitFloat))
  LLVM_DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
  BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_);
  DT->updateDFSNumbers();
  for (auto *BB : post_order(&F.getEntryBlock())) {
    R.clearReductionData();
    collectSeedInstructions(BB);
    if (!Stores.empty()) {
                        << " underlying objects.\n");
      Changed |= vectorizeStoreChains(R);
    Changed |= vectorizeChainsInBlock(BB, R);
    if (!GEPs.empty()) {
                        << " underlying objects.\n");
      Changed |= vectorizeGEPIndices(BB, R);
  R.optimizeGatherSequence();
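// Try to vectorize a chain of consecutive stores starting at offset Idx:
// build the SLP tree, optionally reorder it, compute minimum value sizes and,
// if the cost model approves, emit the vectorized stores and an optimization
// remark.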
                                            unsigned Idx, unsigned MinVF,
  const unsigned Sz = R.getVectorElementSize(Chain[0]);
  unsigned VF = Chain.size();
      VF < 2 || VF < MinVF) {
  LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx
  for (Value *V : Chain)
  InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
  InstructionsState S = Analysis.buildInstructionsState(
  bool IsAllowedSize =
  if ((!IsAllowedSize && S && S.getOpcode() != Instruction::Load &&
       (!S.getMainOp()->isSafeToRemove() ||
            return !isa<ExtractElementInst>(V) &&
                   (V->getNumUses() > Chain.size() ||
                    any_of(V->users(), [&](User *U) {
                      return !Stores.contains(U);
      (ValOps.size() > Chain.size() / 2 && !S)) {
    Size = (!IsAllowedSize && S) ? 1 : 2;
  if (R.isLoadCombineCandidate(Chain))
  R.buildTree(Chain);
  if (R.isTreeTinyAndNotFullyVectorizable()) {
    if (R.isGathered(Chain.front()) ||
      return std::nullopt;
    Size = R.getCanonicalGraphSize();
  if (R.isProfitableToReorder()) {
    R.reorderTopToBottom();
    R.reorderBottomToTop();
  R.transformNodes();
  R.buildExternalUses();
  R.computeMinimumValueSizes();
  Size = R.getCanonicalGraphSize();
  if (S && S.getOpcode() == Instruction::Load)
  using namespace ore;
  R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",
                   << "Stores SLP vectorized with cost " << NV("Cost", Cost)
                   << " and with tree size "
                   << NV("TreeSize", R.getTreeSize()));
      Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
      [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
        unsigned Size = First ? Val.first : Val.second;
      Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
      [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
        unsigned P = First ? Val.first : Val.second;
        return V + (P - Mean) * (P - Mean);
  return Dev * 96 / (Mean * Mean) == 0;
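// Tracks a group of stores related to a base store via constant pointer
// distances, kept sorted by distance so that consecutive runs are easy to
// find. Illustrative example: stores to p+0, p+1 and p+3 (in value-type
// units) relative to the base would be kept as the map
// {0 -> idx0, 1 -> idx1, 3 -> idx2}.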
class RelatedStoreInsts {
      : AllStores(AllStores) {
    reset(BaseInstrIdx);
  void reset(unsigned NewBaseInstr) {
    assert(NewBaseInstr < AllStores.size() &&
           "Instruction index out of bounds");
    BaseInstrIdx = NewBaseInstr;
    insertOrLookup(NewBaseInstr, 0);
  std::optional<unsigned> insertOrLookup(unsigned InstrIdx, int64_t PtrDist) {
    auto [It, Inserted] = Instrs.emplace(PtrDist, InstrIdx);
    return Inserted ? std::nullopt : std::make_optional(It->second);
  using DistToInstMap = std::map<int64_t, unsigned>;
  const DistToInstMap &getStores() const { return Instrs; }
  std::optional<int64_t> getPointerDiff(StoreInst &SI, const DataLayout &DL,
                                        ScalarEvolution &SE) const {
    StoreInst &BaseStore = *AllStores[BaseInstrIdx];
        SI.getValueOperand()->getType(), SI.getPointerOperand(), DL, SE,
  void rebase(unsigned MinSafeIdx, unsigned NewBaseInstIdx,
              int64_t DistFromCurBase) {
    DistToInstMap PrevSet = std::move(Instrs);
    reset(NewBaseInstIdx);
    for (auto [Dist, InstIdx] : PrevSet) {
      if (InstIdx >= MinSafeIdx)
        insertOrLookup(InstIdx, Dist - DistFromCurBase);
    DistToInstMap::reverse_iterator LastVectorizedStore = find_if(
        reverse(Instrs), [&](const std::pair<int64_t, unsigned> &DistAndIdx) {
          return VectorizedStores.contains(AllStores[DistAndIdx.second]);
    DistToInstMap::iterator VectorizedStoresEnd = LastVectorizedStore.base();
    Instrs.erase(Instrs.begin(), VectorizedStoresEnd);
  unsigned BaseInstrIdx;
  DistToInstMap Instrs;
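// vectorizeStores: partition the collected stores into RelatedStoreInsts
// groups, then for each run of consecutive stores try candidate vector
// factors from largest to smallest, remembering already-attempted slices so
// repeated work is avoided.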
bool SLPVectorizerPass::vectorizeStores(
    ArrayRef<StoreInst *> Stores, BoUpSLP &R,
    DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>>
        &Visited) {
  // We may run into multiple chains that merge into a single chain. We mark
  // the stores that we vectorized so that we don't visit the same store twice.
  BoUpSLP::ValueSet VectorizedStores;
  bool Changed = false;

  auto TryToVectorize = [&](const RelatedStoreInsts::DistToInstMap &StoreSeq) {
    int64_t PrevDist = -1;
    BoUpSLP::ValueList Operands;
    // Collect the chain into a list.
    for (auto [Idx, Data] : enumerate(StoreSeq)) {
      auto &[Dist, InstIdx] = Data;
      if (Operands.empty() || Dist - PrevDist == 1) {
        Operands.push_back(Stores[InstIdx]);
        PrevDist = Dist;
        if (Idx != StoreSeq.size() - 1)
          continue;
      }
      // Restart the chain from the current store when we leave this
      // iteration, whichever path exits it.
      auto E = make_scope_exit([&, Dist = Dist, InstIdx = InstIdx]() {
        Operands.clear();
        Operands.push_back(Stores[InstIdx]);
        PrevDist = Dist;
      });

      if (Operands.size() <= 1 ||
          !Visited
               .insert({Operands.front(),
                        cast<StoreInst>(Operands.front())->getValueOperand(),
                        Operands.back(),
                        cast<StoreInst>(Operands.back())->getValueOperand(),
                        Operands.size()})
               .second)
        continue;
      unsigned MaxVecRegSize = R.getMaxVecRegSize();
      unsigned EltSize = R.getVectorElementSize(Operands[0]);
      unsigned MaxElts = llvm::bit_floor(MaxVecRegSize / EltSize);

      unsigned MaxVF =
          std::min(R.getMaximumVF(EltSize, Instruction::Store), MaxElts);
      auto *Store = cast<StoreInst>(Operands[0]);
      Type *StoreTy = Store->getValueOperand()->getType();
      Type *ValueTy = StoreTy;
      if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand()))
        ValueTy = Trunc->getSrcTy();
      Type *StoreScalarTy = StoreTy->getScalarType();
      unsigned MinVF = PowerOf2Ceil(TTI->getStoreMinimumVF(
          R.getMinVF(DL->getTypeStoreSizeInBits(StoreScalarTy)), StoreScalarTy,
          ValueTy->getScalarType()));
      MinVF = std::max<unsigned>(2, MinVF);

      if (MaxVF < MinVF) {
        LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
                          << ") < "
                          << "MinVF (" << MinVF << ")\n");
        continue;
      }

      unsigned NonPowerOf2VF = 0;
      if (VectorizeNonPowerOf2) {
        // First try vectorizing with a non-power-of-2 VF. At the moment, only
        // consider cases where VF + 1 is a power-of-2, i.e. almost all vector
        // registers are used.
        unsigned CandVF = std::clamp<unsigned>(Operands.size(), MinVF, MaxVF);
        if (has_single_bit(CandVF + 1)) {
          NonPowerOf2VF = CandVF;
          assert(NonPowerOf2VF != MaxVF &&
                 "Non-power-of-2 VF should not be equal to MaxVF");
        }
      }
      unsigned MaxRegVF = MaxVF;
      MaxVF = std::min<unsigned>(MaxVF, bit_floor(Operands.size()));
      if (MaxVF < MinVF) {
        LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
                          << ") < "
                          << "MinVF (" << MinVF << ")\n");
        continue;
      }

      SmallVector<unsigned> CandidateVFs;
      for (unsigned VF = std::max(MaxVF, NonPowerOf2VF); VF >= MinVF;
           VF = bit_ceil(VF) / 2)
        CandidateVFs.push_back(VF);

      unsigned End = Operands.size();
      unsigned Repeat = 0;
      constexpr unsigned MaxAttempts = 4;
      OwningArrayRef<std::pair<unsigned, unsigned>> RangeSizes(Operands.size());
      for (std::pair<unsigned, unsigned> &P : RangeSizes)
        P.first = P.second = 1;
      DenseMap<Value *, std::pair<unsigned, unsigned>> NonSchedulable;
      auto IsNotVectorized = [](bool First,
                                const std::pair<unsigned, unsigned> &P) {
        return First ? P.first > 0 : P.second > 0;
      };
      auto IsVectorized = [](bool First,
                             const std::pair<unsigned, unsigned> &P) {
        return First ? P.first == 0 : P.second == 0;
      };
      auto VFIsProfitable = [](bool First, unsigned Size,
                               const std::pair<unsigned, unsigned> &P) {
        return First ? Size >= P.first : Size >= P.second;
      };
      auto FirstSizeSame = [](unsigned Size,
                              const std::pair<unsigned, unsigned> &P) {
        return Size == P.first;
      };
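      // Reading of the bookkeeping above (added summary, not original text):
      // every store owns a {first, second} pair in RangeSizes, one slot per
      // width class (VF below vs. at/above MaxRegVF). A slot value of 0 means
      // the store has been vectorized for that class, 1 means it has not been
      // tried yet, and larger values cache the tree size of a previous
      // attempt so the predicates above can tell whether retrying at a given
      // VF can still be profitable.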
      while (true) {
        bool RepeatChanged = false;
        bool AnyProfitableGraph = false;
        for (unsigned VF : CandidateVFs) {
          AnyProfitableGraph = false;
          unsigned FirstUnvecStore =
              std::distance(RangeSizes.begin(),
                            find_if(RangeSizes, std::bind(IsNotVectorized,
                                                          VF >= MaxRegVF, _1)));
          // Form slices of consecutive, not-yet-vectorized stores.
          while (FirstUnvecStore < End) {
            unsigned FirstVecStore = std::distance(
                RangeSizes.begin(),
                find_if(RangeSizes.drop_front(FirstUnvecStore),
                        std::bind(IsVectorized, VF >= MaxRegVF, _1)));
            unsigned MaxSliceEnd = FirstVecStore >= End ? End : FirstVecStore;
            for (unsigned SliceStartIdx = FirstUnvecStore;
                 SliceStartIdx + VF <= MaxSliceEnd;) {
              ArrayRef<Value *> Slice =
                  ArrayRef(Operands).slice(SliceStartIdx, VF);
              assert(all_of(Slice,
                            [&](Value *V) {
                              return cast<StoreInst>(V)
                                         ->getValueOperand()
                                         ->getType() ==
                                     cast<StoreInst>(Slice.front())
                                         ->getValueOperand()
                                         ->getType();
                            }) &&
                     "Expected all operands of same type.");
              if (!NonSchedulable.empty()) {
                auto [NonSchedSizeMax, NonSchedSizeMin] =
                    NonSchedulable.lookup(Slice.front());
                if (NonSchedSizeMax > 0 && NonSchedSizeMin <= VF) {
                  // This slice already failed to schedule with a smaller VF;
                  // skip past it.
                  SliceStartIdx += NonSchedSizeMax;
                  continue;
                }
              }
              unsigned TreeSize;
              std::optional<bool> Res =
                  vectorizeStoreChain(Slice, R, SliceStartIdx, MinVF, TreeSize);
              if (!Res) {
                // Remember that this slice failed to schedule so that wider
                // attempts containing it can be skipped.
                NonSchedulable
                    .try_emplace(Slice.front(), std::make_pair(VF, VF))
                    .first->getSecond()
                    .second = VF;
              } else if (*Res) {
                // Mark the vectorized stores so that we don't vectorize them
                // again.
                VectorizedStores.insert(Slice.begin(), Slice.end());
                AnyProfitableGraph = RepeatChanged = Changed = true;
                for (std::pair<unsigned, unsigned> &P :
                     RangeSizes.slice(SliceStartIdx, VF))
                  P.first = P.second = 0;
                if (SliceStartIdx < FirstUnvecStore + MinVF) {
                  for (std::pair<unsigned, unsigned> &P : RangeSizes.slice(
                           FirstUnvecStore, SliceStartIdx - FirstUnvecStore))
                    P.first = P.second = 0;
                  FirstUnvecStore = SliceStartIdx + VF;
                }
                if (SliceStartIdx > MaxSliceEnd - VF - MinVF) {
                  for (std::pair<unsigned, unsigned> &P :
                       RangeSizes.slice(SliceStartIdx + VF,
                                        MaxSliceEnd - (SliceStartIdx + VF)))
                    P.first = P.second = 0;
                  if (MaxSliceEnd == End)
                    End = SliceStartIdx;
                  MaxSliceEnd = SliceStartIdx;
                }
                SliceStartIdx += VF;
                continue;
              }
              // Skip the slice if the attempt at this VF cannot beat the tree
              // sizes already recorded for its stores.
              if (VF > 2 && Res &&
                  !all_of(RangeSizes.slice(SliceStartIdx, VF),
                          std::bind(VFIsProfitable, VF >= MaxRegVF, TreeSize,
                                    _1))) {
                SliceStartIdx += VF;
                continue;
              }
              // Check for very big VFs that we're not rebuilding the same
              // trees, just with a larger number of elements.
              if (VF > MaxRegVF && TreeSize > 1 &&
                  all_of(RangeSizes.slice(SliceStartIdx, VF),
                         std::bind(FirstSizeSame, TreeSize, _1))) {
                SliceStartIdx += VF;
                while (SliceStartIdx != MaxSliceEnd &&
                       RangeSizes[SliceStartIdx].first == TreeSize)
                  ++SliceStartIdx;
                continue;
              }
              if (TreeSize > 1) {
                for (std::pair<unsigned, unsigned> &P :
                     RangeSizes.slice(SliceStartIdx, VF)) {
                  if (VF >= MaxRegVF)
                    P.second = std::max(P.second, TreeSize);
                  else
                    P.first = std::max(P.first, TreeSize);
                }
              }
              ++SliceStartIdx;
              AnyProfitableGraph = true;
            }
            if (FirstUnvecStore >= End)
              break;
            if (MaxSliceEnd - FirstUnvecStore < VF &&
                MaxSliceEnd - FirstUnvecStore >= MinVF)
              AnyProfitableGraph = true;
            FirstUnvecStore = std::distance(
                RangeSizes.begin(),
                find_if(RangeSizes.drop_front(MaxSliceEnd),
                        std::bind(IsNotVectorized, VF >= MaxRegVF, _1)));
          }
          if (!AnyProfitableGraph && VF >= MaxRegVF && has_single_bit(VF))
            break;
        }
        // All values vectorized - exit.
        if (all_of(RangeSizes, [](const std::pair<unsigned, unsigned> &P) {
              return P.first == 0 && P.second == 0;
            }))
          break;
        // Check if we tried all attempts or if there is no need for the last
        // attempts at all.
        ++Repeat;
        if (Repeat >= MaxAttempts ||
            (Repeat > 1 && (RepeatChanged || !AnyProfitableGraph)))
          break;
        constexpr unsigned StoresLimit = 64;
        const unsigned MaxTotalNum = std::min<unsigned>(
            Operands.size(),
            static_cast<unsigned>(
                End -
                std::distance(
                    RangeSizes.begin(),
                    find_if(RangeSizes, std::bind(IsNotVectorized, true, _1))) +
                1));
        unsigned VF = bit_ceil(CandidateVFs.front()) * 2;
        unsigned Limit =
            getFloorFullVectorNumberOfElements(*TTI, StoreTy, MaxTotalNum);
        CandidateVFs.clear();
        if (bit_floor(Limit) == VF)
          CandidateVFs.push_back(Limit);
        if (VF > MaxTotalNum || VF >= StoresLimit)
          break;
        for (std::pair<unsigned, unsigned> &P : RangeSizes) {
          if (P.first != 0)
            P.first = std::max(P.second, P.first);
        }
        // Last attempt to vectorize the maximal number of elements, if all
        // previous attempts were unsuccessful because of the cost issues.
        CandidateVFs.push_back(VF);
      }
    }
  };
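  // Summary of the retry loop above (added commentary): each pass walks the
  // candidate VFs from widest to narrowest and tries every aligned slice of
  // not-yet-vectorized stores. If nothing changed but some graphs still
  // looked profitable, one more pass is attempted with the candidate VFs
  // rebuilt around a doubled width, capped by StoresLimit and by MaxAttempts
  // total passes, so the search terminates even on pathological inputs.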
  // Groups of related stores keyed by the distance of their pointer operand
  // to the base store of each group.
  SmallVector<RelatedStoreInsts> SortedStores;
  auto FillStoresSet = [&](unsigned Idx, StoreInst *SI) {
    std::optional<int64_t> PtrDist;
    auto *RelatedStores = find_if(
        SortedStores, [&PtrDist, SI, this](const RelatedStoreInsts &StoreSeq) {
          PtrDist = StoreSeq.getPointerDiff(*SI, *DL, *SE);
          return PtrDist.has_value();
        });

    // We did not find a comparable store, start a new group.
    if (RelatedStores == SortedStores.end()) {
      SortedStores.emplace_back(Idx, Stores);
      return;
    }

    // If there is already a store in the group with the same distance, try to
    // vectorize the existing instructions before adding the current one.
    if (std::optional<unsigned> PrevInst =
            RelatedStores->insertOrLookup(Idx, *PtrDist)) {
      TryToVectorize(RelatedStores->getStores());
      RelatedStores->clearVectorizedStores(VectorizedStores);
      RelatedStores->rebase(/*MinSafeIdx=*/*PrevInst + 1,
                            /*NewBaseInstIdx=*/Idx,
                            /*DistFromCurBase=*/*PtrDist);
    }
  };
  Type *PrevValTy = nullptr;
  for (auto [I, SI] : enumerate(Stores)) {
    if (R.isDeleted(SI))
      continue;
    if (!PrevValTy)
      PrevValTy = SI->getValueOperand()->getType();
    // Check that we do not try to vectorize stores of different types.
    if (PrevValTy != SI->getValueOperand()->getType()) {
      for (RelatedStoreInsts &StoreSeq : SortedStores)
        TryToVectorize(StoreSeq.getStores());
      SortedStores.clear();
      PrevValTy = SI->getValueOperand()->getType();
    }
    FillStoresSet(I, SI);
  }

  // Final vectorization attempt.
  for (RelatedStoreInsts &StoreSeq : SortedStores)
    TryToVectorize(StoreSeq.getStores());

  return Changed;
}
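// High-level shape of vectorizeStores (added summary): stores are scanned in
// program order and bucketed into RelatedStoreInsts groups keyed by a
// constant pointer distance to the group's base store; a repeated distance or
// a change of stored value type flushes the affected groups through
// TryToVectorize, and one final flush handles whatever remains at the end of
// the block.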
void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
  // Visit the store and getelementptr instructions in BB and organize them in
  // Stores and GEPs according to the underlying object of their pointer
  // operand.
  for (Instruction &I : *BB) {
    if (auto *SI = dyn_cast<StoreInst>(&I)) {
      if (!SI->isSimple())
        continue;
      Stores[getUnderlyingObject(SI->getPointerOperand())].push_back(SI);
    } else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
      // Ignore getelementptr instructions that have more than one index, a
      // constant index, or a vector result type.
      if (GEP->getNumIndices() != 1)
        continue;
      Value *Idx = GEP->idx_begin()->get();
      if (isa<Constant>(Idx))
        continue;
      if (GEP->getType()->isVectorTy())
        continue;
      GEPs[GEP->getPointerOperand()].push_back(GEP);
    }
  }
}
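// Note (added): the seeds collected here drive the two entry points around
// this function -- the store buckets feed vectorizeStores above, while the
// index operands of the collected GEPs are later handed to tryToVectorizeList
// below as flat candidate lists.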
bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
                                           bool MaxVFOnly) {
  if (VL.size() < 2)
    return false;

  LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = "
                    << VL.size() << ".\n");

  // Check that all of the parts are instructions of the same type,
  // we permit an alternate opcode via InstructionsState.
  InstructionsState S = getSameOpcode(VL, *TLI);
  if (!S)
    return false;

  Instruction *I0 = S.getMainOp();
  // Make sure invalid types (including vector type) are rejected before
  // determining vectorization factor for scalar instructions.
  for (Value *V : VL) {
    Type *Ty = V->getType();
    if (!isa<InsertElementInst>(V) && !isValidElementType(Ty)) {
      // NOTE: the following will give user internal llvm type name, which may
      // not be useful.
      R.getORE()->emit([&]() {
        std::string TypeStr;
        llvm::raw_string_ostream OS(TypeStr);
        Ty->print(OS);
        return OptimizationRemarkMissed(SV_NAME, "UnsupportedType", I0)
               << "Cannot SLP vectorize list: type "
               << TypeStr + " is unsupported by vectorizer";
      });
      return false;
    }
  }

  Type *ScalarTy = getValueType(VL[0]);
  unsigned Sz = R.getVectorElementSize(I0);
  unsigned MinVF = R.getMinVF(Sz);
  unsigned MaxVF = std::max<unsigned>(llvm::bit_floor(VL.size()), MinVF);
  MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
  if (MaxVF < 2) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0)
             << "Cannot SLP vectorize list: vectorization factor "
             << "less than 2 is not supported";
    });
    return false;
  }

  bool Changed = false;
  bool CandidateFound = false;
  InstructionCost MinCost = SLPCostThreshold.getValue();
  unsigned NextInst = 0, MaxInst = VL.size();
  for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF; VF /= 2) {
    // No actual vectorization should happen, if number of parts is the same as
    // provided vectorization factor (i.e. the scalar type is used for vector
    // code during codegen).
    auto *VecTy = getWidenedType(ScalarTy, VF);
    if (TTI->getNumberOfParts(VecTy) == VF)
      continue;
    for (unsigned I = NextInst; I < MaxInst; ++I) {
      unsigned ActualVF = std::min(MaxInst - I, VF);

      if (MaxVFOnly && ActualVF < MaxVF)
        break;
      if ((VF > MinVF && ActualVF < VF) || (VF == MinVF && ActualVF < 2))
        break;

      SmallVector<Value *> Ops(ActualVF, nullptr);
      unsigned Idx = 0;
      for (Value *V : VL.drop_front(I)) {
        // Check that a previous iteration of this loop did not delete the
        // Value.
        if (auto *Inst = dyn_cast<Instruction>(V);
            !Inst || !R.isDeleted(Inst)) {
          Ops[Idx] = V;
          ++Idx;
          if (Idx == ActualVF)
            break;
        }
      }
      // Not enough candidates - exit.
      if (Idx != ActualVF)
        break;
      LLVM_DEBUG(dbgs() << "SLP: Analyzing " << ActualVF << " operations "
                        << "\n");

      R.buildTree(Ops);
      if (R.isTreeTinyAndNotFullyVectorizable())
        continue;
      if (R.isProfitableToReorder()) {
        R.reorderTopToBottom();
        R.reorderBottomToTop();
      }
      R.transformNodes();
      R.buildExternalUses();

      R.computeMinimumValueSizes();
      InstructionCost Cost = R.getTreeCost();
      CandidateFound = true;
      MinCost = std::min(MinCost, Cost);

      LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
                        << " for VF=" << ActualVF << "\n");
      if (Cost < -SLPCostThreshold) {
        R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList",
                                            cast<Instruction>(Ops[0]))
                         << "SLP vectorized with cost " << ore::NV("Cost", Cost)
                         << " and with tree size "
                         << ore::NV("TreeSize", R.getTreeSize()));

        R.vectorizeTree();
        // Move to the next bundle.
        I += VF - 1;
        NextInst = I + 1;
        Changed = true;
      }
    }
  }
  if (!Changed && CandidateFound) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "NotBeneficial", I0)
             << "List vectorization was possible but not beneficial with cost "
             << ore::NV("Cost", MinCost) << " >= "
             << ore::NV("Treshold", -SLPCostThreshold);
    });
  } else if (!Changed) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "NotPossible", I0)
             << "Cannot SLP vectorize list: vectorization was impossible"
             << " with available vectorization factors";
    });
  }
  return Changed;
}
class HorizontalReduction {
  using ReductionOpsType = SmallVector<Value *, 16>;
  using ReductionOpsListType = SmallVector<ReductionOpsType, 2>;
  ReductionOpsListType ReductionOps;
  /// List of possibly reduced values.
  SmallVector<SmallVector<Value *>> ReducedVals;
  /// Maps reduced value to the corresponding reduction operation.
  SmallDenseMap<Value *, SmallVector<Instruction *>, 16> ReducedValsToOps;
  WeakTrackingVH ReductionRoot;
  /// The kind of reduction operation.
  RecurKind RdxKind;
  /// Checks if the optimization of original scalar identity operations on
  /// matched horizontal reductions is enabled and allowed.
  bool IsSupportedHorRdxIdentityOp = false;
  static bool isCmpSelMinMax(Instruction *I) {
    return match(I, m_Select(m_Cmp(), m_Value(), m_Value())) &&
           RecurrenceDescriptor::isMinMaxRecurrenceKind(getRdxKind(I));
  }

  // And/or are potentially poison-safe logical patterns like:
  // select x, y, false / select x, true, y.
  static bool isBoolLogicOp(Instruction *I) {
    return isa<SelectInst>(I) &&
           (match(I, m_LogicalAnd()) || match(I, m_LogicalOr()));
  }

  /// Checks if instruction is associative and can be vectorized.
  static bool isVectorizable(RecurKind Kind, Instruction *I,
                             bool TwoElementReduction = false) {
    if (Kind == RecurKind::None)
      return false;
    // Integer ops that map to select instructions or intrinsics are fine.
    if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind) ||
        isBoolLogicOp(I))
      return true;
    // No need to check for associativity, if there are only 2 reduced values.
    if (TwoElementReduction)
      return true;
    if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
      // FP min/max are associative except for NaN and -0.0.
      return I->getFastMathFlags().noNaNs();
    }
    if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum)
      return true;
    return I->isAssociative();
  }
  static Value *getRdxOperand(Instruction *I, unsigned Index) {
    // Poison-safe 'or' takes the form: select X, true, Y. To process operands
    // uniformly, skip the constant 'true' and return Y for operand 1.
    if (getRdxKind(I) == RecurKind::Or && isa<SelectInst>(I) && Index == 1)
      return I->getOperand(2);
    return I->getOperand(Index);
  }
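  // Illustration (added, not original text): for the poison-safe pattern
  //   %or = select i1 %x, i1 true, i1 %y
  // getRdxOperand(%or, 0) yields %x and getRdxOperand(%or, 1) yields %y, so
  // the select behaves like "or %x, %y" for the operand-walking code without
  // propagating poison from %y when %x is true.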
  /// Creates reduction operation with the current opcode.
  static Value *createOp(IRBuilderBase &Builder, RecurKind Kind, Value *LHS,
                         Value *RHS, const Twine &Name, bool UseSelect) {
    switch (Kind) {
    case RecurKind::Or:
      if (UseSelect &&
          LHS->getType() == CmpInst::makeCmpResultType(LHS->getType()))
        return Builder.CreateSelect(LHS, Builder.getTrue(), RHS, Name);
      return Builder.CreateBinOp(
          (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(Kind), LHS,
          RHS, Name);
    case RecurKind::And:
      if (UseSelect &&
          LHS->getType() == CmpInst::makeCmpResultType(LHS->getType()))
        return Builder.CreateSelect(LHS, RHS, Builder.getFalse(), Name);
      return Builder.CreateBinOp(
          (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(Kind), LHS,
          RHS, Name);
    case RecurKind::Add:
    case RecurKind::Mul:
    case RecurKind::Xor:
    case RecurKind::FAdd:
    case RecurKind::FMul:
      return Builder.CreateBinOp(
          (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(Kind), LHS,
          RHS, Name);
    case RecurKind::SMax:
    case RecurKind::SMin:
    case RecurKind::UMax:
    case RecurKind::UMin:
      if (UseSelect) {
        CmpInst::Predicate Pred = llvm::getMinMaxReductionPredicate(Kind);
        Value *Cmp = Builder.CreateICmp(Pred, LHS, RHS, Name);
        return Builder.CreateSelect(Cmp, LHS, RHS, Name);
      }
      [[fallthrough]];
    case RecurKind::FMax:
    case RecurKind::FMin:
    case RecurKind::FMaximum:
    case RecurKind::FMinimum:
    case RecurKind::FMaximumNum:
    case RecurKind::FMinimumNum:
      return Builder.CreateBinaryIntrinsic(
          llvm::getMinMaxReductionIntrinsicOp(Kind), LHS, RHS);
    default:
      llvm_unreachable("Unknown reduction operation.");
    }
  }
  /// Creates reduction operation with the current opcode with the IR flags
  /// from \p ReductionOps, dropping nuw/nsw flags.
  static Value *createOp(IRBuilderBase &Builder, RecurKind RdxKind, Value *LHS,
                         Value *RHS, const Twine &Name,
                         const ReductionOpsListType &ReductionOps) {
    bool UseSelect = ReductionOps.size() == 2 ||
                     // Logical or/and.
                     (ReductionOps.size() == 1 &&
                      any_of(ReductionOps.front(), IsaPred<SelectInst>));
    assert((!UseSelect || ReductionOps.size() != 2 ||
            isa<SelectInst>(ReductionOps[1][0])) &&
           "Expected cmp + select pairs for reduction");
    Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name, UseSelect);
    propagateIRFlags(Op, ReductionOps.back(), nullptr,
                     /*IncludeWrapFlags=*/false);
    return Op;
  }
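  // Illustration (added): for a min/max reduction expressed as cmp + select,
  // ReductionOps holds two lists ({cmps}, {selects}) and UseSelect is true,
  // so createOp emits "%c = icmp slt %a, %b; select %c, %a, %b" instead of an
  // llvm.smin intrinsic; for a plain integer add reduction it holds a single
  // list of adds and a regular binop is created.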
  /// Attempts to recognize the reduction kind from the root instruction.
  static RecurKind getRdxKind(Value *V) {
    auto *I = dyn_cast<Instruction>(V);
    if (!I)
      return RecurKind::None;
    if (match(I, m_Add(m_Value(), m_Value())))
      return RecurKind::Add;
    if (match(I, m_Mul(m_Value(), m_Value())))
      return RecurKind::Mul;
    if (match(I, m_And(m_Value(), m_Value())) ||
        match(I, m_LogicalAnd(m_Value(), m_Value())))
      return RecurKind::And;
    if (match(I, m_Or(m_Value(), m_Value())) ||
        match(I, m_LogicalOr(m_Value(), m_Value())))
      return RecurKind::Or;
    if (match(I, m_Xor(m_Value(), m_Value())))
      return RecurKind::Xor;
    if (match(I, m_FAdd(m_Value(), m_Value())))
      return RecurKind::FAdd;
    if (match(I, m_FMul(m_Value(), m_Value())))
      return RecurKind::FMul;
    if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(), m_Value())))
      return RecurKind::FMax;
    if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(), m_Value())))
      return RecurKind::FMin;
    if (match(I, m_Intrinsic<Intrinsic::maximum>(m_Value(), m_Value())))
      return RecurKind::FMaximum;
    if (match(I, m_Intrinsic<Intrinsic::minimum>(m_Value(), m_Value())))
      return RecurKind::FMinimum;
    if (match(I, m_SMax(m_Value(), m_Value())))
      return RecurKind::SMax;
    if (match(I, m_SMin(m_Value(), m_Value())))
      return RecurKind::SMin;
    if (match(I, m_UMax(m_Value(), m_Value())))
      return RecurKind::UMax;
    if (match(I, m_UMin(m_Value(), m_Value())))
      return RecurKind::UMin;
    // Select(cmp(A, B), A, B) forms that were not canonicalized to min/max
    // intrinsics are classified by the compare predicate; anything else is
    // not treated as a reduction.
    CmpInst::Predicate Pred;
    if (match(I, m_Select(m_Cmp(Pred, m_Value(), m_Value()), m_Value(),
                          m_Value()))) {
      switch (Pred) {
      case CmpInst::ICMP_SGT:
      case CmpInst::ICMP_SGE:
        return RecurKind::SMax;
      case CmpInst::ICMP_SLT:
      case CmpInst::ICMP_SLE:
        return RecurKind::SMin;
      case CmpInst::ICMP_UGT:
      case CmpInst::ICMP_UGE:
        return RecurKind::UMax;
      case CmpInst::ICMP_ULT:
      case CmpInst::ICMP_ULE:
        return RecurKind::UMin;
      default:
        return RecurKind::None;
      }
    }
    return RecurKind::None;
  }
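  // Example (added): "%m = select i1 (icmp sgt i32 %a, %b), i32 %a, i32 %b"
  // is classified as RecurKind::SMax by the predicate switch above, while
  // "%s = add i32 %a, %b" is matched directly as RecurKind::Add.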
  /// Get the index of the first operand.
  static unsigned getFirstOperandIndex(Instruction *I) {
    return isCmpSelMinMax(I) ? 1 : 0;
  }

  /// Total number of operands in the reduction operation.
  static unsigned getNumberOfOperands(Instruction *I) {
    return isCmpSelMinMax(I) ? 3 : 2;
  }

  /// Checks if the instruction is in basic block \p BB.
  /// For a cmp+sel min/max reduction check that both ops are in \p BB.
  static bool hasSameParent(Instruction *I, BasicBlock *BB) {
    if (isCmpSelMinMax(I) || isBoolLogicOp(I)) {
      auto *Sel = cast<SelectInst>(I);
      auto *Cmp = dyn_cast<Instruction>(Sel->getCondition());
      return Sel->getParent() == BB && Cmp && Cmp->getParent() == BB;
    }
    return I->getParent() == BB;
  }

  /// Expected number of uses for reduction operations/reduced values.
  static bool hasRequiredNumberOfUses(bool IsCmpSelMinMax, Instruction *I) {
    if (IsCmpSelMinMax) {
      // SelectInst must be used twice while the condition op must have single
      // use only.
      if (auto *Sel = dyn_cast<SelectInst>(I))
        return Sel->hasNUses(2) && Sel->getCondition()->hasOneUse();
      return I->hasNUses(2);
    }
    // Arithmetic reduction operation must be used once only.
    return I->hasOneUse();
  }
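  // Illustration (added): in a chain like
  //   %c = icmp sgt i32 %x, %acc      ; 1 use (the select)
  //   %acc2 = select i1 %c, i32 %x, i32 %acc  ; 2 uses
  // each interior select feeds both the following compare and the following
  // select, hence the hasNUses(2) requirement, while a plain "add" chain only
  // feeds the next add, hence hasOneUse().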
  /// Initializes the list of reduction operations.
  void initReductionOps(Instruction *I) {
    if (isCmpSelMinMax(I))
      ReductionOps.assign(2, ReductionOpsType());
    else
      ReductionOps.assign(1, ReductionOpsType());
  }

  /// Add all reduction operations for the reduction instruction \p I.
  void addReductionOps(Instruction *I) {
    if (isCmpSelMinMax(I)) {
      ReductionOps[0].emplace_back(cast<SelectInst>(I)->getCondition());
      ReductionOps[1].emplace_back(I);
    } else {
      ReductionOps[0].emplace_back(I);
    }
  }

  static bool isGoodForReduction(ArrayRef<Value *> Data) {
    int Sz = Data.size();
    auto *I = dyn_cast<Instruction>(Data.front());
    return Sz > 1 || isConstant(Data.front()) ||
           (I && !isa<LoadInst>(I) && isValidForAlternation(I->getOpcode()));
  }
public:
  HorizontalReduction() = default;
  HorizontalReduction(Instruction *I, ArrayRef<Value *> Ops)
      : ReductionRoot(I), ReductionLimit(2) {
    RdxKind = HorizontalReduction::getRdxKind(I);
    ReductionOps.emplace_back().push_back(I);
    ReducedVals.emplace_back().assign(Ops.begin(), Ops.end());
    for (Value *V : Ops)
      ReducedValsToOps[V].push_back(I);
  }

  bool matchReductionForOperands() const {
    // Analyze "regular" integer/FP types for reductions - no target-specific
    // types or pointers.
    assert(ReductionRoot && "Reduction root is not set!");
    if (!isVectorizable(RdxKind, cast<Instruction>(ReductionRoot),
                        all_of(ReducedVals, [](ArrayRef<Value *> Ops) {
                          return Ops.size() == 2;
                        })))
      return false;

    return true;
  }
  bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root,
                                 ScalarEvolution &SE, const DataLayout &DL,
                                 const TargetLibraryInfo &TLI) {
    RdxKind = HorizontalReduction::getRdxKind(Root);
    if (!isVectorizable(RdxKind, Root))
      return false;

    // Though the ultimate reduction may have multiple uses, its condition
    // must have only a single use.
    if (auto *Sel = dyn_cast<SelectInst>(Root))
      if (!Sel->getCondition()->hasOneUse())
        return false;

    ReductionRoot = Root;

    // Iterate through all the operands of the possible reduction tree and
    // gather all the reduced values, sorting them by their value id.
    BasicBlock *BB = Root->getParent();
    bool IsCmpSelMinMax = isCmpSelMinMax(Root);
    SmallVector<std::pair<Instruction *, unsigned>> Worklist(
        1, std::make_pair(Root, 0));
    // Checks if the operands of the \p TreeN instruction are also reduction
    // operations or should be treated as reduced values.
    auto CheckOperands = [&](Instruction *TreeN,
                             SmallVectorImpl<Value *> &PossibleReducedVals,
                             SmallVectorImpl<Instruction *> &ReductionOps,
                             unsigned Level) {
      for (int I : reverse(seq<int>(getFirstOperandIndex(TreeN),
                                    getNumberOfOperands(TreeN)))) {
        Value *EdgeVal = getRdxOperand(TreeN, I);
        ReducedValsToOps[EdgeVal].push_back(TreeN);
        auto *EdgeInst = dyn_cast<Instruction>(EdgeVal);
        // If the edge is not an instruction, or it differs from the main
        // reduction opcode or has too many uses - possible reduced value.
        if (!EdgeInst || Level > RecursionMaxDepth ||
            getRdxKind(EdgeInst) != RdxKind ||
            IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
            !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
            !isVectorizable(RdxKind, EdgeInst) ||
            (R.isAnalyzedReductionRoot(EdgeInst) &&
             all_of(EdgeInst->operands(), IsaPred<Constant>))) {
          PossibleReducedVals.push_back(EdgeVal);
          continue;
        }
        ReductionOps.push_back(EdgeInst);
      }
    };
    // Try to regroup reduced values so that it gets more profitable to try to
    // reduce them. Values are grouped by their value ids, loads - by their
    // pointer operands.
    SmallMapVector<
        size_t, SmallMapVector<size_t, SmallMapVector<Value *, unsigned, 2>, 2>,
        8>
        PossibleReducedVals;
    initReductionOps(Root);
    DenseMap<std::pair<size_t, Value *>, SmallVector<LoadInst *>> LoadsMap;
    SmallSet<size_t, 2> LoadKeyUsed;

    auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
      Value *Ptr = getUnderlyingObject(LI->getPointerOperand());
      if (!LoadKeyUsed.insert(Key).second) {
        auto LIt = LoadsMap.find(std::make_pair(Key, Ptr));
        if (LIt != LoadsMap.end()) {
          for (LoadInst *RLI : LIt->second) {
            if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
                                LI->getType(), LI->getPointerOperand(), DL, SE,
                                /*StrictCheck=*/true))
              return hash_value(RLI->getPointerOperand());
          }
          for (LoadInst *RLI : LIt->second) {
            if (arePointersCompatible(RLI->getPointerOperand(),
                                      LI->getPointerOperand(), TLI))
              return hash_value(RLI->getPointerOperand());
          }
          if (LIt->second.size() > 2)
            return hash_value(LIt->second.back()->getPointerOperand());
        }
      }
      LoadsMap.try_emplace(std::make_pair(Key, Ptr))
          .first->second.push_back(LI);
      return hash_value(LI->getPointerOperand());
    };

    while (!Worklist.empty()) {
      auto [TreeN, Level] = Worklist.pop_back_val();
      SmallVector<Value *> PossibleRedVals;
      SmallVector<Instruction *> PossibleReductionOps;
      CheckOperands(TreeN, PossibleRedVals, PossibleReductionOps, Level);
      addReductionOps(TreeN);
      // Add reduction values. The values are sorted for better vectorization
      // results.
      for (Value *V : PossibleRedVals) {
        size_t Key, Idx;
        std::tie(Key, Idx) = generateKeySubkey(V, &TLI, GenerateLoadsSubkey,
                                               /*AllowAlternate=*/false);
        ++PossibleReducedVals[Key][Idx].try_emplace(V, 0).first->second;
      }
      for (Instruction *I : reverse(PossibleReductionOps))
        Worklist.emplace_back(I, I->getParent() == BB ? 0 : Level + 1);
    }
    auto PossibleReducedValsVect = PossibleReducedVals.takeVector();
    // Start the reduction from the longest possible sequences of reduced
    // values of the same kind.
    for (auto &PossibleReducedVals : PossibleReducedValsVect) {
      auto PossibleRedVals = PossibleReducedVals.second.takeVector();
      SmallVector<SmallVector<Value *>> PossibleRedValsVect;
      for (auto &Slice : PossibleRedVals) {
        PossibleRedValsVect.emplace_back();
        auto RedValsVect = Slice.second.takeVector();
        stable_sort(RedValsVect, llvm::less_second());
        for (const std::pair<Value *, unsigned> &Data : RedValsVect)
          PossibleRedValsVect.back().append(Data.second, Data.first);
      }
      stable_sort(PossibleRedValsVect, [](const auto &P1, const auto &P2) {
        return P1.size() > P2.size();
      });
      for (ArrayRef<Value *> Data : PossibleRedValsVect) {
        if (ReducedVals.empty()) {
          ReducedVals.emplace_back();
        } else if (!isGoodForReduction(Data)) {
          // Keep adjacent loads on the same base object together; otherwise
          // start a new group.
          auto *LI = dyn_cast<LoadInst>(Data.front());
          auto *LastLI = dyn_cast<LoadInst>(ReducedVals.back().front());
          if (!LI || !LastLI ||
              getUnderlyingObject(LI->getPointerOperand()) !=
                  getUnderlyingObject(LastLI->getPointerOperand()))
            ReducedVals.emplace_back();
        }
        ReducedVals.back().append(Data.rbegin(), Data.rend());
      }
    }
    // Sort the reduced value groups by the number of elements in each group.
    stable_sort(ReducedVals, [](ArrayRef<Value *> P1, ArrayRef<Value *> P2) {
      return P1.size() > P2.size();
    });
    return true;
  }
  /// Attempt to vectorize the tree found by matchAssociativeReduction.
  Value *tryToReduce(BoUpSLP &V, const DataLayout &DL,
                     TargetTransformInfo *TTI, const TargetLibraryInfo &TLI,
                     AssumptionCache *AC, DominatorTree &DT) {
    constexpr unsigned RegMaxNumber = 4;
    constexpr unsigned RedValsMaxNumber = 128;
    // If there are not enough "good" reduced values, mark the roots as
    // analyzed and bail out.
    if (unsigned NumReducedVals = std::accumulate(
            ReducedVals.begin(), ReducedVals.end(), 0,
            [](unsigned Num, ArrayRef<Value *> Vals) -> unsigned {
              if (!isGoodForReduction(Vals))
                return Num;
              return Num + Vals.size();
            });
        NumReducedVals < ReductionLimit) {
      for (ReductionOpsType &RdxOps : ReductionOps)
        for (Value *RdxOp : RdxOps)
          V.analyzedReductionRoot(cast<Instruction>(RdxOp));
      return nullptr;
    }

    IRBuilder<TargetFolder> Builder(ReductionRoot->getContext(),
                                    TargetFolder(DL));
    Builder.SetInsertPoint(cast<Instruction>(ReductionRoot));

    // Track the reduced values in case they are replaced by extractelement
    // because of the vectorization.
    DenseMap<Value *, WeakTrackingVH> TrackedVals(ReducedVals.size() *
                                                  ReducedVals.front().size());
    // The compare instruction of a min/max is the insertion point for new
    // instructions and may be replaced with a new compare instruction.
    auto &&GetCmpForMinMaxReduction = [](Instruction *RdxRootInst) {
      assert(isa<SelectInst>(RdxRootInst) &&
             "Expected min/max reduction to have select root instruction");
      Value *ScalarCond = cast<SelectInst>(RdxRootInst)->getCondition();
      assert(isa<Instruction>(ScalarCond) &&
             "Expected min/max reduction to have compare condition");
      return cast<Instruction>(ScalarCond);
    };
    bool AnyBoolLogicOp = any_of(ReductionOps.back(), [](Value *V) {
      return isBoolLogicOp(cast<Instruction>(V));
    });

    // Return new VectorizedTree, based on the previous value.
    auto GetNewVectorizedTree = [&](Value *VectorizedTree, Value *Res) {
      if (VectorizedTree) {
        // Combining poison-safe boolean logic requires freezing one side
        // unless it is already known to be the first operand of a bool logic
        // op or not an operand at all.
        if (AnyBoolLogicOp) {
          auto It = ReducedValsToOps.find(VectorizedTree);
          auto It1 = ReducedValsToOps.find(Res);
          if ((It == ReducedValsToOps.end() &&
               It1 == ReducedValsToOps.end()) ||
              (It != ReducedValsToOps.end() &&
               any_of(It->getSecond(), [&](Instruction *I) {
                 return isBoolLogicOp(I) &&
                        getRdxOperand(I, 0) == VectorizedTree;
               }))) {
            // Keep the operand order as-is.
          } else if (It1 != ReducedValsToOps.end() &&
                     any_of(It1->getSecond(), [&](Instruction *I) {
                       return isBoolLogicOp(I) && getRdxOperand(I, 0) == Res;
                     })) {
            std::swap(VectorizedTree, Res);
          } else {
            VectorizedTree = Builder.CreateFreeze(VectorizedTree);
          }
        }

        return createOp(Builder, RdxKind, VectorizedTree, Res, "op.rdx",
                        ReductionOps);
      }
      // Initialize the final value in the reduction.
      return Res;
    };
    SmallDenseSet<Value *> IgnoreList(ReductionOps.size() *
                                      ReductionOps.front().size());
    for (ReductionOpsType &RdxOps : ReductionOps)
      for (Value *RdxOp : RdxOps) {
        if (!RdxOp)
          continue;
        IgnoreList.insert(RdxOp);
      }
    // Intersect the fast-math flags from all reduction operations.
    FastMathFlags RdxFMF;
    RdxFMF.set();
    for (Value *U : IgnoreList)
      if (auto *FPMO = dyn_cast<FPMathOperator>(U))
        RdxFMF &= FPMO->getFastMathFlags();
    // Need to track reduced vals, they may be changed during vectorization of
    // subvectors.
    for (ArrayRef<Value *> Candidates : ReducedVals)
      for (Value *V : Candidates)
        TrackedVals.try_emplace(V, V);

    auto At = [](SmallMapVector<Value *, unsigned, 16> &MV,
                 Value *V) -> unsigned & {
      auto *It = MV.find(V);
      assert(It != MV.end() && "Unable to find given key.");
      return It->second;
    };

    DenseMap<Value *, unsigned> VectorizedVals(ReducedVals.size());
    // List of the values that were reduced in other trees as part of gather
    // nodes and thus requiring extract/gather/shuffle to a scalar value.
    SmallPtrSet<Value *, 4> RequiredExtract;
    WeakTrackingVH VectorizedTree = nullptr;
    bool CheckForReusedReductionOps = false;
    // Try to vectorize elements based on their type.
    SmallVector<InstructionsState> States;
    for (ArrayRef<Value *> RV : ReducedVals)
      States.push_back(getSameOpcode(RV, TLI));
    for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) {
      ArrayRef<Value *> OrigReducedVals = ReducedVals[I];
      InstructionsState S = States[I];
      SmallVector<Value *> Candidates;
      Candidates.reserve(2 * OrigReducedVals.size());
      DenseMap<Value *, Value *> TrackedToOrig(2 * OrigReducedVals.size());
      for (Value *ReducedVal : OrigReducedVals) {