#ifdef EXPENSIVE_CHECKS
using namespace std::placeholders;
#define SV_NAME "slp-vectorizer"
#define DEBUG_TYPE "SLP"
STATISTIC(NumVectorInstructions,
          "Number of vector instructions generated");
117 "Controls which SLP graphs should be vectorized.");
121 cl::desc(
"Run the SLP vectorization passes"));
125 cl::desc(
"Enable vectorization for wider vector utilization"));
129 cl::desc(
"Only vectorize if you gain more than this "
134 cl::desc(
"When true, SLP vectorizer bypasses profitability checks based on "
135 "heuristics and makes vectorization decision via cost modeling."));
139 cl::desc(
"Attempt to vectorize horizontal reductions"));
144 "Attempt to vectorize horizontal reductions feeding into a store"));
148 cl::desc(
"Improve the code quality by splitting alternate instructions"));
152 cl::desc(
"Attempt to vectorize for this register size in bits"));
156 cl::desc(
"Maximum SLP vectorization factor (0=unlimited)"));
164 cl::desc(
"Limit the size of the SLP scheduling region per block"));
168 cl::desc(
"Attempt to vectorize for this register size in bits"));
172 cl::desc(
"Limit the recursion depth when building a vectorizable tree"));
176 cl::desc(
"Only vectorize small trees if they are fully vectorizable"));
182 cl::desc(
"The maximum look-ahead depth for operand reordering scores"));
191 cl::desc(
"The maximum look-ahead depth for searching best rooting option"));
195 cl::desc(
"The minimum number of loads, which should be considered strided, "
196 "if the stride is > 1 or is runtime value"));
200 cl::desc(
"The maximum stride, considered to be profitable."));
204 cl::desc(
"Disable tree reordering even if it is "
205 "profitable. Used for testing only."));
209 cl::desc(
"Generate strided loads even if they are not "
210 "profitable. Used for testing only."));
214 cl::desc(
"Display the SLP trees with Graphviz"));
218 cl::desc(
"Try to vectorize with non-power-of-2 number of elements."));
223 cl::desc(
"Try to replace values with the idempotent instructions for "
224 "better vectorization."));
  Ty = Ty->getScalarType();
         !Ty->isPPC_FP128Ty();
    return SI->getValueOperand()->getType();
    return CI->getOperand(0)->getType();
    return IE->getOperand(1)->getType();
         "ScalableVectorType is not supported.");
  return VecTy->getNumElements();
                                       Type *Ty, unsigned Sz) {
  if (NumParts == 0 || NumParts >= Sz)
  if (NumParts == 0 || NumParts >= Sz)
  return (Sz / RegVF) * RegVF;
                               I * VecTyNumElements, VecTyNumElements)))
                  : Mask[I] * VecTyNumElements + J;
  unsigned SVNumElements =
  unsigned ShuffleMaskSize = SV->getShuffleMask().size();
  if (SVNumElements % ShuffleMaskSize != 0)
  unsigned GroupSize = SVNumElements / ShuffleMaskSize;
  if (GroupSize == 0 || (VL.size() % GroupSize) != 0)
  unsigned NumGroup = 0;
  for (size_t I = 0, E = VL.size(); I != E; I += GroupSize) {
    Value *Src = SV->getOperand(0);
      if (SV->getOperand(0) != Src)
      if (!SV->isExtractSubvectorMask(Index))
      ExpectedIndex.set(Index / ShuffleMaskSize);
    if (!ExpectedIndex.all())
  assert(NumGroup == (VL.size() / GroupSize) && "Unexpected number of groups");
  unsigned SVNumElements =
  unsigned AccumulateLength = 0;
  for (Value *V : VL) {
    for (int M : SV->getShuffleMask())
                           : AccumulateLength + M);
    AccumulateLength += SVNumElements;
  return std::min<unsigned>(PartNumElems, Size - Part * PartNumElems);
  OS << "Idx: " << Idx << ", ";
  OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]";
      if (BB != II->getParent())
  Value *FirstNonUndef = nullptr;
  for (Value *V : VL) {
    if (!FirstNonUndef) {
    if (V != FirstNonUndef)
  return FirstNonUndef != nullptr;
                          bool IsCopyable = false) {
    return Cmp->isCommutative();
    return BO->isCommutative() ||
           (BO->getOpcode() == Instruction::Sub &&
             if (match(U.getUser(),
                       m_ICmp(Pred, m_Specific(U.get()), m_Zero())) &&
                 (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE))
             auto *I = dyn_cast<BinaryOperator>(U.get());
             return match(U.getUser(),
                          m_Intrinsic<Intrinsic::abs>(
                              m_Specific(U.get()), m_ConstantInt(Flag))) &&
                    ((!IsCopyable && I && !I->hasNoSignedWrap()) ||
           (BO->getOpcode() == Instruction::FSub &&
             return match(U.getUser(),
                          m_Intrinsic<Intrinsic::fabs>(m_Specific(U.get())));
  return I->isCommutative();
                                 bool IsCopyable = false) {
         "The instruction is not commutative.");
    switch (BO->getOpcode()) {
    case Instruction::Sub:
    case Instruction::FSub:
  return I->isCommutableOperand(Op);
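// Illustrative example (a hedged sketch, not from the original file): the
// matchers above let a non-commutative `sub` be treated as commutative when
// every user is insensitive to the sign of the result, e.g. in IR:
//
//   %d = sub i32 %a, %b
//   %c = icmp eq i32 %d, 0   ; (a - b) == 0  <=>  (b - a) == 0
//
// Swapping %a and %b only flips the sign of %d, which icmp eq/ne against zero
// (and llvm.abs, modulo the no-signed-wrap caveat checked above) cannot
// observe.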
    constexpr unsigned IntrinsicNumOperands = 2;
    return IntrinsicNumOperands;
  return I->getNumOperands();
  static_assert(std::is_same_v<T, InsertElementInst> ||
                    std::is_same_v<T, ExtractElementInst>,
    if (CI->getValue().uge(VT->getNumElements()))
    Index *= VT->getNumElements();
    Index += CI->getZExtValue();
  Type *CurrentType = IV->getType();
  for (unsigned I : IV->indices()) {
      Index *= ST->getNumElements();
      CurrentType = ST->getElementType(I);
      Index *= AT->getNumElements();
      CurrentType = AT->getElementType();
  return std::all_of(It, VL.end(), [&](Value *V) {
    if (auto *CI = dyn_cast<CmpInst>(V))
      return BasePred == CI->getPredicate();
    if (auto *I = dyn_cast<Instruction>(V))
      return I->getOpcode() == Opcode;
    return isa<PoisonValue>(V);
      if (MaskArg == UseMask::UndefsAsMask)
      if (MaskArg == UseMask::FirstArg && Value < VF)
        UseMask.reset(Value);
      else if (MaskArg == UseMask::SecondArg && Value >= VF)
        UseMask.reset(Value - VF);
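// Illustrative example (assumption about the helper above): with two
// concatenated shuffle inputs of VF = 4 each, index Value = 5 selects element
// 1 of the second input. With MaskArg == UseMask::SecondArg the code clears
// bit (Value - VF) == 1 of UseMask; with MaskArg == UseMask::FirstArg the
// same index fails the Value < VF test and leaves UseMask untouched.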
template <bool IsPoisonOnly = false>
  using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
  if (!UseMask.empty()) {
        if (*Idx < UseMask.size() && !UseMask.test(*Idx))
    for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
      if (Constant *Elem = C->getAggregateElement(I))
            (UseMask.empty() || (I < UseMask.size() && !UseMask.test(I))))
static std::optional<TargetTransformInfo::ShuffleKind>
      std::accumulate(VL.begin(), VL.end(), 0u, [](unsigned S, Value *V) {
        auto *EI = dyn_cast<ExtractElementInst>(V);
        auto *VTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
        return std::max(S, VTy->getNumElements());
  Value *Vec1 = nullptr;
  Value *Vec2 = nullptr;
    Value *Vec = EE->getVectorOperand();
  ShuffleMode CommonShuffleMode = Unknown;
  for (unsigned I = 0, E = VL.size(); I < E; ++I) {
    auto *Vec = EI->getVectorOperand();
    if (Idx->getValue().uge(Size))
    unsigned IntIdx = Idx->getValue().getZExtValue();
    if (!Vec1 || Vec1 == Vec) {
    } else if (!Vec2 || Vec2 == Vec) {
    if (CommonShuffleMode == Permute)
    if (Mask[I] % Size != I) {
      CommonShuffleMode = Permute;
    CommonShuffleMode = Select;
  if (CommonShuffleMode == Select && Vec2)
  unsigned Opcode = E->getOpcode();
  assert((Opcode == Instruction::ExtractElement ||
          Opcode == Instruction::ExtractValue) &&
         "Expected extractelement or extractvalue instruction.");
  if (Opcode == Instruction::ExtractElement) {
      return CI->getZExtValue();
  if (EI->getNumIndices() != 1)
  return *EI->idx_begin();
class BinOpSameOpcodeHelper {
  using MaskType = std::uint_fast16_t;
  constexpr static std::initializer_list<unsigned> SupportedOp = {
      Instruction::Add, Instruction::Sub, Instruction::Mul, Instruction::Shl,
      Instruction::AShr, Instruction::And, Instruction::Or, Instruction::Xor};
    MainOpBIT = 0b100000000,
  static std::pair<ConstantInt *, unsigned>
  isBinOpWithConstantInt(const Instruction *I) {
    unsigned Opcode = I->getOpcode();
    if (Opcode == Instruction::Sub || Opcode == Instruction::Shl ||
        Opcode == Instruction::AShr)
  struct InterchangeableInfo {
    MaskType Mask = MainOpBIT | XorBIT | OrBIT | AndBIT | SubBIT | AddBIT |
                    MulBIT | AShrBIT | ShlBIT;
    MaskType SeenBefore = 0;
    InterchangeableInfo(const Instruction *I) : I(I) {}
    bool trySet(MaskType OpcodeInMaskForm, MaskType InterchangeableMask) {
      if (Mask & InterchangeableMask) {
        SeenBefore |= OpcodeInMaskForm;
        Mask &= InterchangeableMask;
    bool equal(unsigned Opcode) {
      return Opcode == I->getOpcode() && trySet(MainOpBIT, MainOpBIT);
      MaskType Candidate = Mask & SeenBefore;
      if (Candidate & MainOpBIT)
        return I->getOpcode();
      if (Candidate & ShlBIT)
        return Instruction::Shl;
      if (Candidate & AShrBIT)
        return Instruction::AShr;
      if (Candidate & MulBIT)
        return Instruction::Mul;
      if (Candidate & AddBIT)
        return Instruction::Add;
      if (Candidate & SubBIT)
        return Instruction::Sub;
      if (Candidate & AndBIT)
        return Instruction::And;
      if (Candidate & OrBIT)
        return Instruction::Or;
      if (Candidate & XorBIT)
        return Instruction::Xor;
    bool hasCandidateOpcode(unsigned Opcode) const {
      MaskType Candidate = Mask & SeenBefore;
      case Instruction::Shl:
        return Candidate & ShlBIT;
      case Instruction::AShr:
        return Candidate & AShrBIT;
      case Instruction::Mul:
        return Candidate & MulBIT;
      case Instruction::Add:
        return Candidate & AddBIT;
      case Instruction::Sub:
        return Candidate & SubBIT;
      case Instruction::And:
        return Candidate & AndBIT;
      case Instruction::Or:
        return Candidate & OrBIT;
      case Instruction::Xor:
        return Candidate & XorBIT;
      case Instruction::LShr:
      case Instruction::FAdd:
      case Instruction::FSub:
      case Instruction::FMul:
      case Instruction::SDiv:
      case Instruction::UDiv:
      case Instruction::FDiv:
      case Instruction::SRem:
      case Instruction::URem:
      case Instruction::FRem:
      unsigned FromOpcode = I->getOpcode();
      if (FromOpcode == ToOpcode)
      auto [CI, Pos] = isBinOpWithConstantInt(I);
      const APInt &FromCIValue = CI->getValue();
      unsigned FromCIValueBitWidth = FromCIValue.getBitWidth();
      switch (FromOpcode) {
      case Instruction::Shl:
        if (ToOpcode == Instruction::Mul) {
        assert(FromCIValue.isZero() && "Cannot convert the instruction.");
        ToCIValue = ToOpcode == Instruction::And
                        : APInt::getZero(FromCIValueBitWidth);
      case Instruction::Mul:
        if (ToOpcode == Instruction::Shl) {
          ToCIValue = APInt(FromCIValueBitWidth, FromCIValue.logBase2());
          assert(FromCIValue.isOne() && "Cannot convert the instruction.");
          ToCIValue = ToOpcode == Instruction::And
                          : APInt::getZero(FromCIValueBitWidth);
      case Instruction::Add:
      case Instruction::Sub:
        if (FromCIValue.isZero()) {
                 "Cannot convert the instruction.");
          ToCIValue = FromCIValue;
      case Instruction::And:
          ToCIValue = ToOpcode == Instruction::Mul
                          : APInt::getZero(FromCIValueBitWidth);
        assert(FromCIValue.isZero() && "Cannot convert the instruction.");
      Value *LHS = I->getOperand(1 - Pos);
          ConstantInt::get(I->getOperand(Pos)->getType(), ToCIValue);
          ((FromOpcode == Instruction::Add || FromOpcode == Instruction::Or ||
            FromOpcode == Instruction::Xor) &&
           ToOpcode == Instruction::Sub))
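  // Illustrative example (sketch, not in the original): rewriting `mul %x, 4`
  // as the interchangeable `shl %x, 2` recomputes the constant operand as
  // ToCIValue = log2(4) == 2, which is what the FromCIValue.logBase2() branch
  // above produces for the Mul -> Shl case.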
  InterchangeableInfo MainOp;
  InterchangeableInfo AltOp;
    return ::isValidForAlternation(MainOp.I->getOpcode()) &&
  bool initializeAltOp(const Instruction *I) {
  BinOpSameOpcodeHelper(const Instruction *MainOp,
                        const Instruction *AltOp = nullptr)
      : MainOp(MainOp), AltOp(AltOp) {
  bool add(const Instruction *I) {
           "BinOpSameOpcodeHelper only accepts BinaryOperator.");
    unsigned Opcode = I->getOpcode();
    MaskType OpcodeInMaskForm;
    case Instruction::Shl:
      OpcodeInMaskForm = ShlBIT;
    case Instruction::AShr:
      OpcodeInMaskForm = AShrBIT;
    case Instruction::Mul:
      OpcodeInMaskForm = MulBIT;
    case Instruction::Add:
      OpcodeInMaskForm = AddBIT;
    case Instruction::Sub:
      OpcodeInMaskForm = SubBIT;
    case Instruction::And:
      OpcodeInMaskForm = AndBIT;
    case Instruction::Or:
      OpcodeInMaskForm = OrBIT;
    case Instruction::Xor:
      OpcodeInMaskForm = XorBIT;
      return MainOp.equal(Opcode) ||
             (initializeAltOp(I) && AltOp.equal(Opcode));
    MaskType InterchangeableMask = OpcodeInMaskForm;
    ConstantInt *CI = isBinOpWithConstantInt(I).first;
      constexpr MaskType CanBeAll =
          XorBIT | OrBIT | AndBIT | SubBIT | AddBIT | MulBIT | AShrBIT | ShlBIT;
      const APInt &CIValue = CI->getValue();
      case Instruction::Shl:
        InterchangeableMask = CIValue.isZero() ? CanBeAll : MulBIT | ShlBIT;
      case Instruction::Mul:
        if (CIValue.isOne()) {
          InterchangeableMask = CanBeAll;
        InterchangeableMask = MulBIT | ShlBIT;
      case Instruction::Add:
      case Instruction::Sub:
        InterchangeableMask = CIValue.isZero() ? CanBeAll : SubBIT | AddBIT;
      case Instruction::And:
          InterchangeableMask = CanBeAll;
      case Instruction::Xor:
        InterchangeableMask = XorBIT | OrBIT | SubBIT | AddBIT;
        InterchangeableMask = CanBeAll;
    return MainOp.trySet(OpcodeInMaskForm, InterchangeableMask) ||
           (initializeAltOp(I) &&
            AltOp.trySet(OpcodeInMaskForm, InterchangeableMask));
  unsigned getMainOpcode() const { return MainOp.getOpcode(); }
  bool hasCandidateOpcode(unsigned Opcode) const {
    return MainOp.hasCandidateOpcode(Opcode);
  bool hasAltOp() const { return AltOp.I; }
  unsigned getAltOpcode() const {
    return hasAltOp() ? AltOp.getOpcode() : getMainOpcode();
    return MainOp.getOperand(I);
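// Illustrative usage sketch (hypothetical values, not from the original
// file): feeding `add %x, 0` and `shl %y, 1` into the helper narrows the
// interchangeable mask step by step:
//
//   BinOpSameOpcodeHelper Helper(Add);
//   Helper.add(Add); // add with constant 0: keeps the full "can be all" mask
//   Helper.add(Shl); // shl with constant 1: mask narrows to {Mul, Shl}
//
// getMainOpcode() then picks, per the priority order in getOpcode(), a single
// opcode that every collected instruction can be rewritten to.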
class InstructionsState {
  bool HasCopyables = false;
    assert(valid() && "InstructionsState is invalid.");
    assert(valid() && "InstructionsState is invalid.");
  unsigned getOpcode() const { return getMainOp()->getOpcode(); }
  unsigned getAltOpcode() const { return getAltOp()->getOpcode(); }
  bool isAltShuffle() const { return getMainOp() != getAltOp(); }
  Instruction *getMatchingMainOpOrAltOp(Instruction *I) const {
    assert(MainOp && "MainOp cannot be nullptr.");
    if (I->getOpcode() == MainOp->getOpcode())
    assert(AltOp && "AltOp cannot be nullptr.");
    if (I->getOpcode() == AltOp->getOpcode())
    if (!I->isBinaryOp())
    BinOpSameOpcodeHelper Converter(MainOp);
    if (isAltShuffle() &&
        !Converter.hasCandidateOpcode(MainOp->getOpcode())) {
      BinOpSameOpcodeHelper AltConverter(AltOp);
      if (AltConverter.add(I) && AltConverter.add(AltOp) &&
          AltConverter.hasCandidateOpcode(AltOp->getOpcode()))
    if (Converter.hasAltOp() && !isAltShuffle())
    return Converter.hasAltOp() ? AltOp : MainOp;
  bool isShiftOp() const {
    return getMainOp()->isShift() && getAltOp()->isShift();
    return getMainOp()->isBitwiseLogicOp() && getAltOp()->isBitwiseLogicOp();
  bool isMulDivLikeOp() const {
    constexpr std::array<unsigned, 8> MulDiv = {
        Instruction::Mul, Instruction::FMul, Instruction::SDiv,
        Instruction::UDiv, Instruction::FDiv, Instruction::SRem,
        Instruction::URem, Instruction::FRem};
  bool isAddSubLikeOp() const {
    constexpr std::array<unsigned, 4> AddSub = {
        Instruction::Add, Instruction::Sub, Instruction::FAdd,
  bool isCmpOp() const {
    return (getOpcode() == Instruction::ICmp ||
  bool valid() const { return MainOp && AltOp; }
  explicit operator bool() const { return valid(); }
  InstructionsState() = delete;
  InstructionsState(Instruction *MainOp, Instruction *AltOp,
                    bool HasCopyables = false)
      : MainOp(MainOp), AltOp(AltOp), HasCopyables(HasCopyables) {}
  static InstructionsState invalid() { return {nullptr, nullptr}; }
  bool isCopyableElement(Value *V) const {
    assert(valid() && "InstructionsState is invalid.");
    if (isAltShuffle() || getOpcode() == Instruction::GetElementPtr)
    if (I->getParent() != MainOp->getParent() &&
    if (I->getOpcode() == MainOp->getOpcode())
    if (!I->isBinaryOp())
    BinOpSameOpcodeHelper Converter(MainOp);
  bool isNonSchedulable(Value *V) const {
    assert(valid() && "InstructionsState is invalid.");
    if (getMainOp() == V)
    if (isCopyableElement(V)) {
      auto IsNonSchedulableCopyableElement = [this](Value *V) {
        return !I || isa<PHINode>(I) ||
               I->getParent() != MainOp->getParent() ||
                !MainOp->comesBefore(I));
      return IsNonSchedulableCopyableElement(V);
  bool areInstructionsWithCopyableElements() const {
    assert(valid() && "InstructionsState is invalid.");
    return HasCopyables;
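// Illustrative example (a sketch, not from the original file): for the bundle
// { add %a0, %b0 ; sub %a1, %b1 ; add %a2, %b2 ; sub %a3, %b3 } the state
// carries MainOp = the first `add` and AltOp = the first `sub`,
// isAltShuffle() is true, and codegen emits both vector opcodes plus a
// shufflevector selecting the add lanes and the sub lanes.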
std::pair<Instruction *, SmallVector<Value *>>
  Instruction *SelectedOp = S.getMatchingMainOpOrAltOp(I);
  assert(SelectedOp && "Cannot convert the instruction.");
  if (I->isBinaryOp()) {
    return std::make_pair(SelectedOp, Converter.getOperand(SelectedOp));
  for (Value *V : VL) {
    if (Inst->getOpcode() == Opcode)
           BaseOp0 == Op0 || BaseOp1 == Op1 ||
         "Assessing comparisons of different types?");
  return (BasePred == Pred &&
         (BasePred == SwappedPred &&
    return InstructionsState::invalid();
    return InstructionsState::invalid();
      (VL.size() == 2 && InstCnt < 2))
    return InstructionsState::invalid();
  unsigned AltOpcode = Opcode;
  BinOpSameOpcodeHelper BinOpHelper(MainOp);
  bool SwappedPredsCompatible = IsCmpOp && [&]() {
    UniquePreds.insert(BasePred);
    UniqueNonSwappedPreds.insert(BasePred);
    for (Value *V : VL) {
      UniqueNonSwappedPreds.insert(CurrentPred);
      if (!UniquePreds.contains(CurrentPred) &&
          !UniquePreds.contains(SwappedCurrentPred))
        UniquePreds.insert(CurrentPred);
    return UniqueNonSwappedPreds.size() > 2 && UniquePreds.size() == 2;
    return InstructionsState::invalid();
  bool AnyPoison = InstCnt != VL.size();
    if (AnyPoison && (I->isIntDivRem() || I->isFPDivRem() || isa<CallInst>(I)))
      return InstructionsState::invalid();
    unsigned InstOpcode = I->getOpcode();
      if (BinOpHelper.add(I))
      Value *Op1 = I->getOperand(0);
      if (InstOpcode == Opcode || InstOpcode == AltOpcode)
      if (Opcode == AltOpcode) {
               "Cast isn't safe for alternation, logic needs to be updated!");
        AltOpcode = InstOpcode;
      Type *Ty0 = BaseInst->getOperand(0)->getType();
      Type *Ty1 = Inst->getOperand(0)->getType();
        assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
        assert(InstOpcode == AltOpcode &&
               "Alternate instructions are only supported by BinaryOperator "
        if ((VL.size() == 2 || SwappedPredsCompatible) &&
            (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
        if (MainOp != AltOp) {
        } else if (BasePred != CurrentPred) {
                 "CmpInst isn't safe for alternation, logic needs to be updated!");
          if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
              AltPred == CurrentPred || AltPred == SwappedCurrentPred)
    } else if (InstOpcode == Opcode) {
      assert(InstOpcode == AltOpcode &&
             "Alternate instructions are only supported by BinaryOperator and "
        if (Gep->getNumOperands() != 2 ||
          return InstructionsState::invalid();
          return InstructionsState::invalid();
        if (!LI->isSimple() || !BaseLI->isSimple())
          return InstructionsState::invalid();
          return InstructionsState::invalid();
        if (Call->hasOperandBundles() &&
            !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
                        Call->op_begin() + Call->getBundleOperandsEndIndex(),
          return InstructionsState::invalid();
          return InstructionsState::invalid();
          if (Mappings.size() != BaseMappings.size() ||
              Mappings.front().ISA != BaseMappings.front().ISA ||
              Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
              Mappings.front().VectorName != BaseMappings.front().VectorName ||
              Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
              Mappings.front().Shape.Parameters !=
                  BaseMappings.front().Shape.Parameters)
            return InstructionsState::invalid();
    return InstructionsState::invalid();
  assert(MainOp && "Cannot find MainOp with Opcode from BinOpHelper.");
  assert(AltOp && "Cannot find AltOp with Opcode from BinOpHelper.");
         "Incorrect implementation of allSameOpcode.");
  InstructionsState S(MainOp, AltOp);
         "Invalid InstructionsState.");
  return all_of(VL, [&](Value *V) { return V->getType() == Ty; });
  unsigned Opcode = UserInst->getOpcode();
  case Instruction::Load: {
  case Instruction::Store: {
    return (SI->getPointerOperand() == Scalar);
  case Instruction::Call: {
      return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index(), TTI) &&
             Arg.value().get() == Scalar;
    return LI->isSimple();
    return SI->isSimple();
    return !MI->isVolatile();
                             bool ExtendingManyInputs = false) {
  if (SubMask.empty())
          (!ExtendingManyInputs || SubMask.size() > Mask.size() ||
         "SubMask with many inputs support must be larger than the mask.");
    Mask.append(SubMask.begin(), SubMask.end());
  int TermValue = std::min(Mask.size(), SubMask.size());
  for (int I = 0, E = SubMask.size(); I < E; ++I) {
        (!ExtendingManyInputs &&
         (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)))
    NewMask[I] = Mask[SubMask[I]];
  const size_t Sz = Order.size();
  for (unsigned I = 0; I < Sz; ++I) {
      UnusedIndices.reset(Order[I]);
      MaskedIndices.set(I);
  if (MaskedIndices.none())
         "Non-synced masked/available indices.");
    assert(Idx >= 0 && "Indices must be synced.");
                                unsigned Opcode0, unsigned Opcode1) {
      OpcodeMask.set(Lane * ScalarTyNumElements,
                     Lane * ScalarTyNumElements + ScalarTyNumElements);
         "Expected scalar constants.");
    std::fill_n(NewVal.begin() + I * VF, VF, V);
  const unsigned E = Indices.size();
  for (unsigned I = 0; I < E; ++I)
    Mask[Indices[I]] = I;
  assert(!Mask.empty() && "Expected non-empty mask.");
  for (unsigned I = 0, E = Prev.size(); I < E; ++I)
    Scalars[Mask[I]] = Prev[I];
    auto *IO = dyn_cast<Instruction>(V);
    return isa<PHINode>(IO) || IO->getParent() != I->getParent();
  return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(UsesLimit) &&
           auto *IU = dyn_cast<Instruction>(U);
           return IU->getParent() != I->getParent() || isa<PHINode>(IU);
  return !VL.empty() &&
  return NumParts > 0 && NumParts < Sz && has_single_bit(Sz / NumParts) &&
    const unsigned Limit = std::numeric_limits<unsigned>::max()) {
  unsigned NumParts = TTI.getNumberOfParts(VecTy);
  if (NumParts == 0 || NumParts >= Limit)
  if (NumParts >= Sz || Sz % NumParts != 0 ||
  class ScheduleEntity;
  class ScheduleCopyableData;
  class ScheduleBundle;
  struct StridedPtrInfo {
    Value *StrideVal = nullptr;
    const SCEV *StrideSCEV = nullptr;
      : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li), DT(Dt),
        AC(AC), DB(DB), DL(DL), ORE(ORE),
    MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
      ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales = {});
                 const SmallDenseSet<Value *> &UserIgnoreLst);
    assert(!VectorizableTree.empty() && "No graph to get the first node from");
    return VectorizableTree.front()->Scalars;
    const TreeEntry &Root = *VectorizableTree.front();
    if (Root.State != TreeEntry::Vectorize || Root.isAltShuffle() ||
        !Root.Scalars.front()->getType()->isIntegerTy())
      return std::nullopt;
    auto It = MinBWs.find(&Root);
    if (It != MinBWs.end())
    if (Root.getOpcode() == Instruction::ZExt ||
        Root.getOpcode() == Instruction::SExt)
      return std::make_pair(cast<CastInst>(Root.getMainOp())->getSrcTy(),
                            Root.getOpcode() == Instruction::SExt);
    return std::nullopt;
    return MinBWs.at(VectorizableTree.front().get()).second;
    if (ReductionBitWidth == 0 ||
        !VectorizableTree.front()->Scalars.front()->getType()->isIntegerTy() ||
        ReductionBitWidth >=
            DL->getTypeSizeInBits(
                VectorizableTree.front()->Scalars.front()->getType()))
          VectorizableTree.front()->Scalars.front()->getType(),
          VectorizableTree.front()->getVectorFactor());
          VectorizableTree.front()->Scalars.front()->getContext(),
          VectorizableTree.front()->getVectorFactor());
    VectorizableTree.clear();
    ScalarToTreeEntries.clear();
    DeletedNodes.clear();
    TransformedToGatherNodes.clear();
    OperandsToTreeEntry.clear();
    ScalarsInSplitNodes.clear();
    NonScheduledFirst.clear();
    EntryToLastInstruction.clear();
    LastInstructionToPos.clear();
    LoadEntriesToVectorize.clear();
    IsGraphTransformMode = false;
    GatheredLoadsEntriesFirst.reset();
    CompressEntryToData.clear();
    ExternalUses.clear();
    ExternalUsesAsOriginalScalar.clear();
    ExternalUsesWithNonUsers.clear();
    for (auto &Iter : BlocksSchedules) {
      BlockScheduling *BS = Iter.second.get();
    ReductionBitWidth = 0;
    CastMaxMinBWSizes.reset();
    ExtraBitWidthNodes.clear();
    InstrElementSize.clear();
    UserIgnoreList = nullptr;
    PostponedGathers.clear();
    ValueToGatherNodes.clear();
    TreeEntryToStridedPtrInfoMap.clear();
    assert(!Order.empty() && "expected non-empty order");
    const unsigned Sz = Order.size();
      return P.value() == P.index() || P.value() == Sz;
                                   bool IgnoreReorder);
  std::optional<OrdersType>
    return MaxVecRegSize;
    return MinVecRegSize;
    unsigned MaxVF = MaxVFOption.getNumOccurrences()
                         ? MaxVFOption
                         : TTI->getMaximumVF(ElemWidth, Opcode);
    return MaxVF ? MaxVF : UINT_MAX;
                   Align Alignment, const int64_t Diff,
                   const size_t Sz) const;
                   Value *Ptr0, Value *PtrN, StridedPtrInfo &SPtrInfo) const;
                        Align CommonAlignment,
                        StridedPtrInfo &SPtrInfo) const;
                              StridedPtrInfo &SPtrInfo,
                              unsigned *BestVF = nullptr,
                              bool TryRecursiveCheck = true) const;
    ListOfKnonwnNonVectorizableLoads.insert(hash_value(VL));
  template <typename T>
    return ListOfKnonwnNonVectorizableLoads.contains(hash_value(VL));
      OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
         << " EdgeIdx:" << EdgeIdx << "}";
        : TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes),
          MaxLevel(MaxLevel) {}
      auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) {
        auto AllUsersVectorized = [U1, U2, this](Value *V) {
            return U == U1 || U == U2 || R.isVectorized(U);
        return AllUsersVectorized(V1) && AllUsersVectorized(V2);
      if (R.TTI->isLegalBroadcastLoad(V1->getType(),
          ((int)V1->getNumUses() == NumLanes ||
           AllUsersAreInternal(V1, V2)))
      auto CheckSameEntryOrFail = [&]() {
            any_of(TEs2, [&](TreeEntry *E) { return Set.contains(E); }))
        if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
          return CheckSameEntryOrFail();
            LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
            LI2->getPointerOperand(), DL, SE, true);
        if (!Dist || *Dist == 0) {
              R.TTI->isLegalMaskedGather(
            return CheckSameEntryOrFail();
        if (std::abs(*Dist) > NumLanes / 2)
      Value *EV2 = nullptr;
        int Dist = Idx2 - Idx1;
        if (std::abs(Dist) == 0)
        if (std::abs(Dist) > NumLanes / 2)
        return CheckSameEntryOrFail();
      if (I1->getParent() != I2->getParent())
        return CheckSameEntryOrFail();
          (S.getMainOp()->getNumOperands() <= 2 || !MainAltOps.empty() ||
           !S.isAltShuffle()) &&
              S.getMainOp()->getNumOperands();
      return CheckSameEntryOrFail();
    int ShallowScoreAtThisLevel =
    if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
        (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
                            ShallowScoreAtThisLevel))
      return ShallowScoreAtThisLevel;
    assert(I1 && I2 && "Should have early exited.");
    for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
         OpIdx1 != NumOperands1; ++OpIdx1) {
      int MaxTmpScore = 0;
      unsigned MaxOpIdx2 = 0;
      bool FoundBest = false;
              ? I2->getNumOperands()
              : std::min(I2->getNumOperands(), OpIdx1 + 1);
      assert(FromIdx <= ToIdx && "Bad index");
      for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
        if (Op2Used.count(OpIdx2))
            I1, I2, CurrLevel + 1, {});
            TmpScore > MaxTmpScore) {
          MaxTmpScore = TmpScore;
        Op2Used.insert(MaxOpIdx2);
        ShallowScoreAtThisLevel += MaxTmpScore;
    return ShallowScoreAtThisLevel;
  struct OperandData {
    OperandData() = default;
    OperandData(Value *V, bool APO, bool IsUsed)
        : V(V), APO(APO), IsUsed(IsUsed) {}
    bool IsUsed = false;
  enum class ReorderingMode {
  unsigned ArgSize = 0;
  const Loop *L = nullptr;
  OperandData &getData(unsigned OpIdx, unsigned Lane) {
    return OpsVec[OpIdx][Lane];
  const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
    return OpsVec[OpIdx][Lane];
    for (unsigned OpIdx = 0, NumOperands = getNumOperands();
      for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
        OpsVec[OpIdx][Lane].IsUsed = false;
  void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
    std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
  int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx,
    Value *IdxLaneV = getData(Idx, Lane).V;
    unsigned UniquesCount = Uniques.size();
    auto IdxIt = Uniques.find(IdxLaneV);
    unsigned UniquesCntWithIdxLaneV =
        IdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
    auto OpIdxIt = Uniques.find(OpIdxLaneV);
    unsigned UniquesCntWithOpIdxLaneV =
        OpIdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
    if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
    return std::min(bit_ceil(UniquesCntWithOpIdxLaneV) -
                        UniquesCntWithOpIdxLaneV,
                    UniquesCntWithOpIdxLaneV -
                        ((IdxIt != Uniques.end() && UsedLanes.test(IdxIt->second))
                             ? UniquesCntWithIdxLaneV -
                                   bit_floor(UniquesCntWithIdxLaneV)
                             : bit_ceil(UniquesCntWithIdxLaneV) -
                                   UniquesCntWithIdxLaneV);
  int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
    Value *IdxLaneV = getData(Idx, Lane).V;
    return R.areAllUsersVectorized(IdxLaneI)
  static const int ScoreScaleFactor = 10;
                        int Lane, unsigned OpIdx, unsigned Idx,
      int SplatScore = getSplatScore(Lane, OpIdx, Idx, UsedLanes);
      if (Score <= -SplatScore) {
        Score += SplatScore;
        Score *= ScoreScaleFactor;
        Score += getExternalUseScore(Lane, OpIdx, Idx);
  std::optional<unsigned>
  getBestOperand(unsigned OpIdx, int Lane, int LastLane,
    unsigned NumOperands = getNumOperands();
    Value *OpLastLane = getData(OpIdx, LastLane).V;
    ReorderingMode RMode = ReorderingModes[OpIdx];
    if (RMode == ReorderingMode::Failed)
      return std::nullopt;
    bool OpIdxAPO = getData(OpIdx, Lane).APO;
    std::optional<unsigned> Idx;
        BestScoresPerLanes.try_emplace(std::make_pair(OpIdx, Lane), 0)
    bool IsUsed = RMode == ReorderingMode::Splat ||
                  RMode == ReorderingMode::Constant ||
                  RMode == ReorderingMode::Load;
    for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
      OperandData &OpData = getData(Idx, Lane);
      bool OpAPO = OpData.APO;
      if (OpAPO != OpIdxAPO)
      case ReorderingMode::Load:
      case ReorderingMode::Opcode: {
        bool LeftToRight = Lane > LastLane;
        Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
        Value *OpRight = (LeftToRight) ? Op : OpLastLane;
        int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
                                      OpIdx, Idx, IsUsed, UsedLanes);
        if (Score > static_cast<int>(BestOp.Score) ||
            (Score > 0 && Score == static_cast<int>(BestOp.Score) &&
          BestOp.Score = Score;
          BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
      case ReorderingMode::Constant:
            (!BestOp.Score && L && L->isLoopInvariant(Op))) {
          BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
      case ReorderingMode::Splat:
          IsUsed = Op == OpLastLane;
        if (Op == OpLastLane) {
          BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
      case ReorderingMode::Failed:
      getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
    return std::nullopt;
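  // Illustrative walk-through (a sketch, not from the original): with lanes
  // L0 = a0 + b0 and L1 = b1 + a1, reordering tries to line operands up
  // column-wise. If the look-ahead score of pairing (a0, a1) beats (a0, b1),
  // getBestOperand() returns the index of a1 for lane 1 and the caller swaps
  // it into a0's operand column, yielding the vector-friendly form
  // {a0 + b0, a1 + b1}.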
  unsigned getBestLaneToStartReordering() const {
    unsigned Min = UINT_MAX;
    unsigned SameOpNumber = 0;
    for (int I = getNumLanes(); I > 0; --I) {
      unsigned Lane = I - 1;
      OperandsOrderData NumFreeOpsHash =
          getMaxNumOperandsThatCanBeReordered(Lane);
      if (NumFreeOpsHash.NumOfAPOs < Min) {
        Min = NumFreeOpsHash.NumOfAPOs;
        SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
        HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
      } else if (NumFreeOpsHash.NumOfAPOs == Min &&
                 NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
        SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
        HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
      } else if (NumFreeOpsHash.NumOfAPOs == Min &&
                 NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
        auto [It, Inserted] =
            HashMap.try_emplace(NumFreeOpsHash.Hash, 1, Lane);
    unsigned BestLane = 0;
    unsigned CntMin = UINT_MAX;
      if (Data.second.first < CntMin) {
        CntMin = Data.second.first;
        BestLane = Data.second.second;
  struct OperandsOrderData {
    unsigned NumOfAPOs = UINT_MAX;
    unsigned NumOpsWithSameOpcodeParent = 0;
  OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
    unsigned CntTrue = 0;
    unsigned NumOperands = getNumOperands();
    bool AllUndefs = true;
    unsigned NumOpsWithSameOpcodeParent = 0;
      const OperandData &OpData = getData(OpIdx, Lane);
          I->getParent() != Parent) {
        if (NumOpsWithSameOpcodeParent == 0) {
          NumOpsWithSameOpcodeParent = 1;
          Parent = I->getParent();
          --NumOpsWithSameOpcodeParent;
        ++NumOpsWithSameOpcodeParent;
    OperandsOrderData Data;
    Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
    Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
                      const InstructionsState &S) {
      return VL.size() == getNumLanes();
           "Expected same number of lanes");
    assert(S.valid() && "InstructionsState is invalid.");
    OpsVec.resize(ArgSize);
    unsigned NumLanes = VL.size();
    for (OperandDataVec &Ops : OpsVec)
      Ops.resize(NumLanes);
        OpsVec[OpIdx][Lane] = {Operands[OpIdx][Lane], true, false};
      bool IsInverseOperation = false;
      if (S.isCopyableElement(VL[Lane])) {
        IsInverseOperation =
        assert(I && "Expected instruction");
        auto [SelectedOp, Ops] = convertTo(I, S);
        bool APO = (OpIdx == 0) ? false : IsInverseOperation;
        OpsVec[OpIdx][Lane] = {Operands[OpIdx][Lane], APO, false};
  unsigned getNumOperands() const { return ArgSize; }
  unsigned getNumLanes() const { return OpsVec[0].size(); }
  Value *getValue(unsigned OpIdx, unsigned Lane) const {
    return getData(OpIdx, Lane).V;
  bool empty() const { return OpsVec.empty(); }
  void clear() { OpsVec.clear(); }
  bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
           "Op is expected to be getValue(OpIdx, Lane).");
    bool OpAPO = getData(OpIdx, Lane).APO;
    bool IsInvariant = L && L->isLoopInvariant(Op);
    for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
      bool FoundCandidate = false;
      for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
        OperandData &Data = getData(OpI, Ln);
        if (Data.APO != OpAPO || Data.IsUsed)
        Value *OpILane = getValue(OpI, Lane);
             L->isLoopInvariant(Data.V))) {
          FoundCandidate = true;
      if (!FoundCandidate)
    return getNumLanes() == 2 || Cnt > 1;
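  // Illustrative note (assumption): shouldBroadcast() fires when one value
  // recurs as an operand across lanes, e.g. {x + y0, x + y1, x + y2, x + y3};
  // splatting x into a single vector operand is then cheaper than gathering
  // four distinct scalars.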
3293 "Op is expected to be getValue(OpIdx, Lane).");
3294 bool OpAPO = getData(
OpIdx, Lane).APO;
3295 for (
unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
3299 const OperandData &
Data = getData(OpI, Ln);
3300 if (
Data.APO != OpAPO ||
Data.IsUsed)
3302 Value *OpILn = getValue(OpI, Ln);
3303 return (L && L->isLoopInvariant(OpILn)) ||
3315 const InstructionsState &S,
const BoUpSLP &R)
3316 : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R),
3317 L(R.LI->getLoopFor(S.getMainOp()->
getParent())) {
3319 appendOperands(RootVL, Operands, S);
3327 "Expected same num of lanes across all operands");
3328 for (
unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
3329 OpVL[Lane] = OpsVec[
OpIdx][Lane].V;
3337 unsigned NumOperands = getNumOperands();
3338 unsigned NumLanes = getNumLanes();
3358 unsigned FirstLane = getBestLaneToStartReordering();
3367 if (shouldBroadcast(OpLane0,
OpIdx, FirstLane) ||
3368 !canBeVectorized(OpILane0,
OpIdx, FirstLane))
3369 ReorderingModes[
OpIdx] = ReorderingMode::Splat;
3371 ReorderingModes[
OpIdx] = ReorderingMode::Load;
3373 ReorderingModes[
OpIdx] = ReorderingMode::Opcode;
3375 ReorderingModes[
OpIdx] = ReorderingMode::Constant;
3378 ReorderingModes[
OpIdx] = ReorderingMode::Splat;
3388 auto &&SkipReordering = [
this]() {
3391 for (
const OperandData &
Data : Op0)
3394 ArrayRef(OpsVec).slice(1, getNumOperands() - 1)) {
3395 if (
any_of(
Op, [&UniqueValues](
const OperandData &
Data) {
3402 return UniqueValues.
size() != 2 &&
3404 UniqueValues.
size());
3416 if (SkipReordering())
3419 bool StrategyFailed =
false;
3427 for (
unsigned I = 0;
I < NumOperands; ++
I)
3428 MainAltOps[
I].push_back(getData(
I, FirstLane).V);
3431 UsedLanes.
set(FirstLane);
3432 for (
unsigned Distance = 1; Distance != NumLanes; ++Distance) {
3434 for (
int Direction : {+1, -1}) {
3435 int Lane = FirstLane + Direction * Distance;
3436 if (Lane < 0 || Lane >= (
int)NumLanes)
3438 UsedLanes.
set(Lane);
3439 int LastLane = Lane - Direction;
3440 assert(LastLane >= 0 && LastLane < (
int)NumLanes &&
3445 std::optional<unsigned> BestIdx =
3446 getBestOperand(
OpIdx, Lane, LastLane, ReorderingModes,
3447 MainAltOps[
OpIdx], UsedLanes);
3454 swap(
OpIdx, *BestIdx, Lane);
3457 StrategyFailed =
true;
3461 OperandData &AltOp = getData(
OpIdx, Lane);
3462 InstructionsState OpS =
3464 if (OpS && OpS.isAltShuffle())
3471 if (!StrategyFailed)
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    case ReorderingMode::Load:
    case ReorderingMode::Opcode:
    case ReorderingMode::Constant:
    case ReorderingMode::Splat:
    case ReorderingMode::Failed:
    const unsigned Indent = 2;
    for (const OperandDataVec &OpDataVec : OpsVec) {
      OS << "Operand " << Cnt++ << "\n";
      for (const OperandData &OpData : OpDataVec) {
        OS.indent(Indent) << "{";
        if (Value *V = OpData.V)
        OS << ", APO:" << OpData.APO << "}\n";
    int BestScore = Limit;
    std::optional<int> Index;
    for (int I : seq<int>(0, Candidates.size())) {
                                               Candidates[I].second,
      if (Score > BestScore) {
    DeletedInstructions.insert(I);
  template <typename T>
      ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales) {
    for (T *V : DeadVals) {
    for (T *V : DeadVals) {
      if (!V || !Processed.insert(V).second)
      for (Use &U : I->operands()) {
            OpI && !DeletedInstructions.contains(OpI) && OpI->hasOneUser() &&
            (Entries.empty() || none_of(Entries, [&](const TreeEntry *Entry) {
              return Entry->VectorizedValue == OpI;
      I->dropAllReferences();
    for (T *V : DeadVals) {
      if (!I->getParent())
            cast<Instruction>(U.getUser()));
             "trying to erase instruction with users.");
      I->removeFromParent();
    while (!DeadInsts.empty()) {
      if (!VI || !VI->getParent())
             "Live instruction found in dead worklist!");
      assert(VI->use_empty() && "Instructions with uses are not dead.");
      for (Use &OpU : VI->operands()) {
        Value *OpV = OpU.get();
        if (!DeletedInstructions.contains(OpI) &&
            (!OpI->getType()->isVectorTy() ||
             none_of(VectorValuesAndScales,
                     [&](const std::tuple<Value *, unsigned, bool> &V) {
                       return std::get<0>(V) == OpI;
      VI->removeFromParent();
      SE->forgetValue(VI);
    return AnalyzedReductionsRoots.count(I);
    AnalyzedReductionsRoots.insert(I);
    return AnalyzedReductionVals.contains(hash_value(VL));
    AnalyzedReductionVals.insert(hash_value(VL));
    AnalyzedReductionsRoots.clear();
    AnalyzedReductionVals.clear();
    AnalyzedMinBWVals.clear();
    return MustGather.contains(V);
    return NonScheduledFirst.contains(V);
    assert(V && "V cannot be nullptr.");
    return ScalarToTreeEntries.contains(V);
  bool collectValuesToDemote(
      const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
      bool &IsProfitableToDemote, bool IsTruncRoot) const;
  void buildReorderableOperands(
  void reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const;
  bool areAllUsersVectorized(
  const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const;
  TreeEntry *getOperandEntry(TreeEntry *E, unsigned Idx) {
    return const_cast<TreeEntry *>(
        getOperandEntry(const_cast<const TreeEntry *>(E), Idx));
  Instruction *getRootEntryInstruction(const TreeEntry &Entry) const;
  getCastContextHint(const TreeEntry &TE) const;
                           const InstructionsState &LocalState,
                           unsigned InterleaveFactor = 0);
                     bool ResizeAllowed = false) const;
  Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx);
  template <typename BVTy, typename ResTy, typename... Args>
  ResTy processBuildVector(const TreeEntry *E, Type *ScalarTy, Args &...Params);
  Value *createBuildVector(const TreeEntry *E, Type *ScalarTy);
  Instruction &getLastInstructionInBundle(const TreeEntry *E);
  std::optional<TargetTransformInfo::ShuffleKind>
                            unsigned NumParts) const;
  std::optional<TargetTransformInfo::ShuffleKind>
  isGatherShuffledSingleRegisterEntry(
  isGatherShuffledEntry(
      unsigned NumParts, bool ForOrder = false);
                   Type *ScalarTy) const;
  void setInsertPointAfterBundle(const TreeEntry *E);
  bool isFullyVectorizableTinyTree(bool ForReduction) const;
  void tryToVectorizeGatheredLoads(
      std::tuple<BasicBlock *, Value *, Type *>,
  collectUserStores(const BoUpSLP::TreeEntry *TE) const;
  findExternalStoreUsersReorderIndices(TreeEntry *TE) const;
  void reorderGatherNode(TreeEntry &TE);
    TreeEntry(VecTreeTy &Container) : Container(Container) {}
    SmallVector<int> getCommonMask() const {
      if (State == TreeEntry::SplitVectorize)
      SmallVector<int> Mask;
    SmallVector<int> getSplitMask() const {
      assert(State == TreeEntry::SplitVectorize && !ReorderIndices.empty() &&
             "Expected only split vectorize node.");
      unsigned CommonVF = std::max<unsigned>(
          CombinedEntriesWithIndices.back().second,
          Scalars.size() - CombinedEntriesWithIndices.back().second);
      for (auto [Idx, I] : enumerate(ReorderIndices))
            Idx + (Idx >= CombinedEntriesWithIndices.back().second
                       ? CommonVF - CombinedEntriesWithIndices.back().second
    void reorderSplitNode(unsigned Idx, ArrayRef<int> Mask,
                          ArrayRef<int> MaskOrder);
      if (Mask.size() != VL.size() && VL.size() == Scalars.size())
        return std::equal(VL.begin(), VL.end(), Scalars.begin());
                        [Scalars](Value *V, int Idx) {
                          return (isa<UndefValue>(V) &&
                                  Idx == PoisonMaskElem) ||
                                 (Idx != PoisonMaskElem && V == Scalars[Idx]);
      if (!ReorderIndices.empty()) {
        SmallVector<int> Mask;
        if (VL.size() == Scalars.size())
          return IsSame(Scalars, Mask);
        if (VL.size() == ReuseShuffleIndices.size()) {
          return IsSame(Scalars, Mask);
      return IsSame(Scalars, ReuseShuffleIndices);
    bool hasEqualOperands(const TreeEntry &TE) const {
      if (TE.getNumOperands() != getNumOperands())
      SmallBitVector Used(getNumOperands());
      for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {
        unsigned PrevCount = Used.count();
        for (unsigned K = 0; K < E; ++K) {
          if (getOperand(K) == TE.getOperand(I)) {
        if (PrevCount == Used.count())
    unsigned getVectorFactor() const {
      if (!ReuseShuffleIndices.empty())
        return ReuseShuffleIndices.size();
      return Scalars.size();
    bool isGather() const { return State == NeedToGather; }
    WeakTrackingVH VectorizedValue = nullptr;
    enum CombinedOpcode {
      MinMax = Instruction::OtherOpsEnd + 1,
    CombinedOpcode CombinedOp = NotCombinedOp;
    SmallVector<int, 4> ReuseShuffleIndices;
    SmallVector<unsigned, 4> ReorderIndices;
    VecTreeTy &Container;
    EdgeInfo UserTreeIndex;
    SmallVector<ValueList, 2> Operands;
    SmallPtrSet<const Value *, 4> CopyableElements;
    InstructionsState S = InstructionsState::invalid();
    unsigned InterleaveFactor = 0;
    bool DoesNotNeedToSchedule = false;
      if (Operands.size() < OpIdx + 1)
        Operands.resize(OpIdx + 1);
             "Number of operands is greater than the number of scalars.");
    unsigned getInterleaveFactor() const { return InterleaveFactor; }
    void setInterleave(unsigned Factor) { InterleaveFactor = Factor; }
    void setDoesNotNeedToSchedule() { DoesNotNeedToSchedule = true; }
    bool doesNotNeedToSchedule() const { return DoesNotNeedToSchedule; }
        setOperand(I, Operands[I]);
    void reorderOperands(ArrayRef<int> Mask) {
      return Operands[OpIdx];
      return Operands[OpIdx];
    unsigned getNumOperands() const { return Operands.size(); }
    Value *getSingleOperand(unsigned OpIdx) const {
      return Operands[OpIdx][0];
    bool isAltShuffle() const { return S.isAltShuffle(); }
    Instruction *getMatchingMainOpOrAltOp(Instruction *I) const {
      return S.getMatchingMainOpOrAltOp(I);
      if (I && getMatchingMainOpOrAltOp(I))
      return S.getMainOp();
    void setOperations(const InstructionsState &S) {
      assert(S && "InstructionsState is invalid.");
    Instruction *getMainOp() const { return S.getMainOp(); }
    Instruction *getAltOp() const { return S.getAltOp(); }
    unsigned getOpcode() const { return S.getOpcode(); }
    unsigned getAltOpcode() const { return S.getAltOpcode(); }
    bool hasState() const { return S.valid(); }
    void addCopyableElement(Value *V) {
      assert(S.isCopyableElement(V) && "Not a copyable element.");
      CopyableElements.insert(V);
    bool isCopyableElement(Value *V) const {
      return CopyableElements.contains(V);
    bool hasCopyableElements() const { return !CopyableElements.empty(); }
    const InstructionsState &getOperations() const { return S; }
    unsigned findLaneForValue(Value *V) const {
      unsigned FoundLane = getVectorFactor();
      for (auto *It = find(Scalars, V), *End = Scalars.end(); It != End;
           std::advance(It, 1)) {
        FoundLane = std::distance(Scalars.begin(), It);
        assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
        if (!ReorderIndices.empty())
          FoundLane = ReorderIndices[FoundLane];
        assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
        if (ReuseShuffleIndices.empty())
        if (auto *RIt = find(ReuseShuffleIndices, FoundLane);
            RIt != ReuseShuffleIndices.end()) {
          FoundLane = std::distance(ReuseShuffleIndices.begin(), RIt);
      assert(FoundLane < getVectorFactor() && "Unable to find given value.");
    buildAltOpShuffleMask(const function_ref<bool(Instruction *)> IsAltOp,
                          SmallVectorImpl<int> &Mask,
                          SmallVectorImpl<Value *> *OpScalars = nullptr,
                          SmallVectorImpl<Value *> *AltScalars = nullptr) const;
    bool isNonPowOf2Vec() const {
      return IsNonPowerOf2;
    hasNonWholeRegisterOrNonPowerOf2Vec(const TargetTransformInfo &TTI) const {
      assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) &&
             "Reshuffling not supported with non-power-of-2 vectors yet.");
      return IsNonPowerOf2;
    Value *getOrdered(unsigned Idx) const {
      if (ReorderIndices.empty())
        return Scalars[Idx];
      SmallVector<int> Mask;
      return Scalars[Mask[Idx]];
      dbgs() << Idx << ".\n";
      for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
        dbgs() << "Operand " << OpI << ":\n";
        for (const Value *V : Operands[OpI])
      dbgs() << "Scalars: \n";
      for (Value *V : Scalars)
      dbgs() << "State: ";
      if (S && hasCopyableElements())
        dbgs() << "[[Copyable]] ";
        if (InterleaveFactor > 0) {
          dbgs() << "Vectorize with interleave factor " << InterleaveFactor
          dbgs() << "Vectorize\n";
      case ScatterVectorize:
        dbgs() << "ScatterVectorize\n";
      case StridedVectorize:
        dbgs() << "StridedVectorize\n";
      case CompressVectorize:
        dbgs() << "CompressVectorize\n";
        dbgs() << "NeedToGather\n";
      case CombinedVectorize:
        dbgs() << "CombinedVectorize\n";
      case SplitVectorize:
        dbgs() << "SplitVectorize\n";
        dbgs() << "MainOp: " << *S.getMainOp() << "\n";
        dbgs() << "AltOp: " << *S.getAltOp() << "\n";
        dbgs() << "MainOp: NULL\n";
        dbgs() << "AltOp: NULL\n";
      dbgs() << "VectorizedValue: ";
      if (VectorizedValue)
        dbgs() << *VectorizedValue << "\n";
      dbgs() << "ReuseShuffleIndices: ";
      if (ReuseShuffleIndices.empty())
        for (int ReuseIdx : ReuseShuffleIndices)
          dbgs() << ReuseIdx << ", ";
      dbgs() << "ReorderIndices: ";
      for (unsigned ReorderIdx : ReorderIndices)
        dbgs() << ReorderIdx << ", ";
      dbgs() << "UserTreeIndex: ";
        dbgs() << UserTreeIndex;
        dbgs() << "<invalid>";
      if (!CombinedEntriesWithIndices.empty()) {
        dbgs() << "Combined entries: ";
          dbgs() << "Entry index " << P.first << " with offset " << P.second;
              StringRef Banner) const {
      dbgs() << "SLP: " << Banner << ":\n";
      dbgs() << "SLP: Costs:\n";
      dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n";
      dbgs() << "SLP: VectorCost = " << VecCost << "\n";
      dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n";
      dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = "
             << ReuseShuffleCost + VecCost - ScalarCost << "\n";
                          const InstructionsState &S,
                          ArrayRef<int> ReuseShuffleIndices = {}) {
    auto Invalid = ScheduleBundle::invalid();
    return newTreeEntry(VL, Invalid, S, UserTreeIdx, ReuseShuffleIndices);
                          const InstructionsState &S,
                          ArrayRef<int> ReuseShuffleIndices = {},
                          ArrayRef<unsigned> ReorderIndices = {},
                          unsigned InterleaveFactor = 0) {
    TreeEntry::EntryState EntryState =
        Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
    TreeEntry *E = newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
                                ReuseShuffleIndices, ReorderIndices);
    if (E && InterleaveFactor > 0)
      E->setInterleave(InterleaveFactor);
                          TreeEntry::EntryState EntryState,
                          ScheduleBundle &Bundle, const InstructionsState &S,
                          ArrayRef<int> ReuseShuffleIndices = {},
                          ArrayRef<unsigned> ReorderIndices = {}) {
    assert(((!Bundle && (EntryState == TreeEntry::NeedToGather ||
                         EntryState == TreeEntry::SplitVectorize)) ||
            (Bundle && EntryState != TreeEntry::NeedToGather &&
             EntryState != TreeEntry::SplitVectorize)) &&
           "Need to vectorize gather entry?");
    if (GatheredLoadsEntriesFirst.has_value() &&
        EntryState == TreeEntry::NeedToGather && S &&
        S.getOpcode() == Instruction::Load && UserTreeIdx.EdgeIdx == UINT_MAX &&
        !UserTreeIdx.UserTE)
    VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
    TreeEntry *Last = VectorizableTree.back().get();
    Last->Idx = VectorizableTree.size() - 1;
    Last->State = EntryState;
    if (UserTreeIdx.UserTE)
      OperandsToTreeEntry.try_emplace(
          std::make_pair(UserTreeIdx.UserTE, UserTreeIdx.EdgeIdx), Last);
            ReuseShuffleIndices.empty()) &&
           "Reshuffling scalars not yet supported for nodes with padding");
    Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
                                     ReuseShuffleIndices.end());
    if (ReorderIndices.empty()) {
      Last->setOperations(S);
      Last->Scalars.assign(VL.size(), nullptr);
                [VL](unsigned Idx) -> Value * {
                  if (Idx >= VL.size())
                    return UndefValue::get(VL.front()->getType());
      Last->setOperations(S);
      Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
    if (EntryState == TreeEntry::SplitVectorize) {
      assert(S && "Split nodes must have operations.");
      Last->setOperations(S);
      SmallPtrSet<Value *, 4> Processed;
      for (Value *V : VL) {
        auto It = ScalarsInSplitNodes.find(V);
        if (It == ScalarsInSplitNodes.end()) {
          ScalarsInSplitNodes.try_emplace(V).first->getSecond().push_back(Last);
          (void)Processed.insert(V);
        } else if (Processed.insert(V).second) {
                 "Value already associated with the node.");
          It->getSecond().push_back(Last);
    } else if (!Last->isGather()) {
          (!S.areInstructionsWithCopyableElements() &&
          all_of(VL, [&](Value *V) { return S.isNonSchedulable(V); }))
        Last->setDoesNotNeedToSchedule();
      SmallPtrSet<Value *, 4> Processed;
      for (Value *V : VL) {
        if (S.isCopyableElement(V)) {
          Last->addCopyableElement(V);
        auto It = ScalarToTreeEntries.find(V);
        if (It == ScalarToTreeEntries.end()) {
          ScalarToTreeEntries.try_emplace(V).first->getSecond().push_back(Last);
          (void)Processed.insert(V);
        } else if (Processed.insert(V).second) {
                 "Value already associated with the node.");
          It->getSecond().push_back(Last);
      assert((!Bundle.getBundle().empty() || Last->doesNotNeedToSchedule()) &&
             "Bundle and VL out of sync");
      if (!Bundle.getBundle().empty()) {
#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
        auto *BundleMember = Bundle.getBundle().begin();
        SmallPtrSet<Value *, 4> Processed;
        for (Value *V : VL) {
          if (S.isNonSchedulable(V) || !Processed.insert(V).second)
        assert(BundleMember == Bundle.getBundle().end() &&
               "Bundle and VL out of sync");
        Bundle.setTreeEntry(Last);
      bool AllConstsOrCasts = true;
      for (Value *V : VL) {
        if (S && S.areInstructionsWithCopyableElements() &&
            S.isCopyableElement(V))
          Last->addCopyableElement(V);
          AllConstsOrCasts &= I && I->getType()->isIntegerTy();
          if (UserTreeIdx.EdgeIdx != UINT_MAX || !UserTreeIdx.UserTE ||
              !UserTreeIdx.UserTE->isGather())
            ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last);
      if (AllConstsOrCasts)
            std::make_pair(std::numeric_limits<unsigned>::max(), 1);
      MustGather.insert_range(VL);
    if (UserTreeIdx.UserTE)
      Last->UserTreeIndex = UserTreeIdx;
  TreeEntry::VecTreeTy VectorizableTree;
    for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
      VectorizableTree[Id]->dump();
    assert(V && "V cannot be nullptr.");
    auto It = ScalarToTreeEntries.find(V);
    if (It == ScalarToTreeEntries.end())
    return It->getSecond();
    assert(V && "V cannot be nullptr.");
    auto It = ScalarsInSplitNodes.find(V);
    if (It == ScalarsInSplitNodes.end())
    return It->getSecond();
                                  bool SameVF = false) const {
    assert(V && "V cannot be nullptr.");
    for (TreeEntry *TE : ScalarToTreeEntries.lookup(V))
      if ((!SameVF || TE->getVectorFactor() == VL.size()) && TE->isSame(VL))
  bool areAltOperandsProfitable(const InstructionsState &S,
  class ScalarsVectorizationLegality {
    InstructionsState S;
    bool TryToFindDuplicates;
    bool TrySplitVectorize;
    ScalarsVectorizationLegality(InstructionsState S, bool IsLegal,
                                 bool TryToFindDuplicates = true,
                                 bool TrySplitVectorize = false)
        : S(S), IsLegal(IsLegal), TryToFindDuplicates(TryToFindDuplicates),
          TrySplitVectorize(TrySplitVectorize) {
      assert((!IsLegal || (S.valid() && TryToFindDuplicates)) &&
             "Inconsistent state");
    const InstructionsState &getInstructionsState() const { return S; }
    bool isLegal() const { return IsLegal; }
    bool tryToFindDuplicates() const { return TryToFindDuplicates; }
    bool trySplitVectorize() const { return TrySplitVectorize; }
  ScalarsVectorizationLegality
                                bool TryCopyableElementsVectorization) const;
  TreeEntry::EntryState getScalarsVectorizationState(
      bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
      SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo);
  SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarToTreeEntries;
  SmallPtrSet<const TreeEntry *, 8> DeletedNodes;
  SmallDenseMap<const TreeEntry *, InstructionCost> TransformedToGatherNodes;
  SmallDenseMap<std::pair<const TreeEntry *, unsigned>, TreeEntry *>
      OperandsToTreeEntry;
  SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarsInSplitNodes;
  SmallDenseMap<Value *, unsigned> InstrElementSize;
  SmallDenseMap<const TreeEntry *, WeakTrackingVH> EntryToLastInstruction;
  SmallDenseMap<const Instruction *, Instruction *> LastInstructionToPos;
  SetVector<const TreeEntry *> PostponedGathers;
  using ValueToGatherNodesMap =
      DenseMap<Value *, SmallSetVector<const TreeEntry *, 4>>;
  ValueToGatherNodesMap ValueToGatherNodes;
  SetVector<unsigned> LoadEntriesToVectorize;
  bool IsGraphTransformMode = false;
  std::optional<unsigned> GatheredLoadsEntriesFirst;
  SmallDenseMap<const TreeEntry *,
                std::tuple<SmallVector<int>, VectorType *, unsigned, bool>>
      CompressEntryToData;
4665 struct ExternalUser {
4666 ExternalUser(
Value *S, llvm::User *U,
const TreeEntry &E,
unsigned L)
4667 : Scalar(S), User(
U), E(E), Lane(
L) {}
4670 Value *Scalar =
nullptr;
4673 llvm::User *User =
nullptr;
4681 using UserList = SmallVector<ExternalUser, 16>;
4687 bool isAliased(
const MemoryLocation &Loc1, Instruction *Inst1,
4688 Instruction *Inst2) {
4691 AliasCacheKey
Key = std::make_pair(Inst1, Inst2);
4692 auto Res = AliasCache.try_emplace(
Key);
4694 return Res.first->second;
4695 bool Aliased =
isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1));
4697 Res.first->getSecond() = Aliased;
4701 using AliasCacheKey = std::pair<Instruction *, Instruction *>;
4705 SmallDenseMap<AliasCacheKey, bool> AliasCache;
4710 BatchAAResults BatchAA;
4717 DenseSet<Instruction *> DeletedInstructions;
4720 SmallPtrSet<Instruction *, 16> AnalyzedReductionsRoots;
4723 DenseSet<size_t> AnalyzedReductionVals;
4727 DenseSet<Value *> AnalyzedMinBWVals;
4733 UserList ExternalUses;
4737 SmallPtrSet<Value *, 4> ExternalUsesAsOriginalScalar;
4741 SmallPtrSet<Value *, 4> ExternalUsesWithNonUsers;
4744 SmallPtrSet<const Value *, 32> EphValues;
4748 SetVector<Instruction *> GatherShuffleExtractSeq;
4751 DenseSet<BasicBlock *> CSEBlocks;
4754 DenseSet<size_t> ListOfKnonwnNonVectorizableLoads;
4761 class ScheduleEntity {
4762 friend class ScheduleBundle;
4763 friend class ScheduleData;
4764 friend class ScheduleCopyableData;
4767 enum class Kind { ScheduleData, ScheduleBundle, ScheduleCopyableData };
4768 Kind getKind()
const {
return K; }
4769 ScheduleEntity(Kind K) : K(K) {}
4773 int SchedulingPriority = 0;
4776 bool IsScheduled =
false;
4778 const Kind K = Kind::ScheduleData;
4781 ScheduleEntity() =
delete;
4783 void setSchedulingPriority(
int Priority) { SchedulingPriority = Priority; }
4784 int getSchedulingPriority()
const {
return SchedulingPriority; }
4785 bool isReady()
const {
4787 return SD->isReady();
4789 return CD->isReady();
4795 bool hasValidDependencies()
const {
4797 return SD->hasValidDependencies();
4799 return CD->hasValidDependencies();
4803 int getUnscheduledDeps()
const {
4805 return SD->getUnscheduledDeps();
4807 return CD->getUnscheduledDeps();
4811 int incrementUnscheduledDeps(
int Incr) {
4813 return SD->incrementUnscheduledDeps(Incr);
4817 int getDependencies()
const {
4819 return SD->getDependencies();
4825 return SD->getInst();
4830 bool isScheduled()
const {
return IsScheduled; }
4831 void setScheduled(
bool Scheduled) { IsScheduled = Scheduled; }
4833 static bool classof(
const ScheduleEntity *) {
return true; }
4835#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4836 void dump(raw_ostream &OS)
const {
4838 return SD->dump(OS);
4840 return CD->dump(OS);
4851#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4853 const BoUpSLP::ScheduleEntity &SE) {
4863 class ScheduleData final :
public ScheduleEntity {
4867 enum { InvalidDeps = -1 };
4869 ScheduleData() : ScheduleEntity(Kind::ScheduleData) {}
4870 static bool classof(
const ScheduleEntity *Entity) {
4871 return Entity->getKind() == Kind::ScheduleData;
4874 void init(
int BlockSchedulingRegionID, Instruction *
I) {
4875 NextLoadStore =
nullptr;
4876 IsScheduled =
false;
4877 SchedulingRegionID = BlockSchedulingRegionID;
4878 clearDependencies();
4884 if (hasValidDependencies()) {
4885 assert(UnscheduledDeps <= Dependencies &&
"invariant");
4887 assert(UnscheduledDeps == Dependencies &&
"invariant");
4891 assert(hasValidDependencies() && UnscheduledDeps == 0 &&
4892 "unexpected scheduled state");
4899 bool hasValidDependencies()
const {
return Dependencies != InvalidDeps; }
4903 bool isReady()
const {
return UnscheduledDeps == 0 && !IsScheduled; }
4908 int incrementUnscheduledDeps(
int Incr) {
4909 assert(hasValidDependencies() &&
4910 "increment of unscheduled deps would be meaningless");
4911 UnscheduledDeps += Incr;
4912 assert(UnscheduledDeps >= 0 &&
4913 "Expected valid number of unscheduled deps");
4914 return UnscheduledDeps;
4919 void resetUnscheduledDeps() { UnscheduledDeps = Dependencies; }
4922 void clearDependencies() {
4923 clearDirectDependencies();
4924 MemoryDependencies.clear();
4925 ControlDependencies.clear();
4932 void clearDirectDependencies() {
4933 Dependencies = InvalidDeps;
4934 resetUnscheduledDeps();
4935 IsScheduled =
false;
4939 int getUnscheduledDeps()
const {
return UnscheduledDeps; }
4941 int getDependencies()
const {
return Dependencies; }
4943 void initDependencies() { Dependencies = 0; }
4945 void incDependencies() { Dependencies++; }
4948 int getSchedulingRegionID()
const {
return SchedulingRegionID; }
4955 return MemoryDependencies;
4958 void addMemoryDependency(ScheduleData *Dep) {
4959 MemoryDependencies.push_back(Dep);
4963 return ControlDependencies;
4966 void addControlDependency(ScheduleData *Dep) {
4967 ControlDependencies.push_back(Dep);
4970 ScheduleData *getNextLoadStore()
const {
return NextLoadStore; }
4971 void setNextLoadStore(ScheduleData *
Next) { NextLoadStore =
Next; }
4973 void dump(raw_ostream &OS)
const { OS << *Inst; }
4985 ScheduleData *NextLoadStore =
nullptr;
4989 SmallVector<ScheduleData *> MemoryDependencies;
4995 SmallVector<ScheduleData *> ControlDependencies;
4999 int SchedulingRegionID = 0;
5005 int Dependencies = InvalidDeps;
5011 int UnscheduledDeps = InvalidDeps;
5016 const BoUpSLP::ScheduleData &SD) {
5022 class ScheduleBundle final :
public ScheduleEntity {
5026 bool IsValid =
true;
5028 TreeEntry *TE =
nullptr;
5029 ScheduleBundle(
bool IsValid)
5030 : ScheduleEntity(Kind::ScheduleBundle), IsValid(IsValid) {}
5033 ScheduleBundle() : ScheduleEntity(Kind::ScheduleBundle) {}
5034 static bool classof(
const ScheduleEntity *Entity) {
5035 return Entity->getKind() == Kind::ScheduleBundle;
5040 for (
const ScheduleEntity *SD : Bundle) {
5041 if (SD->hasValidDependencies()) {
5042 assert(SD->getUnscheduledDeps() <= SD->getDependencies() &&
5045 assert(SD->getUnscheduledDeps() == SD->getDependencies() &&
5049 if (isScheduled()) {
5050 assert(SD->hasValidDependencies() && SD->getUnscheduledDeps() == 0 &&
5051 "unexpected scheduled state");
5057 int unscheduledDepsInBundle()
const {
5058 assert(*
this &&
"bundle must not be empty");
5060 for (
const ScheduleEntity *BundleMember : Bundle) {
5061 if (BundleMember->getUnscheduledDeps() == ScheduleData::InvalidDeps)
5062 return ScheduleData::InvalidDeps;
5063 Sum += BundleMember->getUnscheduledDeps();
5071 bool hasValidDependencies()
const {
5072 return all_of(Bundle, [](
const ScheduleEntity *SD) {
5073 return SD->hasValidDependencies();
5079 bool isReady()
const {
5080 assert(*
this &&
"bundle must not be empty");
5081 return unscheduledDepsInBundle() == 0 && !isScheduled();
5089 void add(ScheduleEntity *SD) { Bundle.push_back(SD); }
5092 void setTreeEntry(TreeEntry *TE) { this->TE = TE; }
5093 TreeEntry *getTreeEntry()
const {
return TE; }
5095 static ScheduleBundle invalid() {
return {
false}; }
5097 operator bool()
const {
return IsValid; }
5100 void dump(raw_ostream &OS)
const {
5109 OS << *SD->getInst();
5123 const BoUpSLP::ScheduleBundle &Bundle) {
5134 class ScheduleCopyableData final :
public ScheduleEntity {
5141 int SchedulingRegionID = 0;
5143 ScheduleBundle &Bundle;
5146 ScheduleCopyableData(
int BlockSchedulingRegionID,
Instruction *
I,
5147 const EdgeInfo &EI, ScheduleBundle &Bundle)
5148 : ScheduleEntity(Kind::ScheduleCopyableData), Inst(
I), EI(EI),
5149 SchedulingRegionID(BlockSchedulingRegionID), Bundle(Bundle) {}
5150 static bool classof(
const ScheduleEntity *Entity) {
5151 return Entity->getKind() == Kind::ScheduleCopyableData;
5156 if (hasValidDependencies()) {
5157 assert(UnscheduledDeps <= Dependencies &&
"invariant");
5159 assert(UnscheduledDeps == Dependencies &&
"invariant");
5163 assert(hasValidDependencies() && UnscheduledDeps == 0 &&
5164 "unexpected scheduled state");
5171 bool hasValidDependencies()
const {
5172 return Dependencies != ScheduleData::InvalidDeps;
5177 bool isReady()
const {
return UnscheduledDeps == 0 && !IsScheduled; }
5182 int incrementUnscheduledDeps(
int Incr) {
5183 assert(hasValidDependencies() &&
5184 "increment of unscheduled deps would be meaningless");
5185 UnscheduledDeps += Incr;
5186 assert(UnscheduledDeps >= 0 &&
"invariant");
5187 return UnscheduledDeps;
5192 void resetUnscheduledDeps() { UnscheduledDeps = Dependencies; }
5195 int getUnscheduledDeps()
const {
return UnscheduledDeps; }
5197 int getDependencies()
const {
return Dependencies; }
5199 void initDependencies() { Dependencies = 0; }
5201 void incDependencies() { Dependencies++; }
5204 int getSchedulingRegionID()
const {
return SchedulingRegionID; }
5210 void clearDependencies() {
5211 Dependencies = ScheduleData::InvalidDeps;
5212 UnscheduledDeps = ScheduleData::InvalidDeps;
5213 IsScheduled =
false;
5217 const EdgeInfo &getEdgeInfo()
const {
return EI; }
5220 ScheduleBundle &getBundle() {
return Bundle; }
5221 const ScheduleBundle &getBundle()
const {
return Bundle; }
5223#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
5224 void dump(raw_ostream &OS)
const { OS <<
"[Copyable]" << *getInst(); }
5235 int Dependencies = ScheduleData::InvalidDeps;
5241 int UnscheduledDeps = ScheduleData::InvalidDeps;
5271 struct BlockScheduling {
5273 : BB(BB), ChunkSize(BB->
size()), ChunkPos(ChunkSize) {}
5276 ScheduledBundles.clear();
5277 ScheduledBundlesList.
clear();
5278 ScheduleCopyableDataMap.clear();
5279 ScheduleCopyableDataMapByInst.clear();
5280 ScheduleCopyableDataMapByInstUser.clear();
5281 ScheduleCopyableDataMapByUsers.clear();
5283 ScheduleStart =
nullptr;
5284 ScheduleEnd =
nullptr;
5285 FirstLoadStoreInRegion =
nullptr;
5286 LastLoadStoreInRegion =
nullptr;
5287 RegionHasStackSave =
false;
5291 ScheduleRegionSizeLimit -= ScheduleRegionSize;
5294 ScheduleRegionSize = 0;
5298 ++SchedulingRegionID;
5301 ScheduleData *getScheduleData(Instruction *
I) {
5304 if (BB !=
I->getParent())
5307 ScheduleData *SD = ScheduleDataMap.lookup(
I);
5308 if (SD && isInSchedulingRegion(*SD))
5313 ScheduleData *getScheduleData(
Value *V) {
5319 ScheduleCopyableData *getScheduleCopyableData(
const EdgeInfo &EI,
5320 const Value *V)
const {
5321 if (ScheduleCopyableDataMap.empty())
5323 auto It = ScheduleCopyableDataMap.find(std::make_pair(EI, V));
5324 if (It == ScheduleCopyableDataMap.end())
5326 ScheduleCopyableData *SD = It->getSecond().get();
5327 if (!isInSchedulingRegion(*SD))
5335 getScheduleCopyableData(
const Value *User,
unsigned OperandIdx,
5337 if (ScheduleCopyableDataMapByInstUser.empty())
5339 const auto It = ScheduleCopyableDataMapByInstUser.find(
5340 std::make_pair(std::make_pair(User, OperandIdx), V));
5341 if (It == ScheduleCopyableDataMapByInstUser.end())
5344 for (ScheduleCopyableData *SD : It->getSecond()) {
5345 if (isInSchedulingRegion(*SD))
5359 bool areAllOperandsReplacedByCopyableData(Instruction *User,
5363 if (ScheduleCopyableDataMap.empty())
5365 SmallDenseMap<TreeEntry *, unsigned> PotentiallyReorderedEntriesCount;
5367 if (Entries.
empty())
5369 unsigned CurNumOps = 0;
5370 for (
const Use &U :
User->operands()) {
5376 for (TreeEntry *TE : Entries) {
5378 bool IsNonSchedulableWithParentPhiNode =
5379 TE->doesNotNeedToSchedule() &&
TE->UserTreeIndex &&
5380 TE->UserTreeIndex.UserTE->hasState() &&
5381 TE->UserTreeIndex.UserTE->State != TreeEntry::SplitVectorize &&
5382 TE->UserTreeIndex.UserTE->getOpcode() == Instruction::PHI;
5385 if (IsNonSchedulableWithParentPhiNode) {
5386 SmallPtrSet<Value *, 4> ParentsUniqueUsers;
5387 const TreeEntry *ParentTE =
TE->UserTreeIndex.UserTE;
5388 for (
Value *V : ParentTE->Scalars) {
5392 if (ParentsUniqueUsers.
insert(
PHI).second &&
5397 Inc =
count(
TE->Scalars, User);
5405 bool IsCommutativeUser =
5408 if (!IsCommutativeUser) {
5418 (!IsCommutativeUser ||
5427 "Expected commutative user with 2 first commutable operands");
5428 bool IsCommutativeWithSameOps =
5429 IsCommutativeUser &&
User->getOperand(0) ==
User->getOperand(1);
5430 if ((!IsCommutativeUser || IsCommutativeWithSameOps) &&
5432 EdgeInfo EI(TE,
U.getOperandNo());
5433 if (CurNumOps !=
NumOps || getScheduleCopyableData(EI,
Op))
5437 PotentiallyReorderedEntriesCount.
try_emplace(TE, 0)
5438 .first->getSecond() += Inc;
5441 if (PotentiallyReorderedEntriesCount.
empty())
5444 for (
auto &
P : PotentiallyReorderedEntriesCount) {
5445 SmallPtrSet<Value *, 4> ParentsUniqueUsers;
5446 bool IsNonSchedulableWithParentPhiNode =
5447 P.first->doesNotNeedToSchedule() &&
P.first->UserTreeIndex &&
5448 P.first->UserTreeIndex.UserTE->hasState() &&
5449 P.first->UserTreeIndex.UserTE->State != TreeEntry::SplitVectorize &&
5450 P.first->UserTreeIndex.UserTE->getOpcode() == Instruction::PHI;
5451 auto *It =
find(
P.first->Scalars, User);
5453 assert(It !=
P.first->Scalars.end() &&
5454 "User is not in the tree entry");
5455 int Lane = std::distance(
P.first->Scalars.begin(), It);
5456 assert(Lane >= 0 &&
"Lane is not found");
5458 Lane =
P.first->ReorderIndices[Lane];
5459 assert(Lane <
static_cast<int>(
P.first->Scalars.size()) &&
5460 "Couldn't find extract lane");
5463 if (IsNonSchedulableWithParentPhiNode) {
5464 const TreeEntry *ParentTE =
P.first->UserTreeIndex.UserTE;
5466 if (!ParentsUniqueUsers.
insert(User).second) {
5472 for (
unsigned OpIdx :
5474 P.first->getMainOp()))) {
5475 if (
P.first->getOperand(
OpIdx)[Lane] ==
Op &&
5476 getScheduleCopyableData(EdgeInfo(
P.first,
OpIdx),
Op))
5481 }
while (It !=
P.first->Scalars.end());
5483 return all_of(PotentiallyReorderedEntriesCount,
5484 [&](
const std::pair<const TreeEntry *, unsigned> &
P) {
5485 return P.second ==
NumOps - 1;
5490 getScheduleCopyableData(
const Instruction *
I)
const {
5491 if (ScheduleCopyableDataMapByInst.empty())
5493 const auto It = ScheduleCopyableDataMapByInst.find(
I);
5494 if (It == ScheduleCopyableDataMapByInst.end())
5497 for (ScheduleCopyableData *SD : It->getSecond()) {
5498 if (isInSchedulingRegion(*SD))
5505 getScheduleCopyableDataUsers(
const Instruction *User)
const {
5506 if (ScheduleCopyableDataMapByUsers.empty())
5508 const auto It = ScheduleCopyableDataMapByUsers.find(User);
5509 if (It == ScheduleCopyableDataMapByUsers.end())
5512 for (ScheduleCopyableData *SD : It->getSecond()) {
5513 if (isInSchedulingRegion(*SD))
5519 ScheduleCopyableData &addScheduleCopyableData(
const EdgeInfo &EI,
5521 int SchedulingRegionID,
5522 ScheduleBundle &Bundle) {
5523 assert(!getScheduleCopyableData(EI,
I) &&
"already in the map");
5524 ScheduleCopyableData *CD =
5525 ScheduleCopyableDataMap
5526 .try_emplace(std::make_pair(EI,
I),
5527 std::make_unique<ScheduleCopyableData>(
5528 SchedulingRegionID,
I, EI, Bundle))
5531 ScheduleCopyableDataMapByInst[
I].push_back(CD);
5535 assert(It !=
Op.end() &&
"Lane not set");
5536 SmallPtrSet<Instruction *, 4> Visited;
5538 int Lane = std::distance(
Op.begin(), It);
5539 assert(Lane >= 0 &&
"Lane not set");
5541 !EI.UserTE->ReorderIndices.empty())
5542 Lane = EI.UserTE->ReorderIndices[Lane];
5543 assert(Lane <
static_cast<int>(EI.UserTE->Scalars.size()) &&
5544 "Couldn't find extract lane");
5546 if (!Visited.
insert(In).second) {
5550 ScheduleCopyableDataMapByInstUser
5551 .try_emplace(std::make_pair(std::make_pair(In, EI.EdgeIdx),
I))
5554 ScheduleCopyableDataMapByUsers.try_emplace(
I)
5561 EdgeInfo UserEI = EI.UserTE->UserTreeIndex;
5562 if (ScheduleCopyableData *UserCD =
5563 getScheduleCopyableData(UserEI, In))
5564 ScheduleCopyableDataMapByUsers[
I].remove(UserCD);
5567 }
while (It !=
Op.end());
5569 ScheduleCopyableDataMapByUsers.try_emplace(
I).first->getSecond().insert(
5579 auto It = ScheduledBundles.find(
I);
5580 if (It == ScheduledBundles.end())
5582 return It->getSecond();
5586 bool isInSchedulingRegion(
const ScheduleEntity &SD)
const {
5588 return Data->getSchedulingRegionID() == SchedulingRegionID;
5590 return CD->getSchedulingRegionID() == SchedulingRegionID;
5592 [&](
const ScheduleEntity *BundleMember) {
5593 return isInSchedulingRegion(*BundleMember);
5599 template <
typename ReadyListType>
5600 void schedule(
const BoUpSLP &R,
const InstructionsState &S,
5601 const EdgeInfo &EI, ScheduleEntity *
Data,
5602 ReadyListType &ReadyList) {
5603 auto ProcessBundleMember = [&](ScheduleEntity *BundleMember,
5608 auto DecrUnsched = [&](
auto *
Data,
bool IsControl =
false) {
5609 if ((IsControl ||
Data->hasValidDependencies()) &&
5610 Data->incrementUnscheduledDeps(-1) == 0) {
5617 CopyableBundle.
push_back(&CD->getBundle());
5618 Bundles = CopyableBundle;
5620 Bundles = getScheduleBundles(
Data->getInst());
5622 if (!Bundles.
empty()) {
5623 for (ScheduleBundle *Bundle : Bundles) {
5624 if (Bundle->unscheduledDepsInBundle() == 0) {
5625 assert(!Bundle->isScheduled() &&
5626 "already scheduled bundle gets ready");
5627 ReadyList.insert(Bundle);
5629 <<
"SLP: gets ready: " << *Bundle <<
"\n");
5635 "already scheduled bundle gets ready");
5637 "Expected non-copyable data");
5638 ReadyList.insert(
Data);
5645 if (!ScheduleCopyableDataMap.empty()) {
5647 getScheduleCopyableData(User,
OpIdx,
I);
5648 for (ScheduleCopyableData *CD : CopyableData)
5649 DecrUnsched(CD,
false);
5650 if (!CopyableData.empty())
5653 if (ScheduleData *OpSD = getScheduleData(
I))
5654 DecrUnsched(OpSD,
false);
5660 if (!Bundles.empty()) {
5661 auto *
In = BundleMember->getInst();
5663 SmallDenseMap<const Instruction *, unsigned> OperandsUses;
5664 unsigned TotalOpCount = 0;
5667 TotalOpCount = OperandsUses[
In] = 1;
5669 for (
const Use &U :
In->operands()) {
5672 ++Res.first->getSecond();
5679 auto DecrUnschedForInst =
5681 SmallDenseSet<std::pair<const ScheduleEntity *, unsigned>>
5683 if (!ScheduleCopyableDataMap.empty()) {
5684 const EdgeInfo EI = {UserTE,
OpIdx};
5685 if (ScheduleCopyableData *CD =
5686 getScheduleCopyableData(EI,
I)) {
5687 if (!Checked.insert(std::make_pair(CD,
OpIdx)).second)
5689 DecrUnsched(CD,
false);
5693 auto It = OperandsUses.
find(
I);
5694 assert(It != OperandsUses.
end() &&
"Operand not found");
5695 if (It->second > 0) {
5696 if (ScheduleData *OpSD = getScheduleData(
I)) {
5697 if (!Checked.insert(std::make_pair(OpSD,
OpIdx)).second)
5700 assert(TotalOpCount > 0 &&
"No more operands to decrement");
5702 DecrUnsched(OpSD,
false);
5705 assert(TotalOpCount > 0 &&
"No more operands to decrement");
5711 SmallDenseSet<std::pair<const ScheduleEntity *, unsigned>> Checked;
5712 for (ScheduleBundle *Bundle : Bundles) {
5713 if (ScheduleCopyableDataMap.empty() && TotalOpCount == 0)
5715 SmallPtrSet<Value *, 4> ParentsUniqueUsers;
5718 auto *It =
find(Bundle->getTreeEntry()->Scalars, In);
5719 bool IsNonSchedulableWithParentPhiNode =
5720 Bundle->getTreeEntry()->doesNotNeedToSchedule() &&
5721 Bundle->getTreeEntry()->UserTreeIndex &&
5722 Bundle->getTreeEntry()->UserTreeIndex.UserTE->hasState() &&
5723 Bundle->getTreeEntry()->UserTreeIndex.UserTE->State !=
5724 TreeEntry::SplitVectorize &&
5725 Bundle->getTreeEntry()->UserTreeIndex.UserTE->getOpcode() ==
5729 std::distance(Bundle->getTreeEntry()->Scalars.begin(), It);
5730 assert(Lane >= 0 &&
"Lane not set");
5732 !Bundle->getTreeEntry()->ReorderIndices.empty())
5733 Lane = Bundle->getTreeEntry()->ReorderIndices[Lane];
5734 assert(Lane <
static_cast<int>(
5735 Bundle->getTreeEntry()->Scalars.size()) &&
5736 "Couldn't find extract lane");
5746 In->getNumOperands() ==
5747 Bundle->getTreeEntry()->getNumOperands() ||
5748 Bundle->getTreeEntry()->isCopyableElement(In)) &&
5749 "Missed TreeEntry operands?");
5753 if (IsNonSchedulableWithParentPhiNode) {
5754 const TreeEntry *ParentTE =
5755 Bundle->getTreeEntry()->UserTreeIndex.UserTE;
5757 if (!ParentsUniqueUsers.
insert(User).second) {
5758 It = std::find(std::next(It),
5759 Bundle->getTreeEntry()->Scalars.end(), In);
5764 for (
unsigned OpIdx :
5767 Bundle->getTreeEntry()->getOperand(
OpIdx)[Lane])) {
5770 DecrUnschedForInst(
I, Bundle->getTreeEntry(),
OpIdx, Checked);
5773 if (Bundle->getTreeEntry()->isCopyableElement(In))
5775 It = std::find(std::next(It),
5776 Bundle->getTreeEntry()->Scalars.end(), In);
5777 }
while (It != Bundle->getTreeEntry()->Scalars.end());
5782 for (Use &U : BundleMember->getInst()->operands()) {
5785 <<
"SLP: check for readiness (def): " << *
I <<
"\n");
5786 DecrUnschedForInst(BundleMember->getInst(),
U.getOperandNo(),
I);
5794 SmallPtrSet<const ScheduleData *, 4> VisitedMemory;
5795 for (ScheduleData *MemoryDep : SD->getMemoryDependencies()) {
5796 if (!VisitedMemory.
insert(MemoryDep).second)
5801 << *MemoryDep <<
"\n");
5802 DecrUnsched(MemoryDep);
5805 SmallPtrSet<const ScheduleData *, 4> VisitedControl;
5806 for (ScheduleData *Dep : SD->getControlDependencies()) {
5807 if (!VisitedControl.
insert(Dep).second)
5812 <<
"SLP: check for readiness (ctrl): " << *Dep <<
"\n");
5813 DecrUnsched(Dep,
true);
5817 SD->setScheduled(
true);
5822 if (
R.isVectorized(In)) {
5824 for (TreeEntry *TE : Entries) {
5826 In->getNumOperands() !=
TE->getNumOperands())
5829 PseudoBundles.
emplace_back(std::make_unique<ScheduleBundle>());
5830 BundlePtr->setTreeEntry(TE);
5835 ProcessBundleMember(SD, Bundles);
5838 Bundle.setScheduled(
true);
5840 auto AreAllBundlesScheduled =
5841 [&](
const ScheduleEntity *SD,
5845 return !SDBundles.empty() &&
5846 all_of(SDBundles, [&](
const ScheduleBundle *SDBundle) {
5847 return SDBundle->isScheduled();
5850 for (ScheduleEntity *SD : Bundle.getBundle()) {
5853 SDBundles = getScheduleBundles(SD->getInst());
5854 if (AreAllBundlesScheduled(SD, SDBundles)) {
5855 SD->setScheduled(
true);
5868 assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
5869 ScheduleStart->comesBefore(ScheduleEnd) &&
5870 "Not a valid scheduling region?");
5872 for (
auto *
I = ScheduleStart;
I != ScheduleEnd;
I =
I->getNextNode()) {
5874 if (!Bundles.
empty()) {
5875 for (ScheduleBundle *Bundle : Bundles) {
5876 assert(isInSchedulingRegion(*Bundle) &&
5877 "primary schedule data not in window?");
5882 auto *SD = getScheduleData(
I);
5885 assert(isInSchedulingRegion(*SD) &&
5886 "primary schedule data not in window?");
5891 [](
const ScheduleEntity *Bundle) {
5892 return Bundle->isReady();
5894 "item in ready list not ready?");
5898 template <
typename ReadyListType>
5899 void initialFillReadyList(ReadyListType &ReadyList) {
5900 SmallPtrSet<ScheduleBundle *, 16> Visited;
5901 for (
auto *
I = ScheduleStart;
I != ScheduleEnd;
I =
I->getNextNode()) {
5902 ScheduleData *SD = getScheduleData(
I);
5903 if (SD && SD->hasValidDependencies() && SD->isReady()) {
5906 for (ScheduleBundle *Bundle : Bundles) {
5907 if (!Visited.
insert(Bundle).second)
5909 if (Bundle->hasValidDependencies() && Bundle->isReady()) {
5910 ReadyList.insert(Bundle);
5912 << *Bundle <<
"\n");
5917 ReadyList.insert(SD);
5919 <<
"SLP: initially in ready list: " << *SD <<
"\n");
5930 const InstructionsState &S,
const EdgeInfo &EI);
5937 std::optional<ScheduleBundle *>
5939 const InstructionsState &S,
const EdgeInfo &EI);
5942 ScheduleData *allocateScheduleDataChunks();
5946 bool extendSchedulingRegion(
Value *V,
const InstructionsState &S);
5950 void initScheduleData(Instruction *FromI, Instruction *ToI,
5951 ScheduleData *PrevLoadStore,
5952 ScheduleData *NextLoadStore);
5956 void calculateDependencies(ScheduleBundle &Bundle,
bool InsertInReadyList,
5961 void resetSchedule();
5978 SmallDenseMap<Instruction *, ScheduleData *> ScheduleDataMap;
5982 SmallDenseMap<std::pair<EdgeInfo, const Value *>,
5983 std::unique_ptr<ScheduleCopyableData>>
5984 ScheduleCopyableDataMap;
5990 SmallDenseMap<const Instruction *, SmallVector<ScheduleCopyableData *>>
5991 ScheduleCopyableDataMapByInst;
5997 SmallDenseMap<std::pair<std::pair<const Value *, unsigned>,
const Value *>,
5999 ScheduleCopyableDataMapByInstUser;
6019 SmallSetVector<ScheduleCopyableData *, 4>>
6020 ScheduleCopyableDataMapByUsers;
6023 SmallDenseMap<Instruction *, SmallVector<ScheduleBundle *>>
6029 SetVector<ScheduleEntity *> ReadyInsts;
6039 ScheduleData *FirstLoadStoreInRegion =
nullptr;
6043 ScheduleData *LastLoadStoreInRegion =
nullptr;
6048 bool RegionHasStackSave =
false;
6051 int ScheduleRegionSize = 0;
6060 int SchedulingRegionID = 1;
6064 MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
6068 void scheduleBlock(
const BoUpSLP &R, BlockScheduling *BS);
6071 const SmallDenseSet<Value *> *UserIgnoreList =
nullptr;
6075 struct OrdersTypeDenseMapInfo {
6088 static unsigned getHashValue(
const OrdersType &V) {
6099 ScalarEvolution *SE;
6100 TargetTransformInfo *TTI;
6101 TargetLibraryInfo *TLI;
6104 AssumptionCache *AC;
6106 const DataLayout *DL;
6107 OptimizationRemarkEmitter *ORE;
6109 unsigned MaxVecRegSize;
6110 unsigned MinVecRegSize;
6113 IRBuilder<TargetFolder> Builder;
6120 DenseMap<const TreeEntry *, std::pair<uint64_t, bool>> MinBWs;
6125 unsigned ReductionBitWidth = 0;
6128 unsigned BaseGraphSize = 1;
6132 std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;
6136 DenseSet<unsigned> ExtraBitWidthNodes;
6144 SecondInfo::getEmptyKey());
6149 SecondInfo::getTombstoneKey());
6154 SecondInfo::getHashValue(Val.
EdgeIdx));
6175 ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
6186 return R.VectorizableTree[0].get();
6190 return {&
N->UserTreeIndex,
N->Container};
6194 return {&
N->UserTreeIndex + 1,
N->Container};
6221 static unsigned size(
BoUpSLP *R) {
return R->VectorizableTree.size(); }
6233 OS << Entry->Idx <<
".\n";
6236 for (
auto *V : Entry->Scalars) {
6238 if (
llvm::any_of(R->ExternalUses, [&](
const BoUpSLP::ExternalUser &EU) {
6239 return EU.Scalar == V;
6249 if (Entry->isGather())
6251 if (Entry->State == TreeEntry::ScatterVectorize ||
6252 Entry->State == TreeEntry::StridedVectorize ||
6253 Entry->State == TreeEntry::CompressVectorize)
6254 return "color=blue";
6261 for (
auto *
I : DeletedInstructions) {
6262 if (!
I->getParent()) {
6267 I->insertBefore(F->getEntryBlock(),
6268 F->getEntryBlock().getFirstNonPHIIt());
6270 I->insertBefore(F->getEntryBlock().getTerminator()->getIterator());
6273 for (
Use &U :
I->operands()) {
6275 if (
Op && !DeletedInstructions.count(
Op) &&
Op->hasOneUser() &&
6279 I->dropAllReferences();
6281 for (
auto *
I : DeletedInstructions) {
6283 "trying to erase instruction with users.");
6284 I->eraseFromParent();
6290#ifdef EXPENSIVE_CHECKS
6301 assert(!Mask.empty() && Reuses.
size() == Mask.size() &&
6302 "Expected non-empty mask.");
6305 for (
unsigned I = 0,
E = Prev.
size();
I <
E; ++
I)
6307 Reuses[Mask[
I]] = Prev[
I];
6315 bool BottomOrder =
false) {
6316 assert(!Mask.empty() &&
"Expected non-empty mask.");
6317 unsigned Sz = Mask.size();
6320 if (Order.
empty()) {
6322 std::iota(PrevOrder.
begin(), PrevOrder.
end(), 0);
6324 PrevOrder.
swap(Order);
6327 for (
unsigned I = 0;
I < Sz; ++
I)
6329 Order[
I] = PrevOrder[Mask[
I]];
6331 return Data.value() == Sz ||
Data.index() ==
Data.value();
6340 if (Order.
empty()) {
6342 std::iota(MaskOrder.
begin(), MaskOrder.
end(), 0);
6352 for (
unsigned I = 0;
I < Sz; ++
I)
6354 Order[MaskOrder[
I]] =
I;
6358std::optional<BoUpSLP::OrdersType>
6360 bool TopToBottom,
bool IgnoreReorder) {
6361 assert(TE.isGather() &&
"Expected gather node only.");
6365 Type *ScalarTy = GatheredScalars.
front()->getType();
6366 size_t NumScalars = GatheredScalars.
size();
6368 return std::nullopt;
6375 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
6377 isGatherShuffledEntry(&TE, GatheredScalars, Mask, Entries, NumParts,
6380 if (GatherShuffles.
empty() && ExtractShuffles.
empty())
6381 return std::nullopt;
6382 OrdersType CurrentOrder(NumScalars, NumScalars);
6383 if (GatherShuffles.
size() == 1 &&
6385 Entries.
front().front()->isSame(TE.Scalars)) {
6389 return std::nullopt;
6391 if (Entries.
front().front()->UserTreeIndex.UserTE ==
6392 TE.UserTreeIndex.UserTE)
6393 return std::nullopt;
6396 if (!IgnoreReorder && Entries.
front().front()->Idx == 0)
6397 return std::nullopt;
6400 if (!Entries.
front().front()->ReuseShuffleIndices.empty() &&
6401 TE.getVectorFactor() == 2 && Mask.size() == 2 &&
6404 return P.value() % 2 != static_cast<int>(P.index()) % 2;
6406 return std::nullopt;
6410 std::iota(CurrentOrder.
begin(), CurrentOrder.
end(), 0);
6411 return CurrentOrder;
6415 return all_of(Mask, [&](
int I) {
6422 if ((ExtractShuffles.
empty() && IsSplatMask(Mask) &&
6423 (Entries.
size() != 1 ||
6424 Entries.
front().front()->ReorderIndices.empty())) ||
6425 (GatherShuffles.
empty() && IsSplatMask(ExtractMask)))
6426 return std::nullopt;
6432 if (ShuffledSubMasks.
test(
I))
6434 const int VF = GetVF(
I);
6442 ShuffledSubMasks.
set(
I);
6446 int FirstMin = INT_MAX;
6447 int SecondVecFound =
false;
6449 int Idx = Mask[
I * PartSz + K];
6451 Value *V = GatheredScalars[
I * PartSz + K];
6453 SecondVecFound =
true;
6462 SecondVecFound =
true;
6466 FirstMin = (FirstMin / PartSz) * PartSz;
6468 if (SecondVecFound) {
6470 ShuffledSubMasks.
set(
I);
6474 int Idx = Mask[
I * PartSz + K];
6478 if (Idx >= PartSz) {
6479 SecondVecFound =
true;
6482 if (CurrentOrder[
I * PartSz + Idx] >
6483 static_cast<unsigned>(
I * PartSz + K) &&
6484 CurrentOrder[
I * PartSz + Idx] !=
6485 static_cast<unsigned>(
I * PartSz + Idx))
6486 CurrentOrder[
I * PartSz + Idx] =
I * PartSz + K;
6489 if (SecondVecFound) {
6491 ShuffledSubMasks.
set(
I);
6497 if (!ExtractShuffles.
empty())
6498 TransformMaskToOrder(
6499 CurrentOrder, ExtractMask, PartSz, NumParts, [&](
unsigned I) {
6500 if (!ExtractShuffles[
I])
6503 unsigned Sz =
getNumElems(TE.getVectorFactor(), PartSz,
I);
6505 int K =
I * PartSz + Idx;
6508 if (!TE.ReuseShuffleIndices.empty())
6509 K = TE.ReuseShuffleIndices[K];
6512 if (!TE.ReorderIndices.empty())
6513 K = std::distance(TE.ReorderIndices.begin(),
6514 find(TE.ReorderIndices, K));
6520 .getKnownMinValue());
6525 if (GatherShuffles.
size() == 1 && NumParts != 1) {
6526 if (ShuffledSubMasks.
any())
6527 return std::nullopt;
6528 PartSz = NumScalars;
6531 if (!Entries.
empty())
6532 TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](
unsigned I) {
6533 if (!GatherShuffles[
I])
6535 return std::max(Entries[
I].front()->getVectorFactor(),
6536 Entries[
I].back()->getVectorFactor());
6538 unsigned NumUndefs =
count(CurrentOrder, NumScalars);
6539 if (ShuffledSubMasks.
all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2))
6540 return std::nullopt;
6541 return std::move(CurrentOrder);
6546 bool CompareOpcodes =
true) {
6552 return (!GEP1 || GEP1->getNumOperands() == 2) &&
6553 (!GEP2 || GEP2->getNumOperands() == 2) &&
6554 (((!GEP1 ||
isConstant(GEP1->getOperand(1))) &&
6555 (!GEP2 ||
isConstant(GEP2->getOperand(1)))) ||
6558 getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)));
6562template <
typename T>
6567 return CommonAlignment;
6573 "Order is empty. Please check it before using isReverseOrder.");
6574 unsigned Sz = Order.
size();
6576 return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
6597 "Coeffs vector needs to be of correct size");
6599 const SCEV *PtrSCEVLowest =
nullptr;
6600 const SCEV *PtrSCEVHighest =
nullptr;
6603 for (
Value *Ptr : PointerOps) {
6608 if (!PtrSCEVLowest && !PtrSCEVHighest) {
6609 PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
6616 PtrSCEVLowest = PtrSCEV;
6623 PtrSCEVHighest = PtrSCEV;
6631 int Size =
DL.getTypeStoreSize(ElemTy);
6632 auto TryGetStride = [&](
const SCEV *Dist,
6633 const SCEV *Multiplier) ->
const SCEV * {
6635 if (M->getOperand(0) == Multiplier)
6636 return M->getOperand(1);
6637 if (M->getOperand(1) == Multiplier)
6638 return M->getOperand(0);
6641 if (Multiplier == Dist)
6646 const SCEV *Stride =
nullptr;
6647 if (
Size != 1 || SCEVs.
size() > 2) {
6649 Stride = TryGetStride(Dist, Sz);
6657 using DistOrdPair = std::pair<int64_t, int>;
6659 std::set<DistOrdPair,
decltype(Compare)> Offsets(Compare);
6661 bool IsConsecutive =
true;
6662 for (
const auto [Idx, PtrSCEV] :
enumerate(SCEVs)) {
6664 if (PtrSCEV != PtrSCEVLowest) {
6666 const SCEV *Coeff = TryGetStride(Diff, Stride);
6672 Coeffs[Idx] = (int64_t)SC->getAPInt().getLimitedValue();
6677 Dist = SC->getAPInt().getZExtValue();
6684 auto Res = Offsets.emplace(Dist, Cnt);
6688 IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
6691 if (Offsets.size() != SCEVs.
size())
6693 SortedIndices.
clear();
6694 if (!IsConsecutive) {
6698 for (
const std::pair<int64_t, int> &Pair : Offsets) {
6699 SortedIndices[Cnt] = Pair.second;
6706static std::pair<InstructionCost, InstructionCost>
6725 return TTI.getShuffleCost(Kind, DstTy, Tp, Mask,
CostKind, Index, SubTp,
6727 int NumSrcElts = Tp->getElementCount().getKnownMinValue();
6730 Mask, NumSrcElts, NumSubElts, Index)) {
6731 if (Index + NumSubElts > NumSrcElts &&
6732 Index + NumSrcElts <=
static_cast<int>(
Mask.size()))
6736 return TTI.getShuffleCost(Kind, DstTy, Tp, Mask,
CostKind, Index, SubTp,
6749 "ScalableVectorType is not supported.");
6752 "Incorrect usage.");
6757 unsigned ScalarTyNumElements = VecTy->getNumElements();
6760 if (!DemandedElts[
I])
6764 I * ScalarTyNumElements, VecTy);
6767 I * ScalarTyNumElements, VecTy);
6771 return TTI.getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
6780 ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) {
6781 if (Opcode == Instruction::ExtractElement) {
6787 Index * VecTy->getNumElements(), VecTy);
6790 return TTI.getVectorInstrCost(Opcode, Val,
CostKind, Index, Scalar,
6803 getWidenedType(VecTy->getElementType(), ScalarTy->getNumElements());
6805 Index * ScalarTy->getNumElements(), SubTp) +
6809 return TTI.getExtractWithExtendCost(Opcode, Dst, VecTy, Index,
CostKind);
6825 auto *Begin = std::next(
Mask.begin(), Index);
6826 std::iota(Begin, std::next(Begin, SubVecVF), 0);
6827 Vec = Builder.CreateShuffleVector(V, Mask);
6830 std::iota(
Mask.begin(),
Mask.end(), 0);
6831 std::iota(std::next(
Mask.begin(), Index),
6832 std::next(
Mask.begin(), Index + SubVecVF), VecVF);
6834 return Generator(Vec, V, Mask);
6837 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), SubVecVF), 0);
6838 V = Builder.CreateShuffleVector(V, ResizeMask);
6840 return Builder.CreateShuffleVector(Vec, V, Mask);
6845 unsigned SubVecVF,
unsigned Index) {
6847 std::iota(Mask.begin(), Mask.end(), Index);
6848 return Builder.CreateShuffleVector(Vec, Mask);
6858 const unsigned Sz = PointerOps.
size();
6861 CompressMask[0] = 0;
6863 std::optional<unsigned> Stride = 0;
6866 Value *Ptr = Order.
empty() ? PointerOps[
I] : PointerOps[Order[
I]];
6867 std::optional<int64_t> OptPos =
6869 if (!OptPos || OptPos > std::numeric_limits<unsigned>::max())
6871 unsigned Pos =
static_cast<unsigned>(*OptPos);
6872 CompressMask[
I] = Pos;
6879 if (Pos != *Stride *
I)
6882 return Stride.has_value();
6895 InterleaveFactor = 0;
6897 const size_t Sz = VL.
size();
6905 if (AreAllUsersVectorized(V))
6908 TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy,
CostKind,
6909 Mask.empty() ?
I : Mask[
I]);
6912 if (ExtractCost <= ScalarCost)
6917 if (Order.
empty()) {
6918 Ptr0 = PointerOps.
front();
6919 PtrN = PointerOps.
back();
6921 Ptr0 = PointerOps[Order.
front()];
6922 PtrN = PointerOps[Order.
back()];
6924 std::optional<int64_t> Diff =
6928 const size_t MaxRegSize =
6932 if (*Diff / Sz >= MaxRegSize / 8)
6936 Align CommonAlignment = LI->getAlign();
6938 Ptr0, LoadVecTy, CommonAlignment,
DL,
6941 if (IsMasked && !
TTI.isLegalMaskedLoad(LoadVecTy, CommonAlignment,
6942 LI->getPointerAddressSpace()))
6948 assert(CompressMask.
size() >= 2 &&
"At least two elements are required");
6952 auto [ScalarGEPCost, VectorGEPCost] =
6954 Instruction::GetElementPtr,
CostKind, ScalarTy, LoadVecTy);
6971 LoadCost =
TTI.getMemIntrinsicInstrCost(
6974 LI->getPointerAddressSpace()),
6978 TTI.getMemoryOpCost(Instruction::Load, LoadVecTy, CommonAlignment,
6979 LI->getPointerAddressSpace(),
CostKind);
6981 if (IsStrided && !IsMasked && Order.
empty()) {
6988 AlignedLoadVecTy = LoadVecTy;
6989 if (
TTI.isLegalInterleavedAccessType(AlignedLoadVecTy, CompressMask[1],
6991 LI->getPointerAddressSpace())) {
6993 VectorGEPCost +
TTI.getInterleavedMemoryOpCost(
6994 Instruction::Load, AlignedLoadVecTy,
6995 CompressMask[1], {}, CommonAlignment,
6996 LI->getPointerAddressSpace(),
CostKind, IsMasked);
6997 if (InterleavedCost < GatherCost) {
6998 InterleaveFactor = CompressMask[1];
6999 LoadVecTy = AlignedLoadVecTy;
7006 if (!Order.
empty()) {
7009 NewMask[
I] = CompressMask[Mask[
I]];
7011 CompressMask.
swap(NewMask);
7013 InstructionCost TotalVecCost = VectorGEPCost + LoadCost + CompressCost;
7014 return TotalVecCost < GatherCost;
7027 unsigned InterleaveFactor;
7031 AreAllUsersVectorized, IsMasked, InterleaveFactor,
7032 CompressMask, LoadVecTy);
7049 Align Alignment,
const int64_t Diff,
7050 const size_t Sz)
const {
7051 if (Diff % (Sz - 1) != 0)
7055 auto IsAnyPointerUsedOutGraph =
any_of(PointerOps, [&](
Value *V) {
7057 return !isVectorized(U) && !MustGather.contains(U);
7061 const uint64_t AbsoluteDiff = std::abs(Diff);
7063 if (IsAnyPointerUsedOutGraph ||
7064 (AbsoluteDiff > Sz &&
7067 AbsoluteDiff % Sz == 0 &&
has_single_bit(AbsoluteDiff / Sz)))) ||
7068 Diff == -(
static_cast<int64_t
>(Sz) - 1)) {
7069 int64_t Stride = Diff /
static_cast<int64_t
>(Sz - 1);
7070 if (Diff != Stride *
static_cast<int64_t
>(Sz - 1))
7072 if (!TTI->isLegalStridedLoadStore(VecTy, Alignment))
7082 Value *Ptr0,
Value *PtrN, StridedPtrInfo &SPtrInfo)
const {
7083 const size_t Sz = PointerOps.
size();
7088 SortedIndices.
empty() ? PointerOps[
I] : PointerOps[SortedIndices[
I]];
7089 SortedOffsetsFromBase[
I] =
7107 int64_t StrideWithinGroup =
7108 SortedOffsetsFromBase[1] - SortedOffsetsFromBase[0];
7111 auto IsEndOfGroupIndex = [=, &SortedOffsetsFromBase](
unsigned Idx) {
7112 return SortedOffsetsFromBase[Idx] - SortedOffsetsFromBase[Idx - 1] !=
7117 unsigned GroupSize = FoundIt != Indices.end() ? *FoundIt : Sz;
7119 unsigned VecSz = Sz;
7120 Type *NewScalarTy = ScalarTy;
7124 bool NeedsWidening = Sz != GroupSize;
7125 if (NeedsWidening) {
7126 if (Sz % GroupSize != 0)
7129 if (StrideWithinGroup != 1)
7131 VecSz = Sz / GroupSize;
7134 DL->getTypeSizeInBits(ScalarTy).getFixedValue() * GroupSize);
7137 if (!
isStridedLoad(PointerOps, NewScalarTy, Alignment, Diff, VecSz))
7140 int64_t StrideIntVal = StrideWithinGroup;
7141 if (NeedsWidening) {
7144 unsigned CurrentGroupStartIdx = GroupSize;
7145 int64_t StrideBetweenGroups =
7146 SortedOffsetsFromBase[GroupSize] - SortedOffsetsFromBase[0];
7147 StrideIntVal = StrideBetweenGroups;
7148 for (; CurrentGroupStartIdx < Sz; CurrentGroupStartIdx += GroupSize) {
7149 if (SortedOffsetsFromBase[CurrentGroupStartIdx] -
7150 SortedOffsetsFromBase[CurrentGroupStartIdx - GroupSize] !=
7151 StrideBetweenGroups)
7155 auto CheckGroup = [=](
const unsigned StartIdx) ->
bool {
7158 unsigned GroupEndIdx = FoundIt != Indices.end() ? *FoundIt : Sz;
7159 return GroupEndIdx - StartIdx == GroupSize;
7161 for (
unsigned I = 0;
I < Sz;
I += GroupSize) {
7167 Type *StrideTy = DL->getIndexType(Ptr0->
getType());
7176 StridedPtrInfo &SPtrInfo)
const {
7182 OffsetToPointerOpIdxMap;
7183 for (
auto [Idx, Ptr] :
enumerate(PointerOps)) {
7184 const SCEV *PtrSCEV = SE->getSCEV(Ptr);
7196 Offset = SC->getAPInt().getSExtValue();
7200 OffsetToPointerOpIdxMap[
Offset].first.push_back(Ptr);
7201 OffsetToPointerOpIdxMap[
Offset].second.push_back(Idx);
7203 unsigned NumOffsets = OffsetToPointerOpIdxMap.
size();
7207 const unsigned Sz = PointerOps.
size();
7208 unsigned VecSz = Sz;
7209 Type *NewScalarTy = ScalarTy;
7210 if (NumOffsets > 1) {
7211 if (Sz % NumOffsets != 0)
7213 VecSz = Sz / NumOffsets;
7216 DL->getTypeSizeInBits(ScalarTy).getFixedValue() * NumOffsets);
7219 if (Sz <= MinProfitableStridedLoads || !TTI->isTypeLegal(StridedLoadTy) ||
7220 !TTI->isLegalStridedLoadStore(StridedLoadTy, CommonAlignment))
7226 for (
auto [Idx, MapPair] :
enumerate(OffsetToPointerOpIdxMap)) {
7227 if (MapPair.second.first.size() != VecSz)
7229 SortedOffsetsV[Idx] = MapPair.first;
7231 sort(SortedOffsetsV);
7233 if (NumOffsets > 1) {
7235 if (SortedOffsetsV[
I] - SortedOffsetsV[
I - 1] != 1)
7308 auto UpdateSortedIndices =
7311 if (SortedIndicesForOffset.
empty()) {
7312 SortedIndicesForOffset.
resize(IndicesInAllPointerOps.
size());
7313 std::iota(SortedIndicesForOffset.
begin(),
7314 SortedIndicesForOffset.
end(), 0);
7316 for (
const auto [Num, Idx] :
enumerate(SortedIndicesForOffset)) {
7317 SortedIndicesDraft[Num * NumOffsets + OffsetNum] =
7318 IndicesInAllPointerOps[Idx];
7322 int64_t LowestOffset = SortedOffsetsV[0];
7328 SortedIndicesForOffset0, Coeffs0);
7331 unsigned NumCoeffs0 = Coeffs0.
size();
7332 if (NumCoeffs0 * NumOffsets != Sz)
7337 OffsetToPointerOpIdxMap[LowestOffset].second;
7338 UpdateSortedIndices(SortedIndicesForOffset0, IndicesInAllPointerOps0, 0);
7344 for (
int J :
seq<int>(1, NumOffsets)) {
7347 SortedIndicesForOffset.
clear();
7349 int64_t
Offset = SortedOffsetsV[J];
7351 OffsetToPointerOpIdxMap[
Offset].first;
7353 OffsetToPointerOpIdxMap[
Offset].second;
7354 const SCEV *StrideWithinGroup =
7356 SortedIndicesForOffset, Coeffs);
7358 if (!StrideWithinGroup || StrideWithinGroup != Stride0)
7360 if (Coeffs.
size() != NumCoeffs0)
7363 if (Coeffs != Coeffs0)
7366 UpdateSortedIndices(SortedIndicesForOffset, IndicesInAllPointerOps, J);
7369 SortedIndices.
clear();
7370 SortedIndices = SortedIndicesDraft;
7371 SPtrInfo.StrideSCEV = Stride0;
7372 SPtrInfo.Ty = StridedLoadTy;
7379 unsigned *BestVF,
bool TryRecursiveCheck)
const {
7392 if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy))
7398 const size_t Sz = VL.
size();
7400 auto *POIter = PointerOps.
begin();
7401 for (
Value *V : VL) {
7403 if (!L || !L->isSimple())
7405 *POIter = L->getPointerOperand();
7411 bool IsSorted =
sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, Order);
7420 if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
7421 TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
7432 if (Order.
empty()) {
7433 Ptr0 = PointerOps.
front();
7434 PtrN = PointerOps.
back();
7436 Ptr0 = PointerOps[Order.
front()];
7437 PtrN = PointerOps[Order.
back()];
7439 std::optional<int64_t> Diff =
7442 if (
static_cast<uint64_t>(*Diff) == Sz - 1)
7445 *TLI, [&](
Value *V) {
7446 return areAllUsersVectorized(
7454 *Diff, Ptr0, PtrN, SPtrInfo))
7457 if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
7458 TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
7463 auto CheckForShuffledLoads = [&, &TTI = *TTI](
Align CommonAlignment,
7465 bool ProfitableGatherPointers) {
7470 auto [ScalarGEPCost, VectorGEPCost] =
7472 Instruction::GetElementPtr,
CostKind, ScalarTy, VecTy);
7476 Type *PtrScalarTy = PointerOps.
front()->getType()->getScalarType();
7478 if (
static_cast<unsigned>(
count_if(
7497 return C + TTI.getInstructionCost(
7503 TTI.getMemIntrinsicInstrCost(
7506 false, CommonAlignment),
7508 (ProfitableGatherPointers ? 0 : VectorGEPCost);
7516 constexpr unsigned ListLimit = 4;
7517 if (!TryRecursiveCheck || VL.
size() < ListLimit)
7526 unsigned Sz = DL->getTypeSizeInBits(ScalarTy);
7536 for (
unsigned Cnt = 0, End = VL.
size(); Cnt + VF <= End; Cnt += VF) {
7541 PointerOps, SPtrInfo, BestVF,
7549 DemandedElts.
setBits(Cnt, Cnt + VF);
7565 if (!DemandedElts.
isZero()) {
7571 if (DemandedElts[Idx])
7582 LI0->getPointerOperand(),
7583 Instruction::GetElementPtr,
CostKind, ScalarTy,
7587 if (
static_cast<unsigned>(
7589 PointerOps.
size() - 1 ||
7608 TTI.getMemoryOpCost(Instruction::Load, SubVecTy, LI0->getAlign(),
7609 LI0->getPointerAddressSpace(),
CostKind,
7614 VecLdCost += TTI.getMemIntrinsicInstrCost(
7616 Intrinsic::experimental_vp_strided_load,
7617 SubVecTy, LI0->getPointerOperand(),
7618 false, CommonAlignment),
7623 VecLdCost += TTI.getMemIntrinsicInstrCost(
7625 Intrinsic::masked_load, SubVecTy,
7626 CommonAlignment, LI0->getPointerAddressSpace()),
7632 VecLdCost += TTI.getMemIntrinsicInstrCost(
7634 Intrinsic::masked_gather, SubVecTy,
7635 LI0->getPointerOperand(),
7636 false, CommonAlignment),
7646 ShuffleMask[Idx] = Idx / VF ==
I ? VL.
size() + Idx % VF : Idx;
7655 if (MaskedGatherCost >= VecLdCost &&
7668 bool ProfitableGatherPointers =
7669 L && Sz > 2 &&
static_cast<unsigned>(
count_if(PointerOps, [L](
Value *V) {
7670 return L->isLoopInvariant(V);
7672 if (ProfitableGatherPointers ||
all_of(PointerOps, [](
Value *
P) {
7675 (
GEP &&
GEP->getNumOperands() == 2 &&
7683 if (!TryRecursiveCheck || !CheckForShuffledLoads(CommonAlignment, BestVF,
7684 ProfitableGatherPointers))
7696 all_of(VL, [](
const Value *V) {
return V->getType()->isPointerTy(); }) &&
7697 "Expected list of pointer operands.");
7702 std::pair<BasicBlock *, Value *>,
7706 .try_emplace(std::make_pair(
7710 SortedIndices.
clear();
7712 auto Key = std::make_pair(BBs[Cnt + 1],
7714 bool Found =
any_of(Bases.try_emplace(
Key).first->second,
7715 [&, &Cnt = Cnt, &Ptr = Ptr](
auto &
Base) {
7716 std::optional<int64_t> Diff =
7717 getPointersDiff(ElemTy, std::get<0>(Base.front()),
7718 ElemTy, Ptr, DL, SE,
7723 Base.emplace_back(Ptr, *Diff, Cnt + 1);
7729 if (Bases.size() > VL.
size() / 2 - 1)
7733 Bases.find(
Key)->second.emplace_back().emplace_back(Ptr, 0, Cnt + 1);
7737 if (Bases.size() == VL.
size())
7740 if (Bases.size() == 1 && (Bases.front().second.size() == 1 ||
7741 Bases.front().second.size() == VL.
size()))
7746 auto ComparePointers = [](
Value *Ptr1,
Value *Ptr2) {
7755 FirstPointers.
insert(P1);
7756 SecondPointers.
insert(P2);
7762 "Unable to find matching root.");
7765 for (
auto &
Base : Bases) {
7766 for (
auto &Vec :
Base.second) {
7767 if (Vec.size() > 1) {
7769 int64_t InitialOffset = std::get<1>(Vec[0]);
7770 bool AnyConsecutive =
7772 return std::get<1>(
P.value()) ==
7773 int64_t(
P.index()) + InitialOffset;
7777 if (!AnyConsecutive)
7782 return ComparePointers(std::get<0>(V1.front()), std::get<0>(V2.front()));
7786 for (
auto &
T : Bases)
7787 for (
const auto &Vec :
T.second)
7788 for (
const auto &
P : Vec)
7792 "Expected SortedIndices to be the size of VL");
7796std::optional<BoUpSLP::OrdersType>
7798 assert(TE.isGather() &&
"Expected gather node only.");
7799 Type *ScalarTy = TE.Scalars[0]->getType();
7802 Ptrs.
reserve(TE.Scalars.size());
7804 BBs.
reserve(TE.Scalars.size());
7805 for (
Value *V : TE.Scalars) {
7807 if (!L || !L->isSimple())
7808 return std::nullopt;
7814 if (!LoadEntriesToVectorize.contains(TE.Idx) &&
7816 return std::move(Order);
7817 return std::nullopt;
7828 if (VU->
getType() != V->getType())
7831 if (!VU->
hasOneUse() && !V->hasOneUse())
7837 if (Idx1 == std::nullopt || Idx2 == std::nullopt)
7844 bool IsReusedIdx =
false;
7846 if (IE2 == VU && !IE1)
7848 if (IE1 == V && !IE2)
7849 return V->hasOneUse();
7850 if (IE1 && IE1 != V) {
7852 IsReusedIdx |= ReusedIdx.
test(Idx1);
7853 ReusedIdx.
set(Idx1);
7854 if ((IE1 != VU && !IE1->
hasOneUse()) || IsReusedIdx)
7859 if (IE2 && IE2 != VU) {
7861 IsReusedIdx |= ReusedIdx.
test(Idx2);
7862 ReusedIdx.
set(Idx2);
7863 if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
7868 }
while (!IsReusedIdx && (IE1 || IE2));
7878std::optional<BoUpSLP::OrdersType>
7880 bool IgnoreReorder) {
7883 if (!TE.ReuseShuffleIndices.empty()) {
7885 assert(!TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI) &&
7886 "Reshuffling scalars not yet supported for nodes with padding");
7889 return std::nullopt;
7897 unsigned Sz = TE.Scalars.size();
7898 if (TE.isGather()) {
7899 if (std::optional<OrdersType> CurrentOrder =
7904 ::addMask(Mask, TE.ReuseShuffleIndices);
7905 OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor());
7906 unsigned Sz = TE.Scalars.size();
7907 for (
int K = 0, E = TE.getVectorFactor() / Sz; K < E; ++K) {
7910 Res[Idx + K * Sz] =
I + K * Sz;
7912 return std::move(Res);
7915 if (Sz == 2 && TE.getVectorFactor() == 4 &&
7917 2 * TE.getVectorFactor())) == 1)
7918 return std::nullopt;
7919 if (TE.ReuseShuffleIndices.size() % Sz != 0)
7920 return std::nullopt;
7924 if (TE.ReorderIndices.empty())
7925 std::iota(ReorderMask.
begin(), ReorderMask.
end(), 0);
7928 ::addMask(ReorderMask, TE.ReuseShuffleIndices);
7929 unsigned VF = ReorderMask.
size();
7933 for (
unsigned I = 0;
I < VF;
I += Sz) {
7935 unsigned UndefCnt = 0;
7936 unsigned Limit = std::min(Sz, VF -
I);
7945 Val >=
static_cast<int>(NumParts) || UsedVals.
test(Val) ||
7947 return std::nullopt;
7949 for (
unsigned K = 0; K < NumParts; ++K) {
7950 unsigned Idx = Val + Sz * K;
7951 if (Idx < VF &&
I + K < VF)
7952 ResOrder[Idx] =
I + K;
7955 return std::move(ResOrder);
7957 unsigned VF = TE.getVectorFactor();
7960 TE.ReuseShuffleIndices.end());
7961 if (TE.hasState() && TE.getOpcode() == Instruction::ExtractElement &&
7963 if (isa<PoisonValue>(V))
7965 std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V));
7966 return Idx && *Idx < Sz;
7968 assert(!TE.isAltShuffle() &&
"Alternate instructions are only supported "
7969 "by BinaryOperator and CastInst.");
7971 if (TE.ReorderIndices.empty())
7972 std::iota(ReorderMask.
begin(), ReorderMask.
end(), 0);
7975 for (
unsigned I = 0;
I < VF; ++
I) {
7976 int &Idx = ReusedMask[
I];
7979 Value *V = TE.Scalars[ReorderMask[Idx]];
7981 Idx = std::distance(ReorderMask.
begin(),
find(ReorderMask, *EI));
7987 std::iota(ResOrder.
begin(), ResOrder.
end(), 0);
7988 auto *It = ResOrder.
begin();
7989 for (
unsigned K = 0; K < VF; K += Sz) {
7993 std::iota(SubMask.
begin(), SubMask.
end(), 0);
7995 transform(CurrentOrder, It, [K](
unsigned Pos) {
return Pos + K; });
7996 std::advance(It, Sz);
7999 return Data.index() ==
Data.value();
8001 return std::nullopt;
8002 return std::move(ResOrder);
8004 if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
8005 (!TE.UserTreeIndex || !TE.UserTreeIndex.UserTE->hasState() ||
8007 (TE.ReorderIndices.empty() ||
isReverseOrder(TE.ReorderIndices)))
8008 return std::nullopt;
8009 if (TE.State == TreeEntry::SplitVectorize ||
8010 ((TE.State == TreeEntry::Vectorize ||
8011 TE.State == TreeEntry::StridedVectorize ||
8012 TE.State == TreeEntry::CompressVectorize) &&
8015 assert((TE.State == TreeEntry::SplitVectorize || !TE.isAltShuffle()) &&
8016 "Alternate instructions are only supported by "
8017 "BinaryOperator and CastInst.");
8018 return TE.ReorderIndices;
8020 if (!TopToBottom && IgnoreReorder && TE.State == TreeEntry::Vectorize &&
8021 TE.isAltShuffle()) {
8022 assert(TE.ReuseShuffleIndices.empty() &&
8023 "ReuseShuffleIndices should be "
8024 "empty for alternate instructions.");
8026 TE.buildAltOpShuffleMask(
8028 assert(TE.getMatchingMainOpOrAltOp(
I) &&
8029 "Unexpected main/alternate opcode");
8033 const int VF = TE.getVectorFactor();
8038 ResOrder[Mask[
I] % VF] =
I;
8040 return std::move(ResOrder);
8042 if (!TE.ReorderIndices.empty())
8043 return TE.ReorderIndices;
8044 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
8045 if (!TE.ReorderIndices.empty())
8046 return TE.ReorderIndices;
8049 for (
auto [
I, V] :
zip(UserBVHead, TE.Scalars)) {
8057 while (
II &&
II->hasOneUse() &&
II->getParent() == BB) {
8065 assert(BB1 != BB2 &&
"Expected different basic blocks.");
8066 if (!DT->isReachableFromEntry(BB1))
8068 if (!DT->isReachableFromEntry(BB2))
8070 auto *NodeA = DT->getNode(BB1);
8071 auto *NodeB = DT->getNode(BB2);
8072 assert(NodeA &&
"Should only process reachable instructions");
8073 assert(NodeB &&
"Should only process reachable instructions");
8074 assert((NodeA == NodeB) ==
8075 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
8076 "Different nodes should have different DFS numbers");
8077 return NodeA->getDFSNumIn() < NodeB->getDFSNumIn();
8079 auto PHICompare = [&](
unsigned I1,
unsigned I2) {
8080 Value *V1 = TE.Scalars[I1];
8081 Value *V2 = TE.Scalars[I2];
8094 if (FirstUserOfPhi1->getParent() != FirstUserOfPhi2->getParent())
8095 return CompareByBasicBlocks(FirstUserOfPhi1->getParent(),
8096 FirstUserOfPhi2->getParent());
8106 if (UserBVHead[I1] && !UserBVHead[I2])
8108 if (!UserBVHead[I1])
8110 if (UserBVHead[I1] == UserBVHead[I2])
8113 return CompareByBasicBlocks(UserBVHead[I1]->
getParent(),
8115 return UserBVHead[I1]->comesBefore(UserBVHead[I2]);
8128 if (EE1->getOperand(0) == EE2->getOperand(0))
8130 if (!Inst1 && Inst2)
8132 if (Inst1 && Inst2) {
8140 "Expected either instructions or arguments vector operands.");
8141 return P1->getArgNo() < P2->getArgNo();
8146 std::iota(Phis.
begin(), Phis.
end(), 0);
8149 return std::nullopt;
8150 return std::move(Phis);
8152 if (TE.isGather() &&
8153 (!TE.hasState() || !TE.isAltShuffle() ||
8154 ScalarsInSplitNodes.contains(TE.getMainOp())) &&
8158 if (((TE.hasState() && TE.getOpcode() == Instruction::ExtractElement) ||
8162 auto *EE = dyn_cast<ExtractElementInst>(V);
8163 return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
8169 canReuseExtract(TE.Scalars, CurrentOrder,
true);
8170 if (Reuse || !CurrentOrder.
empty())
8171 return std::move(CurrentOrder);
8179 int Sz = TE.Scalars.size();
8183 if (It == TE.Scalars.begin())
8186 if (It != TE.Scalars.end()) {
8188 unsigned Idx = std::distance(TE.Scalars.begin(), It);
8203 if (InsertFirstCost + PermuteCost < InsertIdxCost) {
8206 return std::move(Order);
8211 return std::nullopt;
8212 if (TE.Scalars.size() >= 3)
8217 if (TE.hasState() && TE.getOpcode() == Instruction::Load) {
8219 StridedPtrInfo SPtrInfo;
8222 CurrentOrder, PointerOps, SPtrInfo);
8225 return std::move(CurrentOrder);
8230 if (std::optional<OrdersType> CurrentOrder =
8232 return CurrentOrder;
8234 return std::nullopt;
8244 for (
unsigned I = Sz,
E = Mask.size();
I <
E;
I += Sz) {
8246 if (Cluster != FirstCluster)
8252void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE,
ArrayRef<int> Mask)
const {
8255 const unsigned Sz =
TE.Scalars.size();
8257 if (!
TE.isGather() ||
8264 addMask(NewMask,
TE.ReuseShuffleIndices);
8266 TE.ReorderIndices.clear();
8273 for (
auto *It =
TE.ReuseShuffleIndices.begin(),
8274 *End =
TE.ReuseShuffleIndices.end();
8275 It != End; std::advance(It, Sz))
8276 std::iota(It, std::next(It, Sz), 0);
8282 "Expected same size of orders");
8283 size_t Sz = Order.
size();
8286 if (Order[Idx] != Sz)
8287 UsedIndices.
set(Order[Idx]);
8289 if (SecondaryOrder.
empty()) {
8291 if (Order[Idx] == Sz && !UsedIndices.
test(Idx))
8295 if (SecondaryOrder[Idx] != Sz && Order[Idx] == Sz &&
8296 !UsedIndices.
test(SecondaryOrder[Idx]))
8297 Order[Idx] = SecondaryOrder[Idx];
8305 constexpr unsigned TinyVF = 2;
8306 constexpr unsigned TinyTree = 10;
8307 constexpr unsigned PhiOpsLimit = 12;
8308 constexpr unsigned GatherLoadsLimit = 2;
8309 if (VectorizableTree.size() <= TinyTree)
8311 if (VectorizableTree.front()->hasState() &&
8312 !VectorizableTree.front()->isGather() &&
8313 (VectorizableTree.front()->getOpcode() == Instruction::Store ||
8314 VectorizableTree.front()->getOpcode() == Instruction::PHI ||
8315 (VectorizableTree.front()->getVectorFactor() <= TinyVF &&
8316 (VectorizableTree.front()->getOpcode() == Instruction::PtrToInt ||
8317 VectorizableTree.front()->getOpcode() == Instruction::ICmp))) &&
8318 VectorizableTree.front()->ReorderIndices.empty()) {
8322 if (VectorizableTree.front()->hasState() &&
8323 VectorizableTree.front()->getOpcode() == Instruction::PHI &&
8324 VectorizableTree.front()->Scalars.size() == TinyVF &&
8325 VectorizableTree.front()->getNumOperands() > PhiOpsLimit)
8328 if (VectorizableTree.front()->hasState() &&
8329 VectorizableTree.front()->getOpcode() == Instruction::Store &&
8330 VectorizableTree.front()->ReorderIndices.empty()) {
8331 const unsigned ReorderedSplitsCnt =
8332 count_if(VectorizableTree, [&](
const std::unique_ptr<TreeEntry> &TE) {
8333 return TE->State == TreeEntry::SplitVectorize &&
8334 !TE->ReorderIndices.empty() && TE->UserTreeIndex.UserTE &&
8335 TE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
8338 if (ReorderedSplitsCnt <= 1 &&
8340 VectorizableTree, [&](
const std::unique_ptr<TreeEntry> &TE) {
8341 return ((!TE->isGather() &&
8342 (TE->ReorderIndices.empty() ||
8343 (TE->UserTreeIndex.UserTE &&
8344 TE->UserTreeIndex.UserTE->State ==
8345 TreeEntry::Vectorize &&
8346 !TE->UserTreeIndex.UserTE->ReuseShuffleIndices
8348 (TE->isGather() && TE->ReorderIndices.empty() &&
8349 (!TE->hasState() || TE->isAltShuffle() ||
8350 TE->getOpcode() == Instruction::Load ||
8351 TE->getOpcode() == Instruction::ZExt ||
8352 TE->getOpcode() == Instruction::SExt))) &&
8353 (VectorizableTree.front()->getVectorFactor() > TinyVF ||
8354 !TE->isGather() ||
none_of(TE->Scalars, [&](
Value *V) {
8355 return !isConstant(V) && isVectorized(V);
8357 })) >= VectorizableTree.size() - ReorderedSplitsCnt)
8360 bool HasPhis =
false;
8361 bool HasLoad =
true;
8362 unsigned GatherLoads = 0;
8363 for (
const std::unique_ptr<TreeEntry> &TE :
8364 ArrayRef(VectorizableTree).drop_front()) {
8365 if (TE->State == TreeEntry::SplitVectorize)
8367 if (!TE->hasState()) {
8371 if (VectorizableTree.front()->Scalars.size() == TinyVF &&
8376 if (TE->getOpcode() == Instruction::Load && TE->ReorderIndices.empty()) {
8377 if (!TE->isGather()) {
8384 if (GatherLoads >= GatherLoadsLimit)
8387 if (TE->getOpcode() == Instruction::GetElementPtr ||
8390 if (TE->getOpcode() != Instruction::PHI &&
8391 (!TE->hasCopyableElements() ||
8393 TE->Scalars.size() / 2))
8395 if (VectorizableTree.front()->Scalars.size() == TinyVF &&
8396 TE->getNumOperands() > PhiOpsLimit)
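// Editorial summary of the checks above: the named constants gate the
// reorder-profitability heuristics. Trees larger than TinyTree are always
// candidates; tiny store/PHI-rooted trees with no existing order, PHIs with
// more than PhiOpsLimit operands, and trees dominated by gathered loads
// (GatherLoadsLimit) bail out because the extra shuffles rarely pay off.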
8405void BoUpSLP::TreeEntry::reorderSplitNode(unsigned Idx, ArrayRef<int> Mask,
8407 assert(State == TreeEntry::SplitVectorize && "Expected split user node.");
8410 std::iota(NewMask.begin(), NewMask.end(), 0);
8411 std::iota(NewMaskOrder.begin(), NewMaskOrder.end(), 0);
8414 copy(MaskOrder, NewMaskOrder.begin());
8416 assert(Idx == 1 && "Expected either 0 or 1 index.");
8417 unsigned Offset = CombinedEntriesWithIndices.back().second;
8426 ReorderIndices.clear();
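// Illustrative reading of reorderSplitNode: both halves start as identity
// (the std::iota calls above), then Mask/MaskOrder overwrite only the half
// selected by Idx (shifted by CombinedEntriesWithIndices.back().second for
// Idx == 1), so the untouched half keeps its lanes in place.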
8445 ExternalUserReorderMap;
8449 for_each(VectorizableTree, [&, &TTIRef = *TTI](
8450 const std::unique_ptr<TreeEntry> &TE) {
8453 findExternalStoreUsersReorderIndices(TE.get());
8454 if (!ExternalUserReorderIndices.empty()) {
8455 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
8457 std::move(ExternalUserReorderIndices));
8463 if (TE->hasState() && TE->isAltShuffle() &&
8464 TE->State != TreeEntry::SplitVectorize) {
8465 Type *ScalarTy = TE->Scalars[0]->getType();
8467 unsigned Opcode0 = TE->getOpcode();
8468 unsigned Opcode1 = TE->getAltOpcode();
8472 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
8473 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
8479 bool IgnoreReorder =
8480 !UserIgnoreList && VectorizableTree.front()->hasState() &&
8481 (VectorizableTree.front()->getOpcode() == Instruction::InsertElement ||
8482 VectorizableTree.front()->getOpcode() == Instruction::Store);
8483 if (std::optional<OrdersType> CurrentOrder =
8493 const TreeEntry *UserTE = TE.get();
8495 if (!UserTE->UserTreeIndex)
8497 if (UserTE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
8498 UserTE->UserTreeIndex.UserTE->isAltShuffle() &&
8499 UserTE->UserTreeIndex.UserTE->Idx != 0)
8501 UserTE = UserTE->UserTreeIndex.UserTE;
8504 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
8505 if (!(TE->State == TreeEntry::Vectorize ||
8506 TE->State == TreeEntry::StridedVectorize ||
8507 TE->State == TreeEntry::SplitVectorize ||
8508 TE->State == TreeEntry::CompressVectorize) ||
8509 !TE->ReuseShuffleIndices.empty())
8510 GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
8511 if (TE->State == TreeEntry::Vectorize &&
8512 TE->getOpcode() == Instruction::PHI)
8513 PhisToOrders.try_emplace(TE.get(), *CurrentOrder);
8518 for (unsigned VF = VectorizableTree.front()->getVectorFactor();
8519 !VFToOrderedEntries.empty() && VF > 1; VF -= 2 - (VF & 1U)) {
8520 auto It = VFToOrderedEntries.find(VF);
8521 if (It == VFToOrderedEntries.end())
8535 for (const TreeEntry *OpTE : OrderedEntries) {
8538 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE) &&
8539 OpTE->State != TreeEntry::SplitVectorize)
8542 const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
8544 if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty()) {
8545 auto It = GathersToOrders.find(OpTE);
8546 if (It != GathersToOrders.end())
8549 if (OpTE->hasState() && OpTE->isAltShuffle()) {
8550 auto It = AltShufflesToOrders.find(OpTE);
8551 if (It != AltShufflesToOrders.end())
8554 if (OpTE->State == TreeEntry::Vectorize &&
8555 OpTE->getOpcode() == Instruction::PHI) {
8556 auto It = PhisToOrders.find(OpTE);
8557 if (It != PhisToOrders.end())
8560 return OpTE->ReorderIndices;
8563 auto It = ExternalUserReorderMap.find(OpTE);
8564 if (It != ExternalUserReorderMap.end()) {
8565 const auto &ExternalUserReorderIndices = It->second;
8569 if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
8570 OrdersUses.try_emplace(OrdersType(), 0).first->second +=
8571 ExternalUserReorderIndices.size();
8573 for (const OrdersType &ExtOrder : ExternalUserReorderIndices)
8574 ++OrdersUses.try_emplace(ExtOrder, 0).first->second;
8581 if (OpTE->State == TreeEntry::Vectorize &&
8582 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
8583 assert(!OpTE->isAltShuffle() &&
8584 "Alternate instructions are only supported by BinaryOperator "
8588 unsigned E = Order.size();
8591 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
8594 ++OrdersUses.try_emplace(CurrentOrder, 0).first->second;
8596 ++OrdersUses.try_emplace(Order, 0).first->second;
8599 if (OrdersUses.empty())
8602 unsigned IdentityCnt = 0;
8603 unsigned FilledIdentityCnt = 0;
8605 for (auto &Pair : OrdersUses) {
8607 if (!Pair.first.empty())
8608 FilledIdentityCnt += Pair.second;
8609 IdentityCnt += Pair.second;
8614 unsigned Cnt = IdentityCnt;
8615 for (auto &Pair : OrdersUses) {
8619 if (Cnt < Pair.second ||
8620 (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt &&
8621 Cnt == Pair.second && !BestOrder.empty() &&
8624 BestOrder = Pair.first;
8637 unsigned E = BestOrder.size();
8639 return I < E ? static_cast<int>(I) : PoisonMaskElem;
8642 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
8644 if (TE->Scalars.size() != VF) {
8645 if (TE->ReuseShuffleIndices.size() == VF) {
8646 assert(TE->State != TreeEntry::SplitVectorize &&
8647 "Split vectorized not expected.");
8652 (!TE->UserTreeIndex ||
8653 TE->UserTreeIndex.UserTE->Scalars.size() == VF ||
8654 TE->UserTreeIndex.UserTE->Scalars.size() == TE->Scalars.size() ||
8655 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize) &&
8656 "All users must be of VF size.");
8663 if (TE->UserTreeIndex && TE->UserTreeIndex.UserTE->hasState() &&
8669 reorderNodeWithReuses(*TE, Mask);
8671 if (TE->UserTreeIndex &&
8672 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize)
8673 TE->UserTreeIndex.UserTE->reorderSplitNode(
8674 TE->UserTreeIndex.EdgeIdx, Mask, MaskOrder);
8678 if ((TE->State == TreeEntry::SplitVectorize &&
8679 TE->ReuseShuffleIndices.empty()) ||
8680 ((TE->State == TreeEntry::Vectorize ||
8681 TE->State == TreeEntry::StridedVectorize ||
8682 TE->State == TreeEntry::CompressVectorize) &&
8687 (!TE->isAltShuffle() || (TE->State == TreeEntry::SplitVectorize &&
8688 TE->ReuseShuffleIndices.empty())) &&
8689 "Alternate instructions are only supported by BinaryOperator "
8695 TE->reorderOperands(Mask);
8698 TE->reorderOperands(Mask);
8699 assert(TE->ReorderIndices.empty() &&
8700 "Expected empty reorder sequence.");
8703 if (!TE->ReuseShuffleIndices.empty()) {
8710 addMask(NewReuses, TE->ReuseShuffleIndices);
8711 TE->ReuseShuffleIndices.swap(NewReuses);
8712 } else if (TE->UserTreeIndex &&
8713 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize)
8715 TE->UserTreeIndex.UserTE->reorderSplitNode(TE->UserTreeIndex.EdgeIdx,
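// A minimal sketch of the voting idiom used throughout this function (the
// exact map type is elided in this listing; the source itself calls
// try_emplace on OrdersUses):
//   ++OrdersUses.try_emplace(OrdersType(), 0).first->second; // identity vote
//   ++OrdersUses.try_emplace(SomeOrder, 0).first->second;    // concrete vote
// The empty key aggregates every entry content with the identity order,
// which is why IdentityCnt is seeded from it before scanning for a strictly
// more popular candidate order.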
8721void BoUpSLP::buildReorderableOperands(
8722 TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
8726 if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) {
8727 return OpData.first == I &&
8728 (OpData.second->State == TreeEntry::Vectorize ||
8729 OpData.second->State == TreeEntry::StridedVectorize ||
8730 OpData.second->State == TreeEntry::CompressVectorize ||
8731 OpData.second->State == TreeEntry::SplitVectorize);
8735 if (UserTE->hasState()) {
8736 if (UserTE->getOpcode() == Instruction::ExtractElement ||
8737 UserTE->getOpcode() == Instruction::ExtractValue)
8739 if (UserTE->getOpcode() == Instruction::InsertElement && I == 0)
8741 if (UserTE->getOpcode() == Instruction::Store &&
8742 UserTE->State == TreeEntry::Vectorize && I == 1)
8744 if (UserTE->getOpcode() == Instruction::Load &&
8745 (UserTE->State == TreeEntry::Vectorize ||
8746 UserTE->State == TreeEntry::StridedVectorize ||
8747 UserTE->State == TreeEntry::CompressVectorize))
8750 TreeEntry *TE = getOperandEntry(UserTE, I);
8751 assert(TE && "Expected operand entry.");
8752 if (!TE->isGather()) {
8755 Edges.emplace_back(I, TE);
8761 if (TE->State == TreeEntry::ScatterVectorize &&
8762 TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
8766 if (ReorderableGathers.contains(TE))
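// buildReorderableOperands (above) collects only operands whose order may
// still change: edges already claimed by a vectorized entry are skipped, as
// are operands with a fixed meaning (extract sources, the insertelement
// vector operand, the store value, the load pointer), and gather nodes
// qualify only when registered in ReorderableGathers.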
8772 struct TreeEntryCompare {
8773 bool operator()(const TreeEntry *LHS, const TreeEntry *RHS) const {
8774 if (LHS->UserTreeIndex && RHS->UserTreeIndex)
8775 return LHS->UserTreeIndex.UserTE->Idx < RHS->UserTreeIndex.UserTE->Idx;
8776 return LHS->Idx < RHS->Idx;
8785 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
8786 if (TE->State != TreeEntry::Vectorize &&
8787 TE->State != TreeEntry::StridedVectorize &&
8788 TE->State != TreeEntry::CompressVectorize &&
8789 TE->State != TreeEntry::SplitVectorize)
8790 NonVectorized.insert(TE.get());
8791 if (std::optional<OrdersType> CurrentOrder =
8793 Queue.push(TE.get());
8794 if (!(TE->State == TreeEntry::Vectorize ||
8795 TE->State == TreeEntry::StridedVectorize ||
8796 TE->State == TreeEntry::CompressVectorize ||
8797 TE->State == TreeEntry::SplitVectorize) ||
8798 !TE->ReuseShuffleIndices.empty())
8799 GathersToOrders.insert(TE.get());
8808 while (!Queue.empty()) {
8810 std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>> Users;
8811 TreeEntry *TE = Queue.top();
8812 const TreeEntry *UserTE = TE->UserTreeIndex.UserTE;
8815 while (!Queue.empty()) {
8817 if (!UserTE || UserTE != TE->UserTreeIndex.UserTE)
8822 for (TreeEntry *TE : OrderedOps) {
8823 if (!(TE->State == TreeEntry::Vectorize ||
8824 TE->State == TreeEntry::StridedVectorize ||
8825 TE->State == TreeEntry::CompressVectorize ||
8826 TE->State == TreeEntry::SplitVectorize ||
8827 (TE->isGather() && GathersToOrders.contains(TE))) ||
8828 !TE->UserTreeIndex || !TE->ReuseShuffleIndices.empty() ||
8829 !Visited.insert(TE).second)
8833 Users.first = TE->UserTreeIndex.UserTE;
8834 Users.second.emplace_back(TE->UserTreeIndex.EdgeIdx, TE);
8838 if (Data.first->State == TreeEntry::SplitVectorize) {
8840 Data.second.size() <= 2 &&
8841 "Expected not greater than 2 operands for split vectorize node.");
8843 [](const auto &Op) { return !Op.second->UserTreeIndex; }))
8846 assert(Data.first->CombinedEntriesWithIndices.size() == 2 &&
8847 "Expected exactly 2 entries.");
8848 for (const auto &P : Data.first->CombinedEntriesWithIndices) {
8849 TreeEntry &OpTE = *VectorizableTree[P.first];
8851 if (Order.empty() || !OpTE.ReuseShuffleIndices.empty()) {
8852 if (!OpTE.isGather() && OpTE.ReuseShuffleIndices.empty())
8854 const auto BestOrder =
8863 const unsigned E = Order.size();
8866 return I < E ? static_cast<int>(I) : PoisonMaskElem;
8868 Data.first->reorderSplitNode(P.second ? 1 : 0, Mask, MaskOrder);
8870 if (!OpTE.ReorderIndices.empty()) {
8871 OpTE.ReorderIndices.clear();
8872 } else if (!OpTE.ReuseShuffleIndices.empty()) {
8875 assert(OpTE.isGather() && "Expected only gather/buildvector node.");
8879 if (Data.first->ReuseShuffleIndices.empty() &&
8880 !Data.first->ReorderIndices.empty()) {
8883 Queue.push(Data.first);
8889 buildReorderableOperands(Data.first, Data.second, NonVectorized,
8901 for (const auto &Op : Data.second) {
8902 TreeEntry *OpTE = Op.second;
8903 if (!VisitedOps.insert(OpTE).second)
8905 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
8907 const auto Order = [&]() -> const OrdersType {
8908 if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty())
8912 return OpTE->ReorderIndices;
8916 if (Order.size() == 1)
8922 Value *Root = OpTE->hasState()
8925 auto GetSameNodesUsers = [&](Value *Root) {
8927 for (const TreeEntry *TE : ValueToGatherNodes.lookup(Root)) {
8928 if (TE != OpTE && TE->UserTreeIndex &&
8929 TE->getVectorFactor() == OpTE->getVectorFactor() &&
8930 TE->Scalars.size() == OpTE->Scalars.size() &&
8931 ((TE->ReorderIndices.empty() && OpTE->isSame(TE->Scalars)) ||
8932 (OpTE->ReorderIndices.empty() && TE->isSame(OpTE->Scalars))))
8933 Res.insert(TE->UserTreeIndex.UserTE);
8935 for (const TreeEntry *TE : getTreeEntries(Root)) {
8936 if (TE != OpTE && TE->UserTreeIndex &&
8937 TE->getVectorFactor() == OpTE->getVectorFactor() &&
8938 TE->Scalars.size() == OpTE->Scalars.size() &&
8939 ((TE->ReorderIndices.empty() && OpTE->isSame(TE->Scalars)) ||
8940 (OpTE->ReorderIndices.empty() && TE->isSame(OpTE->Scalars))))
8941 Res.insert(TE->UserTreeIndex.UserTE);
8945 auto GetNumOperands = [](const TreeEntry *TE) {
8946 if (TE->State == TreeEntry::SplitVectorize)
8947 return TE->getNumOperands();
8949 return CI->arg_size();
8950 return TE->getNumOperands();
8952 auto NodeShouldBeReorderedWithOperands = [&, TTI = TTI](
8953 const TreeEntry *TE) {
8961 const TreeEntry *Op = getOperandEntry(TE, Idx);
8962 if (Op->isGather() && Op->hasState()) {
8963 const TreeEntry *VecOp =
8964 getSameValuesTreeEntry(Op->getMainOp(), Op->Scalars);
8968 if (Op->ReorderIndices.empty() && Op->ReuseShuffleIndices.empty())
8975 if (!RevisitedOps.insert(UTE).second)
8977 return UTE == Data.first || !UTE->ReorderIndices.empty() ||
8978 !UTE->ReuseShuffleIndices.empty() ||
8979 (UTE->UserTreeIndex &&
8980 UTE->UserTreeIndex.UserTE == Data.first) ||
8981 (Data.first->UserTreeIndex &&
8982 Data.first->UserTreeIndex.UserTE == UTE) ||
8983 (IgnoreReorder && UTE->UserTreeIndex &&
8984 UTE->UserTreeIndex.UserTE->Idx == 0) ||
8985 NodeShouldBeReorderedWithOperands(UTE);
8988 for (TreeEntry *UTE : Users) {
8996 const TreeEntry *Op = getOperandEntry(UTE, Idx);
8998 Queue.push(const_cast<TreeEntry *>(Op));
9003 Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
9004 return P.second == OpTE;
9007 if (OpTE->State == TreeEntry::Vectorize &&
9008 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
9009 assert(!OpTE->isAltShuffle() &&
9010 "Alternate instructions are only supported by BinaryOperator "
9014 unsigned E = Order.size();
9017 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
9020 OrdersUses.try_emplace(CurrentOrder, 0).first->second += NumOps;
9022 OrdersUses.try_emplace(Order, 0).first->second += NumOps;
9024 auto Res = OrdersUses.try_emplace(OrdersType(), 0);
9025 const auto AllowsReordering = [&](const TreeEntry *TE) {
9026 if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
9027 (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
9028 (IgnoreReorder && TE->Idx == 0))
9030 if (TE->isGather()) {
9040 if (OpTE->UserTreeIndex) {
9041 TreeEntry *UserTE = OpTE->UserTreeIndex.UserTE;
9042 if (!VisitedUsers.insert(UserTE).second)
9047 if (AllowsReordering(UserTE))
9055 if (static_cast<unsigned>(count_if(
9056 Ops, [UserTE, &AllowsReordering](
9057 const std::pair<unsigned, TreeEntry *> &Op) {
9058 return AllowsReordering(Op.second) &&
9059 Op.second->UserTreeIndex.UserTE == UserTE;
9060 })) <= Ops.size() / 2)
9061 ++Res.first->second;
9064 if (OrdersUses.empty()) {
9069 unsigned IdentityCnt = 0;
9070 unsigned VF = Data.second.front().second->getVectorFactor();
9072 for (auto &Pair : OrdersUses) {
9074 IdentityCnt += Pair.second;
9079 unsigned Cnt = IdentityCnt;
9080 for (auto &Pair : OrdersUses) {
9084 if (Cnt < Pair.second) {
9086 BestOrder = Pair.first;
9103 unsigned E = BestOrder.size();
9105 return I < E ? static_cast<int>(I) : PoisonMaskElem;
9107 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) {
9108 TreeEntry *TE = Op.second;
9109 if (!VisitedOps.insert(TE).second)
9111 if (TE->ReuseShuffleIndices.size() == BestOrder.size()) {
9112 reorderNodeWithReuses(*TE, Mask);
9116 if (TE->State != TreeEntry::Vectorize &&
9117 TE->State != TreeEntry::StridedVectorize &&
9118 TE->State != TreeEntry::CompressVectorize &&
9119 TE->State != TreeEntry::SplitVectorize &&
9120 (TE->State != TreeEntry::ScatterVectorize ||
9121 TE->ReorderIndices.empty()))
9123 assert((BestOrder.size() == TE->ReorderIndices.size() ||
9124 TE->ReorderIndices.empty()) &&
9125 "Non-matching sizes of user/operand entries.");
9127 if (IgnoreReorder && TE == VectorizableTree.front().get())
9128 IgnoreReorder = false;
9131 for (TreeEntry *Gather : GatherOps) {
9133 "Unexpected reordering of gathers.");
9134 if (!Gather->ReuseShuffleIndices.empty()) {
9144 auto IsNotProfitableAltCodeNode = [](const TreeEntry &TE) {
9145 return TE.isAltShuffle() &&
9146 (!TE.ReuseShuffleIndices.empty() || TE.getVectorFactor() == 2 ||
9147 TE.ReorderIndices.empty());
9149 if (Data.first->State != TreeEntry::Vectorize ||
9151 Data.first->getMainOp()) ||
9152 IsNotProfitableAltCodeNode(*Data.first))
9153 Data.first->reorderOperands(Mask);
9155 IsNotProfitableAltCodeNode(*Data.first) ||
9156 Data.first->State == TreeEntry::StridedVectorize ||
9157 Data.first->State == TreeEntry::CompressVectorize) {
9161 if (Data.first->ReuseShuffleIndices.empty() &&
9162 !Data.first->ReorderIndices.empty() &&
9163 !IsNotProfitableAltCodeNode(*Data.first)) {
9166 Queue.push(Data.first);
9174 if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
9175 VectorizableTree.front()->ReuseShuffleIndices.empty())
9176 VectorizableTree.front()->ReorderIndices.clear();
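// Editorial summary of reorderBottomToTop: entries are drained from a
// priority queue keyed by their user node so all operands of one user are
// handled together; each group votes in OrdersUses for its preferred order
// (the empty order standing for identity), the winning order is applied to
// every operand, and the user is re-queued so the reordering keeps
// propagating toward the root.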
9179Instruction *BoUpSLP::getRootEntryInstruction(const TreeEntry &Entry) const {
9180 if (Entry.hasState() &&
9181 (Entry.getOpcode() == Instruction::Store ||
9182 Entry.getOpcode() == Instruction::Load) &&
9183 Entry.State == TreeEntry::StridedVectorize &&
9184 !Entry.ReorderIndices.empty() && isReverseOrder(Entry.ReorderIndices))
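// A reading of the check above: for strided loads/stores emitted in reverse
// order, the "root" scalar is chosen according to the reordered lane rather
// than lane 0 (the actually returned instruction is selected in the elided
// lines).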
9191 const size_t NumVectScalars = ScalarToTreeEntries.size() + 1;
9194 for (auto &TEPtr : VectorizableTree) {
9195 TreeEntry *Entry = TEPtr.get();
9198 if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize ||
9199 DeletedNodes.contains(Entry) ||
9200 TransformedToGatherNodes.contains(Entry))
9204 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
9205 Value *Scalar = Entry->Scalars[Lane];
9210 auto It = ScalarToExtUses.find(Scalar);
9211 if (It != ScalarToExtUses.end() && !ExternalUses[It->second].User)
9214 if (Scalar->hasNUsesOrMore(NumVectScalars)) {
9215 unsigned FoundLane = Entry->findLaneForValue(Scalar);
9216 LLVM_DEBUG(dbgs() << "SLP: Need to extract from lane " << FoundLane
9217 << " from " << *Scalar << " for many users.\n");
9218 It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
9219 ExternalUses.emplace_back(Scalar, nullptr, *Entry, FoundLane);
9220 ExternalUsesWithNonUsers.insert(Scalar);
9225 const auto ExtI = ExternallyUsedValues.find(Scalar);
9226 if (ExtI != ExternallyUsedValues.end()) {
9227 unsigned FoundLane = Entry->findLaneForValue(Scalar);
9228 LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
9229 << FoundLane << " from " << *Scalar << ".\n");
9230 ScalarToExtUses.try_emplace(Scalar, ExternalUses.size());
9231 ExternalUses.emplace_back(Scalar, nullptr, *Entry, FoundLane);
9242 if (UserIgnoreList && UserIgnoreList->contains(UserInst))
9247 any_of(UseEntries, [this](const TreeEntry *UseEntry) {
9248 return !DeletedNodes.contains(UseEntry) &&
9249 !TransformedToGatherNodes.contains(UseEntry);
9254 if (!((Scalar->getType()->getScalarType()->isPointerTy() &&
9257 all_of(UseEntries, [&](TreeEntry *UseEntry) {
9258 if (DeletedNodes.contains(UseEntry) ||
9259 TransformedToGatherNodes.contains(UseEntry))
9261 return UseEntry->State == TreeEntry::ScatterVectorize ||
9263 Scalar, getRootEntryInstruction(*UseEntry), TLI,
9266 LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
9269 [](TreeEntry *UseEntry) {
9270 return UseEntry->isGather();
9276 if (It != ScalarToExtUses.end()) {
9277 ExternalUses[It->second].User = nullptr;
9282 if (U && Scalar->hasNUsesOrMore(UsesLimit))
9284 unsigned FoundLane = Entry->findLaneForValue(Scalar);
9286 << " from lane " << FoundLane << " from " << *Scalar
9288 It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
9289 ExternalUses.emplace_back(Scalar, U, *Entry, FoundLane);
9290 ExternalUsesWithNonUsers.insert(Scalar);
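// Each ExternalUses record built above pairs a vectorized scalar with one
// out-of-tree user (nullptr when any user may need it), so extractelement
// instructions can be materialized lazily and deduplicated through
// ScalarToExtUses when the same scalar is requested again.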
9299BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
9303 for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) {
9304 Value *V = TE->Scalars[Lane];
9317 if (SI == nullptr || !SI->isSimple() || SI->getFunction() != F ||
9326 auto &StoresVec = PtrToStoresMap[{SI->getParent(),
9327 SI->getValueOperand()->getType(), Ptr}];
9330 if (StoresVec.size() > Lane)
9332 if (!StoresVec.empty()) {
9334 SI->getValueOperand()->getType(), SI->getPointerOperand(),
9335 SI->getValueOperand()->getType(),
9336 StoresVec.front()->getPointerOperand(), *DL, *SE,
9342 StoresVec.push_back(SI);
9347 for (auto &P : PtrToStoresMap) {
9362 StoreInst *S0 = StoresVec[0];
9367 StoreInst *SI = StoresVec[Idx];
9368 std::optional<int64_t> Diff =
9370 SI->getPointerOperand(), *DL, *SE,
9376 if (StoreOffsetVec.size() != StoresVec.size())
9378 sort(StoreOffsetVec, llvm::less_first());
9380 int64_t PrevDist = 0;
9381 for (const auto &P : StoreOffsetVec) {
9382 if (Idx > 0 && P.first != PrevDist + 1)
9390 ReorderIndices.assign(StoresVec.size(), 0);
9391 bool IsIdentity = true;
9393 ReorderIndices[P.second] = I;
9394 IsIdentity &= P.second == I;
9400 ReorderIndices.clear();
9407 for (unsigned Idx : Order)
9408 dbgs() << Idx << ", ";
9414BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const {
9415 unsigned NumLanes = TE->Scalars.size();
9428 if (StoresVec.size() != NumLanes)
9433 if (!canFormVector(StoresVec, ReorderIndices))
9438 ExternalReorderIndices.push_back(ReorderIndices);
9440 return ExternalReorderIndices;
9446 assert(TreeEntryToStridedPtrInfoMap.empty() &&
9447 "TreeEntryToStridedPtrInfoMap is not cleared");
9448 UserIgnoreList = &UserIgnoreLst;
9451 buildTreeRec(Roots, 0, EdgeInfo());
9456 assert(TreeEntryToStridedPtrInfoMap.empty() &&
9457 "TreeEntryToStridedPtrInfoMap is not cleared");
9460 buildTreeRec(Roots, 0, EdgeInfo());
9469 bool AddNew = true) {
9477 for (Value *V : VL) {
9481 if (R.isDeleted(LI) || R.isVectorized(LI) || !LI->isSimple())
9483 bool IsFound = false;
9484 for (auto [Map, Data] : zip(ClusteredDistToLoad, ClusteredLoads)) {
9485 assert(LI->getParent() == Data.front().first->getParent() &&
9486 LI->getType() == Data.front().first->getType() &&
9490 "Expected loads with the same type, same parent and same "
9491 "underlying pointer.");
9493 LI->getType(), LI->getPointerOperand(), Data.front().first->getType(),
9494 Data.front().first->getPointerOperand(), DL, SE,
9498 auto It = Map.find(*Dist);
9499 if (It != Map.end() && It->second != LI)
9501 if (It == Map.end()) {
9502 Data.emplace_back(LI, *Dist);
9503 Map.try_emplace(*Dist, LI);
9513 auto FindMatchingLoads =
9518 int64_t &Offset, unsigned &Start) {
9520 return GatheredLoads.end();
9529 std::optional<int64_t> Dist =
9531 Data.front().first->getType(),
9532 Data.front().first->getPointerOperand(), DL, SE,
9538 for (std::pair<LoadInst *, int64_t> P : Data) {
9544 unsigned NumUniques = 0;
9545 for (auto [Cnt, Pair] : enumerate(Loads)) {
9546 bool Used = DataLoads.contains(Pair.first);
9547 if (!Used && !DataDists.contains(*Dist + Pair.second)) {
9551 Repeated.insert(Cnt);
9554 if (NumUniques > 0 &&
9555 (Loads.size() == NumUniques ||
9556 (Loads.size() - NumUniques >= 2 &&
9557 Loads.size() - NumUniques >= Loads.size() / 2 &&
9563 return std::next(GatheredLoads.begin(), Idx);
9567 return GatheredLoads.end();
9569 for (ArrayRef<std::pair<LoadInst *, int64_t>> Data : ClusteredLoads) {
9573 auto *It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated,
9575 while (It != GatheredLoads.end()) {
9576 assert(!LocalToAdd.empty() && "Expected some elements to add.");
9577 for (unsigned Idx : LocalToAdd)
9580 It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated, Offset,
9584 return !ToAdd.contains(Idx) && !Repeated.contains(Idx);
9591 Loads.push_back(Data[Idx]);
9597 GatheredLoads, [&](ArrayRef<std::pair<LoadInst *, int64_t>> PD) {
9598 return PD.front().first->getParent() == LI->getParent() &&
9599 PD.front().first->getType() == LI->getType();
9601 while (It != GatheredLoads.end()) {
9604 std::next(It), GatheredLoads.end(),
9605 [&](ArrayRef<std::pair<LoadInst *, int64_t>> PD) {
9606 return PD.front().first->getParent() == LI->getParent() &&
9607 PD.front().first->getType() == LI->getType();
9611 GatheredLoads.emplace_back().append(Data.begin(), Data.end());
9612 AddNewLoads(GatheredLoads.emplace_back());
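// Clustering sketch for the code above: loads are grouped by (parent block,
// element type, pointer base); a load joins a cluster only when its constant
// distance to the cluster head is computable and that slot is not already
// taken, otherwise it opens a new cluster, and FindMatchingLoads later
// merges clusters that overlap in all but a few elements.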
9617void BoUpSLP::tryToVectorizeGatheredLoads(
9618 const SmallMapVector<
9619 std::tuple<BasicBlock *, Value *, Type *>,
9622 GatheredLoadsEntriesFirst = VectorizableTree.size();
9625 LoadEntriesToVectorize.size());
9626 for (auto [Idx, Set] : zip(LoadEntriesToVectorize, LoadSetsToVectorize))
9627 Set.insert_range(VectorizableTree[Idx]->Scalars);
9630 auto LoadSorter = [](const std::pair<LoadInst *, int64_t> &L1,
9631 const std::pair<LoadInst *, int64_t> &L2) {
9632 return L1.second > L2.second;
9639 auto *Ty = getWidenedType(Loads.front()->getType(), Loads.size());
9640 return TTI->isLegalMaskedGather(Ty, Alignment) &&
9641 !TTI->forceScalarizeMaskedGather(Ty, Alignment);
9646 SmallVectorImpl<LoadInst *> &NonVectorized,
9647 bool Final, unsigned MaxVF) {
9649 unsigned StartIdx = 0;
9650 SmallVector<int> CandidateVFs;
9654 *TTI, Loads.front()->getType(), MaxVF);
9656 *TTI, Loads.front()->getType(), NumElts - 1)) {
9662 if (Final && CandidateVFs.empty())
9665 unsigned BestVF = Final ? CandidateVFs.back() : 0;
9666 for (unsigned NumElts : CandidateVFs) {
9667 if (Final && NumElts > BestVF)
9669 SmallVector<unsigned> MaskedGatherVectorized;
9670 for (unsigned Cnt = StartIdx, E = Loads.size(); Cnt < E;
9674 if (VectorizedLoads.count(Slice.front()) ||
9675 VectorizedLoads.count(Slice.back()) ||
9681 bool AllowToVectorize = false;
9684 bool IsLegalBroadcastLoad = TTI->isLegalBroadcastLoad(
9687 for (LoadInst *LI : Slice) {
9689 if (LI->hasOneUse())
9695 if (static_cast<unsigned int>(std::distance(
9696 LI->user_begin(), LI->user_end())) != LI->getNumUses())
9698 if (!IsLegalBroadcastLoad)
9702 for (User *U : LI->users()) {
9705 for (const TreeEntry *UTE : getTreeEntries(U)) {
9706 for (int I : seq<int>(UTE->getNumOperands())) {
9708 return V == LI || isa<PoisonValue>(V);
9718 AllowToVectorize = CheckIfAllowed(Slice);
9722 any_of(ValueToGatherNodes.at(Slice.front()),
9723 [=](const TreeEntry *TE) {
9724 return TE->Scalars.size() == 2 &&
9725 ((TE->Scalars.front() == Slice.front() &&
9726 TE->Scalars.back() == Slice.back()) ||
9727 (TE->Scalars.front() == Slice.back() &&
9728 TE->Scalars.back() == Slice.front()));
9733 if (AllowToVectorize) {
9738 reinterpret_cast<Value *const *>(Slice.begin()), Slice.size());
9739 StridedPtrInfo SPtrInfo;
9741 PointerOps, SPtrInfo, &BestVF);
9743 (BestVF > 1 && static_cast<unsigned>(NumElts) == 2 * BestVF)) {
9745 if (MaskedGatherVectorized.empty() ||
9746 Cnt >= MaskedGatherVectorized.back() + NumElts)
9751 Results.emplace_back(Values, LS);
9752 VectorizedLoads.insert_range(Slice);
9755 if (Cnt == StartIdx)
9756 StartIdx += NumElts;
9759 if (StartIdx >= Loads.size())
9763 if (!MaskedGatherVectorized.empty() &&
9764 Cnt < MaskedGatherVectorized.back() + NumElts)
9770 if (!AllowToVectorize || BestVF == 0)
9774 for (unsigned Cnt : MaskedGatherVectorized) {
9776 Cnt, std::min<unsigned>(NumElts, Loads.size() - Cnt));
9780 VectorizedLoads.insert_range(Slice);
9782 if (Cnt == StartIdx)
9783 StartIdx += NumElts;
9786 for (LoadInst *LI : Loads) {
9787 if (!VectorizedLoads.contains(LI))
9788 NonVectorized.push_back(LI);
9792 auto ProcessGatheredLoads =
9795 bool Final = false) {
9797 for (ArrayRef<std::pair<LoadInst *, int64_t>> LoadsDists :
9799 if (LoadsDists.size() <= 1) {
9800 NonVectorized.push_back(LoadsDists.back().first);
9808 unsigned MaxConsecutiveDistance = 0;
9809 unsigned CurrentConsecutiveDist = 1;
9810 int64_t LastDist = LocalLoadsDists.front().second;
9811 bool AllowMaskedGather = IsMaskedGatherSupported(OriginalLoads);
9812 for (const std::pair<LoadInst *, int64_t> &L : LocalLoadsDists) {
9815 assert(LastDist >= L.second &&
9816 "Expected first distance always not less than second");
9817 if (static_cast<uint64_t>(LastDist - L.second) ==
9818 CurrentConsecutiveDist) {
9819 ++CurrentConsecutiveDist;
9820 MaxConsecutiveDistance =
9821 std::max(MaxConsecutiveDistance, CurrentConsecutiveDist);
9825 if (!AllowMaskedGather && CurrentConsecutiveDist == 1 &&
9828 CurrentConsecutiveDist = 1;
9829 LastDist = L.second;
9832 if (Loads.size() <= 1)
9834 if (AllowMaskedGather)
9835 MaxConsecutiveDistance = Loads.size();
9836 else if (MaxConsecutiveDistance < 2)
9841 GetVectorizedRanges(Loads, VectorizedLoads, SortedNonVectorized,
9842 Final, MaxConsecutiveDistance);
9844 OriginalLoads.size() == Loads.size() &&
9845 MaxConsecutiveDistance == Loads.size() &&
9850 VectorizedLoads.clear();
9854 GetVectorizedRanges(OriginalLoads, VectorizedLoads,
9855 UnsortedNonVectorized, Final,
9856 OriginalLoads.size());
9857 if (SortedNonVectorized.size() >= UnsortedNonVectorized.size()) {
9858 SortedNonVectorized.swap(UnsortedNonVectorized);
9859 Results.swap(UnsortedResults);
9864 << Slice.size() << ")\n");
9866 for (Value *L : Slice)
9874 unsigned MaxVF = Slice.size();
9875 unsigned UserMaxVF = 0;
9876 unsigned InterleaveFactor = 0;
9881 std::optional<unsigned> InterleavedLoadsDistance = 0;
9883 std::optional<unsigned> CommonVF = 0;
9884 DenseMap<const TreeEntry *, unsigned> EntryToPosition;
9885 SmallPtrSet<const TreeEntry *, 8> DeinterleavedNodes;
9886 for (auto [Idx, V] : enumerate(Slice)) {
9887 for (const TreeEntry *E : ValueToGatherNodes.at(V)) {
9888 UserMaxVF = std::max<unsigned>(UserMaxVF, E->Scalars.size());
9891 UserMaxVF = std::max<unsigned>(UserMaxVF, Idx - Pos + 1);
9893 if (*CommonVF == 0) {
9894 CommonVF = E->Scalars.size();
9897 if (*CommonVF != E->Scalars.size())
9901 if (Pos != Idx && InterleavedLoadsDistance) {
9904 if (isa<Constant>(V))
9906 if (isVectorized(V))
9908 const auto &Nodes = ValueToGatherNodes.at(V);
9909 return (Nodes.size() != 1 || !Nodes.contains(E)) &&
9910 !is_contained(Slice, V);
9912 InterleavedLoadsDistance.reset();
9916 if (*InterleavedLoadsDistance == 0) {
9917 InterleavedLoadsDistance = Idx - Pos;
9920 if ((Idx - Pos) % *InterleavedLoadsDistance != 0 ||
9921 (Idx - Pos) / *InterleavedLoadsDistance < Order)
9922 InterleavedLoadsDistance.reset();
9923 Order = (Idx - Pos) / InterleavedLoadsDistance.value_or(1);
9927 DeinterleavedNodes.clear();
9929 if (InterleavedLoadsDistance.value_or(0) > 1 &&
9930 CommonVF.value_or(0) != 0) {
9931 InterleaveFactor = bit_ceil(*InterleavedLoadsDistance);
9932 unsigned VF = *CommonVF;
9935 StridedPtrInfo SPtrInfo;
9937 if (InterleaveFactor <= Slice.size() &&
9938 TTI.isLegalInterleavedAccessType(
9946 UserMaxVF = InterleaveFactor * VF;
9948 InterleaveFactor = 0;
9953 unsigned ConsecutiveNodesSize = 0;
9954 if (!LoadEntriesToVectorize.empty() && InterleaveFactor == 0 &&
9955 any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
9956 [&, Slice = Slice](const auto &P) {
9958 return std::get<1>(P).contains(V);
9960 if (It == Slice.end())
9962 const TreeEntry &TE =
9963 *VectorizableTree[std::get<0>(P)];
9967 StridedPtrInfo SPtrInfo;
9969 VL, VL.front(), Order, PointerOps, SPtrInfo);
9973 ConsecutiveNodesSize += VL.size();
9974 size_t Start = std::distance(Slice.begin(), It);
9975 size_t Sz = Slice.size() - Start;
9976 return Sz < VL.size() ||
9977 Slice.slice(Start, VL.size()) != VL;
9982 if (InterleaveFactor == 0 &&
9984 [&, Slice = Slice](unsigned Idx) {
9986 SmallVector<Value *> PointerOps;
9987 StridedPtrInfo SPtrInfo;
9988 return canVectorizeLoads(
9989 Slice.slice(Idx * UserMaxVF, UserMaxVF),
9990 Slice[Idx * UserMaxVF], Order, PointerOps,
9991 SPtrInfo) == LoadsState::ScatterVectorize;
9994 if (Slice.size() != ConsecutiveNodesSize)
9995 MaxVF = std::min<unsigned>(MaxVF, UserMaxVF);
9997 for (unsigned VF = MaxVF; VF >= 2; VF /= 2) {
9998 bool IsVectorized = true;
9999 for (unsigned I = 0, E = Slice.size(); I < E; I += VF) {
10001 Slice.slice(I, std::min(VF, E - I));
10006 if (any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
10007 [&](const auto &P) {
10008 return !SubSlice.equals(
10009 VectorizableTree[std::get<0>(P)]
10014 unsigned Sz = VectorizableTree.size();
10015 buildTreeRec(SubSlice, 0, EdgeInfo(), InterleaveFactor);
10016 if (Sz == VectorizableTree.size()) {
10017 IsVectorized = false;
10020 if (InterleaveFactor > 0) {
10021 VF = 2 * (MaxVF / InterleaveFactor);
10022 InterleaveFactor = 0;
10031 NonVectorized.append(SortedNonVectorized);
10033 return NonVectorized;
10035 for (const auto &GLs : GatheredLoads) {
10036 const auto &Ref = GLs.second;
10038 if (!Ref.empty() && !NonVectorized.empty() &&
10040 Ref.begin(), Ref.end(), 0u,
10041 [](unsigned S, ArrayRef<std::pair<LoadInst *, int64_t>> LoadsDists)
10042 -> unsigned { return S + LoadsDists.size(); }) !=
10043 NonVectorized.size() &&
10044 IsMaskedGatherSupported(NonVectorized)) {
10046 FinalGatheredLoads;
10047 for (LoadInst *LI : NonVectorized) {
10051 FinalGatheredLoads,
10055 (void)ProcessGatheredLoads(FinalGatheredLoads, true);
10059 for (unsigned Idx : LoadEntriesToVectorize) {
10060 const TreeEntry &E = *VectorizableTree[Idx];
10063 if (!E.ReorderIndices.empty()) {
10066 SmallVector<int> ReorderMask;
10070 buildTreeRec(GatheredScalars, 0, EdgeInfo());
10074 if (static_cast<unsigned>(*GatheredLoadsEntriesFirst) ==
10075 VectorizableTree.size())
10076 GatheredLoadsEntriesFirst.reset();
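// Illustrative walk of the retry loop above: with MaxVF == 8 the slices are
// tried at VF 8, then 4, then 2; a sub-slice counts as vectorized only if
// buildTreeRec actually grew VectorizableTree, and an interleaved group
// restarts the search once at VF = 2 * (MaxVF / InterleaveFactor).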
10086 bool AllowAlternate) {
10092 if (LI->isSimple())
10103 SubKey = hash_value(EI->getVectorOperand());
10110 if (AllowAlternate)
10121 std::pair<size_t, size_t> OpVals =
10129 if (CI->isCommutative())
10151 SubKey = hash_value(Gep->getPointerOperand());
10163 return std::make_pair(Key, SubKey);
10169 Instruction *AltOp, const TargetLibraryInfo &TLI);
10171bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S,
10173 Type *ScalarTy = S.getMainOp()->getType();
10174 unsigned Opcode0 = S.getOpcode();
10175 unsigned Opcode1 = S.getAltOpcode();
10176 SmallBitVector OpcodeMask(getAltInstrMask(VL, ScalarTy, Opcode0, Opcode1));
10179 Opcode1, OpcodeMask))
10182 for (unsigned I : seq<unsigned>(S.getMainOp()->getNumOperands())) {
10185 for (Value *V : VL) {
10187 Operands.back().push_back(
10194 if (Operands.size() == 2) {
10198 Candidates[0] = std::make_pair(Operands[0][I], Operands[0][I + 1]);
10199 Candidates[1] = std::make_pair(Operands[0][I], Operands[1][I + 1]);
10200 Candidates[2] = std::make_pair(Operands[1][I], Operands[0][I + 1]);
10202 switch (Res.value_or(0)) {
10206 std::swap(Operands[0][I + 1], Operands[1][I + 1]);
10216 DenseSet<unsigned> UniqueOpcodes;
10217 constexpr unsigned NumAltInsts = 3;
10218 unsigned NonInstCnt = 0;
10221 unsigned UndefCnt = 0;
10223 unsigned ExtraShuffleInsts = 0;
10226 if (Operands.size() == 2) {
10228 if (Operands.front() == Operands.back()) {
10232 return is_contained(Operands.back(), V);
10235 ++ExtraShuffleInsts;
10238 const Loop *L = LI->getLoopFor(S.getMainOp()->getParent());
10250 DenseMap<Value *, unsigned> Uniques;
10260 if (!Res.second && Res.first->second == 1)
10261 ++ExtraShuffleInsts;
10262 ++Res.first->getSecond();
10264 UniqueOpcodes.insert(I->getOpcode());
10265 else if (Res.second)
10268 return none_of(Uniques, [&](const auto &P) {
10269 return P.first->hasNUsesOrMore(P.second + 1) &&
10270 none_of(P.first->users(), [&](User *U) {
10271 return isVectorized(U) || Uniques.contains(U);
10280 (UndefCnt < (VL.size() - 1) * S.getMainOp()->getNumOperands() &&
10281 (UniqueOpcodes.size() + NonInstCnt + ExtraShuffleInsts +
10282 NumAltInsts) < S.getMainOp()->getNumOperands() * VL.size());
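// Net effect of the count above (editorial summary): alternate-opcode
// vectorization is treated as profitable only while the estimated overhead
// (distinct operand opcodes + non-instruction operands + extra shuffles +
// NumAltInsts) stays below the scalar operand count, and undefs do not
// dominate the bundle.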
10289 const unsigned VF, unsigned MinBW,
10312static std::pair<InstructionCost, InstructionCost>
10332 FMF = FPCI->getFastMathFlags();
10335 LibCost.isValid() ? LibCost : ScalarLimit);
10345BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
10347 bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
10348 SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo) {
10350 "Expected instructions with same/alternate opcodes only.");
10352 unsigned ShuffleOrOp =
10353 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
10355 switch (ShuffleOrOp) {
10356 case Instruction::PHI: {
10359 return TreeEntry::NeedToGather;
10361 for (Value *V : VL) {
10365 for (Value *Incoming : PHI->incoming_values()) {
10367 if (Term && Term->isTerminator()) {
10369 << "SLP: Need to swizzle PHINodes (terminator use).\n");
10370 return TreeEntry::NeedToGather;
10375 return TreeEntry::Vectorize;
10377 case Instruction::ExtractElement:
10384 return TreeEntry::NeedToGather;
10386 case Instruction::ExtractValue: {
10387 bool Reuse = canReuseExtract(VL, CurrentOrder);
10391 return TreeEntry::NeedToGather;
10392 if (Reuse || !CurrentOrder.empty())
10393 return TreeEntry::Vectorize;
10395 return TreeEntry::NeedToGather;
10397 case Instruction::InsertElement: {
10401 for (Value *V : VL) {
10403 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement/poison vector.\n");
10404 return TreeEntry::NeedToGather;
10408 "Non-constant or undef index?");
10412 return !SourceVectors.contains(V);
10415 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
10416 "different source vectors.\n");
10417 return TreeEntry::NeedToGather;
10422 return SourceVectors.contains(V) && !V->hasOneUse();
10425 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
10426 "multiple uses.\n");
10427 return TreeEntry::NeedToGather;
10430 return TreeEntry::Vectorize;
10432 case Instruction::Load: {
10439 auto IsGatheredNode = [&]() {
10440 if (!GatheredLoadsEntriesFirst)
10445 return any_of(getTreeEntries(V), [&](const TreeEntry *TE) {
10446 return TE->Idx >= *GatheredLoadsEntriesFirst;
10452 return TreeEntry::Vectorize;
10454 if (!IsGraphTransformMode && !VectorizableTree.empty()) {
10456 LoadEntriesToVectorize.insert(VectorizableTree.size());
10457 return TreeEntry::NeedToGather;
10459 return IsGatheredNode() ? TreeEntry::NeedToGather
10460 : TreeEntry::CompressVectorize;
10462 if (!IsGraphTransformMode && !VectorizableTree.empty()) {
10464 LoadEntriesToVectorize.insert(VectorizableTree.size());
10465 return TreeEntry::NeedToGather;
10467 return IsGatheredNode() ? TreeEntry::NeedToGather
10468 : TreeEntry::ScatterVectorize;
10470 if (!IsGraphTransformMode && VectorizableTree.size() > 1) {
10472 LoadEntriesToVectorize.insert(VectorizableTree.size());
10473 return TreeEntry::NeedToGather;
10475 return IsGatheredNode() ? TreeEntry::NeedToGather
10476 : TreeEntry::StridedVectorize;
10480 if (DL->getTypeSizeInBits(ScalarTy) !=
10481 DL->getTypeAllocSizeInBits(ScalarTy))
10482 LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
10485 return !LI || !LI->isSimple();
10489 LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
10492 return TreeEntry::NeedToGather;
10496 case Instruction::ZExt:
10497 case Instruction::SExt:
10498 case Instruction::FPToUI:
10499 case Instruction::FPToSI:
10500 case Instruction::FPExt:
10501 case Instruction::PtrToInt:
10502 case Instruction::IntToPtr:
10503 case Instruction::SIToFP:
10504 case Instruction::UIToFP:
10505 case Instruction::Trunc:
10506 case Instruction::FPTrunc:
10507 case Instruction::BitCast: {
10509 for (Value *V : VL) {
10515 dbgs() << "SLP: Gathering casts with different src types.\n");
10516 return TreeEntry::NeedToGather;
10519 return TreeEntry::Vectorize;
10521 case Instruction::ICmp:
10522 case Instruction::FCmp: {
10527 for (Value *V : VL) {
10531 if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
10532 Cmp->getOperand(0)->getType() != ComparedTy) {
10533 LLVM_DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
10534 return TreeEntry::NeedToGather;
10537 return TreeEntry::Vectorize;
10539 case Instruction::Select:
10540 case Instruction::FNeg:
10541 case Instruction::Add:
10542 case Instruction::FAdd:
10543 case Instruction::Sub:
10544 case Instruction::FSub:
10545 case Instruction::Mul:
10546 case Instruction::FMul:
10547 case Instruction::UDiv:
10548 case Instruction::SDiv:
10549 case Instruction::FDiv:
10550 case Instruction::URem:
10551 case Instruction::SRem:
10552 case Instruction::FRem:
10553 case Instruction::Shl:
10554 case Instruction::LShr:
10555 case Instruction::AShr:
10556 case Instruction::And:
10557 case Instruction::Or:
10558 case Instruction::Xor:
10559 case Instruction::Freeze:
10560 if (S.getMainOp()->getType()->isFloatingPointTy() &&
10561 TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) {
10563 return I && I->isBinaryOp() && !I->isFast();
10565 return TreeEntry::NeedToGather;
10566 return TreeEntry::Vectorize;
10567 case Instruction::GetElementPtr: {
10569 for (Value *V : VL) {
10573 if (I->getNumOperands() != 2) {
10574 LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
10575 return TreeEntry::NeedToGather;
10582 for (Value *V : VL) {
10586 Type *CurTy = GEP->getSourceElementType();
10587 if (Ty0 != CurTy) {
10588 LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
10589 return TreeEntry::NeedToGather;
10595 for (Value *V : VL) {
10599 auto *Op = I->getOperand(1);
10601 (Op->getType() != Ty1 &&
10603 Op->getType()->getScalarSizeInBits() >
10604 DL->getIndexSizeInBits(
10605 V->getType()->getPointerAddressSpace())))) {
10607 dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
10608 return TreeEntry::NeedToGather;
10614 case Instruction::Store: {
10616 llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
10619 if (DL->getTypeSizeInBits(ScalarTy) !=
10620 DL->getTypeAllocSizeInBits(ScalarTy)) {
10621 LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n");
10622 return TreeEntry::NeedToGather;
10626 for (Value *V : VL) {
10628 if (!SI->isSimple()) {
10630 return TreeEntry::NeedToGather;
10639 if (CurrentOrder.empty()) {
10640 Ptr0 = PointerOps.front();
10641 PtrN = PointerOps.back();
10643 Ptr0 = PointerOps[CurrentOrder.front()];
10644 PtrN = PointerOps[CurrentOrder.back()];
10646 std::optional<int64_t> Dist =
10649 if (static_cast<uint64_t>(*Dist) == VL.size() - 1)
10650 return TreeEntry::Vectorize;
10650 return TreeEntry::Vectorize;
10654 return TreeEntry::NeedToGather;
10656 case Instruction::Call: {
10657 if (S.getMainOp()->getType()->isFloatingPointTy() &&
10658 TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) {
10660 return I && !I->isFast();
10662 return TreeEntry::NeedToGather;
10672 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
10676 return TreeEntry::NeedToGather;
10679 unsigned NumArgs = CI->arg_size();
10680 SmallVector<Value *, 4> ScalarArgs(NumArgs, nullptr);
10681 for (unsigned J = 0; J != NumArgs; ++J)
10684 for (Value *V : VL) {
10689 VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) ||
10691 LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V
10693 return TreeEntry::NeedToGather;
10697 for (unsigned J = 0; J != NumArgs; ++J) {
10700 if (ScalarArgs[J] != A1J) {
10702 << "SLP: mismatched arguments in call:" << *CI
10703 << " argument " << ScalarArgs[J] << "!=" << A1J << "\n");
10704 return TreeEntry::NeedToGather;
10713 LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI
10714 << "!=" << *V << '\n');
10715 return TreeEntry::NeedToGather;
10720 auto *VecTy = getWidenedType(S.getMainOp()->getType(), VL.size());
10722 if (!VecCallCosts.first.isValid() && !VecCallCosts.second.isValid())
10723 return TreeEntry::NeedToGather;
10725 return TreeEntry::Vectorize;
10727 case Instruction::ShuffleVector: {
10728 if (!S.isAltShuffle()) {
10731 return TreeEntry::Vectorize;
10734 LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
10735 return TreeEntry::NeedToGather;
10740 << "SLP: ShuffleVector not vectorized, operands are buildvector and "
10741 "the whole alt sequence is not profitable.\n");
10742 return TreeEntry::NeedToGather;
10745 return TreeEntry::Vectorize;
10749 return TreeEntry::NeedToGather;
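// Editorial summary of the switch above: every opcode either accepts the
// bundle (Vectorize, or the strided/scatter/compressed load states) or
// falls back to NeedToGather with a debug message naming the reason, e.g.
// mismatched cast source types, different cmp predicates, non-simple or
// non-consecutive stores, or calls with mismatched callees/arguments.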
10758 PHINode *Main = nullptr;
10763 PHIHandler() = delete;
10765 : DT(DT), Main(Main), Phis(Phis),
10766 Operands(Main->getNumIncomingValues(),
10768 void buildOperands() {
10769 constexpr unsigned FastLimit = 4;
10778 for (auto [Idx, V] : enumerate(Phis)) {
10782 "Expected isa instruction or poison value.");
10783 Operands[I][Idx] = V;
10786 if (P->getIncomingBlock(I) == InBB)
10787 Operands[I][Idx] = P->getIncomingValue(I);
10789 Operands[I][Idx] = P->getIncomingValueForBlock(InBB);
10794 SmallMapVector<BasicBlock *, SmallVector<unsigned>, 4>
10804 for (auto [Idx, V] : enumerate(Phis)) {
10807 Operands[I][Idx] = V;
10816 Operands[I][Idx] = P->getIncomingValue(I);
10819 auto *It = Blocks.find(InBB);
10820 if (It == Blocks.end())
10822 Operands[It->second.front()][Idx] = P->getIncomingValue(I);
10825 for (const auto &P : Blocks) {
10826 ArrayRef<unsigned> IncomingValues = P.second;
10827 if (IncomingValues.size() <= 1)
10830 for (unsigned I : IncomingValues) {
10832 [&](const auto &Data) {
10833 return !Data.value() ||
10834 Data.value() == Operands[BasicI][Data.index()];
10836 "Expected empty operands list.");
10837 Operands[I] = Operands[BasicI];
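// PHIHandler::buildOperands (above) transposes PHI inputs into per-edge
// lanes: Operands[I][Idx] is the value reaching Phis[Idx] along the main
// PHI's I-th incoming edge. Up to FastLimit incoming edges are matched
// positionally; beyond that, incoming blocks are resolved through the
// Blocks map and columns for duplicated blocks are copied from the first
// occurrence.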
10850static std::pair<Instruction *, Instruction *>
10854 for (Value *V : VL) {
10864 if (MainOp->getOpcode() == I->getOpcode()) {
10883 "Expected different main and alt instructions.");
10884 return std::make_pair(MainOp, AltOp);
10897 const InstructionsState &S,
10899 bool TryPad = false) {
10903 for (Value *V : VL) {
10919 size_t NumUniqueScalarValues = UniqueValues.size();
10922 if (NumUniqueScalarValues == VL.size() &&
10924 ReuseShuffleIndices.clear();
10929 if ((UserTreeIdx.UserTE &&
10930 UserTreeIdx.UserTE->hasNonWholeRegisterOrNonPowerOf2Vec(TTI)) ||
10932 LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
10933 "for nodes with padding.\n");
10934 ReuseShuffleIndices.clear();
10939 if (NumUniqueScalarValues <= 1 || !IsFullVectors ||
10943 if (TryPad && UniquePositions.size() > 1 && NumUniqueScalarValues > 1 &&
10944 S.getMainOp()->isSafeToRemove() &&
10945 (S.areInstructionsWithCopyableElements() ||
10949 TTI, UniqueValues.front()->getType(), UniqueValues.size());
10950 PWSz = std::min<unsigned>(PWSz, VL.size());
10951 if (PWSz == VL.size()) {
10955 ReuseShuffleIndices.clear();
10959 UniqueValues.end());
10960 PaddedUniqueValues.append(
10961 PWSz - UniqueValues.size(),
10965 if ((!S.areInstructionsWithCopyableElements() &&
10967 (S.areInstructionsWithCopyableElements() && S.isMulDivLikeOp() &&
10968 (S.getMainOp()->isIntDivRem() || S.getMainOp()->isFPDivRem() ||
10971 ReuseShuffleIndices.clear();
10974 VL = std::move(PaddedUniqueValues);
10979 ReuseShuffleIndices.clear();
10982 VL = std::move(UniqueValues);
10987 const InstructionsState &LocalState,
10988 SmallVectorImpl<Value *> &Op1,
10989 SmallVectorImpl<Value *> &Op2,
10991 constexpr unsigned SmallNodeSize = 4;
10992 if (VL.size() <= SmallNodeSize || TTI->preferAlternateOpcodeVectorization() ||
10997 LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *LocalState.getMainOp()
10999 for (TreeEntry *E : getSplitTreeEntries(LocalState.getMainOp())) {
11000 if (E->isSame(VL)) {
11002 << *LocalState.getMainOp() << ".\n");
11014 ReorderIndices.assign(VL.size(), VL.size());
11015 SmallBitVector Op1Indices(VL.size());
11020 Op1Indices.set(Idx);
11023 if ((LocalState.getAltOpcode() != LocalState.getOpcode() &&
11026 (LocalState.getAltOpcode() == LocalState.getOpcode() &&
11028 LocalState.getAltOp(), *TLI))) {
11030 Op1Indices.set(Idx);
11037 unsigned Opcode0 = LocalState.getOpcode();
11038 unsigned Opcode1 = LocalState.getAltOpcode();
11039 SmallBitVector OpcodeMask(getAltInstrMask(VL, ScalarTy, Opcode0, Opcode1));
11044 if (UOp1.size() <= 1 || UOp2.size() <= 1 ||
11045 TTI->isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask) ||
11050 unsigned Op1Cnt = 0, Op2Cnt = Op1.size();
11052 if (Op1Indices.test(Idx)) {
11053 ReorderIndices[Op1Cnt] = Idx;
11056 ReorderIndices[Op2Cnt] = Idx;
11061 ReorderIndices.clear();
11062 SmallVector<int> Mask;
11063 if (!ReorderIndices.empty())
11065 unsigned NumParts = TTI->getNumberOfParts(VecTy);
11070 if (NumParts >= VL.size())
11075 FixedVectorType *SubVecTy =
11079 if (!LocalState.isCmpOp() && NumParts <= 1 &&
11080 (Mask.empty() || InsertCost >= NewShuffleCost))
11082 if ((LocalState.getMainOp()->isBinaryOp() &&
11083 LocalState.getAltOp()->isBinaryOp() &&
11084 (LocalState.isShiftOp() || LocalState.isBitwiseLogicOp() ||
11085 LocalState.isAddSubLikeOp() || LocalState.isMulDivLikeOp())) ||
11086 (LocalState.getMainOp()->isCast() && LocalState.getAltOp()->isCast()) ||
11087 (LocalState.getMainOp()->isUnaryOp() &&
11088 LocalState.getAltOp()->isUnaryOp())) {
11090 TTI->getArithmeticInstrCost(Opcode0, VecTy, Kind) +
11091 TTI->getArithmeticInstrCost(Opcode1, VecTy, Kind);
11096 OriginalMask[Idx] = Idx + (Op1Indices.test(Idx) ? 0 : VL.size());
11100 VecTy, OriginalMask, Kind);
11102 TTI->getArithmeticInstrCost(Opcode0, Op1VecTy, Kind) +
11103 TTI->getArithmeticInstrCost(Opcode1, Op2VecTy, Kind);
11105 NewVecOpsCost + InsertCost +
11106 (!VectorizableTree.empty() && VectorizableTree.front()->hasState() &&
11107 VectorizableTree.front()->getOpcode() == Instruction::Store
11111 if (NewCost >= OriginalCost)
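// The comparison above is a straight cost model: NewCost sums the two
// narrower per-opcode vector ops plus the shuffle/insert overhead of
// recombining the halves, and the split is abandoned when it does not beat
// OriginalCost (one wide op per opcode plus the original blend mask).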
11121class InstructionsCompatibilityAnalysis {
11123 const DataLayout &DL;
11124 const TargetTransformInfo &TTI;
11125 const TargetLibraryInfo &TLI;
11126 unsigned MainOpcode = 0;
11131 static bool isSupportedOpcode(const unsigned Opcode) {
11132 return Opcode == Instruction::Add || Opcode == Instruction::Sub ||
11133 Opcode == Instruction::LShr || Opcode == Instruction::Shl ||
11134 Opcode == Instruction::SDiv || Opcode == Instruction::UDiv ||
11135 Opcode == Instruction::And || Opcode == Instruction::Or ||
11136 Opcode == Instruction::Xor || Opcode == Instruction::FAdd ||
11137 Opcode == Instruction::FSub || Opcode == Instruction::FMul ||
11138 Opcode == Instruction::FDiv;
11148 auto IsSupportedInstruction = [&](Instruction *I, bool AnyUndef) {
11149 if (AnyUndef && (I->isIntDivRem() || I->isFPDivRem() || isa<CallInst>(I)))
11151 return I && isSupportedOpcode(I->getOpcode()) &&
11156 SmallDenseSet<Value *, 8> Operands;
11157 SmallMapVector<unsigned, SmallVector<Instruction *>, 4> Candidates;
11158 bool AnyUndef = false;
11159 for (Value *V : VL) {
11167 if (Candidates.empty()) {
11168 Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
11170 Operands.insert(I->op_begin(), I->op_end());
11173 if (Parent == I->getParent()) {
11174 Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
11175 Operands.insert(I->op_begin(), I->op_end());
11178 auto *NodeA = DT.getNode(Parent);
11179 auto *NodeB = DT.getNode(I->getParent());
11180 assert(NodeA && "Should only process reachable instructions");
11181 assert(NodeB && "Should only process reachable instructions");
11182 assert((NodeA == NodeB) ==
11183 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
11184 "Different nodes should have different DFS numbers");
11185 if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn()) {
11186 Candidates.clear();
11187 Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
11190 Operands.insert(I->op_begin(), I->op_end());
11193 unsigned BestOpcodeNum = 0;
11195 bool UsedOutside = false;
11196 for (const auto &P : Candidates) {
11198 if (UsedOutside && !PUsedOutside)
11200 if (!UsedOutside && PUsedOutside)
11202 if (P.second.size() < BestOpcodeNum)
11205 if (!PUsedOutside && any_of(P.second, [&](Instruction *I) {
11206 return Operands.contains(I);
11209 UsedOutside = PUsedOutside;
11210 for (Instruction *I : P.second) {
11211 if (IsSupportedInstruction(I, AnyUndef)) {
11213 BestOpcodeNum = P.second.size();
11223 return I && I->getParent() == MainOp->getParent() &&
11236 Value *selectBestIdempotentValue() const {
11237 assert(isSupportedOpcode(MainOpcode) && "Unsupported opcode");
11248 if (!S.isCopyableElement(V))
11250 assert(isSupportedOpcode(MainOpcode) && "Unsupported opcode");
11251 return {V, selectBestIdempotentValue()};
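// For a copyable lane the missing instruction is modeled as
// "V <op> neutral element", so selectBestIdempotentValue() is expected to
// produce the identity of MainOpcode, e.g. 0 for Add/Or/Xor/LShr/Shl and 1
// for Mul (a sketch of the intent; the exact constants live in the elided
// lines above).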
11257 SmallVectorImpl<BoUpSLP::ValueList> &Operands) const {
11259 unsigned ShuffleOrOp =
11260 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
11263 switch (ShuffleOrOp) {
11264 case Instruction::PHI: {
11268 PHIHandler Handler(DT, PH, VL);
11269 Handler.buildOperands();
11270 Operands.assign(PH->getNumOperands(), {});
11272 Operands[I].assign(Handler.getOperands(I).begin(),
11273 Handler.getOperands(I).end());
11276 case Instruction::ExtractValue:
11277 case Instruction::ExtractElement:
11282 case Instruction::InsertElement:
11290 case Instruction::Load:
11294 for (auto [V, Op] : zip(VL, Operands.back())) {
11298 Op = LI->getPointerOperand();
11301 case Instruction::ZExt:
11302 case Instruction::SExt:
11303 case Instruction::FPToUI:
11304 case Instruction::FPToSI:
11305 case Instruction::FPExt:
11306 case Instruction::PtrToInt:
11307 case Instruction::IntToPtr:
11308 case Instruction::SIToFP:
11309 case Instruction::UIToFP:
11310 case Instruction::Trunc:
11311 case Instruction::FPTrunc:
11312 case Instruction::BitCast:
11313 case Instruction::ICmp:
11314 case Instruction::FCmp:
11315 case Instruction::Select:
11316 case Instruction::FNeg:
11317 case Instruction::Add:
11318 case Instruction::FAdd:
11319 case Instruction::Sub:
11320 case Instruction::FSub:
11321 case Instruction::Mul:
11322 case Instruction::FMul:
11323 case Instruction::UDiv:
11324 case Instruction::SDiv:
11325 case Instruction::FDiv:
11326 case Instruction::URem:
11327 case Instruction::SRem:
11328 case Instruction::FRem:
11329 case Instruction::Shl:
11330 case Instruction::LShr:
11331 case Instruction::AShr:
11332 case Instruction::And:
11333 case Instruction::Or:
11334 case Instruction::Xor:
11335 case Instruction::Freeze:
11336 case Instruction::Store:
11337 case Instruction::ShuffleVector:
11346 auto [Op, ConvertedOps] = convertTo(I, S);
11351 case Instruction::GetElementPtr: {
11358 const unsigned IndexIdx = 1;
11364 return !GEP || VL0Ty == GEP->getOperand(IndexIdx)->getType();
11368 ->getPointerOperandType()
11369 ->getScalarType());
11373 Operands[0][Idx] = V;
11374 Operands[1][Idx] = ConstantInt::getNullValue(Ty);
11377 Operands[0][Idx] = GEP->getPointerOperand();
11378 auto *Op = GEP->getOperand(IndexIdx);
11381 CI, Ty, CI->getValue().isSignBitSet(), DL)
11386 case Instruction::Call: {
11393 for (Value *V : VL) {
11395 Ops.push_back(I ? I->getOperand(Idx)
11408 InstructionsCompatibilityAnalysis(DominatorTree &DT, const DataLayout &DL,
11409 const TargetTransformInfo &TTI,
11410 const TargetLibraryInfo &TLI)
11415 bool TryCopyableElementsVectorization,
11416 bool WithProfitabilityCheck = false,
11417 bool SkipSameCodeCheck = false) {
11418 InstructionsState S = (SkipSameCodeCheck || !allSameBlock(VL))
11419 ? InstructionsState::invalid()
11425 findAndSetMainInstruction(VL, R);
11427 return InstructionsState::invalid();
11428 S = InstructionsState(MainOp, MainOp, true);
11429 if (!WithProfitabilityCheck)
11433 auto BuildCandidates =
11434 [](SmallVectorImpl<std::pair<Value *, Value *>> &Candidates, Value *V1,
11440 if (I1 && I2 && I1->getOpcode() == I2->getOpcode() &&
11441 I1->getParent() != I2->getParent())
11445 if (VL.size() == 2) {
11448 BuildCandidates(Candidates1, Operands[0][0], Operands[0][1]);
11449 BuildCandidates(Candidates2, Operands[1][0], Operands[1][1]);
11450 bool Res = !Candidates1.empty() && !Candidates2.empty() &&
11451 R.findBestRootPair(Candidates1) &&
11452 R.findBestRootPair(Candidates2);
11454 Candidates1.clear();
11455 Candidates2.clear();
11456 BuildCandidates(Candidates1, Operands[0][0], Operands[1][1]);
11457 BuildCandidates(Candidates2, Operands[1][0], Operands[0][1]);
11458 Res = !Candidates1.empty() && !Candidates2.empty() &&
11459 R.findBestRootPair(Candidates1) &&
11460 R.findBestRootPair(Candidates2);
11463 return InstructionsState::invalid();
11467 FixedVectorType *VecTy =
11469 switch (MainOpcode) {
11470 case Instruction::Add:
11471 case Instruction::Sub:
11472 case Instruction::LShr:
11473 case Instruction::Shl:
11474 case Instruction::SDiv:
11475 case Instruction::UDiv:
11476 case Instruction::And:
11477 case Instruction::Or:
11478 case Instruction::Xor:
11479 case Instruction::FAdd:
11480 case Instruction::FMul:
11481 case Instruction::FSub:
11482 case Instruction::FDiv:
11488 if (VectorCost > ScalarCost)
11489 return InstructionsState::invalid();
11492 assert(Operands.size() == 2 && "Unexpected number of operands!");
11493 unsigned CopyableNum =
11494 count_if(VL, [&](Value *V) { return S.isCopyableElement(V); });
11495 if (CopyableNum < VL.size() / 2)
11498 const unsigned Limit = VL.size() / 24;
11499 if ((CopyableNum >= VL.size() - Limit ||
11500 (CopyableNum >= VL.size() - 1 && VL.size() > 4) ||
11505 return InstructionsState::invalid();
11509 for (auto &Ops : Operands) {
11524 return InstructionsState::invalid();
11530 constexpr unsigned Limit = 4;
11531 if (Operands.front().size() >= Limit) {
11532 SmallDenseMap<const Value *, unsigned> Counters;
11540 return C.second == 1;
11546 InstructionsCompatibilityAnalysis Analysis(DT, DL, TTI, TLI);
11547 InstructionsState OpS = Analysis.buildInstructionsState(
11549 if (!OpS || (OpS.getOpcode() == Instruction::PHI && !allSameBlock(Ops)))
11551 unsigned CopyableNum =
11553 return CopyableNum <= VL.size() / 2;
11555 if (!CheckOperand(Operands.front()))
11556 return InstructionsState::invalid();
11563 assert(S &&
"Invalid state!");
11565 if (S.areInstructionsWithCopyableElements()) {
11566 MainOp = S.getMainOp();
11567 MainOpcode = S.getOpcode();
11572 for (
auto [OperandIdx, Operand] :
enumerate(OperandsForValue))
11573 Operands[OperandIdx][Idx] = Operand;
11576 buildOriginalOperands(S, VL, Operands);
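    // A hedged worked example of the copyable-element thresholds above: for
    // VL = { add, add, %x, add }, only %x is copyable (modeled as `%x + 0`),
    // so CopyableNum = 1 < VL.size() / 2 = 2 and the bundle is kept. If
    // instead three of the four lanes were copyable padding, the
    // `CopyableNum >= VL.size() - Limit` family of checks above would reject
    // the state as mostly idempotent filler that is not worth vectorizing.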
BoUpSLP::ScalarsVectorizationLegality BoUpSLP::getScalarsVectorizationLegality(
    ArrayRef<Value *> VL, unsigned Depth, const EdgeInfo &UserTreeIdx,
    bool TryCopyableElementsVectorization) const {
  // ...
  InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
  InstructionsState S = Analysis.buildInstructionsState(
      VL, *this, TryCopyableElementsVectorization,
      true, TryCopyableElementsVectorization);
  // ...
  bool AreScatterAllGEPSameBlock = false;
  // ...
  SmallVector<unsigned> SortedIndices;
  // ...
  bool IsScatterVectorizeUserTE =
      UserTreeIdx.UserTE &&
      UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
  AreScatterAllGEPSameBlock =
      // ...
      *SE, SortedIndices));
  if (!AreScatterAllGEPSameBlock) {
    LLVM_DEBUG(dbgs() << "SLP: Try split and if failed, gathering due to "
                         "C,S,B,O, small shuffle. \n";
    // ...
    return ScalarsVectorizationLegality(S, false,
    // ...
  }
  // ...
  assert(It != VL.end() && "Expected at least one GEP.");
  // ...
  assert(S && "Must be valid.");
  // ...
    return ScalarsVectorizationLegality(S, false,
  // ...
  BasicBlock *BB = S.getMainOp()->getParent();
  // ...
      !DT->isReachableFromEntry(BB)) {
    // ...
    return ScalarsVectorizationLegality(S, false);
  }
  // ...
    return ScalarsVectorizationLegality(S, false,
  // ...
  if (S.getOpcode() == Instruction::ExtractElement &&
      // ...
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n");
    return ScalarsVectorizationLegality(S, false);
  }
  // ...
      (S.isAltShuffle() || VL.size() < 4 ||
  // ...
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
    return ScalarsVectorizationLegality(S, false);
  }
  // ...
  LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.getMainOp() << ".\n");
  for (TreeEntry *E : getTreeEntries(S.getMainOp())) {
    if (E->isSame(VL)) {
      LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.getMainOp()
      // ...
      return ScalarsVectorizationLegality(S, false);
    }
  }
  // ...
      (S.getOpcode() == Instruction::PHI && isa<PHINode>(V) &&
       LI->getLoopFor(S.getMainOp()->getParent()) &&
  // ...
    return ScalarsVectorizationLegality(S, false);
  // ...
    if (!S || !S.isAltShuffle() || VL.size() > 2)
    // ...
    SmallVector<unsigned, 8> InstsCount;
    for (Value *V : VL) {
      // ...
        return isa<Instruction>(Op) || isVectorLikeInstWithConstOps(Op);
    // ...
    bool IsCommutative =
    // ...
    if ((IsCommutative &&
         std::accumulate(InstsCount.begin(), InstsCount.end(), 0) < 2) ||
        // ...
         all_of(InstsCount, [](unsigned ICnt) { return ICnt < 2; })))
    // ...
    assert(VL.size() == 2 && "Expected only 2 alternate op instructions.");
    // ...
    for (int Op : seq<int>(S.getMainOp()->getNumOperands()))
      // ...
                               I2->getOperand(Op));
    if (static_cast<unsigned>(count_if(
            Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
              // ...
            })) >= S.getMainOp()->getNumOperands() / 2)
    // ...
    if (S.getMainOp()->getNumOperands() > 2)
    // ...
    if (IsCommutative) {
      // ...
      Candidates.clear();
      for (int Op = 0, E = S.getMainOp()->getNumOperands(); Op < E; ++Op)
        // ...
                                 I2->getOperand((Op + 1) % E));
      // ...
          Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
  // ...
  bool AreAllSameBlock = !AreScatterAllGEPSameBlock;
  bool AreAllSameInsts = AreAllSameBlock || AreScatterAllGEPSameBlock;
  if (!AreAllSameInsts || isSplat(VL) ||
      // ...
      NotProfitableForVectorization(VL)) {
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n";
    // ...
    return ScalarsVectorizationLegality(S, false);
  }
  // ...
  if (!EphValues.empty()) {
    for (Value *V : VL) {
      if (EphValues.count(V)) {
        // ...
                          << ") is ephemeral.\n");
        // ...
        return ScalarsVectorizationLegality(S, false,
  // ...
  if (S.isAltShuffle()) {
    auto GetNumVectorizedExtracted = [&]() {
      // ...
          all_of(I->operands(), [&](const Use &U) {
            return isa<ExtractElementInst>(U.get());
      // ...
        else if (!I->hasOneUser() && !areAllUsersVectorized(I, UserIgnoreList))
      // ...
      return std::make_pair(Vectorized, Extracted);
    };
    auto [Vectorized, Extracted] = GetNumVectorizedExtracted();
    // ...
    bool PreferScalarize = !Vectorized.isAllOnes() && VL.size() == 2;
    if (!Vectorized.isAllOnes() && !PreferScalarize) {
      // ...
      Type *ScalarTy = VL.front()->getType();
      // ...
          false, true, Kind);
      // ...
          *TTI, ScalarTy, VecTy, Vectorized, true, false, Kind, false);
      PreferScalarize = VectorizeCostEstimate > ScalarizeCostEstimate;
    }
    if (PreferScalarize) {
      LLVM_DEBUG(dbgs() << "SLP: The instructions are in tree and alternate "
                           "node is not profitable.\n");
      return ScalarsVectorizationLegality(S, false);
    }
  }
  // ...
  if (UserIgnoreList && !UserIgnoreList->empty()) {
    for (Value *V : VL) {
      if (UserIgnoreList->contains(V)) {
        LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
        return ScalarsVectorizationLegality(S, false);
      }
    }
  }
  // ...
  return ScalarsVectorizationLegality(S, true);
}
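// A brief sketch of how buildTreeRec consumes the legality result computed
// above (names follow the visible API; the control flow is condensed):
//   ScalarsVectorizationLegality Legality =
//       getScalarsVectorizationLegality(VL, Depth, UserTreeIdx, false);
//   if (!Legality.isLegal())
//     -> optionally retry with copyable elements, try a split node, or
//        fall back to a gather entry;
//   else
//     -> schedule the bundle and create a vectorizable tree entry.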
                          unsigned InterleaveFactor) {
  // ...
  SmallVector<int> ReuseShuffleIndices;
  // ...
  auto TrySplitNode = [&](const InstructionsState &LocalState) {
    // ...
    if (!canBuildSplitNode(VL, LocalState, Op1, Op2, ReorderIndices))
    // ...
    auto Invalid = ScheduleBundle::invalid();
    auto *TE = newTreeEntry(VL, TreeEntry::SplitVectorize, Invalid, LocalState,
                            UserTreeIdx, {}, ReorderIndices);
    // ...
            getSameValuesTreeEntry(S.getMainOp(), Op, true))) {
      TE->CombinedEntriesWithIndices.emplace_back(VectorizableTree.size(),
                                                  Idx == 0 ? 0 : Op1.size());
      (void)newTreeEntry(Op, TreeEntry::NeedToGather, Invalid, S, {TE, Idx});
      // ...
      TE->CombinedEntriesWithIndices.emplace_back(VectorizableTree.size(),
                                                  Idx == 0 ? 0 : Op1.size());
    // ...
  };
  // ...
  bool AreConsts = false;
  for (Value *V : VL) {
  // ...
  if (AreOnlyConstsWithPHIs(VL)) {
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to all constants and PHIs.\n");
    newGatherTreeEntry(VL, InstructionsState::invalid(), UserTreeIdx);
    // ...
  }
  ScalarsVectorizationLegality Legality = getScalarsVectorizationLegality(
      VL, Depth, UserTreeIdx, false);
  InstructionsState S = Legality.getInstructionsState();
  if (!Legality.isLegal()) {
    if (Legality.trySplitVectorize()) {
      // ...
      if (MainOp && AltOp && TrySplitNode(InstructionsState(MainOp, AltOp)))
      // ...
    }
    Legality = getScalarsVectorizationLegality(VL, Depth, UserTreeIdx, true);
    if (!Legality.isLegal()) {
      if (Legality.tryToFindDuplicates())
        // ...
        newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
      // ...
    }
    S = Legality.getInstructionsState();
  }
  // ...
  if (S.isAltShuffle() && TrySplitNode(S))
  // ...
    newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
  // ...
  bool IsScatterVectorizeUserTE =
      UserTreeIdx.UserTE &&
      UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
  // ...
  StridedPtrInfo SPtrInfo;
  TreeEntry::EntryState State = getScalarsVectorizationState(
      S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps, SPtrInfo);
  if (State == TreeEntry::NeedToGather) {
    newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
    // ...
  }
  // ...
  auto &BSRef = BlocksSchedules[BB];
  // ...
    BSRef = std::make_unique<BlockScheduling>(BB);
  // ...
  BlockScheduling &BS = *BSRef;
  // ...
  std::optional<ScheduleBundle *> BundlePtr =
      BS.tryScheduleBundle(UniqueValues.getArrayRef(), this, S, UserTreeIdx);
#ifdef EXPENSIVE_CHECKS
  // ...
#endif
  if (!BundlePtr || (*BundlePtr && !*BundlePtr.value())) {
    LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
    // ...
    if (S.isAltShuffle() && ReuseShuffleIndices.empty() && TrySplitNode(S))
    // ...
    newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
    NonScheduledFirst.insert(VL.front());
    if (S.getOpcode() == Instruction::Load &&
        BS.ScheduleRegionSize < BS.ScheduleRegionSizeLimit)
    // ...
  }
  InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
  // ...
  ScheduleBundle Empty;
  ScheduleBundle &Bundle = BundlePtr.value() ? *BundlePtr.value() : Empty;
  LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
  // ...
  unsigned ShuffleOrOp =
      S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
  auto CreateOperandNodes = [&](TreeEntry *TE, const auto &Operands) {
    // ...
    SmallVector<unsigned> PHIOps;
    // ...
      if ((!S || S.getOpcode() != Instruction::PHI) || S.isAltShuffle())
    // ...
    for (unsigned I : PHIOps)
      buildTreeRec(Operands[I], Depth + 1, {TE, I});
  };
  switch (ShuffleOrOp) {
  case Instruction::PHI: {
    // ...
        newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
    // ...
    TE->setOperands(Operands);
    CreateOperandNodes(TE, Operands);
    // ...
  case Instruction::ExtractValue:
  case Instruction::ExtractElement: {
    if (CurrentOrder.empty()) {
      LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
      // ...
      dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
      // ...
      for (unsigned Idx : CurrentOrder)
        dbgs() << " " << Idx;
    // ...
    TreeEntry *TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,
                                 ReuseShuffleIndices, CurrentOrder);
    // ...
               "(ExtractValueInst/ExtractElementInst).\n";
    // ...
    TE->setOperands(Operands);
    // ...
  case Instruction::InsertElement: {
    assert(ReuseShuffleIndices.empty() && "All inserts should be unique");
    // ...
    auto OrdCompare = [](const std::pair<int, int> &P1,
                         const std::pair<int, int> &P2) {
      return P1.first > P2.first;
    };
    // ...
                        decltype(OrdCompare)>
        Indices(OrdCompare);
    for (int I = 0, E = VL.size(); I < E; ++I) {
      // ...
      Indices.emplace(Idx, I);
    }
    OrdersType CurrentOrder(VL.size(), VL.size());
    bool IsIdentity = true;
    for (int I = 0, E = VL.size(); I < E; ++I) {
      CurrentOrder[Indices.top().second] = I;
      IsIdentity &= Indices.top().second == I;
      // ...
    }
    // ...
      CurrentOrder.clear();
    TreeEntry *TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,
    // ...
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (InsertElementInst).\n";
    // ...
    TE->setOperands(Operands);
    buildTreeRec(TE->getOperand(1), Depth + 1, {TE, 1});
    // ...
  case Instruction::Load: {
    // ...
    TreeEntry *TE = nullptr;
    // ...
    switch (State) {
    case TreeEntry::Vectorize:
      TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,
                        ReuseShuffleIndices, CurrentOrder, InterleaveFactor);
      if (CurrentOrder.empty())
        LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (LoadInst).\n";
      // ...
               << "SLP: added a new TreeEntry (jumbled LoadInst).\n";
      // ...
    case TreeEntry::CompressVectorize:
      // ...
      TE = newTreeEntry(VL, TreeEntry::CompressVectorize, Bundle, S,
                        UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
      // ...
          << "SLP: added a new TreeEntry (masked LoadInst + compress).\n";
      // ...
    case TreeEntry::StridedVectorize:
      // ...
      TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
                        UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
      TreeEntryToStridedPtrInfoMap[TE] = SPtrInfo;
      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (strided LoadInst).\n";
      // ...
    case TreeEntry::ScatterVectorize:
      // ...
      TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
                        UserTreeIdx, ReuseShuffleIndices);
      // ...
          << "SLP: added a new TreeEntry (non-consecutive LoadInst).\n";
      // ...
    case TreeEntry::CombinedVectorize:
    case TreeEntry::SplitVectorize:
    case TreeEntry::NeedToGather:
      // ...
    }
    if (!CurrentOrder.empty() && State != TreeEntry::ScatterVectorize) {
      assert(Operands.size() == 1 && "Expected a single operand only");
      SmallVector<int> Mask;
      // ...
    }
    TE->setOperands(Operands);
    if (State == TreeEntry::ScatterVectorize)
      buildTreeRec(PointerOps, Depth + 1, {TE, 0});
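    // Illustrative summary (not from the source) of the load entry states
    // handled above, for a bundle of four i32 loads from base pointer %p:
    //   %p+0, %p+1, %p+2, %p+3 -> Vectorize          (one wide load)
    //   %p+0, %p+2, %p+4, %p+6 -> StridedVectorize   (stride-2 strided load)
    //   %p+0, %p+1, %p+4, %p+5 -> CompressVectorize  (masked load + compress)
    //   unrelated pointers     -> ScatterVectorize   (masked gather; the
    //                             pointers themselves become operand 0)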
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    auto [PrevMaxBW, PrevMinBW] = CastMaxMinBWSizes.value_or(
        std::make_pair(std::numeric_limits<unsigned>::min(),
                       std::numeric_limits<unsigned>::max()));
    if (ShuffleOrOp == Instruction::ZExt ||
        ShuffleOrOp == Instruction::SExt) {
      CastMaxMinBWSizes = std::make_pair(
          std::max<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
          // ...
          std::min<unsigned>(
          // ...
    } else if (ShuffleOrOp == Instruction::Trunc) {
      CastMaxMinBWSizes = std::make_pair(
          std::max<unsigned>(
          // ...
          std::min<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
          // ...
    }
    TreeEntry *TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,
                                 ReuseShuffleIndices);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CastInst).\n";
    // ...
    TE->setOperands(Operands);
    // ...
      buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
    if (ShuffleOrOp == Instruction::Trunc) {
      ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
    } else if (ShuffleOrOp == Instruction::SIToFP ||
               ShuffleOrOp == Instruction::UIToFP) {
      unsigned NumSignBits =
      // ...
      APInt Mask = DB->getDemandedBits(OpI);
      NumSignBits = std::max(NumSignBits, Mask.countl_zero());
      // ...
      if (NumSignBits * 2 >=
      // ...
        ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
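    // A worked illustration (assumed values) of the sign-bit check above:
    // for `sitofp i32 %x to float` where known-bits analysis shows %x fits
    // in i16, NumSignBits >= 17, so NumSignBits * 2 >= 32 holds and the
    // operand node is queued in ExtraBitWidthNodes as a candidate for
    // narrowing to a smaller integer bitwidth.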
  case Instruction::ICmp:
  case Instruction::FCmp: {
    // ...
    TreeEntry *TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,
                                 ReuseShuffleIndices);
    // ...
           "Commutative Predicate mismatch");
    // ...
    Operands.back() = Ops.getVL(1);
    // ...
      if (Cmp->getPredicate() != P0)
    // ...
    TE->setOperands(Operands);
    buildTreeRec(Operands.front(), Depth + 1, {TE, 0});
    buildTreeRec(Operands.back(), Depth + 1, {TE, 1});
    if (ShuffleOrOp == Instruction::ICmp) {
      unsigned NumSignBits0 =
      // ...
      if (NumSignBits0 * 2 >=
      // ...
        ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
      unsigned NumSignBits1 =
      // ...
      if (NumSignBits1 * 2 >=
      // ...
        ExtraBitWidthNodes.insert(getOperandEntry(TE, 1)->Idx);
  case Instruction::Select:
  case Instruction::FNeg:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::Freeze: {
    TreeEntry *TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,
                                 ReuseShuffleIndices);
    LLVM_DEBUG(
        dbgs() << "SLP: added a new TreeEntry "
                  "(SelectInst/UnaryOperator/BinaryOperator/FreezeInst).\n";
    // ...
    Operands[0] = Ops.getVL(0);
    Operands[1] = Ops.getVL(1);
    // ...
    TE->setOperands(Operands);
    // ...
      buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
    // ...
  case Instruction::GetElementPtr: {
    TreeEntry *TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,
                                 ReuseShuffleIndices);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (GetElementPtrInst).\n";
    // ...
    TE->setOperands(Operands);
    // ...
      buildTreeRec(Operands[I], Depth + 1, {TE, I});
    // ...
  case Instruction::Store: {
    bool Consecutive = CurrentOrder.empty();
    // ...
    TreeEntry *TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,
                                 ReuseShuffleIndices, CurrentOrder);
    // ...
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (StoreInst).\n";
    // ...
        dbgs() << "SLP: added a new TreeEntry (jumbled StoreInst).\n";
    // ...
    TE->setOperands(Operands);
    buildTreeRec(TE->getOperand(0), Depth + 1, {TE, 0});
  case Instruction::Call: {
    // ...
    TreeEntry *TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,
                                 ReuseShuffleIndices);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CallInst).\n";
    // ...
    Operands[0] = Ops.getVL(0);
    Operands[1] = Ops.getVL(1);
    // ...
    TE->setOperands(Operands);
    // ...
      buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
    // ...
  case Instruction::ShuffleVector: {
    TreeEntry *TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,
                                 ReuseShuffleIndices);
    if (S.isAltShuffle()) {
      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (isAltShuffle).\n";
      // ...
        dbgs() << "SLP: added a new TreeEntry (ShuffleVectorInst).\n";
    // ...
           "Expected different main/alternate predicates.");
    // ...
      TE->setOperands(Operands);
      buildTreeRec(Operands.front(), Depth + 1, {TE, 0});
      buildTreeRec(Operands.back(), Depth + 1, {TE, 1});
    // ...
    Operands[0] = Ops.getVL(0);
    Operands[1] = Ops.getVL(1);
    // ...
    TE->setOperands(Operands);
    // ...
      buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
    for (const auto *Ty : ST->elements())
      if (Ty != *ST->element_begin())
    // ...
    N *= ST->getNumElements();
    EltTy = *ST->element_begin();
    // ...
    N *= AT->getNumElements();
    EltTy = AT->getElementType();
    // ...
    N *= VT->getNumElements();
    EltTy = VT->getElementType();
  // ...
  size_t VTSize = DL->getTypeStoreSizeInBits(getWidenedType(EltTy, N));
  if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
      VTSize != DL->getTypeStoreSizeInBits(T))
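  // Illustrative example (assumed aggregate type) for the flattening above:
  // for T = [2 x <2 x float>], N accumulates 2 * 2 = 4 with EltTy = float,
  // so T maps to <4 x float> provided the widened type's store size equals
  // the aggregate's store size and falls between MinVecRegSize and
  // MaxVecRegSize.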
                                 bool ResizeAllowed) const {
  // ...
  assert(It != VL.end() && "Expected at least one extract instruction.");
  // ...
  Value *Vec = E0->getOperand(0);
  // ...
  CurrentOrder.clear();
  // ...
  if (E0->getOpcode() == Instruction::ExtractValue) {
  // ...
  unsigned E = VL.size();
  if (!ResizeAllowed && NElts != E)
  // ...
  unsigned MinIdx = NElts, MaxIdx = 0;
  // ...
    if (Inst->getOperand(0) != Vec)
    // ...
    const unsigned ExtIdx = *Idx;
    if (ExtIdx >= NElts)
    // ...
    Indices[I] = ExtIdx;
    if (MinIdx > ExtIdx)
    // ...
    if (MaxIdx < ExtIdx)
  // ...
  if (MaxIdx - MinIdx + 1 > E)
  // ...
  if (MaxIdx + 1 <= E)
  // ...
  bool ShouldKeepOrder = true;
  // ...
  for (unsigned I = 0; I < E; ++I) {
    // ...
    const unsigned ExtIdx = Indices[I] - MinIdx;
    if (CurrentOrder[ExtIdx] != E) {
      CurrentOrder.clear();
      // ...
    }
    ShouldKeepOrder &= ExtIdx == I;
    CurrentOrder[ExtIdx] = I;
  }
  if (ShouldKeepOrder)
    CurrentOrder.clear();
  // ...
  return ShouldKeepOrder;
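// A small worked example (assumed IR) for the extract-reuse check above: for
//   VL = { extractelement %v, 2 ; extractelement %v, 3 ;
//          extractelement %v, 0 ; extractelement %v, 1 }
// Indices = {2, 3, 0, 1} with MinIdx = 0, and CurrentOrder becomes
// {2, 3, 0, 1}, i.e. the lanes can reuse %v through a single permute. For
// extract indices {0, 1, 2, 3} the order is the identity, CurrentOrder is
// cleared, and ShouldKeepOrder is true.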
bool BoUpSLP::areAllUsersVectorized(
    Instruction *I, const SmallDenseSet<Value *> *VectorizedVals) const {
  return (I->hasOneUse() &&
          (!VectorizedVals || VectorizedVals->contains(I))) ||
         all_of(I->users(), [this](User *U) {
           return isVectorized(U) || isVectorLikeInstWithConstOps(U) ||
                  (isa<ExtractElementInst>(U) && MustGather.contains(U));
void BoUpSLP::TreeEntry::buildAltOpShuffleMask(
    const function_ref<bool(Instruction *)> IsAltOp, SmallVectorImpl<int> &Mask,
    SmallVectorImpl<Value *> *OpScalars,
    SmallVectorImpl<Value *> *AltScalars) const {
  unsigned Sz = Scalars.size();
  // ...
  SmallVector<int> OrderMask;
  if (!ReorderIndices.empty())
  // ...
  for (unsigned I = 0; I < Sz; ++I) {
    // ...
    if (!ReorderIndices.empty())
      Idx = OrderMask[I];
    // ...
    if (IsAltOp(OpInst)) {
      Mask[I] = Sz + Idx;
  // ...
  if (!ReuseShuffleIndices.empty()) {
    // ...
    transform(ReuseShuffleIndices, NewMask.begin(), [&Mask](int Idx) {
      return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem;
    });
    Mask.swap(NewMask);
  // ...
  return InstructionsState(MainOp, AltOp).getMatchingMainOpOrAltOp(I) ==
         MainOp;
  // ...
  assert(MainP != AltP && "Expected different main/alternate predicates.");
  // ...
  assert((MainP == P || AltP == P || MainP == SwappedP || AltP == SwappedP) &&
         "CmpInst expected to match either main or alternate predicate or "
  // ...
  return MainP != P && MainP != SwappedP;
  // ...
  return InstructionsState(MainOp, AltOp).getMatchingMainOpOrAltOp(I) == AltOp;
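// Illustrative example (assumed lanes) of the alternate-op mask built above:
// for Scalars = { add, sub, add, sub } with Sz = 4 and IsAltOp matching sub,
// lane I selects element I of the main vector when the lane runs the main
// opcode and element Sz + I of the alternate vector otherwise:
//   Mask = { 0, 5, 2, 7 }   // blend of <4 x add> and <4 x sub> results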
  const auto *Op0 = Ops.front();
  // ...
    return CI->getValue().isPowerOf2();
  // ...
    return CI->getValue().isNegatedPowerOf2();
  // ...
  if (IsConstant && IsUniform)
  // ...
  else if (IsConstant)
  // ...
  else if (IsUniform)
class BaseShuffleAnalysis {
  // ...
  Type *ScalarTy = nullptr;

  BaseShuffleAnalysis(Type *ScalarTy) : ScalarTy(ScalarTy) {}
  // ...
  unsigned getVF(Value *V) const {
    assert(V && "V cannot be nullptr");
    // ...
           "V does not have FixedVectorType");
    assert(ScalarTy && "ScalarTy cannot be nullptr");
    // ...
    unsigned VNumElements =
    // ...
    assert(VNumElements > ScalarTyNumElements &&
           "the number of elements of V is not large enough");
    assert(VNumElements % ScalarTyNumElements == 0 &&
           "the number of elements of V is not a vectorized value");
    return VNumElements / ScalarTyNumElements;
  }
  // ...
  static bool isIdentityMask(ArrayRef<int> Mask, const FixedVectorType *VecTy,
  // ...
    int Limit = Mask.size();
    // ...
    if (Limit % VF == 0 && all_of(seq<int>(0, Limit / VF), [=](int Idx) {
          ArrayRef<int> Slice = Mask.slice(Idx * VF, VF);
  // ...
  static void combineMasks(unsigned LocalVF, SmallVectorImpl<int> &Mask,
                           ArrayRef<int> ExtMask) {
    unsigned VF = Mask.size();
    // ...
    for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
      // ...
      int MaskedIdx = Mask[ExtMask[I] % VF];
    // ...
    Mask.swap(NewMask);
  }
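  // A worked illustration (assumed masks) of combineMasks: composing an
  // outer ExtMask over an inner Mask picks Mask[ExtMask[I] % VF] per lane.
  // With VF = 4, Mask = {3, 2, 1, 0} and ExtMask = {0, 0, 2, 2}:
  //   NewMask = { Mask[0], Mask[0], Mask[2], Mask[2] } = { 3, 3, 1, 1 }
  // i.e. two stacked shuffles collapse into a single equivalent mask.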
  static bool peekThroughShuffles(Value *&V, SmallVectorImpl<int> &Mask,
                                  bool SinglePermute) {
    // ...
    ShuffleVectorInst *IdentityOp = nullptr;
    SmallVector<int> IdentityMask;
    // ...
      if (isIdentityMask(Mask, SVTy, false)) {
        if (!IdentityOp || !SinglePermute ||
            (isIdentityMask(Mask, SVTy, true) &&
             // ...
                            IdentityMask.size()))) {
          // ...
          IdentityMask.assign(Mask);
      // ...
      if (SV->isZeroEltSplat()) {
        // ...
        IdentityMask.assign(Mask);
      // ...
      int LocalVF = Mask.size();
      // ...
        LocalVF = SVOpTy->getNumElements();
      // ...
            static_cast<unsigned>(I) >= SV->getShuffleMask().size())
        // ...
        ExtMask[Idx] = SV->getMaskValue(I);
      // ...
      if (!IsOp1Undef && !IsOp2Undef) {
        // ...
        for (int &I : Mask) {
          // ...
          if (SV->getMaskValue(I % SV->getShuffleMask().size()) ==
        // ...
        SmallVector<int> ShuffleMask(SV->getShuffleMask());
        combineMasks(LocalVF, ShuffleMask, Mask);
        Mask.swap(ShuffleMask);
        // ...
          Op = SV->getOperand(0);
        // ...
          Op = SV->getOperand(1);
      // ...
          !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute) ||
          // ...
             "Expected masks of same sizes.");
      // ...
      Mask.swap(IdentityMask);
      // ...
      return SinglePermute &&
             // ...
             (Shuffle && Mask.size() == Shuffle->getShuffleMask().size() &&
              Shuffle->isZeroEltSplat() &&
              // ...
              Shuffle->getShuffleMask()[P.index()] == 0;
  // ...
  template <typename T, typename ShuffleBuilderTy>
  static T createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask,
                         ShuffleBuilderTy &Builder, Type *ScalarTy) {
    assert(V1 && "Expected at least one vector value.");
    // ...
    SmallVector<int> NewMask(Mask);
    if (ScalarTyNumElements != 1) {
    // ...
      Builder.resizeToMatch(V1, V2);
      int VF = Mask.size();
      // ...
        VF = FTy->getNumElements();
      // ...
      for (int I = 0, E = Mask.size(); I < E; ++I) {
        // ...
          CombinedMask1[I] = Mask[I];
        // ...
          CombinedMask2[I] = Mask[I] - VF;
      }
      // ...
      do {
        (void)peekThroughShuffles(Op1, CombinedMask1, false);
        (void)peekThroughShuffles(Op2, CombinedMask2, false);
        // ...
          for (auto [Idx, I] : enumerate(CombinedMask1)) {
            // ...
            ExtMask1[Idx] = SV1->getMaskValue(I);
          // ...
                  ->getNumElements(),
              ExtMask1, UseMask::SecondArg);
          SmallVector<int> ExtMask2(CombinedMask2.size(), PoisonMaskElem);
          for (auto [Idx, I] : enumerate(CombinedMask2)) {
            // ...
            ExtMask2[Idx] = SV2->getMaskValue(I);
          // ...
                  ->getNumElements(),
              ExtMask2, UseMask::SecondArg);
          if (SV1->getOperand(0)->getType() ==
                  SV2->getOperand(0)->getType() &&
              SV1->getOperand(0)->getType() != SV1->getType() &&
              // ...
            Op1 = SV1->getOperand(0);
            Op2 = SV2->getOperand(0);
            SmallVector<int> ShuffleMask1(SV1->getShuffleMask());
            int LocalVF = ShuffleMask1.size();
            // ...
              LocalVF = FTy->getNumElements();
            combineMasks(LocalVF, ShuffleMask1, CombinedMask1);
            CombinedMask1.swap(ShuffleMask1);
            SmallVector<int> ShuffleMask2(SV2->getShuffleMask());
            LocalVF = ShuffleMask2.size();
            // ...
              LocalVF = FTy->getNumElements();
            combineMasks(LocalVF, ShuffleMask2, CombinedMask2);
            CombinedMask2.swap(ShuffleMask2);
        // ...
      } while (PrevOp1 != Op1 || PrevOp2 != Op2);
      Builder.resizeToMatch(Op1, Op2);
      // ...
              ->getElementCount()
              .getKnownMinValue(),
      // ...
              ->getElementCount()
              .getKnownMinValue());
      for (int I = 0, E = Mask.size(); I < E; ++I) {
        // ...
               "Expected undefined mask element");
        CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF);
      // ...
        return Builder.createIdentity(Op1);
      return Builder.createShuffleVector(
      // ...
      return Builder.createPoison(
    // ...
    bool IsIdentity = peekThroughShuffles(V1, NewMask, true);
    assert(V1 && "Expected non-null value after looking through shuffles.");
    // ...
      return Builder.createShuffleVector(V1, NewMask);
    return Builder.createIdentity(V1);
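  // A sketch (assumed IR) of what the peeking above achieves: given
  //   %s = shufflevector <4 x float> %a, poison, <1, 0, 3, 2>
  // and a request to shuffle %s with mask <1, 0, 3, 2>, the two masks
  // compose to the identity <0, 1, 2, 3>, so Builder.createIdentity(%a) is
  // returned and no shuffle instruction needs to be emitted at all.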
                                    ArrayRef<int> Mask) {
// ...
static std::pair<InstructionCost, InstructionCost>
// ...
  if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
    // ...
    ScalarCost = TTI.getPointersChainCost(
        Ptrs, BasePtr, TTI::PointersChainInfo::getUnitStride(), ScalarTy,
    // ...
    for (Value *V : Ptrs) {
      if (V == BasePtr) {
    // ...
    if (PtrsRetainedInVecCode.size() == Ptrs.size()) {
    // ...
    VecCost =
        TTI.getPointersChainCost(PtrsRetainedInVecCode, BasePtr,
                                 TTI::PointersChainInfo::getKnownStride(),
    // ...
        [](const Value *V) {
          // ...
          return Ptr && !Ptr->hasAllConstantIndices();
        // ...
            ? TTI::PointersChainInfo::getUnknownStride()
            : TTI::PointersChainInfo::getKnownStride();
    // ...
    TTI.getPointersChainCost(Ptrs, BasePtr, PtrsInfo, ScalarTy, CostKind);
    // ...
    if (It != Ptrs.end())
    // ...
    VecCost = TTI.getGEPCost(BaseGEP->getSourceElementType(),
                             BaseGEP->getPointerOperand(), Indices, VecTy,
  // ...
  return std::make_pair(ScalarCost, VecCost);
void BoUpSLP::reorderGatherNode(TreeEntry &TE) {
  assert(TE.isGather() && TE.ReorderIndices.empty() &&
         "Expected gather node without reordering.");
  // ...
  SmallSet<size_t, 2> LoadKeyUsed;
  // ...
  if (TE.Scalars.size() == 2 || (TE.hasState() && !TE.isAltShuffle()) ||
      // ...
        return VectorizableTree[Idx]->isSame(TE.Scalars);
  // ...
  auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
    // ...
    auto LIt = LoadsMap.find(std::make_pair(Key, Ptr));
    if (LIt != LoadsMap.end()) {
      for (LoadInst *RLI : LIt->second) {
        // ...
            LI->getType(), LI->getPointerOperand(), *DL, *SE,
      // ...
      for (LoadInst *RLI : LIt->second) {
        // ...
                LI->getPointerOperand(), *TLI)) {
      // ...
      if (LIt->second.size() > 2) {
        // ...
            hash_value(LIt->second.back()->getPointerOperand());
    // ...
    LoadsMap.try_emplace(std::make_pair(Key, Ptr)).first->second.push_back(LI);
  // ...
  MapVector<size_t, MapVector<size_t, SmallVector<Value *>>> SortedValues;
  SmallDenseMap<Value *, SmallVector<unsigned>, 8> KeyToIndex;
  bool IsOrdered = true;
  unsigned NumInstructions = 0;
  // ...
  size_t Key = 1, Idx = 1;
  // ...
    auto &Container = SortedValues[Key];
    if (IsOrdered && !KeyToIndex.contains(V) &&
        // ...
        ((Container.contains(Idx) &&
          KeyToIndex.at(Container[Idx].back()).back() != I - 1) ||
         (!Container.empty() && !Container.contains(Idx) &&
          KeyToIndex.at(Container.back().second.back()).back() != I - 1)))
    // ...
    auto &KTI = KeyToIndex[V];
    // ...
    Container[Idx].push_back(V);
  // ...
  if (!IsOrdered && NumInstructions > 1) {
    // ...
    TE.ReorderIndices.resize(TE.Scalars.size(), TE.Scalars.size());
    for (const auto &D : SortedValues) {
      for (const auto &P : D.second) {
        // ...
        for (Value *V : P.second) {
          ArrayRef<unsigned> Indices = KeyToIndex.at(V);
          for (auto [K, Idx] : enumerate(Indices)) {
            TE.ReorderIndices[Cnt + K] = Idx;
            TE.Scalars[Cnt + K] = V;
          }
          Sz += Indices.size();
          Cnt += Indices.size();
        }
        // ...
            *TTI, TE.Scalars.front()->getType(), Sz);
        // ...
        } else if (!P.second.empty() && isConstant(P.second.front())) {
  // ...
  if (!TE.ReuseShuffleIndices.empty() || TE.ReorderIndices.empty())
  // ...
  auto *ScalarTy = TE.Scalars.front()->getType();
  // ...
  for (auto [Idx, Sz] : SubVectors) {
  // ...
  int Sz = TE.Scalars.size();
  SmallVector<int> ReorderMask(TE.ReorderIndices.begin(),
                               TE.ReorderIndices.end());
  // ...
      ReorderMask[I] = I + TE.ReorderIndices.size();
  // ...
      any_of(ReorderMask, [&](int I) { return I >= Sz; })
      // ...
      VecTy, ReorderMask);
  // ...
      DemandedElts.clearBit(I);
      // ...
      ReorderMask[I] = I;
      // ...
      ReorderMask[I] = I + Sz;
  // ...
  if (!DemandedElts.isAllOnes())
  // ...
  if (Cost >= BVCost) {
    SmallVector<int> Mask(TE.ReorderIndices.begin(), TE.ReorderIndices.end());
    // ...
    TE.ReorderIndices.clear();
                              const InstructionsState &S,
  // ...
        return V->getType()->getScalarType()->isFloatingPointTy();
      // ...
      "Can only convert to FMA for floating point types");
  assert(S.isAddSubLikeOp() && "Can only convert to FMA for add/sub");
  // ...
  for (Value *V : VL) {
    // ...
    if (S.isCopyableElement(I))
    // ...
    Instruction *MatchingI = S.getMatchingMainOpOrAltOp(I);
    if (S.getMainOp() != MatchingI && S.getAltOp() != MatchingI)
    // ...
    FMF &= FPCI->getFastMathFlags();
  // ...
  if (!CheckForContractable(VL))
  // ...
  InstructionsCompatibilityAnalysis Analysis(DT, DL, TTI, TLI);
  // ...
  if (OpS.isAltShuffle() || OpS.getOpcode() != Instruction::FMul)
  // ...
  if (!CheckForContractable(Operands.front()))
  // ...
  for (Value *V : VL) {
    // ...
    if (!S.isCopyableElement(I))
      // ...
      FMF &= FPCI->getFastMathFlags();
    FMulPlusFAddCost += TTI.getInstructionCost(I, CostKind);
  // ...
  for (auto [V, Op] : zip(VL, Operands.front())) {
    if (S.isCopyableElement(V))
    // ...
    if (!I || !I->hasOneUse() || OpS.isCopyableElement(I)) {
      // ...
      FMACost += TTI.getInstructionCost(OpI, CostKind);
    // ...
    FMF &= FPCI->getFastMathFlags();
    FMulPlusFAddCost += TTI.getInstructionCost(I, CostKind);
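// An illustrative cost comparison (assumed IR) for the FMA contraction
// checked above: for lanes `%m = fmul fast float %a, %b` feeding
// `%r = fadd fast float %m, %c`, FMulPlusFAddCost models the separate
// fmul + fadd pair while FMACost models the contracted fused form; the
// contraction is only considered when every lane carries the required
// fast-math flags and the fmul result has no other users.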
  BaseGraphSize = VectorizableTree.size();
  // ...
  class GraphTransformModeRAAI {
    bool &SavedIsGraphTransformMode;
    // ...
    GraphTransformModeRAAI(bool &IsGraphTransformMode)
        : SavedIsGraphTransformMode(IsGraphTransformMode) {
      IsGraphTransformMode = true;
    }
    ~GraphTransformModeRAAI() { SavedIsGraphTransformMode = false; }
  } TransformContext(IsGraphTransformMode);
  // ...
                               const InstructionsState &S) {
    // ...
                               I2->getOperand(Op));
    // ...
        Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
          // ...
                 [](const std::pair<Value *, Value *> &P) {
  // ...
    TreeEntry &E = *VectorizableTree[Idx];
    // ...
      reorderGatherNode(E);
  // ...
  constexpr unsigned VFLimit = 16;
  bool ForceLoadGather =
      count_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
        return TE->isGather() && TE->hasState() &&
               TE->getOpcode() == Instruction::Load &&
               TE->getVectorFactor() < VFLimit;
  // ...
    return TE->isSame(VL) || all_of(VL, [&](Value *V) {
  // ...
  auto CheckForSameVectorNodes = [&](const TreeEntry &E) {
    if (E.hasState()) {
      // ...
          !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
            return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
              ArrayRef<TreeEntry *> VTEs = getTreeEntries(V);
              return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
                return is_contained(TEs, TE);
      // ...
          !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
            return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
              ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V);
              return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
                return is_contained(TEs, TE);
    // ...
    if (It != E.Scalars.end()) {
      // ...
          !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
            return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
              ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V);
              return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
                return is_contained(TEs, TE);
  // ...
  for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
    TreeEntry &E = *VectorizableTree[Idx];
    if (E.isGather()) {
      // ...
      unsigned MinVF = getMinVF(2 * Sz);
      // ...
      if (VL.size() <= 2 || LoadEntriesToVectorize.contains(Idx) ||
          !(!E.hasState() || E.getOpcode() == Instruction::Load ||
      // ...
      if (ForceLoadGather && E.hasState() && E.getOpcode() == Instruction::Load)
      // ...
      if (CheckForSameVectorNodes(E))
      // ...
      unsigned StartIdx = 0;
      unsigned End = VL.size();
      SmallBitVector Processed(End);
      // ...
          *TTI, VL.front()->getType(), VL.size() - 1);
      // ...
               *TTI, VL.front()->getType(), VF - 1)) {
        if (StartIdx + VF > End)
        // ...
        bool AllStrided = true;
        for (unsigned Cnt = StartIdx; Cnt + VF <= End; Cnt += VF) {
          // ...
              !getSameValuesTreeEntry(Slice.front(), Slice, true))
          // ...
          bool IsSplat = isSplat(Slice);
          bool IsTwoRegisterSplat = true;
          if (IsSplat && VF == 2) {
            // ...
            IsTwoRegisterSplat = NumRegs2VF == 2;
          }
          if (Slices.empty() || !IsSplat || !IsTwoRegisterSplat ||
              // ...
              (S.getOpcode() == Instruction::Load &&
              // ...
              (S.getOpcode() != Instruction::Load &&
          // ...
            if ((!UserIgnoreList || E.Idx != 0) &&
                TTI->getInstructionCost(S.getMainOp(), CostKind) <
            // ...
              if (S.getOpcode() == Instruction::Load) {
                // ...
                StridedPtrInfo SPtrInfo;
                // ...
                    PointerOps, SPtrInfo);
                // ...
                if (UserIgnoreList && E.Idx == 0)
                // ...
              } else if (S.getOpcode() == Instruction::ExtractElement ||
                         (TTI->getInstructionCost(S.getMainOp(), CostKind) <
                          // ...
                          !CheckOperandsProfitability(
        // ...
        if (VF == 2 && AllStrided && Slices.size() > 2)
        // ...
        auto AddCombinedNode = [&](unsigned Idx, unsigned Cnt, unsigned Sz) {
          E.CombinedEntriesWithIndices.emplace_back(Idx, Cnt);
          Processed.set(Cnt, Cnt + Sz);
          if (StartIdx == Cnt)
            StartIdx = Cnt + Sz;
          if (End == Cnt + Sz)
          // ...
        };
        for (auto [Cnt, Sz] : Slices) {
          // ...
          const TreeEntry *SameTE = nullptr;
          // ...
              It != Slice.end()) {
            // ...
            SameTE = getSameValuesTreeEntry(*It, Slice);
          // ...
          unsigned PrevSize = VectorizableTree.size();
          [[maybe_unused]] unsigned PrevEntriesSize =
              LoadEntriesToVectorize.size();
          buildTreeRec(Slice, 0, EdgeInfo(&E, UINT_MAX));
          if (PrevSize + 1 == VectorizableTree.size() && !SameTE &&
              VectorizableTree[PrevSize]->isGather() &&
              VectorizableTree[PrevSize]->hasState() &&
              VectorizableTree[PrevSize]->getOpcode() !=
                  Instruction::ExtractElement &&
              // ...
            if (UserIgnoreList && E.Idx == 0 && VF == 2)
            // ...
            VectorizableTree.pop_back();
            assert(PrevEntriesSize == LoadEntriesToVectorize.size() &&
                   "LoadEntriesToVectorize expected to remain the same");
          // ...
          AddCombinedNode(PrevSize, Cnt, Sz);
      // ...
      if (E.CombinedEntriesWithIndices.empty() && !E.ReorderIndices.empty()) {
        SmallVector<int> Mask(E.ReorderIndices.begin(), E.ReorderIndices.end());
        // ...
        E.ReorderIndices.clear();
    switch (E.getOpcode()) {
    case Instruction::Load: {
      // ...
      if (E.State != TreeEntry::Vectorize)
      // ...
      Type *ScalarTy = E.getMainOp()->getType();
      // ...
          TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
        SmallVector<int> Mask;
        // ...
            TTI->getMemoryOpCost(Instruction::Load, VecTy, BaseLI->getAlign(),
                                 BaseLI->getPointerAddressSpace(), CostKind,
        // ...
            MemIntrinsicCostAttributes(Intrinsic::experimental_vp_strided_load,
                                       VecTy, BaseLI->getPointerOperand(),
                                       false, CommonAlignment,
        // ...
                ->getPointerOperand()
        // ...
        StridedPtrInfo SPtrInfo;
        SPtrInfo.StrideVal = ConstantInt::get(StrideTy, 1);
        SPtrInfo.Ty = VecTy;
        TreeEntryToStridedPtrInfoMap[&E] = SPtrInfo;
        E.State = TreeEntry::StridedVectorize;
      // ...
    case Instruction::Store: {
      // ...
          TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
        SmallVector<int> Mask;
        // ...
            TTI->getMemoryOpCost(Instruction::Store, VecTy, BaseSI->getAlign(),
                                 BaseSI->getPointerAddressSpace(), CostKind,
        // ...
            MemIntrinsicCostAttributes(Intrinsic::experimental_vp_strided_store,
                                       VecTy, BaseSI->getPointerOperand(),
                                       false, CommonAlignment,
        // ...
        if (StridedCost < OriginalVecCost)
          // ...
          E.State = TreeEntry::StridedVectorize;
      } else if (!E.ReorderIndices.empty()) {
        // ...
        auto IsInterleaveMask = [&, &TTI = *TTI](ArrayRef<int> Mask) {
          // ...
          assert(Mask.size() > 1 && "Expected mask greater than 1 element.");
          if (Mask.size() < 4)
          // ...
                  Mask, Factor, VecTy->getElementCount().getFixedValue()) &&
              TTI.isLegalInterleavedAccessType(
                  VecTy, Factor, BaseSI->getAlign(),
                  BaseSI->getPointerAddressSpace()))
          // ...
        };
        SmallVector<int> Mask(E.ReorderIndices.begin(), E.ReorderIndices.end());
        unsigned InterleaveFactor = IsInterleaveMask(Mask);
        if (InterleaveFactor != 0)
          E.setInterleave(InterleaveFactor);
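        // An illustrative interleave mask (assumed values): for eight stores
        // reordered as {0, 4, 1, 5, 2, 6, 3, 7}, the mask interleaves two
        // stride-4 sequences, so IsInterleaveMask would report Factor = 2
        // (subject to the target legality check above) and the node is
        // marked for an interleaved store group of that factor.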
    case Instruction::Select: {
      if (E.State != TreeEntry::Vectorize)
      // ...
      E.CombinedOp = TreeEntry::MinMax;
      TreeEntry *CondEntry = getOperandEntry(&E, 0);
      if (SelectOnly && CondEntry->UserTreeIndex &&
          CondEntry->State == TreeEntry::Vectorize) {
        // ...
        CondEntry->State = TreeEntry::CombinedVectorize;
      // ...
    case Instruction::FSub:
    case Instruction::FAdd: {
      // ...
      if (E.State != TreeEntry::Vectorize ||
          !E.getOperations().isAddSubLikeOp())
      // ...
      E.CombinedOp = TreeEntry::FMulAdd;
      TreeEntry *FMulEntry = getOperandEntry(&E, 0);
      if (FMulEntry->UserTreeIndex &&
          FMulEntry->State == TreeEntry::Vectorize) {
        // ...
        FMulEntry->State = TreeEntry::CombinedVectorize;
  // ...
  if (LoadEntriesToVectorize.empty()) {
    // ...
    if (VectorizableTree.size() <= 1 && VectorizableTree.front()->hasState() &&
        VectorizableTree.front()->getOpcode() == Instruction::Load)
    // ...
    constexpr unsigned SmallTree = 3;
    constexpr unsigned SmallVF = 2;
    if ((VectorizableTree.size() <= SmallTree &&
         VectorizableTree.front()->Scalars.size() == SmallVF) ||
        (VectorizableTree.size() <= 2 && UserIgnoreList))
    // ...
    if (VectorizableTree.front()->isNonPowOf2Vec() &&
        // ...
        [](const std::unique_ptr<TreeEntry> &TE) {
          return TE->isGather() && TE->hasState() &&
                 TE->getOpcode() == Instruction::Load &&
  // ...
  SmallMapVector<std::tuple<BasicBlock *, Value *, Type *>,
  // ...
  for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    TreeEntry &E = *TE;
    if (E.isGather() &&
        ((E.hasState() && E.getOpcode() == Instruction::Load) ||
         (!E.hasState() && any_of(E.Scalars,
              // ...
              return isa<LoadInst>(V) &&
                     !isVectorized(V) &&
                     !isDeleted(cast<Instruction>(V));
    // ...
      for (Value *V : E.Scalars) {
        // ...
            *this, V, *DL, *SE, *TTI,
            GatheredLoads[std::make_tuple(
  // ...
  if (!GatheredLoads.empty())
    tryToVectorizeGatheredLoads(GatheredLoads);
  bool IsFinalized = false;
  // ...
  bool SameNodesEstimated = true;
  // ...
    if (Ty->getScalarType()->isPointerTy()) {
      // ...
              DL.getTypeStoreSizeInBits(Ty->getScalarType()))),
          Ty->getScalarType());
  // ...
    assert(It != VL.end() && "Expected at least one non-undef value.");
    // ...
        count(VL, *It) > 1 &&
    // ...
    if (!NeedShuffle) {
      // ...
      return TTI.getShuffleCost(
      // ...
      return TTI.getVectorInstrCost(Instruction::InsertElement, VecTy,
                                    CostKind, std::distance(VL.begin(), It),
    // ...
      return isa<PoisonValue>(V) ? PoisonMaskElem : 0;
    // ...
        TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, 0,
        // ...
                           VecTy, ShuffleMask, CostKind,
    // ...
    return GatherCost +
           // ...
           : R.getGatherCost(Gathers, !Root && VL.equals(Gathers),
  // ...
                     ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
                     unsigned NumParts) {
    assert(VL.size() > NumParts && "Unexpected scalarized shuffle.");
    // ...
        std::accumulate(VL.begin(), VL.end(), 0, [](unsigned Sz, Value *V) {
          auto *EE = dyn_cast<ExtractElementInst>(V);
          // ...
          auto *VecTy = dyn_cast<FixedVectorType>(EE->getVectorOperandType());
          // ...
          return std::max(Sz, VecTy->getNumElements());
    // ...
        -> std::optional<TTI::ShuffleKind> {
      if (NumElts <= EltsPerVector)
        return std::nullopt;
      // ...
          alignDown(std::accumulate(Mask.begin(), Mask.end(), INT_MAX,
                                    // ...
                                    if (I == PoisonMaskElem)
                                    // ...
                                    return std::min(S, I);
      // ...
      int OffsetReg1 = OffsetReg0;
      // ...
      int FirstRegId = -1;
      Indices.assign(1, OffsetReg0);
      // ...
        int Idx = I - OffsetReg0;
        // ...
            (Idx / NumElts) * NumParts + (Idx % NumElts) / EltsPerVector;
        if (FirstRegId < 0)
          FirstRegId = RegId;
        RegIndices.insert(RegId);
        if (RegIndices.size() > 2)
          return std::nullopt;
        if (RegIndices.size() == 2) {
          // ...
          if (Indices.size() == 1) {
            // ...
                std::next(Mask.begin(), Pos), Mask.end(), INT_MAX,
                [&](int S, int I) {
                  if (I == PoisonMaskElem)
                  // ...
                  int RegId = ((I - OffsetReg0) / NumElts) * NumParts +
                              ((I - OffsetReg0) % NumElts) / EltsPerVector;
                  if (RegId == FirstRegId)
                  // ...
                  return std::min(S, I);
            // ...
            unsigned Index = OffsetReg1 % NumElts;
            Indices.push_back(Index);
            SubVecSizes.push_back(std::min(NumElts - Index, EltsPerVector));
          // ...
          Idx = I - OffsetReg1;
        // ...
        I = (Idx % NumElts) % EltsPerVector +
            (RegId == FirstRegId ? 0 : EltsPerVector);
      // ...
      return ShuffleKind;
    // ...
      if (!ShuffleKinds[Part])
      // ...
          Part * EltsPerVector, getNumElems(Mask.size(), EltsPerVector, Part));
      // ...
      std::optional<TTI::ShuffleKind> RegShuffleKind =
          CheckPerRegistersShuffle(SubMask, Indices, SubVecSizes);
      if (!RegShuffleKind) {
        // ...
                MaskSlice, std::max<unsigned>(NumElts, MaskSlice.size())))
        // ...
          *R.TTI, VL.front()->getType(), alignTo(NumElts, EltsPerVector));
      for (const auto [Idx, SubVecSize] : zip(Indices, SubVecSizes)) {
        assert((Idx + SubVecSize) <= BaseVF &&
               "SK_ExtractSubvector index out of range");
      // ...
          TTI, *ShuffleKinds[Part], getWidenedType(ScalarTy, NumElts), SubMask);
      if (OriginalCost < Cost)
        Cost = OriginalCost;
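    // An illustrative decomposition (assumed sizes): with 16 extracted lanes,
    // NumParts = 2 and EltsPerVector = 8, a sub-mask touching only source
    // registers 0 and 1 is costed as a cheap per-register shuffle; a sub-mask
    // touching three registers makes CheckPerRegistersShuffle return
    // std::nullopt above, and the whole-vector shuffle cost is used instead.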
  void estimateNodesPermuteCost(const TreeEntry &E1, const TreeEntry *E2,
                                // ...
                                unsigned SliceSize) {
    if (SameNodesEstimated) {
      // ...
      if ((InVectors.size() == 2 &&
      // ...
        unsigned Limit = getNumElems(Mask.size(), SliceSize, Part);
        // ...
               "Expected all poisoned elements.");
        // ...
        copy(SubMask, std::next(CommonMask.begin(), SliceSize * Part));
        // ...
      Cost += createShuffle(InVectors.front(),
                            InVectors.size() == 1 ? nullptr : InVectors.back(),
      // ...
      transformMaskAfterShuffle(CommonMask, CommonMask);
    } else if (InVectors.size() == 2) {
      Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    // ...
    SameNodesEstimated = false;
    if (!E2 && InVectors.size() == 1) {
      unsigned VF = E1.getVectorFactor();
      // ...
        VF = std::max(VF, getVF(V1));
      // ...
        VF = std::max(VF, E->getVectorFactor());
      // ...
      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
        // ...
          CommonMask[Idx] = Mask[Idx] + VF;
      Cost += createShuffle(InVectors.front(), &E1, CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    // ...
      auto P = InVectors.front();
      Cost += createShuffle(&E1, E2, Mask);
      unsigned VF = Mask.size();
      // ...
        VF = std::max(VF, E->getVectorFactor());
      // ...
      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
        // ...
          CommonMask[Idx] = Idx + (InVectors.empty() ? 0 : VF);
      Cost += createShuffle(P, InVectors.front(), CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
  // ...
  class ShuffleCostBuilder {
    // ...
    static bool isEmptyOrIdentity(ArrayRef<int> Mask, unsigned VF) {
      // ...
      return Mask.empty() ||
             (VF == Mask.size() &&
    // ...
    ~ShuffleCostBuilder() = default;
    // ...
      if (isEmptyOrIdentity(Mask, VF))
      // ...
      if (isEmptyOrIdentity(Mask, VF))
    // ...
    void resizeToMatch(Value *&, Value *&) const {}
  // ...
    ShuffleCostBuilder Builder(TTI);
    // ...
    unsigned CommonVF = Mask.size();
    // ...
    auto GetNodeMinBWAffectedCost = [&](const TreeEntry &E,
    // ...
      Type *EScalarTy = E.Scalars.front()->getType();
      bool IsSigned = true;
      if (auto It = R.MinBWs.find(&E); It != R.MinBWs.end()) {
        // ...
        IsSigned = It->second.second;
      }
      if (EScalarTy != ScalarTy) {
        unsigned CastOpcode = Instruction::Trunc;
        unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
        unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
        // ...
          CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
        return TTI.getCastInstrCost(CastOpcode, getWidenedType(ScalarTy, VF),
      // ...
      Type *EScalarTy = VecTy->getElementType();
      if (EScalarTy != ScalarTy) {
        // ...
        unsigned CastOpcode = Instruction::Trunc;
        unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
        unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
        // ...
          CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
        return TTI.getCastInstrCost(
    // ...
    if (!V1 && !V2 && !P2.isNull()) {
      // ...
      unsigned VF = E->getVectorFactor();
      // ...
      CommonVF = std::max(VF, E2->getVectorFactor());
      // ...
                 return Idx < 2 * static_cast<int>(CommonVF);
             // ...
             "All elements in mask must be less than 2 * CommonVF.");
      if (E->Scalars.size() == E2->Scalars.size()) {
        // ...
        for (int &Idx : CommonMask) {
          // ...
          if (Idx < static_cast<int>(CommonVF) && !EMask.empty())
          // ...
          else if (Idx >= static_cast<int>(CommonVF))
            Idx = (E2Mask.empty() ? Idx - CommonVF : E2Mask[Idx - CommonVF]) +
        // ...
        CommonVF = E->Scalars.size();
        ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF) +
                     GetNodeMinBWAffectedCost(*E2, CommonVF);
      // ...
        ExtraCost += GetNodeMinBWAffectedCost(*E, E->getVectorFactor()) +
                     GetNodeMinBWAffectedCost(*E2, E2->getVectorFactor());
      // ...
      V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
    } else if (!V1 && P2.isNull()) {
      // ...
      unsigned VF = E->getVectorFactor();
      // ...
             [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
             "All elements in mask must be less than CommonVF.");
      if (E->Scalars.size() == Mask.size() && VF != Mask.size()) {
        // ...
        assert(!EMask.empty() && "Expected non-empty common mask.");
        for (int &Idx : CommonMask) {
          // ...
        CommonVF = E->Scalars.size();
      } else if (unsigned Factor = E->getInterleaveFactor();
                 Factor > 0 && E->Scalars.size() != Mask.size() &&
                 // ...
        std::iota(CommonMask.begin(), CommonMask.end(), 0);
      // ...
      ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF);
      // ...
      if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() &&
          CommonVF == CommonMask.size() &&
          // ...
          [](const auto &&P) {
            // ...
            static_cast<unsigned>(P.value()) != P.index();
      // ...
    } else if (V1 && P2.isNull()) {
      // ...
      ExtraCost += GetValueMinBWAffectedCost(V1);
      CommonVF = getVF(V1);
      // ...
             [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
             "All elements in mask must be less than CommonVF.");
    } else if (V1 && !V2) {
      // ...
      unsigned VF = getVF(V1);
      // ...
      CommonVF = std::max(VF, E2->getVectorFactor());
      // ...
                 return Idx < 2 * static_cast<int>(CommonVF);
             // ...
             "All elements in mask must be less than 2 * CommonVF.");
      if (E2->Scalars.size() == VF && VF != CommonVF) {
        // ...
        assert(!E2Mask.empty() && "Expected non-empty common mask.");
        for (int &Idx : CommonMask) {
          // ...
          if (Idx >= static_cast<int>(CommonVF))
            Idx = E2Mask[Idx - CommonVF] + VF;
      // ...
      ExtraCost += GetValueMinBWAffectedCost(V1);
      // ...
      ExtraCost += GetNodeMinBWAffectedCost(
          *E2, std::min(CommonVF, E2->getVectorFactor()));
      V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
    } else if (!V1 && V2) {
      // ...
      unsigned VF = getVF(V2);
      // ...
      CommonVF = std::max(VF, E1->getVectorFactor());
      // ...
                 return Idx < 2 * static_cast<int>(CommonVF);
             // ...
             "All elements in mask must be less than 2 * CommonVF.");
      if (E1->Scalars.size() == VF && VF != CommonVF) {
        // ...
        assert(!E1Mask.empty() && "Expected non-empty common mask.");
        for (int &Idx : CommonMask) {
          // ...
          if (Idx >= static_cast<int>(CommonVF))
            Idx = E1Mask[Idx - CommonVF] + VF;
      // ...
      ExtraCost += GetNodeMinBWAffectedCost(
          *E1, std::min(CommonVF, E1->getVectorFactor()));
      // ...
      ExtraCost += GetValueMinBWAffectedCost(V2);
      V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
    } else {
      assert(V1 && V2 && "Expected both vectors.");
      unsigned VF = getVF(V1);
      CommonVF = std::max(VF, getVF(V2));
      // ...
                 return Idx < 2 * static_cast<int>(CommonVF);
             // ...
             "All elements in mask must be less than 2 * CommonVF.");
      // ...
          GetValueMinBWAffectedCost(V1) + GetValueMinBWAffectedCost(V2);
      // ...
        V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
    }
    // ...
      V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
    // ...
    InVectors.front() =
    // ...
    if (InVectors.size() == 2)
      InVectors.pop_back();
    return ExtraCost + BaseShuffleAnalysis::createShuffle<InstructionCost>(
                           V1, V2, CommonMask, Builder, ScalarTy);
  // ...
      : BaseShuffleAnalysis(ScalarTy), TTI(TTI),
        VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()), R(R),
        CheckedExtracts(CheckedExtracts) {}
  // ...
                      ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
                      unsigned NumParts, bool &UseVecBaseAsInput) {
    UseVecBaseAsInput = false;
    // ...
    Value *VecBase = nullptr;
    // ...
    if (!E->ReorderIndices.empty()) {
      // ...
                             E->ReorderIndices.end());
    // ...
    bool PrevNodeFound = any_of(
        ArrayRef(R.VectorizableTree).take_front(E->Idx),
        [&](const std::unique_ptr<TreeEntry> &TE) {
          return ((TE->hasState() && !TE->isAltShuffle() &&
                   TE->getOpcode() == Instruction::ExtractElement) ||
                  // ...
                 all_of(enumerate(TE->Scalars), [&](auto &&Data) {
                   return VL.size() > Data.index() &&
                          (Mask[Data.index()] == PoisonMaskElem ||
                           isa<UndefValue>(VL[Data.index()]) ||
                           Data.value() == VL[Data.index()]);
    // ...
      ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
      // ...
        VecBase = EE->getVectorOperand();
        UniqueBases.insert(VecBase);
        // ...
        if (!CheckedExtracts.insert(V).second ||
            // ...
            [&](const TreeEntry *TE) {
              return R.DeletedNodes.contains(TE) ||
                     R.TransformedToGatherNodes.contains(TE);
            // ...
            (E->UserTreeIndex && E->UserTreeIndex.EdgeIdx == UINT_MAX &&
             !R.isVectorized(EE) &&
             // ...
                 count_if(E->UserTreeIndex.UserTE->Scalars,
                          [&](Value *V) { return V == EE; })) ||
            // ...
              return isa<GetElementPtrInst>(U) &&
                     !R.areAllUsersVectorized(cast<Instruction>(U),
        // ...
        unsigned Idx = *EEIdx;
        // ...
        if (EE->hasOneUse() || !PrevNodeFound) {
          // ...
            Cost -= TTI.getExtractWithExtendCost(
            // ...
            Cost += TTI.getCastInstrCost(
        // ...
        APInt &DemandedElts =
            VectorOpsToExtracts
                // ...
                .first->getSecond();
        DemandedElts.setBit(Idx);
    // ...
    for (const auto &[Vec, DemandedElts] : VectorOpsToExtracts)
      // ...
                                        DemandedElts, false,
    // ...
    if (!PrevNodeFound)
      Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts);
    // ...
    transformMaskAfterShuffle(CommonMask, CommonMask);
    SameNodesEstimated = false;
    if (NumParts != 1 && UniqueBases.size() != 1) {
      UseVecBaseAsInput = true;
  // ...
  std::optional<InstructionCost>
  // ...
      return std::nullopt;
    // ...
    IsFinalized = false;
    CommonMask.clear();
    // ...
    VectorizedVals.clear();
    SameNodesEstimated = true;
  // ...
             return Idx < static_cast<int>(E1.getVectorFactor());
           // ...
           "Expected single vector shuffle mask.");
    // ...
    if (InVectors.empty()) {
      CommonMask.assign(Mask.begin(), Mask.end());
      InVectors.assign({&E1, &E2});
      // ...
    assert(!CommonMask.empty() && "Expected non-empty common mask.");
    // ...
    unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
    estimateNodesPermuteCost(E1, &E2, Mask, Part, SliceSize);
  // ...
    if (InVectors.empty()) {
      CommonMask.assign(Mask.begin(), Mask.end());
      InVectors.assign(1, &E1);
      // ...
    assert(!CommonMask.empty() && "Expected non-empty common mask.");
    // ...
    unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
    estimateNodesPermuteCost(E1, nullptr, Mask, Part, SliceSize);
    if (!SameNodesEstimated && InVectors.size() == 1)
      InVectors.emplace_back(&E1);
  // ...
    assert(InVectors.size() == 1 &&
           // ...
                       ->getOrdered(P.index()));
               return EI->getVectorOperand() == V1 ||
                      EI->getVectorOperand() == V2;
           // ...
           "Expected extractelement vectors.");
  // ...
    if (InVectors.empty()) {
      assert(CommonMask.empty() && !ForExtracts &&
             "Expected empty input mask/vectors.");
      CommonMask.assign(Mask.begin(), Mask.end());
      InVectors.assign(1, V1);
      // ...
           !CommonMask.empty() &&
           // ...
                           ->getOrdered(P.index());
                   return P.value() == Mask[P.index()] ||
                   // ...
                     return EI->getVectorOperand() == V1;
           // ...
           "Expected only tree entry for extractelement vectors.");
    // ...
    assert(!InVectors.empty() && !CommonMask.empty() &&
           "Expected only tree entries from extracts/reused buildvectors.");
    unsigned VF = getVF(V1);
    if (InVectors.size() == 2) {
      Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
      VF = std::max<unsigned>(VF, CommonMask.size());
    } else if (const auto *InTE =
                   InVectors.front().dyn_cast<const TreeEntry *>()) {
      VF = std::max(VF, InTE->getVectorFactor());
      // ...
              ->getNumElements());
    // ...
    InVectors.push_back(V1);
    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
      // ...
        CommonMask[Idx] = Mask[Idx] + VF;
  // ...
              Value *Root = nullptr) {
    Cost += getBuildVectorCost(VL, Root);
    // ...
    unsigned VF = VL.size();
    // ...
      VF = std::min(VF, MaskVF);
    Type *VLScalarTy = VL.front()->getType();
    // ...
          getAllOnesValue(*R.DL, ScalarTy->getScalarType()));
  // ...
           ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
  // ...
    IsFinalized = true;
    // ...
      if (InVectors.size() == 2)
        Cost += createShuffle(Vec, InVectors.back(), CommonMask);
      else
        Cost += createShuffle(Vec, nullptr, CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
      // ...
             "Expected vector length for the final value before action.");
      // ...
      Cost += createShuffle(V1, V2, Mask);
      // ...
      InVectors.front() = V;
    // ...
    if (!SubVectors.empty()) {
      // ...
      if (InVectors.size() == 2)
        Cost += createShuffle(Vec, InVectors.back(), CommonMask);
      else
        Cost += createShuffle(Vec, nullptr, CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
      // ...
      if (!SubVectorsMask.empty()) {
        // ...
               "Expected same size of masks for subvectors and common mask.");
        // ...
        copy(SubVectorsMask, SVMask.begin());
        for (auto [I1, I2] : zip(SVMask, CommonMask)) {
          // ...
            I1 = I2 + CommonMask.size();
      // ...
      for (auto [E, Idx] : SubVectors) {
        Type *EScalarTy = E->Scalars.front()->getType();
        bool IsSigned = true;
        if (auto It = R.MinBWs.find(E); It != R.MinBWs.end()) {
          // ...
          IsSigned = It->second.second;
        }
        if (ScalarTy != EScalarTy) {
          unsigned CastOpcode = Instruction::Trunc;
          unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
          unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
          // ...
            CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
          Cost += TTI.getCastInstrCost(
        // ...
        if (!CommonMask.empty()) {
          std::iota(std::next(CommonMask.begin(), Idx),
                    std::next(CommonMask.begin(), Idx + E->getVectorFactor()),
    // ...
    if (!ExtMask.empty()) {
      if (CommonMask.empty()) {
        // ...
        for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
          // ...
          NewMask[I] = CommonMask[ExtMask[I]];
        // ...
        CommonMask.swap(NewMask);
    // ...
    if (CommonMask.empty()) {
      assert(InVectors.size() == 1 && "Expected only one vector with no mask");
      // ...
    }
    // ...
        createShuffle(InVectors.front(),
                      InVectors.size() == 2 ? InVectors.back() : nullptr,
    // ...
    assert((IsFinalized || CommonMask.empty()) &&
           "Shuffle construction must be finalized.");
14714const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(
const TreeEntry *E,
14715 unsigned Idx)
const {
14716 TreeEntry *
Op = OperandsToTreeEntry.
at({E, Idx});
14717 assert(
Op->isSame(
E->getOperand(Idx)) &&
"Operands mismatch!");
14722 if (
TE.State == TreeEntry::ScatterVectorize ||
14723 TE.State == TreeEntry::StridedVectorize)
14725 if (
TE.State == TreeEntry::CompressVectorize)
14727 if (
TE.State == TreeEntry::Vectorize &&
TE.getOpcode() == Instruction::Load &&
14728 !
TE.isAltShuffle()) {
14729 if (
TE.ReorderIndices.empty())
14731 SmallVector<int>
Mask;
InstructionCost
BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
                      SmallPtrSetImpl<Value *> &CheckedExtracts) {
  ArrayRef<Value *> VL = E->Scalars;
  // ...
    return InstructionCost::getInvalid();
  // If we have computed a smaller type for the expression, update VecTy so
  // that the costs will be accurate.
  auto It = MinBWs.find(E);
  Type *OrigScalarTy = ScalarTy;
  if (It != MinBWs.end()) {
    // ...
  }
  unsigned EntryVF = E->getVectorFactor();
  // ...
  if (E->isGather() || TransformedToGatherNodes.contains(E)) {
    // ...
      return InstructionCost::getInvalid();
    ScalarTy = VL.front()->getType();
    return processBuildVector<ShuffleCostEstimator, InstructionCost>(
        E, ScalarTy, *TTI, VectorizedVals, *this, CheckedExtracts);
  }
  if (E->State == TreeEntry::SplitVectorize) {
    assert(E->CombinedEntriesWithIndices.size() == 2 &&
           "Expected exactly 2 combined entries.");
    assert(E->ReuseShuffleIndices.empty() && "Expected empty reuses mask.");
    // ...
    if (E->ReorderIndices.empty()) {
      // ...
          E->CombinedEntriesWithIndices.back().second,
          VectorizableTree[E->CombinedEntriesWithIndices.back().first]
              ->getVectorFactor()));
    } else {
      unsigned CommonVF =
          std::max(VectorizableTree[E->CombinedEntriesWithIndices.front().first]
                       ->getVectorFactor(),
                   VectorizableTree[E->CombinedEntriesWithIndices.back().first]
                       ->getVectorFactor());
      // ...
    }
    LLVM_DEBUG(dumpTreeCosts(E, 0, VectorCost, 0, "Calculated costs for Tree"));
    return VectorCost;
  }
  InstructionCost CommonCost = 0;
  SmallVector<int> Mask;
  if (!E->ReorderIndices.empty() && E->State != TreeEntry::CompressVectorize &&
      (E->State != TreeEntry::StridedVectorize ||
       !isReverseOrder(E->ReorderIndices))) {
    SmallVector<int> NewMask;
    if (E->getOpcode() == Instruction::Store) {
      // For stores the order is actually a mask.
      NewMask.resize(E->ReorderIndices.size());
      copy(E->ReorderIndices, NewMask.begin());
    } else {
      inversePermutation(E->ReorderIndices, NewMask);
    }
    ::addMask(Mask, NewMask);
  }
  if (!E->ReuseShuffleIndices.empty())
    ::addMask(Mask, E->ReuseShuffleIndices);
  // ...
  assert((E->State == TreeEntry::Vectorize ||
          E->State == TreeEntry::ScatterVectorize ||
          E->State == TreeEntry::StridedVectorize ||
          E->State == TreeEntry::CompressVectorize) &&
         "Unhandled state");
  assert(((allSameType(VL) && allSameBlock(VL)) ||
          (E->getOpcode() == Instruction::GetElementPtr &&
           E->getMainOp()->getType()->isPointerTy()) ||
          E->hasCopyableElements()) &&
         "Invalid VL");
  Instruction *VL0 = E->getMainOp();
  unsigned ShuffleOrOp =
      E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
  if (E->CombinedOp != TreeEntry::NotCombinedOp)
    ShuffleOrOp = E->CombinedOp;
  SmallSetVector<Value *, 16> UniqueValues;
  SmallVector<unsigned, 16> UniqueIndexes;
  for (const auto [Idx, V] : enumerate(VL))
    if (UniqueValues.insert(V))
      UniqueIndexes.push_back(Idx);
  const unsigned Sz = UniqueValues.size();
  SmallBitVector UsedScalars(Sz, false);
  for (unsigned I = 0; I < Sz; ++I) {
    if (isa<Instruction>(UniqueValues[I]) &&
        !E->isCopyableElement(UniqueValues[I]) &&
        getTreeEntries(UniqueValues[I]).front() == E)
      UsedScalars.set(I);
  }
  auto GetCastContextHint = [&](Value *V) {
    if (ArrayRef<TreeEntry *> OpTEs = getTreeEntries(V); OpTEs.size() == 1)
      return getCastContextHint(*OpTEs.front());
    InstructionsState SrcState = getSameOpcode(E->getOperand(0), *TLI);
    if (SrcState && SrcState.getOpcode() == Instruction::Load &&
        !SrcState.isAltShuffle())
      return TTI::CastContextHint::GatherScatter;
    return TTI::CastContextHint::None;
  };
  auto GetCostDiff =
      [=](function_ref<InstructionCost(unsigned)> ScalarEltCost,
          function_ref<InstructionCost(InstructionCost)> VectorCost) {
        // Calculate the cost of this instruction.
        InstructionCost ScalarCost = 0;
        if (isa<CastInst, CallInst>(VL0)) {
          // For some instructions there is no need to calculate the cost per
          // lane; use the cost of a single lane times the lane count.
          ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0);
        } else {
          for (unsigned I = 0; I < Sz; ++I) {
            if (UsedScalars.test(I))
              continue;
            ScalarCost += ScalarEltCost(I);
          }
        }
        InstructionCost VecCost = VectorCost(CommonCost);
        // Check if the current node must be resized because the user node was
        // demoted to a smaller type.
        if (It != MinBWs.end() && !UnaryInstruction::isCast(E->getOpcode()) &&
            (E->getOpcode() != Instruction::Load || E->UserTreeIndex)) {
          const EdgeInfo &EI = E->UserTreeIndex;
          if (!EI.UserTE->hasState() ||
              EI.UserTE->getOpcode() != Instruction::Select ||
              EI.EdgeIdx != 0) {
            auto UserBWIt = MinBWs.find(EI.UserTE);
            Type *UserScalarTy =
                (EI.UserTE->isGather() ||
                 EI.UserTE->State == TreeEntry::SplitVectorize)
                    ? EI.UserTE->Scalars.front()->getType()
                    : EI.UserTE->getOperand(EI.EdgeIdx).front()->getType();
            if (UserBWIt != MinBWs.end())
              UserScalarTy = IntegerType::get(ScalarTy->getContext(),
                                              UserBWIt->second.first);
            if (ScalarTy != UserScalarTy) {
              unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
              unsigned SrcBWSz = DL->getTypeSizeInBits(UserScalarTy);
              unsigned VecOpcode;
              auto *UserVecTy = getWidenedType(UserScalarTy, E->Scalars.size());
              if (BWSz > SrcBWSz)
                VecOpcode = Instruction::Trunc;
              else
                VecOpcode =
                    It->second.second ? Instruction::SExt : Instruction::ZExt;
              TTI::CastContextHint CCH = GetCastContextHint(VL0);
              VecCost += TTI->getCastInstrCost(VecOpcode, UserVecTy, VecTy, CCH,
                                               CostKind);
            }
          }
        }
        LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost - CommonCost,
                                 ScalarCost, "Calculated costs for Tree"));
        return VecCost - ScalarCost;
      };
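// Note the convention used throughout this switch: costs are reported as
// (vector cost - scalar cost), so a negative result means the entry is
// profitable by itself. CommonCost carries the reorder/reuse shuffle cost
// computed above and is folded into the vector side.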
  auto GetGEPCostDiff = [=](ArrayRef<Value *> Ptrs, Value *BasePtr) {
    assert((E->State == TreeEntry::Vectorize ||
            E->State == TreeEntry::StridedVectorize ||
            E->State == TreeEntry::CompressVectorize) &&
           "Entry state expected to be Vectorize, StridedVectorize or "
           "MaskedLoadCompressVectorize here.");
    InstructionCost ScalarCost = 0;
    InstructionCost VecCost = 0;
    std::tie(ScalarCost, VecCost) = getGEPCosts(
        *TTI, Ptrs, BasePtr, E->getOpcode(), CostKind, OrigScalarTy, VecTy);
    LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost,
                             "Calculated GEPs cost for Tree"));
    return VecCost - ScalarCost;
  };
  auto GetMinMaxCost = [&](Type *Ty, Instruction *VI = nullptr) {
    auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(VI ? VI : VL0);
    if (MinMaxID == Intrinsic::not_intrinsic)
      return InstructionCost::getInvalid();
    Type *CanonicalType = Ty;
    // ...
    IntrinsicCostAttributes CostAttrs(MinMaxID, CanonicalType,
                                      {CanonicalType, CanonicalType});
    InstructionCost IntrinsicCost =
        TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
    // If the selects are the only uses of the compares, they will be dead
    // and we can adjust the cost by removing their cost.
    if (VI && SelectOnly) {
      assert((!Ty->isVectorTy() || SLPReVec) &&
             "Expected only for scalar type.");
      auto *CI = cast<CmpInst>(VI->getOperand(0));
      IntrinsicCost -= TTI->getCmpSelInstrCost(
          CI->getOpcode(), Ty, Builder.getInt1Ty(), CI->getPredicate(),
          CostKind, {TTI::OK_AnyValue, TTI::OP_None},
          {TTI::OK_AnyValue, TTI::OP_None}, CI);
    }
    return IntrinsicCost;
  };
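// Combined opcodes (TreeEntry::MinMax, TreeEntry::FMulAdd) represent
// two-instruction patterns (cmp+select, fmul+fadd) folded into a single
// intrinsic, so they are costed as one intrinsic call instead of two
// separate instructions.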
  auto GetFMulAddCost = [&, &TTI = *TTI](const InstructionsState &S,
                                         Instruction *I) {
    // ...
  };
  switch (ShuffleOrOp) {
  case Instruction::PHI: {
    // Count reused scalars.
    InstructionCost ScalarCost = 0;
    SmallPtrSet<const TreeEntry *, 4> CountedOps;
    for (Value *V : UniqueValues) {
      auto *PHI = dyn_cast<PHINode>(V);
      if (!PHI)
        continue;
      ValueList Operands(PHI->getNumIncomingValues(), nullptr);
      for (unsigned I = 0, N = PHI->getNumIncomingValues(); I < N; ++I)
        Operands[I] = PHI->getIncomingValue(I);
      if (const TreeEntry *OpTE =
              getSameValuesTreeEntry(Operands.front(), Operands))
        if (CountedOps.insert(OpTE).second &&
            !OpTE->ReuseShuffleIndices.empty())
          ScalarCost += TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() -
                                          OpTE->Scalars.size());
    }
    return CommonCost - ScalarCost;
  }
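// Extracts: the vector side is essentially CommonCost; the scalarization
// overhead of the demanded lanes is subtracted because the original
// extractelement/extractvalue instructions disappear after vectorization.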
  case Instruction::ExtractValue:
  case Instruction::ExtractElement: {
    APInt DemandedElts;
    VectorType *SrcVecTy = nullptr;
    auto GetScalarCost = [&](unsigned Idx) {
      // ...
      if (ShuffleOrOp == Instruction::ExtractElement) {
        auto *EE = cast<ExtractElementInst>(I);
        SrcVecTy = EE->getVectorOperandType();
      } else {
        auto *EV = cast<ExtractValueInst>(I);
        Type *AggregateTy = EV->getAggregateOperand()->getType();
        unsigned NumElts;
        if (auto *ATy = dyn_cast<ArrayType>(AggregateTy))
          NumElts = ATy->getNumElements();
        else
          NumElts = AggregateTy->getStructNumElements();
        // ...
      }
      if (I->hasOneUse()) {
        // If the extract feeds a cast removed by the minimum-bitwidth
        // analysis, fold the cast's cost away as well.
        // ...
        Cost -= TTI->getCastInstrCost(
            // ...
      }
      // ...
      if (DemandedElts.isZero())
        // ...
    };
    auto GetVectorCost = [&](InstructionCost CommonCost) {
      return CommonCost - (DemandedElts.isZero()
                               ? TTI::TCC_Free
                               : TTI.getScalarizationOverhead(
                                     SrcVecTy, DemandedElts, /*Insert=*/false,
                                     /*Extract=*/true, CostKind));
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
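// InsertElement: a buildvector-like entry is modeled as inserting a
// subvector of InsertVecSz lanes at Offset, plus a permute when the insert
// indices are not an identity run and a blend with the original vector when
// it is not fully overwritten.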
  case Instruction::InsertElement: {
    assert(E->ReuseShuffleIndices.empty() &&
           "Unique insertelements only are expected.");
    auto *SrcVecTy = cast<FixedVectorType>(VL0->getType());
    unsigned const NumElts = SrcVecTy->getNumElements();
    unsigned const NumScalars = VL.size();
    // ...
    APInt DemandedElts = APInt::getZero(NumElts);
    SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
    unsigned OffsetBeg = *getElementIndex(VL.front());
    unsigned OffsetEnd = OffsetBeg;
    InsertMask[OffsetBeg] = 0;
    for (auto [I, V] : enumerate(VL.drop_front())) {
      unsigned Idx = *getElementIndex(V);
      if (OffsetBeg > Idx)
        OffsetBeg = Idx;
      else if (OffsetEnd < Idx)
        OffsetEnd = Idx;
      InsertMask[Idx] = I + 1;
    }
    unsigned VecScalarsSz = PowerOf2Ceil(NumElts);
    unsigned NumOfParts = TTI->getNumberOfParts(SrcVecTy);
    if (NumOfParts > 0 && NumOfParts < NumElts)
      VecScalarsSz = PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
    unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
                     VecScalarsSz;
    unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
    unsigned InsertVecSz = std::min<unsigned>(
        PowerOf2Ceil(OffsetEnd - OffsetBeg + 1),
        ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * VecScalarsSz);
    bool IsWholeSubvector =
        OffsetBeg == Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
    // Check if we can safely insert a subvector. If not, generate a
    // whole-size vector and shuffle the source with the new subvector.
    if (OffsetBeg + InsertVecSz > VecSz) {
      // ...
      InsertVecSz = VecSz;
    }
    // ...
    SmallVector<int> Mask;
    if (!E->ReorderIndices.empty()) {
      inversePermutation(E->ReorderIndices, Mask);
      Mask.append(InsertVecSz - Mask.size(), PoisonMaskElem);
    } else {
      Mask.assign(VecSz, PoisonMaskElem);
      std::iota(Mask.begin(), std::next(Mask.begin(), InsertVecSz), 0);
    }
    bool IsIdentity = true;
    SmallVector<int> PrevMask(InsertVecSz, PoisonMaskElem);
    Mask.swap(PrevMask);
    for (unsigned I = 0; I < NumScalars; ++I) {
      unsigned InsertIdx = *getElementIndex(VL[PrevMask[I]]);
      DemandedElts.setBit(InsertIdx);
      IsIdentity &= InsertIdx - OffsetBeg == I;
      Mask[InsertIdx - OffsetBeg] = I;
    }
    assert(Offset < NumElts && "Failed to find vector index offset");
    InstructionCost Cost = 0;
    // ...
    auto *InsertVecTy = getWidenedType(ScalarTy, InsertVecSz);
    if (!IsIdentity)
      Cost += ::getShuffleCost(*TTI, TargetTransformInfo::SK_PermuteSingleSrc,
                               InsertVecTy, Mask);
    auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
      return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
    }));
    // ...
    SmallBitVector InMask =
        isUndefVector(FirstInsert->getOperand(0),
                      buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask));
    if (!InMask.all() && NumScalars != NumElts && !IsWholeSubvector) {
      if (InsertVecSz != VecSz) {
        // ...
      }
      for (unsigned I = 0, End = OffsetBeg - Offset; I < End; ++I)
        // ...
      for (unsigned I = OffsetBeg - Offset, End = OffsetEnd - Offset;
           I <= End; ++I)
        // ...
      for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I)
        // ...
    }
    return Cost;
  }
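// Casts interact with the minimum-bitwidth analysis: if both sides were
// demoted to the same width, the cast degenerates to a (free) bitcast; a
// narrower destination turns it into a trunc, a wider one into sext/zext
// according to the recorded signedness. For example, zext i8 -> i32 with
// both ends demoted to i16 is costed as a no-op bitcast.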
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
    Type *SrcScalarTy = VL0->getOperand(0)->getType();
    auto *SrcVecTy = getWidenedType(SrcScalarTy, VL.size());
    unsigned Opcode = ShuffleOrOp;
    unsigned VecOpcode = Opcode;
    if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
        (SrcIt != MinBWs.end() || It != MinBWs.end())) {
      // Check if the values are candidates to demote.
      unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy->getScalarType());
      if (SrcIt != MinBWs.end()) {
        SrcBWSz = SrcIt->second.first;
        // ...
      }
      unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType());
      if (BWSz == SrcBWSz) {
        VecOpcode = Instruction::BitCast;
      } else if (BWSz < SrcBWSz) {
        VecOpcode = Instruction::Trunc;
      } else if (It != MinBWs.end()) {
        assert(BWSz > SrcBWSz && "Invalid cast!");
        VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
      } else if (SrcIt != MinBWs.end()) {
        assert(BWSz > SrcBWSz && "Invalid cast!");
        VecOpcode =
            SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
      }
    } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
               !SrcIt->second.second) {
      VecOpcode = Instruction::UIToFP;
    }
    auto GetScalarCost = [&](unsigned Idx) -> InstructionCost {
      assert(Idx == 0 && "Expected 0 index only");
      return TTI->getCastInstrCost(Opcode, VL0->getType(),
                                   VL0->getOperand(0)->getType(),
                                   TTI::getCastContextHint(VL0), CostKind, VL0);
    };
    auto GetVectorCost = [=](InstructionCost CommonCost) {
      // Do not count the cost if the minimum bitwidth analysis reduced the
      // cast to a plain bitcast (it is a noop).
      if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
        return CommonCost;
      auto *VI = VL0->getOpcode() == Opcode ? VL0 : nullptr;
      TTI::CastContextHint CCH = GetCastContextHint(VL0->getOperand(0));
      bool IsArithmeticExtendedReduction =
          E->Idx == 0 && UserIgnoreList &&
          all_of(*UserIgnoreList, [](Value *V) {
            auto *I = cast<Instruction>(V);
            return is_contained({Instruction::Add, Instruction::FAdd,
                                 Instruction::Mul, Instruction::FMul,
                                 Instruction::And, Instruction::Or,
                                 Instruction::Xor},
                                I->getOpcode());
          });
      if (IsArithmeticExtendedReduction &&
          (VecOpcode == Instruction::ZExt || VecOpcode == Instruction::SExt))
        return CommonCost;
      return CommonCost +
             TTI->getCastInstrCost(VecOpcode, VecTy, SrcVecTy, CCH, CostKind,
                                   VecOpcode == Opcode ? VI : nullptr);
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
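// Cmp/select: scalar lanes are costed with their own predicates, while the
// vector cost is queried once with the common (or swapped) predicate; a
// select whose condition has fewer lanes than the result also pays for
// broadcasting the condition.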
  case Instruction::FCmp:
  case Instruction::ICmp:
  case Instruction::Select: {
    CmpPredicate VecPred, SwappedVecPred;
    auto MatchCmp = m_Cmp(VecPred, m_Value(), m_Value());
    if (match(VL0, m_Select(MatchCmp, m_Value(), m_Value())) ||
        match(VL0, MatchCmp))
      SwappedVecPred = CmpInst::getSwappedPredicate(VecPred);
    // ...
    auto GetScalarCost = [&](unsigned Idx) {
      // ...
      auto *VI = cast<Instruction>(UniqueValues[Idx]);
      if ((!match(VI, m_Select(MatchCmp, m_Value(), m_Value())) &&
           !match(VI, MatchCmp)) ||
          // ...
      return TTI->getCmpSelInstrCost(
          E->getOpcode(), OrigScalarTy, Builder.getInt1Ty(), CurrentPred,
          CostKind, getOperandInfo(VI->getOperand(0)),
          getOperandInfo(VI->getOperand(1)), VI);
    };
    auto GetVectorCost = [&](InstructionCost CommonCost) {
      auto *MaskTy = getWidenedType(Builder.getInt1Ty(), VL.size());
      InstructionCost VecCost =
          TTI->getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy, VecPred,
                                  CostKind, getOperandInfo(E->getOperand(0)),
                                  getOperandInfo(E->getOperand(1)), VL0);
      if (auto *SI = dyn_cast<SelectInst>(VL0)) {
        auto *CondType =
            getWidenedType(SI->getCondition()->getType(), VL.size());
        unsigned CondNumElements = CondType->getNumElements();
        unsigned VecTyNumElements = getNumElements(VecTy);
        assert(VecTyNumElements >= CondNumElements &&
               VecTyNumElements % CondNumElements == 0 &&
               "Cannot vectorize Instruction::Select");
        if (CondNumElements != VecTyNumElements) {
          // Condition requires broadcast.
          // ...
        }
      }
      return VecCost + CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case TreeEntry::MinMax: {
    auto GetScalarCost = [&](unsigned Idx) {
      return GetMinMaxCost(OrigScalarTy);
    };
    auto GetVectorCost = [&](InstructionCost CommonCost) {
      InstructionCost VecCost = GetMinMaxCost(VecTy);
      return VecCost + CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case TreeEntry::FMulAdd: {
    auto GetScalarCost = [&](unsigned Idx) {
      // ...
      return GetFMulAddCost(E->getOperations(),
                            cast<Instruction>(UniqueValues[Idx]));
    };
    auto GetVectorCost = [&](InstructionCost CommonCost) {
      FastMathFlags FMF;
      // ...
      for (Value *V : E->Scalars) {
        if (auto *FPCI = dyn_cast<FPMathOperator>(V)) {
          FMF &= FPCI->getFastMathFlags();
          if (auto *FPCIOp = dyn_cast<FPMathOperator>(FPCI->getOperand(0)))
            FMF &= FPCIOp->getFastMathFlags();
        }
      }
      IntrinsicCostAttributes ICA(Intrinsic::fmuladd, VecTy,
                                  {VecTy, VecTy, VecTy}, FMF);
      InstructionCost VecCost = TTI->getIntrinsicInstrCost(ICA, CostKind);
      return VecCost + CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
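// Binary operators: TTI operand info (constant/uniform operand kinds) is
// passed per lane on the scalar side and per operand vector on the vector
// side, letting targets discount cases such as shifts by a uniform constant.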
  case Instruction::FNeg:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    auto GetScalarCost = [&](unsigned Idx) {
      // ...
      unsigned Lane = UniqueIndexes[Idx];
      Value *Op1 = E->getOperand(0)[Lane];
      Value *Op2;
      SmallVector<const Value *, 2> Operands(1, Op1);
      if (ShuffleOrOp != Instruction::FNeg) {
        Op2 = E->getOperand(1)[Lane];
        Operands.push_back(Op2);
      }
      TTI::OperandValueInfo Op1Info = TTI::getOperandInfo(Op1);
      TTI::OperandValueInfo Op2Info = TTI::getOperandInfo(Op2);
      InstructionCost ScalarCost = TTI->getArithmeticInstrCost(
          ShuffleOrOp, OrigScalarTy, CostKind, Op1Info, Op2Info, Operands);
      if (auto *I = dyn_cast<Instruction>(UniqueValues[Idx]);
          I && (ShuffleOrOp == Instruction::FAdd ||
                ShuffleOrOp == Instruction::FSub)) {
        // ...
      }
      return ScalarCost;
    };
    auto GetVectorCost = [=](InstructionCost CommonCost) {
      if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
        // An 'and' with a mask of low ones becomes free after demotion.
        for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
          if (all_of(E->getOperand(I), [&](Value *Op) {
                auto *CI = dyn_cast<ConstantInt>(Op);
                return CI && CI->getValue().countr_one() >= It->second.first;
              }))
            return CommonCost;
        }
      }
      // ...
      return TTI->getArithmeticInstrCost(ShuffleOrOp, VecTy, CostKind, Op1Info,
                                         Op2Info, {}, nullptr, TLI) +
             CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::GetElementPtr: {
    return CommonCost + GetGEPCostDiff(VL, VL0);
  }
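// Loads are costed according to how the entry will actually be emitted: a
// plain (possibly interleaved) vector load, a strided
// experimental_vp_strided_load, a masked compressing load plus decompress
// shuffle, or a masked gather for scattered pointers.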
  case Instruction::Load: {
    auto GetScalarCost = [&](unsigned Idx) {
      auto *VI = cast<LoadInst>(UniqueValues[Idx]);
      return TTI->getMemoryOpCost(Instruction::Load, OrigScalarTy,
                                  VI->getAlign(), VI->getPointerAddressSpace(),
                                  CostKind, TTI::OperandValueInfo(), VI);
    };
    auto *LI0 = cast<LoadInst>(VL0);
    auto GetVectorCost = [&](InstructionCost CommonCost) {
      InstructionCost VecLdCost;
      switch (E->State) {
      case TreeEntry::Vectorize:
        if (unsigned Factor = E->getInterleaveFactor()) {
          VecLdCost = TTI->getInterleavedMemoryOpCost(
              Instruction::Load, VecTy, Factor, {}, LI0->getAlign(),
              LI0->getPointerAddressSpace(), CostKind);
        } else {
          VecLdCost = TTI->getMemoryOpCost(
              Instruction::Load, VecTy, LI0->getAlign(),
              LI0->getPointerAddressSpace(), CostKind);
        }
        break;
      case TreeEntry::StridedVectorize: {
        const StridedPtrInfo &SPtrInfo = TreeEntryToStridedPtrInfoMap.at(E);
        FixedVectorType *StridedLoadTy = SPtrInfo.Ty;
        assert(StridedLoadTy && "Missing StridedPoinerInfo for tree entry.");
        Align CommonAlignment =
            computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
        VecLdCost = TTI->getMemIntrinsicInstrCost(
            MemIntrinsicCostAttributes(Intrinsic::experimental_vp_strided_load,
                                       StridedLoadTy, LI0->getPointerOperand(),
                                       /*VariableMask=*/false, CommonAlignment),
            CostKind);
        if (StridedLoadTy != VecTy)
          VecLdCost +=
              TTI->getCastInstrCost(Instruction::BitCast, VecTy, StridedLoadTy,
                                    getCastContextHint(*E), CostKind);
        break;
      }
      case TreeEntry::CompressVectorize: {
        bool IsMasked;
        unsigned InterleaveFactor;
        SmallVector<int> CompressMask;
        VectorType *LoadVecTy;
        SmallVector<Value *> Scalars(E->Scalars.begin(), E->Scalars.end());
        if (!E->ReorderIndices.empty()) {
          SmallVector<int> Mask(E->ReorderIndices.begin(),
                                E->ReorderIndices.end());
          reorderScalars(Scalars, Mask);
        }
        // ...
        [[maybe_unused]] bool IsVectorized = isMaskedLoadCompress(
            Scalars, PointerOps, E->ReorderIndices, *TTI, *DL, *SE, *AC, *DT,
            *TLI, [](Value *) { return true; }, IsMasked, InterleaveFactor,
            CompressMask, LoadVecTy);
        assert(IsVectorized && "Failed to vectorize load");
        CompressEntryToData.try_emplace(E, CompressMask, LoadVecTy,
                                        InterleaveFactor, IsMasked);
        Align CommonAlignment = LI0->getAlign();
        if (InterleaveFactor) {
          VecLdCost = TTI->getInterleavedMemoryOpCost(
              Instruction::Load, LoadVecTy, InterleaveFactor, {},
              CommonAlignment, LI0->getPointerAddressSpace(), CostKind);
        } else if (IsMasked) {
          VecLdCost = TTI->getMemIntrinsicInstrCost(
              MemIntrinsicCostAttributes(Intrinsic::masked_load, LoadVecTy,
                                         CommonAlignment,
                                         LI0->getPointerAddressSpace()),
              CostKind);
          VecLdCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
                                        LoadVecTy, CompressMask, CostKind);
        } else {
          VecLdCost = TTI->getMemoryOpCost(
              Instruction::Load, LoadVecTy, CommonAlignment,
              LI0->getPointerAddressSpace(), CostKind);
          VecLdCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
                                        LoadVecTy, CompressMask, CostKind);
        }
        break;
      }
      case TreeEntry::ScatterVectorize: {
        Align CommonAlignment =
            computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
        VecLdCost = TTI->getMemIntrinsicInstrCost(
            MemIntrinsicCostAttributes(Intrinsic::masked_gather, VecTy,
                                       LI0->getPointerOperand(),
                                       /*VariableMask=*/false, CommonAlignment),
            CostKind);
        break;
      }
      case TreeEntry::CombinedVectorize:
      case TreeEntry::SplitVectorize:
      case TreeEntry::NeedToGather:
        llvm_unreachable("Unexpected vectorization state.");
      }
      return VecLdCost + CommonCost;
    };
    // If this node generates a masked gather load, it is not a terminal node;
    // the address operand cost is estimated separately.
    if (E->State == TreeEntry::ScatterVectorize)
      return GetCostDiff(GetScalarCost, GetVectorCost);
    // Estimate the cost of GEPs since this tree node is a terminator.
    // ...
    InstructionCost Cost = GetCostDiff(GetScalarCost, GetVectorCost);
    return Cost + GetGEPCostDiff(PointerOps, LI0->getPointerOperand());
  }
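// Stores mirror the load logic: strided stores use the vp strided-store
// intrinsic, interleaved groups getInterleavedMemoryOpCost, and consecutive
// stores a single vector store; pointer-computation costs are added via
// GetGEPCostDiff at the end.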
  case Instruction::Store: {
    bool IsReorder = !E->ReorderIndices.empty();
    auto GetScalarCost = [=](unsigned Idx) {
      auto *VI = cast<StoreInst>(VL[Idx]);
      TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(VI->getValueOperand());
      return TTI->getMemoryOpCost(Instruction::Store, OrigScalarTy,
                                  VI->getAlign(), VI->getPointerAddressSpace(),
                                  CostKind, OpInfo, VI);
    };
    auto *BaseSI =
        cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
    auto GetVectorCost = [=](InstructionCost CommonCost) {
      // We know that we can merge the stores. Calculate the cost.
      InstructionCost VecStCost;
      if (E->State == TreeEntry::StridedVectorize) {
        Align CommonAlignment =
            computeCommonAlignment<StoreInst>(UniqueValues.getArrayRef());
        VecStCost = TTI->getMemIntrinsicInstrCost(
            MemIntrinsicCostAttributes(Intrinsic::experimental_vp_strided_store,
                                       VecTy, BaseSI->getPointerOperand(),
                                       /*VariableMask=*/false, CommonAlignment),
            CostKind);
      } else {
        assert(E->State == TreeEntry::Vectorize &&
               "Expected either strided or consecutive stores.");
        if (unsigned Factor = E->getInterleaveFactor()) {
          assert(E->ReuseShuffleIndices.empty() && !E->ReorderIndices.empty() &&
                 "No reused shuffles expected");
          VecStCost = TTI->getInterleavedMemoryOpCost(
              Instruction::Store, VecTy, Factor, {}, BaseSI->getAlign(),
              BaseSI->getPointerAddressSpace(), CostKind);
        } else {
          TTI::OperandValueInfo OpInfo = getOperandInfo(E->getOperand(0));
          VecStCost = TTI->getMemoryOpCost(
              Instruction::Store, VecTy, BaseSI->getAlign(),
              BaseSI->getPointerAddressSpace(), CostKind, OpInfo);
        }
      }
      return VecStCost + CommonCost;
    };
    SmallVector<Value *> PointerOps(VL.size());
    for (auto [I, V] : enumerate(VL)) {
      unsigned Idx = IsReorder ? E->ReorderIndices[I] : I;
      PointerOps[Idx] = cast<StoreInst>(V)->getPointerOperand();
    }
    return GetCostDiff(GetScalarCost, GetVectorCost) +
           GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
  }
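// Calls: the vector side takes the cheaper of a vector intrinsic call and a
// vector library call, as computed by getVectorCallCosts.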
  case Instruction::Call: {
    auto GetScalarCost = [&](unsigned Idx) {
      auto *CI = cast<CallInst>(UniqueValues[Idx]);
      // ...
      IntrinsicCostAttributes CostAttrs(ID, *CI, 1);
      return TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
    };
    auto GetVectorCost = [=](InstructionCost CommonCost) {
      auto *CI = cast<CallInst>(VL0);
      // ...
      auto VecCallCosts = getVectorCallCosts(
          CI, ID, VecTy->getNumElements(),
          It != MinBWs.end() ? It->second.first : 0, TTI);
      return std::min(VecCallCosts.first, VecCallCosts.second) + CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
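// Alternate-opcode nodes (e.g. mixed fadd/fsub) pay for both vector opcodes
// plus the blending shuffle, unless an identical diamond node already
// exists; if the target supports a fused alternate instruction
// (isLegalAltInstr), the cheaper of the two estimates wins.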
  case Instruction::ShuffleVector: {
    assert(E->isAltShuffle() && /* ... */ "Invalid Shuffle Vector Operand");
    // Try to find a previous shuffle node with the same operands and the same
    // main/alternate ops.
    auto TryFindNodeWithEqualOperands = [=]() {
      for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
        if (TE.get() == E)
          break;
        if (TE->hasState() && TE->isAltShuffle() &&
            ((TE->getOpcode() == E->getOpcode() &&
              TE->getAltOpcode() == E->getAltOpcode()) ||
             (TE->getOpcode() == E->getAltOpcode() &&
              TE->getAltOpcode() == E->getOpcode())) &&
            TE->hasEqualOperands(*E))
          return true;
      }
      return false;
    };
    auto GetScalarCost = [&](unsigned Idx) {
      // ...
      auto *VI = cast<Instruction>(UniqueValues[Idx]);
      assert(E->getMatchingMainOpOrAltOp(VI) &&
             "Unexpected main/alternate opcode");
      // ...
      return TTI->getInstructionCost(VI, CostKind);
    };
    auto GetVectorCost = [&, &TTIRef = *TTI](InstructionCost) {
      InstructionCost VecCost = 0;
      if (TryFindNodeWithEqualOperands()) {
        LLVM_DEBUG({
          dbgs() << "SLP: diamond match for alternate node found.\n";
          E->dump();
        });
        // Same main/alternate vector ops are reused, only the shuffling
        // differs; no new vector op costs to add.
      } else if (Instruction::isBinaryOp(E->getOpcode())) {
        VecCost =
            TTIRef.getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
        VecCost +=
            TTIRef.getArithmeticInstrCost(E->getAltOpcode(), VecTy, CostKind);
      } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
        VecCost = TTIRef.getCmpSelInstrCost(
            E->getOpcode(), VecTy, MaskTy, CI0->getPredicate(), CostKind,
            {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
            VL0);
        VecCost += TTIRef.getCmpSelInstrCost(
            E->getOpcode(), VecTy, MaskTy,
            cast<CmpInst>(E->getAltOp())->getPredicate(), CostKind,
            {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
            E->getAltOp());
      } else {
        Type *SrcSclTy = E->getMainOp()->getOperand(0)->getType();
        auto *SrcTy = getWidenedType(SrcSclTy, VL.size());
        if (SrcSclTy->isIntegerTy() && ScalarTy->isIntegerTy()) {
          auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
          unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
          unsigned SrcBWSz =
              DL->getTypeSizeInBits(E->getMainOp()->getOperand(0)->getType());
          if (SrcIt != MinBWs.end()) {
            SrcBWSz = SrcIt->second.first;
            // ...
          }
          if (BWSz <= SrcBWSz) {
            if (BWSz < SrcBWSz)
              VecCost =
                  TTIRef.getCastInstrCost(Instruction::Trunc, VecTy, SrcTy,
                                          TTI::CastContextHint::None, CostKind);
            LLVM_DEBUG({
              dbgs()
                  << "SLP: alternate extension, which should be truncated.\n";
              E->dump();
            });
            return VecCost;
          }
        }
        VecCost = TTIRef.getCastInstrCost(E->getOpcode(), VecTy, SrcTy,
                                          TTI::CastContextHint::None, CostKind);
        VecCost +=
            TTIRef.getCastInstrCost(E->getAltOpcode(), VecTy, SrcTy,
                                    TTI::CastContextHint::None, CostKind);
      }
      SmallVector<int> Mask;
      E->buildAltOpShuffleMask(
          [&](Instruction *I) {
            assert(E->getMatchingMainOpOrAltOp(I) &&
                   "Unexpected main/alternate opcode");
            // ...
          },
          Mask);
      // ...
      unsigned Opcode0 = E->getOpcode();
      unsigned Opcode1 = E->getAltOpcode();
      SmallBitVector OpcodeMask(
          getAltInstrMask(E->Scalars, Opcode0, Opcode1));
      // If this pattern is supported by the target, prefer it when cheaper.
      if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
        InstructionCost AltVecCost = TTIRef.getAltInstrCost(
            VecTy, Opcode0, Opcode1, OpcodeMask, CostKind);
        return AltVecCost < VecCost ? AltVecCost : VecCost;
      }
      return VecCost;
    };
    if (SLPReVec && !E->isAltShuffle())
      return GetCostDiff(
          GetScalarCost, [&](InstructionCost) -> InstructionCost {
            // If the group uses its mask in order, the shufflevector is
            // eliminated by instcombine; then the cost is 0.
            assert(isa<ShuffleVectorInst>(VL.front()) &&
                   "Not supported shufflevector usage.");
            auto *SV = cast<ShuffleVectorInst>(VL.front());
            unsigned SVNumElements =
                cast<FixedVectorType>(SV->getOperand(0)->getType())
                    ->getNumElements();
            unsigned GroupSize = SVNumElements / SV->getShuffleMask().size();
            for (size_t I = 0, End = VL.size(); I != End; I += GroupSize) {
              int NextIndex = 0;
              // ...
                assert(isa<ShuffleVectorInst>(V) &&
                       "Not supported shufflevector usage.");
                auto *SV = cast<ShuffleVectorInst>(V);
                int Index;
                [[maybe_unused]] bool IsExtractSubvectorMask =
                    SV->isExtractSubvectorMask(Index);
                assert(IsExtractSubvectorMask &&
                       "Not supported shufflevector usage.");
                if (NextIndex != Index)
                  // ...
                NextIndex += SV->getShuffleMask().size();
              // ...
            }
            return ::getShuffleCost(/* ... */);
          });
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::Freeze:
    return CommonCost;
  default:
    llvm_unreachable("Unknown instruction");
  }
}

bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
  LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
                    << VectorizableTree.size() << " is fully vectorizable .\n");
  auto &&AreVectorizableGathers = [this](const TreeEntry *TE, unsigned Limit) {
    SmallVector<int> Mask;
    return TE->isGather() &&
           !any_of(TE->Scalars,
                   [this](Value *V) { return EphValues.contains(V); }) &&
           (allConstant(TE->Scalars) || isSplat(TE->Scalars) ||
            TE->Scalars.size() < Limit ||
            (((TE->hasState() &&
               TE->getOpcode() == Instruction::ExtractElement) ||
              all_of(TE->Scalars, IsaPred<ExtractElementInst, UndefValue>)) &&
             isFixedVectorShuffle(TE->Scalars, Mask, AC)) ||
            (TE->hasState() && TE->getOpcode() == Instruction::Load &&
             !TE->isAltShuffle()) ||
            any_of(TE->Scalars, IsaPred<LoadInst>));
  };
  // We only handle trees of heights 1 and 2.
  if (VectorizableTree.size() == 1 &&
      (VectorizableTree[0]->State == TreeEntry::Vectorize ||
       VectorizableTree[0]->State == TreeEntry::StridedVectorize ||
       VectorizableTree[0]->State == TreeEntry::CompressVectorize ||
       (ForReduction &&
        AreVectorizableGathers(VectorizableTree[0].get(),
                               VectorizableTree[0]->Scalars.size()) &&
        VectorizableTree[0]->getVectorFactor() > 2)))
    return true;

  if (VectorizableTree.size() != 2)
    return false;

  // Handle splat and all-constants stores. Also try to vectorize tiny trees
  // with the second gather node if it can be vectorized.
  if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
      AreVectorizableGathers(VectorizableTree[1].get(),
                             VectorizableTree[0]->Scalars.size()))
    return true;

  if (VectorizableTree[0]->isGather() ||
      (VectorizableTree[1]->isGather() &&
       VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
       VectorizableTree[0]->State != TreeEntry::StridedVectorize &&
       VectorizableTree[0]->State != TreeEntry::CompressVectorize))
    return false;

  return true;
}
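// The next helpers walk an (or | shl | zext load) chain upwards to detect
// bswap/bitfield-merge patterns that the backend turns into a single wide
// load; vectorizing those would block load combining, so SLP gives up on
// them.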
static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts,
                                       TargetTransformInfo *TTI,
                                       bool MustMatchOrInst) {
  // Look past the root to find a source value, following operand 0 of any
  // 'or' and peeking through optional shift-left-by-multiple-of-8-bits.
  Value *ZextLoad = Root;
  const APInt *ShAmtC;
  bool FoundOr = false;
  while (!isa<ConstantExpr>(ZextLoad) &&
         (match(ZextLoad, m_Or(m_Value(), m_Value())) ||
          (match(ZextLoad, m_Shl(m_Value(), m_APInt(ShAmtC))) &&
           ShAmtC->urem(8) == 0))) {
    auto *BinOp = cast<BinaryOperator>(ZextLoad);
    ZextLoad = BinOp->getOperand(0);
    if (BinOp->getOpcode() == Instruction::Or)
      FoundOr = true;
  }
  // Check if the input is an extended load of the required expression.
  if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root ||
      !match(ZextLoad, m_ZExt(m_Value(Load))) || !isa<LoadInst>(Load))
    return false;
  // Require that the total load bit width is a legal integer type.
  Type *SrcTy = Load->getType();
  unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts;
  if (!TTI->isTypeLegal(IntegerType::get(Root->getContext(), LoadBitWidth)))
    return false;
  LLVM_DEBUG(dbgs() << "SLP: Assume load combining for tree starting at "
                    << *(cast<Instruction>(Root)) << "\n");
  return true;
}

// (isLoadCombineReductionCandidate)
  unsigned NumElts = VectorizableTree[0]->Scalars.size();
  Value *FirstReduced = VectorizableTree[0]->Scalars[0];
  return isLoadCombineCandidateImpl(FirstReduced, NumElts, TTI,
                                    /* MatchOr */ false);

// (isLoadCombineCandidate) Peek through a final sequence of stores and check
// that all operations are likely to be load-combined.
  unsigned NumElts = Stores.size();
  for (Value *Scalar : Stores) {
    // ...
  }
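// isTreeTinyAndNotFullyVectorizable rejects graphs that are mostly gathers
// or too small to amortize the vectorization overhead, with carve-outs for
// split nodes, gathered loads, phi-only trees and reduction roots.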
  // No need to vectorize inserts of gathered values.
  if (VectorizableTree.empty()) {
    assert(ExternalUses.empty() && "We shouldn't have any external users");
    return true;
  }
  // ...
  if (VectorizableTree.size() == 2 &&
      // ...
      VectorizableTree[1]->isGather() &&
      (VectorizableTree[1]->getVectorFactor() <= 2 ||
       !(isSplat(VectorizableTree[1]->Scalars) ||
         // ...
  // ...
  constexpr int Limit = 4;
  if (// ...
      !VectorizableTree.empty() &&
      all_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
        return (TE->isGather() &&
                (!TE->hasState() ||
                 TE->getOpcode() != Instruction::ExtractElement) &&
                // ...
               (TE->hasState() && TE->getOpcode() == Instruction::PHI);
      }))
    return true;
  // ...
  if (// ...
      VectorizableTree.size() <= Limit &&
      all_of(VectorizableTree,
             [&](const std::unique_ptr<TreeEntry> &TE) {
               return (TE->isGather() &&
                       (!TE->hasState() ||
                        TE->getOpcode() != Instruction::ExtractElement) &&
                       // ...
                      (TE->getOpcode() == Instruction::InsertElement ||
                       (TE->getOpcode() == Instruction::PHI &&
                        all_of(TE->Scalars, [&](Value *V) {
                          return isa<PoisonValue>(V) || MustGather.contains(V);
                        })));
             }) &&
      any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
        return TE->State == TreeEntry::Vectorize &&
               TE->getOpcode() == Instruction::PHI;
      }))
    return true;
  // ...
  unsigned NumGathers = 0;
  constexpr int LimitTreeSize = 36;
  if (// ...
      all_of(VectorizableTree,
             [&](const std::unique_ptr<TreeEntry> &TE) {
               if (!TE->isGather() && TE->hasState() &&
                   (TE->getOpcode() == Instruction::Load ||
                    TE->getOpcode() == Instruction::Store)) {
                 // ...
               }
               if (TE->isGather())
                 ++NumGathers;
               return TE->State == TreeEntry::SplitVectorize ||
                      (TE->Idx == 0 && TE->Scalars.size() == 2 &&
                       TE->hasState() && TE->getOpcode() == Instruction::ICmp &&
                       VectorizableTree.size() > LimitTreeSize) ||
                      // ...
                      (TE->getOpcode() == Instruction::PHI ||
                       (TE->hasCopyableElements() &&
                        // ...
                        TE->Scalars.size() / 2) ||
                       ((!TE->ReuseShuffleIndices.empty() ||
                         !TE->ReorderIndices.empty() || TE->isAltShuffle()) &&
                        TE->Scalars.size() == 2));
             }) &&
      (StoreLoadNodes.empty() ||
       (VectorizableTree.size() > LimitTreeSize * StoreLoadNodes.size() &&
        (NumGathers > 0 || none_of(StoreLoadNodes, [&](const TreeEntry *TE) {
           return TE->getOpcode() == Instruction::Store ||
                  all_of(TE->Scalars, [&](Value *V) {
                    return !isa<LoadInst>(V) ||
                           areAllUsersVectorized(cast<Instruction>(V));
                  });
         })))))
    return true;
  // ...
  if (VectorizableTree.front()->State == TreeEntry::SplitVectorize &&
      VectorizableTree.size() >= Limit &&
      none_of(// ...
              [&](const std::unique_ptr<TreeEntry> &TE) {
                return !TE->isGather() && TE->UserTreeIndex.UserTE &&
                       TE->UserTreeIndex.UserTE->Idx == 0;
              }))
    return true;
  // ...
  if (VectorizableTree.size() > 2 &&
      VectorizableTree.front()->State == TreeEntry::Vectorize &&
      VectorizableTree.front()->getOpcode() == Instruction::InsertElement &&
      VectorizableTree[1]->State == TreeEntry::Vectorize &&
      VectorizableTree[1]->getOpcode() == Instruction::PHI &&
      all_of(ArrayRef(VectorizableTree).drop_front(2),
             [&](const std::unique_ptr<TreeEntry> &TE) { return TE->isGather(); }))
    return true;
  // ...
  if (isFullyVectorizableTinyTree(ForReduction))
    return false;
  // Check if any of the gather nodes forms an insertelement buildvector
  // somewhere.
  bool IsAllowedSingleBVNode =
      VectorizableTree.size() > 1 ||
      (VectorizableTree.size() == 1 && VectorizableTree.front()->hasState() &&
       !VectorizableTree.front()->isAltShuffle() &&
       VectorizableTree.front()->getOpcode() != Instruction::PHI &&
       VectorizableTree.front()->getOpcode() != Instruction::GetElementPtr &&
       // ...
  if (any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
        return TE->isGather() && all_of(TE->Scalars, [&](Value *V) {
                 return isa<ExtractElementInst, Constant>(V) ||
                        (IsAllowedSingleBVNode &&
                         !V->hasNUsesOrMore(UsesLimit) &&
                         any_of(V->users(), IsaPred<InsertElementInst>));
               });
      }))
    return true;
  if (VectorizableTree.back()->isGather() &&
      VectorizableTree.back()->hasState() &&
      VectorizableTree.back()->isAltShuffle() &&
      VectorizableTree.back()->getVectorFactor() > 2 &&
      // ...
      !VectorizableTree.back()->Scalars.front()->getType()->isVectorTy() &&
      TTI->getScalarizationOverhead(
          getWidenedType(VectorizableTree.back()->Scalars.front()->getType(),
                         VectorizableTree.back()->getVectorFactor()),
          // ...
    return true;
  // ...
  constexpr unsigned SmallTree = 3;
  if (VectorizableTree.front()->isNonPowOf2Vec() &&
      // ...
      [](const std::unique_ptr<TreeEntry> &TE) {
        return TE->isGather() && TE->hasState() &&
               TE->getOpcode() == Instruction::Load &&
               // ...
  // ...
    TreeEntry &E = *VectorizableTree[Idx];
    if (E.State == TreeEntry::SplitVectorize)
      // ...
    if ((E.hasState() && E.getOpcode() != Instruction::Load) ||
        // ...
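// getSpillCost estimates register-pressure damage: walking the tree from the
// root, every vector value that stays live across a call that is not a
// cheap intrinsic adds the target's cost of keeping it live over the call
// (and the corresponding scalar costs are credited back).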
  const TreeEntry *Root = VectorizableTree.front().get();
  if (Root->isGather())
    // ...
  // Map entries to their operands and to the last instruction of their
  // bundles.
  for (const auto &TEPtr : VectorizableTree) {
    if (!TEPtr->isGather()) {
      Instruction *LastInst = &getLastInstructionInBundle(TEPtr.get());
      EntriesToLastInstruction.try_emplace(TEPtr.get(), LastInst);
      LastInstructions.insert(LastInst);
    }
    if (TEPtr->UserTreeIndex)
      EntriesToOperands[TEPtr->UserTreeIndex.UserTE].push_back(TEPtr.get());
  }
  // ...
    if (II->isAssumeLikeIntrinsic())
      // ...
    return IntrCost < CallCost;
  // ...
      CheckedInstructions;
  unsigned Budget = 0;
  const unsigned BudgetLimit =
      // ...
  auto CheckForNonVecCallsInSameBlock = [&](Instruction *First,
                                            const Instruction *Last) {
    assert(First->getParent() == Last->getParent() &&
           "Expected instructions in same block.");
    if (auto It = CheckedInstructions.find(Last);
        It != CheckedInstructions.end()) {
      const Instruction *Checked = It->second.getPointer();
      // ...
        return It->second.getInt() != 0;
    }
    // ...
    auto InstIt = ++First->getIterator().getReverse();
    auto PrevInstIt = Last->getIterator().getReverse();
    while (InstIt != PrevInstIt && Budget <= BudgetLimit) {
      // ...
        for (const Instruction *LastInst : LastInstsInRange)
          CheckedInstructions.try_emplace(LastInst, &*PrevInstIt, 0);
      // ...
      if (LastInstructions.contains(&*PrevInstIt))
        LastInstsInRange.push_back(&*PrevInstIt);
      // ...
    }
    for (const Instruction *LastInst : LastInstsInRange)
      CheckedInstructions.try_emplace(
          LastInst, PrevInstIt == InstIt ? First : &*PrevInstIt,
          Budget <= BudgetLimit ? 1 : 0);
    return Budget <= BudgetLimit;
  };
  auto AddCosts = [&](const TreeEntry *Op) {
    Type *ScalarTy = Op->Scalars.front()->getType();
    auto It = MinBWs.find(Op);
    if (It != MinBWs.end())
      ScalarTy = IntegerType::get(ScalarTy->getContext(), It->second.first);
    // ...
    Cost += TTI->getCostOfKeepingLiveOverCall(VecTy);
    // ...
    Cost -= Op->Scalars.size() * TTI->getCostOfKeepingLiveOverCall(ScalarTy);
  };
  // ...
      ParentOpParentToPreds;
  auto CheckPredecessors = [&](BasicBlock *Root, BasicBlock *Pred,
                               BasicBlock *OpParent) {
    auto Key = std::make_pair(Root, OpParent);
    if (auto It = ParentOpParentToPreds.find(Key);
        It != ParentOpParentToPreds.end())
      // ...
    for (const auto &KeyPair : ParentsPairsToAdd) {
      assert(!ParentOpParentToPreds.contains(KeyPair) &&
             "Should not have been added before.");
      // ...
    }
    while (!Worklist.empty()) {
      // ...
      if (BB == OpParent || !Visited.insert(BB).second)
        continue;
      auto Pair = std::make_pair(BB, OpParent);
      if (auto It = ParentOpParentToPreds.find(Pair);
          It != ParentOpParentToPreds.end()) {
        // ...
      }
      ParentsPairsToAdd.insert(Pair);
      // ...
      if (Budget > BudgetLimit)
        // ...
    }
    // ...
  };
  // ...
  while (!LiveEntries.empty()) {
    // ...
    if (Operands.empty())
      continue;
    Instruction *LastInst = EntriesToLastInstruction.at(Entry);
    // ...
    for (const TreeEntry *Op : Operands) {
      if (!Op->isGather())
        // ...
      if (Entry->State == TreeEntry::SplitVectorize ||
          (Entry->getOpcode() != Instruction::PHI && Op->isGather()) ||
          // ...
        Pred = Phi->getIncomingBlock(Op->UserTreeIndex.EdgeIdx);
      // ...
      if (Op->isGather()) {
        assert(Entry->getOpcode() == Instruction::PHI &&
               "Expected phi node only.");
        OpParent = cast<PHINode>(Entry->getMainOp())
                       ->getIncomingBlock(Op->UserTreeIndex.EdgeIdx);
        // ...
        for (Value *V : Op->Scalars) {
          // ...
        }
      } else {
        OpLastInst = EntriesToLastInstruction.at(Op);
        // ...
      }
      // Check the call instructions within the same basic blocks.
      if (OpParent == Parent) {
        if (Entry->getOpcode() == Instruction::PHI) {
          if (!CheckForNonVecCallsInSameBlock(LastInst, OpLastInst))
            AddCosts(Op);
          continue;
        }
        if (!CheckForNonVecCallsInSameBlock(OpLastInst, LastInst))
          AddCosts(Op);
        continue;
      }
      // Check for calls in between blocks: entry's block from the head...
      if (Entry->getOpcode() != Instruction::PHI &&
          !CheckForNonVecCallsInSameBlock(
              &*LastInst->getParent()->getFirstNonPHIOrDbgOrAlloca(),
              LastInst)) {
        AddCosts(Op);
        continue;
      }
      // ...op's block from the end...
      if (!CheckForNonVecCallsInSameBlock(OpLastInst,
                                          OpParent->getTerminator())) {
        AddCosts(Op);
        continue;
      }
      // ...and the predecessors of entry's block till op's block.
      if (!CheckPredecessors(Parent, Pred, OpParent)) {
        AddCosts(Op);
        continue;
      }
    }
  }
// Part of areTwoInsertFromSameBuildVector: walk both insertelement chains
// upwards until a common base (or a mismatch) is found.
  const auto *I1 = IE1;
  const auto *I2 = IE2;
  // ...
  do {
    // ...
    if (I1 && (I1 == IE1 || I1->hasOneUse()) &&
        // ...
    if (I2 && ((I2 == IE2 || I2->hasOneUse())) &&
        // ...
  } while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2));
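// performExtractsShuffleAction composes the per-tree-entry masks recorded
// for shuffled inserts into a chain of at-most-two-source shuffles,
// resizing inputs through ResizeAction when their vector factors differ and
// combining partial results with Action.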
namespace {
/// Returns incoming Value *, if the requested type is Value * too, or a
/// default value, otherwise.
struct ValueSelect {
  template <typename U>
  static std::enable_if_t<std::is_same_v<Value *, U>, Value *> get(Value *V) {
    return V;
  }
  template <typename U>
  static std::enable_if_t<!std::is_same_v<Value *, U>, U> get(Value *) {
    return U();
  }
};
} // namespace

template <typename T>
static T *performExtractsShuffleAction(
    MutableArrayRef<std::pair<T *, SmallVector<int>>> ShuffleMask, Value *Base,
    function_ref<unsigned(T *)> GetVF,
    function_ref<std::pair<T *, bool>(T *, MutableArrayRef<int>, bool)>
        ResizeAction,
    function_ref<T *(MutableArrayRef<int>, ArrayRef<T *>)> Action) {
  assert(!ShuffleMask.empty() && "Empty list of shuffles for inserts.");
  SmallVector<int> Mask(ShuffleMask.begin()->second);
  auto VMIt = std::next(ShuffleMask.begin());
  T *Prev = nullptr;
  SmallBitVector IsBaseUndef = isUndefVector(
      Base, buildUseMask(Mask.size(), Mask, UseMask::UndefsAsMask));
  if (!IsBaseUndef.all()) {
    // Base is not undef, need to combine it with the next subvectors.
    std::pair<T *, bool> Res =
        ResizeAction(ShuffleMask.begin()->first, Mask, /*ForSingleMask=*/false);
    for (unsigned Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) {
      // ...
        Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF;
    }
    [[maybe_unused]] auto *V = ValueSelect::get<T *>(Base);
    assert((!V || GetVF(V) == Mask.size()) &&
           "Expected base vector of VF number of elements.");
    Prev = Action(Mask, {nullptr, Res.first});
  } else if (ShuffleMask.size() == 1) {
    // Base is undef and only one vector is shuffled - perform the action only
    // for a single vector, if the mask is not the identity mask.
    std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask,
                                            /*ForSingleMask=*/true);
    if (Res.second)
      // Identity mask is found.
      Prev = Res.first;
    else
      Prev = Action(Mask, {ShuffleMask.begin()->first});
  } else {
    // Base is undef and at least two input vectors are shuffled - perform the
    // shuffles two vectors at a time, combining between the steps.
    unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);
    unsigned Vec2VF = GetVF(VMIt->first);
    if (Vec1VF == Vec2VF) {
      // No need to resize the input vectors.
      ArrayRef<int> SecMask = VMIt->second;
      for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
        if (SecMask[I] != PoisonMaskElem) {
          assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
          Mask[I] = SecMask[I] + Vec1VF;
        }
      }
      Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
    } else {
      // Vectors of different sizes - resize and reshuffle.
      std::pair<T *, bool> Res1 = ResizeAction(ShuffleMask.begin()->first, Mask,
                                               /*ForSingleMask=*/false);
      std::pair<T *, bool> Res2 =
          ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
      ArrayRef<int> SecMask = VMIt->second;
      for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
        // ...
          Mask[I] = (Res2.second ? I : SecMask[I]) + VF;
        // ...
      }
      Prev = Action(Mask, {Res1.first, Res2.first});
    }
    VMIt = std::next(VMIt);
  }
  [[maybe_unused]] bool IsBaseNotUndef = !IsBaseUndef.all();
  // Perform the requested actions for the remaining masks/vectors.
  for (auto E = ShuffleMask.end(); VMIt != E; ++VMIt) {
    // Shuffle other input vectors, if any.
    std::pair<T *, bool> Res =
        ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
    ArrayRef<int> SecMask = VMIt->second;
    for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
      if (SecMask[I] != PoisonMaskElem) {
        assert((Mask[I] == PoisonMaskElem || IsBaseNotUndef) &&
               "Multiple uses of scalars.");
        Mask[I] = (Res.second ? I : SecMask[I]) + VF;
      }
    }
    Prev = Action(Mask, {Prev, Res.first});
  }
  return Prev;
}
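// The cost computation below first sums the per-entry costs, then repeatedly
// tries to trim the most expensive subtrees: if re-gathering a subtree's
// scalars is cheaper than its accumulated vector cost, the subtree is turned
// back into a gather node and the costs are recomputed.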
  // ...
  LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
                    << VectorizableTree.size() << ".\n");
  // ...
  for (const std::unique_ptr<TreeEntry> &Ptr : VectorizableTree) {
    TreeEntry &TE = *Ptr;
    // No need to count the cost for combined entries; they are combined and
    // just skip their cost.
    if (TE.State == TreeEntry::CombinedVectorize) {
      LLVM_DEBUG({
        dbgs() << "SLP: Skipping cost for combined node that starts with "
               << *TE.Scalars[0] << ".\n";
        TE.dump();
        dbgs() << "SLP: Current total cost = " << Cost << "\n";
      });
      continue;
    }
    if (TE.hasState() &&
        (TE.isGather() || TE.State == TreeEntry::SplitVectorize)) {
      if (const TreeEntry *E =
              getSameValuesTreeEntry(TE.getMainOp(), TE.Scalars);
          E && E->getVectorFactor() == TE.getVectorFactor()) {
        // ...
        LLVM_DEBUG(dbgs() << "SLP: Current total cost = " << Cost << "\n");
        continue;
      }
    }
    // ...
    assert((!TE.isGather() || TE.Idx == 0 || TE.UserTreeIndex) &&
           "Expected gather nodes with users only.");
    // ...
    LLVM_DEBUG(dbgs() << "SLP: Current total cost = " << Cost << "\n");
    if (TE.Idx > 0 && !TE.UserTreeIndex && TE.hasState() &&
        TE.getOpcode() == Instruction::Load)
      GatheredLoadsNodes.insert(&TE);
  }
  // ...
  if (!GatheredLoadsNodes.empty())
    // ...
  constexpr unsigned PartLimit = 2;
  const unsigned Sz =
      // ...
  const unsigned MinVF = getMinVF(Sz);
  if (VectorizableTree.front()->Scalars.size() * PartLimit <= MinVF &&
      (!VectorizableTree.front()->hasState() ||
       (VectorizableTree.front()->getOpcode() != Instruction::Store &&
        LI->getLoopFor(VectorizableTree.front()->getMainOp()->getParent()))))
    // ...
  // Accumulate per-subtree costs: each entry's cost is added to all of its
  // transitive users.
  SmallVector<std::pair<InstructionCost, SmallVector<unsigned>>> SubtreeCosts(
      VectorizableTree.size());
  for (const std::unique_ptr<TreeEntry> &Ptr : VectorizableTree) {
    TreeEntry &TE = *Ptr;
    // ...
    SubtreeCosts[TE.Idx].first += C;
    const TreeEntry *UserTE = TE.UserTreeIndex.UserTE;
    while (UserTE) {
      SubtreeCosts[UserTE->Idx].first += C;
      SubtreeCosts[UserTE->Idx].second.push_back(TE.Idx);
      UserTE = UserTE->UserTreeIndex.UserTE;
    }
  }
  using CostIndicesTy =
      std::pair<TreeEntry *, std::pair<InstructionCost, SmallVector<unsigned>>>;
  struct FirstGreater {
    bool operator()(const CostIndicesTy &LHS, const CostIndicesTy &RHS) const {
      return LHS.second.first < RHS.second.first ||
             (LHS.second.first == RHS.second.first &&
              LHS.first->Idx < RHS.first->Idx);
    }
  };
  // ...
  for (const auto [Idx, P] : enumerate(SubtreeCosts))
    Worklist.emplace(VectorizableTree[Idx].get(), P);
  // ...
  if (!UserIgnoreList && VectorizableTree.front()->getVectorFactor() < MinVF &&
      VectorizableTree.front()->hasState() &&
      VectorizableTree.front()->getOpcode() == Instruction::Store &&
      (Worklist.top().first->Idx == 0 || Worklist.top().first->Idx == 1))
    // ...
  while (!Worklist.empty() && Worklist.top().second.first > 0) {
    TreeEntry *TE = Worklist.top().first;
    if (TE->isGather() || TE->Idx == 0 || DeletedNodes.contains(TE)) {
      // ...
    }
    // Too cheap to be worth trimming.
    if (SubtreeCost < TE->Scalars.size()) {
      // ...
    }
    if (!TransformedToGatherNodes.empty()) {
      for (unsigned Idx : Worklist.top().second.second) {
        auto It = TransformedToGatherNodes.find(VectorizableTree[Idx].get());
        if (It != TransformedToGatherNodes.end()) {
          SubtreeCost -= SubtreeCosts[Idx].first;
          SubtreeCost += It->second;
        }
      }
    }
    if (SubtreeCost < 0 || SubtreeCost < TE->Scalars.size()) {
      // ...
    }
    // Estimate the cost of gathering this subtree's scalars instead.
    const unsigned Sz = TE->Scalars.size();
    for (auto [Idx, V] : enumerate(TE->Scalars)) {
      // ...
    }
    const unsigned EntryVF = TE->getVectorFactor();
    // ...
        *TTI, ScalarTy, VecTy, DemandedElts,
    // ...
    if (!TE->ReorderIndices.empty() &&
        TE->State != TreeEntry::CompressVectorize &&
        (TE->State != TreeEntry::StridedVectorize ||
         // ...
      if (TE->getOpcode() == Instruction::Store) {
        NewMask.resize(TE->ReorderIndices.size());
        copy(TE->ReorderIndices, NewMask.begin());
      }
      // ...
    }
    if (!TE->ReuseShuffleIndices.empty())
      ::addMask(Mask, TE->ReuseShuffleIndices);
    // ...
        return (TE->hasCopyableElements() && TE->isCopyableElement(V)) ||
               isConstant(V) || isGathered(V) || getTreeEntries(V).size() > 1;
    // ...
    if (SubtreeCost > GatherCost) {
      // ...
      if (VectorizableTree.front()->hasState() &&
          VectorizableTree.front()->getOpcode() == Instruction::InsertElement &&
          // ...
      LLVM_DEBUG(dbgs() << "SLP: Trimming unprofitable subtree at node "
                        << TE->Idx << " with cost "
                        << Worklist.top().second.first << " and gather cost "
                        << GatherCost << ".\n");
      if (TE->UserTreeIndex) {
        TransformedToGatherNodes.try_emplace(TE, GatherCost);
        NodesCosts.erase(TE);
      } else {
        DeletedNodes.insert(TE);
        TransformedToGatherNodes.erase(TE);
        NodesCosts.erase(TE);
      }
      for (unsigned Idx : Worklist.top().second.second) {
        TreeEntry &ChildTE = *VectorizableTree[Idx];
        DeletedNodes.insert(&ChildTE);
        TransformedToGatherNodes.erase(&ChildTE);
        NodesCosts.erase(&ChildTE);
      }
    }
    // ...
  }
  // ...
    return SubtreeCosts.front().first;
  for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    if (!TE->UserTreeIndex && TransformedToGatherNodes.contains(TE.get())) {
      assert(TE->getOpcode() == Instruction::Load && "Expected load only.");
      // ...
    }
    if (DeletedNodes.contains(TE.get()))
      continue;
    if (!NodesCosts.contains(TE.get())) {
      // ...
          getEntryCost(TE.get(), VectorizedVals, CheckedExtracts);
      // ...
    }
  }
  LLVM_DEBUG(dbgs() << "SLP: Recalculate costs after tree trimming.\n");
  InstructionCost NewCost = 0;
  for (const auto &P : NodesCosts) {
    NewCost += P.second;
    LLVM_DEBUG(dbgs() << "SLP: Adding cost " << P.second << " for bundle "
                      // ...
                      << "SLP: Current total cost = " << Cost << "\n");
  }
  if (NewCost >= Cost) {
    DeletedNodes.clear();
    TransformedToGatherNodes.clear();
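// The remaining terms account for values escaping the tree: every
// externally used scalar pays for an extractelement (or stays scalar when
// that is cheaper), and insertelement chains consuming vectorized values are
// rebuilt as shuffles through ShuffledInsertData below.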
template <typename T> struct ShuffledInsertData {
  /// List of insertelements to be replaced by shuffles.
  SmallVector<InsertElementInst *> InsertElements;
  /// The parent vectors and shuffle mask for the given list of inserts.
  MapVector<T, SmallVector<int>> ValueMasks;
};

InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals
                                     /* ... */) {
  // ...
  if (// ...
      none_of(ExternalUses, [](const ExternalUser &EU) {
        // ...
      }))
    // ...
  InstructionCost ExtractCost = 0;
  SmallVector<ShuffledInsertData<const TreeEntry *>> ShuffledInserts;
  // ...
  std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
  // Keep track of {Scalar, Index, User} tuples: on some targets (e.g.
  // AArch64) this helps fuse a mov, associated with an extractelement, with
  // an fmul in the backend so that the extractelement becomes free.
  for (ExternalUser &EU : ExternalUses) {
    ScalarUserAndIdx.emplace_back(EU.Scalar, EU.User, EU.Lane);
  }
  for (ExternalUser &EU : ExternalUses) {
    LLVM_DEBUG(dbgs() << "SLP: Computing cost for external use of TreeEntry "
                      << EU.E.Idx << " in lane " << EU.Lane << "\n");
    LLVM_DEBUG(/* ... */ else dbgs() << " User: nullptr\n");
    LLVM_DEBUG(dbgs() << " Use: " << EU.Scalar->getNameOrAsOperand() << "\n");
    // Uses by ephemeral values are free.
    if (EphValues.count(EU.User))
      continue;
    // Check if the scalar for this user is already counted.
    if (!CheckedScalarUser.insert(std::make_pair(EU.Scalar, EU.User)).second ||
        (EU.User &&
         CheckedScalarUser.contains(std::make_pair(EU.Scalar, nullptr))))
      continue;
    // ...
    if (// ...
        (!DT->isReachableFromEntry(UserParent) || UserParent->isEHPad() ||
         // ...
        !ExtractCostCalculated.insert(EU.Scalar).second)
      continue;
    // ...
    // If the found user is an insertelement, do not calculate an extract cost
    // but try to detect it as part of a final shuffled/identity match.
    if (auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User)) {
      // ...
      if (!UsedInserts.insert(VU).second)
        continue;
      std::optional<unsigned> InsertIdx = getElementIndex(VU);
      if (InsertIdx) {
        const TreeEntry *ScalarTE = &EU.E;
        auto *It = find_if(
            ShuffledInserts,
            [this, VU](const ShuffledInsertData<const TreeEntry *> &Data) {
              // Checks if two insertelements are from the same buildvector.
              // ...
              Value *Op0 = II->getOperand(0);
              // ...
            });
        int VecId = -1;
        if (It == ShuffledInserts.end()) {
          auto &Data = ShuffledInserts.emplace_back();
          Data.InsertElements.emplace_back(VU);
          // ...
          VecId = ShuffledInserts.size() - 1;
          auto It = MinBWs.find(ScalarTE);
          if (It != MinBWs.end() &&
              VectorCasts
                  .insert(std::make_pair(ScalarTE, FTy->getElementType()))
                  .second) {
            unsigned BWSz = It->second.first;
            unsigned DstBWSz = DL->getTypeSizeInBits(FTy->getElementType());
            unsigned VecOpcode;
            if (DstBWSz < BWSz)
              VecOpcode = Instruction::Trunc;
            else
              VecOpcode =
                  It->second.second ? Instruction::SExt : Instruction::ZExt;
            // ...
                    FTy->getNumElements()),
            // ...
            LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
                              << " for extending externally used vector with "
                                 "non-equal minimum bitwidth.\n");
            Cost += C;
          }
        } else {
          // ...
            It->InsertElements.front() = VU;
          VecId = std::distance(ShuffledInserts.begin(), It);
        }
        int InIdx = *InsertIdx;
        SmallVectorImpl<int> &Mask =
            ShuffledInserts[VecId].ValueMasks[ScalarTE];
        // ...
        Mask[InIdx] = EU.Lane;
        DemandedElts[VecId].setBit(InIdx);
        continue;
      }
    }
    // If we plan to rewrite the tree in a smaller type, we will need to
    // sign-extend the extracted value back to the original type; account for
    // the extract and the added cost of the extend if needed.
    auto *ScalarTy = EU.Scalar->getType();
    const unsigned BundleWidth = EU.E.getVectorFactor();
    assert(EU.Lane < BundleWidth && "Extracted lane out of bounds.");
    // ...
    const TreeEntry *Entry = &EU.E;
    auto It = MinBWs.find(Entry);
    if (It != MinBWs.end()) {
      // ...
      unsigned Extend = /* ... */ ? Instruction::ZExt : Instruction::SExt;
      // ...
      LLVM_DEBUG(dbgs() << "ExtractExtend or ExtractSubvec cost: "
                        << ExtraCost << "\n");
    } else {
      ExtraCost = getVectorInstrCost(/* ... */,
                                     CostKind, EU.Lane, EU.Scalar,
                                     ScalarUserAndIdx);
      LLVM_DEBUG(dbgs() << " ExtractElement cost for " << *ScalarTy << " from "
                        << *VecTy << ": " << ExtraCost << "\n");
    }
    // Leave the scalar instructions as-is if they are cheaper than extracts.
    if (Entry->Idx != 0 || Entry->getOpcode() == Instruction::GetElementPtr ||
        Entry->getOpcode() == Instruction::Load) {
      // Checks if the user of the external scalar is a phi in a loop body.
      auto IsPhiInLoop = [&](const ExternalUser &U) {
        if (auto *Phi = dyn_cast_if_present<PHINode>(U.User)) {
          auto *I = cast<Instruction>(U.Scalar);
          const Loop *L = LI->getLoopFor(Phi->getParent());
          return L && (Phi->getParent() == I->getParent() ||
                       L == LI->getLoopFor(I->getParent()));
        }
        return false;
      };
      if (!ValueToExtUses) {
        ValueToExtUses.emplace();
        for (const auto &P : enumerate(ExternalUses)) {
          // Ignore phis in loops.
          if (IsPhiInLoop(P.value()))
            continue;
          ValueToExtUses->try_emplace(P.value().Scalar, P.index());
        }
      }
      // Can use the original instruction if no operands are vectorized or
      // they are already marked as externally used.
      auto OperandIsScalar = [&](Value *V) {
        if (!isVectorized(V)) {
          // Some extractelements might be not vectorized, but transformed
          // into a shuffle and removed from the function; consider it here.
          if (auto *EE = dyn_cast<ExtractElementInst>(V))
            return !EE->hasOneUse() || !MustGather.contains(EE);
          return true;
        }
        return ValueToExtUses->contains(V);
      };
      bool CanBeUsedAsScalar = all_of(Inst->operands(), OperandIsScalar);
      bool CanBeUsedAsScalarCast = false;
      if (auto *CI = dyn_cast<CastInst>(Inst); CI && !CanBeUsedAsScalar) {
        if (auto *Op = dyn_cast<Instruction>(CI->getOperand(0));
            Op && all_of(Op->operands(), OperandIsScalar)) {
          // ...
          if (ScalarCost + OpCost <= ExtraCost) {
            CanBeUsedAsScalar = CanBeUsedAsScalarCast = true;
            ScalarCost += OpCost;
          }
        }
      }
      if (CanBeUsedAsScalar) {
        bool KeepScalar = ScalarCost <= ExtraCost;
        // Try to keep the original scalar if the user is a phi node in the
        // same block as the currently vectorized root phis.
        bool IsProfitablePHIUser =
            (KeepScalar || (/* ... */ &&
                            VectorizableTree.front()->Scalars.size() > 2)) &&
            VectorizableTree.front()->hasState() &&
            VectorizableTree.front()->getOpcode() == Instruction::PHI &&
            // ...
            any_of(EU.Scalar->users(), [&](User *U) {
              auto *PHIUser = dyn_cast<PHINode>(U);
              return (!PHIUser ||
                      PHIUser->getParent() !=
                          cast<Instruction>(
                              VectorizableTree.front()->getMainOp())
                              ->getParent());
              // ...
            }) &&
            count_if(Entry->Scalars, [&](Value *V) {
              return ValueToExtUses->contains(V);
            }) <= 2;
        if (IsProfitablePHIUser) {
          KeepScalar = true;
        } else if (/* ... */ &&
                   (!GatheredLoadsEntriesFirst.has_value() ||
                    Entry->Idx < *GatheredLoadsEntriesFirst)) {
          unsigned ScalarUsesCount = count_if(Entry->Scalars, [&](Value *V) {
            return ValueToExtUses->contains(V);
          });
          auto It = ExtractsCount.find(Entry);
          if (It != ExtractsCount.end()) {
            assert(ScalarUsesCount >= It->getSecond().size() &&
                   "Expected total number of external uses not less than "
                   "number of scalar uses.");
            ScalarUsesCount -= It->getSecond().size();
          }
          // Keep the original scalar if the number of externally used
          // instructions in the same entry is not a power of 2.
          KeepScalar = ScalarUsesCount <= 1 || !has_single_bit(ScalarUsesCount);
        }
        if (KeepScalar) {
          ExternalUsesAsOriginalScalar.insert(EU.Scalar);
          for (Value *V : Inst->operands()) {
            auto It = ValueToExtUses->find(V);
            if (It != ValueToExtUses->end()) {
              // ...
              ExternalUses[It->second].User = nullptr;
            }
          }
          ExtraCost = ScalarCost;
          if (!IsPhiInLoop(EU))
            ExtractsCount[Entry].insert(Inst);
          if (CanBeUsedAsScalarCast) {
            ScalarOpsFromCasts.insert(Inst->getOperand(0));
            // ...
              for (Value *V : IOp->operands()) {
                auto It = ValueToExtUses->find(V);
                if (It != ValueToExtUses->end()) {
                  // ...
                  ExternalUses[It->second].User = nullptr;
                }
              }
          }
        }
      }
    }
    ExtractCost += ExtraCost;
  }
  // ...
  for (Value *V : ScalarOpsFromCasts) {
    ExternalUsesAsOriginalScalar.insert(V);
    if (ArrayRef<TreeEntry *> TEs = getTreeEntries(V); !TEs.empty()) {
      const auto *It = find_if_not(TEs, [&](TreeEntry *TE) {
        return TransformedToGatherNodes.contains(TE) ||
               DeletedNodes.contains(TE);
      });
      if (It != TEs.end()) {
        const TreeEntry *UserTE = *It;
        ExternalUses.emplace_back(V, nullptr, *UserTE,
                                  UserTE->findLaneForValue(V));
      }
    }
  }
  if (!VectorizedVals.empty()) {
    const TreeEntry &Root = *VectorizableTree.front();
    auto BWIt = MinBWs.find(&Root);
    if (BWIt != MinBWs.end()) {
      Type *DstTy = Root.Scalars.front()->getType();
      unsigned OriginalSz = DL->getTypeSizeInBits(DstTy->getScalarType());
      unsigned SrcSz =
          ReductionBitWidth == 0 ? BWIt->second.first : ReductionBitWidth;
      if (OriginalSz != SrcSz) {
        unsigned Opcode = Instruction::Trunc;
        if (OriginalSz > SrcSz)
          Opcode = BWIt->second.second ? Instruction::SExt : Instruction::ZExt;
        // ...
        Cost += TTI->getCastInstrCost(Opcode, DstTy, SrcTy,
                                      TTI::CastContextHint::None, CostKind);
      }
    }
  }
  // ...
  if (// ...
      VectorizableTree[1]->hasState() &&
      VectorizableTree[1]->State == TreeEntry::Vectorize &&
      all_of(VectorizableTree[1]->Scalars, [&](Value *V) {
        return ExternalUsesAsOriginalScalar.contains(V);
      }))
    // ...
  Cost += ExtractCost;
  auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE, ArrayRef<int> Mask,
                                    bool ForSingleMask) {
    InstructionCost C = 0;
    unsigned VF = Mask.size();
    unsigned VecVF = TE->getVectorFactor();
    bool HasLargeIndex =
        any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); });
    if ((VF != VecVF && HasLargeIndex) ||
        // ...
      if (HasLargeIndex) {
        SmallVector<int> OrigMask(VecVF, PoisonMaskElem);
        std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)),
                  OrigMask.begin());
        // ...
        LLVM_DEBUG({
          dbgs() << "SLP: Adding cost " << C
                 << " for final shuffle of insertelement external users.\n";
          TE->dump();
          dbgs() << "SLP: Current total cost = " << Cost << "\n";
        });
        Cost += C;
        return std::make_pair(TE, true);
      }
      // ...
      if (!ForSingleMask) {
        SmallVector<int> ResizeMask(VF, PoisonMaskElem);
        for (unsigned I = 0; I < VF; ++I) {
          if (Mask[I] != PoisonMaskElem)
            ResizeMask[Mask[I]] = Mask[I];
        }
        // ...
        LLVM_DEBUG({
          dbgs() << "SLP: Adding cost " << C
                 << " for final shuffle of insertelement external users.\n";
          TE->dump();
          dbgs() << "SLP: Current total cost = " << Cost << "\n";
        });
      }
    }
    return std::make_pair(TE, false);
  };
  for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
    Value *Base = ShuffledInserts[I].InsertElements.front()->getOperand(0);
    auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
    // ...
    auto EstimateShufflesCost = [&](ArrayRef<int> Mask,
                                    ArrayRef<const TreeEntry *> TEs) {
      assert((TEs.size() == 1 || TEs.size() == 2) &&
             "Expected exactly 1 or 2 tree entries.");
      if (TEs.size() == 1) {
        unsigned VF = TEs.front()->getVectorFactor();
        auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
        if (!all_of(enumerate(Mask), [=](const auto &Data) {
              return Data.value() == PoisonMaskElem ||
                     (Data.index() < VF &&
                      static_cast<int>(Data.index()) == Data.value());
            })) {
          // ...
          LLVM_DEBUG({
            dbgs() << "SLP: Adding cost " << C
                   << " for final shuffle of insertelement "
                      "external users.\n";
            TEs.front()->dump();
            dbgs() << "SLP: Current total cost = " << Cost << "\n";
          });
          Cost += C;
        }
      } else {
        unsigned VF = 0;
        if (// ...
            TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor())
          VF = TEs.front()->getVectorFactor();
        // ...
        auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
        // ...
        LLVM_DEBUG({
          dbgs() << "SLP: Adding cost " << C
                 << " for final shuffle of vector node and external "
                    "insertelement users.\n";
          if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump();
          dbgs() << "SLP: Current total cost = " << Cost << "\n";
        });
        Cost += C;
      }
      return TEs.back();
    };
    (void)performExtractsShuffleAction<const TreeEntry>(
        MutableArrayRef(Vector.data(), Vector.size()), Base,
        [](const TreeEntry *E) { return E->getVectorFactor(); }, ResizeToVF,
        EstimateShufflesCost);
    InstructionCost InsertCost = TTI->getScalarizationOverhead(
        cast<FixedVectorType>(
            ShuffledInserts[I].InsertElements.front()->getType()),
        DemandedElts[I], /*Insert=*/true, /*Extract=*/false, CostKind);
    Cost -= InsertCost;
  }
  // Add the cost for the reduced value resize (if required).
  if (ReductionBitWidth != 0) {
    assert(UserIgnoreList && "Expected reduction tree.");
    const TreeEntry &E = *VectorizableTree.front();
    auto It = MinBWs.find(&E);
    if (It != MinBWs.end() && It->second.first != ReductionBitWidth) {
      unsigned SrcSize = It->second.first;
      unsigned DstSize = ReductionBitWidth;
      unsigned Opcode = Instruction::Trunc;
      if (SrcSize < DstSize) {
        bool IsArithmeticExtendedReduction =
            all_of(*UserIgnoreList, [](Value *V) {
              auto *I = cast<Instruction>(V);
              return is_contained({Instruction::Add, Instruction::FAdd,
                                   Instruction::Mul, Instruction::FMul,
                                   Instruction::And, Instruction::Or,
                                   Instruction::Xor},
                                  I->getOpcode());
            });
        if (IsArithmeticExtendedReduction)
          Opcode = Instruction::BitCast; // Handled as free extend.
        else
          Opcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
      }
      if (Opcode != Instruction::BitCast) {
        auto *SrcVecTy =
            getWidenedType(Builder.getIntNTy(SrcSize), E.getVectorFactor());
        auto *DstVecTy =
            getWidenedType(Builder.getIntNTy(DstSize), E.getVectorFactor());
        TTI::CastContextHint CCH = TTI::CastContextHint::None;
        switch (E.getOpcode()) {
        case Instruction::SExt:
        case Instruction::ZExt:
        case Instruction::Trunc: {
          const TreeEntry *OpTE = getOperandEntry(&E, 0);
          CCH = getCastContextHint(*OpTE);
          break;
        }
        default:
          break;
        }
        CastCost += TTI->getCastInstrCost(Opcode, DstVecTy, SrcVecTy, CCH,
                                          CostKind);
        LLVM_DEBUG(dbgs() << "SLP: Adding cost " << CastCost
                          << " for final resize for reduction from " << SrcVecTy
                          << " to " << DstVecTy << "\n";
                   dbgs() << "SLP: Current total cost = " << Cost << "\n");
        Cost += CastCost;
      }
    }
  }
  std::optional<InstructionCost> SpillCost;
  // ...
    Cost += *SpillCost;
  // ...
  {
    raw_string_ostream OS(Str);
    OS << "SLP: Spill Cost = ";
    // ...
    OS << ".\nSLP: Extract Cost = " << ExtractCost << ".\n"
       << "SLP: Total Cost = " << Cost << ".\n";
  }
  // ...
    ViewGraph(this, "SLP" + F->getName(), false, Str);
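// tryToGatherSingleRegisterExtractElements scans one register's worth of
// gathered scalars for extractelements that can be folded into a shuffle of
// at most two source vectors, recording undef positions separately.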
std::optional<TTI::ShuffleKind>
BoUpSLP::tryToGatherSingleRegisterExtractElements(
    MutableArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) const {
  // Scan the list of gathered scalars for extractelements that can be
  // represented as shuffles.
  MapVector<Value *, SmallVector<int>> VectorOpToIdx;
  SmallVector<int> UndefVectorExtracts;
  for (int I = 0, E = VL.size(); I < E; ++I) {
    auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
    if (!EI)
      continue;
    // ...
    if (Idx >= VecTy->getNumElements()) {
      UndefVectorExtracts.push_back(I);
      continue;
    }
    SmallBitVector ExtractMask(VecTy->getNumElements(), true);
    ExtractMask.reset(*Idx);
    // ...
    VectorOpToIdx[EI->getVectorOperand()].push_back(I);
  }
  // Sort the vector operands by the maximum number of uses in extracts.
  SmallVector<std::pair<Value *, SmallVector<int>>> Vectors =
      VectorOpToIdx.takeVector();
  stable_sort(Vectors, [](const auto &P1, const auto &P2) {
    return P1.second.size() > P2.second.size();
  });
  // Find the best pair of vectors, or a single vector.
  const int UndefSz = UndefVectorExtracts.size();
  unsigned SingleMax = 0;
  unsigned PairMax = 0;
  if (!Vectors.empty()) {
    SingleMax = Vectors.front().second.size() + UndefSz;
    if (Vectors.size() > 1) {
      auto *ItNext = std::next(Vectors.begin());
      PairMax = SingleMax + ItNext->second.size();
    }
  }
  if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
    return std::nullopt;
  // Check if it is better to perform a shuffle of two vectors or just of a
  // single vector.
  SmallVector<Value *> GatheredExtracts(
      VL.size(), PoisonValue::get(VL.front()->getType()));
  if (SingleMax >= PairMax && SingleMax) {
    for (int Idx : Vectors.front().second)
      std::swap(GatheredExtracts[Idx], VL[Idx]);
  } else if (!Vectors.empty()) {
    for (unsigned Idx : {0, 1})
      for (int Idx : Vectors[Idx].second)
        std::swap(GatheredExtracts[Idx], VL[Idx]);
  }
  // Add extracts from undefs too.
  for (int Idx : UndefVectorExtracts)
    std::swap(GatheredExtracts[Idx], VL[Idx]);
  // Check that the gather of extractelements can be represented as just a
  // shuffle of one or two vectors the scalars are extracted from.
  std::optional<TTI::ShuffleKind> Res =
      isFixedVectorShuffle(GatheredExtracts, Mask, AC);
  if (!Res /* ... */) {
    // Restore the mask for previously partially matched values.
    // ...
    return std::nullopt;
  }
  // ...
  for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) {
    // ...
  }
  return Res;
}

SmallVector<std::optional<TTI::ShuffleKind>>
BoUpSLP::tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
                                    SmallVectorImpl<int> &Mask,
                                    unsigned NumParts) const {
  assert(NumParts > 0 && "NumParts expected be greater than or equal to 1.");
  SmallVector<std::optional<TTI::ShuffleKind>> ShufflesRes(NumParts);
  // ...
    SmallVector<int> SubMask;
    std::optional<TTI::ShuffleKind> Res =
        tryToGatherSingleRegisterExtractElements(SubVL, SubMask);
    ShufflesRes[Part] = Res;
    copy(SubMask, std::next(Mask.begin(), Part * SliceSize));
  // ...
  if (none_of(ShufflesRes, [](const std::optional<TTI::ShuffleKind> &Res) {
        return Res.has_value();
      }))
    ShufflesRes.clear();
  return ShufflesRes;
}
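// isGatherShuffledSingleRegisterEntry checks whether the scalars of a
// gather node already live in earlier vectorized entries, so the gather can
// become a one- or two-source shuffle; most of the logic validates that the
// candidate entries are available (dominate) at the gather's insertion
// point.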
std::optional<TargetTransformInfo::ShuffleKind>
BoUpSLP::isGatherShuffledSingleRegisterEntry(
    const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
    SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part,
    bool ForOrder) {
  Entries.clear();
  // ...
    return std::nullopt;
  auto GetUserEntry = [&](const TreeEntry *TE) {
    while (TE->UserTreeIndex && TE->UserTreeIndex.EdgeIdx == UINT_MAX)
      TE = TE->UserTreeIndex.UserTE;
    if (TE == VectorizableTree.front().get())
      return EdgeInfo(const_cast<TreeEntry *>(TE), 0);
    return TE->UserTreeIndex;
  };
  auto HasGatherUser = [&](const TreeEntry *TE) {
    while (TE->Idx != 0 && TE->UserTreeIndex) {
      if (TE->UserTreeIndex.EdgeIdx == UINT_MAX)
        return true;
      TE = TE->UserTreeIndex.UserTE;
    }
    return false;
  };
  const EdgeInfo TEUseEI = GetUserEntry(TE);
  if (!TEUseEI)
    return std::nullopt;
  const Instruction *TEInsertPt = &getLastInstructionInBundle(TEUseEI.UserTE);
  // The main node of PHI entries keeps the correct order of operands and
  // incoming blocks.
  if (auto *PHI = dyn_cast_or_null<PHINode>(
          TEUseEI.UserTE->hasState() ? TEUseEI.UserTE->getMainOp() : nullptr);
      PHI && TEUseEI.UserTE->State != TreeEntry::SplitVectorize) {
    TEInsertBlock = PHI->getIncomingBlock(TEUseEI.EdgeIdx);
    // ...
  } else {
    TEInsertBlock = TEInsertPt->getParent();
  }
  if (!DT->isReachableFromEntry(TEInsertBlock))
    return std::nullopt;
  auto *NodeUI = DT->getNode(TEInsertBlock);
  assert(NodeUI && "Should only process reachable instructions");
  auto CheckOrdering = [&](const Instruction *InsertPt) {
    // An argument of the gather node can be emitted either any time or only
    // after the last instruction of its user node; check both cases.
    // ...
    const BasicBlock *InsertBlock = InsertPt->getParent();
    auto *NodeEUI = DT->getNode(InsertBlock);
    // ...
    assert((NodeUI == NodeEUI) ==
               (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) &&
           "Different nodes should have different DFS numbers");
    // Check the order of the gather nodes' users.
    if (TEInsertPt->getParent() != InsertBlock &&
        (DT->dominates(NodeUI, NodeEUI) || !DT->dominates(NodeEUI, NodeUI)))
      return false;
    if (TEInsertPt->getParent() == InsertBlock &&
        // ...
    return true;
  };
  // Find all tree entries used by the gathered values. If no common entries
  // are found - this is not a shuffle.
  SmallDenseMap<Value *, int> UsedValuesEntry;
  SmallPtrSet<const Value *, 16> VisitedValue;
  auto CheckAndUseSameNode = [&](const TreeEntry *TEPtr) {
    // The node is reused - exit.
    if ((TEPtr->getVectorFactor() != VL.size() &&
         TEPtr->Scalars.size() != VL.size()) ||
        (!TEPtr->isSame(VL) && !TEPtr->isSame(TE->Scalars)))
      return false;
    // ...
    for (Value *V : VL) {
      // ...
    }
    return true;
  };
  auto CheckParentNodes = [&](const TreeEntry *User1, const TreeEntry *User2,
                              unsigned EdgeIdx) {
    const TreeEntry *Ptr1 = User1;
    const TreeEntry *Ptr2 = User2;
    SmallDenseMap<const TreeEntry *, unsigned> PtrToIdx;
    while (Ptr2) {
      // ...
      EdgeIdx = Ptr2->UserTreeIndex.EdgeIdx;
      Ptr2 = Ptr2->UserTreeIndex.UserTE;
    }
    while (Ptr1) {
      unsigned Idx = Ptr1->UserTreeIndex.EdgeIdx;
      Ptr1 = Ptr1->UserTreeIndex.UserTE;
      if (auto It = PtrToIdx.find(Ptr1); It != PtrToIdx.end())
        return Idx < It->second;
    }
    return false;
  };
  auto CheckNonSchedulableOrdering = [&](const TreeEntry *E,
                                         const Instruction *InsertPt) {
    return TEUseEI && TEUseEI.UserTE && TEUseEI.UserTE->hasCopyableElements() &&
           !TEUseEI.UserTE->isCopyableElement(/* ... */) &&
           InsertPt->getNextNode() == TEInsertPt &&
           (!E->hasCopyableElements() || !E->isCopyableElement(InsertPt) ||
            /* ... */);
  };
17511 for (
Value *V : VL) {
17515 SmallPtrSet<const TreeEntry *, 4> VToTEs;
17517 ValueToGatherNodes.lookup(V).takeVector());
17518 if (TransformedToGatherNodes.contains(TE)) {
17519 for (TreeEntry *
E : getSplitTreeEntries(V)) {
17520 if (TE ==
E || !TransformedToGatherNodes.contains(
E) ||
17521 !
E->UserTreeIndex ||
E->UserTreeIndex.UserTE->isGather())
17523 GatherNodes.push_back(
E);
17525 for (TreeEntry *
E : getTreeEntries(V)) {
17526 if (TE ==
E || !TransformedToGatherNodes.contains(
E) ||
17527 !
E->UserTreeIndex ||
E->UserTreeIndex.UserTE->isGather())
17529 GatherNodes.push_back(
E);
17532 for (
const TreeEntry *TEPtr : GatherNodes) {
17533 if (TEPtr == TE || TEPtr->Idx == 0 || DeletedNodes.contains(TEPtr))
17536 [&](
Value *V) { return GatheredScalars.contains(V); }) &&
17537 "Must contain at least single gathered value.");
17538 assert(TEPtr->UserTreeIndex &&
17539 "Expected only single user of a gather node.");
17540 const EdgeInfo &UseEI = TEPtr->UserTreeIndex;
17542 PHINode *UserPHI = (UseEI.UserTE->State != TreeEntry::SplitVectorize &&
17543 UseEI.UserTE->hasState())
17548 : &getLastInstructionInBundle(UseEI.UserTE);
17549 if (TEInsertPt == InsertPt) {
17551 if (TEUseEI.UserTE->State == TreeEntry::Vectorize &&
17552 (TEUseEI.UserTE->getOpcode() != Instruction::PHI ||
17553 TEUseEI.UserTE->isAltShuffle()) &&
17555 if (UseEI.UserTE->State != TreeEntry::Vectorize ||
17556 (UseEI.UserTE->hasState() &&
17557 UseEI.UserTE->getOpcode() == Instruction::PHI &&
17558 !UseEI.UserTE->isAltShuffle()) ||
17567 (TEUseEI.UserTE != UseEI.UserTE || TEUseEI.EdgeIdx < UseEI.EdgeIdx))
17570 if (TEUseEI.UserTE->State == TreeEntry::Vectorize &&
17571 TEUseEI.UserTE->getOpcode() == Instruction::PHI &&
17572 UseEI.UserTE->State == TreeEntry::Vectorize &&
17573 UseEI.UserTE->getOpcode() == Instruction::PHI &&
17574 TEUseEI.UserTE != UseEI.UserTE)
17579 if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx)
17583 if (TEUseEI.UserTE != UseEI.UserTE &&
17584 (TEUseEI.UserTE->Idx < UseEI.UserTE->Idx ||
17585 HasGatherUser(TEUseEI.UserTE)))
17588 if (CheckParentNodes(TEUseEI.UserTE, UseEI.UserTE, UseEI.EdgeIdx))
17592 if (!TEUseEI.UserTE->isGather() && !UserPHI &&
17593 TEUseEI.UserTE->doesNotNeedToSchedule() !=
17594 UseEI.UserTE->doesNotNeedToSchedule() &&
17599       if ((TEInsertBlock != InsertPt->getParent() ||
17600            TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) &&
17601           (!CheckOrdering(InsertPt) ||
17602            (UseEI.UserTE->hasCopyableElements() &&
17607 if (CheckAndUseSameNode(TEPtr))
17612 if (CheckNonSchedulableOrdering(UseEI.UserTE, InsertPt))
17617     const auto *It = find_if(VTEs, [&](const TreeEntry *MTE) {
17618       return MTE != TE && MTE != TEUseEI.UserTE &&
17619              !DeletedNodes.contains(MTE) &&
17620              !TransformedToGatherNodes.contains(MTE);
17622     if (It != VTEs.end()) {
17623       const TreeEntry *VTE = *It;
17624       if (none_of(TE->CombinedEntriesWithIndices,
17625                   [&](const auto &P) { return P.first == VTE->Idx; })) {
17626         Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
17627         if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
17631       if (CheckAndUseSameNode(VTE))
17637     const auto *It = find_if(VTEs, [&, MainTE = TE](const TreeEntry *TE) {
17638       return TE != MainTE && !DeletedNodes.contains(TE) &&
17639              !TransformedToGatherNodes.contains(TE);
17641     if (It != VTEs.end()) {
17642       const TreeEntry *VTE = *It;
17643       if (ForOrder && VTE->Idx < GatheredLoadsEntriesFirst.value_or(0) &&
17644           VTEs.size() > 1 && VTE->State != TreeEntry::Vectorize) {
17645         VTEs = VTEs.drop_front();
17647         const auto *MIt = find_if(VTEs, [](const TreeEntry *MTE) {
17648           return MTE->State == TreeEntry::Vectorize;
17650         if (MIt == VTEs.end())
17654       if (none_of(TE->CombinedEntriesWithIndices,
17655                   [&](const auto &P) { return P.first == VTE->Idx; })) {
17656         Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
17657         if (&LastBundleInst == TEInsertPt ||
17658             !CheckOrdering(&LastBundleInst) ||
17659             CheckNonSchedulableOrdering(VTE, &LastBundleInst))
17663       if (CheckAndUseSameNode(VTE))
17668     if (VToTEs.empty())
17670     if (UsedTEs.empty()) {
17678     SmallPtrSet<const TreeEntry *, 4> SavedVToTEs(VToTEs);
17680     for (SmallPtrSet<const TreeEntry *, 4> &Set : UsedTEs) {
17684       if (!VToTEs.empty()) {
17690         VToTEs = SavedVToTEs;
17695     if (Idx == UsedTEs.size()) {
17699       if (UsedTEs.size() == 2)
17701       UsedTEs.push_back(SavedVToTEs);
17702       Idx = UsedTEs.size() - 1;
17708   if (UsedTEs.empty()) {
17710     return std::nullopt;
17714   if (UsedTEs.size() == 1) {
17717                                             UsedTEs.front().end());
17718     sort(FirstEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
17719       return TE1->Idx < TE2->Idx;
17722     auto *It = find_if(FirstEntries, [=](const TreeEntry *EntryPtr) {
17723       return EntryPtr->isSame(VL) || EntryPtr->isSame(TE->Scalars);
17725     if (It != FirstEntries.end() &&
17726         ((*It)->getVectorFactor() == VL.size() ||
17727          ((*It)->getVectorFactor() == TE->Scalars.size() &&
17728           TE->ReuseShuffleIndices.size() == VL.size() &&
17729           (*It)->isSame(TE->Scalars)))) {
17731       if ((*It)->getVectorFactor() == VL.size()) {
17732         std::iota(std::next(Mask.begin(), Part * VL.size()),
17733                   std::next(Mask.begin(), (Part + 1) * VL.size()), 0);
17735         SmallVector<int> CommonMask = TE->getCommonMask();
17746     Entries.push_back(FirstEntries.front());
17748     for (auto &P : UsedValuesEntry)
17750     VF = FirstEntries.front()->getVectorFactor();
17753   assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries.");
17755   DenseMap<int, const TreeEntry *> VFToTE;
17756   for (const TreeEntry *TE : UsedTEs.front()) {
17757     unsigned VF = TE->getVectorFactor();
17758     auto It = VFToTE.find(VF);
17759     if (It != VFToTE.end()) {
17760       if (It->second->Idx > TE->Idx)
17761         It->getSecond() = TE;
17768                                          UsedTEs.back().end());
17769   sort(SecondEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
17770     return TE1->Idx < TE2->Idx;
17772   for (const TreeEntry *TE : SecondEntries) {
17773     auto It = VFToTE.find(TE->getVectorFactor());
17774     if (It != VFToTE.end()) {
17783   if (Entries.empty()) {
17785         UsedTEs.front(), [](const TreeEntry *TE1, const TreeEntry *TE2) {
17786           return TE1->Idx < TE2->Idx;
17788     Entries.push_back(SecondEntries.front());
17789     VF = std::max(Entries.front()->getVectorFactor(),
17790                   Entries.back()->getVectorFactor());
17792     VF = Entries.front()->getVectorFactor();
17795   for (const TreeEntry *E : Entries)
17799   for (auto &P : UsedValuesEntry) {
17801     if (ValuesToEntries[Idx].contains(P.first)) {
17811   auto AreCompatiblePHIs = [&](Value *V, Value *V1) {
17818     for (int I = 0, E = PHI->getNumIncomingValues(); I < E; ++I) {
17820       Value *In1 = PHI1->getIncomingValue(I);
17835   auto MightBeIgnored = [=](Value *V) {
17839            !areAllUsersVectorized(I, UserIgnoreList) && isSimple(I);
17844   auto NeighborMightBeIgnored = [&](Value *V, int Idx) {
17845     Value *V1 = VL[Idx];
17846     bool UsedInSameVTE = false;
17847     auto It = UsedValuesEntry.find(V1);
17848     if (It != UsedValuesEntry.end())
17849       UsedInSameVTE = It->second == UsedValuesEntry.find(V)->second;
17850     return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE &&
17857   SmallBitVector UsedIdxs(Entries.size());
17859   for (int I = 0, E = VL.size(); I < E; ++I) {
17861     auto It = UsedValuesEntry.find(V);
17862     if (It == UsedValuesEntry.end())
17868          ((I > 0 && NeighborMightBeIgnored(V, I - 1)) ||
17869           (I != E - 1 && NeighborMightBeIgnored(V, I + 1)))))
17871     unsigned Idx = It->second;
17878   for (unsigned I = 0, Sz = Entries.size(); I < Sz; ++I) {
17879     if (!UsedIdxs.test(I))
17885     for (std::pair<unsigned, int> &Pair : EntryLanes)
17886       if (Pair.first == I)
17887         Pair.first = TempEntries.size();
17890   Entries.swap(TempEntries);
17891   if (EntryLanes.size() == Entries.size() &&
17893           .slice(Part * VL.size(),
17894                  std::min<int>(VL.size(), TE->Scalars.size())))) {
17900     return std::nullopt;
17903   bool IsIdentity = Entries.size() == 1;
17906   for (const std::pair<unsigned, int> &Pair : EntryLanes) {
17907     unsigned Idx = Part * VL.size() + Pair.second;
17910         (ForOrder ? std::distance(
17911                         Entries[Pair.first]->Scalars.begin(),
17912                         find(Entries[Pair.first]->Scalars, VL[Pair.second]))
17913                   : Entries[Pair.first]->findLaneForValue(VL[Pair.second]));
17914     IsIdentity &= Mask[Idx] == Pair.second;
17916   if (ForOrder || IsIdentity || Entries.empty()) {
17917     switch (Entries.size()) {
17919       if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
17923       if (EntryLanes.size() > 2 || VL.size() <= 2)
17930       (EntryLanes.size() > Entries.size() || VL.size() <= 2)) {
17932     SmallVector<int> SubMask(std::next(Mask.begin(), Part * VL.size()),
17933                              std::next(Mask.begin(), (Part + 1) * VL.size()));
17934     int MinElement = SubMask.front(), MaxElement = SubMask.front();
17935     for (int Idx : SubMask) {
17943     assert(MaxElement >= 0 && MinElement >= 0 &&
17944            MaxElement % VF >= MinElement % VF &&
17945            "Expected at least single element.");
17946     unsigned NewVF = std::max<unsigned>(
17948                          (MaxElement % VF) - (MinElement % VF) + 1));
17951     for (int &Idx : SubMask) {
17954       Idx = ((Idx % VF) - (((MinElement % VF) / NewVF) * NewVF)) % NewVF +
17955             (Idx >= static_cast<int>(VF) ? NewVF : 0);
17963   auto *MaskVecTy = getWidenedType(VL.front()->getType(), SubMask.size());
17964   auto GetShuffleCost = [&, &TTI = *TTI](ArrayRef<int> Mask,
17968     if (Entries.size() == 1 && Entries.front()->getInterleaveFactor() > 0 &&
17970             Mask, Entries.front()->getInterleaveFactor()))
17972     return ::getShuffleCost(TTI,
17977 InstructionCost ShuffleCost = GetShuffleCost(SubMask, Entries, VecTy);
17979 SmallVector<int> FirstMask(SubMask.begin(), SubMask.end());
17980 if (Entries.size() == 1 || !Entries[0]->isGather()) {
17981 FirstShuffleCost = ShuffleCost;
17985     bool IsIdentity = true;
17986     for (auto [I, Idx] : enumerate(FirstMask)) {
17987       if (Idx >= static_cast<int>(NewVF)) {
17992       IsIdentity &= static_cast<int>(I) == Idx;
17996     FirstShuffleCost = GetShuffleCost(FirstMask, Entries.front(), VecTy);
17998         *TTI, VL.front()->getType(), MaskVecTy, DemandedElts, true,
18002   SmallVector<int> SecondMask(SubMask.begin(), SubMask.end());
18003   if (Entries.size() == 1 || !Entries[1]->isGather()) {
18004     SecondShuffleCost = ShuffleCost;
18008     bool IsIdentity = true;
18009     for (auto [I, Idx] : enumerate(SecondMask)) {
18010       if (Idx < static_cast<int>(NewVF) && Idx >= 0) {
18016       IsIdentity &= static_cast<int>(I) == Idx;
18021     SecondShuffleCost = GetShuffleCost(SecondMask, Entries[1], VecTy);
18023         *TTI, VL.front()->getType(), MaskVecTy, DemandedElts, true,
18031       *TTI, VL.front()->getType(), MaskVecTy, DemandedElts, true,
18033   const TreeEntry *BestEntry = nullptr;
18034   if (FirstShuffleCost < ShuffleCost) {
18035     std::for_each(std::next(Mask.begin(), Part * VL.size()),
18036                   std::next(Mask.begin(), (Part + 1) * VL.size()),
18038                     if (Idx >= static_cast<int>(VF))
18039                       Idx = PoisonMaskElem;
18041     BestEntry = Entries.front();
18042     ShuffleCost = FirstShuffleCost;
18044   if (SecondShuffleCost < ShuffleCost) {
18045     std::for_each(std::next(Mask.begin(), Part * VL.size()),
18046                   std::next(Mask.begin(), (Part + 1) * VL.size()),
18048                     if (Idx < static_cast<int>(VF))
18049                       Idx = PoisonMaskElem;
18053     BestEntry = Entries[1];
18054     ShuffleCost = SecondShuffleCost;
18056 if (BuildVectorCost >= ShuffleCost) {
18059 Entries.push_back(BestEntry);
18067   std::fill(std::next(Mask.begin(), Part * VL.size()),
18069   return std::nullopt;
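// isGatherShuffledEntry splits the gathered scalars into NumParts
// register-sized slices and runs isGatherShuffledSingleRegisterEntry on each
// slice; if any slice cannot be modeled as a shuffle of already-built tree
// entries, the per-part results collected so far are discarded.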
18073BoUpSLP::isGatherShuffledEntry(
18077   assert(NumParts > 0 && NumParts < VL.size() &&
18078          "Expected positive number of registers.");
18081   if (TE == VectorizableTree.front().get() &&
18082       (!GatheredLoadsEntriesFirst.has_value() ||
18084                [](const std::unique_ptr<TreeEntry> &TE) {
18085                  return !TE->isGather();
18090   if (TE->hasNonWholeRegisterOrNonPowerOf2Vec(*TTI))
18093   assert((TE->UserTreeIndex || TE == VectorizableTree.front().get()) &&
18094          "Expected only single user of the gather node.");
18096          "Number of scalars must be divisible by NumParts.");
18097   if (TE->UserTreeIndex && TE->UserTreeIndex.UserTE->isGather() &&
18098       TE->UserTreeIndex.EdgeIdx == UINT_MAX &&
18100        (TE->hasState() && TE->getOpcode() == Instruction::ExtractElement) ||
18103            getSameValuesTreeEntry(TE->getMainOp(), TE->Scalars))))
18110     SmallVectorImpl<const TreeEntry *> &SubEntries = Entries.emplace_back();
18111     std::optional<TTI::ShuffleKind> SubRes =
18112         isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part,
18115       SubEntries.clear();
18118         SubEntries.front()->getVectorFactor() == VL.size() &&
18119         (SubEntries.front()->isSame(TE->Scalars) ||
18120          SubEntries.front()->isSame(VL))) {
18122       LocalSubEntries.swap(SubEntries);
18125       std::iota(Mask.begin(), Mask.end(), 0);
18127       for (int I = 0, Sz = VL.size(); I < Sz; ++I)
18130       Entries.emplace_back(1, LocalSubEntries.front());
18136              [](const std::optional<TTI::ShuffleKind> &SK) { return !SK; })) {
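// getGatherCost estimates the cost of materializing VL with insertelement
// instructions: EstimateInsertCost adds a Trunc cost for scalars wider than
// ScalarTy, and when non-undef constants are present a permute with a
// constant vector is costed instead (ConstantShuffleMask selects the
// constant lanes from the second source, indices I + VF).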
18144                                Type *ScalarTy) const {
18145   const unsigned VF = VL.size();
18153   auto EstimateInsertCost = [&](unsigned I, Value *V) {
18155     if (V->getType() != ScalarTy)
18156       Cost += TTI->getCastInstrCost(Instruction::Trunc, ScalarTy, V->getType(),
18160   std::iota(ConstantShuffleMask.begin(), ConstantShuffleMask.end(), 0);
18167       ConstantShuffleMask[I] = I + VF;
18170       EstimateInsertCost(I, V);
18173   bool IsAnyNonUndefConst =
18176   if (!ForPoisonSrc && IsAnyNonUndefConst) {
18178                                 ConstantShuffleMask);
18182   if (!DemandedElements.isZero())
18186                               ForPoisonSrc && !IsAnyNonUndefConst, VL);
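// getLastInstructionInBundle returns (and caches in EntryToLastInstruction)
// the instruction after which vectorized code for a tree entry can be
// emitted: the last instruction of the schedule bundle when the entry was
// scheduled, otherwise the dominance-wise last (or, for gathered loads,
// first) scalar of the bundle, compared across blocks via dominator-tree
// DFS numbers.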
18190Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
18191   auto It = EntryToLastInstruction.find(E);
18192   if (It != EntryToLastInstruction.end())
18200   if (E->hasState()) {
18201     Front = E->getMainOp();
18202     Opcode = E->getOpcode();
18209          ((GatheredLoadsEntriesFirst.has_value() && Opcode == Instruction::Load &&
18210            E->isGather() && E->Idx < *GatheredLoadsEntriesFirst) ||
18211           E->State == TreeEntry::SplitVectorize || E->hasCopyableElements() ||
18213                  [=](Value *V) -> bool {
18214                    if (Opcode == Instruction::GetElementPtr &&
18215                        !isa<GetElementPtrInst>(V))
18217                    auto *I = dyn_cast<Instruction>(V);
18218                    return !I || !E->getMatchingMainOpOrAltOp(I) ||
18219                           I->getParent() == BB || isVectorLikeInstWithConstOps(I);
18221          "Expected gathered loads or GEPs or instructions from same basic "
18224   auto FindLastInst = [&]() {
18226     for (Value *V : E->Scalars) {
18230       if (E->isCopyableElement(I))
18232       if (LastInst->getParent() == I->getParent()) {
18237       assert(((Opcode == Instruction::GetElementPtr &&
18239               E->State == TreeEntry::SplitVectorize ||
18242               (GatheredLoadsEntriesFirst.has_value() &&
18243                Opcode == Instruction::Load && E->isGather() &&
18244                E->Idx < *GatheredLoadsEntriesFirst)) &&
18245              "Expected vector-like or non-GEP in GEP node insts only.");
18246       if (!DT->isReachableFromEntry(LastInst->getParent())) {
18250       if (!DT->isReachableFromEntry(I->getParent()))
18252       auto *NodeA = DT->getNode(LastInst->getParent());
18253       auto *NodeB = DT->getNode(I->getParent());
18254       assert(NodeA && "Should only process reachable instructions");
18255       assert(NodeB && "Should only process reachable instructions");
18256       assert((NodeA == NodeB) ==
18257                  (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
18258              "Different nodes should have different DFS numbers");
18259       if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn())
18266   auto FindFirstInst = [&]() {
18268     for (Value *V : E->Scalars) {
18272       if (E->isCopyableElement(I))
18274       if (FirstInst->getParent() == I->getParent()) {
18275         if (I->comesBefore(FirstInst))
18279       assert(((Opcode == Instruction::GetElementPtr &&
18283              "Expected vector-like or non-GEP in GEP node insts only.");
18284       if (!DT->isReachableFromEntry(FirstInst->getParent())) {
18288       if (!DT->isReachableFromEntry(I->getParent()))
18290       auto *NodeA = DT->getNode(FirstInst->getParent());
18291       auto *NodeB = DT->getNode(I->getParent());
18292       assert(NodeA && "Should only process reachable instructions");
18293       assert(NodeB && "Should only process reachable instructions");
18294       assert((NodeA == NodeB) ==
18295                  (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
18296              "Different nodes should have different DFS numbers");
18297       if (NodeA->getDFSNumIn() > NodeB->getDFSNumIn())
18303   if (E->State == TreeEntry::SplitVectorize) {
18304     Res = FindLastInst();
18306     for (auto *E : Entries) {
18309         I = &getLastInstructionInBundle(E);
18314     EntryToLastInstruction.try_emplace(E, Res);
18319   if (GatheredLoadsEntriesFirst.has_value() &&
18320       E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
18321       Opcode == Instruction::Load) {
18322     Res = FindFirstInst();
18323     EntryToLastInstruction.try_emplace(E, Res);
18329   auto FindScheduleBundle = [&](const TreeEntry *E) -> const ScheduleBundle * {
18333     const auto *It = BlocksSchedules.find(BB);
18334     if (It == BlocksSchedules.end())
18336     for (Value *V : E->Scalars) {
18342       if (Bundles.empty())
18345           Bundles, [&](ScheduleBundle *B) { return B->getTreeEntry() == E; });
18346       if (It != Bundles.end())
18351   const ScheduleBundle *Bundle = FindScheduleBundle(E);
18352   if (!E->isGather() && !Bundle) {
18353     if ((Opcode == Instruction::GetElementPtr &&
18356            return !isa<GetElementPtrInst>(V) && isa<Instruction>(V);
18360            return isa<PoisonValue>(V) ||
18361                   (E->Idx == 0 && isa<InsertElementInst>(V)) ||
18362                   E->isCopyableElement(V) ||
18363                   (!isVectorLikeInstWithConstOps(V) &&
18364                    isUsedOutsideBlock(V));
18366         (!E->doesNotNeedToSchedule() ||
18369            if (!isa<Instruction>(V) ||
18370                (E->hasCopyableElements() && E->isCopyableElement(V)))
18372            return !areAllOperandsNonInsts(V);
18375            if (!isa<Instruction>(V) ||
18376                (E->hasCopyableElements() && E->isCopyableElement(V)))
18378            return MustGather.contains(V);
18380       Res = FindLastInst();
18382       Res = FindFirstInst();
18383     EntryToLastInstruction.try_emplace(E, Res);
18392   assert(!E->isGather() && "Gathered instructions should not be scheduled");
18393   Res = Bundle->getBundle().back()->getInst();
18394   EntryToLastInstruction.try_emplace(E, Res);
18417   Res = FindLastInst();
18418   assert(Res && "Failed to find last instruction in bundle");
18419   EntryToLastInstruction.try_emplace(E, Res);
18423void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
18424   auto *Front = E->getMainOp();
18425   Instruction *LastInst = &getLastInstructionInBundle(E);
18426   assert(LastInst && "Failed to find last instruction in bundle");
18431     LastInstIt = LastInst->getParent()->getFirstNonPHIIt();
18432   if (LastInstIt != LastInst->getParent()->end() &&
18433       LastInstIt->getParent()->isLandingPad())
18434     LastInstIt = std::next(LastInstIt);
18437       (!E->isGather() && E->State != TreeEntry::SplitVectorize &&
18438        (E->doesNotNeedToSchedule() ||
18439         (E->hasCopyableElements() && !E->isCopyableElement(LastInst) &&
18441       (GatheredLoadsEntriesFirst.has_value() &&
18442        E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
18443        E->getOpcode() == Instruction::Load)) {
18444     Builder.SetInsertPoint(LastInst->getParent(), LastInstIt);
18448   Builder.SetInsertPoint(
18451   if (Instruction *Res = LastInstructionToPos.lookup(LastInst)) {
18454   Res = Builder.CreateAlignedLoad(Builder.getPtrTy(),
18459   LastInstructionToPos.try_emplace(LastInst, Res);
18462   Builder.SetCurrentDebugLocation(Front->getDebugLoc());
18465Value *BoUpSLP::gather(
18467     function_ref<Value *(Value *, Value *, ArrayRef<int>)> CreateShuffle) {
18473   SmallSet<int, 4> PostponedIndices;
18474   Loop *L = LI->getLoopFor(Builder.GetInsertBlock());
18476     SmallPtrSet<BasicBlock *, 4> Visited;
18477     while (InsertBB && InsertBB != InstBB && Visited.insert(InsertBB).second)
18478       InsertBB = InsertBB->getSinglePredecessor();
18479     return InsertBB && InsertBB == InstBB;
18481   for (int I = 0, E = VL.size(); I < E; ++I) {
18483     if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) ||
18485          (L && (!Root || L->isLoopInvariant(Root)) && L->contains(Inst))) &&
18486         PostponedIndices.insert(I).second)
18490   auto &&CreateInsertElement = [this](Value *Vec, Value *V, unsigned Pos,
18493     if (Scalar->getType() != Ty) {
18504       Scalar = Builder.CreateIntCast(
18518     Vec = Builder.CreateInsertElement(Vec, Scalar, Builder.getInt32(Pos));
18523       GatherShuffleExtractSeq.insert(InsElt);
18528       const auto *It = find_if(Entries, [&](const TreeEntry *E) {
18529         return !TransformedToGatherNodes.contains(E) &&
18530                !DeletedNodes.contains(E);
18532       if (It != Entries.end()) {
18534         User *UserOp = nullptr;
18539           if (V->getType()->isVectorTy()) {
18541                 SV && SV->getOperand(0) != V && SV->getOperand(1) != V) {
18543               auto FindOperand = [](Value *Vec, Value *V) -> Instruction * {
18545                   if (SV->getOperand(0) == V)
18547                   if (SV->getOperand(1) == V)
18553               if (Instruction *User = FindOperand(SV->getOperand(0), V))
18555               else if (Instruction *User = FindOperand(SV->getOperand(1), V))
18558                      "Failed to find shufflevector, caused by resize.");
18564           unsigned FoundLane = (*It)->findLaneForValue(V);
18565           ExternalUses.emplace_back(V, UserOp, **It, FoundLane);
18573   SmallVector<int> NonConsts;
18575   std::iota(Mask.begin(), Mask.end(), 0);
18576   Value *OriginalRoot = Root;
18579       SV->getOperand(0)->getType() == VecTy) {
18580     Root = SV->getOperand(0);
18581     Mask.assign(SV->getShuffleMask().begin(), SV->getShuffleMask().end());
18584   for (int I = 0, E = VL.size(); I < E; ++I) {
18593     Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
18598     Vec = OriginalRoot;
18600     Vec = CreateShuffle(Root, Vec, Mask);
18602         OI && OI->use_empty() &&
18603         none_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
18604           return TE->VectorizedValue == OI;
18610   for (int I : NonConsts)
18611     Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
18614   for (const std::pair<Value *, unsigned> &Pair : PostponedInsts)
18615     Vec = CreateInsertElement(Vec, Pair.first, Pair.second, ScalarTy);
18653   bool IsFinalized = false;
18666   class ShuffleIRBuilder {
18679         : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq),
18680           CSEBlocks(CSEBlocks), DL(DL) {}
18681     ~ShuffleIRBuilder() = default;
18687              "Expected integer vector types only.");
18693                   ->getIntegerBitWidth())
18694           V2 = Builder.CreateIntCast(
18697           V1 = Builder.CreateIntCast(
18701       Value *Vec = Builder.CreateShuffleVector(V1, V2, Mask);
18703         GatherShuffleExtractSeq.insert(I);
18704         CSEBlocks.insert(I->getParent());
18713       unsigned VF = Mask.size();
18717       Value *Vec = Builder.CreateShuffleVector(V1, Mask);
18719         GatherShuffleExtractSeq.insert(I);
18720         CSEBlocks.insert(I->getParent());
18724     Value *createIdentity(Value *V) { return V; }
18725     Value *createPoison(Type *Ty, unsigned VF) {
18730     void resizeToMatch(Value *&V1, Value *&V2) {
18735       int VF = std::max(V1VF, V2VF);
18736       int MinVF = std::min(V1VF, V2VF);
18738       std::iota(IdentityMask.begin(), std::next(IdentityMask.begin(), MinVF),
18740       Value *&Op = MinVF == V1VF ? V1 : V2;
18741       Op = Builder.CreateShuffleVector(Op, IdentityMask);
18743         GatherShuffleExtractSeq.insert(I);
18744         CSEBlocks.insert(I->getParent());
18757     assert(V1 && "Expected at least one vector value.");
18758     ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
18759                                     R.CSEBlocks, *R.DL);
18760     return BaseShuffleAnalysis::createShuffle<Value *>(V1, V2, Mask,
18761                                                        ShuffleBuilder, ScalarTy);
18767                              std::optional<bool> IsSigned = std::nullopt) {
18770     if (VecTy->getElementType() == ScalarTy->getScalarType())
18772     return Builder.CreateIntCast(
18773         V, VectorType::get(ScalarTy->getScalarType(), VecTy->getElementCount()),
18777   Value *getVectorizedValue(const TreeEntry &E) {
18778     Value *Vec = E.VectorizedValue;
18781     return castToScalarTyElem(Vec, any_of(E.Scalars, [&](Value *V) {
18782                                 return !isa<PoisonValue>(V) &&
18783                                        !isKnownNonNegative(
18784                                            V, SimplifyQuery(*R.DL));
18790       : BaseShuffleAnalysis(ScalarTy), Builder(Builder), R(R) {}
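  // adjustExtracts rewrites a gather of extractelement instructions to reuse
  // their source vectors directly: it collects the unique vector operands in
  // UniqueBases, erases extracts that became dead, and returns the single
  // base when all extracts read one register; otherwise it sets
  // UseVecBaseAsInput so the caller shuffles the bases per register part.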
18794                         ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
18795                         unsigned NumParts, bool &UseVecBaseAsInput) {
18796     UseVecBaseAsInput = false;
18798     Value *VecBase = nullptr;
18800     if (!E->ReorderIndices.empty()) {
18802                                E->ReorderIndices.end());
18805     for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
18810       VecBase = EI->getVectorOperand();
18812         VecBase = TEs.front()->VectorizedValue;
18813       assert(VecBase && "Expected vectorized value.");
18814       UniqueBases.insert(VecBase);
18817       if (!EI->hasOneUse() || R.ExternalUsesAsOriginalScalar.contains(EI) ||
18818           (E->UserTreeIndex && E->UserTreeIndex.EdgeIdx == UINT_MAX &&
18819            !R.isVectorized(EI) &&
18821                count_if(E->UserTreeIndex.UserTE->Scalars,
18822                         [&](Value *V) { return V == EI; })) ||
18823           (NumParts != 1 && count(VL, EI) > 1) ||
18825             ArrayRef<TreeEntry *> UTEs = R.getTreeEntries(U);
18826             return UTEs.empty() || UTEs.size() > 1 ||
18828                    any_of(UTEs,
18829                           [&](const TreeEntry *TE) {
18830                             return R.DeletedNodes.contains(TE) ||
                                       R.TransformedToGatherNodes.contains(TE);
18836                      [&](const std::unique_ptr<TreeEntry> &TE) {
18837                        return TE->UserTreeIndex.UserTE ==
18839                               is_contained(VL, EI);
18843         R.eraseInstruction(EI);
18845     if (NumParts == 1 || UniqueBases.size() == 1) {
18846       assert(VecBase && "Expected vectorized value.");
18847       return castToScalarTyElem(VecBase);
18849     UseVecBaseAsInput = true;
18859     Value *Vec = nullptr;
18866     constexpr int MaxBases = 2;
18868       auto VLMask = zip(SubVL, SubMask);
18869       const unsigned VF = std::accumulate(
18870           VLMask.begin(), VLMask.end(), 0U, [&](unsigned S, const auto &D) {
18871             if (std::get<1>(D) == PoisonMaskElem)
18874                 cast<ExtractElementInst>(std::get<0>(D))->getVectorOperand();
18875             if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecOp);
18877               VecOp = TEs.front()->VectorizedValue;
18878             assert(VecOp && "Expected vectorized value.");
18879             const unsigned Size =
18880                 cast<FixedVectorType>(VecOp->getType())->getNumElements();
18881             return std::max(S, Size);
18883       for (const auto [V, I] : VLMask) {
18888           VecOp = TEs.front()->VectorizedValue;
18889         assert(VecOp && "Expected vectorized value.");
18890         VecOp = castToScalarTyElem(VecOp);
18891         Bases[I / VF] = VecOp;
18893       if (!Bases.front())
18896       if (Bases.back()) {
18897         SubVec = createShuffle(Bases.front(), Bases.back(), SubMask);
18898         TransformToIdentity(SubMask);
18900         SubVec = Bases.front();
18906         ArrayRef<int> SubMask =
18907             Mask.slice(P * SliceSize,
18910                    return all_of(SubMask, [](int Idx) {
18914                "Expected first part or all previous parts masked.");
18915         copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
18920         unsigned SubVecVF =
18922         NewVF = std::max(NewVF, SubVecVF);
18925         for (int &Idx : SubMask)
18928         copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
18929         Vec = createShuffle(Vec, SubVec, VecMask);
18930         TransformToIdentity(VecMask);
18938   std::optional<Value *>
18944         TEs, [](const TreeEntry *TE) { return TE->VectorizedValue; });
18946       return std::nullopt;
18949     auto *ResVecTy = getWidenedType(ScalarTy, E->getVectorFactor());
18950     return Builder.CreateAlignedLoad(
18957     IsFinalized = false;
18958     CommonMask.clear();
18964     Value *V1 = getVectorizedValue(E1);
18965     Value *V2 = getVectorizedValue(E2);
18971     Value *V1 = getVectorizedValue(E1);
18976     assert(V1 && V2 && !Mask.empty() && "Expected non-empty input vectors.");
18979            "castToScalarTyElem expects V1 and V2 to be FixedVectorType");
18980     V1 = castToScalarTyElem(V1);
18981     V2 = castToScalarTyElem(V2);
18982     if (InVectors.empty()) {
18983       InVectors.push_back(V1);
18984       InVectors.push_back(V2);
18985       CommonMask.assign(Mask.begin(), Mask.end());
18988     Value *Vec = InVectors.front();
18989     if (InVectors.size() == 2) {
18990       Vec = createShuffle(Vec, InVectors.back(), CommonMask);
18991       transformMaskAfterShuffle(CommonMask, CommonMask);
18994       Vec = createShuffle(Vec, nullptr, CommonMask);
18995       transformMaskAfterShuffle(CommonMask, CommonMask);
18997     V1 = createShuffle(V1, V2, Mask);
18998     unsigned VF = std::max(getVF(V1), getVF(Vec));
18999     for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
19001         CommonMask[Idx] = Idx + VF;
19002     InVectors.front() = Vec;
19003     if (InVectors.size() == 2)
19004       InVectors.back() = V1;
19006       InVectors.push_back(V1);
19011 "castToScalarTyElem expects V1 to be FixedVectorType");
19012 V1 = castToScalarTyElem(V1);
19013 if (InVectors.empty()) {
19014 InVectors.push_back(V1);
19015 CommonMask.assign(Mask.begin(), Mask.end());
19018 const auto *It =
find(InVectors, V1);
19019 if (It == InVectors.end()) {
19020 if (InVectors.size() == 2 ||
19021 InVectors.front()->getType() != V1->
getType()) {
19022 Value *V = InVectors.front();
19023 if (InVectors.size() == 2) {
19024 V = createShuffle(InVectors.front(), InVectors.back(), CommonMask);
19025 transformMaskAfterShuffle(CommonMask, CommonMask);
19027 CommonMask.size()) {
19028 V = createShuffle(InVectors.front(),
nullptr, CommonMask);
19029 transformMaskAfterShuffle(CommonMask, CommonMask);
19031 unsigned VF = std::max(CommonMask.size(), Mask.size());
19032 for (
unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
19034 CommonMask[Idx] = V->getType() != V1->
getType()
19036 : Mask[Idx] + getVF(V1);
19037 if (V->getType() != V1->
getType())
19038 V1 = createShuffle(V1,
nullptr, Mask);
19039 InVectors.front() = V;
19040 if (InVectors.size() == 2)
19041 InVectors.back() = V1;
19043 InVectors.push_back(V1);
19048 for (
unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
19050 InVectors.push_back(V1);
19055 for (
Value *V : InVectors)
19056 VF = std::max(VF, getVF(V));
19057 for (
unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
19059 CommonMask[Idx] = Mask[Idx] + (It == InVectors.begin() ? 0 : VF);
19068                Value *Root = nullptr) {
19069     return R.gather(VL, Root, ScalarTy,
19071           return createShuffle(V1, V2, Mask);
19080                  ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
19085     IsFinalized = true;
19088     if (InVectors.size() == 2) {
19089       Vec = createShuffle(Vec, InVectors.back(), CommonMask);
19092       Vec = createShuffle(Vec, nullptr, CommonMask);
19094     transformMaskAfterShuffle(CommonMask, CommonMask);
19096            "Expected vector length for the final value before action.");
19100       std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
19101       Vec = createShuffle(Vec, nullptr, ResizeMask);
19103     Action(Vec, CommonMask, [this](Value *V1, Value *V2, ArrayRef<int> Mask) {
19104       return createShuffle(V1, V2, Mask);
19106     InVectors.front() = Vec;
19108     if (!SubVectors.empty()) {
19110       if (InVectors.size() == 2) {
19111         Vec = createShuffle(Vec, InVectors.back(), CommonMask);
19114         Vec = createShuffle(Vec, nullptr, CommonMask);
19116       transformMaskAfterShuffle(CommonMask, CommonMask);
19117       auto CreateSubVectors = [&](Value *Vec,
19118                                   SmallVectorImpl<int> &CommonMask) {
19119         for (auto [E, Idx] : SubVectors) {
19120           Value *V = getVectorizedValue(*E);
19127           Type *OrigScalarTy = ScalarTy;
19130               Builder, Vec, V, InsertionIndex,
19131               std::bind(&ShuffleInstructionBuilder::createShuffle, this, _1, _2,
19133           ScalarTy = OrigScalarTy;
19134           if (!CommonMask.empty()) {
19135             std::iota(std::next(CommonMask.begin(), Idx),
19136                       std::next(CommonMask.begin(), Idx + E->getVectorFactor()),
19142       if (SubVectorsMask.empty()) {
19143         Vec = CreateSubVectors(Vec, CommonMask);
19146         copy(SubVectorsMask, SVMask.begin());
19147         for (auto [I1, I2] : zip(SVMask, CommonMask)) {
19150             I1 = I2 + CommonMask.size();
19155         Vec = createShuffle(InsertVec, Vec, SVMask);
19156         transformMaskAfterShuffle(CommonMask, SVMask);
19158       InVectors.front() = Vec;
19161     if (!ExtMask.empty()) {
19162       if (CommonMask.empty()) {
19166         for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
19169           NewMask[I] = CommonMask[ExtMask[I]];
19171         CommonMask.swap(NewMask);
19174     if (CommonMask.empty()) {
19175       assert(InVectors.size() == 1 && "Expected only one vector with no mask");
19176       return InVectors.front();
19178     if (InVectors.size() == 2)
19179       return createShuffle(InVectors.front(), InVectors.back(), CommonMask);
19180     return createShuffle(InVectors.front(), nullptr, CommonMask);
19184     assert((IsFinalized || CommonMask.empty()) &&
19185            "Shuffle construction must be finalized.");
19189Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx) {
19193template <typename BVTy, typename ResTy, typename... Args>
19194ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
19196   assert((E->isGather() || TransformedToGatherNodes.contains(E)) &&
19197          "Expected gather node.");
19198   unsigned VF = E->getVectorFactor();
19200   bool NeedFreeze = false;
19203   for (auto [EIdx, Idx] : E->CombinedEntriesWithIndices)
19205             .slice(Idx, VectorizableTree[EIdx]->getVectorFactor()),
19208       E->CombinedEntriesWithIndices.size());
19209   transform(E->CombinedEntriesWithIndices, SubVectors.begin(),
19210             [&](const auto &P) {
19211               return std::make_pair(VectorizableTree[P.first].get(), P.second);
19216                                E->ReorderIndices.end());
19217   if (!ReorderMask.empty())
19223   if (!SubVectors.empty() && !SubVectorsMask.empty()) {
19225       if (E->Scalars[I] == GatheredScalars[ReorderMask[I]])
19228       SubVectorsMask.clear();
19232                              unsigned I, unsigned SliceSize,
19233                              bool IsNotPoisonous) {
19235           return isa<UndefValue>(V) && !isa<PoisonValue>(V);
19238       TreeEntry *UserTE = E->UserTreeIndex.UserTE;
19239       unsigned EdgeIdx = E->UserTreeIndex.EdgeIdx;
19240       if (UserTE->getNumOperands() != 2)
19242       if (!IsNotPoisonous) {
19243         auto *It = find_if(ArrayRef(VectorizableTree).drop_front(UserTE->Idx + 1),
19244                            [=](const std::unique_ptr<TreeEntry> &TE) {
19245                              return TE->UserTreeIndex.UserTE == UserTE &&
19246                                     TE->UserTreeIndex.EdgeIdx != EdgeIdx;
19248         if (It == VectorizableTree.end())
19251         if (!(*It)->ReorderIndices.empty()) {
19255         if (!all_of(zip(GatheredScalars, GS), [&](const auto &P) {
19256               Value *V0 = std::get<0>(P);
19257               Value *V1 = std::get<1>(P);
19265     if ((Mask.size() < InputVF &&
19268         (Mask.size() == InputVF &&
19271             std::next(Mask.begin(), I * SliceSize),
19272             std::next(Mask.begin(),
19279             std::next(Mask.begin(), I * SliceSize),
19280             std::next(Mask.begin(),
19286 BVTy ShuffleBuilder(ScalarTy, Params...);
19287 ResTy Res = ResTy();
19291   Value *ExtractVecBase = nullptr;
19292   bool UseVecBaseAsInput = false;
19295   Type *OrigScalarTy = GatheredScalars.front()->getType();
19300   bool Resized = false;
19302 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
19303   if (!ExtractShuffles.empty()) {
19305     for (auto [Idx, I] : enumerate(ExtractMask)) {
19311         ExtractEntries.append(TEs.begin(), TEs.end());
19313     if (std::optional<ResTy> Delayed =
19314             ShuffleBuilder.needToDelay(E, ExtractEntries)) {
19316       PostponedGathers.insert(E);
19321     if (Value *VecBase = ShuffleBuilder.adjustExtracts(
19322             E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) {
19323       ExtractVecBase = VecBase;
19325       if (VF == VecBaseTy->getNumElements() &&
19326           GatheredScalars.size() != VF) {
19328         GatheredScalars.append(VF - GatheredScalars.size(),
19336   if (!ExtractShuffles.empty() || !E->hasState() ||
19337       E->getOpcode() != Instruction::Load ||
19338       (((E->hasState() && E->getOpcode() == Instruction::Load) ||
19342           return isa<LoadInst>(V) && isVectorized(V);
19344       (E->hasState() && E->isAltShuffle()) ||
19345       all_of(E->Scalars, [this](Value *V) { return isVectorized(V); }) ||
19347       (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) {
19349         isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts);
19349 isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts);
19351   if (!GatherShuffles.empty()) {
19352     if (std::optional<ResTy> Delayed =
19353             ShuffleBuilder.needToDelay(E, Entries)) {
19355       PostponedGathers.insert(E);
19360     if (GatherShuffles.size() == 1 &&
19362         Entries.front().front()->isSame(E->Scalars)) {
19365       LLVM_DEBUG(dbgs() << "SLP: perfect diamond match for gather bundle "
19368       Mask.resize(E->Scalars.size());
19369       const TreeEntry *FrontTE = Entries.front().front();
19370       if (FrontTE->ReorderIndices.empty() &&
19371           ((FrontTE->ReuseShuffleIndices.empty() &&
19372             E->Scalars.size() == FrontTE->Scalars.size()) ||
19373            (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) {
19374         std::iota(Mask.begin(), Mask.end(), 0);
19381           Mask[I] = FrontTE->findLaneForValue(V);
19386       ShuffleBuilder.resetForSameNode();
19387       ShuffleBuilder.add(*FrontTE, Mask);
19389       Res = ShuffleBuilder.finalize(E->getCommonMask(), {}, {});
19393     if (GatheredScalars.size() != VF &&
19395           return any_of(TEs, [&](const TreeEntry *TE) {
19396             return TE->getVectorFactor() == VF;
19399       GatheredScalars.append(VF - GatheredScalars.size(),
19403     for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
19411                             bool IsRootPoison) {
19414     bool IsSplat = IsRootPoison && isSplat(Scalars) &&
19421     int NumNonConsts = 0;
19440       Scalars.front() = OrigV;
19443       const auto Res = UniquePositions.try_emplace(OrigV, I);
19444       Scalars[Res.first->second] = OrigV;
19445       ReuseMask[I] = Res.first->second;
19448     if (NumNonConsts == 1) {
19453       if (!UndefPos.empty() && UndefPos.front() == 0)
19456       ReuseMask[SinglePos] = SinglePos;
19457     } else if (!UndefPos.empty() && IsSplat) {
19464           (E->UserTreeIndex && any_of(V->uses(), [E](const Use &U) {
19467              return E->UserTreeIndex.EdgeIdx != U.getOperandNo() &&
19468                     is_contained(E->UserTreeIndex.UserTE->Scalars,
19472       if (It != Scalars.end()) {
19474         int Pos = std::distance(Scalars.begin(), It);
19475         for (int I : UndefPos) {
19477           ReuseMask[I] = Pos;
19486         for (int I : UndefPos) {
19495   if (!ExtractShuffles.empty() || !GatherShuffles.empty()) {
19496     bool IsNonPoisoned = true;
19497     bool IsUsedInExpr = true;
19498     Value *Vec1 = nullptr;
19499     if (!ExtractShuffles.empty()) {
19503       Value *Vec2 = nullptr;
19504       for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
19508       if (UseVecBaseAsInput) {
19509         Vec1 = ExtractVecBase;
19511         for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
19517           Value *VecOp = EI->getVectorOperand();
19519               !TEs.empty() && TEs.front()->VectorizedValue)
19520             VecOp = TEs.front()->VectorizedValue;
19523           } else if (Vec1 != VecOp) {
19524             assert((!Vec2 || Vec2 == VecOp) &&
19525                    "Expected only 1 or 2 vectors shuffle.");
19531         IsUsedInExpr = false;
19534         ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
19537         IsUsedInExpr &= FindReusedSplat(
19540             ExtractMask.size(), IsNotPoisonedVec);
19541         ShuffleBuilder.add(Vec1, ExtractMask, true);
19542         IsNonPoisoned &= IsNotPoisonedVec;
19544         IsUsedInExpr = false;
19549     if (!GatherShuffles.empty()) {
19550       unsigned SliceSize =
19554       for (const auto [I, TEs] : enumerate(Entries)) {
19557                  "No shuffles with empty entries list expected.");
19560         assert((TEs.size() == 1 || TEs.size() == 2) &&
19561                "Expected shuffle of 1 or 2 entries.");
19562         unsigned Limit = getNumElems(Mask.size(), SliceSize, I);
19565         copy(SubMask, std::next(VecMask.begin(), I * SliceSize));
19566         if (TEs.size() == 1) {
19567           bool IsNotPoisonedVec =
19568               TEs.front()->VectorizedValue
19572               FindReusedSplat(VecMask, TEs.front()->getVectorFactor(), I,
19573                               SliceSize, IsNotPoisonedVec);
19574           ShuffleBuilder.add(*TEs.front(), VecMask);
19575           IsNonPoisoned &= IsNotPoisonedVec;
19577           IsUsedInExpr = false;
19578           ShuffleBuilder.add(*TEs.front(), *TEs.back(), VecMask);
19579           if (TEs.front()->VectorizedValue && TEs.back()->VectorizedValue)
19590     int EMSz = ExtractMask.size();
19591     int MSz = Mask.size();
19594     bool IsSingleShuffle = ExtractShuffles.empty() || GatherShuffles.empty();
19595     bool IsIdentityShuffle =
19596         ((UseVecBaseAsInput ||
19598                   [](const std::optional<TTI::ShuffleKind> &SK) {
19602           none_of(ExtractMask, [&](int I) { return I >= EMSz; }) &&
19604          (!GatherShuffles.empty() &&
19606                  [](const std::optional<TTI::ShuffleKind> &SK) {
19610           none_of(Mask, [&](int I) { return I >= MSz; }) &&
19612     bool EnoughConstsForShuffle =
19622         (!IsIdentityShuffle ||
19623          (GatheredScalars.size() == 2 &&
19631     for (int I = 0, Sz = GatheredScalars.size(); I < Sz; ++I) {
19632       if (EnoughConstsForShuffle && isa<Constant>(GatheredScalars[I]))
19640       TryPackScalars(GatheredScalars, BVMask, true);
19641       Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.size());
19642       ShuffleBuilder.add(BV, BVMask);
19646         (IsSingleShuffle && ((IsIdentityShuffle &&
19649       Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
19652       Res = ShuffleBuilder.finalize(
19653           E->ReuseShuffleIndices, SubVectors, SubVectorsMask, E->Scalars.size(),
19655 bool IsSplat = isSplat(NonConstants);
19656 SmallVector<int> BVMask(Mask.size(), PoisonMaskElem);
19657 TryPackScalars(NonConstants, BVMask, false);
19658 auto CheckIfSplatIsProfitable = [&]() {
19661 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
19662 Value *V = *find_if_not(NonConstants, IsaPred<UndefValue>);
19663 if (isa<ExtractElementInst>(V) || isVectorized(V))
19665 InstructionCost SplatCost = TTI->getVectorInstrCost(
19666 Instruction::InsertElement, VecTy, CostKind, 0,
19667 PoisonValue::get(VecTy), V);
19668 SmallVector<int> NewMask(Mask.begin(), Mask.end());
19669 for (auto [Idx, I] : enumerate(BVMask))
19670 if (I != PoisonMaskElem)
19671 NewMask[Idx] = Mask.size();
19672 SplatCost += ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy,
19673 NewMask, CostKind);
19674 InstructionCost BVCost = TTI->getVectorInstrCost(
19675 Instruction::InsertElement, VecTy, CostKind,
19676 *find_if(Mask, not_equal_to(PoisonMaskElem)), Vec, V);
19678 if (count(BVMask, PoisonMaskElem) <
19679 static_cast<int>(BVMask.size() - 1)) {
19680 SmallVector<int> NewMask(Mask.begin(), Mask.end());
19681 for (auto [Idx, I] : enumerate(BVMask))
19682 if (I != PoisonMaskElem)
19684 BVCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
19685 VecTy, NewMask, CostKind);
19687 return SplatCost <= BVCost;
19689 if (!IsSplat || Mask.size() <= 2 || !CheckIfSplatIsProfitable()) {
19693 Vec = ShuffleBuilder.gather(NonConstants, Mask.size(), Vec);
19699       Value *BV = ShuffleBuilder.gather(Values, BVMask.size());
19702         return I == PoisonMaskElem ? PoisonMaskElem : 0;
19705       BV = CreateShuffle(BV, nullptr, SplatMask);
19708           Mask[Idx] = BVMask.size() + Idx;
19709       Vec = CreateShuffle(Vec, BV, Mask);
19718     TryPackScalars(GatheredScalars, ReuseMask, true);
19719     Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.size());
19720     ShuffleBuilder.add(BV, ReuseMask);
19721     Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
19726     for (auto [I, V] : enumerate(GatheredScalars)) {
19730     Value *BV = ShuffleBuilder.gather(GatheredScalars);
19731     ShuffleBuilder.add(BV, Mask);
19732     Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
19737     Res = ShuffleBuilder.createFreeze(Res);
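// vectorizeTree(TreeEntry *) dispatches on the node state: gather nodes go
// through createBuildVector, SplitVectorize nodes vectorize their two halves
// and recombine them with a shuffle of the split mask, and vectorizable
// nodes are emitted per opcode in the switch below, with FinalShuffle
// applying reorder and reuse masks to each produced value.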
19741Value *BoUpSLP::createBuildVector(const TreeEntry *E, Type *ScalarTy) {
19742   for (auto [EIdx, _] : E->CombinedEntriesWithIndices)
19744   return processBuildVector<ShuffleInstructionBuilder, Value *>(E, ScalarTy,
19752   for (Value *V : VL)
19765   IRBuilderBase::InsertPointGuard Guard(Builder);
19767   Value *V = E->Scalars.front();
19768   Type *ScalarTy = V->getType();
19771   auto It = MinBWs.find(E);
19772   if (It != MinBWs.end()) {
19778   if (E->VectorizedValue)
19779     return E->VectorizedValue;
19781   if (E->isGather() || TransformedToGatherNodes.contains(E)) {
19783     if (E->hasState() && E->Idx == 0 && !UserIgnoreList)
19784       setInsertPointAfterBundle(E);
19785     Value *Vec = createBuildVector(E, ScalarTy);
19786     E->VectorizedValue = Vec;
19789   if (E->State == TreeEntry::SplitVectorize) {
19790     assert(E->CombinedEntriesWithIndices.size() == 2 &&
19791            "Expected exactly 2 combined entries.");
19792     setInsertPointAfterBundle(E);
19794         *VectorizableTree[E->CombinedEntriesWithIndices.front().first];
19796                ArrayRef(E->Scalars).take_front(OpTE1.getVectorFactor())) &&
19797            "Expected same first part of scalars.");
19800         *VectorizableTree[E->CombinedEntriesWithIndices.back().first];
19802         OpTE2.isSame(ArrayRef(E->Scalars).take_back(OpTE2.getVectorFactor())) &&
19803            "Expected same second part of scalars.");
19805     auto GetOperandSignedness = [&](const TreeEntry *OpE) {
19806       bool IsSigned = false;
19807       auto It = MinBWs.find(OpE);
19808       if (It != MinBWs.end())
19809         IsSigned = It->second.second;
19812         if (isa<PoisonValue>(V))
19814         return !isKnownNonNegative(R, SimplifyQuery(*DL));
19821       Op1 = Builder.CreateIntCast(
19826           GetOperandSignedness(&OpTE1));
19831       Op2 = Builder.CreateIntCast(
19836           GetOperandSignedness(&OpTE2));
19838     if (E->ReorderIndices.empty()) {
19842                 std::next(Mask.begin(), E->CombinedEntriesWithIndices.back().second),
19845       if (ScalarTyNumElements != 1) {
19849         Value *Vec = Builder.CreateShuffleVector(Op1, Mask);
19851                       E->CombinedEntriesWithIndices.back().second *
19852                           ScalarTyNumElements);
19853         E->VectorizedValue = Vec;
19856       unsigned CommonVF =
19857           std::max(OpTE1.getVectorFactor(), OpTE2.getVectorFactor());
19860         std::iota(Mask.begin(), std::next(Mask.begin(), OpTE1.getVectorFactor()),
19862         Op1 = Builder.CreateShuffleVector(Op1, Mask);
19866         std::iota(Mask.begin(), std::next(Mask.begin(), OpTE2.getVectorFactor()),
19868         Op2 = Builder.CreateShuffleVector(Op2, Mask);
19870     Value *Vec = Builder.CreateShuffleVector(Op1, Op2, E->getSplitMask());
19871     E->VectorizedValue = Vec;
19875   bool IsReverseOrder =
19877   auto FinalShuffle = [&](Value *V, const TreeEntry *E) {
19879     if (E->getOpcode() == Instruction::Store &&
19880         E->State == TreeEntry::Vectorize) {
19881       ArrayRef<int> Mask =
19882           ArrayRef(reinterpret_cast<const int *>(E->ReorderIndices.begin()),
19883                    E->ReorderIndices.size());
19884       ShuffleBuilder.add(V, Mask);
19885     } else if ((E->State == TreeEntry::StridedVectorize && IsReverseOrder) ||
19886                E->State == TreeEntry::CompressVectorize) {
19887       ShuffleBuilder.addOrdered(V, {});
19889       ShuffleBuilder.addOrdered(V, E->ReorderIndices);
19892         E->CombinedEntriesWithIndices.size());
19894         E->CombinedEntriesWithIndices, SubVectors.begin(), [&](const auto &P) {
19895           return std::make_pair(VectorizableTree[P.first].get(), P.second);
19898            (E->CombinedEntriesWithIndices.empty() || E->ReorderIndices.empty()) &&
19899            "Expected either combined subnodes or reordering");
19900     return ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors, {});
19903   assert(!E->isGather() && "Unhandled state");
19904   unsigned ShuffleOrOp =
19905       E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
19907   auto GetOperandSignedness = [&](unsigned Idx) {
19908     const TreeEntry *OpE = getOperandEntry(E, Idx);
19909     bool IsSigned = false;
19910     auto It = MinBWs.find(OpE);
19911     if (It != MinBWs.end())
19912       IsSigned = It->second.second;
19915       if (isa<PoisonValue>(V))
19917       return !isKnownNonNegative(R, SimplifyQuery(*DL));
19921   switch (ShuffleOrOp) {
19922   case Instruction::PHI: {
19923     assert((E->ReorderIndices.empty() || !E->ReuseShuffleIndices.empty() ||
19924             E != VectorizableTree.front().get() || E->UserTreeIndex) &&
19925            "PHI reordering is free.");
19927     Builder.SetInsertPoint(PH->getParent(),
19928                            PH->getParent()->getFirstNonPHIIt());
19930     PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
19934     Builder.SetInsertPoint(PH->getParent(),
19935                            PH->getParent()->getFirstInsertionPt());
19938     V = FinalShuffle(V, E);
19940     E->VectorizedValue = V;
19947     SmallPtrSet<BasicBlock *, 4> VisitedBBs;
19954         LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
19958       if (!VisitedBBs.insert(IBB).second) {
19961         TreeEntry *OpTE = getOperandEntry(E, I);
19962         assert(!OpTE->VectorizedValue && "Expected no vectorized value.");
19963         OpTE->VectorizedValue = VecOp;
19969       Value *Vec = vectorizeOperand(E, I);
19970       if (VecTy != Vec->getType()) {
19972                 MinBWs.contains(getOperandEntry(E, I))) &&
19973                "Expected item in MinBWs.");
19974         Vec = Builder.CreateIntCast(Vec, VecTy, GetOperandSignedness(I));
19980            "Invalid number of incoming values");
19981     assert(E->VectorizedValue && "Expected vectorized value.");
19982     return E->VectorizedValue;
19985   case Instruction::ExtractElement: {
19986     Value *V = E->getSingleOperand(0);
19987     setInsertPointAfterBundle(E);
19988     V = FinalShuffle(V, E);
19989     E->VectorizedValue = V;
19992   case Instruction::ExtractValue: {
19994     Builder.SetInsertPoint(LI);
19995     Value *Ptr = LI->getPointerOperand();
19996     LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign());
19998     NewV = FinalShuffle(NewV, E);
19999     E->VectorizedValue = NewV;
20002   case Instruction::InsertElement: {
20003     assert(E->ReuseShuffleIndices.empty() && "All inserts should be unique");
20004     if (const TreeEntry *OpE = getOperandEntry(E, 1);
20005         OpE && !OpE->isGather() && OpE->hasState() &&
20006         !OpE->hasCopyableElements())
20009     setInsertPointAfterBundle(E);
20010     Value *V = vectorizeOperand(E, 1);
20012     Type *ScalarTy = Op.front()->getType();
20015       std::pair<unsigned, bool> Res = MinBWs.lookup(getOperandEntry(E, 1));
20016       assert(Res.first > 0 && "Expected item in MinBWs.");
20017       V = Builder.CreateIntCast(
20027       return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
20029     const unsigned NumElts =
20031     const unsigned NumScalars = E->Scalars.size();
20034     assert(Offset < NumElts && "Failed to find vector index offset");
20037     SmallVector<int> Mask;
20038     if (!E->ReorderIndices.empty()) {
20043       std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0);
20046     bool IsIdentity = true;
20048     Mask.swap(PrevMask);
20049     for (unsigned I = 0; I < NumScalars; ++I) {
20052       IsIdentity &= InsertIdx - Offset == I;
20055     if (!IsIdentity || NumElts != NumScalars) {
20056       Value *V2 = nullptr;
20057       bool IsVNonPoisonous =
20059       SmallVector<int> InsertMask(Mask);
20060       if (NumElts != NumScalars && Offset == 0) {
20069           InsertMask[*InsertIdx] = *InsertIdx;
20075         SmallBitVector UseMask =
20076             buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
20077         SmallBitVector IsFirstPoison =
20079         SmallBitVector IsFirstUndef =
20081         if (!IsFirstPoison.all()) {
20083           for (unsigned I = 0; I < NumElts; I++) {
20085                 IsFirstUndef.test(I)) {
20086               if (IsVNonPoisonous) {
20087                 InsertMask[I] = I < NumScalars ? I : 0;
20092                 if (Idx >= NumScalars)
20093                   Idx = NumScalars - 1;
20094                 InsertMask[I] = NumScalars + Idx;
20107           V = Builder.CreateShuffleVector(V, V2, InsertMask);
20109             GatherShuffleExtractSeq.insert(I);
20110             CSEBlocks.insert(I->getParent());
20115         for (unsigned I = 0; I < NumElts; I++) {
20119         SmallBitVector UseMask =
20120             buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
20121         SmallBitVector IsFirstUndef =
20123         if ((!IsIdentity || Offset != 0 || !IsFirstUndef.all()) &&
20124             NumElts != NumScalars) {
20125           if (IsFirstUndef.all()) {
20127             SmallBitVector IsFirstPoison =
20129             if (!IsFirstPoison.all()) {
20130               for (unsigned I = 0; I < NumElts; I++) {
20132                   InsertMask[I] = I + NumElts;
20135               V = Builder.CreateShuffleVector(
20141                 GatherShuffleExtractSeq.insert(I);
20142                 CSEBlocks.insert(I->getParent());
20146             SmallBitVector IsFirstPoison =
20148             for (unsigned I = 0; I < NumElts; I++) {
20152                 InsertMask[I] += NumElts;
20154             V = Builder.CreateShuffleVector(
20155                 FirstInsert->getOperand(0), V, InsertMask,
20158               GatherShuffleExtractSeq.insert(I);
20159               CSEBlocks.insert(I->getParent());
20164     ++NumVectorInstructions;
20165     E->VectorizedValue = V;
20168   case Instruction::ZExt:
20169   case Instruction::SExt:
20170   case Instruction::FPToUI:
20171   case Instruction::FPToSI:
20172   case Instruction::FPExt:
20173   case Instruction::PtrToInt:
20174   case Instruction::IntToPtr:
20175   case Instruction::SIToFP:
20176   case Instruction::UIToFP:
20177   case Instruction::Trunc:
20178   case Instruction::FPTrunc:
20179   case Instruction::BitCast: {
20180     setInsertPointAfterBundle(E);
20182     Value *InVec = vectorizeOperand(E, 0);
20187     auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
20189         (SrcIt != MinBWs.end() || It != MinBWs.end() ||
20192       unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy);
20193       if (SrcIt != MinBWs.end())
20194         SrcBWSz = SrcIt->second.first;
20195       unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType());
20196       if (BWSz == SrcBWSz) {
20197         VecOpcode = Instruction::BitCast;
20198       } else if (BWSz < SrcBWSz) {
20199         VecOpcode = Instruction::Trunc;
20200       } else if (It != MinBWs.end()) {
20201         assert(BWSz > SrcBWSz && "Invalid cast!");
20202         VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
20203       } else if (SrcIt != MinBWs.end()) {
20204         assert(BWSz > SrcBWSz && "Invalid cast!");
20206             SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
20208     } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
20209                !SrcIt->second.second) {
20210       VecOpcode = Instruction::UIToFP;
20212     Value *V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
20214                    : Builder.CreateCast(VecOpcode, InVec, VecTy);
20215     V = FinalShuffle(V, E);
20217     E->VectorizedValue = V;
20218     ++NumVectorInstructions;
20221   case Instruction::FCmp:
20222   case Instruction::ICmp: {
20223     setInsertPointAfterBundle(E);
20225     Value *L = vectorizeOperand(E, 0);
20226     Value *R = vectorizeOperand(E, 1);
20227     if (L->getType() != R->getType()) {
20230               MinBWs.contains(getOperandEntry(E, 0)) ||
20231               MinBWs.contains(getOperandEntry(E, 1))) &&
20232              "Expected item in MinBWs.");
20237               ->getIntegerBitWidth()) {
20238         Type *CastTy = R->getType();
20239         L = Builder.CreateIntCast(L, CastTy, GetOperandSignedness(0));
20241         Type *CastTy = L->getType();
20242         R = Builder.CreateIntCast(R, CastTy, GetOperandSignedness(1));
20247     Value *V = Builder.CreateCmp(P0, L, R);
20250       ICmp->setSameSign(false);
20253     V = FinalShuffle(V, E);
20255     E->VectorizedValue = V;
20256     ++NumVectorInstructions;
20259   case Instruction::Select: {
20260     setInsertPointAfterBundle(E);
20263     Value *True = vectorizeOperand(E, 1);
20264     Value *False = vectorizeOperand(E, 2);
20268               MinBWs.contains(getOperandEntry(E, 1)) ||
20269               MinBWs.contains(getOperandEntry(E, 2))) &&
20270              "Expected item in MinBWs.");
20271       if (True->getType() != VecTy)
20272         True = Builder.CreateIntCast(True, VecTy, GetOperandSignedness(1));
20273       if (False->getType() != VecTy)
20274         False = Builder.CreateIntCast(False, VecTy, GetOperandSignedness(2));
20279     assert(TrueNumElements >= CondNumElements &&
20280            TrueNumElements % CondNumElements == 0 &&
20281            "Cannot vectorize Instruction::Select");
20283            "Cannot vectorize Instruction::Select");
20284     if (CondNumElements != TrueNumElements) {
20287       Cond = Builder.CreateShuffleVector(
20292            "Cannot vectorize Instruction::Select");
20294         Builder.CreateSelectWithUnknownProfile(Cond, True, False, DEBUG_TYPE);
20295     V = FinalShuffle(V, E);
20297     E->VectorizedValue = V;
20298     ++NumVectorInstructions;
20301   case Instruction::FNeg: {
20302     setInsertPointAfterBundle(E);
20304     Value *Op = vectorizeOperand(E, 0);
20306     Value *V = Builder.CreateUnOp(
20312     V = FinalShuffle(V, E);
20314     E->VectorizedValue = V;
20315     ++NumVectorInstructions;
20319   case Instruction::Freeze: {
20320     setInsertPointAfterBundle(E);
20322     Value *Op = vectorizeOperand(E, 0);
20324     if (Op->getType() != VecTy) {
20326               MinBWs.contains(getOperandEntry(E, 0))) &&
20327              "Expected item in MinBWs.");
20328       Op = Builder.CreateIntCast(Op, VecTy, GetOperandSignedness(0));
20330     Value *V = Builder.CreateFreeze(Op);
20331     V = FinalShuffle(V, E);
20333     E->VectorizedValue = V;
20334     ++NumVectorInstructions;
20338   case Instruction::Add:
20339   case Instruction::FAdd:
20340   case Instruction::Sub:
20341   case Instruction::FSub:
20342   case Instruction::Mul:
20343   case Instruction::FMul:
20344   case Instruction::UDiv:
20345   case Instruction::SDiv:
20346   case Instruction::FDiv:
20347   case Instruction::URem:
20348   case Instruction::SRem:
20349   case Instruction::FRem:
20350   case Instruction::Shl:
20351   case Instruction::LShr:
20352   case Instruction::AShr:
20353   case Instruction::And:
20354   case Instruction::Or:
20355   case Instruction::Xor: {
20356     setInsertPointAfterBundle(E);
20360     if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
20365           return CI && CI->getValue().countr_one() >= It->second.first;
20367         V = FinalShuffle(I == 0 ? RHS : LHS, E);
20368         E->VectorizedValue = V;
20369         ++NumVectorInstructions;
20377               MinBWs.contains(getOperandEntry(E, 0)) ||
20378               MinBWs.contains(getOperandEntry(E, 1))) &&
20379              "Expected item in MinBWs.");
20381         LHS = Builder.CreateIntCast(LHS, VecTy, GetOperandSignedness(0));
20383         RHS = Builder.CreateIntCast(RHS, VecTy, GetOperandSignedness(1));
20386     Value *V = Builder.CreateBinOp(
20393     if (!MinBWs.contains(E) && ShuffleOrOp == Instruction::Sub &&
20395           return isa<PoisonValue>(V) ||
20396                  (E->hasCopyableElements() && E->isCopyableElement(V)) ||
20397                  isCommutative(cast<Instruction>(V));
20399         I->setHasNoUnsignedWrap(false);
20402     V = FinalShuffle(V, E);
20404     E->VectorizedValue = V;
20405     ++NumVectorInstructions;
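  // The Load case below selects between four strategies: a plain wide load
  // (Vectorize), a wide or masked load followed by a compressing shuffle
  // (CompressVectorize), llvm.experimental.vp.strided.load with the stride
  // expanded via SCEV when only a symbolic stride is known
  // (StridedVectorize), and a masked gather of a vector of pointers
  // (ScatterVectorize).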
20409 case Instruction::Load: {
20412 setInsertPointAfterBundle(E);
20416 FixedVectorType *StridedLoadTy = nullptr;
20417 Value *PO = LI->getPointerOperand();
20418 if (E->State == TreeEntry::Vectorize) {
20419 NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign());
20420 } else if (E->State == TreeEntry::CompressVectorize) {
20421 auto [CompressMask, LoadVecTy, InterleaveFactor, IsMasked] =
20422 CompressEntryToData.at(E);
20423 Align CommonAlignment = LI->getAlign();
20429 for (int I : CompressMask)
20433 MaskValues = replicateMask(MaskValues, VecTy->getNumElements());
20436 NewLI = Builder.CreateMaskedLoad(LoadVecTy, PO, CommonAlignment,
20439 NewLI = Builder.CreateAlignedLoad(LoadVecTy, PO, CommonAlignment);
20450 } else if (E->State == TreeEntry::StridedVectorize) {
20453 PO = IsReverseOrder ? PtrN : Ptr0;
20454 Type *StrideTy = DL->getIndexType(PO->getType());
20456 const StridedPtrInfo &SPtrInfo = TreeEntryToStridedPtrInfoMap.at(E);
20457 StridedLoadTy = SPtrInfo.Ty;
20458 assert(StridedLoadTy && "Missing StridedPointerInfo for tree entry.");
20459 unsigned StridedLoadEC =
20462 Value *Stride = SPtrInfo.StrideVal;
20464 const SCEV *StrideSCEV = SPtrInfo.StrideSCEV;
20465 assert(StrideSCEV && "Neither StrideVal nor StrideSCEV were set.");
20466 SCEVExpander Expander(*SE, "strided-load-vec");
20467 Stride = Expander.expandCodeFor(StrideSCEV, StrideSCEV->getType(),
20468 &*Builder.GetInsertPoint());
20471 Builder.CreateIntCast(Stride, StrideTy, true);
20472 StrideVal = Builder.CreateMul(
20474 StrideTy, (IsReverseOrder ? -1 : 1) *
20476 DL->getTypeAllocSize(ScalarTy))));
20478 auto *Inst = Builder.CreateIntrinsic(
20479 Intrinsic::experimental_vp_strided_load,
20480 {StridedLoadTy, PO->getType(), StrideTy},
20483 Builder.getInt32(StridedLoadEC)});
20484 Inst->addParamAttr(
20486 Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
20489 assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
20490 Value *VecPtr = vectorizeOperand(E, 0);
20495 unsigned ScalarTyNumElements =
20497 unsigned VecTyNumElements =
20499 assert(VecTyNumElements % ScalarTyNumElements == 0 &&
20500 "Cannot expand getelementptr.");
20501 unsigned VF = VecTyNumElements / ScalarTyNumElements;
20504 return Builder.getInt64(I % ScalarTyNumElements);
20506 VecPtr = Builder.CreateGEP(
20507 VecTy->getElementType(),
20508 Builder.CreateShuffleVector(
20514 NewLI = Builder.CreateMaskedGather(VecTy, VecPtr, CommonAlignment);
20516 Value *V = E->State == TreeEntry::CompressVectorize
20520 if (StridedLoadTy != VecTy)
20521 V = Builder.CreateBitOrPointerCast(V, VecTy);
20522 V = FinalShuffle(V, E);
20523 E->VectorizedValue = V;
20524 ++NumVectorInstructions;
20524 ++NumVectorInstructions;
20527 case Instruction::Store: {
20530 setInsertPointAfterBundle(E);
20532 Value *VecValue = vectorizeOperand(E, 0);
20533 if (VecValue->getType() != VecTy)
20535 Builder.CreateIntCast(VecValue, VecTy, GetOperandSignedness(0));
20536 VecValue = FinalShuffle(VecValue, E);
20538 Value *Ptr = SI->getPointerOperand();
20540 if (E->State == TreeEntry::Vectorize) {
20541 ST = Builder.CreateAlignedStore(VecValue, Ptr, SI->getAlign());
20543 assert(E->State == TreeEntry::StridedVectorize &&
20544 "Expected either strided or consecutive stores.");
20545 if (!E->ReorderIndices.empty()) {
20547 Ptr = SI->getPointerOperand();
20550 Type *StrideTy = DL->getIndexType(SI->getPointerOperandType());
20551 auto *Inst = Builder.CreateIntrinsic(
20552 Intrinsic::experimental_vp_strided_store,
20553 {VecTy, Ptr->getType(), StrideTy},
20556 StrideTy, -static_cast<int>(DL->getTypeAllocSize(ScalarTy))),
20557 Builder.getAllOnesMask(VecTy->getElementCount()),
20558 Builder.getInt32(E->Scalars.size())});
20559 Inst->addParamAttr(
20561 Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
20567 E->VectorizedValue = V;
20568 ++NumVectorInstructions;
20568 ++NumVectorInstructions;
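// GEPs: vectorize the pointer operand and every index operand, then emit a
// single vector GEP over the common source element type.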
20571 case Instruction::GetElementPtr: {
20573 setInsertPointAfterBundle(E);
20575 Value *Op0 = vectorizeOperand(E, 0);
20578 for (int J = 1, N = GEP0->getNumOperands(); J < N; ++J) {
20579 Value *OpVec = vectorizeOperand(E, J);
20583 Value *V = Builder.CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs);
20586 for (Value *V : E->Scalars) {
20593 V = FinalShuffle(V, E);
20595 E->VectorizedValue = V;
20596 ++NumVectorInstructions;
20596 ++NumVectorInstructions;
20600 case Instruction::Call: {
20602 setInsertPointAfterBundle(E);
20607 CI, ID, VecTy->getNumElements(),
20608 It != MinBWs.end() ? It->second.first : 0, TTI);
20611 VecCallCosts.first <= VecCallCosts.second;
20613 Value *ScalarArg = nullptr;
20624 ScalarArg = CEI->getArgOperand(I);
20627 if (ID == Intrinsic::abs && It != MinBWs.end() &&
20628 It->second.first < DL->getTypeSizeInBits(CEI->getType()))
20629 ScalarArg = Builder.getFalse();
20636 Value *OpVec = vectorizeOperand(E, I);
20637 ScalarArg = CEI->getArgOperand(I);
20640 It == MinBWs.end()) {
20643 OpVec = Builder.CreateIntCast(OpVec, CastTy, GetOperandSignedness(I));
20644 } else if (It != MinBWs.end()) {
20645 OpVec = Builder.CreateIntCast(OpVec, VecTy, GetOperandSignedness(I));
20654 if (!UseIntrinsic) {
20659 CF = VFDatabase(*CI).getVectorizedFunction(Shape);
20666 Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);
20669 V = FinalShuffle(V, E);
20671 E->VectorizedValue = V;
20672 ++NumVectorInstructions;
20672 ++NumVectorInstructions;
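// ShuffleVector covers both real shuffles and alternate-opcode nodes: the
// latter emit the main and the alternate operation over all lanes and blend
// the two results with the mask built by buildAltOpShuffleMask below.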
20675 case Instruction::ShuffleVector: {
20678 setInsertPointAfterBundle(E);
20679 Value *Src = vectorizeOperand(E, 0);
20682 SmallVector<int> NewMask(ThisMask.size());
20684 return SVSrc->getShuffleMask()[Mask];
20686 V = Builder.CreateShuffleVector(SVSrc->getOperand(0),
20687 SVSrc->getOperand(1), NewMask);
20689 V = Builder.CreateShuffleVector(Src, ThisMask);
20694 V = FinalShuffle(V, E);
20702 "Invalid Shuffle Vector Operand");
20706 setInsertPointAfterBundle(E);
20707 LHS = vectorizeOperand(E, 0);
20708 RHS = vectorizeOperand(E, 1);
20710 setInsertPointAfterBundle(E);
20711 LHS = vectorizeOperand(E, 0);
20717 assert((It != MinBWs.end() ||
20718 getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
20719 getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
20720 MinBWs.contains(getOperandEntry(E, 0)) ||
20721 MinBWs.contains(getOperandEntry(E, 1))) &&
20722 "Expected item in MinBWs.");
20723 Type *CastTy = VecTy;
20729 ->getIntegerBitWidth())
20735 LHS = Builder.CreateIntCast(LHS, CastTy, GetOperandSignedness(0));
20737 RHS = Builder.CreateIntCast(RHS, CastTy, GetOperandSignedness(1));
20742 V0 = Builder.CreateBinOp(
20744 V1 = Builder.CreateBinOp(
20747 V0 = Builder.CreateCmp(CI0->getPredicate(), LHS, RHS);
20750 V1 = Builder.CreateCmp(AltPred, LHS, RHS);
20753 unsigned SrcBWSz = DL->getTypeSizeInBits(
20755 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
20756 if (BWSz <= SrcBWSz) {
20757 if (BWSz < SrcBWSz)
20758 LHS = Builder.CreateIntCast(LHS, VecTy, It->second.first);
20760 "Expected same type as operand.");
20764 E->VectorizedValue = LHS;
20765 ++NumVectorInstructions;
20769 V0 = Builder.CreateCast(
20771 V1 = Builder.CreateCast(
20776 for (Value *V : {V0, V1}) {
20778 GatherShuffleExtractSeq.insert(I);
20779 CSEBlocks.insert(I->getParent());
20787 SmallVector<int> Mask;
20788 E->buildAltOpShuffleMask(
20789 [E, this](Instruction *I) {
20790 assert(E->getMatchingMainOpOrAltOp(I) &&
20791 "Unexpected main/alternate opcode");
20795 Mask, &OpScalars, &AltScalars);
20799 auto DropNuwFlag = [&](Value *Vec, unsigned Opcode) {
20802 I && Opcode == Instruction::Sub && !MinBWs.contains(E) &&
20804 if (isa<PoisonValue>(V))
20806 if (E->hasCopyableElements() && E->isCopyableElement(V))
20808 auto *IV = cast<Instruction>(V);
20809 return IV->getOpcode() == Instruction::Sub && isCommutative(IV);
20811 I->setHasNoUnsignedWrap(false);
20813 DropNuwFlag(V0, E->getOpcode());
20814 DropNuwFlag(V1, E->getAltOpcode());
20820 V = Builder.CreateShuffleVector(V0, V1, Mask);
20823 GatherShuffleExtractSeq.insert(I);
20824 CSEBlocks.insert(I->getParent());
20828 E->VectorizedValue = V;
20829 ++NumVectorInstructions;
20829 ++NumVectorInstructions;
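// Final codegen over the whole tree: schedule each block's region first,
// then emit gathered loads, gather nodes and postponed (PHI-user) entries at
// insertion points that dominate all their uses.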
20847 ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales) {
20850 EntryToLastInstruction.clear();
20852 for (auto &BSIter : BlocksSchedules)
20853 scheduleBlock(*this, BSIter.second.get());
20856 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
20857 if (TE->isGather() || DeletedNodes.contains(TE.get()))
20859 (void)getLastInstructionInBundle(TE.get());
20863 Builder.SetInsertPoint(ReductionRoot->getParent(),
20866 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
20870 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
20871 if (DeletedNodes.contains(TE.get()))
20873 if (TE->isGather() && !TE->VectorizedValue && TE->UserTreeIndex.UserTE &&
20874 TE->UserTreeIndex.UserTE->hasState() &&
20875 TE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
20876 (TE->UserTreeIndex.UserTE->getOpcode() != Instruction::PHI ||
20877 TE->UserTreeIndex.UserTE->isAltShuffle()) &&
20878 !TE->UserTreeIndex.UserTE->hasCopyableElements() &&
20879 all_of(TE->UserTreeIndex.UserTE->Scalars,
20880 [](Value *V) { return isUsedOutsideBlock(V); })) {
20882 getLastInstructionInBundle(TE->UserTreeIndex.UserTE);
20886 for (auto &Entry : GatherEntries) {
20888 Builder.SetInsertPoint(Entry.second);
20889 Builder.SetCurrentDebugLocation(Entry.second->getDebugLoc());
20894 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
20895 if (DeletedNodes.contains(TE.get()))
20897 if (GatheredLoadsEntriesFirst.has_value() &&
20898 TE->Idx >= *GatheredLoadsEntriesFirst && !TE->VectorizedValue &&
20899 (!TE->isGather() || TE->UserTreeIndex)) {
20900 assert((TE->UserTreeIndex ||
20901 (TE->getOpcode() == Instruction::Load && !TE->isGather())) &&
20902 "Expected gathered load node.");
20911 for (const TreeEntry *E : PostponedNodes) {
20912 auto *TE = const_cast<TreeEntry *>(E);
20914 TE->VectorizedValue = nullptr;
20925 (TE->UserTreeIndex.UserTE->hasState() &&
20926 TE->UserTreeIndex.UserTE->getOpcode() == Instruction::PHI)) {
20935 if (UI->comesBefore(InsertPt))
20938 Builder.SetInsertPoint(InsertPt);
20940 Builder.SetInsertPoint(PrevVec);
20942 Builder.SetCurrentDebugLocation(UserI->getDebugLoc());
20945 VecI && VecI->getParent() == Builder.GetInsertBlock() &&
20946 Builder.GetInsertPoint()->comesBefore(VecI))
20947 VecI->moveBeforePreserving(*Builder.GetInsertBlock(),
20948 Builder.GetInsertPoint());
20949 if (Vec->getType() != PrevVec->getType()) {
20951 PrevVec->getType()->isIntOrIntVectorTy() &&
20952 "Expected integer vector types only.");
20953 std::optional<bool> IsSigned;
20954 for (Value *V : TE->Scalars) {
20956 for (const TreeEntry *MNTE : getTreeEntries(V)) {
20957 auto It = MinBWs.find(MNTE);
20958 if (It != MinBWs.end()) {
20959 IsSigned = IsSigned.value_or(false) || It->second.second;
20964 if (IsSigned.value_or(false))
20967 for (const TreeEntry *BVE : ValueToGatherNodes.lookup(V)) {
20968 auto It = MinBWs.find(BVE);
20969 if (It != MinBWs.end()) {
20970 IsSigned = IsSigned.value_or(false) || It->second.second;
20975 if (IsSigned.value_or(false))
20979 IsSigned.value_or(false) ||
20983 if (IsSigned.value_or(false))
20987 if (IsSigned.value_or(false)) {
20989 auto It = MinBWs.find(TE->UserTreeIndex.UserTE);
20990 if (It != MinBWs.end())
20991 IsSigned = It->second.second;
20994 "Expected user node or perfect diamond match in MinBWs.");
20995 Vec = Builder.CreateIntCast(Vec, PrevVec->getType(), *IsSigned);
20997 PrevVec->replaceAllUsesWith(Vec);
20998 PostponedValues.try_emplace(Vec).first->second.push_back(TE);
21001 auto It = PostponedValues.find(PrevVec);
21002 if (It != PostponedValues.end()) {
21003 for (TreeEntry *VTE : It->getSecond())
21004 VTE->VectorizedValue = Vec;
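// Extract each externally used scalar from its vector lane, caching one
// extractelement per basic block and casting back to the original width for
// values demoted via MinBWs.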
21024 for (const auto &ExternalUse : ExternalUses) {
21025 Value *Scalar = ExternalUse.Scalar;
21032 const TreeEntry *E = &ExternalUse.E;
21033 assert(E && "Invalid scalar");
21034 assert(!E->isGather() && "Extracting from a gather list");
21036 if (E->getOpcode() == Instruction::GetElementPtr &&
21040 Value *Vec = E->VectorizedValue;
21041 assert(Vec && "Can't find vectorizable value");
21043 Value *Lane = Builder.getInt32(ExternalUse.Lane);
21044 auto ExtractAndExtendIfNeeded = [&](Value *Vec) {
21045 if (Scalar->getType() != Vec->getType()) {
21046 Value *Ex = nullptr;
21047 Value *ExV = nullptr;
21049 bool ReplaceInst = Inst && ExternalUsesAsOriginalScalar.contains(Inst);
21050 auto It = ScalarToEEs.find(Scalar);
21051 if (It != ScalarToEEs.end()) {
21054 auto EEIt = It->second.find(ReplaceInst ? Inst->getParent()
21055 : Builder.GetInsertBlock());
21056 if (EEIt != It->second.end()) {
21057 Value *PrevV = EEIt->second.first;
21059 I && !ReplaceInst &&
21060 Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() &&
21061 Builder.GetInsertPoint()->comesBefore(I)) {
21062 I->moveBefore(*Builder.GetInsertPoint()->getParent(),
21063 Builder.GetInsertPoint());
21068 ExV = EEIt->second.second ? EEIt->second.second : Ex;
21077 IgnoredExtracts.insert(EE);
21080 auto *CloneInst = Inst->clone();
21081 CloneInst->insertBefore(Inst->getIterator());
21082 if (Inst->hasName())
21083 CloneInst->takeName(Inst);
21088 Value *V = ES->getVectorOperand();
21091 V = ETEs.front()->VectorizedValue;
21093 !IV || IV == Vec || IV->getParent() != IVec->getParent() ||
21094 IV->comesBefore(IVec))
21095 Ex = Builder.CreateExtractElement(V, ES->getIndexOperand());
21097 Ex = Builder.CreateExtractElement(Vec, Lane);
21098 } else if (auto *VecTy =
21101 unsigned VecTyNumElements = VecTy->getNumElements();
21106 ExternalUse.Lane * VecTyNumElements);
21108 Ex = Builder.CreateExtractElement(Vec, Lane);
21113 if (Scalar->getType() != Ex->getType())
21114 ExV = Builder.CreateIntCast(
21119 : &F->getEntryBlock(),
21120 std::make_pair(Ex, ExV));
21126 GatherShuffleExtractSeq.insert(ExI);
21127 CSEBlocks.insert(ExI->getParent());
21133 "In-tree scalar of vector type is not insertelement?");
21142 if (!ScalarsWithNullptrUser.insert(Scalar).second)
21145 (ExternallyUsedValues.count(Scalar) ||
21146 ExternalUsesWithNonUsers.count(Scalar) ||
21147 ExternalUsesAsOriginalScalar.contains(Scalar) ||
21151 if (ExternalUsesAsOriginalScalar.contains(U))
21153 ArrayRef<TreeEntry *> UseEntries = getTreeEntries(U);
21154 return !UseEntries.empty() &&
21155 (E->State == TreeEntry::Vectorize ||
21156 E->State == TreeEntry::StridedVectorize ||
21157 E->State == TreeEntry::CompressVectorize) &&
21158 any_of(UseEntries, [&, TTI = TTI](TreeEntry *UseEntry) {
21159 return (UseEntry->State == TreeEntry::Vectorize ||
21161 TreeEntry::StridedVectorize ||
21163 TreeEntry::CompressVectorize) &&
21164 doesInTreeUserNeedToExtract(
21165 Scalar, getRootEntryInstruction(*UseEntry),
21169 "Scalar with nullptr User must be registered in "
21170 "ExternallyUsedValues map or remain as scalar in vectorized "
21174 if (PHI->getParent()->isLandingPad())
21175 Builder.SetInsertPoint(
21178 PHI->getParent()->getLandingPadInst()->getIterator()));
21180 Builder.SetInsertPoint(PHI->getParent(),
21181 PHI->getParent()->getFirstNonPHIIt());
21183 Builder.SetInsertPoint(VecI->getParent(),
21184 std::next(VecI->getIterator()));
21187 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
21189 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
21191 if (Scalar != NewInst) {
21194 "Extractelements should not be replaced.");
21195 Scalar->replaceAllUsesWith(NewInst);
21205 if (!UsedInserts.insert(VU).second)
21208 auto BWIt = MinBWs.find(E);
21210 auto *ScalarTy = FTy->getElementType();
21211 auto Key = std::make_pair(Vec, ScalarTy);
21212 auto VecIt = VectorCasts.find(Key);
21213 if (VecIt == VectorCasts.end()) {
21216 if (IVec->getParent()->isLandingPad())
21217 Builder.SetInsertPoint(IVec->getParent(),
21218 std::next(IVec->getParent()
21219 ->getLandingPadInst()
21222 Builder.SetInsertPoint(
21223 IVec->getParent()->getFirstNonPHIOrDbgOrLifetime());
21225 Builder.SetInsertPoint(IVec->getNextNode());
21227 Vec = Builder.CreateIntCast(
21232 BWIt->second.second);
21235 Vec = VecIt->second;
21242 ShuffledInserts, [VU](const ShuffledInsertData<Value *> &Data) {
21249 unsigned Idx = *InsertIdx;
21250 if (It == ShuffledInserts.end()) {
21252 It = std::next(ShuffledInserts.begin(),
21253 ShuffledInserts.size() - 1);
21258 Mask[Idx] = ExternalUse.Lane;
21270 for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
21271 if (PH->getIncomingValue(I) == Scalar) {
21273 PH->getIncomingBlock(I)->getTerminator();
21275 Builder.SetInsertPoint(VecI->getParent(),
21276 std::next(VecI->getIterator()));
21278 Builder.SetInsertPoint(PH->getIncomingBlock(I)->getTerminator());
21280 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
21281 PH->setOperand(I, NewInst);
21286 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
21290 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
21291 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
21302 for (int I = 0, E = Mask.size(); I < E; ++I) {
21304 CombinedMask1[I] = Mask[I];
21306 CombinedMask2[I] = Mask[I] - VF;
21308 ShuffleInstructionBuilder ShuffleBuilder(
21310 ShuffleBuilder.add(V1, CombinedMask1);
21312 ShuffleBuilder.add(V2, CombinedMask2);
21313 return ShuffleBuilder.finalize({}, {}, {});
21316 auto &&ResizeToVF = [&CreateShuffle](Value *Vec, ArrayRef<int> Mask,
21317 bool ForSingleMask) {
21318 unsigned VF = Mask.size();
21321 if (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); })) {
21322 Vec = CreateShuffle(Vec, nullptr, Mask);
21323 return std::make_pair(Vec, true);
21325 if (!ForSingleMask) {
21327 for (unsigned I = 0; I < VF; ++I) {
21331 Vec = CreateShuffle(Vec, nullptr, ResizeMask);
21335 return std::make_pair(Vec, false);
21339 for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
21342 InsertElementInst *FirstInsert = ShuffledInserts[I].InsertElements.front();
21343 InsertElementInst *LastInsert = ShuffledInserts[I].InsertElements.back();
21344 Builder.SetInsertPoint(LastInsert);
21345 auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
21350 return cast<VectorType>(Vec->getType())
21351 ->getElementCount()
21352 .getKnownMinValue();
21355 [FirstInsert, &CreateShuffle](ArrayRef<int> Mask,
21357 assert((Vals.size() == 1 || Vals.size() == 2) &&
21358 "Expected exactly 1 or 2 input values.");
21359 if (Vals.size() == 1) {
21362 if (Mask.size() != cast<FixedVectorType>(Vals.front()->getType())
21363 ->getNumElements() ||
21364 !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
21365 return CreateShuffle(Vals.front(), nullptr, Mask);
21366 return Vals.front();
21368 return CreateShuffle(Vals.front() ? Vals.front()
21370 Vals.back(), Mask);
21372 auto It = ShuffledInserts[I].InsertElements.rbegin();
21374 InsertElementInst *II = nullptr;
21375 if (It != ShuffledInserts[I].InsertElements.rend())
21378 while (It != ShuffledInserts[I].InsertElements.rend()) {
21379 assert(II && "Must be an insertelement instruction.");
21386 for (Instruction *II : reverse(Inserts)) {
21387 II->replaceUsesOfWith(II->getOperand(0), NewInst);
21389 if (II->getParent() == NewI->getParent() && II->comesBefore(NewI))
21390 II->moveAfter(NewI);
21394 for (InsertElementInst *IE : reverse(ShuffledInserts[I].InsertElements)) {
21395 IE->replaceUsesOfWith(IE->getOperand(0),
21397 IE->replaceUsesOfWith(IE->getOperand(1),
21401 CSEBlocks.insert(LastInsert->getParent());
21406 for (auto &TEPtr : VectorizableTree) {
21407 TreeEntry *Entry = TEPtr.get();
21410 if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize ||
21411 DeletedNodes.contains(Entry) ||
21412 TransformedToGatherNodes.contains(Entry))
21415 assert(Entry->VectorizedValue && "Can't find vectorizable value");
21418 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
21421 if (Entry->getOpcode() == Instruction::GetElementPtr &&
21425 EE && IgnoredExtracts.contains(EE))
21432 for (User *U : Scalar->users()) {
21437 (UserIgnoreList && UserIgnoreList->contains(U)) ||
21440 "Deleting out-of-tree value");
21444 LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
21453 V->mergeDIAssignID(RemovedInsts);
21456 if (UserIgnoreList) {
21457 for (Instruction *I : RemovedInsts) {
21458 const TreeEntry *IE = getTreeEntries(I).front();
21460 !SplitEntries.empty() && SplitEntries.front()->Idx < IE->Idx)
21461 IE = SplitEntries.front();
21462 if (IE->Idx != 0 &&
21463 !(VectorizableTree.front()->isGather() && IE->UserTreeIndex &&
21464 (ValueToGatherNodes.lookup(I).contains(
21465 VectorizableTree.front().get()) ||
21466 (IE->UserTreeIndex.UserTE == VectorizableTree.front().get() &&
21467 IE->UserTreeIndex.EdgeIdx == UINT_MAX))) &&
21468 !(VectorizableTree.front()->State == TreeEntry::SplitVectorize &&
21469 IE->UserTreeIndex &&
21471 !(GatheredLoadsEntriesFirst.has_value() &&
21472 IE->Idx >= *GatheredLoadsEntriesFirst &&
21473 VectorizableTree.front()->isGather() &&
21475 !(!VectorizableTree.front()->isGather() &&
21476 VectorizableTree.front()->isCopyableElement(I)))
21481 bool IsPoisoningLogicalOp = isa<SelectInst>(U.getUser()) &&
21482 (match(U.getUser(), m_LogicalAnd()) ||
21483 match(U.getUser(), m_LogicalOr())) &&
21484 U.getOperandNo() == 0;
21485 if (IsPoisoningLogicalOp) {
21486 LogicalOpSelects.push_back(cast<SelectInst>(U.getUser()));
21489 return UserIgnoreList->contains(U.getUser());
21493 for (SelectInst *SI : LogicalOpSelects)
21503 Builder.ClearInsertionPoint();
21504 InstrElementSize.clear();
21506 const TreeEntry &RootTE = *VectorizableTree.front();
21507 Value *Vec = RootTE.VectorizedValue;
21508 if (auto It = MinBWs.find(&RootTE); ReductionBitWidth != 0 &&
21509 It != MinBWs.end() &&
21510 ReductionBitWidth != It->second.first) {
21511 IRBuilder<>::InsertPointGuard Guard(Builder);
21512 Builder.SetInsertPoint(ReductionRoot->getParent(),
21513 ReductionRoot->getIterator());
21514 Vec = Builder.CreateIntCast(
21516 VectorType::get(Builder.getIntNTy(ReductionBitWidth),
21518 It->second.second);
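// CSE pass over the emitted gather/shuffle sequences: loop-invariant
// sequences are hoisted into the preheader and identical (or less-defined)
// instructions in dominating blocks are merged.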
21524 LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherShuffleExtractSeq.size()
21525 << " gather sequences instructions.\n");
21532 Loop *L = LI->getLoopFor(I->getParent());
21537 BasicBlock *PreHeader = L->getLoopPreheader();
21545 auto *OpI = dyn_cast<Instruction>(V);
21546 return OpI && L->contains(OpI);
21552 CSEBlocks.insert(PreHeader);
21557 CSEWorkList.reserve(CSEBlocks.size());
21560 assert(DT->isReachableFromEntry(N));
21567 assert((A == B) == (A->getDFSNumIn() == B->getDFSNumIn()) &&
21568 "Different nodes should have different DFS numbers");
21569 return A->getDFSNumIn() < B->getDFSNumIn();
21577 auto &&IsIdenticalOrLessDefined = [TTI = TTI](Instruction *I1,
21580 if (I1->getType() != I2->getType())
21585 return I1->isIdenticalTo(I2);
21586 if (SI1->isIdenticalTo(SI2))
21588 for (int I = 0, E = SI1->getNumOperands(); I < E; ++I)
21589 if (SI1->getOperand(I) != SI2->getOperand(I))
21592 NewMask.assign(SI2->getShuffleMask().begin(), SI2->getShuffleMask().end());
21596 unsigned LastUndefsCnt = 0;
21597 for (int I = 0, E = NewMask.size(); I < E; ++I) {
21603 NewMask[I] != SM1[I])
21606 NewMask[I] = SM1[I];
21610 return SM1.size() - LastUndefsCnt > 1 &&
21614 SM1.size() - LastUndefsCnt));
21620 for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
21622 (I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
21623 "Worklist not sorted properly!");
21630 !GatherShuffleExtractSeq.contains(&In))
21635 bool Replaced = false;
21638 if (IsIdenticalOrLessDefined(&In, V, NewMask) &&
21639 DT->dominates(V->getParent(), In.getParent())) {
21640 In.replaceAllUsesWith(V);
21643 if (!NewMask.empty())
21644 SI->setShuffleMask(NewMask);
21649 GatherShuffleExtractSeq.contains(V) &&
21650 IsIdenticalOrLessDefined(V, &In, NewMask) &&
21651 DT->dominates(In.getParent(), V->getParent())) {
21653 V->replaceAllUsesWith(&In);
21656 if (!NewMask.empty())
21657 SI->setShuffleMask(NewMask);
21665 Visited.push_back(&In);
21670 GatherShuffleExtractSeq.clear();
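// buildBundle() wraps the scalars of VL into a ScheduleBundle; copyable
// elements get dedicated ScheduleCopyableData, all other members must
// already have ScheduleData in the current scheduling region.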
21673BoUpSLP::ScheduleBundle &BoUpSLP::BlockScheduling::buildBundle(
21676 ScheduledBundlesList.emplace_back(std::make_unique<ScheduleBundle>());
21677 for (Value *V : VL) {
21678 if (S.isNonSchedulable(V))
21681 if (S.isCopyableElement(V)) {
21683 ScheduleCopyableData &SD =
21684 addScheduleCopyableData(EI, I, SchedulingRegionID, *BundlePtr);
21686 BundlePtr->add(&SD);
21689 ScheduleData *BundleMember = getScheduleData(V);
21690 assert(BundleMember && "no ScheduleData for bundle member "
21691 "(maybe not in same basic block)");
21693 BundlePtr->add(BundleMember);
21694 ScheduledBundles.try_emplace(I).first->getSecond().push_back(
21697 assert(BundlePtr && *BundlePtr && "Failed to find schedule bundle");
21703std::optional<BoUpSLP::ScheduleBundle *>
21705 const InstructionsState &S,
21718 if (S.areInstructionsWithCopyableElements() && EI && EI.UserTE->hasState() &&
21719 EI.UserTE->doesNotNeedToSchedule() &&
21720 EI.UserTE->getOpcode() != Instruction::PHI &&
21722 auto *I = dyn_cast<Instruction>(V);
21723 if (!I || I->hasOneUser())
21725 for (User *U : I->users()) {
21726 auto *UI = cast<Instruction>(U);
21727 if (isa<BinaryOperator>(UI))
21732 return std::nullopt;
21733 if (S.areInstructionsWithCopyableElements() && EI && EI.UserTE->hasState() &&
21734 EI.UserTE->hasCopyableElements() &&
21735 EI.UserTE->getMainOp()->getParent() == S.getMainOp()->getParent() &&
21737 if (S.isCopyableElement(V))
21741 return std::nullopt;
21744 if (S.areInstructionsWithCopyableElements() && any_of(VL, [&](Value *V) {
21757 return std::nullopt;
21758 if (S.areInstructionsWithCopyableElements() && EI) {
21759 bool IsNonSchedulableWithParentPhiNode =
21760 EI.UserTE->doesNotNeedToSchedule() && EI.UserTE->UserTreeIndex &&
21761 EI.UserTE->UserTreeIndex.UserTE->hasState() &&
21762 EI.UserTE->UserTreeIndex.UserTE->State != TreeEntry::SplitVectorize &&
21763 EI.UserTE->UserTreeIndex.UserTE->getOpcode() == Instruction::PHI;
21764 if (IsNonSchedulableWithParentPhiNode) {
21765 SmallSet<std::pair<Value *, Value *>, 4> Values;
21766 for (const auto [Idx, V] :
21767 enumerate(EI.UserTE->UserTreeIndex.UserTE->Scalars)) {
21768 Value *Op = EI.UserTE->UserTreeIndex.UserTE->getOperand(
21769 EI.UserTE->UserTreeIndex.EdgeIdx)[Idx];
21773 if (!Values.insert(std::make_pair(V, Op)).second)
21774 return std::nullopt;
21778 bool HasCopyables = S.areInstructionsWithCopyableElements();
21780 all_of(VL, [&](Value *V) { return S.isNonSchedulable(V); }))) {
21784 SmallVector<ScheduleData *> ControlDependentMembers;
21785 for (Value *V : VL) {
21787 if (!I || (HasCopyables && S.isCopyableElement(V)))
21789 SmallDenseMap<std::pair<Instruction *, Value *>, unsigned> UserOpToNumOps;
21790 for (const Use &U : I->operands()) {
21793 .first->getSecond();
21796 Op && areAllOperandsReplacedByCopyableData(I, Op, *SLP, NumOps)) {
21797 if (ScheduleData *OpSD = getScheduleData(Op);
21798 OpSD && OpSD->hasValidDependencies())
21800 return std::nullopt;
21809 LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.getMainOp() << "\n");
21811 auto TryScheduleBundleImpl = [=](bool ReSchedule, ScheduleBundle &Bundle) {
21814 SmallVector<ScheduleData *> ControlDependentMembers;
21815 auto CheckIfNeedToClearDeps = [&](ScheduleBundle &Bundle) {
21816 SmallDenseMap<std::pair<Instruction *, Value *>, unsigned> UserOpToNumOps;
21817 for (ScheduleEntity *SE : Bundle.getBundle()) {
21819 if (ScheduleData *BundleMember = getScheduleData(SD->getInst());
21820 BundleMember && BundleMember->hasValidDependencies()) {
21821 BundleMember->clearDirectDependencies();
21822 if (RegionHasStackSave ||
21824 BundleMember->getInst()))
21825 ControlDependentMembers.push_back(BundleMember);
21830 if (SD->hasValidDependencies() &&
21831 (!S.areInstructionsWithCopyableElements() ||
21832 !S.isCopyableElement(SD->getInst())) &&
21833 !getScheduleCopyableData(SD->getInst()).empty() && EI.UserTE &&
21834 EI.UserTE->hasState() &&
21835 (!EI.UserTE->hasCopyableElements() ||
21836 !EI.UserTE->isCopyableElement(SD->getInst())))
21837 SD->clearDirectDependencies();
21838 for (const Use &U : SD->getInst()->operands()) {
21841 .try_emplace(std::make_pair(SD->getInst(), U.get()), 0)
21842 .first->getSecond();
21845 Op && areAllOperandsReplacedByCopyableData(SD->getInst(), Op,
21847 if (ScheduleData *OpSD = getScheduleData(Op);
21848 OpSD && OpSD->hasValidDependencies()) {
21849 OpSD->clearDirectDependencies();
21850 if (RegionHasStackSave ||
21852 ControlDependentMembers.push_back(OpSD);
21863 if (OldScheduleEnd && ScheduleEnd != OldScheduleEnd) {
21864 for_each(ScheduleDataMap, [&](auto &P) {
21865 if (BB != P.first->getParent())
21867 ScheduleData *SD = P.second;
21868 if (isInSchedulingRegion(*SD))
21869 SD->clearDependencies();
21871 for_each(ScheduleCopyableDataMapByInst, [&](auto &P) {
21872 for_each(P.second, [&](ScheduleCopyableData *SD) {
21873 if (isInSchedulingRegion(*SD))
21874 SD->clearDependencies();
21881 if (Bundle && !Bundle.getBundle().empty()) {
21882 if (S.areInstructionsWithCopyableElements() ||
21883 !ScheduleCopyableDataMap.empty())
21884 CheckIfNeedToClearDeps(Bundle);
21885 LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << Bundle << " in block "
21887 calculateDependencies(Bundle, !ReSchedule, SLP,
21888 ControlDependentMembers);
21889 } else if (!ControlDependentMembers.empty()) {
21890 ScheduleBundle Invalid = ScheduleBundle::invalid();
21891 calculateDependencies(Invalid, !ReSchedule, SLP,
21892 ControlDependentMembers);
21897 initialFillReadyList(ReadyInsts);
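// Keep picking ready entities until the new bundle itself becomes ready
// (or the ready list is exhausted).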
21904 while (((!Bundle && ReSchedule) || (Bundle && !Bundle.isReady())) &&
21905 !ReadyInsts.empty()) {
21906 ScheduleEntity *Picked = ReadyInsts.pop_back_val();
21907 assert(Picked->isReady() && "must be ready to schedule");
21908 schedule(*SLP, S, EI, Picked, ReadyInsts);
21909 if (Picked == &Bundle)
21916 for (Value *V : VL) {
21917 if (S.isNonSchedulable(V))
21919 if (!extendSchedulingRegion(V, S)) {
21926 ScheduleBundle Invalid = ScheduleBundle::invalid();
21927 TryScheduleBundleImpl(false, Invalid);
21928 return std::nullopt;
21932 bool ReSchedule = false;
21933 for (Value *V : VL) {
21934 if (S.isNonSchedulable(V))
21938 if (!CopyableData.empty()) {
21939 for (ScheduleCopyableData *SD : CopyableData)
21940 ReadyInsts.remove(SD);
21942 ScheduleData *BundleMember = getScheduleData(V);
21943 assert((BundleMember || S.isCopyableElement(V)) &&
21944 "no ScheduleData for bundle member (maybe not in same basic block)");
21950 ReadyInsts.remove(BundleMember);
21952 !Bundles.empty()) {
21953 for (ScheduleBundle *B : Bundles)
21954 ReadyInsts.remove(B);
21957 if (!S.isCopyableElement(V) && !BundleMember->isScheduled())
21964 LLVM_DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
21965 << " was already scheduled\n");
21969 ScheduleBundle &Bundle = buildBundle(VL, S, EI);
21970 TryScheduleBundleImpl(ReSchedule, Bundle);
21971 if (!Bundle.isReady()) {
21972 for (ScheduleEntity *BD : Bundle.getBundle()) {
21976 if (BD->isReady()) {
21978 if (Bundles.empty()) {
21979 ReadyInsts.insert(BD);
21982 for (ScheduleBundle *B : Bundles)
21984 ReadyInsts.insert(B);
21987 ScheduledBundlesList.pop_back();
21988 SmallVector<ScheduleData *> ControlDependentMembers;
21989 for (Value *V : VL) {
21990 if (S.isNonSchedulable(V))
21993 if (S.isCopyableElement(I)) {
21996 auto KV = std::make_pair(EI, I);
21997 assert(ScheduleCopyableDataMap.contains(KV) &&
21998 "no ScheduleCopyableData for copyable element");
21999 ScheduleCopyableData *SD =
22000 ScheduleCopyableDataMapByInst.find(I)->getSecond().pop_back_val();
22001 ScheduleCopyableDataMapByUsers[I].remove(SD);
22004 const auto *It = find(Op, I);
22005 assert(It != Op.end() && "Lane not set");
22006 SmallPtrSet<Instruction *, 4> Visited;
22008 int Lane = std::distance(Op.begin(), It);
22009 assert(Lane >= 0 && "Lane not set");
22011 !EI.UserTE->ReorderIndices.empty())
22012 Lane = EI.UserTE->ReorderIndices[Lane];
22013 assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
22014 "Couldn't find extract lane");
22016 if (!Visited.insert(In).second) {
22020 ScheduleCopyableDataMapByInstUser
22021 [std::make_pair(std::make_pair(In, EI.EdgeIdx), I)]
22024 } while (It != Op.end());
22026 if (ScheduleCopyableData *UserCD = getScheduleCopyableData(UserEI, I))
22027 ScheduleCopyableDataMapByUsers[I].insert(UserCD);
22029 if (ScheduleCopyableDataMapByUsers[I].empty())
22030 ScheduleCopyableDataMapByUsers.erase(I);
22031 ScheduleCopyableDataMap.erase(KV);
22033 if (ScheduleData *OpSD = getScheduleData(I);
22034 OpSD && OpSD->hasValidDependencies()) {
22035 OpSD->clearDirectDependencies();
22036 if (RegionHasStackSave ||
22038 ControlDependentMembers.push_back(OpSD);
22042 ScheduledBundles.find(I)->getSecond().pop_back();
22044 if (!ControlDependentMembers.empty()) {
22045 ScheduleBundle Invalid = ScheduleBundle::invalid();
22046 calculateDependencies(Invalid, false, SLP,
22047 ControlDependentMembers);
22049 return std::nullopt;
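// ScheduleData is allocated in fixed-size chunks, likely so that previously
// handed-out pointers stay valid while the region grows.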
22054BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
22056 if (ChunkPos >= ChunkSize) {
22057 ScheduleDataChunks.push_back(std::make_unique<ScheduleData[]>(ChunkSize));
22060 return &(ScheduleDataChunks.back()[ChunkPos++]);
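// Grow the scheduling region until it covers V, walking up and down from
// the current region simultaneously (skipping assume-like intrinsics) and
// bailing out once ScheduleRegionSizeLimit is reached.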
22063bool BoUpSLP::BlockScheduling::extendSchedulingRegion(
22064 Value *V, const InstructionsState &S) {
22066 assert(I && "bundle member must be an instruction");
22067 if (getScheduleData(I))
22069 if (!ScheduleStart) {
22071 initScheduleData(I, I->getNextNode(), nullptr, nullptr);
22073 ScheduleEnd = I->getNextNode();
22074 assert(ScheduleEnd && "tried to vectorize a terminator?");
22075 LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
22083 ++ScheduleStart->getIterator().getReverse();
22089 return II->isAssumeLikeIntrinsic();
22092 UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
22093 DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
22094 while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter != I &&
22096 if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
22097 LLVM_DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n");
22104 UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
22105 DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
22107 if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter == I)) {
22108 assert(I->getParent() == ScheduleStart->getParent() &&
22109 "Instruction is in wrong basic block.");
22110 initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
22116 assert((UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter == I)) &&
22117 "Expected to reach top of the basic block or instruction down the "
22119 assert(I->getParent() == ScheduleEnd->getParent() &&
22120 "Instruction is in wrong basic block.");
22121 initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
22123 ScheduleEnd = I->getNextNode();
22124 assert(ScheduleEnd && "tried to vectorize a terminator?");
22125 LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n");
22129void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
22131 ScheduleData *PrevLoadStore,
22132 ScheduleData *NextLoadStore) {
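// Walk the range and create (or re-initialize) ScheduleData for each
// instruction, chaining memory-accessing instructions for later memory
// dependency calculation.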
22133 ScheduleData *CurrentLoadStore = PrevLoadStore;
22138 ScheduleData *SD = ScheduleDataMap.lookup(I);
22140 SD = allocateScheduleDataChunks();
22141 ScheduleDataMap[I] = SD;
22143 assert(!isInSchedulingRegion(*SD) &&
22144 "new ScheduleData already in scheduling region");
22145 SD->init(SchedulingRegionID, I);
22152 return LI && LI->isSimple() &&
22153 LI->getMetadata(LLVMContext::MD_invariant_load);
22156 if (I->mayReadOrWriteMemory() &&
22158 !CanIgnoreLoad(I) &&
22162 Intrinsic::pseudoprobe))) {
22164 if (CurrentLoadStore) {
22165 CurrentLoadStore->setNextLoadStore(SD);
22167 FirstLoadStoreInRegion = SD;
22169 CurrentLoadStore = SD;
22174 RegionHasStackSave = true;
22176 if (NextLoadStore) {
22177 if (CurrentLoadStore)
22178 CurrentLoadStore->setNextLoadStore(NextLoadStore);
22180 LastLoadStoreInRegion = CurrentLoadStore;
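// Dependency calculation for a bundle: def-use edges, copyable-element
// pseudo users, control dependencies (terminators and stacksave/stackrestore
// regions) and memory dependencies along the load/store chain; entities that
// become ready are pushed onto the ready list.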
22184void BoUpSLP::BlockScheduling::calculateDependencies(
22185 ScheduleBundle &Bundle, bool InsertInReadyList, BoUpSLP *SLP,
22187 SmallVector<ScheduleEntity *> WorkList;
22188 auto ProcessNode = [&](ScheduleEntity *SE) {
22190 if (CD->hasValidDependencies())
22193 CD->initDependencies();
22194 CD->resetUnscheduledDeps();
22195 const EdgeInfo &EI = CD->getEdgeInfo();
22198 const auto *It = find(Op, CD->getInst());
22199 assert(It != Op.end() && "Lane not set");
22200 SmallPtrSet<Instruction *, 4> Visited;
22202 int Lane = std::distance(Op.begin(), It);
22203 assert(Lane >= 0 && "Lane not set");
22205 !EI.UserTE->ReorderIndices.empty())
22206 Lane = EI.UserTE->ReorderIndices[Lane];
22207 assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
22208 "Couldn't find extract lane");
22210 if (EI.UserTE->isCopyableElement(In)) {
22213 if (ScheduleCopyableData *UseSD =
22214 getScheduleCopyableData(EI.UserTE->UserTreeIndex, In)) {
22215 CD->incDependencies();
22216 if (!UseSD->isScheduled())
22217 CD->incrementUnscheduledDeps(1);
22218 if (!UseSD->hasValidDependencies() ||
22219 (InsertInReadyList && UseSD->isReady()))
22222 } else if (Visited.insert(In).second) {
22223 if (ScheduleData *UseSD = getScheduleData(In)) {
22224 CD->incDependencies();
22225 if (!UseSD->isScheduled())
22226 CD->incrementUnscheduledDeps(1);
22227 if (!UseSD->hasValidDependencies() ||
22228 (InsertInReadyList && UseSD->isReady()))
22233 } while (It != Op.end());
22234 if (CD->isReady() && CD->getDependencies() == 0 &&
22235 (EI.UserTE->hasState() &&
22236 (EI.UserTE->getMainOp()->getParent() !=
22237 CD->getInst()->getParent() ||
22239 (EI.UserTE->getMainOp()->hasNUsesOrMore(UsesLimit) ||
22240 any_of(EI.UserTE->getMainOp()->users(), [&](User *U) {
22241 auto *IU = dyn_cast<Instruction>(U);
22244 return IU->getParent() == EI.UserTE->getMainOp()->getParent();
22250 CD->incDependencies();
22251 CD->incrementUnscheduledDeps(1);
22257 if (BundleMember->hasValidDependencies())
22259 LLVM_DEBUG(dbgs() << "SLP: update deps of " << *BundleMember << "\n");
22260 BundleMember->initDependencies();
22261 BundleMember->resetUnscheduledDeps();
22263 SmallDenseMap<Value *, unsigned> UserToNumOps;
22264 for (User *U : BundleMember->getInst()->users()) {
22267 if (ScheduleData *UseSD = getScheduleData(U)) {
22271 if (areAllOperandsReplacedByCopyableData(
22274 BundleMember->incDependencies();
22275 if (!UseSD->isScheduled())
22276 BundleMember->incrementUnscheduledDeps(1);
22277 if (!UseSD->hasValidDependencies() ||
22278 (InsertInReadyList && UseSD->isReady()))
22282 for (ScheduleCopyableData *UseSD :
22283 getScheduleCopyableDataUsers(BundleMember->getInst())) {
22284 BundleMember->incDependencies();
22285 if (!UseSD->isScheduled())
22286 BundleMember->incrementUnscheduledDeps(1);
22287 if (!UseSD->hasValidDependencies() ||
22288 (InsertInReadyList && UseSD->isReady()))
22292 SmallPtrSet<const Instruction *, 4> Visited;
22295 if (!Visited.insert(I).second)
22297 auto *DepDest = getScheduleData(I);
22298 assert(DepDest && "must be in schedule window");
22299 DepDest->addControlDependency(BundleMember);
22300 BundleMember->incDependencies();
22301 if (!DepDest->isScheduled())
22302 BundleMember->incrementUnscheduledDeps(1);
22303 if (!DepDest->hasValidDependencies() ||
22304 (InsertInReadyList && DepDest->isReady()))
22312 for (Instruction *I = BundleMember->getInst()->getNextNode();
22313 I != ScheduleEnd; I = I->getNextNode()) {
22318 MakeControlDependent(I);
22326 if (RegionHasStackSave) {
22331 match(BundleMember->getInst(),
22333 for (Instruction *I = BundleMember->getInst()->getNextNode();
22334 I != ScheduleEnd; I = I->getNextNode()) {
22345 MakeControlDependent(I);
22355 BundleMember->getInst()->mayReadOrWriteMemory()) {
22356 for (Instruction *I = BundleMember->getInst()->getNextNode();
22357 I != ScheduleEnd; I = I->getNextNode()) {
22363 MakeControlDependent(I);
22370 ScheduleData *NextLoadStore = BundleMember->getNextLoadStore();
22371 if (!NextLoadStore)
22375 "NextLoadStore list for non memory effecting bundle?");
22378 unsigned NumAliased = 0;
22379 unsigned DistToSrc = 1;
22380 bool IsNonSimpleSrc = !SrcLoc.Ptr || !isSimple(SrcInst);
22382 for (ScheduleData *DepDest = NextLoadStore; DepDest;
22383 DepDest = DepDest->getNextLoadStore()) {
22384 assert(isInSchedulingRegion(*DepDest) && "Expected to be in region");
22394 ((SrcMayWrite || DepDest->getInst()->mayWriteToMemory()) &&
22396 SLP->isAliased(SrcLoc, SrcInst, DepDest->getInst())))) {
22403 DepDest->addMemoryDependency(BundleMember);
22404 BundleMember->incDependencies();
22405 if (!DepDest->isScheduled())
22406 BundleMember->incrementUnscheduledDeps(1);
22407 if (!DepDest->hasValidDependencies() ||
22408 (InsertInReadyList && DepDest->isReady()))
22432 "expected at least one instruction to schedule");
22434 WorkList.push_back(Bundle.getBundle().front());
22436 SmallPtrSet<ScheduleBundle *, 16> Visited;
22437 while (!WorkList.empty()) {
22442 CopyableBundle.push_back(&CD->getBundle());
22443 Bundles = CopyableBundle;
22445 Bundles = getScheduleBundles(SD->getInst());
22447 if (Bundles.empty()) {
22448 if (!SD->hasValidDependencies())
22450 if (InsertInReadyList && SD->isReady()) {
22451 ReadyInsts.insert(SD);
22452 LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *SD << "\n");
22456 for (ScheduleBundle *Bundle : Bundles) {
22457 if (Bundle->hasValidDependencies() || !Visited.insert(Bundle).second)
22459 assert(isInSchedulingRegion(*Bundle) &&
22460 "ScheduleData not in scheduling region");
22461 for_each(Bundle->getBundle(), ProcessNode);
22463 if (InsertInReadyList && SD->isReady()) {
22464 for (ScheduleBundle *Bundle : Bundles) {
22465 assert(isInSchedulingRegion(*Bundle) &&
22466 "ScheduleData not in scheduling region");
22467 if (!Bundle->isReady())
22469 ReadyInsts.insert(Bundle);
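// Drop the results of a previous scheduling of this region so it can be
// re-scheduled from scratch.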
22477void BoUpSLP::BlockScheduling::resetSchedule() {
22479 "tried to reset schedule on block which has not been scheduled");
22480 for_each(ScheduleDataMap, [&](auto &P) {
22481 if (BB != P.first->getParent())
22483 ScheduleData *SD = P.second;
22484 if (isInSchedulingRegion(*SD)) {
22485 SD->setScheduled(false);
22486 SD->resetUnscheduledDeps();
22489 for_each(ScheduleCopyableDataMapByInst, [&](auto &P) {
22490 for_each(P.second, [&](ScheduleCopyableData *SD) {
22491 if (isInSchedulingRegion(*SD)) {
22492 SD->setScheduled(false);
22493 SD->resetUnscheduledDeps();
22497 for_each(ScheduledBundles, [&](auto &P) {
22498 for_each(P.second, [&](ScheduleBundle *Bundle) {
22499 if (isInSchedulingRegion(*Bundle))
22500 Bundle->setScheduled(false);
22504 for (auto &P : ScheduleCopyableDataMap) {
22505 if (isInSchedulingRegion(*P.second)) {
22506 P.second->setScheduled(false);
22507 P.second->resetUnscheduledDeps();
22507 P.second->resetUnscheduledDeps();
22510 ReadyInsts.clear();
22513void BoUpSLP::scheduleBlock(const BoUpSLP &R, BlockScheduling *BS) {
22514 if (!BS->ScheduleStart)
22517 LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");
22524 BS->resetSchedule();
22531 struct ScheduleDataCompare {
22532 bool operator()(const ScheduleEntity *SD1,
22533 const ScheduleEntity *SD2) const {
22534 return SD2->getSchedulingPriority() < SD1->getSchedulingPriority();
22537 std::set<ScheduleEntity *, ScheduleDataCompare> ReadyInsts;
22542 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
22543 I = I->getNextNode()) {
22545 if (!Bundles.empty()) {
22546 for (ScheduleBundle *Bundle : Bundles) {
22547 Bundle->setSchedulingPriority(Idx++);
22548 if (!Bundle->hasValidDependencies())
22549 BS->calculateDependencies(*Bundle, false, this);
22552 for (ScheduleCopyableData *SD : reverse(SDs)) {
22553 ScheduleBundle &Bundle = SD->getBundle();
22554 Bundle.setSchedulingPriority(Idx++);
22555 if (!Bundle.hasValidDependencies())
22556 BS->calculateDependencies(Bundle, false, this);
22561 BS->getScheduleCopyableDataUsers(I);
22562 if (ScheduleData *SD = BS->getScheduleData(I)) {
22565 SDTEs.front()->doesNotNeedToSchedule() ||
22567 "scheduler and vectorizer bundle mismatch");
22568 SD->setSchedulingPriority(Idx++);
22569 if (!SD->hasValidDependencies() &&
22570 (!CopyableData.empty() ||
22571 any_of(R.ValueToGatherNodes.lookup(I), [&](const TreeEntry *TE) {
22572 assert(TE->isGather() && "expected gather node");
22573 return TE->hasState() && TE->hasCopyableElements() &&
22574 TE->isCopyableElement(I);
22580 ScheduleBundle Bundle;
22582 BS->calculateDependencies(Bundle, false, this);
22585 for (ScheduleCopyableData *SD : reverse(CopyableData)) {
22586 ScheduleBundle &Bundle = SD->getBundle();
22587 Bundle.setSchedulingPriority(Idx++);
22588 if (!Bundle.hasValidDependencies())
22589 BS->calculateDependencies(Bundle, false, this);
22592 BS->initialFillReadyList(ReadyInsts);
22594 Instruction *LastScheduledInst = BS->ScheduleEnd;
22597 SmallPtrSet<Instruction *, 16> Scheduled;
22598 while (!ReadyInsts.empty()) {
22599 auto *Picked = *ReadyInsts.begin();
22600 ReadyInsts.erase(ReadyInsts.begin());
22605 for (const ScheduleEntity *BundleMember : Bundle->getBundle()) {
22606 Instruction *PickedInst = BundleMember->getInst();
22608 bool IsCopyable = Bundle->getTreeEntry()->isCopyableElement(PickedInst);
22609 if ((IsCopyable && BS->getScheduleData(PickedInst)) ||
22610 (!IsCopyable && !Scheduled.insert(PickedInst).second))
22612 if (PickedInst->getNextNode() != LastScheduledInst)
22614 LastScheduledInst = PickedInst;
22616 EntryToLastInstruction.try_emplace(Bundle->getTreeEntry(),
22617 LastScheduledInst);
22621 if (PickedInst->getNextNode() != LastScheduledInst)
22623 LastScheduledInst = PickedInst;
22625 auto Invalid = InstructionsState::invalid();
22630#ifdef EXPENSIVE_CHECKS
22634#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
22636 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
22637 I = I->getNextNode()) {
22640 [](const ScheduleBundle *Bundle) {
22641 return Bundle->isScheduled();
22643 "must be scheduled at this point");
22648 BS->ScheduleStart = nullptr;
22656 return DL->getTypeSizeInBits(Store->getValueOperand()->getType());
22661 auto E = InstrElementSize.find(V);
22662 if (E != InstrElementSize.end())
22679 Value *FirstNonBool = nullptr;
22680 while (!Worklist.empty()) {
22685 auto *Ty = I->getType();
22688 if (Ty != Builder.getInt1Ty() && !FirstNonBool)
22696 Width = std::max<unsigned>(Width, DL->getTypeSizeInBits(Ty));
22704 for (Use &U : I->operands()) {
22706 if (Visited.insert(J).second &&
22712 FirstNonBool = U.get();
22723 if (V->getType() == Builder.getInt1Ty() && FirstNonBool)
22725 Width = DL->getTypeSizeInBits(V->getType());
22729 InstrElementSize[I] = Width;
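// collectValuesToDemote() decides per opcode whether the subtree rooted at E
// can be computed in BitWidth bits, using known-bits and sign-bit analysis
// for shifts, unsigned div/rem and min/max/abs intrinsics.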
22734bool BoUpSLP::collectValuesToDemote(
22735 const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
22738 bool &IsProfitableToDemote, bool IsTruncRoot) const {
22743 unsigned OrigBitWidth =
22744 DL->getTypeSizeInBits(E.Scalars.front()->getType()->getScalarType());
22758 if (isa<PoisonValue>(R))
22760 return !isKnownNonNegative(R, SimplifyQuery(*DL));
22762 auto IsPotentiallyTruncated = [&](Value *V, unsigned &BitWidth) -> bool {
22765 if (getTreeEntries(V).size() > 1)
22771 if ((!IsSignedNode || IsSignedVal) && OrigBitWidth > BitWidth) {
22777 unsigned BitWidth1 = OrigBitWidth - NumSignBits;
22781 APInt Mask = DB->getDemandedBits(I);
22782 unsigned BitWidth2 =
22783 std::max<unsigned>(1, Mask.getBitWidth() - Mask.countl_zero());
22784 while (!IsSignedNode && BitWidth2 < OrigBitWidth) {
22790 BitWidth1 = std::min(BitWidth1, BitWidth2);
22795 auto FinalAnalysis = [&, TTI = TTI]() {
22796 if (!IsProfitableToDemote)
22799 E.Scalars, std::bind(IsPotentiallyTruncated, _1, std::ref(BitWidth)));
22801 if (Res && E.isGather()) {
22802 if (E.hasState()) {
22803 if (const TreeEntry *SameTE =
22804 getSameValuesTreeEntry(E.getMainOp(), E.Scalars);
22806 if (collectValuesToDemote(*SameTE, IsProfitableToDemoteRoot, BitWidth,
22807 ToDemote, Visited, NodesToKeepBWs,
22808 MaxDepthLevel, IsProfitableToDemote,
22816 SmallPtrSet<Value *, 4> UniqueBases;
22817 for (Value *V : E.Scalars) {
22821 UniqueBases.insert(EE->getVectorOperand());
22823 const unsigned VF = E.Scalars.size();
22824 Type *OrigScalarTy = E.Scalars.front()->getType();
22825 if (UniqueBases.size() <= 2 ||
22838 if (E.isGather() || !Visited.insert(&E).second ||
22840 return !isa<Constant>(V) && all_of(V->users(), [&](User *U) {
22841 return isa<InsertElementInst>(U) && !isVectorized(U);
22844 return FinalAnalysis();
22847 return !isa<Constant>(V) && !all_of(V->users(), [=](User *U) {
22848 return isVectorized(U) ||
22849 (E.Idx == 0 && UserIgnoreList &&
22850 UserIgnoreList->contains(U)) ||
22851 (!isa<CmpInst>(U) && U->getType()->isSized() &&
22852 !U->getType()->isScalableTy() &&
22853 DL->getTypeSizeInBits(U->getType()) <= BitWidth);
22854 }) && !IsPotentiallyTruncated(V, BitWidth);
22859 bool &NeedToExit) {
22860 NeedToExit = false;
22861 unsigned InitLevel = MaxDepthLevel;
22862 for (const TreeEntry *Op : Operands) {
22863 unsigned Level = InitLevel;
22864 if (!collectValuesToDemote(*Op, IsProfitableToDemoteRoot, BitWidth,
22865 ToDemote, Visited, NodesToKeepBWs, Level,
22866 IsProfitableToDemote, IsTruncRoot)) {
22867 if (!IsProfitableToDemote)
22870 if (!FinalAnalysis())
22874 MaxDepthLevel = std::max(MaxDepthLevel, Level);
22878 auto AttemptCheckBitwidth =
22879 [&](function_ref<bool(unsigned, unsigned)> Checker, bool &NeedToExit) {
22881 NeedToExit = false;
22882 unsigned BestFailBitwidth = 0;
22884 if (Checker(BitWidth, OrigBitWidth))
22886 if (BestFailBitwidth == 0 && FinalAnalysis())
22890 if (BestFailBitwidth == 0) {
22901 auto TryProcessInstruction =
22903 function_ref<bool(unsigned, unsigned)> Checker = {}) {
22904 if (Operands.empty()) {
22907 for (Value *V : E.Scalars)
22908 (void)IsPotentiallyTruncated(V, BitWidth);
22913 return !V->hasOneUse() && !IsPotentiallyTruncated(V, BitWidth);
22916 bool NeedToExit = false;
22917 if (Checker && !AttemptCheckBitwidth(Checker, NeedToExit))
22921 if (!ProcessOperands(Operands, NeedToExit))
22930 return IsProfitableToDemote;
22933 if (E.State == TreeEntry::SplitVectorize)
22934 return TryProcessInstruction(
22936 {VectorizableTree[E.CombinedEntriesWithIndices.front().first].get(),
22937 VectorizableTree[E.CombinedEntriesWithIndices.back().first].get()});
22939 if (E.isAltShuffle()) {
22941 auto IsDangerousOpcode = [](unsigned Opcode) {
22943 case Instruction::Shl:
22944 case Instruction::AShr:
22945 case Instruction::LShr:
22946 case Instruction::UDiv:
22947 case Instruction::SDiv:
22948 case Instruction::URem:
22949 case Instruction::SRem:
22956 if (IsDangerousOpcode(E.getAltOpcode()))
22957 return FinalAnalysis();
22960 switch (E.getOpcode()) {
22964 case Instruction::Trunc:
22965 if (IsProfitableToDemoteRoot)
22966 IsProfitableToDemote = true;
22967 return TryProcessInstruction(BitWidth);
22968 case Instruction::ZExt:
22969 case Instruction::SExt:
22970 if (E.UserTreeIndex.UserTE && E.UserTreeIndex.UserTE->hasState() &&
22971 E.UserTreeIndex.UserTE->getOpcode() == Instruction::BitCast &&
22972 E.UserTreeIndex.UserTE->getMainOp()->getType()->isFPOrFPVectorTy())
22974 IsProfitableToDemote = true;
22975 return TryProcessInstruction(BitWidth);
22979 case Instruction::Add:
22980 case Instruction::Sub:
22981 case Instruction::Mul:
22982 case Instruction::And:
22983 case Instruction::Or:
22984 case Instruction::Xor: {
22985 return TryProcessInstruction(
22986 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)});
22988 case Instruction::Freeze:
22989 return TryProcessInstruction(BitWidth, getOperandEntry(&E, 0));
22990 case Instruction::Shl: {
22993 auto ShlChecker = [&](unsigned BitWidth, unsigned) {
22995 if (isa<PoisonValue>(V))
22997 if (E.isCopyableElement(V))
22999 auto *I = cast<Instruction>(V);
23000 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
23001 return AmtKnownBits.getMaxValue().ult(BitWidth);
23004 return TryProcessInstruction(
23005 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, ShlChecker);
23007 case Instruction::LShr: {
23011 auto LShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
23013 if (isa<PoisonValue>(V))
23015 APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
23016 if (E.isCopyableElement(V))
23017 return MaskedValueIsZero(V, ShiftedBits, SimplifyQuery(*DL));
23018 auto *I = cast<Instruction>(V);
23019 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
23020 return AmtKnownBits.getMaxValue().ult(BitWidth) &&
23021 MaskedValueIsZero(I->getOperand(0), ShiftedBits,
23022 SimplifyQuery(*DL));
23025 return TryProcessInstruction(
23026 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
23029 case Instruction::AShr: {
23033 auto AShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
23035 if (isa<PoisonValue>(V))
23037 auto *I = cast<Instruction>(V);
23038 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
23039 unsigned ShiftedBits = OrigBitWidth - BitWidth;
23040 return AmtKnownBits.getMaxValue().ult(BitWidth) &&
23042 ComputeNumSignBits(I->getOperand(0), *DL, AC, nullptr, DT);
23045 return TryProcessInstruction(
23046 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
23049 case Instruction::UDiv:
23050 case Instruction::URem: {
23052 auto Checker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
23055 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
23056 if (E.hasCopyableElements() && E.isCopyableElement(V))
23057 return MaskedValueIsZero(V, Mask, SimplifyQuery(*DL));
23058 auto *I = cast<Instruction>(V);
23059 return MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)) &&
23060 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
23063 return TryProcessInstruction(
23064 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, Checker);
23068 case Instruction::Select: {
23069 return TryProcessInstruction(
23070 BitWidth, {getOperandEntry(&E, 1), getOperandEntry(&E, 2)});
23074 case Instruction::PHI: {
23075 const unsigned NumOps = E.getNumOperands();
23078 [&](unsigned Idx) { return getOperandEntry(&E, Idx); });
23083 case Instruction::Call: {
23088 if (ID != Intrinsic::abs && ID != Intrinsic::smin &&
23089 ID != Intrinsic::smax && ID != Intrinsic::umin && ID != Intrinsic::umax)
23092 function_ref<bool(unsigned, unsigned)> CallChecker;
23093 auto CompChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
23096 auto *I = cast<Instruction>(V);
23097 if (ID == Intrinsic::umin || ID == Intrinsic::umax) {
23098 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
23099 return MaskedValueIsZero(I->getOperand(0), Mask,
23100 SimplifyQuery(*DL)) &&
23101 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
23103 assert((ID == Intrinsic::smin || ID == Intrinsic::smax) &&
23104 "Expected min/max intrinsics only.");
23105 unsigned SignBits = OrigBitWidth - BitWidth;
23107 unsigned Op0SignBits =
23109 unsigned Op1SignBits =
23111 return SignBits <= Op0SignBits &&
23112 ((SignBits != Op0SignBits &&
23115 SimplifyQuery(*DL))) &&
23116 SignBits <= Op1SignBits &&
23117 ((SignBits != Op1SignBits &&
23122 auto AbsChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
23125 auto *I = cast<Instruction>(V);
23126 unsigned SignBits = OrigBitWidth - BitWidth;
23127 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
23128 unsigned Op0SignBits =
23129 ComputeNumSignBits(I->getOperand(0), *DL, AC, nullptr, DT);
23130 return SignBits <= Op0SignBits &&
23131 ((SignBits != Op0SignBits &&
23132 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
23133 MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)));
23136 if (ID != Intrinsic::abs) {
23137 Operands.push_back(getOperandEntry(&E, 1));
23138 CallChecker = CompChecker;
23140 CallChecker = AbsChecker;
23143 std::numeric_limits<InstructionCost::CostType>::max();
23145 unsigned VF = E.Scalars.size();
23147 auto Checker = [&](unsigned BitWidth, unsigned) {
23155 if (Cost < BestCost) {
23161 [[maybe_unused]] bool NeedToExit;
23162 (void)AttemptCheckBitwidth(Checker, NeedToExit);
23164 return TryProcessInstruction(BitWidth, Operands, CallChecker);
23172 return FinalAnalysis();
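// Driver of the minimum-bitwidth analysis: starting at the root (or at
// Trunc-rooted subtrees) it computes the maximal bit width each node really
// needs and records profitable demotions, with a dedicated ReductionBitWidth
// for reduction graphs.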
23179 bool IsStoreOrInsertElt =
23180 VectorizableTree.front()->hasState() &&
23181 (VectorizableTree.front()->getOpcode() == Instruction::Store ||
23182 VectorizableTree.front()->getOpcode() == Instruction::InsertElement);
23183 if ((IsStoreOrInsertElt || UserIgnoreList) &&
23184 ExtraBitWidthNodes.size() <= 1 &&
23185 (!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 ||
23186 CastMaxMinBWSizes->first / CastMaxMinBWSizes->second <= 2))
23189 unsigned NodeIdx = 0;
23190 if (IsStoreOrInsertElt && !VectorizableTree.front()->isGather())
23194 assert((VectorizableTree[NodeIdx]->isGather() || NodeIdx != 0 ||
23195 !VectorizableTree[NodeIdx]->UserTreeIndex) &&
23196 "Unexpected tree is graph.");
23200 bool IsTruncRoot = false;
23201 bool IsProfitableToDemoteRoot = !IsStoreOrInsertElt;
23204 if (NodeIdx != 0 &&
23205 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
23206 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
23207 assert(IsStoreOrInsertElt && "Expected store/insertelement seeded graph.");
23208 IsTruncRoot = true;
23210 IsProfitableToDemoteRoot = true;
23215 if (AnalyzedMinBWVals.contains(VectorizableTree[NodeIdx]->Scalars.front()))
23219 auto ComputeMaxBitWidth =
23220 [&](const TreeEntry &E, bool IsTopRoot, bool IsProfitableToDemoteRoot,
23221 unsigned Limit, bool IsTruncRoot, bool IsSignedCmp) -> unsigned {
23225 if (E.isGather() && IsTruncRoot && E.UserTreeIndex &&
23226 !NodesToKeepBWs.contains(E.Idx) &&
23227 E.Idx > (IsStoreOrInsertElt ? 2u : 1u) &&
23229 return V->hasOneUse() || isa<Constant>(V) ||
23230 (!V->hasNUsesOrMore(UsesLimit) &&
23231 none_of(V->users(), [&](User *U) {
23232 ArrayRef<TreeEntry *> TEs = getTreeEntries(U);
23233 const TreeEntry *UserTE = E.UserTreeIndex.UserTE;
23234 if (TEs.empty() || is_contained(TEs, UserTE))
23236 if (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
23238 isa<SIToFPInst, UIToFPInst>(U) ||
23239 (UserTE->hasState() &&
23240 (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
23241 SelectInst>(UserTE->getMainOp()) ||
23242 isa<SIToFPInst, UIToFPInst>(UserTE->getMainOp()))))
23244 unsigned UserTESz = DL->getTypeSizeInBits(
23245 UserTE->Scalars.front()->getType());
23246 if (all_of(TEs, [&](const TreeEntry *TE) {
23247 auto It = MinBWs.find(TE);
23248 return It != MinBWs.end() &&
23249 It->second.first > UserTESz;
23252 return DL->getTypeSizeInBits(U->getType()) > UserTESz;
23256 const TreeEntry *UserTE = E.UserTreeIndex.UserTE;
23257 auto It = MinBWs.find(UserTE);
23258 if (It != MinBWs.end())
23259 return It->second.first;
23260 unsigned MaxBitWidth =
23261 DL->getTypeSizeInBits(UserTE->Scalars.front()->getType());
23262 MaxBitWidth = bit_ceil(MaxBitWidth);
23263 if (MaxBitWidth < 8 && MaxBitWidth > 1)
23265 return MaxBitWidth;
23271 unsigned VF = E.getVectorFactor();
23272 Type *ScalarTy = E.Scalars.front()->getType();
23279 [&](
Value *V) { return AnalyzedMinBWVals.contains(V); }))
23288 unsigned MaxBitWidth = 1u;
23296 bool IsKnownPositive = !IsSignedCmp &&
all_of(E.Scalars, [&](
Value *R) {
23297 if (isa<PoisonValue>(R))
23299 KnownBits Known = computeKnownBits(R, *DL);
23300 return Known.isNonNegative();
23303 if (!IsKnownPositive && !IsTopRoot && E.UserTreeIndex &&
23304 E.UserTreeIndex.UserTE->hasState() &&
23305 E.UserTreeIndex.UserTE->getOpcode() == Instruction::UIToFP)
23307 std::min(DL->getTypeSizeInBits(
23308 E.UserTreeIndex.UserTE->Scalars.front()->getType()),
23309 DL->getTypeSizeInBits(ScalarTy));
23313 for (
Value *Root : E.Scalars) {
23319 unsigned BitWidth1 = NumTypeBits - NumSignBits;
23335 if (!IsKnownPositive)
23340 MaxBitWidth = std::max(BitWidth1, MaxBitWidth);
23343 APInt Mask = DB->getDemandedBits(
I);
23344 unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
23346 std::max<unsigned>(std::min(BitWidth1, BitWidth2), MaxBitWidth);
23349 if (MaxBitWidth < 8 && MaxBitWidth > 1)
23354 if (NumParts > 1 &&
23362 unsigned Opcode = E.getOpcode();
23363 bool IsProfitableToDemote = Opcode == Instruction::Trunc ||
23364 Opcode == Instruction::SExt ||
23365 Opcode == Instruction::ZExt || NumParts > 1;
23370 unsigned MaxDepthLevel = IsTruncRoot ? Limit : 1;
23371 bool NeedToDemote = IsProfitableToDemote;
23373 if (!collectValuesToDemote(E, IsProfitableToDemoteRoot, MaxBitWidth,
23374 ToDemote, Visited, NodesToKeepBWs, MaxDepthLevel,
23375 NeedToDemote, IsTruncRoot) ||
23376 (MaxDepthLevel <= Limit &&
23377 !(((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
23378 (!IsTopRoot || !(IsStoreOrInsertElt || UserIgnoreList) ||
23379 DL->getTypeSizeInBits(TreeRootIT) /
23380 DL->getTypeSizeInBits(
23381 E.getMainOp()->getOperand(0)->getType()) >
23385 MaxBitWidth =
bit_ceil(MaxBitWidth);
23387 return MaxBitWidth;
23394 if (UserIgnoreList &&
23398 if (
all_of(*UserIgnoreList,
23403 VectorizableTree.front()->State == TreeEntry::Vectorize &&
23404 VectorizableTree.front()->getOpcode() == Instruction::ZExt &&
23405 cast<CastInst>(VectorizableTree.front()->getMainOp())->getSrcTy() ==
23406 Builder.getInt1Ty()) {
23407 ReductionBitWidth = 1;
23409 for (
Value *V : *UserIgnoreList) {
23413 TypeSize NumTypeBits = DL->getTypeSizeInBits(
V->getType());
23414 unsigned BitWidth1 = NumTypeBits - NumSignBits;
23417 unsigned BitWidth2 = BitWidth1;
23420 BitWidth2 =
Mask.getBitWidth() -
Mask.countl_zero();
23422 ReductionBitWidth =
23423 std::max(std::min(BitWidth1, BitWidth2), ReductionBitWidth);
23425 if (ReductionBitWidth < 8 && ReductionBitWidth > 1)
23426 ReductionBitWidth = 8;
23428 ReductionBitWidth =
bit_ceil(ReductionBitWidth);
23431 bool IsTopRoot = NodeIdx == 0;
23432 while (NodeIdx < VectorizableTree.size() &&
23433 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
23434 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
23435 RootDemotes.push_back(NodeIdx);
23437 IsTruncRoot =
true;
23439 bool IsSignedCmp =
false;
23440 if (UserIgnoreList &&
23444 IsSignedCmp =
true;
23445 while (NodeIdx < VectorizableTree.size()) {
23447 unsigned Limit = 2;
23449 ReductionBitWidth ==
23450 DL->getTypeSizeInBits(
23451 VectorizableTree.front()->Scalars.front()->getType()))
23453 unsigned MaxBitWidth = ComputeMaxBitWidth(
23454 *VectorizableTree[NodeIdx], IsTopRoot, IsProfitableToDemoteRoot, Limit,
23455 IsTruncRoot, IsSignedCmp);
23456 if (ReductionBitWidth != 0 && (IsTopRoot || !RootDemotes.empty())) {
23457 if (MaxBitWidth != 0 && ReductionBitWidth < MaxBitWidth)
23458 ReductionBitWidth =
bit_ceil(MaxBitWidth);
23459 else if (MaxBitWidth == 0)
23460 ReductionBitWidth = 0;
23463 for (
unsigned Idx : RootDemotes) {
23464 if (
all_of(VectorizableTree[Idx]->Scalars, [&](
Value *V) {
23465 uint32_t OrigBitWidth =
23466 DL->getTypeSizeInBits(
V->getType()->getScalarType());
23467 if (OrigBitWidth > MaxBitWidth) {
23475 RootDemotes.clear();
23477 IsProfitableToDemoteRoot =
true;
23479 if (ExtraBitWidthNodes.empty()) {
23480 NodeIdx = VectorizableTree.size();
23482 unsigned NewIdx = 0;
23484 NewIdx = *ExtraBitWidthNodes.begin();
23485 ExtraBitWidthNodes.erase(ExtraBitWidthNodes.begin());
23486 }
while (NewIdx <= NodeIdx && !ExtraBitWidthNodes.empty());
23489 NodeIdx < VectorizableTree.size() &&
23490 VectorizableTree[NodeIdx]->UserTreeIndex &&
23491 VectorizableTree[NodeIdx]->UserTreeIndex.EdgeIdx == 0 &&
23492 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->hasState() &&
23493 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->getOpcode() ==
23494 Instruction::Trunc &&
23495 !VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->isAltShuffle();
23497 NodeIdx < VectorizableTree.size() &&
23498 VectorizableTree[NodeIdx]->UserTreeIndex &&
23499 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->hasState() &&
23500 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->getOpcode() ==
23501 Instruction::ICmp &&
23503 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->Scalars,
23505 auto *IC = dyn_cast<ICmpInst>(V);
23506 return IC && (IC->isSigned() ||
23507 !isKnownNonNegative(IC->getOperand(0),
23508 SimplifyQuery(*DL)) ||
23509 !isKnownNonNegative(IC->getOperand(1),
23510 SimplifyQuery(*DL)));
23516 if (MaxBitWidth == 0 ||
23520 if (UserIgnoreList)
23521 AnalyzedMinBWVals.insert_range(TreeRoot);
23528 for (
unsigned Idx : ToDemote) {
23529 TreeEntry *
TE = VectorizableTree[Idx].get();
23530 if (MinBWs.contains(TE))
23533 if (isa<PoisonValue>(R))
23535 return !isKnownNonNegative(R, SimplifyQuery(*DL));
23537 MinBWs.try_emplace(TE, MaxBitWidth, IsSigned);
23578 DL = &
F.getDataLayout();
23586 if (!
TTI->getNumberOfRegisters(
TTI->getRegisterClassForType(
true))) {
23588 dbgs() <<
"SLP: Didn't find any vector registers for target, abort.\n");
23593 if (
F.hasFnAttribute(Attribute::NoImplicitFloat))
23596 LLVM_DEBUG(
dbgs() <<
"SLP: Analyzing blocks in " <<
F.getName() <<
".\n");
23600 BoUpSLP R(&
F,
SE,
TTI,
TLI,
AA,
LI,
DT,
AC,
DB,
DL, ORE_);
23606 DT->updateDFSNumbers();
23609 for (
auto *BB :
post_order(&
F.getEntryBlock())) {
23614 R.clearReductionData();
23615 collectSeedInstructions(BB);
23618 if (!Stores.empty()) {
23620 <<
" underlying objects.\n");
23621 Changed |= vectorizeStoreChains(R);
23625 Changed |= vectorizeChainsInBlock(BB, R);
23630 if (!GEPs.empty()) {
23632 <<
" underlying objects.\n");
23633 Changed |= vectorizeGEPIndices(BB, R);
23638 R.optimizeGatherSequence();
23646 unsigned Idx,
unsigned MinVF,
23651 const unsigned Sz = R.getVectorElementSize(Chain[0]);
23652 unsigned VF = Chain.
size();
23658 VF < 2 || VF < MinVF) {
23666 LLVM_DEBUG(
dbgs() <<
"SLP: Analyzing " << VF <<
" stores at offset " << Idx
23670 for (
Value *V : Chain)
23673 InstructionsCompatibilityAnalysis
Analysis(*DT, *
DL, *
TTI, *TLI);
23674 InstructionsState S =
Analysis.buildInstructionsState(
23678 bool IsAllowedSize =
23682 if ((!IsAllowedSize && S && S.getOpcode() != Instruction::Load &&
23683 (!S.getMainOp()->isSafeToRemove() ||
23686 return !isa<ExtractElementInst>(V) &&
23687 (V->getNumUses() > Chain.size() ||
23688 any_of(V->users(), [&](User *U) {
23689 return !Stores.contains(U);
23692 (ValOps.
size() > Chain.size() / 2 && !S)) {
23693 Size = (!IsAllowedSize && S) ? 1 : 2;
23697 if (
R.isLoadCombineCandidate(Chain))
23699 R.buildTree(Chain);
23701 if (
R.isTreeTinyAndNotFullyVectorizable()) {
23702 if (
R.isGathered(Chain.front()) ||
23704 return std::nullopt;
23705 Size =
R.getCanonicalGraphSize();
23708 if (
R.isProfitableToReorder()) {
23709 R.reorderTopToBottom();
23710 R.reorderBottomToTop();
23712 R.transformNodes();
23713 R.computeMinimumValueSizes();
23716 R.buildExternalUses();
23718 Size =
R.getCanonicalGraphSize();
23719 if (S && S.getOpcode() == Instruction::Load)
23727 using namespace ore;
23729 R.getORE()->emit(OptimizationRemark(
SV_NAME,
"StoresVectorized",
23731 <<
"Stores SLP vectorized with cost " <<
NV(
"Cost",
Cost)
23732 <<
" and with tree size "
23733 <<
NV(
"TreeSize",
R.getTreeSize()));
23747 Sizes.begin(), Sizes.end(),
static_cast<uint64_t>(0),
23748 [&](
uint64_t V,
const std::pair<unsigned, unsigned> &Val) {
23749 unsigned Size = First ? Val.first : Val.second;
23761 Sizes.begin(), Sizes.end(),
static_cast<uint64_t>(0),
23762 [&](
uint64_t V,
const std::pair<unsigned, unsigned> &Val) {
23763 unsigned P = First ? Val.first : Val.second;
23766 return V + (P - Mean) * (P - Mean);
23769 return Dev * 96 / (Mean * Mean) == 0;
23777class RelatedStoreInsts {
23780 : AllStores(AllStores) {
23781 reset(BaseInstrIdx);
23784 void reset(
unsigned NewBaseInstr) {
23785 assert(NewBaseInstr < AllStores.size() &&
23786 "Instruction index out of bounds");
23787 BaseInstrIdx = NewBaseInstr;
23789 insertOrLookup(NewBaseInstr, 0);
23796 std::optional<unsigned> insertOrLookup(
unsigned InstrIdx, int64_t PtrDist) {
23797 auto [It,
Inserted] = Instrs.emplace(PtrDist, InstrIdx);
23798 return Inserted ? std::nullopt : std::make_optional(It->second);
23801 using DistToInstMap = std::map<int64_t, unsigned>;
23802 const DistToInstMap &getStores()
const {
return Instrs; }
23806 std::optional<int64_t> getPointerDiff(StoreInst &SI,
const DataLayout &
DL,
23807 ScalarEvolution &SE)
const {
23808 StoreInst &BaseStore = *AllStores[BaseInstrIdx];
23811 SI.getValueOperand()->getType(),
SI.getPointerOperand(),
DL, SE,
23817 void rebase(
unsigned MinSafeIdx,
unsigned NewBaseInstIdx,
23818 int64_t DistFromCurBase) {
23819 DistToInstMap PrevSet = std::move(Instrs);
23820 reset(NewBaseInstIdx);
23825 for (
auto [Dist, InstIdx] : PrevSet) {
23826 if (InstIdx >= MinSafeIdx)
23827 insertOrLookup(InstIdx, Dist - DistFromCurBase);
23833 DistToInstMap::reverse_iterator LastVectorizedStore =
find_if(
23834 reverse(Instrs), [&](
const std::pair<int64_t, unsigned> &DistAndIdx) {
23835 return VectorizedStores.
contains(AllStores[DistAndIdx.second]);
23840 DistToInstMap::iterator VectorizedStoresEnd = LastVectorizedStore.base();
23841 Instrs.erase(Instrs.begin(), VectorizedStoresEnd);
23846 unsigned BaseInstrIdx;
23849 DistToInstMap Instrs;
23857bool SLPVectorizerPass::vectorizeStores(
23859 DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>>
23866 auto TryToVectorize = [&](
const RelatedStoreInsts::DistToInstMap &StoreSeq) {
23867 int64_t PrevDist = -1;
23871 auto &[Dist, InstIdx] =
Data;
23872 if (Operands.
empty() || Dist - PrevDist == 1) {
23875 if (Idx != StoreSeq.size() - 1)
23884 if (Operands.
size() <= 1 ||
23886 .
insert({Operands.front(),
23887 cast<StoreInst>(Operands.front())->getValueOperand(),
23889 cast<StoreInst>(Operands.back())->getValueOperand(),
23894 unsigned MaxVecRegSize =
R.getMaxVecRegSize();
23895 unsigned EltSize =
R.getVectorElementSize(Operands[0]);
23899 std::min(
R.getMaximumVF(EltSize, Instruction::Store), MaxElts);
23901 Type *StoreTy =
Store->getValueOperand()->getType();
23902 Type *ValueTy = StoreTy;
23904 ValueTy = Trunc->getSrcTy();
23913 R.getMinVF(DL->getTypeStoreSizeInBits(StoreScalarTy)), StoreScalarTy,
23916 MinVF = std::max<unsigned>(2, MinVF);
23918 if (MaxVF < MinVF) {
23919 LLVM_DEBUG(
dbgs() <<
"SLP: Vectorization infeasible as MaxVF (" << MaxVF
23921 <<
"MinVF (" << MinVF <<
")\n");
23925 unsigned NonPowerOf2VF = 0;
23930 unsigned CandVF = std::clamp<unsigned>(Operands.
size(), MinVF, MaxVF);
23932 NonPowerOf2VF = CandVF;
23933 assert(NonPowerOf2VF != MaxVF &&
23934 "Non-power-of-2 VF should not be equal to MaxVF");
23941 unsigned MaxRegVF = MaxVF;
23943 MaxVF = std::min<unsigned>(MaxVF,
bit_floor(Operands.
size()));
23944 if (MaxVF < MinVF) {
23945 LLVM_DEBUG(
dbgs() <<
"SLP: Vectorization infeasible as MaxVF (" << MaxVF
23947 <<
"MinVF (" << MinVF <<
")\n");
23951 SmallVector<unsigned> CandidateVFs;
23952 for (
unsigned VF = std::max(MaxVF, NonPowerOf2VF); VF >= MinVF;
23956 unsigned End = Operands.
size();
23957 unsigned Repeat = 0;
23958 constexpr unsigned MaxAttempts = 4;
23959 OwningArrayRef<std::pair<unsigned, unsigned>> RangeSizes(Operands.
size());
23960 for (std::pair<unsigned, unsigned> &
P : RangeSizes)
23961 P.first =
P.second = 1;
23962 DenseMap<Value *, std::pair<unsigned, unsigned>> NonSchedulable;
23963 auto IsNotVectorized = [](
bool First,
23964 const std::pair<unsigned, unsigned> &
P) {
23965 return First ?
P.first > 0 :
P.second > 0;
23967 auto IsVectorized = [](
bool First,
23968 const std::pair<unsigned, unsigned> &
P) {
23969 return First ?
P.first == 0 :
P.second == 0;
23971 auto VFIsProfitable = [](
bool First,
unsigned Size,
23972 const std::pair<unsigned, unsigned> &
P) {
23975 auto FirstSizeSame = [](
unsigned Size,
23976 const std::pair<unsigned, unsigned> &
P) {
23977 return Size ==
P.first;
23981 bool RepeatChanged =
false;
23982 bool AnyProfitableGraph =
false;
23983 for (
unsigned VF : CandidateVFs) {
23984 AnyProfitableGraph =
false;
23985 unsigned FirstUnvecStore =
23986 std::distance(RangeSizes.begin(),
23987 find_if(RangeSizes, std::bind(IsNotVectorized,
23988 VF >= MaxRegVF, _1)));
23992 while (FirstUnvecStore < End) {
23993 unsigned FirstVecStore = std::distance(
23994 RangeSizes.begin(),
23995 find_if(RangeSizes.drop_front(FirstUnvecStore),
23996 std::bind(IsVectorized, VF >= MaxRegVF, _1)));
23997 unsigned MaxSliceEnd = FirstVecStore >= End ? End : FirstVecStore;
23998 for (
unsigned SliceStartIdx = FirstUnvecStore;
23999 SliceStartIdx + VF <= MaxSliceEnd;) {
24010 ->getValueOperand()
24013 ->getValueOperand()
24016 "Expected all operands of same type.");
24017 if (!NonSchedulable.
empty()) {
24018 auto [NonSchedSizeMax, NonSchedSizeMin] =
24020 if (NonSchedSizeMax > 0 && NonSchedSizeMin <= VF) {
24023 SliceStartIdx += NonSchedSizeMax;
24028 std::optional<bool> Res =
24029 vectorizeStoreChain(Slice, R, SliceStartIdx, MinVF, TreeSize);
24035 .first->getSecond()
24043 AnyProfitableGraph = RepeatChanged =
Changed =
true;
24046 for (std::pair<unsigned, unsigned> &
P :
24047 RangeSizes.slice(SliceStartIdx, VF))
24048 P.first =
P.second = 0;
24049 if (SliceStartIdx < FirstUnvecStore + MinVF) {
24050 for (std::pair<unsigned, unsigned> &
P : RangeSizes.slice(
24051 FirstUnvecStore, SliceStartIdx - FirstUnvecStore))
24052 P.first =
P.second = 0;
24053 FirstUnvecStore = SliceStartIdx + VF;
24055 if (SliceStartIdx > MaxSliceEnd - VF - MinVF) {
24056 for (std::pair<unsigned, unsigned> &
P :
24057 RangeSizes.slice(SliceStartIdx + VF,
24058 MaxSliceEnd - (SliceStartIdx + VF)))
24059 P.first =
P.second = 0;
24060 if (MaxSliceEnd == End)
24061 End = SliceStartIdx;
24062 MaxSliceEnd = SliceStartIdx;
24064 SliceStartIdx += VF;
24067 if (VF > 2 && Res &&
24068 !
all_of(RangeSizes.slice(SliceStartIdx, VF),
24069 std::bind(VFIsProfitable, VF >= MaxRegVF, TreeSize,
24071 SliceStartIdx += VF;
24076 if (VF > MaxRegVF && TreeSize > 1 &&
24077 all_of(RangeSizes.slice(SliceStartIdx, VF),
24078 std::bind(FirstSizeSame, TreeSize, _1))) {
24079 SliceStartIdx += VF;
24080 while (SliceStartIdx != MaxSliceEnd &&
24081 RangeSizes[SliceStartIdx].first == TreeSize)
24085 if (TreeSize > 1) {
24086 for (std::pair<unsigned, unsigned> &
P :
24087 RangeSizes.slice(SliceStartIdx, VF)) {
24088 if (VF >= MaxRegVF)
24089 P.second = std::max(
P.second, TreeSize);
24091 P.first = std::max(
P.first, TreeSize);
24095 AnyProfitableGraph =
true;
24097 if (FirstUnvecStore >= End)
24099 if (MaxSliceEnd - FirstUnvecStore < VF &&
24100 MaxSliceEnd - FirstUnvecStore >= MinVF)
24101 AnyProfitableGraph =
true;
24102 FirstUnvecStore = std::distance(
24103 RangeSizes.begin(),
24104 find_if(RangeSizes.drop_front(MaxSliceEnd),
24105 std::bind(IsNotVectorized, VF >= MaxRegVF, _1)));
24107 if (!AnyProfitableGraph && VF >= MaxRegVF &&
has_single_bit(VF))
24111 if (
all_of(RangeSizes, [](
const std::pair<unsigned, unsigned> &
P) {
24112 return P.first == 0 &&
P.second == 0;
24116 if (Repeat >= MaxAttempts ||
24117 (Repeat > 1 && (RepeatChanged || !AnyProfitableGraph)))
24119 constexpr unsigned StoresLimit = 64;
24120 const unsigned MaxTotalNum = std::min<unsigned>(
24122 static_cast<unsigned>(
24125 RangeSizes.begin(),
24126 find_if(RangeSizes, std::bind(IsNotVectorized,
true, _1))) +
24128 unsigned VF =
bit_ceil(CandidateVFs.front()) * 2;
24131 CandidateVFs.clear();
24133 CandidateVFs.push_back(Limit);
24134 if (VF > MaxTotalNum || VF >= StoresLimit)
24136 for (std::pair<unsigned, unsigned> &
P : RangeSizes) {
24138 P.first = std::max(
P.second,
P.first);
24142 CandidateVFs.push_back(VF);
24182 auto FillStoresSet = [&](
unsigned Idx, StoreInst *
SI) {
24183 std::optional<int64_t> PtrDist;
24184 auto *RelatedStores =
find_if(
24185 SortedStores, [&PtrDist, SI,
this](
const RelatedStoreInsts &StoreSeq) {
24186 PtrDist = StoreSeq.getPointerDiff(*SI, *DL, *SE);
24187 return PtrDist.has_value();
24191 if (RelatedStores == SortedStores.
end()) {
24199 if (std::optional<unsigned> PrevInst =
24200 RelatedStores->insertOrLookup(Idx, *PtrDist)) {
24201 TryToVectorize(RelatedStores->getStores());
24202 RelatedStores->clearVectorizedStores(VectorizedStores);
24203 RelatedStores->rebase(*PrevInst + 1,
24208 Type *PrevValTy =
nullptr;
24210 if (
R.isDeleted(SI))
24213 PrevValTy =
SI->getValueOperand()->getType();
24215 if (PrevValTy !=
SI->getValueOperand()->getType()) {
24216 for (RelatedStoreInsts &StoreSeq : SortedStores)
24217 TryToVectorize(StoreSeq.getStores());
24218 SortedStores.clear();
24219 PrevValTy =
SI->getValueOperand()->getType();
24221 FillStoresSet(
I, SI);
24225 for (RelatedStoreInsts &StoreSeq : SortedStores)
24226 TryToVectorize(StoreSeq.getStores());
24231void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
24239 for (Instruction &
I : *BB) {
24243 if (!
SI->isSimple())
24254 if (
GEP->getNumIndices() != 1)
24256 Value *Idx =
GEP->idx_begin()->get();
24261 if (
GEP->getType()->isVectorTy())
24273 LLVM_DEBUG(
dbgs() <<
"SLP: Trying to vectorize a list of length = "
24274 << VL.
size() <<
".\n");
24285 for (
Value *V : VL) {
24286 Type *Ty =
V->getType();
24290 R.getORE()->emit([&]() {
24291 std::string TypeStr;
24292 llvm::raw_string_ostream OS(TypeStr);
24294 return OptimizationRemarkMissed(
SV_NAME,
"UnsupportedType", I0)
24295 <<
"Cannot SLP vectorize list: type "
24296 << TypeStr +
" is unsupported by vectorizer";
24303 unsigned Sz =
R.getVectorElementSize(I0);
24304 unsigned MinVF =
R.getMinVF(Sz);
24305 unsigned MaxVF = std::max<unsigned>(
24307 MaxVF = std::min(
R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
24309 R.getORE()->emit([&]() {
24310 return OptimizationRemarkMissed(
SV_NAME,
"SmallVF", I0)
24311 <<
"Cannot SLP vectorize list: vectorization factor "
24312 <<
"less than 2 is not supported";
24318 bool CandidateFound =
false;
24321 unsigned NextInst = 0, MaxInst = VL.size();
24322 for (
unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF;
24328 if (TTI->getNumberOfParts(VecTy) == VF)
24330 for (
unsigned I = NextInst;
I < MaxInst; ++
I) {
24331 unsigned ActualVF = std::min(MaxInst -
I, VF);
24336 if (MaxVFOnly && ActualVF < MaxVF)
24338 if ((VF > MinVF && ActualVF < VF) || (VF == MinVF && ActualVF < 2))
24343 for (
Value *V : VL.drop_front(
I)) {
24347 !Inst || !
R.isDeleted(Inst)) {
24350 if (Idx == ActualVF)
24355 if (Idx != ActualVF)
24358 LLVM_DEBUG(
dbgs() <<
"SLP: Analyzing " << ActualVF <<
" operations "
24362 if (
R.isTreeTinyAndNotFullyVectorizable())
24364 if (
R.isProfitableToReorder()) {
24365 R.reorderTopToBottom();
24368 R.transformNodes();
24369 R.computeMinimumValueSizes();
24371 R.buildExternalUses();
24374 CandidateFound =
true;
24375 MinCost = std::min(MinCost,
Cost);
24378 <<
" for VF=" << ActualVF <<
"\n");
24381 R.getORE()->emit(OptimizationRemark(
SV_NAME,
"VectorizedList",
24383 <<
"SLP vectorized with cost " <<
ore::NV(
"Cost",
Cost)
24384 <<
" and with tree size "
24385 <<
ore::NV(
"TreeSize",
R.getTreeSize()));
24396 if (!
Changed && CandidateFound) {
24397 R.getORE()->emit([&]() {
24398 return OptimizationRemarkMissed(
SV_NAME,
"NotBeneficial", I0)
24399 <<
"List vectorization was possible but not beneficial with cost "
24400 <<
ore::NV(
"Cost", MinCost) <<
" >= "
24404 R.getORE()->emit([&]() {
24405 return OptimizationRemarkMissed(
SV_NAME,
"NotPossible", I0)
24406 <<
"Cannot SLP vectorize list: vectorization was impossible"
24407 <<
" with available vectorization factors";
24442 using ReductionOpsType = SmallVector<Value *, 16>;
24443 using ReductionOpsListType = SmallVector<ReductionOpsType, 2>;
24444 ReductionOpsListType ReductionOps;
24448 SmallDenseMap<Value *, SmallVector<Instruction *>, 16> ReducedValsToOps;
24449 WeakTrackingVH ReductionRoot;
24454 bool IsSupportedHorRdxIdentityOp =
false;
24461 static bool isCmpSelMinMax(Instruction *
I) {
24469 static bool isBoolLogicOp(Instruction *
I) {
24475 static bool isVectorizable(
RecurKind Kind, Instruction *
I,
24476 bool TwoElementReduction =
false) {
24477 if (Kind == RecurKind::None)
24486 if (TwoElementReduction)
24489 if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
24493 return I->getFastMathFlags().noNaNs();
24496 if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum)
24499 return I->isAssociative();
24502 static Value *getRdxOperand(Instruction *
I,
unsigned Index) {
24508 return I->getOperand(2);
24509 return I->getOperand(Index);
24514 Value *
RHS,
const Twine &Name,
bool UseSelect) {
24518 case RecurKind::Or: {
24527 case RecurKind::And: {
24537 case RecurKind::Add:
24538 case RecurKind::Mul:
24539 case RecurKind::Xor:
24540 case RecurKind::FAdd:
24541 case RecurKind::FMul: {
24546 case RecurKind::SMax:
24547 case RecurKind::SMin:
24548 case RecurKind::UMax:
24549 case RecurKind::UMin:
24557 case RecurKind::FMax:
24558 case RecurKind::FMin:
24559 case RecurKind::FMaximum:
24560 case RecurKind::FMinimum:
24561 case RecurKind::FMaximumNum:
24562 case RecurKind::FMinimumNum: {
24575 const ReductionOpsListType &ReductionOps) {
24576 bool UseSelect = ReductionOps.size() == 2 ||
24578 (ReductionOps.size() == 1 &&
24580 assert((!UseSelect || ReductionOps.size() != 2 ||
24582 "Expected cmp + select pairs for reduction");
24583 Value *
Op = createOp(Builder, RdxKind,
LHS,
RHS, Name, UseSelect);
24601 return RecurKind::None;
24603 return RecurKind::Add;
24605 return RecurKind::Mul;
24608 return RecurKind::And;
24611 return RecurKind::Or;
24613 return RecurKind::Xor;
24615 return RecurKind::FAdd;
24617 return RecurKind::FMul;
24620 return RecurKind::FMax;
24622 return RecurKind::FMin;
24625 return RecurKind::FMaximum;
24627 return RecurKind::FMinimum;
24633 return RecurKind::SMax;
24635 return RecurKind::SMin;
24637 return RecurKind::UMax;
24639 return RecurKind::UMin;
24665 return RecurKind::None;
24669 return RecurKind::None;
24672 return RecurKind::None;
24676 return RecurKind::None;
24681 return RecurKind::None;
24684 return RecurKind::SMax;
24687 return RecurKind::SMin;
24690 return RecurKind::UMax;
24693 return RecurKind::UMin;
24696 return RecurKind::None;
24700 static unsigned getFirstOperandIndex(Instruction *
I) {
24701 return isCmpSelMinMax(
I) ? 1 : 0;
24706 static unsigned getNumberOfOperands(Instruction *
I) {
24707 return isCmpSelMinMax(
I) ? 3 : 2;
24712 static bool hasSameParent(Instruction *
I, BasicBlock *BB) {
24713 if (isCmpSelMinMax(
I) || isBoolLogicOp(
I)) {
24716 return Sel->getParent() == BB &&
Cmp &&
Cmp->getParent() == BB;
24718 return I->getParent() == BB;
24722 static bool hasRequiredNumberOfUses(
bool IsCmpSelMinMax, Instruction *
I) {
24723 if (IsCmpSelMinMax) {
24727 return Sel->hasNUses(2) && Sel->getCondition()->hasOneUse();
24728 return I->hasNUses(2);
24736 void initReductionOps(Instruction *
I) {
24737 if (isCmpSelMinMax(
I))
24738 ReductionOps.assign(2, ReductionOpsType());
24740 ReductionOps.assign(1, ReductionOpsType());
24744 void addReductionOps(Instruction *
I) {
24745 if (isCmpSelMinMax(
I)) {
24747 ReductionOps[1].emplace_back(
I);
24749 ReductionOps[0].emplace_back(
I);
24754 int Sz =
Data.size();
24763 : ReductionRoot(
I), ReductionLimit(2) {
24764 RdxKind = HorizontalReduction::getRdxKind(
I);
24765 ReductionOps.emplace_back().push_back(
I);
24768 ReducedValsToOps[
V].push_back(
I);
24771 bool matchReductionForOperands()
const {
24774 assert(ReductionRoot &&
"Reduction root is not set!");
24777 return Ops.size() == 2;
24785 bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root,
24786 ScalarEvolution &SE,
const DataLayout &
DL,
24787 const TargetLibraryInfo &TLI) {
24788 RdxKind = HorizontalReduction::getRdxKind(Root);
24789 if (!isVectorizable(RdxKind, Root))
24801 if (!Sel->getCondition()->hasOneUse())
24804 ReductionRoot = Root;
24809 bool IsCmpSelMinMax = isCmpSelMinMax(Root);
24811 1, std::make_pair(Root, 0));
24816 SmallVectorImpl<Value *> &PossibleReducedVals,
24817 SmallVectorImpl<Instruction *> &ReductionOps,
24820 getNumberOfOperands(TreeN)))) {
24821 Value *EdgeVal = getRdxOperand(TreeN,
I);
24822 ReducedValsToOps[EdgeVal].push_back(TreeN);
24830 IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
24831 !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
24832 !isVectorizable(RdxKind, EdgeInst) ||
24833 (
R.isAnalyzedReductionRoot(EdgeInst) &&
24835 PossibleReducedVals.push_back(EdgeVal);
24838 ReductionOps.push_back(EdgeInst);
24847 size_t, SmallMapVector<size_t, SmallMapVector<Value *, unsigned, 2>, 2>,
24849 PossibleReducedVals;
24850 initReductionOps(Root);
24852 SmallSet<size_t, 2> LoadKeyUsed;
24854 auto GenerateLoadsSubkey = [&](
size_t Key, LoadInst *LI) {
24859 auto LIt = LoadsMap.
find(std::make_pair(
Key, Ptr));
24860 if (LIt != LoadsMap.
end()) {
24861 for (LoadInst *RLI : LIt->second) {
24867 for (LoadInst *RLI : LIt->second) {
24874 if (LIt->second.size() > 2) {
24876 hash_value(LIt->second.back()->getPointerOperand());
24882 .first->second.push_back(LI);
24886 while (!Worklist.empty()) {
24887 auto [TreeN,
Level] = Worklist.pop_back_val();
24890 CheckOperands(TreeN, PossibleRedVals, PossibleReductionOps, Level);
24891 addReductionOps(TreeN);
24894 for (
Value *V : PossibleRedVals) {
24898 ++PossibleReducedVals[
Key][Idx].
try_emplace(V, 0).first->second;
24900 for (Instruction *
I :
reverse(PossibleReductionOps))
24901 Worklist.emplace_back(
I,
I->getParent() == BB ? 0 : Level + 1);
24903 auto PossibleReducedValsVect = PossibleReducedVals.
takeVector();
24906 for (
auto &PossibleReducedVals : PossibleReducedValsVect) {
24907 auto PossibleRedVals = PossibleReducedVals.second.
takeVector();
24909 for (
auto &Slice : PossibleRedVals) {
24911 auto RedValsVect = Slice.second.takeVector();
24913 for (
const std::pair<Value *, unsigned> &
Data : RedValsVect)
24914 PossibleRedValsVect.
back().append(
Data.second,
Data.first);
24916 stable_sort(PossibleRedValsVect, [](
const auto &P1,
const auto &P2) {
24917 return P1.size() > P2.size();
24924 }
else if (!isGoodForReduction(
Data)) {
24927 if (!LI || !LastLI ||
24932 ReducedVals.
back().append(
Data.rbegin(),
Data.rend());
24938 return P1.size() > P2.
size();
24944 Value *tryToReduce(BoUpSLP &V,
const DataLayout &
DL, TargetTransformInfo *
TTI,
24945 const TargetLibraryInfo &TLI, AssumptionCache *AC,
24946 DominatorTree &DT) {
24947 constexpr unsigned RegMaxNumber = 4;
24948 constexpr unsigned RedValsMaxNumber = 128;
24952 if (
unsigned NumReducedVals = std::accumulate(
24953 ReducedVals.
begin(), ReducedVals.
end(), 0,
24955 if (!isGoodForReduction(Vals))
24957 return Num + Vals.size();
24959 NumReducedVals < ReductionLimit &&
24963 for (ReductionOpsType &RdxOps : ReductionOps)
24964 for (
Value *RdxOp : RdxOps)
24969 IRBuilder<TargetFolder> Builder(ReductionRoot->getContext(),
24975 DenseMap<Value *, WeakTrackingVH> TrackedVals(ReducedVals.
size() *
24976 ReducedVals.
front().size());
24980 auto &&GetCmpForMinMaxReduction = [](
Instruction *RdxRootInst) {
24982 "Expected min/max reduction to have select root instruction");
24985 "Expected min/max reduction to have compare condition");
24989 bool AnyBoolLogicOp =
any_of(ReductionOps.back(), [](
Value *V) {
24990 return isBoolLogicOp(cast<Instruction>(V));
24993 auto GetNewVectorizedTree = [&](
Value *VectorizedTree,
Value *Res) {
24994 if (VectorizedTree) {
24998 if (AnyBoolLogicOp) {
24999 auto It = ReducedValsToOps.
find(VectorizedTree);
25000 auto It1 = ReducedValsToOps.
find(Res);
25001 if ((It == ReducedValsToOps.
end() && It1 == ReducedValsToOps.
end()) ||
25003 (It != ReducedValsToOps.
end() &&
25004 any_of(It->getSecond(), [&](Instruction *
I) {
25005 return isBoolLogicOp(I) &&
25006 getRdxOperand(I, 0) == VectorizedTree;
25010 (It1 != ReducedValsToOps.
end() &&
25011 any_of(It1->getSecond(), [&](Instruction *
I) {
25012 return isBoolLogicOp(I) && getRdxOperand(I, 0) == Res;
25016 VectorizedTree = Builder.
CreateFreeze(VectorizedTree);
25020 return createOp(Builder, RdxKind, VectorizedTree, Res,
"op.rdx",
25026 SmallDenseSet<Value *> IgnoreList(ReductionOps.size() *
25027 ReductionOps.front().size());
25028 for (ReductionOpsType &RdxOps : ReductionOps)
25029 for (
Value *RdxOp : RdxOps) {
25032 IgnoreList.insert(RdxOp);
25035 FastMathFlags RdxFMF;
25037 for (
Value *U : IgnoreList)
25039 RdxFMF &= FPMO->getFastMathFlags();
25045 for (
Value *V : Candidates)
25046 TrackedVals.try_emplace(V, V);
25048 auto At = [](SmallMapVector<Value *, unsigned, 16> &MV,
25049 Value *
V) ->
unsigned & {
25050 auto *It = MV.
find(V);
25051 assert(It != MV.
end() &&
"Unable to find given key.");
25055 DenseMap<Value *, unsigned> VectorizedVals(ReducedVals.
size());
25058 SmallPtrSet<Value *, 4> RequiredExtract;
25059 WeakTrackingVH VectorizedTree =
nullptr;
25060 bool CheckForReusedReductionOps =
false;
25070 States.
back().getOpcode() == Instruction::Load)) {
25071 LocalReducedVals.
emplace_back().append(RV.begin(), RV.end());
25072 States.
push_back(InstructionsState::invalid());
25075 if (!LocalReducedVals.
empty() &&
25078 LocalReducedVals.
emplace_back().append(RV.begin(), RV.end());
25083 if (!LocalReducedVals.
empty())
25084 Ops = LocalReducedVals.
back();
25085 Ops.append(RV.begin(), RV.end());
25086 InstructionsCompatibilityAnalysis
Analysis(DT,
DL, *
TTI, TLI);
25087 InstructionsState OpS =
25089 if (LocalReducedVals.
empty()) {
25095 LocalReducedVals.
back().swap(
Ops);
25096 States.
back() = OpS;
25099 LocalReducedVals.
emplace_back().append(RV.begin(), RV.end());
25102 ReducedVals.swap(LocalReducedVals);
25103 for (
unsigned I = 0,
E = ReducedVals.
size();
I <
E; ++
I) {
25105 InstructionsState S = States[
I];
25108 DenseMap<Value *, Value *> TrackedToOrig(2 * OrigReducedVals.
size());
25109 for (
Value *ReducedVal : OrigReducedVals) {
25110 Value *RdxVal = TrackedVals.at(ReducedVal);
25117 (!S || (!S.getMatchingMainOpOrAltOp(Inst) &&
25118 !S.isCopyableElement(Inst)))) ||
25120 !S.isCopyableElement(RdxVal)))
25123 TrackedToOrig.try_emplace(RdxVal, ReducedVal);
25125 bool ShuffledExtracts =
false;
25127 if (S && S.getOpcode() == Instruction::ExtractElement &&
25128 !S.isAltShuffle() &&
I + 1 <
E) {
25130 for (
Value *RV : ReducedVals[
I + 1]) {
25131 Value *RdxVal = TrackedVals.at(RV);
25138 CommonCandidates.push_back(RdxVal);
25139 TrackedToOrig.try_emplace(RdxVal, RV);
25141 SmallVector<int>
Mask;
25144 Candidates.
swap(CommonCandidates);
25145 ShuffledExtracts =
true;
25152 Value *OrigV = TrackedToOrig.at(Candidates.
front());
25153 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
25155 Res = createOp(Builder, RdxKind, Res, VC,
"const.rdx", ReductionOps);
25156 Value *OrigV = TrackedToOrig.at(VC);
25157 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
25159 V.analyzedReductionRoot(ResI);
25161 VectorizedTree = GetNewVectorizedTree(VectorizedTree, Res);
25165 unsigned NumReducedVals = Candidates.
size();
25166 if (NumReducedVals < ReductionLimit &&
25167 (NumReducedVals < 2 || !
isSplat(Candidates)))
25172 IsSupportedHorRdxIdentityOp = RdxKind != RecurKind::Mul &&
25173 RdxKind != RecurKind::FMul &&
25174 RdxKind != RecurKind::FMulAdd;
25176 SmallMapVector<Value *, unsigned, 16> SameValuesCounter;
25177 if (IsSupportedHorRdxIdentityOp)
25178 for (
Value *V : Candidates) {
25179 Value *OrigV = TrackedToOrig.at(V);
25180 ++SameValuesCounter.
try_emplace(OrigV).first->second;
25192 bool SameScaleFactor =
false;
25193 bool OptReusedScalars = IsSupportedHorRdxIdentityOp &&
25194 SameValuesCounter.
size() != Candidates.size();
25196 if (OptReusedScalars) {
25198 (RdxKind == RecurKind::Add || RdxKind == RecurKind::FAdd ||
25199 RdxKind == RecurKind::Xor) &&
25201 [&SameValuesCounter](
const std::pair<Value *, unsigned> &
P) {
25202 return P.second == SameValuesCounter.
front().second;
25204 Candidates.resize(SameValuesCounter.
size());
25205 transform(SameValuesCounter, Candidates.begin(),
25206 [&](
const auto &
P) { return TrackedVals.at(P.first); });
25207 NumReducedVals = Candidates.size();
25209 if (NumReducedVals == 1) {
25210 Value *OrigV = TrackedToOrig.at(Candidates.front());
25211 unsigned Cnt = At(SameValuesCounter, OrigV);
25213 emitScaleForReusedOps(Candidates.front(), Builder, Cnt);
25214 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
25215 VectorizedVals.try_emplace(OrigV, Cnt);
25216 ExternallyUsedValues.
insert(OrigV);
25221 unsigned MaxVecRegSize =
V.getMaxVecRegSize();
25222 unsigned EltSize =
V.getVectorElementSize(Candidates[0]);
25223 const unsigned MaxElts = std::clamp<unsigned>(
25225 RegMaxNumber * RedValsMaxNumber);
25227 unsigned ReduxWidth = NumReducedVals;
25228 auto GetVectorFactor = [&, &
TTI = *
TTI](
unsigned ReduxWidth) {
25229 unsigned NumParts, NumRegs;
25230 Type *ScalarTy = Candidates.front()->getType();
25237 while (NumParts > NumRegs) {
25238 assert(ReduxWidth > 0 &&
"ReduxWidth is unexpectedly 0.");
25239 ReduxWidth =
bit_floor(ReduxWidth - 1);
25245 if (NumParts > NumRegs / 2)
25250 ReduxWidth = GetVectorFactor(ReduxWidth);
25251 ReduxWidth = std::min(ReduxWidth, MaxElts);
25253 unsigned Start = 0;
25254 unsigned Pos =
Start;
25256 unsigned PrevReduxWidth = ReduxWidth;
25257 bool CheckForReusedReductionOpsLocal =
false;
25258 auto AdjustReducedVals = [&](
bool IgnoreVL =
false) {
25259 bool IsAnyRedOpGathered = !IgnoreVL &&
V.isAnyGathered(IgnoreList);
25260 if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) {
25263 CheckForReusedReductionOpsLocal |= IsAnyRedOpGathered;
25266 if (Pos < NumReducedVals - ReduxWidth + 1)
25267 return IsAnyRedOpGathered;
25270 if (ReduxWidth > 1)
25271 ReduxWidth = GetVectorFactor(ReduxWidth);
25272 return IsAnyRedOpGathered;
25274 bool AnyVectorized =
false;
25275 SmallDenseSet<std::pair<unsigned, unsigned>, 8> IgnoredCandidates;
25276 while (Pos < NumReducedVals - ReduxWidth + 1 &&
25277 ReduxWidth >= ReductionLimit) {
25280 if (CheckForReusedReductionOpsLocal && PrevReduxWidth != ReduxWidth &&
25282 CheckForReusedReductionOps =
true;
25285 PrevReduxWidth = ReduxWidth;
25288 if (IgnoredCandidates.
contains(std::make_pair(Pos, ReduxWidth)) ||
25291 std::make_pair(Pos,
bit_floor(ReduxWidth))) ||
25293 std::make_pair(Pos + (ReduxWidth -
bit_floor(ReduxWidth)),
25295 V.areAnalyzedReductionVals(VL)) {
25296 (void)AdjustReducedVals(
true);
25303 return RedValI &&
V.isDeleted(RedValI);
25306 V.buildTree(VL, IgnoreList);
25307 if (
V.isTreeTinyAndNotFullyVectorizable(
true)) {
25308 if (!AdjustReducedVals())
25309 V.analyzedReductionVals(VL);
25312 if (
V.isLoadCombineReductionCandidate(RdxKind)) {
25313 if (!AdjustReducedVals())
25314 V.analyzedReductionVals(VL);
25317 V.reorderTopToBottom();
25320 VL.front()->getType()->isIntOrIntVectorTy() ||
25321 ReductionLimit > 2);
25325 ExternallyUsedValues);
25329 LocalExternallyUsedValues.insert(ReductionRoot);
25330 for (
unsigned Cnt = 0, Sz = ReducedVals.
size(); Cnt < Sz; ++Cnt) {
25331 if (Cnt ==
I || (ShuffledExtracts && Cnt ==
I - 1))
25333 for (
Value *V : ReducedVals[Cnt])
25335 LocalExternallyUsedValues.insert(TrackedVals[V]);
25337 if (!IsSupportedHorRdxIdentityOp) {
25340 "Reused values counter map is not empty");
25341 for (
unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
25342 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
25344 Value *
V = Candidates[Cnt];
25345 Value *OrigV = TrackedToOrig.at(V);
25346 ++SameValuesCounter.
try_emplace(OrigV).first->second;
25349 V.transformNodes();
25350 V.computeMinimumValueSizes();
25355 SmallPtrSet<Value *, 4> Visited;
25356 for (
unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
25357 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
25359 Value *RdxVal = Candidates[Cnt];
25360 if (
auto It = TrackedVals.find(RdxVal); It != TrackedVals.end())
25361 RdxVal = It->second;
25362 if (!Visited.
insert(RdxVal).second)
25366 if (!VLScalars.contains(RdxVal) &&
V.isVectorized(RdxVal)) {
25367 LocalExternallyUsedValues.insert(RdxVal);
25370 Value *OrigV = TrackedToOrig.at(RdxVal);
25372 VectorizedVals.lookup(OrigV) + At(SameValuesCounter, OrigV);
25373 if (
NumOps != ReducedValsToOps.
at(OrigV).size())
25374 LocalExternallyUsedValues.insert(RdxVal);
25377 if (!IsSupportedHorRdxIdentityOp)
25378 SameValuesCounter.
clear();
25379 for (
Value *RdxVal : VL)
25380 if (RequiredExtract.
contains(RdxVal))
25381 LocalExternallyUsedValues.insert(RdxVal);
25382 V.buildExternalUses(LocalExternallyUsedValues);
25386 getReductionCost(
TTI, VL, IsCmpSelMinMax, RdxFMF, V, DT,
DL, TLI);
25389 <<
" for reduction\n");
25393 V.getORE()->emit([&]() {
25394 return OptimizationRemarkMissed(
SV_NAME,
"HorSLPNotBeneficial",
25395 ReducedValsToOps.
at(VL[0]).front())
25396 <<
"Vectorizing horizontal reduction is possible "
25397 <<
"but not beneficial with cost " <<
ore::NV(
"Cost",
Cost)
25398 <<
" and threshold "
25401 if (!AdjustReducedVals()) {
25402 V.analyzedReductionVals(VL);
25404 if (ReduxWidth > ReductionLimit &&
V.isTreeNotExtendable()) {
25407 *
TTI, VL.front()->getType(), ReduxWidth - 1);
25408 VF >= ReductionLimit;
25410 *
TTI, VL.front()->getType(), VF - 1)) {
25412 V.getCanonicalGraphSize() !=
V.getTreeSize())
25415 IgnoredCandidates.
insert(std::make_pair(
Offset + Idx, VF));
25422 LLVM_DEBUG(
dbgs() <<
"SLP: Vectorizing horizontal reduction at cost:"
25423 <<
Cost <<
". (HorRdx)\n");
25424 V.getORE()->emit([&]() {
25425 return OptimizationRemark(
SV_NAME,
"VectorizedHorizontalReduction",
25426 ReducedValsToOps.
at(VL[0]).front())
25427 <<
"Vectorized horizontal reduction with cost "
25428 <<
ore::NV(
"Cost",
Cost) <<
" and with tree size "
25429 <<
ore::NV(
"TreeSize",
V.getTreeSize());
25438 if (IsCmpSelMinMax)
25439 InsertPt = GetCmpForMinMaxReduction(RdxRootInst);
25442 Value *VectorizedRoot =
V.vectorizeTree(
25443 LocalExternallyUsedValues, InsertPt, VectorValuesAndScales);
25446 for (
Value *RdxVal : Candidates) {
25447 Value *OrigVal = TrackedToOrig.at(RdxVal);
25448 Value *TransformedRdxVal = TrackedVals.at(OrigVal);
25449 if (TransformedRdxVal != RdxVal)
25450 TrackedToOrig.try_emplace(TransformedRdxVal, OrigVal);
25459 VectorizedRoot = Builder.
CreateFreeze(VectorizedRoot);
25462 if (OptReusedScalars && !SameScaleFactor) {
25463 VectorizedRoot = emitReusedOps(VectorizedRoot, Builder, V,
25464 SameValuesCounter, TrackedToOrig);
25467 Type *ScalarTy = VL.front()->getType();
25472 OptReusedScalars && SameScaleFactor
25473 ? SameValuesCounter.
front().second
25476 ?
V.isSignedMinBitwidthRootNode()
25480 for (
Value *RdxVal : VL) {
25481 Value *OrigV = TrackedToOrig.at(RdxVal);
25482 if (IsSupportedHorRdxIdentityOp) {
25483 VectorizedVals.try_emplace(OrigV, At(SameValuesCounter, OrigV));
25486 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
25487 if (!
V.isVectorized(RdxVal))
25488 RequiredExtract.
insert(RdxVal);
25492 ReduxWidth = NumReducedVals - Pos;
25493 if (ReduxWidth > 1)
25494 ReduxWidth = GetVectorFactor(NumReducedVals - Pos);
25495 AnyVectorized =
true;
25497 if (OptReusedScalars && !AnyVectorized) {
25498 for (
const std::pair<Value *, unsigned> &
P : SameValuesCounter) {
25499 Value *RdxVal = TrackedVals.at(
P.first);
25500 Value *RedVal = emitScaleForReusedOps(RdxVal, Builder,
P.second);
25501 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
25502 VectorizedVals.try_emplace(
P.first,
P.second);
25507 if (!VectorValuesAndScales.
empty())
25508 VectorizedTree = GetNewVectorizedTree(
25510 emitReduction(Builder, *
TTI, ReductionRoot->getType()));
25512 if (!VectorizedTree) {
25513 if (!CheckForReusedReductionOps) {
25514 for (ReductionOpsType &RdxOps : ReductionOps)
25515 for (
Value *RdxOp : RdxOps)
25537 auto FixBoolLogicalOps =
25540 if (!AnyBoolLogicOp)
25542 if (isBoolLogicOp(RedOp1) && ((!InitStep &&
LHS == VectorizedTree) ||
25543 getRdxOperand(RedOp1, 0) ==
LHS ||
25546 bool NeedFreeze =
LHS != VectorizedTree;
25547 if (isBoolLogicOp(RedOp2) && ((!InitStep &&
RHS == VectorizedTree) ||
25548 getRdxOperand(RedOp2, 0) ==
RHS ||
25551 if ((InitStep ||
RHS != VectorizedTree) &&
25552 getRdxOperand(RedOp2, 0) ==
RHS &&
25553 ((isBoolLogicOp(RedOp1) &&
25554 getRdxOperand(RedOp1, 1) == RedOp2) ||
25558 return OpI && isBoolLogicOp(OpI) &&
25559 getRdxOperand(OpI, 1) == RedOp2;
25562 NeedFreeze =
false;
25576 unsigned Sz = InstVals.
size();
25578 for (
unsigned I = 0,
E = (Sz / 2) * 2;
I <
E;
I += 2) {
25581 Value *RdxVal1 = InstVals[
I].second;
25582 Value *StableRdxVal1 = RdxVal1;
25583 auto It1 = TrackedVals.find(RdxVal1);
25584 if (It1 != TrackedVals.end())
25585 StableRdxVal1 = It1->second;
25586 Value *RdxVal2 = InstVals[
I + 1].second;
25587 Value *StableRdxVal2 = RdxVal2;
25588 auto It2 = TrackedVals.find(RdxVal2);
25589 if (It2 != TrackedVals.end())
25590 StableRdxVal2 = It2->second;
25594 FixBoolLogicalOps(StableRdxVal1, StableRdxVal2, InstVals[
I].first,
25596 Value *ExtraRed = createOp(Builder, RdxKind, StableRdxVal1,
25597 StableRdxVal2,
"op.rdx", ReductionOps);
25598 ExtraReds[
I / 2] = std::make_pair(InstVals[
I].first, ExtraRed);
25601 ExtraReds[Sz / 2] = InstVals.
back();
25607 SmallPtrSet<Value *, 8> Visited;
25609 for (
Value *RdxVal : Candidates) {
25610 if (!Visited.
insert(RdxVal).second)
25612 unsigned NumOps = VectorizedVals.lookup(RdxVal);
25613 for (Instruction *RedOp :
25619 bool InitStep =
true;
25620 while (ExtraReductions.
size() > 1) {
25622 FinalGen(ExtraReductions, InitStep);
25623 ExtraReductions.
swap(NewReds);
25626 VectorizedTree = ExtraReductions.
front().second;
25628 ReductionRoot->replaceAllUsesWith(VectorizedTree);
25635 SmallPtrSet<Value *, 4> IgnoreSet;
25644 for (
auto *U :
Ignore->users()) {
25646 "All users must be either in the reduction ops list.");
25649 if (!
Ignore->use_empty()) {
25651 Ignore->replaceAllUsesWith(
P);
25654 V.removeInstructionsAndOperands(RdxOps, VectorValuesAndScales);
25656 return VectorizedTree;
25662 Value *createSingleOp(IRBuilderBase &Builder,
const TargetTransformInfo &
TTI,
25663 Value *Vec,
unsigned Scale,
bool IsSigned,
25687 Rdx, emitReduction(Lane, Builder, &
TTI, DestTy),
I);
25690 Rdx = emitReduction(Vec, Builder, &
TTI, DestTy);
25692 if (Rdx->
getType() != DestTy)
25698 Rdx = emitScaleForReusedOps(Rdx, Builder, Scale);
25705 bool IsCmpSelMinMax, FastMathFlags FMF,
25706 const BoUpSLP &R, DominatorTree &DT,
25707 const DataLayout &
DL,
25708 const TargetLibraryInfo &TLI) {
25710 Type *ScalarTy = ReducedVals.
front()->getType();
25711 unsigned ReduxWidth = ReducedVals.
size();
25712 FixedVectorType *VectorTy =
R.getReductionType();
25717 auto EvaluateScalarCost = [&](function_ref<
InstructionCost()> GenCostFn) {
25720 int Cnt = ReducedVals.
size();
25721 for (
Value *RdxVal : ReducedVals) {
25728 Cost += GenCostFn();
25732 for (User *U : RdxVal->
users()) {
25734 if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) {
25735 if (RdxKind == RecurKind::FAdd) {
25745 FMACost -= FMulCost;
25747 ScalarCost += FMACost;
25754 ScalarCost = InstructionCost::getInvalid();
25758 Cost += ScalarCost;
25760 Cost += GenCostFn();
25769 bool DoesRequireReductionOp = !AllConsts && VectorValuesAndScales.
empty();
25771 case RecurKind::Add:
25772 case RecurKind::Mul:
25773 case RecurKind::Or:
25774 case RecurKind::And:
25775 case RecurKind::Xor:
25776 case RecurKind::FAdd:
25777 case RecurKind::FMul: {
25780 if (DoesRequireReductionOp) {
25783 unsigned ScalarTyNumElements = VecTy->getNumElements();
25788 ReducedVals.size()),
25799 auto [RType, IsSigned] =
R.getRootNodeTypeWithNoCast().value_or(
25800 std::make_pair(RedTy,
true));
25801 if (RType == RedTy) {
25806 RdxOpcode, !IsSigned, RedTy,
25812 auto [RType, IsSigned] =
R.getRootNodeTypeWithNoCast().value_or(
25813 std::make_pair(RedTy,
true));
25816 if (RdxKind == RecurKind::FAdd) {
25821 for (
Value *RdxVal : ReducedVals) {
25827 FMF &= FPCI->getFastMathFlags();
25830 if (!
Ops.empty()) {
25835 IntrinsicCostAttributes ICA(Intrinsic::fmuladd, RVecTy,
25836 {RVecTy, RVecTy, RVecTy}, FMF);
25842 Instruction::FMul, RVecTy,
CostKind);
25844 <<
"Minus vector FMul cost: " << FMulCost <<
"\n");
25845 FMACost -= FMulCost;
25849 if (FMACost.isValid())
25850 VectorCost += FMACost;
25854 if (RType != RedTy) {
25855 unsigned Opcode = Instruction::Trunc;
25857 Opcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
25863 ScalarCost = EvaluateScalarCost([&]() {
25868 case RecurKind::FMax:
25869 case RecurKind::FMin:
25870 case RecurKind::FMaximum:
25871 case RecurKind::FMinimum:
25872 case RecurKind::SMax:
25873 case RecurKind::SMin:
25874 case RecurKind::UMax:
25875 case RecurKind::UMin: {
25878 if (DoesRequireReductionOp) {
25884 auto [RType, IsSigned] =
R.getRootNodeTypeWithNoCast().value_or(
25885 std::make_pair(RedTy,
true));
25887 IntrinsicCostAttributes ICA(Id, RVecTy, {RVecTy, RVecTy}, FMF);
25889 if (RType != RedTy) {
25890 unsigned Opcode = Instruction::Trunc;
25892 Opcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
25898 ScalarCost = EvaluateScalarCost([&]() {
25899 IntrinsicCostAttributes ICA(Id, ScalarTy, {ScalarTy, ScalarTy}, FMF);
25908 LLVM_DEBUG(
dbgs() <<
"SLP: Adding cost " << VectorCost - ScalarCost
25910 <<
" (It is a splitting reduction)\n");
25911 return VectorCost - ScalarCost;
25917 Value *emitReduction(IRBuilderBase &Builder,
const TargetTransformInfo &
TTI,
25919 Value *ReducedSubTree =
nullptr;
25921 auto CreateSingleOp = [&](
Value *Vec,
unsigned Scale,
bool IsSigned) {
25922 Value *Rdx = createSingleOp(Builder,
TTI, Vec, Scale, IsSigned, DestTy);
25923 if (ReducedSubTree)
25924 ReducedSubTree = createOp(Builder, RdxKind, ReducedSubTree, Rdx,
25925 "op.rdx", ReductionOps);
25927 ReducedSubTree = Rdx;
25929 if (VectorValuesAndScales.
size() == 1) {
25930 const auto &[Vec, Scale, IsSigned] = VectorValuesAndScales.
front();
25931 CreateSingleOp(Vec, Scale, IsSigned);
25932 return ReducedSubTree;
25936 Value *VecRes =
nullptr;
25937 bool VecResSignedness =
false;
25938 auto CreateVecOp = [&](
Value *Vec,
unsigned Cnt,
bool IsSigned) {
25944 case RecurKind::Add: {
25945 if (ScalarTy == Builder.
getInt1Ty() && ScalarTy != DestTy) {
25948 <<
". (HorRdx)\n");
25951 std::iota(std::next(
Mask.begin(), VF *
I),
25952 std::next(
Mask.begin(), VF * (
I + 1)), 0);
25953 ++NumVectorInstructions;
25964 LLVM_DEBUG(
dbgs() <<
"SLP: Add (to-mul) " << Cnt <<
"of " << Vec
25965 <<
". (HorRdx)\n");
25966 ++NumVectorInstructions;
25970 case RecurKind::Xor: {
25973 <<
"SLP: Xor " << Cnt <<
"of " << Vec <<
". (HorRdx)\n");
25978 case RecurKind::FAdd: {
25982 LLVM_DEBUG(
dbgs() <<
"SLP: FAdd (to-fmul) " << Cnt <<
"of " << Vec
25983 <<
". (HorRdx)\n");
25984 ++NumVectorInstructions;
25988 case RecurKind::And:
25989 case RecurKind::Or:
25990 case RecurKind::SMax:
25991 case RecurKind::SMin:
25992 case RecurKind::UMax:
25993 case RecurKind::UMin:
25994 case RecurKind::FMax:
25995 case RecurKind::FMin:
25996 case RecurKind::FMaximum:
25997 case RecurKind::FMinimum:
26000 case RecurKind::Sub:
26001 case RecurKind::AddChainWithSubs:
26002 case RecurKind::Mul:
26003 case RecurKind::FMul:
26004 case RecurKind::FMulAdd:
26005 case RecurKind::AnyOf:
26006 case RecurKind::FindFirstIVSMin:
26007 case RecurKind::FindFirstIVUMin:
26008 case RecurKind::FindLastIVSMax:
26009 case RecurKind::FindLastIVUMax:
26010 case RecurKind::FMaxNum:
26011 case RecurKind::FMinNum:
26012 case RecurKind::FMaximumNum:
26013 case RecurKind::FMinimumNum:
26014 case RecurKind::None:
26021 VecResSignedness = IsSigned;
26023 ++NumVectorInstructions;
26024 if (ScalarTy == Builder.
getInt1Ty() && ScalarTy != DestTy &&
26030 std::iota(
Mask.begin(),
Mask.end(), 0);
26032 if (VecResVF < VecVF) {
26036 if (VecResVF != VecVF) {
26038 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
26055 if (VecResVF < VecVF) {
26061 if (VecResVF != VecVF)
26063 Op = createOp(Builder, RdxKind,
Op, Vec,
"rdx.op", ReductionOps);
26064 if (VecResVF != VecVF)
26069 for (
auto [Vec, Scale, IsSigned] : VectorValuesAndScales)
26070 CreateVecOp(Vec, Scale, IsSigned);
26071 CreateSingleOp(VecRes, 1,
false);
26073 return ReducedSubTree;
26077 Value *emitReduction(
Value *VectorizedValue, IRBuilderBase &Builder,
26078 const TargetTransformInfo *
TTI,
Type *DestTy) {
26079 assert(VectorizedValue &&
"Need to have a vectorized tree node");
26080 assert(RdxKind != RecurKind::FMulAdd &&
26081 "A call to the llvm.fmuladd intrinsic is not handled yet");
26084 if (FTy->getScalarType() == Builder.
getInt1Ty() &&
26085 RdxKind == RecurKind::Add &&
26090 VectorizedValue, Builder.
getIntNTy(FTy->getNumElements()));
26091 ++NumVectorInstructions;
26094 ++NumVectorInstructions;
26099 Value *emitScaleForReusedOps(
Value *VectorizedValue, IRBuilderBase &Builder,
26101 assert(IsSupportedHorRdxIdentityOp &&
26102 "The optimization of matched scalar identity horizontal reductions "
26103 "must be supported.");
26105 return VectorizedValue;
26107 case RecurKind::Add: {
26109 Value *Scale = ConstantInt::get(VectorizedValue->
getType(), Cnt);
26111 << VectorizedValue <<
". (HorRdx)\n");
26112 return Builder.
CreateMul(VectorizedValue, Scale);
26114 case RecurKind::Xor: {
26116 LLVM_DEBUG(
dbgs() <<
"SLP: Xor " << Cnt <<
"of " << VectorizedValue
26117 <<
". (HorRdx)\n");
26120 return VectorizedValue;
26122 case RecurKind::FAdd: {
26124 Value *Scale = ConstantFP::get(VectorizedValue->
getType(), Cnt);
26126 << VectorizedValue <<
". (HorRdx)\n");
26127 return Builder.
CreateFMul(VectorizedValue, Scale);
26129 case RecurKind::And:
26130 case RecurKind::Or:
26131 case RecurKind::SMax:
26132 case RecurKind::SMin:
26133 case RecurKind::UMax:
26134 case RecurKind::UMin:
26135 case RecurKind::FMax:
26136 case RecurKind::FMin:
26137 case RecurKind::FMaximum:
26138 case RecurKind::FMinimum:
26140 return VectorizedValue;
26141 case RecurKind::Sub:
26142 case RecurKind::AddChainWithSubs:
26143 case RecurKind::Mul:
26144 case RecurKind::FMul:
26145 case RecurKind::FMulAdd:
26146 case RecurKind::AnyOf:
26147 case RecurKind::FindFirstIVSMin:
26148 case RecurKind::FindFirstIVUMin:
26149 case RecurKind::FindLastIVSMax:
26150 case RecurKind::FindLastIVUMax:
26151 case RecurKind::FMaxNum:
26152 case RecurKind::FMinNum:
26153 case RecurKind::FMaximumNum:
26154 case RecurKind::FMinimumNum:
26155 case RecurKind::None:
26164 emitReusedOps(
Value *VectorizedValue, IRBuilderBase &Builder, BoUpSLP &R,
26165 const SmallMapVector<Value *, unsigned, 16> &SameValuesCounter,
26166 const DenseMap<Value *, Value *> &TrackedToOrig) {
26167 assert(IsSupportedHorRdxIdentityOp &&
26168 "The optimization of matched scalar identity horizontal reductions "
26169 "must be supported.");
26172 if (VTy->getElementType() != VL.
front()->getType()) {
26176 R.isSignedMinBitwidthRootNode());
26179 case RecurKind::Add: {
26182 for (
Value *V : VL) {
26183 unsigned Cnt = SameValuesCounter.
lookup(TrackedToOrig.
at(V));
26184 Vals.
push_back(ConstantInt::get(
V->getType(), Cnt,
false));
26188 << VectorizedValue <<
". (HorRdx)\n");
26189 return Builder.
CreateMul(VectorizedValue, Scale);
26191 case RecurKind::And:
26192 case RecurKind::Or:
26195 <<
". (HorRdx)\n");
26196 return VectorizedValue;
26197 case RecurKind::SMax:
26198 case RecurKind::SMin:
26199 case RecurKind::UMax:
26200 case RecurKind::UMin:
26201 case RecurKind::FMax:
26202 case RecurKind::FMin:
26203 case RecurKind::FMaximum:
26204 case RecurKind::FMinimum:
26207 <<
". (HorRdx)\n");
26208 return VectorizedValue;
26209 case RecurKind::Xor: {
26214 SmallVector<int>
Mask(
26217 std::iota(
Mask.begin(),
Mask.end(), 0);
26218 bool NeedShuffle =
false;
26219 for (
unsigned I = 0, VF = VL.size();
I < VF; ++
I) {
26221 unsigned Cnt = SameValuesCounter.
lookup(TrackedToOrig.
at(V));
26222 if (Cnt % 2 == 0) {
26224 NeedShuffle =
true;
26230 dbgs() <<
"> of " << VectorizedValue <<
". (HorRdx)\n");
26234 ConstantVector::getNullValue(VectorizedValue->
getType()), Mask);
26235 return VectorizedValue;
26237 case RecurKind::FAdd: {
26240 for (
Value *V : VL) {
26241 unsigned Cnt = SameValuesCounter.
lookup(TrackedToOrig.
at(V));
26242 Vals.
push_back(ConstantFP::get(
V->getType(), Cnt));
26245 return Builder.
CreateFMul(VectorizedValue, Scale);
26247 case RecurKind::Sub:
26248 case RecurKind::AddChainWithSubs:
26249 case RecurKind::Mul:
26250 case RecurKind::FMul:
26251 case RecurKind::FMulAdd:
26252 case RecurKind::AnyOf:
26253 case RecurKind::FindFirstIVSMin:
26254 case RecurKind::FindFirstIVUMin:
26255 case RecurKind::FindLastIVSMax:
26256 case RecurKind::FindLastIVUMax:
26257 case RecurKind::FMaxNum:
26258 case RecurKind::FMinNum:
26259 case RecurKind::FMaximumNum:
26260 case RecurKind::FMinimumNum:
26261 case RecurKind::None:
26271 return HorizontalReduction::getRdxKind(V);
26277 unsigned AggregateSize = 1;
26279 Type *CurrentType =
IV->getType();
26282 for (
auto *Elt : ST->elements())
26283 if (Elt != ST->getElementType(0))
26284 return std::nullopt;
26285 AggregateSize *= ST->getNumElements();
26286 CurrentType = ST->getElementType(0);
26288 AggregateSize *= AT->getNumElements();
26289 CurrentType = AT->getElementType();
26291 AggregateSize *= VT->getNumElements();
26292 return AggregateSize;
26294 return AggregateSize;
26296 return std::nullopt;
26305 unsigned OperandOffset,
const BoUpSLP &R) {
26308 std::optional<unsigned> OperandIndex =
26310 if (!OperandIndex || R.isDeleted(LastInsertInst))
26314 BuildVectorOpds, InsertElts, *OperandIndex, R);
26317 BuildVectorOpds[*OperandIndex] = InsertedOperand;
26318 InsertElts[*OperandIndex] = LastInsertInst;
26321 }
while (LastInsertInst !=
nullptr &&
26348 "Expected insertelement or insertvalue instruction!");
26351 "Expected empty result vectors!");
26354 if (!AggregateSize)
26356 BuildVectorOpds.
resize(*AggregateSize);
26357 InsertElts.
resize(*AggregateSize);
26362 if (BuildVectorOpds.
size() >= 2)
26380 auto DominatedReduxValue = [&](
Value *R) {
26388 if (
P->getIncomingBlock(0) == ParentBB) {
26390 }
else if (
P->getIncomingBlock(1) == ParentBB) {
26394 if (Rdx && DominatedReduxValue(Rdx))
26407 if (
P->getIncomingBlock(0) == BBLatch) {
26409 }
else if (
P->getIncomingBlock(1) == BBLatch) {
26413 if (Rdx && DominatedReduxValue(Rdx))
26449 "Expected binop, select, or intrinsic for reduction matching");
26451 Root->
getOperand(HorizontalReduction::getFirstOperandIndex(Root));
26453 Root->
getOperand(HorizontalReduction::getFirstOperandIndex(Root) + 1);
26464 Value *Op0 =
nullptr;
26465 Value *Op1 =
nullptr;
26474 Value *B0 =
nullptr, *B1 =
nullptr;
26479bool SLPVectorizerPass::vectorizeHorReduction(
26480 PHINode *
P, Instruction *Root, BasicBlock *BB,
BoUpSLP &R,
26481 SmallVectorImpl<WeakTrackingVH> &PostponedInsts) {
26490 auto SelectRoot = [&]() {
26492 HorizontalReduction::getRdxKind(Root) != RecurKind::None)
26509 std::queue<std::pair<Instruction *, unsigned>>
Stack;
26510 Stack.emplace(SelectRoot(), 0);
26511 SmallPtrSet<Value *, 8> VisitedInstrs;
26514 if (
R.isAnalyzedReductionRoot(Inst))
26519 if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI))
26521 return HorRdx.tryToReduce(R, *DL, TTI, *TLI, AC, *DT);
26523 auto TryAppendToPostponedInsts = [&](
Instruction *FutureSeed) {
26524 if (TryOperandsAsNewSeeds && FutureSeed == Root) {
26536 while (!
Stack.empty()) {
26539 std::tie(Inst, Level) =
Stack.front();
26544 if (
R.isDeleted(Inst))
26546 if (
Value *VectorizedV = TryToReduce(Inst)) {
26550 Stack.emplace(
I, Level);
26553 if (
R.isDeleted(Inst))
26557 if (!TryAppendToPostponedInsts(Inst)) {
26568 if (VisitedInstrs.
insert(
Op).second)
26573 !
R.isDeleted(
I) &&
I->getParent() == BB)
26574 Stack.emplace(
I, Level);
bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
  if (!I)
    return false;

  if (!isa<BinaryOperator>(I) && !isa<CmpInst>(I))
    return false;

  // Skip fadd/fsub candidates that are likely to be fused into an FMA.
  if ((I->getOpcode() == Instruction::FAdd ||
       I->getOpcode() == Instruction::FSub) &&
      canConvertToFMA(I, getSameOpcode(I, *TLI), *DT, *DL, *TTI, *TLI)
          .isValid())
    return false;

  Value *P = I->getParent();

  // Vectorize in current basic block only.
  auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
  auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
  if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P ||
      R.isDeleted(Op0) || R.isDeleted(Op1))
    return false;

  // First, collect all possible candidates.
  SmallVector<std::pair<Value *, Value *>, 4> Candidates;
  Candidates.emplace_back(Op0, Op1);

  auto *A = dyn_cast<BinaryOperator>(Op0);
  auto *B = dyn_cast<BinaryOperator>(Op1);
  // Try to skip B.
  if (A && B && B->hasOneUse()) {
    auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
    auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
    if (B0 && B0->getParent() == P && !R.isDeleted(B0))
      Candidates.emplace_back(A, B0);
    if (B1 && B1->getParent() == P && !R.isDeleted(B1))
      Candidates.emplace_back(A, B1);
  }
  // Try to skip A.
  if (B && A && A->hasOneUse()) {
    auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
    auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
    if (A0 && A0->getParent() == P && !R.isDeleted(A0))
      Candidates.emplace_back(A0, B);
    if (A1 && A1->getParent() == P && !R.isDeleted(A1))
      Candidates.emplace_back(A1, B);
  }

  auto TryToReduce = [this, &R, &TTI = *TTI](Instruction *Inst,
                                             ArrayRef<Value *> Ops) {
    if (!isReductionCandidate(Inst))
      return false;
    Type *Ty = Inst->getType();
    if (!isValidElementType(Ty) || Ty->isPointerTy())
      return false;
    HorizontalReduction HorRdx(Inst, Ops);
    if (!HorRdx.matchReductionForOperands())
      return false;
    // Check the cost of the vector reduction against the scalar code.
    FixedVectorType *VecTy = getWidenedType(Ty, Ops.size());
    constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
    InstructionCost ScalarCost =
        TTI.getScalarizationOverhead(
            VecTy, APInt::getAllOnes(Ops.size()), /*Insert=*/false,
            /*Extract=*/true, CostKind) +
        TTI.getInstructionCost(Inst, CostKind);
    InstructionCost RedCost;
    switch (::getRdxKind(Inst)) {
    case RecurKind::Add:
    case RecurKind::Mul:
    case RecurKind::Or:
    case RecurKind::And:
    case RecurKind::Xor:
    case RecurKind::FAdd:
    case RecurKind::FMul: {
      FastMathFlags FMF;
      if (auto *FPCI = dyn_cast<FPMathOperator>(Inst))
        FMF = FPCI->getFastMathFlags();
      RedCost = TTI.getArithmeticReductionCost(Inst->getOpcode(), VecTy, FMF,
                                               CostKind);
      break;
    }
    default:
      return false;
    }
    if (RedCost >= ScalarCost)
      return false;

    return HorRdx.tryToReduce(R, *DL, &TTI, *TLI, AC, *DT) != nullptr;
  };
  if (Candidates.size() == 1)
    return TryToReduce(I, {Op0, Op1}) || tryToVectorizeList({Op0, Op1}, R);

  // We have multiple options. Try to pick the single best.
  std::optional<int> BestCandidate = R.findBestRootPair(Candidates);
  if (!BestCandidate)
    return false;
  return (*BestCandidate == 0 &&
          TryToReduce(I, {Candidates[*BestCandidate].first,
                          Candidates[*BestCandidate].second})) ||
         tryToVectorizeList({Candidates[*BestCandidate].first,
                             Candidates[*BestCandidate].second},
                            R);
}
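/// Tries to vectorize a horizontal reduction rooted at \p Root; instructions
/// postponed during reduction matching are then retried as ordinary
/// vectorization seeds.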
bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Instruction *Root,
                                                 BasicBlock *BB, BoUpSLP &R) {
  SmallVector<WeakTrackingVH> PostponedInsts;
  bool Res = vectorizeHorReduction(P, Root, BB, R, PostponedInsts);
  Res |= tryToVectorize(PostponedInsts, R);
  return Res;
}

bool SLPVectorizerPass::tryToVectorize(ArrayRef<WeakTrackingVH> Insts,
                                       BoUpSLP &R) {
  bool Res = false;
  for (Value *V : Insts)
    if (auto *Inst = dyn_cast<Instruction>(V); Inst && !R.isDeleted(Inst))
      Res |= tryToVectorize(Inst, R);
  return Res;
}
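/// Tries to match an aggregate build sequence rooted at \p IVI (a chain of
/// insertvalue instructions) and vectorize the inserted scalars.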
bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
                                                 BasicBlock *BB, BoUpSLP &R,
                                                 bool MaxVFOnly) {
  if (!R.canMapToVector(IVI->getType()))
    return false;

  SmallVector<Value *, 16> BuildVectorOpds;
  SmallVector<Value *, 16> BuildVectorInsts;
  if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, BuildVectorInsts, R))
    return false;

  if (MaxVFOnly && BuildVectorOpds.size() == 2) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "NotPossible", IVI)
             << "Cannot SLP vectorize list: only 2 elements of buildvalue, "
                "trying reduction first.";
    });
    return false;
  }
  LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
  // Aggregate value is unlikely to be processed in vector register.
  return tryToVectorizeList(BuildVectorOpds, R, MaxVFOnly);
}
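/// Tries to match a buildvector sequence rooted at \p IEI (a chain of
/// insertelement instructions) and vectorize the inserted scalars, unless the
/// sequence is already representable as a plain shuffle of extractelements.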
bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
                                                   BasicBlock *BB, BoUpSLP &R,
                                                   bool MaxVFOnly) {
  SmallVector<Value *, 16> BuildVectorInsts;
  SmallVector<Value *, 16> BuildVectorOpds;
  SmallVector<int> Mask;
  if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts, R) ||
      (all_of(BuildVectorOpds, IsaPred<ExtractElementInst, UndefValue>) &&
       isFixedVectorShuffle(BuildVectorOpds, Mask, AC)))
    return false;

  if (MaxVFOnly && BuildVectorInsts.size() == 2) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "NotPossible", IEI)
             << "Cannot SLP vectorize list: only 2 elements of buildvector, "
                "trying reduction first.";
    });
    return false;
  }
  LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n");
  return tryToVectorizeList(BuildVectorInsts, R, MaxVFOnly);
}
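/// Tries to vectorize a sequence of candidate instructions: sorts them with
/// \p Comparator, collects maximal runs of compatible instructions using
/// \p AreCompatible, and hands each run to \p TryToVectorizeHelper, first for
/// the maximal vector factor only and then for smaller factors.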
template <typename T>
static bool tryToVectorizeSequence(
    SmallVectorImpl<T *> &Incoming, function_ref<bool(T *, T *)> Comparator,
    function_ref<bool(ArrayRef<T *>, T *)> AreCompatible,
    function_ref<bool(ArrayRef<T *>, bool)> TryToVectorizeHelper,
    bool MaxVFOnly, BoUpSLP &R) {
  bool Changed = false;
  // Sort by type, parent, operands.
  stable_sort(Incoming, Comparator);

  // Try to vectorize elements based on their type.
  SmallVector<T *> Candidates;
  SmallVector<T *> VL;
  for (auto *IncIt = Incoming.begin(), *E = Incoming.end(); IncIt != E;
       VL.clear()) {
    // Look for the next elements with the same type, parent and operand
    // kinds.
    auto *I = dyn_cast<Instruction>(*IncIt);
    if (!I || R.isDeleted(I)) {
      ++IncIt;
      continue;
    }
    auto *SameTypeIt = IncIt;
    while (SameTypeIt != E && (!isa<Instruction>(*SameTypeIt) ||
                               R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
                               AreCompatible(VL, *SameTypeIt))) {
      auto *I = dyn_cast<Instruction>(*SameTypeIt);
      ++SameTypeIt;
      if (I && !R.isDeleted(I))
        VL.push_back(cast<T>(I));
    }

    // Try to vectorize them.
    unsigned NumElts = VL.size();
    LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at nodes ("
                      << NumElts << ")\n");
    if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL), MaxVFOnly)) {
      // Success: start over because instructions might have been changed.
      Changed = true;
      VL.swap(Candidates);
      Candidates.clear();
      for (T *V : VL)
        if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
          Candidates.push_back(V);
    } else {
      // Remember non-vectorized small sequences of the same type: they may
      // form a longer vectorizable sequence together.
      auto GetMinNumElements = [&R](Value *V) {
        unsigned EltSize = R.getVectorElementSize(V);
        return std::max(2U, R.getMaxVecRegSize() / EltSize);
      };
      if (NumElts < GetMinNumElements(*IncIt) &&
          (Candidates.empty() ||
           Candidates.front()->getType() == (*IncIt)->getType()))
        for (T *V : VL)
          if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
            Candidates.push_back(V);
    }
    // Final attempt to vectorize instructions with the same types.
    if (Candidates.size() > 1 &&
        (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) {
      if (TryToVectorizeHelper(Candidates, /*MaxVFOnly=*/false)) {
        // Success: start over because instructions might have been changed.
        Changed = true;
      } else if (MaxVFOnly) {
        // Try to vectorize using small vectors.
        SmallVector<T *> VL;
        for (auto *It = Candidates.begin(), *End = Candidates.end(); It != End;
             VL.clear()) {
          auto *I = dyn_cast<Instruction>(*It);
          if (!I || R.isDeleted(I)) {
            ++It;
            continue;
          }
          auto *SameTypeIt = It;
          while (SameTypeIt != End &&
                 (!isa<Instruction>(*SameTypeIt) ||
                  R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
                  AreCompatible(*SameTypeIt, *It))) {
            auto *I = dyn_cast<Instruction>(*SameTypeIt);
            ++SameTypeIt;
            if (I && !R.isDeleted(I))
              VL.push_back(cast<T>(I));
          }
          unsigned NumElts = VL.size();
          if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL),
                                                  /*MaxVFOnly=*/false))
            Changed = true;
          It = SameTypeIt;
        }
      }
      Candidates.clear();
    }

    // Start over at the next instruction of a different type (or the end).
    IncIt = SameTypeIt;
  }
  return Changed;
}
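/// Compare two cmp instructions. If IsCompatibility is true, function returns
/// true if 2 cmps are compatible (have equal predicate, successors types and
/// parent), otherwise returns true if the first cmp is "less" than the second
/// one.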
template <bool IsCompatibility>
static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI,
                       const DominatorTree &DT) {
  assert(isValidElementType(V->getType()) &&
         isValidElementType(V2->getType()) &&
         "Expected valid element types only.");
  if (V == V2)
    return IsCompatibility;
  auto *CI1 = cast<CmpInst>(V);
  auto *CI2 = cast<CmpInst>(V2);
  if (CI1->getOperand(0)->getType()->getTypeID() <
      CI2->getOperand(0)->getType()->getTypeID())
    return !IsCompatibility;
  if (CI1->getOperand(0)->getType()->getTypeID() >
      CI2->getOperand(0)->getType()->getTypeID())
    return false;
  if (CI1->getOperand(0)->getType()->getScalarSizeInBits() <
      CI2->getOperand(0)->getType()->getScalarSizeInBits())
    return !IsCompatibility;
  if (CI1->getOperand(0)->getType()->getScalarSizeInBits() >
      CI2->getOperand(0)->getType()->getScalarSizeInBits())
    return false;
  CmpInst::Predicate Pred1 = CI1->getPredicate();
  CmpInst::Predicate Pred2 = CI2->getPredicate();
  CmpInst::Predicate SwapPred1 = CmpInst::getSwappedPredicate(Pred1);
  CmpInst::Predicate SwapPred2 = CmpInst::getSwappedPredicate(Pred2);
  CmpInst::Predicate BasePred1 = std::min(Pred1, SwapPred1);
  CmpInst::Predicate BasePred2 = std::min(Pred2, SwapPred2);
  if (BasePred1 < BasePred2)
    return !IsCompatibility;
  if (BasePred1 > BasePred2)
    return false;
  // Compare operands.
  bool CI1Preds = Pred1 == BasePred1;
  bool CI2Preds = Pred2 == BasePred1;
  for (int I = 0, E = CI1->getNumOperands(); I < E; ++I) {
    auto *Op1 = CI1->getOperand(CI1Preds ? I : E - I - 1);
    auto *Op2 = CI2->getOperand(CI2Preds ? I : E - I - 1);
    if (Op1->getValueID() < Op2->getValueID())
      return !IsCompatibility;
    if (Op1->getValueID() > Op2->getValueID())
      return false;
    if (auto *I1 = dyn_cast<Instruction>(Op1))
      if (auto *I2 = dyn_cast<Instruction>(Op2)) {
        if (IsCompatibility) {
          if (I1->getParent() != I2->getParent())
            return false;
        } else {
          // Try to compare nodes with same parent.
          DomTreeNodeBase<BasicBlock> *NodeI1 = DT.getNode(I1->getParent());
          DomTreeNodeBase<BasicBlock> *NodeI2 = DT.getNode(I2->getParent());
          if (!NodeI1)
            return NodeI2 != nullptr;
          if (!NodeI2)
            return false;
          assert((NodeI1 == NodeI2) ==
                     (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
                 "Different nodes should have different DFS numbers");
          if (NodeI1 != NodeI2)
            return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
        }
        InstructionsState S = getSameOpcode({I1, I2}, TLI);
        if (S && (IsCompatibility || !S.isAltShuffle()))
          continue;
        if (IsCompatibility)
          return false;
        if (I1->getOpcode() != I2->getOpcode())
          return I1->getOpcode() < I2->getOpcode();
      }
  }
  return IsCompatibility;
}
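/// Tries to vectorize the compare instructions in \p CmpInsts: first as roots
/// of horizontal reductions, then as lists of compatible compares.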
template <typename ItT>
bool SLPVectorizerPass::vectorizeCmpInsts(iterator_range<ItT> CmpInsts,
                                          BasicBlock *BB, BoUpSLP &R) {
  bool Changed = false;
  // Try to find reductions first.
  for (CmpInst *I : CmpInsts) {
    if (R.isDeleted(I))
      continue;
    for (Value *Op : I->operands())
      if (auto *RootOp = dyn_cast<Instruction>(Op)) {
        Changed |= vectorizeRootInstruction(nullptr, RootOp, BB, R);
        if (R.isDeleted(I))
          break;
      }
  }
  // Try to vectorize operands as vector bundles.
  for (CmpInst *I : CmpInsts) {
    if (R.isDeleted(I))
      continue;
    Changed |= tryToVectorize(I, R);
  }
  // Try to vectorize the list of compares, sorted by type and predicate.
  auto CompareSorter = [&](Value *V, Value *V2) {
    if (V == V2)
      return false;
    return compareCmp<false>(V, V2, *TLI, *DT);
  };
  auto AreCompatibleCompares = [&](ArrayRef<Value *> VL, Value *V2) {
    if (VL.empty() || VL.back() == V2)
      return true;
    return compareCmp<true>(VL.back(), V2, *TLI, *DT);
  };

  SmallVector<Value *> Vals;
  for (Instruction *V : CmpInsts)
    if (!R.isDeleted(V) && isValidElementType(getValueType(V)))
      Vals.push_back(V);
  if (Vals.size() <= 1)
    return Changed;
  Changed |= tryToVectorizeSequence<Value>(
      Vals, CompareSorter, AreCompatibleCompares,
      [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
        // Exclude possible reductions from other blocks.
        bool ArePossiblyReducedInOtherBlock = any_of(Candidates, [](Value *V) {
          return any_of(V->users(), [V](User *U) {
            auto *Select = dyn_cast<SelectInst>(U);
            return Select &&
                   Select->getParent() != cast<Instruction>(V)->getParent();
          });
        });
        if (ArePossiblyReducedInOtherBlock)
          return false;
        return tryToVectorizeList(Candidates, R, MaxVFOnly);
      },
      /*MaxVFOnly=*/true, R);
  return Changed;
}
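/// Tries to vectorize the postponed insertelement/insertvalue instructions:
/// first buildvector sequences for the maximal VF only, then reductions
/// seeded by them, then buildvector sequences for smaller VFs.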
bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions,
                                         BasicBlock *BB, BoUpSLP &R) {
  assert(all_of(Instructions, IsaPred<InsertElementInst, InsertValueInst>) &&
         "This function only accepts Insert instructions");
  bool OpsChanged = false;
  SmallVector<WeakTrackingVH> PostponedInsts;
  for (auto *I : reverse(Instructions)) {
    // pass1 - try to match and vectorize a buildvector sequence for MaxVF only.
    if (R.isDeleted(I))
      continue;
    if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
      OpsChanged |=
          vectorizeInsertValueInst(LastInsertValue, BB, R, /*MaxVFOnly=*/true);
    } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
      OpsChanged |=
          vectorizeInsertElementInst(LastInsertElem, BB, R, /*MaxVFOnly=*/true);
    }
    // pass2 - try to vectorize reductions only.
    if (R.isDeleted(I))
      continue;
    OpsChanged |= vectorizeHorReduction(nullptr, I, BB, R, PostponedInsts);
    if (R.isDeleted(I) || isa<CmpInst>(I))
      continue;
    // pass3 - try to match and vectorize a buildvector sequence.
    if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
      OpsChanged |=
          vectorizeInsertValueInst(LastInsertValue, BB, R, /*MaxVFOnly=*/false);
    } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
      OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R,
                                               /*MaxVFOnly=*/false);
    }
  }
  // Now try to vectorize postponed instructions.
  OpsChanged |= tryToVectorize(PostponedInsts, R);

  Instructions.clear();
  return OpsChanged;
}
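/// Tries to vectorize the chains of instructions in basic block \p BB,
/// starting from PHI nodes, then the reductions and buildvector sequences
/// found while scanning the block.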
bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
  bool Changed = false;
  SmallVector<Value *, 4> Incoming;
  SmallPtrSet<Value *, 16> VisitedInstrs;
  // Maps phi nodes to the non-phi nodes found in the use tree for each phi
  // node, used as a sorting criterion.
  DenseMap<Value *, SmallVector<Value *, 4>> PHIToOpcodes;
  auto PHICompare = [this, &PHIToOpcodes](Value *V1, Value *V2) {
    assert(isValidElementType(V1->getType()) &&
           isValidElementType(V2->getType()) &&
           "Expected vectorizable types only.");
    if (V1 == V2)
      return false;
    if (V1->getType()->getTypeID() < V2->getType()->getTypeID())
      return true;
    if (V1->getType()->getTypeID() > V2->getType()->getTypeID())
      return false;
    if (V1->getType()->getScalarSizeInBits() <
        V2->getType()->getScalarSizeInBits())
      return true;
    if (V1->getType()->getScalarSizeInBits() >
        V2->getType()->getScalarSizeInBits())
      return false;
    ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
    ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
    if (Opcodes1.size() < Opcodes2.size())
      return true;
    if (Opcodes1.size() > Opcodes2.size())
      return false;
    for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
      {
        // Instructions come first.
        auto *I1 = dyn_cast<Instruction>(Opcodes1[I]);
        auto *I2 = dyn_cast<Instruction>(Opcodes2[I]);
        if (I1 && I2) {
          DomTreeNodeBase<BasicBlock> *NodeI1 = DT->getNode(I1->getParent());
          DomTreeNodeBase<BasicBlock> *NodeI2 = DT->getNode(I2->getParent());
          if (!NodeI1)
            return NodeI2 != nullptr;
          if (!NodeI2)
            return false;
          assert((NodeI1 == NodeI2) ==
                     (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
                 "Different nodes should have different DFS numbers");
          if (NodeI1 != NodeI2)
            return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
          InstructionsState S = getSameOpcode({I1, I2}, *TLI);
          if (S && !S.isAltShuffle() && I1->getOpcode() == I2->getOpcode()) {
            // For same-opcode extractelements, order by the position of the
            // source vectors, then by the extracted indices.
            if (auto *EE1 = dyn_cast<ExtractElementInst>(I1))
              if (auto *EE2 = dyn_cast<ExtractElementInst>(I2)) {
                if (auto *V1 = dyn_cast<Instruction>(EE1->getVectorOperand()))
                  if (auto *V2 =
                          dyn_cast<Instruction>(EE2->getVectorOperand())) {
                    DomTreeNodeBase<BasicBlock> *NodeI1 =
                        DT->getNode(V1->getParent());
                    DomTreeNodeBase<BasicBlock> *NodeI2 =
                        DT->getNode(V2->getParent());
                    if (!NodeI1)
                      return NodeI2 != nullptr;
                    if (!NodeI2)
                      return false;
                    assert((NodeI1 == NodeI2) ==
                               (NodeI1->getDFSNumIn() ==
                                NodeI2->getDFSNumIn()) &&
                           "Different nodes should have different DFS numbers");
                    if (NodeI1 != NodeI2)
                      return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
                    if (V1 != V2)
                      return V1->comesBefore(V2);
                  }
                std::optional<unsigned> Id1 = getExtractIndex(EE1);
                std::optional<unsigned> Id2 = getExtractIndex(EE2);
                if (Id1 && Id2 && *Id1 != *Id2)
                  return *Id1 < *Id2;
              }
            continue;
          }
          if (I1->getOpcode() == I2->getOpcode())
            continue;
          return I1->getOpcode() < I2->getOpcode();
        }
        if (I1)
          return true;
        if (I2)
          return false;
      }
      {
        // Non-undef constants come next.
        bool C1 = isa<Constant>(Opcodes1[I]) && !isa<UndefValue>(Opcodes1[I]);
        bool C2 = isa<Constant>(Opcodes2[I]) && !isa<UndefValue>(Opcodes2[I]);
        if (C1 && C2)
          continue;
        if (C1)
          return true;
        if (C2)
          return false;
      }
      bool U1 = isa<UndefValue>(Opcodes1[I]);
      bool U2 = isa<UndefValue>(Opcodes2[I]);
      // Non-undef values come next, ordered by value ID; undefs come last.
      if (!U1 && !U2) {
        auto ValID1 = Opcodes1[I]->getValueID();
        auto ValID2 = Opcodes2[I]->getValueID();
        if (ValID1 == ValID2)
          continue;
        if (ValID1 < ValID2)
          return true;
        if (ValID1 > ValID2)
          return false;
      }
      if (!U1)
        return true;
      if (!U2)
        return false;
      assert(U1 && U2 && "The only thing left should be undef & undef.");
    }
    return false;
  };
  auto AreCompatiblePHIs = [&PHIToOpcodes, this, &R](ArrayRef<Value *> VL,
                                                     Value *V1) {
    if (VL.empty() || V1 == VL.back())
      return true;
    Value *V2 = VL.back();
    if (V1->getType() != V2->getType())
      return false;
    ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
    ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
    if (Opcodes1.size() != Opcodes2.size())
      return false;
    for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
      // Undefs are compatible with any other value.
      if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I]))
        continue;
      if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))
        if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
          if (R.isDeleted(I1) || R.isDeleted(I2))
            return false;
          if (I1->getParent() != I2->getParent())
            return false;
          if (getSameOpcode({I1, I2}, *TLI))
            continue;
          return false;
        }
      if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I]))
        continue;
      if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID())
        return false;
    }
    return true;
  };

  bool HaveVectorizedPhiNodes = false;
  do {
    // Collect the incoming values from the PHIs.
    Incoming.clear();
    for (Instruction &I : *BB) {
      auto *P = dyn_cast<PHINode>(&I);
      if (!P || P->getNumIncomingValues() > MaxPHINumOperands)
        break;

      // No need to analyze deleted, vectorized and non-vectorizable
      // instructions.
      if (!VisitedInstrs.count(P) && !R.isDeleted(P) &&
          isValidElementType(P->getType()))
        Incoming.push_back(P);
    }

    if (Incoming.size() <= 1)
      break;

    // Find the corresponding non-phi nodes for better matching when trying to
    // build the tree.
    for (Value *V : Incoming) {
      SmallVectorImpl<Value *> &Opcodes =
          PHIToOpcodes.try_emplace(V).first->getSecond();
      if (!Opcodes.empty())
        continue;
      SmallVector<Value *, 4> Nodes(1, V);
      SmallPtrSet<Value *, 4> Visited;
      while (!Nodes.empty()) {
        auto *PHI = cast<PHINode>(Nodes.pop_back_val());
        if (!Visited.insert(PHI).second)
          continue;
        for (Value *V : PHI->incoming_values()) {
          if (auto *PHI1 = dyn_cast<PHINode>(V)) {
            Nodes.push_back(PHI1);
            continue;
          }
          Opcodes.emplace_back(V);
        }
      }
    }

    HaveVectorizedPhiNodes = tryToVectorizeSequence<Value>(
        Incoming, PHICompare, AreCompatiblePHIs,
        [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
          return tryToVectorizeList(Candidates, R, MaxVFOnly);
        },
        /*MaxVFOnly=*/true, R);
    Changed |= HaveVectorizedPhiNodes;
    if (HaveVectorizedPhiNodes && any_of(PHIToOpcodes, [&](const auto &P) {
          auto *PHI = dyn_cast<PHINode>(P.first);
          return !PHI || R.isDeleted(PHI);
        }))
      PHIToOpcodes.clear();
    VisitedInstrs.insert_range(Incoming);
  } while (HaveVectorizedPhiNodes);
  VisitedInstrs.clear();

  InstSetVector PostProcessInserts;
  SmallSetVector<CmpInst *, 8> PostProcessCmps;
  // Vectorizes Inserts in PostProcessInserts and, if VectorizeCmps is true,
  // also vectorizes PostProcessCmps.
  auto VectorizeInsertsAndCmps = [&](bool VectorizeCmps) {
    bool Changed = vectorizeInserts(PostProcessInserts, BB, R);
    if (VectorizeCmps) {
      Changed |= vectorizeCmpInsts(reverse(PostProcessCmps), BB, R);
      PostProcessCmps.clear();
    }
    PostProcessInserts.clear();
    return Changed;
  };
  // Returns true if I is in PostProcessInserts or PostProcessCmps.
  auto IsInPostProcessInstrs = [&](Instruction *I) {
    if (auto *Cmp = dyn_cast<CmpInst>(I))
      return PostProcessCmps.contains(Cmp);
    return isa<InsertElementInst, InsertValueInst>(I) &&
           PostProcessInserts.contains(I);
  };
  // Returns true if the instruction is not expected to have users.
  auto HasNoUsers = [](Instruction *I) {
    return I->use_empty() &&
           (I->getType()->isVoidTy() || isa<CallInst, InvokeInst>(I));
  };
  for (BasicBlock::iterator It = BB->begin(), E = BB->end(); It != E; ++It) {
    // Skip instructions marked for the deletion.
    if (R.isDeleted(&*It))
      continue;
    // We may go through BB multiple times so skip the one we have checked.
    if (!VisitedInstrs.insert(&*It).second) {
      if (HasNoUsers(&*It) &&
          VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator())) {
        // Start over since some instructions are deleted and the iterator may
        // become invalid.
        Changed = true;
        It = BB->begin();
        E = BB->end();
      }
      continue;
    }

    // Try to vectorize reductions that use PHINodes.
    if (PHINode *P = dyn_cast<PHINode>(It)) {
      // Check that the PHI is a reduction PHI.
      if (P->getNumIncomingValues() == 2) {
        // Try to match and vectorize a horizontal reduction.
        Instruction *Root = getReductionInstr(DT, P, BB, LI);
        if (Root && vectorizeRootInstruction(P, Root, BB, R)) {
          Changed = true;
          It = BB->begin();
          E = BB->end();
          continue;
        }
      }
      // Try to vectorize the incoming values of the PHI, to catch reductions
      // that feed into PHIs.
      for (unsigned I : seq<unsigned>(P->getNumIncomingValues())) {
        // Skip if the incoming block is the current BB for now. Also, bypass
        // unreachable IR for efficiency and to avoid crashing.
        if (BB == P->getIncomingBlock(I) ||
            !DT->isReachableFromEntry(P->getIncomingBlock(I)))
          continue;

        // Postponed instructions should not be vectorized here, delay their
        // vectorization.
        if (auto *PI = dyn_cast<Instruction>(P->getIncomingValue(I));
            PI && !IsInPostProcessInstrs(PI)) {
          bool Res =
              vectorizeRootInstruction(nullptr, PI, P->getIncomingBlock(I), R);
          Changed |= Res;
          if (Res && R.isDeleted(P)) {
            It = BB->begin();
            E = BB->end();
            break;
          }
        }
      }
      continue;
    }

    if (HasNoUsers(&*It)) {
      bool OpsChanged = false;
      auto *SI = dyn_cast<StoreInst>(It);
      bool TryToVectorizeRoot = ShouldStartVectorizeHorAtStore || !SI;
      if (SI) {
        auto *I = Stores.find(getUnderlyingObject(SI->getPointerOperand()));
        // Try to vectorize chain in store, if this is the only store to the
        // address in the block.
        TryToVectorizeRoot |= (I == Stores.end() || I->second.size() == 1) &&
                              SI->getValueOperand()->hasOneUse();
      }
      if (TryToVectorizeRoot) {
        for (auto *V : It->operand_values()) {
          // Postponed instructions should not be vectorized here, delay their
          // vectorization.
          if (auto *VI = dyn_cast<Instruction>(V);
              VI && !IsInPostProcessInstrs(VI))
            // Try to match and vectorize a horizontal reduction.
            OpsChanged |= vectorizeRootInstruction(nullptr, VI, BB, R);
        }
      }
      // Start vectorization of the post-process list of instructions to try
      // to vectorize as many instructions as possible.
      OpsChanged |=
          VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator());
      if (OpsChanged) {
        Changed = true;
        It = BB->begin();
        E = BB->end();
        continue;
      }
    }

    if (isa<InsertElementInst, InsertValueInst>(It))
      PostProcessInserts.insert(&*It);
    else if (isa<CmpInst>(It))
      PostProcessCmps.insert(cast<CmpInst>(&*It));
  }

  return Changed;
}
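/// Tries to vectorize the index operands of the getelementptr instructions
/// collected per basic block: bundles compatible, non-constant GEP indices
/// and hands them to the tree vectorizer.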
bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
  bool Changed = false;
  for (auto &Entry : GEPs) {
    // If the getelementptr list has fewer than two elements, there's nothing
    // to do.
    if (Entry.second.size() < 2)
      continue;

    LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
                      << Entry.second.size() << ".\n");

    // Process the GEP list in chunks suitable for the target's supported
    // vector size. The maximum number of elements is based on the size of the
    // index expression, rather than the size of the GEP itself.
    auto *It = find_if(Entry.second, [&](GetElementPtrInst *GEP) {
      return !R.isDeleted(GEP);
    });
    if (It == Entry.second.end())
      continue;
    unsigned MaxVecRegSize = R.getMaxVecRegSize();
    unsigned EltSize = R.getVectorElementSize(*(*It)->idx_begin());
    if (MaxVecRegSize < EltSize)
      continue;

    unsigned MaxElts = MaxVecRegSize / EltSize;
    for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) {
      auto Len = std::min<unsigned>(BE - BI, MaxElts);
      ArrayRef<GetElementPtrInst *> GEPList(&Entry.second[BI], Len);

      // Initialize a set of candidate getelementptrs. We use a SetVector here
      // to preserve program order.
      SetVector<Value *> Candidates(GEPList.begin(), GEPList.end());

      // Some of the candidates may have already been vectorized after we
      // initially collected them or their index was optimized to a constant.
      // If so, remove them from the set of candidates.
      Candidates.remove_if([&R](Value *I) {
        return R.isDeleted(cast<Instruction>(I)) ||
               isa<Constant>(cast<GetElementPtrInst>(I)->idx_begin()->get());
      });

      // Remove from the set of candidates all pairs of getelementptrs with
      // constant differences, since one can be computed from the other. Also
      // ensure all candidate getelementptr indices are unique.
      for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) {
        auto *GEPI = GEPList[I];
        if (!Candidates.count(GEPI))
          continue;
        const SCEV *SCEVI = SE->getSCEV(GEPList[I]);
        for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
          auto *GEPJ = GEPList[J];
          const SCEV *SCEVJ = SE->getSCEV(GEPList[J]);
          if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
            Candidates.remove(GEPI);
            Candidates.remove(GEPJ);
          } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
            Candidates.remove(GEPJ);
          }
        }
      }

      // We break out of the above computation as soon as we know there are
      // fewer than two candidates remaining.
      if (Candidates.size() < 2)
        continue;

      // Add the single, non-constant index of each candidate to the bundle.
      SmallVector<Value *, 16> Bundle(Candidates.size());
      auto BundleIndex = 0u;
      for (auto *V : Candidates) {
        auto *GEP = cast<GetElementPtrInst>(V);
        auto *GEPIdx = GEP->idx_begin()->get();
        assert(GEP->getNumIndices() == 1 && !isa<Constant>(GEPIdx));
        Bundle[BundleIndex++] = GEPIdx;
      }

      // Try and vectorize the indices.
      Changed |= tryToVectorizeList(Bundle, R);
    }
  }
  return Changed;
}
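/// Tries to vectorize the collected store chains: sorts the stores by type
/// and compatibility of their value operands, then attempts to vectorize each
/// group of compatible stores.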
bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
  bool Changed = false;
  // Sort by type, base pointers and values operand. Value operands must be
  // compatible (have the same opcode, same parent), otherwise it is
  // definitely not profitable to try to vectorize them.
  auto &&StoreSorter = [this](StoreInst *V, StoreInst *V2) {
    if (V->getValueOperand()->getType()->getTypeID() <
        V2->getValueOperand()->getType()->getTypeID())
      return true;
    if (V->getValueOperand()->getType()->getTypeID() >
        V2->getValueOperand()->getType()->getTypeID())
      return false;
    if (V->getPointerOperandType()->getTypeID() <
        V2->getPointerOperandType()->getTypeID())
      return true;
    if (V->getPointerOperandType()->getTypeID() >
        V2->getPointerOperandType()->getTypeID())
      return false;
    if (V->getValueOperand()->getType()->getScalarSizeInBits() <
        V2->getValueOperand()->getType()->getScalarSizeInBits())
      return true;
    if (V->getValueOperand()->getType()->getScalarSizeInBits() >
        V2->getValueOperand()->getType()->getScalarSizeInBits())
      return false;
    // UndefValues are compatible with all other values.
    if (isa<UndefValue>(V->getValueOperand()) ||
        isa<UndefValue>(V2->getValueOperand()))
      return false;
    if (auto *I1 = dyn_cast<Instruction>(V->getValueOperand()))
      if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
        DomTreeNodeBase<llvm::BasicBlock> *NodeI1 =
            DT->getNode(I1->getParent());
        DomTreeNodeBase<llvm::BasicBlock> *NodeI2 =
            DT->getNode(I2->getParent());
        assert(NodeI1 && "Should only process reachable instructions");
        assert(NodeI2 && "Should only process reachable instructions");
        assert((NodeI1 == NodeI2) ==
                   (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
               "Different nodes should have different DFS numbers");
        if (NodeI1 != NodeI2)
          return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
        return I1->getOpcode() < I2->getOpcode();
      }
    return V->getValueOperand()->getValueID() <
           V2->getValueOperand()->getValueID();
  };

  auto AreCompatibleStores = [this, &R](ArrayRef<StoreInst *> VL,
                                        StoreInst *V1) {
    if (VL.empty() || V1 == VL.back())
      return true;
    bool SameParent = true;
    StoreInst *V2 = VL.back();
    if (V1->getValueOperand()->getType() != V2->getValueOperand()->getType())
      return false;
    if (V1->getPointerOperandType() != V2->getPointerOperandType())
      return false;
    // Undefs are compatible with any other value.
    if (isa<UndefValue>(V1->getValueOperand()) ||
        isa<UndefValue>(V2->getValueOperand()))
      return true;
    auto *I1 = dyn_cast<Instruction>(V1->getValueOperand());
    auto *I2 = dyn_cast<Instruction>(V2->getValueOperand());
    SameParent &= I1 && I2 && I1->getParent() == I2->getParent();
    if (SameParent) {
      // Check if the value operands form a single vectorizable bundle.
      SmallVector<Value *> NewVL(VL.size() + 1);
      for (auto [SI, V] : zip(VL, NewVL))
        V = SI->getValueOperand();
      NewVL.back() = V1->getValueOperand();
      InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
      InstructionsState S = Analysis.buildInstructionsState(NewVL, R);
      if (S)
        return true;
    }
    return V1->getValueOperand()->getValueID() ==
           V2->getValueOperand()->getValueID();
  };

  // Attempt to sort and vectorize each of the store-groups.
  DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>> Attempted;
  for (auto &Pair : Stores) {
    if (Pair.second.size() < 2)
      continue;

    LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
                      << Pair.second.size() << ".\n");

    if (!isValidElementType(Pair.second.front()->getValueOperand()->getType()))
      continue;

    // Reverse stores to do bottom-to-top analysis. This is important if the
    // values are stored to the same addresses several times, in which case we
    // need to follow the stores order (reversed to meet the memory
    // dependencies).
    SmallVector<StoreInst *> ReversedStores(Pair.second.rbegin(),
                                            Pair.second.rend());
    Changed |= tryToVectorizeSequence<StoreInst>(
        ReversedStores, StoreSorter, AreCompatibleStores,
        [&](ArrayRef<StoreInst *> Candidates, bool) {
          return vectorizeStores(Candidates, R, Attempted);
        },
        /*MaxVFOnly=*/false, R);
  }
  return Changed;
}
for(const MachineOperand &MO :llvm::drop_begin(OldMI.operands(), Desc.getNumOperands()))
static unsigned getIntrinsicID(const SDNode *N)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static bool isConstant(const MachineInstr &MI)
AMDGPU Register Bank Select
ReachingDefInfo InstSet InstSet & Ignore
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Function Alias Analysis Results
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
block Block Frequency Analysis
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
static cl::opt< IntrinsicCostStrategy > IntrinsicCost("intrinsic-cost-strategy", cl::desc("Costing strategy for intrinsic instructions"), cl::init(IntrinsicCostStrategy::InstructionCost), cl::values(clEnumValN(IntrinsicCostStrategy::InstructionCost, "instruction-cost", "Use TargetTransformInfo::getInstructionCost"), clEnumValN(IntrinsicCostStrategy::IntrinsicCost, "intrinsic-cost", "Use TargetTransformInfo::getIntrinsicInstrCost"), clEnumValN(IntrinsicCostStrategy::TypeBasedIntrinsicCost, "type-based-intrinsic-cost", "Calculate the intrinsic cost based only on argument types")))
static APInt getElementIndex(TypeSize ElemSize, APInt &Offset)
This file provides an implementation of debug counters.
#define DEBUG_COUNTER(VARNAME, COUNTERNAME, DESC)
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
static bool runImpl(Function &F, const TargetLowering &TLI, const LibcallLoweringInfo &Libcalls, AssumptionCache *AC)
This is the interface for a simple mod/ref and alias analysis over globals.
static Value * getCondition(Instruction *I)
static void setCondition(Instruction *I, Value *NewCond)
static const HTTPClientCleanup Cleanup
static Type * getIndexType(Value *In)
Module.h This file contains the declarations for the Module class.
This defines the Use class.
iv Induction Variable Users
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static bool isSplat(Value *V)
Return true if V is a splat of a value (which is used when multiplying a matrix with a scalar).
This file provides utility analysis objects describing memory locations.
MachineInstr unsigned OpIdx
uint64_t IntrinsicInst * II
static bool IsSelect(MachineInstr &MI)
This file defines the PriorityQueue class.
const SmallVectorImpl< MachineOperand > & Cond
static std::optional< OperandInfo > getOperandInfo(const MachineOperand &MO)
static bool isValid(const char C)
Returns true if C is a valid mangled character: <0-9a-zA-Z_>.
static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts, TargetTransformInfo *TTI, bool MustMatchOrInst)
static cl::opt< bool > RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden, cl::desc("Run the SLP vectorization passes"))
static bool isAlternateInstruction(Instruction *I, Instruction *MainOp, Instruction *AltOp, const TargetLibraryInfo &TLI)
Checks if the specified instruction I is an alternate operation for the given MainOp and AltOp instru...
static FixedVectorType * getWidenedType(Type *ScalarTy, unsigned VF)
static bool doesNotNeedToSchedule(ArrayRef< Value * > VL)
Checks if the specified array of instructions does not require scheduling.
static bool isVectorLikeInstWithConstOps(Value *V)
Checks if V is one of vector-like instructions, i.e.
static cl::opt< bool > SplitAlternateInstructions("slp-split-alternate-instructions", cl::init(true), cl::Hidden, cl::desc("Improve the code quality by splitting alternate instructions"))
static bool isRepeatedNonIdentityClusteredMask(ArrayRef< int > Mask, unsigned Sz)
Checks if the given mask is a "clustered" mask with the same clusters of size Sz, which are not ident...
static bool isMaskedLoadCompress(ArrayRef< Value * > VL, ArrayRef< Value * > PointerOps, ArrayRef< unsigned > Order, const TargetTransformInfo &TTI, const DataLayout &DL, ScalarEvolution &SE, AssumptionCache &AC, const DominatorTree &DT, const TargetLibraryInfo &TLI, const function_ref< bool(Value *)> AreAllUsersVectorized, bool &IsMasked, unsigned &InterleaveFactor, SmallVectorImpl< int > &CompressMask, VectorType *&LoadVecTy)
Checks if the VL can be transformed to a (masked)load + compress or (masked) interleaved load.
static const unsigned MaxPHINumOperands
Maximum allowed number of operands in the PHI nodes.
static bool isUsedOutsideBlock(Value *V)
Checks if the provided value does not require scheduling.
static cl::opt< bool > VectorizeCopyableElements("slp-copyable-elements", cl::init(true), cl::Hidden, cl::desc("Try to replace values with the idempotent instructions for " "better vectorization."))
Enables vectorization of copyable elements.
static cl::opt< int > MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static bool allSameOpcode(ArrayRef< Value * > VL)
static InstructionCost canConvertToFMA(ArrayRef< Value * > VL, const InstructionsState &S, DominatorTree &DT, const DataLayout &DL, TargetTransformInfo &TTI, const TargetLibraryInfo &TLI)
Check if we can convert fadd/fsub sequence to FMAD.
static cl::opt< unsigned > MaxProfitableLoadStride("slp-max-stride", cl::init(8), cl::Hidden, cl::desc("The maximum stride, considered to be profitable."))
static bool isCommutative(Instruction *I, Value *ValWithUses, bool IsCopyable=false)
static bool findBuildAggregate(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, const BoUpSLP &R)
Recognize construction of vectors like ra = insertelement <4 x float> poison, float s0,...
static bool clusterSortPtrAccesses(ArrayRef< Value * > VL, ArrayRef< BasicBlock * > BBs, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
static unsigned getNumElements(Type *Ty)
static SmallBitVector buildUseMask(int VF, ArrayRef< int > Mask, UseMask MaskArg)
Prepares a use bitset for the given mask either for the first argument or for the second.
static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0, Value *Op1, const TargetLibraryInfo &TLI)
Checks if the provided operands of 2 cmp instructions are compatible, i.e.
static void reorderScalars(SmallVectorImpl< Value * > &Scalars, ArrayRef< int > Mask)
Reorders the list of scalars in accordance with the given Mask.
static Value * createInsertVector(IRBuilderBase &Builder, Value *Vec, Value *V, unsigned Index, function_ref< Value *(Value *, Value *, ArrayRef< int >)> Generator={})
Creates subvector insert.
static void findBuildAggregateRec(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, unsigned OperandOffset, const BoUpSLP &R)
static unsigned getNumElems(unsigned Size, unsigned PartNumElems, unsigned Part)
Returns correct remaining number of elements, considering total amount Size, (power-of-2 number) of e...
static InstructionCost getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={})
Returns the cost of the shuffle instructions with the given Kind, vector type Tp and optional Mask.
static bool isSimple(Instruction *I)
static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns true if widened type of Ty elements with size Sz represents full vector type,...
static const int MinScheduleRegionSize
If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling regions to be handled.
static cl::opt< unsigned > MinProfitableStridedLoads("slp-min-strided-loads", cl::init(2), cl::Hidden, cl::desc("The minimum number of loads, which should be considered strided, " "if the stride is > 1 or is runtime value"))
static bool isFirstInsertElement(const InsertElementInst *IE1, const InsertElementInst *IE2)
Checks if the IE1 instructions is followed by IE2 instruction in the buildvector sequence.
static cl::opt< int > LookAheadMaxDepth("slp-max-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for operand reordering scores"))
static cl::opt< unsigned > MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden, cl::desc("Maximum SLP vectorization factor (0=unlimited)"))
static void reorderReuses(SmallVectorImpl< int > &Reuses, ArrayRef< int > Mask)
Reorders the given Reuses mask according to the given Mask.
static void combineOrders(MutableArrayRef< unsigned > Order, ArrayRef< unsigned > SecondaryOrder)
static const unsigned MaxMemDepDistance
static cl::opt< bool > ViewSLPTree("view-slp-tree", cl::Hidden, cl::desc("Display the SLP trees with Graphviz"))
static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, TargetLibraryInfo *TLI, const TargetTransformInfo *TTI)
static DebugLoc getDebugLocFromPHI(PHINode &PN)
static std::optional< unsigned > getExtractIndex(const Instruction *E)
static cl::opt< bool > VectorizeNonPowerOf2("slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden, cl::desc("Try to vectorize with non-power-of-2 number of elements."))
static cl::opt< unsigned > MinTreeSize("slp-min-tree-size", cl::init(3), cl::Hidden, cl::desc("Only vectorize small trees if they are fully vectorizable"))
static void reorderOrder(SmallVectorImpl< unsigned > &Order, ArrayRef< int > Mask, bool BottomOrder=false)
Reorders the given Order according to the given Mask.
static cl::opt< bool > ForceStridedLoads("slp-force-strided-loads", cl::init(false), cl::Hidden, cl::desc("Generate strided loads even if they are not " "profitable. Used for testing only."))
static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not less than Sz, which forms type,...
static bool isMainInstruction(Instruction *I, Instruction *MainOp, Instruction *AltOp, const TargetLibraryInfo &TLI)
Checks if the specified instruction I is an main operation for the given MainOp and AltOp instruction...
static T * performExtractsShuffleAction(MutableArrayRef< std::pair< T *, SmallVector< int > > > ShuffleMask, Value *Base, function_ref< unsigned(T *)> GetVF, function_ref< std::pair< T *, bool >(T *, ArrayRef< int >, bool)> ResizeAction, function_ref< T *(ArrayRef< int >, ArrayRef< T * >)> Action)
Does the analysis of the provided shuffle masks and performs the requested actions on the vectors wit...
static cl::opt< bool > ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions"))
static bool isConstant(Value *V)
static bool isSplat(ArrayRef< Value * > VL)
static cl::opt< int > SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, cl::desc("Only vectorize if you gain more than this " "number "))
static unsigned getPartNumElems(unsigned Size, unsigned NumParts)
Returns power-of-2 number of elements in a single register (part), given the total number of elements...
static bool allConstant(ArrayRef< Value * > VL)
static constexpr int UsesLimit
static std::optional< unsigned > getElementIndex(const Value *Inst, unsigned Offset=0)
static bool isReductionCandidate(Instruction *I)
\Returns true if I is a candidate instruction for reduction vectorization.
static bool checkTreeSizes(ArrayRef< std::pair< unsigned, unsigned > > Sizes, bool First)
Checks if the quadratic mean deviation is less than 90% of the mean size.
static unsigned getShufflevectorNumGroups(ArrayRef< Value * > VL)
static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI, const TargetLibraryInfo &TLI)
static cl::opt< bool > SLPSkipEarlyProfitabilityCheck("slp-skip-early-profitability-check", cl::init(false), cl::Hidden, cl::desc("When true, SLP vectorizer bypasses profitability checks based on " "heuristics and makes vectorization decision via cost modeling."))
static std::pair< size_t, size_t > generateKeySubkey(Value *V, const TargetLibraryInfo *TLI, function_ref< hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator, bool AllowAlternate)
Generates key/subkey pair for the given value to provide effective sorting of the values and better d...
static cl::opt< bool > ShouldStartVectorizeHorAtStore("slp-vectorize-hor-store", cl::init(false), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions feeding into a store"))
static unsigned getNumberOfPotentiallyCommutativeOps(Instruction *I)
static InstructionCost getScalarizationOverhead(const TargetTransformInfo &TTI, Type *ScalarTy, VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={})
This is similar to TargetTransformInfo::getScalarizationOverhead, but if ScalarTy is a FixedVectorTyp...
static bool isCommutableOperand(Instruction *I, Value *ValWithUses, unsigned Op, bool IsCopyable=false)
Checks if the operand is commutative.
static bool buildCompressMask(ArrayRef< Value * > PointerOps, ArrayRef< unsigned > Order, Type *ScalarTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< int > &CompressMask)
Builds compress-like mask for shuffles for the given PointerOps, ordered with Order.
static std::pair< InstructionCost, InstructionCost > getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, ArrayRef< Type * > ArgTys)
Calculates the costs of vectorized intrinsic (if possible) and vectorized function (if possible) call...
static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements, SmallVectorImpl< int > &Mask)
static cl::opt< bool > SLPReVec("slp-revec", cl::init(false), cl::Hidden, cl::desc("Enable vectorization for wider vector utilization"))
static bool isValidForAlternation(unsigned Opcode)
static bool areAllOperandsNonInsts(Value *V)
Checks if the provided value does not require scheduling.
static SmallVector< Constant * > replicateMask(ArrayRef< Constant * > Val, unsigned VF)
Replicates the given Val VF times.
static SmallVector< Type * > buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID, const unsigned VF, unsigned MinBW, const TargetTransformInfo *TTI)
Builds the arguments types vector for the given call instruction with the given ID for the specified ...
static cl::opt< int > RootLookAheadMaxDepth("slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for searching best rooting option"))
static const unsigned AliasedCheckLimit
static Type * getValueType(Value *V)
Returns the type of the given value/instruction V.
static std::string shortBundleName(ArrayRef< Value * > VL, int Idx=-1)
Print a short descriptor of the instruction bundle suitable for debug output.
static LLVM_DUMP_METHOD void dumpOrder(const BoUpSLP::OrdersType &Order)
static bool isValidElementType(Type *Ty)
Predicate for the element types that the SLP vectorizer supports.
static Instruction * getReductionInstr(const DominatorTree *DT, PHINode *P, BasicBlock *ParentBB, LoopInfo *LI)
Try and get a reduction instruction from a phi node.
static SmallVector< int > calculateShufflevectorMask(ArrayRef< Value * > VL)
static bool doesNotNeedToBeScheduled(Value *V)
Checks if the specified value does not require scheduling.
static bool allSameType(ArrayRef< Value * > VL)
static MemoryLocation getLocation(Instruction *I)
static bool allSameBlock(ArrayRef< Value * > VL)
static unsigned getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not greater than Sz, which forms type,...
static bool areTwoInsertFromSameBuildVector(InsertElementInst *VU, InsertElementInst *V, function_ref< Value *(InsertElementInst *)> GetBaseOperand)
Check if two insertelement instructions are from the same buildvector.
static bool arePointersCompatible(Value *Ptr1, Value *Ptr2, const TargetLibraryInfo &TLI, bool CompareOpcodes=true)
static std::pair< InstructionCost, InstructionCost > getGEPCosts(const TargetTransformInfo &TTI, ArrayRef< Value * > Ptrs, Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind, Type *ScalarTy, VectorType *VecTy)
Calculate the scalar and the vector costs from vectorizing set of GEPs.
static SmallBitVector isUndefVector(const Value *V, const SmallBitVector &UseMask={})
Checks if the given value is actually an undefined constant vector.
static Instruction * findInstructionWithOpcode(ArrayRef< Value * > VL, unsigned Opcode)
Find an instruction with a specific opcode in VL.
static const SCEV * calculateRtStride(ArrayRef< Value * > PointerOps, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices, SmallVectorImpl< int64_t > &Coeffs)
Checks if the provided list of pointers Pointers represents the strided pointers for type ElemTy.
static InstructionCost getExtractWithExtendCost(const TargetTransformInfo &TTI, unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput)
This is similar to TargetTransformInfo::getExtractWithExtendCost, but if Dst is a FixedVectorType,...
static InstructionsState getSameOpcode(ArrayRef< Value * > VL, const TargetLibraryInfo &TLI)
static cl::opt< int > ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden, cl::desc("Limit the size of the SLP scheduling region per block"))
Limits the size of scheduling regions in a block.
static std::pair< Instruction *, Instruction * > getMainAltOpsNoStateVL(ArrayRef< Value * > VL)
Returns main/alternate instructions for the given VL.
static Instruction * tryGetSecondaryReductionRoot(PHINode *Phi, Instruction *Root)
We could have an initial reduction that is not an add.
static RecurKind getRdxKind(Value *V)
Gets recurrence kind from the specified value.
static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1)
static void gatherPossiblyVectorizableLoads(const BoUpSLP &R, ArrayRef< Value * > VL, const DataLayout &DL, ScalarEvolution &SE, const TargetTransformInfo &TTI, SmallVectorImpl< SmallVector< std::pair< LoadInst *, int64_t > > > &GatheredLoads, bool AddNew=true)
Tries to find subvector of loads and builds new vector of only loads if can be profitable.
static cl::opt< int > MinVectorRegSizeOption("slp-min-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static std::optional< TargetTransformInfo::ShuffleKind > isFixedVectorShuffle(ArrayRef< Value * > VL, SmallVectorImpl< int > &Mask, AssumptionCache *AC)
Checks if the vector of instructions can be represented as a shuffle, like: x0 = extractelement <4 x ...
static bool tryToVectorizeSequence(SmallVectorImpl< T * > &Incoming, function_ref< bool(T *, T *)> Comparator, function_ref< bool(ArrayRef< T * >, T *)> AreCompatible, function_ref< bool(ArrayRef< T * >, bool)> TryToVectorizeHelper, bool MaxVFOnly, BoUpSLP &R)
static std::optional< unsigned > getAggregateSize(Instruction *InsertInst)
static unsigned getNumberOfParts(const TargetTransformInfo &TTI, VectorType *VecTy, const unsigned Limit=std::numeric_limits< unsigned >::max())
Returns number of parts, the type VecTy will be split at the codegen phase.
static std::optional< unsigned > getInsertExtractIndex(const Value *Inst, unsigned Offset)
static cl::opt< unsigned > RecursionMaxDepth("slp-recursion-max-depth", cl::init(12), cl::Hidden, cl::desc("Limit the recursion depth when building a vectorizable tree"))
static bool tryToFindDuplicates(SmallVectorImpl< Value * > &VL, SmallVectorImpl< int > &ReuseShuffleIndices, const TargetTransformInfo &TTI, const TargetLibraryInfo &TLI, const InstructionsState &S, const BoUpSLP::EdgeInfo &UserTreeIdx, bool TryPad=false)
Checks that every instruction appears once in the list and if not, packs them, building ReuseShuffleI...
static Align computeCommonAlignment(ArrayRef< Value * > VL)
Calculates minimal alignment as a common alignment.
static void addMask(SmallVectorImpl< int > &Mask, ArrayRef< int > SubMask, bool ExtendingManyInputs=false)
Shuffles Mask in accordance with the given SubMask.
static void fixupOrderingIndices(MutableArrayRef< unsigned > Order)
Order may have elements assigned special value (size) which is out of bounds.
static Value * createExtractVector(IRBuilderBase &Builder, Value *Vec, unsigned SubVecVF, unsigned Index)
Generates subvector extract using Generator or using default shuffle.
static cl::opt< bool > DisableTreeReorder("slp-disable-tree-reorder", cl::init(false), cl::Hidden, cl::desc("Disable tree reordering even if it is " "profitable. Used for testing only."))
static Instruction * getNonPhiOperand(Instruction *I, PHINode *Phi)
Returns the first operand of I that does not match Phi.
static InstructionCost getVectorInstrCost(const TargetTransformInfo &TTI, Type *ScalarTy, unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Scalar, ArrayRef< std::tuple< Value *, User *, int > > ScalarUserAndIdx)
This is similar to TargetTransformInfo::getVectorInstrCost, but if ScalarTy is a FixedVectorType,...
static SmallBitVector getAltInstrMask(ArrayRef< Value * > VL, Type *ScalarTy, unsigned Opcode0, unsigned Opcode1)
static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI, const DominatorTree &DT)
Compare two cmp instructions.
static void inversePermutation(ArrayRef< unsigned > Indices, SmallVectorImpl< int > &Mask)
static bool isReverseOrder(ArrayRef< unsigned > Order)
Check if Order represents reverse order.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
This file defines generic set operations that may be used on set's of different types,...
This file implements a set that has insertion order iteration characteristics.
This file implements the SmallBitVector class.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallString class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
static SymbolRef::Type getType(const Symbol *Sym)
static const int BlockSize
LocallyHashedType DenseMapInfo< LocallyHashedType >::Empty
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
static SmallVector< VPValue *, 4 > getOperands(ArrayRef< VPValue * > Values, unsigned OperandIndex)
static const uint32_t IV[8]
Merges shuffle masks and emits final shuffle instruction, if required.
InstructionCost finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &, function_ref< Value *(Value *, Value *, ArrayRef< int >)>)> Action={})
Finalize emission of the shuffles.
void add(const TreeEntry &E1, ArrayRef< int > Mask)
ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI, ArrayRef< Value * > VectorizedVals, BoUpSLP &R, SmallPtrSetImpl< Value * > &CheckedExtracts)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
std::optional< InstructionCost > needToDelay(const TreeEntry *, ArrayRef< SmallVector< const TreeEntry * > >) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
InstructionCost createFreeze(InstructionCost Cost)
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
void resetForSameNode()
Reset the builder to handle perfect diamond match.
void add(Value *V1, ArrayRef< int > Mask, bool ForExtracts=false)
Adds another one input vector and the mask for the shuffling.
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
Merges shuffle masks and emits final shuffle instruction, if required.
Value * createFreeze(Value *V)
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
void resetForSameNode()
Reset the builder to handle perfect diamond match.
void addOrdered(Value *V1, ArrayRef< unsigned > Order)
Adds another one input vector and the mask for the shuffling.
Value * finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &, function_ref< Value *(Value *, Value *, ArrayRef< int >)>)> Action={})
Finalize emission of the shuffles.
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
Adjusts extractelements after reusing them.
void add(Value *V1, ArrayRef< int > Mask, bool=false)
Adds another one input vector and the mask for the shuffling.
std::optional< Value * > needToDelay(const TreeEntry *E, ArrayRef< SmallVector< const TreeEntry * > > Deps) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
~ShuffleInstructionBuilder()
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Adds single input vector (in form of tree entry) and the mask for its shuffling.
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
Adds 2 input vectors (in form of tree entries) and the mask for their shuffling.
ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
A manager for alias analyses.
Class for arbitrary precision integers.
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
void clearBit(unsigned BitPosition)
Set a given bit to 0.
uint64_t getZExtValue() const
Get zero extended value.
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
LLVM_ABI APInt urem(const APInt &RHS) const
Unsigned remainder operation.
unsigned getBitWidth() const
Return the number of bits in the APInt.
bool ult(const APInt &RHS) const
Unsigned less than comparison.
void clearAllBits()
Set every bit to 0.
void negate()
Negate this APInt in place.
unsigned logBase2() const
void setAllBits()
Set every bit to 1.
void setBits(unsigned loBit, unsigned hiBit)
Set the bits from loBit (inclusive) to hiBit (exclusive) to 1.
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
bool isOne() const
Determine if this is a value of 1.
static APInt getBitsSetFrom(unsigned numBits, unsigned loBit)
Constructs an APInt value that has a contiguous range of bits set.
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
PassT::Result * getCachedResult(IRUnitT &IR) const
Get the cached result of an analysis pass for a given IR unit.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
bool equals(ArrayRef RHS) const
equals - Check for element-wise equality.
const T & back() const
back - Get the last element.
ArrayRef< T > take_front(size_t N=1) const
Return a copy of *this with only the first N elements.
ArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
const T & front() const
front - Get the first element.
size_t size() const
size - Get the array size.
bool empty() const
empty - Check if the array is empty.
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
const T & consume_front()
consume_front() - Returns the first element and drops it from ArrayRef.
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
LLVM Basic Block Representation.
iterator begin()
Instruction iterator methods.
const Function * getParent() const
Return the enclosing method, or null if none.
InstListType::reverse_iterator reverse_iterator
InstListType::iterator iterator
Instruction iterators...
LLVM_ABI const_iterator getFirstNonPHIOrDbgOrAlloca() const
Returns an iterator to the first instruction in this block that is not a PHINode, a debug intrinsic,...
InstListType::const_reverse_iterator const_reverse_iterator
bool isEHPad() const
Return true if this basic block is an exception handling block.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Represents analyses that only rely on functions' control flow.
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
unsigned getBundleOperandsEndIndex() const
Return the index of the last bundle operand in the Use array.
LLVM_ABI void getOperandBundlesAsDefs(SmallVectorImpl< OperandBundleDef > &Defs) const
Return the list of operand bundles attached to this instruction as a vector of OperandBundleDefs.
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
bool hasIdenticalOperandBundleSchema(const CallBase &Other) const
Return true if Other has the same sequence of operand bundle tags with the same number of operands on...
unsigned getBundleOperandsStartIndex() const
Return the index of the first bundle operand in the Use array.
Value * getArgOperand(unsigned i) const
FunctionType * getFunctionType() const
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
unsigned arg_size() const
bool hasOperandBundles() const
Return true if this User has any operand bundles.
This class represents a function call, abstracting a target machine's calling convention.
This is the base class for all instructions that perform data casts.
This class is the base class for the comparison instructions.
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
@ ICMP_SLT
signed less than
@ ICMP_SLE
signed less or equal
@ ICMP_UGE
unsigned greater or equal
@ ICMP_UGT
unsigned greater than
@ ICMP_SGT
signed greater than
@ ICMP_ULT
unsigned less than
@ ICMP_SGE
signed greater or equal
@ ICMP_ULE
unsigned less or equal
Predicate getSwappedPredicate() const
For example, EQ->EQ, SLE->SGE, ULT->UGT, OEQ->OEQ, ULE->UGE, OLT->OGT, etc.
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Predicate getPredicate() const
Return the predicate for this instruction.
static LLVM_ABI Constant * getIntToPtr(Constant *C, Type *Ty, bool OnlyIfReduced=false)
static LLVM_ABI Constant * getBinOpIdentity(unsigned Opcode, Type *Ty, bool AllowRHSConstant=false, bool NSZ=false)
Return the identity constant for a binary opcode.
This is the shared class of boolean and integer constants.
static LLVM_ABI ConstantInt * getTrue(LLVMContext &Context)
static ConstantInt * getSigned(IntegerType *Ty, int64_t V, bool ImplicitTrunc=false)
Return a ConstantInt with the specified value for the specified type.
static LLVM_ABI ConstantInt * getFalse(LLVMContext &Context)
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
const APInt & getValue() const
Return the constant as an APInt value reference.
static LLVM_ABI Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
static LLVM_ABI Constant * get(ArrayRef< Constant * > V)
This is an important base class in LLVM.
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
static bool shouldExecute(CounterInfo &Counter)
static DebugLoc getUnknown()
An analysis that produces DemandedBits for a function.
ValueT & at(const_arg_type_t< KeyT > Val)
at - Return the entry for the specified key, or abort if no such entry exists.
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
iterator find(const_arg_type_t< KeyT > Val)
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
bool erase(const KeyT &Val)
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Implements a dense probed hash-table based set.
Base class for the actual dominator tree node.
unsigned getDFSNumIn() const
getDFSNumIn/getDFSNumOut - These return the DFS visitation order for nodes in the dominator tree.
Analysis pass which computes a DominatorTree.
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
LLVM_ABI bool isReachableFromEntry(const Use &U) const
Provide an overload for a Use.
LLVM_ABI bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
static constexpr ElementCount getFixed(ScalarTy MinVal)
Convenience struct for specifying and reasoning about fast-math flags.
bool allowReassoc() const
Flag queries.
bool allowContract() const
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
ArrayRef< Type * > params() const
Type * getReturnType() const
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
For the node iterator we just need to turn the TreeEntry iterator into a TreeEntry* iterator so that ...
nodes_iterator operator++()
nodes_iterator(const ItTy &It2)
bool operator!=(const nodes_iterator &N2) const
Common base class shared among various IRBuilders.
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
IntegerType * getInt1Ty()
Fetch the type representing a single bit.
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Value * CreateFreeze(Value *V, const Twine &Name="")
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
LLVM_ABI Value * CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 2 operands which is mangled on the first type.
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
LLVM_ABI CallInst * CreateUnaryIntrinsic(Intrinsic::ID ID, Value *V, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 1 operand which is mangled on its type.
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
LLVM_ABI Value * CreateSelectWithUnknownProfile(Value *C, Value *True, Value *False, StringRef PassName, const Twine &Name="")
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Value * CreateFMul(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
This instruction inserts a single (scalar) element into a VectorType value.
VectorType * getType() const
Overload to return most specific vector type.
static InstructionCost getInvalid(CostType Val=0)
bool mayReadOrWriteMemory() const
Return true if this instruction may read or write memory.
LLVM_ABI bool mayWriteToMemory() const LLVM_READONLY
Return true if this instruction may modify memory.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
LLVM_ABI void moveAfter(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
LLVM_ABI bool isCommutative() const LLVM_READONLY
Return true if the instruction is commutative:
Instruction * user_back()
Specialize the methods defined in Value, as we know that an instruction can only be used by other ins...
LLVM_ABI bool comesBefore(const Instruction *Other) const
Given an instruction Other in the same basic block as this instruction, return true if this instructi...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
LLVM_ABI bool isIdenticalTo(const Instruction *I) const LLVM_READONLY
Return true if the specified instruction is exactly identical to the current one.
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
const SmallVectorImpl< Type * > & getArgTypes() const
An instruction for reading from memory.
Value * getPointerOperand()
Analysis pass that exposes the LoopInfo for a function.
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
Represents a single loop in the control flow graph.
This class implements a map that also provides access to all stored values in a deterministic order.
VectorType takeVector()
Clear the MapVector and return the underlying vector.
iterator find(const KeyT &Key)
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
ValueT lookup(const KeyT &Key) const
std::pair< KeyT, ValueT > & front()
Information for memory intrinsic cost model.
This is the common base class for memset/memcpy/memmove.
Representation for a specific memory location.
static LLVM_ABI MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
const Value * Ptr
The address of the start of the location.
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
T & front() const
front - Get the first element.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Value * getIncomingValueForBlock(const BasicBlock *BB) const
BasicBlock * getIncomingBlock(unsigned i) const
Return incoming basic block number i.
unsigned getNumIncomingValues() const
Return the number of incoming edges.
Pass interface - Implemented by all 'passes'.
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
A discriminated union of two or more pointer types, with the discriminator in the low bit of the poin...
bool isNull() const
Test if the pointer held in the union is null, regardless of which type it is.
T dyn_cast() const
Returns the current pointer if it is of the specified pointer type, otherwise returns null.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
PriorityQueue - This class behaves like std::priority_queue and provides a few additional convenience...
unsigned getOpcode() const
static bool isIntMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is an integer min/max kind.
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
This class represents an analyzed expression in the program.
LLVM_ABI bool isZero() const
Return true if the expression is a constant zero.
LLVM_ABI bool isNonConstantNegative() const
Return true if the specified scev is negated, but not a constant.
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
LLVM_ABI const SCEV * getConstant(ConstantInt *V)
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
LLVM_ABI const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
LLVM_ABI const SCEV * getUDivExactExpr(const SCEV *LHS, const SCEV *RHS)
Get a canonical unsigned division expression, or something simpler if possible.
LLVM_ABI const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
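These calls compose into the usual consecutive-pointer check; a hedged sketch (SE, PtrA, PtrB and EltSize, a ConstantInt holding the element size in bytes, are assumed):

  const SCEV *A = SE.getSCEV(PtrA), *B = SE.getSCEV(PtrB);
  const SCEV *DiffBytes = SE.getMinusSCEV(B, A);  // B - A, in bytes
  const SCEV *Dist = SE.getUDivExactExpr(DiffBytes, SE.getConstant(EltSize));
  // SCEVs are uniqued, so pointer equality is a semantic comparison:
  bool Consecutive = Dist == SE.getOne(Dist->getType());

getOne is a ScalarEvolution convenience not listed above; getUDivExactExpr assumes the division is exact, which holds when both pointers address elements of the same array.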
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
ArrayRef< value_type > getArrayRef() const
size_type size() const
Determine the number of elements in the SetVector.
const value_type & front() const
Return the first element of the SetVector.
void insert_range(Range &&R)
Vector takeVector()
Clear the SetVector and return the underlying vector.
bool contains(const_arg_type key) const
Check if the SetVector contains the given key.
void clear()
Completely clear the SetVector.
bool empty() const
Determine if the SetVector is empty or not.
bool insert(const value_type &X)
Insert a new element into the SetVector.
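A minimal usage sketch (V0 and V1 are arbitrary Value *):

  #include "llvm/ADT/SetVector.h"

  SetVector<Value *> Uniqued;
  Uniqued.insert(V0);
  Uniqued.insert(V1);
  Uniqued.insert(V0);               // no-op: already present
  assert(Uniqued.size() == 2 && Uniqued.front() == V0);
  auto Vec = Uniqued.takeVector();  // clears the set, keeps insertion order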
static LLVM_ABI bool isZeroEltSplatMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses all elements with the same value as the first element of exactly one source vector.
static LLVM_ABI bool isOneUseSingleSourceMask(ArrayRef< int > Mask, int VF)
Return true if this shuffle mask represents a "clustered" mask of size VF, i.e. each index between [0, VF) is used exactly once in each submask of size VF.
static LLVM_ABI bool isDeInterleaveMaskOfFactor(ArrayRef< int > Mask, unsigned Factor, unsigned &Index)
Check if the mask is a de-interleave mask of the given factor Factor, like <Index, Index+Factor, Index+2*Factor, ...>.
static LLVM_ABI bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossings.
static LLVM_ABI bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static LLVM_ABI bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
static LLVM_ABI bool isInsertSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &NumSubElts, int &Index)
Return true if this shuffle mask is an insert subvector mask.
static LLVM_ABI bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
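A small worked example of these predicates (assuming using namespace llvm):

  int Index;
  SmallVector<int, 4> Mask = {4, 5, 6, 7};
  // Reads lanes [4, 8) of one 8-element source: an extract-subvector mask.
  assert(ShuffleVectorInst::isExtractSubvectorMask(Mask, /*NumSrcElts=*/8, Index) &&
         Index == 4);
  assert(!ShuffleVectorInst::isReverseMask(Mask, /*NumSrcElts=*/8));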
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is small.
int find_first() const
Returns the index of the first set bit, -1 if none of the bits are set.
bool test(unsigned Idx) const
int find_next(unsigned Prev) const
Returns the index of the next set bit following the "Prev" bit.
bool all() const
Returns true if all bits are set.
size_type size() const
Returns the number of bits in this bitvector.
bool any() const
Returns true if any bit is set.
size_type count() const
Returns the number of bits which are set.
bool none() const
Returns true if none of the bits are set.
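A minimal sketch of the set-then-scan idiom used with these members:

  #include "llvm/ADT/SmallBitVector.h"

  SmallBitVector Covered(8);   // 8 bits, all clear
  Covered.set(1);
  Covered.set(3);
  for (int I = Covered.find_first(); I != -1; I = Covered.find_next(I))
    ;                          // visits bit 1, then bit 3
  assert(Covered.any() && !Covered.all() && Covered.count() == 2);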
Implements a dense probed hash-table based set with some number of buckets stored inline.
A templated base class for SmallPtrSet which provides the typesafe interface that is common across all small sizes.
bool erase(PtrType Ptr)
Remove pointer from the set.
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
void insert_range(Range &&R)
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or fewer elements.
A SetVector that performs no allocations if smaller than a certain size.
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less than N).
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
bool contains(const T &V) const
Check if the SmallSet contains the given element.
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
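The classic visit-once idiom built from insert's pair result (Worklist is a hypothetical SmallVector<Value *, N>):

  #include "llvm/ADT/SmallPtrSet.h"

  SmallPtrSet<Value *, 16> Visited;
  // insert(...).second is true only the first time a pointer is seen.
  if (Visited.insert(V).second)
    Worklist.push_back(V);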
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better as a string (e.g. operator+ etc.).
This class consists of common code factored out of the SmallVector class to reduce code duplication based on the SmallVector size.
void assign(size_type NumElts, ValueParamT Elt)
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
iterator erase(const_iterator CI)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void swap(SmallVectorImpl &RHS)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
Type * getPointerOperandType() const
Value * getValueOperand()
Value * getPointerOperand()
StringRef - Represent a constant reference to a string, i.e. a character array and a length, which need not be null terminated.
TargetFolder - Create constants with target dependent folding.
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
The instances of the Type class are immutable: once they are created, they are never changed.
LLVM_ABI bool isEmptyTy() const
Return true if this type is empty, that is, it has no elements or all of its elements are empty.
bool isVectorTy() const
True if this is an instance of VectorType.
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
bool isPointerTy() const
True if this is an instance of PointerType.
LLVM_ABI unsigned getStructNumElements() const
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
bool isSingleValueType() const
Return true if the type is a valid type for a register in codegen.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
LLVM_ABI Type * getWithNewType(Type *EltTy) const
Given a vector type, change the element type while keeping the old number of elements.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
LLVM_ABI void print(raw_ostream &O, bool IsForDebug=false, bool NoDetails=false) const
Print the current type.
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
bool isIntegerTy() const
True if this is an instance of IntegerType.
TypeID getTypeID() const
Return the type id for the type.
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
bool isVoidTy() const
Return true if this is 'void'.
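A sketch of the usual scalar-type interrogation these members enable:

  Type *STy = Ty->getScalarType();          // element type if Ty is a vector
  if (STy->isIntegerTy() || STy->isFloatingPointTy()) {
    unsigned Bits = STy->getScalarSizeInBits();  // e.g. 32 for i32 or float
    (void)Bits;
  }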
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
A Use represents the edge between a Value definition and its users.
User(Type *ty, unsigned vty, AllocInfo AllocInfo)
LLVM_ABI bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
Value * getOperand(unsigned i) const
unsigned getNumOperands() const
iterator_range< value_op_iterator > operand_values()
The Vector Function Database.
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated with the CallInst CI.
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
user_iterator user_begin()
bool hasOneUse() const
Return true if there is exactly one use of this value.
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
iterator_range< user_iterator > users()
unsigned getValueID() const
Return an ID for the concrete type of this object.
bool hasUseList() const
Check if this Value has a use-list.
LLVM_ABI bool hasNUsesOrMore(unsigned N) const
Return true if this value has N uses or more.
LLVM_ABI bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
LLVM_ABI User * getUniqueUndroppableUser()
Return true if there is exactly one unique user of this value that cannot be dropped (that user can h...
LLVM_ABI LLVMContext & getContext() const
All values hold a context through their type.
LLVM_ABI unsigned getNumUses() const
This method computes the number of uses of this Value.
iterator_range< use_iterator > uses()
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
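A hedged sketch combining the use-list queries (Old and New are arbitrary Value *):

  if (Old->hasOneUse()) {
    User *OnlyUser = *Old->user_begin();
    (void)OnlyUser;
  }
  for (User *U : Old->users())
    if (auto *I = dyn_cast<Instruction>(U))
      (void)I;                       // inspect each instruction user
  Old->replaceAllUsesWith(New);      // every use of Old now refers to New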
Base class of all SIMD vector types.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector.
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct a VectorType.
static LLVM_ABI bool isValidElementType(Type *ElemTy)
Return true if the specified type is valid as an element type.
Type * getElementType() const
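A minimal sketch of constructing a fixed vector type:

  #include "llvm/IR/DerivedTypes.h"

  LLVMContext Ctx;
  Type *F32 = Type::getFloatTy(Ctx);
  assert(VectorType::isValidElementType(F32));
  auto *V4F32 = VectorType::get(F32, ElementCount::getFixed(4));  // <4 x float>
  (void)V4F32;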
std::pair< iterator, bool > insert(const ValueT &V)
iterator find(const_arg_type_t< ValueT > V)
void insert_range(Range &&R)
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
An efficient, type-erasing, non-owning reference to a callable.
An opaque object representing a hash code.
const ParentTy * getParent() const
self_iterator getIterator()
NodeTy * getNextNode()
Get the next node, or nullptr for the list tail.
SmallVector< BoUpSLP::EdgeInfo, 1 >::iterator I
iterator_adaptor_base()=default
A range adaptor for a pair of iterators.
This class implements an extremely fast bulk output stream that can only output to a stream.
raw_ostream & indent(unsigned NumSpaces)
indent - Insert 'NumSpaces' spaces.
A raw_ostream that writes to an std::string.
A raw_ostream that writes to an SmallVector or SmallString.
A helper class used for scoring candidates for two consecutive lanes.
static const int ScoreReversedLoads
Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
static const int ScoreFail
Score for failing to find a decent match.
static const int ScoreUndef
Matching with an undef is preferable to failing.
int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2, ArrayRef< Value * > MainAltOps) const
static const int ScoreConsecutiveExtracts
ExtractElementInst from same vector and consecutive indexes.
int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1, Instruction *U2, int CurrLevel, ArrayRef< Value * > MainAltOps) const
Go through the operands of LHS and RHS recursively until MaxLevel, and return the cumulative score.
static const int ScoreReversedExtracts
ExtractElementInst from same vector and reversed indices.
LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R, int NumLanes, int MaxLevel)
static const int ScoreSplat
Identical instructions (a.k.a. splat or broadcast).
static const int ScoreConstants
Constants.
static const int ScoreMaskedGatherCandidate
A load candidate for masked gather.
static const int ScoreSplatLoads
The same load multiple times.
static const int ScoreAllUserVectorized
Score if all users are vectorized.
static const int ScoreSameOpcode
Instructions with the same opcode.
static const int ScoreAltOpcodes
Instructions with alt opcodes (e.g, add + sub).
static const int ScoreConsecutiveLoads
Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
A helper data structure to hold the operands of a vector of instructions.
LLVM_DUMP_METHOD raw_ostream & print(raw_ostream &OS) const
static LLVM_DUMP_METHOD void dumpMode(ReorderingMode RMode)
Debug print.
LLVM_DUMP_METHOD void dump() const
Debug print.
static LLVM_DUMP_METHOD raw_ostream & printMode(ReorderingMode RMode, raw_ostream &OS)
friend raw_ostream & operator<<(raw_ostream &OS, ReorderingMode RMode)
ValueList getVL(unsigned OpIdx) const
Returns a value vector with the operands across all lanes for the operand at OpIdx.
static LLVM_DUMP_METHOD StringRef getModeStr(ReorderingMode RMode)
VLOperands(ArrayRef< Value * > RootVL, ArrayRef< ValueList > Operands, const InstructionsState &S, const BoUpSLP &R)
Initialize with all the operands of the instruction vector RootVL.
Bottom Up SLP Vectorizer.
static bool isIdentityOrder(ArrayRef< unsigned > Order)
Does this non-empty order represent an identity order?
bool isProfitableToReorder() const
Checks if it is profitable to reorder the current tree.
void eraseInstruction(Instruction *I)
Removes an instruction from its block and eventually deletes it.
bool isAnalyzedReductionRoot(Instruction *I) const
Checks if the instruction was already analyzed for being possible reduction root.
bool areAnalyzedReductionVals(ArrayRef< Value * > VL) const
Checks if the provided list of reduced values was checked already for vectorization.
LoadsState
Tracks the state in which the loads in the given sequence can be represented.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::ScheduleEntity &SE)
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::ScheduleBundle &Bundle)
void analyzedReductionVals(ArrayRef< Value * > VL)
Adds the list of reduced values to the list of values already checked for vectorization.
std::optional< OrdersType > findPartiallyOrderedLoads(const TreeEntry &TE)
Sort loads into increasing pointer offsets to allow greater clustering.
unsigned getMaxVecRegSize() const
OptimizationRemarkEmitter * getORE()
void reorderTopToBottom()
Reorders the current graph to the most profitable order, starting from the root node down to the leaf nodes.
std::optional< std::pair< Type *, bool > > getRootNodeTypeWithNoCast() const
Returns the type/is-signed info for the root node in the graph without casting.
unsigned getTreeSize() const
void reorderBottomToTop(bool IgnoreReorder=false)
Reorders the current graph to the most profitable order, starting from the leaves up to the root.
InstructionCost getSpillCost()
bool isVectorized(const Value *V) const
Check if the value is vectorized in the tree.
bool isAnyGathered(const SmallDenseSet< Value * > &Vals) const
Checks if the given value is gathered in one of the nodes.
unsigned getCanonicalGraphSize() const
Returns the base graph size, before any transformations.
bool isStridedLoad(ArrayRef< Value * > PointerOps, Type *ScalarTy, Align Alignment, const int64_t Diff, const size_t Sz) const
Checks if strided loads can be generated out of VL loads with pointers PointerOps:
SmallVector< StoreInst *, 8 > StoreList
bool isLoadCombineCandidate(ArrayRef< Value * > Stores) const
Assume that a vector of stores of bitwise-or/shifted/zexted loaded values can be load combined in the backend.
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
ArrayRef< Value * > getRootNodeScalars() const
Return the scalars of the root node.
unsigned getMinVecRegSize() const
bool isLoadCombineReductionCandidate(RecurKind RdxKind) const
Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values can be load combined in the backend.
unsigned getVectorElementSize(Value *V)
unsigned getMinVF(unsigned Sz) const
void clearReductionData()
Clear the list of the analyzed reduction root instructions.
LoadsState canVectorizeLoads(ArrayRef< Value * > VL, const Value *VL0, SmallVectorImpl< unsigned > &Order, SmallVectorImpl< Value * > &PointerOps, StridedPtrInfo &SPtrInfo, unsigned *BestVF=nullptr, bool TryRecursiveCheck=true) const
Checks if the given array of loads can be represented as a vectorized load, a scatter, or just a simple gather.
void computeMinimumValueSizes()
Compute the minimum type sizes required to represent the entries in a vectorizable tree.
FixedVectorType * getReductionType() const
Returns the reduction type after minbitwidth analysis.
SmallVector< unsigned, 4 > OrdersType
SmallVector< Instruction *, 16 > InstrList
void analyzedReductionRoot(Instruction *I)
Register given instruction as already analyzed for being possible reduction root.
bool isDeleted(Instruction *I) const
Checks if the instruction is marked for deletion.
InstructionCost getTreeCost(InstructionCost TreeCost, ArrayRef< Value * > VectorizedVals={}, InstructionCost ReductionCost=TTI::TCC_Free)
void transformNodes()
Transforms graph nodes to target specific representations, if profitable.
bool analyzeRtStrideCandidate(ArrayRef< Value * > PointerOps, Type *ScalarTy, Align CommonAlignment, SmallVectorImpl< unsigned > &SortedIndices, StridedPtrInfo &SPtrInfo) const
Return true if an array of scalar loads can be replaced with a strided load (with run-time stride).
bool isSignedMinBitwidthRootNode() const
Checks whether the root graph node can be emitted with a narrower bitwidth at codegen and, if so, returns its signedness.
void buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues={})
Builds external uses of the vectorized scalars, i.e. the list of vectorized scalars that have users outside the vectorizable tree.
bool isTreeTinyAndNotFullyVectorizable(bool ForReduction=false) const
void registerNonVectorizableLoads(ArrayRef< T * > VL)
Registers non-vectorizable sequence of loads.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::ScheduleData &SD)
unsigned canMapToVector(Type *T) const
Check if a homogeneous aggregate is isomorphic to some VectorType.
void deleteTree()
Clear the internal data structures that are created by 'buildTree'.
void buildTree(ArrayRef< Value * > Roots, const SmallDenseSet< Value * > &UserIgnoreLst)
Construct a vectorizable tree that starts at Roots, ignoring users for the purpose of scheduling and extraction in UserIgnoreLst.
SmallDenseSet< Value *, 4 > ExtraValueToDebugLocsMap
bool isTreeNotExtendable() const
Checks if the graph and all its subgraphs cannot be better vectorized.
bool areKnownNonVectorizableLoads(ArrayRef< T * > VL) const
Checks if the given loads sequence is known as not vectorizable.
SmallVector< Value *, 8 > ValueList
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB, const DataLayout *DL, OptimizationRemarkEmitter *ORE)
bool isGathered(const Value *V) const
Checks if the given value is gathered in one of the nodes.
std::optional< OrdersType > findReusedOrderedScalars(const TreeEntry &TE, bool TopToBottom, bool IgnoreReorder)
Checks if the specified gather tree entry TE can be represented as a shuffled vector entry + (possibly) a permutation with other gathers.
bool analyzeConstantStrideCandidate(const ArrayRef< Value * > PointerOps, Type *ElemTy, Align Alignment, const SmallVectorImpl< unsigned > &SortedIndices, const int64_t Diff, Value *Ptr0, Value *PtrN, StridedPtrInfo &SPtrInfo) const
Return true if an array of scalar loads can be replaced with a strided load (with constant stride).
bool isNotScheduled(const Value *V) const
Checks if the specified value was not scheduled.
Value * vectorizeTree()
Vectorize the tree that starts with the elements in VL.
InstructionCost calculateTreeCostAndTrimNonProfitable(ArrayRef< Value * > VectorizedVals={})
Calculates the cost of the subtrees, trims non-profitable ones and returns final cost.
std::optional< int > findBestRootPair(ArrayRef< std::pair< Value *, Value * > > Candidates, int Limit=LookAheadHeuristics::ScoreFail) const
Evaluate each pair in Candidates and return the index into Candidates of the pair with the highest score.
std::optional< OrdersType > getReorderingData(const TreeEntry &TE, bool TopToBottom, bool IgnoreReorder)
Gets reordering data for the given tree entry.
SmallPtrSet< Value *, 16 > ValueSet
void optimizeGatherSequence()
Perform LICM and CSE on the newly generated gather sequences.
void removeInstructionsAndOperands(ArrayRef< T * > DeadVals, ArrayRef< std::tuple< Value *, unsigned, bool > > VectorValuesAndScales)
Remove instructions from the parent function and clear the operands of DeadVals instructions, marking trivially dead operands for deletion.
Function * getVectorizedFunction(const VFShape &Shape) const
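The BoUpSLP members above compose into a driver sequence roughly like the following hedged sketch (simplified from the real callers; reordering, scheduling and error paths elided; SLPCostThreshold names the cost-threshold option described near the top of the file, and the exact identifier is assumed here):

  R.buildTree(Roots, UserIgnoreList);
  if (R.isTreeTinyAndNotFullyVectorizable())
    return false;
  R.buildExternalUses();
  R.computeMinimumValueSizes();
  InstructionCost Cost = R.calculateTreeCostAndTrimNonProfitable();
  if (Cost.isValid() && Cost < -SLPCostThreshold)
    R.vectorizeTree();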
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
@ C
The default llvm calling convention, compatible with C.
@ BasicBlock
Various leaf nodes.
bool isBitwiseLogicOp(unsigned Opcode)
Whether this is bitwise logic opcode.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
TwoOps_match< ValueOpTy, PointerOpTy, Instruction::Store > m_Store(const ValueOpTy &ValueOp, const PointerOpTy &PointerOp)
Matches StoreInst.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
ap_match< APInt > m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMaxNum(const Opnd0 &Op0, const Opnd1 &Op1)
specificval_ty m_Specific(const Value *V)
Match only the specific, given value.
auto match_fn(const Pattern &P)
A match functor that can be used as a UnaryPredicate in functional algorithms like all_of.
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMinimum(const Opnd0 &Op0, const Opnd1 &Op1)
MaxMin_match< ICmpInst, LHS, RHS, smin_pred_ty > m_SMin(const LHS &L, const RHS &R)
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMaximum(const Opnd0 &Op0, const Opnd1 &Op1)
BinaryOp_match< LHS, RHS, Instruction::FAdd > m_FAdd(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
MaxMin_match< ICmpInst, LHS, RHS, umax_pred_ty > m_UMax(const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Match any compare instruction and ignore it.
MaxMin_match< ICmpInst, LHS, RHS, smax_pred_ty > m_SMax(const LHS &L, const RHS &R)
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
auto m_Undef()
Match an arbitrary undef constant.
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMinNum(const Opnd0 &Op0, const Opnd1 &Op1)
BinaryOp_match< LHS, RHS, Instruction::Or > m_Or(const LHS &L, const RHS &R)
MaxMin_match< ICmpInst, LHS, RHS, umin_pred_ty > m_UMin(const LHS &L, const RHS &R)
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
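A short sketch of how these matchers are used (V is an arbitrary Value *):

  #include "llvm/IR/PatternMatch.h"
  using namespace llvm::PatternMatch;

  Value *X;
  const APInt *C;
  // Does V compute 'add X, C' for some constant (including splat vectors)?
  if (match(V, m_Add(m_Value(X), m_APInt(C)))) {
    // X is bound to the non-constant operand, C to the constant.
  }
  // Restrict a rewrite to single-use subexpressions:
  bool OneUseMul = match(V, m_OneUse(m_Mul(m_Value(), m_Value())));
  (void)OneUseMul;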
initializer< Ty > init(const Ty &Val)
unsigned combineHashValue(unsigned a, unsigned b)
Simplistic combination of 32-bit hash values into 32-bit hash values.
@ User
could "use" a pointer
DiagnosticInfoOptimizationBase::Argument NV
friend class Instruction
Iterator for Instructions in a BasicBlock.
LLVM_ABI iterator begin() const
LLVM_ABI Instruction & front() const
A private "module" namespace for types and utilities used by this pass.
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
LLVM_ABI Value * createSimpleReduction(IRBuilderBase &B, Value *Src, RecurKind RdxKind)
Create a reduction of the given vector.
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iterable types.
constexpr auto not_equal_to(T &&Arg)
Functor variant of std::not_equal_to that can be used as a UnaryPredicate in functional algorithms like all_of.
void stable_sort(R &&Range)
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
void fill(R &&Range, T &&Value)
Provide wrappers to std::fill which take ranges instead of having to pass begin/end explicitly.
UnaryFunction for_each(R &&Range, UnaryFunction F)
Provide wrappers to std::for_each which take ranges instead of having to pass begin/end explicitly.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
hash_code hash_value(const FixedPointSemantics &Val)
LLVM_ABI Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
MaybeAlign getAlign(const CallInst &I, unsigned Index)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
LLVM_ABI bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
LLVM_ABI Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B, ..., Z), where A is the 0-based index and B, ..., Z are the corresponding values from the input ranges.
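These wrappers remove begin()/end() boilerplate; a small sketch (VL is an ArrayRef<Value *>):

  bool AllLoads = all_of(VL, IsaPred<LoadInst>);
  (void)AllLoads;
  for (auto [Idx, V] : enumerate(VL)) {
    // Idx is the 0-based position, V the element at that position.
  }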
auto pred_end(const MachineBasicBlock *BB)
void set_intersect(S1Ty &S1, const S2Ty &S2)
set_intersect(A, B) - Compute A := A ^ B Identical to set_intersection, except that it works on set<>...
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
LLVM_ABI bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
LLVM_ABI void salvageDebugInfo(const MachineRegisterInfo &MRI, MachineInstr &MI)
Assuming the instruction MI is going to be deleted, attempt to salvage debug users of MI by writing the effect of MI into a DIExpression.
scope_exit(Callable) -> scope_exit< Callable >
constexpr from_range_t from_range
LLVM_ABI std::pair< Intrinsic::ID, bool > canConvertToMinOrMaxIntrinsic(ArrayRef< Value * > VL)
Check if the values in VL are select instructions that can be converted to a min or max (vector) intrinsic.
auto dyn_cast_if_present(const Y &Val)
dyn_cast_if_present<X> - Functionally identical to dyn_cast, except that a null (or none in the case of optionals) value is accepted.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
bool set_is_subset(const S1Ty &S1, const S2Ty &S2)
set_is_subset(A, B) - Return true iff every element of A is also in B.
void interleaveComma(const Container &c, StreamT &os, UnaryFunctor each_fn)
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting iteration.
auto cast_or_null(const Y &Val)
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value and is Skew mod Align.
iterator_range< po_iterator< T > > post_order(const T &G)
LLVM_ABI bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr, bool UseVariableInfo=true, bool IgnoreUBImplyingAttrs=true)
Return true if the instruction does not have any effects besides calculating the result and does not have undefined behavior.
LLVM_ABI Instruction * propagateMetadata(Instruction *I, ArrayRef< Value * > VL)
Specifically, let Kinds = [MD_tbaa, MD_alias_scope, MD_noalias, MD_fpmath, MD_nontemporal, MD_access_group].
bool isa_and_nonnull(const Y &Val)
auto binary_search(R &&Range, T &&Value)
Provide wrappers to std::binary_search which take ranges instead of having to pass begin/end explicitly.
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
constexpr auto equal_to(T &&Arg)
Functor variant of std::equal_to that can be used as a UnaryPredicate in functional algorithms like all_of.
bool isGather(IntrinsicInst *IntInst)
const Value * getPointerOperand(const Value *V)
A helper function that returns the pointer operand of a load, store or GEP instruction.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
LLVM_ABI bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &SQ, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
DomTreeNodeBase< BasicBlock > DomTreeNode
auto dyn_cast_or_null(const Y &Val)
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container:
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
constexpr bool has_single_bit(T Value) noexcept
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
LLVM_ABI bool isInstructionTriviallyDead(Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction is not used, and the instruction will return.
LLVM_ABI llvm::SmallVector< int, 16 > createStrideMask(unsigned Start, unsigned Stride, unsigned VF)
Create a stride shuffle mask.
auto reverse(ContainerTy &&C)
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
LLVM_ABI llvm::SmallVector< int, 16 > createReplicatedMask(unsigned ReplicationFactor, unsigned VF)
Create a mask with replicated elements.
auto find_if_not(R &&Range, UnaryPredicate P)
LLVM_ABI bool isSafeToLoadUnconditionally(Value *V, Align Alignment, const APInt &Size, const DataLayout &DL, Instruction *ScanFrom, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr)
Return true if we know that executing a load from this value cannot trap.
bool isa_and_present(const Y &Val)
isa_and_present<X> - Functionally identical to isa, except that a null value is accepted.
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
bool isPointerTy(const Type *T)
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
auto make_first_range(ContainerTy &&c)
Given a container of pairs, return a range over the first elements.
LLVM_ABI std::optional< int64_t > getPointersDiff(Type *ElemTyA, Value *PtrA, Type *ElemTyB, Value *PtrB, const DataLayout &DL, ScalarEvolution &SE, bool StrictCheck=false, bool CheckType=true)
Returns the distance between the pointers PtrA and PtrB iff they are compatible and it is possible to compute the distance between them.
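A hedged sketch of the consecutive-access check this enables (EltTy, PtrA, PtrB, DL and SE are assumed to exist in the surrounding code):

  std::optional<int64_t> Diff = getPointersDiff(EltTy, PtrA, EltTy, PtrB, DL, SE);
  if (Diff && *Diff == 1) {
    // PtrB points exactly one element past PtrA: a consecutive pair.
  }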
LLVM_ABI bool wouldInstructionBeTriviallyDead(const Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction would have no side effects if it was not used.
bool isModOrRefSet(const ModRefInfo MRI)
bool is_sorted(R &&Range, Compare C)
Wrapper function around std::is_sorted to check if elements in a range R are sorted with respect to a comparator C.
LLVM_ABI bool sortPtrAccesses(ArrayRef< Value * > VL, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
Attempt to sort the pointers in VL and return the sorted indices in SortedIndices, if reordering is required.
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference sizeof(SmallVector<T, 0>).
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type arguments.
LLVM_ABI void propagateIRFlags(Value *I, ArrayRef< Value * > VL, Value *OpValue=nullptr, bool IncludeWrapFlags=true)
Get the intersection (logical and) of all of the potential IR flags of each scalar operation (VL) that is about to be vectorized.
LLVM_ATTRIBUTE_VISIBILITY_DEFAULT AnalysisKey InnerAnalysisManagerProxy< AnalysisManagerT, IRUnitT, ExtraArgTs... >::Key
MutableArrayRef(T &OneElt) -> MutableArrayRef< T >
constexpr int PoisonMaskElem
iterator_range(Container &&) -> iterator_range< llvm::detail::IterOfRange< Container > >
@ Ref
The access may reference the value stored in memory.
@ LLVM_MARK_AS_BITMASK_ENUM
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
LLVM_ABI CmpInst::Predicate getMinMaxReductionPredicate(RecurKind RK)
Returns the comparison predicate used when expanding a min/max reduction.
RecurKind
These are the kinds of recurrences that we support.
@ Or
Bitwise or logical OR of integers.
LLVM_ABI bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic has a scalar operand.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the given range.
DWARFExpression::Operation Op
auto max_element(R &&Range)
Provide wrappers to std::max_element which take ranges instead of having to pass begin/end explicitly.
void ViewGraph(const GraphType &G, const Twine &Name, bool ShortNames=false, const Twine &Title="", GraphProgram::Name Program=GraphProgram::DOT)
ViewGraph - Emit a dot graph, run 'dot', run gv on the postscript file, then cleanup.
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI unsigned ComputeNumSignBits(const Value *Op, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Return the number of times the sign bit of the register is replicated into the other bits.
OutputIt copy(R &&Range, OutputIt Out)
auto make_second_range(ContainerTy &&c)
Given a container of pairs, return a range over the second elements.
constexpr unsigned BitWidth
LLVM_ABI bool isGuaranteedToTransferExecutionToSuccessor(const Instruction *I)
Return true if this function can prove that the instruction I will always transfer execution to one of its successors.
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given predicate occurs in a range.
auto pred_begin(const MachineBasicBlock *BB)
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
template class LLVM_TEMPLATE_ABI DomTreeNodeBase< BasicBlock >
bool equal(L &&LRange, R &&RRange)
Wrapper function around std::equal to detect if pair-wise elements between two ranges are the same.
LLVM_ABI bool isGuaranteedNotToBePoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Returns true if V cannot be poison, but may be undef.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
LLVM_ABI const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=MaxLookupSearchDepth)
This method strips off any GEP address adjustments, pointer casts, or llvm.threadlocal.address from the specified value, returning the original object being addressed.
LLVM_ABI Constant * ConstantFoldIntegerCast(Constant *C, Type *DestTy, bool IsSigned, const DataLayout &DL)
Constant fold a zext, sext or trunc, depending on IsSigned and whether the DestTy is wider or narrower than the type of C.
LLVM_ABI bool isKnownNonNegative(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Returns true if the given value is known to be non-negative.
LLVM_ABI bool mayHaveNonDefUseDependency(const Instruction &I)
Returns true if the result or effects of the given instruction I depend on values not reachable through the def-use graph.
LLVM_ABI bool isTriviallyVectorizable(Intrinsic::ID ID)
Identify if the intrinsic is trivially vectorizable.
constexpr detail::IsaCheckPredicate< Types... > IsaPred
Function object wrapper for the llvm::isa type check.
LLVM_ABI bool isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic is overloaded on the type of the operand at index OpdIdx.
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Used to keep track of an operand bundle.
static LLVM_ABI void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
BoUpSLP::TreeEntry TreeEntry
std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R)
static std::string getNodeAttributes(const TreeEntry *Entry, const BoUpSLP *)
DOTGraphTraits(bool IsSimple=false)
DOTGraphTraits - Template class that can be specialized to customize how graphs are converted to 'dot' graphs.
DefaultDOTGraphTraits(bool simple=false)
DenseMapInfo< BoUpSLP::TreeEntry * > FirstInfo
static bool isEqual(const BoUpSLP::EdgeInfo &LHS, const BoUpSLP::EdgeInfo &RHS)
static BoUpSLP::EdgeInfo getEmptyKey()
DenseMapInfo< unsigned > SecondInfo
static unsigned getHashValue(const BoUpSLP::EdgeInfo &Val)
static BoUpSLP::EdgeInfo getTombstoneKey()
An information struct used to provide DenseMap with the various necessary components for a given value type.
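The EdgeInfo specialization above follows the standard recipe: derive the empty and tombstone keys and the hash from the member infos. A generic sketch of that pattern (LLVM already ships a DenseMapInfo for std::pair; this is purely illustrative):

  using FirstInfo = DenseMapInfo<void *>;
  using SecondInfo = DenseMapInfo<unsigned>;
  struct PairInfo {
    static std::pair<void *, unsigned> getEmptyKey() {
      return {FirstInfo::getEmptyKey(), SecondInfo::getEmptyKey()};
    }
    static std::pair<void *, unsigned> getTombstoneKey() {
      return {FirstInfo::getTombstoneKey(), SecondInfo::getTombstoneKey()};
    }
    static unsigned getHashValue(const std::pair<void *, unsigned> &P) {
      return detail::combineHashValue(FirstInfo::getHashValue(P.first),
                                      SecondInfo::getHashValue(P.second));
    }
    static bool isEqual(const std::pair<void *, unsigned> &L,
                        const std::pair<void *, unsigned> &R) {
      return L == R;
    }
  };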
Add the VectorizableTree to the index iterator to be able to return TreeEntry pointers.
ChildIteratorType(SmallVector< BoUpSLP::EdgeInfo, 1 >::iterator W, ContainerTy &VT)
ContainerTy & VectorizableTree
static ChildIteratorType child_end(NodeRef N)
static NodeRef getEntryNode(BoUpSLP &R)
static ChildIteratorType child_begin(NodeRef N)
static nodes_iterator nodes_begin(BoUpSLP *R)
TreeEntry * NodeRef
NodeRef has to be a pointer per the GraphWriter.
static unsigned size(BoUpSLP *R)
BoUpSLP::TreeEntry TreeEntry
static nodes_iterator nodes_end(BoUpSLP *R)
BoUpSLP::TreeEntry::VecTreeTy ContainerTy
Incoming for lane mask phi as machine instruction, incoming register Reg and incoming block Block are...
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
TargetTransformInfo * TTI
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
bool runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_, TargetLibraryInfo *TLI_, AAResults *AA_, LoopInfo *LI_, DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_, OptimizationRemarkEmitter *ORE_)
A MapVector that performs no allocations if smaller than a certain size.
static VFShape get(const FunctionType *FTy, ElementCount EC, bool HasGlobalPred)
Retrieve the basic vectorization shape of the function, where all parameters are mapped to VFParamKind::Vector.
Function object to check whether the first component of a container supported by std::get (like std::pair, std::tuple) compares less than the first component of another container.
Function object to check whether the second component of a container supported by std::get (like std::pair, std::tuple) compares less than the second component of another container.
This structure holds any data we need about the edges being traversed during buildTreeRec().
bool operator==(const EdgeInfo &Other) const
EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
unsigned EdgeIdx
The operand index of the use.
void dump(raw_ostream &OS) const
Debug print.
LLVM_DUMP_METHOD void dump() const
TreeEntry * UserTE
The user TreeEntry.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::EdgeInfo &EI)