#ifdef EXPENSIVE_CHECKS
using namespace std::placeholders;

#define SV_NAME "slp-vectorizer"
#define DEBUG_TYPE "SLP"

STATISTIC(NumVectorInstructions, "Number of vector instructions generated");

              "Controls which SLP graphs should be vectorized.");

    cl::desc("Run the SLP vectorization passes"));

    cl::desc("Enable vectorization for wider vector utilization"));

    cl::desc("Only vectorize if you gain more than this "

    cl::desc("When true, SLP vectorizer bypasses profitability checks based on "
             "heuristics and makes vectorization decision via cost modeling."));

    cl::desc("Attempt to vectorize horizontal reductions"));

        "Attempt to vectorize horizontal reductions feeding into a store"));

    cl::desc("Improve the code quality by splitting alternate instructions"));

    cl::desc("Attempt to vectorize for this register size in bits"));

    cl::desc("Maximum SLP vectorization factor (0=unlimited)"));

    cl::desc("Limit the size of the SLP scheduling region per block"));

    cl::desc("Attempt to vectorize for this register size in bits"));

    cl::desc("Limit the recursion depth when building a vectorizable tree"));

    cl::desc("Only vectorize small trees if they are fully vectorizable"));

    cl::desc("The maximum look-ahead depth for operand reordering scores"));

    cl::desc("The maximum look-ahead depth for searching best rooting option"));

    cl::desc("The minimum number of loads, which should be considered strided, "
             "if the stride is > 1 or is runtime value"));

    cl::desc("The maximum stride, considered to be profitable."));

    cl::desc("Disable tree reordering even if it is "
             "profitable. Used for testing only."));

    cl::desc("Generate strided loads even if they are not "
             "profitable. Used for testing only."));

    cl::desc("Display the SLP trees with Graphviz"));

    cl::desc("Try to vectorize with non-power-of-2 number of elements."));

    cl::desc("Try to replace values with the idempotent instructions for "
             "better vectorization."));
  Ty = Ty->getScalarType();
         !Ty->isPPC_FP128Ty();

    return SI->getValueOperand()->getType();
    return CI->getOperand(0)->getType();
    return IE->getOperand(1)->getType();

         "ScalableVectorType is not supported.");
  return VecTy->getNumElements();

                                             Type *Ty, unsigned Sz) {
  if (NumParts == 0 || NumParts >= Sz)
  if (NumParts == 0 || NumParts >= Sz)
  return (Sz / RegVF) * RegVF;

                           I * VecTyNumElements, VecTyNumElements)))
                  : Mask[I] * VecTyNumElements + J;

  unsigned SVNumElements =
  unsigned ShuffleMaskSize = SV->getShuffleMask().size();
  if (SVNumElements % ShuffleMaskSize != 0)
  unsigned GroupSize = SVNumElements / ShuffleMaskSize;
  if (GroupSize == 0 || (VL.size() % GroupSize) != 0)
  unsigned NumGroup = 0;
  for (size_t I = 0, E = VL.size(); I != E; I += GroupSize) {
    Value *Src = SV->getOperand(0);
      if (SV->getOperand(0) != Src)
      if (!SV->isExtractSubvectorMask(Index))
      ExpectedIndex.set(Index / ShuffleMaskSize);
  if (!ExpectedIndex.all())
  assert(NumGroup == (VL.size() / GroupSize) && "Unexpected number of groups");

  unsigned SVNumElements =
  unsigned AccumulateLength = 0;
  for (Value *V : VL) {
    for (int M : SV->getShuffleMask())
                                 : AccumulateLength + M);
    AccumulateLength += SVNumElements;

  return std::min<unsigned>(PartNumElems, Size - Part * PartNumElems);

  OS << "Idx: " << Idx << ", ";
  OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]";

    if (BB != II->getParent())

  Value *FirstNonUndef = nullptr;
  for (Value *V : VL) {
    if (!FirstNonUndef) {
    if (V != FirstNonUndef)
  return FirstNonUndef != nullptr;
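// The loop above implements a "splat modulo undef" check: scan VL for the
// first non-undef value and require every other non-undef element to equal
// it. A minimal standalone sketch of the same logic (illustrative only,
// using plain pointers in place of llvm::Value, with null standing in for
// undef):
//
//   template <typename T>
//   bool isSplatIgnoringNulls(const std::vector<T *> &VL) {
//     T *FirstNonNull = nullptr;
//     for (T *V : VL) {
//       if (!V)
//         continue; // like undef: compatible with anything
//       if (!FirstNonNull) {
//         FirstNonNull = V;
//         continue;
//       }
//       if (V != FirstNonNull)
//         return false;
//     }
//     return FirstNonNull != nullptr;
//   }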
                          bool IsCopyable = false) {
    return Cmp->isCommutative();
    return BO->isCommutative() ||
           (BO->getOpcode() == Instruction::Sub &&
              if (match(U.getUser(),
                        m_ICmp(Pred, m_Specific(U.get()), m_Zero())) &&
                  (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE))
              auto *I = dyn_cast<BinaryOperator>(U.get());
              return match(U.getUser(),
                           m_Intrinsic<Intrinsic::abs>(
                               m_Specific(U.get()), m_ConstantInt(Flag))) &&
                     ((!IsCopyable && I && !I->hasNoSignedWrap()) ||
           (BO->getOpcode() == Instruction::FSub &&
              return match(U.getUser(),
                           m_Intrinsic<Intrinsic::fabs>(m_Specific(U.get())));
  return I->isCommutative();

                                 bool IsCopyable = false) {
         "The instruction is not commutative.");
    switch (BO->getOpcode()) {
    case Instruction::Sub:
    case Instruction::FSub:
  return I->isCommutableOperand(Op);

    constexpr unsigned IntrinsicNumOperands = 2;
    return IntrinsicNumOperands;
  return I->getNumOperands();

  static_assert(std::is_same_v<T, InsertElementInst> ||
                    std::is_same_v<T, ExtractElementInst>,
    if (CI->getValue().uge(VT->getNumElements()))
    Index *= VT->getNumElements();
    Index += CI->getZExtValue();

  Type *CurrentType = IV->getType();
  for (unsigned I : IV->indices()) {
      Index *= ST->getNumElements();
      CurrentType = ST->getElementType(I);
      Index *= AT->getNumElements();
      CurrentType = AT->getElementType();

  return std::all_of(It, VL.end(), [&](Value *V) {
    if (auto *CI = dyn_cast<CmpInst>(V))
      return BasePred == CI->getPredicate();
    if (auto *I = dyn_cast<Instruction>(V))
      return I->getOpcode() == Opcode;
    return isa<PoisonValue>(V);

      if (MaskArg == UseMask::UndefsAsMask)
    if (MaskArg == UseMask::FirstArg && Value < VF)
      UseMask.reset(Value);
    else if (MaskArg == UseMask::SecondArg && Value >= VF)
      UseMask.reset(Value - VF);

template <bool IsPoisonOnly = false>
  using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
  if (!UseMask.empty()) {
      if (*Idx < UseMask.size() && !UseMask.test(*Idx))
    for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
      if (Constant *Elem = C->getAggregateElement(I))
            (UseMask.empty() || (I < UseMask.size() && !UseMask.test(I))))

static std::optional<TargetTransformInfo::ShuffleKind>
      std::accumulate(VL.begin(), VL.end(), 0u, [](unsigned S, Value *V) {
        auto *EI = dyn_cast<ExtractElementInst>(V);
        auto *VTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
        return std::max(S, VTy->getNumElements());

  Value *Vec1 = nullptr;
  Value *Vec2 = nullptr;
    Value *Vec = EE->getVectorOperand();
  ShuffleMode CommonShuffleMode = Unknown;
  for (unsigned I = 0, E = VL.size(); I < E; ++I) {
    auto *Vec = EI->getVectorOperand();
    if (Idx->getValue().uge(Size))
    unsigned IntIdx = Idx->getValue().getZExtValue();
    if (!Vec1 || Vec1 == Vec) {
    } else if (!Vec2 || Vec2 == Vec) {
    if (CommonShuffleMode == Permute)
    if (Mask[I] % Size != I) {
      CommonShuffleMode = Permute;
    CommonShuffleMode = Select;
  if (CommonShuffleMode == Select && Vec2)

  unsigned Opcode = E->getOpcode();
  assert((Opcode == Instruction::ExtractElement ||
          Opcode == Instruction::ExtractValue) &&
         "Expected extractelement or extractvalue instruction.");
  if (Opcode == Instruction::ExtractElement) {
      return CI->getZExtValue();
  if (EI->getNumIndices() != 1)
  return *EI->idx_begin();
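// The Select/Permute classification above distinguishes two shuffle shapes:
// a "select" keeps every element in its original lane (Mask[I] % Size == I)
// and merely chooses which of the two source vectors supplies it, while a
// "permute" moves elements across lanes. A simplified standalone sketch of
// that decision (illustrative only; negative entries stand in for
// PoisonMaskElem):
//
//   enum ShuffleMode { Unknown, Select, Permute };
//   ShuffleMode classifyMask(const std::vector<int> &Mask, int Size) {
//     ShuffleMode Mode = Unknown;
//     for (int I = 0, E = Mask.size(); I < E; ++I) {
//       if (Mask[I] < 0)
//         continue;               // undef/poison lane, ignore
//       if (Mask[I] % Size != I)
//         return Permute;         // element changed lanes
//       Mode = Select;            // in-lane pick from one of two sources
//     }
//     return Mode;
//   }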
class BinOpSameOpcodeHelper {
  using MaskType = std::uint_fast16_t;
  constexpr static std::initializer_list<unsigned> SupportedOp = {
      Instruction::Add, Instruction::Sub, Instruction::Mul, Instruction::Shl,
      Instruction::AShr, Instruction::And, Instruction::Or, Instruction::Xor};
    MainOpBIT = 0b100000000,
  static std::pair<ConstantInt *, unsigned>
  isBinOpWithConstantInt(const Instruction *I) {
    unsigned Opcode = I->getOpcode();
    if (Opcode == Instruction::Sub || Opcode == Instruction::Shl ||
        Opcode == Instruction::AShr)
  struct InterchangeableInfo {
    MaskType Mask = MainOpBIT | XorBIT | OrBIT | AndBIT | SubBIT | AddBIT |
                    MulBIT | AShrBIT | ShlBIT;
    MaskType SeenBefore = 0;
    InterchangeableInfo(const Instruction *I) : I(I) {}
    bool trySet(MaskType OpcodeInMaskForm, MaskType InterchangeableMask) {
      if (Mask & InterchangeableMask) {
        SeenBefore |= OpcodeInMaskForm;
        Mask &= InterchangeableMask;
    bool equal(unsigned Opcode) {
      return Opcode == I->getOpcode() && trySet(MainOpBIT, MainOpBIT);
      MaskType Candidate = Mask & SeenBefore;
      if (Candidate & MainOpBIT)
        return I->getOpcode();
      if (Candidate & ShlBIT)
        return Instruction::Shl;
      if (Candidate & AShrBIT)
        return Instruction::AShr;
      if (Candidate & MulBIT)
        return Instruction::Mul;
      if (Candidate & AddBIT)
        return Instruction::Add;
      if (Candidate & SubBIT)
        return Instruction::Sub;
      if (Candidate & AndBIT)
        return Instruction::And;
      if (Candidate & OrBIT)
        return Instruction::Or;
      if (Candidate & XorBIT)
        return Instruction::Xor;
    bool hasCandidateOpcode(unsigned Opcode) const {
      MaskType Candidate = Mask & SeenBefore;
      case Instruction::Shl:
        return Candidate & ShlBIT;
      case Instruction::AShr:
        return Candidate & AShrBIT;
      case Instruction::Mul:
        return Candidate & MulBIT;
      case Instruction::Add:
        return Candidate & AddBIT;
      case Instruction::Sub:
        return Candidate & SubBIT;
      case Instruction::And:
        return Candidate & AndBIT;
      case Instruction::Or:
        return Candidate & OrBIT;
      case Instruction::Xor:
        return Candidate & XorBIT;
      case Instruction::LShr:
      case Instruction::FAdd:
      case Instruction::FSub:
      case Instruction::FMul:
      case Instruction::SDiv:
      case Instruction::UDiv:
      case Instruction::FDiv:
      case Instruction::SRem:
      case Instruction::URem:
      case Instruction::FRem:

      unsigned FromOpcode = I->getOpcode();
      if (FromOpcode == ToOpcode)
      auto [CI, Pos] = isBinOpWithConstantInt(I);
      const APInt &FromCIValue = CI->getValue();
      unsigned FromCIValueBitWidth = FromCIValue.getBitWidth();
      switch (FromOpcode) {
      case Instruction::Shl:
        if (ToOpcode == Instruction::Mul) {
          assert(FromCIValue.isZero() && "Cannot convert the instruction.");
          ToCIValue = ToOpcode == Instruction::And
                          : APInt::getZero(FromCIValueBitWidth);
      case Instruction::Mul:
        if (ToOpcode == Instruction::Shl) {
          ToCIValue = APInt(FromCIValueBitWidth, FromCIValue.logBase2());
          assert(FromCIValue.isOne() && "Cannot convert the instruction.");
          ToCIValue = ToOpcode == Instruction::And
                          : APInt::getZero(FromCIValueBitWidth);
      case Instruction::Add:
      case Instruction::Sub:
        if (FromCIValue.isZero()) {
                 "Cannot convert the instruction.");
          ToCIValue = FromCIValue;
      case Instruction::And:
          ToCIValue = ToOpcode == Instruction::Mul
                          : APInt::getZero(FromCIValueBitWidth);
        assert(FromCIValue.isZero() && "Cannot convert the instruction.");
      Value *LHS = I->getOperand(1 - Pos);
          ConstantInt::get(I->getOperand(Pos)->getType(), ToCIValue);
          ((FromOpcode == Instruction::Add || FromOpcode == Instruction::Or ||
            FromOpcode == Instruction::Xor) &&
           ToOpcode == Instruction::Sub))

  InterchangeableInfo MainOp;
  InterchangeableInfo AltOp;
    return ::isValidForAlternation(MainOp.I->getOpcode()) &&
  bool initializeAltOp(const Instruction *I) {

  BinOpSameOpcodeHelper(const Instruction *MainOp,
                        const Instruction *AltOp = nullptr)
      : MainOp(MainOp), AltOp(AltOp) {
  bool add(const Instruction *I) {
           "BinOpSameOpcodeHelper only accepts BinaryOperator.");
    unsigned Opcode = I->getOpcode();
    MaskType OpcodeInMaskForm;
    case Instruction::Shl:
      OpcodeInMaskForm = ShlBIT;
    case Instruction::AShr:
      OpcodeInMaskForm = AShrBIT;
    case Instruction::Mul:
      OpcodeInMaskForm = MulBIT;
    case Instruction::Add:
      OpcodeInMaskForm = AddBIT;
    case Instruction::Sub:
      OpcodeInMaskForm = SubBIT;
    case Instruction::And:
      OpcodeInMaskForm = AndBIT;
    case Instruction::Or:
      OpcodeInMaskForm = OrBIT;
    case Instruction::Xor:
      OpcodeInMaskForm = XorBIT;
      return MainOp.equal(Opcode) ||
             (initializeAltOp(I) && AltOp.equal(Opcode));
    MaskType InterchangeableMask = OpcodeInMaskForm;
    ConstantInt *CI = isBinOpWithConstantInt(I).first;
      constexpr MaskType CanBeAll =
          XorBIT | OrBIT | AndBIT | SubBIT | AddBIT | MulBIT | AShrBIT | ShlBIT;
      const APInt &CIValue = CI->getValue();
      case Instruction::Shl:
        InterchangeableMask = CIValue.isZero() ? CanBeAll : MulBIT | ShlBIT;
      case Instruction::Mul:
        if (CIValue.isOne()) {
          InterchangeableMask = CanBeAll;
        InterchangeableMask = MulBIT | ShlBIT;
      case Instruction::Add:
      case Instruction::Sub:
        InterchangeableMask = CIValue.isZero() ? CanBeAll : SubBIT | AddBIT;
      case Instruction::And:
        InterchangeableMask = CanBeAll;
      case Instruction::Xor:
        InterchangeableMask = XorBIT | OrBIT | SubBIT | AddBIT;
        InterchangeableMask = CanBeAll;
    return MainOp.trySet(OpcodeInMaskForm, InterchangeableMask) ||
           (initializeAltOp(I) &&
            AltOp.trySet(OpcodeInMaskForm, InterchangeableMask));
  unsigned getMainOpcode() const { return MainOp.getOpcode(); }
  bool hasCandidateOpcode(unsigned Opcode) const {
    return MainOp.hasCandidateOpcode(Opcode);
  bool hasAltOp() const { return AltOp.I; }
  unsigned getAltOpcode() const {
    return hasAltOp() ? AltOp.getOpcode() : getMainOpcode();
    return MainOp.getOperand(I);
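// BinOpSameOpcodeHelper's key observation: many integer binops with a
// constant operand are interchangeable, e.g. "X << 0", "X * 1", "X + 0",
// "X | 0", "X ^ 0" and "X - 0" all compute X, so a lane holding any of them
// can be re-expressed with whichever opcode the rest of the bundle uses
// (rewriting the constant accordingly, e.g. "X << 1" becomes "X * 2").
// Each lane keeps a bitmask of the opcodes it could become; intersecting
// those masks across lanes yields a common opcode, as in this reduced
// sketch (illustrative only, with a made-up two-opcode mask):
//
//   enum : unsigned { ShlBit = 1, MulBit = 2 };
//   // "X << C" can also be written as "X * (1 << C)".
//   unsigned candidates(bool IsShl) {
//     return IsShl ? (ShlBit | MulBit) : MulBit;
//   }
//   unsigned commonOpcodes(const std::vector<bool> &LaneIsShl) {
//     unsigned Mask = ShlBit | MulBit;
//     for (bool IsShl : LaneIsShl)
//       Mask &= candidates(IsShl);
//     return Mask; // non-zero => the whole bundle can share one opcode
//   }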
class InstructionsState {
  bool HasCopyables = false;
    assert(valid() && "InstructionsState is invalid.");
    assert(valid() && "InstructionsState is invalid.");
  unsigned getOpcode() const { return getMainOp()->getOpcode(); }
  unsigned getAltOpcode() const { return getAltOp()->getOpcode(); }
  bool isAltShuffle() const { return getMainOp() != getAltOp(); }
  Instruction *getMatchingMainOpOrAltOp(Instruction *I) const {
    assert(MainOp && "MainOp cannot be nullptr.");
    if (I->getOpcode() == MainOp->getOpcode())
    assert(AltOp && "AltOp cannot be nullptr.");
    if (I->getOpcode() == AltOp->getOpcode())
    if (!I->isBinaryOp())
    BinOpSameOpcodeHelper Converter(MainOp);
    if (isAltShuffle() && !Converter.hasCandidateOpcode(MainOp->getOpcode())) {
      BinOpSameOpcodeHelper AltConverter(AltOp);
      if (AltConverter.add(I) && AltConverter.add(AltOp) &&
          AltConverter.hasCandidateOpcode(AltOp->getOpcode()))
    if (Converter.hasAltOp() && !isAltShuffle())
    return Converter.hasAltOp() ? AltOp : MainOp;
  bool isShiftOp() const {
    return getMainOp()->isShift() && getAltOp()->isShift();
    return getMainOp()->isBitwiseLogicOp() && getAltOp()->isBitwiseLogicOp();
  bool isMulDivLikeOp() const {
    constexpr std::array<unsigned, 8> MulDiv = {
        Instruction::Mul, Instruction::FMul, Instruction::SDiv,
        Instruction::UDiv, Instruction::FDiv, Instruction::SRem,
        Instruction::URem, Instruction::FRem};
  bool isAddSubLikeOp() const {
    constexpr std::array<unsigned, 4> AddSub = {
        Instruction::Add, Instruction::Sub, Instruction::FAdd,
  bool isCmpOp() const {
    return (getOpcode() == Instruction::ICmp ||
  bool valid() const { return MainOp && AltOp; }
  explicit operator bool() const { return valid(); }
  InstructionsState() = delete;
  InstructionsState(Instruction *MainOp, Instruction *AltOp,
                    bool HasCopyables = false)
      : MainOp(MainOp), AltOp(AltOp), HasCopyables(HasCopyables) {}
  static InstructionsState invalid() { return {nullptr, nullptr}; }
  bool isCopyableElement(Value *V) const {
    assert(valid() && "InstructionsState is invalid.");
    if (isAltShuffle() || getOpcode() == Instruction::GetElementPtr)
    if (I->getParent() != MainOp->getParent() &&
    if (I->getOpcode() == MainOp->getOpcode())
    if (!I->isBinaryOp())
    BinOpSameOpcodeHelper Converter(MainOp);
  bool isNonSchedulable(Value *V) const {
    assert(valid() && "InstructionsState is invalid.");
    if (getMainOp() == V)
    if (isCopyableElement(V)) {
      auto IsNonSchedulableCopyableElement = [this](Value *V) {
        return !I || isa<PHINode>(I) ||
               I->getParent() != MainOp->getParent() ||
                !MainOp->comesBefore(I));
      return IsNonSchedulableCopyableElement(V);
  bool areInstructionsWithCopyableElements() const {
    assert(valid() && "InstructionsState is invalid.");
    return HasCopyables;
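// InstructionsState summarizes a bundle with up to two opcodes: MainOp for
// the dominant instruction and AltOp for the alternate one, so a bundle
// like {a + b, c - d, e + f, g - h} vectorizes as one add vector, one sub
// vector and a blending shufflevector. When MainOp == AltOp the bundle is
// uniform and isAltShuffle() is false. Typical use (sketch; assumes the
// getSameOpcode analysis below as the entry point):
//
//   InstructionsState S = getSameOpcode(VL, TLI);
//   if (!S)                  // operator bool(): state is invalid
//     return;                // bundle cannot form a single tree node
//   if (S.isAltShuffle())
//     /* emit MainOp and AltOp vectors, then a select-like shuffle */;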
std::pair<Instruction *, SmallVector<Value *>>
  Instruction *SelectedOp = S.getMatchingMainOpOrAltOp(I);
  assert(SelectedOp && "Cannot convert the instruction.");
  if (I->isBinaryOp()) {
    return std::make_pair(SelectedOp, Converter.getOperand(SelectedOp));

  for (Value *V : VL) {
      if (Inst->getOpcode() == Opcode)

         BaseOp0 == Op0 || BaseOp1 == Op1 ||
         "Assessing comparisons of different types?");
  return (BasePred == Pred &&
         (BasePred == SwappedPred &&

    return InstructionsState::invalid();
    return InstructionsState::invalid();
      (VL.size() == 2 && InstCnt < 2))
    return InstructionsState::invalid();
  unsigned AltOpcode = Opcode;
  BinOpSameOpcodeHelper BinOpHelper(MainOp);
  bool SwappedPredsCompatible = IsCmpOp && [&]() {
    UniquePreds.insert(BasePred);
    UniqueNonSwappedPreds.insert(BasePred);
    for (Value *V : VL) {
      UniqueNonSwappedPreds.insert(CurrentPred);
      if (!UniquePreds.contains(CurrentPred) &&
          !UniquePreds.contains(SwappedCurrentPred))
        UniquePreds.insert(CurrentPred);
    return UniqueNonSwappedPreds.size() > 2 && UniquePreds.size() == 2;
    return InstructionsState::invalid();
  bool AnyPoison = InstCnt != VL.size();
    if (AnyPoison && (I->isIntDivRem() || I->isFPDivRem() || isa<CallInst>(I)))
      return InstructionsState::invalid();
    unsigned InstOpcode = I->getOpcode();
      if (BinOpHelper.add(I))
      Value *Op1 = I->getOperand(0);
      if (InstOpcode == Opcode || InstOpcode == AltOpcode)
      if (Opcode == AltOpcode) {
               "Cast isn't safe for alternation, logic needs to be updated!");
        AltOpcode = InstOpcode;
      Type *Ty0 = BaseInst->getOperand(0)->getType();
      Type *Ty1 = Inst->getOperand(0)->getType();
        assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
        assert(InstOpcode == AltOpcode &&
               "Alternate instructions are only supported by BinaryOperator "
        if ((VL.size() == 2 || SwappedPredsCompatible) &&
            (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
        if (MainOp != AltOp) {
        } else if (BasePred != CurrentPred) {
                 "CmpInst isn't safe for alternation, logic needs to be updated!");
        if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
            AltPred == CurrentPred || AltPred == SwappedCurrentPred)
    } else if (InstOpcode == Opcode) {
      assert(InstOpcode == AltOpcode &&
             "Alternate instructions are only supported by BinaryOperator and "
        if (Gep->getNumOperands() != 2 ||
          return InstructionsState::invalid();
          return InstructionsState::invalid();
        if (!LI->isSimple() || !BaseLI->isSimple())
          return InstructionsState::invalid();
          return InstructionsState::invalid();
        if (Call->hasOperandBundles() &&
            !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
                        Call->op_begin() + Call->getBundleOperandsEndIndex(),
          return InstructionsState::invalid();
          return InstructionsState::invalid();
          if (Mappings.size() != BaseMappings.size() ||
              Mappings.front().ISA != BaseMappings.front().ISA ||
              Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
              Mappings.front().VectorName != BaseMappings.front().VectorName ||
              Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
              Mappings.front().Shape.Parameters !=
                  BaseMappings.front().Shape.Parameters)
            return InstructionsState::invalid();
    return InstructionsState::invalid();
  assert(MainOp && "Cannot find MainOp with Opcode from BinOpHelper.");
  assert(AltOp && "Cannot find AltOp with Opcode from BinOpHelper.");
         "Incorrect implementation of allSameOpcode.");
  InstructionsState S(MainOp, AltOp);
         "Invalid InstructionsState.");
  return all_of(VL, [&](Value *V) { return V->getType() == Ty; });

  unsigned Opcode = UserInst->getOpcode();
  case Instruction::Load: {
  case Instruction::Store: {
    return (SI->getPointerOperand() == Scalar);
  case Instruction::Call: {
      return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index(), TTI) &&
             Arg.value().get() == Scalar;

    return LI->isSimple();
    return SI->isSimple();
    return !MI->isVolatile();

                    bool ExtendingManyInputs = false) {
  if (SubMask.empty())
         (!ExtendingManyInputs || SubMask.size() > Mask.size() ||
         "SubMask with many inputs support must be larger than the mask.");
    Mask.append(SubMask.begin(), SubMask.end());
  int TermValue = std::min(Mask.size(), SubMask.size());
  for (int I = 0, E = SubMask.size(); I < E; ++I) {
        (!ExtendingManyInputs &&
         (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)))
    NewMask[I] = Mask[SubMask[I]];

  const size_t Sz = Order.size();
  for (unsigned I = 0; I < Sz; ++I) {
      UnusedIndices.reset(Order[I]);
      MaskedIndices.set(I);
  if (MaskedIndices.none())
         "Non-synced masked/available indices.");
    assert(Idx >= 0 && "Indices must be synced.");

                               unsigned Opcode0, unsigned Opcode1) {
      OpcodeMask.set(Lane * ScalarTyNumElements,
                     Lane * ScalarTyNumElements + ScalarTyNumElements);

         "Expected scalar constants.");
    std::fill_n(NewVal.begin() + I * VF, VF, V);

  const unsigned E = Indices.size();
  for (unsigned I = 0; I < E; ++I)
    Mask[Indices[I]] = I;

  assert(!Mask.empty() && "Expected non-empty mask.");
  for (unsigned I = 0, E = Prev.size(); I < E; ++I)
    Scalars[Mask[I]] = Prev[I];
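// The two helpers above convert between an ordering and its shuffle mask:
// given Indices where Indices[I] is the destination slot of element I, the
// mask satisfies Mask[Indices[I]] = I, i.e. the mask is the inverse
// permutation. Worked example (standalone sketch):
//
//   // Indices = {2, 0, 1}: element 0 goes to slot 2, 1 to slot 0, 2 to 1.
//   std::vector<int> inversePermutation(const std::vector<unsigned> &Indices) {
//     std::vector<int> Mask(Indices.size(), -1);
//     for (unsigned I = 0, E = Indices.size(); I < E; ++I)
//       Mask[Indices[I]] = I;
//     return Mask; // for {2, 0, 1} this yields {1, 2, 0}
//   }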
    auto *IO = dyn_cast<Instruction>(V);
    return isa<PHINode>(IO) || IO->getParent() != I->getParent();

  return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(UsesLimit) &&
           auto *IU = dyn_cast<Instruction>(U);
           return IU->getParent() != I->getParent() || isa<PHINode>(IU);

  return !VL.empty() &&
  return NumParts > 0 && NumParts < Sz && has_single_bit(Sz / NumParts) &&

    const unsigned Limit = std::numeric_limits<unsigned>::max()) {
  unsigned NumParts = TTI.getNumberOfParts(VecTy);
  if (NumParts == 0 || NumParts >= Limit)
  if (NumParts >= Sz || Sz % NumParts != 0 ||

  class ScheduleEntity;
  class ScheduleCopyableData;
  class ScheduleBundle;

  struct StridedPtrInfo {
    Value *StrideVal = nullptr;
    const SCEV *StrideSCEV = nullptr;

      : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li), DT(Dt),
        AC(AC), DB(DB), DL(DL), ORE(ORE),
    MinVecRegSize = TTI->getMinVectorRegisterBitWidth();

      ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales = {});

      const SmallDenseSet<Value *> &UserIgnoreLst);

    assert(!VectorizableTree.empty() && "No graph to get the first node from");
    return VectorizableTree.front()->Scalars;

    const TreeEntry &Root = *VectorizableTree.front();
    if (Root.State != TreeEntry::Vectorize || Root.isAltShuffle() ||
        !Root.Scalars.front()->getType()->isIntegerTy())
      return std::nullopt;
    auto It = MinBWs.find(&Root);
    if (It != MinBWs.end())
    if (Root.getOpcode() == Instruction::ZExt ||
        Root.getOpcode() == Instruction::SExt)
      return std::make_pair(cast<CastInst>(Root.getMainOp())->getSrcTy(),
                            Root.getOpcode() == Instruction::SExt);
    return std::nullopt;

    return MinBWs.at(VectorizableTree.front().get()).second;

    if (ReductionBitWidth == 0 ||
        !VectorizableTree.front()->Scalars.front()->getType()->isIntegerTy() ||
        ReductionBitWidth >=
            DL->getTypeSizeInBits(
                VectorizableTree.front()->Scalars.front()->getType()))
          VectorizableTree.front()->Scalars.front()->getType(),
          VectorizableTree.front()->getVectorFactor());
          VectorizableTree.front()->Scalars.front()->getContext(),
          VectorizableTree.front()->getVectorFactor());

    VectorizableTree.clear();
    ScalarToTreeEntries.clear();
    DeletedNodes.clear();
    TransformedToGatherNodes.clear();
    OperandsToTreeEntry.clear();
    ScalarsInSplitNodes.clear();
    NonScheduledFirst.clear();
    EntryToLastInstruction.clear();
    LastInstructionToPos.clear();
    LoadEntriesToVectorize.clear();
    IsGraphTransformMode = false;
    GatheredLoadsEntriesFirst.reset();
    CompressEntryToData.clear();
    ExternalUses.clear();
    ExternalUsesAsOriginalScalar.clear();
    ExternalUsesWithNonUsers.clear();
    for (auto &Iter : BlocksSchedules) {
      BlockScheduling *BS = Iter.second.get();
    ReductionBitWidth = 0;
    CastMaxMinBWSizes.reset();
    ExtraBitWidthNodes.clear();
    InstrElementSize.clear();
    UserIgnoreList = nullptr;
    PostponedGathers.clear();
    ValueToGatherNodes.clear();
    TreeEntryToStridedPtrInfoMap.clear();

    assert(!Order.empty() && "expected non-empty order");
    const unsigned Sz = Order.size();
      return P.value() == P.index() || P.value() == Sz;

                             bool IgnoreReorder);
  std::optional<OrdersType>

    return MaxVecRegSize;
    return MinVecRegSize;

    unsigned MaxVF = MaxVFOption.getNumOccurrences() ?
        MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode);
    return MaxVF ? MaxVF : UINT_MAX;

                     Align Alignment, const int64_t Diff,
                     const size_t Sz) const;
                     Value *Ptr0, Value *PtrN, StridedPtrInfo &SPtrInfo) const;
                     Align CommonAlignment,
                     StridedPtrInfo &SPtrInfo) const;
                     StridedPtrInfo &SPtrInfo, unsigned *BestVF = nullptr,
                     bool TryRecursiveCheck = true) const;

    ListOfKnonwnNonVectorizableLoads.insert(hash_value(VL));
  template <typename T>
    return ListOfKnonwnNonVectorizableLoads.contains(hash_value(VL));

    OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
       << " EdgeIdx:" << EdgeIdx << "}";

      : TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes),
        MaxLevel(MaxLevel) {}
    auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) {
      auto AllUsersVectorized = [U1, U2, this](Value *V) {
          return U == U1 || U == U2 || R.isVectorized(U);
      return AllUsersVectorized(V1) && AllUsersVectorized(V2);
    if (R.TTI->isLegalBroadcastLoad(V1->getType(),
        ((int)V1->getNumUses() == NumLanes ||
         AllUsersAreInternal(V1, V2)))
    auto CheckSameEntryOrFail = [&]() {
          any_of(TEs2, [&](TreeEntry *E) { return Set.contains(E); }))
      if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
        return CheckSameEntryOrFail();
          LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
          LI2->getPointerOperand(), DL, SE, true);
      if (!Dist || *Dist == 0) {
            R.TTI->isLegalMaskedGather(
          return CheckSameEntryOrFail();
      if (std::abs(*Dist) > NumLanes / 2)
      Value *EV2 = nullptr;
      int Dist = Idx2 - Idx1;
      if (std::abs(Dist) == 0)
      if (std::abs(Dist) > NumLanes / 2)
      return CheckSameEntryOrFail();
      if (I1->getParent() != I2->getParent())
        return CheckSameEntryOrFail();
          (S.getMainOp()->getNumOperands() <= 2 || !MainAltOps.empty() ||
           !S.isAltShuffle()) &&
              S.getMainOp()->getNumOperands();
      return CheckSameEntryOrFail();

    int ShallowScoreAtThisLevel =
    if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
        (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
         ShallowScoreAtThisLevel))
      return ShallowScoreAtThisLevel;
    assert(I1 && I2 && "Should have early exited.");
    for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
         OpIdx1 != NumOperands1; ++OpIdx1) {
      int MaxTmpScore = 0;
      unsigned MaxOpIdx2 = 0;
      bool FoundBest = false;
              ? I2->getNumOperands()
              : std::min(I2->getNumOperands(), OpIdx1 + 1);
      assert(FromIdx <= ToIdx && "Bad index");
      for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
        if (Op2Used.count(OpIdx2))
            I1, I2, CurrLevel + 1, {});
            TmpScore > MaxTmpScore) {
          MaxTmpScore = TmpScore;
        Op2Used.insert(MaxOpIdx2);
        ShallowScoreAtThisLevel += MaxTmpScore;
    return ShallowScoreAtThisLevel;
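// The look-ahead scoring above pairs each operand of I1 with the
// best-matching, not-yet-used operand of I2 and accumulates the recursive
// scores, bottoming out at MaxLevel. A reduced sketch of the greedy
// pairing step (illustrative only; score() stands in for the real cost
// model, and ints stand in for operand values; assumes <set> and <vector>):
//
//   int pairwiseScore(const std::vector<int> &Ops1,
//                     const std::vector<int> &Ops2,
//                     int (*score)(int, int)) {
//     int Total = 0;
//     std::set<size_t> Used;
//     for (int A : Ops1) {
//       int Best = 0; size_t BestIdx = 0; bool Found = false;
//       for (size_t J = 0; J < Ops2.size(); ++J) {
//         if (Used.count(J)) continue;        // each operand matched once
//         int S = score(A, Ops2[J]);
//         if (!Found || S > Best) { Best = S; BestIdx = J; Found = true; }
//       }
//       if (Found) { Used.insert(BestIdx); Total += Best; }
//     }
//     return Total;
//   }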
  struct OperandData {
    OperandData() = default;
    OperandData(Value *V, bool APO, bool IsUsed)
        : V(V), APO(APO), IsUsed(IsUsed) {}
    bool IsUsed = false;

  enum class ReorderingMode {
  unsigned ArgSize = 0;
  const Loop *L = nullptr;

  OperandData &getData(unsigned OpIdx, unsigned Lane) {
    return OpsVec[OpIdx][Lane];
  const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
    return OpsVec[OpIdx][Lane];

    for (unsigned OpIdx = 0, NumOperands = getNumOperands();
      for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
        OpsVec[OpIdx][Lane].IsUsed = false;

  void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
    std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);

  int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx,
    Value *IdxLaneV = getData(Idx, Lane).V;
    unsigned UniquesCount = Uniques.size();
    auto IdxIt = Uniques.find(IdxLaneV);
    unsigned UniquesCntWithIdxLaneV =
        IdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
    auto OpIdxIt = Uniques.find(OpIdxLaneV);
    unsigned UniquesCntWithOpIdxLaneV =
        OpIdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
    if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
    return std::min(bit_ceil(UniquesCntWithOpIdxLaneV) -
                        UniquesCntWithOpIdxLaneV,
                    UniquesCntWithOpIdxLaneV -
           ((IdxIt != Uniques.end() && UsedLanes.test(IdxIt->second))
                ? UniquesCntWithIdxLaneV - bit_floor(UniquesCntWithIdxLaneV)
                : bit_ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);

  int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
    Value *IdxLaneV = getData(Idx, Lane).V;
    return R.areAllUsersVectorized(IdxLaneI)

  static const int ScoreScaleFactor = 10;
             int Lane, unsigned OpIdx, unsigned Idx,
      int SplatScore = getSplatScore(Lane, OpIdx, Idx, UsedLanes);
      if (Score <= -SplatScore) {
        Score += SplatScore;
      Score *= ScoreScaleFactor;
      Score += getExternalUseScore(Lane, OpIdx, Idx);

  std::optional<unsigned>
  getBestOperand(unsigned OpIdx, int Lane, int LastLane,
    unsigned NumOperands = getNumOperands();
    Value *OpLastLane = getData(OpIdx, LastLane).V;
    ReorderingMode RMode = ReorderingModes[OpIdx];
    if (RMode == ReorderingMode::Failed)
      return std::nullopt;
    bool OpIdxAPO = getData(OpIdx, Lane).APO;
    std::optional<unsigned> Idx;
        BestScoresPerLanes.try_emplace(std::make_pair(OpIdx, Lane), 0)
    bool IsUsed = RMode == ReorderingMode::Splat ||
                  RMode == ReorderingMode::Constant ||
                  RMode == ReorderingMode::Load;
    for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
      OperandData &OpData = getData(Idx, Lane);
      bool OpAPO = OpData.APO;
      if (OpAPO != OpIdxAPO)
      case ReorderingMode::Load:
      case ReorderingMode::Opcode: {
        bool LeftToRight = Lane > LastLane;
        Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
        Value *OpRight = (LeftToRight) ? Op : OpLastLane;
        int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
                                      OpIdx, Idx, IsUsed, UsedLanes);
        if (Score > static_cast<int>(BestOp.Score) ||
            (Score > 0 && Score == static_cast<int>(BestOp.Score) &&
          BestOp.Score = Score;
          BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
      case ReorderingMode::Constant:
            (!BestOp.Score && L && L->isLoopInvariant(Op))) {
          BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
      case ReorderingMode::Splat:
        IsUsed = Op == OpLastLane;
        if (Op == OpLastLane) {
          BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
      case ReorderingMode::Failed:
      getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
    return std::nullopt;

  unsigned getBestLaneToStartReordering() const {
    unsigned Min = UINT_MAX;
    unsigned SameOpNumber = 0;
    for (int I = getNumLanes(); I > 0; --I) {
      unsigned Lane = I - 1;
      OperandsOrderData NumFreeOpsHash =
          getMaxNumOperandsThatCanBeReordered(Lane);
      if (NumFreeOpsHash.NumOfAPOs < Min) {
        Min = NumFreeOpsHash.NumOfAPOs;
        SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
        HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
      } else if (NumFreeOpsHash.NumOfAPOs == Min &&
                 NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
        SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
        HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
      } else if (NumFreeOpsHash.NumOfAPOs == Min &&
                 NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
        auto [It, Inserted] =
            HashMap.try_emplace(NumFreeOpsHash.Hash, 1, Lane);
    unsigned BestLane = 0;
    unsigned CntMin = UINT_MAX;
      if (Data.second.first < CntMin) {
        CntMin = Data.second.first;
        BestLane = Data.second.second;

  struct OperandsOrderData {
    unsigned NumOfAPOs = UINT_MAX;
    unsigned NumOpsWithSameOpcodeParent = 0;

  OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
    unsigned CntTrue = 0;
    unsigned NumOperands = getNumOperands();
    bool AllUndefs = true;
    unsigned NumOpsWithSameOpcodeParent = 0;
      const OperandData &OpData = getData(OpIdx, Lane);
          I->getParent() != Parent) {
        if (NumOpsWithSameOpcodeParent == 0) {
          NumOpsWithSameOpcodeParent = 1;
        Parent = I->getParent();
        --NumOpsWithSameOpcodeParent;
      ++NumOpsWithSameOpcodeParent;
    OperandsOrderData Data;
    Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
    Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
                      const InstructionsState &S) {
    return VL.size() == getNumLanes();
           "Expected same number of lanes");
    assert(S.valid() && "InstructionsState is invalid.");
    OpsVec.resize(ArgSize);
    unsigned NumLanes = VL.size();
    for (OperandDataVec &Ops : OpsVec)
      Ops.resize(NumLanes);
          OpsVec[OpIdx][Lane] = {Operands[OpIdx][Lane], true, false};
      bool IsInverseOperation = false;
      if (S.isCopyableElement(VL[Lane])) {
        IsInverseOperation =
        assert(I && "Expected instruction");
        auto [SelectedOp, Ops] = convertTo(I, S);
        bool APO = (OpIdx == 0) ? false : IsInverseOperation;
        OpsVec[OpIdx][Lane] = {Operands[OpIdx][Lane], APO, false};

  unsigned getNumOperands() const { return ArgSize; }
  unsigned getNumLanes() const { return OpsVec[0].size(); }
  Value *getValue(unsigned OpIdx, unsigned Lane) const {
    return getData(OpIdx, Lane).V;
  bool empty() const { return OpsVec.empty(); }
  void clear() { OpsVec.clear(); }

  bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
           "Op is expected to be getValue(OpIdx, Lane).");
    bool OpAPO = getData(OpIdx, Lane).APO;
    bool IsInvariant = L && L->isLoopInvariant(Op);
    for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
      bool FoundCandidate = false;
      for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
        OperandData &Data = getData(OpI, Ln);
        if (Data.APO != OpAPO || Data.IsUsed)
        Value *OpILane = getValue(OpI, Lane);
             L->isLoopInvariant(Data.V))) {
          FoundCandidate = true;
      if (!FoundCandidate)
    return getNumLanes() == 2 || Cnt > 1;

           "Op is expected to be getValue(OpIdx, Lane).");
    bool OpAPO = getData(OpIdx, Lane).APO;
    for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
        const OperandData &Data = getData(OpI, Ln);
        if (Data.APO != OpAPO || Data.IsUsed)
        Value *OpILn = getValue(OpI, Ln);
        return (L && L->isLoopInvariant(OpILn)) ||

              const InstructionsState &S, const BoUpSLP &R)
      : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R),
        L(R.LI->getLoopFor(S.getMainOp()->getParent())) {
    appendOperands(RootVL, Operands, S);

           "Expected same num of lanes across all operands");
    for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
      OpVL[Lane] = OpsVec[OpIdx][Lane].V;

    unsigned NumOperands = getNumOperands();
    unsigned NumLanes = getNumLanes();
    unsigned FirstLane = getBestLaneToStartReordering();
      if (shouldBroadcast(OpLane0, OpIdx, FirstLane) ||
          !canBeVectorized(OpILane0, OpIdx, FirstLane))
        ReorderingModes[OpIdx] = ReorderingMode::Splat;
        ReorderingModes[OpIdx] = ReorderingMode::Load;
        ReorderingModes[OpIdx] = ReorderingMode::Opcode;
        ReorderingModes[OpIdx] = ReorderingMode::Constant;
        ReorderingModes[OpIdx] = ReorderingMode::Splat;

    auto &&SkipReordering = [this]() {
      for (const OperandData &Data : Op0)
           ArrayRef(OpsVec).slice(1, getNumOperands() - 1)) {
        if (any_of(Op, [&UniqueValues](const OperandData &Data) {
      return UniqueValues.size() != 2 &&
                               UniqueValues.size());

    if (SkipReordering())
    bool StrategyFailed = false;
    for (unsigned I = 0; I < NumOperands; ++I)
      MainAltOps[I].push_back(getData(I, FirstLane).V);
    UsedLanes.set(FirstLane);
    for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
      for (int Direction : {+1, -1}) {
        int Lane = FirstLane + Direction * Distance;
        if (Lane < 0 || Lane >= (int)NumLanes)
        UsedLanes.set(Lane);
        int LastLane = Lane - Direction;
        assert(LastLane >= 0 && LastLane < (int)NumLanes &&
          std::optional<unsigned> BestIdx =
              getBestOperand(OpIdx, Lane, LastLane, ReorderingModes,
                             MainAltOps[OpIdx], UsedLanes);
            swap(OpIdx, *BestIdx, Lane);
            StrategyFailed = true;
            OperandData &AltOp = getData(OpIdx, Lane);
            InstructionsState OpS =
            if (OpS && OpS.isAltShuffle())
    if (!StrategyFailed)

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    case ReorderingMode::Load:
    case ReorderingMode::Opcode:
    case ReorderingMode::Constant:
    case ReorderingMode::Splat:
    case ReorderingMode::Failed:

    const unsigned Indent = 2;
    for (const OperandDataVec &OpDataVec : OpsVec) {
      OS << "Operand " << Cnt++ << "\n";
      for (const OperandData &OpData : OpDataVec) {
        OS.indent(Indent) << "{";
        if (Value *V = OpData.V)
        OS << ", APO:" << OpData.APO << "}\n";
    int BestScore = Limit;
    std::optional<int> Index;
    for (int I : seq<int>(0, Candidates.size())) {
          Candidates[I].second,
      if (Score > BestScore) {

    DeletedInstructions.insert(I);

  template <typename T>
      ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales) {
    for (T *V : DeadVals) {
    for (T *V : DeadVals) {
      if (!V || !Processed.insert(V).second)
      for (Use &U : I->operands()) {
            OpI && !DeletedInstructions.contains(OpI) && OpI->hasOneUser() &&
            (Entries.empty() || none_of(Entries, [&](const TreeEntry *Entry) {
               return Entry->VectorizedValue == OpI;
      I->dropAllReferences();
    for (T *V : DeadVals) {
      if (!I->getParent())
              cast<Instruction>(U.getUser()));
             "trying to erase instruction with users.");
      I->removeFromParent();
    while (!DeadInsts.empty()) {
      if (!VI || !VI->getParent())
             "Live instruction found in dead worklist!");
      assert(VI->use_empty() && "Instructions with uses are not dead.");
      for (Use &OpU : VI->operands()) {
        Value *OpV = OpU.get();
          if (!DeletedInstructions.contains(OpI) &&
              (!OpI->getType()->isVectorTy() ||
               none_of(VectorValuesAndScales,
                       [&](const std::tuple<Value *, unsigned, bool> &V) {
                         return std::get<0>(V) == OpI;
      VI->removeFromParent();
      SE->forgetValue(VI);

    return AnalyzedReductionsRoots.count(I);
    AnalyzedReductionsRoots.insert(I);
    return AnalyzedReductionVals.contains(hash_value(VL));
    AnalyzedReductionVals.insert(hash_value(VL));
    AnalyzedReductionsRoots.clear();
    AnalyzedReductionVals.clear();
    AnalyzedMinBWVals.clear();

    return MustGather.contains(V);
    return NonScheduledFirst.contains(V);
    assert(V && "V cannot be nullptr.");
    return ScalarToTreeEntries.contains(V);

  bool collectValuesToDemote(
      const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
      bool &IsProfitableToDemote, bool IsTruncRoot) const;

  void buildReorderableOperands(
  void reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const;
  bool areAllUsersVectorized(
  const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const;
  TreeEntry *getOperandEntry(TreeEntry *E, unsigned Idx) {
    return const_cast<TreeEntry *>(
        getOperandEntry(const_cast<const TreeEntry *>(E), Idx));
  Instruction *getRootEntryInstruction(const TreeEntry &Entry) const;
  getCastContextHint(const TreeEntry &TE) const;
      const InstructionsState &LocalState,
      unsigned InterleaveFactor = 0);
      bool ResizeAllowed = false) const;
  Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx);
  template <typename BVTy, typename ResTy, typename... Args>
  ResTy processBuildVector(const TreeEntry *E, Type *ScalarTy, Args &...Params);
  Value *createBuildVector(const TreeEntry *E, Type *ScalarTy);
  Instruction &getLastInstructionInBundle(const TreeEntry *E);
  std::optional<TargetTransformInfo::ShuffleKind>
      unsigned NumParts) const;
  std::optional<TargetTransformInfo::ShuffleKind>
  isGatherShuffledSingleRegisterEntry(
  isGatherShuffledEntry(
      unsigned NumParts, bool ForOrder = false);
      Type *ScalarTy) const;
  void setInsertPointAfterBundle(const TreeEntry *E);
  bool isFullyVectorizableTinyTree(bool ForReduction) const;
  void tryToVectorizeGatheredLoads(
      std::tuple<BasicBlock *, Value *, Type *>,
  collectUserStores(const BoUpSLP::TreeEntry *TE) const;
  findExternalStoreUsersReorderIndices(TreeEntry *TE) const;
  void reorderGatherNode(TreeEntry &TE);
    TreeEntry(VecTreeTy &Container) : Container(Container) {}

    SmallVector<int> getCommonMask() const {
      if (State == TreeEntry::SplitVectorize)
      SmallVector<int> Mask;

    SmallVector<int> getSplitMask() const {
      assert(State == TreeEntry::SplitVectorize && !ReorderIndices.empty() &&
             "Expected only split vectorize node.");
      unsigned CommonVF = std::max<unsigned>(
          CombinedEntriesWithIndices.back().second,
          Scalars.size() - CombinedEntriesWithIndices.back().second);
      for (auto [Idx, I] : enumerate(ReorderIndices))
            Idx + (Idx >= CombinedEntriesWithIndices.back().second
                       ? CommonVF - CombinedEntriesWithIndices.back().second

    void reorderSplitNode(unsigned Idx, ArrayRef<int> Mask,
                          ArrayRef<int> MaskOrder);

      if (Mask.size() != VL.size() && VL.size() == Scalars.size())
        return std::equal(VL.begin(), VL.end(), Scalars.begin());
                        [Scalars](Value *V, int Idx) {
                          return (isa<UndefValue>(V) &&
                                  Idx == PoisonMaskElem) ||
                                 (Idx != PoisonMaskElem && V == Scalars[Idx]);
      if (!ReorderIndices.empty()) {
        SmallVector<int> Mask;
        if (VL.size() == Scalars.size())
          return IsSame(Scalars, Mask);
        if (VL.size() == ReuseShuffleIndices.size()) {
          return IsSame(Scalars, Mask);
      return IsSame(Scalars, ReuseShuffleIndices);

    bool hasEqualOperands(const TreeEntry &TE) const {
      if (TE.getNumOperands() != getNumOperands())
      SmallBitVector Used(getNumOperands());
      for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {
        unsigned PrevCount = Used.count();
        for (unsigned K = 0; K < E; ++K) {
          if (getOperand(K) == TE.getOperand(I)) {
        if (PrevCount == Used.count())

    unsigned getVectorFactor() const {
      if (!ReuseShuffleIndices.empty())
        return ReuseShuffleIndices.size();
      return Scalars.size();

    bool isGather() const { return State == NeedToGather; }

    WeakTrackingVH VectorizedValue = nullptr;

    enum CombinedOpcode {
      MinMax = Instruction::OtherOpsEnd + 1,
    CombinedOpcode CombinedOp = NotCombinedOp;

    SmallVector<int, 4> ReuseShuffleIndices;
    SmallVector<unsigned, 4> ReorderIndices;
    VecTreeTy &Container;
    EdgeInfo UserTreeIndex;
    SmallVector<ValueList, 2> Operands;
    SmallPtrSet<const Value *, 4> CopyableElements;
    InstructionsState S = InstructionsState::invalid();
    unsigned InterleaveFactor = 0;
    bool DoesNotNeedToSchedule = false;

      if (Operands.size() < OpIdx + 1)
        Operands.resize(OpIdx + 1);
             "Number of operands is greater than the number of scalars.");

    unsigned getInterleaveFactor() const { return InterleaveFactor; }
    void setInterleave(unsigned Factor) { InterleaveFactor = Factor; }
    void setDoesNotNeedToSchedule() { DoesNotNeedToSchedule = true; }
    bool doesNotNeedToSchedule() const { return DoesNotNeedToSchedule; }
        setOperand(I, Operands[I]);
    void reorderOperands(ArrayRef<int> Mask) {
      return Operands[OpIdx];
      return Operands[OpIdx];
    unsigned getNumOperands() const { return Operands.size(); }
    Value *getSingleOperand(unsigned OpIdx) const {
      return Operands[OpIdx][0];
    bool isAltShuffle() const { return S.isAltShuffle(); }
    Instruction *getMatchingMainOpOrAltOp(Instruction *I) const {
      return S.getMatchingMainOpOrAltOp(I);
      if (I && getMatchingMainOpOrAltOp(I))
      return S.getMainOp();
    void setOperations(const InstructionsState &S) {
      assert(S && "InstructionsState is invalid.");
    Instruction *getMainOp() const { return S.getMainOp(); }
    Instruction *getAltOp() const { return S.getAltOp(); }
    unsigned getOpcode() const { return S.getOpcode(); }
    unsigned getAltOpcode() const { return S.getAltOpcode(); }
    bool hasState() const { return S.valid(); }
    void addCopyableElement(Value *V) {
      assert(S.isCopyableElement(V) && "Not a copyable element.");
      CopyableElements.insert(V);
    bool isCopyableElement(Value *V) const {
      return CopyableElements.contains(V);
    bool hasCopyableElements() const { return !CopyableElements.empty(); }
    const InstructionsState &getOperations() const { return S; }

    unsigned findLaneForValue(Value *V) const {
      unsigned FoundLane = getVectorFactor();
      for (auto *It = find(Scalars, V), *End = Scalars.end(); It != End;
           std::advance(It, 1)) {
        FoundLane = std::distance(Scalars.begin(), It);
        assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
        if (!ReorderIndices.empty())
          FoundLane = ReorderIndices[FoundLane];
        assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
        if (ReuseShuffleIndices.empty())
        if (auto *RIt = find(ReuseShuffleIndices, FoundLane);
            RIt != ReuseShuffleIndices.end()) {
          FoundLane = std::distance(ReuseShuffleIndices.begin(), RIt);
      assert(FoundLane < getVectorFactor() && "Unable to find given value.");
    buildAltOpShuffleMask(const function_ref<bool(Instruction *)> IsAltOp,
                          SmallVectorImpl<int> &Mask,
                          SmallVectorImpl<Value *> *OpScalars = nullptr,
                          SmallVectorImpl<Value *> *AltScalars = nullptr) const;

    bool isNonPowOf2Vec() const {
      return IsNonPowerOf2;
    hasNonWholeRegisterOrNonPowerOf2Vec(const TargetTransformInfo &TTI) const {
      assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) &&
             "Reshuffling not supported with non-power-of-2 vectors yet.");
      return IsNonPowerOf2;

    Value *getOrdered(unsigned Idx) const {
      if (ReorderIndices.empty())
        return Scalars[Idx];
      SmallVector<int> Mask;
      return Scalars[Mask[Idx]];

      dbgs() << Idx << ".\n";
      for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
        dbgs() << "Operand " << OpI << ":\n";
        for (const Value *V : Operands[OpI])
      dbgs() << "Scalars: \n";
      for (Value *V : Scalars)
      dbgs() << "State: ";
      if (S && hasCopyableElements())
        dbgs() << "[[Copyable]] ";
        if (InterleaveFactor > 0) {
          dbgs() << "Vectorize with interleave factor " << InterleaveFactor
          dbgs() << "Vectorize\n";
      case ScatterVectorize:
        dbgs() << "ScatterVectorize\n";
      case StridedVectorize:
        dbgs() << "StridedVectorize\n";
      case CompressVectorize:
        dbgs() << "CompressVectorize\n";
        dbgs() << "NeedToGather\n";
      case CombinedVectorize:
        dbgs() << "CombinedVectorize\n";
      case SplitVectorize:
        dbgs() << "SplitVectorize\n";
        dbgs() << "MainOp: " << *S.getMainOp() << "\n";
        dbgs() << "AltOp: " << *S.getAltOp() << "\n";
        dbgs() << "MainOp: NULL\n";
        dbgs() << "AltOp: NULL\n";
      dbgs() << "VectorizedValue: ";
      if (VectorizedValue)
        dbgs() << *VectorizedValue << "\n";
      dbgs() << "ReuseShuffleIndices: ";
      if (ReuseShuffleIndices.empty())
      for (int ReuseIdx : ReuseShuffleIndices)
        dbgs() << ReuseIdx << ", ";
      dbgs() << "ReorderIndices: ";
      for (unsigned ReorderIdx : ReorderIndices)
        dbgs() << ReorderIdx << ", ";
      dbgs() << "UserTreeIndex: ";
        dbgs() << UserTreeIndex;
        dbgs() << "<invalid>";
      if (!CombinedEntriesWithIndices.empty()) {
        dbgs() << "Combined entries: ";
          dbgs() << "Entry index " << P.first << " with offset " << P.second;

                       StringRef Banner) const {
      dbgs() << "SLP: " << Banner << ":\n";
      dbgs() << "SLP: Costs:\n";
      dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n";
      dbgs() << "SLP: VectorCost = " << VecCost << "\n";
      dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n";
      dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = "
             << ReuseShuffleCost + VecCost - ScalarCost << "\n";
                          const InstructionsState &S,
                          ArrayRef<int> ReuseShuffleIndices = {}) {
    auto Invalid = ScheduleBundle::invalid();
    return newTreeEntry(VL, Invalid, S, UserTreeIdx, ReuseShuffleIndices);

                          const InstructionsState &S,
                          ArrayRef<int> ReuseShuffleIndices = {},
                          ArrayRef<unsigned> ReorderIndices = {},
                          unsigned InterleaveFactor = 0) {
    TreeEntry::EntryState EntryState =
        Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
    TreeEntry *E = newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
                                ReuseShuffleIndices, ReorderIndices);
    if (E && InterleaveFactor > 0)
      E->setInterleave(InterleaveFactor);

                          TreeEntry::EntryState EntryState,
                          ScheduleBundle &Bundle, const InstructionsState &S,
                          ArrayRef<int> ReuseShuffleIndices = {},
                          ArrayRef<unsigned> ReorderIndices = {}) {
    assert(((!Bundle && (EntryState == TreeEntry::NeedToGather ||
                         EntryState == TreeEntry::SplitVectorize)) ||
            (Bundle && EntryState != TreeEntry::NeedToGather &&
             EntryState != TreeEntry::SplitVectorize)) &&
           "Need to vectorize gather entry?");
    if (GatheredLoadsEntriesFirst.has_value() &&
        EntryState == TreeEntry::NeedToGather && S &&
        S.getOpcode() == Instruction::Load && UserTreeIdx.EdgeIdx == UINT_MAX &&
        !UserTreeIdx.UserTE)
    VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
    TreeEntry *Last = VectorizableTree.back().get();
    Last->Idx = VectorizableTree.size() - 1;
    Last->State = EntryState;
    if (UserTreeIdx.UserTE)
      OperandsToTreeEntry.try_emplace(
          std::make_pair(UserTreeIdx.UserTE, UserTreeIdx.EdgeIdx), Last);
            ReuseShuffleIndices.empty()) &&
           "Reshuffling scalars not yet supported for nodes with padding");
    Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
                                     ReuseShuffleIndices.end());
    if (ReorderIndices.empty()) {
      Last->setOperations(S);
      Last->Scalars.assign(VL.size(), nullptr);
                [VL](unsigned Idx) -> Value * {
                  if (Idx >= VL.size())
                    return UndefValue::get(VL.front()->getType());
      Last->setOperations(S);
      Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
    if (EntryState == TreeEntry::SplitVectorize) {
      assert(S && "Split nodes must have operations.");
      Last->setOperations(S);
      SmallPtrSet<Value *, 4> Processed;
      for (Value *V : VL) {
        auto It = ScalarsInSplitNodes.find(V);
        if (It == ScalarsInSplitNodes.end()) {
          ScalarsInSplitNodes.try_emplace(V).first->getSecond().push_back(Last);
          (void)Processed.insert(V);
        } else if (Processed.insert(V).second) {
                 "Value already associated with the node.");
          It->getSecond().push_back(Last);
    } else if (!Last->isGather()) {
          (!S.areInstructionsWithCopyableElements() &&
           all_of(VL, [&](Value *V) { return S.isNonSchedulable(V); }))
        Last->setDoesNotNeedToSchedule();
      SmallPtrSet<Value *, 4> Processed;
      for (Value *V : VL) {
        if (S.isCopyableElement(V)) {
          Last->addCopyableElement(V);
        auto It = ScalarToTreeEntries.find(V);
        if (It == ScalarToTreeEntries.end()) {
          ScalarToTreeEntries.try_emplace(V).first->getSecond().push_back(Last);
          (void)Processed.insert(V);
        } else if (Processed.insert(V).second) {
                 "Value already associated with the node.");
          It->getSecond().push_back(Last);
      assert((!Bundle.getBundle().empty() || Last->doesNotNeedToSchedule()) &&
             "Bundle and VL out of sync");
      if (!Bundle.getBundle().empty()) {
#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
        auto *BundleMember = Bundle.getBundle().begin();
        SmallPtrSet<Value *, 4> Processed;
        for (Value *V : VL) {
          if (S.isNonSchedulable(V) || !Processed.insert(V).second)
        assert(BundleMember == Bundle.getBundle().end() &&
               "Bundle and VL out of sync");
        Bundle.setTreeEntry(Last);
      bool AllConstsOrCasts = true;
      for (Value *V : VL) {
        if (S && S.areInstructionsWithCopyableElements() &&
            S.isCopyableElement(V))
          Last->addCopyableElement(V);
          AllConstsOrCasts &= I && I->getType()->isIntegerTy();
        if (UserTreeIdx.EdgeIdx != UINT_MAX || !UserTreeIdx.UserTE ||
            !UserTreeIdx.UserTE->isGather())
          ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last);
      if (AllConstsOrCasts)
          std::make_pair(std::numeric_limits<unsigned>::max(), 1);
      MustGather.insert_range(VL);
    if (UserTreeIdx.UserTE)
      Last->UserTreeIndex = UserTreeIdx;
  TreeEntry::VecTreeTy VectorizableTree;

    for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
      VectorizableTree[Id]->dump();

    assert(V && "V cannot be nullptr.");
    auto It = ScalarToTreeEntries.find(V);
    if (It == ScalarToTreeEntries.end())
    return It->getSecond();

    assert(V && "V cannot be nullptr.");
    auto It = ScalarsInSplitNodes.find(V);
    if (It == ScalarsInSplitNodes.end())
    return It->getSecond();

                             bool SameVF = false) const {
    assert(V && "V cannot be nullptr.");
    for (TreeEntry *TE : ScalarToTreeEntries.lookup(V))
      if ((!SameVF || TE->getVectorFactor() == VL.size()) && TE->isSame(VL))

  bool areAltOperandsProfitable(const InstructionsState &S,

  class ScalarsVectorizationLegality {
    InstructionsState S;
    bool TryToFindDuplicates;
    bool TrySplitVectorize;

    ScalarsVectorizationLegality(InstructionsState S, bool IsLegal,
                                 bool TryToFindDuplicates = true,
                                 bool TrySplitVectorize = false)
        : S(S), IsLegal(IsLegal), TryToFindDuplicates(TryToFindDuplicates),
          TrySplitVectorize(TrySplitVectorize) {
      assert((!IsLegal || (S.valid() && TryToFindDuplicates)) &&
             "Inconsistent state");
    const InstructionsState &getInstructionsState() const { return S; };
    bool isLegal() const { return IsLegal; }
    bool tryToFindDuplicates() const { return TryToFindDuplicates; }
    bool trySplitVectorize() const { return TrySplitVectorize; }

  ScalarsVectorizationLegality
                              bool TryCopyableElementsVectorization) const;

  TreeEntry::EntryState getScalarsVectorizationState(
      bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
      SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo);

  SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarToTreeEntries;
  SmallPtrSet<const TreeEntry *, 8> DeletedNodes;
  SmallDenseMap<const TreeEntry *, InstructionCost> TransformedToGatherNodes;
  SmallDenseMap<std::pair<const TreeEntry *, unsigned>, TreeEntry *>
      OperandsToTreeEntry;
  SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarsInSplitNodes;
  SmallDenseMap<Value *, unsigned> InstrElementSize;
  SmallDenseMap<const TreeEntry *, WeakTrackingVH> EntryToLastInstruction;
  SmallDenseMap<const Instruction *, Instruction *> LastInstructionToPos;
  SetVector<const TreeEntry *> PostponedGathers;
  using ValueToGatherNodesMap =
      DenseMap<Value *, SmallSetVector<const TreeEntry *, 4>>;
  ValueToGatherNodesMap ValueToGatherNodes;
  SetVector<unsigned> LoadEntriesToVectorize;
  bool IsGraphTransformMode = false;
  std::optional<unsigned> GatheredLoadsEntriesFirst;
  SmallDenseMap<const TreeEntry *,
                std::tuple<SmallVector<int>, VectorType *, unsigned, bool>>
      CompressEntryToData;

  struct ExternalUser {
    ExternalUser(Value *S, llvm::User *U, const TreeEntry &E, unsigned L)
        : Scalar(S), User(U), E(E), Lane(L) {}
    Value *Scalar = nullptr;
    llvm::User *User = nullptr;

  using UserList = SmallVector<ExternalUser, 16>;

  bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
                 Instruction *Inst2) {
    AliasCacheKey Key = std::make_pair(Inst1, Inst2);
    auto Res = AliasCache.try_emplace(Key);
      return Res.first->second;
    bool Aliased = isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1));
    Res.first->getSecond() = Aliased;

  using AliasCacheKey = std::pair<Instruction *, Instruction *>;
  SmallDenseMap<AliasCacheKey, bool> AliasCache;
  BatchAAResults BatchAA;
  DenseSet<Instruction *> DeletedInstructions;
  SmallPtrSet<Instruction *, 16> AnalyzedReductionsRoots;
  DenseSet<size_t> AnalyzedReductionVals;
  DenseSet<Value *> AnalyzedMinBWVals;
  UserList ExternalUses;
  SmallPtrSet<Value *, 4> ExternalUsesAsOriginalScalar;
  SmallPtrSet<Value *, 4> ExternalUsesWithNonUsers;
  SmallPtrSet<const Value *, 32> EphValues;
  SetVector<Instruction *> GatherShuffleExtractSeq;
  DenseSet<BasicBlock *> CSEBlocks;
  DenseSet<size_t> ListOfKnonwnNonVectorizableLoads;
4761 class ScheduleEntity {
4762 friend class ScheduleBundle;
4763 friend class ScheduleData;
4764 friend class ScheduleCopyableData;
4767 enum class Kind { ScheduleData, ScheduleBundle, ScheduleCopyableData };
4768 Kind getKind()
const {
return K; }
4769 ScheduleEntity(Kind K) : K(K) {}
4773 int SchedulingPriority = 0;
4776 bool IsScheduled =
false;
4778 const Kind K = Kind::ScheduleData;
4781 ScheduleEntity() =
delete;
4783 void setSchedulingPriority(
int Priority) { SchedulingPriority = Priority; }
4784 int getSchedulingPriority()
const {
return SchedulingPriority; }
4785 bool isReady()
const {
4787 return SD->isReady();
4789 return CD->isReady();
4795 bool hasValidDependencies()
const {
4797 return SD->hasValidDependencies();
4799 return CD->hasValidDependencies();
4803 int getUnscheduledDeps()
const {
4805 return SD->getUnscheduledDeps();
4807 return CD->getUnscheduledDeps();
4811 int incrementUnscheduledDeps(
int Incr) {
4813 return SD->incrementUnscheduledDeps(Incr);
4817 int getDependencies()
const {
4819 return SD->getDependencies();
4825 return SD->getInst();
4830 bool isScheduled()
const {
return IsScheduled; }
4831 void setScheduled(
bool Scheduled) { IsScheduled = Scheduled; }
4833 static bool classof(
const ScheduleEntity *) {
return true; }
4835#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4836 void dump(raw_ostream &OS)
const {
4838 return SD->dump(OS);
4840 return CD->dump(OS);
4851#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4853 const BoUpSLP::ScheduleEntity &SE) {
4863 class ScheduleData final :
public ScheduleEntity {
4867 enum { InvalidDeps = -1 };
4869 ScheduleData() : ScheduleEntity(Kind::ScheduleData) {}
4870 static bool classof(
const ScheduleEntity *Entity) {
4871 return Entity->getKind() == Kind::ScheduleData;
4874 void init(
int BlockSchedulingRegionID, Instruction *
I) {
4875 NextLoadStore =
nullptr;
4876 IsScheduled =
false;
4877 SchedulingRegionID = BlockSchedulingRegionID;
4878 clearDependencies();
4884 if (hasValidDependencies()) {
4885 assert(UnscheduledDeps <= Dependencies &&
"invariant");
4887 assert(UnscheduledDeps == Dependencies &&
"invariant");
4891 assert(hasValidDependencies() && UnscheduledDeps == 0 &&
4892 "unexpected scheduled state");
4899 bool hasValidDependencies()
const {
return Dependencies != InvalidDeps; }
4903 bool isReady()
const {
return UnscheduledDeps == 0 && !IsScheduled; }
4908 int incrementUnscheduledDeps(
int Incr) {
4909 assert(hasValidDependencies() &&
4910 "increment of unscheduled deps would be meaningless");
4911 UnscheduledDeps += Incr;
4912 assert(UnscheduledDeps >= 0 &&
4913 "Expected valid number of unscheduled deps");
4914 return UnscheduledDeps;
4919 void resetUnscheduledDeps() { UnscheduledDeps = Dependencies; }
4922 void clearDependencies() {
4923 clearDirectDependencies();
4924 MemoryDependencies.clear();
4925 ControlDependencies.clear();
4932 void clearDirectDependencies() {
4933 Dependencies = InvalidDeps;
4934 resetUnscheduledDeps();
4935 IsScheduled =
false;
4939 int getUnscheduledDeps()
const {
return UnscheduledDeps; }
4941 int getDependencies()
const {
return Dependencies; }
4943 void initDependencies() { Dependencies = 0; }
4945 void incDependencies() { Dependencies++; }
4948 int getSchedulingRegionID()
const {
return SchedulingRegionID; }
4955 return MemoryDependencies;
4958 void addMemoryDependency(ScheduleData *Dep) {
4959 MemoryDependencies.push_back(Dep);
4963 return ControlDependencies;
4966 void addControlDependency(ScheduleData *Dep) {
4967 ControlDependencies.push_back(Dep);
4970 ScheduleData *getNextLoadStore()
const {
return NextLoadStore; }
4971 void setNextLoadStore(ScheduleData *
Next) { NextLoadStore =
Next; }
4973 void dump(raw_ostream &OS)
const { OS << *Inst; }
4985 ScheduleData *NextLoadStore =
nullptr;
4989 SmallVector<ScheduleData *> MemoryDependencies;
4995 SmallVector<ScheduleData *> ControlDependencies;
4999 int SchedulingRegionID = 0;
5005 int Dependencies = InvalidDeps;
5011 int UnscheduledDeps = InvalidDeps;
5016 const BoUpSLP::ScheduleData &SD) {
5022 class ScheduleBundle final :
public ScheduleEntity {
5026 bool IsValid =
true;
5028 TreeEntry *TE =
nullptr;
5029 ScheduleBundle(
bool IsValid)
5030 : ScheduleEntity(Kind::ScheduleBundle), IsValid(IsValid) {}
5033 ScheduleBundle() : ScheduleEntity(Kind::ScheduleBundle) {}
5034 static bool classof(
const ScheduleEntity *Entity) {
5035 return Entity->getKind() == Kind::ScheduleBundle;
5040 for (
const ScheduleEntity *SD : Bundle) {
5041 if (SD->hasValidDependencies()) {
5042 assert(SD->getUnscheduledDeps() <= SD->getDependencies() &&
5045 assert(SD->getUnscheduledDeps() == SD->getDependencies() &&
5049 if (isScheduled()) {
5050 assert(SD->hasValidDependencies() && SD->getUnscheduledDeps() == 0 &&
5051 "unexpected scheduled state");
5057 int unscheduledDepsInBundle()
const {
5058 assert(*
this &&
"bundle must not be empty");
5060 for (
const ScheduleEntity *BundleMember : Bundle) {
5061 if (BundleMember->getUnscheduledDeps() == ScheduleData::InvalidDeps)
5062 return ScheduleData::InvalidDeps;
5063 Sum += BundleMember->getUnscheduledDeps();
5071 bool hasValidDependencies()
const {
5072 return all_of(Bundle, [](
const ScheduleEntity *SD) {
5073 return SD->hasValidDependencies();
5079 bool isReady()
const {
5080 assert(*
this &&
"bundle must not be empty");
5081 return unscheduledDepsInBundle() == 0 && !isScheduled();
5089 void add(ScheduleEntity *SD) { Bundle.push_back(SD); }
5092 void setTreeEntry(TreeEntry *TE) { this->TE = TE; }
5093 TreeEntry *getTreeEntry()
const {
return TE; }
5095 static ScheduleBundle invalid() {
return {
false}; }
5097 operator bool()
const {
return IsValid; }
5100 void dump(raw_ostream &OS)
const {
5109 OS << *SD->getInst();
5123 const BoUpSLP::ScheduleBundle &Bundle) {
5134 class ScheduleCopyableData final :
public ScheduleEntity {
5141 int SchedulingRegionID = 0;
5143 ScheduleBundle &Bundle;
5146 ScheduleCopyableData(
int BlockSchedulingRegionID,
Instruction *
I,
5147 const EdgeInfo &EI, ScheduleBundle &Bundle)
5148 : ScheduleEntity(Kind::ScheduleCopyableData), Inst(
I), EI(EI),
5149 SchedulingRegionID(BlockSchedulingRegionID), Bundle(Bundle) {}
5150 static bool classof(
const ScheduleEntity *Entity) {
5151 return Entity->getKind() == Kind::ScheduleCopyableData;
5156 if (hasValidDependencies()) {
5157 assert(UnscheduledDeps <= Dependencies &&
"invariant");
5159 assert(UnscheduledDeps == Dependencies &&
"invariant");
5163 assert(hasValidDependencies() && UnscheduledDeps == 0 &&
5164 "unexpected scheduled state");
5171 bool hasValidDependencies()
const {
5172 return Dependencies != ScheduleData::InvalidDeps;
5177 bool isReady()
const {
return UnscheduledDeps == 0 && !IsScheduled; }
5182 int incrementUnscheduledDeps(
int Incr) {
5183 assert(hasValidDependencies() &&
5184 "increment of unscheduled deps would be meaningless");
5185 UnscheduledDeps += Incr;
5186 assert(UnscheduledDeps >= 0 &&
"invariant");
5187 return UnscheduledDeps;
5192 void resetUnscheduledDeps() { UnscheduledDeps = Dependencies; }
5195 int getUnscheduledDeps()
const {
return UnscheduledDeps; }
5197 int getDependencies()
const {
return Dependencies; }
5199 void initDependencies() { Dependencies = 0; }
5201 void incDependencies() { Dependencies++; }
5204 int getSchedulingRegionID()
const {
return SchedulingRegionID; }
5210 void clearDependencies() {
5211 Dependencies = ScheduleData::InvalidDeps;
5212 UnscheduledDeps = ScheduleData::InvalidDeps;
5213 IsScheduled =
false;
5217 const EdgeInfo &getEdgeInfo()
const {
return EI; }
5220 ScheduleBundle &getBundle() {
return Bundle; }
5221 const ScheduleBundle &getBundle()
const {
return Bundle; }
5223#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
5224 void dump(raw_ostream &OS)
const { OS <<
"[Copyable]" << *getInst(); }
5235 int Dependencies = ScheduleData::InvalidDeps;
5241 int UnscheduledDeps = ScheduleData::InvalidDeps;
5271 struct BlockScheduling {
5273 : BB(BB), ChunkSize(BB->
size()), ChunkPos(ChunkSize) {}
5276 ScheduledBundles.clear();
5277 ScheduledBundlesList.
clear();
5278 ScheduleCopyableDataMap.clear();
5279 ScheduleCopyableDataMapByInst.clear();
5280 ScheduleCopyableDataMapByInstUser.clear();
5281 ScheduleCopyableDataMapByUsers.clear();
5283 ScheduleStart =
nullptr;
5284 ScheduleEnd =
nullptr;
5285 FirstLoadStoreInRegion =
nullptr;
5286 LastLoadStoreInRegion =
nullptr;
5287 RegionHasStackSave =
false;
5291 ScheduleRegionSizeLimit -= ScheduleRegionSize;
5294 ScheduleRegionSize = 0;
5298 ++SchedulingRegionID;
5301 ScheduleData *getScheduleData(Instruction *
I) {
5304 if (BB !=
I->getParent())
5307 ScheduleData *SD = ScheduleDataMap.lookup(
I);
5308 if (SD && isInSchedulingRegion(*SD))
5313 ScheduleData *getScheduleData(
Value *V) {
5319 ScheduleCopyableData *getScheduleCopyableData(
const EdgeInfo &EI,
5320 const Value *V)
const {
5321 if (ScheduleCopyableDataMap.empty())
5323 auto It = ScheduleCopyableDataMap.find(std::make_pair(EI, V));
5324 if (It == ScheduleCopyableDataMap.end())
5326 ScheduleCopyableData *SD = It->getSecond().get();
5327 if (!isInSchedulingRegion(*SD))
5335 getScheduleCopyableData(
const Value *User,
unsigned OperandIdx,
5337 if (ScheduleCopyableDataMapByInstUser.empty())
5339 const auto It = ScheduleCopyableDataMapByInstUser.find(
5340 std::make_pair(std::make_pair(User, OperandIdx), V));
5341 if (It == ScheduleCopyableDataMapByInstUser.end())
5344 for (ScheduleCopyableData *SD : It->getSecond()) {
5345 if (isInSchedulingRegion(*SD))
5359 bool areAllOperandsReplacedByCopyableData(Instruction *User,
5363 if (ScheduleCopyableDataMap.empty())
5365 SmallDenseMap<TreeEntry *, unsigned> PotentiallyReorderedEntriesCount;
5367 if (Entries.
empty())
5369 unsigned CurNumOps = 0;
5370 for (
const Use &U :
User->operands()) {
5376 for (TreeEntry *TE : Entries) {
5378 bool IsNonSchedulableWithParentPhiNode =
5379 TE->doesNotNeedToSchedule() &&
TE->UserTreeIndex &&
5380 TE->UserTreeIndex.UserTE->hasState() &&
5381 TE->UserTreeIndex.UserTE->State != TreeEntry::SplitVectorize &&
5382 TE->UserTreeIndex.UserTE->getOpcode() == Instruction::PHI;
5385 if (IsNonSchedulableWithParentPhiNode) {
5386 SmallPtrSet<Value *, 4> ParentsUniqueUsers;
5387 const TreeEntry *ParentTE =
TE->UserTreeIndex.UserTE;
5388 for (
Value *V : ParentTE->Scalars) {
5392 if (ParentsUniqueUsers.
insert(
PHI).second &&
5397 Inc =
count(
TE->Scalars, User);
5405 bool IsCommutativeUser =
5408 if (!IsCommutativeUser) {
5418 (!IsCommutativeUser ||
5427 "Expected commutative user with 2 first commutable operands");
5428 bool IsCommutativeWithSameOps =
5429 IsCommutativeUser &&
User->getOperand(0) ==
User->getOperand(1);
5430 if ((!IsCommutativeUser || IsCommutativeWithSameOps) &&
5432 EdgeInfo EI(TE,
U.getOperandNo());
5433 if (CurNumOps !=
NumOps || getScheduleCopyableData(EI,
Op))
5437 PotentiallyReorderedEntriesCount.
try_emplace(TE, 0)
5438 .first->getSecond() += Inc;
5441 if (PotentiallyReorderedEntriesCount.
empty())
5444 for (
auto &
P : PotentiallyReorderedEntriesCount) {
5445 SmallPtrSet<Value *, 4> ParentsUniqueUsers;
5446 bool IsNonSchedulableWithParentPhiNode =
5447 P.first->doesNotNeedToSchedule() &&
P.first->UserTreeIndex &&
5448 P.first->UserTreeIndex.UserTE->hasState() &&
5449 P.first->UserTreeIndex.UserTE->State != TreeEntry::SplitVectorize &&
5450 P.first->UserTreeIndex.UserTE->getOpcode() == Instruction::PHI;
5451 auto *It =
find(
P.first->Scalars, User);
5453 assert(It !=
P.first->Scalars.end() &&
5454 "User is not in the tree entry");
5455 int Lane = std::distance(
P.first->Scalars.begin(), It);
5456 assert(Lane >= 0 &&
"Lane is not found");
5458 Lane =
P.first->ReorderIndices[Lane];
5459 assert(Lane <
static_cast<int>(
P.first->Scalars.size()) &&
5460 "Couldn't find extract lane");
5463 if (IsNonSchedulableWithParentPhiNode) {
5464 const TreeEntry *ParentTE =
P.first->UserTreeIndex.UserTE;
5466 if (!ParentsUniqueUsers.
insert(User).second) {
5472 for (
unsigned OpIdx :
5474 P.first->getMainOp()))) {
5475 if (
P.first->getOperand(
OpIdx)[Lane] ==
Op &&
5476 getScheduleCopyableData(EdgeInfo(
P.first,
OpIdx),
Op))
5481 }
while (It !=
P.first->Scalars.end());
5483 return all_of(PotentiallyReorderedEntriesCount,
5484 [&](
const std::pair<const TreeEntry *, unsigned> &
P) {
5485 return P.second ==
NumOps - 1;
5490 getScheduleCopyableData(
const Instruction *
I)
const {
5491 if (ScheduleCopyableDataMapByInst.empty())
5493 const auto It = ScheduleCopyableDataMapByInst.find(
I);
5494 if (It == ScheduleCopyableDataMapByInst.end())
5497 for (ScheduleCopyableData *SD : It->getSecond()) {
5498 if (isInSchedulingRegion(*SD))
5505 getScheduleCopyableDataUsers(
const Instruction *User)
const {
5506 if (ScheduleCopyableDataMapByUsers.empty())
5508 const auto It = ScheduleCopyableDataMapByUsers.find(User);
5509 if (It == ScheduleCopyableDataMapByUsers.end())
5512 for (ScheduleCopyableData *SD : It->getSecond()) {
5513 if (isInSchedulingRegion(*SD))
5519 ScheduleCopyableData &addScheduleCopyableData(
const EdgeInfo &EI,
5521 int SchedulingRegionID,
5522 ScheduleBundle &Bundle) {
5523 assert(!getScheduleCopyableData(EI,
I) &&
"already in the map");
5524 ScheduleCopyableData *CD =
5525 ScheduleCopyableDataMap
5526 .try_emplace(std::make_pair(EI,
I),
5527 std::make_unique<ScheduleCopyableData>(
5528 SchedulingRegionID,
I, EI, Bundle))
5531 ScheduleCopyableDataMapByInst[
I].push_back(CD);
5535 assert(It !=
Op.end() &&
"Lane not set");
5536 SmallPtrSet<Instruction *, 4> Visited;
5538 int Lane = std::distance(
Op.begin(), It);
5539 assert(Lane >= 0 &&
"Lane not set");
5541 !EI.UserTE->ReorderIndices.empty())
5542 Lane = EI.UserTE->ReorderIndices[Lane];
5543 assert(Lane <
static_cast<int>(EI.UserTE->Scalars.size()) &&
5544 "Couldn't find extract lane");
5546 if (!Visited.
insert(In).second) {
5550 ScheduleCopyableDataMapByInstUser
5551 .try_emplace(std::make_pair(std::make_pair(In, EI.EdgeIdx),
I))
5554 ScheduleCopyableDataMapByUsers.try_emplace(
I)
5561 EdgeInfo UserEI = EI.UserTE->UserTreeIndex;
5562 if (ScheduleCopyableData *UserCD =
5563 getScheduleCopyableData(UserEI, In))
5564 ScheduleCopyableDataMapByUsers[
I].remove(UserCD);
5567 }
while (It !=
Op.end());
5569 ScheduleCopyableDataMapByUsers.try_emplace(
I).first->getSecond().insert(
5579 auto It = ScheduledBundles.find(
I);
5580 if (It == ScheduledBundles.end())
5582 return It->getSecond();
5586 bool isInSchedulingRegion(
const ScheduleEntity &SD)
const {
5588 return Data->getSchedulingRegionID() == SchedulingRegionID;
5590 return CD->getSchedulingRegionID() == SchedulingRegionID;
5592 [&](
const ScheduleEntity *BundleMember) {
5593 return isInSchedulingRegion(*BundleMember);
5599 template <
typename ReadyListType>
5600 void schedule(
const BoUpSLP &R,
const InstructionsState &S,
5601 const EdgeInfo &EI, ScheduleEntity *
Data,
5602 ReadyListType &ReadyList) {
5603 auto ProcessBundleMember = [&](ScheduleEntity *BundleMember,
5608 auto DecrUnsched = [&](
auto *
Data,
bool IsControl =
false) {
5609 if ((IsControl ||
Data->hasValidDependencies()) &&
5610 Data->incrementUnscheduledDeps(-1) == 0) {
5617 CopyableBundle.
push_back(&CD->getBundle());
5618 Bundles = CopyableBundle;
5620 Bundles = getScheduleBundles(
Data->getInst());
5622 if (!Bundles.
empty()) {
5623 for (ScheduleBundle *Bundle : Bundles) {
5624 if (Bundle->unscheduledDepsInBundle() == 0) {
5625 assert(!Bundle->isScheduled() &&
5626 "already scheduled bundle gets ready");
5627 ReadyList.insert(Bundle);
5629 <<
"SLP: gets ready: " << *Bundle <<
"\n");
5635 "already scheduled bundle gets ready");
5637 "Expected non-copyable data");
5638 ReadyList.insert(
Data);
5645 if (!ScheduleCopyableDataMap.empty()) {
5647 getScheduleCopyableData(User,
OpIdx,
I);
5648 for (ScheduleCopyableData *CD : CopyableData)
5649 DecrUnsched(CD,
false);
5650 if (!CopyableData.empty())
5653 if (ScheduleData *OpSD = getScheduleData(
I))
5654 DecrUnsched(OpSD,
false);
5660 if (!Bundles.empty()) {
5661 auto *
In = BundleMember->getInst();
5663 SmallDenseMap<const Instruction *, unsigned> OperandsUses;
5664 unsigned TotalOpCount = 0;
5667 TotalOpCount = OperandsUses[
In] = 1;
5669 for (
const Use &U :
In->operands()) {
5672 ++Res.first->getSecond();
5679 auto DecrUnschedForInst =
5681 SmallDenseSet<std::pair<const ScheduleEntity *, unsigned>>
5683 if (!ScheduleCopyableDataMap.empty()) {
5684 const EdgeInfo EI = {UserTE,
OpIdx};
5685 if (ScheduleCopyableData *CD =
5686 getScheduleCopyableData(EI,
I)) {
5687 if (!Checked.insert(std::make_pair(CD,
OpIdx)).second)
5689 DecrUnsched(CD,
false);
5693 auto It = OperandsUses.
find(
I);
5694 assert(It != OperandsUses.
end() &&
"Operand not found");
5695 if (It->second > 0) {
5696 if (ScheduleData *OpSD = getScheduleData(
I)) {
5697 if (!Checked.insert(std::make_pair(OpSD,
OpIdx)).second)
5700 assert(TotalOpCount > 0 &&
"No more operands to decrement");
5702 DecrUnsched(OpSD,
false);
5705 assert(TotalOpCount > 0 &&
"No more operands to decrement");
5711 SmallDenseSet<std::pair<const ScheduleEntity *, unsigned>> Checked;
5712 for (ScheduleBundle *Bundle : Bundles) {
5713 if (ScheduleCopyableDataMap.empty() && TotalOpCount == 0)
5715 SmallPtrSet<Value *, 4> ParentsUniqueUsers;
5718 auto *It =
find(Bundle->getTreeEntry()->Scalars, In);
5719 bool IsNonSchedulableWithParentPhiNode =
5720 Bundle->getTreeEntry()->doesNotNeedToSchedule() &&
5721 Bundle->getTreeEntry()->UserTreeIndex &&
5722 Bundle->getTreeEntry()->UserTreeIndex.UserTE->hasState() &&
5723 Bundle->getTreeEntry()->UserTreeIndex.UserTE->State !=
5724 TreeEntry::SplitVectorize &&
5725 Bundle->getTreeEntry()->UserTreeIndex.UserTE->getOpcode() ==
5729 std::distance(Bundle->getTreeEntry()->Scalars.begin(), It);
5730 assert(Lane >= 0 &&
"Lane not set");
5732 !Bundle->getTreeEntry()->ReorderIndices.empty())
5733 Lane = Bundle->getTreeEntry()->ReorderIndices[Lane];
5734 assert(Lane <
static_cast<int>(
5735 Bundle->getTreeEntry()->Scalars.size()) &&
5736 "Couldn't find extract lane");
5746 In->getNumOperands() ==
5747 Bundle->getTreeEntry()->getNumOperands() ||
5748 Bundle->getTreeEntry()->isCopyableElement(In)) &&
5749 "Missed TreeEntry operands?");
5753 if (IsNonSchedulableWithParentPhiNode) {
5754 const TreeEntry *ParentTE =
5755 Bundle->getTreeEntry()->UserTreeIndex.UserTE;
5757 if (!ParentsUniqueUsers.
insert(User).second) {
5758 It = std::find(std::next(It),
5759 Bundle->getTreeEntry()->Scalars.end(), In);
5764 for (
unsigned OpIdx :
5767 Bundle->getTreeEntry()->getOperand(
OpIdx)[Lane])) {
5770 DecrUnschedForInst(
I, Bundle->getTreeEntry(),
OpIdx, Checked);
5773 if (Bundle->getTreeEntry()->isCopyableElement(In))
5775 It = std::find(std::next(It),
5776 Bundle->getTreeEntry()->Scalars.end(), In);
5777 }
while (It != Bundle->getTreeEntry()->Scalars.end());
5782 for (Use &U : BundleMember->getInst()->operands()) {
5785 <<
"SLP: check for readiness (def): " << *
I <<
"\n");
5786 DecrUnschedForInst(BundleMember->getInst(),
U.getOperandNo(),
I);
5794 SmallPtrSet<const ScheduleData *, 4> VisitedMemory;
5795 for (ScheduleData *MemoryDep : SD->getMemoryDependencies()) {
5796 if (!VisitedMemory.
insert(MemoryDep).second)
5801 << *MemoryDep <<
"\n");
5802 DecrUnsched(MemoryDep);
5805 SmallPtrSet<const ScheduleData *, 4> VisitedControl;
5806 for (ScheduleData *Dep : SD->getControlDependencies()) {
5807 if (!VisitedControl.
insert(Dep).second)
5812 <<
"SLP: check for readiness (ctrl): " << *Dep <<
"\n");
5813 DecrUnsched(Dep,
true);
5817 SD->setScheduled(
true);
5822 if (
R.isVectorized(In)) {
5824 for (TreeEntry *TE : Entries) {
5826 In->getNumOperands() !=
TE->getNumOperands())
5829 PseudoBundles.
emplace_back(std::make_unique<ScheduleBundle>());
5830 BundlePtr->setTreeEntry(TE);
5835 ProcessBundleMember(SD, Bundles);
5838 Bundle.setScheduled(
true);
5840 auto AreAllBundlesScheduled =
5841 [&](
const ScheduleEntity *SD,
5845 return !SDBundles.empty() &&
5846 all_of(SDBundles, [&](
const ScheduleBundle *SDBundle) {
5847 return SDBundle->isScheduled();
5850 for (ScheduleEntity *SD : Bundle.getBundle()) {
5853 SDBundles = getScheduleBundles(SD->getInst());
5854 if (AreAllBundlesScheduled(SD, SDBundles)) {
5855 SD->setScheduled(
true);
5868 assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
5869 ScheduleStart->comesBefore(ScheduleEnd) &&
5870 "Not a valid scheduling region?");
5872 for (
auto *
I = ScheduleStart;
I != ScheduleEnd;
I =
I->getNextNode()) {
5874 if (!Bundles.
empty()) {
5875 for (ScheduleBundle *Bundle : Bundles) {
5876 assert(isInSchedulingRegion(*Bundle) &&
5877 "primary schedule data not in window?");
5882 auto *SD = getScheduleData(
I);
5885 assert(isInSchedulingRegion(*SD) &&
5886 "primary schedule data not in window?");
5891 [](
const ScheduleEntity *Bundle) {
5892 return Bundle->isReady();
5894 "item in ready list not ready?");
5898 template <
typename ReadyListType>
5899 void initialFillReadyList(ReadyListType &ReadyList) {
5900 SmallPtrSet<ScheduleBundle *, 16> Visited;
5901 for (
auto *
I = ScheduleStart;
I != ScheduleEnd;
I =
I->getNextNode()) {
5902 ScheduleData *SD = getScheduleData(
I);
5903 if (SD && SD->hasValidDependencies() && SD->isReady()) {
5906 for (ScheduleBundle *Bundle : Bundles) {
5907 if (!Visited.
insert(Bundle).second)
5909 if (Bundle->hasValidDependencies() && Bundle->isReady()) {
5910 ReadyList.insert(Bundle);
5912 << *Bundle <<
"\n");
5917 ReadyList.insert(SD);
5919 <<
"SLP: initially in ready list: " << *SD <<
"\n");
5930 const InstructionsState &S,
const EdgeInfo &EI);
5937 std::optional<ScheduleBundle *>
5939 const InstructionsState &S,
const EdgeInfo &EI);
5942 ScheduleData *allocateScheduleDataChunks();
5946 bool extendSchedulingRegion(
Value *V,
const InstructionsState &S);
5950 void initScheduleData(Instruction *FromI, Instruction *ToI,
5951 ScheduleData *PrevLoadStore,
5952 ScheduleData *NextLoadStore);
5956 void calculateDependencies(ScheduleBundle &Bundle,
bool InsertInReadyList,
5961 void resetSchedule();
5978 SmallDenseMap<Instruction *, ScheduleData *> ScheduleDataMap;
5982 SmallDenseMap<std::pair<EdgeInfo, const Value *>,
5983 std::unique_ptr<ScheduleCopyableData>>
5984 ScheduleCopyableDataMap;
5990 SmallDenseMap<const Instruction *, SmallVector<ScheduleCopyableData *>>
5991 ScheduleCopyableDataMapByInst;
5997 SmallDenseMap<std::pair<std::pair<const Value *, unsigned>,
const Value *>,
5999 ScheduleCopyableDataMapByInstUser;
6019 SmallSetVector<ScheduleCopyableData *, 4>>
6020 ScheduleCopyableDataMapByUsers;
6023 SmallDenseMap<Instruction *, SmallVector<ScheduleBundle *>>
6029 SetVector<ScheduleEntity *> ReadyInsts;
6039 ScheduleData *FirstLoadStoreInRegion =
nullptr;
6043 ScheduleData *LastLoadStoreInRegion =
nullptr;
6048 bool RegionHasStackSave =
false;
6051 int ScheduleRegionSize = 0;
6060 int SchedulingRegionID = 1;
6064 MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
6068 void scheduleBlock(
const BoUpSLP &R, BlockScheduling *BS);
6071 const SmallDenseSet<Value *> *UserIgnoreList =
nullptr;
6075 struct OrdersTypeDenseMapInfo {
6088 static unsigned getHashValue(
const OrdersType &V) {
6099 ScalarEvolution *SE;
6100 TargetTransformInfo *TTI;
6101 TargetLibraryInfo *TLI;
6104 AssumptionCache *AC;
6106 const DataLayout *DL;
6107 OptimizationRemarkEmitter *ORE;
6109 unsigned MaxVecRegSize;
6110 unsigned MinVecRegSize;
6113 IRBuilder<TargetFolder> Builder;
6120 DenseMap<const TreeEntry *, std::pair<uint64_t, bool>> MinBWs;
6125 unsigned ReductionBitWidth = 0;
6128 unsigned BaseGraphSize = 1;
6132 std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;
6136 DenseSet<unsigned> ExtraBitWidthNodes;
6144 SecondInfo::getEmptyKey());
6149 SecondInfo::getTombstoneKey());
6154 SecondInfo::getHashValue(Val.
EdgeIdx));
6175 ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
6186 return R.VectorizableTree[0].get();
6190 return {&
N->UserTreeIndex,
N->Container};
6194 return {&
N->UserTreeIndex + 1,
N->Container};
6221 static unsigned size(
BoUpSLP *R) {
return R->VectorizableTree.size(); }
6233 OS << Entry->Idx <<
".\n";
6236 for (
auto *V : Entry->Scalars) {
6238 if (
llvm::any_of(R->ExternalUses, [&](
const BoUpSLP::ExternalUser &EU) {
6239 return EU.Scalar == V;
6249 if (Entry->isGather())
6251 if (Entry->State == TreeEntry::ScatterVectorize ||
6252 Entry->State == TreeEntry::StridedVectorize ||
6253 Entry->State == TreeEntry::CompressVectorize)
6254 return "color=blue";
6261 for (
auto *
I : DeletedInstructions) {
6262 if (!
I->getParent()) {
6267 I->insertBefore(F->getEntryBlock(),
6268 F->getEntryBlock().getFirstNonPHIIt());
6270 I->insertBefore(F->getEntryBlock().getTerminator()->getIterator());
6273 for (
Use &U :
I->operands()) {
6275 if (
Op && !DeletedInstructions.count(
Op) &&
Op->hasOneUser() &&
6279 I->dropAllReferences();
6281 for (
auto *
I : DeletedInstructions) {
6283 "trying to erase instruction with users.");
6284 I->eraseFromParent();
6290#ifdef EXPENSIVE_CHECKS
6301 assert(!Mask.empty() && Reuses.
size() == Mask.size() &&
6302 "Expected non-empty mask.");
6305 for (
unsigned I = 0,
E = Prev.
size();
I <
E; ++
I)
6307 Reuses[Mask[
I]] = Prev[
I];
6315 bool BottomOrder =
false) {
6316 assert(!Mask.empty() &&
"Expected non-empty mask.");
6317 unsigned Sz = Mask.size();
6320 if (Order.
empty()) {
6322 std::iota(PrevOrder.
begin(), PrevOrder.
end(), 0);
6324 PrevOrder.
swap(Order);
6327 for (
unsigned I = 0;
I < Sz; ++
I)
6329 Order[
I] = PrevOrder[Mask[
I]];
6331 return Data.value() == Sz ||
Data.index() ==
Data.value();
6340 if (Order.
empty()) {
6342 std::iota(MaskOrder.
begin(), MaskOrder.
end(), 0);
6352 for (
unsigned I = 0;
I < Sz; ++
I)
6354 Order[MaskOrder[
I]] =
I;
6358std::optional<BoUpSLP::OrdersType>
6360 bool TopToBottom,
bool IgnoreReorder) {
6361 assert(TE.isGather() &&
"Expected gather node only.");
6365 Type *ScalarTy = GatheredScalars.
front()->getType();
6366 size_t NumScalars = GatheredScalars.
size();
6368 return std::nullopt;
6375 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
6377 isGatherShuffledEntry(&TE, GatheredScalars, Mask, Entries, NumParts,
6380 if (GatherShuffles.
empty() && ExtractShuffles.
empty())
6381 return std::nullopt;
6382 OrdersType CurrentOrder(NumScalars, NumScalars);
6383 if (GatherShuffles.
size() == 1 &&
6385 Entries.
front().front()->isSame(TE.Scalars)) {
6389 return std::nullopt;
6391 if (Entries.
front().front()->UserTreeIndex.UserTE ==
6392 TE.UserTreeIndex.UserTE)
6393 return std::nullopt;
6396 if (!IgnoreReorder && Entries.
front().front()->Idx == 0)
6397 return std::nullopt;
6400 if (!Entries.
front().front()->ReuseShuffleIndices.empty() &&
6401 TE.getVectorFactor() == 2 && Mask.size() == 2 &&
6404 return P.value() % 2 != static_cast<int>(P.index()) % 2;
6406 return std::nullopt;
6410 std::iota(CurrentOrder.
begin(), CurrentOrder.
end(), 0);
6411 return CurrentOrder;
6415 return all_of(Mask, [&](
int I) {
6422 if ((ExtractShuffles.
empty() && IsSplatMask(Mask) &&
6423 (Entries.
size() != 1 ||
6424 Entries.
front().front()->ReorderIndices.empty())) ||
6425 (GatherShuffles.
empty() && IsSplatMask(ExtractMask)))
6426 return std::nullopt;
6432 if (ShuffledSubMasks.
test(
I))
6434 const int VF = GetVF(
I);
6442 ShuffledSubMasks.
set(
I);
6446 int FirstMin = INT_MAX;
6447 int SecondVecFound =
false;
6449 int Idx = Mask[
I * PartSz + K];
6451 Value *V = GatheredScalars[
I * PartSz + K];
6453 SecondVecFound =
true;
6462 SecondVecFound =
true;
6466 FirstMin = (FirstMin / PartSz) * PartSz;
6468 if (SecondVecFound) {
6470 ShuffledSubMasks.
set(
I);
6474 int Idx = Mask[
I * PartSz + K];
6478 if (Idx >= PartSz) {
6479 SecondVecFound =
true;
6482 if (CurrentOrder[
I * PartSz + Idx] >
6483 static_cast<unsigned>(
I * PartSz + K) &&
6484 CurrentOrder[
I * PartSz + Idx] !=
6485 static_cast<unsigned>(
I * PartSz + Idx))
6486 CurrentOrder[
I * PartSz + Idx] =
I * PartSz + K;
6489 if (SecondVecFound) {
6491 ShuffledSubMasks.
set(
I);
6497 if (!ExtractShuffles.
empty())
6498 TransformMaskToOrder(
6499 CurrentOrder, ExtractMask, PartSz, NumParts, [&](
unsigned I) {
6500 if (!ExtractShuffles[
I])
6503 unsigned Sz =
getNumElems(TE.getVectorFactor(), PartSz,
I);
6505 int K =
I * PartSz + Idx;
6508 if (!TE.ReuseShuffleIndices.empty())
6509 K = TE.ReuseShuffleIndices[K];
6512 if (!TE.ReorderIndices.empty())
6513 K = std::distance(TE.ReorderIndices.begin(),
6514 find(TE.ReorderIndices, K));
6520 .getKnownMinValue());
6525 if (GatherShuffles.
size() == 1 && NumParts != 1) {
6526 if (ShuffledSubMasks.
any())
6527 return std::nullopt;
6528 PartSz = NumScalars;
6531 if (!Entries.
empty())
6532 TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](
unsigned I) {
6533 if (!GatherShuffles[
I])
6535 return std::max(Entries[
I].front()->getVectorFactor(),
6536 Entries[
I].back()->getVectorFactor());
6538 unsigned NumUndefs =
count(CurrentOrder, NumScalars);
6539 if (ShuffledSubMasks.
all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2))
6540 return std::nullopt;
6541 return std::move(CurrentOrder);
6546 bool CompareOpcodes =
true) {
6552 return (!GEP1 || GEP1->getNumOperands() == 2) &&
6553 (!GEP2 || GEP2->getNumOperands() == 2) &&
6554 (((!GEP1 ||
isConstant(GEP1->getOperand(1))) &&
6555 (!GEP2 ||
isConstant(GEP2->getOperand(1)))) ||
6558 getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)));
6562template <
typename T>
6567 return CommonAlignment;
6573 "Order is empty. Please check it before using isReverseOrder.");
6574 unsigned Sz = Order.
size();
6576 return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
6597 "Coeffs vector needs to be of correct size");
6599 const SCEV *PtrSCEVLowest =
nullptr;
6600 const SCEV *PtrSCEVHighest =
nullptr;
6603 for (
Value *Ptr : PointerOps) {
6608 if (!PtrSCEVLowest && !PtrSCEVHighest) {
6609 PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
6616 PtrSCEVLowest = PtrSCEV;
6623 PtrSCEVHighest = PtrSCEV;
6631 int Size =
DL.getTypeStoreSize(ElemTy);
6632 auto TryGetStride = [&](
const SCEV *Dist,
6633 const SCEV *Multiplier) ->
const SCEV * {
6635 if (M->getOperand(0) == Multiplier)
6636 return M->getOperand(1);
6637 if (M->getOperand(1) == Multiplier)
6638 return M->getOperand(0);
6641 if (Multiplier == Dist)
6646 const SCEV *Stride =
nullptr;
6647 if (
Size != 1 || SCEVs.
size() > 2) {
6649 Stride = TryGetStride(Dist, Sz);
6657 using DistOrdPair = std::pair<int64_t, int>;
6659 std::set<DistOrdPair,
decltype(Compare)> Offsets(Compare);
6661 bool IsConsecutive =
true;
6662 for (
const auto [Idx, PtrSCEV] :
enumerate(SCEVs)) {
6664 if (PtrSCEV != PtrSCEVLowest) {
6666 const SCEV *Coeff = TryGetStride(Diff, Stride);
6672 Coeffs[Idx] = (int64_t)SC->getAPInt().getLimitedValue();
6677 Dist = SC->getAPInt().getZExtValue();
6684 auto Res = Offsets.emplace(Dist, Cnt);
6688 IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
6691 if (Offsets.size() != SCEVs.
size())
6693 SortedIndices.
clear();
6694 if (!IsConsecutive) {
6698 for (
const std::pair<int64_t, int> &Pair : Offsets) {
6699 SortedIndices[Cnt] = Pair.second;
6706static std::pair<InstructionCost, InstructionCost>
6725 return TTI.getShuffleCost(Kind, DstTy, Tp, Mask,
CostKind, Index, SubTp,
6727 int NumSrcElts = Tp->getElementCount().getKnownMinValue();
6730 Mask, NumSrcElts, NumSubElts, Index)) {
6731 if (Index + NumSubElts > NumSrcElts &&
6732 Index + NumSrcElts <=
static_cast<int>(
Mask.size()))
6736 return TTI.getShuffleCost(Kind, DstTy, Tp, Mask,
CostKind, Index, SubTp,
6749 "ScalableVectorType is not supported.");
6752 "Incorrect usage.");
6757 unsigned ScalarTyNumElements = VecTy->getNumElements();
6760 if (!DemandedElts[
I])
6764 I * ScalarTyNumElements, VecTy);
6767 I * ScalarTyNumElements, VecTy);
6771 return TTI.getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
6780 ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) {
6781 if (Opcode == Instruction::ExtractElement) {
6787 Index * VecTy->getNumElements(), VecTy);
6790 return TTI.getVectorInstrCost(Opcode, Val,
CostKind, Index, Scalar,
6803 getWidenedType(VecTy->getElementType(), ScalarTy->getNumElements());
6805 Index * ScalarTy->getNumElements(), SubTp) +
6809 return TTI.getExtractWithExtendCost(Opcode, Dst, VecTy, Index,
CostKind);
6825 auto *Begin = std::next(
Mask.begin(), Index);
6826 std::iota(Begin, std::next(Begin, SubVecVF), 0);
6827 Vec = Builder.CreateShuffleVector(V, Mask);
6830 std::iota(
Mask.begin(),
Mask.end(), 0);
6831 std::iota(std::next(
Mask.begin(), Index),
6832 std::next(
Mask.begin(), Index + SubVecVF), VecVF);
6834 return Generator(Vec, V, Mask);
6837 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), SubVecVF), 0);
6838 V = Builder.CreateShuffleVector(V, ResizeMask);
6840 return Builder.CreateShuffleVector(Vec, V, Mask);
6845 unsigned SubVecVF,
unsigned Index) {
6847 std::iota(Mask.begin(), Mask.end(), Index);
6848 return Builder.CreateShuffleVector(Vec, Mask);
6858 const unsigned Sz = PointerOps.
size();
6861 CompressMask[0] = 0;
6863 std::optional<unsigned> Stride = 0;
6866 Value *Ptr = Order.
empty() ? PointerOps[
I] : PointerOps[Order[
I]];
6867 std::optional<int64_t> OptPos =
6869 if (!OptPos || OptPos > std::numeric_limits<unsigned>::max())
6871 unsigned Pos =
static_cast<unsigned>(*OptPos);
6872 CompressMask[
I] = Pos;
6879 if (Pos != *Stride *
I)
6882 return Stride.has_value();
6895 InterleaveFactor = 0;
6897 const size_t Sz = VL.
size();
6905 if (AreAllUsersVectorized(V))
6908 TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy,
CostKind,
6909 Mask.empty() ?
I : Mask[
I]);
6912 if (ExtractCost <= ScalarCost)
6917 if (Order.
empty()) {
6918 Ptr0 = PointerOps.
front();
6919 PtrN = PointerOps.
back();
6921 Ptr0 = PointerOps[Order.
front()];
6922 PtrN = PointerOps[Order.
back()];
6924 std::optional<int64_t> Diff =
6928 const size_t MaxRegSize =
6932 if (*Diff / Sz >= MaxRegSize / 8)
6936 Align CommonAlignment = LI->getAlign();
6938 Ptr0, LoadVecTy, CommonAlignment,
DL,
6941 if (IsMasked && !
TTI.isLegalMaskedLoad(LoadVecTy, CommonAlignment,
6942 LI->getPointerAddressSpace()))
6948 assert(CompressMask.
size() >= 2 &&
"At least two elements are required");
6952 auto [ScalarGEPCost, VectorGEPCost] =
6954 Instruction::GetElementPtr,
CostKind, ScalarTy, LoadVecTy);
6971 LoadCost =
TTI.getMemIntrinsicInstrCost(
6974 LI->getPointerAddressSpace()),
6978 TTI.getMemoryOpCost(Instruction::Load, LoadVecTy, CommonAlignment,
6979 LI->getPointerAddressSpace(),
CostKind);
6981 if (IsStrided && !IsMasked && Order.
empty()) {
6988 AlignedLoadVecTy = LoadVecTy;
6989 if (
TTI.isLegalInterleavedAccessType(AlignedLoadVecTy, CompressMask[1],
6991 LI->getPointerAddressSpace())) {
6993 VectorGEPCost +
TTI.getInterleavedMemoryOpCost(
6994 Instruction::Load, AlignedLoadVecTy,
6995 CompressMask[1], {}, CommonAlignment,
6996 LI->getPointerAddressSpace(),
CostKind, IsMasked);
6997 if (InterleavedCost < GatherCost) {
6998 InterleaveFactor = CompressMask[1];
6999 LoadVecTy = AlignedLoadVecTy;
7006 if (!Order.
empty()) {
7009 NewMask[
I] = CompressMask[Mask[
I]];
7011 CompressMask.
swap(NewMask);
7013 InstructionCost TotalVecCost = VectorGEPCost + LoadCost + CompressCost;
7014 return TotalVecCost < GatherCost;
7027 unsigned InterleaveFactor;
7031 AreAllUsersVectorized, IsMasked, InterleaveFactor,
7032 CompressMask, LoadVecTy);
7049 Align Alignment,
const int64_t Diff,
7050 const size_t Sz)
const {
7051 if (Diff % (Sz - 1) != 0)
7055 auto IsAnyPointerUsedOutGraph =
any_of(PointerOps, [&](
Value *V) {
7057 return !isVectorized(U) && !MustGather.contains(U);
7061 const uint64_t AbsoluteDiff = std::abs(Diff);
7063 if (IsAnyPointerUsedOutGraph ||
7064 (AbsoluteDiff > Sz &&
7067 AbsoluteDiff % Sz == 0 &&
has_single_bit(AbsoluteDiff / Sz)))) ||
7068 Diff == -(
static_cast<int64_t
>(Sz) - 1)) {
7069 int64_t Stride = Diff /
static_cast<int64_t
>(Sz - 1);
7070 if (Diff != Stride *
static_cast<int64_t
>(Sz - 1))
7072 if (!TTI->isLegalStridedLoadStore(VecTy, Alignment))
7082 Value *Ptr0,
Value *PtrN, StridedPtrInfo &SPtrInfo)
const {
7083 const size_t Sz = PointerOps.
size();
7091 SortedIndices.
empty() ? PointerOps[
I] : PointerOps[SortedIndices[
I]];
7092 std::optional<int64_t>
Offset =
7094 assert(
Offset &&
"sortPtrAccesses should have validated this pointer");
7095 SortedOffsetsFromBase[
I] = *
Offset;
7112 int64_t StrideWithinGroup =
7113 SortedOffsetsFromBase[1] - SortedOffsetsFromBase[0];
7116 auto IsEndOfGroupIndex = [=, &SortedOffsetsFromBase](
unsigned Idx) {
7117 return SortedOffsetsFromBase[Idx] - SortedOffsetsFromBase[Idx - 1] !=
7122 unsigned GroupSize = FoundIt != Indices.end() ? *FoundIt : Sz;
7124 unsigned VecSz = Sz;
7125 Type *NewScalarTy = ScalarTy;
7129 bool NeedsWidening = Sz != GroupSize;
7130 if (NeedsWidening) {
7131 if (Sz % GroupSize != 0)
7134 if (StrideWithinGroup != 1)
7136 VecSz = Sz / GroupSize;
7139 DL->getTypeSizeInBits(ScalarTy).getFixedValue() * GroupSize);
7142 if (!
isStridedLoad(PointerOps, NewScalarTy, Alignment, Diff, VecSz))
7145 int64_t StrideIntVal = StrideWithinGroup;
7146 if (NeedsWidening) {
7149 unsigned CurrentGroupStartIdx = GroupSize;
7150 int64_t StrideBetweenGroups =
7151 SortedOffsetsFromBase[GroupSize] - SortedOffsetsFromBase[0];
7152 StrideIntVal = StrideBetweenGroups;
7153 for (; CurrentGroupStartIdx < Sz; CurrentGroupStartIdx += GroupSize) {
7154 if (SortedOffsetsFromBase[CurrentGroupStartIdx] -
7155 SortedOffsetsFromBase[CurrentGroupStartIdx - GroupSize] !=
7156 StrideBetweenGroups)
7160 auto CheckGroup = [=](
const unsigned StartIdx) ->
bool {
7163 unsigned GroupEndIdx = FoundIt != Indices.end() ? *FoundIt : Sz;
7164 return GroupEndIdx - StartIdx == GroupSize;
7166 for (
unsigned I = 0;
I < Sz;
I += GroupSize) {
7172 Type *StrideTy = DL->getIndexType(Ptr0->
getType());
7181 StridedPtrInfo &SPtrInfo)
const {
7187 OffsetToPointerOpIdxMap;
7188 for (
auto [Idx, Ptr] :
enumerate(PointerOps)) {
7189 const SCEV *PtrSCEV = SE->getSCEV(Ptr);
7201 Offset = SC->getAPInt().getSExtValue();
7205 OffsetToPointerOpIdxMap[
Offset].first.push_back(Ptr);
7206 OffsetToPointerOpIdxMap[
Offset].second.push_back(Idx);
7208 unsigned NumOffsets = OffsetToPointerOpIdxMap.
size();
7212 const unsigned Sz = PointerOps.
size();
7213 unsigned VecSz = Sz;
7214 Type *NewScalarTy = ScalarTy;
7215 if (NumOffsets > 1) {
7216 if (Sz % NumOffsets != 0)
7218 VecSz = Sz / NumOffsets;
7221 DL->getTypeSizeInBits(ScalarTy).getFixedValue() * NumOffsets);
7224 if (Sz <= MinProfitableStridedLoads || !TTI->isTypeLegal(StridedLoadTy) ||
7225 !TTI->isLegalStridedLoadStore(StridedLoadTy, CommonAlignment))
7231 for (
auto [Idx, MapPair] :
enumerate(OffsetToPointerOpIdxMap)) {
7232 if (MapPair.second.first.size() != VecSz)
7234 SortedOffsetsV[Idx] = MapPair.first;
7236 sort(SortedOffsetsV);
7238 if (NumOffsets > 1) {
7240 if (SortedOffsetsV[
I] - SortedOffsetsV[
I - 1] != 1)
7313 auto UpdateSortedIndices =
7316 if (SortedIndicesForOffset.
empty()) {
7317 SortedIndicesForOffset.
resize(IndicesInAllPointerOps.
size());
7318 std::iota(SortedIndicesForOffset.
begin(),
7319 SortedIndicesForOffset.
end(), 0);
7321 for (
const auto [Num, Idx] :
enumerate(SortedIndicesForOffset)) {
7322 SortedIndicesDraft[Num * NumOffsets + OffsetNum] =
7323 IndicesInAllPointerOps[Idx];
7327 int64_t LowestOffset = SortedOffsetsV[0];
7333 SortedIndicesForOffset0, Coeffs0);
7336 unsigned NumCoeffs0 = Coeffs0.
size();
7337 if (NumCoeffs0 * NumOffsets != Sz)
7342 OffsetToPointerOpIdxMap[LowestOffset].second;
7343 UpdateSortedIndices(SortedIndicesForOffset0, IndicesInAllPointerOps0, 0);
7349 for (
int J :
seq<int>(1, NumOffsets)) {
7352 SortedIndicesForOffset.
clear();
7354 int64_t
Offset = SortedOffsetsV[J];
7356 OffsetToPointerOpIdxMap[
Offset].first;
7358 OffsetToPointerOpIdxMap[
Offset].second;
7359 const SCEV *StrideWithinGroup =
7361 SortedIndicesForOffset, Coeffs);
7363 if (!StrideWithinGroup || StrideWithinGroup != Stride0)
7365 if (Coeffs.
size() != NumCoeffs0)
7368 if (Coeffs != Coeffs0)
7371 UpdateSortedIndices(SortedIndicesForOffset, IndicesInAllPointerOps, J);
7374 SortedIndices.
clear();
7375 SortedIndices = SortedIndicesDraft;
7376 SPtrInfo.StrideSCEV = Stride0;
7377 SPtrInfo.Ty = StridedLoadTy;
7384 unsigned *BestVF,
bool TryRecursiveCheck)
const {
7397 if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy))
7403 const size_t Sz = VL.
size();
7405 auto *POIter = PointerOps.
begin();
7406 for (
Value *V : VL) {
7408 if (!L || !L->isSimple())
7410 *POIter = L->getPointerOperand();
7416 bool IsSorted =
sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, Order);
7425 if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
7426 TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
7437 if (Order.
empty()) {
7438 Ptr0 = PointerOps.
front();
7439 PtrN = PointerOps.
back();
7441 Ptr0 = PointerOps[Order.
front()];
7442 PtrN = PointerOps[Order.
back()];
7447 std::optional<int64_t> Diff0 =
7449 std::optional<int64_t> DiffN =
7452 "sortPtrAccesses should have validated these pointers");
7453 int64_t Diff = *DiffN - *Diff0;
7455 if (
static_cast<uint64_t>(Diff) == Sz - 1)
7458 *TLI, [&](
Value *V) {
7459 return areAllUsersVectorized(
7467 Diff, Ptr0, PtrN, SPtrInfo))
7470 if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
7471 TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
7476 auto CheckForShuffledLoads = [&, &TTI = *TTI](
Align CommonAlignment,
7478 bool ProfitableGatherPointers) {
7483 auto [ScalarGEPCost, VectorGEPCost] =
7485 Instruction::GetElementPtr,
CostKind, ScalarTy, VecTy);
7489 Type *PtrScalarTy = PointerOps.
front()->getType()->getScalarType();
7491 if (
static_cast<unsigned>(
count_if(
7510 return C + TTI.getInstructionCost(
7516 TTI.getMemIntrinsicInstrCost(
7519 false, CommonAlignment),
7521 (ProfitableGatherPointers ? 0 : VectorGEPCost);
7529 constexpr unsigned ListLimit = 4;
7530 if (!TryRecursiveCheck || VL.
size() < ListLimit)
7539 unsigned Sz = DL->getTypeSizeInBits(ScalarTy);
7549 for (
unsigned Cnt = 0, End = VL.
size(); Cnt + VF <= End; Cnt += VF) {
7554 PointerOps, SPtrInfo, BestVF,
7562 DemandedElts.
setBits(Cnt, Cnt + VF);
7578 if (!DemandedElts.
isZero()) {
7584 if (DemandedElts[Idx])
7595 LI0->getPointerOperand(),
7596 Instruction::GetElementPtr,
CostKind, ScalarTy,
7600 if (
static_cast<unsigned>(
7602 PointerOps.
size() - 1 ||
7621 TTI.getMemoryOpCost(Instruction::Load, SubVecTy, LI0->getAlign(),
7622 LI0->getPointerAddressSpace(),
CostKind,
7627 VecLdCost += TTI.getMemIntrinsicInstrCost(
7629 Intrinsic::experimental_vp_strided_load,
7630 SubVecTy, LI0->getPointerOperand(),
7631 false, CommonAlignment),
7636 VecLdCost += TTI.getMemIntrinsicInstrCost(
7638 Intrinsic::masked_load, SubVecTy,
7639 CommonAlignment, LI0->getPointerAddressSpace()),
7645 VecLdCost += TTI.getMemIntrinsicInstrCost(
7647 Intrinsic::masked_gather, SubVecTy,
7648 LI0->getPointerOperand(),
7649 false, CommonAlignment),
7659 ShuffleMask[Idx] = Idx / VF ==
I ? VL.
size() + Idx % VF : Idx;
7668 if (MaskedGatherCost >= VecLdCost &&
7681 bool ProfitableGatherPointers =
7682 L && Sz > 2 &&
static_cast<unsigned>(
count_if(PointerOps, [L](
Value *V) {
7683 return L->isLoopInvariant(V);
7685 if (ProfitableGatherPointers ||
all_of(PointerOps, [](
Value *
P) {
7688 (
GEP &&
GEP->getNumOperands() == 2 &&
7696 if (!TryRecursiveCheck || !CheckForShuffledLoads(CommonAlignment, BestVF,
7697 ProfitableGatherPointers))
7709 all_of(VL, [](
const Value *V) {
return V->getType()->isPointerTy(); }) &&
7710 "Expected list of pointer operands.");
7715 std::pair<BasicBlock *, Value *>,
7719 .try_emplace(std::make_pair(
7723 SortedIndices.
clear();
7725 auto Key = std::make_pair(BBs[Cnt + 1],
7727 bool Found =
any_of(Bases.try_emplace(
Key).first->second,
7728 [&, &Cnt = Cnt, &Ptr = Ptr](
auto &
Base) {
7729 std::optional<int64_t> Diff =
7730 getPointersDiff(ElemTy, std::get<0>(Base.front()),
7731 ElemTy, Ptr, DL, SE,
7736 Base.emplace_back(Ptr, *Diff, Cnt + 1);
7742 if (Bases.size() > VL.
size() / 2 - 1)
7746 Bases.find(
Key)->second.emplace_back().emplace_back(Ptr, 0, Cnt + 1);
7750 if (Bases.size() == VL.
size())
7753 if (Bases.size() == 1 && (Bases.front().second.size() == 1 ||
7754 Bases.front().second.size() == VL.
size()))
7759 auto ComparePointers = [](
Value *Ptr1,
Value *Ptr2) {
7768 FirstPointers.
insert(P1);
7769 SecondPointers.
insert(P2);
7775 "Unable to find matching root.");
7778 for (
auto &
Base : Bases) {
7779 for (
auto &Vec :
Base.second) {
7780 if (Vec.size() > 1) {
7782 int64_t InitialOffset = std::get<1>(Vec[0]);
7783 bool AnyConsecutive =
7785 return std::get<1>(
P.value()) ==
7786 int64_t(
P.index()) + InitialOffset;
7790 if (!AnyConsecutive)
7795 return ComparePointers(std::get<0>(V1.front()), std::get<0>(V2.front()));
7799 for (
auto &
T : Bases)
7800 for (
const auto &Vec :
T.second)
7801 for (
const auto &
P : Vec)
7805 "Expected SortedIndices to be the size of VL");
7809std::optional<BoUpSLP::OrdersType>
7811 assert(TE.isGather() &&
"Expected gather node only.");
7812 Type *ScalarTy = TE.Scalars[0]->getType();
7815 Ptrs.
reserve(TE.Scalars.size());
7817 BBs.
reserve(TE.Scalars.size());
7818 for (
Value *V : TE.Scalars) {
7820 if (!L || !L->isSimple())
7821 return std::nullopt;
7827 if (!LoadEntriesToVectorize.contains(TE.Idx) &&
7829 return std::move(Order);
7830 return std::nullopt;
7841 if (VU->
getType() != V->getType())
7844 if (!VU->
hasOneUse() && !V->hasOneUse())
7850 if (Idx1 == std::nullopt || Idx2 == std::nullopt)
7857 bool IsReusedIdx =
false;
7859 if (IE2 == VU && !IE1)
7861 if (IE1 == V && !IE2)
7862 return V->hasOneUse();
7863 if (IE1 && IE1 != V) {
7865 IsReusedIdx |= ReusedIdx.
test(Idx1);
7866 ReusedIdx.
set(Idx1);
7867 if ((IE1 != VU && !IE1->
hasOneUse()) || IsReusedIdx)
7872 if (IE2 && IE2 != VU) {
7874 IsReusedIdx |= ReusedIdx.
test(Idx2);
7875 ReusedIdx.
set(Idx2);
7876 if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
7881 }
while (!IsReusedIdx && (IE1 || IE2));
7891std::optional<BoUpSLP::OrdersType>
7893 bool IgnoreReorder) {
7896 if (!TE.ReuseShuffleIndices.empty()) {
7898 assert(!TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI) &&
7899 "Reshuffling scalars not yet supported for nodes with padding");
7902 return std::nullopt;
7910 unsigned Sz = TE.Scalars.size();
7911 if (TE.isGather()) {
7912 if (std::optional<OrdersType> CurrentOrder =
7917 ::addMask(Mask, TE.ReuseShuffleIndices);
7918 OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor());
7919 unsigned Sz = TE.Scalars.size();
7920 for (
int K = 0, E = TE.getVectorFactor() / Sz; K < E; ++K) {
7923 Res[Idx + K * Sz] =
I + K * Sz;
7925 return std::move(Res);
7928 if (Sz == 2 && TE.getVectorFactor() == 4 &&
7930 2 * TE.getVectorFactor())) == 1)
7931 return std::nullopt;
7932 if (TE.ReuseShuffleIndices.size() % Sz != 0)
7933 return std::nullopt;
7937 if (TE.ReorderIndices.empty())
7938 std::iota(ReorderMask.
begin(), ReorderMask.
end(), 0);
7941 ::addMask(ReorderMask, TE.ReuseShuffleIndices);
7942 unsigned VF = ReorderMask.
size();
7946 for (
unsigned I = 0;
I < VF;
I += Sz) {
7948 unsigned UndefCnt = 0;
7949 unsigned Limit = std::min(Sz, VF -
I);
7958 Val >=
static_cast<int>(NumParts) || UsedVals.
test(Val) ||
7960 return std::nullopt;
7962 for (
unsigned K = 0; K < NumParts; ++K) {
7963 unsigned Idx = Val + Sz * K;
7964 if (Idx < VF &&
I + K < VF)
7965 ResOrder[Idx] =
I + K;
7968 return std::move(ResOrder);
7970 unsigned VF = TE.getVectorFactor();
7973 TE.ReuseShuffleIndices.end());
7974 if (TE.hasState() && TE.getOpcode() == Instruction::ExtractElement &&
7976 if (isa<PoisonValue>(V))
7978 std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V));
7979 return Idx && *Idx < Sz;
7981 assert(!TE.isAltShuffle() &&
"Alternate instructions are only supported "
7982 "by BinaryOperator and CastInst.");
7984 if (TE.ReorderIndices.empty())
7985 std::iota(ReorderMask.
begin(), ReorderMask.
end(), 0);
7988 for (
unsigned I = 0;
I < VF; ++
I) {
7989 int &Idx = ReusedMask[
I];
7992 Value *V = TE.Scalars[ReorderMask[Idx]];
7994 Idx = std::distance(ReorderMask.
begin(),
find(ReorderMask, *EI));
8000 std::iota(ResOrder.
begin(), ResOrder.
end(), 0);
8001 auto *It = ResOrder.
begin();
8002 for (
unsigned K = 0; K < VF; K += Sz) {
8006 std::iota(SubMask.
begin(), SubMask.
end(), 0);
8008 transform(CurrentOrder, It, [K](
unsigned Pos) {
return Pos + K; });
8009 std::advance(It, Sz);
8012 return Data.index() ==
Data.value();
8014 return std::nullopt;
8015 return std::move(ResOrder);
8017 if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
8018 (!TE.UserTreeIndex || !TE.UserTreeIndex.UserTE->hasState() ||
8020 (TE.ReorderIndices.empty() ||
isReverseOrder(TE.ReorderIndices)))
8021 return std::nullopt;
8022 if (TE.State == TreeEntry::SplitVectorize ||
8023 ((TE.State == TreeEntry::Vectorize ||
8024 TE.State == TreeEntry::StridedVectorize ||
8025 TE.State == TreeEntry::CompressVectorize) &&
8028 assert((TE.State == TreeEntry::SplitVectorize || !TE.isAltShuffle()) &&
8029 "Alternate instructions are only supported by "
8030 "BinaryOperator and CastInst.");
8031 return TE.ReorderIndices;
8033 if (!TopToBottom && IgnoreReorder && TE.State == TreeEntry::Vectorize &&
8034 TE.isAltShuffle()) {
8035 assert(TE.ReuseShuffleIndices.empty() &&
8036 "ReuseShuffleIndices should be "
8037 "empty for alternate instructions.");
8039 TE.buildAltOpShuffleMask(
8041 assert(TE.getMatchingMainOpOrAltOp(
I) &&
8042 "Unexpected main/alternate opcode");
8046 const int VF = TE.getVectorFactor();
8051 ResOrder[Mask[
I] % VF] =
I;
8053 return std::move(ResOrder);
8055 if (!TE.ReorderIndices.empty())
8056 return TE.ReorderIndices;
8057 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
8058 if (!TE.ReorderIndices.empty())
8059 return TE.ReorderIndices;
8062 for (
auto [
I, V] :
zip(UserBVHead, TE.Scalars)) {
8070 while (
II &&
II->hasOneUse() &&
II->getParent() == BB) {
8078 assert(BB1 != BB2 &&
"Expected different basic blocks.");
8079 if (!DT->isReachableFromEntry(BB1))
8081 if (!DT->isReachableFromEntry(BB2))
8083 auto *NodeA = DT->getNode(BB1);
8084 auto *NodeB = DT->getNode(BB2);
8085 assert(NodeA &&
"Should only process reachable instructions");
8086 assert(NodeB &&
"Should only process reachable instructions");
8087 assert((NodeA == NodeB) ==
8088 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
8089 "Different nodes should have different DFS numbers");
8090 return NodeA->getDFSNumIn() < NodeB->getDFSNumIn();
8092 auto PHICompare = [&](
unsigned I1,
unsigned I2) {
8093 Value *V1 = TE.Scalars[I1];
8094 Value *V2 = TE.Scalars[I2];
8107 if (FirstUserOfPhi1->getParent() != FirstUserOfPhi2->getParent())
8108 return CompareByBasicBlocks(FirstUserOfPhi1->getParent(),
8109 FirstUserOfPhi2->getParent());
8119 if (UserBVHead[I1] && !UserBVHead[I2])
8121 if (!UserBVHead[I1])
8123 if (UserBVHead[I1] == UserBVHead[I2])
8126 return CompareByBasicBlocks(UserBVHead[I1]->
getParent(),
8128 return UserBVHead[I1]->comesBefore(UserBVHead[I2]);
8141 if (EE1->getOperand(0) == EE2->getOperand(0))
8143 if (!Inst1 && Inst2)
8145 if (Inst1 && Inst2) {
8153 "Expected either instructions or arguments vector operands.");
8154 return P1->getArgNo() < P2->getArgNo();
8159 std::iota(Phis.
begin(), Phis.
end(), 0);
8162 return std::nullopt;
8163 return std::move(Phis);
8165 if (TE.isGather() &&
8166 (!TE.hasState() || !TE.isAltShuffle() ||
8167 ScalarsInSplitNodes.contains(TE.getMainOp())) &&
8171 if (((TE.hasState() && TE.getOpcode() == Instruction::ExtractElement) ||
8175 auto *EE = dyn_cast<ExtractElementInst>(V);
8176 return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
8182 canReuseExtract(TE.Scalars, CurrentOrder,
true);
8183 if (Reuse || !CurrentOrder.
empty())
8184 return std::move(CurrentOrder);
8192 int Sz = TE.Scalars.size();
8196 if (It == TE.Scalars.begin())
8199 if (It != TE.Scalars.end()) {
8201 unsigned Idx = std::distance(TE.Scalars.begin(), It);
8216 if (InsertFirstCost + PermuteCost < InsertIdxCost) {
8219 return std::move(Order);
8224 return std::nullopt;
8225 if (TE.Scalars.size() >= 3)
8230 if (TE.hasState() && TE.getOpcode() == Instruction::Load) {
8232 StridedPtrInfo SPtrInfo;
8235 CurrentOrder, PointerOps, SPtrInfo);
8238 return std::move(CurrentOrder);
8243 if (std::optional<OrdersType> CurrentOrder =
8245 return CurrentOrder;
8247 return std::nullopt;
8257 for (
unsigned I = Sz,
E = Mask.size();
I <
E;
I += Sz) {
8259 if (Cluster != FirstCluster)
8265void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE,
ArrayRef<int> Mask)
const {
8268 const unsigned Sz =
TE.Scalars.size();
8270 if (!
TE.isGather() ||
8277 addMask(NewMask,
TE.ReuseShuffleIndices);
8279 TE.ReorderIndices.clear();
8286 for (
auto *It =
TE.ReuseShuffleIndices.begin(),
8287 *End =
TE.ReuseShuffleIndices.end();
8288 It != End; std::advance(It, Sz))
8289 std::iota(It, std::next(It, Sz), 0);
8295 "Expected same size of orders");
8296 size_t Sz = Order.
size();
8299 if (Order[Idx] != Sz)
8300 UsedIndices.
set(Order[Idx]);
8302 if (SecondaryOrder.
empty()) {
8304 if (Order[Idx] == Sz && !UsedIndices.
test(Idx))
8308 if (SecondaryOrder[Idx] != Sz && Order[Idx] == Sz &&
8309 !UsedIndices.
test(SecondaryOrder[Idx]))
8310 Order[Idx] = SecondaryOrder[Idx];
  constexpr unsigned TinyVF = 2;
  constexpr unsigned TinyTree = 10;
  constexpr unsigned PhiOpsLimit = 12;
  constexpr unsigned GatherLoadsLimit = 2;
  if (VectorizableTree.size() <= TinyTree)
  if (VectorizableTree.front()->hasState() &&
      !VectorizableTree.front()->isGather() &&
      (VectorizableTree.front()->getOpcode() == Instruction::Store ||
       VectorizableTree.front()->getOpcode() == Instruction::PHI ||
       (VectorizableTree.front()->getVectorFactor() <= TinyVF &&
        (VectorizableTree.front()->getOpcode() == Instruction::PtrToInt ||
         VectorizableTree.front()->getOpcode() == Instruction::ICmp))) &&
      VectorizableTree.front()->ReorderIndices.empty()) {
  if (VectorizableTree.front()->hasState() &&
      VectorizableTree.front()->getOpcode() == Instruction::PHI &&
      VectorizableTree.front()->Scalars.size() == TinyVF &&
      VectorizableTree.front()->getNumOperands() > PhiOpsLimit)
  if (VectorizableTree.front()->hasState() &&
      VectorizableTree.front()->getOpcode() == Instruction::Store &&
      VectorizableTree.front()->ReorderIndices.empty()) {
    const unsigned ReorderedSplitsCnt =
        count_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
          return TE->State == TreeEntry::SplitVectorize &&
                 !TE->ReorderIndices.empty() && TE->UserTreeIndex.UserTE &&
                 TE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
    if (ReorderedSplitsCnt <= 1 &&
            VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
              return ((!TE->isGather() &&
                       (TE->ReorderIndices.empty() ||
                        (TE->UserTreeIndex.UserTE &&
                         TE->UserTreeIndex.UserTE->State ==
                             TreeEntry::Vectorize &&
                         !TE->UserTreeIndex.UserTE->ReuseShuffleIndices
                      (TE->isGather() && TE->ReorderIndices.empty() &&
                       (!TE->hasState() || TE->isAltShuffle() ||
                        TE->getOpcode() == Instruction::Load ||
                        TE->getOpcode() == Instruction::ZExt ||
                        TE->getOpcode() == Instruction::SExt))) &&
                     (VectorizableTree.front()->getVectorFactor() > TinyVF ||
                      !TE->isGather() || none_of(TE->Scalars, [&](Value *V) {
                        return !isConstant(V) && isVectorized(V);
            })) >= VectorizableTree.size() - ReorderedSplitsCnt)
  bool HasPhis = false;
  bool HasLoad = true;
  unsigned GatherLoads = 0;
  for (const std::unique_ptr<TreeEntry> &TE :
       ArrayRef(VectorizableTree).drop_front()) {
    if (TE->State == TreeEntry::SplitVectorize)
    if (!TE->hasState()) {
    if (VectorizableTree.front()->Scalars.size() == TinyVF &&
    if (TE->getOpcode() == Instruction::Load && TE->ReorderIndices.empty()) {
      if (!TE->isGather()) {
      if (GatherLoads >= GatherLoadsLimit)
    if (TE->getOpcode() == Instruction::GetElementPtr ||
    if (TE->getOpcode() != Instruction::PHI &&
        (!TE->hasCopyableElements() ||
             TE->Scalars.size() / 2))
    if (VectorizableTree.front()->Scalars.size() == TinyVF &&
        TE->getNumOperands() > PhiOpsLimit)
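// Summary of the heuristic above: reordering is skipped for tiny trees, for
// store/PHI-rooted trees already in identity order, for PHIs with too many
// operands (PhiOpsLimit), and when too many gathered loads
// (GatherLoadsLimit) would need reshuffling; the constants are tuning knobs,
// not hard correctness limits.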
void BoUpSLP::TreeEntry::reorderSplitNode(unsigned Idx, ArrayRef<int> Mask,
  assert(State == TreeEntry::SplitVectorize && "Expected split user node.");
  std::iota(NewMask.begin(), NewMask.end(), 0);
  std::iota(NewMaskOrder.begin(), NewMaskOrder.end(), 0);
    copy(MaskOrder, NewMaskOrder.begin());
    assert(Idx == 1 && "Expected either 0 or 1 index.");
    unsigned Offset = CombinedEntriesWithIndices.back().second;
  ReorderIndices.clear();
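// reorderSplitNode() applies the mask only to the half of the split node
// selected by Idx (0 or 1); Offset marks where the second combined entry
// starts inside the node.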
      ExternalUserReorderMap;
  for_each(VectorizableTree, [&, &TTIRef = *TTI](
                                 const std::unique_ptr<TreeEntry> &TE) {
        findExternalStoreUsersReorderIndices(TE.get());
    if (!ExternalUserReorderIndices.empty()) {
      VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
                          std::move(ExternalUserReorderIndices));
    if (TE->hasState() && TE->isAltShuffle() &&
        TE->State != TreeEntry::SplitVectorize) {
      Type *ScalarTy = TE->Scalars[0]->getType();
      unsigned Opcode0 = TE->getOpcode();
      unsigned Opcode1 = TE->getAltOpcode();
      if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
        VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
    bool IgnoreReorder =
        !UserIgnoreList && VectorizableTree.front()->hasState() &&
        (VectorizableTree.front()->getOpcode() == Instruction::InsertElement ||
         VectorizableTree.front()->getOpcode() == Instruction::Store);
    if (std::optional<OrdersType> CurrentOrder =
      const TreeEntry *UserTE = TE.get();
        if (!UserTE->UserTreeIndex)
        if (UserTE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
            UserTE->UserTreeIndex.UserTE->isAltShuffle() &&
            UserTE->UserTreeIndex.UserTE->Idx != 0)
        UserTE = UserTE->UserTreeIndex.UserTE;
      VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
      if (!(TE->State == TreeEntry::Vectorize ||
            TE->State == TreeEntry::StridedVectorize ||
            TE->State == TreeEntry::SplitVectorize ||
            TE->State == TreeEntry::CompressVectorize) ||
          !TE->ReuseShuffleIndices.empty())
        GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
      if (TE->State == TreeEntry::Vectorize &&
          TE->getOpcode() == Instruction::PHI)
        PhisToOrders.try_emplace(TE.get(), *CurrentOrder);
  for (unsigned VF = VectorizableTree.front()->getVectorFactor();
       !VFToOrderedEntries.empty() && VF > 1; VF -= 2 - (VF & 1U)) {
    auto It = VFToOrderedEntries.find(VF);
    if (It == VFToOrderedEntries.end())
    for (const TreeEntry *OpTE : OrderedEntries) {
      if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE) &&
          OpTE->State != TreeEntry::SplitVectorize)
      const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
        if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty()) {
          auto It = GathersToOrders.find(OpTE);
          if (It != GathersToOrders.end())
        if (OpTE->hasState() && OpTE->isAltShuffle()) {
          auto It = AltShufflesToOrders.find(OpTE);
          if (It != AltShufflesToOrders.end())
        if (OpTE->State == TreeEntry::Vectorize &&
            OpTE->getOpcode() == Instruction::PHI) {
          auto It = PhisToOrders.find(OpTE);
          if (It != PhisToOrders.end())
        return OpTE->ReorderIndices;
      auto It = ExternalUserReorderMap.find(OpTE);
      if (It != ExternalUserReorderMap.end()) {
        const auto &ExternalUserReorderIndices = It->second;
        if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
          OrdersUses.try_emplace(OrdersType(), 0).first->second +=
              ExternalUserReorderIndices.size();
          for (const OrdersType &ExtOrder : ExternalUserReorderIndices)
            ++OrdersUses.try_emplace(ExtOrder, 0).first->second;
      if (OpTE->State == TreeEntry::Vectorize &&
          OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
        assert(!OpTE->isAltShuffle() &&
               "Alternate instructions are only supported by BinaryOperator "
        unsigned E = Order.size();
          return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
        ++OrdersUses.try_emplace(CurrentOrder, 0).first->second;
        ++OrdersUses.try_emplace(Order, 0).first->second;
    if (OrdersUses.empty())
    unsigned IdentityCnt = 0;
    unsigned FilledIdentityCnt = 0;
    for (auto &Pair : OrdersUses) {
        if (!Pair.first.empty())
          FilledIdentityCnt += Pair.second;
        IdentityCnt += Pair.second;
    unsigned Cnt = IdentityCnt;
    for (auto &Pair : OrdersUses) {
      if (Cnt < Pair.second ||
          (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt &&
           Cnt == Pair.second && !BestOrder.empty() &&
        BestOrder = Pair.first;
    unsigned E = BestOrder.size();
      return I < E ? static_cast<int>(I) : PoisonMaskElem;
    for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
      if (TE->Scalars.size() != VF) {
        if (TE->ReuseShuffleIndices.size() == VF) {
          assert(TE->State != TreeEntry::SplitVectorize &&
                 "Split vectorized not expected.");
                  (!TE->UserTreeIndex ||
                   TE->UserTreeIndex.UserTE->Scalars.size() == VF ||
                   TE->UserTreeIndex.UserTE->Scalars.size() ==
                       TE->Scalars.size() ||
                   TE->UserTreeIndex.UserTE->State ==
                       TreeEntry::SplitVectorize) &&
                 "All users must be of VF size.");
          if (TE->UserTreeIndex && TE->UserTreeIndex.UserTE->hasState() &&
          reorderNodeWithReuses(*TE, Mask);
          if (TE->UserTreeIndex &&
              TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize)
            TE->UserTreeIndex.UserTE->reorderSplitNode(
                TE->UserTreeIndex.EdgeIdx, Mask, MaskOrder);
      if ((TE->State == TreeEntry::SplitVectorize &&
           TE->ReuseShuffleIndices.empty()) ||
          ((TE->State == TreeEntry::Vectorize ||
            TE->State == TreeEntry::StridedVectorize ||
            TE->State == TreeEntry::CompressVectorize) &&
        (!TE->isAltShuffle() || (TE->State == TreeEntry::SplitVectorize &&
                                 TE->ReuseShuffleIndices.empty())) &&
            "Alternate instructions are only supported by BinaryOperator "
        TE->reorderOperands(Mask);
        TE->reorderOperands(Mask);
        assert(TE->ReorderIndices.empty() &&
               "Expected empty reorder sequence.");
        if (!TE->ReuseShuffleIndices.empty()) {
          addMask(NewReuses, TE->ReuseShuffleIndices);
          TE->ReuseShuffleIndices.swap(NewReuses);
        } else if (TE->UserTreeIndex &&
                   TE->UserTreeIndex.UserTE->State ==
                       TreeEntry::SplitVectorize)
          TE->UserTreeIndex.UserTE->reorderSplitNode(TE->UserTreeIndex.EdgeIdx,
void BoUpSLP::buildReorderableOperands(
    TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
    if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) {
          return OpData.first == I &&
                 (OpData.second->State == TreeEntry::Vectorize ||
                  OpData.second->State == TreeEntry::StridedVectorize ||
                  OpData.second->State == TreeEntry::CompressVectorize ||
                  OpData.second->State == TreeEntry::SplitVectorize);
    if (UserTE->hasState()) {
      if (UserTE->getOpcode() == Instruction::ExtractElement ||
          UserTE->getOpcode() == Instruction::ExtractValue)
      if (UserTE->getOpcode() == Instruction::InsertElement && I == 0)
      if (UserTE->getOpcode() == Instruction::Store &&
          UserTE->State == TreeEntry::Vectorize && I == 1)
      if (UserTE->getOpcode() == Instruction::Load &&
          (UserTE->State == TreeEntry::Vectorize ||
           UserTE->State == TreeEntry::StridedVectorize ||
           UserTE->State == TreeEntry::CompressVectorize))
    TreeEntry *TE = getOperandEntry(UserTE, I);
    assert(TE && "Expected operand entry.");
    if (!TE->isGather()) {
      Edges.emplace_back(I, TE);
      if (TE->State == TreeEntry::ScatterVectorize &&
          TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
    if (ReorderableGathers.contains(TE))
  struct TreeEntryCompare {
    bool operator()(const TreeEntry *LHS, const TreeEntry *RHS) const {
      if (LHS->UserTreeIndex && RHS->UserTreeIndex)
        return LHS->UserTreeIndex.UserTE->Idx < RHS->UserTreeIndex.UserTE->Idx;
      return LHS->Idx < RHS->Idx;
  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    if (TE->State != TreeEntry::Vectorize &&
        TE->State != TreeEntry::StridedVectorize &&
        TE->State != TreeEntry::CompressVectorize &&
        TE->State != TreeEntry::SplitVectorize)
      NonVectorized.insert(TE.get());
    if (std::optional<OrdersType> CurrentOrder =
      Queue.push(TE.get());
      if (!(TE->State == TreeEntry::Vectorize ||
            TE->State == TreeEntry::StridedVectorize ||
            TE->State == TreeEntry::CompressVectorize ||
            TE->State == TreeEntry::SplitVectorize) ||
          !TE->ReuseShuffleIndices.empty())
        GathersToOrders.insert(TE.get());
  while (!Queue.empty()) {
    std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>>
        Users;
    TreeEntry *TE = Queue.top();
    const TreeEntry *UserTE = TE->UserTreeIndex.UserTE;
    while (!Queue.empty()) {
      if (!UserTE || UserTE != TE->UserTreeIndex.UserTE)
    for (TreeEntry *TE : OrderedOps) {
      if (!(TE->State == TreeEntry::Vectorize ||
            TE->State == TreeEntry::StridedVectorize ||
            TE->State == TreeEntry::CompressVectorize ||
            TE->State == TreeEntry::SplitVectorize ||
            (TE->isGather() && GathersToOrders.contains(TE))) ||
          !TE->UserTreeIndex || !TE->ReuseShuffleIndices.empty() ||
          !Visited.insert(TE).second)
      Users.first = TE->UserTreeIndex.UserTE;
      Users.second.emplace_back(TE->UserTreeIndex.EdgeIdx, TE);
    if (Data.first->State == TreeEntry::SplitVectorize) {
             Data.second.size() <= 2 &&
             "Expected not greater than 2 operands for split vectorize node.");
              [](const auto &Op) { return !Op.second->UserTreeIndex; }))
      assert(Data.first->CombinedEntriesWithIndices.size() == 2 &&
             "Expected exactly 2 entries.");
      for (const auto &P : Data.first->CombinedEntriesWithIndices) {
        TreeEntry &OpTE = *VectorizableTree[P.first];
        if (Order.empty() || !OpTE.ReuseShuffleIndices.empty()) {
          if (!OpTE.isGather() && OpTE.ReuseShuffleIndices.empty())
          const auto BestOrder =
          const unsigned E = Order.size();
            return I < E ? static_cast<int>(I) : PoisonMaskElem;
          Data.first->reorderSplitNode(P.second ? 1 : 0, Mask, MaskOrder);
        if (!OpTE.ReorderIndices.empty()) {
          OpTE.ReorderIndices.clear();
        } else if (!OpTE.ReuseShuffleIndices.empty()) {
          assert(OpTE.isGather() && "Expected only gather/buildvector node.");
      if (Data.first->ReuseShuffleIndices.empty() &&
          !Data.first->ReorderIndices.empty()) {
        Queue.push(Data.first);
    buildReorderableOperands(Data.first, Data.second, NonVectorized,
    for (const auto &Op : Data.second) {
      TreeEntry *OpTE = Op.second;
      if (!VisitedOps.insert(OpTE).second)
      if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
      const auto Order = [&]() -> const OrdersType {
        if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty())
        return OpTE->ReorderIndices;
      if (Order.size() == 1)
      Value *Root = OpTE->hasState()
      auto GetSameNodesUsers = [&](Value *Root) {
        for (const TreeEntry *TE : ValueToGatherNodes.lookup(Root)) {
          if (TE != OpTE && TE->UserTreeIndex &&
              TE->getVectorFactor() == OpTE->getVectorFactor() &&
              TE->Scalars.size() == OpTE->Scalars.size() &&
              ((TE->ReorderIndices.empty() && OpTE->isSame(TE->Scalars)) ||
               (OpTE->ReorderIndices.empty() && TE->isSame(OpTE->Scalars))))
            Res.insert(TE->UserTreeIndex.UserTE);
        for (const TreeEntry *TE : getTreeEntries(Root)) {
          if (TE != OpTE && TE->UserTreeIndex &&
              TE->getVectorFactor() == OpTE->getVectorFactor() &&
              TE->Scalars.size() == OpTE->Scalars.size() &&
              ((TE->ReorderIndices.empty() && OpTE->isSame(TE->Scalars)) ||
               (OpTE->ReorderIndices.empty() && TE->isSame(OpTE->Scalars))))
            Res.insert(TE->UserTreeIndex.UserTE);
      auto GetNumOperands = [](const TreeEntry *TE) {
        if (TE->State == TreeEntry::SplitVectorize)
          return TE->getNumOperands();
          return CI->arg_size();
        return TE->getNumOperands();
      auto NodeShouldBeReorderedWithOperands = [&, TTI = TTI](
                                                   const TreeEntry *TE) {
          const TreeEntry *Op = getOperandEntry(TE, Idx);
          if (Op->isGather() && Op->hasState()) {
            const TreeEntry *VecOp =
                getSameValuesTreeEntry(Op->getMainOp(), Op->Scalars);
          if (Op->ReorderIndices.empty() && Op->ReuseShuffleIndices.empty())
        if (!RevisitedOps.insert(UTE).second)
        return UTE == Data.first || !UTE->ReorderIndices.empty() ||
               !UTE->ReuseShuffleIndices.empty() ||
               (UTE->UserTreeIndex &&
                UTE->UserTreeIndex.UserTE == Data.first) ||
               (Data.first->UserTreeIndex &&
                Data.first->UserTreeIndex.UserTE == UTE) ||
               (IgnoreReorder && UTE->UserTreeIndex &&
                UTE->UserTreeIndex.UserTE->Idx == 0) ||
               NodeShouldBeReorderedWithOperands(UTE);
      for (TreeEntry *UTE : Users) {
          const TreeEntry *Op = getOperandEntry(UTE, Idx);
          Queue.push(const_cast<TreeEntry *>(Op));
          Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
            return P.second == OpTE;
      if (OpTE->State == TreeEntry::Vectorize &&
          OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
        assert(!OpTE->isAltShuffle() &&
               "Alternate instructions are only supported by BinaryOperator "
        unsigned E = Order.size();
          return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
        OrdersUses.try_emplace(CurrentOrder, 0).first->second += NumOps;
        OrdersUses.try_emplace(Order, 0).first->second += NumOps;
      auto Res = OrdersUses.try_emplace(OrdersType(), 0);
      const auto AllowsReordering = [&](const TreeEntry *TE) {
        if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
            (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
            (IgnoreReorder && TE->Idx == 0))
        if (TE->isGather()) {
      if (OpTE->UserTreeIndex) {
        TreeEntry *UserTE = OpTE->UserTreeIndex.UserTE;
        if (!VisitedUsers.insert(UserTE).second)
        if (AllowsReordering(UserTE))
        if (static_cast<unsigned>(count_if(
                Ops, [UserTE, &AllowsReordering](
                         const std::pair<unsigned, TreeEntry *> &Op) {
                  return AllowsReordering(Op.second) &&
                         Op.second->UserTreeIndex.UserTE == UserTE;
                })) <= Ops.size() / 2)
          ++Res.first->second;
    if (OrdersUses.empty()) {
    unsigned IdentityCnt = 0;
    unsigned VF = Data.second.front().second->getVectorFactor();
    for (auto &Pair : OrdersUses) {
        IdentityCnt += Pair.second;
    unsigned Cnt = IdentityCnt;
    for (auto &Pair : OrdersUses) {
      if (Cnt < Pair.second) {
        BestOrder = Pair.first;
    unsigned E = BestOrder.size();
      return I < E ? static_cast<int>(I) : PoisonMaskElem;
    for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) {
      TreeEntry *TE = Op.second;
      if (!VisitedOps.insert(TE).second)
      if (TE->ReuseShuffleIndices.size() == BestOrder.size()) {
        reorderNodeWithReuses(*TE, Mask);
      if (TE->State != TreeEntry::Vectorize &&
          TE->State != TreeEntry::StridedVectorize &&
          TE->State != TreeEntry::CompressVectorize &&
          TE->State != TreeEntry::SplitVectorize &&
          (TE->State != TreeEntry::ScatterVectorize ||
           TE->ReorderIndices.empty()))
      assert((BestOrder.size() == TE->ReorderIndices.size() ||
              TE->ReorderIndices.empty()) &&
             "Non-matching sizes of user/operand entries.");
      if (IgnoreReorder && TE == VectorizableTree.front().get())
        IgnoreReorder = false;
    for (TreeEntry *Gather : GatherOps) {
             "Unexpected reordering of gathers.");
      if (!Gather->ReuseShuffleIndices.empty()) {
    auto IsNotProfitableAltCodeNode = [](const TreeEntry &TE) {
      return TE.isAltShuffle() &&
             (!TE.ReuseShuffleIndices.empty() || TE.getVectorFactor() == 2 ||
              TE.ReorderIndices.empty());
    if (Data.first->State != TreeEntry::Vectorize ||
            Data.first->getMainOp()) ||
        IsNotProfitableAltCodeNode(*Data.first))
      Data.first->reorderOperands(Mask);
        IsNotProfitableAltCodeNode(*Data.first) ||
        Data.first->State == TreeEntry::StridedVectorize ||
        Data.first->State == TreeEntry::CompressVectorize) {
      if (Data.first->ReuseShuffleIndices.empty() &&
          !Data.first->ReorderIndices.empty() &&
          !IsNotProfitableAltCodeNode(*Data.first)) {
        Queue.push(Data.first);
  if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
      VectorizableTree.front()->ReuseShuffleIndices.empty())
    VectorizableTree.front()->ReorderIndices.clear();
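// Both reordering passes above are voting schemes: each candidate order is
// counted in OrdersUses and the most frequent non-identity order wins, with
// identity kept on ties. A minimal standalone sketch of the selection step
// (hypothetical types, not this file's API):
//   unsigned BestCnt = IdentityCnt;
//   OrdersType BestOrder; // empty == identity
//   for (const auto &[Order, Cnt] : OrdersUses)
//     if (Cnt > BestCnt) { BestCnt = Cnt; BestOrder = Order; }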
Instruction *BoUpSLP::getRootEntryInstruction(const TreeEntry &Entry) const {
  if (Entry.hasState() &&
      (Entry.getOpcode() == Instruction::Store ||
       Entry.getOpcode() == Instruction::Load) &&
      Entry.State == TreeEntry::StridedVectorize &&
      !Entry.ReorderIndices.empty() && isReverseOrder(Entry.ReorderIndices))
  const size_t NumVectScalars = ScalarToTreeEntries.size() + 1;
  for (auto &TEPtr : VectorizableTree) {
    TreeEntry *Entry = TEPtr.get();
    if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize ||
        DeletedNodes.contains(Entry) ||
        TransformedToGatherNodes.contains(Entry))
    for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
      Value *Scalar = Entry->Scalars[Lane];
      auto It = ScalarToExtUses.find(Scalar);
      if (It != ScalarToExtUses.end() && !ExternalUses[It->second].User)
      if (Scalar->hasNUsesOrMore(NumVectScalars)) {
        unsigned FoundLane = Entry->findLaneForValue(Scalar);
        LLVM_DEBUG(dbgs() << "SLP: Need to extract from lane " << FoundLane
                          << " from " << *Scalar << "for many users.\n");
        It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
        ExternalUses.emplace_back(Scalar, nullptr, *Entry, FoundLane);
        ExternalUsesWithNonUsers.insert(Scalar);
      const auto ExtI = ExternallyUsedValues.find(Scalar);
      if (ExtI != ExternallyUsedValues.end()) {
        unsigned FoundLane = Entry->findLaneForValue(Scalar);
        LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
                          << FoundLane << " from " << *Scalar << ".\n");
        ScalarToExtUses.try_emplace(Scalar, ExternalUses.size());
        ExternalUses.emplace_back(Scalar, nullptr, *Entry, FoundLane);
        if (UserIgnoreList && UserIgnoreList->contains(UserInst))
            any_of(UseEntries, [this](const TreeEntry *UseEntry) {
              return !DeletedNodes.contains(UseEntry) &&
                     !TransformedToGatherNodes.contains(UseEntry);
        if (!((Scalar->getType()->getScalarType()->isPointerTy() &&
              all_of(UseEntries, [&](TreeEntry *UseEntry) {
                if (DeletedNodes.contains(UseEntry) ||
                    TransformedToGatherNodes.contains(UseEntry))
                return UseEntry->State == TreeEntry::ScatterVectorize ||
                           Scalar, getRootEntryInstruction(*UseEntry), TLI,
          LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
                     [](TreeEntry *UseEntry) {
                       return UseEntry->isGather();
        if (It != ScalarToExtUses.end()) {
          ExternalUses[It->second].User = nullptr;
        if (U && Scalar->hasNUsesOrMore(UsesLimit))
        unsigned FoundLane = Entry->findLaneForValue(Scalar);
                   << " from lane " << FoundLane << " from " << *Scalar
        It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
        ExternalUses.emplace_back(Scalar, U, *Entry, FoundLane);
        ExternalUsesWithNonUsers.insert(Scalar);
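// buildExternalUses() records, per scalar, at most one entry with a null
// user (meaning "extract for unknown/many users") plus entries for concrete
// external users; ExternalUsesWithNonUsers tracks scalars whose extract is
// not tied to a single in-function user.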
BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
  for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) {
    Value *V = TE->Scalars[Lane];
      if (SI == nullptr || !SI->isSimple() || SI->getFunction() != F ||
      auto &StoresVec = PtrToStoresMap[{SI->getParent(),
                                        SI->getValueOperand()->getType(), Ptr}];
      if (StoresVec.size() > Lane)
      if (!StoresVec.empty()) {
            SI->getValueOperand()->getType(), SI->getPointerOperand(),
            SI->getValueOperand()->getType(),
            StoresVec.front()->getPointerOperand(), *DL, *SE,
      StoresVec.push_back(SI);
  for (auto &P : PtrToStoresMap) {
  StoreInst *S0 = StoresVec[0];
    StoreInst *SI = StoresVec[Idx];
    std::optional<int64_t> Diff =
            SI->getPointerOperand(), *DL, *SE,
  if (StoreOffsetVec.size() != StoresVec.size())
  sort(StoreOffsetVec, llvm::less_first());
  int64_t PrevDist = 0;
  for (const auto &P : StoreOffsetVec) {
    if (Idx > 0 && P.first != PrevDist + 1)
  ReorderIndices.assign(StoresVec.size(), 0);
  bool IsIdentity = true;
    ReorderIndices[P.second] = I;
    IsIdentity &= P.second == I;
    ReorderIndices.clear();
  for (unsigned Idx : Order)
    dbgs() << Idx << ", ";
BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const {
  unsigned NumLanes = TE->Scalars.size();
    if (StoresVec.size() != NumLanes)
    if (!canFormVector(StoresVec, ReorderIndices))
    ExternalReorderIndices.push_back(ReorderIndices);
  return ExternalReorderIndices;
  assert(TreeEntryToStridedPtrInfoMap.empty() &&
         "TreeEntryToStridedPtrInfoMap is not cleared");
  UserIgnoreList = &UserIgnoreLst;
  buildTreeRec(Roots, 0, EdgeInfo());
  assert(TreeEntryToStridedPtrInfoMap.empty() &&
         "TreeEntryToStridedPtrInfoMap is not cleared");
  buildTreeRec(Roots, 0, EdgeInfo());
                                           bool AddNew = true) {
  for (Value *V : VL) {
    if (R.isDeleted(LI) || R.isVectorized(LI) || !LI->isSimple())
    bool IsFound = false;
    for (auto [Map, Data] : zip(ClusteredDistToLoad, ClusteredLoads)) {
      assert(LI->getParent() == Data.front().first->getParent() &&
             LI->getType() == Data.front().first->getType() &&
             "Expected loads with the same type, same parent and same "
             "underlying pointer.");
          LI->getType(), LI->getPointerOperand(), Data.front().first->getType(),
          Data.front().first->getPointerOperand(), DL, SE,
      auto It = Map.find(*Dist);
      if (It != Map.end() && It->second != LI)
      if (It == Map.end()) {
        Data.emplace_back(LI, *Dist);
        Map.try_emplace(*Dist, LI);
  auto FindMatchingLoads =
          int64_t &Offset, unsigned &Start) {
        return GatheredLoads.end();
        std::optional<int64_t> Dist =
                Data.front().first->getType(),
                Data.front().first->getPointerOperand(), DL, SE,
        for (std::pair<LoadInst *, int64_t> P : Data) {
        unsigned NumUniques = 0;
        for (auto [Cnt, Pair] : enumerate(Loads)) {
          bool Used = DataLoads.contains(Pair.first);
          if (!Used && !DataDists.contains(*Dist + Pair.second)) {
            Repeated.insert(Cnt);
        if (NumUniques > 0 &&
            (Loads.size() == NumUniques ||
             (Loads.size() - NumUniques >= 2 &&
              Loads.size() - NumUniques >= Loads.size() / 2 &&
          return std::next(GatheredLoads.begin(), Idx);
        return GatheredLoads.end();
  for (ArrayRef<std::pair<LoadInst *, int64_t>> Data : ClusteredLoads) {
    auto *It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated,
    while (It != GatheredLoads.end()) {
      assert(!LocalToAdd.empty() && "Expected some elements to add.");
      for (unsigned Idx : LocalToAdd)
      It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated, Offset,
          return !ToAdd.contains(Idx) && !Repeated.contains(Idx);
        Loads.push_back(Data[Idx]);
          GatheredLoads, [&](ArrayRef<std::pair<LoadInst *, int64_t>> PD) {
            return PD.front().first->getParent() == LI->getParent() &&
                   PD.front().first->getType() == LI->getType();
      while (It != GatheredLoads.end()) {
            std::next(It), GatheredLoads.end(),
            [&](ArrayRef<std::pair<LoadInst *, int64_t>> PD) {
              return PD.front().first->getParent() == LI->getParent() &&
                     PD.front().first->getType() == LI->getType();
        GatheredLoads.emplace_back().append(Data.begin(), Data.end());
      AddNewLoads(GatheredLoads.emplace_back());
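// gatherPossiblyVectorizableLoads() clusters loads by (basic block, element
// type, constant pointer distance). Two clusters are merged only when enough
// of their elements are new (NumUniques), which keeps mostly-overlapping
// clusters from being duplicated.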
void BoUpSLP::tryToVectorizeGatheredLoads(
    const SmallMapVector<
        std::tuple<BasicBlock *, Value *, Type *>,
  GatheredLoadsEntriesFirst = VectorizableTree.size();
                                           LoadEntriesToVectorize.size());
  for (auto [Idx, Set] : zip(LoadEntriesToVectorize, LoadSetsToVectorize))
    Set.insert_range(VectorizableTree[Idx]->Scalars);
  auto LoadSorter = [](const std::pair<LoadInst *, int64_t> &L1,
                       const std::pair<LoadInst *, int64_t> &L2) {
    return L1.second > L2.second;
    auto *Ty = getWidenedType(Loads.front()->getType(), Loads.size());
    return TTI->isLegalMaskedGather(Ty, Alignment) &&
           !TTI->forceScalarizeMaskedGather(Ty, Alignment);
          SmallVectorImpl<LoadInst *> &NonVectorized, bool Final,
          unsigned MaxVF) {
    unsigned StartIdx = 0;
    SmallVector<int> CandidateVFs;
        *TTI, Loads.front()->getType(), MaxVF);
            *TTI, Loads.front()->getType(), NumElts - 1)) {
    if (Final && CandidateVFs.empty())
    unsigned BestVF = Final ? CandidateVFs.back() : 0;
    for (unsigned NumElts : CandidateVFs) {
      if (Final && NumElts > BestVF)
      SmallVector<unsigned> MaskedGatherVectorized;
      for (unsigned Cnt = StartIdx, E = Loads.size(); Cnt < E;
        if (VectorizedLoads.count(Slice.front()) ||
            VectorizedLoads.count(Slice.back()) ||
        bool AllowToVectorize = false;
        bool IsLegalBroadcastLoad = TTI->isLegalBroadcastLoad(
          for (LoadInst *LI : Slice) {
            if (LI->hasOneUse())
            if (static_cast<unsigned int>(std::distance(
                    LI->user_begin(), LI->user_end())) != LI->getNumUses())
            if (!IsLegalBroadcastLoad)
            for (User *U : LI->users()) {
              for (const TreeEntry *UTE : getTreeEntries(U)) {
                for (int I : seq<int>(UTE->getNumOperands())) {
                      return V == LI || isa<PoisonValue>(V);
          AllowToVectorize = CheckIfAllowed(Slice);
              any_of(ValueToGatherNodes.at(Slice.front()),
                     [=](const TreeEntry *TE) {
                       return TE->Scalars.size() == 2 &&
                              ((TE->Scalars.front() == Slice.front() &&
                                TE->Scalars.back() == Slice.back()) ||
                               (TE->Scalars.front() == Slice.back() &&
                                TE->Scalars.back() == Slice.front()));
        if (AllowToVectorize) {
              reinterpret_cast<Value *const *>(Slice.begin()), Slice.size());
          StridedPtrInfo SPtrInfo;
                                 PointerOps, SPtrInfo, &BestVF);
              (BestVF > 1 && static_cast<unsigned>(NumElts) == 2 * BestVF)) {
            if (MaskedGatherVectorized.empty() ||
                Cnt >= MaskedGatherVectorized.back() + NumElts)
            Results.emplace_back(Values, LS);
            VectorizedLoads.insert_range(Slice);
            if (Cnt == StartIdx)
              StartIdx += NumElts;
          if (StartIdx >= Loads.size())
          if (!MaskedGatherVectorized.empty() &&
              Cnt < MaskedGatherVectorized.back() + NumElts)
      if (!AllowToVectorize || BestVF == 0)
      for (unsigned Cnt : MaskedGatherVectorized) {
            Cnt, std::min<unsigned>(NumElts, Loads.size() - Cnt));
        VectorizedLoads.insert_range(Slice);
        if (Cnt == StartIdx)
          StartIdx += NumElts;
    for (LoadInst *LI : Loads) {
      if (!VectorizedLoads.contains(LI))
        NonVectorized.push_back(LI);
  auto ProcessGatheredLoads =
          bool Final = false) {
        for (ArrayRef<std::pair<LoadInst *, int64_t>> LoadsDists :
          if (LoadsDists.size() <= 1) {
            NonVectorized.push_back(LoadsDists.back().first);
          unsigned MaxConsecutiveDistance = 0;
          unsigned CurrentConsecutiveDist = 1;
          int64_t LastDist = LocalLoadsDists.front().second;
          bool AllowMaskedGather = IsMaskedGatherSupported(OriginalLoads);
          for (const std::pair<LoadInst *, int64_t> &L : LocalLoadsDists) {
            assert(LastDist >= L.second &&
                   "Expected first distance always not less than second");
            if (static_cast<uint64_t>(LastDist - L.second) ==
                CurrentConsecutiveDist) {
              ++CurrentConsecutiveDist;
              MaxConsecutiveDistance =
                  std::max(MaxConsecutiveDistance, CurrentConsecutiveDist);
            if (!AllowMaskedGather && CurrentConsecutiveDist == 1 &&
            CurrentConsecutiveDist = 1;
            LastDist = L.second;
          if (Loads.size() <= 1)
          if (AllowMaskedGather)
            MaxConsecutiveDistance = Loads.size();
          else if (MaxConsecutiveDistance < 2)
              GetVectorizedRanges(Loads, VectorizedLoads, SortedNonVectorized,
                                  Final, MaxConsecutiveDistance);
              OriginalLoads.size() == Loads.size() &&
              MaxConsecutiveDistance == Loads.size() &&
            VectorizedLoads.clear();
                GetVectorizedRanges(OriginalLoads, VectorizedLoads,
                                    UnsortedNonVectorized, Final,
                                    OriginalLoads.size());
            if (SortedNonVectorized.size() >= UnsortedNonVectorized.size()) {
              SortedNonVectorized.swap(UnsortedNonVectorized);
              Results.swap(UnsortedResults);
                       << Slice.size() << ")\n");
            for (Value *L : Slice)
            unsigned MaxVF = Slice.size();
            unsigned UserMaxVF = 0;
            unsigned InterleaveFactor = 0;
              std::optional<unsigned> InterleavedLoadsDistance = 0;
              std::optional<unsigned> CommonVF = 0;
              DenseMap<const TreeEntry *, unsigned> EntryToPosition;
              SmallPtrSet<const TreeEntry *, 8> DeinterleavedNodes;
              for (auto [Idx, V] : enumerate(Slice)) {
                for (const TreeEntry *E : ValueToGatherNodes.at(V)) {
                  UserMaxVF = std::max<unsigned>(UserMaxVF, E->Scalars.size());
                  UserMaxVF = std::max<unsigned>(UserMaxVF, Idx - Pos + 1);
                    if (*CommonVF == 0) {
                      CommonVF = E->Scalars.size();
                    if (*CommonVF != E->Scalars.size())
                  if (Pos != Idx && InterleavedLoadsDistance) {
                          if (isa<Constant>(V))
                          if (isVectorized(V))
                          const auto &Nodes = ValueToGatherNodes.at(V);
                          return (Nodes.size() != 1 || !Nodes.contains(E)) &&
                                 !is_contained(Slice, V);
                      InterleavedLoadsDistance.reset();
                    if (*InterleavedLoadsDistance == 0) {
                      InterleavedLoadsDistance = Idx - Pos;
                    if ((Idx - Pos) % *InterleavedLoadsDistance != 0 ||
                        (Idx - Pos) / *InterleavedLoadsDistance < Order)
                      InterleavedLoadsDistance.reset();
                    Order = (Idx - Pos) / InterleavedLoadsDistance.value_or(1);
              DeinterleavedNodes.clear();
              if (InterleavedLoadsDistance.value_or(0) > 1 &&
                  CommonVF.value_or(0) != 0) {
                InterleaveFactor = bit_ceil(*InterleavedLoadsDistance);
                unsigned VF = *CommonVF;
                StridedPtrInfo SPtrInfo;
                if (InterleaveFactor <= Slice.size() &&
                    TTI.isLegalInterleavedAccessType(
                  UserMaxVF = InterleaveFactor * VF;
                  InterleaveFactor = 0;
            unsigned ConsecutiveNodesSize = 0;
            if (!LoadEntriesToVectorize.empty() && InterleaveFactor == 0 &&
                any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
                       [&, Slice = Slice](const auto &P) {
                           return std::get<1>(P).contains(V);
                         if (It == Slice.end())
                         const TreeEntry &TE =
                             *VectorizableTree[std::get<0>(P)];
                         StridedPtrInfo SPtrInfo;
                             VL, VL.front(), Order, PointerOps, SPtrInfo);
                         ConsecutiveNodesSize += VL.size();
                         size_t Start = std::distance(Slice.begin(), It);
                         size_t Sz = Slice.size() - Start;
                         return Sz < VL.size() ||
                                Slice.slice(Start, VL.size()) != VL;
            if (InterleaveFactor == 0 &&
                    [&, Slice = Slice](unsigned Idx) {
                      SmallVector<Value *> PointerOps;
                      StridedPtrInfo SPtrInfo;
                      return canVectorizeLoads(
                                 Slice.slice(Idx * UserMaxVF, UserMaxVF),
                                 Slice[Idx * UserMaxVF], Order, PointerOps,
                                 SPtrInfo) == LoadsState::ScatterVectorize;
            if (Slice.size() != ConsecutiveNodesSize)
              MaxVF = std::min<unsigned>(MaxVF, UserMaxVF);
            for (unsigned VF = MaxVF; VF >= 2; VF /= 2) {
              bool IsVectorized = true;
              for (unsigned I = 0, E = Slice.size(); I < E; I += VF) {
                    Slice.slice(I, std::min(VF, E - I));
                if (any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
                           [&](const auto &P) {
                             return !SubSlice.equals(
                                 VectorizableTree[std::get<0>(P)]
                unsigned Sz = VectorizableTree.size();
                buildTreeRec(SubSlice, 0, EdgeInfo(), InterleaveFactor);
                if (Sz == VectorizableTree.size()) {
                  IsVectorized = false;
                if (InterleaveFactor > 0) {
                  VF = 2 * (MaxVF / InterleaveFactor);
                  InterleaveFactor = 0;
        NonVectorized.append(SortedNonVectorized);
    return NonVectorized;
  for (const auto &GLs : GatheredLoads) {
    const auto &Ref = GLs.second;
    if (!Ref.empty() && !NonVectorized.empty() &&
            Ref.begin(), Ref.end(), 0u,
            [](unsigned S,
               ArrayRef<std::pair<LoadInst *, int64_t>> LoadsDists)
                -> unsigned { return S + LoadsDists.size(); }) !=
            NonVectorized.size() &&
        IsMaskedGatherSupported(NonVectorized)) {
          FinalGatheredLoads;
      for (LoadInst *LI : NonVectorized) {
                                      FinalGatheredLoads,
      (void)ProcessGatheredLoads(FinalGatheredLoads, true);
  for (unsigned Idx : LoadEntriesToVectorize) {
    const TreeEntry &E = *VectorizableTree[Idx];
    if (!E.ReorderIndices.empty()) {
      SmallVector<int> ReorderMask;
    buildTreeRec(GatheredScalars, 0, EdgeInfo());
  if (static_cast<unsigned>(*GatheredLoadsEntriesFirst) ==
      VectorizableTree.size())
    GatheredLoadsEntriesFirst.reset();
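// tryToVectorizeGatheredLoads() tries progressively smaller VFs over each
// sorted run of consecutive loads, falls back to masked gather where the
// target supports it, and finally re-runs buildTreeRec() for load entries
// that were deferred during the initial tree construction.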
                                               bool AllowAlternate) {
    if (LI->isSimple())
      SubKey = hash_value(EI->getVectorOperand());
    if (AllowAlternate)
    std::pair<size_t, size_t> OpVals =
    if (CI->isCommutative())
      SubKey = hash_value(Gep->getPointerOperand());
  return std::make_pair(Key, SubKey);
                                Instruction *AltOp,
                                const TargetLibraryInfo &TLI);
bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S,
  Type *ScalarTy = S.getMainOp()->getType();
  unsigned Opcode0 = S.getOpcode();
  unsigned Opcode1 = S.getAltOpcode();
  SmallBitVector OpcodeMask(getAltInstrMask(VL, ScalarTy, Opcode0, Opcode1));
                            Opcode1, OpcodeMask))
  for (unsigned I : seq<unsigned>(S.getMainOp()->getNumOperands())) {
    for (Value *V : VL) {
      Operands.back().push_back(
    if (Operands.size() == 2) {
        Candidates[0] = std::make_pair(Operands[0][I], Operands[0][I + 1]);
        Candidates[1] = std::make_pair(Operands[0][I], Operands[1][I + 1]);
        Candidates[2] = std::make_pair(Operands[1][I], Operands[0][I + 1]);
        switch (Res.value_or(0)) {
          std::swap(Operands[0][I + 1], Operands[1][I + 1]);
  DenseSet<unsigned> UniqueOpcodes;
  constexpr unsigned NumAltInsts = 3;
  unsigned NonInstCnt = 0;
  unsigned UndefCnt = 0;
  unsigned ExtraShuffleInsts = 0;
  if (Operands.size() == 2) {
    if (Operands.front() == Operands.back()) {
          return is_contained(Operands.back(), V);
      ++ExtraShuffleInsts;
  const Loop *L = LI->getLoopFor(S.getMainOp()->getParent());
  DenseMap<Value *, unsigned> Uniques;
      if (!Res.second && Res.first->second == 1)
        ++ExtraShuffleInsts;
      ++Res.first->getSecond();
      UniqueOpcodes.insert(I->getOpcode());
    else if (Res.second)
  return none_of(Uniques, [&](const auto &P) {
    return P.first->hasNUsesOrMore(P.second + 1) &&
           none_of(P.first->users(), [&](User *U) {
             return isVectorized(U) || Uniques.contains(U);
         (UndefCnt < (VL.size() - 1) * S.getMainOp()->getNumOperands() &&
          (UniqueOpcodes.size() + NonInstCnt + ExtraShuffleInsts +
           NumAltInsts) < S.getMainOp()->getNumOperands() * VL.size());
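// The profitability test above, in short: alternate-opcode vectorization is
// accepted only if the scalars are not dominated by repeated values
// (Uniques) or undefs (UndefCnt), and the estimated instruction budget
// (UniqueOpcodes + NonInstCnt + ExtraShuffleInsts + NumAltInsts) stays below
// one scalar instruction per lane and operand.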
                               const unsigned VF, unsigned MinBW,
static std::pair<InstructionCost, InstructionCost>
    FMF = FPCI->getFastMathFlags();
                    LibCost.isValid() ? LibCost : ScalarLimit);
BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
    bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
    SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo) {
         "Expected instructions with same/alternate opcodes only.");
  unsigned ShuffleOrOp =
      S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
  switch (ShuffleOrOp) {
  case Instruction::PHI: {
      return TreeEntry::NeedToGather;
    for (Value *V : VL) {
      for (Value *Incoming : PHI->incoming_values()) {
        if (Term && Term->isTerminator()) {
                     << "SLP: Need to swizzle PHINodes (terminator use).\n");
          return TreeEntry::NeedToGather;
    return TreeEntry::Vectorize;
  case Instruction::ExtractElement:
      return TreeEntry::NeedToGather;
  case Instruction::ExtractValue: {
    bool Reuse = canReuseExtract(VL, CurrentOrder);
      return TreeEntry::NeedToGather;
    if (Reuse || !CurrentOrder.empty())
      return TreeEntry::Vectorize;
    return TreeEntry::NeedToGather;
  case Instruction::InsertElement: {
    for (Value *V : VL) {
      LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement/poison vector.\n");
      return TreeEntry::NeedToGather;
           "Non-constant or undef index?");
          return !SourceVectors.contains(V);
      LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
                           "different source vectors.\n");
      return TreeEntry::NeedToGather;
          return SourceVectors.contains(V) && !V->hasOneUse();
      LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
                           "multiple uses.\n");
      return TreeEntry::NeedToGather;
    return TreeEntry::Vectorize;
  case Instruction::Load: {
    auto IsGatheredNode = [&]() {
      if (!GatheredLoadsEntriesFirst)
      return any_of(getTreeEntries(V), [&](const TreeEntry *TE) {
        return TE->Idx >= *GatheredLoadsEntriesFirst;
      return TreeEntry::Vectorize;
      if (!IsGraphTransformMode && !VectorizableTree.empty()) {
        LoadEntriesToVectorize.insert(VectorizableTree.size());
        return TreeEntry::NeedToGather;
      return IsGatheredNode() ? TreeEntry::NeedToGather
                              : TreeEntry::CompressVectorize;
      if (!IsGraphTransformMode && !VectorizableTree.empty()) {
        LoadEntriesToVectorize.insert(VectorizableTree.size());
        return TreeEntry::NeedToGather;
      return IsGatheredNode() ? TreeEntry::NeedToGather
                              : TreeEntry::ScatterVectorize;
      if (!IsGraphTransformMode && VectorizableTree.size() > 1) {
        LoadEntriesToVectorize.insert(VectorizableTree.size());
        return TreeEntry::NeedToGather;
      return IsGatheredNode() ? TreeEntry::NeedToGather
                              : TreeEntry::StridedVectorize;
      if (DL->getTypeSizeInBits(ScalarTy) !=
          DL->getTypeAllocSizeInBits(ScalarTy))
        LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
        return !LI || !LI->isSimple();
        LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
    return TreeEntry::NeedToGather;
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    for (Value *V : VL) {
            dbgs() << "SLP: Gathering casts with different src types.\n");
        return TreeEntry::NeedToGather;
    return TreeEntry::Vectorize;
  case Instruction::ICmp:
  case Instruction::FCmp: {
    for (Value *V : VL) {
      if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
          Cmp->getOperand(0)->getType() != ComparedTy) {
        LLVM_DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
        return TreeEntry::NeedToGather;
    return TreeEntry::Vectorize;
  case Instruction::Select:
  case Instruction::FNeg:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::Freeze:
    if (S.getMainOp()->getType()->isFloatingPointTy() &&
        TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) {
          return I && I->isBinaryOp() && !I->isFast();
      return TreeEntry::NeedToGather;
    return TreeEntry::Vectorize;
  case Instruction::GetElementPtr: {
    for (Value *V : VL) {
      if (I->getNumOperands() != 2) {
        LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
        return TreeEntry::NeedToGather;
    for (Value *V : VL) {
      Type *CurTy = GEP->getSourceElementType();
      if (Ty0 != CurTy) {
        LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
        return TreeEntry::NeedToGather;
    for (Value *V : VL) {
      auto *Op = I->getOperand(1);
          (Op->getType() != Ty1 &&
           Op->getType()->getScalarSizeInBits() >
               DL->getIndexSizeInBits(
                   V->getType()->getPointerAddressSpace())))) {
            dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
        return TreeEntry::NeedToGather;
    return TreeEntry::Vectorize;
  case Instruction::Store: {
    llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
    if (DL->getTypeSizeInBits(ScalarTy) !=
        DL->getTypeAllocSizeInBits(ScalarTy)) {
      LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n");
      return TreeEntry::NeedToGather;
    for (Value *V : VL) {
      if (!SI->isSimple()) {
        return TreeEntry::NeedToGather;
      if (CurrentOrder.empty()) {
        Ptr0 = PointerOps.front();
        PtrN = PointerOps.back();
        Ptr0 = PointerOps[CurrentOrder.front()];
        PtrN = PointerOps[CurrentOrder.back()];
      std::optional<int64_t> Dist =
      if (static_cast<uint64_t>(*Dist) == VL.size() - 1)
        return TreeEntry::Vectorize;
    return TreeEntry::NeedToGather;
  case Instruction::Call: {
    if (S.getMainOp()->getType()->isFloatingPointTy() &&
        TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) {
          return I && !I->isFast();
      return TreeEntry::NeedToGather;
    Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
      return TreeEntry::NeedToGather;
    unsigned NumArgs = CI->arg_size();
    SmallVector<Value *, 4> ScalarArgs(NumArgs, nullptr);
    for (unsigned J = 0; J != NumArgs; ++J)
    for (Value *V : VL) {
           VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) ||
        LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V
        return TreeEntry::NeedToGather;
      for (unsigned J = 0; J != NumArgs; ++J) {
          if (ScalarArgs[J] != A1J) {
                       << "SLP: mismatched arguments in call:" << *CI
                       << " argument " << ScalarArgs[J] << "!=" << A1J << "\n");
            return TreeEntry::NeedToGather;
        LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI
                          << "!=" << *V << '\n');
        return TreeEntry::NeedToGather;
    auto *VecTy = getWidenedType(S.getMainOp()->getType(), VL.size());
    if (!VecCallCosts.first.isValid() && !VecCallCosts.second.isValid())
      return TreeEntry::NeedToGather;
    return TreeEntry::Vectorize;
  case Instruction::ShuffleVector: {
    if (!S.isAltShuffle()) {
        return TreeEntry::Vectorize;
      LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
      return TreeEntry::NeedToGather;
          << "SLP: ShuffleVector not vectorized, operands are buildvector and "
             "the whole alt sequence is not profitable.\n");
      return TreeEntry::NeedToGather;
    return TreeEntry::Vectorize;
    return TreeEntry::NeedToGather;
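// getScalarsVectorizationState() maps each opcode bundle to one of the
// TreeEntry states: Vectorize (plain wide op), ScatterVectorize /
// StridedVectorize / CompressVectorize (non-consecutive load forms), or
// NeedToGather when the bundle cannot be vectorized as-is.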
  PHINode *Main = nullptr;
  PHIHandler() = delete;
      : DT(DT), Main(Main), Phis(Phis),
        Operands(Main->getNumIncomingValues(),
  void buildOperands() {
    constexpr unsigned FastLimit = 4;
      for (auto [Idx, V] : enumerate(Phis)) {
               "Expected isa instruction or poison value.");
          Operands[I][Idx] = V;
          if (P->getIncomingBlock(I) == InBB)
            Operands[I][Idx] = P->getIncomingValue(I);
            Operands[I][Idx] = P->getIncomingValueForBlock(InBB);
    SmallMapVector<BasicBlock *, SmallVector<unsigned>, 4>
    for (auto [Idx, V] : enumerate(Phis)) {
        Operands[I][Idx] = V;
          Operands[I][Idx] = P->getIncomingValue(I);
        auto *It = Blocks.find(InBB);
        if (It == Blocks.end())
        Operands[It->second.front()][Idx] = P->getIncomingValue(I);
    for (const auto &P : Blocks) {
      ArrayRef<unsigned> IncomingValues = P.second;
      if (IncomingValues.size() <= 1)
      for (unsigned I : IncomingValues) {
               [&](const auto &Data) {
                 return !Data.value() ||
                        Data.value() == Operands[BasicI][Data.index()];
               "Expected empty operands list.");
        Operands[I] = Operands[BasicI];
static std::pair<Instruction *, Instruction *>
  for (Value *V : VL) {
    if (MainOp->getOpcode() == I->getOpcode()) {
         "Expected different main and alt instructions.");
  return std::make_pair(MainOp, AltOp);
                                const InstructionsState &S,
                                bool TryPad = false) {
  for (Value *V : VL) {
  size_t NumUniqueScalarValues = UniqueValues.size();
  if (NumUniqueScalarValues == VL.size() &&
    ReuseShuffleIndices.clear();
    if ((UserTreeIdx.UserTE &&
         UserTreeIdx.UserTE->hasNonWholeRegisterOrNonPowerOf2Vec(TTI)) ||
      LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
                           "for nodes with padding.\n");
      ReuseShuffleIndices.clear();
    if (NumUniqueScalarValues <= 1 || !IsFullVectors ||
      if (TryPad && UniquePositions.size() > 1 && NumUniqueScalarValues > 1 &&
          S.getMainOp()->isSafeToRemove() &&
          (S.areInstructionsWithCopyableElements() ||
            TTI, UniqueValues.front()->getType(), UniqueValues.size());
        PWSz = std::min<unsigned>(PWSz, VL.size());
        if (PWSz == VL.size()) {
          ReuseShuffleIndices.clear();
                                              UniqueValues.end());
          PaddedUniqueValues.append(
              PWSz - UniqueValues.size(),
          if ((!S.areInstructionsWithCopyableElements() &&
              (S.areInstructionsWithCopyableElements() && S.isMulDivLikeOp() &&
               (S.getMainOp()->isIntDivRem() || S.getMainOp()->isFPDivRem() ||
            ReuseShuffleIndices.clear();
          VL = std::move(PaddedUniqueValues);
      ReuseShuffleIndices.clear();
      VL = std::move(UniqueValues);
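// Duplicate handling above: if the unique scalars do not fill a whole
// register, TryPad may pad them up to the power-of-2 width PWSz instead of
// emitting a reshuffle; otherwise the node is rebuilt from UniqueValues with
// ReuseShuffleIndices describing the duplication.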
                               const InstructionsState &LocalState,
                               SmallVectorImpl<Value *> &Op1,
                               SmallVectorImpl<Value *> &Op2,
  constexpr unsigned SmallNodeSize = 4;
  if (VL.size() <= SmallNodeSize ||
      TTI->preferAlternateOpcodeVectorization() ||
  LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *LocalState.getMainOp()
  for (TreeEntry *E : getSplitTreeEntries(LocalState.getMainOp())) {
    if (E->isSame(VL)) {
                 << *LocalState.getMainOp() << ".\n");
  ReorderIndices.assign(VL.size(), VL.size());
  SmallBitVector Op1Indices(VL.size());
      Op1Indices.set(Idx);
    if ((LocalState.getAltOpcode() != LocalState.getOpcode() &&
        (LocalState.getAltOpcode() == LocalState.getOpcode() &&
                             LocalState.getAltOp(), *TLI))) {
      Op1Indices.set(Idx);
  unsigned Opcode0 = LocalState.getOpcode();
  unsigned Opcode1 = LocalState.getAltOpcode();
  SmallBitVector OpcodeMask(getAltInstrMask(VL, ScalarTy, Opcode0, Opcode1));
  if (UOp1.size() <= 1 || UOp2.size() <= 1 ||
      TTI->isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask) ||
  unsigned Op1Cnt = 0, Op2Cnt = Op1.size();
    if (Op1Indices.test(Idx)) {
      ReorderIndices[Op1Cnt] = Idx;
      ReorderIndices[Op2Cnt] = Idx;
    ReorderIndices.clear();
  SmallVector<int> Mask;
  if (!ReorderIndices.empty())
  unsigned NumParts = TTI->getNumberOfParts(VecTy);
  if (NumParts >= VL.size())
  FixedVectorType *SubVecTy =
  if (!LocalState.isCmpOp() && NumParts <= 1 &&
      (Mask.empty() || InsertCost >= NewShuffleCost))
  if ((LocalState.getMainOp()->isBinaryOp() &&
       LocalState.getAltOp()->isBinaryOp() &&
       (LocalState.isShiftOp() || LocalState.isBitwiseLogicOp() ||
        LocalState.isAddSubLikeOp() || LocalState.isMulDivLikeOp())) ||
      (LocalState.getMainOp()->isCast() && LocalState.getAltOp()->isCast()) ||
      (LocalState.getMainOp()->isUnaryOp() &&
       LocalState.getAltOp()->isUnaryOp())) {
        TTI->getArithmeticInstrCost(Opcode0, VecTy, Kind) +
        TTI->getArithmeticInstrCost(Opcode1, VecTy, Kind);
      OriginalMask[Idx] = Idx + (Op1Indices.test(Idx) ? 0 : VL.size());
                                   VecTy, OriginalMask, Kind);
        TTI->getArithmeticInstrCost(Opcode0, Op1VecTy, Kind) +
        TTI->getArithmeticInstrCost(Opcode1, Op2VecTy, Kind);
        NewVecOpsCost + InsertCost +
        (!VectorizableTree.empty() && VectorizableTree.front()->hasState() &&
         VectorizableTree.front()->getOpcode() == Instruction::Store
  if (NewCost >= OriginalCost)
class InstructionsCompatibilityAnalysis {
  const DataLayout &DL;
  const TargetTransformInfo &TTI;
  const TargetLibraryInfo &TLI;
  unsigned MainOpcode = 0;

  static bool isSupportedOpcode(const unsigned Opcode) {
    return Opcode == Instruction::Add || Opcode == Instruction::Sub ||
           Opcode == Instruction::LShr || Opcode == Instruction::Shl ||
           Opcode == Instruction::SDiv || Opcode == Instruction::UDiv ||
           Opcode == Instruction::And || Opcode == Instruction::Or ||
           Opcode == Instruction::Xor || Opcode == Instruction::FAdd ||
           Opcode == Instruction::FSub || Opcode == Instruction::FMul ||
           Opcode == Instruction::FDiv;
    auto IsSupportedInstruction = [&](Instruction *I, bool AnyUndef) {
      if (AnyUndef &&
          (I->isIntDivRem() || I->isFPDivRem() || isa<CallInst>(I)))
      return I && isSupportedOpcode(I->getOpcode()) &&
    SmallDenseSet<Value *, 8> Operands;
    SmallMapVector<unsigned, SmallVector<Instruction *>, 4> Candidates;
    bool AnyUndef = false;
    for (Value *V : VL) {
      if (Candidates.empty()) {
        Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
        Operands.insert(I->op_begin(), I->op_end());
      if (Parent == I->getParent()) {
        Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
        Operands.insert(I->op_begin(), I->op_end());
      auto *NodeA = DT.getNode(Parent);
      auto *NodeB = DT.getNode(I->getParent());
      assert(NodeA && "Should only process reachable instructions");
      assert(NodeB && "Should only process reachable instructions");
      assert((NodeA == NodeB) ==
                 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
             "Different nodes should have different DFS numbers");
      if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn()) {
        Candidates.clear();
        Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
        Operands.insert(I->op_begin(), I->op_end());
    unsigned BestOpcodeNum = 0;
    bool UsedOutside = false;
    for (const auto &P : Candidates) {
      if (UsedOutside && !PUsedOutside)
      if (!UsedOutside && PUsedOutside)
      if (P.second.size() < BestOpcodeNum)
      if (!PUsedOutside && any_of(P.second, [&](Instruction *I) {
            return Operands.contains(I);
      UsedOutside = PUsedOutside;
      for (Instruction *I : P.second) {
        if (IsSupportedInstruction(I, AnyUndef)) {
          BestOpcodeNum = P.second.size();
      return I && I->getParent() == MainOp->getParent() &&
  Value *selectBestIdempotentValue() const {
    assert(isSupportedOpcode(MainOpcode) && "Unsupported opcode");
    if (!S.isCopyableElement(V))
    assert(isSupportedOpcode(MainOpcode) && "Unsupported opcode");
    return {V, selectBestIdempotentValue()};
                         SmallVectorImpl<BoUpSLP::ValueList> &Operands) const {
    unsigned ShuffleOrOp =
        S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
    switch (ShuffleOrOp) {
    case Instruction::PHI: {
      PHIHandler Handler(DT, PH, VL);
      Handler.buildOperands();
      Operands.assign(PH->getNumOperands(), {});
        Operands[I].assign(Handler.getOperands(I).begin(),
                           Handler.getOperands(I).end());
    case Instruction::ExtractValue:
    case Instruction::ExtractElement:
    case Instruction::InsertElement:
    case Instruction::Load:
      for (auto [V, Op] : zip(VL, Operands.back())) {
          Op = LI->getPointerOperand();
    case Instruction::ZExt:
    case Instruction::SExt:
    case Instruction::FPToUI:
    case Instruction::FPToSI:
    case Instruction::FPExt:
    case Instruction::PtrToInt:
    case Instruction::IntToPtr:
    case Instruction::SIToFP:
    case Instruction::UIToFP:
    case Instruction::Trunc:
    case Instruction::FPTrunc:
    case Instruction::BitCast:
    case Instruction::ICmp:
    case Instruction::FCmp:
    case Instruction::Select:
    case Instruction::FNeg:
    case Instruction::Add:
    case Instruction::FAdd:
    case Instruction::Sub:
    case Instruction::FSub:
    case Instruction::Mul:
    case Instruction::FMul:
    case Instruction::UDiv:
    case Instruction::SDiv:
    case Instruction::FDiv:
    case Instruction::URem:
    case Instruction::SRem:
    case Instruction::FRem:
    case Instruction::Shl:
    case Instruction::LShr:
    case Instruction::AShr:
    case Instruction::And:
    case Instruction::Or:
    case Instruction::Xor:
    case Instruction::Freeze:
    case Instruction::Store:
    case Instruction::ShuffleVector:
        auto [Op, ConvertedOps] = convertTo(I, S);
    case Instruction::GetElementPtr: {
      const unsigned IndexIdx = 1;
        return !GEP || VL0Ty == GEP->getOperand(IndexIdx)->getType();
                   ->getPointerOperandType()
                   ->getScalarType());
          Operands[0][Idx] = V;
          Operands[1][Idx] = ConstantInt::getNullValue(Ty);
        Operands[0][Idx] = GEP->getPointerOperand();
        auto *Op = GEP->getOperand(IndexIdx);
                   CI, Ty, CI->getValue().isSignBitSet(), DL)
    case Instruction::Call: {
      for (Value *V : VL) {
        Ops.push_back(I ? I->getOperand(Idx)
  InstructionsCompatibilityAnalysis(DominatorTree &DT, const DataLayout &DL,
                                    const TargetTransformInfo &TTI,
                                    const TargetLibraryInfo &TLI)
  // ...

  InstructionsState buildInstructionsState(
      ArrayRef<Value *> VL, const BoUpSLP &R,
      bool TryCopyableElementsVectorization,
      bool WithProfitabilityCheck = false,
      bool SkipSameCodeCheck = false) {
    InstructionsState S = (SkipSameCodeCheck || !allSameBlock(VL))
                              ? InstructionsState::invalid()
                              : /*...*/;
    // ...
    findAndSetMainInstruction(VL, R);
    // ...
      return InstructionsState::invalid();
    S = InstructionsState(MainOp, MainOp, /*HasCopyables=*/true);
    if (!WithProfitabilityCheck)
      return S;
    // ...
    auto BuildCandidates =
        [](SmallVectorImpl<std::pair<Value *, Value *>> &Candidates, Value *V1,
           Value *V2) {
          // ...
          if (I1 && I2 && I1->getOpcode() == I2->getOpcode() &&
              I1->getParent() != I2->getParent())
            // ...
        };
    if (VL.size() == 2) {
      // ...
      BuildCandidates(Candidates1, Operands[0][0], Operands[0][1]);
      BuildCandidates(Candidates2, Operands[1][0], Operands[1][1]);
      bool Res = !Candidates1.empty() && !Candidates2.empty() &&
                 R.findBestRootPair(Candidates1) &&
                 R.findBestRootPair(Candidates2);
      // ...
        Candidates1.clear();
        Candidates2.clear();
        BuildCandidates(Candidates1, Operands[0][0], Operands[1][1]);
        BuildCandidates(Candidates2, Operands[1][0], Operands[0][1]);
        Res = !Candidates1.empty() && !Candidates2.empty() &&
              R.findBestRootPair(Candidates1) &&
              R.findBestRootPair(Candidates2);
      // ...
        return InstructionsState::invalid();
    }
    // ...
    FixedVectorType *VecTy = /*...*/;
    switch (MainOpcode) {
    case Instruction::Add:
    case Instruction::Sub:
    case Instruction::LShr:
    case Instruction::Shl:
    case Instruction::SDiv:
    case Instruction::UDiv:
    case Instruction::And:
    case Instruction::Or:
    case Instruction::Xor:
    case Instruction::FAdd:
    case Instruction::FMul:
    case Instruction::FSub:
    case Instruction::FDiv:
      // ...
      if (VectorCost > ScalarCost)
        return InstructionsState::invalid();
      // ...
    }
    assert(Operands.size() == 2 && "Unexpected number of operands!");
    unsigned CopyableNum =
        count_if(VL, [&](Value *V) { return S.isCopyableElement(V); });
    if (CopyableNum < VL.size() / 2)
      return S;
    // ...
    const unsigned Limit = VL.size() / 24;
    if ((CopyableNum >= VL.size() - Limit ||
         (CopyableNum >= VL.size() - 1 && VL.size() > 4) ||
         /*...*/))
      return InstructionsState::invalid();
    // ...
    for (auto &Ops : Operands) {
      // ...
        return InstructionsState::invalid();
      // ...
    }
    // ...
    auto CheckOperand = [&](ArrayRef<Value *> Ops) {
      // ...
      constexpr unsigned Limit = 4;
      if (Operands.front().size() >= Limit) {
        SmallDenseMap<const Value *, unsigned> Counters;
        // ...
          return C.second == 1;
        // ...
      }
      // ...
      InstructionsCompatibilityAnalysis Analysis(DT, DL, TTI, TLI);
      InstructionsState OpS = Analysis.buildInstructionsState(/*...*/);
      if (!OpS || (OpS.getOpcode() == Instruction::PHI && !allSameBlock(Ops)))
        // ...
      unsigned CopyableNum = /*...*/;
      return CopyableNum <= VL.size() / 2;
    };
    if (!CheckOperand(Operands.front()))
      return InstructionsState::invalid();
    // ...
  }

  // ...
    assert(S && "Invalid state!");
    // ...
    if (S.areInstructionsWithCopyableElements()) {
      MainOp = S.getMainOp();
      MainOpcode = S.getOpcode();
      // ...
      for (auto [OperandIdx, Operand] : enumerate(OperandsForValue))
        Operands[OperandIdx][Idx] = Operand;
      // ...
    }
    // ...
    buildOriginalOperands(S, VL, Operands);
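  // Illustrative annotation (not part of the original source): the
  // "copyable elements" model above lets a bundle mix real MainOpcode
  // instructions with plain values by treating the odd lanes as trivial
  // instances of the main opcode, e.g. for MainOpcode == Add a lane holding
  // a bare `x` can be modeled as `x + 0` (the null GEP index built earlier
  // follows the same identity-operand idea). The heuristics accept the
  // state outright when fewer than half the lanes are copyable, invalidate
  // it when nearly all lanes are (CopyableNum >= VL.size() - Limit), and
  // re-check the materialized operand vectors recursively via CheckOperand
  // so that the synthesized operands themselves form a vectorizable group.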
BoUpSLP::ScalarsVectorizationLegality BoUpSLP::getScalarsVectorizationLegality(
    ArrayRef<Value *> VL, unsigned Depth, const EdgeInfo &UserTreeIdx,
    bool TryCopyableElementsVectorization) const {
  // ...
  InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
  InstructionsState S = Analysis.buildInstructionsState(
      VL, *this, TryCopyableElementsVectorization,
      /*WithProfitabilityCheck=*/true, TryCopyableElementsVectorization);
  // ...
  bool AreScatterAllGEPSameBlock = false;
  // ...
  SmallVector<unsigned> SortedIndices;
  // ...
  bool IsScatterVectorizeUserTE =
      UserTreeIdx.UserTE &&
      UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
  AreScatterAllGEPSameBlock =
      // ...
                       *SE, SortedIndices));
  if (!AreScatterAllGEPSameBlock) {
    LLVM_DEBUG(dbgs() << "SLP: Try split and if failed, gathering due to "
                         "C,S,B,O, small shuffle. \n");
    // ...
    return ScalarsVectorizationLegality(S, false, /*...*/);
  }
  // ...
  assert(It != VL.end() && "Expected at least one GEP.");
  // ...
  assert(S && "Must be valid.");
  // ...
    return ScalarsVectorizationLegality(S, false, /*...*/);
  // ...
  BasicBlock *BB = S.getMainOp()->getParent();
  if (/*...*/ ||
      !DT->isReachableFromEntry(BB)) {
    // ...
    return ScalarsVectorizationLegality(S, false);
  }
  // ...
    return ScalarsVectorizationLegality(S, false, /*...*/);
  // ...
  if (S.getOpcode() == Instruction::ExtractElement &&
      /*...*/) {
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n");
    return ScalarsVectorizationLegality(S, false);
  }
  // ...
      (S.isAltShuffle() || VL.size() < 4 ||
       /*...*/)) {
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
    return ScalarsVectorizationLegality(S, false);
  }
  // ...
  LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.getMainOp() << ".\n");
  for (TreeEntry *E : getTreeEntries(S.getMainOp())) {
    if (E->isSame(VL)) {
      LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.getMainOp()
                        /*...*/);
      return ScalarsVectorizationLegality(S, false);
    }
  }
  // ...
      (S.getOpcode() == Instruction::PHI && isa<PHINode>(V) &&
       LI->getLoopFor(S.getMainOp()->getParent()) &&
       /*...*/)) {
    // ...
    return ScalarsVectorizationLegality(S, false);
  }
  // ...
  auto NotProfitableForVectorization = [&](ArrayRef<Value *> VL) {
    if (!S || !S.isAltShuffle() || VL.size() > 2)
      return false;
    // ...
    SmallVector<unsigned, 8> InstsCount;
    for (Value *V : VL) {
      // ...
        return isa<Instruction>(Op) || isVectorLikeInstWithConstOps(Op);
      // ...
    }
    bool IsCommutative = /*...*/;
    if ((IsCommutative &&
         std::accumulate(InstsCount.begin(), InstsCount.end(), 0) < 2) ||
        (/*...*/ &&
         all_of(InstsCount, [](unsigned ICnt) { return ICnt < 2; })))
      // ...
    assert(VL.size() == 2 && "Expected only 2 alternate op instructions.");
    // ...
    for (int Op : seq<int>(S.getMainOp()->getNumOperands()))
      // ...
                              I2->getOperand(Op));
    if (static_cast<unsigned>(count_if(
            Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
              // ...
            })) >= S.getMainOp()->getNumOperands() / 2)
      // ...
    if (S.getMainOp()->getNumOperands() > 2)
      // ...
    if (IsCommutative) {
      // ...
      Candidates.clear();
      for (int Op = 0, E = S.getMainOp()->getNumOperands(); Op < E; ++Op)
        // ...
                                I2->getOperand((Op + 1) % E));
      // ...
          Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
            // ...
          });
    }
    // ...
  };
  bool AreAllSameBlock = !AreScatterAllGEPSameBlock;
  bool AreAllSameInsts = AreAllSameBlock || AreScatterAllGEPSameBlock;
  if (!AreAllSameInsts || isSplat(VL) ||
      // ...
      NotProfitableForVectorization(VL)) {
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n");
    // ...
    return ScalarsVectorizationLegality(S, false);
  }
  // ...
  if (!EphValues.empty()) {
    for (Value *V : VL) {
      if (EphValues.count(V)) {
        LLVM_DEBUG(dbgs() << /*...*/ << ") is ephemeral.\n");
        // ...
        return ScalarsVectorizationLegality(S, false, /*...*/);
      }
    }
  }
  // ...
  if (S.isAltShuffle()) {
    auto GetNumVectorizedExtracted = [&]() {
      // ...
          all_of(I->operands(), [&](const Use &U) {
            return isa<ExtractElementInst>(U.get());
          });
      // ...
      else if (!I->hasOneUser() && !areAllUsersVectorized(I, UserIgnoreList))
        // ...
      return std::make_pair(Vectorized, Extracted);
    };
    auto [Vectorized, Extracted] = GetNumVectorizedExtracted();
    // ...
    bool PreferScalarize = !Vectorized.isAllOnes() && VL.size() == 2;
    if (!Vectorized.isAllOnes() && !PreferScalarize) {
      // ...
      Type *ScalarTy = VL.front()->getType();
      // ...
          false, true, Kind);
      // ...
          *TTI, ScalarTy, VecTy, Vectorized,
          true, false, Kind, false);
      PreferScalarize = VectorizeCostEstimate > ScalarizeCostEstimate;
    }
    if (PreferScalarize) {
      LLVM_DEBUG(dbgs() << "SLP: The instructions are in tree and alternate "
                           "node is not profitable.\n");
      return ScalarsVectorizationLegality(S, false);
    }
  }
  // ...
  if (UserIgnoreList && !UserIgnoreList->empty()) {
    for (Value *V : VL) {
      if (UserIgnoreList->contains(V)) {
        LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
        return ScalarsVectorizationLegality(S, false);
      }
    }
  }
  // ...
  return ScalarsVectorizationLegality(S, true);
}
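// Summary of the legality checks above: a bundle is refused (and gathered
// instead) when its scattered GEPs do not share a block, the parent block
// is unreachable from entry, the scalars extract from scalable vectors,
// the recursion depth limit is reached, the bundle exactly repeats an
// existing tree entry ("perfect diamond merge"), the values are splat,
// ephemeral, or already gathered scalars, or an alternate-opcode pair is
// estimated cheaper when scalarized. Only the final return reports the
// bundle as legal to vectorize.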
void BoUpSLP::buildTreeRec(ArrayRef<Value *> VL, unsigned Depth,
                           const EdgeInfo &UserTreeIdx,
                           unsigned InterleaveFactor) {
  // ...
  SmallVector<int> ReuseShuffleIndices;
  // ...
  auto TrySplitNode = [&](const InstructionsState &LocalState) {
    // ...
    if (!canBuildSplitNode(VL, LocalState, Op1, Op2, ReorderIndices))
      return false;
    // ...
    auto Invalid = ScheduleBundle::invalid();
    auto *TE = newTreeEntry(VL, TreeEntry::SplitVectorize, Invalid, LocalState,
                            UserTreeIdx, {}, ReorderIndices);
    // ...
        getSameValuesTreeEntry(S.getMainOp(), Op, /*...*/true))) {
      // ...
      TE->CombinedEntriesWithIndices.emplace_back(VectorizableTree.size(),
                                                  Idx == 0 ? 0 : Op1.size());
      (void)newTreeEntry(Op, TreeEntry::NeedToGather, Invalid, S, {TE, Idx});
      // ...
      TE->CombinedEntriesWithIndices.emplace_back(VectorizableTree.size(),
                                                  Idx == 0 ? 0 : Op1.size());
      // ...
    }
    // ...
  };
  // ...
  bool AreConsts = false;
  for (Value *V : VL) {
    // ...
  }
  // ...
  if (AreOnlyConstsWithPHIs(VL)) {
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to all constants and PHIs.\n");
    newGatherTreeEntry(VL, InstructionsState::invalid(), UserTreeIdx);
    return;
  }
  ScalarsVectorizationLegality Legality = getScalarsVectorizationLegality(
      VL, Depth, UserTreeIdx, /*TryCopyableElementsVectorization=*/false);
  InstructionsState S = Legality.getInstructionsState();
  if (!Legality.isLegal()) {
    if (Legality.trySplitVectorize()) {
      // ...
      if (MainOp && AltOp && TrySplitNode(InstructionsState(MainOp, AltOp)))
        return;
    }
    // ...
    Legality = getScalarsVectorizationLegality(
        VL, Depth, UserTreeIdx, /*TryCopyableElementsVectorization=*/true);
    if (!Legality.isLegal()) {
      if (Legality.tryToFindDuplicates())
        // ...
      newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
      return;
    }
    S = Legality.getInstructionsState();
  }
  // ...
  if (S.isAltShuffle() && TrySplitNode(S))
    return;
  // ...
    newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
  // ...
  bool IsScatterVectorizeUserTE =
      UserTreeIdx.UserTE &&
      UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
  // ...
  StridedPtrInfo SPtrInfo;
  TreeEntry::EntryState State = getScalarsVectorizationState(
      S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps, SPtrInfo);
  if (State == TreeEntry::NeedToGather) {
    newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
    return;
  }
  // ...
  auto &BSRef = BlocksSchedules[BB];
  if (!BSRef)
    BSRef = std::make_unique<BlockScheduling>(BB);
  BlockScheduling &BS = *BSRef;
  // ...
  std::optional<ScheduleBundle *> BundlePtr =
      BS.tryScheduleBundle(UniqueValues.getArrayRef(), this, S, UserTreeIdx);
#ifdef EXPENSIVE_CHECKS
  // ...
#endif
  if (!BundlePtr || (*BundlePtr && !*BundlePtr.value())) {
    LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
    // ...
    if (S.isAltShuffle() && ReuseShuffleIndices.empty() && TrySplitNode(S))
      return;
    newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
    NonScheduledFirst.insert(VL.front());
    if (S.getOpcode() == Instruction::Load &&
        BS.ScheduleRegionSize < BS.ScheduleRegionSizeLimit)
      // ...
    return;
  }
  InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
  // ...
  ScheduleBundle Empty;
  ScheduleBundle &Bundle = BundlePtr.value() ? *BundlePtr.value() : Empty;
  LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
  // ...
  unsigned ShuffleOrOp =
      S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
  auto CreateOperandNodes = [&](TreeEntry *TE, const auto &Operands) {
    // ...
    SmallVector<unsigned> PHIOps;
    // ...
      if ((!S || S.getOpcode() != Instruction::PHI) || S.isAltShuffle())
        // ...
    for (unsigned I : PHIOps)
      buildTreeRec(Operands[I], Depth + 1, {TE, I});
  };
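  // Note on the dispatch below: a bundle with alternating opcodes
  // (S.isAltShuffle()) is routed to the Instruction::ShuffleVector case,
  // since it will ultimately be emitted as two vector instructions blended
  // by a shuffle; all other bundles dispatch on their common opcode.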
  switch (ShuffleOrOp) {
  case Instruction::PHI: {
    // ...
    TreeEntry *TE =
        newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
    // ...
    TE->setOperands(Operands);
    CreateOperandNodes(TE, Operands);
    return;
  }
  case Instruction::ExtractValue:
  case Instruction::ExtractElement: {
    if (CurrentOrder.empty()) {
      LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
    } else {
      LLVM_DEBUG({
        dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
               /*...*/;
        for (unsigned Idx : CurrentOrder)
          dbgs() << " " << Idx;
        // ...
      });
    }
    // ...
    TreeEntry *TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,
                                 ReuseShuffleIndices, CurrentOrder);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry "
                         "(ExtractValueInst/ExtractElementInst).\n";
               /*...*/);
    // ...
    TE->setOperands(Operands);
    return;
  }
  case Instruction::InsertElement: {
    assert(ReuseShuffleIndices.empty() && "All inserts should be unique");
    // ...
    auto OrdCompare = [](const std::pair<int, int> &P1,
                         const std::pair<int, int> &P2) {
      return P1.first > P2.first;
    };
    PriorityQueue<std::pair<int, int>, SmallVector<std::pair<int, int>>,
                  decltype(OrdCompare)>
        Indices(OrdCompare);
    for (int I = 0, E = VL.size(); I < E; ++I) {
      // ...
      Indices.emplace(Idx, I);
    }
    OrdersType CurrentOrder(VL.size(), VL.size());
    bool IsIdentity = true;
    for (int I = 0, E = VL.size(); I < E; ++I) {
      CurrentOrder[Indices.top().second] = I;
      IsIdentity &= Indices.top().second == I;
      // ...
    }
    if (IsIdentity)
      CurrentOrder.clear();
    TreeEntry *TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,
                                 /*...*/);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (InsertElementInst).\n";
               /*...*/);
    // ...
    TE->setOperands(Operands);
    buildTreeRec(TE->getOperand(1), Depth + 1, {TE, 1});
    return;
  }
  case Instruction::Load: {
    // ...
    TreeEntry *TE = nullptr;
    switch (State) {
    case TreeEntry::Vectorize:
      TE = newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices,
                        CurrentOrder, InterleaveFactor);
      if (CurrentOrder.empty())
        LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (LoadInst).\n";
                   /*...*/);
      else
        LLVM_DEBUG(dbgs()
                       << "SLP: added a new TreeEntry (jumbled LoadInst).\n";
                   /*...*/);
      break;
    case TreeEntry::CompressVectorize:
      // ...
      TE = newTreeEntry(VL, TreeEntry::CompressVectorize, Bundle, S,
                        UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
      LLVM_DEBUG(
          dbgs()
              << "SLP: added a new TreeEntry (masked LoadInst + compress).\n";
          /*...*/);
      break;
    case TreeEntry::StridedVectorize:
      // ...
      TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
                        UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
      TreeEntryToStridedPtrInfoMap[TE] = SPtrInfo;
      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (strided LoadInst).\n";
                 /*...*/);
      break;
    case TreeEntry::ScatterVectorize:
      // ...
      TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
                        UserTreeIdx, ReuseShuffleIndices);
      LLVM_DEBUG(
          dbgs()
              << "SLP: added a new TreeEntry (non-consecutive LoadInst).\n";
          /*...*/);
      break;
    case TreeEntry::CombinedVectorize:
    case TreeEntry::SplitVectorize:
    case TreeEntry::NeedToGather:
      // ...
    }
    if (!CurrentOrder.empty() && State != TreeEntry::ScatterVectorize) {
      assert(Operands.size() == 1 && "Expected a single operand only");
      SmallVector<int> Mask;
      // ...
    }
    TE->setOperands(Operands);
    if (State == TreeEntry::ScatterVectorize)
      buildTreeRec(PointerOps, Depth + 1, {TE, 0});
    return;
  }
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    auto [PrevMaxBW, PrevMinBW] = CastMaxMinBWSizes.value_or(
        std::make_pair(std::numeric_limits<unsigned>::min(),
                       std::numeric_limits<unsigned>::max()));
    if (ShuffleOrOp == Instruction::ZExt ||
        ShuffleOrOp == Instruction::SExt) {
      CastMaxMinBWSizes = std::make_pair(
          std::max<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
                             /*...*/),
          std::min<unsigned>(
              /*...*/));
    } else if (ShuffleOrOp == Instruction::Trunc) {
      CastMaxMinBWSizes = std::make_pair(
          std::max<unsigned>(
              /*...*/),
          std::min<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
                             /*...*/));
    }
    TreeEntry *TE =
        newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CastInst).\n";
               /*...*/);
    TE->setOperands(Operands);
    // ...
      buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
    if (ShuffleOrOp == Instruction::Trunc) {
      ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
    } else if (ShuffleOrOp == Instruction::SIToFP ||
               ShuffleOrOp == Instruction::UIToFP) {
      unsigned NumSignBits = /*...*/;
      // ...
        APInt Mask = DB->getDemandedBits(OpI);
        NumSignBits = std::max(NumSignBits, Mask.countl_zero());
      // ...
      if (NumSignBits * 2 >=
          /*...*/)
        ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
    }
    return;
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    // ...
    TreeEntry *TE =
        newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
    // ...
           "Commutative Predicate mismatch");
    // ...
    Operands.back() = Ops.getVL(1);
    // ...
      if (Cmp->getPredicate() != P0)
        // ...
    TE->setOperands(Operands);
    buildTreeRec(Operands.front(), Depth + 1, {TE, 0});
    buildTreeRec(Operands.back(), Depth + 1, {TE, 1});
    if (ShuffleOrOp == Instruction::ICmp) {
      unsigned NumSignBits0 = /*...*/;
      if (NumSignBits0 * 2 >=
          /*...*/)
        ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
      unsigned NumSignBits1 = /*...*/;
      if (NumSignBits1 * 2 >=
          /*...*/)
        ExtraBitWidthNodes.insert(getOperandEntry(TE, 1)->Idx);
    }
    return;
  }
  case Instruction::Select:
  case Instruction::FNeg:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::Freeze: {
    TreeEntry *TE =
        newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
    LLVM_DEBUG(
        dbgs() << "SLP: added a new TreeEntry "
                  "(SelectInst/UnaryOperator/BinaryOperator/FreezeInst).\n";
        /*...*/);
    // ...
    Operands[0] = Ops.getVL(0);
    Operands[1] = Ops.getVL(1);
    // ...
    TE->setOperands(Operands);
    // ...
      buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
    return;
  }
  case Instruction::GetElementPtr: {
    TreeEntry *TE =
        newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (GetElementPtrInst).\n";
               /*...*/);
    TE->setOperands(Operands);
    // ...
      buildTreeRec(Operands[I], Depth + 1, {TE, I});
    return;
  }
  case Instruction::Store: {
    bool Consecutive = CurrentOrder.empty();
    // ...
    TreeEntry *TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,
                                 ReuseShuffleIndices, CurrentOrder);
    if (Consecutive)
      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (StoreInst).\n";
                 /*...*/);
    else
      LLVM_DEBUG(dbgs()
                     << "SLP: added a new TreeEntry (jumbled StoreInst).\n";
                 /*...*/);
    TE->setOperands(Operands);
    buildTreeRec(TE->getOperand(0), Depth + 1, {TE, 0});
    return;
  }
  case Instruction::Call: {
    // ...
    TreeEntry *TE =
        newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CallInst).\n";
               /*...*/);
    // ...
    Operands[0] = Ops.getVL(0);
    Operands[1] = Ops.getVL(1);
    // ...
    TE->setOperands(Operands);
    // ...
      buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
    return;
  }
  case Instruction::ShuffleVector: {
    TreeEntry *TE =
        newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
    if (S.isAltShuffle()) {
      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (isAltShuffle).\n";
                 /*...*/);
    } else {
      // ...
      LLVM_DEBUG(dbgs()
                     << "SLP: added a new TreeEntry (ShuffleVectorInst).\n";
                 /*...*/);
    }
    // ...
           "Expected different main/alternate predicates.");
    // ...
      TE->setOperands(Operands);
      buildTreeRec(Operands.front(), Depth + 1, {TE, 0});
      buildTreeRec(Operands.back(), Depth + 1, {TE, 1});
      return;
    // ...
    Operands[0] = Ops.getVL(0);
    Operands[1] = Ops.getVL(1);
    // ...
    TE->setOperands(Operands);
    // ...
      buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
    return;
  }
  // ...
  }
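  // Every vectorizable case above follows the same shape (a sketch
  // assembled from the fragments above, not new logic):
  //   TreeEntry *TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,
  //                                ReuseShuffleIndices);
  //   TE->setOperands(Operands);
  //   for each operand index I:
  //     buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
  // i.e. record the bundle as a tree node, then recurse into every operand
  // bundle with this node as the user edge.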
  // ... (element-count computation for an aggregate type T)
  if (auto *ST = dyn_cast<StructType>(EltTy)) {
    for (const auto *Ty : ST->elements())
      if (Ty != *ST->element_begin())
        return 0;
    N *= ST->getNumElements();
    EltTy = *ST->element_begin();
  } else if (auto *AT = dyn_cast<ArrayType>(EltTy)) {
    N *= AT->getNumElements();
    EltTy = AT->getElementType();
  } else if (auto *VT = dyn_cast<FixedVectorType>(EltTy)) {
    N *= VT->getNumElements();
    EltTy = VT->getElementType();
  }
  // ...
  size_t VTSize = DL->getTypeStoreSizeInBits(getWidenedType(EltTy, N));
  if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
      VTSize != DL->getTypeStoreSizeInBits(T))
    return 0;
  // ...

// ... (function signature elided)
                         bool ResizeAllowed) const {
  // ...
  assert(It != VL.end() && "Expected at least one extract instruction.");
  // ...
  Value *Vec = E0->getOperand(0);
  // ...
  CurrentOrder.clear();
  // ...
  if (E0->getOpcode() == Instruction::ExtractValue) {
    // ...
  }
  // ...
  unsigned E = VL.size();
  if (!ResizeAllowed && NElts != E)
    return false;
  // ...
  unsigned MinIdx = NElts, MaxIdx = 0;
  // ...
    if (Inst->getOperand(0) != Vec)
      return false;
    // ...
    const unsigned ExtIdx = *Idx;
    if (ExtIdx >= NElts)
      // ...
    Indices[I] = ExtIdx;
    if (MinIdx > ExtIdx)
      MinIdx = ExtIdx;
    if (MaxIdx < ExtIdx)
      MaxIdx = ExtIdx;
  // ...
  if (MaxIdx - MinIdx + 1 > E)
    return false;
  if (MaxIdx + 1 <= E)
    // ...
  bool ShouldKeepOrder = true;
  // ...
  for (unsigned I = 0; I < E; ++I) {
    // ...
    const unsigned ExtIdx = Indices[I] - MinIdx;
    if (CurrentOrder[ExtIdx] != E) {
      CurrentOrder.clear();
      return false;
    }
    ShouldKeepOrder &= ExtIdx == I;
    CurrentOrder[ExtIdx] = I;
  }
  if (ShouldKeepOrder)
    CurrentOrder.clear();
  return ShouldKeepOrder;
}
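// The routine above accepts a bundle of extract instructions only if they
// all read the same source vector and their indices fall into a dense
// window of at most VL.size() elements; CurrentOrder is left empty for the
// identity order and otherwise records the permutation induced by the
// extract indices, with a repeated index aborting the reuse entirely.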
bool BoUpSLP::areAllUsersVectorized(
    Instruction *I, const SmallDenseSet<Value *> *VectorizedVals) const {
  return (I->hasOneUse() && (!VectorizedVals || VectorizedVals->contains(I))) ||
         all_of(I->users(), [this](User *U) {
           return isVectorized(U) || isVectorLikeInstWithConstOps(U) ||
                  (isa<ExtractElementInst>(U) && MustGather.contains(U));
         });
}
void BoUpSLP::TreeEntry::buildAltOpShuffleMask(
    const function_ref<bool(Instruction *)> IsAltOp, SmallVectorImpl<int> &Mask,
    SmallVectorImpl<Value *> *OpScalars,
    SmallVectorImpl<Value *> *AltScalars) const {
  unsigned Sz = Scalars.size();
  // ...
  SmallVector<int> OrderMask;
  if (!ReorderIndices.empty())
    // ...
  for (unsigned I = 0; I < Sz; ++I) {
    // ...
    if (!ReorderIndices.empty())
      Idx = OrderMask[I];
    // ...
    if (IsAltOp(OpInst)) {
      Mask[I] = Sz + Idx;
      // ...
    }
    // ...
  }
  if (!ReuseShuffleIndices.empty()) {
    // ...
    transform(ReuseShuffleIndices, NewMask.begin(), [&Mask](int Idx) {
      return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem;
    });
    Mask.swap(NewMask);
  }
}

// ...
    return InstructionsState(MainOp, AltOp).getMatchingMainOpOrAltOp(I) ==
           MainOp;
  // ...
    assert(MainP != AltP && "Expected different main/alternate predicates.");
    // ...
    assert((MainP == P || AltP == P || MainP == SwappedP || AltP == SwappedP) &&
           "CmpInst expected to match either main or alternate predicate or "
           /*...*/);
    return MainP != P && MainP != SwappedP;
  // ...
    return InstructionsState(MainOp, AltOp).getMatchingMainOpOrAltOp(I) ==
           AltOp;

// ...
  const auto *Op0 = Ops.front();
  // ...
    return CI->getValue().isPowerOf2();
  // ...
    return CI->getValue().isNegatedPowerOf2();
  // ...
  if (IsConstant && IsUniform)
    // ...
  else if (IsConstant)
    // ...
  else if (IsUniform)
    // ...
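// The classification above feeds TTI's operand-kind modeling: all-constant
// operands, uniform (splat) operands, and power-of-2 or negated-power-of-2
// constants are distinguished because targets price them differently (for
// instance, a multiply by a power of 2 may lower to a shift). This mirrors
// the TTI::OperandValueInfo categories used by the cost queries below.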
class BaseShuffleAnalysis {
protected:
  Type *ScalarTy = nullptr;

  BaseShuffleAnalysis(Type *ScalarTy) : ScalarTy(ScalarTy) {}

  unsigned getVF(Value *V) const {
    assert(V && "V cannot be nullptr");
    assert(/*...*/ &&
           "V does not have FixedVectorType");
    assert(ScalarTy && "ScalarTy cannot be nullptr");
    // ...
    unsigned VNumElements = /*...*/;
    assert(VNumElements > ScalarTyNumElements &&
           "the number of elements of V is not large enough");
    assert(VNumElements % ScalarTyNumElements == 0 &&
           "the number of elements of V is not a vectorized value");
    return VNumElements / ScalarTyNumElements;
  }

  static bool isIdentityMask(ArrayRef<int> Mask, const FixedVectorType *VecTy,
                             /*...*/) {
    int Limit = Mask.size();
    // ...
    if (Limit % VF == 0 && all_of(seq<int>(0, Limit / VF), [=](int Idx) {
          ArrayRef<int> Slice = Mask.slice(Idx * VF, VF);
          // ...
        }))
      // ...
  }

  static void combineMasks(unsigned LocalVF, SmallVectorImpl<int> &Mask,
                           ArrayRef<int> ExtMask) {
    unsigned VF = Mask.size();
    // ...
    for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
      // ...
      int MaskedIdx = Mask[ExtMask[I] % VF];
      // ...
    }
    Mask.swap(NewMask);
  }

  static bool peekThroughShuffles(Value *&V, SmallVectorImpl<int> &Mask,
                                  bool SinglePermute) {
    // ...
    ShuffleVectorInst *IdentityOp = nullptr;
    SmallVector<int> IdentityMask;
    // ...
      if (isIdentityMask(Mask, SVTy, /*IsStrict=*/false)) {
        if (!IdentityOp || !SinglePermute ||
            (isIdentityMask(Mask, SVTy, /*IsStrict=*/true) &&
             /*...*/
                 IdentityMask.size()))) {
          // ...
          IdentityMask.assign(Mask);
        }
      }
      // ...
      if (SV->isZeroEltSplat()) {
        // ...
        IdentityMask.assign(Mask);
      }
      int LocalVF = Mask.size();
      // ...
        LocalVF = SVOpTy->getNumElements();
      // ...
            static_cast<unsigned>(I) >= SV->getShuffleMask().size())
          // ...
        ExtMask[Idx] = SV->getMaskValue(I);
      // ...
      if (!IsOp1Undef && !IsOp2Undef) {
        // ...
        for (int &I : Mask) {
          // ...
          if (SV->getMaskValue(I % SV->getShuffleMask().size()) ==
              /*...*/)
            // ...
        }
      }
      // ...
      SmallVector<int> ShuffleMask(SV->getShuffleMask());
      combineMasks(LocalVF, ShuffleMask, Mask);
      Mask.swap(ShuffleMask);
      // ...
        Op = SV->getOperand(0);
      // ...
        Op = SV->getOperand(1);
    // ...
        !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute) ||
        /*...*/) {
      // ...
             "Expected masks of same sizes.");
      // ...
      Mask.swap(IdentityMask);
      // ...
      return SinglePermute &&
             /*...*/
             (Shuffle && Mask.size() == Shuffle->getShuffleMask().size() &&
              Shuffle->isZeroEltSplat() &&
              /*...*/
                  Shuffle->getShuffleMask()[P.index()] == 0;
              /*...*/);
    }
    // ...
  }

  template <typename T, typename ShuffleBuilderTy>
  static T createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask,
                         ShuffleBuilderTy &Builder, Type *ScalarTy) {
    assert(V1 && "Expected at least one vector value.");
    // ...
    SmallVector<int> NewMask(Mask);
    if (ScalarTyNumElements != 1) {
      // ...
    }
    if (V2) {
      // ...
      Builder.resizeToMatch(V1, V2);
      int VF = Mask.size();
      if (auto *FTy = dyn_cast<FixedVectorType>(V1->getType()))
        VF = FTy->getNumElements();
      // ...
      for (int I = 0, E = Mask.size(); I < E; ++I) {
        // ...
          CombinedMask1[I] = Mask[I];
        // ...
          CombinedMask2[I] = Mask[I] - VF;
      }
      // ...
      do {
        // ...
        (void)peekThroughShuffles(Op1, CombinedMask1, /*SinglePermute=*/false);
        (void)peekThroughShuffles(Op2, CombinedMask2, /*SinglePermute=*/false);
        // ...
          for (auto [Idx, I] : enumerate(CombinedMask1)) {
            // ...
              ExtMask1[Idx] = SV1->getMaskValue(I);
          }
          // ...
                  ->getNumElements(),
              ExtMask1, UseMask::SecondArg);
          SmallVector<int> ExtMask2(CombinedMask2.size(), PoisonMaskElem);
          for (auto [Idx, I] : enumerate(CombinedMask2)) {
            // ...
              ExtMask2[Idx] = SV2->getMaskValue(I);
          }
          // ...
                  ->getNumElements(),
              ExtMask2, UseMask::SecondArg);
          if (SV1->getOperand(0)->getType() ==
                  SV2->getOperand(0)->getType() &&
              SV1->getOperand(0)->getType() != SV1->getType() &&
              /*...*/) {
            Op1 = SV1->getOperand(0);
            Op2 = SV2->getOperand(0);
            SmallVector<int> ShuffleMask1(SV1->getShuffleMask());
            int LocalVF = ShuffleMask1.size();
            if (auto *FTy = dyn_cast<FixedVectorType>(Op1->getType()))
              LocalVF = FTy->getNumElements();
            combineMasks(LocalVF, ShuffleMask1, CombinedMask1);
            CombinedMask1.swap(ShuffleMask1);
            SmallVector<int> ShuffleMask2(SV2->getShuffleMask());
            LocalVF = ShuffleMask2.size();
            if (auto *FTy = dyn_cast<FixedVectorType>(Op2->getType()))
              LocalVF = FTy->getNumElements();
            combineMasks(LocalVF, ShuffleMask2, CombinedMask2);
            CombinedMask2.swap(ShuffleMask2);
          }
      } while (PrevOp1 != Op1 || PrevOp2 != Op2);
      Builder.resizeToMatch(Op1, Op2);
      // ...
              ->getElementCount()
              .getKnownMinValue(),
          /*...*/
              ->getElementCount()
              .getKnownMinValue());
      for (int I = 0, E = Mask.size(); I < E; ++I) {
        // ...
               "Expected undefined mask element");
        CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF);
      }
      // ...
        return Builder.createIdentity(Op1);
      return Builder.createShuffleVector(
          /*...*/);
    }
    // ...
      return Builder.createPoison(
          /*...*/);
    bool IsIdentity = peekThroughShuffles(V1, NewMask, /*SinglePermute=*/true);
    assert(V1 && "Expected non-null value after looking through shuffles.");
    if (!IsIdentity)
      return Builder.createShuffleVector(V1, NewMask);
    return Builder.createIdentity(V1);
  }
};
// ... (next function signature elided)
                          ArrayRef<int> Mask) {
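// Worked example for combineMasks/peekThroughShuffles above (illustrative
// only): given an inner shufflevector mask <3,1,2,0> (VF == 4) and an outer
// mask <0,2> layered over its result, combineMasks computes
// NewMask[I] = InnerMask[OuterMask[I] % VF], i.e. <3,2>, so a chain of
// shuffles collapses into one mask applied to the original operand.
// createShuffle then emits no shuffle at all when the collapsed mask turns
// out to be an identity (Builder.createIdentity above).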
static std::pair<InstructionCost, InstructionCost>
// ... (signature elided)
  // ...
  if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
    // ...
    ScalarCost = TTI.getPointersChainCost(
        Ptrs, BasePtr, TTI::PointersChainInfo::getUnitStride(), ScalarTy,
        CostKind);
    // ...
    for (Value *V : Ptrs) {
      if (V == BasePtr) {
        // ...
      }
      // ...
    }
    // ...
    if (PtrsRetainedInVecCode.size() == Ptrs.size()) {
      // ...
      VecCost =
          TTI.getPointersChainCost(PtrsRetainedInVecCode, BasePtr,
                                   TTI::PointersChainInfo::getKnownStride(),
                                   /*...*/);
    }
    // ...
            [](const Value *V) {
              // ...
              return Ptr && !Ptr->hasAllConstantIndices();
            })
            ? TTI::PointersChainInfo::getUnknownStride()
            : TTI::PointersChainInfo::getKnownStride();
    // ...
        TTI.getPointersChainCost(Ptrs, BasePtr, PtrsInfo, ScalarTy, CostKind);
  } else {
    // ...
    if (It != Ptrs.end())
      // ...
      VecCost = TTI.getGEPCost(BaseGEP->getSourceElementType(),
                               BaseGEP->getPointerOperand(), Indices, VecTy,
                               CostKind);
  }
  return std::make_pair(ScalarCost, VecCost);
}
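// The pair returned above drives the GEP-chain decision: ScalarCost models
// keeping one scalar GEP per lane (a unit-stride pointer chain), while
// VecCost models the single base GEP that survives vectorization, charged
// with an unknown-stride penalty whenever a pointer in the chain has
// non-constant indices. Callers compare the two to decide whether
// vectorizing the address computation pays off.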
void BoUpSLP::reorderGatherNode(TreeEntry &TE) {
  assert(TE.isGather() && TE.ReorderIndices.empty() &&
         "Expected gather node without reordering.");
  // ...
  SmallSet<size_t, 2> LoadKeyUsed;
  // ...
  if (TE.Scalars.size() == 2 || (TE.hasState() && !TE.isAltShuffle()) ||
      // ...
        return VectorizableTree[Idx]->isSame(TE.Scalars);
      // ...
    return;
  // ...
  auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
    // ...
    auto LIt = LoadsMap.find(std::make_pair(Key, Ptr));
    if (LIt != LoadsMap.end()) {
      for (LoadInst *RLI : LIt->second) {
        // ...
            LI->getType(), LI->getPointerOperand(), *DL, *SE,
            /*...*/
      }
      for (LoadInst *RLI : LIt->second) {
        // ...
            LI->getPointerOperand(), *TLI)) {
          // ...
        }
      }
      if (LIt->second.size() > 2) {
        // ...
            hash_value(LIt->second.back()->getPointerOperand());
        // ...
      }
    }
    // ...
    LoadsMap.try_emplace(std::make_pair(Key, Ptr)).first->second.push_back(LI);
    // ...
  };
  MapVector<size_t, MapVector<size_t, SmallVector<Value *>>> SortedValues;
  SmallDenseMap<Value *, SmallVector<unsigned>, 8> KeyToIndex;
  bool IsOrdered = true;
  unsigned NumInstructions = 0;
  // ...
    size_t Key = 1, Idx = 1;
    // ...
    auto &Container = SortedValues[Key];
    if (IsOrdered && !KeyToIndex.contains(V) &&
        // ...
        ((Container.contains(Idx) &&
          KeyToIndex.at(Container[Idx].back()).back() != I - 1) ||
         (!Container.empty() && !Container.contains(Idx) &&
          KeyToIndex.at(Container.back().second.back()).back() != I - 1)))
      // ...
    auto &KTI = KeyToIndex[V];
    // ...
    Container[Idx].push_back(V);
  // ...
  if (!IsOrdered && NumInstructions > 1) {
    // ...
    TE.ReorderIndices.resize(TE.Scalars.size(), TE.Scalars.size());
    for (const auto &D : SortedValues) {
      for (const auto &P : D.second) {
        // ...
        for (Value *V : P.second) {
          ArrayRef<unsigned> Indices = KeyToIndex.at(V);
          for (auto [K, Idx] : enumerate(Indices)) {
            TE.ReorderIndices[Cnt + K] = Idx;
            TE.Scalars[Cnt + K] = V;
          }
          Sz += Indices.size();
          Cnt += Indices.size();
        }
        // ...
            *TTI, TE.Scalars.front()->getType(), Sz);
        // ...
        } else if (!P.second.empty() && isConstant(P.second.front())) {
          // ...
        }
      }
    }
  }
  // ...
  if (!TE.ReuseShuffleIndices.empty() || TE.ReorderIndices.empty())
    return;
  // ...
  auto *ScalarTy = TE.Scalars.front()->getType();
  // ...
  for (auto [Idx, Sz] : SubVectors) {
    // ...
  }
  // ...
  int Sz = TE.Scalars.size();
  SmallVector<int> ReorderMask(TE.ReorderIndices.begin(),
                               TE.ReorderIndices.end());
  // ...
      ReorderMask[I] = I + TE.ReorderIndices.size();
  // ...
      any_of(ReorderMask, [&](int I) { return I >= Sz; })
      // ...
          VecTy, ReorderMask);
  // ...
      DemandedElts.clearBit(I);
      // ...
      ReorderMask[I] = I;
    // ...
      ReorderMask[I] = I + Sz;
  // ...
  if (!DemandedElts.isAllOnes())
    // ...
  if (Cost >= BVCost) {
    SmallVector<int> Mask(TE.ReorderIndices.begin(), TE.ReorderIndices.end());
    // ...
    TE.ReorderIndices.clear();
  }
}
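// reorderGatherNode, in short: gathered scalars are bucketed by a hash key
// (loads get a pointer-based subkey via GenerateLoadsSubkey so loads off
// nearby addresses land in one bucket), and if the original lane order
// interleaves buckets, a ReorderIndices permutation is built that groups
// related values together. The permutation survives only when the
// estimated shuffle cost beats rebuilding the vector element by element;
// the final `Cost >= BVCost` check clears it otherwise.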
// ... (FMA-contraction cost check; signature elided)
                                  const InstructionsState &S,
                                  /*...*/) {
  // ...
    return V->getType()->getScalarType()->isFloatingPointTy();
  // ...
         "Can only convert to FMA for floating point types");
  assert(S.isAddSubLikeOp() && "Can only convert to FMA for add/sub");
  // ...
  for (Value *V : VL) {
    // ...
    if (S.isCopyableElement(I))
      continue;
    Instruction *MatchingI = S.getMatchingMainOpOrAltOp(I);
    if (S.getMainOp() != MatchingI && S.getAltOp() != MatchingI)
      // ...
    FMF &= FPCI->getFastMathFlags();
    // ...
  }
  // ...
  if (!CheckForContractable(VL))
    // ...
  InstructionsCompatibilityAnalysis Analysis(DT, DL, TTI, TLI);
  // ...
  if (OpS.isAltShuffle() || OpS.getOpcode() != Instruction::FMul)
    // ...
  if (!CheckForContractable(Operands.front()))
    // ...
  // ...
  for (Value *V : VL) {
    // ...
    if (!S.isCopyableElement(I))
      // ...
    FMF &= FPCI->getFastMathFlags();
    FMulPlusFAddCost += TTI.getInstructionCost(I, CostKind);
    // ...
  }
  for (auto [V, Op] : zip(VL, Operands.front())) {
    if (S.isCopyableElement(V))
      continue;
    // ...
    if (!I || !I->hasOneUse() || OpS.isCopyableElement(I)) {
      // ...
      FMACost += TTI.getInstructionCost(OpI, CostKind);
      // ...
    }
    // ...
    FMF &= FPCI->getFastMathFlags();
    FMulPlusFAddCost += TTI.getInstructionCost(I, CostKind);
  }
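// The comparison being assembled above: FMulPlusFAddCost accumulates the
// price of keeping separate fmul and fadd/fsub instructions, while FMACost
// accumulates the fused alternative, including fmuls that must stay
// unfused because they have extra uses or are copyable lanes. Fast-math
// flags are intersected across all lanes (FMF &= ...), since the fused
// form can only assume what every lane permits.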
void BoUpSLP::transformNodes() {
  // ...
  BaseGraphSize = VectorizableTree.size();
  class GraphTransformModeRAAI {
    bool &SavedIsGraphTransformMode;

  public:
    GraphTransformModeRAAI(bool &IsGraphTransformMode)
        : SavedIsGraphTransformMode(IsGraphTransformMode) {
      IsGraphTransformMode = true;
    }
    ~GraphTransformModeRAAI() { SavedIsGraphTransformMode = false; }
  } TransformContext(IsGraphTransformMode);
  // ...
  auto CheckOperandsProfitability = [this](/*...*/,
                                           const InstructionsState &S) {
    // ...
        I2->getOperand(Op));
    // ...
        Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
          // ...
              [](const std::pair<Value *, Value *> &P) {
                // ...
              });
        });
  };
  // ...
    TreeEntry &E = *VectorizableTree[Idx];
    // ...
      reorderGatherNode(E);
  // ...
  constexpr unsigned VFLimit = 16;
  bool ForceLoadGather =
      count_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
        return TE->isGather() && TE->hasState() &&
               TE->getOpcode() == Instruction::Load &&
               TE->getVectorFactor() < VFLimit;
      }) /*...*/;
  // ...
  auto AreReusedScalars = [&](const TreeEntry *TE, ArrayRef<Value *> VL,
                              /*...*/) {
    return TE->isSame(VL) || all_of(VL, [&](Value *V) {
             // ...
           });
  };
  auto CheckForSameVectorNodes = [&](const TreeEntry &E) {
    if (E.hasState()) {
      if (/*...*/;
          !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
            return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
              ArrayRef<TreeEntry *> VTEs = getTreeEntries(V);
              return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
                return is_contained(TEs, TE);
              });
            });
          }))
        return true;
      // ...
      if (/*...*/;
          !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
            return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
              ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V);
              return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
                return is_contained(TEs, TE);
              });
            });
          }))
        return true;
    } else {
      // ...
      if (It != E.Scalars.end()) {
        if (/*...*/;
            !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
              return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
                ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V);
                return !VTEs.empty() &&
                       any_of(VTEs, [&](const TreeEntry *TE) {
                         return is_contained(TEs, TE);
                       });
              });
            }))
          return true;
      }
    }
    return false;
  };
  // ...
  for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
    TreeEntry &E = *VectorizableTree[Idx];
    if (E.isGather()) {
      // ...
      unsigned MinVF = getMinVF(2 * Sz);
      // ...
      if (VL.size() <= 2 || LoadEntriesToVectorize.contains(Idx) ||
          !(!E.hasState() || E.getOpcode() == Instruction::Load ||
            /*...*/))
        continue;
      // ...
      if (ForceLoadGather && E.hasState() &&
          E.getOpcode() == Instruction::Load)
        continue;
      // ...
      if (CheckForSameVectorNodes(E))
        continue;
      // ...
      unsigned StartIdx = 0;
      unsigned End = VL.size();
      SmallBitVector Processed(End);
      // ...
          *TTI, VL.front()->getType(), VL.size() - 1);
      // ...
               *TTI, VL.front()->getType(), VF - 1)) {
        if (StartIdx + VF > End)
          continue;
        // ...
        bool AllStrided = true;
        for (unsigned Cnt = StartIdx; Cnt + VF <= End; Cnt += VF) {
          // ...
              !getSameValuesTreeEntry(Slice.front(), Slice, /*...*/true))
            // ...
          bool IsSplat = isSplat(Slice);
          bool IsTwoRegisterSplat = true;
          if (IsSplat && VF == 2) {
            // ...
            IsTwoRegisterSplat = NumRegs2VF == 2;
          }
          if (Slices.empty() || !IsSplat || !IsTwoRegisterSplat ||
              /*...*/) {
            // ...
                (S.getOpcode() == Instruction::Load &&
                 /*...*/) ||
                (S.getOpcode() != Instruction::Load &&
                 /*...*/)
            // ...
            if ((!UserIgnoreList || E.Idx != 0) &&
                TTI->getInstructionCost(S.getMainOp(), CostKind) <
                    /*...*/) {
              // ...
              if (S.getOpcode() == Instruction::Load) {
                // ...
                StridedPtrInfo SPtrInfo;
                // ...
                    PointerOps, SPtrInfo);
                // ...
                if (UserIgnoreList && E.Idx == 0)
                  continue;
                // ...
              } else if (S.getOpcode() == Instruction::ExtractElement ||
                         (TTI->getInstructionCost(S.getMainOp(), CostKind) <
                              /*...*/ &&
                          !CheckOperandsProfitability(
                              /*...*/))) {
                // ...
              }
            }
          }
          // ...
        }
        if (VF == 2 && AllStrided && Slices.size() > 2)
          continue;
        auto AddCombinedNode = [&](unsigned Idx, unsigned Cnt, unsigned Sz) {
          E.CombinedEntriesWithIndices.emplace_back(Idx, Cnt);
          Processed.set(Cnt, Cnt + Sz);
          if (StartIdx == Cnt)
            StartIdx = Cnt + Sz;
          if (End == Cnt + Sz)
            // ...
        };
        for (auto [Cnt, Sz] : Slices) {
          // ...
          const TreeEntry *SameTE = nullptr;
          if (/*...*/;
              It != Slice.end()) {
            // ...
            SameTE = getSameValuesTreeEntry(*It, Slice);
          }
          unsigned PrevSize = VectorizableTree.size();
          [[maybe_unused]] unsigned PrevEntriesSize =
              LoadEntriesToVectorize.size();
          buildTreeRec(Slice, 0, EdgeInfo(&E, UINT_MAX));
          if (PrevSize + 1 == VectorizableTree.size() && !SameTE &&
              VectorizableTree[PrevSize]->isGather() &&
              VectorizableTree[PrevSize]->hasState() &&
              VectorizableTree[PrevSize]->getOpcode() !=
                  Instruction::ExtractElement &&
              /*...*/) {
            if (UserIgnoreList && E.Idx == 0 && VF == 2)
              continue;
            VectorizableTree.pop_back();
            assert(PrevEntriesSize == LoadEntriesToVectorize.size() &&
                   "LoadEntriesToVectorize expected to remain the same");
            continue;
          }
          AddCombinedNode(PrevSize, Cnt, Sz);
        }
      }
      // ...
      if (E.CombinedEntriesWithIndices.empty() && !E.ReorderIndices.empty()) {
        SmallVector<int> Mask(E.ReorderIndices.begin(),
                              E.ReorderIndices.end());
        // ...
        E.ReorderIndices.clear();
      }
    }
    // ...
    switch (E.getOpcode()) {
    case Instruction::Load: {
      // ...
      if (E.State != TreeEntry::Vectorize)
        break;
      Type *ScalarTy = E.getMainOp()->getType();
      // ...
      if (/*...*/ &&
          TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
        SmallVector<int> Mask;
        // ...
            TTI->getMemoryOpCost(Instruction::Load, VecTy, BaseLI->getAlign(),
                                 BaseLI->getPointerAddressSpace(), CostKind,
                                 /*...*/);
        // ...
            MemIntrinsicCostAttributes(Intrinsic::experimental_vp_strided_load,
                                       VecTy, BaseLI->getPointerOperand(),
                                       false, CommonAlignment,
                                       /*...*/);
        // ...
              ->getPointerOperand()
        // ...
        StridedPtrInfo SPtrInfo;
        SPtrInfo.StrideVal = ConstantInt::get(StrideTy, 1);
        SPtrInfo.Ty = VecTy;
        TreeEntryToStridedPtrInfoMap[&E] = SPtrInfo;
        E.State = TreeEntry::StridedVectorize;
      }
      break;
    }
    case Instruction::Store: {
      // ...
      if (/*...*/ &&
          TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
        SmallVector<int> Mask;
        // ...
            TTI->getMemoryOpCost(Instruction::Store, VecTy, BaseSI->getAlign(),
                                 BaseSI->getPointerAddressSpace(), CostKind,
                                 /*...*/);
        // ...
            MemIntrinsicCostAttributes(
                Intrinsic::experimental_vp_strided_store, VecTy,
                BaseSI->getPointerOperand(), false, CommonAlignment,
                /*...*/);
        // ...
        if (StridedCost < OriginalVecCost)
          // ...
          E.State = TreeEntry::StridedVectorize;
      } else if (!E.ReorderIndices.empty()) {
        auto IsInterleaveMask = [&, &TTI = *TTI](ArrayRef<int> Mask) {
          // ...
          assert(Mask.size() > 1 && "Expected mask greater than 1 element.");
          if (Mask.size() < 4)
            // ...
          // ...
                  Mask, Factor, VecTy->getElementCount().getFixedValue()) &&
              TTI.isLegalInterleavedAccessType(
                  VecTy, Factor, BaseSI->getAlign(),
                  BaseSI->getPointerAddressSpace()))
            // ...
        };
        SmallVector<int> Mask(E.ReorderIndices.begin(),
                              E.ReorderIndices.end());
        unsigned InterleaveFactor = IsInterleaveMask(Mask);
        if (InterleaveFactor != 0)
          E.setInterleave(InterleaveFactor);
      }
      break;
    }
    case Instruction::Select: {
      if (E.State != TreeEntry::Vectorize)
        break;
      // ...
      E.CombinedOp = TreeEntry::MinMax;
      TreeEntry *CondEntry = getOperandEntry(&E, 0);
      if (SelectOnly && CondEntry->UserTreeIndex &&
          CondEntry->State == TreeEntry::Vectorize) {
        // ...
        CondEntry->State = TreeEntry::CombinedVectorize;
      }
      break;
    }
    case Instruction::FSub:
    case Instruction::FAdd: {
      // ...
      if (E.State != TreeEntry::Vectorize ||
          !E.getOperations().isAddSubLikeOp())
        break;
      // ...
      E.CombinedOp = TreeEntry::FMulAdd;
      TreeEntry *FMulEntry = getOperandEntry(&E, 0);
      if (FMulEntry->UserTreeIndex &&
          FMulEntry->State == TreeEntry::Vectorize) {
        // ...
        FMulEntry->State = TreeEntry::CombinedVectorize;
      }
      break;
    }
    default:
      break;
    }
  }
  // ...
  if (LoadEntriesToVectorize.empty()) {
    // ...
    if (VectorizableTree.size() <= 1 &&
        VectorizableTree.front()->hasState() &&
        VectorizableTree.front()->getOpcode() == Instruction::Load)
      return;
    // ...
    constexpr unsigned SmallTree = 3;
    constexpr unsigned SmallVF = 2;
    if ((VectorizableTree.size() <= SmallTree &&
         VectorizableTree.front()->Scalars.size() == SmallVF) ||
        (VectorizableTree.size() <= 2 && UserIgnoreList))
      return;
    if (VectorizableTree.front()->isNonPowOf2Vec() &&
        /*...*/
            [](const std::unique_ptr<TreeEntry> &TE) {
              return TE->isGather() && TE->hasState() &&
                     TE->getOpcode() == Instruction::Load &&
                     /*...*/;
            })
      return;
  }
  // ...
  SmallMapVector<std::tuple<BasicBlock *, Value *, Type *>,
                 /*...*/>
      GatheredLoads;
  for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    TreeEntry &E = *TE;
    if (E.isGather() &&
        ((E.hasState() && E.getOpcode() == Instruction::Load) ||
         (!E.hasState() && any_of(E.Scalars,
                                  [&](Value *V) {
                                    return isa<LoadInst>(V) &&
                                           !isVectorized(V) &&
                                           !isDeleted(cast<Instruction>(V));
                                  }))) /*...*/) {
      // ...
      for (Value *V : E.Scalars) {
        // ...
            *this, V, *DL, *SE, *TTI,
            GatheredLoads[std::make_tuple(
                /*...*/)]);
        // ...
      }
    }
  }
  // ...
  if (!GatheredLoads.empty())
    tryToVectorizeGatheredLoads(GatheredLoads);
}
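// transformNodes post-pass, in short: once the tree is built, profitable
// nodes are rewritten in place. Consecutive loads/stores become
// StridedVectorize nodes when TTI->isLegalStridedLoadStore holds and the
// experimental_vp_strided_load/store intrinsic is estimated cheaper than a
// plain access plus shuffle; store reorder masks are tested for an
// interleaved-access factor; select trees are combined into MinMax nodes;
// fadd/fsub fed by fmul become FMulAdd candidates; and remaining gathered
// loads are collected per (block, pointer base, type) and handed to
// tryToVectorizeGatheredLoads.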
class ShuffleCostEstimator : public BaseShuffleAnalysis {
  bool IsFinalized = false;
  // ...
  bool SameNodesEstimated = true;

  static Constant *getAllOnesValue(const DataLayout &DL, Type *Ty) {
    if (Ty->getScalarType()->isPointerTy()) {
      // ...
              DL.getTypeStoreSizeInBits(Ty->getScalarType()))),
          Ty->getScalarType());
      // ...
    }
    // ...
  }

  InstructionCost getBuildVectorCost(ArrayRef<Value *> VL, Value *Root) {
    // ...
    assert(It != VL.end() && "Expected at least one non-undef value.");
    // ...
        count(VL, *It) > 1 &&
        /*...*/;
    if (!NeedShuffle) {
      // ...
        return TTI.getShuffleCost(
            /*...*/);
      // ...
      return TTI.getVectorInstrCost(Instruction::InsertElement, VecTy,
                                    CostKind, std::distance(VL.begin(), It),
                                    /*...*/);
    }
    // ...
      return isa<PoisonValue>(V) ? PoisonMaskElem : 0;
    // ...
        TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, 0,
                               /*...*/)
        // ...
                           VecTy, ShuffleMask, CostKind,
                           /*...*/);
    // ...
    return GatherCost +
           /*...*/
               : R.getGatherCost(Gathers, !Root && VL.equals(Gathers),
                                 /*...*/);
  }

  InstructionCost computeExtractCost(
      ArrayRef<Value *> VL, ArrayRef<int> Mask,
      ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
      unsigned NumParts) {
    assert(VL.size() > NumParts && "Unexpected scalarized shuffle.");
    unsigned NumElts =
        std::accumulate(VL.begin(), VL.end(), 0, [](unsigned Sz, Value *V) {
          auto *EE = dyn_cast<ExtractElementInst>(V);
          // ...
          auto *VecTy = dyn_cast<FixedVectorType>(EE->getVectorOperandType());
          // ...
          return std::max(Sz, VecTy->getNumElements());
        });
    // ...
    auto CheckPerRegistersShuffle = [&](MutableArrayRef<int> Mask,
                                        /*...*/)
        -> std::optional<TTI::ShuffleKind> {
      if (NumElts <= EltsPerVector)
        return std::nullopt;
      int OffsetReg0 =
          alignDown(std::accumulate(Mask.begin(), Mask.end(), INT_MAX,
                                    [](int S, int I) {
                                      if (I == PoisonMaskElem)
                                        return S;
                                      return std::min(S, I);
                                    }),
                    /*...*/);
      int OffsetReg1 = OffsetReg0;
      // ...
      int FirstRegId = -1;
      Indices.assign(1, OffsetReg0);
      // ...
        int Idx = I - OffsetReg0;
        int RegId =
            (Idx / NumElts) * NumParts + (Idx % NumElts) / EltsPerVector;
        if (FirstRegId < 0)
          FirstRegId = RegId;
        RegIndices.insert(RegId);
        if (RegIndices.size() > 2)
          return std::nullopt;
        if (RegIndices.size() == 2) {
          // ...
          if (Indices.size() == 1) {
            // ...
                std::next(Mask.begin(), Pos), Mask.end(), INT_MAX,
                [&](int S, int I) {
                  if (I == PoisonMaskElem)
                    return S;
                  int RegId = ((I - OffsetReg0) / NumElts) * NumParts +
                              ((I - OffsetReg0) % NumElts) / EltsPerVector;
                  if (RegId == FirstRegId)
                    return S;
                  return std::min(S, I);
                });
            // ...
            unsigned Index = OffsetReg1 % NumElts;
            Indices.push_back(Index);
            SubVecSizes.push_back(std::min(NumElts - Index, EltsPerVector));
          }
          Idx = I - OffsetReg1;
        }
        I = (Idx % NumElts) % EltsPerVector +
            (RegId == FirstRegId ? 0 : EltsPerVector);
      // ...
      return ShuffleKind;
    };
    // ...
      if (!ShuffleKinds[Part])
        continue;
      // ...
          Part * EltsPerVector, getNumElems(Mask.size(), EltsPerVector, Part));
      // ...
      std::optional<TTI::ShuffleKind> RegShuffleKind =
          CheckPerRegistersShuffle(SubMask, Indices, SubVecSizes);
      if (!RegShuffleKind) {
        // ...
                MaskSlice, std::max<unsigned>(NumElts, MaskSlice.size())))
          // ...
      }
      // ...
      unsigned BaseVF = getFullVectorNumberOfElements(
          *R.TTI, VL.front()->getType(), alignTo(NumElts, EltsPerVector));
      for (const auto [Idx, SubVecSize] : zip(Indices, SubVecSizes)) {
        assert((Idx + SubVecSize) <= BaseVF &&
               "SK_ExtractSubvector index out of range");
        // ...
      }
      // ...
          TTI, *ShuffleKinds[Part], getWidenedType(ScalarTy, NumElts),
          SubMask);
      if (OriginalCost < Cost)
        Cost = OriginalCost;
    // ...
  }

  void estimateNodesPermuteCost(const TreeEntry &E1, const TreeEntry *E2,
                                ArrayRef<int> Mask, unsigned Part,
                                unsigned SliceSize) {
    if (SameNodesEstimated) {
      // ...
      if ((InVectors.size() == 2 &&
           /*...*/)) {
        unsigned Limit = getNumElems(Mask.size(), SliceSize, Part);
        // ...
               "Expected all poisoned elements.");
        // ...
        copy(SubMask, std::next(CommonMask.begin(), SliceSize * Part));
        return;
      }
      // ...
      Cost += createShuffle(InVectors.front(),
                            InVectors.size() == 1 ? nullptr : InVectors.back(),
                            CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    } else if (InVectors.size() == 2) {
      Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    }
    SameNodesEstimated = false;
    if (!E2 && InVectors.size() == 1) {
      unsigned VF = E1.getVectorFactor();
      // ...
        VF = std::max(VF, getVF(V1));
      // ...
        VF = std::max(VF, E->getVectorFactor());
      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
        // ...
          CommonMask[Idx] = Mask[Idx] + VF;
      Cost += createShuffle(InVectors.front(), &E1, CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    } else {
      auto P = InVectors.front();
      Cost += createShuffle(&E1, E2, Mask);
      unsigned VF = Mask.size();
      // ...
        VF = std::max(VF, E->getVectorFactor());
      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
        // ...
          CommonMask[Idx] = Idx + (InVectors.empty() ? 0 : VF);
      Cost += createShuffle(P, InVectors.front(), CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    }
  }

  class ShuffleCostBuilder {
    // ...
    static bool isEmptyOrIdentity(ArrayRef<int> Mask, unsigned VF) {
      return Mask.empty() ||
             (VF == Mask.size() &&
              /*...*/);
    }

  public:
    // ...
    ~ShuffleCostBuilder() = default;

    // ...
      if (isEmptyOrIdentity(Mask, VF))
        // ...
    // ...
      if (isEmptyOrIdentity(Mask, VF))
        // ...
    // ...
    void resizeToMatch(Value *&, Value *&) const {}
  };
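  // ShuffleCostBuilder adapts BaseShuffleAnalysis::createShuffle for
  // costing: instead of emitting instructions it returns TTI shuffle costs,
  // and isEmptyOrIdentity short-circuits identity masks to a free result,
  // matching the emitter, which would not generate a shuffle for them at
  // all. resizeToMatch is a no-op here because no IR values are
  // materialized during cost estimation.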
  InstructionCost
  createShuffle(const PointerUnion<Value *, const TreeEntry *> &P1,
                const PointerUnion<Value *, const TreeEntry *> &P2,
                ArrayRef<int> Mask) {
    ShuffleCostBuilder Builder(TTI);
    // ...
    unsigned CommonVF = Mask.size();
    // ...
    auto GetNodeMinBWAffectedCost = [&](const TreeEntry &E,
                                        unsigned VF) -> InstructionCost {
      // ...
      Type *EScalarTy = E.Scalars.front()->getType();
      bool IsSigned = true;
      if (auto It = R.MinBWs.find(&E); It != R.MinBWs.end()) {
        // ...
        IsSigned = It->second.second;
      }
      if (EScalarTy != ScalarTy) {
        unsigned CastOpcode = Instruction::Trunc;
        unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
        unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
        if (DstSz > SrcSz)
          CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
        return TTI.getCastInstrCost(CastOpcode, getWidenedType(ScalarTy, VF),
                                    /*...*/);
      }
      // ...
    };
    auto GetValueMinBWAffectedCost = [&](const Value *V) -> InstructionCost {
      // ...
      Type *EScalarTy = VecTy->getElementType();
      if (EScalarTy != ScalarTy) {
        // ...
        unsigned CastOpcode = Instruction::Trunc;
        unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
        unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
        if (DstSz > SrcSz)
          CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
        return TTI.getCastInstrCost(
            /*...*/);
      }
      // ...
    };
    if (!V1 && !V2 && !P2.isNull()) {
      // ...
      unsigned VF = E->getVectorFactor();
      // ...
      CommonVF = std::max(VF, E2->getVectorFactor());
      assert(all_of(Mask,
                    [=](int Idx) {
                      return Idx < 2 * static_cast<int>(CommonVF);
                    }) &&
             "All elements in mask must be less than 2 * CommonVF.");
      if (E->Scalars.size() == E2->Scalars.size()) {
        // ...
        for (int &Idx : CommonMask) {
          // ...
          if (Idx < static_cast<int>(CommonVF) && !EMask.empty())
            // ...
          else if (Idx >= static_cast<int>(CommonVF))
            Idx = (E2Mask.empty() ? Idx - CommonVF : E2Mask[Idx - CommonVF]) +
                  /*...*/;
        }
        CommonVF = E->Scalars.size();
        ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF) +
                     GetNodeMinBWAffectedCost(*E2, CommonVF);
      } else {
        ExtraCost += GetNodeMinBWAffectedCost(*E, E->getVectorFactor()) +
                     GetNodeMinBWAffectedCost(*E2, E2->getVectorFactor());
      }
      // ...
      V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
    } else if (!V1 && P2.isNull()) {
      // ...
      unsigned VF = E->getVectorFactor();
      // ...
      assert(all_of(Mask,
                    [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
             "All elements in mask must be less than CommonVF.");
      if (E->Scalars.size() == Mask.size() && VF != Mask.size()) {
        // ...
        assert(!EMask.empty() && "Expected non-empty common mask.");
        for (int &Idx : CommonMask) {
          // ...
        }
        CommonVF = E->Scalars.size();
      } else if (unsigned Factor = E->getInterleaveFactor();
                 Factor > 0 && E->Scalars.size() != Mask.size() &&
                 /*...*/) {
        // ...
        std::iota(CommonMask.begin(), CommonMask.end(), 0);
      }
      ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF);
      // ...
      if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() &&
          CommonVF == CommonMask.size() &&
          /*...*/
              [](const auto &&P) {
                // ...
                       static_cast<unsigned>(P.value()) != P.index();
              })) {
        // ...
      }
    } else if (V1 && P2.isNull()) {
      // ...
      ExtraCost += GetValueMinBWAffectedCost(V1);
      CommonVF = getVF(V1);
      // ...
      assert(all_of(Mask,
                    [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
             "All elements in mask must be less than CommonVF.");
    } else if (V1 && !V2) {
      // ...
      unsigned VF = getVF(V1);
      CommonVF = std::max(VF, E2->getVectorFactor());
      assert(all_of(Mask,
                    [=](int Idx) {
                      return Idx < 2 * static_cast<int>(CommonVF);
                    }) &&
             "All elements in mask must be less than 2 * CommonVF.");
      if (E2->Scalars.size() == VF && VF != CommonVF) {
        // ...
        assert(!E2Mask.empty() && "Expected non-empty common mask.");
        for (int &Idx : CommonMask) {
          // ...
          if (Idx >= static_cast<int>(CommonVF))
            Idx = E2Mask[Idx - CommonVF] + VF;
        }
        // ...
      }
      ExtraCost += GetValueMinBWAffectedCost(V1);
      // ...
      ExtraCost += GetNodeMinBWAffectedCost(
          *E2, std::min(CommonVF, E2->getVectorFactor()));
      V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
    } else if (!V1 && V2) {
      // ...
      unsigned VF = getVF(V2);
      CommonVF = std::max(VF, E1->getVectorFactor());
      assert(all_of(Mask,
                    [=](int Idx) {
                      return Idx < 2 * static_cast<int>(CommonVF);
                    }) &&
             "All elements in mask must be less than 2 * CommonVF.");
      if (E1->Scalars.size() == VF && VF != CommonVF) {
        // ...
        assert(!E1Mask.empty() && "Expected non-empty common mask.");
        for (int &Idx : CommonMask) {
          // ...
          if (Idx >= static_cast<int>(CommonVF))
            Idx = E1Mask[Idx - CommonVF] + VF;
        }
        // ...
      }
      ExtraCost += GetNodeMinBWAffectedCost(
          *E1, std::min(CommonVF, E1->getVectorFactor()));
      // ...
      ExtraCost += GetValueMinBWAffectedCost(V2);
      V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
    } else {
      assert(V1 && V2 && "Expected both vectors.");
      unsigned VF = getVF(V1);
      CommonVF = std::max(VF, getVF(V2));
      assert(all_of(Mask,
                    [=](int Idx) {
                      return Idx < 2 * static_cast<int>(CommonVF);
                    }) &&
             "All elements in mask must be less than 2 * CommonVF.");
      ExtraCost +=
          GetValueMinBWAffectedCost(V1) + GetValueMinBWAffectedCost(V2);
      // ...
        V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
      // ...
    }
    // ...
      V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
    // ...
    InVectors.front() =
        /*...*/;
    if (InVectors.size() == 2)
      InVectors.pop_back();
    return ExtraCost + BaseShuffleAnalysis::createShuffle<InstructionCost>(
                           V1, V2, CommonMask, Builder, ScalarTy);
  }
public:
  ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI,
                       ArrayRef<Value *> VectorizedVals, BoUpSLP &R,
                       SmallPtrSetImpl<Value *> &CheckedExtracts)
      : BaseShuffleAnalysis(ScalarTy), TTI(TTI),
        VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()), R(R),
        CheckedExtracts(CheckedExtracts) {}

  // ... (method signature elided)
                  ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
                  unsigned NumParts, bool &UseVecBaseAsInput) {
    UseVecBaseAsInput = false;
    // ...
    Value *VecBase = nullptr;
    // ...
    if (!E->ReorderIndices.empty()) {
      SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
                                   E->ReorderIndices.end());
      // ...
    }
    // ...
    bool PrevNodeFound = any_of(
        ArrayRef(R.VectorizableTree).take_front(E->Idx),
        [&](const std::unique_ptr<TreeEntry> &TE) {
          return ((TE->hasState() && !TE->isAltShuffle() &&
                   TE->getOpcode() == Instruction::ExtractElement) ||
                  /*...*/) &&
                 all_of(enumerate(TE->Scalars), [&](auto &&Data) {
                   return VL.size() > Data.index() &&
                          (Mask[Data.index()] == PoisonMaskElem ||
                           isa<UndefValue>(VL[Data.index()]) ||
                           Data.value() == VL[Data.index()]);
                 });
        });
    // ...
      ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
      // ...
        VecBase = EE->getVectorOperand();
        UniqueBases.insert(VecBase);
        // ...
        if (!CheckedExtracts.insert(V).second ||
            // ...
            [&](const TreeEntry *TE) {
              return R.DeletedNodes.contains(TE) ||
                     R.TransformedToGatherNodes.contains(TE);
            } /*...*/ ||
            (E->UserTreeIndex && E->UserTreeIndex.EdgeIdx == UINT_MAX &&
             !R.isVectorized(EE) &&
             // ...
                 count_if(E->UserTreeIndex.UserTE->Scalars,
                          [&](Value *V) { return V == EE; })) ||
            // ...
              return isa<GetElementPtrInst>(U) &&
                     !R.areAllUsersVectorized(cast<Instruction>(U),
                                              /*...*/);
            // ...
          continue;
        unsigned Idx = *EEIdx;
        // ...
        if (EE->hasOneUse() || !PrevNodeFound) {
          // ...
            Cost -= TTI.getExtractWithExtendCost(
                /*...*/);
            // ...
            Cost += TTI.getCastInstrCost(
                /*...*/);
          // ...
        }
        APInt &DemandedElts =
            VectorOpsToExtracts
                /*...*/
                .first->getSecond();
        DemandedElts.setBit(Idx);
      // ...
    for (const auto &[Vec, DemandedElts] : VectorOpsToExtracts)
      // ...
                                     DemandedElts, /*Insert=*/false,
                                     /*...*/);
    // ...
    if (!PrevNodeFound)
      Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts);
    // ...
    transformMaskAfterShuffle(CommonMask, CommonMask);
    SameNodesEstimated = false;
    if (NumParts != 1 && UniqueBases.size() != 1) {
      UseVecBaseAsInput = true;
      // ...
    }
    // ...
  }

  std::optional<InstructionCost>
  needToDelay(const TreeEntry *,
              ArrayRef<SmallVector<const TreeEntry *>>) const {
    // ...
    return std::nullopt;
  }

  // ... (state reset, enclosing signature elided)
    IsFinalized = false;
    CommonMask.clear();
    // ...
    VectorizedVals.clear();
    SameNodesEstimated = true;

  void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
    // ...
    assert(all_of(Mask,
                  [&](int Idx) {
                    return Idx < static_cast<int>(E1.getVectorFactor());
                  }) &&
           "Expected single vector shuffle mask.");
    // ...
    if (InVectors.empty()) {
      CommonMask.assign(Mask.begin(), Mask.end());
      InVectors.assign({&E1, &E2});
      return;
    }
    assert(!CommonMask.empty() && "Expected non-empty common mask.");
    // ...
    unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
    estimateNodesPermuteCost(E1, &E2, Mask, Part, SliceSize);
  }

  void add(const TreeEntry &E1, ArrayRef<int> Mask) {
    if (InVectors.empty()) {
      CommonMask.assign(Mask.begin(), Mask.end());
      InVectors.assign(1, &E1);
      return;
    }
    assert(!CommonMask.empty() && "Expected non-empty common mask.");
    // ...
    unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
    estimateNodesPermuteCost(E1, nullptr, Mask, Part, SliceSize);
    if (!SameNodesEstimated && InVectors.size() == 1)
      InVectors.emplace_back(&E1);
  }

  // ... (extract-vector overload)
    assert(InVectors.size() == 1 &&
           /*...*/
               ->getOrdered(P.index()));
           return EI->getVectorOperand() == V1 ||
                  EI->getVectorOperand() == V2;
           /*...*/
           "Expected extractelement vectors.");

  void add(Value *V1, ArrayRef<int> Mask, bool ForExtracts = false) {
    if (InVectors.empty()) {
      assert(CommonMask.empty() && !ForExtracts &&
             "Expected empty input mask/vectors.");
      CommonMask.assign(Mask.begin(), Mask.end());
      InVectors.assign(1, V1);
      return;
    }
    if (ForExtracts) {
      // ...
      assert(!CommonMask.empty() &&
             /*...*/
                 ->getOrdered(P.index());
             return P.value() == Mask[P.index()] ||
                    /*...*/
             return EI->getVectorOperand() == V1;
             /*...*/
             "Expected only tree entry for extractelement vectors.");
      return;
    }
    assert(!InVectors.empty() && !CommonMask.empty() &&
           "Expected only tree entries from extracts/reused buildvectors.");
    unsigned VF = getVF(V1);
    if (InVectors.size() == 2) {
      Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
      VF = std::max<unsigned>(VF, CommonMask.size());
    } else if (const auto *InTE =
                   InVectors.front().dyn_cast<const TreeEntry *>()) {
      VF = std::max(VF, InTE->getVectorFactor());
    } else {
      // ...
                   ->getNumElements());
    }
    InVectors.push_back(V1);
    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
      // ...
        CommonMask[Idx] = Mask[Idx] + VF;
  }

  void add(ArrayRef<Value *> VL, ArrayRef<int> Mask, Value *Root = nullptr) {
    Cost += getBuildVectorCost(VL, Root);
    // ...
    unsigned VF = VL.size();
    // ...
      VF = std::min(VF, MaskVF);
    Type *VLScalarTy = VL.front()->getType();
    // ...
          getAllOnesValue(*R.DL, ScalarTy->getScalarType()));
    // ...
  }
14627 IsFinalized =
true;
14630 if (InVectors.
size() == 2)
14631 Cost += createShuffle(Vec, InVectors.
back(), CommonMask);
14633 Cost += createShuffle(Vec,
nullptr, CommonMask);
14634 transformMaskAfterShuffle(CommonMask, CommonMask);
14636 "Expected vector length for the final value before action.");
14639 Cost += createShuffle(V1, V2, Mask);
14642 InVectors.
front() = V;
14644 if (!SubVectors.empty()) {
14646 if (InVectors.
size() == 2)
14647 Cost += createShuffle(Vec, InVectors.
back(), CommonMask);
14649 Cost += createShuffle(Vec,
nullptr, CommonMask);
14650 transformMaskAfterShuffle(CommonMask, CommonMask);
14652 if (!SubVectorsMask.
empty()) {
14654 "Expected same size of masks for subvectors and common mask.");
14656 copy(SubVectorsMask, SVMask.begin());
14657 for (
auto [I1, I2] :
zip(SVMask, CommonMask)) {
14660 I1 = I2 + CommonMask.
size();
14667 for (
auto [
E, Idx] : SubVectors) {
14668 Type *EScalarTy =
E->Scalars.front()->getType();
14669 bool IsSigned =
true;
14670 if (
auto It =
R.MinBWs.find(
E); It !=
R.MinBWs.end()) {
14673 IsSigned = It->second.second;
14675 if (ScalarTy != EScalarTy) {
14676 unsigned CastOpcode = Instruction::Trunc;
14677 unsigned DstSz =
R.DL->getTypeSizeInBits(ScalarTy);
14678 unsigned SrcSz =
R.DL->getTypeSizeInBits(EScalarTy);
14680 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
14681 Cost += TTI.getCastInstrCost(
14690 if (!CommonMask.
empty()) {
14691 std::iota(std::next(CommonMask.
begin(), Idx),
14692 std::next(CommonMask.
begin(), Idx +
E->getVectorFactor()),
14698 if (!ExtMask.
empty()) {
14699 if (CommonMask.
empty()) {
14703 for (
int I = 0, Sz = ExtMask.
size();
I < Sz; ++
I) {
14706 NewMask[
I] = CommonMask[ExtMask[
I]];
14708 CommonMask.
swap(NewMask);
14711 if (CommonMask.
empty()) {
14712 assert(InVectors.
size() == 1 &&
"Expected only one vector with no mask");
14716 createShuffle(InVectors.
front(),
14717 InVectors.
size() == 2 ? InVectors.
back() :
nullptr,
14722 assert((IsFinalized || CommonMask.empty()) &&
14723 "Shuffle construction must be finalized.");
14727const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(
const TreeEntry *E,
14728 unsigned Idx)
const {
14729 TreeEntry *
Op = OperandsToTreeEntry.
at({E, Idx});
14730 assert(
Op->isSame(
E->getOperand(Idx)) &&
"Operands mismatch!");
  if (TE.State == TreeEntry::ScatterVectorize ||
      TE.State == TreeEntry::StridedVectorize)
    return TTI::CastContextHint::GatherScatter;
  if (TE.State == TreeEntry::CompressVectorize)
    return TTI::CastContextHint::Masked;
  if (TE.State == TreeEntry::Vectorize &&
      TE.getOpcode() == Instruction::Load && !TE.isAltShuffle()) {
    if (TE.ReorderIndices.empty())
      return TTI::CastContextHint::Normal;
    SmallVector<int> Mask;
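/// Computes the cost of vectorizing a single tree entry as the difference
/// between the vector cost and the scalar cost of the instructions it
/// replaces (CommonCost folds in any shuffles needed for reordering and
/// reuse); a negative result means the entry is profitable in isolation.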
InstructionCost
BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
                      SmallPtrSetImpl<Value *> &CheckedExtracts) {
    return InstructionCost::getInvalid();
  auto It = MinBWs.find(E);
  Type *OrigScalarTy = ScalarTy;
  if (It != MinBWs.end()) {
  }
  unsigned EntryVF = E->getVectorFactor();
  if (E->isGather() || TransformedToGatherNodes.contains(E)) {
      return InstructionCost::getInvalid();
    ScalarTy = VL.front()->getType();
    return processBuildVector<ShuffleCostEstimator, InstructionCost>(
        E, ScalarTy, *TTI, VectorizedVals, *this, CheckedExtracts);
  }
  if (E->State == TreeEntry::SplitVectorize) {
    assert(E->CombinedEntriesWithIndices.size() == 2 &&
           "Expected exactly 2 combined entries.");
    assert(E->ReuseShuffleIndices.empty() && "Expected empty reuses mask.");
    if (E->ReorderIndices.empty()) {
          E->CombinedEntriesWithIndices.back().second,
          VectorizableTree[E->CombinedEntriesWithIndices.back().first]
              ->getVectorFactor()));
    }
    unsigned CommonVF =
        std::max(VectorizableTree[E->CombinedEntriesWithIndices.front().first]
                     ->getVectorFactor(),
                 VectorizableTree[E->CombinedEntriesWithIndices.back().first]
                     ->getVectorFactor());
    LLVM_DEBUG(dumpTreeCosts(E, 0, VectorCost, 0, "Calculated costs for Tree"));
  }
  SmallVector<int> Mask;
  if (!E->ReorderIndices.empty() && E->State != TreeEntry::CompressVectorize &&
      (E->State != TreeEntry::StridedVectorize ||
    SmallVector<int> NewMask;
    if (E->getOpcode() == Instruction::Store) {
      NewMask.resize(E->ReorderIndices.size());
    }
  }
  if (!E->ReuseShuffleIndices.empty())
  assert((E->State == TreeEntry::Vectorize ||
          E->State == TreeEntry::ScatterVectorize ||
          E->State == TreeEntry::StridedVectorize ||
          E->State == TreeEntry::CompressVectorize) &&
         "Unhandled state");
          (E->getOpcode() == Instruction::GetElementPtr &&
           E->getMainOp()->getType()->isPointerTy()) ||
          E->hasCopyableElements()) &&
  unsigned ShuffleOrOp =
      E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
  if (E->CombinedOp != TreeEntry::NotCombinedOp)
    ShuffleOrOp = E->CombinedOp;
  SmallSetVector<Value *, 16> UniqueValues;
  SmallVector<unsigned, 16> UniqueIndexes;
    if (UniqueValues.insert(V))
      UniqueIndexes.push_back(Idx);
  const unsigned Sz = UniqueValues.size();
  SmallBitVector UsedScalars(Sz, false);
  for (unsigned I = 0; I < Sz; ++I) {
    if (!E->isCopyableElement(UniqueValues[I]) &&
        getTreeEntries(UniqueValues[I]).front() == E)
      UsedScalars.set(I);
  }
  auto GetCastContextHint = [&](Value *V) {
      return getCastContextHint(*OpTEs.front());
    InstructionsState SrcState = getSameOpcode(E->getOperand(0), *TLI);
    if (SrcState && SrcState.getOpcode() == Instruction::Load &&
        !SrcState.isAltShuffle())
    ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0);
    for (unsigned I = 0; I < Sz; ++I) {
      if (UsedScalars.test(I))
        continue;
      ScalarCost += ScalarEltCost(I);
    }
    if (It != MinBWs.end() && !UnaryInstruction::isCast(E->getOpcode()) &&
        (E->getOpcode() != Instruction::Load || E->UserTreeIndex)) {
      if (!EI.UserTE->hasState() ||
          EI.UserTE->getOpcode() != Instruction::Select ||
        auto UserBWIt = MinBWs.find(EI.UserTE);
        Type *UserScalarTy =
            (EI.UserTE->isGather() ||
             EI.UserTE->State == TreeEntry::SplitVectorize)
                ? EI.UserTE->Scalars.front()->getType()
                : EI.UserTE->getOperand(EI.EdgeIdx).front()->getType();
        if (UserBWIt != MinBWs.end())
                                             UserBWIt->second.first);
        if (ScalarTy != UserScalarTy) {
          unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
          unsigned SrcBWSz = DL->getTypeSizeInBits(UserScalarTy);
          unsigned VecOpcode;
          if (BWSz > SrcBWSz)
            VecOpcode = Instruction::Trunc;
          else
            VecOpcode =
                It->second.second ? Instruction::SExt : Instruction::ZExt;
          VecCost += TTI->getCastInstrCost(VecOpcode, UserVecTy, VecTy, CCH,
        }
      }
    }
    LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost - CommonCost, ScalarCost,
                             "Calculated costs for Tree"));
    return VecCost - ScalarCost;
    assert((E->State == TreeEntry::Vectorize ||
            E->State == TreeEntry::StridedVectorize ||
            E->State == TreeEntry::CompressVectorize) &&
           "Entry state expected to be Vectorize, StridedVectorize or "
           "MaskedLoadCompressVectorize here.");
        *TTI, Ptrs, BasePtr, E->getOpcode(), CostKind, OrigScalarTy, VecTy);
    LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost,
                             "Calculated GEPs cost for Tree"));
    return VecCost - ScalarCost;
      return InstructionCost::getInvalid();
    Type *CanonicalType = Ty;
    IntrinsicCostAttributes CostAttrs(MinMaxID, CanonicalType,
                                      {CanonicalType, CanonicalType});
    InstructionCost IntrinsicCost =
        TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
    if (VI && SelectOnly) {
             "Expected only for scalar type.");
          CI->getOpcode(), Ty, Builder.getInt1Ty(), CI->getPredicate(),
          CostKind, {TTI::OK_AnyValue, TTI::OP_None},
          {TTI::OK_AnyValue, TTI::OP_None}, CI);
    }
  };
  auto GetFMulAddCost = [&, &TTI = *TTI](const InstructionsState &S,
  switch (ShuffleOrOp) {
  case Instruction::PHI: {
    SmallPtrSet<const TreeEntry *, 4> CountedOps;
    for (Value *V : UniqueValues) {
      ValueList Operands(PHI->getNumIncomingValues(), nullptr);
      for (unsigned I = 0, N = PHI->getNumIncomingValues(); I < N; ++I) {
      }
      if (const TreeEntry *OpTE =
              getSameValuesTreeEntry(Operands.front(), Operands))
        if (CountedOps.insert(OpTE).second &&
            !OpTE->ReuseShuffleIndices.empty())
          ScalarCost += TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() -
                                          OpTE->Scalars.size());
    }
    return CommonCost - ScalarCost;
  }
  case Instruction::ExtractValue:
  case Instruction::ExtractElement: {
    APInt DemandedElts;
    auto GetScalarCost = [&](unsigned Idx) {
      if (ShuffleOrOp == Instruction::ExtractElement) {
        SrcVecTy = EE->getVectorOperandType();
      } else {
        Type *AggregateTy = EV->getAggregateOperand()->getType();
          NumElts = ATy->getNumElements();
      }
      if (I->hasOneUse()) {
          Cost -= TTI->getCastInstrCost(
      }
      if (DemandedElts.isZero())
    };
    auto GetVectorCost = [&](InstructionCost CommonCost) {
      return CommonCost - (DemandedElts.isZero()
                               ? TTI.getScalarizationOverhead(
                                     SrcVecTy, DemandedElts, false,
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::InsertElement: {
    assert(E->ReuseShuffleIndices.empty() &&
           "Unique insertelements only are expected.");
    unsigned const NumElts = SrcVecTy->getNumElements();
    unsigned const NumScalars = VL.size();
    unsigned OffsetEnd = OffsetBeg;
    InsertMask[OffsetBeg] = 0;
      if (OffsetBeg > Idx)
        OffsetBeg = Idx;
      else if (OffsetEnd < Idx)
        OffsetEnd = Idx;
      InsertMask[Idx] = I + 1;
    if (NumOfParts > 0 && NumOfParts < NumElts)
      VecScalarsSz = PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
    unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
    unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
    unsigned InsertVecSz = std::min<unsigned>(
        ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * VecScalarsSz);
    bool IsWholeSubvector =
        OffsetBeg == Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
    if (OffsetBeg + InsertVecSz > VecSz) {
      InsertVecSz = VecSz;
    }
    SmallVector<int> Mask;
    if (!E->ReorderIndices.empty()) {
    } else {
      std::iota(Mask.begin(), std::next(Mask.begin(), InsertVecSz), 0);
    }
    bool IsIdentity = true;
    Mask.swap(PrevMask);
    for (unsigned I = 0; I < NumScalars; ++I) {
      DemandedElts.setBit(InsertIdx);
      IsIdentity &= InsertIdx - OffsetBeg == I;
      Mask[InsertIdx - OffsetBeg] = I;
    }
    assert(Offset < NumElts && "Failed to find vector index offset");
                                  InsertVecTy, Mask);
      return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
    SmallBitVector InMask =
        buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask));
    if (!InMask.all() && NumScalars != NumElts && !IsWholeSubvector) {
      if (InsertVecSz != VecSz) {
      }
      for (unsigned I = 0, End = OffsetBeg - Offset; I < End; ++I)
      for (unsigned I = OffsetBeg - Offset, End = OffsetEnd - Offset;
      for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I)
    }
  }
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
    unsigned Opcode = ShuffleOrOp;
    unsigned VecOpcode = Opcode;
        (SrcIt != MinBWs.end() || It != MinBWs.end())) {
      unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy->getScalarType());
      if (SrcIt != MinBWs.end()) {
        SrcBWSz = SrcIt->second.first;
      }
      unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType());
      if (BWSz == SrcBWSz) {
        VecOpcode = Instruction::BitCast;
      } else if (BWSz < SrcBWSz) {
        VecOpcode = Instruction::Trunc;
      } else if (It != MinBWs.end()) {
        assert(BWSz > SrcBWSz && "Invalid cast!");
        VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
      } else if (SrcIt != MinBWs.end()) {
        assert(BWSz > SrcBWSz && "Invalid cast!");
        VecOpcode =
            SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
      }
    } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
               !SrcIt->second.second) {
      VecOpcode = Instruction::UIToFP;
    }
    auto GetScalarCost = [&](unsigned Idx) {
      assert(Idx == 0 && "Expected 0 index only");
      return TTI->getCastInstrCost(Opcode, VL0->getType(),
    };
    auto GetVectorCost = [=](InstructionCost CommonCost) {
      if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
        return CommonCost;
      auto *VI = VL0->getOpcode() == Opcode ? VL0 : nullptr;
      bool IsArithmeticExtendedReduction =
          E->Idx == 0 && UserIgnoreList &&
            return is_contained({Instruction::Add, Instruction::FAdd,
                                 Instruction::Mul, Instruction::FMul,
                                 Instruction::And, Instruction::Or,
      if (IsArithmeticExtendedReduction &&
          (VecOpcode == Instruction::ZExt || VecOpcode == Instruction::SExt))
        return CommonCost;
      return CommonCost +
             TTI->getCastInstrCost(VecOpcode, VecTy, SrcVecTy, CCH, CostKind,
                                   VecOpcode == Opcode ? VI : nullptr);
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::FCmp:
  case Instruction::ICmp:
  case Instruction::Select: {
    CmpPredicate VecPred, SwappedVecPred;
        match(VL0, MatchCmp))
    auto GetScalarCost = [&](unsigned Idx) {
          !match(VI, MatchCmp)) ||
          E->getOpcode(), OrigScalarTy, Builder.getInt1Ty(), CurrentPred,
          CostKind, getOperandInfo(VI->getOperand(0)),
          getOperandInfo(VI->getOperand(1)), VI);
    };
    auto GetVectorCost = [&](InstructionCost CommonCost) {
          TTI->getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy, VecPred,
                                  CostKind, getOperandInfo(E->getOperand(0)),
                                  getOperandInfo(E->getOperand(1)), VL0);
      unsigned CondNumElements = CondType->getNumElements();
      assert(VecTyNumElements >= CondNumElements &&
             VecTyNumElements % CondNumElements == 0 &&
             "Cannot vectorize Instruction::Select");
      if (CondNumElements != VecTyNumElements) {
      }
      return VecCost + CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case TreeEntry::MinMax: {
    auto GetScalarCost = [&](unsigned Idx) {
      return GetMinMaxCost(OrigScalarTy);
    };
    auto GetVectorCost = [&](InstructionCost CommonCost) {
      return VecCost + CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case TreeEntry::FMulAdd: {
    auto GetScalarCost = [&](unsigned Idx) {
      return GetFMulAddCost(E->getOperations(),
    };
    auto GetVectorCost = [&](InstructionCost CommonCost) {
      for (Value *V : E->Scalars) {
          FMF &= FPCI->getFastMathFlags();
          FMF &= FPCIOp->getFastMathFlags();
      }
      IntrinsicCostAttributes ICA(Intrinsic::fmuladd, VecTy,
                                  {VecTy, VecTy, VecTy}, FMF);
      return VecCost + CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::FNeg:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    auto GetScalarCost = [&](unsigned Idx) {
      unsigned Lane = UniqueIndexes[Idx];
      Value *Op1 = E->getOperand(0)[Lane];
      SmallVector<const Value *, 2> Operands(1, Op1);
        Op2 = E->getOperand(1)[Lane];
          ShuffleOrOp, OrigScalarTy, CostKind, Op1Info, Op2Info, Operands);
      if (I && (ShuffleOrOp == Instruction::FAdd ||
                ShuffleOrOp == Instruction::FSub)) {
      }
    };
    auto GetVectorCost = [=](InstructionCost CommonCost) {
      if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
          return CI && CI->getValue().countr_one() >= It->second.first;
      }
      return TTI->getArithmeticInstrCost(ShuffleOrOp, VecTy, CostKind, Op1Info,
                                         Op2Info, {}, nullptr, TLI) +
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::GetElementPtr: {
    return CommonCost + GetGEPCostDiff(VL, VL0);
  }
  case Instruction::Load: {
    auto GetScalarCost = [&](unsigned Idx) {
      return TTI->getMemoryOpCost(Instruction::Load, OrigScalarTy,
                                  VI->getAlign(), VI->getPointerAddressSpace(),
    };
    switch (E->State) {
    case TreeEntry::Vectorize:
      if (unsigned Factor = E->getInterleaveFactor()) {
        VecLdCost = TTI->getInterleavedMemoryOpCost(
            Instruction::Load, VecTy, Factor, {}, LI0->getAlign(),
            LI0->getPointerAddressSpace(), CostKind);
      } else {
        VecLdCost = TTI->getMemoryOpCost(
            Instruction::Load, VecTy, LI0->getAlign(),
      }
      break;
    case TreeEntry::StridedVectorize: {
      const StridedPtrInfo &SPtrInfo = TreeEntryToStridedPtrInfoMap.at(E);
      FixedVectorType *StridedLoadTy = SPtrInfo.Ty;
      assert(StridedLoadTy && "Missing StridedPoinerInfo for tree entry.");
      Align CommonAlignment =
      VecLdCost = TTI->getMemIntrinsicInstrCost(
          MemIntrinsicCostAttributes(Intrinsic::experimental_vp_strided_load,
                                     StridedLoadTy, LI0->getPointerOperand(),
                                     false, CommonAlignment),
      if (StridedLoadTy != VecTy)
            TTI->getCastInstrCost(Instruction::BitCast, VecTy, StridedLoadTy,
      break;
    }
    case TreeEntry::CompressVectorize: {
      unsigned InterleaveFactor;
      SmallVector<int> CompressMask;
      if (!E->ReorderIndices.empty()) {
        SmallVector<int> Mask(E->ReorderIndices.begin(),
                              E->ReorderIndices.end());
      }
          Scalars, PointerOps, E->ReorderIndices, *TTI, *DL, *SE, *AC, *DT,
          *TLI, [](Value *) { return true; }, IsMasked, InterleaveFactor,
          CompressMask, LoadVecTy);
      assert(IsVectorized && "Failed to vectorize load");
      CompressEntryToData.try_emplace(E, CompressMask, LoadVecTy,
                                      InterleaveFactor, IsMasked);
      Align CommonAlignment = LI0->getAlign();
      if (InterleaveFactor) {
        VecLdCost = TTI->getInterleavedMemoryOpCost(
            Instruction::Load, LoadVecTy, InterleaveFactor, {},
            CommonAlignment, LI0->getPointerAddressSpace(), CostKind);
      } else if (IsMasked) {
        VecLdCost = TTI->getMemIntrinsicInstrCost(
            MemIntrinsicCostAttributes(Intrinsic::masked_load, LoadVecTy,
                                       LI0->getPointerAddressSpace()),
                                   LoadVecTy, CompressMask, CostKind);
      } else {
        VecLdCost = TTI->getMemoryOpCost(
            Instruction::Load, LoadVecTy, CommonAlignment,
                                   LoadVecTy, CompressMask, CostKind);
      }
      break;
    }
    case TreeEntry::ScatterVectorize: {
      Align CommonAlignment =
      VecLdCost = TTI->getMemIntrinsicInstrCost(
          MemIntrinsicCostAttributes(Intrinsic::masked_gather, VecTy,
                                     LI0->getPointerOperand(),
                                     false, CommonAlignment),
      break;
    }
    case TreeEntry::CombinedVectorize:
    case TreeEntry::SplitVectorize:
    case TreeEntry::NeedToGather:
    }
      return VecLdCost + CommonCost;
    if (E->State == TreeEntry::ScatterVectorize)
    return Cost + GetGEPCostDiff(PointerOps, LI0->getPointerOperand());
  }
  case Instruction::Store: {
    bool IsReorder = !E->ReorderIndices.empty();
    auto GetScalarCost = [=](unsigned Idx) {
      return TTI->getMemoryOpCost(Instruction::Store, OrigScalarTy,
                                  VI->getAlign(), VI->getPointerAddressSpace(),
    };
    auto GetVectorCost = [=](InstructionCost CommonCost) {
      if (E->State == TreeEntry::StridedVectorize) {
        Align CommonAlignment =
        VecStCost = TTI->getMemIntrinsicInstrCost(
            MemIntrinsicCostAttributes(Intrinsic::experimental_vp_strided_store,
                                       VecTy, BaseSI->getPointerOperand(),
                                       false, CommonAlignment),
      } else {
        assert(E->State == TreeEntry::Vectorize &&
               "Expected either strided or consecutive stores.");
        if (unsigned Factor = E->getInterleaveFactor()) {
          assert(E->ReuseShuffleIndices.empty() &&
                 !E->ReorderIndices.empty() && "No reused shuffles expected");
          VecStCost = TTI->getInterleavedMemoryOpCost(
              Instruction::Store, VecTy, Factor, {}, BaseSI->getAlign(),
              BaseSI->getPointerAddressSpace(), CostKind);
        } else {
          VecStCost = TTI->getMemoryOpCost(
              Instruction::Store, VecTy, BaseSI->getAlign(),
              BaseSI->getPointerAddressSpace(), CostKind, OpInfo);
        }
      }
      return VecStCost + CommonCost;
    };
      unsigned Idx = IsReorder ? E->ReorderIndices[I] : I;
    return GetCostDiff(GetScalarCost, GetVectorCost) +
           GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
  }
  case Instruction::Call: {
    auto GetScalarCost = [&](unsigned Idx) {
      IntrinsicCostAttributes CostAttrs(ID, *CI, 1);
      return TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
    };
    auto GetVectorCost = [=](InstructionCost CommonCost) {
          CI, ID, VecTy->getNumElements(),
          It != MinBWs.end() ? It->second.first : 0, TTI);
      return std::min(VecCallCosts.first, VecCallCosts.second) + CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::ShuffleVector: {
           "Invalid Shuffle Vector Operand");
    auto TryFindNodeWithEqualOperands = [=]() {
      for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
        if (TE->hasState() && TE->isAltShuffle() &&
            ((TE->getOpcode() == E->getOpcode() &&
              TE->getAltOpcode() == E->getAltOpcode()) ||
             (TE->getOpcode() == E->getAltOpcode() &&
              TE->getAltOpcode() == E->getOpcode())) &&
            TE->hasEqualOperands(*E))
          return true;
      }
      return false;
    };
    auto GetScalarCost = [&](unsigned Idx) {
      assert(E->getMatchingMainOpOrAltOp(VI) &&
             "Unexpected main/alternate opcode");
      return TTI->getInstructionCost(VI, CostKind);
    };
    auto GetVectorCost = [&](InstructionCost CommonCost) {
      if (TryFindNodeWithEqualOperands()) {
        LLVM_DEBUG({
          dbgs() << "SLP: diamond match for alternate node found.\n";
        });
      }
          TTIRef.getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
          TTIRef.getArithmeticInstrCost(E->getAltOpcode(), VecTy, CostKind);
        VecCost = TTIRef.getCmpSelInstrCost(
            E->getOpcode(), VecTy, MaskTy, CI0->getPredicate(), CostKind,
            {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
        VecCost += TTIRef.getCmpSelInstrCost(
            E->getOpcode(), VecTy, MaskTy,
            {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
        Type *SrcSclTy = E->getMainOp()->getOperand(0)->getType();
        auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
        unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
        unsigned SrcBWSz =
            DL->getTypeSizeInBits(E->getMainOp()->getOperand(0)->getType());
        if (SrcIt != MinBWs.end()) {
          SrcBWSz = SrcIt->second.first;
        }
        if (BWSz <= SrcBWSz) {
          if (BWSz < SrcBWSz)
            VecCost =
                TTIRef.getCastInstrCost(Instruction::Trunc, VecTy, SrcTy,
          LLVM_DEBUG({
            dbgs()
                << "SLP: alternate extension, which should be truncated.\n";
          });
        } else {
          VecCost = TTIRef.getCastInstrCost(E->getOpcode(), VecTy, SrcTy,
          VecCost +=
              TTIRef.getCastInstrCost(E->getAltOpcode(), VecTy, SrcTy,
        }
      SmallVector<int> Mask;
      E->buildAltOpShuffleMask(
          [&](Instruction *I) {
            assert(E->getMatchingMainOpOrAltOp(I) &&
                   "Unexpected main/alternate opcode");
          },
      unsigned Opcode0 = E->getOpcode();
      unsigned Opcode1 = E->getAltOpcode();
      SmallBitVector OpcodeMask(
      if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
            VecTy, Opcode0, Opcode1, OpcodeMask, CostKind);
        return AltVecCost < VecCost ? AltVecCost : VecCost;
      }
    };
    return GetCostDiff(
             "Not supported shufflevector usage.");
      unsigned SVNumElements =
              ->getNumElements();
      unsigned GroupSize = SVNumElements / SV->getShuffleMask().size();
      for (size_t I = 0, End = VL.size(); I != End; I += GroupSize) {
               "Not supported shufflevector usage.");
        [[maybe_unused]] bool IsExtractSubvectorMask =
            SV->isExtractSubvectorMask(Index);
        assert(IsExtractSubvectorMask &&
               "Not supported shufflevector usage.");
        if (NextIndex != Index)
        NextIndex += SV->getShuffleMask().size();
      }
      return ::getShuffleCost(
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::Freeze:
    return CommonCost;
  }
}
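/// A "tiny" tree (one or two nodes) is only worth vectorizing when every
/// node is either a vectorizable bundle or a gather that is cheap to
/// materialize; the checks below encode exactly those shapes.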
bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
  LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
                    << VectorizableTree.size()
                    << " is fully vectorizable .\n");
  auto &&AreVectorizableGathers = [this](const TreeEntry *TE, unsigned Limit) {
    SmallVector<int> Mask;
    return TE->isGather() &&
           [this](Value *V) { return EphValues.contains(V); }) &&
           (TE->Scalars.size() < Limit ||
            (((TE->hasState() &&
               TE->getOpcode() == Instruction::ExtractElement) ||
             (TE->hasState() && TE->getOpcode() == Instruction::Load &&
              !TE->isAltShuffle()) ||
  };
  if (VectorizableTree.size() == 1 &&
      (VectorizableTree[0]->State == TreeEntry::Vectorize ||
       VectorizableTree[0]->State == TreeEntry::StridedVectorize ||
       VectorizableTree[0]->State == TreeEntry::CompressVectorize ||
       (AreVectorizableGathers(VectorizableTree[0].get(),
                               VectorizableTree[0]->Scalars.size()) &&
        VectorizableTree[0]->getVectorFactor() > 2)))
    return true;
  if (VectorizableTree.size() != 2)
    return false;
  if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
      AreVectorizableGathers(VectorizableTree[1].get(),
                             VectorizableTree[0]->Scalars.size()))
    return true;
  if (VectorizableTree[0]->isGather() ||
      (VectorizableTree[1]->isGather() &&
       VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
       VectorizableTree[0]->State != TreeEntry::StridedVectorize &&
       VectorizableTree[0]->State != TreeEntry::CompressVectorize))
    return false;
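// The load-combine check below walks a Zext/Shl/Or chain upwards from Root:
// every shift amount must be a multiple of 8 bits and, when MustMatchOrInst
// is set, at least one Or must be seen, so the backend can fold the whole
// chain into a single wide load instead of a vectorized sequence.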
                                       bool MustMatchOrInst) {
  Value *ZextLoad = Root;
  const APInt *ShAmtC;
  bool FoundOr = false;
                      ShAmtC->urem(8) == 0))) {
    ZextLoad = BinOp->getOperand(0);
    if (BinOp->getOpcode() == Instruction::Or)
      FoundOr = true;
  }
  if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root ||
  Type *SrcTy = Load->getType();
  unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts;
  LLVM_DEBUG(dbgs() << "SLP: Assume load combining for tree starting at "
  unsigned NumElts = VectorizableTree[0]->Scalars.size();
  Value *FirstReduced = VectorizableTree[0]->Scalars[0];

  unsigned NumElts = Stores.size();
  for (Value *Scalar : Stores) {
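/// Heuristic filter for trees that are too small or too gather-heavy to be
/// profitable: each early exit below matches one unprofitable shape (all-PHI
/// trees, split roots with tiny operands, trees dominated by gathers
/// relative to their store/load nodes, and so on).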
  if (VectorizableTree.empty()) {
    assert(ExternalUses.empty() && "We shouldn't have any external users");
  }
  if (VectorizableTree.size() == 2 &&
      VectorizableTree[1]->isGather() &&
      (VectorizableTree[1]->getVectorFactor() <= 2 ||
       !(isSplat(VectorizableTree[1]->Scalars) ||
  constexpr int Limit = 4;
  if (!VectorizableTree.empty() &&
      all_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
        return (TE->isGather() &&
                (!TE->hasState() ||
                 TE->getOpcode() != Instruction::ExtractElement) &&
               (TE->hasState() && TE->getOpcode() == Instruction::PHI);
      }))
  if (VectorizableTree.size() <= Limit &&
      all_of(VectorizableTree,
             [&](const std::unique_ptr<TreeEntry> &TE) {
               return (TE->isGather() &&
                       (!TE->hasState() ||
                        TE->getOpcode() != Instruction::ExtractElement) &&
                      (TE->getOpcode() == Instruction::InsertElement ||
                       (TE->getOpcode() == Instruction::PHI &&
                          return isa<PoisonValue>(V) || MustGather.contains(V);
             }) &&
      any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
        return TE->State == TreeEntry::Vectorize &&
               TE->getOpcode() == Instruction::PHI;
      }))
  unsigned NumGathers = 0;
  constexpr int LimitTreeSize = 36;
  if (all_of(VectorizableTree,
             [&](const std::unique_ptr<TreeEntry> &TE) {
               if (!TE->isGather() && TE->hasState() &&
                   (TE->getOpcode() == Instruction::Load ||
                    TE->getOpcode() == Instruction::Store)) {
               }
               if (TE->isGather())
                 ++NumGathers;
               return TE->State == TreeEntry::SplitVectorize ||
                      (TE->Idx == 0 && TE->Scalars.size() == 2 &&
                       TE->hasState() &&
                       TE->getOpcode() == Instruction::ICmp &&
                       VectorizableTree.size() > LimitTreeSize) ||
                      (TE->getOpcode() == Instruction::PHI ||
                       (TE->hasCopyableElements() &&
                        TE->Scalars.size() / 2) ||
                       ((!TE->ReuseShuffleIndices.empty() ||
                         !TE->ReorderIndices.empty() || TE->isAltShuffle()) &&
                        TE->Scalars.size() == 2)));
             }) &&
      (StoreLoadNodes.empty() ||
       (VectorizableTree.size() > LimitTreeSize * StoreLoadNodes.size() &&
        (NumGathers > 0 ||
         none_of(StoreLoadNodes, [&](const TreeEntry *TE) {
           return TE->getOpcode() == Instruction::Store ||
                    return !isa<LoadInst>(V) ||
                           areAllUsersVectorized(cast<Instruction>(V));
         })))))
  if (VectorizableTree.front()->State == TreeEntry::SplitVectorize &&
      VectorizableTree.size() >= Limit &&
      [&](const std::unique_ptr<TreeEntry> &TE) {
        return !TE->isGather() && TE->UserTreeIndex.UserTE &&
               TE->UserTreeIndex.UserTE->Idx == 0;
      }))
  if (VectorizableTree.size() > 2 &&
      VectorizableTree.front()->State == TreeEntry::Vectorize &&
      VectorizableTree.front()->getOpcode() == Instruction::InsertElement &&
      VectorizableTree[1]->State == TreeEntry::Vectorize &&
      VectorizableTree[1]->getOpcode() == Instruction::PHI &&
      all_of(ArrayRef(VectorizableTree).drop_front(2),
             [&](const std::unique_ptr<TreeEntry> &TE) {
               return TE->isGather();
             }))
  if (isFullyVectorizableTinyTree(ForReduction))
    return false;
  bool IsAllowedSingleBVNode =
      VectorizableTree.size() > 1 ||
      (VectorizableTree.size() == 1 && VectorizableTree.front()->hasState() &&
       !VectorizableTree.front()->isAltShuffle() &&
       VectorizableTree.front()->getOpcode() != Instruction::PHI &&
       VectorizableTree.front()->getOpcode() != Instruction::GetElementPtr &&
  if (any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
        return TE->isGather() && all_of(TE->Scalars, [&](Value *V) {
                 return isa<ExtractElementInst, Constant>(V) ||
                        (IsAllowedSingleBVNode &&
                         !V->hasNUsesOrMore(UsesLimit) &&
                         any_of(V->users(), IsaPred<InsertElementInst>));
               });
      }))
    return true;
  if (VectorizableTree.back()->isGather() &&
      VectorizableTree.back()->hasState() &&
      VectorizableTree.back()->isAltShuffle() &&
      VectorizableTree.back()->getVectorFactor() > 2 &&
      !VectorizableTree.back()->Scalars.front()->getType()->isVectorTy() &&
      TTI->getScalarizationOverhead(
          getWidenedType(VectorizableTree.back()->Scalars.front()->getType(),
                         VectorizableTree.back()->getVectorFactor()),
    return true;
  constexpr unsigned SmallTree = 3;
  if (VectorizableTree.front()->isNonPowOf2Vec() &&
      [](const std::unique_ptr<TreeEntry> &TE) {
        return TE->isGather() && TE->hasState() &&
               TE->getOpcode() == Instruction::Load &&
    TreeEntry &E = *VectorizableTree[Idx];
    if (E.State == TreeEntry::SplitVectorize)
    if ((E.hasState() && E.getOpcode() != Instruction::Load) ||

  const TreeEntry *Root = VectorizableTree.front().get();
  if (Root->isGather())
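// Spill-cost estimation: record the last instruction of every vectorized
// bundle, then scan the instruction ranges between them for calls that are
// not free (intrinsic cost below call cost), since vector values live across
// such calls must be spilled. Budget/BudgetLimit bound how many instructions
// are inspected per block, and CheckedInstructions caches earlier scans.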
  for (const auto &TEPtr : VectorizableTree) {
    if (!TEPtr->isGather()) {
      Instruction *LastInst = &getLastInstructionInBundle(TEPtr.get());
      EntriesToLastInstruction.try_emplace(TEPtr.get(), LastInst);
      LastInstructions.insert(LastInst);
    }
    if (TEPtr->UserTreeIndex)
      EntriesToOperands[TEPtr->UserTreeIndex.UserTE].push_back(TEPtr.get());
  }
    if (II->isAssumeLikeIntrinsic())
    return IntrCost < CallCost;
      CheckedInstructions;
  unsigned Budget = 0;
  const unsigned BudgetLimit =
           "Expected instructions in same block.");
    if (auto It = CheckedInstructions.find(Last);
        It != CheckedInstructions.end()) {
      const Instruction *Checked = It->second.getPointer();
        return It->second.getInt() != 0;
    }
        ++First->getIterator().getReverse(),
        Last->getIterator().getReverse();
    while (InstIt != PrevInstIt && Budget <= BudgetLimit) {
        for (const Instruction *LastInst : LastInstsInRange)
          CheckedInstructions.try_emplace(LastInst, &*PrevInstIt, 0);
      if (LastInstructions.contains(&*PrevInstIt))
        LastInstsInRange.push_back(&*PrevInstIt);
    }
    for (const Instruction *LastInst : LastInstsInRange)
          LastInst, PrevInstIt == InstIt ? First : &*PrevInstIt,
          Budget <= BudgetLimit ? 1 : 0);
    return Budget <= BudgetLimit;
  };
  auto AddCosts = [&](const TreeEntry *Op) {
    Type *ScalarTy = Op->Scalars.front()->getType();
    auto It = MinBWs.find(Op);
    if (It != MinBWs.end())
    Cost += TTI->getCostOfKeepingLiveOverCall(VecTy);
    Cost -= Op->Scalars.size() * TTI->getCostOfKeepingLiveOverCall(ScalarTy);
  };
      ParentOpParentToPreds;
    auto Key = std::make_pair(Root, OpParent);
    if (auto It = ParentOpParentToPreds.find(Key);
        It != ParentOpParentToPreds.end())
    for (const auto &KeyPair : ParentsPairsToAdd) {
             "Should not have been added before.");
    }
    while (!Worklist.empty()) {
      if (BB == OpParent || !Visited.insert(BB).second)
      auto Pair = std::make_pair(BB, OpParent);
      if (auto It = ParentOpParentToPreds.find(Pair);
          It != ParentOpParentToPreds.end()) {
        ParentsPairsToAdd.insert(Pair);
      }
    }
    if (Budget > BudgetLimit)
  while (!LiveEntries.empty()) {
    if (Operands.empty())
      continue;
    Instruction *LastInst = EntriesToLastInstruction.at(Entry);
    for (const TreeEntry *Op : Operands) {
      if (!Op->isGather())
      if (Entry->State == TreeEntry::SplitVectorize ||
          (Entry->getOpcode() != Instruction::PHI && Op->isGather()) ||
        Pred = Phi->getIncomingBlock(Op->UserTreeIndex.EdgeIdx);
      if (Op->isGather()) {
        assert(Entry->getOpcode() == Instruction::PHI &&
               "Expected phi node only.");
            ->getIncomingBlock(Op->UserTreeIndex.EdgeIdx);
        for (Value *V : Op->Scalars) {
        }
      } else {
        OpLastInst = EntriesToLastInstruction.at(Op);
      }
      if (OpParent == Parent) {
        if (Entry->getOpcode() == Instruction::PHI) {
          if (!CheckForNonVecCallsInSameBlock(LastInst, OpLastInst))
        }
        if (!CheckForNonVecCallsInSameBlock(OpLastInst, LastInst))
      }
      if (Entry->getOpcode() != Instruction::PHI &&
          !CheckForNonVecCallsInSameBlock(
              &*LastInst->getParent()->getFirstNonPHIOrDbgOrAlloca(),
      if (!CheckForNonVecCallsInSameBlock(OpLastInst,
      if (!CheckPredecessors(Parent, Pred, OpParent)) {
      }
    }
  }
  const auto *I1 = IE1;
  const auto *I2 = IE2;
  do {
    if (I1 && (I1 == IE1 || I1->hasOneUse()) &&
    if (I2 && ((I2 == IE2 || I2->hasOneUse())) &&
  } while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2));
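// ValueSelect lets performExtractsShuffleAction below be instantiated both
// for real Values (codegen) and for cost-only result types: get<T>() forwards
// the value when T is Value * and default-constructs T otherwise.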
struct ValueSelect {
  template <typename U>
  static std::enable_if_t<std::is_same_v<Value *, U>, Value *> get(Value *V) {
    return V;
  }
  template <typename U>
  static std::enable_if_t<!std::is_same_v<Value *, U>, U> get(Value *) {
    return U();
  }
};
template <typename T>
  assert(!ShuffleMask.empty() && "Empty list of shuffles for inserts.");
  auto VMIt = std::next(ShuffleMask.begin());
      buildUseMask(Mask.size(), Mask, UseMask::UndefsAsMask);
  if (!IsBaseUndef.all()) {
    std::pair<T *, bool> Res =
        ResizeAction(ShuffleMask.begin()->first, Mask, false);
    for (unsigned Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) {
        Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF;
    }
    [[maybe_unused]] auto *V = ValueSelect::get<T *>(Base);
    assert((!V || GetVF(V) == Mask.size()) &&
           "Expected base vector of VF number of elements.");
    Prev = Action(Mask, {nullptr, Res.first});
  } else if (ShuffleMask.size() == 1) {
    std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask,
    Prev = Action(Mask, {ShuffleMask.begin()->first});
  } else {
    unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);
    unsigned Vec2VF = GetVF(VMIt->first);
    if (Vec1VF == Vec2VF) {
      for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
          Mask[I] = SecMask[I] + Vec1VF;
      }
      Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
    } else {
      std::pair<T *, bool> Res1 = ResizeAction(ShuffleMask.begin()->first, Mask,
      std::pair<T *, bool> Res2 =
          ResizeAction(VMIt->first, VMIt->second, false);
      for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
          Mask[I] = (Res2.second ? I : SecMask[I]) + VF;
      }
      Prev = Action(Mask, {Res1.first, Res2.first});
    }
    VMIt = std::next(VMIt);
  }
  [[maybe_unused]] bool IsBaseNotUndef = !IsBaseUndef.all();
  for (auto E = ShuffleMask.end(); VMIt != E; ++VMIt) {
    std::pair<T *, bool> Res =
        ResizeAction(VMIt->first, VMIt->second, false);
    for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
             "Multiple uses of scalars.");
        Mask[I] = (Res.second ? I : SecMask[I]) + VF;
    }
    Prev = Action(Mask, {Prev, Res.first});
  }
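/// getTreeCost: per-entry costs are first accumulated bottom-up into
/// SubtreeCosts; a priority queue ordered by subtree cost then lets the code
/// below repeatedly consider the most expensive subtree and replace it with
/// a gather whenever the gather estimate is cheaper, before the total cost
/// is recomputed over the surviving nodes.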
  LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
                    << VectorizableTree.size() << ".\n");
  for (const std::unique_ptr<TreeEntry> &Ptr : VectorizableTree) {
    TreeEntry &TE = *Ptr;
    if (TE.State == TreeEntry::CombinedVectorize) {
      LLVM_DEBUG({
        dbgs() << "SLP: Skipping cost for combined node that starts with "
               << *TE.Scalars[0] << ".\n";
        TE.dump();
        dbgs() << "SLP: Current total cost = " << Cost << "\n";
      });
      continue;
    }
    if (TE.hasState() &&
        (TE.isGather() || TE.State == TreeEntry::SplitVectorize)) {
      if (const TreeEntry *E =
              getSameValuesTreeEntry(TE.getMainOp(), TE.Scalars);
          E && E->getVectorFactor() == TE.getVectorFactor()) {
        LLVM_DEBUG(dbgs() << "SLP: Current total cost = " << Cost << "\n");
        continue;
      }
    }
    assert((!TE.isGather() || TE.Idx == 0 || TE.UserTreeIndex) &&
           "Expected gather nodes with users only.");
    LLVM_DEBUG(dbgs() << "SLP: Current total cost = " << Cost << "\n");
    if (TE.Idx > 0 && !TE.UserTreeIndex && TE.hasState() &&
        TE.getOpcode() == Instruction::Load)
      GatheredLoadsNodes.insert(&TE);
  }
  if (!GatheredLoadsNodes.empty())
  constexpr unsigned PartLimit = 2;
  const unsigned Sz =
  const unsigned MinVF = getMinVF(Sz);
  if (VectorizableTree.front()->Scalars.size() * PartLimit <= MinVF &&
      (!VectorizableTree.front()->hasState() ||
       (VectorizableTree.front()->getOpcode() != Instruction::Store &&
        LI->getLoopFor(VectorizableTree.front()->getMainOp()->getParent()))))
      VectorizableTree.size());
  for (const std::unique_ptr<TreeEntry> &Ptr : VectorizableTree) {
    TreeEntry &TE = *Ptr;
    SubtreeCosts[TE.Idx].first += C;
    const TreeEntry *UserTE = TE.UserTreeIndex.UserTE;
      SubtreeCosts[UserTE->Idx].first += C;
      SubtreeCosts[UserTE->Idx].second.push_back(TE.Idx);
      UserTE = UserTE->UserTreeIndex.UserTE;
  }
  using CostIndicesTy =
      std::pair<TreeEntry *, std::pair<InstructionCost, SmallVector<unsigned>>>;
  struct FirstGreater {
    bool operator()(const CostIndicesTy &LHS, const CostIndicesTy &RHS) const {
      return LHS.second.first < RHS.second.first ||
             (LHS.second.first == RHS.second.first &&
              LHS.first->Idx < RHS.first->Idx);
    }
  };
  for (const auto [Idx, P] : enumerate(SubtreeCosts))
    Worklist.emplace(VectorizableTree[Idx].get(), P);
  if (!UserIgnoreList && VectorizableTree.front()->getVectorFactor() < MinVF &&
      VectorizableTree.front()->hasState() &&
      VectorizableTree.front()->getOpcode() == Instruction::Store &&
      (Worklist.top().first->Idx == 0 || Worklist.top().first->Idx == 1))
  while (!Worklist.empty() && Worklist.top().second.first > 0) {
    TreeEntry *TE = Worklist.top().first;
    if (TE->isGather() || TE->Idx == 0 || DeletedNodes.contains(TE) ||
        (TE->UserTreeIndex &&
         TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize &&
           ArrayRef<TreeEntry *> Entries = getSplitTreeEntries(V);
           return Entries.size() > 1;
    if (SubtreeCost < TE->Scalars.size()) {
    }
    if (!TransformedToGatherNodes.empty()) {
      for (unsigned Idx : Worklist.top().second.second) {
        auto It = TransformedToGatherNodes.find(VectorizableTree[Idx].get());
        if (It != TransformedToGatherNodes.end()) {
          SubtreeCost -= SubtreeCosts[Idx].first;
          SubtreeCost += It->second;
        }
      }
    }
    if (SubtreeCost < 0 || SubtreeCost < TE->Scalars.size()) {
    }
    const unsigned Sz = TE->Scalars.size();
    for (auto [Idx, V] : enumerate(TE->Scalars)) {
    }
    const unsigned EntryVF = TE->getVectorFactor();
        *TTI, ScalarTy, VecTy, DemandedElts,
    if (!TE->ReorderIndices.empty() &&
        TE->State != TreeEntry::CompressVectorize &&
        (TE->State != TreeEntry::StridedVectorize ||
      if (TE->getOpcode() == Instruction::Store) {
        NewMask.resize(TE->ReorderIndices.size());
        copy(TE->ReorderIndices, NewMask.begin());
      }
    }
    if (!TE->ReuseShuffleIndices.empty())
      ::addMask(Mask, TE->ReuseShuffleIndices);
      return (TE->hasCopyableElements() && TE->isCopyableElement(V)) ||
             isConstant(V) || isGathered(V) || getTreeEntries(V).size() > 1;
    if (SubtreeCost > GatherCost) {
      if (VectorizableTree.front()->hasState() &&
          VectorizableTree.front()->getOpcode() ==
              Instruction::InsertElement &&
      LLVM_DEBUG(dbgs() << "SLP: Trimming unprofitable subtree at node "
                        << TE->Idx << " with cost "
                        << Worklist.top().second.first << " and gather cost "
                        << GatherCost << ".\n");
      if (TE->UserTreeIndex) {
        TransformedToGatherNodes.try_emplace(TE, GatherCost);
        NodesCosts.erase(TE);
      } else {
        DeletedNodes.insert(TE);
        TransformedToGatherNodes.erase(TE);
        NodesCosts.erase(TE);
      }
      for (unsigned Idx : Worklist.top().second.second) {
        TreeEntry &ChildTE = *VectorizableTree[Idx];
        DeletedNodes.insert(&ChildTE);
        TransformedToGatherNodes.erase(&ChildTE);
        NodesCosts.erase(&ChildTE);
      }
    }
  }
    return SubtreeCosts.front().first;
  for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    if (!TE->UserTreeIndex && TransformedToGatherNodes.contains(TE.get())) {
      assert(TE->getOpcode() == Instruction::Load && "Expected load only.");
    }
    if (DeletedNodes.contains(TE.get()))
      continue;
    if (!NodesCosts.contains(TE.get())) {
          getEntryCost(TE.get(), VectorizedVals, CheckedExtracts);
    }
  }
  LLVM_DEBUG(dbgs() << "SLP: Recalculate costs after tree trimming.\n");
  for (const auto &P : NodesCosts) {
    NewCost += P.second;
    LLVM_DEBUG(dbgs() << "SLP: Adding cost " << P.second << " for bundle "
                      << "SLP: Current total cost = " << Cost << "\n");
  }
  if (NewCost >= Cost) {
    DeletedNodes.clear();
    TransformedToGatherNodes.clear();
  }
template <typename T> struct ShuffledInsertData {
  /// List of insertelements to be replaced by shuffles.
  SmallVector<InsertElementInst *> InsertElements;
  /// The parent vectors and shuffle mask for the given list of inserts.
  MapVector<T, SmallVector<int>> ValueMasks;
};

  if (none_of(ExternalUses, [](const ExternalUser &EU) {
  std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
  for (ExternalUser &EU : ExternalUses) {
    ScalarUserAndIdx.emplace_back(EU.Scalar, EU.User, EU.Lane);
  }
  for (ExternalUser &EU : ExternalUses) {
    LLVM_DEBUG(dbgs() << "SLP: Computing cost for external use of TreeEntry "
                      << EU.E.Idx << " in lane " << EU.Lane << "\n");
               else dbgs() << " User: nullptr\n");
    LLVM_DEBUG(dbgs() << " Use: " << EU.Scalar->getNameOrAsOperand() << "\n");
    if (EphValues.count(EU.User))
      continue;
    if (!CheckedScalarUser.insert(std::make_pair(EU.Scalar, EU.User)).second ||
        CheckedScalarUser.contains(std::make_pair(EU.Scalar, nullptr))))
      continue;
    if ((!DT->isReachableFromEntry(UserParent) || UserParent->isEHPad() ||
        !ExtractCostCalculated.insert(EU.Scalar).second)
      continue;
      if (!UsedInserts.insert(VU).second)
        continue;
      const TreeEntry *ScalarTE = &EU.E;
          [this, VU](const ShuffledInsertData<const TreeEntry *> &Data) {
        Value *Op0 = II->getOperand(0);
      if (It == ShuffledInserts.end()) {
        Data.InsertElements.emplace_back(VU);
        VecId = ShuffledInserts.size() - 1;
        auto It = MinBWs.find(ScalarTE);
        if (It != MinBWs.end() &&
                .insert(std::make_pair(ScalarTE, FTy->getElementType()))
          unsigned BWSz = It->second.first;
          unsigned DstBWSz = DL->getTypeSizeInBits(FTy->getElementType());
          unsigned VecOpcode;
          if (DstBWSz < BWSz)
            VecOpcode = Instruction::Trunc;
          else
            VecOpcode =
                It->second.second ? Instruction::SExt : Instruction::ZExt;
                             FTy->getNumElements()),
                 << " for extending externally used vector with "
                    "non-equal minimum bitwidth.\n");
        }
      } else {
        It->InsertElements.front() = VU;
        VecId = std::distance(ShuffledInserts.begin(), It);
      }
      int InIdx = *InsertIdx;
          ShuffledInserts[VecId].ValueMasks[ScalarTE];
      Mask[InIdx] = EU.Lane;
      DemandedElts[VecId].setBit(InIdx);
    auto *ScalarTy = EU.Scalar->getType();
    const unsigned BundleWidth = EU.E.getVectorFactor();
    assert(EU.Lane < BundleWidth && "Extracted lane out of bounds.");
    const TreeEntry *Entry = &EU.E;
    auto It = MinBWs.find(Entry);
    if (It != MinBWs.end()) {
                       ? Instruction::ZExt
                       : Instruction::SExt;
                        << ExtraCost << "\n");
    }
        CostKind, EU.Lane, EU.Scalar, ScalarUserAndIdx);
    LLVM_DEBUG(dbgs() << " ExtractElement cost for " << *ScalarTy << " from "
                      << *VecTy << ": " << ExtraCost << "\n");
    if (Entry->Idx != 0 || Entry->getOpcode() == Instruction::GetElementPtr ||
        Entry->getOpcode() == Instruction::Load) {
      auto IsPhiInLoop = [&](const ExternalUser &U) {
          const Loop *L = LI->getLoopFor(Phi->getParent());
          return L && (Phi->getParent() == I->getParent() ||
                       L == LI->getLoopFor(I->getParent()));
      };
      if (!ValueToExtUses) {
        ValueToExtUses.emplace();
        for (const auto &P : enumerate(ExternalUses)) {
          if (IsPhiInLoop(P.value()))
            continue;
          ValueToExtUses->try_emplace(P.value().Scalar, P.index());
        }
      }
      auto OperandIsScalar = [&](Value *V) {
          return !EE->hasOneUse() || !MustGather.contains(EE);
        return ValueToExtUses->contains(V);
      };
      bool CanBeUsedAsScalar = all_of(Inst->operands(), OperandIsScalar);
      bool CanBeUsedAsScalarCast = false;
      if (Op && all_of(Op->operands(), OperandIsScalar)) {
        if (ScalarCost + OpCost <= ExtraCost) {
          CanBeUsedAsScalar = CanBeUsedAsScalarCast = true;
          ScalarCost += OpCost;
        }
      }
      if (CanBeUsedAsScalar) {
        bool KeepScalar = ScalarCost <= ExtraCost;
        bool IsProfitablePHIUser =
             VectorizableTree.front()->Scalars.size() > 2)) &&
            VectorizableTree.front()->hasState() &&
            VectorizableTree.front()->getOpcode() == Instruction::PHI &&
              auto *PHIUser = dyn_cast<PHINode>(U);
              return (!PHIUser ||
                      PHIUser->getParent() !=
                          VectorizableTree.front()->getMainOp())
              return ValueToExtUses->contains(V);
        if (IsProfitablePHIUser) {
        } else if (
            (!GatheredLoadsEntriesFirst.has_value() ||
             Entry->Idx < *GatheredLoadsEntriesFirst)) {
          unsigned ScalarUsesCount = count_if(Entry->Scalars, [&](Value *V) {
            return ValueToExtUses->contains(V);
          });
          auto It = ExtractsCount.find(Entry);
          if (It != ExtractsCount.end()) {
            assert(ScalarUsesCount >= It->getSecond().size() &&
                   "Expected total number of external uses not less than "
                   "number of scalar uses.");
            ScalarUsesCount -= It->getSecond().size();
          }
          KeepScalar =
              ScalarUsesCount <= 1 || !has_single_bit(ScalarUsesCount);
        }
        if (KeepScalar) {
          ExternalUsesAsOriginalScalar.insert(EU.Scalar);
          for (Value *V : Inst->operands()) {
            auto It = ValueToExtUses->find(V);
            if (It != ValueToExtUses->end()) {
              ExternalUses[It->second].User = nullptr;
            }
          }
          ExtraCost = ScalarCost;
          if (!IsPhiInLoop(EU))
            ExtractsCount[Entry].insert(Inst);
          if (CanBeUsedAsScalarCast) {
            ScalarOpsFromCasts.insert(Inst->getOperand(0));
              for (Value *V : IOp->operands()) {
                auto It = ValueToExtUses->find(V);
                if (It != ValueToExtUses->end()) {
                  ExternalUses[It->second].User = nullptr;
                }
              }
          }
        }
      }
    }
    ExtractCost += ExtraCost;
  }
  for (Value *V : ScalarOpsFromCasts) {
    ExternalUsesAsOriginalScalar.insert(V);
    const auto *It = find_if_not(TEs, [&](TreeEntry *TE) {
      return TransformedToGatherNodes.contains(TE) ||
             DeletedNodes.contains(TE);
    });
    if (It != TEs.end()) {
      const TreeEntry *UserTE = *It;
      ExternalUses.emplace_back(V, nullptr, *UserTE,
                                UserTE->findLaneForValue(V));
    }
  }
  if (!VectorizedVals.empty()) {
    const TreeEntry &Root = *VectorizableTree.front();
    auto BWIt = MinBWs.find(&Root);
    if (BWIt != MinBWs.end()) {
      Type *DstTy = Root.Scalars.front()->getType();
      unsigned OriginalSz = DL->getTypeSizeInBits(DstTy->getScalarType());
      unsigned SrcSz =
          ReductionBitWidth == 0 ? BWIt->second.first : ReductionBitWidth;
      if (OriginalSz != SrcSz) {
        unsigned Opcode = Instruction::Trunc;
        if (OriginalSz > SrcSz)
          Opcode = BWIt->second.second ? Instruction::SExt : Instruction::ZExt;
        Cost += TTI->getCastInstrCost(Opcode, DstTy, SrcTy,
      }
    }
  }
  if (VectorizableTree[1]->hasState() &&
      VectorizableTree[1]->State == TreeEntry::Vectorize &&
      all_of(VectorizableTree[1]->Scalars, [&](Value *V) {
        return ExternalUsesAsOriginalScalar.contains(V);
      }))
17088 auto &&ResizeToVF = [
this, &Cost](
const TreeEntry *TE,
ArrayRef<int> Mask,
17089 bool ForSingleMask) {
17091 unsigned VF = Mask.size();
17092 unsigned VecVF = TE->getVectorFactor();
17093 bool HasLargeIndex =
17094 any_of(Mask, [VF](
int Idx) {
return Idx >=
static_cast<int>(VF); });
17095 if ((VF != VecVF && HasLargeIndex) ||
17098 if (HasLargeIndex) {
17100 std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)),
17106 dbgs() <<
"SLP: Adding cost " <<
C
17107 <<
" for final shuffle of insertelement external users.\n";
17108 TE->dump();
dbgs() <<
"SLP: Current total cost = " << Cost <<
"\n");
17110 return std::make_pair(TE,
true);
17113 if (!ForSingleMask) {
17115 for (
unsigned I = 0;
I < VF; ++
I) {
17117 ResizeMask[Mask[
I]] = Mask[
I];
17124 dbgs() <<
"SLP: Adding cost " <<
C
17125 <<
" for final shuffle of insertelement external users.\n";
17126 TE->dump();
dbgs() <<
"SLP: Current total cost = " << Cost <<
"\n");
17131 return std::make_pair(TE,
false);
17134 for (
int I = 0, E = ShuffledInserts.
size();
I < E; ++
I) {
17135 Value *
Base = ShuffledInserts[
I].InsertElements.
front()->getOperand(0);
17136 auto Vector = ShuffledInserts[
I].ValueMasks.takeVector();
17140 assert((TEs.size() == 1 || TEs.size() == 2) &&
17141 "Expected exactly 1 or 2 tree entries.");
17142 if (TEs.size() == 1) {
17144 VF = TEs.front()->getVectorFactor();
17145 auto *FTy =
getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
17149 (
Data.index() < VF &&
17150 static_cast<int>(
Data.index()) ==
Data.value());
17155 <<
" for final shuffle of insertelement "
17156 "external users.\n";
17157 TEs.front()->
dump();
17158 dbgs() <<
"SLP: Current total cost = " << Cost <<
"\n");
17164 TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor())
17165 VF = TEs.front()->getVectorFactor();
17169 auto *FTy =
getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
17173 <<
" for final shuffle of vector node and external "
17174 "insertelement users.\n";
17175 if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump();
17176 dbgs() <<
"SLP: Current total cost = " << Cost <<
"\n");
17184 [](
const TreeEntry *E) { return E->getVectorFactor(); }, ResizeToVF,
17185 EstimateShufflesCost);
17188 ShuffledInserts[
I].InsertElements.
front()->getType()),
17191 Cost -= InsertCost;
  if (ReductionBitWidth != 0) {
    assert(UserIgnoreList && "Expected reduction tree.");
    const TreeEntry &E = *VectorizableTree.front();
    auto It = MinBWs.find(&E);
    if (It != MinBWs.end() && It->second.first != ReductionBitWidth) {
      unsigned SrcSize = It->second.first;
      unsigned DstSize = ReductionBitWidth;
      unsigned Opcode = Instruction::Trunc;
      if (SrcSize < DstSize) {
        bool IsArithmeticExtendedReduction =
              return is_contained({Instruction::Add, Instruction::FAdd,
                                   Instruction::Mul, Instruction::FMul,
                                   Instruction::And, Instruction::Or,
        if (IsArithmeticExtendedReduction)
          Opcode = Instruction::BitCast;
        else
          Opcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
      }
      if (Opcode != Instruction::BitCast) {
        auto *SrcVecTy =
            getWidenedType(Builder.getIntNTy(SrcSize), E.getVectorFactor());
        auto *DstVecTy =
            getWidenedType(Builder.getIntNTy(DstSize), E.getVectorFactor());
        switch (E.getOpcode()) {
        case Instruction::SExt:
        case Instruction::ZExt:
        case Instruction::Trunc: {
          const TreeEntry *OpTE = getOperandEntry(&E, 0);
          CCH = getCastContextHint(*OpTE);
          break;
        }
        }
        CastCost += TTI->getCastInstrCost(Opcode, DstVecTy, SrcVecTy, CCH,
        LLVM_DEBUG({
          dbgs() << " for final resize for reduction from " << SrcVecTy
                 << " to " << DstVecTy << "\n";
          dbgs() << "SLP: Current total cost = " << Cost << "\n";
        });
      }
    }
  }
  std::optional<InstructionCost> SpillCost;
    Cost += *SpillCost;
    OS << "SLP: Spill Cost = ";
    OS << ".\nSLP: Extract Cost = " << ExtractCost << ".\n"
       << "SLP: Total Cost = " << Cost << ".\n";
    ViewGraph(this, "SLP" + F->getName(), false, Str);
std::optional<TTI::ShuffleKind>
BoUpSLP::tryToGatherSingleRegisterExtractElements(
  for (int I = 0, E = VL.size(); I < E; ++I) {
      if (Idx >= VecTy->getNumElements()) {
        SmallBitVector ExtractMask(VecTy->getNumElements(), true);
        ExtractMask.reset(*Idx);
      }
    VectorOpToIdx[EI->getVectorOperand()].push_back(I);
  }
  stable_sort(Vectors, [](const auto &P1, const auto &P2) {
    return P1.second.size() > P2.second.size();
  });
  const int UndefSz = UndefVectorExtracts.size();
  unsigned SingleMax = 0;
  unsigned PairMax = 0;
  if (!Vectors.empty()) {
    SingleMax = Vectors.front().second.size() + UndefSz;
    if (Vectors.size() > 1) {
      auto *ItNext = std::next(Vectors.begin());
      PairMax = SingleMax + ItNext->second.size();
    }
  }
  if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
    return std::nullopt;
  if (SingleMax >= PairMax && SingleMax) {
    for (int Idx : Vectors.front().second)
      std::swap(GatheredExtracts[Idx], VL[Idx]);
  } else if (!Vectors.empty()) {
    for (unsigned Idx : {0, 1})
      for (int Idx : Vectors[Idx].second)
        std::swap(GatheredExtracts[Idx], VL[Idx]);
  }
  for (int Idx : UndefVectorExtracts)
    std::swap(GatheredExtracts[Idx], VL[Idx]);
  std::optional<TTI::ShuffleKind> Res =
    return std::nullopt;
  for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) {
BoUpSLP::tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
                                    SmallVectorImpl<int> &Mask,
                                    unsigned NumParts) const {
  assert(NumParts > 0 && "NumParts expected be greater than or equal to 1.");
    SmallVector<int> SubMask;
    std::optional<TTI::ShuffleKind> Res =
        tryToGatherSingleRegisterExtractElements(SubVL, SubMask);
    ShufflesRes[Part] = Res;
    copy(SubMask, std::next(Mask.begin(), Part * SliceSize));
  if (none_of(ShufflesRes, [](const std::optional<TTI::ShuffleKind> &Res) {
        return Res.has_value();
      }))
    ShufflesRes.clear();
  return ShufflesRes;
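/// Checks whether the gather node \p TE can instead be emitted as a shuffle
/// of up to two existing tree entries for this register-sized part. The
/// dominator and ordering lambdas below ensure any reused entry is
/// materialized before the gather's insertion point, so the shuffle's
/// operands are available where they are needed.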
std::optional<TargetTransformInfo::ShuffleKind>
BoUpSLP::isGatherShuffledSingleRegisterEntry(
    SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part,
    bool ForOrder) {
    return std::nullopt;
  auto GetUserEntry = [&](const TreeEntry *TE) {
    while (TE->UserTreeIndex && TE->UserTreeIndex.EdgeIdx == UINT_MAX)
      TE = TE->UserTreeIndex.UserTE;
    if (TE == VectorizableTree.front().get())
      return EdgeInfo(const_cast<TreeEntry *>(TE), 0);
    return TE->UserTreeIndex;
  };
  auto HasGatherUser = [&](const TreeEntry *TE) {
    while (TE->Idx != 0 && TE->UserTreeIndex) {
      if (TE->UserTreeIndex.EdgeIdx == UINT_MAX)
        return true;
      TE = TE->UserTreeIndex.UserTE;
    }
    return false;
  };
  const EdgeInfo TEUseEI = GetUserEntry(TE);
    return std::nullopt;
  const Instruction *TEInsertPt = &getLastInstructionInBundle(TEUseEI.UserTE);
  if (auto *PHI = dyn_cast<PHINode>(
          TEUseEI.UserTE->hasState() ? TEUseEI.UserTE->getMainOp() : nullptr);
      PHI && TEUseEI.UserTE->State != TreeEntry::SplitVectorize) {
    TEInsertBlock = PHI->getIncomingBlock(TEUseEI.EdgeIdx);
  } else {
    TEInsertBlock = TEInsertPt->getParent();
  }
  if (!DT->isReachableFromEntry(TEInsertBlock))
    return std::nullopt;
  auto *NodeUI = DT->getNode(TEInsertBlock);
  assert(NodeUI && "Should only process reachable instructions");
  auto CheckOrdering = [&](const Instruction *InsertPt) {
    const BasicBlock *InsertBlock = InsertPt->getParent();
    auto *NodeEUI = DT->getNode(InsertBlock);
    assert((NodeUI == NodeEUI) ==
               (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) &&
           "Different nodes should have different DFS numbers");
    if (TEInsertPt->getParent() != InsertBlock &&
        (DT->dominates(NodeUI, NodeEUI) || !DT->dominates(NodeEUI, NodeUI)))
      return false;
    if (TEInsertPt->getParent() == InsertBlock &&
  SmallDenseMap<Value *, int> UsedValuesEntry;
  SmallPtrSet<const Value *, 16> VisitedValue;
  auto CheckAndUseSameNode = [&](const TreeEntry *TEPtr) {
    if ((TEPtr->getVectorFactor() != VL.size() &&
         TEPtr->Scalars.size() != VL.size()) ||
        (!TEPtr->isSame(VL) && !TEPtr->isSame(TE->Scalars)))
      return false;
    for (Value *V : VL) {
    }
  };
  auto CheckParentNodes = [&](const TreeEntry *User1, const TreeEntry *User2,
                              unsigned EdgeIdx) {
    const TreeEntry *Ptr1 = User1;
    const TreeEntry *Ptr2 = User2;
    SmallDenseMap<const TreeEntry *, unsigned> PtrToIdx;
      EdgeIdx = Ptr2->UserTreeIndex.EdgeIdx;
      Ptr2 = Ptr2->UserTreeIndex.UserTE;
      unsigned Idx = Ptr1->UserTreeIndex.EdgeIdx;
      Ptr1 = Ptr1->UserTreeIndex.UserTE;
      if (auto It = PtrToIdx.find(Ptr1); It != PtrToIdx.end())
        return Idx < It->second;
  };
  auto CheckNonSchedulableOrdering = [&](const TreeEntry *E,
    return TEUseEI && TEUseEI.UserTE && TEUseEI.UserTE->hasCopyableElements() &&
           !TEUseEI.UserTE->isCopyableElement(
           InsertPt->getNextNode() == TEInsertPt &&
           (!E->hasCopyableElements() || !E->isCopyableElement(InsertPt) ||
  };
  for (Value *V : VL) {
    SmallPtrSet<const TreeEntry *, 4> VToTEs;
        ValueToGatherNodes.lookup(V).takeVector());
    if (TransformedToGatherNodes.contains(TE)) {
      for (TreeEntry *E : getSplitTreeEntries(V)) {
        if (TE == E || !TransformedToGatherNodes.contains(E) ||
            !E->UserTreeIndex || E->UserTreeIndex.UserTE->isGather())
          continue;
        GatherNodes.push_back(E);
      }
      for (TreeEntry *E : getTreeEntries(V)) {
        if (TE == E || !TransformedToGatherNodes.contains(E) ||
            !E->UserTreeIndex || E->UserTreeIndex.UserTE->isGather())
          continue;
        GatherNodes.push_back(E);
      }
    }
const TreeEntry *TEPtr : GatherNodes) {
17554 if (TEPtr == TE || TEPtr->Idx == 0 || DeletedNodes.contains(TEPtr))
17557 [&](
Value *V) { return GatheredScalars.contains(V); }) &&
17558 "Must contain at least single gathered value.");
17559 assert(TEPtr->UserTreeIndex &&
17560 "Expected only single user of a gather node.");
17561 const EdgeInfo &UseEI = TEPtr->UserTreeIndex;
17563 PHINode *UserPHI = (UseEI.UserTE->State != TreeEntry::SplitVectorize &&
17564 UseEI.UserTE->hasState())
17569 : &getLastInstructionInBundle(UseEI.UserTE);
17570 if (TEInsertPt == InsertPt) {
17572 if (TEUseEI.UserTE->State == TreeEntry::Vectorize &&
17573 (TEUseEI.UserTE->getOpcode() != Instruction::PHI ||
17574 TEUseEI.UserTE->isAltShuffle()) &&
17576 if (UseEI.UserTE->State != TreeEntry::Vectorize ||
17577 (UseEI.UserTE->hasState() &&
17578 UseEI.UserTE->getOpcode() == Instruction::PHI &&
17579 !UseEI.UserTE->isAltShuffle()) ||
17588 (TEUseEI.UserTE != UseEI.UserTE || TEUseEI.EdgeIdx < UseEI.EdgeIdx))
17591 if (TEUseEI.UserTE->State == TreeEntry::Vectorize &&
17592 TEUseEI.UserTE->getOpcode() == Instruction::PHI &&
17593 UseEI.UserTE->State == TreeEntry::Vectorize &&
17594 UseEI.UserTE->getOpcode() == Instruction::PHI &&
17595 TEUseEI.UserTE != UseEI.UserTE)
17600 if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx)
17604 if (TEUseEI.UserTE != UseEI.UserTE &&
17605 (TEUseEI.UserTE->Idx < UseEI.UserTE->Idx ||
17606 HasGatherUser(TEUseEI.UserTE)))
17609 if (CheckParentNodes(TEUseEI.UserTE, UseEI.UserTE, UseEI.EdgeIdx))
17613 if (!TEUseEI.UserTE->isGather() && !UserPHI &&
17614 TEUseEI.UserTE->doesNotNeedToSchedule() !=
17615 UseEI.UserTE->doesNotNeedToSchedule() &&
17620 if ((TEInsertBlock != InsertPt->
getParent() ||
17621 TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) &&
17622 (!CheckOrdering(InsertPt) ||
17623 (UseEI.UserTE->hasCopyableElements() &&
17628 if (CheckAndUseSameNode(TEPtr))
17633 if (CheckNonSchedulableOrdering(UseEI.UserTE, InsertPt))
17638 const auto *It =
find_if(VTEs, [&](
const TreeEntry *MTE) {
17639 return MTE !=
TE && MTE != TEUseEI.UserTE &&
17640 !DeletedNodes.contains(MTE) &&
17641 !TransformedToGatherNodes.contains(MTE);
17643 if (It != VTEs.end()) {
17644 const TreeEntry *VTE = *It;
17645 if (
none_of(
TE->CombinedEntriesWithIndices,
17646 [&](
const auto &
P) { return P.first == VTE->Idx; })) {
17647 Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
17648 if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
17652 if (CheckAndUseSameNode(VTE))
17658 const auto *It =
find_if(VTEs, [&, MainTE = TE](
const TreeEntry *TE) {
17659 return TE != MainTE && !DeletedNodes.contains(TE) &&
17660 !TransformedToGatherNodes.contains(TE);
17662 if (It != VTEs.end()) {
17663 const TreeEntry *VTE = *It;
17664 if (ForOrder && VTE->Idx < GatheredLoadsEntriesFirst.value_or(0) &&
17665 VTEs.size() > 1 && VTE->State != TreeEntry::Vectorize) {
17666 VTEs = VTEs.drop_front();
17668 const auto *MIt =
find_if(VTEs, [](
const TreeEntry *MTE) {
17669 return MTE->State == TreeEntry::Vectorize;
17671 if (MIt == VTEs.end())
17675 if (
none_of(
TE->CombinedEntriesWithIndices,
17676 [&](
const auto &
P) { return P.first == VTE->Idx; })) {
17677 Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
17678 if (&LastBundleInst == TEInsertPt ||
17679 !CheckOrdering(&LastBundleInst) ||
17680 CheckNonSchedulableOrdering(VTE, &LastBundleInst))
17684 if (CheckAndUseSameNode(VTE))
17689 if (VToTEs.
empty())
17691 if (UsedTEs.
empty()) {
17699 SmallPtrSet<const TreeEntry *, 4> SavedVToTEs(VToTEs);
17701 for (SmallPtrSet<const TreeEntry *, 4> &Set : UsedTEs) {
17705 if (!VToTEs.
empty()) {
17711 VToTEs = SavedVToTEs;
17716 if (Idx == UsedTEs.
size()) {
17720 if (UsedTEs.
size() == 2)
17722 UsedTEs.push_back(SavedVToTEs);
17723 Idx = UsedTEs.
size() - 1;
17729 if (UsedTEs.
empty()) {
17731 return std::nullopt;
17735 if (UsedTEs.
size() == 1) {
17738 UsedTEs.front().
end());
17739 sort(FirstEntries, [](
const TreeEntry *TE1,
const TreeEntry *TE2) {
17740 return TE1->Idx < TE2->Idx;
17743 auto *It =
find_if(FirstEntries, [=](
const TreeEntry *EntryPtr) {
17744 return EntryPtr->isSame(VL) || EntryPtr->isSame(
TE->Scalars);
17746 if (It != FirstEntries.end() &&
17747 ((*It)->getVectorFactor() == VL.size() ||
17748 ((*It)->getVectorFactor() ==
TE->Scalars.size() &&
17749 TE->ReuseShuffleIndices.size() == VL.size() &&
17750 (*It)->isSame(
TE->Scalars)))) {
17752 if ((*It)->getVectorFactor() == VL.size()) {
17753 std::iota(std::next(
Mask.begin(), Part * VL.size()),
17754 std::next(
Mask.begin(), (Part + 1) * VL.size()), 0);
17756 SmallVector<int> CommonMask =
TE->getCommonMask();
17767 Entries.
push_back(FirstEntries.front());
17769 for (
auto &
P : UsedValuesEntry)
17771 VF = FirstEntries.front()->getVectorFactor();
17774 assert(UsedTEs.
size() == 2 &&
"Expected at max 2 permuted entries.");
17776 DenseMap<int, const TreeEntry *> VFToTE;
17777 for (
const TreeEntry *TE : UsedTEs.front()) {
17778 unsigned VF =
TE->getVectorFactor();
17779 auto It = VFToTE.
find(VF);
17780 if (It != VFToTE.
end()) {
17781 if (It->second->Idx >
TE->Idx)
17782 It->getSecond() =
TE;
17789 UsedTEs.back().
end());
17790 sort(SecondEntries, [](
const TreeEntry *TE1,
const TreeEntry *TE2) {
17791 return TE1->Idx < TE2->Idx;
17793 for (
const TreeEntry *TE : SecondEntries) {
17794 auto It = VFToTE.
find(
TE->getVectorFactor());
17795 if (It != VFToTE.
end()) {
17804 if (Entries.
empty()) {
17806 UsedTEs.front(), [](
const TreeEntry *TE1,
const TreeEntry *TE2) {
17807 return TE1->Idx < TE2->Idx;
17809 Entries.
push_back(SecondEntries.front());
17810 VF = std::max(Entries.
front()->getVectorFactor(),
17811 Entries.
back()->getVectorFactor());
17813 VF = Entries.
front()->getVectorFactor();
17816 for (
const TreeEntry *
E : Entries)
17820 for (
auto &
P : UsedValuesEntry) {
17822 if (ValuesToEntries[Idx].
contains(
P.first)) {
17832 auto AreCompatiblePHIs = [&](
Value *
V,
Value *V1) {
17839 for (
int I = 0,
E =
PHI->getNumIncomingValues();
I <
E; ++
I) {
17841 Value *In1 = PHI1->getIncomingValue(
I);
17856 auto MightBeIgnored = [=](
Value *
V) {
17860 !areAllUsersVectorized(
I, UserIgnoreList) &&
isSimple(
I);
17865 auto NeighborMightBeIgnored = [&](
Value *
V,
int Idx) {
17866 Value *V1 = VL[Idx];
17867 bool UsedInSameVTE =
false;
17868 auto It = UsedValuesEntry.find(V1);
17869 if (It != UsedValuesEntry.end())
17870 UsedInSameVTE = It->second == UsedValuesEntry.find(V)->second;
17871 return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE &&
17878 SmallBitVector UsedIdxs(Entries.size());
17880 for (
int I = 0,
E = VL.size();
I <
E; ++
I) {
17882 auto It = UsedValuesEntry.find(V);
17883 if (It == UsedValuesEntry.end())
17889 ((
I > 0 && NeighborMightBeIgnored(V,
I - 1)) ||
17890 (
I !=
E - 1 && NeighborMightBeIgnored(V,
I + 1)))))
17892 unsigned Idx = It->second;
17899 for (
unsigned I = 0, Sz = Entries.size();
I < Sz; ++
I) {
17900 if (!UsedIdxs.test(
I))
17906 for (std::pair<unsigned, int> &Pair : EntryLanes)
17907 if (Pair.first ==
I)
17908 Pair.first = TempEntries.
size();
17911 Entries.swap(TempEntries);
17912 if (EntryLanes.size() == Entries.size() &&
17914 .slice(Part * VL.size(),
17915 std::min<int>(VL.size(),
TE->Scalars.size())))) {
17921 return std::nullopt;
17924 bool IsIdentity = Entries.size() == 1;
17927 for (
const std::pair<unsigned, int> &Pair : EntryLanes) {
17928 unsigned Idx = Part * VL.size() + Pair.second;
17931 (ForOrder ? std::distance(
17932 Entries[Pair.first]->Scalars.begin(),
17933 find(Entries[Pair.first]->Scalars, VL[Pair.second]))
17934 : Entries[Pair.first]->findLaneForValue(VL[Pair.second]));
17935 IsIdentity &=
Mask[Idx] == Pair.second;
17937 if (ForOrder || IsIdentity || Entries.empty()) {
17938 switch (Entries.size()) {
17940 if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
17944 if (EntryLanes.size() > 2 || VL.size() <= 2)
17951 (EntryLanes.size() > Entries.size() || VL.size() <= 2)) {
17953 SmallVector<int> SubMask(std::next(
Mask.begin(), Part * VL.size()),
17954 std::next(
Mask.begin(), (Part + 1) * VL.size()));
17955 int MinElement = SubMask.
front(), MaxElement = SubMask.
front();
17956 for (
int Idx : SubMask) {
17964 assert(MaxElement >= 0 && MinElement >= 0 &&
17965 MaxElement % VF >= MinElement % VF &&
17966 "Expected at least single element.");
17967 unsigned NewVF = std::max<unsigned>(
17969 (MaxElement % VF) -
17970 (MinElement % VF) + 1));
17972 for (
int &Idx : SubMask) {
17975 Idx = ((Idx % VF) - (((MinElement % VF) / NewVF) * NewVF)) % NewVF +
17976 (Idx >=
static_cast<int>(VF) ? NewVF : 0);
17984 auto *MaskVecTy =
getWidenedType(VL.front()->getType(), SubMask.size());
17985 auto GetShuffleCost = [&,
17986 &TTI = *TTI](ArrayRef<int>
Mask,
17989 if (Entries.size() == 1 && Entries.front()->getInterleaveFactor() > 0 &&
17991 Mask, Entries.front()->getInterleaveFactor()))
17993 return ::getShuffleCost(TTI,
17998 InstructionCost ShuffleCost = GetShuffleCost(SubMask, Entries, VecTy);
18000 SmallVector<int> FirstMask(SubMask.begin(), SubMask.end());
18001 if (Entries.size() == 1 || !Entries[0]->isGather()) {
18002 FirstShuffleCost = ShuffleCost;
18006 bool IsIdentity =
true;
18007 for (
auto [
I, Idx] :
enumerate(FirstMask)) {
18008 if (Idx >=
static_cast<int>(NewVF)) {
18013 IsIdentity &=
static_cast<int>(
I) == Idx;
18017 FirstShuffleCost = GetShuffleCost(FirstMask, Entries.front(), VecTy);
18019 *TTI, VL.front()->getType(), MaskVecTy, DemandedElts,
true,
18023 SmallVector<int> SecondMask(SubMask.begin(), SubMask.end());
18024 if (Entries.size() == 1 || !Entries[1]->isGather()) {
18025 SecondShuffleCost = ShuffleCost;
18029 bool IsIdentity =
true;
18030 for (
auto [
I, Idx] :
enumerate(SecondMask)) {
18031 if (Idx <
static_cast<int>(NewVF) && Idx >= 0) {
18037 IsIdentity &=
static_cast<int>(
I) == Idx;
18042 SecondShuffleCost = GetShuffleCost(SecondMask, Entries[1], VecTy);
18044 *TTI, VL.front()->getType(), MaskVecTy, DemandedElts,
true,
18052 *TTI, VL.front()->getType(), MaskVecTy, DemandedElts,
true,
18054 const TreeEntry *BestEntry =
nullptr;
18055 if (FirstShuffleCost < ShuffleCost) {
18056 std::for_each(std::next(
Mask.begin(), Part * VL.size()),
18057 std::next(
Mask.begin(), (Part + 1) * VL.size()),
18059 if (Idx >= static_cast<int>(VF))
18060 Idx = PoisonMaskElem;
18062 BestEntry = Entries.front();
18063 ShuffleCost = FirstShuffleCost;
18065 if (SecondShuffleCost < ShuffleCost) {
18066 std::for_each(std::next(
Mask.begin(), Part * VL.size()),
18067 std::next(
Mask.begin(), (Part + 1) * VL.size()),
18069 if (Idx < static_cast<int>(VF))
18070 Idx = PoisonMaskElem;
18074 BestEntry = Entries[1];
18075 ShuffleCost = SecondShuffleCost;
18077 if (BuildVectorCost >= ShuffleCost) {
18080 Entries.push_back(BestEntry);
18088 std::fill(std::next(
Mask.begin(), Part * VL.size()),
18090 return std::nullopt;
18094BoUpSLP::isGatherShuffledEntry(
18098 assert(NumParts > 0 && NumParts < VL.
size() &&
18099 "Expected positive number of registers.");
18102 if (TE == VectorizableTree.front().get() &&
18103 (!GatheredLoadsEntriesFirst.has_value() ||
18105 [](
const std::unique_ptr<TreeEntry> &TE) {
18106 return !
TE->isGather();
18111 if (
TE->hasNonWholeRegisterOrNonPowerOf2Vec(*TTI))
18114 assert((
TE->UserTreeIndex || TE == VectorizableTree.front().get()) &&
18115 "Expected only single user of the gather node.");
18117 "Number of scalars must be divisible by NumParts.");
18118 if (
TE->UserTreeIndex &&
TE->UserTreeIndex.UserTE->isGather() &&
18119 TE->UserTreeIndex.EdgeIdx == UINT_MAX &&
18121 (
TE->hasState() &&
TE->getOpcode() == Instruction::ExtractElement) ||
18124 getSameValuesTreeEntry(
TE->getMainOp(),
TE->Scalars))))
18131 SmallVectorImpl<const TreeEntry *> &SubEntries = Entries.emplace_back();
18132 std::optional<TTI::ShuffleKind> SubRes =
18133 isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part,
18136 SubEntries.
clear();
18139 SubEntries.
front()->getVectorFactor() == VL.
size() &&
18140 (SubEntries.
front()->isSame(
TE->Scalars) ||
18141 SubEntries.
front()->isSame(VL))) {
18143 LocalSubEntries.
swap(SubEntries);
18146 std::iota(
Mask.begin(),
Mask.end(), 0);
18148 for (
int I = 0, Sz = VL.
size();
I < Sz; ++
I)
18151 Entries.emplace_back(1, LocalSubEntries.
front());
18157 [](
const std::optional<TTI::ShuffleKind> &SK) {
return !SK; })) {
18165 Type *ScalarTy)
const {
18166 const unsigned VF = VL.
size();
18174 auto EstimateInsertCost = [&](
unsigned I,
Value *
V) {
18176 if (
V->getType() != ScalarTy)
18177 Cost += TTI->getCastInstrCost(Instruction::Trunc, ScalarTy,
V->getType(),
18181 std::iota(ConstantShuffleMask.begin(), ConstantShuffleMask.end(), 0);
18188 ConstantShuffleMask[
I] =
I + VF;
18191 EstimateInsertCost(
I, V);
18194 bool IsAnyNonUndefConst =
18197 if (!ForPoisonSrc && IsAnyNonUndefConst) {
18199 ConstantShuffleMask);
18203 if (!DemandedElements.
isZero())
18207 ForPoisonSrc && !IsAnyNonUndefConst, VL);
18211Instruction &BoUpSLP::getLastInstructionInBundle(
const TreeEntry *
E) {
18212 auto It = EntryToLastInstruction.find(
E);
18213 if (It != EntryToLastInstruction.end())
18221 if (
E->hasState()) {
18222 Front =
E->getMainOp();
18223 Opcode =
E->getOpcode();
18230 ((GatheredLoadsEntriesFirst.has_value() && Opcode == Instruction::Load &&
18231 E->isGather() &&
E->Idx < *GatheredLoadsEntriesFirst) ||
18232 E->State == TreeEntry::SplitVectorize ||
E->hasCopyableElements() ||
18234 [=](
Value *V) ->
bool {
18235 if (Opcode == Instruction::GetElementPtr &&
18236 !isa<GetElementPtrInst>(V))
18238 auto *I = dyn_cast<Instruction>(V);
18239 return !I || !E->getMatchingMainOpOrAltOp(I) ||
18240 I->getParent() == BB || isVectorLikeInstWithConstOps(I);
18242 "Expected gathered loads or GEPs or instructions from same basic "
18245 auto FindLastInst = [&]() {
18247 for (
Value *V :
E->Scalars) {
18251 if (
E->isCopyableElement(
I))
18253 if (LastInst->
getParent() ==
I->getParent()) {
18258 assert(((Opcode == Instruction::GetElementPtr &&
18260 E->State == TreeEntry::SplitVectorize ||
18263 (GatheredLoadsEntriesFirst.has_value() &&
18264 Opcode == Instruction::Load &&
E->isGather() &&
18265 E->Idx < *GatheredLoadsEntriesFirst)) &&
18266 "Expected vector-like or non-GEP in GEP node insts only.");
18267 if (!DT->isReachableFromEntry(LastInst->
getParent())) {
18271 if (!DT->isReachableFromEntry(
I->getParent()))
18273 auto *NodeA = DT->getNode(LastInst->
getParent());
18274 auto *NodeB = DT->getNode(
I->getParent());
18275 assert(NodeA &&
"Should only process reachable instructions");
18276 assert(NodeB &&
"Should only process reachable instructions");
18277 assert((NodeA == NodeB) ==
18278 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
18279 "Different nodes should have different DFS numbers");
18280 if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn())
18287 auto FindFirstInst = [&]() {
18289 for (
Value *V :
E->Scalars) {
18293 if (
E->isCopyableElement(
I))
18295 if (FirstInst->
getParent() ==
I->getParent()) {
18296 if (
I->comesBefore(FirstInst))
18300 assert(((Opcode == Instruction::GetElementPtr &&
18304 "Expected vector-like or non-GEP in GEP node insts only.");
18305 if (!DT->isReachableFromEntry(FirstInst->
getParent())) {
18309 if (!DT->isReachableFromEntry(
I->getParent()))
18311 auto *NodeA = DT->getNode(FirstInst->
getParent());
18312 auto *NodeB = DT->getNode(
I->getParent());
18313 assert(NodeA &&
"Should only process reachable instructions");
18314 assert(NodeB &&
"Should only process reachable instructions");
18315 assert((NodeA == NodeB) ==
18316 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
18317 "Different nodes should have different DFS numbers");
18318 if (NodeA->getDFSNumIn() > NodeB->getDFSNumIn())
18324 if (
E->State == TreeEntry::SplitVectorize) {
18325 Res = FindLastInst();
18327 for (
auto *
E : Entries) {
18330 I = &getLastInstructionInBundle(
E);
18335 EntryToLastInstruction.try_emplace(
E, Res);
18340 if (GatheredLoadsEntriesFirst.has_value() &&
18341 E->Idx >= *GatheredLoadsEntriesFirst && !
E->isGather() &&
18342 Opcode == Instruction::Load) {
18343 Res = FindFirstInst();
18344 EntryToLastInstruction.try_emplace(
E, Res);
18350 auto FindScheduleBundle = [&](
const TreeEntry *
E) ->
const ScheduleBundle * {
18354 const auto *It = BlocksSchedules.find(BB);
18355 if (It == BlocksSchedules.end())
18357 for (
Value *V :
E->Scalars) {
18363 if (Bundles.
empty())
18366 Bundles, [&](ScheduleBundle *
B) {
return B->getTreeEntry() ==
E; });
18367 if (It != Bundles.
end())
18372 const ScheduleBundle *Bundle = FindScheduleBundle(
E);
18373 if (!
E->isGather() && !Bundle) {
18374 if ((Opcode == Instruction::GetElementPtr &&
18377 return !isa<GetElementPtrInst>(V) && isa<Instruction>(V);
18381 return isa<PoisonValue>(V) ||
18382 (E->Idx == 0 && isa<InsertElementInst>(V)) ||
18383 E->isCopyableElement(V) ||
18384 (!isVectorLikeInstWithConstOps(V) &&
18385 isUsedOutsideBlock(V));
18387 (!
E->doesNotNeedToSchedule() ||
18390 if (!isa<Instruction>(V) ||
18391 (E->hasCopyableElements() && E->isCopyableElement(V)))
18393 return !areAllOperandsNonInsts(V);
18396 if (!isa<Instruction>(V) ||
18397 (E->hasCopyableElements() && E->isCopyableElement(V)))
18399 return MustGather.contains(V);
18401 Res = FindLastInst();
18403 Res = FindFirstInst();
18404 EntryToLastInstruction.try_emplace(
E, Res);
18413 assert(!
E->isGather() &&
"Gathered instructions should not be scheduled");
18414 Res = Bundle->getBundle().back()->getInst();
18415 EntryToLastInstruction.try_emplace(
E, Res);
18438 Res = FindLastInst();
18439 assert(Res &&
"Failed to find last instruction in bundle");
18440 EntryToLastInstruction.try_emplace(
E, Res);
18444void BoUpSLP::setInsertPointAfterBundle(
const TreeEntry *
E) {
18445 auto *Front =
E->getMainOp();
18446 Instruction *LastInst = &getLastInstructionInBundle(
E);
18447 assert(LastInst &&
"Failed to find last instruction in bundle");
18452 LastInstIt = LastInst->
getParent()->getFirstNonPHIIt();
18453 if (LastInstIt != LastInst->
getParent()->end() &&
18454 LastInstIt->getParent()->isLandingPad())
18455 LastInstIt = std::next(LastInstIt);
18458 (!
E->isGather() &&
E->State != TreeEntry::SplitVectorize &&
18459 (
E->doesNotNeedToSchedule() ||
18460 (
E->hasCopyableElements() && !
E->isCopyableElement(LastInst) &&
18462 (GatheredLoadsEntriesFirst.has_value() &&
18463 E->Idx >= *GatheredLoadsEntriesFirst && !
E->isGather() &&
18464 E->getOpcode() == Instruction::Load)) {
18465 Builder.SetInsertPoint(LastInst->
getParent(), LastInstIt);
18469 Builder.SetInsertPoint(
18472 if (Instruction *Res = LastInstructionToPos.lookup(LastInst)) {
18475 Res = Builder.CreateAlignedLoad(Builder.getPtrTy(),
18480 LastInstructionToPos.try_emplace(LastInst, Res);
18483 Builder.SetCurrentDebugLocation(Front->
getDebugLoc());
18486Value *BoUpSLP::gather(
18488 function_ref<
Value *(
Value *,
Value *, ArrayRef<int>)> CreateShuffle) {
18494 SmallSet<int, 4> PostponedIndices;
18495 Loop *
L = LI->getLoopFor(Builder.GetInsertBlock());
18497 SmallPtrSet<BasicBlock *, 4> Visited;
18498 while (InsertBB && InsertBB != InstBB && Visited.
insert(InsertBB).second)
18499 InsertBB = InsertBB->getSinglePredecessor();
18500 return InsertBB && InsertBB == InstBB;
18502 for (
int I = 0,
E = VL.
size();
I <
E; ++
I) {
18504 if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) ||
18506 (L && (!Root ||
L->isLoopInvariant(Root)) &&
L->contains(Inst))) &&
18507 PostponedIndices.
insert(
I).second)
18511 auto &&CreateInsertElement = [
this](
Value *Vec,
Value *
V,
unsigned Pos,
18514 if (
Scalar->getType() != Ty) {
18525 Scalar = Builder.CreateIntCast(
18539 Vec = Builder.CreateInsertElement(Vec, Scalar, Builder.getInt32(Pos));
18544 GatherShuffleExtractSeq.insert(InsElt);
18549 const auto *It =
find_if(Entries, [&](
const TreeEntry *
E) {
18550 return !TransformedToGatherNodes.contains(
E) &&
18551 !DeletedNodes.contains(
E);
18553 if (It != Entries.
end()) {
18555 User *UserOp =
nullptr;
18560 if (
V->getType()->isVectorTy()) {
18562 SV && SV->getOperand(0) != V && SV->getOperand(1) != V) {
18564 auto FindOperand = [](
Value *Vec,
Value *
V) -> Instruction * {
18566 if (SV->getOperand(0) == V)
18568 if (SV->getOperand(1) == V)
18574 if (Instruction *User = FindOperand(SV->getOperand(0), V))
18576 else if (Instruction *User = FindOperand(SV->getOperand(1), V))
18579 "Failed to find shufflevector, caused by resize.");
18585 unsigned FoundLane = (*It)->findLaneForValue(V);
18586 ExternalUses.emplace_back(V, UserOp, **It, FoundLane);
18594 SmallVector<int> NonConsts;
18596 std::iota(
Mask.begin(),
Mask.end(), 0);
18597 Value *OriginalRoot = Root;
18600 SV->getOperand(0)->getType() == VecTy) {
18601 Root = SV->getOperand(0);
18602 Mask.assign(SV->getShuffleMask().begin(), SV->getShuffleMask().end());
18605 for (
int I = 0,
E = VL.
size();
I <
E; ++
I) {
18614 Vec = CreateInsertElement(Vec, VL[
I],
I, ScalarTy);
18619 Vec = OriginalRoot;
18621 Vec = CreateShuffle(Root, Vec, Mask);
18623 OI && OI->use_empty() &&
18624 none_of(VectorizableTree, [&](
const std::unique_ptr<TreeEntry> &TE) {
18625 return TE->VectorizedValue == OI;
18631 for (
int I : NonConsts)
18632 Vec = CreateInsertElement(Vec, VL[
I],
I, ScalarTy);
18635 for (
const std::pair<Value *, unsigned> &Pair : PostponedInsts)
18636 Vec = CreateInsertElement(Vec, Pair.first, Pair.second, ScalarTy);
18674 bool IsFinalized =
false;
18687 class ShuffleIRBuilder {
18700 : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq),
18701 CSEBlocks(CSEBlocks),
DL(DL) {}
18702 ~ShuffleIRBuilder() =
default;
18708 "Expected integer vector types only.");
18714 ->getIntegerBitWidth())
18715 V2 = Builder.CreateIntCast(
18718 V1 = Builder.CreateIntCast(
18722 Value *Vec = Builder.CreateShuffleVector(V1, V2, Mask);
18724 GatherShuffleExtractSeq.insert(
I);
18725 CSEBlocks.insert(
I->getParent());
18734 unsigned VF = Mask.size();
18738 Value *Vec = Builder.CreateShuffleVector(V1, Mask);
18740 GatherShuffleExtractSeq.insert(
I);
18741 CSEBlocks.insert(
I->getParent());
18745 Value *createIdentity(
Value *V) {
return V; }
18746 Value *createPoison(
Type *Ty,
unsigned VF) {
18751 void resizeToMatch(
Value *&V1,
Value *&V2) {
18756 int VF = std::max(V1VF, V2VF);
18757 int MinVF = std::min(V1VF, V2VF);
18759 std::iota(IdentityMask.
begin(), std::next(IdentityMask.
begin(), MinVF),
18761 Value *&
Op = MinVF == V1VF ? V1 : V2;
18762 Op = Builder.CreateShuffleVector(
Op, IdentityMask);
18764 GatherShuffleExtractSeq.insert(
I);
18765 CSEBlocks.insert(
I->getParent());
18778 assert(V1 &&
"Expected at least one vector value.");
18779 ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
18780 R.CSEBlocks, *R.DL);
18781 return BaseShuffleAnalysis::createShuffle<Value *>(
18782 V1, V2, Mask, ShuffleBuilder, ScalarTy);
18788 std::optional<bool> IsSigned = std::nullopt) {
18791 if (VecTy->getElementType() == ScalarTy->getScalarType())
18793 return Builder.CreateIntCast(
18794 V,
VectorType::get(ScalarTy->getScalarType(), VecTy->getElementCount()),
18798 Value *getVectorizedValue(
const TreeEntry &E) {
18799 Value *Vec = E.VectorizedValue;
18802 return castToScalarTyElem(Vec,
any_of(E.Scalars, [&](
Value *V) {
18803 return !isa<PoisonValue>(V) &&
18804 !isKnownNonNegative(
18805 V, SimplifyQuery(*R.DL));
18811 : BaseShuffleAnalysis(ScalarTy), Builder(Builder), R(R) {}
18815 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
18816 unsigned NumParts,
bool &UseVecBaseAsInput) {
18817 UseVecBaseAsInput =
false;
18819 Value *VecBase =
nullptr;
18821 if (!E->ReorderIndices.empty()) {
18823 E->ReorderIndices.end());
18826 for (
int I = 0, Sz = Mask.size();
I < Sz; ++
I) {
18831 VecBase = EI->getVectorOperand();
18833 VecBase = TEs.front()->VectorizedValue;
18834 assert(VecBase &&
"Expected vectorized value.");
18835 UniqueBases.
insert(VecBase);
18838 if (!EI->hasOneUse() || R.ExternalUsesAsOriginalScalar.contains(EI) ||
18839 (E->UserTreeIndex && E->UserTreeIndex.EdgeIdx == UINT_MAX &&
18840 !R.isVectorized(EI) &&
18842 count_if(E->UserTreeIndex.UserTE->Scalars,
18843 [&](
Value *V) { return V == EI; })) ||
18844 (NumParts != 1 &&
count(VL, EI) > 1) ||
18846 ArrayRef<TreeEntry *> UTEs = R.getTreeEntries(U);
18847 return UTEs.empty() || UTEs.size() > 1 ||
18849 [&](const TreeEntry *TE) {
18850 return R.DeletedNodes.contains(TE) ||
18851 R.TransformedToGatherNodes.contains(TE);
18857 [&](
const std::unique_ptr<TreeEntry> &TE) {
18858 return TE->UserTreeIndex.UserTE ==
18860 is_contained(VL, EI);
18864 R.eraseInstruction(EI);
18866 if (NumParts == 1 || UniqueBases.
size() == 1) {
18867 assert(VecBase &&
"Expected vectorized value.");
18868 return castToScalarTyElem(VecBase);
18870 UseVecBaseAsInput =
true;
18880 Value *Vec =
nullptr;
18887 constexpr int MaxBases = 2;
18889 auto VLMask =
zip(SubVL, SubMask);
18890 const unsigned VF = std::accumulate(
18891 VLMask.begin(), VLMask.end(), 0U, [&](
unsigned S,
const auto &
D) {
18892 if (std::get<1>(D) == PoisonMaskElem)
18895 cast<ExtractElementInst>(std::get<0>(D))->getVectorOperand();
18896 if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecOp);
18898 VecOp = TEs.front()->VectorizedValue;
18899 assert(VecOp &&
"Expected vectorized value.");
18900 const unsigned Size =
18901 cast<FixedVectorType>(VecOp->getType())->getNumElements();
18902 return std::max(S, Size);
18904 for (
const auto [V,
I] : VLMask) {
18909 VecOp = TEs.front()->VectorizedValue;
18910 assert(VecOp &&
"Expected vectorized value.");
18911 VecOp = castToScalarTyElem(VecOp);
18912 Bases[
I / VF] = VecOp;
18914 if (!Bases.front())
18917 if (Bases.back()) {
18918 SubVec = createShuffle(Bases.front(), Bases.back(), SubMask);
18919 TransformToIdentity(SubMask);
18921 SubVec = Bases.front();
18927 ArrayRef<int> SubMask =
18928 Mask.slice(
P * SliceSize,
18931 return all_of(SubMask, [](
int Idx) {
18935 "Expected first part or all previous parts masked.");
18936 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
18941 unsigned SubVecVF =
18943 NewVF = std::max(NewVF, SubVecVF);
18946 for (
int &Idx : SubMask)
18949 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
18950 Vec = createShuffle(Vec, SubVec, VecMask);
18951 TransformToIdentity(VecMask);
18959 std::optional<Value *>
18965 TEs, [](
const TreeEntry *TE) {
return TE->VectorizedValue; });
18967 return std::nullopt;
18970 auto *ResVecTy =
getWidenedType(ScalarTy, E->getVectorFactor());
18971 return Builder.CreateAlignedLoad(
18978 IsFinalized =
false;
18979 CommonMask.clear();
18985 Value *V1 = getVectorizedValue(E1);
18986 Value *V2 = getVectorizedValue(E2);
18992 Value *V1 = getVectorizedValue(E1);
18997 assert(V1 && V2 && !Mask.empty() &&
"Expected non-empty input vectors.");
19000 "castToScalarTyElem expects V1 and V2 to be FixedVectorType");
19001 V1 = castToScalarTyElem(V1);
19002 V2 = castToScalarTyElem(V2);
19003 if (InVectors.empty()) {
19004 InVectors.push_back(V1);
19005 InVectors.push_back(V2);
19006 CommonMask.assign(Mask.begin(), Mask.end());
19009 Value *Vec = InVectors.front();
19010 if (InVectors.size() == 2) {
19011 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
19012 transformMaskAfterShuffle(CommonMask, CommonMask);
19015 Vec = createShuffle(Vec,
nullptr, CommonMask);
19016 transformMaskAfterShuffle(CommonMask, CommonMask);
19018 V1 = createShuffle(V1, V2, Mask);
19019 unsigned VF = std::max(getVF(V1), getVF(Vec));
19020 for (
unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
19022 CommonMask[Idx] = Idx + VF;
19023 InVectors.front() = Vec;
19024 if (InVectors.size() == 2)
19025 InVectors.back() = V1;
19027 InVectors.push_back(V1);
19032 "castToScalarTyElem expects V1 to be FixedVectorType");
19033 V1 = castToScalarTyElem(V1);
19034 if (InVectors.empty()) {
19035 InVectors.push_back(V1);
19036 CommonMask.assign(Mask.begin(), Mask.end());
19039 const auto *It =
find(InVectors, V1);
19040 if (It == InVectors.end()) {
19041 if (InVectors.size() == 2 ||
19042 InVectors.front()->getType() != V1->
getType()) {
19043 Value *V = InVectors.front();
19044 if (InVectors.size() == 2) {
19045 V = createShuffle(InVectors.front(), InVectors.back(), CommonMask);
19046 transformMaskAfterShuffle(CommonMask, CommonMask);
19048 CommonMask.size()) {
19049 V = createShuffle(InVectors.front(),
nullptr, CommonMask);
19050 transformMaskAfterShuffle(CommonMask, CommonMask);
19052 unsigned VF = std::max(CommonMask.size(), Mask.size());
19053 for (
unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
19055 CommonMask[Idx] = V->getType() != V1->
getType()
19057 : Mask[Idx] + getVF(V1);
19058 if (V->getType() != V1->
getType())
19059 V1 = createShuffle(V1,
nullptr, Mask);
19060 InVectors.front() = V;
19061 if (InVectors.size() == 2)
19062 InVectors.back() = V1;
19064 InVectors.push_back(V1);
19069 for (
unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
19071 InVectors.push_back(V1);
19076 for (
Value *V : InVectors)
19077 VF = std::max(VF, getVF(V));
19078 for (
unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
19080 CommonMask[Idx] = Mask[Idx] + (It == InVectors.begin() ? 0 : VF);
19089 Value *Root =
nullptr) {
19090 return R.gather(VL, Root, ScalarTy,
19092 return createShuffle(V1, V2, Mask);
19101 ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
19106 IsFinalized =
true;
19109 if (InVectors.
size() == 2) {
19110 Vec = createShuffle(Vec, InVectors.
back(), CommonMask);
19113 Vec = createShuffle(Vec,
nullptr, CommonMask);
19115 transformMaskAfterShuffle(CommonMask, CommonMask);
19117 "Expected vector length for the final value before action.");
19121 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
19122 Vec = createShuffle(Vec,
nullptr, ResizeMask);
19124 Action(Vec, CommonMask, [
this](
Value *V1,
Value *V2, ArrayRef<int> Mask) {
19125 return createShuffle(V1, V2, Mask);
19127 InVectors.
front() = Vec;
19129 if (!SubVectors.empty()) {
19131 if (InVectors.
size() == 2) {
19132 Vec = createShuffle(Vec, InVectors.
back(), CommonMask);
19135 Vec = createShuffle(Vec,
nullptr, CommonMask);
19137 transformMaskAfterShuffle(CommonMask, CommonMask);
19138 auto CreateSubVectors = [&](
Value *Vec,
19139 SmallVectorImpl<int> &CommonMask) {
19140 for (
auto [
E, Idx] : SubVectors) {
19141 Value *
V = getVectorizedValue(*
E);
19148 Type *OrigScalarTy = ScalarTy;
19151 Builder, Vec, V, InsertionIndex,
19152 std::bind(&ShuffleInstructionBuilder::createShuffle,
this, _1, _2,
19154 ScalarTy = OrigScalarTy;
19155 if (!CommonMask.
empty()) {
19156 std::iota(std::next(CommonMask.
begin(), Idx),
19157 std::next(CommonMask.
begin(), Idx +
E->getVectorFactor()),
19163 if (SubVectorsMask.
empty()) {
19164 Vec = CreateSubVectors(Vec, CommonMask);
19167 copy(SubVectorsMask, SVMask.begin());
19168 for (
auto [I1, I2] :
zip(SVMask, CommonMask)) {
19171 I1 = I2 + CommonMask.
size();
19176 Vec = createShuffle(InsertVec, Vec, SVMask);
19177 transformMaskAfterShuffle(CommonMask, SVMask);
19179 InVectors.
front() = Vec;
19182 if (!ExtMask.
empty()) {
19183 if (CommonMask.
empty()) {
19187 for (
int I = 0, Sz = ExtMask.
size();
I < Sz; ++
I) {
19190 NewMask[
I] = CommonMask[ExtMask[
I]];
19192 CommonMask.
swap(NewMask);
19195 if (CommonMask.
empty()) {
19196 assert(InVectors.
size() == 1 &&
"Expected only one vector with no mask");
19197 return InVectors.
front();
19199 if (InVectors.
size() == 2)
19200 return createShuffle(InVectors.
front(), InVectors.
back(), CommonMask);
19201 return createShuffle(InVectors.
front(),
nullptr, CommonMask);
19205 assert((IsFinalized || CommonMask.empty()) &&
19206 "Shuffle construction must be finalized.");
19210Value *BoUpSLP::vectorizeOperand(TreeEntry *E,
unsigned NodeIdx) {
19214template <
typename BVTy,
typename ResTy,
typename... Args>
19215ResTy BoUpSLP::processBuildVector(
const TreeEntry *E,
Type *ScalarTy,
19217 assert((E->isGather() || TransformedToGatherNodes.contains(E)) &&
19218 "Expected gather node.");
19219 unsigned VF = E->getVectorFactor();
19221 bool NeedFreeze =
false;
19224 for (
auto [EIdx, Idx] : E->CombinedEntriesWithIndices)
19226 .slice(Idx, VectorizableTree[EIdx]->getVectorFactor()),
19229 E->CombinedEntriesWithIndices.size());
19230 transform(E->CombinedEntriesWithIndices, SubVectors.begin(),
19231 [&](
const auto &
P) {
19232 return std::make_pair(VectorizableTree[P.first].get(), P.second);
19237 E->ReorderIndices.end());
19238 if (!ReorderMask.
empty())
19244 if (!SubVectors.empty() && !SubVectorsMask.
empty()) {
19246 if (E->Scalars[
I] == GatheredScalars[ReorderMask[
I]])
19249 SubVectorsMask.
clear();
19253 unsigned I,
unsigned SliceSize,
19254 bool IsNotPoisonous) {
19256 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
19259 TreeEntry *UserTE = E->UserTreeIndex.UserTE;
19260 unsigned EdgeIdx = E->UserTreeIndex.EdgeIdx;
19261 if (UserTE->getNumOperands() != 2)
19263 if (!IsNotPoisonous) {
19264 auto *It =
find_if(
ArrayRef(VectorizableTree).drop_front(UserTE->Idx + 1),
19265 [=](
const std::unique_ptr<TreeEntry> &TE) {
19266 return TE->UserTreeIndex.UserTE == UserTE &&
19267 TE->UserTreeIndex.EdgeIdx != EdgeIdx;
19269 if (It == VectorizableTree.end())
19272 if (!(*It)->ReorderIndices.empty()) {
19276 if (!
all_of(
zip(GatheredScalars, GS), [&](
const auto &
P) {
19277 Value *V0 = std::get<0>(
P);
19278 Value *V1 = std::get<1>(
P);
19286 if ((Mask.size() < InputVF &&
19289 (Mask.size() == InputVF &&
19292 std::next(Mask.begin(),
I * SliceSize),
19293 std::next(Mask.begin(),
19300 std::next(Mask.begin(),
I * SliceSize),
19301 std::next(Mask.begin(),
19307 BVTy ShuffleBuilder(ScalarTy, Params...);
19308 ResTy Res = ResTy();
19312 Value *ExtractVecBase =
nullptr;
19313 bool UseVecBaseAsInput =
false;
19316 Type *OrigScalarTy = GatheredScalars.
front()->getType();
19321 bool Resized =
false;
19323 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
19324 if (!ExtractShuffles.
empty()) {
19326 for (
auto [Idx,
I] :
enumerate(ExtractMask)) {
19332 ExtractEntries.
append(TEs.begin(), TEs.end());
19334 if (std::optional<ResTy> Delayed =
19335 ShuffleBuilder.needToDelay(E, ExtractEntries)) {
19337 PostponedGathers.insert(E);
19342 if (
Value *VecBase = ShuffleBuilder.adjustExtracts(
19343 E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) {
19344 ExtractVecBase = VecBase;
19346 if (VF == VecBaseTy->getNumElements() &&
19347 GatheredScalars.
size() != VF) {
19349 GatheredScalars.
append(VF - GatheredScalars.
size(),
19357 if (!ExtractShuffles.
empty() || !E->hasState() ||
19358 E->getOpcode() != Instruction::Load ||
19359 (((E->hasState() && E->getOpcode() == Instruction::Load) ||
19363 return isa<LoadInst>(V) && isVectorized(V);
19365 (E->hasState() && E->isAltShuffle()) ||
19366 all_of(E->Scalars, [
this](
Value *V) { return isVectorized(V); }) ||
19368 (E->Scalars != GatheredScalars && GatheredScalars.
size() <= 2)) {
19370 isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts);
19372 if (!GatherShuffles.
empty()) {
19373 if (std::optional<ResTy> Delayed =
19374 ShuffleBuilder.needToDelay(E, Entries)) {
19376 PostponedGathers.insert(E);
19381 if (GatherShuffles.
size() == 1 &&
19383 Entries.
front().front()->isSame(E->Scalars)) {
19386 LLVM_DEBUG(
dbgs() <<
"SLP: perfect diamond match for gather bundle "
19389 Mask.resize(E->Scalars.size());
19390 const TreeEntry *FrontTE = Entries.
front().front();
19391 if (FrontTE->ReorderIndices.empty() &&
19392 ((FrontTE->ReuseShuffleIndices.empty() &&
19393 E->Scalars.size() == FrontTE->Scalars.size()) ||
19394 (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) {
19395 std::iota(Mask.begin(), Mask.end(), 0);
19402 Mask[
I] = FrontTE->findLaneForValue(V);
19407 ShuffleBuilder.resetForSameNode();
19408 ShuffleBuilder.add(*FrontTE, Mask);
19410 Res = ShuffleBuilder.finalize(E->getCommonMask(), {}, {});
19414 if (GatheredScalars.
size() != VF &&
19416 return any_of(TEs, [&](
const TreeEntry *TE) {
19417 return TE->getVectorFactor() == VF;
19420 GatheredScalars.
append(VF - GatheredScalars.
size(),
19424 for (
int I = 0, Sz = Mask.size();
I < Sz; ++
I) {
19432 bool IsRootPoison) {
19435 bool IsSplat = IsRootPoison &&
isSplat(Scalars) &&
19442 int NumNonConsts = 0;
19461 Scalars.
front() = OrigV;
19464 const auto Res = UniquePositions.
try_emplace(OrigV,
I);
19465 Scalars[Res.first->second] = OrigV;
19466 ReuseMask[
I] = Res.first->second;
19469 if (NumNonConsts == 1) {
19474 if (!UndefPos.
empty() && UndefPos.
front() == 0)
19477 ReuseMask[SinglePos] = SinglePos;
19478 }
else if (!UndefPos.
empty() && IsSplat) {
19485 (E->UserTreeIndex &&
any_of(V->uses(), [E](
const Use &U) {
19488 return E->UserTreeIndex.EdgeIdx != U.getOperandNo() &&
19489 is_contained(E->UserTreeIndex.UserTE->Scalars,
19493 if (It != Scalars.
end()) {
19495 int Pos = std::distance(Scalars.
begin(), It);
19496 for (
int I : UndefPos) {
19498 ReuseMask[
I] = Pos;
19507 for (
int I : UndefPos) {
19516 if (!ExtractShuffles.
empty() || !GatherShuffles.
empty()) {
19517 bool IsNonPoisoned =
true;
19518 bool IsUsedInExpr =
true;
19519 Value *Vec1 =
nullptr;
19520 if (!ExtractShuffles.
empty()) {
19524 Value *Vec2 =
nullptr;
19525 for (
unsigned I = 0, Sz = ExtractMask.
size();
I < Sz; ++
I) {
19529 if (UseVecBaseAsInput) {
19530 Vec1 = ExtractVecBase;
19532 for (
unsigned I = 0, Sz = ExtractMask.
size();
I < Sz; ++
I) {
19538 Value *VecOp = EI->getVectorOperand();
19540 !TEs.
empty() && TEs.front()->VectorizedValue)
19541 VecOp = TEs.front()->VectorizedValue;
19544 }
else if (Vec1 != VecOp) {
19545 assert((!Vec2 || Vec2 == VecOp) &&
19546 "Expected only 1 or 2 vectors shuffle.");
19552 IsUsedInExpr =
false;
19555 ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
19558 IsUsedInExpr &= FindReusedSplat(
19561 ExtractMask.
size(), IsNotPoisonedVec);
19562 ShuffleBuilder.add(Vec1, ExtractMask,
true);
19563 IsNonPoisoned &= IsNotPoisonedVec;
19565 IsUsedInExpr =
false;
19570 if (!GatherShuffles.
empty()) {
19571 unsigned SliceSize =
19575 for (
const auto [
I, TEs] :
enumerate(Entries)) {
19578 "No shuffles with empty entries list expected.");
19581 assert((TEs.size() == 1 || TEs.size() == 2) &&
19582 "Expected shuffle of 1 or 2 entries.");
19583 unsigned Limit =
getNumElems(Mask.size(), SliceSize,
I);
19586 copy(SubMask, std::next(VecMask.
begin(),
I * SliceSize));
19587 if (TEs.size() == 1) {
19588 bool IsNotPoisonedVec =
19589 TEs.front()->VectorizedValue
19593 FindReusedSplat(VecMask, TEs.
front()->getVectorFactor(),
I,
19594 SliceSize, IsNotPoisonedVec);
19595 ShuffleBuilder.add(*TEs.front(), VecMask);
19596 IsNonPoisoned &= IsNotPoisonedVec;
19598 IsUsedInExpr =
false;
19599 ShuffleBuilder.add(*TEs.front(), *TEs.back(), VecMask);
19600 if (TEs.front()->VectorizedValue && TEs.back()->VectorizedValue)
19611 int EMSz = ExtractMask.
size();
19612 int MSz = Mask.size();
19615 bool IsSingleShuffle = ExtractShuffles.
empty() || GatherShuffles.
empty();
19616 bool IsIdentityShuffle =
19617 ((UseVecBaseAsInput ||
19619 [](
const std::optional<TTI::ShuffleKind> &SK) {
19623 none_of(ExtractMask, [&](
int I) {
return I >= EMSz; }) &&
19625 (!GatherShuffles.
empty() &&
19627 [](
const std::optional<TTI::ShuffleKind> &SK) {
19631 none_of(Mask, [&](
int I) {
return I >= MSz; }) &&
19633 bool EnoughConstsForShuffle =
19643 (!IsIdentityShuffle ||
19644 (GatheredScalars.
size() == 2 &&
19652 for (
int I = 0, Sz = GatheredScalars.
size();
I < Sz; ++
I) {
19653 if (EnoughConstsForShuffle &&
isa<Constant>(GatheredScalars[
I]))
19661 TryPackScalars(GatheredScalars, BVMask,
true);
19662 Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.
size());
19663 ShuffleBuilder.add(BV, BVMask);
19667 (IsSingleShuffle && ((IsIdentityShuffle &&
19670 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
19673 Res = ShuffleBuilder.finalize(
19674 E->ReuseShuffleIndices, SubVectors, SubVectorsMask, E->Scalars.
size(),
19676 bool IsSplat = isSplat(NonConstants);
19677 SmallVector<int> BVMask(Mask.size(), PoisonMaskElem);
19678 TryPackScalars(NonConstants, BVMask, false);
19679 auto CheckIfSplatIsProfitable = [&]() {
19682 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
19683 Value *V = *find_if_not(NonConstants, IsaPred<UndefValue>);
19684 if (isa<ExtractElementInst>(V) || isVectorized(V))
19686 InstructionCost SplatCost = TTI->getVectorInstrCost(
19687 Instruction::InsertElement, VecTy, CostKind, 0,
19688 PoisonValue::get(VecTy), V);
19689 SmallVector<int> NewMask(Mask.begin(), Mask.end());
19690 for (auto [Idx, I] : enumerate(BVMask))
19691 if (I != PoisonMaskElem)
19692 NewMask[Idx] = Mask.size();
19693 SplatCost += ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy,
19694 NewMask, CostKind);
19695 InstructionCost BVCost = TTI->getVectorInstrCost(
19696 Instruction::InsertElement, VecTy, CostKind,
19697 *find_if(Mask, not_equal_to(PoisonMaskElem)), Vec, V);
19699 if (count(BVMask, PoisonMaskElem) <
19700 static_cast<int>(BVMask.size() - 1)) {
19701 SmallVector<int> NewMask(Mask.begin(), Mask.end());
19702 for (auto [Idx, I] : enumerate(BVMask))
19703 if (I != PoisonMaskElem)
19705 BVCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
19706 VecTy, NewMask, CostKind);
19708 return SplatCost <= BVCost;
19710 if (!IsSplat || Mask.size() <= 2 || !CheckIfSplatIsProfitable()) {
19714 Vec = ShuffleBuilder.gather(NonConstants, Mask.size(), Vec);
19720 Value *BV = ShuffleBuilder.gather(Values, BVMask.
size());
19723 return I == PoisonMaskElem ? PoisonMaskElem : 0;
19726 BV = CreateShuffle(BV,
nullptr, SplatMask);
19729 Mask[Idx] = BVMask.size() + Idx;
19730 Vec = CreateShuffle(Vec, BV, Mask);
19739 TryPackScalars(GatheredScalars, ReuseMask,
true);
19740 Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.size());
19741 ShuffleBuilder.add(BV, ReuseMask);
19742 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
19747 for (
auto [
I, V] :
enumerate(GatheredScalars)) {
19751 Value *BV = ShuffleBuilder.gather(GatheredScalars);
19752 ShuffleBuilder.add(BV, Mask);
19753 Res = ShuffleBuilder.finalize(
E->ReuseShuffleIndices, SubVectors,
19758 Res = ShuffleBuilder.createFreeze(Res);
19762Value *BoUpSLP::createBuildVector(
const TreeEntry *
E,
Type *ScalarTy) {
19763 for (
auto [EIdx,
_] :
E->CombinedEntriesWithIndices)
19765 return processBuildVector<ShuffleInstructionBuilder, Value *>(
E, ScalarTy,
19773 for (
Value *V : VL)
19786 IRBuilderBase::InsertPointGuard Guard(Builder);
19788 Value *
V =
E->Scalars.front();
19789 Type *ScalarTy =
V->getType();
19792 auto It = MinBWs.find(
E);
19793 if (It != MinBWs.end()) {
19799 if (
E->VectorizedValue)
19800 return E->VectorizedValue;
19802 if (
E->isGather() || TransformedToGatherNodes.contains(
E)) {
19804 if (
E->hasState() &&
E->Idx == 0 && !UserIgnoreList)
19805 setInsertPointAfterBundle(
E);
19806 Value *Vec = createBuildVector(
E, ScalarTy);
19807 E->VectorizedValue = Vec;
19810 if (
E->State == TreeEntry::SplitVectorize) {
19811 assert(
E->CombinedEntriesWithIndices.size() == 2 &&
19812 "Expected exactly 2 combined entries.");
19813 setInsertPointAfterBundle(
E);
19815 *VectorizableTree[
E->CombinedEntriesWithIndices.front().first];
19817 ArrayRef(
E->Scalars).take_front(OpTE1.getVectorFactor())) &&
19818 "Expected same first part of scalars.");
19821 *VectorizableTree[
E->CombinedEntriesWithIndices.back().first];
19823 OpTE2.isSame(
ArrayRef(
E->Scalars).take_back(OpTE2.getVectorFactor())) &&
19824 "Expected same second part of scalars.");
19826 auto GetOperandSignedness = [&](
const TreeEntry *OpE) {
19827 bool IsSigned =
false;
19828 auto It = MinBWs.find(OpE);
19829 if (It != MinBWs.end())
19830 IsSigned = It->second.second;
19833 if (isa<PoisonValue>(V))
19835 return !isKnownNonNegative(R, SimplifyQuery(*DL));
19842 Op1 = Builder.CreateIntCast(
19847 GetOperandSignedness(&OpTE1));
19852 Op2 = Builder.CreateIntCast(
19857 GetOperandSignedness(&OpTE2));
19859 if (
E->ReorderIndices.empty()) {
19863 std::next(
Mask.begin(),
E->CombinedEntriesWithIndices.back().second),
19866 if (ScalarTyNumElements != 1) {
19870 Value *Vec = Builder.CreateShuffleVector(Op1, Mask);
19872 E->CombinedEntriesWithIndices.back().second *
19873 ScalarTyNumElements);
19874 E->VectorizedValue = Vec;
19877 unsigned CommonVF =
19878 std::max(OpTE1.getVectorFactor(), OpTE2.getVectorFactor());
19881 std::iota(
Mask.begin(), std::next(
Mask.begin(), OpTE1.getVectorFactor()),
19883 Op1 = Builder.CreateShuffleVector(Op1, Mask);
19887 std::iota(
Mask.begin(), std::next(
Mask.begin(), OpTE2.getVectorFactor()),
19889 Op2 = Builder.CreateShuffleVector(Op2, Mask);
19891 Value *Vec = Builder.CreateShuffleVector(Op1, Op2,
E->getSplitMask());
19892 E->VectorizedValue = Vec;
19896 bool IsReverseOrder =
19898 auto FinalShuffle = [&](
Value *
V,
const TreeEntry *
E) {
19900 if (
E->getOpcode() == Instruction::Store &&
19901 E->State == TreeEntry::Vectorize) {
19902 ArrayRef<int>
Mask =
19903 ArrayRef(
reinterpret_cast<const int *
>(
E->ReorderIndices.begin()),
19904 E->ReorderIndices.size());
19905 ShuffleBuilder.add(V, Mask);
19906 }
else if ((
E->State == TreeEntry::StridedVectorize && IsReverseOrder) ||
19907 E->State == TreeEntry::CompressVectorize) {
19908 ShuffleBuilder.addOrdered(V, {});
19910 ShuffleBuilder.addOrdered(V,
E->ReorderIndices);
19913 E->CombinedEntriesWithIndices.size());
19915 E->CombinedEntriesWithIndices, SubVectors.begin(), [&](
const auto &
P) {
19916 return std::make_pair(VectorizableTree[P.first].get(), P.second);
19919 (
E->CombinedEntriesWithIndices.empty() ||
E->ReorderIndices.empty()) &&
19920 "Expected either combined subnodes or reordering");
19921 return ShuffleBuilder.finalize(
E->ReuseShuffleIndices, SubVectors, {});
19924 assert(!
E->isGather() &&
"Unhandled state");
19925 unsigned ShuffleOrOp =
19926 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector :
E->
getOpcode();
19928 auto GetOperandSignedness = [&](
unsigned Idx) {
19929 const TreeEntry *OpE = getOperandEntry(
E, Idx);
19930 bool IsSigned =
false;
19931 auto It = MinBWs.find(OpE);
19932 if (It != MinBWs.end())
19933 IsSigned = It->second.second;
19936 if (isa<PoisonValue>(V))
19938 return !isKnownNonNegative(R, SimplifyQuery(*DL));
19942 switch (ShuffleOrOp) {
19943 case Instruction::PHI: {
19944 assert((
E->ReorderIndices.empty() || !
E->ReuseShuffleIndices.empty() ||
19945 E != VectorizableTree.front().get() ||
E->UserTreeIndex) &&
19946 "PHI reordering is free.");
19948 Builder.SetInsertPoint(PH->getParent(),
19949 PH->getParent()->getFirstNonPHIIt());
19951 PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
19955 Builder.SetInsertPoint(PH->getParent(),
19956 PH->getParent()->getFirstInsertionPt());
19959 V = FinalShuffle(V,
E);
19961 E->VectorizedValue =
V;
19968 SmallPtrSet<BasicBlock *, 4> VisitedBBs;
19975 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
19979 if (!VisitedBBs.
insert(IBB).second) {
19982 TreeEntry *OpTE = getOperandEntry(
E,
I);
19983 assert(!OpTE->VectorizedValue &&
"Expected no vectorized value.");
19984 OpTE->VectorizedValue = VecOp;
19990 Value *Vec = vectorizeOperand(
E,
I);
19991 if (VecTy != Vec->
getType()) {
19993 MinBWs.contains(getOperandEntry(
E,
I))) &&
19994 "Expected item in MinBWs.");
19995 Vec = Builder.CreateIntCast(Vec, VecTy, GetOperandSignedness(
I));
20001 "Invalid number of incoming values");
20002 assert(
E->VectorizedValue &&
"Expected vectorized value.");
20003 return E->VectorizedValue;
20006 case Instruction::ExtractElement: {
20007 Value *
V =
E->getSingleOperand(0);
20008 setInsertPointAfterBundle(
E);
20009 V = FinalShuffle(V,
E);
20010 E->VectorizedValue =
V;
20013 case Instruction::ExtractValue: {
20015 Builder.SetInsertPoint(LI);
20016 Value *Ptr = LI->getPointerOperand();
20017 LoadInst *
V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign());
20019 NewV = FinalShuffle(NewV,
E);
20020 E->VectorizedValue = NewV;
20023 case Instruction::InsertElement: {
20024 assert(
E->ReuseShuffleIndices.empty() &&
"All inserts should be unique");
20025 if (
const TreeEntry *OpE = getOperandEntry(
E, 1);
20026 OpE && !OpE->isGather() && OpE->hasState() &&
20027 !OpE->hasCopyableElements())
20030 setInsertPointAfterBundle(
E);
20031 Value *
V = vectorizeOperand(
E, 1);
20033 Type *ScalarTy =
Op.front()->getType();
20036 std::pair<unsigned, bool> Res = MinBWs.lookup(getOperandEntry(
E, 1));
20037 assert(Res.first > 0 &&
"Expected item in MinBWs.");
20038 V = Builder.CreateIntCast(
20048 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
20050 const unsigned NumElts =
20052 const unsigned NumScalars =
E->Scalars.size();
20055 assert(
Offset < NumElts &&
"Failed to find vector index offset");
20058 SmallVector<int>
Mask;
20059 if (!
E->ReorderIndices.empty()) {
20064 std::iota(
Mask.begin(), std::next(
Mask.begin(), NumScalars), 0);
20067 bool IsIdentity =
true;
20069 Mask.swap(PrevMask);
20070 for (
unsigned I = 0;
I < NumScalars; ++
I) {
20073 IsIdentity &= InsertIdx -
Offset ==
I;
20076 if (!IsIdentity || NumElts != NumScalars) {
20077 Value *V2 =
nullptr;
20078 bool IsVNonPoisonous =
20080 SmallVector<int> InsertMask(Mask);
20081 if (NumElts != NumScalars &&
Offset == 0) {
20090 InsertMask[*InsertIdx] = *InsertIdx;
20096 SmallBitVector UseMask =
20097 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
20098 SmallBitVector IsFirstPoison =
20100 SmallBitVector IsFirstUndef =
20102 if (!IsFirstPoison.
all()) {
20104 for (
unsigned I = 0;
I < NumElts;
I++) {
20106 IsFirstUndef.
test(
I)) {
20107 if (IsVNonPoisonous) {
20108 InsertMask[
I] =
I < NumScalars ?
I : 0;
20113 if (Idx >= NumScalars)
20114 Idx = NumScalars - 1;
20115 InsertMask[
I] = NumScalars + Idx;
20128 V = Builder.CreateShuffleVector(V, V2, InsertMask);
20130 GatherShuffleExtractSeq.insert(
I);
20131 CSEBlocks.insert(
I->getParent());
20136 for (
unsigned I = 0;
I < NumElts;
I++) {
20140 SmallBitVector UseMask =
20141 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
20142 SmallBitVector IsFirstUndef =
20144 if ((!IsIdentity ||
Offset != 0 || !IsFirstUndef.
all()) &&
20145 NumElts != NumScalars) {
20146 if (IsFirstUndef.
all()) {
20148 SmallBitVector IsFirstPoison =
20150 if (!IsFirstPoison.
all()) {
20151 for (
unsigned I = 0;
I < NumElts;
I++) {
20153 InsertMask[
I] =
I + NumElts;
20156 V = Builder.CreateShuffleVector(
20162 GatherShuffleExtractSeq.insert(
I);
20163 CSEBlocks.insert(
I->getParent());
20167 SmallBitVector IsFirstPoison =
20169 for (
unsigned I = 0;
I < NumElts;
I++) {
20173 InsertMask[
I] += NumElts;
20175 V = Builder.CreateShuffleVector(
20176 FirstInsert->getOperand(0), V, InsertMask,
20179 GatherShuffleExtractSeq.insert(
I);
20180 CSEBlocks.insert(
I->getParent());
20185 ++NumVectorInstructions;
20186 E->VectorizedValue =
V;
20189 case Instruction::ZExt:
20190 case Instruction::SExt:
20191 case Instruction::FPToUI:
20192 case Instruction::FPToSI:
20193 case Instruction::FPExt:
20194 case Instruction::PtrToInt:
20195 case Instruction::IntToPtr:
20196 case Instruction::SIToFP:
20197 case Instruction::UIToFP:
20198 case Instruction::Trunc:
20199 case Instruction::FPTrunc:
20200 case Instruction::BitCast: {
20201 setInsertPointAfterBundle(
E);
20203 Value *InVec = vectorizeOperand(
E, 0);
20208 auto SrcIt = MinBWs.find(getOperandEntry(
E, 0));
20210 (SrcIt != MinBWs.end() || It != MinBWs.end() ||
20213 unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy);
20214 if (SrcIt != MinBWs.end())
20215 SrcBWSz = SrcIt->second.first;
20216 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->
getScalarType());
20217 if (BWSz == SrcBWSz) {
20218 VecOpcode = Instruction::BitCast;
20219 }
else if (BWSz < SrcBWSz) {
20220 VecOpcode = Instruction::Trunc;
20221 }
else if (It != MinBWs.end()) {
20222 assert(BWSz > SrcBWSz &&
"Invalid cast!");
20223 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
20224 }
else if (SrcIt != MinBWs.end()) {
20225 assert(BWSz > SrcBWSz &&
"Invalid cast!");
20227 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
20229 }
else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
20230 !SrcIt->second.second) {
20231 VecOpcode = Instruction::UIToFP;
20233 Value *
V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
20235 : Builder.CreateCast(VecOpcode, InVec, VecTy);
20236 V = FinalShuffle(V,
E);
20238 E->VectorizedValue =
V;
20239 ++NumVectorInstructions;
20242 case Instruction::FCmp:
20243 case Instruction::ICmp: {
20244 setInsertPointAfterBundle(
E);
20246 Value *
L = vectorizeOperand(
E, 0);
20247 Value *
R = vectorizeOperand(
E, 1);
20248 if (
L->getType() !=
R->getType()) {
20251 MinBWs.contains(getOperandEntry(
E, 0)) ||
20252 MinBWs.contains(getOperandEntry(
E, 1))) &&
20253 "Expected item in MinBWs.");
20258 ->getIntegerBitWidth()) {
20259 Type *CastTy =
R->getType();
20260 L = Builder.CreateIntCast(L, CastTy, GetOperandSignedness(0));
20262 Type *CastTy =
L->getType();
20263 R = Builder.CreateIntCast(R, CastTy, GetOperandSignedness(1));
20268 Value *
V = Builder.CreateCmp(P0, L, R);
20271 ICmp->setSameSign(
false);
20274 V = FinalShuffle(V,
E);
20276 E->VectorizedValue =
V;
20277 ++NumVectorInstructions;
20280 case Instruction::Select: {
20281 setInsertPointAfterBundle(
E);
20284 Value *True = vectorizeOperand(
E, 1);
20285 Value *False = vectorizeOperand(
E, 2);
20289 MinBWs.contains(getOperandEntry(
E, 1)) ||
20290 MinBWs.contains(getOperandEntry(
E, 2))) &&
20291 "Expected item in MinBWs.");
20292 if (True->
getType() != VecTy)
20293 True = Builder.CreateIntCast(True, VecTy, GetOperandSignedness(1));
20294 if (False->
getType() != VecTy)
20295 False = Builder.CreateIntCast(False, VecTy, GetOperandSignedness(2));
20300 assert(TrueNumElements >= CondNumElements &&
20301 TrueNumElements % CondNumElements == 0 &&
20302 "Cannot vectorize Instruction::Select");
20304 "Cannot vectorize Instruction::Select");
20305 if (CondNumElements != TrueNumElements) {
20308 Cond = Builder.CreateShuffleVector(
20313 "Cannot vectorize Instruction::Select");
20315 Builder.CreateSelectWithUnknownProfile(
Cond, True, False,
DEBUG_TYPE);
20316 V = FinalShuffle(V,
E);
20318 E->VectorizedValue =
V;
20319 ++NumVectorInstructions;
20322 case Instruction::FNeg: {
20323 setInsertPointAfterBundle(
E);
20325 Value *
Op = vectorizeOperand(
E, 0);
20327 Value *
V = Builder.CreateUnOp(
20333 V = FinalShuffle(V,
E);
20335 E->VectorizedValue =
V;
20336 ++NumVectorInstructions;
20340 case Instruction::Freeze: {
20341 setInsertPointAfterBundle(
E);
20343 Value *
Op = vectorizeOperand(
E, 0);
20345 if (
Op->getType() != VecTy) {
20347 MinBWs.contains(getOperandEntry(
E, 0))) &&
20348 "Expected item in MinBWs.");
20349 Op = Builder.CreateIntCast(
Op, VecTy, GetOperandSignedness(0));
20351 Value *
V = Builder.CreateFreeze(
Op);
20352 V = FinalShuffle(V,
E);
20354 E->VectorizedValue =
V;
20355 ++NumVectorInstructions;
20359 case Instruction::Add:
20360 case Instruction::FAdd:
20361 case Instruction::Sub:
20362 case Instruction::FSub:
20363 case Instruction::Mul:
20364 case Instruction::FMul:
20365 case Instruction::UDiv:
20366 case Instruction::SDiv:
20367 case Instruction::FDiv:
20368 case Instruction::URem:
20369 case Instruction::SRem:
20370 case Instruction::FRem:
20371 case Instruction::Shl:
20372 case Instruction::LShr:
20373 case Instruction::AShr:
20374 case Instruction::And:
20375 case Instruction::Or:
20376 case Instruction::Xor: {
20377 setInsertPointAfterBundle(
E);
20381 if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
20386 return CI && CI->getValue().countr_one() >= It->second.first;
20388 V = FinalShuffle(
I == 0 ?
RHS :
LHS,
E);
20389 E->VectorizedValue =
V;
20390 ++NumVectorInstructions;
20398 MinBWs.contains(getOperandEntry(
E, 0)) ||
20399 MinBWs.contains(getOperandEntry(
E, 1))) &&
20400 "Expected item in MinBWs.");
20402 LHS = Builder.CreateIntCast(
LHS, VecTy, GetOperandSignedness(0));
20404 RHS = Builder.CreateIntCast(
RHS, VecTy, GetOperandSignedness(1));
20407 Value *
V = Builder.CreateBinOp(
20414 if (!MinBWs.contains(
E) && ShuffleOrOp == Instruction::Sub &&
20416 return isa<PoisonValue>(V) ||
20417 (E->hasCopyableElements() && E->isCopyableElement(V)) ||
20418 isCommutative(cast<Instruction>(V));
20420 I->setHasNoUnsignedWrap(
false);
20423 V = FinalShuffle(V,
E);
20425 E->VectorizedValue =
V;
20426 ++NumVectorInstructions;
    case Instruction::Load: {
      // ...
      setInsertPointAfterBundle(E);
      // ... (LI = cast<LoadInst>(VL0), NewLI declaration elided)
      FixedVectorType *StridedLoadTy = nullptr;
      Value *PO = LI->getPointerOperand();
      if (E->State == TreeEntry::Vectorize) {
        NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign());
      } else if (E->State == TreeEntry::CompressVectorize) {
        auto [CompressMask, LoadVecTy, InterleaveFactor, IsMasked] =
            CompressEntryToData.at(E);
        Align CommonAlignment = LI->getAlign();
        // ... (build the boolean mask from the compress mask)
        for (int I : CompressMask)
          /* ... */;
        // ...
        MaskValues = replicateMask(MaskValues, VecTy->getNumElements());
        // ...
        NewLI = Builder.CreateMaskedLoad(LoadVecTy, PO, CommonAlignment,
                                         /* ... */);
        // ...
        NewLI = Builder.CreateAlignedLoad(LoadVecTy, PO, CommonAlignment);
        // ...
      } else if (E->State == TreeEntry::StridedVectorize) {
        // ...
        PO = IsReverseOrder ? PtrN : Ptr0;
        Type *StrideTy = DL->getIndexType(PO->getType());
        // ...
        const StridedPtrInfo &SPtrInfo = TreeEntryToStridedPtrInfoMap.at(E);
        StridedLoadTy = SPtrInfo.Ty;
        assert(StridedLoadTy && "Missing StridedPointerInfo for tree entry.");
        unsigned StridedLoadEC =
            /* ... */;
        Value *Stride = SPtrInfo.StrideVal;
        if (!Stride) {
          const SCEV *StrideSCEV = SPtrInfo.StrideSCEV;
          assert(StrideSCEV && "Neither StrideVal nor StrideSCEV were set.");
          SCEVExpander Expander(*SE, /* ... */ "strided-load-vec");
          Stride = Expander.expandCodeFor(StrideSCEV, StrideSCEV->getType(),
                                          &*Builder.GetInsertPoint());
        }
        // ...
        Stride = Builder.CreateIntCast(Stride, StrideTy, /*isSigned=*/true);
        StrideVal = Builder.CreateMul(
            Stride, ConstantInt::get(
                        StrideTy, (IsReverseOrder ? -1 : 1) *
                                      static_cast<int>(
                                          DL->getTypeAllocSize(ScalarTy))));
        // ...
        auto *Inst = Builder.CreateIntrinsic(
            Intrinsic::experimental_vp_strided_load,
            {StridedLoadTy, PO->getType(), StrideTy},
            {PO, StrideVal, /* ... mask ... */
             Builder.getInt32(StridedLoadEC)});
        Inst->addParamAttr(
            /*ArgNo=*/0,
            Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
        // ...
      } else {
        assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
        Value *VecPtr = vectorizeOperand(E, 0);
        // ...
        unsigned ScalarTyNumElements =
            /* ... */;
        unsigned VecTyNumElements =
            /* ... */;
        assert(VecTyNumElements % ScalarTyNumElements == 0 &&
               "Cannot expand getelementptr.");
        unsigned VF = VecTyNumElements / ScalarTyNumElements;
        // ... (per-element index lambda elided; its body is:)
        //   return Builder.getInt64(I % ScalarTyNumElements);
        VecPtr = Builder.CreateGEP(
            VecTy->getElementType(),
            Builder.CreateShuffleVector(/* ... */),
            /* ... */);
        // ...
        NewLI = Builder.CreateMaskedGather(VecTy, VecPtr, CommonAlignment);
      }
      // ...
      Value *V = E->State == TreeEntry::CompressVectorize
                     ? /* ... */
                     : /* ... */;
      if (StridedLoadTy != VecTy)
        V = Builder.CreateBitOrPointerCast(V, VecTy);
      V = FinalShuffle(V, E);
      E->VectorizedValue = V;
      ++NumVectorInstructions;
      return V;
    }
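    // Stores are emitted either as one consecutive vector store or, for
    // reversed/strided access, through llvm.experimental.vp.strided.store
    // with a negative byte stride. Roughly (illustrative IR, assuming 4 x
    // float stored in reverse order):
    //   call void @llvm.experimental.vp.strided.store.v4f32.p0.i64(
    //       <4 x float> %val, ptr %base, i64 -4,
    //       <4 x i1> splat (i1 true), i32 4)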
    case Instruction::Store: {
      // ...
      setInsertPointAfterBundle(E);
      // ...
      Value *VecValue = vectorizeOperand(E, 0);
      if (VecValue->getType() != VecTy)
        VecValue =
            Builder.CreateIntCast(VecValue, VecTy, GetOperandSignedness(0));
      VecValue = FinalShuffle(VecValue, E);
      // ...
      Value *Ptr = SI->getPointerOperand();
      // ...
      if (E->State == TreeEntry::Vectorize) {
        ST = Builder.CreateAlignedStore(VecValue, Ptr, SI->getAlign());
      } else {
        assert(E->State == TreeEntry::StridedVectorize &&
               "Expected either strided or consecutive stores.");
        if (!E->ReorderIndices.empty()) {
          // ...
          Ptr = SI->getPointerOperand();
        }
        // ...
        Type *StrideTy = DL->getIndexType(SI->getPointerOperandType());
        auto *Inst = Builder.CreateIntrinsic(
            Intrinsic::experimental_vp_strided_store,
            {VecTy, Ptr->getType(), StrideTy},
            {VecValue, Ptr,
             ConstantInt::get(
                 StrideTy, -static_cast<int>(DL->getTypeAllocSize(ScalarTy))),
             Builder.getAllOnesMask(VecTy->getElementCount()),
             Builder.getInt32(E->Scalars.size())});
        Inst->addParamAttr(
            /*ArgNo=*/1,
            Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
        // ...
      }
      // ...
      E->VectorizedValue = V;
      ++NumVectorInstructions;
      return V;
    }
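    // A bundle of getelementptrs folds into a single GEP with vector index
    // operands, producing a vector of pointers. Roughly (illustrative IR
    // only):
    //   %ptrs = getelementptr float, ptr %base, <4 x i64> %indices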
    case Instruction::GetElementPtr: {
      // ...
      setInsertPointAfterBundle(E);
      // ...
      Value *Op0 = vectorizeOperand(E, 0);
      // ... (OpVecs collection elided)
      for (int J = 1, N = GEP0->getNumOperands(); J < N; ++J) {
        Value *OpVec = vectorizeOperand(E, J);
        // ...
      }
      Value *V = Builder.CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs);
      // ... (propagate IR flags from the scalar GEPs)
      for (Value *V : E->Scalars) {
        // ...
      }
      // ...
      V = FinalShuffle(V, E);
      E->VectorizedValue = V;
      ++NumVectorInstructions;
      return V;
    }
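    // Calls are vectorized either as a vector intrinsic or as a vector math
    // library call found through VFDatabase, whichever the cost model rates
    // cheaper. Arguments that must stay scalar (e.g. the poison flag of
    // llvm.abs) are forwarded unvectorized. Illustrative IR only (the veclib
    // symbol name is target/ABI dependent):
    //   %r = call <4 x float> @llvm.sin.v4f32(<4 x float> %x)   ; intrinsic
    //   %r = call <4 x float> @_ZGVbN4v_sinf(<4 x float> %x)    ; veclib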
    case Instruction::Call: {
      // ...
      setInsertPointAfterBundle(E);
      // ...
      SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(
          CI, ID, VecTy->getNumElements(),
          It != MinBWs.end() ? It->second.first : 0, TTI);
      // ...
      bool UseIntrinsic = /* ... */
          VecCallCosts.first <= VecCallCosts.second;
      // ...
      Value *ScalarArg = nullptr;
      // ... (per-argument loop; scalar intrinsic arguments are not
      //      vectorized)
          ScalarArg = CEI->getArgOperand(I);
          // ...
          if (ID == Intrinsic::abs && It != MinBWs.end() &&
              It->second.first < DL->getTypeSizeInBits(CEI->getType()))
            ScalarArg = Builder.getFalse();
          // ...
        Value *OpVec = vectorizeOperand(E, I);
        ScalarArg = CEI->getArgOperand(I);
        if (/* ... */ It == MinBWs.end()) {
          // ...
          OpVec = Builder.CreateIntCast(OpVec, CastTy, GetOperandSignedness(I));
        } else if (It != MinBWs.end()) {
          OpVec = Builder.CreateIntCast(OpVec, VecTy, GetOperandSignedness(I));
        }
        // ...
      if (!UseIntrinsic) {
        // ...
        CF = VFDatabase(*CI).getVectorizedFunction(Shape);
        // ...
      }
      // ...
      Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);
      // ...
      V = FinalShuffle(V, E);
      E->VectorizedValue = V;
      ++NumVectorInstructions;
      return V;
    }
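    // ShuffleVector covers two cases: a bundle of real shufflevector
    // instructions, which folds into a single permute of the source, and
    // alternate-opcode bundles, where two vector instructions (main and alt
    // opcode) are built and then blended with a shuffle mask that selects the
    // proper result per lane.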
    case Instruction::ShuffleVector: {
      Value *V;
      if (/* ... */ !E->isAltShuffle()) {
        setInsertPointAfterBundle(E);
        Value *Src = vectorizeOperand(E, 0);
        // ...
        if (auto *SVSrc = dyn_cast<ShuffleVectorInst>(Src)) {
          SmallVector<int> NewMask(ThisMask.size());
          transform(ThisMask, NewMask.begin(), [&SVSrc](int Mask) {
            return SVSrc->getShuffleMask()[Mask];
          });
          V = Builder.CreateShuffleVector(SVSrc->getOperand(0),
                                          SVSrc->getOperand(1), NewMask);
        } else {
          V = Builder.CreateShuffleVector(Src, ThisMask);
        }
        // ...
        V = FinalShuffle(V, E);
      } else {
        assert(E->isAltShuffle() &&
               /* ... */
               "Invalid Shuffle Vector Operand");
        // ...
        if (/* ... binary op or cmp ... */) {
          setInsertPointAfterBundle(E);
          LHS = vectorizeOperand(E, 0);
          RHS = vectorizeOperand(E, 1);
        } else {
          setInsertPointAfterBundle(E);
          LHS = vectorizeOperand(E, 0);
        }
        // ...
        assert((It != MinBWs.end() ||
                getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
                getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
                MinBWs.contains(getOperandEntry(E, 0)) ||
                MinBWs.contains(getOperandEntry(E, 1))) &&
               "Expected item in MinBWs.");
        Type *CastTy = VecTy;
        // ... (pick the wider operand type as the cast type)
        //     ->getIntegerBitWidth())
        // ...
        LHS = Builder.CreateIntCast(LHS, CastTy, GetOperandSignedness(0));
        // ...
        RHS = Builder.CreateIntCast(RHS, CastTy, GetOperandSignedness(1));
        // ...
        if (Instruction::isBinaryOp(E->getOpcode())) {
          V0 = Builder.CreateBinOp(/* main opcode */, LHS, RHS);
          V1 = Builder.CreateBinOp(/* alt opcode */, LHS, RHS);
        } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
          V0 = Builder.CreateCmp(CI0->getPredicate(), LHS, RHS);
          V1 = Builder.CreateCmp(AltPred, LHS, RHS);
        } else {
          // ...
          unsigned SrcBWSz = DL->getTypeSizeInBits(
              /* ... */);
          unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
          if (BWSz <= SrcBWSz) {
            if (BWSz < SrcBWSz)
              LHS = Builder.CreateIntCast(LHS, VecTy, It->second.first);
            assert(LHS->getType() == VecTy &&
                   "Expected same type as operand.");
            // ...
            E->VectorizedValue = LHS;
            ++NumVectorInstructions;
            return LHS;
          }
          V0 = Builder.CreateCast(/* main cast opcode */, LHS, VecTy);
          V1 = Builder.CreateCast(/* alt cast opcode */, LHS, VecTy);
        }
        // ...
        for (Value *V : {V0, V1}) {
          if (auto *I = dyn_cast<Instruction>(V)) {
            GatherShuffleExtractSeq.insert(I);
            CSEBlocks.insert(I->getParent());
          }
        }
        // Create a shuffle to take the alternate operations from the vector,
        // gathering up main and alt scalar ops to propagate IR flags.
        SmallVector<int> Mask;
        E->buildAltOpShuffleMask(
            [E, this](Instruction *I) {
              assert(E->getMatchingMainOpOrAltOp(I) &&
                     "Unexpected main/alternate opcode");
              // ...
            },
            Mask, &OpScalars, &AltScalars);
        // ...
        // Drop nuw on a blended 'sub' whose lanes may be poison, copyable or
        // commutative, mirroring the scalar-flag rules above.
        auto DropNuwFlag = [&](Value *Vec, unsigned Opcode) {
          auto *I = dyn_cast<Instruction>(Vec);
          if (I && Opcode == Instruction::Sub && !MinBWs.contains(E) &&
              any_of(E->Scalars, [&](Value *V) {
                if (isa<PoisonValue>(V))
                  return true;
                if (E->hasCopyableElements() && E->isCopyableElement(V))
                  return true;
                auto *IV = cast<Instruction>(V);
                return IV->getOpcode() == Instruction::Sub && isCommutative(IV);
              }))
            I->setHasNoUnsignedWrap(false);
        };
        DropNuwFlag(V0, E->getOpcode());
        DropNuwFlag(V1, E->getAltOpcode());
        // ...
        V = Builder.CreateShuffleVector(V0, V1, Mask);
        if (auto *I = dyn_cast<Instruction>(V)) {
          GatherShuffleExtractSeq.insert(I);
          CSEBlocks.insert(I->getParent());
        }
      }
      // ...
      E->VectorizedValue = V;
      ++NumVectorInstructions;
      return V;
    }
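// Entry point of the final codegen step: schedule every block, emit all
// vectorized tree entries, patch up postponed nodes and extracts for
// external users, and erase the now-dead scalar instructions.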
Value *BoUpSLP::vectorizeTree(
    const ExtraValueToDebugLocsMap &ExternallyUsedValues,
    Instruction *ReductionRoot,
    ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales) {
  // ...
  EntryToLastInstruction.clear();
  // ...
  for (auto &BSIter : BlocksSchedules)
    scheduleBlock(*this, BSIter.second.get());
  // Cache the last instructions of the entries, except gathers/deleted nodes.
  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    if (TE->isGather() || DeletedNodes.contains(TE.get()))
      continue;
    (void)getLastInstructionInBundle(TE.get());
  }
  // ...
  if (ReductionRoot)
    Builder.SetInsertPoint(ReductionRoot->getParent(),
                           ReductionRoot->getIterator());
  else
    Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
  // ...
  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    if (DeletedNodes.contains(TE.get()))
      continue;
    if (TE->isGather() && !TE->VectorizedValue && TE->UserTreeIndex.UserTE &&
        TE->UserTreeIndex.UserTE->hasState() &&
        TE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
        (TE->UserTreeIndex.UserTE->getOpcode() != Instruction::PHI ||
         TE->UserTreeIndex.UserTE->isAltShuffle()) &&
        !TE->UserTreeIndex.UserTE->hasCopyableElements() &&
        all_of(TE->UserTreeIndex.UserTE->Scalars,
               [](Value *V) { return isUsedOutsideBlock(V); })) {
      // ...
      getLastInstructionInBundle(TE->UserTreeIndex.UserTE);
      // ...
    }
  }
  // ...
  for (auto &Entry : GatherEntries) {
    // ...
    Builder.SetInsertPoint(Entry.second);
    Builder.SetCurrentDebugLocation(Entry.second->getDebugLoc());
    // ...
  }
  // Emit gathered loads first to emit better code for their users.
  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    if (DeletedNodes.contains(TE.get()))
      continue;
    if (GatheredLoadsEntriesFirst.has_value() &&
        TE->Idx >= *GatheredLoadsEntriesFirst && !TE->VectorizedValue &&
        (!TE->isGather() || TE->UserTreeIndex)) {
      assert((TE->UserTreeIndex ||
              (TE->getOpcode() == Instruction::Load && !TE->isGather())) &&
             "Expected gathered load node.");
      // ...
    }
  }
  // Run through the list of postponed gathers and emit them, replacing the
  // temporary vector values with the final ones.
  for (const TreeEntry *E : PostponedNodes) {
    auto *TE = const_cast<TreeEntry *>(E);
    // ...
    TE->VectorizedValue = nullptr;
    // ...
    if (/* ... */
        (TE->UserTreeIndex.UserTE->hasState() &&
         TE->UserTreeIndex.UserTE->getOpcode() == Instruction::PHI)) {
      // ... (find the correct insertion point among the users)
      if (UI->comesBefore(InsertPt))
        InsertPt = UI;
      Builder.SetInsertPoint(InsertPt);
    } else {
      Builder.SetInsertPoint(PrevVec);
    }
    Builder.SetCurrentDebugLocation(UserI->getDebugLoc());
    // ...
    if (auto *VecI = dyn_cast<Instruction>(Vec);
        VecI && VecI->getParent() == Builder.GetInsertBlock() &&
        Builder.GetInsertPoint()->comesBefore(VecI))
      VecI->moveBeforePreserving(*Builder.GetInsertBlock(),
                                 Builder.GetInsertPoint());
    if (Vec->getType() != PrevVec->getType()) {
      assert(Vec->getType()->isIntOrIntVectorTy() &&
             PrevVec->getType()->isIntOrIntVectorTy() &&
             "Expected integer vector types only.");
      std::optional<bool> IsSigned;
      for (Value *V : TE->Scalars) {
        // ...
        for (const TreeEntry *MNTE : getTreeEntries(V)) {
          auto It = MinBWs.find(MNTE);
          if (It != MinBWs.end()) {
            IsSigned = IsSigned.value_or(false) || It->second.second;
            // ...
          }
        }
        if (IsSigned.value_or(false))
          break;
        // ...
        for (const TreeEntry *BVE : ValueToGatherNodes.lookup(V)) {
          auto It = MinBWs.find(BVE);
          if (It != MinBWs.end()) {
            IsSigned = IsSigned.value_or(false) || It->second.second;
            // ...
          }
        }
        if (IsSigned.value_or(false))
          break;
        // ...
        IsSigned = IsSigned.value_or(false) ||
                   /* ... */;
        if (IsSigned.value_or(false))
          break;
      }
      if (IsSigned.value_or(false)) {
        // ...
        auto It = MinBWs.find(TE->UserTreeIndex.UserTE);
        if (It != MinBWs.end())
          IsSigned = It->second.second;
      }
      assert(IsSigned &&
             "Expected user node or perfect diamond match in MinBWs.");
      Vec = Builder.CreateIntCast(Vec, PrevVec->getType(), *IsSigned);
    }
    // ...
    PrevVec->replaceAllUsesWith(Vec);
    PostponedValues.try_emplace(Vec).first->second.push_back(TE);
    // Replace the stub vector node, if it was used before for one of the
    // buildvector nodes already.
    auto It = PostponedValues.find(PrevVec);
    if (It != PostponedValues.end()) {
      for (TreeEntry *VTE : It->getSecond())
        VTE->VectorizedValue = Vec;
    }
    // ...
  }
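  // Extract all of the elements with external uses: for every scalar still
  // used outside the tree, emit (or reuse) an extractelement from the
  // vectorized value, widen/narrow it back to the original type if the node
  // was bitwidth-minimized, and rewire the user.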
  for (const auto &ExternalUse : ExternalUses) {
    Value *Scalar = ExternalUse.Scalar;
    // ...
    const TreeEntry *E = &ExternalUse.E;
    assert(E && "Invalid scalar");
    assert(!E->isGather() && "Extracting from a gather list");
    // Non-instruction pointers are not deleted, just skip them.
    if (E->getOpcode() == Instruction::GetElementPtr &&
        /* ... */)
      continue;
    Value *Vec = E->VectorizedValue;
    assert(Vec && "Can't find vectorizable value");
    // ...
    Value *Lane = Builder.getInt32(ExternalUse.Lane);
    auto ExtractAndExtendIfNeeded = [&](Value *Vec) {
      if (Scalar->getType() != Vec->getType()) {
        Value *Ex = nullptr;
        Value *ExV = nullptr;
        // ...
        bool ReplaceInst = Inst && ExternalUsesAsOriginalScalar.contains(Inst);
        auto It = ScalarToEEs.find(Scalar);
        if (It != ScalarToEEs.end()) {
          // Reuse an extract emitted earlier in this block, if any.
          auto EEIt = It->second.find(ReplaceInst ? Inst->getParent()
                                                  : Builder.GetInsertBlock());
          if (EEIt != It->second.end()) {
            Value *PrevV = EEIt->second.first;
            if (auto *I = dyn_cast<Instruction>(PrevV);
                I && !ReplaceInst &&
                Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() &&
                Builder.GetInsertPoint()->comesBefore(I)) {
              I->moveBefore(*Builder.GetInsertPoint()->getParent(),
                            Builder.GetInsertPoint());
              // ...
            }
            // ...
            ExV = EEIt->second.second ? EEIt->second.second : Ex;
          }
        }
        if (!Ex) {
          // ...
          if (/* ... the scalar is itself an ignorable extract ... */) {
            // ...
            IgnoredExtracts.insert(EE);
            // ...
          } else if (ReplaceInst) {
            auto *CloneInst = Inst->clone();
            CloneInst->insertBefore(Inst->getIterator());
            if (Inst->hasName())
              CloneInst->takeName(Inst);
            // ...
          } else if (auto *ES = dyn_cast<ExtractElementInst>(Scalar)) {
            Value *V = ES->getVectorOperand();
            // ...
            V = ETEs.front()->VectorizedValue;
            // ...
            if (!IV || IV == Vec || IV->getParent() != IVec->getParent() ||
                IV->comesBefore(IVec))
              Ex = Builder.CreateExtractElement(V, ES->getIndexOperand());
            else
              Ex = Builder.CreateExtractElement(Vec, Lane);
          } else if (auto *VecTy =
                         /* ... */) {
            unsigned VecTyNumElements = VecTy->getNumElements();
            // ... (extract a subvector starting at
            //      ExternalUse.Lane * VecTyNumElements)
          } else {
            Ex = Builder.CreateExtractElement(Vec, Lane);
          }
          // If necessary, sign-extend or zero-extend ScalarRoot to the larger
          // type.
          if (Scalar->getType() != Ex->getType())
            ExV = Builder.CreateIntCast(
                /* ... */);
          // ... (cache the extract per block; fallback key is
          //      &F->getEntryBlock(), value std::make_pair(Ex, ExV))
        }
        // ...
        if (auto *ExI = dyn_cast<Instruction>(Ex)) {
          GatherShuffleExtractSeq.insert(ExI);
          CSEBlocks.insert(ExI->getParent());
        }
        // ...
      }
      assert(/* ... */
             "In-tree scalar of vector type is not insertelement?");
      // ...
    };
    // If User == nullptr, the Scalar remains in the original code, and the
    // extract is inserted near the vector value definition.
    if (!User) {
      if (!ScalarsWithNullptrUser.insert(Scalar).second)
        continue;
      assert((ExternallyUsedValues.count(Scalar) ||
              ExternalUsesWithNonUsers.count(Scalar) ||
              ExternalUsesAsOriginalScalar.contains(Scalar) ||
              any_of(Scalar->users(),
                     [&](User *U) {
                       if (ExternalUsesAsOriginalScalar.contains(U))
                         return true;
                       ArrayRef<TreeEntry *> UseEntries = getTreeEntries(U);
                       return !UseEntries.empty() &&
                              (E->State == TreeEntry::Vectorize ||
                               E->State == TreeEntry::StridedVectorize ||
                               E->State == TreeEntry::CompressVectorize) &&
                              any_of(UseEntries, [&, TTI = TTI](
                                                     TreeEntry *UseEntry) {
                                return (UseEntry->State ==
                                            TreeEntry::Vectorize ||
                                        UseEntry->State ==
                                            TreeEntry::StridedVectorize ||
                                        UseEntry->State ==
                                            TreeEntry::CompressVectorize) &&
                                       doesInTreeUserNeedToExtract(
                                           Scalar,
                                           getRootEntryInstruction(*UseEntry),
                                           /* ... */);
                              });
                     })) &&
             "Scalar with nullptr User must be registered in "
             "ExternallyUsedValues map or remain as scalar in vectorized "
             "instructions");
      if (auto *VecI = dyn_cast<Instruction>(Vec)) {
        if (auto *PHI = dyn_cast<PHINode>(VecI)) {
          if (PHI->getParent()->isLandingPad())
            Builder.SetInsertPoint(
                PHI->getParent(),
                std::next(
                    PHI->getParent()->getLandingPadInst()->getIterator()));
          else
            Builder.SetInsertPoint(PHI->getParent(),
                                   PHI->getParent()->getFirstNonPHIIt());
        } else {
          Builder.SetInsertPoint(VecI->getParent(),
                                 std::next(VecI->getIterator()));
        }
      } else {
        Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
      }
      Value *NewInst = ExtractAndExtendIfNeeded(Vec);
      // ...
      if (Scalar != NewInst) {
        assert(/* ... */
               "Extractelements should not be replaced.");
        Scalar->replaceAllUsesWith(NewInst);
      }
      continue;
    }
    if (auto *VU = dyn_cast<InsertElementInst>(User);
        VU && /* ... */) {
      // ...
      if (!UsedInserts.insert(VU).second)
        continue;
      // Need to use the original vector, if the root is truncated.
      auto BWIt = MinBWs.find(E);
      if (BWIt != MinBWs.end() && /* ... */) {
        auto *ScalarTy = FTy->getElementType();
        auto Key = std::make_pair(Vec, ScalarTy);
        auto VecIt = VectorCasts.find(Key);
        if (VecIt == VectorCasts.end()) {
          // ...
          if (auto *IVec = dyn_cast<PHINode>(Vec)) {
            if (IVec->getParent()->isLandingPad())
              Builder.SetInsertPoint(IVec->getParent(),
                                     std::next(IVec->getParent()
                                                   ->getLandingPadInst()
                                                   ->getIterator()));
            else
              Builder.SetInsertPoint(
                  IVec->getParent()->getFirstNonPHIOrDbgOrLifetime());
          } else if (auto *IVec = dyn_cast<Instruction>(Vec)) {
            Builder.SetInsertPoint(IVec->getNextNode());
          }
          Vec = Builder.CreateIntCast(
              /* ... */
              BWIt->second.second);
          // ...
        } else {
          Vec = VecIt->second;
        }
      }
      // ...
      auto It = find_if(
          ShuffledInserts, [VU](const ShuffledInsertData<Value *> &Data) {
            // ...
          });
      // ...
      unsigned Idx = *InsertIdx;
      if (It == ShuffledInserts.end()) {
        // ...
        It = std::next(ShuffledInserts.begin(),
                       ShuffledInserts.size() - 1);
        // ...
      }
      // ...
      Mask[Idx] = ExternalUse.Lane;
      // ...
      continue;
    }
    // Generate extracts for out-of-tree users and find the insertion point
    // for the extractelement lane.
    if (auto *VecI = dyn_cast<Instruction>(Vec)) {
      if (auto *PH = dyn_cast<PHINode>(User)) {
        for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
          if (PH->getIncomingValue(I) == Scalar) {
            Instruction *IncomingTerminator =
                PH->getIncomingBlock(I)->getTerminator();
            if (isa<CatchSwitchInst>(IncomingTerminator)) {
              Builder.SetInsertPoint(VecI->getParent(),
                                     std::next(VecI->getIterator()));
            } else {
              Builder.SetInsertPoint(PH->getIncomingBlock(I)->getTerminator());
            }
            Value *NewInst = ExtractAndExtendIfNeeded(Vec);
            PH->setOperand(I, NewInst);
          }
        }
      } else {
        // ...
        Value *NewInst = ExtractAndExtendIfNeeded(Vec);
        User->replaceUsesOfWith(Scalar, NewInst);
      }
    } else {
      Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
      Value *NewInst = ExtractAndExtendIfNeeded(Vec);
      User->replaceUsesOfWith(Scalar, NewInst);
    }
    // ...
  }
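  // Rebuild the insertelement chains that were turned into shuffles. The
  // CreateShuffle helper below merges at most two source vectors per step,
  // resizing each to a common VF first via ResizeToVF.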
  auto CreateShuffle = [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
    // ... (split the mask into the parts addressing V1 and V2)
    for (int I = 0, E = Mask.size(); I < E; ++I) {
      if (Mask[I] < static_cast<int>(VF))
        CombinedMask1[I] = Mask[I];
      else
        CombinedMask2[I] = Mask[I] - VF;
    }
    ShuffleInstructionBuilder ShuffleBuilder(
        /* ... */);
    ShuffleBuilder.add(V1, CombinedMask1);
    if (V2)
      ShuffleBuilder.add(V2, CombinedMask2);
    return ShuffleBuilder.finalize({}, {}, {});
  };
  auto &&ResizeToVF = [&CreateShuffle](Value *Vec, ArrayRef<int> Mask,
                                       bool ForSingleMask) {
    unsigned VF = Mask.size();
    // ...
    if (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); })) {
      Vec = CreateShuffle(Vec, nullptr, Mask);
      return std::make_pair(Vec, true);
    }
    // ...
    if (!ForSingleMask) {
      // ...
      for (unsigned I = 0; I < VF; ++I) {
        // ...
      }
      Vec = CreateShuffle(Vec, nullptr, ResizeMask);
    }
    // ...
    return std::make_pair(Vec, false);
  };
  // Perform shuffling of the vectorized tree entries for better handling of
  // the external extracts.
  for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
    // ...
    InsertElementInst *FirstInsert = ShuffledInserts[I].InsertElements.front();
    InsertElementInst *LastInsert = ShuffledInserts[I].InsertElements.back();
    Builder.SetInsertPoint(LastInsert);
    auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
    // ... (VF callback elided; its body is:)
    //   return cast<VectorType>(Vec->getType())
    //       ->getElementCount()
    //       .getKnownMinValue();
    // ...
    Value *NewInst = /* ... */(
        /* ... */,
        [FirstInsert, &CreateShuffle](ArrayRef<int> Mask,
                                      ArrayRef<Value *> Vals) {
          assert((Vals.size() == 1 || Vals.size() == 2) &&
                 "Expected exactly 1 or 2 input values.");
          if (Vals.size() == 1) {
            // Do not create a shuffle for a simple identity non-resizing
            // mask.
            if (Mask.size() != cast<FixedVectorType>(Vals.front()->getType())
                                   ->getNumElements() ||
                !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
              return CreateShuffle(Vals.front(), nullptr, Mask);
            return Vals.front();
          }
          return CreateShuffle(Vals.front() ? Vals.front()
                                            : /* ... */,
                               Vals.back(), Mask);
        });
    auto It = ShuffledInserts[I].InsertElements.rbegin();
    // Rebuild the original chain of insertelement instructions.
    InsertElementInst *II = nullptr;
    if (It != ShuffledInserts[I].InsertElements.rend())
      II = *It;
    // ...
    while (It != ShuffledInserts[I].InsertElements.rend()) {
      assert(II && "Must be an insertelement instruction.");
      // ...
    }
    for (Instruction *II : reverse(Inserts)) {
      II->replaceUsesOfWith(II->getOperand(0), NewInst);
      // ...
      if (II->getParent() == NewI->getParent() && II->comesBefore(NewI))
        II->moveAfter(NewI);
      // ...
    }
    // ...
    for (InsertElementInst *IE : reverse(ShuffledInserts[I].InsertElements)) {
      IE->replaceUsesOfWith(IE->getOperand(0),
                            /* ... */);
      IE->replaceUsesOfWith(IE->getOperand(1),
                            /* ... */);
      // ...
    }
    CSEBlocks.insert(LastInsert->getParent());
  }
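  // For each vectorized value extracted above, the original scalar
  // instructions are now dead: assert nothing outside the tree still uses
  // them, then queue them for deletion.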
  for (auto &TEPtr : VectorizableTree) {
    TreeEntry *Entry = TEPtr.get();
    // No need to handle users of gathered, split, deleted or re-gathered
    // values.
    if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize ||
        DeletedNodes.contains(Entry) ||
        TransformedToGatherNodes.contains(Entry))
      continue;
    // ...
    assert(Entry->VectorizedValue && "Can't find vectorizable value");
    // For each lane:
    for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
      // ...
      if (Entry->getOpcode() == Instruction::GetElementPtr &&
          /* ... */)
        continue;
      if (auto *EE = dyn_cast<ExtractElementInst>(Scalar);
          EE && IgnoredExtracts.contains(EE))
        continue;
      // ...
      for (User *U : Scalar->users()) {
        // ...
        assert((/* ... */
                (UserIgnoreList && UserIgnoreList->contains(U)) ||
                /* ... */) &&
               "Deleting out-of-tree value");
      }
      // ...
      LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
      // ...
    }
  }
  // ...
  if (auto *V =
          dyn_cast<Instruction>(VectorizableTree.front()->VectorizedValue))
    V->mergeDIAssignID(RemovedInsts);
  // ...
  if (UserIgnoreList) {
    for (Instruction *I : RemovedInsts) {
      const TreeEntry *IE = getTreeEntries(I).front();
      if (/* ... */
          !SplitEntries.empty() && SplitEntries.front()->Idx < IE->Idx)
        IE = SplitEntries.front();
      if (IE->Idx != 0 &&
          !(VectorizableTree.front()->isGather() && IE->UserTreeIndex &&
            (ValueToGatherNodes.lookup(I).contains(
                 VectorizableTree.front().get()) ||
             (IE->UserTreeIndex.UserTE == VectorizableTree.front().get() &&
              IE->UserTreeIndex.EdgeIdx == UINT_MAX))) &&
          !(VectorizableTree.front()->State == TreeEntry::SplitVectorize &&
            IE->UserTreeIndex &&
            /* ... */) &&
          !(GatheredLoadsEntriesFirst.has_value() &&
            IE->Idx >= *GatheredLoadsEntriesFirst &&
            VectorizableTree.front()->isGather() &&
            /* ... */) &&
          !(!VectorizableTree.front()->isGather() &&
            VectorizableTree.front()->isCopyableElement(I)))
        continue;
      // ...
      I->replaceUsesWithIf(/* ... */, [&](Use &U) {
        // Keep track of 'select' users that may become poisonous.
        bool IsPoisoningLogicalOp = isa<SelectInst>(U.getUser()) &&
                                    (match(U.getUser(), m_LogicalAnd()) ||
                                     match(U.getUser(), m_LogicalOr())) &&
                                    U.getOperandNo() == 0;
        if (IsPoisoningLogicalOp) {
          LogicalOpSelects.push_back(cast<SelectInst>(U.getUser()));
          return false;
        }
        return UserIgnoreList->contains(U.getUser());
      });
    }
    // Update the users of the selects which may become poisonous.
    for (SelectInst *SI : LogicalOpSelects)
      /* ... */;
  }
  // ...
  Builder.ClearInsertionPoint();
  InstrElementSize.clear();
  // ...
  const TreeEntry &RootTE = *VectorizableTree.front();
  Value *Vec = RootTE.VectorizedValue;
  if (auto It = MinBWs.find(&RootTE); ReductionBitWidth != 0 &&
                                      It != MinBWs.end() &&
                                      ReductionBitWidth != It->second.first) {
    IRBuilder<>::InsertPointGuard Guard(Builder);
    Builder.SetInsertPoint(ReductionRoot->getParent(),
                           ReductionRoot->getIterator());
    Vec = Builder.CreateIntCast(
        Vec,
        VectorType::get(Builder.getIntNTy(ReductionBitWidth),
                        /* ... */),
        It->second.second);
  }
  return Vec;
}
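// BoUpSLP::optimizeGatherSequence: hoist loop-invariant gather/shuffle
// sequences into loop preheaders and CSE identical (or "less defined")
// sequences, visiting blocks in dominator-tree DFS order so a dominating
// copy is always seen first.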
void BoUpSLP::optimizeGatherSequence() {
  LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherShuffleExtractSeq.size()
                    << " gather sequences instructions.\n");
  // LICM the gather sequences.
  for (Instruction *I : GatherShuffleExtractSeq) {
    // ...
    Loop *L = LI->getLoopFor(I->getParent());
    if (!L)
      continue;
    // Check if it has a preheader.
    BasicBlock *PreHeader = L->getLoopPreheader();
    if (!PreHeader)
      continue;
    // If the vector or the element that we insert into it are instructions
    // that are defined in this basic block then we can't hoist them.
    if (any_of(I->operands(), [L](Value *V) {
          auto *OpI = dyn_cast<Instruction>(V);
          return OpI && L->contains(OpI);
        }))
      continue;
    // We can hoist this instruction. Move it to the preheader.
    // ...
    CSEBlocks.insert(PreHeader);
  }
  // Make a list of all reachable blocks in our CSE queue.
  // ...
  CSEWorkList.reserve(CSEBlocks.size());
  for (BasicBlock *BB : CSEBlocks)
    if (DomTreeNode *N = DT->getNode(BB)) {
      assert(DT->isReachableFromEntry(N));
      CSEWorkList.push_back(N);
    }
  // Sort blocks by domination, so we visit a block after all blocks
  // dominating it were visited.
  llvm::sort(CSEWorkList, [](const DomTreeNode *A, const DomTreeNode *B) {
    assert((A == B) == (A->getDFSNumIn() == B->getDFSNumIn()) &&
           "Different nodes should have different DFS numbers");
    return A->getDFSNumIn() < B->getDFSNumIn();
  });
  // ...
  auto &&IsIdenticalOrLessDefined = [TTI = TTI](Instruction *I1,
                                                Instruction *I2,
                                                SmallVectorImpl<int> &NewMask) {
    if (I1->getType() != I2->getType())
      return false;
    // ...
    if (!SI1 || !SI2)
      return I1->isIdenticalTo(I2);
    if (SI1->isIdenticalTo(SI2))
      return true;
    for (int I = 0, E = SI1->getNumOperands(); I < E; ++I)
      if (SI1->getOperand(I) != SI2->getOperand(I))
        return false;
    // Check if the second instruction is more defined than the first one.
    NewMask.assign(SI2->getShuffleMask().begin(), SI2->getShuffleMask().end());
    // ... (count trailing undefs in the mask to check the final number of
    //      used registers)
    unsigned LastUndefsCnt = 0;
    for (int I = 0, E = NewMask.size(); I < E; ++I) {
      // ...
      if (/* ... */ NewMask[I] != SM1[I])
        return false;
      // ...
      NewMask[I] = SM1[I];
    }
    // ...
    return SM1.size() - LastUndefsCnt > 1 &&
           /* ... same register count for the narrowed mask of size
              SM1.size() - LastUndefsCnt ... */;
  };
  // Perform an O(N^2) search over the gather/shuffle sequences and merge
  // identical instructions.
  for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
    assert(/* ... */
           (I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
           "Worklist not sorted properly!");
    // For all instructions in blocks containing gather sequences:
    // ...
      if (/* ... */ !GatherShuffleExtractSeq.contains(&In))
        continue;
      // Check if we can replace this instruction with any of the visited
      // instructions.
      bool Replaced = false;
      for (Instruction *V : Visited) {
        // ...
        if (IsIdenticalOrLessDefined(&In, V, NewMask) &&
            DT->dominates(V->getParent(), In.getParent())) {
          In.replaceAllUsesWith(V);
          // ...
          if (auto *SI = dyn_cast<ShuffleVectorInst>(V))
            if (!NewMask.empty())
              SI->setShuffleMask(NewMask);
          Replaced = true;
          break;
        }
        if (/* ... */
            GatherShuffleExtractSeq.contains(V) &&
            IsIdenticalOrLessDefined(V, &In, NewMask) &&
            DT->dominates(In.getParent(), V->getParent())) {
          // ...
          V->replaceAllUsesWith(&In);
          // ...
          if (auto *SI = dyn_cast<ShuffleVectorInst>(&In))
            if (!NewMask.empty())
              SI->setShuffleMask(NewMask);
          Replaced = true;
          break;
        }
      }
      if (!Replaced)
        Visited.push_back(&In);
    // ...
  }
  // ...
  GatherShuffleExtractSeq.clear();
}
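// Bundle construction for the scheduler: wrap the scalars of a new tree
// entry into a ScheduleBundle, creating ScheduleCopyableData for copyable
// elements and reusing the per-instruction ScheduleData otherwise.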
BoUpSLP::ScheduleBundle &BoUpSLP::BlockScheduling::buildBundle(
    ArrayRef<Value *> VL, const InstructionsState &S, const EdgeInfo &EI) {
  // ...
  ScheduledBundlesList.emplace_back(std::make_unique<ScheduleBundle>());
  // ...
  for (Value *V : VL) {
    if (S.isNonSchedulable(V))
      continue;
    // ...
    if (S.isCopyableElement(V)) {
      // ...
      ScheduleCopyableData &SD =
          addScheduleCopyableData(EI, I, SchedulingRegionID, *BundlePtr);
      // ...
      BundlePtr->add(&SD);
      continue;
    }
    ScheduleData *BundleMember = getScheduleData(V);
    assert(BundleMember && "no ScheduleData for bundle member "
                           "(maybe not in same basic block)");
    // ...
    BundlePtr->add(BundleMember);
    ScheduledBundles.try_emplace(I).first->getSecond().push_back(
        /* ... */);
  }
  assert(BundlePtr && *BundlePtr && "Failed to find schedule bundle");
  // ...
}
std::optional<BoUpSLP::ScheduleBundle *>
BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
                                            const InstructionsState &S,
                                            const EdgeInfo &EI) {
  // ...
  if (S.areInstructionsWithCopyableElements() && EI && EI.UserTE->hasState() &&
      EI.UserTE->doesNotNeedToSchedule() &&
      EI.UserTE->getOpcode() != Instruction::PHI &&
      any_of(VL, [&](Value *V) {
        auto *I = dyn_cast<Instruction>(V);
        if (!I || I->hasOneUser())
          return false;
        for (User *U : I->users()) {
          auto *UI = cast<Instruction>(U);
          if (isa<BinaryOperator>(UI))
            return true;
        }
        return false;
      }))
    return std::nullopt;
  if (S.areInstructionsWithCopyableElements() && EI && EI.UserTE->hasState() &&
      EI.UserTE->hasCopyableElements() &&
      EI.UserTE->getMainOp()->getParent() == S.getMainOp()->getParent() &&
      any_of(VL, [&](Value *V) {
        if (S.isCopyableElement(V))
          return true;
        // ...
        return false;
      }))
    return std::nullopt;
  // ...
  if (S.areInstructionsWithCopyableElements() &&
      any_of(VL, [&](Value *V) {
        // ...
        return false;
      }))
    return std::nullopt;
  if (S.areInstructionsWithCopyableElements() && EI) {
    bool IsNonSchedulableWithParentPhiNode =
        EI.UserTE->doesNotNeedToSchedule() && EI.UserTE->UserTreeIndex &&
        EI.UserTE->UserTreeIndex.UserTE->hasState() &&
        EI.UserTE->UserTreeIndex.UserTE->State != TreeEntry::SplitVectorize &&
        EI.UserTE->UserTreeIndex.UserTE->getOpcode() == Instruction::PHI;
    if (IsNonSchedulableWithParentPhiNode) {
      SmallSet<std::pair<Value *, Value *>, 4> Values;
      for (const auto [Idx, V] :
           enumerate(EI.UserTE->UserTreeIndex.UserTE->Scalars)) {
        Value *Op = EI.UserTE->UserTreeIndex.UserTE->getOperand(
            EI.UserTE->UserTreeIndex.EdgeIdx)[Idx];
        // ...
        if (!Values.insert(std::make_pair(V, Op)).second)
          return std::nullopt;
      }
    }
  }
  bool HasCopyables = S.areInstructionsWithCopyableElements();
  if (/* ... */
      all_of(VL, [&](Value *V) { return S.isNonSchedulable(V); })) {
    // The bundle needs no scheduling, but dependencies of operands replaced
    // by copyable data may still have to be invalidated.
    SmallVector<ScheduleData *> ControlDependentMembers;
    for (Value *V : VL) {
      auto *I = dyn_cast<Instruction>(V);
      if (!I || (HasCopyables && S.isCopyableElement(V)))
        continue;
      SmallDenseMap<std::pair<Instruction *, Value *>, unsigned> UserOpToNumOps;
      for (const Use &U : I->operands()) {
        // ...
        unsigned &NumOps = /* ... */
            .first->getSecond();
        if (auto *Op = dyn_cast<Instruction>(U.get());
            Op && areAllOperandsReplacedByCopyableData(I, Op, *SLP, NumOps)) {
          if (ScheduleData *OpSD = getScheduleData(Op);
              OpSD && OpSD->hasValidDependencies())
            OpSD->clearDirectDependencies();
          // ...
        }
      }
    }
    // ...
    return std::nullopt;
  }
  // ...
  LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.getMainOp() << "\n");
  // ...
  auto TryScheduleBundleImpl = [=](bool ReSchedule, ScheduleBundle &Bundle) {
    // ...
    SmallVector<ScheduleData *> ControlDependentMembers;
    auto CheckIfNeedToClearDeps = [&](ScheduleBundle &Bundle) {
      SmallDenseMap<std::pair<Instruction *, Value *>, unsigned> UserOpToNumOps;
      for (ScheduleEntity *SE : Bundle.getBundle()) {
        // ...
        if (ScheduleData *BundleMember = getScheduleData(SD->getInst());
            BundleMember && BundleMember->hasValidDependencies()) {
          BundleMember->clearDirectDependencies();
          if (RegionHasStackSave ||
              /* ... */)
            ControlDependentMembers.push_back(BundleMember);
        }
        // ...
        if (SD->hasValidDependencies() &&
            (!S.areInstructionsWithCopyableElements() ||
             !S.isCopyableElement(SD->getInst())) &&
            !getScheduleCopyableData(SD->getInst()).empty() && EI.UserTE &&
            EI.UserTE->hasState() &&
            (!EI.UserTE->hasCopyableElements() ||
             !EI.UserTE->isCopyableElement(SD->getInst())))
          SD->clearDirectDependencies();
        for (const Use &U : SD->getInst()->operands()) {
          // ...
          unsigned &NumOps =
              UserOpToNumOps
                  .try_emplace(std::make_pair(SD->getInst(), U.get()), 0)
                  .first->getSecond();
          if (auto *Op = dyn_cast<Instruction>(U.get());
              Op && areAllOperandsReplacedByCopyableData(SD->getInst(), Op,
                                                         *SLP, NumOps)) {
            if (ScheduleData *OpSD = getScheduleData(Op);
                OpSD && OpSD->hasValidDependencies()) {
              OpSD->clearDirectDependencies();
              if (RegionHasStackSave ||
                  /* ... */)
                ControlDependentMembers.push_back(OpSD);
            }
          }
        }
      }
    };
    // The scheduling region got new instructions at the lower end (or it is
    // a new region for the first bundle). This makes all previous dependency
    // calculations invalid, so reset them.
    if (OldScheduleEnd && ScheduleEnd != OldScheduleEnd) {
      for_each(ScheduleDataMap, [&](auto &P) {
        if (BB != P.first->getParent())
          return;
        ScheduleData *SD = P.second;
        if (isInSchedulingRegion(*SD))
          SD->clearDependencies();
      });
      for_each(ScheduleCopyableDataMapByInst, [&](auto &P) {
        for_each(P.second, [&](ScheduleCopyableData *SD) {
          if (isInSchedulingRegion(*SD))
            SD->clearDependencies();
        });
      });
      // ...
    }
    if (Bundle && !Bundle.getBundle().empty()) {
      if (S.areInstructionsWithCopyableElements() ||
          !ScheduleCopyableDataMap.empty())
        CheckIfNeedToClearDeps(Bundle);
      LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << Bundle
                        << " in block " << BB->getName() << "\n");
      calculateDependencies(Bundle, /*InsertInReadyList=*/!ReSchedule, SLP,
                            ControlDependentMembers);
    } else if (!ControlDependentMembers.empty()) {
      ScheduleBundle Invalid = ScheduleBundle::invalid();
      calculateDependencies(Invalid, /*InsertInReadyList=*/!ReSchedule, SLP,
                            ControlDependentMembers);
    }
    // ...
    if (ReSchedule) {
      resetSchedule();
      initialFillReadyList(ReadyInsts);
    }
    // Now try to schedule the new bundle or (if no bundle) just calculate
    // dependencies. As soon as the bundle is "ready" it means that there are
    // no cyclic dependencies and we can schedule it.
    while (((!Bundle && ReSchedule) || (Bundle && !Bundle.isReady())) &&
           !ReadyInsts.empty()) {
      ScheduleEntity *Picked = ReadyInsts.pop_back_val();
      assert(Picked->isReady() && "must be ready to schedule");
      schedule(*SLP, S, EI, Picked, ReadyInsts);
      if (Picked == &Bundle)
        break;
    }
  };
  // Make sure that the scheduling region contains all bundle members.
  for (Value *V : VL) {
    if (S.isNonSchedulable(V))
      continue;
    if (!extendSchedulingRegion(V, S)) {
      // If the region would become too big, gracefully bail out: an invalid
      // bundle is "scheduled" so that dependencies stay consistent.
      ScheduleBundle Invalid = ScheduleBundle::invalid();
      TryScheduleBundleImpl(/*ReSchedule=*/false, Invalid);
      return std::nullopt;
    }
  }
  bool ReSchedule = false;
  for (Value *V : VL) {
    if (S.isNonSchedulable(V))
      continue;
    // ...
    if (!CopyableData.empty()) {
      for (ScheduleCopyableData *SD : CopyableData)
        ReadyInsts.remove(SD);
    }
    ScheduleData *BundleMember = getScheduleData(V);
    assert((BundleMember || S.isCopyableElement(V)) &&
           "no ScheduleData for bundle member (maybe not in same basic "
           "block)");
    // ...
    ReadyInsts.remove(BundleMember);
    if (/* ... */
        !Bundles.empty()) {
      for (ScheduleBundle *B : Bundles)
        ReadyInsts.remove(B);
    }
    if (!S.isCopyableElement(V) && !BundleMember->isScheduled())
      continue;
    // A bundle member was scheduled as single instruction before and now
    // needs to be scheduled as part of the bundle. We just get rid of the
    // existing schedule.
    LLVM_DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
                      << " was already scheduled\n");
    ReSchedule = true;
  }
  ScheduleBundle &Bundle = buildBundle(VL, S, EI);
  TryScheduleBundleImpl(ReSchedule, Bundle);
  if (!Bundle.isReady()) {
    for (ScheduleEntity *BD : Bundle.getBundle()) {
      // ...
      if (BD->isReady()) {
        // ...
        if (Bundles.empty()) {
          ReadyInsts.insert(BD);
          continue;
        }
        for (ScheduleBundle *B : Bundles)
          if (B->isReady())
            ReadyInsts.insert(B);
      }
    }
    ScheduledBundlesList.pop_back();
    SmallVector<ScheduleData *> ControlDependentMembers;
    for (Value *V : VL) {
      if (S.isNonSchedulable(V))
        continue;
      // ...
      if (S.isCopyableElement(I)) {
        // Remove the copyable data from all the containers.
        auto KV = std::make_pair(EI, I);
        assert(ScheduleCopyableDataMap.contains(KV) &&
               "no ScheduleCopyableData for copyable element");
        ScheduleCopyableData *SD =
            ScheduleCopyableDataMapByInst.find(I)->getSecond().pop_back_val();
        ScheduleCopyableDataMapByUsers[I].remove(SD);
        // ...
        const auto *It = find(Op, I);
        assert(It != Op.end() && "Lane not set");
        SmallPtrSet<Instruction *, 4> Visited;
        do {
          int Lane = std::distance(Op.begin(), It);
          assert(Lane >= 0 && "Lane not set");
          if (/* ... */
              !EI.UserTE->ReorderIndices.empty())
            Lane = EI.UserTE->ReorderIndices[Lane];
          assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
                 "Couldn't find extract lane");
          // ...
          if (!Visited.insert(In).second) {
            // ...
            continue;
          }
          ScheduleCopyableDataMapByInstUser
              [std::make_pair(std::make_pair(In, EI.EdgeIdx), I)]
                  /* ... */;
          // ...
        } while (It != Op.end());
        // ...
        if (ScheduleCopyableData *UserCD = getScheduleCopyableData(UserEI, I))
          ScheduleCopyableDataMapByUsers[I].insert(UserCD);
        // ...
        if (ScheduleCopyableDataMapByUsers[I].empty())
          ScheduleCopyableDataMapByUsers.erase(I);
        ScheduleCopyableDataMap.erase(KV);
        // ...
        if (ScheduleData *OpSD = getScheduleData(I);
            OpSD && OpSD->hasValidDependencies()) {
          OpSD->clearDirectDependencies();
          if (RegionHasStackSave ||
              /* ... */)
            ControlDependentMembers.push_back(OpSD);
        }
        continue;
      }
      ScheduledBundles.find(I)->getSecond().pop_back();
    }
    if (!ControlDependentMembers.empty()) {
      ScheduleBundle Invalid = ScheduleBundle::invalid();
      calculateDependencies(Invalid, /*InsertInReadyList=*/false, SLP,
                            ControlDependentMembers);
    }
    return std::nullopt;
  }
  // ...
}
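// ScheduleData objects are allocated in fixed-size chunks so that pointers
// into them remain stable while the scheduling region grows.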
BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
  // Allocate a new ScheduleData for the instruction.
  if (ChunkPos >= ChunkSize) {
    ScheduleDataChunks.push_back(std::make_unique<ScheduleData[]>(ChunkSize));
    ChunkPos = 0;
  }
  return &(ScheduleDataChunks.back()[ChunkPos++]);
}
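// Grow the scheduling region so that it covers the instruction V, searching
// simultaneously upwards and downwards from the current region and giving up
// once ScheduleRegionSizeLimit is exceeded.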
bool BoUpSLP::BlockScheduling::extendSchedulingRegion(
    Value *V, const InstructionsState &S) {
  // ...
  assert(I && "bundle member must be an instruction");
  if (getScheduleData(I))
    return true;
  if (!ScheduleStart) {
    // It's the first instruction in the new region.
    initScheduleData(I, I->getNextNode(), nullptr, nullptr);
    // ...
    ScheduleEnd = I->getNextNode();
    assert(ScheduleEnd && "tried to vectorize a terminator?");
    LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
    return true;
  }
  // Search up and down at the same time, because we don't know if the new
  // instruction is above or below the existing scheduling region.
  // ... (iterators elided; UpIter starts at
  //      ++ScheduleStart->getIterator().getReverse())
  auto IsAssumeLikeIntr = [](const Instruction &I) {
    if (auto *II = dyn_cast<IntrinsicInst>(&I))
      return II->isAssumeLikeIntrinsic();
    return false;
  };
  UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
  DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
  while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter != I &&
         &*DownIter != I) {
    if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
      LLVM_DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n");
      return false;
    }
    // ...
    UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
    DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
  }
  if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter == I)) {
    assert(I->getParent() == ScheduleStart->getParent() &&
           "Instruction is in wrong basic block.");
    initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
    // ...
    return true;
  }
  assert((UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter == I)) &&
         "Expected to reach top of the basic block or instruction down the "
         "lower end.");
  assert(I->getParent() == ScheduleEnd->getParent() &&
         "Instruction is in wrong basic block.");
  initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
                   nullptr);
  ScheduleEnd = I->getNextNode();
  assert(ScheduleEnd && "tried to vectorize a terminator?");
  LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n");
  return true;
}
void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
                                                Instruction *ToI,
                                                ScheduleData *PrevLoadStore,
                                                ScheduleData *NextLoadStore) {
  ScheduleData *CurrentLoadStore = PrevLoadStore;
  for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
    // ...
    ScheduleData *SD = ScheduleDataMap.lookup(I);
    if (!SD) {
      SD = allocateScheduleDataChunks();
      ScheduleDataMap[I] = SD;
    }
    assert(!isInSchedulingRegion(*SD) &&
           "new ScheduleData already in scheduling region");
    SD->init(SchedulingRegionID, I);
    // ...
    auto CanIgnoreLoad = [](Instruction *I) {
      auto *LI = dyn_cast<LoadInst>(I);
      return LI && LI->isSimple() &&
             LI->getMetadata(LLVMContext::MD_invariant_load);
    };
    if (I->mayReadOrWriteMemory() &&
        // Invariant loads and side-effect-free intrinsics such as
        // Intrinsic::pseudoprobe do not participate in memory dependencies.
        !CanIgnoreLoad(I) &&
        /* ... */) {
      // Update the linked list of memory accessing instructions.
      if (CurrentLoadStore) {
        CurrentLoadStore->setNextLoadStore(SD);
      } else {
        FirstLoadStoreInRegion = SD;
      }
      CurrentLoadStore = SD;
    }
    // ...
    if (/* ... stacksave/stackrestore ... */)
      RegionHasStackSave = true;
  }
  if (NextLoadStore) {
    if (CurrentLoadStore)
      CurrentLoadStore->setNextLoadStore(NextLoadStore);
  } else {
    LastLoadStoreInRegion = CurrentLoadStore;
  }
}
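// Dependency calculation: walks def-use chains, copyable-data users, control
// dependencies (non-speculatable instructions, stacksave/stackrestore
// regions) and the load/store chain, incrementing the unscheduled-dependency
// counters that drive the ready list.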
void BoUpSLP::BlockScheduling::calculateDependencies(
    ScheduleBundle &Bundle, bool InsertInReadyList, BoUpSLP *SLP,
    ArrayRef<ScheduleData *> ControlDependentMembers /* ... */) {
  SmallVector<ScheduleEntity *> WorkList;
  auto ProcessNode = [&](ScheduleEntity *SE) {
    if (auto *CD = dyn_cast<ScheduleCopyableData>(SE)) {
      if (CD->hasValidDependencies())
        return;
      // ...
      CD->initDependencies();
      CD->resetUnscheduledDeps();
      const EdgeInfo &EI = CD->getEdgeInfo();
      // ...
      const auto *It = find(Op, CD->getInst());
      assert(It != Op.end() && "Lane not set");
      SmallPtrSet<Instruction *, 4> Visited;
      do {
        int Lane = std::distance(Op.begin(), It);
        assert(Lane >= 0 && "Lane not set");
        if (/* ... */
            !EI.UserTE->ReorderIndices.empty())
          Lane = EI.UserTE->ReorderIndices[Lane];
        assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
               "Couldn't find extract lane");
        // ...
        if (EI.UserTE->isCopyableElement(In)) {
          // ...
          if (ScheduleCopyableData *UseSD =
                  getScheduleCopyableData(EI.UserTE->UserTreeIndex, In)) {
            CD->incDependencies();
            if (!UseSD->isScheduled())
              CD->incrementUnscheduledDeps(1);
            if (!UseSD->hasValidDependencies() ||
                (InsertInReadyList && UseSD->isReady()))
              WorkList.push_back(UseSD);
          }
        } else if (Visited.insert(In).second) {
          if (ScheduleData *UseSD = getScheduleData(In)) {
            CD->incDependencies();
            if (!UseSD->isScheduled())
              CD->incrementUnscheduledDeps(1);
            if (!UseSD->hasValidDependencies() ||
                (InsertInReadyList && UseSD->isReady()))
              WorkList.push_back(UseSD);
          }
        }
        // ...
      } while (It != Op.end());
      if (CD->isReady() && CD->getDependencies() == 0 &&
          (EI.UserTE->hasState() &&
           (EI.UserTE->getMainOp()->getParent() !=
                CD->getInst()->getParent() ||
            /* ... */
            (EI.UserTE->getMainOp()->hasNUsesOrMore(UsesLimit) ||
             any_of(EI.UserTE->getMainOp()->users(), [&](User *U) {
               auto *IU = dyn_cast<Instruction>(U);
               // ...
               return IU->getParent() == EI.UserTE->getMainOp()->getParent();
             }))))) {
        // Add an artificial dependency to keep the copyable element pinned.
        CD->incDependencies();
        CD->incrementUnscheduledDeps(1);
        // ...
      }
      return;
    }
    auto *BundleMember = cast<ScheduleData>(SE);
    if (BundleMember->hasValidDependencies())
      return;
    LLVM_DEBUG(dbgs() << "SLP: update deps of " << *BundleMember << "\n");
    BundleMember->initDependencies();
    BundleMember->resetUnscheduledDeps();
    // Handle def-use chain dependencies.
    SmallDenseMap<Value *, unsigned> UserToNumOps;
    for (User *U : BundleMember->getInst()->users()) {
      // ...
      if (ScheduleData *UseSD = getScheduleData(U)) {
        // ...
        if (areAllOperandsReplacedByCopyableData(
                /* ... */))
          continue;
        BundleMember->incDependencies();
        if (!UseSD->isScheduled())
          BundleMember->incrementUnscheduledDeps(1);
        if (!UseSD->hasValidDependencies() ||
            (InsertInReadyList && UseSD->isReady()))
          WorkList.push_back(UseSD);
      }
    }
    for (ScheduleCopyableData *UseSD :
         getScheduleCopyableDataUsers(BundleMember->getInst())) {
      BundleMember->incDependencies();
      if (!UseSD->isScheduled())
        BundleMember->incrementUnscheduledDeps(1);
      if (!UseSD->hasValidDependencies() ||
          (InsertInReadyList && UseSD->isReady()))
        WorkList.push_back(UseSD);
    }
    // ...
    SmallPtrSet<const Instruction *, 4> Visited;
    auto MakeControlDependent = [&](Instruction *I) {
      if (!Visited.insert(I).second)
        return;
      auto *DepDest = getScheduleData(I);
      assert(DepDest && "must be in schedule window");
      DepDest->addControlDependency(BundleMember);
      BundleMember->incDependencies();
      if (!DepDest->isScheduled())
        BundleMember->incrementUnscheduledDeps(1);
      if (!DepDest->hasValidDependencies() ||
          (InsertInReadyList && DepDest->isReady()))
        WorkList.push_back(DepDest);
    };
    // Any instruction which isn't safe to speculate at the beginning of the
    // block is control-dependent on any early exit or non-willreturn call
    // which precedes it.
    if (/* ... */) {
      for (Instruction *I = BundleMember->getInst()->getNextNode();
           I != ScheduleEnd; I = I->getNextNode()) {
        // ...
        MakeControlDependent(I);
        // ...
      }
    }
    // In addition, make sure nothing hoists or sinks past
    // stacksave/stackrestore pairs.
    if (RegionHasStackSave) {
      // ...
      if (match(BundleMember->getInst(),
                /* ... */)) {
        for (Instruction *I = BundleMember->getInst()->getNextNode();
             I != ScheduleEnd; I = I->getNextNode()) {
          // ...
          MakeControlDependent(I);
          // ...
        }
      }
      // ...
      if (/* ... */
          BundleMember->getInst()->mayReadOrWriteMemory()) {
        for (Instruction *I = BundleMember->getInst()->getNextNode();
             I != ScheduleEnd; I = I->getNextNode()) {
          // ...
          MakeControlDependent(I);
          // ...
        }
      }
    }
    // Handle the memory dependencies (via the chain of memory accesses).
    ScheduleData *NextLoadStore = BundleMember->getNextLoadStore();
    if (!NextLoadStore)
      return;
    assert(/* ... */
           "NextLoadStore list for non memory effecting bundle?");
    // ...
    unsigned NumAliased = 0;
    unsigned DistToSrc = 1;
    bool IsNonSimpleSrc = !SrcLoc.Ptr || !isSimple(SrcInst);
    for (ScheduleData *DepDest = NextLoadStore; DepDest;
         DepDest = DepDest->getNextLoadStore()) {
      assert(isInSchedulingRegion(*DepDest) && "Expected to be in region");
      // ...
      if (/* ... */
          ((SrcMayWrite || DepDest->getInst()->mayWriteToMemory()) &&
           (IsNonSimpleSrc ||
            SLP->isAliased(SrcLoc, SrcInst, DepDest->getInst())))) {
        // ...
        DepDest->addMemoryDependency(BundleMember);
        BundleMember->incDependencies();
        if (!DepDest->isScheduled())
          BundleMember->incrementUnscheduledDeps(1);
        if (!DepDest->hasValidDependencies() ||
            (InsertInReadyList && DepDest->isReady()))
          WorkList.push_back(DepDest);
      }
      // ...
    }
  };
  // ...
  assert(/* ... */
         "expected at least one instruction to schedule");
  // ...
  WorkList.push_back(Bundle.getBundle().front());
  // ...
  SmallPtrSet<ScheduleBundle *, 16> Visited;
  while (!WorkList.empty()) {
    // ...
    if (auto *CD = dyn_cast<ScheduleCopyableData>(SE)) {
      CopyableBundle.push_back(&CD->getBundle());
      Bundles = CopyableBundle;
    } else {
      Bundles = getScheduleBundles(SD->getInst());
    }
    if (Bundles.empty()) {
      if (!SD->hasValidDependencies())
        ProcessNode(SD);
      if (InsertInReadyList && SD->isReady()) {
        ReadyInsts.insert(SD);
        LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *SD << "\n");
      }
      continue;
    }
    for (ScheduleBundle *Bundle : Bundles) {
      if (Bundle->hasValidDependencies() || !Visited.insert(Bundle).second)
        continue;
      assert(isInSchedulingRegion(*Bundle) &&
             "ScheduleData not in scheduling region");
      for_each(Bundle->getBundle(), ProcessNode);
    }
    if (InsertInReadyList && SD->isReady()) {
      for (ScheduleBundle *Bundle : Bundles) {
        assert(isInSchedulingRegion(*Bundle) &&
               "ScheduleData not in scheduling region");
        if (!Bundle->isReady())
          continue;
        ReadyInsts.insert(Bundle);
        // ...
      }
    }
  }
}
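// Reset all scheduling state inside the region so the block can be
// re-scheduled from scratch after a failed or re-ordered bundle.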
void BoUpSLP::BlockScheduling::resetSchedule() {
  assert(ScheduleStart &&
         "tried to reset schedule on block which has not been scheduled");
  for_each(ScheduleDataMap, [&](auto &P) {
    if (BB != P.first->getParent())
      return;
    ScheduleData *SD = P.second;
    if (isInSchedulingRegion(*SD)) {
      SD->setScheduled(/*Scheduled=*/false);
      SD->resetUnscheduledDeps();
    }
  });
  for_each(ScheduleCopyableDataMapByInst, [&](auto &P) {
    for_each(P.second, [&](ScheduleCopyableData *SD) {
      if (isInSchedulingRegion(*SD)) {
        SD->setScheduled(false);
        SD->resetUnscheduledDeps();
      }
    });
  });
  for_each(ScheduledBundles, [&](auto &P) {
    for_each(P.second, [&](ScheduleBundle *Bundle) {
      if (isInSchedulingRegion(*Bundle))
        Bundle->setScheduled(false);
    });
  });
  for (auto &P : ScheduleCopyableDataMap) {
    if (isInSchedulingRegion(*P.second)) {
      P.second->setScheduled(false);
      P.second->resetUnscheduledDeps();
    }
  }
  ReadyInsts.clear();
}
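// Top-down list scheduling of one basic block: assign priorities in source
// order, seed the ready list, then repeatedly pick the highest-priority
// ready entity and move its instructions just before LastScheduledInst.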
void BoUpSLP::scheduleBlock(const BoUpSLP &R, BlockScheduling *BS) {
  if (!BS->ScheduleStart)
    return;
  LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");
  // ...
  BS->resetSchedule();
  // For the real scheduling we use a more sophisticated ready-list: it is
  // sorted by the original instruction location. This lets the final schedule
  // be as close as possible to the original instruction order.
  struct ScheduleDataCompare {
    bool operator()(const ScheduleEntity *SD1,
                    const ScheduleEntity *SD2) const {
      return SD2->getSchedulingPriority() < SD1->getSchedulingPriority();
    }
  };
  std::set<ScheduleEntity *, ScheduleDataCompare> ReadyInsts;
  // Ensure that all dependency data is updated and fill the ready-list with
  // initial instructions.
  int Idx = 0;
  for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
       I = I->getNextNode()) {
    // ...
    if (!Bundles.empty()) {
      for (ScheduleBundle *Bundle : Bundles) {
        Bundle->setSchedulingPriority(Idx++);
        if (!Bundle->hasValidDependencies())
          BS->calculateDependencies(*Bundle, /*InsertInReadyList=*/false,
                                    this);
      }
      // ...
      for (ScheduleCopyableData *SD : reverse(SDs)) {
        ScheduleBundle &Bundle = SD->getBundle();
        Bundle.setSchedulingPriority(Idx++);
        if (!Bundle.hasValidDependencies())
          BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this);
      }
      continue;
    }
    // ...
    BS->getScheduleCopyableDataUsers(I);
    if (ScheduleData *SD = BS->getScheduleData(I)) {
      assert((/* ... */
              SDTEs.front()->doesNotNeedToSchedule() ||
              /* ... */) &&
             "scheduler and vectorizer bundle mismatch");
      SD->setSchedulingPriority(Idx++);
      if (!SD->hasValidDependencies() &&
          (!CopyableData.empty() ||
           any_of(R.ValueToGatherNodes.lookup(I), [&](const TreeEntry *TE) {
             assert(TE->isGather() && "expected gather node");
             return TE->hasState() && TE->hasCopyableElements() &&
                    TE->isCopyableElement(I);
           }))) {
        // ...
        ScheduleBundle Bundle;
        // ...
        BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this);
      }
    }
    // ...
    for (ScheduleCopyableData *SD : reverse(CopyableData)) {
      ScheduleBundle &Bundle = SD->getBundle();
      Bundle.setSchedulingPriority(Idx++);
      if (!Bundle.hasValidDependencies())
        BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this);
    }
  }
  BS->initialFillReadyList(ReadyInsts);
  // ...
  Instruction *LastScheduledInst = BS->ScheduleEnd;
  // Do the "real" scheduling.
  SmallPtrSet<Instruction *, 16> Scheduled;
  while (!ReadyInsts.empty()) {
    auto *Picked = *ReadyInsts.begin();
    ReadyInsts.erase(ReadyInsts.begin());
    // Move the scheduled instruction(s) to their dedicated places, if not
    // there yet.
    if (auto *Bundle = dyn_cast<ScheduleBundle>(Picked)) {
      for (const ScheduleEntity *BundleMember : Bundle->getBundle()) {
        Instruction *PickedInst = BundleMember->getInst();
        // ...
        bool IsCopyable =
            Bundle->getTreeEntry()->isCopyableElement(PickedInst);
        if ((IsCopyable && BS->getScheduleData(PickedInst)) ||
            (!IsCopyable && !Scheduled.insert(PickedInst).second))
          continue;
        if (PickedInst->getNextNode() != LastScheduledInst)
          PickedInst->moveBefore(LastScheduledInst->getIterator());
        LastScheduledInst = PickedInst;
        // ...
        EntryToLastInstruction.try_emplace(Bundle->getTreeEntry(),
                                           LastScheduledInst);
      }
    } else {
      // ...
      if (PickedInst->getNextNode() != LastScheduledInst)
        PickedInst->moveBefore(LastScheduledInst->getIterator());
      LastScheduledInst = PickedInst;
    }
    auto Invalid = InstructionsState::invalid();
    // ...
  }
  // Check that we didn't break any of our invariants.
#ifdef EXPENSIVE_CHECKS
  // ...
#endif
#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
  // Check that all schedulable entities got scheduled.
  for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
       I = I->getNextNode()) {
    // ...
    assert(all_of(/* ... */,
                  [](const ScheduleBundle *Bundle) {
                    return Bundle->isScheduled();
                  }) &&
           "must be scheduled at this point");
  }
#endif
  // Avoid duplicate scheduling of the block.
  BS->ScheduleStart = nullptr;
}
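// Compute the widest element type feeding the chain rooted at V: for stores
// this is simply the stored type; otherwise a bounded walk over the in-block
// users/operands tracks the maximum scalar width seen (skipping i1 values).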
unsigned BoUpSLP::getVectorElementSize(Value *V) {
  // If V is a store, just return the width of the stored value.
  if (auto *Store = dyn_cast<StoreInst>(V))
    return DL->getTypeSizeInBits(Store->getValueOperand()->getType());
  // ...
  auto E = InstrElementSize.find(V);
  if (E != InstrElementSize.end())
    return E->second;
  // ... (worklist/visited-set setup elided)
  Value *FirstNonBool = nullptr;
  while (!Worklist.empty()) {
    // ...
    auto *Ty = I->getType();
    // ...
    if (Ty != Builder.getInt1Ty() && !FirstNonBool)
      FirstNonBool = I;
    // ...
    Width = std::max<unsigned>(Width, DL->getTypeSizeInBits(Ty));
    // ...
    for (Use &U : I->operands()) {
      // ...
      if (Visited.insert(J).second &&
          /* ... */)
        Worklist.push_back(/* ... */);
      // ...
      FirstNonBool = U.get();
    }
  }
  // ...
  if (V->getType() == Builder.getInt1Ty() && FirstNonBool)
    V = FirstNonBool;
  // ...
  Width = DL->getTypeSizeInBits(V->getType());
  // ...
  InstrElementSize[V] = Width;
  return Width;
}
bool BoUpSLP::collectValuesToDemote(
    const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
    /* ... */
    bool &IsProfitableToDemote, bool IsTruncRoot) const {
  // ...
  unsigned OrigBitWidth =
      DL->getTypeSizeInBits(E.Scalars.front()->getType()->getScalarType());
  // ...
  bool IsSignedNode = any_of(E.Scalars, [&](Value *R) {
    if (isa<PoisonValue>(R))
      return false;
    return !isKnownNonNegative(R, SimplifyQuery(*DL));
  });
  auto IsPotentiallyTruncated = [&](Value *V, unsigned &BitWidth) -> bool {
    // ...
    if (getTreeEntries(V).size() > 1)
      return false;
    // ...
    if ((!IsSignedNode || IsSignedVal) && OrigBitWidth > BitWidth) {
      // ...
      unsigned BitWidth1 = OrigBitWidth - NumSignBits;
      // ...
      APInt Mask = DB->getDemandedBits(I);
      unsigned BitWidth2 =
          std::max<unsigned>(1, Mask.getBitWidth() - Mask.countl_zero());
      while (!IsSignedNode && BitWidth2 < OrigBitWidth) {
        // ...
      }
      BitWidth1 = std::min(BitWidth1, BitWidth2);
      // ...
    }
    // ...
  };
  auto FinalAnalysis = [&, TTI = TTI]() {
    if (!IsProfitableToDemote)
      return false;
    const bool Res = all_of(
        E.Scalars, std::bind(IsPotentiallyTruncated, _1, std::ref(BitWidth)));
    // ...
    if (Res && E.isGather()) {
      if (E.hasState()) {
        if (const TreeEntry *SameTE =
                getSameValuesTreeEntry(E.getMainOp(), E.Scalars);
            /* ... */) {
          if (collectValuesToDemote(*SameTE, IsProfitableToDemoteRoot,
                                    BitWidth,
                                    ToDemote, Visited, NodesToKeepBWs,
                                    MaxDepthLevel, IsProfitableToDemote,
                                    IsTruncRoot))
            return true;
        }
      }
      // Check if the gather consists of extractelements from few bases only.
      SmallPtrSet<Value *, 4> UniqueBases;
      for (Value *V : E.Scalars) {
        // ...
        UniqueBases.insert(EE->getVectorOperand());
      }
      const unsigned VF = E.Scalars.size();
      Type *OrigScalarTy = E.Scalars.front()->getType();
      if (UniqueBases.size() <= 2 ||
          /* ... */)
        return true;
      // ...
    }
    return Res;
  };
  if (E.isGather() || !Visited.insert(&E).second ||
      any_of(E.Scalars, [&](Value *V) {
        return !isa<Constant>(V) && all_of(V->users(), [&](User *U) {
          return isa<InsertElementInst>(U) && !isVectorized(U);
        });
      }))
    return FinalAnalysis();
  if (any_of(E.Scalars, [&](Value *V) {
        return !isa<Constant>(V) && !all_of(V->users(), [=](User *U) {
          return isVectorized(U) ||
                 (E.Idx == 0 && UserIgnoreList &&
                  UserIgnoreList->contains(U)) ||
                 (!isa<CmpInst>(U) && U->getType()->isSized() &&
                  !U->getType()->isScalableTy() &&
                  DL->getTypeSizeInBits(U->getType()) <= BitWidth);
        }) && !IsPotentiallyTruncated(V, BitWidth);
      }))
    return false;
  // ...
  auto ProcessOperands = [&](ArrayRef<const TreeEntry *> Operands,
                             bool &NeedToExit) {
    NeedToExit = false;
    unsigned InitLevel = MaxDepthLevel;
    for (const TreeEntry *Op : Operands) {
      unsigned Level = InitLevel;
      if (!collectValuesToDemote(*Op, IsProfitableToDemoteRoot, BitWidth,
                                 ToDemote, Visited, NodesToKeepBWs, Level,
                                 IsProfitableToDemote, IsTruncRoot)) {
        if (!IsProfitableToDemote)
          return false;
        NeedToExit = true;
        if (!FinalAnalysis())
          return false;
        continue;
      }
      MaxDepthLevel = std::max(MaxDepthLevel, Level);
    }
    return true;
  };
  auto AttemptCheckBitwidth =
      [&](function_ref<bool(unsigned, unsigned)> Checker, bool &NeedToExit) {
        // Try all bitwidths < OrigBitWidth.
        NeedToExit = false;
        unsigned BestFailBitwidth = 0;
        for (; BitWidth < OrigBitWidth; BitWidth *= 2) {
          if (Checker(BitWidth, OrigBitWidth))
            return true;
          if (BestFailBitwidth == 0 && FinalAnalysis())
            BestFailBitwidth = BitWidth;
        }
        if (BestFailBitwidth == 0) {
          // ...
          return false;
        }
        // ...
      };
  auto TryProcessInstruction =
      [&](unsigned &BitWidth, ArrayRef<const TreeEntry *> Operands = {},
          function_ref<bool(unsigned, unsigned)> Checker = {}) {
        if (Operands.empty()) {
          // ...
          for (Value *V : E.Scalars)
            (void)IsPotentiallyTruncated(V, BitWidth);
        } else {
          // Several vectorized uses? Check if we can truncate it, otherwise
          // exit.
          if (any_of(E.Scalars, [&](Value *V) {
                return !V->hasOneUse() &&
                       !IsPotentiallyTruncated(V, BitWidth);
              }))
            return false;
          bool NeedToExit = false;
          if (Checker && !AttemptCheckBitwidth(Checker, NeedToExit))
            return false;
          // ...
          if (!ProcessOperands(Operands, NeedToExit))
            return false;
          // ...
        }
        // ... (record the entry that we can demote)
        return IsProfitableToDemote;
      };
  // ...
  if (E.State == TreeEntry::SplitVectorize)
    return TryProcessInstruction(
        BitWidth,
        {VectorizableTree[E.CombinedEntriesWithIndices.front().first].get(),
         VectorizableTree[E.CombinedEntriesWithIndices.back().first].get()});
  // ...
  if (E.isAltShuffle()) {
    // Shifts and div/rem are unsafe to demote through an alt-shuffle, since
    // the narrowed alternate op may change the result.
    auto IsDangerousOpcode = [](unsigned Opcode) {
      switch (Opcode) {
      case Instruction::Shl:
      case Instruction::AShr:
      case Instruction::LShr:
      case Instruction::UDiv:
      case Instruction::SDiv:
      case Instruction::URem:
      case Instruction::SRem:
        return true;
      default:
        break;
      }
      return false;
    };
    if (IsDangerousOpcode(E.getAltOpcode()))
      return FinalAnalysis();
    // ...
  }
  switch (E.getOpcode()) {
  // We can always demote truncations and extensions. Since truncations can
  // seed additional demotion, we save the truncated value.
  case Instruction::Trunc:
    if (IsProfitableToDemoteRoot)
      IsProfitableToDemote = true;
    return TryProcessInstruction(BitWidth);
  case Instruction::ZExt:
  case Instruction::SExt:
    if (E.UserTreeIndex.UserTE && E.UserTreeIndex.UserTE->hasState() &&
        E.UserTreeIndex.UserTE->getOpcode() == Instruction::BitCast &&
        E.UserTreeIndex.UserTE->getMainOp()->getType()->isFPOrFPVectorTy())
      return false;
    IsProfitableToDemote = true;
    return TryProcessInstruction(BitWidth);
  // During the signed integer operations we still can demote the operands.
  case Instruction::Add:
  case Instruction::Sub:
  case Instruction::Mul:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)});
  }
  case Instruction::Freeze:
    return TryProcessInstruction(BitWidth, getOperandEntry(&E, 0));
  case Instruction::Shl: {
    // If we are truncating the result of the SHL, and if it's a shift of a
    // constant amount, we can always perform a SHL in a smaller type.
    auto ShlChecker = [&](unsigned BitWidth, unsigned) {
      return all_of(E.Scalars, [&](Value *V) {
        if (isa<PoisonValue>(V))
          return true;
        if (E.isCopyableElement(V))
          return true;
        auto *I = cast<Instruction>(V);
        KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
        return AmtKnownBits.getMaxValue().ult(BitWidth);
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
        ShlChecker);
  }
  case Instruction::LShr: {
    // We can truncate a logical shr to a smaller lshr iff we know that the
    // bits we would otherwise be shifting in are already zeros.
    auto LShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(E.Scalars, [&](Value *V) {
        if (isa<PoisonValue>(V))
          return true;
        APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
        if (E.isCopyableElement(V))
          return MaskedValueIsZero(V, ShiftedBits, SimplifyQuery(*DL));
        auto *I = cast<Instruction>(V);
        KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
        return AmtKnownBits.getMaxValue().ult(BitWidth) &&
               MaskedValueIsZero(I->getOperand(0), ShiftedBits,
                                 SimplifyQuery(*DL));
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
        LShrChecker);
  }
  case Instruction::AShr: {
    // We can truncate an arithmetic shr to a smaller ashr iff all the bits
    // we would otherwise be shifting in are sign bits.
    auto AShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(E.Scalars, [&](Value *V) {
        if (isa<PoisonValue>(V))
          return true;
        // ...
        auto *I = cast<Instruction>(V);
        KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
        unsigned ShiftedBits = OrigBitWidth - BitWidth;
        return AmtKnownBits.getMaxValue().ult(BitWidth) &&
               ShiftedBits <
                   ComputeNumSignBits(I->getOperand(0), *DL, AC, nullptr, DT);
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
        AShrChecker);
  }
  case Instruction::UDiv:
  case Instruction::URem: {
    // UDiv and URem can be truncated if all the truncated bits are zero.
    auto Checker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(E.Scalars, [&](Value *V) {
        APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
        if (E.hasCopyableElements() && E.isCopyableElement(V))
          return MaskedValueIsZero(V, Mask, SimplifyQuery(*DL));
        auto *I = cast<Instruction>(V);
        return MaskedValueIsZero(I->getOperand(0), Mask,
                                 SimplifyQuery(*DL)) &&
               MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, Checker);
  }
  // We can demote selects if we can demote their true and false values.
  case Instruction::Select: {
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 1), getOperandEntry(&E, 2)});
  }
  // We can demote phis if we can demote all their incoming operands.
  case Instruction::PHI: {
    const unsigned NumOps = E.getNumOperands();
    // ... (collect all operand entries:)
    //   [&](unsigned Idx) { return getOperandEntry(&E, Idx); }
    return TryProcessInstruction(BitWidth, Ops);
  }
  case Instruction::Call: {
    // ...
    if (ID != Intrinsic::abs && ID != Intrinsic::smin &&
        ID != Intrinsic::smax && ID != Intrinsic::umin &&
        ID != Intrinsic::umax)
      break;
    // ...
    function_ref<bool(unsigned, unsigned)> CallChecker;
    auto CompChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(E.Scalars, [&](Value *V) {
        auto *I = cast<Instruction>(V);
        if (ID == Intrinsic::umin || ID == Intrinsic::umax) {
          APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
          return MaskedValueIsZero(I->getOperand(0), Mask,
                                   SimplifyQuery(*DL)) &&
                 MaskedValueIsZero(I->getOperand(1), Mask,
                                   SimplifyQuery(*DL));
        }
        assert((ID == Intrinsic::smin || ID == Intrinsic::smax) &&
               "Expected min/max intrinsics only.");
        unsigned SignBits = OrigBitWidth - BitWidth;
        // ...
        unsigned Op0SignBits =
            ComputeNumSignBits(I->getOperand(0), *DL, AC, nullptr, DT);
        unsigned Op1SignBits =
            ComputeNumSignBits(I->getOperand(1), *DL, AC, nullptr, DT);
        return SignBits <= Op0SignBits &&
               ((SignBits != Op0SignBits &&
                 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
                MaskedValueIsZero(I->getOperand(0), Mask,
                                  SimplifyQuery(*DL))) &&
               SignBits <= Op1SignBits &&
               ((SignBits != Op1SignBits &&
                 !isKnownNonNegative(I->getOperand(1), SimplifyQuery(*DL))) ||
                MaskedValueIsZero(I->getOperand(1), Mask,
                                  SimplifyQuery(*DL)));
      });
    };
    auto AbsChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(E.Scalars, [&](Value *V) {
        auto *I = cast<Instruction>(V);
        unsigned SignBits = OrigBitWidth - BitWidth;
        APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
        unsigned Op0SignBits =
            ComputeNumSignBits(I->getOperand(0), *DL, AC, nullptr, DT);
        return SignBits <= Op0SignBits &&
               ((SignBits != Op0SignBits &&
                 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
                MaskedValueIsZero(I->getOperand(0), Mask,
                                  SimplifyQuery(*DL)));
      });
    };
    if (ID != Intrinsic::abs) {
      Operands.push_back(getOperandEntry(&E, 1));
      CallChecker = CompChecker;
    } else {
      CallChecker = AbsChecker;
    }
    InstructionCost BestCost =
        std::numeric_limits<InstructionCost::CostType>::max();
    // ...
    unsigned VF = E.Scalars.size();
    // Choose the best bitwidth based on cost estimations.
    auto Checker = [&](unsigned BitWidth, unsigned) {
      // ... (query the intrinsic cost at this width)
      if (Cost < BestCost) {
        BestCost = Cost;
        // ...
      }
      return false;
    };
    [[maybe_unused]] bool NeedToExit;
    (void)AttemptCheckBitwidth(Checker, NeedToExit);
    // ...
    return TryProcessInstruction(BitWidth, Operands, CallChecker);
  }
  // Otherwise, conservatively give up for now.
  default:
    break;
  }
  MaxDepthLevel = 1;
  return FinalAnalysis();
}
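// Bitwidth minimization, step 2 (BoUpSLP::computeMinimumValueSizes): compute
// the maximal bit width actually required per subtree, fill MinBWs, and
// derive ReductionBitWidth for reduction-rooted trees.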
  // We only attempt to truncate bottom-up trees rooted by stores or
  // insertelements.
  bool IsStoreOrInsertElt =
      VectorizableTree.front()->hasState() &&
      (VectorizableTree.front()->getOpcode() == Instruction::Store ||
       VectorizableTree.front()->getOpcode() == Instruction::InsertElement);
  if ((IsStoreOrInsertElt || UserIgnoreList) &&
      ExtraBitWidthNodes.size() <= 1 &&
      (!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 ||
       CastMaxMinBWSizes->first / CastMaxMinBWSizes->second <= 2))
    return;
  unsigned NodeIdx = 0;
  if (IsStoreOrInsertElt && !VectorizableTree.front()->isGather())
    NodeIdx = 1;
  // Ensure the roots of the vectorizable tree don't form a cycle.
  assert((VectorizableTree[NodeIdx]->isGather() || NodeIdx != 0 ||
          !VectorizableTree[NodeIdx]->UserTreeIndex) &&
         "Unexpected tree is graph.");
  // ...
  bool IsTruncRoot = false;
  bool IsProfitableToDemoteRoot = !IsStoreOrInsertElt;
  // ...
  if (NodeIdx != 0 &&
      VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
      VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
    assert(IsStoreOrInsertElt && "Expected store/insertelement seeded graph.");
    IsTruncRoot = true;
    // ...
    IsProfitableToDemoteRoot = true;
    ++NodeIdx;
  }
  // Analyzed the reduction already and not profitable - exit.
  if (AnalyzedMinBWVals.contains(VectorizableTree[NodeIdx]->Scalars.front()))
    return;
  // ...
  auto ComputeMaxBitWidth =
      [&](const TreeEntry &E, bool IsTopRoot, bool IsProfitableToDemoteRoot,
          unsigned Limit, bool IsTruncRoot, bool IsSignedCmp) -> unsigned {
    // ...
    if (E.isGather() && IsTruncRoot && E.UserTreeIndex &&
        !NodesToKeepBWs.contains(E.Idx) &&
        E.Idx > (IsStoreOrInsertElt ? 2u : 1u) &&
        all_of(E.Scalars, [&](Value *V) {
          return V->hasOneUse() || isa<Constant>(V) ||
                 (!V->hasNUsesOrMore(UsesLimit) &&
                  none_of(V->users(), [&](User *U) {
                    ArrayRef<TreeEntry *> TEs = getTreeEntries(U);
                    const TreeEntry *UserTE = E.UserTreeIndex.UserTE;
                    if (TEs.empty() || is_contained(TEs, UserTE))
                      return false;
                    if (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
                             SelectInst>(U) ||
                        isa<SIToFPInst, UIToFPInst>(U) ||
                        (UserTE->hasState() &&
                         (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
                               SelectInst>(UserTE->getMainOp()) ||
                          isa<SIToFPInst, UIToFPInst>(UserTE->getMainOp()))))
                      return true;
                    unsigned UserTESz = DL->getTypeSizeInBits(
                        UserTE->Scalars.front()->getType());
                    if (all_of(TEs, [&](const TreeEntry *TE) {
                          auto It = MinBWs.find(TE);
                          return It != MinBWs.end() &&
                                 It->second.first > UserTESz;
                        }))
                      return true;
                    return DL->getTypeSizeInBits(U->getType()) > UserTESz;
                  }));
        })) {
      // ...
      const TreeEntry *UserTE = E.UserTreeIndex.UserTE;
      auto It = MinBWs.find(UserTE);
      if (It != MinBWs.end())
        return It->second.first;
      unsigned MaxBitWidth =
          DL->getTypeSizeInBits(UserTE->Scalars.front()->getType());
      MaxBitWidth = bit_ceil(MaxBitWidth);
      if (MaxBitWidth < 8 && MaxBitWidth > 1)
        MaxBitWidth = 8;
      return MaxBitWidth;
    }
    // ...
    unsigned VF = E.getVectorFactor();
    Type *ScalarTy = E.Scalars.front()->getType();
    // ...
    if (any_of(E.Scalars,
               [&](Value *V) { return AnalyzedMinBWVals.contains(V); }))
      return 0u;
    // ...
    unsigned MaxBitWidth = 1u;
    // If all the bits of the roots are demanded, we can try a little harder
    // to compute a narrower type. Check whether the values are known
    // non-negative first.
    bool IsKnownPositive = !IsSignedCmp && all_of(E.Scalars, [&](Value *R) {
      if (isa<PoisonValue>(R))
        return true;
      KnownBits Known = computeKnownBits(R, *DL);
      return Known.isNonNegative();
    });
    // ...
    if (!IsKnownPositive && !IsTopRoot && E.UserTreeIndex &&
        E.UserTreeIndex.UserTE->hasState() &&
        E.UserTreeIndex.UserTE->getOpcode() == Instruction::UIToFP)
      MaxBitWidth =
          std::min(DL->getTypeSizeInBits(
                       E.UserTreeIndex.UserTE->Scalars.front()->getType()),
                   DL->getTypeSizeInBits(ScalarTy));
    // ...
    for (Value *Root : E.Scalars) {
      // ...
      unsigned BitWidth1 = NumTypeBits - NumSignBits;
      // ...
      if (!IsKnownPositive)
        ++BitWidth1;
      // ...
      MaxBitWidth = std::max(BitWidth1, MaxBitWidth);
      // ...
      APInt Mask = DB->getDemandedBits(I);
      unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
      MaxBitWidth =
          std::max<unsigned>(std::min(BitWidth1, BitWidth2), MaxBitWidth);
    }
    // ...
    if (MaxBitWidth < 8 && MaxBitWidth > 1)
      MaxBitWidth = 8;
    // ...
    if (NumParts > 1 &&
        /* ... */)
      return 0u;
    // ...
    unsigned Opcode = E.getOpcode();
    bool IsProfitableToDemote = Opcode == Instruction::Trunc ||
                                Opcode == Instruction::SExt ||
                                Opcode == Instruction::ZExt || NumParts > 1;
    // Conservatively determine if we can actually truncate the roots of the
    // expression.
    // ...
    unsigned MaxDepthLevel = IsTruncRoot ? Limit : 1;
    bool NeedToDemote = IsProfitableToDemote;
    if (!collectValuesToDemote(E, IsProfitableToDemoteRoot, MaxBitWidth,
                               ToDemote, Visited, NodesToKeepBWs,
                               MaxDepthLevel, NeedToDemote, IsTruncRoot) ||
        (MaxDepthLevel <= Limit &&
         !(((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
            (!IsTopRoot || !(IsStoreOrInsertElt || UserIgnoreList) ||
             DL->getTypeSizeInBits(TreeRootIT) /
                     DL->getTypeSizeInBits(
                         E.getMainOp()->getOperand(0)->getType()) >
                 2)))))
      return 0u;
    // Round MaxBitWidth up to the next power-of-two.
    MaxBitWidth = bit_ceil(MaxBitWidth);
    return MaxBitWidth;
  };
  // If we can truncate the root, we must collect additional values that
  // might be demoted as a result, i.e. those seeded by truncations we will
  // modify.
  if (UserIgnoreList &&
      /* ... */) {
    // x i1 reductions: a ZExt root from i1 keeps the reduction at 1 bit.
    if (all_of(*UserIgnoreList,
               /* ... */) &&
        VectorizableTree.front()->State == TreeEntry::Vectorize &&
        VectorizableTree.front()->getOpcode() == Instruction::ZExt &&
        cast<CastInst>(VectorizableTree.front()->getMainOp())->getSrcTy() ==
            Builder.getInt1Ty()) {
      ReductionBitWidth = 1;
    } else {
      for (Value *V : *UserIgnoreList) {
        // ...
        TypeSize NumTypeBits = DL->getTypeSizeInBits(V->getType());
        unsigned BitWidth1 = NumTypeBits - NumSignBits;
        // ...
        unsigned BitWidth2 = BitWidth1;
        // ...
        BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
        // ...
        ReductionBitWidth =
            std::max(std::min(BitWidth1, BitWidth2), ReductionBitWidth);
      }
      if (ReductionBitWidth < 8 && ReductionBitWidth > 1)
        ReductionBitWidth = 8;
      ReductionBitWidth = bit_ceil(ReductionBitWidth);
    }
  }
  bool IsTopRoot = NodeIdx == 0;
  while (NodeIdx < VectorizableTree.size() &&
         VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
         VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
    RootDemotes.push_back(NodeIdx);
    ++NodeIdx;
    IsTruncRoot = true;
  }
  bool IsSignedCmp = false;
  if (UserIgnoreList &&
      /* ... */)
    IsSignedCmp = true;
  while (NodeIdx < VectorizableTree.size()) {
    // ...
    unsigned Limit = 2;
    if (/* ... */
        ReductionBitWidth ==
            DL->getTypeSizeInBits(
                VectorizableTree.front()->Scalars.front()->getType()))
      // ...
23474 unsigned MaxBitWidth = ComputeMaxBitWidth(
23475 *VectorizableTree[NodeIdx], IsTopRoot, IsProfitableToDemoteRoot, Limit,
23476 IsTruncRoot, IsSignedCmp);
23477 if (ReductionBitWidth != 0 && (IsTopRoot || !RootDemotes.empty())) {
23478 if (MaxBitWidth != 0 && ReductionBitWidth < MaxBitWidth)
23479 ReductionBitWidth =
bit_ceil(MaxBitWidth);
23480 else if (MaxBitWidth == 0)
23481 ReductionBitWidth = 0;
23484 for (
unsigned Idx : RootDemotes) {
23485 if (
all_of(VectorizableTree[Idx]->Scalars, [&](
Value *V) {
23486 uint32_t OrigBitWidth =
23487 DL->getTypeSizeInBits(
V->getType()->getScalarType());
23488 if (OrigBitWidth > MaxBitWidth) {
23496 RootDemotes.clear();
23498 IsProfitableToDemoteRoot =
true;
23500 if (ExtraBitWidthNodes.empty()) {
23501 NodeIdx = VectorizableTree.size();
23503 unsigned NewIdx = 0;
23505 NewIdx = *ExtraBitWidthNodes.begin();
23506 ExtraBitWidthNodes.erase(ExtraBitWidthNodes.begin());
23507 }
while (NewIdx <= NodeIdx && !ExtraBitWidthNodes.empty());
23510 NodeIdx < VectorizableTree.size() &&
23511 VectorizableTree[NodeIdx]->UserTreeIndex &&
23512 VectorizableTree[NodeIdx]->UserTreeIndex.EdgeIdx == 0 &&
23513 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->hasState() &&
23514 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->getOpcode() ==
23515 Instruction::Trunc &&
23516 !VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->isAltShuffle();
23518 NodeIdx < VectorizableTree.size() &&
23519 VectorizableTree[NodeIdx]->UserTreeIndex &&
23520 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->hasState() &&
23521 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->getOpcode() ==
23522 Instruction::ICmp &&
23524 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->Scalars,
23526 auto *IC = dyn_cast<ICmpInst>(V);
23527 return IC && (IC->isSigned() ||
23528 !isKnownNonNegative(IC->getOperand(0),
23529 SimplifyQuery(*DL)) ||
23530 !isKnownNonNegative(IC->getOperand(1),
23531 SimplifyQuery(*DL)));
23537 if (MaxBitWidth == 0 ||
23541 if (UserIgnoreList)
23542 AnalyzedMinBWVals.insert_range(TreeRoot);
23549 for (
unsigned Idx : ToDemote) {
23550 TreeEntry *
TE = VectorizableTree[Idx].get();
23551 if (MinBWs.contains(TE))
23554 if (isa<PoisonValue>(R))
23556 return !isKnownNonNegative(R, SimplifyQuery(*DL));
23558 MinBWs.try_emplace(TE, MaxBitWidth, IsSigned);
23599 DL = &
F.getDataLayout();
23607 if (!
TTI->getNumberOfRegisters(
TTI->getRegisterClassForType(
true))) {
23609 dbgs() <<
"SLP: Didn't find any vector registers for target, abort.\n");
23614 if (
F.hasFnAttribute(Attribute::NoImplicitFloat))
23617 LLVM_DEBUG(
dbgs() <<
"SLP: Analyzing blocks in " <<
F.getName() <<
".\n");
23621 BoUpSLP R(&
F,
SE,
TTI,
TLI,
AA,
LI,
DT,
AC,
DB,
DL, ORE_);
23627 DT->updateDFSNumbers();
23630 for (
auto *BB :
post_order(&
F.getEntryBlock())) {
23635 R.clearReductionData();
23636 collectSeedInstructions(BB);
23639 if (!Stores.empty()) {
23641 <<
" underlying objects.\n");
23642 Changed |= vectorizeStoreChains(R);
23646 Changed |= vectorizeChainsInBlock(BB, R);
23651 if (!GEPs.empty()) {
23653 <<
" underlying objects.\n");
23654 Changed |= vectorizeGEPIndices(BB, R);
23659 R.optimizeGatherSequence();
23667 unsigned Idx,
unsigned MinVF,
23672 const unsigned Sz = R.getVectorElementSize(Chain[0]);
23673 unsigned VF = Chain.
size();
23679 VF < 2 || VF < MinVF) {
23687 LLVM_DEBUG(
dbgs() <<
"SLP: Analyzing " << VF <<
" stores at offset " << Idx
23691 for (
Value *V : Chain)
23694 InstructionsCompatibilityAnalysis
Analysis(*DT, *
DL, *
TTI, *TLI);
23695 InstructionsState S =
Analysis.buildInstructionsState(
23699 bool IsAllowedSize =
23703 if ((!IsAllowedSize && S && S.getOpcode() != Instruction::Load &&
23704 (!S.getMainOp()->isSafeToRemove() ||
23707 return !isa<ExtractElementInst>(V) &&
23708 (V->getNumUses() > Chain.size() ||
23709 any_of(V->users(), [&](User *U) {
23710 return !Stores.contains(U);
23713 (ValOps.
size() > Chain.size() / 2 && !S)) {
23714 Size = (!IsAllowedSize && S) ? 1 : 2;
23718 if (
R.isLoadCombineCandidate(Chain))
23720 R.buildTree(Chain);
23722 if (
R.isTreeTinyAndNotFullyVectorizable()) {
23723 if (
R.isGathered(Chain.front()) ||
23725 return std::nullopt;
23726 Size =
R.getCanonicalGraphSize();
23729 if (
R.isProfitableToReorder()) {
23730 R.reorderTopToBottom();
23731 R.reorderBottomToTop();
23733 R.transformNodes();
23734 R.computeMinimumValueSizes();
23737 R.buildExternalUses();
23739 Size =
R.getCanonicalGraphSize();
23740 if (S && S.getOpcode() == Instruction::Load)
23748 using namespace ore;
23750 R.getORE()->emit(OptimizationRemark(
SV_NAME,
"StoresVectorized",
23752 <<
"Stores SLP vectorized with cost " <<
NV(
"Cost",
Cost)
23753 <<
" and with tree size "
23754 <<
NV(
"TreeSize",
R.getTreeSize()));
23768 Sizes.begin(), Sizes.end(),
static_cast<uint64_t>(0),
23769 [&](
uint64_t V,
const std::pair<unsigned, unsigned> &Val) {
23770 unsigned Size = First ? Val.first : Val.second;
23782 Sizes.begin(), Sizes.end(),
static_cast<uint64_t>(0),
23783 [&](
uint64_t V,
const std::pair<unsigned, unsigned> &Val) {
23784 unsigned P = First ? Val.first : Val.second;
23787 return V + (P - Mean) * (P - Mean);
23790 return Dev * 96 / (Mean * Mean) == 0;
23798class RelatedStoreInsts {
23801 : AllStores(AllStores) {
23802 reset(BaseInstrIdx);
23805 void reset(
unsigned NewBaseInstr) {
23806 assert(NewBaseInstr < AllStores.size() &&
23807 "Instruction index out of bounds");
23808 BaseInstrIdx = NewBaseInstr;
23810 insertOrLookup(NewBaseInstr, 0);
23817 std::optional<unsigned> insertOrLookup(
unsigned InstrIdx, int64_t PtrDist) {
23818 auto [It,
Inserted] = Instrs.emplace(PtrDist, InstrIdx);
23819 return Inserted ? std::nullopt : std::make_optional(It->second);
23822 using DistToInstMap = std::map<int64_t, unsigned>;
23823 const DistToInstMap &getStores()
const {
return Instrs; }
23827 std::optional<int64_t> getPointerDiff(StoreInst &SI,
const DataLayout &
DL,
23828 ScalarEvolution &SE)
const {
23829 StoreInst &BaseStore = *AllStores[BaseInstrIdx];
23832 SI.getValueOperand()->getType(),
SI.getPointerOperand(),
DL, SE,
23838 void rebase(
unsigned MinSafeIdx,
unsigned NewBaseInstIdx,
23839 int64_t DistFromCurBase) {
23840 DistToInstMap PrevSet = std::move(Instrs);
23841 reset(NewBaseInstIdx);
23846 for (
auto [Dist, InstIdx] : PrevSet) {
23847 if (InstIdx >= MinSafeIdx)
23848 insertOrLookup(InstIdx, Dist - DistFromCurBase);
23854 DistToInstMap::reverse_iterator LastVectorizedStore =
find_if(
23855 reverse(Instrs), [&](
const std::pair<int64_t, unsigned> &DistAndIdx) {
23856 return VectorizedStores.
contains(AllStores[DistAndIdx.second]);
23861 DistToInstMap::iterator VectorizedStoresEnd = LastVectorizedStore.base();
23862 Instrs.erase(Instrs.begin(), VectorizedStoresEnd);
23867 unsigned BaseInstrIdx;
23870 DistToInstMap Instrs;
23878bool SLPVectorizerPass::vectorizeStores(
23880 DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>>
23887 auto TryToVectorize = [&](
const RelatedStoreInsts::DistToInstMap &StoreSeq) {
23888 int64_t PrevDist = -1;
23892 auto &[Dist, InstIdx] =
Data;
23893 if (Operands.
empty() || Dist - PrevDist == 1) {
23896 if (Idx != StoreSeq.size() - 1)
23905 if (Operands.
size() <= 1 ||
23907 .
insert({Operands.front(),
23908 cast<StoreInst>(Operands.front())->getValueOperand(),
23910 cast<StoreInst>(Operands.back())->getValueOperand(),
23915 unsigned MaxVecRegSize =
R.getMaxVecRegSize();
23916 unsigned EltSize =
R.getVectorElementSize(Operands[0]);
23920 std::min(
R.getMaximumVF(EltSize, Instruction::Store), MaxElts);
23922 Type *StoreTy =
Store->getValueOperand()->getType();
23923 Type *ValueTy = StoreTy;
23925 ValueTy = Trunc->getSrcTy();
23934 R.getMinVF(DL->getTypeStoreSizeInBits(StoreScalarTy)), StoreScalarTy,
23937 MinVF = std::max<unsigned>(2, MinVF);
23939 if (MaxVF < MinVF) {
23940 LLVM_DEBUG(
dbgs() <<
"SLP: Vectorization infeasible as MaxVF (" << MaxVF
23942 <<
"MinVF (" << MinVF <<
")\n");
23946 unsigned NonPowerOf2VF = 0;
23951 unsigned CandVF = std::clamp<unsigned>(Operands.
size(), MinVF, MaxVF);
23953 NonPowerOf2VF = CandVF;
23954 assert(NonPowerOf2VF != MaxVF &&
23955 "Non-power-of-2 VF should not be equal to MaxVF");
23962 unsigned MaxRegVF = MaxVF;
23964 MaxVF = std::min<unsigned>(MaxVF,
bit_floor(Operands.
size()));
23965 if (MaxVF < MinVF) {
23966 LLVM_DEBUG(
dbgs() <<
"SLP: Vectorization infeasible as MaxVF (" << MaxVF
23968 <<
"MinVF (" << MinVF <<
")\n");
23972 SmallVector<unsigned> CandidateVFs;
23973 for (
unsigned VF = std::max(MaxVF, NonPowerOf2VF); VF >= MinVF;
23977 unsigned End = Operands.
size();
23978 unsigned Repeat = 0;
23979 constexpr unsigned MaxAttempts = 4;
23980 OwningArrayRef<std::pair<unsigned, unsigned>> RangeSizes(Operands.
size());
23981 for (std::pair<unsigned, unsigned> &
P : RangeSizes)
23982 P.first =
P.second = 1;
23983 DenseMap<Value *, std::pair<unsigned, unsigned>> NonSchedulable;
23984 auto IsNotVectorized = [](
bool First,
23985 const std::pair<unsigned, unsigned> &
P) {
23986 return First ?
P.first > 0 :
P.second > 0;
23988 auto IsVectorized = [](
bool First,
23989 const std::pair<unsigned, unsigned> &
P) {
23990 return First ?
P.first == 0 :
P.second == 0;
23992 auto VFIsProfitable = [](
bool First,
unsigned Size,
23993 const std::pair<unsigned, unsigned> &
P) {
23996 auto FirstSizeSame = [](
unsigned Size,
23997 const std::pair<unsigned, unsigned> &
P) {
23998 return Size ==
P.first;
24002 bool RepeatChanged =
false;
24003 bool AnyProfitableGraph =
false;
24004 for (
unsigned VF : CandidateVFs) {
24005 AnyProfitableGraph =
false;
24006 unsigned FirstUnvecStore =
24007 std::distance(RangeSizes.begin(),
24008 find_if(RangeSizes, std::bind(IsNotVectorized,
24009 VF >= MaxRegVF, _1)));
24013 while (FirstUnvecStore < End) {
24014 unsigned FirstVecStore = std::distance(
24015 RangeSizes.begin(),
24016 find_if(RangeSizes.drop_front(FirstUnvecStore),
24017 std::bind(IsVectorized, VF >= MaxRegVF, _1)));
24018 unsigned MaxSliceEnd = FirstVecStore >= End ? End : FirstVecStore;
24019 for (
unsigned SliceStartIdx = FirstUnvecStore;
24020 SliceStartIdx + VF <= MaxSliceEnd;) {
24031 ->getValueOperand()
24034 ->getValueOperand()
24037 "Expected all operands of same type.");
24038 if (!NonSchedulable.
empty()) {
24039 auto [NonSchedSizeMax, NonSchedSizeMin] =
24041 if (NonSchedSizeMax > 0 && NonSchedSizeMin <= VF) {
24044 SliceStartIdx += NonSchedSizeMax;
24049 std::optional<bool> Res =
24050 vectorizeStoreChain(Slice, R, SliceStartIdx, MinVF, TreeSize);
24056 .first->getSecond()
24064 AnyProfitableGraph = RepeatChanged =
Changed =
true;
24067 for (std::pair<unsigned, unsigned> &
P :
24068 RangeSizes.slice(SliceStartIdx, VF))
24069 P.first =
P.second = 0;
24070 if (SliceStartIdx < FirstUnvecStore + MinVF) {
24071 for (std::pair<unsigned, unsigned> &
P : RangeSizes.slice(
24072 FirstUnvecStore, SliceStartIdx - FirstUnvecStore))
24073 P.first =
P.second = 0;
24074 FirstUnvecStore = SliceStartIdx + VF;
24076 if (SliceStartIdx > MaxSliceEnd - VF - MinVF) {
24077 for (std::pair<unsigned, unsigned> &
P :
24078 RangeSizes.slice(SliceStartIdx + VF,
24079 MaxSliceEnd - (SliceStartIdx + VF)))
24080 P.first =
P.second = 0;
24081 if (MaxSliceEnd == End)
24082 End = SliceStartIdx;
24083 MaxSliceEnd = SliceStartIdx;
24085 SliceStartIdx += VF;
24088 if (VF > 2 && Res &&
24089 !
all_of(RangeSizes.slice(SliceStartIdx, VF),
24090 std::bind(VFIsProfitable, VF >= MaxRegVF, TreeSize,
24092 SliceStartIdx += VF;
24097 if (VF > MaxRegVF && TreeSize > 1 &&
24098 all_of(RangeSizes.slice(SliceStartIdx, VF),
24099 std::bind(FirstSizeSame, TreeSize, _1))) {
24100 SliceStartIdx += VF;
24101 while (SliceStartIdx != MaxSliceEnd &&
24102 RangeSizes[SliceStartIdx].first == TreeSize)
24106 if (TreeSize > 1) {
24107 for (std::pair<unsigned, unsigned> &
P :
24108 RangeSizes.slice(SliceStartIdx, VF)) {
24109 if (VF >= MaxRegVF)
24110 P.second = std::max(
P.second, TreeSize);
24112 P.first = std::max(
P.first, TreeSize);
24116 AnyProfitableGraph =
true;
24118 if (FirstUnvecStore >= End)
24120 if (MaxSliceEnd - FirstUnvecStore < VF &&
24121 MaxSliceEnd - FirstUnvecStore >= MinVF)
24122 AnyProfitableGraph =
true;
24123 FirstUnvecStore = std::distance(
24124 RangeSizes.begin(),
24125 find_if(RangeSizes.drop_front(MaxSliceEnd),
24126 std::bind(IsNotVectorized, VF >= MaxRegVF, _1)));
24128 if (!AnyProfitableGraph && VF >= MaxRegVF &&
has_single_bit(VF))
24132 if (
all_of(RangeSizes, [](
const std::pair<unsigned, unsigned> &
P) {
24133 return P.first == 0 &&
P.second == 0;
24137 if (Repeat >= MaxAttempts ||
24138 (Repeat > 1 && (RepeatChanged || !AnyProfitableGraph)))
24140 constexpr unsigned StoresLimit = 64;
24141 const unsigned MaxTotalNum = std::min<unsigned>(
24143 static_cast<unsigned>(
24146 RangeSizes.begin(),
24147 find_if(RangeSizes, std::bind(IsNotVectorized,
true, _1))) +
24149 unsigned VF =
bit_ceil(CandidateVFs.front()) * 2;
24152 CandidateVFs.clear();
24154 CandidateVFs.push_back(Limit);
24155 if (VF > MaxTotalNum || VF >= StoresLimit)
24157 for (std::pair<unsigned, unsigned> &
P : RangeSizes) {
24159 P.first = std::max(
P.second,
P.first);
24163 CandidateVFs.push_back(VF);
24203 auto FillStoresSet = [&](
unsigned Idx, StoreInst *
SI) {
24204 std::optional<int64_t> PtrDist;
24205 auto *RelatedStores =
find_if(
24206 SortedStores, [&PtrDist, SI,
this](
const RelatedStoreInsts &StoreSeq) {
24207 PtrDist = StoreSeq.getPointerDiff(*SI, *DL, *SE);
24208 return PtrDist.has_value();
24212 if (RelatedStores == SortedStores.
end()) {
24220 if (std::optional<unsigned> PrevInst =
24221 RelatedStores->insertOrLookup(Idx, *PtrDist)) {
24222 TryToVectorize(RelatedStores->getStores());
24223 RelatedStores->clearVectorizedStores(VectorizedStores);
24224 RelatedStores->rebase(*PrevInst + 1,
24229 Type *PrevValTy =
nullptr;
24231 if (
R.isDeleted(SI))
24234 PrevValTy =
SI->getValueOperand()->getType();
24236 if (PrevValTy !=
SI->getValueOperand()->getType()) {
24237 for (RelatedStoreInsts &StoreSeq : SortedStores)
24238 TryToVectorize(StoreSeq.getStores());
24239 SortedStores.clear();
24240 PrevValTy =
SI->getValueOperand()->getType();
24242 FillStoresSet(
I, SI);
24246 for (RelatedStoreInsts &StoreSeq : SortedStores)
24247 TryToVectorize(StoreSeq.getStores());
24252void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
24260 for (Instruction &
I : *BB) {
24264 if (!
SI->isSimple())
24275 if (
GEP->getNumIndices() != 1)
24277 Value *Idx =
GEP->idx_begin()->get();
24282 if (
GEP->getType()->isVectorTy())
24294 LLVM_DEBUG(
dbgs() <<
"SLP: Trying to vectorize a list of length = "
24295 << VL.
size() <<
".\n");
24306 for (
Value *V : VL) {
24307 Type *Ty =
V->getType();
24311 R.getORE()->emit([&]() {
24312 std::string TypeStr;
24313 llvm::raw_string_ostream OS(TypeStr);
24315 return OptimizationRemarkMissed(
SV_NAME,
"UnsupportedType", I0)
24316 <<
"Cannot SLP vectorize list: type "
24317 << TypeStr +
" is unsupported by vectorizer";
24324 unsigned Sz =
R.getVectorElementSize(I0);
24325 unsigned MinVF =
R.getMinVF(Sz);
24326 unsigned MaxVF = std::max<unsigned>(
24328 MaxVF = std::min(
R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
24330 R.getORE()->emit([&]() {
24331 return OptimizationRemarkMissed(
SV_NAME,
"SmallVF", I0)
24332 <<
"Cannot SLP vectorize list: vectorization factor "
24333 <<
"less than 2 is not supported";
24339 bool CandidateFound =
false;
24342 unsigned NextInst = 0, MaxInst = VL.size();
24343 for (
unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF;
24349 if (TTI->getNumberOfParts(VecTy) == VF)
24351 for (
unsigned I = NextInst;
I < MaxInst; ++
I) {
24352 unsigned ActualVF = std::min(MaxInst -
I, VF);
24357 if (MaxVFOnly && ActualVF < MaxVF)
24359 if ((VF > MinVF && ActualVF < VF) || (VF == MinVF && ActualVF < 2))
24364 for (
Value *V : VL.drop_front(
I)) {
24368 !Inst || !
R.isDeleted(Inst)) {
24371 if (Idx == ActualVF)
24376 if (Idx != ActualVF)
24379 LLVM_DEBUG(
dbgs() <<
"SLP: Analyzing " << ActualVF <<
" operations "
24383 if (
R.isTreeTinyAndNotFullyVectorizable())
24385 if (
R.isProfitableToReorder()) {
24386 R.reorderTopToBottom();
24389 R.transformNodes();
24390 R.computeMinimumValueSizes();
24392 R.buildExternalUses();
24395 CandidateFound =
true;
24396 MinCost = std::min(MinCost,
Cost);
24399 <<
" for VF=" << ActualVF <<
"\n");
24402 R.getORE()->emit(OptimizationRemark(
SV_NAME,
"VectorizedList",
24404 <<
"SLP vectorized with cost " <<
ore::NV(
"Cost",
Cost)
24405 <<
" and with tree size "
24406 <<
ore::NV(
"TreeSize",
R.getTreeSize()));
24417 if (!
Changed && CandidateFound) {
24418 R.getORE()->emit([&]() {
24419 return OptimizationRemarkMissed(
SV_NAME,
"NotBeneficial", I0)
24420 <<
"List vectorization was possible but not beneficial with cost "
24421 <<
ore::NV(
"Cost", MinCost) <<
" >= "
24425 R.getORE()->emit([&]() {
24426 return OptimizationRemarkMissed(
SV_NAME,
"NotPossible", I0)
24427 <<
"Cannot SLP vectorize list: vectorization was impossible"
24428 <<
" with available vectorization factors";
24463 using ReductionOpsType = SmallVector<Value *, 16>;
24464 using ReductionOpsListType = SmallVector<ReductionOpsType, 2>;
24465 ReductionOpsListType ReductionOps;
24469 SmallDenseMap<Value *, SmallVector<Instruction *>, 16> ReducedValsToOps;
24470 WeakTrackingVH ReductionRoot;
24475 bool IsSupportedHorRdxIdentityOp =
false;
24482 static bool isCmpSelMinMax(Instruction *
I) {
24490 static bool isBoolLogicOp(Instruction *
I) {
24496 static bool isVectorizable(
RecurKind Kind, Instruction *
I,
24497 bool TwoElementReduction =
false) {
24498 if (Kind == RecurKind::None)
24507 if (TwoElementReduction)
24510 if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
24514 return I->getFastMathFlags().noNaNs();
24517 if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum)
24520 return I->isAssociative();
24523 static Value *getRdxOperand(Instruction *
I,
unsigned Index) {
24529 return I->getOperand(2);
24530 return I->getOperand(Index);
24535 Value *
RHS,
const Twine &Name,
bool UseSelect) {
24539 case RecurKind::Or: {
24548 case RecurKind::And: {
24558 case RecurKind::Add:
24559 case RecurKind::Mul:
24560 case RecurKind::Xor:
24561 case RecurKind::FAdd:
24562 case RecurKind::FMul: {
24567 case RecurKind::SMax:
24568 case RecurKind::SMin:
24569 case RecurKind::UMax:
24570 case RecurKind::UMin:
24578 case RecurKind::FMax:
24579 case RecurKind::FMin:
24580 case RecurKind::FMaximum:
24581 case RecurKind::FMinimum:
24582 case RecurKind::FMaximumNum:
24583 case RecurKind::FMinimumNum: {
24596 const ReductionOpsListType &ReductionOps) {
24597 bool UseSelect = ReductionOps.size() == 2 ||
24599 (ReductionOps.size() == 1 &&
24601 assert((!UseSelect || ReductionOps.size() != 2 ||
24603 "Expected cmp + select pairs for reduction");
24604 Value *
Op = createOp(Builder, RdxKind,
LHS,
RHS, Name, UseSelect);
24622 return RecurKind::None;
24624 return RecurKind::Add;
24626 return RecurKind::Mul;
24629 return RecurKind::And;
24632 return RecurKind::Or;
24634 return RecurKind::Xor;
24636 return RecurKind::FAdd;
24638 return RecurKind::FMul;
24641 return RecurKind::FMax;
24643 return RecurKind::FMin;
24646 return RecurKind::FMaximum;
24648 return RecurKind::FMinimum;
24654 return RecurKind::SMax;
24656 return RecurKind::SMin;
24658 return RecurKind::UMax;
24660 return RecurKind::UMin;
24686 return RecurKind::None;
24690 return RecurKind::None;
24693 return RecurKind::None;
24697 return RecurKind::None;
24702 return RecurKind::None;
24705 return RecurKind::SMax;
24708 return RecurKind::SMin;
24711 return RecurKind::UMax;
24714 return RecurKind::UMin;
24717 return RecurKind::None;
24721 static unsigned getFirstOperandIndex(Instruction *
I) {
24722 return isCmpSelMinMax(
I) ? 1 : 0;
24727 static unsigned getNumberOfOperands(Instruction *
I) {
24728 return isCmpSelMinMax(
I) ? 3 : 2;
24733 static bool hasSameParent(Instruction *
I, BasicBlock *BB) {
24734 if (isCmpSelMinMax(
I) || isBoolLogicOp(
I)) {
24737 return Sel->getParent() == BB &&
Cmp &&
Cmp->getParent() == BB;
24739 return I->getParent() == BB;
24743 static bool hasRequiredNumberOfUses(
bool IsCmpSelMinMax, Instruction *
I) {
24744 if (IsCmpSelMinMax) {
24748 return Sel->hasNUses(2) && Sel->getCondition()->hasOneUse();
24749 return I->hasNUses(2);
24757 void initReductionOps(Instruction *
I) {
24758 if (isCmpSelMinMax(
I))
24759 ReductionOps.assign(2, ReductionOpsType());
24761 ReductionOps.assign(1, ReductionOpsType());
24765 void addReductionOps(Instruction *
I) {
24766 if (isCmpSelMinMax(
I)) {
24768 ReductionOps[1].emplace_back(
I);
24770 ReductionOps[0].emplace_back(
I);
24775 int Sz =
Data.size();
24784 : ReductionRoot(
I), ReductionLimit(2) {
24785 RdxKind = HorizontalReduction::getRdxKind(
I);
24786 ReductionOps.emplace_back().push_back(
I);
24789 ReducedValsToOps[
V].push_back(
I);
24792 bool matchReductionForOperands()
const {
24795 assert(ReductionRoot &&
"Reduction root is not set!");
24798 return Ops.size() == 2;
24806 bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root,
24807 ScalarEvolution &SE,
const DataLayout &
DL,
24808 const TargetLibraryInfo &TLI) {
24809 RdxKind = HorizontalReduction::getRdxKind(Root);
24810 if (!isVectorizable(RdxKind, Root))
24822 if (!Sel->getCondition()->hasOneUse())
24825 ReductionRoot = Root;
24830 bool IsCmpSelMinMax = isCmpSelMinMax(Root);
24832 1, std::make_pair(Root, 0));
24837 SmallVectorImpl<Value *> &PossibleReducedVals,
24838 SmallVectorImpl<Instruction *> &ReductionOps,
24841 getNumberOfOperands(TreeN)))) {
24842 Value *EdgeVal = getRdxOperand(TreeN,
I);
24843 ReducedValsToOps[EdgeVal].push_back(TreeN);
24851 IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
24852 !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
24853 !isVectorizable(RdxKind, EdgeInst) ||
24854 (
R.isAnalyzedReductionRoot(EdgeInst) &&
24856 PossibleReducedVals.push_back(EdgeVal);
24859 ReductionOps.push_back(EdgeInst);
24868 size_t, SmallMapVector<size_t, SmallMapVector<Value *, unsigned, 2>, 2>,
24870 PossibleReducedVals;
24871 initReductionOps(Root);
24873 SmallSet<size_t, 2> LoadKeyUsed;
24875 auto GenerateLoadsSubkey = [&](
size_t Key, LoadInst *LI) {
24880 auto LIt = LoadsMap.
find(std::make_pair(
Key, Ptr));
24881 if (LIt != LoadsMap.
end()) {
24882 for (LoadInst *RLI : LIt->second) {
24888 for (LoadInst *RLI : LIt->second) {
24895 if (LIt->second.size() > 2) {
24897 hash_value(LIt->second.back()->getPointerOperand());
24903 .first->second.push_back(LI);
24907 while (!Worklist.empty()) {
24908 auto [TreeN,
Level] = Worklist.pop_back_val();
24911 CheckOperands(TreeN, PossibleRedVals, PossibleReductionOps, Level);
24912 addReductionOps(TreeN);
24915 for (
Value *V : PossibleRedVals) {
24919 ++PossibleReducedVals[
Key][Idx].
try_emplace(V, 0).first->second;
24921 for (Instruction *
I :
reverse(PossibleReductionOps))
24922 Worklist.emplace_back(
I,
I->getParent() == BB ? 0 : Level + 1);
24924 auto PossibleReducedValsVect = PossibleReducedVals.
takeVector();
24927 for (
auto &PossibleReducedVals : PossibleReducedValsVect) {
24928 auto PossibleRedVals = PossibleReducedVals.second.
takeVector();
24930 for (
auto &Slice : PossibleRedVals) {
24932 auto RedValsVect = Slice.second.takeVector();
24934 for (
const std::pair<Value *, unsigned> &
Data : RedValsVect)
24935 PossibleRedValsVect.
back().append(
Data.second,
Data.first);
24937 stable_sort(PossibleRedValsVect, [](
const auto &P1,
const auto &P2) {
24938 return P1.size() > P2.size();
24945 }
else if (!isGoodForReduction(
Data)) {
24948 if (!LI || !LastLI ||
24953 ReducedVals.
back().append(
Data.rbegin(),
Data.rend());
24959 return P1.size() > P2.
size();
24965 Value *tryToReduce(BoUpSLP &V,
const DataLayout &
DL, TargetTransformInfo *
TTI,
24966 const TargetLibraryInfo &TLI, AssumptionCache *AC,
24967 DominatorTree &DT) {
24968 constexpr unsigned RegMaxNumber = 4;
24969 constexpr unsigned RedValsMaxNumber = 128;
24973 if (
unsigned NumReducedVals = std::accumulate(
24974 ReducedVals.
begin(), ReducedVals.
end(), 0,
24976 if (!isGoodForReduction(Vals))
24978 return Num + Vals.size();
24980 NumReducedVals < ReductionLimit &&
24984 for (ReductionOpsType &RdxOps : ReductionOps)
24985 for (
Value *RdxOp : RdxOps)
24990 IRBuilder<TargetFolder> Builder(ReductionRoot->getContext(),
24996 DenseMap<Value *, WeakTrackingVH> TrackedVals(ReducedVals.
size() *
24997 ReducedVals.
front().size());
25001 auto &&GetCmpForMinMaxReduction = [](
Instruction *RdxRootInst) {
25003 "Expected min/max reduction to have select root instruction");
25006 "Expected min/max reduction to have compare condition");
25010 bool AnyBoolLogicOp =
any_of(ReductionOps.back(), [](
Value *V) {
25011 return isBoolLogicOp(cast<Instruction>(V));
25014 auto GetNewVectorizedTree = [&](
Value *VectorizedTree,
Value *Res) {
25015 if (VectorizedTree) {
25019 if (AnyBoolLogicOp) {
25020 auto It = ReducedValsToOps.
find(VectorizedTree);
25021 auto It1 = ReducedValsToOps.
find(Res);
25022 if ((It == ReducedValsToOps.
end() && It1 == ReducedValsToOps.
end()) ||
25024 (It != ReducedValsToOps.
end() &&
25025 any_of(It->getSecond(), [&](Instruction *
I) {
25026 return isBoolLogicOp(I) &&
25027 getRdxOperand(I, 0) == VectorizedTree;
25031 (It1 != ReducedValsToOps.
end() &&
25032 any_of(It1->getSecond(), [&](Instruction *
I) {
25033 return isBoolLogicOp(I) && getRdxOperand(I, 0) == Res;
25037 VectorizedTree = Builder.
CreateFreeze(VectorizedTree);
25041 return createOp(Builder, RdxKind, VectorizedTree, Res,
"op.rdx",
25047 SmallDenseSet<Value *> IgnoreList(ReductionOps.size() *
25048 ReductionOps.front().size());
25049 for (ReductionOpsType &RdxOps : ReductionOps)
25050 for (
Value *RdxOp : RdxOps) {
25053 IgnoreList.insert(RdxOp);
25056 FastMathFlags RdxFMF;
25058 for (
Value *U : IgnoreList)
25060 RdxFMF &= FPMO->getFastMathFlags();
25066 for (
Value *V : Candidates)
25067 TrackedVals.try_emplace(V, V);
25069 auto At = [](SmallMapVector<Value *, unsigned, 16> &MV,
25070 Value *
V) ->
unsigned & {
25071 auto *It = MV.
find(V);
25072 assert(It != MV.
end() &&
"Unable to find given key.");
25076 DenseMap<Value *, unsigned> VectorizedVals(ReducedVals.
size());
25079 SmallPtrSet<Value *, 4> RequiredExtract;
25080 WeakTrackingVH VectorizedTree =
nullptr;
25081 bool CheckForReusedReductionOps =
false;
25091 States.
back().getOpcode() == Instruction::Load)) {
25092 LocalReducedVals.
emplace_back().append(RV.begin(), RV.end());
25093 States.
push_back(InstructionsState::invalid());
25096 if (!LocalReducedVals.
empty() &&
25099 LocalReducedVals.
emplace_back().append(RV.begin(), RV.end());
25104 if (!LocalReducedVals.
empty())
25105 Ops = LocalReducedVals.
back();
25106 Ops.append(RV.begin(), RV.end());
25107 InstructionsCompatibilityAnalysis
Analysis(DT,
DL, *
TTI, TLI);
25108 InstructionsState OpS =
25110 if (LocalReducedVals.
empty()) {
25116 LocalReducedVals.
back().swap(
Ops);
25117 States.
back() = OpS;
25120 LocalReducedVals.
emplace_back().append(RV.begin(), RV.end());
25123 ReducedVals.swap(LocalReducedVals);
25124 for (
unsigned I = 0,
E = ReducedVals.
size();
I <
E; ++
I) {
25126 InstructionsState S = States[
I];
25129 DenseMap<Value *, Value *> TrackedToOrig(2 * OrigReducedVals.
size());
25130 for (
Value *ReducedVal : OrigReducedVals) {
25131 Value *RdxVal = TrackedVals.at(ReducedVal);
25138 (!S || (!S.getMatchingMainOpOrAltOp(Inst) &&
25139 !S.isCopyableElement(Inst)))) ||
25141 !S.isCopyableElement(RdxVal)))
25144 TrackedToOrig.try_emplace(RdxVal, ReducedVal);
25146 bool ShuffledExtracts =
false;
25148 if (S && S.getOpcode() == Instruction::ExtractElement &&
25149 !S.isAltShuffle() &&
I + 1 <
E) {
25151 for (
Value *RV : ReducedVals[
I + 1]) {
25152 Value *RdxVal = TrackedVals.at(RV);
25159 CommonCandidates.push_back(RdxVal);
25160 TrackedToOrig.try_emplace(RdxVal, RV);
25162 SmallVector<int>
Mask;
25165 Candidates.
swap(CommonCandidates);
25166 ShuffledExtracts =
true;
25173 Value *OrigV = TrackedToOrig.at(Candidates.
front());
25174 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
25176 Res = createOp(Builder, RdxKind, Res, VC,
"const.rdx", ReductionOps);
25177 Value *OrigV = TrackedToOrig.at(VC);
25178 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
25180 V.analyzedReductionRoot(ResI);
25182 VectorizedTree = GetNewVectorizedTree(VectorizedTree, Res);
25186 unsigned NumReducedVals = Candidates.
size();
25187 if (NumReducedVals < ReductionLimit &&
25188 (NumReducedVals < 2 || !
isSplat(Candidates)))
25193 IsSupportedHorRdxIdentityOp = RdxKind != RecurKind::Mul &&
25194 RdxKind != RecurKind::FMul &&
25195 RdxKind != RecurKind::FMulAdd;
25197 SmallMapVector<Value *, unsigned, 16> SameValuesCounter;
25198 if (IsSupportedHorRdxIdentityOp)
25199 for (
Value *V : Candidates) {
25200 Value *OrigV = TrackedToOrig.at(V);
25201 ++SameValuesCounter.
try_emplace(OrigV).first->second;
25213 bool SameScaleFactor =
false;
25214 bool OptReusedScalars = IsSupportedHorRdxIdentityOp &&
25215 SameValuesCounter.
size() != Candidates.size();
25217 if (OptReusedScalars) {
25219 (RdxKind == RecurKind::Add || RdxKind == RecurKind::FAdd ||
25220 RdxKind == RecurKind::Xor) &&
25222 [&SameValuesCounter](
const std::pair<Value *, unsigned> &
P) {
25223 return P.second == SameValuesCounter.
front().second;
25225 Candidates.resize(SameValuesCounter.
size());
25226 transform(SameValuesCounter, Candidates.begin(),
25227 [&](
const auto &
P) { return TrackedVals.at(P.first); });
25228 NumReducedVals = Candidates.size();
25230 if (NumReducedVals == 1) {
25231 Value *OrigV = TrackedToOrig.at(Candidates.front());
25232 unsigned Cnt = At(SameValuesCounter, OrigV);
25234 emitScaleForReusedOps(Candidates.front(), Builder, Cnt);
25235 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
25236 VectorizedVals.try_emplace(OrigV, Cnt);
25237 ExternallyUsedValues.
insert(OrigV);
25242 unsigned MaxVecRegSize =
V.getMaxVecRegSize();
25243 unsigned EltSize =
V.getVectorElementSize(Candidates[0]);
25244 const unsigned MaxElts = std::clamp<unsigned>(
25246 RegMaxNumber * RedValsMaxNumber);
25248 unsigned ReduxWidth = NumReducedVals;
25249 auto GetVectorFactor = [&, &
TTI = *
TTI](
unsigned ReduxWidth) {
25250 unsigned NumParts, NumRegs;
25251 Type *ScalarTy = Candidates.front()->getType();
25258 while (NumParts > NumRegs) {
25259 assert(ReduxWidth > 0 &&
"ReduxWidth is unexpectedly 0.");
25260 ReduxWidth =
bit_floor(ReduxWidth - 1);
25266 if (NumParts > NumRegs / 2)
25271 ReduxWidth = GetVectorFactor(ReduxWidth);
25272 ReduxWidth = std::min(ReduxWidth, MaxElts);
25274 unsigned Start = 0;
25275 unsigned Pos =
Start;
25277 unsigned PrevReduxWidth = ReduxWidth;
25278 bool CheckForReusedReductionOpsLocal =
false;
25279 auto AdjustReducedVals = [&](
bool IgnoreVL =
false) {
25280 bool IsAnyRedOpGathered = !IgnoreVL &&
V.isAnyGathered(IgnoreList);
25281 if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) {
25284 CheckForReusedReductionOpsLocal |= IsAnyRedOpGathered;
25287 if (Pos < NumReducedVals - ReduxWidth + 1)
25288 return IsAnyRedOpGathered;
25291 if (ReduxWidth > 1)
25292 ReduxWidth = GetVectorFactor(ReduxWidth);
25293 return IsAnyRedOpGathered;
25295 bool AnyVectorized =
false;
25296 SmallDenseSet<std::pair<unsigned, unsigned>, 8> IgnoredCandidates;
25297 while (Pos < NumReducedVals - ReduxWidth + 1 &&
25298 ReduxWidth >= ReductionLimit) {
25301 if (CheckForReusedReductionOpsLocal && PrevReduxWidth != ReduxWidth &&
25303 CheckForReusedReductionOps =
true;
25306 PrevReduxWidth = ReduxWidth;
25309 if (IgnoredCandidates.
contains(std::make_pair(Pos, ReduxWidth)) ||
25312 std::make_pair(Pos,
bit_floor(ReduxWidth))) ||
25314 std::make_pair(Pos + (ReduxWidth -
bit_floor(ReduxWidth)),
25316 V.areAnalyzedReductionVals(VL)) {
25317 (void)AdjustReducedVals(
true);
25324 return RedValI &&
V.isDeleted(RedValI);
25327 V.buildTree(VL, IgnoreList);
25328 if (
V.isTreeTinyAndNotFullyVectorizable(
true)) {
25329 if (!AdjustReducedVals())
25330 V.analyzedReductionVals(VL);
25333 if (
V.isLoadCombineReductionCandidate(RdxKind)) {
25334 if (!AdjustReducedVals())
25335 V.analyzedReductionVals(VL);
25338 V.reorderTopToBottom();
25341 VL.front()->getType()->isIntOrIntVectorTy() ||
25342 ReductionLimit > 2);
25346 ExternallyUsedValues);
25350 LocalExternallyUsedValues.insert(ReductionRoot);
25351 for (
unsigned Cnt = 0, Sz = ReducedVals.
size(); Cnt < Sz; ++Cnt) {
25352 if (Cnt ==
I || (ShuffledExtracts && Cnt ==
I - 1))
25354 for (
Value *V : ReducedVals[Cnt])
25356 LocalExternallyUsedValues.insert(TrackedVals[V]);
25358 if (!IsSupportedHorRdxIdentityOp) {
25361 "Reused values counter map is not empty");
25362 for (
unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
25363 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
25365 Value *
V = Candidates[Cnt];
25366 Value *OrigV = TrackedToOrig.at(V);
25367 ++SameValuesCounter.
try_emplace(OrigV).first->second;
25370 V.transformNodes();
25371 V.computeMinimumValueSizes();
25376 SmallPtrSet<Value *, 4> Visited;
25377 for (
unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
25378 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
25380 Value *RdxVal = Candidates[Cnt];
25381 if (
auto It = TrackedVals.find(RdxVal); It != TrackedVals.end())
25382 RdxVal = It->second;
25383 if (!Visited.
insert(RdxVal).second)
25387 if (!VLScalars.contains(RdxVal) &&
V.isVectorized(RdxVal)) {
25388 LocalExternallyUsedValues.insert(RdxVal);
25391 Value *OrigV = TrackedToOrig.at(RdxVal);
25393 VectorizedVals.lookup(OrigV) + At(SameValuesCounter, OrigV);
25394 if (
NumOps != ReducedValsToOps.
at(OrigV).size())
25395 LocalExternallyUsedValues.insert(RdxVal);
25398 if (!IsSupportedHorRdxIdentityOp)
25399 SameValuesCounter.
clear();
25400 for (
Value *RdxVal : VL)
25401 if (RequiredExtract.
contains(RdxVal))
25402 LocalExternallyUsedValues.insert(RdxVal);
25403 V.buildExternalUses(LocalExternallyUsedValues);
25407 getReductionCost(
TTI, VL, IsCmpSelMinMax, RdxFMF, V, DT,
DL, TLI);
25410 <<
" for reduction\n");
25414 V.getORE()->emit([&]() {
25415 return OptimizationRemarkMissed(
SV_NAME,
"HorSLPNotBeneficial",
25416 ReducedValsToOps.
at(VL[0]).front())
25417 <<
"Vectorizing horizontal reduction is possible "
25418 <<
"but not beneficial with cost " <<
ore::NV(
"Cost",
Cost)
25419 <<
" and threshold "
25422 if (!AdjustReducedVals()) {
25423 V.analyzedReductionVals(VL);
25425 if (ReduxWidth > ReductionLimit &&
V.isTreeNotExtendable()) {
25428 *
TTI, VL.front()->getType(), ReduxWidth - 1);
25429 VF >= ReductionLimit;
25431 *
TTI, VL.front()->getType(), VF - 1)) {
25433 V.getCanonicalGraphSize() !=
V.getTreeSize())
25436 IgnoredCandidates.
insert(std::make_pair(
Offset + Idx, VF));
25443 LLVM_DEBUG(
dbgs() <<
"SLP: Vectorizing horizontal reduction at cost:"
25444 <<
Cost <<
". (HorRdx)\n");
25445 V.getORE()->emit([&]() {
25446 return OptimizationRemark(
SV_NAME,
"VectorizedHorizontalReduction",
25447 ReducedValsToOps.
at(VL[0]).front())
25448 <<
"Vectorized horizontal reduction with cost "
25449 <<
ore::NV(
"Cost",
Cost) <<
" and with tree size "
25450 <<
ore::NV(
"TreeSize",
V.getTreeSize());
25459 if (IsCmpSelMinMax)
25460 InsertPt = GetCmpForMinMaxReduction(RdxRootInst);
25463 Value *VectorizedRoot =
V.vectorizeTree(
25464 LocalExternallyUsedValues, InsertPt, VectorValuesAndScales);
25467 for (
Value *RdxVal : Candidates) {
25468 Value *OrigVal = TrackedToOrig.at(RdxVal);
25469 Value *TransformedRdxVal = TrackedVals.at(OrigVal);
25470 if (TransformedRdxVal != RdxVal)
25471 TrackedToOrig.try_emplace(TransformedRdxVal, OrigVal);
25480 VectorizedRoot = Builder.
CreateFreeze(VectorizedRoot);
25483 if (OptReusedScalars && !SameScaleFactor) {
25484 VectorizedRoot = emitReusedOps(VectorizedRoot, Builder, V,
25485 SameValuesCounter, TrackedToOrig);
25488 Type *ScalarTy = VL.front()->getType();
25493 OptReusedScalars && SameScaleFactor
25494 ? SameValuesCounter.
front().second
25497 ?
V.isSignedMinBitwidthRootNode()
25501 for (
Value *RdxVal : VL) {
25502 Value *OrigV = TrackedToOrig.at(RdxVal);
25503 if (IsSupportedHorRdxIdentityOp) {
25504 VectorizedVals.try_emplace(OrigV, At(SameValuesCounter, OrigV));
25507 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
25508 if (!
V.isVectorized(RdxVal))
25509 RequiredExtract.
insert(RdxVal);
25513 ReduxWidth = NumReducedVals - Pos;
25514 if (ReduxWidth > 1)
25515 ReduxWidth = GetVectorFactor(NumReducedVals - Pos);
25516 AnyVectorized =
true;
25518 if (OptReusedScalars && !AnyVectorized) {
25519 for (
const std::pair<Value *, unsigned> &
P : SameValuesCounter) {
25520 Value *RdxVal = TrackedVals.at(
P.first);
25521 Value *RedVal = emitScaleForReusedOps(RdxVal, Builder,
P.second);
25522 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
25523 VectorizedVals.try_emplace(
P.first,
P.second);
25528 if (!VectorValuesAndScales.
empty())
25529 VectorizedTree = GetNewVectorizedTree(
25531 emitReduction(Builder, *
TTI, ReductionRoot->getType()));
25533 if (!VectorizedTree) {
25534 if (!CheckForReusedReductionOps) {
25535 for (ReductionOpsType &RdxOps : ReductionOps)
25536 for (
Value *RdxOp : RdxOps)
25558 auto FixBoolLogicalOps =
25561 if (!AnyBoolLogicOp)
25563 if (isBoolLogicOp(RedOp1) && ((!InitStep &&
LHS == VectorizedTree) ||
25564 getRdxOperand(RedOp1, 0) ==
LHS ||
25567 bool NeedFreeze =
LHS != VectorizedTree;
25568 if (isBoolLogicOp(RedOp2) && ((!InitStep &&
RHS == VectorizedTree) ||
25569 getRdxOperand(RedOp2, 0) ==
RHS ||
25572 if ((InitStep ||
RHS != VectorizedTree) &&
25573 getRdxOperand(RedOp2, 0) ==
RHS &&
25574 ((isBoolLogicOp(RedOp1) &&
25575 getRdxOperand(RedOp1, 1) == RedOp2) ||
25579 return OpI && isBoolLogicOp(OpI) &&
25580 getRdxOperand(OpI, 1) == RedOp2;
25583 NeedFreeze =
false;
25597 unsigned Sz = InstVals.
size();
25599 for (
unsigned I = 0,
E = (Sz / 2) * 2;
I <
E;
I += 2) {
25602 Value *RdxVal1 = InstVals[
I].second;
25603 Value *StableRdxVal1 = RdxVal1;
25604 auto It1 = TrackedVals.find(RdxVal1);
25605 if (It1 != TrackedVals.end())
25606 StableRdxVal1 = It1->second;
25607 Value *RdxVal2 = InstVals[
I + 1].second;
25608 Value *StableRdxVal2 = RdxVal2;
25609 auto It2 = TrackedVals.find(RdxVal2);
25610 if (It2 != TrackedVals.end())
25611 StableRdxVal2 = It2->second;
25615 FixBoolLogicalOps(StableRdxVal1, StableRdxVal2, InstVals[
I].first,
25617 Value *ExtraRed = createOp(Builder, RdxKind, StableRdxVal1,
25618 StableRdxVal2,
"op.rdx", ReductionOps);
25619 ExtraReds[
I / 2] = std::make_pair(InstVals[
I].first, ExtraRed);
25622 ExtraReds[Sz / 2] = InstVals.
back();
25628 SmallPtrSet<Value *, 8> Visited;
25630 for (
Value *RdxVal : Candidates) {
25631 if (!Visited.
insert(RdxVal).second)
25633 unsigned NumOps = VectorizedVals.lookup(RdxVal);
25634 for (Instruction *RedOp :
25640 bool InitStep =
true;
25641 while (ExtraReductions.
size() > 1) {
25643 FinalGen(ExtraReductions, InitStep);
25644 ExtraReductions.
swap(NewReds);
25647 VectorizedTree = ExtraReductions.
front().second;
25649 ReductionRoot->replaceAllUsesWith(VectorizedTree);
25656 SmallPtrSet<Value *, 4> IgnoreSet;
25665 for (
auto *U :
Ignore->users()) {
25667 "All users must be either in the reduction ops list.");
25670 if (!
Ignore->use_empty()) {
25672 Ignore->replaceAllUsesWith(
P);
25675 V.removeInstructionsAndOperands(RdxOps, VectorValuesAndScales);
25677 return VectorizedTree;
25683 Value *createSingleOp(IRBuilderBase &Builder,
const TargetTransformInfo &
TTI,
25684 Value *Vec,
unsigned Scale,
bool IsSigned,
25708 Rdx, emitReduction(Lane, Builder, &
TTI, DestTy),
I);
25711 Rdx = emitReduction(Vec, Builder, &
TTI, DestTy);
25713 if (Rdx->
getType() != DestTy)
25719 Rdx = emitScaleForReusedOps(Rdx, Builder, Scale);
25726 bool IsCmpSelMinMax, FastMathFlags FMF,
25727 const BoUpSLP &R, DominatorTree &DT,
25728 const DataLayout &
DL,
25729 const TargetLibraryInfo &TLI) {
25731 Type *ScalarTy = ReducedVals.
front()->getType();
25732 unsigned ReduxWidth = ReducedVals.
size();
25733 FixedVectorType *VectorTy =
R.getReductionType();
25738 auto EvaluateScalarCost = [&](function_ref<
InstructionCost()> GenCostFn) {
25741 int Cnt = ReducedVals.
size();
25742 for (
Value *RdxVal : ReducedVals) {
25749 Cost += GenCostFn();
25753 for (User *U : RdxVal->
users()) {
25755 if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) {
25756 if (RdxKind == RecurKind::FAdd) {
25766 FMACost -= FMulCost;
25768 ScalarCost += FMACost;
25775 ScalarCost = InstructionCost::getInvalid();
25779 Cost += ScalarCost;
25781 Cost += GenCostFn();
25790 bool DoesRequireReductionOp = !AllConsts && VectorValuesAndScales.
empty();
25792 case RecurKind::Add:
25793 case RecurKind::Mul:
25794 case RecurKind::Or:
25795 case RecurKind::And:
25796 case RecurKind::Xor:
25797 case RecurKind::FAdd:
25798 case RecurKind::FMul: {
25801 if (DoesRequireReductionOp) {
25804 unsigned ScalarTyNumElements = VecTy->getNumElements();
25809 ReducedVals.size()),
25820 auto [RType, IsSigned] =
R.getRootNodeTypeWithNoCast().value_or(
25821 std::make_pair(RedTy,
true));
25822 if (RType == RedTy) {
25827 RdxOpcode, !IsSigned, RedTy,
25833 auto [RType, IsSigned] =
R.getRootNodeTypeWithNoCast().value_or(
25834 std::make_pair(RedTy,
true));
25837 if (RdxKind == RecurKind::FAdd) {
25842 for (
Value *RdxVal : ReducedVals) {
25848 FMF &= FPCI->getFastMathFlags();
25851 if (!
Ops.empty()) {
25856 IntrinsicCostAttributes ICA(Intrinsic::fmuladd, RVecTy,
25857 {RVecTy, RVecTy, RVecTy}, FMF);
25863 Instruction::FMul, RVecTy,
CostKind);
25865 <<
"Minus vector FMul cost: " << FMulCost <<
"\n");
25866 FMACost -= FMulCost;
25870 if (FMACost.isValid())
25871 VectorCost += FMACost;
25875 if (RType != RedTy) {
25876 unsigned Opcode = Instruction::Trunc;
25878 Opcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
25884 ScalarCost = EvaluateScalarCost([&]() {
25889 case RecurKind::FMax:
25890 case RecurKind::FMin:
25891 case RecurKind::FMaximum:
25892 case RecurKind::FMinimum:
25893 case RecurKind::SMax:
25894 case RecurKind::SMin:
25895 case RecurKind::UMax:
25896 case RecurKind::UMin: {
25899 if (DoesRequireReductionOp) {
25905 auto [RType, IsSigned] =
R.getRootNodeTypeWithNoCast().value_or(
25906 std::make_pair(RedTy,
true));
25908 IntrinsicCostAttributes ICA(Id, RVecTy, {RVecTy, RVecTy}, FMF);
25910 if (RType != RedTy) {
25911 unsigned Opcode = Instruction::Trunc;
25913 Opcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
25919 ScalarCost = EvaluateScalarCost([&]() {
25920 IntrinsicCostAttributes ICA(Id, ScalarTy, {ScalarTy, ScalarTy}, FMF);
25929 LLVM_DEBUG(
dbgs() <<
"SLP: Adding cost " << VectorCost - ScalarCost
25931 <<
" (It is a splitting reduction)\n");
25932 return VectorCost - ScalarCost;
25938 Value *emitReduction(IRBuilderBase &Builder,
const TargetTransformInfo &
TTI,
25940 Value *ReducedSubTree =
nullptr;
25942 auto CreateSingleOp = [&](
Value *Vec,
unsigned Scale,
bool IsSigned) {
25943 Value *Rdx = createSingleOp(Builder,
TTI, Vec, Scale, IsSigned, DestTy);
25944 if (ReducedSubTree)
25945 ReducedSubTree = createOp(Builder, RdxKind, ReducedSubTree, Rdx,
25946 "op.rdx", ReductionOps);
25948 ReducedSubTree = Rdx;
25950 if (VectorValuesAndScales.
size() == 1) {
25951 const auto &[Vec, Scale, IsSigned] = VectorValuesAndScales.
front();
25952 CreateSingleOp(Vec, Scale, IsSigned);
25953 return ReducedSubTree;
25957 Value *VecRes =
nullptr;
25958 bool VecResSignedness =
false;
25959 auto CreateVecOp = [&](
Value *Vec,
unsigned Cnt,
bool IsSigned) {
25965 case RecurKind::Add: {
25966 if (ScalarTy == Builder.
getInt1Ty() && ScalarTy != DestTy) {
25969 <<
". (HorRdx)\n");
25972 std::iota(std::next(
Mask.begin(), VF *
I),
25973 std::next(
Mask.begin(), VF * (
I + 1)), 0);
25974 ++NumVectorInstructions;
25985 LLVM_DEBUG(
dbgs() <<
"SLP: Add (to-mul) " << Cnt <<
"of " << Vec
25986 <<
". (HorRdx)\n");
25987 ++NumVectorInstructions;
25991 case RecurKind::Xor: {
25994 <<
"SLP: Xor " << Cnt <<
"of " << Vec <<
". (HorRdx)\n");
25999 case RecurKind::FAdd: {
26003 LLVM_DEBUG(
dbgs() <<
"SLP: FAdd (to-fmul) " << Cnt <<
"of " << Vec
26004 <<
". (HorRdx)\n");
26005 ++NumVectorInstructions;
26009 case RecurKind::And:
26010 case RecurKind::Or:
26011 case RecurKind::SMax:
26012 case RecurKind::SMin:
26013 case RecurKind::UMax:
26014 case RecurKind::UMin:
26015 case RecurKind::FMax:
26016 case RecurKind::FMin:
26017 case RecurKind::FMaximum:
26018 case RecurKind::FMinimum:
26021 case RecurKind::Sub:
26022 case RecurKind::AddChainWithSubs:
26023 case RecurKind::Mul:
26024 case RecurKind::FMul:
26025 case RecurKind::FMulAdd:
26026 case RecurKind::AnyOf:
26027 case RecurKind::FindFirstIVSMin:
26028 case RecurKind::FindFirstIVUMin:
26029 case RecurKind::FindLastIVSMax:
26030 case RecurKind::FindLastIVUMax:
26031 case RecurKind::FindLast:
26032 case RecurKind::FMaxNum:
26033 case RecurKind::FMinNum:
26034 case RecurKind::FMaximumNum:
26035 case RecurKind::FMinimumNum:
26036 case RecurKind::None:
26043 VecResSignedness = IsSigned;
26045 ++NumVectorInstructions;
26046 if (ScalarTy == Builder.
getInt1Ty() && ScalarTy != DestTy &&
26052 std::iota(
Mask.begin(),
Mask.end(), 0);
26054 if (VecResVF < VecVF) {
26058 if (VecResVF != VecVF) {
26060 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
26077 if (VecResVF < VecVF) {
26083 if (VecResVF != VecVF)
26085 Op = createOp(Builder, RdxKind,
Op, Vec,
"rdx.op", ReductionOps);
26086 if (VecResVF != VecVF)
26091 for (
auto [Vec, Scale, IsSigned] : VectorValuesAndScales)
26092 CreateVecOp(Vec, Scale, IsSigned);
26093 CreateSingleOp(VecRes, 1,
false);
26095 return ReducedSubTree;
26099 Value *emitReduction(
Value *VectorizedValue, IRBuilderBase &Builder,
26100 const TargetTransformInfo *
TTI,
Type *DestTy) {
26101 assert(VectorizedValue &&
"Need to have a vectorized tree node");
26102 assert(RdxKind != RecurKind::FMulAdd &&
26103 "A call to the llvm.fmuladd intrinsic is not handled yet");
26106 if (FTy->getScalarType() == Builder.
getInt1Ty() &&
26107 RdxKind == RecurKind::Add &&
26112 VectorizedValue, Builder.
getIntNTy(FTy->getNumElements()));
26113 ++NumVectorInstructions;
26116 ++NumVectorInstructions;
26121 Value *emitScaleForReusedOps(
Value *VectorizedValue, IRBuilderBase &Builder,
26123 assert(IsSupportedHorRdxIdentityOp &&
26124 "The optimization of matched scalar identity horizontal reductions "
26125 "must be supported.");
26127 return VectorizedValue;
26129 case RecurKind::Add: {
26131 Value *Scale = ConstantInt::get(VectorizedValue->
getType(), Cnt);
26133 << VectorizedValue <<
". (HorRdx)\n");
26134 return Builder.
CreateMul(VectorizedValue, Scale);
26136 case RecurKind::Xor: {
26138 LLVM_DEBUG(
dbgs() <<
"SLP: Xor " << Cnt <<
"of " << VectorizedValue
26139 <<
". (HorRdx)\n");
26142 return VectorizedValue;
26144 case RecurKind::FAdd: {
26146 Value *Scale = ConstantFP::get(VectorizedValue->
getType(), Cnt);
26148 << VectorizedValue <<
". (HorRdx)\n");
26149 return Builder.
CreateFMul(VectorizedValue, Scale);
26151 case RecurKind::And:
26152 case RecurKind::Or:
26153 case RecurKind::SMax:
26154 case RecurKind::SMin:
26155 case RecurKind::UMax:
26156 case RecurKind::UMin:
26157 case RecurKind::FMax:
26158 case RecurKind::FMin:
26159 case RecurKind::FMaximum:
26160 case RecurKind::FMinimum:
26162 return VectorizedValue;
26163 case RecurKind::Sub:
26164 case RecurKind::AddChainWithSubs:
26165 case RecurKind::Mul:
26166 case RecurKind::FMul:
26167 case RecurKind::FMulAdd:
26168 case RecurKind::AnyOf:
26169 case RecurKind::FindFirstIVSMin:
26170 case RecurKind::FindFirstIVUMin:
26171 case RecurKind::FindLastIVSMax:
26172 case RecurKind::FindLastIVUMax:
26173 case RecurKind::FindLast:
26174 case RecurKind::FMaxNum:
26175 case RecurKind::FMinNum:
26176 case RecurKind::FMaximumNum:
26177 case RecurKind::FMinimumNum:
26178 case RecurKind::None:
26187 emitReusedOps(
Value *VectorizedValue, IRBuilderBase &Builder, BoUpSLP &R,
26188 const SmallMapVector<Value *, unsigned, 16> &SameValuesCounter,
26189 const DenseMap<Value *, Value *> &TrackedToOrig) {
26190 assert(IsSupportedHorRdxIdentityOp &&
26191 "The optimization of matched scalar identity horizontal reductions "
26192 "must be supported.");
26195 if (VTy->getElementType() != VL.
front()->getType()) {
26199 R.isSignedMinBitwidthRootNode());
26202 case RecurKind::Add: {
26205 for (
Value *V : VL) {
26206 unsigned Cnt = SameValuesCounter.
lookup(TrackedToOrig.
at(V));
26207 Vals.
push_back(ConstantInt::get(
V->getType(), Cnt,
false));
26211 << VectorizedValue <<
". (HorRdx)\n");
26212 return Builder.
CreateMul(VectorizedValue, Scale);
26214 case RecurKind::And:
26215 case RecurKind::Or:
26218 <<
". (HorRdx)\n");
26219 return VectorizedValue;
26220 case RecurKind::SMax:
26221 case RecurKind::SMin:
26222 case RecurKind::UMax:
26223 case RecurKind::UMin:
26224 case RecurKind::FMax:
26225 case RecurKind::FMin:
26226 case RecurKind::FMaximum:
26227 case RecurKind::FMinimum:
26230 <<
". (HorRdx)\n");
26231 return VectorizedValue;
26232 case RecurKind::Xor: {
26237 SmallVector<int>
Mask(
26240 std::iota(
Mask.begin(),
Mask.end(), 0);
26241 bool NeedShuffle =
false;
26242 for (
unsigned I = 0, VF = VL.size();
I < VF; ++
I) {
26244 unsigned Cnt = SameValuesCounter.
lookup(TrackedToOrig.
at(V));
26245 if (Cnt % 2 == 0) {
26247 NeedShuffle =
true;
26253 dbgs() <<
"> of " << VectorizedValue <<
". (HorRdx)\n");
26257 ConstantVector::getNullValue(VectorizedValue->
getType()), Mask);
26258 return VectorizedValue;
26260 case RecurKind::FAdd: {
26263 for (
Value *V : VL) {
26264 unsigned Cnt = SameValuesCounter.
lookup(TrackedToOrig.
at(V));
26265 Vals.
push_back(ConstantFP::get(
V->getType(), Cnt));
26268 return Builder.
CreateFMul(VectorizedValue, Scale);
26270 case RecurKind::Sub:
26271 case RecurKind::AddChainWithSubs:
26272 case RecurKind::Mul:
26273 case RecurKind::FMul:
26274 case RecurKind::FMulAdd:
26275 case RecurKind::AnyOf:
26276 case RecurKind::FindFirstIVSMin:
26277 case RecurKind::FindFirstIVUMin:
26278 case RecurKind::FindLastIVSMax:
26279 case RecurKind::FindLastIVUMax:
26280 case RecurKind::FindLast:
26281 case RecurKind::FMaxNum:
26282 case RecurKind::FMinNum:
26283 case RecurKind::FMaximumNum:
26284 case RecurKind::FMinimumNum:
26285 case RecurKind::None:
26295 return HorizontalReduction::getRdxKind(V);
26301 unsigned AggregateSize = 1;
26303 Type *CurrentType =
IV->getType();
26306 for (
auto *Elt : ST->elements())
26307 if (Elt != ST->getElementType(0))
26308 return std::nullopt;
26309 AggregateSize *= ST->getNumElements();
26310 CurrentType = ST->getElementType(0);
26312 AggregateSize *= AT->getNumElements();
26313 CurrentType = AT->getElementType();
26315 AggregateSize *= VT->getNumElements();
26316 return AggregateSize;
26318 return AggregateSize;
26320 return std::nullopt;
26329 unsigned OperandOffset,
const BoUpSLP &R) {
26332 std::optional<unsigned> OperandIndex =
26334 if (!OperandIndex || R.isDeleted(LastInsertInst))
26338 BuildVectorOpds, InsertElts, *OperandIndex, R);
26341 BuildVectorOpds[*OperandIndex] = InsertedOperand;
26342 InsertElts[*OperandIndex] = LastInsertInst;
26345 }
while (LastInsertInst !=
nullptr &&
26372 "Expected insertelement or insertvalue instruction!");
26375 "Expected empty result vectors!");
26378 if (!AggregateSize)
26380 BuildVectorOpds.
resize(*AggregateSize);
26381 InsertElts.
resize(*AggregateSize);
26386 if (BuildVectorOpds.
size() >= 2)
26404 auto DominatedReduxValue = [&](
Value *R) {
26412 if (
P->getIncomingBlock(0) == ParentBB) {
26414 }
else if (
P->getIncomingBlock(1) == ParentBB) {
26418 if (Rdx && DominatedReduxValue(Rdx))
26431 if (
P->getIncomingBlock(0) == BBLatch) {
26433 }
else if (
P->getIncomingBlock(1) == BBLatch) {
26437 if (Rdx && DominatedReduxValue(Rdx))
26473 "Expected binop, select, or intrinsic for reduction matching");
26475 Root->
getOperand(HorizontalReduction::getFirstOperandIndex(Root));
26477 Root->
getOperand(HorizontalReduction::getFirstOperandIndex(Root) + 1);
26488 Value *Op0 =
nullptr;
26489 Value *Op1 =
nullptr;
26498 Value *B0 =
nullptr, *B1 =
nullptr;
26503bool SLPVectorizerPass::vectorizeHorReduction(
26504 PHINode *
P, Instruction *Root, BasicBlock *BB,
BoUpSLP &R,
26505 SmallVectorImpl<WeakTrackingVH> &PostponedInsts) {
26514 auto SelectRoot = [&]() {
26516 HorizontalReduction::getRdxKind(Root) != RecurKind::None)
26533 std::queue<std::pair<Instruction *, unsigned>>
Stack;
26534 Stack.emplace(SelectRoot(), 0);
26535 SmallPtrSet<Value *, 8> VisitedInstrs;
26538 if (
R.isAnalyzedReductionRoot(Inst))
26543 if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI))
26545 return HorRdx.tryToReduce(R, *DL, TTI, *TLI, AC, *DT);
26547 auto TryAppendToPostponedInsts = [&](
Instruction *FutureSeed) {
26548 if (TryOperandsAsNewSeeds && FutureSeed == Root) {
26560 while (!
Stack.empty()) {
26563 std::tie(Inst, Level) =
Stack.front();
26568 if (
R.isDeleted(Inst))
26570 if (
Value *VectorizedV = TryToReduce(Inst)) {
26574 Stack.emplace(
I, Level);
26577 if (
R.isDeleted(Inst))
26581 if (!TryAppendToPostponedInsts(Inst)) {
26592 if (VisitedInstrs.
insert(
Op).second)
26597 !
R.isDeleted(
I) &&
I->getParent() == BB)
26598 Stack.emplace(
I, Level);
bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
  if (!I)
    return false;

  if (!isa<BinaryOperator>(I) && !isa<CmpInst>(I))
    return false;

  // Skip potential FMA candidates, these are better handled once the fused
  // multiply-add has been formed.
  if ((I->getOpcode() == Instruction::FAdd ||
       I->getOpcode() == Instruction::FSub) &&
      canConvertToFMA({I}, getSameOpcode({I}, *TLI), *DT, *DL, *TTI, *TLI)
          .isValid())
    return false;

  Value *P = I->getParent();

  // Vectorize in current basic block only.
  auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
  auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
  if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P ||
      R.isDeleted(Op0) || R.isDeleted(Op1))
    return false;

  // First collect all possible candidates.
  SmallVector<std::pair<Value *, Value *>, 4> Candidates;
  Candidates.emplace_back(Op0, Op1);

  auto *A = dyn_cast<BinaryOperator>(Op0);
  auto *B = dyn_cast<BinaryOperator>(Op1);
  // Try to skip B.
  if (A && B && B->hasOneUse()) {
    auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
    auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
    if (B0 && B0->getParent() == P && !R.isDeleted(B0))
      Candidates.emplace_back(A, B0);
    if (B1 && B1->getParent() == P && !R.isDeleted(B1))
      Candidates.emplace_back(A, B1);
  }
  // Try to skip A.
  if (B && A && A->hasOneUse()) {
    auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
    auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
    if (A0 && A0->getParent() == P && !R.isDeleted(A0))
      Candidates.emplace_back(A0, B);
    if (A1 && A1->getParent() == P && !R.isDeleted(A1))
      Candidates.emplace_back(A1, B);
  }

  auto TryToReduce = [this, &R, &TTI = *TTI](Instruction *Inst,
                                             ArrayRef<Value *> Ops) {
    if (!isReductionCandidate(Inst))
      return false;
    Type *Ty = Inst->getType();
    if (!isValidElementType(Ty) || Ty->isPointerTy())
      return false;
    HorizontalReduction HorRdx(Inst, Ops);
    if (!HorRdx.matchReductionForOperands())
      return false;
    // Check the cost of the operations.
    auto *VecTy = getWidenedType(Ty, Ops.size());
    constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
    InstructionCost ScalarCost =
        TTI.getScalarizationOverhead(
            VecTy, APInt::getAllOnes(getNumElements(VecTy)), /*Insert=*/false,
            /*Extract=*/true, CostKind) +
        TTI.getInstructionCost(Inst, CostKind);
    InstructionCost RedCost;
    switch (::getRdxKind(Inst)) {
    case RecurKind::Add:
    case RecurKind::Mul:
    case RecurKind::Or:
    case RecurKind::And:
    case RecurKind::Xor:
    case RecurKind::FAdd:
    case RecurKind::FMul: {
      FastMathFlags FMF;
      if (auto *FPCI = dyn_cast<FPMathOperator>(Inst))
        FMF = FPCI->getFastMathFlags();
      RedCost = TTI.getArithmeticReductionCost(Inst->getOpcode(), VecTy, FMF,
                                               CostKind);
      break;
    }
    default:
      return false;
    }
    if (RedCost >= ScalarCost)
      return false;

    return HorRdx.tryToReduce(R, *DL, &TTI, *TLI, AC, *DT) != nullptr;
  };
  if (Candidates.size() == 1)
    return TryToReduce(I, {Op0, Op1}) || tryToVectorizeList({Op0, Op1}, R);

  // We have multiple options. Try to pick the single best.
  std::optional<int> BestCandidate = R.findBestRootPair(Candidates);
  if (!BestCandidate)
    return false;
  return (*BestCandidate == 0 &&
          TryToReduce(I, {Candidates[*BestCandidate].first,
                          Candidates[*BestCandidate].second})) ||
         tryToVectorizeList({Candidates[*BestCandidate].first,
                             Candidates[*BestCandidate].second},
                            R);
}
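// Illustrative note: for a root 'X = A + B' with A and B binops in the same
// block, the candidate pairs tried above are (A, B) plus, when one side has a
// single use, pairings that skip it, e.g. (A, B0) and (A, B1) for
// B = B0 op B1; findBestRootPair then picks the most promising pairing based
// on the operand-reordering look-ahead scores.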
bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Instruction *Root,
                                                 BasicBlock *BB, BoUpSLP &R) {
  SmallVector<WeakTrackingVH> PostponedInsts;
  bool Res = vectorizeHorReduction(P, Root, BB, R, PostponedInsts);
  Res |= tryToVectorize(PostponedInsts, R);
  return Res;
}

bool SLPVectorizerPass::tryToVectorize(ArrayRef<WeakTrackingVH> Insts,
                                       BoUpSLP &R) {
  bool Res = false;
  for (Value *V : Insts)
    if (auto *Inst = dyn_cast_or_null<Instruction>(V);
        Inst && !R.isDeleted(Inst))
      Res |= tryToVectorize(Inst, R);
  return Res;
}
bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
                                                 BasicBlock *BB, BoUpSLP &R,
                                                 bool MaxVFOnly) {
  if (!R.canMapToVector(IVI->getType()))
    return false;

  SmallVector<Value *, 16> BuildVectorOpds;
  SmallVector<Value *, 16> BuildVectorInsts;
  if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, BuildVectorInsts, R))
    return false;

  if (MaxVFOnly && BuildVectorOpds.size() == 2) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "NotPossible", IVI)
             << "Cannot SLP vectorize list: only 2 elements of buildvalue, "
                "trying reduction first.";
    });
    return false;
  }
  LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
  return tryToVectorizeList(BuildVectorOpds, R, MaxVFOnly);
}
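// Illustrative aggregate build matched here (hypothetical IR):
//   %r0 = insertvalue [2 x float] poison, float %s0, 0
//   %r1 = insertvalue [2 x float] %r0, float %s1, 1
// With only two collected operands and MaxVFOnly set, the remark above is
// emitted and the reduction path gets the first attempt.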
bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
                                                   BasicBlock *BB, BoUpSLP &R,
                                                   bool MaxVFOnly) {
  SmallVector<Value *, 16> BuildVectorInsts;
  SmallVector<Value *, 16> BuildVectorOpds;
  SmallVector<int> Mask;
  if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts, R) ||
      (all_of(BuildVectorOpds, IsaPred<ExtractElementInst, UndefValue>) &&
       isFixedVectorShuffle(BuildVectorOpds, Mask, AC)))
    return false;

  if (MaxVFOnly && BuildVectorInsts.size() == 2) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "NotPossible", IEI)
             << "Cannot SLP vectorize list: only 2 elements of buildvector, "
                "trying reduction first.";
    });
    return false;
  }
  LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n");
  return tryToVectorizeList(BuildVectorInsts, R, MaxVFOnly);
}
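// Note: a buildvector whose operands are all extractelements (or undefs)
// that already form a single fixed-vector shuffle is rejected up front,
// since emitting one shufflevector is cheaper than re-vectorizing the
// extracted scalars.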
/// Tries to vectorize a sorted list of compatible values, e.g. phis or
/// stores: sorts them with \p Comparator, splits the list into runs of
/// compatible elements with \p AreCompatible and hands each run to
/// \p TryToVectorizeHelper.
template <typename T>
static bool tryToVectorizeSequence(
    SmallVectorImpl<T *> &Incoming, function_ref<bool(T *, T *)> Comparator,
    function_ref<bool(ArrayRef<T *>, T *)> AreCompatible,
    function_ref<bool(ArrayRef<T *>, bool)> TryToVectorizeHelper,
    bool MaxVFOnly, BoUpSLP &R) {
  bool Changed = false;
  // Sort by type, parent, operands.
  stable_sort(Incoming, Comparator);

  // Try to vectorize elements based on their type.
  SmallVector<T *> Candidates;
  SmallVector<T *> VL;
  for (auto *IncIt = Incoming.begin(), *E = Incoming.end(); IncIt != E;
       VL.clear()) {
    // Look for the next elements with the same type, parent and operand
    // kinds.
    auto *I = dyn_cast<Instruction>(*IncIt);
    if (!I || R.isDeleted(I)) {
      ++IncIt;
      continue;
    }
    auto *SameTypeIt = IncIt;
    while (SameTypeIt != E && (!isa<Instruction>(*SameTypeIt) ||
                               R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
                               AreCompatible(VL, *SameTypeIt))) {
      auto *I = dyn_cast<Instruction>(*SameTypeIt);
      ++SameTypeIt;
      if (I && !R.isDeleted(I))
        VL.push_back(cast<T>(I));
    }

    // Try to vectorize them.
    unsigned NumElts = VL.size();
    LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at nodes ("
                      << NumElts << ")\n");
    if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL), MaxVFOnly)) {
      // Success start over because instructions might have been changed.
      Changed = true;
      // Erase deleted instructions from the list of postponed candidates.
      VL.swap(Candidates);
      Candidates.clear();
      for (T *V : VL)
        if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
          Candidates.push_back(V);
    } else {
      /// Returns the minimum number of elements that we will attempt to
      /// vectorize.
      auto GetMinNumElements = [&R](Value *V) {
        unsigned EltSize = R.getVectorElementSize(V);
        return std::max(2U, R.getMaxVecRegSize() / EltSize);
      };
      if (NumElts < GetMinNumElements(*IncIt) &&
          (Candidates.empty() ||
           Candidates.front()->getType() == (*IncIt)->getType())) {
        // Keep the run as a candidate for a later small-vector attempt.
        Candidates.append(VL.begin(), VL.end());
      }
    }
    // Final attempt to vectorize instructions with the same types.
    if (Candidates.size() > 1 &&
        (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) {
      if (TryToVectorizeHelper(Candidates, /*MaxVFOnly=*/false)) {
        // Success start over because instructions might have been changed.
        Changed = true;
      } else if (MaxVFOnly) {
        // Try to vectorize using small vectors.
        SmallVector<T *> VL;
        for (auto *It = Candidates.begin(), *End = Candidates.end(); It != End;
             VL.clear()) {
          auto *I = dyn_cast<Instruction>(*It);
          if (!I || R.isDeleted(I)) {
            ++It;
            continue;
          }
          auto *SameTypeIt = It;
          while (SameTypeIt != End &&
                 (!isa<Instruction>(*SameTypeIt) ||
                  R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
                  AreCompatible(*SameTypeIt, *It))) {
            auto *I = dyn_cast<Instruction>(*SameTypeIt);
            ++SameTypeIt;
            if (I && !R.isDeleted(I))
              VL.push_back(cast<T>(I));
          }
          unsigned NumElts = VL.size();
          if (NumElts > 1 &&
              TryToVectorizeHelper(ArrayRef(VL), /*MaxVFOnly=*/false))
            Changed = true;
          It = SameTypeIt;
        }
      }
      Candidates.clear();
    }

    // Start over at the next instruction of a different type (or the end).
    IncIt = SameTypeIt;
  }
  return Changed;
}
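// Illustrative note: with a Comparator that sorts by type/parent/operands, a
// worklist such as {i32 phi, i32 phi, float phi, i32 phi} is regrouped into
// the compatible runs {i32, i32, i32} and {float}; each run is first tried
// at the maximal vector factor only and, if that fails, re-tried later in
// smaller pieces.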
/// Compare two cmp instructions. If IsCompatibility is true, the function
/// returns true if the two cmps are compatible; otherwise it implements a
/// strict weak ordering for sorting.
template <bool IsCompatibility>
static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI,
                       const DominatorTree &DT) {
  assert(isValidElementType(V->getType()) &&
         isValidElementType(V2->getType()) &&
         "Expected valid element types only.");
  if (V == V2)
    return IsCompatibility;
  auto *CI1 = cast<CmpInst>(V);
  auto *CI2 = cast<CmpInst>(V2);
  if (CI1->getOperand(0)->getType()->getTypeID() <
      CI2->getOperand(0)->getType()->getTypeID())
    return !IsCompatibility;
  if (CI1->getOperand(0)->getType()->getTypeID() >
      CI2->getOperand(0)->getType()->getTypeID())
    return false;
  if (CI1->getOperand(0)->getType()->getScalarSizeInBits() <
      CI2->getOperand(0)->getType()->getScalarSizeInBits())
    return !IsCompatibility;
  if (CI1->getOperand(0)->getType()->getScalarSizeInBits() >
      CI2->getOperand(0)->getType()->getScalarSizeInBits())
    return false;
  CmpInst::Predicate Pred1 = CI1->getPredicate();
  CmpInst::Predicate Pred2 = CI2->getPredicate();
  CmpInst::Predicate SwapPred1 = CmpInst::getSwappedPredicate(Pred1);
  CmpInst::Predicate SwapPred2 = CmpInst::getSwappedPredicate(Pred2);
  CmpInst::Predicate BasePred1 = std::min(Pred1, SwapPred1);
  CmpInst::Predicate BasePred2 = std::min(Pred2, SwapPred2);
  if (BasePred1 < BasePred2)
    return !IsCompatibility;
  if (BasePred1 > BasePred2)
    return false;
  // Compare operands.
  bool CI1Preds = Pred1 == BasePred1;
  bool CI2Preds = Pred2 == BasePred1;
  for (int I = 0, E = CI1->getNumOperands(); I < E; ++I) {
    auto *Op1 = CI1->getOperand(CI1Preds ? I : E - I - 1);
    auto *Op2 = CI2->getOperand(CI2Preds ? I : E - I - 1);
    if (Op1->getValueID() < Op2->getValueID())
      return !IsCompatibility;
    if (Op1->getValueID() > Op2->getValueID())
      return false;
    if (auto *I1 = dyn_cast<Instruction>(Op1))
      if (auto *I2 = dyn_cast<Instruction>(Op2)) {
        if (IsCompatibility) {
          if (I1->getParent() != I2->getParent())
            return false;
        } else {
          // Try to compare nodes with same parent.
          DomTreeNodeBase<BasicBlock> *NodeI1 = DT.getNode(I1->getParent());
          DomTreeNodeBase<BasicBlock> *NodeI2 = DT.getNode(I2->getParent());
          if (!NodeI1)
            return NodeI2 != nullptr;
          if (!NodeI2)
            return false;
          assert((NodeI1 == NodeI2) ==
                     (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
                 "Different nodes should have different DFS numbers");
          if (NodeI1 != NodeI2)
            return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
        }
        InstructionsState S = getSameOpcode({I1, I2}, TLI);
        if (S && (IsCompatibility || !S.isAltShuffle()))
          continue;
        if (IsCompatibility)
          return false;
        if (I1->getOpcode() != I2->getOpcode())
          return I1->getOpcode() < I2->getOpcode();
      }
  }
  return IsCompatibility;
}
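// Illustrative note: predicates are canonicalized through their swapped form
// before comparison, so 'icmp sgt %a, %b' and 'icmp slt %b, %a' are treated
// as the same base predicate here, and the operand walk visits the operands
// in the matching (possibly swapped) order.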
template <typename ItT>
bool SLPVectorizerPass::vectorizeCmpInsts(iterator_range<ItT> CmpInsts,
                                          BasicBlock *BB, BoUpSLP &R) {
  bool Changed = false;
  // Try to find reductions first.
  for (CmpInst *I : CmpInsts) {
    if (R.isDeleted(I))
      continue;
    for (Value *Op : I->operands())
      if (auto *RootOp = dyn_cast<Instruction>(Op)) {
        Changed |= vectorizeRootInstruction(nullptr, RootOp, BB, R);
        if (R.isDeleted(I))
          break;
      }
  }
  // Try to vectorize operands as vector bundles.
  for (CmpInst *I : CmpInsts) {
    if (R.isDeleted(I))
      continue;
    Changed |= tryToVectorize(I, R);
  }
  // Try to vectorize the list of compares.
  // Sort by type, compare predicate, etc.
  auto CompareSorter = [&](Value *V, Value *V2) {
    if (V == V2)
      return false;
    return compareCmp<false>(V, V2, *TLI, *DT);
  };

  auto AreCompatibleCompares = [&](ArrayRef<Value *> VL, Value *V2) {
    if (VL.empty() || VL.back() == V2)
      return true;
    return compareCmp<true>(VL.back(), V2, *TLI, *DT);
  };

  SmallVector<Value *> Vals;
  for (Instruction *V : CmpInsts)
    if (!R.isDeleted(V) && isValidElementType(getValueType(V)))
      Vals.push_back(V);
  if (Vals.size() <= 1)
    return Changed;
  Changed |= tryToVectorizeSequence<Value>(
      Vals, CompareSorter, AreCompatibleCompares,
      [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
        // Exclude possible reductions from other blocks.
        bool ArePossiblyReducedInOtherBlock = any_of(Candidates, [](Value *V) {
          return any_of(V->users(), [V](User *U) {
            auto *Select = dyn_cast<SelectInst>(U);
            return Select &&
                   Select->getParent() != cast<Instruction>(V)->getParent();
          });
        });
        if (ArePossiblyReducedInOtherBlock)
          return false;
        return tryToVectorizeList(Candidates, R, MaxVFOnly);
      },
      /*MaxVFOnly=*/true, R);
  return Changed;
}
bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions,
                                         BasicBlock *BB, BoUpSLP &R) {
  assert(all_of(Instructions, IsaPred<InsertElementInst, InsertValueInst>) &&
         "This function only accepts Insert instructions");
  bool OpsChanged = false;
  SmallVector<WeakTrackingVH> PostponedInsts;
  for (auto *I : reverse(Instructions)) {
    // pass1 - try to match and vectorize a buildvector sequence for MaxVF
    // only.
    if (R.isDeleted(I))
      continue;
    if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
      OpsChanged |=
          vectorizeInsertValueInst(LastInsertValue, BB, R, /*MaxVFOnly=*/true);
    } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
      OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R,
                                               /*MaxVFOnly=*/true);
    }
    // pass2 - try to vectorize reductions only.
    if (R.isDeleted(I))
      continue;
    OpsChanged |= vectorizeHorReduction(nullptr, I, BB, R, PostponedInsts);
    if (R.isDeleted(I) || isa<CmpInst>(I))
      continue;
    // pass3 - try to match and vectorize a buildvector sequence of any width.
    if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
      OpsChanged |= vectorizeInsertValueInst(LastInsertValue, BB, R,
                                             /*MaxVFOnly=*/false);
    } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
      OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R,
                                               /*MaxVFOnly=*/false);
    }
  }
  // Now try to vectorize postponed instructions.
  OpsChanged |= tryToVectorize(PostponedInsts, R);

  Instructions.clear();
  return OpsChanged;
}
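// Illustrative note: each insert gets up to three attempts above, e.g. a
// chain of insertelements is first matched as a full-width buildvector, then
// offered as a horizontal-reduction seed, and finally retried as a
// buildvector of any width, so the cheapest form wins before the general one
// is attempted.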
bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
  bool Changed = false;
  SmallVector<Value *, 4> Incoming;
  SmallPtrSet<Value *, 16> VisitedInstrs;

  // Maps phi nodes to the non-phi nodes found in the use tree for each phi
  // node. The match between phi nodes is based on this info.
  DenseMap<Value *, SmallVector<Value *, 4>> PHIToOpcodes;
  auto PHICompare = [this, &PHIToOpcodes](Value *V1, Value *V2) {
    assert(isValidElementType(V1->getType()) &&
           isValidElementType(V2->getType()) &&
           "Expected vectorizable types only.");
    if (V1 == V2)
      return false;
    if (V1->getType()->getScalarSizeInBits() <
        V2->getType()->getScalarSizeInBits())
      return true;
    if (V1->getType()->getScalarSizeInBits() >
        V2->getType()->getScalarSizeInBits())
      return false;
    ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
    ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
    if (Opcodes1.size() < Opcodes2.size())
      return true;
    if (Opcodes1.size() > Opcodes2.size())
      return false;
    for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
      {
        // Instructions come first.
        auto *I1 = dyn_cast<Instruction>(Opcodes1[I]);
        auto *I2 = dyn_cast<Instruction>(Opcodes2[I]);
        if (I1 && I2) {
          DomTreeNodeBase<BasicBlock> *NodeI1 = DT->getNode(I1->getParent());
          DomTreeNodeBase<BasicBlock> *NodeI2 = DT->getNode(I2->getParent());
          if (!NodeI1)
            return NodeI2 != nullptr;
          if (!NodeI2)
            return false;
          assert((NodeI1 == NodeI2) ==
                     (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
                 "Different nodes should have different DFS numbers");
          if (NodeI1 != NodeI2)
            return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
          InstructionsState S = getSameOpcode({I1, I2}, *TLI);
          if (S && !S.isAltShuffle() &&
              I1->getOpcode() == I2->getOpcode()) {
            // Order matching extracts by the position of their source
            // vectors and, failing that, by the extract index.
            auto *EE1 = dyn_cast<ExtractElementInst>(I1);
            auto *EE2 = dyn_cast<ExtractElementInst>(I2);
            if (EE1 && EE2) {
              auto *V1 = dyn_cast<Instruction>(EE1->getVectorOperand());
              auto *V2 = dyn_cast<Instruction>(EE2->getVectorOperand());
              if (V1 && V2 && V1 != V2) {
                DomTreeNodeBase<BasicBlock> *NodeI1 =
                    DT->getNode(V1->getParent());
                DomTreeNodeBase<BasicBlock> *NodeI2 =
                    DT->getNode(V2->getParent());
                if (!NodeI1)
                  return NodeI2 != nullptr;
                if (!NodeI2)
                  return false;
                assert((NodeI1 == NodeI2) ==
                           (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
                       "Different nodes should have different DFS numbers");
                if (NodeI1 != NodeI2)
                  return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
                return V1->comesBefore(V2);
              }
              std::optional<unsigned> Id1 = getExtractIndex(EE1);
              std::optional<unsigned> Id2 = getExtractIndex(EE2);
              if (Id1 && Id2)
                return *Id1 < *Id2;
            }
            continue;
          }
          if (I1->getOpcode() == I2->getOpcode())
            continue;
          return I1->getOpcode() < I2->getOpcode();
        }
        if (I1)
          return true;
        if (I2)
          return false;
      }
      {
        // Non-undef constants come next.
        bool C1 = isa<Constant>(Opcodes1[I]) && !isa<UndefValue>(Opcodes1[I]);
        bool C2 = isa<Constant>(Opcodes2[I]) && !isa<UndefValue>(Opcodes2[I]);
        if (C1 && C2)
          continue;
        if (C1)
          return true;
        if (C2)
          return false;
      }
      bool U1 = isa<UndefValue>(Opcodes1[I]);
      bool U2 = isa<UndefValue>(Opcodes2[I]);
      {
        // Non-constant non-instructions come next.
        if (!U1 && !U2) {
          auto ValID1 = Opcodes1[I]->getValueID();
          auto ValID2 = Opcodes2[I]->getValueID();
          if (ValID1 == ValID2)
            continue;
          if (ValID1 < ValID2)
            return true;
          if (ValID1 > ValID2)
            return false;
        }
        if (!U1)
          return true;
        if (!U2)
          return false;
      }
      // Undefs come last.
      assert(U1 && U2 && "The only thing left should be undef & undef.");
    }
    return false;
  };
  auto AreCompatiblePHIs = [&PHIToOpcodes, this, &R](ArrayRef<Value *> VL,
                                                     Value *V1) {
    if (VL.empty() || V1 == VL.back())
      return true;
    Value *V2 = VL.back();
    if (V1->getType() != V2->getType())
      return false;
    ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
    ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
    if (Opcodes1.size() != Opcodes2.size())
      return false;
    for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
      // Undefs are compatible with any other value.
      if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I]))
        continue;
      if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))
        if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
          if (R.isDeleted(I1) || R.isDeleted(I2))
            return false;
          if (I1->getParent() != I2->getParent())
            return false;
          if (getSameOpcode({I1, I2}, *TLI))
            continue;
          return false;
        }
      if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I]))
        continue;
      if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID())
        return false;
    }
    return true;
  };

  bool HaveVectorizedPhiNodes = false;
  do {
    // Collect the incoming values from the PHIs.
    Incoming.clear();
    for (Instruction &I : *BB) {
      auto *P = dyn_cast<PHINode>(&I);
      if (!P || P->getNumIncomingValues() > MaxPHINumOperands)
        break;

      // No need to analyze deleted, vectorized and non-vectorizable
      // instructions.
      if (!VisitedInstrs.count(P) && !R.isDeleted(P) &&
          isValidElementType(P->getType()))
        Incoming.push_back(P);
    }

    if (Incoming.size() <= 1)
      break;

    // Find the corresponding non-phi nodes for better matching when trying to
    // build the tree.
    for (Value *V : Incoming) {
      SmallVectorImpl<Value *> &Opcodes =
          PHIToOpcodes.try_emplace(V).first->getSecond();
      if (!Opcodes.empty())
        continue;
      SmallVector<Value *, 4> Nodes(1, V);
      SmallPtrSet<Value *, 4> Visited;
      while (!Nodes.empty()) {
        auto *PHI = cast<PHINode>(Nodes.pop_back_val());
        if (!Visited.insert(PHI).second)
          continue;
        for (Value *V : PHI->incoming_values()) {
          if (auto *PHI1 = dyn_cast<PHINode>(V)) {
            Nodes.push_back(PHI1);
            continue;
          }
          Opcodes.emplace_back(V);
        }
      }
    }

    HaveVectorizedPhiNodes = tryToVectorizeSequence<Value>(
        Incoming, PHICompare, AreCompatiblePHIs,
        [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
          return tryToVectorizeList(Candidates, R, MaxVFOnly);
        },
        /*MaxVFOnly=*/true, R);
    Changed |= HaveVectorizedPhiNodes;
    if (HaveVectorizedPhiNodes && any_of(PHIToOpcodes, [&](const auto &P) {
          auto *PHI = dyn_cast<PHINode>(P.first);
          return !PHI || R.isDeleted(PHI);
        }))
      PHIToOpcodes.clear();
    VisitedInstrs.insert(Incoming.begin(), Incoming.end());
  } while (HaveVectorizedPhiNodes);

  VisitedInstrs.clear();

  InstSetVector PostProcessInserts;
  SmallSetVector<CmpInst *, 8> PostProcessCmps;
  // Vectorizes Inserts in PostProcessInserts and, if VectorizeCmps is true,
  // also vectorizes PostProcessCmps.
  auto VectorizeInsertsAndCmps = [&](bool VectorizeCmps) {
    bool Changed = vectorizeInserts(PostProcessInserts, BB, R);
    if (VectorizeCmps) {
      Changed |= vectorizeCmpInsts(reverse(PostProcessCmps), BB, R);
      PostProcessCmps.clear();
    }
    PostProcessInserts.clear();
    return Changed;
  };
  // Returns true if I is in PostProcessInserts or PostProcessCmps.
  auto IsInPostProcessInstrs = [&](Instruction *I) {
    if (auto *Cmp = dyn_cast<CmpInst>(I))
      return PostProcessCmps.contains(Cmp);
    return isa<InsertElementInst, InsertValueInst>(I) &&
           PostProcessInserts.contains(I);
  };
  // Returns true if the instruction is not vectorizable itself, but its
  // operands may be.
  auto HasNoUsers = [](Instruction *I) {
    return I->use_empty() &&
           (I->getType()->isVoidTy() || isa<CallInst, InvokeInst>(I));
  };
  for (BasicBlock::iterator It = BB->begin(), E = BB->end(); It != E; ++It) {
    // Skip instructions marked for the deletion.
    if (R.isDeleted(&*It))
      continue;
    // We may go through BB multiple times so skip the one we have checked.
    if (!VisitedInstrs.insert(&*It).second) {
      if (HasNoUsers(&*It) &&
          VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator())) {
        // We would like to start over since some instructions are deleted
        // and the iterator may become an invalid value.
        Changed = true;
        It = BB->begin();
        E = BB->end();
      }
      continue;
    }

    // Try to vectorize reductions that use PHINodes.
    if (PHINode *P = dyn_cast<PHINode>(&*It)) {
      // Check that the PHI is a reduction PHI.
      if (P->getNumIncomingValues() == 2) {
        // Try to match and vectorize a horizontal reduction.
        Instruction *Root = getReductionInstr(DT, P, BB, LI);
        if (Root && vectorizeRootInstruction(P, Root, BB, R)) {
          Changed = true;
          It = BB->begin();
          E = BB->end();
          continue;
        }
      }
      // Try to vectorize the incoming values of the PHI, to catch reductions
      // that feed into PHIs.
      for (unsigned I : seq<unsigned>(P->getNumIncomingValues())) {
        // Skip if the incoming block is the current BB for now. Also, bypass
        // unreachable IR for efficiency and to avoid crashing.
        if (BB == P->getIncomingBlock(I) ||
            !DT->isReachableFromEntry(P->getIncomingBlock(I)))
          continue;

        // Postponed instructions should not be vectorized here, delay their
        // vectorization.
        if (auto *PI = dyn_cast<Instruction>(P->getIncomingValue(I));
            PI && !IsInPostProcessInstrs(PI)) {
          bool Res =
              vectorizeRootInstruction(nullptr, PI, P->getIncomingBlock(I), R);
          Changed |= Res;
          if (Res && R.isDeleted(P)) {
            It = BB->begin();
            E = BB->end();
            break;
          }
        }
      }
      continue;
    }

    if (HasNoUsers(&*It)) {
      bool OpsChanged = false;
      auto *SI = dyn_cast<StoreInst>(&*It);
      bool TryToVectorizeRoot = ShouldStartVectorizeHorAtStore || !SI;
      if (SI) {
        auto I = Stores.find(getUnderlyingObject(SI->getPointerOperand()));
        // Try to vectorize chain in store, if this is the only store to the
        // address in the block.
        TryToVectorizeRoot |= (I == Stores.end() || I->second.size() == 1) &&
                              SI->getValueOperand()->hasOneUse();
      }
      if (TryToVectorizeRoot) {
        for (auto *V : It->operand_values()) {
          // Postponed instructions should not be vectorized here, delay their
          // vectorization.
          if (auto *VI = dyn_cast<Instruction>(V);
              VI && !IsInPostProcessInstrs(VI))
            // Try to match and vectorize a horizontal reduction.
            OpsChanged |= vectorizeRootInstruction(nullptr, VI, BB, R);
        }
      }
      // Start vectorization of the post-process list of instructions from the
      // top-tree instructions to try to vectorize as many instructions as
      // possible.
      OpsChanged |=
          VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator());
      if (OpsChanged) {
        // We would like to start over since some instructions are deleted
        // and the iterator may become an invalid value.
        Changed = true;
        It = BB->begin();
        E = BB->end();
        continue;
      }
    }

    if (isa<InsertElementInst, InsertValueInst>(&*It))
      PostProcessInserts.insert(&*It);
    else if (isa<CmpInst>(&*It))
      PostProcessCmps.insert(cast<CmpInst>(&*It));
  }

  return Changed;
}
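// Illustrative note: two phis are grouped as compatible above when the
// non-phi values reachable through their incoming edges match pairwise by
// opcode and parent block, e.g. (hypothetical IR)
//   %p0 = phi i32 [ %a0, %bb0 ], [ %b0, %bb1 ]
//   %p1 = phi i32 [ %a1, %bb0 ], [ %b1, %bb1 ]
// where %a0/%a1 and %b0/%b1 are same-opcode instructions in the same blocks;
// such phis become one candidate list for tryToVectorizeList.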
bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
  bool Changed = false;
  for (auto &Entry : GEPs) {
    // If the getelementptr list has fewer than two elements, there's nothing
    // to do.
    if (Entry.second.size() < 2)
      continue;

    LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
                      << Entry.second.size() << ".\n");

    // Process the GEP list in chunks suitable for the target's supported
    // vector size. We are trying to vectorize the index computations, so the
    // maximum number of elements is based on the size of the index
    // expression, rather than the size of the GEP itself (the target's
    // pointer size).
    auto It = find_if(Entry.second, [&](GetElementPtrInst *GEP) {
      return !R.isDeleted(GEP);
    });
    if (It == Entry.second.end())
      continue;
    unsigned MaxVecRegSize = R.getMaxVecRegSize();
    unsigned EltSize = R.getVectorElementSize(*(*It)->idx_begin());
    if (MaxVecRegSize < EltSize)
      continue;

    unsigned MaxElts = MaxVecRegSize / EltSize;
    for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) {
      auto Len = std::min<unsigned>(BE - BI, MaxElts);
      ArrayRef<GetElementPtrInst *> GEPList(&Entry.second[BI], Len);

      // Initialize a set of candidate getelementptrs. Note that we use a
      // SetVector here to preserve program order. If the index computations
      // are vectorizable and begin with loads, we want to minimize the chance
      // of having to reorder them later.
      SetVector<Value *> Candidates(GEPList.begin(), GEPList.end());

      // Some of the candidates may have already been vectorized after we
      // initially collected them, or their index may have been optimized to a
      // constant value. If so, remove them from the set of candidates.
      Candidates.remove_if([&R](Value *I) {
        return R.isDeleted(cast<Instruction>(I)) ||
               isa<Constant>(cast<GetElementPtrInst>(I)->idx_begin()->get());
      });

      // Remove from the set of candidates all pairs of getelementptrs with
      // constant differences. Such getelementptrs are likely not good
      // candidates for vectorization in a bottom-up phase since one can be
      // computed from the other. We also ensure all candidate getelementptr
      // indices are unique.
      for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1;
           ++I) {
        auto *GEPI = GEPList[I];
        if (!Candidates.count(GEPI))
          continue;
        const SCEV *SCEVI = SE->getSCEV(GEPList[I]);
        for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
          auto *GEPJ = GEPList[J];
          const SCEV *SCEVJ = SE->getSCEV(GEPList[J]);
          if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
            Candidates.remove(GEPI);
            Candidates.remove(GEPJ);
          } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
            Candidates.remove(GEPJ);
          }
        }
      }

      // We break out of the above computation as soon as we know there are
      // fewer than two candidates remaining.
      if (Candidates.size() < 2)
        continue;

      // Add the single, non-constant index of each candidate to the bundle.
      // We ensured the indices met these constraints when we originally
      // collected the getelementptrs.
      SmallVector<Value *, 16> Bundle(Candidates.size());
      auto BundleIndex = 0u;
      for (auto *V : Candidates) {
        auto *GEP = cast<GetElementPtrInst>(V);
        auto *GEPIdx = GEP->idx_begin()->get();
        assert(GEP->getNumIndices() == 1 && !isa<Constant>(GEPIdx));
        Bundle[BundleIndex++] = GEPIdx;
      }

      // Try and vectorize the indices.
      Changed |= tryToVectorizeList(Bundle, R);
    }
  }
  return Changed;
}
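// Illustrative note: the surviving candidates are GEPs over the same base
// whose offsets are not provably a constant apart, e.g. (hypothetical IR)
//   %g0 = getelementptr inbounds i32, ptr %base, i64 %i
//   %g1 = getelementptr inbounds i32, ptr %base, i64 %j
// Their single non-constant indices %i and %j form the bundle handed to
// tryToVectorizeList above.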
bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
  bool Changed = false;
  // Sort by type, base pointers and values operand. Value operands must be
  // compatible (have the same opcode, same parent), otherwise it is
  // definitely not profitable to try to vectorize them.
  auto &&StoreSorter = [this](StoreInst *V, StoreInst *V2) {
    if (V->getValueOperand()->getType()->getTypeID() <
        V2->getValueOperand()->getType()->getTypeID())
      return true;
    if (V->getValueOperand()->getType()->getTypeID() >
        V2->getValueOperand()->getType()->getTypeID())
      return false;
    if (V->getPointerOperandType()->getTypeID() <
        V2->getPointerOperandType()->getTypeID())
      return true;
    if (V->getPointerOperandType()->getTypeID() >
        V2->getPointerOperandType()->getTypeID())
      return false;
    if (V->getValueOperand()->getType()->getScalarSizeInBits() <
        V2->getValueOperand()->getType()->getScalarSizeInBits())
      return true;
    if (V->getValueOperand()->getType()->getScalarSizeInBits() >
        V2->getValueOperand()->getType()->getScalarSizeInBits())
      return false;
    if (auto *I1 = dyn_cast<Instruction>(V->getValueOperand()))
      if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
        DomTreeNodeBase<llvm::BasicBlock> *NodeI1 =
            DT->getNode(I1->getParent());
        DomTreeNodeBase<llvm::BasicBlock> *NodeI2 =
            DT->getNode(I2->getParent());
        assert(NodeI1 && "Should only process reachable instructions");
        assert(NodeI2 && "Should only process reachable instructions");
        assert((NodeI1 == NodeI2) ==
                   (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
               "Different nodes should have different DFS numbers");
        if (NodeI1 != NodeI2)
          return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
        return I1->getOpcode() < I2->getOpcode();
      }
    return V->getValueOperand()->getValueID() <
           V2->getValueOperand()->getValueID();
  };

  auto AreCompatibleStores = [this, &R](ArrayRef<StoreInst *> VL,
                                        StoreInst *V1) {
    if (VL.empty() || V1 == VL.back())
      return true;
    bool SameParent = true;
    StoreInst *V2 = VL.back();
    if (V1->getValueOperand()->getType() != V2->getValueOperand()->getType())
      return false;
    if (V1->getPointerOperandType() != V2->getPointerOperandType())
      return false;
    // Undefs are compatible with any other value.
    if (isa<UndefValue>(V1->getValueOperand()) ||
        isa<UndefValue>(V2->getValueOperand()))
      return true;
    auto *I1 = dyn_cast<Instruction>(V1->getValueOperand());
    auto *I2 = dyn_cast<Instruction>(V2->getValueOperand());
    SameParent &= I1 && I2 && I1->getParent() == I2->getParent();
    if (SameParent) {
      SmallVector<Value *> NewVL(VL.size());
      for (auto [SI, V] : zip(VL, NewVL))
        V = SI->getValueOperand();
      NewVL.back() = V1->getValueOperand();
      InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
      InstructionsState S = Analysis.buildInstructionsState(
          NewVL, R, /*TryCopyableElementsVectorization=*/true);
      if (S)
        return true;
    }
    return V1->getValueOperand()->getValueID() ==
           V2->getValueOperand()->getValueID();
  };

  // Attempt to sort and vectorize each of the store-groups.
  DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>> Attempted;
  for (auto &Pair : Stores) {
    if (Pair.second.size() < 2)
      continue;

    LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
                      << Pair.second.size() << ".\n");

    if (!isValidElementType(Pair.second.front()->getValueOperand()->getType()))
      continue;

    // Reverse stores to do bottom-to-top analysis. This is important if the
    // values are stored to the same addresses several times, in which case we
    // need to follow the order of the stores (reversed to meet the memory
    // dependencies).
    SmallVector<StoreInst *> ReversedStores(Pair.second.rbegin(),
                                            Pair.second.rend());
    Changed |= tryToVectorizeSequence<StoreInst>(
        ReversedStores, StoreSorter, AreCompatibleStores,
        [&](ArrayRef<StoreInst *> Candidates, bool) {
          return vectorizeStores(Candidates, R, Attempted);
        },
        /*MaxVFOnly=*/false, R);
  }
  return Changed;
}
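// Illustrative note: 'Stores' buckets stores by the underlying object of
// their pointer operand, so a pair such as (hypothetical IR)
//   store i32 %v0, ptr %p
//   %q = getelementptr inbounds i32, ptr %p, i64 1
//   store i32 %v1, ptr %q
// lands in one bucket; the bucket is reversed for bottom-up analysis, sorted
// and split into compatible runs, and each run is offered to vectorizeStores.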
for(const MachineOperand &MO :llvm::drop_begin(OldMI.operands(), Desc.getNumOperands()))
static unsigned getIntrinsicID(const SDNode *N)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static bool isConstant(const MachineInstr &MI)
AMDGPU Register Bank Select
ReachingDefInfo InstSet InstSet & Ignore
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Function Alias Analysis Results
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
block Block Frequency Analysis
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
static cl::opt< IntrinsicCostStrategy > IntrinsicCost("intrinsic-cost-strategy", cl::desc("Costing strategy for intrinsic instructions"), cl::init(IntrinsicCostStrategy::InstructionCost), cl::values(clEnumValN(IntrinsicCostStrategy::InstructionCost, "instruction-cost", "Use TargetTransformInfo::getInstructionCost"), clEnumValN(IntrinsicCostStrategy::IntrinsicCost, "intrinsic-cost", "Use TargetTransformInfo::getIntrinsicInstrCost"), clEnumValN(IntrinsicCostStrategy::TypeBasedIntrinsicCost, "type-based-intrinsic-cost", "Calculate the intrinsic cost based only on argument types")))
static APInt getElementIndex(TypeSize ElemSize, APInt &Offset)
This file provides an implementation of debug counters.
#define DEBUG_COUNTER(VARNAME, COUNTERNAME, DESC)
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
static bool runImpl(Function &F, const TargetLowering &TLI, const LibcallLoweringInfo &Libcalls, AssumptionCache *AC)
This is the interface for a simple mod/ref and alias analysis over globals.
static Value * getCondition(Instruction *I)
static void setCondition(Instruction *I, Value *NewCond)
static const HTTPClientCleanup Cleanup
static Type * getIndexType(Value *In)
Module.h This file contains the declarations for the Module class.
This defines the Use class.
iv Induction Variable Users
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static bool isSplat(Value *V)
Return true if V is a splat of a value (which is used when multiplying a matrix with a scalar).
This file provides utility analysis objects describing memory locations.
MachineInstr unsigned OpIdx
uint64_t IntrinsicInst * II
static bool IsSelect(MachineInstr &MI)
This file defines the PriorityQueue class.
const SmallVectorImpl< MachineOperand > & Cond
static std::optional< OperandInfo > getOperandInfo(const MachineOperand &MO)
static bool isValid(const char C)
Returns true if C is a valid mangled character: <0-9a-zA-Z_>.
static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts, TargetTransformInfo *TTI, bool MustMatchOrInst)
static cl::opt< bool > RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden, cl::desc("Run the SLP vectorization passes"))
static bool isAlternateInstruction(Instruction *I, Instruction *MainOp, Instruction *AltOp, const TargetLibraryInfo &TLI)
Checks if the specified instruction I is an alternate operation for the given MainOp and AltOp instru...
static FixedVectorType * getWidenedType(Type *ScalarTy, unsigned VF)
static bool doesNotNeedToSchedule(ArrayRef< Value * > VL)
Checks if the specified array of instructions does not require scheduling.
static bool isVectorLikeInstWithConstOps(Value *V)
Checks if V is one of vector-like instructions, i.e.
static cl::opt< bool > SplitAlternateInstructions("slp-split-alternate-instructions", cl::init(true), cl::Hidden, cl::desc("Improve the code quality by splitting alternate instructions"))
static bool isRepeatedNonIdentityClusteredMask(ArrayRef< int > Mask, unsigned Sz)
Checks if the given mask is a "clustered" mask with the same clusters of size Sz, which are not ident...
static bool isMaskedLoadCompress(ArrayRef< Value * > VL, ArrayRef< Value * > PointerOps, ArrayRef< unsigned > Order, const TargetTransformInfo &TTI, const DataLayout &DL, ScalarEvolution &SE, AssumptionCache &AC, const DominatorTree &DT, const TargetLibraryInfo &TLI, const function_ref< bool(Value *)> AreAllUsersVectorized, bool &IsMasked, unsigned &InterleaveFactor, SmallVectorImpl< int > &CompressMask, VectorType *&LoadVecTy)
Checks if the VL can be transformed to a (masked)load + compress or (masked) interleaved load.
static const unsigned MaxPHINumOperands
Maximum allowed number of operands in the PHI nodes.
static bool isUsedOutsideBlock(Value *V)
Checks if the provided value does not require scheduling.
static cl::opt< bool > VectorizeCopyableElements("slp-copyable-elements", cl::init(true), cl::Hidden, cl::desc("Try to replace values with the idempotent instructions for " "better vectorization."))
Enables vectorization of copyable elements.
static cl::opt< int > MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static bool allSameOpcode(ArrayRef< Value * > VL)
static InstructionCost canConvertToFMA(ArrayRef< Value * > VL, const InstructionsState &S, DominatorTree &DT, const DataLayout &DL, TargetTransformInfo &TTI, const TargetLibraryInfo &TLI)
Check if we can convert fadd/fsub sequence to FMAD.
static cl::opt< unsigned > MaxProfitableLoadStride("slp-max-stride", cl::init(8), cl::Hidden, cl::desc("The maximum stride, considered to be profitable."))
static bool isCommutative(Instruction *I, Value *ValWithUses, bool IsCopyable=false)
static bool findBuildAggregate(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, const BoUpSLP &R)
Recognize construction of vectors like ra = insertelement <4 x float> poison, float s0,...
static bool clusterSortPtrAccesses(ArrayRef< Value * > VL, ArrayRef< BasicBlock * > BBs, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
static unsigned getNumElements(Type *Ty)
static SmallBitVector buildUseMask(int VF, ArrayRef< int > Mask, UseMask MaskArg)
Prepares a use bitset for the given mask either for the first argument or for the second.
static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0, Value *Op1, const TargetLibraryInfo &TLI)
Checks if the provided operands of 2 cmp instructions are compatible, i.e.
static void reorderScalars(SmallVectorImpl< Value * > &Scalars, ArrayRef< int > Mask)
Reorders the list of scalars in accordance with the given Mask.
static Value * createInsertVector(IRBuilderBase &Builder, Value *Vec, Value *V, unsigned Index, function_ref< Value *(Value *, Value *, ArrayRef< int >)> Generator={})
Creates subvector insert.
static void findBuildAggregateRec(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, unsigned OperandOffset, const BoUpSLP &R)
static unsigned getNumElems(unsigned Size, unsigned PartNumElems, unsigned Part)
Returns correct remaining number of elements, considering total amount Size, (power-of-2 number) of e...
static InstructionCost getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={})
Returns the cost of the shuffle instructions with the given Kind, vector type Tp and optional Mask.
static bool isSimple(Instruction *I)
static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns true if widened type of Ty elements with size Sz represents full vector type,...
static const int MinScheduleRegionSize
If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling regions to be handled.
static cl::opt< unsigned > MinProfitableStridedLoads("slp-min-strided-loads", cl::init(2), cl::Hidden, cl::desc("The minimum number of loads, which should be considered strided, " "if the stride is > 1 or is runtime value"))
static bool isFirstInsertElement(const InsertElementInst *IE1, const InsertElementInst *IE2)
Checks if the IE1 instructions is followed by IE2 instruction in the buildvector sequence.
static cl::opt< int > LookAheadMaxDepth("slp-max-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for operand reordering scores"))
static cl::opt< unsigned > MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden, cl::desc("Maximum SLP vectorization factor (0=unlimited)"))
static void reorderReuses(SmallVectorImpl< int > &Reuses, ArrayRef< int > Mask)
Reorders the given Reuses mask according to the given Mask.
static void combineOrders(MutableArrayRef< unsigned > Order, ArrayRef< unsigned > SecondaryOrder)
static const unsigned MaxMemDepDistance
static cl::opt< bool > ViewSLPTree("view-slp-tree", cl::Hidden, cl::desc("Display the SLP trees with Graphviz"))
static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, TargetLibraryInfo *TLI, const TargetTransformInfo *TTI)
static DebugLoc getDebugLocFromPHI(PHINode &PN)
static std::optional< unsigned > getExtractIndex(const Instruction *E)
static cl::opt< bool > VectorizeNonPowerOf2("slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden, cl::desc("Try to vectorize with non-power-of-2 number of elements."))
static cl::opt< unsigned > MinTreeSize("slp-min-tree-size", cl::init(3), cl::Hidden, cl::desc("Only vectorize small trees if they are fully vectorizable"))
static void reorderOrder(SmallVectorImpl< unsigned > &Order, ArrayRef< int > Mask, bool BottomOrder=false)
Reorders the given Order according to the given Mask.
static cl::opt< bool > ForceStridedLoads("slp-force-strided-loads", cl::init(false), cl::Hidden, cl::desc("Generate strided loads even if they are not " "profitable. Used for testing only."))
static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not less than Sz, which forms type,...
static bool isMainInstruction(Instruction *I, Instruction *MainOp, Instruction *AltOp, const TargetLibraryInfo &TLI)
Checks if the specified instruction I is an main operation for the given MainOp and AltOp instruction...
static T * performExtractsShuffleAction(MutableArrayRef< std::pair< T *, SmallVector< int > > > ShuffleMask, Value *Base, function_ref< unsigned(T *)> GetVF, function_ref< std::pair< T *, bool >(T *, ArrayRef< int >, bool)> ResizeAction, function_ref< T *(ArrayRef< int >, ArrayRef< T * >)> Action)
Does the analysis of the provided shuffle masks and performs the requested actions on the vectors wit...
static cl::opt< bool > ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions"))
static bool isConstant(Value *V)
static bool isSplat(ArrayRef< Value * > VL)
static cl::opt< int > SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, cl::desc("Only vectorize if you gain more than this " "number "))
static unsigned getPartNumElems(unsigned Size, unsigned NumParts)
Returns power-of-2 number of elements in a single register (part), given the total number of elements...
static bool allConstant(ArrayRef< Value * > VL)
static constexpr int UsesLimit
static std::optional< unsigned > getElementIndex(const Value *Inst, unsigned Offset=0)
static bool isReductionCandidate(Instruction *I)
\Returns true if I is a candidate instruction for reduction vectorization.
static bool checkTreeSizes(ArrayRef< std::pair< unsigned, unsigned > > Sizes, bool First)
Checks if the quadratic mean deviation is less than 90% of the mean size.
static unsigned getShufflevectorNumGroups(ArrayRef< Value * > VL)
static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI, const TargetLibraryInfo &TLI)
static cl::opt< bool > SLPSkipEarlyProfitabilityCheck("slp-skip-early-profitability-check", cl::init(false), cl::Hidden, cl::desc("When true, SLP vectorizer bypasses profitability checks based on " "heuristics and makes vectorization decision via cost modeling."))
static std::pair< size_t, size_t > generateKeySubkey(Value *V, const TargetLibraryInfo *TLI, function_ref< hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator, bool AllowAlternate)
Generates key/subkey pair for the given value to provide effective sorting of the values and better d...
static cl::opt< bool > ShouldStartVectorizeHorAtStore("slp-vectorize-hor-store", cl::init(false), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions feeding into a store"))
static unsigned getNumberOfPotentiallyCommutativeOps(Instruction *I)
static InstructionCost getScalarizationOverhead(const TargetTransformInfo &TTI, Type *ScalarTy, VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={})
This is similar to TargetTransformInfo::getScalarizationOverhead, but if ScalarTy is a FixedVectorTyp...
static bool isCommutableOperand(Instruction *I, Value *ValWithUses, unsigned Op, bool IsCopyable=false)
Checks if the operand is commutative.
static bool buildCompressMask(ArrayRef< Value * > PointerOps, ArrayRef< unsigned > Order, Type *ScalarTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< int > &CompressMask)
Builds compress-like mask for shuffles for the given PointerOps, ordered with Order.
static std::pair< InstructionCost, InstructionCost > getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, ArrayRef< Type * > ArgTys)
Calculates the costs of vectorized intrinsic (if possible) and vectorized function (if possible) call...
static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements, SmallVectorImpl< int > &Mask)
static cl::opt< bool > SLPReVec("slp-revec", cl::init(false), cl::Hidden, cl::desc("Enable vectorization for wider vector utilization"))
static bool isValidForAlternation(unsigned Opcode)
static bool areAllOperandsNonInsts(Value *V)
Checks if the provided value does not require scheduling.
static SmallVector< Constant * > replicateMask(ArrayRef< Constant * > Val, unsigned VF)
Replicates the given Val VF times.
static SmallVector< Type * > buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID, const unsigned VF, unsigned MinBW, const TargetTransformInfo *TTI)
Builds the arguments types vector for the given call instruction with the given ID for the specified ...
static cl::opt< int > RootLookAheadMaxDepth("slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for searching best rooting option"))
static const unsigned AliasedCheckLimit
static Type * getValueType(Value *V)
Returns the type of the given value/instruction V.
static std::string shortBundleName(ArrayRef< Value * > VL, int Idx=-1)
Print a short descriptor of the instruction bundle suitable for debug output.
static LLVM_DUMP_METHOD void dumpOrder(const BoUpSLP::OrdersType &Order)
static bool isValidElementType(Type *Ty)
Predicate for the element types that the SLP vectorizer supports.
static Instruction * getReductionInstr(const DominatorTree *DT, PHINode *P, BasicBlock *ParentBB, LoopInfo *LI)
Try and get a reduction instruction from a phi node.
static SmallVector< int > calculateShufflevectorMask(ArrayRef< Value * > VL)
static bool doesNotNeedToBeScheduled(Value *V)
Checks if the specified value does not require scheduling.
static bool allSameType(ArrayRef< Value * > VL)
static MemoryLocation getLocation(Instruction *I)
static bool allSameBlock(ArrayRef< Value * > VL)
static unsigned getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not greater than Sz, which forms type,...
static bool areTwoInsertFromSameBuildVector(InsertElementInst *VU, InsertElementInst *V, function_ref< Value *(InsertElementInst *)> GetBaseOperand)
Check if two insertelement instructions are from the same buildvector.
static bool arePointersCompatible(Value *Ptr1, Value *Ptr2, const TargetLibraryInfo &TLI, bool CompareOpcodes=true)
static std::pair< InstructionCost, InstructionCost > getGEPCosts(const TargetTransformInfo &TTI, ArrayRef< Value * > Ptrs, Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind, Type *ScalarTy, VectorType *VecTy)
Calculate the scalar and the vector costs from vectorizing set of GEPs.
static SmallBitVector isUndefVector(const Value *V, const SmallBitVector &UseMask={})
Checks if the given value is actually an undefined constant vector.
static Instruction * findInstructionWithOpcode(ArrayRef< Value * > VL, unsigned Opcode)
Find an instruction with a specific opcode in VL.
static const SCEV * calculateRtStride(ArrayRef< Value * > PointerOps, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices, SmallVectorImpl< int64_t > &Coeffs)
Checks if the provided list of pointers Pointers represents the strided pointers for type ElemTy.
static InstructionCost getExtractWithExtendCost(const TargetTransformInfo &TTI, unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput)
This is similar to TargetTransformInfo::getExtractWithExtendCost, but if Dst is a FixedVectorType,...
static InstructionsState getSameOpcode(ArrayRef< Value * > VL, const TargetLibraryInfo &TLI)
static cl::opt< int > ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden, cl::desc("Limit the size of the SLP scheduling region per block"))
Limits the size of scheduling regions in a block.
static std::pair< Instruction *, Instruction * > getMainAltOpsNoStateVL(ArrayRef< Value * > VL)
Returns main/alternate instructions for the given VL.
static Instruction * tryGetSecondaryReductionRoot(PHINode *Phi, Instruction *Root)
We could have an initial reduction that is not an add.
static RecurKind getRdxKind(Value *V)
Gets recurrence kind from the specified value.
static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1)
static void gatherPossiblyVectorizableLoads(const BoUpSLP &R, ArrayRef< Value * > VL, const DataLayout &DL, ScalarEvolution &SE, const TargetTransformInfo &TTI, SmallVectorImpl< SmallVector< std::pair< LoadInst *, int64_t > > > &GatheredLoads, bool AddNew=true)
Tries to find subvector of loads and builds new vector of only loads if can be profitable.
static cl::opt< int > MinVectorRegSizeOption("slp-min-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static std::optional< TargetTransformInfo::ShuffleKind > isFixedVectorShuffle(ArrayRef< Value * > VL, SmallVectorImpl< int > &Mask, AssumptionCache *AC)
Checks if the vector of instructions can be represented as a shuffle, like: x0 = extractelement <4 x ...
static bool tryToVectorizeSequence(SmallVectorImpl< T * > &Incoming, function_ref< bool(T *, T *)> Comparator, function_ref< bool(ArrayRef< T * >, T *)> AreCompatible, function_ref< bool(ArrayRef< T * >, bool)> TryToVectorizeHelper, bool MaxVFOnly, BoUpSLP &R)
static std::optional< unsigned > getAggregateSize(Instruction *InsertInst)
static unsigned getNumberOfParts(const TargetTransformInfo &TTI, VectorType *VecTy, const unsigned Limit=std::numeric_limits< unsigned >::max())
Returns number of parts, the type VecTy will be split at the codegen phase.
static std::optional< unsigned > getInsertExtractIndex(const Value *Inst, unsigned Offset)
static cl::opt< unsigned > RecursionMaxDepth("slp-recursion-max-depth", cl::init(12), cl::Hidden, cl::desc("Limit the recursion depth when building a vectorizable tree"))
static bool tryToFindDuplicates(SmallVectorImpl< Value * > &VL, SmallVectorImpl< int > &ReuseShuffleIndices, const TargetTransformInfo &TTI, const TargetLibraryInfo &TLI, const InstructionsState &S, const BoUpSLP::EdgeInfo &UserTreeIdx, bool TryPad=false)
Checks that every instruction appears once in the list and if not, packs them, building ReuseShuffleI...
static Align computeCommonAlignment(ArrayRef< Value * > VL)
Calculates minimal alignment as a common alignment.
static void addMask(SmallVectorImpl< int > &Mask, ArrayRef< int > SubMask, bool ExtendingManyInputs=false)
Shuffles Mask in accordance with the given SubMask.
static void fixupOrderingIndices(MutableArrayRef< unsigned > Order)
Order may have elements assigned special value (size) which is out of bounds.
static Value * createExtractVector(IRBuilderBase &Builder, Value *Vec, unsigned SubVecVF, unsigned Index)
Generates subvector extract using Generator or using default shuffle.
static cl::opt< bool > DisableTreeReorder("slp-disable-tree-reorder", cl::init(false), cl::Hidden, cl::desc("Disable tree reordering even if it is " "profitable. Used for testing only."))
static Instruction * getNonPhiOperand(Instruction *I, PHINode *Phi)
Returns the first operand of I that does not match Phi.
static InstructionCost getVectorInstrCost(const TargetTransformInfo &TTI, Type *ScalarTy, unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Scalar, ArrayRef< std::tuple< Value *, User *, int > > ScalarUserAndIdx)
This is similar to TargetTransformInfo::getVectorInstrCost, but if ScalarTy is a FixedVectorType,...
static SmallBitVector getAltInstrMask(ArrayRef< Value * > VL, Type *ScalarTy, unsigned Opcode0, unsigned Opcode1)
static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI, const DominatorTree &DT)
Compare two cmp instructions.
static void inversePermutation(ArrayRef< unsigned > Indices, SmallVectorImpl< int > &Mask)
static bool isReverseOrder(ArrayRef< unsigned > Order)
Check if Order represents reverse order.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
This file defines generic set operations that may be used on set's of different types,...
This file implements a set that has insertion order iteration characteristics.
This file implements the SmallBitVector class.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallString class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
static SymbolRef::Type getType(const Symbol *Sym)
static const int BlockSize
LocallyHashedType DenseMapInfo< LocallyHashedType >::Empty
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
static SmallVector< VPValue *, 4 > getOperands(ArrayRef< VPValue * > Values, unsigned OperandIndex)
static const uint32_t IV[8]
Merges shuffle masks and emits final shuffle instruction, if required.
InstructionCost finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &, function_ref< Value *(Value *, Value *, ArrayRef< int >)>)> Action={})
Finalize emission of the shuffles.
void add(const TreeEntry &E1, ArrayRef< int > Mask)
ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI, ArrayRef< Value * > VectorizedVals, BoUpSLP &R, SmallPtrSetImpl< Value * > &CheckedExtracts)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
std::optional< InstructionCost > needToDelay(const TreeEntry *, ArrayRef< SmallVector< const TreeEntry * > >) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
InstructionCost createFreeze(InstructionCost Cost)
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
void resetForSameNode()
Reset the builder to handle perfect diamond match.
void add(Value *V1, ArrayRef< int > Mask, bool ForExtracts=false)
Adds another one input vector and the mask for the shuffling.
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
Merges shuffle masks and emits final shuffle instruction, if required.
Value * createFreeze(Value *V)
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
void resetForSameNode()
Reset the builder to handle perfect diamond match.
void addOrdered(Value *V1, ArrayRef< unsigned > Order)
Adds another one input vector and the mask for the shuffling.
Value * finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &, function_ref< Value *(Value *, Value *, ArrayRef< int >)>)> Action={})
Finalize emission of the shuffles.
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
Adjusts extractelements after reusing them.
void add(Value *V1, ArrayRef< int > Mask, bool=false)
Adds another one input vector and the mask for the shuffling.
std::optional< Value * > needToDelay(const TreeEntry *E, ArrayRef< SmallVector< const TreeEntry * > > Deps) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
~ShuffleInstructionBuilder()
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Adds single input vector (in form of tree entry) and the mask for its shuffling.
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
Adds 2 input vectors (in form of tree entries) and the mask for their shuffling.
ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
A manager for alias analyses.
Class for arbitrary precision integers.
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
void clearBit(unsigned BitPosition)
Set a given bit to 0.
uint64_t getZExtValue() const
Get zero extended value.
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
LLVM_ABI APInt urem(const APInt &RHS) const
Unsigned remainder operation.
unsigned getBitWidth() const
Return the number of bits in the APInt.
bool ult(const APInt &RHS) const
Unsigned less than comparison.
void clearAllBits()
Set every bit to 0.
void negate()
Negate this APInt in place.
unsigned logBase2() const
void setAllBits()
Set every bit to 1.
void setBits(unsigned loBit, unsigned hiBit)
Set the bits from loBit (inclusive) to hiBit (exclusive) to 1.
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
bool isOne() const
Determine if this is a value of 1.
static APInt getBitsSetFrom(unsigned numBits, unsigned loBit)
Constructs an APInt value that has a contiguous range of bits set.
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
PassT::Result * getCachedResult(IRUnitT &IR) const
Get the cached result of an analysis pass for a given IR unit.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
bool equals(ArrayRef RHS) const
equals - Check for element-wise equality.
const T & back() const
back - Get the last element.
ArrayRef< T > take_front(size_t N=1) const
Return a copy of *this with only the first N elements.
ArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
const T & front() const
front - Get the first element.
size_t size() const
size - Get the array size.
bool empty() const
empty - Check if the array is empty.
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
const T & consume_front()
consume_front() - Returns the first element and drops it from ArrayRef.
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
LLVM Basic Block Representation.
iterator begin()
Instruction iterator methods.
const Function * getParent() const
Return the enclosing method, or null if none.
InstListType::reverse_iterator reverse_iterator
InstListType::iterator iterator
Instruction iterators...
LLVM_ABI const_iterator getFirstNonPHIOrDbgOrAlloca() const
Returns an iterator to the first instruction in this block that is not a PHINode, a debug intrinsic,...
InstListType::const_reverse_iterator const_reverse_iterator
bool isEHPad() const
Return true if this basic block is an exception handling block.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Represents analyses that only rely on functions' control flow.
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
unsigned getBundleOperandsEndIndex() const
Return the index of the last bundle operand in the Use array.
LLVM_ABI void getOperandBundlesAsDefs(SmallVectorImpl< OperandBundleDef > &Defs) const
Return the list of operand bundles attached to this instruction as a vector of OperandBundleDefs.
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
bool hasIdenticalOperandBundleSchema(const CallBase &Other) const
Return true if Other has the same sequence of operand bundle tags with the same number of operands on...
unsigned getBundleOperandsStartIndex() const
Return the index of the first bundle operand in the Use array.
Value * getArgOperand(unsigned i) const
FunctionType * getFunctionType() const
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
unsigned arg_size() const
bool hasOperandBundles() const
Return true if this User has any operand bundles.
This class represents a function call, abstracting a target machine's calling convention.
This is the base class for all instructions that perform data casts.
This class is the base class for the comparison instructions.
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
@ ICMP_SLT
signed less than
@ ICMP_SLE
signed less or equal
@ ICMP_UGE
unsigned greater or equal
@ ICMP_UGT
unsigned greater than
@ ICMP_SGT
signed greater than
@ ICMP_ULT
unsigned less than
@ ICMP_SGE
signed greater or equal
@ ICMP_ULE
unsigned less or equal
Predicate getSwappedPredicate() const
For example, EQ->EQ, SLE->SGE, ULT->UGT, OEQ->OEQ, ULE->UGE, OLT->OGT, etc.
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Predicate getPredicate() const
Return the predicate for this instruction.
static LLVM_ABI Constant * getIntToPtr(Constant *C, Type *Ty, bool OnlyIfReduced=false)
static LLVM_ABI Constant * getBinOpIdentity(unsigned Opcode, Type *Ty, bool AllowRHSConstant=false, bool NSZ=false)
Return the identity constant for a binary opcode.
This is the shared class of boolean and integer constants.
static LLVM_ABI ConstantInt * getTrue(LLVMContext &Context)
static ConstantInt * getSigned(IntegerType *Ty, int64_t V, bool ImplicitTrunc=false)
Return a ConstantInt with the specified value for the specified type.
static LLVM_ABI ConstantInt * getFalse(LLVMContext &Context)
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
const APInt & getValue() const
Return the constant as an APInt value reference.
static LLVM_ABI Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
static LLVM_ABI Constant * get(ArrayRef< Constant * > V)
This is an important base class in LLVM.
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
static bool shouldExecute(CounterInfo &Counter)
static DebugLoc getUnknown()
An analysis that produces DemandedBits for a function.
ValueT & at(const_arg_type_t< KeyT > Val)
at - Return the entry for the specified key, or abort if no such entry exists.
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
iterator find(const_arg_type_t< KeyT > Val)
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
bool erase(const KeyT &Val)
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Implements a dense probed hash-table based set.
Base class for the actual dominator tree node.
unsigned getDFSNumIn() const
getDFSNumIn/getDFSNumOut - These return the DFS visitation order for nodes in the dominator tree.
Analysis pass which computes a DominatorTree.
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
LLVM_ABI bool isReachableFromEntry(const Use &U) const
Provide an overload for a Use.
LLVM_ABI bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
static constexpr ElementCount getFixed(ScalarTy MinVal)
Convenience struct for specifying and reasoning about fast-math flags.
bool allowReassoc() const
Flag queries.
bool allowContract() const
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
ArrayRef< Type * > params() const
Type * getReturnType() const
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
For the node iterator we just need to turn the TreeEntry iterator into a TreeEntry* iterator so that ...
nodes_iterator operator++()
nodes_iterator(const ItTy &It2)
bool operator!=(const nodes_iterator &N2) const
Common base class shared among various IRBuilders.
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
IntegerType * getInt1Ty()
Fetch the type representing a single bit.
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Value * CreateFreeze(Value *V, const Twine &Name="")
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
LLVM_ABI Value * CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 2 operands which is mangled on the first type.
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
LLVM_ABI CallInst * CreateUnaryIntrinsic(Intrinsic::ID ID, Value *V, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 1 operand which is mangled on its type.
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
LLVM_ABI Value * CreateSelectWithUnknownProfile(Value *C, Value *True, Value *False, StringRef PassName, const Twine &Name="")
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Value * CreateFMul(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
This instruction inserts a single (scalar) element into a VectorType value.
VectorType * getType() const
Overload to return most specific vector type.
static InstructionCost getInvalid(CostType Val=0)
bool mayReadOrWriteMemory() const
Return true if this instruction may read or write memory.
LLVM_ABI bool mayWriteToMemory() const LLVM_READONLY
Return true if this instruction may modify memory.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
LLVM_ABI void moveAfter(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
LLVM_ABI bool isCommutative() const LLVM_READONLY
Return true if the instruction is commutative:
Instruction * user_back()
Specialize the methods defined in Value, as we know that an instruction can only be used by other ins...
LLVM_ABI bool comesBefore(const Instruction *Other) const
Given an instruction Other in the same basic block as this instruction, return true if this instructi...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
LLVM_ABI bool isIdenticalTo(const Instruction *I) const LLVM_READONLY
Return true if the specified instruction is exactly identical to the current one.
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
const SmallVectorImpl< Type * > & getArgTypes() const
An instruction for reading from memory.
Value * getPointerOperand()
Analysis pass that exposes the LoopInfo for a function.
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
Represents a single loop in the control flow graph.
This class implements a map that also provides access to all stored values in a deterministic order.
VectorType takeVector()
Clear the MapVector and return the underlying vector.
iterator find(const KeyT &Key)
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
ValueT lookup(const KeyT &Key) const
std::pair< KeyT, ValueT > & front()
Information for memory intrinsic cost model.
This is the common base class for memset/memcpy/memmove.
Representation for a specific memory location.
static LLVM_ABI MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
const Value * Ptr
The address of the start of the location.
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
T & front() const
front - Get the first element.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Value * getIncomingValueForBlock(const BasicBlock *BB) const
BasicBlock * getIncomingBlock(unsigned i) const
Return incoming basic block number i.
unsigned getNumIncomingValues() const
Return the number of incoming edges.
Pass interface - Implemented by all 'passes'.
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
A discriminated union of two or more pointer types, with the discriminator in the low bit of the poin...
bool isNull() const
Test if the pointer held in the union is null, regardless of which type it is.
T dyn_cast() const
Returns the current pointer if it is of the specified pointer type, otherwise returns null.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
PriorityQueue - This class behaves like std::priority_queue and provides a few additional convenience...
unsigned getOpcode() const
static bool isIntMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is an integer min/max kind.
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
This class represents an analyzed expression in the program.
LLVM_ABI bool isZero() const
Return true if the expression is a constant zero.
LLVM_ABI bool isNonConstantNegative() const
Return true if the specified scev is negated, but not a constant.
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
LLVM_ABI const SCEV * getConstant(ConstantInt *V)
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
LLVM_ABI const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
LLVM_ABI const SCEV * getUDivExactExpr(const SCEV *LHS, const SCEV *RHS)
Get a canonical unsigned division expression, or something simpler if possible.
LLVM_ABI const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
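A hedged sketch of these SCEV entry points: subtract two pointer SCEVs and see whether the distance folds to a constant (getPointersDiff, listed further below, performs the robust version of this check; constantPtrDiff is an illustrative name):

  #include "llvm/Analysis/ScalarEvolution.h"
  #include "llvm/Analysis/ScalarEvolutionExpressions.h"
  #include <cstdint>
  #include <optional>
  using namespace llvm;

  // Byte distance PtrB - PtrA, when SCEV can prove it constant.
  std::optional<int64_t> constantPtrDiff(ScalarEvolution &SE, Value *PtrA,
                                         Value *PtrB) {
    const SCEV *Diff = SE.getMinusSCEV(SE.getSCEV(PtrB), SE.getSCEV(PtrA));
    if (const auto *C = dyn_cast<SCEVConstant>(Diff))
      return C->getAPInt().getSExtValue();
    return std::nullopt; // unknown or non-constant distance
  }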
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
ArrayRef< value_type > getArrayRef() const
size_type size() const
Determine the number of elements in the SetVector.
const value_type & front() const
Return the first element of the SetVector.
void insert_range(Range &&R)
Vector takeVector()
Clear the SetVector and return the underlying vector.
bool contains(const_arg_type key) const
Check if the SetVector contains the given key.
void clear()
Completely clear the SetVector.
bool empty() const
Determine if the SetVector is empty or not.
bool insert(const value_type &X)
Insert a new element into the SetVector.
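In combination, these members give order-preserving deduplication; a brief sketch (uniqueInOrder is an illustrative name):

  #include "llvm/ADT/SetVector.h"
  #include "llvm/IR/Value.h"
  using namespace llvm;

  // Drop duplicates while keeping first-seen order; insert() ignores repeats.
  auto uniqueInOrder(ArrayRef<Value *> VL) {
    SetVector<Value *> Seen;
    Seen.insert_range(VL);
    return Seen.takeVector(); // Seen is left empty
  }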
static LLVM_ABI bool isZeroEltSplatMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses all elements with the same value as the first element of exactly one source vector.
static LLVM_ABI bool isOneUseSingleSourceMask(ArrayRef< int > Mask, int VF)
Return true if this shuffle mask represents a "clustered" mask of size VF, i.e. each index between [0, VF) is used exactly once in each submask of size VF.
static LLVM_ABI bool isDeInterleaveMaskOfFactor(ArrayRef< int > Mask, unsigned Factor, unsigned &Index)
Check if the mask is a de-interleave mask of the given factor Factor, i.e. <Index, Index+Factor, Index+2*Factor, ...>.
static LLVM_ABI bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossings.
static LLVM_ABI bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static LLVM_ABI bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
static LLVM_ABI bool isInsertSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &NumSubElts, int &Index)
Return true if this shuffle mask is an insert subvector mask.
static LLVM_ABI bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
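These predicates classify a plain ArrayRef<int> mask; a small sketch over a 4-element source (classifyMasks is a hypothetical helper):

  #include "llvm/IR/Instructions.h"
  using namespace llvm;

  void classifyMasks() {
    int Extract[] = {2, 3};
    int Index = 0;
    // <2,3> taken from 4 source elements: extract-subvector at index 2.
    bool IsExtract = ShuffleVectorInst::isExtractSubvectorMask(
        Extract, /*NumSrcElts=*/4, Index);
    int Rev[] = {3, 2, 1, 0};
    // <3,2,1,0> reverses a single 4-element source.
    bool IsReverse = ShuffleVectorInst::isReverseMask(Rev, /*NumSrcElts=*/4);
    (void)IsExtract; (void)IsReverse;
  }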
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is small.
int find_first() const
Returns the index of the first set bit, -1 if none of the bits are set.
bool test(unsigned Idx) const
int find_next(unsigned Prev) const
Returns the index of the next set bit following the "Prev" bit.
bool all() const
Returns true if all bits are set.
size_type size() const
Returns the number of bits in this bitvector.
bool any() const
Returns true if any bit is set.
size_type count() const
Returns the number of bits which are set.
bool none() const
Returns true if none of the bits are set.
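find_first/find_next support the usual set-bit walk; a minimal sketch (countAndSum is an illustrative name):

  #include "llvm/ADT/SmallBitVector.h"
  using namespace llvm;

  // Visit set bits in increasing index order; find_first/find_next return -1
  // when no (further) bit is set.
  unsigned countAndSum(const SmallBitVector &BV) {
    unsigned Sum = 0;
    for (int I = BV.find_first(); I != -1; I = BV.find_next(I))
      Sum += unsigned(I);
    return Sum;
  }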
Implements a dense probed hash-table based set with some number of buckets stored inline.
A templated base class for SmallPtrSet which provides the typesafe interface that is common across all SmallPtrSet instances.
bool erase(PtrType Ptr)
Remove pointer from the set.
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
void insert_range(Range &&R)
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
A SetVector that performs no allocations if smaller than a certain size.
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less than N).
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
bool contains(const T &V) const
Check if the SmallSet contains the given element.
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
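insert() returning an (iterator, inserted) pair enables the standard visited-set guard; a short sketch:

  #include "llvm/ADT/SmallPtrSet.h"
  #include "llvm/IR/Instruction.h"
  using namespace llvm;

  // Process each instruction at most once; the set doubles as a cycle guard.
  void visitOnce(Instruction *I, SmallPtrSetImpl<Instruction *> &Visited) {
    if (!Visited.insert(I).second)
      return; // already seen
    // ... process I ...
  }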
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better as a string (e.g. operator+ etc).
This class consists of common code factored out of the SmallVector class to reduce code duplication based on the SmallVector 'N' template parameter.
void assign(size_type NumElts, ValueParamT Elt)
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
iterator erase(const_iterator CI)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void swap(SmallVectorImpl &RHS)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
Type * getPointerOperandType() const
Value * getValueOperand()
Value * getPointerOperand()
StringRef - Represent a constant reference to a string, i.e. a character array and a length, which need not be null terminated.
TargetFolder - Create constants with target dependent folding.
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
The instances of the Type class are immutable: once they are created, they are never changed.
LLVM_ABI bool isEmptyTy() const
Return true if this type is empty, that is, it has no elements or all of its elements are empty.
bool isVectorTy() const
True if this is an instance of VectorType.
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
bool isPointerTy() const
True if this is an instance of PointerType.
LLVM_ABI unsigned getStructNumElements() const
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
bool isSingleValueType() const
Return true if the type is a valid type for a register in codegen.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
LLVM_ABI Type * getWithNewType(Type *EltTy) const
Given vector type, change the element type, whilst keeping the old number of elements.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
LLVM_ABI void print(raw_ostream &O, bool IsForDebug=false, bool NoDetails=false) const
Print the current type.
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
bool isIntegerTy() const
True if this is an instance of IntegerType.
TypeID getTypeID() const
Return the type id for the type.
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
bool isVoidTy() const
Return true if this is 'void'.
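A hedged sketch of the kind of type screening a vectorizer performs with these predicates (the exact policy below is illustrative, not the pass's rule):

  #include "llvm/IR/DerivedTypes.h"
  using namespace llvm;

  // Accept only scalars that can legally be an element of a vector register.
  bool isCandidateScalarTy(Type *Ty) {
    Ty = Ty->getScalarType(); // element type for vectors, identity otherwise
    return (Ty->isIntegerTy() || Ty->isFloatingPointTy() ||
            Ty->isPointerTy()) &&
           VectorType::isValidElementType(Ty);
  }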
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
A Use represents the edge between a Value definition and its users.
User(Type *ty, unsigned vty, AllocInfo AllocInfo)
LLVM_ABI bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
Value * getOperand(unsigned i) const
unsigned getNumOperands() const
iterator_range< value_op_iterator > operand_values()
The Vector Function Database.
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
user_iterator user_begin()
bool hasOneUse() const
Return true if there is exactly one use of this value.
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
iterator_range< user_iterator > users()
unsigned getValueID() const
Return an ID for the concrete type of this object.
bool hasUseList() const
Check if this Value has a use-list.
LLVM_ABI bool hasNUsesOrMore(unsigned N) const
Return true if this value has N uses or more.
LLVM_ABI bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
LLVM_ABI User * getUniqueUndroppableUser()
Return true if there is exactly one unique user of this value that cannot be dropped (that user can have multiple uses of this value).
LLVM_ABI LLVMContext & getContext() const
All values hold a context through their type.
LLVM_ABI unsigned getNumUses() const
This method computes the number of uses of this Value.
iterator_range< use_iterator > uses()
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
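A brief sketch of the use-list queries (soleUserOrNull is a hypothetical helper); a single-use value can often be folded into its user:

  #include "llvm/IR/User.h"
  #include "llvm/IR/Value.h"
  using namespace llvm;

  // The unique user when exactly one use exists, otherwise nullptr.
  User *soleUserOrNull(Value *V) {
    return V->hasOneUse() ? *V->user_begin() : nullptr;
  }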
Base class of all SIMD vector types.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector.
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
static LLVM_ABI bool isValidElementType(Type *ElemTy)
Return true if the specified type is valid as an element type.
Type * getElementType() const
std::pair< iterator, bool > insert(const ValueT &V)
iterator find(const_arg_type_t< ValueT > V)
void insert_range(Range &&R)
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
An efficient, type-erasing, non-owning reference to a callable.
An opaque object representing a hash code.
const ParentTy * getParent() const
self_iterator getIterator()
NodeTy * getNextNode()
Get the next node, or nullptr for the list tail.
SmallVector< BoUpSLP::EdgeInfo, 1 >::iterator I
iterator_adaptor_base()=default
A range adaptor for a pair of iterators.
This class implements an extremely fast bulk output stream that can only output to a stream.
raw_ostream & indent(unsigned NumSpaces)
indent - Insert 'NumSpaces' spaces.
A raw_ostream that writes to an std::string.
A raw_ostream that writes to an SmallVector or SmallString.
A helper class used for scoring candidates for two consecutive lanes.
static const int ScoreReversedLoads
Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
static const int ScoreFail
Score for failing to find a decent match.
static const int ScoreUndef
Matching with an undef is preferable to failing.
int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2, ArrayRef< Value * > MainAltOps) const
static const int ScoreConsecutiveExtracts
ExtractElementInst from same vector and consecutive indexes.
int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1, Instruction *U2, int CurrLevel, ArrayRef< Value * > MainAltOps) const
Go through the operands of LHS and RHS recursively until MaxLevel, and return the cumulative score.
static const int ScoreReversedExtracts
ExtractElementInst from same vector and reversed indices.
LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R, int NumLanes, int MaxLevel)
static const int ScoreSplat
Identical instructions (a.k.a. splat or broadcast).
static const int ScoreConstants
Constants.
static const int ScoreMaskedGatherCandidate
A load candidate for masked gather.
static const int ScoreSplatLoads
The same load multiple times.
static const int ScoreAllUserVectorized
Score if all users are vectorized.
static const int ScoreSameOpcode
Instructions with the same opcode.
static const int ScoreAltOpcodes
Instructions with alt opcodes (e.g., add + sub).
static const int ScoreConsecutiveLoads
Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
A helper data structure to hold the operands of a vector of instructions.
LLVM_DUMP_METHOD raw_ostream & print(raw_ostream &OS) const
static LLVM_DUMP_METHOD void dumpMode(ReorderingMode RMode)
Debug print.
LLVM_DUMP_METHOD void dump() const
Debug print.
static LLVM_DUMP_METHOD raw_ostream & printMode(ReorderingMode RMode, raw_ostream &OS)
friend raw_ostream & operator<<(raw_ostream &OS, ReorderingMode RMode)
ValueList getVL(unsigned OpIdx) const
Returns a value vector with the operands across all lanes for the operand at OpIdx.
static LLVM_DUMP_METHOD StringRef getModeStr(ReorderingMode RMode)
VLOperands(ArrayRef< Value * > RootVL, ArrayRef< ValueList > Operands, const InstructionsState &S, const BoUpSLP &R)
Initialize with all the operands of the instruction vector RootVL.
Bottom Up SLP Vectorizer.
static bool isIdentityOrder(ArrayRef< unsigned > Order)
Does this non-empty order represent an identity order?
bool isProfitableToReorder() const
Checks if it is profitable to reorder the current tree.
void eraseInstruction(Instruction *I)
Removes an instruction from its block and eventually deletes it.
bool isAnalyzedReductionRoot(Instruction *I) const
Checks if the instruction was already analyzed for being possible reduction root.
bool areAnalyzedReductionVals(ArrayRef< Value * > VL) const
Checks if the provided list of reduced values was checked already for vectorization.
LoadsState
Tracks how the loads in the given sequence can be represented.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::ScheduleEntity &SE)
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::ScheduleBundle &Bundle)
void analyzedReductionVals(ArrayRef< Value * > VL)
Adds the list of reduced values to the list of values already checked for vectorization.
std::optional< OrdersType > findPartiallyOrderedLoads(const TreeEntry &TE)
Sort loads by increasing pointer offsets to allow greater clustering.
unsigned getMaxVecRegSize() const
OptimizationRemarkEmitter * getORE()
void reorderTopToBottom()
Reorders the current graph to the most profitable order starting from the root node to the leaf nodes.
std::optional< std::pair< Type *, bool > > getRootNodeTypeWithNoCast() const
Returns the type/is-signed info for the root node in the graph without casting.
unsigned getTreeSize() const
void reorderBottomToTop(bool IgnoreReorder=false)
Reorders the current graph to the most profitable order starting from leaves to the root.
InstructionCost getSpillCost()
bool isVectorized(const Value *V) const
Check if the value is vectorized in the tree.
bool isAnyGathered(const SmallDenseSet< Value * > &Vals) const
Checks if the given value is gathered in one of the nodes.
unsigned getCanonicalGraphSize() const
Returns the base graph size, before any transformations.
bool isStridedLoad(ArrayRef< Value * > PointerOps, Type *ScalarTy, Align Alignment, const int64_t Diff, const size_t Sz) const
Checks if strided loads can be generated out of VL loads with pointers PointerOps:
SmallVector< StoreInst *, 8 > StoreList
bool isLoadCombineCandidate(ArrayRef< Value * > Stores) const
Assume that a vector of stores of bitwise-or/shifted/zexted loaded values can be load combined in the backend.
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
ArrayRef< Value * > getRootNodeScalars() const
Return the scalars of the root node.
unsigned getMinVecRegSize() const
bool isLoadCombineReductionCandidate(RecurKind RdxKind) const
Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values can be load combined in the backend.
unsigned getVectorElementSize(Value *V)
unsigned getMinVF(unsigned Sz) const
void clearReductionData()
Clear the list of the analyzed reduction root instructions.
LoadsState canVectorizeLoads(ArrayRef< Value * > VL, const Value *VL0, SmallVectorImpl< unsigned > &Order, SmallVectorImpl< Value * > &PointerOps, StridedPtrInfo &SPtrInfo, unsigned *BestVF=nullptr, bool TryRecursiveCheck=true) const
Checks if the given array of loads can be represented as a vectorized, scatter or just simple gather.
void computeMinimumValueSizes()
Compute the minimum type sizes required to represent the entries in a vectorizable tree.
FixedVectorType * getReductionType() const
Returns the reduction type after min-bitwidth analysis.
SmallVector< unsigned, 4 > OrdersType
SmallVector< Instruction *, 16 > InstrList
void analyzedReductionRoot(Instruction *I)
Register given instruction as already analyzed for being possible reduction root.
bool isDeleted(Instruction *I) const
Checks if the instruction is marked for deletion.
InstructionCost getTreeCost(InstructionCost TreeCost, ArrayRef< Value * > VectorizedVals={}, InstructionCost ReductionCost=TTI::TCC_Free)
void transformNodes()
Transforms graph nodes to target specific representations, if profitable.
bool analyzeRtStrideCandidate(ArrayRef< Value * > PointerOps, Type *ScalarTy, Align CommonAlignment, SmallVectorImpl< unsigned > &SortedIndices, StridedPtrInfo &SPtrInfo) const
Return true if an array of scalar loads can be replaced with a strided load (with run-time stride).
bool isSignedMinBitwidthRootNode() const
Checks if the root graph node can be emitted with narrower bitwidth at codegen and returns its signedness.
void buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues={})
Builds external uses of the vectorized scalars, i.e. the list of vectorized scalars to be extracted, their lanes and their scalar users.
bool isTreeTinyAndNotFullyVectorizable(bool ForReduction=false) const
void registerNonVectorizableLoads(ArrayRef< T * > VL)
Registers non-vectorizable sequence of loads.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::ScheduleData &SD)
unsigned canMapToVector(Type *T) const
Check if homogeneous aggregate is isomorphic to some VectorType.
void deleteTree()
Clear the internal data structures that are created by 'buildTree'.
void buildTree(ArrayRef< Value * > Roots, const SmallDenseSet< Value * > &UserIgnoreLst)
Construct a vectorizable tree that starts at Roots, ignoring users for the purpose of scheduling and extraction in UserIgnoreLst.
SmallDenseSet< Value *, 4 > ExtraValueToDebugLocsMap
bool isTreeNotExtendable() const
Checks if the graph and all its subgraphs cannot be better vectorized.
bool areKnownNonVectorizableLoads(ArrayRef< T * > VL) const
Checks if the given loads sequence is known as not vectorizable.
SmallVector< Value *, 8 > ValueList
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB, const DataLayout *DL, OptimizationRemarkEmitter *ORE)
bool isGathered(const Value *V) const
Checks if the given value is gathered in one of the nodes.
std::optional< OrdersType > findReusedOrderedScalars(const TreeEntry &TE, bool TopToBottom, bool IgnoreReorder)
Checks if the specified gather tree entry TE can be represented as a shuffled vector entry + (possibl...
bool analyzeConstantStrideCandidate(const ArrayRef< Value * > PointerOps, Type *ElemTy, Align Alignment, const SmallVectorImpl< unsigned > &SortedIndices, const int64_t Diff, Value *Ptr0, Value *PtrN, StridedPtrInfo &SPtrInfo) const
Return true if an array of scalar loads can be replaced with a strided load (with constant stride).
bool isNotScheduled(const Value *V) const
Checks if the specified value was not scheduled.
Value * vectorizeTree()
Vectorize the tree that starts with the elements in VL.
InstructionCost calculateTreeCostAndTrimNonProfitable(ArrayRef< Value * > VectorizedVals={})
Calculates the cost of the subtrees, trims non-profitable ones and returns final cost.
std::optional< int > findBestRootPair(ArrayRef< std::pair< Value *, Value * > > Candidates, int Limit=LookAheadHeuristics::ScoreFail) const
Evaluate each pair in Candidates and return the index into Candidates of the pair with the highest score.
std::optional< OrdersType > getReorderingData(const TreeEntry &TE, bool TopToBottom, bool IgnoreReorder)
Gets reordering data for the given tree entry.
SmallPtrSet< Value *, 16 > ValueSet
void optimizeGatherSequence()
Perform LICM and CSE on the newly generated gather sequences.
void removeInstructionsAndOperands(ArrayRef< T * > DeadVals, ArrayRef< std::tuple< Value *, unsigned, bool > > VectorValuesAndScales)
Remove instructions from the parent function and clear the operands of DeadVals instructions, marking trivially dead operands for deletion.
Function * getVectorizedFunction(const VFShape &Shape) const
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
@ C
The default llvm calling convention, compatible with C.
@ BasicBlock
Various leaf nodes.
bool isBitwiseLogicOp(unsigned Opcode)
Whether this is a bitwise logic opcode.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
TwoOps_match< ValueOpTy, PointerOpTy, Instruction::Store > m_Store(const ValueOpTy &ValueOp, const PointerOpTy &PointerOp)
Matches StoreInst.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
ap_match< APInt > m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMaxNum(const Opnd0 &Op0, const Opnd1 &Op1)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
auto match_fn(const Pattern &P)
A match functor that can be used as a UnaryPredicate in functional algorithms like all_of.
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMinimum(const Opnd0 &Op0, const Opnd1 &Op1)
MaxMin_match< ICmpInst, LHS, RHS, smin_pred_ty > m_SMin(const LHS &L, const RHS &R)
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMaximum(const Opnd0 &Op0, const Opnd1 &Op1)
BinaryOp_match< LHS, RHS, Instruction::FAdd > m_FAdd(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
MaxMin_match< ICmpInst, LHS, RHS, umax_pred_ty > m_UMax(const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
MaxMin_match< ICmpInst, LHS, RHS, smax_pred_ty > m_SMax(const LHS &L, const RHS &R)
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
auto m_Undef()
Match an arbitrary undef constant.
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMinNum(const Opnd0 &Op0, const Opnd1 &Op1)
BinaryOp_match< LHS, RHS, Instruction::Or > m_Or(const LHS &L, const RHS &R)
MaxMin_match< ICmpInst, LHS, RHS, umin_pred_ty > m_UMin(const LHS &L, const RHS &R)
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
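These combinators nest; a hedged sketch that binds the pieces of (X << C) | Y, requiring the shift to have a single use (matchShlOr is an illustrative name):

  #include "llvm/ADT/APInt.h"
  #include "llvm/IR/PatternMatch.h"
  using namespace llvm;
  using namespace llvm::PatternMatch;

  bool matchShlOr(Value *V, Value *&X, Value *&Y, const APInt *&C) {
    // Matches or(shl(X, C), Y); m_APInt also covers splat constant vectors.
    return match(V, m_Or(m_OneUse(m_Shl(m_Value(X), m_APInt(C))), m_Value(Y)));
  }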
initializer< Ty > init(const Ty &Val)
unsigned combineHashValue(unsigned a, unsigned b)
Simplistic combination of 32-bit hash values into 32-bit hash values.
@ User
could "use" a pointer
DiagnosticInfoOptimizationBase::Argument NV
friend class Instruction
Iterator for Instructions in a BasicBlock.
LLVM_ABI iterator begin() const
LLVM_ABI Instruction & front() const
A private "module" namespace for types and utilities used by this pass.
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
LLVM_ABI Value * createSimpleReduction(IRBuilderBase &B, Value *Src, RecurKind RdxKind)
Create a reduction of the given vector.
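A minimal sketch of emitting a horizontal reduction at a given point (emitAddReduction is an illustrative wrapper):

  #include "llvm/IR/IRBuilder.h"
  #include "llvm/Transforms/Utils/LoopUtils.h"
  using namespace llvm;

  // Reduce a vector value to a scalar sum before InsertPt.
  Value *emitAddReduction(Instruction *InsertPt, Value *Vec) {
    IRBuilder<> B(InsertPt);
    return createSimpleReduction(B, Vec, RecurKind::Add);
  }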
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
constexpr auto not_equal_to(T &&Arg)
Functor variant of std::not_equal_to that can be used as a UnaryPredicate in functional algorithms like all_of.
void stable_sort(R &&Range)
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
void fill(R &&Range, T &&Value)
Provide wrappers to std::fill which take ranges instead of having to pass begin/end explicitly.
UnaryFunction for_each(R &&Range, UnaryFunction F)
Provide wrappers to std::for_each which take ranges instead of having to pass begin/end explicitly.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
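The range wrappers make predicate checks one-liners; a small sketch (allLoads is an illustrative name):

  #include "llvm/ADT/STLExtras.h"
  #include "llvm/IR/Instructions.h"
  using namespace llvm;

  // True when every candidate scalar is a load.
  bool allLoads(ArrayRef<Value *> VL) {
    return all_of(VL, [](Value *V) { return isa<LoadInst>(V); });
  }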
hash_code hash_value(const FixedPointSemantics &Val)
LLVM_ABI Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
MaybeAlign getAlign(const CallInst &I, unsigned Index)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
LLVM_ABI bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
LLVM_ABI Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B, C, ...), such that A is the 0-based index of the item in the sequence, and B, C, ... are the values from the original input ranges.
auto pred_end(const MachineBasicBlock *BB)
void set_intersect(S1Ty &S1, const S2Ty &S2)
set_intersect(A, B) - Compute A := A ^ B. Identical to set_intersection, except that it works on set<>'s and is nicer to use.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
LLVM_ABI bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
LLVM_ABI void salvageDebugInfo(const MachineRegisterInfo &MRI, MachineInstr &MI)
Assuming the instruction MI is going to be deleted, attempt to salvage debug users of MI by writing the effect of MI in a DIExpression.
scope_exit(Callable) -> scope_exit< Callable >
constexpr from_range_t from_range
LLVM_ABI std::pair< Intrinsic::ID, bool > canConvertToMinOrMaxIntrinsic(ArrayRef< Value * > VL)
Check if the values in VL are select instructions that can be converted to a min or max (vector) intrinsic.
auto dyn_cast_if_present(const Y &Val)
dyn_cast_if_present<X> - Functionally identical to dyn_cast, except that a null (or none in the case of optionals) value is accepted.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
bool set_is_subset(const S1Ty &S1, const S2Ty &S2)
set_is_subset(A, B) - Return true iff every element of A is also in B.
void interleaveComma(const Container &c, StreamT &os, UnaryFunctor each_fn)
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting iteration.
auto cast_or_null(const Y &Val)
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value and is Skew mod Align.
iterator_range< po_iterator< T > > post_order(const T &G)
LLVM_ABI bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr, bool UseVariableInfo=true, bool IgnoreUBImplyingAttrs=true)
Return true if the instruction does not have any effects besides calculating the result and does not have undefined behavior.
LLVM_ABI Instruction * propagateMetadata(Instruction *I, ArrayRef< Value * > VL)
Specifically, let Kinds = [MD_tbaa, MD_alias_scope, MD_noalias, MD_fpmath, MD_nontemporal,...
bool isa_and_nonnull(const Y &Val)
auto binary_search(R &&Range, T &&Value)
Provide wrappers to std::binary_search which take ranges instead of having to pass begin/end explicitly.
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
constexpr auto equal_to(T &&Arg)
Functor variant of std::equal_to that can be used as a UnaryPredicate in functional algorithms like all_of.
bool isGather(IntrinsicInst *IntInst)
const Value * getPointerOperand(const Value *V)
A helper function that returns the pointer operand of a load, store or GEP instruction.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
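Two worked one-liners for these rounding helpers:

  #include "llvm/Support/MathExtras.h"
  using namespace llvm;

  // Trim a size down to whole register-width chunks, and round a VF up to
  // the next power of two.
  unsigned Whole = alignDown(10u, 4u); // 8
  uint64_t Pow2 = PowerOf2Ceil(10);    // 16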
LLVM_ABI bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &SQ, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
DomTreeNodeBase< BasicBlock > DomTreeNode
auto dyn_cast_or_null(const Y &Val)
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container:
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
constexpr bool has_single_bit(T Value) noexcept
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
LLVM_ABI bool isInstructionTriviallyDead(Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction is not used, and the instruction will return.
LLVM_ABI llvm::SmallVector< int, 16 > createStrideMask(unsigned Start, unsigned Stride, unsigned VF)
Create a stride shuffle mask.
auto reverse(ContainerTy &&C)
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
LLVM_ABI llvm::SmallVector< int, 16 > createReplicatedMask(unsigned ReplicationFactor, unsigned VF)
Create a mask with replicated elements.
auto find_if_not(R &&Range, UnaryPredicate P)
LLVM_ABI bool isSafeToLoadUnconditionally(Value *V, Align Alignment, const APInt &Size, const DataLayout &DL, Instruction *ScanFrom, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr)
Return true if we know that executing a load from this value cannot trap.
bool isa_and_present(const Y &Val)
isa_and_present<X> - Functionally identical to isa, except that a null value is accepted.
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
bool isPointerTy(const Type *T)
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
auto make_first_range(ContainerTy &&c)
Given a container of pairs, return a range over the first elements.
LLVM_ABI std::optional< int64_t > getPointersDiff(Type *ElemTyA, Value *PtrA, Type *ElemTyB, Value *PtrB, const DataLayout &DL, ScalarEvolution &SE, bool StrictCheck=false, bool CheckType=true)
Returns the distance between the pointers PtrA and PtrB iff they are compatible and it is possible to calculate the distance between them.
LLVM_ABI bool wouldInstructionBeTriviallyDead(const Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction would have no side effects if it was not used.
bool isModOrRefSet(const ModRefInfo MRI)
bool is_sorted(R &&Range, Compare C)
Wrapper function around std::is_sorted to check if elements in a range R are sorted with respect to a comparator C.
LLVM_ABI bool sortPtrAccesses(ArrayRef< Value * > VL, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
Attempt to sort the pointers in VL and return the sorted indices in SortedIndices, if reordering is required.
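A hedged sketch of the consecutive-load check these helpers enable (areConsecutiveLoads is an illustrative name; the distance is in elements of the load type):

  #include "llvm/Analysis/LoopAccessAnalysis.h"
  #include "llvm/IR/Instructions.h"
  using namespace llvm;

  // B immediately follows A in memory iff the pointer distance is one element.
  bool areConsecutiveLoads(LoadInst *A, LoadInst *B, const DataLayout &DL,
                           ScalarEvolution &SE) {
    std::optional<int64_t> Diff =
        getPointersDiff(A->getType(), A->getPointerOperand(), B->getType(),
                        B->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
    return Diff && *Diff == 1;
  }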
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference sizeof(SmallVector<T, 0>).
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type arguments.
LLVM_ABI void propagateIRFlags(Value *I, ArrayRef< Value * > VL, Value *OpValue=nullptr, bool IncludeWrapFlags=true)
Get the intersection (logical and) of all of the potential IR flags of each scalar operation (VL) that will be vectorized.
LLVM_ATTRIBUTE_VISIBILITY_DEFAULT AnalysisKey InnerAnalysisManagerProxy< AnalysisManagerT, IRUnitT, ExtraArgTs... >::Key
MutableArrayRef(T &OneElt) -> MutableArrayRef< T >
constexpr int PoisonMaskElem
iterator_range(Container &&) -> iterator_range< llvm::detail::IterOfRange< Container > >
@ Ref
The access may reference the value stored in memory.
@ LLVM_MARK_AS_BITMASK_ENUM
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
LLVM_ABI CmpInst::Predicate getMinMaxReductionPredicate(RecurKind RK)
Returns the comparison predicate used when expanding a min/max reduction.
RecurKind
These are the kinds of recurrences that we support.
@ Or
Bitwise or logical OR of integers.
LLVM_ABI bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic has a scalar operand.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the given range.
DWARFExpression::Operation Op
auto max_element(R &&Range)
Provide wrappers to std::max_element which take ranges instead of having to pass begin/end explicitly.
void ViewGraph(const GraphType &G, const Twine &Name, bool ShortNames=false, const Twine &Title="", GraphProgram::Name Program=GraphProgram::DOT)
ViewGraph - Emit a dot graph, run 'dot', run gv on the postscript file, then cleanup.
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI unsigned ComputeNumSignBits(const Value *Op, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Return the number of times the sign bit of the register is replicated into the other bits.
OutputIt copy(R &&Range, OutputIt Out)
auto make_second_range(ContainerTy &&c)
Given a container of pairs, return a range over the second elements.
constexpr unsigned BitWidth
LLVM_ABI bool isGuaranteedToTransferExecutionToSuccessor(const Instruction *I)
Return true if this function can prove that the instruction I will always transfer execution to one of its successors.
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given predicate occurs in a range.
auto pred_begin(const MachineBasicBlock *BB)
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
template class LLVM_TEMPLATE_ABI DomTreeNodeBase< BasicBlock >
bool equal(L &&LRange, R &&RRange)
Wrapper function around std::equal to detect if pair-wise elements between two ranges are the same.
LLVM_ABI bool isGuaranteedNotToBePoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Returns true if V cannot be poison, but may be undef.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
LLVM_ABI const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=MaxLookupSearchDepth)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal.address from the specified value.
LLVM_ABI Constant * ConstantFoldIntegerCast(Constant *C, Type *DestTy, bool IsSigned, const DataLayout &DL)
Constant fold a zext, sext or trunc, depending on IsSigned and whether the DestTy is wider or narrower than the constant's type.
LLVM_ABI bool isKnownNonNegative(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Returns true if the given value is known to be non-negative.
LLVM_ABI bool mayHaveNonDefUseDependency(const Instruction &I)
Returns true if the result or effects of the given instruction I depend on values not reachable through the def-use graph.
LLVM_ABI bool isTriviallyVectorizable(Intrinsic::ID ID)
Identify if the intrinsic is trivially vectorizable.
constexpr detail::IsaCheckPredicate< Types... > IsaPred
Function object wrapper for the llvm::isa type check.
LLVM_ABI bool isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic is overloaded on the type of the operand at index OpdIdx.
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Used to keep track of an operand bundle.
static LLVM_ABI void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
BoUpSLP::TreeEntry TreeEntry
std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R)
static std::string getNodeAttributes(const TreeEntry *Entry, const BoUpSLP *)
DOTGraphTraits(bool IsSimple=false)
DOTGraphTraits - Template class that can be specialized to customize how graphs are converted to 'dot' graphs.
DefaultDOTGraphTraits(bool simple=false)
DenseMapInfo< BoUpSLP::TreeEntry * > FirstInfo
static bool isEqual(const BoUpSLP::EdgeInfo &LHS, const BoUpSLP::EdgeInfo &RHS)
static BoUpSLP::EdgeInfo getEmptyKey()
DenseMapInfo< unsigned > SecondInfo
static unsigned getHashValue(const BoUpSLP::EdgeInfo &Val)
static BoUpSLP::EdgeInfo getTombstoneKey()
An information struct used to provide DenseMap with the various necessary components for a given value type T.
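A hedged sketch of the pattern the EdgeInfo specialization above follows: delegate each field to an existing DenseMapInfo and mix the hashes (MyKey is a hypothetical stand-in):

  #include "llvm/ADT/DenseMapInfo.h"

  struct MyKey {
    void *Ptr;
    unsigned Idx;
  };
  template <> struct llvm::DenseMapInfo<MyKey> {
    using FirstInfo = DenseMapInfo<void *>;
    using SecondInfo = DenseMapInfo<unsigned>;
    static MyKey getEmptyKey() {
      return {FirstInfo::getEmptyKey(), SecondInfo::getEmptyKey()};
    }
    static MyKey getTombstoneKey() {
      return {FirstInfo::getTombstoneKey(), SecondInfo::getTombstoneKey()};
    }
    static unsigned getHashValue(const MyKey &K) {
      return detail::combineHashValue(FirstInfo::getHashValue(K.Ptr),
                                      SecondInfo::getHashValue(K.Idx));
    }
    static bool isEqual(const MyKey &L, const MyKey &R) {
      return L.Ptr == R.Ptr && L.Idx == R.Idx;
    }
  };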
Add the VectorizableTree to the index iterator to be able to return TreeEntry pointers.
ChildIteratorType(SmallVector< BoUpSLP::EdgeInfo, 1 >::iterator W, ContainerTy &VT)
ContainerTy & VectorizableTree
static ChildIteratorType child_end(NodeRef N)
static NodeRef getEntryNode(BoUpSLP &R)
static ChildIteratorType child_begin(NodeRef N)
static nodes_iterator nodes_begin(BoUpSLP *R)
TreeEntry * NodeRef
NodeRef has to be a pointer per the GraphWriter.
static unsigned size(BoUpSLP *R)
BoUpSLP::TreeEntry TreeEntry
static nodes_iterator nodes_end(BoUpSLP *R)
BoUpSLP::TreeEntry::VecTreeTy ContainerTy
Incoming for lane mask phi as machine instruction; incoming register Reg and incoming block Block are selected.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
TargetTransformInfo * TTI
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
bool runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_, TargetLibraryInfo *TLI_, AAResults *AA_, LoopInfo *LI_, DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_, OptimizationRemarkEmitter *ORE_)
A MapVector that performs no allocations if smaller than a certain size.
static VFShape get(const FunctionType *FTy, ElementCount EC, bool HasGlobalPred)
Retrieve the basic vectorization shape of the function, where all parameters are mapped to VFParamKind::Vector.
Function object to check whether the first component of a container supported by std::get (like std::pair, std::tuple, or std::array) compares less than the first component of another container.
Function object to check whether the second component of a container supported by std::get (like std::pair, std::tuple, or std::array) compares less than the second component of another container.
This structure holds any data we need about the edges being traversed during buildTreeRec().
bool operator==(const EdgeInfo &Other) const
EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
unsigned EdgeIdx
The operand index of the use.
void dump(raw_ostream &OS) const
Debug print.
LLVM_DUMP_METHOD void dump() const
TreeEntry * UserTE
The user TreeEntry.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::EdgeInfo &EI)