#ifdef EXPENSIVE_CHECKS
using namespace std::placeholders;

#define SV_NAME "slp-vectorizer"
#define DEBUG_TYPE "SLP"

STATISTIC(NumVectorInstructions, "Number of vector instructions generated");

              "Controls which SLP graphs should be vectorized.");

    cl::desc("Run the SLP vectorization passes"));

    cl::desc("Enable vectorization for wider vector utilization"));

    cl::desc("Only vectorize if you gain more than this "

    cl::desc("When true, SLP vectorizer bypasses profitability checks based on "
             "heuristics and makes vectorization decision via cost modeling."));

    cl::desc("Attempt to vectorize horizontal reductions"));

        "Attempt to vectorize horizontal reductions feeding into a store"));

    cl::desc("Improve the code quality by splitting alternate instructions"));

    cl::desc("Attempt to vectorize for this register size in bits"));

    cl::desc("Maximum SLP vectorization factor (0=unlimited)"));

    cl::desc("Limit the size of the SLP scheduling region per block"));

    cl::desc("Attempt to vectorize for this register size in bits"));

    cl::desc("Limit the recursion depth when building a vectorizable tree"));

    cl::desc("Only vectorize small trees if they are fully vectorizable"));

    cl::desc("The maximum look-ahead depth for operand reordering scores"));

    cl::desc("The maximum look-ahead depth for searching best rooting option"));

    cl::desc("The minimum number of loads, which should be considered strided, "
             "if the stride is > 1 or is runtime value"));

    cl::desc("The maximum stride, considered to be profitable."));

    cl::desc("Disable tree reordering even if it is "
             "profitable. Used for testing only."));

    cl::desc("Generate strided loads even if they are not "
             "profitable. Used for testing only."));

    cl::desc("Display the SLP trees with Graphviz"));

    cl::desc("Try to vectorize with non-power-of-2 number of elements."));

    cl::desc("Try to replace values with the idempotent instructions for "
             "better vectorization."));
  Ty = Ty->getScalarType();
         !Ty->isPPC_FP128Ty();

    return SI->getValueOperand()->getType();
    return CI->getOperand(0)->getType();
    return IE->getOperand(1)->getType();

         "ScalableVectorType is not supported.");
  return VecTy->getNumElements();
                                                Type *Ty, unsigned Sz) {
  if (NumParts == 0 || NumParts >= Sz)
  if (NumParts == 0 || NumParts >= Sz)
  return (Sz / RegVF) * RegVF;

                                      I * VecTyNumElements, VecTyNumElements)))
                   : Mask[I] * VecTyNumElements + J;
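// Worked example of the mask-scaling arithmetic above (a sketch of the
// effect, not the surrounding function verbatim): widening a shuffle mask by
// VecTyNumElements == 2 turns each scalar index into a run of element
// indices:
//
//   Mask               = {1, 0}
//   NewMask[I * 2 + J] = Mask[I] * 2 + J   // J in [0, 2)
//   NewMask            = {2, 3, 0, 1}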
  unsigned SVNumElements =
  unsigned ShuffleMaskSize = SV->getShuffleMask().size();
  if (SVNumElements % ShuffleMaskSize != 0)
  unsigned GroupSize = SVNumElements / ShuffleMaskSize;
  if (GroupSize == 0 || (VL.size() % GroupSize) != 0)
  unsigned NumGroup = 0;
  for (size_t I = 0, E = VL.size(); I != E; I += GroupSize) {
    Value *Src = SV->getOperand(0);
      if (SV->getOperand(0) != Src)
      if (!SV->isExtractSubvectorMask(Index))
      ExpectedIndex.set(Index / ShuffleMaskSize);
    if (!ExpectedIndex.all())
  assert(NumGroup == (VL.size() / GroupSize) && "Unexpected number of groups");
  unsigned SVNumElements =
  unsigned AccumulateLength = 0;
  for (Value *V : VL) {
    for (int M : SV->getShuffleMask())
                         : AccumulateLength + M);
    AccumulateLength += SVNumElements;
  return std::min<unsigned>(PartNumElems, Size - Part * PartNumElems);

  OS << "Idx: " << Idx << ", ";
  OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]";
      if (BB != II->getParent())

  Value *FirstNonUndef = nullptr;
  for (Value *V : VL) {
    if (!FirstNonUndef) {
    if (V != FirstNonUndef)
  return FirstNonUndef != nullptr;
                          bool IsCopyable = false) {
    return Cmp->isCommutative();
    return BO->isCommutative() ||
           (BO->getOpcode() == Instruction::Sub &&
              if (match(U.getUser(),
                        m_ICmp(Pred, m_Specific(U.get()), m_Zero())) &&
                  (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE))
              auto *I = dyn_cast<BinaryOperator>(U.get());
              return match(U.getUser(),
                           m_Intrinsic<Intrinsic::abs>(
                               m_Specific(U.get()), m_ConstantInt(Flag))) &&
                     ((!IsCopyable && I && !I->hasNoSignedWrap()) ||
           (BO->getOpcode() == Instruction::FSub &&
              return match(U.getUser(),
                           m_Intrinsic<Intrinsic::fabs>(m_Specific(U.get())));
  return I->isCommutative();
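// The special cases above treat a subtraction as commutative when every use
// cancels the sign. A sketch of the pattern being matched (IR for
// illustration only):
//
//   %d = sub i32 %a, %b
//   %c = icmp eq i32 %d, 0                        ; eq/ne against zero, or
//   %m = call i32 @llvm.abs.i32(i32 %d, i1 true)  ; abs of the difference
//
// In both uses, swapping %a and %b does not change the result, so the
// operands may be reordered as if the sub were commutative.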
    constexpr unsigned IntrinsicNumOperands = 2;
    return IntrinsicNumOperands;
  return I->getNumOperands();

  static_assert(std::is_same_v<T, InsertElementInst> ||
                    std::is_same_v<T, ExtractElementInst>,
    if (CI->getValue().uge(VT->getNumElements()))
    Index *= VT->getNumElements();
    Index += CI->getZExtValue();
  Type *CurrentType = IV->getType();
  for (unsigned I : IV->indices()) {
      Index *= ST->getNumElements();
      CurrentType = ST->getElementType(I);
      Index *= AT->getNumElements();
      CurrentType = AT->getElementType();
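// Worked example of the index linearization above (types chosen for
// illustration): for an insertvalue into {[2 x i32], [2 x i32]} at indices
// {1, 0}, the walk computes
//
//   Index = 0;  Index = Index * 2 + 1;  // outer struct, field 1
//   Index = Index * 2 + 0;              // inner array, element 0
//
// yielding the flat lane index 2.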
  return std::all_of(It, VL.end(), [&](Value *V) {
    if (auto *CI = dyn_cast<CmpInst>(V))
      return BasePred == CI->getPredicate();
    if (auto *I = dyn_cast<Instruction>(V))
      return I->getOpcode() == Opcode;
    return isa<PoisonValue>(V);
    if (MaskArg == UseMask::UndefsAsMask)
    if (MaskArg == UseMask::FirstArg && Value < VF)
      UseMask.reset(Value);
    else if (MaskArg == UseMask::SecondArg && Value >= VF)
      UseMask.reset(Value - VF);
template <bool IsPoisonOnly = false>
  using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
  if (!UseMask.empty()) {
        if (*Idx < UseMask.size() && !UseMask.test(*Idx))
    for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
      if (Constant *Elem = C->getAggregateElement(I))
            (UseMask.empty() || (I < UseMask.size() && !UseMask.test(I))))
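// Note the mask polarity assumed above: buildUseMask starts from an all-ones
// bitvector and *clears* the bit of each demanded lane, so !UseMask.test(I)
// means lane I is actually used. isUndefVector therefore drops lane I from
// its result only when the element is a real (non-undef) value in a demanded
// lane.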
static std::optional<TargetTransformInfo::ShuffleKind>
      std::accumulate(VL.begin(), VL.end(), 0u, [](unsigned S, Value *V) {
        auto *EI = dyn_cast<ExtractElementInst>(V);
        auto *VTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
        return std::max(S, VTy->getNumElements());
  Value *Vec1 = nullptr;
  Value *Vec2 = nullptr;
    Value *Vec = EE->getVectorOperand();
  ShuffleMode CommonShuffleMode = Unknown;
  for (unsigned I = 0, E = VL.size(); I < E; ++I) {
    auto *Vec = EI->getVectorOperand();
    if (Idx->getValue().uge(Size))
    unsigned IntIdx = Idx->getValue().getZExtValue();
    if (!Vec1 || Vec1 == Vec) {
    } else if (!Vec2 || Vec2 == Vec) {
    if (CommonShuffleMode == Permute)
    if (Mask[I] % Size != I) {
      CommonShuffleMode = Permute;
    CommonShuffleMode = Select;
  if (CommonShuffleMode == Select && Vec2)
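// Sketch of the classification just computed: extracting each lane from its
// original position across two sources is a select-style shuffle, while
// anything that moves lanes degrades to a permute. For example (IR for
// illustration):
//
//   VL = { extractelement %v1, 0 ; extractelement %v2, 1 }   ; Select
//   VL = { extractelement %v1, 1 ; extractelement %v1, 0 }   ; Permute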
  unsigned Opcode = E->getOpcode();
  assert((Opcode == Instruction::ExtractElement ||
          Opcode == Instruction::ExtractValue) &&
         "Expected extractelement or extractvalue instruction.");
  if (Opcode == Instruction::ExtractElement) {
      return CI->getZExtValue();
  if (EI->getNumIndices() != 1)
  return *EI->idx_begin();
class BinOpSameOpcodeHelper {
  using MaskType = std::uint_fast16_t;
  constexpr static std::initializer_list<unsigned> SupportedOp = {
      Instruction::Add,  Instruction::Sub, Instruction::Mul, Instruction::Shl,
      Instruction::AShr, Instruction::And, Instruction::Or,  Instruction::Xor};
    MainOpBIT = 0b100000000,
  static std::pair<ConstantInt *, unsigned>
  isBinOpWithConstantInt(const Instruction *I) {
    unsigned Opcode = I->getOpcode();
    if (Opcode == Instruction::Sub || Opcode == Instruction::Shl ||
        Opcode == Instruction::AShr)
  struct InterchangeableInfo {
    MaskType Mask = MainOpBIT | XorBIT | OrBIT | AndBIT | SubBIT | AddBIT |
                    MulBIT | AShrBIT | ShlBIT;
    MaskType SeenBefore = 0;
    InterchangeableInfo(const Instruction *I) : I(I) {}
    bool trySet(MaskType OpcodeInMaskForm, MaskType InterchangeableMask) {
      if (Mask & InterchangeableMask) {
        SeenBefore |= OpcodeInMaskForm;
        Mask &= InterchangeableMask;
    bool equal(unsigned Opcode) {
      return Opcode == I->getOpcode() && trySet(MainOpBIT, MainOpBIT);
      MaskType Candidate = Mask & SeenBefore;
      if (Candidate & MainOpBIT)
        return I->getOpcode();
      if (Candidate & ShlBIT)
        return Instruction::Shl;
      if (Candidate & AShrBIT)
        return Instruction::AShr;
      if (Candidate & MulBIT)
        return Instruction::Mul;
      if (Candidate & AddBIT)
        return Instruction::Add;
      if (Candidate & SubBIT)
        return Instruction::Sub;
      if (Candidate & AndBIT)
        return Instruction::And;
      if (Candidate & OrBIT)
        return Instruction::Or;
      if (Candidate & XorBIT)
        return Instruction::Xor;
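    // A worked example of the bitmask scheme above (bit values illustrative):
    // each supported opcode owns one bit, e.g. ShlBIT = 0b1 up through
    // MainOpBIT = 0b100000000. After seeing `x << 1` (interchangeable with
    // Mul/Shl only, since the shift amount is nonzero) and then `y * 4`, Mask
    // narrows to MulBIT | ShlBIT and getOpcode() returns the highest-priority
    // surviving candidate.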
    bool hasCandidateOpcode(unsigned Opcode) const {
      MaskType Candidate = Mask & SeenBefore;
      case Instruction::Shl:
        return Candidate & ShlBIT;
      case Instruction::AShr:
        return Candidate & AShrBIT;
      case Instruction::Mul:
        return Candidate & MulBIT;
      case Instruction::Add:
        return Candidate & AddBIT;
      case Instruction::Sub:
        return Candidate & SubBIT;
      case Instruction::And:
        return Candidate & AndBIT;
      case Instruction::Or:
        return Candidate & OrBIT;
      case Instruction::Xor:
        return Candidate & XorBIT;
      case Instruction::LShr:
      case Instruction::FAdd:
      case Instruction::FSub:
      case Instruction::FMul:
      case Instruction::SDiv:
      case Instruction::UDiv:
      case Instruction::FDiv:
      case Instruction::SRem:
      case Instruction::URem:
      case Instruction::FRem:
      unsigned FromOpcode = I->getOpcode();
      if (FromOpcode == ToOpcode)
      auto [CI, Pos] = isBinOpWithConstantInt(I);
      const APInt &FromCIValue = CI->getValue();
      unsigned FromCIValueBitWidth = FromCIValue.getBitWidth();
      switch (FromOpcode) {
      case Instruction::Shl:
        if (ToOpcode == Instruction::Mul) {
        assert(FromCIValue.isZero() && "Cannot convert the instruction.");
        ToCIValue = ToOpcode == Instruction::And
                        : APInt::getZero(FromCIValueBitWidth);
      case Instruction::Mul:
        if (ToOpcode == Instruction::Shl) {
          ToCIValue = APInt(FromCIValueBitWidth, FromCIValue.logBase2());
        assert(FromCIValue.isOne() && "Cannot convert the instruction.");
        ToCIValue = ToOpcode == Instruction::And
                        : APInt::getZero(FromCIValueBitWidth);
      case Instruction::Add:
      case Instruction::Sub:
        if (FromCIValue.isZero()) {
               "Cannot convert the instruction.");
          ToCIValue = FromCIValue;
      case Instruction::And:
        ToCIValue = ToOpcode == Instruction::Mul
                        : APInt::getZero(FromCIValueBitWidth);
        assert(FromCIValue.isZero() && "Cannot convert the instruction.");
      Value *LHS = I->getOperand(1 - Pos);
          ConstantInt::get(I->getOperand(Pos)->getType(), ToCIValue);
      ((FromOpcode == Instruction::Add || FromOpcode == Instruction::Or ||
        FromOpcode == Instruction::Xor) &&
       ToOpcode == Instruction::Sub))
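      // Sketch of the constant rewriting performed above, for the Shl -> Mul
      // direction: `x << C` becomes `x * (1 << C)`, so for
      //
      //   %a = shl i32 %x, 1        ; FromOpcode = Shl, CI = 1
      //
      // the converted operands are {%x, 2} and the group can be emitted as a
      // single vector `mul`. The reverse Mul -> Shl direction uses logBase2()
      // of the multiplier, which only makes sense for power-of-two constants.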
  InterchangeableInfo MainOp;
  InterchangeableInfo AltOp;
    return ::isValidForAlternation(MainOp.I->getOpcode()) &&
  bool initializeAltOp(const Instruction *I) {
  BinOpSameOpcodeHelper(const Instruction *MainOp,
                        const Instruction *AltOp = nullptr)
      : MainOp(MainOp), AltOp(AltOp) {
  bool add(const Instruction *I) {
           "BinOpSameOpcodeHelper only accepts BinaryOperator.");
    unsigned Opcode = I->getOpcode();
    MaskType OpcodeInMaskForm;
    case Instruction::Shl:
      OpcodeInMaskForm = ShlBIT;
    case Instruction::AShr:
      OpcodeInMaskForm = AShrBIT;
    case Instruction::Mul:
      OpcodeInMaskForm = MulBIT;
    case Instruction::Add:
      OpcodeInMaskForm = AddBIT;
    case Instruction::Sub:
      OpcodeInMaskForm = SubBIT;
    case Instruction::And:
      OpcodeInMaskForm = AndBIT;
    case Instruction::Or:
      OpcodeInMaskForm = OrBIT;
    case Instruction::Xor:
      OpcodeInMaskForm = XorBIT;
      return MainOp.equal(Opcode) ||
             (initializeAltOp(I) && AltOp.equal(Opcode));
    MaskType InterchangeableMask = OpcodeInMaskForm;
    ConstantInt *CI = isBinOpWithConstantInt(I).first;
      constexpr MaskType CanBeAll =
          XorBIT | OrBIT | AndBIT | SubBIT | AddBIT | MulBIT | AShrBIT | ShlBIT;
      const APInt &CIValue = CI->getValue();
      case Instruction::Shl:
        InterchangeableMask = CIValue.isZero() ? CanBeAll : MulBIT | ShlBIT;
      case Instruction::Mul:
        if (CIValue.isOne()) {
          InterchangeableMask = CanBeAll;
        InterchangeableMask = MulBIT | ShlBIT;
      case Instruction::Add:
      case Instruction::Sub:
        InterchangeableMask = CIValue.isZero() ? CanBeAll : SubBIT | AddBIT;
      case Instruction::And:
        InterchangeableMask = CanBeAll;
      case Instruction::Xor:
        InterchangeableMask = XorBIT | OrBIT | AndBIT | SubBIT | AddBIT;
        InterchangeableMask = CanBeAll;
    return MainOp.trySet(OpcodeInMaskForm, InterchangeableMask) ||
           (initializeAltOp(I) &&
            AltOp.trySet(OpcodeInMaskForm, InterchangeableMask));
  unsigned getMainOpcode() const { return MainOp.getOpcode(); }
  bool hasCandidateOpcode(unsigned Opcode) const {
    return MainOp.hasCandidateOpcode(Opcode);
  bool hasAltOp() const { return AltOp.I; }
  unsigned getAltOpcode() const {
    return hasAltOp() ? AltOp.getOpcode() : getMainOpcode();
    return MainOp.getOperand(I);
class InstructionsState {
  bool HasCopyables = false;
    assert(valid() && "InstructionsState is invalid.");
    assert(valid() && "InstructionsState is invalid.");
  unsigned getOpcode() const { return getMainOp()->getOpcode(); }
  unsigned getAltOpcode() const { return getAltOp()->getOpcode(); }
  bool isAltShuffle() const { return getMainOp() != getAltOp(); }
  Instruction *getMatchingMainOpOrAltOp(Instruction *I) const {
    assert(MainOp && "MainOp cannot be nullptr.");
    if (I->getOpcode() == MainOp->getOpcode())
    assert(AltOp && "AltOp cannot be nullptr.");
    if (I->getOpcode() == AltOp->getOpcode())
    if (!I->isBinaryOp())
    BinOpSameOpcodeHelper Converter(MainOp);
    if (isAltShuffle() && !Converter.hasCandidateOpcode(MainOp->getOpcode())) {
      BinOpSameOpcodeHelper AltConverter(AltOp);
      if (AltConverter.add(I) && AltConverter.add(AltOp) &&
          AltConverter.hasCandidateOpcode(AltOp->getOpcode()))
    if (Converter.hasAltOp() && !isAltShuffle())
    return Converter.hasAltOp() ? AltOp : MainOp;
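  // Sketch of what getMatchingMainOpOrAltOp resolves (IR illustrative): for a
  // group {add, add, sub}, MainOp is an `add`, and an instruction like
  // `%t = sub i32 %x, 0` still maps to MainOp because BinOpSameOpcodeHelper
  // can rewrite it as `add %x, 0` (Sub with a zero constant is in the
  // all-interchangeable class). Only when no such rewrite exists does the
  // query fall through to AltOp.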
  bool isShiftOp() const {
    return getMainOp()->isShift() && getAltOp()->isShift();
    return getMainOp()->isBitwiseLogicOp() && getAltOp()->isBitwiseLogicOp();
  bool isMulDivLikeOp() const {
    constexpr std::array<unsigned, 8> MulDiv = {
        Instruction::Mul,  Instruction::FMul, Instruction::SDiv,
        Instruction::UDiv, Instruction::FDiv, Instruction::SRem,
        Instruction::URem, Instruction::FRem};
  bool isAddSubLikeOp() const {
    constexpr std::array<unsigned, 4> AddSub = {
        Instruction::Add, Instruction::Sub, Instruction::FAdd,
  bool isCmpOp() const {
    return (getOpcode() == Instruction::ICmp ||
  bool valid() const { return MainOp && AltOp; }
  explicit operator bool() const { return valid(); }
  InstructionsState() = delete;
  InstructionsState(Instruction *MainOp, Instruction *AltOp,
                    bool HasCopyables = false)
      : MainOp(MainOp), AltOp(AltOp), HasCopyables(HasCopyables) {}
  static InstructionsState invalid() { return {nullptr, nullptr}; }
  bool isCopyableElement(Value *V) const {
    assert(valid() && "InstructionsState is invalid.");
    if (isAltShuffle() || getOpcode() == Instruction::GetElementPtr)
    if (I->getParent() != MainOp->getParent() &&
    if (I->getOpcode() == MainOp->getOpcode())
    if (!I->isBinaryOp())
    BinOpSameOpcodeHelper Converter(MainOp);
  bool isNonSchedulable(Value *V) const {
    assert(valid() && "InstructionsState is invalid.");
    if (getMainOp() == V)
    if (isCopyableElement(V)) {
      auto IsNonSchedulableCopyableElement = [this](Value *V) {
        return !I || isa<PHINode>(I) ||
               I->getParent() != MainOp->getParent() ||
                 !MainOp->comesBefore(I));
      return IsNonSchedulableCopyableElement(V);
  bool areInstructionsWithCopyableElements() const {
    assert(valid() && "InstructionsState is invalid.");
    return HasCopyables;
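  // A "copyable" element, as used above, is a scalar that does not match the
  // group's opcode but can be modeled as if it did, e.g. vectorizing
  // {add, add, %x} by treating the lone value %x as the pseudo-instruction
  // `add %x, 0`. Such lanes may not need scheduling at all when no extra
  // scalar instruction is ever materialized for them.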
std::pair<Instruction *, SmallVector<Value *>>
  Instruction *SelectedOp = S.getMatchingMainOpOrAltOp(I);
  assert(SelectedOp && "Cannot convert the instruction.");
  if (I->isBinaryOp()) {
    return std::make_pair(SelectedOp, Converter.getOperand(SelectedOp));

  for (Value *V : VL) {
    if (Inst->getOpcode() == Opcode)

      BaseOp0 == Op0 || BaseOp1 == Op1 ||
         "Assessing comparisons of different types?");
  return (BasePred == Pred &&
         (BasePred == SwappedPred &&

    return InstructionsState::invalid();
    return InstructionsState::invalid();
      (VL.size() == 2 && InstCnt < 2))
    return InstructionsState::invalid();
  unsigned AltOpcode = Opcode;
  BinOpSameOpcodeHelper BinOpHelper(MainOp);
  bool SwappedPredsCompatible = IsCmpOp && [&]() {
    UniquePreds.insert(BasePred);
    UniqueNonSwappedPreds.insert(BasePred);
    for (Value *V : VL) {
      UniqueNonSwappedPreds.insert(CurrentPred);
      if (!UniquePreds.contains(CurrentPred) &&
          !UniquePreds.contains(SwappedCurrentPred))
        UniquePreds.insert(CurrentPred);
    return UniqueNonSwappedPreds.size() > 2 && UniquePreds.size() == 2;
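  // Worked example of the check above: predicates {slt, sgt, sle} give
  // UniqueNonSwappedPreds = {slt, sgt, sle} (size 3) but, folding each
  // predicate with its swapped form, UniquePreds = {slt, sle} (size 2), so
  // the compares can still be made uniform by swapping operands per lane.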
    return InstructionsState::invalid();

  bool AnyPoison = InstCnt != VL.size();
    if (AnyPoison && (I->isIntDivRem() || I->isFPDivRem() || isa<CallInst>(I)))
      return InstructionsState::invalid();
    unsigned InstOpcode = I->getOpcode();
      if (BinOpHelper.add(I))
      Value *Op1 = I->getOperand(0);
      if (InstOpcode == Opcode || InstOpcode == AltOpcode)
      if (Opcode == AltOpcode) {
               "Cast isn't safe for alternation, logic needs to be updated!");
        AltOpcode = InstOpcode;
      Type *Ty0 = BaseInst->getOperand(0)->getType();
      Type *Ty1 = Inst->getOperand(0)->getType();
        assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
        assert(InstOpcode == AltOpcode &&
               "Alternate instructions are only supported by BinaryOperator "
        if ((VL.size() == 2 || SwappedPredsCompatible) &&
            (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
        if (MainOp != AltOp) {
        } else if (BasePred != CurrentPred) {
              "CmpInst isn't safe for alternation, logic needs to be updated!");
          if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
              AltPred == CurrentPred || AltPred == SwappedCurrentPred)
    } else if (InstOpcode == Opcode) {
      assert(InstOpcode == AltOpcode &&
             "Alternate instructions are only supported by BinaryOperator and "
        if (Gep->getNumOperands() != 2 ||
          return InstructionsState::invalid();
          return InstructionsState::invalid();
        if (!LI->isSimple() || !BaseLI->isSimple())
          return InstructionsState::invalid();
          return InstructionsState::invalid();
        if (Call->hasOperandBundles() &&
            !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
                        Call->op_begin() + Call->getBundleOperandsEndIndex(),
          return InstructionsState::invalid();
          return InstructionsState::invalid();
        if (Mappings.size() != BaseMappings.size() ||
            Mappings.front().ISA != BaseMappings.front().ISA ||
            Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
            Mappings.front().VectorName != BaseMappings.front().VectorName ||
            Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
            Mappings.front().Shape.Parameters !=
                BaseMappings.front().Shape.Parameters)
          return InstructionsState::invalid();
    return InstructionsState::invalid();

  assert(MainOp && "Cannot find MainOp with Opcode from BinOpHelper.");
  assert(AltOp && "Cannot find AltOp with Opcode from BinOpHelper.");
         "Incorrect implementation of allSameOpcode.");
  InstructionsState S(MainOp, AltOp);
         "Invalid InstructionsState.");
  return all_of(VL, [&](Value *V) { return V->getType() == Ty; });

  unsigned Opcode = UserInst->getOpcode();
  case Instruction::Load: {
  case Instruction::Store: {
    return (SI->getPointerOperand() == Scalar);
  case Instruction::Call: {
      return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index(), TTI) &&
             Arg.value().get() == Scalar;

    return LI->isSimple();
    return SI->isSimple();
    return !MI->isVolatile();
                    bool ExtendingManyInputs = false) {
  if (SubMask.empty())
         (!ExtendingManyInputs || SubMask.size() > Mask.size() ||
         "SubMask with many inputs support must be larger than the mask.");
    Mask.append(SubMask.begin(), SubMask.end());
  int TermValue = std::min(Mask.size(), SubMask.size());
  for (int I = 0, E = SubMask.size(); I < E; ++I) {
        (!ExtendingManyInputs &&
         (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)))
    NewMask[I] = Mask[SubMask[I]];
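// Worked example of the mask composition above: applying SubMask on top of
// Mask selects Mask[SubMask[I]] for each lane, so
//
//   Mask    = {3, 2, 0, 1}
//   SubMask = {1, 1, 2, 3}
//   NewMask = {2, 2, 0, 1}
//
// i.e. shuffles compose like function application, SubMask applied last.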
  const size_t Sz = Order.size();
  for (unsigned I = 0; I < Sz; ++I) {
      UnusedIndices.reset(Order[I]);
      MaskedIndices.set(I);
  if (MaskedIndices.none())
         "Non-synced masked/available indices.");
    assert(Idx >= 0 && "Indices must be synced.");
                                          unsigned Opcode0, unsigned Opcode1) {
      OpcodeMask.set(Lane * ScalarTyNumElements,
                     Lane * ScalarTyNumElements + ScalarTyNumElements);

         "Expected scalar constants.");
    std::fill_n(NewVal.begin() + I * VF, VF, V);
  const unsigned E = Indices.size();
  for (unsigned I = 0; I < E; ++I)
    Mask[Indices[I]] = I;
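// Worked example: inverting Indices = {1, 2, 0} produces Mask = {2, 0, 1},
// since Mask[Indices[I]] = I; composing the permutation with its inverse mask
// yields the identity order.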
  assert(!Mask.empty() && "Expected non-empty mask.");
  for (unsigned I = 0, E = Prev.size(); I < E; ++I)
      Scalars[Mask[I]] = Prev[I];
    auto *IO = dyn_cast<Instruction>(V);
    return isa<PHINode>(IO) || IO->getParent() != I->getParent();

  return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(UsesLimit) &&
           auto *IU = dyn_cast<Instruction>(U);
           return IU->getParent() != I->getParent() || isa<PHINode>(IU);

  return !VL.empty() &&

  return NumParts > 0 && NumParts < Sz && has_single_bit(Sz / NumParts) &&

    const unsigned Limit = std::numeric_limits<unsigned>::max()) {
  unsigned NumParts = TTI.getNumberOfParts(VecTy);
  if (NumParts == 0 || NumParts >= Limit)
  if (NumParts >= Sz || Sz % NumParts != 0 ||
  class ScheduleEntity;
  class ScheduleCopyableData;
  class ScheduleBundle;

  struct StridedPtrInfo {
    Value *StrideVal = nullptr;
    const SCEV *StrideSCEV = nullptr;

      : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li), DT(Dt),
        AC(AC), DB(DB), DL(DL), ORE(ORE),
    MinVecRegSize = TTI->getMinVectorRegisterBitWidth();

      ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales = {});
      const SmallDenseSet<Value *> &UserIgnoreLst);

    assert(!VectorizableTree.empty() && "No graph to get the first node from");
    return VectorizableTree.front()->Scalars;
    const TreeEntry &Root = *VectorizableTree.front();
    if (Root.State != TreeEntry::Vectorize || Root.isAltShuffle() ||
        !Root.Scalars.front()->getType()->isIntegerTy())
      return std::nullopt;
    auto It = MinBWs.find(&Root);
    if (It != MinBWs.end())
    if (Root.getOpcode() == Instruction::ZExt ||
        Root.getOpcode() == Instruction::SExt)
      return std::make_pair(cast<CastInst>(Root.getMainOp())->getSrcTy(),
                            Root.getOpcode() == Instruction::SExt);
    return std::nullopt;

    return MinBWs.at(VectorizableTree.front().get()).second;

    if (ReductionBitWidth == 0 ||
        !VectorizableTree.front()->Scalars.front()->getType()->isIntegerTy() ||
        ReductionBitWidth >=
            DL->getTypeSizeInBits(
                VectorizableTree.front()->Scalars.front()->getType()))
          VectorizableTree.front()->Scalars.front()->getType(),
          VectorizableTree.front()->getVectorFactor());
          VectorizableTree.front()->Scalars.front()->getContext(),
          VectorizableTree.front()->getVectorFactor());
    VectorizableTree.clear();
    ScalarToTreeEntries.clear();
    OperandsToTreeEntry.clear();
    ScalarsInSplitNodes.clear();
    NonScheduledFirst.clear();
    EntryToLastInstruction.clear();
    LastInstructionToPos.clear();
    LoadEntriesToVectorize.clear();
    IsGraphTransformMode = false;
    GatheredLoadsEntriesFirst.reset();
    CompressEntryToData.clear();
    ExternalUses.clear();
    ExternalUsesAsOriginalScalar.clear();
    ExternalUsesWithNonUsers.clear();
    for (auto &Iter : BlocksSchedules) {
      BlockScheduling *BS = Iter.second.get();
    ReductionBitWidth = 0;
    CastMaxMinBWSizes.reset();
    ExtraBitWidthNodes.clear();
    InstrElementSize.clear();
    UserIgnoreList = nullptr;
    PostponedGathers.clear();
    ValueToGatherNodes.clear();
    TreeEntryToStridedPtrInfoMap.clear();
    assert(!Order.empty() && "expected non-empty order");
    const unsigned Sz = Order.size();
      return P.value() == P.index() || P.value() == Sz;

                         bool IgnoreReorder);
  std::optional<OrdersType>

    return MaxVecRegSize;
    return MinVecRegSize;

    unsigned MaxVF = MaxVFOption.getNumOccurrences()
                         ? MaxVFOption
                         : TTI->getMaximumVF(ElemWidth, Opcode);
    return MaxVF ? MaxVF : UINT_MAX;
                     Align Alignment, const int64_t Diff,
                     const size_t Sz) const;

                      Value *Ptr0, Value *PtrN, StridedPtrInfo &SPtrInfo) const;

                        Align CommonAlignment,
                        StridedPtrInfo &SPtrInfo) const;

                   StridedPtrInfo &SPtrInfo,
                   unsigned *BestVF = nullptr,
                   bool TryRecursiveCheck = true) const;

    ListOfKnonwnNonVectorizableLoads.insert(hash_value(VL));

  template <typename T>
    return ListOfKnonwnNonVectorizableLoads.contains(hash_value(VL));

      OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
         << " EdgeIdx:" << EdgeIdx << "}";
        : TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes),
          MaxLevel(MaxLevel) {}

      auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) {
        auto AllUsersVectorized = [U1, U2, this](Value *V) {
            return U == U1 || U == U2 || R.isVectorized(U);
        return AllUsersVectorized(V1) && AllUsersVectorized(V2);
      if (R.TTI->isLegalBroadcastLoad(V1->getType(),
          ((int)V1->getNumUses() == NumLanes ||
           AllUsersAreInternal(V1, V2)))

    auto CheckSameEntryOrFail = [&]() {
          any_of(TEs2, [&](TreeEntry *E) { return Set.contains(E); }))

      if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
        return CheckSameEntryOrFail();
          LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
          LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
      if (!Dist || *Dist == 0) {
            R.TTI->isLegalMaskedGather(
        return CheckSameEntryOrFail();
      if (std::abs(*Dist) > NumLanes / 2)

      Value *EV2 = nullptr;
      int Dist = Idx2 - Idx1;
      if (std::abs(Dist) == 0)
      if (std::abs(Dist) > NumLanes / 2)
        return CheckSameEntryOrFail();

      if (I1->getParent() != I2->getParent())
        return CheckSameEntryOrFail();
          (S.getMainOp()->getNumOperands() <= 2 || !MainAltOps.empty() ||
           !S.isAltShuffle()) &&
          S.getMainOp()->getNumOperands();
        return CheckSameEntryOrFail();
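    // Sketch of the distance heuristic above: for a 4-lane node, candidate
    // loads (or extracts) whose indices differ by more than NumLanes / 2 == 2
    // are not considered neighbor-friendly and fall back to
    // CheckSameEntryOrFail(); a distance of exactly 1 is the ideal,
    // directly-vectorizable case.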
    int ShallowScoreAtThisLevel =
    if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
        (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
         ShallowScoreAtThisLevel))
      return ShallowScoreAtThisLevel;
    assert(I1 && I2 && "Should have early exited.");
    for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
         OpIdx1 != NumOperands1; ++OpIdx1) {
      int MaxTmpScore = 0;
      unsigned MaxOpIdx2 = 0;
      bool FoundBest = false;
              ? I2->getNumOperands()
              : std::min(I2->getNumOperands(), OpIdx1 + 1);
      assert(FromIdx <= ToIdx && "Bad index");
      for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
        if (Op2Used.count(OpIdx2))
            I1, I2, CurrLevel + 1, {});
            TmpScore > MaxTmpScore) {
          MaxTmpScore = TmpScore;
        Op2Used.insert(MaxOpIdx2);
        ShallowScoreAtThisLevel += MaxTmpScore;
    return ShallowScoreAtThisLevel;
  struct OperandData {
    OperandData() = default;
    OperandData(Value *V, bool APO, bool IsUsed)
        : V(V), APO(APO), IsUsed(IsUsed) {}
    bool IsUsed = false;

  enum class ReorderingMode {

  unsigned ArgSize = 0;
  const Loop *L = nullptr;

  OperandData &getData(unsigned OpIdx, unsigned Lane) {
    return OpsVec[OpIdx][Lane];
  const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
    return OpsVec[OpIdx][Lane];

    for (unsigned OpIdx = 0, NumOperands = getNumOperands();
      for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
        OpsVec[OpIdx][Lane].IsUsed = false;

  void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
    std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
  int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx,
    Value *IdxLaneV = getData(Idx, Lane).V;
    unsigned UniquesCount = Uniques.size();
    auto IdxIt = Uniques.find(IdxLaneV);
    unsigned UniquesCntWithIdxLaneV =
        IdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
    auto OpIdxIt = Uniques.find(OpIdxLaneV);
    unsigned UniquesCntWithOpIdxLaneV =
        OpIdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
    if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
    return std::min(bit_ceil(UniquesCntWithOpIdxLaneV) -
                        UniquesCntWithOpIdxLaneV,
                    UniquesCntWithOpIdxLaneV -
                        bit_floor(UniquesCntWithOpIdxLaneV)) -
           ((IdxIt != Uniques.end() && UsedLanes.test(IdxIt->second))
                ? UniquesCntWithIdxLaneV - bit_floor(UniquesCntWithIdxLaneV)
                : bit_ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
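  // Worked example of the power-of-two scoring above: with 3 unique operand
  // values, bit_ceil(3) - 3 == 1 and 3 - bit_floor(3) == 1, while a candidate
  // that keeps the count at a power of two scores bit_ceil(4) - 4 == 0, so
  // power-of-two unique counts are preferred (splat/identity shuffles on
  // power-of-two widths are cheaper).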
  int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
    Value *IdxLaneV = getData(Idx, Lane).V;
    return R.areAllUsersVectorized(IdxLaneI)

  static const int ScoreScaleFactor = 10;

                        int Lane, unsigned OpIdx, unsigned Idx,
      int SplatScore = getSplatScore(Lane, OpIdx, Idx, UsedLanes);
      if (Score <= -SplatScore) {
        Score += SplatScore;
      Score *= ScoreScaleFactor;
      Score += getExternalUseScore(Lane, OpIdx, Idx);
  std::optional<unsigned>
  getBestOperand(unsigned OpIdx, int Lane, int LastLane,
    unsigned NumOperands = getNumOperands();
    Value *OpLastLane = getData(OpIdx, LastLane).V;
    ReorderingMode RMode = ReorderingModes[OpIdx];
    if (RMode == ReorderingMode::Failed)
      return std::nullopt;
    bool OpIdxAPO = getData(OpIdx, Lane).APO;
      std::optional<unsigned> Idx;
        BestScoresPerLanes.try_emplace(std::make_pair(OpIdx, Lane), 0)
    bool IsUsed = RMode == ReorderingMode::Splat ||
                  RMode == ReorderingMode::Constant ||
                  RMode == ReorderingMode::Load;
    for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
      OperandData &OpData = getData(Idx, Lane);
      bool OpAPO = OpData.APO;
      if (OpAPO != OpIdxAPO)
      case ReorderingMode::Load:
      case ReorderingMode::Opcode: {
        bool LeftToRight = Lane > LastLane;
        Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
        Value *OpRight = (LeftToRight) ? Op : OpLastLane;
        int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
                                      OpIdx, Idx, IsUsed, UsedLanes);
        if (Score > static_cast<int>(BestOp.Score) ||
            (Score > 0 && Score == static_cast<int>(BestOp.Score) &&
          BestOp.Score = Score;
          BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
      case ReorderingMode::Constant:
          (!BestOp.Score && L && L->isLoopInvariant(Op))) {
          BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
      case ReorderingMode::Splat:
        IsUsed = Op == OpLastLane;
        if (Op == OpLastLane) {
          BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
      case ReorderingMode::Failed:
      getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
    return std::nullopt;
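  // Sketch of the reordering this enables (scalar code for illustration):
  //
  //   a[0] = x0 + y0;   // operand lanes as written: {x0, y1} and {y0, x1}
  //   a[1] = y1 + x1;
  //
  // getBestOperand picks x1 for lane 1 of operand 0, so after the swap the
  // operands become {x0, x1} and {y0, y1}, letting both sides vectorize
  // without an extra shuffle.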
  unsigned getBestLaneToStartReordering() const {
    unsigned Min = UINT_MAX;
    unsigned SameOpNumber = 0;
    for (int I = getNumLanes(); I > 0; --I) {
      unsigned Lane = I - 1;
      OperandsOrderData NumFreeOpsHash =
          getMaxNumOperandsThatCanBeReordered(Lane);
      if (NumFreeOpsHash.NumOfAPOs < Min) {
        Min = NumFreeOpsHash.NumOfAPOs;
        SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
        HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
      } else if (NumFreeOpsHash.NumOfAPOs == Min &&
                 NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
        SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
        HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
      } else if (NumFreeOpsHash.NumOfAPOs == Min &&
                 NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
        auto [It, Inserted] =
            HashMap.try_emplace(NumFreeOpsHash.Hash, 1, Lane);
    unsigned BestLane = 0;
    unsigned CntMin = UINT_MAX;
      if (Data.second.first < CntMin) {
        CntMin = Data.second.first;
        BestLane = Data.second.second;
  struct OperandsOrderData {
    unsigned NumOfAPOs = UINT_MAX;
    unsigned NumOpsWithSameOpcodeParent = 0;

  OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
    unsigned CntTrue = 0;
    unsigned NumOperands = getNumOperands();
    bool AllUndefs = true;
    unsigned NumOpsWithSameOpcodeParent = 0;
      const OperandData &OpData = getData(OpIdx, Lane);
          I->getParent() != Parent) {
        if (NumOpsWithSameOpcodeParent == 0) {
          NumOpsWithSameOpcodeParent = 1;
        Parent = I->getParent();
          --NumOpsWithSameOpcodeParent;
        ++NumOpsWithSameOpcodeParent;
    OperandsOrderData Data;
    Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
    Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
                      const InstructionsState &S) {
      return VL.size() == getNumLanes();
           "Expected same number of lanes");
    assert(S.valid() && "InstructionsState is invalid.");
    OpsVec.resize(ArgSize);
    unsigned NumLanes = VL.size();
    for (OperandDataVec &Ops : OpsVec)
      Ops.resize(NumLanes);
        OpsVec[OpIdx][Lane] = {Operands[OpIdx][Lane], true, false};
      bool IsInverseOperation = false;
      if (S.isCopyableElement(VL[Lane])) {
        IsInverseOperation =
        assert(I && "Expected instruction");
        auto [SelectedOp, Ops] = convertTo(I, S);
        bool APO = (OpIdx == 0) ? false : IsInverseOperation;
        OpsVec[OpIdx][Lane] = {Operands[OpIdx][Lane], APO, false};
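  // The APO bit set above marks operands sitting in a non-commutative
  // position (e.g. the second operand of a sub). For example, in the pair
  // {add %a, %b; sub %c, %d}, %d gets APO = true, and getBestOperand() never
  // exchanges an APO operand with a non-APO one, since that would change the
  // computed value.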
  unsigned getNumOperands() const { return ArgSize; }

  unsigned getNumLanes() const { return OpsVec[0].size(); }

  Value *getValue(unsigned OpIdx, unsigned Lane) const {
    return getData(OpIdx, Lane).V;

  bool empty() const { return OpsVec.empty(); }

  void clear() { OpsVec.clear(); }

  bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
           "Op is expected to be getValue(OpIdx, Lane).");
    bool OpAPO = getData(OpIdx, Lane).APO;
    bool IsInvariant = L && L->isLoopInvariant(Op);
    for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
      bool FoundCandidate = false;
      for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
        OperandData &Data = getData(OpI, Ln);
        if (Data.APO != OpAPO || Data.IsUsed)
        Value *OpILane = getValue(OpI, Lane);
            L->isLoopInvariant(Data.V))) {
          FoundCandidate = true;
      if (!FoundCandidate)
    return getNumLanes() == 2 || Cnt > 1;
3262 "Op is expected to be getValue(OpIdx, Lane).");
3263 bool OpAPO = getData(
OpIdx, Lane).APO;
3264 for (
unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
3268 const OperandData &
Data = getData(OpI, Ln);
3269 if (
Data.APO != OpAPO ||
Data.IsUsed)
3271 Value *OpILn = getValue(OpI, Ln);
3272 return (L && L->isLoopInvariant(OpILn)) ||
3284 const InstructionsState &S,
const BoUpSLP &R)
3285 : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R),
3286 L(R.LI->getLoopFor(S.getMainOp()->
getParent())) {
3288 appendOperands(RootVL, Operands, S);
3296 "Expected same num of lanes across all operands");
3297 for (
unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
3298 OpVL[Lane] = OpsVec[
OpIdx][Lane].V;
    unsigned NumOperands = getNumOperands();
    unsigned NumLanes = getNumLanes();
    unsigned FirstLane = getBestLaneToStartReordering();
      if (shouldBroadcast(OpLane0, OpIdx, FirstLane) ||
          !canBeVectorized(OpILane0, OpIdx, FirstLane))
        ReorderingModes[OpIdx] = ReorderingMode::Splat;
        ReorderingModes[OpIdx] = ReorderingMode::Load;
        ReorderingModes[OpIdx] = ReorderingMode::Opcode;
        ReorderingModes[OpIdx] = ReorderingMode::Constant;
        ReorderingModes[OpIdx] = ReorderingMode::Splat;

    auto &&SkipReordering = [this]() {
      for (const OperandData &Data : Op0)
           ArrayRef(OpsVec).slice(1, getNumOperands() - 1)) {
        if (any_of(Op, [&UniqueValues](const OperandData &Data) {
      return UniqueValues.size() != 2 &&
                     UniqueValues.size());

    if (SkipReordering())

    bool StrategyFailed = false;
    for (unsigned I = 0; I < NumOperands; ++I)
      MainAltOps[I].push_back(getData(I, FirstLane).V);
    UsedLanes.set(FirstLane);
    for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
      for (int Direction : {+1, -1}) {
        int Lane = FirstLane + Direction * Distance;
        if (Lane < 0 || Lane >= (int)NumLanes)
        UsedLanes.set(Lane);
        int LastLane = Lane - Direction;
        assert(LastLane >= 0 && LastLane < (int)NumLanes &&
          std::optional<unsigned> BestIdx =
              getBestOperand(OpIdx, Lane, LastLane, ReorderingModes,
                             MainAltOps[OpIdx], UsedLanes);
            swap(OpIdx, *BestIdx, Lane);
            StrategyFailed = true;
            OperandData &AltOp = getData(OpIdx, Lane);
            InstructionsState OpS =
            if (OpS && OpS.isAltShuffle())
    if (!StrategyFailed)
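    // The traversal above walks outward from the best starting lane in both
    // directions (FirstLane, FirstLane+1, FirstLane-1, FirstLane+2, ...), so
    // each lane is matched against an already-reordered neighbor: for 4 lanes
    // starting at lane 2, the visit order is 2, 3, 1, 0.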
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    case ReorderingMode::Load:
    case ReorderingMode::Opcode:
    case ReorderingMode::Constant:
    case ReorderingMode::Splat:
    case ReorderingMode::Failed:

    const unsigned Indent = 2;
    for (const OperandDataVec &OpDataVec : OpsVec) {
      OS << "Operand " << Cnt++ << "\n";
      for (const OperandData &OpData : OpDataVec) {
        OS.indent(Indent) << "{";
        if (Value *V = OpData.V)
        OS << ", APO:" << OpData.APO << "}\n";
    int BestScore = Limit;
    std::optional<int> Index;
    for (int I : seq<int>(0, Candidates.size())) {
          Candidates[I].second,
      if (Score > BestScore) {

    DeletedInstructions.insert(I);
  template <typename T>
      ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales) {
    for (T *V : DeadVals) {
    for (T *V : DeadVals) {
      if (!V || !Processed.insert(V).second)
      for (Use &U : I->operands()) {
            OpI && !DeletedInstructions.contains(OpI) && OpI->hasOneUser() &&
            (Entries.empty() || none_of(Entries, [&](const TreeEntry *Entry) {
               return Entry->VectorizedValue == OpI;
      I->dropAllReferences();
    for (T *V : DeadVals) {
      if (!I->getParent())
                 cast<Instruction>(U.getUser()));
             "trying to erase instruction with users.");
      I->removeFromParent();
    while (!DeadInsts.empty()) {
      if (!VI || !VI->getParent())
             "Live instruction found in dead worklist!");
      assert(VI->use_empty() && "Instructions with uses are not dead.");
      for (Use &OpU : VI->operands()) {
        Value *OpV = OpU.get();
        if (!DeletedInstructions.contains(OpI) &&
            (!OpI->getType()->isVectorTy() ||
             none_of(VectorValuesAndScales,
                     [&](const std::tuple<Value *, unsigned, bool> &V) {
                       return std::get<0>(V) == OpI;
      VI->removeFromParent();
      SE->forgetValue(VI);
    return AnalyzedReductionsRoots.count(I);
    AnalyzedReductionsRoots.insert(I);
    return AnalyzedReductionVals.contains(hash_value(VL));
    AnalyzedReductionVals.insert(hash_value(VL));
    AnalyzedReductionsRoots.clear();
    AnalyzedReductionVals.clear();
    AnalyzedMinBWVals.clear();

    return MustGather.contains(V);
    return NonScheduledFirst.contains(V);

    assert(V && "V cannot be nullptr.");
    return ScalarToTreeEntries.contains(V);
  bool collectValuesToDemote(
      const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
      bool &IsProfitableToDemote, bool IsTruncRoot) const;

  void buildReorderableOperands(

  void reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const;

  bool areAllUsersVectorized(

  const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const;
  TreeEntry *getOperandEntry(TreeEntry *E, unsigned Idx) {
    return const_cast<TreeEntry *>(
        getOperandEntry(const_cast<const TreeEntry *>(E), Idx));

  Instruction *getRootEntryInstruction(const TreeEntry &Entry) const;

  getCastContextHint(const TreeEntry &TE) const;

                         const InstructionsState &LocalState,
                         unsigned InterleaveFactor = 0);

                      bool ResizeAllowed = false) const;

  Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx);

  template <typename BVTy, typename ResTy, typename... Args>
  ResTy processBuildVector(const TreeEntry *E, Type *ScalarTy, Args &...Params);

  Value *createBuildVector(const TreeEntry *E, Type *ScalarTy);

  Instruction &getLastInstructionInBundle(const TreeEntry *E);

  std::optional<TargetTransformInfo::ShuffleKind>
                                 unsigned NumParts) const;

  std::optional<TargetTransformInfo::ShuffleKind>
  isGatherShuffledSingleRegisterEntry(

  isGatherShuffledEntry(
      unsigned NumParts, bool ForOrder = false);

                 Type *ScalarTy) const;

  void setInsertPointAfterBundle(const TreeEntry *E);

  bool isFullyVectorizableTinyTree(bool ForReduction) const;

  void tryToVectorizeGatheredLoads(
      std::tuple<BasicBlock *, Value *, Type *>,

  collectUserStores(const BoUpSLP::TreeEntry *TE) const;

  findExternalStoreUsersReorderIndices(TreeEntry *TE) const;

  void reorderGatherNode(TreeEntry &TE);
    TreeEntry(VecTreeTy &Container) : Container(Container) {}

    SmallVector<int> getCommonMask() const {
      if (State == TreeEntry::SplitVectorize)
      SmallVector<int> Mask;

    SmallVector<int> getSplitMask() const {
      assert(State == TreeEntry::SplitVectorize && !ReorderIndices.empty() &&
             "Expected only split vectorize node.");
      unsigned CommonVF = std::max<unsigned>(
          CombinedEntriesWithIndices.back().second,
          Scalars.size() - CombinedEntriesWithIndices.back().second);
      for (auto [Idx, I] : enumerate(ReorderIndices))
            Idx + (Idx >= CombinedEntriesWithIndices.back().second
                       ? CommonVF - CombinedEntriesWithIndices.back().second

    void reorderSplitNode(unsigned Idx, ArrayRef<int> Mask,
                          ArrayRef<int> MaskOrder);

      if (Mask.size() != VL.size() && VL.size() == Scalars.size())
        return std::equal(VL.begin(), VL.end(), Scalars.begin());
                        [Scalars](Value *V, int Idx) {
                          return (isa<UndefValue>(V) &&
                                  Idx == PoisonMaskElem) ||
                                 (Idx != PoisonMaskElem && V == Scalars[Idx]);
      if (!ReorderIndices.empty()) {
        SmallVector<int> Mask;
        if (VL.size() == Scalars.size())
          return IsSame(Scalars, Mask);
        if (VL.size() == ReuseShuffleIndices.size()) {
          return IsSame(Scalars, Mask);
      return IsSame(Scalars, ReuseShuffleIndices);
    bool hasEqualOperands(const TreeEntry &TE) const {
      if (TE.getNumOperands() != getNumOperands())
      SmallBitVector Used(getNumOperands());
      for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {
        unsigned PrevCount = Used.count();
        for (unsigned K = 0; K < E; ++K) {
          if (getOperand(K) == TE.getOperand(I)) {
        if (PrevCount == Used.count())

    unsigned getVectorFactor() const {
      if (!ReuseShuffleIndices.empty())
        return ReuseShuffleIndices.size();
      return Scalars.size();

    bool isGather() const { return State == NeedToGather; }
    WeakTrackingVH VectorizedValue = nullptr;

    enum CombinedOpcode {
      MinMax = Instruction::OtherOpsEnd + 1,
    CombinedOpcode CombinedOp = NotCombinedOp;

    SmallVector<int, 4> ReuseShuffleIndices;

    SmallVector<unsigned, 4> ReorderIndices;

    VecTreeTy &Container;

    EdgeInfo UserTreeIndex;

    SmallVector<ValueList, 2> Operands;

    SmallPtrSet<const Value *, 4> CopyableElements;

    InstructionsState S = InstructionsState::invalid();

    unsigned InterleaveFactor = 0;

    bool DoesNotNeedToSchedule = false;

      if (Operands.size() < OpIdx + 1)
        Operands.resize(OpIdx + 1);
             "Number of operands is greater than the number of scalars.");

    unsigned getInterleaveFactor() const { return InterleaveFactor; }
    void setInterleave(unsigned Factor) { InterleaveFactor = Factor; }
    void setDoesNotNeedToSchedule() { DoesNotNeedToSchedule = true; }
    bool doesNotNeedToSchedule() const { return DoesNotNeedToSchedule; }
        setOperand(I, Operands[I]);
    void reorderOperands(ArrayRef<int> Mask) {
      return Operands[OpIdx];
      return Operands[OpIdx];

    unsigned getNumOperands() const { return Operands.size(); }

    Value *getSingleOperand(unsigned OpIdx) const {
      return Operands[OpIdx][0];

    bool isAltShuffle() const { return S.isAltShuffle(); }

    Instruction *getMatchingMainOpOrAltOp(Instruction *I) const {
      return S.getMatchingMainOpOrAltOp(I);
      if (I && getMatchingMainOpOrAltOp(I))
      return S.getMainOp();

    void setOperations(const InstructionsState &S) {
      assert(S && "InstructionsState is invalid.");

    Instruction *getMainOp() const { return S.getMainOp(); }

    Instruction *getAltOp() const { return S.getAltOp(); }

    unsigned getOpcode() const { return S.getOpcode(); }

    unsigned getAltOpcode() const { return S.getAltOpcode(); }

    bool hasState() const { return S.valid(); }

    void addCopyableElement(Value *V) {
      assert(S.isCopyableElement(V) && "Not a copyable element.");
      CopyableElements.insert(V);

    bool isCopyableElement(Value *V) const {
      return CopyableElements.contains(V);

    bool hasCopyableElements() const { return !CopyableElements.empty(); }

    const InstructionsState &getOperations() const { return S; }
    unsigned findLaneForValue(Value *V) const {
      unsigned FoundLane = getVectorFactor();
      for (auto *It = find(Scalars, V), *End = Scalars.end(); It != End;
           std::advance(It, 1)) {
        FoundLane = std::distance(Scalars.begin(), It);
        assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
        if (!ReorderIndices.empty())
          FoundLane = ReorderIndices[FoundLane];
        assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
        if (ReuseShuffleIndices.empty())
        if (auto *RIt = find(ReuseShuffleIndices, FoundLane);
            RIt != ReuseShuffleIndices.end()) {
          FoundLane = std::distance(ReuseShuffleIndices.begin(), RIt);
      assert(FoundLane < getVectorFactor() && "Unable to find given value.");
    buildAltOpShuffleMask(const function_ref<bool(Instruction *)> IsAltOp,
                          SmallVectorImpl<int> &Mask,
                          SmallVectorImpl<Value *> *OpScalars = nullptr,
                          SmallVectorImpl<Value *> *AltScalars = nullptr) const;

    bool isNonPowOf2Vec() const {
      return IsNonPowerOf2;

    hasNonWholeRegisterOrNonPowerOf2Vec(const TargetTransformInfo &TTI) const {
      assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) &&
             "Reshuffling not supported with non-power-of-2 vectors yet.");
      return IsNonPowerOf2;
    Value *getOrdered(unsigned Idx) const {
      assert(isGather() && "Must be used only for buildvectors/gathers.");
      if (ReorderIndices.empty())
        return Scalars[Idx];
      SmallVector<int> Mask;
      return Scalars[Mask[Idx]];

      dbgs() << Idx << ".\n";
      for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
        dbgs() << "Operand " << OpI << ":\n";
        for (const Value *V : Operands[OpI])
      dbgs() << "Scalars: \n";
      for (Value *V : Scalars)
      dbgs() << "State: ";
      if (S && hasCopyableElements())
        dbgs() << "[[Copyable]] ";
        if (InterleaveFactor > 0) {
          dbgs() << "Vectorize with interleave factor " << InterleaveFactor
          dbgs() << "Vectorize\n";
      case ScatterVectorize:
        dbgs() << "ScatterVectorize\n";
      case StridedVectorize:
        dbgs() << "StridedVectorize\n";
      case CompressVectorize:
        dbgs() << "CompressVectorize\n";
        dbgs() << "NeedToGather\n";
      case CombinedVectorize:
        dbgs() << "CombinedVectorize\n";
      case SplitVectorize:
        dbgs() << "SplitVectorize\n";
        dbgs() << "MainOp: " << *S.getMainOp() << "\n";
        dbgs() << "AltOp: " << *S.getAltOp() << "\n";
        dbgs() << "MainOp: NULL\n";
        dbgs() << "AltOp: NULL\n";
      dbgs() << "VectorizedValue: ";
      if (VectorizedValue)
        dbgs() << *VectorizedValue << "\n";
      dbgs() << "ReuseShuffleIndices: ";
      if (ReuseShuffleIndices.empty())
        for (int ReuseIdx : ReuseShuffleIndices)
          dbgs() << ReuseIdx << ", ";
      dbgs() << "ReorderIndices: ";
      for (unsigned ReorderIdx : ReorderIndices)
        dbgs() << ReorderIdx << ", ";
      dbgs() << "UserTreeIndex: ";
        dbgs() << UserTreeIndex;
        dbgs() << "<invalid>";
      if (!CombinedEntriesWithIndices.empty()) {
        dbgs() << "Combined entries: ";
          dbgs() << "Entry index " << P.first << " with offset " << P.second;

                  StringRef Banner) const {
      dbgs() << "SLP: " << Banner << ":\n";
      dbgs() << "SLP: Costs:\n";
      dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n";
      dbgs() << "SLP: VectorCost = " << VecCost << "\n";
      dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n";
      dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = "
             << ReuseShuffleCost + VecCost - ScalarCost << "\n";
                          const InstructionsState &S,
                          ArrayRef<int> ReuseShuffleIndices = {}) {
    auto Invalid = ScheduleBundle::invalid();
    return newTreeEntry(VL, Invalid, S, UserTreeIdx, ReuseShuffleIndices);

                          const InstructionsState &S,
                          ArrayRef<int> ReuseShuffleIndices = {},
                          ArrayRef<unsigned> ReorderIndices = {},
                          unsigned InterleaveFactor = 0) {
    TreeEntry::EntryState EntryState =
        Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
    TreeEntry *E = newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
                                ReuseShuffleIndices, ReorderIndices);
    if (E && InterleaveFactor > 0)
      E->setInterleave(InterleaveFactor);

                          TreeEntry::EntryState EntryState,
                          ScheduleBundle &Bundle, const InstructionsState &S,
                          ArrayRef<int> ReuseShuffleIndices = {},
                          ArrayRef<unsigned> ReorderIndices = {}) {
    assert(((!Bundle && (EntryState == TreeEntry::NeedToGather ||
                         EntryState == TreeEntry::SplitVectorize)) ||
            (Bundle && EntryState != TreeEntry::NeedToGather &&
             EntryState != TreeEntry::SplitVectorize)) &&
           "Need to vectorize gather entry?");
    if (GatheredLoadsEntriesFirst.has_value() &&
        EntryState == TreeEntry::NeedToGather && S &&
        S.getOpcode() == Instruction::Load && UserTreeIdx.EdgeIdx == UINT_MAX &&
        !UserTreeIdx.UserTE)
    VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
    TreeEntry *Last = VectorizableTree.back().get();
    Last->Idx = VectorizableTree.size() - 1;
    Last->State = EntryState;
    if (UserTreeIdx.UserTE)
      OperandsToTreeEntry.try_emplace(
          std::make_pair(UserTreeIdx.UserTE, UserTreeIdx.EdgeIdx), Last);
            ReuseShuffleIndices.empty()) &&
           "Reshuffling scalars not yet supported for nodes with padding");
    Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
                                     ReuseShuffleIndices.end());
    if (ReorderIndices.empty()) {
      Last->setOperations(S);
      Last->Scalars.assign(VL.size(), nullptr);
                [VL](unsigned Idx) -> Value * {
                  if (Idx >= VL.size())
                    return UndefValue::get(VL.front()->getType());
      Last->setOperations(S);
      Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
    if (EntryState == TreeEntry::SplitVectorize) {
      assert(S && "Split nodes must have operations.");
      Last->setOperations(S);
      SmallPtrSet<Value *, 4> Processed;
      for (Value *V : VL) {
        auto It = ScalarsInSplitNodes.find(V);
        if (It == ScalarsInSplitNodes.end()) {
          ScalarsInSplitNodes.try_emplace(V).first->getSecond().push_back(Last);
          (void)Processed.insert(V);
        } else if (Processed.insert(V).second) {
                 "Value already associated with the node.");
          It->getSecond().push_back(Last);
    } else if (!Last->isGather()) {
          (!S.areInstructionsWithCopyableElements() &&
          all_of(VL, [&](Value *V) { return S.isNonSchedulable(V); }))
        Last->setDoesNotNeedToSchedule();
      SmallPtrSet<Value *, 4> Processed;
      for (Value *V : VL) {
        if (S.isCopyableElement(V)) {
          Last->addCopyableElement(V);
        auto It = ScalarToTreeEntries.find(V);
        if (It == ScalarToTreeEntries.end()) {
          ScalarToTreeEntries.try_emplace(V).first->getSecond().push_back(Last);
          (void)Processed.insert(V);
        } else if (Processed.insert(V).second) {
                 "Value already associated with the node.");
          It->getSecond().push_back(Last);
      assert((!Bundle.getBundle().empty() || Last->doesNotNeedToSchedule()) &&
             "Bundle and VL out of sync");
      if (!Bundle.getBundle().empty()) {
#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
        auto *BundleMember = Bundle.getBundle().begin();
        SmallPtrSet<Value *, 4> Processed;
        for (Value *V : VL) {
          if (S.isNonSchedulable(V) || !Processed.insert(V).second)
        assert(BundleMember == Bundle.getBundle().end() &&
               "Bundle and VL out of sync");
        Bundle.setTreeEntry(Last);
      bool AllConstsOrCasts = true;
      for (Value *V : VL) {
        if (S && S.areInstructionsWithCopyableElements() &&
            S.isCopyableElement(V))
          Last->addCopyableElement(V);
        AllConstsOrCasts &= I && I->getType()->isIntegerTy();
        if (UserTreeIdx.EdgeIdx != UINT_MAX || !UserTreeIdx.UserTE ||
            !UserTreeIdx.UserTE->isGather())
          ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last);
      if (AllConstsOrCasts)
            std::make_pair(std::numeric_limits<unsigned>::max(), 1);
      MustGather.insert_range(VL);
    if (UserTreeIdx.UserTE)
      Last->UserTreeIndex = UserTreeIdx;
  TreeEntry::VecTreeTy VectorizableTree;

    for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
      VectorizableTree[Id]->dump();

    assert(V && "V cannot be nullptr.");
    auto It = ScalarToTreeEntries.find(V);
    if (It == ScalarToTreeEntries.end())
    return It->getSecond();

    assert(V && "V cannot be nullptr.");
    auto It = ScalarsInSplitNodes.find(V);
    if (It == ScalarsInSplitNodes.end())
    return It->getSecond();

                            bool SameVF = false) const {
    assert(V && "V cannot be nullptr.");
    for (TreeEntry *TE : ScalarToTreeEntries.lookup(V))
      if ((!SameVF || TE->getVectorFactor() == VL.size()) && TE->isSame(VL))
  bool areAltOperandsProfitable(const InstructionsState &S,

  class ScalarsVectorizationLegality {
    InstructionsState S;
    bool TryToFindDuplicates;
    bool TrySplitVectorize;

    ScalarsVectorizationLegality(InstructionsState S, bool IsLegal,
                                 bool TryToFindDuplicates = true,
                                 bool TrySplitVectorize = false)
        : S(S), IsLegal(IsLegal), TryToFindDuplicates(TryToFindDuplicates),
          TrySplitVectorize(TrySplitVectorize) {
      assert((!IsLegal || (S.valid() && TryToFindDuplicates)) &&
             "Inconsistent state");
    const InstructionsState &getInstructionsState() const { return S; }
    bool isLegal() const { return IsLegal; }
    bool tryToFindDuplicates() const { return TryToFindDuplicates; }
    bool trySplitVectorize() const { return TrySplitVectorize; }

  ScalarsVectorizationLegality
                               bool TryCopyableElementsVectorization) const;

  TreeEntry::EntryState getScalarsVectorizationState(
      bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
      SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo);
4572 SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarToTreeEntries;
4575 SmallDenseMap<std::pair<const TreeEntry *, unsigned>, TreeEntry *>
4576 OperandsToTreeEntry;
4579 SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarsInSplitNodes;
4582 SmallDenseMap<Value *, unsigned> InstrElementSize;
4596 SmallDenseMap<const TreeEntry *, WeakTrackingVH> EntryToLastInstruction;
4600 SmallDenseMap<const Instruction *, Instruction *> LastInstructionToPos;
4605 SetVector<const TreeEntry *> PostponedGathers;
4607 using ValueToGatherNodesMap =
4608 DenseMap<Value *, SmallSetVector<const TreeEntry *, 4>>;
4609 ValueToGatherNodesMap ValueToGatherNodes;
4614 SetVector<unsigned> LoadEntriesToVectorize;
4617 bool IsGraphTransformMode =
false;
4620 std::optional<unsigned> GatheredLoadsEntriesFirst;
4623 SmallDenseMap<const TreeEntry *,
4624 std::tuple<SmallVector<int>, VectorType *, unsigned, bool>>
4625 CompressEntryToData;
4628 struct ExternalUser {
4629 ExternalUser(Value *S, llvm::User *U, const TreeEntry &E, unsigned L)
4630 : Scalar(S), User(U), E(E), Lane(L) {}
4633 Value *Scalar = nullptr;
4636 llvm::User *User = nullptr;
4644 using UserList = SmallVector<ExternalUser, 16>;
4650 bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
4651 Instruction *Inst2) {
4654 AliasCacheKey Key = std::make_pair(Inst1, Inst2);
4655 auto Res = AliasCache.try_emplace(Key);
4657 return Res.first->second;
4658 bool Aliased = isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1));
4660 Res.first->getSecond() = Aliased;
4664 using AliasCacheKey = std::pair<Instruction *, Instruction *>;
4668 SmallDenseMap<AliasCacheKey, bool> AliasCache;
4673 BatchAAResults BatchAA;
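// Illustrative sketch, not part of this pass: the AliasCache above is plain
// pair-keyed memoization. A minimal analogue of the lookup-then-compute
// pattern (ComputeAliased is a hypothetical callback standing in for the
// BatchAA query):
static bool cachedAliasQuery(
    SmallDenseMap<std::pair<Instruction *, Instruction *>, bool> &Cache,
    Instruction *Inst1, Instruction *Inst2,
    function_ref<bool(Instruction *, Instruction *)> ComputeAliased) {
  auto Res = Cache.try_emplace(std::make_pair(Inst1, Inst2));
  if (!Res.second)
    return Res.first->second; // Hit: this pair was answered before.
  // Miss: try_emplace already reserved the slot, so the query runs once.
  Res.first->second = ComputeAliased(Inst1, Inst2);
  return Res.first->second;
}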
4680 DenseSet<Instruction *> DeletedInstructions;
4683 SmallPtrSet<Instruction *, 16> AnalyzedReductionsRoots;
4686 DenseSet<size_t> AnalyzedReductionVals;
4690 DenseSet<Value *> AnalyzedMinBWVals;
4696 UserList ExternalUses;
4700 SmallPtrSet<Value *, 4> ExternalUsesAsOriginalScalar;
4704 SmallPtrSet<Value *, 4> ExternalUsesWithNonUsers;
4707 SmallPtrSet<const Value *, 32> EphValues;
4711 SetVector<Instruction *> GatherShuffleExtractSeq;
4714 DenseSet<BasicBlock *> CSEBlocks;
4717 DenseSet<size_t> ListOfKnonwnNonVectorizableLoads;
4724 class ScheduleEntity {
4725 friend class ScheduleBundle;
4726 friend class ScheduleData;
4727 friend class ScheduleCopyableData;
4730 enum class Kind { ScheduleData, ScheduleBundle, ScheduleCopyableData };
4731 Kind getKind() const { return K; }
4732 ScheduleEntity(Kind K) : K(K) {}
4736 int SchedulingPriority = 0;
4739 bool IsScheduled = false;
4741 const Kind K = Kind::ScheduleData;
4744 ScheduleEntity() = delete;
4746 void setSchedulingPriority(int Priority) { SchedulingPriority = Priority; }
4747 int getSchedulingPriority() const { return SchedulingPriority; }
4748 bool isReady() const {
4750 return SD->isReady();
4752 return CD->isReady();
4758 bool hasValidDependencies() const {
4760 return SD->hasValidDependencies();
4762 return CD->hasValidDependencies();
4766 int getUnscheduledDeps() const {
4768 return SD->getUnscheduledDeps();
4770 return CD->getUnscheduledDeps();
4774 int incrementUnscheduledDeps(int Incr) {
4776 return SD->incrementUnscheduledDeps(Incr);
4780 int getDependencies() const {
4782 return SD->getDependencies();
4788 return SD->getInst();
4793 bool isScheduled() const { return IsScheduled; }
4794 void setScheduled(bool Scheduled) { IsScheduled = Scheduled; }
4796 static bool classof(const ScheduleEntity *) { return true; }
4798#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4799 void dump(raw_ostream &OS) const {
4801 return SD->dump(OS);
4803 return CD->dump(OS);
4814#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4816 const BoUpSLP::ScheduleEntity &SE) {
4826 class ScheduleData final : public ScheduleEntity {
4830 enum { InvalidDeps = -1 };
4832 ScheduleData() : ScheduleEntity(Kind::ScheduleData) {}
4833 static bool classof(const ScheduleEntity *Entity) {
4834 return Entity->getKind() == Kind::ScheduleData;
4837 void init(int BlockSchedulingRegionID, Instruction *I) {
4838 NextLoadStore = nullptr;
4839 IsScheduled = false;
4840 SchedulingRegionID = BlockSchedulingRegionID;
4841 clearDependencies();
4847 if (hasValidDependencies()) {
4848 assert(UnscheduledDeps <= Dependencies && "invariant");
4850 assert(UnscheduledDeps == Dependencies && "invariant");
4854 assert(hasValidDependencies() && UnscheduledDeps == 0 &&
4855 "unexpected scheduled state");
4862 bool hasValidDependencies() const { return Dependencies != InvalidDeps; }
4866 bool isReady() const { return UnscheduledDeps == 0 && !IsScheduled; }
4871 int incrementUnscheduledDeps(int Incr) {
4872 assert(hasValidDependencies() &&
4873 "increment of unscheduled deps would be meaningless");
4874 UnscheduledDeps += Incr;
4875 assert(UnscheduledDeps >= 0 &&
4876 "Expected valid number of unscheduled deps");
4877 return UnscheduledDeps;
4882 void resetUnscheduledDeps() { UnscheduledDeps = Dependencies; }
4885 void clearDependencies() {
4886 clearDirectDependencies();
4887 MemoryDependencies.clear();
4888 ControlDependencies.clear();
4895 void clearDirectDependencies() {
4896 Dependencies = InvalidDeps;
4897 resetUnscheduledDeps();
4898 IsScheduled = false;
4902 int getUnscheduledDeps() const { return UnscheduledDeps; }
4904 int getDependencies() const { return Dependencies; }
4906 void initDependencies() { Dependencies = 0; }
4908 void incDependencies() { Dependencies++; }
4911 int getSchedulingRegionID() const { return SchedulingRegionID; }
4918 return MemoryDependencies;
4921 void addMemoryDependency(ScheduleData *Dep) {
4922 MemoryDependencies.push_back(Dep);
4926 return ControlDependencies;
4929 void addControlDependency(ScheduleData *Dep) {
4930 ControlDependencies.push_back(Dep);
4933 ScheduleData *getNextLoadStore() const { return NextLoadStore; }
4934 void setNextLoadStore(ScheduleData *Next) { NextLoadStore = Next; }
4936 void dump(raw_ostream &OS) const { OS << *Inst; }
4948 ScheduleData *NextLoadStore = nullptr;
4952 SmallVector<ScheduleData *> MemoryDependencies;
4958 SmallVector<ScheduleData *> ControlDependencies;
4962 int SchedulingRegionID = 0;
4968 int Dependencies = InvalidDeps;
4974 int UnscheduledDeps = InvalidDeps;
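// Illustrative sketch, not part of this pass: Dependencies/UnscheduledDeps
// form a standard countdown. InvalidDeps (-1) means "not yet calculated";
// once calculated, UnscheduledDeps decreases as dependencies get scheduled
// and the node becomes ready at zero. Standalone analogue with hypothetical
// names:
struct DepCounterSketch {
  static constexpr int InvalidDeps = -1;
  int Dependencies = InvalidDeps;    // Total number of dependency edges.
  int UnscheduledDeps = InvalidDeps; // Edges whose producer is unscheduled.
  void setCalculated(int NumDeps) { Dependencies = UnscheduledDeps = NumDeps; }
  bool ready() const {
    return Dependencies != InvalidDeps && UnscheduledDeps == 0;
  }
  void onDepScheduled() { // One dependency scheduled: one less blocker.
    assert(UnscheduledDeps > 0 && "no unscheduled dependencies left");
    --UnscheduledDeps;
  }
};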
4979 const BoUpSLP::ScheduleData &SD) {
4985 class ScheduleBundle final : public ScheduleEntity {
4989 bool IsValid = true;
4991 TreeEntry *TE = nullptr;
4992 ScheduleBundle(bool IsValid)
4993 : ScheduleEntity(Kind::ScheduleBundle), IsValid(IsValid) {}
4996 ScheduleBundle() : ScheduleEntity(Kind::ScheduleBundle) {}
4997 static bool classof(const ScheduleEntity *Entity) {
4998 return Entity->getKind() == Kind::ScheduleBundle;
5003 for (const ScheduleEntity *SD : Bundle) {
5004 if (SD->hasValidDependencies()) {
5005 assert(SD->getUnscheduledDeps() <= SD->getDependencies() &&
5008 assert(SD->getUnscheduledDeps() == SD->getDependencies() &&
5012 if (isScheduled()) {
5013 assert(SD->hasValidDependencies() && SD->getUnscheduledDeps() == 0 &&
5014 "unexpected scheduled state");
5020 int unscheduledDepsInBundle() const {
5021 assert(*this && "bundle must not be empty");
5023 for (const ScheduleEntity *BundleMember : Bundle) {
5024 if (BundleMember->getUnscheduledDeps() == ScheduleData::InvalidDeps)
5025 return ScheduleData::InvalidDeps;
5026 Sum += BundleMember->getUnscheduledDeps();
5034 bool hasValidDependencies() const {
5035 return all_of(Bundle, [](const ScheduleEntity *SD) {
5036 return SD->hasValidDependencies();
5042 bool isReady() const {
5043 assert(*this && "bundle must not be empty");
5044 return unscheduledDepsInBundle() == 0 && !isScheduled();
5052 void add(ScheduleEntity *SD) { Bundle.push_back(SD); }
5055 void setTreeEntry(TreeEntry *TE) { this->TE = TE; }
5056 TreeEntry *getTreeEntry() const { return TE; }
5058 static ScheduleBundle invalid() { return {false}; }
5060 operator bool() const { return IsValid; }
5063 void dump(raw_ostream &OS) const {
5072 OS << *SD->getInst();
5086 const BoUpSLP::ScheduleBundle &Bundle) {
5097 class ScheduleCopyableData final : public ScheduleEntity {
5104 int SchedulingRegionID = 0;
5106 ScheduleBundle &Bundle;
5109 ScheduleCopyableData(int BlockSchedulingRegionID, Instruction *I,
5110 const EdgeInfo &EI, ScheduleBundle &Bundle)
5111 : ScheduleEntity(Kind::ScheduleCopyableData), Inst(I), EI(EI),
5112 SchedulingRegionID(BlockSchedulingRegionID), Bundle(Bundle) {}
5113 static bool classof(const ScheduleEntity *Entity) {
5114 return Entity->getKind() == Kind::ScheduleCopyableData;
5119 if (hasValidDependencies()) {
5120 assert(UnscheduledDeps <= Dependencies && "invariant");
5122 assert(UnscheduledDeps == Dependencies && "invariant");
5126 assert(hasValidDependencies() && UnscheduledDeps == 0 &&
5127 "unexpected scheduled state");
5134 bool hasValidDependencies() const {
5135 return Dependencies != ScheduleData::InvalidDeps;
5140 bool isReady() const { return UnscheduledDeps == 0 && !IsScheduled; }
5145 int incrementUnscheduledDeps(int Incr) {
5146 assert(hasValidDependencies() &&
5147 "increment of unscheduled deps would be meaningless");
5148 UnscheduledDeps += Incr;
5149 assert(UnscheduledDeps >= 0 && "invariant");
5150 return UnscheduledDeps;
5155 void resetUnscheduledDeps() { UnscheduledDeps = Dependencies; }
5158 int getUnscheduledDeps() const { return UnscheduledDeps; }
5160 int getDependencies() const { return Dependencies; }
5162 void initDependencies() { Dependencies = 0; }
5164 void incDependencies() { Dependencies++; }
5167 int getSchedulingRegionID() const { return SchedulingRegionID; }
5173 void clearDependencies() {
5174 Dependencies = ScheduleData::InvalidDeps;
5175 UnscheduledDeps = ScheduleData::InvalidDeps;
5176 IsScheduled = false;
5180 const EdgeInfo &getEdgeInfo() const { return EI; }
5183 ScheduleBundle &getBundle() { return Bundle; }
5184 const ScheduleBundle &getBundle() const { return Bundle; }
5186#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
5187 void dump(raw_ostream &OS) const { OS << "[Copyable]" << *getInst(); }
5198 int Dependencies = ScheduleData::InvalidDeps;
5204 int UnscheduledDeps = ScheduleData::InvalidDeps;
5234 struct BlockScheduling {
5236 : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}
5239 ScheduledBundles.clear();
5240 ScheduledBundlesList.clear();
5241 ScheduleCopyableDataMap.clear();
5242 ScheduleCopyableDataMapByInst.clear();
5243 ScheduleCopyableDataMapByInstUser.clear();
5244 ScheduleCopyableDataMapByUsers.clear();
5246 ScheduleStart = nullptr;
5247 ScheduleEnd = nullptr;
5248 FirstLoadStoreInRegion = nullptr;
5249 LastLoadStoreInRegion = nullptr;
5250 RegionHasStackSave = false;
5254 ScheduleRegionSizeLimit -= ScheduleRegionSize;
5257 ScheduleRegionSize = 0;
5261 ++SchedulingRegionID;
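// Illustrative sketch, not part of this pass: bumping SchedulingRegionID is
// how a new region invalidates old entries without erasing the maps; every
// lookup compares the entry's recorded region ID. Generic analogue (MapT is
// a hypothetical DenseMap-like type whose mapped values are pointers):
template <typename MapT, typename KeyT>
static auto *lookupInCurrentRegion(MapT &Map, const KeyT &Key,
                                   int CurrentRegionID) {
  auto *Entry = Map.lookup(Key);
  // Entries written for an older region are treated as absent.
  return (Entry && Entry->getSchedulingRegionID() == CurrentRegionID)
             ? Entry
             : nullptr;
}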
5264 ScheduleData *getScheduleData(Instruction *I) {
5267 if (BB != I->getParent())
5270 ScheduleData *SD = ScheduleDataMap.lookup(I);
5271 if (SD && isInSchedulingRegion(*SD))
5276 ScheduleData *getScheduleData(Value *V) {
5282 ScheduleCopyableData *getScheduleCopyableData(const EdgeInfo &EI,
5283 const Value *V) const {
5284 if (ScheduleCopyableDataMap.empty())
5286 auto It = ScheduleCopyableDataMap.find(std::make_pair(EI, V));
5287 if (It == ScheduleCopyableDataMap.end())
5289 ScheduleCopyableData *SD = It->getSecond().get();
5290 if (!isInSchedulingRegion(*SD))
5298 getScheduleCopyableData(const Value *User, unsigned OperandIdx,
5300 if (ScheduleCopyableDataMapByInstUser.empty())
5302 const auto It = ScheduleCopyableDataMapByInstUser.find(
5303 std::make_pair(std::make_pair(User, OperandIdx), V));
5304 if (It == ScheduleCopyableDataMapByInstUser.end())
5307 for (ScheduleCopyableData *SD : It->getSecond()) {
5308 if (isInSchedulingRegion(*SD))
5322 bool areAllOperandsReplacedByCopyableData(Instruction *User,
5326 if (ScheduleCopyableDataMap.empty())
5328 SmallDenseMap<TreeEntry *, unsigned> PotentiallyReorderedEntriesCount;
5329 SmallDenseMap<const TreeEntry *, unsigned> OrderedEntriesCount;
5331 if (Entries.empty())
5333 for (const Use &U : User->operands()) {
5338 for (TreeEntry *TE : Entries) {
5340 bool IsNonSchedulableWithParentPhiNode =
5341 TE->doesNotNeedToSchedule() && TE->UserTreeIndex &&
5342 TE->UserTreeIndex.UserTE->hasState() &&
5343 TE->UserTreeIndex.UserTE->State != TreeEntry::SplitVectorize &&
5344 TE->UserTreeIndex.UserTE->getOpcode() == Instruction::PHI;
5347 if (IsNonSchedulableWithParentPhiNode) {
5348 SmallPtrSet<Value *, 4> ParentsUniqueUsers;
5349 const TreeEntry *ParentTE = TE->UserTreeIndex.UserTE;
5350 for (Value *V : ParentTE->Scalars) {
5354 if (ParentsUniqueUsers.insert(PHI).second &&
5367 bool IsCommutativeUser =
5372 OrderedEntriesCount.try_emplace(TE, 0).first->getSecond();
5373 EdgeInfo EI(TE, U.getOperandNo());
5374 if (!getScheduleCopyableData(EI, Op))
5380 PotentiallyReorderedEntriesCount.try_emplace(TE, 0)
5381 .first->getSecond() += Inc;
5384 if (PotentiallyReorderedEntriesCount.empty())
5385 return all_of(OrderedEntriesCount,
5386 [&](const std::pair<const TreeEntry *, unsigned> &P) {
5390 for (auto &P : PotentiallyReorderedEntriesCount) {
5391 SmallPtrSet<Value *, 4> ParentsUniqueUsers;
5392 bool IsNonSchedulableWithParentPhiNode =
5393 P.first->doesNotNeedToSchedule() && P.first->UserTreeIndex &&
5394 P.first->UserTreeIndex.UserTE->hasState() &&
5395 P.first->UserTreeIndex.UserTE->State != TreeEntry::SplitVectorize &&
5396 P.first->UserTreeIndex.UserTE->getOpcode() == Instruction::PHI;
5397 auto *It = find(P.first->Scalars, User);
5399 assert(It != P.first->Scalars.end() &&
5400 "User is not in the tree entry");
5401 int Lane = std::distance(P.first->Scalars.begin(), It);
5402 assert(Lane >= 0 && "Lane is not found");
5404 Lane = P.first->ReorderIndices[Lane];
5405 assert(Lane < static_cast<int>(P.first->Scalars.size()) &&
5406 "Couldn't find extract lane");
5409 if (IsNonSchedulableWithParentPhiNode) {
5410 const TreeEntry *ParentTE = P.first->UserTreeIndex.UserTE;
5412 if (!ParentsUniqueUsers.insert(User).second) {
5418 for (unsigned OpIdx :
5420 P.first->getMainOp()))) {
5421 if (P.first->getOperand(OpIdx)[Lane] == Op &&
5422 getScheduleCopyableData(EdgeInfo(P.first, OpIdx), Op))
5426 if (!IsNonSchedulableWithParentPhiNode)
5429 } while (It != P.first->Scalars.end());
5431 return all_of(PotentiallyReorderedEntriesCount,
5432 [&](const std::pair<const TreeEntry *, unsigned> &P) {
5433 return P.second == NumOps - 1;
5435 all_of(OrderedEntriesCount,
5436 [&](const std::pair<const TreeEntry *, unsigned> &P) {
5442 getScheduleCopyableData(const Instruction *I) const {
5443 if (ScheduleCopyableDataMapByInst.empty())
5445 const auto It = ScheduleCopyableDataMapByInst.find(I);
5446 if (It == ScheduleCopyableDataMapByInst.end())
5449 for (ScheduleCopyableData *SD : It->getSecond()) {
5450 if (isInSchedulingRegion(*SD))
5457 getScheduleCopyableDataUsers(const Instruction *User) const {
5458 if (ScheduleCopyableDataMapByUsers.empty())
5460 const auto It = ScheduleCopyableDataMapByUsers.find(User);
5461 if (It == ScheduleCopyableDataMapByUsers.end())
5464 for (ScheduleCopyableData *SD : It->getSecond()) {
5465 if (isInSchedulingRegion(*SD))
5471 ScheduleCopyableData &addScheduleCopyableData(const EdgeInfo &EI,
5473 int SchedulingRegionID,
5474 ScheduleBundle &Bundle) {
5475 assert(!getScheduleCopyableData(EI, I) && "already in the map");
5476 ScheduleCopyableData *CD =
5477 ScheduleCopyableDataMap
5478 .try_emplace(std::make_pair(EI, I),
5479 std::make_unique<ScheduleCopyableData>(
5480 SchedulingRegionID, I, EI, Bundle))
5483 ScheduleCopyableDataMapByInst[I].push_back(CD);
5487 assert(It != Op.end() && "Lane not set");
5488 SmallPtrSet<Instruction *, 4> Visited;
5490 int Lane = std::distance(Op.begin(), It);
5491 assert(Lane >= 0 && "Lane not set");
5493 !EI.UserTE->ReorderIndices.empty())
5494 Lane = EI.UserTE->ReorderIndices[Lane];
5495 assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
5496 "Couldn't find extract lane");
5498 if (!Visited.insert(In).second) {
5502 ScheduleCopyableDataMapByInstUser
5503 .try_emplace(std::make_pair(std::make_pair(In, EI.EdgeIdx), I))
5506 ScheduleCopyableDataMapByUsers.try_emplace(I)
5513 EdgeInfo UserEI = EI.UserTE->UserTreeIndex;
5514 if (ScheduleCopyableData *UserCD =
5515 getScheduleCopyableData(UserEI, In))
5516 ScheduleCopyableDataMapByUsers[I].remove(UserCD);
5519 } while (It != Op.end());
5521 ScheduleCopyableDataMapByUsers.try_emplace(I).first->getSecond().insert(
5531 auto It = ScheduledBundles.find(I);
5532 if (It == ScheduledBundles.end())
5534 return It->getSecond();
5538 bool isInSchedulingRegion(const ScheduleEntity &SD) const {
5540 return Data->getSchedulingRegionID() == SchedulingRegionID;
5542 return CD->getSchedulingRegionID() == SchedulingRegionID;
5544 [&](const ScheduleEntity *BundleMember) {
5545 return isInSchedulingRegion(*BundleMember);
5551 template <typename ReadyListType>
5552 void schedule(const BoUpSLP &R, const InstructionsState &S,
5553 const EdgeInfo &EI, ScheduleEntity *Data,
5554 ReadyListType &ReadyList) {
5555 auto ProcessBundleMember = [&](ScheduleEntity *BundleMember,
5560 auto DecrUnsched = [&](auto *Data, bool IsControl = false) {
5561 if ((IsControl || Data->hasValidDependencies()) &&
5562 Data->incrementUnscheduledDeps(-1) == 0) {
5569 CopyableBundle.push_back(&CD->getBundle());
5570 Bundles = CopyableBundle;
5572 Bundles = getScheduleBundles(Data->getInst());
5574 if (!Bundles.empty()) {
5575 for (ScheduleBundle *Bundle : Bundles) {
5576 if (Bundle->unscheduledDepsInBundle() == 0) {
5577 assert(!Bundle->isScheduled() &&
5578 "already scheduled bundle gets ready");
5579 ReadyList.insert(Bundle);
5581 << "SLP: gets ready: " << *Bundle << "\n");
5587 "already scheduled bundle gets ready");
5589 "Expected non-copyable data");
5590 ReadyList.insert(Data);
5597 if (!ScheduleCopyableDataMap.empty()) {
5599 getScheduleCopyableData(User, OpIdx, I);
5600 for (ScheduleCopyableData *CD : CopyableData)
5601 DecrUnsched(CD, false);
5602 if (!CopyableData.empty())
5605 if (ScheduleData *OpSD = getScheduleData(I))
5606 DecrUnsched(OpSD, false);
5612 if (!Bundles.empty()) {
5613 auto *In = BundleMember->getInst();
5615 SmallDenseMap<const Instruction *, unsigned> OperandsUses;
5616 unsigned TotalOpCount = 0;
5619 TotalOpCount = OperandsUses[In] = 1;
5621 for (const Use &U : In->operands()) {
5624 ++Res.first->getSecond();
5631 auto DecrUnschedForInst =
5633 SmallDenseSet<std::pair<const ScheduleEntity *, unsigned>>
5635 if (!ScheduleCopyableDataMap.empty()) {
5636 const EdgeInfo EI = {UserTE, OpIdx};
5637 if (ScheduleCopyableData *CD =
5638 getScheduleCopyableData(EI, I)) {
5639 if (!Checked.insert(std::make_pair(CD, OpIdx)).second)
5641 DecrUnsched(CD, false);
5645 auto It = OperandsUses.find(I);
5646 assert(It != OperandsUses.end() && "Operand not found");
5647 if (It->second > 0) {
5649 assert(TotalOpCount > 0 && "No more operands to decrement");
5651 if (ScheduleData *OpSD = getScheduleData(I)) {
5652 if (!Checked.insert(std::make_pair(OpSD, OpIdx)).second)
5654 DecrUnsched(OpSD, false);
5659 for (ScheduleBundle *Bundle : Bundles) {
5660 if (ScheduleCopyableDataMap.empty() && TotalOpCount == 0)
5662 SmallPtrSet<Value *, 4> ParentsUniqueUsers;
5665 auto *It = find(Bundle->getTreeEntry()->Scalars, In);
5666 SmallDenseSet<std::pair<const ScheduleEntity *, unsigned>> Checked;
5667 bool IsNonSchedulableWithParentPhiNode =
5668 Bundle->getTreeEntry()->doesNotNeedToSchedule() &&
5669 Bundle->getTreeEntry()->UserTreeIndex &&
5670 Bundle->getTreeEntry()->UserTreeIndex.UserTE->hasState() &&
5671 Bundle->getTreeEntry()->UserTreeIndex.UserTE->State !=
5672 TreeEntry::SplitVectorize &&
5673 Bundle->getTreeEntry()->UserTreeIndex.UserTE->getOpcode() ==
5677 std::distance(Bundle->getTreeEntry()->Scalars.begin(), It);
5678 assert(Lane >= 0 && "Lane not set");
5680 !Bundle->getTreeEntry()->ReorderIndices.empty())
5681 Lane = Bundle->getTreeEntry()->ReorderIndices[Lane];
5682 assert(Lane < static_cast<int>(
5683 Bundle->getTreeEntry()->Scalars.size()) &&
5684 "Couldn't find extract lane");
5694 In->getNumOperands() ==
5695 Bundle->getTreeEntry()->getNumOperands() ||
5696 Bundle->getTreeEntry()->isCopyableElement(In)) &&
5697 "Missed TreeEntry operands?");
5701 if (IsNonSchedulableWithParentPhiNode) {
5702 const TreeEntry *ParentTE =
5703 Bundle->getTreeEntry()->UserTreeIndex.UserTE;
5705 if (!ParentsUniqueUsers.insert(User).second) {
5706 It = std::find(std::next(It),
5707 Bundle->getTreeEntry()->Scalars.end(), In);
5712 for (unsigned OpIdx :
5715 Bundle->getTreeEntry()->getOperand(OpIdx)[Lane])) {
5718 DecrUnschedForInst(I, Bundle->getTreeEntry(), OpIdx, Checked);
5721 if (!IsNonSchedulableWithParentPhiNode)
5723 It = std::find(std::next(It),
5724 Bundle->getTreeEntry()->Scalars.end(), In);
5725 } while (It != Bundle->getTreeEntry()->Scalars.end());
5730 for (Use &U : BundleMember->getInst()->operands()) {
5733 << "SLP: check for readiness (def): " << *I << "\n");
5734 DecrUnschedForInst(BundleMember->getInst(), U.getOperandNo(), I);
5742 SmallPtrSet<const ScheduleData *, 4> VisitedMemory;
5743 for (ScheduleData *MemoryDep : SD->getMemoryDependencies()) {
5744 if (!VisitedMemory.insert(MemoryDep).second)
5749 << *MemoryDep << "\n");
5750 DecrUnsched(MemoryDep);
5753 SmallPtrSet<const ScheduleData *, 4> VisitedControl;
5754 for (ScheduleData *Dep : SD->getControlDependencies()) {
5755 if (!VisitedControl.insert(Dep).second)
5760 << "SLP: check for readiness (ctrl): " << *Dep << "\n");
5761 DecrUnsched(Dep, true);
5765 SD->setScheduled(true);
5770 if (R.isVectorized(In)) {
5772 for (TreeEntry *TE : Entries) {
5774 In->getNumOperands() != TE->getNumOperands())
5777 PseudoBundles.emplace_back(std::make_unique<ScheduleBundle>());
5778 BundlePtr->setTreeEntry(TE);
5783 ProcessBundleMember(SD, Bundles);
5786 Bundle.setScheduled(true);
5788 auto AreAllBundlesScheduled =
5789 [&](const ScheduleEntity *SD,
5793 return !SDBundles.empty() &&
5794 all_of(SDBundles, [&](const ScheduleBundle *SDBundle) {
5795 return SDBundle->isScheduled();
5798 for (ScheduleEntity *SD : Bundle.getBundle()) {
5801 SDBundles = getScheduleBundles(SD->getInst());
5802 if (AreAllBundlesScheduled(SD, SDBundles)) {
5803 SD->setScheduled(true);
5816 assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
5817 ScheduleStart->comesBefore(ScheduleEnd) &&
5818 "Not a valid scheduling region?");
5820 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
5822 if (!Bundles.empty()) {
5823 for (ScheduleBundle *Bundle : Bundles) {
5824 assert(isInSchedulingRegion(*Bundle) &&
5825 "primary schedule data not in window?");
5830 auto *SD = getScheduleData(I);
5833 assert(isInSchedulingRegion(*SD) &&
5834 "primary schedule data not in window?");
5839 [](const ScheduleEntity *Bundle) {
5840 return Bundle->isReady();
5842 "item in ready list not ready?");
5846 template <typename ReadyListType>
5847 void initialFillReadyList(ReadyListType &ReadyList) {
5848 SmallPtrSet<ScheduleBundle *, 16> Visited;
5849 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
5850 ScheduleData *SD = getScheduleData(I);
5851 if (SD && SD->hasValidDependencies() && SD->isReady()) {
5854 for (ScheduleBundle *Bundle : Bundles) {
5855 if (!Visited.insert(Bundle).second)
5857 if (Bundle->hasValidDependencies() && Bundle->isReady()) {
5858 ReadyList.insert(Bundle);
5860 << *Bundle << "\n");
5865 ReadyList.insert(SD);
5867 << "SLP: initially in ready list: " << *SD << "\n");
5878 const InstructionsState &S, const EdgeInfo &EI);
5885 std::optional<ScheduleBundle *>
5887 const InstructionsState &S, const EdgeInfo &EI);
5890 ScheduleData *allocateScheduleDataChunks();
5894 bool extendSchedulingRegion(Value *V, const InstructionsState &S);
5898 void initScheduleData(Instruction *FromI, Instruction *ToI,
5899 ScheduleData *PrevLoadStore,
5900 ScheduleData *NextLoadStore);
5904 void calculateDependencies(ScheduleBundle &Bundle, bool InsertInReadyList,
5909 void resetSchedule();
5926 SmallDenseMap<Instruction *, ScheduleData *> ScheduleDataMap;
5930 SmallDenseMap<std::pair<EdgeInfo, const Value *>,
5931 std::unique_ptr<ScheduleCopyableData>>
5932 ScheduleCopyableDataMap;
5938 SmallDenseMap<const Instruction *, SmallVector<ScheduleCopyableData *>>
5939 ScheduleCopyableDataMapByInst;
5945 SmallDenseMap<std::pair<std::pair<const Value *, unsigned>, const Value *>,
5947 ScheduleCopyableDataMapByInstUser;
5967 SmallSetVector<ScheduleCopyableData *, 4>>
5968 ScheduleCopyableDataMapByUsers;
5971 SmallDenseMap<Instruction *, SmallVector<ScheduleBundle *>>
5977 SetVector<ScheduleEntity *> ReadyInsts;
5987 ScheduleData *FirstLoadStoreInRegion = nullptr;
5991 ScheduleData *LastLoadStoreInRegion = nullptr;
5996 bool RegionHasStackSave = false;
5999 int ScheduleRegionSize = 0;
6008 int SchedulingRegionID = 1;
6012 MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
6016 void scheduleBlock(const BoUpSLP &R, BlockScheduling *BS);
6019 const SmallDenseSet<Value *> *UserIgnoreList = nullptr;
6023 struct OrdersTypeDenseMapInfo {
6036 static unsigned getHashValue(const OrdersType &V) {
6047 ScalarEvolution *SE;
6048 TargetTransformInfo *TTI;
6049 TargetLibraryInfo *TLI;
6052 AssumptionCache *AC;
6054 const DataLayout *DL;
6055 OptimizationRemarkEmitter *ORE;
6057 unsigned MaxVecRegSize;
6058 unsigned MinVecRegSize;
6061 IRBuilder<TargetFolder> Builder;
6068 DenseMap<const TreeEntry *, std::pair<uint64_t, bool>> MinBWs;
6073 unsigned ReductionBitWidth = 0;
6076 unsigned BaseGraphSize = 1;
6080 std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;
6084 DenseSet<unsigned> ExtraBitWidthNodes;
6092 SecondInfo::getEmptyKey());
6097 SecondInfo::getTombstoneKey());
6102 SecondInfo::getHashValue(Val.EdgeIdx));
6123 ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
6134 return R.VectorizableTree[0].get();
6138 return {&N->UserTreeIndex, N->Container};
6142 return {&N->UserTreeIndex + 1, N->Container};
6169 static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
6181 OS << Entry->Idx << ".\n";
6184 for (auto *V : Entry->Scalars) {
6186 if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) {
6187 return EU.Scalar == V;
6187 return EU.Scalar == V;
6197 if (Entry->isGather())
6199 if (Entry->State == TreeEntry::ScatterVectorize ||
6200 Entry->State == TreeEntry::StridedVectorize ||
6201 Entry->State == TreeEntry::CompressVectorize)
6202 return "color=blue";
6209 for (auto *I : DeletedInstructions) {
6210 if (!I->getParent()) {
6215 I->insertBefore(F->getEntryBlock(),
6216 F->getEntryBlock().getFirstNonPHIIt());
6218 I->insertBefore(F->getEntryBlock().getTerminator()->getIterator());
6221 for (Use &U : I->operands()) {
6223 if (Op && !DeletedInstructions.count(Op) && Op->hasOneUser() &&
6227 I->dropAllReferences();
6229 for (auto *I : DeletedInstructions) {
6231 "trying to erase instruction with users.");
6232 I->eraseFromParent();
6238#ifdef EXPENSIVE_CHECKS
6249 assert(!Mask.empty() && Reuses.size() == Mask.size() &&
6250 "Expected non-empty mask.");
6253 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
6255 Reuses[Mask[I]] = Prev[I];
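// Illustrative sketch, not part of this pass: the loop above is a scatter
// permutation; element I of the previous sequence lands at position Mask[I].
// Standalone analogue:
static void scatterByMask(MutableArrayRef<int> Out, ArrayRef<int> Prev,
                          ArrayRef<int> Mask) {
  assert(Out.size() == Prev.size() && Prev.size() == Mask.size() &&
         "Expected equally sized containers");
  for (unsigned I = 0, E = Prev.size(); I < E; ++I)
    Out[Mask[I]] = Prev[I]; // Position Mask[I] receives old element I.
}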
6263 bool BottomOrder = false) {
6264 assert(!Mask.empty() && "Expected non-empty mask.");
6265 unsigned Sz = Mask.size();
6268 if (Order.empty()) {
6270 std::iota(PrevOrder.begin(), PrevOrder.end(), 0);
6272 PrevOrder.swap(Order);
6275 for (unsigned I = 0; I < Sz; ++I)
6277 Order[I] = PrevOrder[Mask[I]];
6279 return Data.value() == Sz || Data.index() == Data.value();
6288 if (Order.empty()) {
6290 std::iota(MaskOrder.begin(), MaskOrder.end(), 0);
6300 for (unsigned I = 0; I < Sz; ++I)
6302 Order[MaskOrder[I]] = I;
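// Illustrative sketch, not part of this pass: the bottom-order branch above
// inverts a permutation. If MaskOrder maps position I to MaskOrder[I], then
// writing Order[MaskOrder[I]] = I builds the inverse map. Standalone
// analogue:
static void invertPermutation(ArrayRef<unsigned> Perm,
                              MutableArrayRef<unsigned> Inverse) {
  for (unsigned I = 0, E = Perm.size(); I < E; ++I)
    Inverse[Perm[I]] = I; // Perm[I] == J if and only if Inverse[J] == I.
}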
6306std::optional<BoUpSLP::OrdersType>
6308 bool TopToBottom, bool IgnoreReorder) {
6309 assert(TE.isGather() && "Expected gather node only.");
6313 Type *ScalarTy = GatheredScalars.front()->getType();
6314 size_t NumScalars = GatheredScalars.size();
6316 return std::nullopt;
6323 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
6325 isGatherShuffledEntry(&TE, GatheredScalars, Mask, Entries, NumParts,
6328 if (GatherShuffles.empty() && ExtractShuffles.empty())
6329 return std::nullopt;
6330 OrdersType CurrentOrder(NumScalars, NumScalars);
6331 if (GatherShuffles.size() == 1 &&
6333 Entries.front().front()->isSame(TE.Scalars)) {
6337 return std::nullopt;
6339 if (Entries.front().front()->UserTreeIndex.UserTE ==
6340 TE.UserTreeIndex.UserTE)
6341 return std::nullopt;
6344 if (!IgnoreReorder && Entries.front().front()->Idx == 0)
6345 return std::nullopt;
6348 if (!Entries.front().front()->ReuseShuffleIndices.empty() &&
6349 TE.getVectorFactor() == 2 && Mask.size() == 2 &&
6352 return P.value() % 2 != static_cast<int>(P.index()) % 2;
6354 return std::nullopt;
6358 std::iota(CurrentOrder.begin(), CurrentOrder.end(), 0);
6359 return CurrentOrder;
6363 return all_of(Mask, [&](int I) {
6370 if ((ExtractShuffles.empty() && IsSplatMask(Mask) &&
6371 (Entries.size() != 1 ||
6372 Entries.front().front()->ReorderIndices.empty())) ||
6373 (GatherShuffles.empty() && IsSplatMask(ExtractMask)))
6374 return std::nullopt;
6380 if (ShuffledSubMasks.test(I))
6382 const int VF = GetVF(I);
6388 if (any_of(Slice, [&](unsigned I) { return I != NumScalars; })) {
6390 ShuffledSubMasks.set(I);
6394 int FirstMin = INT_MAX;
6395 int SecondVecFound = false;
6397 int Idx = Mask[I * PartSz + K];
6399 Value *V = GatheredScalars[I * PartSz + K];
6401 SecondVecFound = true;
6410 SecondVecFound = true;
6414 FirstMin = (FirstMin / PartSz) * PartSz;
6416 if (SecondVecFound) {
6418 ShuffledSubMasks.set(I);
6422 int Idx = Mask[I * PartSz + K];
6426 if (Idx >= PartSz) {
6427 SecondVecFound = true;
6430 if (CurrentOrder[I * PartSz + Idx] >
6431 static_cast<unsigned>(I * PartSz + K) &&
6432 CurrentOrder[I * PartSz + Idx] !=
6433 static_cast<unsigned>(I * PartSz + Idx))
6434 CurrentOrder[I * PartSz + Idx] = I * PartSz + K;
6437 if (SecondVecFound) {
6439 ShuffledSubMasks.set(I);
6445 if (!ExtractShuffles.empty())
6446 TransformMaskToOrder(
6447 CurrentOrder, ExtractMask, PartSz, NumParts, [&](unsigned I) {
6448 if (!ExtractShuffles[I])
6451 unsigned Sz = getNumElems(TE.getVectorFactor(), PartSz, I);
6453 int K = I * PartSz + Idx;
6456 if (!TE.ReuseShuffleIndices.empty())
6457 K = TE.ReuseShuffleIndices[K];
6460 if (!TE.ReorderIndices.empty())
6461 K = std::distance(TE.ReorderIndices.begin(),
6462 find(TE.ReorderIndices, K));
6468 .getKnownMinValue());
6473 if (GatherShuffles.size() == 1 && NumParts != 1) {
6474 if (ShuffledSubMasks.any())
6475 return std::nullopt;
6476 PartSz = NumScalars;
6479 if (!Entries.empty())
6480 TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](unsigned I) {
6481 if (!GatherShuffles[I])
6483 return std::max(Entries[I].front()->getVectorFactor(),
6484 Entries[I].back()->getVectorFactor());
6486 unsigned NumUndefs = count(CurrentOrder, NumScalars);
6487 if (ShuffledSubMasks.all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2))
6488 return std::nullopt;
6489 return std::move(CurrentOrder);
6494 bool CompareOpcodes = true) {
6500 return (!GEP1 || GEP1->getNumOperands() == 2) &&
6501 (!GEP2 || GEP2->getNumOperands() == 2) &&
6502 (((!GEP1 || isConstant(GEP1->getOperand(1))) &&
6503 (!GEP2 || isConstant(GEP2->getOperand(1)))) ||
6506 getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)));
6510template <typename T>
6515 return CommonAlignment;
6521 "Order is empty. Please check it before using isReverseOrder.");
6522 unsigned Sz = Order.size();
6524 return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
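// Illustrative sketch, not part of this pass: an order is "reverse" when
// every entry is either unset (== Sz) or equals Sz - index - 1, e.g.
// {3, 2, 1, 0}. Standalone analogue of the predicate above:
static bool isReverseOrderSketch(ArrayRef<unsigned> Order) {
  unsigned Sz = Order.size();
  return all_of(enumerate(Order), [Sz](const auto &Pair) {
    return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
  });
}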
6535 const SCEV *PtrSCEVLowest = nullptr;
6536 const SCEV *PtrSCEVHighest = nullptr;
6539 for (Value *Ptr : PointerOps) {
6544 if (!PtrSCEVLowest && !PtrSCEVHighest) {
6545 PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
6552 PtrSCEVLowest = PtrSCEV;
6559 PtrSCEVHighest = PtrSCEV;
6567 int Size = DL.getTypeStoreSize(ElemTy);
6568 auto TryGetStride = [&](const SCEV *Dist,
6569 const SCEV *Multiplier) -> const SCEV * {
6571 if (M->getOperand(0) == Multiplier)
6572 return M->getOperand(1);
6573 if (M->getOperand(1) == Multiplier)
6574 return M->getOperand(0);
6577 if (Multiplier == Dist)
6582 const SCEV *Stride = nullptr;
6583 if (Size != 1 || SCEVs.size() > 2) {
6585 Stride = TryGetStride(Dist, Sz);
6593 using DistOrdPair = std::pair<int64_t, int>;
6595 std::set<DistOrdPair, decltype(Compare)> Offsets(Compare);
6597 bool IsConsecutive = true;
6598 for (const SCEV *PtrSCEV : SCEVs) {
6600 if (PtrSCEV != PtrSCEVLowest) {
6602 const SCEV *Coeff = TryGetStride(Diff, Stride);
6612 Dist = SC->getAPInt().getZExtValue();
6617 auto Res = Offsets.emplace(Dist, Cnt);
6621 IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
6624 if (Offsets.size() != SCEVs.size())
6626 SortedIndices.clear();
6627 if (!IsConsecutive) {
6631 for (const std::pair<int64_t, int> &Pair : Offsets) {
6632 SortedIndices[Cnt] = Pair.second;
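// Illustrative sketch, not part of this pass: the Offsets set above keeps
// (distance, original index) pairs sorted by distance, and the accesses are
// consecutive exactly when each insertion lands at the end of the set.
// Standalone analogue over precomputed distances:
static bool insertionsStayConsecutive(ArrayRef<int64_t> Dists) {
  auto Compare = [](const std::pair<int64_t, int> &L,
                    const std::pair<int64_t, int> &R) {
    return L.first < R.first;
  };
  std::set<std::pair<int64_t, int>, decltype(Compare)> Offsets(Compare);
  bool IsConsecutive = true;
  int Cnt = 0;
  for (int64_t Dist : Dists) {
    auto Res = Offsets.emplace(Dist, Cnt++);
    // Consecutive iff every new distance is larger than all previous ones.
    IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
  }
  // Duplicated distances collapse in the set and disqualify the list.
  return IsConsecutive && Offsets.size() == Dists.size();
}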
6639static std::pair<InstructionCost, InstructionCost>
6658 return TTI.getShuffleCost(Kind, DstTy, Tp, Mask, CostKind, Index, SubTp,
6660 int NumSrcElts = Tp->getElementCount().getKnownMinValue();
6663 Mask, NumSrcElts, NumSubElts, Index)) {
6664 if (Index + NumSubElts > NumSrcElts &&
6665 Index + NumSrcElts <= static_cast<int>(Mask.size()))
6669 return TTI.getShuffleCost(Kind, DstTy, Tp, Mask, CostKind, Index, SubTp,
6682 "ScalableVectorType is not supported.");
6685 "Incorrect usage.");
6690 unsigned ScalarTyNumElements = VecTy->getNumElements();
6693 if (!DemandedElts[I])
6697 I * ScalarTyNumElements, VecTy);
6700 I * ScalarTyNumElements, VecTy);
6704 return TTI.getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
6713 ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) {
6714 if (Opcode == Instruction::ExtractElement) {
6720 Index * VecTy->getNumElements(), VecTy);
6723 return TTI.getVectorInstrCost(Opcode, Val, CostKind, Index, Scalar,
6736 getWidenedType(VecTy->getElementType(), ScalarTy->getNumElements());
6738 Index * ScalarTy->getNumElements(), SubTp) +
6742 return TTI.getExtractWithExtendCost(Opcode, Dst, VecTy, Index, CostKind);
6758 auto *Begin = std::next(Mask.begin(), Index);
6759 std::iota(Begin, std::next(Begin, SubVecVF), 0);
6760 Vec = Builder.CreateShuffleVector(V, Mask);
6763 std::iota(Mask.begin(), Mask.end(), 0);
6764 std::iota(std::next(Mask.begin(), Index),
6765 std::next(Mask.begin(), Index + SubVecVF), VecVF);
6767 return Generator(Vec, V, Mask);
6770 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), SubVecVF), 0);
6771 V = Builder.CreateShuffleVector(V, ResizeMask);
6773 return Builder.CreateShuffleVector(Vec, V, Mask);
6778 unsigned SubVecVF, unsigned Index) {
6780 std::iota(Mask.begin(), Mask.end(), Index);
6781 return Builder.CreateShuffleVector(Vec, Mask);
6791 const unsigned Sz = PointerOps.size();
6794 CompressMask[0] = 0;
6796 std::optional<unsigned> Stride = 0;
6799 Value *Ptr = Order.empty() ? PointerOps[I] : PointerOps[Order[I]];
6800 std::optional<int64_t> OptPos =
6802 if (!OptPos || OptPos > std::numeric_limits<unsigned>::max())
6804 unsigned Pos = static_cast<unsigned>(*OptPos);
6805 CompressMask[I] = Pos;
6812 if (Pos != *Stride * I)
6815 return Stride.has_value();
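// Illustrative sketch, not part of this pass: a compress mask records, for
// each element, its position within the contiguous span being loaded; the
// pattern is strided when position I equals Stride * I for a constant
// stride. Standalone analogue over precomputed positions:
static bool hasConstantStride(ArrayRef<unsigned> Positions) {
  if (Positions.size() < 2 || Positions[0] != 0)
    return false;
  unsigned Stride = Positions[1];
  for (unsigned I = 0, E = Positions.size(); I < E; ++I)
    if (Positions[I] != Stride * I) // Every element must land on the grid.
      return false;
  return true;
}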
6828 InterleaveFactor = 0;
6830 const size_t Sz = VL.size();
6838 if (AreAllUsersVectorized(V))
6841 TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind,
6842 Mask.empty() ? I : Mask[I]);
6845 if (ExtractCost <= ScalarCost)
6850 if (Order.empty()) {
6851 Ptr0 = PointerOps.front();
6852 PtrN = PointerOps.back();
6854 Ptr0 = PointerOps[Order.front()];
6855 PtrN = PointerOps[Order.back()];
6857 std::optional<int64_t> Diff =
6861 const size_t MaxRegSize =
6865 if (*Diff / Sz >= MaxRegSize / 8)
6869 Align CommonAlignment = LI->getAlign();
6871 Ptr0, LoadVecTy, CommonAlignment, DL,
6874 if (IsMasked && !TTI.isLegalMaskedLoad(LoadVecTy, CommonAlignment,
6875 LI->getPointerAddressSpace()))
6881 assert(CompressMask.size() >= 2 && "At least two elements are required");
6885 auto [ScalarGEPCost, VectorGEPCost] =
6887 Instruction::GetElementPtr, CostKind, ScalarTy, LoadVecTy);
6904 LoadCost = TTI.getMemIntrinsicInstrCost(
6907 LI->getPointerAddressSpace()),
6911 TTI.getMemoryOpCost(Instruction::Load, LoadVecTy, CommonAlignment,
6912 LI->getPointerAddressSpace(), CostKind);
6914 if (IsStrided && !IsMasked && Order.empty()) {
6921 AlignedLoadVecTy = LoadVecTy;
6922 if (TTI.isLegalInterleavedAccessType(AlignedLoadVecTy, CompressMask[1],
6924 LI->getPointerAddressSpace())) {
6926 VectorGEPCost + TTI.getInterleavedMemoryOpCost(
6927 Instruction::Load, AlignedLoadVecTy,
6928 CompressMask[1], {}, CommonAlignment,
6929 LI->getPointerAddressSpace(), CostKind, IsMasked);
6930 if (InterleavedCost < GatherCost) {
6931 InterleaveFactor = CompressMask[1];
6932 LoadVecTy = AlignedLoadVecTy;
6939 if (!Order.empty()) {
6942 NewMask[I] = CompressMask[Mask[I]];
6944 CompressMask.swap(NewMask);
6946 InstructionCost TotalVecCost = VectorGEPCost + LoadCost + CompressCost;
6947 return TotalVecCost < GatherCost;
6960 unsigned InterleaveFactor;
6964 AreAllUsersVectorized, IsMasked, InterleaveFactor,
6965 CompressMask, LoadVecTy);
6982 Align Alignment, const int64_t Diff,
6983 const size_t Sz) const {
6984 if (Diff % (Sz - 1) != 0)
6988 auto IsAnyPointerUsedOutGraph = any_of(PointerOps, [&](Value *V) {
6990 return !isVectorized(U) && !MustGather.contains(U);
6994 const uint64_t AbsoluteDiff = std::abs(Diff);
6996 if (IsAnyPointerUsedOutGraph ||
6997 (AbsoluteDiff > Sz &&
7000 AbsoluteDiff % Sz == 0 && has_single_bit(AbsoluteDiff / Sz)))) ||
7001 Diff == -(static_cast<int64_t>(Sz) - 1)) {
7002 int64_t Stride = Diff / static_cast<int64_t>(Sz - 1);
7003 if (Diff != Stride * static_cast<int64_t>(Sz - 1))
7005 if (!TTI->isLegalStridedLoadStore(VecTy, Alignment))
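// Illustrative sketch, not part of this pass: with Sz elements whose first
// and last pointers are Diff elements apart, a uniform stride needs Diff to
// split evenly over the Sz - 1 gaps. E.g. Diff = 12 and Sz = 4 gives a
// stride of 4. Standalone analogue:
static std::optional<int64_t> uniformStride(int64_t Diff, int64_t Sz) {
  if (Sz < 2 || Diff % (Sz - 1) != 0)
    return std::nullopt;
  return Diff / (Sz - 1);
}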
7015 Value *Ptr0, Value *PtrN, StridedPtrInfo &SPtrInfo) const {
7016 const size_t Sz = PointerOps.size();
7021 SortedIndices.empty() ? PointerOps[I] : PointerOps[SortedIndices[I]];
7022 SortedOffsetsFromBase[I] =
7040 int64_t StrideWithinGroup =
7041 SortedOffsetsFromBase[1] - SortedOffsetsFromBase[0];
7044 auto IsEndOfGroupIndex = [=, &SortedOffsetsFromBase](unsigned Idx) {
7045 return SortedOffsetsFromBase[Idx] - SortedOffsetsFromBase[Idx - 1] !=
7050 unsigned GroupSize = FoundIt != Indices.end() ? *FoundIt : Sz;
7052 unsigned VecSz = Sz;
7053 Type *NewScalarTy = ScalarTy;
7057 bool NeedsWidening = Sz != GroupSize;
7058 if (NeedsWidening) {
7059 if (Sz % GroupSize != 0)
7062 if (StrideWithinGroup != 1)
7064 VecSz = Sz / GroupSize;
7067 DL->getTypeSizeInBits(ScalarTy).getFixedValue() * GroupSize);
7070 if (!isStridedLoad(PointerOps, NewScalarTy, Alignment, Diff, VecSz))
7073 int64_t StrideIntVal = StrideWithinGroup;
7074 if (NeedsWidening) {
7077 unsigned CurrentGroupStartIdx = GroupSize;
7078 int64_t StrideBetweenGroups =
7079 SortedOffsetsFromBase[GroupSize] - SortedOffsetsFromBase[0];
7080 StrideIntVal = StrideBetweenGroups;
7081 for (; CurrentGroupStartIdx < Sz; CurrentGroupStartIdx += GroupSize) {
7082 if (SortedOffsetsFromBase[CurrentGroupStartIdx] -
7083 SortedOffsetsFromBase[CurrentGroupStartIdx - GroupSize] !=
7084 StrideBetweenGroups)
7088 auto CheckGroup = [=](const unsigned StartIdx) -> bool {
7091 unsigned GroupEndIdx = FoundIt != Indices.end() ? *FoundIt : Sz;
7092 return GroupEndIdx - StartIdx == GroupSize;
7094 for (unsigned I = 0; I < Sz; I += GroupSize) {
7100 Type *StrideTy = DL->getIndexType(Ptr0->getType());
7101 SPtrInfo.StrideVal = ConstantInt::get(StrideTy, StrideIntVal);
7109 StridedPtrInfo &SPtrInfo) const {
7110 const unsigned Sz = PointerOps.size();
7112 if (Sz <= MinProfitableStridedLoads || !TTI->isTypeLegal(StridedLoadTy) ||
7113 !TTI->isLegalStridedLoadStore(StridedLoadTy, CommonAlignment))
7115 if (const SCEV *Stride =
7118 SPtrInfo.StrideSCEV = Stride;
7127 unsigned *BestVF, bool TryRecursiveCheck) const {
7140 if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy))
7146 const size_t Sz = VL.size();
7148 auto *POIter = PointerOps.begin();
7149 for (Value *V : VL) {
7151 if (!L || !L->isSimple())
7153 *POIter = L->getPointerOperand();
7159 bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, Order);
7168 if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
7169 TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
7180 if (Order.empty()) {
7181 Ptr0 = PointerOps.front();
7182 PtrN = PointerOps.back();
7184 Ptr0 = PointerOps[Order.front()];
7185 PtrN = PointerOps[Order.back()];
7187 std::optional<int64_t> Diff =
7190 if (static_cast<uint64_t>(*Diff) == Sz - 1)
7193 *TLI, [&](Value *V) {
7194 return areAllUsersVectorized(
7202 *Diff, Ptr0, PtrN, SPtrInfo))
7205 if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
7206 TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
7211 auto CheckForShuffledLoads = [&, &TTI = *TTI](Align CommonAlignment,
7213 bool ProfitableGatherPointers) {
7218 auto [ScalarGEPCost, VectorGEPCost] =
7220 Instruction::GetElementPtr, CostKind, ScalarTy, VecTy);
7224 Type *PtrScalarTy = PointerOps.front()->getType()->getScalarType();
7226 if (static_cast<unsigned>(count_if(
7245 return C + TTI.getInstructionCost(
7251 TTI.getMemIntrinsicInstrCost(
7254 false, CommonAlignment),
7256 (ProfitableGatherPointers ? 0 : VectorGEPCost);
7264 constexpr unsigned ListLimit = 4;
7265 if (!TryRecursiveCheck || VL.size() < ListLimit)
7274 unsigned Sz = DL->getTypeSizeInBits(ScalarTy);
7284 for (unsigned Cnt = 0, End = VL.size(); Cnt + VF <= End; Cnt += VF) {
7289 PointerOps, SPtrInfo, BestVF,
7297 DemandedElts.setBits(Cnt, Cnt + VF);
7313 if (!DemandedElts.isZero()) {
7319 if (DemandedElts[Idx])
7330 LI0->getPointerOperand(),
7331 Instruction::GetElementPtr, CostKind, ScalarTy,
7335 if (static_cast<unsigned>(
7337 PointerOps.size() - 1 ||
7356 TTI.getMemoryOpCost(Instruction::Load, SubVecTy, LI0->getAlign(),
7357 LI0->getPointerAddressSpace(), CostKind,
7362 VecLdCost += TTI.getMemIntrinsicInstrCost(
7364 Intrinsic::experimental_vp_strided_load,
7365 SubVecTy, LI0->getPointerOperand(),
7366 false, CommonAlignment),
7371 VecLdCost += TTI.getMemIntrinsicInstrCost(
7373 Intrinsic::masked_load, SubVecTy,
7374 CommonAlignment, LI0->getPointerAddressSpace()),
7380 VecLdCost += TTI.getMemIntrinsicInstrCost(
7382 Intrinsic::masked_gather, SubVecTy,
7383 LI0->getPointerOperand(),
7384 false, CommonAlignment),
7394 ShuffleMask[Idx] = Idx / VF == I ? VL.size() + Idx % VF : Idx;
7403 if (MaskedGatherCost >= VecLdCost &&
7416 bool ProfitableGatherPointers =
7417 L && Sz > 2 && static_cast<unsigned>(count_if(PointerOps, [L](Value *V) {
7418 return L->isLoopInvariant(V);
7420 if (ProfitableGatherPointers || all_of(PointerOps, [](Value *P) {
7423 (GEP && GEP->getNumOperands() == 2 &&
7431 if (!TryRecursiveCheck || !CheckForShuffledLoads(CommonAlignment, BestVF,
7432 ProfitableGatherPointers))
7444 all_of(VL, [](const Value *V) { return V->getType()->isPointerTy(); }) &&
7445 "Expected list of pointer operands.");
7450 std::pair<BasicBlock *, Value *>,
7454 .try_emplace(std::make_pair(
7458 SortedIndices.clear();
7460 auto Key = std::make_pair(BBs[Cnt + 1],
7462 bool Found = any_of(Bases.try_emplace(Key).first->second,
7463 [&, &Cnt = Cnt, &Ptr = Ptr](auto &Base) {
7464 std::optional<int64_t> Diff =
7465 getPointersDiff(ElemTy, std::get<0>(Base.front()),
7466 ElemTy, Ptr, DL, SE,
7471 Base.emplace_back(Ptr, *Diff, Cnt + 1);
7477 if (Bases.size() > VL.size() / 2 - 1)
7481 Bases.find(Key)->second.emplace_back().emplace_back(Ptr, 0, Cnt + 1);
7485 if (Bases.size() == VL.size())
7488 if (Bases.size() == 1 && (Bases.front().second.size() == 1 ||
7489 Bases.front().second.size() == VL.size()))
7494 auto ComparePointers = [](Value *Ptr1, Value *Ptr2) {
7503 FirstPointers.insert(P1);
7504 SecondPointers.insert(P2);
7510 "Unable to find matching root.");
7513 for (auto &Base : Bases) {
7514 for (auto &Vec : Base.second) {
7515 if (Vec.size() > 1) {
7517 int64_t InitialOffset = std::get<1>(Vec[0]);
7518 bool AnyConsecutive =
7520 return std::get<1>(P.value()) ==
7521 int64_t(P.index()) + InitialOffset;
7525 if (!AnyConsecutive)
7530 return ComparePointers(std::get<0>(V1.front()), std::get<0>(V2.front()));
7534 for (auto &T : Bases)
7535 for (const auto &Vec : T.second)
7536 for (const auto &P : Vec)
7540 "Expected SortedIndices to be the size of VL");
7544std::optional<BoUpSLP::OrdersType>
7546 assert(TE.isGather() && "Expected gather node only.");
7547 Type *ScalarTy = TE.Scalars[0]->getType();
7550 Ptrs.reserve(TE.Scalars.size());
7552 BBs.reserve(TE.Scalars.size());
7553 for (Value *V : TE.Scalars) {
7555 if (!L || !L->isSimple())
7556 return std::nullopt;
7562 if (!LoadEntriesToVectorize.contains(TE.Idx) &&
7564 return std::move(Order);
7565 return std::nullopt;
7576 if (VU->getType() != V->getType())
7579 if (!VU->hasOneUse() && !V->hasOneUse())
7585 if (Idx1 == std::nullopt || Idx2 == std::nullopt)
7592 bool IsReusedIdx = false;
7594 if (IE2 == VU && !IE1)
7596 if (IE1 == V && !IE2)
7597 return V->hasOneUse();
7598 if (IE1 && IE1 != V) {
7600 IsReusedIdx |= ReusedIdx.test(Idx1);
7601 ReusedIdx.set(Idx1);
7602 if ((IE1 != VU && !IE1->hasOneUse()) || IsReusedIdx)
7607 if (IE2 && IE2 != VU) {
7609 IsReusedIdx |= ReusedIdx.test(Idx2);
7610 ReusedIdx.set(Idx2);
7611 if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
7616 } while (!IsReusedIdx && (IE1 || IE2));
7626std::optional<BoUpSLP::OrdersType>
7628 bool IgnoreReorder) {
7631 if (!TE.ReuseShuffleIndices.empty()) {
7633 assert(!TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI) &&
7634 "Reshuffling scalars not yet supported for nodes with padding");
7637 return std::nullopt;
7645 unsigned Sz = TE.Scalars.size();
7646 if (TE.isGather()) {
7647 if (std::optional<OrdersType> CurrentOrder =
7652 ::addMask(Mask, TE.ReuseShuffleIndices);
7653 OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor());
7654 unsigned Sz = TE.Scalars.size();
7655 for (int K = 0, E = TE.getVectorFactor() / Sz; K < E; ++K) {
7658 Res[Idx + K * Sz] = I + K * Sz;
7660 return std::move(Res);
7663 if (Sz == 2 && TE.getVectorFactor() == 4 &&
7665 2 * TE.getVectorFactor())) == 1)
7666 return std::nullopt;
7667 if (TE.ReuseShuffleIndices.size() % Sz != 0)
7668 return std::nullopt;
7672 if (TE.ReorderIndices.empty())
7673 std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
7676 ::addMask(ReorderMask, TE.ReuseShuffleIndices);
7677 unsigned VF = ReorderMask.size();
7681 for (unsigned I = 0; I < VF; I += Sz) {
7683 unsigned UndefCnt = 0;
7684 unsigned Limit = std::min(Sz, VF - I);
7693 Val >= static_cast<int>(NumParts) || UsedVals.test(Val) ||
7695 return std::nullopt;
7697 for (unsigned K = 0; K < NumParts; ++K) {
7698 unsigned Idx = Val + Sz * K;
7699 if (Idx < VF && I + K < VF)
7700 ResOrder[Idx] = I + K;
7703 return std::move(ResOrder);
7705 unsigned VF = TE.getVectorFactor();
7708 TE.ReuseShuffleIndices.end());
7709 if (TE.hasState() && TE.getOpcode() == Instruction::ExtractElement &&
7711 if (isa<PoisonValue>(V))
7713 std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V));
7714 return Idx && *Idx < Sz;
7716 assert(!TE.isAltShuffle() && "Alternate instructions are only supported "
7717 "by BinaryOperator and CastInst.");
7719 if (TE.ReorderIndices.empty())
7720 std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
7723 for (unsigned I = 0; I < VF; ++I) {
7724 int &Idx = ReusedMask[I];
7727 Value *V = TE.Scalars[ReorderMask[Idx]];
7729 Idx = std::distance(ReorderMask.begin(), find(ReorderMask, *EI));
7735 std::iota(ResOrder.begin(), ResOrder.end(), 0);
7736 auto *It = ResOrder.begin();
7737 for (unsigned K = 0; K < VF; K += Sz) {
7741 std::iota(SubMask.begin(), SubMask.end(), 0);
7743 transform(CurrentOrder, It, [K](unsigned Pos) { return Pos + K; });
7744 std::advance(It, Sz);
7747 return Data.index() == Data.value();
7749 return std::nullopt;
7750 return std::move(ResOrder);
7752 if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
7753 (!TE.UserTreeIndex || !TE.UserTreeIndex.UserTE->hasState() ||
7755 (TE.ReorderIndices.empty() || isReverseOrder(TE.ReorderIndices)))
7756 return std::nullopt;
7757 if (TE.State == TreeEntry::SplitVectorize ||
7758 ((TE.State == TreeEntry::Vectorize ||
7759 TE.State == TreeEntry::StridedVectorize ||
7760 TE.State == TreeEntry::CompressVectorize) &&
7763 assert((TE.State == TreeEntry::SplitVectorize || !TE.isAltShuffle()) &&
7764 "Alternate instructions are only supported by "
7765 "BinaryOperator and CastInst.");
7766 return TE.ReorderIndices;
7768 if (!TopToBottom && IgnoreReorder && TE.State == TreeEntry::Vectorize &&
7769 TE.isAltShuffle()) {
7770 assert(TE.ReuseShuffleIndices.empty() &&
7771 "ReuseShuffleIndices should be "
7772 "empty for alternate instructions.");
7774 TE.buildAltOpShuffleMask(
7776 assert(TE.getMatchingMainOpOrAltOp(I) &&
7777 "Unexpected main/alternate opcode");
7781 const int VF = TE.getVectorFactor();
7786 ResOrder[Mask[I] % VF] = I;
7788 return std::move(ResOrder);
7790 if (!TE.ReorderIndices.empty())
7791 return TE.ReorderIndices;
7792 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
7793 if (!TE.ReorderIndices.empty())
7794 return TE.ReorderIndices;
7797 for (auto [I, V] : zip(UserBVHead, TE.Scalars)) {
7805 while (II && II->hasOneUse() && II->getParent() == BB) {
7813 assert(BB1 != BB2 && "Expected different basic blocks.");
7814 if (!DT->isReachableFromEntry(BB1))
7816 if (!DT->isReachableFromEntry(BB2))
7818 auto *NodeA = DT->getNode(BB1);
7819 auto *NodeB = DT->getNode(BB2);
7820 assert(NodeA && "Should only process reachable instructions");
7821 assert(NodeB && "Should only process reachable instructions");
7822 assert((NodeA == NodeB) ==
7823 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
7824 "Different nodes should have different DFS numbers");
7825 return NodeA->getDFSNumIn() < NodeB->getDFSNumIn();
7827 auto PHICompare = [&](unsigned I1, unsigned I2) {
7828 Value *V1 = TE.Scalars[I1];
7829 Value *V2 = TE.Scalars[I2];
7842 if (FirstUserOfPhi1->getParent() != FirstUserOfPhi2->getParent())
7843 return CompareByBasicBlocks(FirstUserOfPhi1->getParent(),
7844 FirstUserOfPhi2->getParent());
7854 if (UserBVHead[I1] && !UserBVHead[I2])
7856 if (!UserBVHead[I1])
7858 if (UserBVHead[I1] == UserBVHead[I2])
7861 return CompareByBasicBlocks(UserBVHead[I1]->getParent(),
7863 return UserBVHead[I1]->comesBefore(UserBVHead[I2]);
7876 if (EE1->getOperand(0) == EE2->getOperand(0))
7878 if (!Inst1 && Inst2)
7880 if (Inst1 && Inst2) {
7888 "Expected either instructions or arguments vector operands.");
7889 return P1->getArgNo() < P2->getArgNo();
7894 std::iota(Phis.begin(), Phis.end(), 0);
7897 return std::nullopt;
7898 return std::move(Phis);
7900 if (TE.isGather() &&
7901 (!TE.hasState() || !TE.isAltShuffle() ||
7902 ScalarsInSplitNodes.contains(TE.getMainOp())) &&
7906 if (((TE.hasState() && TE.getOpcode() == Instruction::ExtractElement) ||
7910 auto *EE = dyn_cast<ExtractElementInst>(V);
7911 return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
7917 canReuseExtract(TE.Scalars, CurrentOrder, true);
7918 if (Reuse || !CurrentOrder.empty())
7919 return std::move(CurrentOrder);
7927 int Sz = TE.Scalars.size();
7931 if (It == TE.Scalars.begin())
7934 if (It != TE.Scalars.end()) {
7936 unsigned Idx = std::distance(TE.Scalars.begin(), It);
7951 if (InsertFirstCost + PermuteCost < InsertIdxCost) {
7954 return std::move(Order);
7959 return std::nullopt;
7960 if (TE.Scalars.size() >= 3)
7965 if (TE.hasState() && TE.getOpcode() == Instruction::Load) {
7967 StridedPtrInfo SPtrInfo;
7970 CurrentOrder, PointerOps, SPtrInfo);
7973 return std::move(CurrentOrder);
7978 if (std::optional<OrdersType> CurrentOrder =
7980 return CurrentOrder;
7982 return std::nullopt;
7992 for (unsigned I = Sz, E = Mask.size(); I < E; I += Sz) {
7994 if (Cluster != FirstCluster)
8000void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const {
8003 const unsigned Sz = TE.Scalars.size();
8005 if (!TE.isGather() ||
8012 addMask(NewMask, TE.ReuseShuffleIndices);
8014 TE.ReorderIndices.clear();
8021 for (auto *It = TE.ReuseShuffleIndices.begin(),
8022 *End = TE.ReuseShuffleIndices.end();
8023 It != End; std::advance(It, Sz))
8024 std::iota(It, std::next(It, Sz), 0);
8030 "Expected same size of orders");
8031 size_t Sz = Order.size();
8034 if (Order[Idx] != Sz)
8035 UsedIndices.set(Order[Idx]);
8037 if (SecondaryOrder.empty()) {
8039 if (Order[Idx] == Sz && !UsedIndices.test(Idx))
8043 if (SecondaryOrder[Idx] != Sz && Order[Idx] == Sz &&
8044 !UsedIndices.test(SecondaryOrder[Idx]))
8045 Order[Idx] = SecondaryOrder[Idx];
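// Illustrative sketch, not part of this pass: combining orders fills the
// unset slots (== Sz) of the primary order from the secondary one, but only
// with destinations no element has claimed yet. Standalone analogue:
static void combineOrdersSketch(MutableArrayRef<unsigned> Order,
                                ArrayRef<unsigned> Secondary) {
  const unsigned Sz = Order.size();
  SmallBitVector UsedIndices(Sz);
  for (unsigned Idx : Order)
    if (Idx != Sz)
      UsedIndices.set(Idx);
  for (unsigned Idx = 0; Idx < Sz; ++Idx)
    if (Order[Idx] == Sz && Secondary[Idx] != Sz &&
        !UsedIndices.test(Secondary[Idx])) {
      Order[Idx] = Secondary[Idx]; // Borrow an unclaimed destination.
      UsedIndices.set(Order[Idx]);
    }
}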
8053 constexpr unsigned TinyVF = 2;
8054 constexpr unsigned TinyTree = 10;
8055 constexpr unsigned PhiOpsLimit = 12;
8056 constexpr unsigned GatherLoadsLimit = 2;
8057 if (VectorizableTree.size() <= TinyTree)
8059 if (VectorizableTree.front()->hasState() &&
8060 !VectorizableTree.front()->isGather() &&
8061 (VectorizableTree.front()->getOpcode() == Instruction::Store ||
8062 VectorizableTree.front()->getOpcode() == Instruction::PHI ||
8063 (VectorizableTree.front()->getVectorFactor() <= TinyVF &&
8064 (VectorizableTree.front()->getOpcode() == Instruction::PtrToInt ||
8065 VectorizableTree.front()->getOpcode() == Instruction::ICmp))) &&
8066 VectorizableTree.front()->ReorderIndices.empty()) {
8070 if (VectorizableTree.front()->hasState() &&
8071 VectorizableTree.front()->getOpcode() == Instruction::PHI &&
8072 VectorizableTree.front()->Scalars.size() == TinyVF &&
8073 VectorizableTree.front()->getNumOperands() > PhiOpsLimit)
8076 if (VectorizableTree.front()->hasState() &&
8077 VectorizableTree.front()->getOpcode() == Instruction::Store &&
8078 VectorizableTree.front()->ReorderIndices.empty()) {
8079 const unsigned ReorderedSplitsCnt =
8080 count_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
8081 return TE->State == TreeEntry::SplitVectorize &&
8082 !TE->ReorderIndices.empty() && TE->UserTreeIndex.UserTE &&
8083 TE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
8086 if (ReorderedSplitsCnt <= 1 &&
8088 VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
8089 return ((!TE->isGather() &&
8090 (TE->ReorderIndices.empty() ||
8091 (TE->UserTreeIndex.UserTE &&
8092 TE->UserTreeIndex.UserTE->State ==
8093 TreeEntry::Vectorize &&
8094 !TE->UserTreeIndex.UserTE->ReuseShuffleIndices
8096 (TE->isGather() && TE->ReorderIndices.empty() &&
8097 (!TE->hasState() || TE->isAltShuffle() ||
8098 TE->getOpcode() == Instruction::Load ||
8099 TE->getOpcode() == Instruction::ZExt ||
8100 TE->getOpcode() == Instruction::SExt))) &&
8101 (VectorizableTree.front()->getVectorFactor() > TinyVF ||
8102 !TE->isGather() || none_of(TE->Scalars, [&](Value *V) {
8103 return !isConstant(V) && isVectorized(V);
8105 })) >= VectorizableTree.size() - ReorderedSplitsCnt)
8108 bool HasPhis = false;
8109 bool HasLoad = true;
8110 unsigned GatherLoads = 0;
8111 for (const std::unique_ptr<TreeEntry> &TE :
8112 ArrayRef(VectorizableTree).drop_front()) {
8113 if (TE->State == TreeEntry::SplitVectorize)
8115 if (!TE->hasState()) {
8119 if (VectorizableTree.front()->Scalars.size() == TinyVF &&
8124 if (TE->getOpcode() == Instruction::Load && TE->ReorderIndices.empty()) {
8125 if (!TE->isGather()) {
8132 if (GatherLoads >= GatherLoadsLimit)
8135 if (TE->getOpcode() == Instruction::GetElementPtr ||
8138 if (TE->getOpcode() != Instruction::PHI &&
8139 (!TE->hasCopyableElements() ||
8141 TE->Scalars.size() / 2))
8143 if (VectorizableTree.front()->Scalars.size() == TinyVF &&
8144 TE->getNumOperands() > PhiOpsLimit)
8153void BoUpSLP::TreeEntry::reorderSplitNode(unsigned Idx, ArrayRef<int> Mask,
8155 assert(State == TreeEntry::SplitVectorize && "Expected split user node.");
8158 std::iota(NewMask.begin(), NewMask.end(), 0);
8159 std::iota(NewMaskOrder.begin(), NewMaskOrder.end(), 0);
8162 copy(MaskOrder, NewMaskOrder.begin());
8164 assert(Idx == 1 && "Expected either 0 or 1 index.");
8165 unsigned Offset = CombinedEntriesWithIndices.back().second;
8174 ReorderIndices.clear();
  // ...
      ExternalUserReorderMap;
  // ...
  for_each(VectorizableTree, [&, &TTIRef = *TTI](
                                 const std::unique_ptr<TreeEntry> &TE) {
    // ...
        findExternalStoreUsersReorderIndices(TE.get());
    if (!ExternalUserReorderIndices.empty()) {
      VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
      // ...
                                std::move(ExternalUserReorderIndices));
    }
    // ...
    if (TE->hasState() && TE->isAltShuffle() &&
        TE->State != TreeEntry::SplitVectorize) {
      Type *ScalarTy = TE->Scalars[0]->getType();
      // ...
      unsigned Opcode0 = TE->getOpcode();
      unsigned Opcode1 = TE->getAltOpcode();
      // ...
      if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
        VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
        // ...
      }
    }
    // ...
    bool IgnoreReorder =
        !UserIgnoreList && VectorizableTree.front()->hasState() &&
        (VectorizableTree.front()->getOpcode() == Instruction::InsertElement ||
         VectorizableTree.front()->getOpcode() == Instruction::Store);
    if (std::optional<OrdersType> CurrentOrder =
            // ...
      const TreeEntry *UserTE = TE.get();
      // ...
        if (!UserTE->UserTreeIndex)
          // ...
        if (UserTE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
            UserTE->UserTreeIndex.UserTE->isAltShuffle() &&
            UserTE->UserTreeIndex.UserTE->Idx != 0)
          // ...
        UserTE = UserTE->UserTreeIndex.UserTE;
      // ...
      VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
      if (!(TE->State == TreeEntry::Vectorize ||
            TE->State == TreeEntry::StridedVectorize ||
            TE->State == TreeEntry::SplitVectorize ||
            TE->State == TreeEntry::CompressVectorize) ||
          !TE->ReuseShuffleIndices.empty())
        GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
      if (TE->State == TreeEntry::Vectorize &&
          TE->getOpcode() == Instruction::PHI)
        PhisToOrders.try_emplace(TE.get(), *CurrentOrder);
    }
  });
  for (unsigned VF = VectorizableTree.front()->getVectorFactor();
       !VFToOrderedEntries.empty() && VF > 1; VF -= 2 - (VF & 1U)) {
    auto It = VFToOrderedEntries.find(VF);
    if (It == VFToOrderedEntries.end())
      continue;
    // ...
    for (const TreeEntry *OpTE : OrderedEntries) {
      // ...
      if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE) &&
          OpTE->State != TreeEntry::SplitVectorize)
        continue;
      const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
                           &PhisToOrders]() -> const OrdersType & {
        if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty()) {
          auto It = GathersToOrders.find(OpTE);
          if (It != GathersToOrders.end())
            return It->second;
        }
        if (OpTE->hasState() && OpTE->isAltShuffle()) {
          auto It = AltShufflesToOrders.find(OpTE);
          if (It != AltShufflesToOrders.end())
            return It->second;
        }
        if (OpTE->State == TreeEntry::Vectorize &&
            OpTE->getOpcode() == Instruction::PHI) {
          auto It = PhisToOrders.find(OpTE);
          if (It != PhisToOrders.end())
            return It->second;
        }
        return OpTE->ReorderIndices;
      }();
      // ...
      auto It = ExternalUserReorderMap.find(OpTE);
      if (It != ExternalUserReorderMap.end()) {
        const auto &ExternalUserReorderIndices = It->second;
        // ...
        if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
          OrdersUses.try_emplace(OrdersType(), 0).first->second +=
              ExternalUserReorderIndices.size();
        } else {
          for (const OrdersType &ExtOrder : ExternalUserReorderIndices)
            ++OrdersUses.try_emplace(ExtOrder, 0).first->second;
        }
        // ...
      }
      if (OpTE->State == TreeEntry::Vectorize &&
          OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
        assert(!OpTE->isAltShuffle() &&
               "Alternate instructions are only supported by BinaryOperator "
               // ...
        unsigned E = Order.size();
        OrdersType CurrentOrder(E, E);
        transform(Order, CurrentOrder.begin(), [E](unsigned Idx) {
          return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
        });
        // ...
        ++OrdersUses.try_emplace(CurrentOrder, 0).first->second;
      } else {
        ++OrdersUses.try_emplace(Order, 0).first->second;
      }
    }
    if (OrdersUses.empty())
      continue;
    // Choose the most used order.
    unsigned IdentityCnt = 0;
    unsigned FilledIdentityCnt = 0;
    // ...
    for (auto &Pair : OrdersUses) {
      // ...
        if (!Pair.first.empty())
          FilledIdentityCnt += Pair.second;
        IdentityCnt += Pair.second;
      // ...
    }
    // ...
    unsigned Cnt = IdentityCnt;
    for (auto &Pair : OrdersUses) {
      // ...
      if (Cnt < Pair.second ||
          (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt &&
           Cnt == Pair.second && !BestOrder.empty() &&
           // ...
        BestOrder = Pair.first;
        // ...
      }
    }
    // ...
    unsigned E = BestOrder.size();
    transform(BestOrder, Mask.begin(), [E](unsigned I) {
      return I < E ? static_cast<int>(I) : PoisonMaskElem;
    });
    for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
      // ...
      if (TE->Scalars.size() != VF) {
        if (TE->ReuseShuffleIndices.size() == VF) {
          assert(TE->State != TreeEntry::SplitVectorize &&
                 "Split vectorized not expected.");
          // ...
                 (!TE->UserTreeIndex ||
                  TE->UserTreeIndex.UserTE->Scalars.size() == VF ||
                  TE->UserTreeIndex.UserTE->Scalars.size() ==
                      TE->Scalars.size() ||
                  TE->UserTreeIndex.UserTE->State ==
                      TreeEntry::SplitVectorize) &&
                 "All users must be of VF size.");
          // ...
          if (TE->UserTreeIndex && TE->UserTreeIndex.UserTE->hasState() &&
              // ...
          reorderNodeWithReuses(*TE, Mask);
          // ...
          if (TE->UserTreeIndex &&
              TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize)
            TE->UserTreeIndex.UserTE->reorderSplitNode(
                TE->UserTreeIndex.EdgeIdx, Mask, MaskOrder);
          // ...
        }
        // ...
      }
      if ((TE->State == TreeEntry::SplitVectorize &&
           TE->ReuseShuffleIndices.empty()) ||
          ((TE->State == TreeEntry::Vectorize ||
            TE->State == TreeEntry::StridedVectorize ||
            TE->State == TreeEntry::CompressVectorize) &&
           // ...
        assert((!TE->isAltShuffle() ||
                (TE->State == TreeEntry::SplitVectorize &&
                 TE->ReuseShuffleIndices.empty())) &&
               "Alternate instructions are only supported by BinaryOperator "
               // ...
          TE->reorderOperands(Mask);
        // ...
        TE->reorderOperands(Mask);
        assert(TE->ReorderIndices.empty() &&
               "Expected empty reorder sequence.");
        // ...
        if (!TE->ReuseShuffleIndices.empty()) {
          // ...
          addMask(NewReuses, TE->ReuseShuffleIndices);
          TE->ReuseShuffleIndices.swap(NewReuses);
        } else if (TE->UserTreeIndex &&
                   TE->UserTreeIndex.UserTE->State ==
                       TreeEntry::SplitVectorize)
          // ...
          TE->UserTreeIndex.UserTE->reorderSplitNode(
              TE->UserTreeIndex.EdgeIdx, Mask, MaskOrder);
      }
    }
  }
}
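// Illustrative sketch (added for exposition; not part of the original
// source): reorderTopToBottom is, at its core, a vote: each user of a node
// deposits its preferred permutation into a frequency map, and the most
// used order wins with ties broken toward the identity order. A minimal
// model over plain (order, count) pairs, where an empty order stands for
// the identity:
namespace slp_sketch {
inline SmallVector<unsigned> pickMostUsedOrder(
    ArrayRef<std::pair<SmallVector<unsigned>, unsigned>> OrdersUses) {
  SmallVector<unsigned> Best; // Empty vector models the identity order.
  unsigned BestCnt = 0;
  for (const auto &[Order, Cnt] : OrdersUses) {
    // Strictly more votes wins; the identity (empty) order wins ties.
    if (Cnt > BestCnt || (Cnt == BestCnt && Order.empty())) {
      Best = Order;
      BestCnt = Cnt;
    }
  }
  return Best;
}
} // namespace slp_sketch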
void BoUpSLP::buildReorderableOperands(
    TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
    const SmallPtrSetImpl<const TreeEntry *> &ReorderableGathers,
    SmallVectorImpl<TreeEntry *> &GatherOps) {
  for (unsigned I : seq<unsigned>(UserTE->getNumOperands())) {
    if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) {
          return OpData.first == I &&
                 (OpData.second->State == TreeEntry::Vectorize ||
                  OpData.second->State == TreeEntry::StridedVectorize ||
                  OpData.second->State == TreeEntry::CompressVectorize ||
                  OpData.second->State == TreeEntry::SplitVectorize);
        }))
      continue;
    if (UserTE->hasState()) {
      if (UserTE->getOpcode() == Instruction::ExtractElement ||
          UserTE->getOpcode() == Instruction::ExtractValue)
        continue;
      if (UserTE->getOpcode() == Instruction::InsertElement && I == 0)
        continue;
      if (UserTE->getOpcode() == Instruction::Store &&
          UserTE->State == TreeEntry::Vectorize && I == 1)
        continue;
      if (UserTE->getOpcode() == Instruction::Load &&
          (UserTE->State == TreeEntry::Vectorize ||
           UserTE->State == TreeEntry::StridedVectorize ||
           UserTE->State == TreeEntry::CompressVectorize))
        continue;
    }
    TreeEntry *TE = getOperandEntry(UserTE, I);
    assert(TE && "Expected operand entry.");
    if (!TE->isGather()) {
      // ...
      Edges.emplace_back(I, TE);
      // ...
    }
    if (TE->State == TreeEntry::ScatterVectorize &&
        TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
      // ...
    if (ReorderableGathers.contains(TE))
      // ...
  }
}
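// The bottom-to-top reorder walk below drains a priority queue of tree
// entries. TreeEntryCompare keys the queue by the index of each entry's
// user node, so all operands of one user surface together and can vote on
// a common order (see the batch-popping sketch after the walk).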
struct TreeEntryCompare {
  bool operator()(const TreeEntry *LHS, const TreeEntry *RHS) const {
    if (LHS->UserTreeIndex && RHS->UserTreeIndex)
      return LHS->UserTreeIndex.UserTE->Idx < RHS->UserTreeIndex.UserTE->Idx;
    return LHS->Idx < RHS->Idx;
  }
};
// ...
  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    if (TE->State != TreeEntry::Vectorize &&
        TE->State != TreeEntry::StridedVectorize &&
        TE->State != TreeEntry::CompressVectorize &&
        TE->State != TreeEntry::SplitVectorize)
      NonVectorized.insert(TE.get());
    if (std::optional<OrdersType> CurrentOrder =
            // ...
      Queue.push(TE.get());
      if (!(TE->State == TreeEntry::Vectorize ||
            TE->State == TreeEntry::StridedVectorize ||
            TE->State == TreeEntry::CompressVectorize ||
            TE->State == TreeEntry::SplitVectorize) ||
          !TE->ReuseShuffleIndices.empty())
        GathersToOrders.insert(TE.get());
    }
  }
  while (!Queue.empty()) {
    // ...
    std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>>
        Users;
    TreeEntry *TE = Queue.top();
    const TreeEntry *UserTE = TE->UserTreeIndex.UserTE;
    // ...
    while (!Queue.empty()) {
      // ...
      if (!UserTE || UserTE != TE->UserTreeIndex.UserTE)
        break;
      // ...
    }
    for (TreeEntry *TE : OrderedOps) {
      if (!(TE->State == TreeEntry::Vectorize ||
            TE->State == TreeEntry::StridedVectorize ||
            TE->State == TreeEntry::CompressVectorize ||
            TE->State == TreeEntry::SplitVectorize ||
            (TE->isGather() && GathersToOrders.contains(TE))) ||
          !TE->UserTreeIndex || !TE->ReuseShuffleIndices.empty() ||
          !Visited.insert(TE).second)
        continue;
      // ...
      Users.first = TE->UserTreeIndex.UserTE;
      Users.second.emplace_back(TE->UserTreeIndex.EdgeIdx, TE);
    }
    // ...
    if (Data.first->State == TreeEntry::SplitVectorize) {
      assert(Data.second.size() <= 2 &&
             "Expected not greater than 2 operands for split vectorize node.");
      if (any_of(Data.second,
                 [](const auto &Op) { return !Op.second->UserTreeIndex; }))
        continue;
      // ...
      assert(Data.first->CombinedEntriesWithIndices.size() == 2 &&
             "Expected exactly 2 entries.");
      for (const auto &P : Data.first->CombinedEntriesWithIndices) {
        TreeEntry &OpTE = *VectorizableTree[P.first];
        // ...
        if (Order.empty() || !OpTE.ReuseShuffleIndices.empty()) {
          if (!OpTE.isGather() && OpTE.ReuseShuffleIndices.empty())
            continue;
          const auto BestOrder =
              // ...
        }
        const unsigned E = Order.size();
        // ...
          return I < E ? static_cast<int>(I) : PoisonMaskElem;
        // ...
        Data.first->reorderSplitNode(P.second ? 1 : 0, Mask, MaskOrder);
        // ...
        if (!OpTE.ReorderIndices.empty()) {
          OpTE.ReorderIndices.clear();
        } else if (!OpTE.ReuseShuffleIndices.empty()) {
          // ...
          assert(OpTE.isGather() && "Expected only gather/buildvector node.");
          // ...
        }
      }
      if (Data.first->ReuseShuffleIndices.empty() &&
          !Data.first->ReorderIndices.empty()) {
        // ...
        Queue.push(Data.first);
      }
      // ...
    }
    buildReorderableOperands(Data.first, Data.second, NonVectorized,
                             GatherOps);
    // ...
    for (const auto &Op : Data.second) {
      TreeEntry *OpTE = Op.second;
      if (!VisitedOps.insert(OpTE).second)
        continue;
      if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
        continue;
      const auto Order = [&]() -> const OrdersType {
        if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty())
          // ...
        return OpTE->ReorderIndices;
      }();
      // ...
      if (Order.size() == 1)
        continue;
      // ...
      Value *Root = OpTE->hasState()
                        // ...
      auto GetSameNodesUsers = [&](Value *Root) {
        // ...
        for (const TreeEntry *TE : ValueToGatherNodes.lookup(Root)) {
          if (TE != OpTE && TE->UserTreeIndex &&
              TE->getVectorFactor() == OpTE->getVectorFactor() &&
              TE->Scalars.size() == OpTE->Scalars.size() &&
              ((TE->ReorderIndices.empty() && OpTE->isSame(TE->Scalars)) ||
               (OpTE->ReorderIndices.empty() && TE->isSame(OpTE->Scalars))))
            Res.insert(TE->UserTreeIndex.UserTE);
        }
        for (const TreeEntry *TE : getTreeEntries(Root)) {
          if (TE != OpTE && TE->UserTreeIndex &&
              TE->getVectorFactor() == OpTE->getVectorFactor() &&
              TE->Scalars.size() == OpTE->Scalars.size() &&
              ((TE->ReorderIndices.empty() && OpTE->isSame(TE->Scalars)) ||
               (OpTE->ReorderIndices.empty() && TE->isSame(OpTE->Scalars))))
            Res.insert(TE->UserTreeIndex.UserTE);
        }
        return Res;
      };
      auto GetNumOperands = [](const TreeEntry *TE) {
        if (TE->State == TreeEntry::SplitVectorize)
          return TE->getNumOperands();
        if (auto *CI = dyn_cast<CallInst>(TE->getMainOp()))
          return CI->arg_size();
        return TE->getNumOperands();
      };
      auto NodeShouldBeReorderedWithOperands = [&, TTI = TTI](
                                                   const TreeEntry *TE) {
        // ...
          const TreeEntry *Op = getOperandEntry(TE, Idx);
          if (Op->isGather() && Op->hasState()) {
            const TreeEntry *VecOp =
                getSameValuesTreeEntry(Op->getMainOp(), Op->Scalars);
            // ...
          }
          if (Op->ReorderIndices.empty() && Op->ReuseShuffleIndices.empty())
            // ...
        // ...
      };
      // ...
        if (!RevisitedOps.insert(UTE).second)
          // ...
        return UTE == Data.first || !UTE->ReorderIndices.empty() ||
               !UTE->ReuseShuffleIndices.empty() ||
               (UTE->UserTreeIndex &&
                UTE->UserTreeIndex.UserTE == Data.first) ||
               (Data.first->UserTreeIndex &&
                Data.first->UserTreeIndex.UserTE == UTE) ||
               (IgnoreReorder && UTE->UserTreeIndex &&
                UTE->UserTreeIndex.UserTE->Idx == 0) ||
               NodeShouldBeReorderedWithOperands(UTE);
      // ...
      for (TreeEntry *UTE : Users) {
        // ...
          const TreeEntry *Op = getOperandEntry(UTE, Idx);
          // ...
          Queue.push(const_cast<TreeEntry *>(Op));
      }
      // ...
      unsigned NumOps = count_if(
          Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
            return P.second == OpTE;
          });
      if (OpTE->State == TreeEntry::Vectorize &&
          OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
        assert(!OpTE->isAltShuffle() &&
               "Alternate instructions are only supported by BinaryOperator "
               // ...
        unsigned E = Order.size();
        OrdersType CurrentOrder(E, E);
        transform(Order, CurrentOrder.begin(), [E](unsigned Idx) {
          return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
        });
        // ...
        OrdersUses.try_emplace(CurrentOrder, 0).first->second += NumOps;
      } else {
        OrdersUses.try_emplace(Order, 0).first->second += NumOps;
      }
      auto Res = OrdersUses.try_emplace(OrdersType(), 0);
      const auto AllowsReordering = [&](const TreeEntry *TE) {
        if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
            (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
            (IgnoreReorder && TE->Idx == 0))
          // ...
        if (TE->isGather()) {
          // ...
        }
        // ...
      };
      if (OpTE->UserTreeIndex) {
        TreeEntry *UserTE = OpTE->UserTreeIndex.UserTE;
        if (!VisitedUsers.insert(UserTE).second)
          // ...
        if (AllowsReordering(UserTE))
          // ...
        if (static_cast<unsigned>(count_if(
                Ops, [UserTE, &AllowsReordering](
                         const std::pair<unsigned, TreeEntry *> &Op) {
                  return AllowsReordering(Op.second) &&
                         Op.second->UserTreeIndex.UserTE == UserTE;
                })) <= Ops.size() / 2)
          ++Res.first->second;
      }
    }
    if (OrdersUses.empty()) {
      // ...
    }
    // Choose the most used order.
    unsigned IdentityCnt = 0;
    unsigned VF = Data.second.front().second->getVectorFactor();
    // ...
    for (auto &Pair : OrdersUses) {
      // ...
        IdentityCnt += Pair.second;
      // ...
    }
    // ...
    unsigned Cnt = IdentityCnt;
    for (auto &Pair : OrdersUses) {
      // ...
      if (Cnt < Pair.second) {
        // ...
        BestOrder = Pair.first;
        // ...
      }
    }
    // ...
    unsigned E = BestOrder.size();
    transform(BestOrder, Mask.begin(), [E](unsigned I) {
      return I < E ? static_cast<int>(I) : PoisonMaskElem;
    });
    // ...
    for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) {
      TreeEntry *TE = Op.second;
      if (!VisitedOps.insert(TE).second)
        continue;
      if (TE->ReuseShuffleIndices.size() == BestOrder.size()) {
        reorderNodeWithReuses(*TE, Mask);
        continue;
      }
      // ...
      if (TE->State != TreeEntry::Vectorize &&
          TE->State != TreeEntry::StridedVectorize &&
          TE->State != TreeEntry::CompressVectorize &&
          TE->State != TreeEntry::SplitVectorize &&
          (TE->State != TreeEntry::ScatterVectorize ||
           TE->ReorderIndices.empty()))
        continue;
      assert((BestOrder.size() == TE->ReorderIndices.size() ||
              TE->ReorderIndices.empty()) &&
             "Non-matching sizes of user/operand entries.");
      // ...
      if (IgnoreReorder && TE == VectorizableTree.front().get())
        IgnoreReorder = false;
    }
    // ...
    for (TreeEntry *Gather : GatherOps) {
      assert(Gather->ReorderIndices.empty() &&
             "Unexpected reordering of gathers.");
      if (!Gather->ReuseShuffleIndices.empty()) {
        // ...
      }
      // ...
    }
    // ...
    auto IsNotProfitableAltCodeNode = [](const TreeEntry &TE) {
      return TE.isAltShuffle() &&
             (!TE.ReuseShuffleIndices.empty() || TE.getVectorFactor() == 2 ||
              TE.ReorderIndices.empty());
    };
    if (Data.first->State != TreeEntry::Vectorize ||
        // ...
            Data.first->getMainOp()) ||
        IsNotProfitableAltCodeNode(*Data.first))
      Data.first->reorderOperands(Mask);
    if (// ...
        IsNotProfitableAltCodeNode(*Data.first) ||
        Data.first->State == TreeEntry::StridedVectorize ||
        Data.first->State == TreeEntry::CompressVectorize) {
      // ...
      if (Data.first->ReuseShuffleIndices.empty() &&
          !Data.first->ReorderIndices.empty() &&
          !IsNotProfitableAltCodeNode(*Data.first)) {
        // ...
        Queue.push(Data.first);
      }
    }
  }
  if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
      VectorizableTree.front()->ReuseShuffleIndices.empty())
    VectorizableTree.front()->ReorderIndices.clear();
}
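// Illustrative sketch (added for exposition; not part of the original
// source): the walk above repeatedly pops, in one batch, all queued entries
// that share the same user node, so the user's operands vote together. A
// minimal model with a hypothetical NodeLite type, assuming <queue> and
// <vector> are available:
namespace slp_sketch {
struct NodeLite {
  unsigned Idx;     // Position in the tree's entry list.
  unsigned UserIdx; // Index of the single user node.
};
struct ByUserIdx {
  bool operator()(const NodeLite *L, const NodeLite *R) const {
    return L->UserIdx > R->UserIdx; // Min-heap over the user's index.
  }
};
inline SmallVector<NodeLite *>
popSameUserBatch(std::priority_queue<NodeLite *, std::vector<NodeLite *>,
                                     ByUserIdx> &Q) {
  SmallVector<NodeLite *> Batch;
  if (Q.empty())
    return Batch;
  unsigned User = Q.top()->UserIdx;
  while (!Q.empty() && Q.top()->UserIdx == User) {
    Batch.push_back(Q.top());
    Q.pop();
  }
  return Batch;
}
} // namespace slp_sketch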
Instruction *BoUpSLP::getRootEntryInstruction(const TreeEntry &Entry) const {
  if (Entry.hasState() &&
      (Entry.getOpcode() == Instruction::Store ||
       Entry.getOpcode() == Instruction::Load) &&
      Entry.State == TreeEntry::StridedVectorize &&
      !Entry.ReorderIndices.empty() && isReverseOrder(Entry.ReorderIndices))
    // ...
}
// ...
  const size_t NumVectScalars = ScalarToTreeEntries.size() + 1;
  // ...
  for (auto &TEPtr : VectorizableTree) {
    TreeEntry *Entry = TEPtr.get();
    // ...
    if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize)
      continue;
    // ...
    for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
      Value *Scalar = Entry->Scalars[Lane];
      // ...
      auto It = ScalarToExtUses.find(Scalar);
      if (It != ScalarToExtUses.end() && !ExternalUses[It->second].User)
        continue;
      if (Scalar->hasNUsesOrMore(NumVectScalars)) {
        unsigned FoundLane = Entry->findLaneForValue(Scalar);
        LLVM_DEBUG(dbgs() << "SLP: Need to extract from lane " << FoundLane
                          << " from " << *Scalar << "for many users.\n");
        It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
        ExternalUses.emplace_back(Scalar, nullptr, *Entry, FoundLane);
        ExternalUsesWithNonUsers.insert(Scalar);
        continue;
      }
      // ...
      const auto ExtI = ExternallyUsedValues.find(Scalar);
      if (ExtI != ExternallyUsedValues.end()) {
        unsigned FoundLane = Entry->findLaneForValue(Scalar);
        LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
                          << FoundLane << " from " << *Scalar << ".\n");
        ScalarToExtUses.try_emplace(Scalar, ExternalUses.size());
        ExternalUses.emplace_back(Scalar, nullptr, *Entry, FoundLane);
      }
      // ...
        if (UserIgnoreList && UserIgnoreList->contains(UserInst))
          continue;
        // ...
            !UseEntries.empty()) {
          // ...
          if (!((Scalar->getType()->getScalarType()->isPointerTy() &&
                 // ...
                all_of(UseEntries, [&](TreeEntry *UseEntry) {
                  return UseEntry->State == TreeEntry::ScatterVectorize ||
                         // ...
                             Scalar, getRootEntryInstruction(*UseEntry), TLI,
                             // ...
                })) {
            LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
                              // ...
            // ...
                [](TreeEntry *UseEntry) { return UseEntry->isGather(); }
            // ...
          }
        }
        // ...
        if (It != ScalarToExtUses.end()) {
          ExternalUses[It->second].User = nullptr;
          break;
        }
        // ...
        if (U && Scalar->hasNUsesOrMore(UsesLimit))
          // ...
        unsigned FoundLane = Entry->findLaneForValue(Scalar);
        LLVM_DEBUG(dbgs() // ...
                   << " from lane " << FoundLane << " from " << *Scalar
                   // ...
        It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
        ExternalUses.emplace_back(Scalar, U, *Entry, FoundLane);
        ExternalUsesWithNonUsers.insert(Scalar);
      // ...
    }
  }
}
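// collectUserStores below walks the users of every scalar in the given tree
// entry and groups the stores they feed by (basic block, stored type,
// underlying pointer). Only groups whose pointer distances form one
// contiguous run can later become a single vector store; that contiguity
// check is canFormVector, modeled in the sketch after it.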
// ...
BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
  // ...
  for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) {
    Value *V = TE->Scalars[Lane];
    // ...
      if (SI == nullptr || !SI->isSimple() || SI->getFunction() != F ||
          // ...
        continue;
      // ...
      auto &StoresVec = PtrToStoresMap[{SI->getParent(),
                                        SI->getValueOperand()->getType(),
                                        Ptr}];
      // ...
      if (StoresVec.size() > Lane)
        continue;
      // ...
      if (!StoresVec.empty()) {
        std::optional<int64_t> Diff = getPointersDiff(
            SI->getValueOperand()->getType(), SI->getPointerOperand(),
            SI->getValueOperand()->getType(),
            StoresVec.front()->getPointerOperand(), *DL, *SE,
            /*StrictCheck=*/true);
        // ...
      }
      StoresVec.push_back(SI);
    // ...
  }
  // ...
  for (auto &P : PtrToStoresMap) {
    // ...
  }
  // ...
}

// ...
  StoreInst *S0 = StoresVec[0];
  // ...
    StoreInst *SI = StoresVec[Idx];
    std::optional<int64_t> Diff =
        // ...
            SI->getPointerOperand(), *DL, *SE,
            // ...
  if (StoreOffsetVec.size() != StoresVec.size())
    // ...
  sort(StoreOffsetVec, llvm::less_first());
  // ...
  int64_t PrevDist = 0;
  for (const auto &P : StoreOffsetVec) {
    if (Idx > 0 && P.first != PrevDist + 1)
      // ...
  }
  // ...
  ReorderIndices.assign(StoresVec.size(), 0);
  bool IsIdentity = true;
  // ...
    ReorderIndices[P.second] = I;
    IsIdentity &= P.second == I;
  // ...
    ReorderIndices.clear();
  // ...
    for (unsigned Idx : Order)
      dbgs() << Idx << ", ";
  // ...
}
// ...
BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const {
  unsigned NumLanes = TE->Scalars.size();
  // ...
    if (StoresVec.size() != NumLanes)
      continue;
    // ...
    if (!canFormVector(StoresVec, ReorderIndices))
      continue;
    // ...
    ExternalReorderIndices.push_back(ReorderIndices);
  // ...
  return ExternalReorderIndices;
}
void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
                        const SmallDenseSet<Value *> &UserIgnoreLst) {
  // ...
  assert(TreeEntryToStridedPtrInfoMap.empty() &&
         "TreeEntryToStridedPtrInfoMap is not cleared");
  UserIgnoreList = &UserIgnoreLst;
  // ...
  buildTreeRec(Roots, 0, EdgeInfo());
}

void BoUpSLP::buildTree(ArrayRef<Value *> Roots) {
  // ...
  assert(TreeEntryToStridedPtrInfoMap.empty() &&
         "TreeEntryToStridedPtrInfoMap is not cleared");
  // ...
  buildTreeRec(Roots, 0, EdgeInfo());
}
static void gatherPossiblyVectorizableLoads(
    const BoUpSLP &R, ArrayRef<Value *> VL, const DataLayout &DL,
    ScalarEvolution &SE, const TargetTransformInfo &TTI,
    // ...
    bool AddNew = true) {
  // ...
  for (Value *V : VL) {
    auto *LI = dyn_cast<LoadInst>(V);
    if (!LI)
      continue;
    if (R.isDeleted(LI) || R.isVectorized(LI) || !LI->isSimple())
      continue;
    bool IsFound = false;
    for (auto [Map, Data] : zip(ClusteredDistToLoad, ClusteredLoads)) {
      assert(LI->getParent() == Data.front().first->getParent() &&
             LI->getType() == Data.front().first->getType() &&
             // ...
             "Expected loads with the same type, same parent and same "
             "underlying pointer.");
      std::optional<int64_t> Dist = getPointersDiff(
          LI->getType(), LI->getPointerOperand(),
          Data.front().first->getType(),
          Data.front().first->getPointerOperand(), DL, SE,
          /*StrictCheck=*/true);
      // ...
      auto It = Map.find(*Dist);
      if (It != Map.end() && It->second != LI)
        continue;
      if (It == Map.end()) {
        Data.emplace_back(LI, *Dist);
        Map.try_emplace(*Dist, LI);
      }
      IsFound = true;
      // ...
    }
  }
  auto FindMatchingLoads =
      // ...
          int64_t &Offset, unsigned &Start) {
        // ...
          return GatheredLoads.end();
        // ...
        std::optional<int64_t> Dist =
            // ...
                Data.front().first->getType(),
                Data.front().first->getPointerOperand(), DL, SE,
                // ...
        for (std::pair<LoadInst *, int64_t> P : Data) {
          // ...
        }
        unsigned NumUniques = 0;
        for (auto [Cnt, Pair] : enumerate(Loads)) {
          bool Used = DataLoads.contains(Pair.first);
          if (!Used && !DataDists.contains(*Dist + Pair.second)) {
            // ...
          } else {
            Repeated.insert(Cnt);
          }
        }
        if (NumUniques > 0 &&
            (Loads.size() == NumUniques ||
             (Loads.size() - NumUniques >= 2 &&
              Loads.size() - NumUniques >= Loads.size() / 2 &&
              // ...
          return std::next(GatheredLoads.begin(), Idx);
        // ...
        return GatheredLoads.end();
      };
  for (ArrayRef<std::pair<LoadInst *, int64_t>> Data : ClusteredLoads) {
    // ...
    auto *It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated,
                                 // ...
    while (It != GatheredLoads.end()) {
      assert(!LocalToAdd.empty() && "Expected some elements to add.");
      for (unsigned Idx : LocalToAdd)
        // ...
      It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated,
                             Offset,
                             // ...
    }
    // ...
          return !ToAdd.contains(Idx) && !Repeated.contains(Idx);
    // ...
      Loads.push_back(Data[Idx]);
    // ...
        GatheredLoads, [&](ArrayRef<std::pair<LoadInst *, int64_t>> PD) {
          return PD.front().first->getParent() == LI->getParent() &&
                 PD.front().first->getType() == LI->getType();
        });
    while (It != GatheredLoads.end()) {
      // ...
          std::next(It), GatheredLoads.end(),
          [&](ArrayRef<std::pair<LoadInst *, int64_t>> PD) {
            return PD.front().first->getParent() == LI->getParent() &&
                   PD.front().first->getType() == LI->getType();
          });
    }
    // ...
      GatheredLoads.emplace_back().append(Data.begin(), Data.end());
    // ...
    AddNewLoads(GatheredLoads.emplace_back());
  }
}
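// Illustrative sketch (added for exposition; not part of the original
// source): the clustering above keys loads by their constant pointer
// distance from a reference load; loads with unknown distance stay out of
// the cluster. A minimal model with a hypothetical distance oracle `Diff`
// (standing in for getPointersDiff), assuming <optional>:
namespace slp_sketch {
template <typename DiffFn>
DenseMap<int64_t, LoadInst *> clusterByDistance(ArrayRef<LoadInst *> Loads,
                                                DiffFn Diff) {
  DenseMap<int64_t, LoadInst *> Cluster;
  for (LoadInst *LI : Loads)
    if (std::optional<int64_t> D = Diff(Loads.front(), LI))
      Cluster.try_emplace(*D, LI); // First load at each distance wins.
  return Cluster;
}
} // namespace slp_sketch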
void BoUpSLP::tryToVectorizeGatheredLoads(
    const SmallMapVector<
        std::tuple<BasicBlock *, Value *, Type *>,
        // ...
  GatheredLoadsEntriesFirst = VectorizableTree.size();
  // ...
      LoadEntriesToVectorize.size());
  for (auto [Idx, Set] : zip(LoadEntriesToVectorize, LoadSetsToVectorize))
    Set.insert_range(VectorizableTree[Idx]->Scalars);
  // ...
  auto LoadSorter = [](const std::pair<LoadInst *, int64_t> &L1,
                       const std::pair<LoadInst *, int64_t> &L2) {
    return L1.second > L2.second;
  };
  // ...
  auto IsMaskedGatherSupported = [&](ArrayRef<LoadInst *> Loads) {
    // ...
    auto *Ty = getWidenedType(Loads.front()->getType(), Loads.size());
    return TTI->isLegalMaskedGather(Ty, Alignment) &&
           !TTI->forceScalarizeMaskedGather(Ty, Alignment);
  };
  // ...
          SmallVectorImpl<LoadInst *> &NonVectorized, bool Final,
          unsigned MaxVF) {
        // ...
        unsigned StartIdx = 0;
        SmallVector<int> CandidateVFs;
        // ...
            *TTI, Loads.front()->getType(), MaxVF);
        // ...
            *TTI, Loads.front()->getType(), NumElts - 1)) {
          // ...
        }
        if (Final && CandidateVFs.empty())
          // ...
        unsigned BestVF = Final ? CandidateVFs.back() : 0;
        for (unsigned NumElts : CandidateVFs) {
          if (Final && NumElts > BestVF)
            continue;
          SmallVector<unsigned> MaskedGatherVectorized;
          for (unsigned Cnt = StartIdx, E = Loads.size(); Cnt < E;
               // ...
            if (VectorizedLoads.count(Slice.front()) ||
                VectorizedLoads.count(Slice.back()) ||
                // ...
            bool AllowToVectorize = false;
            // ...
            bool IsLegalBroadcastLoad = TTI->isLegalBroadcastLoad(
                // ...
              for (LoadInst *LI : Slice) {
                // ...
                if (LI->hasOneUse())
                  continue;
                // ...
                if (static_cast<unsigned int>(std::distance(
                        LI->user_begin(), LI->user_end())) != LI->getNumUses())
                  // ...
                if (!IsLegalBroadcastLoad)
                  // ...
                for (User *U : LI->users()) {
                  // ...
                  for (const TreeEntry *UTE : getTreeEntries(U)) {
                    for (int I : seq<int>(UTE->getNumOperands())) {
                      // ...
                        return V == LI || isa<PoisonValue>(V);
                      // ...
                    }
                  }
                }
              }
              // ...
            AllowToVectorize = CheckIfAllowed(Slice);
            // ...
                any_of(ValueToGatherNodes.at(Slice.front()),
                       [=](const TreeEntry *TE) {
                         return TE->Scalars.size() == 2 &&
                                ((TE->Scalars.front() == Slice.front() &&
                                  TE->Scalars.back() == Slice.back()) ||
                                 (TE->Scalars.front() == Slice.back() &&
                                  TE->Scalars.back() == Slice.front()));
                       });
            // ...
            if (AllowToVectorize) {
              // ...
                  reinterpret_cast<Value *const *>(Slice.begin()),
                  Slice.size());
              StridedPtrInfo SPtrInfo;
              // ...
                  PointerOps, SPtrInfo, &BestVF);
              // ...
                  (BestVF > 1 &&
                   static_cast<unsigned>(NumElts) == 2 * BestVF)) {
                // ...
                if (MaskedGatherVectorized.empty() ||
                    Cnt >= MaskedGatherVectorized.back() + NumElts)
                  // ...
              }
              // ...
              Results.emplace_back(Values, LS);
              VectorizedLoads.insert_range(Slice);
              // ...
              if (Cnt == StartIdx)
                StartIdx += NumElts;
              // ...
              if (StartIdx >= Loads.size())
                break;
              // ...
            }
            if (!MaskedGatherVectorized.empty() &&
                Cnt < MaskedGatherVectorized.back() + NumElts)
              continue;
          }
          if (!AllowToVectorize || BestVF == 0)
            // ...
          for (unsigned Cnt : MaskedGatherVectorized) {
            // ...
                Cnt, std::min<unsigned>(NumElts, Loads.size() - Cnt));
            // ...
            VectorizedLoads.insert_range(Slice);
            // ...
            if (Cnt == StartIdx)
              StartIdx += NumElts;
          }
        }
        for (LoadInst *LI : Loads) {
          if (!VectorizedLoads.contains(LI))
            NonVectorized.push_back(LI);
        }
  auto ProcessGatheredLoads =
      // ...
          bool Final = false) {
        // ...
        for (ArrayRef<std::pair<LoadInst *, int64_t>> LoadsDists :
             // ...
          if (LoadsDists.size() <= 1) {
            NonVectorized.push_back(LoadsDists.back().first);
            continue;
          }
          // ...
          unsigned MaxConsecutiveDistance = 0;
          unsigned CurrentConsecutiveDist = 1;
          int64_t LastDist = LocalLoadsDists.front().second;
          bool AllowMaskedGather = IsMaskedGatherSupported(OriginalLoads);
          for (const std::pair<LoadInst *, int64_t> &L : LocalLoadsDists) {
            // ...
            assert(LastDist >= L.second &&
                   "Expected first distance always not less than second");
            if (static_cast<uint64_t>(LastDist - L.second) ==
                CurrentConsecutiveDist) {
              ++CurrentConsecutiveDist;
              MaxConsecutiveDistance =
                  std::max(MaxConsecutiveDistance, CurrentConsecutiveDist);
              // ...
            }
            if (!AllowMaskedGather && CurrentConsecutiveDist == 1 &&
                // ...
            CurrentConsecutiveDist = 1;
            LastDist = L.second;
            // ...
          }
          if (Loads.size() <= 1)
            continue;
          if (AllowMaskedGather)
            MaxConsecutiveDistance = Loads.size();
          else if (MaxConsecutiveDistance < 2)
            continue;
          // ...
          GetVectorizedRanges(Loads, VectorizedLoads, SortedNonVectorized,
                              Final, MaxConsecutiveDistance);
          // ...
              OriginalLoads.size() == Loads.size() &&
              MaxConsecutiveDistance == Loads.size() &&
              // ...
            VectorizedLoads.clear();
            // ...
            GetVectorizedRanges(OriginalLoads, VectorizedLoads,
                                UnsortedNonVectorized, Final,
                                OriginalLoads.size());
            if (SortedNonVectorized.size() >= UnsortedNonVectorized.size()) {
              SortedNonVectorized.swap(UnsortedNonVectorized);
              Results.swap(UnsortedResults);
            }
          // ...
                            << Slice.size() << ")\n");
            for (Value *L : Slice)
              // ...
            unsigned MaxVF = Slice.size();
            unsigned UserMaxVF = 0;
            unsigned InterleaveFactor = 0;
            // ...
            std::optional<unsigned> InterleavedLoadsDistance = 0;
            // ...
            std::optional<unsigned> CommonVF = 0;
            DenseMap<const TreeEntry *, unsigned> EntryToPosition;
            SmallPtrSet<const TreeEntry *, 8> DeinterleavedNodes;
            for (auto [Idx, V] : enumerate(Slice)) {
              for (const TreeEntry *E : ValueToGatherNodes.at(V)) {
                UserMaxVF = std::max<unsigned>(UserMaxVF, E->Scalars.size());
                // ...
                UserMaxVF = std::max<unsigned>(UserMaxVF, Idx - Pos + 1);
                // ...
                if (*CommonVF == 0) {
                  CommonVF = E->Scalars.size();
                  // ...
                }
                if (*CommonVF != E->Scalars.size())
                  // ...
                if (Pos != Idx && InterleavedLoadsDistance) {
                  // ...
                    if (isa<Constant>(V))
                      // ...
                    if (isVectorized(V))
                      // ...
                    const auto &Nodes = ValueToGatherNodes.at(V);
                    return (Nodes.size() != 1 || !Nodes.contains(E)) &&
                           !is_contained(Slice, V);
                  // ...
                    InterleavedLoadsDistance.reset();
                  // ...
                  if (*InterleavedLoadsDistance == 0) {
                    InterleavedLoadsDistance = Idx - Pos;
                    // ...
                  }
                  if ((Idx - Pos) % *InterleavedLoadsDistance != 0 ||
                      (Idx - Pos) / *InterleavedLoadsDistance < Order)
                    InterleavedLoadsDistance.reset();
                  Order = (Idx - Pos) / InterleavedLoadsDistance.value_or(1);
                }
              }
            }
            DeinterleavedNodes.clear();
            // ...
            if (InterleavedLoadsDistance.value_or(0) > 1 &&
                CommonVF.value_or(0) != 0) {
              InterleaveFactor = bit_ceil(*InterleavedLoadsDistance);
              unsigned VF = *CommonVF;
              // ...
              StridedPtrInfo SPtrInfo;
              // ...
              if (InterleaveFactor <= Slice.size() &&
                  TTI.isLegalInterleavedAccessType(
                      // ...
                UserMaxVF = InterleaveFactor * VF;
              } else {
                InterleaveFactor = 0;
              }
            }
            // ...
            unsigned ConsecutiveNodesSize = 0;
            if (!LoadEntriesToVectorize.empty() && InterleaveFactor == 0 &&
                any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
                       [&, Slice = Slice](const auto &P) {
                         // ...
                           return std::get<1>(P).contains(V);
                         // ...
                         if (It == Slice.end())
                           return false;
                         // ...
                         const TreeEntry &TE =
                             *VectorizableTree[std::get<0>(P)];
                         // ...
                         StridedPtrInfo SPtrInfo;
                         // ...
                             VL, VL.front(), Order, PointerOps, SPtrInfo);
                         // ...
                         ConsecutiveNodesSize += VL.size();
                         size_t Start = std::distance(Slice.begin(), It);
                         size_t Sz = Slice.size() - Start;
                         return Sz < VL.size() ||
                                Slice.slice(Start, VL.size()) != VL;
                       }))
              // ...
            if (InterleaveFactor == 0 &&
                // ...
                       [&, Slice = Slice](unsigned Idx) {
                         // ...
                         SmallVector<Value *> PointerOps;
                         StridedPtrInfo SPtrInfo;
                         return canVectorizeLoads(
                                    Slice.slice(Idx * UserMaxVF, UserMaxVF),
                                    Slice[Idx * UserMaxVF], Order, PointerOps,
                                    SPtrInfo) == LoadsState::ScatterVectorize;
                       }))
              // ...
            if (Slice.size() != ConsecutiveNodesSize)
              MaxVF = std::min<unsigned>(MaxVF, UserMaxVF);
            // ...
            for (unsigned VF = MaxVF; VF >= 2; VF /= 2) {
              bool IsVectorized = true;
              for (unsigned I = 0, E = Slice.size(); I < E; I += VF) {
                // ...
                    Slice.slice(I, std::min(VF, E - I));
                // ...
                if (any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
                           [&](const auto &P) {
                             // ...
                                 VectorizableTree[std::get<0>(P)]
                             // ...
                           }))
                  continue;
                // ...
                unsigned Sz = VectorizableTree.size();
                buildTreeRec(SubSlice, 0, EdgeInfo(), InterleaveFactor);
                if (Sz == VectorizableTree.size()) {
                  IsVectorized = false;
                  // ...
                  if (InterleaveFactor > 0) {
                    VF = 2 * (MaxVF / InterleaveFactor);
                    InterleaveFactor = 0;
                  }
                }
              }
              // ...
            }
          }
        }
        NonVectorized.append(SortedNonVectorized);
        return NonVectorized;
      };
  for (const auto &GLs : GatheredLoads) {
    const auto &Ref = GLs.second;
    // ...
    if (!Ref.empty() && !NonVectorized.empty() &&
        std::accumulate(
            Ref.begin(), Ref.end(), 0u,
            [](unsigned S, ArrayRef<std::pair<LoadInst *, int64_t>> LoadsDists)
                -> unsigned { return S + LoadsDists.size(); }) !=
            NonVectorized.size() &&
        IsMaskedGatherSupported(NonVectorized)) {
      // ...
      for (LoadInst *LI : NonVectorized) {
        // ...
      }
      // ...
      (void)ProcessGatheredLoads(FinalGatheredLoads, true);
    }
  }
  // ...
  for (unsigned Idx : LoadEntriesToVectorize) {
    const TreeEntry &E = *VectorizableTree[Idx];
    // ...
    if (!E.ReorderIndices.empty()) {
      // ...
      SmallVector<int> ReorderMask;
      // ...
    }
    buildTreeRec(GatheredScalars, 0, EdgeInfo());
  }
  // ...
  if (static_cast<unsigned>(*GatheredLoadsEntriesFirst) ==
      VectorizableTree.size())
    GatheredLoadsEntriesFirst.reset();
}
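// Illustrative sketch (added for exposition; not part of the original
// source): ProcessGatheredLoads measures the longest run of consecutive
// pointer offsets in a distance-sorted group; that run length caps the
// vector factor tried for the group. A minimal model over a
// descending-sorted distance list:
namespace slp_sketch {
inline unsigned longestConsecutiveRun(ArrayRef<int64_t> DescendingDists) {
  unsigned Max = 1, Cur = 1;
  for (unsigned I = 1, E = DescendingDists.size(); I != E; ++I) {
    // Each step of exactly -1 extends the current consecutive run.
    Cur = DescendingDists[I - 1] - DescendingDists[I] == 1 ? Cur + 1 : 1;
    Max = std::max(Max, Cur);
  }
  return Max;
}
} // namespace slp_sketch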
// ...
                                bool AllowAlternate) {
  // ...
    std::pair<size_t, size_t> OpVals =
        // ...
      if (CI->isCommutative())
        // ...
      SubKey = hash_value(Gep->getPointerOperand());
  // ...
  return std::make_pair(Key, SubKey);
}

// ...
                       Instruction *AltOp, const TargetLibraryInfo &TLI);
bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S,
                                       // ...
  Type *ScalarTy = S.getMainOp()->getType();
  unsigned Opcode0 = S.getOpcode();
  unsigned Opcode1 = S.getAltOpcode();
  SmallBitVector OpcodeMask(getAltInstrMask(VL, ScalarTy, Opcode0, Opcode1));
  // ...
                          Opcode1, OpcodeMask))
    // ...
  for (unsigned I : seq<unsigned>(S.getMainOp()->getNumOperands())) {
    // ...
    for (Value *V : VL) {
      // ...
      Operands.back().push_back(
          // ...
    }
  }
  if (Operands.size() == 2) {
    // ...
      Candidates[0] = std::make_pair(Operands[0][I], Operands[0][I + 1]);
      Candidates[1] = std::make_pair(Operands[0][I], Operands[1][I + 1]);
      Candidates[2] = std::make_pair(Operands[1][I], Operands[0][I + 1]);
      // ...
      switch (Res.value_or(0)) {
      // ...
        std::swap(Operands[0][I + 1], Operands[1][I + 1]);
        // ...
      }
    // ...
  }
  // ...
  DenseSet<unsigned> UniqueOpcodes;
  constexpr unsigned NumAltInsts = 3;
  unsigned NonInstCnt = 0;
  // ...
  unsigned UndefCnt = 0;
  // ...
  unsigned ExtraShuffleInsts = 0;
  // ...
  if (Operands.size() == 2) {
    // ...
    if (Operands.front() == Operands.back()) {
      // ...
            return is_contained(Operands.back(), V);
      // ...
      ++ExtraShuffleInsts;
    }
  }
  const Loop *L = LI->getLoopFor(S.getMainOp()->getParent());
  // ...
  DenseMap<Value *, unsigned> Uniques;
  // ...
    if (!Res.second && Res.first->second == 1)
      ++ExtraShuffleInsts;
    ++Res.first->getSecond();
    // ...
      UniqueOpcodes.insert(I->getOpcode());
    else if (Res.second)
      // ...
  return none_of(Uniques,
                 [&](const auto &P) {
                   return P.first->hasNUsesOrMore(P.second + 1) &&
                          none_of(P.first->users(), [&](User *U) {
                            return isVectorized(U) || Uniques.contains(U);
                          });
                 }) ||
         // ...
         (UndefCnt < (VL.size() - 1) * S.getMainOp()->getNumOperands() &&
          (UniqueOpcodes.size() + NonInstCnt + ExtraShuffleInsts +
           NumAltInsts) < S.getMainOp()->getNumOperands() * VL.size());
}
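// Illustrative sketch (added for exposition; not part of the original
// source): the OpcodeMask used above is one bit per lane, set when that
// lane's instruction uses the alternate opcode. Minimal model:
namespace slp_sketch {
inline SmallBitVector altLaneMask(ArrayRef<Instruction *> Lanes,
                                  unsigned AltOpcode) {
  SmallBitVector Mask(Lanes.size());
  for (unsigned I = 0, E = Lanes.size(); I != E; ++I)
    Mask[I] = Lanes[I]->getOpcode() == AltOpcode;
  return Mask;
}
} // namespace slp_sketch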
// ...
                      const unsigned VF, unsigned MinBW,
                      // ...

static std::pair<InstructionCost, InstructionCost>
// ...
    FMF = FPCI->getFastMathFlags();
  // ...
                     LibCost.isValid() ? LibCost : ScalarLimit);
}
BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
    // ...
    bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
    SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo) {
  // ...
         "Expected instructions with same/alternate opcodes only.");
  // ...
  unsigned ShuffleOrOp =
      S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
  // ...
  switch (ShuffleOrOp) {
  case Instruction::PHI: {
    // ...
      return TreeEntry::NeedToGather;
    // ...
    for (Value *V : VL) {
      // ...
      for (Value *Incoming : PHI->incoming_values()) {
        // ...
        if (Term && Term->isTerminator()) {
          LLVM_DEBUG(dbgs()
                     << "SLP: Need to swizzle PHINodes (terminator use).\n");
          return TreeEntry::NeedToGather;
        }
      }
    }
    return TreeEntry::Vectorize;
  }
  case Instruction::ExtractElement:
    // ...
      return TreeEntry::NeedToGather;
    // ...
  case Instruction::ExtractValue: {
    bool Reuse = canReuseExtract(VL, CurrentOrder);
    // ...
      return TreeEntry::NeedToGather;
    if (Reuse || !CurrentOrder.empty())
      return TreeEntry::Vectorize;
    // ...
    return TreeEntry::NeedToGather;
  }
  case Instruction::InsertElement: {
    // ...
    for (Value *V : VL) {
      // ...
        LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement/poison vector.\n");
        return TreeEntry::NeedToGather;
      // ...
             "Non-constant or undef index?");
    }
    // ...
          return !SourceVectors.contains(V);
    // ...
      LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
                           "different source vectors.\n");
      return TreeEntry::NeedToGather;
    // ...
          return SourceVectors.contains(V) && !V->hasOneUse();
    // ...
      LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
                           "multiple uses.\n");
      return TreeEntry::NeedToGather;
    // ...
    return TreeEntry::Vectorize;
  }
  case Instruction::Load: {
    // ...
    auto IsGatheredNode = [&]() {
      if (!GatheredLoadsEntriesFirst)
        // ...
      return any_of(getTreeEntries(V), [&](const TreeEntry *TE) {
        return TE->Idx >= *GatheredLoadsEntriesFirst;
      });
    };
    // ...
      return TreeEntry::Vectorize;
    // ...
      if (!IsGraphTransformMode && !VectorizableTree.empty()) {
        // ...
        LoadEntriesToVectorize.insert(VectorizableTree.size());
        return TreeEntry::NeedToGather;
      }
      return IsGatheredNode() ? TreeEntry::NeedToGather
                              : TreeEntry::CompressVectorize;
    // ...
      if (!IsGraphTransformMode && !VectorizableTree.empty()) {
        // ...
        LoadEntriesToVectorize.insert(VectorizableTree.size());
        return TreeEntry::NeedToGather;
      }
      return IsGatheredNode() ? TreeEntry::NeedToGather
                              : TreeEntry::ScatterVectorize;
    // ...
      if (!IsGraphTransformMode && VectorizableTree.size() > 1) {
        // ...
        LoadEntriesToVectorize.insert(VectorizableTree.size());
        return TreeEntry::NeedToGather;
      }
      return IsGatheredNode() ? TreeEntry::NeedToGather
                              : TreeEntry::StridedVectorize;
    // ...
      if (DL->getTypeSizeInBits(ScalarTy) !=
          DL->getTypeAllocSizeInBits(ScalarTy))
        LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
      // ...
            return !LI || !LI->isSimple();
      // ...
        LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
      // ...
      return TreeEntry::NeedToGather;
  }
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    // ...
    for (Value *V : VL) {
      // ...
        LLVM_DEBUG(
            dbgs() << "SLP: Gathering casts with different src types.\n");
        return TreeEntry::NeedToGather;
      // ...
    }
    return TreeEntry::Vectorize;
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    // ...
    for (Value *V : VL) {
      // ...
      if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
          Cmp->getOperand(0)->getType() != ComparedTy) {
        LLVM_DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
        return TreeEntry::NeedToGather;
      }
    }
    return TreeEntry::Vectorize;
  }
  case Instruction::Select:
  case Instruction::FNeg:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::Freeze:
    if (S.getMainOp()->getType()->isFloatingPointTy() &&
        TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) {
          auto *I = dyn_cast<Instruction>(V);
          return I && I->isBinaryOp() && !I->isFast();
        }))
      return TreeEntry::NeedToGather;
    return TreeEntry::Vectorize;
  case Instruction::GetElementPtr: {
    // ...
    for (Value *V : VL) {
      // ...
      if (I->getNumOperands() != 2) {
        LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
        return TreeEntry::NeedToGather;
      }
    }
    // ...
    for (Value *V : VL) {
      // ...
      Type *CurTy = GEP->getSourceElementType();
      if (Ty0 != CurTy) {
        LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
        return TreeEntry::NeedToGather;
      }
    }
    // ...
    for (Value *V : VL) {
      // ...
      auto *Op = I->getOperand(1);
      if (// ...
          (Op->getType() != Ty1 &&
           // ...
            Op->getType()->getScalarSizeInBits() >
                DL->getIndexSizeInBits(
                    V->getType()->getPointerAddressSpace())))) {
        LLVM_DEBUG(
            dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
        return TreeEntry::NeedToGather;
      }
    }
    return TreeEntry::Vectorize;
  }
  case Instruction::Store: {
    // ...
    llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
    // ...
    if (DL->getTypeSizeInBits(ScalarTy) !=
        DL->getTypeAllocSizeInBits(ScalarTy)) {
      LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n");
      return TreeEntry::NeedToGather;
    }
    // ...
    for (Value *V : VL) {
      // ...
      if (!SI->isSimple()) {
        // ...
        return TreeEntry::NeedToGather;
      }
      // ...
    }
    // ...
      if (CurrentOrder.empty()) {
        Ptr0 = PointerOps.front();
        PtrN = PointerOps.back();
      } else {
        Ptr0 = PointerOps[CurrentOrder.front()];
        PtrN = PointerOps[CurrentOrder.back()];
      }
      std::optional<int64_t> Dist =
          // ...
      if (static_cast<uint64_t>(*Dist) == VL.size() - 1)
        return TreeEntry::Vectorize;
    // ...
    return TreeEntry::NeedToGather;
  }
  case Instruction::Call: {
    if (S.getMainOp()->getType()->isFloatingPointTy() &&
        TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) {
          auto *I = dyn_cast<Instruction>(V);
          return I && !I->isFast();
        }))
      return TreeEntry::NeedToGather;
    // ...
    Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
    // ...
      return TreeEntry::NeedToGather;
    // ...
    unsigned NumArgs = CI->arg_size();
    // ...
    for (unsigned J = 0; J != NumArgs; ++J)
      // ...
    for (Value *V : VL) {
      // ...
           VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) ||
          // ...
        LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V
                          // ...
        return TreeEntry::NeedToGather;
      // ...
      for (unsigned J = 0; J != NumArgs; ++J) {
        // ...
        if (ScalarArgs[J] != A1J) {
          LLVM_DEBUG(dbgs()
                     << "SLP: mismatched arguments in call:" << *CI
                     << " argument " << ScalarArgs[J] << "!=" << A1J << "\n");
          return TreeEntry::NeedToGather;
        }
      }
      // ...
        LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI
                          << "!=" << *V << '\n');
        return TreeEntry::NeedToGather;
      // ...
    }
    // ...
    auto *VecTy = getWidenedType(S.getMainOp()->getType(), VL.size());
    // ...
    if (!VecCallCosts.first.isValid() && !VecCallCosts.second.isValid())
      return TreeEntry::NeedToGather;
    // ...
    return TreeEntry::Vectorize;
  }
  case Instruction::ShuffleVector: {
    if (!S.isAltShuffle()) {
      // ...
        return TreeEntry::Vectorize;
      // ...
      LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
      return TreeEntry::NeedToGather;
    }
    // ...
      LLVM_DEBUG(
          dbgs()
          << "SLP: ShuffleVector not vectorized, operands are buildvector and "
             "the whole alt sequence is not profitable.\n");
      return TreeEntry::NeedToGather;
    // ...
    return TreeEntry::Vectorize;
  }
  // ...
    return TreeEntry::NeedToGather;
  }
}
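// Illustrative sketch (added for exposition; not part of the original
// source): getScalarsVectorizationState is a pure classifier: it inspects a
// bundle and answers "vectorize, vectorize with a special scheme, or
// gather", without mutating the tree. A minimal model of that shape for
// loads only, with hypothetical `IsConsecutive` and `HasConstantStride`
// oracles supplied by the caller:
namespace slp_sketch {
enum class BundleState { Vectorize, Strided, Gather };
template <typename ConsecFn, typename StrideFn>
BundleState classifyLoads(ArrayRef<LoadInst *> Bundle, ConsecFn IsConsecutive,
                          StrideFn HasConstantStride) {
  for (LoadInst *LI : Bundle)
    if (!LI->isSimple())
      return BundleState::Gather; // Volatile/atomic lanes block the bundle.
  if (IsConsecutive(Bundle))
    return BundleState::Vectorize;
  if (HasConstantStride(Bundle))
    return BundleState::Strided;
  return BundleState::Gather;
}
} // namespace slp_sketch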
  // ...
  PHINode *Main = nullptr;
  // ...
public:
  PHIHandler() = delete;
  PHIHandler(DominatorTree &DT, PHINode *Main, ArrayRef<Value *> Phis)
      : DT(DT), Main(Main), Phis(Phis),
        Operands(Main->getNumIncomingValues(),
                 // ...
  void buildOperands() {
    constexpr unsigned FastLimit = 4;
    // ...
      for (auto [Idx, V] : enumerate(Phis)) {
        // ...
               "Expected isa instruction or poison value.");
        // ...
          Operands[I][Idx] = V;
          continue;
        // ...
        if (P->getIncomingBlock(I) == InBB)
          Operands[I][Idx] = P->getIncomingValue(I);
        else
          Operands[I][Idx] = P->getIncomingValueForBlock(InBB);
      }
      // ...
    SmallMapVector<BasicBlock *, SmallVector<unsigned>, 4>
        // ...
    for (auto [Idx, V] : enumerate(Phis)) {
      // ...
        Operands[I][Idx] = V;
        continue;
      // ...
        Operands[I][Idx] = P->getIncomingValue(I);
        // ...
      auto *It = Blocks.find(InBB);
      if (It == Blocks.end())
        continue;
      Operands[It->second.front()][Idx] = P->getIncomingValue(I);
    }
    for (const auto &P : Blocks) {
      ArrayRef<unsigned> IncomingValues = P.second;
      if (IncomingValues.size() <= 1)
        continue;
      // ...
      for (unsigned I : IncomingValues) {
        assert(all_of(enumerate(Operands[I]),
                      [&](const auto &Data) {
                        return !Data.value() ||
                               Data.value() == Operands[BasicI][Data.index()];
                      }) &&
               "Expected empty operands list.");
        Operands[I] = Operands[BasicI];
      }
    }
  }
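// Illustrative sketch (added for exposition; not part of the original
// source): PHIHandler regroups a bundle of PHIs from "per-PHI incoming
// lists" into "per-incoming-block operand bundles", so each block's values
// can be vectorized together as one operand. Minimal model:
namespace slp_sketch {
inline DenseMap<BasicBlock *, SmallVector<Value *>>
operandsPerBlock(ArrayRef<PHINode *> Phis) {
  DenseMap<BasicBlock *, SmallVector<Value *>> PerBlock;
  for (PHINode *P : Phis)
    for (unsigned I = 0, E = P->getNumIncomingValues(); I != E; ++I)
      PerBlock[P->getIncomingBlock(I)].push_back(P->getIncomingValue(I));
  return PerBlock;
}
} // namespace slp_sketch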
static std::pair<Instruction *, Instruction *>
// ...
  for (Value *V : VL) {
    // ...
    if (MainOp->getOpcode() == I->getOpcode()) {
      // ...
    }
  }
  // ...
         "Expected different main and alt instructions.");
  return std::make_pair(MainOp, AltOp);
}
// ...
                             const InstructionsState &S,
                             // ...
                             bool TryPad = false) {
  // ...
  for (Value *V : VL) {
    // ...
  }
  size_t NumUniqueScalarValues = UniqueValues.size();
  // ...
  if (NumUniqueScalarValues == VL.size() &&
      // ...
    ReuseShuffleIndices.clear();
  // ...
    if ((UserTreeIdx.UserTE &&
         UserTreeIdx.UserTE->hasNonWholeRegisterOrNonPowerOf2Vec(TTI)) ||
        // ...
      LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
                           "for nodes with padding.\n");
      ReuseShuffleIndices.clear();
      // ...
    }
    if (NumUniqueScalarValues <= 1 || !IsFullVectors ||
        // ...
    if (TryPad && UniquePositions.size() > 1 && NumUniqueScalarValues > 1 &&
        S.getMainOp()->isSafeToRemove() &&
        (S.areInstructionsWithCopyableElements() ||
         // ...
          TTI, UniqueValues.front()->getType(), UniqueValues.size());
      PWSz = std::min<unsigned>(PWSz, VL.size());
      if (PWSz == VL.size()) {
        // ...
        ReuseShuffleIndices.clear();
      } else {
        // ...
                                                   UniqueValues.end());
        PaddedUniqueValues.append(
            PWSz - UniqueValues.size(),
            // ...
        if ((!S.areInstructionsWithCopyableElements() &&
             // ...
            (S.areInstructionsWithCopyableElements() && S.isMulDivLikeOp() &&
             (S.getMainOp()->isIntDivRem() || S.getMainOp()->isFPDivRem() ||
              // ...
          ReuseShuffleIndices.clear();
          // ...
        }
        VL = std::move(PaddedUniqueValues);
        // ...
      }
    }
    ReuseShuffleIndices.clear();
    // ...
    VL = std::move(UniqueValues);
// ...
                                 const InstructionsState &LocalState,
                                 SmallVectorImpl<Value *> &Op1,
                                 SmallVectorImpl<Value *> &Op2,
                                 // ...
  constexpr unsigned SmallNodeSize = 4;
  if (VL.size() <= SmallNodeSize ||
      TTI->preferAlternateOpcodeVectorization() ||
      // ...
  LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *LocalState.getMainOp()
                    // ...
  for (TreeEntry *E : getSplitTreeEntries(LocalState.getMainOp())) {
    if (E->isSame(VL)) {
      // ...
                        << *LocalState.getMainOp() << ".\n");
      // ...
    }
  }
  // ...
  ReorderIndices.assign(VL.size(), VL.size());
  SmallBitVector Op1Indices(VL.size());
  // ...
      Op1Indices.set(Idx);
      // ...
    if ((LocalState.getAltOpcode() != LocalState.getOpcode() &&
         // ...
        (LocalState.getAltOpcode() == LocalState.getOpcode() &&
         // ...
             LocalState.getAltOp(), *TLI))) {
      // ...
      Op1Indices.set(Idx);
      // ...
    }
  // ...
  unsigned Opcode0 = LocalState.getOpcode();
  unsigned Opcode1 = LocalState.getAltOpcode();
  SmallBitVector OpcodeMask(getAltInstrMask(VL, ScalarTy, Opcode0, Opcode1));
  // ...
  if (UOp1.size() <= 1 || UOp2.size() <= 1 ||
      TTI->isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask) ||
      // ...
  unsigned Op1Cnt = 0, Op2Cnt = Op1.size();
  // ...
    if (Op1Indices.test(Idx)) {
      ReorderIndices[Op1Cnt] = Idx;
      // ...
    } else {
      ReorderIndices[Op2Cnt] = Idx;
      // ...
    }
  // ...
    ReorderIndices.clear();
  SmallVector<int> Mask;
  if (!ReorderIndices.empty())
    // ...
  unsigned NumParts = TTI->getNumberOfParts(VecTy);
  // ...
  if (NumParts >= VL.size())
    // ...
  FixedVectorType *SubVecTy =
      // ...
  if (!LocalState.isCmpOp() && NumParts <= 1 &&
      (Mask.empty() || InsertCost >= NewShuffleCost))
    // ...
  if ((LocalState.getMainOp()->isBinaryOp() &&
       LocalState.getAltOp()->isBinaryOp() &&
       (LocalState.isShiftOp() || LocalState.isBitwiseLogicOp() ||
        LocalState.isAddSubLikeOp() || LocalState.isMulDivLikeOp())) ||
      (LocalState.getMainOp()->isCast() && LocalState.getAltOp()->isCast()) ||
      (LocalState.getMainOp()->isUnaryOp() &&
       LocalState.getAltOp()->isUnaryOp())) {
    // ...
        TTI->getArithmeticInstrCost(Opcode0, VecTy, Kind) +
        TTI->getArithmeticInstrCost(Opcode1, VecTy, Kind);
    // ...
      OriginalMask[Idx] = Idx + (Op1Indices.test(Idx) ? 0 : VL.size());
    // ...
        VecTy, OriginalMask, Kind);
    // ...
        TTI->getArithmeticInstrCost(Opcode0, Op1VecTy, Kind) +
        TTI->getArithmeticInstrCost(Opcode1, Op2VecTy, Kind);
    // ...
        NewVecOpsCost + InsertCost +
        (!VectorizableTree.empty() && VectorizableTree.front()->hasState() &&
         VectorizableTree.front()->getOpcode() == Instruction::Store
             // ...
    if (NewCost >= OriginalCost)
      // ...
  }
  // ...
}
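// Illustrative sketch (added for exposition; not part of the original
// source): a split node separates main-opcode lanes from alternate-opcode
// lanes and records the permutation that restores the original lane order,
// just like ReorderIndices above. Minimal model:
namespace slp_sketch {
inline SmallVector<unsigned>
splitByPredicate(ArrayRef<Value *> VL, const SmallBitVector &IsMainLane,
                 SmallVectorImpl<Value *> &Op1, SmallVectorImpl<Value *> &Op2) {
  SmallVector<unsigned> Order(VL.size());
  unsigned MainPos = 0, AltPos = IsMainLane.count();
  for (unsigned Idx = 0, E = VL.size(); Idx != E; ++Idx) {
    if (IsMainLane.test(Idx)) {
      Order[MainPos++] = Idx; // Main-opcode lanes are packed at the front.
      Op1.push_back(VL[Idx]);
    } else {
      Order[AltPos++] = Idx;  // Alternate lanes follow after the main lanes.
      Op2.push_back(VL[Idx]);
    }
  }
  return Order; // Order[NewPos] == OldPos, mirroring ReorderIndices.
}
} // namespace slp_sketch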
class InstructionsCompatibilityAnalysis {
  DominatorTree &DT;
  const DataLayout &DL;
  const TargetTransformInfo &TTI;
  const TargetLibraryInfo &TLI;
  unsigned MainOpcode = 0;
  // ...

  static bool isSupportedOpcode(const unsigned Opcode) {
    return Opcode == Instruction::Add || Opcode == Instruction::Sub ||
           Opcode == Instruction::LShr || Opcode == Instruction::Shl ||
           Opcode == Instruction::SDiv || Opcode == Instruction::UDiv ||
           Opcode == Instruction::And || Opcode == Instruction::Or ||
           Opcode == Instruction::Xor;
  }
    auto IsSupportedInstruction = [&](Instruction *I, bool AnyUndef) {
      if (AnyUndef && (I->isIntDivRem() || I->isFPDivRem() || isa<CallInst>(I)))
        // ...
      return I && isSupportedOpcode(I->getOpcode()) &&
             // ...
    };
    SmallDenseSet<Value *, 8> Operands;
    SmallMapVector<unsigned, SmallVector<Instruction *>, 4> Candidates;
    bool AnyUndef = false;
    for (Value *V : VL) {
      // ...
      if (Candidates.empty()) {
        Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
        // ...
        Operands.insert(I->op_begin(), I->op_end());
        continue;
      }
      if (Parent == I->getParent()) {
        Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
        Operands.insert(I->op_begin(), I->op_end());
        continue;
      }
      auto *NodeA = DT.getNode(Parent);
      auto *NodeB = DT.getNode(I->getParent());
      assert(NodeA && "Should only process reachable instructions");
      assert(NodeB && "Should only process reachable instructions");
      assert((NodeA == NodeB) ==
                 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
             "Different nodes should have different DFS numbers");
      if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn()) {
        Candidates.clear();
        Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
        // ...
        Operands.insert(I->op_begin(), I->op_end());
      }
    }
    unsigned BestOpcodeNum = 0;
    // ...
    bool UsedOutside = false;
    for (const auto &P : Candidates) {
      // ...
      if (UsedOutside && !PUsedOutside)
        continue;
      if (!UsedOutside && PUsedOutside)
        // ...
      if (P.second.size() < BestOpcodeNum)
        continue;
      // ...
      if (!PUsedOutside && any_of(P.second, [&](Instruction *I) {
            return Operands.contains(I);
          }))
        continue;
      UsedOutside = PUsedOutside;
      for (Instruction *I : P.second) {
        if (IsSupportedInstruction(I, AnyUndef)) {
          // ...
          BestOpcodeNum = P.second.size();
          // ...
        }
      }
    }
    // ...
      return I && I->getParent() == MainOp->getParent() &&
             // ...
  Value *selectBestIdempotentValue() const {
    assert(isSupportedOpcode(MainOpcode) && "Unsupported opcode");
    // ...
  }

  // ...
    if (!S.isCopyableElement(V))
      // ...
    assert(isSupportedOpcode(MainOpcode) && "Unsupported opcode");
    return {V, selectBestIdempotentValue()};
  // ...
                             SmallVectorImpl<BoUpSLP::ValueList> &Operands)
      const {
    // ...
    unsigned ShuffleOrOp =
        S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
    // ...
    switch (ShuffleOrOp) {
    case Instruction::PHI: {
      // ...
      PHIHandler Handler(DT, PH, VL);
      Handler.buildOperands();
      Operands.assign(PH->getNumOperands(), {});
      // ...
        Operands[I].assign(Handler.getOperands(I).begin(),
                           Handler.getOperands(I).end());
      // ...
    }
    case Instruction::ExtractValue:
    case Instruction::ExtractElement:
      // ...
    case Instruction::InsertElement:
      // ...
    case Instruction::Load:
      // ...
      for (auto [V, Op] : zip(VL, Operands.back())) {
        // ...
          Op = LI->getPointerOperand();
      }
      // ...
    case Instruction::ZExt:
    case Instruction::SExt:
    case Instruction::FPToUI:
    case Instruction::FPToSI:
    case Instruction::FPExt:
    case Instruction::PtrToInt:
    case Instruction::IntToPtr:
    case Instruction::SIToFP:
    case Instruction::UIToFP:
    case Instruction::Trunc:
    case Instruction::FPTrunc:
    case Instruction::BitCast:
    case Instruction::ICmp:
    case Instruction::FCmp:
    case Instruction::Select:
    case Instruction::FNeg:
    case Instruction::Add:
    case Instruction::FAdd:
    case Instruction::Sub:
    case Instruction::FSub:
    case Instruction::Mul:
    case Instruction::FMul:
    case Instruction::UDiv:
    case Instruction::SDiv:
    case Instruction::FDiv:
    case Instruction::URem:
    case Instruction::SRem:
    case Instruction::FRem:
    case Instruction::Shl:
    case Instruction::LShr:
    case Instruction::AShr:
    case Instruction::And:
    case Instruction::Or:
    case Instruction::Xor:
    case Instruction::Freeze:
    case Instruction::Store:
    case Instruction::ShuffleVector:
      // ...
      auto [Op, ConvertedOps] = convertTo(I, S);
      // ...
    case Instruction::GetElementPtr: {
      // ...
      const unsigned IndexIdx = 1;
      // ...
        return !GEP || VL0Ty == GEP->getOperand(IndexIdx)->getType();
      // ...
              ->getPointerOperandType()
              ->getScalarType());
      // ...
        Operands[0][Idx] = V;
        Operands[1][Idx] = ConstantInt::getNullValue(Ty);
        // ...
        Operands[0][Idx] = GEP->getPointerOperand();
        auto *Op = GEP->getOperand(IndexIdx);
        // ...
                CI, Ty, CI->getValue().isSignBitSet(), DL)
        // ...
    }
    case Instruction::Call: {
      // ...
      for (Value *V : VL) {
        // ...
        Ops.push_back(I ? I->getOperand(Idx)
                        // ...
      }
      // ...
    }
    }
  }

public:
  InstructionsCompatibilityAnalysis(DominatorTree &DT, const DataLayout &DL,
                                    const TargetTransformInfo &TTI,
                                    const TargetLibraryInfo &TLI)
      : DT(DT), DL(DL), TTI(TTI), TLI(TLI) {}

  InstructionsState
  buildInstructionsState(ArrayRef<Value *> VL, const BoUpSLP &R,
                         bool TryCopyableElementsVectorization,
                         bool WithProfitabilityCheck = false,
                         bool SkipSameCodeCheck = false) {
    InstructionsState S = (SkipSameCodeCheck || !allSameBlock(VL))
                              ? InstructionsState::invalid()
                              // ...
    // ...
    findAndSetMainInstruction(VL, R);
    if (!MainOp)
      return InstructionsState::invalid();
    S = InstructionsState(MainOp, MainOp, true);
    if (!WithProfitabilityCheck)
      return S;
    // ...
    auto BuildCandidates =
        [](SmallVectorImpl<std::pair<Value *, Value *>> &Candidates, Value *V1,
           Value *V2) {
          // ...
          if (I1 && I2 && I1->getOpcode() == I2->getOpcode() &&
              I1->getParent() != I2->getParent())
            // ...
        };
    if (VL.size() == 2) {
      // ...
      BuildCandidates(Candidates1, Operands[0][0], Operands[0][1]);
      BuildCandidates(Candidates2, Operands[1][0], Operands[1][1]);
      bool Res = !Candidates1.empty() && !Candidates2.empty() &&
                 R.findBestRootPair(Candidates1) &&
                 R.findBestRootPair(Candidates2);
      if (!Res) {
        Candidates1.clear();
        Candidates2.clear();
        BuildCandidates(Candidates1, Operands[0][0], Operands[1][1]);
        BuildCandidates(Candidates2, Operands[1][0], Operands[0][1]);
        Res = !Candidates1.empty() && !Candidates2.empty() &&
              R.findBestRootPair(Candidates1) &&
              R.findBestRootPair(Candidates2);
      }
      if (!Res)
        return InstructionsState::invalid();
      // ...
    }
    // ...
    FixedVectorType *VecTy =
        // ...
    switch (MainOpcode) {
    case Instruction::Add:
    case Instruction::Sub:
    case Instruction::LShr:
    case Instruction::Shl:
    case Instruction::SDiv:
    case Instruction::UDiv:
    case Instruction::And:
    case Instruction::Or:
    case Instruction::Xor:
      // ...
    }
    if (VectorCost > ScalarCost)
      return InstructionsState::invalid();
    // ...
    assert(Operands.size() == 2 && "Unexpected number of operands!");
    unsigned CopyableNum =
        count_if(VL, [&](Value *V) { return S.isCopyableElement(V); });
    if (CopyableNum < VL.size() / 2)
      return S;
    // ...
    const unsigned Limit = VL.size() / 24;
    if ((CopyableNum >= VL.size() - Limit ||
         (CopyableNum >= VL.size() - 1 && VL.size() > 4) ||
         // ...
      return InstructionsState::invalid();
    // ...
    for (auto &Ops : Operands) {
      // ...
        return InstructionsState::invalid();
      // ...
    }
    // ...
    auto CheckOperand = [&](ArrayRef<Value *> Ops) {
      // ...
      constexpr unsigned Limit = 4;
      if (Operands.front().size() >= Limit) {
        SmallDenseMap<const Value *, unsigned> Counters;
        // ...
              return C.second == 1;
        // ...
      }
      InstructionsCompatibilityAnalysis Analysis(DT, DL, TTI, TLI);
      InstructionsState OpS = Analysis.buildInstructionsState(
          // ...
      if (!OpS || (OpS.getOpcode() == Instruction::PHI && !allSameBlock(Ops)))
        return false;
      unsigned CopyableNum =
          // ...
      return CopyableNum <= VL.size() / 2;
    };
    if (!CheckOperand(Operands.front()))
      return InstructionsState::invalid();
    // ...
    return S;
  }

  // ...
    assert(S && "Invalid state!");
    // ...
    if (S.areInstructionsWithCopyableElements()) {
      MainOp = S.getMainOp();
      MainOpcode = S.getOpcode();
      // ...
        for (auto [OperandIdx, Operand] : enumerate(OperandsForValue))
          Operands[OperandIdx][Idx] = Operand;
      // ...
    }
    // ...
    buildOriginalOperands(S, VL, Operands);
    // ...
  }
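// Illustrative sketch (added for exposition; not part of the original
// source): "copyable elements" let a mixed bundle act as if every lane had
// the main opcode, by rewriting a lone value V into the idempotent form
// `V op identity` (e.g. V + 0, V | 0, V << 0), the idea behind
// selectBestIdempotentValue above. A minimal model of picking that identity
// for a supported integer opcode; `Ty` is assumed to be an integer type:
namespace slp_sketch {
inline Constant *identityFor(unsigned Opcode, Type *Ty) {
  switch (Opcode) {
  case Instruction::SDiv:
  case Instruction::UDiv:
    return ConstantInt::get(Ty, 1); // V / 1 == V.
  case Instruction::And:
    return Constant::getAllOnesValue(Ty); // V & ~0 == V.
  default:
    // Add/Sub/Or/Xor/Shl/LShr: V op 0 == V.
    return Constant::getNullValue(Ty);
  }
}
} // namespace slp_sketch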
BoUpSLP::ScalarsVectorizationLegality BoUpSLP::getScalarsVectorizationLegality(
    // ...
    bool TryCopyableElementsVectorization) const {
  // ...
  InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
  InstructionsState S = Analysis.buildInstructionsState(
      VL, *this, TryCopyableElementsVectorization,
      /*WithProfitabilityCheck=*/true, TryCopyableElementsVectorization);
  // ...
  bool AreScatterAllGEPSameBlock = false;
  // ...
    SmallVector<unsigned> SortedIndices;
    // ...
    bool IsScatterVectorizeUserTE =
        UserTreeIdx.UserTE &&
        UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
    AreScatterAllGEPSameBlock =
        // ...
                          *SE, SortedIndices));
    if (!AreScatterAllGEPSameBlock) {
      LLVM_DEBUG(dbgs() << "SLP: Try split and if failed, gathering due to "
                           "C,S,B,O, small shuffle. \n";
                 // ...
      return ScalarsVectorizationLegality(S, false,
                                          // ...
    }
    // ...
    assert(It != VL.end() && "Expected at least one GEP.");
    // ...
  assert(S && "Must be valid.");
  // ...
  BasicBlock *BB = S.getMainOp()->getParent();
  // ...
      !DT->isReachableFromEntry(BB)) {
    // ...
    return ScalarsVectorizationLegality(S, false);
  }
  // ...
    return ScalarsVectorizationLegality(S, false,
                                        // ...
  if (S.getOpcode() == Instruction::ExtractElement &&
      // ...
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n");
    return ScalarsVectorizationLegality(S, false);
  }
  // ...
      (S.isAltShuffle() || VL.size() < 4 ||
       // ...
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
    return ScalarsVectorizationLegality(S, false);
  }
  // ...
  LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.getMainOp() << ".\n");
  for (TreeEntry *E : getTreeEntries(S.getMainOp())) {
    if (E->isSame(VL)) {
      LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.getMainOp()
                        // ...
      return ScalarsVectorizationLegality(S, false);
    }
  }
  // ...
      (S.getOpcode() == Instruction::PHI && isa<PHINode>(V) &&
       LI->getLoopFor(S.getMainOp()->getParent()) &&
       // ...
    return ScalarsVectorizationLegality(S, false);
  // ...
  auto NotProfitableForVectorization = [&](ArrayRef<Value *> VL) {
    if (!S || !S.isAltShuffle() || VL.size() > 2)
      // ...
    SmallVector<unsigned, 8> InstsCount;
    for (Value *V : VL) {
      // ...
            return isa<Instruction>(Op) || isVectorLikeInstWithConstOps(Op);
      // ...
    }
    bool IsCommutative =
        // ...
    if ((IsCommutative &&
         std::accumulate(InstsCount.begin(), InstsCount.end(), 0) < 2) ||
        (// ...
         all_of(InstsCount, [](unsigned ICnt) { return ICnt < 2; })))
      // ...
    assert(VL.size() == 2 && "Expected only 2 alternate op instructions.");
    // ...
    for (int Op : seq<int>(S.getMainOp()->getNumOperands()))
      Candidates.emplace_back(I1->getOperand(Op), I2->getOperand(Op));
    if (static_cast<unsigned>(count_if(
            Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
              // ...
            })) >= S.getMainOp()->getNumOperands() / 2)
      // ...
    if (S.getMainOp()->getNumOperands() > 2)
      // ...
    if (IsCommutative) {
      // ...
      Candidates.clear();
      for (int Op = 0, E = S.getMainOp()->getNumOperands(); Op < E; ++Op)
        Candidates.emplace_back(I1->getOperand(Op),
                                I2->getOperand((Op + 1) % E));
      // ...
          Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
            // ...
    }
    // ...
  };
  bool AreAllSameBlock = !AreScatterAllGEPSameBlock;
  bool AreAllSameInsts = AreAllSameBlock || AreScatterAllGEPSameBlock;
  if (!AreAllSameInsts || isSplat(VL) ||
      // ...
      NotProfitableForVectorization(VL)) {
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n";
               // ...
    return ScalarsVectorizationLegality(S, false);
  }
11512 if (!EphValues.empty()) {
11513 for (
Value *V : VL) {
11514 if (EphValues.count(V)) {
11516 <<
") is ephemeral.\n");
11518 return ScalarsVectorizationLegality(S,
false,
11530 if (S.isAltShuffle()) {
11531 auto GetNumVectorizedExtracted = [&]() {
11537 all_of(
I->operands(), [&](
const Use &U) {
11538 return isa<ExtractElementInst>(U.get());
11543 else if (!
I->hasOneUser() && !areAllUsersVectorized(
I, UserIgnoreList))
11546 return std::make_pair(Vectorized, Extracted);
11548 auto [Vectorized, Extracted] = GetNumVectorizedExtracted();
11550 bool PreferScalarize = !Vectorized.
isAllOnes() && VL.size() == 2;
11551 if (!Vectorized.
isAllOnes() && !PreferScalarize) {
11554 Type *ScalarTy = VL.front()->getType();
11559 false,
true, Kind);
11561 *TTI, ScalarTy, VecTy, Vectorized,
11562 true,
false, Kind,
false);
11563 PreferScalarize = VectorizeCostEstimate > ScalarizeCostEstimate;
11565 if (PreferScalarize) {
11566 LLVM_DEBUG(
dbgs() <<
"SLP: The instructions are in tree and alternate "
11567 "node is not profitable.\n");
11568 return ScalarsVectorizationLegality(S,
false);
11573 if (UserIgnoreList && !UserIgnoreList->empty()) {
11574 for (
Value *V : VL) {
11575 if (UserIgnoreList->contains(V)) {
11576 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering due to gathered scalar.\n");
11577 return ScalarsVectorizationLegality(S,
false);
11582 return ScalarsVectorizationLegality(S,
true);
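/// Recursively builds the vectorizable tree for the bundle VL: tries a split
/// node for alternate opcodes, falls back to gather entries when legality or
/// scheduling fails, and otherwise dispatches on the (shuffle-or-)opcode to
/// create the matching TreeEntry kind and recurse into its operands.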
                          unsigned InterleaveFactor) {
  SmallVector<int> ReuseShuffleIndices;
  auto TrySplitNode = [&](const InstructionsState &LocalState) {
    if (!canBuildSplitNode(VL, LocalState, Op1, Op2, ReorderIndices))
    auto Invalid = ScheduleBundle::invalid();
    auto *TE = newTreeEntry(VL, TreeEntry::SplitVectorize, Invalid, LocalState,
                            UserTreeIdx, {}, ReorderIndices);
        getSameValuesTreeEntry(S.getMainOp(), Op, true))) {
      TE->CombinedEntriesWithIndices.emplace_back(VectorizableTree.size(),
                                                  Idx == 0 ? 0 : Op1.size());
      (void)newTreeEntry(Op, TreeEntry::NeedToGather, Invalid, S, {TE, Idx});
      TE->CombinedEntriesWithIndices.emplace_back(VectorizableTree.size(),
                                                  Idx == 0 ? 0 : Op1.size());
  bool AreConsts = false;
  for (Value *V : VL) {
  if (AreOnlyConstsWithPHIs(VL)) {
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to all constants and PHIs.\n");
    newGatherTreeEntry(VL, InstructionsState::invalid(), UserTreeIdx);
  ScalarsVectorizationLegality Legality = getScalarsVectorizationLegality(
      VL, Depth, UserTreeIdx, false);
  InstructionsState S = Legality.getInstructionsState();
  if (!Legality.isLegal()) {
    if (Legality.trySplitVectorize()) {
      if (MainOp && AltOp && TrySplitNode(InstructionsState(MainOp, AltOp)))
    Legality = getScalarsVectorizationLegality(
        VL, Depth, UserTreeIdx, true);
    if (!Legality.isLegal()) {
      if (Legality.tryToFindDuplicates())
      newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
    S = Legality.getInstructionsState();
  if (S.isAltShuffle() && TrySplitNode(S))
    newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
  bool IsScatterVectorizeUserTE =
      UserTreeIdx.UserTE &&
      UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
  StridedPtrInfo SPtrInfo;
  TreeEntry::EntryState State = getScalarsVectorizationState(
      S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps, SPtrInfo);
  if (State == TreeEntry::NeedToGather) {
    newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
  auto &BSRef = BlocksSchedules[BB];
    BSRef = std::make_unique<BlockScheduling>(BB);
  BlockScheduling &BS = *BSRef;
  std::optional<ScheduleBundle *> BundlePtr =
      BS.tryScheduleBundle(UniqueValues.getArrayRef(), this, S, UserTreeIdx);
#ifdef EXPENSIVE_CHECKS
  if (!BundlePtr || (*BundlePtr && !*BundlePtr.value())) {
    LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
    if (S.isAltShuffle() && ReuseShuffleIndices.empty() && TrySplitNode(S))
    newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
    NonScheduledFirst.insert(VL.front());
    if (S.getOpcode() == Instruction::Load &&
        BS.ScheduleRegionSize < BS.ScheduleRegionSizeLimit)
  InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
  ScheduleBundle Empty;
  ScheduleBundle &Bundle = BundlePtr.value() ? *BundlePtr.value() : Empty;
  LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
  unsigned ShuffleOrOp =
      S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
  auto CreateOperandNodes = [&](TreeEntry *TE, const auto &Operands) {
    SmallVector<unsigned> PHIOps;
      if ((!S || S.getOpcode() != Instruction::PHI) || S.isAltShuffle())
    for (unsigned I : PHIOps)
      buildTreeRec(Operands[I], Depth + 1, {TE, I});
  switch (ShuffleOrOp) {
  case Instruction::PHI: {
    newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
    TE->setOperands(Operands);
    CreateOperandNodes(TE, Operands);
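  // Extracts are handled specially: if they form a (possibly reordered)
  // contiguous sequence out of one source vector, the existing vector can be
  // reused or shuffled instead of being rebuilt lane by lane.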
  case Instruction::ExtractValue:
  case Instruction::ExtractElement: {
    if (CurrentOrder.empty()) {
      LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
      dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
      for (unsigned Idx : CurrentOrder)
        dbgs() << " " << Idx;
    TreeEntry *TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,
                                 ReuseShuffleIndices, CurrentOrder);
                      "(ExtractValueInst/ExtractElementInst).\n";
    TE->setOperands(Operands);
  case Instruction::InsertElement: {
    assert(ReuseShuffleIndices.empty() && "All inserts should be unique");
    auto OrdCompare = [](const std::pair<int, int> &P1,
                         const std::pair<int, int> &P2) {
      return P1.first > P2.first;
                        decltype(OrdCompare)>
        Indices(OrdCompare);
    for (int I = 0, E = VL.size(); I < E; ++I) {
      Indices.emplace(Idx, I);
    OrdersType CurrentOrder(VL.size(), VL.size());
    bool IsIdentity = true;
    for (int I = 0, E = VL.size(); I < E; ++I) {
      CurrentOrder[Indices.top().second] = I;
      IsIdentity &= Indices.top().second == I;
      CurrentOrder.clear();
    TreeEntry *TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (InsertElementInst).\n";
    TE->setOperands(Operands);
    buildTreeRec(TE->getOperand(1), Depth + 1, {TE, 1});
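  // Loads are the richest case: depending on the analysis result they become
  // a plain wide load (Vectorize), a masked load plus compress, a strided
  // load, or a masked gather (ScatterVectorize) over the pointer operands.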
  case Instruction::Load: {
    TreeEntry *TE = nullptr;
    case TreeEntry::Vectorize:
      TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,
                        ReuseShuffleIndices, CurrentOrder, InterleaveFactor);
      if (CurrentOrder.empty())
        LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (LoadInst).\n";
            << "SLP: added a new TreeEntry (jumbled LoadInst).\n";
    case TreeEntry::CompressVectorize:
      TE = newTreeEntry(VL, TreeEntry::CompressVectorize, Bundle, S,
                        UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
          << "SLP: added a new TreeEntry (masked LoadInst + compress).\n";
    case TreeEntry::StridedVectorize:
      TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
                        UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
      TreeEntryToStridedPtrInfoMap[TE] = SPtrInfo;
      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (strided LoadInst).\n";
    case TreeEntry::ScatterVectorize:
      TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
                        UserTreeIdx, ReuseShuffleIndices);
          << "SLP: added a new TreeEntry (non-consecutive LoadInst).\n";
    case TreeEntry::CombinedVectorize:
    case TreeEntry::SplitVectorize:
    case TreeEntry::NeedToGather:
    if (!CurrentOrder.empty() && State != TreeEntry::ScatterVectorize) {
      assert(Operands.size() == 1 && "Expected a single operand only");
      SmallVector<int> Mask;
    TE->setOperands(Operands);
    if (State == TreeEntry::ScatterVectorize)
      buildTreeRec(PointerOps, Depth + 1, {TE, 0});
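  // For casts, remember the extreme source/destination bitwidths seen so far;
  // they seed the MinBWs analysis that later tries to shrink the vector
  // element types.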
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    auto [PrevMaxBW, PrevMinBW] = CastMaxMinBWSizes.value_or(
        std::make_pair(std::numeric_limits<unsigned>::min(),
                       std::numeric_limits<unsigned>::max()));
    if (ShuffleOrOp == Instruction::ZExt ||
        ShuffleOrOp == Instruction::SExt) {
      CastMaxMinBWSizes = std::make_pair(
          std::max<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
          std::min<unsigned>(
    } else if (ShuffleOrOp == Instruction::Trunc) {
      CastMaxMinBWSizes = std::make_pair(
          std::max<unsigned>(
          std::min<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
    TreeEntry *TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,
                                 ReuseShuffleIndices);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CastInst).\n";
    TE->setOperands(Operands);
      buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
    if (ShuffleOrOp == Instruction::Trunc) {
      ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
    } else if (ShuffleOrOp == Instruction::SIToFP ||
               ShuffleOrOp == Instruction::UIToFP) {
      unsigned NumSignBits =
      APInt Mask = DB->getDemandedBits(OpI);
      NumSignBits = std::max(NumSignBits, Mask.countl_zero());
      if (NumSignBits * 2 >=
        ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
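  // Compares: operands may be swapped so every lane matches the canonical
  // predicate P0, and known sign bits are recorded so icmp operands can be
  // narrowed later.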
  case Instruction::ICmp:
  case Instruction::FCmp: {
    TreeEntry *TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,
                                 ReuseShuffleIndices);
           "Commutative Predicate mismatch");
    Operands.back() = Ops.getVL(1);
    if (Cmp->getPredicate() != P0)
    TE->setOperands(Operands);
    buildTreeRec(Operands.front(), Depth + 1, {TE, 0});
    buildTreeRec(Operands.back(), Depth + 1, {TE, 1});
    if (ShuffleOrOp == Instruction::ICmp) {
      unsigned NumSignBits0 =
      if (NumSignBits0 * 2 >=
        ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
      unsigned NumSignBits1 =
      if (NumSignBits1 * 2 >=
        ExtraBitWidthNodes.insert(getOperandEntry(TE, 1)->Idx);
  case Instruction::Select:
  case Instruction::FNeg:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::Freeze: {
    TreeEntry *TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,
                                 ReuseShuffleIndices);
    dbgs() << "SLP: added a new TreeEntry "
              "(SelectInst/UnaryOperator/BinaryOperator/FreezeInst).\n";
    Operands[0] = Ops.getVL(0);
    Operands[1] = Ops.getVL(1);
    TE->setOperands(Operands);
      buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
  case Instruction::GetElementPtr: {
    TreeEntry *TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,
                                 ReuseShuffleIndices);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (GetElementPtrInst).\n";
    TE->setOperands(Operands);
      buildTreeRec(Operands[I], Depth + 1, {TE, I});
  case Instruction::Store: {
    bool Consecutive = CurrentOrder.empty();
    TreeEntry *TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,
                                 ReuseShuffleIndices, CurrentOrder);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (StoreInst).\n";
    dbgs() << "SLP: added a new TreeEntry (jumbled StoreInst).\n";
    TE->setOperands(Operands);
    buildTreeRec(TE->getOperand(0), Depth + 1, {TE, 0});
  case Instruction::Call: {
    TreeEntry *TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,
                                 ReuseShuffleIndices);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CallInst).\n";
    Operands[0] = Ops.getVL(0);
    Operands[1] = Ops.getVL(1);
    TE->setOperands(Operands);
      buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
  case Instruction::ShuffleVector: {
    TreeEntry *TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,
                                 ReuseShuffleIndices);
    if (S.isAltShuffle()) {
      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (isAltShuffle).\n";
      dbgs() << "SLP: added a new TreeEntry (ShuffleVectorInst).\n";
           "Expected different main/alternate predicates.");
    TE->setOperands(Operands);
    buildTreeRec(Operands.front(), Depth + 1, {TE, 0});
    buildTreeRec(Operands.back(), Depth + 1, {TE, 1});
    Operands[0] = Ops.getVL(0);
    Operands[1] = Ops.getVL(1);
    TE->setOperands(Operands);
      buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
  for (const auto *Ty : ST->elements())
    if (Ty != *ST->element_begin())
  N *= ST->getNumElements();
  EltTy = *ST->element_begin();
  N *= AT->getNumElements();
  EltTy = AT->getElementType();
  N *= VT->getNumElements();
  EltTy = VT->getElementType();
  size_t VTSize = DL->getTypeStoreSizeInBits(getWidenedType(EltTy, N));
  if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
      VTSize != DL->getTypeStoreSizeInBits(T))
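/// Checks whether the extract sequence in VL reads distinct lanes of a single
/// source vector and computes the order in which the lanes are used;
/// CurrentOrder is left empty when the order is the identity.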
                                 bool ResizeAllowed) const {
  assert(It != VL.end() && "Expected at least one extract instruction.");
  Value *Vec = E0->getOperand(0);
  CurrentOrder.clear();
  if (E0->getOpcode() == Instruction::ExtractValue) {
  unsigned E = VL.size();
  if (!ResizeAllowed && NElts != E)
  unsigned MinIdx = NElts, MaxIdx = 0;
    if (Inst->getOperand(0) != Vec)
    const unsigned ExtIdx = *Idx;
    if (ExtIdx >= NElts)
    Indices[I] = ExtIdx;
    if (MinIdx > ExtIdx)
    if (MaxIdx < ExtIdx)
  if (MaxIdx - MinIdx + 1 > E)
  if (MaxIdx + 1 <= E)
  bool ShouldKeepOrder = true;
  for (unsigned I = 0; I < E; ++I) {
    const unsigned ExtIdx = Indices[I] - MinIdx;
    if (CurrentOrder[ExtIdx] != E) {
      CurrentOrder.clear();
    ShouldKeepOrder &= ExtIdx == I;
    CurrentOrder[ExtIdx] = I;
  if (ShouldKeepOrder)
    CurrentOrder.clear();
  return ShouldKeepOrder;
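/// Returns true if every user of instruction I is either already vectorized,
/// vector-like with constant operands, or an extractelement scheduled for
/// gathering, i.e. the scalar will be dead after vectorization.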
bool BoUpSLP::areAllUsersVectorized(
    Instruction *I, const SmallDenseSet<Value *> *VectorizedVals) const {
  return (I->hasOneUse() && (!VectorizedVals || VectorizedVals->contains(I))) ||
         all_of(I->users(), [this](User *U) {
           return isVectorized(U) || isVectorLikeInstWithConstOps(U) ||
                  (isa<ExtractElementInst>(U) && MustGather.contains(U));
12258 const function_ref<
bool(Instruction *)> IsAltOp, SmallVectorImpl<int> &Mask,
12259 SmallVectorImpl<Value *> *OpScalars,
12260 SmallVectorImpl<Value *> *AltScalars)
const {
12261 unsigned Sz = Scalars.size();
12263 SmallVector<int> OrderMask;
12264 if (!ReorderIndices.empty())
12266 for (
unsigned I = 0;
I < Sz; ++
I) {
12268 if (!ReorderIndices.empty())
12269 Idx = OrderMask[
I];
12273 if (IsAltOp(OpInst)) {
12274 Mask[
I] = Sz + Idx;
12283 if (!ReuseShuffleIndices.
empty()) {
12285 transform(ReuseShuffleIndices, NewMask.
begin(), [&Mask](
int Idx) {
12286 return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem;
12288 Mask.swap(NewMask);
12295 return InstructionsState(MainOp, AltOp).getMatchingMainOpOrAltOp(
I) == MainOp;
12305 assert(MainP != AltP &&
"Expected different main/alternate predicates.");
12314 assert((MainP ==
P || AltP ==
P || MainP == SwappedP || AltP == SwappedP) &&
12315 "CmpInst expected to match either main or alternate predicate or "
12317 return MainP !=
P && MainP != SwappedP;
12319 return InstructionsState(MainOp, AltOp).getMatchingMainOpOrAltOp(
I) == AltOp;
12324 const auto *Op0 =
Ops.front();
12337 return CI->getValue().isPowerOf2();
12343 return CI->getValue().isNegatedPowerOf2();
12348 if (IsConstant && IsUniform)
12350 else if (IsConstant)
12352 else if (IsUniform)
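/// Common base for shuffle emission and shuffle cost estimation: tracks the
/// scalar type, folds chains of shufflevectors into a single mask
/// (peekThroughShuffles/combineMasks), and materializes the result through a
/// templated ShuffleBuilderTy, so the same logic drives both IR emission and
/// cost modeling.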
class BaseShuffleAnalysis {
  Type *ScalarTy = nullptr;
  BaseShuffleAnalysis(Type *ScalarTy) : ScalarTy(ScalarTy) {}
  unsigned getVF(Value *V) const {
    assert(V && "V cannot be nullptr");
           "V does not have FixedVectorType");
    assert(ScalarTy && "ScalarTy cannot be nullptr");
    unsigned VNumElements =
    assert(VNumElements > ScalarTyNumElements &&
           "the number of elements of V is not large enough");
    assert(VNumElements % ScalarTyNumElements == 0 &&
           "the number of elements of V is not a vectorized value");
    return VNumElements / ScalarTyNumElements;
  static bool isIdentityMask(ArrayRef<int> Mask, const FixedVectorType *VecTy,
    int Limit = Mask.size();
    if (Limit % VF == 0 && all_of(seq<int>(0, Limit / VF), [=](int Idx) {
          ArrayRef<int> Slice = Mask.slice(Idx * VF, VF);
  static void combineMasks(unsigned LocalVF, SmallVectorImpl<int> &Mask,
                           ArrayRef<int> ExtMask) {
    unsigned VF = Mask.size();
    for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
      int MaskedIdx = Mask[ExtMask[I] % VF];
    Mask.swap(NewMask);
  static bool peekThroughShuffles(Value *&V, SmallVectorImpl<int> &Mask,
                                  bool SinglePermute) {
    ShuffleVectorInst *IdentityOp = nullptr;
    SmallVector<int> IdentityMask;
      if (isIdentityMask(Mask, SVTy, false)) {
        if (!IdentityOp || !SinglePermute ||
            (isIdentityMask(Mask, SVTy, true) &&
             IdentityMask.size()))) {
          IdentityMask.assign(Mask);
      if (SV->isZeroEltSplat()) {
        IdentityMask.assign(Mask);
      int LocalVF = Mask.size();
        LocalVF = SVOpTy->getNumElements();
          static_cast<unsigned>(I) >= SV->getShuffleMask().size())
        ExtMask[Idx] = SV->getMaskValue(I);
      if (!IsOp1Undef && !IsOp2Undef) {
        for (int &I : Mask) {
          if (SV->getMaskValue(I % SV->getShuffleMask().size()) ==
        SmallVector<int> ShuffleMask(SV->getShuffleMask());
        combineMasks(LocalVF, ShuffleMask, Mask);
        Mask.swap(ShuffleMask);
        Op = SV->getOperand(0);
        Op = SV->getOperand(1);
        !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute) ||
               "Expected masks of same sizes.");
        Mask.swap(IdentityMask);
        return SinglePermute &&
               (Shuffle && Mask.size() == Shuffle->getShuffleMask().size() &&
                Shuffle->isZeroEltSplat() &&
                Shuffle->getShuffleMask()[P.index()] == 0;
  template <typename T, typename ShuffleBuilderTy>
  static T createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask,
                         ShuffleBuilderTy &Builder, Type *ScalarTy) {
    assert(V1 && "Expected at least one vector value.");
    SmallVector<int> NewMask(Mask);
    if (ScalarTyNumElements != 1) {
      Builder.resizeToMatch(V1, V2);
      int VF = Mask.size();
        VF = FTy->getNumElements();
      for (int I = 0, E = Mask.size(); I < E; ++I) {
          CombinedMask1[I] = Mask[I];
          CombinedMask2[I] = Mask[I] - VF;
        (void)peekThroughShuffles(Op1, CombinedMask1, false);
        (void)peekThroughShuffles(Op2, CombinedMask2, false);
          for (auto [Idx, I] : enumerate(CombinedMask1)) {
            ExtMask1[Idx] = SV1->getMaskValue(I);
                  ->getNumElements(),
              ExtMask1, UseMask::SecondArg);
          SmallVector<int> ExtMask2(CombinedMask2.size(), PoisonMaskElem);
          for (auto [Idx, I] : enumerate(CombinedMask2)) {
            ExtMask2[Idx] = SV2->getMaskValue(I);
                  ->getNumElements(),
              ExtMask2, UseMask::SecondArg);
          if (SV1->getOperand(0)->getType() ==
                  SV2->getOperand(0)->getType() &&
              SV1->getOperand(0)->getType() != SV1->getType() &&
            Op1 = SV1->getOperand(0);
            Op2 = SV2->getOperand(0);
            SmallVector<int> ShuffleMask1(SV1->getShuffleMask());
            int LocalVF = ShuffleMask1.size();
              LocalVF = FTy->getNumElements();
            combineMasks(LocalVF, ShuffleMask1, CombinedMask1);
            CombinedMask1.swap(ShuffleMask1);
            SmallVector<int> ShuffleMask2(SV2->getShuffleMask());
            LocalVF = ShuffleMask2.size();
              LocalVF = FTy->getNumElements();
            combineMasks(LocalVF, ShuffleMask2, CombinedMask2);
            CombinedMask2.swap(ShuffleMask2);
      } while (PrevOp1 != Op1 || PrevOp2 != Op2);
      Builder.resizeToMatch(Op1, Op2);
              ->getElementCount()
              .getKnownMinValue(),
              ->getElementCount()
              .getKnownMinValue());
      for (int I = 0, E = Mask.size(); I < E; ++I) {
               "Expected undefined mask element");
        CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF);
        return Builder.createIdentity(Op1);
      return Builder.createShuffleVector(
      return Builder.createPoison(
    bool IsIdentity = peekThroughShuffles(V1, NewMask, true);
    assert(V1 && "Expected non-null value after looking through shuffles.");
      return Builder.createShuffleVector(V1, NewMask);
    return Builder.createIdentity(V1);
                                      ArrayRef<int> Mask) {
static std::pair<InstructionCost, InstructionCost>
  if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
    ScalarCost = TTI.getPointersChainCost(
        Ptrs, BasePtr, TTI::PointersChainInfo::getUnitStride(), ScalarTy,
    for (Value *V : Ptrs) {
      if (V == BasePtr) {
    if (PtrsRetainedInVecCode.size() == Ptrs.size()) {
    VecCost = TTI.getPointersChainCost(PtrsRetainedInVecCode, BasePtr,
                                       TTI::PointersChainInfo::getKnownStride(),
        [](const Value *V) {
          return Ptr && !Ptr->hasAllConstantIndices();
        ? TTI::PointersChainInfo::getUnknownStride()
        : TTI::PointersChainInfo::getKnownStride();
    TTI.getPointersChainCost(Ptrs, BasePtr, PtrsInfo, ScalarTy, CostKind);
    if (It != Ptrs.end())
    VecCost = TTI.getGEPCost(BaseGEP->getSourceElementType(),
                             BaseGEP->getPointerOperand(), Indices, VecTy,
  return std::make_pair(ScalarCost, VecCost);
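/// Tries to reorder the scalars of a gather node so that equal or related
/// values (e.g. loads from nearby addresses) become adjacent, and keeps the
/// new order only if the resulting reshuffle is cheaper than the original
/// build vector.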
void BoUpSLP::reorderGatherNode(TreeEntry &TE) {
  assert(TE.isGather() && TE.ReorderIndices.empty() &&
         "Expected gather node without reordering.");
  SmallSet<size_t, 2> LoadKeyUsed;
  if (TE.Scalars.size() == 2 || (TE.hasState() && !TE.isAltShuffle()) ||
        return VectorizableTree[Idx]->isSame(TE.Scalars);
  auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
    auto LIt = LoadsMap.find(std::make_pair(Key, Ptr));
    if (LIt != LoadsMap.end()) {
      for (LoadInst *RLI : LIt->second) {
            LI->getType(), LI->getPointerOperand(), *DL, *SE,
      for (LoadInst *RLI : LIt->second) {
            LI->getPointerOperand(), *TLI)) {
      if (LIt->second.size() > 2) {
            hash_value(LIt->second.back()->getPointerOperand());
    LoadsMap.try_emplace(std::make_pair(Key, Ptr)).first->second.push_back(LI);
  MapVector<size_t, MapVector<size_t, SmallVector<Value *>>> SortedValues;
  SmallDenseMap<Value *, SmallVector<unsigned>, 8> KeyToIndex;
  bool IsOrdered = true;
  unsigned NumInstructions = 0;
    size_t Key = 1, Idx = 1;
    auto &Container = SortedValues[Key];
    if (IsOrdered && !KeyToIndex.contains(V) &&
        ((Container.contains(Idx) &&
          KeyToIndex.at(Container[Idx].back()).back() != I - 1) ||
         (!Container.empty() && !Container.contains(Idx) &&
          KeyToIndex.at(Container.back().second.back()).back() != I - 1)))
    auto &KTI = KeyToIndex[V];
    Container[Idx].push_back(V);
  if (!IsOrdered && NumInstructions > 1) {
    TE.ReorderIndices.resize(TE.Scalars.size(), TE.Scalars.size());
    for (const auto &D : SortedValues) {
      for (const auto &P : D.second) {
        for (Value *V : P.second) {
          ArrayRef<unsigned> Indices = KeyToIndex.at(V);
          for (auto [K, Idx] : enumerate(Indices)) {
            TE.ReorderIndices[Cnt + K] = Idx;
            TE.Scalars[Cnt + K] = V;
          Sz += Indices.size();
          Cnt += Indices.size();
              *TTI, TE.Scalars.front()->getType(), Sz);
        } else if (!P.second.empty() && isConstant(P.second.front())) {
  if (!TE.ReuseShuffleIndices.empty() || TE.ReorderIndices.empty())
  auto *ScalarTy = TE.Scalars.front()->getType();
  for (auto [Idx, Sz] : SubVectors) {
  int Sz = TE.Scalars.size();
  SmallVector<int> ReorderMask(TE.ReorderIndices.begin(),
                               TE.ReorderIndices.end());
    ReorderMask[I] = I + TE.ReorderIndices.size();
      any_of(ReorderMask, [&](int I) { return I >= Sz; })
      VecTy, ReorderMask);
      DemandedElts.clearBit(I);
      ReorderMask[I] = I;
      ReorderMask[I] = I + Sz;
  if (!DemandedElts.isAllOnes())
  if (Cost >= BVCost) {
    SmallVector<int> Mask(TE.ReorderIndices.begin(), TE.ReorderIndices.end());
    TE.ReorderIndices.clear();
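/// Checks whether an add/sub bundle over fmul operands can be contracted into
/// fused multiply-adds: the fast-math flags of all involved instructions are
/// accumulated, the fmuls must be single-use, and the combined fmul+fadd cost
/// is compared against the fused cost.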
                                const InstructionsState &S,
        return V->getType()->getScalarType()->isFloatingPointTy();
         "Can only convert to FMA for floating point types");
  assert(S.isAddSubLikeOp() && "Can only convert to FMA for add/sub");
    for (Value *V : VL) {
      if (S.isCopyableElement(I))
      Instruction *MatchingI = S.getMatchingMainOpOrAltOp(I);
      if (S.getMainOp() != MatchingI && S.getAltOp() != MatchingI)
      FMF &= FPCI->getFastMathFlags();
  if (!CheckForContractable(VL))
  InstructionsCompatibilityAnalysis Analysis(DT, DL, TTI, TLI);
  if (OpS.isAltShuffle() || OpS.getOpcode() != Instruction::FMul)
  if (!CheckForContractable(Operands.front()))
  for (Value *V : VL) {
    if (!S.isCopyableElement(I))
    FMF &= FPCI->getFastMathFlags();
    FMulPlusFAddCost += TTI.getInstructionCost(I, CostKind);
  for (auto [V, Op] : zip(VL, Operands.front())) {
    if (S.isCopyableElement(V))
    if (!I || !I->hasOneUse() || OpS.isCopyableElement(I)) {
      FMACost += TTI.getInstructionCost(OpI, CostKind);
    FMF &= FPCI->getFastMathFlags();
    FMulPlusFAddCost += TTI.getInstructionCost(I, CostKind);
  BaseGraphSize = VectorizableTree.size();
  class GraphTransformModeRAAI {
    bool &SavedIsGraphTransformMode;
    GraphTransformModeRAAI(bool &IsGraphTransformMode)
        : SavedIsGraphTransformMode(IsGraphTransformMode) {
      IsGraphTransformMode = true;
    ~GraphTransformModeRAAI() { SavedIsGraphTransformMode = false; }
  } TransformContext(IsGraphTransformMode);
                                      const InstructionsState &S) {
                            I2->getOperand(Op));
        Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
          [](const std::pair<Value *, Value *> &P) {
    TreeEntry &E = *VectorizableTree[Idx];
      reorderGatherNode(E);
  constexpr unsigned VFLimit = 16;
  bool ForceLoadGather =
      count_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
        return TE->isGather() && TE->hasState() &&
               TE->getOpcode() == Instruction::Load &&
               TE->getVectorFactor() < VFLimit;
    return TE->isSame(VL) || all_of(VL, [&](Value *V) {
  auto CheckForSameVectorNodes = [&](const TreeEntry &E) {
    if (E.hasState()) {
        !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
          return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
            ArrayRef<TreeEntry *> VTEs = getTreeEntries(V);
            return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
              return is_contained(TEs, TE);
        !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
          return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
            ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V);
            return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
              return is_contained(TEs, TE);
      if (It != E.Scalars.end()) {
          !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
            return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
              ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V);
              return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
                return is_contained(TEs, TE);
  for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
    TreeEntry &E = *VectorizableTree[Idx];
    if (E.isGather()) {
      unsigned MinVF = getMinVF(2 * Sz);
      if (VL.size() <= 2 || LoadEntriesToVectorize.contains(Idx) ||
          !(!E.hasState() || E.getOpcode() == Instruction::Load ||
      if (ForceLoadGather && E.hasState() && E.getOpcode() == Instruction::Load)
      if (CheckForSameVectorNodes(E))
      unsigned StartIdx = 0;
      unsigned End = VL.size();
          *TTI, VL.front()->getType(), VL.size() - 1);
           *TTI, VL.front()->getType(), VF - 1)) {
        if (StartIdx + VF > End)
        bool AllStrided = true;
        for (unsigned Cnt = StartIdx; Cnt + VF <= End; Cnt += VF) {
              !getSameValuesTreeEntry(Slice.front(), Slice, true))
          bool IsSplat = isSplat(Slice);
          bool IsTwoRegisterSplat = true;
          if (IsSplat && VF == 2) {
            IsTwoRegisterSplat = NumRegs2VF == 2;
          if (Slices.empty() || !IsSplat || !IsTwoRegisterSplat ||
              (S.getOpcode() == Instruction::Load &&
              (S.getOpcode() != Instruction::Load &&
            if ((!UserIgnoreList || E.Idx != 0) &&
                TTI->getInstructionCost(S.getMainOp(), CostKind) <
              if (S.getOpcode() == Instruction::Load) {
                StridedPtrInfo SPtrInfo;
                PointerOps, SPtrInfo);
              if (UserIgnoreList && E.Idx == 0)
            } else if (S.getOpcode() == Instruction::ExtractElement ||
                       (TTI->getInstructionCost(S.getMainOp(), CostKind) <
                        !CheckOperandsProfitability(
        if (VF == 2 && AllStrided && Slices.size() > 2)
        auto AddCombinedNode = [&](unsigned Idx, unsigned Cnt, unsigned Sz) {
          E.CombinedEntriesWithIndices.emplace_back(Idx, Cnt);
          if (StartIdx == Cnt)
            StartIdx = Cnt + Sz;
          if (End == Cnt + Sz)
        for (auto [Cnt, Sz] : Slices) {
          const TreeEntry *SameTE = nullptr;
              It != Slice.end()) {
            SameTE = getSameValuesTreeEntry(*It, Slice);
          unsigned PrevSize = VectorizableTree.size();
          [[maybe_unused]] unsigned PrevEntriesSize =
              LoadEntriesToVectorize.size();
          buildTreeRec(Slice, 0, EdgeInfo(&E, UINT_MAX));
          if (PrevSize + 1 == VectorizableTree.size() && !SameTE &&
              VectorizableTree[PrevSize]->isGather() &&
              VectorizableTree[PrevSize]->hasState() &&
              VectorizableTree[PrevSize]->getOpcode() !=
                  Instruction::ExtractElement &&
            if (UserIgnoreList && E.Idx == 0 && VF == 2)
            VectorizableTree.pop_back();
            assert(PrevEntriesSize == LoadEntriesToVectorize.size() &&
                   "LoadEntriesToVectorize expected to remain the same");
          AddCombinedNode(PrevSize, Cnt, Sz);
    if (E.CombinedEntriesWithIndices.empty() && !E.ReorderIndices.empty()) {
      SmallVector<int> Mask(E.ReorderIndices.begin(), E.ReorderIndices.end());
      E.ReorderIndices.clear();
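    // Per-opcode rewrites of already-vectorized nodes, driven by target
    // costs: loads/stores may become constant-stride accesses (expressed via
    // the experimental_vp_strided_{load,store} intrinsics), reordered stores
    // may be marked interleaved, and select/fadd trees become combined
    // MinMax/FMulAdd operations.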
    switch (E.getOpcode()) {
    case Instruction::Load: {
      if (E.State != TreeEntry::Vectorize)
      Type *ScalarTy = E.getMainOp()->getType();
          TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
        SmallVector<int> Mask;
            TTI->getMemoryOpCost(Instruction::Load, VecTy, BaseLI->getAlign(),
                                 BaseLI->getPointerAddressSpace(), CostKind,
            MemIntrinsicCostAttributes(Intrinsic::experimental_vp_strided_load,
                                       VecTy, BaseLI->getPointerOperand(),
                                       false, CommonAlignment,
              ->getPointerOperand()
          StridedPtrInfo SPtrInfo;
          SPtrInfo.StrideVal = ConstantInt::get(StrideTy, 1);
          SPtrInfo.Ty = VecTy;
          TreeEntryToStridedPtrInfoMap[&E] = SPtrInfo;
          E.State = TreeEntry::StridedVectorize;
    case Instruction::Store: {
          TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
        SmallVector<int> Mask;
            TTI->getMemoryOpCost(Instruction::Store, VecTy, BaseSI->getAlign(),
                                 BaseSI->getPointerAddressSpace(), CostKind,
            MemIntrinsicCostAttributes(Intrinsic::experimental_vp_strided_store,
                                       VecTy, BaseSI->getPointerOperand(),
                                       false, CommonAlignment,
        if (StridedCost < OriginalVecCost)
          E.State = TreeEntry::StridedVectorize;
      } else if (!E.ReorderIndices.empty()) {
        auto IsInterleaveMask = [&, &TTI = *TTI](ArrayRef<int> Mask) {
          assert(Mask.size() > 1 && "Expected mask greater than 1 element.");
          if (Mask.size() < 4)
                  Mask, Factor, VecTy->getElementCount().getFixedValue()) &&
              TTI.isLegalInterleavedAccessType(
                  VecTy, Factor, BaseSI->getAlign(),
                  BaseSI->getPointerAddressSpace()))
        SmallVector<int> Mask(E.ReorderIndices.begin(), E.ReorderIndices.end());
        unsigned InterleaveFactor = IsInterleaveMask(Mask);
        if (InterleaveFactor != 0)
          E.setInterleave(InterleaveFactor);
    case Instruction::Select: {
      if (E.State != TreeEntry::Vectorize)
      E.CombinedOp = TreeEntry::MinMax;
      TreeEntry *CondEntry = getOperandEntry(&E, 0);
      if (SelectOnly && CondEntry->UserTreeIndex &&
          CondEntry->State == TreeEntry::Vectorize) {
        CondEntry->State = TreeEntry::CombinedVectorize;
    case Instruction::FSub:
    case Instruction::FAdd: {
      if (E.State != TreeEntry::Vectorize ||
          !E.getOperations().isAddSubLikeOp())
      E.CombinedOp = TreeEntry::FMulAdd;
      TreeEntry *FMulEntry = getOperandEntry(&E, 0);
      if (FMulEntry->UserTreeIndex &&
          FMulEntry->State == TreeEntry::Vectorize) {
        FMulEntry->State = TreeEntry::CombinedVectorize;
  if (LoadEntriesToVectorize.empty()) {
    if (VectorizableTree.size() <= 1 && VectorizableTree.front()->hasState() &&
        VectorizableTree.front()->getOpcode() == Instruction::Load)
    constexpr unsigned SmallTree = 3;
    constexpr unsigned SmallVF = 2;
    if ((VectorizableTree.size() <= SmallTree &&
         VectorizableTree.front()->Scalars.size() == SmallVF) ||
        (VectorizableTree.size() <= 2 && UserIgnoreList))
    if (VectorizableTree.front()->isNonPowOf2Vec() &&
        [](const std::unique_ptr<TreeEntry> &TE) {
          return TE->isGather() && TE->hasState() &&
                 TE->getOpcode() == Instruction::Load &&
  SmallMapVector<std::tuple<BasicBlock *, Value *, Type *>,
  for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    TreeEntry &E = *TE;
    if (E.isGather() &&
        ((E.hasState() && E.getOpcode() == Instruction::Load) ||
         (!E.hasState() && any_of(E.Scalars,
           return isa<LoadInst>(V) &&
                  !isVectorized(V) &&
                  !isDeleted(cast<Instruction>(V));
      for (Value *V : E.Scalars) {
            *this, V, *DL, *SE, *TTI,
            GatheredLoads[std::make_tuple(
  if (!GatheredLoads.empty())
    tryToVectorizeGatheredLoads(GatheredLoads);
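/// Cost-modeling twin of the shuffle instruction emitter: merges per-node
/// masks the same way the builder would, but accumulates the TTI shuffle,
/// insert, and extract costs instead of creating instructions.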
  bool IsFinalized = false;
  bool SameNodesEstimated = true;
    if (Ty->getScalarType()->isPointerTy()) {
          DL.getTypeStoreSizeInBits(Ty->getScalarType()))),
          Ty->getScalarType());
    assert(It != VL.end() && "Expected at least one non-undef value.");
        count(VL, *It) > 1 &&
    if (!NeedShuffle) {
      return TTI.getShuffleCost(
      return TTI.getVectorInstrCost(Instruction::InsertElement, VecTy,
                                    CostKind, std::distance(VL.begin(), It),
      return isa<PoisonValue>(V) ? PoisonMaskElem : 0;
    TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, 0,
        VecTy, ShuffleMask, CostKind,
    return GatherCost +
           : R.getGatherCost(Gathers, !Root && VL.equals(Gathers),
      ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
      unsigned NumParts) {
    assert(VL.size() > NumParts && "Unexpected scalarized shuffle.");
        std::accumulate(VL.begin(), VL.end(), 0, [](unsigned Sz, Value *V) {
          auto *EE = dyn_cast<ExtractElementInst>(V);
          auto *VecTy = dyn_cast<FixedVectorType>(EE->getVectorOperandType());
          return std::max(Sz, VecTy->getNumElements());
        -> std::optional<TTI::ShuffleKind> {
      if (NumElts <= EltsPerVector)
        return std::nullopt;
          alignDown(std::accumulate(Mask.begin(), Mask.end(), INT_MAX,
                      if (I == PoisonMaskElem)
                      return std::min(S, I);
      int OffsetReg1 = OffsetReg0;
      int FirstRegId = -1;
      Indices.assign(1, OffsetReg0);
        int Idx = I - OffsetReg0;
            (Idx / NumElts) * NumParts + (Idx % NumElts) / EltsPerVector;
        if (FirstRegId < 0)
          FirstRegId = RegId;
        RegIndices.insert(RegId);
        if (RegIndices.size() > 2)
          return std::nullopt;
        if (RegIndices.size() == 2) {
          if (Indices.size() == 1) {
                std::next(Mask.begin(), Pos), Mask.end(), INT_MAX,
                [&](int S, int I) {
                  if (I == PoisonMaskElem)
                  int RegId = ((I - OffsetReg0) / NumElts) * NumParts +
                              ((I - OffsetReg0) % NumElts) / EltsPerVector;
                  if (RegId == FirstRegId)
                  return std::min(S, I);
            unsigned Index = OffsetReg1 % NumElts;
            Indices.push_back(Index);
            SubVecSizes.push_back(std::min(NumElts - Index, EltsPerVector));
          Idx = I - OffsetReg1;
        I = (Idx % NumElts) % EltsPerVector +
            (RegId == FirstRegId ? 0 : EltsPerVector);
      return ShuffleKind;
      if (!ShuffleKinds[Part])
          Part * EltsPerVector, getNumElems(Mask.size(), EltsPerVector, Part));
      std::optional<TTI::ShuffleKind> RegShuffleKind =
          CheckPerRegistersShuffle(SubMask, Indices, SubVecSizes);
      if (!RegShuffleKind) {
            MaskSlice, std::max<unsigned>(NumElts, MaskSlice.size())))
          *R.TTI, VL.front()->getType(), alignTo(NumElts, EltsPerVector));
      for (const auto [Idx, SubVecSize] : zip(Indices, SubVecSizes)) {
        assert((Idx + SubVecSize) <= BaseVF &&
               "SK_ExtractSubvector index out of range");
        TTI, *ShuffleKinds[Part], getWidenedType(ScalarTy, NumElts), SubMask);
    if (OriginalCost < Cost)
      Cost = OriginalCost;
  void estimateNodesPermuteCost(const TreeEntry &E1, const TreeEntry *E2,
                                unsigned SliceSize) {
    if (SameNodesEstimated) {
      if ((InVectors.size() == 2 &&
      unsigned Limit = getNumElems(Mask.size(), SliceSize, Part);
             "Expected all poisoned elements.");
      copy(SubMask, std::next(CommonMask.begin(), SliceSize * Part));
      Cost += createShuffle(InVectors.front(),
                            InVectors.size() == 1 ? nullptr : InVectors.back(),
      transformMaskAfterShuffle(CommonMask, CommonMask);
    } else if (InVectors.size() == 2) {
      Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    SameNodesEstimated = false;
    if (!E2 && InVectors.size() == 1) {
      unsigned VF = E1.getVectorFactor();
        VF = std::max(VF, getVF(V1));
        VF = std::max(VF, E->getVectorFactor());
      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
          CommonMask[Idx] = Mask[Idx] + VF;
      Cost += createShuffle(InVectors.front(), &E1, CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
      auto P = InVectors.front();
      Cost += createShuffle(&E1, E2, Mask);
      unsigned VF = Mask.size();
        VF = std::max(VF, E->getVectorFactor());
      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
          CommonMask[Idx] = Idx + (InVectors.empty() ? 0 : VF);
      Cost += createShuffle(P, InVectors.front(), CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
  class ShuffleCostBuilder {
    static bool isEmptyOrIdentity(ArrayRef<int> Mask, unsigned VF) {
      return Mask.empty() ||
             (VF == Mask.size() &&
    ~ShuffleCostBuilder() = default;
      if (isEmptyOrIdentity(Mask, VF))
      if (isEmptyOrIdentity(Mask, VF))
    void resizeToMatch(Value *&, Value *&) const {}
    ShuffleCostBuilder Builder(TTI);
    unsigned CommonVF = Mask.size();
    auto GetNodeMinBWAffectedCost = [&](const TreeEntry &E,
      Type *EScalarTy = E.Scalars.front()->getType();
      bool IsSigned = true;
      if (auto It = R.MinBWs.find(&E); It != R.MinBWs.end()) {
        IsSigned = It->second.second;
      if (EScalarTy != ScalarTy) {
        unsigned CastOpcode = Instruction::Trunc;
        unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
        unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
          CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
        return TTI.getCastInstrCost(CastOpcode, getWidenedType(ScalarTy, VF),
      Type *EScalarTy = VecTy->getElementType();
      if (EScalarTy != ScalarTy) {
        unsigned CastOpcode = Instruction::Trunc;
        unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
        unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
          CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
        return TTI.getCastInstrCost(
    if (!V1 && !V2 && !P2.isNull()) {
      unsigned VF = E->getVectorFactor();
      CommonVF = std::max(VF, E2->getVectorFactor());
        return Idx < 2 * static_cast<int>(CommonVF);
             "All elements in mask must be less than 2 * CommonVF.");
      if (E->Scalars.size() == E2->Scalars.size()) {
        for (int &Idx : CommonMask) {
          if (Idx < static_cast<int>(CommonVF) && !EMask.empty())
          else if (Idx >= static_cast<int>(CommonVF))
            Idx = (E2Mask.empty() ? Idx - CommonVF : E2Mask[Idx - CommonVF]) +
        CommonVF = E->Scalars.size();
        ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF) +
                     GetNodeMinBWAffectedCost(*E2, CommonVF);
        ExtraCost += GetNodeMinBWAffectedCost(*E, E->getVectorFactor()) +
                     GetNodeMinBWAffectedCost(*E2, E2->getVectorFactor());
      V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
    } else if (!V1 && P2.isNull()) {
      unsigned VF = E->getVectorFactor();
          [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
          "All elements in mask must be less than CommonVF.");
      if (E->Scalars.size() == Mask.size() && VF != Mask.size()) {
        assert(!EMask.empty() && "Expected non-empty common mask.");
        for (int &Idx : CommonMask) {
        CommonVF = E->Scalars.size();
      } else if (unsigned Factor = E->getInterleaveFactor();
                 Factor > 0 && E->Scalars.size() != Mask.size() &&
        std::iota(CommonMask.begin(), CommonMask.end(), 0);
      ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF);
      if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() &&
          CommonVF == CommonMask.size() &&
          [](const auto &&P) {
            static_cast<unsigned>(P.value()) != P.index();
    } else if (V1 && P2.isNull()) {
      ExtraCost += GetValueMinBWAffectedCost(V1);
      CommonVF = getVF(V1);
          [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
          "All elements in mask must be less than CommonVF.");
    } else if (V1 && !V2) {
      unsigned VF = getVF(V1);
      CommonVF = std::max(VF, E2->getVectorFactor());
        return Idx < 2 * static_cast<int>(CommonVF);
             "All elements in mask must be less than 2 * CommonVF.");
      if (E2->Scalars.size() == VF && VF != CommonVF) {
        assert(!E2Mask.empty() && "Expected non-empty common mask.");
        for (int &Idx : CommonMask) {
          if (Idx >= static_cast<int>(CommonVF))
            Idx = E2Mask[Idx - CommonVF] + VF;
        ExtraCost += GetValueMinBWAffectedCost(V1);
      ExtraCost += GetNodeMinBWAffectedCost(
          *E2, std::min(CommonVF, E2->getVectorFactor()));
      V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
    } else if (!V1 && V2) {
      unsigned VF = getVF(V2);
      CommonVF = std::max(VF, E1->getVectorFactor());
        return Idx < 2 * static_cast<int>(CommonVF);
             "All elements in mask must be less than 2 * CommonVF.");
      if (E1->Scalars.size() == VF && VF != CommonVF) {
        assert(!E1Mask.empty() && "Expected non-empty common mask.");
        for (int &Idx : CommonMask) {
          if (Idx >= static_cast<int>(CommonVF))
            Idx = E1Mask[Idx - CommonVF] + VF;
      ExtraCost += GetNodeMinBWAffectedCost(
          *E1, std::min(CommonVF, E1->getVectorFactor()));
      ExtraCost += GetValueMinBWAffectedCost(V2);
      V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
      assert(V1 && V2 && "Expected both vectors.");
      unsigned VF = getVF(V1);
      CommonVF = std::max(VF, getVF(V2));
        return Idx < 2 * static_cast<int>(CommonVF);
             "All elements in mask must be less than 2 * CommonVF.");
          GetValueMinBWAffectedCost(V1) + GetValueMinBWAffectedCost(V2);
      V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
      V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
    InVectors.front() =
    if (InVectors.size() == 2)
      InVectors.pop_back();
    return ExtraCost + BaseShuffleAnalysis::createShuffle<InstructionCost>(
                           V1, V2, CommonMask, Builder, ScalarTy);
      : BaseShuffleAnalysis(ScalarTy), TTI(TTI),
        VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()), R(R),
        CheckedExtracts(CheckedExtracts) {}
                        ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
                        unsigned NumParts, bool &UseVecBaseAsInput) {
    UseVecBaseAsInput = false;
    Value *VecBase = nullptr;
    if (!E->ReorderIndices.empty()) {
                               E->ReorderIndices.end());
    bool PrevNodeFound = any_of(
        ArrayRef(R.VectorizableTree).take_front(E->Idx),
        [&](const std::unique_ptr<TreeEntry> &TE) {
          return ((TE->hasState() && !TE->isAltShuffle() &&
                   TE->getOpcode() == Instruction::ExtractElement) ||
                 all_of(enumerate(TE->Scalars), [&](auto &&Data) {
                   return VL.size() > Data.index() &&
                          (Mask[Data.index()] == PoisonMaskElem ||
                           isa<UndefValue>(VL[Data.index()]) ||
                           Data.value() == VL[Data.index()]);
      ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
        VecBase = EE->getVectorOperand();
        UniqueBases.insert(VecBase);
        if (!CheckedExtracts.insert(V).second ||
              return isa<GetElementPtrInst>(U) &&
                     !R.areAllUsersVectorized(cast<Instruction>(U),
        unsigned Idx = *EEIdx;
        if (EE->hasOneUse() || !PrevNodeFound) {
            Cost -= TTI.getExtractWithExtendCost(
            Cost += TTI.getCastInstrCost(
        APInt &DemandedElts =
            VectorOpsToExtracts
                .first->getSecond();
        DemandedElts.setBit(Idx);
    for (const auto &[Vec, DemandedElts] : VectorOpsToExtracts)
          DemandedElts, false,
    if (!PrevNodeFound)
      Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts);
    transformMaskAfterShuffle(CommonMask, CommonMask);
    SameNodesEstimated = false;
    if (NumParts != 1 && UniqueBases.size() != 1) {
      UseVecBaseAsInput = true;
  std::optional<InstructionCost>
      return std::nullopt;
    IsFinalized = false;
    CommonMask.clear();
    VectorizedVals.clear();
    SameNodesEstimated = true;
      return Idx < static_cast<int>(E1.getVectorFactor());
           "Expected single vector shuffle mask.");
    if (InVectors.empty()) {
      CommonMask.assign(Mask.begin(), Mask.end());
      InVectors.assign({&E1, &E2});
    assert(!CommonMask.empty() && "Expected non-empty common mask.");
    unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
    estimateNodesPermuteCost(E1, &E2, Mask, Part, SliceSize);
    if (InVectors.empty()) {
      CommonMask.assign(Mask.begin(), Mask.end());
      InVectors.assign(1, &E1);
    assert(!CommonMask.empty() && "Expected non-empty common mask.");
    unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
    estimateNodesPermuteCost(E1, nullptr, Mask, Part, SliceSize);
    if (!SameNodesEstimated && InVectors.size() == 1)
      InVectors.emplace_back(&E1);
    assert(InVectors.size() == 1 &&
               ->getOrdered(P.index()));
           return EI->getVectorOperand() == V1 ||
                  EI->getVectorOperand() == V2;
           "Expected extractelement vectors.");
    if (InVectors.empty()) {
      assert(CommonMask.empty() && !ForExtracts &&
             "Expected empty input mask/vectors.");
      CommonMask.assign(Mask.begin(), Mask.end());
      InVectors.assign(1, V1);
        !CommonMask.empty() &&
            ->getOrdered(P.index());
        return P.value() == Mask[P.index()] ||
          return EI->getVectorOperand() == V1;
        "Expected only tree entry for extractelement vectors.");
    assert(!InVectors.empty() && !CommonMask.empty() &&
           "Expected only tree entries from extracts/reused buildvectors.");
    unsigned VF = getVF(V1);
    if (InVectors.size() == 2) {
      Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
      VF = std::max<unsigned>(VF, CommonMask.size());
    } else if (const auto *InTE =
                   InVectors.front().dyn_cast<const TreeEntry *>()) {
      VF = std::max(VF, InTE->getVectorFactor());
          ->getNumElements());
    InVectors.push_back(V1);
    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
        CommonMask[Idx] = Mask[Idx] + VF;
                Value *Root = nullptr) {
    Cost += getBuildVectorCost(VL, Root);
    unsigned VF = VL.size();
      VF = std::min(VF, MaskVF);
    Type *VLScalarTy = VL.front()->getType();
          getAllOnesValue(*R.DL, ScalarTy->getScalarType()));
           ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
    IsFinalized = true;
      if (InVectors.size() == 2)
        Cost += createShuffle(Vec, InVectors.back(), CommonMask);
        Cost += createShuffle(Vec, nullptr, CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
             "Expected vector length for the final value before action.");
      Cost += createShuffle(V1, V2, Mask);
      InVectors.front() = V;
    if (!SubVectors.empty()) {
      if (InVectors.size() == 2)
        Cost += createShuffle(Vec, InVectors.back(), CommonMask);
        Cost += createShuffle(Vec, nullptr, CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
      if (!SubVectorsMask.empty()) {
               "Expected same size of masks for subvectors and common mask.");
        copy(SubVectorsMask, SVMask.begin());
        for (auto [I1, I2] : zip(SVMask, CommonMask)) {
            I1 = I2 + CommonMask.size();
      for (auto [E, Idx] : SubVectors) {
        Type *EScalarTy = E->Scalars.front()->getType();
        bool IsSigned = true;
        if (auto It = R.MinBWs.find(E); It != R.MinBWs.end()) {
          IsSigned = It->second.second;
        if (ScalarTy != EScalarTy) {
          unsigned CastOpcode = Instruction::Trunc;
          unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
          unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
            CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
          Cost += TTI.getCastInstrCost(
        if (!CommonMask.empty()) {
          std::iota(std::next(CommonMask.begin(), Idx),
                    std::next(CommonMask.begin(), Idx + E->getVectorFactor()),
    if (!ExtMask.empty()) {
      if (CommonMask.empty()) {
        for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
          NewMask[I] = CommonMask[ExtMask[I]];
        CommonMask.swap(NewMask);
    if (CommonMask.empty()) {
      assert(InVectors.size() == 1 && "Expected only one vector with no mask");
        createShuffle(InVectors.front(),
                      InVectors.size() == 2 ? InVectors.back() : nullptr,
    assert((IsFinalized || CommonMask.empty()) &&
           "Shuffle construction must be finalized.");
const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E,
                                                   unsigned Idx) const {
  TreeEntry *Op = OperandsToTreeEntry.at({E, Idx});
  assert(Op->isSame(E->getOperand(Idx)) && "Operands mismatch!");
  if (TE.State == TreeEntry::ScatterVectorize ||
      TE.State == TreeEntry::StridedVectorize)
  if (TE.State == TreeEntry::CompressVectorize)
  if (TE.State == TreeEntry::Vectorize &&
      TE.getOpcode() == Instruction::Load && !TE.isAltShuffle()) {
    if (TE.ReorderIndices.empty())
    SmallVector<int> Mask;
14465 SmallPtrSetImpl<Value *> &CheckedExtracts) {
14470 return InstructionCost::getInvalid();
14475 auto It = MinBWs.find(
E);
14476 Type *OrigScalarTy = ScalarTy;
14477 if (It != MinBWs.end()) {
14484 unsigned EntryVF =
E->getVectorFactor();
14487 if (
E->isGather()) {
14491 return InstructionCost::getInvalid();
14493 ScalarTy = VL.
front()->getType();
14494 return processBuildVector<ShuffleCostEstimator, InstructionCost>(
14495 E, ScalarTy, *TTI, VectorizedVals, *
this, CheckedExtracts);
14497 if (
E->State == TreeEntry::SplitVectorize) {
14498 assert(
E->CombinedEntriesWithIndices.size() == 2 &&
14499 "Expected exactly 2 combined entries.");
14500 assert(
E->ReuseShuffleIndices.empty() &&
"Expected empty reuses mask.");
14502 if (
E->ReorderIndices.empty()) {
14505 E->CombinedEntriesWithIndices.back().second,
14508 VectorizableTree[
E->CombinedEntriesWithIndices.back().first]
14509 ->getVectorFactor()));
14511 unsigned CommonVF =
14512 std::max(VectorizableTree[
E->CombinedEntriesWithIndices.front().first]
14513 ->getVectorFactor(),
14514 VectorizableTree[
E->CombinedEntriesWithIndices.back().first]
14515 ->getVectorFactor());
14520 LLVM_DEBUG(dumpTreeCosts(
E, 0, VectorCost, 0,
"Calculated costs for Tree"));
  SmallVector<int> Mask;
  if (!E->ReorderIndices.empty() && E->State != TreeEntry::CompressVectorize &&
      (E->State != TreeEntry::StridedVectorize ||
    SmallVector<int> NewMask;
    if (E->getOpcode() == Instruction::Store) {
      NewMask.resize(E->ReorderIndices.size());
  if (!E->ReuseShuffleIndices.empty())
  assert((E->State == TreeEntry::Vectorize ||
          E->State == TreeEntry::ScatterVectorize ||
          E->State == TreeEntry::StridedVectorize ||
          E->State == TreeEntry::CompressVectorize) &&
         "Unhandled state");
          (E->getOpcode() == Instruction::GetElementPtr &&
           E->getMainOp()->getType()->isPointerTy()) ||
          E->hasCopyableElements()) &&
  unsigned ShuffleOrOp =
      E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
  if (E->CombinedOp != TreeEntry::NotCombinedOp)
    ShuffleOrOp = E->CombinedOp;
  SmallSetVector<Value *, 16> UniqueValues(VL.begin(), VL.end());
  const unsigned Sz = UniqueValues.size();
  SmallBitVector UsedScalars(Sz, false);
  for (unsigned I = 0; I < Sz; ++I) {
        !E->isCopyableElement(UniqueValues[I]) &&
        getTreeEntries(UniqueValues[I]).front() == E)
      UsedScalars.set(I);
  auto GetCastContextHint = [&](Value *V) {
      return getCastContextHint(*OpTEs.front());
    InstructionsState SrcState = getSameOpcode(E->getOperand(0), *TLI);
    if (SrcState && SrcState.getOpcode() == Instruction::Load &&
        !SrcState.isAltShuffle())
    ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0);
    for (unsigned I = 0; I < Sz; ++I) {
      if (UsedScalars.test(I))
      ScalarCost += ScalarEltCost(I);
    if (It != MinBWs.end() && !UnaryInstruction::isCast(E->getOpcode()) &&
        (E->getOpcode() != Instruction::Load || E->UserTreeIndex)) {
      if (!EI.UserTE->hasState() ||
          EI.UserTE->getOpcode() != Instruction::Select ||
        auto UserBWIt = MinBWs.find(EI.UserTE);
        Type *UserScalarTy =
            (EI.UserTE->isGather() ||
             EI.UserTE->State == TreeEntry::SplitVectorize)
                ? EI.UserTE->Scalars.front()->getType()
                : EI.UserTE->getOperand(EI.EdgeIdx).front()->getType();
        if (UserBWIt != MinBWs.end())
                                           UserBWIt->second.first);
        if (ScalarTy != UserScalarTy) {
          unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
          unsigned SrcBWSz = DL->getTypeSizeInBits(UserScalarTy);
          unsigned VecOpcode;
          if (BWSz > SrcBWSz)
            VecOpcode = Instruction::Trunc;
              It->second.second ? Instruction::SExt : Instruction::ZExt;
          VecCost += TTI->getCastInstrCost(VecOpcode, UserVecTy, VecTy, CCH,
    LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost - CommonCost, ScalarCost,
                             "Calculated costs for Tree"));
    return VecCost - ScalarCost;
    assert((E->State == TreeEntry::Vectorize ||
            E->State == TreeEntry::StridedVectorize ||
            E->State == TreeEntry::CompressVectorize) &&
           "Entry state expected to be Vectorize, StridedVectorize or "
           "MaskedLoadCompressVectorize here.");
        *TTI, Ptrs, BasePtr, E->getOpcode(), CostKind, OrigScalarTy, VecTy);
    LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost,
                             "Calculated GEPs cost for Tree"));
    return VecCost - ScalarCost;
      return InstructionCost::getInvalid();
    Type *CanonicalType = Ty;
    IntrinsicCostAttributes CostAttrs(MinMaxID, CanonicalType,
                                      {CanonicalType, CanonicalType});
        TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
    if (VI && SelectOnly) {
             "Expected only for scalar type.");
          CI->getOpcode(), Ty, Builder.getInt1Ty(), CI->getPredicate(),
          CostKind, {TTI::OK_AnyValue, TTI::OP_None},
          {TTI::OK_AnyValue, TTI::OP_None}, CI);
  auto GetFMulAddCost = [&, &TTI = *TTI](const InstructionsState &S,
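  // Main per-opcode dispatch: each case returns the difference between the
  // scalar bundle cost and the vectorized cost, via GetCostDiff.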
  switch (ShuffleOrOp) {
  case Instruction::PHI: {
    SmallPtrSet<const TreeEntry *, 4> CountedOps;
    for (Value *V : UniqueValues) {
      ValueList Operands(PHI->getNumIncomingValues(), nullptr);
      for (unsigned I = 0, N = PHI->getNumIncomingValues(); I < N; ++I) {
      if (const TreeEntry *OpTE =
              getSameValuesTreeEntry(Operands.front(), Operands))
        if (CountedOps.insert(OpTE).second &&
            !OpTE->ReuseShuffleIndices.empty())
          ScalarCost += TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() -
                                          OpTE->Scalars.size());
    return CommonCost - ScalarCost;
  case Instruction::ExtractValue:
  case Instruction::ExtractElement: {
    APInt DemandedElts;
    auto GetScalarCost = [&](unsigned Idx) {
      if (ShuffleOrOp == Instruction::ExtractElement) {
        SrcVecTy = EE->getVectorOperandType();
        Type *AggregateTy = EV->getAggregateOperand()->getType();
          NumElts = ATy->getNumElements();
      if (I->hasOneUse()) {
          Cost -= TTI->getCastInstrCost(
      if (DemandedElts.isZero())
      return CommonCost - (DemandedElts.isZero()
                               : TTI.getScalarizationOverhead(
                                     SrcVecTy, DemandedElts, false,
    return GetCostDiff(GetScalarCost, GetVectorCost);
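  // Insertelements building a wider vector: demand only the inserted lanes
  // and pay for a subvector insert unless a whole subvector is covered.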
  case Instruction::InsertElement: {
    assert(E->ReuseShuffleIndices.empty() &&
           "Unique insertelements only are expected.");
    unsigned const NumElts = SrcVecTy->getNumElements();
    unsigned const NumScalars = VL.size();
    unsigned OffsetEnd = OffsetBeg;
    InsertMask[OffsetBeg] = 0;
      if (OffsetBeg > Idx)
      else if (OffsetEnd < Idx)
      InsertMask[Idx] = I + 1;
    if (NumOfParts > 0 && NumOfParts < NumElts)
      VecScalarsSz = PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
    unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
    unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
    unsigned InsertVecSz = std::min<unsigned>(
        ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * VecScalarsSz);
    bool IsWholeSubvector =
        OffsetBeg == Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
    if (OffsetBeg + InsertVecSz > VecSz) {
      InsertVecSz = VecSz;
    SmallVector<int> Mask;
    if (!E->ReorderIndices.empty()) {
      std::iota(Mask.begin(), std::next(Mask.begin(), InsertVecSz), 0);
    bool IsIdentity = true;
    Mask.swap(PrevMask);
    for (unsigned I = 0; I < NumScalars; ++I) {
      DemandedElts.setBit(InsertIdx);
      IsIdentity &= InsertIdx - OffsetBeg == I;
      Mask[InsertIdx - OffsetBeg] = I;
    assert(Offset < NumElts && "Failed to find vector index offset");
                              InsertVecTy, Mask);
      return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
    SmallBitVector InMask =
        buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask));
    if (!InMask.all() && NumScalars != NumElts && !IsWholeSubvector) {
      if (InsertVecSz != VecSz) {
        for (unsigned I = 0, End = OffsetBeg - Offset; I < End; ++I)
        for (unsigned I = OffsetBeg - Offset, End = OffsetEnd - Offset;
        for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I)
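  // Casts: when minimum-bitwidth analysis (MinBWs) shrank the source or
  // destination type, the vector opcode may differ from the scalar one,
  // e.g. an extension may degenerate into a bitcast or a trunc.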
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
    unsigned Opcode = ShuffleOrOp;
    unsigned VecOpcode = Opcode;
        (SrcIt != MinBWs.end() || It != MinBWs.end())) {
      unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy->getScalarType());
      if (SrcIt != MinBWs.end()) {
        SrcBWSz = SrcIt->second.first;
      unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType());
      if (BWSz == SrcBWSz) {
        VecOpcode = Instruction::BitCast;
      } else if (BWSz < SrcBWSz) {
        VecOpcode = Instruction::Trunc;
      } else if (It != MinBWs.end()) {
        assert(BWSz > SrcBWSz && "Invalid cast!");
        VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
      } else if (SrcIt != MinBWs.end()) {
        assert(BWSz > SrcBWSz && "Invalid cast!");
            SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
      }
    } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
               !SrcIt->second.second) {
      VecOpcode = Instruction::UIToFP;
      assert(Idx == 0 && "Expected 0 index only");
      return TTI->getCastInstrCost(Opcode, VL0->getType(),
      if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
      auto *VI = VL0->getOpcode() == Opcode ? VL0 : nullptr;
      bool IsArithmeticExtendedReduction =
          E->Idx == 0 && UserIgnoreList &&
            return is_contained({Instruction::Add, Instruction::FAdd,
                                 Instruction::Mul, Instruction::FMul,
                                 Instruction::And, Instruction::Or,
      if (IsArithmeticExtendedReduction &&
          (VecOpcode == Instruction::ZExt || VecOpcode == Instruction::SExt))
      return CommonCost +
             TTI->getCastInstrCost(VecOpcode, VecTy, SrcVecTy, CCH, CostKind,
                                   VecOpcode == Opcode ? VI : nullptr);
    return GetCostDiff(GetScalarCost, GetVectorCost);
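  // Compares and selects: cost the scalar compares with their own predicates
  // and the vector compare with the common (or swapped) predicate, widening
  // the condition type when needed.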
  case Instruction::FCmp:
  case Instruction::ICmp:
  case Instruction::Select: {
    CmpPredicate VecPred, SwappedVecPred;
        match(VL0, MatchCmp))
    auto GetScalarCost = [&](unsigned Idx) {
           !match(VI, MatchCmp)) ||
          E->getOpcode(), OrigScalarTy, Builder.getInt1Ty(), CurrentPred,
          CostKind, getOperandInfo(VI->getOperand(0)),
          getOperandInfo(VI->getOperand(1)), VI);
          TTI->getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy, VecPred,
                                  CostKind, getOperandInfo(E->getOperand(0)),
                                  getOperandInfo(E->getOperand(1)), VL0);
        unsigned CondNumElements = CondType->getNumElements();
        assert(VecTyNumElements >= CondNumElements &&
               VecTyNumElements % CondNumElements == 0 &&
               "Cannot vectorize Instruction::Select");
        if (CondNumElements != VecTyNumElements) {
      return VecCost + CommonCost;
    return GetCostDiff(GetScalarCost, GetVectorCost);
  case TreeEntry::MinMax: {
    auto GetScalarCost = [&](unsigned Idx) {
      return GetMinMaxCost(OrigScalarTy);
      return VecCost + CommonCost;
    return GetCostDiff(GetScalarCost, GetVectorCost);
  case TreeEntry::FMulAdd: {
    auto GetScalarCost = [&](unsigned Idx) {
      return GetFMulAddCost(E->getOperations(),
      for (Value *V : E->Scalars) {
        FMF &= FPCI->getFastMathFlags();
        FMF &= FPCIOp->getFastMathFlags();
      IntrinsicCostAttributes ICA(Intrinsic::fmuladd, VecTy,
                                  {VecTy, VecTy, VecTy}, FMF);
      return VecCost + CommonCost;
    return GetCostDiff(GetScalarCost, GetVectorCost);
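  // Unary and binary arithmetic/logic: per-lane scalar costs with operand
  // info versus a single wide instruction.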
  case Instruction::FNeg:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    auto GetScalarCost = [&](unsigned Idx) {
      Value *Op1 = E->getOperand(0)[Idx];
      SmallVector<const Value *, 2> Operands(1, Op1);
        Op2 = E->getOperand(1)[Idx];
          ShuffleOrOp, OrigScalarTy, CostKind, Op1Info, Op2Info, Operands);
          I && (ShuffleOrOp == Instruction::FAdd ||
                ShuffleOrOp == Instruction::FSub)) {
      if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
            return CI && CI->getValue().countr_one() >= It->second.first;
      return TTI->getArithmeticInstrCost(ShuffleOrOp, VecTy, CostKind, Op1Info,
                                         Op2Info, {}, nullptr, TLI) +
    return GetCostDiff(GetScalarCost, GetVectorCost);
  case Instruction::GetElementPtr: {
    return CommonCost + GetGEPCostDiff(VL, VL0);
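  // Loads: the vector cost depends on the entry state: a consecutive (or
  // interleaved) load, a strided vp load, a masked/compressed load, or a
  // masked gather for scattered pointers.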
  case Instruction::Load: {
    auto GetScalarCost = [&](unsigned Idx) {
      return TTI->getMemoryOpCost(Instruction::Load, OrigScalarTy,
                                  VI->getAlign(), VI->getPointerAddressSpace(),
      switch (E->State) {
      case TreeEntry::Vectorize:
        if (unsigned Factor = E->getInterleaveFactor()) {
          VecLdCost = TTI->getInterleavedMemoryOpCost(
              Instruction::Load, VecTy, Factor, {}, LI0->getAlign(),
              LI0->getPointerAddressSpace(), CostKind);
          VecLdCost = TTI->getMemoryOpCost(
              Instruction::Load, VecTy, LI0->getAlign(),
      case TreeEntry::StridedVectorize: {
        const StridedPtrInfo &SPtrInfo = TreeEntryToStridedPtrInfoMap.at(E);
        FixedVectorType *StridedLoadTy = SPtrInfo.Ty;
        assert(StridedLoadTy && "Missing StridedPointerInfo for tree entry.");
        Align CommonAlignment =
        VecLdCost = TTI->getMemIntrinsicInstrCost(
            MemIntrinsicCostAttributes(Intrinsic::experimental_vp_strided_load,
                                       StridedLoadTy, LI0->getPointerOperand(),
                                       false, CommonAlignment),
        if (StridedLoadTy != VecTy)
              TTI->getCastInstrCost(Instruction::BitCast, VecTy, StridedLoadTy,
      case TreeEntry::CompressVectorize: {
        unsigned InterleaveFactor;
        SmallVector<int> CompressMask;
        if (!E->ReorderIndices.empty()) {
          SmallVector<int> Mask(E->ReorderIndices.begin(),
                                E->ReorderIndices.end());
            Scalars, PointerOps, E->ReorderIndices, *TTI, *DL, *SE, *AC, *DT,
            *TLI, [](Value *) { return true; }, IsMasked, InterleaveFactor,
            CompressMask, LoadVecTy);
        assert(IsVectorized && "Failed to vectorize load");
        CompressEntryToData.try_emplace(E, CompressMask, LoadVecTy,
                                        InterleaveFactor, IsMasked);
        Align CommonAlignment = LI0->getAlign();
        if (InterleaveFactor) {
          VecLdCost = TTI->getInterleavedMemoryOpCost(
              Instruction::Load, LoadVecTy, InterleaveFactor, {},
              CommonAlignment, LI0->getPointerAddressSpace(), CostKind);
        } else if (IsMasked) {
          VecLdCost = TTI->getMemIntrinsicInstrCost(
              MemIntrinsicCostAttributes(Intrinsic::masked_load, LoadVecTy,
                                         LI0->getPointerAddressSpace()),
                               LoadVecTy, CompressMask, CostKind);
          VecLdCost = TTI->getMemoryOpCost(
              Instruction::Load, LoadVecTy, CommonAlignment,
                               LoadVecTy, CompressMask, CostKind);
      case TreeEntry::ScatterVectorize: {
        Align CommonAlignment =
        VecLdCost = TTI->getMemIntrinsicInstrCost(
            MemIntrinsicCostAttributes(Intrinsic::masked_gather, VecTy,
                                       LI0->getPointerOperand(),
                                       false, CommonAlignment),
      case TreeEntry::CombinedVectorize:
      case TreeEntry::SplitVectorize:
      case TreeEntry::NeedToGather:
      return VecLdCost + CommonCost;
    if (E->State == TreeEntry::ScatterVectorize)
    return Cost + GetGEPCostDiff(PointerOps, LI0->getPointerOperand());
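  // Stores: either a strided vp store or consecutive (possibly interleaved)
  // stores; a reorder mask only changes which scalar lands in which lane.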
  case Instruction::Store: {
    bool IsReorder = !E->ReorderIndices.empty();
    auto GetScalarCost = [=](unsigned Idx) {
      return TTI->getMemoryOpCost(Instruction::Store, OrigScalarTy,
                                  VI->getAlign(), VI->getPointerAddressSpace(),
      if (E->State == TreeEntry::StridedVectorize) {
        Align CommonAlignment =
        VecStCost = TTI->getMemIntrinsicInstrCost(
            MemIntrinsicCostAttributes(Intrinsic::experimental_vp_strided_store,
                                       VecTy, BaseSI->getPointerOperand(),
                                       false, CommonAlignment),
        assert(E->State == TreeEntry::Vectorize &&
               "Expected either strided or consecutive stores.");
        if (unsigned Factor = E->getInterleaveFactor()) {
          assert(E->ReuseShuffleIndices.empty() &&
                 !E->ReorderIndices.empty() && "No reused shuffles expected");
          VecStCost = TTI->getInterleavedMemoryOpCost(
              Instruction::Store, VecTy, Factor, {}, BaseSI->getAlign(),
              BaseSI->getPointerAddressSpace(), CostKind);
          VecStCost = TTI->getMemoryOpCost(
              Instruction::Store, VecTy, BaseSI->getAlign(),
              BaseSI->getPointerAddressSpace(), CostKind, OpInfo);
      return VecStCost + CommonCost;
      unsigned Idx = IsReorder ? E->ReorderIndices[I] : I;
    return GetCostDiff(GetScalarCost, GetVectorCost) +
           GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
  case Instruction::Call: {
    auto GetScalarCost = [&](unsigned Idx) {
      IntrinsicCostAttributes CostAttrs(ID, *CI, 1);
      return TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
          CI, ID, VecTy->getNumElements(),
          It != MinBWs.end() ? It->second.first : 0, TTI);
      return std::min(VecCallCosts.first, VecCallCosts.second) + CommonCost;
    return GetCostDiff(GetScalarCost, GetVectorCost);
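  // Alternate-opcode nodes: price both vector opcodes plus the blending
  // shuffle, and prefer a single legal "alt" instruction when the target
  // provides one.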
  case Instruction::ShuffleVector: {
           "Invalid Shuffle Vector Operand");
    auto TryFindNodeWithEqualOperands = [=]() {
      for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
        if (TE->hasState() && TE->isAltShuffle() &&
            ((TE->getOpcode() == E->getOpcode() &&
              TE->getAltOpcode() == E->getAltOpcode()) ||
             (TE->getOpcode() == E->getAltOpcode() &&
              TE->getAltOpcode() == E->getOpcode())) &&
            TE->hasEqualOperands(*E))
    auto GetScalarCost = [&](unsigned Idx) {
      assert(E->getMatchingMainOpOrAltOp(VI) &&
             "Unexpected main/alternate opcode");
      return TTI->getInstructionCost(VI, CostKind);
      if (TryFindNodeWithEqualOperands()) {
          dbgs() << "SLP: diamond match for alternate node found.\n";
            TTIRef.getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
            TTIRef.getArithmeticInstrCost(E->getAltOpcode(), VecTy, CostKind);
        VecCost = TTIRef.getCmpSelInstrCost(
            E->getOpcode(), VecTy, MaskTy, CI0->getPredicate(), CostKind,
            {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
        VecCost += TTIRef.getCmpSelInstrCost(
            E->getOpcode(), VecTy, MaskTy,
            {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
        Type *SrcSclTy = E->getMainOp()->getOperand(0)->getType();
        auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
        unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
            DL->getTypeSizeInBits(E->getMainOp()->getOperand(0)->getType());
        if (SrcIt != MinBWs.end()) {
          SrcBWSz = SrcIt->second.first;
        if (BWSz <= SrcBWSz) {
          if (BWSz < SrcBWSz)
                TTIRef.getCastInstrCost(Instruction::Trunc, VecTy, SrcTy,
                 << "SLP: alternate extension, which should be truncated.\n";
        VecCost = TTIRef.getCastInstrCost(E->getOpcode(), VecTy, SrcTy,
            TTIRef.getCastInstrCost(E->getAltOpcode(), VecTy, SrcTy,
      SmallVector<int> Mask;
      E->buildAltOpShuffleMask(
          [&](Instruction *I) {
            assert(E->getMatchingMainOpOrAltOp(I) &&
                   "Unexpected main/alternate opcode");
      unsigned Opcode0 = E->getOpcode();
      unsigned Opcode1 = E->getAltOpcode();
      SmallBitVector OpcodeMask(
      if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
            VecTy, Opcode0, Opcode1, OpcodeMask, CostKind);
        return AltVecCost < VecCost ? AltVecCost : VecCost;
    return GetCostDiff(
           "Not supported shufflevector usage.");
      unsigned SVNumElements =
          ->getNumElements();
      unsigned GroupSize = SVNumElements / SV->getShuffleMask().size();
      for (size_t I = 0, End = VL.size(); I != End; I += GroupSize) {
                 "Not supported shufflevector usage.");
        [[maybe_unused]] bool IsExtractSubvectorMask =
            SV->isExtractSubvectorMask(Index);
        assert(IsExtractSubvectorMask && "Not supported shufflevector usage.");
        if (NextIndex != Index)
        NextIndex += SV->getShuffleMask().size();
      return ::getShuffleCost(
    return GetCostDiff(GetScalarCost, GetVectorCost);
  case Instruction::Freeze:
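// Returns true if a tree of one or two nodes is fully vectorizable as a
// whole, e.g. a vectorized root fed by gathers of loads or extractelements.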
bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
                    << VectorizableTree.size() << " is fully vectorizable.\n");
  auto &&AreVectorizableGathers = [this](const TreeEntry *TE, unsigned Limit) {
    SmallVector<int> Mask;
    return TE->isGather() &&
               [this](Value *V) { return EphValues.contains(V); }) &&
            TE->Scalars.size() < Limit ||
            (((TE->hasState() &&
               TE->getOpcode() == Instruction::ExtractElement) ||
            (TE->hasState() && TE->getOpcode() == Instruction::Load &&
             !TE->isAltShuffle()) ||
  if (VectorizableTree.size() == 1 &&
      (VectorizableTree[0]->State == TreeEntry::Vectorize ||
       VectorizableTree[0]->State == TreeEntry::StridedVectorize ||
       VectorizableTree[0]->State == TreeEntry::CompressVectorize ||
        AreVectorizableGathers(VectorizableTree[0].get(),
                               VectorizableTree[0]->Scalars.size()) &&
        VectorizableTree[0]->getVectorFactor() > 2)))
  if (VectorizableTree.size() != 2)
  if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
      AreVectorizableGathers(VectorizableTree[1].get(),
                             VectorizableTree[0]->Scalars.size()))
  if (VectorizableTree[0]->isGather() ||
      (VectorizableTree[1]->isGather() &&
       VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
       VectorizableTree[0]->State != TreeEntry::StridedVectorize &&
       VectorizableTree[0]->State != TreeEntry::CompressVectorize))
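// Matches an or/shl chain of zext'ed loads rooted at Root: such trees are
// assumed to be better served by load combining than by vectorization.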
                                       bool MustMatchOrInst) {
  Value *ZextLoad = Root;
  const APInt *ShAmtC;
  bool FoundOr = false;
                            ShAmtC->urem(8) == 0))) {
    ZextLoad = BinOp->getOperand(0);
    if (BinOp->getOpcode() == Instruction::Or)
  if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root ||
  Type *SrcTy = Load->getType();
  unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts;
  LLVM_DEBUG(dbgs() << "SLP: Assume load combining for tree starting at "
  unsigned NumElts = VectorizableTree[0]->Scalars.size();
  Value *FirstReduced = VectorizableTree[0]->Scalars[0];
  unsigned NumElts = Stores.size();
  for (Value *Scalar : Stores) {
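// Conservative profitability filters: trees that are mostly gathers, tiny
// gather-heavy trees, or trees whose store/load nodes are dominated by
// gather nodes are rejected before full cost modeling.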
  if (VectorizableTree.empty()) {
    assert(ExternalUses.empty() && "We shouldn't have any external users");
  if (VectorizableTree.size() == 2 &&
      VectorizableTree[1]->isGather() &&
      (VectorizableTree[1]->getVectorFactor() <= 2 ||
       !(isSplat(VectorizableTree[1]->Scalars) ||
  constexpr int Limit = 4;
      !VectorizableTree.empty() &&
      all_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
        return (TE->isGather() &&
                (!TE->hasState() ||
                 TE->getOpcode() != Instruction::ExtractElement) &&
               (TE->hasState() && TE->getOpcode() == Instruction::PHI);
      VectorizableTree.size() <= Limit &&
      all_of(VectorizableTree,
             [&](const std::unique_ptr<TreeEntry> &TE) {
               return (TE->isGather() &&
                       (!TE->hasState() ||
                        TE->getOpcode() != Instruction::ExtractElement) &&
                      (TE->getOpcode() == Instruction::InsertElement ||
                       (TE->getOpcode() == Instruction::PHI &&
                          return isa<PoisonValue>(V) || MustGather.contains(V);
      any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
        return TE->State == TreeEntry::Vectorize &&
               TE->getOpcode() == Instruction::PHI;
  unsigned NumGathers = 0;
  constexpr int LimitTreeSize = 36;
      all_of(VectorizableTree,
             [&](const std::unique_ptr<TreeEntry> &TE) {
               if (!TE->isGather() && TE->hasState() &&
                   (TE->getOpcode() == Instruction::Load ||
                    TE->getOpcode() == Instruction::Store)) {
               if (TE->isGather())
               return TE->State == TreeEntry::SplitVectorize ||
                      (TE->Idx == 0 && TE->Scalars.size() == 2 &&
                       TE->hasState() && TE->getOpcode() == Instruction::ICmp &&
                       VectorizableTree.size() > LimitTreeSize) ||
                      (TE->getOpcode() == Instruction::PHI ||
                       (TE->hasCopyableElements() &&
                            TE->Scalars.size() / 2) ||
                       ((!TE->ReuseShuffleIndices.empty() ||
                         !TE->ReorderIndices.empty() || TE->isAltShuffle()) &&
                        TE->Scalars.size() == 2)));
      (StoreLoadNodes.empty() ||
       (VectorizableTree.size() > LimitTreeSize * StoreLoadNodes.size() &&
        (NumGathers > 0 || none_of(StoreLoadNodes, [&](const TreeEntry *TE) {
           return TE->getOpcode() == Instruction::Store ||
                    return !isa<LoadInst>(V) ||
                           areAllUsersVectorized(cast<Instruction>(V));
      VectorizableTree.front()->State == TreeEntry::SplitVectorize &&
      VectorizableTree.size() >= Limit &&
      [&](const std::unique_ptr<TreeEntry> &TE) {
        return !TE->isGather() && TE->UserTreeIndex.UserTE &&
               TE->UserTreeIndex.UserTE->Idx == 0;
      VectorizableTree.size() > 2 &&
      VectorizableTree.front()->State == TreeEntry::Vectorize &&
      VectorizableTree.front()->getOpcode() == Instruction::InsertElement &&
      VectorizableTree[1]->State == TreeEntry::Vectorize &&
      VectorizableTree[1]->getOpcode() == Instruction::PHI &&
          ArrayRef(VectorizableTree).drop_front(2),
          [&](const std::unique_ptr<TreeEntry> &TE) { return TE->isGather(); }))
  if (isFullyVectorizableTinyTree(ForReduction))
  bool IsAllowedSingleBVNode =
      VectorizableTree.size() > 1 ||
      (VectorizableTree.size() == 1 && VectorizableTree.front()->hasState() &&
       !VectorizableTree.front()->isAltShuffle() &&
       VectorizableTree.front()->getOpcode() != Instruction::PHI &&
       VectorizableTree.front()->getOpcode() != Instruction::GetElementPtr &&
  if (any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
        return TE->isGather() && all_of(TE->Scalars, [&](Value *V) {
                 return isa<ExtractElementInst, Constant>(V) ||
                        (IsAllowedSingleBVNode &&
                         !V->hasNUsesOrMore(UsesLimit) &&
                         any_of(V->users(), IsaPred<InsertElementInst>));
  if (VectorizableTree.back()->isGather() &&
      VectorizableTree.back()->hasState() &&
      VectorizableTree.back()->isAltShuffle() &&
      VectorizableTree.back()->getVectorFactor() > 2 &&
      !VectorizableTree.back()->Scalars.front()->getType()->isVectorTy() &&
      TTI->getScalarizationOverhead(
          getWidenedType(VectorizableTree.back()->Scalars.front()->getType(),
                         VectorizableTree.back()->getVectorFactor()),
  constexpr unsigned SmallTree = 3;
  if (VectorizableTree.front()->isNonPowOf2Vec() &&
      [](const std::unique_ptr<TreeEntry> &TE) {
        return TE->isGather() && TE->hasState() &&
               TE->getOpcode() == Instruction::Load &&
    TreeEntry &E = *VectorizableTree[Idx];
    if (E.State == TreeEntry::SplitVectorize)
    if ((E.hasState() && E.getOpcode() != Instruction::Load) ||
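// Spill-cost estimation: walks backwards from each bundle's last instruction
// looking for calls that force vector values to be spilled, bounded by a
// fixed instruction budget.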
  const TreeEntry *Root = VectorizableTree.front().get();
  if (Root->isGather())
  for (const auto &TEPtr : VectorizableTree) {
    if (!TEPtr->isGather()) {
      Instruction *LastInst = &getLastInstructionInBundle(TEPtr.get());
      EntriesToLastInstruction.try_emplace(TEPtr.get(), LastInst);
      LastInstructions.insert(LastInst);
    if (TEPtr->UserTreeIndex)
      EntriesToOperands[TEPtr->UserTreeIndex.UserTE].push_back(TEPtr.get());
    if (II->isAssumeLikeIntrinsic())
    return IntrCost < CallCost;
      CheckedInstructions;
  unsigned Budget = 0;
  const unsigned BudgetLimit =
           "Expected instructions in same block.");
    if (auto It = CheckedInstructions.find(Last);
        It != CheckedInstructions.end()) {
      const Instruction *Checked = It->second.getPointer();
        return It->second.getInt() != 0;
        ++First->getIterator().getReverse(),
        Last->getIterator().getReverse();
    while (InstIt != PrevInstIt && Budget <= BudgetLimit) {
        for (const Instruction *LastInst : LastInstsInRange)
          CheckedInstructions.try_emplace(LastInst, &*PrevInstIt, 0);
      if (LastInstructions.contains(&*PrevInstIt))
        LastInstsInRange.push_back(&*PrevInstIt);
    for (const Instruction *LastInst : LastInstsInRange)
          LastInst, PrevInstIt == InstIt ? First : &*PrevInstIt,
          Budget <= BudgetLimit ? 1 : 0);
    return Budget <= BudgetLimit;
  auto AddCosts = [&](const TreeEntry *Op) {
    Type *ScalarTy = Op->Scalars.front()->getType();
    auto It = MinBWs.find(Op);
    if (It != MinBWs.end())
    Cost += TTI->getCostOfKeepingLiveOverCall(VecTy);
    Cost -= Op->Scalars.size() * TTI->getCostOfKeepingLiveOverCall(ScalarTy);
      ParentOpParentToPreds;
    auto Key = std::make_pair(Root, OpParent);
    if (auto It = ParentOpParentToPreds.find(Key);
        It != ParentOpParentToPreds.end())
    for (const auto &KeyPair : ParentsPairsToAdd) {
             "Should not have been added before.");
    while (!Worklist.empty()) {
      if (BB == OpParent || !Visited.insert(BB).second)
      auto Pair = std::make_pair(BB, OpParent);
      if (auto It = ParentOpParentToPreds.find(Pair);
          It != ParentOpParentToPreds.end()) {
        ParentsPairsToAdd.insert(Pair);
      if (Budget > BudgetLimit)
  while (!LiveEntries.empty()) {
    if (Operands.empty())
    Instruction *LastInst = EntriesToLastInstruction.at(Entry);
    for (const TreeEntry *Op : Operands) {
      if (!Op->isGather())
      if (Entry->State == TreeEntry::SplitVectorize ||
          (Entry->getOpcode() != Instruction::PHI && Op->isGather()) ||
        Pred = Phi->getIncomingBlock(Op->UserTreeIndex.EdgeIdx);
      if (Op->isGather()) {
        assert(Entry->getOpcode() == Instruction::PHI &&
               "Expected phi node only.");
                       ->getIncomingBlock(Op->UserTreeIndex.EdgeIdx);
        for (Value *V : Op->Scalars) {
        OpLastInst = EntriesToLastInstruction.at(Op);
      if (OpParent == Parent) {
        if (Entry->getOpcode() == Instruction::PHI) {
          if (!CheckForNonVecCallsInSameBlock(LastInst, OpLastInst))
        if (!CheckForNonVecCallsInSameBlock(OpLastInst, LastInst))
      if (Entry->getOpcode() != Instruction::PHI &&
          !CheckForNonVecCallsInSameBlock(
              &*LastInst->getParent()->getFirstNonPHIOrDbgOrAlloca(),
      if (!CheckForNonVecCallsInSameBlock(OpLastInst,
      if (!CheckPredecessors(Parent, Pred, OpParent)) {
  const auto *I1 = IE1;
  const auto *I2 = IE2;
    if (I1 && (I1 == IE1 || I1->hasOneUse()) &&
    if (I2 && ((I2 == IE2 || I2->hasOneUse())) &&
  } while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2));
struct ValueSelect {
  template <typename U>
  static std::enable_if_t<std::is_same_v<Value *, U>, Value *> get(Value *V) {
  template <typename U>
  static std::enable_if_t<!std::is_same_v<Value *, U>, U> get(Value *) {
template <typename T>
  assert(!ShuffleMask.empty() && "Empty list of shuffles for inserts.");
  auto VMIt = std::next(ShuffleMask.begin());
      buildUseMask(Mask.size(), Mask, UseMask::UndefsAsMask);
  if (!IsBaseUndef.all()) {
    std::pair<T *, bool> Res =
        ResizeAction(ShuffleMask.begin()->first, Mask, false);
    for (unsigned Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) {
        Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF;
    [[maybe_unused]] auto *V = ValueSelect::get<T *>(Base);
    assert((!V || GetVF(V) == Mask.size()) &&
           "Expected base vector of VF number of elements.");
    Prev = Action(Mask, {nullptr, Res.first});
  } else if (ShuffleMask.size() == 1) {
    std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask,
    Prev = Action(Mask, {ShuffleMask.begin()->first});
    unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);
    unsigned Vec2VF = GetVF(VMIt->first);
    if (Vec1VF == Vec2VF) {
      for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
          Mask[I] = SecMask[I] + Vec1VF;
      Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
      std::pair<T *, bool> Res1 = ResizeAction(ShuffleMask.begin()->first, Mask,
      std::pair<T *, bool> Res2 =
          ResizeAction(VMIt->first, VMIt->second, false);
      for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
          Mask[I] = (Res2.second ? I : SecMask[I]) + VF;
      Prev = Action(Mask, {Res1.first, Res2.first});
    VMIt = std::next(VMIt);
  [[maybe_unused]] bool IsBaseNotUndef = !IsBaseUndef.all();
  for (auto E = ShuffleMask.end(); VMIt != E; ++VMIt) {
    std::pair<T *, bool> Res =
        ResizeAction(VMIt->first, VMIt->second, false);
    for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
             "Multiple uses of scalars.");
        Mask[I] = (Res.second ? I : SecMask[I]) + VF;
    Prev = Action(Mask, {Prev, Res.first});
template <typename T> struct ShuffledInsertData {
  MapVector<T, SmallVector<int>> ValueMasks;
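// getTreeCost: sums per-entry costs, adds the cost of extracting externally
// used scalars (or keeping them scalar when cheaper), subtracts shuffles
// that serve insertelement users, and accounts for reduction resizing and
// spills.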
                    << VectorizableTree.size() << ".\n");
  for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
    TreeEntry &TE = *VectorizableTree[I];
    if (TE.State == TreeEntry::CombinedVectorize) {
      LLVM_DEBUG(
          dbgs() << "SLP: Skipping cost for combined node that starts with "
                 << *TE.Scalars[0] << ".\n";
          TE.dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
    if (TE.hasState() &&
        (TE.isGather() || TE.State == TreeEntry::SplitVectorize)) {
      if (const TreeEntry *E =
              getSameValuesTreeEntry(TE.getMainOp(), TE.Scalars);
          E && E->getVectorFactor() == TE.getVectorFactor()) {
                   << "SLP: Current total cost = " << Cost << "\n");
    assert((!TE.isGather() || TE.Idx == 0 || TE.UserTreeIndex) &&
           "Expected gather nodes with users only.");
               << "SLP: Current total cost = " << Cost << "\n");
      none_of(ExternalUses, [](const ExternalUser &EU) {
  std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
  for (ExternalUser &EU : ExternalUses) {
    ScalarUserAndIdx.emplace_back(EU.Scalar, EU.User, EU.Lane);
  for (ExternalUser &EU : ExternalUses) {
    LLVM_DEBUG(dbgs() << "SLP: Computing cost for external use of TreeEntry "
                      << EU.E.Idx << " in lane " << EU.Lane << "\n");
               else dbgs() << " User: nullptr\n");
    LLVM_DEBUG(dbgs() << " Use: " << EU.Scalar->getNameOrAsOperand() << "\n");
    if (EphValues.count(EU.User))
    if (!CheckedScalarUser.insert(std::make_pair(EU.Scalar, EU.User)).second ||
         CheckedScalarUser.contains(std::make_pair(EU.Scalar, nullptr))))
        (!DT->isReachableFromEntry(UserParent) || UserParent->isEHPad() ||
        !ExtractCostCalculated.insert(EU.Scalar).second)
      if (!UsedInserts.insert(VU).second)
      const TreeEntry *ScalarTE = &EU.E;
          [this, VU](const ShuffledInsertData<const TreeEntry *> &Data) {
            Value *Op0 = II->getOperand(0);
      if (It == ShuffledInserts.end()) {
        Data.InsertElements.emplace_back(VU);
        VecId = ShuffledInserts.size() - 1;
        auto It = MinBWs.find(ScalarTE);
        if (It != MinBWs.end() &&
                .insert(std::make_pair(ScalarTE, FTy->getElementType()))
          unsigned BWSz = It->second.first;
          unsigned DstBWSz = DL->getTypeSizeInBits(FTy->getElementType());
          unsigned VecOpcode;
          if (DstBWSz < BWSz)
            VecOpcode = Instruction::Trunc;
              It->second.second ? Instruction::SExt : Instruction::ZExt;
                  FTy->getNumElements()),
                     << " for extending externally used vector with "
                        "non-equal minimum bitwidth.\n");
        It->InsertElements.front() = VU;
        VecId = std::distance(ShuffledInserts.begin(), It);
      int InIdx = *InsertIdx;
          ShuffledInserts[VecId].ValueMasks[ScalarTE];
      Mask[InIdx] = EU.Lane;
      DemandedElts[VecId].setBit(InIdx);
    auto *ScalarTy = EU.Scalar->getType();
    const unsigned BundleWidth = EU.E.getVectorFactor();
    assert(EU.Lane < BundleWidth && "Extracted lane out of bounds.");
    const TreeEntry *Entry = &EU.E;
    auto It = MinBWs.find(Entry);
    if (It != MinBWs.end()) {
                            ? Instruction::ZExt
                            : Instruction::SExt;
                        << ExtraCost << "\n");
          CostKind, EU.Lane, EU.Scalar, ScalarUserAndIdx);
      LLVM_DEBUG(dbgs() << " ExtractElement cost for " << *ScalarTy << " from "
                        << *VecTy << ": " << ExtraCost << "\n");
    if (Entry->Idx != 0 || Entry->getOpcode() == Instruction::GetElementPtr ||
        Entry->getOpcode() == Instruction::Load) {
      auto IsPhiInLoop = [&](const ExternalUser &U) {
          const Loop *L = LI->getLoopFor(Phi->getParent());
          return L && (Phi->getParent() == I->getParent() ||
                       L == LI->getLoopFor(I->getParent()));
      if (!ValueToExtUses) {
        ValueToExtUses.emplace();
        for (const auto &P : enumerate(ExternalUses)) {
          if (IsPhiInLoop(P.value()))
          ValueToExtUses->try_emplace(P.value().Scalar, P.index());
      auto OperandIsScalar = [&](Value *V) {
          return !EE->hasOneUse() || !MustGather.contains(EE);
        return ValueToExtUses->contains(V);
      bool CanBeUsedAsScalar = all_of(Inst->operands(), OperandIsScalar);
      bool CanBeUsedAsScalarCast = false;
          Op && all_of(Op->operands(), OperandIsScalar)) {
        if (ScalarCost + OpCost <= ExtraCost) {
          CanBeUsedAsScalar = CanBeUsedAsScalarCast = true;
          ScalarCost += OpCost;
      if (CanBeUsedAsScalar) {
        bool KeepScalar = ScalarCost <= ExtraCost;
        bool IsProfitablePHIUser =
             VectorizableTree.front()->Scalars.size() > 2)) &&
            VectorizableTree.front()->hasState() &&
            VectorizableTree.front()->getOpcode() == Instruction::PHI &&
              auto *PHIUser = dyn_cast<PHINode>(U);
              return (!PHIUser ||
                      PHIUser->getParent() !=
                          VectorizableTree.front()->getMainOp())
              return ValueToExtUses->contains(V);
        if (IsProfitablePHIUser) {
            (!GatheredLoadsEntriesFirst.has_value() ||
             Entry->Idx < *GatheredLoadsEntriesFirst)) {
          unsigned ScalarUsesCount = count_if(Entry->Scalars, [&](Value *V) {
            return ValueToExtUses->contains(V);
          auto It = ExtractsCount.find(Entry);
          if (It != ExtractsCount.end()) {
            assert(ScalarUsesCount >= It->getSecond().size() &&
                   "Expected total number of external uses not less than "
                   "number of scalar uses.");
            ScalarUsesCount -= It->getSecond().size();
          KeepScalar = ScalarUsesCount <= 1 || !has_single_bit(ScalarUsesCount);
          ExternalUsesAsOriginalScalar.insert(EU.Scalar);
          for (Value *V : Inst->operands()) {
            auto It = ValueToExtUses->find(V);
            if (It != ValueToExtUses->end()) {
              ExternalUses[It->second].User = nullptr;
          ExtraCost = ScalarCost;
          if (!IsPhiInLoop(EU))
            ExtractsCount[Entry].insert(Inst);
          if (CanBeUsedAsScalarCast) {
            ScalarOpsFromCasts.insert(Inst->getOperand(0));
              for (Value *V : IOp->operands()) {
                auto It = ValueToExtUses->find(V);
                if (It != ValueToExtUses->end()) {
                  ExternalUses[It->second].User = nullptr;
    ExtractCost += ExtraCost;
  for (Value *V : ScalarOpsFromCasts) {
    ExternalUsesAsOriginalScalar.insert(V);
      ExternalUses.emplace_back(V, nullptr, *TEs.front(),
                                TEs.front()->findLaneForValue(V));
  if (!VectorizedVals.empty()) {
    const TreeEntry &Root = *VectorizableTree.front();
    auto BWIt = MinBWs.find(&Root);
    if (BWIt != MinBWs.end()) {
      Type *DstTy = Root.Scalars.front()->getType();
      unsigned OriginalSz = DL->getTypeSizeInBits(DstTy->getScalarType());
          ReductionBitWidth == 0 ? BWIt->second.first : ReductionBitWidth;
      if (OriginalSz != SrcSz) {
        unsigned Opcode = Instruction::Trunc;
        if (OriginalSz > SrcSz)
          Opcode = BWIt->second.second ? Instruction::SExt : Instruction::ZExt;
        Cost += TTI->getCastInstrCost(Opcode, DstTy, SrcTy,
  Cost += ExtractCost;
  auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE, ArrayRef<int> Mask,
                                    bool ForSingleMask) {
    unsigned VF = Mask.size();
    unsigned VecVF = TE->getVectorFactor();
    bool HasLargeIndex =
        any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); });
    if ((VF != VecVF && HasLargeIndex) ||
      if (HasLargeIndex) {
        std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)),
          dbgs() << "SLP: Adding cost " << C
                 << " for final shuffle of insertelement external users.\n";
          TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
        return std::make_pair(TE, true);
      if (!ForSingleMask) {
        for (unsigned I = 0; I < VF; ++I) {
            ResizeMask[Mask[I]] = Mask[I];
          dbgs() << "SLP: Adding cost " << C
                 << " for final shuffle of insertelement external users.\n";
          TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
    return std::make_pair(TE, false);
  for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
    Value *Base = ShuffledInserts[I].InsertElements.front()->getOperand(0);
    auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
      assert((TEs.size() == 1 || TEs.size() == 2) &&
             "Expected exactly 1 or 2 tree entries.");
      if (TEs.size() == 1) {
          VF = TEs.front()->getVectorFactor();
        auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
              (Data.index() < VF &&
               static_cast<int>(Data.index()) == Data.value());
                     << " for final shuffle of insertelement "
                        "external users.\n";
                 TEs.front()->dump();
                 dbgs() << "SLP: Current total cost = " << Cost << "\n");
            TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor())
          VF = TEs.front()->getVectorFactor();
        auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
                     << " for final shuffle of vector node and external "
                        "insertelement users.\n";
                 if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump();
                 dbgs() << "SLP: Current total cost = " << Cost << "\n");
        [](const TreeEntry *E) { return E->getVectorFactor(); }, ResizeToVF,
        EstimateShufflesCost);
            ShuffledInserts[I].InsertElements.front()->getType()),
    Cost -= InsertCost;
  if (ReductionBitWidth != 0) {
    assert(UserIgnoreList && "Expected reduction tree.");
    const TreeEntry &E = *VectorizableTree.front();
    auto It = MinBWs.find(&E);
    if (It != MinBWs.end() && It->second.first != ReductionBitWidth) {
      unsigned SrcSize = It->second.first;
      unsigned DstSize = ReductionBitWidth;
      unsigned Opcode = Instruction::Trunc;
      if (SrcSize < DstSize) {
        bool IsArithmeticExtendedReduction =
              return is_contained({Instruction::Add, Instruction::FAdd,
                                   Instruction::Mul, Instruction::FMul,
                                   Instruction::And, Instruction::Or,
        if (IsArithmeticExtendedReduction)
              Instruction::BitCast;
          Opcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
      if (Opcode != Instruction::BitCast) {
            getWidenedType(Builder.getIntNTy(SrcSize), E.getVectorFactor());
            getWidenedType(Builder.getIntNTy(DstSize), E.getVectorFactor());
        switch (E.getOpcode()) {
        case Instruction::SExt:
        case Instruction::ZExt:
        case Instruction::Trunc: {
          const TreeEntry *OpTE = getOperandEntry(&E, 0);
          CCH = getCastContextHint(*OpTE);
        CastCost += TTI->getCastInstrCost(Opcode, DstVecTy, SrcVecTy, CCH,
                   << " for final resize for reduction from " << SrcVecTy
                   << " to " << DstVecTy << "\n";
            dbgs() << "SLP: Current total cost = " << Cost << "\n");
  std::optional<InstructionCost> SpillCost;
    Cost += *SpillCost;
    OS << "SLP: Spill Cost = ";
    OS << ".\nSLP: Extract Cost = " << ExtractCost << ".\n"
       << "SLP: Total Cost = " << Cost << ".\n";
    ViewGraph(this, "SLP" + F->getName(), false, Str);
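// Tries to express a gather of extractelements as a shuffle of at most two
// source vectors; out-of-range extracts are treated as undefs in the mask.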
std::optional<TTI::ShuffleKind>
BoUpSLP::tryToGatherSingleRegisterExtractElements(
  for (int I = 0, E = VL.size(); I < E; ++I) {
      if (Idx >= VecTy->getNumElements()) {
        SmallBitVector ExtractMask(VecTy->getNumElements(), true);
        ExtractMask.reset(*Idx);
    VectorOpToIdx[EI->getVectorOperand()].push_back(I);
  stable_sort(Vectors, [](const auto &P1, const auto &P2) {
    return P1.second.size() > P2.second.size();
  const int UndefSz = UndefVectorExtracts.size();
  unsigned SingleMax = 0;
  unsigned PairMax = 0;
  if (!Vectors.empty()) {
    SingleMax = Vectors.front().second.size() + UndefSz;
    if (Vectors.size() > 1) {
      auto *ItNext = std::next(Vectors.begin());
      PairMax = SingleMax + ItNext->second.size();
  if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
    return std::nullopt;
  if (SingleMax >= PairMax && SingleMax) {
    for (int Idx : Vectors.front().second)
      std::swap(GatheredExtracts[Idx], VL[Idx]);
  } else if (!Vectors.empty()) {
    for (unsigned Idx : {0, 1})
      for (int Idx : Vectors[Idx].second)
        std::swap(GatheredExtracts[Idx], VL[Idx]);
  for (int Idx : UndefVectorExtracts)
    std::swap(GatheredExtracts[Idx], VL[Idx]);
  std::optional<TTI::ShuffleKind> Res =
    return std::nullopt;
  for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) {
BoUpSLP::tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
                                    SmallVectorImpl<int> &Mask,
                                    unsigned NumParts) const {
  assert(NumParts > 0 && "NumParts expected to be greater than or equal to 1.");
    SmallVector<int> SubMask;
    std::optional<TTI::ShuffleKind> Res =
        tryToGatherSingleRegisterExtractElements(SubVL, SubMask);
    ShufflesRes[Part] = Res;
    copy(SubMask, std::next(Mask.begin(), Part * SliceSize));
  if (none_of(ShufflesRes, [](const std::optional<TTI::ShuffleKind> &Res) {
        return Res.has_value();
    ShufflesRes.clear();
  return ShufflesRes;
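// Searches existing tree entries that can supply this gather through a one-
// or two-source shuffle, subject to dominance and scheduling constraints.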
std::optional<TargetTransformInfo::ShuffleKind>
BoUpSLP::isGatherShuffledSingleRegisterEntry(
    SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part, bool ForOrder) {
  auto GetUserEntry = [&](const TreeEntry *TE) {
    while (TE->UserTreeIndex && TE->UserTreeIndex.EdgeIdx == UINT_MAX)
      TE = TE->UserTreeIndex.UserTE;
    if (TE == VectorizableTree.front().get())
      return EdgeInfo(const_cast<TreeEntry *>(TE), 0);
    return TE->UserTreeIndex;
  auto HasGatherUser = [&](const TreeEntry *TE) {
    while (TE->Idx != 0 && TE->UserTreeIndex) {
      if (TE->UserTreeIndex.EdgeIdx == UINT_MAX)
      TE = TE->UserTreeIndex.UserTE;
  const EdgeInfo TEUseEI = GetUserEntry(TE);
    return std::nullopt;
  const Instruction *TEInsertPt = &getLastInstructionInBundle(TEUseEI.UserTE);
          TEUseEI.UserTE->hasState() ? TEUseEI.UserTE->getMainOp() : nullptr);
      PHI && TEUseEI.UserTE->State != TreeEntry::SplitVectorize) {
    TEInsertBlock = PHI->getIncomingBlock(TEUseEI.EdgeIdx);
    TEInsertBlock = TEInsertPt->getParent();
  if (!DT->isReachableFromEntry(TEInsertBlock))
    return std::nullopt;
  auto *NodeUI = DT->getNode(TEInsertBlock);
  assert(NodeUI && "Should only process reachable instructions");
  auto CheckOrdering = [&](const Instruction *InsertPt) {
    const BasicBlock *InsertBlock = InsertPt->getParent();
    auto *NodeEUI = DT->getNode(InsertBlock);
    assert((NodeUI == NodeEUI) ==
               (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) &&
           "Different nodes should have different DFS numbers");
    if (TEInsertPt->getParent() != InsertBlock &&
        (DT->dominates(NodeUI, NodeEUI) || !DT->dominates(NodeEUI, NodeUI)))
    if (TEInsertPt->getParent() == InsertBlock &&
  SmallDenseMap<Value *, int> UsedValuesEntry;
  SmallPtrSet<const Value *, 16> VisitedValue;
  auto CheckAndUseSameNode = [&](const TreeEntry *TEPtr) {
    if ((TEPtr->getVectorFactor() != VL.size() &&
         TEPtr->Scalars.size() != VL.size()) ||
        (!TEPtr->isSame(VL) && !TEPtr->isSame(TE->Scalars)))
    for (Value *V : VL) {
  auto CheckParentNodes = [&](const TreeEntry *User1, const TreeEntry *User2,
                              unsigned EdgeIdx) {
    const TreeEntry *Ptr1 = User1;
    const TreeEntry *Ptr2 = User2;
    SmallDenseMap<const TreeEntry *, unsigned> PtrToIdx;
      EdgeIdx = Ptr2->UserTreeIndex.EdgeIdx;
      Ptr2 = Ptr2->UserTreeIndex.UserTE;
      unsigned Idx = Ptr1->UserTreeIndex.EdgeIdx;
      Ptr1 = Ptr1->UserTreeIndex.UserTE;
      if (auto It = PtrToIdx.find(Ptr1); It != PtrToIdx.end())
        return Idx < It->second;
  auto CheckNonSchedulableOrdering = [&](const TreeEntry *E,
    return TEUseEI && TEUseEI.UserTE && TEUseEI.UserTE->hasCopyableElements() &&
           !TEUseEI.UserTE->isCopyableElement(
           InsertPt->getNextNode() == TEInsertPt &&
           (!E->hasCopyableElements() || !E->isCopyableElement(InsertPt) ||
  for (Value *V : VL) {
    SmallPtrSet<const TreeEntry *, 4> VToTEs;
    for (const TreeEntry *TEPtr : ValueToGatherNodes.lookup(V)) {
      if (TEPtr == TE || TEPtr->Idx == 0)
                    [&](Value *V) { return GatheredScalars.contains(V); }) &&
             "Must contain at least single gathered value.");
      assert(TEPtr->UserTreeIndex &&
             "Expected only single user of a gather node.");
      const EdgeInfo &UseEI = TEPtr->UserTreeIndex;
      PHINode *UserPHI = (UseEI.UserTE->State != TreeEntry::SplitVectorize &&
                          UseEI.UserTE->hasState())
              : &getLastInstructionInBundle(UseEI.UserTE);
      if (TEInsertPt == InsertPt) {
        if (TEUseEI.UserTE->State == TreeEntry::Vectorize &&
            (TEUseEI.UserTE->getOpcode() != Instruction::PHI ||
             TEUseEI.UserTE->isAltShuffle()) &&
          if (UseEI.UserTE->State != TreeEntry::Vectorize ||
              (UseEI.UserTE->hasState() &&
               UseEI.UserTE->getOpcode() == Instruction::PHI &&
               !UseEI.UserTE->isAltShuffle()) ||
            (TEUseEI.UserTE != UseEI.UserTE || TEUseEI.EdgeIdx < UseEI.EdgeIdx))
        if (TEUseEI.UserTE->State == TreeEntry::Vectorize &&
            TEUseEI.UserTE->getOpcode() == Instruction::PHI &&
            UseEI.UserTE->State == TreeEntry::Vectorize &&
            UseEI.UserTE->getOpcode() == Instruction::PHI &&
            TEUseEI.UserTE != UseEI.UserTE)
        if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx)
        if (TEUseEI.UserTE != UseEI.UserTE &&
            (TEUseEI.UserTE->Idx < UseEI.UserTE->Idx ||
             HasGatherUser(TEUseEI.UserTE)))
        if (CheckParentNodes(TEUseEI.UserTE, UseEI.UserTE, UseEI.EdgeIdx))
      if (!TEUseEI.UserTE->isGather() && !UserPHI &&
          TEUseEI.UserTE->doesNotNeedToSchedule() !=
              UseEI.UserTE->doesNotNeedToSchedule() &&
      if ((TEInsertBlock != InsertPt->getParent() ||
           TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) &&
          (!CheckOrdering(InsertPt) ||
           (UseEI.UserTE->hasCopyableElements() &&
      if (CheckAndUseSameNode(TEPtr))
      if (CheckNonSchedulableOrdering(UseEI.UserTE, InsertPt))
          VTEs, [&](const TreeEntry *MTE) { return MTE != TEUseEI.UserTE; });
      if (It != VTEs.end()) {
        const TreeEntry *VTE = *It;
        if (none_of(TE->CombinedEntriesWithIndices,
                    [&](const auto &P) { return P.first == VTE->Idx; })) {
          Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
          if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
        if (CheckAndUseSameNode(VTE))
      const TreeEntry *VTE = VTEs.front();
      if (ForOrder && VTE->Idx < GatheredLoadsEntriesFirst.value_or(0) &&
          VTEs.size() > 1 && VTE->State != TreeEntry::Vectorize) {
        VTEs = VTEs.drop_front();
        const auto *MIt = find_if(VTEs, [](const TreeEntry *MTE) {
          return MTE->State == TreeEntry::Vectorize;
        if (MIt == VTEs.end())
      if (none_of(TE->CombinedEntriesWithIndices,
                  [&](const auto &P) { return P.first == VTE->Idx; })) {
        Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
        if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst) ||
            CheckNonSchedulableOrdering(VTE, &LastBundleInst))
      if (CheckAndUseSameNode(VTE))
    if (VToTEs.empty())
    if (UsedTEs.empty()) {
      SmallPtrSet<const TreeEntry *, 4> SavedVToTEs(VToTEs);
      for (SmallPtrSet<const TreeEntry *, 4> &Set : UsedTEs) {
        if (!VToTEs.empty()) {
        VToTEs = SavedVToTEs;
      if (Idx == UsedTEs.size()) {
        if (UsedTEs.size() == 2)
        UsedTEs.push_back(SavedVToTEs);
        Idx = UsedTEs.size() - 1;
  if (UsedTEs.empty()) {
    return std::nullopt;
  if (UsedTEs.size() == 1) {
                                            UsedTEs.front().end());
    sort(FirstEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
      return TE1->Idx < TE2->Idx;
    auto *It = find_if(FirstEntries, [=](const TreeEntry *EntryPtr) {
      return EntryPtr->isSame(VL) || EntryPtr->isSame(TE->Scalars);
    if (It != FirstEntries.end() &&
        ((*It)->getVectorFactor() == VL.size() ||
         ((*It)->getVectorFactor() == TE->Scalars.size() &&
          TE->ReuseShuffleIndices.size() == VL.size() &&
          (*It)->isSame(TE->Scalars)))) {
      if ((*It)->getVectorFactor() == VL.size()) {
        std::iota(std::next(Mask.begin(), Part * VL.size()),
                  std::next(Mask.begin(), (Part + 1) * VL.size()), 0);
        SmallVector<int> CommonMask = TE->getCommonMask();
    Entries.push_back(FirstEntries.front());
    for (auto &P : UsedValuesEntry)
    VF = FirstEntries.front()->getVectorFactor();
    assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries.");
    DenseMap<int, const TreeEntry *> VFToTE;
    for (const TreeEntry *TE : UsedTEs.front()) {
      unsigned VF = TE->getVectorFactor();
      auto It = VFToTE.find(VF);
      if (It != VFToTE.end()) {
        if (It->second->Idx > TE->Idx)
          It->getSecond() = TE;
                                             UsedTEs.back().end());
    sort(SecondEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
      return TE1->Idx < TE2->Idx;
    for (const TreeEntry *TE : SecondEntries) {
      auto It = VFToTE.find(TE->getVectorFactor());
      if (It != VFToTE.end()) {
    if (Entries.empty()) {
          UsedTEs.front(), [](const TreeEntry *TE1, const TreeEntry *TE2) {
            return TE1->Idx < TE2->Idx;
      Entries.push_back(SecondEntries.front());
      VF = std::max(Entries.front()->getVectorFactor(),
                    Entries.back()->getVectorFactor());
      VF = Entries.front()->getVectorFactor();
    for (const TreeEntry *E : Entries)
    for (auto &P : UsedValuesEntry) {
      if (ValuesToEntries[Idx].contains(P.first)) {
  auto AreCompatiblePHIs = [&](Value *V, Value *V1) {
    for (int I = 0, E = PHI->getNumIncomingValues(); I < E; ++I) {
      Value *In1 = PHI1->getIncomingValue(I);
  auto MightBeIgnored = [=](Value *V) {
           !areAllUsersVectorized(I, UserIgnoreList) && isSimple(I);
  auto NeighborMightBeIgnored = [&](Value *V, int Idx) {
    Value *V1 = VL[Idx];
    bool UsedInSameVTE = false;
    auto It = UsedValuesEntry.find(V1);
    if (It != UsedValuesEntry.end())
      UsedInSameVTE = It->second == UsedValuesEntry.find(V)->second;
    return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE &&
  SmallBitVector UsedIdxs(Entries.size());
  for (int I = 0, E = VL.size(); I < E; ++I) {
    auto It = UsedValuesEntry.find(V);
    if (It == UsedValuesEntry.end())
         ((I > 0 && NeighborMightBeIgnored(V, I - 1)) ||
          (I != E - 1 && NeighborMightBeIgnored(V, I + 1)))))
    unsigned Idx = It->second;
  for (unsigned I = 0, Sz = Entries.size(); I < Sz; ++I) {
    if (!UsedIdxs.test(I))
    for (std::pair<unsigned, int> &Pair : EntryLanes)
      if (Pair.first == I)
        Pair.first = TempEntries.size();
  Entries.swap(TempEntries);
  if (EntryLanes.size() == Entries.size() &&
          .slice(Part * VL.size(),
                 std::min<int>(VL.size(), TE->Scalars.size())))) {
    return std::nullopt;
  bool IsIdentity = Entries.size() == 1;
  for (const std::pair<unsigned, int> &Pair : EntryLanes) {
    unsigned Idx = Part * VL.size() + Pair.second;
        (ForOrder ? std::distance(
                        Entries[Pair.first]->Scalars.begin(),
                        find(Entries[Pair.first]->Scalars, VL[Pair.second]))
                  : Entries[Pair.first]->findLaneForValue(VL[Pair.second]));
    IsIdentity &= Mask[Idx] == Pair.second;
  if (ForOrder || IsIdentity || Entries.empty()) {
    switch (Entries.size()) {
      if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
      if (EntryLanes.size() > 2 || VL.size() <= 2)
             (EntryLanes.size() > Entries.size() || VL.size() <= 2)) {
    SmallVector<int> SubMask(std::next(Mask.begin(), Part * VL.size()),
                             std::next(Mask.begin(), (Part + 1) * VL.size()));
    int MinElement = SubMask.front(), MaxElement = SubMask.front();
    for (int Idx : SubMask) {
    assert(MaxElement >= 0 && MinElement >= 0 &&
           MaxElement % VF >= MinElement % VF &&
           "Expected at least single element.");
    unsigned NewVF = std::max<unsigned>(
                         (MaxElement % VF) -
                         (MinElement % VF) + 1));
      for (int &Idx : SubMask) {
        Idx = ((Idx % VF) - (((MinElement % VF) / NewVF) * NewVF)) % NewVF +
              (Idx >= static_cast<int>(VF) ? NewVF : 0);
    auto *MaskVecTy = getWidenedType(VL.front()->getType(), SubMask.size());
    auto GetShuffleCost = [&,
                           &TTI = *TTI](ArrayRef<int> Mask,
      if (Entries.size() == 1 && Entries.front()->getInterleaveFactor() > 0 &&
              Mask, Entries.front()->getInterleaveFactor()))
      return ::getShuffleCost(TTI,
    InstructionCost ShuffleCost = GetShuffleCost(SubMask, Entries, VecTy);
    SmallVector<int> FirstMask(SubMask.begin(), SubMask.end());
    if (Entries.size() == 1 || !Entries[0]->isGather()) {
      FirstShuffleCost = ShuffleCost;
      bool IsIdentity = true;
      for (auto [I, Idx] : enumerate(FirstMask)) {
        if (Idx >= static_cast<int>(NewVF)) {
        IsIdentity &= static_cast<int>(I) == Idx;
      FirstShuffleCost = GetShuffleCost(FirstMask, Entries.front(), VecTy);
          *TTI, VL.front()->getType(), MaskVecTy, DemandedElts, true,
    SmallVector<int> SecondMask(SubMask.begin(), SubMask.end());
    if (Entries.size() == 1 || !Entries[1]->isGather()) {
      SecondShuffleCost = ShuffleCost;
      bool IsIdentity = true;
      for (auto [I, Idx] : enumerate(SecondMask)) {
        if (Idx < static_cast<int>(NewVF) && Idx >= 0) {
        IsIdentity &= static_cast<int>(I) == Idx;
      SecondShuffleCost = GetShuffleCost(SecondMask, Entries[1], VecTy);
          *TTI, VL.front()->getType(), MaskVecTy, DemandedElts, true,
        *TTI, VL.front()->getType(), MaskVecTy, DemandedElts, true,
    const TreeEntry *BestEntry = nullptr;
    if (FirstShuffleCost < ShuffleCost) {
      std::for_each(std::next(Mask.begin(), Part * VL.size()),
                    std::next(Mask.begin(), (Part + 1) * VL.size()),
                      if (Idx >= static_cast<int>(VF))
                        Idx = PoisonMaskElem;
      BestEntry = Entries.front();
      ShuffleCost = FirstShuffleCost;
    if (SecondShuffleCost < ShuffleCost) {
      std::for_each(std::next(Mask.begin(), Part * VL.size()),
                    std::next(Mask.begin(), (Part + 1) * VL.size()),
                      if (Idx < static_cast<int>(VF))
                        Idx = PoisonMaskElem;
      BestEntry = Entries[1];
      ShuffleCost = SecondShuffleCost;
    if (BuildVectorCost >= ShuffleCost) {
      Entries.push_back(BestEntry);
  std::fill(std::next(Mask.begin(), Part * VL.size()),
  return std::nullopt;
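// Register-wise driver: slices the gather into NumParts sub-vectors and
// attempts the single-register match for each slice.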
SmallVector<std::optional<TTI::ShuffleKind>>
BoUpSLP::isGatherShuffledEntry(
    const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
    SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries,
    unsigned NumParts, bool ForOrder) {
  assert(NumParts > 0 && NumParts < VL.size() &&
         "Expected positive number of registers.");
  Entries.clear();
  // No need to check for the topmost gather node.
  if (TE == VectorizableTree.front().get() &&
      (!GatheredLoadsEntriesFirst.has_value() ||
       none_of(ArrayRef(VectorizableTree).drop_front(),
               [](const std::unique_ptr<TreeEntry> &TE) {
                 return !TE->isGather();
               })))
    return {};
  // ...
  if (TE->hasNonWholeRegisterOrNonPowerOf2Vec(*TTI))
    return {};
  Mask.assign(VL.size(), PoisonMaskElem);
  assert((TE->UserTreeIndex || TE == VectorizableTree.front().get()) &&
         "Expected only single user of the gather node.");
  assert(VL.size() % NumParts == 0 &&
         "Number of scalars must be divisible by NumParts.");
  if (TE->UserTreeIndex && TE->UserTreeIndex.UserTE->isGather() &&
      TE->UserTreeIndex.EdgeIdx == UINT_MAX &&
      (/*...*/
       (TE->hasState() && TE->getOpcode() == Instruction::ExtractElement) ||
       (/*...*/
        getSameValuesTreeEntry(TE->getMainOp(), TE->Scalars))))
    return {};
  unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
  SmallVector<std::optional<TTI::ShuffleKind>> Res;
  for (unsigned Part : seq<unsigned>(NumParts)) {
    ArrayRef<Value *> SubVL =
        VL.slice(Part * SliceSize, getNumElems(VL.size(), SliceSize, Part));
    SmallVectorImpl<const TreeEntry *> &SubEntries = Entries.emplace_back();
    std::optional<TTI::ShuffleKind> SubRes =
        isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part,
                                            ForOrder);
    if (!SubRes)
      SubEntries.clear();
    Res.push_back(SubRes);
    if (SubEntries.size() == 1 && *SubRes == TTI::SK_PermuteSingleSrc &&
        SubEntries.front()->getVectorFactor() == VL.size() &&
        (SubEntries.front()->isSame(TE->Scalars) ||
         SubEntries.front()->isSame(VL))) {
      SmallVector<const TreeEntry *> LocalSubEntries;
      LocalSubEntries.swap(SubEntries);
      Entries.clear();
      Res.clear();
      std::iota(Mask.begin(), Mask.end(), 0);
      // Clear undef scalars.
      for (int I = 0, Sz = VL.size(); I < Sz; ++I)
        if (isa<PoisonValue>(VL[I]))
          Mask[I] = PoisonMaskElem;
      Entries.emplace_back(1, LocalSubEntries.front());
      Res.push_back(TTI::SK_PermuteSingleSrc);
      return Res;
    }
  }
  if (all_of(Res,
             [](const std::optional<TTI::ShuffleKind> &SK) { return !SK; })) {
    Entries.clear();
    return {};
  }
  return Res;
}
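// Illustrative sketch (not part of the pass): isGatherShuffledEntry splits the
// gathered scalars into NumParts register-sized slices and queries the
// single-register variant per slice. The slicing by itself, in a minimal
// standalone form (Size is assumed divisible by NumParts, as asserted above):
#if 0
#include <cassert>
#include <cstddef>
#include <utility>
// Returns {offset, length} of slice `Part` when `Size` scalars are split into
// `NumParts` equal parts.
inline std::pair<size_t, size_t> sliceBounds(size_t Size, size_t NumParts,
                                             size_t Part) {
  assert(NumParts > 0 && Size % NumParts == 0 && "bad split");
  size_t SliceSize = Size / NumParts;
  return {Part * SliceSize, SliceSize};
}
#endif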
InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
                                       Type *ScalarTy) const {
  const unsigned VF = VL.size();
  auto *VecTy = getWidenedType(ScalarTy, VF);
  APInt DemandedElements = APInt::getZero(VF);
  constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  InstructionCost Cost;
  auto EstimateInsertCost = [&](unsigned I, Value *V) {
    DemandedElements.setBit(I);
    if (V->getType() != ScalarTy)
      Cost += TTI->getCastInstrCost(Instruction::Trunc, ScalarTy, V->getType(),
                                    TTI::CastContextHint::None, CostKind);
  };
  SmallVector<int> ConstantShuffleMask(VF, PoisonMaskElem);
  std::iota(ConstantShuffleMask.begin(), ConstantShuffleMask.end(), 0);
  for (auto [I, V] : enumerate(VL)) {
    // Constants are shuffled in from a separate constant vector; only the
    // first occurrence of each non-constant scalar is charged an insert.
    if (isConstant(V)) {
      ConstantShuffleMask[I] = I + VF;
      continue;
    }
    // ...
    EstimateInsertCost(I, V);
    // ...
  }
  bool IsAnyNonUndefConst =
      any_of(VL, [](Value *V) { return isConstant(V) && !isa<UndefValue>(V); });
  if (!ForPoisonSrc && IsAnyNonUndefConst) {
    Cost += ::getShuffleCost(*TTI, TargetTransformInfo::SK_PermuteTwoSrc, VecTy,
                             ConstantShuffleMask);
    // ...
  }
  if (!DemandedElements.isZero())
    Cost += getScalarizationOverhead(*TTI, ScalarTy, VecTy, DemandedElements,
                                     /*Insert=*/true, /*Extract=*/false,
                                     CostKind,
                                     ForPoisonSrc && !IsAnyNonUndefConst, VL);
  // ...
  return Cost;
}
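// Illustrative sketch (not part of the pass): getGatherCost charges an insert
// only for the first occurrence of each non-constant scalar; repeats are
// served by a single-source permute. The deduplication scheme, standalone,
// with plain ints standing in for Values:
#if 0
#include <unordered_map>
#include <vector>
constexpr int PoisonElem = -1;
// First occurrences map to themselves, duplicates map back to the lane of
// their first occurrence.
inline std::vector<int> buildReuseMask(const std::vector<int> &Scalars) {
  std::vector<int> Mask(Scalars.size(), PoisonElem);
  std::unordered_map<int, int> FirstLane;
  for (int I = 0, E = (int)Scalars.size(); I != E; ++I) {
    auto [It, Inserted] = FirstLane.try_emplace(Scalars[I], I);
    Mask[I] = Inserted ? I : It->second;
  }
  return Mask;
}
#endif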
Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
  auto It = EntryToLastInstruction.find(E);
  if (It != EntryToLastInstruction.end())
    return *It->second;
  // ...
  Instruction *Front = nullptr;
  unsigned Opcode = 0;
  if (E->hasState()) {
    Front = E->getMainOp();
    Opcode = E->getOpcode();
  }
  // ...
  assert(((GatheredLoadsEntriesFirst.has_value() &&
           Opcode == Instruction::Load && E->isGather() &&
           E->Idx < *GatheredLoadsEntriesFirst) ||
          E->State == TreeEntry::SplitVectorize || E->hasCopyableElements() ||
          all_of(E->Scalars,
                 [=](Value *V) -> bool {
                   if (Opcode == Instruction::GetElementPtr &&
                       !isa<GetElementPtrInst>(V))
                     return true;
                   auto *I = dyn_cast<Instruction>(V);
                   return !I || !E->getMatchingMainOpOrAltOp(I) ||
                          I->getParent() == BB ||
                          isVectorLikeInstWithConstOps(I);
                 })) &&
         "Expected gathered loads or GEPs or instructions from same basic "
         "block.");

  auto FindLastInst = [&]() {
    Instruction *LastInst = Front;
    for (Value *V : E->Scalars) {
      auto *I = dyn_cast<Instruction>(V);
      if (!I)
        continue;
      if (E->isCopyableElement(I))
        continue;
      if (LastInst->getParent() == I->getParent()) {
        if (LastInst->comesBefore(I))
          LastInst = I;
        continue;
      }
      assert(((Opcode == Instruction::GetElementPtr &&
               !isa<GetElementPtrInst>(I)) ||
              E->State == TreeEntry::SplitVectorize ||
              // ...
              (GatheredLoadsEntriesFirst.has_value() &&
               Opcode == Instruction::Load && E->isGather() &&
               E->Idx < *GatheredLoadsEntriesFirst)) &&
             "Expected vector-like or non-GEP in GEP node insts only.");
      if (!DT->isReachableFromEntry(LastInst->getParent())) {
        LastInst = I;
        continue;
      }
      if (!DT->isReachableFromEntry(I->getParent()))
        continue;
      auto *NodeA = DT->getNode(LastInst->getParent());
      auto *NodeB = DT->getNode(I->getParent());
      assert(NodeA && "Should only process reachable instructions");
      assert(NodeB && "Should only process reachable instructions");
      assert((NodeA == NodeB) ==
                 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
             "Different nodes should have different DFS numbers");
      if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn())
        LastInst = I;
    }
    return LastInst;
  };

  auto FindFirstInst = [&]() {
    Instruction *FirstInst = Front;
    for (Value *V : E->Scalars) {
      auto *I = dyn_cast<Instruction>(V);
      if (!I)
        continue;
      if (E->isCopyableElement(I))
        continue;
      if (FirstInst->getParent() == I->getParent()) {
        if (I->comesBefore(FirstInst))
          FirstInst = I;
        continue;
      }
      assert(((Opcode == Instruction::GetElementPtr &&
               !isa<GetElementPtrInst>(I)) ||
              /*...*/ true) &&
             "Expected vector-like or non-GEP in GEP node insts only.");
      if (!DT->isReachableFromEntry(FirstInst->getParent())) {
        FirstInst = I;
        continue;
      }
      if (!DT->isReachableFromEntry(I->getParent()))
        continue;
      auto *NodeA = DT->getNode(FirstInst->getParent());
      auto *NodeB = DT->getNode(I->getParent());
      assert(NodeA && "Should only process reachable instructions");
      assert(NodeB && "Should only process reachable instructions");
      assert((NodeA == NodeB) ==
                 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
             "Different nodes should have different DFS numbers");
      if (NodeA->getDFSNumIn() > NodeB->getDFSNumIn())
        FirstInst = I;
    }
    return FirstInst;
  };

  Instruction *Res = nullptr;
  if (E->State == TreeEntry::SplitVectorize) {
    Res = FindLastInst();
    // Also take the last instructions of the combined entries into account.
    for (auto *E : Entries) {
      // ...
      auto *I = &getLastInstructionInBundle(E);
      // ...
    }
    EntryToLastInstruction.try_emplace(E, Res);
    return *Res;
  }

  // Set the insert point for gathered loads to the very first load.
  if (GatheredLoadsEntriesFirst.has_value() &&
      E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
      Opcode == Instruction::Load) {
    Res = FindFirstInst();
    EntryToLastInstruction.try_emplace(E, Res);
    return *Res;
  }

  // ...
  auto FindScheduleBundle = [&](const TreeEntry *E) -> const ScheduleBundle * {
    // ...
    const auto *It = BlocksSchedules.find(BB);
    if (It == BlocksSchedules.end())
      return nullptr;
    for (Value *V : E->Scalars) {
      // ... (look up the schedule bundles for V in the block schedule)
      if (Bundles.empty())
        continue;
      const auto *It = find_if(
          Bundles, [&](ScheduleBundle *B) { return B->getTreeEntry() == E; });
      if (It != Bundles.end())
        return *It;
    }
    return nullptr;
  };
  const ScheduleBundle *Bundle = FindScheduleBundle(E);
  if (!E->isGather() && !Bundle) {
    if ((Opcode == Instruction::GetElementPtr &&
         any_of(E->Scalars,
                [](Value *V) {
                  return !isa<GetElementPtrInst>(V) && isa<Instruction>(V);
                })) ||
        all_of(E->Scalars,
               [&](Value *V) {
                 return isa<PoisonValue>(V) ||
                        (E->Idx == 0 && isa<InsertElementInst>(V)) ||
                        E->isCopyableElement(V) ||
                        (!isVectorLikeInstWithConstOps(V) &&
                         isUsedOutsideBlock(V));
               }) ||
        (!E->doesNotNeedToSchedule() ||
         all_of(E->Scalars,
                [&](Value *V) {
                  if (!isa<Instruction>(V) ||
                      (E->hasCopyableElements() && E->isCopyableElement(V)))
                    return true;
                  return !areAllOperandsNonInsts(V);
                }) ||
         all_of(E->Scalars, [&](Value *V) {
           if (!isa<Instruction>(V) ||
               (E->hasCopyableElements() && E->isCopyableElement(V)))
             return true;
           return MustGather.contains(V);
         })))
      Res = FindLastInst();
    else
      Res = FindFirstInst();
    EntryToLastInstruction.try_emplace(E, Res);
    return *Res;
  }

  // ...
  if (Bundle) {
    assert(!E->isGather() && "Gathered instructions should not be scheduled");
    Res = Bundle->getBundle().back()->getInst();
    EntryToLastInstruction.try_emplace(E, Res);
    return *Res;
  }

  // ...
  Res = FindLastInst();
  assert(Res && "Failed to find last instruction in bundle");
  EntryToLastInstruction.try_emplace(E, Res);
  return *Res;
}
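// Illustrative sketch (not part of the pass): when bundle members live in
// different blocks, FindLastInst/FindFirstInst order the blocks by their
// dominator-tree DFS-in numbers instead of scanning instructions. A hedged
// LLVM-API sketch of that comparison (DT.updateDFSNumbers() must have run so
// getDFSNumIn() is valid):
#if 0
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Instruction.h"
using namespace llvm;
// Returns whether A is "later" than B: same block falls back to comesBefore,
// different blocks compare dominator DFS-in numbers.
static bool comesLater(const DominatorTree &DT, const Instruction *A,
                       const Instruction *B) {
  if (A->getParent() == B->getParent())
    return B->comesBefore(A);
  return DT.getNode(A->getParent())->getDFSNumIn() >
         DT.getNode(B->getParent())->getDFSNumIn();
}
#endif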
void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
  auto *Front = E->getMainOp();
  Instruction *LastInst = &getLastInstructionInBundle(E);
  assert(LastInst && "Failed to find last instruction in bundle");
  // ...
  BasicBlock::iterator LastInstIt = LastInst->getIterator();
  // If the instruction is PHI, set the insert point after all the PHIs.
  bool IsPHI = isa<PHINode>(LastInst);
  if (IsPHI) {
    LastInstIt = LastInst->getParent()->getFirstNonPHIIt();
    if (LastInstIt != LastInst->getParent()->end() &&
        LastInstIt->getParent()->isLandingPad())
      LastInstIt = std::next(LastInstIt);
  }
  if (IsPHI ||
      (!E->isGather() && E->State != TreeEntry::SplitVectorize &&
       (E->doesNotNeedToSchedule() ||
        (E->hasCopyableElements() && !E->isCopyableElement(LastInst) &&
         /*...*/ true))) ||
      (GatheredLoadsEntriesFirst.has_value() &&
       E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
       E->getOpcode() == Instruction::Load)) {
    Builder.SetInsertPoint(LastInst->getParent(), LastInstIt);
  } else {
    // Set the insertion point after the last instruction in the bundle.
    Builder.SetInsertPoint(
        /*...*/);
    // Reuse a previously created position, if the bundle was visited already.
    if (Instruction *Res = LastInstructionToPos.lookup(LastInst)) {
      // ...
    } else {
      // ...
      LoadInst *Res = Builder.CreateAlignedLoad(Builder.getPtrTy(),
                                                /*...*/);
      // ...
      LastInstructionToPos.try_emplace(LastInst, Res);
    }
  }
  Builder.SetCurrentDebugLocation(Front->getDebugLoc());
}
Value *BoUpSLP::gather(
    ArrayRef<Value *> VL, Value *Root, Type *ScalarTy,
    function_ref<Value *(Value *, Value *, ArrayRef<int>)> CreateShuffle) {
  // List of instructions/lanes from current block and/or the blocks which are
  // part of the current loop body for postponed insertion.
  SmallVector<std::pair<Value *, unsigned>> PostponedInsts;
  SmallSet<int, 4> PostponedIndices;
  Loop *L = LI->getLoopFor(Builder.GetInsertBlock());
  auto &&CheckPredecessor = [](BasicBlock *InstBB, BasicBlock *InsertBB) {
    SmallPtrSet<BasicBlock *, 4> Visited;
    while (InsertBB && InsertBB != InstBB && Visited.insert(InsertBB).second)
      InsertBB = InsertBB->getSinglePredecessor();
    return InsertBB && InsertBB == InstBB;
  };
  for (int I = 0, E = VL.size(); I < E; ++I) {
    if (auto *Inst = dyn_cast<Instruction>(VL[I]))
      if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) ||
           /*...*/
           (L && (!Root || L->isLoopInvariant(Root)) && L->contains(Inst))) &&
          PostponedIndices.insert(I).second)
        PostponedInsts.emplace_back(Inst, I);
  }

  auto &&CreateInsertElement = [this](Value *Vec, Value *V, unsigned Pos,
                                      Type *Ty) {
    Value *Scalar = V;
    if (Scalar->getType() != Ty) {
      // ...
      Scalar = Builder.CreateIntCast(
          Scalar, Ty, /*...*/);
      // ...
    }
    Vec = Builder.CreateInsertElement(Vec, Scalar, Builder.getInt32(Pos));
    auto *InsElt = dyn_cast<InsertElementInst>(Vec);
    if (!InsElt)
      return Vec;
    GatherShuffleExtractSeq.insert(InsElt);
    // ...
    // Add the scalar to the 'need-to-extract' list, tracking its user.
    if (ArrayRef<TreeEntry *> Entries = getTreeEntries(V); !Entries.empty()) {
      User *UserOp = nullptr;
      // ...
      if (V->getType()->isVectorTy()) {
        if (auto *SV = dyn_cast<ShuffleVectorInst>(InsElt);
            SV && SV->getOperand(0) != V && SV->getOperand(1) != V) {
          // Find the shufflevector, caused by resize.
          auto FindOperand = [](Value *Vec, Value *V) -> Instruction * {
            if (auto *SV = dyn_cast<ShuffleVectorInst>(Vec)) {
              if (SV->getOperand(0) == V)
                return SV;
              if (SV->getOperand(1) == V)
                return SV;
            }
            return nullptr;
          };
          // ...
          if (Instruction *User = FindOperand(SV->getOperand(0), V))
            UserOp = User;
          else if (Instruction *User = FindOperand(SV->getOperand(1), V))
            UserOp = User;
          assert(UserOp &&
                 "Failed to find shufflevector, caused by resize.");
        }
      }
      // ...
      unsigned FoundLane = Entries.front()->findLaneForValue(V);
      ExternalUses.emplace_back(V, UserOp, *Entries.front(), FoundLane);
    }
    return Vec;
  };

  // ...
  SmallVector<int> NonConsts;
  SmallVector<int> Mask(VL.size(), PoisonMaskElem);
  std::iota(Mask.begin(), Mask.end(), 0);
  Value *OriginalRoot = Root;
  if (auto *SV = dyn_cast_or_null<ShuffleVectorInst>(Root);
      SV && /*...*/
      SV->getOperand(0)->getType() == VecTy) {
    Root = SV->getOperand(0);
    Mask.assign(SV->getShuffleMask().begin(), SV->getShuffleMask().end());
  }
  // Insert constant values at first.
  for (int I = 0, E = VL.size(); I < E; ++I) {
    // ...
    Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
    // ...
  }
  if (isConstant(Vec)) {
    Vec = OriginalRoot;
  } else {
    Vec = CreateShuffle(Root, Vec, Mask);
    if (auto *OI = dyn_cast<Instruction>(OriginalRoot);
        OI && OI->use_empty() &&
        none_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
          return TE->VectorizedValue == OI;
        }))
      eraseInstruction(OI);
  }
  // Insert non-constant values.
  for (int I : NonConsts)
    Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
  // Append instructions, which are/may be part of the loop, in the end to make
  // it possible to hoist non-loop-based instructions.
  for (const std::pair<Value *, unsigned> &Pair : PostponedInsts)
    Vec = CreateInsertElement(Vec, Pair.first, Pair.second, ScalarTy);
  return Vec;
}
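// Illustrative sketch (not part of the pass): at its core, gather() emits a
// chain of insertelement instructions into a poison vector. A minimal
// standalone IRBuilder version (the helper name is hypothetical):
#if 0
#include "llvm/IR/IRBuilder.h"
using namespace llvm;
// Builds a fixed vector from Scalars with a chain of inserts.
static Value *buildVectorChain(IRBuilderBase &Builder, Type *ScalarTy,
                               ArrayRef<Value *> Scalars) {
  auto *VecTy = FixedVectorType::get(ScalarTy, Scalars.size());
  Value *Vec = PoisonValue::get(VecTy);
  for (unsigned I = 0, E = Scalars.size(); I != E; ++I)
    Vec = Builder.CreateInsertElement(Vec, Scalars[I], Builder.getInt32(I));
  return Vec;
}
#endif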
class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
  bool IsFinalized = false;
  // ...
  /// ShuffleIRBuilder wraps the IRBuilder to track the generated shuffles and
  /// register them in the CSE/extract sequences.
  class ShuffleIRBuilder {
    // ...
  public:
    ShuffleIRBuilder(IRBuilderBase &Builder,
                     SetVector<Instruction *> &GatherShuffleExtractSeq,
                     SmallPtrSetImpl<BasicBlock *> &CSEBlocks,
                     const DataLayout &DL)
        : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq),
          CSEBlocks(CSEBlocks), DL(DL) {}
    ~ShuffleIRBuilder() = default;
    /// Creates shufflevector for the 2 operands with the given mask.
    Value *createShuffleVector(Value *V1, Value *V2, ArrayRef<int> Mask) {
      if (V1->getType() != V2->getType()) {
        assert(V1->getType()->isIntOrIntVectorTy() &&
               V2->getType()->isIntOrIntVectorTy() &&
               "Expected integer vector types only.");
        if (cast<VectorType>(V2->getType())
                ->getElementType()
                ->getIntegerBitWidth() < cast<VectorType>(V1->getType())
                                             ->getElementType()
                                             ->getIntegerBitWidth())
          V2 = Builder.CreateIntCast(V2, V1->getType(), /*...*/);
        else
          V1 = Builder.CreateIntCast(V1, V2->getType(), /*...*/);
      }
      Value *Vec = Builder.CreateShuffleVector(V1, V2, Mask);
      if (auto *I = dyn_cast<Instruction>(Vec)) {
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
      return Vec;
    }
    /// Creates permutation of the single vector operand with the given mask,
    /// if it is not identity mask.
    Value *createShuffleVector(Value *V1, ArrayRef<int> Mask) {
      unsigned VF = Mask.size();
      // ...
      Value *Vec = Builder.CreateShuffleVector(V1, Mask);
      if (auto *I = dyn_cast<Instruction>(Vec)) {
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
      return Vec;
    }
    Value *createIdentity(Value *V) { return V; }
    Value *createPoison(Type *Ty, unsigned VF) {
      return PoisonValue::get(getWidenedType(Ty, VF));
    }
    /// Resizes 2 input vectors to match their sizes, if they are not equal
    /// yet. The smallest vector is resized to the size of the larger one.
    void resizeToMatch(Value *&V1, Value *&V2) {
      // ...
      int VF = std::max(V1VF, V2VF);
      int MinVF = std::min(V1VF, V2VF);
      SmallVector<int> IdentityMask(VF, PoisonMaskElem);
      std::iota(IdentityMask.begin(), std::next(IdentityMask.begin(), MinVF),
                0);
      Value *&Op = MinVF == V1VF ? V1 : V2;
      Op = Builder.CreateShuffleVector(Op, IdentityMask);
      if (auto *I = dyn_cast<Instruction>(Op)) {
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
      // ...
    }
  };

  /// Smart shuffle instruction emission, walks through the shuffles trees and
  /// tries to find the best matching vector for the actual shuffle.
  Value *createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask) {
    assert(V1 && "Expected at least one vector value.");
    ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
                                    R.CSEBlocks, *R.DL);
    return BaseShuffleAnalysis::createShuffle<Value *>(
        V1, V2, Mask, ShuffleBuilder, ScalarTy);
  }

  /// Cast value \p V to the vector type with the same number of elements, but
  /// with the base type \p ScalarTy.
  Value *castToScalarTyElem(Value *V,
                            std::optional<bool> IsSigned = std::nullopt) {
    auto *VecTy = cast<VectorType>(V->getType());
    // ...
    if (VecTy->getElementType() == ScalarTy->getScalarType())
      return V;
    return Builder.CreateIntCast(
        V,
        VectorType::get(ScalarTy->getScalarType(), VecTy->getElementCount()),
        /*...*/);
  }

  Value *getVectorizedValue(const TreeEntry &E) {
    Value *Vec = E.VectorizedValue;
    // ...
    return castToScalarTyElem(Vec, any_of(E.Scalars, [&](Value *V) {
                                return !isa<PoisonValue>(V) &&
                                       !isKnownNonNegative(
                                           V, SimplifyQuery(*R.DL));
                              }));
  }

public:
  ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
      : BaseShuffleAnalysis(ScalarTy), Builder(Builder), R(R) {}
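  // Illustrative sketch (not part of the pass): resizeToMatch() widens the
  // narrower of two vectors with an identity shuffle padded by poison lanes,
  // so a later two-source shuffle sees equally sized operands. Standalone form
  // of the padding step:
#if 0
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/IRBuilder.h"
#include <numeric>
using namespace llvm;
// Pads V (of MinVF elements) up to VF elements; trailing lanes become poison.
static Value *padWithPoison(IRBuilderBase &Builder, Value *V, int MinVF,
                            int VF) {
  SmallVector<int> IdentityMask(VF, PoisonMaskElem);
  std::iota(IdentityMask.begin(), std::next(IdentityMask.begin(), MinVF), 0);
  return Builder.CreateShuffleVector(V, IdentityMask);
}
#endif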
  /// Adjusts extractelements after reusing them.
  Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
                        ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
                        unsigned NumParts, bool &UseVecBaseAsInput) {
    UseVecBaseAsInput = false;
    SmallPtrSet<Value *, 4> UniqueBases;
    Value *VecBase = nullptr;
    SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
    if (!E->ReorderIndices.empty()) {
      SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
                                   E->ReorderIndices.end());
      reorderScalars(VL, ReorderMask);
    }
    for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
      // ...
      auto *EI = cast<ExtractElementInst>(VL[I]);
      VecBase = EI->getVectorOperand();
      if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecBase); !TEs.empty())
        VecBase = TEs.front()->VectorizedValue;
      assert(VecBase && "Expected vectorized value.");
      UniqueBases.insert(VecBase);
      // If the only one use is vectorized - can delete the extractelement
      // itself.
      if (!EI->hasOneUse() || R.ExternalUsesAsOriginalScalar.contains(EI) ||
          (NumParts != 1 && count(VL, EI) > 1) ||
          any_of(EI->users(), [&](User *U) {
            ArrayRef<TreeEntry *> UTEs = R.getTreeEntries(U);
            return UTEs.empty() || UTEs.size() > 1 ||
                   (isa<GetElementPtrInst>(U) &&
                    !R.areAllUsersVectorized(cast<Instruction>(U))) ||
                   count_if(R.VectorizableTree,
                            [&](const std::unique_ptr<TreeEntry> &TE) {
                              return TE->UserTreeIndex.UserTE ==
                                         UTEs.front() &&
                                     is_contained(VL, EI);
                            }) != 1;
          }))
        continue;
      R.eraseInstruction(EI);
    }
    if (NumParts == 1 || UniqueBases.size() == 1) {
      assert(VecBase && "Expected vectorized value.");
      return castToScalarTyElem(VecBase);
    }
    UseVecBaseAsInput = true;
    // Perform multi-register vector shuffle, joining them into a single
    // virtual long vector.
    // ...
    Value *Vec = nullptr;
    SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
    unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
    for (unsigned Part : seq<unsigned>(NumParts)) {
      // ...
      constexpr int MaxBases = 2;
      SmallVector<Value *, MaxBases> Bases(MaxBases);
      auto VLMask = zip(SubVL, SubMask);
      const unsigned VF = std::accumulate(
          VLMask.begin(), VLMask.end(), 0U, [&](unsigned S, const auto &D) {
            if (std::get<1>(D) == PoisonMaskElem)
              return S;
            Value *VecOp =
                cast<ExtractElementInst>(std::get<0>(D))->getVectorOperand();
            if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecOp);
                !TEs.empty())
              VecOp = TEs.front()->VectorizedValue;
            assert(VecOp && "Expected vectorized value.");
            const unsigned Size =
                cast<FixedVectorType>(VecOp->getType())->getNumElements();
            return std::max(S, Size);
          });
      for (const auto [V, I] : VLMask) {
        if (I == PoisonMaskElem)
          continue;
        Value *VecOp = cast<ExtractElementInst>(V)->getVectorOperand();
        if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecOp); !TEs.empty())
          VecOp = TEs.front()->VectorizedValue;
        assert(VecOp && "Expected vectorized value.");
        VecOp = castToScalarTyElem(VecOp);
        Bases[I / VF] = VecOp;
      }
      if (!Bases.front())
        continue;
      Value *SubVec;
      if (Bases.back()) {
        SubVec = createShuffle(Bases.front(), Bases.back(), SubMask);
        TransformToIdentity(SubMask);
      } else {
        SubVec = Bases.front();
      }
      if (!Vec) {
        Vec = SubVec;
        assert((Part == 0 ||
                all_of(seq<unsigned>(0, Part),
                       [&](unsigned P) {
                         ArrayRef<int> SubMask = Mask.slice(
                             P * SliceSize,
                             getNumElems(Mask.size(), SliceSize, P));
                         return all_of(SubMask, [](int Idx) {
                           return Idx == PoisonMaskElem;
                         });
                       })) &&
               "Expected first part or all previous parts masked.");
        copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
      } else {
        unsigned NewVF =
            cast<FixedVectorType>(Vec->getType())->getNumElements();
        if (Vec->getType() != SubVec->getType()) {
          unsigned SubVecVF =
              cast<FixedVectorType>(SubVec->getType())->getNumElements();
          NewVF = std::max(NewVF, SubVecVF);
        }
        // Adjust SubMask.
        for (int &Idx : SubMask)
          if (Idx != PoisonMaskElem)
            Idx += NewVF;
        copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
        Vec = createShuffle(Vec, SubVec, VecMask);
        TransformToIdentity(VecMask);
      }
    }
    copy(VecMask, Mask.begin());
    return Vec;
  }
  /// Checks if the specified entry \p E needs to be delayed because of its
  /// dependency nodes.
  std::optional<Value *>
  needToDelay(const TreeEntry *E,
              ArrayRef<SmallVector<const TreeEntry *>> Deps) const {
    // No need to delay emission if all deps are ready.
    if (all_of(Deps, [](ArrayRef<const TreeEntry *> TEs) {
          return all_of(
              TEs, [](const TreeEntry *TE) { return TE->VectorizedValue; });
        }))
      return std::nullopt;
    // Postpone gather emission, will be emitted after the end of the process
    // to keep correct order; emit a stub load as the placeholder.
    auto *ResVecTy = getWidenedType(ScalarTy, E->getVectorFactor());
    return Builder.CreateAlignedLoad(
        ResVecTy, /*...*/
        MaybeAlign());
  }
  /// Reset the builder to correctly handle perfect-diamond-matched nodes.
  void resetForSameNode() {
    IsFinalized = false;
    CommonMask.clear();
    // ...
  }
  /// Adds 2 input vectors (in form of tree entries) and the mask for their
  /// shuffling.
  void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
    Value *V1 = getVectorizedValue(E1);
    Value *V2 = getVectorizedValue(E2);
    add(V1, V2, Mask);
  }
  /// Adds single input vector (in form of tree entry) and the mask for its
  /// shuffling.
  void add(const TreeEntry &E1, ArrayRef<int> Mask) {
    Value *V1 = getVectorizedValue(E1);
    add(V1, Mask);
  }
  /// Adds 2 input vectors and the mask for their shuffling.
  void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
    assert(V1 && V2 && !Mask.empty() && "Expected non-empty input vectors.");
    assert(isa<FixedVectorType>(V1->getType()) &&
           isa<FixedVectorType>(V2->getType()) &&
           "castToScalarTyElem expects V1 and V2 to be FixedVectorType");
    V1 = castToScalarTyElem(V1);
    V2 = castToScalarTyElem(V2);
    if (InVectors.empty()) {
      InVectors.push_back(V1);
      InVectors.push_back(V2);
      CommonMask.assign(Mask.begin(), Mask.end());
      return;
    }
    Value *Vec = InVectors.front();
    if (InVectors.size() == 2) {
      Vec = createShuffle(Vec, InVectors.back(), CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    } else if (cast<FixedVectorType>(Vec->getType())->getNumElements() !=
               Mask.size()) {
      Vec = createShuffle(Vec, nullptr, CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    }
    V1 = createShuffle(V1, V2, Mask);
    unsigned VF = std::max(getVF(V1), getVF(Vec));
    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
      if (Mask[Idx] != PoisonMaskElem)
        CommonMask[Idx] = Idx + VF;
    InVectors.front() = Vec;
    if (InVectors.size() == 2)
      InVectors.back() = V1;
    else
      InVectors.push_back(V1);
  }
  /// Adds another one input vector and the mask for the shuffling.
  void add(Value *V1, ArrayRef<int> Mask, bool /*ForExtracts*/ = false) {
    assert(isa<FixedVectorType>(V1->getType()) &&
           "castToScalarTyElem expects V1 to be FixedVectorType");
    V1 = castToScalarTyElem(V1);
    if (InVectors.empty()) {
      InVectors.push_back(V1);
      CommonMask.assign(Mask.begin(), Mask.end());
      return;
    }
    const auto *It = find(InVectors, V1);
    if (It == InVectors.end()) {
      if (InVectors.size() == 2 ||
          InVectors.front()->getType() != V1->getType()) {
        Value *V = InVectors.front();
        if (InVectors.size() == 2) {
          V = createShuffle(InVectors.front(), InVectors.back(), CommonMask);
          transformMaskAfterShuffle(CommonMask, CommonMask);
        } else if (cast<FixedVectorType>(V->getType())->getNumElements() !=
                   CommonMask.size()) {
          V = createShuffle(InVectors.front(), nullptr, CommonMask);
          transformMaskAfterShuffle(CommonMask, CommonMask);
        }
        unsigned VF = std::max(CommonMask.size(), Mask.size());
        for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
          if (CommonMask[Idx] == PoisonMaskElem && Mask[Idx] != PoisonMaskElem)
            CommonMask[Idx] = V->getType() != V1->getType()
                                  ? Idx + VF
                                  : Mask[Idx] + getVF(V1);
        if (V->getType() != V1->getType())
          V1 = createShuffle(V1, nullptr, Mask);
        InVectors.front() = V;
        if (InVectors.size() == 2)
          InVectors.back() = V1;
        else
          InVectors.push_back(V1);
        return;
      }
      // Check if the second vector is required, if the used elements are
      // already used from the first one.
      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
        if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem) {
          InVectors.push_back(V1);
          break;
        }
    }
    unsigned VF = 0;
    for (Value *V : InVectors)
      VF = std::max(VF, getVF(V));
    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
      if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
        CommonMask[Idx] = Mask[Idx] + (It == InVectors.begin() ? 0 : VF);
  }
  // ...
  Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
                Value *Root = nullptr) {
    return R.gather(VL, Root, ScalarTy,
                    [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
                      return createShuffle(V1, V2, Mask);
                    });
  }
  /// Finalize emission of the shuffles.
  Value *
  finalize(ArrayRef<int> ExtMask,
           ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
           ArrayRef<int> SubVectorsMask, unsigned VF = 0,
           function_ref<void(Value *&, SmallVectorImpl<int> &,
                             function_ref<Value *(Value *, Value *,
                                                  ArrayRef<int>)>)>
               Action = {}) {
    IsFinalized = true;
    if (Action) {
      Value *Vec = InVectors.front();
      if (InVectors.size() == 2) {
        Vec = createShuffle(Vec, InVectors.back(), CommonMask);
        InVectors.pop_back();
      } else {
        Vec = createShuffle(Vec, nullptr, CommonMask);
      }
      transformMaskAfterShuffle(CommonMask, CommonMask);
      assert(VF > 0 &&
             "Expected vector length for the final value before action.");
      unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
      if (VecVF < VF) {
        SmallVector<int> ResizeMask(VF, PoisonMaskElem);
        std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
        Vec = createShuffle(Vec, nullptr, ResizeMask);
      }
      Action(Vec, CommonMask, [this](Value *V1, Value *V2, ArrayRef<int> Mask) {
        return createShuffle(V1, V2, Mask);
      });
      InVectors.front() = Vec;
    }
    if (!SubVectors.empty()) {
      Value *Vec = InVectors.front();
      if (InVectors.size() == 2) {
        Vec = createShuffle(Vec, InVectors.back(), CommonMask);
        InVectors.pop_back();
      } else {
        Vec = createShuffle(Vec, nullptr, CommonMask);
      }
      transformMaskAfterShuffle(CommonMask, CommonMask);
      auto CreateSubVectors = [&](Value *Vec,
                                  SmallVectorImpl<int> &CommonMask) {
        for (auto [E, Idx] : SubVectors) {
          Value *V = getVectorizedValue(*E);
          // ...
          Type *OrigScalarTy = ScalarTy;
          // ...
          Vec = createInsertVector(
              Builder, Vec, V, InsertionIndex,
              std::bind(&ShuffleInstructionBuilder::createShuffle, this, _1,
                        _2, _3));
          ScalarTy = OrigScalarTy;
          if (!CommonMask.empty()) {
            std::iota(std::next(CommonMask.begin(), Idx),
                      std::next(CommonMask.begin(),
                                Idx + E->getVectorFactor()),
                      Idx);
          }
        }
        return Vec;
      };
      if (SubVectorsMask.empty()) {
        Vec = CreateSubVectors(Vec, CommonMask);
      } else {
        SmallVector<int> SVMask(CommonMask.size(), PoisonMaskElem);
        copy(SubVectorsMask, SVMask.begin());
        for (auto [I1, I2] : zip(SVMask, CommonMask)) {
          if (I2 != PoisonMaskElem)
            I1 = I2 + CommonMask.size();
        }
        // ...
        Vec = createShuffle(InsertVec, Vec, SVMask);
        transformMaskAfterShuffle(CommonMask, SVMask);
      }
      InVectors.front() = Vec;
    }
    if (!ExtMask.empty()) {
      if (CommonMask.empty()) {
        CommonMask.assign(ExtMask.begin(), ExtMask.end());
      } else {
        SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
        for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
          if (ExtMask[I] == PoisonMaskElem)
            continue;
          NewMask[I] = CommonMask[ExtMask[I]];
        }
        CommonMask.swap(NewMask);
      }
    }
    if (CommonMask.empty()) {
      assert(InVectors.size() == 1 && "Expected only one vector with no mask");
      return InVectors.front();
    }
    if (InVectors.size() == 2)
      return createShuffle(InVectors.front(), InVectors.back(), CommonMask);
    return createShuffle(InVectors.front(), nullptr, CommonMask);
  }

  ~ShuffleInstructionBuilder() {
    assert((IsFinalized || CommonMask.empty()) &&
           "Shuffle construction must be finalized.");
  }
};
Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx) {
  return vectorizeTree(getOperandEntry(E, NodeIdx));
}

template <typename BVTy, typename ResTy, typename... Args>
ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
                                  Args &...Params) {
  assert(E->isGather() && "Expected gather node.");
  unsigned VF = E->getVectorFactor();
  bool NeedFreeze = false;
  SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end());
  // Clear values, to be replaced by insertvector instructions.
  for (auto [EIdx, Idx] : E->CombinedEntriesWithIndices)
    for (Value *&V : MutableArrayRef(GatheredScalars)
                         .slice(Idx, VectorizableTree[EIdx]->getVectorFactor()))
      V = PoisonValue::get(V->getType());
  SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
      E->CombinedEntriesWithIndices.size());
  transform(E->CombinedEntriesWithIndices, SubVectors.begin(),
            [&](const auto &P) {
              return std::make_pair(VectorizableTree[P.first].get(), P.second);
            });
  // Build a mask out of the reorder indices and reorder scalars per this mask.
  SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
                               E->ReorderIndices.end());
  if (!ReorderMask.empty())
    reorderScalars(GatheredScalars, ReorderMask);
  SmallVector<int> SubVectorsMask;
  // ...
  if (!SubVectors.empty() && !SubVectorsMask.empty()) {
    for (unsigned I : seq<unsigned>(GatheredScalars.size())) {
      if (E->Scalars[I] == GatheredScalars[ReorderMask[I]])
        continue;
      // ...
    }
    // ...
    SubVectorsMask.clear();
  }

  auto FindReusedSplat = [&](MutableArrayRef<int> Mask, unsigned InputVF,
                             unsigned I, unsigned SliceSize,
                             bool IsNotPoisonous) {
    auto IsUndefOrPoison = [](Value *V) {
      return isa<UndefValue>(V) && !isa<PoisonValue>(V);
    };
    // ...
    TreeEntry *UserTE = E->UserTreeIndex.UserTE;
    unsigned EdgeIdx = E->UserTreeIndex.EdgeIdx;
    if (UserTE->getNumOperands() != 2)
      return false;
    if (!IsNotPoisonous) {
      auto *It =
          find_if(ArrayRef(VectorizableTree).drop_front(UserTE->Idx + 1),
                  [=](const std::unique_ptr<TreeEntry> &TE) {
                    return TE->UserTreeIndex.UserTE == UserTE &&
                           TE->UserTreeIndex.EdgeIdx != EdgeIdx;
                  });
      if (It == VectorizableTree.end())
        return false;
      if (!(*It)->ReorderIndices.empty()) {
        // ...
      }
      if (!all_of(zip(GatheredScalars, GS), [&](const auto &P) {
            Value *V0 = std::get<0>(P);
            Value *V1 = std::get<1>(P);
            return /*...*/;
          }))
        return false;
    }
    // Mark the splat lanes either across the whole mask or inside one slice.
    if ((Mask.size() < InputVF &&
         /*...*/) ||
        (Mask.size() == InputVF &&
         /*...*/)) {
      std::for_each(std::next(Mask.begin(), I * SliceSize),
                    std::next(Mask.begin(),
                              I * SliceSize + /*...*/ SliceSize),
                    /*...*/);
    } else {
      // ...
      std::for_each(std::next(Mask.begin(), I * SliceSize),
                    std::next(Mask.begin(),
                              I * SliceSize + /*...*/ SliceSize),
                    /*...*/);
    }
    return true;
  };
  BVTy ShuffleBuilder(ScalarTy, Params...);
  ResTy Res = ResTy();
  SmallVector<int> Mask;
  SmallVector<int> ExtractMask(VF, PoisonMaskElem);
  SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles;
  Value *ExtractVecBase = nullptr;
  bool UseVecBaseAsInput = false;
  SmallVector<std::optional<TTI::ShuffleKind>> GatherShuffles;
  SmallVector<SmallVector<const TreeEntry *>> Entries;
  Type *OrigScalarTy = GatheredScalars.front()->getType();
  // ...
  bool Resized = false;
  // Check for gathered extracts.
  ExtractShuffles =
      tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
  if (!ExtractShuffles.empty()) {
    SmallVector<const TreeEntry *> ExtractEntries;
    for (auto [Idx, I] : enumerate(ExtractMask)) {
      if (I == PoisonMaskElem)
        continue;
      if (auto *TEs = /*...*/ nullptr; /*...*/ false)
        ExtractEntries.append(TEs.begin(), TEs.end());
    }
    if (std::optional<ResTy> Delayed =
            ShuffleBuilder.needToDelay(E, ExtractEntries)) {
      // Delay emission of gathers which are not ready yet.
      PostponedGathers.insert(E);
      // Postpone gather emission, will be emitted after the end of the
      // process to keep correct order.
      return *Delayed;
    }
    if (Value *VecBase = ShuffleBuilder.adjustExtracts(
            E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) {
      ExtractVecBase = VecBase;
      if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
        if (VF == VecBaseTy->getNumElements() &&
            GatheredScalars.size() != VF) {
          Resized = true;
          GatheredScalars.append(VF - GatheredScalars.size(),
                                 PoisonValue::get(OrigScalarTy));
        }
    }
  }
  // Gather extracts after we check for full matched gathers only.
  if (!ExtractShuffles.empty() || !E->hasState() ||
      E->getOpcode() != Instruction::Load ||
      (((E->hasState() && E->getOpcode() == Instruction::Load) ||
        any_of(E->Scalars, IsaPred<LoadInst>)) &&
       any_of(E->Scalars,
              [this](Value *V) {
                return isa<LoadInst>(V) && isVectorized(V);
              })) ||
      (E->hasState() && E->isAltShuffle()) ||
      all_of(E->Scalars, [this](Value *V) { return isVectorized(V); }) ||
      isSplat(E->Scalars) ||
      (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) {
    GatherShuffles =
        isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts);
  }
  if (!GatherShuffles.empty()) {
    if (std::optional<ResTy> Delayed =
            ShuffleBuilder.needToDelay(E, Entries)) {
      // Delay emission of gathers which are not ready yet.
      PostponedGathers.insert(E);
      return *Delayed;
    }
    if (GatherShuffles.size() == 1 &&
        *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
        Entries.front().front()->isSame(E->Scalars)) {
      // Perfectly matched vector entry, emit [poison] placeholder + shuffle.
      LLVM_DEBUG(dbgs() << "SLP: perfect diamond match for gather bundle "
                        << /*...*/ ".\n");
      // Restore the mask for previous partially matched values.
      Mask.resize(E->Scalars.size());
      const TreeEntry *FrontTE = Entries.front().front();
      if (FrontTE->ReorderIndices.empty() &&
          ((FrontTE->ReuseShuffleIndices.empty() &&
            E->Scalars.size() == FrontTE->Scalars.size()) ||
           (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) {
        std::iota(Mask.begin(), Mask.end(), 0);
      } else {
        for (auto [I, V] : enumerate(E->Scalars)) {
          if (isa<PoisonValue>(V)) {
            Mask[I] = PoisonMaskElem;
            continue;
          }
          Mask[I] = FrontTE->findLaneForValue(V);
        }
      }
      // Reset the builder(s) to correctly handle perfect diamond matched
      // nodes.
      ShuffleBuilder.resetForSameNode();
      ShuffleBuilder.add(*FrontTE, Mask);
      // Full matched entry found, no need to insert subvectors.
      Res = ShuffleBuilder.finalize(E->getCommonMask(), {}, {});
      return Res;
    }
    if (!Resized) {
      if (GatheredScalars.size() != VF &&
          any_of(Entries, [&](ArrayRef<const TreeEntry *> TEs) {
            return any_of(TEs, [&](const TreeEntry *TE) {
              return TE->getVectorFactor() == VF;
            });
          }))
        GatheredScalars.append(VF - GatheredScalars.size(),
                               PoisonValue::get(OrigScalarTy));
    }
    // Remove shuffled elements from list of gathers.
    for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
      if (Mask[I] != PoisonMaskElem)
        GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
    }
  }

  auto TryPackScalars = [&](SmallVectorImpl<Value *> &Scalars,
                            SmallVectorImpl<int> &ReuseMask,
                            bool IsRootPoison) {
    // For splats we can emit broadcasts instead of gathers, so try to find
    // such scalars.
    bool IsSplat = IsRootPoison && isSplat(Scalars) &&
                   /*...*/ true;
    Scalars.append(VF - Scalars.size(), PoisonValue::get(OrigScalarTy));
    SmallVector<int> UndefPos;
    DenseMap<Value *, unsigned> UniquePositions;
    // Gather unique non-const values and all constant values.
    // For repeated values, just shuffle them.
    int NumNonConsts = 0;
    int SinglePos = 0;
    for (auto [I, V] : enumerate(Scalars)) {
      // ...
      if (IsSplat) {
        Scalars.front() = OrigV;
        ReuseMask[I] = 0;
      } else {
        const auto Res = UniquePositions.try_emplace(OrigV, I);
        Scalars[Res.first->second] = OrigV;
        ReuseMask[I] = Res.first->second;
      }
    }
    if (NumNonConsts == 1) {
      // Restore single insert element.
      if (IsSplat) {
        // ...
        if (!UndefPos.empty() && UndefPos.front() == 0)
          Scalars.front() = UndefValue::get(OrigScalarTy);
      }
      ReuseMask[SinglePos] = SinglePos;
    } else if (!UndefPos.empty() && IsSplat) {
      // For undef values, try to replace them with the simple broadcast.
      // We can do it if the broadcasted value is guaranteed to be
      // non-poisonous, or by freezing the incoming scalar value first.
      auto *It = find_if(Scalars, [this, E](Value *V) {
        return !isa<UndefValue>(V) &&
               (isVectorized(V) || isGuaranteedNotToBePoison(V, AC) ||
                (E->UserTreeIndex && any_of(V->uses(), [E](const Use &U) {
                   // Check if the value already used in the same operation in
                   // one of the nodes already.
                   return E->UserTreeIndex.EdgeIdx != U.getOperandNo() &&
                          is_contained(E->UserTreeIndex.UserTE->Scalars,
                                       U.getUser());
                 })));
      });
      if (It != Scalars.end()) {
        // Replace undefs by the non-poisoned scalars and emit broadcast.
        int Pos = std::distance(Scalars.begin(), It);
        for (int I : UndefPos) {
          // Set the undef position to the non-poisoned scalar.
          ReuseMask[I] = Pos;
          // ...
        }
      } else {
        // Replace undefs by the poisons, emit broadcast and then emit freeze.
        for (int I : UndefPos) {
          ReuseMask[I] = PoisonMaskElem;
          // ...
        }
        NeedFreeze = true;
      }
    }
  };
  if (!ExtractShuffles.empty() || !GatherShuffles.empty()) {
    bool IsNonPoisoned = true;
    bool IsUsedInExpr = true;
    Value *Vec1 = nullptr;
    if (!ExtractShuffles.empty()) {
      // Gather of extractelements can be represented as just a shuffle of a
      // single/two vectors the scalars are extracted from.
      Value *Vec2 = nullptr;
      for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
        // ...
      }
      if (UseVecBaseAsInput) {
        Vec1 = ExtractVecBase;
      } else {
        for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
          if (ExtractMask[I] == PoisonMaskElem)
            continue;
          auto *EI = cast<ExtractElementInst>(/*...*/);
          Value *VecOp = EI->getVectorOperand();
          if (ArrayRef<TreeEntry *> TEs = getTreeEntries(VecOp);
              !TEs.empty() && TEs.front()->VectorizedValue)
            VecOp = TEs.front()->VectorizedValue;
          if (!Vec1) {
            Vec1 = VecOp;
          } else if (Vec1 != VecOp) {
            assert((!Vec2 || Vec2 == VecOp) &&
                   "Expected only 1 or 2 vectors shuffle.");
            Vec2 = VecOp;
          }
        }
      }
      if (Vec2) {
        IsUsedInExpr = false;
        // ...
        ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
      } else if (Vec1) {
        bool IsNotPoisonedVec = isGuaranteedNotToBePoison(Vec1, AC);
        IsUsedInExpr &= FindReusedSplat(
            ExtractMask,
            cast<FixedVectorType>(Vec1->getType())->getNumElements(), 0,
            ExtractMask.size(), IsNotPoisonedVec);
        ShuffleBuilder.add(Vec1, ExtractMask, /*ForExtracts=*/true);
        IsNonPoisoned &= IsNotPoisonedVec;
      } else {
        IsUsedInExpr = false;
        // ...
      }
    }
    if (!GatherShuffles.empty()) {
      unsigned SliceSize =
          getPartNumElems(E->Scalars.size(), NumParts);
      SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
      for (const auto [I, TEs] : enumerate(Entries)) {
        if (TEs.empty()) {
          assert(!GatherShuffles[I] &&
                 "No shuffles with empty entries list expected.");
          continue;
        }
        assert((TEs.size() == 1 || TEs.size() == 2) &&
               "Expected shuffle of 1 or 2 entries.");
        unsigned Limit = getNumElems(Mask.size(), SliceSize, I);
        auto SubMask = ArrayRef(Mask).slice(I * SliceSize, Limit);
        VecMask.assign(VecMask.size(), PoisonMaskElem);
        copy(SubMask, std::next(VecMask.begin(), I * SliceSize));
        if (TEs.size() == 1) {
          bool IsNotPoisonedVec =
              TEs.front()->VectorizedValue
                  ? isGuaranteedNotToBePoison(TEs.front()->VectorizedValue, AC)
                  : true;
          IsUsedInExpr &=
              FindReusedSplat(VecMask, TEs.front()->getVectorFactor(), I,
                              SliceSize, IsNotPoisonedVec);
          ShuffleBuilder.add(*TEs.front(), VecMask);
          IsNonPoisoned &= IsNotPoisonedVec;
        } else {
          IsUsedInExpr = false;
          ShuffleBuilder.add(*TEs.front(), *TEs.back(), VecMask);
          if (TEs.front()->VectorizedValue && TEs.back()->VectorizedValue)
            IsNonPoisoned &= /*...*/;
        }
      }
    }
    // Try to figure out the best way to combine values: build a shuffle and
    // insert elements, or just build several shuffles.
    SmallVector<Value *> NonConstants(GatheredScalars);
    int EMSz = ExtractMask.size();
    int MSz = Mask.size();
    // Try to build constant vector and shuffle with it only if currently we
    // have a single permutation and more than 1 scalar constants.
    bool IsSingleShuffle = ExtractShuffles.empty() || GatherShuffles.empty();
    bool IsIdentityShuffle =
        ((UseVecBaseAsInput ||
          all_of(ExtractShuffles,
                 [](const std::optional<TTI::ShuffleKind> &SK) {
                   return SK.value_or(TTI::SK_PermuteTwoSrc) ==
                          TTI::SK_PermuteSingleSrc;
                 })) &&
         none_of(ExtractMask, [&](int I) { return I >= EMSz; }) &&
         ShuffleVectorInst::isIdentityMask(ExtractMask, EMSz)) ||
        (!GatherShuffles.empty() &&
         all_of(GatherShuffles,
                [](const std::optional<TTI::ShuffleKind> &SK) {
                  return SK.value_or(TTI::SK_PermuteTwoSrc) ==
                         TTI::SK_PermuteSingleSrc;
                }) &&
         none_of(Mask, [&](int I) { return I >= MSz; }) &&
         ShuffleVectorInst::isIdentityMask(Mask, MSz));
    bool EnoughConstsForShuffle =
        IsSingleShuffle &&
        /*...*/
        (!IsIdentityShuffle ||
         (GatheredScalars.size() == 2 &&
          /*...*/) ||
         /*...*/);
    // NonConstants array contains just non-constant values, GatheredScalars
    // contains only constant to build final vector and then shuffle.
    for (int I = 0, Sz = GatheredScalars.size(); I < Sz; ++I) {
      if (EnoughConstsForShuffle && isa<Constant>(GatheredScalars[I]))
        NonConstants[I] = PoisonValue::get(OrigScalarTy);
      else
        GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
    }
    // Generate constants for final shuffle and build a mask for them.
    if (!all_of(GatheredScalars, IsaPred<PoisonValue>)) {
      SmallVector<int> BVMask(GatheredScalars.size(), PoisonMaskElem);
      TryPackScalars(GatheredScalars, BVMask, /*IsRootPoison=*/true);
      Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.size());
      ShuffleBuilder.add(BV, BVMask);
    }
    if (all_of(NonConstants, [=](Value *V) {
          return isa<PoisonValue>(V) ||
                 (IsSingleShuffle && ((IsIdentityShuffle && IsNonPoisoned) ||
                                      IsUsedInExpr) &&
                  isa<UndefValue>(V));
        }))
      Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
                                    SubVectorsMask);
    else
      Res = ShuffleBuilder.finalize(
          E->ReuseShuffleIndices, SubVectors, SubVectorsMask, E->Scalars.size(),
          [&](Value *&Vec, SmallVectorImpl<int> &Mask, auto CreateShuffle) {
            bool IsSplat = isSplat(NonConstants);
            SmallVector<int> BVMask(Mask.size(), PoisonMaskElem);
            TryPackScalars(NonConstants, BVMask, /*IsRootPoison=*/false);
            auto CheckIfSplatIsProfitable = [&]() {
              // Estimate the cost of splatting + shuffle and compare with
              // insert + shuffle.
              constexpr TTI::TargetCostKind CostKind =
                  TTI::TCK_RecipThroughput;
              Value *V = *find_if_not(NonConstants, IsaPred<UndefValue>);
              if (isa<ExtractElementInst>(V) || isVectorized(V))
                return false;
              InstructionCost SplatCost = TTI->getVectorInstrCost(
                  Instruction::InsertElement, VecTy, CostKind, /*Index=*/0,
                  PoisonValue::get(VecTy), V);
              SmallVector<int> NewMask(Mask.begin(), Mask.end());
              for (auto [Idx, I] : enumerate(BVMask))
                if (I != PoisonMaskElem)
                  NewMask[Idx] = Mask.size();
              SplatCost += ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy,
                                            NewMask, CostKind);
              InstructionCost BVCost = TTI->getVectorInstrCost(
                  Instruction::InsertElement, VecTy, CostKind,
                  *find_if(Mask, [](int I) { return I != PoisonMaskElem; }),
                  /*...*/);
              // Shuffle required?
              if (count(BVMask, PoisonMaskElem) <
                  static_cast<int>(BVMask.size() - 1)) {
                SmallVector<int> NewMask(Mask.begin(), Mask.end());
                for (auto [Idx, I] : enumerate(BVMask))
                  if (I != PoisonMaskElem)
                    NewMask[Idx] = I;
                BVCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
                                           VecTy, NewMask, CostKind);
              }
              return SplatCost <= BVCost;
            };
            if (!IsSplat || Mask.size() <= 2 || !CheckIfSplatIsProfitable()) {
              // ...
              Vec = ShuffleBuilder.gather(NonConstants, Mask.size(), Vec);
            } else {
              // ...
              Value *BV = ShuffleBuilder.gather(Values, BVMask.size());
              SmallVector<int> SplatMask(BVMask.size(), PoisonMaskElem);
              transform(BVMask, SplatMask.begin(), [](int I) {
                return I == PoisonMaskElem ? PoisonMaskElem : 0;
              });
              // ...
              BV = CreateShuffle(BV, nullptr, SplatMask);
              for (auto [Idx, I] : enumerate(Mask))
                if (BVMask[Idx] != PoisonMaskElem)
                  Mask[Idx] = BVMask.size() + Idx;
              Vec = CreateShuffle(Vec, BV, Mask);
            }
          });
  } else if (!allConstant(GatheredScalars)) {
    // Gather unique scalars and all constants.
    SmallVector<int> ReuseMask(GatheredScalars.size(), PoisonMaskElem);
    TryPackScalars(GatheredScalars, ReuseMask, /*IsRootPoison=*/true);
    Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.size());
    ShuffleBuilder.add(BV, ReuseMask);
    Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
                                  SubVectorsMask);
  } else {
    // Gather all constants.
    SmallVector<int> Mask(GatheredScalars.size(), PoisonMaskElem);
    for (auto [I, V] : enumerate(GatheredScalars)) {
      if (!isa<PoisonValue>(V))
        Mask[I] = I;
    }
    Value *BV = ShuffleBuilder.gather(GatheredScalars);
    ShuffleBuilder.add(BV, Mask);
    Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
                                  SubVectorsMask);
  }

  if (NeedFreeze)
    Res = ShuffleBuilder.createFreeze(Res);
  return Res;
}
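// Illustrative sketch (not part of the pass): TryPackScalars above dedupes the
// gathered scalars and records, per lane, which packed element feeds it. A
// standalone version of that packing, with plain ints standing in for Values:
#if 0
#include <map>
#include <vector>
constexpr int PoisonElem = -1;
// Packs Scalars in place (unique values keep their first position) and fills
// ReuseMask so that lane I reads packed element ReuseMask[I].
inline void packScalars(std::vector<int> &Scalars,
                        std::vector<int> &ReuseMask) {
  ReuseMask.assign(Scalars.size(), PoisonElem);
  std::map<int, int> UniquePositions;
  for (int I = 0, E = (int)Scalars.size(); I != E; ++I) {
    auto Res = UniquePositions.try_emplace(Scalars[I], I);
    Scalars[Res.first->second] = Scalars[I];
    ReuseMask[I] = Res.first->second;
  }
}
#endif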
Value *BoUpSLP::createBuildVector(const TreeEntry *E, Type *ScalarTy) {
  for (auto [EIdx, _] : E->CombinedEntriesWithIndices)
    (void)vectorizeTree(VectorizableTree[EIdx].get());
  return processBuildVector<ShuffleInstructionBuilder, Value *>(E, ScalarTy,
                                                                Builder, *this);
}

// ...
//   for (Value *V : VL)
// ...

Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
  IRBuilderBase::InsertPointGuard Guard(Builder);

  Value *V = E->Scalars.front();
  Type *ScalarTy = V->getType();
  // ...
  auto It = MinBWs.find(E);
  if (It != MinBWs.end()) {
    // ...
  }
  auto *VecTy = getWidenedType(ScalarTy, E->Scalars.size());
  if (E->VectorizedValue)
    return E->VectorizedValue;

  if (E->isGather()) {
    // ...
    if (E->hasState() && E->Idx == 0 && !UserIgnoreList)
      setInsertPointAfterBundle(E);
    Value *Vec = createBuildVector(E, ScalarTy);
    E->VectorizedValue = Vec;
    return Vec;
  }
  if (E->State == TreeEntry::SplitVectorize) {
    assert(E->CombinedEntriesWithIndices.size() == 2 &&
           "Expected exactly 2 combined entries.");
    setInsertPointAfterBundle(E);
    TreeEntry &OpTE1 =
        *VectorizableTree[E->CombinedEntriesWithIndices.front().first];
    assert(OpTE1.isSame(
               ArrayRef(E->Scalars).take_front(OpTE1.getVectorFactor())) &&
           "Expected same first part of scalars.");
    Value *Op1 = vectorizeTree(&OpTE1);
    TreeEntry &OpTE2 =
        *VectorizableTree[E->CombinedEntriesWithIndices.back().first];
    assert(
        OpTE2.isSame(ArrayRef(E->Scalars).take_back(OpTE2.getVectorFactor())) &&
        "Expected same second part of scalars.");
    Value *Op2 = vectorizeTree(&OpTE2);
    auto GetOperandSignedness = [&](const TreeEntry *OpE) {
      bool IsSigned = false;
      auto It = MinBWs.find(OpE);
      if (It != MinBWs.end())
        IsSigned = It->second.second;
      else
        IsSigned = any_of(OpE->Scalars, [&](Value *V) {
          if (isa<PoisonValue>(V))
            return false;
          return !isKnownNonNegative(V, SimplifyQuery(*DL));
        });
      return IsSigned;
    };
    if (cast<VectorType>(Op1->getType())->getElementType() !=
        VecTy->getElementType())
      Op1 = Builder.CreateIntCast(
          Op1,
          getWidenedType(VecTy->getElementType(), OpTE1.getVectorFactor()),
          GetOperandSignedness(&OpTE1));
    if (cast<VectorType>(Op2->getType())->getElementType() !=
        VecTy->getElementType())
      Op2 = Builder.CreateIntCast(
          Op2,
          getWidenedType(VecTy->getElementType(), OpTE2.getVectorFactor()),
          GetOperandSignedness(&OpTE2));
    if (E->ReorderIndices.empty()) {
      SmallVector<int> Mask(E->getVectorFactor(), PoisonMaskElem);
      std::iota(
          Mask.begin(),
          std::next(Mask.begin(), E->CombinedEntriesWithIndices.back().second),
          0);
      unsigned ScalarTyNumElements = getNumElements(ScalarTy);
      if (ScalarTyNumElements != 1) {
        // ...
      }
      Value *Vec = Builder.CreateShuffleVector(Op1, Mask);
      Vec = createInsertVector(Builder, Vec, Op2,
                               E->CombinedEntriesWithIndices.back().second *
                                   ScalarTyNumElements);
      E->VectorizedValue = Vec;
      return Vec;
    }
    unsigned CommonVF =
        std::max(OpTE1.getVectorFactor(), OpTE2.getVectorFactor());
    if (OpTE1.getVectorFactor() != CommonVF) {
      SmallVector<int> Mask(CommonVF, PoisonMaskElem);
      std::iota(Mask.begin(), std::next(Mask.begin(), OpTE1.getVectorFactor()),
                0);
      Op1 = Builder.CreateShuffleVector(Op1, Mask);
    }
    if (OpTE2.getVectorFactor() != CommonVF) {
      SmallVector<int> Mask(CommonVF, PoisonMaskElem);
      std::iota(Mask.begin(), std::next(Mask.begin(), OpTE2.getVectorFactor()),
                0);
      Op2 = Builder.CreateShuffleVector(Op2, Mask);
    }
    Value *Vec = Builder.CreateShuffleVector(Op1, Op2, E->getSplitMask());
    E->VectorizedValue = Vec;
    return Vec;
  }

  bool IsReverseOrder =
      !E->ReorderIndices.empty() && isReverseOrder(E->ReorderIndices);
  auto FinalShuffle = [&](Value *V, const TreeEntry *E) {
    ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, *this);
    if (E->getOpcode() == Instruction::Store &&
        E->State == TreeEntry::Vectorize) {
      ArrayRef<int> Mask =
          ArrayRef(reinterpret_cast<const int *>(E->ReorderIndices.begin()),
                   E->ReorderIndices.size());
      ShuffleBuilder.add(V, Mask);
    } else if ((E->State == TreeEntry::StridedVectorize && IsReverseOrder) ||
               E->State == TreeEntry::CompressVectorize) {
      ShuffleBuilder.addOrdered(V, {});
    } else {
      ShuffleBuilder.addOrdered(V, E->ReorderIndices);
    }
    SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
        E->CombinedEntriesWithIndices.size());
    transform(
        E->CombinedEntriesWithIndices, SubVectors.begin(), [&](const auto &P) {
          return std::make_pair(VectorizableTree[P.first].get(), P.second);
        });
    assert(
        (E->CombinedEntriesWithIndices.empty() || E->ReorderIndices.empty()) &&
        "Expected either combined subnodes or reordering");
    return ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors, {});
  };

  assert(!E->isGather() && "Unhandled state");
  unsigned ShuffleOrOp =
      E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
  Instruction *VL0 = E->getMainOp();
  auto GetOperandSignedness = [&](unsigned Idx) {
    const TreeEntry *OpE = getOperandEntry(E, Idx);
    bool IsSigned = false;
    auto It = MinBWs.find(OpE);
    if (It != MinBWs.end())
      IsSigned = It->second.second;
    else
      IsSigned = any_of(OpE->Scalars, [&](Value *V) {
        if (isa<PoisonValue>(V))
          return false;
        return !isKnownNonNegative(V, SimplifyQuery(*DL));
      });
    return IsSigned;
  };
  switch (ShuffleOrOp) {
  case Instruction::PHI: {
    assert((E->ReorderIndices.empty() || !E->ReuseShuffleIndices.empty() ||
            E != VectorizableTree.front().get() || E->UserTreeIndex) &&
           "PHI reordering is free.");
    auto *PH = cast<PHINode>(VL0);
    Builder.SetInsertPoint(PH->getParent(),
                           PH->getParent()->getFirstNonPHIIt());
    // ...
    PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
    // ...
    // Adjust insertion point once all PHIs have been generated.
    Builder.SetInsertPoint(PH->getParent(),
                           PH->getParent()->getFirstInsertionPt());
    // ...
    Value *V = NewPhi;
    V = FinalShuffle(V, E);
    E->VectorizedValue = V;
    // PHINodes may have multiple entries from the same block. We want to
    // visit every block once.
    SmallPtrSet<BasicBlock *, 4> VisitedBBs;
    for (unsigned I : seq<unsigned>(PH->getNumIncomingValues())) {
      // ...
      BasicBlock *IBB = PH->getIncomingBlock(I);
      // Stop emission if all incoming values are generated.
      if (NewPhi->getNumIncomingValues() == PH->getNumIncomingValues()) {
        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
        return V;
      }
      if (!VisitedBBs.insert(IBB).second) {
        Value *VecOp = NewPhi->getIncomingValueForBlock(IBB);
        NewPhi->addIncoming(VecOp, IBB);
        TreeEntry *OpTE = getOperandEntry(E, I);
        assert(!OpTE->VectorizedValue && "Expected no vectorized value.");
        OpTE->VectorizedValue = VecOp;
        continue;
      }
      Builder.SetInsertPoint(IBB->getTerminator());
      // ...
      Value *Vec = vectorizeOperand(E, I);
      if (VecTy != Vec->getType()) {
        assert((It != MinBWs.end() || getOperandEntry(E, I)->isGather() ||
                MinBWs.contains(getOperandEntry(E, I))) &&
               "Expected item in MinBWs.");
        Vec = Builder.CreateIntCast(Vec, VecTy, GetOperandSignedness(I));
      }
      NewPhi->addIncoming(Vec, IBB);
    }
    assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
           "Invalid number of incoming values");
    assert(E->VectorizedValue && "Expected vectorized value.");
    return E->VectorizedValue;
  }
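    // Illustrative sketch (not part of the pass): the PHI case above creates
    // the vector phi first and only then vectorizes each incoming value at its
    // predecessor's terminator, so cyclic dependencies resolve naturally. A
    // hedged standalone skeleton (vectorizeIncoming is hypothetical, and
    // duplicate incoming blocks would need the dedup handling shown above):
#if 0
#include "llvm/ADT/STLFunctionalExtras.h"
#include "llvm/IR/IRBuilder.h"
using namespace llvm;
static PHINode *vectorizePHI(IRBuilderBase &Builder, PHINode *PH,
                             FixedVectorType *VecTy,
                             function_ref<Value *(unsigned)> vectorizeIncoming) {
  Builder.SetInsertPoint(PH->getParent(), PH->getParent()->getFirstNonPHIIt());
  PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
  for (unsigned I = 0, E = PH->getNumIncomingValues(); I != E; ++I) {
    BasicBlock *IBB = PH->getIncomingBlock(I);
    // Emit each incoming vector right before the predecessor's terminator.
    Builder.SetInsertPoint(IBB->getTerminator());
    NewPhi->addIncoming(vectorizeIncoming(I), IBB);
  }
  return NewPhi;
}
#endif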
  case Instruction::ExtractElement: {
    Value *V = E->getSingleOperand(0);
    setInsertPointAfterBundle(E);
    V = FinalShuffle(V, E);
    E->VectorizedValue = V;
    // ...
    return V;
  }
  case Instruction::ExtractValue: {
    auto *LI = cast<LoadInst>(E->getSingleOperand(0));
    Builder.SetInsertPoint(LI);
    Value *Ptr = LI->getPointerOperand();
    LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign());
    Value *NewV = ::propagateMetadata(V, E->Scalars);
    NewV = FinalShuffle(NewV, E);
    E->VectorizedValue = NewV;
    return NewV;
  }
  case Instruction::InsertElement: {
    assert(E->ReuseShuffleIndices.empty() && "All inserts should be unique");
    if (const TreeEntry *OpE = getOperandEntry(E, 1);
        OpE && !OpE->isGather() && OpE->hasState() &&
        !OpE->hasCopyableElements())
      // ...
    else
      setInsertPointAfterBundle(E);
    Value *V = vectorizeOperand(E, 1);
    ArrayRef<Value *> Op = E->getOperand(1);
    Type *ScalarTy = Op.front()->getType();
    if (cast<VectorType>(V->getType())->getElementType() != ScalarTy) {
      // ...
      std::pair<unsigned, bool> Res = MinBWs.lookup(getOperandEntry(E, 1));
      assert(Res.first > 0 && "Expected item in MinBWs.");
      V = Builder.CreateIntCast(V, /*...*/, Res.second);
    }

    // Find the first insertelement whose source is not within the bundle.
    auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
      return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
    }));
    const unsigned NumElts =
        cast<FixedVectorType>(FirstInsert->getType())->getNumElements();
    const unsigned NumScalars = E->Scalars.size();
    unsigned Offset = *getElementIndex(VL0);
    assert(Offset < NumElts && "Failed to find vector index offset");

    // Create the shuffle to resize the vector.
    SmallVector<int> Mask;
    if (!E->ReorderIndices.empty()) {
      inversePermutation(E->ReorderIndices, Mask);
      // ...
    } else {
      Mask.assign(NumElts, PoisonMaskElem);
      std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0);
    }
    // Create the InsertVector shuffle if necessary.
    bool IsIdentity = true;
    SmallVector<int> PrevMask(NumElts, PoisonMaskElem);
    Mask.swap(PrevMask);
    for (unsigned I = 0; I < NumScalars; ++I) {
      // ...
      IsIdentity &= InsertIdx - Offset == I;
      Mask[InsertIdx - Offset] = I;
    }
    if (!IsIdentity || NumElts != NumScalars) {
      Value *V2 = nullptr;
      bool IsVNonPoisonous =
          /*...*/ isGuaranteedNotToBePoison(V, AC);
      SmallVector<int> InsertMask(Mask);
      if (NumElts != NumScalars && Offset == 0) {
        // ...
        InsertMask[*InsertIdx] = *InsertIdx;
        // ...
        SmallBitVector UseMask =
            buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
        SmallBitVector IsFirstPoison =
            isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
        SmallBitVector IsFirstUndef =
            isUndefVector(FirstInsert->getOperand(0), UseMask);
        if (!IsFirstPoison.all()) {
          // ...
          for (unsigned I = 0; I < NumElts; I++) {
            if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I) &&
                IsFirstUndef.test(I)) {
              if (IsVNonPoisonous) {
                InsertMask[I] = I < NumScalars ? I : 0;
                continue;
              }
              // ...
              if (Idx >= NumScalars)
                Idx = NumScalars - 1;
              InsertMask[I] = NumScalars + Idx;
              // ...
            }
          }
        }
        // ...
        V = Builder.CreateShuffleVector(V, V2, InsertMask);
        if (auto *I = dyn_cast<Instruction>(V)) {
          GatherShuffleExtractSeq.insert(I);
          CSEBlocks.insert(I->getParent());
        }
      }

      SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
      for (unsigned I = 0; I < NumElts; I++) {
        // ...
      }
      SmallBitVector UseMask =
          buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
      SmallBitVector IsFirstUndef =
          isUndefVector(FirstInsert->getOperand(0), UseMask);
      if ((!IsIdentity || Offset != 0 || !IsFirstUndef.all()) &&
          NumElts != NumScalars) {
        if (IsFirstUndef.all()) {
          // ...
          SmallBitVector IsFirstPoison =
              isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
          if (!IsFirstPoison.all()) {
            for (unsigned I = 0; I < NumElts; I++) {
              if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I))
                InsertMask[I] = I + NumElts;
            }
          }
          V = Builder.CreateShuffleVector(
              V, /*...*/ InsertMask);
          if (auto *I = dyn_cast<Instruction>(V)) {
            GatherShuffleExtractSeq.insert(I);
            CSEBlocks.insert(I->getParent());
          }
        } else {
          SmallBitVector IsFirstPoison =
              isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
          for (unsigned I = 0; I < NumElts; I++) {
            // ...
            InsertMask[I] += NumElts;
          }
          V = Builder.CreateShuffleVector(
              FirstInsert->getOperand(0), V, InsertMask,
              /*...*/);
          if (auto *I = dyn_cast<Instruction>(V)) {
            GatherShuffleExtractSeq.insert(I);
            CSEBlocks.insert(I->getParent());
          }
        }
      }
    }
    ++NumVectorInstructions;
    E->VectorizedValue = V;
    return V;
  }
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    setInsertPointAfterBundle(E);
    Value *InVec = vectorizeOperand(E, 0);
    // ...
    auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
    if (/*...*/
        (SrcIt != MinBWs.end() || It != MinBWs.end() ||
         /*...*/)) {
      // Check if the values are candidates to demote.
      unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy);
      if (SrcIt != MinBWs.end())
        SrcBWSz = SrcIt->second.first;
      unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType());
      if (BWSz == SrcBWSz) {
        VecOpcode = Instruction::BitCast;
      } else if (BWSz < SrcBWSz) {
        VecOpcode = Instruction::Trunc;
      } else if (It != MinBWs.end()) {
        assert(BWSz > SrcBWSz && "Invalid cast!");
        VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
      } else if (SrcIt != MinBWs.end()) {
        assert(BWSz > SrcBWSz && "Invalid cast!");
        VecOpcode =
            SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
      }
    } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
               !SrcIt->second.second) {
      VecOpcode = Instruction::UIToFP;
    }
    Value *V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
                   ? InVec
                   : Builder.CreateCast(VecOpcode, InVec, VecTy);
    V = FinalShuffle(V, E);
    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::FCmp:
  case Instruction::ICmp: {
    setInsertPointAfterBundle(E);
    Value *L = vectorizeOperand(E, 0);
    Value *R = vectorizeOperand(E, 1);
    if (L->getType() != R->getType()) {
      assert((/*...*/
              MinBWs.contains(getOperandEntry(E, 0)) ||
              MinBWs.contains(getOperandEntry(E, 1))) &&
             "Expected item in MinBWs.");
      if (cast<VectorType>(L->getType())
              ->getElementType()
              ->getIntegerBitWidth() < cast<VectorType>(R->getType())
                                           ->getElementType()
                                           ->getIntegerBitWidth()) {
        Type *CastTy = R->getType();
        L = Builder.CreateIntCast(L, CastTy, GetOperandSignedness(0));
      } else {
        Type *CastTy = L->getType();
        R = Builder.CreateIntCast(R, CastTy, GetOperandSignedness(1));
      }
    }
    CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
    Value *V = Builder.CreateCmp(P0, L, R);
    propagateIRFlags(V, E->Scalars, VL0);
    if (auto *ICmp = dyn_cast<ICmpInst>(V); ICmp && It == MinBWs.end())
      ICmp->setSameSign(false);
    // ...
    V = FinalShuffle(V, E);
    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::Select: {
    setInsertPointAfterBundle(E);
    Value *Cond = vectorizeOperand(E, 0);
    Value *True = vectorizeOperand(E, 1);
    Value *False = vectorizeOperand(E, 2);
    if (True->getType() != VecTy || False->getType() != VecTy) {
      assert((/*...*/
              MinBWs.contains(getOperandEntry(E, 1)) ||
              MinBWs.contains(getOperandEntry(E, 2))) &&
             "Expected item in MinBWs.");
      if (True->getType() != VecTy)
        True = Builder.CreateIntCast(True, VecTy, GetOperandSignedness(1));
      if (False->getType() != VecTy)
        False = Builder.CreateIntCast(False, VecTy, GetOperandSignedness(2));
    }
    unsigned CondNumElements = getNumElements(Cond->getType());
    unsigned TrueNumElements = getNumElements(True->getType());
    assert(TrueNumElements >= CondNumElements &&
           TrueNumElements % CondNumElements == 0 &&
           "Cannot vectorize Instruction::Select");
    assert(TrueNumElements == getNumElements(False->getType()) &&
           "Cannot vectorize Instruction::Select");
    if (CondNumElements != TrueNumElements) {
      // When the return type is i1 but the source is a fixed vector type, we
      // need to duplicate the condition value.
      Cond = Builder.CreateShuffleVector(
          Cond, createReplicatedMask(TrueNumElements / CondNumElements,
                                     CondNumElements));
    }
    assert(getNumElements(Cond->getType()) == TrueNumElements &&
           "Cannot vectorize Instruction::Select");
    Value *V =
        Builder.CreateSelectWithUnknownProfile(Cond, True, False, DEBUG_TYPE);
    V = FinalShuffle(V, E);
    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::FNeg: {
    setInsertPointAfterBundle(E);
    Value *Op = vectorizeOperand(E, 0);
    Value *V = Builder.CreateUnOp(
        static_cast<Instruction::UnaryOps>(E->getOpcode()), Op);
    propagateIRFlags(V, E->Scalars, VL0);
    if (auto *I = dyn_cast<Instruction>(V))
      V = ::propagateMetadata(I, E->Scalars);
    V = FinalShuffle(V, E);
    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::Freeze: {
    setInsertPointAfterBundle(E);
    Value *Op = vectorizeOperand(E, 0);
    if (Op->getType() != VecTy) {
      assert((/*...*/
              MinBWs.contains(getOperandEntry(E, 0))) &&
             "Expected item in MinBWs.");
      Op = Builder.CreateIntCast(Op, VecTy, GetOperandSignedness(0));
    }
    Value *V = Builder.CreateFreeze(Op);
    V = FinalShuffle(V, E);
    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    setInsertPointAfterBundle(E);
    Value *LHS = vectorizeOperand(E, 0);
    Value *RHS = vectorizeOperand(E, 1);
    if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
      for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
        // An 'and' with a mask of enough trailing ones is a no-op after the
        // bitwidth demotion; reuse the other operand directly.
        if (all_of(E->getOperand(I), [&](Value *Op) {
              auto *CI = dyn_cast<ConstantInt>(Op);
              return CI && CI->getValue().countr_one() >= It->second.first;
            })) {
          V = FinalShuffle(I == 0 ? RHS : LHS, E);
          E->VectorizedValue = V;
          ++NumVectorInstructions;
          return V;
        }
      }
    }
    if (LHS->getType() != VecTy || RHS->getType() != VecTy) {
      assert((/*...*/
              MinBWs.contains(getOperandEntry(E, 0)) ||
              MinBWs.contains(getOperandEntry(E, 1))) &&
             "Expected item in MinBWs.");
      if (LHS->getType() != VecTy)
        LHS = Builder.CreateIntCast(LHS, VecTy, GetOperandSignedness(0));
      if (RHS->getType() != VecTy)
        RHS = Builder.CreateIntCast(RHS, VecTy, GetOperandSignedness(1));
    }
    Value *V = Builder.CreateBinOp(
        static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
    propagateIRFlags(V, E->Scalars, VL0, It == MinBWs.end());
    if (auto *I = dyn_cast<Instruction>(V)) {
      V = ::propagateMetadata(I, E->Scalars);
      // Drop nuw flags in the presence of commutative sub operands.
      if (!MinBWs.contains(E) && ShuffleOrOp == Instruction::Sub &&
          any_of(E->Scalars, [&](Value *V) {
            return isa<PoisonValue>(V) ||
                   (E->hasCopyableElements() && E->isCopyableElement(V)) ||
                   isCommutative(cast<Instruction>(V));
          }))
        I->setHasNoUnsignedWrap(false);
    }
    V = FinalShuffle(V, E);
    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::Load: {
    // Loads are inserted at the head of the tree because we don't want to
    // sink them all the way down past store instructions.
    setInsertPointAfterBundle(E);
    LoadInst *LI = cast<LoadInst>(VL0);
    Instruction *NewLI;
    FixedVectorType *StridedLoadTy = nullptr;
    Value *PO = LI->getPointerOperand();
    if (E->State == TreeEntry::Vectorize) {
      NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign());
    } else if (E->State == TreeEntry::CompressVectorize) {
      auto [CompressMask, LoadVecTy, InterleaveFactor, IsMasked] =
          CompressEntryToData.at(E);
      Align CommonAlignment = LI->getAlign();
      if (IsMasked) {
        // ...
        for (int I : CompressMask)
          MaskValues[I] = ConstantInt::getTrue(VecTy->getContext());
        // ...
        MaskValues = replicateMask(MaskValues, VecTy->getNumElements());
        // ...
        NewLI = Builder.CreateMaskedLoad(LoadVecTy, PO, CommonAlignment,
                                         /*...*/);
      } else {
        NewLI = Builder.CreateAlignedLoad(LoadVecTy, PO, CommonAlignment);
      }
      // ...
    } else if (E->State == TreeEntry::StridedVectorize) {
      // ...
      PO = IsReverseOrder ? PtrN : Ptr0;
      Type *StrideTy = DL->getIndexType(PO->getType());
      Value *StrideVal;
      const StridedPtrInfo &SPtrInfo = TreeEntryToStridedPtrInfoMap.at(E);
      StridedLoadTy = SPtrInfo.Ty;
      assert(StridedLoadTy && "Missing StridedPoinerInfo for tree entry.");
      unsigned StridedLoadEC =
          StridedLoadTy->getElementCount().getKnownMinValue();
      Value *Stride = SPtrInfo.StrideVal;
      if (!Stride) {
        const SCEV *StrideSCEV = SPtrInfo.StrideSCEV;
        assert(StrideSCEV && "Neither StrideVal nor StrideSCEV were set.");
        SCEVExpander Expander(*SE, *DL, "strided-load-vec");
        Stride = Expander.expandCodeFor(StrideSCEV, StrideSCEV->getType(),
                                        &*Builder.GetInsertPoint());
      }
      Value *NewStride =
          Builder.CreateIntCast(Stride, StrideTy, /*isSigned=*/true);
      StrideVal = Builder.CreateMul(
          NewStride,
          ConstantInt::get(StrideTy, (IsReverseOrder ? -1 : 1) *
                                         static_cast<int>(
                                             DL->getTypeAllocSize(ScalarTy))));
      // ...
      auto *Inst = Builder.CreateIntrinsic(
          Intrinsic::experimental_vp_strided_load,
          {StridedLoadTy, PO->getType(), StrideTy},
          {PO, StrideVal, /*...*/
           Builder.getInt32(StridedLoadEC)});
      Inst->addParamAttr(
          /*ArgNo=*/0,
          Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
      NewLI = Inst;
    } else {
      assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
      Value *VecPtr = vectorizeOperand(E, 0);
      // ...
      if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
        unsigned ScalarTyNumElements = VecTy->getNumElements();
        unsigned VecTyNumElements =
            /*...*/;
        assert(VecTyNumElements % ScalarTyNumElements == 0 &&
               "Cannot expand getelementptr.");
        unsigned VF = VecTyNumElements / ScalarTyNumElements;
        // ...
        //   return Builder.getInt64(I % ScalarTyNumElements);
        VecPtr = Builder.CreateGEP(
            VecTy->getElementType(),
            Builder.CreateShuffleVector(
                VecPtr, createReplicatedMask(ScalarTyNumElements, VF)),
            /*...*/);
      }
      // Use the minimum alignment of the gathered loads.
      NewLI = Builder.CreateMaskedGather(VecTy, VecPtr, CommonAlignment);
    }
    Value *V = E->State == TreeEntry::CompressVectorize
                   ? NewLI
                   : ::propagateMetadata(NewLI, E->Scalars);
    if (StridedLoadTy != VecTy)
      V = Builder.CreateBitOrPointerCast(V, VecTy);
    V = FinalShuffle(V, E);
    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
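  // Illustrative sketch (not part of the pass): a strided load like the one
  // above boils down to one llvm.experimental.vp.strided.load call; the byte
  // stride is negative when lanes are emitted in reverse order. Hedged
  // standalone form:
#if 0
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Intrinsics.h"
using namespace llvm;
static Value *emitStridedLoad(IRBuilderBase &Builder, FixedVectorType *VecTy,
                              Value *Ptr, Value *StrideInBytes, unsigned EC) {
  Type *StrideTy = StrideInBytes->getType();
  // All-true mask, explicit element count (the "vp" part of the intrinsic).
  CallInst *Load = Builder.CreateIntrinsic(
      Intrinsic::experimental_vp_strided_load,
      {VecTy, Ptr->getType(), StrideTy},
      {Ptr, StrideInBytes, Builder.getAllOnesMask(VecTy->getElementCount()),
       Builder.getInt32(EC)});
  return Load;
}
#endif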
  case Instruction::Store: {
    auto *SI = cast<StoreInst>(VL0);
    setInsertPointAfterBundle(E);
    Value *VecValue = vectorizeOperand(E, 0);
    if (VecValue->getType() != VecTy)
      VecValue =
          Builder.CreateIntCast(VecValue, VecTy, GetOperandSignedness(0));
    VecValue = FinalShuffle(VecValue, E);
    Value *Ptr = SI->getPointerOperand();
    Instruction *ST;
    if (E->State == TreeEntry::Vectorize) {
      ST = Builder.CreateAlignedStore(VecValue, Ptr, SI->getAlign());
    } else {
      assert(E->State == TreeEntry::StridedVectorize &&
             "Expected either strided or consecutive stores.");
      if (!E->ReorderIndices.empty()) {
        SI = cast<StoreInst>(E->Scalars[E->ReorderIndices.front()]);
        Ptr = SI->getPointerOperand();
      }
      Align CommonAlignment = computeCommonAlignment<StoreInst>(E->Scalars);
      Type *StrideTy = DL->getIndexType(SI->getPointerOperandType());
      auto *Inst = Builder.CreateIntrinsic(
          Intrinsic::experimental_vp_strided_store,
          {VecTy, Ptr->getType(), StrideTy},
          {VecValue, Ptr,
           ConstantInt::get(
               StrideTy, -static_cast<int>(DL->getTypeAllocSize(ScalarTy))),
           Builder.getAllOnesMask(VecTy->getElementCount()),
           Builder.getInt32(E->Scalars.size())});
      Inst->addParamAttr(
          /*ArgNo=*/1,
          Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
      ST = Inst;
    }
    Value *V = ::propagateMetadata(ST, E->Scalars);
    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::GetElementPtr: {
    auto *GEP0 = cast<GetElementPtrInst>(VL0);
    setInsertPointAfterBundle(E);
    Value *Op0 = vectorizeOperand(E, 0);
    SmallVector<Value *> OpVecs;
    for (int J = 1, N = GEP0->getNumOperands(); J < N; ++J) {
      Value *OpVec = vectorizeOperand(E, J);
      OpVecs.push_back(OpVec);
    }
    Value *V = Builder.CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs);
    if (Instruction *I = dyn_cast<GetElementPtrInst>(V)) {
      SmallVector<Value *> GEPs;
      for (Value *V : E->Scalars) {
        if (isa<GetElementPtrInst>(V))
          GEPs.push_back(V);
      }
      V = ::propagateMetadata(I, GEPs);
    }
    V = FinalShuffle(V, E);
    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::Call: {
    CallInst *CI = cast<CallInst>(VL0);
    setInsertPointAfterBundle(E);
    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
    SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(
        CI, ID, VecTy->getNumElements(),
        It != MinBWs.end() ? It->second.first : 0, TTI);
    auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
    bool UseIntrinsic = ID != Intrinsic::not_intrinsic &&
                        VecCallCosts.first <= VecCallCosts.second;
    Value *ScalarArg = nullptr;
    SmallVector<Value *> OpVecs;
    // ...
    for (unsigned I : seq<unsigned>(CI->arg_size())) {
      // Some intrinsics have scalar arguments. This argument should not be
      // vectorized.
      if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(ID, I, TTI)) {
        ScalarArg = CEI->getArgOperand(I);
        // If we decided to reduce the bitwidth of an abs intrinsic, its second
        // argument must be set to false (do not return poison, if the value is
        // the signed minimum).
        if (ID == Intrinsic::abs && It != MinBWs.end() &&
            It->second.first < DL->getTypeSizeInBits(CEI->getType()))
          ScalarArg = Builder.getFalse();
        OpVecs.push_back(ScalarArg);
        // ...
        continue;
      }
      Value *OpVec = vectorizeOperand(E, I);
      ScalarArg = CEI->getArgOperand(I);
      if (cast<VectorType>(OpVec->getType())->getElementType() !=
              ScalarArg->getType()->getScalarType() &&
          It == MinBWs.end()) {
        auto *CastTy =
            getWidenedType(ScalarArg->getType(), VecTy->getNumElements());
        OpVec = Builder.CreateIntCast(OpVec, CastTy, GetOperandSignedness(I));
      } else if (It != MinBWs.end()) {
        OpVec = Builder.CreateIntCast(OpVec, VecTy, GetOperandSignedness(I));
      }
      OpVecs.push_back(OpVec);
      // ...
    }
    Function *CF;
    if (!UseIntrinsic) {
      VFShape Shape =
          VFShape::get(CI->getFunctionType(),
                       ElementCount::getFixed(VecTy->getNumElements()),
                       /*HasGlobalPred=*/false);
      CF = VFDatabase(*CI).getVectorizedFunction(Shape);
    } else {
      // ...
    }
    SmallVector<OperandBundleDef, 1> OpBundles;
    CI->getOperandBundlesAsDefs(OpBundles);
    Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);
    propagateIRFlags(V, E->Scalars, VL0);
    V = FinalShuffle(V, E);
    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::ShuffleVector: {
    Value *V;
    if (SLPReVec && !E->isAltShuffle()) {
      setInsertPointAfterBundle(E);
      Value *Src = vectorizeOperand(E, 0);
      SmallVector<int> ThisMask(calculateShufflevectorMask(E->Scalars));
      if (auto *SVSrc = dyn_cast<ShuffleVectorInst>(Src)) {
        SmallVector<int> NewMask(ThisMask.size());
        transform(ThisMask, NewMask.begin(), [&SVSrc](int Mask) {
          return SVSrc->getShuffleMask()[Mask];
        });
        V = Builder.CreateShuffleVector(SVSrc->getOperand(0),
                                        SVSrc->getOperand(1), NewMask);
      } else {
        V = Builder.CreateShuffleVector(Src, ThisMask);
      }
      propagateIRFlags(V, E->Scalars, VL0);
      if (auto *I = dyn_cast<Instruction>(V))
        V = ::propagateMetadata(I, E->Scalars);
      V = FinalShuffle(V, E);
    } else {
      assert(E->isAltShuffle() &&
             ((Instruction::isBinaryOp(E->getOpcode()) &&
               Instruction::isBinaryOp(E->getAltOpcode())) ||
              (Instruction::isCast(E->getOpcode()) &&
               Instruction::isCast(E->getAltOpcode())) ||
              (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
             "Invalid Shuffle Vector Operand");
      Value *LHS = nullptr, *RHS = nullptr;
      if (Instruction::isBinaryOp(E->getOpcode()) || isa<CmpInst>(VL0)) {
        setInsertPointAfterBundle(E);
        LHS = vectorizeOperand(E, 0);
        RHS = vectorizeOperand(E, 1);
      } else {
        setInsertPointAfterBundle(E);
        LHS = vectorizeOperand(E, 0);
      }
      if (LHS && RHS &&
          /*...*/ (LHS->getType() != VecTy || RHS->getType() != VecTy)) {
        assert((It != MinBWs.end() ||
                getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
                getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
                MinBWs.contains(getOperandEntry(E, 0)) ||
                MinBWs.contains(getOperandEntry(E, 1))) &&
               "Expected item in MinBWs.");
        Type *CastTy = VecTy;
        if (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()) {
          if (cast<VectorType>(LHS->getType())
                  ->getElementType()
                  ->getIntegerBitWidth() < cast<VectorType>(RHS->getType())
                                               ->getElementType()
                                               ->getIntegerBitWidth())
            CastTy = RHS->getType();
          else
            CastTy = LHS->getType();
        }
        if (LHS->getType() != CastTy)
          LHS = Builder.CreateIntCast(LHS, CastTy, GetOperandSignedness(0));
        if (RHS->getType() != CastTy)
          RHS = Builder.CreateIntCast(RHS, CastTy, GetOperandSignedness(1));
      }
      Value *V0, *V1;
      if (Instruction::isBinaryOp(E->getOpcode())) {
        V0 = Builder.CreateBinOp(
            static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
        V1 = Builder.CreateBinOp(
            static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS);
      } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
        V0 = Builder.CreateCmp(CI0->getPredicate(), LHS, RHS);
        CmpInst::Predicate AltPred =
            cast<CmpInst>(E->getAltOp())->getPredicate();
        V1 = Builder.CreateCmp(AltPred, LHS, RHS);
      } else {
        if (LHS->getType()->isIntOrIntVectorTy() && ScalarTy->isIntegerTy()) {
          unsigned SrcBWSz = DL->getTypeSizeInBits(
              cast<VectorType>(LHS->getType())->getElementType());
          unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
          if (BWSz <= SrcBWSz) {
            if (BWSz < SrcBWSz)
              LHS = Builder.CreateIntCast(LHS, VecTy, It->second.first);
            assert(LHS->getType() == VecTy &&
                   "Expected same type as operand.");
            // ...
            E->VectorizedValue = LHS;
            ++NumVectorInstructions;
            return LHS;
          }
        }
        V0 = Builder.CreateCast(
            static_cast<Instruction::CastOps>(E->getOpcode()), LHS, VecTy);
        V1 = Builder.CreateCast(
            static_cast<Instruction::CastOps>(E->getAltOpcode()), LHS, VecTy);
      }
      // Add V0 and V1 to later analysis to try to find and remove matching
      // instruction, if any.
      for (Value *V : {V0, V1}) {
        if (auto *I = dyn_cast<Instruction>(V)) {
          GatherShuffleExtractSeq.insert(I);
          CSEBlocks.insert(I->getParent());
        }
      }
      // Create a shuffle to take alternate operations from the vector. Also,
      // gather up main and alt scalar ops to propagate IR flags to each vector
      // operation.
      ValueList OpScalars, AltScalars;
      SmallVector<int> Mask;
      E->buildAltOpShuffleMask(
          [E, this](Instruction *I) {
            assert(E->getMatchingMainOpOrAltOp(I) &&
                   "Unexpected main/alternate opcode");
            return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
                                          *TLI);
          },
          Mask, &OpScalars, &AltScalars);
      // ...
      auto DropNuwFlag = [&](Value *Vec, unsigned Opcode) {
        if (auto *I = dyn_cast<Instruction>(Vec);
            I && Opcode == Instruction::Sub && !MinBWs.contains(E) &&
            any_of(E->Scalars, [&](Value *V) {
              if (isa<PoisonValue>(V))
                return false;
              if (E->hasCopyableElements() && E->isCopyableElement(V))
                return false;
              auto *IV = cast<Instruction>(V);
              return IV->getOpcode() == Instruction::Sub && isCommutative(IV);
            }))
          I->setHasNoUnsignedWrap(false);
      };
      DropNuwFlag(V0, E->getOpcode());
      DropNuwFlag(V1, E->getAltOpcode());
      // ...
      V = Builder.CreateShuffleVector(V0, V1, Mask);
      if (auto *I = dyn_cast<Instruction>(V)) {
        V = ::propagateMetadata(I, E->Scalars);
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
    }
    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  default:
    llvm_unreachable("unknown inst");
  }
  return nullptr;
}
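// Illustrative sketch (not part of the pass): the alternate-opcode case
// computes both vector operations over the full width and blends them with a
// single shufflevector whose mask picks lane I from V0 or V1 according to each
// scalar's opcode. Hedged standalone form:
#if 0
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/IRBuilder.h"
using namespace llvm;
// UsesAlt[I] is true when lane I takes the alternate operation's result.
static Value *blendAltOps(IRBuilderBase &Builder, Value *V0, Value *V1,
                          ArrayRef<bool> UsesAlt) {
  int E = static_cast<int>(UsesAlt.size());
  SmallVector<int> Mask(E);
  for (int I = 0; I != E; ++I)
    Mask[I] = UsesAlt[I] ? E + I : I;
  return Builder.CreateShuffleVector(V0, V1, Mask);
}
#endif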
Value *BoUpSLP::vectorizeTree(
    const ExtraValueToDebugLocsMap &ExternallyUsedValues,
    Instruction *ReductionRoot,
    ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales) {
  // Clean the Entry-to-LastInstruction table.
  EntryToLastInstruction.clear();
  // ...
  // All blocks must be scheduled before any instructions are inserted.
  for (auto &BSIter : BlocksSchedules)
    scheduleBlock(*this, BSIter.second.get());
  // Cache last instructions for the nodes to avoid side effects, which may
  // appear during vectorization, like extra uses, etc.
  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    if (TE->isGather())
      continue;
    (void)getLastInstructionInBundle(TE.get());
  }

  if (ReductionRoot)
    Builder.SetInsertPoint(ReductionRoot->getParent(),
                           ReductionRoot->getIterator());
  else
    Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());

  // Emit gathered loads first to emit better code for the users of those
  // gathered loads.
  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    if (TE->isGather() && !TE->VectorizedValue && TE->UserTreeIndex.UserTE &&
        TE->UserTreeIndex.UserTE->hasState() &&
        TE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
        (TE->UserTreeIndex.UserTE->getOpcode() != Instruction::PHI ||
         TE->UserTreeIndex.UserTE->isAltShuffle()) &&
        !TE->UserTreeIndex.UserTE->hasCopyableElements() &&
        all_of(TE->UserTreeIndex.UserTE->Scalars,
               [](Value *V) { return isUsedOutsideBlock(V); })) {
      Instruction &LastInst =
          getLastInstructionInBundle(TE->UserTreeIndex.UserTE);
      // ...
    }
  }
  for (auto &Entry : GatherEntries) {
    // ...
    Builder.SetInsertPoint(Entry.second);
    Builder.SetCurrentDebugLocation(Entry.second->getDebugLoc());
    // ...
  }
  // ...
  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    if (GatheredLoadsEntriesFirst.has_value() &&
        TE->Idx >= *GatheredLoadsEntriesFirst && !TE->VectorizedValue &&
        (!TE->isGather() || TE->UserTreeIndex)) {
      assert((TE->UserTreeIndex ||
              (TE->getOpcode() == Instruction::Load && !TE->isGather())) &&
             "Expected gathered load node.");
      (void)vectorizeTree(TE.get());
    }
  }
  // ...
  // Run through the list of postponed gathers and emit them, replacing the
  // temporary emitted placeholders with the actual vector instructions.
  ArrayRef<const TreeEntry *> PostponedNodes = PostponedGathers.getArrayRef();
  DenseMap<Value *, SmallVector<TreeEntry *>> PostponedValues;
  for (const TreeEntry *E : PostponedNodes) {
    auto *TE = const_cast<TreeEntry *>(E);
    // ...
    auto *PrevVec = cast<Instruction>(TE->VectorizedValue);
    TE->VectorizedValue = nullptr;
    auto *UserI =
        cast<Instruction>(TE->UserTreeIndex.UserTE->VectorizedValue);
    // If the user is a PHI node, its vector code has to be inserted right
    // before the block terminator.
    if (isa<PHINode>(UserI) ||
        (TE->UserTreeIndex.UserTE->hasState() &&
         TE->UserTreeIndex.UserTE->getOpcode() == Instruction::PHI)) {
      // ...
    } else {
      // ...
      if (UI->comesBefore(InsertPt))
        // ...
      Builder.SetInsertPoint(InsertPt);
    }
    // ...
    Builder.SetInsertPoint(PrevVec);
    Builder.SetCurrentDebugLocation(UserI->getDebugLoc());
    Value *Vec = vectorizeTree(TE);
    if (auto *VecI = dyn_cast<Instruction>(Vec);
        VecI && VecI->getParent() == Builder.GetInsertBlock() &&
        Builder.GetInsertPoint()->comesBefore(VecI))
      VecI->moveBeforePreserving(*Builder.GetInsertBlock(),
                                 Builder.GetInsertPoint());
    if (Vec->getType() != PrevVec->getType()) {
      assert(Vec->getType()->isIntOrIntVectorTy() &&
             PrevVec->getType()->isIntOrIntVectorTy() &&
             "Expected integer vector types only.");
      std::optional<bool> IsSigned;
      for (Value *V : TE->Scalars) {
        // ...
        for (const TreeEntry *MNTE : getTreeEntries(V)) {
          auto It = MinBWs.find(MNTE);
          if (It != MinBWs.end()) {
            IsSigned = IsSigned.value_or(false) || It->second.second;
            if (*IsSigned)
              break;
          }
        }
        if (IsSigned.value_or(false))
          break;
        // Scan through the buildvector (gather) nodes as well.
        for (const TreeEntry *BVE : ValueToGatherNodes.lookup(V)) {
          auto It = MinBWs.find(BVE);
          if (It != MinBWs.end()) {
            IsSigned = IsSigned.value_or(false) || It->second.second;
            if (*IsSigned)
              break;
          }
        }
        if (IsSigned.value_or(false))
          break;
        // ...
        IsSigned = IsSigned.value_or(false) ||
                   /*...*/;
        if (IsSigned.value_or(false))
          break;
      }
      if (IsSigned.value_or(false)) {
        // Final attempt - check the user node.
        auto It = MinBWs.find(TE->UserTreeIndex.UserTE);
        if (It != MinBWs.end())
          IsSigned = It->second.second;
      }
      assert(IsSigned &&
             "Expected user node or perfect diamond match in MinBWs.");
      Vec = Builder.CreateIntCast(Vec, PrevVec->getType(), *IsSigned);
    }
    PrevVec->replaceAllUsesWith(Vec);
    PostponedValues.try_emplace(Vec).first->second.push_back(TE);
    // Replace the stub vector node, if it was used before for one of the
    // buildvector nodes already.
    auto It = PostponedValues.find(PrevVec);
    if (It != PostponedValues.end()) {
      for (TreeEntry *VTE : It->getSecond())
        VTE->VectorizedValue = Vec;
    }
    // ...
  }
  for (const auto &ExternalUse : ExternalUses) {
    Value *Scalar = ExternalUse.Scalar;
    const TreeEntry *E = &ExternalUse.E;
    assert(E && "Invalid scalar");
    assert(!E->isGather() && "Extracting from a gather list");
    // Non-instruction pointers are not deleted, just skip them.
    if (E->getOpcode() == Instruction::GetElementPtr &&
        !isa<GetElementPtrInst>(Scalar))
      continue;

    Value *Vec = E->VectorizedValue;
    assert(Vec && "Can't find vectorizable value");

    Value *Lane = Builder.getInt32(ExternalUse.Lane);
    auto ExtractAndExtendIfNeeded = [&](Value *Vec) {
      if (Scalar->getType() != Vec->getType()) {
        Value *Ex = nullptr;
        Value *ExV = nullptr;
        auto *Inst = dyn_cast<Instruction>(Scalar);
        bool ReplaceInst = Inst && ExternalUsesAsOriginalScalar.contains(Inst);
        auto It = ScalarToEEs.find(Scalar);
        if (It != ScalarToEEs.end()) {
          // No need to emit many extracts, just move the only one in the
          // current block.
          auto EEIt = It->second.find(ReplaceInst ? Inst->getParent()
                                                  : Builder.GetInsertBlock());
          if (EEIt != It->second.end()) {
            Value *PrevV = EEIt->second.first;
            if (auto *I = dyn_cast<Instruction>(PrevV);
                I && !ReplaceInst &&
                Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() &&
                Builder.GetInsertPoint()->comesBefore(I))
              I->moveBefore(*Builder.GetInsertPoint()->getParent(),
                            Builder.GetInsertPoint());
            Ex = PrevV;
            ExV = EEIt->second.second ? EEIt->second.second : Ex;
          }
        }
        if (!Ex) {
          if (auto *EE = dyn_cast<ExtractElementInst>(Inst);
              ReplaceInst && EE) {
            // Leave the original extractelement as is and just reuse it.
            IgnoredExtracts.insert(EE);
            Ex = EE;
          } else if (ReplaceInst) {
            auto *CloneInst = Inst->clone();
            CloneInst->insertBefore(Inst->getIterator());
            if (Inst->hasName())
              CloneInst->takeName(Inst);
            Ex = CloneInst;
          } else if (auto *ES = dyn_cast<ExtractElementInst>(Scalar)) {
            Value *V = ES->getVectorOperand();
            if (ArrayRef<TreeEntry *> ETEs = getTreeEntries(V); !ETEs.empty())
              V = ETEs.front()->VectorizedValue;
            if (auto *IV = dyn_cast<Instruction>(V);
                !IV || IV == Vec || IV->getParent() != IVec->getParent() ||
                IV->comesBefore(IVec))
              Ex = Builder.CreateExtractElement(V, ES->getIndexOperand());
            else
              Ex = Builder.CreateExtractElement(Vec, Lane);
          } else if (auto *VecTy =
                         dyn_cast<FixedVectorType>(Scalar->getType())) {
            // REVEC: the "scalar" is itself a vector, extract a subvector.
            unsigned VecTyNumElements = VecTy->getNumElements();
            Ex = createExtractVector(Builder, Vec, VecTyNumElements,
                                     ExternalUse.Lane * VecTyNumElements);
          } else {
            Ex = Builder.CreateExtractElement(Vec, Lane);
          }
          // If necessary, sign-extend or zero-extend the extracted value back
          // to the original scalar type.
          ExV = Ex;
          if (Scalar->getType() != Ex->getType())
            ExV = Builder.CreateIntCast(
                Ex, Scalar->getType(),
                !isKnownNonNegative(Scalar, SimplifyQuery(*DL)));
          ScalarToEEs[Scalar].try_emplace(
              isa<Instruction>(Ex) ? cast<Instruction>(Ex)->getParent()
                                   : &F->getEntryBlock(),
              std::make_pair(Ex, ExV));
        }
        if (auto *ExI = dyn_cast<Instruction>(Ex)) {
          GatherShuffleExtractSeq.insert(ExI);
          CSEBlocks.insert(ExI->getParent());
        }
        return ExV;
      }
      assert(isa<FixedVectorType>(Scalar->getType()) &&
             isa<InsertElementInst>(Scalar) &&
             "In-tree scalar of vector type is not insertelement?");
      return Vec;
    };
    // If User == nullptr, the Scalar remains as scalar in the vectorized
    // instructions or is used as an extra argument.
    if (!User) {
      if (!ScalarsWithNullptrUser.insert(Scalar).second)
        continue;
      assert((ExternallyUsedValues.count(Scalar) ||
              ExternalUsesWithNonUsers.count(Scalar) ||
              ExternalUsesAsOriginalScalar.contains(Scalar) ||
              any_of(Scalar->users(),
                     [&](llvm::User *U) {
                       if (ExternalUsesAsOriginalScalar.contains(U))
                         return true;
                       ArrayRef<TreeEntry *> UseEntries = getTreeEntries(U);
                       return !UseEntries.empty() &&
                              (E->State == TreeEntry::Vectorize ||
                               E->State == TreeEntry::StridedVectorize ||
                               E->State == TreeEntry::CompressVectorize) &&
                              any_of(UseEntries,
                                     [&, TTI = TTI](TreeEntry *UseEntry) {
                                       return (UseEntry->State ==
                                                   TreeEntry::Vectorize ||
                                               UseEntry->State ==
                                                   TreeEntry::StridedVectorize ||
                                               UseEntry->State ==
                                                   TreeEntry::CompressVectorize) &&
                                              doesInTreeUserNeedToExtract(
                                                  Scalar,
                                                  getRootEntryInstruction(
                                                      *UseEntry),
                                                  TLI, TTI);
                                     });
                     })) &&
             "Scalar with nullptr User must be registered in "
             "ExternallyUsedValues map or remain as scalar in vectorized "
             "instructions");
      if (auto *PHI = dyn_cast<PHINode>(VecI)) {
        if (PHI->getParent()->isLandingPad())
          Builder.SetInsertPoint(
              PHI->getParent(),
              std::next(PHI->getParent()->getLandingPadInst()->getIterator()));
        else
          Builder.SetInsertPoint(PHI->getParent(),
                                 PHI->getParent()->getFirstNonPHIIt());
      } else if (VecI) {
        Builder.SetInsertPoint(VecI->getParent(),
                               std::next(VecI->getIterator()));
      } else {
        Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
      }
      Value *NewInst = ExtractAndExtendIfNeeded(Vec);
      if (Scalar != NewInst) {
        assert((!isa<ExtractElementInst>(Scalar) ||
                !IgnoredExtracts.contains(cast<ExtractElementInst>(Scalar))) &&
               "Extractelements should not be replaced.");
        Scalar->replaceAllUsesWith(NewInst);
      }
      continue;
    }

    if (auto *VU = dyn_cast<InsertElementInst>(User);
        VU && VU->getOperand(1) == Scalar) {
      if (auto *FTy = dyn_cast<FixedVectorType>(User->getType())) {
        if (!UsedInserts.insert(VU).second)
          continue;
        // Need to use the original vector, if the root is truncated.
        auto BWIt = MinBWs.find(E);
        if (BWIt != MinBWs.end()) {
          auto *ScalarTy = FTy->getElementType();
          auto Key = std::make_pair(Vec, ScalarTy);
          auto VecIt = VectorCasts.find(Key);
          if (VecIt == VectorCasts.end()) {
            if (auto *IVec = dyn_cast<PHINode>(Vec)) {
              if (IVec->getParent()->isLandingPad())
                Builder.SetInsertPoint(IVec->getParent(),
                                       std::next(IVec->getParent()
                                                     ->getLandingPadInst()
                                                     ->getIterator()));
              else
                Builder.SetInsertPoint(
                    IVec->getParent()->getFirstNonPHIOrDbgOrLifetime());
            } else if (auto *IVec = dyn_cast<Instruction>(Vec)) {
              Builder.SetInsertPoint(IVec->getNextNode());
            }
            Vec = Builder.CreateIntCast(
                Vec,
                getWidenedType(ScalarTy,
                               cast<FixedVectorType>(Vec->getType())
                                   ->getNumElements()),
                BWIt->second.second);
            VectorCasts.try_emplace(Key, Vec);
          } else {
            Vec = VecIt->second;
          }
        }

        auto *It = find_if(
            ShuffledInserts, [VU](const ShuffledInsertData<Value *> &Data) {
              // Checks if 2 insertelements are from the same buildvector.
              InsertElementInst *VecInsert = Data.InsertElements.front();
              return areTwoInsertFromSameBuildVector(
                  VU, VecInsert,
                  [](InsertElementInst *II) { return II->getOperand(0); });
            });
        unsigned Idx = *InsertIdx;
        if (It == ShuffledInserts.end()) {
          (void)ShuffledInserts.emplace_back();
          It = std::next(ShuffledInserts.begin(), ShuffledInserts.size() - 1);
        }
        Mask[Idx] = ExternalUse.Lane;
        continue;
      }
    }

    if (auto *PH = dyn_cast<PHINode>(User)) {
      for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
        if (PH->getIncomingValue(I) == Scalar) {
          Instruction *IncomingTerminator =
              PH->getIncomingBlock(I)->getTerminator();
          if (isa<CatchSwitchInst>(IncomingTerminator))
            Builder.SetInsertPoint(VecI->getParent(),
                                   std::next(VecI->getIterator()));
          else
            Builder.SetInsertPoint(PH->getIncomingBlock(I)->getTerminator());
          Value *NewInst = ExtractAndExtendIfNeeded(Vec);
          PH->setOperand(I, NewInst);
        }
      }
    } else if (isa<Instruction>(Vec)) {
      Value *NewInst = ExtractAndExtendIfNeeded(Vec);
      User->replaceUsesOfWith(Scalar, NewInst);
    } else {
      Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
      Value *NewInst = ExtractAndExtendIfNeeded(Vec);
      User->replaceUsesOfWith(Scalar, NewInst);
    }
  }
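  // Finalize the buildvector sequences collected in ShuffledInserts: combine
  // the per-vector extract masks into at most two-source shuffles and rewire
  // the insertelement chains onto the shuffled values.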
  auto CreateShuffle = [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
    unsigned VF = cast<FixedVectorType>(V1->getType())->getNumElements();
    SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
    SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
    for (int I = 0, E = Mask.size(); I < E; ++I) {
      if (Mask[I] < static_cast<int>(VF))
        CombinedMask1[I] = Mask[I];
      else
        CombinedMask2[I] = Mask[I] - VF;
    }
    ShuffleInstructionBuilder ShuffleBuilder(
        cast<VectorType>(V1->getType())->getElementType(), Builder, *this);
    ShuffleBuilder.add(V1, CombinedMask1);
    if (V2)
      ShuffleBuilder.add(V2, CombinedMask2);
    return ShuffleBuilder.finalize({}, {}, {});
  };
  auto &&ResizeToVF = [&CreateShuffle](Value *Vec, ArrayRef<int> Mask,
                                       bool ForSingleMask) {
    unsigned VF = Mask.size();
    if (any_of(Mask,
               [VF](int Idx) { return Idx >= static_cast<int>(VF); })) {
      Vec = CreateShuffle(Vec, nullptr, Mask);
      return std::make_pair(Vec, true);
    }
    if (!ForSingleMask) {
      SmallVector<int> ResizeMask(VF, PoisonMaskElem);
      for (unsigned I = 0; I < VF; ++I) {
        if (Mask[I] != PoisonMaskElem)
          ResizeMask[Mask[I]] = Mask[I];
      }
      Vec = CreateShuffle(Vec, nullptr, ResizeMask);
    }
    return std::make_pair(Vec, false);
  };

  for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
    InsertElementInst *FirstInsert = ShuffledInserts[I].InsertElements.front();
    InsertElementInst *LastInsert = ShuffledInserts[I].InsertElements.back();
    Builder.SetInsertPoint(LastInsert);
    auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
    Value *NewInst = performExtractsShuffleAction<Value>(
        MutableArrayRef(Vector.data(), Vector.size()),
        FirstInsert->getOperand(0),
        [](Value *Vec) {
          return cast<VectorType>(Vec->getType())
              ->getElementCount()
              .getKnownMinValue();
        },
        ResizeToVF,
        [FirstInsert, &CreateShuffle](ArrayRef<int> Mask,
                                      ArrayRef<Value *> Vals) {
          assert((Vals.size() == 1 || Vals.size() == 2) &&
                 "Expected exactly 1 or 2 input values.");
          if (Vals.size() == 1) {
            // Do not create a shuffle if the mask is a simple identity
            // non-resizing mask.
            if (Mask.size() != cast<FixedVectorType>(Vals.front()->getType())
                                   ->getNumElements() ||
                !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
              return CreateShuffle(Vals.front(), nullptr, Mask);
            return Vals.front();
          }
          return CreateShuffle(Vals.front() ? Vals.front()
                                            : FirstInsert->getOperand(0),
                               Vals.back(), Mask);
        });
    auto It = ShuffledInserts[I].InsertElements.rbegin();
    // Rebuild the buildvector chain on top of the new vector value.
    InsertElementInst *II = nullptr;
    if (It != ShuffledInserts[I].InsertElements.rend())
      II = *It;
    SmallVector<Instruction *> Inserts;
    while (It != ShuffledInserts[I].InsertElements.rend()) {
      assert(II && "Must be an insertelement instruction.");
      Inserts.push_back(II);
      ++It;
      if (It != ShuffledInserts[I].InsertElements.rend())
        II = *It;
    }
    for (Instruction *II : reverse(Inserts)) {
      II->replaceUsesOfWith(II->getOperand(0), NewInst);
      if (auto *NewI = dyn_cast<Instruction>(NewInst))
        if (II->getParent() == NewI->getParent() && II->comesBefore(NewI))
          II->moveAfter(NewI);
      NewInst = II;
    }
    LastInsert->replaceAllUsesWith(NewInst);
    for (InsertElementInst *IE : reverse(ShuffledInserts[I].InsertElements)) {
      IE->replaceUsesOfWith(IE->getOperand(0),
                            PoisonValue::get(IE->getOperand(0)->getType()));
      IE->replaceUsesOfWith(IE->getOperand(1),
                            PoisonValue::get(IE->getOperand(1)->getType()));
      eraseInstruction(IE);
    }
    CSEBlocks.insert(LastInsert->getParent());
  }
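  // All vector values have been emitted; erase the now-dead scalar
  // instructions of every vectorized lane, taking care not to poison the
  // condition operand of logical and/or selects.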
  for (auto &TEPtr : VectorizableTree) {
    TreeEntry *Entry = TEPtr.get();

    // No need to handle users of gathered values.
    if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize)
      continue;

    assert(Entry->VectorizedValue && "Can't find vectorizable value");

    // For each lane:
    for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
      if (Entry->getOpcode() == Instruction::GetElementPtr &&
          !isa<GetElementPtrInst>(Scalar))
        continue;
      if (auto *EE = dyn_cast<ExtractElementInst>(Scalar);
          EE && IgnoredExtracts.contains(EE))
        continue;
#ifndef NDEBUG
      for (User *U : Scalar->users()) {
        assert((isVectorized(U) ||
                (UserIgnoreList && UserIgnoreList->contains(U))) &&
               "Deleting out-of-tree value");
      }
#endif
      LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
    }
  }
  V->mergeDIAssignID(RemovedInsts);

  if (UserIgnoreList) {
    for (Instruction *I : RemovedInsts) {
      const TreeEntry *IE = getTreeEntries(I).front();
      if (IE->Idx != 0 &&
          !(VectorizableTree.front()->isGather() && IE->UserTreeIndex &&
            (ValueToGatherNodes.lookup(I).contains(
                 VectorizableTree.front().get()) ||
             (IE->UserTreeIndex.UserTE == VectorizableTree.front().get() &&
              IE->UserTreeIndex.EdgeIdx == UINT_MAX))) &&
          !(VectorizableTree.front()->State == TreeEntry::SplitVectorize &&
            IE->UserTreeIndex) &&
          !(GatheredLoadsEntriesFirst.has_value() &&
            IE->Idx >= *GatheredLoadsEntriesFirst &&
            VectorizableTree.front()->isGather()) &&
          !(!VectorizableTree.front()->isGather() &&
            VectorizableTree.front()->isCopyableElement(I)))
        continue;
      I->replaceUsesWithIf(PoisonValue::get(I->getType()), [&](Use &U) {
        // Do not replace the condition of a logical and/or: that would turn a
        // poison-blocking select into one that propagates poison.
        bool IsPoisoningLogicalOp = isa<SelectInst>(U.getUser()) &&
                                    (match(U.getUser(), m_LogicalAnd()) ||
                                     match(U.getUser(), m_LogicalOr())) &&
                                    U.getOperandNo() == 0;
        if (IsPoisoningLogicalOp) {
          LogicalOpSelects.push_back(cast<SelectInst>(U.getUser()));
          return false;
        }
        return UserIgnoreList->contains(U.getUser());
      });
    }
  }
  for (SelectInst *SI : LogicalOpSelects) {
    // (Condition rewrite for the collected selects elided in this excerpt.)
  }

  Builder.ClearInsertionPoint();
  InstrElementSize.clear();
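  // If the root node was demoted to a smaller integer type than the reduction
  // expects, cast the final vector back before returning it.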
  const TreeEntry &RootTE = *VectorizableTree.front();
  Value *Vec = RootTE.VectorizedValue;
  if (auto It = MinBWs.find(&RootTE);
      ReductionBitWidth != 0 && It != MinBWs.end() &&
      ReductionBitWidth != It->second.first) {
    IRBuilder<>::InsertPointGuard Guard(Builder);
    Builder.SetInsertPoint(ReductionRoot->getParent(),
                           ReductionRoot->getIterator());
    Vec = Builder.CreateIntCast(
        Vec,
        VectorType::get(Builder.getIntNTy(ReductionBitWidth),
                        cast<VectorType>(Vec->getType())->getElementCount()),
        It->second.second);
  }
  return Vec;
}

void BoUpSLP::optimizeGatherSequence() {
  LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherShuffleExtractSeq.size()
                    << " gather sequences instructions.\n");
  // LICM the gather sequences out of loops when all their operands are
  // loop-invariant.
  for (Instruction *I : GatherShuffleExtractSeq) {
    Loop *L = LI->getLoopFor(I->getParent());
    if (!L)
      continue;
    BasicBlock *PreHeader = L->getLoopPreheader();
    if (!PreHeader)
      continue;
    if (any_of(I->operands(), [L](Value *V) {
          auto *OpI = dyn_cast<Instruction>(V);
          return OpI && L->contains(OpI);
        }))
      continue;
    I->moveBefore(PreHeader->getTerminator()->getIterator());
    CSEBlocks.insert(PreHeader);
  }

  // Sort blocks by domination, so we can CSE top-down.
  SmallVector<const DomTreeNode *, 8> CSEWorkList;
  CSEWorkList.reserve(CSEBlocks.size());
  for (BasicBlock *BB : CSEBlocks) {
    const DomTreeNode *N = DT->getNode(BB);
    assert(DT->isReachableFromEntry(N));
    CSEWorkList.push_back(N);
  }
  llvm::sort(CSEWorkList, [](const DomTreeNode *A, const DomTreeNode *B) {
    assert((A == B) == (A->getDFSNumIn() == B->getDFSNumIn()) &&
           "Different nodes should have different DFS numbers");
    return A->getDFSNumIn() < B->getDFSNumIn();
  });

  // Returns true if I1 can be replaced by I2 (they compute the same value, or
  // I2's shuffle mask is a more-defined version of I1's).
  auto &&IsIdenticalOrLessDefined = [TTI = TTI](Instruction *I1,
                                                Instruction *I2,
                                                SmallVectorImpl<int> &NewMask) {
    if (I1->getType() != I2->getType())
      return false;
    auto *SI1 = dyn_cast<ShuffleVectorInst>(I1);
    auto *SI2 = dyn_cast<ShuffleVectorInst>(I2);
    if (!SI1 || !SI2)
      return I1->isIdenticalTo(I2);
    if (SI1->isIdenticalTo(SI2))
      return true;
    for (int I = 0, E = SI1->getNumOperands(); I < E; ++I)
      if (SI1->getOperand(I) != SI2->getOperand(I))
        return false;
    // Check if the second shuffle is more defined than the first one.
    NewMask.assign(SI2->getShuffleMask().begin(), SI2->getShuffleMask().end());
    ArrayRef<int> SM1 = SI1->getShuffleMask();
    // Count trailing undefs in the mask to check the final number of used
    // registers.
    unsigned LastUndefsCnt = 0;
    for (int I = 0, E = NewMask.size(); I < E; ++I) {
      if (SM1[I] == PoisonMaskElem)
        ++LastUndefsCnt;
      else
        LastUndefsCnt = 0;
      if (NewMask[I] != PoisonMaskElem && SM1[I] != PoisonMaskElem &&
          NewMask[I] != SM1[I])
        return false;
      if (NewMask[I] == PoisonMaskElem)
        NewMask[I] = SM1[I];
    }
    return SM1.size() - LastUndefsCnt > 1 &&
           TTI->getNumberOfParts(SI1->getType()) ==
               TTI->getNumberOfParts(
                   getWidenedType(SI1->getType()->getElementType(),
                                  SM1.size() - LastUndefsCnt));
  };

  // Perform an O(N^2) search over the gather/shuffle sequences and merge
  // identical instructions.
  SmallVector<Instruction *, 16> Visited;
  for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
    assert(*I &&
           (I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
           "Worklist not sorted properly!");
    BasicBlock *BB = (*I)->getBlock();
    for (Instruction &In : llvm::make_early_inc_range(*BB)) {
      if (isDeleted(&In) || !GatherShuffleExtractSeq.contains(&In))
        continue;

      // Check if we can replace this instruction with any of the visited
      // instructions (or vice versa, if the visited one is less defined).
      bool Replaced = false;
      for (Instruction *&V : Visited) {
        SmallVector<int> NewMask;
        if (IsIdenticalOrLessDefined(&In, V, NewMask) &&
            DT->dominates(V->getParent(), In.getParent())) {
          In.replaceAllUsesWith(V);
          eraseInstruction(&In);
          if (auto *SI = dyn_cast<ShuffleVectorInst>(V))
            if (!NewMask.empty())
              SI->setShuffleMask(NewMask);
          Replaced = true;
          break;
        }
        if (isa<ShuffleVectorInst>(In) && isa<ShuffleVectorInst>(V) &&
            GatherShuffleExtractSeq.contains(V) &&
            IsIdenticalOrLessDefined(V, &In, NewMask) &&
            DT->dominates(In.getParent(), V->getParent())) {
          In.moveAfter(V);
          V->replaceAllUsesWith(&In);
          eraseInstruction(V);
          if (auto *SI = dyn_cast<ShuffleVectorInst>(&In))
            if (!NewMask.empty())
              SI->setShuffleMask(NewMask);
          V = &In;
          Replaced = true;
          break;
        }
      }
      if (!Replaced)
        Visited.push_back(&In);
    }
  }
  CSEBlocks.clear();
  GatherShuffleExtractSeq.clear();
}
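// Builds a ScheduleBundle for the values in VL: copyable elements get a
// dedicated ScheduleCopyableData entry, regular members reuse their
// per-instruction ScheduleData and are registered in ScheduledBundles.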
BoUpSLP::ScheduleBundle &BoUpSLP::BlockScheduling::buildBundle(
    ArrayRef<Value *> VL, const InstructionsState &S, const EdgeInfo &EI) {
  auto &BundlePtr =
      ScheduledBundlesList.emplace_back(std::make_unique<ScheduleBundle>());
  for (Value *V : VL) {
    if (S.isNonSchedulable(V))
      continue;
    auto *I = cast<Instruction>(V);
    if (S.isCopyableElement(V)) {
      ScheduleCopyableData &SD =
          addScheduleCopyableData(EI, I, SchedulingRegionID, *BundlePtr);
      BundlePtr->add(&SD);
      continue;
    }
    ScheduleData *BundleMember = getScheduleData(V);
    assert(BundleMember && "no ScheduleData for bundle member "
                           "(maybe not in same basic block)");
    BundlePtr->add(BundleMember);
    ScheduledBundles.try_emplace(I).first->getSecond().push_back(
        BundlePtr.get());
  }
  assert(BundlePtr && *BundlePtr && "Failed to find schedule bundle");
  return *BundlePtr;
}
std::optional<BoUpSLP::ScheduleBundle *>
BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
                                            const InstructionsState &S,
                                            const EdgeInfo &EI) {
  // Copyable elements: reject bundles where the user node does not need to be
  // scheduled, is not a PHI, and some member has several binary-operator
  // users; modeling such copies is not profitable.
  if (S.areInstructionsWithCopyableElements() && EI && EI.UserTE->hasState() &&
      EI.UserTE->doesNotNeedToSchedule() &&
      EI.UserTE->getOpcode() != Instruction::PHI &&
      any_of(VL, [&](Value *V) {
        auto *I = dyn_cast<Instruction>(V);
        if (!I || I->hasOneUser())
          return false;
        for (User *U : I->users()) {
          auto *UI = cast<Instruction>(U);
          if (isa<BinaryOperator>(UI))
            return true;
        }
        return false;
      }))
    return std::nullopt;
  if (S.areInstructionsWithCopyableElements() && EI && EI.UserTE->hasState() &&
      EI.UserTE->hasCopyableElements() &&
      EI.UserTE->getMainOp()->getParent() == S.getMainOp()->getParent() &&
      any_of(VL, [&](Value *V) { return S.isCopyableElement(V); }))
    return std::nullopt;
  if (S.areInstructionsWithCopyableElements() &&
      any_of(VL, [&](Value *V) { return S.isNonSchedulable(V); }))
    return std::nullopt;
  if (S.areInstructionsWithCopyableElements() && EI) {
    bool IsNonSchedulableWithParentPhiNode =
        EI.UserTE->doesNotNeedToSchedule() && EI.UserTE->UserTreeIndex &&
        EI.UserTE->UserTreeIndex.UserTE->hasState() &&
        EI.UserTE->UserTreeIndex.UserTE->State != TreeEntry::SplitVectorize &&
        EI.UserTE->UserTreeIndex.UserTE->getOpcode() == Instruction::PHI;
    if (IsNonSchedulableWithParentPhiNode) {
      SmallSet<std::pair<Value *, Value *>, 4> Values;
      for (const auto [Idx, V] :
           enumerate(EI.UserTE->UserTreeIndex.UserTE->Scalars)) {
        Value *Op = EI.UserTE->UserTreeIndex.UserTE->getOperand(
            EI.UserTE->UserTreeIndex.EdgeIdx)[Idx];
        if (!Values.insert(std::make_pair(V, Op)).second)
          return std::nullopt;
      }
    }
  }
  bool HasCopyables = S.areInstructionsWithCopyableElements();
  if (doesNotNeedToSchedule(VL) ||
      (HasCopyables &&
       all_of(VL, [&](Value *V) { return S.isNonSchedulable(V); }))) {
    // Even if nothing needs scheduling, operands that became replaced by
    // copyable data must have their direct dependencies recalculated.
    SmallVector<ScheduleData *> ControlDependentMembers;
    for (Value *V : VL) {
      auto *I = dyn_cast<Instruction>(V);
      if (!I || (HasCopyables && S.isCopyableElement(V)))
        continue;
      SmallDenseMap<std::pair<Instruction *, Value *>, unsigned> UserOpToNumOps;
      for (const Use &U : I->operands()) {
        unsigned &NumOps =
            UserOpToNumOps.try_emplace(std::make_pair(I, U.get()), 0)
                .first->getSecond();
        ++NumOps;
        if (auto *Op = dyn_cast<Instruction>(U.get());
            Op && areAllOperandsReplacedByCopyableData(I, Op, *SLP, NumOps)) {
          if (ScheduleData *OpSD = getScheduleData(Op);
              OpSD && OpSD->hasValidDependencies()) {
            OpSD->clearDirectDependencies();
            if (RegionHasStackSave ||
                !isGuaranteedToTransferExecutionToSuccessor(OpSD->getInst()))
              ControlDependentMembers.push_back(OpSD);
          }
        }
      }
    }
    if (!ControlDependentMembers.empty()) {
      ScheduleBundle Invalid = ScheduleBundle::invalid();
      calculateDependencies(Invalid, /*InsertInReadyList=*/true, SLP,
                            ControlDependentMembers);
    }
    return nullptr;
  }

  LLVM_DEBUG(dbgs() << "SLP:  bundle: " << *S.getMainOp() << "\n");

  auto TryScheduleBundleImpl = [=](bool ReSchedule, ScheduleBundle &Bundle) {
    SmallVector<ScheduleData *> ControlDependentMembers;
    auto CheckIfNeedToClearDeps = [&](ScheduleBundle &Bundle) {
      SmallDenseMap<std::pair<Instruction *, Value *>, unsigned> UserOpToNumOps;
      for (ScheduleEntity *SE : Bundle.getBundle()) {
        auto *SD = cast<ScheduleData>(SE);
        if (ScheduleData *BundleMember = getScheduleData(SD->getInst());
            BundleMember && BundleMember->hasValidDependencies()) {
          BundleMember->clearDirectDependencies();
          if (RegionHasStackSave ||
              !isGuaranteedToTransferExecutionToSuccessor(
                  BundleMember->getInst()))
            ControlDependentMembers.push_back(BundleMember);
        }
        if (SD->hasValidDependencies() &&
            (!S.areInstructionsWithCopyableElements() ||
             !S.isCopyableElement(SD->getInst())) &&
            !getScheduleCopyableData(SD->getInst()).empty() && EI.UserTE &&
            EI.UserTE->hasState() &&
            (!EI.UserTE->hasCopyableElements() ||
             !EI.UserTE->isCopyableElement(SD->getInst())))
          SD->clearDirectDependencies();
        for (const Use &U : SD->getInst()->operands()) {
          unsigned &NumOps =
              UserOpToNumOps
                  .try_emplace(std::make_pair(SD->getInst(), U.get()), 0)
                  .first->getSecond();
          ++NumOps;
          if (auto *Op = dyn_cast<Instruction>(U.get());
              Op && areAllOperandsReplacedByCopyableData(SD->getInst(), Op,
                                                         *SLP, NumOps)) {
            if (ScheduleData *OpSD = getScheduleData(Op);
                OpSD && OpSD->hasValidDependencies()) {
              OpSD->clearDirectDependencies();
              if (RegionHasStackSave ||
                  !isGuaranteedToTransferExecutionToSuccessor(OpSD->getInst()))
                ControlDependentMembers.push_back(OpSD);
            }
          }
        }
      }
    };
    // The scheduling region got new instructions at the lower end (or it is a
    // new region for the first bundle). This makes it necessary to recalculate
    // all dependencies.
    if (OldScheduleEnd && ScheduleEnd != OldScheduleEnd) {
      for_each(ScheduleDataMap, [&](auto &P) {
        if (BB != P.first->getParent())
          return;
        ScheduleData *SD = P.second;
        if (isInSchedulingRegion(*SD))
          SD->clearDependencies();
      });
      for_each(ScheduleCopyableDataMapByInst, [&](auto &P) {
        for_each(P.second, [&](ScheduleCopyableData *SD) {
          if (isInSchedulingRegion(*SD))
            SD->clearDependencies();
        });
      });
      ReSchedule = true;
    }
    if (Bundle && !Bundle.getBundle().empty()) {
      if (S.areInstructionsWithCopyableElements() ||
          !ScheduleCopyableDataMap.empty())
        CheckIfNeedToClearDeps(Bundle);
      LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << Bundle << " in block "
                        << BB->getName() << "\n");
      calculateDependencies(Bundle, /*InsertInReadyList=*/!ReSchedule, SLP,
                            ControlDependentMembers);
    } else if (!ControlDependentMembers.empty()) {
      ScheduleBundle Invalid = ScheduleBundle::invalid();
      calculateDependencies(Invalid, /*InsertInReadyList=*/!ReSchedule, SLP,
                            ControlDependentMembers);
    }
    if (ReSchedule) {
      resetSchedule();
      initialFillReadyList(ReadyInsts);
    }
    // Now try to schedule this bundle. As soon as the bundle is "ready" it
    // means that there are no cyclic dependencies and we can schedule it.
    while (((!Bundle && ReSchedule) || (Bundle && !Bundle.isReady())) &&
           !ReadyInsts.empty()) {
      ScheduleEntity *Picked = ReadyInsts.pop_back_val();
      assert(Picked->isReady() && "must be ready to schedule");
      schedule(*SLP, S, EI, Picked, ReadyInsts);
      if (Picked == &Bundle)
        break;
    }
  };

  // Make sure that the scheduling region contains all instructions of the
  // bundle.
  for (Value *V : VL) {
    if (S.isNonSchedulable(V))
      continue;
    if (!extendSchedulingRegion(V, S)) {
      // If the scheduling region got new instructions at the lower end (or it
      // is a new region), dependencies still have to be recalculated.
      ScheduleBundle Invalid = ScheduleBundle::invalid();
      TryScheduleBundleImpl(/*ReSchedule=*/false, Invalid);
      return std::nullopt;
    }
  }

  bool ReSchedule = false;
  for (Value *V : VL) {
    if (S.isNonSchedulable(V))
      continue;
    SmallVector<ScheduleCopyableData *> CopyableData =
        getScheduleCopyableData(V);
    if (!CopyableData.empty()) {
      for (ScheduleCopyableData *SD : CopyableData)
        ReadyInsts.remove(SD);
    }
    ScheduleData *BundleMember = getScheduleData(V);
    assert((BundleMember || S.isCopyableElement(V)) &&
           "no ScheduleData for bundle member (maybe not in same basic block)");
    if (!BundleMember)
      continue;
    // Make sure we don't leave the pieces of the bundle in the ready list when
    // the whole bundle becomes ready.
    ReadyInsts.remove(BundleMember);
    if (ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(V);
        !Bundles.empty()) {
      for (ScheduleBundle *B : Bundles)
        ReadyInsts.remove(B);
      continue;
    }
    if (!S.isCopyableElement(V) && !BundleMember->isScheduled())
      continue;
    // A bundle member was scheduled as a single instruction before and now is
    // part of a bundle: the whole block has to be rescheduled.
    LLVM_DEBUG(dbgs() << "SLP:  reset schedule because " << *BundleMember
                      << " was already scheduled\n");
    ReSchedule = true;
  }

  ScheduleBundle &Bundle = buildBundle(VL, S, EI);
  TryScheduleBundleImpl(ReSchedule, Bundle);
  if (!Bundle.isReady()) {
    for (ScheduleEntity *BD : Bundle.getBundle()) {
      if (BD->isReady()) {
        ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(BD->getInst());
        if (Bundles.empty()) {
          ReadyInsts.insert(BD);
          continue;
        }
        for (ScheduleBundle *B : Bundles)
          if (B->isReady())
            ReadyInsts.insert(B);
      }
    }
    ScheduledBundlesList.pop_back();
    SmallVector<ScheduleData *> ControlDependentMembers;
    for (Value *V : VL) {
      if (S.isNonSchedulable(V))
        continue;
      auto *I = cast<Instruction>(V);
      if (S.isCopyableElement(I)) {
        // Remove the copyable element from all the scheduling maps.
        auto KV = std::make_pair(EI, I);
        assert(ScheduleCopyableDataMap.contains(KV) &&
               "no ScheduleCopyableData for copyable element");
        ScheduleCopyableData *SD =
            ScheduleCopyableDataMapByInst.find(I)->getSecond().pop_back_val();
        ScheduleCopyableDataMapByUsers[I].remove(SD);
        if (EI.UserTE) {
          ArrayRef<Value *> Op = EI.UserTE->getOperand(EI.EdgeIdx);
          const auto *It = find(Op, I);
          assert(It != Op.end() && "Lane not set");
          SmallPtrSet<Instruction *, 4> Visited;
          do {
            int Lane = std::distance(Op.begin(), It);
            assert(Lane >= 0 && "Lane not set");
            if (!EI.UserTE->ReorderIndices.empty())
              Lane = EI.UserTE->ReorderIndices[Lane];
            assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
                   "Couldn't find extract lane");
            auto *In = cast<Instruction>(EI.UserTE->Scalars[Lane]);
            if (Visited.insert(In).second)
              ScheduleCopyableDataMapByInstUser
                  [std::make_pair(std::make_pair(In, EI.EdgeIdx), I)]
                      .pop_back();
            It = find(std::next(It), Op.end(), I);
          } while (It != Op.end());
        }
        const EdgeInfo &UserEI = EI.UserTE->UserTreeIndex;
        if (ScheduleCopyableData *UserCD = getScheduleCopyableData(UserEI, I))
          ScheduleCopyableDataMapByUsers[I].insert(UserCD);
        if (ScheduleCopyableDataMapByUsers[I].empty())
          ScheduleCopyableDataMapByUsers.erase(I);
        ScheduleCopyableDataMap.erase(KV);
        if (ScheduleData *OpSD = getScheduleData(I);
            OpSD && OpSD->hasValidDependencies()) {
          OpSD->clearDirectDependencies();
          if (RegionHasStackSave ||
              !isGuaranteedToTransferExecutionToSuccessor(OpSD->getInst()))
            ControlDependentMembers.push_back(OpSD);
        }
        continue;
      }
      ScheduledBundles.find(I)->getSecond().pop_back();
    }
    if (!ControlDependentMembers.empty()) {
      ScheduleBundle Invalid = ScheduleBundle::invalid();
      calculateDependencies(Invalid, /*InsertInReadyList=*/false, SLP,
                            ControlDependentMembers);
    }
    return std::nullopt;
  }
  return &Bundle;
}
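// ScheduleData objects are allocated in fixed-size chunks to avoid frequent
// small heap allocations; this returns the next free slot.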
BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
  // Allocate a new ScheduleData for the instruction.
  if (ChunkPos >= ChunkSize) {
    ScheduleDataChunks.push_back(std::make_unique<ScheduleData[]>(ChunkSize));
    ChunkPos = 0;
  }
  return &(ScheduleDataChunks.back()[ChunkPos++]);
}
bool BoUpSLP::BlockScheduling::extendSchedulingRegion(
    Value *V, const InstructionsState &S) {
  Instruction *I = dyn_cast<Instruction>(V);
  assert(I && "bundle member must be an instruction");
  if (getScheduleData(I))
    return true;
  if (!ScheduleStart) {
    // It's the first instruction in the new region.
    initScheduleData(I, I->getNextNode(), nullptr, nullptr);
    ScheduleStart = I;
    ScheduleEnd = I->getNextNode();
    assert(ScheduleEnd && "tried to vectorize a terminator?");
    LLVM_DEBUG(dbgs() << "SLP:  initialize schedule region to " << *I << "\n");
    return true;
  }
  // Search up and down at the same time, because we don't know if the new
  // instruction is above or below the existing scheduling region.
  // Assume-like intrinsics are skipped as they do not constrain scheduling.
  BasicBlock::reverse_iterator UpIter =
      ++ScheduleStart->getIterator().getReverse();
  BasicBlock::reverse_iterator UpperEnd = BB->rend();
  BasicBlock::iterator DownIter = ScheduleEnd->getIterator();
  BasicBlock::iterator LowerEnd = BB->end();
  auto IsAssumeLikeIntr = [](const Instruction &I) {
    if (auto *II = dyn_cast<IntrinsicInst>(&I))
      return II->isAssumeLikeIntrinsic();
    return false;
  };
  UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
  DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
  while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter != I &&
         &*DownIter != I) {
    if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
      LLVM_DEBUG(dbgs() << "SLP:  exceeded schedule region size limit\n");
      return false;
    }
    ++UpIter;
    ++DownIter;
    UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
    DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
  }
  if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter == I)) {
    assert(I->getParent() == ScheduleStart->getParent() &&
           "Instruction is in wrong basic block.");
    initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
    ScheduleStart = I;
    LLVM_DEBUG(dbgs() << "SLP:  extend schedule region start to " << *I
                      << "\n");
    return true;
  }
  assert((UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter == I)) &&
         "Expected to reach top of the basic block or instruction down the "
         "lower end.");
  assert(I->getParent() == ScheduleEnd->getParent() &&
         "Instruction is in wrong basic block.");
  initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
                   nullptr);
  ScheduleEnd = I->getNextNode();
  assert(ScheduleEnd && "tried to vectorize a terminator?");
  LLVM_DEBUG(dbgs() << "SLP:  extend schedule region end to " << *I << "\n");
  return true;
}
void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
                                                Instruction *ToI,
                                                ScheduleData *PrevLoadStore,
                                                ScheduleData *NextLoadStore) {
  ScheduleData *CurrentLoadStore = PrevLoadStore;
  for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
    ScheduleData *SD = ScheduleDataMap.lookup(I);
    if (!SD) {
      SD = allocateScheduleDataChunks();
      ScheduleDataMap[I] = SD;
    }
    assert(!isInSchedulingRegion(*SD) &&
           "new ScheduleData already in scheduling region");
    SD->init(SchedulingRegionID, I);

    // Simple invariant loads can never participate in a memory dependency.
    auto CanIgnoreLoad = [](const Instruction *I) {
      const auto *LI = dyn_cast<LoadInst>(I);
      return LI && LI->isSimple() &&
             LI->getMetadata(LLVMContext::MD_invariant_load);
    };
    if (I->mayReadOrWriteMemory() && !CanIgnoreLoad(I) &&
        (!isa<IntrinsicInst>(I) ||
         (cast<IntrinsicInst>(I)->getIntrinsicID() != Intrinsic::sideeffect &&
          cast<IntrinsicInst>(I)->getIntrinsicID() !=
              Intrinsic::pseudoprobe))) {
      // Update the linked list of memory accessing instructions.
      if (CurrentLoadStore) {
        CurrentLoadStore->setNextLoadStore(SD);
      } else {
        FirstLoadStoreInRegion = SD;
      }
      CurrentLoadStore = SD;
    }

    if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
        match(I, m_Intrinsic<Intrinsic::stackrestore>()))
      RegionHasStackSave = true;
  }
  if (NextLoadStore) {
    if (CurrentLoadStore)
      CurrentLoadStore->setNextLoadStore(NextLoadStore);
  } else {
    LastLoadStoreInRegion = CurrentLoadStore;
  }
}
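// (Re)calculates the dependencies (def-use, copyable-data, control and memory
// dependencies) for all entities reachable from the given bundle, optionally
// inserting entities that become ready into the ready list.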
void BoUpSLP::BlockScheduling::calculateDependencies(
    ScheduleBundle &Bundle, bool InsertInReadyList, BoUpSLP *SLP,
    ArrayRef<ScheduleData *> ControlDeps) {
  SmallVector<ScheduleEntity *> WorkList;
  auto ProcessNode = [&](ScheduleEntity *SE) {
    if (auto *CD = dyn_cast<ScheduleCopyableData>(SE)) {
      if (CD->hasValidDependencies())
        return;
      CD->initDependencies();
      CD->resetUnscheduledDeps();
      const EdgeInfo &EI = CD->getEdgeInfo();
      ArrayRef<Value *> Op = EI.UserTE->getOperand(EI.EdgeIdx);
      const auto *It = find(Op, CD->getInst());
      assert(It != Op.end() && "Lane not set");
      SmallPtrSet<Instruction *, 4> Visited;
      do {
        int Lane = std::distance(Op.begin(), It);
        assert(Lane >= 0 && "Lane not set");
        if (!EI.UserTE->ReorderIndices.empty())
          Lane = EI.UserTE->ReorderIndices[Lane];
        assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
               "Couldn't find extract lane");
        auto *In = cast<Instruction>(EI.UserTE->Scalars[Lane]);
        if (EI.UserTE->isCopyableElement(In)) {
          // Depend on the copyable data of the user node, if any.
          if (ScheduleCopyableData *UseSD =
                  getScheduleCopyableData(EI.UserTE->UserTreeIndex, In)) {
            CD->incDependencies();
            if (!UseSD->isScheduled())
              CD->incrementUnscheduledDeps(1);
            if (!UseSD->hasValidDependencies() ||
                (InsertInReadyList && UseSD->isReady()))
              WorkList.push_back(UseSD);
          }
        } else if (Visited.insert(In).second) {
          if (ScheduleData *UseSD = getScheduleData(In)) {
            CD->incDependencies();
            if (!UseSD->isScheduled())
              CD->incrementUnscheduledDeps(1);
            if (!UseSD->hasValidDependencies() ||
                (InsertInReadyList && UseSD->isReady()))
              WorkList.push_back(UseSD);
          }
        }
        It = find(std::next(It), Op.end(), CD->getInst());
      } while (It != Op.end());
      if (CD->isReady() && CD->getDependencies() == 0 &&
          (EI.UserTE->hasState() &&
           (EI.UserTE->getMainOp()->getParent() !=
                CD->getInst()->getParent() ||
            (EI.UserTE->getMainOp()->hasNUsesOrMore(UsesLimit) ||
             any_of(EI.UserTE->getMainOp()->users(), [&](User *U) {
               auto *IU = dyn_cast<Instruction>(U);
               if (!IU)
                 return true;
               return IU->getParent() == EI.UserTE->getMainOp()->getParent();
             }))))) {
        // Add a pseudo-dependency so the copy keeps its relative position.
        CD->incDependencies();
        CD->incrementUnscheduledDeps(1);
      }
      return;
    }
    auto *BundleMember = cast<ScheduleData>(SE);
    if (BundleMember->hasValidDependencies())
      return;
    LLVM_DEBUG(dbgs() << "SLP:       update deps of " << *BundleMember << "\n");
    BundleMember->initDependencies();
    BundleMember->resetUnscheduledDeps();
    // Handle def-use chain dependencies.
    SmallDenseMap<Value *, unsigned> UserToNumOps;
    for (User *U : BundleMember->getInst()->users()) {
      if (ScheduleData *UseSD = getScheduleData(U)) {
        unsigned &NumOps = UserToNumOps.try_emplace(U, 0).first->getSecond();
        ++NumOps;
        if (areAllOperandsReplacedByCopyableData(
                cast<Instruction>(U), BundleMember->getInst(), *SLP, NumOps))
          continue;
        BundleMember->incDependencies();
        if (!UseSD->isScheduled())
          BundleMember->incrementUnscheduledDeps(1);
        if (!UseSD->hasValidDependencies() ||
            (InsertInReadyList && UseSD->isReady()))
          WorkList.push_back(UseSD);
      }
    }
    for (ScheduleCopyableData *UseSD :
         getScheduleCopyableDataUsers(BundleMember->getInst())) {
      BundleMember->incDependencies();
      if (!UseSD->isScheduled())
        BundleMember->incrementUnscheduledDeps(1);
      if (!UseSD->hasValidDependencies() ||
          (InsertInReadyList && UseSD->isReady()))
        WorkList.push_back(UseSD);
    }

    SmallPtrSet<const Instruction *, 4> Visited;
    auto MakeControlDependent = [&](Instruction *I) {
      if (!Visited.insert(I).second)
        return;
      auto *DepDest = getScheduleData(I);
      assert(DepDest && "must be in schedule window");
      DepDest->addControlDependency(BundleMember);
      BundleMember->incDependencies();
      if (!DepDest->isScheduled())
        BundleMember->incrementUnscheduledDeps(1);
      if (!DepDest->hasValidDependencies() ||
          (InsertInReadyList && DepDest->isReady()))
        WorkList.push_back(DepDest);
    };

    // Any instruction which isn't safe to speculate at the beginning of the
    // block is control dependent on any early exit or non-willreturn call
    // which precedes it.
    if (!isGuaranteedToTransferExecutionToSuccessor(BundleMember->getInst())) {
      for (Instruction *I = BundleMember->getInst()->getNextNode();
           I != ScheduleEnd; I = I->getNextNode()) {
        if (isSafeToSpeculativelyExecute(I))
          continue;
        MakeControlDependent(I);
      }
    }

    if (RegionHasStackSave) {
      // stacksave/stackrestore pin every later alloca and memory access.
      if (match(BundleMember->getInst(),
                m_Intrinsic<Intrinsic::stacksave>()) ||
          match(BundleMember->getInst(),
                m_Intrinsic<Intrinsic::stackrestore>())) {
        for (Instruction *I = BundleMember->getInst()->getNextNode();
             I != ScheduleEnd; I = I->getNextNode()) {
          if (!isa<AllocaInst>(I) && !I->mayReadOrWriteMemory())
            continue;
          MakeControlDependent(I);
        }
      }
      // Conversely, allocas and memory accesses must not move below a later
      // stacksave or stackrestore.
      if (isa<AllocaInst>(BundleMember->getInst()) ||
          BundleMember->getInst()->mayReadOrWriteMemory()) {
        for (Instruction *I = BundleMember->getInst()->getNextNode();
             I != ScheduleEnd; I = I->getNextNode()) {
          if (!match(I, m_Intrinsic<Intrinsic::stacksave>()) &&
              !match(I, m_Intrinsic<Intrinsic::stackrestore>()))
            continue;
          MakeControlDependent(I);
          break;
        }
      }
    }

    // Handle the memory dependencies (if any).
    ScheduleData *NextLoadStore = BundleMember->getNextLoadStore();
    if (!NextLoadStore)
      return;
    Instruction *SrcInst = BundleMember->getInst();
    assert(SrcInst->mayReadOrWriteMemory() &&
           "NextLoadStore list for non memory effecting bundle?");
    MemoryLocation SrcLoc = getLocation(SrcInst);
    bool SrcMayWrite = SrcInst->mayWriteToMemory();
    unsigned NumAliased = 0;
    unsigned DistToSrc = 1;
    bool IsNonSimpleSrc = !SrcLoc.Ptr || !isSimple(SrcInst);

    for (ScheduleData *DepDest = NextLoadStore; DepDest;
         DepDest = DepDest->getNextLoadStore()) {
      assert(isInSchedulingRegion(*DepDest) && "Expected to be in region");
      if (DistToSrc >= MaxMemDepDistance ||
          ((SrcMayWrite || DepDest->getInst()->mayWriteToMemory()) &&
           (IsNonSimpleSrc || NumAliased >= AliasedCheckLimit ||
            SLP->isAliased(SrcLoc, SrcInst, DepDest->getInst())))) {
        ++NumAliased;
        DepDest->addMemoryDependency(BundleMember);
        BundleMember->incDependencies();
        if (!DepDest->isScheduled())
          BundleMember->incrementUnscheduledDeps(1);
        if (!DepDest->hasValidDependencies() ||
            (InsertInReadyList && DepDest->isReady()))
          WorkList.push_back(DepDest);
      }
      ++DistToSrc;
    }
  };

  assert((Bundle || !ControlDeps.empty()) &&
         "expected at least one instruction to schedule");
  if (Bundle)
    WorkList.push_back(Bundle.getBundle().front());
  WorkList.append(ControlDeps.begin(), ControlDeps.end());

  SmallPtrSet<ScheduleBundle *, 16> Visited;
  while (!WorkList.empty()) {
    ScheduleEntity *SD = WorkList.pop_back_val();
    ArrayRef<ScheduleBundle *> Bundles;
    SmallVector<ScheduleBundle *, 1> CopyableBundle;
    if (auto *CD = dyn_cast<ScheduleCopyableData>(SD)) {
      CopyableBundle.push_back(&CD->getBundle());
      Bundles = CopyableBundle;
    } else {
      Bundles = getScheduleBundles(cast<ScheduleData>(SD)->getInst());
    }
    if (Bundles.empty()) {
      if (!SD->hasValidDependencies())
        ProcessNode(SD);
      if (InsertInReadyList && SD->isReady()) {
        ReadyInsts.insert(SD);
        LLVM_DEBUG(dbgs() << "SLP:     gets ready on update: " << *SD << "\n");
      }
      continue;
    }
    for (ScheduleBundle *Bundle : Bundles) {
      if (Bundle->hasValidDependencies() || !Visited.insert(Bundle).second)
        continue;
      assert(isInSchedulingRegion(*Bundle) &&
             "ScheduleData not in scheduling region");
      for_each(Bundle->getBundle(), ProcessNode);
    }
    if (InsertInReadyList && SD->isReady()) {
      for (ScheduleBundle *Bundle : Bundles) {
        assert(isInSchedulingRegion(*Bundle) &&
               "ScheduleData not in scheduling region");
        if (!Bundle->isReady())
          continue;
        ReadyInsts.insert(Bundle);
        LLVM_DEBUG(dbgs() << "SLP:     gets ready on update: " << *Bundle
                          << "\n");
      }
    }
  }
}
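// Resets the scheduling state of the region so that scheduling can be
// re-driven from a clean state.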
void BoUpSLP::BlockScheduling::resetSchedule() {
  assert(ScheduleStart &&
         "tried to reset schedule on block which has not been scheduled");
  for_each(ScheduleDataMap, [&](auto &P) {
    if (BB != P.first->getParent())
      return;
    ScheduleData *SD = P.second;
    if (isInSchedulingRegion(*SD)) {
      SD->setScheduled(/*Scheduled=*/false);
      SD->resetUnscheduledDeps();
    }
  });
  for_each(ScheduleCopyableDataMapByInst, [&](auto &P) {
    for_each(P.second, [&](ScheduleCopyableData *SD) {
      if (isInSchedulingRegion(*SD)) {
        SD->setScheduled(false);
        SD->resetUnscheduledDeps();
      }
    });
  });
  for_each(ScheduledBundles, [&](auto &P) {
    for_each(P.second, [&](ScheduleBundle *Bundle) {
      if (isInSchedulingRegion(*Bundle))
        Bundle->setScheduled(false);
    });
  });
  for (auto &P : ScheduleCopyableDataMap) {
    if (isInSchedulingRegion(*P.second)) {
      P.second->setScheduled(false);
      P.second->resetUnscheduledDeps();
    }
  }
  ReadyInsts.clear();
}
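// Performs the actual scheduling of a block: assigns priorities in original
// instruction order, computes outstanding dependencies, and then repeatedly
// pops the best ready entity, moving its instructions into place.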
void BoUpSLP::scheduleBlock(const BoUpSLP &R, BlockScheduling *BS) {
  if (!BS->ScheduleStart)
    return;

  LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");

  BS->resetSchedule();

  // For the real scheduling we use a more sophisticated ready-list: it is
  // sorted by the original instruction location. This keeps the final
  // schedule as close as possible to the original instruction order.
  struct ScheduleDataCompare {
    bool operator()(const ScheduleEntity *SD1,
                    const ScheduleEntity *SD2) const {
      return SD2->getSchedulingPriority() < SD1->getSchedulingPriority();
    }
  };
  std::set<ScheduleEntity *, ScheduleDataCompare> ReadyInsts;

  // Ensure that all dependency data is updated and fill the ready-list with
  // the initial instructions.
  int Idx = 0;
  for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
       I = I->getNextNode()) {
    ArrayRef<ScheduleBundle *> Bundles = BS->getScheduleBundles(I);
    if (!Bundles.empty()) {
      for (ScheduleBundle *Bundle : Bundles) {
        Bundle->setSchedulingPriority(Idx++);
        if (!Bundle->hasValidDependencies())
          BS->calculateDependencies(*Bundle, /*InsertInReadyList=*/false,
                                    this);
      }
      SmallVector<ScheduleCopyableData *> SDs =
          BS->getScheduleCopyableData(I);
      for (ScheduleCopyableData *SD : reverse(SDs)) {
        ScheduleBundle &Bundle = SD->getBundle();
        Bundle.setSchedulingPriority(Idx++);
        if (!Bundle.hasValidDependencies())
          BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this);
      }
      continue;
    }
    SmallVector<ScheduleCopyableData *> CopyableData =
        BS->getScheduleCopyableDataUsers(I);
    if (ScheduleData *SD = BS->getScheduleData(I)) {
      [[maybe_unused]] ArrayRef<TreeEntry *> SDTEs = R.getTreeEntries(I);
      assert((SDTEs.empty() || SDTEs.front()->doesNotNeedToSchedule() ||
              isVectorized(I)) &&
             "scheduler and vectorizer bundle mismatch");
      SD->setSchedulingPriority(Idx++);
      if (!SD->hasValidDependencies() &&
          (!CopyableData.empty() ||
           any_of(R.ValueToGatherNodes.lookup(I), [&](const TreeEntry *TE) {
             assert(TE->isGather() && "expected gather node");
             return TE->hasState() && TE->hasCopyableElements() &&
                    TE->isCopyableElement(I);
           }))) {
        // A single instruction with copyable users still needs its deps.
        ScheduleBundle Bundle;
        Bundle.add(SD);
        BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this);
      }
    }
    for (ScheduleCopyableData *SD : reverse(CopyableData)) {
      ScheduleBundle &Bundle = SD->getBundle();
      Bundle.setSchedulingPriority(Idx++);
      if (!Bundle.hasValidDependencies())
        BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this);
    }
  }
  BS->initialFillReadyList(ReadyInsts);

  Instruction *LastScheduledInst = BS->ScheduleEnd;

  // Do the "real" scheduling.
  SmallPtrSet<Instruction *, 16> Scheduled;
  while (!ReadyInsts.empty()) {
    auto *Picked = *ReadyInsts.begin();
    ReadyInsts.erase(ReadyInsts.begin());

    // Move the scheduled instruction(s) to their dedicated places, if not
    // there yet.
    if (auto *Bundle = dyn_cast<ScheduleBundle>(Picked)) {
      for (const ScheduleEntity *BundleMember : Bundle->getBundle()) {
        Instruction *PickedInst = BundleMember->getInst();
        bool IsCopyable = Bundle->getTreeEntry()->isCopyableElement(PickedInst);
        if ((IsCopyable && BS->getScheduleData(PickedInst)) ||
            (!IsCopyable && !Scheduled.insert(PickedInst).second))
          continue;
        if (PickedInst->getNextNode() != LastScheduledInst)
          PickedInst->moveBefore(LastScheduledInst->getIterator());
        LastScheduledInst = PickedInst;
      }
      EntryToLastInstruction.try_emplace(Bundle->getTreeEntry(),
                                         LastScheduledInst);
    } else {
      Instruction *PickedInst = cast<ScheduleData>(Picked)->getInst();
      if (PickedInst->getNextNode() != LastScheduledInst)
        PickedInst->moveBefore(LastScheduledInst->getIterator());
      LastScheduledInst = PickedInst;
    }
    auto Invalid = InstructionsState::invalid();
    BS->schedule(*this, Invalid, EdgeInfo(), Picked, ReadyInsts);
  }

  // Check that we didn't break any of our invariants.
#ifdef EXPENSIVE_CHECKS
  BS->verify();
#endif

#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
  // Check that all schedulable entities got scheduled.
  for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
       I = I->getNextNode()) {
    ArrayRef<ScheduleBundle *> Bundles = BS->getScheduleBundles(I);
    assert(all_of(Bundles,
                  [](const ScheduleBundle *Bundle) {
                    return Bundle->isScheduled();
                  }) &&
           "must be scheduled at this point");
  }
#endif

  // Avoid duplicate scheduling of the block.
  BS->ScheduleStart = nullptr;
}
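// Returns the width of the widest load/extract feeding the expression rooted
// at V; this element size drives the choice of vector factor for the chain.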
  // If V is a store, just return the width of the stored value.
  if (auto *Store = dyn_cast<StoreInst>(V))
    return DL->getTypeSizeInBits(Store->getValueOperand()->getType());

  auto E = InstrElementSize.find(V);
  if (E != InstrElementSize.end())
    return E->second;

  // Otherwise, walk the use-def chain looking for loads and extracts, which
  // determine the widest scalar element feeding the expression.
  unsigned Width = 0;
  Value *FirstNonBool = nullptr;
  while (!Worklist.empty()) {
    auto [I, Parent] = Worklist.pop_back_val();
    auto *Ty = I->getType();
    if (isa<VectorType>(Ty))
      continue;
    if (Ty != Builder.getInt1Ty() && !FirstNonBool)
      FirstNonBool = I;
    if (isa<LoadInst, ExtractElementInst>(I))
      Width = std::max<unsigned>(Width, DL->getTypeSizeInBits(Ty));
    else
      for (Use &U : I->operands()) {
        if (auto *J = dyn_cast<Instruction>(U.get());
            J && Visited.insert(J).second &&
            (isa<PHINode>(I) || J->getParent() == Parent)) {
          Worklist.push_back(std::make_pair(J, J->getParent()));
          if (!FirstNonBool && U.get()->getType() != Builder.getInt1Ty())
            FirstNonBool = U.get();
        }
      }
  }
  if (V->getType() == Builder.getInt1Ty() && FirstNonBool)
    V = FirstNonBool;
  if (Width == 0)
    Width = DL->getTypeSizeInBits(V->getType());

  for (Instruction *I : Visited)
    InstrElementSize[I] = Width;

  return Width;
}
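// Recursively walks the tree entry E and collects, in ToDemote, the nodes
// whose scalars can be computed in a narrower bit width without changing the
// result, updating BitWidth to the minimum width that is still safe.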
bool BoUpSLP::collectValuesToDemote(
    const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
    SmallVectorImpl<unsigned> &ToDemote, DenseSet<const TreeEntry *> &Visited,
    const SmallDenseSet<unsigned, 8> &NodesToKeepBWs, unsigned &MaxDepthLevel,
    bool &IsProfitableToDemote, bool IsTruncRoot) const {
  unsigned OrigBitWidth =
      DL->getTypeSizeInBits(E.Scalars.front()->getType()->getScalarType());
  bool IsSignedNode = any_of(E.Scalars, [&](Value *R) {
    if (isa<PoisonValue>(R))
      return false;
    return !isKnownNonNegative(R, SimplifyQuery(*DL));
  });
  auto IsPotentiallyTruncated = [&](Value *V, unsigned &BitWidth) -> bool {
    // Values that are vectorized in more than one node are too constrained.
    if (getTreeEntries(V).size() > 1)
      return false;
    bool IsSignedVal = !isKnownNonNegative(V, SimplifyQuery(*DL));
    if ((!IsSignedNode || IsSignedVal) && OrigBitWidth > BitWidth) {
      APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
      if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)))
        return true;
    }
    unsigned NumSignBits = ComputeNumSignBits(V, *DL, AC, nullptr, DT);
    unsigned BitWidth1 = OrigBitWidth - NumSignBits;
    if (IsSignedVal)
      ++BitWidth1;
    if (auto *I = dyn_cast<Instruction>(V)) {
      APInt Mask = DB->getDemandedBits(I);
      unsigned BitWidth2 =
          std::max<unsigned>(1, Mask.getBitWidth() - Mask.countl_zero());
      while (!IsSignedNode && BitWidth2 < OrigBitWidth) {
        APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth2 - 1);
        if (!MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)))
          break;
        BitWidth2 *= 2;
      }
      BitWidth1 = std::min(BitWidth1, BitWidth2);
    }
    BitWidth = std::max(BitWidth, BitWidth1);
    return BitWidth > 0 && OrigBitWidth >= (BitWidth * 2);
  };
  using namespace std::placeholders;
  auto FinalAnalysis = [&, TTI = TTI]() {
    if (!IsProfitableToDemote)
      return false;
    bool Res = all_of(
        E.Scalars, std::bind(IsPotentiallyTruncated, _1, std::ref(BitWidth)));
    // Demote gathers.
    if (Res && E.isGather()) {
      if (E.hasState()) {
        if (const TreeEntry *SameTE =
                getSameValuesTreeEntry(E.getMainOp(), E.Scalars))
          if (collectValuesToDemote(*SameTE, IsProfitableToDemoteRoot, BitWidth,
                                    ToDemote, Visited, NodesToKeepBWs,
                                    MaxDepthLevel, IsProfitableToDemote,
                                    IsTruncRoot)) {
            ToDemote.push_back(E.Idx);
            return true;
          }
      }
      // Check possible extractelement bases and the final vector length.
      SmallPtrSet<Value *, 4> UniqueBases;
      for (Value *V : E.Scalars) {
        auto *EE = dyn_cast<ExtractElementInst>(V);
        if (!EE)
          continue;
        UniqueBases.insert(EE->getVectorOperand());
      }
      const unsigned VF = E.Scalars.size();
      Type *OrigScalarTy = E.Scalars.front()->getType();
      if (UniqueBases.size() <= 2 ||
          TTI->getNumberOfParts(getWidenedType(OrigScalarTy, VF)) ==
              TTI->getNumberOfParts(getWidenedType(
                  IntegerType::get(OrigScalarTy->getContext(), BitWidth), VF)))
        ToDemote.push_back(E.Idx);
    }
    return Res;
  };
  if (E.isGather() || !Visited.insert(&E).second ||
      any_of(E.Scalars, [&](Value *V) {
        return !isa<Constant>(V) && all_of(V->users(), [&](User *U) {
          return isa<InsertElementInst>(U) && !isVectorized(U);
        });
      }))
    return FinalAnalysis();

  if (any_of(E.Scalars, [&](Value *V) {
        return !isa<Constant>(V) && !all_of(V->users(), [=](User *U) {
          return isVectorized(U) ||
                 (E.Idx == 0 && UserIgnoreList &&
                  UserIgnoreList->contains(U)) ||
                 (!isa<CmpInst>(U) && U->getType()->isSized() &&
                  !U->getType()->isScalableTy() &&
                  DL->getTypeSizeInBits(U->getType()) <= BitWidth);
        }) && !IsPotentiallyTruncated(V, BitWidth);
      }))
    return false;

  auto ProcessOperands = [&](ArrayRef<const TreeEntry *> Operands,
                             bool &NeedToExit) {
    NeedToExit = false;
    unsigned InitLevel = MaxDepthLevel;
    for (const TreeEntry *Op : Operands) {
      unsigned Level = InitLevel;
      if (!collectValuesToDemote(*Op, IsProfitableToDemoteRoot, BitWidth,
                                 ToDemote, Visited, NodesToKeepBWs, Level,
                                 IsProfitableToDemote, IsTruncRoot)) {
        if (!IsProfitableToDemote)
          return false;
        NeedToExit = true;
        if (!FinalAnalysis())
          return false;
        continue;
      }
      MaxDepthLevel = std::max(MaxDepthLevel, Level);
    }
    return true;
  };
  auto AttemptCheckBitwidth =
      [&](function_ref<bool(unsigned, unsigned)> Checker, bool &NeedToExit) {
        // Try all bitwidths < OrigBitWidth.
        NeedToExit = false;
        unsigned BestFailBitwidth = 0;
        for (; BitWidth < OrigBitWidth; BitWidth *= 2) {
          if (Checker(BitWidth, OrigBitWidth))
            return true;
          if (BestFailBitwidth == 0 && FinalAnalysis())
            BestFailBitwidth = BitWidth;
        }
        if (BestFailBitwidth == 0) {
          BitWidth = OrigBitWidth;
          return false;
        }
        MaxDepthLevel = 1;
        BitWidth = BestFailBitwidth;
        NeedToExit = true;
        return true;
      };
  auto TryProcessInstruction =
      [&](unsigned &BitWidth, ArrayRef<const TreeEntry *> Operands = {},
          function_ref<bool(unsigned, unsigned)> Checker = {}) {
        if (Operands.empty()) {
          if (!IsTruncRoot)
            MaxDepthLevel = 1;
          for (Value *V : E.Scalars)
            (void)IsPotentiallyTruncated(V, BitWidth);
        } else {
          // Several vectorized uses? Check if we can truncate it, otherwise -
          // exit.
          if (any_of(E.Scalars, [&](Value *V) {
                return !V->hasOneUse() && !IsPotentiallyTruncated(V, BitWidth);
              }))
            return false;
          bool NeedToExit = false;
          if (Checker && !AttemptCheckBitwidth(Checker, NeedToExit))
            return false;
          if (NeedToExit)
            return true;
          if (!ProcessOperands(Operands, NeedToExit))
            return false;
          if (NeedToExit)
            return true;
        }
        ++MaxDepthLevel;
        // Record the entry that we can demote.
        ToDemote.push_back(E.Idx);
        return IsProfitableToDemote;
      };

  if (E.State == TreeEntry::SplitVectorize)
    return TryProcessInstruction(
        BitWidth,
        {VectorizableTree[E.CombinedEntriesWithIndices.front().first].get(),
         VectorizableTree[E.CombinedEntriesWithIndices.back().first].get()});

  if (E.isAltShuffle()) {
    // Shifts, divisions and remainders as the alternate opcode may require
    // the full original width; be conservative about demotion.
    auto IsDangerousOpcode = [](unsigned Opcode) {
      switch (Opcode) {
      case Instruction::Shl:
      case Instruction::AShr:
      case Instruction::LShr:
      case Instruction::UDiv:
      case Instruction::SDiv:
      case Instruction::URem:
      case Instruction::SRem:
        return true;
      default:
        return false;
      }
    };
    if (IsDangerousOpcode(E.getAltOpcode()))
      return FinalAnalysis();
  }

  switch (E.getOpcode()) {

  // We can always demote truncations and extensions. Since truncations can
  // seed additional demotion, we save the truncated value.
  case Instruction::Trunc:
    if (IsProfitableToDemoteRoot)
      IsProfitableToDemote = true;
    return TryProcessInstruction(BitWidth);
  case Instruction::ZExt:
  case Instruction::SExt:
    if (E.UserTreeIndex.UserTE && E.UserTreeIndex.UserTE->hasState() &&
        E.UserTreeIndex.UserTE->getOpcode() == Instruction::BitCast &&
        E.UserTreeIndex.UserTE->getMainOp()->getType()->isFPOrFPVectorTy())
      return false;
    IsProfitableToDemote = true;
    return TryProcessInstruction(BitWidth);

  // We can demote certain binary operations if we can demote both of their
  // operands.
  case Instruction::Add:
  case Instruction::Sub:
  case Instruction::Mul:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)});
  }
  case Instruction::Freeze:
    return TryProcessInstruction(BitWidth, getOperandEntry(&E, 0));
  case Instruction::Shl: {
    // If we are truncating the result of the SHL, and if it's a shift of a
    // constant amount, we can always perform a SHL in a smaller type.
    auto ShlChecker = [&](unsigned BitWidth, unsigned) {
      return all_of(E.Scalars, [&](Value *V) {
        if (isa<PoisonValue>(V))
          return true;
        if (E.isCopyableElement(V))
          return true;
        auto *I = cast<Instruction>(V);
        KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
        return AmtKnownBits.getMaxValue().ult(BitWidth);
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, ShlChecker);
  }
  case Instruction::LShr: {
    // We can truncate a logical shr to a smaller lshr iff we know that the
    // bits we would otherwise be shifting in are already zeros.
    auto LShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(E.Scalars, [&](Value *V) {
        if (isa<PoisonValue>(V))
          return true;
        APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
        if (E.isCopyableElement(V))
          return MaskedValueIsZero(V, ShiftedBits, SimplifyQuery(*DL));
        auto *I = cast<Instruction>(V);
        KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
        return AmtKnownBits.getMaxValue().ult(BitWidth) &&
               MaskedValueIsZero(I->getOperand(0), ShiftedBits,
                                 SimplifyQuery(*DL));
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
        LShrChecker);
  }
  case Instruction::AShr: {
    // We can truncate an arithmetic shr iff all the bits from the sign bit of
    // the original type down to the truncated width agree.
    auto AShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(E.Scalars, [&](Value *V) {
        if (isa<PoisonValue>(V))
          return true;
        auto *I = cast<Instruction>(V);
        KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
        unsigned ShiftedBits = OrigBitWidth - BitWidth;
        return AmtKnownBits.getMaxValue().ult(BitWidth) &&
               ShiftedBits <
                   ComputeNumSignBits(I->getOperand(0), *DL, AC, nullptr, DT);
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
        AShrChecker);
  }
  case Instruction::UDiv:
  case Instruction::URem: {
    // UDiv and URem can be truncated if all the truncated bits are zero.
    auto Checker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(E.Scalars, [&](Value *V) {
        APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
        if (E.hasCopyableElements() && E.isCopyableElement(V))
          return MaskedValueIsZero(V, Mask, SimplifyQuery(*DL));
        auto *I = cast<Instruction>(V);
        return MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)) &&
               MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, Checker);
  }

  // We can demote selects if we can demote their true and false values.
  case Instruction::Select: {
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 1), getOperandEntry(&E, 2)});
  }

  // We can demote phis if we can demote all their incoming operands.
  case Instruction::PHI: {
    const unsigned NumOps = E.getNumOperands();
    SmallVector<const TreeEntry *> Ops(NumOps);
    transform(seq<unsigned>(0, NumOps), Ops.begin(),
              [&](unsigned Idx) { return getOperandEntry(&E, Idx); });
    return TryProcessInstruction(BitWidth, Ops);
  }

  case Instruction::Call: {
    auto *IC = dyn_cast<IntrinsicInst>(E.getMainOp());
    if (!IC)
      break;
    Intrinsic::ID ID = getVectorIntrinsicIDForCall(IC, TLI);
    if (ID != Intrinsic::abs && ID != Intrinsic::smin &&
        ID != Intrinsic::smax && ID != Intrinsic::umin &&
        ID != Intrinsic::umax)
      break;
    SmallVector<const TreeEntry *, 2> Operands(1, getOperandEntry(&E, 0));
    function_ref<bool(unsigned, unsigned)> CallChecker;
    auto CompChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(E.Scalars, [&](Value *V) {
        auto *I = cast<Instruction>(V);
        if (ID == Intrinsic::umin || ID == Intrinsic::umax) {
          APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
          return MaskedValueIsZero(I->getOperand(0), Mask,
                                   SimplifyQuery(*DL)) &&
                 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
        }
        assert((ID == Intrinsic::smin || ID == Intrinsic::smax) &&
               "Expected min/max intrinsics only.");
        unsigned SignBits = OrigBitWidth - BitWidth;
        APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
        unsigned Op0SignBits =
            ComputeNumSignBits(I->getOperand(0), *DL, AC, nullptr, DT);
        unsigned Op1SignBits =
            ComputeNumSignBits(I->getOperand(1), *DL, AC, nullptr, DT);
        return SignBits <= Op0SignBits &&
               ((SignBits != Op0SignBits &&
                 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
                MaskedValueIsZero(I->getOperand(0), Mask,
                                  SimplifyQuery(*DL))) &&
               SignBits <= Op1SignBits &&
               ((SignBits != Op1SignBits &&
                 !isKnownNonNegative(I->getOperand(1), SimplifyQuery(*DL))) ||
                MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL)));
      });
    };
    auto AbsChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(E.Scalars, [&](Value *V) {
        auto *I = cast<Instruction>(V);
        unsigned SignBits = OrigBitWidth - BitWidth;
        APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
        unsigned Op0SignBits =
            ComputeNumSignBits(I->getOperand(0), *DL, AC, nullptr, DT);
        return SignBits <= Op0SignBits &&
               ((SignBits != Op0SignBits &&
                 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
                MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)));
      });
    };
    if (ID != Intrinsic::abs) {
      Operands.push_back(getOperandEntry(&E, 1));
      CallChecker = CompChecker;
    } else {
      CallChecker = AbsChecker;
    }
    InstructionCost BestCost =
        std::numeric_limits<InstructionCost::CostType>::max();
    unsigned BestBitWidth = BitWidth;
    unsigned VF = E.Scalars.size();
    // Choose the best bitwidth based on cost estimations.
    auto Checker = [&](unsigned BitWidth, unsigned) {
      auto Cost = getVectorIntrinsicCost(
          IC, getWidenedType(IntegerType::get(IC->getContext(),
                                              PowerOf2Ceil(BitWidth)),
                             VF));
      if (Cost < BestCost) {
        BestCost = Cost;
        BestBitWidth = BitWidth;
      }
      return false;
    };
    [[maybe_unused]] bool NeedToExit;
    (void)AttemptCheckBitwidth(Checker, NeedToExit);
    BitWidth = BestBitWidth;
    return TryProcessInstruction(BitWidth, Operands, CallChecker);
  }

  // Otherwise, conservatively give up.
  default:
    break;
  }
  MaxDepthLevel = 1;
  return FinalAnalysis();
}
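// Analyzes the whole vectorizable tree and records, in MinBWs, the minimal
// integer bit width each node can be computed in (the SLP "minimum value
// sizes" narrowing), including the bit width of the seeding reduction.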
  bool IsStoreOrInsertElt =
      VectorizableTree.front()->hasState() &&
      (VectorizableTree.front()->getOpcode() == Instruction::Store ||
       VectorizableTree.front()->getOpcode() == Instruction::InsertElement);
  if ((IsStoreOrInsertElt || UserIgnoreList) &&
      ExtraBitWidthNodes.size() <= 1 &&
      (!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 ||
       CastMaxMinBWSizes->first / CastMaxMinBWSizes->second <= 2))
    return;

  unsigned NodeIdx = 0;
  if (IsStoreOrInsertElt && !VectorizableTree.front()->isGather())
    NodeIdx = 1;

  // Ensure the roots of the vectorizable tree don't form a cycle.
  assert((VectorizableTree[NodeIdx]->isGather() || NodeIdx != 0 ||
          !VectorizableTree[NodeIdx]->UserTreeIndex) &&
         "Unexpected tree is graph.");

  // If the first value node is a sext/zext/trunc, skip it and resize to the
  // final type instead.
  bool IsTruncRoot = false;
  bool IsProfitableToDemoteRoot = !IsStoreOrInsertElt;
  SmallVector<unsigned> RootDemotes;
  SmallDenseSet<unsigned, 8> NodesToKeepBWs;
  if (NodeIdx != 0 &&
      VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
      VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
    assert(IsStoreOrInsertElt && "Expected store/insertelement seeded graph.");
    IsTruncRoot = true;
    RootDemotes.push_back(NodeIdx);
    IsProfitableToDemoteRoot = true;
    ++NodeIdx;
  }

  // If the root was analyzed already and is not profitable - exit.
  if (AnalyzedMinBWVals.contains(VectorizableTree[NodeIdx]->Scalars.front()))
    return;

  SmallVector<unsigned> ToDemote;
  auto ComputeMaxBitWidth =
      [&](const TreeEntry &E, bool IsTopRoot, bool IsProfitableToDemoteRoot,
          unsigned Limit, bool IsTruncRoot, bool IsSignedCmp) -> unsigned {
    // If the root is a trunc and the next node is a gather/buildvector, keep
    // the trunc in scalars, which is free in most cases.
    if (E.isGather() && IsTruncRoot && E.UserTreeIndex &&
        !NodesToKeepBWs.contains(E.Idx) &&
        E.Idx > (IsStoreOrInsertElt ? 2u : 1u) &&
        all_of(E.Scalars, [&](Value *V) {
          return V->hasOneUse() || isa<Constant>(V) ||
                 (!V->hasNUsesOrMore(UsesLimit) &&
                  none_of(V->users(), [&](User *U) {
                    ArrayRef<TreeEntry *> TEs = getTreeEntries(U);
                    const TreeEntry *UserTE = E.UserTreeIndex.UserTE;
                    if (TEs.empty() || is_contained(TEs, UserTE))
                      return false;
                    if (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
                             SelectInst>(U) ||
                        isa<SIToFPInst, UIToFPInst>(U) ||
                        (UserTE->hasState() &&
                         (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
                               SelectInst>(UserTE->getMainOp()) ||
                          isa<SIToFPInst, UIToFPInst>(UserTE->getMainOp()))))
                      return true;
                    unsigned UserTESz = DL->getTypeSizeInBits(
                        UserTE->Scalars.front()->getType());
                    if (all_of(TEs, [&](const TreeEntry *TE) {
                          auto It = MinBWs.find(TE);
                          return It != MinBWs.end() &&
                                 It->second.first > UserTESz;
                        }))
                      return true;
                    return DL->getTypeSizeInBits(U->getType()) > UserTESz;
                  }));
        })) {
      ToDemote.push_back(E.Idx);
      const TreeEntry *UserTE = E.UserTreeIndex.UserTE;
      auto It = MinBWs.find(UserTE);
      if (It != MinBWs.end())
        return It->second.first;
      unsigned MaxBitWidth =
          DL->getTypeSizeInBits(UserTE->Scalars.front()->getType());
      MaxBitWidth = bit_ceil(MaxBitWidth);
      if (MaxBitWidth < 8 && MaxBitWidth > 1)
        MaxBitWidth = 8;
      return MaxBitWidth;
    }

    unsigned VF = E.getVectorFactor();
    Type *ScalarTy = E.Scalars.front()->getType();
    auto *TreeRootIT = dyn_cast<IntegerType>(ScalarTy->getScalarType());
    if (!TreeRootIT)
      return 0u;

    if (any_of(E.Scalars,
               [&](Value *V) { return AnalyzedMinBWVals.contains(V); }))
      return 0u;

    // The maximum bit width required to represent all the values that can be
    // demoted without loss of precision.
    unsigned MaxBitWidth = 1u;

    // Determine if the sign bit of all the roots is known to be zero. If so,
    // the roots can be zero-extended back to their original type rather than
    // sign-extended.
    bool IsKnownPositive = !IsSignedCmp && all_of(E.Scalars, [&](Value *R) {
      if (isa<PoisonValue>(R))
        return true;
      KnownBits Known = computeKnownBits(R, *DL);
      return Known.isNonNegative();
    });

    if (!IsKnownPositive && !IsTopRoot && E.UserTreeIndex &&
        E.UserTreeIndex.UserTE->hasState() &&
        E.UserTreeIndex.UserTE->getOpcode() == Instruction::UIToFP)
      MaxBitWidth =
          std::min(DL->getTypeSizeInBits(
                       E.UserTreeIndex.UserTE->Scalars.front()->getType()),
                   DL->getTypeSizeInBits(ScalarTy));

    // Check if all the bits of the roots are demanded. If not, we can
    // truncate the roots to this narrower type.
    for (Value *Root : E.Scalars) {
      if (isa<PoisonValue>(Root))
        continue;
      unsigned NumSignBits = ComputeNumSignBits(Root, *DL, AC, nullptr, DT);
      TypeSize NumTypeBits = DL->getTypeSizeInBits(Root->getType());
      unsigned BitWidth1 = NumTypeBits - NumSignBits;
      // If we can't prove that the sign bit is zero, add one to the maximum
      // bit width to account for the unknown sign bit. This preserves the
      // existing sign bit so we can safely sign-extend the root back to the
      // original type.
      if (!IsKnownPositive)
        ++BitWidth1;

      MaxBitWidth = std::max(BitWidth1, MaxBitWidth);
      if (auto *I = dyn_cast<Instruction>(Root)) {
        APInt Mask = DB->getDemandedBits(I);
        unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
        MaxBitWidth =
            std::max<unsigned>(std::min(BitWidth1, BitWidth2), MaxBitWidth);
      }
    }

    if (MaxBitWidth < 8 && MaxBitWidth > 1)
      MaxBitWidth = 8;

    // If the reduced type does not improve register use, ignore it.
    unsigned NumParts = TTI->getNumberOfParts(getWidenedType(TreeRootIT, VF));
    if (NumParts > 1 &&
        NumParts == TTI->getNumberOfParts(getWidenedType(
                        IntegerType::get(F->getContext(),
                                         bit_ceil(MaxBitWidth)),
                        VF)))
      return 0u;

    unsigned Opcode = E.getOpcode();
    bool IsProfitableToDemote = Opcode == Instruction::Trunc ||
                                Opcode == Instruction::SExt ||
                                Opcode == Instruction::ZExt || NumParts > 1;
    // Conservatively determine if we can actually truncate the roots of the
    // expression.
    DenseSet<const TreeEntry *> Visited;
    unsigned MaxDepthLevel = IsTruncRoot ? Limit : 1;
    bool NeedToDemote = IsProfitableToDemote;

    if (!collectValuesToDemote(E, IsProfitableToDemoteRoot, MaxBitWidth,
                               ToDemote, Visited, NodesToKeepBWs, MaxDepthLevel,
                               NeedToDemote, IsTruncRoot) ||
        (MaxDepthLevel <= Limit &&
         !(((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
            (!IsTopRoot || !(IsStoreOrInsertElt || UserIgnoreList) ||
             DL->getTypeSizeInBits(TreeRootIT) /
                     DL->getTypeSizeInBits(
                         E.getMainOp()->getOperand(0)->getType()) >
                 2)))))
      return 0u;
    // Round MaxBitWidth up to the next power-of-two.
    MaxBitWidth = bit_ceil(MaxBitWidth);

    return MaxBitWidth;
  };

  // Add reduction ops sizes, if any.
  if (UserIgnoreList &&
      isa<IntegerType>(VectorizableTree.front()->Scalars.front()->getType())) {
    // Convert vector_reduce_add(ZExt(<n x i1>)) to
    // ZExtOrTrunc(ctpop(bitcast <n x i1> to in)).
    if (all_of(*UserIgnoreList,
               [](Value *V) {
                 return isa<PoisonValue>(V) ||
                        cast<Instruction>(V)->getOpcode() == Instruction::Add;
               }) &&
        VectorizableTree.front()->State == TreeEntry::Vectorize &&
        VectorizableTree.front()->getOpcode() == Instruction::ZExt &&
        cast<CastInst>(VectorizableTree.front()->getMainOp())->getSrcTy() ==
            Builder.getInt1Ty()) {
      ReductionBitWidth = 1;
    } else {
      for (Value *V : *UserIgnoreList) {
        if (isa<PoisonValue>(V))
          continue;
        unsigned NumSignBits = ComputeNumSignBits(V, *DL, AC, nullptr, DT);
        TypeSize NumTypeBits = DL->getTypeSizeInBits(V->getType());
        unsigned BitWidth1 = NumTypeBits - NumSignBits;
        if (!isKnownNonNegative(V, SimplifyQuery(*DL)))
          ++BitWidth1;
        unsigned BitWidth2 = BitWidth1;
        if (auto *I = dyn_cast<Instruction>(V)) {
          APInt Mask = DB->getDemandedBits(I);
          BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
        }
        ReductionBitWidth =
            std::max(std::min(BitWidth1, BitWidth2), ReductionBitWidth);
      }
      if (ReductionBitWidth < 8 && ReductionBitWidth > 1)
        ReductionBitWidth = 8;

      ReductionBitWidth = bit_ceil(ReductionBitWidth);
    }
  }
  bool IsTopRoot = NodeIdx == 0;
  while (NodeIdx < VectorizableTree.size() &&
         VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
         VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
    RootDemotes.push_back(NodeIdx);
    ++NodeIdx;
    IsTruncRoot = true;
  }
  bool IsSignedCmp = false;
  if (UserIgnoreList &&
      all_of(*UserIgnoreList, [](Value *V) {
        auto *CI = dyn_cast<ICmpInst>(V);
        return CI && CI->isSigned();
      }))
    IsSignedCmp = true;
  while (NodeIdx < VectorizableTree.size()) {
    ArrayRef<Value *> TreeRoot = VectorizableTree[NodeIdx]->Scalars;
    unsigned Limit = 2;
    if (IsTopRoot &&
        ReductionBitWidth ==
            DL->getTypeSizeInBits(
                VectorizableTree.front()->Scalars.front()->getType()))
      Limit = 3;
    unsigned MaxBitWidth = ComputeMaxBitWidth(
        *VectorizableTree[NodeIdx], IsTopRoot, IsProfitableToDemoteRoot, Limit,
        IsTruncRoot, IsSignedCmp);
    if (ReductionBitWidth != 0 && (IsTopRoot || !RootDemotes.empty())) {
      if (MaxBitWidth != 0 && ReductionBitWidth < MaxBitWidth)
        ReductionBitWidth = bit_ceil(MaxBitWidth);
      else if (MaxBitWidth == 0)
        ReductionBitWidth = 0;
    }

    for (unsigned Idx : RootDemotes) {
      if (all_of(VectorizableTree[Idx]->Scalars, [&](Value *V) {
            uint32_t OrigBitWidth =
                DL->getTypeSizeInBits(V->getType()->getScalarType());
            if (OrigBitWidth > MaxBitWidth) {
              APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, MaxBitWidth);
              return MaskedValueIsZero(V, Mask, SimplifyQuery(*DL));
            }
            return false;
          }))
        ToDemote.push_back(Idx);
    }
    RootDemotes.clear();
    IsTopRoot = false;
    IsProfitableToDemoteRoot = true;

    if (ExtraBitWidthNodes.empty()) {
      NodeIdx = VectorizableTree.size();
    } else {
      unsigned NewIdx = 0;
      do {
        NewIdx = *ExtraBitWidthNodes.begin();
        ExtraBitWidthNodes.erase(ExtraBitWidthNodes.begin());
      } while (NewIdx <= NodeIdx && !ExtraBitWidthNodes.empty());
      NodeIdx = NewIdx;
      IsTruncRoot =
          NodeIdx < VectorizableTree.size() &&
          VectorizableTree[NodeIdx]->UserTreeIndex &&
          VectorizableTree[NodeIdx]->UserTreeIndex.EdgeIdx == 0 &&
          VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->hasState() &&
          VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->getOpcode() ==
              Instruction::Trunc &&
          !VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->isAltShuffle();
      IsSignedCmp =
          NodeIdx < VectorizableTree.size() &&
          VectorizableTree[NodeIdx]->UserTreeIndex &&
          VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->hasState() &&
          VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->getOpcode() ==
              Instruction::ICmp &&
          any_of(VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->Scalars,
                 [&](Value *V) {
                   auto *IC = dyn_cast<ICmpInst>(V);
                   return IC &&
                          (IC->isSigned() ||
                           !isKnownNonNegative(IC->getOperand(0),
                                               SimplifyQuery(*DL)) ||
                           !isKnownNonNegative(IC->getOperand(1),
                                               SimplifyQuery(*DL)));
                 });
    }

    // If the maximum bit width we compute is not smaller than the width of
    // the roots' type, do nothing.
    if (MaxBitWidth == 0 ||
        MaxBitWidth >=
            cast<IntegerType>(TreeRoot.front()->getType()->getScalarType())
                ->getBitWidth()) {
      if (UserIgnoreList)
        AnalyzedMinBWVals.insert_range(TreeRoot);
      NodesToKeepBWs.insert(ToDemote.begin(), ToDemote.end());
      continue;
    }

    // Finally, map the values we can demote to the maximum bit width we
    // computed.
    for (unsigned Idx : ToDemote) {
      TreeEntry *TE = VectorizableTree[Idx].get();
      if (MinBWs.contains(TE))
        continue;
      bool IsSigned = any_of(TE->Scalars, [&](Value *R) {
        if (isa<PoisonValue>(R))
          return false;
        return !isKnownNonNegative(R, SimplifyQuery(*DL));
      });
      MinBWs.try_emplace(TE, MaxBitWidth, IsSigned);
    }
  }
}
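// Pass driver: sets up the analyses, scans the blocks of the function in post
// order, and runs the store-chain, in-block and GEP-index vectorization entry
// points.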
23035 DL = &
F.getDataLayout();
23043 if (!
TTI->getNumberOfRegisters(
TTI->getRegisterClassForType(
true))) {
23045 dbgs() <<
"SLP: Didn't find any vector registers for target, abort.\n");
23050 if (
F.hasFnAttribute(Attribute::NoImplicitFloat))
23053 LLVM_DEBUG(
dbgs() <<
"SLP: Analyzing blocks in " <<
F.getName() <<
".\n");
23057 BoUpSLP R(&
F,
SE,
TTI,
TLI,
AA,
LI,
DT,
AC,
DB,
DL, ORE_);
23063 DT->updateDFSNumbers();
23066 for (
auto *BB :
post_order(&
F.getEntryBlock())) {
23071 R.clearReductionData();
23072 collectSeedInstructions(BB);
23075 if (!Stores.empty()) {
23077 <<
" underlying objects.\n");
23078 Changed |= vectorizeStoreChains(R);
23082 Changed |= vectorizeChainsInBlock(BB, R);
23087 if (!GEPs.empty()) {
23089 <<
" underlying objects.\n");
23090 Changed |= vectorizeGEPIndices(BB, R);
23095 R.optimizeGatherSequence();
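// Tries to vectorize a single chain of consecutive stores, rejecting the
// chain early if the value operands cannot form a profitable tree.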
std::optional<bool>
SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
                                       unsigned Idx, unsigned MinVF,
                                       unsigned &Size) {
  const unsigned Sz = R.getVectorElementSize(Chain[0]);
  unsigned VF = Chain.size();

  if (!has_single_bit(Sz) || VF < 2 || VF < MinVF) {
    Size = 1;
    return false;
  }

  LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx
                    << "\n");

  SetVector<Value *> ValOps;
  for (Value *V : Chain)
    ValOps.insert(cast<StoreInst>(V)->getValueOperand());
  // Operands are not same/alt opcodes or non-power-of-2 uniques - exit.
  InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
  InstructionsState S =
      Analysis.buildInstructionsState(ValOps.getArrayRef(), R);
  if (all_of(ValOps, IsaPred<Instruction>) && ValOps.size() > 1) {
    DenseSet<Value *> Stores(Chain.begin(), Chain.end());
    bool IsAllowedSize =
        hasFullVectorsOrPowerOf2(*TTI, ValOps.front()->getType(),
                                 ValOps.size()) ||
        (VectorizeNonPowerOf2 && has_single_bit(ValOps.size() + 1));
    if ((!IsAllowedSize && S && S.getOpcode() != Instruction::Load &&
         (!S.getMainOp()->isSafeToRemove() ||
          any_of(ValOps.getArrayRef(),
                 [&](Value *V) {
                   return !isa<ExtractElementInst>(V) &&
                          (V->getNumUses() > Chain.size() ||
                           any_of(V->users(), [&](User *U) {
                             return !Stores.contains(U);
                           }));
                 }))) ||
        (ValOps.size() > Chain.size() / 2 && !S)) {
      Size = (!IsAllowedSize && S) ? 1 : 2;
      return false;
    }
  }
  if (R.isLoadCombineCandidate(Chain))
    return true;
  R.buildTree(Chain);
  // Check if tree tiny and store itself or its value is not vectorized.
  if (R.isTreeTinyAndNotFullyVectorizable()) {
    if (R.isGathered(Chain.front()) ||
        R.isNotScheduled(cast<StoreInst>(Chain.front())->getValueOperand()))
      return std::nullopt;
    Size = R.getCanonicalGraphSize();
    return false;
  }
  if (R.isProfitableToReorder()) {
    R.reorderTopToBottom();
    R.reorderBottomToTop();
  }
  R.transformNodes();
  R.buildExternalUses();

  R.computeMinimumValueSizes();

  Size = R.getCanonicalGraphSize();
  if (S && S.getOpcode() == Instruction::Load)
    Size = 2; // cut off masked gather small trees
  InstructionCost Cost = R.getTreeCost();

  if (Cost < -SLPCostThreshold) {
    using namespace ore;

    R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",
                                        cast<StoreInst>(Chain[0]))
                     << "Stores SLP vectorized with cost " << NV("Cost", Cost)
                     << " and with tree size "
                     << NV("TreeSize", R.getTreeSize()));

    R.vectorizeTree();
    return true;
  }

  return false;
}

/// Checks if the quadratic mean deviation of the tree sizes is small enough
/// (within ~4% of the squared mean) to treat the trees as uniformly sized.
static bool checkTreeSizes(ArrayRef<std::pair<unsigned, unsigned>> Sizes,
                           bool First) {
  unsigned Num = 0;
  uint64_t Sum = std::accumulate(
      Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
      [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
        unsigned Size = First ? Val.first : Val.second;
        if (Size == 0)
          return V;
        ++Num;
        return V + Size;
      });
  if (Num == 0)
    return true;
  uint64_t Mean = Sum / Num;
  if (Mean == 0)
    return true;
  uint64_t Dev = std::accumulate(
                     Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
                     [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
                       unsigned P = First ? Val.first : Val.second;
                       if (P == 0)
                         return V;
                       return V + (P - Mean) * (P - Mean);
                     }) /
                 Num;
  return Dev * 96 / (Mean * Mean) == 0;
}
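// Groups stores whose pointers sit at constant offsets from a common base, so
// that candidate chains can be formed by walking the offsets in order. For
// example, stores to p+0, p+1, p+3 with base index p+0 are kept in a map
// {0 -> i0, 1 -> i1, 3 -> i2}.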
class RelatedStoreInsts {
public:
  RelatedStoreInsts(unsigned BaseInstrIdx, ArrayRef<StoreInst *> AllStores)
      : AllStores(AllStores) {
    reset(BaseInstrIdx);
  }

  void reset(unsigned NewBaseInstr) {
    assert(NewBaseInstr < AllStores.size() &&
           "Instruction index out of bounds");
    BaseInstrIdx = NewBaseInstr;
    Instrs.clear();
    insertOrLookup(NewBaseInstr, 0);
  }

  /// Tries to insert \p InstrIdx as the store with a pointer distance of
  /// \p PtrDist. Does nothing if there is already a store with that distance.
  /// \returns The previously associated instruction index, or std::nullopt.
  std::optional<unsigned> insertOrLookup(unsigned InstrIdx, int64_t PtrDist) {
    auto [It, Inserted] = Instrs.emplace(PtrDist, InstrIdx);
    return Inserted ? std::nullopt : std::make_optional(It->second);
  }

  using DistToInstMap = std::map<int64_t, unsigned>;
  const DistToInstMap &getStores() const { return Instrs; }

  /// If \p SI is related to this group of stores, return the distance of its
  /// pointer operand to the one of the group's base instruction.
  std::optional<int64_t> getPointerDiff(StoreInst &SI, const DataLayout &DL,
                                        ScalarEvolution &SE) const {
    StoreInst &BaseStore = *AllStores[BaseInstrIdx];
    return getPointersDiff(
        BaseStore.getValueOperand()->getType(), BaseStore.getPointerOperand(),
        SI.getValueOperand()->getType(), SI.getPointerOperand(), DL, SE,
        /*StrictCheck=*/true);
  }

  /// Recompute the pointer distances to be based on \p NewBaseInstIdx.
  /// Stores whose index is less than \p MinSafeIdx will be dropped.
  void rebase(unsigned MinSafeIdx, unsigned NewBaseInstIdx,
              int64_t DistFromCurBase) {
    DistToInstMap PrevSet = std::move(Instrs);
    reset(NewBaseInstIdx);

    // Re-insert stores that come after MinSafeIdx to try to vectorize them
    // again; their distance is updated to reflect the new base.
    for (auto [Dist, InstIdx] : PrevSet) {
      if (InstIdx >= MinSafeIdx)
        insertOrLookup(InstIdx, Dist - DistFromCurBase);
    }
  }

  /// Remove all the stores that have already been vectorized from this group.
  void clearVectorizedStores(const DenseSet<StoreInst *> &VectorizedStores) {
    DistToInstMap::reverse_iterator LastVectorizedStore = find_if(
        reverse(Instrs), [&](const std::pair<int64_t, unsigned> &DistAndIdx) {
          return VectorizedStores.contains(AllStores[DistAndIdx.second]);
        });

    // Erase all stores up to and including the last vectorized one, so we
    // don't try to vectorize them again.
    DistToInstMap::iterator VectorizedStoresEnd = LastVectorizedStore.base();
    Instrs.erase(Instrs.begin(), VectorizedStoresEnd);
  }

private:
  /// The index of the base instruction, i.e. the one with a 0 distance.
  unsigned BaseInstrIdx;

  /// Maps a pointer distance from \p BaseInstrIdx to a store index.
  DistToInstMap Instrs;
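// Forms store chains from the collected groups and attempts vectorization
// with decreasing vector factors, remembering vectorized and unschedulable
// ranges to avoid repeated work.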
23313bool SLPVectorizerPass::vectorizeStores(
23315 DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>>
23322 auto TryToVectorize = [&](const RelatedStoreInsts::DistToInstMap &StoreSeq) {
23323 int64_t PrevDist = -1;
23327 auto &[Dist, InstIdx] = Data;
23328 if (Operands.empty() || Dist - PrevDist == 1) {
23331 if (Idx != StoreSeq.size() - 1)
23340 if (Operands.size() <= 1 ||
23342 .insert({Operands.front(),
23343 cast<StoreInst>(Operands.front())->getValueOperand(),
23345 cast<StoreInst>(Operands.back())->getValueOperand(),
23350 unsigned MaxVecRegSize = R.getMaxVecRegSize();
23351 unsigned EltSize = R.getVectorElementSize(Operands[0]);
23355 std::min(R.getMaximumVF(EltSize, Instruction::Store), MaxElts);
23357 Type *StoreTy = Store->getValueOperand()->getType();
23358 Type *ValueTy = StoreTy;
23360 ValueTy = Trunc->getSrcTy();
23369 R.getMinVF(DL->getTypeStoreSizeInBits(StoreScalarTy)), StoreScalarTy,
23372 MinVF = std::max<unsigned>(2, MinVF);
23374 if (MaxVF < MinVF) {
23375 LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
23377 << "MinVF (" << MinVF << ")\n");
23381 unsigned NonPowerOf2VF = 0;
23386 unsigned CandVF = std::clamp<unsigned>(Operands.size(), MinVF, MaxVF);
23388 NonPowerOf2VF = CandVF;
23389 assert(NonPowerOf2VF != MaxVF &&
23390 "Non-power-of-2 VF should not be equal to MaxVF");
23397 unsigned MaxRegVF = MaxVF;
23399 MaxVF = std::min<unsigned>(MaxVF, bit_floor(Operands.size()));
23400 if (MaxVF < MinVF) {
23401 LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
23403 << "MinVF (" << MinVF << ")\n");
23407 SmallVector<unsigned> CandidateVFs;
23408 for (unsigned VF = std::max(MaxVF, NonPowerOf2VF); VF >= MinVF;
23412 unsigned End = Operands.size();
23413 unsigned Repeat = 0;
23414 constexpr unsigned MaxAttempts = 4;
23415 OwningArrayRef<std::pair<unsigned, unsigned>> RangeSizes(Operands.size());
23416 for (std::pair<unsigned, unsigned> &P : RangeSizes)
23417 P.first = P.second = 1;
23418 DenseMap<Value *, std::pair<unsigned, unsigned>> NonSchedulable;
23419 auto IsNotVectorized = [](bool First,
23420 const std::pair<unsigned, unsigned> &P) {
23421 return First ? P.first > 0 : P.second > 0;
23423 auto IsVectorized = [](bool First,
23424 const std::pair<unsigned, unsigned> &P) {
23425 return First ? P.first == 0 : P.second == 0;
23427 auto VFIsProfitable = [](bool First, unsigned Size,
23428 const std::pair<unsigned, unsigned> &P) {
23431 auto FirstSizeSame = [](unsigned Size,
23432 const std::pair<unsigned, unsigned> &P) {
23433 return Size == P.first;
23437 bool RepeatChanged = false;
23438 bool AnyProfitableGraph = false;
23439 for (unsigned VF : CandidateVFs) {
23440 AnyProfitableGraph = false;
23441 unsigned FirstUnvecStore =
23442 std::distance(RangeSizes.begin(),
23443 find_if(RangeSizes, std::bind(IsNotVectorized,
23444 VF >= MaxRegVF, _1)));
23448 while (FirstUnvecStore < End) {
23449 unsigned FirstVecStore = std::distance(
23450 RangeSizes.begin(),
23451 find_if(RangeSizes.drop_front(FirstUnvecStore),
23452 std::bind(IsVectorized, VF >= MaxRegVF, _1)));
23453 unsigned MaxSliceEnd = FirstVecStore >= End ? End : FirstVecStore;
23454 for (unsigned SliceStartIdx = FirstUnvecStore;
23455 SliceStartIdx + VF <= MaxSliceEnd;) {
23466 ->getValueOperand()
23469 ->getValueOperand()
23472 "Expected all operands of same type.");
23473 if (!NonSchedulable.empty()) {
23474 auto [NonSchedSizeMax, NonSchedSizeMin] =
23476 if (NonSchedSizeMax > 0 && NonSchedSizeMin <= VF) {
23479 SliceStartIdx += NonSchedSizeMax;
23484 std::optional<bool> Res =
23485 vectorizeStoreChain(Slice, R, SliceStartIdx, MinVF, TreeSize);
23491 .first->getSecond()
23499 AnyProfitableGraph = RepeatChanged = Changed = true;
23502 for (std::pair<unsigned, unsigned> &P :
23503 RangeSizes.slice(SliceStartIdx, VF))
23504 P.first = P.second = 0;
23505 if (SliceStartIdx < FirstUnvecStore + MinVF) {
23506 for (std::pair<unsigned, unsigned> &P : RangeSizes.slice(
23507 FirstUnvecStore, SliceStartIdx - FirstUnvecStore))
23508 P.first = P.second = 0;
23509 FirstUnvecStore = SliceStartIdx + VF;
23511 if (SliceStartIdx > MaxSliceEnd - VF - MinVF) {
23512 for (std::pair<unsigned, unsigned> &P :
23513 RangeSizes.slice(SliceStartIdx + VF,
23514 MaxSliceEnd - (SliceStartIdx + VF)))
23515 P.first = P.second = 0;
23516 if (MaxSliceEnd == End)
23517 End = SliceStartIdx;
23518 MaxSliceEnd = SliceStartIdx;
23520 SliceStartIdx += VF;
23523 if (VF > 2 && Res &&
23524 !all_of(RangeSizes.slice(SliceStartIdx, VF),
23525 std::bind(VFIsProfitable, VF >= MaxRegVF, TreeSize,
23527 SliceStartIdx += VF;
23532 if (VF > MaxRegVF && TreeSize > 1 &&
23533 all_of(RangeSizes.slice(SliceStartIdx, VF),
23534 std::bind(FirstSizeSame, TreeSize, _1))) {
23535 SliceStartIdx += VF;
23536 while (SliceStartIdx != MaxSliceEnd &&
23537 RangeSizes[SliceStartIdx].first == TreeSize)
23541 if (TreeSize > 1) {
23542 for (std::pair<unsigned, unsigned> &P :
23543 RangeSizes.slice(SliceStartIdx, VF)) {
23544 if (VF >= MaxRegVF)
23545 P.second = std::max(P.second, TreeSize);
23547 P.first = std::max(P.first, TreeSize);
23551 AnyProfitableGraph = true;
23553 if (FirstUnvecStore >= End)
23555 if (MaxSliceEnd - FirstUnvecStore < VF &&
23556 MaxSliceEnd - FirstUnvecStore >= MinVF)
23557 AnyProfitableGraph = true;
23558 FirstUnvecStore = std::distance(
23559 RangeSizes.begin(),
23560 find_if(RangeSizes.drop_front(MaxSliceEnd),
23561 std::bind(IsNotVectorized, VF >= MaxRegVF, _1)));
23563 if (!AnyProfitableGraph && VF >= MaxRegVF && has_single_bit(VF))
23567 if (all_of(RangeSizes, [](const std::pair<unsigned, unsigned> &P) {
23568 return P.first == 0 && P.second == 0;
23572 if (Repeat >= MaxAttempts ||
23573 (Repeat > 1 && (RepeatChanged || !AnyProfitableGraph)))
23575 constexpr unsigned StoresLimit = 64;
23576 const unsigned MaxTotalNum = std::min<unsigned>(
23578 static_cast<unsigned>(
23581 RangeSizes.begin(),
23582 find_if(RangeSizes, std::bind(IsNotVectorized, true, _1))) +
23584 unsigned VF = bit_ceil(CandidateVFs.front()) * 2;
23587 CandidateVFs.clear();
23589 CandidateVFs.push_back(Limit);
23590 if (VF > MaxTotalNum || VF >= StoresLimit)
23592 for (std::pair<unsigned, unsigned> &P : RangeSizes) {
23594 P.first = std::max(P.second, P.first);
23598 CandidateVFs.push_back(VF);
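// Editorial note (hedged): the loop above searches candidate VFs from large
// to small over the store chain. RangeSizes tracks, per store, the best tree
// size seen so far for register-sized VFs (first) and for wider-than-register
// VFs (second); zeroed entries mark stores that were actually vectorized.
// Up to MaxAttempts passes may regrow the candidate VF list (doubling via
// bit_ceil) as long as some slice still looks profitable and not everything
// has been vectorized.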
23638 auto FillStoresSet = [&](unsigned Idx, StoreInst *SI) {
23639 std::optional<int64_t> PtrDist;
23640 auto *RelatedStores = find_if(
23641 SortedStores, [&PtrDist, SI, this](const RelatedStoreInsts &StoreSeq) {
23642 PtrDist = StoreSeq.getPointerDiff(*SI, *DL, *SE);
23643 return PtrDist.has_value();
23647 if (RelatedStores == SortedStores.end()) {
23655 if (std::optional<unsigned> PrevInst =
23656 RelatedStores->insertOrLookup(Idx, *PtrDist)) {
23657 TryToVectorize(RelatedStores->getStores());
23658 RelatedStores->clearVectorizedStores(VectorizedStores);
23659 RelatedStores->rebase(*PrevInst + 1,
23664 Type *PrevValTy = nullptr;
23666 if (R.isDeleted(SI))
23669 PrevValTy = SI->getValueOperand()->getType();
23671 if (PrevValTy != SI->getValueOperand()->getType()) {
23672 for (RelatedStoreInsts &StoreSeq : SortedStores)
23673 TryToVectorize(StoreSeq.getStores());
23674 SortedStores.clear();
23675 PrevValTy = SI->getValueOperand()->getType();
23677 FillStoresSet(I, SI);
23681 for (RelatedStoreInsts &StoreSeq : SortedStores)
23682 TryToVectorize(StoreSeq.getStores());
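// Editorial note (hedged): FillStoresSet routes each seed store into the
// first RelatedStoreInsts group whose base it has a computable constant
// pointer distance to; when the same distance is seen twice, the group is
// flushed through TryToVectorize and rebased on the newer store. Groups are
// also flushed whenever the stored value type changes, since chains are only
// formed from stores of a single type.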
23687void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
23695 for (Instruction &I : *BB) {
23699 if (!SI->isSimple())
23710 if (GEP->getNumIndices() != 1)
23712 Value *Idx = GEP->idx_begin()->get();
23717 if (GEP->getType()->isVectorTy())
23729 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = "
23730 << VL.size() << ".\n");
23741 for (Value *V : VL) {
23742 Type *Ty = V->getType();
23746 R.getORE()->emit([&]() {
23747 std::string TypeStr;
23748 llvm::raw_string_ostream OS(TypeStr);
23750 return OptimizationRemarkMissed(SV_NAME, "UnsupportedType", I0)
23751 << "Cannot SLP vectorize list: type "
23752 << TypeStr + " is unsupported by vectorizer";
23759 unsigned Sz = R.getVectorElementSize(I0);
23760 unsigned MinVF = R.getMinVF(Sz);
23761 unsigned MaxVF = std::max<unsigned>(
23763 MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
23765 R.getORE()->emit([&]() {
23766 return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0)
23767 << "Cannot SLP vectorize list: vectorization factor "
23768 << "less than 2 is not supported";
23774 bool CandidateFound = false;
23777 unsigned NextInst = 0, MaxInst = VL.size();
23778 for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF;
23784 if (TTI->getNumberOfParts(VecTy) == VF)
23786 for (unsigned I = NextInst; I < MaxInst; ++I) {
23787 unsigned ActualVF = std::min(MaxInst - I, VF);
23792 if (MaxVFOnly && ActualVF < MaxVF)
23794 if ((VF > MinVF && ActualVF < VF) || (VF == MinVF && ActualVF < 2))
23799 for (Value *V : VL.drop_front(I)) {
23803 !Inst || !R.isDeleted(Inst)) {
23806 if (Idx == ActualVF)
23811 if (Idx != ActualVF)
23814 LLVM_DEBUG(dbgs() << "SLP: Analyzing " << ActualVF << " operations "
23818 if (R.isTreeTinyAndNotFullyVectorizable())
23820 if (R.isProfitableToReorder()) {
23821 R.reorderTopToBottom();
23824 R.transformNodes();
23825 R.buildExternalUses();
23827 R.computeMinimumValueSizes();
23829 CandidateFound = true;
23830 MinCost = std::min(MinCost, Cost);
23833 << " for VF=" << ActualVF << "\n");
23836 R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList",
23838 << "SLP vectorized with cost " << ore::NV("Cost", Cost)
23839 << " and with tree size "
23840 << ore::NV("TreeSize", R.getTreeSize()));
23851 if (!Changed && CandidateFound) {
23852 R.getORE()->emit([&]() {
23853 return OptimizationRemarkMissed(SV_NAME, "NotBeneficial", I0)
23854 << "List vectorization was possible but not beneficial with cost "
23855 << ore::NV("Cost", MinCost) << " >= "
23859 R.getORE()->emit([&]() {
23860 return OptimizationRemarkMissed(SV_NAME, "NotPossible", I0)
23861 << "Cannot SLP vectorize list: vectorization was impossible"
23862 << " with available vectorization factors";
23897 using ReductionOpsType = SmallVector<Value *, 16>;
23898 using ReductionOpsListType = SmallVector<ReductionOpsType, 2>;
23899 ReductionOpsListType ReductionOps;
23903 SmallDenseMap<Value *, SmallVector<Instruction *>, 16> ReducedValsToOps;
23904 WeakTrackingVH ReductionRoot;
23909 bool IsSupportedHorRdxIdentityOp = false;
23916 static bool isCmpSelMinMax(Instruction *I) {
23924 static bool isBoolLogicOp(Instruction *I) {
23930 static bool isVectorizable(RecurKind Kind, Instruction *I,
23931 bool TwoElementReduction = false) {
23932 if (Kind == RecurKind::None)
23941 if (TwoElementReduction)
23944 if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
23948 return I->getFastMathFlags().noNaNs();
23951 if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum)
23954 return I->isAssociative();
23957 static Value *getRdxOperand(Instruction *I, unsigned Index) {
23963 return I->getOperand(2);
23964 return I->getOperand(Index);
23969 Value *RHS, const Twine &Name, bool UseSelect) {
23973 case RecurKind::Or: {
23982 case RecurKind::And: {
23992 case RecurKind::Add:
23993 case RecurKind::Mul:
23994 case RecurKind::Xor:
23995 case RecurKind::FAdd:
23996 case RecurKind::FMul: {
24001 case RecurKind::SMax:
24002 case RecurKind::SMin:
24003 case RecurKind::UMax:
24004 case RecurKind::UMin:
24012 case RecurKind::FMax:
24013 case RecurKind::FMin:
24014 case RecurKind::FMaximum:
24015 case RecurKind::FMinimum:
24016 case RecurKind::FMaximumNum:
24017 case RecurKind::FMinimumNum: {
24030 const ReductionOpsListType &ReductionOps) {
24031 bool UseSelect = ReductionOps.size() == 2 ||
24033 (ReductionOps.size() == 1 &&
24035 assert((!UseSelect || ReductionOps.size() != 2 ||
24037 "Expected cmp + select pairs for reduction");
24038 Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name, UseSelect);
24056 return RecurKind::None;
24058 return RecurKind::Add;
24060 return RecurKind::Mul;
24063 return RecurKind::And;
24066 return RecurKind::Or;
24068 return RecurKind::Xor;
24070 return RecurKind::FAdd;
24072 return RecurKind::FMul;
24075 return RecurKind::FMax;
24077 return RecurKind::FMin;
24080 return RecurKind::FMaximum;
24082 return RecurKind::FMinimum;
24088 return RecurKind::SMax;
24090 return RecurKind::SMin;
24092 return RecurKind::UMax;
24094 return RecurKind::UMin;
24120 return RecurKind::None;
24124 return RecurKind::None;
24127 return RecurKind::None;
24131 return RecurKind::None;
24136 return RecurKind::None;
24139 return RecurKind::SMax;
24142 return RecurKind::SMin;
24145 return RecurKind::UMax;
24148 return RecurKind::UMin;
24151 return RecurKind::None;
24155 static unsigned getFirstOperandIndex(Instruction *I) {
24156 return isCmpSelMinMax(I) ? 1 : 0;
24161 static unsigned getNumberOfOperands(Instruction *I) {
24162 return isCmpSelMinMax(I) ? 3 : 2;
24167 static bool hasSameParent(Instruction *I, BasicBlock *BB) {
24168 if (isCmpSelMinMax(I) || isBoolLogicOp(I)) {
24171 return Sel->getParent() == BB && Cmp && Cmp->getParent() == BB;
24173 return I->getParent() == BB;
24177 static bool hasRequiredNumberOfUses(bool IsCmpSelMinMax, Instruction *I) {
24178 if (IsCmpSelMinMax) {
24182 return Sel->hasNUses(2) && Sel->getCondition()->hasOneUse();
24183 return I->hasNUses(2);
24191 void initReductionOps(Instruction *I) {
24192 if (isCmpSelMinMax(I))
24193 ReductionOps.assign(2, ReductionOpsType());
24195 ReductionOps.assign(1, ReductionOpsType());
24199 void addReductionOps(Instruction *I) {
24200 if (isCmpSelMinMax(I)) {
24202 ReductionOps[1].emplace_back(I);
24204 ReductionOps[0].emplace_back(I);
24209 int Sz = Data.size();
24218 : ReductionRoot(I), ReductionLimit(2) {
24219 RdxKind = HorizontalReduction::getRdxKind(I);
24220 ReductionOps.emplace_back().push_back(I);
24223 ReducedValsToOps[V].push_back(I);
24226 bool matchReductionForOperands() const {
24229 assert(ReductionRoot && "Reduction root is not set!");
24232 return Ops.size() == 2;
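// Editorial note (hedged): for a min/max expressed as cmp + select, the
// helpers above treat the select as the reduction op: operand index 0 is the
// condition, so getFirstOperandIndex() returns 1 and getNumberOfOperands()
// returns 3, and the cmp/select instructions are kept in two parallel
// ReductionOps lists (initReductionOps/addReductionOps).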
24240 bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root,
24241 ScalarEvolution &SE, const DataLayout &DL,
24242 const TargetLibraryInfo &TLI) {
24243 RdxKind = HorizontalReduction::getRdxKind(Root);
24244 if (!isVectorizable(RdxKind, Root))
24256 if (!Sel->getCondition()->hasOneUse())
24259 ReductionRoot = Root;
24264 bool IsCmpSelMinMax = isCmpSelMinMax(Root);
24266 1, std::make_pair(Root, 0));
24271 SmallVectorImpl<Value *> &PossibleReducedVals,
24272 SmallVectorImpl<Instruction *> &ReductionOps,
24275 getNumberOfOperands(TreeN)))) {
24276 Value *EdgeVal = getRdxOperand(TreeN, I);
24277 ReducedValsToOps[EdgeVal].push_back(TreeN);
24285 IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
24286 !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
24287 !isVectorizable(RdxKind, EdgeInst) ||
24288 (R.isAnalyzedReductionRoot(EdgeInst) &&
24290 PossibleReducedVals.push_back(EdgeVal);
24293 ReductionOps.push_back(EdgeInst);
24302 size_t, SmallMapVector<size_t, SmallMapVector<Value *, unsigned, 2>, 2>,
24304 PossibleReducedVals;
24305 initReductionOps(Root);
24307 SmallSet<size_t, 2> LoadKeyUsed;
24309 auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
24314 auto LIt = LoadsMap.find(std::make_pair(Key, Ptr));
24315 if (LIt != LoadsMap.end()) {
24316 for (LoadInst *RLI : LIt->second) {
24322 for (LoadInst *RLI : LIt->second) {
24329 if (LIt->second.size() > 2) {
24331 hash_value(LIt->second.back()->getPointerOperand());
24337 .first->second.push_back(LI);
24341 while (!Worklist.empty()) {
24342 auto [TreeN, Level] = Worklist.pop_back_val();
24345 CheckOperands(TreeN, PossibleRedVals, PossibleReductionOps, Level);
24346 addReductionOps(TreeN);
24349 for (Value *V : PossibleRedVals) {
24353 ++PossibleReducedVals[Key][Idx].try_emplace(V, 0).first->second;
24355 for (Instruction *I : reverse(PossibleReductionOps))
24356 Worklist.emplace_back(I, I->getParent() == BB ? 0 : Level + 1);
24358 auto PossibleReducedValsVect = PossibleReducedVals.takeVector();
24361 for (auto &PossibleReducedVals : PossibleReducedValsVect) {
24362 auto PossibleRedVals = PossibleReducedVals.second.takeVector();
24364 for (auto &Slice : PossibleRedVals) {
24366 auto RedValsVect = Slice.second.takeVector();
24368 for (const std::pair<Value *, unsigned> &Data : RedValsVect)
24369 PossibleRedValsVect.back().append(Data.second, Data.first);
24371 stable_sort(PossibleRedValsVect, [](const auto &P1, const auto &P2) {
24372 return P1.size() > P2.size();
24379 } else if (!isGoodForReduction(Data)) {
24382 if (!LI || !LastLI ||
24387 ReducedVals.back().append(Data.rbegin(), Data.rend());
24393 return P1.size() > P2.size();
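// Editorial note (hedged): matchAssociativeReduction walks the use-def tree
// rooted at the reduction root with an explicit worklist. Operands that are
// themselves compatible reduction ops (same kind, same parent, required use
// counts) go back onto the worklist; anything else becomes a possible reduced
// value, bucketed by a hash key (with a dedicated sub-key for loads from
// related pointers) so that equal or similar values land in the same
// candidate list, which is then sorted by list size, largest first.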
24399 Value *tryToReduce(BoUpSLP &V, const DataLayout &DL, TargetTransformInfo *TTI,
24400 const TargetLibraryInfo &TLI, AssumptionCache *AC,
24401 DominatorTree &DT) {
24402 constexpr unsigned RegMaxNumber = 4;
24403 constexpr unsigned RedValsMaxNumber = 128;
24407 if (unsigned NumReducedVals = std::accumulate(
24408 ReducedVals.begin(), ReducedVals.end(), 0,
24410 if (!isGoodForReduction(Vals))
24412 return Num + Vals.size();
24414 NumReducedVals < ReductionLimit &&
24418 for (ReductionOpsType &RdxOps : ReductionOps)
24419 for (Value *RdxOp : RdxOps)
24424 IRBuilder<TargetFolder> Builder(ReductionRoot->getContext(),
24430 DenseMap<Value *, WeakTrackingVH> TrackedVals(ReducedVals.size() *
24431 ReducedVals.front().size());
24435 auto &&GetCmpForMinMaxReduction = [](Instruction *RdxRootInst) {
24437 "Expected min/max reduction to have select root instruction");
24440 "Expected min/max reduction to have compare condition");
24444 bool AnyBoolLogicOp = any_of(ReductionOps.back(), [](Value *V) {
24445 return isBoolLogicOp(cast<Instruction>(V));
24448 auto GetNewVectorizedTree = [&](Value *VectorizedTree, Value *Res) {
24449 if (VectorizedTree) {
24453 if (AnyBoolLogicOp) {
24454 auto It = ReducedValsToOps.find(VectorizedTree);
24455 auto It1 = ReducedValsToOps.find(Res);
24456 if ((It == ReducedValsToOps.end() && It1 == ReducedValsToOps.end()) ||
24458 (It != ReducedValsToOps.end() &&
24459 any_of(It->getSecond(), [&](Instruction *I) {
24460 return isBoolLogicOp(I) &&
24461 getRdxOperand(I, 0) == VectorizedTree;
24465 (It1 != ReducedValsToOps.end() &&
24466 any_of(It1->getSecond(), [&](Instruction *I) {
24467 return isBoolLogicOp(I) && getRdxOperand(I, 0) == Res;
24471 VectorizedTree = Builder.CreateFreeze(VectorizedTree);
24475 return createOp(Builder, RdxKind, VectorizedTree, Res, "op.rdx",
24481 SmallDenseSet<Value *> IgnoreList(ReductionOps.size() *
24482 ReductionOps.front().size());
24483 for (ReductionOpsType &RdxOps : ReductionOps)
24484 for (Value *RdxOp : RdxOps) {
24487 IgnoreList.insert(RdxOp);
24490 FastMathFlags RdxFMF;
24492 for (Value *U : IgnoreList)
24494 RdxFMF &= FPMO->getFastMathFlags();
24500 for (Value *V : Candidates)
24501 TrackedVals.try_emplace(V, V);
24503 auto At = [](SmallMapVector<Value *, unsigned, 16> &MV,
24504 Value *V) -> unsigned & {
24505 auto *It = MV.find(V);
24506 assert(It != MV.end() && "Unable to find given key.");
24510 DenseMap<Value *, unsigned> VectorizedVals(ReducedVals.size());
24513 SmallPtrSet<Value *, 4> RequiredExtract;
24514 WeakTrackingVH VectorizedTree = nullptr;
24515 bool CheckForReusedReductionOps = false;
24520 for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) {
24522 InstructionsState S = States[I];
24525 DenseMap<Value *, Value *> TrackedToOrig(2 * OrigReducedVals.size());
24526 for (Value *ReducedVal : OrigReducedVals) {
24527 Value *RdxVal = TrackedVals.at(ReducedVal);
24534 (!S || !S.getMatchingMainOpOrAltOp(Inst))) ||
24538 TrackedToOrig.try_emplace(RdxVal, ReducedVal);
24540 bool ShuffledExtracts = false;
24542 if (S && S.getOpcode() == Instruction::ExtractElement &&
24543 !S.isAltShuffle() && I + 1 < E) {
24545 for (Value *RV : ReducedVals[I + 1]) {
24546 Value *RdxVal = TrackedVals.at(RV);
24553 CommonCandidates.push_back(RdxVal);
24554 TrackedToOrig.try_emplace(RdxVal, RV);
24556 SmallVector<int> Mask;
24559 Candidates.swap(CommonCandidates);
24560 ShuffledExtracts = true;
24567 Value *OrigV = TrackedToOrig.at(Candidates.front());
24568 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
24570 Res = createOp(Builder, RdxKind, Res, VC, "const.rdx", ReductionOps);
24571 Value *OrigV = TrackedToOrig.at(VC);
24572 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
24574 V.analyzedReductionRoot(ResI);
24576 VectorizedTree = GetNewVectorizedTree(VectorizedTree, Res);
24580 unsigned NumReducedVals = Candidates.size();
24581 if (NumReducedVals < ReductionLimit &&
24582 (NumReducedVals < 2 || !isSplat(Candidates)))
24587 IsSupportedHorRdxIdentityOp = RdxKind != RecurKind::Mul &&
24588 RdxKind != RecurKind::FMul &&
24589 RdxKind != RecurKind::FMulAdd;
24591 SmallMapVector<Value *, unsigned, 16> SameValuesCounter;
24592 if (IsSupportedHorRdxIdentityOp)
24593 for (Value *V : Candidates) {
24594 Value *OrigV = TrackedToOrig.at(V);
24595 ++SameValuesCounter.try_emplace(OrigV).first->second;
24607 bool SameScaleFactor = false;
24608 bool OptReusedScalars = IsSupportedHorRdxIdentityOp &&
24609 SameValuesCounter.size() != Candidates.size();
24611 if (OptReusedScalars) {
24613 (RdxKind == RecurKind::Add || RdxKind == RecurKind::FAdd ||
24614 RdxKind == RecurKind::Xor) &&
24616 [&SameValuesCounter](const std::pair<Value *, unsigned> &P) {
24617 return P.second == SameValuesCounter.front().second;
24619 Candidates.resize(SameValuesCounter.size());
24620 transform(SameValuesCounter, Candidates.begin(),
24621 [&](const auto &P) { return TrackedVals.at(P.first); });
24622 NumReducedVals = Candidates.size();
24624 if (NumReducedVals == 1) {
24625 Value *OrigV = TrackedToOrig.at(Candidates.front());
24626 unsigned Cnt = At(SameValuesCounter, OrigV);
24628 emitScaleForReusedOps(Candidates.front(), Builder, Cnt);
24629 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
24630 VectorizedVals.try_emplace(OrigV, Cnt);
24631 ExternallyUsedValues.insert(OrigV);
24636 unsigned MaxVecRegSize = V.getMaxVecRegSize();
24637 unsigned EltSize = V.getVectorElementSize(Candidates[0]);
24638 const unsigned MaxElts = std::clamp<unsigned>(
24640 RegMaxNumber * RedValsMaxNumber);
24642 unsigned ReduxWidth = NumReducedVals;
24643 auto GetVectorFactor = [&, &TTI = *TTI](unsigned ReduxWidth) {
24644 unsigned NumParts, NumRegs;
24645 Type *ScalarTy = Candidates.front()->getType();
24652 while (NumParts > NumRegs) {
24653 assert(ReduxWidth > 0 && "ReduxWidth is unexpectedly 0.");
24654 ReduxWidth = bit_floor(ReduxWidth - 1);
24660 if (NumParts > NumRegs / 2)
24665 ReduxWidth = GetVectorFactor(ReduxWidth);
24666 ReduxWidth = std::min(ReduxWidth, MaxElts);
24668 unsigned Start = 0;
24669 unsigned Pos = Start;
24671 unsigned PrevReduxWidth = ReduxWidth;
24672 bool CheckForReusedReductionOpsLocal = false;
24673 auto AdjustReducedVals = [&](bool IgnoreVL = false) {
24674 bool IsAnyRedOpGathered = !IgnoreVL && V.isAnyGathered(IgnoreList);
24675 if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) {
24678 CheckForReusedReductionOpsLocal |= IsAnyRedOpGathered;
24681 if (Pos < NumReducedVals - ReduxWidth + 1)
24682 return IsAnyRedOpGathered;
24685 if (ReduxWidth > 1)
24686 ReduxWidth = GetVectorFactor(ReduxWidth);
24687 return IsAnyRedOpGathered;
24689 bool AnyVectorized = false;
24690 SmallDenseSet<std::pair<unsigned, unsigned>, 8> IgnoredCandidates;
24691 while (Pos < NumReducedVals - ReduxWidth + 1 &&
24692 ReduxWidth >= ReductionLimit) {
24695 if (CheckForReusedReductionOpsLocal && PrevReduxWidth != ReduxWidth &&
24697 CheckForReusedReductionOps = true;
24700 PrevReduxWidth = ReduxWidth;
24703 if (IgnoredCandidates.contains(std::make_pair(Pos, ReduxWidth)) ||
24706 std::make_pair(Pos, bit_floor(ReduxWidth))) ||
24708 std::make_pair(Pos + (ReduxWidth - bit_floor(ReduxWidth)),
24710 V.areAnalyzedReductionVals(VL)) {
24711 (void)AdjustReducedVals(true);
24718 return RedValI && V.isDeleted(RedValI);
24721 V.buildTree(VL, IgnoreList);
24722 if (V.isTreeTinyAndNotFullyVectorizable(true)) {
24723 if (!AdjustReducedVals())
24724 V.analyzedReductionVals(VL);
24727 if (V.isLoadCombineReductionCandidate(RdxKind)) {
24728 if (!AdjustReducedVals())
24729 V.analyzedReductionVals(VL);
24732 V.reorderTopToBottom();
24735 VL.front()->getType()->isIntOrIntVectorTy() ||
24736 ReductionLimit > 2);
24740 ExternallyUsedValues);
24744 LocalExternallyUsedValues.insert(ReductionRoot);
24745 for (unsigned Cnt = 0, Sz = ReducedVals.size(); Cnt < Sz; ++Cnt) {
24746 if (Cnt == I || (ShuffledExtracts && Cnt == I - 1))
24748 for (Value *V : ReducedVals[Cnt])
24750 LocalExternallyUsedValues.insert(TrackedVals[V]);
24752 if (!IsSupportedHorRdxIdentityOp) {
24755 "Reused values counter map is not empty");
24756 for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
24757 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
24759 Value *V = Candidates[Cnt];
24760 Value *OrigV = TrackedToOrig.at(V);
24761 ++SameValuesCounter.try_emplace(OrigV).first->second;
24764 V.transformNodes();
24767 SmallPtrSet<Value *, 4> Visited;
24768 for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
24769 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
24771 Value *RdxVal = Candidates[Cnt];
24772 if (auto It = TrackedVals.find(RdxVal); It != TrackedVals.end())
24773 RdxVal = It->second;
24774 if (!Visited.insert(RdxVal).second)
24778 if (!VLScalars.contains(RdxVal) && V.isVectorized(RdxVal)) {
24779 LocalExternallyUsedValues.insert(RdxVal);
24782 Value *OrigV = TrackedToOrig.at(RdxVal);
24784 VectorizedVals.lookup(OrigV) + At(SameValuesCounter, OrigV);
24785 if (NumOps != ReducedValsToOps.at(OrigV).size())
24786 LocalExternallyUsedValues.insert(RdxVal);
24789 if (!IsSupportedHorRdxIdentityOp)
24790 SameValuesCounter.clear();
24791 for (Value *RdxVal : VL)
24792 if (RequiredExtract.contains(RdxVal))
24793 LocalExternallyUsedValues.insert(RdxVal);
24794 V.buildExternalUses(LocalExternallyUsedValues);
24796 V.computeMinimumValueSizes();
24800 getReductionCost(TTI, VL, IsCmpSelMinMax, RdxFMF, V, DT, DL, TLI);
24803 << " for reduction\n");
24807 V.getORE()->emit([&]() {
24808 return OptimizationRemarkMissed(SV_NAME, "HorSLPNotBeneficial",
24809 ReducedValsToOps.at(VL[0]).front())
24810 << "Vectorizing horizontal reduction is possible "
24811 << "but not beneficial with cost " << ore::NV("Cost", Cost)
24812 << " and threshold "
24815 if (!AdjustReducedVals()) {
24816 V.analyzedReductionVals(VL);
24818 if (ReduxWidth > ReductionLimit && V.isTreeNotExtendable()) {
24821 *TTI, VL.front()->getType(), ReduxWidth - 1);
24822 VF >= ReductionLimit;
24824 *TTI, VL.front()->getType(), VF - 1)) {
24826 V.getCanonicalGraphSize() != V.getTreeSize())
24829 IgnoredCandidates.insert(std::make_pair(Offset + Idx, VF));
24836 LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:"
24837 << Cost << ". (HorRdx)\n");
24838 V.getORE()->emit([&]() {
24839 return OptimizationRemark(SV_NAME, "VectorizedHorizontalReduction",
24840 ReducedValsToOps.at(VL[0]).front())
24841 << "Vectorized horizontal reduction with cost "
24842 << ore::NV("Cost", Cost) << " and with tree size "
24843 << ore::NV("TreeSize", V.getTreeSize());
24852 if (IsCmpSelMinMax)
24853 InsertPt = GetCmpForMinMaxReduction(RdxRootInst);
24856 Value *VectorizedRoot = V.vectorizeTree(
24857 LocalExternallyUsedValues, InsertPt, VectorValuesAndScales);
24860 for (Value *RdxVal : Candidates) {
24861 Value *OrigVal = TrackedToOrig.at(RdxVal);
24862 Value *TransformedRdxVal = TrackedVals.at(OrigVal);
24863 if (TransformedRdxVal != RdxVal)
24864 TrackedToOrig.try_emplace(TransformedRdxVal, OrigVal);
24873 VectorizedRoot = Builder.CreateFreeze(VectorizedRoot);
24876 if (OptReusedScalars && !SameScaleFactor) {
24877 VectorizedRoot = emitReusedOps(VectorizedRoot, Builder, V,
24878 SameValuesCounter, TrackedToOrig);
24881 Type *ScalarTy = VL.front()->getType();
24886 OptReusedScalars && SameScaleFactor
24887 ? SameValuesCounter.front().second
24890 ? V.isSignedMinBitwidthRootNode()
24894 for (Value *RdxVal : VL) {
24895 Value *OrigV = TrackedToOrig.at(RdxVal);
24896 if (IsSupportedHorRdxIdentityOp) {
24897 VectorizedVals.try_emplace(OrigV, At(SameValuesCounter, OrigV));
24900 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
24901 if (!V.isVectorized(RdxVal))
24902 RequiredExtract.insert(RdxVal);
24906 ReduxWidth = NumReducedVals - Pos;
24907 if (ReduxWidth > 1)
24908 ReduxWidth = GetVectorFactor(NumReducedVals - Pos);
24909 AnyVectorized = true;
24911 if (OptReusedScalars && !AnyVectorized) {
24912 for (const std::pair<Value *, unsigned> &P : SameValuesCounter) {
24913 Value *RdxVal = TrackedVals.at(P.first);
24914 Value *RedVal = emitScaleForReusedOps(RdxVal, Builder, P.second);
24915 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
24916 VectorizedVals.try_emplace(P.first, P.second);
24921 if (!VectorValuesAndScales.empty())
24922 VectorizedTree = GetNewVectorizedTree(
24924 emitReduction(Builder, *TTI, ReductionRoot->getType()));
24926 if (!VectorizedTree) {
24927 if (!CheckForReusedReductionOps) {
24928 for (ReductionOpsType &RdxOps : ReductionOps)
24929 for (Value *RdxOp : RdxOps)
24951 auto FixBoolLogicalOps =
24954 if (!AnyBoolLogicOp)
24956 if (isBoolLogicOp(RedOp1) && ((!InitStep && LHS == VectorizedTree) ||
24957 getRdxOperand(RedOp1, 0) == LHS ||
24960 if (isBoolLogicOp(RedOp2) && ((!InitStep && RHS == VectorizedTree) ||
24961 getRdxOperand(RedOp2, 0) == RHS ||
24966 if (LHS != VectorizedTree)
24974 unsigned Sz = InstVals.size();
24976 for (unsigned I = 0, E = (Sz / 2) * 2; I < E; I += 2) {
24979 Value *RdxVal1 = InstVals[I].second;
24980 Value *StableRdxVal1 = RdxVal1;
24981 auto It1 = TrackedVals.find(RdxVal1);
24982 if (It1 != TrackedVals.end())
24983 StableRdxVal1 = It1->second;
24984 Value *RdxVal2 = InstVals[I + 1].second;
24985 Value *StableRdxVal2 = RdxVal2;
24986 auto It2 = TrackedVals.find(RdxVal2);
24987 if (It2 != TrackedVals.end())
24988 StableRdxVal2 = It2->second;
24992 FixBoolLogicalOps(StableRdxVal1, StableRdxVal2, InstVals[I].first,
24994 Value *ExtraRed = createOp(Builder, RdxKind, StableRdxVal1,
24995 StableRdxVal2, "op.rdx", ReductionOps);
24996 ExtraReds[I / 2] = std::make_pair(InstVals[I].first, ExtraRed);
24999 ExtraReds[Sz / 2] = InstVals.back();
25005 SmallPtrSet<Value *, 8> Visited;
25007 for (Value *RdxVal : Candidates) {
25008 if (!Visited.insert(RdxVal).second)
25010 unsigned NumOps = VectorizedVals.lookup(RdxVal);
25011 for (Instruction *RedOp :
25017 bool InitStep = true;
25018 while (ExtraReductions.size() > 1) {
25020 FinalGen(ExtraReductions, InitStep);
25021 ExtraReductions.swap(NewReds);
25024 VectorizedTree = ExtraReductions.front().second;
25026 ReductionRoot->replaceAllUsesWith(VectorizedTree);
25033 SmallPtrSet<Value *, 4> IgnoreSet;
25042 for (auto *U : Ignore->users()) {
25044 "All users must be either in the reduction ops list.");
25047 if (!Ignore->use_empty()) {
25049 Ignore->replaceAllUsesWith(P);
25052 V.removeInstructionsAndOperands(RdxOps, VectorValuesAndScales);
25054 return VectorizedTree;
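// Editorial note (hedged): tryToReduce proceeds roughly as follows: (1)
// collect the tracked reduced values and constant-fold what it can, (2)
// optionally compress repeated scalars via SameValuesCounter, (3) slide a
// window of width ReduxWidth over the candidates, building a tree for each
// slice and vectorizing it when getReductionCost reports a profit, (4) emit
// the per-slice reductions and finally chain the leftover scalars together
// with createOp, freezing operands where boolean logic requires poison
// safety, before replacing all uses of the original reduction root.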
25060 Value *createSingleOp(IRBuilderBase &Builder, const TargetTransformInfo &TTI,
25061 Value *Vec, unsigned Scale, bool IsSigned,
25085 Rdx, emitReduction(Lane, Builder, &TTI, DestTy), I);
25088 Rdx = emitReduction(Vec, Builder, &TTI, DestTy);
25090 if (Rdx->getType() != DestTy)
25096 Rdx = emitScaleForReusedOps(Rdx, Builder, Scale);
25103 bool IsCmpSelMinMax, FastMathFlags FMF,
25104 const BoUpSLP &R, DominatorTree &DT,
25105 const DataLayout &DL,
25106 const TargetLibraryInfo &TLI) {
25108 Type *ScalarTy = ReducedVals.front()->getType();
25109 unsigned ReduxWidth = ReducedVals.size();
25110 FixedVectorType *VectorTy = R.getReductionType();
25115 auto EvaluateScalarCost = [&](function_ref<InstructionCost()> GenCostFn) {
25118 int Cnt = ReducedVals.size();
25119 for (Value *RdxVal : ReducedVals) {
25124 Cost += GenCostFn();
25128 for (User *U : RdxVal->users()) {
25130 if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) {
25131 if (RdxKind == RecurKind::FAdd) {
25141 FMACost -= FMulCost;
25143 ScalarCost += FMACost;
25150 ScalarCost = InstructionCost::getInvalid();
25154 Cost += ScalarCost;
25156 Cost += GenCostFn();
25165 bool DoesRequireReductionOp = !AllConsts && VectorValuesAndScales.empty();
25167 case RecurKind::Add:
25168 case RecurKind::Mul:
25169 case RecurKind::Or:
25170 case RecurKind::And:
25171 case RecurKind::Xor:
25172 case RecurKind::FAdd:
25173 case RecurKind::FMul: {
25176 if (DoesRequireReductionOp) {
25179 unsigned ScalarTyNumElements = VecTy->getNumElements();
25184 ReducedVals.size()),
25195 auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
25196 std::make_pair(RedTy, true));
25197 if (RType == RedTy) {
25202 RdxOpcode, !IsSigned, RedTy,
25208 auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
25209 std::make_pair(RedTy, true));
25212 if (RdxKind == RecurKind::FAdd) {
25217 for (Value *RdxVal : ReducedVals) {
25223 FMF &= FPCI->getFastMathFlags();
25226 if (!Ops.empty()) {
25231 IntrinsicCostAttributes ICA(Intrinsic::fmuladd, RVecTy,
25232 {RVecTy, RVecTy, RVecTy}, FMF);
25238 Instruction::FMul, RVecTy, CostKind);
25240 << "Minus vector FMul cost: " << FMulCost << "\n");
25241 FMACost -= FMulCost;
25245 if (FMACost.isValid())
25246 VectorCost += FMACost;
25250 if (RType != RedTy) {
25251 unsigned Opcode = Instruction::Trunc;
25253 Opcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
25259 ScalarCost = EvaluateScalarCost([&]() {
25264 case RecurKind::FMax:
25265 case RecurKind::FMin:
25266 case RecurKind::FMaximum:
25267 case RecurKind::FMinimum:
25268 case RecurKind::SMax:
25269 case RecurKind::SMin:
25270 case RecurKind::UMax:
25271 case RecurKind::UMin: {
25274 if (DoesRequireReductionOp) {
25280 auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
25281 std::make_pair(RedTy, true));
25283 IntrinsicCostAttributes ICA(Id, RVecTy, {RVecTy, RVecTy}, FMF);
25285 if (RType != RedTy) {
25286 unsigned Opcode = Instruction::Trunc;
25288 Opcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
25294 ScalarCost = EvaluateScalarCost([&]() {
25295 IntrinsicCostAttributes ICA(Id, ScalarTy, {ScalarTy, ScalarTy}, FMF);
25304 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost
25306 << " (It is a splitting reduction)\n");
25307 return VectorCost - ScalarCost;
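// Editorial note (hedged): getReductionCost compares the cost of one vector
// reduction (an arithmetic or min/max reduction, plus any extend/truncate
// needed when the root node was narrowed) against the summed cost of the
// scalar reduction ops it replaces; for FAdd chains it also tries to account
// for fmuladd formation by subtracting the fmul cost. The returned value is
// VectorCost - ScalarCost, so a negative result means vectorization is
// profitable.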
25313 Value *emitReduction(IRBuilderBase &Builder, const TargetTransformInfo &TTI,
25315 Value *ReducedSubTree = nullptr;
25317 auto CreateSingleOp = [&](Value *Vec, unsigned Scale, bool IsSigned) {
25318 Value *Rdx = createSingleOp(Builder, TTI, Vec, Scale, IsSigned, DestTy);
25319 if (ReducedSubTree)
25320 ReducedSubTree = createOp(Builder, RdxKind, ReducedSubTree, Rdx,
25321 "op.rdx", ReductionOps);
25323 ReducedSubTree = Rdx;
25325 if (VectorValuesAndScales.size() == 1) {
25326 const auto &[Vec, Scale, IsSigned] = VectorValuesAndScales.front();
25327 CreateSingleOp(Vec, Scale, IsSigned);
25328 return ReducedSubTree;
25332 Value *VecRes = nullptr;
25333 bool VecResSignedness = false;
25334 auto CreateVecOp = [&](Value *Vec, unsigned Cnt, bool IsSigned) {
25340 case RecurKind::Add: {
25341 if (ScalarTy == Builder.getInt1Ty() && ScalarTy != DestTy) {
25344 << ". (HorRdx)\n");
25347 std::iota(std::next(Mask.begin(), VF * I),
25348 std::next(Mask.begin(), VF * (I + 1)), 0);
25349 ++NumVectorInstructions;
25360 LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << "of " << Vec
25361 << ". (HorRdx)\n");
25362 ++NumVectorInstructions;
25366 case RecurKind::Xor: {
25369 << "SLP: Xor " << Cnt << "of " << Vec << ". (HorRdx)\n");
25374 case RecurKind::FAdd: {
25378 LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << "of " << Vec
25379 << ". (HorRdx)\n");
25380 ++NumVectorInstructions;
25384 case RecurKind::And:
25385 case RecurKind::Or:
25386 case RecurKind::SMax:
25387 case RecurKind::SMin:
25388 case RecurKind::UMax:
25389 case RecurKind::UMin:
25390 case RecurKind::FMax:
25391 case RecurKind::FMin:
25392 case RecurKind::FMaximum:
25393 case RecurKind::FMinimum:
25396 case RecurKind::Sub:
25397 case RecurKind::AddChainWithSubs:
25398 case RecurKind::Mul:
25399 case RecurKind::FMul:
25400 case RecurKind::FMulAdd:
25401 case RecurKind::AnyOf:
25402 case RecurKind::FindFirstIVSMin:
25403 case RecurKind::FindFirstIVUMin:
25404 case RecurKind::FindLastIVSMax:
25405 case RecurKind::FindLastIVUMax:
25406 case RecurKind::FMaxNum:
25407 case RecurKind::FMinNum:
25408 case RecurKind::FMaximumNum:
25409 case RecurKind::FMinimumNum:
25410 case RecurKind::None:
25417 VecResSignedness = IsSigned;
25419 ++NumVectorInstructions;
25420 if (ScalarTy == Builder.getInt1Ty() && ScalarTy != DestTy &&
25426 std::iota(Mask.begin(), Mask.end(), 0);
25428 if (VecResVF < VecVF) {
25432 if (VecResVF != VecVF) {
25434 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
25451 if (VecResVF < VecVF) {
25457 if (VecResVF != VecVF)
25459 Op = createOp(Builder, RdxKind, Op, Vec, "rdx.op", ReductionOps);
25460 if (VecResVF != VecVF)
25465 for (auto [Vec, Scale, IsSigned] : VectorValuesAndScales)
25466 CreateVecOp(Vec, Scale, IsSigned);
25467 CreateSingleOp(VecRes, 1, false);
25469 return ReducedSubTree;
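// Editorial note (hedged): this emitReduction overload combines the partial
// vectors accumulated in VectorValuesAndScales. Same-width vectors appear to
// be combined element-wise with the reduction op ("rdx.op"), with shuffles
// inserted to resize when widths differ, and the scale/signedness of each
// part is applied where the kind's algebra allows it. The combined vector is
// then collapsed to a scalar by CreateSingleOp and chained into
// ReducedSubTree.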
25473 Value *emitReduction(Value *VectorizedValue, IRBuilderBase &Builder,
25474 const TargetTransformInfo *TTI, Type *DestTy) {
25475 assert(VectorizedValue && "Need to have a vectorized tree node");
25476 assert(RdxKind != RecurKind::FMulAdd &&
25477 "A call to the llvm.fmuladd intrinsic is not handled yet");
25480 if (FTy->getScalarType() == Builder.getInt1Ty() &&
25481 RdxKind == RecurKind::Add &&
25486 VectorizedValue, Builder.getIntNTy(FTy->getNumElements()));
25487 ++NumVectorInstructions;
25490 ++NumVectorInstructions;
25495 Value *emitScaleForReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
25497 assert(IsSupportedHorRdxIdentityOp &&
25498 "The optimization of matched scalar identity horizontal reductions "
25499 "must be supported.");
25501 return VectorizedValue;
25503 case RecurKind::Add: {
25505 Value *Scale = ConstantInt::get(VectorizedValue->getType(), Cnt);
25507 << VectorizedValue << ". (HorRdx)\n");
25508 return Builder.CreateMul(VectorizedValue, Scale);
25510 case RecurKind::Xor: {
25512 LLVM_DEBUG(dbgs() << "SLP: Xor " << Cnt << "of " << VectorizedValue
25513 << ". (HorRdx)\n");
25516 return VectorizedValue;
25518 case RecurKind::FAdd: {
25520 Value *Scale = ConstantFP::get(VectorizedValue->getType(), Cnt);
25522 << VectorizedValue << ". (HorRdx)\n");
25523 return Builder.CreateFMul(VectorizedValue, Scale);
25525 case RecurKind::And:
25526 case RecurKind::Or:
25527 case RecurKind::SMax:
25528 case RecurKind::SMin:
25529 case RecurKind::UMax:
25530 case RecurKind::UMin:
25531 case RecurKind::FMax:
25532 case RecurKind::FMin:
25533 case RecurKind::FMaximum:
25534 case RecurKind::FMinimum:
25536 return VectorizedValue;
25537 case RecurKind::Sub:
25538 case RecurKind::AddChainWithSubs:
25539 case RecurKind::Mul:
25540 case RecurKind::FMul:
25541 case RecurKind::FMulAdd:
25542 case RecurKind::AnyOf:
25543 case RecurKind::FindFirstIVSMin:
25544 case RecurKind::FindFirstIVUMin:
25545 case RecurKind::FindLastIVSMax:
25546 case RecurKind::FindLastIVUMax:
25547 case RecurKind::FMaxNum:
25548 case RecurKind::FMinNum:
25549 case RecurKind::FMaximumNum:
25550 case RecurKind::FMinimumNum:
25551 case RecurKind::None:
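// Editorial note (hedged): emitScaleForReusedOps folds Cnt repeated uses of
// the same scalar into one op using the reduction's algebra: add becomes a
// multiply by Cnt (x + x + ... + x == Cnt * x), fadd becomes an fmul, xor of
// an even count cancels to zero while an odd count leaves the value
// unchanged, and idempotent kinds (and/or/min/max) simply return the value
// as-is. Non-identity kinds (mul, fmul, fmuladd, etc.) are not handled here.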
25560 emitReusedOps(Value *VectorizedValue, IRBuilderBase &Builder, BoUpSLP &R,
25561 const SmallMapVector<Value *, unsigned, 16> &SameValuesCounter,
25562 const DenseMap<Value *, Value *> &TrackedToOrig) {
25563 assert(IsSupportedHorRdxIdentityOp &&
25564 "The optimization of matched scalar identity horizontal reductions "
25565 "must be supported.");
25568 if (VTy->getElementType() != VL.front()->getType()) {
25572 R.isSignedMinBitwidthRootNode());
25575 case RecurKind::Add: {
25578 for (Value *V : VL) {
25579 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
25580 Vals.push_back(ConstantInt::get(V->getType(), Cnt, false));
25584 << VectorizedValue << ". (HorRdx)\n");
25585 return Builder.CreateMul(VectorizedValue, Scale);
25587 case RecurKind::And:
25588 case RecurKind::Or:
25591 << ". (HorRdx)\n");
25592 return VectorizedValue;
25593 case RecurKind::SMax:
25594 case RecurKind::SMin:
25595 case RecurKind::UMax:
25596 case RecurKind::UMin:
25597 case RecurKind::FMax:
25598 case RecurKind::FMin:
25599 case RecurKind::FMaximum:
25600 case RecurKind::FMinimum:
25603 << ". (HorRdx)\n");
25604 return VectorizedValue;
25605 case RecurKind::Xor: {
25610 SmallVector<int> Mask(
25613 std::iota(Mask.begin(), Mask.end(), 0);
25614 bool NeedShuffle = false;
25615 for (unsigned I = 0, VF = VL.size(); I < VF; ++I) {
25617 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
25618 if (Cnt % 2 == 0) {
25620 NeedShuffle = true;
25626 dbgs() << "> of " << VectorizedValue << ". (HorRdx)\n");
25630 ConstantVector::getNullValue(VectorizedValue->getType()), Mask);
25631 return VectorizedValue;
25633 case RecurKind::FAdd: {
25636 for (Value *V : VL) {
25637 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
25638 Vals.push_back(ConstantFP::get(V->getType(), Cnt));
25641 return Builder.CreateFMul(VectorizedValue, Scale);
25643 case RecurKind::Sub:
25644 case RecurKind::AddChainWithSubs:
25645 case RecurKind::Mul:
25646 case RecurKind::FMul:
25647 case RecurKind::FMulAdd:
25648 case RecurKind::AnyOf:
25649 case RecurKind::FindFirstIVSMin:
25650 case RecurKind::FindFirstIVUMin:
25651 case RecurKind::FindLastIVSMax:
25652 case RecurKind::FindLastIVUMax:
25653 case RecurKind::FMaxNum:
25654 case RecurKind::FMinNum:
25655 case RecurKind::FMaximumNum:
25656 case RecurKind::FMinimumNum:
25657 case RecurKind::None:
25667 return HorizontalReduction::getRdxKind(V);
25673 unsigned AggregateSize = 1;
25675 Type *CurrentType = IV->getType();
25678 for (auto *Elt : ST->elements())
25679 if (Elt != ST->getElementType(0))
25680 return std::nullopt;
25681 AggregateSize *= ST->getNumElements();
25682 CurrentType = ST->getElementType(0);
25684 AggregateSize *= AT->getNumElements();
25685 CurrentType = AT->getElementType();
25687 AggregateSize *= VT->getNumElements();
25688 return AggregateSize;
25690 return AggregateSize;
25692 return std::nullopt;
25701 unsigned OperandOffset, const BoUpSLP &R) {
25704 std::optional<unsigned> OperandIndex =
25706 if (!OperandIndex || R.isDeleted(LastInsertInst))
25710 BuildVectorOpds, InsertElts, *OperandIndex, R);
25713 BuildVectorOpds[*OperandIndex] = InsertedOperand;
25714 InsertElts[*OperandIndex] = LastInsertInst;
25717 } while (LastInsertInst != nullptr &&
25744 "Expected insertelement or insertvalue instruction!");
25747 "Expected empty result vectors!");
25750 if (!AggregateSize)
25752 BuildVectorOpds.resize(*AggregateSize);
25753 InsertElts.resize(*AggregateSize);
25758 if (BuildVectorOpds.size() >= 2)
25776 auto DominatedReduxValue = [&](Value *R) {
25784 if (P->getIncomingBlock(0) == ParentBB) {
25786 } else if (P->getIncomingBlock(1) == ParentBB) {
25790 if (Rdx && DominatedReduxValue(Rdx))
25803 if (P->getIncomingBlock(0) == BBLatch) {
25805 } else if (P->getIncomingBlock(1) == BBLatch) {
25809 if (Rdx && DominatedReduxValue(Rdx))
25845 "Expected binop, select, or intrinsic for reduction matching");
25847 Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root));
25849 Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root) + 1);
25860 Value *Op0 = nullptr;
25861 Value *Op1 = nullptr;
25870 Value *B0 = nullptr, *B1 = nullptr;
25875bool SLPVectorizerPass::vectorizeHorReduction(
25876 PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
25877 SmallVectorImpl<WeakTrackingVH> &PostponedInsts) {
25886 auto SelectRoot = [&]() {
25888 HorizontalReduction::getRdxKind(Root) != RecurKind::None)
25905 std::queue<std::pair<Instruction *, unsigned>> Stack;
25906 Stack.emplace(SelectRoot(), 0);
25907 SmallPtrSet<Value *, 8> VisitedInstrs;
25910 if (R.isAnalyzedReductionRoot(Inst))
25915 if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI))
25917 return HorRdx.tryToReduce(R, *DL, TTI, *TLI, AC, *DT);
25919 auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) {
25920 if (TryOperandsAsNewSeeds && FutureSeed == Root) {
25932 while (!Stack.empty()) {
25935 std::tie(Inst, Level) = Stack.front();
25940 if (R.isDeleted(Inst))
25942 if (Value *VectorizedV = TryToReduce(Inst)) {
25946 Stack.emplace(I, Level);
25949 if (R.isDeleted(Inst))
25953 if (!TryAppendToPostponedInsts(Inst)) {
25964 if (VisitedInstrs.insert(Op).second)
25969 !R.isDeleted(I) && I->getParent() == BB)
25970 Stack.emplace(I, Level);
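// Editorial note (hedged): vectorizeHorReduction does a breadth-first walk
// from the (possibly select-wrapped) root using the Stack queue above: each
// instruction is first offered to HorizontalReduction as a reduction root,
// and only if matching or reduction fails are its operands pushed as future
// seeds or postponed for a plain tryToVectorize attempt.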
25975bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
25982 if ((I->getOpcode() == Instruction::FAdd ||
25983 I->getOpcode() == Instruction::FSub) &&
25993 if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P ||
25994 R.isDeleted(Op0) || R.isDeleted(Op1))
26004 if (A && B && B->hasOneUse()) {
26007 if (B0 && B0->getParent() == P && !R.isDeleted(B0))
26009 if (B1 && B1->getParent() == P && !R.isDeleted(B1))
26013 if (B && A && A->hasOneUse()) {
26016 if (A0 && A0->getParent() == P && !R.isDeleted(A0))
26018 if (A1 && A1->getParent() == P && !R.isDeleted(A1))
26022 auto TryToReduce = [this, &R, &TTI = *TTI](Instruction *Inst,
26026 Type *Ty = Inst->getType();
26030 if (!HorRdx.matchReductionForOperands())
26036 TTI.getScalarizationOverhead(
26039 TTI.getInstructionCost(Inst, CostKind);
26042 case RecurKind::Add:
26043 case RecurKind::Mul:
26044 case RecurKind::Or:
26045 case RecurKind::And:
26046 case RecurKind::Xor:
26047 case RecurKind::FAdd:
26048 case RecurKind::FMul: {
26051 FMF = FPCI->getFastMathFlags();
26052 RedCost = TTI.getArithmeticReductionCost(Inst->getOpcode(), VecTy, FMF,
26059 if (RedCost >= ScalarCost)
26062 return HorRdx.tryToReduce(R, *DL, &TTI, *TLI, AC, *DT) != nullptr;
26064 if (Candidates.size() == 1)
26065 return TryToReduce(I, {Op0, Op1}) || tryToVectorizeList({Op0, Op1}, R);
26068 std::optional<int> BestCandidate = R.findBestRootPair(Candidates);
26069 if (!BestCandidate)
26071 return (*BestCandidate == 0 &&
26072 TryToReduce(I, {Candidates[*BestCandidate].first,
26073 Candidates[*BestCandidate].second})) ||
26074 tryToVectorizeList({Candidates[*BestCandidate].first,
26075 Candidates[*BestCandidate].second},
26079bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Instruction *Root,
26080 BasicBlock *BB, BoUpSLP &R) {
26082 bool Res = vectorizeHorReduction(P, Root, BB, R, PostponedInsts);
26083 Res |= tryToVectorize(PostponedInsts, R);
26090 for (Value *V : Insts)
26092 Res |= tryToVectorize(Inst, R);
26096bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
26099 if (!R.canMapToVector(IVI->getType()))
26102 SmallVector<Value *, 16> BuildVectorOpds;
26103 SmallVector<Value *, 16> BuildVectorInsts;
26107 if (MaxVFOnly && BuildVectorOpds.size() == 2) {
26108 R.getORE()->emit([&]() {
26109 return OptimizationRemarkMissed(SV_NAME, "NotPossible", IVI)
26110 << "Cannot SLP vectorize list: only 2 elements of buildvalue, "
26111 "trying reduction first.";
26115 LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
26117 return tryToVectorizeList(BuildVectorOpds, R, MaxVFOnly);
26120bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
26123 SmallVector<Value *, 16> BuildVectorInsts;
26124 SmallVector<Value *, 16> BuildVectorOpds;
26125 SmallVector<int> Mask;
26131 if (MaxVFOnly && BuildVectorInsts.size() == 2) {
26132 R.getORE()->emit([&]() {
26133 return OptimizationRemarkMissed(SV_NAME, "NotPossible", IEI)
26134 << "Cannot SLP vectorize list: only 2 elements of buildvector, "
26135 "trying reduction first.";
26139 LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n");
26140 return tryToVectorizeList(BuildVectorInsts, R, MaxVFOnly);
26143template <typename T>
26148 bool MaxVFOnly, BoUpSLP &R) {
26161 if (!I || R.isDeleted(I)) {
26165 auto *SameTypeIt = IncIt;
26168 AreCompatible(VL, *SameTypeIt))) {
26171 if (I && !R.isDeleted(I))
26176 unsigned NumElts = VL.size();
26177 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at nodes ("
26178 << NumElts << ")\n");
26188 if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL), MaxVFOnly)) {
26191 VL.swap(Candidates);
26192 Candidates.clear();
26200 auto GetMinNumElements = [&R](Value *V) {
26201 unsigned EltSize = R.getVectorElementSize(V);
26202 return std::max(2U, R.getMaxVecRegSize() / EltSize);
26204 if (NumElts < GetMinNumElements(*IncIt) &&
26205 (Candidates.empty() ||
26206 Candidates.front()->getType() == (*IncIt)->getType())) {
26214 if (Candidates.size() > 1 &&
26215 (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) {
26216 if (TryToVectorizeHelper(Candidates, false)) {
26219 } else if (MaxVFOnly) {
26222 for (auto *It = Candidates.begin(), *End = Candidates.end(); It != End;
26225 if (!I || R.isDeleted(I)) {
26229 auto *SameTypeIt = It;
26230 while (SameTypeIt != End &&
26233 AreCompatible(*SameTypeIt, *It))) {
26236 if (I && !R.isDeleted(I))
26239 unsigned NumElts = VL.size();
26240 if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL),
26246 Candidates.clear();
26250 IncIt = SameTypeIt;
26262template <bool IsCompatibility>
26267 "Expected valid element types only.");
26269 return IsCompatibility;
26272 if (CI1->getOperand(0)->getType()->getTypeID() <
26274 return !IsCompatibility;
26275 if (CI1->getOperand(0)->getType()->getTypeID() >
26278 if (CI1->getOperand(0)->getType()->getScalarSizeInBits() <
26280 return !IsCompatibility;
26281 if (CI1->getOperand(0)->getType()->getScalarSizeInBits() >
26290 if (BasePred1 < BasePred2)
26291 return !IsCompatibility;
26292 if (BasePred1 > BasePred2)
26295 bool CI1Preds = Pred1 == BasePred1;
26296 bool CI2Preds = Pred2 == BasePred1;
26297 for (int I = 0, E = CI1->getNumOperands(); I < E; ++I) {
26298 auto *Op1 = CI1->getOperand(CI1Preds ? I : E - I - 1);
26303 return !IsCompatibility;
26308 if (IsCompatibility) {
26309 if (I1->getParent() != I2->getParent())
26316 return NodeI2 != nullptr;
26319 assert((NodeI1 == NodeI2) ==
26321 "Different nodes should have different DFS numbers");
26322 if (NodeI1 != NodeI2)
26326 if (S && (IsCompatibility || !S.isAltShuffle()))
26328 if (IsCompatibility)
26330 if (I1->getOpcode() != I2->getOpcode())
26331 return I1->getOpcode() < I2->getOpcode();
26334 return IsCompatibility;
26337template <typename ItT>
26343 if (R.isDeleted(I))
26347 Changed |= vectorizeRootInstruction(nullptr, RootOp, BB, R);
26348 if (R.isDeleted(I))
26354 if (R.isDeleted(I))
26360 auto CompareSorter = [&](Value *V, Value *V2) {
26376 if (Vals.size() <= 1)
26379 Vals, CompareSorter, AreCompatibleCompares,
26382 bool ArePossiblyReducedInOtherBlock = any_of(Candidates, [](Value *V) {
26384 auto *Select = dyn_cast<SelectInst>(U);
26386 Select->getParent() != cast<Instruction>(V)->getParent();
26389 if (ArePossiblyReducedInOtherBlock)
26391 return tryToVectorizeList(Candidates, R, MaxVFOnly);
26397bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions,
26400 "This function only accepts Insert instructions");
26401 bool OpsChanged = false;
26403 for (auto *I : reverse(Instructions)) {
26409 vectorizeInsertValueInst(LastInsertValue, BB, R, true);
26412 vectorizeInsertElementInst(LastInsertElem, BB, R, true);
26415 if (R.isDeleted(I))
26417 OpsChanged |= vectorizeHorReduction(nullptr, I, BB, R, PostponedInsts);
26423 vectorizeInsertValueInst(LastInsertValue, BB, R, false);
26425 OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R,
26430 OpsChanged |= tryToVectorize(PostponedInsts, R);
26436bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
26439 SmallPtrSet<Value *, 16> VisitedInstrs;
26443 DenseMap<Value *, SmallVector<Value *, 4>> PHIToOpcodes;
26444 auto PHICompare = [this, &PHIToOpcodes](Value *V1, Value *V2) {
26447 "Expected vectorizable types only.");
26457 V2->getType()->getScalarSizeInBits())
26460 V2->getType()->getScalarSizeInBits())
26464 if (Opcodes1.size() < Opcodes2.size())
26466 if (Opcodes1.size() > Opcodes2.size())
26468 for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
26477 return NodeI2 != nullptr;
26480 assert((NodeI1 == NodeI2) ==
26482 "Different nodes should have different DFS numbers");
26483 if (NodeI1 != NodeI2)
26486 if (S && !S.isAltShuffle() && I1->getOpcode() == I2->getOpcode()) {
26502 DT->getNode(V1->getParent());
26504 DT->getNode(V2->getParent());
26506 return NodeI2 != nullptr;
26509 assert((NodeI1 == NodeI2) ==
26511 "Different nodes should have different DFS numbers");
26512 if (NodeI1 != NodeI2)
26514 return V1->comesBefore(V2);
26527 return *Id1 < *Id2;
26531 if (I1->getOpcode() == I2->getOpcode())
26533 return I1->getOpcode() < I2->getOpcode();
26556 auto ValID1 = Opcodes1[I]->getValueID();
26557 auto ValID2 = Opcodes2[I]->getValueID();
26558 if (ValID1 == ValID2)
26560 if (ValID1 < ValID2)
26562 if (ValID1 > ValID2)
26571 assert(U1 && U2 && "The only thing left should be undef & undef.");
26577 if (VL.empty() || V1 == VL.back())
26579 Value *V2 = VL.back();
26584 if (Opcodes1.size() != Opcodes2.size())
26586 for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
26592 if (R.isDeleted(I1) || R.isDeleted(I2))
26594 if (I1->getParent() != I2->getParent())
26602 if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID())
26608 bool HaveVectorizedPhiNodes = false;
26612 for (Instruction &I : *BB) {
26619 if (!VisitedInstrs.count(P) && !R.isDeleted(P) &&
26624 if (Incoming.size() <= 1)
26629 for (Value *V : Incoming) {
26630 SmallVectorImpl<Value *> &Opcodes =
26632 if (!Opcodes.empty())
26635 SmallPtrSet<Value *, 4> Visited;
26636 while (!Nodes.empty()) {
26640 for (Value *V : PHI->incoming_values()) {
26642 Nodes.push_back(PHI1);
26651 Incoming, PHICompare, AreCompatiblePHIs,
26653 return tryToVectorizeList(Candidates, R, MaxVFOnly);
26656 Changed |= HaveVectorizedPhiNodes;
26657 if (HaveVectorizedPhiNodes && any_of(PHIToOpcodes, [&](const auto &P) {
26659 return !PHI || R.isDeleted(PHI);
26661 PHIToOpcodes.clear();
26663 } while (HaveVectorizedPhiNodes);
26665 VisitedInstrs.clear();
26667 InstSetVector PostProcessInserts;
26668 SmallSetVector<CmpInst *, 8> PostProcessCmps;
26671 auto VectorizeInsertsAndCmps = [&](bool VectorizeCmps) {
26672 bool Changed = vectorizeInserts(PostProcessInserts, BB, R);
26673 if (VectorizeCmps) {
26675 PostProcessCmps.clear();
26677 PostProcessInserts.clear();
26683 return PostProcessCmps.contains(Cmp);
26685 PostProcessInserts.contains(I);
26691 return I->use_empty() &&
26701 if (R.isDeleted(&*It))
26704 if (!VisitedInstrs.insert(&*It).second) {
26705 if (HasNoUsers(&*It) &&
26706 VectorizeInsertsAndCmps(It->isTerminator())) {
26719 if (P->getNumIncomingValues() == 2) {
26722 if (Root && vectorizeRootInstruction(P, Root, BB, R)) {
26736 if (BB == P->getIncomingBlock(I) ||
26737 !DT->isReachableFromEntry(P->getIncomingBlock(I)))
26743 PI && !IsInPostProcessInstrs(PI)) {
26745 vectorizeRootInstruction(nullptr, PI, P->getIncomingBlock(I), R);
26747 if (Res && R.isDeleted(P)) {
26757 if (HasNoUsers(&*It)) {
26758 bool OpsChanged = false;
26769 TryToVectorizeRoot |= (I == Stores.end() || I->second.size() == 1) &&
26770 SI->getValueOperand()->hasOneUse();
26772 if (TryToVectorizeRoot) {
26773 for (auto *V : It->operand_values()) {
26777 VI && !IsInPostProcessInstrs(VI))
26779 OpsChanged |= vectorizeRootInstruction(nullptr, VI, BB, R);
26786 VectorizeInsertsAndCmps(It->isTerminator());
26798 PostProcessInserts.insert(&*It);
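// Editorial note (hedged): vectorizeChainsInBlock first groups and sorts PHI
// incoming values by type, dominator-tree DFS order, and opcode so that
// compatible PHIs can be vectorized in batches, repeating while new PHIs get
// vectorized. It then scans the block: user-less instructions trigger
// insert/cmp post-processing, two-input PHIs and single-use stored values
// feed vectorizeRootInstruction, and inserts/cmps are queued for a final
// pass at the block terminator.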
26806bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
26808 for (auto &Entry : GEPs) {
26811 if (Entry.second.size() < 2)
26814 LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
26815 << Entry.second.size() << ".\n");
26823 return !R.isDeleted(GEP);
26825 if (It == Entry.second.end())
26827 unsigned MaxVecRegSize = R.getMaxVecRegSize();
26828 unsigned EltSize = R.getVectorElementSize(*(*It)->idx_begin());
26829 if (MaxVecRegSize < EltSize)
26832 unsigned MaxElts = MaxVecRegSize / EltSize;
26833 for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) {
26834 auto Len = std::min<unsigned>(BE - BI, MaxElts);
26847 Candidates.remove_if([&R](Value *I) {
26857 for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) {
26858 auto *GEPI = GEPList[I];
26859 if (!Candidates.count(GEPI))
26861 const SCEV *SCEVI = SE->getSCEV(GEPList[I]);
26862 for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
26863 auto *GEPJ = GEPList[J];
26864 const SCEV *SCEVJ = SE->getSCEV(GEPList[J]);
26866 Candidates.remove(GEPI);
26867 Candidates.remove(GEPJ);
26868 } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
26869 Candidates.remove(GEPJ);
26876 if (Candidates.size() < 2)
26882 SmallVector<Value *, 16> Bundle(Candidates.size());
26883 auto BundleIndex = 0u;
26884 for (auto *V : Candidates) {
26886 auto *GEPIdx = GEP->idx_begin()->get();
26888 Bundle[BundleIndex++] = GEPIdx;
26900 Changed |= tryToVectorizeList(Bundle, R);
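Candidate GEP lists are walked in register-sized slices: at most MaxVecRegSize / EltSize index operands are bundled per attempt, since a wider bundle could not fit in a single vector register. A self-contained sketch of that chunking, with a hypothetical helper name:

// --- Illustrative sketch, not part of the listing ---
#include <algorithm>
#include <cstddef>
#include <vector>

template <typename T, typename Fn>
static void forEachRegisterSizedChunk(const std::vector<T> &Candidates,
                                      unsigned MaxVecRegSize, unsigned EltSize,
                                      Fn Process) {
  if (EltSize == 0 || MaxVecRegSize < EltSize)
    return; // An element wider than the register can never be bundled.
  const std::size_t MaxElts = MaxVecRegSize / EltSize;
  for (std::size_t BI = 0, BE = Candidates.size(); BI < BE; BI += MaxElts) {
    // Each slice holds at most MaxElts elements; the last may be shorter.
    std::size_t Len = std::min(BE - BI, MaxElts);
    Process(&Candidates[BI], Len);
  }
}
// --- End sketch ---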
26906bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
26911 auto &&StoreSorter = [this](StoreInst *V, StoreInst *V2) {
26912 if (V->getValueOperand()->getType()->getTypeID() <
26915 if (V->getValueOperand()->getType()->getTypeID() >
26918 if (V->getPointerOperandType()->getTypeID() <
26919 V2->getPointerOperandType()->getTypeID())
26921 if (V->getPointerOperandType()->getTypeID() >
26922 V2->getPointerOperandType()->getTypeID())
26924 if (V->getValueOperand()->getType()->getScalarSizeInBits() <
26927 if (V->getValueOperand()->getType()->getScalarSizeInBits() >
26933 DomTreeNodeBase<llvm::BasicBlock> *NodeI1 =
26934 DT->getNode(I1->getParent());
26935 DomTreeNodeBase<llvm::BasicBlock> *NodeI2 =
26936 DT->getNode(I2->getParent());
26937 assert(NodeI1 && "Should only process reachable instructions");
26938 assert(NodeI2 && "Should only process reachable instructions");
26939 assert((NodeI1 == NodeI2) ==
26941 "Different nodes should have different DFS numbers");
26942 if (NodeI1 != NodeI2)
26944 return I1->getOpcode() < I2->getOpcode();
26946 return V->getValueOperand()->getValueID() <
26950 bool SameParent = true;
26956 StoreInst *V2 = VL.back();
26981 SameParent &= I1 && I2 && I1->getParent() == I2->getParent();
26983 for (auto [SI, V] : zip(VL, NewVL))
26984 V = SI->getValueOperand();
26985 NewVL.back() = V1->getValueOperand();
26986 InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
26987 InstructionsState S = Analysis.buildInstructionsState(
26995 return V1->getValueOperand()->getValueID() ==
27000 DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>> Attempted;
27001 for (auto &Pair : Stores) {
27002 if (Pair.second.size() < 2)
27006 << Pair.second.size() << ".\n");
27015 Pair.second.rend());
27017 ReversedStores, StoreSorter, AreCompatibleStores,
27019 return vectorizeStores(Candidates, R, Attempted);
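StoreSorter imposes a total lexicographic order so that compatible stores end up adjacent before grouping: value-operand type ID, then pointer-operand type ID, then scalar width, then dominator-tree DFS position, then opcode, with value ID as the final tiebreak. Reduced to plain integer keys standing in for the properties the lambda queries (a simplification, not the lambda itself), the ordering is just a tuple comparison:

// --- Illustrative sketch, not part of the listing ---
#include <tuple>

// Plain-integer stand-ins for the properties StoreSorter queries.
struct StoreKey {
  unsigned ValueTypeID; // getValueOperand()->getType()->getTypeID()
  unsigned PtrTypeID;   // getPointerOperandType()->getTypeID()
  unsigned ScalarBits;  // scalar size in bits of the stored type
  unsigned DFSNumIn;    // DFS-in number of the defining block's DT node
  unsigned Opcode;      // opcode of the stored instruction
  unsigned ValueID;     // getValueID() of the stored value
};

static bool storeLess(const StoreKey &A, const StoreKey &B) {
  // Lexicographic comparison: earlier fields dominate later ones.
  return std::tie(A.ValueTypeID, A.PtrTypeID, A.ScalarBits, A.DFSNumIn,
                  A.Opcode, A.ValueID) <
         std::tie(B.ValueTypeID, B.PtrTypeID, B.ScalarBits, B.DFSNumIn,
                  B.Opcode, B.ValueID);
}
// --- End sketch ---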
static unsigned getIntrinsicID(const SDNode *N)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static bool isConstant(const MachineInstr &MI)
AMDGPU Register Bank Select
ReachingDefInfo InstSet InstSet & Ignore
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Function Alias Analysis Results
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
block Block Frequency Analysis
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
static cl::opt< IntrinsicCostStrategy > IntrinsicCost("intrinsic-cost-strategy", cl::desc("Costing strategy for intrinsic instructions"), cl::init(IntrinsicCostStrategy::InstructionCost), cl::values(clEnumValN(IntrinsicCostStrategy::InstructionCost, "instruction-cost", "Use TargetTransformInfo::getInstructionCost"), clEnumValN(IntrinsicCostStrategy::IntrinsicCost, "intrinsic-cost", "Use TargetTransformInfo::getIntrinsicInstrCost"), clEnumValN(IntrinsicCostStrategy::TypeBasedIntrinsicCost, "type-based-intrinsic-cost", "Calculate the intrinsic cost based only on argument types")))
static APInt getElementIndex(TypeSize ElemSize, APInt &Offset)
This file provides an implementation of debug counters.
#define DEBUG_COUNTER(VARNAME, COUNTERNAME, DESC)
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
static bool runImpl(Function &F, const TargetLowering &TLI, const LibcallLoweringInfo &Libcalls, AssumptionCache *AC)
This is the interface for a simple mod/ref and alias analysis over globals.
static Value * getCondition(Instruction *I)
static void setCondition(Instruction *I, Value *NewCond)
static const HTTPClientCleanup Cleanup
static Type * getIndexType(Value *In)
Module.h This file contains the declarations for the Module class.
This defines the Use class.
iv Induction Variable Users
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static bool isSplat(Value *V)
Return true if V is a splat of a value (which is used when multiplying a matrix with a scalar).
This file provides utility analysis objects describing memory locations.
MachineInstr unsigned OpIdx
uint64_t IntrinsicInst * II
static bool IsSelect(MachineInstr &MI)
This file defines the PriorityQueue class.
const SmallVectorImpl< MachineOperand > & Cond
static std::optional< OperandInfo > getOperandInfo(const MachineOperand &MO)
static bool isValid(const char C)
Returns true if C is a valid mangled character: <0-9a-zA-Z_>.
static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts, TargetTransformInfo *TTI, bool MustMatchOrInst)
static cl::opt< bool > RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden, cl::desc("Run the SLP vectorization passes"))
static bool isAlternateInstruction(Instruction *I, Instruction *MainOp, Instruction *AltOp, const TargetLibraryInfo &TLI)
Checks if the specified instruction I is an alternate operation for the given MainOp and AltOp instru...
static FixedVectorType * getWidenedType(Type *ScalarTy, unsigned VF)
static bool doesNotNeedToSchedule(ArrayRef< Value * > VL)
Checks if the specified array of instructions does not require scheduling.
static bool isVectorLikeInstWithConstOps(Value *V)
Checks if V is one of vector-like instructions, i.e.
static cl::opt< bool > SplitAlternateInstructions("slp-split-alternate-instructions", cl::init(true), cl::Hidden, cl::desc("Improve the code quality by splitting alternate instructions"))
static bool isRepeatedNonIdentityClusteredMask(ArrayRef< int > Mask, unsigned Sz)
Checks if the given mask is a "clustered" mask with the same clusters of size Sz, which are not ident...
static bool isMaskedLoadCompress(ArrayRef< Value * > VL, ArrayRef< Value * > PointerOps, ArrayRef< unsigned > Order, const TargetTransformInfo &TTI, const DataLayout &DL, ScalarEvolution &SE, AssumptionCache &AC, const DominatorTree &DT, const TargetLibraryInfo &TLI, const function_ref< bool(Value *)> AreAllUsersVectorized, bool &IsMasked, unsigned &InterleaveFactor, SmallVectorImpl< int > &CompressMask, VectorType *&LoadVecTy)
Checks if the VL can be transformed to a (masked)load + compress or (masked) interleaved load.
static const unsigned MaxPHINumOperands
Maximum allowed number of operands in the PHI nodes.
static bool isUsedOutsideBlock(Value *V)
Checks if the provided value does not require scheduling.
static cl::opt< bool > VectorizeCopyableElements("slp-copyable-elements", cl::init(true), cl::Hidden, cl::desc("Try to replace values with the idempotent instructions for " "better vectorization."))
Enables vectorization of copyable elements.
static cl::opt< int > MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static bool allSameOpcode(ArrayRef< Value * > VL)
static InstructionCost canConvertToFMA(ArrayRef< Value * > VL, const InstructionsState &S, DominatorTree &DT, const DataLayout &DL, TargetTransformInfo &TTI, const TargetLibraryInfo &TLI)
Check if we can convert fadd/fsub sequence to FMAD.
static cl::opt< unsigned > MaxProfitableLoadStride("slp-max-stride", cl::init(8), cl::Hidden, cl::desc("The maximum stride, considered to be profitable."))
static bool isCommutative(Instruction *I, Value *ValWithUses, bool IsCopyable=false)
static bool findBuildAggregate(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, const BoUpSLP &R)
Recognize construction of vectors like ra = insertelement <4 x float> poison, float s0,...
static bool clusterSortPtrAccesses(ArrayRef< Value * > VL, ArrayRef< BasicBlock * > BBs, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
static unsigned getNumElements(Type *Ty)
static SmallBitVector buildUseMask(int VF, ArrayRef< int > Mask, UseMask MaskArg)
Prepares a use bitset for the given mask either for the first argument or for the second.
static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0, Value *Op1, const TargetLibraryInfo &TLI)
Checks if the provided operands of 2 cmp instructions are compatible, i.e.
static void reorderScalars(SmallVectorImpl< Value * > &Scalars, ArrayRef< int > Mask)
Reorders the list of scalars in accordance with the given Mask.
static Value * createInsertVector(IRBuilderBase &Builder, Value *Vec, Value *V, unsigned Index, function_ref< Value *(Value *, Value *, ArrayRef< int >)> Generator={})
Creates subvector insert.
static void findBuildAggregateRec(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, unsigned OperandOffset, const BoUpSLP &R)
static unsigned getNumElems(unsigned Size, unsigned PartNumElems, unsigned Part)
Returns correct remaining number of elements, considering total amount Size, (power-of-2 number) of e...
static InstructionCost getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={})
Returns the cost of the shuffle instructions with the given Kind, vector type Tp and optional Mask.
static bool isSimple(Instruction *I)
static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns true if widened type of Ty elements with size Sz represents full vector type,...
static const int MinScheduleRegionSize
If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling regions to be handled.
static cl::opt< unsigned > MinProfitableStridedLoads("slp-min-strided-loads", cl::init(2), cl::Hidden, cl::desc("The minimum number of loads, which should be considered strided, " "if the stride is > 1 or is runtime value"))
static bool isFirstInsertElement(const InsertElementInst *IE1, const InsertElementInst *IE2)
Checks if the IE1 instructions is followed by IE2 instruction in the buildvector sequence.
static cl::opt< int > LookAheadMaxDepth("slp-max-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for operand reordering scores"))
static cl::opt< unsigned > MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden, cl::desc("Maximum SLP vectorization factor (0=unlimited)"))
static void reorderReuses(SmallVectorImpl< int > &Reuses, ArrayRef< int > Mask)
Reorders the given Reuses mask according to the given Mask.
static void combineOrders(MutableArrayRef< unsigned > Order, ArrayRef< unsigned > SecondaryOrder)
static const unsigned MaxMemDepDistance
static cl::opt< bool > ViewSLPTree("view-slp-tree", cl::Hidden, cl::desc("Display the SLP trees with Graphviz"))
static const SCEV * calculateRtStride(ArrayRef< Value * > PointerOps, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
Checks if the provided list of pointers Pointers represents the strided pointers for type ElemTy.
static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, TargetLibraryInfo *TLI, const TargetTransformInfo *TTI)
static DebugLoc getDebugLocFromPHI(PHINode &PN)
static std::optional< unsigned > getExtractIndex(const Instruction *E)
static cl::opt< bool > VectorizeNonPowerOf2("slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden, cl::desc("Try to vectorize with non-power-of-2 number of elements."))
static cl::opt< unsigned > MinTreeSize("slp-min-tree-size", cl::init(3), cl::Hidden, cl::desc("Only vectorize small trees if they are fully vectorizable"))
static void reorderOrder(SmallVectorImpl< unsigned > &Order, ArrayRef< int > Mask, bool BottomOrder=false)
Reorders the given Order according to the given Mask.
static cl::opt< bool > ForceStridedLoads("slp-force-strided-loads", cl::init(false), cl::Hidden, cl::desc("Generate strided loads even if they are not " "profitable. Used for testing only."))
static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not less than Sz, which forms type,...
static bool isMainInstruction(Instruction *I, Instruction *MainOp, Instruction *AltOp, const TargetLibraryInfo &TLI)
Checks if the specified instruction I is an main operation for the given MainOp and AltOp instruction...
static T * performExtractsShuffleAction(MutableArrayRef< std::pair< T *, SmallVector< int > > > ShuffleMask, Value *Base, function_ref< unsigned(T *)> GetVF, function_ref< std::pair< T *, bool >(T *, ArrayRef< int >, bool)> ResizeAction, function_ref< T *(ArrayRef< int >, ArrayRef< T * >)> Action)
Does the analysis of the provided shuffle masks and performs the requested actions on the vectors wit...
static cl::opt< bool > ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions"))
static bool isConstant(Value *V)
static bool isSplat(ArrayRef< Value * > VL)
static cl::opt< int > SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, cl::desc("Only vectorize if you gain more than this " "number "))
static unsigned getPartNumElems(unsigned Size, unsigned NumParts)
Returns power-of-2 number of elements in a single register (part), given the total number of elements...
static bool allConstant(ArrayRef< Value * > VL)
static constexpr int UsesLimit
static std::optional< unsigned > getElementIndex(const Value *Inst, unsigned Offset=0)
static bool isReductionCandidate(Instruction *I)
\Returns true if I is a candidate instruction for reduction vectorization.
static bool checkTreeSizes(ArrayRef< std::pair< unsigned, unsigned > > Sizes, bool First)
Checks if the quadratic mean deviation is less than 90% of the mean size.
static unsigned getShufflevectorNumGroups(ArrayRef< Value * > VL)
static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI, const TargetLibraryInfo &TLI)
static cl::opt< bool > SLPSkipEarlyProfitabilityCheck("slp-skip-early-profitability-check", cl::init(false), cl::Hidden, cl::desc("When true, SLP vectorizer bypasses profitability checks based on " "heuristics and makes vectorization decision via cost modeling."))
static std::pair< size_t, size_t > generateKeySubkey(Value *V, const TargetLibraryInfo *TLI, function_ref< hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator, bool AllowAlternate)
Generates key/subkey pair for the given value to provide effective sorting of the values and better d...
static cl::opt< bool > ShouldStartVectorizeHorAtStore("slp-vectorize-hor-store", cl::init(false), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions feeding into a store"))
static unsigned getNumberOfPotentiallyCommutativeOps(Instruction *I)
static InstructionCost getScalarizationOverhead(const TargetTransformInfo &TTI, Type *ScalarTy, VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={})
This is similar to TargetTransformInfo::getScalarizationOverhead, but if ScalarTy is a FixedVectorTyp...
static bool buildCompressMask(ArrayRef< Value * > PointerOps, ArrayRef< unsigned > Order, Type *ScalarTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< int > &CompressMask)
Builds compress-like mask for shuffles for the given PointerOps, ordered with Order.
static std::pair< InstructionCost, InstructionCost > getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, ArrayRef< Type * > ArgTys)
Calculates the costs of vectorized intrinsic (if possible) and vectorized function (if possible) call...
static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements, SmallVectorImpl< int > &Mask)
static cl::opt< bool > SLPReVec("slp-revec", cl::init(false), cl::Hidden, cl::desc("Enable vectorization for wider vector utilization"))
static bool isValidForAlternation(unsigned Opcode)
static bool areAllOperandsNonInsts(Value *V)
Checks if the provided value does not require scheduling.
static SmallVector< Constant * > replicateMask(ArrayRef< Constant * > Val, unsigned VF)
Replicates the given Val VF times.
static SmallVector< Type * > buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID, const unsigned VF, unsigned MinBW, const TargetTransformInfo *TTI)
Builds the arguments types vector for the given call instruction with the given ID for the specified ...
static cl::opt< int > RootLookAheadMaxDepth("slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for searching best rooting option"))
static const unsigned AliasedCheckLimit
static Type * getValueType(Value *V)
Returns the type of the given value/instruction V.
static std::string shortBundleName(ArrayRef< Value * > VL, int Idx=-1)
Print a short descriptor of the instruction bundle suitable for debug output.
static LLVM_DUMP_METHOD void dumpOrder(const BoUpSLP::OrdersType &Order)
static bool isValidElementType(Type *Ty)
Predicate for the element types that the SLP vectorizer supports.
static Instruction * getReductionInstr(const DominatorTree *DT, PHINode *P, BasicBlock *ParentBB, LoopInfo *LI)
Try and get a reduction instruction from a phi node.
static SmallVector< int > calculateShufflevectorMask(ArrayRef< Value * > VL)
static bool doesNotNeedToBeScheduled(Value *V)
Checks if the specified value does not require scheduling.
static bool allSameType(ArrayRef< Value * > VL)
static MemoryLocation getLocation(Instruction *I)
static bool allSameBlock(ArrayRef< Value * > VL)
static unsigned getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not greater than Sz, which forms type,...
static bool areTwoInsertFromSameBuildVector(InsertElementInst *VU, InsertElementInst *V, function_ref< Value *(InsertElementInst *)> GetBaseOperand)
Check if two insertelement instructions are from the same buildvector.
static bool arePointersCompatible(Value *Ptr1, Value *Ptr2, const TargetLibraryInfo &TLI, bool CompareOpcodes=true)
static std::pair< InstructionCost, InstructionCost > getGEPCosts(const TargetTransformInfo &TTI, ArrayRef< Value * > Ptrs, Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind, Type *ScalarTy, VectorType *VecTy)
Calculate the scalar and the vector costs from vectorizing set of GEPs.
static SmallBitVector isUndefVector(const Value *V, const SmallBitVector &UseMask={})
Checks if the given value is actually an undefined constant vector.
static Instruction * findInstructionWithOpcode(ArrayRef< Value * > VL, unsigned Opcode)
Find an instruction with a specific opcode in VL.
static InstructionCost getExtractWithExtendCost(const TargetTransformInfo &TTI, unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput)
This is similar to TargetTransformInfo::getExtractWithExtendCost, but if Dst is a FixedVectorType,...
static InstructionsState getSameOpcode(ArrayRef< Value * > VL, const TargetLibraryInfo &TLI)
static cl::opt< int > ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden, cl::desc("Limit the size of the SLP scheduling region per block"))
Limits the size of scheduling regions in a block.
static std::pair< Instruction *, Instruction * > getMainAltOpsNoStateVL(ArrayRef< Value * > VL)
Returns main/alternate instructions for the given VL.
static Instruction * tryGetSecondaryReductionRoot(PHINode *Phi, Instruction *Root)
We could have an initial reduction that is not an add.
static RecurKind getRdxKind(Value *V)
Gets recurrence kind from the specified value.
static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1)
static void gatherPossiblyVectorizableLoads(const BoUpSLP &R, ArrayRef< Value * > VL, const DataLayout &DL, ScalarEvolution &SE, const TargetTransformInfo &TTI, SmallVectorImpl< SmallVector< std::pair< LoadInst *, int64_t > > > &GatheredLoads, bool AddNew=true)
Tries to find subvector of loads and builds new vector of only loads if can be profitable.
static cl::opt< int > MinVectorRegSizeOption("slp-min-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static std::optional< TargetTransformInfo::ShuffleKind > isFixedVectorShuffle(ArrayRef< Value * > VL, SmallVectorImpl< int > &Mask, AssumptionCache *AC)
Checks if the vector of instructions can be represented as a shuffle, like: x0 = extractelement <4 x ...
static bool tryToVectorizeSequence(SmallVectorImpl< T * > &Incoming, function_ref< bool(T *, T *)> Comparator, function_ref< bool(ArrayRef< T * >, T *)> AreCompatible, function_ref< bool(ArrayRef< T * >, bool)> TryToVectorizeHelper, bool MaxVFOnly, BoUpSLP &R)
static std::optional< unsigned > getAggregateSize(Instruction *InsertInst)
static unsigned getNumberOfParts(const TargetTransformInfo &TTI, VectorType *VecTy, const unsigned Limit=std::numeric_limits< unsigned >::max())
Returns number of parts, the type VecTy will be split at the codegen phase.
static std::optional< unsigned > getInsertExtractIndex(const Value *Inst, unsigned Offset)
static cl::opt< unsigned > RecursionMaxDepth("slp-recursion-max-depth", cl::init(12), cl::Hidden, cl::desc("Limit the recursion depth when building a vectorizable tree"))
static bool tryToFindDuplicates(SmallVectorImpl< Value * > &VL, SmallVectorImpl< int > &ReuseShuffleIndices, const TargetTransformInfo &TTI, const TargetLibraryInfo &TLI, const InstructionsState &S, const BoUpSLP::EdgeInfo &UserTreeIdx, bool TryPad=false)
Checks that every instruction appears once in the list and if not, packs them, building ReuseShuffleI...
static Align computeCommonAlignment(ArrayRef< Value * > VL)
Calculates minimal alignment as a common alignment.
static void addMask(SmallVectorImpl< int > &Mask, ArrayRef< int > SubMask, bool ExtendingManyInputs=false)
Shuffles Mask in accordance with the given SubMask.
static void fixupOrderingIndices(MutableArrayRef< unsigned > Order)
Order may have elements assigned special value (size) which is out of bounds.
static Value * createExtractVector(IRBuilderBase &Builder, Value *Vec, unsigned SubVecVF, unsigned Index)
Generates subvector extract using Generator or using default shuffle.
static cl::opt< bool > DisableTreeReorder("slp-disable-tree-reorder", cl::init(false), cl::Hidden, cl::desc("Disable tree reordering even if it is " "profitable. Used for testing only."))
static Instruction * getNonPhiOperand(Instruction *I, PHINode *Phi)
Returns the first operand of I that does not match Phi.
static InstructionCost getVectorInstrCost(const TargetTransformInfo &TTI, Type *ScalarTy, unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Scalar, ArrayRef< std::tuple< Value *, User *, int > > ScalarUserAndIdx)
This is similar to TargetTransformInfo::getVectorInstrCost, but if ScalarTy is a FixedVectorType,...
static SmallBitVector getAltInstrMask(ArrayRef< Value * > VL, Type *ScalarTy, unsigned Opcode0, unsigned Opcode1)
static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI, const DominatorTree &DT)
Compare two cmp instructions.
static void inversePermutation(ArrayRef< unsigned > Indices, SmallVectorImpl< int > &Mask)
static bool isReverseOrder(ArrayRef< unsigned > Order)
Check if Order represents reverse order.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
This file defines generic set operations that may be used on set's of different types,...
This file implements a set that has insertion order iteration characteristics.
This file implements the SmallBitVector class.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallString class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
static SymbolRef::Type getType(const Symbol *Sym)
static const int BlockSize
LocallyHashedType DenseMapInfo< LocallyHashedType >::Empty
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
static SmallVector< VPValue *, 4 > getOperands(ArrayRef< VPValue * > Values, unsigned OperandIndex)
static const uint32_t IV[8]
Merges shuffle masks and emits final shuffle instruction, if required.
InstructionCost finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &, function_ref< Value *(Value *, Value *, ArrayRef< int >)>)> Action={})
Finalize emission of the shuffles.
void add(const TreeEntry &E1, ArrayRef< int > Mask)
ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI, ArrayRef< Value * > VectorizedVals, BoUpSLP &R, SmallPtrSetImpl< Value * > &CheckedExtracts)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
std::optional< InstructionCost > needToDelay(const TreeEntry *, ArrayRef< SmallVector< const TreeEntry * > >) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
InstructionCost createFreeze(InstructionCost Cost)
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
void resetForSameNode()
Reset the builder to handle perfect diamond match.
void add(Value *V1, ArrayRef< int > Mask, bool ForExtracts=false)
Adds another one input vector and the mask for the shuffling.
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
Merges shuffle masks and emits final shuffle instruction, if required.
Value * createFreeze(Value *V)
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
void resetForSameNode()
Reset the builder to handle perfect diamond match.
void addOrdered(Value *V1, ArrayRef< unsigned > Order)
Adds another one input vector and the mask for the shuffling.
Value * finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &, function_ref< Value *(Value *, Value *, ArrayRef< int >)>)> Action={})
Finalize emission of the shuffles.
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
Adjusts extractelements after reusing them.
void add(Value *V1, ArrayRef< int > Mask, bool=false)
Adds another one input vector and the mask for the shuffling.
std::optional< Value * > needToDelay(const TreeEntry *E, ArrayRef< SmallVector< const TreeEntry * > > Deps) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
~ShuffleInstructionBuilder()
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Adds single input vector (in form of tree entry) and the mask for its shuffling.
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
Adds 2 input vectors (in form of tree entries) and the mask for their shuffling.
ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
A manager for alias analyses.
Class for arbitrary precision integers.
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
void clearBit(unsigned BitPosition)
Set a given bit to 0.
uint64_t getZExtValue() const
Get zero extended value.
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
LLVM_ABI APInt urem(const APInt &RHS) const
Unsigned remainder operation.
unsigned getBitWidth() const
Return the number of bits in the APInt.
bool ult(const APInt &RHS) const
Unsigned less than comparison.
void clearAllBits()
Set every bit to 0.
void negate()
Negate this APInt in place.
unsigned logBase2() const
void setAllBits()
Set every bit to 1.
void setBits(unsigned loBit, unsigned hiBit)
Set the bits from loBit (inclusive) to hiBit (exclusive) to 1.
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
bool isOne() const
Determine if this is a value of 1.
static APInt getBitsSetFrom(unsigned numBits, unsigned loBit)
Constructs an APInt value that has a contiguous range of bits set.
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
PassT::Result * getCachedResult(IRUnitT &IR) const
Get the cached result of an analysis pass for a given IR unit.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
bool equals(ArrayRef RHS) const
equals - Check for element-wise equality.
const T & back() const
back - Get the last element.
ArrayRef< T > take_front(size_t N=1) const
Return a copy of *this with only the first N elements.
ArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
const T & front() const
front - Get the first element.
size_t size() const
size - Get the array size.
bool empty() const
empty - Check if the array is empty.
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
const T & consume_front()
consume_front() - Returns the first element and drops it from ArrayRef.
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
LLVM Basic Block Representation.
iterator begin()
Instruction iterator methods.
const Function * getParent() const
Return the enclosing method, or null if none.
InstListType::reverse_iterator reverse_iterator
InstListType::iterator iterator
Instruction iterators...
LLVM_ABI const_iterator getFirstNonPHIOrDbgOrAlloca() const
Returns an iterator to the first instruction in this block that is not a PHINode, a debug intrinsic,...
InstListType::const_reverse_iterator const_reverse_iterator
bool isEHPad() const
Return true if this basic block is an exception handling block.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Represents analyses that only rely on functions' control flow.
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
unsigned getBundleOperandsEndIndex() const
Return the index of the last bundle operand in the Use array.
LLVM_ABI void getOperandBundlesAsDefs(SmallVectorImpl< OperandBundleDef > &Defs) const
Return the list of operand bundles attached to this instruction as a vector of OperandBundleDefs.
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
bool hasIdenticalOperandBundleSchema(const CallBase &Other) const
Return true if Other has the same sequence of operand bundle tags with the same number of operands on...
unsigned getBundleOperandsStartIndex() const
Return the index of the first bundle operand in the Use array.
Value * getArgOperand(unsigned i) const
FunctionType * getFunctionType() const
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
unsigned arg_size() const
bool hasOperandBundles() const
Return true if this User has any operand bundles.
This class represents a function call, abstracting a target machine's calling convention.
This is the base class for all instructions that perform data casts.
This class is the base class for the comparison instructions.
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
@ ICMP_SLT
signed less than
@ ICMP_SLE
signed less or equal
@ ICMP_UGE
unsigned greater or equal
@ ICMP_UGT
unsigned greater than
@ ICMP_SGT
signed greater than
@ ICMP_ULT
unsigned less than
@ ICMP_SGE
signed greater or equal
@ ICMP_ULE
unsigned less or equal
Predicate getSwappedPredicate() const
For example, EQ->EQ, SLE->SGE, ULT->UGT, OEQ->OEQ, ULE->UGE, OLT->OGT, etc.
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Predicate getPredicate() const
Return the predicate for this instruction.
static LLVM_ABI Constant * getIntToPtr(Constant *C, Type *Ty, bool OnlyIfReduced=false)
static LLVM_ABI Constant * getBinOpIdentity(unsigned Opcode, Type *Ty, bool AllowRHSConstant=false, bool NSZ=false)
Return the identity constant for a binary opcode.
This is the shared class of boolean and integer constants.
static LLVM_ABI ConstantInt * getTrue(LLVMContext &Context)
static LLVM_ABI ConstantInt * getFalse(LLVMContext &Context)
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
const APInt & getValue() const
Return the constant as an APInt value reference.
static LLVM_ABI Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
static LLVM_ABI Constant * get(ArrayRef< Constant * > V)
This is an important base class in LLVM.
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
static bool shouldExecute(CounterInfo &Counter)
static DebugLoc getUnknown()
An analysis that produces DemandedBits for a function.
ValueT & at(const_arg_type_t< KeyT > Val)
at - Return the entry for the specified key, or abort if no such entry exists.
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
iterator find(const_arg_type_t< KeyT > Val)
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
bool erase(const KeyT &Val)
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Implements a dense probed hash-table based set.
Base class for the actual dominator tree node.
unsigned getDFSNumIn() const
getDFSNumIn/getDFSNumOut - These return the DFS visitation order for nodes in the dominator tree.
Analysis pass which computes a DominatorTree.
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
LLVM_ABI bool isReachableFromEntry(const Use &U) const
Provide an overload for a Use.
LLVM_ABI bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
static constexpr ElementCount getFixed(ScalarTy MinVal)
Convenience struct for specifying and reasoning about fast-math flags.
bool allowReassoc() const
Flag queries.
bool allowContract() const
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
ArrayRef< Type * > params() const
Type * getReturnType() const
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
For the node iterator we just need to turn the TreeEntry iterator into a TreeEntry* iterator so that ...
nodes_iterator operator++()
nodes_iterator(const ItTy &It2)
bool operator!=(const nodes_iterator &N2) const
Common base class shared among various IRBuilders.
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
IntegerType * getInt1Ty()
Fetch the type representing a single bit.
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Value * CreateFreeze(Value *V, const Twine &Name="")
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
LLVM_ABI Value * CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 2 operands which is mangled on the first type.
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
LLVM_ABI CallInst * CreateUnaryIntrinsic(Intrinsic::ID ID, Value *V, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 1 operand which is mangled on its type.
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
LLVM_ABI Value * CreateSelectWithUnknownProfile(Value *C, Value *True, Value *False, StringRef PassName, const Twine &Name="")
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Value * CreateFMul(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
This instruction inserts a single (scalar) element into a VectorType value.
VectorType * getType() const
Overload to return most specific vector type.
static InstructionCost getInvalid(CostType Val=0)
bool mayReadOrWriteMemory() const
Return true if this instruction may read or write memory.
LLVM_ABI bool mayWriteToMemory() const LLVM_READONLY
Return true if this instruction may modify memory.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
LLVM_ABI void moveAfter(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
LLVM_ABI bool isCommutative() const LLVM_READONLY
Return true if the instruction is commutative:
Instruction * user_back()
Specialize the methods defined in Value, as we know that an instruction can only be used by other ins...
LLVM_ABI bool comesBefore(const Instruction *Other) const
Given an instruction Other in the same basic block as this instruction, return true if this instructi...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
LLVM_ABI bool isIdenticalTo(const Instruction *I) const LLVM_READONLY
Return true if the specified instruction is exactly identical to the current one.
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
const SmallVectorImpl< Type * > & getArgTypes() const
An instruction for reading from memory.
Value * getPointerOperand()
Analysis pass that exposes the LoopInfo for a function.
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
Represents a single loop in the control flow graph.
This class implements a map that also provides access to all stored values in a deterministic order.
VectorType takeVector()
Clear the MapVector and return the underlying vector.
iterator find(const KeyT &Key)
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
ValueT lookup(const KeyT &Key) const
std::pair< KeyT, ValueT > & front()
Information for memory intrinsic cost model.
This is the common base class for memset/memcpy/memmove.
Representation for a specific memory location.
static LLVM_ABI MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
const Value * Ptr
The address of the start of the location.
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
T & front() const
front - Get the first element.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Value * getIncomingValueForBlock(const BasicBlock *BB) const
BasicBlock * getIncomingBlock(unsigned i) const
Return incoming basic block number i.
unsigned getNumIncomingValues() const
Return the number of incoming edges.
Pass interface - Implemented by all 'passes'.
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
A discriminated union of two or more pointer types, with the discriminator in the low bit of the poin...
bool isNull() const
Test if the pointer held in the union is null, regardless of which type it is.
T dyn_cast() const
Returns the current pointer if it is of the specified pointer type, otherwise returns null.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
PriorityQueue - This class behaves like std::priority_queue and provides a few additional convenience...
unsigned getOpcode() const
static bool isIntMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is an integer min/max kind.
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
This class represents an analyzed expression in the program.
LLVM_ABI bool isZero() const
Return true if the expression is a constant zero.
LLVM_ABI bool isNonConstantNegative() const
Return true if the specified scev is negated, but not a constant.
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
LLVM_ABI const SCEV * getConstant(ConstantInt *V)
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
LLVM_ABI const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
LLVM_ABI const SCEV * getUDivExactExpr(const SCEV *LHS, const SCEV *RHS)
Get a canonical unsigned division expression, or something simpler if possible.
LLVM_ABI const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
ArrayRef< value_type > getArrayRef() const
size_type size() const
Determine the number of elements in the SetVector.
const value_type & front() const
Return the first element of the SetVector.
void insert_range(Range &&R)
Vector takeVector()
Clear the SetVector and return the underlying vector.
bool contains(const_arg_type key) const
Check if the SetVector contains the given key.
void clear()
Completely clear the SetVector.
bool empty() const
Determine if the SetVector is empty or not.
bool insert(const value_type &X)
Insert a new element into the SetVector.
static LLVM_ABI bool isZeroEltSplatMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses all elements with the same value as the first element of exa...
static LLVM_ABI bool isOneUseSingleSourceMask(ArrayRef< int > Mask, int VF)
Return true if this shuffle mask represents "clustered" mask of size VF, i.e.
static LLVM_ABI bool isDeInterleaveMaskOfFactor(ArrayRef< int > Mask, unsigned Factor, unsigned &Index)
Check if the mask is a DE-interleave mask of the given factor Factor like: <Index,...
static LLVM_ABI bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
static LLVM_ABI bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static LLVM_ABI bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
static LLVM_ABI bool isInsertSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &NumSubElts, int &Index)
Return true if this shuffle mask is an insert subvector mask.
static LLVM_ABI bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
int find_first() const
Returns the index of the first set bit, -1 if none of the bits are set.
bool test(unsigned Idx) const
int find_next(unsigned Prev) const
Returns the index of the next set bit following the "Prev" bit.
bool all() const
Returns true if all bits are set.
size_type size() const
Returns the number of bits in this bitvector.
bool any() const
Returns true if any bit is set.
size_type count() const
Returns the number of bits which are set.
bool none() const
Returns true if none of the bits are set.
Implements a dense probed hash-table based set with some number of buckets stored inline.
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
bool erase(PtrType Ptr)
Remove pointer from the set.
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
void insert_range(Range &&R)
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
A SetVector that performs no allocations if smaller than a certain size.
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
bool contains(const T &V) const
Check if the SmallSet contains the given element.
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void assign(size_type NumElts, ValueParamT Elt)
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
iterator erase(const_iterator CI)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void swap(SmallVectorImpl &RHS)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
Type * getPointerOperandType() const
Value * getValueOperand()
Value * getPointerOperand()
StringRef - Represent a constant reference to a string, i.e.
TargetFolder - Create constants with target dependent folding.
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
The instances of the Type class are immutable: once they are created, they are never changed.
LLVM_ABI bool isEmptyTy() const
Return true if this type is empty, that is, it has no elements or all of its elements are empty.
bool isVectorTy() const
True if this is an instance of VectorType.
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
bool isPointerTy() const
True if this is an instance of PointerType.
LLVM_ABI unsigned getStructNumElements() const
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
bool isSingleValueType() const
Return true if the type is a valid type for a register in codegen.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
LLVM_ABI Type * getWithNewType(Type *EltTy) const
Given vector type, change the element type, whilst keeping the old number of elements.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
LLVM_ABI void print(raw_ostream &O, bool IsForDebug=false, bool NoDetails=false) const
Print the current type.
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
bool isIntegerTy() const
True if this is an instance of IntegerType.
TypeID getTypeID() const
Return the type id for the type.
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
bool isVoidTy() const
Return true if this is 'void'.
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
A Use represents the edge between a Value definition and its users.
User(Type *ty, unsigned vty, AllocInfo AllocInfo)
LLVM_ABI bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
Value * getOperand(unsigned i) const
unsigned getNumOperands() const
iterator_range< value_op_iterator > operand_values()
The Vector Function Database.
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
user_iterator user_begin()
bool hasOneUse() const
Return true if there is exactly one use of this value.
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
iterator_range< user_iterator > users()
unsigned getValueID() const
Return an ID for the concrete type of this object.
LLVM_ABI bool hasNUsesOrMore(unsigned N) const
Return true if this value has N uses or more.
LLVM_ABI bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
LLVM_ABI User * getUniqueUndroppableUser()
Return true if there is exactly one unique user of this value that cannot be dropped (that user can h...
LLVM_ABI LLVMContext & getContext() const
All values hold a context through their type.
LLVM_ABI unsigned getNumUses() const
This method computes the number of uses of this Value.
iterator_range< use_iterator > uses()
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Base class of all SIMD vector types.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
static LLVM_ABI bool isValidElementType(Type *ElemTy)
Return true if the specified type is valid as a element type.
Type * getElementType() const
std::pair< iterator, bool > insert(const ValueT &V)
iterator find(const_arg_type_t< ValueT > V)
void insert_range(Range &&R)
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
An efficient, type-erasing, non-owning reference to a callable.
An opaque object representing a hash code.
const ParentTy * getParent() const
self_iterator getIterator()
NodeTy * getNextNode()
Get the next node, or nullptr for the list tail.
SmallVector< BoUpSLP::EdgeInfo, 1 >::iterator I
iterator_adaptor_base()=default
A range adaptor for a pair of iterators.
This class implements an extremely fast bulk output stream that can only output to a stream.
raw_ostream & indent(unsigned NumSpaces)
indent - Insert 'NumSpaces' spaces.
A raw_ostream that writes to an std::string.
A raw_ostream that writes to an SmallVector or SmallString.
A helper class used for scoring candidates for two consecutive lanes.
static const int ScoreReversedLoads
Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
static const int ScoreFail
Score for failing to find a decent match.
static const int ScoreUndef
Matching with an undef is preferable to failing.
int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2, ArrayRef< Value * > MainAltOps) const
static const int ScoreConsecutiveExtracts
ExtractElementInst from same vector and consecutive indexes.
int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1, Instruction *U2, int CurrLevel, ArrayRef< Value * > MainAltOps) const
Go through the operands of LHS and RHS recursively until MaxLevel, and return the cummulative score.
static const int ScoreReversedExtracts
ExtractElementInst from same vector and reversed indices.
LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R, int NumLanes, int MaxLevel)
static const int ScoreSplat
Identical instructions (a.k.a. splat or broadcast).
static const int ScoreConstants
Constants.
static const int ScoreMaskedGatherCandidate
A load candidate for masked gather.
static const int ScoreSplatLoads
The same load multiple times.
static const int ScoreAllUserVectorized
Score if all users are vectorized.
static const int ScoreSameOpcode
Instructions with the same opcode.
static const int ScoreAltOpcodes
Instructions with alt opcodes (e.g, add + sub).
static const int ScoreConsecutiveLoads
Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
A helper data structure to hold the operands of a vector of instructions.
LLVM_DUMP_METHOD raw_ostream & print(raw_ostream &OS) const
static LLVM_DUMP_METHOD void dumpMode(ReorderingMode RMode)
Debug print.
LLVM_DUMP_METHOD void dump() const
Debug print.
static LLVM_DUMP_METHOD raw_ostream & printMode(ReorderingMode RMode, raw_ostream &OS)
friend raw_ostream & operator<<(raw_ostream &OS, ReorderingMode RMode)
ValueList getVL(unsigned OpIdx) const
Returns a value vector with the operands across all lanes for the operand at OpIdx.
static LLVM_DUMP_METHOD StringRef getModeStr(ReorderingMode RMode)
VLOperands(ArrayRef< Value * > RootVL, ArrayRef< ValueList > Operands, const InstructionsState &S, const BoUpSLP &R)
Initialize with all the operands of the instruction vector RootVL.
Bottom Up SLP Vectorizer.
static bool isIdentityOrder(ArrayRef< unsigned > Order)
Does this non-empty order represent an identity order?
bool isProfitableToReorder() const
Checks if it is profitable to reorder the current tree.
void eraseInstruction(Instruction *I)
Removes an instruction from its block and eventually deletes it.
bool isAnalyzedReductionRoot(Instruction *I) const
Checks if the instruction was already analyzed as a possible reduction root.
bool areAnalyzedReductionVals(ArrayRef< Value * > VL) const
Checks if the provided list of reduced values was checked already for vectorization.
LoadsState
Tracks the state in which the loads in the given sequence can be represented.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::ScheduleEntity &SE)
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::ScheduleBundle &Bundle)
void analyzedReductionVals(ArrayRef< Value * > VL)
Adds the list of reduced values to the list of values already checked for vectorization.
std::optional< OrdersType > findPartiallyOrderedLoads(const TreeEntry &TE)
Sort loads into increasing pointer offsets to allow greater clustering.
unsigned getMaxVecRegSize() const
OptimizationRemarkEmitter * getORE()
void reorderTopToBottom()
Reorders the current graph to the most profitable order starting from the root node to the leaf nodes...
std::optional< std::pair< Type *, bool > > getRootNodeTypeWithNoCast() const
Returns the type/is-signed info for the root node in the graph without casting.
unsigned getTreeSize() const
void reorderBottomToTop(bool IgnoreReorder=false)
Reorders the current graph to the most profitable order starting from leaves to the root.
InstructionCost getSpillCost()
InstructionCost getTreeCost(ArrayRef< Value * > VectorizedVals={}, InstructionCost ReductionCost=TTI::TCC_Free)
bool isVectorized(const Value *V) const
Check if the value is vectorized in the tree.
bool isAnyGathered(const SmallDenseSet< Value * > &Vals) const
Checks if any of the given values is gathered in one of the nodes.
unsigned getCanonicalGraphSize() const
Returns the base graph size, before any transformations.
bool isStridedLoad(ArrayRef< Value * > PointerOps, Type *ScalarTy, Align Alignment, const int64_t Diff, const size_t Sz) const
Checks if strided loads can be generated out of VL loads with pointers PointerOps:
SmallVector< StoreInst *, 8 > StoreList
bool isLoadCombineCandidate(ArrayRef< Value * > Stores) const
Assume that a vector of stores of bitwise-or/shifted/zexted loaded values can be load combined in the...
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
ArrayRef< Value * > getRootNodeScalars() const
Return the scalars of the root node.
unsigned getMinVecRegSize() const
bool isLoadCombineReductionCandidate(RecurKind RdxKind) const
Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values can be load combined in the ...
unsigned getVectorElementSize(Value *V)
unsigned getMinVF(unsigned Sz) const
void clearReductionData()
Clear the list of the analyzed reduction root instructions.
LoadsState canVectorizeLoads(ArrayRef< Value * > VL, const Value *VL0, SmallVectorImpl< unsigned > &Order, SmallVectorImpl< Value * > &PointerOps, StridedPtrInfo &SPtrInfo, unsigned *BestVF=nullptr, bool TryRecursiveCheck=true) const
Checks if the given array of loads can be represented as a vectorized load, a scatter, or just a simple gather.
void computeMinimumValueSizes()
Compute the minimum type sizes required to represent the entries in a vectorizable tree.
FixedVectorType * getReductionType() const
Returns reduction type after minbitwidth analysis.
SmallVector< unsigned, 4 > OrdersType
SmallVector< Instruction *, 16 > InstrList
void analyzedReductionRoot(Instruction *I)
Register the given instruction as already analyzed as a possible reduction root.
bool isDeleted(Instruction *I) const
Checks if the instruction is marked for deletion.
void transformNodes()
Transforms graph nodes to target specific representations, if profitable.
bool analyzeRtStrideCandidate(ArrayRef< Value * > PointerOps, Type *ScalarTy, Align CommonAlignment, SmallVectorImpl< unsigned > &SortedIndices, StridedPtrInfo &SPtrInfo) const
Return true if an array of scalar loads can be replaced with a strided load (with run-time stride).
bool isSignedMinBitwidthRootNode() const
Checks if the root graph node can be emitted with narrower bitwidth at codegen and returns it signedn...
void buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues={})
Builds external uses of the vectorized scalars, i.e.
bool isTreeTinyAndNotFullyVectorizable(bool ForReduction=false) const
void registerNonVectorizableLoads(ArrayRef< T * > VL)
Registers non-vectorizable sequence of loads.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::ScheduleData &SD)
unsigned canMapToVector(Type *T) const
Check if a homogeneous aggregate is isomorphic to some VectorType.
void deleteTree()
Clear the internal data structures that are created by 'buildTree'.
void buildTree(ArrayRef< Value * > Roots, const SmallDenseSet< Value * > &UserIgnoreLst)
Construct a vectorizable tree that starts at Roots, ignoring users for the purpose of scheduling and ...
SmallDenseSet< Value *, 4 > ExtraValueToDebugLocsMap
bool isTreeNotExtendable() const
Checks if the graph and all its subgraphs cannot be better vectorized.
bool areKnownNonVectorizableLoads(ArrayRef< T * > VL) const
Checks if the given loads sequence is known as not vectorizable.
SmallVector< Value *, 8 > ValueList
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB, const DataLayout *DL, OptimizationRemarkEmitter *ORE)
bool isGathered(const Value *V) const
Checks if the given value is gathered in one of the nodes.
std::optional< OrdersType > findReusedOrderedScalars(const TreeEntry &TE, bool TopToBottom, bool IgnoreReorder)
Checks if the specified gather tree entry TE can be represented as a shuffled vector entry + (possibl...
bool analyzeConstantStrideCandidate(const ArrayRef< Value * > PointerOps, Type *ElemTy, Align Alignment, const SmallVectorImpl< unsigned > &SortedIndices, const int64_t Diff, Value *Ptr0, Value *PtrN, StridedPtrInfo &SPtrInfo) const
Return true if an array of scalar loads can be replaced with a strided load (with constant stride).
bool isNotScheduled(const Value *V) const
Checks if the specified value was not scheduled.
Value * vectorizeTree()
Vectorize the tree that starts with the elements in VL.
std::optional< int > findBestRootPair(ArrayRef< std::pair< Value *, Value * > > Candidates, int Limit=LookAheadHeuristics::ScoreFail) const
Evaluate each pair in Candidates and return index into Candidates for a pair which has the highest score...
std::optional< OrdersType > getReorderingData(const TreeEntry &TE, bool TopToBottom, bool IgnoreReorder)
Gets reordering data for the given tree entry.
SmallPtrSet< Value *, 16 > ValueSet
void optimizeGatherSequence()
Perform LICM and CSE on the newly generated gather sequences.
void removeInstructionsAndOperands(ArrayRef< T * > DeadVals, ArrayRef< std::tuple< Value *, unsigned, bool > > VectorValuesAndScales)
Remove instructions from the parent function and clear the operands of DeadVals instructions,...
Function * getVectorizedFunction(const VFShape &Shape) const
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
@ C
The default llvm calling convention, compatible with C.
@ BasicBlock
Various leaf nodes.
bool isBitwiseLogicOp(unsigned Opcode)
Whether this is a bitwise logic opcode.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
TwoOps_match< ValueOpTy, PointerOpTy, Instruction::Store > m_Store(const ValueOpTy &ValueOp, const PointerOpTy &PointerOp)
Matches StoreInst.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
ap_match< APInt > m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMaxNum(const Opnd0 &Op0, const Opnd1 &Op1)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMinimum(const Opnd0 &Op0, const Opnd1 &Op1)
MaxMin_match< ICmpInst, LHS, RHS, smin_pred_ty > m_SMin(const LHS &L, const RHS &R)
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMaximum(const Opnd0 &Op0, const Opnd1 &Op1)
BinaryOp_match< LHS, RHS, Instruction::FAdd > m_FAdd(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
MaxMin_match< ICmpInst, LHS, RHS, umax_pred_ty > m_UMax(const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
MaxMin_match< ICmpInst, LHS, RHS, smax_pred_ty > m_SMax(const LHS &L, const RHS &R)
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
auto m_Undef()
Match an arbitrary undef constant.
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMinNum(const Opnd0 &Op0, const Opnd1 &Op1)
BinaryOp_match< LHS, RHS, Instruction::Or > m_Or(const LHS &L, const RHS &R)
MatchFunctor< Val, Pattern > match_fn(const Pattern &P)
A match functor that can be used as a UnaryPredicate in functional algorithms like all_of.
MaxMin_match< ICmpInst, LHS, RHS, umin_pred_ty > m_UMin(const LHS &L, const RHS &R)
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
initializer< Ty > init(const Ty &Val)
unsigned combineHashValue(unsigned a, unsigned b)
Simplistic combination of 32-bit hash values into 32-bit hash values.
@ User
could "use" a pointer
DiagnosticInfoOptimizationBase::Argument NV
friend class Instruction
Iterator for Instructions in a BasicBlock.
LLVM_ABI iterator begin() const
LLVM_ABI Instruction & front() const
A private "module" namespace for types and utilities used by this pass.
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
LLVM_ABI Value * createSimpleReduction(IRBuilderBase &B, Value *Src, RecurKind RdxKind)
Create a reduction of the given vector.
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
void stable_sort(R &&Range)
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
void fill(R &&Range, T &&Value)
Provide wrappers to std::fill which take ranges instead of having to pass begin/end explicitly.
UnaryFunction for_each(R &&Range, UnaryFunction F)
Provide wrappers to std::for_each which take ranges instead of having to pass begin/end explicitly.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
hash_code hash_value(const FixedPointSemantics &Val)
LLVM_ABI Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
MaybeAlign getAlign(const CallInst &I, unsigned Index)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
LLVM_ABI bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
LLVM_ABI Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
detail::scope_exit< std::decay_t< Callable > > make_scope_exit(Callable &&F)
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
auto pred_end(const MachineBasicBlock *BB)
void set_intersect(S1Ty &S1, const S2Ty &S2)
set_intersect(A, B) - Compute A := A ^ B Identical to set_intersection, except that it works on set<>...
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
LLVM_ABI bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
LLVM_ABI void salvageDebugInfo(const MachineRegisterInfo &MRI, MachineInstr &MI)
Assuming the instruction MI is going to be deleted, attempt to salvage debug users of MI by writing t...
constexpr from_range_t from_range
LLVM_ABI std::pair< Intrinsic::ID, bool > canConvertToMinOrMaxIntrinsic(ArrayRef< Value * > VL)
Check if the values in VL are select instructions that can be converted to a min or max (vector) intr...
auto dyn_cast_if_present(const Y &Val)
dyn_cast_if_present<X> - Functionally identical to dyn_cast, except that a null (or none in the case ...
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
bool set_is_subset(const S1Ty &S1, const S2Ty &S2)
set_is_subset(A, B) - Return true iff A is contained in B.
void interleaveComma(const Container &c, StreamT &os, UnaryFunctor each_fn)
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
auto cast_or_null(const Y &Val)
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value that is Skew mod Align.
iterator_range< po_iterator< T > > post_order(const T &G)
LLVM_ABI bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr, bool UseVariableInfo=true, bool IgnoreUBImplyingAttrs=true)
Return true if the instruction does not have any effects besides calculating the result and does not ...
LLVM_ABI Instruction * propagateMetadata(Instruction *I, ArrayRef< Value * > VL)
Specifically, let Kinds = [MD_tbaa, MD_alias_scope, MD_noalias, MD_fpmath, MD_nontemporal,...
bool isa_and_nonnull(const Y &Val)
auto binary_search(R &&Range, T &&Value)
Provide wrappers to std::binary_search which take ranges instead of having to pass begin/end explicit...
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
bool isGather(IntrinsicInst *IntInst)
const Value * getPointerOperand(const Value *V)
A helper function that returns the pointer operand of a load, store or GEP instruction.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
LLVM_ABI bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &SQ, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
DomTreeNodeBase< BasicBlock > DomTreeNode
auto dyn_cast_or_null(const Y &Val)
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container:
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
constexpr bool has_single_bit(T Value) noexcept
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
LLVM_ABI bool isInstructionTriviallyDead(Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction is not used, and the instruction will return.
LLVM_ABI llvm::SmallVector< int, 16 > createStrideMask(unsigned Start, unsigned Stride, unsigned VF)
Create a stride shuffle mask.
auto reverse(ContainerTy &&C)
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
LLVM_ABI llvm::SmallVector< int, 16 > createReplicatedMask(unsigned ReplicationFactor, unsigned VF)
Create a mask with replicated elements.
auto find_if_not(R &&Range, UnaryPredicate P)
LLVM_ABI bool isSafeToLoadUnconditionally(Value *V, Align Alignment, const APInt &Size, const DataLayout &DL, Instruction *ScanFrom, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr)
Return true if we know that executing a load from this value cannot trap.
bool isa_and_present(const Y &Val)
isa_and_present<X> - Functionally identical to isa, except that a null value is accepted.
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
bool isPointerTy(const Type *T)
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
auto make_first_range(ContainerTy &&c)
Given a container of pairs, return a range over the first elements.
LLVM_ABI std::optional< int64_t > getPointersDiff(Type *ElemTyA, Value *PtrA, Type *ElemTyB, Value *PtrB, const DataLayout &DL, ScalarEvolution &SE, bool StrictCheck=false, bool CheckType=true)
Returns the distance between the pointers PtrA and PtrB iff they are compatible and it is possible to...
LLVM_ABI bool wouldInstructionBeTriviallyDead(const Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction would have no side effects if it was not used.
bool isModOrRefSet(const ModRefInfo MRI)
bool is_sorted(R &&Range, Compare C)
Wrapper function around std::is_sorted to check if elements in a range R are sorted with respect to a...
LLVM_ABI bool sortPtrAccesses(ArrayRef< Value * > VL, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
Attempt to sort the pointers in VL and return the sorted indices in SortedIndices,...
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
LLVM_ABI void propagateIRFlags(Value *I, ArrayRef< Value * > VL, Value *OpValue=nullptr, bool IncludeWrapFlags=true)
Get the intersection (logical and) of all of the potential IR flags of each scalar operation (VL) tha...
LLVM_ATTRIBUTE_VISIBILITY_DEFAULT AnalysisKey InnerAnalysisManagerProxy< AnalysisManagerT, IRUnitT, ExtraArgTs... >::Key
MutableArrayRef(T &OneElt) -> MutableArrayRef< T >
constexpr int PoisonMaskElem
iterator_range(Container &&) -> iterator_range< llvm::detail::IterOfRange< Container > >
@ Ref
The access may reference the value stored in memory.
@ LLVM_MARK_AS_BITMASK_ENUM
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
LLVM_ABI CmpInst::Predicate getMinMaxReductionPredicate(RecurKind RK)
Returns the comparison predicate used when expanding a min/max reduction.
RecurKind
These are the kinds of recurrences that we support.
@ Or
Bitwise or logical OR of integers.
LLVM_ABI bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic has a scalar operand.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
DWARFExpression::Operation Op
auto max_element(R &&Range)
Provide wrappers to std::max_element which take ranges instead of having to pass begin/end explicitly...
void ViewGraph(const GraphType &G, const Twine &Name, bool ShortNames=false, const Twine &Title="", GraphProgram::Name Program=GraphProgram::DOT)
ViewGraph - Emit a dot graph, run 'dot', run gv on the postscript file, then cleanup.
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI unsigned ComputeNumSignBits(const Value *Op, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Return the number of times the sign bit of the register is replicated into the other bits.
OutputIt copy(R &&Range, OutputIt Out)
auto make_second_range(ContainerTy &&c)
Given a container of pairs, return a range over the second elements.
constexpr unsigned BitWidth
LLVM_ABI bool isGuaranteedToTransferExecutionToSuccessor(const Instruction *I)
Return true if this function can prove that the instruction I will always transfer execution to one o...
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
auto pred_begin(const MachineBasicBlock *BB)
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
template class LLVM_TEMPLATE_ABI DomTreeNodeBase< BasicBlock >
bool equal(L &&LRange, R &&RRange)
Wrapper function around std::equal to detect if pair-wise elements between two ranges are the same.
LLVM_ABI bool isGuaranteedNotToBePoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Returns true if V cannot be poison, but may be undef.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
LLVM_ABI const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=MaxLookupSearchDepth)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
LLVM_ABI Constant * ConstantFoldIntegerCast(Constant *C, Type *DestTy, bool IsSigned, const DataLayout &DL)
Constant fold a zext, sext or trunc, depending on IsSigned and whether the DestTy is wider or narrowe...
LLVM_ABI bool isKnownNonNegative(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Returns true if the given value is known to be non-negative.
LLVM_ABI bool mayHaveNonDefUseDependency(const Instruction &I)
Returns true if the result or effects of the given instruction I depend on values not reachable through...
LLVM_ABI bool isTriviallyVectorizable(Intrinsic::ID ID)
Identify if the intrinsic is trivially vectorizable.
constexpr detail::IsaCheckPredicate< Types... > IsaPred
Function object wrapper for the llvm::isa type check.
LLVM_ABI bool isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic is overloaded on the type of the operand at index OpdI...
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Used to keep track of an operand bundle.
static LLVM_ABI void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
BoUpSLP::TreeEntry TreeEntry
std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R)
static std::string getNodeAttributes(const TreeEntry *Entry, const BoUpSLP *)
DOTGraphTraits(bool IsSimple=false)
DOTGraphTraits - Template class that can be specialized to customize how graphs are converted to 'dot...
DefaultDOTGraphTraits(bool simple=false)
DenseMapInfo< BoUpSLP::TreeEntry * > FirstInfo
static bool isEqual(const BoUpSLP::EdgeInfo &LHS, const BoUpSLP::EdgeInfo &RHS)
static BoUpSLP::EdgeInfo getEmptyKey()
DenseMapInfo< unsigned > SecondInfo
static unsigned getHashValue(const BoUpSLP::EdgeInfo &Val)
static BoUpSLP::EdgeInfo getTombstoneKey()
An information struct used to provide DenseMap with the various necessary components for a given valu...
Add the VectorizableTree to the index iterator to be able to return TreeEntry pointers.
ChildIteratorType(SmallVector< BoUpSLP::EdgeInfo, 1 >::iterator W, ContainerTy &VT)
ContainerTy & VectorizableTree
static ChildIteratorType child_end(NodeRef N)
static NodeRef getEntryNode(BoUpSLP &R)
static ChildIteratorType child_begin(NodeRef N)
static nodes_iterator nodes_begin(BoUpSLP *R)
TreeEntry * NodeRef
NodeRef has to be a pointer per the GraphWriter.
static unsigned size(BoUpSLP *R)
BoUpSLP::TreeEntry TreeEntry
static nodes_iterator nodes_end(BoUpSLP *R)
BoUpSLP::TreeEntry::VecTreeTy ContainerTy
Incoming for lane mask phi as machine instruction, incoming register Reg and incoming block Block are ...
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
TargetTransformInfo * TTI
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
bool runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_, TargetLibraryInfo *TLI_, AAResults *AA_, LoopInfo *LI_, DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_, OptimizationRemarkEmitter *ORE_)
A MapVector that performs no allocations if smaller than a certain size.
static VFShape get(const FunctionType *FTy, ElementCount EC, bool HasGlobalPred)
Retrieve the basic vectorization shape of the function, where all parameters are mapped to VFParamKin...
Function object to check whether the first component of a container supported by std::get (like std::...
Function object to check whether the second component of a container supported by std::get (like std:...
This structure holds any data we need about the edges being traversed during buildTreeRec().
bool operator==(const EdgeInfo &Other) const
EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
unsigned EdgeIdx
The operand index of the use.
void dump(raw_ostream &OS) const
Debug print.
LLVM_DUMP_METHOD void dump() const
TreeEntry * UserTE
The user TreeEntry.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::EdgeInfo &EI)