#ifdef EXPENSIVE_CHECKS

using namespace llvm;
using namespace slpvectorizer;

#define SV_NAME "slp-vectorizer"
#define DEBUG_TYPE "SLP"

STATISTIC(NumVectorInstructions, "Number of vector instructions generated");

    cl::desc("Run the SLP vectorization passes"));

    cl::desc("Only vectorize if you gain more than this "

    cl::desc("Attempt to vectorize horizontal reductions"));

        "Attempt to vectorize horizontal reductions feeding into a store"));

    cl::desc("Attempt to vectorize for this register size in bits"));

    cl::desc("Maximum SLP vectorization factor (0=unlimited)"));

    cl::desc("Maximum depth of the lookup for consecutive stores."));

    cl::desc("Limit the size of the SLP scheduling region per block"));

    cl::desc("Attempt to vectorize for this register size in bits"));

    cl::desc("Limit the recursion depth when building a vectorizable tree"));

    cl::desc("Only vectorize small trees if they are fully vectorizable"));

    cl::desc("The maximum look-ahead depth for operand reordering scores"));

    cl::desc("The maximum look-ahead depth for searching best rooting option"));

    cl::desc("Display the SLP trees with Graphviz"));
  return isa<Constant>(V) && !isa<ConstantExpr>(V) && !isa<GlobalValue>(V);

  if (!isa<InsertElementInst, ExtractElementInst>(V) &&
      !isa<ExtractValueInst, UndefValue>(V))
  auto *I = dyn_cast<Instruction>(V);
  if (!I || isa<ExtractValueInst>(I))
  if (!isa<FixedVectorType>(I->getOperand(0)->getType()))
  if (isa<ExtractElementInst>(I))
  assert(isa<InsertElementInst>(V) && "Expected only insertelement.");
  for (int I = 1, E = VL.size(); I < E; I++) {
    auto *II = dyn_cast<Instruction>(VL[I]);
    if (BB != II->getParent())

  Value *FirstNonUndef = nullptr;
  for (Value *V : VL) {
    if (isa<UndefValue>(V))
    if (!FirstNonUndef) {
    if (V != FirstNonUndef)
  return FirstNonUndef != nullptr;
  if (auto *Cmp = dyn_cast<CmpInst>(I))
    return Cmp->isCommutative();
  if (auto *BO = dyn_cast<BinaryOperator>(I))
    return BO->isCommutative();

  if (isa<UndefValue>(V))
  auto *C = dyn_cast<Constant>(V);
  if (!C->containsUndefOrPoisonElement())
  auto *VecTy = dyn_cast<FixedVectorType>(C->getType());
  for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
    if (!isa<UndefValue>(Elem))
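// Recognizes a list of extractelements taken from one or two fixed-width
// source vectors as a single shuffle: "Select" if every element keeps its
// original lane, "Permute" otherwise.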
      find_if(VL, [](Value *V) { return isa<ExtractElementInst>(V); });
  auto *EI0 = cast<ExtractElementInst>(*It);
  if (isa<ScalableVectorType>(EI0->getVectorOperandType()))
      cast<FixedVectorType>(EI0->getVectorOperandType())->getNumElements();
  Value *Vec1 = nullptr;
  Value *Vec2 = nullptr;
  enum ShuffleMode { Unknown, Select, Permute };
  ShuffleMode CommonShuffleMode = Unknown;
  for (unsigned I = 0, E = VL.size(); I < E; ++I) {
    if (isa<UndefValue>(VL[I]))
    auto *EI = cast<ExtractElementInst>(VL[I]);
    if (isa<ScalableVectorType>(EI->getVectorOperandType()))
    auto *Vec = EI->getVectorOperand();
    if (cast<FixedVectorType>(Vec->getType())->getNumElements() != Size)
    if (isa<UndefValue>(EI->getIndexOperand()))
    auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
    if (Idx->getValue().uge(Size))
    unsigned IntIdx = Idx->getValue().getZExtValue();
    if (!Vec1 || Vec1 == Vec) {
    } else if (!Vec2 || Vec2 == Vec) {
    if (CommonShuffleMode == Permute)
    CommonShuffleMode = Permute;
    CommonShuffleMode = Select;
  if (CommonShuffleMode == Select && Vec2)
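// InstructionsState records the main and alternate opcode chosen for a
// bundle of scalars; MainOp == AltOp means a single-opcode bundle, otherwise
// the bundle is emitted as an alternate-opcode (shuffle) pattern.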
struct InstructionsState {
  Value *OpValue = nullptr;

  unsigned getAltOpcode() const {

  bool isAltShuffle() const { return AltOp != MainOp; }

    unsigned CheckedOpcode = I->getOpcode();
    return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;

  InstructionsState() = delete;
      : OpValue(OpValue), MainOp(MainOp), AltOp(AltOp) {}

    auto *I = dyn_cast<Instruction>(Op);
    if (I && S.isOpcodeOrAlt(I))
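// The routine below scans VL and decides whether the scalars share one
// opcode or form a two-opcode main/alternate pattern: binary operators,
// casts, and compares (including swapped-predicate compares) may alternate;
// any other mismatch yields a state with a null MainOp.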
                                       unsigned BaseIndex = 0);

         (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) &&
          !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) ||

                                       unsigned BaseIndex) {
    return InstructionsState(VL[BaseIndex], nullptr, nullptr);
  bool IsCastOp = isa<CastInst>(VL[BaseIndex]);
  bool IsBinOp = isa<BinaryOperator>(VL[BaseIndex]);
  bool IsCmpOp = isa<CmpInst>(VL[BaseIndex]);
      IsCmpOp ? cast<CmpInst>(VL[BaseIndex])->getPredicate()
  unsigned Opcode = cast<Instruction>(VL[BaseIndex])->getOpcode();
  unsigned AltOpcode = Opcode;
  unsigned AltIndex = BaseIndex;

  for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) {
    unsigned InstOpcode = cast<Instruction>(VL[Cnt])->getOpcode();
    if (IsBinOp && isa<BinaryOperator>(VL[Cnt])) {
      if (InstOpcode == Opcode || InstOpcode == AltOpcode)
        AltOpcode = InstOpcode;
    } else if (IsCastOp && isa<CastInst>(VL[Cnt])) {
      Type *Ty0 = cast<Instruction>(VL[BaseIndex])->getOperand(0)->getType();
      Type *Ty1 = cast<Instruction>(VL[Cnt])->getOperand(0)->getType();
        if (InstOpcode == Opcode || InstOpcode == AltOpcode)
        if (Opcode == AltOpcode) {
                 "Cast isn't safe for alternation, logic needs to be updated!");
          AltOpcode = InstOpcode;
    } else if (IsCmpOp && isa<CmpInst>(VL[Cnt])) {
      auto *BaseInst = cast<Instruction>(VL[BaseIndex]);
      auto *Inst = cast<Instruction>(VL[Cnt]);
      Type *Ty0 = BaseInst->getOperand(0)->getType();
      Type *Ty1 = Inst->getOperand(0)->getType();
        Value *BaseOp0 = BaseInst->getOperand(0);
        Value *BaseOp1 = BaseInst->getOperand(1);
        Value *Op0 = Inst->getOperand(0);
        Value *Op1 = Inst->getOperand(1);
            cast<CmpInst>(VL[Cnt])->getPredicate();
        if (InstOpcode == Opcode) {
          if (BasePred == CurrentPred &&
          if (BasePred == SwappedCurrentPred &&
              (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
          auto *AltInst = cast<CmpInst>(VL[AltIndex]);
          Value *AltOp0 = AltInst->getOperand(0);
          Value *AltOp1 = AltInst->getOperand(1);
          if (AltPred == CurrentPred &&
          if (AltPred == SwappedCurrentPred &&
        if (BaseIndex == AltIndex && BasePred != CurrentPred) {
                 "Cast isn't safe for alternation, logic needs to be updated!");
          auto *AltInst = cast<CmpInst>(VL[AltIndex]);
          if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
              AltPred == CurrentPred || AltPred == SwappedCurrentPred)
    } else if (InstOpcode == Opcode || InstOpcode == AltOpcode)
    return InstructionsState(VL[BaseIndex], nullptr, nullptr);

  return InstructionsState(VL[BaseIndex], cast<Instruction>(VL[BaseIndex]),
                           cast<Instruction>(VL[AltIndex]));
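// Next: a check that all values in VL share the type of VL[0], and a helper
// returning the constant lane index of an extractelement / extractvalue.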
  Type *Ty = VL[0]->getType();
  for (int i = 1, e = VL.size(); i < e; i++)

  unsigned Opcode = E->getOpcode();
  assert((Opcode == Instruction::ExtractElement ||
          Opcode == Instruction::ExtractValue) &&
         "Expected extractelement or extractvalue instruction.");
  if (Opcode == Instruction::ExtractElement) {
    auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
    return CI->getZExtValue();
    LoadInst *LI = cast<LoadInst>(UserInst);
    return (SI->getPointerOperand() == Scalar);
    CallInst *CI = cast<CallInst>(UserInst);

  if (LoadInst *LI = dyn_cast<LoadInst>(I))

  if (LoadInst *LI = dyn_cast<LoadInst>(I))
    return LI->isSimple();
    return SI->isSimple();
    return !MI->isVolatile();
  for (int I = 0, E = SubMask.size(); I < E; ++I) {
        Mask[SubMask[I]] >= TermValue)
    NewMask[I] = Mask[SubMask[I]];
  const unsigned Sz = Order.size();
  for (unsigned I = 0; I < Sz; ++I) {
      UnusedIndices.reset(Order[I]);
      MaskedIndices.set(I);
  if (MaskedIndices.none())
         "Non-synced masked/available indices.");
    assert(Idx >= 0 && "Indices must be synced.");

  const unsigned E = Indices.size();
  for (unsigned I = 0; I < E; ++I)
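// Computes the flat insertion index of an insertelement / insertvalue: the
// constant lane operands are accumulated while scaling by the element count
// of each nested vector, array, or struct level.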
                              unsigned Offset = 0) {
  if (const auto *IE = dyn_cast<InsertElementInst>(InsertInst)) {
    if (const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2))) {
      auto *VT = cast<FixedVectorType>(IE->getType());
      if (CI->getValue().uge(VT->getNumElements()))
      Index *= VT->getNumElements();
      Index += CI->getZExtValue();

  const auto *IV = cast<InsertValueInst>(InsertInst);
  Type *CurrentType = IV->getType();
  for (unsigned I : IV->indices()) {
    if (const auto *ST = dyn_cast<StructType>(CurrentType)) {
      CurrentType = ST->getElementType(I);
    } else if (const auto *AT = dyn_cast<ArrayType>(CurrentType)) {
      Index *= AT->getNumElements();
      CurrentType = AT->getElementType();

  assert(!Mask.empty() && "Expected non-empty mask.");
  for (unsigned I = 0, E = Prev.size(); I < E; ++I)
      Scalars[Mask[I]] = Prev[I];
    auto *I = dyn_cast<Instruction>(V);
      auto *IO = dyn_cast<Instruction>(V);
      return isa<PHINode>(IO) || IO->getParent() != I->getParent();

    auto *I = dyn_cast<Instruction>(V);
    constexpr int UsesLimit = 8;
    return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(UsesLimit) &&
             auto *IU = dyn_cast<Instruction>(U);
             return IU->getParent() != I->getParent() || isa<PHINode>(IU);

  return !VL.empty() &&
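// --- Bottom-up SLP vectorizer driver (BoUpSLP) ---
// The constructor below wires up the analyses; the clearing code after it
// resets all per-tree state (vectorizable tree, scalar-to-entry map,
// external uses, per-block schedules) so the object can be reused for the
// next seed bundle.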
namespace slpvectorizer {

      : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li),
        DT(Dt), AC(AC), DB(DB), DL(DL), ORE(ORE), Builder(Se->getContext()) {

  Value *vectorizeTree();

  Value *vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues);

  buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues = {});

    VectorizableTree.clear();
    ScalarToTreeEntry.clear();
    ExternalUses.clear();
    for (auto &Iter : BlocksSchedules) {
      BlockScheduling *BS = Iter.second.get();
    InstrElementSize.clear();
    UserIgnoreList = nullptr;
  void optimizeGatherSequence();

  void reorderTopToBottom();

  void reorderBottomToTop(bool IgnoreReorder = false);

  unsigned getVectorElementSize(Value *V);

    return MaxVecRegSize;
    return MinVecRegSize;
    return std::max(2U, getMinVecRegSize() / Sz);
    return MaxVF ? MaxVF : UINT_MAX;

  bool isTreeTinyAndNotFullyVectorizable(bool ForReduction = false) const;

  bool isLoadCombineReductionCandidate(RecurKind RdxKind) const;
        : UserTE(UserTE), EdgeIdx(EdgeIdx) {}
    TreeEntry *UserTE = nullptr;
    unsigned EdgeIdx = UINT_MAX;

      OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
         << " EdgeIdx:" << EdgeIdx << "}";

                         const BoUpSLP &R, int NumLanes, int MaxLevel)
        : DL(DL), SE(SE), R(R), NumLanes(NumLanes), MaxLevel(MaxLevel) {}
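// Look-ahead operand-reordering scores, best to worst: consecutive loads and
// extracts score highest, reversed accesses and matching opcodes less,
// undefs barely, and ScoreFail means the two values should not share a lane.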
    static const int ScoreConsecutiveLoads = 4;
    static const int ScoreSplatLoads = 3;
    static const int ScoreReversedLoads = 3;
    static const int ScoreConsecutiveExtracts = 4;
    static const int ScoreReversedExtracts = 3;
    static const int ScoreConstants = 2;
    static const int ScoreSameOpcode = 2;
    static const int ScoreAltOpcodes = 1;
    static const int ScoreSplat = 1;
    static const int ScoreUndef = 1;
    static const int ScoreFail = 0;
    static const int ScoreAllUserVectorized = 1;
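// Scores a single pair of values without recursion: splat / broadcastable
// loads, consecutive or reversed loads, equal constants, compatible
// extractelements, and same- or alternate-opcode instructions each map onto
// one of the score constants above.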
      if (isa<LoadInst>(V1)) {
        auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) {
          static constexpr unsigned Limit = 8;
          if (V1->hasNUsesOrMore(Limit) || V2->hasNUsesOrMore(Limit))
          auto AllUsersVectorized = [U1, U2, this](Value *V) {
              return U == U1 || U == U2 || R.getTreeEntry(U) != nullptr;
          return AllUsersVectorized(V1) && AllUsersVectorized(V2);
        if (R.TTI->isLegalBroadcastLoad(V1->getType(),
            ((int)V1->getNumUses() == NumLanes ||
             AllUsersAreInternal(V1, V2)))
          return LookAheadHeuristics::ScoreSplatLoads;
        return LookAheadHeuristics::ScoreSplat;
      auto *LI1 = dyn_cast<LoadInst>(V1);
      auto *LI2 = dyn_cast<LoadInst>(V2);
        if (LI1->getParent() != LI2->getParent())
          return LookAheadHeuristics::ScoreFail;
            LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
            LI2->getPointerOperand(), DL, SE, true);
        if (!Dist || *Dist == 0)
          return LookAheadHeuristics::ScoreFail;
        if (std::abs(*Dist) > NumLanes / 2)
          return LookAheadHeuristics::ScoreAltOpcodes;
        return (*Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveLoads
                           : LookAheadHeuristics::ScoreReversedLoads;

      auto *C1 = dyn_cast<Constant>(V1);
      auto *C2 = dyn_cast<Constant>(V2);
        return LookAheadHeuristics::ScoreConstants;

        if (isa<UndefValue>(V2))
          return LookAheadHeuristics::ScoreConsecutiveExtracts;
        Value *EV2 = nullptr;
          return LookAheadHeuristics::ScoreConsecutiveExtracts;
          return LookAheadHeuristics::ScoreConsecutiveExtracts;
          int Dist = Idx2 - Idx1;
            return LookAheadHeuristics::ScoreSplat;
            return LookAheadHeuristics::ScoreSameOpcode;
          return (Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveExtracts
                            : LookAheadHeuristics::ScoreReversedExtracts;
        return LookAheadHeuristics::ScoreAltOpcodes;
      return LookAheadHeuristics::ScoreFail;
      auto *I1 = dyn_cast<Instruction>(V1);
      auto *I2 = dyn_cast<Instruction>(V2);
        if (I1->getParent() != I2->getParent())
          return LookAheadHeuristics::ScoreFail;
        if (S.getOpcode() &&
            (S.MainOp->getNumOperands() <= 2 || !MainAltOps.empty() ||
             !S.isAltShuffle()) &&
              return cast<Instruction>(V)->getNumOperands() ==
                     S.MainOp->getNumOperands();
          return S.isAltShuffle() ? LookAheadHeuristics::ScoreAltOpcodes
                                  : LookAheadHeuristics::ScoreSameOpcode;

      if (isa<UndefValue>(V2))
        return LookAheadHeuristics::ScoreUndef;

      return LookAheadHeuristics::ScoreFail;
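// Recursive look-ahead score: the score of (LHS, RHS) is their shallow score
// plus the best achievable pairing of their operands, explored up to
// MaxLevel levels deep, with each operand of I2 matched at most once.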
      int ShallowScoreAtThisLevel =
          getShallowScore(LHS, RHS, U1, U2, MainAltOps);

      auto *I1 = dyn_cast<Instruction>(LHS);
      auto *I2 = dyn_cast<Instruction>(RHS);
      if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
          ShallowScoreAtThisLevel == LookAheadHeuristics::ScoreFail ||
          (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||
            (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
            (isa<ExtractElementInst>(I1) && isa<ExtractElementInst>(I2))) &&
           ShallowScoreAtThisLevel))
        return ShallowScoreAtThisLevel;
      assert(I1 && I2 && "Should have early exited.");

      for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
           OpIdx1 != NumOperands1; ++OpIdx1) {
        int MaxTmpScore = 0;
        unsigned MaxOpIdx2 = 0;
        bool FoundBest = false;
                ? I2->getNumOperands()
                : std::min(I2->getNumOperands(), OpIdx1 + 1);
        assert(FromIdx <= ToIdx && "Bad index");
        for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
          if (Op2Used.count(OpIdx2))
              getScoreAtLevelRec(I1->getOperand(OpIdx1), I2->getOperand(OpIdx2),
                                 I1, I2, CurrLevel + 1, None);
          if (TmpScore > LookAheadHeuristics::ScoreFail &&
              TmpScore > MaxTmpScore) {
            MaxTmpScore = TmpScore;
          Op2Used.insert(MaxOpIdx2);
          ShallowScoreAtThisLevel += MaxTmpScore;
      return ShallowScoreAtThisLevel;
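// VLOperands: a per-lane operand table used to reorder the operands of
// commutative bundles. Each OperandData holds the operand value, the APO
// flag (set for operand positions of non-commutative operations, computed
// from isCommutative() below), and whether the operand was already consumed
// by the reordering.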
1349 struct OperandData {
1350 OperandData() =
default;
1351 OperandData(
Value *V,
bool APO,
bool IsUsed)
1352 : V(V), APO(APO), IsUsed(IsUsed) {}
1362 bool IsUsed =
false;
1371 enum class ReorderingMode {
1389 OperandData &getData(
unsigned OpIdx,
unsigned Lane) {
1390 return OpsVec[OpIdx][Lane];
1394 const OperandData &getData(
unsigned OpIdx,
unsigned Lane)
const {
1395 return OpsVec[OpIdx][Lane];
1400 for (
unsigned OpIdx = 0, NumOperands = getNumOperands();
1401 OpIdx != NumOperands; ++OpIdx)
1402 for (
unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
1404 OpsVec[OpIdx][Lane].IsUsed =
false;
1408 void swap(
unsigned OpIdx1,
unsigned OpIdx2,
unsigned Lane) {
1409 std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
1421 int getSplatScore(
unsigned Lane,
unsigned OpIdx,
unsigned Idx)
const {
1422 Value *IdxLaneV = getData(Idx, Lane).V;
1423 if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V)
1426 for (
unsigned Ln = 0,
E = getNumLanes(); Ln <
E; ++Ln) {
1429 Value *OpIdxLnV = getData(OpIdx, Ln).V;
1430 if (!isa<Instruction>(OpIdxLnV))
1432 Uniques.
insert(OpIdxLnV);
1434 int UniquesCount = Uniques.
size();
1435 int UniquesCntWithIdxLaneV =
1436 Uniques.
contains(IdxLaneV) ? UniquesCount : UniquesCount + 1;
1437 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
1438 int UniquesCntWithOpIdxLaneV =
1439 Uniques.
contains(OpIdxLaneV) ? UniquesCount : UniquesCount + 1;
1440 if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
1443 UniquesCntWithOpIdxLaneV) -
1444 (
PowerOf2Ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
1453 int getExternalUseScore(
unsigned Lane,
unsigned OpIdx,
unsigned Idx)
const {
1454 Value *IdxLaneV = getData(Idx, Lane).V;
1455 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
1463 return LookAheadHeuristics::ScoreAllUserVectorized;
1464 auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV);
1465 if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV))
1467 return R.areAllUsersVectorized(IdxLaneI,
None)
1468 ? LookAheadHeuristics::ScoreAllUserVectorized
1475 static const int ScoreScaleFactor = 10;
1483 int Lane,
unsigned OpIdx,
unsigned Idx,
1493 int SplatScore = getSplatScore(Lane, OpIdx, Idx);
1494 if (Score <= -SplatScore) {
1499 Score += SplatScore;
1505 Score *= ScoreScaleFactor;
1506 Score += getExternalUseScore(Lane, OpIdx, Idx);
1527 unsigned NumOperands = getNumOperands();
1530 Value *OpLastLane = getData(OpIdx, LastLane).V;
1533 ReorderingMode RMode = ReorderingModes[OpIdx];
1538 bool OpIdxAPO = getData(OpIdx, Lane).APO;
1548 BestScoresPerLanes.
try_emplace(std::make_pair(OpIdx, Lane), 0)
1557 for (
unsigned Idx = 0; Idx != NumOperands; ++Idx) {
1559 OperandData &OpData = getData(Idx, Lane);
1561 bool OpAPO = OpData.APO;
1570 if (OpAPO != OpIdxAPO)
1577 case ReorderingMode::Opcode: {
1578 bool LeftToRight = Lane > LastLane;
1579 Value *OpLeft = (LeftToRight) ? OpLastLane :
Op;
1580 Value *OpRight = (LeftToRight) ?
Op : OpLastLane;
1581 int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
1582 OpIdx, Idx, IsUsed);
1583 if (Score >
static_cast<int>(BestOp.Score)) {
1585 BestOp.Score = Score;
1586 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
1590 case ReorderingMode::Splat:
1591 if (
Op == OpLastLane)
1600 getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
1611 unsigned getBestLaneToStartReordering()
const {
1612 unsigned Min = UINT_MAX;
1613 unsigned SameOpNumber = 0;
1624 for (
int I = getNumLanes();
I > 0; --
I) {
1625 unsigned Lane =
I - 1;
1626 OperandsOrderData NumFreeOpsHash =
1627 getMaxNumOperandsThatCanBeReordered(Lane);
1630 if (NumFreeOpsHash.NumOfAPOs < Min) {
1631 Min = NumFreeOpsHash.NumOfAPOs;
1632 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
1634 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
1635 }
else if (NumFreeOpsHash.NumOfAPOs == Min &&
1636 NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
1639 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
1640 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
1641 }
else if (NumFreeOpsHash.NumOfAPOs == Min &&
1642 NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
1643 auto It = HashMap.
find(NumFreeOpsHash.Hash);
1644 if (It == HashMap.
end())
1645 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
1651 unsigned BestLane = 0;
1652 unsigned CntMin = UINT_MAX;
1654 if (
Data.second.first < CntMin) {
1655 CntMin =
Data.second.first;
1656 BestLane =
Data.second.second;
1663 struct OperandsOrderData {
1666 unsigned NumOfAPOs = UINT_MAX;
1669 unsigned NumOpsWithSameOpcodeParent = 0;
1683 OperandsOrderData getMaxNumOperandsThatCanBeReordered(
unsigned Lane)
const {
1684 unsigned CntTrue = 0;
1685 unsigned NumOperands = getNumOperands();
1695 bool AllUndefs =
true;
1696 unsigned NumOpsWithSameOpcodeParent = 0;
1700 for (
unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
1701 const OperandData &OpData = getData(OpIdx, Lane);
1706 if (
auto *
I = dyn_cast<Instruction>(OpData.V)) {
1708 I->getParent() != Parent) {
1709 if (NumOpsWithSameOpcodeParent == 0) {
1710 NumOpsWithSameOpcodeParent = 1;
1712 Parent =
I->getParent();
1714 --NumOpsWithSameOpcodeParent;
1717 ++NumOpsWithSameOpcodeParent;
1721 Hash,
hash_value((OpIdx + 1) * (OpData.V->getValueID() + 1)));
1722 AllUndefs = AllUndefs && isa<UndefValue>(OpData.V);
1726 OperandsOrderData
Data;
1727 Data.NumOfAPOs =
std::max(CntTrue, NumOperands - CntTrue);
1728 Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
1737 "Expected same number of lanes");
1738 assert(isa<Instruction>(VL[0]) &&
"Expected instruction");
1739 unsigned NumOperands = cast<Instruction>(VL[0])->getNumOperands();
1740 OpsVec.
resize(NumOperands);
1741 unsigned NumLanes = VL.
size();
1742 for (
unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
1743 OpsVec[OpIdx].
resize(NumLanes);
1744 for (
unsigned Lane = 0; Lane != NumLanes; ++Lane) {
1745 assert(isa<Instruction>(VL[Lane]) &&
"Expected instruction");
1756 bool IsInverseOperation = !
isCommutative(cast<Instruction>(VL[Lane]));
1757 bool APO = (OpIdx == 0) ?
false : IsInverseOperation;
1758 OpsVec[OpIdx][Lane] = {cast<Instruction>(VL[Lane])->getOperand(OpIdx),
1765 unsigned getNumOperands()
const {
return OpsVec.size(); }
1768 unsigned getNumLanes()
const {
return OpsVec[0].size(); }
1771 Value *getValue(
unsigned OpIdx,
unsigned Lane)
const {
1772 return getData(OpIdx, Lane).V;
1776 bool empty()
const {
return OpsVec.empty(); }
1784 bool shouldBroadcast(
Value *
Op,
unsigned OpIdx,
unsigned Lane) {
1785 bool OpAPO = getData(OpIdx, Lane).APO;
1786 for (
unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
1790 bool FoundCandidate =
false;
1791 for (
unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
1792 OperandData &
Data = getData(OpI, Ln);
1793 if (
Data.APO != OpAPO ||
Data.IsUsed)
1796 FoundCandidate =
true;
1801 if (!FoundCandidate)
1811 :
DL(
DL), SE(SE), R(R) {
1813 appendOperandsOfVL(RootVL);
1820 assert(OpsVec[OpIdx].
size() == getNumLanes() &&
1821 "Expected same num of lanes across all operands");
1822 for (
unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
1823 OpVL[Lane] = OpsVec[OpIdx][Lane].V;
1831 unsigned NumOperands = getNumOperands();
1832 unsigned NumLanes = getNumLanes();
1852 unsigned FirstLane = getBestLaneToStartReordering();
1855 for (
unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
1856 Value *OpLane0 = getValue(OpIdx, FirstLane);
1859 if (isa<LoadInst>(OpLane0))
1861 else if (isa<Instruction>(OpLane0)) {
1863 if (shouldBroadcast(OpLane0, OpIdx, FirstLane))
1864 ReorderingModes[OpIdx] = ReorderingMode::Splat;
1866 ReorderingModes[OpIdx] = ReorderingMode::Opcode;
1868 else if (isa<Constant>(OpLane0))
1870 else if (isa<Argument>(OpLane0))
1872 ReorderingModes[OpIdx] = ReorderingMode::Splat;
1882 auto &&SkipReordering = [
this]() {
1885 for (
const OperandData &
Data : Op0)
1888 if (
any_of(
Op, [&UniqueValues](
const OperandData &
Data) {
1907 if (SkipReordering())
1910 bool StrategyFailed =
false;
1918 for (
unsigned I = 0;
I < NumOperands; ++
I)
1919 MainAltOps[
I].push_back(getData(
I, FirstLane).V);
1921 for (
unsigned Distance = 1; Distance != NumLanes; ++Distance) {
1924 int Lane = FirstLane +
Direction * Distance;
1925 if (Lane < 0 || Lane >= (
int)NumLanes)
1928 assert(LastLane >= 0 && LastLane < (
int)NumLanes &&
1931 for (
unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
1934 OpIdx, Lane, LastLane, ReorderingModes, MainAltOps[OpIdx]);
1941 swap(OpIdx, *BestIdx, Lane);
1946 StrategyFailed =
true;
1949 if (MainAltOps[OpIdx].
size() != 2) {
1950 OperandData &AltOp = getData(OpIdx, Lane);
1951 InstructionsState OpS =
1953 if (OpS.getOpcode() && OpS.isAltShuffle())
1954 MainAltOps[OpIdx].push_back(AltOp.V);
1960 if (!StrategyFailed)
1965 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1970 case ReorderingMode::Opcode:
1974 case ReorderingMode::Splat:
1984 return OS << getModeStr(RMode);
1989 printMode(RMode,
dbgs());
1993 return printMode(RMode, OS);
1997 const unsigned Indent = 2;
2000 OS <<
"Operand " << Cnt++ <<
"\n";
2001 for (
const OperandData &OpData : OpDataVec) {
2002 OS.
indent(Indent) <<
"{";
2003 if (
Value *V = OpData.V)
2007 OS <<
", APO:" << OpData.APO <<
"}\n";
2026 int Limit = LookAheadHeuristics::ScoreFail) {
2029 int BestScore = Limit;
2031 for (
int I : seq<int>(0, Candidates.size())) {
2033 Candidates[
I].second,
2036 if (Score > BestScore) {
2051 DeletedInstructions.insert(
I);
2057 return AnalyzedReductionsRoots.count(
I);
2062 AnalyzedReductionsRoots.insert(
I);
2067 return AnalyzedReductionVals.contains(
hash_value(VL));
2072 AnalyzedReductionVals.insert(
hash_value(VL));
2076 AnalyzedReductionsRoots.clear();
2077 AnalyzedReductionVals.clear();
2095 canReorderOperands(TreeEntry *UserTE,
2102 TreeEntry *getVectorizedOperand(TreeEntry *UserTE,
unsigned OpIdx) {
2104 TreeEntry *TE =
nullptr;
2106 TE = getTreeEntry(V);
2109 if (It != VL.
end() && TE->isSame(VL))
2116 const TreeEntry *getVectorizedOperand(
const TreeEntry *UserTE,
2117 unsigned OpIdx)
const {
2118 return const_cast<BoUpSLP *
>(
this)->getVectorizedOperand(
2119 const_cast<TreeEntry *
>(UserTE), OpIdx);
2132 const EdgeInfo &EI);
2143 Value *vectorizeTree(TreeEntry *
E);
2158 const APInt &ShuffledIndices,
2159 bool NeedToShuffle)
const;
2176 void setInsertPointAfterBundle(
const TreeEntry *
E);
2183 bool isFullyVectorizableTinyTree(
bool ForReduction)
const;
2198 collectUserStores(
const BoUpSLP::TreeEntry *TE)
const;
2205 OrdersType &ReorderIndices)
const;
2214 findExternalStoreUsersReorderIndices(TreeEntry *TE)
const;
2218 TreeEntry(VecTreeTy &Container) : Container(Container) {}
2227 [Scalars](
Value *V,
int Idx) {
2228 return (isa<UndefValue>(V) &&
2229 Idx == UndefMaskElem) ||
2230 (Idx != UndefMaskElem && V == Scalars[Idx]);
2233 if (!ReorderIndices.empty()) {
2240 return IsSame(Scalars,
Mask);
2241 if (VL.
size() == ReuseShuffleIndices.size()) {
2243 return IsSame(Scalars,
Mask);
2247 return IsSame(Scalars, ReuseShuffleIndices);
2251 bool hasEqualOperands(
const TreeEntry &TE)
const {
2252 if (
TE.getNumOperands() != getNumOperands())
2255 for (
unsigned I = 0,
E = getNumOperands();
I <
E; ++
I) {
2256 unsigned PrevCount =
Used.count();
2257 for (
unsigned K = 0; K <
E; ++K) {
2260 if (getOperand(K) ==
TE.getOperand(
I)) {
2266 if (PrevCount ==
Used.count())
2275 unsigned getVectorFactor()
const {
2276 if (!ReuseShuffleIndices.empty())
2277 return ReuseShuffleIndices.size();
2278 return Scalars.
size();
2285 Value *VectorizedValue =
nullptr;
2290 enum EntryState { Vectorize, ScatterVectorize, NeedToGather };
2305 VecTreeTy &Container;
2331 "Number of operands is greater than the number of scalars.");
2337 void setOperandsInOrder() {
2339 auto *I0 = cast<Instruction>(Scalars[0]);
2340 Operands.resize(I0->getNumOperands());
2341 unsigned NumLanes = Scalars.size();
2342 for (
unsigned OpIdx = 0, NumOperands = I0->getNumOperands();
2343 OpIdx != NumOperands; ++OpIdx) {
2345 for (
unsigned Lane = 0; Lane != NumLanes; ++Lane) {
2346 auto *
I = cast<Instruction>(Scalars[Lane]);
2347 assert(
I->getNumOperands() == NumOperands &&
2348 "Expected same number of operands");
2349 Operands[OpIdx][Lane] =
I->getOperand(OpIdx);
2356 for (ValueList &Operand :
Operands)
2361 ValueList &getOperand(
unsigned OpIdx) {
2373 unsigned getNumOperands()
const {
return Operands.size(); }
2376 Value *getSingleOperand(
unsigned OpIdx)
const {
2383 bool isAltShuffle()
const {
return MainOp != AltOp; }
2386 unsigned CheckedOpcode =
I->getOpcode();
2388 getAltOpcode() == CheckedOpcode);
2395 auto *
I = dyn_cast<Instruction>(
Op);
2396 if (
I && isOpcodeOrAlt(
I))
2401 void setOperations(
const InstructionsState &
S) {
2416 return MainOp ? MainOp->
getOpcode() : 0;
2419 unsigned getAltOpcode()
const {
2425 int findLaneForValue(
Value *V)
const {
2426 unsigned FoundLane = std::distance(Scalars.begin(),
find(Scalars, V));
2427 assert(FoundLane < Scalars.size() &&
"Couldn't find extract lane");
2428 if (!ReorderIndices.empty())
2429 FoundLane = ReorderIndices[FoundLane];
2430 assert(FoundLane < Scalars.size() &&
"Couldn't find extract lane");
2431 if (!ReuseShuffleIndices.empty()) {
2432 FoundLane = std::distance(ReuseShuffleIndices.begin(),
2433 find(ReuseShuffleIndices, FoundLane));
2441 dbgs() << Idx <<
".\n";
2442 for (
unsigned OpI = 0, OpE =
Operands.size(); OpI != OpE; ++OpI) {
2443 dbgs() <<
"Operand " << OpI <<
":\n";
2447 dbgs() <<
"Scalars: \n";
2448 for (
Value *V : Scalars)
2450 dbgs() <<
"State: ";
2453 dbgs() <<
"Vectorize\n";
2455 case ScatterVectorize:
2456 dbgs() <<
"ScatterVectorize\n";
2459 dbgs() <<
"NeedToGather\n";
2462 dbgs() <<
"MainOp: ";
2464 dbgs() << *MainOp <<
"\n";
2467 dbgs() <<
"AltOp: ";
2469 dbgs() << *AltOp <<
"\n";
2472 dbgs() <<
"VectorizedValue: ";
2473 if (VectorizedValue)
2474 dbgs() << *VectorizedValue <<
"\n";
2477 dbgs() <<
"ReuseShuffleIndices: ";
2478 if (ReuseShuffleIndices.empty())
2481 for (
int ReuseIdx : ReuseShuffleIndices)
2482 dbgs() << ReuseIdx <<
", ";
2484 dbgs() <<
"ReorderIndices: ";
2485 for (
unsigned ReorderIdx : ReorderIndices)
2486 dbgs() << ReorderIdx <<
", ";
2488 dbgs() <<
"UserTreeIndices: ";
2489 for (
const auto &EInfo : UserTreeIndices)
2490 dbgs() << EInfo <<
", ";
2500 dbgs() <<
"SLP: Calculated costs for Tree:\n";
E->dump();
2501 dbgs() <<
"SLP: Costs:\n";
2502 dbgs() <<
"SLP: ReuseShuffleCost = " << ReuseShuffleCost <<
"\n";
2503 dbgs() <<
"SLP: VectorCost = " << VecCost <<
"\n";
2504 dbgs() <<
"SLP: ScalarCost = " << ScalarCost <<
"\n";
2505 dbgs() <<
"SLP: ReuseShuffleCost + VecCost - ScalarCost = " <<
2506 ReuseShuffleCost + VecCost - ScalarCost <<
"\n";
2512 const InstructionsState &
S,
2513 const EdgeInfo &UserTreeIdx,
2516 TreeEntry::EntryState EntryState =
2517 Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
2518 return newTreeEntry(VL, EntryState, Bundle,
S, UserTreeIdx,
2519 ReuseShuffleIndices, ReorderIndices);
2523 TreeEntry::EntryState EntryState,
2525 const InstructionsState &
S,
2526 const EdgeInfo &UserTreeIdx,
2529 assert(((!Bundle && EntryState == TreeEntry::NeedToGather) ||
2530 (Bundle && EntryState != TreeEntry::NeedToGather)) &&
2531 "Need to vectorize gather entry?");
2532 VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
2533 TreeEntry *
Last = VectorizableTree.back().get();
2534 Last->Idx = VectorizableTree.size() - 1;
2535 Last->State = EntryState;
2536 Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
2537 ReuseShuffleIndices.end());
2538 if (ReorderIndices.empty()) {
2540 Last->setOperations(
S);
2543 Last->Scalars.assign(VL.
size(),
nullptr);
2545 [VL](
unsigned Idx) ->
Value * {
2546 if (Idx >= VL.size())
2547 return UndefValue::get(VL.front()->getType());
2551 Last->setOperations(
S);
2552 Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
2554 if (
Last->State != TreeEntry::NeedToGather) {
2555 for (
Value *V : VL) {
2556 assert(!getTreeEntry(V) &&
"Scalar already in tree!");
2557 ScalarToTreeEntry[V] =
Last;
2560 ScheduleData *BundleMember = *Bundle;
2561 assert((BundleMember || isa<PHINode>(
S.MainOp) ||
2564 "Bundle and VL out of sync");
2566 for (
Value *V : VL) {
2569 assert(BundleMember &&
"Unexpected end of bundle.");
2570 BundleMember->TE =
Last;
2571 BundleMember = BundleMember->NextInBundle;
2574 assert(!BundleMember &&
"Bundle and VL out of sync");
2576 MustGather.insert(VL.begin(), VL.end());
2579 if (UserTreeIdx.UserTE)
2580 Last->UserTreeIndices.push_back(UserTreeIdx);
2587 TreeEntry::VecTreeTy VectorizableTree;
2592 for (
unsigned Id = 0, IdE = VectorizableTree.size();
Id != IdE; ++
Id) {
2593 VectorizableTree[
Id]->dump();
2599 TreeEntry *getTreeEntry(
Value *V) {
return ScalarToTreeEntry.lookup(V); }
2601 const TreeEntry *getTreeEntry(
Value *V)
const {
2602 return ScalarToTreeEntry.lookup(V);
2612 ValueSet MustGather;
2615 struct ExternalUser {
2637 AliasCacheKey key = std::make_pair(Inst1, Inst2);
2640 return result.getValue();
2642 bool aliased =
true;
2644 aliased =
isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1));
2650 using AliasCacheKey = std::pair<Instruction *, Instruction *>;
2678 UserList ExternalUses;
2693 struct ScheduleData {
2696 enum { InvalidDeps = -1 };
2698 ScheduleData() =
default;
2700 void init(
int BlockSchedulingRegionID,
Value *OpVal) {
2701 FirstInBundle =
this;
2702 NextInBundle =
nullptr;
2703 NextLoadStore =
nullptr;
2704 IsScheduled =
false;
2705 SchedulingRegionID = BlockSchedulingRegionID;
2706 clearDependencies();
2713 if (hasValidDependencies()) {
2714 assert(UnscheduledDeps <= Dependencies &&
"invariant");
2716 assert(UnscheduledDeps == Dependencies &&
"invariant");
2720 assert(isSchedulingEntity() &&
2721 "unexpected scheduled state");
2722 for (
const ScheduleData *BundleMember =
this; BundleMember;
2723 BundleMember = BundleMember->NextInBundle) {
2724 assert(BundleMember->hasValidDependencies() &&
2725 BundleMember->UnscheduledDeps == 0 &&
2726 "unexpected scheduled state");
2727 assert((BundleMember ==
this || !BundleMember->IsScheduled) &&
2728 "only bundle is marked scheduled");
2732 assert(Inst->getParent() == FirstInBundle->Inst->getParent() &&
2733 "all bundle members must be in same basic block");
2739 bool hasValidDependencies()
const {
return Dependencies != InvalidDeps; }
2743 bool isSchedulingEntity()
const {
return FirstInBundle ==
this; }
2747 bool isPartOfBundle()
const {
2748 return NextInBundle !=
nullptr || FirstInBundle !=
this ||
TE;
2753 bool isReady()
const {
2754 assert(isSchedulingEntity() &&
2755 "can't consider non-scheduling entity for ready list");
2756 return unscheduledDepsInBundle() == 0 && !IsScheduled;
2762 int incrementUnscheduledDeps(
int Incr) {
2763 assert(hasValidDependencies() &&
2764 "increment of unscheduled deps would be meaningless");
2765 UnscheduledDeps += Incr;
2766 return FirstInBundle->unscheduledDepsInBundle();
2771 void resetUnscheduledDeps() {
2772 UnscheduledDeps = Dependencies;
2776 void clearDependencies() {
2777 Dependencies = InvalidDeps;
2778 resetUnscheduledDeps();
2779 MemoryDependencies.clear();
2780 ControlDependencies.clear();
2783 int unscheduledDepsInBundle()
const {
2784 assert(isSchedulingEntity() &&
"only meaningful on the bundle");
2786 for (
const ScheduleData *BundleMember =
this; BundleMember;
2787 BundleMember = BundleMember->NextInBundle) {
2788 if (BundleMember->UnscheduledDeps == InvalidDeps)
2790 Sum += BundleMember->UnscheduledDeps;
2796 if (!isSchedulingEntity()) {
2797 os <<
"/ " << *Inst;
2798 }
else if (NextInBundle) {
2800 ScheduleData *SD = NextInBundle;
2802 os <<
';' << *SD->Inst;
2803 SD = SD->NextInBundle;
2814 Value *OpValue =
nullptr;
2817 TreeEntry *
TE =
nullptr;
2821 ScheduleData *FirstInBundle =
nullptr;
2825 ScheduleData *NextInBundle =
nullptr;
2829 ScheduleData *NextLoadStore =
nullptr;
2843 int SchedulingRegionID = 0;
2846 int SchedulingPriority = 0;
2852 int Dependencies = InvalidDeps;
2858 int UnscheduledDeps = InvalidDeps;
2862 bool IsScheduled =
false;
2867 const BoUpSLP::ScheduleData &SD) {
2892 struct BlockScheduling {
2894 :
BB(
BB), ChunkSize(
BB->size()), ChunkPos(ChunkSize) {}
2898 ScheduleStart =
nullptr;
2899 ScheduleEnd =
nullptr;
2900 FirstLoadStoreInRegion =
nullptr;
2901 LastLoadStoreInRegion =
nullptr;
2902 RegionHasStackSave =
false;
2906 ScheduleRegionSizeLimit -= ScheduleRegionSize;
2909 ScheduleRegionSize = 0;
2913 ++SchedulingRegionID;
2917 if (
BB !=
I->getParent())
2920 ScheduleData *SD = ScheduleDataMap.lookup(
I);
2921 if (SD && isInSchedulingRegion(SD))
2926 ScheduleData *getScheduleData(
Value *V) {
2927 if (
auto *
I = dyn_cast<Instruction>(V))
2928 return getScheduleData(
I);
2934 return getScheduleData(V);
2935 auto I = ExtraScheduleDataMap.find(V);
2936 if (
I != ExtraScheduleDataMap.end()) {
2937 ScheduleData *SD =
I->second.lookup(
Key);
2938 if (SD && isInSchedulingRegion(SD))
2944 bool isInSchedulingRegion(ScheduleData *SD)
const {
2945 return SD->SchedulingRegionID == SchedulingRegionID;
2950 template <
typename ReadyListType>
2951 void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
2952 SD->IsScheduled =
true;
2955 for (ScheduleData *BundleMember = SD; BundleMember;
2956 BundleMember = BundleMember->NextInBundle) {
2957 if (BundleMember->Inst != BundleMember->OpValue)
2963 auto &&DecrUnsched = [
this, &ReadyList](
Instruction *
I) {
2964 doForAllOpcodes(
I, [&ReadyList](ScheduleData *OpDef) {
2965 if (OpDef && OpDef->hasValidDependencies() &&
2966 OpDef->incrementUnscheduledDeps(-1) == 0) {
2970 ScheduleData *DepBundle = OpDef->FirstInBundle;
2971 assert(!DepBundle->IsScheduled &&
2972 "already scheduled bundle gets ready");
2973 ReadyList.insert(DepBundle);
2975 <<
"SLP: gets ready (def): " << *DepBundle <<
"\n");
2983 if (TreeEntry *TE = BundleMember->TE) {
2985 int Lane = std::distance(TE->Scalars.begin(),
2986 find(TE->Scalars, BundleMember->Inst));
2987 assert(Lane >= 0 &&
"Lane not set");
2995 auto *
In = BundleMember->Inst;
2997 (isa<ExtractValueInst>(
In) || isa<ExtractElementInst>(
In) ||
2998 In->getNumOperands() == TE->getNumOperands()) &&
2999 "Missed TreeEntry operands?");
3002 for (
unsigned OpIdx = 0, NumOperands = TE->getNumOperands();
3003 OpIdx != NumOperands; ++OpIdx)
3004 if (
auto *
I = dyn_cast<Instruction>(TE->getOperand(OpIdx)[Lane]))
3009 for (
Use &U : BundleMember->Inst->operands())
3010 if (
auto *
I = dyn_cast<Instruction>(U.get()))
3014 for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
3015 if (MemoryDepSD->hasValidDependencies() &&
3016 MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
3019 ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
3020 assert(!DepBundle->IsScheduled &&
3021 "already scheduled bundle gets ready");
3022 ReadyList.insert(DepBundle);
3024 <<
"SLP: gets ready (mem): " << *DepBundle <<
"\n");
3028 for (ScheduleData *DepSD : BundleMember->ControlDependencies) {
3029 if (DepSD->incrementUnscheduledDeps(-1) == 0) {
3032 ScheduleData *DepBundle = DepSD->FirstInBundle;
3033 assert(!DepBundle->IsScheduled &&
3034 "already scheduled bundle gets ready");
3035 ReadyList.insert(DepBundle);
3037 <<
"SLP: gets ready (ctl): " << *DepBundle <<
"\n");
3049 assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
3050 ScheduleStart->comesBefore(ScheduleEnd) &&
3051 "Not a valid scheduling region?");
3053 for (
auto *
I = ScheduleStart;
I != ScheduleEnd;
I =
I->getNextNode()) {
3054 auto *SD = getScheduleData(
I);
3057 assert(isInSchedulingRegion(SD) &&
3058 "primary schedule data not in window?");
3059 assert(isInSchedulingRegion(SD->FirstInBundle) &&
3060 "entire bundle in window!");
3062 doForAllOpcodes(
I, [](ScheduleData *SD) { SD->verify(); });
3065 for (
auto *SD : ReadyInsts) {
3066 assert(SD->isSchedulingEntity() && SD->isReady() &&
3067 "item in ready list not ready?");
3072 void doForAllOpcodes(
Value *V,
3074 if (ScheduleData *SD = getScheduleData(V))
3076 auto I = ExtraScheduleDataMap.find(V);
3077 if (
I != ExtraScheduleDataMap.end())
3078 for (
auto &
P :
I->second)
3079 if (isInSchedulingRegion(
P.second))
3084 template <
typename ReadyListType>
3085 void initialFillReadyList(ReadyListType &ReadyList) {
3086 for (
auto *
I = ScheduleStart;
I != ScheduleEnd;
I =
I->getNextNode()) {
3087 doForAllOpcodes(
I, [&](ScheduleData *SD) {
3088 if (SD->isSchedulingEntity() && SD->hasValidDependencies() &&
3090 ReadyList.insert(SD);
3092 <<
"SLP: initially in ready list: " << *SD <<
"\n");
3109 const InstructionsState &
S);
3115 ScheduleData *allocateScheduleDataChunks();
3119 bool extendSchedulingRegion(
Value *V,
const InstructionsState &
S);
3124 ScheduleData *PrevLoadStore,
3125 ScheduleData *NextLoadStore);
3129 void calculateDependencies(ScheduleData *SD,
bool InsertInReadyList,
3133 void resetSchedule();
3138 std::vector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;
3154 ExtraScheduleDataMap;
3167 ScheduleData *FirstLoadStoreInRegion =
nullptr;
3171 ScheduleData *LastLoadStoreInRegion =
nullptr;
3176 bool RegionHasStackSave =
false;
3179 int ScheduleRegionSize = 0;
3188 int SchedulingRegionID = 1;
3196 void scheduleBlock(BlockScheduling *BS);
3203 struct OrdersTypeDenseMapInfo {
3204 static OrdersType getEmptyKey() {
3210 static OrdersType getTombstoneKey() {
3216 static unsigned getHashValue(
const OrdersType &V) {
3220 static bool isEqual(
const OrdersType &
LHS,
const OrdersType &
RHS) {
3237 unsigned MaxVecRegSize;
3238 unsigned MinVecRegSize;
3263 struct ChildIteratorType
3265 ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
3276 return R.VectorizableTree[0].get();
3280 return {
N->UserTreeIndices.begin(),
N->Container};
3284 return {
N->UserTreeIndices.end(),
N->Container};
3289 class nodes_iterator {
3290 using ItTy = ContainerTy::iterator;
3300 bool operator!=(
const nodes_iterator &N2)
const {
return N2.It != It; }
3304 return nodes_iterator(R->VectorizableTree.begin());
3308 return nodes_iterator(R->VectorizableTree.end());
3311 static unsigned size(BoUpSLP *R) {
return R->VectorizableTree.size(); }
3324 for (
auto V : Entry->Scalars) {
3326 if (
llvm::any_of(
R->ExternalUses, [&](
const BoUpSLP::ExternalUser &EU) {
3327 return EU.Scalar == V;
3337 if (Entry->State == TreeEntry::NeedToGather)
3345 BoUpSLP::~BoUpSLP() {
3347 for (
auto *
I : DeletedInstructions) {
3348 for (
Use &U :
I->operands()) {
3349 auto *
Op = dyn_cast<Instruction>(U.get());
3350 if (
Op && !DeletedInstructions.count(
Op) &&
Op->hasOneUser() &&
3354 I->dropAllReferences();
3356 for (
auto *
I : DeletedInstructions) {
3358 "trying to erase instruction with users.");
3359 I->eraseFromParent();
3365 #ifdef EXPENSIVE_CHECKS
3377 "Expected non-empty mask.");
3380 for (
unsigned I = 0,
E = Prev.size();
I <
E; ++
I)
3382 Reuses[
Mask[
I]] = Prev[
I];
3390 assert(!
Mask.empty() &&
"Expected non-empty mask.");
3392 if (Order.empty()) {
3394 std::iota(MaskOrder.begin(), MaskOrder.end(), 0);
3404 for (
unsigned I = 0,
E =
Mask.size();
I <
E; ++
I)
3406 Order[MaskOrder[
I]] =
I;
3411 BoUpSLP::findReusedOrderedScalars(
const BoUpSLP::TreeEntry &TE) {
3412 assert(TE.State == TreeEntry::NeedToGather &&
"Expected gather node only.");
3413 unsigned NumScalars = TE.Scalars.size();
3414 OrdersType CurrentOrder(NumScalars, NumScalars);
3417 const TreeEntry *STE =
nullptr;
3421 for (
unsigned I = 0;
I < NumScalars; ++
I) {
3422 Value *V = TE.Scalars[
I];
3423 if (!isa<LoadInst, ExtractElementInst, ExtractValueInst>(V))
3425 if (
const auto *LocalSTE = getTreeEntry(V)) {
3428 else if (STE != LocalSTE)
3432 std::distance(STE->Scalars.begin(),
find(STE->Scalars, V));
3433 if (Lane >= NumScalars)
3435 if (CurrentOrder[Lane] != NumScalars) {
3438 UsedPositions.
reset(CurrentOrder[Lane]);
3442 CurrentOrder[Lane] =
I;
3443 UsedPositions.
set(
I);
3448 if (STE && (UsedPositions.
count() > 1 || STE->Scalars.size() == 2)) {
3450 for (
unsigned I = 0;
I < NumScalars; ++
I)
3451 if (CurrentOrder[
I] !=
I && CurrentOrder[
I] != NumScalars)
3455 if (IsIdentityOrder(CurrentOrder)) {
3456 CurrentOrder.
clear();
3457 return CurrentOrder;
3459 auto *It = CurrentOrder.begin();
3460 for (
unsigned I = 0;
I < NumScalars;) {
3461 if (UsedPositions.
test(
I)) {
3465 if (*It == NumScalars) {
3471 return CurrentOrder;
3478 enum class LoadsState { Gather, Vectorize, ScatterVectorize };
3497 if (
DL.getTypeSizeInBits(ScalarTy) !=
DL.getTypeAllocSizeInBits(ScalarTy))
3498 return LoadsState::Gather;
3504 auto *POIter = PointerOps.begin();
3505 for (
Value *V : VL) {
3506 auto *L = cast<LoadInst>(V);
3508 return LoadsState::Gather;
3509 *POIter = L->getPointerOperand();
3516 if (IsSorted ||
all_of(PointerOps, [&PointerOps](
Value *
P) {
3519 auto *
GEP = dyn_cast<GetElementPtrInst>(
P);
3522 auto *GEP0 = cast<GetElementPtrInst>(PointerOps.front());
3523 return GEP->getNumOperands() == 2 &&
3532 if (Order.empty()) {
3533 Ptr0 = PointerOps.front();
3534 PtrN = PointerOps.back();
3536 Ptr0 = PointerOps[Order.front()];
3537 PtrN = PointerOps[Order.back()];
3542 if (
static_cast<unsigned>(*Diff) == VL.size() - 1)
3543 return LoadsState::Vectorize;
3549 bool ProfitableGatherPointers =
3552 })) <= VL.size() / 2 && VL.size() > 2;
3553 if (ProfitableGatherPointers ||
all_of(PointerOps, [IsSorted](
Value *
P) {
3554 auto *
GEP = dyn_cast<GetElementPtrInst>(
P);
3556 (
GEP &&
GEP->getNumOperands() == 2);
3558 Align CommonAlignment = cast<LoadInst>(VL0)->getAlign();
3565 return LoadsState::ScatterVectorize;
3569 return LoadsState::Gather;
3576 VL, [](
const Value *V) {
return V->
getType()->isPointerTy(); }) &&
3577 "Expected list of pointer operands.");
3582 Bases[VL[0]].push_back(std::make_tuple(VL[0], 0U, 0U));
3593 Base.second.emplace_back(Ptr, *Diff, Cnt++);
3599 if (Bases.
size() > VL.
size() / 2 - 1)
3603 Bases[Ptr].emplace_back(Ptr, 0, Cnt++);
3609 bool AnyConsecutive =
false;
3610 for (
auto &
Base : Bases) {
3611 auto &Vec =
Base.second;
3612 if (Vec.size() > 1) {
3614 const std::tuple<Value *, int, unsigned> &
Y) {
3615 return std::get<1>(
X) < std::get<1>(
Y);
3617 int InitialOffset = std::get<1>(Vec[0]);
3619 return std::get<1>(
P.value()) ==
int(
P.index()) + InitialOffset;
3625 SortedIndices.
clear();
3626 if (!AnyConsecutive)
3629 for (
auto &
Base : Bases) {
3630 for (
auto &
T :
Base.second)
3631 SortedIndices.push_back(std::get<2>(
T));
3635 "Expected SortedIndices to be the size of VL");
3640 BoUpSLP::findPartiallyOrderedLoads(
const BoUpSLP::TreeEntry &TE) {
3641 assert(TE.State == TreeEntry::NeedToGather &&
"Expected gather node only.");
3642 Type *ScalarTy = TE.Scalars[0]->getType();
3645 Ptrs.
reserve(TE.Scalars.size());
3646 for (
Value *V : TE.Scalars) {
3647 auto *L = dyn_cast<LoadInst>(V);
3648 if (!L || !L->isSimple())
3650 Ptrs.push_back(L->getPointerOperand());
3663 if (!TE.ReuseShuffleIndices.empty())
3665 if (TE.State == TreeEntry::Vectorize &&
3666 (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
3667 (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp()))) &&
3669 return TE.ReorderIndices;
3670 if (TE.State == TreeEntry::NeedToGather) {
3673 if (((TE.getOpcode() == Instruction::ExtractElement &&
3674 !TE.isAltShuffle()) ||
3677 return isa<UndefValue, ExtractElementInst>(V);
3680 [](
Value *V) { return isa<ExtractElementInst>(V); }))) &&
3683 auto *EE = dyn_cast<ExtractElementInst>(V);
3684 return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
3690 bool Reuse = canReuseExtract(TE.Scalars, TE.getMainOp(), CurrentOrder);
3691 if (Reuse || !CurrentOrder.empty()) {
3692 if (!CurrentOrder.empty())
3694 return CurrentOrder;
3698 return CurrentOrder;
3699 if (TE.Scalars.size() >= 4)
3706 void BoUpSLP::reorderTopToBottom() {
3719 ExternalUserReorderMap;
3725 for_each(VectorizableTree, [
this, &TTIRef, &VFToOrderedEntries,
3726 &GathersToOrders, &ExternalUserReorderMap,
3727 &AltShufflesToOrders](
3728 const std::unique_ptr<TreeEntry> &TE) {
3731 findExternalStoreUsersReorderIndices(TE.get());
3732 if (!ExternalUserReorderIndices.empty()) {
3733 VFToOrderedEntries[TE->Scalars.size()].
insert(TE.get());
3741 if (TE->isAltShuffle()) {
3744 unsigned Opcode0 = TE->getOpcode();
3745 unsigned Opcode1 = TE->getAltOpcode();
3748 for (
unsigned Lane : seq<unsigned>(0, TE->Scalars.size()))
3749 if (cast<Instruction>(TE->Scalars[Lane])->getOpcode() == Opcode1)
3750 OpcodeMask.
set(Lane);
3753 VFToOrderedEntries[TE->Scalars.size()].
insert(TE.get());
3760 getReorderingData(*TE,
true)) {
3769 const TreeEntry *UserTE = TE.get();
3771 if (UserTE->UserTreeIndices.size() != 1)
3774 return EI.UserTE->State == TreeEntry::Vectorize &&
3775 EI.UserTE->isAltShuffle() && EI.UserTE->Idx != 0;
3778 UserTE = UserTE->UserTreeIndices.back().UserTE;
3781 VFToOrderedEntries[TE->Scalars.size()].
insert(TE.get());
3782 if (TE->State != TreeEntry::Vectorize)
3783 GathersToOrders.
try_emplace(TE.get(), *CurrentOrder);
3788 for (
unsigned VF = VectorizableTree.front()->Scalars.size(); VF > 1;
3790 auto It = VFToOrderedEntries.
find(VF);
3791 if (It == VFToOrderedEntries.
end())
3803 for (
const TreeEntry *OpTE : OrderedEntries) {
3806 if (!OpTE->ReuseShuffleIndices.empty())
3809 const auto &Order = [OpTE, &GathersToOrders,
3810 &AltShufflesToOrders]() ->
const OrdersType & {
3811 if (OpTE->State == TreeEntry::NeedToGather) {
3812 auto It = GathersToOrders.find(OpTE);
3813 if (It != GathersToOrders.end())
3816 if (OpTE->isAltShuffle()) {
3817 auto It = AltShufflesToOrders.
find(OpTE);
3818 if (It != AltShufflesToOrders.
end())
3821 return OpTE->ReorderIndices;
3824 auto It = ExternalUserReorderMap.
find(OpTE);
3825 if (It != ExternalUserReorderMap.
end()) {
3826 const auto &ExternalUserReorderIndices = It->second;
3827 for (
const OrdersType &ExtOrder : ExternalUserReorderIndices)
3828 ++OrdersUses.insert(std::make_pair(ExtOrder, 0)).first->second;
3834 if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
3838 unsigned E = Order.size();
3841 return Idx == UndefMaskElem ? E : static_cast<unsigned>(Idx);
3844 ++OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second;
3846 ++OrdersUses.insert(std::make_pair(Order, 0)).first->second;
3850 if (OrdersUses.empty())
3854 unsigned Cnt = OrdersUses.front().second;
3855 for (
const auto &Pair :
drop_begin(OrdersUses)) {
3856 if (Cnt < Pair.second || (Cnt == Pair.second && Pair.first.empty())) {
3857 BestOrder = Pair.first;
3862 if (BestOrder.
empty())
3867 unsigned E = BestOrder.
size();
3868 transform(BestOrder, MaskOrder.begin(), [
E](
unsigned I) {
3869 return I < E ? static_cast<int>(I) : UndefMaskElem;
3872 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
3874 if (TE->Scalars.size() != VF) {
3875 if (TE->ReuseShuffleIndices.size() == VF) {
3881 return EI.UserTE->Scalars.size() == VF ||
3882 EI.UserTE->Scalars.size() ==
3885 "All users must be of VF size.");
3892 if (TE->State == TreeEntry::Vectorize &&
3895 !TE->isAltShuffle()) {
3899 if (isa<InsertElementInst, StoreInst>(TE->getMainOp()))
3900 TE->reorderOperands(
Mask);
3903 TE->reorderOperands(
Mask);
3904 assert(TE->ReorderIndices.empty() &&
3905 "Expected empty reorder sequence.");
3908 if (!TE->ReuseShuffleIndices.empty()) {
3915 addMask(NewReuses, TE->ReuseShuffleIndices);
3916 TE->ReuseShuffleIndices.swap(NewReuses);
3922 bool BoUpSLP::canReorderOperands(
3923 TreeEntry *UserTE,
SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
3926 for (
unsigned I = 0,
E = UserTE->getNumOperands();
I <
E; ++
I) {
3927 if (
any_of(Edges, [
I](
const std::pair<unsigned, TreeEntry *> &OpData) {
3928 return OpData.first ==
I &&
3929 OpData.second->State == TreeEntry::Vectorize;
3932 if (TreeEntry *TE = getVectorizedOperand(UserTE,
I)) {
3934 if (
any_of(TE->UserTreeIndices,
3935 [UserTE](
const EdgeInfo &EI) { return EI.UserTE != UserTE; }))
3939 Edges.emplace_back(
I, TE);
3945 if (TE->State != TreeEntry::Vectorize && TE->ReuseShuffleIndices.empty())
3946 GatherOps.push_back(TE);
3949 TreeEntry *Gather =
nullptr;
3951 [&Gather, UserTE,
I](TreeEntry *TE) {
3952 assert(TE->State != TreeEntry::Vectorize &&
3953 "Only non-vectorized nodes are expected.");
3954 if (
any_of(TE->UserTreeIndices,
3955 [UserTE,
I](
const EdgeInfo &EI) {
3956 return EI.UserTE == UserTE && EI.EdgeIdx == I;
3958 assert(TE->isSame(UserTE->getOperand(
I)) &&
3959 "Operand entry does not match operands.");
3968 GatherOps.push_back(Gather);
3973 void BoUpSLP::reorderBottomToTop(
bool IgnoreReorder) {
3980 for_each(VectorizableTree, [
this, &OrderedEntries, &GathersToOrders,
3982 const std::unique_ptr<TreeEntry> &TE) {
3983 if (TE->State != TreeEntry::Vectorize)
3984 NonVectorized.push_back(TE.get());
3986 getReorderingData(*TE,
false)) {
3987 OrderedEntries.
insert(TE.get());
3988 if (TE->State != TreeEntry::Vectorize)
3989 GathersToOrders.
try_emplace(TE.get(), *CurrentOrder);
3998 while (!OrderedEntries.
empty()) {
4003 for (TreeEntry *TE : OrderedEntries) {
4004 if (!(TE->State == TreeEntry::Vectorize ||
4005 (TE->State == TreeEntry::NeedToGather &&
4006 GathersToOrders.
count(TE))) ||
4007 TE->UserTreeIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
4010 return EI.UserTE == TE->UserTreeIndices.front().UserTE;
4012 !Visited.insert(TE).second) {
4013 Filtered.push_back(TE);
4018 for (
EdgeInfo &EI : TE->UserTreeIndices) {
4019 TreeEntry *UserTE = EI.
UserTE;
4020 auto It =
Users.find(UserTE);
4021 if (It ==
Users.end())
4022 It =
Users.insert({UserTE, {}}).first;
4023 It->second.emplace_back(EI.
EdgeIdx, TE);
4028 [&OrderedEntries](TreeEntry *TE) { OrderedEntries.remove(TE); });
4030 std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>>>
4032 sort(UsersVec, [](
const auto &Data1,
const auto &Data2) {
4033 return Data1.first->Idx > Data2.first->Idx;
4035 for (
auto &
Data : UsersVec) {
4038 if (!canReorderOperands(
Data.first,
Data.second, NonVectorized,
4041 [&OrderedEntries](
const std::pair<unsigned, TreeEntry *> &
Op) {
4042 OrderedEntries.remove(Op.second);
4056 for (
const auto &
Op :
Data.second) {
4057 TreeEntry *OpTE =
Op.second;
4058 if (!VisitedOps.
insert(OpTE).second)
4060 if (!OpTE->ReuseShuffleIndices.empty())
4062 const auto &Order = [OpTE, &GathersToOrders]() ->
const OrdersType & {
4063 if (OpTE->State == TreeEntry::NeedToGather)
4064 return GathersToOrders.
find(OpTE)->second;
4065 return OpTE->ReorderIndices;
4068 Data.second, [OpTE](
const std::pair<unsigned, TreeEntry *> &
P) {
4069 return P.second == OpTE;
4072 if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
4076 unsigned E = Order.size();
4079 return Idx == UndefMaskElem ? E : static_cast<unsigned>(Idx);
4082 OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second +=
4085 OrdersUses.insert(std::make_pair(Order, 0)).first->second += NumOps;
4087 auto Res = OrdersUses.insert(std::make_pair(
OrdersType(), 0));
4088 const auto &&AllowsReordering = [IgnoreReorder, &GathersToOrders](
4089 const TreeEntry *TE) {
4090 if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
4091 (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
4092 (IgnoreReorder && TE->Idx == 0))
4094 if (TE->State == TreeEntry::NeedToGather) {
4095 auto It = GathersToOrders.
find(TE);
4096 if (It != GathersToOrders.
end())
4097 return !It->second.empty();
4102 for (
const EdgeInfo &EI : OpTE->UserTreeIndices) {
4103 TreeEntry *UserTE = EI.
UserTE;
4104 if (!VisitedUsers.
insert(UserTE).second)
4109 if (AllowsReordering(UserTE))
4117 if (
static_cast<unsigned>(
count_if(
4118 Ops, [UserTE, &AllowsReordering](
4119 const std::pair<unsigned, TreeEntry *> &
Op) {
4120 return AllowsReordering(
Op.second) &&
4123 return EI.UserTE == UserTE;
4125 })) <= Ops.
size() / 2)
4126 ++Res.first->second;
4130 if (OrdersUses.empty()) {
4132 [&OrderedEntries](
const std::pair<unsigned, TreeEntry *> &
Op) {
4133 OrderedEntries.remove(Op.second);
4139 unsigned Cnt = OrdersUses.front().second;
4140 for (
const auto &Pair :
drop_begin(OrdersUses)) {
4141 if (Cnt < Pair.second || (Cnt == Pair.second && Pair.first.empty())) {
4142 BestOrder = Pair.first;
4147 if (BestOrder.
empty()) {
4149 [&OrderedEntries](
const std::pair<unsigned, TreeEntry *> &
Op) {
4150 OrderedEntries.remove(Op.second);
4159 unsigned E = BestOrder.
size();
4160 transform(BestOrder, MaskOrder.begin(), [
E](
unsigned I) {
4161 return I < E ? static_cast<int>(I) : UndefMaskElem;
4163 for (
const std::pair<unsigned, TreeEntry *> &
Op :
Data.second) {
4164 TreeEntry *TE =
Op.second;
4165 OrderedEntries.remove(TE);
4166 if (!VisitedOps.
insert(TE).second)
4168 if (TE->ReuseShuffleIndices.size() == BestOrder.
size()) {
4174 if (TE->State != TreeEntry::Vectorize)
4176 assert((BestOrder.
size() == TE->ReorderIndices.size() ||
4177 TE->ReorderIndices.empty()) &&
4178 "Non-matching sizes of user/operand entries.");
4180 if (IgnoreReorder && TE == VectorizableTree.front().get())
4181 IgnoreReorder =
false;
4184 for (TreeEntry *Gather : GatherOps) {
4185 assert(Gather->ReorderIndices.empty() &&
4186 "Unexpected reordering of gathers.");
4187 if (!Gather->ReuseShuffleIndices.empty()) {
4193 OrderedEntries.remove(Gather);
4197 if (
Data.first->State != TreeEntry::Vectorize ||
4198 !isa<ExtractElementInst, ExtractValueInst, LoadInst>(
4199 Data.first->getMainOp()) ||
4200 Data.first->isAltShuffle())
4202 if (!isa<InsertElementInst, StoreInst>(
Data.first->getMainOp()) ||
4203 Data.first->isAltShuffle()) {
4206 if (
Data.first->ReuseShuffleIndices.empty() &&
4207 !
Data.first->ReorderIndices.empty() &&
4208 !
Data.first->isAltShuffle()) {
4211 OrderedEntries.insert(
Data.first);
4219 if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
4220 VectorizableTree.front()->ReuseShuffleIndices.empty())
4221 VectorizableTree.front()->ReorderIndices.clear();
4224 void BoUpSLP::buildExternalUses(
4227 for (auto &TEPtr : VectorizableTree) {
4228 TreeEntry *Entry = TEPtr.get();
4231 if (Entry->State == TreeEntry::NeedToGather)
4235 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
4236 Value *Scalar = Entry->Scalars[Lane];
4237 int FoundLane = Entry->findLaneForValue(Scalar);
4240 auto ExtI = ExternallyUsedValues.find(Scalar);
4241 if (ExtI != ExternallyUsedValues.end()) {
4242 LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
4243 << Lane << " from " << *Scalar << ".\n");
4244 ExternalUses.emplace_back(Scalar, nullptr, FoundLane);
4246 for (User *U : Scalar->users()) {
4253 if (isDeleted(UserInst))
4257 if (TreeEntry *UseEntry = getTreeEntry(U)) {
4258 Value *UseScalar = UseEntry->Scalars[0];
4262 if (UseScalar != U ||
4263 UseEntry->State == TreeEntry::ScatterVectorize ||
4265 LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
4267 assert(UseEntry->State != TreeEntry::NeedToGather && "Bad state");
4273 if (UserIgnoreList && UserIgnoreList->contains(UserInst))
4276 LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane "
4277 << Lane << " from " << *Scalar << ".\n");
4278 ExternalUses.push_back(ExternalUser(Scalar, U, FoundLane));
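// collectUserStores: for each scalar of the entry, gather its StoreInst users
// into PtrToStoresMap, keyed by pointer operand. A store is only recorded if
// it is simple, not already in the tree, and in the same block with the same
// value type as the stores already collected for that pointer.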
4285 BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
4287 for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) {
4288 Value *V = TE->Scalars[Lane];
4290 static constexpr unsigned UsersLimit = 4;
4296 auto *SI = dyn_cast<StoreInst>(U);
4297 if (SI == nullptr || !SI->isSimple() ||
4301 if (getTreeEntry(U))
4305 auto &StoresVec = PtrToStoresMap[Ptr];
4308 if (StoresVec.size() > Lane)
4311 if (!StoresVec.empty() &&
4312 SI->getParent() != StoresVec.back()->getParent())
4315 if (!StoresVec.empty() &&
4316 SI->getValueOperand()->getType() !=
4317 StoresVec.back()->getValueOperand()->getType())
4319 StoresVec.push_back(SI);
4322 return PtrToStoresMap;
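// The helper below checks whether a group of user stores covers a run of
// consecutive addresses. If it does, ReorderIndices receives the lane order
// that makes the chain consecutive; an identity order is reported as an
// empty vector.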
4326 OrdersType &ReorderIndices) const {
4334 StoreOffsetVec[0] = {S0, 0};
4337 for (unsigned Idx : seq<unsigned>(1, StoresVec.size())) {
4341 SI->getPointerOperand(), *DL, *SE,
4346 StoreOffsetVec[Idx] = {StoresVec[Idx], *Diff};
4351 stable_sort(StoreOffsetVec, [](const std::pair<StoreInst *, int> &Pair1,
4352 const std::pair<StoreInst *, int> &Pair2) {
4353 int Offset1 = Pair1.second;
4354 int Offset2 = Pair2.second;
4355 return Offset1 < Offset2;
4359 for (unsigned Idx : seq<unsigned>(1, StoreOffsetVec.size()))
4360 if (StoreOffsetVec[Idx].second != StoreOffsetVec[Idx-1].second + 1)
4365 ReorderIndices.reserve(StoresVec.size());
4367 unsigned Idx = find_if(StoreOffsetVec,
4368 [SI](const std::pair<StoreInst *, int> &Pair) {
4369 return Pair.first == SI;
4371 StoreOffsetVec.begin();
4372 ReorderIndices.push_back(Idx);
4377 auto IsIdentityOrder = [](const OrdersType &Order) {
4378 for (unsigned Idx : seq<unsigned>(0, Order.size()))
4379 if (Idx != Order[Idx])
4383 if (IsIdentityOrder(ReorderIndices))
4384 ReorderIndices.clear();
4391 for (unsigned Idx : Order)
4392 dbgs() << Idx << ", ";
4398 BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const {
4399 unsigned NumLanes = TE->Scalars.size();
4402 collectUserStores(TE);
4411 for (const auto &Pair : PtrToStoresMap) {
4412 auto &StoresVec = Pair.second;
4414 if (StoresVec.size() != NumLanes)
4418 OrdersType ReorderIndices;
4419 if (!CanFormVector(StoresVec, ReorderIndices))
4424 ExternalReorderIndices.push_back(ReorderIndices);
4426 return ExternalReorderIndices;
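// buildTree: both overloads below start recursive tree construction from the
// given root bundle via buildTree_rec; the first variant additionally records
// the user-ignore list before recursing.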
4432 UserIgnoreList = &UserIgnoreLst;
4435 buildTree_rec(Roots, 0, EdgeInfo());
4442 buildTree_rec(Roots, 0, EdgeInfo());
4449 Value *NeedsScheduling = nullptr;
4450 for (Value *V : VL) {
4453 if (!NeedsScheduling) {
4454 NeedsScheduling = V;
4459 return NeedsScheduling;
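// generateKeySubkey (below) hashes a value into a (Key, SubKey) pair
// describing its kind (load, extract, cast, compare, call, GEP, ...), so that
// instructions of a compatible shape hash to the same bucket.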
4470 bool AllowAlternate) {
4474 if (auto *LI = dyn_cast<LoadInst>(V)) {
4482 if (isa<ExtractElementInst, UndefValue>(V))
4484 if (auto *EI = dyn_cast<ExtractElementInst>(V)) {
4486 !isa<UndefValue>(EI->getIndexOperand()))
4489 } else if (auto *I = dyn_cast<Instruction>(V)) {
4492 if ((isa<BinaryOperator>(I) || isa<CastInst>(I)) &&
4502 : cast<CastInst>(I)->getOperand(0)->getType()));
4504 if (isa<CastInst>(I)) {
4505 std::pair<size_t, size_t> OpVals =
4511 } else if (auto *CI = dyn_cast<CmpInst>(I)) {
4513 if (CI->isCommutative())
4519 } else if (auto *Call = dyn_cast<CallInst>(I)) {
4533 } else if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
4534 if (Gep->getNumOperands() == 2 && isa<ConstantInt>(Gep->getOperand(1)))
4535 SubKey = hash_value(Gep->getPointerOperand());
4539 !isa<ConstantInt>(I->getOperand(1))) {
4547 return std::make_pair(Key, SubKey);
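// buildTree_rec starts here. TryToFindDuplicates compacts repeated scalars
// into UniqueValues and fills ReuseShuffleIndicies with the mapping back to
// the original lanes; if the bundle degenerates (a single unique value or a
// splat of undefs), a plain gather entry is created instead.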
4551 const EdgeInfo &UserTreeIdx) {
4556 auto &&TryToFindDuplicates = [&VL, &ReuseShuffleIndicies, &UniqueValues,
4558 this](const InstructionsState &S) {
4561 for (Value *V : VL) {
4563 ReuseShuffleIndicies.emplace_back(
4565 UniqueValues.emplace_back(V);
4568 auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
4569 ReuseShuffleIndicies.emplace_back(Res.first->second);
4571 UniqueValues.emplace_back(V);
4573 size_t NumUniqueScalarValues = UniqueValues.size();
4574 if (NumUniqueScalarValues == VL.size()) {
4575 ReuseShuffleIndicies.clear();
4578 if (NumUniqueScalarValues <= 1 ||
4579 (UniquePositions.size() == 1 && all_of(UniqueValues,
4581 return isa<UndefValue>(V) ||
4586 newTreeEntry(VL, None, S, UserTreeIdx);
4596 LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
4597 if (TryToFindDuplicates(S))
4598 newTreeEntry(VL, None, S, UserTreeIdx,
4599 ReuseShuffleIndicies);
4604 if (S.getOpcode() == Instruction::ExtractElement &&
4605 isa<ScalableVectorType>(
4606 cast<ExtractElementInst>(S.OpValue)->getVectorOperandType())) {
4607 LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n");
4608 if (TryToFindDuplicates(S))
4609 newTreeEntry(VL, None, S, UserTreeIdx,
4610 ReuseShuffleIndicies);
4615 if (S.OpValue->getType()->isVectorTy() &&
4616 !isa<InsertElementInst>(S.OpValue)) {
4618 newTreeEntry(VL, None, S, UserTreeIdx);
4623 if (SI->getValueOperand()->getType()->isVectorTy()) {
4624 LLVM_DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
4625 newTreeEntry(VL, None, S, UserTreeIdx);
4634 auto &&NotProfitableForVectorization = [&S, this,
4636 if (!S.getOpcode() || !S.isAltShuffle() || VL.size() > 2)
4645 for (Value *V : VL) {
4646 auto *I = cast<Instruction>(V);
4648 return isa<Instruction>(Op) || isVectorLikeInstWithConstOps(Op);
4652 if ((IsCommutative &&
4653 std::accumulate(InstsCount.begin(), InstsCount.end(), 0) < 2) ||
4655 all_of(InstsCount, [](unsigned ICnt) { return ICnt < 2; })))
4657 assert(VL.size() == 2 && "Expected only 2 alternate op instructions.");
4659 auto *I1 = cast<Instruction>(VL.front());
4660 auto *I2 = cast<Instruction>(VL.back());
4661 for (int Op = 0, E = S.MainOp->getNumOperands(); Op < E; ++Op)
4663 I2->getOperand(Op));
4664 if (static_cast<unsigned>(count_if(
4665 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
4666 return findBestRootPair(Cand, LookAheadHeuristics::ScoreSplat);
4667 })) >= S.MainOp->getNumOperands() / 2)
4669 if (S.MainOp->getNumOperands() > 2)
4671 if (IsCommutative) {
4674 for (int Op = 0, E = S.MainOp->getNumOperands(); Op < E; ++Op)
4676 I2->getOperand((Op + 1) % E));
4678 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
4679 return findBestRootPair(Cand, LookAheadHeuristics::ScoreSplat);
4687 bool AreAllSameInsts =
4689 (S.OpValue->getType()->isPointerTy() && UserTreeIdx.UserTE &&
4690 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize &&
4694 auto *I = dyn_cast<GetElementPtrInst>(V);
4698 BB = I->getParent();
4699 return BB == I->getParent() && I->getNumOperands() == 2;
4705 (isa<InsertElementInst, ExtractValueInst, ExtractElementInst>(
4708 NotProfitableForVectorization(VL)) {
4709 LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n");
4710 if (TryToFindDuplicates(S))
4711 newTreeEntry(VL, None, S, UserTreeIdx,
4712 ReuseShuffleIndicies);
4720 if (!EphValues.empty()) {
4721 for (Value *V : VL) {
4722 if (EphValues.count(V)) {
4724 << ") is ephemeral.\n");
4725 newTreeEntry(VL, None, S, UserTreeIdx);
4732 if (TreeEntry *E = getTreeEntry(S.OpValue)) {
4733 LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n");
4734 if (!E->isSame(VL)) {
4735 LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
4736 if (TryToFindDuplicates(S))
4737 newTreeEntry(VL, None, S, UserTreeIdx,
4738 ReuseShuffleIndicies);
4743 E->UserTreeIndices.push_back(UserTreeIdx);
4750 for (Value *V : VL) {
4751 auto *I = dyn_cast<Instruction>(V);
4754 if (getTreeEntry(I)) {
4756 << ") is already in tree.\n");
4757 if (TryToFindDuplicates(S))
4758 newTreeEntry(VL, None, S, UserTreeIdx,
4759 ReuseShuffleIndicies);
4765 if (UserIgnoreList && !UserIgnoreList->empty()) {
4766 for (Value *V : VL) {
4767 if (UserIgnoreList && UserIgnoreList->contains(V)) {
4768 LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
4769 if (TryToFindDuplicates(S))
4770 newTreeEntry(VL, None, S, UserTreeIdx,
4771 ReuseShuffleIndicies);
4779 if (AreAllSameInsts && !(S.getOpcode() && allSameBlock(VL)) &&
4780 UserTreeIdx.UserTE &&
4781 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize) {
4782 assert(S.OpValue->getType()->isPointerTy() &&
4783 count_if(VL, [](Value *V) { return isa<GetElementPtrInst>(V); }) >=
4785 "Expected pointers only.");
4787 const auto *It = find_if(VL, [](Value *V) { return isa<GetElementPtrInst>(V); });
4788 assert(It != VL.end() && "Expected at least one GEP.");
4794 auto *VL0 = cast<Instruction>(S.OpValue);
4795 BB = VL0->getParent();
4797 if (!DT->isReachableFromEntry(BB)) {
4801 newTreeEntry(VL, None, S, UserTreeIdx);
4806 if (!TryToFindDuplicates(S))
4809 auto &BSRef = BlocksSchedules[BB];
4811 BSRef = std::make_unique<BlockScheduling>(BB);
4813 BlockScheduling &BS = *BSRef;
4816 #ifdef EXPENSIVE_CHECKS
4821 LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
4822 assert((!BS.getScheduleData(VL0) ||
4823 !BS.getScheduleData(VL0)->isPartOfBundle()) &&
4824 "tryScheduleBundle should cancelScheduling on failure");
4825 newTreeEntry(VL, None, S, UserTreeIdx,
4826 ReuseShuffleIndicies);
4829 LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
4831 unsigned ShuffleOrOp = S.isAltShuffle() ?
4832 (unsigned) Instruction::ShuffleVector : S.getOpcode();
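// The bundle has been scheduled successfully; the switch below handles each
// opcode separately, either creating a vectorizable tree entry and recursing
// into its operands, or cancelling scheduling and recording a gather entry.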
4833 switch (ShuffleOrOp) {
4834 case Instruction::PHI: {
4835 auto *PH = cast<PHINode>(VL0);
4839 for (Value *Incoming : cast<PHINode>(V)->incoming_values()) {
4841 if (Term && Term->isTerminator()) {
4843 << "SLP: Need to swizzle PHINodes (terminator use).\n");
4844 BS.cancelScheduling(VL, VL0);
4845 newTreeEntry(VL, None, S, UserTreeIdx,
4846 ReuseShuffleIndicies);
4852 newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndicies);
4857 for (unsigned I = 0, E = PH->getNumIncomingValues(); I < E; ++I) {
4858 if (!DT->isReachableFromEntry(PH->getIncomingBlock(I))) {
4867 Operands.push_back(cast<PHINode>(V)->getIncomingValueForBlock(
4868 PH->getIncomingBlock(I)));
4872 for (unsigned OpIdx = 0, OpE = OperandsVec.size(); OpIdx != OpE; ++OpIdx)
4873 buildTree_rec(OperandsVec[OpIdx], Depth + 1, {TE, OpIdx});
4876 case Instruction::ExtractValue:
4877 case Instruction::ExtractElement: {
4878 OrdersType CurrentOrder;
4879 bool Reuse = canReuseExtract(VL, VL0, CurrentOrder);
4881 LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
4882 newTreeEntry(VL, Bundle, S, UserTreeIdx,
4883 ReuseShuffleIndicies);
4887 Op0.assign(VL.size(), VL0->getOperand(0));
4888 VectorizableTree.back()->setOperand(0, Op0);
4891 if (!CurrentOrder.empty()) {
4893 dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
4895 for (unsigned Idx : CurrentOrder)
4896 dbgs() << " " << Idx;
4902 newTreeEntry(VL, Bundle, S, UserTreeIdx,
4903 ReuseShuffleIndicies, CurrentOrder);
4907 Op0.assign(VL.size(), VL0->getOperand(0));
4908 VectorizableTree.back()->setOperand(0, Op0);
4912 newTreeEntry(VL, None, S, UserTreeIdx,
4913 ReuseShuffleIndicies);
4914 BS.cancelScheduling(VL, VL0);
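// InsertElement bundles: all inserts must write into the same source vector;
// the insertion indices define CurrentOrder (cleared when it is the
// identity), and only the inserted scalars (the last operand group) are
// recursed into.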
4917 case Instruction::InsertElement: {
4918 assert(ReuseShuffleIndicies.empty() && "All inserts should be unique");
4922 ValueSet SourceVectors;
4923 for (Value *V : VL) {
4924 SourceVectors.insert(cast<Instruction>(V)->getOperand(0));
4929 return !SourceVectors.contains(V);
4932 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
4933 "different source vectors.\n");
4934 newTreeEntry(VL, None, S, UserTreeIdx);
4935 BS.cancelScheduling(VL, VL0);
4939 auto OrdCompare = [](const std::pair<int, int> &P1,
4940 const std::pair<int, int> &P2) {
4941 return P1.first > P2.first;
4944 decltype(OrdCompare)>
4945 Indices(OrdCompare);
4946 for (int I = 0, E = VL.size(); I < E; ++I) {
4948 Indices.emplace(Idx, I);
4950 OrdersType CurrentOrder(VL.size(), VL.size());
4951 bool IsIdentity = true;
4952 for (int I = 0, E = VL.size(); I < E; ++I) {
4953 CurrentOrder[Indices.top().second] = I;
4954 IsIdentity &= Indices.top().second == I;
4958 CurrentOrder.clear();
4959 TreeEntry *TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,
4960 None, CurrentOrder);
4963 constexpr int NumOps = 2;
4964 ValueList VectorOperands[NumOps];
4965 for (int I = 0; I < NumOps; ++I) {
4967 VectorOperands[I].push_back(cast<Instruction>(V)->getOperand(I));
4969 TE->setOperand(I, VectorOperands[I]);
4971 buildTree_rec(VectorOperands[NumOps - 1], Depth + 1, {TE, NumOps - 1});
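// Load bundles: depending on how the loads can be vectorized, the bundle
// becomes a plain vectorized load (optionally with a reorder), a
// ScatterVectorize entry whose pointer operands are built recursively, or a
// gather when neither is possible.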
4982 OrdersType CurrentOrder;
4983 TreeEntry *TE = nullptr;
4986 case LoadsState::Vectorize:
4987 if (CurrentOrder.empty()) {
4989 TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,
4990 ReuseShuffleIndicies);
4995 TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,
4996 ReuseShuffleIndicies, CurrentOrder);
4999 TE->setOperandsInOrder();
5001 case LoadsState::ScatterVectorize:
5003 TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
5004 UserTreeIdx, ReuseShuffleIndicies);
5005 TE->setOperandsInOrder();
5006 buildTree_rec(PointerOps, Depth + 1, {TE, 0});
5007 LLVM_DEBUG(dbgs() << "SLP: added a vector of non-consecutive loads.\n");
5009 case LoadsState::Gather:
5010 BS.cancelScheduling(VL, VL0);
5011 newTreeEntry(VL, None, S, UserTreeIdx,
5012 ReuseShuffleIndicies);
5014 Type *ScalarTy = VL0->getType();
5015 if (DL->getTypeSizeInBits(ScalarTy) !=
5016 DL->getTypeAllocSizeInBits(ScalarTy))
5017 LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
5019 return !cast<LoadInst>(V)->isSimple();
5029 case Instruction::ZExt:
5030 case Instruction::SExt:
5031 case Instruction::FPToUI:
5032 case Instruction::FPToSI:
5033 case Instruction::FPExt:
5034 case Instruction::PtrToInt:
5035 case Instruction::IntToPtr:
5036 case Instruction::SIToFP:
5037 case Instruction::UIToFP:
5038 case Instruction::Trunc:
5039 case Instruction::FPTrunc:
5040 case Instruction::BitCast: {
5041 Type *SrcTy = VL0->getOperand(0)->getType();
5042 for (Value *V : VL) {
5043 Type *Ty = cast<Instruction>(V)->getOperand(0)->getType();
5045 BS.cancelScheduling(VL, VL0);
5046 newTreeEntry(VL, None, S, UserTreeIdx,
5047 ReuseShuffleIndicies);
5049 << "SLP: Gathering casts with different src types.\n");
5053 TreeEntry *TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,
5054 ReuseShuffleIndicies);
5057 TE->setOperandsInOrder();
5058 for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
5062 Operands.push_back(cast<Instruction>(V)->getOperand(i));
5068 case Instruction::ICmp:
5069 case Instruction::FCmp: {
5073 Type *ComparedTy = VL0->getOperand(0)->getType();
5074 for (Value *V : VL) {
5076 if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
5077 Cmp->getOperand(0)->getType() != ComparedTy) {
5078 BS.cancelScheduling(VL, VL0);
5079 newTreeEntry(VL, None, S, UserTreeIdx,
5080 ReuseShuffleIndicies);
5082 << "SLP: Gathering cmp with different predicate.\n");
5087 TreeEntry *TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,
5088 ReuseShuffleIndicies);
5095 assert(P0 == SwapP0 && "Commutative Predicate mismatch");
5096 reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this);
5099 for (Value *V : VL) {
5100 auto *Cmp = cast<CmpInst>(V);
5103 if (Cmp->getPredicate() != P0)
5109 TE->setOperand(0, Left);
5110 TE->setOperand(1, Right);
5111 buildTree_rec(Left, Depth + 1, {TE, 0});
5112 buildTree_rec(Right, Depth + 1, {TE, 1});
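// Binary operators: for commutative operations the operands are reordered
// with reorderInputsAccordingToOpcode before recursing; otherwise the
// operands are taken in their original order via setOperandsInOrder.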
5116 case Instruction::FNeg:
5118 case Instruction::FAdd:
5119 case Instruction::Sub:
5120 case Instruction::FSub:
5122 case Instruction::FMul:
5123 case Instruction::UDiv:
5124 case Instruction::SDiv:
5125 case Instruction::FDiv:
5126 case Instruction::URem:
5127 case Instruction::SRem:
5128 case Instruction::FRem:
5129 case Instruction::Shl:
5130 case Instruction::LShr:
5131 case Instruction::AShr:
5132 case Instruction::And:
5133 case Instruction::Or:
5134 case Instruction::Xor: {
5135 TreeEntry *TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,
5136 ReuseShuffleIndicies);
5141 if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) {
5143 reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this);
5144 TE->setOperand(0, Left);
5145 TE->setOperand(1, Right);
5146 buildTree_rec(Left, Depth + 1, {TE, 0});
5147 buildTree_rec(Right, Depth + 1, {TE, 1});
5151 TE->setOperandsInOrder();
5152 for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
5156 Operands.push_back(cast<Instruction>(V)->getOperand(i));
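// GetElementPtr bundles are vectorized only when every GEP has exactly two
// operands, the same source element type, and index operands of a compatible
// type (constant indices are required unless the user is a ScatterVectorize
// entry); anything else forces a gather.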
5162 case Instruction::GetElementPtr: {
5164 for (Value *V : VL) {
5165 auto *I = dyn_cast<GetElementPtrInst>(V);
5168 if (I->getNumOperands() != 2) {
5169 LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
5170 BS.cancelScheduling(VL, VL0);
5171 newTreeEntry(VL, None, S, UserTreeIdx,
5172 ReuseShuffleIndicies);
5179 Type *Ty0 = cast<GEPOperator>(VL0)->getSourceElementType();
5180 for (Value *V : VL) {
5181 auto *GEP = dyn_cast<GEPOperator>(V);
5184 Type *CurTy = GEP->getSourceElementType();
5187 << "SLP: not-vectorizable GEP (different types).\n");
5188 BS.cancelScheduling(VL, VL0);
5189 newTreeEntry(VL, None, S, UserTreeIdx,
5190 ReuseShuffleIndicies);
5195 bool IsScatterUser =
5196 UserTreeIdx.UserTE &&
5197 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
5199 Type *Ty1 = VL0->getOperand(1)->getType();
5200 for (Value *V : VL) {
5201 auto *I = dyn_cast<GetElementPtrInst>(V);
5204 auto *Op = I->getOperand(1);
5205 if ((!IsScatterUser && !isa<ConstantInt>(Op)) ||
5206 (Op->getType() != Ty1 &&
5207 ((IsScatterUser && !isa<ConstantInt>(Op)) ||
5208 Op->getType()->getScalarSizeInBits() >
5209 DL->getIndexSizeInBits(
5212 << "SLP: not-vectorizable GEP (non-constant indexes).\n");
5213 BS.cancelScheduling(VL, VL0);
5214 newTreeEntry(VL, None, S, UserTreeIdx,
5215 ReuseShuffleIndicies);
5220 TreeEntry *TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,
5221 ReuseShuffleIndicies);
5225 for (Value *V : VL) {
5226 auto *GEP = dyn_cast<GetElementPtrInst>(V);
5231 Operands.front().push_back(GEP->getPointerOperand());
5240 Type *VL0Ty = VL0->getOperand(IndexIdx)->getType();
5242 [VL0Ty, IndexIdx](Value *V) {
5243 auto *GEP = dyn_cast<GetElementPtrInst>(V);
5246 return VL0Ty == GEP->getOperand(IndexIdx)->getType();
5249 : DL->getIndexType(cast<GetElementPtrInst>(VL0)
5250 ->getPointerOperandType()
5253 for (Value *V : VL) {
5254 auto *I = dyn_cast<GetElementPtrInst>(V);
5260 auto *Op = I->getOperand(IndexIdx);
5261 auto *CI = dyn_cast<ConstantInt>(Op);
5266 CI, Ty, CI->getValue().isSignBitSet()));
5270 for (unsigned I = 0, Ops = Operands.size(); I < Ops; ++I)
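// Store bundles require a packed value type and simple stores; the collected
// pointer operands must cover a consecutive range, possibly after reordering
// (jumbled stores), otherwise the bundle is gathered.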
5276 llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
5279 if (DL->getTypeSizeInBits(ScalarTy) !=
5280 DL->getTypeAllocSizeInBits(ScalarTy)) {
5281 BS.cancelScheduling(VL, VL0);
5282 newTreeEntry(VL, None, S, UserTreeIdx,
5283 ReuseShuffleIndicies);
5284 LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n");
5291 auto POIter = PointerOps.begin();
5293 for (Value *V : VL) {
5294 auto *SI = cast<StoreInst>(V);
5295 if (!SI->isSimple()) {
5296 BS.cancelScheduling(VL, VL0);
5297 newTreeEntry(VL, None, S, UserTreeIdx,
5298 ReuseShuffleIndicies);
5302 *POIter = SI->getPointerOperand();
5303 *OIter = SI->getValueOperand();
5308 OrdersType CurrentOrder;
5313 if (CurrentOrder.empty()) {
5314 Ptr0 = PointerOps.front();
5315 PtrN = PointerOps.back();
5317 Ptr0 = PointerOps[CurrentOrder.front()];
5318 PtrN = PointerOps[CurrentOrder.back()];
5323 if (static_cast<unsigned>(*Dist) == VL.size() - 1) {
5324 if (CurrentOrder.empty()) {
5326 TreeEntry *TE = newTreeEntry(VL, Bundle, S,
5327 UserTreeIdx, ReuseShuffleIndicies);
5328 TE->setOperandsInOrder();
5334 newTreeEntry(VL, Bundle, S, UserTreeIdx,
5335 ReuseShuffleIndicies, CurrentOrder);
5336 TE->setOperandsInOrder();
5338 LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled stores.\n");