using namespace slpvectorizer;

#define SV_NAME "slp-vectorizer"
#define DEBUG_TYPE "SLP"

STATISTIC(NumVectorInstructions, "Number of vector instructions generated");

static cl::opt<int>
    SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
                     cl::desc("Only vectorize if you gain more than this "
                              "number "));

static cl::opt<bool>
ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
                   cl::desc("Attempt to vectorize horizontal reductions"));

static cl::opt<bool> ShouldStartVectorizeHorAtStore(
    "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
    cl::desc(
        "Attempt to vectorize horizontal reductions feeding into a store"));

static cl::opt<int>
MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
    cl::desc("Attempt to vectorize for this register size in bits"));

static cl::opt<int> ScheduleRegionSizeBudget(
    "slp-schedule-budget", cl::init(100000), cl::Hidden,
    cl::desc("Limit the size of the SLP scheduling region per block"));

static cl::opt<int>
MinVectorRegSizeOption("slp-min-reg-size", cl::init(128), cl::Hidden,
    cl::desc("Attempt to vectorize for this register size in bits"));

static cl::opt<unsigned> RecursionMaxDepth(
    "slp-recursion-max-depth", cl::init(12), cl::Hidden,
    cl::desc("Limit the recursion depth when building a vectorizable tree"));

static cl::opt<unsigned> MinTreeSize(
    "slp-min-tree-size", cl::init(3), cl::Hidden,
    cl::desc("Only vectorize small trees if they are fully vectorizable"));
/// \returns true if all of the instructions in \p VL are in the same block.
static bool allSameBlock(ArrayRef<Value *> VL) {
  Instruction *I0 = dyn_cast<Instruction>(VL[0]);
  if (!I0)
    return false;
  BasicBlock *BB = I0->getParent();
  for (int i = 1, e = VL.size(); i < e; i++) {
    // ... (give up if VL[i] is not an instruction in BB)
  }
  return true;
}

/// \returns True if all of the values in \p VL are constants.
static bool allConstant(ArrayRef<Value *> VL) {
  for (Value *i : VL)
    if (!isa<Constant>(i))
      return false;
  return true;
}

/// \returns True if all of the values in \p VL are identical.
static bool isSplat(ArrayRef<Value *> VL) {
  for (unsigned i = 1, e = VL.size(); i < e; ++i)
    if (VL[i] != VL[0])
      return false;
  return true;
}
/// \returns the opcode of the alternate (e.g. FAdd <-> FSub) operation.
static unsigned getAltOpcode(unsigned Op) {
  switch (Op) {
  case Instruction::FAdd:
    return Instruction::FSub;
  case Instruction::FSub:
    return Instruction::FAdd;
  case Instruction::Add:
    return Instruction::Sub;
  case Instruction::Sub:
    return Instruction::Add;
  default:
    return 0;
  }
}

/// Chances are good these operations can be combined into a vectorized
/// alternate-opcode sequence.
static bool canCombineAsAltInst(unsigned Op) {
  return Op == Instruction::FAdd || Op == Instruction::FSub ||
         Op == Instruction::Sub || Op == Instruction::Add;
}
/// \returns ShuffleVector instruction if instructions in \p VL form an
/// alternating (e.g. FAdd, FSub, FAdd, FSub, ...) pattern, and 0 otherwise.
static unsigned isAltInst(ArrayRef<Value *> VL) {
  Instruction *I0 = dyn_cast<Instruction>(VL[0]);
  unsigned Opcode = I0->getOpcode();
  unsigned AltOpcode = getAltOpcode(Opcode);
  for (int i = 1, e = VL.size(); i < e; i++) {
    Instruction *I = dyn_cast<Instruction>(VL[i]);
    if (!I || I->getOpcode() != ((i & 1) ? AltOpcode : Opcode))
      return 0;
  }
  return Instruction::ShuffleVector;
}
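// Illustrative example (not from the original source): an alternating
// add/sub bundle such as
//   %v0 = fadd float %a0, %b0   ; lane 0, even -> Opcode
//   %v1 = fsub float %a1, %b1   ; lane 1, odd  -> AltOpcode
//   %v2 = fadd float %a2, %b2
//   %v3 = fsub float %a3, %b3
// passes the (i & 1) check above. Such a bundle is later emitted as two
// full-width vector ops combined by a shufflevector, which is why
// isAltInst() reports Instruction::ShuffleVector as the bundle's
// pseudo-opcode.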
/// \returns The opcode if all of the Instructions in \p VL have the same
/// opcode, or zero.
static unsigned getSameOpcode(ArrayRef<Value *> VL) {
  Instruction *I0 = dyn_cast<Instruction>(VL[0]);
  if (!I0)
    return 0;
  unsigned Opcode = I0->getOpcode();
  for (int i = 1, e = VL.size(); i < e; i++) {
    Instruction *I = dyn_cast<Instruction>(VL[i]);
    if (!I || Opcode != I->getOpcode()) {
      if (canCombineAsAltInst(Opcode) && i == 1)
        return isAltInst(VL);
      return 0;
    }
  }
  return Opcode;
}

/// Propagate IR flags to the vectorized instruction \p I: the flags on the
/// vector operation are the intersection of the flags of all scalars in \p VL.
static void propagateIRFlags(Value *I, ArrayRef<Value *> VL) {
  if (auto *VecOp = dyn_cast<Instruction>(I)) {
    if (auto *Intersection = dyn_cast<Instruction>(VL[0])) {
      // Intersect the IR flags present on all the scalar operations.
      for (int i = 1, e = VL.size(); i < e; ++i) {
        if (auto *Scalar = dyn_cast<Instruction>(VL[i]))
          Intersection->andIRFlags(Scalar);
      }
      // Apply the intersected flags to the vectorized operation.
      VecOp->copyIRFlags(Intersection);
    }
  }
}
/// \returns true if all of the values in \p VL have the same type.
static bool allSameType(ArrayRef<Value *> VL) {
  Type *Ty = VL[0]->getType();
  for (int i = 1, e = VL.size(); i < e; i++)
    if (VL[i]->getType() != Ty)
      return false;
  return true;
}

/// \returns True if the Extract{Value,Element} instruction extracts element
/// \p Idx.
static bool matchExtractIndex(Instruction *E, unsigned Idx, unsigned Opcode) {
  assert(Opcode == Instruction::ExtractElement ||
         Opcode == Instruction::ExtractValue);
  if (Opcode == Instruction::ExtractElement) {
    ConstantInt *CI = dyn_cast<ConstantInt>(E->getOperand(1));
    return CI && CI->getZExtValue() == Idx;
  }
  // ... (ExtractValue: a single index equal to Idx)
}
/// \returns True if the in-tree use of the scalar also requires an extract,
/// i.e. the scalar remains a scalar operand in the vectorized user.
static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
                                    TargetLibraryInfo *TLI) {
  switch (UserInst->getOpcode()) {
  case Instruction::Load: {
    LoadInst *LI = cast<LoadInst>(UserInst);
    return (LI->getPointerOperand() == Scalar);
  }
  // ...
  case Instruction::Call: {
    CallInst *CI = cast<CallInst>(UserInst);
    // ... (true if the intrinsic takes Scalar as its scalar operand)
  }
  default:
    return false;
  }
}

/// \returns the AA location that is being accessed by the instruction.
static MemoryLocation getLocation(Instruction *I, AliasAnalysis *AA) {
  if (StoreInst *SI = dyn_cast<StoreInst>(I))
    return MemoryLocation::get(SI);
  if (LoadInst *LI = dyn_cast<LoadInst>(I))
    return MemoryLocation::get(LI);
  return MemoryLocation();
}

/// \returns True if the instruction is not a volatile or atomic load/store.
static bool isSimple(Instruction *I) {
  if (LoadInst *LI = dyn_cast<LoadInst>(I))
    return LI->isSimple();
  if (StoreInst *SI = dyn_cast<StoreInst>(I))
    return SI->isSimple();
  if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
    return !MI->isVolatile();
  return true;
}
namespace slpvectorizer {

/// Bottom Up SLP Vectorizer.
class BoUpSLP {
public:
  // ...

  BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
          TargetLibraryInfo *TLi, AliasAnalysis *Aa, LoopInfo *Li,
          DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
          const DataLayout *DL)
      : NumLoadsWantToKeepOrder(0), NumLoadsWantToChangeOrder(0), F(Func),
        SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt), AC(AC), DB(DB),
        DL(DL), Builder(Se->getContext()) {
    // ...
    // Use the vector register size specified by the target unless overridden
    // by a command-line option.
    MaxVecRegSize = TTI->getRegisterBitWidth(true);
    // ...
  }
  /// Vectorize the tree and \returns the vectorized root.
  Value *vectorizeTree();

  /// Clear the internal data structures that are created by 'buildTree'.
  void deleteTree() {
    VectorizableTree.clear();
    ScalarToTreeEntry.clear();
    // ...
    ExternalUses.clear();
    NumLoadsWantToKeepOrder = 0;
    NumLoadsWantToChangeOrder = 0;
    for (auto &Iter : BlocksSchedules) {
      BlockScheduling *BS = Iter.second.get();
      BS->clear();
    }
    // ...
  }

  /// Perform LICM and CSE on the newly generated gather sequences.
  void optimizeGatherSequence();

  /// \returns true if it is beneficial to reverse the vector order.
  bool shouldReorder() const {
    return NumLoadsWantToChangeOrder > NumLoadsWantToKeepOrder;
  }
  /// \returns the vector element size in bits to use when vectorizing the
  /// expression tree ending at \p V.
  unsigned getVectorElementSize(Value *V);

  /// \returns the maximum vector register size in bits.
  unsigned getMaxVecRegSize() const {
    return MaxVecRegSize;
  }

  /// \returns the minimum vector register size in bits.
  unsigned getMinVecRegSize() const {
    return MinVecRegSize;
  }

  /// \returns true if the tree is so small that it is unprofitable to
  /// vectorize unless it is fully vectorizable.
  bool isTreeTinyAndNotFullyVectorizable();

private:
  /// \returns the cost of the vectorizable entry.
  int getEntryCost(TreeEntry *E);

  /// Vectorize a single entry in the tree.
  Value *vectorizeTree(TreeEntry *E);

  /// \returns the scalarization cost for this type. Scalarization in this
  /// context means the creation of a vector from a group of scalars.
  int getGatherCost(Type *Ty);

  /// \returns whether the VectorizableTree is fully vectorizable and will
  /// be beneficial even when the tree height is tiny.
  bool isFullyVectorizableTinyTree();
  struct TreeEntry {
    TreeEntry() : Scalars(), VectorizedValue(nullptr), NeedToGather(false) {}

    /// \returns true if the scalars in VL are equal to this entry.
    bool isSame(ArrayRef<Value *> VL) const {
      assert(VL.size() == Scalars.size() && "Invalid size");
      return std::equal(VL.begin(), VL.end(), Scalars.begin());
    }

    /// A vector of scalars.
    ValueList Scalars;

    /// The Scalars are vectorized into this value. It is initialized to Null.
    Value *VectorizedValue;

    /// Do we need to gather this sequence?
    bool NeedToGather;
  };

  /// Create a new VectorizableTree entry.
  TreeEntry *newTreeEntry(ArrayRef<Value *> VL, bool Vectorized) {
    VectorizableTree.emplace_back();
    int idx = VectorizableTree.size() - 1;
    TreeEntry *Last = &VectorizableTree[idx];
    Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end());
    Last->NeedToGather = !Vectorized;
    if (Vectorized) {
      for (int i = 0, e = VL.size(); i != e; ++i) {
        assert(!ScalarToTreeEntry.count(VL[i]) && "Scalar already in tree!");
        ScalarToTreeEntry[VL[i]] = idx;
      }
    } else {
      MustGather.insert(VL.begin(), VL.end());
    }
    return Last;
  }

  /// -- Vectorization State --

  /// Holds all of the tree entries.
  std::vector<TreeEntry> VectorizableTree;
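  // Illustrative example (not from the original source): for the scalar code
  //   a[0] = x0 + y0;  a[1] = x1 + y1;
  // buildTree() creates one TreeEntry for the two stores, one for the two
  // adds, and recursively entries for the x and y operand bundles. Bundles
  // that cannot be vectorized get NeedToGather = true and are materialized
  // later as insertelement sequences (see Gather()).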
  /// This POD struct describes one external user in the vectorized tree:
  /// which internal Scalar it uses, the User itself, and the Lane the scalar
  /// occupies in the vector.
  struct ExternalUser {
    // ...
  };

  /// Checks if two instructions may access the same memory. The result is
  /// cached, keyed by the instruction pair.
  bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
                 Instruction *Inst2) {
    AliasCacheKey key = std::make_pair(Inst1, Inst2);
    // ... (return the cached result if present)
    bool aliased = true;
    if (Loc1.Ptr && Loc2.Ptr && isSimple(Inst1) && isSimple(Inst2)) {
      // Do the alias check.
      aliased = AA->alias(Loc1, Loc2);
    }
    // ... (store the result in the cache)
    return aliased;
  }

  typedef std::pair<Instruction *, Instruction *> AliasCacheKey;

  /// Removes an instruction from its block; the actual deletion is delayed
  /// until BoUpSLP is destructed.
  void eraseInstruction(Instruction *I) {
    I->removeFromParent();
    I->dropAllReferences();
    DeletedInstructions.push_back(std::unique_ptr<Instruction>(I));
  }

  /// A list of values that need to be extracted out of the tree.
  /// This list holds pairs of (Internal Scalar : External User).
  UserList ExternalUses;
  /// Contains all scheduling relevant data for an instruction.
  /// A ScheduleData either represents a single instruction or a member of an
  /// instruction bundle (= a group of instructions which is combined into a
  /// vector instruction).
  struct ScheduleData {
    // The initial value for the dependency counters. It means that the
    // dependencies are not calculated yet.
    enum { InvalidDeps = -1 };

    ScheduleData()
        : Inst(nullptr), FirstInBundle(nullptr), NextInBundle(nullptr),
          NextLoadStore(nullptr), SchedulingRegionID(0), SchedulingPriority(0),
          Dependencies(InvalidDeps), UnscheduledDeps(InvalidDeps),
          UnscheduledDepsInBundle(InvalidDeps), IsScheduled(false) {}

    void init(int BlockSchedulingRegionID) {
      FirstInBundle = this;
      NextInBundle = nullptr;
      NextLoadStore = nullptr;
      IsScheduled = false;
      SchedulingRegionID = BlockSchedulingRegionID;
      UnscheduledDepsInBundle = UnscheduledDeps;
      clearDependencies();
    }

    /// Returns true if the dependency information has been calculated.
    bool hasValidDependencies() const { return Dependencies != InvalidDeps; }

    /// Returns true for single instructions and for bundle representatives
    /// (= the head of a bundle).
    bool isSchedulingEntity() const { return FirstInBundle == this; }

    /// Returns true if it represents an instruction bundle and not only a
    /// single instruction.
    bool isPartOfBundle() const {
      return NextInBundle != nullptr || FirstInBundle != this;
    }

    /// Returns true if it is ready for scheduling, i.e. it has no more
    /// unscheduled depending instructions/bundles.
    bool isReady() const {
      assert(isSchedulingEntity() &&
             "can't consider non-scheduling entity for ready list");
      return UnscheduledDepsInBundle == 0 && !IsScheduled;
    }

    /// Modifies the number of unscheduled dependencies, also updating it for
    /// the whole bundle.
    int incrementUnscheduledDeps(int Incr) {
      UnscheduledDeps += Incr;
      return FirstInBundle->UnscheduledDepsInBundle += Incr;
    }

    /// Sets the number of unscheduled dependencies to the number of
    /// dependencies.
    void resetUnscheduledDeps() {
      incrementUnscheduledDeps(Dependencies - UnscheduledDeps);
    }

    /// Clears all dependency information.
    void clearDependencies() {
      Dependencies = InvalidDeps;
      resetUnscheduledDeps();
      MemoryDependencies.clear();
    }

    void dump(raw_ostream &os) const {
      if (!isSchedulingEntity()) {
        os << "/ " << *Inst;
      } else if (NextInBundle) {
        os << '[' << *Inst;
        ScheduleData *SD = NextInBundle;
        while (SD) {
          os << ';' << *SD->Inst;
          SD = SD->NextInBundle;
        }
        os << ']';
      } else {
        os << *Inst;
      }
    }

    Instruction *Inst;

    /// Points to the head in an instruction bundle (and always to this for
    /// single instructions).
    ScheduleData *FirstInBundle;

    /// Single linked list of all instructions in a bundle. Null if it is a
    /// single instruction.
    ScheduleData *NextInBundle;

    /// Single linked list of all memory instructions (e.g. load, store, call)
    /// in the block - until the end of the scheduling region.
    ScheduleData *NextLoadStore;

    /// The dependent memory instructions; derived on demand in
    /// calculateDependencies().
    SmallVector<ScheduleData *, 4> MemoryDependencies;

    /// This ScheduleData is in the current scheduling region if this matches
    /// the current SchedulingRegionID of BlockScheduling.
    int SchedulingRegionID;

    /// Used for getting a "good" final ordering of instructions.
    int SchedulingPriority;

    /// The number of dependencies: the number of users of the instruction
    /// plus the number of dependent memory instructions (if any).
    int Dependencies;

    /// The number of dependencies minus the number of dependencies of
    /// scheduled instructions.
    int UnscheduledDeps;

    /// The sum of UnscheduledDeps in a bundle.
    int UnscheduledDepsInBundle;

    /// True if this instruction is scheduled (or considered as scheduled).
    bool IsScheduled;
  };
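  // Worked example (illustrative only): for a bundle {A, B} where A has two
  // users inside the scheduling region and B has one, A.UnscheduledDeps = 2,
  // B.UnscheduledDeps = 1, and the bundle head carries
  // UnscheduledDepsInBundle = 3. Each time a dependent bundle is scheduled,
  // incrementUnscheduledDeps(-1) runs on the corresponding member; the whole
  // bundle becomes ready once the head's count reaches 0 (see isReady()).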
  friend inline raw_ostream &operator<<(raw_ostream &os,
                                        const BoUpSLP::ScheduleData &SD) {
    SD.dump(os);
    return os;
  }
  /// Contains all scheduling data for a basic block.
  struct BlockScheduling {
    BlockScheduling(BasicBlock *BB)
        : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize),
          ScheduleStart(nullptr), ScheduleEnd(nullptr),
          FirstLoadStoreInRegion(nullptr), LastLoadStoreInRegion(nullptr),
          ScheduleRegionSize(0),
          ScheduleRegionSizeLimit(ScheduleRegionSizeBudget),
          // Make sure that the initial SchedulingRegionID is greater than the
          // initial SchedulingRegionID in ScheduleData (which is 0).
          SchedulingRegionID(1) {}

    void clear() {
      ReadyInsts.clear();
      ScheduleStart = nullptr;
      ScheduleEnd = nullptr;
      FirstLoadStoreInRegion = nullptr;
      LastLoadStoreInRegion = nullptr;

      // Reduce the maximum schedule region size by the size of the
      // previous scheduling run.
      ScheduleRegionSizeLimit -= ScheduleRegionSize;
      // ...
      ScheduleRegionSize = 0;

      // Make a new scheduling region, i.e. all existing ScheduleData is not
      // in the new region yet.
      ++SchedulingRegionID;
    }

    ScheduleData *getScheduleData(Value *V) {
      ScheduleData *SD = ScheduleDataMap[V];
      if (SD && SD->SchedulingRegionID == SchedulingRegionID)
        return SD;
      return nullptr;
    }

    bool isInSchedulingRegion(ScheduleData *SD) {
      return SD->SchedulingRegionID == SchedulingRegionID;
    }

    /// Marks an instruction as scheduled and puts all dependent ready
    /// instructions into the ready-list.
    template <typename ReadyListType>
    void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
      SD->IsScheduled = true;
      DEBUG(dbgs() << "SLP:   schedule " << *SD << "\n");

      ScheduleData *BundleMember = SD;
      while (BundleMember) {
        // Handle the def-use chain dependencies.
        for (Use &U : BundleMember->Inst->operands()) {
          ScheduleData *OpDef = getScheduleData(U.get());
          if (OpDef && OpDef->hasValidDependencies() &&
              OpDef->incrementUnscheduledDeps(-1) == 0) {
            // There are no more unscheduled dependencies after decrementing,
            // so we can put the dependent instruction into the ready list.
            ScheduleData *DepBundle = OpDef->FirstInBundle;
            assert(!DepBundle->IsScheduled &&
                   "already scheduled bundle gets ready");
            ReadyList.insert(DepBundle);
            DEBUG(dbgs() << "SLP:    gets ready (def): " << *DepBundle
                         << "\n");
          }
        }
        // Handle the memory dependencies.
        for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
          if (MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
            // There are no more unscheduled dependencies after decrementing,
            // so we can put the dependent instruction into the ready list.
            ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
            assert(!DepBundle->IsScheduled &&
                   "already scheduled bundle gets ready");
            ReadyList.insert(DepBundle);
            DEBUG(dbgs() << "SLP:    gets ready (mem): " << *DepBundle
                         << "\n");
          }
        }
        BundleMember = BundleMember->NextInBundle;
      }
    }

    /// Put all instructions into the ReadyList which are ready for scheduling.
    template <typename ReadyListType>
    void initialFillReadyList(ReadyListType &ReadyList) {
      for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
        ScheduleData *SD = getScheduleData(I);
        if (SD->isSchedulingEntity() && SD->isReady()) {
          ReadyList.insert(SD);
          DEBUG(dbgs() << "SLP:    initially in ready list: " << *I << "\n");
        }
      }
    }

    /// Extends the scheduling region so that V is inside the region.
    /// \returns true if the region size is within the limit.
    bool extendSchedulingRegion(Value *V);

    /// Initialize the ScheduleData structures for new instructions in the
    /// scheduling region.
    void initScheduleData(Instruction *FromI, Instruction *ToI,
                          ScheduleData *PrevLoadStore,
                          ScheduleData *NextLoadStore);

    /// Updates the dependency information of a bundle and of all
    /// instructions/bundles which depend on the original bundle.
    void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,
                               BoUpSLP *SLP);

    /// Sets all instructions in the scheduling region to un-scheduled.
    void resetSchedule();

    BasicBlock *BB;

    /// Simple memory allocation for ScheduleData.
    std::vector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;

    // ...

    /// The ready-list for scheduling (only used for the dry-run).
    struct ReadyList : SmallVector<ScheduleData *, 8> {
      void insert(ScheduleData *SD) { push_back(SD); }
    };
    ReadyList ReadyInsts;

    // ...

    /// The first memory accessing instruction in the scheduling region.
    ScheduleData *FirstLoadStoreInRegion;

    /// The last memory accessing instruction in the scheduling region.
    ScheduleData *LastLoadStoreInRegion;

    /// The current size of the scheduling region.
    int ScheduleRegionSize;

    /// The maximum size allowed for the scheduling region.
    int ScheduleRegionSizeLimit;

    /// The ID of the scheduling region. For a new vectorization iteration this
    /// is incremented which "removes" all ScheduleData from the region.
    int SchedulingRegionID;
  };
  /// Performs the "real" scheduling. Done before vectorization is actually
  /// performed in a basic block.
  void scheduleBlock(BlockScheduling *BS);

  // Number of load bundles that contain consecutive loads.
  int NumLoadsWantToKeepOrder;

  // Number of load bundles that contain consecutive loads in reversed order.
  int NumLoadsWantToChangeOrder;

  // ...
  unsigned MaxVecRegSize; // Set by TTI or overridden by cl::opt.
  unsigned MinVecRegSize; // Set by cl::opt (default: 128).
void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
                        ArrayRef<Value *> UserIgnoreLst) {
  deleteTree();
  UserIgnoreList = UserIgnoreLst;
  if (!allSameType(Roots))
    return;
  buildTree_rec(Roots, 0);

  // Collect the values that we need to extract from the tree.
  for (TreeEntry &EIdx : VectorizableTree) {
    TreeEntry *Entry = &EIdx;

    // For each lane:
    for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
      Value *Scalar = Entry->Scalars[Lane];

      // No need to handle users of gathered values.
      if (Entry->NeedToGather)
        continue;

      for (User *U : Scalar->users()) {
        DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");

        Instruction *UserInst = dyn_cast<Instruction>(U);
        if (!UserInst)
          continue;

        // Skip in-tree scalars that become vector elements.
        if (ScalarToTreeEntry.count(U)) {
          int Idx = ScalarToTreeEntry[U];
          TreeEntry *UseEntry = &VectorizableTree[Idx];
          Value *UseScalar = UseEntry->Scalars[0];
          // Some in-tree scalars will remain as scalars in vectorized
          // instructions. If that is the case, the one in Lane 0 will
          // be used.
          if (UseScalar != U ||
              !InTreeUserNeedToExtract(Scalar, UserInst, TLI)) {
            DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
                         << ".\n");
            assert(!VectorizableTree[Idx].NeedToGather && "Bad state");
            continue;
          }
        }

        // ... (skip users in the user ignore list)

        DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane " <<
              Lane << " from " << *Scalar << ".\n");
        ExternalUses.push_back(ExternalUser(Scalar, U, Lane));
      }
    }
  }
}
void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
  bool isAltShuffle = false;
  Instruction *VL0 = nullptr;

  if (Depth == RecursionMaxDepth) {
    DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
    newTreeEntry(VL, false);
    return;
  }

  // Don't handle vectors.
  if (VL[0]->getType()->isVectorTy()) {
    DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
    newTreeEntry(VL, false);
    return;
  }

  if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
    if (SI->getValueOperand()->getType()->isVectorTy()) {
      DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
      newTreeEntry(VL, false);
      return;
    }
  unsigned Opcode = getSameOpcode(VL);

  // Check that this shuffle vector refers to the alternate
  // sequence of opcodes.
  if (Opcode == Instruction::ShuffleVector) {
    Instruction *I0 = dyn_cast<Instruction>(VL[0]);
    unsigned Op = I0->getOpcode();
    if (Op != Instruction::ShuffleVector)
      isAltShuffle = true;
  }

  // If all of the operands are identical or constant we have a simple
  // solution.
  if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !Opcode) {
    DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n");
    newTreeEntry(VL, false);
    return;
  }

  // We now know that this is a vector of instructions of the same type from
  // the same block.

  // Don't vectorize ephemeral values.
  for (unsigned i = 0, e = VL.size(); i != e; ++i) {
    if (EphValues.count(VL[i])) {
      DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
            ") is ephemeral.\n");
      newTreeEntry(VL, false);
      return;
    }
  }

  // Check if this is a duplicate of another entry.
  if (ScalarToTreeEntry.count(VL[0])) {
    int Idx = ScalarToTreeEntry[VL[0]];
    TreeEntry *E = &VectorizableTree[Idx];
    for (unsigned i = 0, e = VL.size(); i != e; ++i) {
      DEBUG(dbgs() << "SLP: \tChecking bundle: " << *VL[i] << ".\n");
      if (E->Scalars[i] != VL[i]) {
        DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
        newTreeEntry(VL, false);
        return;
      }
    }
    DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *VL[0] << ".\n");
    return;
  }

  // Check that none of the instructions in the bundle are already in the
  // tree.
  for (unsigned i = 0, e = VL.size(); i != e; ++i) {
    if (ScalarToTreeEntry.count(VL[i])) {
      DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
            ") is already in tree.\n");
      newTreeEntry(VL, false);
      return;
    }
  }

  // If any of the scalars is marked as a value that needs to stay scalar,
  // then we need to gather the scalars.
  for (unsigned i = 0, e = VL.size(); i != e; ++i) {
    if (MustGather.count(VL[i])) {
      DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
      newTreeEntry(VL, false);
      return;
    }
  }

  // Check that all of the users of the scalars that we want to vectorize are
  // schedulable.
  VL0 = cast<Instruction>(VL[0]);
  BasicBlock *BB = VL0->getParent();

  if (!DT->isReachableFromEntry(BB)) {
    // Don't go into unreachable blocks. They may contain instructions with
    // dependency cycles which confuse the final scheduling.
    DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
    newTreeEntry(VL, false);
    return;
  }

  // Check that every instruction appears once in this bundle.
  for (unsigned i = 0, e = VL.size(); i < e; ++i)
    for (unsigned j = i+1; j < e; ++j)
      if (VL[i] == VL[j]) {
        DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
        newTreeEntry(VL, false);
        return;
      }

  auto &BSRef = BlocksSchedules[BB];
  if (!BSRef) {
    BSRef = llvm::make_unique<BlockScheduling>(BB);
  }
  BlockScheduling &BS = *BSRef.get();

  if (!BS.tryScheduleBundle(VL, this)) {
    DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
    assert((!BS.getScheduleData(VL[0]) ||
            !BS.getScheduleData(VL[0])->isPartOfBundle()) &&
           "tryScheduleBundle should cancelScheduling on failure");
    newTreeEntry(VL, false);
    return;
  }
  DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");

  switch (Opcode) {
    case Instruction::PHI: {
      PHINode *PH = dyn_cast<PHINode>(VL0);

      // Check for terminator values (e.g. invoke).
      for (unsigned j = 0; j < VL.size(); ++j)
        for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
          TerminatorInst *Term = dyn_cast<TerminatorInst>(
              cast<PHINode>(VL[j])->getIncomingValueForBlock(
                  PH->getIncomingBlock(i)));
          if (Term) {
            DEBUG(dbgs()
                  << "SLP: Need to swizzle PHINodes (TerminatorInst use).\n");
            BS.cancelScheduling(VL);
            newTreeEntry(VL, false);
            return;
          }
        }

      newTreeEntry(VL, true);
      DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");

      for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
        ValueList Operands;
        // Prepare the operand vector.
        for (Value *j : VL)
          Operands.push_back(cast<PHINode>(j)->getIncomingValueForBlock(
              PH->getIncomingBlock(i)));

        buildTree_rec(Operands, Depth + 1);
      }
      return;
    }
    case Instruction::ExtractValue:
    case Instruction::ExtractElement: {
      bool Reuse = canReuseExtract(VL, Opcode);
      if (Reuse) {
        DEBUG(dbgs() << "SLP: Reusing extract sequence.\n");
      } else {
        BS.cancelScheduling(VL);
      }
      newTreeEntry(VL, Reuse);
      return;
    }
    case Instruction::Load: {
      // Check that a vectorized load would load the same memory as a scalar
      // load.
      Type *ScalarTy = VL[0]->getType();

      if (DL->getTypeSizeInBits(ScalarTy) !=
          DL->getTypeAllocSizeInBits(ScalarTy)) {
        BS.cancelScheduling(VL);
        newTreeEntry(VL, false);
        DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
        return;
      }

      // Make sure all loads in the bundle are simple - we can't vectorize
      // atomic or volatile loads.
      for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) {
        LoadInst *L = cast<LoadInst>(VL[i]);
        if (!L->isSimple()) {
          BS.cancelScheduling(VL);
          newTreeEntry(VL, false);
          DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
          return;
        }
      }

      // Check if the loads are consecutive, reversed, or neither.
      bool Consecutive = true;
      bool ReverseConsecutive = true;
      for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) {
        if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) {
          Consecutive = false;
          break;
        } else {
          ReverseConsecutive = false;
        }
      }

      if (Consecutive) {
        ++NumLoadsWantToKeepOrder;
        newTreeEntry(VL, true);
        DEBUG(dbgs() << "SLP: added a vector of loads.\n");
        return;
      }

      // If none of the load pairs were consecutive when checked in order,
      // check the reverse order.
      if (ReverseConsecutive)
        for (unsigned i = VL.size() - 1; i > 0; --i)
          if (!isConsecutiveAccess(VL[i], VL[i - 1], *DL, *SE)) {
            ReverseConsecutive = false;
            break;
          }

      BS.cancelScheduling(VL);
      newTreeEntry(VL, false);

      if (ReverseConsecutive) {
        ++NumLoadsWantToChangeOrder;
        DEBUG(dbgs() << "SLP: Gathering reversed loads.\n");
      } else {
        DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
      }
      return;
    }
    case Instruction::ZExt:
    case Instruction::SExt:
    case Instruction::FPToUI:
    case Instruction::FPToSI:
    case Instruction::FPExt:
    case Instruction::PtrToInt:
    case Instruction::IntToPtr:
    case Instruction::SIToFP:
    case Instruction::UIToFP:
    case Instruction::Trunc:
    case Instruction::FPTrunc:
    case Instruction::BitCast: {
      Type *SrcTy = VL0->getOperand(0)->getType();
      for (unsigned i = 0; i < VL.size(); ++i) {
        Type *Ty = cast<Instruction>(VL[i])->getOperand(0)->getType();
        if (Ty != SrcTy || !isValidElementType(Ty)) {
          BS.cancelScheduling(VL);
          newTreeEntry(VL, false);
          DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n");
          return;
        }
      }
      newTreeEntry(VL, true);
      DEBUG(dbgs() << "SLP: added a vector of casts.\n");

      for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
        ValueList Operands;
        // Prepare the operand vector.
        for (Value *j : VL)
          Operands.push_back(cast<Instruction>(j)->getOperand(i));

        buildTree_rec(Operands, Depth+1);
      }
      return;
    }
    case Instruction::ICmp:
    case Instruction::FCmp: {
      // Check that all of the compares have the same predicate.
      CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
      Type *ComparedTy = cast<Instruction>(VL[0])->getOperand(0)->getType();
      for (unsigned i = 1, e = VL.size(); i < e; ++i) {
        CmpInst *Cmp = cast<CmpInst>(VL[i]);
        if (Cmp->getPredicate() != P0 ||
            Cmp->getOperand(0)->getType() != ComparedTy) {
          BS.cancelScheduling(VL);
          newTreeEntry(VL, false);
          DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
          return;
        }
      }

      newTreeEntry(VL, true);
      DEBUG(dbgs() << "SLP: added a vector of compares.\n");

      for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
        ValueList Operands;
        // Prepare the operand vector.
        for (Value *j : VL)
          Operands.push_back(cast<Instruction>(j)->getOperand(i));

        buildTree_rec(Operands, Depth+1);
      }
      return;
    }
    case Instruction::Select:
    case Instruction::Add:
    case Instruction::FAdd:
    case Instruction::Sub:
    case Instruction::FSub:
    case Instruction::Mul:
    case Instruction::FMul:
    case Instruction::UDiv:
    case Instruction::SDiv:
    case Instruction::FDiv:
    case Instruction::URem:
    case Instruction::SRem:
    case Instruction::FRem:
    case Instruction::Shl:
    case Instruction::LShr:
    case Instruction::AShr:
    case Instruction::And:
    case Instruction::Or:
    case Instruction::Xor: {
      newTreeEntry(VL, true);
      DEBUG(dbgs() << "SLP: added a vector of bin op.\n");

      // Sort operands of the instructions so that each side is more likely
      // to have the same opcode.
      if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) {
        ValueList Left, Right;
        reorderInputsAccordingToOpcode(VL, Left, Right);
        buildTree_rec(Left, Depth + 1);
        buildTree_rec(Right, Depth + 1);
        return;
      }

      for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
        ValueList Operands;
        // Prepare the operand vector.
        for (Value *j : VL)
          Operands.push_back(cast<Instruction>(j)->getOperand(i));

        buildTree_rec(Operands, Depth+1);
      }
      return;
    }
    case Instruction::GetElementPtr: {
      // We don't combine GEPs with complicated (nested) indexing.
      for (unsigned j = 0; j < VL.size(); ++j) {
        if (cast<Instruction>(VL[j])->getNumOperands() != 2) {
          DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
          BS.cancelScheduling(VL);
          newTreeEntry(VL, false);
          return;
        }
      }

      // We can't combine several GEPs into one vector if they operate on
      // different types.
      Type *Ty0 = cast<Instruction>(VL0)->getOperand(0)->getType();
      for (unsigned j = 0; j < VL.size(); ++j) {
        Type *CurTy = cast<Instruction>(VL[j])->getOperand(0)->getType();
        if (Ty0 != CurTy) {
          DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
          BS.cancelScheduling(VL);
          newTreeEntry(VL, false);
          return;
        }
      }

      // We don't combine GEPs with non-constant indexes.
      for (unsigned j = 0; j < VL.size(); ++j) {
        auto Op = cast<Instruction>(VL[j])->getOperand(1);
        if (!isa<ConstantInt>(Op)) {
          DEBUG(
              dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
          BS.cancelScheduling(VL);
          newTreeEntry(VL, false);
          return;
        }
      }

      newTreeEntry(VL, true);
      DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
      for (unsigned i = 0, e = 2; i < e; ++i) {
        ValueList Operands;
        // Prepare the operand vector.
        for (Value *j : VL)
          Operands.push_back(cast<Instruction>(j)->getOperand(i));

        buildTree_rec(Operands, Depth + 1);
      }
      return;
    }
    case Instruction::Store: {
      // Check if the stores are consecutive or if we need to swizzle them.
      for (unsigned i = 0, e = VL.size() - 1; i < e; ++i)
        if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) {
          BS.cancelScheduling(VL);
          newTreeEntry(VL, false);
          DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
          return;
        }

      newTreeEntry(VL, true);
      DEBUG(dbgs() << "SLP: added a vector of stores.\n");

      ValueList Operands;
      for (Value *j : VL)
        Operands.push_back(cast<Instruction>(j)->getOperand(0));

      buildTree_rec(Operands, Depth + 1);
      return;
    }
    case Instruction::Call: {
      // Check if the calls are all to the same vectorizable intrinsic.
      CallInst *CI = cast<CallInst>(VL[0]);
      // ... (look up the intrinsic ID for the call)
      if (!isTriviallyVectorizable(ID)) {
        BS.cancelScheduling(VL);
        newTreeEntry(VL, false);
        DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
        return;
      }
      Function *Int = CI->getCalledFunction();
      Value *A1I = nullptr;
      if (hasVectorInstrinsicScalarOpd(ID, 1))
        A1I = CI->getArgOperand(1);
      for (unsigned i = 1, e = VL.size(); i != e; ++i) {
        CallInst *CI2 = dyn_cast<CallInst>(VL[i]);
        if (!CI2 || CI2->getCalledFunction() != Int ||
            // ... (mismatched intrinsic ID or operand bundle schema)
            false) {
          BS.cancelScheduling(VL);
          newTreeEntry(VL, false);
          DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i]
                       << "\n");
          return;
        }
        // Some intrinsics have a scalar argument, and it has to be the same
        // in all calls.
        if (A1I) {
          Value *A1J = CI2->getArgOperand(1);
          if (A1I != A1J) {
            BS.cancelScheduling(VL);
            newTreeEntry(VL, false);
            DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI
                         << " argument "<< A1I<<"!=" << A1J
                         << "\n");
            return;
          }
        }
        // Verify that the bundle operands are identical between the calls.
        if (CI->hasOperandBundles() &&
            !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(),
                        CI->op_begin() + CI->getBundleOperandsEndIndex(),
                        CI2->op_begin() +
                            CI2->getBundleOperandsStartIndex())) {
          BS.cancelScheduling(VL);
          newTreeEntry(VL, false);
          DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI
                       << "!=" << *VL[i] << '\n');
          return;
        }
      }

      newTreeEntry(VL, true);
      for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) {
        ValueList Operands;
        // Prepare the operand vector.
        for (Value *j : VL) {
          CallInst *CI2 = dyn_cast<CallInst>(j);
          Operands.push_back(CI2->getArgOperand(i));
        }
        buildTree_rec(Operands, Depth + 1);
      }
      return;
    }
    case Instruction::ShuffleVector: {
      // If this is not an alternate sequence of opcodes like add-sub,
      // then do not vectorize this instruction.
      if (!isAltShuffle) {
        BS.cancelScheduling(VL);
        newTreeEntry(VL, false);
        DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
        return;
      }
      newTreeEntry(VL, true);
      DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");

      // Reorder operands if reordering would enable vectorization.
      if (isa<BinaryOperator>(VL0)) {
        ValueList Left, Right;
        reorderAltShuffleOperands(VL, Left, Right);
        buildTree_rec(Left, Depth + 1);
        buildTree_rec(Right, Depth + 1);
        return;
      }

      for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
        ValueList Operands;
        // Prepare the operand vector.
        for (Value *j : VL)
          Operands.push_back(cast<Instruction>(j)->getOperand(i));

        buildTree_rec(Operands, Depth + 1);
      }
      return;
    }
    default:
      BS.cancelScheduling(VL);
      newTreeEntry(VL, false);
      DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
      return;
  }
}
unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const {
  unsigned N;
  Type *EltTy;
  auto *ST = dyn_cast<StructType>(T);
  if (ST) {
    N = ST->getNumElements();
    EltTy = *ST->element_begin();
  } else {
    N = cast<ArrayType>(T)->getNumElements();
    EltTy = cast<ArrayType>(T)->getElementType();
  }
  // ... (check element type validity and the total store size)
  if (ST) {
    // Check that the struct is homogeneous.
    for (const auto *Ty : ST->elements())
      if (Ty != EltTy)
        return 0;
  }
  return N;
}

bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, unsigned Opcode) const {
  assert(Opcode == Instruction::ExtractElement ||
         Opcode == Instruction::ExtractValue);
  // Check if all of the extracts come from the same vector and from the
  // correct offset.
  Instruction *E0 = cast<Instruction>(VL[0]);
  Value *Vec = E0->getOperand(0);

  // We have to extract from a vector/aggregate with the same number of
  // elements.
  unsigned NElts;
  if (Opcode == Instruction::ExtractValue) {
    const DataLayout &DL = E0->getModule()->getDataLayout();
    NElts = canMapToVector(Vec->getType(), DL);
    if (!NElts)
      return false;
    // ... (the source must be a simple load used by the whole bundle)
  } else {
    NElts = Vec->getType()->getVectorNumElements();
  }

  if (NElts != VL.size())
    return false;

  // Check that all of the indices extract from the correct offset.
  if (!matchExtractIndex(E0, 0, Opcode))
    return false;

  for (unsigned i = 1, e = VL.size(); i < e; ++i) {
    Instruction *E = cast<Instruction>(VL[i]);
    if (!matchExtractIndex(E, i, Opcode))
      return false;
    if (E->getOperand(0) != Vec)
      return false;
  }

  return true;
}
int BoUpSLP::getEntryCost(TreeEntry *E) {
  ArrayRef<Value *> VL = E->Scalars;

  Type *ScalarTy = VL[0]->getType();
  if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
    ScalarTy = SI->getValueOperand()->getType();
  VectorType *VecTy = VectorType::get(ScalarTy, VL.size());

  // If we have computed a smaller type for the expression, update VecTy so
  // that the costs will be accurate.
  if (MinBWs.count(VL[0]))
    VecTy = VectorType::get(
        IntegerType::get(F->getContext(), MinBWs[VL[0]].first), VL.size());

  if (E->NeedToGather) {
    if (allConstant(VL))
      return 0;
    if (isSplat(VL))
      return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0);
    return getGatherCost(E->Scalars);
  }
  unsigned Opcode = getSameOpcode(VL);
  Instruction *VL0 = cast<Instruction>(VL[0]);
  switch (Opcode) {
    case Instruction::PHI: {
      return 0;
    }
    case Instruction::ExtractValue:
    case Instruction::ExtractElement: {
      if (canReuseExtract(VL, Opcode)) {
        int DeadCost = 0;
        for (unsigned i = 0, e = VL.size(); i < e; ++i) {
          // ... (count extracts that become dead after vectorization)
        }
        return -DeadCost;
      }
      return getGatherCost(VecTy);
    }
    case Instruction::ZExt:
    case Instruction::SExt:
    case Instruction::FPToUI:
    case Instruction::FPToSI:
    case Instruction::FPExt:
    case Instruction::PtrToInt:
    case Instruction::IntToPtr:
    case Instruction::SIToFP:
    case Instruction::UIToFP:
    case Instruction::Trunc:
    case Instruction::FPTrunc:
    case Instruction::BitCast: {
      // ... (ScalarCost = VL.size() * scalar cast cost,
      //      VecCost = one vector cast)
      return VecCost - ScalarCost;
    }
    case Instruction::FCmp:
    case Instruction::ICmp:
    case Instruction::Select: {
      // Calculate the cost of this instruction.
      // ... (compare the scalar and vector cmp/select costs)
      return VecCost - ScalarCost;
    }
    case Instruction::Add:
    case Instruction::FAdd:
    case Instruction::Sub:
    case Instruction::FSub:
    case Instruction::Mul:
    case Instruction::FMul:
    case Instruction::UDiv:
    case Instruction::SDiv:
    case Instruction::FDiv:
    case Instruction::URem:
    case Instruction::SRem:
    case Instruction::FRem:
    case Instruction::Shl:
    case Instruction::LShr:
    case Instruction::AShr:
    case Instruction::And:
    case Instruction::Or:
    case Instruction::Xor: {
      // Certain instructions can be cheaper to vectorize if they have a
      // constant second vector operand. Determine the operand kinds.
      for (unsigned i = 0; i < VL.size(); ++i) {
        // ... (check whether operand 1 is a (uniform) constant in all lanes)
      }
      int VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy, Op1VK,
                                                Op2VK, Op1VP, Op2VP);
      return VecCost - ScalarCost;
    }
    case Instruction::GetElementPtr: {
      // ... (cost of vectorizing the index computation)
      return VecCost - ScalarCost;
    }
    case Instruction::Load: {
      // Cost of wide load - cost of scalar loads.
      int VecLdCost = TTI->getMemoryOpCost(Instruction::Load,
                                           VecTy, alignment, 0);
      return VecLdCost - ScalarLdCost;
    }
    case Instruction::Store: {
      // We know that we can merge the stores. Calculate the cost.
      int VecStCost = TTI->getMemoryOpCost(Instruction::Store,
                                           VecTy, alignment, 0);
      return VecStCost - ScalarStCost;
    }
    case Instruction::Call: {
      CallInst *CI = cast<CallInst>(VL0);
      // ... (look up the intrinsic ID and argument types)

      FastMathFlags FMF;
      if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
        FMF = FPMO->getFastMathFlags();

      // ... (ScalarCallCost and VecCallCost via TTI intrinsic cost)

      DEBUG(dbgs() << "SLP: Call cost "<< VecCallCost - ScalarCallCost
            << " (" << VecCallCost  << "-" <<  ScalarCallCost << ")"
            << " for " << *CI << "\n");

      return VecCallCost - ScalarCallCost;
    }
    case Instruction::ShuffleVector: {
      // ... (sum the scalar costs over both opcodes)
      for (Value *i : VL) {
        // ...
      }
      // ... (VecCost = both vector binops plus the alternating shuffle)
      return VecCost - ScalarCost;
    }
    default:
      llvm_unreachable("Unknown instruction");
  }
}
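// Note on the cost model (illustrative): every case above returns
// "vector cost - scalar cost", so a negative number means the vector form
// is cheaper. E.g. if four scalar adds cost 4 and one <4 x i32> add costs
// 1, getEntryCost() returns 1 - 4 = -3 for that bundle.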
bool BoUpSLP::isFullyVectorizableTinyTree() {
  DEBUG(dbgs() << "SLP: Check whether the tree with height " <<
        VectorizableTree.size() << " is fully vectorizable .\n");

  // We only handle trees of heights 1 and 2.
  if (VectorizableTree.size() == 1 && !VectorizableTree[0].NeedToGather)
    return true;

  if (VectorizableTree.size() != 2)
    return false;

  // Handle splat and all-constants stores.
  if (!VectorizableTree[0].NeedToGather &&
      (allConstant(VectorizableTree[1].Scalars) ||
       isSplat(VectorizableTree[1].Scalars)))
    return true;

  // Gathering cost would be too much for tiny trees.
  if (VectorizableTree[0].NeedToGather || VectorizableTree[1].NeedToGather)
    return false;

  return true;
}

bool BoUpSLP::isTreeTinyAndNotFullyVectorizable() {
  // We can vectorize the tree if its size is greater than or equal to the
  // minimum size specified by the MinTreeSize command line option.
  if (VectorizableTree.size() >= MinTreeSize)
    return false;

  // If we have a tiny tree (a tree whose size is less than MinTreeSize), we
  // can vectorize it if we can prove it fully vectorizable.
  if (isFullyVectorizableTinyTree())
    return false;

  assert(VectorizableTree.empty()
             ? ExternalUses.empty()
             : true && "We shouldn't have any external users");

  // Otherwise, we can't vectorize the tree. It is both tiny and not fully
  // vectorizable.
  return true;
}
int BoUpSLP::getSpillCost() {
  // ...
  int Cost = 0;
  unsigned BundleWidth = VectorizableTree.front().Scalars.size();

  SmallPtrSet<Instruction*, 4> LiveValues;
  Instruction *PrevInst = nullptr;

  for (const auto &N : VectorizableTree) {
    Instruction *Inst = dyn_cast<Instruction>(N.Scalars[0]);
    if (!Inst)
      continue;

    if (!PrevInst) {
      PrevInst = Inst;
      continue;
    }

    // Update LiveValues.
    LiveValues.erase(PrevInst);
    for (auto &J : PrevInst->operands()) {
      if (isa<Instruction>(&*J) && ScalarToTreeEntry.count(&*J))
        LiveValues.insert(cast<Instruction>(&*J));
    }

    DEBUG(
      dbgs() << "SLP: #LV: " << LiveValues.size();
      for (auto *X : LiveValues)
        dbgs() << " " << X->getName();
      dbgs() << ", Looking at ";
      Inst->dump();
      );

    // Now find the sequence of instructions between PrevInst and Inst.
    // ...
    while (InstIt != PrevInstIt) {
      // ...

      // Debug information does not impact spill cost; a call in the sequence
      // forces the live values to be kept across it.
      if (isa<CallInst>(&*PrevInstIt) && &*PrevInstIt != PrevInst) {
        SmallVector<Type*, 4> V;
        for (auto *II : LiveValues)
          V.push_back(VectorType::get(II->getType(), BundleWidth));
        Cost += TTI->getCostOfKeepingLiveOverCall(V);
      }

      ++PrevInstIt;
    }

    PrevInst = Inst;
  }

  return Cost;
}
int BoUpSLP::getTreeCost() {
  int Cost = 0;
  DEBUG(dbgs() << "SLP: Calculating cost for tree of size " <<
        VectorizableTree.size() << ".\n");

  unsigned BundleWidth = VectorizableTree[0].Scalars.size();

  for (TreeEntry &TE : VectorizableTree) {
    int C = getEntryCost(&TE);
    DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle that starts with "
                 << *TE.Scalars[0] << ".\n");
    Cost += C;
  }

  SmallSet<Value *, 16> ExtractCostCalculated;
  int ExtractCost = 0;
  for (ExternalUser &EU : ExternalUses) {
    // We only add extract cost once for the same scalar.
    if (!ExtractCostCalculated.insert(EU.Scalar).second)
      continue;

    // Uses by ephemeral values are free (because the ephemeral value will be
    // removed prior to code generation, and so the extraction will be
    // removed as well).
    if (EphValues.count(EU.User))
      continue;

    // If we plan to rewrite the tree in a smaller type, we will need to
    // extend the extracted value back to its original type. Account for the
    // extract plus the extension if needed.
    auto *ScalarRoot = VectorizableTree[0].Scalars[0];
    if (MinBWs.count(ScalarRoot)) {
      auto Extend =
          MinBWs[ScalarRoot].second ? Instruction::SExt : Instruction::ZExt;
      // ... (use TTI's extract-with-extend cost)
    } else {
      // ... (plain extractelement cost for EU.Lane)
    }
  }

  int SpillCost = getSpillCost();
  Cost += SpillCost + ExtractCost;

  DEBUG(dbgs() << "SLP: Spill Cost = " << SpillCost << ".\n"
               << "SLP: Extract Cost = " << ExtractCost << ".\n"
               << "SLP: Total Cost = " << Cost << ".\n");
  return Cost;
}
int BoUpSLP::getGatherCost(Type *Ty) {
  int Cost = 0;
  for (unsigned i = 0, e = cast<VectorType>(Ty)->getNumElements(); i < e; ++i)
    Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i);
  return Cost;
}

int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) {
  // Find the type of the operands in VL.
  Type *ScalarTy = VL[0]->getType();
  if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
    ScalarTy = SI->getValueOperand()->getType();
  VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
  // Find the cost of inserting/extracting values from the vector.
  return getGatherCost(VecTy);
}
// Reorder commutative operations in alternate shuffle if the resulting
// vectors are consecutive loads. This would allow us to vectorize the tree.
void BoUpSLP::reorderAltShuffleOperands(ArrayRef<Value *> VL,
                                        SmallVectorImpl<Value *> &Left,
                                        SmallVectorImpl<Value *> &Right) {
  // Push left and right operands of binary operation into Left and Right.
  for (Value *i : VL) {
    Left.push_back(cast<Instruction>(i)->getOperand(0));
    Right.push_back(cast<Instruction>(i)->getOperand(1));
  }

  // Reorder if we have a commutative operation and consecutive access
  // are on either side of the alternate instructions.
  for (unsigned j = 0; j < VL.size() - 1; ++j) {
    if (LoadInst *L = dyn_cast<LoadInst>(Left[j])) {
      if (LoadInst *L1 = dyn_cast<LoadInst>(Right[j + 1])) {
        Instruction *VL1 = cast<Instruction>(VL[j]);
        Instruction *VL2 = cast<Instruction>(VL[j + 1]);
        if (VL1->isCommutative() && isConsecutiveAccess(L, L1, *DL, *SE)) {
          std::swap(Left[j], Right[j]);
          continue;
        } else if (VL2->isCommutative() &&
                   isConsecutiveAccess(L, L1, *DL, *SE)) {
          std::swap(Left[j + 1], Right[j + 1]);
          continue;
        }
        // else unchanged
      }
    }
    if (LoadInst *L = dyn_cast<LoadInst>(Right[j])) {
      if (LoadInst *L1 = dyn_cast<LoadInst>(Left[j + 1])) {
        Instruction *VL1 = cast<Instruction>(VL[j]);
        Instruction *VL2 = cast<Instruction>(VL[j + 1]);
        if (VL1->isCommutative() && isConsecutiveAccess(L, L1, *DL, *SE)) {
          std::swap(Left[j], Right[j]);
          continue;
        } else if (VL2->isCommutative() &&
                   isConsecutiveAccess(L, L1, *DL, *SE)) {
          std::swap(Left[j + 1], Right[j + 1]);
          continue;
        }
        // else unchanged
      }
    }
  }
}
else if (VL2->isCommutative() &&
2036 bool AllSameOpcodeLeft,
2037 bool AllSameOpcodeRight,
bool SplatLeft,
2043 if (VRight == Right[i - 1])
2046 if (VLeft == Right[i - 1]) {
2050 if (SplatLeft && VLeft == Left[i - 1])
2057 if (VLeft == Left[i - 1])
2060 if (VRight == Left[i - 1])
2069 if (AllSameOpcodeRight) {
2070 unsigned RightPrevOpcode = cast<Instruction>(Right[i - 1])->getOpcode();
2071 if (IRight && RightPrevOpcode == IRight->getOpcode())
2074 if (ILeft && RightPrevOpcode == ILeft->
getOpcode()) {
2079 if (AllSameOpcodeLeft && ILeft &&
2086 if (AllSameOpcodeLeft) {
2087 unsigned LeftPrevOpcode = cast<Instruction>(Left[i - 1])->getOpcode();
2088 if (ILeft && LeftPrevOpcode == ILeft->
getOpcode())
2090 if (IRight && LeftPrevOpcode == IRight->getOpcode())
2103 auto VLeft = cast<Instruction>(VL[0])->getOperand(0);
2104 auto VRight = cast<Instruction>(VL[0])->getOperand(1);
2105 if (!isa<Instruction>(VRight) && isa<Instruction>(VLeft))
2113 bool AllSameOpcodeLeft = isa<Instruction>(Left[0]);
2114 bool AllSameOpcodeRight = isa<Instruction>(Right[0]);
2116 bool SplatLeft =
true;
2117 bool SplatRight =
true;
2119 for (
unsigned i = 1, e = VL.
size(); i != e; ++
i) {
2125 AllSameOpcodeRight, SplatLeft, SplatRight)) {
2133 SplatRight = SplatRight && (Right[i - 1] == Right[
i]);
2134 SplatLeft = SplatLeft && (Left[i - 1] == Left[
i]);
2135 AllSameOpcodeLeft = AllSameOpcodeLeft && isa<Instruction>(Left[
i]) &&
2136 (cast<Instruction>(Left[i - 1])->getOpcode() ==
2137 cast<Instruction>(Left[
i])->getOpcode());
2138 AllSameOpcodeRight = AllSameOpcodeRight && isa<Instruction>(Right[
i]) &&
2139 (cast<Instruction>(Right[i - 1])->getOpcode() ==
2140 cast<Instruction>(Right[
i])->getOpcode());
2144 if (SplatRight || SplatLeft)
2162 for (
unsigned j = 0; j < VL.
size() - 1; ++j) {
2163 if (
LoadInst *L = dyn_cast<LoadInst>(Left[j])) {
2164 if (
LoadInst *L1 = dyn_cast<LoadInst>(Right[j + 1])) {
2171 if (
LoadInst *L = dyn_cast<LoadInst>(Right[j])) {
2172 if (
LoadInst *L1 = dyn_cast<LoadInst>(Left[j + 1])) {
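// Illustrative example (not from the original source): for the commutative
// bundle {a + b, b + a}, lane 1 is commuted by shouldReorderOperands() so
// that Left = {a, a} and Right = {b, b}; each splat side then vectorizes to
// a broadcast instead of a two-element gather.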
void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL) {
  // Get the basic block this bundle is in. All instructions in the bundle
  // should be in this block.
  auto *Front = cast<Instruction>(VL.front());
  auto *BB = Front->getParent();
  assert(all_of(make_range(VL.begin(), VL.end()), [&](Value *V) -> bool {
    return cast<Instruction>(V)->getParent() == BB;
  }));

  // The last instruction in the bundle in program order.
  Instruction *LastInst = nullptr;

  // Find the last instruction. The common case should be that BB has been
  // scheduled, and the last instruction is VL.back(). So we start with
  // VL.back() and iterate over schedule data until we reach the end of the
  // bundle. The end of the bundle is marked by null ScheduleData.
  if (BlocksSchedules.count(BB)) {
    auto *Bundle = BlocksSchedules[BB]->getScheduleData(VL.back());
    if (Bundle && Bundle->isPartOfBundle())
      for (; Bundle; Bundle = Bundle->NextInBundle)
        LastInst = Bundle->Inst;
  }
  // If the block has not been scheduled, find the last instruction by brute
  // force: scan forward from Front until every bundle member has been seen.
  if (!LastInst) {
    SmallPtrSet<Value *, 16> Bundle(VL.begin(), VL.end());
    for (auto &I : make_range(BasicBlock::iterator(Front), BB->end())) {
      if (Bundle.erase(&I))
        LastInst = &I;
      if (Bundle.empty())
        break;
    }
  }

  // Set the insertion point after the last instruction in the bundle.
  Builder.SetInsertPoint(BB, ++LastInst->getIterator());
  Builder.SetCurrentDebugLocation(Front->getDebugLoc());
}

Value *BoUpSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) {
  Value *Vec = UndefValue::get(Ty);
  // Generate the 'InsertElement' instruction.
  for (unsigned i = 0; i < Ty->getNumElements(); ++i) {
    Vec = Builder.CreateInsertElement(Vec, VL[i], Builder.getInt32(i));
    if (Instruction *Insrt = dyn_cast<Instruction>(Vec)) {
      GatherSeq.insert(Insrt);
      CSEBlocks.insert(Insrt->getParent());

      // Add to our 'need-to-extract' list.
      if (ScalarToTreeEntry.count(VL[i])) {
        int Idx = ScalarToTreeEntry[VL[i]];
        TreeEntry *E = &VectorizableTree[Idx];
        // Find which lane we need to extract.
        int FoundLane = -1;
        for (unsigned Lane = 0, LE = VL.size(); Lane != LE; ++Lane) {
          // Is this the lane of the scalar that we are looking for?
          if (E->Scalars[Lane] == VL[i]) {
            FoundLane = Lane;
            break;
          }
        }
        assert(FoundLane >= 0 && "Could not find the correct lane");
        ExternalUses.push_back(ExternalUser(VL[i], Insrt, FoundLane));
      }
    }
  }

  return Vec;
}

Value *BoUpSLP::alreadyVectorized(ArrayRef<Value *> VL) const {
  SmallDenseMap<Value *, int>::const_iterator Entry
    = ScalarToTreeEntry.find(VL[0]);
  if (Entry != ScalarToTreeEntry.end()) {
    int Idx = Entry->second;
    const TreeEntry *En = &VectorizableTree[Idx];
    if (En->isSame(VL) && En->VectorizedValue)
      return En->VectorizedValue;
  }
  return nullptr;
}
Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
  if (ScalarToTreeEntry.count(VL[0])) {
    int Idx = ScalarToTreeEntry[VL[0]];
    TreeEntry *E = &VectorizableTree[Idx];
    if (E->isSame(VL))
      return vectorizeTree(E);
  }

  Type *ScalarTy = VL[0]->getType();
  if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
    ScalarTy = SI->getValueOperand()->getType();
  VectorType *VecTy = VectorType::get(ScalarTy, VL.size());

  return Gather(VL, VecTy);
}

Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
  IRBuilder<>::InsertPointGuard Guard(Builder);

  if (E->VectorizedValue) {
    DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
    return E->VectorizedValue;
  }

  Instruction *VL0 = cast<Instruction>(E->Scalars[0]);
  Type *ScalarTy = VL0->getType();
  if (StoreInst *SI = dyn_cast<StoreInst>(VL0))
    ScalarTy = SI->getValueOperand()->getType();
  VectorType *VecTy = VectorType::get(ScalarTy, E->Scalars.size());

  if (E->NeedToGather) {
    setInsertPointAfterBundle(E->Scalars);
    auto *V = Gather(E->Scalars, VecTy);
    E->VectorizedValue = V;
    return V;
  }

  unsigned Opcode = getSameOpcode(E->Scalars);

  switch (Opcode) {
    case Instruction::PHI: {
      PHINode *PH = dyn_cast<PHINode>(VL0);
      Builder.SetInsertPoint(PH->getParent()->getFirstNonPHI());
      Builder.SetCurrentDebugLocation(PH->getDebugLoc());
      PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
      E->VectorizedValue = NewPhi;

      // PHINodes may have multiple entries from the same block. We want to
      // visit every block once.
      SmallSet<BasicBlock *, 4> VisitedBBs;

      for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
        ValueList Operands;
        BasicBlock *IBB = PH->getIncomingBlock(i);

        if (!VisitedBBs.insert(IBB).second) {
          NewPhi->addIncoming(NewPhi->getIncomingValueForBlock(IBB), IBB);
          continue;
        }

        // Prepare the operand vector.
        for (Value *V : E->Scalars)
          Operands.push_back(cast<PHINode>(V)->getIncomingValueForBlock(IBB));

        Builder.SetInsertPoint(IBB->getTerminator());
        Builder.SetCurrentDebugLocation(PH->getDebugLoc());
        Value *Vec = vectorizeTree(Operands);
        NewPhi->addIncoming(Vec, IBB);
      }

      assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
             "Invalid number of incoming values");
      return NewPhi;
    }
    case Instruction::ExtractElement: {
      if (canReuseExtract(E->Scalars, Instruction::ExtractElement)) {
        Value *V = VL0->getOperand(0);
        E->VectorizedValue = V;
        return V;
      }
      setInsertPointAfterBundle(E->Scalars);
      auto *V = Gather(E->Scalars, VecTy);
      E->VectorizedValue = V;
      return V;
    }
    case Instruction::ExtractValue: {
      if (canReuseExtract(E->Scalars, Instruction::ExtractValue)) {
        LoadInst *LI = cast<LoadInst>(VL0->getOperand(0));
        Builder.SetInsertPoint(LI);
        PointerType *PtrTy =
            PointerType::get(VecTy, LI->getPointerAddressSpace());
        Value *Ptr = Builder.CreateBitCast(LI->getOperand(0), PtrTy);
        LoadInst *V = Builder.CreateAlignedLoad(Ptr, LI->getAlignment());
        E->VectorizedValue = V;
        return propagateMetadata(V, E->Scalars);
      }
      setInsertPointAfterBundle(E->Scalars);
      auto *V = Gather(E->Scalars, VecTy);
      E->VectorizedValue = V;
      return V;
    }
    case Instruction::ZExt:
    case Instruction::SExt:
    case Instruction::FPToUI:
    case Instruction::FPToSI:
    case Instruction::FPExt:
    case Instruction::PtrToInt:
    case Instruction::IntToPtr:
    case Instruction::SIToFP:
    case Instruction::UIToFP:
    case Instruction::Trunc:
    case Instruction::FPTrunc:
    case Instruction::BitCast: {
      ValueList INVL;
      for (Value *V : E->Scalars)
        INVL.push_back(cast<Instruction>(V)->getOperand(0));

      setInsertPointAfterBundle(E->Scalars);

      Value *InVec = vectorizeTree(INVL);

      if (Value *V = alreadyVectorized(E->Scalars))
        return V;

      CastInst *CI = dyn_cast<CastInst>(VL0);
      Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy);
      E->VectorizedValue = V;
      ++NumVectorInstructions;
      return V;
    }
    case Instruction::FCmp:
    case Instruction::ICmp: {
      ValueList LHSV, RHSV;
      for (Value *V : E->Scalars) {
        LHSV.push_back(cast<Instruction>(V)->getOperand(0));
        RHSV.push_back(cast<Instruction>(V)->getOperand(1));
      }

      setInsertPointAfterBundle(E->Scalars);

      Value *L = vectorizeTree(LHSV);
      Value *R = vectorizeTree(RHSV);

      if (Value *V = alreadyVectorized(E->Scalars))
        return V;

      CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
      Value *V;
      if (Opcode == Instruction::FCmp)
        V = Builder.CreateFCmp(P0, L, R);
      else
        V = Builder.CreateICmp(P0, L, R);

      E->VectorizedValue = V;
      ++NumVectorInstructions;
      return V;
    }
    case Instruction::Select: {
      ValueList TrueVec, FalseVec, CondVec;
      for (Value *V : E->Scalars) {
        CondVec.push_back(cast<Instruction>(V)->getOperand(0));
        TrueVec.push_back(cast<Instruction>(V)->getOperand(1));
        FalseVec.push_back(cast<Instruction>(V)->getOperand(2));
      }

      setInsertPointAfterBundle(E->Scalars);

      Value *Cond = vectorizeTree(CondVec);
      Value *True = vectorizeTree(TrueVec);
      Value *False = vectorizeTree(FalseVec);

      if (Value *V = alreadyVectorized(E->Scalars))
        return V;

      Value *V = Builder.CreateSelect(Cond, True, False);
      E->VectorizedValue = V;
      ++NumVectorInstructions;
      return V;
    }
    case Instruction::Add:
    case Instruction::FAdd:
    case Instruction::Sub:
    case Instruction::FSub:
    case Instruction::Mul:
    case Instruction::FMul:
    case Instruction::UDiv:
    case Instruction::SDiv:
    case Instruction::FDiv:
    case Instruction::URem:
    case Instruction::SRem:
    case Instruction::FRem:
    case Instruction::Shl:
    case Instruction::LShr:
    case Instruction::AShr:
    case Instruction::And:
    case Instruction::Or:
    case Instruction::Xor: {
      ValueList LHSVL, RHSVL;
      if (isa<BinaryOperator>(VL0) && VL0->isCommutative())
        reorderInputsAccordingToOpcode(E->Scalars, LHSVL, RHSVL);
      else
        for (Value *V : E->Scalars) {
          LHSVL.push_back(cast<Instruction>(V)->getOperand(0));
          RHSVL.push_back(cast<Instruction>(V)->getOperand(1));
        }

      setInsertPointAfterBundle(E->Scalars);

      Value *LHS = vectorizeTree(LHSVL);
      Value *RHS = vectorizeTree(RHSVL);

      if (Value *V = alreadyVectorized(E->Scalars))
        return V;

      BinaryOperator *BinOp = cast<BinaryOperator>(VL0);
      Value *V = Builder.CreateBinOp(BinOp->getOpcode(), LHS, RHS);
      E->VectorizedValue = V;
      propagateIRFlags(E->VectorizedValue, E->Scalars);
      ++NumVectorInstructions;

      if (Instruction *I = dyn_cast<Instruction>(V))
        return propagateMetadata(I, E->Scalars);

      return V;
    }
    case Instruction::Load: {
      // Loads are inserted at the head of the tree because we don't want to
      // sink them all the way down past store instructions.
      setInsertPointAfterBundle(E->Scalars);

      LoadInst *LI = cast<LoadInst>(VL0);
      // ... (bitcast the scalar pointer to a vector pointer, create the wide
      //      load, and fix up the alignment)
      E->VectorizedValue = LI;
      ++NumVectorInstructions;
      return propagateMetadata(LI, E->Scalars);
    }
    case Instruction::Store: {
      // ...
      ValueList ValueOp;
      for (Value *V : E->Scalars)
        ValueOp.push_back(cast<StoreInst>(V)->getValueOperand());

      setInsertPointAfterBundle(E->Scalars);

      // ... (vectorize the stored values, bitcast the pointer, create the
      //      wide store, and fix up the alignment)
      E->VectorizedValue = S;
      ++NumVectorInstructions;
      return propagateMetadata(S, E->Scalars);
    }
    case Instruction::GetElementPtr: {
      setInsertPointAfterBundle(E->Scalars);

      ValueList Op0VL;
      for (Value *V : E->Scalars)
        Op0VL.push_back(cast<GetElementPtrInst>(V)->getOperand(0));

      Value *Op0 = vectorizeTree(Op0VL);

      std::vector<Value *> OpVecs;
      for (int j = 1, e = cast<GetElementPtrInst>(VL0)->getNumOperands(); j < e;
           ++j) {
        ValueList OpVL;
        for (Value *V : E->Scalars)
          OpVL.push_back(cast<GetElementPtrInst>(V)->getOperand(j));

        Value *OpVec = vectorizeTree(OpVL);
        OpVecs.push_back(OpVec);
      }

      Value *V = Builder.CreateGEP(
          cast<GetElementPtrInst>(VL0)->getSourceElementType(), Op0, OpVecs);
      E->VectorizedValue = V;
      ++NumVectorInstructions;

      if (Instruction *I = dyn_cast<Instruction>(V))
        return propagateMetadata(I, E->Scalars);

      return V;
    }
    case Instruction::Call: {
      CallInst *CI = cast<CallInst>(VL0);
      setInsertPointAfterBundle(E->Scalars);
      // ... (look up the intrinsic ID for the call)
      Value *ScalarArg = nullptr;

      std::vector<Value *> OpVecs;
      for (int j = 0, e = CI->getNumArgOperands(); j < e; ++j) {
        ValueList OpVL;
        // Some intrinsics have a scalar argument - don't vectorize it.
        if (hasVectorInstrinsicScalarOpd(IID, j)) {
          CallInst *CEI = cast<CallInst>(E->Scalars[0]);
          ScalarArg = CEI->getArgOperand(j);
          OpVecs.push_back(CEI->getArgOperand(j));
          continue;
        }
        for (Value *V : E->Scalars) {
          CallInst *CEI = cast<CallInst>(V);
          OpVL.push_back(CEI->getArgOperand(j));
        }

        Value *OpVec = vectorizeTree(OpVL);
        DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n");
        OpVecs.push_back(OpVec);
      }

      // ... (create the vector intrinsic call with the gathered operands)

      // The scalar argument uses an in-tree scalar, so add the new vectorized
      // call to the ExternalUses list to make sure an extract is generated.
      if (ScalarArg && ScalarToTreeEntry.count(ScalarArg))
        ExternalUses.push_back(ExternalUser(ScalarArg, cast<User>(V), 0));

      E->VectorizedValue = V;
      propagateIRFlags(E->VectorizedValue, E->Scalars);
      ++NumVectorInstructions;
      return V;
    }
    case Instruction::ShuffleVector: {
      ValueList LHSVL, RHSVL;
      assert(isa<BinaryOperator>(VL0) && "Invalid Shuffle Vector Operand");
      reorderAltShuffleOperands(E->Scalars, LHSVL, RHSVL);
      setInsertPointAfterBundle(E->Scalars);

      Value *LHS = vectorizeTree(LHSVL);
      Value *RHS = vectorizeTree(RHSVL);

      if (Value *V = alreadyVectorized(E->Scalars))
        return V;

      // Create a vector of LHS op1 RHS.
      BinaryOperator *BinOp0 = cast<BinaryOperator>(VL0);
      Value *V0 = Builder.CreateBinOp(BinOp0->getOpcode(), LHS, RHS);

      // Create a vector of LHS op2 RHS.
      Instruction *VL1 = cast<Instruction>(E->Scalars[1]);
      BinaryOperator *BinOp1 = cast<BinaryOperator>(VL1);
      Value *V1 = Builder.CreateBinOp(BinOp1->getOpcode(), LHS, RHS);

      // Create a shuffle to take alternate lanes from the two vector
      // operations. Also gather up the odd and even scalar ops to propagate
      // IR flags to each vector operation.
      ValueList OddScalars, EvenScalars;
      unsigned e = E->Scalars.size();
      SmallVector<Constant *, 8> Mask(e);
      for (unsigned i = 0; i < e; ++i) {
        if (i & 1) {
          Mask[i] = Builder.getInt32(e + i);
          OddScalars.push_back(E->Scalars[i]);
        } else {
          Mask[i] = Builder.getInt32(i);
          EvenScalars.push_back(E->Scalars[i]);
        }
      }

      Value *ShuffleMask = ConstantVector::get(Mask);
      propagateIRFlags(V0, EvenScalars);
      propagateIRFlags(V1, OddScalars);

      Value *V = Builder.CreateShuffleVector(V0, V1, ShuffleMask);
      E->VectorizedValue = V;
      ++NumVectorInstructions;
      if (Instruction *I = dyn_cast<Instruction>(V))
        return propagateMetadata(I, E->Scalars);

      return V;
    }
    default:
      llvm_unreachable("unknown inst");
  }
  return nullptr;
}
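// Illustrative note on the ShuffleVector case above (not from the original
// source): for a 4-lane add/sub bundle, V0 holds all four adds and V1 all
// four subs, and the mask comes out as <0, 5, 2, 7> - even lanes are taken
// from V0 and odd lanes from V1, since mask indices >= e select from the
// second shuffle operand.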
Value *BoUpSLP::vectorizeTree() {
  // All blocks must be scheduled before any instructions are inserted.
  for (auto &BSIter : BlocksSchedules) {
    scheduleBlock(BSIter.second.get());
  }

  Builder.SetInsertPoint(&F->getEntryBlock().front());
  auto *VectorRoot = vectorizeTree(&VectorizableTree[0]);

  // If the vectorized tree can be rewritten in a smaller type, truncate the
  // vectorized root; InstCombine will then rewrite the entire expression. The
  // extracted values are sign-extended back below.
  auto *ScalarRoot = VectorizableTree[0].Scalars[0];
  if (MinBWs.count(ScalarRoot)) {
    if (auto *I = dyn_cast<Instruction>(VectorRoot))
      Builder.SetInsertPoint(&*++BasicBlock::iterator(I));
    auto BundleWidth = VectorizableTree[0].Scalars.size();
    auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first);
    auto *VecTy = VectorType::get(MinTy, BundleWidth);
    auto *Trunc = Builder.CreateTrunc(VectorRoot, VecTy);
    VectorizableTree[0].VectorizedValue = Trunc;
  }

  DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size() << " values .\n");

  // If necessary, sign-extend or zero-extend ScalarRoot to the larger type
  // specified by ScalarType.
  auto extend = [&](Value *ScalarRoot, Value *Ex, Type *ScalarType) {
    if (!MinBWs.count(ScalarRoot))
      return Ex;
    if (MinBWs[ScalarRoot].second)
      return Builder.CreateSExt(Ex, ScalarType);
    return Builder.CreateZExt(Ex, ScalarType);
  };
  // Extract all of the elements with external uses.
  for (const auto &ExternalUse : ExternalUses) {
    Value *Scalar = ExternalUse.Scalar;
    llvm::User *User = ExternalUse.User;

    // Skip users that we already replaced.
    // ...
    assert(ScalarToTreeEntry.count(Scalar) && "Invalid scalar");

    int Idx = ScalarToTreeEntry[Scalar];
    TreeEntry *E = &VectorizableTree[Idx];
    assert(!E->NeedToGather && "Extracting from a gather list");

    Value *Vec = E->VectorizedValue;
    assert(Vec && "Can't find vectorizable value");

    Value *Lane = Builder.getInt32(ExternalUse.Lane);
    // Generate extracts for out-of-tree users, placing the extractelement
    // next to the definition.
    if (auto *VecI = dyn_cast<Instruction>(Vec)) {
      if (PHINode *PH = dyn_cast<PHINode>(User)) {
        for (int i = 0, e = PH->getNumIncomingValues(); i != e; ++i) {
          if (PH->getIncomingValue(i) == Scalar) {
            TerminatorInst *IncomingTerminator =
                PH->getIncomingBlock(i)->getTerminator();
            if (isa<CatchSwitchInst>(IncomingTerminator)) {
              Builder.SetInsertPoint(VecI->getParent(),
                                     std::next(VecI->getIterator()));
            } else {
              Builder.SetInsertPoint(PH->getIncomingBlock(i)->getTerminator());
            }
            Value *Ex = Builder.CreateExtractElement(Vec, Lane);
            Ex = extend(ScalarRoot, Ex, Scalar->getType());
            CSEBlocks.insert(PH->getIncomingBlock(i));
            PH->setOperand(i, Ex);
          }
        }
      } else {
        Builder.SetInsertPoint(cast<Instruction>(User));
        Value *Ex = Builder.CreateExtractElement(Vec, Lane);
        Ex = extend(ScalarRoot, Ex, Scalar->getType());
        CSEBlocks.insert(cast<Instruction>(User)->getParent());
        User->replaceUsesOfWith(Scalar, Ex);
      }
    } else {
      Builder.SetInsertPoint(&F->getEntryBlock().front());
      Value *Ex = Builder.CreateExtractElement(Vec, Lane);
      Ex = extend(ScalarRoot, Ex, Scalar->getType());
      CSEBlocks.insert(&F->getEntryBlock());
      User->replaceUsesOfWith(Scalar, Ex);
    }

    DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
  }
  // For each vectorized value:
  for (TreeEntry &EIdx : VectorizableTree) {
    TreeEntry *Entry = &EIdx;

    // For each lane:
    for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
      Value *Scalar = Entry->Scalars[Lane];
      // No need to handle users of gathered values.
      if (Entry->NeedToGather)
        continue;

      assert(Entry->VectorizedValue && "Can't find vectorizable value");

      Type *Ty = Scalar->getType();
      if (!Ty->isVoidTy()) {
#ifndef NDEBUG
        for (User *U : Scalar->users()) {
          DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");

          assert((ScalarToTreeEntry.count(U) ||
                  // It is legal to replace users in the ignore list by undef.
                  is_contained(UserIgnoreList, U)) &&
                 "Replacing out-of-tree value with undef");
        }
#endif
        Value *Undef = UndefValue::get(Ty);
        Scalar->replaceAllUsesWith(Undef);
      }
      DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
      eraseInstruction(cast<Instruction>(Scalar));
    }
  }

  Builder.ClearInsertionPoint();

  return VectorizableTree[0].VectorizedValue;
}
void BoUpSLP::optimizeGatherSequence() {
  DEBUG(dbgs() << "SLP: Optimizing " << GatherSeq.size()
        << " gather sequences instructions.\n");
  // LICM insertelement instructions.
  for (Instruction *it : GatherSeq) {
    InsertElementInst *Insert = dyn_cast<InsertElementInst>(it);
    if (!Insert)
      continue;

    // ... (find the containing loop and its preheader)

    // If the vector or the element that we insert into it are instructions
    // that are defined in this basic block, then we can't hoist.
    Instruction *CurrVec = dyn_cast<Instruction>(Insert->getOperand(0));
    Instruction *NewElem = dyn_cast<Instruction>(Insert->getOperand(1));
    if (CurrVec && L->contains(CurrVec))
      continue;
    if (NewElem && L->contains(NewElem))
      continue;

    // We can hoist this instruction. Move it to the pre-header.
    Insert->moveBefore(PreHeader->getTerminator());
  }

  // Make a list of all reachable blocks in our CSE queue.
  SmallVector<const DomTreeNode *, 8> CSEWorkList;
  CSEWorkList.reserve(CSEBlocks.size());
  // ... (push the DomTreeNode of every CSE block)

  // Sort blocks by domination, so a block is visited after all blocks
  // dominating it.
  std::stable_sort(CSEWorkList.begin(), CSEWorkList.end(),
                   [this](const DomTreeNode *A, const DomTreeNode *B) {
                     return DT->properlyDominates(A, B);
                   });

  // Perform an O(N^2) search over the gather sequences and merge identical
  // instructions.
  SmallVector<Instruction *, 16> Visited;
  for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
    assert((I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
           "Worklist not sorted properly!");
    BasicBlock *BB = (*I)->getBlock();
    // For all instructions in blocks containing gather sequences:
    for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e;) {
      Instruction *In = &*it++;
      if (!isa<InsertElementInst>(In) && !isa<ExtractElementInst>(In))
        continue;

      // Check if we can replace this instruction with any of the
      // visited instructions.
      for (Instruction *v : Visited) {
        if (In->isIdenticalTo(v) &&
            DT->dominates(v->getParent(), In->getParent())) {
          In->replaceAllUsesWith(v);
          eraseInstruction(In);
          In = nullptr;
          break;
        }
      }
      if (In) {
        // ...
        Visited.push_back(In);
      }
    }
  }
  CSEBlocks.clear();
  GatherSeq.clear();
}
// Groups the instructions to a bundle (which is then a single scheduling
// entity) and schedules instructions until the bundle gets ready.
bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL,
                                                 BoUpSLP *SLP) {
  if (isa<PHINode>(VL[0]))
    return true;

  // Initialize the instruction bundle.
  Instruction *OldScheduleEnd = ScheduleEnd;
  ScheduleData *PrevInBundle = nullptr;
  ScheduleData *Bundle = nullptr;
  bool ReSchedule = false;
  DEBUG(dbgs() << "SLP:  bundle: " << *VL[0] << "\n");

  // Make sure that the scheduling region contains all
  // instructions of the bundle.
  for (Value *V : VL) {
    if (!extendSchedulingRegion(V))
      return false;
  }

  for (Value *V : VL) {
    ScheduleData *BundleMember = getScheduleData(V);
    assert(BundleMember &&
           "no ScheduleData for bundle member (maybe not in same basic block)");
    if (BundleMember->IsScheduled) {
      // A bundle member was scheduled as single instruction before and now
      // needs to be scheduled as part of the bundle. We just get rid of the
      // existing schedule.
      DEBUG(dbgs() << "SLP:  reset schedule because " << *BundleMember
                   << " was already scheduled\n");
      ReSchedule = true;
    }
    assert(BundleMember->isSchedulingEntity() &&
           "bundle member already part of other bundle");
    if (PrevInBundle) {
      PrevInBundle->NextInBundle = BundleMember;
    } else {
      Bundle = BundleMember;
    }
    BundleMember->UnscheduledDepsInBundle = 0;
    Bundle->UnscheduledDepsInBundle += BundleMember->UnscheduledDeps;

    // Group the instructions to a bundle.
    BundleMember->FirstInBundle = Bundle;
    PrevInBundle = BundleMember;
  }
  if (ScheduleEnd != OldScheduleEnd) {
    // The scheduling region got new instructions at the lower end (or it is
    // a new region for the first bundle). This makes it necessary to
    // recalculate all dependencies.
    for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
      ScheduleData *SD = getScheduleData(I);
      SD->clearDependencies();
    }
    ReSchedule = true;
  }
  if (ReSchedule) {
    resetSchedule();
    initialFillReadyList(ReadyInsts);
  }

  DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle << " in block "
               << BB->getName() << "\n");

  calculateDependencies(Bundle, true, SLP);

  // Now try to schedule the new bundle. As soon as the bundle is "ready" it
  // means that there are no cyclic dependencies and we can schedule it. Note
  // that it's important that we don't "schedule" the bundle yet (see
  // cancelScheduling).
  while (!Bundle->isReady() && !ReadyInsts.empty()) {

    ScheduleData *pickedSD = ReadyInsts.back();
    ReadyInsts.pop_back();

    if (pickedSD->isSchedulingEntity() && pickedSD->isReady()) {
      schedule(pickedSD, ReadyInsts);
    }
  }
  if (!Bundle->isReady()) {
    cancelScheduling(VL);
    return false;
  }
  return true;
}
void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL) {
  if (isa<PHINode>(VL[0]))
    return;

  ScheduleData *Bundle = getScheduleData(VL[0]);
  DEBUG(dbgs() << "SLP:  cancel scheduling of " << *Bundle << "\n");
  assert(!Bundle->IsScheduled &&
         "Can't cancel bundle which is already scheduled");
  assert(Bundle->isSchedulingEntity() && Bundle->isPartOfBundle() &&
         "tried to unbundle something which is not a bundle");

  // Un-bundle: make single instructions out of the bundle.
  ScheduleData *BundleMember = Bundle;
  while (BundleMember) {
    assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links");
    BundleMember->FirstInBundle = BundleMember;
    ScheduleData *Next = BundleMember->NextInBundle;
    BundleMember->NextInBundle = nullptr;
    BundleMember->UnscheduledDepsInBundle = BundleMember->UnscheduledDeps;
    if (BundleMember->UnscheduledDepsInBundle == 0) {
      ReadyInsts.insert(BundleMember);
    }
    BundleMember = Next;
  }
}
bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V) {
  if (getScheduleData(V))
    return true;
  Instruction *I = dyn_cast<Instruction>(V);
  assert(I && "bundle member must be an instruction");
  assert(!isa<PHINode>(I) && "phi nodes don't need to be scheduled");
  if (!ScheduleStart) {
    // It's the first instruction in the new region.
    initScheduleData(I, I->getNextNode(), nullptr, nullptr);
    ScheduleStart = I;
    ScheduleEnd = I->getNextNode();
    assert(ScheduleEnd && "tried to vectorize a TerminatorInst?");
    DEBUG(dbgs() << "SLP:  initialize schedule region to " << *I << "\n");
    return true;
  }
  // Search up and down at the same time, because we don't know if the new
  // instruction is above or below the existing scheduling region.
  // ...
  for (;;) {
    if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
      DEBUG(dbgs() << "SLP:  exceeded schedule region size limit\n");
      return false;
    }

    if (UpIter != UpperEnd) {
      if (&*UpIter == I) {
        initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
        ScheduleStart = I;
        DEBUG(dbgs() << "SLP:  extend schedule region start to " << *I
                     << "\n");
        return true;
      }
      UpIter++;
    }
    if (DownIter != LowerEnd) {
      if (&*DownIter == I) {
        initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
                         nullptr);
        ScheduleEnd = I->getNextNode();
        assert(ScheduleEnd && "tried to vectorize a TerminatorInst?");
        DEBUG(dbgs() << "SLP:  extend schedule region end to " << *I << "\n");
        return true;
      }
      DownIter++;
    }
    assert((UpIter != UpperEnd || DownIter != LowerEnd) &&
           "instruction not found in block");
  }
  return true;
}
void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
                                                Instruction *ToI,
                                                ScheduleData *PrevLoadStore,
                                                ScheduleData *NextLoadStore) {
  ScheduleData *CurrentLoadStore = PrevLoadStore;
  for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
    ScheduleData *SD = ScheduleDataMap[I];
    if (!SD) {
      // Allocate a new ScheduleData for the instruction.
      if (ChunkPos >= ChunkSize) {
        ScheduleDataChunks.push_back(
            llvm::make_unique<ScheduleData[]>(ChunkSize));
        ChunkPos = 0;
      }
      SD = &(ScheduleDataChunks.back()[ChunkPos++]);
      ScheduleDataMap[I] = SD;
      SD->Inst = I;
    }
    assert(!isInSchedulingRegion(SD) &&
           "new ScheduleData already in scheduling region");
    SD->init(SchedulingRegionID);

    if (I->mayReadOrWriteMemory()) {
      // Update the linked list of memory accessing instructions.
      if (CurrentLoadStore) {
        CurrentLoadStore->NextLoadStore = SD;
      } else {
        FirstLoadStoreInRegion = SD;
      }
      CurrentLoadStore = SD;
    }
  }
  if (NextLoadStore) {
    if (CurrentLoadStore)
      CurrentLoadStore->NextLoadStore = NextLoadStore;
  } else {
    LastLoadStoreInRegion = CurrentLoadStore;
  }
}
void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
                                                     bool InsertInReadyList,
                                                     BoUpSLP *SLP) {
  assert(SD->isSchedulingEntity());

  SmallVector<ScheduleData *, 10> WorkList;
  WorkList.push_back(SD);

  while (!WorkList.empty()) {
    ScheduleData *SD = WorkList.back();
    WorkList.pop_back();

    ScheduleData *BundleMember = SD;
    while (BundleMember) {
      assert(isInSchedulingRegion(BundleMember));
      if (!BundleMember->hasValidDependencies()) {

        DEBUG(dbgs() << "SLP:       update deps of " << *BundleMember
                     << "\n");
        BundleMember->Dependencies = 0;
        BundleMember->resetUnscheduledDeps();

        // Handle def-use chain dependencies.
        for (User *U : BundleMember->Inst->users()) {
          if (isa<Instruction>(U)) {
            ScheduleData *UseSD = getScheduleData(U);
            if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) {
              BundleMember->Dependencies++;
              ScheduleData *DestBundle = UseSD->FirstInBundle;
              if (!DestBundle->IsScheduled) {
                BundleMember->incrementUnscheduledDeps(1);
              }
              if (!DestBundle->hasValidDependencies()) {
                WorkList.push_back(DestBundle);
              }
            }
          } else {
            // A non-instruction user; be conservative. This lets the
            // instruction/bundle never be scheduled and eventually disables
            // vectorization.
            BundleMember->Dependencies++;
            BundleMember->incrementUnscheduledDeps(1);
          }
        }

        // Handle the memory dependencies.
        ScheduleData *DepDest = BundleMember->NextLoadStore;
        if (DepDest) {
          Instruction *SrcInst = BundleMember->Inst;
          MemoryLocation SrcLoc = getLocation(SrcInst, SLP->AA);
          bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
          unsigned numAliased = 0;
          unsigned DistToSrc = 1;

          while (DepDest) {
            assert(isInSchedulingRegion(DepDest));

            // ... (distance/alias-check limits bound the quadratic scan)
            if (DistToSrc >= MaxMemDepDistance ||
                ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&
                 (numAliased >= AliasedCheckLimit ||
                  SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) {
              // ...
              numAliased++;

              DepDest->MemoryDependencies.push_back(BundleMember);
              BundleMember->Dependencies++;
              ScheduleData *DestBundle = DepDest->FirstInBundle;
              if (!DestBundle->IsScheduled) {
                BundleMember->incrementUnscheduledDeps(1);
              }
              if (!DestBundle->hasValidDependencies()) {
                WorkList.push_back(DestBundle);
              }
            }
            DepDest = DepDest->NextLoadStore;

            // ... (update DistToSrc)
          }
        }
      }
      BundleMember = BundleMember->NextInBundle;
    }
    if (InsertInReadyList && SD->isReady()) {
      ReadyInsts.push_back(SD);
      DEBUG(dbgs() << "SLP:     gets ready on update: " << *SD->Inst << "\n");
    }
  }
}
void BoUpSLP::BlockScheduling::resetSchedule() {
  assert(ScheduleStart &&
         "tried to reset schedule on block which has not been scheduled");
  for (Instruction *I = ScheduleStart; I != ScheduleEnd;
       I = I->getNextNode()) {
    ScheduleData *SD = getScheduleData(I);
    assert(isInSchedulingRegion(SD));
    SD->IsScheduled = false;
    SD->resetUnscheduledDeps();
  }
  ReadyInsts.clear();
}
void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
  if (!BS->ScheduleStart)
    return;

  DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");

  BS->resetSchedule();

  // For the real scheduling we use a more sophisticated ready-list: it is
  // sorted by the original instruction location. This lets the final
  // schedule be as close as possible to the original instruction order.
  struct ScheduleDataCompare {
    bool operator()(ScheduleData *SD1, ScheduleData *SD2) {
      return SD2->SchedulingPriority < SD1->SchedulingPriority;
    }
  };
  std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;

  // Ensure that all dependency data is updated and fill the ready-list with
  // initial instructions.
  int Idx = 0;
  int NumToSchedule = 0;
  for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
       I = I->getNextNode()) {
    ScheduleData *SD = BS->getScheduleData(I);
    assert(
        SD->isPartOfBundle() == (ScalarToTreeEntry.count(SD->Inst) != 0) &&
        "scheduler and vectorizer have different opinion on what is a bundle");
    SD->FirstInBundle->SchedulingPriority = Idx++;
    if (SD->isSchedulingEntity()) {
      BS->calculateDependencies(SD, false, this);
      NumToSchedule++;
    }
  }
  BS->initialFillReadyList(ReadyInsts);

  Instruction *LastScheduledInst = BS->ScheduleEnd;

  // Do the "real" scheduling.
  while (!ReadyInsts.empty()) {
    ScheduleData *picked = *ReadyInsts.begin();
    ReadyInsts.erase(ReadyInsts.begin());

    // Move the scheduled instruction(s) to their dedicated places, if not
    // there yet.
    ScheduleData *BundleMember = picked;
    while (BundleMember) {
      Instruction *pickedInst = BundleMember->Inst;
      if (LastScheduledInst->getNextNode() != pickedInst) {
        BS->BB->getInstList().remove(pickedInst);
        BS->BB->getInstList().insert(LastScheduledInst->getIterator(),
                                     pickedInst);
      }
      LastScheduledInst = pickedInst;
      BundleMember = BundleMember->NextInBundle;
    }

    BS->schedule(picked, ReadyInsts);
    NumToSchedule--;
  }
  assert(NumToSchedule == 0 && "could not schedule all instructions");

  // Avoid duplicate scheduling of the block.
  BS->ScheduleStart = nullptr;
}
unsigned BoUpSLP::getVectorElementSize(Value *V) {
  // If V is a store, just return the width of the stored value without
  // traversing the expression tree. This is the common case.
  if (auto *Store = dyn_cast<StoreInst>(V))
    return DL->getTypeSizeInBits(Store->getValueOperand()->getType());

  // If V is not a store, traverse the expression tree to find loads that
  // feed it; the loaded type may indicate a more suitable width than V's own
  // type.
  SmallVector<Instruction *, 16> Worklist;
  SmallPtrSet<Instruction *, 16> Visited;
  if (auto *I = dyn_cast<Instruction>(V))
    Worklist.push_back(I);

  auto MaxWidth = 0u;
  auto FoundUnknownInst = false;
  while (!Worklist.empty() && !FoundUnknownInst) {
    auto *I = Worklist.pop_back_val();
    Visited.insert(I);

    // We should only be looking at scalar instructions here. If the current
    // instruction has a vector type, give up.
    auto *Ty = I->getType();
    if (isa<VectorType>(Ty))
      FoundUnknownInst = true;

    // If the current instruction is a load, update MaxWidth to reflect the
    // width of the loaded value.
    else if (isa<LoadInst>(I))
      MaxWidth = std::max<unsigned>(MaxWidth, DL->getTypeSizeInBits(Ty));

    // Otherwise, we need to visit the operands of the instruction. We only
    // handle the interesting cases from buildTree here. If an operand is an
    // instruction we haven't yet visited, we add it to the worklist.
    else if (isa<PHINode>(I) || isa<CastInst>(I) ||
             isa<GetElementPtrInst>(I) || isa<CmpInst>(I) ||
             isa<SelectInst>(I) || isa<BinaryOperator>(I)) {
      for (Use &U : I->operands())
        if (auto *J = dyn_cast<Instruction>(U.get()))
          if (!Visited.count(J))
            Worklist.push_back(J);
    }

    // If we don't yet handle the instruction, give up.
    else
      FoundUnknownInst = true;
  }

  // If we didn't encounter a memory access in the expression tree, or if we
  // gave up for some reason, just return the width of V.
  if (!MaxWidth || FoundUnknownInst)
    return DL->getTypeSizeInBits(V->getType());

  // Otherwise, return the maximum width we found.
  return MaxWidth;
}
// Determine if a value V in a vectorizable expression Expr can be demoted to
// a smaller type with a truncation.
static bool collectValuesToDemote(Value *V, SmallPtrSetImpl<Value *> &Expr,
                                  SmallVectorImpl<Value *> &ToDemote,
                                  SmallVectorImpl<Value *> &Roots) {
  // We can always demote constants.
  if (isa<Constant>(V)) {
    ToDemote.push_back(V);
    return true;
  }

  // ...
  switch (I->getOpcode()) {

  // We can always demote truncations and extensions. Since truncations can
  // seed additional demotion, we save the truncated value.
  case Instruction::Trunc:
    Roots.push_back(I->getOperand(0));
    break;
  case Instruction::ZExt:
  case Instruction::SExt:
    break;

  // We can demote certain binary operations if we can demote both of their
  // operands.
  case Instruction::Add:
  case Instruction::Sub:
  case Instruction::Mul:
    // ... (recurse into both operands)
    break;

  // ... (selects and PHIs recurse into all their candidate operands)
  case Instruction::PHI: {
    // ...
    break;
  }

  // Otherwise, conservatively give up.
  default:
    return false;
  }

  // Record the value that we can demote.
  ToDemote.push_back(V);
  return true;
}

void BoUpSLP::computeMinimumValueSizes() {
  // If there are no external uses, the expression tree must be rooted by a
  // store. We can't demote in-memory values, so there is nothing to do here.
  if (ExternalUses.empty())
    return;

  // We only attempt to truncate integer expressions.
  auto &TreeRoot = VectorizableTree[0].Scalars;
  // ... (bail out unless the root type is an IntegerType)

  // The roots should have external uses; we rely on InstCombine to rewrite
  // the expression in the narrower type, and it only rewrites single-use
  // values.
  for (auto &EU : ExternalUses)
    if (!Expr.erase(EU.Scalar))
      return;
  // ...

  // Collect the scalar values of the vectorizable expression.
  for (auto &Entry : VectorizableTree)
    Expr.insert(Entry.Scalars.begin(), Entry.Scalars.end());

  // Ensure the roots of the vectorizable tree don't form a cycle.
  for (auto *Root : TreeRoot)
    if (!Root->hasOneUse() || Expr.count(*Root->user_begin()))
      return;

  // Collect the values that we can demote.
  for (auto *Root : TreeRoot)
    if (!collectValuesToDemote(Root, Expr, ToDemote, Roots))
      return;

  // The maximum bit width required to represent all the values that can be
  // demoted without loss of precision.
  auto MaxBitWidth = 8u;

  // First check the demanded bits of the roots.
  for (auto *Root : TreeRoot) {
    auto Mask = DB->getDemandedBits(cast<Instruction>(Root));
    MaxBitWidth = std::max<unsigned>(
        Mask.getBitWidth() - Mask.countLeadingZeros(), MaxBitWidth);
  }

  // True if the roots can be zero-extended back to their original type,
  // rather than sign-extended.
  bool IsKnownPositive = true;

  // If all the bits of the roots are demanded, try a little harder to
  // compute the narrowest type.
  // ...
    IsKnownPositive = all_of(TreeRoot, [&](Value *R) {
      bool KnownZero = false;
      bool KnownOne = false;
      // ... (use known-bits analysis on R)
    });

    // ... (tighten MaxBitWidth using the number of sign bits)
    for (auto *Scalar : ToDemote) {
      // ...
      MaxBitWidth = std::max<unsigned>(NumTypeBits - NumSignBits, MaxBitWidth);
    }

    // Reserve a bit for the sign if the roots are not known positive.
    if (!IsKnownPositive)
      ++MaxBitWidth;
  // ...

  // If the maximum bit width we computed is not less than the width of the
  // roots' type, there is nothing to demote.
  if (MaxBitWidth >= TreeRootIT->getBitWidth())
    return;

  // If we can truncate the root, we must collect additional values that
  // might be demoted as a result.
  while (!Roots.empty())
    collectValuesToDemote(Roots.pop_back_val(), Expr, ToDemote, Roots);

  // Finally, map the values we can demote to the maximum bit width we
  // computed.
  for (auto *Scalar : ToDemote)
    MinBWs[Scalar] = std::make_pair(MaxBitWidth, !IsKnownPositive);
}
  bool doInitialization(Module &M) override {
    return false;
  }

  bool runOnFunction(Function &F) override {
    if (skipFunction(F))
      return false;

    auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
    auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
    auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
    auto *TLI = TLIP ? &TLIP->getTLI() : nullptr;
    auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
    auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
    auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
    auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
    auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();

    return Impl.runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB);
  }

  // ... (end of the legacy pass wrapper)

PreservedAnalyses SLPVectorizerPass::run(Function &F,
                                         FunctionAnalysisManager &AM) {
  // ... (fetch the same analyses from the analysis manager)
  bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB);
  // ... (report the preserved analyses)
}
bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
                                TargetTransformInfo *TTI_,
                                TargetLibraryInfo *TLI_, AliasAnalysis *AA_,
                                LoopInfo *LI_, DominatorTree *DT_,
                                AssumptionCache *AC_, DemandedBits *DB_) {
  // ...
  bool Changed = false;

  // ...

  // Use the bottom-up SLP vectorizer to construct chains that start with
  // store instructions.
  BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL);

  // Scan the blocks in the function.
  // ...
    collectSeedInstructions(BB);

    // Vectorize trees that end at stores.
    if (!Stores.empty()) {
      DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
                   << " underlying objects.\n");
      Changed |= vectorizeStoreChains(R);
    }

    // Vectorize trees that end at reductions.
    Changed |= vectorizeChainsInBlock(BB, R);

    // Vectorize the index computations of getelementptr instructions. This
    // is primarily intended to catch gather-like idioms ending at
    // non-consecutive loads.
    if (!GEPs.empty()) {
      DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size()
                   << " underlying objects.\n");
      Changed |= vectorizeGEPIndices(BB, R);
    }
  // ...

  if (Changed) {
    R.optimizeGatherSequence();
    // ...
  }
  return Changed;
}
/// Check that the Values in the slice of the VL array are still present in
/// the WeakVH array, i.e. have not been deleted by earlier vectorization.
static bool hasValueBeenRAUWed(ArrayRef<Value *> VL, ArrayRef<WeakVH> VH,
                               unsigned SliceBegin, unsigned SliceSize) {
  VL = VL.slice(SliceBegin, SliceSize);
  VH = VH.slice(SliceBegin, SliceSize);
  return !std::equal(VL.begin(), VL.end(), VH.begin());
}

bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain,
                                            BoUpSLP &R,
                                            unsigned VecRegSize) {
  unsigned ChainLen = Chain.size();
  DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen
        << "\n");
  unsigned Sz = R.getVectorElementSize(Chain[0]);
  unsigned VF = VecRegSize / Sz;

  if (!isPowerOf2_32(Sz) || VF < 2)
    return false;

  // Keep track of values that were deleted by vectorizing in the loop below.
  SmallVector<WeakVH, 8> TrackValues(Chain.begin(), Chain.end());

  bool Changed = false;
  // Look for profitable vectorizable trees at all offsets, starting at zero.
  for (unsigned i = 0, e = ChainLen; i < e; ++i) {
    if (i + VF > e)
      break;

    // Check that a previous iteration of this loop did not delete the Value.
    if (hasValueBeenRAUWed(Chain, TrackValues, i, VF))
      continue;

    DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << i
          << "\n");
    ArrayRef<Value *> Operands = Chain.slice(i, VF);

    R.buildTree(Operands);
    if (R.isTreeTinyAndNotFullyVectorizable())
      continue;

    R.computeMinimumValueSizes();

    int Cost = R.getTreeCost();

    DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n");
    if (Cost < -SLPCostThreshold) {
      DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n");
      R.vectorizeTree();

      // Move to the next bundle.
      i += VF - 1;
      Changed = true;
    }
  }

  return Changed;
}
bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
                                        BoUpSLP &R) {
  SetVector<StoreInst *> Heads, Tails;
  SmallDenseMap<StoreInst *, StoreInst *> ConsecutiveChain;

  // We may run into multiple chains that merge into a single chain. Mark the
  // stores that we vectorized so that we don't visit the same store twice.
  BoUpSLP::ValueSet VectorizedStores;
  bool Changed = false;

  // Do a quadratic search on all of the given stores and find
  // all of the pairs of stores that follow each other.
  SmallVector<unsigned, 16> IndexQueue;
  for (unsigned i = 0, e = Stores.size(); i < e; ++i) {
    IndexQueue.clear();
    // If a store has multiple consecutive store candidates, search according
    // to the sequence: from i+1 to e, then from i-1 to 0. Pairing with an
    // immediately succeeding or preceding candidate creates the best chance
    // to find an SLP vectorization opportunity.
    unsigned j = 0;
    for (j = i + 1; j < e; ++j)
      IndexQueue.push_back(j);
    for (j = i; j > 0; --j)
      IndexQueue.push_back(j - 1);

    for (auto &k : IndexQueue) {
      if (isConsecutiveAccess(Stores[i], Stores[k], *DL, *SE)) {
        Tails.insert(Stores[k]);
        Heads.insert(Stores[i]);
        ConsecutiveChain[Stores[i]] = Stores[k];
        break;
      }
    }
  }

  // For stores that start but don't end a link in the chain:
  for (SetVector<StoreInst *>::iterator it = Heads.begin(), e = Heads.end();
       it != e; ++it) {
    if (Tails.count(*it))
      continue;

    // We found a store instruction that starts a chain. Now follow the chain
    // and try to vectorize it.
    BoUpSLP::ValueList Operands;
    StoreInst *I = *it;
    // Collect the chain into a list.
    while ((Tails.count(I) || Heads.count(I)) && !VectorizedStores.count(I)) {
      Operands.push_back(I);
      // Move to the next value in the chain.
      I = ConsecutiveChain[I];
    }

    // Try all vector register sizes from the maximum down to the minimum.
    for (unsigned Size = R.getMaxVecRegSize(); Size >= R.getMinVecRegSize();
         Size /= 2) {
      if (vectorizeStoreChain(Operands, R, Size)) {
        // Mark the vectorized stores so that we don't vectorize them again.
        VectorizedStores.insert(Operands.begin(), Operands.end());
        Changed = true;
        break;
      }
    }
  }

  return Changed;
}
void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
  // Initialize the collections. We will make a single pass over the block.
  Stores.clear();
  GEPs.clear();

  // Visit the store and getelementptr instructions in BB and organize them
  // in Stores and GEPs according to the underlying objects of their pointer
  // operands.
  for (Instruction &I : *BB) {
    // Ignore store instructions that are volatile or have a pointer operand
    // that doesn't point to a scalar type.
    if (auto *SI = dyn_cast<StoreInst>(&I)) {
      if (!SI->isSimple())
        continue;
      if (!isValidElementType(SI->getValueOperand()->getType()))
        continue;
      Stores[GetUnderlyingObject(SI->getPointerOperand(), *DL)].push_back(SI);
    }

    // Ignore getelementptr instructions that have more than one index, a
    // constant index, or a pointer operand that doesn't point to a scalar
    // type.
    else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
      auto Idx = GEP->idx_begin()->get();
      if (GEP->getNumIndices() > 1 || isa<Constant>(Idx))
        continue;
      if (!isValidElementType(Idx->getType()))
        continue;
      if (GEP->getType()->isVectorTy())
        continue;
      GEPs[GetUnderlyingObject(GEP->getPointerOperand(), *DL)].push_back(GEP);
    }
  }
}

bool SLPVectorizerPass::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) {
  if (!A || !B)
    return false;
  Value *VL[] = { A, B };
  return tryToVectorizeList(VL, R, None, true);
}
bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
                                           ArrayRef<Value *> BuildVector,
                                           bool AllowReorder) {
  if (VL.size() < 2)
    return false;

  DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = " << VL.size()
               << ".\n");

  // Check that all of the parts are scalar instructions of the same type.
  Instruction *I0 = dyn_cast<Instruction>(VL[0]);
  if (!I0)
    return false;

  unsigned Opcode0 = I0->getOpcode();

  // ... (derive MinVF and MaxVF from the element size and register sizes)

  for (Value *V : VL) {
    Type *Ty = V->getType();
    if (!isValidElementType(Ty))
      return false;
    Instruction *Inst = dyn_cast<Instruction>(V);
    if (!Inst || Inst->getOpcode() != Opcode0)
      return false;
  }

  bool Changed = false;

  // Keep track of values that were deleted by vectorizing in the loop below.
  SmallVector<WeakVH, 8> TrackValues(VL.begin(), VL.end());

  unsigned NextInst = 0, MaxInst = VL.size();
  for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF;
       VF /= 2) {
    for (unsigned I = NextInst; I < MaxInst; ++I) {
      unsigned OpsWidth = 0;

      if (I + VF > MaxInst)
        OpsWidth = MaxInst - I;
      else
        OpsWidth = VF;

      if (!isPowerOf2_32(OpsWidth) || OpsWidth < 2)
        break;

      // Check that a previous iteration of this loop did not delete the
      // Value.
      if (hasValueBeenRAUWed(VL, TrackValues, I, OpsWidth))
        continue;

      DEBUG(dbgs() << "SLP: Analyzing " << OpsWidth << " operations "
                   << "\n");
      ArrayRef<Value *> Ops = VL.slice(I, OpsWidth);

      ArrayRef<Value *> BuildVectorSlice;
      if (!BuildVector.empty())
        BuildVectorSlice = BuildVector.slice(I, OpsWidth);

      R.buildTree(Ops, BuildVectorSlice);
      // TODO: check if we can allow reordering for more cases.
      if (AllowReorder && R.shouldReorder()) {
        // At this point we only expect to get here with exactly two
        // operations.
        assert(Ops.size() == 2);
        assert(BuildVectorSlice.empty());
        Value *ReorderedOps[] = {Ops[1], Ops[0]};
        R.buildTree(ReorderedOps, None);
      }
      if (R.isTreeTinyAndNotFullyVectorizable())
        continue;

      R.computeMinimumValueSizes();
      int Cost = R.getTreeCost();

      if (Cost < -SLPCostThreshold) {
        DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
        Value *VectorizedRoot = R.vectorizeTree();

        // Reconstruct the build vector by extracting the vectorized root.
        // This handles the case where some elements of the vector are
        // undefined.
        if (!BuildVectorSlice.empty()) {
          // The insert point is the last build vector instruction. The
          // vectorized root will precede it.
          Instruction *InsertAfter =
              cast<Instruction>(BuildVectorSlice.back());
          unsigned VecIdx = 0;
          for (auto &V : BuildVectorSlice) {
            // ... (create a builder positioned after InsertAfter)
            Instruction *I = cast<Instruction>(V);
            assert(isa<InsertElementInst>(I) || isa<InsertValueInst>(I));
            Instruction *Extract =
                cast<Instruction>(Builder.CreateExtractElement(
                    VectorizedRoot, Builder.getInt32(VecIdx++)));
            I->setOperand(1, Extract);
            I->removeFromParent();
            I->insertAfter(Extract);
            InsertAfter = I;
          }
        }
        // Move to the next bundle.
        I += VF - 1;
        NextInst = I + 1;
        Changed = true;
      }
    }
  }

  return Changed;
}
bool SLPVectorizerPass::tryToVectorize(BinaryOperator *V, BoUpSLP &R) {
  if (!V)
    return false;

  Value *A = V->getOperand(0);
  Value *B = V->getOperand(1);

  // Try to vectorize V.
  if (tryToVectorizePair(A, B, R))
    return true;

  // Try to skip B.
  if (B && B->hasOneUse()) {
    BinaryOperator *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
    BinaryOperator *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
    if (tryToVectorizePair(A, B0, R)) {
      return true;
    }
    if (tryToVectorizePair(A, B1, R)) {
      return true;
    }
  }

  // Try to skip A.
  if (A && A->hasOneUse()) {
    BinaryOperator *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
    BinaryOperator *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
    if (tryToVectorizePair(A0, B, R)) {
      return true;
    }
    if (tryToVectorizePair(A1, B, R)) {
      return true;
    }
  }
  return false;
}
/// Generate a shuffle mask to be used in a reduction tree.
///
/// \param VecLen The length of the vector to be reduced.
/// \param NumEltsToRdx The number of elements that should be reduced.
/// \param IsPairwise Whether the reduction is pairwise or splitting.
/// \param IsLeft True will generate a mask of even elements, odd otherwise.
static Value *createRdxShuffleMask(unsigned VecLen, unsigned NumEltsToRdx,
                                   bool IsPairwise, bool IsLeft,
                                   IRBuilder<> &Builder) {
  assert((IsPairwise || !IsLeft) && "Don't support a <0,1,undef,...> mask");

  SmallVector<Constant *, 32> ShuffleMask(
      VecLen, UndefValue::get(Builder.getInt32Ty()));

  if (IsPairwise)
    // Build a mask of 0, 2, ... (left) or 1, 3, ... (right).
    for (unsigned i = 0; i != NumEltsToRdx; ++i)
      ShuffleMask[i] = Builder.getInt32(2 * i + !IsLeft);
  else
    // Move the upper half of the vector to the lower half.
    for (unsigned i = 0; i != NumEltsToRdx; ++i)
      ShuffleMask[i] = Builder.getInt32(NumEltsToRdx + i);

  return ConstantVector::get(ShuffleMask);
}
/// Model horizontal reductions: trees of reduction operations (currently
/// add and fadd) whose leaves can be put into a vector.
class HorizontalReduction {
  SmallVector<Value *, 16> ReductionOps;
  SmallVector<Value *, 32> ReducedVals;

  BinaryOperator *ReductionRoot;
  PHINode *ReductionPHI;

  /// The opcode of the reduction.
  unsigned ReductionOpcode;
  /// The opcode of the values we perform a reduction on.
  unsigned ReducedValueOpcode;
  /// Should we model this reduction as a pairwise reduction tree or a tree
  /// that splits the vector in halves and adds those halves.
  bool IsPairwiseReduction;

public:
  /// The width of one full horizontal reduction operation.
  unsigned ReduxWidth;

  /// Minimal width of available vector registers. Used to determine
  /// ReduxWidth.
  unsigned MinVecRegSize;

  HorizontalReduction(unsigned MinVecRegSize)
      : ReductionRoot(nullptr), ReductionOpcode(0), ReducedValueOpcode(0),
        IsPairwiseReduction(false), ReduxWidth(0),
        MinVecRegSize(MinVecRegSize) {}

  /// Try to find a reduction tree.
  bool matchAssociativeReduction(PHINode *Phi, BinaryOperator *B) {
    assert((!Phi || is_contained(Phi->operands(), B)) &&
           "Thi phi needs to use the binary operator");

    // ... (set ReductionRoot/ReductionPHI and compute ReduxWidth)
    ReducedValueOpcode = 0;
    // ...

    // We currently only support adds.
    if (ReductionOpcode != Instruction::Add &&
        ReductionOpcode != Instruction::FAdd)
      return false;

    // Post-order traverse the reduction tree starting at B.
    SmallVector<std::pair<Instruction *, unsigned>, 32> Stack;
    Stack.push_back(std::make_pair(B, 0));
    while (!Stack.empty()) {
      Instruction *TreeN = Stack.back().first;
      unsigned EdgeToVist = Stack.back().second++;
      bool IsReducedValue = TreeN->getOpcode() != ReductionOpcode;

      // Postorder visit. Visit the reduced values (leaves) last.
      if (EdgeToVist == 2 || IsReducedValue) {
        if (IsReducedValue) {
          // Make sure that the opcodes of the operations that we are going to
          // reduce match.
          if (!ReducedValueOpcode)
            ReducedValueOpcode = TreeN->getOpcode();
          else if (ReducedValueOpcode != TreeN->getOpcode())
            return false;
          ReducedVals.push_back(TreeN);
        } else {
          // We need to be able to reassociate the reduction operations.
          // ...
          ReductionOps.push_back(TreeN);
        }
        // Retract.
        Stack.pop_back();
        continue;
      }

      // Visit left or right.
      Value *NextV = TreeN->getOperand(EdgeToVist);
      Instruction *I = dyn_cast<Instruction>(NextV);
      if (I && (!ReducedValueOpcode || I->getOpcode() == ReducedValueOpcode ||
                I->getOpcode() == ReductionOpcode)) {
        // ...
        if (!ReducedValueOpcode && I->getOpcode() != ReductionOpcode)
          ReducedValueOpcode = I->getOpcode();
        Stack.push_back(std::make_pair(I, 0));
        // ...
      }
    }
    return true;
  }
  /// Attempt to vectorize the tree found by matchAssociativeReduction.
  bool tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) {
    if (ReducedVals.empty())
      return false;

    unsigned NumReducedVals = ReducedVals.size();
    if (NumReducedVals < ReduxWidth)
      return false;

    Value *VectorizedTree = nullptr;
    IRBuilder<> Builder(ReductionRoot);
    FastMathFlags Unsafe;
    Unsafe.setUnsafeAlgebra();
    Builder.setFastMathFlags(Unsafe);
    unsigned i = 0;

    for (; i < NumReducedVals - ReduxWidth + 1; i += ReduxWidth) {
      auto VL = makeArrayRef(&ReducedVals[i], ReduxWidth);
      V.buildTree(VL, ReductionOps);
      if (V.isTreeTinyAndNotFullyVectorizable())
        continue;

      V.computeMinimumValueSizes();

      // Estimate cost.
      int Cost = V.getTreeCost() + getReductionCost(TTI, ReducedVals[i]);
      if (Cost >= -SLPCostThreshold)
        break;

      DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:" << Cost
                   << ". (HorRdx)\n");

      // Vectorize a tree.
      DebugLoc Loc = cast<Instruction>(ReducedVals[i])->getDebugLoc();
      Value *VectorizedRoot = V.vectorizeTree();

      // Emit a reduction.
      Value *ReducedSubTree = emitReduction(VectorizedRoot, Builder);
      if (VectorizedTree) {
        Builder.SetCurrentDebugLocation(Loc);
        VectorizedTree = createBinOp(Builder, ReductionOpcode, VectorizedTree,
                                     ReducedSubTree, "bin.rdx");
      } else
        VectorizedTree = ReducedSubTree;
    }

    if (VectorizedTree) {
      // Finish the reduction: fold the remaining scalar values in.
      for (; i < NumReducedVals; ++i) {
        Builder.SetCurrentDebugLocation(
            cast<Instruction>(ReducedVals[i])->getDebugLoc());
        VectorizedTree = createBinOp(Builder, ReductionOpcode, VectorizedTree,
                                     ReducedVals[i]);
      }
      // Update users.
      if (ReductionPHI && !isa<UndefValue>(ReductionPHI)) {
        assert(ReductionRoot && "Need a reduction operation");
        ReductionRoot->setOperand(0, VectorizedTree);
        ReductionRoot->setOperand(1, ReductionPHI);
      } else
        ReductionRoot->replaceAllUsesWith(VectorizedTree);
    }
    return VectorizedTree != nullptr;
  }

  unsigned numReductionValues() const {
    return ReducedVals.size();
  }
private:
  /// Calculate the cost of a reduction.
  int getReductionCost(TargetTransformInfo *TTI, Value *FirstReducedVal) {
    Type *ScalarTy = FirstReducedVal->getType();
    Type *VecTy = VectorType::get(ScalarTy, ReduxWidth);

    int PairwiseRdxCost = TTI->getReductionCost(ReductionOpcode, VecTy, true);
    int SplittingRdxCost = TTI->getReductionCost(ReductionOpcode, VecTy,
                                                 false);

    IsPairwiseReduction = PairwiseRdxCost < SplittingRdxCost;
    int VecReduxCost =
        IsPairwiseReduction ? PairwiseRdxCost : SplittingRdxCost;

    int ScalarReduxCost =
        ReduxWidth * TTI->getArithmeticInstrCost(ReductionOpcode, ScalarTy);

    DEBUG(dbgs() << "SLP: Adding cost " << VecReduxCost - ScalarReduxCost
                 << " for reduction that starts with " << *FirstReducedVal
                 << " (It is a "
                 << (IsPairwiseReduction ? "pairwise" : "splitting")
                 << " reduction)\n");

    return VecReduxCost - ScalarReduxCost;
  }

  static Value *createBinOp(IRBuilder<> &Builder, unsigned Opcode, Value *L,
                            Value *R, const Twine &Name = "") {
    if (Opcode == Instruction::FAdd)
      return Builder.CreateFAdd(L, R, Name);
    return Builder.CreateBinOp((Instruction::BinaryOps)Opcode, L, R, Name);
  }

  /// Emit a horizontal reduction of the vectorized value.
  Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder) {
    assert(VectorizedValue && "Need to have a vectorized tree node");
    assert(isPowerOf2_32(ReduxWidth) &&
           "We only handle power-of-two reductions for now");

    Value *TmpVec = VectorizedValue;
    for (unsigned i = ReduxWidth / 2; i != 0; i >>= 1) {
      if (IsPairwiseReduction) {
        Value *LeftMask =
            createRdxShuffleMask(ReduxWidth, i, true, true, Builder);
        Value *RightMask =
            createRdxShuffleMask(ReduxWidth, i, true, false, Builder);

        Value *LeftShuf = Builder.CreateShuffleVector(
            TmpVec, UndefValue::get(TmpVec->getType()), LeftMask,
            "rdx.shuf.l");
        Value *RightShuf = Builder.CreateShuffleVector(
            TmpVec, UndefValue::get(TmpVec->getType()), RightMask,
            "rdx.shuf.r");
        TmpVec = createBinOp(Builder, ReductionOpcode, LeftShuf, RightShuf,
                             "bin.rdx");
      } else {
        Value *UpperHalf =
            createRdxShuffleMask(ReduxWidth, i, false, false, Builder);
        Value *Shuf = Builder.CreateShuffleVector(
            TmpVec, UndefValue::get(TmpVec->getType()), UpperHalf,
            "rdx.shuf");
        TmpVec = createBinOp(Builder, ReductionOpcode, TmpVec, Shuf,
                             "bin.rdx");
      }
    }

    // The result is in the first element of the vector.
    return Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));
  }
};
/// Recognize construction of a vector from scalars via a chain of
/// insertelement instructions starting at undef.
static bool findBuildVector(InsertElementInst *FirstInsertElem,
                            SmallVectorImpl<Value *> &BuildVector,
                            SmallVectorImpl<Value *> &BuildVectorOpds) {
  if (!isa<UndefValue>(FirstInsertElem->getOperand(0)))
    return false;
  // ... (follow the single-use insertelement chain)
}

/// Like findBuildVector, but looks for construction of an aggregate via
/// insertvalue instructions.
static bool findBuildAggregate(InsertValueInst *IV,
                               SmallVectorImpl<Value *> &BuildVector,
                               SmallVectorImpl<Value *> &BuildVectorOpds) {
  // ... (walk the insertvalue chain down to the initial value)
  if (!isa<UndefValue>(V)) {
    // ...
  }
  // ...
}

/// Try and get a reduction value from a phi node.
static Value *getReductionValue(const DominatorTree *DT, PHINode *P,
                                BasicBlock *ParentBB, LoopInfo *LI) {
  // There are situations where the reduction value is not dominated by the
  // reduction phi. Vectorizing such cases has been reported to cause
  // miscompiles. See PR25787.
  auto DominatedReduxValue = [&](Value *R) {
    return (
        dyn_cast<Instruction>(R) &&
        DT->dominates(P->getParent(), dyn_cast<Instruction>(R)->getParent()));
  };

  Value *Rdx = nullptr;

  // Return the incoming value if it comes from the same BB as the phi node.
  if (P->getIncomingBlock(0) == ParentBB) {
    Rdx = P->getIncomingValue(0);
  } else if (P->getIncomingBlock(1) == ParentBB) {
    Rdx = P->getIncomingValue(1);
  }

  if (Rdx && DominatedReduxValue(Rdx))
    return Rdx;

  // Otherwise, check whether we have a loop latch to look at.
  // ...

  if (Rdx && DominatedReduxValue(Rdx))
    return Rdx;

  return nullptr;
}

/// Attempt to match and reduce a horizontal reduction.
static bool canMatchHorizontalReduction(PHINode *P, BinaryOperator *BI,
                                        BoUpSLP &R, TargetTransformInfo *TTI,
                                        unsigned MinRegSize) {
  if (!ShouldVectorizeHor)
    return false;

  HorizontalReduction HorRdx(MinRegSize);
  if (!HorRdx.matchAssociativeReduction(P, BI))
    return false;

  // If there is a sufficient number of reduction values, reduce to a nearby
  // power-of-2. We can safely generate oversized vectors and rely on the
  // backend to split them to legal sizes.
  HorRdx.ReduxWidth =
    std::max((uint64_t)4, PowerOf2Floor(HorRdx.numReductionValues()));

  return HorRdx.tryToReduce(R, TTI);
}
bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
  bool Changed = false;
  SmallVector<Value *, 4> Incoming;
  SmallSet<Value *, 16> VisitedInstrs;

  bool HaveVectorizedPhiNodes = true;
  while (HaveVectorizedPhiNodes) {
    HaveVectorizedPhiNodes = false;

    // Collect the incoming values from the PHIs.
    // ...
      if (!VisitedInstrs.count(P))
        Incoming.push_back(P);
    // ...

    // Try to vectorize elements grouped by type.
    for (SmallVector<Value *, 4>::iterator IncIt = Incoming.begin(),
                                           E = Incoming.end();
         IncIt != E;) {

      // Look for the next elements with the same type.
      SmallVector<Value *, 4>::iterator SameTypeIt = IncIt;
      while (SameTypeIt != E &&
             (*SameTypeIt)->getType() == (*IncIt)->getType()) {
        VisitedInstrs.insert(*SameTypeIt);
        ++SameTypeIt;
      }

      // Try to vectorize them.
      unsigned NumElts = (SameTypeIt - IncIt);
      DEBUG(errs() << "SLP: Trying to vectorize starting at PHIs ("
                   << NumElts << ")\n");
      if (NumElts > 1 && tryToVectorizeList(makeArrayRef(IncIt, NumElts), R)) {
        // Success. Start over because instructions might have been changed.
        HaveVectorizedPhiNodes = true;
        Changed = true;
        break;
      }

      // Start over at the next instruction of a different type (or the end).
      IncIt = SameTypeIt;
    }
  }

  VisitedInstrs.clear();

  for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; it++) {
    // We may go through BB multiple times so skip the one we have checked.
    if (!VisitedInstrs.insert(&*it).second)
      continue;

    if (isa<DbgInfoIntrinsic>(it))
      continue;

    // Try to vectorize reductions that use PHINodes.
    if (PHINode *P = dyn_cast<PHINode>(it)) {
      // Check that the PHI is a reduction PHI.
      if (P->getNumIncomingValues() != 2)
        return Changed;

      Value *Rdx = getReductionValue(DT, P, BB, LI);

      // Check if this is a Binary Operator.
      BinaryOperator *BI = dyn_cast_or_null<BinaryOperator>(Rdx);
      if (!BI)
        continue;

      // Try to match and vectorize a horizontal reduction.
      if (canMatchHorizontalReduction(P, BI, R, TTI, R.getMinVecRegSize())) {
        Changed = true;
        it = BB->begin();
        e = BB->end();
        continue;
      }

      Value *Inst = BI->getOperand(0);
      if (Inst == P)
        Inst = BI->getOperand(1);

      if (tryToVectorize(dyn_cast<BinaryOperator>(Inst), R)) {
        // We would like to start over since some instructions are deleted
        // and the iterator may become invalid.
        Changed = true;
        it = BB->begin();
        e = BB->end();
      }

      continue;
    }

    if (ShouldStartVectorizeHorAtStore) {
      if (StoreInst *SI = dyn_cast<StoreInst>(it)) {
        // Try to match and vectorize a horizontal reduction.
        if (BinaryOperator *BinOp =
                dyn_cast<BinaryOperator>(SI->getValueOperand())) {
          if (canMatchHorizontalReduction(nullptr, BinOp, R, TTI,
                                          R.getMinVecRegSize()) ||
              tryToVectorize(BinOp, R)) {
            Changed = true;
            it = BB->begin();
            e = BB->end();
            continue;
          }
        }
      }
    }

    // Try to vectorize horizontal reductions feeding into a return.
    if (ReturnInst *RI = dyn_cast<ReturnInst>(it))
      if (RI->getNumOperands() != 0)
        if (BinaryOperator *BinOp =
                dyn_cast<BinaryOperator>(RI->getOperand(0))) {
          DEBUG(dbgs() << "SLP: Found a return to vectorize.\n");
          if (tryToVectorizePair(BinOp->getOperand(0),
                                 BinOp->getOperand(1), R)) {
            Changed = true;
            it = BB->begin();
            e = BB->end();
            continue;
          }
        }

    // Try to vectorize trees that start at compare instructions.
    if (CmpInst *CI = dyn_cast<CmpInst>(it)) {
      if (tryToVectorizePair(CI->getOperand(0), CI->getOperand(1), R)) {
        Changed = true;
        it = BB->begin();
        e = BB->end();
        continue;
      }

      for (int i = 0; i < 2; ++i) {
        if (BinaryOperator *BI =
                dyn_cast<BinaryOperator>(CI->getOperand(i))) {
          if (tryToVectorizePair(BI->getOperand(0), BI->getOperand(1), R)) {
            Changed = true;
            it = BB->begin();
            e = BB->end();
            break;
          }
        }
      }
      continue;
    }

    // Try to vectorize trees that start at insertelement instructions.
    if (InsertElementInst *FirstInsertElem =
            dyn_cast<InsertElementInst>(it)) {
      SmallVector<Value *, 16> BuildVector;
      SmallVector<Value *, 16> BuildVectorOpds;
      if (!findBuildVector(FirstInsertElem, BuildVector, BuildVectorOpds))
        continue;

      // Vectorize starting with the build vector operands, ignoring the
      // BuildVector instructions for the purpose of scheduling and user
      // extraction.
      if (tryToVectorizeList(BuildVectorOpds, R, BuildVector)) {
        Changed = true;
        it = BB->begin();
        e = BB->end();
      }

      continue;
    }

    // Try to vectorize trees that start at insertvalue instructions feeding
    // into a store.
    if (StoreInst *SI = dyn_cast<StoreInst>(it)) {
      if (InsertValueInst *LastInsertValue =
              dyn_cast<InsertValueInst>(SI->getValueOperand())) {
        const DataLayout &DL = BB->getModule()->getDataLayout();
        if (R.canMapToVector(SI->getValueOperand()->getType(), DL)) {
          SmallVector<Value *, 16> BuildVector;
          SmallVector<Value *, 16> BuildVectorOpds;
          if (!findBuildAggregate(LastInsertValue, BuildVector,
                                  BuildVectorOpds))
            continue;

          DEBUG(dbgs() << "SLP: store of array mappable to vector: " << *SI
                       << "\n");
          if (tryToVectorizeList(BuildVectorOpds, R, BuildVector, false)) {
            Changed = true;
            it = BB->begin();
            e = BB->end();
          }
          continue;
        }
      }
    }
  }

  return Changed;
}
4736 auto Changed =
false;
4737 for (
auto &Entry : GEPs) {
4741 if (Entry.second.
size() < 2)
4744 DEBUG(
dbgs() <<
"SLP: Analyzing a getelementptr list of length "
4745 << Entry.second.
size() <<
".\n");
4749 for (
unsigned BI = 0, BE = Entry.second.
size(); BI < BE; BI += 16) {
4750 auto Len = std::min<unsigned>(BE - BI, 16);
4762 Candidates.
remove(
nullptr);
4769 for (
int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++
I) {
4770 auto *GEPI = cast<GetElementPtrInst>(GEPList[
I]);
4771 if (!Candidates.count(GEPI))
4773 auto *SCEVI = SE->
getSCEV(GEPList[I]);
4774 for (
int J = I + 1; J < E && Candidates.size() > 1; ++J) {
4775 auto *GEPJ = cast<GetElementPtrInst>(GEPList[J]);
4776 auto *SCEVJ = SE->
getSCEV(GEPList[J]);
4777 if (isa<SCEVConstant>(SE->
getMinusSCEV(SCEVI, SCEVJ))) {
4778 Candidates.remove(GEPList[I]);
4779 Candidates.remove(GEPList[J]);
4780 }
else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
4781 Candidates.remove(GEPList[J]);
4788 if (Candidates.size() < 2)
4795 auto BundleIndex = 0u;
4796       for (auto *V : Candidates) {
4797         auto *GEP = cast<GetElementPtrInst>(V);
4798         auto *GEPIdx = GEP->idx_begin()->get();
4799         assert(GEP->getNumIndices() == 1 || !isa<Constant>(GEPIdx));
4800         Bundle[BundleIndex++] = GEPIdx;
4812 Changed |= tryToVectorizeList(Bundle, R);
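     // Candidate pruning above, summarized: if two GEPs' SCEVs differ by a
     // compile-time constant, vectorizing their index arithmetic gains
     // nothing, so both are dropped; a GEP whose single index is the same
     // Value as an earlier candidate's is likewise redundant. Hypothetical
     // example:
     //
     //   ... = p[a]; ... = q[b];     // 'a' and 'b' may be bundled
     //   ... = p[a]; ... = p[a + 4]; // pruned: addresses differ by a constant
     //
     // The surviving indices land in Bundle and are vectorized like any other
     // list of scalars.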
4818 bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
4819   bool Changed = false;
4821   for (StoreListMap::iterator it = Stores.begin(), e = Stores.end(); it != e;
            ++it) {
4823 if (it->second.size() < 2)
4826     DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
4827                  << it->second.size() << ".\n");
4833     for (unsigned CI = 0, CE = it->second.size(); CI < CE; CI += 16) {
4834 unsigned Len = std::min<unsigned>(CE - CI, 16);
4835       Changed |= vectorizeStores(makeArrayRef(&it->second[CI], Len), R);
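     // Each store chain is consumed in slices of at most 16 stores (Len
     // above), bounding the cost of any single vectorizeStores attempt on
     // very long chains; Changed accumulates whether any slice vectorized.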