#define SV_NAME "slp-vectorizer"
#define DEBUG_TYPE "SLP"

STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
static cl::opt<int>
    SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
                     cl::desc("Only vectorize if you gain more than this "
                              "number "));

static cl::opt<bool>
ShouldVectorizeHor("slp-vectorize-hor", cl::init(false), cl::Hidden,
                   cl::desc("Attempt to vectorize horizontal reductions"));

static cl::opt<bool> ShouldStartVectorizeHorAtStore(
    "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
    cl::desc(
        "Attempt to vectorize horizontal reductions feeding into a store"));

static cl::opt<int>
MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
    cl::desc("Attempt to vectorize for this register size in bits"));
static const unsigned MinVecRegSize = 128;

static const unsigned RecursionMaxDepth = 12;

static const unsigned AliasedCheckLimit = 10;

static const unsigned MaxMemDepDistance = 160;
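// How the two limits above interact (my summary of calculateDependencies()
// further down, not an original comment): when computing memory dependencies
// for a bundle, the scheduler walks the load/store chain away from the source
// instruction and issues at most AliasedCheckLimit expensive alias queries;
// pairs beyond that budget, or farther than MaxMemDepDistance instructions
// away, are conservatively treated as dependent, which keeps very large basic
// blocks cheap to schedule.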
static bool isValidElementType(Type *Ty) {
  return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
         !Ty->isPPC_FP128Ty();
}

  for (int i = 1, e = VL.size(); i < e; i++) {

  for (unsigned i = 0, e = VL.size(); i < e; ++i)
    if (!isa<Constant>(VL[i]))

  for (unsigned i = 1, e = VL.size(); i < e; ++i)

static unsigned getAltOpcode(unsigned Op) {
  switch (Op) {
  case Instruction::FAdd:
    return Instruction::FSub;
  case Instruction::FSub:
    return Instruction::FAdd;
  case Instruction::Add:
    return Instruction::Sub;
  case Instruction::Sub:
    return Instruction::Add;
  default:
    return 0;
  }
}
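// Example of the alternating pattern getAltOpcode() describes (a sketch, not
// taken from a regression test): a bundle {fadd, fsub, fadd, fsub} over lanes
// 0..3 can be emitted as one vector FAdd, one vector FSub, and a
// shufflevector that takes even lanes from the first result and odd lanes
// from the second, i.e. a <4 x float> shuffle with mask <0, 5, 2, 7>.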
static bool canCombineAsAltInst(unsigned Op) {
  if (Op == Instruction::FAdd || Op == Instruction::FSub ||
      Op == Instruction::Sub || Op == Instruction::Add)
    return true;
  return false;
}

  unsigned AltOpcode = getAltOpcode(Opcode);
  for (int i = 1, e = VL.size(); i < e; i++) {
    Instruction *I = dyn_cast<Instruction>(VL[i]);
    if (!I || I->getOpcode() != ((i & 1) ? AltOpcode : Opcode))
      return 0;
  }
  return Instruction::ShuffleVector;

  for (int i = 1, e = VL.size(); i < e; i++) {
    if (canCombineAsAltInst(Opcode) && i == 1)
      return isAltInst(VL);
  if (auto *VecOp = dyn_cast<BinaryOperator>(I)) {
    if (auto *Intersection = dyn_cast<BinaryOperator>(VL[0])) {
      for (int i = 1, e = VL.size(); i < e; ++i) {
        if (auto *Scalar = dyn_cast<BinaryOperator>(VL[i]))
          Intersection->andIRFlags(Scalar);
      }
      VecOp->copyIRFlags(Intersection);
    }
  }
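// In other words, the new vector instruction only keeps the IR flags (nsw,
// nuw, exact, fast-math flags) that *every* scalar in the bundle carries:
// andIRFlags() intersects them pairwise into the lane-0 scalar and the result
// is copied onto the vector op. Intersection is the conservative choice —
// keeping a flag that one lane lacked could introduce UB for that lane.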
  for (unsigned i = 0, n = Metadata.size(); i != n; ++i) {
    unsigned Kind = Metadata[i].first;
    MDNode *MD = Metadata[i].second;

    for (int i = 1, e = VL.size(); MD && i != e; i++) {
  Type *Ty = VL[0]->getType();
  for (int i = 1, e = VL.size(); i < e; i++)

  if (NElts != VL.size())

  for (unsigned i = 1, e = VL.size(); i < e; ++i) {
    if (!CI || CI->getZExtValue() != i || E->getOperand(0) != Vec)
    LoadInst *LI = cast<LoadInst>(UserInst);

    CallInst *CI = cast<CallInst>(UserInst);

  if (StoreInst *SI = dyn_cast<StoreInst>(I))
    return MemoryLocation::get(SI);
  if (LoadInst *LI = dyn_cast<LoadInst>(I))
    return MemoryLocation::get(LI);

  if (LoadInst *LI = dyn_cast<LoadInst>(I))
    return LI->isSimple();
  if (StoreInst *SI = dyn_cast<StoreInst>(I))
    return SI->isSimple();
  if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
    return !MI->isVolatile();
      : NumLoadsWantToKeepOrder(0), NumLoadsWantToChangeOrder(0), F(Func),
        SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt),
        Builder(Se->getContext()) {

  Value *vectorizeTree();

    VectorizableTree.clear();
    ScalarToTreeEntry.clear();
    ExternalUses.clear();
    NumLoadsWantToKeepOrder = 0;
    NumLoadsWantToChangeOrder = 0;
    for (auto &Iter : BlocksSchedules) {
      BlockScheduling *BS = Iter.second.get();

  void optimizeGatherSequence();

  bool shouldReorder() const {
    return NumLoadsWantToChangeOrder > NumLoadsWantToKeepOrder;
  }
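  // Reading the two counters together (my gloss): each vectorizable load
  // bundle votes either for keeping the source order or — when only the
  // reversed pair turns out to be consecutive — for reversing it; callers use
  // shouldReorder() to decide whether to rebuild the tree with the operand
  // order swapped.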
  int getEntryCost(TreeEntry *E);

  Value *vectorizeTree(TreeEntry *E);

  static unsigned getAddressSpaceOperand(Value *I);

  int getGatherCost(Type *Ty);

  bool isFullyVectorizableTinyTree();

    TreeEntry() : Scalars(), VectorizedValue(nullptr),
                  NeedToGather(0) {}

    bool isSame(ArrayRef<Value *> VL) const {
      assert(VL.size() == Scalars.size() && "Invalid size");
      return std::equal(VL.begin(), VL.end(), Scalars.begin());
    }

    Value *VectorizedValue;

    VectorizableTree.emplace_back();
    int idx = VectorizableTree.size() - 1;
    TreeEntry *Last = &VectorizableTree[idx];
    Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end());
    Last->NeedToGather = !Vectorized;
    if (Vectorized) {
      for (int i = 0, e = VL.size(); i != e; ++i) {
        assert(!ScalarToTreeEntry.count(VL[i]) && "Scalar already in tree!");
        ScalarToTreeEntry[VL[i]] = idx;
      }
    } else {
      MustGather.insert(VL.begin(), VL.end());
    }

  std::vector<TreeEntry> VectorizableTree;

  struct ExternalUser {
    ExternalUser(Value *S, llvm::User *U, int L)
        : Scalar(S), User(U), Lane(L){};
    AliasCacheKey key = std::make_pair(Inst1, Inst2);

    if (Loc1.Ptr && Loc2.Ptr && isSimple(Inst1) && isSimple(Inst2)) {
      aliased = AA->alias(Loc1, Loc2);
    }

  typedef std::pair<Instruction *, Instruction *> AliasCacheKey;

    DeletedInstructions.push_back(std::unique_ptr<Instruction>(I));

  UserList ExternalUses;
  struct ScheduleData {

    enum { InvalidDeps = -1 };

    ScheduleData()
        : Inst(nullptr), FirstInBundle(nullptr), NextInBundle(nullptr),
          NextLoadStore(nullptr), SchedulingRegionID(0), SchedulingPriority(0),
          Dependencies(InvalidDeps), UnscheduledDeps(InvalidDeps),
          UnscheduledDepsInBundle(InvalidDeps), IsScheduled(false) {}

    void init(int BlockSchedulingRegionID) {
      FirstInBundle = this;
      NextInBundle = nullptr;
      NextLoadStore = nullptr;
      SchedulingRegionID = BlockSchedulingRegionID;
      UnscheduledDepsInBundle = UnscheduledDeps;
    }

    bool hasValidDependencies() const { return Dependencies != InvalidDeps; }

    bool isSchedulingEntity() const { return FirstInBundle == this; }

    bool isPartOfBundle() const {
      return NextInBundle != nullptr || FirstInBundle != this;
    }

    bool isReady() const {
      assert(isSchedulingEntity() &&
             "can't consider non-scheduling entity for ready list");
      return UnscheduledDepsInBundle == 0 && !IsScheduled;
    }

    int incrementUnscheduledDeps(int Incr) {
      UnscheduledDeps += Incr;
      return FirstInBundle->UnscheduledDepsInBundle += Incr;
    }

    void resetUnscheduledDeps() {
      incrementUnscheduledDeps(Dependencies - UnscheduledDeps);
    }

    void clearDependencies() {
      Dependencies = InvalidDeps;
      resetUnscheduledDeps();
      MemoryDependencies.clear();
    }
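    // My reading of the counters above (not an original comment):
    // Dependencies is the total number of in-region users plus recorded
    // memory-dependent instructions for this node, UnscheduledDeps is the
    // not-yet-scheduled subset of those, and the bundle head aggregates
    // UnscheduledDepsInBundle over all bundle members — so isReady() fires
    // exactly when the whole bundle has no unscheduled dependencies left.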
      if (!isSchedulingEntity()) {
      } else if (NextInBundle) {
        ScheduleData *SD = NextInBundle;
        while (SD) {
          os << ';' << *SD->Inst;
          SD = SD->NextInBundle;
        }
      }

    ScheduleData *FirstInBundle;

    ScheduleData *NextInBundle;

    ScheduleData *NextLoadStore;

    int SchedulingRegionID;

    int SchedulingPriority;

    int UnscheduledDepsInBundle;

  friend inline raw_ostream &operator<<(raw_ostream &os,
                                        const BoUpSLP::ScheduleData &SD);
  struct BlockScheduling {

    BlockScheduling(BasicBlock *BB)
        : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize),
          ScheduleStart(nullptr), ScheduleEnd(nullptr),
          FirstLoadStoreInRegion(nullptr), LastLoadStoreInRegion(nullptr),
          SchedulingRegionID(1) {}

      ScheduleStart = nullptr;
      ScheduleEnd = nullptr;
      FirstLoadStoreInRegion = nullptr;
      LastLoadStoreInRegion = nullptr;

      ++SchedulingRegionID;

    ScheduleData *getScheduleData(Value *V) {
      ScheduleData *SD = ScheduleDataMap[V];
      if (SD && SD->SchedulingRegionID == SchedulingRegionID)
        return SD;
      return nullptr;
    }

    bool isInSchedulingRegion(ScheduleData *SD) {
      return SD->SchedulingRegionID == SchedulingRegionID;
    }
    template <typename ReadyListType>
    void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
      SD->IsScheduled = true;
      DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");

      ScheduleData *BundleMember = SD;
      while (BundleMember) {
        for (Use &U : BundleMember->Inst->operands()) {
          ScheduleData *OpDef = getScheduleData(U.get());
          if (OpDef && OpDef->hasValidDependencies() &&
              OpDef->incrementUnscheduledDeps(-1) == 0) {
            ScheduleData *DepBundle = OpDef->FirstInBundle;
            assert(!DepBundle->IsScheduled &&
                   "already scheduled bundle gets ready");
            ReadyList.insert(DepBundle);
            DEBUG(dbgs() << "SLP: gets ready (def): " << *DepBundle << "\n");
          }
        }
        for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
          if (MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
            ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
            assert(!DepBundle->IsScheduled &&
                   "already scheduled bundle gets ready");
            ReadyList.insert(DepBundle);
            DEBUG(dbgs() << "SLP: gets ready (mem): " << *DepBundle << "\n");
          }
        }
        BundleMember = BundleMember->NextInBundle;
      }
    }
    template <typename ReadyListType>
    void initialFillReadyList(ReadyListType &ReadyList) {
      for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
        ScheduleData *SD = getScheduleData(I);
        if (SD->isSchedulingEntity() && SD->isReady()) {
          ReadyList.insert(SD);
          DEBUG(dbgs() << "SLP: initially in ready list: " << *I << "\n");
        }
      }
    }
    void extendSchedulingRegion(Value *V);

    void initScheduleData(Instruction *FromI, Instruction *ToI,
                          ScheduleData *PrevLoadStore,
                          ScheduleData *NextLoadStore);

    void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,
                               BoUpSLP *SLP);

    void resetSchedule();

    std::vector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;

    struct ReadyList : SmallVector<ScheduleData *, 8> {
      void insert(ScheduleData *SD) { push_back(SD); }
    };

    ReadyList ReadyInsts;

    ScheduleData *FirstLoadStoreInRegion;

    ScheduleData *LastLoadStoreInRegion;

    int SchedulingRegionID;

  void scheduleBlock(BlockScheduling *BS);

  int NumLoadsWantToKeepOrder;

  int NumLoadsWantToChangeOrder;
  UserIgnoreList = UserIgnoreLst;
  if (!getSameType(Roots))
    return;
  buildTree_rec(Roots, 0);

  for (int EIdx = 0, EE = VectorizableTree.size(); EIdx < EE; ++EIdx) {
    TreeEntry *Entry = &VectorizableTree[EIdx];

    for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
      Value *Scalar = Entry->Scalars[Lane];

      if (Entry->NeedToGather)
        continue;

        DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");

        if (ScalarToTreeEntry.count(U)) {
          int Idx = ScalarToTreeEntry[U];
          TreeEntry *UseEntry = &VectorizableTree[Idx];
          Value *UseScalar = UseEntry->Scalars[0];

          if (UseScalar != U ||
              !InTreeUserNeedToExtract(Scalar, UserInst, TLI)) {
            DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
                         << ".\n");
            assert(!VectorizableTree[Idx].NeedToGather && "Bad state");
            continue;
          }
        }

        if (std::find(UserIgnoreList.begin(), UserIgnoreList.end(), UserInst) !=
            UserIgnoreList.end())
          continue;

        DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane " <<
              Lane << " from " << *Scalar << ".\n");
        ExternalUses.push_back(ExternalUser(Scalar, U, Lane));
  bool SameTy = getSameType(VL); (void)SameTy;
  bool isAltShuffle = false;
  assert(SameTy && "Invalid types!");

  if (Depth == RecursionMaxDepth) {
    DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
    newTreeEntry(VL, false);
    return;
  }

  if (VL[0]->getType()->isVectorTy()) {
    DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
    newTreeEntry(VL, false);
    return;
  }

  if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
    if (SI->getValueOperand()->getType()->isVectorTy()) {
      DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
      newTreeEntry(VL, false);
      return;
    }

  unsigned Opcode = getSameOpcode(VL);

  if (Opcode == Instruction::ShuffleVector) {
    unsigned Op = cast<Instruction>(VL[0])->getOpcode();
    if (Op != Instruction::ShuffleVector)
      isAltShuffle = true;
  }

  if (allConstant(VL) || isSplat(VL) || !getSameBlock(VL) || !Opcode) {
    DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n");
    newTreeEntry(VL, false);
    return;
  }
  for (unsigned i = 0, e = VL.size(); i != e; ++i) {
    if (EphValues.count(VL[i])) {
      DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
            ") is ephemeral.\n");
      newTreeEntry(VL, false);
      return;
    }
  }

  if (ScalarToTreeEntry.count(VL[0])) {
    int Idx = ScalarToTreeEntry[VL[0]];
    TreeEntry *E = &VectorizableTree[Idx];
    for (unsigned i = 0, e = VL.size(); i != e; ++i) {
      DEBUG(dbgs() << "SLP: \tChecking bundle: " << *VL[i] << ".\n");
      if (E->Scalars[i] != VL[i]) {
        DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
        newTreeEntry(VL, false);
        return;
      }
    }
    DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *VL[0] << ".\n");
    return;
  }

  for (unsigned i = 0, e = VL.size(); i != e; ++i) {
    if (ScalarToTreeEntry.count(VL[i])) {
      DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
            ") is already in tree.\n");
      newTreeEntry(VL, false);
      return;
    }
  }

  for (unsigned i = 0, e = VL.size(); i != e; ++i) {
    if (MustGather.count(VL[i])) {
      DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
      newTreeEntry(VL, false);
      return;
    }
  }

  if (!DT->isReachableFromEntry(BB)) {
    DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
    newTreeEntry(VL, false);
    return;
  }

  for (unsigned i = 0, e = VL.size(); i < e; ++i)
    for (unsigned j = i+1; j < e; ++j)
      if (VL[i] == VL[j]) {
        DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
        newTreeEntry(VL, false);
        return;
      }
  auto &BSRef = BlocksSchedules[BB];
  if (!BSRef) {
    BSRef = llvm::make_unique<BlockScheduling>(BB);
  }
  BlockScheduling &BS = *BSRef.get();

  if (!BS.tryScheduleBundle(VL, this)) {
    DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
    BS.cancelScheduling(VL);
    newTreeEntry(VL, false);
    return;
  }
  DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
      for (unsigned j = 0; j < VL.size(); ++j)
        for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i)
          if (isa<TerminatorInst>(
                  cast<PHINode>(VL[j])->getIncomingValueForBlock(
                      PH->getIncomingBlock(i)))) {
            DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (TerminatorInst use).\n");
            BS.cancelScheduling(VL);
            newTreeEntry(VL, false);
            return;
          }

      newTreeEntry(VL, true);
      DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");

      for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
        ValueList Operands;
        for (unsigned j = 0; j < VL.size(); ++j)
          Operands.push_back(cast<PHINode>(VL[j])->getIncomingValueForBlock(
              PH->getIncomingBlock(i)));

        buildTree_rec(Operands, Depth + 1);
      }
      return;
      bool Reuse = CanReuseExtract(VL);
      if (Reuse) {
        DEBUG(dbgs() << "SLP: Reusing extract sequence.\n");
      } else {
        BS.cancelScheduling(VL);
      }
      newTreeEntry(VL, Reuse);
      return;

      for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) {
        LoadInst *L = cast<LoadInst>(VL[i]);
        if (!L->isSimple()) {
          BS.cancelScheduling(VL);
          newTreeEntry(VL, false);
          DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
          return;
        }

        if (!isConsecutiveAccess(VL[i], VL[i + 1], DL)) {
          if (VL.size() == 2 && isConsecutiveAccess(VL[1], VL[0], DL)) {
            ++NumLoadsWantToChangeOrder;
          }
          BS.cancelScheduling(VL);
          newTreeEntry(VL, false);
          DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
          return;
        }
      }
      ++NumLoadsWantToKeepOrder;
      newTreeEntry(VL, true);
      DEBUG(dbgs() << "SLP: added a vector of loads.\n");
      return;
    case Instruction::ZExt:
    case Instruction::SExt:
    case Instruction::FPToUI:
    case Instruction::FPToSI:
    case Instruction::FPExt:
    case Instruction::PtrToInt:
    case Instruction::IntToPtr:
    case Instruction::SIToFP:
    case Instruction::UIToFP:
    case Instruction::Trunc:
    case Instruction::FPTrunc:
    case Instruction::BitCast: {
      Type *SrcTy = VL0->getOperand(0)->getType();
      for (unsigned i = 0; i < VL.size(); ++i) {
        Type *Ty = cast<Instruction>(VL[i])->getOperand(0)->getType();
        if (Ty != SrcTy || !isValidElementType(Ty)) {
          BS.cancelScheduling(VL);
          newTreeEntry(VL, false);
          DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n");
          return;
        }
      }
      newTreeEntry(VL, true);
      DEBUG(dbgs() << "SLP: added a vector of casts.\n");

      for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
        ValueList Operands;
        for (unsigned j = 0; j < VL.size(); ++j)
          Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));

        buildTree_rec(Operands, Depth+1);
      }
      return;
    }
    case Instruction::ICmp:
    case Instruction::FCmp: {
      CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
      Type *ComparedTy = cast<Instruction>(VL[0])->getOperand(0)->getType();
      for (unsigned i = 1, e = VL.size(); i < e; ++i) {
        CmpInst *Cmp = cast<CmpInst>(VL[i]);
        if (Cmp->getPredicate() != P0 ||
            Cmp->getOperand(0)->getType() != ComparedTy) {
          BS.cancelScheduling(VL);
          newTreeEntry(VL, false);
          DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
          return;
        }
      }
      newTreeEntry(VL, true);
      DEBUG(dbgs() << "SLP: added a vector of compares.\n");

      for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
        ValueList Operands;
        for (unsigned j = 0; j < VL.size(); ++j)
          Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));

        buildTree_rec(Operands, Depth+1);
      }
      return;
    }
    case Instruction::Add:
    case Instruction::FAdd:
    case Instruction::Sub:
    case Instruction::FSub:
    case Instruction::Mul:
    case Instruction::FMul:
    case Instruction::UDiv:
    case Instruction::SDiv:
    case Instruction::FDiv:
    case Instruction::URem:
    case Instruction::SRem:
    case Instruction::FRem:
    case Instruction::Shl:
    case Instruction::LShr:
    case Instruction::AShr:
    case Instruction::And:
    case Instruction::Or:
    case Instruction::Xor: {
      newTreeEntry(VL, true);
      DEBUG(dbgs() << "SLP: added a vector of bin op.\n");

      if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) {
        ValueList Left, Right;
        reorderInputsAccordingToOpcode(VL, Left, Right);
        buildTree_rec(Left, Depth + 1);
        buildTree_rec(Right, Depth + 1);
        return;
      }

      for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
        ValueList Operands;
        for (unsigned j = 0; j < VL.size(); ++j)
          Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));

        buildTree_rec(Operands, Depth+1);
      }
      return;
    }
    case Instruction::GetElementPtr: {
      for (unsigned j = 0; j < VL.size(); ++j) {
        if (cast<Instruction>(VL[j])->getNumOperands() != 2) {
          DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
          BS.cancelScheduling(VL);
          newTreeEntry(VL, false);
          return;
        }
      }

      Type *Ty0 = cast<Instruction>(VL0)->getOperand(0)->getType();
      for (unsigned j = 0; j < VL.size(); ++j) {
        Type *CurTy = cast<Instruction>(VL[j])->getOperand(0)->getType();
        if (Ty0 != CurTy) {
          DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
          BS.cancelScheduling(VL);
          newTreeEntry(VL, false);
          return;
        }
      }

      for (unsigned j = 0; j < VL.size(); ++j) {
        auto Op = cast<Instruction>(VL[j])->getOperand(1);
        if (!isa<ConstantInt>(Op)) {
          DEBUG(
              dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
          BS.cancelScheduling(VL);
          newTreeEntry(VL, false);
          return;
        }
      }

      newTreeEntry(VL, true);
      DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
      for (unsigned i = 0, e = 2; i < e; ++i) {
        ValueList Operands;
        for (unsigned j = 0; j < VL.size(); ++j)
          Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));

        buildTree_rec(Operands, Depth + 1);
      }
      return;
    }
    case Instruction::Store: {
      const DataLayout &DL = F->getParent()->getDataLayout();
      for (unsigned i = 0, e = VL.size() - 1; i < e; ++i)
        if (!isConsecutiveAccess(VL[i], VL[i + 1], DL)) {
          BS.cancelScheduling(VL);
          newTreeEntry(VL, false);
          DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
          return;
        }

      newTreeEntry(VL, true);
      DEBUG(dbgs() << "SLP: added a vector of stores.\n");

      ValueList Operands;
      for (unsigned j = 0; j < VL.size(); ++j)
        Operands.push_back(cast<Instruction>(VL[j])->getOperand(0));

      buildTree_rec(Operands, Depth + 1);
      return;
    }
    case Instruction::Call: {
      CallInst *CI = cast<CallInst>(VL[0]);
      Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
      if (!isTriviallyVectorizable(ID)) {
        BS.cancelScheduling(VL);
        newTreeEntry(VL, false);
        DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
        return;
      }
      Function *Int = CI->getCalledFunction();
      Value *A1I = nullptr;
      for (unsigned i = 1, e = VL.size(); i != e; ++i) {
        CallInst *CI2 = dyn_cast<CallInst>(VL[i]);
        if (!CI2 || CI2->getCalledFunction() != Int ||
            getIntrinsicIDForCall(CI2, TLI) != ID) {
          BS.cancelScheduling(VL);
          newTreeEntry(VL, false);
          DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i]
                       << "\n");
          return;
        }
        Value *A1J = CI2->getArgOperand(1);
        if (A1I != A1J) {
          BS.cancelScheduling(VL);
          newTreeEntry(VL, false);
          DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI
                       << " argument "<< A1I<<"!=" << A1J
                       << "\n");
          return;
        }
      }

      newTreeEntry(VL, true);
      for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) {
        ValueList Operands;
        for (unsigned j = 0; j < VL.size(); ++j) {
          Operands.push_back(cast<CallInst>(VL[j])->getArgOperand(i));
        }
        buildTree_rec(Operands, Depth + 1);
      }
      return;
    }
    case Instruction::ShuffleVector: {
      if (!isAltShuffle) {
        BS.cancelScheduling(VL);
        newTreeEntry(VL, false);
        DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
        return;
      }
      newTreeEntry(VL, true);
      DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");

      if (isa<BinaryOperator>(VL0)) {
        ValueList Left, Right;
        reorderAltShuffleOperands(VL, Left, Right);
        buildTree_rec(Left, Depth + 1);
        buildTree_rec(Right, Depth + 1);
        return;
      }

      for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
        ValueList Operands;
        for (unsigned j = 0; j < VL.size(); ++j)
          Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));

        buildTree_rec(Operands, Depth + 1);
      }
      return;
    }
    default:
      BS.cancelScheduling(VL);
      newTreeEntry(VL, false);
      DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
      return;
int BoUpSLP::getEntryCost(TreeEntry *E) {
  ArrayRef<Value*> VL = E->Scalars;

  Type *ScalarTy = VL[0]->getType();
  if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
    ScalarTy = SI->getValueOperand()->getType();
  VectorType *VecTy = VectorType::get(ScalarTy, VL.size());

  if (E->NeedToGather) {
    if (allConstant(VL))
      return 0;
    return getGatherCost(E->Scalars);
  }
  unsigned Opcode = getSameOpcode(VL);
  assert(Opcode && getSameType(VL) && getSameBlock(VL) && "Invalid VL");

    if (CanReuseExtract(VL)) {
      for (unsigned i = 0, e = VL.size(); i < e; ++i) {

    return getGatherCost(VecTy);
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    Type *SrcTy = VL0->getOperand(0)->getType();

    int ScalarCost = VL.size() * TTI->getCastInstrCost(VL0->getOpcode(),
                                                       VL0->getType(), SrcTy);

    VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size());
    int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy);
    return VecCost - ScalarCost;
  }
  case Instruction::FCmp:
  case Instruction::ICmp:
  case Instruction::Select:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    int ScalarCost = 0;
    int VecCost = 0;
    if (Opcode == Instruction::FCmp || Opcode == Instruction::ICmp ||
        Opcode == Instruction::Select) {
      VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size());
      ScalarCost = VecTy->getNumElements() *
          TTI->getCmpSelInstrCost(Opcode, ScalarTy, Builder.getInt1Ty());
      VecCost = TTI->getCmpSelInstrCost(Opcode, VecTy, MaskTy);
    } else {
      TargetTransformInfo::OperandValueKind Op1VK =
          TargetTransformInfo::OK_AnyValue;
      TargetTransformInfo::OperandValueKind Op2VK =
          TargetTransformInfo::OK_UniformConstantValue;
      for (unsigned i = 0; i < VL.size(); ++i) {
        const Instruction *I = cast<Instruction>(VL[i]);
        if (!isa<ConstantInt>(I->getOperand(1))) {
          Op2VK = TargetTransformInfo::OK_AnyValue;
          break;
        }
      }
      ScalarCost = VecTy->getNumElements() *
          TTI->getArithmeticInstrCost(Opcode, ScalarTy, Op1VK, Op2VK);
      VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy, Op1VK, Op2VK);
    }
    return VecCost - ScalarCost;
  }
  case Instruction::GetElementPtr: {
    int ScalarCost =
        VecTy->getNumElements() *
        TTI->getArithmeticInstrCost(Instruction::Add, ScalarTy, Op1VK, Op2VK);
    int VecCost =
        TTI->getArithmeticInstrCost(Instruction::Add, VecTy, Op1VK, Op2VK);

    return VecCost - ScalarCost;
  }
  case Instruction::Load: {
    int ScalarLdCost = VecTy->getNumElements() *
        TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0);
    int VecLdCost = TTI->getMemoryOpCost(Instruction::Load, VecTy, 1, 0);
    return VecLdCost - ScalarLdCost;
  }
  case Instruction::Store: {
    int ScalarStCost = VecTy->getNumElements() *
        TTI->getMemoryOpCost(Instruction::Store, ScalarTy, 1, 0);
    int VecStCost = TTI->getMemoryOpCost(Instruction::Store, VecTy, 1, 0);
    return VecStCost - ScalarStCost;
  }
  case Instruction::Call: {
    CallInst *CI = cast<CallInst>(VL0);
    Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);

    int ScalarCallCost = VecTy->getNumElements() *
        TTI->getIntrinsicInstrCost(ID, ScalarTy, ScalarTys);

    int VecCallCost = TTI->getIntrinsicInstrCost(ID, VecTy, VecTys);

    DEBUG(dbgs() << "SLP: Call cost "<< VecCallCost - ScalarCallCost
          << " (" << VecCallCost << "-" << ScalarCallCost << ")"
          << " for " << *CI << "\n");

    return VecCallCost - ScalarCallCost;
  }
  case Instruction::ShuffleVector: {
    int ScalarCost = 0;
    int VecCost = 0;
    for (unsigned i = 0; i < VL.size(); ++i) {
      Instruction *I = cast<Instruction>(VL[i]);
      ScalarCost +=
          TTI->getArithmeticInstrCost(I->getOpcode(), ScalarTy, Op1VK, Op2VK);
    }
    Instruction *I0 = cast<Instruction>(VL[0]);
    VecCost =
        TTI->getArithmeticInstrCost(I0->getOpcode(), VecTy, Op1VK, Op2VK);
    Instruction *I1 = cast<Instruction>(VL[1]);
    VecCost +=
        TTI->getArithmeticInstrCost(I1->getOpcode(), VecTy, Op1VK, Op2VK);
    return VecCost - ScalarCost;
  }
bool BoUpSLP::isFullyVectorizableTinyTree() {
  DEBUG(dbgs() << "SLP: Check whether the tree with height " <<
        VectorizableTree.size() << " is fully vectorizable .\n");

  // We only handle trees of height 2.
  if (VectorizableTree.size() != 2)
    return false;

  // Handle splat and all-constants stores.
  if (!VectorizableTree[0].NeedToGather &&
      (allConstant(VectorizableTree[1].Scalars) ||
       isSplat(VectorizableTree[1].Scalars)))
    return true;

  // Gathering cost would be too much for tiny trees.
  if (VectorizableTree[0].NeedToGather || VectorizableTree[1].NeedToGather)
    return false;
int BoUpSLP::getSpillCost() {
  int Cost = 0;

  unsigned BundleWidth = VectorizableTree.front().Scalars.size();

  for (unsigned N = 0; N < VectorizableTree.size(); ++N) {

    DEBUG(
      dbgs() << "SLP: #LV: " << LiveValues.size();
      for (auto *X : LiveValues)
        dbgs() << " " << X->getName();
      dbgs() << ", Looking at ";
    );

    // Update LiveValues.
    LiveValues.erase(PrevInst);
    for (auto &J : PrevInst->operands()) {
      if (isa<Instruction>(&*J) && ScalarToTreeEntry.count(&*J))
        LiveValues.insert(cast<Instruction>(&*J));
    }

    while (InstIt != PrevInstIt) {

      if (isa<CallInst>(&*PrevInstIt) && &*PrevInstIt != PrevInst) {
        SmallVector<Type*, 4> V;
        for (auto *II : LiveValues)
          V.push_back(VectorType::get(II->getType(), BundleWidth));
        Cost += TTI->getCostOfKeepingLiveOverCall(V);
      }

  DEBUG(dbgs() << "SLP: SpillCost=" << Cost << "\n");
int BoUpSLP::getTreeCost() {
  int Cost = 0;
  DEBUG(dbgs() << "SLP: Calculating cost for tree of size " <<
        VectorizableTree.size() << ".\n");

  // We only vectorize tiny trees if it is fully vectorizable.
  if (VectorizableTree.size() < 3 && !isFullyVectorizableTinyTree()) {
    if (VectorizableTree.empty()) {
      assert(!ExternalUses.size() && "We should not have any external users");
    }
    return INT_MAX;
  }

  unsigned BundleWidth = VectorizableTree[0].Scalars.size();

  for (unsigned i = 0, e = VectorizableTree.size(); i != e; ++i) {
    int C = getEntryCost(&VectorizableTree[i]);
    DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle that starts with "
          << *VectorizableTree[i].Scalars[0] << " .\n");
    Cost += C;
  }

  SmallSet<Value *, 16> ExtractCostCalculated;
  int ExtractCost = 0;
  for (UserList::iterator I = ExternalUses.begin(), E = ExternalUses.end();
       I != E; ++I) {
    // We only add extract cost once for the same scalar.
    if (!ExtractCostCalculated.insert(I->Scalar).second)
      continue;

    // Uses by ephemeral values are free.
    if (EphValues.count(I->User))
      continue;

  Cost += getSpillCost();

  DEBUG(dbgs() << "SLP: Total Cost " << Cost + ExtractCost << ".\n");
  return Cost + ExtractCost;
int BoUpSLP::getGatherCost(Type *Ty) {
  int Cost = 0;
  for (unsigned i = 0, e = cast<VectorType>(Ty)->getNumElements(); i < e; ++i)
    Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i);
  return Cost;
}

  Type *ScalarTy = VL[0]->getType();
  if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
    ScalarTy = SI->getValueOperand()->getType();

  return getGatherCost(VecTy);
  if (LoadInst *LI = dyn_cast<LoadInst>(I))
    return LI->getPointerOperand();
  if (StoreInst *SI = dyn_cast<StoreInst>(I))
    return SI->getPointerOperand();
  return nullptr;

unsigned BoUpSLP::getAddressSpaceOperand(Value *I) {
  if (LoadInst *L = dyn_cast<LoadInst>(I))
    return L->getPointerAddressSpace();
  if (StoreInst *S = dyn_cast<StoreInst>(I))
    return S->getPointerAddressSpace();
  return -1;
}

  unsigned ASA = getAddressSpaceOperand(A);
  unsigned ASB = getAddressSpaceOperand(B);

  // Check that the address spaces match and that the pointers are valid.
  if (!PtrA || !PtrB || (ASA != ASB))
    return false;

  Type *Ty = cast<PointerType>(PtrA->getType())->getElementType();

  APInt OffsetA(PtrBitWidth, 0), OffsetB(PtrBitWidth, 0);

  APInt OffsetDelta = OffsetB - OffsetA;

  // Check if they are based on the same pointer. That makes the offsets
  // sufficient.
  if (PtrA == PtrB)
    return OffsetDelta == Size;

  APInt BaseDelta = Size - OffsetDelta;

  // Otherwise compute the distance with SCEV between the base pointers.
  const SCEV *PtrSCEVA = SE->getSCEV(PtrA);
  const SCEV *PtrSCEVB = SE->getSCEV(PtrB);
  const SCEV *C = SE->getConstant(BaseDelta);
  const SCEV *X = SE->getAddExpr(PtrSCEVA, C);
  return X == PtrSCEVB;
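// Worked example of the check above (mine, not from the source): for
//   %a = getelementptr i32, i32* %p, i64 0   ; OffsetA = 0
//   %b = getelementptr i32, i32* %p, i64 1   ; OffsetB = 4
// Size is 4 bytes, so after stripping constant offsets both bases are %p and
// OffsetDelta == Size, and the fast path answers "consecutive". With
// different base pointers, SCEV must instead prove
// PtrSCEVB == PtrSCEVA + (Size - OffsetDelta).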
  const DataLayout &DL = F->getParent()->getDataLayout();

  for (unsigned i = 0, e = VL.size(); i < e; ++i) {
    Left.push_back(cast<Instruction>(VL[i])->getOperand(0));
    Right.push_back(cast<Instruction>(VL[i])->getOperand(1));
  }

  // Reorder if we have a commutative operation and consecutive access
  // at the right place.
  for (unsigned j = 0; j < VL.size() - 1; ++j) {
    if (LoadInst *L = dyn_cast<LoadInst>(Left[j])) {
      if (LoadInst *L1 = dyn_cast<LoadInst>(Right[j + 1])) {
        Instruction *VL1 = cast<Instruction>(VL[j]);
        Instruction *VL2 = cast<Instruction>(VL[j + 1]);
        if (isConsecutiveAccess(L, L1, DL) && VL1->isCommutative()) {
          std::swap(Left[j], Right[j]);
          continue;
        } else if (isConsecutiveAccess(L, L1, DL) && VL2->isCommutative()) {
          std::swap(Left[j + 1], Right[j + 1]);
          continue;
        }
      }
    }
    if (LoadInst *L = dyn_cast<LoadInst>(Right[j])) {
      if (LoadInst *L1 = dyn_cast<LoadInst>(Left[j + 1])) {
        Instruction *VL1 = cast<Instruction>(VL[j]);
        Instruction *VL2 = cast<Instruction>(VL[j + 1]);
        if (isConsecutiveAccess(L, L1, DL) && VL1->isCommutative()) {
          std::swap(Left[j], Right[j]);
          continue;
        } else if (isConsecutiveAccess(L, L1, DL) && VL2->isCommutative()) {
          std::swap(Left[j + 1], Right[j + 1]);
          continue;
        }
      }
    }
  }
  bool AllSameOpcodeLeft = true;
  bool AllSameOpcodeRight = true;
  for (unsigned i = 0, e = VL.size(); i != e; ++i) {
    Instruction *I = cast<Instruction>(VL[i]);
    Instruction *ILeft = dyn_cast<Instruction>(I->getOperand(0));
    Instruction *IRight = dyn_cast<Instruction>(I->getOperand(1));

    // Check whether all operands on one side have the same opcode. In this
    // case we can sort the operands to simplify later code.
    if (i && AllSameOpcodeLeft && ILeft) {
      if (Instruction *PLeft = dyn_cast<Instruction>(OrigLeft[i - 1])) {
        if (PLeft->getOpcode() != ILeft->getOpcode())
          AllSameOpcodeLeft = false;
      } else
        AllSameOpcodeLeft = false;
    }
    if (i && AllSameOpcodeRight && IRight) {
      if (Instruction *PRight = dyn_cast<Instruction>(OrigRight[i - 1])) {
        if (PRight->getOpcode() != IRight->getOpcode())
          AllSameOpcodeRight = false;
      } else
        AllSameOpcodeRight = false;
    }

    // Sort two opcodes. In the code below we try to preserve the ability to
    // use broadcasts of values.
    if (ILeft && IRight) {
      if (!i && ILeft->getOpcode() > IRight->getOpcode()) {
        Left.push_back(IRight);
        Right.push_back(ILeft);
      } else if (i && ILeft->getOpcode() > IRight->getOpcode() &&
                 Right[i - 1] != IRight) {
        Left.push_back(IRight);
        Right.push_back(ILeft);
      } else if (i && ILeft->getOpcode() == IRight->getOpcode() &&
                 Right[i - 1] == ILeft) {
        Left.push_back(IRight);
        Right.push_back(ILeft);
      } else if (i && ILeft->getOpcode() == IRight->getOpcode() &&
                 Left[i - 1] == IRight) {
        Left.push_back(IRight);
        Right.push_back(ILeft);
      } else {
        Left.push_back(ILeft);
        Right.push_back(IRight);
      }
      continue;
    }
  bool LeftBroadcast = isSplat(Left);
  bool RightBroadcast = isSplat(Right);

  // If operands end up being broadcast return this operand order.
  if (LeftBroadcast || RightBroadcast)
    return;

  // Don't reorder if the operands were good to begin with.
  if (AllSameOpcodeRight || AllSameOpcodeLeft) {
    Left = OrigLeft;
    Right = OrigRight;
  }

  const DataLayout &DL = F->getParent()->getDataLayout();

  // Finally check if we can get a longer vectorizable chain by reordering
  // without breaking the good operand order detected above.
  for (unsigned j = 0; j < VL.size() - 1; ++j) {
    if (LoadInst *L = dyn_cast<LoadInst>(Left[j])) {
      if (LoadInst *L1 = dyn_cast<LoadInst>(Right[j + 1])) {
        if (isConsecutiveAccess(L, L1, DL)) {
          std::swap(Left[j + 1], Right[j + 1]);
          continue;
        }
      }
    }
    if (LoadInst *L = dyn_cast<LoadInst>(Right[j])) {
      if (LoadInst *L1 = dyn_cast<LoadInst>(Left[j + 1])) {
        if (isConsecutiveAccess(L, L1, DL)) {
          std::swap(Left[j + 1], Right[j + 1]);
          continue;
        }
      }
    }
  }
  Builder.SetInsertPoint(VL0->getParent(), NextInst);
  Builder.SetCurrentDebugLocation(VL0->getDebugLoc());

    Vec = Builder.CreateInsertElement(Vec, VL[i], Builder.getInt32(i));
    if (Instruction *Insrt = dyn_cast<Instruction>(Vec)) {
      GatherSeq.insert(Insrt);
      CSEBlocks.insert(Insrt->getParent());

      // Add to our 'need-to-extract' list.
      if (ScalarToTreeEntry.count(VL[i])) {
        int Idx = ScalarToTreeEntry[VL[i]];
        TreeEntry *E = &VectorizableTree[Idx];

        // Find which lane we need to extract.
        int FoundLane = -1;
        for (unsigned Lane = 0, LE = VL.size(); Lane != LE; ++Lane) {
          if (E->Scalars[Lane] == VL[i]) {
            FoundLane = Lane;
            break;
          }
        }
        assert(FoundLane >= 0 && "Could not find the correct lane");
        ExternalUses.push_back(ExternalUser(VL[i], Insrt, FoundLane));
      }
    }

  SmallDenseMap<Value*, int>::const_iterator Entry
    = ScalarToTreeEntry.find(VL[0]);
  if (Entry != ScalarToTreeEntry.end()) {
    int Idx = Entry->second;
    const TreeEntry *En = &VectorizableTree[Idx];
    if (En->isSame(VL) && En->VectorizedValue)
      return En->VectorizedValue;
  }

  if (ScalarToTreeEntry.count(VL[0])) {
    int Idx = ScalarToTreeEntry[VL[0]];
    TreeEntry *E = &VectorizableTree[Idx];
    if (E->isSame(VL))
      return vectorizeTree(E);
  }

  Type *ScalarTy = VL[0]->getType();
  if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
    ScalarTy = SI->getValueOperand()->getType();
  VectorType *VecTy = VectorType::get(ScalarTy, VL.size());

  return Gather(VL, VecTy);
Value *BoUpSLP::vectorizeTree(TreeEntry *E) {

  if (E->VectorizedValue) {
    DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
    return E->VectorizedValue;
  }

  Instruction *VL0 = cast<Instruction>(E->Scalars[0]);
  Type *ScalarTy = VL0->getType();
  if (StoreInst *SI = dyn_cast<StoreInst>(VL0))
    ScalarTy = SI->getValueOperand()->getType();
  VectorType *VecTy = VectorType::get(ScalarTy, E->Scalars.size());

  if (E->NeedToGather) {
    setInsertPointAfterBundle(E->Scalars);
    return Gather(E->Scalars, VecTy);
  }

  const DataLayout &DL = F->getParent()->getDataLayout();
  unsigned Opcode = getSameOpcode(E->Scalars);

    case Instruction::PHI: {
      PHINode *PH = dyn_cast<PHINode>(VL0);
      Builder.SetInsertPoint(PH->getParent()->getFirstNonPHI());
      Builder.SetCurrentDebugLocation(PH->getDebugLoc());
      PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
      E->VectorizedValue = NewPhi;

      // PHINodes may have multiple entries from the same block. We want to
      // visit every block once.
      SmallSet<BasicBlock*, 4> VisitedBBs;

      for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
        ValueList Operands;
        BasicBlock *IBB = PH->getIncomingBlock(i);

        if (!VisitedBBs.insert(IBB).second) {
          NewPhi->addIncoming(NewPhi->getIncomingValueForBlock(IBB), IBB);
          continue;
        }

        // Prepare the operand vector.
        for (Value *V : E->Scalars)
          Operands.push_back(cast<PHINode>(V)->getIncomingValueForBlock(IBB));

        Builder.SetInsertPoint(IBB->getTerminator());
        Builder.SetCurrentDebugLocation(PH->getDebugLoc());
        Value *Vec = vectorizeTree(Operands);
        NewPhi->addIncoming(Vec, IBB);
      }

      assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
             "Invalid number of incoming values");
      return NewPhi;
    }

    case Instruction::ExtractElement: {
      if (CanReuseExtract(E->Scalars)) {
        Value *V = VL0->getOperand(0);
        E->VectorizedValue = V;
        return V;
      }
      return Gather(E->Scalars, VecTy);
    }
    case Instruction::ZExt:
    case Instruction::SExt:
    case Instruction::FPToUI:
    case Instruction::FPToSI:
    case Instruction::FPExt:
    case Instruction::PtrToInt:
    case Instruction::IntToPtr:
    case Instruction::SIToFP:
    case Instruction::UIToFP:
    case Instruction::Trunc:
    case Instruction::FPTrunc:
    case Instruction::BitCast: {
      ValueList INVL;
      for (Value *V : E->Scalars)
        INVL.push_back(cast<Instruction>(V)->getOperand(0));

      setInsertPointAfterBundle(E->Scalars);

      Value *InVec = vectorizeTree(INVL);

      if (Value *V = alreadyVectorized(E->Scalars))
        return V;

      CastInst *CI = dyn_cast<CastInst>(VL0);
      Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy);
      E->VectorizedValue = V;
      ++NumVectorInstructions;
      return V;
    }
    case Instruction::FCmp:
    case Instruction::ICmp: {
      ValueList LHSV, RHSV;
      for (Value *V : E->Scalars) {
        LHSV.push_back(cast<Instruction>(V)->getOperand(0));
        RHSV.push_back(cast<Instruction>(V)->getOperand(1));
      }

      setInsertPointAfterBundle(E->Scalars);

      Value *L = vectorizeTree(LHSV);
      Value *R = vectorizeTree(RHSV);

      if (Value *V = alreadyVectorized(E->Scalars))
        return V;

      CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
      Value *V;
      if (Opcode == Instruction::FCmp)
        V = Builder.CreateFCmp(P0, L, R);
      else
        V = Builder.CreateICmp(P0, L, R);

      E->VectorizedValue = V;
      ++NumVectorInstructions;
      return V;
    }
    case Instruction::Select: {
      ValueList TrueVec, FalseVec, CondVec;
      for (Value *V : E->Scalars) {
        CondVec.push_back(cast<Instruction>(V)->getOperand(0));
        TrueVec.push_back(cast<Instruction>(V)->getOperand(1));
        FalseVec.push_back(cast<Instruction>(V)->getOperand(2));
      }

      setInsertPointAfterBundle(E->Scalars);

      Value *Cond = vectorizeTree(CondVec);
      Value *True = vectorizeTree(TrueVec);
      Value *False = vectorizeTree(FalseVec);

      if (Value *V = alreadyVectorized(E->Scalars))
        return V;

      Value *V = Builder.CreateSelect(Cond, True, False);
      E->VectorizedValue = V;
      ++NumVectorInstructions;
      return V;
    }
    case Instruction::Add:
    case Instruction::FAdd:
    case Instruction::Sub:
    case Instruction::FSub:
    case Instruction::Mul:
    case Instruction::FMul:
    case Instruction::UDiv:
    case Instruction::SDiv:
    case Instruction::FDiv:
    case Instruction::URem:
    case Instruction::SRem:
    case Instruction::FRem:
    case Instruction::Shl:
    case Instruction::LShr:
    case Instruction::AShr:
    case Instruction::And:
    case Instruction::Or:
    case Instruction::Xor: {
      ValueList LHSVL, RHSVL;
      if (isa<BinaryOperator>(VL0) && VL0->isCommutative())
        reorderInputsAccordingToOpcode(E->Scalars, LHSVL, RHSVL);
      else
        for (Value *V : E->Scalars) {
          LHSVL.push_back(cast<Instruction>(V)->getOperand(0));
          RHSVL.push_back(cast<Instruction>(V)->getOperand(1));
        }

      setInsertPointAfterBundle(E->Scalars);

      Value *LHS = vectorizeTree(LHSVL);
      Value *RHS = vectorizeTree(RHSVL);

      if (LHS == RHS && isa<Instruction>(LHS)) {
        assert((VL0->getOperand(0) == VL0->getOperand(1)) && "Invalid order");
      }

      if (Value *V = alreadyVectorized(E->Scalars))
        return V;

      BinaryOperator *BinOp = cast<BinaryOperator>(VL0);
      Value *V = Builder.CreateBinOp(BinOp->getOpcode(), LHS, RHS);
      E->VectorizedValue = V;
      propagateIRFlags(E->VectorizedValue, E->Scalars);
      ++NumVectorInstructions;

      if (Instruction *I = dyn_cast<Instruction>(V))
        return propagateMetadata(I, E->Scalars);

      return V;
    }
    case Instruction::Load: {
      // Loads are inserted at the head of the tree because we don't want to
      // sink them all the way down past store instructions.
      setInsertPointAfterBundle(E->Scalars);

      LoadInst *LI = cast<LoadInst>(VL0);
      unsigned AS = LI->getPointerAddressSpace();

      Value *VecPtr = Builder.CreateBitCast(LI->getPointerOperand(),
                                            VecTy->getPointerTo(AS));

      if (ScalarToTreeEntry.count(LI->getPointerOperand()))
        ExternalUses.push_back(
            ExternalUser(LI->getPointerOperand(), cast<User>(VecPtr), 0));

      LI = Builder.CreateLoad(VecPtr);
      E->VectorizedValue = LI;
      ++NumVectorInstructions;
      return propagateMetadata(LI, E->Scalars);
    }
    case Instruction::Store: {
      StoreInst *SI = cast<StoreInst>(VL0);
      unsigned AS = SI->getPointerAddressSpace();

      ValueList ValueOp;
      for (Value *V : E->Scalars)
        ValueOp.push_back(cast<StoreInst>(V)->getValueOperand());

      setInsertPointAfterBundle(E->Scalars);

      Value *VecValue = vectorizeTree(ValueOp);
      Value *VecPtr = Builder.CreateBitCast(SI->getPointerOperand(),
                                            VecTy->getPointerTo(AS));
      StoreInst *S = Builder.CreateStore(VecValue, VecPtr);

      if (ScalarToTreeEntry.count(SI->getPointerOperand()))
        ExternalUses.push_back(
            ExternalUser(SI->getPointerOperand(), cast<User>(VecPtr), 0));

      E->VectorizedValue = S;
      ++NumVectorInstructions;
      return propagateMetadata(S, E->Scalars);
    }
    case Instruction::GetElementPtr: {
      setInsertPointAfterBundle(E->Scalars);

      ValueList Op0VL;
      for (Value *V : E->Scalars)
        Op0VL.push_back(cast<GetElementPtrInst>(V)->getOperand(0));

      Value *Op0 = vectorizeTree(Op0VL);

      std::vector<Value *> OpVecs;
      for (int j = 1, e = cast<GetElementPtrInst>(VL0)->getNumOperands(); j < e;
           ++j) {
        ValueList OpVL;
        for (Value *V : E->Scalars)
          OpVL.push_back(cast<GetElementPtrInst>(V)->getOperand(j));

        Value *OpVec = vectorizeTree(OpVL);
        OpVecs.push_back(OpVec);
      }

      Value *V = Builder.CreateGEP(
          cast<GetElementPtrInst>(VL0)->getSourceElementType(), Op0, OpVecs);
      E->VectorizedValue = V;
      ++NumVectorInstructions;

      if (Instruction *I = dyn_cast<Instruction>(V))
        return propagateMetadata(I, E->Scalars);

      return V;
    }
    case Instruction::Call: {
      CallInst *CI = cast<CallInst>(VL0);
      setInsertPointAfterBundle(E->Scalars);

      Value *ScalarArg = nullptr;

      std::vector<Value *> OpVecs;
      for (int j = 0, e = CI->getNumArgOperands(); j < e; ++j) {
        ValueList OpVL;

        CallInst *CEI = cast<CallInst>(E->Scalars[0]);

        for (Value *V : E->Scalars) {
          OpVL.push_back(cast<CallInst>(V)->getArgOperand(j));
        }

        Value *OpVec = vectorizeTree(OpVL);
        DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n");
        OpVecs.push_back(OpVec);
      }

      Module *M = F->getParent();
      Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
      Type *Tys[] = { VectorType::get(CI->getType(), E->Scalars.size()) };
      Function *CF = Intrinsic::getDeclaration(M, ID, Tys);
      Value *V = Builder.CreateCall(CF, OpVecs);

      // If a scalar argument uses an in-tree scalar, add the new call to the
      // ExternalUses list so an extract is generated for it later.
      if (ScalarArg && ScalarToTreeEntry.count(ScalarArg))
        ExternalUses.push_back(ExternalUser(ScalarArg, cast<User>(V), 0));

      E->VectorizedValue = V;
      ++NumVectorInstructions;
      return V;
    }
    case Instruction::ShuffleVector: {
      ValueList LHSVL, RHSVL;
      assert(isa<BinaryOperator>(VL0) && "Invalid Shuffle Vector Operand");
      reorderAltShuffleOperands(E->Scalars, LHSVL, RHSVL);
      setInsertPointAfterBundle(E->Scalars);

      Value *LHS = vectorizeTree(LHSVL);
      Value *RHS = vectorizeTree(RHSVL);

      if (Value *V = alreadyVectorized(E->Scalars))
        return V;

      // Create a vector of LHS op1 RHS.
      BinaryOperator *BinOp0 = cast<BinaryOperator>(VL0);
      Value *V0 = Builder.CreateBinOp(BinOp0->getOpcode(), LHS, RHS);

      // Create a vector of LHS op2 RHS.
      Instruction *VL1 = cast<Instruction>(E->Scalars[1]);
      BinaryOperator *BinOp1 = cast<BinaryOperator>(VL1);
      Value *V1 = Builder.CreateBinOp(BinOp1->getOpcode(), LHS, RHS);

      // Create a shuffle that takes alternate operations from the two
      // vectors, and gather up odd and even scalars to propagate IR flags to
      // each vector operation.
      ValueList OddScalars, EvenScalars;
      unsigned e = E->Scalars.size();
      SmallVector<Constant *, 8> Mask(e);
      for (unsigned i = 0; i < e; ++i) {
        if (i & 1) {
          Mask[i] = Builder.getInt32(e + i);
          OddScalars.push_back(E->Scalars[i]);
        } else {
          Mask[i] = Builder.getInt32(i);
          EvenScalars.push_back(E->Scalars[i]);
        }
      }

      Value *ShuffleMask = ConstantVector::get(Mask);
      propagateIRFlags(V0, EvenScalars);
      propagateIRFlags(V1, OddScalars);

      Value *V = Builder.CreateShuffleVector(V0, V1, ShuffleMask);
      E->VectorizedValue = V;
      ++NumVectorInstructions;
      if (Instruction *I = dyn_cast<Instruction>(V))
        return propagateMetadata(I, E->Scalars);
    }
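      // Concrete instance of the mask built above (a sketch): for a 4-wide
      // {fadd, fsub, fadd, fsub} bundle, V0 holds the FAdd results, V1 the
      // FSub results, and the mask is <0, 5, 2, 7> — even lanes come from V0
      // and odd lanes from V1 (indices 4..7 address V1 in a shufflevector).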
Value *BoUpSLP::vectorizeTree() {
  // All blocks must be scheduled before any instructions are inserted.
  for (auto &BSIter : BlocksSchedules) {
    scheduleBlock(BSIter.second.get());
  }

  Builder.SetInsertPoint(F->getEntryBlock().begin());
  vectorizeTree(&VectorizableTree[0]);

  DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size() << " values .\n");

  // Extract all of the elements with the external uses.
  for (UserList::iterator it = ExternalUses.begin(), e = ExternalUses.end();
       it != e; ++it) {
    Value *Scalar = it->Scalar;
    llvm::User *User = it->User;

    assert(ScalarToTreeEntry.count(Scalar) && "Invalid scalar");

    int Idx = ScalarToTreeEntry[Scalar];
    TreeEntry *E = &VectorizableTree[Idx];
    assert(!E->NeedToGather && "Extracting from a gather list");

    Value *Vec = E->VectorizedValue;
    assert(Vec && "Can't find vectorizable value");

    Value *Lane = Builder.getInt32(it->Lane);
    // Generate extracts for out-of-tree users.
    // Find the insertion point for the extractelement lane.
    if (isa<Instruction>(Vec)){
      if (PHINode *PH = dyn_cast<PHINode>(User)) {
        Value *Ex = Builder.CreateExtractElement(Vec, Lane);
      } else {
        Builder.SetInsertPoint(cast<Instruction>(User));
        Value *Ex = Builder.CreateExtractElement(Vec, Lane);
        CSEBlocks.insert(cast<Instruction>(User)->getParent());
        User->replaceUsesOfWith(Scalar, Ex);
      }
    } else {
      Builder.SetInsertPoint(F->getEntryBlock().begin());
      Value *Ex = Builder.CreateExtractElement(Vec, Lane);
      CSEBlocks.insert(&F->getEntryBlock());
      User->replaceUsesOfWith(Scalar, Ex);
    }

    DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
  }

  // For each vectorized value:
  for (int EIdx = 0, EE = VectorizableTree.size(); EIdx < EE; ++EIdx) {
    TreeEntry *Entry = &VectorizableTree[EIdx];

    for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
      Value *Scalar = Entry->Scalars[Lane];
      // No need to handle users of gathered values.
      if (Entry->NeedToGather)
        continue;

      assert(Entry->VectorizedValue && "Can't find vectorizable value");

      for (User *U : Scalar->users()) {
        DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");

        assert((ScalarToTreeEntry.count(U) ||
                // It is legal to replace users in the ignorelist by undef.
                (std::find(UserIgnoreList.begin(), UserIgnoreList.end(), U) !=
                 UserIgnoreList.end())) &&
               "Replacing out-of-tree value with undef");
      }

      DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
      eraseInstruction(cast<Instruction>(Scalar));
    }
  }

  Builder.ClearInsertionPoint();

  return VectorizableTree[0].VectorizedValue;
}
void BoUpSLP::optimizeGatherSequence() {
  DEBUG(dbgs() << "SLP: Optimizing " << GatherSeq.size()
        << " gather sequences instructions.\n");
  // LICM InsertElementInst sequences.
  for (SetVector<Instruction *>::iterator it = GatherSeq.begin(),
       e = GatherSeq.end(); it != e; ++it) {

    // If the vector or the element that we insert into it are
    // instructions that are defined in this basic block then we can't
    // hoist this instruction.
    if (CurrVec && L->contains(CurrVec))
      continue;
    if (NewElem && L->contains(NewElem))
      continue;
  }

  // Sort blocks by domination. This ensures we visit a block after all blocks
  // dominating it are visited.
  SmallVector<const DomTreeNode *, 8> CSEWorkList;
  CSEWorkList.reserve(CSEBlocks.size());
  for (BasicBlock *BB : CSEBlocks)
    if (DomTreeNode *N = DT->getNode(BB)) {
      assert(DT->isReachableFromEntry(N));
      CSEWorkList.push_back(N);
    }

  std::stable_sort(CSEWorkList.begin(), CSEWorkList.end(),
                   [this](const DomTreeNode *A, const DomTreeNode *B) {
    return DT->properlyDominates(A, B);
  });

  // Perform O(N^2) search over the gather sequences and merge identical
  // instructions.
  SmallVector<Instruction *, 16> Visited;
  for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
    assert((I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
           "Worklist not sorted properly!");

      if (!isa<InsertElementInst>(In) && !isa<ExtractElementInst>(In))
        continue;

      // Check if we can replace this instruction with any of the visited
      // instructions.
      for (SmallVectorImpl<Instruction *>::iterator v = Visited.begin(),
                                                    ve = Visited.end();
           v != ve; ++v) {
        if (In->isIdenticalTo(*v) &&
            DT->dominates((*v)->getParent(), In->getParent())) {
          In->replaceAllUsesWith(*v);
          eraseInstruction(In);
          In = nullptr;
          break;
        }
      }
      if (In) {
        assert(std::find(Visited.begin(), Visited.end(), In) == Visited.end());
        Visited.push_back(In);
      }
  if (isa<PHINode>(VL[0]))
    return true;

  // Initialize the instruction bundle.
  Instruction *OldScheduleEnd = ScheduleEnd;
  ScheduleData *PrevInBundle = nullptr;
  ScheduleData *Bundle = nullptr;
  bool ReSchedule = false;
  DEBUG(dbgs() << "SLP: bundle: " << *VL[0] << "\n");
  for (Value *V : VL) {
    extendSchedulingRegion(V);
    ScheduleData *BundleMember = getScheduleData(V);
    assert(BundleMember &&
           "no ScheduleData for bundle member (maybe not in same basic block)");
    if (BundleMember->IsScheduled) {
      // A bundle member was scheduled as a single instruction before and now
      // needs to be scheduled as part of the bundle. We just get rid of the
      // existing schedule.
      DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
                   << " was already scheduled\n");
      ReSchedule = true;
    }
    assert(BundleMember->isSchedulingEntity() &&
           "bundle member already part of other bundle");
    if (PrevInBundle) {
      PrevInBundle->NextInBundle = BundleMember;
    } else {
      Bundle = BundleMember;
    }
    // Group the instructions to a bundle.
    BundleMember->UnscheduledDepsInBundle = 0;
    Bundle->UnscheduledDepsInBundle += BundleMember->UnscheduledDeps;
    BundleMember->FirstInBundle = Bundle;
    PrevInBundle = BundleMember;
  }
  if (ScheduleEnd != OldScheduleEnd) {
    // The scheduling region got new instructions at the lower end (or it is
    // a new region for the first bundle). This makes it necessary to
    // recalculate all dependencies.
    for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
      ScheduleData *SD = getScheduleData(I);
      SD->clearDependencies();
    }
    ReSchedule = true;
  }
  if (ReSchedule) {
    resetSchedule();
    initialFillReadyList(ReadyInsts);
  }

  DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle << " in block "
               << BB->getName() << "\n");

  calculateDependencies(Bundle, true, SLP);

  // Now try to schedule the new bundle. As soon as the bundle is "ready" it
  // means that there are no cyclic dependencies and we can schedule it.
  while (!Bundle->isReady() && !ReadyInsts.empty()) {

    ScheduleData *pickedSD = ReadyInsts.back();
    ReadyInsts.pop_back();

    if (pickedSD->isSchedulingEntity() && pickedSD->isReady()) {
      schedule(pickedSD, ReadyInsts);
    }
  }
  return Bundle->isReady();
  if (isa<PHINode>(VL[0]))
    return;

  ScheduleData *Bundle = getScheduleData(VL[0]);
  DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n");
  assert(!Bundle->IsScheduled &&
         "Can't cancel bundle which is already scheduled");
  assert(Bundle->isSchedulingEntity() && Bundle->isPartOfBundle() &&
         "tried to unbundle something which is not a bundle");

  // Un-bundle: make single instructions out of the bundle.
  ScheduleData *BundleMember = Bundle;
  while (BundleMember) {
    assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links");
    BundleMember->FirstInBundle = BundleMember;
    ScheduleData *Next = BundleMember->NextInBundle;
    BundleMember->NextInBundle = nullptr;
    BundleMember->UnscheduledDepsInBundle = BundleMember->UnscheduledDeps;
    if (BundleMember->UnscheduledDepsInBundle == 0) {
      ReadyInsts.insert(BundleMember);
    }
    BundleMember = Next;
  }
void BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V) {
  if (getScheduleData(V))
    return;
  Instruction *I = dyn_cast<Instruction>(V);
  assert(I && "bundle member must be an instruction");
  assert(!isa<PHINode>(I) && "phi nodes don't need to be scheduled");
  if (!ScheduleStart) {
    // It's the first instruction in the new region.
    initScheduleData(I, I->getNextNode(), nullptr, nullptr);
    ScheduleStart = I;
    ScheduleEnd = I->getNextNode();
    assert(ScheduleEnd && "tried to vectorize a TerminatorInst?");
    DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
    return;
  }
  // Search up and down at the same time, because we don't know if the new
  // instruction is above or below the existing scheduling region.
  BasicBlock::reverse_iterator UpIter(ScheduleStart);
  BasicBlock::reverse_iterator UpperEnd = BB->rend();
  BasicBlock::iterator DownIter(ScheduleEnd);
  BasicBlock::iterator LowerEnd = BB->end();
  for (;;) {
    if (UpIter != UpperEnd) {
      if (&*UpIter == I) {
        initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
        ScheduleStart = I;
        DEBUG(dbgs() << "SLP: extend schedule region start to " << *I << "\n");
        return;
      }
      UpIter++;
    }
    if (DownIter != LowerEnd) {
      if (&*DownIter == I) {
        initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
                         nullptr);
        ScheduleEnd = I->getNextNode();
        assert(ScheduleEnd && "tried to vectorize a TerminatorInst?");
        DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n");
        return;
      }
      DownIter++;
    }
    assert((UpIter != UpperEnd || DownIter != LowerEnd) &&
           "instruction not found in block");
  }
}
void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
                                                Instruction *ToI,
                                                ScheduleData *PrevLoadStore,
                                                ScheduleData *NextLoadStore) {
  ScheduleData *CurrentLoadStore = PrevLoadStore;
  for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
    ScheduleData *SD = ScheduleDataMap[I];
    if (!SD) {
      // Allocate a new ScheduleData for the instruction.
      if (ChunkPos >= ChunkSize) {
        ScheduleDataChunks.push_back(
            llvm::make_unique<ScheduleData[]>(ChunkSize));
        ChunkPos = 0;
      }
      SD = &(ScheduleDataChunks.back()[ChunkPos++]);
      ScheduleDataMap[I] = SD;
      SD->Inst = I;
    }
    assert(!isInSchedulingRegion(SD) &&
           "new ScheduleData already in scheduling region");
    SD->init(SchedulingRegionID);

    if (I->mayReadOrWriteMemory()) {
      // Update the linked list of memory accessing instructions.
      if (CurrentLoadStore) {
        CurrentLoadStore->NextLoadStore = SD;
      } else {
        FirstLoadStoreInRegion = SD;
      }
      CurrentLoadStore = SD;
    }
  }
  if (NextLoadStore) {
    if (CurrentLoadStore)
      CurrentLoadStore->NextLoadStore = NextLoadStore;
  } else {
    LastLoadStoreInRegion = CurrentLoadStore;
  }
}
void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
                                                     bool InsertInReadyList,
                                                     BoUpSLP *SLP) {
  assert(SD->isSchedulingEntity());

  SmallVector<ScheduleData *, 10> WorkList;
  WorkList.push_back(SD);

  while (!WorkList.empty()) {
    ScheduleData *SD = WorkList.back();
    WorkList.pop_back();

    ScheduleData *BundleMember = SD;
    while (BundleMember) {
      assert(isInSchedulingRegion(BundleMember));
      if (!BundleMember->hasValidDependencies()) {

        DEBUG(dbgs() << "SLP: update deps of " << *BundleMember << "\n");
        BundleMember->Dependencies = 0;
        BundleMember->resetUnscheduledDeps();

        // Handle def-use chain dependencies.
        for (User *U : BundleMember->Inst->users()) {
          if (isa<Instruction>(U)) {
            ScheduleData *UseSD = getScheduleData(U);
            if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) {
              BundleMember->Dependencies++;
              ScheduleData *DestBundle = UseSD->FirstInBundle;
              if (!DestBundle->IsScheduled) {
                BundleMember->incrementUnscheduledDeps(1);
              }
              if (!DestBundle->hasValidDependencies()) {
                WorkList.push_back(DestBundle);
              }
            }
          } else {
            BundleMember->Dependencies++;
            BundleMember->incrementUnscheduledDeps(1);
          }
        }

        // Handle the memory dependencies.
        ScheduleData *DepDest = BundleMember->NextLoadStore;
        if (DepDest) {
          bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
          unsigned numAliased = 0;
          unsigned DistToSrc = 1;

          while (DepDest) {
            assert(isInSchedulingRegion(DepDest));

            if (DistToSrc >= MaxMemDepDistance ||
                ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&
                 (numAliased >= AliasedCheckLimit ||
                  SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) {
              numAliased++;

              DepDest->MemoryDependencies.push_back(BundleMember);
              BundleMember->Dependencies++;
              ScheduleData *DestBundle = DepDest->FirstInBundle;
              if (!DestBundle->IsScheduled) {
                BundleMember->incrementUnscheduledDeps(1);
              }
              if (!DestBundle->hasValidDependencies()) {
                WorkList.push_back(DestBundle);
              }
            }
            DepDest = DepDest->NextLoadStore;

            if (DistToSrc >= 2 * MaxMemDepDistance)
              break;
            DistToSrc++;
          }
        }
      }
      BundleMember = BundleMember->NextInBundle;
    }
    if (InsertInReadyList && SD->isReady()) {
      ReadyInsts.push_back(SD);
      DEBUG(dbgs() << "SLP: gets ready on update: " << *SD->Inst << "\n");
    }
  }
}
void BoUpSLP::BlockScheduling::resetSchedule() {
  assert(ScheduleStart &&
         "tried to reset schedule on block which has not been scheduled");
  for (Instruction *I = ScheduleStart; I != ScheduleEnd;
       I = I->getNextNode()) {
    ScheduleData *SD = getScheduleData(I);
    assert(isInSchedulingRegion(SD));
    SD->IsScheduled = false;
    SD->resetUnscheduledDeps();
  }
  ReadyInsts.clear();
}
void BoUpSLP::scheduleBlock(BlockScheduling *BS) {

  if (!BS->ScheduleStart)
    return;

  DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");

  BS->resetSchedule();

  struct ScheduleDataCompare {
    bool operator()(ScheduleData *SD1, ScheduleData *SD2) {
      return SD2->SchedulingPriority < SD1->SchedulingPriority;
    }
  };
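  // A note on the comparator (my gloss): SchedulingPriority is assigned below
  // from each bundle head's position in the original block, so the std::set
  // pops ready bundles in a deterministic order derived from source order
  // rather than in insertion or pointer order.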
  std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;

  // Ensure that all dependency data is updated and fill the ready-list with
  // initial instructions.
  int Idx = 0;
  int NumToSchedule = 0;
  for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
       I = I->getNextNode()) {
    ScheduleData *SD = BS->getScheduleData(I);
    assert(
        SD->isPartOfBundle() == (ScalarToTreeEntry.count(SD->Inst) != 0) &&
        "scheduler and vectorizer have different opinion on what is a bundle");
    SD->FirstInBundle->SchedulingPriority = Idx++;
    if (SD->isSchedulingEntity()) {
      BS->calculateDependencies(SD, false, this);
      NumToSchedule++;
    }
  }
  BS->initialFillReadyList(ReadyInsts);

  Instruction *LastScheduledInst = BS->ScheduleEnd;

  // Do the "real" scheduling.
  while (!ReadyInsts.empty()) {
    ScheduleData *picked = *ReadyInsts.begin();
    ReadyInsts.erase(ReadyInsts.begin());

    // Move the scheduled instruction(s) to their dedicated places, if not
    // there yet.
    ScheduleData *BundleMember = picked;
    while (BundleMember) {
      Instruction *pickedInst = BundleMember->Inst;
      if (LastScheduledInst->getNextNode() != pickedInst) {
        BS->BB->getInstList().remove(pickedInst);
        BS->BB->getInstList().insert(LastScheduledInst, pickedInst);
      }
      LastScheduledInst = pickedInst;
      BundleMember = BundleMember->NextInBundle;
    }

    BS->schedule(picked, ReadyInsts);
    NumToSchedule--;
  }
  assert(NumToSchedule == 0 && "could not schedule all instructions");

  // Avoid duplicate scheduling of the block.
  BS->ScheduleStart = nullptr;
}
  bool runOnFunction(Function &F) override {
    if (skipOptnoneFunction(F))
      return false;

    SE = &getAnalysis<ScalarEvolution>();
    TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
    auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
    TLI = TLIP ? &TLIP->getTLI() : nullptr;
    AA = &getAnalysis<AliasAnalysis>();
    LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
    DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
    AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);

    bool Changed = false;

    // If the target claims to have no vector registers, don't attempt
    // vectorization.
    if (!TTI->getNumberOfRegisters(true))
      return false;

    MaxVecRegSize = TTI->getRegisterBitWidth(true);

    BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC);

      if (unsigned count = collectStores(BB, R)) {
        (void)count;
        DEBUG(dbgs() << "SLP: Found " << count << " stores to vectorize.\n");
        Changed |= vectorizeStoreChains(R);
      }

      Changed |= vectorizeChainsInBlock(BB, R);

    R.optimizeGatherSequence();
  unsigned collectStores(BasicBlock *BB, BoUpSLP &R);

  bool tryToVectorizePair(Value *A, Value *B, BoUpSLP &R);

  bool tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
                          ArrayRef<Value *> BuildVector = None,
                          bool allowReorder = false);

  bool vectorizeStoreChains(BoUpSLP &R);

  bool vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R);

  bool vectorizeStoreChain(ArrayRef<Value *> Chain, int CostThreshold,
                           BoUpSLP &R, unsigned VecRegSize);

  StoreListMap StoreRefs;
  unsigned MaxVecRegSize;

static bool hasValueBeenRAUWed(ArrayRef<Value *> VL, ArrayRef<WeakVH> VH,
                               unsigned SliceBegin, unsigned SliceSize) {
  VL = VL.slice(SliceBegin, SliceSize);
  VH = VH.slice(SliceBegin, SliceSize);
  return !std::equal(VL.begin(), VL.end(), VH.begin());
}
bool SLPVectorizer::vectorizeStoreChain(ArrayRef<Value *> Chain,
                                        int CostThreshold, BoUpSLP &R,
                                        unsigned VecRegSize) {
  unsigned ChainLen = Chain.size();
  DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen
        << "\n");
  Type *StoreTy = cast<StoreInst>(Chain[0])->getValueOperand()->getType();
  auto &DL = cast<StoreInst>(Chain[0])->getModule()->getDataLayout();
  unsigned Sz = DL.getTypeSizeInBits(StoreTy);
  unsigned VF = VecRegSize / Sz;
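  // For example (assuming the 128-bit default register size): a chain of
  // i32 stores gives Sz = 32 and VF = 128 / 32 = 4, so the loop below walks
  // the chain trying to vectorize one group of four consecutive stores at a
  // time.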
  bool Changed = false;
  // Look for profitable vectorizable trees at all offsets, starting at zero.
  for (unsigned i = 0, e = ChainLen; i < e; ++i) {

    // Check that a previous iteration of this loop did not delete the Value.
    if (hasValueBeenRAUWed(Chain, TrackValues, i, VF))
      continue;

    DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << i
          << "\n");
    ArrayRef<Value *> Operands = Chain.slice(i, VF);

    R.buildTree(Operands);

    int Cost = R.getTreeCost();

    DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n");
    if (Cost < CostThreshold) {
      DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n");
      R.vectorizeTree();

      // Move to the next bundle.
      i += VF - 1;
      Changed = true;
    }
  }

  return Changed;
}
bool SLPVectorizer::vectorizeStores(ArrayRef<StoreInst *> Stores,
                                    int costThreshold, BoUpSLP &R) {
  bool Changed = false;

  // Do a quadratic search on all of the given stores and find all of the
  // pairs of stores that follow each other.
  for (unsigned i = 0, e = Stores.size(); i < e; ++i) {
    for (unsigned j = 0; j < e; ++j) {
      if (i == j)
        continue;
      const DataLayout &DL = Stores[i]->getModule()->getDataLayout();
      if (R.isConsecutiveAccess(Stores[i], Stores[j], DL)) {
        Tails.insert(Stores[j]);
        Heads.insert(Stores[i]);
        ConsecutiveChain[Stores[i]] = Stores[j];
      }
    }
  }

  // For stores that start but don't end a link in the chain:
  for (SetVector<StoreInst *>::iterator it = Heads.begin(), e = Heads.end();
       it != e; ++it) {
    if (Tails.count(*it))
      continue;

    // We found a store that starts a chain. Now follow the chain and try to
    // vectorize it.
    BoUpSLP::ValueList Operands;
    StoreInst *I = *it;
    while (Tails.count(I) || Heads.count(I)) {
      if (VectorizedStores.count(I))
        break;
      Operands.push_back(I);
      I = ConsecutiveChain[I];
    }

    for (unsigned Size = MaxVecRegSize; Size >= MinVecRegSize; Size /= 2) {
      if (vectorizeStoreChain(Operands, costThreshold, R, Size)) {
        Changed = true;
        break;
      }
    }
  }

  return Changed;
}

unsigned SLPVectorizer::collectStores(BasicBlock *BB, BoUpSLP &R) {

    // Check that the pointer points to scalars.
    if (!isValidElementType(Ty))
      continue;

    // Save the store locations.
    StoreRefs[Ptr].push_back(SI);
bool SLPVectorizer::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) {
  if (!A || !B)
    return false;
  Value *VL[] = { A, B };
  return tryToVectorizeList(VL, R, None, true);
}

bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
                                       ArrayRef<Value *> BuildVector,
                                       bool allowReorder) {
  if (VL.size() < 2)
    return false;

  DEBUG(dbgs() << "SLP: Vectorizing a list of length = " << VL.size() << ".\n");

  // Check that all of the parts are scalar instructions of the same type.
  Instruction *I0 = dyn_cast<Instruction>(VL[0]);
  if (!I0)
    return false;

  unsigned Opcode0 = I0->getOpcode();
  const DataLayout &DL = I0->getModule()->getDataLayout();

  Type *Ty0 = I0->getType();
  unsigned Sz = DL.getTypeSizeInBits(Ty0);
  unsigned VF = MinVecRegSize / Sz;

  for (Value *V : VL) {
    Type *Ty = V->getType();
    if (!isValidElementType(Ty))
      return false;
    Instruction *Inst = dyn_cast<Instruction>(V);
    if (!Inst || Inst->getOpcode() != Opcode0)
      return false;
  }

  bool Changed = false;

  // Keep track of values that were deleted by vectorizing in the loop below.
  SmallVector<WeakVH, 8> TrackValues(VL.begin(), VL.end());

  for (unsigned i = 0, e = VL.size(); i < e; ++i) {
    unsigned OpsWidth = 0;

    if (i + VF > e)
      OpsWidth = e - i;
    else
      OpsWidth = VF;

    if (!isPowerOf2_32(OpsWidth) || OpsWidth < 2)
      break;

    // Check that a previous iteration of this loop did not delete the Value.
    if (hasValueBeenRAUWed(VL, TrackValues, i, OpsWidth))
      continue;

    DEBUG(dbgs() << "SLP: Analyzing " << OpsWidth << " operations "
                 << "\n");
    ArrayRef<Value *> Ops = VL.slice(i, OpsWidth);

    ArrayRef<Value *> BuildVectorSlice;
    if (!BuildVector.empty())
      BuildVectorSlice = BuildVector.slice(i, OpsWidth);

    R.buildTree(Ops, BuildVectorSlice);
    // TODO: check if we can allow reordering for more cases.
    if (allowReorder && R.shouldReorder()) {
      // Conceptually, there is nothing actually preventing us from trying to
      // reorder a larger list. In fact, we do exactly this when vectorizing
      // reductions. However, at this point, we only expect to get here from
      // tryToVectorizePair().
      assert(Ops.size() == 2);
      assert(BuildVectorSlice.empty());
      Value *ReorderedOps[] = { Ops[1], Ops[0] };
      R.buildTree(ReorderedOps, None);
    }
    int Cost = R.getTreeCost();

    if (Cost < -SLPCostThreshold) {
      DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
      Value *VectorizedRoot = R.vectorizeTree();

      // Reconstruct the build vector by extracting the vectorized root. This
      // way we handle the case where some elements of the vector are
      // undefined.
      if (!BuildVectorSlice.empty()) {
        Instruction *InsertAfter = cast<Instruction>(BuildVectorSlice.back());
        unsigned VecIdx = 0;
        for (auto &V : BuildVectorSlice) {
          Instruction *Extract = cast<Instruction>(Builder.CreateExtractElement(
              VectorizedRoot, Builder.getInt32(VecIdx++)));
        }
      }
bool SLPVectorizer::tryToVectorize(BinaryOperator *V, BoUpSLP &R) {
  if (!V)
    return false;

  // Try to vectorize V.
  if (tryToVectorizePair(V->getOperand(0), V->getOperand(1), R))
    return true;

  BinaryOperator *A = dyn_cast<BinaryOperator>(V->getOperand(0));
  BinaryOperator *B = dyn_cast<BinaryOperator>(V->getOperand(1));
  // Try to skip B.
  if (B && B->hasOneUse()) {
    BinaryOperator *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
    BinaryOperator *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
    if (tryToVectorizePair(A, B0, R)) {
      return true;
    }
    if (tryToVectorizePair(A, B1, R)) {
      return true;
    }
  }

  // Try to skip A.
  if (A && A->hasOneUse()) {
    BinaryOperator *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
    BinaryOperator *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
    if (tryToVectorizePair(A0, B, R)) {
      return true;
    }
    if (tryToVectorizePair(A1, B, R)) {
      return true;
    }
  }
  return false;
}
static Value *createRdxShuffleMask(unsigned VecLen, unsigned NumEltsToRdx,
                                   bool IsPairwise, bool IsLeft,
                                   IRBuilder<> &Builder) {
  assert((IsPairwise || !IsLeft) && "Don't support a <0,1,undef,...> mask");

  SmallVector<Constant *, 32> ShuffleMask(
      VecLen, UndefValue::get(Builder.getInt32Ty()));

  if (IsPairwise)
    // Build a mask of 0, 2, ... (left) or 1, 3, ... (right).
    for (unsigned i = 0; i != NumEltsToRdx; ++i)
      ShuffleMask[i] = Builder.getInt32(2 * i + !IsLeft);
  else
    // Move the upper half of the vector to the lower half.
    for (unsigned i = 0; i != NumEltsToRdx; ++i)
      ShuffleMask[i] = Builder.getInt32(NumEltsToRdx + i);

  return ConstantVector::get(ShuffleMask);
}
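// Concrete masks for VecLen = 4, NumEltsToRdx = 2 (my worked example):
//   pairwise left:  <0, 2, undef, undef>
//   pairwise right: <1, 3, undef, undef>
//   splitting:      <2, 3, undef, undef>  (upper half moved down)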
class HorizontalReduction {

  unsigned ReductionOpcode;

  unsigned ReducedValueOpcode;

  unsigned ReduxWidth;

  bool IsPairwiseReduction;

public:
  HorizontalReduction()
      : ReductionRoot(nullptr), ReductionPHI(nullptr), ReductionOpcode(0),
        ReducedValueOpcode(0), ReduxWidth(0), IsPairwiseReduction(false) {}

           "This phi needs to use the binary operator");

    if (!isValidElementType(Ty))
      return false;

    ReducedValueOpcode = 0;

    // We currently only support adds.
    if (ReductionOpcode != Instruction::Add &&
        ReductionOpcode != Instruction::FAdd)
      return false;

    // Post order traverse the reduction tree starting at B. We only handle
    // true trees containing only binary operators.
    SmallVector<std::pair<Instruction *, unsigned>, 32> Stack;
    Stack.push_back(std::make_pair(B, 0));
    while (!Stack.empty()) {
      Instruction *TreeN = Stack.back().first;
      unsigned EdgeToVist = Stack.back().second++;
      bool IsReducedValue = TreeN->getOpcode() != ReductionOpcode;

      // Postorder visit.
      if (EdgeToVist == 2 || IsReducedValue) {
        if (IsReducedValue) {
          // Make sure that the opcodes of the operations that we are going to
          // reduce match.
          if (!ReducedValueOpcode)
            ReducedValueOpcode = TreeN->getOpcode();
          else if (ReducedValueOpcode != TreeN->getOpcode())
            return false;
          ReducedVals.push_back(TreeN);
        } else {
          ReductionOps.push_back(TreeN);
        }
        Stack.pop_back();
        continue;
      }

      // Visit left or right.
      Value *NextV = TreeN->getOperand(EdgeToVist);
      Instruction *Next = dyn_cast<Instruction>(NextV);
      if (Next)
        Stack.push_back(std::make_pair(Next, 0));
      else if (NextV != Phi)
        return false;
    }
    return true;
  }

  bool tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) {
    if (ReducedVals.empty())
      return false;

    unsigned NumReducedVals = ReducedVals.size();
    if (NumReducedVals < ReduxWidth)
      return false;

    Value *VectorizedTree = nullptr;
    IRBuilder<> Builder(ReductionRoot);
    unsigned i = 0;

    for (; i < NumReducedVals - ReduxWidth + 1; i += ReduxWidth) {
      V.buildTree(makeArrayRef(&ReducedVals[i], ReduxWidth), ReductionOps);

      // Estimate cost.
      int Cost = V.getTreeCost() + getReductionCost(TTI, ReducedVals[i]);
      if (Cost >= -SLPCostThreshold)
        break;

      DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:" << Cost
                   << ". (HorRdx)\n");

      // Vectorize a tree.
      DebugLoc Loc = cast<Instruction>(ReducedVals[i])->getDebugLoc();
      Value *VectorizedRoot = V.vectorizeTree();

      // Emit a reduction.
      Value *ReducedSubTree = emitReduction(VectorizedRoot, Builder);
      if (VectorizedTree) {
        Builder.SetCurrentDebugLocation(Loc);
        VectorizedTree = createBinOp(Builder, ReductionOpcode, VectorizedTree,
                                     ReducedSubTree, "bin.rdx");
      } else
        VectorizedTree = ReducedSubTree;
    }

    if (VectorizedTree) {
      // Finish the reduction.
      for (; i < NumReducedVals; ++i) {
        Builder.SetCurrentDebugLocation(
          cast<Instruction>(ReducedVals[i])->getDebugLoc());
        VectorizedTree = createBinOp(Builder, ReductionOpcode, VectorizedTree,
                                     ReducedVals[i]);
      }
      // Update users.
      if (ReductionPHI) {
        assert(ReductionRoot && "Need a reduction operation");
        ReductionRoot->setOperand(0, VectorizedTree);
        ReductionRoot->setOperand(1, ReductionPHI);
      } else
        ReductionRoot->replaceAllUsesWith(VectorizedTree);
    }
    return VectorizedTree != nullptr;
  }

private:
  int getReductionCost(TargetTransformInfo *TTI, Value *FirstReducedVal) {
    Type *ScalarTy = FirstReducedVal->getType();
    Type *VecTy = VectorType::get(ScalarTy, ReduxWidth);

    int PairwiseRdxCost = TTI->getReductionCost(ReductionOpcode, VecTy, true);
    int SplittingRdxCost = TTI->getReductionCost(ReductionOpcode, VecTy,
                                                 false);

    IsPairwiseReduction = PairwiseRdxCost < SplittingRdxCost;
    int VecReduxCost = IsPairwiseReduction ? PairwiseRdxCost : SplittingRdxCost;

    int ScalarReduxCost =
        ReduxWidth * TTI->getArithmeticInstrCost(ReductionOpcode, ScalarTy);

    DEBUG(dbgs() << "SLP: Adding cost " << VecReduxCost - ScalarReduxCost
                 << " for reduction that starts with " << *FirstReducedVal
                 << " (It is a "
                 << (IsPairwiseReduction ? "pairwise" : "splitting")
                 << " reduction)\n");

    return VecReduxCost - ScalarReduxCost;
  }
    if (Opcode == Instruction::FAdd)
      return Builder.CreateFAdd(L, R, Name);
    return Builder.CreateBinOp((Instruction::BinaryOps)Opcode, L, R, Name);
  }

  Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder) {
    assert(VectorizedValue && "Need to have a vectorized tree node");
    assert(isPowerOf2_32(ReduxWidth) &&
           "We only handle power-of-two reductions for now");

    Value *TmpVec = VectorizedValue;
    for (unsigned i = ReduxWidth / 2; i != 0; i >>= 1) {
      if (IsPairwiseReduction) {
        Value *LeftMask =
            createRdxShuffleMask(ReduxWidth, i, true, true, Builder);
        Value *RightMask =
            createRdxShuffleMask(ReduxWidth, i, true, false, Builder);

        Value *LeftShuf = Builder.CreateShuffleVector(
            TmpVec, UndefValue::get(TmpVec->getType()), LeftMask, "rdx.shuf.l");
        Value *RightShuf = Builder.CreateShuffleVector(
            TmpVec, UndefValue::get(TmpVec->getType()), RightMask,
            "rdx.shuf.r");
        TmpVec = createBinOp(Builder, ReductionOpcode, LeftShuf, RightShuf,
                             "bin.rdx");
      } else {
        Value *UpperHalf =
            createRdxShuffleMask(ReduxWidth, i, false, false, Builder);
        Value *Shuf = Builder.CreateShuffleVector(
            TmpVec, UndefValue::get(TmpVec->getType()), UpperHalf, "rdx.shuf");
        TmpVec = createBinOp(Builder, ReductionOpcode, TmpVec, Shuf, "bin.rdx");
      }
    }
  if (!isa<UndefValue>(FirstInsertElem->getOperand(0)))
    return false;
bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
  bool Changed = false;
  SmallVector<Value *, 4> Incoming;
  SmallSet<Value *, 16> VisitedInstrs;

  bool HaveVectorizedPhiNodes = true;
  while (HaveVectorizedPhiNodes) {
    HaveVectorizedPhiNodes = false;

    // Collect the PHIs that we have not visited yet.
    Incoming.clear();
    for (BasicBlock::iterator instr = BB->begin(), ie = BB->end();
         instr != ie; ++instr) {
      PHINode *P = dyn_cast<PHINode>(instr);
      if (!P)
        break;
      if (!VisitedInstrs.count(P))
        Incoming.push_back(P);
    }

    // Sort by type so that values of the same type become adjacent.
    std::stable_sort(Incoming.begin(), Incoming.end(), PhiTypeSorterFunc);

    // Try to vectorize each run of same-typed values.
    for (SmallVector<Value *, 4>::iterator IncIt = Incoming.begin(),
                                           E = Incoming.end();
         IncIt != E;) {
      // Extend the run of values with the same type.
      SmallVector<Value *, 4>::iterator SameTypeIt = IncIt;
      while (SameTypeIt != E &&
             (*SameTypeIt)->getType() == (*IncIt)->getType()) {
        VisitedInstrs.insert(*SameTypeIt);
        ++SameTypeIt;
      }

      unsigned NumElts = (SameTypeIt - IncIt);
      DEBUG(errs() << "SLP: Trying to vectorize starting at PHIs ("
                   << NumElts << ")\n");
      if (NumElts > 1 && tryToVectorizeList(makeArrayRef(IncIt, NumElts), R)) {
        // Success: start over, since instructions might have changed.
        HaveVectorizedPhiNodes = true;
        Changed = true;
        break;
      }
      // Resume at the start of the next run (or the end).
      IncIt = SameTypeIt;
    }
  }

  VisitedInstrs.clear();

  for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
    // We may pass over BB several times; skip instructions already checked.
    if (!VisitedInstrs.insert(it).second)
      continue;
    if (isa<DbgInfoIntrinsic>(it))
      continue;

    // Try to vectorize reductions that use PHINodes. BI is the in-loop
    // binary operator feeding P, and Inst is BI's non-PHI operand (both
    // located by code elided here).
    if (PHINode *P = dyn_cast<PHINode>(it)) {
      HorizontalReduction HorRdx;
      if (ShouldVectorizeHor && HorRdx.matchAssociativeReduction(P, BI) &&
          HorRdx.tryToReduce(R, TTI)) {
        Changed = true;
        it = BB->begin();
        e = BB->end();
        continue;
      }
      if (tryToVectorize(dyn_cast<BinaryOperator>(Inst), R)) {
        // Start over; vectorization may have invalidated the iterator.
        Changed = true;
        it = BB->begin();
        e = BB->end();
      }
      continue;
    }

    // Try to vectorize horizontal reductions feeding into a store.
    if (ShouldStartVectorizeHorAtStore)
      if (StoreInst *SI = dyn_cast<StoreInst>(it))
        if (BinaryOperator *BinOp =
                dyn_cast<BinaryOperator>(SI->getValueOperand())) {
          HorizontalReduction HorRdx;
          if (((HorRdx.matchAssociativeReduction(nullptr, BinOp) &&
                HorRdx.tryToReduce(R, TTI)) ||
               tryToVectorize(BinOp, R))) {
            Changed = true;
            it = BB->begin();
            e = BB->end();
          }
        }

    // Try to vectorize a binary operation feeding a return.
    if (ReturnInst *RI = dyn_cast<ReturnInst>(it))
      if (RI->getNumOperands() != 0)
        if (BinaryOperator *BinOp =
                dyn_cast<BinaryOperator>(RI->getOperand(0))) {
          DEBUG(dbgs() << "SLP: Found a return to vectorize.\n");
          if (tryToVectorizePair(BinOp->getOperand(0), BinOp->getOperand(1),
                                 R)) {
            Changed = true;
            it = BB->begin();
            e = BB->end();
          }
        }

    // Try to vectorize trees rooted at compare instructions: each of the
    // compare's two operands is itself a candidate pair.
    if (CmpInst *CI = dyn_cast<CmpInst>(it)) {
      for (int i = 0; i < 2; ++i)
        if (BinaryOperator *BI = dyn_cast<BinaryOperator>(CI->getOperand(i)))
          if (tryToVectorizePair(BI->getOperand(0), BI->getOperand(1), R)) {
            Changed = true;
            it = BB->begin();
            e = BB->end();
            break;
          }
      continue;
    }

    // Try to vectorize build-vector sequences of insertelements.
    if (InsertElementInst *FirstInsertElem = dyn_cast<InsertElementInst>(it)) {
      SmallVector<Value *, 16> BuildVector;
      SmallVector<Value *, 16> BuildVectorOpds;
      if (!findBuildVector(FirstInsertElem, BuildVector, BuildVectorOpds))
        continue;
      if (tryToVectorizeList(BuildVectorOpds, R, BuildVector)) {
        Changed = true;
        it = BB->begin();
        e = BB->end();
      }
      continue;
    }
  }

  return Changed;
}
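// Editor's sketch (hypothetical, std::string stands in for Type*): the
// stable_sort-then-scan idiom used on the PHIs above, which groups equal keys
// into adjacent runs and hands every run longer than one to a callback.
#include <algorithm>
#include <string>
#include <vector>

template <typename OnRunFn>
void sketchForEachTypeRun(std::vector<std::string> Keys, OnRunFn OnRun) {
  std::stable_sort(Keys.begin(), Keys.end());
  for (auto It = Keys.begin(); It != Keys.end();) {
    auto RunEnd = It;
    while (RunEnd != Keys.end() && *RunEnd == *It)
      ++RunEnd; // extend the run of equal keys
    if (RunEnd - It > 1)
      OnRun(It, RunEnd); // analogous to tryToVectorizeList on same-typed PHIs
    It = RunEnd;
  }
}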
bool SLPVectorizer::vectorizeStoreChains(BoUpSLP &R) {
  bool Changed = false;

  // Attempt to sort and vectorize each of the store chains.
  for (StoreListMap::iterator it = StoreRefs.begin(), e = StoreRefs.end();
       it != e; ++it) {
    if (it->second.size() < 2)
      continue;

    DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
                 << it->second.size() << ".\n");

    // Process the stores in chunks of 16.
    for (unsigned CI = 0, CE = it->second.size(); CI < CE; CI += 16) {
      unsigned Len = std::min<unsigned>(CE - CI, 16);
      Changed |= vectorizeStores(makeArrayRef(&it->second[CI], Len),
                                 -SLPCostThreshold, R);
    }
  }
  return Changed;
}
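// Editor's sketch (hypothetical, a plain vector stands in for the StoreList):
// the chunking loop of vectorizeStoreChains above, visiting a chain in
// windows of at most 16 elements.
#include <algorithm>
#include <cstddef>
#include <vector>

template <typename T, typename VisitFn>
void sketchForEachChunkOf16(const std::vector<T> &Chain, VisitFn Visit) {
  for (std::size_t CI = 0, CE = Chain.size(); CI < CE; CI += 16) {
    std::size_t Len = std::min<std::size_t>(CE - CI, 16);
    Visit(&Chain[CI], Len); // like vectorizeStores(makeArrayRef(...), ...)
  }
}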