using namespace llvm;
using namespace llvm::PatternMatch;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
    cl::desc("Enable if-conversion during vectorization."));

    cl::desc("Don't vectorize loops with a constant "
             "trip count that is smaller than this "
             "value."));

    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in loop."));

    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

    cl::desc("Maximum factor for an interleaved access group (default = 8)"),
    cl::init(8));

    cl::desc("A flag that overrides the target's number of scalar registers."));

    cl::desc("A flag that overrides the target's number of vector registers."));

    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

        "The cost of a loop that is considered 'small' by the interleaver."));

    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

        "Enable runtime interleaving until load/store ports are saturated"));

    cl::desc("Max number of stores to be predicated behind an if."));

    cl::desc("Count the induction variable only once when interleaving"));

    cl::desc("Enable if predication of stores during vectorization."));

    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

    cl::desc("The maximum allowed number of runtime memory checks with a "
             "vectorize(enable) pragma."));

    cl::desc("The maximum number of SCEV checks allowed."));

    cl::desc("The maximum number of SCEV checks allowed with a "
             "vectorize(enable) pragma"));
    CodeRegion = I->getParent();

    if (I->getDebugLoc())
      DL = I->getDebugLoc();

  R << "loop not vectorized: ";
class LoopVectorizeHints;
class LoopVectorizationLegality;
class LoopVectorizationCostModel;
class LoopVectorizationRequirements;

static bool hasCyclesInLoopBody(const Loop &L) {

  for (const auto &SCC :

    if (SCC.size() > 1) {
      DEBUG(dbgs() << "LVL: Detected a cycle in the loop body:\n");
  if (isa<GetElementPtrInst>(Ptr))
    return cast<GetElementPtrInst>(Ptr);

  if (isa<BitCastInst>(Ptr) &&
      isa<GetElementPtrInst>(cast<BitCastInst>(Ptr)->getOperand(0))) {

    Type *GEPTy = cast<BitCastInst>(Ptr)->getSrcTy();
    if (!isa<PointerType>(BitcastTy) || !isa<PointerType>(GEPTy))

    Type *Pointee1Ty = cast<PointerType>(BitcastTy)->getPointerElementType();
    Type *Pointee2Ty = cast<PointerType>(GEPTy)->getPointerElementType();
    const DataLayout &DL = cast<BitCastInst>(Ptr)->getModule()->getDataLayout();

    return cast<GetElementPtrInst>(cast<BitCastInst>(Ptr)->getOperand(0));

  if (auto *LI = dyn_cast<LoadInst>(I))
    return LI->getPointerOperand();
  if (auto *SI = dyn_cast<StoreInst>(I))
    return SI->getPointerOperand();
static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) {

static unsigned getReciprocalPredBlockProb() { return 2; }
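// Illustrative note on the constant above: the cost model assumes a
// predicated block executes on roughly half of the iterations, so returning
// a reciprocal probability of 2 halves that block's cost. For example, a
// predicated store with scalar cost 8 is accounted as 8 / 2 = 4.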
class InnerLoopVectorizer {

                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()), Induction(nullptr),
        OldInduction(nullptr), VectorLoopValueMap(UnrollFactor, VecWidth),
        TripCount(nullptr), VectorTripCount(nullptr), Legal(LVL), Cost(CM),
        AddedSafetyChecks(false) {}

  bool areSafetyChecksAdded() { return AddedSafetyChecks; }

  virtual ~InnerLoopVectorizer() {}
  void createEmptyLoop();

  virtual void vectorizeLoop();

  void fixFirstOrderRecurrence(PHINode *Phi);

  void predicateInstructions();

  void collectTriviallyDeadInstructions();

  void truncateToMinimalBitwidths();

  VectorParts createBlockInMask(BasicBlock *BB);

  void vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV);

  void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF,
                           PhiVector *PV);

  void updateAnalysis();

  virtual void scalarizeInstruction(Instruction *Instr,
                                    bool IfPredicateInstr = false);

  virtual void vectorizeMemoryInstruction(Instruction *Instr);

                                  Instruction::BinaryOpsEnd);

  bool shouldScalarizeInstruction(Instruction *I) const;
  const VectorParts &getVectorValue(Value *V);

  Value *getScalarValue(Value *V, unsigned Part, unsigned Lane);

  Value *getOrCreateTripCount(Loop *NewLoop);

  Value *getOrCreateVectorTripCount(Loop *NewLoop);
    ValueMap(unsigned UnrollFactor, unsigned VecWidth)
        : UF(UnrollFactor), VF(VecWidth) {

    bool hasVector(Value *Key) const { return VectorMapStorage.count(Key); }

    bool hasScalar(Value *Key) const { return ScalarMapStorage.count(Key); }

    const VectorParts &initVector(Value *Key, const VectorParts &Entry) {
      assert(!hasVector(Key) && "Vector entry already initialized");
      assert(Entry.size() == UF && "VectorParts has wrong dimensions");
      VectorMapStorage[Key] = Entry;
      return VectorMapStorage[Key];

    const ScalarParts &initScalar(Value *Key, const ScalarParts &Entry) {
      assert(!hasScalar(Key) && "Scalar entry already initialized");
      assert(Entry.size() == UF &&
             all_of(Entry,
                    [&](const SmallVectorImpl<Value *> &Values) -> bool {
                      return Values.size() == VF;
                    }) &&
             "ScalarParts has wrong dimensions");
      ScalarMapStorage[Key] = Entry;
      return ScalarMapStorage[Key];
    VectorParts &getVector(Value *Key) {
      assert(hasVector(Key) && "Vector entry not initialized");
      return VectorMapStorage.find(Key)->second;

    friend const VectorParts &InnerLoopVectorizer::getVectorValue(Value *V);
    friend Value *InnerLoopVectorizer::getScalarValue(Value *V, unsigned Part,
                                                      unsigned Lane);

    std::map<Value *, VectorParts> VectorMapStorage;
    std::map<Value *, ScalarParts> ScalarMapStorage;
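    // Usage sketch (illustrative, assuming UF = 2 and VF = 4): vector values
    // are kept as one Value* per unroll part, scalar values as one Value*
    // per (part, lane) pair.
    //
    //   VectorParts Entry(UF);        // Entry[Part] is a <4 x Ty> value.
    //   Map.initVector(Key, Entry);   // Asserts Key was not already present.
    //   if (Map.hasVector(Key))
    //     VectorParts &V = Map.getVector(Key);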
  std::unique_ptr<LoopVersioning> LVer;

  EdgeMaskCache MaskCache;

  Value *VectorTripCount;

  LoopVectorizationLegality *Legal;

  LoopVectorizationCostModel *Cost;

  bool AddedSafetyChecks;

class InnerLoopUnroller : public InnerLoopVectorizer {

                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1,
                            UnrollFactor, LVL, CM) {}

                            bool IfPredicateInstr = false) override;
  void vectorizeMemoryInstruction(Instruction *Instr) override;

                       Instruction::BinaryOpsEnd) override;
    if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
      if (OpInst->getDebugLoc() != Empty)

  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr))

static std::string getDebugLocString(const Loop *L) {

    LoopDbgLoc.print(OS);

      OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();

void InnerLoopVectorizer::addNewMetadata(Instruction *To,

  if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
    LVer->annotateInstWithNoAlias(To, Orig);

void InnerLoopVectorizer::addMetadata(Instruction *To,

  addNewMetadata(To, From);

  for (Value *V : To) {

      addMetadata(I, From);
class InterleaveGroup {

  InterleaveGroup(Instruction *Instr, int Stride, unsigned Align)
      : Align(Align), SmallestKey(0), LargestKey(0), InsertPos(Instr) {
    assert(Align && "The alignment should be non-zero");

    assert(Factor > 1 && "Invalid interleave factor");

    Reverse = Stride < 0;

  bool isReverse() const { return Reverse; }
  unsigned getFactor() const { return Factor; }

  unsigned getNumMembers() const { return Members.size(); }

  bool insertMember(Instruction *Instr, int Index, unsigned NewAlign) {
    assert(NewAlign && "The new member's alignment should be non-zero");

    int Key = Index + SmallestKey;

    if (Members.count(Key))

    if (Key > LargestKey) {

      if (Index >= static_cast<int>(Factor))

    } else if (Key < SmallestKey) {

      if (LargestKey - Key >= static_cast<int>(Factor))

    Members[Key] = Instr;

    int Key = SmallestKey + Index;
    if (!Members.count(Key))

    return Members.find(Key)->second;

    for (auto I : Members)
      if (I.second == Instr)
        return I.first - SmallestKey;

  Instruction *getInsertPos() const { return InsertPos; }
  void setInsertPos(Instruction *Inst) { InsertPos = Inst; }
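  // Illustrative example (a sketch, not taken verbatim from this file): in
  //
  //   for (i = 0; i < N; i += 2) {
  //     a = A[i];     // member at index 0
  //     b = A[i + 1]; // member at index 1
  //   }
  //
  // the two loads form one interleave group with Factor == 2, and the keys
  // in Members record each member's offset from the first-inserted member.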
class InterleavedAccessInfo {

      : PSE(PSE), TheLoop(L), DT(DT), LI(LI), LAI(nullptr),
        RequiresScalarEpilogue(false) {}

  ~InterleavedAccessInfo() {

    for (auto &I : InterleaveGroupMap)

    for (auto *Ptr : DelSet)

    return InterleaveGroupMap.count(Instr);

  unsigned getMaxInterleaveFactor() const {
    unsigned MaxFactor = 1;
    for (auto &Entry : InterleaveGroupMap)
      MaxFactor = std::max(MaxFactor, Entry.second->getFactor());

  InterleaveGroup *getInterleaveGroup(Instruction *Instr) const {
    if (InterleaveGroupMap.count(Instr))
      return InterleaveGroupMap.find(Instr)->second;

  bool requiresScalarEpilogue() const { return RequiresScalarEpilogue; }

  bool RequiresScalarEpilogue;

  struct StrideDescriptor {
    StrideDescriptor(int64_t Stride, const SCEV *Scev, uint64_t Size,
        : Stride(Stride), Scev(Scev), Size(Size), Align(Align) {}

    StrideDescriptor() = default;

    const SCEV *Scev = nullptr;

  typedef std::pair<Instruction *, StrideDescriptor> StrideEntry;

  InterleaveGroup *createInterleaveGroup(Instruction *Instr, int Stride,
    assert(!InterleaveGroupMap.count(Instr) &&
           "Already in an interleaved access group");
    InterleaveGroupMap[Instr] = new InterleaveGroup(Instr, Stride, Align);
    return InterleaveGroupMap[Instr];

  void releaseGroup(InterleaveGroup *Group) {
    for (unsigned i = 0; i < Group->getFactor(); i++)

      InterleaveGroupMap.erase(Member);
  void collectConstStrideAccesses(

  static bool isStrided(int Stride) {
    unsigned Factor = std::abs(Stride);

  bool areDependencesValid() const {
    return LAI && LAI->getDepChecker().getDependences();

  bool canReorderMemAccessesForInterleavedGroups(StrideEntry *A,
                                                 StrideEntry *B) const {

    auto *Src = A->first;
    auto SrcDes = A->second;

    auto *Sink = B->first;
    auto SinkDes = B->second;

    if (!Src->mayWriteToMemory())

    if (!isStrided(SrcDes.Stride) && !isStrided(SinkDes.Stride))

    if (!areDependencesValid())

    return !Dependences.count(Src) || !Dependences.lookup(Src).count(Sink);

  void collectDependences() {
    if (!areDependencesValid())

    auto *Deps = LAI->getDepChecker().getDependences();
    for (auto Dep : *Deps)
      Dependences[Dep.getSource(*LAI)].insert(Dep.getDestination(*LAI));
class LoopVectorizeHints {
  enum HintKind { HK_WIDTH, HK_UNROLL, HK_FORCE };

        : Name(Name), Value(Value), Kind(Kind) {}

    bool validate(unsigned Val) {

  bool PotentiallyUnsafe;

  LoopVectorizeHints(const Loop *L, bool DisableInterleaving,
        Interleave("interleave.count", DisableInterleaving, HK_UNROLL),
        Force("vectorize.enable", FK_Undefined, HK_FORCE),
        PotentiallyUnsafe(false), TheLoop(L), ORE(ORE) {

    getHintsFromMetadata();

    DEBUG(if (DisableInterleaving && Interleave.Value == 1) dbgs()
          << "LV: Interleaving disabled by the pass manager\n");

  void setAlreadyVectorized() {
    Width.Value = Interleave.Value = 1;
    Hint Hints[] = {Width, Interleave};
    writeHintsToMetadata(Hints);
  bool allowVectorization(Function *F, Loop *L, bool AlwaysVectorize) const {
    if (getForce() == LoopVectorizeHints::FK_Disabled) {
      DEBUG(dbgs() << "LV: Not vectorizing: #pragma vectorize disable.\n");
      emitRemarkWithHints();

    if (!AlwaysVectorize && getForce() != LoopVectorizeHints::FK_Enabled) {
      DEBUG(dbgs() << "LV: Not vectorizing: No #pragma vectorize enable.\n");
      emitRemarkWithHints();

    if (getWidth() == 1 && getInterleave() == 1) {

      DEBUG(dbgs() << "LV: Not vectorizing: Disabled/already vectorized.\n");

              << "loop not vectorized: vectorization and interleaving are "
                 "explicitly disabled, or vectorize width and interleave "
                 "count are both set to 1");

  void emitRemarkWithHints() const {
    using namespace ore;
    if (Force.Value == LoopVectorizeHints::FK_Disabled)

                                          TheLoop->getStartLoc(),
                                          TheLoop->getHeader())
                << "loop not vectorized: vectorization is explicitly disabled");

                                   TheLoop->getStartLoc(), TheLoop->getHeader());
      R << "loop not vectorized";
      if (Force.Value == LoopVectorizeHints::FK_Enabled) {
        R << " (Force=" << NV("Force", true);
        if (Width.Value != 0)
          R << ", Vector Width=" << NV("VectorWidth", Width.Value);
        if (Interleave.Value != 0)
          R << ", Interleave Count=" << NV("InterleaveCount", Interleave.Value);
  unsigned getWidth() const { return Width.Value; }
  unsigned getInterleave() const { return Interleave.Value; }
  enum ForceKind getForce() const { return (ForceKind)Force.Value; }

  const char *vectorizeAnalysisPassName() const {

    if (getForce() == LoopVectorizeHints::FK_Disabled)

    if (getForce() == LoopVectorizeHints::FK_Undefined && getWidth() == 0)

  bool allowReordering() const {

    return getForce() == LoopVectorizeHints::FK_Enabled || getWidth() > 1;

  bool isPotentiallyUnsafe() const {

    return getForce() != LoopVectorizeHints::FK_Enabled && PotentiallyUnsafe;

  void setPotentiallyUnsafe() { PotentiallyUnsafe = true; }
  void getHintsFromMetadata() {
    MDNode *LoopID = TheLoop->getLoopID();

    if (!MD || MD->getNumOperands() == 0)

    for (unsigned i = 1, ie = MD->getNumOperands(); i < ie; ++i)

      assert(Args.size() == 0 && "too many arguments for MDString");

    if (Args.size() == 1)
      setHint(Name, Args[0]);

    const ConstantInt *C = mdconst::dyn_extract<ConstantInt>(Arg);

    Hint *Hints[] = {&Width, &Interleave, &Force};
    for (auto H : Hints) {
      if (Name == H->Name) {
        if (H->validate(Val))

        DEBUG(dbgs() << "LV: ignoring invalid hint '" << Name << "'\n");

    for (auto H : HintTypes)

    if (HintTypes.size() == 0)

    MDNode *LoopID = TheLoop->getLoopID();

      if (!matchesHintMetadataName(Node, HintTypes))
        MDs.push_back(Node);

    for (auto H : HintTypes)

    LLVMContext &Context = TheLoop->getHeader()->getContext();

    TheLoop->setLoopID(NewLoopID);

  const Loop *TheLoop;
static void emitAnalysisDiag(const Loop *TheLoop,
                             const LoopVectorizeHints &Hints,

  const char *Name = Hints.vectorizeAnalysisPassName();

                               const LoopVectorizeHints &LH,

  LH.emitRemarkWithHints();

  if (LH.getForce() == LoopVectorizeHints::FK_Enabled) {
    if (LH.getWidth() != 1)
                      "failed explicitly specified loop vectorization");
    else if (LH.getInterleave() != 1)
                      "failed explicitly specified loop interleaving");
class LoopVectorizationLegality {

  LoopVectorizationLegality(

      LoopVectorizeHints *H)
      : NumPredStores(0), TheLoop(L), PSE(PSE), TLI(TLI), TTI(TTI), DT(DT),
        GetLAA(GetLAA), LAI(nullptr), ORE(ORE), InterleaveInfo(PSE, L, DT, LI),
        Induction(nullptr), WidestIndTy(nullptr), HasFunNoNaNAttr(false),
        Requirements(R), Hints(H) {}

  bool canVectorize();

  PHINode *getInduction() { return Induction; }

  ReductionList *getReductionVars() { return &Reductions; }

  InductionList *getInductionVars() { return &Inductions; }

  RecurrenceSet *getFirstOrderRecurrences() { return &FirstOrderRecurrences; }

  Type *getWidestInductionType() { return WidestIndTy; }

  bool isInductionVariable(const Value *V);

  bool isReductionVariable(PHINode *PN) { return Reductions.count(PN); }

  bool isFirstOrderRecurrence(const PHINode *Phi);

  int isConsecutivePtr(Value *Ptr);

  bool isUniform(Value *V);

  bool isUniformAfterVectorization(Instruction *I) { return Uniforms.count(I); }

  bool isScalarAfterVectorization(Instruction *I) { return Scalars.count(I); }

    return LAI->getRuntimePointerChecking();

    return InterleaveInfo.isInterleaved(Instr);

  unsigned getMaxInterleaveFactor() const {
    return InterleaveInfo.getMaxInterleaveFactor();

  const InterleaveGroup *getInterleavedAccessGroup(Instruction *Instr) {
    return InterleaveInfo.getInterleaveGroup(Instr);

  bool requiresScalarEpilogue() const {
    return InterleaveInfo.requiresScalarEpilogue();

  bool hasStride(Value *V) { return LAI->hasStride(V); }

  bool isLegalMaskedLoad(Type *DataType, Value *Ptr) {

  bool isLegalMaskedScatter(Type *DataType) {

  bool isLegalMaskedGather(Type *DataType) {

  bool isLegalGatherOrScatter(Value *V) {

    auto *Ty = cast<PointerType>(Ptr->getType())->getElementType();
    return (LI && isLegalMaskedGather(Ty)) || (SI && isLegalMaskedScatter(Ty));

  bool isMaskRequired(const Instruction *I) { return (MaskedOp.count(I) != 0); }
  unsigned getNumStores() const { return LAI->getNumStores(); }
  unsigned getNumLoads() const { return LAI->getNumLoads(); }
  unsigned getNumPredStores() const { return NumPredStores; }

  bool hasConsecutiveLikePtrOperand(Instruction *I);

  bool memoryInstructionMustBeScalarized(Instruction *I, unsigned VF = 1);

  bool canVectorizeInstrs();

  bool canVectorizeMemory();

  bool canVectorizeWithIfConvert();

  void collectLoopUniforms();

  void collectLoopScalars();

    emitAnalysisDiag(TheLoop, *Hints, *ORE, Message);

                                    RemarkName, TheLoop, I);

    return LAI ? &LAI->getSymbolicStrides() : nullptr;

  unsigned NumPredStores;

  std::function<const LoopAccessInfo &(Loop &)> *GetLAA;

  InterleavedAccessInfo InterleaveInfo;

  ReductionList Reductions;

  InductionList Inductions;

  RecurrenceSet FirstOrderRecurrences;

  bool HasFunNoNaNAttr;

  LoopVectorizationRequirements *Requirements;

  LoopVectorizeHints *Hints;
class LoopVectorizationCostModel {

                             LoopInfo *LI, LoopVectorizationLegality *Legal,

                             const LoopVectorizeHints *Hints)
      : TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB),
        AC(AC), ORE(ORE), TheFunction(F), Hints(Hints) {}

  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  unsigned selectInterleaveCount(bool OptForSize, unsigned VF,

  struct RegisterUsage {

    unsigned LoopInvariantRegs;

    unsigned MaxLocalUsers;

    unsigned NumInstructions;

  void collectValuesToIgnore();

  bool isProfitableToScalarize(Instruction *I, unsigned VF) const {
    auto Scalars = InstsToScalarize.find(VF);
    assert(Scalars != InstsToScalarize.end() &&
           "VF not yet analyzed for scalarization profitability");
    return Scalars->second.count(I);

  bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const {
    return VF > 1 && MinBWs.count(I) && !isProfitableToScalarize(I, VF) &&
           !Legal->isScalarAfterVectorization(I);

  typedef std::pair<unsigned, bool> VectorizationCostTy;

  VectorizationCostTy expectedCost(unsigned VF);

  VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF);

  unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy);

                                    RemarkName, TheLoop);

  int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,

  void collectInstsToScalarize(unsigned VF);

  LoopVectorizationLegality *Legal;

  const LoopVectorizeHints *Hints;
class LoopVectorizationRequirements {

      : NumRuntimePointerChecks(0), UnsafeAlgebraInst(nullptr), ORE(ORE) {}

    if (!UnsafeAlgebraInst)
      UnsafeAlgebraInst = I;

  void addRuntimePointerChecks(unsigned Num) { NumRuntimePointerChecks = Num; }

  bool doesNotMeet(Function *F, Loop *L, const LoopVectorizeHints &Hints) {
    const char *PassName = Hints.vectorizeAnalysisPassName();
    bool Failed = false;
    if (UnsafeAlgebraInst && !Hints.allowReordering()) {

                           UnsafeAlgebraInst->getDebugLoc(),
                           UnsafeAlgebraInst->getParent())
            << "loop not vectorized: cannot prove it is safe to reorder "
               "floating-point operations");

    bool PragmaThresholdReached =

    bool ThresholdReached =

    if ((ThresholdReached && !Hints.allowReordering()) ||
        PragmaThresholdReached) {

              << "loop not vectorized: cannot prove it is safe to reorder "
                 "memory operations");
      DEBUG(dbgs() << "LV: Too many memory checks needed.\n");

  unsigned NumRuntimePointerChecks;

  if (!hasCyclesInLoopBody(L))

  for (Loop *InnerL : L)
    addAcyclicInnerLoop(*InnerL, V);
  explicit LoopVectorize(bool NoUnrolling = false, bool AlwaysVectorize = true)

    Impl.DisableUnrolling = NoUnrolling;
    Impl.AlwaysVectorize = AlwaysVectorize;

  bool runOnFunction(Function &F) override {
    if (skipFunction(F))

    auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
    auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
    auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
    auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
    auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
    auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
    auto *TLI = TLIP ? &TLIP->getTLI() : nullptr;
    auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
    auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
    auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
    auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
    auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();

    std::function<const LoopAccessInfo &(Loop &)> GetLAA =

    return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {

  bool NewInstr = (Instr && Instr->getParent() == LoopVectorBody);
  bool Invariant = OrigLoop->isLoopInvariant(V) && !NewInstr;

    Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());

  Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");

void InnerLoopVectorizer::createVectorIntInductionPHI(

  assert(Step && "Can not widen an IV with a non-constant step");

  auto CurrIP = Builder.saveIP();
  Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
  if (isa<TruncInst>(EntryVal)) {
    auto *TruncType = cast<IntegerType>(EntryVal->getType());

    Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);

  Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
  Value *SteppedStart = getStepVector(SplatStart, 0, Step);
  Builder.restoreIP(CurrIP);

                                           VF * Step->getSExtValue()));

                                   &*LoopVectorBody->getFirstInsertionPt());

  VectorParts Entry(UF);
  for (unsigned Part = 0; Part < UF; ++Part) {
    Entry[Part] = LastInduction;
    LastInduction = cast<Instruction>(
        Builder.CreateAdd(LastInduction, SplatVF, "step.add"));

  VectorLoopValueMap.initVector(EntryVal, Entry);
  if (isa<TruncInst>(EntryVal))
    addMetadata(Entry, EntryVal);

  auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
  auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
  auto *ICmp = cast<Instruction>(Br->getCondition());
  LastInduction->moveBefore(ICmp);
  LastInduction->setName("vec.ind.next");

  VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
  VecInd->addIncoming(LastInduction, LoopVectorLatch);
bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
  return Legal->isScalarAfterVectorization(I) ||
         Cost->isProfitableToScalarize(I, VF);

bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
  if (shouldScalarizeInstruction(IV))

  auto isScalarInst = [&](User *U) -> bool {
    auto *I = cast<Instruction>(U);
    return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
void InnerLoopVectorizer::widenIntInduction(PHINode *IV, TruncInst *Trunc) {

  auto II = Legal->getInductionVars()->find(IV);
  assert(II != Legal->getInductionVars()->end() && "IV is not an induction");

  auto ID = II->second;
  assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");

  Value *ScalarIV = nullptr;

  Value *Step = nullptr;

  Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;

  auto VectorizedIV = false;

  auto NeedsScalarIV = VF > 1 && needsScalarInduction(EntryVal);

  if (ID.getConstIntStepValue())
    Step = ID.getConstIntStepValue();

  if (VF > 1 && IV->getType() == Induction->getType() && Step &&
      !shouldScalarizeInstruction(EntryVal)) {
    createVectorIntInductionPHI(ID, EntryVal);
    VectorizedIV = true;

  if (!VectorizedIV || NeedsScalarIV) {

      auto *TruncType = cast<IntegerType>(Trunc->getType());
      assert(Step && "Truncation requires constant integer step");
      auto StepInt = cast<ConstantInt>(Step)->getSExtValue();
      ScalarIV = Builder.CreateCast(Instruction::Trunc, Induction, TruncType);

      ScalarIV = Induction;
      auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
      if (IV != OldInduction) {
        ScalarIV = Builder.CreateSExtOrTrunc(ScalarIV, IV->getType());
        ScalarIV = ID.transform(Builder, ScalarIV, PSE.getSE(), DL);
        ScalarIV->setName("offset.idx");

      Step = Exp.expandCodeFor(ID.getStep(), ID.getStep()->getType(),
                               &*Builder.GetInsertPoint());

  if (!VectorizedIV) {
    Value *Broadcasted = getBroadcastInstrs(ScalarIV);
    VectorParts Entry(UF);
    for (unsigned Part = 0; Part < UF; ++Part)
      Entry[Part] = getStepVector(Broadcasted, VF * Part, Step);
    VectorLoopValueMap.initVector(EntryVal, Entry);

      addMetadata(Entry, Trunc);

    buildScalarSteps(ScalarIV, Step, EntryVal);
Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,

         "Induction Step must be an integer or FP");

    for (int i = 0; i < VLen; ++i)

    Step = Builder.CreateVectorSplat(VLen, Step);
    assert(Step->getType() == Val->getType() && "Invalid step vec");

    Step = Builder.CreateMul(Cv, Step);
    return Builder.CreateAdd(Val, Step, "induction");

  assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
         "Binary Opcode should be specified for FP induction");

  for (int i = 0; i < VLen; ++i)

  Step = Builder.CreateVectorSplat(VLen, Step);

  Value *MulOp = Builder.CreateFMul(Cv, Step);
  if (isa<Instruction>(MulOp))

    cast<Instruction>(MulOp)->setFastMathFlags(Flags);

  Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
  if (isa<Instruction>(BOp))
    cast<Instruction>(BOp)->setFastMathFlags(Flags);
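// Worked example (illustrative): for an integer IV with Val = <x, x, x, x>,
// StartIdx = 4 and Step = 1, the index vector Cv is <4, 5, 6, 7>, so the
// result is <x+4, x+5, x+6, x+7> -- the IV values of the second unroll part
// when VF = 4.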
void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,

  assert(VF > 1 && "VF should be greater than one");

         "Val and Step should have the same integer type");

      Legal->isUniformAfterVectorization(cast<Instruction>(EntryVal)) ? 1 : VF;

  ScalarParts Entry(UF);
  for (unsigned Part = 0; Part < UF; ++Part) {
    Entry[Part].resize(VF);
    for (unsigned Lane = 0; Lane < Lanes; ++Lane) {

      auto *Mul = Builder.CreateMul(StartIdx, Step);
      auto *Add = Builder.CreateAdd(ScalarIV, Mul);
      Entry[Part][Lane] = Add;

  VectorLoopValueMap.initScalar(EntryVal, Entry);
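// Illustrative sketch: with UF = 2, VF = 4 and step S, lane L of part P
// receives ScalarIV + (P * 4 + L) * S; a uniform-after-vectorization value
// only materializes lane 0 of each part.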
int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {

  const ValueToValueMap &Strides =
      getSymbolicStrides() ? *getSymbolicStrides() :

  int Stride = getPtrStride(PSE, Ptr, TheLoop, Strides, true, false);
  if (Stride == 1 || Stride == -1)

bool LoopVectorizationLegality::isUniform(Value *V) {
  return LAI->isUniform(V);
InnerLoopVectorizer::getVectorValue(Value *V) {
  assert(V != Induction && "The new induction variable should not be used.");

  if (Legal->hasStride(V))

  if (VectorLoopValueMap.hasVector(V))
    return VectorLoopValueMap.VectorMapStorage[V];

  if (VectorLoopValueMap.hasScalar(V)) {

    VectorParts Entry(UF);

    auto *I = cast<Instruction>(V);

      for (unsigned Part = 0; Part < UF; ++Part)
        Entry[Part] = getScalarValue(V, Part, 0);
      return VectorLoopValueMap.initVector(V, Entry);

    unsigned LastLane = Legal->isUniformAfterVectorization(I) ? 0 : VF - 1;
    auto *LastInst = cast<Instruction>(getScalarValue(V, UF - 1, LastLane));

    auto OldIP = Builder.saveIP();

    Builder.SetInsertPoint(&*NewIP);

    for (unsigned Part = 0; Part < UF; ++Part) {
      Value *VectorValue = nullptr;
      if (Legal->isUniformAfterVectorization(I)) {
        VectorValue = getBroadcastInstrs(getScalarValue(V, Part, 0));

        for (unsigned Lane = 0; Lane < VF; ++Lane)
          VectorValue = Builder.CreateInsertElement(
              VectorValue, getScalarValue(V, Part, Lane),
              Builder.getInt32(Lane));

      Entry[Part] = VectorValue;

    Builder.restoreIP(OldIP);
    return VectorLoopValueMap.initVector(V, Entry);

  Value *B = getBroadcastInstrs(V);
  return VectorLoopValueMap.initVector(V, VectorParts(UF, B));
Value *InnerLoopVectorizer::getScalarValue(Value *V, unsigned Part,

  if (OrigLoop->isLoopInvariant(V))

  assert(Lane > 0 ? !Legal->isUniformAfterVectorization(cast<Instruction>(V))
                  : true && "Uniform values only have lane zero");

  if (VectorLoopValueMap.hasScalar(V))
    return VectorLoopValueMap.ScalarMapStorage[V][Part][Lane];

  auto *U = getVectorValue(V)[Part];
  if (!U->getType()->isVectorTy()) {
    assert(VF == 1 && "Value not scalarized has non-vector type");

  return Builder.CreateExtractElement(U, Builder.getInt32(Lane));
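// Illustrative note: when V was vectorized rather than scalarized, the scalar
// for (Part, Lane) is recovered by extracting from the part's vector, e.g.
// lane 2 of part 1 is "extractelement <4 x i32> %v.1, i32 2".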
Value *InnerLoopVectorizer::reverseVector(Value *Vec) {

  for (unsigned i = 0; i < VF; ++i)
    ShuffleMask.push_back(Builder.getInt32(VF - i - 1));
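// For example, with VF = 4 the shuffle mask is <3, 2, 1, 0>, turning
// <a, b, c, d> into <d, c, b, a>; this is used for reverse (negative-stride)
// accesses.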
  for (unsigned i = 0; i < VF; i++)
    for (unsigned j = 0; j < NumVec; j++)

                                   unsigned Stride, unsigned VF) {

  for (unsigned i = 0; i < VF; i++)

                                  unsigned NumUndef) {

  for (unsigned i = 0; i < NumInt; i++)

  for (unsigned i = 0; i < NumUndef; i++)

  assert(VecTy1 && VecTy2 &&

         "Expect two vectors with the same element type");

  unsigned NumElts2 = VecTy2->getNumElements();
  assert(NumElts1 >= NumElts2 && "Unexpect the first vector has less elements");

  if (NumElts1 > NumElts2) {

  unsigned NumVec = InputList.size();
  assert(NumVec > 1 && "Should be at least two vectors");

    for (unsigned i = 0; i < NumVec - 1; i += 2) {
      Value *V0 = ResList[i], *V1 = ResList[i + 1];

             "Only the last vector may have a different type");

    if (NumVec % 2 != 0)

    NumVec = ResList.size();
  } while (NumVec > 1);
void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) {
  const InterleaveGroup *Group = Legal->getInterleavedAccessGroup(Instr);
  assert(Group && "Fail to get an interleaved access group.");

  if (Instr != Group->getInsertPos())

  Type *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
  unsigned InterleaveFactor = Group->getFactor();

  Type *PtrTy = VecTy->getPointerTo(Ptr->getType()->getPointerAddressSpace());

  setDebugLocFromInst(Builder, Ptr);

  unsigned Index = Group->getIndex(Instr);

  if (Group->isReverse())
    Index += (VF - 1) * Group->getFactor();

  for (unsigned Part = 0; Part < UF; Part++) {
    Value *NewPtr = getScalarValue(Ptr, Part, 0);

    NewPtr = Builder.CreateGEP(NewPtr, Builder.getInt32(-Index));

    NewPtrs.push_back(Builder.CreateBitCast(NewPtr, PtrTy));

  setDebugLocFromInst(Builder, Instr);

    for (unsigned Part = 0; Part < UF; Part++) {
      auto *NewLoad = Builder.CreateAlignedLoad(
          NewPtrs[Part], Group->getAlignment(), "wide.vec");
      addMetadata(NewLoad, Instr);

    for (unsigned I = 0; I < InterleaveFactor; ++I) {

      VectorParts Entry(UF);

      for (unsigned Part = 0; Part < UF; Part++) {
        Value *StridedVec = Builder.CreateShuffleVector(
            NewLoads[Part], UndefVec, StrideMask, "strided.vec");

        if (Member->getType() != ScalarTy) {

          StridedVec = Builder.CreateBitOrPointerCast(StridedVec, OtherVTy);

            Group->isReverse() ? reverseVector(StridedVec) : StridedVec;

      VectorLoopValueMap.initVector(Member, Entry);

  for (unsigned Part = 0; Part < UF; Part++) {

    for (unsigned i = 0; i < InterleaveFactor; i++) {

      assert(Member && "Fail to get a member from an interleaved store group");

          getVectorValue(cast<StoreInst>(Member)->getValueOperand())[Part];
      if (Group->isReverse())
        StoredVec = reverseVector(StoredVec);

      if (StoredVec->getType() != SubVT)
        StoredVec = Builder.CreateBitOrPointerCast(StoredVec, SubVT);

    Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask,

        Builder.CreateAlignedStore(IVec, NewPtrs[Part], Group->getAlignment());
    addMetadata(NewStoreInstr, Instr);
void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {

  assert((LI || SI) && "Invalid Load/Store instruction");

  if (Legal->isAccessInterleaved(Instr))
    return vectorizeInterleaveGroup(Instr);

  Type *ScalarDataTy = LI ? LI->getType() : SI->getValueOperand()->getType();

  unsigned Alignment = LI ? LI->getAlignment() : SI->getAlignment();

  if (Legal->memoryInstructionMustBeScalarized(Instr, VF))
    return scalarizeInstruction(Instr, Legal->isScalarWithPredication(Instr));

  int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
  bool Reverse = ConsecutiveStride < 0;

  bool CreateGatherScatter =
      !ConsecutiveStride && Legal->isLegalGatherOrScatter(Instr);

  VectorParts VectorGep;

  if (ConsecutiveStride) {

      unsigned NumOfLoopVariantOps = 0;
      for (unsigned i = 0; i < NumOperands; ++i)

          NumOfLoopVariantOps++;
      assert(NumOfLoopVariantOps == 1 &&
             "Consecutive GEP should have only one loop-variant operand");

      for (unsigned i = 0; i < NumOperands; ++i)

      setDebugLocFromInst(Builder, Gep);
      Ptr = Builder.Insert(Gep2);

      setDebugLocFromInst(Builder, Ptr);
      Ptr = getScalarValue(Ptr, 0, 0);

    assert(CreateGatherScatter && "The instruction should be scalarized");

      if (SrcInst && OrigLoop->contains(SrcInst))

        for (unsigned Part = 0; Part < UF; ++Part) {

          Value *GEPBasePtr = OpsV[0][Part];

          Value *NewGep = Builder.CreateGEP(GEPBasePtr, Ops, "VectorGep");
          cast<GetElementPtrInst>(NewGep)->setIsInBounds(Gep->isInBounds());

          VectorGep.push_back(NewGep);

      VectorGep = getVectorValue(Ptr);

    assert(!Legal->isUniform(SI->getPointerOperand()) &&
           "We do not allow storing to uniform addresses");
    setDebugLocFromInst(Builder, SI);

    VectorParts StoredVal = getVectorValue(SI->getValueOperand());

    for (unsigned Part = 0; Part < UF; ++Part) {

      if (CreateGatherScatter) {
        Value *MaskPart = Legal->isMaskRequired(SI) ? Mask[Part] : nullptr;
        NewSI = Builder.CreateMaskedScatter(StoredVal[Part], VectorGep[Part],
                                            Alignment, MaskPart);

            Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(Part * VF));

          StoredVal[Part] = reverseVector(StoredVal[Part]);

              Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(-Part * VF));
              Builder.CreateGEP(nullptr, PartPtr, Builder.getInt32(1 - VF));
          Mask[Part] = reverseVector(Mask[Part]);

            Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));

        if (Legal->isMaskRequired(SI))
          NewSI = Builder.CreateMaskedStore(StoredVal[Part], VecPtr, Alignment,

              Builder.CreateAlignedStore(StoredVal[Part], VecPtr, Alignment);

      addMetadata(NewSI, SI);

  assert(LI && "Must have a load instruction");
  setDebugLocFromInst(Builder, LI);
  VectorParts Entry(UF);
  for (unsigned Part = 0; Part < UF; ++Part) {

    if (CreateGatherScatter) {
      Value *MaskPart = Legal->isMaskRequired(LI) ? Mask[Part] : nullptr;
      NewLI = Builder.CreateMaskedGather(VectorGep[Part], Alignment, MaskPart,
                                         0, "wide.masked.gather");
      Entry[Part] = NewLI;

          Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(Part * VF));

        PartPtr = Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(-Part * VF));
        PartPtr = Builder.CreateGEP(nullptr, PartPtr, Builder.getInt32(1 - VF));
        Mask[Part] = reverseVector(Mask[Part]);

          Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
      if (Legal->isMaskRequired(LI))
        NewLI = Builder.CreateMaskedLoad(VecPtr, Alignment, Mask[Part],
                                         "wide.masked.load");

        NewLI = Builder.CreateAlignedLoad(VecPtr, Alignment, "wide.load");
      Entry[Part] = Reverse ? reverseVector(NewLI) : NewLI;

    addMetadata(NewLI, LI);

  VectorLoopValueMap.initVector(Instr, Entry);
void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
                                               bool IfPredicateInstr) {

                    << (IfPredicateInstr ? " and predicating:" : ":") << *Instr

  setDebugLocFromInst(Builder, Instr);

  ScalarParts Entry(UF);

  if (IfPredicateInstr)
    Cond = createBlockInMask(Instr->getParent());

  unsigned Lanes = Legal->isUniformAfterVectorization(Instr) ? 1 : VF;

  for (unsigned Part = 0; Part < UF; ++Part) {
    Entry[Part].resize(VF);

    for (unsigned Lane = 0; Lane < Lanes; ++Lane) {

      Value *Cmp = nullptr;
      if (IfPredicateInstr) {
        Cmp = Builder.CreateExtractElement(Cond[Part], Builder.getInt32(Lane));

        auto *NewOp = getScalarValue(Instr->getOperand(op), Part, Lane);

      addNewMetadata(Cloned, Instr);

      Builder.Insert(Cloned);

      Entry[Part][Lane] = Cloned;

      if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
        if (II->getIntrinsicID() == Intrinsic::assume)
          AC->registerAssumption(II);

      if (IfPredicateInstr)
        PredicatedInstructions.push_back(std::make_pair(Cloned, Cmp));

  VectorLoopValueMap.initScalar(Instr, Entry);
PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,

  Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
  setDebugLocFromInst(Builder, OldInst);
  auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");

  setDebugLocFromInst(Builder, OldInst);

  Value *Next = Builder.CreateAdd(Induction, Step, "index.next");

  Induction->addIncoming(Next, Latch);

  Value *ICmp = Builder.CreateICmpEQ(Next, End);
Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {

  assert(BackedgeTakenCount != SE->getCouldNotCompute() &&
         "Invalid loop count");

  Type *IdxTy = Legal->getWidestInductionType();

      IdxTy->getPrimitiveSizeInBits())
    BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
  BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);

  const SCEV *ExitCount = SE->getAddExpr(
      BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));

  TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),

  if (TripCount->getType()->isPointerTy())
Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
  if (VectorTripCount)
    return VectorTripCount;

  Value *TC = getOrCreateTripCount(L);

  Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");

  if (VF > 1 && Legal->requiresScalarEpilogue()) {

    R = Builder.CreateSelect(IsZero, Step, R);

  VectorTripCount = Builder.CreateSub(TC, R, "n.vec");

  return VectorTripCount;
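// Worked example (illustrative): with trip count TC = 17 and VF * UF = 4,
// n.mod.vf = 17 % 4 = 1 and n.vec = 16. When a scalar epilogue is required
// and TC = 16, the zero remainder is bumped to a full step of 4, giving
// n.vec = 12 so the final iterations still execute in the scalar loop.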
void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,

  Value *Count = getOrCreateTripCount(L);

  Value *CheckMinIters = Builder.CreateICmpULT(

  LoopBypassBlocks.push_back(BB);

void InnerLoopVectorizer::emitVectorLoopEnteredCheck(Loop *L,

  Value *TC = getOrCreateVectorTripCount(L);

  LoopBypassBlocks.push_back(BB);

void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {

  if (auto *C = dyn_cast<ConstantInt>(SCEVCheck))

  BB->setName("vector.scevcheck");

  LoopBypassBlocks.push_back(BB);
  AddedSafetyChecks = true;

void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {

  std::tie(FirstCheckInst, MemRuntimeCheck) =

  if (!MemRuntimeCheck)

  BB->setName("vector.memcheck");

  LoopBypassBlocks.push_back(BB);
  AddedSafetyChecks = true;

  LVer = llvm::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT,

  LVer->prepareNoAliasMetadata();
void InnerLoopVectorizer::createEmptyLoop() {

  BasicBlock *OldBasicBlock = OrigLoop->getHeader();
  BasicBlock *VectorPH = OrigLoop->getLoopPreheader();
  BasicBlock *ExitBlock = OrigLoop->getExitBlock();
  assert(VectorPH && "Invalid loop structure");
  assert(ExitBlock && "Must have an exit block");

  OldInduction = Legal->getInduction();
  Type *IdxTy = Legal->getWidestInductionType();

  LI->addTopLevelLoop(Lp);

  Value *Count = getOrCreateTripCount(Lp);

  emitMinimumIterationCountCheck(Lp, ScalarPH);

  emitVectorLoopEnteredCheck(Lp, ScalarPH);

  emitSCEVChecks(Lp, ScalarPH);

  emitMemRuntimeChecks(Lp, ScalarPH);

  Value *CountRoundDown = getOrCreateVectorTripCount(Lp);

      createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
                              getDebugLocFromInstOrOperands(OldInduction));

  for (auto &InductionEntry : *List) {
    PHINode *OrigPhi = InductionEntry.first;

    Value *&EndValue = IVEndValues[OrigPhi];
    if (OrigPhi == OldInduction) {

      EndValue = CountRoundDown;

      IRBuilder<> B(LoopBypassBlocks.back()->getTerminator());

      Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd");
      const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();

  LoopScalarPreHeader = ScalarPH;
  LoopMiddleBlock = MiddleBlock;
  LoopExitBlock = ExitBlock;
  LoopVectorBody = VecBody;
  LoopScalarBody = OldBasicBlock;

  if (MDNode *LID = OrigLoop->getLoopID())

  LoopVectorizeHints Hints(Lp, true, *ORE);
  Hints.setAlreadyVectorized();
void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,

  assert(OrigLoop->getExitBlock() && "Expected a single exit block");

    if (!OrigLoop->contains(UI)) {
      assert(isa<PHINode>(UI) && "Expected LCSSA form");
      MissingVals[UI] = EndValue;

    auto *UI = cast<Instruction>(U);
    if (!OrigLoop->contains(UI)) {

          OrigLoop->getHeader()->getModule()->getDataLayout();
      assert(isa<PHINode>(UI) && "Expected LCSSA form");

      Value *CountMinusOne = B.CreateSub(

      Escape->setName("ind.escape");
      MissingVals[UI] = Escape;

  for (auto &I : MissingVals) {
    PHINode *PHI = cast<PHINode>(I.first);
struct CSEDenseMapInfo {

    return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
           isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);

    assert(canHandle(I) && "Unknown instruction!");

    if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
        LHS == getTombstoneKey() || RHS == getTombstoneKey())

      if (!CSEDenseMapInfo::canHandle(In))

  if (isa<FPMathOperator>(V)) {

    cast<Instruction>(V)->setFastMathFlags(Flags);
  unsigned ScalarizationCost =

  for (Type *Ty : OpTys)

  return ScalarizationCost;

  for (unsigned OpInd = 0; OpInd < OperandsNum; ++OpInd)

                                       bool &NeedToScalarize) {

    return ScalarCallCost;

  Type *RetTy = ToVectorTy(ScalarRetTy, VF);
  for (Type *ScalarTy : ScalarTys)
    Tys.push_back(ToVectorTy(ScalarTy, VF));

  unsigned Cost = ScalarCallCost * VF + ScalarizationCost;

  NeedToScalarize = true;

  if (VectorCallCost < Cost) {
    NeedToScalarize = false;
    return VectorCallCost;

  assert(ID && "Expected intrinsic call!");

    Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));

  if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
    FMF = FPMO->getFastMathFlags();

  return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;

  return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
void InnerLoopVectorizer::truncateToMinimalBitwidths() {

  for (const auto &KV : Cost->getMinimalBitwidths()) {

    if (!VectorLoopValueMap.hasVector(KV.first))

    VectorParts &Parts = VectorLoopValueMap.getVector(KV.first);
    for (Value *&I : Parts) {

      Type *ScalarTruncatedTy =

      if (TruncatedTy == OriginalTy)

      auto ShrinkOperand = [&](Value *V) -> Value * {
        if (auto *ZI = dyn_cast<ZExtInst>(V))
          if (ZI->getSrcTy() == TruncatedTy)
            return ZI->getOperand(0);
        return B.CreateZExtOrTrunc(V, TruncatedTy);

      Value *NewI = nullptr;
      if (auto *BO = dyn_cast<BinaryOperator>(I)) {
        NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
                             ShrinkOperand(BO->getOperand(1)));
        cast<BinaryOperator>(NewI)->copyIRFlags(I);
      } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
        NewI =
            B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
                         ShrinkOperand(CI->getOperand(1)));
      } else if (auto *SI = dyn_cast<SelectInst>(I)) {
        NewI = B.CreateSelect(SI->getCondition(),
                              ShrinkOperand(SI->getTrueValue()),
                              ShrinkOperand(SI->getFalseValue()));
      } else if (auto *CI = dyn_cast<CastInst>(I)) {
        switch (CI->getOpcode()) {

        case Instruction::Trunc:
          NewI = ShrinkOperand(CI->getOperand(0));

        case Instruction::SExt:
          NewI = B.CreateSExtOrTrunc(

        case Instruction::ZExt:
          NewI = B.CreateZExtOrTrunc(

      } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {

        auto *O0 = B.CreateZExtOrTrunc(

        auto *O1 = B.CreateZExtOrTrunc(

        NewI = B.CreateShuffleVector(O0, O1, SI->getMask());
      } else if (isa<LoadInst>(I)) {

      } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
        auto Elements = IE->getOperand(0)->getType()->getVectorNumElements();
        auto *O0 = B.CreateZExtOrTrunc(

        auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
        NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
      } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {

        auto *O0 = B.CreateZExtOrTrunc(

        NewI = B.CreateExtractElement(O0, EE->getOperand(2));

      NewI->takeName(cast<Instruction>(I));
      Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);

      cast<Instruction>(I)->eraseFromParent();

  for (const auto &KV : Cost->getMinimalBitwidths()) {

    if (!VectorLoopValueMap.hasVector(KV.first))

    VectorParts &Parts = VectorLoopValueMap.getVector(KV.first);
    for (Value *&I : Parts) {
void InnerLoopVectorizer::vectorizeLoop() {

  PhiVector PHIsToFix;

  collectTriviallyDeadInstructions();

    vectorizeBlockInLoop(BB, &PHIsToFix);

  truncateToMinimalBitwidths();

  for (PHINode *Phi : PHIsToFix) {
    assert(Phi && "Unable to recover vectorized PHI");

    if (Legal->isFirstOrderRecurrence(Phi)) {
      fixFirstOrderRecurrence(Phi);

    assert(Legal->isReductionVariable(Phi) &&
           "Unable to find the reduction variable");

    Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();

        RdxDesc.getMinMaxRecurrenceKind();
    setDebugLocFromInst(Builder, ReductionStartValue);

    Builder.SetInsertPoint(LoopBypassBlocks[1]->getTerminator());

    const VectorParts &VectorExit = getVectorValue(LoopExitInst);
    Type *VecTy = VectorExit[0]->getType();

      VectorStart = Identity = ReductionStartValue;

      VectorStart = Identity =
          Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");

        VectorStart = ReductionStartValue;

            Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);

    const VectorParts &VecRdxPhi = getVectorValue(Phi);
    BasicBlock *Latch = OrigLoop->getLoopLatch();
    Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
    const VectorParts &Val = getVectorValue(LoopVal);
    for (unsigned part = 0; part < UF; ++part) {

      Value *StartVal = (part == 0) ? VectorStart : Identity;
      cast<PHINode>(VecRdxPhi[part])
          ->addIncoming(StartVal, LoopVectorPreHeader);
      cast<PHINode>(VecRdxPhi[part])
          ->addIncoming(Val[part], LoopVectorBody);

    Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());

    VectorParts &RdxParts = VectorLoopValueMap.getVector(LoopExitInst);
    setDebugLocFromInst(Builder, LoopExitInst);

    if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) {

      Builder.SetInsertPoint(LoopVectorBody->getTerminator());
      for (unsigned part = 0; part < UF; ++part) {
        Value *Trunc = Builder.CreateTrunc(RdxParts[part], RdxVecTy);
        Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
                                          : Builder.CreateZExt(Trunc, VecTy);

          (*UI++)->replaceUsesOfWith(RdxParts[part], Extnd);
        RdxParts[part] = Extnd;

      Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
      for (unsigned part = 0; part < UF; ++part)
        RdxParts[part] = Builder.CreateTrunc(RdxParts[part], RdxVecTy);

    Value *ReducedPartRdx = RdxParts[0];

    setDebugLocFromInst(Builder, ReducedPartRdx);
    for (unsigned part = 1; part < UF; ++part) {
      if (Op != Instruction::ICmp && Op != Instruction::FCmp)

                                      ReducedPartRdx, "bin.rdx"));

            Builder, MinMaxKind, ReducedPartRdx, RdxParts[part]);

             "Reduction emission only supported for pow2 vectors!");
      Value *TmpVec = ReducedPartRdx;

      for (unsigned i = VF; i != 1; i >>= 1) {

        for (unsigned j = 0; j != i / 2; ++j)
          ShuffleMask[j] = Builder.getInt32(i / 2 + j);

        std::fill(&ShuffleMask[i / 2], ShuffleMask.end(),

        Value *Shuf = Builder.CreateShuffleVector(

        if (Op != Instruction::ICmp && Op != Instruction::FCmp)

          Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));
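      // Illustrative walk-through of the log2 shuffle reduction: with VF = 4
      // and an add reduction, <a, b, c, d> is shuffled to <c, d, u, u> and
      // added to give <a+c, b+d, u, u>; one more round leaves a+b+c+d in
      // lane 0, which the extractelement above reads out.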
    if (Phi->getType() != RdxDesc.getRecurrenceType())

                           ? Builder.CreateSExt(ReducedPartRdx, Phi->getType())
                           : Builder.CreateZExt(ReducedPartRdx, Phi->getType());

                                  LoopScalarPreHeader->getTerminator());
    for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
      BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
    BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);

                                  LEE = LoopExitBlock->end();
         LEI != LEE; ++LEI) {

        LCSSAPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);

    int IncomingEdgeBlockIdx =
        Phi->getBasicBlockIndex(OrigLoop->getLoopLatch());
    assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");

    int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
    Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
    Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);

  for (auto &Entry : *Legal->getInductionVars())
    fixupIVUsers(Entry.first, Entry.second,
                 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
                 IVEndValues[Entry.first], LoopMiddleBlock);

  predicateInstructions();

  cse(LoopVectorBody);
void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {

  auto *Preheader = OrigLoop->getLoopPreheader();
  auto *Latch = OrigLoop->getLoopLatch();

  auto *VectorInit = ScalarInit;

    Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
    VectorInit = Builder.CreateInsertElement(

        Builder.getInt32(VF - 1), "vector.recur.init");

  VectorParts &PhiParts = VectorLoopValueMap.getVector(Phi);
  Builder.SetInsertPoint(cast<Instruction>(PhiParts[0]));

  auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
  VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);

  auto &PreviousParts = getVectorValue(Previous);

  Builder.SetInsertPoint(

  ShuffleMask[0] = Builder.getInt32(VF - 1);
  for (unsigned I = 1; I < VF; ++I)
    ShuffleMask[I] = Builder.getInt32(I + VF - 1);
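  // For example, with VF = 4 the mask is <3, 4, 5, 6>: element 3 comes from
  // the recurrence phi (the last value of the previous vector iteration) and
  // elements 4-6 are the first three lanes of the current part, producing
  // the "shifted by one" vector a first-order recurrence needs.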
  Value *Incoming = VecPhi;

  for (unsigned Part = 0; Part < UF; ++Part) {

            ? Builder.CreateShuffleVector(Incoming, PreviousParts[Part],

      cast<Instruction>(PhiParts[Part])->eraseFromParent();
    PhiParts[Part] = Shuffle;
    Incoming = PreviousParts[Part];

  VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());

  auto *Extract = Incoming;

    Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
    Extract = Builder.CreateExtractElement(Extract, Builder.getInt32(VF - 1),
                                           "vector.recur.extract");

  Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
  auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");

    auto *Incoming = BB == LoopMiddleBlock ? Extract : ScalarInit;
    Start->addIncoming(Incoming, BB);

  for (auto &I : *LoopExitBlock) {

void InnerLoopVectorizer::fixLCSSAPHIs() {

void InnerLoopVectorizer::collectTriviallyDeadInstructions() {
  BasicBlock *Latch = OrigLoop->getLoopLatch();

  if (Cmp && Cmp->hasOneUse())
    DeadInstructions.insert(Cmp);

  for (auto &Induction : *Legal->getInductionVars()) {
    PHINode *Ind = Induction.first;

    if (all_of(IndUpdate->users(), [&](User *U) -> bool {
          return U == Ind || DeadInstructions.count(cast<Instruction>(U));

      DeadInstructions.insert(IndUpdate);
void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {

  auto *VectorLoop = LI->getLoopFor(PredBB);

  auto isBlockOfUsePredicated = [&](Use &U) -> bool {
    auto *I = cast<Instruction>(U.getUser());

    if (auto *Phi = dyn_cast<PHINode>(I))

    return BB == PredBB;

    Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
    InstsToReanalyze.clear();

    while (!Worklist.empty()) {

      if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
          !VectorLoop->contains(I) || I->mayHaveSideEffects())

      if (!all_of(I->uses(), isBlockOfUsePredicated)) {
        InstsToReanalyze.push_back(I);

      I->moveBefore(&*PredBB->getFirstInsertionPt());
      Worklist.insert(I->op_begin(), I->op_end());

void InnerLoopVectorizer::predicateInstructions() {

  for (auto KV : PredicatedInstructions) {

    auto *BB = SplitBlock(Head, &*std::next(I), DT, LI);

    sinkScalarOperands(&*I);

    I->getParent()->setName(Twine("pred.") + I->getOpcodeName() + ".if");
    BB->setName(Twine("pred.") + I->getOpcodeName() + ".continue");

    if (!I->getType()->isVoidTy()) {
      Value *IncomingTrue = nullptr;
      Value *IncomingFalse = nullptr;

      if (I->hasOneUse() && isa<InsertElementInst>(*I->user_begin())) {

      assert(PostDom && "Then block has multiple successors");

  DEBUG(DT->verifyDomTree());
  std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
  EdgeMaskCache::iterator ECEntryIt = MaskCache.find(Edge);
  if (ECEntryIt != MaskCache.end())
    return ECEntryIt->second;

  VectorParts SrcMask = createBlockInMask(Src);

  assert(BI && "Unexpected terminator found");

    VectorParts EdgeMask = getVectorValue(BI->getCondition());

      for (unsigned part = 0; part < UF; ++part)
        EdgeMask[part] = Builder.CreateNot(EdgeMask[part]);

    for (unsigned part = 0; part < UF; ++part)
      EdgeMask[part] = Builder.CreateAnd(EdgeMask[part], SrcMask[part]);

    MaskCache[Edge] = EdgeMask;

  MaskCache[Edge] = SrcMask;

InnerLoopVectorizer::createBlockInMask(BasicBlock *BB) {
  assert(OrigLoop->contains(BB) && "Block is not a part of a loop");

  if (OrigLoop->getHeader() == BB) {

    return getVectorValue(C);

  VectorParts BlockMask = getVectorValue(Zero);

    VectorParts EM = createEdgeMask(*it, BB);
    for (unsigned part = 0; part < UF; ++part)
      BlockMask[part] = Builder.CreateOr(BlockMask[part], EM[part]);
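  // Illustrative note: a block's mask is the OR over all of its incoming
  // edge masks, and each edge mask is the predecessor's block mask AND-ed
  // with (the negation of) the branch condition; for an if-then diamond the
  // "then" block's mask is therefore header-mask & cond.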
void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
                                              unsigned VF, PhiVector *PV) {
  PHINode *P = cast<PHINode>(PN);

  // Handle reduction variables and first-order recurrences: create a vector
  // phi now and fix up its incoming values after all blocks are vectorized.
  if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) {
    VectorParts Entry(UF);
    for (unsigned part = 0; part < UF; ++part) {
      Type *VecTy =
          (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
      Entry[part] = PHINode::Create(
          VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
    }
    VectorLoopValueMap.initVector(P, Entry);
    PV->push_back(P);
    return;
  }

  setDebugLocFromInst(Builder, P);

  // Phis outside the header come from if-converted control flow: blend the
  // incoming values with selects on the corresponding edge masks.
  if (P->getParent() != OrigLoop->getHeader()) {
    // ...
    VectorParts Entry(UF);
    for (unsigned In = 0; In < NumIncoming; In++) {
      // ...
      for (unsigned part = 0; part < UF; ++part) {
        // Single-edge phis use an identity select for the first operand.
        if (In == 0)
          Entry[part] = Builder.CreateSelect(Cond[part], In0[part], In0[part]);
        else
          // Select between the current value and the previous incoming edge
          // based on the incoming mask.
          Entry[part] = Builder.CreateSelect(Cond[part], In0[part], Entry[part],
                                             "predphi");
      }
    }
    VectorLoopValueMap.initVector(P, Entry);
    return;
  }

  // This PHINode must be an induction variable.
  assert(Legal->getInductionVars()->count(P) && "Not an induction variable");

  InductionDescriptor II = Legal->getInductionVars()->lookup(P);
  const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();

  switch (II.getKind()) {
  // ...
  case InductionDescriptor::IK_IntInduction:
    return widenIntInduction(P);
  case InductionDescriptor::IK_PtrInduction: {
    // This is the normalized GEP that starts counting at zero.
    Value *PtrInd = Induction;
    PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStep()->getType());

    // Determine the number of scalars we need to generate for each unroll
    // iteration. If the phi is uniform, we only need the first lane.
    unsigned Lanes = Legal->isUniformAfterVectorization(P) ? 1 : VF;

    // These are the scalar results. Scalar GEPs result in better code than
    // vector GEPs here.
    ScalarParts Entry(UF);
    for (unsigned Part = 0; Part < UF; ++Part) {
      Entry[Part].resize(VF);
      for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
        // ...
        Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
        Value *SclrGep = II.transform(Builder, GlobalIdx, PSE.getSE(), DL);
        // ...
        Entry[Part][Lane] = SclrGep;
      }
    }
    VectorLoopValueMap.initScalar(P, Entry);
    return;
  }
  case InductionDescriptor::IK_FpInduction: {
    assert(P->getType() == II.getStartValue()->getType() &&
           "Types must match");
    assert(P != OldInduction && "Primary induction can be integer only");

    Value *V =
        Builder.CreateCast(Instruction::SIToFP, Induction, P->getType());
    V = II.transform(Builder, V, PSE.getSE(), DL);
    // ...
    Value *Broadcasted = getBroadcastInstrs(V);
    // After broadcasting the induction variable, make each unrolled part
    // consecutive by adding the appropriate step.
    Value *StepVal = cast<SCEVUnknown>(II.getStep())->getValue();
    VectorParts Entry(UF);
    for (unsigned part = 0; part < UF; ++part)
      Entry[part] = getStepVector(Broadcasted, VF * part, StepVal,
                                  II.getInductionOpcode());
    VectorLoopValueMap.initVector(P, Entry);
  }
  }
}
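// A division-like instruction executed conditionally must be predicated unless
// its divisor is provably non-zero; the helper below is conservative and only
// trusts a non-zero ConstantInt divisor.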
4700 "Unexpected instruction");
4703 return !CInt || CInt->
isZero();
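// vectorizeBlockInLoop is the main widening dispatch: it walks the block once
// and switches on each opcode, producing UF vector values (one per unroll
// part) for every widened instruction, or falling back to
// scalarizeInstruction when widening is not possible or not profitable.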
void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {
  // For each instruction in the old loop:
  for (Instruction &I : *BB) {
    // Skip instructions the cost model proved dead.
    if (DeadInstructions.count(&I))
      continue;

    // Scalarize instructions that should remain scalar after vectorization,
    // other than those handled specially below.
    if (VF > 1 &&
        !(isa<BranchInst>(&I) || isa<PHINode>(&I) ||
          isa<DbgInfoIntrinsic>(&I)) &&
        shouldScalarizeInstruction(&I)) {
      scalarizeInstruction(&I, Legal->isScalarWithPredication(&I));
      continue;
    }

    switch (I.getOpcode()) {
    case Instruction::Br:
      // Nothing to do: loop control flow was already handled.
      continue;
    case Instruction::PHI: {
      // Vectorize PHINodes.
      widenPHIInstruction(&I, UF, VF, PV);
      continue;
    } // End of PHI.

    case Instruction::UDiv:
    case Instruction::SDiv:
    case Instruction::SRem:
    case Instruction::URem:
      // Scalarize with predication if this instruction may divide by zero and
      // block execution is conditional; otherwise fall through.
      if (Legal->isScalarWithPredication(&I)) {
        scalarizeInstruction(&I, true);
        continue;
      }
    case Instruction::Add:
    case Instruction::FAdd:
    case Instruction::Sub:
    case Instruction::FSub:
    case Instruction::Mul:
    case Instruction::FMul:
    case Instruction::FDiv:
    case Instruction::FRem:
    case Instruction::Shl:
    case Instruction::LShr:
    case Instruction::AShr:
    case Instruction::And:
    case Instruction::Or:
    case Instruction::Xor: {
      // Just widen binops.
      auto *BinOp = cast<BinaryOperator>(&I);
      setDebugLocFromInst(Builder, BinOp);
      const VectorParts &A = getVectorValue(BinOp->getOperand(0));
      const VectorParts &B = getVectorValue(BinOp->getOperand(1));

      // Use this vector value for all users of the original instruction.
      VectorParts Entry(UF);
      for (unsigned Part = 0; Part < UF; ++Part) {
        Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A[Part], B[Part]);

        if (BinaryOperator *VecOp = dyn_cast<BinaryOperator>(V))
          VecOp->copyIRFlags(BinOp);
        // ...
        Entry[Part] = V;
      }

      VectorLoopValueMap.initVector(&I, Entry);
      addMetadata(Entry, BinOp);
      break;
    }
    case Instruction::Select: {
      // Widen selects. If the selector is loop-invariant we can select the
      // scalar condition; otherwise we select per lane.
      auto *SE = PSE.getSE();
      bool InvariantCond =
          SE->isLoopInvariant(PSE.getSCEV(I.getOperand(0)), OrigLoop);
      setDebugLocFromInst(Builder, &I);

      const VectorParts &Cond = getVectorValue(I.getOperand(0));
      const VectorParts &Op0 = getVectorValue(I.getOperand(1));
      const VectorParts &Op1 = getVectorValue(I.getOperand(2));

      auto *ScalarCond = getScalarValue(I.getOperand(0), 0, 0);

      VectorParts Entry(UF);
      for (unsigned Part = 0; Part < UF; ++Part) {
        Entry[Part] = Builder.CreateSelect(
            InvariantCond ? ScalarCond : Cond[Part], Op0[Part], Op1[Part]);
      }

      VectorLoopValueMap.initVector(&I, Entry);
      addMetadata(Entry, &I);
      break;
    }

    case Instruction::ICmp:
    case Instruction::FCmp: {
      // Widen compares: generate vector compares.
      bool FCmp = (I.getOpcode() == Instruction::FCmp);
      auto *Cmp = dyn_cast<CmpInst>(&I);
      setDebugLocFromInst(Builder, Cmp);
      const VectorParts &A = getVectorValue(Cmp->getOperand(0));
      const VectorParts &B = getVectorValue(Cmp->getOperand(1));
      VectorParts Entry(UF);
      for (unsigned Part = 0; Part < UF; ++Part) {
        Value *C = nullptr;
        if (FCmp) {
          C = Builder.CreateFCmp(Cmp->getPredicate(), A[Part], B[Part]);
          cast<FCmpInst>(C)->copyFastMathFlags(Cmp);
        } else {
          C = Builder.CreateICmp(Cmp->getPredicate(), A[Part], B[Part]);
        }
        Entry[Part] = C;
      }

      VectorLoopValueMap.initVector(&I, Entry);
      addMetadata(Entry, &I);
      break;
    }

    case Instruction::Store:
    case Instruction::Load:
      vectorizeMemoryInstruction(&I);
      break;

    case Instruction::ZExt:
    case Instruction::SExt:
    case Instruction::FPToUI:
    case Instruction::FPToSI:
    case Instruction::FPExt:
    case Instruction::PtrToInt:
    case Instruction::IntToPtr:
    case Instruction::SIToFP:
    case Instruction::UIToFP:
    case Instruction::Trunc:
    case Instruction::FPTrunc:
    case Instruction::BitCast: {
      auto *CI = dyn_cast<CastInst>(&I);
      setDebugLocFromInst(Builder, CI);

      // Optimize truncations of the canonical induction variable with a
      // constant step: widen the induction directly at the smaller type.
      auto ID = Legal->getInductionVars()->lookup(OldInduction);
      if (isa<TruncInst>(CI) && CI->getOperand(0) == OldInduction &&
          ID.getConstIntStepValue()) {
        widenIntInduction(OldInduction, cast<TruncInst>(CI));
        break;
      }

      // ...
      const VectorParts &A = getVectorValue(CI->getOperand(0));
      VectorParts Entry(UF);
      for (unsigned Part = 0; Part < UF; ++Part)
        Entry[Part] = Builder.CreateCast(CI->getOpcode(), A[Part], DestTy);
      VectorLoopValueMap.initVector(&I, Entry);
      addMetadata(Entry, &I);
      break;
    }

    case Instruction::Call: {
      // Ignore dbg intrinsics.
      if (isa<DbgInfoIntrinsic>(I))
        break;
      setDebugLocFromInst(Builder, &I);

      Module *M = BB->getParent()->getParent();
      auto *CI = cast<CallInst>(&I);

      StringRef FnName = CI->getCalledFunction()->getName();
      Function *F = CI->getCalledFunction();
      Type *RetTy = ToVectorTy(CI->getType(), VF);
      SmallVector<Type *, 4> Tys;
      for (Value *ArgOperand : CI->arg_operands())
        Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));

      Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
      if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
                 ID == Intrinsic::lifetime_start)) {
        scalarizeInstruction(&I);
        break;
      }

      // Decide between a vector intrinsic, a vectorized library call, or
      // scalarization, based on the relative costs.
      bool NeedToScalarize;
      unsigned CallCost = getVectorCallCost(CI, VF, *TTI, TLI, NeedToScalarize);
      bool UseVectorIntrinsic =
          ID && getVectorIntrinsicCost(CI, VF, *TTI, TLI) <= CallCost;
      if (!UseVectorIntrinsic && NeedToScalarize) {
        scalarizeInstruction(&I);
        break;
      }

      VectorParts Entry(UF);
      for (unsigned Part = 0; Part < UF; ++Part) {
        SmallVector<Value *, 4> Args;
        for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
          Value *Arg = CI->getArgOperand(i);
          // Some intrinsics take a scalar argument - don't replace it with a
          // vector.
          if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i)) {
            const VectorParts &VectorArg = getVectorValue(CI->getArgOperand(i));
            Arg = VectorArg[Part];
          }
          Args.push_back(Arg);
        }

        Function *VectorF;
        if (UseVectorIntrinsic) {
          // Use the vector version of the intrinsic.
          Type *TysForDecl[] = {CI->getType()};
          if (VF > 1)
            TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
          VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
        } else {
          // Use the vector version of the library call.
          StringRef VFnName = TLI->getVectorizedFunction(FnName, VF);
          assert(!VFnName.empty() && "Vector function name is empty.");
          VectorF = M->getFunction(VFnName);
          if (!VectorF) {
            // Generate a declaration.
            FunctionType *FTy = FunctionType::get(RetTy, Tys, false);
            VectorF =
                Function::Create(FTy, Function::ExternalLinkage, VFnName, M);
            VectorF->copyAttributesFrom(F);
          }
        }
        assert(VectorF && "Can't create vector function.");

        SmallVector<OperandBundleDef, 1> OpBundles;
        CI->getOperandBundlesAsDefs(OpBundles);
        CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);

        if (isa<FPMathOperator>(V))
          V->copyFastMathFlags(CI);

        Entry[Part] = V;
      }

      VectorLoopValueMap.initVector(&I, Entry);
      addMetadata(Entry, &I);
      break;
    }

    default:
      // All other instructions are unsupported. Scalarize them.
      scalarizeInstruction(&I);
      break;
    } // end of switch.
  }   // end of for_each instr.
}
void InnerLoopVectorizer::updateAnalysis() {
  // ...
  assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) &&
         "Entry does not dominate exit.");

  DT->addNewBlock(LoopVectorBody, LoopVectorPreHeader);
  // ...
  DT->addNewBlock(LoopMiddleBlock, LoopVectorBody);
  DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]);
  DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);
  DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]);

  DEBUG(DT->verifyDomTree());
}
static bool canIfConvertPHINodes(BasicBlock *BB) {
  for (Instruction &I : *BB) {
    auto *Phi = dyn_cast<PHINode>(&I);
    if (!Phi)
      return true;
    for (Value *V : Phi->incoming_values())
      if (auto *C = dyn_cast<Constant>(V))
        if (C->canTrap())
          return false;
  }
  return true;
}

bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
  if (!EnableIfConversion) {
    ORE->emit(createMissedAnalysis(/* ... */)
              << "if-conversion is disabled");
    return false;
  }
  // ...
  // Collect pointers that are accessed in always-executed blocks; loads from
  // them are safe to speculate when if-converting.
  for (BasicBlock *BB : TheLoop->blocks()) {
    if (blockNeedsPredication(BB))
      continue;
    // ...
  }

  // Check that we can if-convert each block in the loop.
  for (BasicBlock *BB : TheLoop->blocks()) {
    if (!isa<BranchInst>(BB->getTerminator())) {
      ORE->emit(createMissedAnalysis(/* ... */)
                << "loop contains a switch statement");
      return false;
    }

    if (blockNeedsPredication(BB)) {
      if (!blockCanBePredicated(BB, SafePointes)) {
        ORE->emit(createMissedAnalysis(/* ... */)
                  << "control flow cannot be substituted for a select");
        return false;
      }
    } else if (BB != Header && !canIfConvertPHINodes(BB)) {
      ORE->emit(createMissedAnalysis(/* ... */)
                << "control flow cannot be substituted for a select");
      return false;
    }
  }

  // We can if-convert this loop.
  return true;
}
bool LoopVectorizationLegality::canVectorize() {
  // The loop must be in canonical form with a preheader.
  if (!TheLoop->getLoopPreheader()) {
    ORE->emit(createMissedAnalysis(/* ... */)
              << "loop control flow is not understood by vectorizer");
    return false;
  }

  // We can only vectorize innermost loops.
  if (!TheLoop->empty()) {
    ORE->emit(createMissedAnalysis(/* ... */)
              << "loop is not the innermost loop");
    return false;
  }

  // We must have a single backedge.
  if (TheLoop->getNumBackEdges() != 1) {
    ORE->emit(createMissedAnalysis(/* ... */)
              << "loop control flow is not understood by vectorizer");
    return false;
  }

  // We must have a single exiting block.
  if (!TheLoop->getExitingBlock()) {
    ORE->emit(createMissedAnalysis(/* ... */)
              << "loop control flow is not understood by vectorizer");
    return false;
  }

  // The exiting block must also be the latch (a bottom-tested loop).
  if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
    ORE->emit(createMissedAnalysis(/* ... */)
              << "loop control flow is not understood by vectorizer");
    return false;
  }

  // ...
  if (NumBlocks != 1 && !canVectorizeWithIfConvert()) {
    DEBUG(dbgs() << "LV: Can't if-convert the loop.\n");
    return false;
  }

  // ScalarEvolution needs to be able to find the exit count.
  if (ExitCount == PSE.getSE()->getCouldNotCompute()) {
    ORE->emit(createMissedAnalysis(/* ... */)
              << "could not determine number of loop iterations");
    DEBUG(dbgs() << "LV: SCEV could not compute the loop exit count.\n");
    return false;
  }

  // Check if we can vectorize the instructions and CFG in this loop.
  if (!canVectorizeInstrs()) {
    DEBUG(dbgs() << "LV: Can't vectorize the instructions or CFG\n");
    return false;
  }

  // Go over each instruction and look at memory dependences.
  if (!canVectorizeMemory()) {
    DEBUG(dbgs() << "LV: Can't vectorize due to memory conflicts\n");
    return false;
  }

  DEBUG(dbgs() << "LV: We can vectorize this loop"
               << (LAI->getRuntimePointerChecking()->Need
                       ? " (with a runtime bound check)"
                       : "")
               << "!\n");

  // Analyze interleaved memory accesses.
  if (UseInterleaved)
    InterleaveInfo.analyzeInterleaving(*getSymbolicStrides());

  // Collect all instructions that are known to be uniform after vectorization.
  collectLoopUniforms();

  // Collect all instructions that are known to be scalar after vectorization.
  collectLoopScalars();

  unsigned SCEVThreshold = VectorizeSCEVCheckThreshold;
  if (Hints->getForce() == LoopVectorizeHints::FK_Enabled)
    SCEVThreshold = PragmaVectorizeSCEVCheckThreshold;

  if (PSE.getUnionPredicate().getComplexity() > SCEVThreshold) {
    ORE->emit(createMissedAnalysis(/* ... */)
              << "Too many SCEV assumptions need to be made and checked "
              << "at runtime");
    DEBUG(dbgs() << "LV: Too many SCEV checks needed.\n");
    return false;
  }

  return true;
}
  // Reduction and induction instructions are allowed to have exit users; all
  // other instructions must have no users outside the loop.
  if (!AllowedExit.count(Inst))
    // Check that all of the users of the loop are inside the loop.
    for (User *U : Inst->users()) {
      Instruction *UI = cast<Instruction>(U);
      if (!TheLoop->contains(UI)) {
        DEBUG(dbgs() << "LV: Found an outside user for : " << *UI << '\n');
        return true;
      }
    }
  return false;
}

void LoopVectorizationLegality::addInductionPhi(
    PHINode *Phi, const InductionDescriptor &ID,
    SmallPtrSetImpl<Value *> &AllowedExit) {
  Inductions[Phi] = ID;
  // ...
  // Prefer the widest induction type as the canonical induction variable;
  // reuse the phi if its type already matches.
  if (!Induction || PhiTy == WidestIndTy)
    Induction = Phi;
  // ...
  DEBUG(dbgs() << "LV: Found an induction variable.\n");
}
bool LoopVectorizationLegality::canVectorizeInstrs() {
  // ...
  // For each block in the loop, scan the instructions and look for hazards.
  for (BasicBlock *BB : TheLoop->blocks()) {
    for (Instruction &I : *BB) {
      if (auto *Phi = dyn_cast<PHINode>(&I)) {
        // Check that this PHI type is allowed.
        if (!PhiTy->isIntegerTy() && !PhiTy->isFloatingPointTy() &&
            !PhiTy->isPointerTy()) {
          ORE->emit(createMissedAnalysis(/* ... */)
                    << "loop control flow is not understood by vectorizer");
          DEBUG(dbgs() << "LV: Found an non-int non-pointer PHI.\n");
          return false;
        }

        // A PHI outside the header block becomes a select during
        // if-conversion; it only needs an outside-user check.
        if (BB != Header) {
          if (!hasOutsideLoopUser(TheLoop, Phi, AllowedExit))
            continue;
          ORE->emit(createMissedAnalysis(/* ... */)
                    << "value could not be identified as "
                       "an induction or reduction variable");
          return false;
        }

        // We only allow if-converted PHIs with exactly two incoming values.
        if (Phi->getNumIncomingValues() != 2) {
          ORE->emit(createMissedAnalysis(/* ... */)
                    << "control flow not understood by vectorizer");
          DEBUG(dbgs() << "LV: Found an invalid PHI.\n");
          return false;
        }

        RecurrenceDescriptor RedDes;
        if (RecurrenceDescriptor::isReductionPHI(Phi, TheLoop, RedDes)) {
          // ...
          Reductions[Phi] = RedDes;
          continue;
        }

        InductionDescriptor ID;
        if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID)) {
          addInductionPhi(Phi, ID, AllowedExit);
          continue;
        }

        if (RecurrenceDescriptor::isFirstOrderRecurrence(Phi, TheLoop, DT)) {
          FirstOrderRecurrences.insert(Phi);
          continue;
        }

        // As a last resort, try classifying the PHI as an induction again
        // while allowing SCEV predicates.
        if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID, true)) {
          addInductionPhi(Phi, ID, AllowedExit);
          continue;
        }

        ORE->emit(createMissedAnalysis(/* ... */)
                  << "value that could not be identified as "
                     "reduction is used outside the loop");
        DEBUG(dbgs() << "LV: Found an unidentified PHI." << *Phi << "\n");
        return false;
      } // end of PHI handling

      // We handle calls that are debug intrinsics, map to an IR intrinsic, or
      // have a vector library version.
      if (CI && !getVectorIntrinsicIDForCall(CI, TLI) &&
          !isa<DbgInfoIntrinsic>(CI) &&
          !(CI->getCalledFunction() && TLI &&
            TLI->isFunctionVectorizable(CI->getCalledFunction()->getName()))) {
        ORE->emit(createMissedAnalysis(/* ... */)
                  << "call instruction cannot be vectorized");
        DEBUG(dbgs() << "LV: Found a non-intrinsic, non-libfunc callsite.\n");
        return false;
      }

      // Intrinsics with a scalar operand (e.g. powi) are only legal when that
      // operand is loop invariant.
      if (CI && hasVectorInstrinsicScalarOpd(
                    getVectorIntrinsicIDForCall(CI, TLI), 1)) {
        auto *SE = PSE.getSE();
        if (!SE->isLoopInvariant(PSE.getSCEV(CI->getOperand(1)), TheLoop)) {
          ORE->emit(createMissedAnalysis(/* ... */)
                    << "intrinsic instruction cannot be vectorized");
          DEBUG(dbgs() << "LV: Found unvectorizable intrinsic " << *CI << "\n");
          return false;
        }
      }

      // Check that the instruction return type is vectorizable. We also
      // can't vectorize extractelement instructions.
      if ((!VectorType::isValidElementType(I.getType()) &&
           !I.getType()->isVoidTy()) ||
          isa<ExtractElementInst>(I)) {
        ORE->emit(createMissedAnalysis(/* ... */)
                  << "instruction return type cannot be vectorized");
        DEBUG(dbgs() << "LV: Found unvectorizable type.\n");
        return false;
      }

      // Check that the stored type is vectorizable.
      if (auto *ST = dyn_cast<StoreInst>(&I)) {
        Type *T = ST->getValueOperand()->getType();
        if (!VectorType::isValidElementType(T)) {
          ORE->emit(createMissedAnalysis(/* ... */)
                    << "store instruction cannot be vectorized");
          return false;
        }
        // ...
      } else if (I.getType()->isFloatingPointTy() && (CI || I.isBinaryOp()) &&
                 !I.hasUnsafeAlgebra()) {
        DEBUG(dbgs() << "LV: Found FP op with unsafe algebra.\n");
        Hints->setPotentiallyUnsafe();
      }

      // Reduction instructions are allowed to have exit users; all other
      // instructions must have no users outside the loop.
      if (hasOutsideLoopUser(TheLoop, &I, AllowedExit)) {
        ORE->emit(createMissedAnalysis(/* ... */)
                  << "value cannot be used outside the loop");
        return false;
      }
    } // next instr.
  }

  if (!Induction) {
    DEBUG(dbgs() << "LV: Did not find one integer induction var.\n");
    if (Inductions.empty()) {
      ORE->emit(createMissedAnalysis(/* ... */)
                << "loop induction variable could not be identified");
      return false;
    }
  }

  // If the found induction is narrower than the widest induction type, unset
  // it here; the vectorizer will create a canonical one itself.
  if (Induction && WidestIndTy != Induction->getType())
    Induction = nullptr;

  return true;
}
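// collectLoopScalars gathers instructions that will remain scalar: every
// uniform instruction, GEPs that feed scalarized (non-gather/scatter)
// accesses, and induction/update pairs whose users all remain scalar.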
void LoopVectorizationLegality::collectLoopScalars() {
  // An instruction that is uniform after vectorization is also scalar.
  Scalars.insert(Uniforms.begin(), Uniforms.end());

  // Collect the getelementptr instructions that will not be vectorized: a GEP
  // used by a memory access stays scalar unless the access becomes a gather
  // or scatter.
  for (auto *BB : TheLoop->blocks())
    for (auto &I : *BB) {
      if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
        Scalars.insert(GEP);
        continue;
      }
      // ...
      auto *GEP = getGEPInstruction(Ptr);
      if (GEP && isLegalGatherOrScatter(&I))
        Scalars.erase(GEP);
    }

  // An induction variable remains scalar if all users of it and of its update
  // remain scalar.
  for (auto &Induction : *getInductionVars()) {
    auto *Ind = Induction.first;
    // ...
    // Determine if all users of the induction variable are scalar after
    // vectorization.
    auto ScalarInd = all_of(Ind->users(), [&](User *U) -> bool {
      auto *I = cast<Instruction>(U);
      return I == IndUpdate || !TheLoop->contains(I) || Scalars.count(I);
    });
    if (!ScalarInd)
      continue;

    // Determine if all users of the induction variable update instruction are
    // scalar after vectorization.
    auto ScalarIndUpdate = all_of(IndUpdate->users(), [&](User *U) -> bool {
      auto *I = cast<Instruction>(U);
      return I == Ind || !TheLoop->contains(I) || Scalars.count(I);
    });
    if (!ScalarIndUpdate)
      continue;

    // The induction variable and its update instruction will remain scalar.
    Scalars.insert(Ind);
    Scalars.insert(IndUpdate);
  }
}
bool LoopVectorizationLegality::hasConsecutiveLikePtrOperand(Instruction *I) {
  if (isAccessInterleaved(I))
    return true;
  if (auto *Ptr = getPointerOperand(I))
    return isConsecutivePtr(Ptr);
  return false;
}

bool LoopVectorizationLegality::isScalarWithPredication(Instruction *I) {
  if (!blockNeedsPredication(I->getParent()))
    return false;
  switch (I->getOpcode()) {
  default:
    break;
  case Instruction::Store:
    return !isMaskRequired(I);
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::SRem:
  case Instruction::URem:
    return mayDivideByZero(*I);
  }
  return false;
}

bool LoopVectorizationLegality::memoryInstructionMustBeScalarized(
    Instruction *I, unsigned VF) {
  // If the memory instruction is in an interleaved group, it will be
  // vectorized and its pointer will remain uniform.
  if (isAccessInterleaved(I))
    return false;

  // Get and ensure we have a valid memory instruction.
  assert((LI || SI) && "Invalid memory instruction");

  // A load from a uniform (loop-invariant) address is scalarized.
  if (LI && isUniform(Ptr))
    return true;

  // If the pointer is non-consecutive and no gather or scatter operation is
  // legal, the access is scalarized.
  if (!isConsecutivePtr(Ptr) && !isLegalGatherOrScatter(I))
    return true;

  // A store located in a predicated block is scalarized.
  if (isScalarWithPredication(I))
    return true;

  // If the instruction's allocated size doesn't equal its type size, it
  // requires padding and will be scalarized.
  auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
  if (hasIrregularType(ScalarTy, DL, VF))
    return true;

  // Otherwise, the memory instruction should be vectorized.
  return false;
}
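// collectLoopUniforms computes a closure of provably uniform instructions: it
// seeds a worklist with the latch condition and consecutive-like pointers,
// then repeatedly adds any operand whose users are all already uniform or out
// of scope, and finally re-admits induction/update pairs when safe.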
void LoopVectorizationLegality::collectLoopUniforms() {
  // Global values, params, and instructions outside of the current loop are
  // out of scope.
  auto isOutOfScope = [&](Value *V) -> bool {
    Instruction *I = dyn_cast<Instruction>(V);
    return (!I || !TheLoop->contains(I));
  };

  // ...
  // Start with the condition of the loop latch branch; if it is used only by
  // the branch, it is uniform.
  if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) {
    Worklist.insert(Cmp);
    DEBUG(dbgs() << "LV: Found uniform instruction: " << *Cmp << "\n");
  }

  // Iterate over the instructions in the loop and collect all consecutive-
  // like pointer operands in ConsecutiveLikePtrs. If a pointer operand may be
  // scalarized, record it in PossibleNonUniformPtrs instead.
  for (auto *BB : TheLoop->blocks())
    for (auto &I : *BB) {
      // ...
      auto UsersAreMemAccesses = all_of(Ptr->users(), [&](User *U) -> bool {
        // ...
      });

      // If the pointer has users beyond the memory accesses themselves, or if
      // the access must be scalarized, the pointer cannot be assumed uniform.
      if (!UsersAreMemAccesses || memoryInstructionMustBeScalarized(&I))
        PossibleNonUniformPtrs.insert(Ptr);

      // If the memory instruction will be vectorized and its pointer operand
      // is consecutive-like, the pointer operand should remain uniform.
      else if (hasConsecutiveLikePtrOperand(&I))
        ConsecutiveLikePtrs.insert(Ptr);

      // Otherwise, the instruction must be a gather or scatter, and its
      // pointer will not remain uniform.
      else
        PossibleNonUniformPtrs.insert(Ptr);
    }

  // Add to the worklist any consecutive-like pointer that was not also
  // identified as possibly non-uniform.
  for (auto *V : ConsecutiveLikePtrs)
    if (!PossibleNonUniformPtrs.count(V)) {
      DEBUG(dbgs() << "LV: Found uniform instruction: " << *V << "\n");
      Worklist.insert(V);
    }

  // Expand the worklist: whenever a new instruction is added, examine its
  // operands as well.
  unsigned idx = 0;
  while (idx != Worklist.size()) {
    Instruction *I = Worklist[idx++];

    for (auto OV : I->operand_values()) {
      if (isOutOfScope(OV))
        continue;
      auto *OI = cast<Instruction>(OV);
      if (all_of(OI->users(), [&](User *U) -> bool {
            return isOutOfScope(U) || Worklist.count(cast<Instruction>(U));
          })) {
        Worklist.insert(OI);
        DEBUG(dbgs() << "LV: Found uniform instruction: " << *OI << "\n");
      }
    }
  }

  // An induction variable and its update remain uniform if all their users
  // are uniform, outside the loop, or vectorized memory accesses that use
  // them only as an address.
  for (auto &Induction : Inductions) {
    auto *Ind = Induction.first;
    // ...
    auto UniformInd = all_of(Ind->users(), [&](User *U) -> bool {
      auto *I = cast<Instruction>(U);
      return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
             isVectorizedMemAccessUse(I, Ind);
    });
    if (!UniformInd)
      continue;

    auto UniformIndUpdate = all_of(IndUpdate->users(), [&](User *U) -> bool {
      auto *I = cast<Instruction>(U);
      return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
             isVectorizedMemAccessUse(I, IndUpdate);
    });
    if (!UniformIndUpdate)
      continue;

    // The induction variable and its update instruction will remain uniform.
    Worklist.insert(Ind);
    Worklist.insert(IndUpdate);
    DEBUG(dbgs() << "LV: Found uniform instruction: " << *Ind << "\n");
    DEBUG(dbgs() << "LV: Found uniform instruction: " << *IndUpdate << "\n");
  }

  Uniforms.insert(Worklist.begin(), Worklist.end());
}
bool LoopVectorizationLegality::canVectorizeMemory() {
  LAI = &(*GetLAA)(*TheLoop);
  InterleaveInfo.setLAI(LAI);
  // Re-emit any analysis report produced by loop access analysis, prefixed
  // with "loop not vectorized: ".
  // ... ("loop not vectorized: ", *LAR);

  if (!LAI->canVectorizeMemory())
    return false;

  if (LAI->hasStoreToLoopInvariantAddress()) {
    ORE->emit(createMissedAnalysis(/* ... */)
              << "write to a loop invariant address could not be vectorized");
    DEBUG(dbgs() << "LV: We don't allow storing to uniform addresses\n");
    return false;
  }

  Requirements->addRuntimePointerChecks(LAI->getNumRuntimePointerChecks());
  PSE.addPredicate(LAI->getPSE().getUnionPredicate());

  return true;
}

bool LoopVectorizationLegality::isInductionVariable(const Value *V) {
  Value *In0 = const_cast<Value *>(V);
  PHINode *PN = dyn_cast_or_null<PHINode>(In0);
  if (!PN)
    return false;
  return Inductions.count(PN);
}

bool LoopVectorizationLegality::isFirstOrderRecurrence(const PHINode *Phi) {
  return FirstOrderRecurrences.count(Phi);
}

bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) {
  return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
}

bool LoopVectorizationLegality::blockCanBePredicated(
    BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs) {
  const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel();

  for (Instruction &I : *BB) {
    // Check that we don't have a constant expression that can trap as an
    // operand.
    for (Value *Operand : I.operands()) {
      if (auto *C = dyn_cast<Constant>(Operand))
        if (C->canTrap())
          return false;
    }

    // We might be able to hoist or mask the load.
    if (I.mayReadFromMemory()) {
      auto *LI = dyn_cast<LoadInst>(&I);
      if (!LI)
        return false;
      if (!SafePtrs.count(LI->getPointerOperand())) {
        if (isLegalMaskedLoad(LI->getType(), LI->getPointerOperand()) ||
            isLegalMaskedGather(LI->getType())) {
          MaskedOp.insert(LI);
          continue;
        }
        if (IsAnnotatedParallel)
          continue;
        return false;
      }
    }

    if (I.mayWriteToMemory()) {
      auto *SI = dyn_cast<StoreInst>(&I);
      if (!SI)
        return false;
      // A predicated store requires some form of masking: a masked
      // store/scatter instruction, or an element-by-element predicate check
      // with scalar stores.
      if (isLegalMaskedStore(SI->getValueOperand()->getType(),
                             SI->getPointerOperand()) ||
          isLegalMaskedScatter(SI->getValueOperand()->getType())) {
        MaskedOp.insert(SI);
        continue;
      }
      bool isSafePtr = (SafePtrs.count(SI->getPointerOperand()) != 0);
      // ...
      if (++NumPredStores > NumberOfStoresToPredicate || !isSafePtr ||
          !isSinglePredecessor)
        return false;
    }
    if (I.mayThrow())
      return false;
  }

  return true;
}
void InterleavedAccessInfo::collectConstStrideAccesses(
    MapVector<Instruction *, StrideDescriptor> &AccessStrideInfo,
    const ValueToValueMap &Strides) {
  auto &DL = TheLoop->getHeader()->getModule()->getDataLayout();

  // Walk all blocks in the loop in program order and record, for every load
  // and store, its constant stride (if any), SCEV, size, and alignment.
  for (auto &I : *BB) {
      // ...
      int64_t Stride = getPtrStride(PSE, Ptr, TheLoop, Strides,
                                    /*Assume=*/true, /*ShouldCheckWrap=*/false);
      // ...
      unsigned Align = LI ? LI->getAlignment() : SI->getAlignment();
      // ...
      AccessStrideInfo[&I] = StrideDescriptor(Stride, Scev, Size, Align);
  }
}
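// analyzeInterleaving groups constant-strided accesses that sit close enough
// together to be loaded or stored as one wide vector. For example, assuming a
// stride-2 pair such as:
//
//   A[2*i]      (distance 0 elements, index 0)
//   A[2*i + 1]  (distance 1 element,  index 1)
//
// both members land in one group of factor 2, later lowered to a single wide
// access plus shuffles. The pairwise scan below enforces equal stride and
// size, a distance divisible by the member size, and legal reordering.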
void InterleavedAccessInfo::analyzeInterleaving(
    const ValueToValueMap &Strides) {
  DEBUG(dbgs() << "LV: Analyzing interleaved accesses...\n");

  // Holds all accesses with a constant stride.
  MapVector<Instruction *, StrideDescriptor> AccessStrideInfo;
  collectConstStrideAccesses(AccessStrideInfo, Strides);

  if (AccessStrideInfo.empty())
    return;

  // Collect the dependences in the loop.
  collectDependences();

  // Search in bottom-up program order for pairs of accesses (A and B) that
  // can form interleaved groups.
  for (auto BI = AccessStrideInfo.rbegin(), E = AccessStrideInfo.rend();
       BI != E; ++BI) {
    StrideDescriptor DesB = BI->second;

    // Initialize a group for B if it has an allowable stride. Even if we
    // don't create a group for B, we continue with the bottom-up algorithm to
    // ensure we don't break any of B's dependences.
    InterleaveGroup *Group = nullptr;
    if (isStrided(DesB.Stride)) {
      Group = getInterleaveGroup(B);
      if (!Group) {
        DEBUG(dbgs() << "LV: Creating an interleave group with:" << *B << '\n');
        Group = createInterleaveGroup(B, DesB.Stride, DesB.Align);
      }
      if (B->mayWriteToMemory())
        StoreGroups.insert(Group);
      else
        LoadGroups.insert(Group);
    }

    for (auto AI = std::next(BI); AI != E; ++AI) {
      StrideDescriptor DesA = AI->second;

      // Our code motion strategy will move all accesses in a group to the
      // location of the first access, so A and B must be reorderable; if not,
      // any store group containing A has to be invalidated.
      if (!canReorderMemAccessesForInterleavedGroups(&*AI, &*BI)) {
        // ...
        if (isInterleaved(A)) {
          InterleaveGroup *StoreGroup = getInterleaveGroup(A);
          StoreGroups.remove(StoreGroup);
          releaseGroup(StoreGroup);
        }
        // ...
      }

      // If either A or B isn't strided, there's nothing left to do.
      if (!isStrided(DesA.Stride) || !isStrided(DesB.Stride))
        continue;

      // ...
      // All members of a group must have the same stride and size.
      if (DesA.Stride != DesB.Stride || DesA.Size != DesB.Size)
        continue;

      // ...
      // The distance from A to B must be a multiple of the member size, or A
      // cannot belong to B's group.
      if (DistanceToB % static_cast<int64_t>(DesB.Size))
        continue;

      // The index of A is the index of B plus A's distance to B in members.
      int IndexA =
          Group->getIndex(B) + DistanceToB / static_cast<int64_t>(DesB.Size);

      // Try to insert A into B's group.
      if (Group->insertMember(A, IndexA, DesA.Align)) {
        DEBUG(dbgs() << "LV: Inserted:" << *A << '\n'
                     << "    into the interleave group with" << *B << '\n');
        InterleaveGroupMap[A] = Group;

        // Set the first load in program order as the insert position.
        if (A->mayReadFromMemory())
          Group->setInsertPos(A);
      }
    } // Iteration over A accesses.
  }   // Iteration over B accesses.

  // Remove interleaved store groups with gaps.
  for (InterleaveGroup *Group : StoreGroups)
    if (Group->getNumMembers() != Group->getFactor())
      releaseGroup(Group);

  // Invalidate load groups whose first or last member could wrap around the
  // address space, now that wrap checks are enforced.
  for (InterleaveGroup *Group : LoadGroups) {
    // A full group needs no extra checks.
    if (Group->getNumMembers() == Group->getFactor())
      continue;

    // ...
    if (!getPtrStride(PSE, FirstMemberPtr, TheLoop, Strides, /*Assume=*/false,
                      /*ShouldCheckWrap=*/true)) {
      DEBUG(dbgs() << "LV: Invalidate candidate interleaved group due to "
                      "first group member potentially pointer-wrapping.\n");
      releaseGroup(Group);
      continue;
    }
    Instruction *LastMember = Group->getMember(Group->getFactor() - 1);
    if (LastMember) {
      // ...
      if (!getPtrStride(PSE, LastMemberPtr, TheLoop, Strides, /*Assume=*/false,
                        /*ShouldCheckWrap=*/true)) {
        DEBUG(dbgs() << "LV: Invalidate candidate interleaved group due to "
                        "last group member potentially pointer-wrapping.\n");
        releaseGroup(Group);
      }
    } else {
      // A non-reversed interleaved load group with gaps needs at least one
      // scalar epilogue iteration to avoid reading past the last element.
      if (Group->isReverse()) {
        releaseGroup(Group);
        continue;
      }
      DEBUG(dbgs() << "LV: Interleaved group requires epilogue iteration.\n");
      RequiresScalarEpilogue = true;
    }
  }
}
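// selectVectorizationFactor picks VF from register width and element types.
// As an illustration, with a 256-bit widest register and a 32-bit widest
// type, MaxVectorSize = 256 / 32 = 8 lanes; the cost loop below then compares
// the per-lane cost of each power-of-two width against the scalar cost.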
LoopVectorizationCostModel::VectorizationFactor
LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) {
  // Width 1 means no vectorization.
  if (OptForSize && Legal->getRuntimePointerChecking()->Need) {
    ORE->emit(createMissedAnalysis(/* ... */)
              << "runtime pointer checks needed. Enable vectorization of this "
                 "loop with '#pragma clang loop vectorize(enable)' when "
                 "compiling with -Os/-Oz");
    DEBUG(dbgs()
          << "LV: Aborting. Runtime ptr check is required with -Os/-Oz.\n");
    return Factor;
  }

  if (!EnableCondStoresVectorization && Legal->getNumPredStores()) {
    ORE->emit(createMissedAnalysis(/* ... */)
              << "store that is conditionally executed prevents vectorization");
    DEBUG(dbgs() << "LV: No vectorization. There are conditional stores.\n");
    return Factor;
  }

  unsigned SmallestType, WidestType;
  std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
  unsigned MaxSafeDepDist = -1U;

  // ...
  if (Legal->getMaxSafeDepDistBytes() != -1U)
    MaxSafeDepDist =
        Legal->getMaxSafeDepDistBytes() * 8 / Legal->getMaxInterleaveFactor();

  WidestRegister =
      ((WidestRegister < MaxSafeDepDist) ? WidestRegister : MaxSafeDepDist);
  unsigned MaxVectorSize = WidestRegister / WidestType;

  DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
               << " / " << WidestType << " bits.\n");
  DEBUG(dbgs() << "LV: The Widest register is: " << WidestRegister
               << " bits.\n");

  if (MaxVectorSize == 0) {
    DEBUG(dbgs() << "LV: The target has no vector registers.\n");
    MaxVectorSize = 1;
  }

  assert(MaxVectorSize <= 64 && "Did not expect to pack so many elements"
                                " into one vector!");

  unsigned VF = MaxVectorSize;
  if (MaximizeBandwidth && !OptForSize) {
    // Collect all viable vectorization factors.
    unsigned NewMaxVectorSize = WidestRegister / SmallestType;
    for (unsigned VS = MaxVectorSize; VS <= NewMaxVectorSize; VS *= 2)
      VFs.push_back(VS);

    // For each VF calculate its register usage.
    auto RUs = calculateRegisterUsage(VFs);

    // Select the largest VF that doesn't require more registers than the
    // target provides.
    for (int i = RUs.size() - 1; i >= 0; --i) {
      if (RUs[i].MaxLocalUsers <= TargetNumRegisters) {
        VF = VFs[i];
        break;
      }
    }
  }

  // If we optimize the program for size, avoid creating a tail loop.
  if (OptForSize) {
    DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');

    // If we don't know the precise trip count, don't try to vectorize.
    if (TC < 2) {
      ORE->emit(createMissedAnalysis(/* ... */)
                << "unable to calculate the loop count due to complex "
                   "control flow");
      DEBUG(dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n");
      return Factor;
    }

    // Find the maximum SIMD width that divides the trip count.
    VF = TC % MaxVectorSize;

    if (VF == 0)
      VF = MaxVectorSize;
    else {
      // A non-zero remainder means we would require a tail loop.
      ORE->emit(createMissedAnalysis(/* ... */)
                << "cannot optimize for size and vectorize at the "
                   "same time. Enable vectorization of this loop "
                   "with '#pragma clang loop vectorize(enable)' "
                   "when compiling with -Os/-Oz");
      DEBUG(dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n");
      return Factor;
    }
  }

  int UserVF = Hints->getWidth();
  if (UserVF != 0) {
    DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");

    Factor.Width = UserVF;
    collectInstsToScalarize(UserVF);
    return Factor;
  }

  float Cost = expectedCost(1).first;
  const float ScalarCost = Cost;
  unsigned Width = 1;
  DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");

  bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
  // Ignore the scalar width if the user explicitly wants vectorization.
  if (ForceVectorization && VF > 1) {
    Width = 2;
    Cost = expectedCost(Width).first / (float)Width;
  }

  for (unsigned i = 2; i <= VF; i *= 2) {
    // The vector loop executes fewer iterations, so divide the cost of the
    // vector loop by the width to get a per-lane cost.
    VectorizationCostTy C = expectedCost(i);
    float VectorCost = C.first / (float)i;
    DEBUG(dbgs() << "LV: Vector loop of width " << i
                 << " costs: " << (int)VectorCost << ".\n");
    if (!C.second && !ForceVectorization) {
      DEBUG(
          dbgs() << "LV: Not considering vector loop of width " << i
                 << " because it will not generate any vector instructions.\n");
      continue;
    }
    if (VectorCost < Cost) {
      Cost = VectorCost;
      Width = i;
    }
  }

  DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
        << "LV: Vectorization seems to be not beneficial, "
        << "but was forced by a user.\n");
  DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
  Factor.Width = Width;
  Factor.Cost = Width * Cost;
  return Factor;
}
std::pair<unsigned, unsigned>
LoopVectorizationCostModel::getSmallestAndWidestTypes() {
  unsigned MinWidth = -1U;
  unsigned MaxWidth = 8;
  const DataLayout &DL = TheFunction->getParent()->getDataLayout();

  // For each instruction in the loop:
  for (Instruction &I : *BB) {
      Type *T = I.getType();

      // Skip ignored values.
      if (ValuesToIgnore.count(&I))
        continue;

      // Only examine loads, stores, and PHI nodes.
      if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
        continue;

      // For PHI nodes, only examine reduction variables, and use the
      // recurrence type.
      if (auto *PN = dyn_cast<PHINode>(&I)) {
        if (!Legal->isReductionVariable(PN))
          continue;
        // ...
      }

      // Examine the stored value's type.
      if (auto *ST = dyn_cast<StoreInst>(&I))
        T = ST->getValueOperand()->getType();

      // Ignore pointer types for non-consecutive accesses.
      if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I))
        continue;

      MinWidth = std::min(MinWidth,
                          (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
      MaxWidth = std::max(MaxWidth,
                          (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
  }

  return {MinWidth, MaxWidth};
}
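// selectInterleaveCount derives the interleave count from spare registers.
// Illustratively, assuming 16 target registers, 2 loop-invariant values, and
// a maximum of 3 simultaneously live local values, IC = PowerOf2Floor((16 -
// 2) / 3) = 4. The result is then clamped by the target's max interleave
// factor and by heuristics for small loops, reductions, and load/store port
// saturation; the numbers here are examples, not target data.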
unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize,
                                                           unsigned VF,
                                                           unsigned LoopCost) {
  // ...
  // Do not interleave if a maximum safe dependence distance constrains the
  // loop.
  if (Legal->getMaxSafeDepDistBytes() != -1U)
    return 1;
  // ...
  DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
               << " registers\n");
  // ...
  RegisterUsage R = calculateRegisterUsage({VF})[0];
  // We divide by these constants, so assume at least one instruction that
  // uses at least one register.
  R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U);
  R.NumInstructions = std::max(R.NumInstructions, 1U);

  // Spread the registers that are not loop-invariant across concurrent
  // copies of the loop body.
  unsigned IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs) /
                              R.MaxLocalUsers);

  // Don't count the induction variable as interleaved.
  if (EnableIndVarRegisterHeur)
    IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs - 1) /
                       std::max(1U, (R.MaxLocalUsers - 1)));

  // ...
  if (LoopCost == 0)
    LoopCost = expectedCost(VF).first;

  // Clamp the interleave count to the maximum the target allows.
  if (IC > MaxInterleaveCount)
    IC = MaxInterleaveCount;
  else if (IC < 1)
    IC = 1;

  // Interleave if we vectorized this loop and there is a reduction that could
  // benefit from interleaving.
  if (VF > 1 && Legal->getReductionVars()->size()) {
    DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
    return IC;
  }

  // If we've already vectorized the loop, the runtime check was done, so
  // interleaving won't require further checks.
  bool InterleavingRequiresRuntimePointerCheck =
      (VF == 1 && Legal->getRuntimePointerChecking()->Need);

  // Interleave small loops to reduce loop overhead and potentially expose
  // ILP opportunities.
  DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
  if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
    // ...
    // Interleave until store/load ports (estimated by the max interleave
    // count) are saturated.
    unsigned NumStores = Legal->getNumStores();
    unsigned NumLoads = Legal->getNumLoads();
    unsigned StoresIC = IC / (NumStores ? NumStores : 1);
    unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);

    // Limit the interleave count for scalar reductions in nested loops.
    if (Legal->getReductionVars()->size() && TheLoop->getLoopDepth() > 1) {
      SmallIC = std::min(SmallIC, MaxNestedScalarReductionIC);
    }

    if (EnableLoadStoreRuntimeInterleave &&
        std::max(StoresIC, LoadsIC) > SmallIC) {
      DEBUG(dbgs() << "LV: Interleaving to saturate store or load ports.\n");
      return std::max(StoresIC, LoadsIC);
    }

    DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
    return SmallIC;
  }

  // Interleave large loops that could benefit from ILP.
  bool HasReductions = (Legal->getReductionVars()->size() > 0);
  if (TTI.enableAggressiveInterleaving(HasReductions)) {
    DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
    return IC;
  }

  DEBUG(dbgs() << "LV: Not Interleaving.\n");
  return 1;
}
SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
  // This function calculates register usage by measuring the highest number
  // of values that are simultaneously alive at a single program point.
  RegisterUsage RU;
  RU.NumInstructions = 0;

  // Each 'key' in the map opens a new interval. The value of the map is the
  // index of the 'last seen' usage of the instruction that is the key.
  IntervalMap EndPoint;
  // ...

  unsigned Index = 0;
  for (BasicBlock *BB : TheLoop->getBlocks()) {
    RU.NumInstructions += BB->size();
    for (Instruction &I : *BB) {
      IdxToInstr[Index++] = &I;

      // Save the end location of each USE.
      for (Value *U : I.operands()) {
        auto *Instr = dyn_cast<Instruction>(U);

        // Ignore non-instruction values such as arguments and constants.
        if (!Instr)
          continue;

        // If this instruction is outside the loop, record it and continue.
        if (!TheLoop->contains(Instr)) {
          LoopInvariants.insert(Instr);
          continue;
        }

        // Overwrite the previous end point.
        EndPoint[Instr] = Index;
      }
    }
  }

  // ...
  // Get the size of the widest register.
  unsigned MaxSafeDepDist = -1U;
  if (Legal->getMaxSafeDepDistBytes() != -1U)
    MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8;
  unsigned WidestRegister =
      std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist);
  const DataLayout &DL = TheFunction->getParent()->getDataLayout();

  DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");

  // A lambda that gets the register usage for the given type and VF.
  auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) {
    if (Ty->isTokenTy())
      return 0U;
    unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType());
    return std::max<unsigned>(1, VF * TypeSize / WidestRegister);
  };

  for (unsigned int i = 0; i < Index; ++i) {
    // Remove all of the intervals that end at this location.
    InstrList &List = TransposeEnds[i];
    for (Instruction *ToRemove : List)
      OpenIntervals.erase(ToRemove);

    // Skip ignored values.
    if (ValuesToIgnore.count(I))
      continue;

    // For each VF, find the maximum register usage.
    for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
      if (VFs[j] == 1) {
        MaxUsages[j] = std::max(MaxUsages[j], OpenIntervals.size());
        continue;
      }

      // Count the number of live intervals.
      unsigned RegUsage = 0;
      for (auto Inst : OpenIntervals) {
        // Skip ignored values for VF > 1.
        if (VecValuesToIgnore.count(Inst))
          continue;
        RegUsage += GetRegUsage(Inst->getType(), VFs[j]);
      }
      MaxUsages[j] = std::max(MaxUsages[j], RegUsage);
    }

    DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
                 << OpenIntervals.size() << '\n');

    // Add the current instruction to the list of open intervals.
    OpenIntervals.insert(I);
  }

  for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
    unsigned Invariant = 0;
    if (VFs[i] == 1)
      Invariant = LoopInvariants.size();
    else
      for (auto Inst : LoopInvariants)
        Invariant += GetRegUsage(Inst->getType(), VFs[i]);

    DEBUG(dbgs() << "LV(REG): VF = " << VFs[i] << '\n');
    DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsages[i] << '\n');
    DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant << '\n');
    DEBUG(dbgs() << "LV(REG): LoopSize: " << RU.NumInstructions << '\n');

    RU.LoopInvariantRegs = Invariant;
    RU.MaxLocalUsers = MaxUsages[i];
  }
  // ...
}
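// collectInstsToScalarize / computePredInstDiscount decide when leaving a
// predicated instruction (and its single-use feeder chain) scalar beats
// if-conversion. Illustratively, with VF = 4, a per-lane scalar cost of 2 is
// scaled to 4 * 2 = 8, halved by the block-execution probability estimate to
// 4, and compared against the vector cost; a non-negative accumulated
// discount selects the scalar form. The numbers are examples, not target
// costs.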
void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) {
  // If we aren't vectorizing the loop, or if we've already collected the
  // instructions to scalarize for this VF, there's nothing to do.
  if (VF < 2 || InstsToScalarize.count(VF))
    return;

  // Initialize a mapping for VF in InstsToScalarize. The presence of VF in
  // the map will indicate that we've analyzed it already.
  ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];

  // Find all instructions that are scalar with predication and determine if
  // it would be better to not if-convert the blocks they are in.
  for (Instruction &I : *BB)
      if (Legal->isScalarWithPredication(&I)) {
        ScalarCostsTy ScalarCosts;
        if (computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
          ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
      }
}
int LoopVectorizationCostModel::computePredInstDiscount(
    Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts,
    unsigned VF) {
  assert(!Legal->isUniformAfterVectorization(PredInst) &&
         "Instruction marked uniform-after-vectorization will be predicated");

  // ...
  // Returns true if the given instruction can be scalarized: it must form a
  // single-use chain within the predicated block and not already be known
  // scalar.
  auto canBeScalarized = [&](Instruction *I) -> bool {
    if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
        Legal->isScalarAfterVectorization(I))
      return false;

    // Instructions that are scalar with predication are analyzed separately.
    if (Legal->isScalarWithPredication(I))
      return false;

    // If any operand is uniform after vectorization, the instruction cannot
    // be scalarized (an extract would be needed to break out the lane).
    for (Use &U : I->operands())
      if (auto *J = dyn_cast<Instruction>(U.get()))
        if (Legal->isUniformAfterVectorization(J))
          return false;

    return true;
  };

  // Returns true if a non-scalarizable operand must be extracted from a
  // vector.
  auto needsExtract = [&](Instruction *I) -> bool {
    return TheLoop->contains(I) && !Legal->isScalarAfterVectorization(I);
  };

  // Compute the expected cost discount from scalarizing the entire expression
  // feeding the predicated instruction.
  while (!Worklist.empty()) {
    Instruction *I = Worklist.pop_back_val();

    // If we've already analyzed the instruction, there's nothing to do.
    if (ScalarCosts.count(I))
      continue;

    // Compute the cost of the vector instruction; this already includes the
    // scalarization overhead of the predicated instruction.
    unsigned VectorCost = getInstructionCost(I, VF).first;

    // Compute the cost of the scalarized instruction: the cost as if it
    // weren't if-converted and instead remained in the predicated block.
    unsigned ScalarCost = VF * getInstructionCost(I, 1).first;

    // Account for insertelement/extractelement overhead on operands.
    for (Use &U : I->operands())
      if (auto *J = dyn_cast<Instruction>(U.get())) {
        assert(VectorType::isValidElementType(J->getType()) &&
               "Instruction has non-scalar type");
        if (canBeScalarized(J))
          Worklist.push_back(J);
        else if (needsExtract(J))
          ScalarCost += getScalarizationOverhead(ToVectorTy(J->getType(), VF),
                                                 false, true, TTI);
      }

    // Scale the total scalar cost by the probability of executing the
    // predicated block.
    ScalarCost /= getReciprocalPredBlockProb();

    // Compute the discount. A non-negative discount means the vector version
    // of the instruction costs more, and scalarizing would be beneficial.
    Discount += VectorCost - ScalarCost;
    ScalarCosts[I] = ScalarCost;
  }

  return Discount;
}
LoopVectorizationCostModel::VectorizationCostTy
LoopVectorizationCostModel::expectedCost(unsigned VF) {
  VectorizationCostTy Cost;

  // Collect the instructions (and their associated costs) that will be more
  // profitable to scalarize.
  collectInstsToScalarize(VF);

  // For each block:
  for (BasicBlock *BB : TheLoop->blocks()) {
    VectorizationCostTy BlockCost;

    // For each instruction in the old loop:
    for (Instruction &I : *BB) {
      // Skip dbg intrinsics.
      if (isa<DbgInfoIntrinsic>(I))
        continue;

      // Skip ignored values.
      if (ValuesToIgnore.count(&I))
        continue;

      VectorizationCostTy C = getInstructionCost(&I, VF);

      // ...
      BlockCost.first += C.first;
      BlockCost.second |= C.second;
      DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first << " for VF "
                   << VF << " For instruction: " << I << '\n');
    }

    // In the scalar case, a predicated block may not always execute, so scale
    // its cost by the probability of executing it.
    if (VF == 1 && Legal->blockNeedsPredication(BB))
      BlockCost.first /= getReciprocalPredBlockProb();

    Cost.first += BlockCost.first;
    Cost.second |= BlockCost.second;
  }

  return Cost;
}
static bool isLikelyComplexAddressComputation(Value *Ptr,
                                              LoopVectorizationLegality *Legal,
                                              ScalarEvolution *SE,
                                              const Loop *TheLoop) {
  auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
  if (!Gep)
    return true;

  // We are looking for a GEP where all indices are either loop invariant or
  // inductions.
  unsigned NumOperands = Gep->getNumOperands();
  for (unsigned i = 1; i < NumOperands; ++i) {
    Value *Opd = Gep->getOperand(i);
    if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
        !Legal->isInductionVariable(Opd))
      return true;
  }
  // ...
}
LoopVectorizationCostModel::VectorizationCostTy
LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
  // If we know that this instruction will remain uniform, check the cost of
  // the scalar version.
  if (Legal->isUniformAfterVectorization(I))
    VF = 1;

  if (VF > 1 && isProfitableToScalarize(I, VF))
    return VectorizationCostTy(InstsToScalarize[VF][I], false);

  Type *VectorTy;
  unsigned C = getInstructionCost(I, VF, VectorTy);

  bool TypeNotScalarized =
      VF > 1 && !VectorTy->isVoidTy() && TTI.getNumberOfParts(VectorTy) < VF;
  return VectorizationCostTy(C, TypeNotScalarized);
}
unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
                                                        unsigned VF,
                                                        Type *&VectorTy) {
  Type *RetTy = I->getType();
  if (canTruncateToMinimalBitwidth(I, VF))
    RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
  VectorTy = ToVectorTy(RetTy, VF);
  auto SE = PSE.getSE();

  switch (I->getOpcode()) {
  case Instruction::GetElementPtr:
    // GEPs are free here: their cost is folded into the cost of the memory
    // instruction that consumes them.
    return 0;
  case Instruction::Br: {
    return TTI.getCFInstrCost(I->getOpcode());
  }
  case Instruction::PHI: {
    auto *Phi = cast<PHINode>(I);

    // First-order recurrences are replaced by vector shuffles inside the
    // loop.
    if (VF > 1 && Legal->isFirstOrderRecurrence(Phi))
      return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
                                VectorTy, VF - 1, VectorTy);
    // ...
    return TTI.getCFInstrCost(Instruction::PHI);
  }
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::URem:
  case Instruction::SRem:
    // A predicated division may not execute for each vector lane: use the
    // scalarization cost scaled by the probability of executing the block.
    // If the instruction is not predicated, fall through to the next case.
    if (VF > 1 && Legal->isScalarWithPredication(I)) {
      unsigned Cost = 0;
      // ... (scalarized cost plus insert/extract overhead)
      return Cost / getReciprocalPredBlockProb();
    }
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    // Certain instructions can be cheaper to vectorize if they have a
    // constant or uniform second operand; classify the operand kinds.
    Value *Op2 = I->getOperand(1);
    if (isa<ConstantInt>(Op2)) {
      Op2VK = TargetTransformInfo::OK_UniformConstantValue;
      // ...
    } else if (isa<ConstantVector>(Op2) || isa<ConstantDataVector>(Op2)) {
      Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
      // ...
    } else if (Legal->isUniform(Op2)) {
      Op2VK = TargetTransformInfo::OK_UniformValue;
    }
    return TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, Op1VK,
                                      Op2VK, Op1VP, Op2VP, Operands);
  }
  case Instruction::Select: {
    bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
    // ...
    return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy);
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    Type *ValTy = I->getOperand(0)->getType();
    if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
      ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
    VectorTy = ToVectorTy(ValTy, VF);
    return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy);
  }
  case Instruction::Store:
  case Instruction::Load: {
    VectorTy = ToVectorTy(ValTy, VF);

    unsigned Alignment = SI ? SI->getAlignment() : LI->getAlignment();
    // ...
    // A uniform load is loaded once and then broadcast.
    if (LI && Legal->isUniform(Ptr)) {
      // ...
    }

    // For interleaved accesses, cost the wide load/store plus the shuffles,
    // once per group (at the insert position).
    if (Legal->isAccessInterleaved(I)) {
      auto Group = Legal->getInterleavedAccessGroup(I);
      assert(Group && "Fail to get an interleaved access group.");

      // Only calculate the cost once at the insert position.
      if (Group->getInsertPos() != I)
        return 0;

      unsigned InterleaveFactor = Group->getFactor();
      Type *WideVecTy =
          VectorType::get(VectorTy->getVectorElementType(),
                          VectorTy->getVectorNumElements() * InterleaveFactor);

      // Holds the indices of existing members in an interleaved load group;
      // store groups don't allow gaps.
      if (LI)
        for (unsigned i = 0; i < InterleaveFactor; i++)
          if (Group->getMember(i))
            Indices.push_back(i);

      // Calculate the cost of the whole interleaved group.
      unsigned Cost = TTI.getInterleavedMemoryOpCost(
          I->getOpcode(), WideVecTy, Group->getFactor(), Indices,
          Group->getAlignment(), AS);

      if (Group->isReverse())
        Cost += Group->getNumMembers() *
                TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
      return Cost;
    }

    // Check if the memory instruction will be scalarized.
    if (Legal->memoryInstructionMustBeScalarized(I, VF)) {
      unsigned Cost = 0;
      // ... (address computation, scalarized accesses, and insert/extract
      //      overhead)

      // If the instruction executes conditionally, scale the cost by the
      // probability of executing the predicated block.
      if (Legal->isScalarWithPredication(I))
        Cost /= getReciprocalPredBlockProb();

      return Cost;
    }

    // Determine if the pointer operand is consecutive or reverse consecutive.
    int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
    bool Reverse = ConsecutiveStride < 0;

    // Determine if either a gather or a scatter operation is legal.
    bool UseGatherOrScatter =
        !ConsecutiveStride && Legal->isLegalGatherOrScatter(I);

    if (UseGatherOrScatter) {
      assert(ConsecutiveStride == 0 &&
             "Gather/Scatter are not used for consecutive stride");
      return Cost +
             TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr,
                                        Legal->isMaskRequired(I), Alignment);
    }
    // Wide loads/stores.
    if (Legal->isMaskRequired(I))
      Cost +=
          TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
    else
      Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);

    if (Reverse)
      Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
    return Cost;
  }
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    // Truncations of induction variables with constant integer steps cost the
    // same as the scalar operation.
    if (I->getOpcode() == Instruction::Trunc &&
        Legal->isInductionVariable(I->getOperand(0)))
      return TTI.getCastInstrCost(I->getOpcode(), I->getType(),
                                  I->getOperand(0)->getType());

    Type *SrcVecTy = ToVectorTy(SrcScalarTy, VF);
    if (canTruncateToMinimalBitwidth(I, VF)) {
      // The minimal vector type suffices for truncations to a
      // narrower-than-minimal type, and likewise for extensions from a
      // wider-than-minimal type.
      Type *MinVecTy = VectorTy;
      if (I->getOpcode() == Instruction::Trunc) {
        // ...
      } else if (I->getOpcode() == Instruction::ZExt ||
                 I->getOpcode() == Instruction::SExt) {
        // ...
      }
    }

    return TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy);
  }
  case Instruction::Call: {
    bool NeedToScalarize;
    // ... (compare vector call cost against vector intrinsic cost)
  }
  default:
    // All other instructions are scalarized: VF copies of the scalar cost
    // plus insert/extract overhead. (Body elided.)
    break;
  } // end of switch.
}
static const char lv_name[] = "Loop Vectorization";
// ... (pass registration boilerplate)

namespace llvm {
Pass *createLoopVectorizePass(bool NoUnrolling, bool AlwaysVectorize) {
  return new LoopVectorize(NoUnrolling, AlwaysVectorize);
}
}

bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
  // Check if the pointer operand of a load or store instruction is
  // consecutive.
  if (Value *Ptr = getPointerOperand(Inst))
    return Legal->isConsecutivePtr(Ptr);
  return false;
}
void LoopVectorizationCostModel::collectValuesToIgnore() {
  // ...
  // Ignore type-promoting instructions we identified during reduction
  // detection.
  for (auto &Reduction : *Legal->getReductionVars()) {
    // ...
  }

  // Insert values known to be scalar into VecValuesToIgnore.
  for (auto *BB : TheLoop->getBlocks())
    for (auto &I : *BB)
      if (Legal->isScalarAfterVectorization(&I))
        VecValuesToIgnore.insert(&I);
}
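// For the unroller (VF = 1), "vectorizing" an instruction means cloning it
// once per unroll part; predicated clones are guarded by the first lane of
// the block mask and collected for later block splitting.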
void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr,
                                             bool IfPredicateInstr) {
  // ...
  setDebugLocFromInst(Builder, Instr);

  // Initialize a new scalar map entry.
  ScalarParts Entry(UF);

  VectorParts Cond;
  if (IfPredicateInstr)
    Cond = createBlockInMask(Instr->getParent());

  // For each unroll part:
  for (unsigned Part = 0; Part < UF; ++Part) {
    Entry[Part].resize(1);

    // Start an "if (pred) a[i] = ..." block.
    Value *Cmp = nullptr;
    if (IfPredicateInstr) {
      if (Cond[Part]->getType()->isVectorTy())
        Cond[Part] =
            Builder.CreateExtractElement(Cond[Part], Builder.getInt32(0));
      Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cond[Part],
                               ConstantInt::get(Cond[Part]->getType(), 1));
    }

    Instruction *Cloned = Instr->clone();
    // ...
    // Replace the operands of the cloned instruction with their scalar
    // equivalents in the new loop.
    for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
      auto *NewOp = getScalarValue(Instr->getOperand(op), Part, 0);
      Cloned->setOperand(op, NewOp);
    }

    // Place the cloned scalar in the new loop.
    Builder.Insert(Cloned);

    // Add the cloned scalar to the scalar map entry.
    Entry[Part][0] = Cloned;

    // If we just cloned a new assumption, add it to the assumption cache.
    if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
      if (II->getIntrinsicID() == Intrinsic::assume)
        AC->registerAssumption(II);

    // End if-block.
    if (IfPredicateInstr)
      PredicatedInstructions.push_back(std::make_pair(Cloned, Cmp));
  }
  VectorLoopValueMap.initScalar(Instr, Entry);
}
void InnerLoopUnroller::vectorizeMemoryInstruction(Instruction *Instr) {
  auto *SI = dyn_cast<StoreInst>(Instr);
  bool IfPredicateInstr = (SI && Legal->blockNeedsPredication(SI->getParent()));

  return scalarizeInstruction(Instr, IfPredicateInstr);
}

Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }

Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }

Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step,
                                        Instruction::BinaryOps BinOp) {
  // When unrolling with VF == 1, we only need to add a simple scalar step.
  // ...
  return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
}

static void AddRuntimeUnrollDisableMetaData(Loop *L) {
  bool IsUnrollMetadata = false;
  MDNode *LoopID = L->getLoopID();
  if (LoopID) {
    // First, look for existing loop unrolling disable metadata.
    // ...
  }

  if (!IsUnrollMetadata) {
    // Add runtime unroll disable metadata.
    // ...
    DisableOperands.push_back(
        MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
    // ...
  }
}
bool LoopVectorizePass::processLoop(Loop *L) {
  assert(L->empty() && "Only process inner loops.");

  const std::string DebugLocStr = getDebugLocString(L);

  DEBUG(dbgs() << "\nLV: Checking a loop in \""
               << L->getHeader()->getParent()->getName() << "\" from "
               << DebugLocStr << "\n");

  LoopVectorizeHints Hints(L, DisableUnrolling, *ORE);

  DEBUG(dbgs() << "LV: Loop hints:"
               << " force="
               << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
                       ? "disabled"
                       : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
                              ? "enabled"
                              : "?"))
               << " width=" << Hints.getWidth()
               << " unroll=" << Hints.getInterleave() << "\n");

  // ...
  if (!Hints.allowVectorization(F, L, AlwaysVectorize)) {
    DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
    return false;
  }

  // Check the loop against the trip count threshold: vectorize loops with a
  // tiny trip count only when explicitly forced.
  const unsigned MaxTC = SE->getSmallConstantMaxTripCount(L);
  if (MaxTC > 0u && MaxTC < TinyTripCountVectorThreshold) {
    DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
                 << "This loop is not worth vectorizing.");
    if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
      DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
    else {
      DEBUG(dbgs() << "\n");
      ORE->emit(createMissedAnalysis(/* ... */)
                << "vectorization is not beneficial "
                   "and is not explicitly forced");
      return false;
    }
  }

  // Check if it is legal to vectorize the loop.
  LoopVectorizationRequirements Requirements(*ORE);
  LoopVectorizationLegality LVL(L, PSE, DT, TLI, AA, F, TTI, GetLAA, LI, ORE,
                                &Requirements, &Hints);
  if (!LVL.canVectorize()) {
    DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
    emitMissedWarning(F, L, Hints, ORE);
    return false;
  }

  // Use the cost model.
  LoopVectorizationCostModel CM(L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, F,
                                &Hints);
  CM.collectValuesToIgnore();

  // Check the function attributes to find out if this function should be
  // optimized for size.
  bool OptForSize =
      Hints.getForce() != LoopVectorizeHints::FK_Enabled && F->optForSize();

  // Also optimize for size if the loop's entry frequency is cold relative to
  // the function entry.
  // ...
  if (Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
      LoopEntryFreq < ColdEntryFreq)
    OptForSize = true;

  // Check the function attributes to see if implicit floats are allowed.
  if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
    DEBUG(dbgs() << "LV: Can't vectorize when the NoImplicitFloat"
                    " attribute is used.\n");
    ORE->emit(createMissedAnalysis(Hints.vectorizeAnalysisPassName(),
                                   "NoImplicitFloat", L)
              << "loop not vectorized due to NoImplicitFloat attribute");
    emitMissedWarning(F, L, Hints, ORE);
    return false;
  }

  // Check if the target supports potentially unsafe FP vectorization.
  if (Hints.isPotentiallyUnsafe() &&
      TTI->isFPVectorizationPotentiallyUnsafe()) {
    DEBUG(dbgs() << "LV: Potentially unsafe FP op prevents vectorization.\n");
    ORE->emit(createMissedAnalysis(/* ... */)
              << "loop not vectorized due to unsafe FP support.");
    emitMissedWarning(F, L, Hints, ORE);
    return false;
  }

  // Select the optimal vectorization factor.
  const LoopVectorizationCostModel::VectorizationFactor VF =
      CM.selectVectorizationFactor(OptForSize);

  // Select the interleave count.
  unsigned IC = CM.selectInterleaveCount(OptForSize, VF.Width, VF.Cost);

  // Get the user interleave count.
  unsigned UserIC = Hints.getInterleave();

  // Identify the diagnostic messages that should be produced.
  std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
  bool VectorizeLoop = true, InterleaveLoop = true;
  if (Requirements.doesNotMeet(F, L, Hints)) {
    DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
                    "requirements.\n");
    emitMissedWarning(F, L, Hints, ORE);
    return false;
  }

  if (VF.Width == 1) {
    DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
    VecDiagMsg = std::make_pair(
        "VectorizationNotBeneficial",
        "the cost-model indicates that vectorization is not beneficial");
    VectorizeLoop = false;
  }

  if (IC == 1 && UserIC <= 1) {
    // Tell the user interleaving is not beneficial.
    DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
    IntDiagMsg = std::make_pair(
        "InterleavingNotBeneficial",
        "the cost-model indicates that interleaving is not beneficial");
    InterleaveLoop = false;
    if (UserIC == 1) {
      IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
      IntDiagMsg.second +=
          " and is explicitly disabled or interleave count is set to 1";
    }
  } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial but explicitly disabled.
    DEBUG(dbgs()
          << "LV: Interleaving is beneficial but is explicitly disabled.");
    IntDiagMsg = std::make_pair(
        "InterleavingBeneficialButDisabled",
        "the cost-model indicates that interleaving is beneficial "
        "but is explicitly disabled or interleave count is set to 1");
    InterleaveLoop = false;
  }

  // Override IC if the user provided an interleave count.
  IC = UserIC > 0 ? UserIC : IC;

  // Emit diagnostic messages, if any.
  const char *VAPassName = Hints.vectorizeAnalysisPassName();
  if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
    ORE->emit(OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
                                       L->getStartLoc(), L->getHeader())
              << VecDiagMsg.second);
    ORE->emit(OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
                                       L->getStartLoc(), L->getHeader())
              << IntDiagMsg.second);
    return false;
  } else if (!VectorizeLoop && InterleaveLoop) {
    DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
    ORE->emit(OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
                                         L->getStartLoc(), L->getHeader())
              << VecDiagMsg.second);
  } else if (VectorizeLoop && !InterleaveLoop) {
    DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width << ") in "
                 << DebugLocStr << '\n');
    ORE->emit(OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
                                         L->getStartLoc(), L->getHeader())
              << IntDiagMsg.second);
  } else if (VectorizeLoop && InterleaveLoop) {
    DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width << ") in "
                 << DebugLocStr << '\n');
    DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
  }

  using namespace ore;
  if (!VectorizeLoop) {
    assert(IC > 1 && "interleave count should not be 1 or 0");
    // If it is not legal or profitable to vectorize the loop, interleave it.
    InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
                               &CM);
    Unroller.vectorize();

    ORE->emit(OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
                                 L->getHeader())
              << "interleaved loop (interleaved count: "
              << NV("InterleaveCount", IC) << ")");
  } else {
    // If it is *legal* to vectorize the loop, then do it.
    InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
                           &LVL, &CM);
    LB.vectorize();
    ++LoopsVectorized;

    // Add metadata to disable runtime unrolling of the scalar loop when
    // there are no runtime checks about strides and memory.
    if (!LB.areSafetyChecksAdded())
      AddRuntimeUnrollDisableMetaData(L);

    // Report the vectorization decision.
    ORE->emit(OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
                                 L->getHeader())
              << "vectorized loop (vectorization width: "
              << NV("VectorizationFactor", VF.Width)
              << ", interleaved count: " << NV("InterleaveCount", IC) << ")");
  }

  // Mark the loop as already vectorized to avoid vectorizing it again.
  Hints.setAlreadyVectorized();
  return true;
}

bool LoopVectorizePass::runImpl(
    Function &F, ScalarEvolution &SE_, LoopInfo &LI_,
    TargetTransformInfo &TTI_, DominatorTree &DT_, BlockFrequencyInfo &BFI_,
    TargetLibraryInfo *TLI_, DemandedBits &DB_, AliasAnalysis &AA_,
    AssumptionCache &AC_, std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
    OptimizationRemarkEmitter &ORE_) {
  // ...
  // Build up a worklist of inner loops to vectorize: vectorizing or partially
  // unrolling a loop creates new loops and can invalidate iterators.
  for (Loop *L : *LI)
    addAcyclicInnerLoop(*L, Worklist);

  LoopsAnalyzed += Worklist.size();

  // Now walk the identified inner loops.
  bool Changed = false;
  while (!Worklist.empty())
    Changed |= processLoop(Worklist.pop_back_val());

  // Process each loop nest in the function.
  return Changed;
}

PreservedAnalyses LoopVectorizePass::run(Function &F,
                                         FunctionAnalysisManager &AM) {
  // ...
  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
      [&](Loop &L) -> const LoopAccessInfo & {
    return LAM.getResult<LoopAccessAnalysis>(L);
  };
  bool Changed =
      runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE);
  // ...
}
Legacy wrapper pass to provide the GlobalsAAResult object.
unsigned getNumBackEdges() const
Calculate the number of back edges to the loop header.
Pass interface - Implemented by all 'passes'.
static unsigned RuntimeMemoryCheckThreshold
\brief When performing memory disambiguation checks at runtime do not make more than this number of c...
DomTreeNodeBase< NodeT > * addNewBlock(NodeT *BB, NodeT *DomBB)
Add a new node to the dominator tree information.
static unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract, const TargetTransformInfo &TTI)
Estimate the overhead of scalarizing a value based on its type.
Value * getValueOperand()
SymbolTableList< Instruction >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
void push_back(const T &Elt)
This builds on the llvm/ADT/GraphTraits.h file to find the strongly connected components (SCCs) of a ...
A parsed version of the target data layout string in and methods for querying it. ...
void copyFastMathFlags(FastMathFlags FMF)
Convenience function for transferring all fast-math flag values to this instruction, which must be an operator which supports these flags.
bool processLoop(Loop *L)
void ReplaceInstWithInst(BasicBlock::InstListType &BIL, BasicBlock::iterator &BI, Instruction *I)
Replace the instruction specified by BI with the instruction specified by I.
This class is the base class for the comparison instructions.
LLVM_NODISCARD LLVM_ATTRIBUTE_ALWAYS_INLINE bool endswith(StringRef Suffix) const
Check if this string ends with the given Suffix.
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
uint64_t getMaxSafeDepDistBytes() const
static IntegerType * getInt1Ty(LLVMContext &C)
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function. ...
DiagnosticInfoOptimizationBase::Argument NV
STATISTIC(NumFunctions,"Total number of functions")
BasicBlock * SplitBlock(BasicBlock *Old, Instruction *SplitPt, DominatorTree *DT=nullptr, LoopInfo *LI=nullptr)
Split the specified block at the specified instruction - everything before SplitPt stays in Old and e...
ValueT lookup(const KeyT &Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
static cl::opt< unsigned > ForceTargetNumScalarRegs("force-target-num-scalar-regs", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's number of scalar registers."))
Instruction * getUnsafeAlgebraInst()
Returns first unsafe algebra instruction in the PHI node's use-chain.
int getWidth()
Get the width of a number.
This is the interface for a simple mod/ref and alias analysis over globals.
A Module instance is used to store all the information related to an LLVM module. ...
void replaceOperandWith(unsigned I, Metadata *New)
Replace a specific operand.
static Constant * getSequentialMask(IRBuilder<> &Builder, unsigned NumInt, unsigned NumUndef)
static MDString * get(LLVMContext &Context, StringRef Str)
const Value * getSplatValue(const Value *V)
Get splat value if the input is a splat vector or return nullptr.
Interval Class - An Interval is a set of nodes defined such that every node in the interval has all o...
DebugLoc getStartLoc() const
Return the debug location of the start of this loop.
Min/max implemented in terms of select(cmp()).
This class represents zero extension of integer types.
unsigned getNumOperands() const
static Value * ConcatenateVectors(IRBuilder<> &Builder, ArrayRef< Value * > InputList)
unsigned getNumOperands() const
Return number of MDNode operands.
value_op_iterator value_op_begin()
static void AddRuntimeUnrollDisableMetaData(Loop *L)
The main scalar evolution driver.
This class represents a function call, abstracting a target machine's calling convention.
int64_t getPtrStride(PredicatedScalarEvolution &PSE, Value *Ptr, const Loop *Lp, const ValueToValueMap &StridesMap=ValueToValueMap(), bool Assume=false, bool ShouldCheckWrap=true)
If the pointer has a constant stride return it in units of its element size.
size_type count(PtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
An immutable pass that tracks lazily created AssumptionCache objects.
static Type * largestIntegerVectorType(Type *T1, Type *T2)
static bool isReductionPHI(PHINode *Phi, Loop *TheLoop, RecurrenceDescriptor &RedDes)
Returns true if Phi is a reduction in TheLoop.
A cache of .assume calls within a function.
Analysis pass providing the TargetTransformInfo.
static cl::opt< unsigned > TinyTripCountVectorThreshold("vectorizer-min-trip-count", cl::init(16), cl::Hidden, cl::desc("Don't vectorize loops with a constant ""trip count that is smaller than this ""value."))
We don't vectorize loops with a known constant trip count below this number.
Externally visible function.
void initializeLoopVectorizePass(PassRegistry &)
This class implements a map that also provides access to all stored values in a deterministic order...
bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
value_op_iterator value_op_end()
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly...
LoopT * getParentLoop() const
const Function * getParent() const
Return the enclosing method, or null if none.
static cl::opt< bool > MaximizeBandwidth("vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden, cl::desc("Maximize bandwidth when selecting vectorization factor which ""will be determined by the smallest type in loop."))
const Instruction & front() const
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
The adaptor from a function pass to a loop pass computes these analyses and makes them available to t...
Analysis pass which computes a DominatorTree.
An instruction for reading from memory.
FunctionType * getType(LLVMContext &Context, ID id, ArrayRef< Type * > Tys=None)
Return the function type for an intrinsic.
BlockT * getExitBlock() const
If getExitBlocks would return exactly one block, return that block.
Type * getElementType() const
static cl::opt< unsigned > NumberOfStoresToPredicate("vectorize-num-stores-pred", cl::init(1), cl::Hidden, cl::desc("Max number of stores to be predicated behind an if."))
The number of stores in a loop that are allowed to need predication.
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
iterator end()
Get an iterator to the end of the SetVector.
size_type size() const
Determine the number of elements in the SetVector.
static Value * ConcatenateTwoVectors(IRBuilder<> &Builder, Value *V1, Value *V2)
BlockT * getHeader() const
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
uint64_t getTypeAllocSizeInBits(Type *Ty) const
Returns the offset in bits between successive objects of the specified type, including alignment padd...
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
StringRef getName() const
Return a constant reference to the value's name.
Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
iterator begin()
Instruction iterator methods.
IntegerType * getInt32Ty()
Fetch the type representing a 32-bit integer.
AnalysisUsage & addRequired()
#define INITIALIZE_PASS_DEPENDENCY(depName)
std::string str() const
Return the twine contents as a std::string.
bool hasUnsafeAlgebra()
Returns true if the recurrence has unsafe algebra which requires a relaxed floating-point model...
static Value * getPointerOperand(Instruction &Inst)
This class represents the LLVM 'select' instruction.
bool isIdenticalTo(const Instruction *I) const
Return true if the specified instruction is exactly identical to the current one. ...
struct fuzzer::@269 Flags
static const unsigned MaxVectorWidth
Maximum SIMD width.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal)
This is the base class for all instructions that perform data casts.
const APInt & getValue() const
Return the constant as an APInt value reference.
const Module * getModule() const
Return the module owning the function this basic block belongs to, or nullptr it the function does no...
A Use represents the edge between a Value definition and its users.
static cl::opt< bool > EnableLoadStoreRuntimeInterleave("enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden, cl::desc("Enable runtime interleaving until load/store ports are saturated"))
static cl::opt< unsigned > MaxInterleaveGroupFactor("max-interleave-group-factor", cl::Hidden, cl::desc("Maximum factor for an interleaved access group (default = 8)"), cl::init(8))
Maximum factor for an interleaved memory access.
static Constant * get(ArrayRef< Constant * > V)
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
An analysis that produces DemandedBits for a function.
Legacy analysis pass which computes BlockFrequencyInfo.
void setName(const Twine &Name)
Change the name of the value.
Analysis pass that exposes the LoopInfo for a function.
ConstantInt * getConstIntStepValue() const
Type * getVectorElementType() const
Instruction * clone() const
Create a copy of 'this' instruction that is identical in all ways except the following: ...
static void cse(BasicBlock *BB)
Perform cse of induction variable instructions.
bool remove(const value_type &X)
Remove an item from the set vector.
static const unsigned TinyTripCountInterleaveThreshold
We don't interleave loops with a known constant trip count below this number.
void addPredicate(const SCEVPredicate &Pred)
Adds a new predicate.
LLVM_NODISCARD bool empty() const
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
user_iterator_impl< User > user_iterator
Class to represent function types.
static unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF, const TargetTransformInfo &TTI, const TargetLibraryInfo *TLI)
static unsigned getAlignment(GlobalVariable *GV)
bool insert(const value_type &X)
Insert a new element into the SetVector.
bool mayReadFromMemory() const
Return true if this instruction may read memory.
LLVM_NODISCARD LLVM_ATTRIBUTE_ALWAYS_INLINE bool startswith(StringRef Prefix) const
Check if this string starts with the given Prefix.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
static bool isEqual(const Function &Caller, const Function &Callee)
void addBasicBlockToLoop(BlockT *NewBB, LoopInfoBase< BlockT, LoopT > &LI)
This method is used by other analyses to update loop information.
Value * transform(IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL) const
Compute the transformed value of Index at offset StartValue using step StepValue. ...
void addChildLoop(LoopT *NewChild)
Add the specified loop to be a child of this loop.
Pass * createLoopVectorizePass(bool NoUnrolling=false, bool AlwaysVectorize=true)
static bool isValidElementType(Type *ElemTy)
Return true if the specified type is valid as a element type.
Function Alias Analysis false
BasicBlock * getSuccessor(unsigned i) const
iterator begin()
Get an iterator to the beginning of the SetVector.
static Type * smallestIntegerVectorType(Type *T1, Type *T2)
static FunctionType * get(Type *Result, ArrayRef< Type * > Params, bool isVarArg)
This static method is the primary way of constructing a FunctionType.
static CmpInst * Create(OtherOps Op, Predicate predicate, Value *S1, Value *S2, const Twine &Name="", Instruction *InsertBefore=nullptr)
Construct a compare instruction, given the opcode, the predicate and the two operands.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory)...
bool isFloatingPointTy() const
Return true if this is one of the six floating-point types.
Function * getDeclaration(Module *M, ID id, ArrayRef< Type * > Tys=None)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
Not an induction variable.
static GCRegistry::Add< OcamlGC > B("ocaml","ocaml 3.10-compatible GC")
An instruction for storing to memory.
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
static cl::opt< unsigned > ForceTargetMaxScalarInterleaveFactor("force-target-max-scalar-interleave", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's max interleave factor for ""scalar loops."))
static Constant * getInterleavedMask(IRBuilder<> &Builder, unsigned VF, unsigned NumVec)
void takeName(Value *V)
Transfer the name from V to this value.
iterator find(const KeyT &Key)
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree...
Type * getScalarType() const LLVM_READONLY
If this is a vector type, return the element type, otherwise return 'this'.
This class represents a truncation of integer types.
size_t size() const
size - Get the array size.
Maximum length of the test input libFuzzer tries to guess a good value based on the corpus and reports it always prefer smaller inputs during the corpus shuffle When libFuzzer itself reports a bug this exit code will be used If indicates the maximal total time in seconds to run the fuzzer minimizes the provided crash input Use with etc Experimental Use value profile to guide fuzzing Number of simultaneous worker processes to run the jobs If min(jobs, NumberOfCpuCores()/2)\" is used.") FUZZER_FLAG_INT(reload
bool isInBounds() const
Determine whether the GEP has the inbounds flag.
bool runImpl(Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_, DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_, DemandedBits &DB_, AliasAnalysis &AA_, AssumptionCache &AC_, std::function< const LoopAccessInfo &(Loop &)> &GetLAA_, OptimizationRemarkEmitter &ORE)
Class to represent pointers.
static GCRegistry::Add< CoreCLRGC > E("coreclr","CoreCLR-compatible GC")
unsigned getNumIncomingValues() const
Return the number of incoming edges.
iterator_range< block_iterator > blocks() const
Pointer induction var. Step = C / sizeof(elem).
static bool canIfConvertPHINodes(BasicBlock *BB)
Check whether it is safe to if-convert this phi node.
const SCEV * getSCEV(Value *V)
Returns the SCEV expression of V, in the context of the current SCEV predicate.
Function * getFunction(StringRef Name) const
Look up the specified function in the module symbol table.
Optimization analysis message produced during vectorization.
an instruction for type-safe pointer arithmetic to access elements of arrays and structs ...
const SCEV * getCouldNotCompute()
BlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate IR basic block frequen...
initializer< Ty > init(const Ty &Val)
This instruction inserts a single (scalar) element into a VectorType value.
static cl::opt< bool > EnableCondStoresVectorization("enable-cond-stores-vec", cl::init(true), cl::Hidden, cl::desc("Enable if predication of stores during vectorization."))
Integer induction variable. Step = C.
unsigned getAlignment() const
Return the alignment of the access that is being performed.
A set of analyses that are preserved following a run of a transformation pass.
* if(!EatIfPresent(lltok::kw_thread_local)) return false
ParseOptionalThreadLocal := /*empty.
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
constexpr bool isPowerOf2_32(uint32_t Value)
isPowerOf2_32 - This function returns true if the argument is a power of two > 0. ...
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs...ExtraArgs)
Get the result of an analysis pass for a given IR unit.
LLVM Basic Block Representation.
static cl::opt< unsigned > MaxNestedScalarReductionIC("max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden, cl::desc("The maximum interleave count to use when interleaving a scalar ""reduction in a nested loop."))
The instances of the Type class are immutable: once they are created, they are never changed...
This is an important class for using LLVM in a threaded context.
Type * getType() const
Return the LLVM type of this SCEV expression.
Conditional or Unconditional Branch instruction.
Min/max implemented in terms of select(cmp()).
bool isVectorTy() const
True if this is an instance of VectorType.
Value handle that tracks a Value across RAUW.
static cl::opt< unsigned > PragmaVectorizeSCEVCheckThreshold("pragma-vectorize-scev-check-threshold", cl::init(128), cl::Hidden, cl::desc("The maximum number of SCEV checks allowed with a " "vectorize(enable) pragma"))
This is an important base class in LLVM.
This analysis provides dependence information for the memory accesses of a loop.
const Value * getCondition() const
Instruction * getUnsafeAlgebraInst()
Returns induction operator that does not have "fast-math" property and requires FP unsafe mode...
int64_t getSExtValue() const
Get sign extended value.
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static Type * getWiderType(const DataLayout &DL, Type *Ty0, Type *Ty1)
static Value * addFastMathFlag(Value *V)
Adds a 'fast' flag to floating point operations.
MapVector< Instruction *, uint64_t > computeMinimumValueSizes(ArrayRef< BasicBlock * > Blocks, DemandedBits &DB, const TargetTransformInfo *TTI=nullptr)
Compute a map of integer instructions to their minimum legal type size.
A manager for alias analyses.
APInt Or(const APInt &LHS, const APInt &RHS)
Bitwise OR function for APInt.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
APInt Xor(const APInt &LHS, const APInt &RHS)
Bitwise XOR function for APInt.
LLVM_NODISCARD LLVM_ATTRIBUTE_ALWAYS_INLINE StringRef substr(size_t Start, size_t N=npos) const
Return a reference to the substring from [Start, Start + N).
static Constant * getStridedMask(IRBuilder<> &Builder, unsigned Start, unsigned Stride, unsigned VF)
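Again the body is not shown here; a sketch consistent with the signature, selecting lanes Start, Start + Stride, Start + 2*Stride, ...:

static Constant *getStridedMask(IRBuilder<> &Builder, unsigned Start,
                                unsigned Stride, unsigned VF) {
  // getStridedMask(Builder, 1, 2, 4) would yield <1, 3, 5, 7>,
  // extracting the second member of a stride-2 interleaved group.
  SmallVector<Constant *, 16> Mask;
  for (unsigned i = 0; i < VF; i++)
    Mask.push_back(Builder.getInt32(Start + i * Stride));
  return ConstantVector::get(Mask);
}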
Interval::pred_iterator pred_begin(Interval *I)
pred_begin/pred_end - define methods so that Intervals may be used just like BasicBlocks can with the...
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
MDNode * getLoopID() const
Return the llvm.loop loop id metadata node for this loop if it is present.
static cl::opt< unsigned > PragmaVectorizeMemoryCheckThreshold("pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden, cl::desc("The maximum allowed number of runtime memory checks with a " "vectorize(enable) pragma."))
Represent the analysis usage information of a pass.
BasicBlock * getIncomingBlock(unsigned i) const
Return incoming basic block number i.
bool contains(const LoopT *L) const
Return true if the specified loop is contained within this loop.
bool any_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly...
Analysis pass providing a never-invalidated alias analysis result.
const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
Return LHS-RHS. Minus is represented in SCEV as A+B*-1.
static const unsigned End
uint64_t getNumElements() const
static bool isFirstOrderRecurrence(PHINode *Phi, Loop *TheLoop, DominatorTree *DT)
Returns true if Phi is a first-order recurrence.
FunctionPass class - This class is used to implement most global optimizations.
BlockT * getExitingBlock() const
If getExitingBlocks would return exactly one block, return that block.
Value * getOperand(unsigned i) const
const SCEV * replaceSymbolicStrideSCEV(PredicatedScalarEvolution &PSE, const ValueToValueMap &PtrToStride, Value *Ptr, Value *OrigPtr=nullptr)
Return the SCEV corresponding to a pointer with the symbolic stride replaced with constant one...
Interval::pred_iterator pred_end(Interval *I)
Value * getPointerOperand()
unsigned getSmallConstantTripCount(Loop *L)
Returns the maximum trip count of the loop if it is a single-exit loop and we can compute a small max...
std::pair< NoneType, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
static cl::opt< unsigned > ForceTargetNumVectorRegs("force-target-num-vector-regs", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's number of vector registers."))
static CastInst * CreatePointerCast(Value *S, Type *Ty, const Twine &Name, BasicBlock *InsertAtEnd)
Create a BitCast, AddrSpaceCast, or PtrToInt cast instruction.
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
const APInt & getAPInt() const
void append(in_iter in_start, in_iter in_end)
Add the specified range to the end of the SmallVector.
bool isPointerTy() const
True if this is an instance of PointerType.
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
static unsigned getIncomingValueNumForOperand(unsigned i)
RecurrenceKind getRecurrenceKind()
static unsigned getRecurrenceBinOp(RecurrenceKind Kind)
Returns the opcode of binary operation corresponding to the RecurrenceKind.
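For instance, an integer-add recurrence maps back to Instruction::Add, which the vectorizer uses when building the final horizontal reduction (illustrative use):

unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(
    RecurrenceDescriptor::RK_IntegerAdd);
assert(Op == Instruction::Add && "add recurrences reduce with 'add'");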
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
bool mayWriteToMemory() const
Return true if this instruction may modify memory.
void emitLoopVectorizeWarning(LLVMContext &Ctx, const Function &Fn, const DebugLoc &DLoc, const Twine &Msg)
Emit a warning when loop vectorization is specified but fails.
Value * getStartValue() const
The RecurrenceDescriptor is used to identify recurrence variables in a loop.
bool isConditional() const
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
void setLoopID(MDNode *LoopID) const
Set the llvm.loop loop id metadata for this loop.
static const SCEV * getAddressAccessSCEV(Value *Ptr, LoopVectorizationLegality *Legal, ScalarEvolution *SE, const Loop *TheLoop)
Gets Address Access SCEV after verifying that the access pattern is loop invariant except the inducti...
IntegerType * getIntPtrType(LLVMContext &C, unsigned AddressSpace=0) const
Returns an integer type with size at least as big as that of a pointer in the given address space...
A function analysis which provides an AssumptionCache.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
StringRef getString() const
unsigned getABITypeAlignment(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
static Constant * getSplat(unsigned NumElts, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
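For example, a <4 x i32> splat of 1 (Ctx names some LLVMContext assumed to be in scope):

Constant *One = ConstantInt::get(Type::getInt32Ty(Ctx), 1);
Constant *Ones = ConstantVector::getSplat(4, One);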
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
const MDOperand & getOperand(unsigned I) const
void emitLoopInterleaveWarning(LLVMContext &Ctx, const Function &Fn, const DebugLoc &DLoc, const Twine &Msg)
Emit a warning when loop interleaving is specified but fails.
A SetVector that performs no allocations if smaller than a certain size.
Analysis pass which computes BlockFrequencyInfo.
Iterator for intrusive lists based on ilist_node.
static bool mayDivideByZero(Instruction &I)
A helper function for checking whether an integer division-related instruction may divide by zero (in...
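The body is not shown on this page; the sketch below (with a hypothetical name) illustrates what such a check plausibly looks like: an integer div/rem whose divisor is not a known non-zero constant may trap, so it cannot be blindly speculated or predicated.

static bool mayDivideByZeroSketch(Instruction &I) {
  switch (I.getOpcode()) {
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::URem:
  case Instruction::SRem: {
    // A non-constant or zero divisor may fault at run time.
    auto *Divisor = dyn_cast<ConstantInt>(I.getOperand(1));
    return !Divisor || Divisor->isZero();
  }
  default:
    return false;
  }
}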
This is the shared class of boolean and integer constants.
A struct for saving information about induction variables.
Value * getIncomingValue(unsigned i) const
Return incoming value number i.
uint64_t getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type...
AnalysisUsage & addRequiredID(const void *ID)
const Module * getModule() const
Return the module owning the function this instruction belongs to, or nullptr if the function does not...
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small...
Module.h This file contains the declarations for the Module class.
Type * getType() const
All values are typed, get the type of this value.
static cl::opt< unsigned > ForceTargetInstructionCost("force-target-instruction-cost", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's expected cost for " "an instruction to a single constant value. Mostly " "useful for getting consistent testing."))
Provides information about what library functions are available for the current target.
InductionKind getKind() const
static Type * convertPointerToIntegerType(const DataLayout &DL, Type *Ty)
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
TerminatorInst * SplitBlockAndInsertIfThen(Value *Cond, Instruction *SplitBefore, bool Unreachable, MDNode *BranchWeights=nullptr, DominatorTree *DT=nullptr, LoopInfo *LI=nullptr)
Split the containing block at the specified instruction - everything before SplitBefore stays in the ...
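Illustrative use, e.g. to predicate a store SI behind a condition Cond (both hypothetical values assumed to be in scope):

TerminatorInst *ThenTerm =
    SplitBlockAndInsertIfThen(Cond, SI, /*Unreachable=*/false);
SI->moveBefore(ThenTerm); // the store now executes only when Cond holds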
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
LLVM_NODISCARD T pop_back_val()
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Drive the analysis of memory accesses in the loop.
static Constant * get(Type *Ty, uint64_t V, bool isSigned=false)
If Ty is a vector type, return a Constant with a splat of the given value.
Function * getCalledFunction() const
Return the function called, or null if this is an indirect function invocation.
static ConstantInt * getSigned(IntegerType *Ty, int64_t V)
Return a ConstantInt with the specified value for the specified type.
static BranchInst * Create(BasicBlock *IfTrue, Instruction *InsertBefore=nullptr)
static cl::opt< bool > EnableIndVarRegisterHeur("enable-ind-var-reg-heur", cl::init(true), cl::Hidden, cl::desc("Count the induction variable only once when interleaving"))
reverse_iterator rbegin()
bool isZero() const
This is just a convenience method to make client code smaller for a common case.
static PHINode * Create(Type *Ty, unsigned NumReservedValues, const Twine &NameStr="", Instruction *InsertBefore=nullptr)
Constructors - NumReservedValues is a hint for the number of incoming edges that this phi node will h...
pred_range predecessors(BasicBlock *BB)
static Constant * get(Type *Ty, double V)
This returns a ConstantFP, or a vector containing a splat of a ConstantFP, for the specified value in...
bool hasVectorInstrinsicScalarOpd(Intrinsic::ID ID, unsigned ScalarOpdIdx)
Identifies if the intrinsic has a scalar operand.
void setOperand(unsigned i, Value *Val)
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
size_type count(const KeyT &Val) const
Return 1 if the specified key is in the map, 0 otherwise.
static Constant * getRecurrenceIdentity(RecurrenceKind K, Type *Tp)
Returns identity corresponding to the RecurrenceKind.
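The identity element seeds the vector phi of a widened reduction; for an integer add it is 0 (for a multiply it would be 1). Illustrative use, with Builder assumed to be an IRBuilder<> in scope:

Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
    RecurrenceDescriptor::RK_IntegerAdd, Builder.getInt32Ty());
// Iden is i32 0 here.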
Store the result of a depth first search within basic blocks contained by a single loop...
void clear()
Completely clear the SetVector.
Class to represent vector types.
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Value * getIncomingValueForBlock(const BasicBlock *BB) const
bool isIntegerTy() const
True if this is an instance of IntegerType.
iterator_range< user_iterator > users()
BasicBlock * getSinglePredecessor()
Return the predecessor of this block if it has a single predecessor block.
static Value * createMinMaxOp(IRBuilder<> &Builder, MinMaxRecurrenceKind RK, Value *Left, Value *Right)
Returns a Min/Max operation corresponding to MinMaxRecurrenceKind.
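As noted above, min/max is expressed as select(cmp()); a signed minimum, for example, comes out as follows (A and B are hypothetical operands, Builder an IRBuilder<> in scope):

Value *Cmp = Builder.CreateICmpSLT(A, B);
Value *Min = Builder.CreateSelect(Cmp, A, B); // smin(A, B)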
unsigned getVectorNumElements() const
static const char lv_name[]
This class uses information about analyzed scalars to rewrite expressions in canonical form...
static const unsigned MaxInterleaveFactor
Maximum vectorization interleave count.
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Holds information about the memory runtime legality checks to verify that a group of pointers do not ...
LLVM_NODISCARD LLVM_ATTRIBUTE_ALWAYS_INLINE bool empty() const
empty - Check if the string is empty.
bool hasUnsafeAlgebra()
Returns true if the induction type is FP and the binary operator does not have the "fast-math" proper...
APInt And(const APInt &LHS, const APInt &RHS)
Bitwise AND function for APInt.
Analysis pass that exposes the ScalarEvolution for a function.
unsigned getComplexity() const override
We estimate the complexity of a union predicate as the number of predicates in the union...
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Analysis pass providing a never-invalidated alias analysis result.
static cl::opt< bool > EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden, cl::desc("Enable if-conversion during vectorization."))
const SCEV * getBackedgeTakenCount()
Get the (predicated) backedge count for the analyzed loop.
const SCEVUnionPredicate & getUnionPredicate() const
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
This analysis provides dependence information for the memory accesses of a loop.
Value * getCondition() const
void forgetLoop(const Loop *L)
This method should be called by the client when it has changed a loop in a way that may affect Scalar...
size_type count(const key_type &key) const
Count the number of elements of a given key in the SetVector.
bool isAggregateType() const
Return true if the type is an aggregate type.
static unsigned getVectorCallCost(CallInst *CI, unsigned VF, const TargetTransformInfo &TTI, const TargetLibraryInfo *TLI, bool &NeedToScalarize)
This class represents an analyzed expression in the program.
static IntegerType * getInt32Ty(LLVMContext &C)
static cl::opt< bool > EnableInterleavedMemAccesses("enable-interleaved-mem-accesses", cl::init(false), cl::Hidden, cl::desc("Enable vectorization on interleaved memory accesses in a loop"))
unsigned getNumBlocks() const
Get the number of blocks in this loop in constant time.
Floating point induction variable.
Instruction * getLoopExitInstr()
bool isFunctionVectorizable(StringRef F, unsigned VF) const
Represents a single loop in the control flow graph.
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
unsigned getAlignment() const
Return the alignment of the access that is being performed.
static void emitAnalysis(const LoopAccessReport &Message, const Loop *TheLoop, const char *PassName, OptimizationRemarkEmitter &ORE)
Emit an analysis note for PassName with the debug location from the instruction in Message if availab...
static Instruction::CastOps getCastOpcode(const Value *Val, bool SrcIsSigned, Type *Ty, bool DstIsSigned)
Returns the opcode necessary to cast Val into Ty using usual casting rules.
TerminatorInst * getTerminator()
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
LLVM_ATTRIBUTE_ALWAYS_INLINE size_type size() const
bool hasOneUse() const
Return true if there is exactly one user of this value.
APFloat abs(APFloat X)
Returns the absolute value of the argument.
bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
static cl::opt< unsigned, true > VectorizationFactor("force-vector-width", cl::Hidden, cl::desc("Sets the SIMD width. Zero is autoselect."), cl::location(VectorizerParams::VectorizationFactor))
iterator_range< value_op_iterator > operand_values()
LLVM_NODISCARD std::enable_if<!is_simple_type< Y >::value, typename cast_retty< X, const Y >::ret_type >::type dyn_cast(const Y &Val)
static cl::opt< unsigned > ForceTargetMaxVectorInterleaveFactor("force-target-max-vector-interleave", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's max interleave factor for " "vectorized loops."))
void preserve()
Mark an analysis as preserved.
static cl::opt< unsigned > SmallLoopCost("small-loop-cost", cl::init(20), cl::Hidden, cl::desc("The cost of a loop that is considered 'small' by the interleaver."))
BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="")
Split the basic block into two basic blocks at the specified instruction.
uint64_t getTypeStoreSize(Type *Ty) const
Returns the maximum number of bytes that may be overwritten by storing the specified type...
Collection of parameters shared between the Loop Vectorizer and the Loop Access Analysis.
const SCEV * getStep() const
static OptimizationRemarkAnalysis createMissedAnalysis(const char *PassName, StringRef RemarkName, Loop *TheLoop, Instruction *I=nullptr)
Create an analysis remark that explains why vectorization failed.
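Illustrative use; the remark name and message below are made up, and the helper itself already prefixes "loop not vectorized: " before streaming the message (ORE, TheLoop, and I are assumed to be in scope):

ORE->emit(createMissedAnalysis(LV_NAME, "ExampleRemark", TheLoop, I)
          << "example reason for the failure");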
Analysis pass providing the TargetLibraryInfo.
iterator_range< op_iterator > arg_operands()
Iteration adapter for range-for loops.
StringRef getValueAsString() const
Return the attribute's value as a string.
static cl::opt< bool > LoopVectorizeWithBlockFrequency("loop-vectorize-with-block-frequency", cl::init(false), cl::Hidden, cl::desc("Enable the use of the block frequency analysis to access PGO " "heuristics minimizing code growth in cold regions and being more " "aggressive in hot regions."))
SmallPtrSet< Instruction *, 8 > & getCastInsts()
Returns a reference to the instructions used for type-promoting the recurrence.
LLVMContext & getContext() const
Get the context in which this basic block lives.
A raw_ostream that writes to an std::string.
unsigned getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
uint64_t PowerOf2Floor(uint64_t A)
Returns the power of two which is less than or equal to the given value.
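For instance:

uint64_t F1 = PowerOf2Floor(7); // == 4
uint64_t F2 = PowerOf2Floor(8); // == 8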
LLVM Value Representation.
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
static VectorType * get(Type *ElementType, unsigned NumElements)
This static method is the primary way to construct an VectorType.
std::underlying_type< E >::type Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
static bool blockNeedsPredication(BasicBlock *BB, Loop *TheLoop, DominatorTree *DT)
Return true if the block BB needs to be predicated in order for the loop to be vectorized.
void moveBefore(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
uint64_t getTypeSizeInBits(Type *Ty) const
Returns the number of bits necessary to hold the specified type.
static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst, SmallPtrSetImpl< Value * > &AllowedExit)
Check that the instruction has outside loop users and is not an identified reduction variable...
The legacy pass manager's analysis pass to compute loop information.
PointerType * getPointerTo(unsigned AddrSpace=0) const
Return a pointer to the current type.
static unsigned VectorizationInterleave
Interleave factor as overridden by the user.
Convenience struct for specifying and reasoning about fast-math flags.
StringRef - Represent a constant reference to a string, i.e.
A container for analyses that lazily runs them and caches their results.
static Function * Create(FunctionType *Ty, LinkageTypes Linkage, const Twine &N="", Module *M=nullptr)
Legacy analysis pass which computes a DominatorTree.
iterator getFirstInsertionPt()
Returns an iterator to the first instruction in this block that is suitable for inserting a non-PHI i...
A wrapper pass to provide the legacy pass manager access to a suitably prepared AAResults object...
static void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop)...
void setIncomingValue(unsigned i, Value *V)
static bool isInterleaveForced()
True if force-vector-interleave was specified by the user.
op_range incoming_values()
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
int getBasicBlockIndex(const BasicBlock *BB) const
Return the first index of the specified basic block in the value list for this PHI.
const BasicBlock * getParent() const
static bool isInductionPHI(PHINode *Phi, const Loop *L, ScalarEvolution *SE, InductionDescriptor &D, const SCEV *Expr=nullptr)
Returns true if Phi is an induction in the loop L.
bool isOne() const
This is just a convenience method to make client code smaller for a common case.
static cl::opt< unsigned > VectorizeSCEVCheckThreshold("vectorize-scev-check-threshold", cl::init(16), cl::Hidden, cl::desc("The maximum number of SCEV checks allowed."))
RecurrenceKind
This enum represents the kinds of recurrences that we support.
Enumerate the SCCs of a directed graph in reverse topological order of the SCC DAG.
Instruction * propagateMetadata(Instruction *I, ArrayRef< Value * > VL)
Propagates metadata from the instructions in VL to I. Specifically, for each kind in Kinds = [MD_tbaa, MD_alias_scope, MD_noalias, MD_fpmath, MD_nontemporal], only metadata common to every instruction in VL is kept.
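Illustrative use, after widening a group of scalar loads into one wide load (all names here are assumed):

SmallVector<Value *, 4> Members = {Load0, Load1, Load2, Load3};
propagateMetadata(WideLoad, Members); // WideLoad keeps only shared metadata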
bool isVoidTy() const
Return true if this is 'void'.
This class represents a constant integer value.
Legacy wrapper pass to provide the BasicAAResult object.
An analysis over an "outer" IR unit that provides access to an analysis manager over an "inner" IR un...
Type * getRecurrenceType()
Returns the type of the recurrence.
bool is_contained(R &&Range, const E &Element)
Wrapper function around std::find to detect if an element exists in a container.