105 using namespace llvm;
106 using namespace llvm::PatternMatch;
108 #define LV_NAME "loop-vectorize"
109 #define DEBUG_TYPE LV_NAME
111 STATISTIC(LoopsVectorized,
"Number of loops vectorized");
112 STATISTIC(LoopsAnalyzed,
"Number of loops analyzed for vectorization");
116 cl::desc(
"Enable if-conversion during vectorization."));
122 cl::desc(
"Don't vectorize loops with a constant "
123 "trip count that is smaller than this "
139 cl::desc(
"Enable symblic stride memory access versioning"));
143 cl::desc(
"Enable vectorization on interleaved memory accesses in a loop"));
148 cl::desc(
"Maximum factor for an interleaved access group (default = 8)"),
157 cl::desc(
"A flag that overrides the target's number of scalar registers."));
161 cl::desc(
"A flag that overrides the target's number of vector registers."));
168 cl::desc(
"A flag that overrides the target's max interleave factor for "
173 cl::desc(
"A flag that overrides the target's max interleave factor for "
174 "vectorized loops."));
178 cl::desc(
"A flag that overrides the target's expected cost for "
179 "an instruction to a single constant value. Mostly "
180 "useful for getting consistent testing."));
185 "The cost of a loop that is considered 'small' by the interleaver."));
189 cl::desc(
"Enable the use of the block frequency analysis to access PGO "
190 "heuristics minimizing code growth in cold regions and being more "
191 "aggressive in hot regions."));
197 "Enable runtime interleaving until load/store ports are saturated"));
202 cl::desc(
"Max number of stores to be predicated behind an if."));
206 cl::desc(
"Count the induction variable only once when interleaving"));
210 cl::desc(
"Enable if predication of stores during vectorization."));
214 cl::desc(
"The maximum interleave count to use when interleaving a scalar "
215 "reduction in a nested loop."));
220 class LoopVectorizationLegality;
221 class LoopVectorizationCostModel;
222 class LoopVectorizeHints;
262 class InnerLoopVectorizer {
267 unsigned UnrollFactor)
268 : OrigLoop(OrigLoop), SE(SE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
269 VF(VecWidth), UF(UnrollFactor), Builder(SE->getContext()),
270 Induction(nullptr), OldInduction(nullptr), WidenMap(UnrollFactor),
271 Legal(nullptr), AddedSafetyChecks(
false) {}
274 void vectorize(LoopVectorizationLegality *L) {
286 bool IsSafetyChecksAdded() {
287 return AddedSafetyChecks;
290 virtual ~InnerLoopVectorizer() {}
303 VectorParts> EdgeMaskCache;
309 std::pair<Instruction *, Instruction *> addStrideCheck(
Instruction *Loc);
312 void createEmptyLoop();
314 virtual void vectorizeLoop();
325 VectorParts createBlockInMask(
BasicBlock *BB);
331 void vectorizeBlockInLoop(
BasicBlock *BB, PhiVector *PV);
336 void widenPHIInstruction(
Instruction *PN, VectorParts &Entry,
337 unsigned UF,
unsigned VF, PhiVector *PV);
341 void updateAnalysis();
347 virtual void scalarizeInstruction(
Instruction *Instr,
348 bool IfPredicateStore=
false);
351 virtual void vectorizeMemoryInstruction(
Instruction *Instr);
369 VectorParts &getVectorValue(
Value *V);
384 ValueMap(
unsigned UnrollFactor) : UF(UnrollFactor) {}
387 bool has(
Value *Key)
const {
return MapStorage.count(Key); }
393 VectorParts &Entry = MapStorage[Key];
394 Entry.assign(UF, Val);
399 VectorParts &
get(
Value *Key) {
400 VectorParts &Entry = MapStorage[Key];
403 assert(Entry.size() == UF);
414 std::map<Value *, VectorParts> MapStorage;
469 EdgeMaskCache MaskCache;
471 LoopVectorizationLegality *Legal;
474 bool AddedSafetyChecks;
477 class InnerLoopUnroller :
public InnerLoopVectorizer {
482 : InnerLoopVectorizer(OrigLoop, SE, LI, DT, TLI, TTI, 1, UnrollFactor) {}
486 bool IfPredicateStore =
false)
override;
487 void vectorizeMemoryInstruction(
Instruction *Instr)
override;
504 if (
Instruction *OpInst = dyn_cast<Instruction>(*OI))
505 if (OpInst->getDebugLoc() !=
Empty)
515 if (
const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr))
523 static std::string getDebugLocString(
const Loop *L) {
528 LoopDbgLoc.print(OS);
531 OS << L->
getHeader()->getParent()->getParent()->getModuleIdentifier();
543 for (
auto M : Metadata) {
544 unsigned Kind = M.first;
565 propagateMetadata(I, From);
594 class InterleaveGroup {
597 : Align(Align), SmallestKey(0), LargestKey(0), InsertPos(Instr) {
598 assert(Align &&
"The alignment should be non-zero");
601 assert(Factor > 1 &&
"Invalid interleave factor");
603 Reverse = Stride < 0;
607 bool isReverse()
const {
return Reverse; }
608 unsigned getFactor()
const {
return Factor; }
609 unsigned getAlignment()
const {
return Align; }
610 unsigned getNumMembers()
const {
return Members.size(); }
617 bool insertMember(
Instruction *Instr,
int Index,
unsigned NewAlign) {
618 assert(NewAlign &&
"The new member's alignment should be non-zero");
620 int Key = Index + SmallestKey;
623 if (Members.count(Key))
626 if (Key > LargestKey) {
628 if (Index >= static_cast<int>(Factor))
632 }
else if (Key < SmallestKey) {
634 if (LargestKey - Key >= static_cast<int>(Factor))
642 Members[Key] = Instr;
650 int Key = SmallestKey + Index;
651 if (!Members.count(Key))
654 return Members.find(Key)->second;
660 for (
auto I : Members)
661 if (I.second == Instr)
662 return I.first - SmallestKey;
667 Instruction *getInsertPos()
const {
return InsertPos; }
668 void setInsertPos(
Instruction *Inst) { InsertPos = Inst; }
700 class InterleavedAccessInfo {
703 : SE(SE), TheLoop(L), DT(DT) {}
705 ~InterleavedAccessInfo() {
708 for (
auto &I : InterleaveGroupMap)
710 for (
auto *Ptr : DelSet)
720 return InterleaveGroupMap.count(Instr);
726 InterleaveGroup *getInterleaveGroup(
Instruction *Instr)
const {
727 if (InterleaveGroupMap.count(Instr))
728 return InterleaveGroupMap.find(Instr)->second;
741 struct StrideDescriptor {
742 StrideDescriptor(
int Stride,
const SCEV *Scev,
unsigned Size,
744 : Stride(Stride), Scev(Scev), Size(Size), Align(Align) {}
746 StrideDescriptor() : Stride(0), Scev(nullptr), Size(0),
Align(0) {}
758 InterleaveGroup *createInterleaveGroup(
Instruction *Instr,
int Stride,
760 assert(!InterleaveGroupMap.count(Instr) &&
761 "Already in an interleaved access group");
762 InterleaveGroupMap[Instr] =
new InterleaveGroup(Instr, Stride, Align);
763 return InterleaveGroupMap[Instr];
767 void releaseGroup(InterleaveGroup *Group) {
768 for (
unsigned i = 0; i < Group->getFactor(); i++)
770 InterleaveGroupMap.erase(Member);
776 void collectConstStridedAccesses(
794 class LoopVectorizationLegality {
800 : NumPredStores(0), TheLoop(L), SE(SE), TLI(TLI), TheFunction(F),
801 TTI(TTI), DT(DT), LAA(LAA), LAI(nullptr), InterleaveInfo(SE, L, DT),
802 Induction(nullptr), WidestIndTy(nullptr), HasFunNoNaNAttr(
false) {}
812 struct InductionInfo {
814 : StartValue(Start), IK(K), StepValue(Step) {
815 assert(IK != IK_NoInduction &&
"Not an induction");
816 assert(StartValue &&
"StartValue is null");
817 assert(StepValue && !StepValue->isZero() &&
"StepValue is zero");
818 assert((IK != IK_PtrInduction || StartValue->getType()->isPointerTy()) &&
819 "StartValue is not a pointer for pointer induction");
820 assert((IK != IK_IntInduction || StartValue->getType()->isIntegerTy()) &&
821 "StartValue is not an integer for integer induction");
822 assert(StepValue->getType()->isIntegerTy() &&
823 "StepValue is not an integer");
826 : StartValue(nullptr), IK(IK_NoInduction), StepValue(nullptr) {}
832 int getConsecutiveDirection()
const {
833 if (StepValue && (StepValue->isOne() || StepValue->isMinusOne()))
834 return StepValue->getSExtValue();
846 case IK_IntInduction:
847 assert(Index->
getType() == StartValue->getType() &&
848 "Index type does not match StartValue type");
849 if (StepValue->isMinusOne())
851 if (!StepValue->isOne())
855 case IK_PtrInduction:
856 assert(Index->
getType() == StepValue->getType() &&
857 "Index type does not match StepValue type");
858 if (StepValue->isMinusOne())
860 else if (!StepValue->isOne())
862 return B.
CreateGEP(
nullptr, StartValue, Index);
892 PHINode *getInduction() {
return Induction; }
895 ReductionList *getReductionVars() {
return &Reductions; }
898 InductionList *getInductionVars() {
return &Inductions; }
901 Type *getWidestInductionType() {
return WidestIndTy; }
904 bool isInductionVariable(
const Value *V);
918 int isConsecutivePtr(
Value *Ptr);
921 bool isUniform(
Value *V);
924 bool isUniformAfterVectorization(
Instruction* I) {
return Uniforms.count(I); }
928 return LAI->getRuntimePointerChecking();
937 return InterleaveInfo.isInterleaved(Instr);
941 const InterleaveGroup *getInterleavedAccessGroup(
Instruction *Instr) {
942 return InterleaveInfo.getInterleaveGroup(Instr);
947 bool hasStride(
Value *V) {
return StrideSet.count(V); }
948 bool mustCheckStrides() {
return !StrideSet.empty(); }
950 return StrideSet.
begin();
957 return TTI->isLegalMaskedStore(DataType, isConsecutivePtr(Ptr));
961 bool isLegalMaskedLoad(
Type *DataType,
Value *Ptr) {
962 return TTI->isLegalMaskedLoad(DataType, isConsecutivePtr(Ptr));
967 return (MaskedOp.count(I) != 0);
969 unsigned getNumStores()
const {
970 return LAI->getNumStores();
972 unsigned getNumLoads()
const {
973 return LAI->getNumLoads();
975 unsigned getNumPredStores()
const {
976 return NumPredStores;
982 bool canVectorizeInstrs();
988 bool canVectorizeMemory();
992 bool canVectorizeWithIfConvert();
995 void collectLoopUniforms();
1010 void collectStridedAccess(
Value *LoadOrStoreInst);
1020 unsigned NumPredStores;
1042 InterleavedAccessInfo InterleaveInfo;
1050 ReductionList Reductions;
1054 InductionList Inductions;
1066 bool HasFunNoNaNAttr;
1083 class LoopVectorizationCostModel {
1086 LoopVectorizationLegality *Legal,
1089 const Function *
F,
const LoopVectorizeHints *Hints)
1090 : TheLoop(L), SE(SE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI),
1091 TheFunction(F), Hints(Hints) {
1109 unsigned getWidestType();
1115 unsigned selectInterleaveCount(
bool OptForSize,
unsigned VF,
1122 unsigned computeInterleaveCount(
bool OptForSize,
unsigned VF,
1127 struct RegisterUsage {
1129 unsigned LoopInvariantRegs;
1131 unsigned MaxLocalUsers;
1133 unsigned NumInstructions;
1137 RegisterUsage calculateRegisterUsage();
1144 unsigned expectedCost(
unsigned VF);
1148 unsigned getInstructionCost(
Instruction *I,
unsigned VF);
1172 LoopVectorizationLegality *Legal;
1179 const LoopVectorizeHints *Hints;
1191 class LoopVectorizeHints {
1205 : Name(Name), Value(Value), Kind(Kind) { }
1207 bool validate(
unsigned Val) {
1237 LoopVectorizeHints(
const Loop *L,
bool DisableInterleaving)
1240 Interleave(
"interleave.count", DisableInterleaving, HK_UNROLL),
1241 Force(
"vectorize.enable", FK_Undefined, HK_FORCE),
1244 getHintsFromMetadata();
1250 DEBUG(
if (DisableInterleaving && Interleave.Value == 1)
dbgs()
1251 <<
"LV: Interleaving disabled by the pass manager\n");
1255 void setAlreadyVectorized() {
1256 Width.Value = Interleave.Value = 1;
1257 Hint Hints[] = {Width, Interleave};
1258 writeHintsToMetadata(Hints);
1262 std::string emitRemark()
const {
1263 VectorizationReport R;
1264 if (Force.Value == LoopVectorizeHints::FK_Disabled)
1265 R <<
"vectorization is explicitly disabled";
1267 R <<
"use -Rpass-analysis=loop-vectorize for more info";
1268 if (Force.Value == LoopVectorizeHints::FK_Enabled) {
1269 R <<
" (Force=true";
1270 if (Width.Value != 0)
1271 R <<
", Vector Width=" << Width.Value;
1272 if (Interleave.Value != 0)
1273 R <<
", Interleave Count=" << Interleave.Value;
1281 unsigned getWidth()
const {
return Width.Value; }
1282 unsigned getInterleave()
const {
return Interleave.Value; }
1283 enum ForceKind getForce()
const {
return (ForceKind)Force.Value; }
1287 void getHintsFromMetadata() {
1288 MDNode *LoopID = TheLoop->getLoopID();
1293 assert(LoopID->
getNumOperands() > 0 &&
"requires at least one operand");
1294 assert(LoopID->
getOperand(0) == LoopID &&
"invalid loop id");
1296 for (
unsigned i = 1, ie = LoopID->
getNumOperands(); i < ie; ++i) {
1303 if (!MD || MD->getNumOperands() == 0)
1306 for (
unsigned i = 1, ie = MD->getNumOperands(); i < ie; ++i)
1310 assert(Args.
size() == 0 &&
"too many arguments for MDString");
1318 if (Args.
size() == 1)
1319 setHint(Name, Args[0]);
1324 void setHint(
StringRef Name, Metadata *Arg) {
1329 const ConstantInt *
C = mdconst::dyn_extract<ConstantInt>(Arg);
1333 Hint *Hints[] = {&Width, &Interleave, &Force};
1334 for (
auto H : Hints) {
1335 if (Name ==
H->Name) {
1336 if (
H->validate(Val))
1339 DEBUG(
dbgs() <<
"LV: ignoring invalid hint '" << Name <<
"'\n");
1347 LLVMContext &Context = TheLoop->getHeader()->getContext();
1360 for (
auto H : HintTypes)
1368 if (HintTypes.
size() == 0)
1374 MDNode *LoopID = TheLoop->getLoopID();
1376 for (
unsigned i = 1, ie = LoopID->
getNumOperands(); i < ie; ++i) {
1379 if (!matchesHintMetadataName(Node, HintTypes))
1380 MDs.push_back(Node);
1385 for (
auto H : HintTypes)
1389 LLVMContext &Context = TheLoop->getHeader()->getContext();
1394 TheLoop->setLoopID(NewLoopID);
1398 const Loop *TheLoop;
1402 const LoopVectorizeHints &LH) {
1406 if (LH.getForce() == LoopVectorizeHints::FK_Enabled) {
1407 if (LH.getWidth() != 1)
1410 "failed explicitly specified loop vectorization");
1411 else if (LH.getInterleave() != 1)
1414 "failed explicitly specified loop interleaving");
1422 for (
Loop *InnerL : L)
1423 addInnerLoop(*InnerL, V);
1431 explicit LoopVectorize(
bool NoUnrolling =
false,
bool AlwaysVectorize =
true)
1433 DisableUnrolling(NoUnrolling),
1434 AlwaysVectorize(AlwaysVectorize) {
1447 bool DisableUnrolling;
1448 bool AlwaysVectorize;
1452 bool runOnFunction(
Function &F)
override {
1453 SE = &getAnalysis<ScalarEvolution>();
1454 LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
1455 TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
1456 DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
1457 BFI = &getAnalysis<BlockFrequencyInfo>();
1458 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
1459 TLI = TLIP ? &TLIP->getTLI() :
nullptr;
1460 AA = &getAnalysis<AliasAnalysis>();
1461 AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
1462 LAA = &getAnalysis<LoopAccessAnalysis>();
1476 if (!TTI->getNumberOfRegisters(
true) && TTI->getMaxInterleaveFactor(1) < 2)
1485 addInnerLoop(*L, Worklist);
1487 LoopsAnalyzed += Worklist.
size();
1490 bool Changed =
false;
1491 while (!Worklist.
empty())
1498 static void AddRuntimeUnrollDisableMetaData(
Loop *L) {
1502 bool IsUnrollMetadata =
false;
1506 for (
unsigned i = 1, ie = LoopID->
getNumOperands(); i < ie; ++i) {
1517 if (!IsUnrollMetadata) {
1522 MDString::get(Context,
"llvm.loop.unroll.runtime.disable"));
1532 bool processLoop(
Loop *L) {
1533 assert(L->
empty() &&
"Only process inner loops.");
1536 const std::string DebugLocStr = getDebugLocString(L);
1539 DEBUG(
dbgs() <<
"\nLV: Checking a loop in \""
1540 << L->
getHeader()->getParent()->getName() <<
"\" from "
1541 << DebugLocStr <<
"\n");
1543 LoopVectorizeHints Hints(L, DisableUnrolling);
1547 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
1549 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
1551 :
"?")) <<
" width=" << Hints.getWidth()
1552 <<
" unroll=" << Hints.getInterleave() <<
"\n");
1565 if (Hints.getForce() == LoopVectorizeHints::FK_Disabled) {
1566 DEBUG(
dbgs() <<
"LV: Not vectorizing: #pragma vectorize disable.\n");
1572 if (!AlwaysVectorize && Hints.getForce() != LoopVectorizeHints::FK_Enabled) {
1573 DEBUG(
dbgs() <<
"LV: Not vectorizing: No #pragma vectorize enable.\n");
1579 if (Hints.getWidth() == 1 && Hints.getInterleave() == 1) {
1580 DEBUG(
dbgs() <<
"LV: Not vectorizing: Disabled/already vectorized.\n");
1583 "loop not vectorized: vector width and interleave count are "
1584 "explicitly set to 1");
1590 const unsigned TC = SE->getSmallConstantTripCount(L);
1592 DEBUG(
dbgs() <<
"LV: Found a loop with a very small trip count. "
1593 <<
"This loop is not worth vectorizing.");
1594 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
1595 DEBUG(
dbgs() <<
" But vectorizing was explicitly forced.\n");
1600 "vectorization is not beneficial and is not explicitly forced");
1606 LoopVectorizationLegality LVL(L, SE, DT, TLI, AA, F, TTI, LAA);
1607 if (!LVL.canVectorize()) {
1608 DEBUG(
dbgs() <<
"LV: Not vectorizing: Cannot prove legality.\n");
1609 emitMissedWarning(F, L, Hints);
1614 LoopVectorizationCostModel CM(L, SE, LI, &LVL, *TTI, TLI, AC, F, &Hints);
1618 bool OptForSize = Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
1628 if (Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
1629 LoopEntryFreq < ColdEntryFreq)
1638 DEBUG(
dbgs() <<
"LV: Can't vectorize when the NoImplicitFloat"
1639 "attribute is used.\n");
1642 "loop not vectorized due to NoImplicitFloat attribute");
1643 emitMissedWarning(F, L, Hints);
1649 CM.selectVectorizationFactor(OptForSize);
1652 unsigned IC = CM.selectInterleaveCount(OptForSize, VF.Width, VF.Cost);
1654 DEBUG(
dbgs() <<
"LV: Found a vectorizable loop (" << VF.Width <<
") in "
1655 << DebugLocStr <<
'\n');
1656 DEBUG(
dbgs() <<
"LV: Interleave Count is " << IC <<
'\n');
1658 if (VF.Width == 1) {
1659 DEBUG(
dbgs() <<
"LV: Vectorization is possible but not beneficial\n");
1664 "not beneficial to vectorize and user disabled interleaving");
1667 DEBUG(
dbgs() <<
"LV: Trying to at least unroll the loops.\n");
1672 " (vectorization not beneficial)"));
1674 InnerLoopUnroller Unroller(L, SE, LI, DT, TLI, TTI, IC);
1675 Unroller.vectorize(&LVL);
1678 InnerLoopVectorizer LB(L, SE, LI, DT, TLI, TTI, VF.Width, IC);
1685 if (!LB.IsSafetyChecksAdded())
1686 AddRuntimeUnrollDisableMetaData(L);
1690 Twine(
"vectorized loop (vectorization width: ") +
1691 Twine(VF.Width) +
", interleaved count: " +
1696 Hints.setAlreadyVectorized();
1727 Value *InnerLoopVectorizer::getBroadcastInstrs(
Value *V) {
1731 (Instr && std::find(LoopVectorBody.begin(), LoopVectorBody.end(),
1732 Instr->
getParent()) != LoopVectorBody.end());
1733 bool Invariant = OrigLoop->isLoopInvariant(V) && !NewInstr;
1738 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1741 Value *Shuf = Builder.CreateVectorSplat(VF, V,
"broadcast");
1746 Value *InnerLoopVectorizer::getStepVector(
Value *Val,
int StartIdx,
1750 "Elem must be an integer");
1752 "Step has wrong type");
1760 for (
int i = 0; i < VLen; ++i)
1765 assert(Cv->
getType() == Val->
getType() &&
"Invalid consecutive vec");
1766 Step = Builder.CreateVectorSplat(VLen, Step);
1767 assert(Step->getType() == Val->
getType() &&
"Invalid step vec");
1770 Step = Builder.CreateMul(Cv, Step);
1771 return Builder.CreateAdd(Val, Step,
"induction");
1774 int LoopVectorizationLegality::isConsecutivePtr(
Value *Ptr) {
1781 PHINode *Phi = dyn_cast_or_null<PHINode>(Ptr);
1782 if (Phi && Inductions.count(Phi)) {
1783 InductionInfo II = Inductions[Phi];
1784 return II.getConsecutiveDirection();
1796 if (Phi && Inductions.count(Phi)) {
1804 for (
unsigned i = 1; i < NumOperands; ++i)
1805 if (!SE->isLoopInvariant(SE->getSCEV(Gep->
getOperand(i)), TheLoop))
1808 InductionInfo II = Inductions[Phi];
1809 return II.getConsecutiveDirection();
1816 for (
unsigned i = 0; i != NumOperands; ++i)
1817 if (i != InductionOperand &&
1818 !SE->isLoopInvariant(SE->getSCEV(Gep->
getOperand(i)), TheLoop))
1824 if (!Strides.count(Gep))
1825 Last = SE->getSCEV(Gep->
getOperand(InductionOperand));
1838 if (
const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(Last))
1845 const SCEV *Step = AR->getStepRecurrence(*SE);
1858 bool LoopVectorizationLegality::isUniform(
Value *V) {
1859 return LAI->isUniform(V);
1863 InnerLoopVectorizer::getVectorValue(
Value *V) {
1864 assert(V != Induction &&
"The new induction variable should not be used.");
1868 if (Legal->hasStride(V))
1872 if (WidenMap.has(V))
1873 return WidenMap.get(V);
1877 Value *B = getBroadcastInstrs(V);
1878 return WidenMap.splat(V, B);
1881 Value *InnerLoopVectorizer::reverseVector(
Value *Vec) {
1884 for (
unsigned i = 0; i < VF; ++i)
1885 ShuffleMask.
push_back(Builder.getInt32(VF - i - 1));
1899 for (
unsigned i = 0; i < VF; i++)
1900 for (
unsigned j = 0; j < NumVec; j++)
1909 unsigned Stride,
unsigned VF) {
1911 for (
unsigned i = 0; i < VF; i++)
1921 unsigned NumUndef) {
1923 for (
unsigned i = 0; i < NumInt; i++)
1927 for (
unsigned i = 0; i < NumUndef; i++)
1940 assert(VecTy1 && VecTy2 &&
1942 "Expect two vectors with the same element type");
1945 unsigned NumElts2 = VecTy2->getNumElements();
1946 assert(NumElts1 >= NumElts2 &&
"Unexpect the first vector has less elements");
1948 if (NumElts1 > NumElts2) {
1962 unsigned NumVec = InputList.
size();
1963 assert(NumVec > 1 &&
"Should be at least two vectors");
1969 for (
unsigned i = 0; i < NumVec - 1; i += 2) {
1970 Value *V0 = ResList[i], *V1 = ResList[i + 1];
1971 assert((V0->
getType() == V1->getType() || i == NumVec - 2) &&
1972 "Only the last vector may have a different type");
1978 if (NumVec % 2 != 0)
1982 NumVec = ResList.
size();
1983 }
while (NumVec > 1);
2016 void InnerLoopVectorizer::vectorizeInterleaveGroup(
Instruction *Instr) {
2017 const InterleaveGroup *Group = Legal->getInterleavedAccessGroup(Instr);
2018 assert(Group &&
"Fail to get an interleaved access group.");
2021 if (Instr != Group->getInsertPos())
2029 Type *ScalarTy = LI ? LI->
getType() : SI->getValueOperand()->getType();
2030 unsigned InterleaveFactor = Group->getFactor();
2032 Type *PtrTy = VecTy->
getPointerTo(Ptr->getType()->getPointerAddressSpace());
2035 setDebugLocFromInst(Builder, Ptr);
2036 VectorParts &PtrParts = getVectorValue(Ptr);
2038 unsigned Index = Group->getIndex(Instr);
2039 for (
unsigned Part = 0; Part < UF; Part++) {
2042 Value *NewPtr = Builder.CreateExtractElement(
2044 Group->isReverse() ? Builder.getInt32(VF - 1) : Builder.getInt32(0));
2057 NewPtr = Builder.CreateGEP(NewPtr, Builder.getInt32(-Index));
2060 NewPtrs.
push_back(Builder.CreateBitCast(NewPtr, PtrTy));
2063 setDebugLocFromInst(Builder, Instr);
2068 for (
unsigned Part = 0; Part < UF; Part++) {
2069 Instruction *NewLoadInstr = Builder.CreateAlignedLoad(
2070 NewPtrs[Part], Group->getAlignment(),
"wide.vec");
2072 for (
unsigned i = 0; i < InterleaveFactor; i++) {
2080 Value *StridedVec = Builder.CreateShuffleVector(
2081 NewLoadInstr, UndefVec, StrideMask,
"strided.vec");
2084 if (Member->
getType() != ScalarTy) {
2086 StridedVec = Builder.CreateBitOrPointerCast(StridedVec, OtherVTy);
2089 VectorParts &Entry = WidenMap.get(Member);
2091 Group->isReverse() ? reverseVector(StridedVec) : StridedVec;
2094 propagateMetadata(NewLoadInstr, Instr);
2103 for (
unsigned Part = 0; Part < UF; Part++) {
2106 for (
unsigned i = 0; i < InterleaveFactor; i++) {
2109 assert(Member &&
"Fail to get a member from an interleaved store group");
2112 getVectorValue(dyn_cast<StoreInst>(Member)->getValueOperand())[Part];
2113 if (Group->isReverse())
2114 StoredVec = reverseVector(StoredVec);
2117 if (StoredVec->
getType() != SubVT)
2118 StoredVec = Builder.CreateBitOrPointerCast(StoredVec, SubVT);
2128 Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask,
2132 Builder.CreateAlignedStore(IVec, NewPtrs[Part], Group->getAlignment());
2133 propagateMetadata(NewStoreInstr, Instr);
2137 void InnerLoopVectorizer::vectorizeMemoryInstruction(
Instruction *Instr) {
2142 assert((LI || SI) &&
"Invalid Load/Store instruction");
2145 if (Legal->isAccessInterleaved(Instr))
2146 return vectorizeInterleaveGroup(Instr);
2148 Type *ScalarDataTy = LI ? LI->
getType() : SI->getValueOperand()->getType();
2151 unsigned Alignment = LI ? LI->
getAlignment() : SI->getAlignment();
2161 if (SI && Legal->blockNeedsPredication(SI->getParent()) &&
2162 !Legal->isMaskRequired(SI))
2163 return scalarizeInstruction(Instr,
true);
2165 if (ScalarAllocatedSize != VectorElementSize)
2166 return scalarizeInstruction(Instr);
2170 int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
2171 bool Reverse = ConsecutiveStride < 0;
2172 bool UniformLoad = LI && Legal->isUniform(Ptr);
2173 if (!ConsecutiveStride || UniformLoad)
2174 return scalarizeInstruction(Instr);
2176 Constant *Zero = Builder.getInt32(0);
2177 VectorParts &Entry = WidenMap.get(Instr);
2182 setDebugLocFromInst(Builder, Gep);
2184 Value *FirstBasePtr = getVectorValue(PtrOperand)[0];
2185 FirstBasePtr = Builder.CreateExtractElement(FirstBasePtr, Zero);
2190 Gep2->
setName(
"gep.indvar.base");
2191 Ptr = Builder.Insert(Gep2);
2193 setDebugLocFromInst(Builder, Gep);
2195 OrigLoop) &&
"Base ptr must be invariant");
2204 for (
unsigned i = 0; i < NumOperands; ++i) {
2209 if (i == InductionOperand ||
2210 (GepOperandInst && OrigLoop->contains(GepOperandInst))) {
2211 assert((i == InductionOperand ||
2212 SE->isLoopInvariant(SE->getSCEV(GepOperandInst), OrigLoop)) &&
2213 "Must be last index or loop invariant");
2215 VectorParts &GEPParts = getVectorValue(GepOperand);
2216 Value *Index = GEPParts[0];
2217 Index = Builder.CreateExtractElement(Index, Zero);
2219 Gep2->
setName(
"gep.indvar.idx");
2222 Ptr = Builder.Insert(Gep2);
2225 assert(isa<PHINode>(Ptr) &&
"Invalid induction ptr");
2226 setDebugLocFromInst(Builder, Ptr);
2227 VectorParts &PtrVal = getVectorValue(Ptr);
2228 Ptr = Builder.CreateExtractElement(PtrVal[0], Zero);
2231 VectorParts Mask = createBlockInMask(Instr->
getParent());
2234 assert(!Legal->isUniform(SI->getPointerOperand()) &&
2235 "We do not allow storing to uniform addresses");
2236 setDebugLocFromInst(Builder, SI);
2239 VectorParts StoredVal = getVectorValue(SI->getValueOperand());
2241 for (
unsigned Part = 0; Part < UF; ++Part) {
2244 Builder.CreateGEP(
nullptr, Ptr, Builder.getInt32(Part * VF));
2249 StoredVal[Part] = reverseVector(StoredVal[Part]);
2252 PartPtr = Builder.CreateGEP(
nullptr, Ptr, Builder.getInt32(-Part * VF));
2253 PartPtr = Builder.CreateGEP(
nullptr, PartPtr, Builder.getInt32(1 - VF));
2254 Mask[Part] = reverseVector(Mask[Part]);
2257 Value *VecPtr = Builder.CreateBitCast(PartPtr,
2261 if (Legal->isMaskRequired(SI))
2262 NewSI = Builder.CreateMaskedStore(StoredVal[Part], VecPtr, Alignment,
2265 NewSI = Builder.CreateAlignedStore(StoredVal[Part], VecPtr, Alignment);
2266 propagateMetadata(NewSI, SI);
2272 assert(LI &&
"Must have a load instruction");
2273 setDebugLocFromInst(Builder, LI);
2274 for (
unsigned Part = 0; Part < UF; ++Part) {
2277 Builder.CreateGEP(
nullptr, Ptr, Builder.getInt32(Part * VF));
2282 PartPtr = Builder.CreateGEP(
nullptr, Ptr, Builder.getInt32(-Part * VF));
2283 PartPtr = Builder.CreateGEP(
nullptr, PartPtr, Builder.getInt32(1 - VF));
2284 Mask[Part] = reverseVector(Mask[Part]);
2288 Value *VecPtr = Builder.CreateBitCast(PartPtr,
2290 if (Legal->isMaskRequired(LI))
2291 NewLI = Builder.CreateMaskedLoad(VecPtr, Alignment, Mask[Part],
2293 "wide.masked.load");
2295 NewLI = Builder.CreateAlignedLoad(VecPtr, Alignment,
"wide.load");
2296 propagateMetadata(NewLI, LI);
2297 Entry[Part] = Reverse ? reverseVector(NewLI) : NewLI;
2301 void InnerLoopVectorizer::scalarizeInstruction(
Instruction *Instr,
bool IfPredicateStore) {
2306 setDebugLocFromInst(Builder, Instr);
2313 if (SrcOp == OldInduction) {
2314 Params.push_back(getVectorValue(SrcOp));
2323 if (SrcInst && OrigLoop->contains(SrcInst)) {
2324 assert(WidenMap.has(SrcInst) &&
"Source operand is unavailable");
2326 Params.push_back(WidenMap.get(SrcInst));
2329 VectorParts Scalars;
2330 Scalars.append(UF, SrcOp);
2331 Params.push_back(Scalars);
2336 "Invalid number of operands");
2341 Value *UndefVec = IsVoidRetTy ?
nullptr :
2344 VectorParts &VecResults = WidenMap.splat(Instr, UndefVec);
2347 BasicBlock *IfBlock = Builder.GetInsertBlock();
2351 Loop *VectorLp =
nullptr;
2352 if (IfPredicateStore) {
2354 "Only support single predecessor blocks");
2357 VectorLp = LI->getLoopFor(IfBlock);
2358 assert(VectorLp &&
"Must have a loop for this block");
2362 for (
unsigned Part = 0; Part < UF; ++Part) {
2364 for (
unsigned Width = 0; Width < VF; ++Width) {
2367 Value *Cmp =
nullptr;
2368 if (IfPredicateStore) {
2369 Cmp = Builder.CreateExtractElement(Cond[Part], Builder.getInt32(Width));
2372 LoopVectorBody.push_back(CondBlock);
2373 VectorLp->addBasicBlockToLoop(CondBlock, *LI);
2375 Builder.SetInsertPoint(InsertPt);
2386 Op = Builder.CreateExtractElement(Op, Builder.getInt32(Width));
2391 Builder.Insert(Cloned);
2396 VecResults[Part] = Builder.CreateInsertElement(VecResults[Part], Cloned,
2397 Builder.getInt32(Width));
2399 if (IfPredicateStore) {
2401 LoopVectorBody.push_back(NewIfBlock);
2402 VectorLp->addBasicBlockToLoop(NewIfBlock, *LI);
2403 Builder.SetInsertPoint(InsertPt);
2406 IfBlock = NewIfBlock;
2421 std::pair<Instruction *, Instruction *>
2422 InnerLoopVectorizer::addStrideCheck(
Instruction *Loc) {
2424 if (!Legal->mustCheckStrides())
2425 return std::pair<Instruction *, Instruction *>(tnullptr, tnullptr);
2433 SE = Legal->strides_end();
2441 Check = ChkBuilder.CreateOr(Check, C);
2452 ChkBuilder.Insert(TheCheck,
"stride.not.one");
2455 return std::make_pair(FirstInst, TheCheck);
2458 void InnerLoopVectorizer::createEmptyLoop() {
2491 BasicBlock *OldBasicBlock = OrigLoop->getHeader();
2492 BasicBlock *VectorPH = OrigLoop->getLoopPreheader();
2493 BasicBlock *ExitBlock = OrigLoop->getExitBlock();
2494 assert(VectorPH &&
"Invalid loop structure");
2495 assert(ExitBlock &&
"Must have an exit block");
2501 OldInduction = Legal->getInduction();
2502 Type *IdxTy = Legal->getWidestInductionType();
2505 const SCEV *ExitCount = SE->getBackedgeTakenCount(OrigLoop);
2506 assert(ExitCount != SE->getCouldNotCompute() &&
"Invalid loop count");
2515 ExitCount = SE->getTruncateOrNoop(ExitCount, IdxTy);
2517 const SCEV *BackedgeTakeCount = SE->getNoopOrZeroExtend(ExitCount, IdxTy);
2519 ExitCount = SE->getAddExpr(BackedgeTakeCount,
2520 SE->getConstant(BackedgeTakeCount->
getType(), 1));
2532 Value *BackedgeCount =
2533 Exp.expandCodeFor(BackedgeTakeCount, BackedgeTakeCount->
getType(),
2537 "backedge.ptrcnt.to.int",
2548 Value *StartIdx = ExtendedIdx =
2550 ? Builder.CreateZExt(OldInduction->getIncomingValueForBlock(VectorPH),
2555 Value *Count = Exp.expandCodeFor(ExitCount, ExitCount->
getType(),
2558 LoopBypassBlocks.push_back(VectorPH);
2579 LI->addTopLevelLoop(Lp);
2588 setDebugLocFromInst(Builder, getDebugLocFromInstOrOperands(OldInduction));
2589 Induction = Builder.CreatePHI(IdxTy, 2,
"index");
2603 VectorPH = NewVectorPH;
2607 IRBuilder<> BypassBuilder(VectorPH->getTerminator());
2608 setDebugLocFromInst(BypassBuilder,
2609 getDebugLocFromInstOrOperands(OldInduction));
2613 if (Count->
getType() != IdxTy) {
2617 Count = BypassBuilder.CreatePointerCast(Count, IdxTy,
"ptrcnt.to.int");
2619 Count = BypassBuilder.CreateZExtOrTrunc(Count, IdxTy,
"cnt.cast");
2623 Value *IdxEnd = BypassBuilder.CreateAdd(Count, StartIdx,
"end.idx");
2627 Value *R = BypassBuilder.CreateURem(Count, Step,
"n.mod.vf");
2628 Value *CountRoundDown = BypassBuilder.CreateSub(Count, R,
"n.vec");
2629 Value *IdxEndRoundDown = BypassBuilder.CreateAdd(CountRoundDown, StartIdx,
2630 "end.idx.rnd.down");
2635 BypassBuilder.CreateICmpEQ(IdxEndRoundDown, StartIdx,
"cmp.zero");
2640 LoopBypassBlocks.push_back(VectorPH);
2643 VectorPH = NewVectorPH;
2650 std::tie(FirstCheckInst, StrideCheck) =
2651 addStrideCheck(VectorPH->getTerminator());
2653 AddedSafetyChecks =
true;
2655 VectorPH->
setName(
"vector.stridecheck");
2660 LoopBypassBlocks.push_back(VectorPH);
2665 VectorPH->getTerminator(),
2668 VectorPH = NewVectorPH;
2675 std::tie(FirstCheckInst, MemRuntimeCheck) =
2676 Legal->getLAI()->addRuntimeCheck(VectorPH->getTerminator());
2677 if (MemRuntimeCheck) {
2678 AddedSafetyChecks =
true;
2680 VectorPH->
setName(
"vector.memcheck");
2685 LoopBypassBlocks.push_back(VectorPH);
2690 VectorPH->getTerminator(),
2693 VectorPH = NewVectorPH;
2705 PHINode *ResumeIndex =
nullptr;
2709 BypassBuilder.SetInsertPoint(LoopBypassBlocks.back()->getTerminator());
2710 for (I = List->
begin(), E = List->
end(); I != E; ++
I) {
2712 LoopVectorizationLegality::InductionInfo II = I->second;
2714 Type *ResumeValTy = (OrigPhi == OldInduction) ? IdxTy : OrigPhi->
getType();
2719 PHINode *TruncResumeVal = (OrigPhi == OldInduction) ?
2728 PHINode *BCTruncResumeVal =
nullptr;
2729 if (OrigPhi == OldInduction) {
2733 BCTruncResumeVal->
addIncoming(TruncResumeVal, MiddleBlock);
2736 Value *EndValue =
nullptr;
2738 case LoopVectorizationLegality::IK_NoInduction:
2740 case LoopVectorizationLegality::IK_IntInduction: {
2745 if (OrigPhi == OldInduction) {
2749 BypassBuilder.CreateTrunc(IdxEndRoundDown, OrigPhi->
getType());
2752 for (
unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++
I)
2753 TruncResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[I]);
2754 TruncResumeVal->addIncoming(EndValue, VecBody);
2756 BCTruncResumeVal->
addIncoming(II.StartValue, LoopBypassBlocks[0]);
2759 EndValue = IdxEndRoundDown;
2761 ResumeIndex = ResumeVal;
2767 Value *CRD = BypassBuilder.CreateSExtOrTrunc(CountRoundDown,
2770 EndValue = II.transform(BypassBuilder, CRD);
2774 case LoopVectorizationLegality::IK_PtrInduction: {
2775 Value *CRD = BypassBuilder.CreateSExtOrTrunc(CountRoundDown,
2778 EndValue = II.transform(BypassBuilder, CRD);
2779 EndValue->
setName(
"ptr.ind.end");
2786 for (
unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++
I) {
2787 if (OrigPhi == OldInduction)
2788 ResumeVal->
addIncoming(StartIdx, LoopBypassBlocks[I]);
2790 ResumeVal->
addIncoming(II.StartValue, LoopBypassBlocks[I]);
2799 if (OrigPhi == OldInduction) {
2800 BCResumeVal->
addIncoming(StartIdx, LoopBypassBlocks[0]);
2803 BCResumeVal->
addIncoming(II.StartValue, LoopBypassBlocks[0]);
2813 assert(!ResumeIndex &&
"Unexpected resume value found");
2816 for (
unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++
I)
2817 ResumeIndex->
addIncoming(StartIdx, LoopBypassBlocks[I]);
2818 ResumeIndex->
addIncoming(IdxEndRoundDown, VecBody);
2823 "Invalid resume Index");
2829 ResumeIndex,
"cmp.n",
2835 Value *NextIdx = Builder.CreateAdd(Induction, Step,
"index.next");
2836 Induction->addIncoming(StartIdx, VectorPH);
2837 Induction->addIncoming(NextIdx, VecBody);
2839 Value *ICmp = Builder.CreateICmpEQ(NextIdx, IdxEndRoundDown);
2840 Builder.CreateCondBr(ICmp, MiddleBlock, VecBody);
2849 LoopVectorPreHeader = VectorPH;
2850 LoopScalarPreHeader = ScalarPH;
2851 LoopMiddleBlock = MiddleBlock;
2852 LoopExitBlock = ExitBlock;
2853 LoopVectorBody.push_back(VecBody);
2854 LoopScalarBody = OldBasicBlock;
2856 LoopVectorizeHints Hints(Lp,
true);
2857 Hints.setAlreadyVectorized();
2861 struct CSEDenseMapInfo {
2863 return isa<InsertElementInst>(
I) || isa<ExtractElementInst>(I) ||
2864 isa<ShuffleVectorInst>(
I) || isa<GetElementPtrInst>(I);
2873 assert(canHandle(I) &&
"Unknown instruction!");
2878 if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
2879 LHS == getTombstoneKey() || RHS == getTombstoneKey())
2892 return BlockNum % 2;
2899 for (
unsigned i = 0, e = BBs.
size(); i != e; ++i) {
2904 if (!CSEDenseMapInfo::canHandle(In))
2927 if (isa<FPMathOperator>(V)){
2930 cast<Instruction>(V)->setFastMathFlags(Flags);
2942 assert(Ty->
isVectorTy() &&
"Can only scalarize vectors");
2962 bool &NeedToScalarize) {
2976 return ScalarCallCost;
2979 Type *RetTy = ToVectorTy(ScalarRetTy, VF);
2980 for (
unsigned i = 0, ie = ScalarTys.
size(); i != ie; ++i)
2981 Tys.
push_back(ToVectorTy(ScalarTys[i], VF));
2985 unsigned ScalarizationCost =
2987 for (
unsigned i = 0, ie = Tys.
size(); i != ie; ++i)
2990 unsigned Cost = ScalarCallCost * VF + ScalarizationCost;
2994 NeedToScalarize =
true;
3000 if (VectorCallCost < Cost) {
3001 NeedToScalarize =
false;
3002 return VectorCallCost;
3014 assert(ID &&
"Expected intrinsic call!");
3024 void InnerLoopVectorizer::vectorizeLoop() {
3032 Constant *Zero = Builder.getInt32(0);
3042 PhiVector RdxPHIsToFix;
3051 be = DFS.endRPO(); bb != be; ++bb)
3052 vectorizeBlockInLoop(*bb, &RdxPHIsToFix);
3063 for (PhiVector::iterator it = RdxPHIsToFix.begin(), e = RdxPHIsToFix.end();
3066 assert(RdxPhi &&
"Unable to recover vectorized PHI");
3069 assert(Legal->getReductionVars()->count(RdxPhi) &&
3070 "Unable to find the reduction variable");
3075 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
3077 RdxDesc.getMinMaxRecurrenceKind();
3078 setDebugLocFromInst(Builder, ReductionStartValue);
3084 Builder.SetInsertPoint(LoopBypassBlocks[1]->getTerminator());
3087 VectorParts &VectorExit = getVectorValue(LoopExitInst);
3088 Type *VecTy = VectorExit[0]->getType();
3098 VectorStart = Identity = ReductionStartValue;
3100 VectorStart = Identity =
3101 Builder.CreateVectorSplat(VF, ReductionStartValue,
"minmax.ident");
3111 VectorStart = ReductionStartValue;
3118 Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
3126 VectorParts &VecRdxPhi = WidenMap.get(RdxPhi);
3127 BasicBlock *Latch = OrigLoop->getLoopLatch();
3129 VectorParts &Val = getVectorValue(LoopVal);
3130 for (
unsigned part = 0; part < UF; ++part) {
3133 Value *StartVal = (part == 0) ? VectorStart : Identity;
3134 cast<PHINode>(VecRdxPhi[part])->addIncoming(StartVal,
3135 LoopVectorPreHeader);
3136 cast<PHINode>(VecRdxPhi[part])->addIncoming(Val[part],
3137 LoopVectorBody.back());
3144 Builder.SetInsertPoint(LoopMiddleBlock->getFirstInsertionPt());
3146 VectorParts RdxParts;
3147 setDebugLocFromInst(Builder, LoopExitInst);
3148 for (
unsigned part = 0; part < UF; ++part) {
3151 VectorParts &RdxExitVal = getVectorValue(LoopExitInst);
3152 PHINode *NewPhi = Builder.CreatePHI(VecTy, 2,
"rdx.vec.exit.phi");
3153 Value *StartVal = (part == 0) ? VectorStart : Identity;
3154 for (
unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++
I)
3155 NewPhi->
addIncoming(StartVal, LoopBypassBlocks[I]);
3157 LoopVectorBody.back());
3158 RdxParts.push_back(NewPhi);
3162 Value *ReducedPartRdx = RdxParts[0];
3164 setDebugLocFromInst(Builder, ReducedPartRdx);
3165 for (
unsigned part = 1; part < UF; ++part) {
3166 if (Op != Instruction::ICmp && Op != Instruction::FCmp)
3170 ReducedPartRdx,
"bin.rdx"));
3173 Builder, MinMaxKind, ReducedPartRdx, RdxParts[part]);
3181 "Reduction emission only supported for pow2 vectors!");
3182 Value *TmpVec = ReducedPartRdx;
3184 for (
unsigned i = VF; i != 1; i >>= 1) {
3186 for (
unsigned j = 0; j != i/2; ++j)
3187 ShuffleMask[j] = Builder.getInt32(i/2 + j);
3190 std::fill(&ShuffleMask[i/2], ShuffleMask.end(),
3194 Builder.CreateShuffleVector(TmpVec,
3199 if (Op != Instruction::ICmp && Op != Instruction::FCmp)
3209 ReducedPartRdx = Builder.CreateExtractElement(TmpVec,
3210 Builder.getInt32(0));
3216 LoopScalarPreHeader->getTerminator());
3217 BCBlockPhi->
addIncoming(ReductionStartValue, LoopBypassBlocks[0]);
3218 BCBlockPhi->
addIncoming(ReducedPartRdx, LoopMiddleBlock);
3225 LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) {
3227 if (!LCSSAPhi)
break;
3237 LCSSAPhi->
addIncoming(ReducedPartRdx, LoopMiddleBlock);
3244 int IncomingEdgeBlockIdx =
3245 (RdxPhi)->getBasicBlockIndex(OrigLoop->getLoopLatch());
3246 assert(IncomingEdgeBlockIdx >= 0 &&
"Invalid block index");
3248 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
3249 (RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
3250 (RdxPhi)->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
3256 cse(LoopVectorBody);
3259 void InnerLoopVectorizer::fixLCSSAPHIs() {
3261 LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) {
3263 if (!LCSSAPhi)
break;
3276 std::pair<BasicBlock*, BasicBlock*> Edge(Src, Dst);
3277 EdgeMaskCache::iterator ECEntryIt = MaskCache.find(Edge);
3278 if (ECEntryIt != MaskCache.end())
3279 return ECEntryIt->second;
3281 VectorParts SrcMask = createBlockInMask(Src);
3285 assert(BI &&
"Unexpected terminator found");
3288 VectorParts EdgeMask = getVectorValue(BI->
getCondition());
3291 for (
unsigned part = 0; part < UF; ++part)
3292 EdgeMask[part] = Builder.CreateNot(EdgeMask[part]);
3294 for (
unsigned part = 0; part < UF; ++part)
3295 EdgeMask[part] = Builder.CreateAnd(EdgeMask[part], SrcMask[part]);
3297 MaskCache[Edge] = EdgeMask;
3301 MaskCache[Edge] = SrcMask;
3306 InnerLoopVectorizer::createBlockInMask(
BasicBlock *BB) {
3307 assert(OrigLoop->contains(BB) &&
"Block is not a part of a loop");
3310 if (OrigLoop->getHeader() == BB) {
3312 return getVectorValue(C);
3317 VectorParts BlockMask = getVectorValue(Zero);
3321 VectorParts EM = createEdgeMask(*it, BB);
3322 for (
unsigned part = 0; part < UF; ++part)
3323 BlockMask[part] = Builder.CreateOr(BlockMask[part], EM[part]);
3329 void InnerLoopVectorizer::widenPHIInstruction(
Instruction *PN,
3331 unsigned UF,
unsigned VF, PhiVector *PV) {
3334 if (Legal->getReductionVars()->count(P)) {
3335 for (
unsigned part = 0; part < UF; ++part) {
3340 LoopVectorBody.back()-> getFirstInsertionPt());
3346 setDebugLocFromInst(Builder, P);
3348 if (P->
getParent() != OrigLoop->getHeader()) {
3362 for (
unsigned In = 0;
In < NumIncoming;
In++) {
3367 for (
unsigned part = 0; part < UF; ++part) {
3371 Entry[part] = Builder.CreateSelect(Cond[part], In0[part],
3376 Entry[part] = Builder.CreateSelect(Cond[part], In0[part],
3377 Entry[part],
"predphi");
3385 assert(Legal->getInductionVars()->count(P) &&
3386 "Not an induction variable");
3388 LoopVectorizationLegality::InductionInfo II =
3389 Legal->getInductionVars()->lookup(P);
3394 case LoopVectorizationLegality::IK_NoInduction:
3396 case LoopVectorizationLegality::IK_IntInduction: {
3397 assert(P->
getType() == II.StartValue->getType() &&
"Types must match");
3400 if (P == OldInduction) {
3403 Broadcasted = Builder.CreateTrunc(Induction, PhiTy);
3407 Value *NormalizedIdx = Builder.CreateSub(Induction, ExtendedIdx,
3409 NormalizedIdx = Builder.CreateSExtOrTrunc(NormalizedIdx, PhiTy);
3410 Broadcasted = II.transform(Builder, NormalizedIdx);
3411 Broadcasted->
setName(
"offset.idx");
3413 Broadcasted = getBroadcastInstrs(Broadcasted);
3416 for (
unsigned part = 0; part < UF; ++part)
3417 Entry[part] = getStepVector(Broadcasted, VF * part, II.StepValue);
3420 case LoopVectorizationLegality::IK_PtrInduction:
3424 Value *NormalizedIdx =
3425 Builder.CreateSub(Induction, ExtendedIdx,
"normalized.idx");
3427 Builder.CreateSExtOrTrunc(NormalizedIdx, II.StepValue->getType());
3430 for (
unsigned part = 0; part < UF; ++part) {
3432 int EltIndex = part;
3434 Value *GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx);
3435 Value *SclrGep = II.transform(Builder, GlobalIdx);
3437 Entry[part] = SclrGep;
3442 for (
unsigned int i = 0; i < VF; ++i) {
3443 int EltIndex = i + part * VF;
3445 Value *GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx);
3446 Value *SclrGep = II.transform(Builder, GlobalIdx);
3448 VecVal = Builder.CreateInsertElement(VecVal, SclrGep,
3449 Builder.getInt32(i),
3452 Entry[part] = VecVal;
3458 void InnerLoopVectorizer::vectorizeBlockInLoop(
BasicBlock *BB, PhiVector *PV) {
3461 VectorParts &Entry = WidenMap.get(it);
3462 switch (it->getOpcode()) {
3463 case Instruction::Br:
3469 widenPHIInstruction(it, Entry, UF, VF, PV);
3473 case Instruction::Add:
3474 case Instruction::FAdd:
3475 case Instruction::Sub:
3476 case Instruction::FSub:
3477 case Instruction::Mul:
3478 case Instruction::FMul:
3479 case Instruction::UDiv:
3480 case Instruction::SDiv:
3481 case Instruction::FDiv:
3482 case Instruction::URem:
3483 case Instruction::SRem:
3484 case Instruction::FRem:
3485 case Instruction::Shl:
3486 case Instruction::LShr:
3487 case Instruction::AShr:
3493 setDebugLocFromInst(Builder, BinOp);
3494 VectorParts &
A = getVectorValue(it->getOperand(0));
3495 VectorParts &B = getVectorValue(it->getOperand(1));
3498 for (
unsigned Part = 0; Part < UF; ++Part) {
3499 Value *V = Builder.CreateBinOp(BinOp->
getOpcode(), A[Part], B[Part]);
3502 VecOp->copyIRFlags(BinOp);
3507 propagateMetadata(Entry, it);
3514 bool InvariantCond = SE->isLoopInvariant(SE->getSCEV(it->getOperand(0)),
3516 setDebugLocFromInst(Builder, it);
3522 VectorParts &Cond = getVectorValue(it->getOperand(0));
3523 VectorParts &Op0 = getVectorValue(it->getOperand(1));
3524 VectorParts &Op1 = getVectorValue(it->getOperand(2));
3526 Value *ScalarCond = (VF == 1) ? Cond[0] :
3527 Builder.CreateExtractElement(Cond[0], Builder.getInt32(0));
3529 for (
unsigned Part = 0; Part < UF; ++Part) {
3530 Entry[Part] = Builder.CreateSelect(
3531 InvariantCond ? ScalarCond : Cond[Part],
3536 propagateMetadata(Entry, it);
3540 case Instruction::ICmp:
3541 case Instruction::FCmp: {
3543 bool FCmp = (it->getOpcode() == Instruction::FCmp);
3545 setDebugLocFromInst(Builder, it);
3546 VectorParts &A = getVectorValue(it->getOperand(0));
3547 VectorParts &B = getVectorValue(it->getOperand(1));
3548 for (
unsigned Part = 0; Part < UF; ++Part) {
3551 C = Builder.CreateFCmp(Cmp->
getPredicate(), A[Part], B[Part]);
3553 C = Builder.CreateICmp(Cmp->
getPredicate(), A[Part], B[Part]);
3557 propagateMetadata(Entry, it);
3563 vectorizeMemoryInstruction(it);
3565 case Instruction::ZExt:
3566 case Instruction::SExt:
3567 case Instruction::FPToUI:
3568 case Instruction::FPToSI:
3569 case Instruction::FPExt:
3570 case Instruction::PtrToInt:
3571 case Instruction::IntToPtr:
3572 case Instruction::SIToFP:
3573 case Instruction::UIToFP:
3574 case Instruction::Trunc:
3575 case Instruction::FPTrunc:
3576 case Instruction::BitCast: {
3578 setDebugLocFromInst(Builder, it);
3584 it->getOpcode() == Instruction::Trunc) {
3585 Value *ScalarCast = Builder.CreateCast(CI->
getOpcode(), Induction,
3587 Value *Broadcasted = getBroadcastInstrs(ScalarCast);
3588 LoopVectorizationLegality::InductionInfo II =
3589 Legal->getInductionVars()->lookup(OldInduction);
3592 for (
unsigned Part = 0; Part < UF; ++Part)
3593 Entry[Part] = getStepVector(Broadcasted, VF * Part, Step);
3594 propagateMetadata(Entry, it);
3601 VectorParts &A = getVectorValue(it->getOperand(0));
3602 for (
unsigned Part = 0; Part < UF; ++Part)
3603 Entry[Part] = Builder.CreateCast(CI->
getOpcode(), A[Part], DestTy);
3604 propagateMetadata(Entry, it);
3610 if (isa<DbgInfoIntrinsic>(it))
3612 setDebugLocFromInst(Builder, it);
3626 (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
3627 ID == Intrinsic::lifetime_start)) {
3628 scalarizeInstruction(it);
3634 bool NeedToScalarize;
3636 bool UseVectorIntrinsic =
3638 if (!UseVectorIntrinsic && NeedToScalarize) {
3639 scalarizeInstruction(it);
3643 for (
unsigned Part = 0; Part < UF; ++Part) {
3650 VectorParts &VectorArg = getVectorValue(CI->
getArgOperand(i));
3651 Arg = VectorArg[Part];
3657 if (UseVectorIntrinsic) {
3665 StringRef VFnName = TLI->getVectorizedFunction(FnName, VF);
3666 assert(!VFnName.
empty() &&
"Vector function name is empty.");
3673 VectorF->copyAttributesFrom(F);
3676 assert(VectorF &&
"Can't create vector function.");
3677 Entry[Part] = Builder.CreateCall(VectorF, Args);
3680 propagateMetadata(Entry, it);
3686 scalarizeInstruction(it);
3692 void InnerLoopVectorizer::updateAnalysis() {
3694 SE->forgetLoop(OrigLoop);
3697 assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) &&
3698 "Entry does not dominate exit.");
3700 for (
unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++
I)
3701 DT->addNewBlock(LoopBypassBlocks[I], LoopBypassBlocks[I-1]);
3702 DT->addNewBlock(LoopVectorPreHeader, LoopBypassBlocks.back());
3706 for (
unsigned i = 0, e = LoopVectorBody.size(); i != e; ++i) {
3708 DT->addNewBlock(LoopVectorBody[0], LoopVectorPreHeader);
3710 DT->addNewBlock(LoopVectorBody[i], LoopVectorBody[i-1]);
3712 DT->addNewBlock(LoopVectorBody[i], LoopVectorBody[i-2]);
3716 DT->addNewBlock(LoopMiddleBlock, LoopBypassBlocks[1]);
3717 DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]);
3718 DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);
3719 DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]);
3721 DEBUG(DT->verifyDomTree());
3741 bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
3743 emitAnalysis(VectorizationReport() <<
"if-conversion is disabled");
3747 assert(TheLoop->getNumBlocks() > 1 &&
"Single block loops are vectorizable");
3754 BE = TheLoop->block_end(); BI != BE; ++BI) {
3757 if (blockNeedsPredication(BB))
3761 if (
LoadInst *LI = dyn_cast<LoadInst>(I))
3763 else if (
StoreInst *SI = dyn_cast<StoreInst>(I))
3764 SafePointes.insert(SI->getPointerOperand());
3771 BE = TheLoop->block_end(); BI != BE; ++BI) {
3777 <<
"loop contains a switch statement");
3782 if (blockNeedsPredication(BB)) {
3783 if (!blockCanBePredicated(BB, SafePointes)) {
3785 <<
"control flow cannot be substituted for a select");
3790 <<
"control flow cannot be substituted for a select");
3799 bool LoopVectorizationLegality::canVectorize() {
3802 if (!TheLoop->getLoopPreheader()) {
3804 VectorizationReport() <<
3805 "loop control flow is not understood by vectorizer");
3810 if (!TheLoop->empty()) {
3811 emitAnalysis(VectorizationReport() <<
"loop is not the innermost loop");
3816 if (TheLoop->getNumBackEdges() != 1) {
3818 VectorizationReport() <<
3819 "loop control flow is not understood by vectorizer");
3824 if (!TheLoop->getExitingBlock()) {
3826 VectorizationReport() <<
3827 "loop control flow is not understood by vectorizer");
3834 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
3836 VectorizationReport() <<
3837 "loop control flow is not understood by vectorizer");
3843 TheLoop->getHeader()->getName() <<
'\n');
3846 unsigned NumBlocks = TheLoop->getNumBlocks();
3847 if (NumBlocks != 1 && !canVectorizeWithIfConvert()) {
3848 DEBUG(
dbgs() <<
"LV: Can't if-convert the loop.\n");
3853 const SCEV *ExitCount = SE->getBackedgeTakenCount(TheLoop);
3854 if (ExitCount == SE->getCouldNotCompute()) {
3856 "could not determine number of loop iterations");
3857 DEBUG(
dbgs() <<
"LV: SCEV could not compute the loop exit count.\n");
3862 if (!canVectorizeInstrs()) {
3863 DEBUG(
dbgs() <<
"LV: Can't vectorize the instructions or CFG\n");
3868 if (!canVectorizeMemory()) {
3869 DEBUG(
dbgs() <<
"LV: Can't vectorize due to memory conflicts\n");
3874 collectLoopUniforms();
3876 DEBUG(
dbgs() <<
"LV: We can vectorize this loop"
3877 << (LAI->getRuntimePointerChecking()->Need
3878 ?
" (with a runtime bound check)"
3884 InterleaveInfo.analyzeInterleaving(Strides);
3918 if (!Reductions.
count(Inst))
3924 DEBUG(
dbgs() <<
"LV: Found an outside user for : " << *UI <<
'\n');
3931 bool LoopVectorizationLegality::canVectorizeInstrs() {
3932 BasicBlock *PreHeader = TheLoop->getLoopPreheader();
3944 be = TheLoop->block_end(); bb != be; ++bb) {
3950 if (
PHINode *Phi = dyn_cast<PHINode>(it)) {
3957 <<
"loop control flow is not understood by vectorizer");
3958 DEBUG(
dbgs() <<
"LV: Found an non-int non-pointer PHI.\n");
3965 if (*bb != Header) {
3971 "value could not be identified as "
3972 "an induction or reduction variable");
3979 <<
"control flow not understood by vectorizer");
3980 DEBUG(
dbgs() <<
"LV: Found an invalid PHI.\n");
3988 InductionKind IK = isInductionVariable(Phi, StepValue);
3990 if (IK_NoInduction != IK) {
3998 if (IK == IK_IntInduction && StepValue->
isOne()) {
4002 if (!Induction || PhiTy == WidestIndTy)
4006 DEBUG(
dbgs() <<
"LV: Found an induction variable.\n");
4007 Inductions[Phi] = InductionInfo(StartValue, IK, StepValue);
4013 "use of induction value outside of the "
4014 "loop is not handled by vectorizer");
4023 AllowedExit.insert(Reductions[Phi].getLoopExitInstr());
4028 "value that could not be identified as "
4029 "reduction is used outside the loop");
4030 DEBUG(
dbgs() <<
"LV: Found an unidentified PHI."<< *Phi <<
"\n");
4043 "call instruction cannot be vectorized");
4044 DEBUG(
dbgs() <<
"LV: Found a non-intrinsic, non-libfunc callsite.\n");
4052 if (!SE->isLoopInvariant(SE->getSCEV(CI->
getOperand(1)), TheLoop)) {
4054 <<
"intrinsic instruction cannot be vectorized");
4055 DEBUG(
dbgs() <<
"LV: Found unvectorizable intrinsic " << *CI <<
"\n");
4063 !it->getType()->isVoidTy()) || isa<ExtractElementInst>(it)) {
4065 <<
"instruction return type cannot be vectorized");
4066 DEBUG(
dbgs() <<
"LV: Found unvectorizable type.\n");
4072 Type *
T =
ST->getValueOperand()->getType();
4075 "store instruction cannot be vectorized");
4079 collectStridedAccess(
ST);
4083 if (
LoadInst *LI = dyn_cast<LoadInst>(it))
4084 collectStridedAccess(LI);
4090 "value cannot be used outside the loop");
4099 DEBUG(
dbgs() <<
"LV: Did not find one integer induction var.\n");
4100 if (Inductions.empty()) {
4102 <<
"loop induction variable could not be identified");
4110 void LoopVectorizationLegality::collectStridedAccess(
Value *MemAccess) {
4111 Value *Ptr =
nullptr;
4112 if (
LoadInst *LI = dyn_cast<LoadInst>(MemAccess))
4114 else if (
StoreInst *SI = dyn_cast<StoreInst>(MemAccess))
4115 Ptr = SI->getPointerOperand();
4123 DEBUG(
dbgs() <<
"LV: Found a strided access that we can version");
4124 DEBUG(
dbgs() <<
" Ptr: " << *Ptr <<
" Stride: " << *Stride <<
"\n");
4125 Strides[Ptr] = Stride;
4126 StrideSet.insert(Stride);
4129 void LoopVectorizationLegality::collectLoopUniforms() {
4132 std::vector<Value*> Worklist;
4142 BE = TheLoop->block_end(); B != BE; ++B)
4145 if (I->getType()->isPointerTy() && isConsecutivePtr(I))
4146 Worklist.insert(Worklist.end(), I->op_begin(), I->op_end());
4148 while (!Worklist.empty()) {
4150 Worklist.pop_back();
4155 if (!I || !TheLoop->contains(I) || isa<PHINode>(
I))
4166 bool LoopVectorizationLegality::canVectorizeMemory() {
4167 LAI = &LAA->getInfo(TheLoop, Strides);
4168 auto &OptionalReport = LAI->getReport();
4171 if (!LAI->canVectorizeMemory())
4174 if (LAI->hasStoreToLoopInvariantAddress()) {
4176 VectorizationReport()
4177 <<
"write to a loop invariant address could not be vectorized");
4178 DEBUG(
dbgs() <<
"LV: We don't allow storing to uniform addresses\n");
4182 if (LAI->getNumRuntimePointerChecks() >
4185 << LAI->getNumRuntimePointerChecks() <<
" exceeds limit of "
4187 <<
" dependent memory operations checked at runtime");
4188 DEBUG(
dbgs() <<
"LV: Too many memory checks needed.\n");
4194 LoopVectorizationLegality::InductionKind
4195 LoopVectorizationLegality::isInductionVariable(
PHINode *Phi,
4198 return IK_NoInduction;
4203 return IK_IntInduction;
4205 return IK_PtrInduction;
4208 bool LoopVectorizationLegality::isInductionVariable(
const Value *V) {
4210 PHINode *PN = dyn_cast_or_null<PHINode>(In0);
4214 return Inductions.count(PN);
4217 bool LoopVectorizationLegality::blockNeedsPredication(
BasicBlock *BB) {
4221 bool LoopVectorizationLegality::blockCanBePredicated(
BasicBlock *BB,
4228 if (
Constant *C = dyn_cast<Constant>(*OI))
4233 if (it->mayReadFromMemory()) {
4239 MaskedOp.insert(LI);
4247 if (it->mayWriteToMemory()) {
4258 !isSinglePredecessor) {
4261 bool isLegalMaskedOp =
4264 if (isLegalMaskedOp) {
4266 MaskedOp.insert(SI);
4276 switch (it->getOpcode()) {
4278 case Instruction::UDiv:
4279 case Instruction::SDiv:
4280 case Instruction::URem:
4281 case Instruction::SRem:
4289 void InterleavedAccessInfo::collectConstStridedAccesses(
4295 for (
auto *BB : TheLoop->getBlocks()) {
4298 for (
auto &I : *BB) {
4299 if (!isa<LoadInst>(&I) && !isa<StoreInst>(&I))
4309 if (AccessList.
empty())
4312 auto &DL = TheLoop->getHeader()->getModule()->getDataLayout();
4313 for (
auto I : AccessList) {
4321 unsigned Factor =
std::abs(Stride);
4332 unsigned Align = LI ? LI->
getAlignment() : SI->getAlignment();
4336 StrideAccesses[
I] = StrideDescriptor(Stride, Scev, Size, Align);
4360 void InterleavedAccessInfo::analyzeInterleaving(
4362 DEBUG(
dbgs() <<
"LV: Analyzing interleaved accesses...\n");
4366 collectConstStridedAccesses(StrideAccesses, Strides);
4368 if (StrideAccesses.
empty())
4387 for (
auto I = StrideAccesses.
rbegin(), E = StrideAccesses.
rend(); I != E;
4390 StrideDescriptor DesA = I->second;
4392 InterleaveGroup *Group = getInterleaveGroup(A);
4394 DEBUG(
dbgs() <<
"LV: Creating an interleave group with:" << *A <<
'\n');
4395 Group = createInterleaveGroup(A, DesA.Stride, DesA.Align);
4399 StoreGroups.
insert(Group);
4401 for (
auto II = std::next(I); II != E; ++II) {
4403 StrideDescriptor DesB = II->second;
4410 if (DesB.Stride != DesA.Stride || DesB.Size != DesA.Size)
4423 if (DistanceToA % static_cast<int>(DesA.Size))
4428 Group->getIndex(A) + DistanceToA /
static_cast<int>(DesA.Size);
4431 if (Group->insertMember(B, IndexB, DesB.Align)) {
4432 DEBUG(
dbgs() <<
"LV: Inserted:" << *B <<
'\n'
4433 <<
" into the interleave group with" << *A <<
'\n');
4434 InterleaveGroupMap[B] = Group;
4438 Group->setInsertPos(B);
4444 for (InterleaveGroup *Group : StoreGroups)
4445 if (Group->getNumMembers() != Group->getFactor())
4446 releaseGroup(Group);
4450 LoopVectorizationCostModel::selectVectorizationFactor(
bool OptForSize) {
4453 if (OptForSize && Legal->getRuntimePointerChecking()->Need) {
4455 "runtime pointer checks needed. Enable vectorization of this "
4456 "loop with '#pragma clang loop vectorize(enable)' when "
4457 "compiling with -Os");
4458 DEBUG(
dbgs() <<
"LV: Aborting. Runtime ptr check is required in Os.\n");
4464 "store that is conditionally executed prevents vectorization");
4465 DEBUG(
dbgs() <<
"LV: No vectorization. There are conditional stores.\n");
4470 unsigned TC = SE->getSmallConstantTripCount(TheLoop);
4471 DEBUG(
dbgs() <<
"LV: Found trip count: " << TC <<
'\n');
4473 unsigned WidestType = getWidestType();
4474 unsigned WidestRegister = TTI.getRegisterBitWidth(
true);
4475 unsigned MaxSafeDepDist = -1U;
4476 if (Legal->getMaxSafeDepDistBytes() != -1U)
4477 MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8;
4478 WidestRegister = ((WidestRegister < MaxSafeDepDist) ?
4479 WidestRegister : MaxSafeDepDist);
4480 unsigned MaxVectorSize = WidestRegister / WidestType;
4481 DEBUG(
dbgs() <<
"LV: The Widest type: " << WidestType <<
" bits.\n");
4482 DEBUG(
dbgs() <<
"LV: The Widest register is: "
4483 << WidestRegister <<
" bits.\n");
4485 if (MaxVectorSize == 0) {
4486 DEBUG(
dbgs() <<
"LV: The target has no vector registers.\n");
4490 assert(MaxVectorSize <= 64 &&
"Did not expect to pack so many elements"
4491 " into one vector!");
4493 unsigned VF = MaxVectorSize;
4500 (VectorizationReport() <<
4501 "unable to calculate the loop count due to complex control flow");
4502 DEBUG(
dbgs() <<
"LV: Aborting. A tail loop is required in Os.\n");
4507 VF = TC % MaxVectorSize;
4515 "cannot optimize for size and vectorize at the "
4516 "same time. Enable vectorization of this loop "
4517 "with '#pragma clang loop vectorize(enable)' "
4518 "when compiling with -Os");
4519 DEBUG(
dbgs() <<
"LV: Aborting. A tail loop is required in Os.\n");
4524 int UserVF = Hints->getWidth();
4526 assert(
isPowerOf2_32(UserVF) &&
"VF needs to be a power of two");
4527 DEBUG(
dbgs() <<
"LV: Using user VF " << UserVF <<
".\n");
4529 Factor.Width = UserVF;
4533 float Cost = expectedCost(1);
4535 const float ScalarCost = Cost;
4538 DEBUG(
dbgs() <<
"LV: Scalar loop costs: " << (
int)ScalarCost <<
".\n");
4540 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
4542 if (ForceVectorization && VF > 1) {
4544 Cost = expectedCost(Width) / (float)Width;
4547 for (
unsigned i=2; i <= VF; i*=2) {
4551 float VectorCost = expectedCost(i) / (float)i;
4552 DEBUG(
dbgs() <<
"LV: Vector loop of width " << i <<
" costs: " <<
4553 (
int)VectorCost <<
".\n");
4554 if (VectorCost < Cost) {
4560 DEBUG(
if (ForceVectorization && Width > 1 && Cost >= ScalarCost)
dbgs()
4561 <<
"LV: Vectorization seems to be not beneficial, "
4562 <<
"but was forced by a user.\n");
4563 DEBUG(
dbgs() <<
"LV: Selecting VF: "<< Width <<
".\n");
4564 Factor.Width = Width;
4565 Factor.Cost = Width * Cost;
4569 unsigned LoopVectorizationCostModel::getWidestType() {
4570 unsigned MaxWidth = 8;
4571 const DataLayout &DL = TheFunction->getParent()->getDataLayout();
4575 be = TheLoop->block_end(); bb != be; ++bb) {
4580 Type *T = it->getType();
4583 if (EphValues.count(it))
4587 if (!isa<LoadInst>(it) && !isa<StoreInst>(it) && !isa<PHINode>(it))
4591 if (
PHINode *PN = dyn_cast<PHINode>(it))
4592 if (!Legal->getReductionVars()->count(PN))
4597 T =
ST->getValueOperand()->getType();
4602 if (T->
isPointerTy() && !isConsecutiveLoadOrStore(it))
4605 MaxWidth = std::max(MaxWidth,
4613 unsigned LoopVectorizationCostModel::selectInterleaveCount(
bool OptForSize,
4615 unsigned LoopCost) {
4632 int UserUF = Hints->getInterleave();
4641 if (Legal->getMaxSafeDepDistBytes() != -1U)
4645 unsigned TC = SE->getSmallConstantTripCount(TheLoop);
4649 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(VF > 1);
4650 DEBUG(
dbgs() <<
"LV: The target has " << TargetNumRegisters <<
4661 LoopVectorizationCostModel::RegisterUsage R = calculateRegisterUsage();
4664 R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U);
4665 R.NumInstructions = std::max(R.NumInstructions, 1U);
4675 unsigned IC =
PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs) /
4680 IC =
PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs - 1) /
4681 std::max(1U, (R.MaxLocalUsers - 1)));
4684 unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
4698 LoopCost = expectedCost(VF);
4702 if (IC > MaxInterleaveCount)
4703 IC = MaxInterleaveCount;
4709 if (VF > 1 && Legal->getReductionVars()->size()) {
4710 DEBUG(
dbgs() <<
"LV: Interleaving because of reductions.\n");
4716 bool InterleavingRequiresRuntimePointerCheck =
4717 (VF == 1 && Legal->getRuntimePointerChecking()->Need);
4721 DEBUG(
dbgs() <<
"LV: Loop cost is " << LoopCost <<
'\n');
4722 if (!InterleavingRequiresRuntimePointerCheck && LoopCost <
SmallLoopCost) {
4731 unsigned NumStores = Legal->getNumStores();
4732 unsigned NumLoads = Legal->getNumLoads();
4733 unsigned StoresIC = IC / (NumStores ? NumStores : 1);
4734 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
4740 if (Legal->getReductionVars()->size() &&
4741 TheLoop->getLoopDepth() > 1) {
4749 std::max(StoresIC, LoadsIC) > SmallIC) {
4750 DEBUG(
dbgs() <<
"LV: Interleaving to saturate store or load ports.\n");
4751 return std::max(StoresIC, LoadsIC);
4754 DEBUG(
dbgs() <<
"LV: Interleaving to reduce branch cost.\n");
4761 bool HasReductions = (Legal->getReductionVars()->size() > 0);
4762 if (TTI.enableAggressiveInterleaving(HasReductions)) {
4763 DEBUG(
dbgs() <<
"LV: Interleaving to expose ILP.\n");
4767 DEBUG(
dbgs() <<
"LV: Not Interleaving.\n");
4771 LoopVectorizationCostModel::RegisterUsage
4772 LoopVectorizationCostModel::calculateRegisterUsage() {
4794 R.NumInstructions = 0;
4803 IntervalMap EndPoint;
4812 be = DFS.endRPO(); bb != be; ++bb) {
4813 R.NumInstructions += (*bb)->size();
4817 IdxToInstr[Index++] =
I;
4825 if (!Instr)
continue;
4828 if (!TheLoop->contains(Instr)) {
4829 LoopInvariants.
insert(Instr);
4834 EndPoint[Instr] = Index;
4847 TransposeEnds[it->second].push_back(it->first);
4850 unsigned MaxUsage = 0;
4853 DEBUG(
dbgs() <<
"LV(REG): Calculating max register usage:\n");
4854 for (
unsigned int i = 0; i < Index; ++i) {
4857 if (!Ends.
count(I))
continue;
4860 if (EphValues.count(I))
4864 InstrList &List = TransposeEnds[i];
4865 for (
unsigned int j=0, e = List.size(); j < e; ++j)
4866 OpenIntervals.
erase(List[j]);
4869 MaxUsage = std::max(MaxUsage, OpenIntervals.
size());
4871 DEBUG(
dbgs() <<
"LV(REG): At #" << i <<
" Interval # " <<
4872 OpenIntervals.
size() <<
'\n');
4878 unsigned Invariant = LoopInvariants.
size();
4879 DEBUG(
dbgs() <<
"LV(REG): Found max usage: " << MaxUsage <<
'\n');
4880 DEBUG(
dbgs() <<
"LV(REG): Found invariant usage: " << Invariant <<
'\n');
4881 DEBUG(
dbgs() <<
"LV(REG): LoopSize: " << R.NumInstructions <<
'\n');
4883 R.LoopInvariantRegs = Invariant;
4884 R.MaxLocalUsers = MaxUsage;
4888 unsigned LoopVectorizationCostModel::expectedCost(
unsigned VF) {
4893 be = TheLoop->block_end(); bb != be; ++bb) {
4894 unsigned BlockCost = 0;
4900 if (isa<DbgInfoIntrinsic>(it))
4904 if (EphValues.count(it))
4907 unsigned C = getInstructionCost(it, VF);
4914 DEBUG(
dbgs() <<
"LV: Found an estimated cost of " << C <<
" for VF " <<
4915 VF <<
" For instruction: " << *it <<
'\n');
4921 if (VF == 1 && Legal->blockNeedsPredication(*bb))
4940 LoopVectorizationLegality *Legal,
4942 const Loop *TheLoop) {
4950 for (
unsigned i = 1; i < NumOperands; ++i) {
4953 !Legal->isInductionVariable(Opd))
4959 unsigned MaxMergeDistance = 64;
4980 return StepVal > MaxMergeDistance;
4990 LoopVectorizationCostModel::getInstructionCost(
Instruction *I,
unsigned VF) {
4993 if (Legal->isUniformAfterVectorization(I))
4997 Type *VectorTy = ToVectorTy(RetTy, VF);
5001 case Instruction::GetElementPtr:
5007 case Instruction::Br: {
5008 return TTI.getCFInstrCost(I->
getOpcode());
5013 case Instruction::Add:
5014 case Instruction::FAdd:
5015 case Instruction::Sub:
5016 case Instruction::FSub:
5017 case Instruction::Mul:
5018 case Instruction::FMul:
5019 case Instruction::UDiv:
5020 case Instruction::SDiv:
5021 case Instruction::FDiv:
5022 case Instruction::URem:
5023 case Instruction::SRem:
5024 case Instruction::FRem:
5025 case Instruction::Shl:
5026 case Instruction::LShr:
5027 case Instruction::AShr:
5047 if (isa<ConstantInt>(Op2)) {
5052 }
else if (isa<ConstantVector>(Op2) || isa<ConstantDataVector>(Op2)) {
5054 Constant *SplatValue = cast<Constant>(Op2)->getSplatValue();
5063 return TTI.getArithmeticInstrCost(I->
getOpcode(), VectorTy, Op1VK, Op2VK,
5069 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
5074 return TTI.getCmpSelInstrCost(I->
getOpcode(), VectorTy, CondTy);
5076 case Instruction::ICmp:
5077 case Instruction::FCmp: {
5079 VectorTy = ToVectorTy(ValTy, VF);
5080 return TTI.getCmpSelInstrCost(I->
getOpcode(), VectorTy);
5088 VectorTy = ToVectorTy(ValTy, VF);
5090 unsigned Alignment = SI ? SI->
getAlignment() : LI->getAlignment();
5092 LI->getPointerAddressSpace();
5098 return TTI.getAddressComputationCost(VectorTy) +
5099 TTI.getMemoryOpCost(I->
getOpcode(), VectorTy, Alignment, AS);
5103 if (Legal->isAccessInterleaved(I)) {
5104 auto Group = Legal->getInterleavedAccessGroup(I);
5105 assert(Group &&
"Fail to get an interleaved access group.");
5108 if (Group->getInsertPos() !=
I)
5111 unsigned InterleaveFactor = Group->getFactor();
5120 for (
unsigned i = 0; i < InterleaveFactor; i++)
5121 if (Group->getMember(i))
5126 unsigned Cost = TTI.getInterleavedMemoryOpCost(
5127 I->
getOpcode(), WideVecTy, Group->getFactor(), Indices,
5128 Group->getAlignment(), AS);
5130 if (Group->isReverse())
5132 Group->getNumMembers() *
5142 int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
5143 bool Reverse = ConsecutiveStride < 0;
5147 if (!ConsecutiveStride || ScalarAllocatedSize != VectorElementSize) {
5148 bool IsComplexComputation =
5153 for (
unsigned i = 0; i < VF; ++i) {
5160 Instruction::InsertElement,
5165 Cost += VF * TTI.getAddressComputationCost(PtrTy, IsComplexComputation);
5166 Cost += VF * TTI.getMemoryOpCost(I->
getOpcode(), ValTy->getScalarType(),
5172 unsigned Cost = TTI.getAddressComputationCost(VectorTy);
5173 if (Legal->isMaskRequired(I))
5174 Cost += TTI.getMaskedMemoryOpCost(I->
getOpcode(), VectorTy, Alignment,
5177 Cost += TTI.getMemoryOpCost(I->
getOpcode(), VectorTy, Alignment, AS);
5184 case Instruction::ZExt:
5185 case Instruction::SExt:
5186 case Instruction::FPToUI:
5187 case Instruction::FPToSI:
5188 case Instruction::FPExt:
5189 case Instruction::PtrToInt:
5190 case Instruction::IntToPtr:
5191 case Instruction::SIToFP:
5192 case Instruction::UIToFP:
5193 case Instruction::Trunc:
5194 case Instruction::FPTrunc:
5195 case Instruction::BitCast: {
5198 if (I->
getOpcode() == Instruction::Trunc &&
5199 Legal->isInductionVariable(I->
getOperand(0)))
5204 return TTI.getCastInstrCost(I->
getOpcode(), VectorTy, SrcVecTy);
5207 bool NeedToScalarize;
5220 if (!RetTy->
isVoidTy() && VF != 1) {
5221 unsigned InsCost = TTI.getVectorInstrCost(Instruction::InsertElement,
5233 Cost += VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy);
5240 static const char lv_name[] =
"Loop Vectorization";
5256 return new LoopVectorize(NoUnrolling, AlwaysVectorize);
5260 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(
Instruction *Inst) {
5263 return Legal->isConsecutivePtr(
ST->getPointerOperand()) != 0;
5266 if (
LoadInst *LI = dyn_cast<LoadInst>(Inst))
5267 return Legal->isConsecutivePtr(LI->getPointerOperand()) != 0;
5273 void InnerLoopUnroller::scalarizeInstruction(
Instruction *Instr,
5274 bool IfPredicateStore) {
5279 setDebugLocFromInst(Builder, Instr);
5286 if (SrcOp == OldInduction) {
5287 Params.push_back(getVectorValue(SrcOp));
5296 if (SrcInst && OrigLoop->contains(SrcInst)) {
5297 assert(WidenMap.has(SrcInst) &&
"Source operand is unavailable");
5299 Params.push_back(WidenMap.get(SrcInst));
5302 VectorParts Scalars;
5303 Scalars.append(UF, SrcOp);
5304 Params.push_back(Scalars);
5309 "Invalid number of operands");
5314 Value *UndefVec = IsVoidRetTy ?
nullptr :
5317 VectorParts &VecResults = WidenMap.splat(Instr, UndefVec);
5320 BasicBlock *IfBlock = Builder.GetInsertBlock();
5324 Loop *VectorLp =
nullptr;
5325 if (IfPredicateStore) {
5327 "Only support single predecessor blocks");
5330 VectorLp = LI->getLoopFor(IfBlock);
5331 assert(VectorLp &&
"Must have a loop for this block");
5335 for (
unsigned Part = 0; Part < UF; ++Part) {
5339 Value *Cmp =
nullptr;
5340 if (IfPredicateStore) {
5341 if (Cond[Part]->
getType()->isVectorTy())
5343 Builder.CreateExtractElement(Cond[Part], Builder.getInt32(0));
5347 LoopVectorBody.push_back(CondBlock);
5348 VectorLp->addBasicBlockToLoop(CondBlock, *LI);
5350 Builder.SetInsertPoint(InsertPt);
5363 Builder.Insert(Cloned);
5368 VecResults[Part] = Cloned;
5371 if (IfPredicateStore) {
5373 LoopVectorBody.push_back(NewIfBlock);
5374 VectorLp->addBasicBlockToLoop(NewIfBlock, *LI);
5375 Builder.SetInsertPoint(InsertPt);
5378 IfBlock = NewIfBlock;
5383 void InnerLoopUnroller::vectorizeMemoryInstruction(
Instruction *Instr) {
5385 bool IfPredicateStore = (SI && Legal->blockNeedsPredication(SI->
getParent()));
5387 return scalarizeInstruction(Instr, IfPredicateStore);
5390 Value *InnerLoopUnroller::reverseVector(
Value *Vec) {
5394 Value *InnerLoopUnroller::getBroadcastInstrs(
Value *V) {
5398 Value *InnerLoopUnroller::getStepVector(
Value *Val,
int StartIdx,
Value *Step) {
5401 assert(!ITy->
isVectorTy() &&
"Val must be a scalar");
5403 return Builder.CreateAdd(Val, Builder.CreateMul(C, Step),
"induction");
Pass interface - Implemented by all 'passes'.
static unsigned RuntimeMemoryCheckThreshold
\brief When performing memory disambiguation checks at runtime do not make more than this number of c...
VectorType::iterator iterator
Value * CreateGEP(Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="")
static unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract, const TargetTransformInfo &TTI)
Estimate the overhead of scalarizing a value.
Value * getValueOperand()
iplist< Instruction >::iterator eraseFromParent()
eraseFromParent - This method unlinks 'this' from the containing basic block and deletes it...
void push_back(const T &Elt)
Intrinsic::ID getIntrinsicIDForCall(CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
A parsed version of the target data layout string in and methods for querying it. ...
Value * getPointerOperand()
Instruction::CastOps getOpcode() const
Return the opcode of this CastInst.
void ReplaceInstWithInst(BasicBlock::InstListType &BIL, BasicBlock::iterator &BI, Instruction *I)
ReplaceInstWithInst - Replace the instruction specified by BI with the instruction specified by I...
This class is the base class for the comparison instructions.
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
static IntegerType * getInt1Ty(LLVMContext &C)
void addIncoming(Value *V, BasicBlock *BB)
addIncoming - Add an incoming value to the end of the PHI list
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
const_iterator begin() const
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function. ...
bool isOne() const
isOne - Return true if the expression is a constant one.
STATISTIC(NumFunctions,"Total number of functions")
ValueT lookup(const KeyT &Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
static cl::opt< unsigned > ForceTargetNumScalarRegs("force-target-num-scalar-regs", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's number of scalar registers."))
int getWidth()
Get the width of a number.
A Module instance is used to store all the information related to an LLVM module. ...
void replaceOperandWith(unsigned I, Metadata *New)
Replace a specific operand.
static Constant * getSequentialMask(IRBuilder<> &Builder, unsigned NumInt, unsigned NumUndef)
static MDString * get(LLVMContext &Context, StringRef Str)
Min/max implemented in terms of select(cmp()).
unsigned getNumOperands() const
static Value * ConcatenateVectors(IRBuilder<> &Builder, ArrayRef< Value * > InputList)
unsigned getNumOperands() const
Return number of MDNode operands.
value_op_iterator value_op_begin()
ScalarEvolution - This class is the main scalar evolution driver.
bool isInductionPHI(PHINode *, ScalarEvolution *, ConstantInt *&)
Checks if the given PHINode in a loop header is an induction variable.
bool endswith(StringRef Suffix) const
Check if this string ends with the given Suffix.
CallInst - This class represents a function call, abstracting a target machine's calling convention...
size_type count(PtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
An immutable pass that tracks lazily created AssumptionCache objects.
static bool isReductionPHI(PHINode *Phi, Loop *TheLoop, RecurrenceDescriptor &RedDes)
Returns true if Phi is a reduction in TheLoop.
A cache of .assume calls within a function.
StringRef substr(size_t Start, size_t N=npos) const
Return a reference to the substring from [Start, Start + N).
static cl::opt< unsigned > TinyTripCountVectorThreshold("vectorizer-min-trip-count", cl::init(16), cl::Hidden, cl::desc("Don't vectorize loops with a constant ""trip count that is smaller than this ""value."))
We don't vectorize loops with a known constant trip count below this number.
const SCEV * getStepRecurrence(ScalarEvolution &SE) const
getStepRecurrence - This method constructs and returns the recurrence indicating how much this expres...
Externally visible function.
void initializeLoopVectorizePass(PassRegistry &)
This class implements a map that also provides access to all stored values in a deterministic order...
bool isLoopInvariant(const SCEV *S, const Loop *L)
isLoopInvariant - Return true if the value of the given SCEV is unchanging in the specified loop...
value_op_iterator value_op_end()
LoopT * getParentLoop() const
const Function * getParent() const
Return the enclosing method, or null if none.
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
LoadInst - an instruction for reading from memory.
const_iterator end() const
FunctionType * getType(LLVMContext &Context, ID id, ArrayRef< Type * > Tys=None)
Return the function type for an intrinsic.
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
static cl::opt< unsigned > NumberOfStoresToPredicate("vectorize-num-stores-pred", cl::init(1), cl::Hidden, cl::desc("Max number of stores to be predicated behind an if."))
The number of stores in a loop that are allowed to need predication.
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
void emitOptimizationRemark(LLVMContext &Ctx, const char *PassName, const Function &Fn, const DebugLoc &DLoc, const Twine &Msg)
Emit an optimization-applied message.
const SCEV * replaceSymbolicStrideSCEV(ScalarEvolution *SE, const ValueToValueMap &PtrToStride, Value *Ptr, Value *OrigPtr=nullptr)
Return the SCEV corresponding to a pointer with the symbolic stride replaced with constant one...
static Value * ConcatenateTwoVectors(IRBuilder<> &Builder, Value *V1, Value *V2)
BlockT * getHeader() const
aarch64 collect AArch64 Collect Linker Optimization Hint(LOH)"
Value * CreateSub(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
SCEVCastExpr - This is the base class for unary cast operator classes.
Type * getPointerElementType() const
StringRef getName() const
Return a constant reference to the value's name.
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
iterator begin()
Instruction iterator methods.
static void cse(SmallVector< BasicBlock *, 4 > &BBs)
Perform cse of induction variable instructions.
IntegerType * getInt32Ty()
Fetch the type representing a 32-bit integer.
AnalysisUsage & addRequired()
#define INITIALIZE_PASS_DEPENDENCY(depName)
std::string str() const
Return the twine contents as a std::string.
SelectInst - This class represents the LLVM 'select' instruction.
bool isIdenticalTo(const Instruction *I) const
isIdenticalTo - Return true if the specified instruction is exactly identical to the current one...
void emitOptimizationRemarkAnalysis(LLVMContext &Ctx, const char *PassName, const Function &Fn, const DebugLoc &DLoc, const Twine &Msg)
Emit an optimization analysis remark message.
static const unsigned MaxVectorWidth
Maximum SIMD width.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal)
This is the base class for all instructions that perform data casts.
const APInt & getValue() const
Return the constant as an APInt value reference.
const Module * getModule() const
Return the module owning the function this basic block belongs to, or nullptr if the function does no...
T LLVM_ATTRIBUTE_UNUSED_RESULT pop_back_val()
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
A Use represents the edge between a Value definition and its users.
static cl::opt< bool > EnableLoadStoreRuntimeInterleave("enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden, cl::desc("Enable runtime interleaving until load/store ports are saturated"))
static cl::opt< unsigned > MaxInterleaveGroupFactor("max-interleave-group-factor", cl::Hidden, cl::desc("Maximum factor for an interleaved access group (default = 8)"), cl::init(8))
Maximum factor for an interleaved memory access.
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
unsigned getNumArgOperands() const
getNumArgOperands - Return the number of call arguments.
Instruction * getFirstNonPHI()
Returns a pointer to the first instruction in this block that is not a PHINode instruction.
static Constant * get(ArrayRef< Constant * > V)
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Number of individual test Apply this number of consecutive mutations to each input exit after the first new interesting input is found the minimized corpus is saved into the first input directory Number of jobs to run If min(jobs, NumberOfCpuCores()/2)\" is used.") FUZZER_FLAG_INT(reload
void setName(const Twine &Name)
Change the name of the value.
static ConstantInt * ExtractElement(Constant *V, Constant *Idx)
Type * getVectorElementType() const
Instruction * clone() const
clone() - Create a copy of 'this' instruction that is identical in all ways except the following: ...
static const unsigned TinyTripCountInterleaveThreshold
We don't interleave loops with a known constant trip count below this number.
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Value * getStrideFromPointer(Value *Ptr, ScalarEvolution *SE, Loop *Lp)
Get the stride of a pointer access in a loop.
FunctionType - Class to represent function types.
static unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF, const TargetTransformInfo &TTI, const TargetLibraryInfo *TLI)
void emitOptimizationRemarkMissed(LLVMContext &Ctx, const char *PassName, const Function &Fn, const DebugLoc &DLoc, const Twine &Msg)
Emit an optimization-missed message.
bool insert(const value_type &X)
Insert a new element into the SetVector.
bool mayReadFromMemory() const
mayReadFromMemory - Return true if this instruction may read memory.
LLVMContext & getContext() const
getContext - Return the LLVMContext in which this type was uniqued.
void addBasicBlockToLoop(BlockT *NewBB, LoopInfoBase< BlockT, LoopT > &LI)
addBasicBlockToLoop - This method is used by other analyses to update loop information.
SCEVAddRecExpr - This node represents a polynomial recurrence on the trip count of the specified loop...
bool LLVM_ATTRIBUTE_UNUSED_RESULT empty() const
void addChildLoop(LoopT *NewChild)
addChildLoop - Add the specified loop to be a child of this loop.
Pass * createLoopVectorizePass(bool NoUnrolling=false, bool AlwaysVectorize=true)
static bool isValidElementType(Type *ElemTy)
isValidElementType - Return true if the specified type is valid as a element type.
BasicBlock * getSuccessor(unsigned i) const
static FunctionType * get(Type *Result, ArrayRef< Type * > Params, bool isVarArg)
FunctionType::get - This static method is the primary way of constructing a FunctionType.
static Instruction * getFirstInst(Instruction *FirstInst, Value *V, Instruction *Loc)
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory)...
bool isFloatingPointTy() const
isFloatingPointTy - Return true if this is one of the six floating point types
Function * getDeclaration(Module *M, ID id, ArrayRef< Type * > Tys=None)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
StoreInst - an instruction for storing to memory.
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
unsigned getNumElements() const
Return the number of elements in the Vector type.
static cl::opt< unsigned > ForceTargetMaxScalarInterleaveFactor("force-target-max-scalar-interleave", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's max interleave factor for ""scalar loops."))
static Constant * getInterleavedMask(IRBuilder<> &Builder, unsigned VF, unsigned NumVec)
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree...
Type * getElementType() const
size_t size() const
size - Get the array size.
PointerType - Class to represent pointers.
unsigned getNumIncomingValues() const
getNumIncomingValues - Return the number of incoming edges
static bool canIfConvertPHINodes(BasicBlock *BB)
Check whether it is safe to if-convert this phi node.
Function * getFunction(StringRef Name) const
Look up the specified function in the module symbol table.
Optimization analysis message produced during vectorization.
GetElementPtrInst - an instruction for type-safe pointer arithmetic to access elements of arrays and ...
BlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate IR basic block frequen...
initializer< Ty > init(const Ty &Val)
static CmpInst * Create(OtherOps Op, unsigned short predicate, Value *S1, Value *S2, const Twine &Name="", Instruction *InsertBefore=nullptr)
Construct a compare instruction, given the opcode, the predicate and the two operands.
unsigned getAlignment() const
getAlignment - Return the alignment of the access that is being performed
BlockT * getLoopPreheader() const
getLoopPreheader - If there is a preheader for this loop, return it.
LLVM Basic Block Representation.
static cl::opt< unsigned > MaxNestedScalarReductionIC("max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden, cl::desc("The maximum interleave count to use when interleaving a scalar ""reduction in a nested loop."))
The instances of the Type class are immutable: once they are created, they are never changed...
void getAllMetadataOtherThanDebugLoc(SmallVectorImpl< std::pair< unsigned, MDNode * >> &MDs) const
getAllMetadataOtherThanDebugLoc - This does the same thing as getAllMetadata, except that it filters ...
This is an important class for using LLVM in a threaded context.
Type * getType() const
getType - Return the LLVM type of this SCEV expression.
BranchInst - Conditional or Unconditional Branch instruction.
Min/max implemented in terms of select(cmp()).
bool isVectorTy() const
isVectorTy - True if this is an instance of VectorType.
Value handle that tracks a Value across RAUW.
Value * CreateAdd(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
This is an important base class in LLVM.
const Value * getCondition() const
int64_t getSExtValue() const
Get sign extended value.
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static Type * getWiderType(const DataLayout &DL, Type *Ty0, Type *Ty1)
static Value * addFastMathFlag(Value *V)
Adds a 'fast' flag to floating point operations.
APInt Or(const APInt &LHS, const APInt &RHS)
Bitwise OR function for APInt.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
APInt Xor(const APInt &LHS, const APInt &RHS)
Bitwise XOR function for APInt.
static bool isLikelyComplexAddressComputation(Value *Ptr, LoopVectorizationLegality *Legal, ScalarEvolution *SE, const Loop *TheLoop)
Check whether the address computation for a non-consecutive memory access looks like an unlikely cand...
static Constant * getStridedMask(IRBuilder<> &Builder, unsigned Start, unsigned Stride, unsigned VF)
Interval::pred_iterator pred_begin(Interval *I)
pred_begin/pred_end - define methods so that Intervals may be used just like BasicBlocks can with the...
const DebugLoc & getDebugLoc() const
getDebugLoc - Return the debug location for this node as a DebugLoc.
MDNode * getLoopID() const
Return the llvm.loop loop id metadata node for this loop if it is present.
Represent the analysis usage information of a pass.
BasicBlock * getIncomingBlock(unsigned i) const
getIncomingBlock - Return incoming basic block number i.
bool contains(const LoopT *L) const
contains - Return true if the specified loop is contained within in this loop.
unsigned getBitWidth() const
Return the number of bits in the APInt.
std::vector< BasicBlock * >::const_reverse_iterator RPOIterator
FunctionPass class - This class is used to implement most global optimizations.
Value * getOperand(unsigned i) const
Interval::pred_iterator pred_end(Interval *I)
Value * getPointerOperand()
int isStridedPtr(ScalarEvolution *SE, Value *Ptr, const Loop *Lp, const ValueToValueMap &StridesMap)
Check the stride of the pointer and ensure that it does not wrap in the address space.
std::pair< NoneType, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
static cl::opt< unsigned > ForceTargetNumVectorRegs("force-target-num-vector-regs", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's number of vector registers."))
Predicate getPredicate() const
Return the predicate for this instruction.
static CastInst * CreatePointerCast(Value *S, Type *Ty, const Twine &Name, BasicBlock *InsertAtEnd)
Create a BitCast AddrSpaceCast, or a PtrToInt cast instruction.
static Constant * getAllOnesValue(Type *Ty)
Get the all ones value.
#define INITIALIZE_AG_DEPENDENCY(depName)
void append(in_iter in_start, in_iter in_end)
Add the specified range to the end of the SmallVector.
bool isPointerTy() const
isPointerTy - True if this is an instance of PointerType.
static UndefValue * get(Type *T)
get() - Static factory methods - Return an 'undef' object of the specified type.
LLVMContext & getContext() const
All values hold a context through their type.
RecurrenceKind getRecurrenceKind()
PointerType * getPointerTo(unsigned AddrSpace=0)
getPointerTo - Return a pointer to the current type.
static unsigned getRecurrenceBinOp(RecurrenceKind Kind)
Returns the opcode of binary operation corresponding to the RecurrenceKind.
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Value * stripIntegerCast(Value *V)
static void emitAnalysis(const LoopAccessReport &Message, const Function *TheFunction, const Loop *TheLoop, const char *PassName)
Emit an analysis note for PassName with the debug location from the instruction in Message if availab...
void setMetadata(unsigned KindID, MDNode *Node)
setMetadata - Set the metadata of the specified kind to the specified node.
DebugLoc getStartLoc() const
Return the debug location of the start of this loop.
bool mayWriteToMemory() const
mayWriteToMemory - Return true if this instruction may modify memory.
void emitLoopVectorizeWarning(LLVMContext &Ctx, const Function &Fn, const DebugLoc &DLoc, const Twine &Msg)
Emit a warning when loop vectorization is specified but fails.
The RecurrenceDescriptor is used to identify recurrences variables in a loop.
bool isConditional() const
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
void setLoopID(MDNode *LoopID) const
Set the llvm.loop loop id metadata for this loop.
IntegerType * getIntPtrType(LLVMContext &C, unsigned AddressSpace=0) const
Returns an integer type with size at least as big as that of a pointer in the given address space...
BinaryOps getOpcode() const
StringRef getString() const
unsigned getABITypeAlignment(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
static Constant * getSplat(unsigned NumElts, Constant *Elt)
getSplat - Return a ConstantVector with the specified constant in each element.
const MDOperand & getOperand(unsigned I) const
void emitLoopInterleaveWarning(LLVMContext &Ctx, const Function &Fn, const DebugLoc &DLoc, const Twine &Msg)
Emit a warning when loop interleaving is specified but fails.
A SetVector that performs no allocations if smaller than a certain size.
bool startswith(StringRef Prefix) const
Check if this string starts with the given Prefix.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements...
This is the shared class of boolean and integer constants.
Value * getIncomingValue(unsigned i) const
getIncomingValue - Return incoming value number i
uint64_t getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
unsigned getVectorNumElements() const
static void emitAnalysis(CallSite CS, const Twine &Msg)
unsigned getScalarSizeInBits() const LLVM_READONLY
getScalarSizeInBits - If this is a vector type, return the getPrimitiveSizeInBits value for the eleme...
AnalysisUsage & addRequiredID(const void *ID)
const Module * getModule() const
Return the module owning the function this instruction belongs to, or nullptr if the function does not...
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small...
bool isAllOnesValue() const
isAllOnesValue - Return true if the expression is a constant all-ones value.
Module.h This file contains the declarations for the Module class.
Type * getType() const
All values are typed, get the type of this value.
static cl::opt< unsigned > ForceTargetInstructionCost("force-target-instruction-cost", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's expected cost for ""an instruction to a single constant value. Mostly ""useful for getting consistent testing."))
Provides information about what library functions are available for the current target.
static Type * convertPointerToIntegerType(const DataLayout &DL, Type *Ty)
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Drive the analysis of memory accesses in the loop.
static Constant * get(Type *Ty, uint64_t V, bool isSigned=false)
If Ty is a vector type, return a Constant with a splat of the given value.
static ConstantInt * getSigned(IntegerType *Ty, int64_t V)
Return a ConstantInt with the specified value for the specified type.
Function * getCalledFunction() const
getCalledFunction - Return the function called, or null if this is an indirect function invocation...
static BranchInst * Create(BasicBlock *IfTrue, Instruction *InsertBefore=nullptr)
static cl::opt< bool > EnableIndVarRegisterHeur("enable-ind-var-reg-heur", cl::init(true), cl::Hidden, cl::desc("Count the induction variable only once when interleaving"))
reverse_iterator rbegin()
static PHINode * Create(Type *Ty, unsigned NumReservedValues, const Twine &NameStr="", Instruction *InsertBefore=nullptr)
Constructors - NumReservedValues is a hint for the number of incoming edges that this phi node will h...
ConstantInt * getValue() const
bool hasVectorInstrinsicScalarOpd(Intrinsic::ID ID, unsigned ScalarOpdIdx)
Identifies if the intrinsic has a scalar operand.
static cl::opt< AlignMode > Align(cl::desc("Load/store alignment support"), cl::Hidden, cl::init(NoStrictAlign), cl::values(clEnumValN(StrictAlign,"aarch64-strict-align","Disallow all unaligned memory accesses"), clEnumValN(NoStrictAlign,"aarch64-no-strict-align","Allow unaligned memory accesses"), clEnumValEnd))
static ConstantInt * getTrue(LLVMContext &Context)
void setOperand(unsigned i, Value *Val)
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Value * getArgOperand(unsigned i) const
getArgOperand/setArgOperand - Return/set the i-th call argument.
static Constant * getRecurrenceIdentity(RecurrenceKind K, Type *Tp)
Returns identity corresponding to the RecurrenceKind.
Store the result of a depth first search within basic blocks contained by a single loop...
VectorType - Class to represent vector types.
Class for arbitrary precision integers.
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Value * getIncomingValueForBlock(const BasicBlock *BB) const
bool isIntegerTy() const
isIntegerTy - True if this is an instance of IntegerType.
iterator_range< user_iterator > users()
BasicBlock * getSinglePredecessor()
Return the predecessor of this block if it has a single predecessor block.
static Value * createMinMaxOp(IRBuilder<> &Builder, MinMaxRecurrenceKind RK, Value *Left, Value *Right)
Returns a Min/Max operation corresponding to MinMaxRecurrenceKind.
static const char lv_name[]
This class uses information about analyze scalars to rewrite expressions in canonical form...
LLVM_ATTRIBUTE_UNUSED_RESULT std::enable_if< !is_simple_type< Y >::value, typename cast_retty< X, const Y >::ret_type >::type dyn_cast(const Y &Val)
static const unsigned MaxInterleaveFactor
Maximum vectorization interleave count.
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
std::vector< BlockT * >::const_iterator block_iterator
Holds information about the memory runtime legality checks to verify that a group of pointers do not ...
const Type * getScalarType() const LLVM_READONLY
getScalarType - If this is a vector type, return the element type, otherwise return 'this'...
APInt And(const APInt &LHS, const APInt &RHS)
Bitwise AND function for APInt.
unsigned getGEPInductionOperand(const GetElementPtrInst *Gep)
Find the operand of the GEP that should be checked for consecutive stores.
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
static cl::opt< bool > EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden, cl::desc("Enable if-conversion during vectorization."))
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
This analysis provides dependence information for the memory accesses of a loop.
Value * getCondition() const
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
bool isAggregateType() const
isAggregateType - Return true if the type is an aggregate type.
static unsigned getVectorCallCost(CallInst *CI, unsigned VF, const TargetTransformInfo &TTI, const TargetLibraryInfo *TLI, bool &NeedToScalarize)
SCEV - This class represents an analyzed expression in the program.
static IntegerType * getInt32Ty(LLVMContext &C)
static cl::opt< bool > EnableInterleavedMemAccesses("enable-interleaved-mem-accesses", cl::init(false), cl::Hidden, cl::desc("Enable vectorization on interleaved memory accesses in a loop"))
bool isFunctionVectorizable(StringRef F, unsigned VF) const
std::error_code Check(std::error_code Err)
unsigned getAlignment() const
getAlignment - Return the alignment of the access that is being performed
static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst, SmallPtrSetImpl< Value * > &Reductions)
Check that the instruction has outside loop users and is not an identified reduction variable...
TerminatorInst * getTerminator()
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
APFloat abs(APFloat X)
Returns the absolute value of the argument.
bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
static cl::opt< unsigned, true > VectorizationFactor("force-vector-width", cl::Hidden, cl::desc("Sets the SIMD width. Zero is autoselect."), cl::location(VectorizerParams::VectorizationFactor))
static cl::opt< unsigned > ForceTargetMaxVectorInterleaveFactor("force-target-max-vector-interleave", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's max interleave factor for ""vectorized loops."))
static cl::opt< unsigned > SmallLoopCost("small-loop-cost", cl::init(20), cl::Hidden, cl::desc("The cost of a loop that is considered 'small' by the interleaver."))
BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="")
Split the basic block into two basic blocks at the specified instruction.
uint64_t getTypeStoreSize(Type *Ty) const
Returns the maximum number of bytes that may be overwritten by storing the specified type...
Collection of parameters shared between the Loop Vectorizer and the Loop Access Analysis.
iterator_range< op_iterator > arg_operands()
arg_operands - iteration adapter for range-for loops.
StringRef getValueAsString() const
Return the attribute's value as a string.
static cl::opt< bool > LoopVectorizeWithBlockFrequency("loop-vectorize-with-block-frequency", cl::init(false), cl::Hidden, cl::desc("Enable the use of the block frequency analysis to access PGO ""heuristics minimizing code growth in cold regions and being more ""aggressive in hot regions."))
const ARM::ArchExtKind Kind
LLVMContext & getContext() const
Get the context in which this basic block lives.
Value * CreateNeg(Value *V, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
A raw_ostream that writes to an std::string.
unsigned getPrimitiveSizeInBits() const LLVM_READONLY
getPrimitiveSizeInBits - Return the basic size of this type if it is a primitive type.
uint64_t PowerOf2Floor(uint64_t A)
Returns the power of two which is less than or equal to the given value.
Module * getParent()
Get the module that this global value is contained inside of...
LLVM Value Representation.
const SCEV * getSCEV(Value *V)
getSCEV - Return a SCEV expression for the full generality of the specified expression.
unsigned getOpcode() const
getOpcode() returns a member of one of the enums like Instruction::Add.
static VectorType * get(Type *ElementType, unsigned NumElements)
VectorType::get - This static method is the primary way to construct an VectorType.
Disable implicit floating point insts.
static bool blockNeedsPredication(BasicBlock *BB, Loop *TheLoop, DominatorTree *DT)
Return true if the block BB needs to be predicated in order for the loop to be vectorized.
uint64_t getTypeSizeInBits(Type *Ty) const
Size examples:
The legacy pass manager's analysis pass to compute loop information.
static unsigned VectorizationInterleave
Interleave factor as overridden by the user.
C - The default llvm calling convention, compatible with C.
bool isPowerOf2_32(uint32_t Value)
isPowerOf2_32 - This function returns true if the argument is a power of two > 0. ...
Convenience struct for specifying and reasoning about fast-math flags.
StringRef - Represent a constant reference to a string, i.e.
static Function * Create(FunctionType *Ty, LinkageTypes Linkage, const Twine &N="", Module *M=nullptr)
Legacy analysis pass which computes a DominatorTree.
static cl::opt< bool > EnableMemAccessVersioning("enable-mem-access-versioning", cl::init(true), cl::Hidden, cl::desc("Enable symbolic stride memory access versioning"))
This enables versioning on the strides of symbolically striding memory accesses in code like the foll...
iterator getFirstInsertionPt()
Returns an iterator to the first instruction in this block that is suitable for inserting a non-PHI i...
static void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop)...
void setIncomingValue(unsigned i, Value *V)
static bool isInterleaveForced()
True if force-vector-interleave was specified by the user.
static cl::opt< bool > EnableCondStoresVectorization("enable-cond-stores-vec", cl::init(false), cl::Hidden, cl::desc("Enable if predication of stores during vectorization."))
Value * getPointerOperand()
int getBasicBlockIndex(const BasicBlock *BB) const
getBasicBlockIndex - Return the first index of the specified basic block in the value list for this P...
const BasicBlock * getParent() const
bool isOne() const
This is just a convenience method to make client code smaller for a common case.
unsigned getMaxSafeDepDistBytes() const
static bool isPredicatedBlock(unsigned BlockNum)
Check whether this block is a predicated block.
RecurrenceKind
This enum represents the kinds of recurrences that we support.
bool isVoidTy() const
isVoidTy - Return true if this is 'void'.
SCEVConstant - This class represents a constant integer value.
bool empty() const
empty - Check if the string is empty.