#include <functional>

using namespace llvm;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

const char LLVMLoopVectorizeFollowupVectorized[] =
    "llvm.loop.vectorize.followup_vectorized";
const char LLVMLoopVectorizeFollowupEpilogue[] =
    "llvm.loop.vectorize.followup_epilogue";

STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
    cl::desc("Enable vectorization of epilogue loops."));

    cl::desc("When epilogue vectorization is enabled, and a value greater than "
             "1 is specified, forces the given VF for all applicable epilogue "
             "loops."));

    cl::desc("Only loops with vectorization factor equal to or larger than "
             "the specified value are considered for epilogue vectorization."));

    cl::desc("Loops with a constant trip count that is smaller than this "
             "value are vectorized only if no scalar iteration overheads "
             "are incurred."));

    cl::desc("The maximum allowed number of runtime memory checks with a "
             "vectorize(enable) pragma."));

    "prefer-predicate-over-epilogue",
    cl::desc("Tail-folding and predication preferences over creating a scalar "
             "epilogue loop."),
        "Don't tail-predicate loops, create scalar epilogue"),
        "predicate-else-scalar-epilogue",
        "prefer tail-folding, create scalar epilogue if tail "
        "folding fails."),
        "predicate-dont-vectorize",
        "prefers tail-folding, don't attempt vectorization if "
        "tail-folding fails.")));

    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in loop."));

    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

    cl::desc(
        "Enable vectorization on masked interleaved memory accesses in a loop"));

    cl::desc("We don't interleave loops with an estimated constant trip count "
             "below this number"));

    cl::desc("A flag that overrides the target's number of scalar registers."));

    cl::desc("A flag that overrides the target's number of vector registers."));

    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

    "Pretend that scalable vectors are supported, even if the target does "
    "not support them. This flag should only be used for testing."));

    "The cost of a loop that is considered 'small' by the interleaver."));

    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

    "Enable runtime interleaving until load/store ports are saturated"));

    cl::desc("Enable interleaving for loops with small iteration counts that "
             "contain scalar reductions to expose ILP."));

    cl::desc("Max number of stores to be predicated behind an if."));

    cl::desc("Count the induction variable only once when interleaving"));

    cl::desc("Enable if predication of stores during vectorization."));

    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

    cl::desc("Prefer in-loop vector reductions, "
             "overriding the target's preference."));

    cl::desc("Enable the vectorisation of loops with in-order (strict) "
             "FP reductions"));

    "Prefer predicating a reduction operation over an after loop select."));

    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));

    cl::desc("Enable VPlan-native vectorization path predicator with "
             "support for outer loop vectorization."));

    "Build VPlan for every supported loop nest in the function and bail "
    "out right after the build (stress test the VPlan H-CFG construction "
    "in the VPlan-native vectorization path)."));

    cl::desc("Enable loop interleaving in Loop vectorization passes"));

    cl::desc("Run the Loop vectorization passes"));

    cl::desc("Use dot format instead of plain text when dumping VPlans"));
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
                            const VPIteration &Instance, bool IfPredicateInstr,
                              VPValue *BlockInMask = nullptr);

      std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});

  std::unique_ptr<LoopVersioning> LVer;

           "A high UF for the epilogue loop is likely not beneficial.");

  std::pair<BasicBlock *, Value *>
  virtual std::pair<BasicBlock *, Value *>
  std::pair<BasicBlock *, Value *>
  std::pair<BasicBlock *, Value *>
  BasicBlock *emitMinimumVectorEpilogueIterCountCheck(
  if (I->getDebugLoc() != Empty)

  for (Use &Op : I->operands()) {
    if (OpInst->getDebugLoc() != Empty)

  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(V)) {
    if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
        B->SetCurrentDebugLocation(NewDIL.getValue());
                   << "Failed to create new discriminator: "
                   << DIL->getFilename() << " Line: " << DIL->getLine());
      B->SetCurrentDebugLocation(DIL);
    CodeRegion = I->getParent();
    if (I->getDebugLoc())
      DL = I->getDebugLoc();

  return B.CreateUIToFP(RuntimeVF, FTy);

             << "loop not vectorized: " << OREMsg);

  LoopDbgLoc.print(OS);

  if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
    LVer->annotateInstWithNoAlias(To, Orig);
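  // The lambda below walks the backward slice (operand chain) of an address
  // computation and records every recipe that carries poison-generating flags
  // (nuw/nsw, exact, inbounds) in State.MayGeneratePoisonRecipes, so those
  // flags can be dropped when the corresponding memory access is masked.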
  auto collectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
    Worklist.push_back(Root);

    while (!Worklist.empty()) {
      VPRecipeBase *CurRec = Worklist.back();
      Worklist.pop_back();

      if (!Visited.insert(CurRec).second)

      if (isa<VPWidenMemoryInstructionRecipe>(CurRec) ||
          isa<VPInterleaveRecipe>(CurRec) ||
          isa<VPScalarIVStepsRecipe>(CurRec) ||
          isa<VPCanonicalIVPHIRecipe>(CurRec))

      Instruction *Instr = CurRec->getUnderlyingInstr();
      if (Instr && Instr->hasPoisonGeneratingFlags())
        State.MayGeneratePoisonRecipes.insert(CurRec);

      for (VPValue *operand : CurRec->operands())
        if (VPDef *OpDef = operand->getDef())
          Worklist.push_back(cast<VPRecipeBase>(OpDef));

  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
      if (auto *WidenRec = dyn_cast<VPWidenMemoryInstructionRecipe>(&Recipe)) {
        Instruction *UnderlyingInstr = WidenRec->getUnderlyingInstr();
        VPDef *AddrDef = WidenRec->getAddr()->getDef();
        if (AddrDef && WidenRec->isConsecutive() && UnderlyingInstr &&
          collectPoisonGeneratingInstrsInBackwardSlice(
              cast<VPRecipeBase>(AddrDef));
      } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
        VPDef *AddrDef = InterleaveRec->getAddr()->getDef();
            InterleaveRec->getInterleaveGroup();
        bool NeedPredication = false;
             I < NumMembers; ++I) {
        if (NeedPredication)
          collectPoisonGeneratingInstrsInBackwardSlice(
              cast<VPRecipeBase>(AddrDef));
  for (Value *V : To) {

         "Expected to find a resume value for the reduction.");

  return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) <
         std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue());
        Hints(Hints), InterleaveInfo(IAI) {}

  bool runtimeChecksRequired();

  selectEpilogueVectorizationFactor(const ElementCount MaxVF,

    collectUniformsAndScalars(UserVF);
    collectInstsToScalarize(UserVF);
    return expectedCost(UserVF).first.isValid();

  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);

  void collectValuesToIgnore();

  void collectElementTypesForWidening();

  void collectInLoopReductions();

    return !Hints->allowReordering() && RdxDesc.isOrdered();

           "Profitable to scalarize relevant only for VF > 1.");

    auto Scalars = InstsToScalarize.find(VF);
    assert(Scalars != InstsToScalarize.end() &&
           "VF not yet analyzed for scalarization profitability");
    return Scalars->second.find(I) != Scalars->second.end();

    auto UniformsPerVF = Uniforms.find(VF);
    assert(UniformsPerVF != Uniforms.end() &&
           "VF not yet analyzed for uniformity");
    return UniformsPerVF->second.count(I);

    auto ScalarsPerVF = Scalars.find(VF);
    assert(ScalarsPerVF != Scalars.end() &&
           "Scalar values are not calculated for VF");
    return ScalarsPerVF->second.count(I);

    return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
           !isProfitableToScalarize(I, VF) &&
           !isScalarAfterVectorization(I, VF);

    WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);

      WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
      WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);

      return CM_GatherScatter;

    std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
    auto Itr = WideningDecisions.find(InstOnVF);
    if (Itr == WideningDecisions.end())
    return Itr->second.first;

    std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
    assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
           "The cost is not calculated");
    return WideningDecisions[InstOnVF].second;

    auto *Trunc = dyn_cast<TruncInst>(I);
    Value *Op = Trunc->getOperand(0);

    if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end())
    setCostBasedWideningDecision(VF);
    collectLoopUniforms(VF);
    collectLoopScalars(VF);
    bool LI = isa<LoadInst>(V);
    bool SI = isa<StoreInst>(V);

      const RecurrenceDescriptor &RdxDesc = Reduction.second;
      return TTI.isLegalToVectorizeReduction(RdxDesc, VF);

                                  bool IsKnownUniform = false) {
    if (IsKnownUniform && isa<LoadInst>(I) &&
    if (!blockNeedsPredicationForAnyReason(I->getParent()))
    if (isa<LoadInst>(I) || isa<StoreInst>(I))
    return isScalarWithPredication(I, VF);

    return InterleaveInfo.isInterleaved(Instr);

    return InterleaveInfo.getInterleaveGroup(Instr);

    if (!isScalarEpilogueAllowed())
    if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
    return VF.isVector() && InterleaveInfo.requiresScalarEpilogue();

  using ReductionChainMap =
    return InLoopReductionChains;

    return InLoopReductionChains.count(Phi);

                                   bool &NeedToScalarize) const;

    WideningDecisions.clear();

  unsigned NumPredStores = 0;

                                     bool FoldTailByMasking);

  ElementCount getMaximizedVFForTarget(unsigned ConstTripCount,
                                       unsigned SmallestType,
                                       unsigned WidestType,
                                       bool FoldTailByMasking);

  ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);

  using VectorizationCostTy = std::pair<InstructionCost, bool>;

  using InstructionVFPair = std::pair<Instruction *, ElementCount>;

  bool FoldTailByMasking = false;

  ReductionChainMap InLoopReductionChains;

  int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,

                               std::pair<InstWidening, InstructionCost>>;
  DecisionList WideningDecisions;

        TheLoop->isLoopInvariant(I))
    return Scalars.find(VF) == Scalars.end() ||
           !isScalarAfterVectorization(I, VF);

        Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));

  bool isCandidateForEpilogueVectorization(const Loop &L,

  bool isEpilogueVectorizationProfitable(const ElementCount VF) const;
  Value *SCEVCheckCond = nullptr;

  Value *MemRuntimeCheckCond = nullptr;

      : DT(DT), LI(LI), SCEVExp(SE, DL, "scev.check"),
        MemCheckExp(SE, DL, "scev.check") {}

                                 nullptr, "vector.scevcheck");

    if (RtPtrChecking.Need) {
      auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
      MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,

      auto DiffChecks = RtPtrChecking.getDiffChecks();
            MemCheckBlock->getTerminator(), L, *DiffChecks, MemCheckExp,
              return getRuntimeVF(B, B.getIntNTy(Bits), VF);

        MemRuntimeCheckCond =
            RtPtrChecking.getChecks(), MemCheckExp);
      assert(MemRuntimeCheckCond &&
             "no RT checks generated although RtPtrChecking "
             "claimed checks are required");

    if (!MemCheckBlock && !SCEVCheckBlock)

    if (SCEVCheckBlock) {
    if (MemCheckBlock) {
    if (MemCheckBlock) {
    if (SCEVCheckBlock) {

    if (!MemRuntimeCheckCond)
    if (MemRuntimeCheckCond) {
      auto &SE = *MemCheckExp.getSE();
        I.eraseFromParent();

    if (MemRuntimeCheckCond)

    SCEVCheckCond = nullptr;
    if (auto *C = dyn_cast<ConstantInt>(Cond))

      PL->addBasicBlockToLoop(SCEVCheckBlock, *LI);
    return SCEVCheckBlock;

    if (!MemRuntimeCheckCond)
      PL->addBasicBlockToLoop(MemCheckBlock, *LI);
                    Pred->getTerminator()->getDebugLoc());
    MemRuntimeCheckCond = nullptr;
    return MemCheckBlock;
    LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");

    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
                         "outer loops.\n");

  if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {

  for (Loop *InnerL : L)
  explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
                         bool VectorizeOnlyWhenForced = false)
        Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) {

    if (skipFunction(F))

    auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
    auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
    auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
    auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
    auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
    auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
    auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
    auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
    auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
    auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
    auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
    auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
    auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();

    return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
  auto *ValVTy = cast<VectorType>(Val->getType());
         "Induction Step must be an integer or FP");

  Type *InitVecValSTy =

    assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
           "Binary Opcode should be specified for FP induction");

         "Val and Step should have the same type");

    AddOp = ID.getInductionOpcode();
    MulOp = Instruction::FMul;

  Type *VecIVTy = nullptr;
  Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr;

  for (unsigned Part = 0; Part < State.UF; ++Part) {
      State.set(Def, Add, Part);

    for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
             "Expected StartIdx to be folded to a constant when VF is not "
             "scalable");

         "Induction step should be loop invariant");
  if (auto *E = dyn_cast<SCEVUnknown>(Step))
    return E->getValue();

  return Exp.expandCodeFor(Step, Step->getType(), InsertBefore);

  assert(Index->getType()->getScalarType() == Step->getType() &&
         "Index scalar type does not match StepValue type");

    assert(X->getType() == Y->getType() && "Types don't match!");
    if (auto *CX = dyn_cast<ConstantInt>(X))
    if (auto *CY = dyn_cast<ConstantInt>(Y))
    return B.CreateAdd(X, Y);

    assert(X->getType()->getScalarType() == Y->getType() &&
           "Types don't match!");
    if (auto *CX = dyn_cast<ConstantInt>(X))
    if (auto *CY = dyn_cast<ConstantInt>(Y))
    VectorType *XVTy = dyn_cast<VectorType>(X->getType());
    if (XVTy && !isa<VectorType>(Y->getType()))
    return B.CreateMul(X, Y);

  switch (ID.getKind()) {
    assert(!isa<VectorType>(Index->getType()) &&
           "Vector indices not supported for integer inductions yet");
           "Index type does not match StartValue type");
    if (isa<ConstantInt>(Step) && cast<ConstantInt>(Step)->isMinusOne())
      return B.CreateSub(StartValue, Index);

    assert(isa<Constant>(Step) &&
           "Expected constant step for pointer induction");
    return B.CreateGEP(ID.getElementType(), StartValue, CreateMul(Index, Step));

    assert(!isa<VectorType>(Index->getType()) &&
           "Vector indices not supported for FP inductions yet");
    auto InductionBinOp = ID.getInductionBinOp();
           (InductionBinOp->getOpcode() == Instruction::FAdd ||
            InductionBinOp->getOpcode() == Instruction::FSub) &&
           "Original bin op should be defined for FP induction");

    Value *MulExp = B.CreateFMul(Step, Index);
    return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,

                                   VectorValue, ScalarInst,
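  // The fragments that follow widen an interleave group: loads become one wide
  // (possibly masked) load per unroll part whose members are then split out
  // with strided shuffles, while stores concatenate the member vectors,
  // interleave them with a shuffle, and emit a single wide store.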
  unsigned InterleaveFactor = Group->getFactor();
  unsigned Index = Group->getIndex(Instr);
         "Reversed masked interleave-group not supported.");

  for (unsigned Part = 0; Part < UF; Part++) {
    bool InBounds = false;
      InBounds = gep->isInBounds();
    cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);

  Value *MaskForGaps = nullptr;
    assert(MaskForGaps && "Mask for Gaps is required but it is null");

  if (isa<LoadInst>(Instr)) {
    for (unsigned Part = 0; Part < UF; Part++) {
      if (BlockInMask || MaskForGaps) {
               "masked interleaved groups are not allowed.");
        Value *GroupMask = MaskForGaps;
          Value *BlockInMaskPart = State.get(BlockInMask, Part);
              "interleaved.mask");
          GroupMask = MaskForGaps
            GroupMask, PoisonVec, "wide.masked.vec");
      NewLoads.push_back(NewLoad);

    for (unsigned I = 0; I < InterleaveFactor; ++I) {
      for (unsigned Part = 0; Part < UF; Part++) {
            NewLoads[Part], StrideMask, "strided.vec");
        if (Member->getType() != ScalarTy) {
        State.set(VPDefs[J], StridedVec, Part);

         "masked interleaved groups are not allowed.");
         "masking gaps for scalable vectors is not yet supported.");
  for (unsigned Part = 0; Part < UF; Part++) {
    for (unsigned i = 0; i < InterleaveFactor; i++) {
             "Fail to get a member from an interleaved store group");
        StoredVecs.push_back(Undef);

      Value *StoredVec = State.get(StoredValues[i], Part);
      if (StoredVec->getType() != SubVT)
      StoredVecs.push_back(StoredVec);

    if (BlockInMask || MaskForGaps) {
      Value *GroupMask = MaskForGaps;
        Value *BlockInMaskPart = State.get(BlockInMask, Part);
            "interleaved.mask");
                                 ShuffledMask, MaskForGaps)
                                               bool IfPredicateInstr,
  if (isa<NoAliasScopeDeclInst>(Instr))

    auto InputInstance = Instance;
    Cloned->setOperand(I.index(), State.get(Operand, InputInstance));

  State.set(RepRecipe, Cloned, Instance);

  if (auto *II = dyn_cast<AssumeInst>(Cloned))

  if (IfPredicateInstr)

  assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && "Invalid loop count");
  assert(IdxTy && "No type for induction");
        BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
2910 "VF*UF must be a power of 2 when folding tail by masking");
2942 auto *DstFVTy = cast<FixedVectorType>(DstVTy);
2943 unsigned VF = DstFVTy->getNumElements();
2944 auto *SrcVecTy = cast<FixedVectorType>(V->
getType());
2945 assert((
VF == SrcVecTy->getNumElements()) &&
"Vector dimensions do not match");
2946 Type *SrcElemTy = SrcVecTy->getElementType();
2947 Type *DstElemTy = DstFVTy->getElementType();
2948 assert((
DL.getTypeSizeInBits(SrcElemTy) ==
DL.getTypeSizeInBits(DstElemTy)) &&
2949 "Vector elements must have same size");
2960 "Only one type should be a pointer type");
2962 "Only one type should be a floating point type");
2997 Value *MaxUIntTripCount =
3011 "TC check is expected to dominate Bypass");
3031 if (!SCEVCheckBlock)
3037 "Cannot SCEV check stride or overflow when optimizing for size");
3052 return SCEVCheckBlock;
3071 "Cannot emit memory checks when optimizing for size, unless forced "
3077 <<
"Code-size may be reduced by not forcing "
3078 "vectorization, or by source-code modifications "
3079 "eliminating the need for runtime checks "
3080 "(e.g., adding 'restrict').";
3093 LVer = std::make_unique<LoopVersioning>(
3097 LVer->prepareNoAliasMetadata();
3099 return MemCheckBlock;
3108 "multiple exit loop without required epilogue?");
3131 BrInst->
setDebugLoc(ScalarLatchTerm->getDebugLoc());
3145 std::pair<BasicBlock *, Value *> AdditionalBypass) {
3146 assert(((AdditionalBypass.first && AdditionalBypass.second) ||
3147 (!AdditionalBypass.first && !AdditionalBypass.second)) &&
3148 "Inconsistent information about additional bypass.");
3161 PHINode *OrigPhi = InductionEntry.first;
3171 Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
3172 if (OrigPhi == OldInduction) {
3192 if (AdditionalBypass.first) {
3193 B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt()));
3199 B.CreateCast(CastOp, AdditionalBypass.second, StepType,
"cast.vtc");
3200 EndValueFromAdditionalBypass =
3202 EndValueFromAdditionalBypass->
setName(
"ind.end");
3215 if (AdditionalBypass.first)
3217 EndValueFromAdditionalBypass);
3247 CmpN->
setDebugLoc(ScalarLatchTerm->getDebugLoc());
3251 #ifdef EXPENSIVE_CHECKS
3258 std::pair<BasicBlock *, Value *>
3354 assert(isa<PHINode>(UI) &&
"Expected LCSSA form");
3355 MissingVals[UI] = EndValue;
3363 auto *UI = cast<Instruction>(U);
3365 assert(isa<PHINode>(UI) &&
"Expected LCSSA form");
3373 Value *CountMinusOne =
B.CreateSub(
3377 ?
B.CreateCast(Instruction::SIToFP, CountMinusOne,
3386 Escape->
setName(
"ind.escape");
3387 MissingVals[UI] = Escape;
3391 for (
auto &
I : MissingVals) {
3392 PHINode *PHI = cast<PHINode>(
I.first);
struct CSEDenseMapInfo {
    return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
           isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);

    assert(canHandle(I) && "Unknown instruction!");
                                                  I->value_op_end()));

    if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
        LHS == getTombstoneKey() || RHS == getTombstoneKey())
    return LHS->isIdenticalTo(RHS);

      if (!CSEDenseMapInfo::canHandle(&In))

        In.replaceAllUsesWith(V);
        In.eraseFromParent();
3457 bool &NeedToScalarize)
const {
3461 for (
auto &ArgOp : CI->
args())
3462 ScalarTys.push_back(ArgOp->getType());
3471 return ScalarCallCost;
3475 for (
Type *ScalarTy : ScalarTys)
3487 NeedToScalarize =
true;
3497 if (VectorCallCost <
Cost) {
3498 NeedToScalarize =
false;
3499 Cost = VectorCallCost;
3514 assert(
ID &&
"Expected intrinsic call!");
3517 if (
auto *FPMO = dyn_cast<FPMathOperator>(CI))
3518 FMF = FPMO->getFastMathFlags();
3524 std::back_inserter(ParamTys),
3525 [&](
Type *Ty) { return MaybeVectorizeType(Ty, VF); });
3528 dyn_cast<IntrinsicInst>(CI));
3534 auto *
I1 = cast<IntegerType>(cast<VectorType>(
T1)->getElementType());
3535 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3536 return I1->getBitWidth() < I2->getBitWidth() ?
T1 : T2;
3540 auto *
I1 = cast<IntegerType>(cast<VectorType>(
T1)->getElementType());
3541 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3542 return I1->getBitWidth() > I2->getBitWidth() ?
T1 : T2;
  for (unsigned Part = 0; Part < UF; ++Part) {
      if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
      Type *OriginalTy = I->getType();
      Type *ScalarTruncatedTy =
          ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getElementCount());
      if (TruncatedTy == OriginalTy)

      auto ShrinkOperand = [&](Value *V) -> Value * {
        if (auto *ZI = dyn_cast<ZExtInst>(V))
          if (ZI->getSrcTy() == TruncatedTy)
            return ZI->getOperand(0);
        return B.CreateZExtOrTrunc(V, TruncatedTy);

      Value *NewI = nullptr;
      if (auto *BO = dyn_cast<BinaryOperator>(I)) {
        NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
                             ShrinkOperand(BO->getOperand(1)));
        cast<BinaryOperator>(NewI)->copyIRFlags(I, false);
      } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
            B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
                         ShrinkOperand(CI->getOperand(1)));
      } else if (auto *SI = dyn_cast<SelectInst>(I)) {
        NewI = B.CreateSelect(SI->getCondition(),
                              ShrinkOperand(SI->getTrueValue()),
                              ShrinkOperand(SI->getFalseValue()));
      } else if (auto *CI = dyn_cast<CastInst>(I)) {
        switch (CI->getOpcode()) {
        case Instruction::Trunc:
          NewI = ShrinkOperand(CI->getOperand(0));
        case Instruction::SExt:
          NewI = B.CreateSExtOrTrunc(
        case Instruction::ZExt:
          NewI = B.CreateZExtOrTrunc(
      } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
            cast<VectorType>(SI->getOperand(0)->getType())->getElementCount();
        auto *O0 = B.CreateZExtOrTrunc(
            cast<VectorType>(SI->getOperand(1)->getType())->getElementCount();
        auto *O1 = B.CreateZExtOrTrunc(
        NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask());
      } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
      } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
            cast<VectorType>(IE->getOperand(0)->getType())->getElementCount();
        auto *O0 = B.CreateZExtOrTrunc(
        auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
        NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
      } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
            cast<VectorType>(EE->getOperand(0)->getType())->getElementCount();
        auto *O0 = B.CreateZExtOrTrunc(
        NewI = B.CreateExtractElement(O0, EE->getOperand(2));

      Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
      I->replaceAllUsesWith(Res);
      cast<Instruction>(I)->eraseFromParent();

  for (unsigned Part = 0; Part < UF; ++Part) {

         "Unexpected non-induction PHIs for fixup in non VPlan-native path");
    if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R))
    else if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))

  Value *Incoming = State.get(PreviousDef, UF - 1);
  auto *ExtractForScalar = Incoming;
        "vector.recur.extract");

  Value *ExtractForPhiUsedOutsideLoop = nullptr;
        Incoming, Idx, "vector.recur.extract.for.phi");
    ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2);

  Start->addIncoming(Incoming, BB);

         "Unable to find the reduction variable");

    for (unsigned Part = 0; Part < UF; ++Part) {
      Value *VecLoopExitInst = State.get(LoopExitInstDef, Part);
      Value *Sel = nullptr;
      for (User *U : VecLoopExitInst->users()) {
        if (isa<SelectInst>(U)) {
          assert(!Sel && "Reduction exit feeding two selects");
          assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
      assert(Sel && "Reduction exit feeds no select");
      State.reset(LoopExitInstDef, Sel, Part);

          cast<PHINode>(State.get(PhiR, Part));
      VecRdxPhi->setIncomingValueForBlock(VectorLoopLatch, Sel);

    assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
    for (unsigned Part = 0; Part < UF; ++Part) {
      RdxParts[Part] = State.get(LoopExitInstDef, Part);
          U->replaceUsesOfWith(RdxParts[Part], Extnd);
        RdxParts[Part] = Extnd;
    for (unsigned Part = 0; Part < UF; ++Part) {
      State.reset(LoopExitInstDef, RdxParts[Part], Part);

  Value *ReducedPartRdx = State.get(LoopExitInstDef, 0);
    ReducedPartRdx = State.get(LoopExitInstDef, UF - 1);
    for (unsigned Part = 1; Part < UF; ++Part) {
      Value *RdxPart = State.get(LoopExitInstDef, Part);
      if (Op != Instruction::ICmp && Op != Instruction::FCmp) {
                                     ReducedPartRdx, RdxPart);

    ReducedPartRdx = RdxDesc.isSigned()

    BCBlockPhi->addIncoming(ReducedPartRdx, Incoming);
    BCBlockPhi->addIncoming(ReductionStartValue, Incoming);

  int IncomingEdgeBlockIdx =
  assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
  int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);

  assert(LoopExitInstr && "null loop exit instruction");
  Worklist.push_back(LoopExitInstr);
  Visited.insert(LoopExitInstr);

  while (!Worklist.empty()) {
    if (isa<OverflowingBinaryOperator>(Cur))
      for (unsigned Part = 0; Part < UF; ++Part) {
        cast<Instruction>(V)->dropPoisonGeneratingFlags();

          Visited.insert(UI).second)
        Worklist.push_back(UI);
    auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
    if (isa<Instruction>(IncomingValue) &&
    Value *lastIncomingValue =

  auto isBlockOfUsePredicated = [&](Use &U) -> bool {
    auto *I = cast<Instruction>(U.getUser());
    if (auto *Phi = dyn_cast<PHINode>(I))
      BB = Phi->getIncomingBlock(
    return BB == PredBB;

    Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
    InstsToReanalyze.clear();

    while (!Worklist.empty()) {
      if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) ||
          I->mayHaveSideEffects())

      if (I->getParent() == PredBB) {
        Worklist.insert(I->op_begin(), I->op_end());

        InstsToReanalyze.push_back(I);

      I->moveBefore(&*PredBB->getFirstInsertionPt());
      Worklist.insert(I->op_begin(), I->op_end());

    PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0));

         "Non-native vplans are not expected to have VPWidenPHIRecipes.");
    State.set(PhiR, VecPhi, 0);
  assert((I.getOpcode() == Instruction::UDiv ||
          I.getOpcode() == Instruction::SDiv ||
          I.getOpcode() == Instruction::URem ||
          I.getOpcode() == Instruction::SRem) &&
         "Unexpected instruction");
  Value *Divisor = I.getOperand(1);
  auto *CInt = dyn_cast<ConstantInt>(Divisor);
  return !CInt || CInt->isZero();

  assert(!isa<DbgInfoIntrinsic>(I) &&
         "DbgInfoIntrinsic should have been dropped during VPlan construction");

  Module *M = I.getParent()->getParent()->getParent();
  auto *CI = cast<CallInst>(&I);

  for (Value *ArgOperand : CI->args())

  bool NeedToScalarize = false;
  bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
  assert((UseVectorIntrinsic || !NeedToScalarize) &&
         "Instruction should be scalarized elsewhere.");
         "Either the intrinsic cost or vector call cost must be valid");

  for (unsigned Part = 0; Part < UF; ++Part) {
      if (!UseVectorIntrinsic ||
        Arg = State.get(I.value(), Part);
        TysForDecl.push_back(Arg->getType());

    if (UseVectorIntrinsic) {
      assert(VectorF && "Can't retrieve vector intrinsic.");
             "Can't create vector function.");

    CI->getOperandBundlesAsDefs(OpBundles);
    if (isa<FPMathOperator>(V))
void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
         "This function should not be visited twice for the same VF");

  auto *Latch = TheLoop->getLoopLatch();

    InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
    assert(WideningDecision != CM_Unknown &&
           "Widening decision should be ready at this moment");
    if (auto *Store = dyn_cast<StoreInst>(MemAccess))
      if (Ptr == Store->getValueOperand())
        return WideningDecision == CM_Scalarize;
           "Ptr is neither a value nor a pointer operand");
    return WideningDecision != CM_GatherScatter;

  auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
            isa<GetElementPtrInst>(V)) &&
           !TheLoop->isLoopInvariant(V);

    if (!isLoopVaryingBitCastOrGEP(Ptr))

    auto *I = cast<Instruction>(Ptr);
      return isa<LoadInst>(U) || isa<StoreInst>(U);

      PossibleNonScalarPtrs.insert(I);

  for (auto *BB : TheLoop->blocks())
    for (auto &I : *BB) {
      if (auto *Load = dyn_cast<LoadInst>(&I)) {
        evaluatePtrUse(Load, Load->getPointerOperand());
      } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
        evaluatePtrUse(Store, Store->getPointerOperand());
        evaluatePtrUse(Store, Store->getValueOperand());

  for (auto *I : ScalarPtrs)
    if (!PossibleNonScalarPtrs.count(I)) {

  auto ForcedScalar = ForcedScalars.find(VF);
  if (ForcedScalar != ForcedScalars.end())
    for (auto *I : ForcedScalar->second)

  while (Idx != Worklist.size()) {
    if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
    auto *Src = cast<Instruction>(Dst->getOperand(0));
      auto *J = cast<Instruction>(U);
      return !TheLoop->contains(J) || Worklist.count(J) ||
             ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
              isScalarUse(J, Src));
      LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");

    auto *Ind = Induction.first;
    auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));

    auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar,
      return Induction.second.getKind() ==
             (isa<LoadInst>(I) || isa<StoreInst>(I)) &&

      auto *I = cast<Instruction>(U);
      return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
             IsDirectLoadStoreFromPtrIndvar(Ind, I);

    auto ScalarIndUpdate =
          auto *I = cast<Instruction>(U);
          return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
                 IsDirectLoadStoreFromPtrIndvar(IndUpdate, I);
    if (!ScalarIndUpdate)

    Worklist.insert(IndUpdate);
    LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
    LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
                      << "\n");
  if (!blockNeedsPredicationForAnyReason(I->getParent()))

  switch (I->getOpcode()) {
    return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
                              : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::SRem:
  case Instruction::URem:

  assert(isAccessInterleaved(I) && "Expecting interleaved access.");
  assert(getWideningDecision(I, VF) == CM_Unknown &&
         "Decision should not be set yet.");
  auto *Group = getInterleavedAccessGroup(I);
  assert(Group && "Must have a group.");

  auto &DL = I->getModule()->getDataLayout();

  unsigned InterleaveFactor = Group->getFactor();
  bool ScalarNI = DL.isNonIntegralPointerType(ScalarTy);
  for (unsigned i = 0; i < InterleaveFactor; i++) {
    bool MemberNI = DL.isNonIntegralPointerType(MemberTy);
    if (MemberNI != ScalarNI) {
    } else if (MemberNI && ScalarNI &&
               ScalarTy->getPointerAddressSpace() !=
                   MemberTy->getPointerAddressSpace()) {

  bool PredicatedAccessRequiresMasking =
      blockNeedsPredicationForAnyReason(I->getParent()) &&
  bool LoadAccessWithGapsRequiresEpilogMasking =
      isa<LoadInst>(I) && Group->requiresScalarEpilogue() &&
      !isScalarEpilogueAllowed();
  bool StoreAccessWithGapsRequiresMasking =
      isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor());
  if (!PredicatedAccessRequiresMasking &&
      !LoadAccessWithGapsRequiresEpilogMasking &&
      !StoreAccessWithGapsRequiresMasking)

         "Masked interleave-groups for predicated accesses are not enabled.");

  if (Group->isReverse())

  assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");

  if (isScalarWithPredication(I, VF))

  auto &DL = I->getModule()->getDataLayout();
void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
         "This function should not be visited twice for the same VF");

  auto isOutOfScope = [&](Value *V) -> bool {
    return (!I || !TheLoop->contains(I));