#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

const char LLVMLoopVectorizeFollowupVectorized[] =
    "llvm.loop.vectorize.followup_vectorized";
const char LLVMLoopVectorizeFollowupEpilogue[] =
    "llvm.loop.vectorize.followup_epilogue";

STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
static cl::opt<bool> EnableEpilogueVectorization(
    "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
    cl::desc("Enable vectorization of epilogue loops."));

static cl::opt<unsigned> EpilogueVectorizationForceVF(
    "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
    cl::desc("When epilogue vectorization is enabled, and a value greater than "
             "1 is specified, forces the given VF for all applicable epilogue "
             "loops."));

static cl::opt<unsigned> EpilogueVectorizationMinVF(
    "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
    cl::desc("Only loops with vectorization factor equal to or larger than "
             "the specified value are considered for epilogue vectorization."));

static cl::opt<unsigned> TinyTripCountVectorThreshold(
    "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
    cl::desc("Loops with a constant trip count that is smaller than this "
             "value are vectorized only if no scalar iteration overheads "
             "are incurred."));

static cl::opt<unsigned> VectorizeMemoryCheckThreshold(
    "vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
    cl::desc("The maximum allowed number of runtime memory checks"));
217 "prefer-predicate-over-epilogue",
220 cl::desc(
"Tail-folding and predication preferences over creating a scalar "
224 "Don't tail-predicate loops, create scalar epilogue"),
226 "predicate-else-scalar-epilogue",
227 "prefer tail-folding, create scalar epilogue if tail "
230 "predicate-dont-vectorize",
231 "prefers tail-folding, don't attempt vectorization if "
232 "tail-folding fails.")));
static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(true), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));

static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
    "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
             "below this number"));
static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));
281 "Pretend that scalable vectors are supported, even if the target does "
282 "not support them. This flag should only be used for testing."));
287 "The cost of a loop that is considered 'small' by the interleaver."));
291 cl::desc(
"Enable the use of the block frequency analysis to access PGO "
292 "heuristics minimizing code growth in cold regions and being more "
293 "aggressive in hot regions."));
299 "Enable runtime interleaving until load/store ports are saturated"));
static cl::opt<bool> InterleaveSmallLoopScalarReduction(
    "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
    cl::desc("Enable interleaving for loops with small iteration counts that "
             "contain scalar reductions to expose ILP."));

static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if predication of stores during vectorization."));

static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));
static cl::opt<bool> PreferInLoopReductions(
    "prefer-inloop-reductions", cl::init(false), cl::Hidden,
    cl::desc("Prefer in-loop vector reductions, "
             "overriding the target's preference."));

static cl::opt<bool> ForceOrderedReductions(
    "force-ordered-reductions", cl::init(false), cl::Hidden,
    cl::desc("Enable the vectorisation of loops with in-order (strict) "
             "FP reductions"));

static cl::opt<bool> PreferPredicatedReductionSelect(
    "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
    cl::desc(
        "Prefer predicating a reduction operation over an after loop select."));
cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));

static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));

cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

static cl::opt<bool> PrintVPlansInDotFormat(
    "vplan-print-in-dot-format", cl::Hidden,
    cl::desc("Use dot format instead of plain text when dumping VPlans"));

static cl::opt<cl::boolOrDefault> ForceSafeDivisor(
    "force-widen-divrem-via-safe-divisor", cl::Hidden,
    cl::desc(
        "Override cost based safe divisor widening for div/rem instructions"));
static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}
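// Illustration (exact widths depend on the module's DataLayout): an i24 has a
// value size of 24 bits but is typically allocated in 32 bits, so an array of
// i24 is not bitcast-compatible with <N x i24> and the type is "irregular".
// i32 has matching alloc and value sizes and is regular.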
class GeneratedRTChecks;

  this->MinProfitableTripCount = VecWidth;

                            const VPIteration &Instance, bool IfPredicateInstr,

                                VPValue *BlockInMask = nullptr);

      std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});

      std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
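  // The AdditionalBypass pair is only populated when vectorizing an epilogue
  // loop: the main vector loop's iteration-count check becomes an extra edge
  // into the scalar preheader, and the induction resume values must also
  // account for that incoming block. {nullptr, nullptr} means no such bypass
  // exists.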
785 "A high UF for the epilogue loop is likely not beneficial.");
805 GeneratedRTChecks &Checks)
807 EPI.MainLoopVF,
EPI.MainLoopVF,
EPI.MainLoopUF, LVL,
  virtual std::pair<BasicBlock *, Value *>
  createEpilogueVectorizedLoopSkeleton() = 0;

                           GeneratedRTChecks &Check)

                             GeneratedRTChecks &Checks)

  BasicBlock *emitMinimumVectorEpilogueIterCountCheck(BasicBlock *Bypass,
                                                      BasicBlock *Insert);
  if (I->getDebugLoc() != Empty)
    return I;

  for (Use &Op : I->operands()) {
    if (Instruction *OpInst = dyn_cast<Instruction>(Op))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }
  dbgs() << "LV: " << Prefix << DebugMsg;

    CodeRegion = I->getParent();
    if (I->getDebugLoc())
      DL = I->getDebugLoc();
  return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
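// For a fixed VF this folds to a plain constant; for a scalable VF it emits
// "vscale * MinVal", e.g. for <vscale x 4 x i32>:
//   %vscale = call i64 @llvm.vscale.i64()
//   %rt.vf  = mul i64 %vscale, 4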
  assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && "Invalid loop count");

  return B.CreateUIToFP(RuntimeVF, FTy);

             << "loop not vectorized: " << OREMsg);

  LoopDbgLoc.print(OS);
  auto collectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
    SmallPtrSet<VPRecipeBase *, 16> Visited;
    Worklist.push_back(Root);

    // Traverse the backward slice of Root through its use-def chain.
    while (!Worklist.empty()) {
      VPRecipeBase *CurRec = Worklist.back();
      Worklist.pop_back();

      if (!Visited.insert(CurRec).second)
        continue;

      // Prune the search if we find another recipe generating a widen memory
      // instruction: widen memory instructions involved in address computation
      // lead to gather/scatter instructions, which don't need this handling.
      if (isa<VPWidenMemoryInstructionRecipe>(CurRec) ||
          isa<VPInterleaveRecipe>(CurRec) ||
          isa<VPScalarIVStepsRecipe>(CurRec) ||
          isa<VPCanonicalIVPHIRecipe>(CurRec) ||
          isa<VPActiveLaneMaskPHIRecipe>(CurRec))
        continue;

      // This recipe contributes to the address computation of a widen
      // load/store. Collect the recipe if its underlying instruction has
      // poison-generating flags.
      Instruction *Instr = CurRec->getUnderlyingInstr();
      if (Instr && Instr->hasPoisonGeneratingFlags())
        State.MayGeneratePoisonRecipes.insert(CurRec);

      // Add new definitions to the worklist.
      for (VPValue *operand : CurRec->operands())
        if (VPRecipeBase *OpDef = operand->getDefiningRecipe())
          Worklist.push_back(OpDef);
    }
  });
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
    for (VPRecipeBase &Recipe : *VPBB) {
      if (auto *WidenRec = dyn_cast<VPWidenMemoryInstructionRecipe>(&Recipe)) {
        Instruction &UnderlyingInstr = WidenRec->getIngredient();
        VPRecipeBase *AddrDef = WidenRec->getAddr()->getDefiningRecipe();
        if (AddrDef && WidenRec->isConsecutive() &&
            Legal->blockNeedsPredication(UnderlyingInstr.getParent()))
          collectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
      } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
        VPRecipeBase *AddrDef = InterleaveRec->getAddr()->getDefiningRecipe();
        if (AddrDef) {
          // Check if any member of the interleave group needs predication.
          const InterleaveGroup<Instruction> *InterGroup =
              InterleaveRec->getInterleaveGroup();
          bool NeedPredication = false;
          for (int I = 0, NumMembers = InterGroup->getNumMembers();
               I < NumMembers; ++I) {
            Instruction *Member = InterGroup->getMember(I);
            if (Member)
              NeedPredication |=
                  Legal->blockNeedsPredication(Member->getParent());
          }

          if (NeedPredication)
            collectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
        }
      }
    }
  }
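// Why this matters: with tail folding or block predication, a consecutive
// masked load/store still computes its address for all lanes, including the
// masked-off ones. If that address computation carries poison-generating
// flags (a GEP marked "inbounds", or "nuw"/"nsw" arithmetic), an inactive
// lane may evaluate to poison, so the flags on the collected recipes are
// dropped during codegen to keep the whole vector address well-defined.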
1117 "Expected to find a resume value for the reduction.");
1150 return std::make_tuple(
LHS.isScalable(),
LHS.getKnownMinValue()) <
1151 std::make_tuple(
RHS.isScalable(),
RHS.getKnownMinValue());
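// Because the tuples compare isScalable() first and false < true, every fixed
// VF orders before every scalable VF, with ties broken by the known minimum
// lane count: 4 < 16 < vscale x 1 < vscale x 4.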
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {}

  bool runtimeChecksRequired();

  selectEpilogueVectorizationFactor(const ElementCount MaxVF,

    collectUniformsAndScalars(UserVF);
    collectInstsToScalarize(UserVF);
    return expectedCost(UserVF).first.isValid();

  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  void collectValuesToIgnore();

  void collectElementTypesForWidening();

  void collectInLoopReductions();

    return !Hints->allowReordering() && RdxDesc.isOrdered();

    assert(VF.isVector() &&
           "Profitable to scalarize relevant only for VF > 1.");
    auto Scalars = InstsToScalarize.find(VF);
    assert(Scalars != InstsToScalarize.end() &&
           "VF not yet analyzed for scalarization profitability");
    return Scalars->second.find(I) != Scalars->second.end();
    auto UniformsPerVF = Uniforms.find(VF);
    assert(UniformsPerVF != Uniforms.end() &&
           "VF not yet analyzed for uniformity");
    return UniformsPerVF->second.count(I);

    auto ScalarsPerVF = Scalars.find(VF);
    assert(ScalarsPerVF != Scalars.end() &&
           "Scalar values are not calculated for VF");
    return ScalarsPerVF->second.count(I);

    return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
           !isProfitableToScalarize(I, VF) &&
           !isScalarAfterVectorization(I, VF);
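  // Candidates come from demanded-bits analysis (recorded in MinBWs): e.g.
  // i32 arithmetic whose result is only ever used truncated to i8 can be
  // evaluated in <VF x i8> rather than <VF x i32>, cutting register pressure,
  // unless the instruction is going to be scalarized anyway.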
    WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);

    for (unsigned i = 0; i < Grp->getFactor(); ++i) {
      if (auto *I = Grp->getMember(i)) {
        if (Grp->getInsertPos() == I)
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
        else
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
      }
    }
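  // The whole interleave group is emitted as a single wide access, so the
  // group's cost is recorded only on the insert-position member; the other
  // members get a cost of 0 so that summing per-instruction costs does not
  // double count the group.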
      return CM_GatherScatter;

    std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
    auto Itr = WideningDecisions.find(InstOnVF);
    if (Itr == WideningDecisions.end())
      return CM_Unknown;
    return Itr->second.first;

    std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
    assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
           "The cost is not calculated");
    return WideningDecisions[InstOnVF].second;

    auto *Trunc = dyn_cast<TruncInst>(I);

    Value *Op = Trunc->getOperand(0);

    return Legal->isInductionPhi(Op);

    if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end())
      return;
    setCostBasedWideningDecision(VF);
    collectLoopUniforms(VF);
    collectLoopScalars(VF);

    return Legal->isConsecutivePtr(DataType, Ptr) &&
           TTI.isLegalMaskedStore(DataType, Alignment);

    return Legal->isConsecutivePtr(DataType, Ptr) &&
           TTI.isLegalMaskedLoad(DataType, Alignment);

    bool LI = isa<LoadInst>(V);
    bool SI = isa<StoreInst>(V);

      Ty = VectorType::get(Ty, VF);

          const RecurrenceDescriptor &RdxDesc = Reduction.second;
          return TTI.isLegalToVectorizeReduction(RdxDesc, VF);

    return ScalarCost < SafeDivisorCost;
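  // A udiv/sdiv/urem/srem in a predicated block cannot simply be widened: a
  // masked-off lane could divide by zero. The cost model therefore compares
  // (a) scalarizing with per-lane guards against (b) widening with a "safe
  // divisor", i.e. selecting the divisor to 1 for inactive lanes, doing the
  // vector divide, and discarding the dummy results. The
  // -force-widen-divrem-via-safe-divisor flag overrides this comparison.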
  std::pair<InstructionCost, InstructionCost>
  getDivRemSpeculationCost(Instruction *I, ElementCount VF);

    return InterleaveInfo.isInterleaved(Instr);

    return InterleaveInfo.getInterleaveGroup(Instr);

    if (!isScalarEpilogueAllowed())
      return false;
    // If we might exit from anywhere but the latch, must run the exiting
    // iteration in scalar form.
    if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
      return true;
    return VF.isVector() && InterleaveInfo.requiresScalarEpilogue();

    if (!CanFoldTailByMasking)
      return TailFoldingStyle::None;

    return getTailFoldingStyle() != TailFoldingStyle::None;

    return getTailFoldingStyle() == TailFoldingStyle::DataAndControlFlow;

    return foldTailByMasking() || Legal->blockNeedsPredication(BB);
    return InLoopReductionChains;

    return InLoopReductionChains.count(Phi);

                         bool &NeedToScalarize) const;

    WideningDecisions.clear();

  std::optional<unsigned> getVScaleForTuning() const;

  unsigned NumPredStores = 0;

                           bool FoldTailByMasking);

  ElementCount getMaximizedVFForTarget(unsigned ConstTripCount,
                                       unsigned SmallestType,
                                       unsigned WidestType,
                                       ElementCount MaxSafeVF,
                                       bool FoldTailByMasking);

  ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);

  using VectorizationCostTy = std::pair<InstructionCost, bool>;

  using InstructionVFPair = std::pair<Instruction *, ElementCount>;

  std::optional<InstructionCost>

  PredicatedBBsAfterVectorization;

  bool CanFoldTailByMasking = false;

  ReductionChainMap InLoopReductionChains;

                               ScalarCostsTy &ScalarCosts,

  using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
                                std::pair<InstWidening, InstructionCost>>;

  DecisionList WideningDecisions;

    if (VF.isScalar() || !I || !TheLoop->contains(I) ||
        TheLoop->isLoopInvariant(I))
      return false;

    return Scalars.find(VF) == Scalars.end() ||
           !isScalarAfterVectorization(I, VF);

        Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));

  bool isCandidateForEpilogueVectorization(const Loop &L,
                                           const ElementCount VF) const;

  bool isEpilogueVectorizationProfitable(const ElementCount VF) const;

class GeneratedRTChecks {
  Value *SCEVCheckCond = nullptr;

  Value *MemRuntimeCheckCond = nullptr;

  bool CostTooHigh = false;
  GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
                    TargetTransformInfo *TTI, const DataLayout &DL)
      : DT(DT), LI(LI), TTI(TTI), SCEVExp(SE, DL, "scev.check"),
        MemCheckExp(SE, DL, "scev.check") {}
      SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
                                  nullptr, "vector.scevcheck");

    if (RtPtrChecking.Need) {
      auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
      MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
                                 "vector.memcheck");

      auto DiffChecks = RtPtrChecking.getDiffChecks();
      if (DiffChecks) {
        Value *RuntimeVF = nullptr;
        MemRuntimeCheckCond = addDiffRuntimeChecks(
            MemCheckBlock->getTerminator(), *DiffChecks, MemCheckExp,
            [VF, &RuntimeVF](IRBuilderBase &B, unsigned Bits) {
              if (!RuntimeVF)
                RuntimeVF = getRuntimeVF(B, B.getIntNTy(Bits), VF);
              return RuntimeVF;
            },
            IC);
      } else {
        MemRuntimeCheckCond = addRuntimeChecks(MemCheckBlock->getTerminator(),
                                               L, RtPtrChecking.getChecks(),
                                               MemCheckExp);
      }
      assert(MemRuntimeCheckCond &&
             "no RT checks generated although RtPtrChecking "
             "claimed checks are required");
    if (!MemCheckBlock && !SCEVCheckBlock)
      return;

    if (SCEVCheckBlock) {

    if (MemCheckBlock) {

    if (MemCheckBlock) {

    if (SCEVCheckBlock) {

    if (SCEVCheckBlock || MemCheckBlock)

        if (SCEVCheckBlock->getTerminator() == &I)
          continue;

        if (MemCheckBlock->getTerminator() == &I)
          continue;

    if (SCEVCheckBlock || MemCheckBlock)
      LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost
                        << "\n");
  ~GeneratedRTChecks() {
    SCEVExpanderCleaner SCEVCleaner(SCEVExp);
    SCEVExpanderCleaner MemCheckCleaner(MemCheckExp);
    if (!SCEVCheckCond)
      SCEVCleaner.markResultUsed();

    if (!MemRuntimeCheckCond)
      MemCheckCleaner.markResultUsed();

    if (MemRuntimeCheckCond) {
      auto &SE = *MemCheckExp.getSE();
      for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) {
        if (MemCheckExp.isInsertedInstruction(&I))
          continue;
        SE.forgetValue(&I);
        I.eraseFromParent();
      }
    }

    MemCheckCleaner.cleanup();
    SCEVCleaner.cleanup();

    if (SCEVCheckCond)
      SCEVCheckBlock->eraseFromParent();
    if (MemRuntimeCheckCond)
      MemCheckBlock->eraseFromParent();
  }
    SCEVCheckCond = nullptr;
    if (auto *C = dyn_cast<ConstantInt>(Cond))
      if (C->isZero())
        return nullptr;

    if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
      PL->addBasicBlockToLoop(SCEVCheckBlock, *LI);

    SCEVCheckBlock->getTerminator()->eraseFromParent();
    SCEVCheckBlock->moveBefore(LoopVectorPreHeader);
    Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
                                                SCEVCheckBlock);

    return SCEVCheckBlock;

    if (!MemRuntimeCheckCond)
      return nullptr;

    MemCheckBlock->moveBefore(LoopVectorPreHeader);

    if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
      PL->addBasicBlockToLoop(MemCheckBlock, *LI);

    ReplaceInstWithInst(
        MemCheckBlock->getTerminator(),
        BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond));
    MemCheckBlock->getTerminator()->setDebugLoc(
        Pred->getTerminator()->getDebugLoc());

    MemRuntimeCheckCond = nullptr;
    return MemCheckBlock;
    LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");

    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
                         "outer loops.\n");

  if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {

  for (Loop *InnerL : L)

  explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
                         bool VectorizeOnlyWhenForced = false)
      : FunctionPass(ID),
        Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) {
    initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
  }
  bool runOnFunction(Function &F) override {
    if (skipFunction(F))
      return false;

    auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
    auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
    auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
    auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
    auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
    auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
    auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
    auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
    auto &LAIs = getAnalysis<LoopAccessLegacyAnalysis>().getLAIs();
    auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
    auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
    auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();

    return Impl
        .runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AC, LAIs, *ORE, PSI)
        .MadeAnyChange;
  }
  auto *ValVTy = cast<VectorType>(Val->getType());

         "Induction Step must be an integer or FP");

  Type *InitVecValSTy =

  assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
         "Binary Opcode should be specified for FP induction");

  if (ScalarIVTy != Step->getType()) {
    assert(Step->getType()->isIntegerTy() &&
           "Truncation requires an integer step");

  if (ScalarIVTy->isIntegerTy()) {
    AddOp = Instruction::Add;
    MulOp = Instruction::Mul;
  } else {
    AddOp = ID.getInductionOpcode();
    MulOp = Instruction::FMul;
  }
  Type *VecIVTy = nullptr;
  Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr;

  unsigned StartPart = 0;
  unsigned EndPart = State.UF;
  unsigned StartLane = 0;
  unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue();
  if (State.Instance) {
    StartPart = State.Instance->Part;
    EndPart = StartPart + 1;
    StartLane = State.Instance->Lane.getKnownLane();
    EndLane = StartLane + 1;
  }

  for (unsigned Part = StartPart; Part < EndPart; ++Part) {

      State.set(Def, Add, Part);

    for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) {

             "Expected StartIdx to be folded to a constant when VF is not "
             "scalable");

         "Induction step should be loop invariant");

  if (auto *E = dyn_cast<SCEVUnknown>(Step))
    return E->getValue();

  return Exp.expandCodeFor(Step, Step->getType(), InsertBefore);
  Value *CastedIndex = StepTy->isIntegerTy()
                           ? B.CreateSExtOrTrunc(Index, StepTy)
                           : B.CreateCast(Instruction::SIToFP, Index, StepTy);
  if (CastedIndex != Index) {
    CastedIndex->setName(CastedIndex->getName() + ".cast");
    Index = CastedIndex;
  }

  auto CreateAdd = [&B](Value *X, Value *Y) {
    assert(X->getType() == Y->getType() && "Types don't match!");
    if (auto *CX = dyn_cast<ConstantInt>(X))
      if (CX->isZero())
        return Y;
    if (auto *CY = dyn_cast<ConstantInt>(Y))
      if (CY->isZero())
        return X;
    return B.CreateAdd(X, Y);
  };

  auto CreateMul = [&B](Value *X, Value *Y) {
    assert(X->getType()->getScalarType() == Y->getType() &&
           "Types don't match!");
    if (auto *CX = dyn_cast<ConstantInt>(X))
      if (CX->isOne())
        return Y;
    if (auto *CY = dyn_cast<ConstantInt>(Y))
      if (CY->isOne())
        return X;
    VectorType *XVTy = dyn_cast<VectorType>(X->getType());
    if (XVTy && !isa<VectorType>(Y->getType()))
      Y = B.CreateVectorSplat(XVTy->getElementCount(), Y);
    return B.CreateMul(X, Y);
  };
  switch (ID.getKind()) {
  case InductionDescriptor::IK_IntInduction: {
    assert(!isa<VectorType>(Index->getType()) &&
           "Vector indices not supported for integer inductions yet");
    assert(Index->getType() == StartValue->getType() &&
           "Index type does not match StartValue type");
    if (isa<ConstantInt>(Step) && cast<ConstantInt>(Step)->isMinusOne())
      return B.CreateSub(StartValue, Index);
    auto *Offset = CreateMul(Index, Step);
    return CreateAdd(StartValue, Offset);
  }
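  // I.e. the transformed index of an integer IV is StartValue + Index * Step;
  // a step of -1 is folded straight to a subtraction, and the CreateAdd /
  // CreateMul helpers above already fold the Index == 0 and Step == 1
  // identities, so no dead arithmetic is emitted in the common cases.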
  case InductionDescriptor::IK_PtrInduction: {
    assert(isa<Constant>(Step) &&
           "Expected constant step for pointer induction");
    return B.CreateGEP(ID.getElementType(), StartValue, CreateMul(Index, Step));
  }
  case InductionDescriptor::IK_FpInduction: {
    assert(!isa<VectorType>(Index->getType()) &&
           "Vector indices not supported for FP inductions yet");
    auto InductionBinOp = ID.getInductionBinOp();
    assert(InductionBinOp &&
           (InductionBinOp->getOpcode() == Instruction::FAdd ||
            InductionBinOp->getOpcode() == Instruction::FSub) &&
           "Original bin op should be defined for FP induction");
    Value *MulExp = B.CreateFMul(Step, Index);
    return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
                         "induction");
  }

  Value *ScalarInst = State.get(Def, Instance);
  Value *VectorValue = State.get(Def, Instance.Part);
  VectorValue = Builder.CreateInsertElement(
      VectorValue, ScalarInst,
      Instance.Lane.getAsRuntimeExpr(State.Builder, VF));
  State.set(Def, VectorValue, Instance.Part);

  unsigned InterleaveFactor = Group->getFactor();
  assert((!BlockInMask || !Group->isReverse()) &&
         "Reversed masked interleave-group not supported.");
  for (unsigned Part = 0; Part < UF; Part++) {

    bool InBounds = false;
    if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
      InBounds = gep->isInBounds();

    cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);

  Value *MaskForGaps = nullptr;

    assert(MaskForGaps && "Mask for Gaps is required but it is null");

  if (isa<LoadInst>(Instr)) {
    // For each unroll part, create a wide load for the group.
    for (unsigned Part = 0; Part < UF; Part++) {
      if (BlockInMask || MaskForGaps) {
        assert(useMaskedInterleavedAccesses(*TTI) &&
               "masked interleaved groups are not allowed.");
        Value *GroupMask = MaskForGaps;
        if (BlockInMask) {
          Value *BlockInMaskPart = State.get(BlockInMask, Part);
          Value *ShuffledMask = Builder.CreateShuffleVector(
              BlockInMaskPart,
              createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
              "interleaved.mask");
          GroupMask = MaskForGaps ? Builder.CreateBinOp(Instruction::And,
                                                        ShuffledMask,
                                                        MaskForGaps)
                                  : ShuffledMask;
        }
        NewLoad = Builder.CreateMaskedLoad(VecTy, AddrParts[Part],
                                           Group->getAlign(), GroupMask,
                                           PoisonVec, "wide.masked.vec");
      }
    // For each member in the group, shuffle out the appropriate data from the
    // wide loads.
    for (unsigned I = 0; I < InterleaveFactor; ++I) {
      Instruction *Member = Group->getMember(I);
      if (!Member)
        continue;

      auto StrideMask =
          createStrideMask(I, InterleaveFactor, VF.getKnownMinValue());
      for (unsigned Part = 0; Part < UF; Part++) {
        Value *StridedVec = Builder.CreateShuffleVector(
            NewLoads[Part], StrideMask, "strided.vec");

        // If this member has a different type, cast the result.
        if (Member->getType() != ScalarTy) {
          VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
          StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
        }

        State.set(VPDefs[J], StridedVec, Part);
      }
      ++J;
    }
    return;
  }

  assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) &&
         "masked interleaved groups are not allowed.");
  assert((!MaskForGaps || !VF.isScalable()) &&
         "masking gaps for scalable vectors is not yet supported.");
  for (unsigned Part = 0; Part < UF; Part++) {
    // Collect the stored vector from each member.
    SmallVector<Value *, 4> StoredVecs;
    unsigned StoredIdx = 0;
    for (unsigned i = 0; i < InterleaveFactor; i++) {
      assert((Group->getMember(i) || MaskForGaps) &&
             "Fail to get a member from an interleaved store group");

      Value *StoredVec = State.get(StoredValues[StoredIdx], Part);
      ++StoredIdx;

      // If this member has a different type, cast it to a unified type.
      if (StoredVec->getType() != SubVT)
        StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);

    if (BlockInMask || MaskForGaps) {
      Value *GroupMask = MaskForGaps;
      if (BlockInMask) {
        Value *BlockInMaskPart = State.get(BlockInMask, Part);
        Value *ShuffledMask = Builder.CreateShuffleVector(
            BlockInMaskPart,
            createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
            "interleaved.mask");
        GroupMask = MaskForGaps ? Builder.CreateBinOp(Instruction::And,
                                                      ShuffledMask, MaskForGaps)
                                : ShuffledMask;
      }
                                               bool IfPredicateInstr,

  if (isa<NoAliasScopeDeclInst>(Instr))

    auto InputInstance = Instance;

    Cloned->setOperand(I.index(), State.get(Operand, InputInstance));

  State.set(RepRecipe, Cloned, Instance);

  if (auto *II = dyn_cast<AssumeInst>(Cloned))

  if (IfPredicateInstr)
    PredicatedInstructions.push_back(Cloned);

  Type *IdxTy = Legal->getWidestInductionType();
  assert(IdxTy && "No type for induction");

         "VF*UF must be a power of 2 when folding tail by masking");
  auto *DstFVTy = cast<FixedVectorType>(DstVTy);
  unsigned VF = DstFVTy->getNumElements();
  auto *SrcVecTy = cast<FixedVectorType>(V->getType());
  assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match");
  Type *SrcElemTy = SrcVecTy->getElementType();
  Type *DstElemTy = DstFVTy->getElementType();
  assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
         "Vector elements must have same size");

  assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
         "Only one type should be a pointer type");
  assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
         "Only one type should be a floating point type");
  auto CreateStep = [&]() -> Value * {

  Value *MaxUIntTripCount =

         "TC check is expected to dominate Bypass");

  if (!SCEVCheckBlock)
    return nullptr;

         "Cannot SCEV check stride or overflow when optimizing for size");

  return SCEVCheckBlock;

         "Cannot emit memory checks when optimizing for size, unless forced "
         "to vectorize.");

           << "Code-size may be reduced by not forcing "
              "vectorization, or by source-code modifications "
              "eliminating the need for runtime checks "
              "(e.g., adding 'restrict').";

  return MemCheckBlock;
3110 "multiple exit loop without required epilogue?");
3114 LI,
nullptr,
Twine(Prefix) +
"middle.block");
3117 nullptr,
Twine(Prefix) +
"scalar.ph");
3133 BrInst->
setDebugLoc(ScalarLatchTerm->getDebugLoc());
3149 std::pair<BasicBlock *, Value *> AdditionalBypass) {
3155 Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
3156 if (OrigPhi == OldInduction) {
3173 if (AdditionalBypass.first) {
3174 B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt()));
3179 EndValueFromAdditionalBypass->
setName(
"ind.end");
3199 if (AdditionalBypass.first)
3201 EndValueFromAdditionalBypass);
3206 std::pair<BasicBlock *, Value *> AdditionalBypass) {
3207 assert(((AdditionalBypass.first && AdditionalBypass.second) ||
3208 (!AdditionalBypass.first && !AdditionalBypass.second)) &&
3209 "Inconsistent information about additional bypass.");
3217 for (
const auto &InductionEntry :
Legal->getInductionVars()) {
3218 PHINode *OrigPhi = InductionEntry.first;
3250 CmpN->
setDebugLoc(ScalarLatchTerm->getDebugLoc());
3254#ifdef EXPENSIVE_CHECKS
3261std::pair<BasicBlock *, Value *>
    assert(isa<PHINode>(UI) && "Expected LCSSA form");
    MissingVals[UI] = EndValue;

    auto *UI = cast<Instruction>(U);

    assert(isa<PHINode>(UI) && "Expected LCSSA form");

      Value *CountMinusOne = B.CreateSub(
          Count, ConstantInt::get(Count->getType(), 1));
      CountMinusOne->setName("cmo");

      Escape->setName("ind.escape");
      MissingVals[UI] = Escape;

  for (auto &I : MissingVals) {
    PHINode *PHI = cast<PHINode>(I.first);
    if (PHI->getBasicBlockIndex(MiddleBlock) == -1) {
      PHI->addIncoming(I.second, MiddleBlock);
    }
  }
struct CSEDenseMapInfo {
  static bool canHandle(const Instruction *I) {
    return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
           isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
  }

  static inline Instruction *getEmptyKey() {
    return DenseMapInfo<Instruction *>::getEmptyKey();
  }

  static inline Instruction *getTombstoneKey() {
    return DenseMapInfo<Instruction *>::getTombstoneKey();
  }

  static unsigned getHashValue(const Instruction *I) {
    assert(canHandle(I) && "Unknown instruction!");
    return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
                                                           I->value_op_end()));
  }

  static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
    if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
        LHS == getTombstoneKey() || RHS == getTombstoneKey())
      return LHS == RHS;
    return LHS->isIdenticalTo(RHS);
  }
};
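// Vectorization emits near-identical shuffles and GEPs for each unrolled
// part, so the cse() helper below hashes opcode + operand values and folds
// exact duplicates inside the new loop body; this is far cheaper than running
// a general CSE pass over the freshly generated code.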
      if (!CSEDenseMapInfo::canHandle(&In))
        continue;

      In.replaceAllUsesWith(V);
      In.eraseFromParent();

                                                   bool &NeedToScalarize) const {

  for (auto &ArgOp : CI->args())
    ScalarTys.push_back(ArgOp->getType());

    return ScalarCallCost;

  for (Type *ScalarTy : ScalarTys)
    Tys.push_back(ToVectorTy(ScalarTy, VF));

  NeedToScalarize = true;

  if (VectorCallCost < Cost) {
    NeedToScalarize = false;
    Cost = VectorCallCost;
  }

  assert(ID && "Expected intrinsic call!");

  FastMathFlags FMF;
  if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
    FMF = FPMO->getFastMathFlags();

                  std::back_inserter(ParamTys),
                  [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); });

                                    dyn_cast<IntrinsicInst>(CI));

  auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
  auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
  return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;

  auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
  auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
  return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
  for (unsigned Part = 0; Part < UF; ++Part) {
    if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
      continue;

    Type *OriginalTy = I->getType();
    Type *ScalarTruncatedTy =
        IntegerType::get(OriginalTy->getContext(), KV.second);
    auto *TruncatedTy = VectorType::get(
        ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getElementCount());
    if (TruncatedTy == OriginalTy)
      continue;

    IRBuilder<> B(cast<Instruction>(I));
    auto ShrinkOperand = [&](Value *V) -> Value * {
      if (auto *ZI = dyn_cast<ZExtInst>(V))
        if (ZI->getSrcTy() == TruncatedTy)
          return ZI->getOperand(0);
      return B.CreateZExtOrTrunc(V, TruncatedTy);
    };
    Value *NewI = nullptr;
    if (auto *BO = dyn_cast<BinaryOperator>(I)) {
      NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
                           ShrinkOperand(BO->getOperand(1)));

      // Any wrapping introduced by shrinking this operation shouldn't be
      // considered undefined behavior, so we cannot unconditionally copy
      // arithmetic wrapping flags to NewI.
      cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
    } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
      NewI =
          B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
                       ShrinkOperand(CI->getOperand(1)));
    } else if (auto *SI = dyn_cast<SelectInst>(I)) {
      NewI = B.CreateSelect(SI->getCondition(),
                            ShrinkOperand(SI->getTrueValue()),
                            ShrinkOperand(SI->getFalseValue()));
    } else if (auto *CI = dyn_cast<CastInst>(I)) {
      switch (CI->getOpcode()) {
      default:
        llvm_unreachable("Unhandled cast!");
      case Instruction::Trunc:
        NewI = ShrinkOperand(CI->getOperand(0));
        break;
      case Instruction::SExt:
        NewI = B.CreateSExtOrTrunc(
            CI->getOperand(0),
            smallestIntegerVectorType(OriginalTy, TruncatedTy));
        break;
      case Instruction::ZExt:
        NewI = B.CreateZExtOrTrunc(
            CI->getOperand(0),
            smallestIntegerVectorType(OriginalTy, TruncatedTy));
        break;
      }
else if (
auto *
SI = dyn_cast<ShuffleVectorInst>(
I)) {
3606 cast<VectorType>(
SI->getOperand(0)->getType())->getElementCount();
3607 auto *O0 =
B.CreateZExtOrTrunc(
3610 cast<VectorType>(
SI->getOperand(1)->getType())->getElementCount();
3611 auto *O1 =
B.CreateZExtOrTrunc(
3614 NewI =
B.CreateShuffleVector(O0, O1,
SI->getShuffleMask());
3615 }
else if (isa<LoadInst>(
I) || isa<PHINode>(
I)) {
3618 }
else if (
auto *IE = dyn_cast<InsertElementInst>(
I)) {
3620 cast<VectorType>(IE->getOperand(0)->getType())->getElementCount();
3621 auto *O0 =
B.CreateZExtOrTrunc(
3623 auto *O1 =
B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3624 NewI =
B.CreateInsertElement(O0, O1, IE->getOperand(2));
3625 }
else if (
auto *EE = dyn_cast<ExtractElementInst>(
I)) {
3627 cast<VectorType>(EE->getOperand(0)->getType())->getElementCount();
3628 auto *O0 =
B.CreateZExtOrTrunc(
3630 NewI =
B.CreateExtractElement(O0, EE->getOperand(2));
3638 Value *Res =
B.CreateZExtOrTrunc(NewI, OriginalTy);
3639 I->replaceAllUsesWith(Res);
3640 cast<Instruction>(
I)->eraseFromParent();
3642 State.
reset(Def, Res, Part);
  for (unsigned Part = 0; Part < UF; ++Part) {

      State.reset(Def, NewI, Part);

  for (const auto &Entry : Legal->getInductionVars())

    KV.second->fixPhi(Plan, State);

    if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R))
      fixReduction(ReductionPhi, State);
    else if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
      fixFixedOrderRecurrence(FOR, State);
  Value *Incoming = State.get(PreviousDef, UF - 1);
  auto *ExtractForScalar = Incoming;

    ExtractForScalar = Builder.CreateExtractElement(ExtractForScalar, LastIdx,
                                                    "vector.recur.extract");

  Value *ExtractForPhiUsedOutsideLoop = nullptr;

    ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
        Incoming, Idx, "vector.recur.extract.for.phi");

    ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2);

  for (auto *BB : predecessors(LoopScalarPreHeader)) {
    auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
    Start->addIncoming(Incoming, BB);
  }
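  // What this fixes up, by example: for a recurrence such as
  //   for (i) { t = a[i]; b[i] = t - prev; prev = t; }
  // the scalar remainder loop must resume with the last element of the final
  // vector of 'prev' values ("vector.recur.extract"), while any phi using the
  // recurrence outside the loop needs the second-to-last element
  // ("vector.recur.extract.for.phi").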
3872 "Unable to find the reduction variable");
3905 for (
unsigned Part = 0; Part <
UF; ++Part) {
3906 Value *VecLoopExitInst = State.
get(LoopExitInstDef, Part);
3908 for (
User *U : VecLoopExitInst->
users()) {
3909 if (isa<SelectInst>(U)) {
3910 assert(!Sel &&
"Reduction exit feeding two selects");
3911 Sel = cast<SelectInst>(U);
3913 assert(isa<PHINode>(U) &&
"Reduction exit must feed Phi's or select");
3915 assert(Sel &&
"Reduction exit feeds no select");
3916 State.
reset(LoopExitInstDef, Sel, Part);
3918 if (isa<FPMathOperator>(Sel))
3931 cast<PHINode>(State.
get(PhiR, Part));
3932 VecRdxPhi->setIncomingValueForBlock(VectorLoopLatch, Sel);
3941 assert(!PhiR->
isInLoop() &&
"Unexpected truncated inloop reduction!");
3945 for (
unsigned Part = 0; Part <
UF; ++Part) {
3946 RdxParts[Part] = State.
get(LoopExitInstDef, Part);
3952 U->replaceUsesOfWith(RdxParts[Part], Extnd);
3953 RdxParts[Part] = Extnd;
3957 for (
unsigned Part = 0; Part <
UF; ++Part) {
3959 State.
reset(LoopExitInstDef, RdxParts[Part], Part);
  Value *ReducedPartRdx = State.get(LoopExitInstDef, 0);

  if (PhiR->isOrdered())
    ReducedPartRdx = State.get(LoopExitInstDef, UF - 1);

    for (unsigned Part = 1; Part < UF; ++Part) {
      Value *RdxPart = State.get(LoopExitInstDef, Part);
      if (Op != Instruction::ICmp && Op != Instruction::FCmp) {
        ReducedPartRdx = Builder.CreateBinOp((Instruction::BinaryOps)Op,
                                             ReducedPartRdx, RdxPart);

      ReducedPartRdx = RdxDesc.isSigned()
                           ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
                           : Builder.CreateZExt(ReducedPartRdx, PhiTy);

      BCBlockPhi->addIncoming(ReducedPartRdx, Incoming);

      BCBlockPhi->addIncoming(ReductionStartValue, Incoming);

  int IncomingEdgeBlockIdx =
      OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch());
  assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
  int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);

  while (!Worklist.empty()) {
    for (unsigned Part = 0; Part < UF; ++Part) {
      if (!isa<OverflowingBinaryOperator>(V))
        break;
      cast<Instruction>(V)->dropPoisonGeneratingFlags();
    }

      auto *UserRecipe = dyn_cast<VPRecipeBase>(U);

      for (VPValue *V : UserRecipe->definedValues())
        if (Visited.insert(V).second)
          Worklist.push_back(V);
  auto isBlockOfUsePredicated = [&](Use &U) -> bool {
    auto *I = cast<Instruction>(U.getUser());
    BasicBlock *BB = I->getParent();
    if (auto *Phi = dyn_cast<PHINode>(I))
      BB = Phi->getIncomingBlock(
          PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
    return BB == PredBB;
  };

    Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
    InstsToReanalyze.clear();

    while (!Worklist.empty()) {

      if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) ||
          I->mayHaveSideEffects())
        continue;

      if (I->getParent() == PredBB) {
        Worklist.insert(I->op_begin(), I->op_end());
        continue;
      }

      I->moveBefore(&*PredBB->getFirstInsertionPt());
      Worklist.insert(I->op_begin(), I->op_end());
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {

      PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0));

void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
  assert(VF.isVector() && Scalars.find(VF) == Scalars.end() &&
         "This function should not be visited twice for the same VF");

  Scalars[VF].insert(Uniforms[VF].begin(), Uniforms[VF].end());
  auto *Latch = TheLoop->getLoopLatch();

  auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) -> bool {
    InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
    assert(WideningDecision != CM_Unknown &&
           "Widening decision should be ready at this moment");
    if (auto *Store = dyn_cast<StoreInst>(MemAccess))
      if (Ptr == Store->getValueOperand())
        return WideningDecision == CM_Scalarize;
    assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
           "Ptr is neither a value or pointer operand");
    return WideningDecision != CM_GatherScatter;
  };

  auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
    return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
            isa<GetElementPtrInst>(V)) &&
           !TheLoop->isLoopInvariant(V);
  };

    if (!isLoopVaryingBitCastOrGEP(Ptr))
      return;

    auto *I = cast<Instruction>(Ptr);

        return isa<LoadInst>(U) || isa<StoreInst>(U);

      PossibleNonScalarPtrs.insert(I);
  for (auto *BB : TheLoop->blocks())
    for (auto &I : *BB) {
      if (auto *Load = dyn_cast<LoadInst>(&I)) {
        evaluatePtrUse(Load, Load->getPointerOperand());
      } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
        evaluatePtrUse(Store, Store->getPointerOperand());
        evaluatePtrUse(Store, Store->getValueOperand());
      }
    }

  for (auto *I : ScalarPtrs)
    if (!PossibleNonScalarPtrs.count(I)) {
      LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
      Worklist.insert(I);
    }

  auto ForcedScalar = ForcedScalars.find(VF);
  if (ForcedScalar != ForcedScalars.end())
    for (auto *I : ForcedScalar->second) {
      LLVM_DEBUG(dbgs() << "LV: Found (forced) scalar instruction: " << *I
                        << "\n");
      Worklist.insert(I);
    }

  unsigned Idx = 0;
  while (Idx != Worklist.size()) {
    Instruction *Dst = Worklist[Idx++];
    if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
      continue;
    auto *Src = cast<Instruction>(Dst->getOperand(0));
    if (llvm::all_of(Src->users(), [&](User *U) -> bool {
          auto *J = cast<Instruction>(U);
          return !TheLoop->contains(J) || Worklist.count(J) ||
                 ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
                  isScalarUse(J, Src));
        })) {
      Worklist.insert(Src);
      LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
    }
  }
  for (const auto &Induction : Legal->getInductionVars()) {
    auto *Ind = Induction.first;
    auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));

    // If tail-folding is applied, the primary induction variable will be used
    // to feed a vector compare.
    if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
      continue;

    // Returns true if \p Indvar is a pointer induction that is used directly
    // by load/store instruction \p I.
    auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar,
                                              Instruction *I) {
      return Induction.second.getKind() ==
                 InductionDescriptor::IK_PtrInduction &&
             (isa<LoadInst>(I) || isa<StoreInst>(I)) &&
             Indvar == getLoadStorePointerOperand(I) && isScalarUse(I, Indvar);
    };

    auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
      auto *I = cast<Instruction>(U);
      return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
             IsDirectLoadStoreFromPtrIndvar(Ind, I);
    });
    if (!ScalarInd)
      continue;

    auto ScalarIndUpdate =
        llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
          auto *I = cast<Instruction>(U);
          return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
                 IsDirectLoadStoreFromPtrIndvar(IndUpdate, I);
        });
    if (!ScalarIndUpdate)
      continue;

    Worklist.insert(Ind);
    Worklist.insert(IndUpdate);
    LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
    LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
                      << "\n");
  }
  if (!isPredicatedInst(I))
    return false;

  // Do we have a non-scalar lowering for this predicated instruction?
  switch (I->getOpcode()) {
  default:
    return true;
  case Instruction::Load:
  case Instruction::Store: {
    auto *Ptr = getLoadStorePointerOperand(I);
    auto *Ty = getLoadStoreType(I);
    Type *VTy = Ty;
    if (VF.isVector())
      VTy = VectorType::get(Ty, VF);
    const Align Alignment = getLoadStoreAlignment(I);
    return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
                                TTI.isLegalMaskedGather(VTy, Alignment))
                            : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
                                TTI.isLegalMaskedScatter(VTy, Alignment));
  }
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::SRem:
  case Instruction::URem: {
    const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
    return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost);
  }
  }

  if (!blockNeedsPredicationForAnyReason(I->getParent()))
    return false;

  // Can we prove this instruction is safe to unconditionally execute?
  switch (I->getOpcode()) {
  default:
    return false;
  case Instruction::Load:
  case Instruction::Store: {
    if (!Legal->isMaskRequired(I))
      return false;
    // When the address is loop invariant and the original access ran
    // unconditionally, predication is not required: tail folding still
    // guarantees at least one active lane. Legal->blockNeedsPredication is
    // used here because it does not account for tail folding.
    if (Legal->isUniformMemOp(*I) &&
        (isa<LoadInst>(I) ||
         (isa<StoreInst>(I) &&
          TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand()))) &&
        !Legal->blockNeedsPredication(I->getParent()))
      return false;
    return true;
  }
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::SRem:
  case Instruction::URem:
    return !isSafeToSpeculativelyExecute(I);
  }

std::pair<InstructionCost, InstructionCost>
LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I,
                                                     ElementCount VF) {
  assert(I->getOpcode() == Instruction::UDiv ||
         I->getOpcode() == Instruction::SDiv ||
         I->getOpcode() == Instruction::SRem ||
         I->getOpcode() == Instruction::URem);

    ScalarizationCost = 0;

    ScalarizationCost += getScalarizationOverhead(I, VF, CostKind);