28#define DEBUG_TYPE "loop-vectorize"
32 cl::desc(
"Maximize bandwidth when selecting vectorization factor which "
33 "will be determined by the smallest type in loop."));
36 "vectorizer-maximize-bandwidth-for-vector-calls",
cl::init(
true),
38 cl::desc(
"Try wider VFs if they enable the use of vector variants"));
42 cl::desc(
"Discard VFs if their register pressure is too high."));
47 "Pretend that scalable vectors are supported, even if the target does "
48 "not support them. This flag should only be used for testing."));
52 cl::desc(
"Prefer in-loop vector reductions, "
53 "overriding the targets preference."));
59 cl::desc(
"Assume the target supports masked memory operations (used for "
65 return Legal->isConsecutivePtr(DataType, Ptr) &&
67 TTI.isLegalMaskedStore(DataType, Alignment,
AddressSpace));
73 return Legal->isConsecutivePtr(DataType, Ptr) &&
75 TTI.isLegalMaskedLoad(DataType, Alignment,
AddressSpace));
88 return (LI && TTI.isLegalMaskedGather(Ty,
Align)) ||
89 (
SI && TTI.isLegalMaskedScatter(Ty,
Align));
96bool VFSelectionContext::useMaxBandwidth(
bool IsScalable)
const {
101 (
TTI.shouldMaximizeVectorBandwidth(RegKind) ||
103 Legal->hasVectorCallVariants())));
112 if (TTI.shouldConsiderVectorizationRegPressure())
119 VF, VF.
isScalable() ? MaxPermissibleVFWithoutMaxBW.ScalableVF
120 : MaxPermissibleVFWithoutMaxBW.FixedVF);
124 ElementCount VF,
unsigned MaxTripCount,
unsigned UserIC,
125 bool FoldTailByMasking,
bool RequiresScalarEpilogue)
const {
127 if (VF.
isScalable() &&
F.hasFnAttribute(Attribute::VScaleRange)) {
128 auto Attr =
F.getFnAttribute(Attribute::VScaleRange);
129 auto Min = Attr.getVScaleRangeMin();
136 if (MaxTripCount > 0 && RequiresScalarEpilogue)
141 unsigned IC = UserIC > 0 ? UserIC : 1;
142 unsigned EstimatedVFTimesIC = EstimatedVF * IC;
144 if (MaxTripCount && MaxTripCount <= EstimatedVFTimesIC &&
152 if (ClampedUpperTripCount == 0)
153 ClampedUpperTripCount = 1;
154 LLVM_DEBUG(
dbgs() <<
"LV: Clamping the MaxVF to maximum power of two not "
155 "exceeding the constant trip count"
156 << (UserIC > 0 ?
" divided by UserIC" :
"") <<
": "
157 << ClampedUpperTripCount <<
"\n");
164ElementCount VFSelectionContext::getMaximizedVFForTarget(
165 unsigned MaxTripCount,
unsigned SmallestType,
unsigned WidestType,
166 ElementCount MaxSafeVF,
unsigned UserIC,
bool FoldTailByMasking,
167 bool RequiresScalarEpilogue) {
168 bool ComputeScalableMaxVF = MaxSafeVF.
isScalable();
169 const TypeSize WidestRegister = TTI.getRegisterBitWidth(
174 auto MinVF = [](
const ElementCount &
LHS,
const ElementCount &
RHS) {
176 "Scalable flags must match");
184 ComputeScalableMaxVF);
185 MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
187 << (MaxVectorElementCount * WidestType) <<
" bits.\n");
189 if (!MaxVectorElementCount) {
191 << (ComputeScalableMaxVF ?
"scalable" :
"fixed")
192 <<
" vector registers.\n");
197 clampVFByMaxTripCount(MaxVectorElementCount, MaxTripCount, UserIC,
198 FoldTailByMasking, RequiresScalarEpilogue);
201 if (MaxVF != MaxVectorElementCount)
205 MaxPermissibleVFWithoutMaxBW.ScalableVF = MaxVF;
207 MaxPermissibleVFWithoutMaxBW.FixedVF = MaxVF;
209 if (useMaxBandwidth(ComputeScalableMaxVF)) {
212 ComputeScalableMaxVF);
213 MaxVF = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);
215 if (ElementCount MinVF =
216 TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) {
219 <<
") with target's minimum: " << MinVF <<
'\n');
224 MaxVF = clampVFByMaxTripCount(MaxVF, MaxTripCount, UserIC,
225 FoldTailByMasking, RequiresScalarEpilogue);
232 if (std::optional<unsigned> MaxVScale =
TTI.getMaxVScale())
235 if (
F.hasFnAttribute(Attribute::VScaleRange))
236 return F.getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax();
241bool VFSelectionContext::isScalableVectorizationAllowed() {
242 if (IsScalableVectorizationAllowed)
243 return *IsScalableVectorizationAllowed;
245 IsScalableVectorizationAllowed =
false;
251 "ScalableVectorizationDisabled", ORE, TheLoop);
255 LLVM_DEBUG(
dbgs() <<
"LV: Scalable vectorization is available\n");
258 std::numeric_limits<ElementCount::ScalarTy>::max());
267 if (!
all_of(Legal->getReductionVars(), [&](
const auto &
Reduction) ->
bool {
268 return TTI.isLegalToVectorizeReduction(Reduction.second, MaxScalableVF);
271 "Scalable vectorization not supported for the reduction "
272 "operations found in this loop.",
273 "ScalableVFUnfeasible", ORE, TheLoop);
279 if (
any_of(ElementTypesInLoop, [&](
Type *Ty) {
280 return !Ty->
isVoidTy() && !TTI.isElementTypeLegalForScalableVector(Ty);
283 "for all element types found in this loop.",
284 "ScalableVFUnfeasible", ORE, TheLoop);
288 if (!Legal->isSafeForAnyVectorWidth() && !
getMaxVScale(F, TTI)) {
290 "for safe distance analysis.",
291 "ScalableVFUnfeasible", ORE, TheLoop);
295 IsScalableVectorizationAllowed =
true;
300VFSelectionContext::getMaxLegalScalableVF(
unsigned MaxSafeElements) {
301 if (!isScalableVectorizationAllowed())
305 std::numeric_limits<ElementCount::ScalarTy>::max());
306 if (Legal->isSafeForAnyVectorWidth())
307 return MaxScalableVF;
309 std::optional<unsigned> MaxVScale =
getMaxVScale(F, TTI);
315 "Max legal vector width too small, scalable vectorization "
317 "ScalableVFUnfeasible", ORE, TheLoop);
319 return MaxScalableVF;
323 unsigned MaxTripCount,
ElementCount UserVF,
unsigned UserIC,
324 bool FoldTailByMasking,
bool RequiresScalarEpilogue) {
331 unsigned MaxSafeElementsPowerOf2 =
333 if (!Legal->isSafeForAnyStoreLoadForwardDistances()) {
334 unsigned SLDist = Legal->getMaxStoreLoadForwardSafeDistanceInBits();
335 MaxSafeElementsPowerOf2 =
336 std::min(MaxSafeElementsPowerOf2, SLDist / WidestType);
340 auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElementsPowerOf2);
342 if (!Legal->isSafeForAnyVectorWidth())
343 MaxSafeElements = MaxSafeElementsPowerOf2;
345 LLVM_DEBUG(
dbgs() <<
"LV: The max safe fixed VF is: " << MaxSafeFixedVF
347 LLVM_DEBUG(
dbgs() <<
"LV: The max safe scalable VF is: " << MaxSafeScalableVF
353 UserVF.
isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;
370 <<
" is unsafe, clamping to max safe VF="
371 << MaxSafeFixedVF <<
".\n");
374 TheLoop->getStartLoc(),
375 TheLoop->getHeader())
376 <<
"User-specified vectorization factor "
377 <<
ore::NV(
"UserVectorizationFactor", UserVF)
378 <<
" is unsafe, clamping to maximum safe vectorization factor "
379 <<
ore::NV(
"VectorizationFactor", MaxSafeFixedVF);
381 return MaxSafeFixedVF;
386 <<
" is ignored because scalable vectors are not "
390 TheLoop->getStartLoc(),
391 TheLoop->getHeader())
392 <<
"User-specified vectorization factor "
393 <<
ore::NV(
"UserVectorizationFactor", UserVF)
394 <<
" is ignored because the target does not support scalable "
395 "vectors. The compiler will pick a more suitable value.";
399 <<
" is unsafe. Ignoring scalable UserVF.\n");
402 TheLoop->getStartLoc(),
403 TheLoop->getHeader())
404 <<
"User-specified vectorization factor "
405 <<
ore::NV(
"UserVectorizationFactor", UserVF)
406 <<
" is unsafe. Ignoring the hint to let the compiler pick a "
407 "more suitable value.";
412 LLVM_DEBUG(
dbgs() <<
"LV: The Smallest and Widest types: " << SmallestType
413 <<
" / " << WidestType <<
" bits.\n");
417 if (
auto MaxVF = getMaximizedVFForTarget(
418 MaxTripCount, SmallestType, WidestType, MaxSafeFixedVF, UserIC,
419 FoldTailByMasking, RequiresScalarEpilogue))
420 Result.FixedVF = MaxVF;
422 if (
auto MaxVF = getMaximizedVFForTarget(
423 MaxTripCount, SmallestType, WidestType, MaxSafeScalableVF, UserIC,
424 FoldTailByMasking, RequiresScalarEpilogue))
426 Result.ScalableVF = MaxVF;
434std::pair<unsigned, unsigned>
436 unsigned MinWidth = -1U;
437 unsigned MaxWidth = 8;
442 if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) {
443 for (
const auto &[
_, RdxDesc] : Legal->getReductionVars()) {
448 std::min(RdxDesc.getMinWidthCastToRecurrenceTypeInBits(),
449 RdxDesc.getRecurrenceType()->getScalarSizeInBits()));
450 MaxWidth = std::max(MaxWidth,
451 RdxDesc.getRecurrenceType()->getScalarSizeInBits());
454 for (
Type *
T : ElementTypesInLoop) {
455 MinWidth = std::min<unsigned>(
456 MinWidth,
DL.getTypeSizeInBits(
T->getScalarType()).getFixedValue());
457 MaxWidth = std::max<unsigned>(
458 MaxWidth,
DL.getTypeSizeInBits(
T->getScalarType()).getFixedValue());
461 return {MinWidth, MaxWidth};
466 ElementTypesInLoop.clear();
474 if (ValuesToIgnore && ValuesToIgnore->
contains(&
I))
484 if (!Legal->isReductionVariable(PN))
487 Legal->getRecurrenceDescriptor(PN);
497 T = ST->getValueOperand()->getType();
500 "Expected the load/store/recurrence type to be sized");
502 ElementTypesInLoop.insert(
T);
507void VFSelectionContext::initializeVScaleForTuning() {
511 if (
F.hasFnAttribute(Attribute::VScaleRange)) {
512 auto Attr =
F.getFnAttribute(Attribute::VScaleRange);
513 auto Min = Attr.getVScaleRangeMin();
514 auto Max = Attr.getVScaleRangeMax();
515 if (Max && Min == Max) {
516 VScaleForTuning = Max;
521 VScaleForTuning = TTI.getVScaleForTuning();
526 return !Hints->allowReordering() && RdxDesc.
isOrdered();
532 Loop *L =
const_cast<Loop *
>(TheLoop);
533 if (Legal->getRuntimePointerChecking()->Need) {
535 "Runtime ptr check is required with -Os/-Oz",
536 "runtime pointer checks needed. Enable vectorization of this "
537 "loop with '#pragma clang loop vectorize(enable)' when "
538 "compiling with -Os/-Oz",
539 "CantVersionLoopWithOptForSize", ORE, L);
543 if (!PSE.getPredicate().isAlwaysTrue()) {
545 "Runtime SCEV check is required with -Os/-Oz",
546 "runtime SCEV checks needed. Enable vectorization of this "
547 "loop with '#pragma clang loop vectorize(enable)' when "
548 "compiling with -Os/-Oz",
549 "CantVersionLoopWithOptForSize", ORE, L);
554 if (!Legal->getLAI()->getSymbolicStrides().empty()) {
556 "Runtime stride check for small trip count",
557 "runtime stride == 1 checks needed. Enable vectorization of "
558 "this loop without such check by compiling with -Os/-Oz",
559 "CantVersionLoopWithOptForSize", ORE, L);
568 if (!InLoopReductions.empty())
571 for (
const auto &Reduction : Legal->getReductionVars()) {
572 PHINode *Phi = Reduction.first;
594 !TTI.preferInLoopReduction(Kind, Phi->getType()))
602 bool InLoop = !ReductionOperations.
empty();
605 InLoopReductions.insert(Phi);
608 for (
auto *
I : ReductionOperations) {
609 InLoopReductionImmediateChains[
I] = LastChain;
613 LLVM_DEBUG(
dbgs() <<
"LV: Using " << (InLoop ?
"inloop" :
"out of loop")
614 <<
" reduction for phi: " << *Phi <<
"\n");
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
loop Loop Strength Reduction
This file defines the LoopVectorizationLegality class.
static cl::opt< bool > ForceTargetSupportsScalableVectors("force-target-supports-scalable-vectors", cl::init(false), cl::Hidden, cl::desc("Pretend that scalable vectors are supported, even if the target does " "not support them. This flag should only be used for testing."))
static cl::opt< bool > ConsiderRegPressure("vectorizer-consider-reg-pressure", cl::init(false), cl::Hidden, cl::desc("Discard VFs if their register pressure is too high."))
static cl::opt< bool > UseWiderVFIfCallVariantsPresent("vectorizer-maximize-bandwidth-for-vector-calls", cl::init(true), cl::Hidden, cl::desc("Try wider VFs if they enable the use of vector variants"))
static cl::opt< bool > ForceTargetSupportsMaskedMemoryOps("force-target-supports-masked-memory-ops", cl::init(false), cl::Hidden, cl::desc("Assume the target supports masked memory operations (used for " "testing)."))
Note: This currently only applies to llvm.masked.load and llvm.masked.store.
static cl::opt< bool > MaximizeBandwidth("vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden, cl::desc("Maximize bandwidth when selecting vectorization factor which " "will be determined by the smallest type in loop."))
This file provides a LoopVectorizationPlanner class.
LLVM Basic Block Representation.
A parsed version of the target data layout string, and methods for querying it.
constexpr bool isVector() const
One or more elements.
static constexpr ElementCount getScalable(ScalarTy MinVal)
static constexpr ElementCount getFixed(ScalarTy MinVal)
static constexpr ElementCount get(ScalarTy MinVal, bool Scalable)
bool isScalableVectorizationDisabled() const
Represents a single loop in the control flow graph.
The RecurrenceDescriptor is used to identify recurrences variables in a loop.
Type * getRecurrenceType() const
Returns the type of the recurrence.
bool hasUsesOutsideReductionChain() const
Returns true if the reduction PHI has any uses outside the reduction chain.
static bool isFindLastRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
LLVM_ABI SmallVector< Instruction *, 4 > getReductionOpChain(PHINode *Phi, Loop *L) const
Attempts to find a chain of operations from Phi to LoopExitInst that can be treated as a set of reduc...
static bool isAnyOfRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
RecurKind getRecurrenceKind() const
bool isOrdered() const
Expose an ordered FP reduction to the instance users.
static bool isFindIVRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
bool contains(ConstPtrType Ptr) const
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
The instances of the Type class are immutable: once they are created, they are never changed.
bool isVoidTy() const
Return true if this is 'void'.
std::pair< unsigned, unsigned > getSmallestAndWidestTypes() const
bool supportsScalableVectors() const
bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment, unsigned AddressSpace) const
Returns true if the target machine supports masked store operation for the given DataType and kind of...
bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment, unsigned AddressSpace) const
Returns true if the target machine supports masked load operation for the given DataType and kind of ...
bool runtimeChecksRequired()
Check whether vectorization would require runtime checks.
bool isLegalGatherOrScatter(Value *V, ElementCount VF) const
Returns true if the target machine can represent V as a masked gather or scatter operation.
void collectInLoopReductions()
Split reductions into those that happen in the loop, and those that happen outside.
FixedScalableVFPair computeFeasibleMaxVF(unsigned MaxTripCount, ElementCount UserVF, unsigned UserIC, bool FoldTailByMasking, bool RequiresScalarEpilogue)
bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const
Returns true if we should use strict in-order reductions for the given RdxDesc.
bool shouldConsiderRegPressureForVF(ElementCount VF) const
void collectElementTypesForWidening(const SmallPtrSetImpl< const Value * > *ValuesToIgnore=nullptr)
Collect element types in the loop that need widening.
LLVM Value Representation.
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct a VectorType.
static constexpr bool isKnownLE(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
static constexpr bool isKnownLT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
static constexpr bool isKnownGT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
initializer< Ty > init(const Ty &Val)
DiagnosticInfoOptimizationBase::Argument NV
This is an optimization pass for GlobalISel generic memory operations.
void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag, OptimizationRemarkEmitter *ORE, const Loop *TheLoop, Instruction *I=nullptr, DebugLoc DL={})
Reports an informative message: print Msg for debugging purposes as well as an optimization remark.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Align getLoadStoreAlignment(const Value *I)
A helper function that returns the alignment of load or store instruction.
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
std::optional< unsigned > getMaxVScale(const Function &F, const TargetTransformInfo &TTI)
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
RecurKind
These are the kinds of recurrences that we support.
LLVM_ABI void reportVectorizationFailure(const StringRef DebugMsg, const StringRef OREMsg, const StringRef ORETag, OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I=nullptr)
Reports a vectorization failure: print DebugMsg for debugging purposes along with the corresponding o...
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
cl::opt< bool > PreferInLoopReductions
This struct is a compact representation of a valid (non-zero power of two) alignment.
A class that represents two vectorization factors (initialized with 0 by default).