28#define DEBUG_TYPE "loop-vectorize"
34 cl::desc(
"Maximize bandwidth when selecting vectorization factor which "
35 "will be determined by the smallest type in loop."));
38 "vectorizer-maximize-bandwidth-for-vector-calls",
cl::init(
true),
40 cl::desc(
"Try wider VFs if they enable the use of vector variants"));
44 cl::desc(
"Discard VFs if their register pressure is too high."));
49 "Pretend that scalable vectors are supported, even if the target does "
50 "not support them. This flag should only be used for testing."));
54 cl::desc(
"Prefer in-loop vector reductions, "
55 "overriding the targets preference."));
61 cl::desc(
"Assume the target supports masked memory operations (used for "
73 : TTI.isLegalMaskedStore(Ty, Alignment, AS));
86 return (LI && TTI.isLegalMaskedGather(Ty,
Align)) ||
87 (
SI && TTI.isLegalMaskedScatter(Ty,
Align));
94bool VFSelectionContext::useMaxBandwidth(
bool IsScalable)
const {
99 (
TTI.shouldMaximizeVectorBandwidth(RegKind) ||
101 Legal->hasVectorCallVariants())));
110 if (TTI.shouldConsiderVectorizationRegPressure())
117 VF, VF.
isScalable() ? MaxPermissibleVFWithoutMaxBW.ScalableVF
118 : MaxPermissibleVFWithoutMaxBW.FixedVF);
122 ElementCount VF,
unsigned MaxTripCount,
unsigned UserIC,
123 bool FoldTailByMasking,
bool RequiresScalarEpilogue)
const {
125 if (VF.
isScalable() &&
F.hasFnAttribute(Attribute::VScaleRange)) {
126 auto Attr =
F.getFnAttribute(Attribute::VScaleRange);
127 auto Min = Attr.getVScaleRangeMin();
134 if (MaxTripCount > 0 && RequiresScalarEpilogue)
139 unsigned IC = UserIC > 0 ? UserIC : 1;
140 unsigned EstimatedVFTimesIC = EstimatedVF * IC;
142 if (MaxTripCount && MaxTripCount <= EstimatedVFTimesIC &&
150 if (ClampedUpperTripCount == 0)
151 ClampedUpperTripCount = 1;
152 LLVM_DEBUG(
dbgs() <<
"LV: Clamping the MaxVF to maximum power of two not "
153 "exceeding the constant trip count"
154 << (UserIC > 0 ?
" divided by UserIC" :
"") <<
": "
155 << ClampedUpperTripCount <<
"\n");
162ElementCount VFSelectionContext::getMaximizedVFForTarget(
163 unsigned MaxTripCount,
unsigned SmallestType,
unsigned WidestType,
164 ElementCount MaxSafeVF,
unsigned UserIC,
bool FoldTailByMasking,
165 bool RequiresScalarEpilogue) {
166 bool ComputeScalableMaxVF = MaxSafeVF.
isScalable();
167 const TypeSize WidestRegister = TTI.getRegisterBitWidth(
172 auto MinVF = [](
const ElementCount &
LHS,
const ElementCount &
RHS) {
174 "Scalable flags must match");
182 ComputeScalableMaxVF);
183 MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
185 << (MaxVectorElementCount * WidestType) <<
" bits.\n");
187 if (!MaxVectorElementCount) {
189 << (ComputeScalableMaxVF ?
"scalable" :
"fixed")
190 <<
" vector registers.\n");
195 clampVFByMaxTripCount(MaxVectorElementCount, MaxTripCount, UserIC,
196 FoldTailByMasking, RequiresScalarEpilogue);
199 if (MaxVF != MaxVectorElementCount)
203 MaxPermissibleVFWithoutMaxBW.ScalableVF = MaxVF;
205 MaxPermissibleVFWithoutMaxBW.FixedVF = MaxVF;
207 if (useMaxBandwidth(ComputeScalableMaxVF)) {
210 ComputeScalableMaxVF);
211 MaxVF = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);
213 if (ElementCount MinVF =
214 TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) {
217 <<
") with target's minimum: " << MinVF <<
'\n');
222 MaxVF = clampVFByMaxTripCount(MaxVF, MaxTripCount, UserIC,
223 FoldTailByMasking, RequiresScalarEpilogue);
230 if (std::optional<unsigned> MaxVScale =
TTI.getMaxVScale())
233 if (
F.hasFnAttribute(Attribute::VScaleRange))
234 return F.getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax();
239bool VFSelectionContext::isScalableVectorizationAllowed() {
240 if (IsScalableVectorizationAllowed)
241 return *IsScalableVectorizationAllowed;
243 IsScalableVectorizationAllowed =
false;
249 "ScalableVectorizationDisabled", ORE, TheLoop);
253 LLVM_DEBUG(
dbgs() <<
"LV: Scalable vectorization is available\n");
256 std::numeric_limits<ElementCount::ScalarTy>::max());
265 if (!
all_of(Legal->getReductionVars(), [&](
const auto &
Reduction) ->
bool {
266 return TTI.isLegalToVectorizeReduction(Reduction.second, MaxScalableVF);
269 "Scalable vectorization not supported for the reduction "
270 "operations found in this loop.",
271 "ScalableVFUnfeasible", ORE, TheLoop);
277 if (
any_of(ElementTypesInLoop, [&](
Type *Ty) {
278 return !Ty->
isVoidTy() && !TTI.isElementTypeLegalForScalableVector(Ty);
281 "for all element types found in this loop.",
282 "ScalableVFUnfeasible", ORE, TheLoop);
286 if (!Legal->isSafeForAnyVectorWidth() && !
getMaxVScale(F, TTI)) {
288 "for safe distance analysis.",
289 "ScalableVFUnfeasible", ORE, TheLoop);
293 IsScalableVectorizationAllowed =
true;
298VFSelectionContext::getMaxLegalScalableVF(
unsigned MaxSafeElements) {
299 if (!isScalableVectorizationAllowed())
303 std::numeric_limits<ElementCount::ScalarTy>::max());
304 if (Legal->isSafeForAnyVectorWidth())
305 return MaxScalableVF;
307 std::optional<unsigned> MaxVScale =
getMaxVScale(F, TTI);
313 "Max legal vector width too small, scalable vectorization "
315 "ScalableVFUnfeasible", ORE, TheLoop);
317 return MaxScalableVF;
321 unsigned MaxTripCount,
ElementCount UserVF,
unsigned UserIC,
322 bool FoldTailByMasking,
bool RequiresScalarEpilogue) {
329 unsigned MaxSafeElementsPowerOf2 =
331 if (!Legal->isSafeForAnyStoreLoadForwardDistances()) {
332 unsigned SLDist = Legal->getMaxStoreLoadForwardSafeDistanceInBits();
333 MaxSafeElementsPowerOf2 =
334 std::min(MaxSafeElementsPowerOf2, SLDist / WidestType);
338 auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElementsPowerOf2);
340 if (!Legal->isSafeForAnyVectorWidth())
341 MaxSafeElements = MaxSafeElementsPowerOf2;
343 LLVM_DEBUG(
dbgs() <<
"LV: The max safe fixed VF is: " << MaxSafeFixedVF
345 LLVM_DEBUG(
dbgs() <<
"LV: The max safe scalable VF is: " << MaxSafeScalableVF
351 UserVF.
isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;
368 <<
" is unsafe, clamping to max safe VF="
369 << MaxSafeFixedVF <<
".\n");
372 TheLoop->getStartLoc(),
373 TheLoop->getHeader())
374 <<
"User-specified vectorization factor "
375 <<
ore::NV(
"UserVectorizationFactor", UserVF)
376 <<
" is unsafe, clamping to maximum safe vectorization factor "
377 <<
ore::NV(
"VectorizationFactor", MaxSafeFixedVF);
379 return MaxSafeFixedVF;
384 <<
" is ignored because scalable vectors are not "
388 TheLoop->getStartLoc(),
389 TheLoop->getHeader())
390 <<
"User-specified vectorization factor "
391 <<
ore::NV(
"UserVectorizationFactor", UserVF)
392 <<
" is ignored because the target does not support scalable "
393 "vectors. The compiler will pick a more suitable value.";
397 <<
" is unsafe. Ignoring scalable UserVF.\n");
400 TheLoop->getStartLoc(),
401 TheLoop->getHeader())
402 <<
"User-specified vectorization factor "
403 <<
ore::NV(
"UserVectorizationFactor", UserVF)
404 <<
" is unsafe. Ignoring the hint to let the compiler pick a "
405 "more suitable value.";
410 LLVM_DEBUG(
dbgs() <<
"LV: The Smallest and Widest types: " << SmallestType
411 <<
" / " << WidestType <<
" bits.\n");
415 if (
auto MaxVF = getMaximizedVFForTarget(
416 MaxTripCount, SmallestType, WidestType, MaxSafeFixedVF, UserIC,
417 FoldTailByMasking, RequiresScalarEpilogue))
418 Result.FixedVF = MaxVF;
420 if (
auto MaxVF = getMaximizedVFForTarget(
421 MaxTripCount, SmallestType, WidestType, MaxSafeScalableVF, UserIC,
422 FoldTailByMasking, RequiresScalarEpilogue))
424 Result.ScalableVF = MaxVF;
432std::pair<unsigned, unsigned>
434 unsigned MinWidth = -1U;
435 unsigned MaxWidth = 8;
440 if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) {
441 for (
const auto &[
_, RdxDesc] : Legal->getReductionVars()) {
446 std::min(RdxDesc.getMinWidthCastToRecurrenceTypeInBits(),
447 RdxDesc.getRecurrenceType()->getScalarSizeInBits()));
448 MaxWidth = std::max(MaxWidth,
449 RdxDesc.getRecurrenceType()->getScalarSizeInBits());
452 for (
Type *
T : ElementTypesInLoop) {
453 MinWidth = std::min<unsigned>(
454 MinWidth,
DL.getTypeSizeInBits(
T->getScalarType()).getFixedValue());
455 MaxWidth = std::max<unsigned>(
456 MaxWidth,
DL.getTypeSizeInBits(
T->getScalarType()).getFixedValue());
459 return {MinWidth, MaxWidth};
464 ElementTypesInLoop.clear();
472 if (ValuesToIgnore && ValuesToIgnore->
contains(&
I))
482 if (!Legal->isReductionVariable(PN))
485 Legal->getRecurrenceDescriptor(PN);
495 T = ST->getValueOperand()->getType();
498 "Expected the load/store/recurrence type to be sized");
500 ElementTypesInLoop.insert(
T);
505void VFSelectionContext::initializeVScaleForTuning() {
509 if (
F.hasFnAttribute(Attribute::VScaleRange)) {
510 auto Attr =
F.getFnAttribute(Attribute::VScaleRange);
511 auto Min = Attr.getVScaleRangeMin();
512 auto Max = Attr.getVScaleRangeMax();
513 if (Max && Min == Max) {
514 VScaleForTuning = Max;
519 VScaleForTuning = TTI.getVScaleForTuning();
524 return !Hints->allowReordering() && RdxDesc.
isOrdered();
530 Loop *L =
const_cast<Loop *
>(TheLoop);
531 if (Legal->getRuntimePointerChecking()->Need) {
533 "Runtime ptr check is required with -Os/-Oz",
534 "runtime pointer checks needed. Enable vectorization of this "
535 "loop with '#pragma clang loop vectorize(enable)' when "
536 "compiling with -Os/-Oz",
537 "CantVersionLoopWithOptForSize", ORE, L);
541 if (!PSE.getPredicate().isAlwaysTrue()) {
543 "Runtime SCEV check is required with -Os/-Oz",
544 "runtime SCEV checks needed. Enable vectorization of this "
545 "loop with '#pragma clang loop vectorize(enable)' when "
546 "compiling with -Os/-Oz",
547 "CantVersionLoopWithOptForSize", ORE, L);
552 if (!Legal->getLAI()->getSymbolicStrides().empty()) {
554 "Runtime stride check for small trip count",
555 "runtime stride == 1 checks needed. Enable vectorization of "
556 "this loop without such check by compiling with -Os/-Oz",
557 "CantVersionLoopWithOptForSize", ORE, L);
570 if (!InLoopReductions.empty())
573 for (
const auto &Reduction : Legal->getReductionVars()) {
574 PHINode *Phi = Reduction.first;
596 !TTI.preferInLoopReduction(Kind, Phi->getType()))
604 bool InLoop = !ReductionOperations.
empty();
607 InLoopReductions.insert(Phi);
610 for (
auto *
I : ReductionOperations) {
611 InLoopReductionImmediateChains[
I] = LastChain;
615 LLVM_DEBUG(
dbgs() <<
"LV: Using " << (InLoop ?
"inloop" :
"out of loop")
616 <<
" reduction for phi: " << *Phi <<
"\n");
629 "Scalable vectorization requested but not supported by the target",
630 "the scalable user-specified vectorization width for outer-loop "
631 "vectorization cannot be used because the target does not support "
633 "ScalableVFUnfeasible", ORE, TheLoop);
641 auto RegKind = TTI.enableScalableVectorization()
646 unsigned N =
RegSize.getKnownMinValue() / WidestType;
653 <<
"overriding computed VF.\n");
658 "VF needs to be a power of two");
662 <<
"VF " << VF <<
" to build VPlans.\n");
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
loop Loop Strength Reduction
This file defines the LoopVectorizationLegality class.
cl::opt< bool > VPlanBuildOuterloopStressTest
static cl::opt< bool > ForceTargetSupportsScalableVectors("force-target-supports-scalable-vectors", cl::init(false), cl::Hidden, cl::desc("Pretend that scalable vectors are supported, even if the target does " "not support them. This flag should only be used for testing."))
static cl::opt< bool > ConsiderRegPressure("vectorizer-consider-reg-pressure", cl::init(false), cl::Hidden, cl::desc("Discard VFs if their register pressure is too high."))
static cl::opt< bool > UseWiderVFIfCallVariantsPresent("vectorizer-maximize-bandwidth-for-vector-calls", cl::init(true), cl::Hidden, cl::desc("Try wider VFs if they enable the use of vector variants"))
static cl::opt< bool > ForceTargetSupportsMaskedMemoryOps("force-target-supports-masked-memory-ops", cl::init(false), cl::Hidden, cl::desc("Assume the target supports masked memory operations (used for " "testing)."))
Note: This currently only applies to llvm.masked.load and llvm.masked.store.
static cl::opt< bool > MaximizeBandwidth("vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden, cl::desc("Maximize bandwidth when selecting vectorization factor which " "will be determined by the smallest type in loop."))
This file provides a LoopVectorizationPlanner class.
LLVM Basic Block Representation.
A parsed version of the target data layout string, and methods for querying it.
constexpr bool isVector() const
One or more elements.
static constexpr ElementCount getScalable(ScalarTy MinVal)
static constexpr ElementCount getFixed(ScalarTy MinVal)
static constexpr ElementCount get(ScalarTy MinVal, bool Scalable)
constexpr bool isScalar() const
Exactly one element.
bool isScalableVectorizationDisabled() const
Represents a single loop in the control flow graph.
The RecurrenceDescriptor is used to identify recurrence variables in a loop.
Type * getRecurrenceType() const
Returns the type of the recurrence.
bool hasUsesOutsideReductionChain() const
Returns true if the reduction PHI has any uses outside the reduction chain.
static bool isFindLastRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
LLVM_ABI SmallVector< Instruction *, 4 > getReductionOpChain(PHINode *Phi, Loop *L) const
Attempts to find a chain of operations from Phi to LoopExitInst that can be treated as a set of reduc...
static bool isAnyOfRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
RecurKind getRecurrenceKind() const
bool isOrdered() const
Expose an ordered FP reduction to the instance users.
static bool isFindIVRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
bool contains(ConstPtrType Ptr) const
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
The instances of the Type class are immutable: once they are created, they are never changed.
bool isVoidTy() const
Return true if this is 'void'.
FixedScalableVFPair computeVPlanOuterloopVF(ElementCount UserVF)
Returns a scalable VF to use for outer-loop vectorization if the target supports it and a fixed VF ot...
std::pair< unsigned, unsigned > getSmallestAndWidestTypes() const
bool supportsScalableVectors() const
bool runtimeChecksRequired()
Check whether vectorization would require runtime checks.
bool isLegalGatherOrScatter(Value *V, ElementCount VF) const
Returns true if the target machine can represent V as a masked gather or scatter operation.
void collectInLoopReductions()
Split reductions into those that happen in the loop, and those that happen outside.
FixedScalableVFPair computeFeasibleMaxVF(unsigned MaxTripCount, ElementCount UserVF, unsigned UserIC, bool FoldTailByMasking, bool RequiresScalarEpilogue)
bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const
Returns true if we should use strict in-order reductions for the given RdxDesc.
bool shouldConsiderRegPressureForVF(ElementCount VF) const
void collectElementTypesForWidening(const SmallPtrSetImpl< const Value * > *ValuesToIgnore=nullptr)
Collect element types in the loop that need widening.
bool isLegalMaskedLoadOrStore(Instruction *I, ElementCount VF) const
Returns true if the target machine supports masked loads or stores for I's data type and alignment.
void computeMinimalBitwidths()
Compute smallest bitwidth each instruction can be represented with.
LLVM Value Representation.
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct a VectorType.
static constexpr bool isKnownLE(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
static constexpr bool isKnownLT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
constexpr bool isZero() const
static constexpr bool isKnownGT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
initializer< Ty > init(const Ty &Val)
DiagnosticInfoOptimizationBase::Argument NV
This is an optimization pass for GlobalISel generic memory operations.
void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag, OptimizationRemarkEmitter *ORE, const Loop *TheLoop, Instruction *I=nullptr, DebugLoc DL={})
Reports an informative message: print Msg for debugging purposes as well as an optimization remark.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
unsigned getLoadStoreAddressSpace(const Value *I)
A helper function that returns the address space of the pointer operand of load or store instruction.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Align getLoadStoreAlignment(const Value *I)
A helper function that returns the alignment of load or store instruction.
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
std::optional< unsigned > getMaxVScale(const Function &F, const TargetTransformInfo &TTI)
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
LLVM_ABI void reportVectorizationFailure(const StringRef DebugMsg, const StringRef OREMsg, const StringRef ORETag, OptimizationRemarkEmitter *ORE, const Loop *TheLoop, Instruction *I=nullptr)
Reports a vectorization failure: print DebugMsg for debugging purposes along with the corresponding o...
RecurKind
These are the kinds of recurrences that we support.
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
LLVM_ABI MapVector< Instruction *, uint64_t > computeMinimumValueSizes(ArrayRef< BasicBlock * > Blocks, DemandedBits &DB, const TargetTransformInfo *TTI=nullptr)
Compute a map of integer instructions to their minimum legal type size.
cl::opt< bool > PreferInLoopReductions
This struct is a compact representation of a valid (non-zero power of two) alignment.
A class that represents two vectorization factors (initialized with 0 by default).
static FixedScalableVFPair getNone()