LLVM 23.0.0git
LoopVectorizationPlanner.cpp
Go to the documentation of this file.
1//===- LoopVectorizationPlanner.cpp - VF selection and planning -----------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file implements VFSelectionContext methods for loop vectorization
11/// VF selection, independent of cost-modeling decisions.
12///
13//===----------------------------------------------------------------------===//
14
21#include "llvm/Support/Debug.h"
25
26using namespace llvm;
27using namespace LoopVectorizationUtils;
28
29#define DEBUG_TYPE "loop-vectorize"
30
32
34 "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
35 cl::desc("Maximize bandwidth when selecting vectorization factor which "
36 "will be determined by the smallest type in loop."));
37
39 "vectorizer-maximize-bandwidth-for-vector-calls", cl::init(true),
41 cl::desc("Try wider VFs if they enable the use of vector variants"));
42
44 "vectorizer-consider-reg-pressure", cl::init(false), cl::Hidden,
45 cl::desc("Discard VFs if their register pressure is too high."));
46
48 "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
50 "Pretend that scalable vectors are supported, even if the target does "
51 "not support them. This flag should only be used for testing."));
52
54 "prefer-inloop-reductions", cl::init(false), cl::Hidden,
55 cl::desc("Prefer in-loop vector reductions, "
56 "overriding the targets preference."));
57
58/// Note: This currently only applies to `llvm.masked.load` and
59/// `llvm.masked.store`. TODO: Extend this to cover other operations as needed.
61 "force-target-supports-masked-memory-ops", cl::init(false), cl::Hidden,
62 cl::desc("Assume the target supports masked memory operations (used for "
63 "testing)."));
64
66 "force-target-supports-gather-scatter-ops", cl::init(false), cl::Hidden,
67 cl::desc("Assume the target supports gather/scatter operations (used for "
68 "testing)."));
69
70/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
71/// is passed, the message relates to that particular instruction.
72#ifndef NDEBUG
73static void debugVectorizationMessage(const StringRef Prefix,
74 const StringRef DebugMsg,
75 Instruction *I) {
76 dbgs() << "LV: " << Prefix << DebugMsg;
77 if (I != nullptr)
78 dbgs() << " " << *I;
79 else
80 dbgs() << '.';
81 dbgs() << '\n';
82}
83#endif
84
85/// Create an analysis remark that explains why vectorization failed
86/// \p RemarkName is the identifier for the remark. If \p I is passed it is an
87/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
88/// the location of the remark. If \p DL is passed, use it as debug location for
89/// the remark. \return the remark object that can be streamed to.
91 const Loop *TheLoop,
93 DebugLoc DL = {}) {
94 BasicBlock *CodeRegion = I ? I->getParent() : TheLoop->getHeader();
95 // If debug location is attached to the instruction, use it. Otherwise if DL
96 // was not provided, use the loop's.
97 if (I && I->getDebugLoc())
98 DL = I->getDebugLoc();
99 else if (!DL)
100 DL = TheLoop->getStartLoc();
101
102 return OptimizationRemarkAnalysis(DEBUG_TYPE, RemarkName, DL, CodeRegion);
103}
104
106 const StringRef DebugMsg, const StringRef OREMsg, const StringRef ORETag,
107 OptimizationRemarkEmitter *ORE, const Loop *TheLoop, Instruction *I) {
108 LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
109 ORE->emit(createLVAnalysis(ORETag, TheLoop, I)
110 << "loop not vectorized: " << OREMsg);
111}
112
114 const StringRef Msg, const StringRef ORETag, OptimizationRemarkEmitter *ORE,
115 const Loop *TheLoop, Instruction *I, DebugLoc DL) {
117 ORE->emit(createLVAnalysis(ORETag, TheLoop, I, DL) << Msg);
118}
119
121 Loop *TheLoop,
122 ElementCount VFWidth,
123 unsigned IC) {
125 "Vectorizing: ", TheLoop->isInnermost() ? "innermost loop" : "outer loop",
126 nullptr));
127 StringRef LoopType = TheLoop->isInnermost() ? "" : "outer ";
128 ORE->emit([&]() {
129 return OptimizationRemark(DEBUG_TYPE, "Vectorized", TheLoop->getStartLoc(),
130 TheLoop->getHeader())
131 << "vectorized " << LoopType << "loop (vectorization width: "
132 << ore::NV("VectorizationFactor", VFWidth)
133 << ", interleaved count: " << ore::NV("InterleaveCount", IC) << ")";
134 });
135}
136
138 ElementCount VF) const {
140 auto *Ty = getLoadStoreType(I);
141 const unsigned AS = getLoadStoreAddressSpace(I);
142 const Align Alignment = getLoadStoreAlignment(I);
143
145 (isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment, AS)
146 : TTI.isLegalMaskedStore(Ty, Alignment, AS));
147}
148
150 ElementCount VF) const {
151 bool LI = isa<LoadInst>(V);
152 bool SI = isa<StoreInst>(V);
153 if (!LI && !SI)
154 return false;
155 auto *Ty = getLoadStoreType(V);
157 if (VF.isVector())
158 Ty = VectorType::get(Ty, VF);
160 (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
161 (SI && TTI.isLegalMaskedScatter(Ty, Align));
162}
163
165 return TTI.supportsScalableVectors() || ForceTargetSupportsScalableVectors;
166}
167
168bool VFSelectionContext::useMaxBandwidth(bool IsScalable) const {
172 return MaximizeBandwidth || (MaximizeBandwidth.getNumOccurrences() == 0 &&
173 (TTI.shouldMaximizeVectorBandwidth(RegKind) ||
175 Legal->hasVectorCallVariants())));
176}
177
179 if (ConsiderRegPressure.getNumOccurrences())
180 return ConsiderRegPressure;
181
182 // TODO: We should eventually consider register pressure for all targets. The
183 // TTI hook is temporary whilst target-specific issues are being fixed.
184 if (TTI.shouldConsiderVectorizationRegPressure())
185 return true;
186
187 if (!useMaxBandwidth(VF.isScalable()))
188 return false;
189 // Only calculate register pressure for VFs enabled by MaxBandwidth.
191 VF, VF.isScalable() ? MaxPermissibleVFWithoutMaxBW.ScalableVF
192 : MaxPermissibleVFWithoutMaxBW.FixedVF);
193}
194
195ElementCount VFSelectionContext::clampVFByMaxTripCount(
196 ElementCount VF, unsigned MaxTripCount, unsigned UserIC,
197 bool FoldTailByMasking, bool RequiresScalarEpilogue) const {
198 unsigned EstimatedVF = VF.getKnownMinValue();
199 if (VF.isScalable() && F.hasFnAttribute(Attribute::VScaleRange)) {
200 auto Attr = F.getFnAttribute(Attribute::VScaleRange);
201 auto Min = Attr.getVScaleRangeMin();
202 EstimatedVF *= Min;
203 }
204
205 // When a scalar epilogue is required, at least one iteration of the scalar
206 // loop has to execute. Adjust MaxTripCount accordingly to avoid picking a
207 // max VF that results in a dead vector loop.
208 if (MaxTripCount > 0 && RequiresScalarEpilogue)
209 MaxTripCount -= 1;
210
211 // When the user specifies an interleave count, we need to ensure that
212 // VF * UserIC <= MaxTripCount to avoid a dead vector loop.
213 unsigned IC = UserIC > 0 ? UserIC : 1;
214 unsigned EstimatedVFTimesIC = EstimatedVF * IC;
215
216 if (MaxTripCount && MaxTripCount <= EstimatedVFTimesIC &&
217 (!FoldTailByMasking || isPowerOf2_32(MaxTripCount))) {
218 // If upper bound loop trip count (TC) is known at compile time there is no
219 // point in choosing VF greater than TC / IC (as done in the loop below).
220 // Select maximum power of two which doesn't exceed TC / IC. If VF is
221 // scalable, we only fall back on a fixed VF when the TC is less than or
222 // equal to the known number of lanes.
223 auto ClampedUpperTripCount = llvm::bit_floor(MaxTripCount / IC);
224 if (ClampedUpperTripCount == 0)
225 ClampedUpperTripCount = 1;
226 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
227 "exceeding the constant trip count"
228 << (UserIC > 0 ? " divided by UserIC" : "") << ": "
229 << ClampedUpperTripCount << "\n");
230 return ElementCount::get(ClampedUpperTripCount,
231 FoldTailByMasking ? VF.isScalable() : false);
232 }
233 return VF;
234}
235
236ElementCount VFSelectionContext::getMaximizedVFForTarget(
237 unsigned MaxTripCount, unsigned SmallestType, unsigned WidestType,
238 ElementCount MaxSafeVF, unsigned UserIC, bool FoldTailByMasking,
239 bool RequiresScalarEpilogue) {
240 bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
241 const TypeSize WidestRegister = TTI.getRegisterBitWidth(
242 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
244
245 // Convenience function to return the minimum of two ElementCounts.
246 auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
247 assert((LHS.isScalable() == RHS.isScalable()) &&
248 "Scalable flags must match");
250 };
251
252 // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
253 // Note that both WidestRegister and WidestType may not be a powers of 2.
254 auto MaxVectorElementCount = ElementCount::get(
255 llvm::bit_floor(WidestRegister.getKnownMinValue() / WidestType),
256 ComputeScalableMaxVF);
257 MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
258 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
259 << (MaxVectorElementCount * WidestType) << " bits.\n");
260
261 if (!MaxVectorElementCount) {
262 LLVM_DEBUG(dbgs() << "LV: The target has no "
263 << (ComputeScalableMaxVF ? "scalable" : "fixed")
264 << " vector registers.\n");
265 return ElementCount::getFixed(1);
266 }
267
268 ElementCount MaxVF =
269 clampVFByMaxTripCount(MaxVectorElementCount, MaxTripCount, UserIC,
270 FoldTailByMasking, RequiresScalarEpilogue);
271 // If the MaxVF was already clamped, there's no point in trying to pick a
272 // larger one.
273 if (MaxVF != MaxVectorElementCount)
274 return MaxVF;
275
276 if (MaxVF.isScalable())
277 MaxPermissibleVFWithoutMaxBW.ScalableVF = MaxVF;
278 else
279 MaxPermissibleVFWithoutMaxBW.FixedVF = MaxVF;
280
281 if (useMaxBandwidth(ComputeScalableMaxVF)) {
282 auto MaxVectorElementCountMaxBW = ElementCount::get(
283 llvm::bit_floor(WidestRegister.getKnownMinValue() / SmallestType),
284 ComputeScalableMaxVF);
285 MaxVF = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);
286
287 if (ElementCount MinVF =
288 TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) {
289 if (ElementCount::isKnownLT(MaxVF, MinVF)) {
290 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
291 << ") with target's minimum: " << MinVF << '\n');
292 MaxVF = MinVF;
293 }
294 }
295
296 MaxVF = clampVFByMaxTripCount(MaxVF, MaxTripCount, UserIC,
297 FoldTailByMasking, RequiresScalarEpilogue);
298 }
299 return MaxVF;
300}
301
302std::optional<unsigned> llvm::getMaxVScale(const Function &F,
303 const TargetTransformInfo &TTI) {
304 if (std::optional<unsigned> MaxVScale = TTI.getMaxVScale())
305 return MaxVScale;
306
307 if (F.hasFnAttribute(Attribute::VScaleRange))
308 return F.getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax();
309
310 return std::nullopt;
311}
312
313bool VFSelectionContext::isScalableVectorizationAllowed() {
314 if (IsScalableVectorizationAllowed)
315 return *IsScalableVectorizationAllowed;
316
317 IsScalableVectorizationAllowed = false;
319 return false;
320
321 if (Hints->isScalableVectorizationDisabled()) {
322 reportVectorizationInfo("Scalable vectorization is explicitly disabled",
323 "ScalableVectorizationDisabled", ORE, TheLoop);
324 return false;
325 }
326
327 LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n");
328
329 auto MaxScalableVF = ElementCount::getScalable(
330 std::numeric_limits<ElementCount::ScalarTy>::max());
331
332 // Test that the loop-vectorizer can legalize all operations for this MaxVF.
333 // FIXME: While for scalable vectors this is currently sufficient, this should
334 // be replaced by a more detailed mechanism that filters out specific VFs,
335 // instead of invalidating vectorization for a whole set of VFs based on the
336 // MaxVF.
337
338 // Disable scalable vectorization if the loop contains unsupported reductions.
339 if (!all_of(Legal->getReductionVars(), [&](const auto &Reduction) -> bool {
340 return TTI.isLegalToVectorizeReduction(Reduction.second, MaxScalableVF);
341 })) {
343 "Scalable vectorization not supported for the reduction "
344 "operations found in this loop.",
345 "ScalableVFUnfeasible", ORE, TheLoop);
346 return false;
347 }
348
349 // Disable scalable vectorization if the loop contains any instructions
350 // with element types not supported for scalable vectors.
351 if (any_of(ElementTypesInLoop, [&](Type *Ty) {
352 return !Ty->isVoidTy() && !TTI.isElementTypeLegalForScalableVector(Ty);
353 })) {
354 reportVectorizationInfo("Scalable vectorization is not supported "
355 "for all element types found in this loop.",
356 "ScalableVFUnfeasible", ORE, TheLoop);
357 return false;
358 }
359
360 if (!Legal->isSafeForAnyVectorWidth() && !getMaxVScale(F, TTI)) {
361 reportVectorizationInfo("The target does not provide maximum vscale value "
362 "for safe distance analysis.",
363 "ScalableVFUnfeasible", ORE, TheLoop);
364 return false;
365 }
366
367 IsScalableVectorizationAllowed = true;
368 return true;
369}
370
372VFSelectionContext::getMaxLegalScalableVF(unsigned MaxSafeElements) {
373 if (!isScalableVectorizationAllowed())
375
376 auto MaxScalableVF = ElementCount::getScalable(
377 std::numeric_limits<ElementCount::ScalarTy>::max());
378 if (Legal->isSafeForAnyVectorWidth())
379 return MaxScalableVF;
380
381 std::optional<unsigned> MaxVScale = getMaxVScale(F, TTI);
382 // Limit MaxScalableVF by the maximum safe dependence distance.
383 MaxScalableVF = ElementCount::getScalable(MaxSafeElements / *MaxVScale);
384
385 if (!MaxScalableVF)
387 "Max legal vector width too small, scalable vectorization "
388 "unfeasible.",
389 "ScalableVFUnfeasible", ORE, TheLoop);
390
391 return MaxScalableVF;
392}
393
395 unsigned MaxTripCount, ElementCount UserVF, unsigned UserIC,
396 bool FoldTailByMasking, bool RequiresScalarEpilogue) {
397 auto [SmallestType, WidestType] = getSmallestAndWidestTypes();
398
399 // Get the maximum safe dependence distance in bits computed by LAA.
400 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
401 // the memory accesses that is most restrictive (involved in the smallest
402 // dependence distance).
403 unsigned MaxSafeElementsPowerOf2 =
404 llvm::bit_floor(Legal->getMaxSafeVectorWidthInBits() / WidestType);
405 if (!Legal->isSafeForAnyStoreLoadForwardDistances()) {
406 unsigned SLDist = Legal->getMaxStoreLoadForwardSafeDistanceInBits();
407 MaxSafeElementsPowerOf2 =
408 std::min(MaxSafeElementsPowerOf2, SLDist / WidestType);
409 }
410
411 auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElementsPowerOf2);
412 auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElementsPowerOf2);
413
414 if (!Legal->isSafeForAnyVectorWidth())
415 MaxSafeElements = MaxSafeElementsPowerOf2;
416
417 LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
418 << ".\n");
419 LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF
420 << ".\n");
421
422 // First analyze the UserVF, fall back if the UserVF should be ignored.
423 if (UserVF) {
424 auto MaxSafeUserVF =
425 UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;
426
427 if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) {
428 // If `VF=vscale x N` is safe, then so is `VF=N`
429 if (UserVF.isScalable())
430 return FixedScalableVFPair(
431 ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF);
432
433 return UserVF;
434 }
435
436 assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));
437
438 // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it
439 // is better to ignore the hint and let the compiler choose a suitable VF.
440 if (!UserVF.isScalable()) {
441 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
442 << " is unsafe, clamping to max safe VF="
443 << MaxSafeFixedVF << ".\n");
444 ORE->emit([&]() {
445 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
446 TheLoop->getStartLoc(),
447 TheLoop->getHeader())
448 << "User-specified vectorization factor "
449 << ore::NV("UserVectorizationFactor", UserVF)
450 << " is unsafe, clamping to maximum safe vectorization factor "
451 << ore::NV("VectorizationFactor", MaxSafeFixedVF);
452 });
453 return MaxSafeFixedVF;
454 }
455
457 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
458 << " is ignored because scalable vectors are not "
459 "available.\n");
460 ORE->emit([&]() {
461 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
462 TheLoop->getStartLoc(),
463 TheLoop->getHeader())
464 << "User-specified vectorization factor "
465 << ore::NV("UserVectorizationFactor", UserVF)
466 << " is ignored because the target does not support scalable "
467 "vectors. The compiler will pick a more suitable value.";
468 });
469 } else {
470 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
471 << " is unsafe. Ignoring scalable UserVF.\n");
472 ORE->emit([&]() {
473 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
474 TheLoop->getStartLoc(),
475 TheLoop->getHeader())
476 << "User-specified vectorization factor "
477 << ore::NV("UserVectorizationFactor", UserVF)
478 << " is unsafe. Ignoring the hint to let the compiler pick a "
479 "more suitable value.";
480 });
481 }
482 }
483
484 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
485 << " / " << WidestType << " bits.\n");
486
489 if (auto MaxVF = getMaximizedVFForTarget(
490 MaxTripCount, SmallestType, WidestType, MaxSafeFixedVF, UserIC,
491 FoldTailByMasking, RequiresScalarEpilogue))
492 Result.FixedVF = MaxVF;
493
494 if (auto MaxVF = getMaximizedVFForTarget(
495 MaxTripCount, SmallestType, WidestType, MaxSafeScalableVF, UserIC,
496 FoldTailByMasking, RequiresScalarEpilogue))
497 if (MaxVF.isScalable()) {
498 Result.ScalableVF = MaxVF;
499 LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
500 << "\n");
501 }
502
503 return Result;
504}
505
506std::pair<unsigned, unsigned>
508 unsigned MinWidth = -1U;
509 unsigned MaxWidth = 8;
510 const DataLayout &DL = F.getDataLayout();
511 // For in-loop reductions, no element types are added to ElementTypesInLoop
512 // if there are no loads/stores in the loop. In this case, check through the
513 // reduction variables to determine the maximum width.
514 if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) {
515 for (const auto &[_, RdxDesc] : Legal->getReductionVars()) {
516 // When finding the min width used by the recurrence we need to account
517 // for casts on the input operands of the recurrence.
518 MinWidth = std::min(
519 MinWidth,
520 std::min(RdxDesc.getMinWidthCastToRecurrenceTypeInBits(),
521 RdxDesc.getRecurrenceType()->getScalarSizeInBits()));
522 MaxWidth = std::max(MaxWidth,
523 RdxDesc.getRecurrenceType()->getScalarSizeInBits());
524 }
525 } else {
526 for (Type *T : ElementTypesInLoop) {
527 MinWidth = std::min<unsigned>(
528 MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue());
529 MaxWidth = std::max<unsigned>(
530 MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue());
531 }
532 }
533 return {MinWidth, MaxWidth};
534}
535
537 const SmallPtrSetImpl<const Value *> *ValuesToIgnore) {
538 ElementTypesInLoop.clear();
539 // For each block.
540 for (BasicBlock *BB : TheLoop->blocks()) {
541 // For each instruction in the loop.
542 for (Instruction &I : *BB) {
543 Type *T = I.getType();
544
545 // Skip ignored values.
546 if (ValuesToIgnore && ValuesToIgnore->contains(&I))
547 continue;
548
549 // Only examine Loads, Stores and PHINodes.
551 continue;
552
553 // Examine PHI nodes that are reduction variables. Update the type to
554 // account for the recurrence type.
555 if (auto *PN = dyn_cast<PHINode>(&I)) {
556 if (!Legal->isReductionVariable(PN))
557 continue;
558 const RecurrenceDescriptor &RdxDesc =
559 Legal->getRecurrenceDescriptor(PN);
561 TTI.preferInLoopReduction(RdxDesc.getRecurrenceKind(),
562 RdxDesc.getRecurrenceType()))
563 continue;
564 T = RdxDesc.getRecurrenceType();
565 }
566
567 // Examine the stored values.
568 if (auto *ST = dyn_cast<StoreInst>(&I))
569 T = ST->getValueOperand()->getType();
570
571 assert(T->isSized() &&
572 "Expected the load/store/recurrence type to be sized");
573
574 ElementTypesInLoop.insert(T);
575 }
576 }
577}
578
579void VFSelectionContext::initializeVScaleForTuning() {
581 return;
582
583 if (F.hasFnAttribute(Attribute::VScaleRange)) {
584 auto Attr = F.getFnAttribute(Attribute::VScaleRange);
585 auto Min = Attr.getVScaleRangeMin();
586 auto Max = Attr.getVScaleRangeMax();
587 if (Max && Min == Max) {
588 VScaleForTuning = Max;
589 return;
590 }
591 }
592
593 VScaleForTuning = TTI.getVScaleForTuning();
594}
595
597 const RecurrenceDescriptor &RdxDesc) const {
598 return !Hints->allowReordering() && RdxDesc.isOrdered();
599}
600
602 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
603
604 Loop *L = const_cast<Loop *>(TheLoop);
605 if (Legal->getRuntimePointerChecking()->Need) {
607 "Runtime ptr check is required with -Os/-Oz",
608 "runtime pointer checks needed. Enable vectorization of this "
609 "loop with '#pragma clang loop vectorize(enable)' when "
610 "compiling with -Os/-Oz",
611 "CantVersionLoopWithOptForSize", ORE, L);
612 return true;
613 }
614
615 if (!PSE.getPredicate().isAlwaysTrue()) {
617 "Runtime SCEV check is required with -Os/-Oz",
618 "runtime SCEV checks needed. Enable vectorization of this "
619 "loop with '#pragma clang loop vectorize(enable)' when "
620 "compiling with -Os/-Oz",
621 "CantVersionLoopWithOptForSize", ORE, L);
622 return true;
623 }
624
625 // FIXME: Avoid specializing for stride==1 instead of bailing out.
626 if (!Legal->getLAI()->getSymbolicStrides().empty()) {
628 "Runtime stride check for small trip count",
629 "runtime stride == 1 checks needed. Enable vectorization of "
630 "this loop without such check by compiling with -Os/-Oz",
631 "CantVersionLoopWithOptForSize", ORE, L);
632 return true;
633 }
634
635 return false;
636}
637
639 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
640}
641
643 // Avoid duplicating work finding in-loop reductions.
644 if (!InLoopReductions.empty())
645 return;
646
647 for (const auto &Reduction : Legal->getReductionVars()) {
648 PHINode *Phi = Reduction.first;
649 const RecurrenceDescriptor &RdxDesc = Reduction.second;
650
651 // Multi-use reductions (e.g., used in FindLastIV patterns) are handled
652 // separately and should not be considered for in-loop reductions.
653 if (RdxDesc.hasUsesOutsideReductionChain())
654 continue;
655
656 // We don't collect reductions that are type promoted (yet).
657 if (RdxDesc.getRecurrenceType() != Phi->getType())
658 continue;
659
660 // In-loop AnyOf and FindIV reductions are not yet supported.
661 RecurKind Kind = RdxDesc.getRecurrenceKind();
665 continue;
666
667 // If the target would prefer this reduction to happen "in-loop", then we
668 // want to record it as such.
670 !TTI.preferInLoopReduction(Kind, Phi->getType()))
671 continue;
672
673 // Check that we can correctly put the reductions into the loop, by
674 // finding the chain of operations that leads from the phi to the loop
675 // exit value.
676 SmallVector<Instruction *, 4> ReductionOperations =
677 RdxDesc.getReductionOpChain(Phi, const_cast<Loop *>(TheLoop));
678 bool InLoop = !ReductionOperations.empty();
679
680 if (InLoop) {
681 InLoopReductions.insert(Phi);
682 // Add the elements to InLoopReductionImmediateChains for cost modelling.
683 Instruction *LastChain = Phi;
684 for (auto *I : ReductionOperations) {
685 InLoopReductionImmediateChains[I] = LastChain;
686 LastChain = I;
687 }
688 }
689 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
690 << " reduction for phi: " << *Phi << "\n");
691 }
692}
693
694bool LoopVectorizationPlanner::isMoreProfitable(const VectorizationFactor &A,
695 const VectorizationFactor &B,
696 const unsigned MaxTripCount,
697 bool HasTail,
698 bool IsEpilogue) const {
699 InstructionCost CostA = A.Cost;
700 InstructionCost CostB = B.Cost;
701
702 // When there is a hint to always prefer scalable vectors, honour that hint.
704 if (A.Width.isScalable() && CostA.isValid() && !B.Width.isScalable() &&
705 !B.Width.isScalar())
706 return true;
707
708 // Improve estimate for the vector width if it is scalable.
709 unsigned EstimatedWidthA = A.Width.getKnownMinValue();
710 unsigned EstimatedWidthB = B.Width.getKnownMinValue();
711 if (std::optional<unsigned> VScale = Config.getVScaleForTuning()) {
712 if (A.Width.isScalable())
713 EstimatedWidthA *= *VScale;
714 if (B.Width.isScalable())
715 EstimatedWidthB *= *VScale;
716 }
717
718 // When optimizing for size choose whichever is smallest, which will be the
719 // one with the smallest cost for the whole loop. On a tie pick the larger
720 // vector width, on the assumption that throughput will be greater.
721 if (Config.CostKind == TTI::TCK_CodeSize)
722 return CostA < CostB ||
723 (CostA == CostB && EstimatedWidthA > EstimatedWidthB);
724
725 // Assume vscale may be larger than 1 (or the value being tuned for),
726 // so that scalable vectorization is slightly favorable over fixed-width
727 // vectorization.
728 bool PreferScalable = !TTI.preferFixedOverScalableIfEqualCost(IsEpilogue) &&
729 A.Width.isScalable() && !B.Width.isScalable();
730
731 auto CmpFn = [PreferScalable](const InstructionCost &LHS,
732 const InstructionCost &RHS) {
733 return PreferScalable ? LHS <= RHS : LHS < RHS;
734 };
735
736 // To avoid the need for FP division:
737 // (CostA / EstimatedWidthA) < (CostB / EstimatedWidthB)
738 // <=> (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA)
739 bool LowerCostWithoutTC =
740 CmpFn(CostA * EstimatedWidthB, CostB * EstimatedWidthA);
741 if (!MaxTripCount)
742 return LowerCostWithoutTC;
743
744 auto GetCostForTC = [MaxTripCount, HasTail](unsigned VF,
745 InstructionCost VectorCost,
746 InstructionCost ScalarCost) {
747 // If the trip count is a known (possibly small) constant, the trip count
748 // will be rounded up to an integer number of iterations under
749 // FoldTailByMasking. The total cost in that case will be
750 // VecCost*ceil(TripCount/VF). When not folding the tail, the total
751 // cost will be VecCost*floor(TC/VF) + ScalarCost*(TC%VF). There will be
752 // some extra overheads, but for the purpose of comparing the costs of
753 // different VFs we can use this to compare the total loop-body cost
754 // expected after vectorization.
755 if (HasTail)
756 return VectorCost * (MaxTripCount / VF) +
757 ScalarCost * (MaxTripCount % VF);
758 return VectorCost * divideCeil(MaxTripCount, VF);
759 };
760
761 auto RTCostA = GetCostForTC(EstimatedWidthA, CostA, A.ScalarCost);
762 auto RTCostB = GetCostForTC(EstimatedWidthB, CostB, B.ScalarCost);
763 bool LowerCostWithTC = CmpFn(RTCostA, RTCostB);
764 LLVM_DEBUG(if (LowerCostWithTC != LowerCostWithoutTC) {
765 dbgs() << "LV: VF " << (LowerCostWithTC ? A.Width : B.Width)
766 << " has lower cost than VF "
767 << (LowerCostWithTC ? B.Width : A.Width)
768 << " when taking the cost of the remaining scalar loop iterations "
769 "into consideration for a maximum trip count of "
770 << MaxTripCount << ".\n";
771 });
772 return LowerCostWithTC;
773}
774
775bool LoopVectorizationPlanner::isMoreProfitable(const VectorizationFactor &A,
776 const VectorizationFactor &B,
777 bool HasTail,
778 bool IsEpilogue) const {
779 const unsigned MaxTripCount = PSE.getSmallConstantMaxTripCount();
780 return LoopVectorizationPlanner::isMoreProfitable(A, B, MaxTripCount, HasTail,
781 IsEpilogue);
782}
783
784// TODO: we could return a pair of values that specify the max VF and
785// min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
786// `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment
787// doesn't have a cost model that can choose which plan to execute if
788// more than one is generated.
791 if (UserVF.isScalable() && !supportsScalableVectors()) {
793 "Scalable vectorization requested but not supported by the target",
794 "the scalable user-specified vectorization width for outer-loop "
795 "vectorization cannot be used because the target does not support "
796 "scalable vectors.",
797 "ScalableVFUnfeasible", ORE, TheLoop);
799 }
800
801 ElementCount VF = UserVF;
802 if (VF.isZero()) {
803 auto [_, WidestType] = getSmallestAndWidestTypes();
804
805 auto RegKind = TTI.enableScalableVectorization()
808
809 TypeSize RegSize = TTI.getRegisterBitWidth(RegKind);
810 unsigned N = RegSize.getKnownMinValue() / WidestType;
811 VF = ElementCount::get(N, RegSize.isScalable());
812 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
813
814 // Make sure we have a VF > 1 for stress testing.
816 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
817 << "overriding computed VF.\n");
819 }
820 }
822 "VF needs to be a power of two");
823 if (VF.isScalar())
825 LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
826 << "VF " << VF << " to build VPlans.\n");
827 return FixedScalableVFPair(VF);
828}
unsigned RegSize
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
#define DEBUG_TYPE
#define _
loop Loop Strength Reduction
This file defines the LoopVectorizationLegality class.
static void debugVectorizationMessage(const StringRef Prefix, const StringRef DebugMsg, Instruction *I)
Write a DebugMsg about vectorization to the debug output stream.
static cl::opt< bool > ForceTargetSupportsGatherScatterOps("force-target-supports-gather-scatter-ops", cl::init(false), cl::Hidden, cl::desc("Assume the target supports gather/scatter operations (used for " "testing)."))
cl::opt< bool > VPlanBuildOuterloopStressTest
static cl::opt< bool > ForceTargetSupportsScalableVectors("force-target-supports-scalable-vectors", cl::init(false), cl::Hidden, cl::desc("Pretend that scalable vectors are supported, even if the target does " "not support them. This flag should only be used for testing."))
static cl::opt< bool > ConsiderRegPressure("vectorizer-consider-reg-pressure", cl::init(false), cl::Hidden, cl::desc("Discard VFs if their register pressure is too high."))
static cl::opt< bool > UseWiderVFIfCallVariantsPresent("vectorizer-maximize-bandwidth-for-vector-calls", cl::init(true), cl::Hidden, cl::desc("Try wider VFs if they enable the use of vector variants"))
static OptimizationRemarkAnalysis createLVAnalysis(StringRef RemarkName, const Loop *TheLoop, Instruction *I, DebugLoc DL={})
Create an analysis remark that explains why vectorization failed RemarkName is the identifier for the...
static cl::opt< bool > ForceTargetSupportsMaskedMemoryOps("force-target-supports-masked-memory-ops", cl::init(false), cl::Hidden, cl::desc("Assume the target supports masked memory operations (used for " "testing)."))
Note: This currently only applies to llvm.masked.load and llvm.masked.store.
static cl::opt< bool > MaximizeBandwidth("vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden, cl::desc("Maximize bandwidth when selecting vectorization factor which " "will be determined by the smallest type in loop."))
This file provides a LoopVectorizationPlanner class.
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
#define T
#define LLVM_DEBUG(...)
Definition Debug.h:119
Value * RHS
Value * LHS
LLVM Basic Block Representation.
Definition BasicBlock.h:62
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
A debug info location.
Definition DebugLoc.h:124
constexpr bool isVector() const
One or more elements.
Definition TypeSize.h:324
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition TypeSize.h:312
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition TypeSize.h:309
static constexpr ElementCount get(ScalarTy MinVal, bool Scalable)
Definition TypeSize.h:315
constexpr bool isScalar() const
Exactly one element.
Definition TypeSize.h:320
bool isInnermost() const
Return true if the loop does not contain any (natural) loops.
BlockT * getHeader() const
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
DebugLoc getStartLoc() const
Return the debug location of the start of this loop.
Definition LoopInfo.cpp:659
Diagnostic information for optimization analysis remarks.
The optimization diagnostic interface.
LLVM_ABI void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for applied optimization remarks.
The RecurrenceDescriptor is used to identify recurrences variables in a loop.
Type * getRecurrenceType() const
Returns the type of the recurrence.
bool hasUsesOutsideReductionChain() const
Returns true if the reduction PHI has any uses outside the reduction chain.
static bool isFindLastRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
LLVM_ABI SmallVector< Instruction *, 4 > getReductionOpChain(PHINode *Phi, Loop *L) const
Attempts to find a chain of operations from Phi to LoopExitInst that can be treated as a set of reduc...
static bool isAnyOfRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
RecurKind getRecurrenceKind() const
bool isOrdered() const
Expose an ordered FP reduction to the instance users.
static bool isFindIVRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
bool contains(ConstPtrType Ptr) const
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Represent a constant reference to a string, i.e.
Definition StringRef.h:56
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
@ TCK_CodeSize
Instruction code size.
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:141
FixedScalableVFPair computeVPlanOuterloopVF(ElementCount UserVF)
Returns a scalable VF to use for outer-loop vectorization if the target supports it and a fixed VF ot...
std::pair< unsigned, unsigned > getSmallestAndWidestTypes() const
const TTI::TargetCostKind CostKind
The kind of cost that we are calculating.
bool runtimeChecksRequired()
Check whether vectorization would require runtime checks.
bool isLegalGatherOrScatter(Value *V, ElementCount VF) const
Returns true if the target machine can represent V as a masked gather or scatter operation.
void collectInLoopReductions()
Split reductions into those that happen in the loop, and those that happen outside.
FixedScalableVFPair computeFeasibleMaxVF(unsigned MaxTripCount, ElementCount UserVF, unsigned UserIC, bool FoldTailByMasking, bool RequiresScalarEpilogue)
bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const
Returns true if we should use strict in-order reductions for the given RdxDesc.
bool shouldConsiderRegPressureForVF(ElementCount VF) const
void collectElementTypesForWidening(const SmallPtrSetImpl< const Value * > *ValuesToIgnore=nullptr)
Collect element types in the loop that need widening.
bool isLegalMaskedLoadOrStore(Instruction *I, ElementCount VF) const
Returns true if the target machine supports masked loads or stores for I's data type and alignment.
std::optional< unsigned > getVScaleForTuning() const
void computeMinimalBitwidths()
Compute smallest bitwidth each instruction can be represented with.
LLVM Value Representation.
Definition Value.h:75
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
static constexpr bool isKnownLE(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition TypeSize.h:230
static constexpr bool isKnownLT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition TypeSize.h:216
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:168
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
constexpr bool isZero() const
Definition TypeSize.h:153
static constexpr bool isKnownGT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition TypeSize.h:223
void reportVectorizationFailure(const StringRef DebugMsg, const StringRef OREMsg, const StringRef ORETag, OptimizationRemarkEmitter *ORE, const Loop *TheLoop, Instruction *I=nullptr)
Reports a vectorization failure: print DebugMsg for debugging purposes along with the corresponding o...
void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag, OptimizationRemarkEmitter *ORE, const Loop *TheLoop, Instruction *I=nullptr, DebugLoc DL={})
Reports an informative message: print Msg for debugging purposes as well as an optimization remark.
void reportVectorization(OptimizationRemarkEmitter *ORE, Loop *TheLoop, ElementCount VFWidth, unsigned IC)
Report successful vectorization of the loop.
initializer< Ty > init(const Ty &Val)
DiagnosticInfoOptimizationBase::Argument NV
This is an optimization pass for GlobalISel generic memory operations.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1738
unsigned getLoadStoreAddressSpace(const Value *I)
A helper function that returns the address space of the pointer operand of load or store instruction.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
Align getLoadStoreAlignment(const Value *I)
A helper function that returns the alignment of load or store instruction.
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1745
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:209
std::optional< unsigned > getMaxVScale(const Function &F, const TargetTransformInfo &TTI)
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
TargetTransformInfo TTI
RecurKind
These are the kinds of recurrences that we support.
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition bit.h:347
LLVM_ABI MapVector< Instruction *, uint64_t > computeMinimumValueSizes(ArrayRef< BasicBlock * > Blocks, DemandedBits &DB, const TargetTransformInfo *TTI=nullptr)
Compute a map of integer instructions to their minimum legal type size.
cl::opt< bool > PreferInLoopReductions
#define N
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
A class that represents two vectorization factors (initialized with 0 by default).
static FixedScalableVFPair getNone()
TODO: The following VectorizationFactor was pulled out of LoopVectorizationCostModel class.