1//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10// and generates target-independent LLVM-IR.
11// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12// of instructions in order to estimate the profitability of vectorization.
13//
14// The loop vectorizer combines consecutive loop iterations into a single
15// 'wide' iteration. After this transformation the index is incremented
16// by the SIMD vector width, and not by one.
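//
// For illustration (example not taken from this file), a scalar loop such as
//
//   for (int i = 0; i < n; ++i)
//     a[i] = b[i] + c[i];
//
// is conceptually rewritten, for a vectorization factor of 4, into
//
//   int i = 0;
//   for (; i + 3 < n; i += 4) {       // one 'wide' iteration per 4 elements,
//     a[i]   = b[i]   + c[i];         // executed as a single SIMD add
//     a[i+1] = b[i+1] + c[i+1];
//     a[i+2] = b[i+2] + c[i+2];
//     a[i+3] = b[i+3] + c[i+3];
//   }
//   for (; i < n; ++i)                // scalar epilogue for the remainder
//     a[i] = b[i] + c[i];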
17//
18// This pass has four parts:
19// 1. The main loop pass that drives the different parts.
20// 2. LoopVectorizationLegality - A unit that checks for the legality
21// of the vectorization.
22// 3. InnerLoopVectorizer - A unit that performs the actual
23// widening of instructions.
24// 4. LoopVectorizationCostModel - A unit that checks for the profitability
25// of vectorization. It decides on the optimal vector width, which
26// can be one, if vectorization is not profitable.
27//
28// There is a development effort going on to migrate the loop vectorizer to the
29// VPlan infrastructure and to introduce outer loop vectorization support (see
30// docs/VectorizationPlan.rst and
31// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32// purpose, we temporarily introduced the VPlan-native vectorization path: an
33// alternative vectorization path that is natively implemented on top of the
34// VPlan infrastructure. See EnableVPlanNativePath for enabling.
35//
36//===----------------------------------------------------------------------===//
37//
38// The reduction-variable vectorization is based on the paper:
39// D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40//
41// Variable uniformity checks are inspired by:
42// Karrenberg, R. and Hack, S. Whole Function Vectorization.
43//
44// The interleaved access vectorization is based on the paper:
45// Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
46// Data for SIMD
47//
48// Other ideas/concepts are from:
49// A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50//
51// S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
52// Vectorizing Compilers.
53//
54//===----------------------------------------------------------------------===//
55
58#include "VPRecipeBuilder.h"
59#include "VPlan.h"
60#include "VPlanAnalysis.h"
61#include "VPlanCFG.h"
62#include "VPlanHelpers.h"
63#include "VPlanPatternMatch.h"
64#include "VPlanTransforms.h"
65#include "VPlanUtils.h"
66#include "VPlanVerifier.h"
67#include "llvm/ADT/APInt.h"
68#include "llvm/ADT/ArrayRef.h"
69#include "llvm/ADT/DenseMap.h"
71#include "llvm/ADT/Hashing.h"
72#include "llvm/ADT/MapVector.h"
73#include "llvm/ADT/STLExtras.h"
76#include "llvm/ADT/Statistic.h"
77#include "llvm/ADT/StringRef.h"
78#include "llvm/ADT/Twine.h"
79#include "llvm/ADT/TypeSwitch.h"
84#include "llvm/Analysis/CFG.h"
101#include "llvm/IR/Attributes.h"
102#include "llvm/IR/BasicBlock.h"
103#include "llvm/IR/CFG.h"
104#include "llvm/IR/Constant.h"
105#include "llvm/IR/Constants.h"
106#include "llvm/IR/DataLayout.h"
107#include "llvm/IR/DebugInfo.h"
108#include "llvm/IR/DebugLoc.h"
109#include "llvm/IR/DerivedTypes.h"
111#include "llvm/IR/Dominators.h"
112#include "llvm/IR/Function.h"
113#include "llvm/IR/IRBuilder.h"
114#include "llvm/IR/InstrTypes.h"
115#include "llvm/IR/Instruction.h"
116#include "llvm/IR/Instructions.h"
118#include "llvm/IR/Intrinsics.h"
119#include "llvm/IR/MDBuilder.h"
120#include "llvm/IR/Metadata.h"
121#include "llvm/IR/Module.h"
122#include "llvm/IR/Operator.h"
123#include "llvm/IR/PatternMatch.h"
125#include "llvm/IR/Type.h"
126#include "llvm/IR/Use.h"
127#include "llvm/IR/User.h"
128#include "llvm/IR/Value.h"
129#include "llvm/IR/Verifier.h"
130#include "llvm/Support/Casting.h"
132#include "llvm/Support/Debug.h"
147#include <algorithm>
148#include <cassert>
149#include <cstdint>
150#include <functional>
151#include <iterator>
152#include <limits>
153#include <memory>
154#include <string>
155#include <tuple>
156#include <utility>
157
158using namespace llvm;
159using namespace SCEVPatternMatch;
160
161#define LV_NAME "loop-vectorize"
162#define DEBUG_TYPE LV_NAME
163
164#ifndef NDEBUG
165const char VerboseDebug[] = DEBUG_TYPE "-verbose";
166#endif
167
168STATISTIC(LoopsVectorized, "Number of loops vectorized");
169STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
170STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
171STATISTIC(LoopsEarlyExitVectorized, "Number of early exit loops vectorized");
172
174 "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
175 cl::desc("Enable vectorization of epilogue loops."));
176
178 "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
179 cl::desc("When epilogue vectorization is enabled, and a value greater than "
180 "1 is specified, forces the given VF for all applicable epilogue "
181 "loops."));
182
184 "epilogue-vectorization-minimum-VF", cl::Hidden,
185 cl::desc("Only loops with vectorization factor equal to or larger than "
186 "the specified value are considered for epilogue vectorization."));
187
188/// Loops with a known constant trip count below this number are vectorized only
189/// if no scalar iteration overheads are incurred.
191 "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
192 cl::desc("Loops with a constant trip count that is smaller than this "
193 "value are vectorized only if no scalar iteration overheads "
194 "are incurred."));
195
197 "vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
198 cl::desc("The maximum allowed number of runtime memory checks"));
199
200// Option prefer-predicate-over-epilogue indicates that an epilogue is undesired
201// and that predication is preferred; the values below list all options. I.e., the
202// vectorizer will try to fold the tail-loop (epilogue) into the vector body
203// and predicate the instructions accordingly. If tail-folding fails, there are
204// different fallback strategies depending on these values:
211} // namespace PreferPredicateTy
212
214 "prefer-predicate-over-epilogue",
217 cl::desc("Tail-folding and predication preferences over creating a scalar "
218 "epilogue loop."),
220 "scalar-epilogue",
221 "Don't tail-predicate loops, create scalar epilogue"),
223 "predicate-else-scalar-epilogue",
224 "prefer tail-folding, create scalar epilogue if tail "
225 "folding fails."),
227 "predicate-dont-vectorize",
228 "prefers tail-folding, don't attempt vectorization if "
229 "tail-folding fails.")));
230
232 "force-tail-folding-style", cl::desc("Force the tail folding style"),
235 clEnumValN(TailFoldingStyle::None, "none", "Disable tail folding"),
238 "Create lane mask for data only, using active.lane.mask intrinsic"),
240 "data-without-lane-mask",
241 "Create lane mask with compare/stepvector"),
243 "Create lane mask using active.lane.mask intrinsic, and use "
244 "it for both data and control flow"),
246 "data-and-control-without-rt-check",
247 "Similar to data-and-control, but remove the runtime check"),
249 "Use predicated EVL instructions for tail folding. If EVL "
250 "is unsupported, fallback to data-without-lane-mask.")));
251
253 "enable-wide-lane-mask", cl::init(false), cl::Hidden,
254 cl::desc("Enable use of wide lane masks when used for control flow in "
255 "tail-folded loops"));
256
258 "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
259 cl::desc("Maximize bandwidth when selecting vectorization factor which "
260 "will be determined by the smallest type in loop."));
261
263 "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
264 cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
265
266/// An interleave-group may need masking if it resides in a block that needs
267/// predication, or in order to mask away gaps.
269 "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
270 cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
271
273 "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
274 cl::desc("A flag that overrides the target's number of scalar registers."));
275
277 "force-target-num-vector-regs", cl::init(0), cl::Hidden,
278 cl::desc("A flag that overrides the target's number of vector registers."));
279
281 "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
282 cl::desc("A flag that overrides the target's max interleave factor for "
283 "scalar loops."));
284
286 "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
287 cl::desc("A flag that overrides the target's max interleave factor for "
288 "vectorized loops."));
289
291 "force-target-instruction-cost", cl::init(0), cl::Hidden,
292 cl::desc("A flag that overrides the target's expected cost for "
293 "an instruction to a single constant value. Mostly "
294 "useful for getting consistent testing."));
295
297 "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
298 cl::desc(
299 "Pretend that scalable vectors are supported, even if the target does "
300 "not support them. This flag should only be used for testing."));
301
303 "small-loop-cost", cl::init(20), cl::Hidden,
304 cl::desc(
305 "The cost of a loop that is considered 'small' by the interleaver."));
306
308 "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
309 cl::desc("Enable the use of the block frequency analysis to access PGO "
310 "heuristics minimizing code growth in cold regions and being more "
311 "aggressive in hot regions."));
312
313// Runtime interleave loops for load/store throughput.
315 "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
316 cl::desc(
317 "Enable runtime interleaving until load/store ports are saturated"));
318
319/// The number of stores in a loop that are allowed to need predication.
321 "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
322 cl::desc("Max number of stores to be predicated behind an if."));
323
325 "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
326 cl::desc("Count the induction variable only once when interleaving"));
327
329 "enable-cond-stores-vec", cl::init(true), cl::Hidden,
330 cl::desc("Enable if predication of stores during vectorization."));
331
333 "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
334 cl::desc("The maximum interleave count to use when interleaving a scalar "
335 "reduction in a nested loop."));
336
337static cl::opt<bool>
338 PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
340 cl::desc("Prefer in-loop vector reductions, "
341 "overriding the targets preference."));
342
344 "force-ordered-reductions", cl::init(false), cl::Hidden,
345 cl::desc("Enable the vectorization of loops with in-order (strict) "
346 "FP reductions"));
347
349 "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
350 cl::desc(
351 "Prefer predicating a reduction operation over an after loop select."));
352
354 "enable-vplan-native-path", cl::Hidden,
355 cl::desc("Enable VPlan-native vectorization path with "
356 "support for outer loop vectorization."));
357
359 llvm::VerifyEachVPlan("vplan-verify-each",
360#ifdef EXPENSIVE_CHECKS
361 cl::init(true),
362#else
363 cl::init(false),
364#endif
366 cl::desc("Verify VPlans after VPlan transforms."));
367
368// This flag enables the stress testing of the VPlan H-CFG construction in the
369// VPlan-native vectorization path. It must be used in conjunction with
370// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
371// verification of the H-CFGs built.
373 "vplan-build-stress-test", cl::init(false), cl::Hidden,
374 cl::desc(
375 "Build VPlan for every supported loop nest in the function and bail "
376 "out right after the build (stress test the VPlan H-CFG construction "
377 "in the VPlan-native vectorization path)."));
378
380 "interleave-loops", cl::init(true), cl::Hidden,
381 cl::desc("Enable loop interleaving in Loop vectorization passes"));
383 "vectorize-loops", cl::init(true), cl::Hidden,
384 cl::desc("Run the Loop vectorization passes"));
385
387 "force-widen-divrem-via-safe-divisor", cl::Hidden,
388 cl::desc(
389 "Override cost based safe divisor widening for div/rem instructions"));
390
392 "vectorizer-maximize-bandwidth-for-vector-calls", cl::init(true),
394 cl::desc("Try wider VFs if they enable the use of vector variants"));
395
397 "enable-early-exit-vectorization", cl::init(true), cl::Hidden,
398 cl::desc(
399 "Enable vectorization of early exit loops with uncountable exits."));
400
402 "vectorizer-consider-reg-pressure", cl::init(false), cl::Hidden,
403 cl::desc("Discard VFs if their register pressure is too high."));
404
405// Likelihood of bypassing the vectorized loop because there are zero trips left
406// after prolog. See `emitIterationCountCheck`.
407static constexpr uint32_t MinItersBypassWeights[] = {1, 127};
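// Interpreted as branch weights, {1, 127} means the bypass path is expected to
// be taken roughly once per 128 executions, i.e. the vector loop is assumed to
// run in the vast majority of cases.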
408
409/// A helper function that returns true if the given type is irregular. The
410/// type is irregular if its allocated size doesn't equal the store size of an
411/// element of the corresponding vector type.
412static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
413 // Determine if an array of N elements of type Ty is "bitcast compatible"
414 // with a <N x Ty> vector.
415 // This is only true if there is no padding between the array elements.
416 return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
417}
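// Illustrative example (not part of the original source): for i1 the allocated
// size is 8 bits while the type size is 1 bit, so an array of i1 is not
// bitcast-compatible with a <N x i1> vector and the type is irregular; for i32
// both sizes are 32 bits and the type is regular.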
418
419/// A version of ScalarEvolution::getSmallConstantTripCount that returns an
420/// ElementCount to include loops whose trip count is a function of vscale.
422 const Loop *L) {
423 if (unsigned ExpectedTC = SE->getSmallConstantTripCount(L))
424 return ElementCount::getFixed(ExpectedTC);
425
426 const SCEV *BTC = SE->getBackedgeTakenCount(L);
428 return ElementCount::getFixed(0);
429
430 const SCEV *ExitCount = SE->getTripCountFromExitCount(BTC, BTC->getType(), L);
431 if (isa<SCEVVScale>(ExitCount))
433
434 const APInt *Scale;
435 if (match(ExitCount, m_scev_Mul(m_scev_APInt(Scale), m_SCEVVScale())))
436 if (cast<SCEVMulExpr>(ExitCount)->hasNoUnsignedWrap())
437 if (Scale->getActiveBits() <= 32)
439
440 return ElementCount::getFixed(0);
441}
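// Sketch of the intent (an assumption based on the documentation above; the
// return expressions are elided in this listing): a trip count of the form
// (4 * vscale) with no unsigned wrap would be reported as a scalable
// ElementCount with a known minimum of 4, while counts that are neither
// constant nor a small multiple of vscale fall through to
// ElementCount::getFixed(0).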
442
443/// Returns "best known" trip count, which is either a valid positive trip count
444/// or std::nullopt when an estimate cannot be made (including when the trip
445/// count would overflow), for the specified loop \p L as defined by the
446/// following procedure:
447/// 1) Returns exact trip count if it is known.
448/// 2) Returns expected trip count according to profile data if any.
449/// 3) Returns upper bound estimate if known, and if \p CanUseConstantMax.
450/// 4) Returns std::nullopt if all of the above failed.
451static std::optional<ElementCount>
453 bool CanUseConstantMax = true) {
454 // Check if exact trip count is known.
455 if (auto ExpectedTC = getSmallConstantTripCount(PSE.getSE(), L))
456 return ExpectedTC;
457
458 // Check if there is an expected trip count available from profile data.
460 if (auto EstimatedTC = getLoopEstimatedTripCount(L))
461 return ElementCount::getFixed(*EstimatedTC);
462
463 if (!CanUseConstantMax)
464 return std::nullopt;
465
466 // Check if upper bound estimate is known.
467 if (unsigned ExpectedTC = PSE.getSmallConstantMaxTripCount())
468 return ElementCount::getFixed(ExpectedTC);
469
470 return std::nullopt;
471}
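// Example (illustrative): a loop with no exact trip count but with profile
// data estimating 1000 iterations returns ElementCount::getFixed(1000) when
// profile-based estimation is enabled; with CanUseConstantMax == false and no
// profile estimate, std::nullopt is returned even if a constant upper bound is
// known.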
472
473namespace {
474// Forward declare GeneratedRTChecks.
475class GeneratedRTChecks;
476
477using SCEV2ValueTy = DenseMap<const SCEV *, Value *>;
478} // namespace
479
480namespace llvm {
481
483
484/// InnerLoopVectorizer vectorizes loops which contain only one basic
485/// block to a specified vectorization factor (VF).
486/// This class performs the widening of scalars into vectors, or multiple
487/// scalars. This class also implements the following features:
488/// * It inserts an epilogue loop for handling loops that don't have iteration
489/// counts that are known to be a multiple of the vectorization factor.
490/// * It handles the code generation for reduction variables.
491/// * Scalarization (implementation using scalars) of un-vectorizable
492/// instructions.
493/// InnerLoopVectorizer does not perform any vectorization-legality
494/// checks, and relies on the caller to check for the different legality
495/// aspects. The InnerLoopVectorizer relies on the
496/// LoopVectorizationLegality class to provide information about the induction
497/// and reduction variables that were found to a given vectorization factor.
499public:
503 ElementCount VecWidth, unsigned UnrollFactor,
505 ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks,
506 VPlan &Plan)
507 : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TTI(TTI), AC(AC),
508 VF(VecWidth), UF(UnrollFactor), Builder(PSE.getSE()->getContext()),
511 Plan.getVectorLoopRegion()->getSinglePredecessor())) {}
512
513 virtual ~InnerLoopVectorizer() = default;
514
515 /// Creates a basic block for the scalar preheader. Both
516 /// EpilogueVectorizerMainLoop and EpilogueVectorizerEpilogueLoop overwrite
517 /// the method to create additional blocks and checks needed for epilogue
518 /// vectorization.
520
521 /// Fix the vectorized code, taking care of header phi's, and more.
523
524 /// Fix the non-induction PHIs in \p Plan.
526
527 /// Returns the original loop trip count.
528 Value *getTripCount() const { return TripCount; }
529
530 /// Used to set the trip count after ILV's construction and after the
531 /// preheader block has been executed. Note that this always holds the trip
532 /// count of the original loop for both main loop and epilogue vectorization.
533 void setTripCount(Value *TC) { TripCount = TC; }
534
535protected:
537
538 /// Create and return a new IR basic block for the scalar preheader whose name
539 /// is prefixed with \p Prefix.
541
542 /// Allow subclasses to override and print debug traces before/after vplan
543 /// execution, when trace information is requested.
544 virtual void printDebugTracesAtStart() {}
545 virtual void printDebugTracesAtEnd() {}
546
547 /// The original loop.
549
550 /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
551 /// dynamic knowledge to simplify SCEV expressions and converts them to a
552 /// more usable form.
554
555 /// Loop Info.
557
558 /// Dominator Tree.
560
561 /// Target Transform Info.
563
564 /// Assumption Cache.
566
567 /// The vectorization SIMD factor to use. Each vector will have this many
568 /// vector elements.
570
571 /// The vectorization unroll factor to use. Each scalar is vectorized to this
572 /// many different vector instructions.
573 unsigned UF;
574
575 /// The builder that we use
577
578 // --- Vectorization state ---
579
580 /// Trip count of the original loop.
581 Value *TripCount = nullptr;
582
583 /// The profitability analysis.
585
586 /// BFI and PSI are used to check for profile guided size optimizations.
589
590 /// Structure to hold information about generated runtime checks, responsible
591 /// for cleaning the checks, if vectorization turns out unprofitable.
592 GeneratedRTChecks &RTChecks;
593
595
596 /// The vector preheader block of \p Plan, used as target for check blocks
597 /// introduced during skeleton creation.
599};
600
601/// Encapsulate information regarding vectorization of a loop and its epilogue.
602/// This information is meant to be updated and used across two stages of
603/// epilogue vectorization.
606 unsigned MainLoopUF = 0;
608 unsigned EpilogueUF = 0;
611 Value *TripCount = nullptr;
614
616 ElementCount EVF, unsigned EUF,
618 : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF),
620 assert(EUF == 1 &&
621 "A high UF for the epilogue loop is likely not beneficial.");
622 }
623};
624
625/// An extension of the inner loop vectorizer that creates a skeleton for a
626/// vectorized loop that has its epilogue (residual) also vectorized.
627/// The idea is to run the VPlan on a given loop twice: first to set up the
628/// skeleton and vectorize the main loop, and second to complete the skeleton
629/// from the first step and vectorize the epilogue. This is achieved by
630/// deriving two concrete strategy classes from this base class and invoking
631/// them in succession from the loop vectorizer planner.
633public:
644
645 /// Holds and updates state information required to vectorize the main loop
646 /// and its epilogue in two separate passes. This setup helps us avoid
647 /// regenerating and recomputing runtime safety checks. It also helps us to
648 /// shorten the iteration-count-check path length for the cases where the
649 /// iteration count of the loop is so small that the main vector loop is
650 /// completely skipped.
652
653protected:
655};
656
657/// A specialized derived class of inner loop vectorizer that performs
658/// vectorization of *main* loops in the process of vectorizing loops and their
659/// epilogues.
661public:
673 /// Implements the interface for creating a vectorized skeleton using the
674 /// *main loop* strategy (i.e., the first pass of VPlan execution).
676
677protected:
678 /// Introduces a new VPIRBasicBlock for \p CheckIRBB to Plan between the
679 /// vector preheader and its predecessor, also connecting the new block to the
680 /// scalar preheader.
681 void introduceCheckBlockInVPlan(BasicBlock *CheckIRBB);
682
683 // Create a check to see if the main vector loop should be executed
685 unsigned UF) const;
686
687 /// Emits an iteration count bypass check once for the main loop (when \p
688 /// ForEpilogue is false) and once for the epilogue loop (when \p
689 /// ForEpilogue is true).
691 bool ForEpilogue);
692 void printDebugTracesAtStart() override;
693 void printDebugTracesAtEnd() override;
694};
695
696// A specialized derived class of inner loop vectorizer that performs
697// vectorization of *epilogue* loops in the process of vectorizing loops and
698// their epilogues.
700public:
710 /// Implements the interface for creating a vectorized skeleton using the
711 /// *epilogue loop* strategy (i.e., the second pass of VPlan execution).
713
714protected:
715 void printDebugTracesAtStart() override;
716 void printDebugTracesAtEnd() override;
717};
718} // end namespace llvm
719
720/// Look for a meaningful debug location on the instruction or its operands.
722 if (!I)
723 return DebugLoc::getUnknown();
724
726 if (I->getDebugLoc() != Empty)
727 return I->getDebugLoc();
728
729 for (Use &Op : I->operands()) {
730 if (Instruction *OpInst = dyn_cast<Instruction>(Op))
731 if (OpInst->getDebugLoc() != Empty)
732 return OpInst->getDebugLoc();
733 }
734
735 return I->getDebugLoc();
736}
737
738/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
739/// is passed, the message relates to that particular instruction.
740#ifndef NDEBUG
741static void debugVectorizationMessage(const StringRef Prefix,
742 const StringRef DebugMsg,
743 Instruction *I) {
744 dbgs() << "LV: " << Prefix << DebugMsg;
745 if (I != nullptr)
746 dbgs() << " " << *I;
747 else
748 dbgs() << '.';
749 dbgs() << '\n';
750}
751#endif
752
753/// Create an analysis remark that explains why vectorization failed
754///
755/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
756/// RemarkName is the identifier for the remark. If \p I is passed it is an
757/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
758/// the location of the remark. If \p DL is passed, use it as debug location for
759/// the remark. \return the remark object that can be streamed to.
760static OptimizationRemarkAnalysis
761createLVAnalysis(const char *PassName, StringRef RemarkName, Loop *TheLoop,
762 Instruction *I, DebugLoc DL = {}) {
763 BasicBlock *CodeRegion = I ? I->getParent() : TheLoop->getHeader();
764 // If debug location is attached to the instruction, use it. Otherwise if DL
765 // was not provided, use the loop's.
766 if (I && I->getDebugLoc())
767 DL = I->getDebugLoc();
768 else if (!DL)
769 DL = TheLoop->getStartLoc();
770
771 return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
772}
773
774namespace llvm {
775
776/// Return a value for Step multiplied by VF.
778 int64_t Step) {
779 assert(Ty->isIntegerTy() && "Expected an integer step");
780 ElementCount VFxStep = VF.multiplyCoefficientBy(Step);
781 assert(isPowerOf2_64(VF.getKnownMinValue()) && "must pass power-of-2 VF");
782 if (VF.isScalable() && isPowerOf2_64(Step)) {
783 return B.CreateShl(
784 B.CreateVScale(Ty),
785 ConstantInt::get(Ty, Log2_64(VFxStep.getKnownMinValue())), "", true);
786 }
787 return B.CreateElementCount(Ty, VFxStep);
788}
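// Example (illustrative): for VF = <vscale x 4> and Step = 2, VFxStep is
// <vscale x 8>; since VF is scalable and Step is a power of two, the value is
// emitted as (vscale << 3), i.e. 8 * vscale at runtime.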
789
790/// Return the runtime value for VF.
792 return B.CreateElementCount(Ty, VF);
793}
794
796 const StringRef OREMsg, const StringRef ORETag,
797 OptimizationRemarkEmitter *ORE, Loop *TheLoop,
798 Instruction *I) {
799 LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
800 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
801 ORE->emit(
802 createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
803 << "loop not vectorized: " << OREMsg);
804}
805
806/// Reports an informative message: print \p Msg for debugging purposes as well
807/// as an optimization remark. Uses either \p I as location of the remark, or
808/// otherwise \p TheLoop. If \p DL is passed, use it as debug location for the
809/// remark.
810static void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
812 Loop *TheLoop, Instruction *I = nullptr,
813 DebugLoc DL = {}) {
815 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
816 ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop,
817 I, DL)
818 << Msg);
819}
820
821/// Report successful vectorization of the loop. In case an outer loop is
822/// vectorized, prepend "outer" to the vectorization remark.
824 VectorizationFactor VF, unsigned IC) {
826 "Vectorizing: ", TheLoop->isInnermost() ? "innermost loop" : "outer loop",
827 nullptr));
828 StringRef LoopType = TheLoop->isInnermost() ? "" : "outer ";
829 ORE->emit([&]() {
830 return OptimizationRemark(LV_NAME, "Vectorized", TheLoop->getStartLoc(),
831 TheLoop->getHeader())
832 << "vectorized " << LoopType << "loop (vectorization width: "
833 << ore::NV("VectorizationFactor", VF.Width)
834 << ", interleaved count: " << ore::NV("InterleaveCount", IC) << ")";
835 });
836}
837
838} // end namespace llvm
839
840namespace llvm {
841
842// Loop vectorization cost-model hints how the scalar epilogue loop should be
843// lowered.
845
846 // The default: allowing scalar epilogues.
848
849 // Vectorization with OptForSize: don't allow epilogues.
851
852 // A special case of vectorisation with OptForSize: loops with a very small
853 // trip count are considered for vectorization under OptForSize, thereby
854 // making sure the cost of their loop body is dominant, free of runtime
855 // guards and scalar iteration overheads.
857
858 // Loop hint predicate indicating an epilogue is undesired.
860
861 // Directive indicating we must either tail fold or not vectorize
863};
864
865/// LoopVectorizationCostModel - estimates the expected speedups due to
866/// vectorization.
867/// In many cases vectorization is not profitable. This can happen because of
868/// a number of reasons. In this class we mainly attempt to predict the
869/// expected speedup/slowdowns due to the supported instruction set. We use the
870/// TargetTransformInfo to query the different backends for the cost of
871/// different operations.
874
875public:
886 : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
887 TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
888 Hints(Hints), InterleaveInfo(IAI) {
889 if (TTI.supportsScalableVectors() || ForceTargetSupportsScalableVectors)
890 initializeVScaleForTuning();
892 // Query this against the original loop and save it here because the profile
893 // of the original loop header may change as the transformation happens.
894 OptForSize = llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
896 }
897
898 /// \return An upper bound for the vectorization factors (both fixed and
899 /// scalable). If the factors are 0, vectorization and interleaving should be
900 /// avoided up front.
901 FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);
902
903 /// \return True if runtime checks are required for vectorization, and false
904 /// otherwise.
905 bool runtimeChecksRequired();
906
907 /// Setup cost-based decisions for user vectorization factor.
908 /// \return true if the UserVF is a feasible VF to be chosen.
911 return expectedCost(UserVF).isValid();
912 }
913
914 /// \return True if maximizing vector bandwidth is enabled by the target or
915 /// user options, for the given register kind.
916 bool useMaxBandwidth(TargetTransformInfo::RegisterKind RegKind);
917
918 /// \return True if register pressure should be considered for the given VF.
919 bool shouldConsiderRegPressureForVF(ElementCount VF);
920
921 /// \return The size (in bits) of the smallest and widest types in the code
922 /// that needs to be vectorized. We ignore values that remain scalar such as
923 /// 64 bit loop indices.
924 std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
925
926 /// Memory access instruction may be vectorized in more than one way.
927 /// Form of instruction after vectorization depends on cost.
928 /// This function takes cost-based decisions for Load/Store instructions
929 /// and collects them in a map. This decisions map is used for building
930 /// the lists of loop-uniform and loop-scalar instructions.
931 /// The calculated cost is saved with widening decision in order to
932 /// avoid redundant calculations.
933 void setCostBasedWideningDecision(ElementCount VF);
934
935 /// A call may be vectorized in different ways depending on whether we have
936 /// vectorized variants available and whether the target supports masking.
937 /// This function analyzes all calls in the function at the supplied VF,
938 /// makes a decision based on the costs of available options, and stores that
939 /// decision in a map for use in planning and plan execution.
940 void setVectorizedCallDecision(ElementCount VF);
941
942 /// Collect values we want to ignore in the cost model.
943 void collectValuesToIgnore();
944
945 /// Collect all element types in the loop for which widening is needed.
946 void collectElementTypesForWidening();
947
948 /// Split reductions into those that happen in the loop, and those that happen
949 /// outside. In-loop reductions are collected into InLoopReductions.
950 void collectInLoopReductions();
951
952 /// Returns true if we should use strict in-order reductions for the given
953 /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
954 /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
955 /// of FP operations.
956 bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const {
957 return !Hints->allowReordering() && RdxDesc.isOrdered();
958 }
959
960 /// \returns The smallest bitwidth each instruction can be represented with.
961 /// The vector equivalents of these instructions should be truncated to this
962 /// type.
964 return MinBWs;
965 }
966
967 /// \returns True if it is more profitable to scalarize instruction \p I for
968 /// vectorization factor \p VF.
970 assert(VF.isVector() &&
971 "Profitable to scalarize relevant only for VF > 1.");
972 assert(
973 TheLoop->isInnermost() &&
974 "cost-model should not be used for outer loops (in VPlan-native path)");
975
976 auto Scalars = InstsToScalarize.find(VF);
977 assert(Scalars != InstsToScalarize.end() &&
978 "VF not yet analyzed for scalarization profitability");
979 return Scalars->second.contains(I);
980 }
981
982 /// Returns true if \p I is known to be uniform after vectorization.
984 assert(
985 TheLoop->isInnermost() &&
986 "cost-model should not be used for outer loops (in VPlan-native path)");
987 // Pseudo probe needs to be duplicated for each unrolled iteration and
988 // vector lane so that profiled loop trip count can be accurately
989 // accumulated instead of being under counted.
991 return false;
992
993 if (VF.isScalar())
994 return true;
995
996 auto UniformsPerVF = Uniforms.find(VF);
997 assert(UniformsPerVF != Uniforms.end() &&
998 "VF not yet analyzed for uniformity");
999 return UniformsPerVF->second.count(I);
1000 }
1001
1002 /// Returns true if \p I is known to be scalar after vectorization.
1004 assert(
1005 TheLoop->isInnermost() &&
1006 "cost-model should not be used for outer loops (in VPlan-native path)");
1007 if (VF.isScalar())
1008 return true;
1009
1010 auto ScalarsPerVF = Scalars.find(VF);
1011 assert(ScalarsPerVF != Scalars.end() &&
1012 "Scalar values are not calculated for VF");
1013 return ScalarsPerVF->second.count(I);
1014 }
1015
1016 /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1017 /// for vectorization factor \p VF.
1019 // Truncs must truncate at most to their destination type.
1020 if (isa_and_nonnull<TruncInst>(I) && MinBWs.contains(I) &&
1021 I->getType()->getScalarSizeInBits() < MinBWs.lookup(I))
1022 return false;
1023 return VF.isVector() && MinBWs.contains(I) &&
1024 !isProfitableToScalarize(I, VF) &&
1026 }
1027
1028 /// Decision that was taken during cost calculation for memory instruction.
1031 CM_Widen, // For consecutive accesses with stride +1.
1032 CM_Widen_Reverse, // For consecutive accesses with stride -1.
1038 };
1039
1040 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1041 /// instruction \p I and vector width \p VF.
1044 assert(VF.isVector() && "Expected VF >=2");
1045 WideningDecisions[{I, VF}] = {W, Cost};
1046 }
1047
1048 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1049 /// interleaving group \p Grp and vector width \p VF.
1053 assert(VF.isVector() && "Expected VF >=2");
1054 /// Broadcast this decision to all instructions inside the group.
1055 /// When interleaving, the cost will only be assigned to one instruction: the
1056 /// insert position. For other cases, add the appropriate fraction of the
1057 /// total cost to each instruction. This ensures accurate costs are used,
1058 /// even if the insert position instruction is not used.
1059 InstructionCost InsertPosCost = Cost;
1060 InstructionCost OtherMemberCost = 0;
1061 if (W != CM_Interleave)
1062 OtherMemberCost = InsertPosCost = Cost / Grp->getNumMembers();
1064 for (unsigned Idx = 0; Idx < Grp->getFactor(); ++Idx) {
1065 if (auto *I = Grp->getMember(Idx)) {
1066 if (Grp->getInsertPos() == I)
1067 WideningDecisions[{I, VF}] = {W, InsertPosCost};
1068 else
1069 WideningDecisions[{I, VF}] = {W, OtherMemberCost};
1070 }
1071 }
1072 }
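// For example (illustrative), given a group with 4 members and a total Cost of
// 8: a CM_Interleave decision records the full cost of 8 on the insert-position
// member and 0 on the others, while any other decision records 8 / 4 = 2 on
// every member.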
1073
1074 /// Return the cost model decision for the given instruction \p I and vector
1075 /// width \p VF. Return CM_Unknown if this instruction did not pass
1076 /// through the cost modeling.
1078 assert(VF.isVector() && "Expected VF to be a vector VF");
1079 assert(
1080 TheLoop->isInnermost() &&
1081 "cost-model should not be used for outer loops (in VPlan-native path)");
1082
1083 std::pair<Instruction *, ElementCount> InstOnVF(I, VF);
1084 auto Itr = WideningDecisions.find(InstOnVF);
1085 if (Itr == WideningDecisions.end())
1086 return CM_Unknown;
1087 return Itr->second.first;
1088 }
1089
1090 /// Return the vectorization cost for the given instruction \p I and vector
1091 /// width \p VF.
1093 assert(VF.isVector() && "Expected VF >=2");
1094 std::pair<Instruction *, ElementCount> InstOnVF(I, VF);
1095 assert(WideningDecisions.contains(InstOnVF) &&
1096 "The cost is not calculated");
1097 return WideningDecisions[InstOnVF].second;
1098 }
1099
1107
1109 Function *Variant, Intrinsic::ID IID,
1110 std::optional<unsigned> MaskPos,
1112 assert(!VF.isScalar() && "Expected vector VF");
1113 CallWideningDecisions[{CI, VF}] = {Kind, Variant, IID, MaskPos, Cost};
1114 }
1115
1117 ElementCount VF) const {
1118 assert(!VF.isScalar() && "Expected vector VF");
1119 auto I = CallWideningDecisions.find({CI, VF});
1120 if (I == CallWideningDecisions.end())
1121 return {CM_Unknown, nullptr, Intrinsic::not_intrinsic, std::nullopt, 0};
1122 return I->second;
1123 }
1124
1125 /// Return True if instruction \p I is an optimizable truncate whose operand
1126 /// is an induction variable. Such a truncate will be removed by adding a new
1127 /// induction variable with the destination type.
1129 // If the instruction is not a truncate, return false.
1130 auto *Trunc = dyn_cast<TruncInst>(I);
1131 if (!Trunc)
1132 return false;
1133
1134 // Get the source and destination types of the truncate.
1135 Type *SrcTy = toVectorTy(Trunc->getSrcTy(), VF);
1136 Type *DestTy = toVectorTy(Trunc->getDestTy(), VF);
1137
1138 // If the truncate is free for the given types, return false. Replacing a
1139 // free truncate with an induction variable would add an induction variable
1140 // update instruction to each iteration of the loop. We exclude from this
1141 // check the primary induction variable since it will need an update
1142 // instruction regardless.
1143 Value *Op = Trunc->getOperand(0);
1144 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1145 return false;
1146
1147 // If the truncated value is not an induction variable, return false.
1148 return Legal->isInductionPhi(Op);
1149 }
1150
1151 /// Collects the instructions to scalarize for each predicated instruction in
1152 /// the loop.
1153 void collectInstsToScalarize(ElementCount VF);
1154
1155 /// Collect values that will not be widened, including Uniforms, Scalars, and
1156 /// Instructions to Scalarize for the given \p VF.
1157 /// The sets depend on CM decision for Load/Store instructions
1158 /// that may be vectorized as interleave, gather-scatter or scalarized.
1159 /// Also make a decision on what to do about call instructions in the loop
1160 /// at that VF -- scalarize, call a known vector routine, or call a
1161 /// vector intrinsic.
1163 // Do the analysis once.
1164 if (VF.isScalar() || Uniforms.contains(VF))
1165 return;
1167 collectLoopUniforms(VF);
1169 collectLoopScalars(VF);
1171 }
1172
1173 /// Returns true if the target machine supports masked store operation
1174 /// for the given \p DataType and kind of access to \p Ptr.
1175 bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment,
1176 unsigned AddressSpace) const {
1177 return Legal->isConsecutivePtr(DataType, Ptr) &&
1178 TTI.isLegalMaskedStore(DataType, Alignment, AddressSpace);
1179 }
1180
1181 /// Returns true if the target machine supports masked load operation
1182 /// for the given \p DataType and kind of access to \p Ptr.
1183 bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment,
1184 unsigned AddressSpace) const {
1185 return Legal->isConsecutivePtr(DataType, Ptr) &&
1186 TTI.isLegalMaskedLoad(DataType, Alignment, AddressSpace);
1187 }
1188
1189 /// Returns true if the target machine can represent \p V as a masked gather
1190 /// or scatter operation.
1192 bool LI = isa<LoadInst>(V);
1193 bool SI = isa<StoreInst>(V);
1194 if (!LI && !SI)
1195 return false;
1196 auto *Ty = getLoadStoreType(V);
1198 if (VF.isVector())
1199 Ty = VectorType::get(Ty, VF);
1200 return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
1201 (SI && TTI.isLegalMaskedScatter(Ty, Align));
1202 }
1203
1204 /// Returns true if the target machine supports all of the reduction
1205 /// variables found for the given VF.
1207 return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
1208 const RecurrenceDescriptor &RdxDesc = Reduction.second;
1209 return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
1210 }));
1211 }
1212
1213 /// Given costs for both strategies, return true if the scalar predication
1214 /// lowering should be used for div/rem. This incorporates an override
1215 /// option so it is not simply a cost comparison.
1217 InstructionCost SafeDivisorCost) const {
1218 switch (ForceSafeDivisor) {
1219 case cl::BOU_UNSET:
1220 return ScalarCost < SafeDivisorCost;
1221 case cl::BOU_TRUE:
1222 return false;
1223 case cl::BOU_FALSE:
1224 return true;
1225 }
1226 llvm_unreachable("impossible case value");
1227 }
1228
1229 /// Returns true if \p I is an instruction which requires predication and
1230 /// for which our chosen predication strategy is scalarization (i.e. we
1231 /// don't have an alternate strategy such as masking available).
1232 /// \p VF is the vectorization factor that will be used to vectorize \p I.
1233 bool isScalarWithPredication(Instruction *I, ElementCount VF) const;
1234
1235 /// Returns true if \p I is an instruction that needs to be predicated
1236 /// at runtime. The result is independent of the predication mechanism.
1237 /// Superset of instructions that return true for isScalarWithPredication.
1238 bool isPredicatedInst(Instruction *I) const;
1239
1240 /// A helper function that returns how much we should divide the cost of a
1241 /// predicated block by. Typically this is the reciprocal of the block
1242 /// probability, i.e. if we return X we are assuming the predicated block will
1243 /// execute once for every X iterations of the loop header so the block should
1244 /// only contribute 1/X of its cost to the total cost calculation, but when
1245 /// optimizing for code size it will just be 1 as code size costs don't depend
1246 /// on execution probabilities.
1247 ///
1248 /// TODO: We should use actual block probability here, if available.
1249 /// Currently, we always assume predicated blocks have a 50% chance of
1250 /// executing, apart from blocks that are only predicated due to tail folding.
1251 inline unsigned
1253 BasicBlock *BB) const {
1254 // If a block wasn't originally predicated but was predicated due to
1255 // e.g. tail folding, don't divide the cost. Tail folded loops may still be
1256 // predicated in the final vector loop iteration, but for most loops that
1257 // don't have low trip counts we can expect their probability to be close to
1258 // zero.
1259 if (!Legal->blockNeedsPredication(BB))
1260 return 1;
1261 return CostKind == TTI::TCK_CodeSize ? 1 : 2;
1262 }
1263
1264 /// Return the costs for our two available strategies for lowering a
1265 /// div/rem operation which requires speculating at least one lane.
1266 /// First result is for scalarization (will be invalid for scalable
1267 /// vectors); second is for the safe-divisor strategy.
1268 std::pair<InstructionCost, InstructionCost>
1269 getDivRemSpeculationCost(Instruction *I,
1270 ElementCount VF) const;
1271
1272 /// Returns true if \p I is a memory instruction with consecutive memory
1273 /// access that can be widened.
1274 bool memoryInstructionCanBeWidened(Instruction *I, ElementCount VF);
1275
1276 /// Returns true if \p I is a memory instruction in an interleaved-group
1277 /// of memory accesses that can be vectorized with wide vector loads/stores
1278 /// and shuffles.
1279 bool interleavedAccessCanBeWidened(Instruction *I, ElementCount VF) const;
1280
1281 /// Check if \p Instr belongs to any interleaved access group.
1283 return InterleaveInfo.isInterleaved(Instr);
1284 }
1285
1286 /// Get the interleaved access group that \p Instr belongs to.
1289 return InterleaveInfo.getInterleaveGroup(Instr);
1290 }
1291
1292 /// Returns true if we're required to use a scalar epilogue for at least
1293 /// the final iteration of the original loop.
1294 bool requiresScalarEpilogue(bool IsVectorizing) const {
1295 if (!isScalarEpilogueAllowed()) {
1296 LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
1297 return false;
1298 }
1299 // If we might exit from anywhere but the latch and early exit vectorization
1300 // is disabled, we must run the exiting iteration in scalar form.
1301 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch() &&
1302 !(EnableEarlyExitVectorization && Legal->hasUncountableEarlyExit())) {
1303 LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: not exiting "
1304 "from latch block\n");
1305 return true;
1306 }
1307 if (IsVectorizing && InterleaveInfo.requiresScalarEpilogue()) {
1308 LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: "
1309 "interleaved group requires scalar epilogue\n");
1310 return true;
1311 }
1312 LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
1313 return false;
1314 }
1315
1316 /// Returns true if a scalar epilogue is not allowed due to optsize or a
1317 /// loop hint annotation.
1319 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1320 }
1321
1322 /// Returns true if tail-folding is preferred over a scalar epilogue.
1324 return ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate ||
1325 ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate;
1326 }
1327
1328 /// Returns the TailFoldingStyle that is best for the current loop.
1329 TailFoldingStyle getTailFoldingStyle(bool IVUpdateMayOverflow = true) const {
1330 if (!ChosenTailFoldingStyle)
1332 return IVUpdateMayOverflow ? ChosenTailFoldingStyle->first
1333 : ChosenTailFoldingStyle->second;
1334 }
1335
1336 /// Selects and saves TailFoldingStyle for 2 options - if IV update may
1337 /// overflow or not.
1338 /// \param IsScalableVF true if scalable vector factors enabled.
1339 /// \param UserIC User specific interleave count.
1340 void setTailFoldingStyles(bool IsScalableVF, unsigned UserIC) {
1341 assert(!ChosenTailFoldingStyle && "Tail folding must not be selected yet.");
1342 if (!Legal->canFoldTailByMasking()) {
1343 ChosenTailFoldingStyle = {TailFoldingStyle::None, TailFoldingStyle::None};
1344 return;
1345 }
1346
1347 // Default to TTI preference, but allow command line override.
1348 ChosenTailFoldingStyle = {
1349 TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/true),
1350 TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/false)};
1351 if (ForceTailFoldingStyle.getNumOccurrences())
1352 ChosenTailFoldingStyle = {ForceTailFoldingStyle.getValue(),
1353 ForceTailFoldingStyle.getValue()};
1354
1355 if (ChosenTailFoldingStyle->first != TailFoldingStyle::DataWithEVL &&
1356 ChosenTailFoldingStyle->second != TailFoldingStyle::DataWithEVL)
1357 return;
1358 // Override EVL styles if needed.
1359 // FIXME: Investigate opportunity for fixed vector factor.
1360 bool EVLIsLegal = UserIC <= 1 && IsScalableVF &&
1361 TTI.hasActiveVectorLength() && !EnableVPlanNativePath;
1362 if (EVLIsLegal)
1363 return;
1364 // If for some reason EVL mode is unsupported, fall back to a scalar epilogue
1365 // if it's allowed, or DataWithoutLaneMask otherwise.
1366 if (ScalarEpilogueStatus == CM_ScalarEpilogueAllowed ||
1367 ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate)
1368 ChosenTailFoldingStyle = {TailFoldingStyle::None, TailFoldingStyle::None};
1369 else
1370 ChosenTailFoldingStyle = {TailFoldingStyle::DataWithoutLaneMask,
1372
1373 LLVM_DEBUG(
1374 dbgs() << "LV: Preference for VP intrinsics indicated. Will "
1375 "not try to generate VP Intrinsics "
1376 << (UserIC > 1
1377 ? "since interleave count specified is greater than 1.\n"
1378 : "due to non-interleaving reasons.\n"));
1379 }
1380
1381 /// Returns true if all loop blocks should be masked to fold tail loop.
1382 bool foldTailByMasking() const {
1383 // TODO: check if it is possible to check for None style independent of
1384 // IVUpdateMayOverflow flag in getTailFoldingStyle.
1386 }
1387
1388 /// Returns true if the use of wide lane masks is requested and the loop is
1389 /// using tail-folding with a lane mask for control flow.
1398
1399 /// Return maximum safe number of elements to be processed per vector
1400 /// iteration, which do not prevent store-load forwarding and are safe with
1401 /// regard to the memory dependencies. Required for EVL-based VPlans to
1402 /// correctly calculate AVL (application vector length) as min(remaining AVL,
1403 /// MaxSafeElements).
1404 /// TODO: need to consider adjusting cost model to use this value as a
1405 /// vectorization factor for EVL-based vectorization.
1406 std::optional<unsigned> getMaxSafeElements() const { return MaxSafeElements; }
1407
1408 /// Returns true if the instructions in this block requires predication
1409 /// for any reason, e.g. because tail folding now requires a predicate
1410 /// or because the block in the original loop was predicated.
1412 return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1413 }
1414
1415 /// Returns true if VP intrinsics with explicit vector length support should
1416 /// be generated in the tail folded loop.
1420
1421 /// Returns true if the Phi is part of an inloop reduction.
1422 bool isInLoopReduction(PHINode *Phi) const {
1423 return InLoopReductions.contains(Phi);
1424 }
1425
1426 /// Returns true if the predicated reduction select should be used to set the
1427 /// incoming value for the reduction phi.
1429 // Force to use predicated reduction select since the EVL of the
1430 // second-to-last iteration might not be VF*UF.
1431 if (foldTailWithEVL())
1432 return true;
1434 TTI.preferPredicatedReductionSelect();
1435 }
1436
1437 /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1438 /// with factor VF. Return the cost of the instruction, including
1439 /// scalarization overhead if it's needed.
1440 InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;
1441
1442 /// Estimate cost of a call instruction CI if it were vectorized with factor
1443 /// VF. Return the cost of the instruction, including scalarization overhead
1444 /// if it's needed.
1445 InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF) const;
1446
1447 /// Invalidates decisions already taken by the cost model.
1449 WideningDecisions.clear();
1450 CallWideningDecisions.clear();
1451 Uniforms.clear();
1452 Scalars.clear();
1453 }
1454
1455 /// Returns the expected execution cost. The unit of the cost does
1456 /// not matter because we use the 'cost' units to compare different
1457 /// vector widths. The cost that is returned is *not* normalized by
1458 /// the factor width.
1459 InstructionCost expectedCost(ElementCount VF);
1460
1461 bool hasPredStores() const { return NumPredStores > 0; }
1462
1463 /// Returns true if epilogue vectorization is considered profitable, and
1464 /// false otherwise.
1465 /// \p VF is the vectorization factor chosen for the original loop.
1466 /// \p Multiplier is an additional scaling factor applied to VF before
1467 /// comparing to EpilogueVectorizationMinVF.
1468 bool isEpilogueVectorizationProfitable(const ElementCount VF,
1469 const unsigned IC) const;
1470
1471 /// Returns the execution time cost of an instruction for a given vector
1472 /// width. Vector width of one means scalar.
1473 InstructionCost getInstructionCost(Instruction *I, ElementCount VF);
1474
1475 /// Return the cost of instructions in an inloop reduction pattern, if I is
1476 /// part of that pattern.
1477 std::optional<InstructionCost> getReductionPatternCost(Instruction *I,
1478 ElementCount VF,
1479 Type *VectorTy) const;
1480
1481 /// Returns true if \p Op should be considered invariant and if it is
1482 /// trivially hoistable.
1483 bool shouldConsiderInvariant(Value *Op);
1484
1485 /// Return the value of vscale used for tuning the cost model.
1486 std::optional<unsigned> getVScaleForTuning() const { return VScaleForTuning; }
1487
1488private:
1489 unsigned NumPredStores = 0;
1490
1491 /// Used to store the value of vscale used for tuning the cost model. It is
1492 /// initialized during object construction.
1493 std::optional<unsigned> VScaleForTuning;
1494
1495 /// Initializes the value of vscale used for tuning the cost model. If
1496 /// vscale_range.min == vscale_range.max then return vscale_range.max, else
1497 /// return the value returned by the corresponding TTI method.
1498 void initializeVScaleForTuning() {
1499 const Function *Fn = TheLoop->getHeader()->getParent();
1500 if (Fn->hasFnAttribute(Attribute::VScaleRange)) {
1501 auto Attr = Fn->getFnAttribute(Attribute::VScaleRange);
1502 auto Min = Attr.getVScaleRangeMin();
1503 auto Max = Attr.getVScaleRangeMax();
1504 if (Max && Min == Max) {
1505 VScaleForTuning = Max;
1506 return;
1507 }
1508 }
1509
1510 VScaleForTuning = TTI.getVScaleForTuning();
1511 }
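// Example (illustrative): a function carrying vscale_range(2,2) yields
// VScaleForTuning = 2 directly from the attribute, whereas vscale_range(1,16)
// (min != max) defers to TTI.getVScaleForTuning().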
1512
1513 /// \return An upper bound for the vectorization factors for both
1514 /// fixed and scalable vectorization, where the minimum-known number of
1515 /// elements is a power-of-2 larger than zero. If scalable vectorization is
1516 /// disabled or unsupported, then the scalable part will be equal to
1517 /// ElementCount::getScalable(0).
1518 FixedScalableVFPair computeFeasibleMaxVF(unsigned MaxTripCount,
1519 ElementCount UserVF,
1520 bool FoldTailByMasking);
1521
1522 /// If \p VF > MaxTripcount, clamps it to the next lower VF that is <=
1523 /// MaxTripCount.
1524 ElementCount clampVFByMaxTripCount(ElementCount VF, unsigned MaxTripCount,
1525 bool FoldTailByMasking) const;
1526
1527 /// \return the maximized element count based on the targets vector
1528 /// registers and the loop trip-count, but limited to a maximum safe VF.
1529 /// This is a helper function of computeFeasibleMaxVF.
1530 ElementCount getMaximizedVFForTarget(unsigned MaxTripCount,
1531 unsigned SmallestType,
1532 unsigned WidestType,
1533 ElementCount MaxSafeVF,
1534 bool FoldTailByMasking);
1535
1536 /// Checks if scalable vectorization is supported and enabled. Caches the
1537 /// result to avoid repeated debug dumps for repeated queries.
1538 bool isScalableVectorizationAllowed();
1539
1540 /// \return the maximum legal scalable VF, based on the safe max number
1541 /// of elements.
1542 ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
1543
1544 /// Calculate vectorization cost of memory instruction \p I.
1545 InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
1546
1547 /// The cost computation for scalarized memory instruction.
1548 InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1549
1550 /// The cost computation for interleaving group of memory instructions.
1551 InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
1552
1553 /// The cost computation for Gather/Scatter instruction.
1554 InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
1555
1556 /// The cost computation for widening instruction \p I with consecutive
1557 /// memory access.
1558 InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1559
1560 /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1561 /// Load: scalar load + broadcast.
1562 /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1563 /// element)
1564 InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
1565
1566 /// Estimate the overhead of scalarizing an instruction. This is a
1567 /// convenience wrapper for the type-based getScalarizationOverhead API.
1569 ElementCount VF) const;
1570
1571 /// Returns true if an artificially high cost for emulated masked memrefs
1572 /// should be used.
1573 bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);
1574
1575 /// Map of scalar integer values to the smallest bitwidth they can be legally
1576 /// represented as. The vector equivalents of these values should be truncated
1577 /// to this type.
1578 MapVector<Instruction *, uint64_t> MinBWs;
1579
1580 /// A type representing the costs for instructions if they were to be
1581 /// scalarized rather than vectorized. The entries are Instruction-Cost
1582 /// pairs.
1583 using ScalarCostsTy = MapVector<Instruction *, InstructionCost>;
1584
1585 /// A set containing all BasicBlocks that are known to be present after
1586 /// vectorization as predicated blocks.
1587 DenseMap<ElementCount, SmallPtrSet<BasicBlock *, 4>>
1588 PredicatedBBsAfterVectorization;
1589
1590 /// Records whether it is allowed to have the original scalar loop execute at
1591 /// least once. This may be needed as a fallback loop in case runtime
1592 /// aliasing/dependence checks fail, or to handle the tail/remainder
1593 /// iterations when the trip count is unknown or doesn't divide by the VF,
1594 /// or as a peel-loop to handle gaps in interleave-groups.
1595 /// Under optsize and when the trip count is very small we don't allow any
1596 /// iterations to execute in the scalar loop.
1597 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1598
1599 /// Controls the finally chosen tail-folding style. The first element is used
1600 /// if the IV update may overflow, the second if it does not.
1601 std::optional<std::pair<TailFoldingStyle, TailFoldingStyle>>
1602 ChosenTailFoldingStyle;
1603
1604 /// true if scalable vectorization is supported and enabled.
1605 std::optional<bool> IsScalableVectorizationAllowed;
1606
1607 /// Maximum safe number of elements to be processed per vector iteration,
1608 /// which does not prevent store-load forwarding and is safe with regard to
1609 /// the memory dependencies. Required for EVL-based vectorization, where this
1610 /// value is used as the upper bound of the safe AVL.
1611 std::optional<unsigned> MaxSafeElements;
1612
1613 /// A map holding scalar costs for different vectorization factors. The
1614 /// presence of a cost for an instruction in the mapping indicates that the
1615 /// instruction will be scalarized when vectorizing with the associated
1616 /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1617 MapVector<ElementCount, ScalarCostsTy> InstsToScalarize;
1618
1619 /// Holds the instructions known to be uniform after vectorization.
1620 /// The data is collected per VF.
1621 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
1622
1623 /// Holds the instructions known to be scalar after vectorization.
1624 /// The data is collected per VF.
1625 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
1626
1627 /// Holds the instructions (address computations) that are forced to be
1628 /// scalarized.
1629 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1630
1631 /// PHINodes of the reductions that should be expanded in-loop.
1632 SmallPtrSet<PHINode *, 4> InLoopReductions;
1633
1634 /// A map of in-loop reduction operations and their immediate chain operand.
1635 /// FIXME: This can be removed once reductions can be costed correctly in
1636 /// VPlan. This was added to allow quick lookup of the in-loop operations.
1637 DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
1638
1639 /// Returns the expected difference in cost from scalarizing the expression
1640 /// feeding a predicated instruction \p PredInst. The instructions to
1641 /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1642 /// non-negative return value implies the expression will be scalarized.
1643 /// Currently, only single-use chains are considered for scalarization.
1644 InstructionCost computePredInstDiscount(Instruction *PredInst,
1645 ScalarCostsTy &ScalarCosts,
1646 ElementCount VF);
1647
1648 /// Collect the instructions that are uniform after vectorization. An
1649 /// instruction is uniform if we represent it with a single scalar value in
1650 /// the vectorized loop corresponding to each vector iteration. Examples of
1651 /// uniform instructions include pointer operands of consecutive or
1652 /// interleaved memory accesses. Note that although uniformity implies an
1653 /// instruction will be scalar, the reverse is not true. In general, a
1654 /// scalarized instruction will be represented by VF scalar values in the
1655 /// vectorized loop, each corresponding to an iteration of the original
1656 /// scalar loop.
1657 void collectLoopUniforms(ElementCount VF);
1658
1659 /// Collect the instructions that are scalar after vectorization. An
1660 /// instruction is scalar if it is known to be uniform or will be scalarized
1661 /// during vectorization. collectLoopScalars should only add non-uniform nodes
1662 /// to the list if they are used by a load/store instruction that is marked as
1663 /// CM_Scalarize. Non-uniform scalarized instructions will be represented by
1664 /// VF values in the vectorized loop, each corresponding to an iteration of
1665 /// the original scalar loop.
1666 void collectLoopScalars(ElementCount VF);
1667
1668 /// Keeps cost model vectorization decision and cost for instructions.
1669 /// Right now it is used for memory instructions only.
1670 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
1671 std::pair<InstWidening, InstructionCost>>;
1672
1673 DecisionList WideningDecisions;
1674
1675 using CallDecisionList =
1676 DenseMap<std::pair<CallInst *, ElementCount>, CallWideningDecision>;
1677
1678 CallDecisionList CallWideningDecisions;
1679
1680 /// Returns true if \p V is expected to be vectorized and it needs to be
1681 /// extracted.
1682 bool needsExtract(Value *V, ElementCount VF) const {
1684 if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1685 TheLoop->isLoopInvariant(I) ||
1686 getWideningDecision(I, VF) == CM_Scalarize ||
1687 (isa<CallInst>(I) &&
1688 getCallWideningDecision(cast<CallInst>(I), VF).Kind == CM_Scalarize))
1689 return false;
1690
1691 // Assume we can vectorize V (and hence we need extraction) if the
1692 // scalars are not computed yet. This can happen because it is called
1693 // via getScalarizationOverhead from setCostBasedWideningDecision, before
1694 // the scalars are collected. That should be a safe assumption in most
1695 // cases, because we check if the operands have vectorizable types
1696 // beforehand in LoopVectorizationLegality.
1697 return !Scalars.contains(VF) || !isScalarAfterVectorization(I, VF);
1698 };
1699
1700 /// Returns a range containing only operands needing to be extracted.
1701 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1702 ElementCount VF) const {
1703
1704 SmallPtrSet<const Value *, 4> UniqueOperands;
1706 for (Value *Op : Ops) {
1707 if (isa<Constant>(Op) || !UniqueOperands.insert(Op).second ||
1708 !needsExtract(Op, VF))
1709 continue;
1710 Res.push_back(Op);
1711 }
1712 return Res;
1713 }
1714
1715public:
1716 /// The loop that we evaluate.
1718
1719 /// Predicated scalar evolution analysis.
1721
1722 /// Loop Info analysis.
1724
1725 /// Vectorization legality.
1727
1728 /// Vector target information.
1730
1731 /// Target Library Info.
1733
1734 /// Demanded bits analysis.
1736
1737 /// Assumption cache.
1739
1740 /// Interface to emit optimization remarks.
1742
1744
1745 /// Loop Vectorize Hint.
1747
1748 /// The interleaved access information contains groups of interleaved accesses
1749 /// with the same stride that are close to each other.
1751
1752 /// Values to ignore in the cost model.
1754
1755 /// Values to ignore in the cost model when VF > 1.
1757
1758 /// All element types found in the loop.
1760
1761 /// The kind of cost that we are calculating
1763
1764 /// Whether this loop should be optimized for size based on function attribute
1765 /// or profile information.
1767
1768 /// The highest VF possible for this loop, without using MaxBandwidth.
1770};
1771} // end namespace llvm
1772
1773namespace {
1774/// Helper struct to manage generating runtime checks for vectorization.
1775///
1776 /// The runtime checks are created up-front in temporary blocks, un-linked from
1777 /// the existing IR, to allow better estimation of their cost. After deciding to
1778 /// vectorize, the checks are moved back; if we decide not to vectorize, the
1779 /// temporary blocks are removed completely.
1780class GeneratedRTChecks {
1781 /// Basic block which contains the generated SCEV checks, if any.
1782 BasicBlock *SCEVCheckBlock = nullptr;
1783
1784 /// The value representing the result of the generated SCEV checks. If it is
1785 /// nullptr no SCEV checks have been generated.
1786 Value *SCEVCheckCond = nullptr;
1787
1788 /// Basic block which contains the generated memory runtime checks, if any.
1789 BasicBlock *MemCheckBlock = nullptr;
1790
1791 /// The value representing the result of the generated memory runtime checks.
1792 /// If it is nullptr no memory runtime checks have been generated.
1793 Value *MemRuntimeCheckCond = nullptr;
1794
1795 DominatorTree *DT;
1796 LoopInfo *LI;
1798
1799 SCEVExpander SCEVExp;
1800 SCEVExpander MemCheckExp;
1801
1802 bool CostTooHigh = false;
1803
1804 Loop *OuterLoop = nullptr;
1805
1807
1808 /// The kind of cost that we are calculating
1810
1811public:
1812 GeneratedRTChecks(PredicatedScalarEvolution &PSE, DominatorTree *DT,
1815 : DT(DT), LI(LI), TTI(TTI),
1816 SCEVExp(*PSE.getSE(), DL, "scev.check", /*PreserveLCSSA=*/false),
1817 MemCheckExp(*PSE.getSE(), DL, "scev.check", /*PreserveLCSSA=*/false),
1818 PSE(PSE), CostKind(CostKind) {}
1819
1820 /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
1821 /// accurately estimate the cost of the runtime checks. The blocks are
1822 /// un-linked from the IR and are added back during vector code generation. If
1823 /// there is no vector code generation, the check blocks are removed
1824 /// completely.
1825 void create(Loop *L, const LoopAccessInfo &LAI,
1826 const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) {
1827
1828 // Hard cutoff to limit compile-time increase in case a very large number of
1829 // runtime checks needs to be generated.
1830 // TODO: Skip cutoff if the loop is guaranteed to execute, e.g. due to
1831 // profile info.
1832 CostTooHigh =
1834 if (CostTooHigh)
1835 return;
1836
1837 BasicBlock *LoopHeader = L->getHeader();
1838 BasicBlock *Preheader = L->getLoopPreheader();
1839
1840 // Use SplitBlock to create blocks for SCEV & memory runtime checks to
1841 // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
1842 // may be used by SCEVExpander. The blocks will be un-linked from their
1843 // predecessors and removed from LI & DT at the end of the function.
1844 if (!UnionPred.isAlwaysTrue()) {
1845 SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
1846 nullptr, "vector.scevcheck");
1847
1848 SCEVCheckCond = SCEVExp.expandCodeForPredicate(
1849 &UnionPred, SCEVCheckBlock->getTerminator());
1850 if (isa<Constant>(SCEVCheckCond)) {
1851 // Clean up directly after expanding the predicate to a constant, to
1852 // avoid further expansions re-using anything left over from SCEVExp.
1853 SCEVExpanderCleaner SCEVCleaner(SCEVExp);
1854 SCEVCleaner.cleanup();
1855 }
1856 }
1857
1858 const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
1859 if (RtPtrChecking.Need) {
1860 auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
1861 MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
1862 "vector.memcheck");
1863
1864 auto DiffChecks = RtPtrChecking.getDiffChecks();
1865 if (DiffChecks) {
1866 Value *RuntimeVF = nullptr;
1867 MemRuntimeCheckCond = addDiffRuntimeChecks(
1868 MemCheckBlock->getTerminator(), *DiffChecks, MemCheckExp,
1869 [VF, &RuntimeVF](IRBuilderBase &B, unsigned Bits) {
1870 if (!RuntimeVF)
1871 RuntimeVF = getRuntimeVF(B, B.getIntNTy(Bits), VF);
1872 return RuntimeVF;
1873 },
1874 IC);
1875 } else {
1876 MemRuntimeCheckCond = addRuntimeChecks(
1877 MemCheckBlock->getTerminator(), L, RtPtrChecking.getChecks(),
1879 }
1880 assert(MemRuntimeCheckCond &&
1881 "no RT checks generated although RtPtrChecking "
1882 "claimed checks are required");
1883 }
1884
1885 SCEVExp.eraseDeadInstructions(SCEVCheckCond);
1886
1887 if (!MemCheckBlock && !SCEVCheckBlock)
1888 return;
1889
1890 // Unhook the temporary blocks with the checks, updating various places
1891 // accordingly.
1892 if (SCEVCheckBlock)
1893 SCEVCheckBlock->replaceAllUsesWith(Preheader);
1894 if (MemCheckBlock)
1895 MemCheckBlock->replaceAllUsesWith(Preheader);
1896
1897 if (SCEVCheckBlock) {
1898 SCEVCheckBlock->getTerminator()->moveBefore(
1899 Preheader->getTerminator()->getIterator());
1900 auto *UI = new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
1901 UI->setDebugLoc(DebugLoc::getTemporary());
1902 Preheader->getTerminator()->eraseFromParent();
1903 }
1904 if (MemCheckBlock) {
1905 MemCheckBlock->getTerminator()->moveBefore(
1906 Preheader->getTerminator()->getIterator());
1907 auto *UI = new UnreachableInst(Preheader->getContext(), MemCheckBlock);
1908 UI->setDebugLoc(DebugLoc::getTemporary());
1909 Preheader->getTerminator()->eraseFromParent();
1910 }
1911
1912 DT->changeImmediateDominator(LoopHeader, Preheader);
1913 if (MemCheckBlock) {
1914 DT->eraseNode(MemCheckBlock);
1915 LI->removeBlock(MemCheckBlock);
1916 }
1917 if (SCEVCheckBlock) {
1918 DT->eraseNode(SCEVCheckBlock);
1919 LI->removeBlock(SCEVCheckBlock);
1920 }
1921
1922 // Outer loop is used as part of the later cost calculations.
1923 OuterLoop = L->getParentLoop();
1924 }
1925
1927 if (SCEVCheckBlock || MemCheckBlock)
1928 LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n");
1929
1930 if (CostTooHigh) {
1932 Cost.setInvalid();
1933 LLVM_DEBUG(dbgs() << " number of checks exceeded threshold\n");
1934 return Cost;
1935 }
1936
1937 InstructionCost RTCheckCost = 0;
1938 if (SCEVCheckBlock)
1939 for (Instruction &I : *SCEVCheckBlock) {
1940 if (SCEVCheckBlock->getTerminator() == &I)
1941 continue;
1943 LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n");
1944 RTCheckCost += C;
1945 }
1946 if (MemCheckBlock) {
1947 InstructionCost MemCheckCost = 0;
1948 for (Instruction &I : *MemCheckBlock) {
1949 if (MemCheckBlock->getTerminator() == &I)
1950 continue;
1952 LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n");
1953 MemCheckCost += C;
1954 }
1955
1956 // If the runtime memory checks are being created inside an outer loop
1957 // we should find out if these checks are outer loop invariant. If so,
1958 // the checks will likely be hoisted out, and so the effective cost is
1959 // reduced according to the outer loop trip count.
1960 if (OuterLoop) {
1961 ScalarEvolution *SE = MemCheckExp.getSE();
1962 // TODO: If profitable, we could refine this further by analysing every
1963 // individual memory check, since there could be a mixture of loop
1964 // variant and invariant checks that mean the final condition is
1965 // variant.
1966 const SCEV *Cond = SE->getSCEV(MemRuntimeCheckCond);
1967 if (SE->isLoopInvariant(Cond, OuterLoop)) {
1968 // It seems reasonable to assume that we can reduce the effective
1969 // cost of the checks even when we know nothing about the trip
1970 // count. Assume that the outer loop executes at least twice.
1971 unsigned BestTripCount = 2;
1972
1973 // Get the best known TC estimate.
1974 if (auto EstimatedTC = getSmallBestKnownTC(
1975 PSE, OuterLoop, /* CanUseConstantMax = */ false))
1976 if (EstimatedTC->isFixed())
1977 BestTripCount = EstimatedTC->getFixedValue();
1978
1979 InstructionCost NewMemCheckCost = MemCheckCost / BestTripCount;
1980
1981 // Let's ensure the cost is always at least 1.
1982 NewMemCheckCost = std::max(NewMemCheckCost.getValue(),
1983 (InstructionCost::CostType)1);
1984
1985 if (BestTripCount > 1)
1987 << "We expect runtime memory checks to be hoisted "
1988 << "out of the outer loop. Cost reduced from "
1989 << MemCheckCost << " to " << NewMemCheckCost << '\n');
1990
1991 MemCheckCost = NewMemCheckCost;
1992 }
1993 }
1994
1995 RTCheckCost += MemCheckCost;
1996 }
1997
1998 if (SCEVCheckBlock || MemCheckBlock)
1999 LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost
2000 << "\n");
2001
2002 return RTCheckCost;
2003 }
2004
2005 /// Remove the created SCEV & memory runtime check blocks & instructions, if
2006 /// unused.
2007 ~GeneratedRTChecks() {
2008 SCEVExpanderCleaner SCEVCleaner(SCEVExp);
2009 SCEVExpanderCleaner MemCheckCleaner(MemCheckExp);
2010 bool SCEVChecksUsed = !SCEVCheckBlock || !pred_empty(SCEVCheckBlock);
2011 bool MemChecksUsed = !MemCheckBlock || !pred_empty(MemCheckBlock);
2012 if (SCEVChecksUsed)
2013 SCEVCleaner.markResultUsed();
2014
2015 if (MemChecksUsed) {
2016 MemCheckCleaner.markResultUsed();
2017 } else {
2018 auto &SE = *MemCheckExp.getSE();
2019 // Memory runtime check generation creates compares that use expanded
2020 // values. Remove them before running the SCEVExpanderCleaners.
2021 for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) {
2022 if (MemCheckExp.isInsertedInstruction(&I))
2023 continue;
2024 SE.forgetValue(&I);
2025 I.eraseFromParent();
2026 }
2027 }
2028 MemCheckCleaner.cleanup();
2029 SCEVCleaner.cleanup();
2030
2031 if (!SCEVChecksUsed)
2032 SCEVCheckBlock->eraseFromParent();
2033 if (!MemChecksUsed)
2034 MemCheckBlock->eraseFromParent();
2035 }
2036
2037 /// Retrieves the SCEVCheckCond and SCEVCheckBlock that were generated as IR
2038 /// outside VPlan.
2039 std::pair<Value *, BasicBlock *> getSCEVChecks() const {
2040 using namespace llvm::PatternMatch;
2041 if (!SCEVCheckCond || match(SCEVCheckCond, m_ZeroInt()))
2042 return {nullptr, nullptr};
2043
2044 return {SCEVCheckCond, SCEVCheckBlock};
2045 }
2046
2047 /// Retrieves the MemCheckCond and MemCheckBlock that were generated as IR
2048 /// outside VPlan.
2049 std::pair<Value *, BasicBlock *> getMemRuntimeChecks() const {
2050 using namespace llvm::PatternMatch;
2051 if (MemRuntimeCheckCond && match(MemRuntimeCheckCond, m_ZeroInt()))
2052 return {nullptr, nullptr};
2053 return {MemRuntimeCheckCond, MemCheckBlock};
2054 }
2055
2056 /// Return true if any runtime checks have been added.
2057 bool hasChecks() const {
2058 return getSCEVChecks().first || getMemRuntimeChecks().first;
2059 }
2060};
2061} // namespace
2062
2068
2073
2074// Return true if \p OuterLp is an outer loop annotated with hints for explicit
2075// vectorization. The loop needs to be annotated with #pragma omp simd
2076 // simdlen(#) or #pragma clang loop vectorize(enable) vectorize_width(#). If the
2077// vector length information is not provided, vectorization is not considered
2078// explicit. Interleave hints are not allowed either. These limitations will be
2079// relaxed in the future.
2080 // Please note that we are currently forced to abuse the pragma 'clang
2081// vectorize' semantics. This pragma provides *auto-vectorization hints*
2082// (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
2083// provides *explicit vectorization hints* (LV can bypass legal checks and
2084// assume that vectorization is legal). However, both hints are implemented
2085// using the same metadata (llvm.loop.vectorize, processed by
2086// LoopVectorizeHints). This will be fixed in the future when the native IR
2087// representation for pragma 'omp simd' is introduced.
2088static bool isExplicitVecOuterLoop(Loop *OuterLp,
2090 assert(!OuterLp->isInnermost() && "This is not an outer loop");
2091 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
2092
2093 // Only outer loops with an explicit vectorization hint are supported.
2094 // Unannotated outer loops are ignored.
2096 return false;
2097
2098 Function *Fn = OuterLp->getHeader()->getParent();
2099 if (!Hints.allowVectorization(Fn, OuterLp,
2100 true /*VectorizeOnlyWhenForced*/)) {
2101 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
2102 return false;
2103 }
2104
2105 if (Hints.getInterleave() > 1) {
2106 // TODO: Interleave support is future work.
2107 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
2108 "outer loops.\n");
2109 Hints.emitRemarkWithHints();
2110 return false;
2111 }
2112
2113 return true;
2114}
2115
2119 // Collect inner loops and outer loops without irreducible control flow. For
2120 // now, only collect outer loops that have explicit vectorization hints. If we
2121 // are stress testing the VPlan H-CFG construction, we collect the outermost
2122 // loop of every loop nest.
2123 if (L.isInnermost() || VPlanBuildStressTest ||
2125 LoopBlocksRPO RPOT(&L);
2126 RPOT.perform(LI);
2128 V.push_back(&L);
2129 // TODO: Collect inner loops inside marked outer loops in case
2130 // vectorization fails for the outer loop. Do not invoke
2131 // 'containsIrreducibleCFG' again for inner loops when the outer loop is
2132 // already known to be reducible. We can use an inherited attribute for
2133 // that.
2134 return;
2135 }
2136 }
2137 for (Loop *InnerL : L)
2138 collectSupportedLoops(*InnerL, LI, ORE, V);
2139}
2140
2141//===----------------------------------------------------------------------===//
2142// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
2143// LoopVectorizationCostModel and LoopVectorizationPlanner.
2144//===----------------------------------------------------------------------===//
2145
2146/// Compute the transformed value of Index at offset StartValue using step
2147/// StepValue.
2148/// For integer induction, returns StartValue + Index * StepValue.
2149/// For pointer induction, returns StartValue[Index * StepValue].
2150/// FIXME: The newly created binary instructions should contain nsw/nuw
2151/// flags, which can be found from the original scalar operations.
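/// For illustration only (a worked example, not taken from any particular
/// caller): with StartValue = %start, Step = 4, and Index = %i, an integer
/// induction yields %start + %i * 4; a pointer induction yields a
/// getelementptr of %start offset by %i * 4; and an FP induction with an fadd
/// binary operator yields fadd %start, (fmul 4.0, %i).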
2152static Value *
2154 Value *Step,
2156 const BinaryOperator *InductionBinOp) {
2157 using namespace llvm::PatternMatch;
2158 Type *StepTy = Step->getType();
2159 Value *CastedIndex = StepTy->isIntegerTy()
2160 ? B.CreateSExtOrTrunc(Index, StepTy)
2161 : B.CreateCast(Instruction::SIToFP, Index, StepTy);
2162 if (CastedIndex != Index) {
2163 CastedIndex->setName(CastedIndex->getName() + ".cast");
2164 Index = CastedIndex;
2165 }
2166
2167 // Note: the IR at this point is broken. We cannot use SE to create any new
2168 // SCEV and then expand it, hoping that SCEV's simplification will give us
2169 // more optimal code. Unfortunately, attempting to do so on invalid IR may
2170 // lead to various SCEV crashes. So all we can do is to use builder and rely
2171 // on InstCombine for future simplifications. Here we handle some trivial
2172 // cases only.
2173 auto CreateAdd = [&B](Value *X, Value *Y) {
2174 assert(X->getType() == Y->getType() && "Types don't match!");
2175 if (match(X, m_ZeroInt()))
2176 return Y;
2177 if (match(Y, m_ZeroInt()))
2178 return X;
2179 return B.CreateAdd(X, Y);
2180 };
2181
2182 // We allow X to be a vector type, in which case Y will potentially be
2183 // splatted into a vector with the same element count.
2184 auto CreateMul = [&B](Value *X, Value *Y) {
2185 assert(X->getType()->getScalarType() == Y->getType() &&
2186 "Types don't match!");
2187 if (match(X, m_One()))
2188 return Y;
2189 if (match(Y, m_One()))
2190 return X;
2191 VectorType *XVTy = dyn_cast<VectorType>(X->getType());
2192 if (XVTy && !isa<VectorType>(Y->getType()))
2193 Y = B.CreateVectorSplat(XVTy->getElementCount(), Y);
2194 return B.CreateMul(X, Y);
2195 };
2196
2197 switch (InductionKind) {
2199 assert(!isa<VectorType>(Index->getType()) &&
2200 "Vector indices not supported for integer inductions yet");
2201 assert(Index->getType() == StartValue->getType() &&
2202 "Index type does not match StartValue type");
2203 if (isa<ConstantInt>(Step) && cast<ConstantInt>(Step)->isMinusOne())
2204 return B.CreateSub(StartValue, Index);
2205 auto *Offset = CreateMul(Index, Step);
2206 return CreateAdd(StartValue, Offset);
2207 }
2209 return B.CreatePtrAdd(StartValue, CreateMul(Index, Step));
2211 assert(!isa<VectorType>(Index->getType()) &&
2212 "Vector indices not supported for FP inductions yet");
2213 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2214 assert(InductionBinOp &&
2215 (InductionBinOp->getOpcode() == Instruction::FAdd ||
2216 InductionBinOp->getOpcode() == Instruction::FSub) &&
2217 "Original bin op should be defined for FP induction");
2218
2219 Value *MulExp = B.CreateFMul(Step, Index);
2220 return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
2221 "induction");
2222 }
2224 return nullptr;
2225 }
2226 llvm_unreachable("invalid enum");
2227}
2228
2229static std::optional<unsigned> getMaxVScale(const Function &F,
2230 const TargetTransformInfo &TTI) {
2231 if (std::optional<unsigned> MaxVScale = TTI.getMaxVScale())
2232 return MaxVScale;
2233
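  // Otherwise fall back to the IR-level vscale_range attribute, if present.
  // For example, a function attributed with vscale_range(1,16) yields a
  // maximum vscale of 16.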
2234 if (F.hasFnAttribute(Attribute::VScaleRange))
2235 return F.getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax();
2236
2237 return std::nullopt;
2238}
2239
2240/// For the given VF and UF and maximum trip count computed for the loop, return
2241/// whether the induction variable might overflow in the vectorized loop. If not,
2242/// then we know a runtime overflow check always evaluates to false and can be
2243/// removed.
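/// For illustration, assuming an i8 induction type (maximum value 255), a
/// known maximum trip count of 200, VF = 4 and UF = 2: 255 - 200 = 55 is
/// greater than VF * UF = 8, so the overflow check is known to be false.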
2245 const LoopVectorizationCostModel *Cost,
2246 ElementCount VF, std::optional<unsigned> UF = std::nullopt) {
2247 // Always be conservative if we don't know the exact unroll factor.
2248 unsigned MaxUF = UF ? *UF : Cost->TTI.getMaxInterleaveFactor(VF);
2249
2250 IntegerType *IdxTy = Cost->Legal->getWidestInductionType();
2251 APInt MaxUIntTripCount = IdxTy->getMask();
2252
2253 // The runtime overflow check is known to be false iff the (max) trip-count
2254 // is known and (max) trip-count + (VF * UF) does not overflow in the type of
2255 // the vector loop induction variable.
2256 if (unsigned TC = Cost->PSE.getSmallConstantMaxTripCount()) {
2257 uint64_t MaxVF = VF.getKnownMinValue();
2258 if (VF.isScalable()) {
2259 std::optional<unsigned> MaxVScale =
2260 getMaxVScale(*Cost->TheFunction, Cost->TTI);
2261 if (!MaxVScale)
2262 return false;
2263 MaxVF *= *MaxVScale;
2264 }
2265
2266 return (MaxUIntTripCount - TC).ugt(MaxVF * MaxUF);
2267 }
2268
2269 return false;
2270}
2271
2272// Return whether we allow using masked interleave-groups (for dealing with
2273// strided loads/stores that reside in predicated blocks, or for dealing
2274// with gaps).
2276 // If an override option has been passed in for interleaved accesses, use it.
2277 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2279
2280 return TTI.enableMaskedInterleavedAccessVectorization();
2281}
2282
2284 BasicBlock *CheckIRBB) {
2285 // Note: The block with the minimum trip-count check is already connected
2286 // during earlier VPlan construction.
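  // For example (illustrative CFG only): an edge preheader -> vector.ph is
  // rewritten to preheader -> check-block -> vector.ph, with the check block
  // also gaining an edge to the scalar preheader for the failing case.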
2287 VPBlockBase *ScalarPH = Plan.getScalarPreheader();
2288 VPBlockBase *PreVectorPH = VectorPHVPBB->getSinglePredecessor();
2289 assert(PreVectorPH->getNumSuccessors() == 2 && "Expected 2 successors");
2290 assert(PreVectorPH->getSuccessors()[0] == ScalarPH && "Unexpected successor");
2291 VPIRBasicBlock *CheckVPIRBB = Plan.createVPIRBasicBlock(CheckIRBB);
2292 VPBlockUtils::insertOnEdge(PreVectorPH, VectorPHVPBB, CheckVPIRBB);
2293 PreVectorPH = CheckVPIRBB;
2294 VPBlockUtils::connectBlocks(PreVectorPH, ScalarPH);
2295 PreVectorPH->swapSuccessors();
2296
2297 // We just connected a new block to the scalar preheader. Update all
2298 // VPPhis by adding an incoming value for it, replicating the last value.
2299 unsigned NumPredecessors = ScalarPH->getNumPredecessors();
2300 for (VPRecipeBase &R : cast<VPBasicBlock>(ScalarPH)->phis()) {
2301 assert(isa<VPPhi>(&R) && "Phi expected to be VPPhi");
2302 assert(cast<VPPhi>(&R)->getNumIncoming() == NumPredecessors - 1 &&
2303 "must have incoming values for all operands");
2304 R.addOperand(R.getOperand(NumPredecessors - 2));
2305 }
2306}
2307
2309 BasicBlock *VectorPH, ElementCount VF, unsigned UF) const {
2310 // Generate code to check if the loop's trip count is less than VF * UF, or
2311 // equal to it in case a scalar epilogue is required; this implies that the
2312 // vector trip count is zero. This check also covers the case where adding one
2313 // to the backedge-taken count overflowed, leading to an incorrect trip count
2314 // of zero. In this case we will also jump to the scalar loop.
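  // For example (illustrative, assuming no tail folding and a minimum
  // profitable trip count of at most VF * UF): with VF = 4 and UF = 2 this
  // emits something like
  //   %min.iters.check = icmp ult i64 %trip.count, 8
  // (or ule when a scalar epilogue is required), and the scalar loop is taken
  // when the check is true.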
2315 auto P = Cost->requiresScalarEpilogue(VF.isVector()) ? ICmpInst::ICMP_ULE
2317
2318 // Reuse existing vector loop preheader for TC checks.
2319 // Note that a new preheader block is generated for the vector loop.
2320 BasicBlock *const TCCheckBlock = VectorPH;
2322 TCCheckBlock->getContext(),
2323 InstSimplifyFolder(TCCheckBlock->getDataLayout()));
2324 Builder.SetInsertPoint(TCCheckBlock->getTerminator());
2325
2327 // If the tail is to be folded, the vector loop takes care of all iterations.
2328 Type *CountTy = Count->getType();
2329 Value *CheckMinIters = Builder.getFalse();
2330 auto CreateStep = [&]() -> Value * {
2331 // Create step with max(MinProTripCount, UF * VF).
2332 if (UF * VF.getKnownMinValue() >= MinProfitableTripCount.getKnownMinValue())
2333 return createStepForVF(Builder, CountTy, VF, UF);
2334
2335 Value *MinProfTC =
2336 Builder.CreateElementCount(CountTy, MinProfitableTripCount);
2337 if (!VF.isScalable())
2338 return MinProfTC;
2339 return Builder.CreateBinaryIntrinsic(
2340 Intrinsic::umax, MinProfTC, createStepForVF(Builder, CountTy, VF, UF));
2341 };
2342
2343 TailFoldingStyle Style = Cost->getTailFoldingStyle();
2344 if (Style == TailFoldingStyle::None) {
2345 Value *Step = CreateStep();
2346 ScalarEvolution &SE = *PSE.getSE();
2347 // TODO: Emit unconditional branch to vector preheader instead of
2348 // conditional branch with known condition.
2349 const SCEV *TripCountSCEV = SE.applyLoopGuards(SE.getSCEV(Count), OrigLoop);
2350 // Check if the trip count is < the step.
2351 if (SE.isKnownPredicate(P, TripCountSCEV, SE.getSCEV(Step))) {
2352 // TODO: Ensure step is at most the trip count when determining max VF and
2353 // UF, w/o tail folding.
2354 CheckMinIters = Builder.getTrue();
2356 TripCountSCEV, SE.getSCEV(Step))) {
2357 // Generate the minimum iteration check only if we cannot prove the
2358 // check is known to be true, or known to be false.
2359 CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check");
2360 } // else step known to be < trip count, use CheckMinIters preset to false.
2361 } else if (VF.isScalable() && !TTI->isVScaleKnownToBeAPowerOfTwo() &&
2364 // vscale is not necessarily a power-of-2, which means we cannot guarantee
2365 // an overflow to zero when updating induction variables and so an
2366 // additional overflow check is required before entering the vector loop.
2367
2368 // Get the maximum unsigned value for the type.
2369 Value *MaxUIntTripCount =
2370 ConstantInt::get(CountTy, cast<IntegerType>(CountTy)->getMask());
2371 Value *LHS = Builder.CreateSub(MaxUIntTripCount, Count);
2372
2373 // Don't execute the vector loop if (UMax - n) < (VF * UF).
2374 CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, CreateStep());
2375 }
2376 return CheckMinIters;
2377}
2378
2379/// Replace \p VPBB with a VPIRBasicBlock wrapping \p IRBB. All recipes from \p
2380/// VPBB are moved to the end of the newly created VPIRBasicBlock. All
2381/// predecessors and successors of VPBB, if any, are rewired to the new
2382/// VPIRBasicBlock. If \p VPBB may be unreachable, \p Plan must be passed.
2384 BasicBlock *IRBB,
2385 VPlan *Plan = nullptr) {
2386 if (!Plan)
2387 Plan = VPBB->getPlan();
2388 VPIRBasicBlock *IRVPBB = Plan->createVPIRBasicBlock(IRBB);
2389 auto IP = IRVPBB->begin();
2390 for (auto &R : make_early_inc_range(VPBB->phis()))
2391 R.moveBefore(*IRVPBB, IP);
2392
2393 for (auto &R :
2395 R.moveBefore(*IRVPBB, IRVPBB->end());
2396
2397 VPBlockUtils::reassociateBlocks(VPBB, IRVPBB);
2398 // VPBB is now dead and will be cleaned up when the plan gets destroyed.
2399 return IRVPBB;
2400}
2401
2403 BasicBlock *VectorPH = OrigLoop->getLoopPreheader();
2404 assert(VectorPH && "Invalid loop structure");
2405 assert((OrigLoop->getUniqueLatchExitBlock() ||
2406 Cost->requiresScalarEpilogue(VF.isVector())) &&
2407 "loops not exiting via the latch without required epilogue?");
2408
2409 // NOTE: The Plan's scalar preheader VPBB isn't replaced with a VPIRBasicBlock
2410 // wrapping the newly created scalar preheader here at the moment, because the
2411 // Plan's scalar preheader may be unreachable at this point. Instead it is
2412 // replaced in executePlan.
2413 return SplitBlock(VectorPH, VectorPH->getTerminator(), DT, LI, nullptr,
2414 Twine(Prefix) + "scalar.ph");
2415}
2416
2417/// Return the expanded step for \p ID using \p ExpandedSCEVs to look up SCEV
2418/// expansion results.
2420 const SCEV2ValueTy &ExpandedSCEVs) {
2421 const SCEV *Step = ID.getStep();
2422 if (auto *C = dyn_cast<SCEVConstant>(Step))
2423 return C->getValue();
2424 if (auto *U = dyn_cast<SCEVUnknown>(Step))
2425 return U->getValue();
2426 Value *V = ExpandedSCEVs.lookup(Step);
2427 assert(V && "SCEV must be expanded at this point");
2428 return V;
2429}
2430
2431 /// Knowing that loop \p L executes a single vector iteration, add instructions
2432 /// that will get simplified away, and thus should not have any cost, to
2433 /// \p InstsToIgnore.
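/// For example, the latch compare and an induction-variable increment whose
/// only users are the induction phi and that compare fold away when the loop
/// runs exactly one vector iteration, so they are added to \p InstsToIgnore.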
2436 SmallPtrSetImpl<Instruction *> &InstsToIgnore) {
2437 auto *Cmp = L->getLatchCmpInst();
2438 if (Cmp)
2439 InstsToIgnore.insert(Cmp);
2440 for (const auto &KV : IL) {
2441 // Extract the key by hand so that it can be used in the lambda below. Note
2442 // that captured structured bindings are a C++20 extension.
2443 const PHINode *IV = KV.first;
2444
2445 // Get next iteration value of the induction variable.
2446 Instruction *IVInst =
2447 cast<Instruction>(IV->getIncomingValueForBlock(L->getLoopLatch()));
2448 if (all_of(IVInst->users(),
2449 [&](const User *U) { return U == IV || U == Cmp; }))
2450 InstsToIgnore.insert(IVInst);
2451 }
2452}
2453
2455 // Create a new IR basic block for the scalar preheader.
2456 BasicBlock *ScalarPH = createScalarPreheader("");
2457 return ScalarPH->getSinglePredecessor();
2458}
2459
2460namespace {
2461
2462struct CSEDenseMapInfo {
2463 static bool canHandle(const Instruction *I) {
2466 }
2467
2468 static inline Instruction *getEmptyKey() {
2470 }
2471
2472 static inline Instruction *getTombstoneKey() {
2473 return DenseMapInfo<Instruction *>::getTombstoneKey();
2474 }
2475
2476 static unsigned getHashValue(const Instruction *I) {
2477 assert(canHandle(I) && "Unknown instruction!");
2478 return hash_combine(I->getOpcode(),
2479 hash_combine_range(I->operand_values()));
2480 }
2481
2482 static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
2483 if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
2484 LHS == getTombstoneKey() || RHS == getTombstoneKey())
2485 return LHS == RHS;
2486 return LHS->isIdenticalTo(RHS);
2487 }
2488};
2489
2490} // end anonymous namespace
2491
2492/// FIXME: This legacy common-subexpression-elimination routine is scheduled for
2493/// removal, in favor of the VPlan-based one.
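/// For example (a hypothetical case), two identical `add i32 %a, %b`
/// instructions in \p BB are merged: uses of the second are rewritten to the
/// first and the duplicate is erased. Any instruction accepted by
/// CSEDenseMapInfo::canHandle is treated the same way.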
2494static void legacyCSE(BasicBlock *BB) {
2495 // Perform simple cse.
2497 for (Instruction &In : llvm::make_early_inc_range(*BB)) {
2498 if (!CSEDenseMapInfo::canHandle(&In))
2499 continue;
2500
2501 // Check if we can replace this instruction with any of the
2502 // visited instructions.
2503 if (Instruction *V = CSEMap.lookup(&In)) {
2504 In.replaceAllUsesWith(V);
2505 In.eraseFromParent();
2506 continue;
2507 }
2508
2509 CSEMap[&In] = &In;
2510 }
2511}
2512
2513/// This function attempts to return a value that represents the ElementCount
2514/// at runtime. For fixed-width VFs we know this precisely at compile
2515/// time, but for scalable VFs we calculate it based on an estimate of the
2516/// vscale value.
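/// For example, a fixed VF of 8 is returned as 8, while a scalable VF of
/// vscale x 4 with an estimated vscale of 2 is also returned as 8.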
2518 std::optional<unsigned> VScale) {
2519 unsigned EstimatedVF = VF.getKnownMinValue();
2520 if (VF.isScalable())
2521 if (VScale)
2522 EstimatedVF *= *VScale;
2523 assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1");
2524 return EstimatedVF;
2525}
2526
2529 ElementCount VF) const {
2530 // We only need to calculate a cost if the VF is scalar; for actual vectors
2531 // we should already have a pre-calculated cost at each VF.
2532 if (!VF.isScalar())
2533 return getCallWideningDecision(CI, VF).Cost;
2534
2535 Type *RetTy = CI->getType();
2537 if (auto RedCost = getReductionPatternCost(CI, VF, RetTy))
2538 return *RedCost;
2539
2541 for (auto &ArgOp : CI->args())
2542 Tys.push_back(ArgOp->getType());
2543
2544 InstructionCost ScalarCallCost =
2545 TTI.getCallInstrCost(CI->getCalledFunction(), RetTy, Tys, CostKind);
2546
2547 // If this is an intrinsic we may have a lower cost for it.
2550 return std::min(ScalarCallCost, IntrinsicCost);
2551 }
2552 return ScalarCallCost;
2553}
2554
2556 if (VF.isScalar() || !canVectorizeTy(Ty))
2557 return Ty;
2558 return toVectorizedTy(Ty, VF);
2559}
2560
2563 ElementCount VF) const {
2565 assert(ID && "Expected intrinsic call!");
2566 Type *RetTy = maybeVectorizeType(CI->getType(), VF);
2567 FastMathFlags FMF;
2568 if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
2569 FMF = FPMO->getFastMathFlags();
2570
2573 SmallVector<Type *> ParamTys;
2574 std::transform(FTy->param_begin(), FTy->param_end(),
2575 std::back_inserter(ParamTys),
2576 [&](Type *Ty) { return maybeVectorizeType(Ty, VF); });
2577
2578 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
2581 return TTI.getIntrinsicInstrCost(CostAttrs, CostKind);
2582}
2583
2585 // Fix widened non-induction PHIs by setting up the PHI operands.
2586 fixNonInductionPHIs(State);
2587
2588 // Don't apply optimizations below when no (vector) loop remains, as they all
2589 // require one at the moment.
2590 VPBasicBlock *HeaderVPBB =
2591 vputils::getFirstLoopHeader(*State.Plan, State.VPDT);
2592 if (!HeaderVPBB)
2593 return;
2594
2595 BasicBlock *HeaderBB = State.CFG.VPBB2IRBB[HeaderVPBB];
2596
2597 // Remove redundant induction instructions.
2598 legacyCSE(HeaderBB);
2599}
2600
2602 auto Iter = vp_depth_first_shallow(Plan.getEntry());
2604 for (VPRecipeBase &P : VPBB->phis()) {
2606 if (!VPPhi)
2607 continue;
2608 PHINode *NewPhi = cast<PHINode>(State.get(VPPhi));
2609 // Make sure the builder has a valid insert point.
2610 Builder.SetInsertPoint(NewPhi);
2611 for (const auto &[Inc, VPBB] : VPPhi->incoming_values_and_blocks())
2612 NewPhi->addIncoming(State.get(Inc), State.CFG.VPBB2IRBB[VPBB]);
2613 }
2614 }
2615}
2616
2617void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
2618 // We should not collect Scalars more than once per VF. Right now, this
2619 // function is called from collectUniformsAndScalars(), which already does
2620 // this check. Collecting Scalars for VF=1 does not make any sense.
2621 assert(VF.isVector() && !Scalars.contains(VF) &&
2622 "This function should not be visited twice for the same VF");
2623
2624 // This avoids any chances of creating a REPLICATE recipe during planning
2625 // since that would result in generation of scalarized code during execution,
2626 // which is not supported for scalable vectors.
2627 if (VF.isScalable()) {
2628 Scalars[VF].insert_range(Uniforms[VF]);
2629 return;
2630 }
2631
2633
2634 // These sets are used to seed the analysis with pointers used by memory
2635 // accesses that will remain scalar.
2637 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
2638 auto *Latch = TheLoop->getLoopLatch();
2639
2640 // A helper that returns true if the use of Ptr by MemAccess will be scalar.
2641 // The pointer operands of loads and stores will be scalar as long as the
2642 // memory access is not a gather or scatter operation. The value operand of a
2643 // store will remain scalar if the store is scalarized.
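  // For example, for a consecutive (widened) store the pointer operand is a
  // scalar use, whereas the stored value operand is a scalar use only if the
  // store itself is scalarized.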
2644 auto IsScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
2645 InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
2646 assert(WideningDecision != CM_Unknown &&
2647 "Widening decision should be ready at this moment");
2648 if (auto *Store = dyn_cast<StoreInst>(MemAccess))
2649 if (Ptr == Store->getValueOperand())
2650 return WideningDecision == CM_Scalarize;
2651 assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
2652 "Ptr is neither a value or pointer operand");
2653 return WideningDecision != CM_GatherScatter;
2654 };
2655
2656 // A helper that returns true if the given value is a loop-varying
2657 // getelementptr instruction.
2658 auto IsLoopVaryingGEP = [&](Value *V) {
2659 return isa<GetElementPtrInst>(V) && !TheLoop->isLoopInvariant(V);
2660 };
2661
2662 // A helper that evaluates a memory access's use of a pointer. If the use will
2663 // be a scalar use and the pointer is only used by memory accesses, we place
2664 // the pointer in ScalarPtrs. Otherwise, the pointer is placed in
2665 // PossibleNonScalarPtrs.
2666 auto EvaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
2667 // We only care about loop-varying getelementptr instructions contained in
2668 // the loop.
2669 if (!IsLoopVaryingGEP(Ptr))
2670 return;
2671
2672 // If the pointer has already been identified as scalar (e.g., if it was
2673 // also identified as uniform), there's nothing to do.
2674 auto *I = cast<Instruction>(Ptr);
2675 if (Worklist.count(I))
2676 return;
2677
2678 // If the use of the pointer will be a scalar use, and all users of the
2679 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
2680 // place the pointer in PossibleNonScalarPtrs.
2681 if (IsScalarUse(MemAccess, Ptr) &&
2683 ScalarPtrs.insert(I);
2684 else
2685 PossibleNonScalarPtrs.insert(I);
2686 };
2687
2688 // We seed the scalars analysis with two classes of instructions: (1)
2689 // instructions marked uniform-after-vectorization and (2) bitcast,
2690 // getelementptr and (pointer) phi instructions used by memory accesses
2691 // requiring a scalar use.
2692 //
2693 // (1) Add to the worklist all instructions that have been identified as
2694 // uniform-after-vectorization.
2695 Worklist.insert_range(Uniforms[VF]);
2696
2697 // (2) Add to the worklist all bitcast and getelementptr instructions used by
2698 // memory accesses requiring a scalar use. The pointer operands of loads and
2699 // stores will be scalar unless the operation is a gather or scatter.
2700 // The value operand of a store will remain scalar if the store is scalarized.
2701 for (auto *BB : TheLoop->blocks())
2702 for (auto &I : *BB) {
2703 if (auto *Load = dyn_cast<LoadInst>(&I)) {
2704 EvaluatePtrUse(Load, Load->getPointerOperand());
2705 } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
2706 EvaluatePtrUse(Store, Store->getPointerOperand());
2707 EvaluatePtrUse(Store, Store->getValueOperand());
2708 }
2709 }
2710 for (auto *I : ScalarPtrs)
2711 if (!PossibleNonScalarPtrs.count(I)) {
2712 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
2713 Worklist.insert(I);
2714 }
2715
2716 // Insert the forced scalars.
2717 // FIXME: Currently VPWidenPHIRecipe() often creates a dead vector
2718 // induction variable when the PHI user is scalarized.
2719 auto ForcedScalar = ForcedScalars.find(VF);
2720 if (ForcedScalar != ForcedScalars.end())
2721 for (auto *I : ForcedScalar->second) {
2722 LLVM_DEBUG(dbgs() << "LV: Found (forced) scalar instruction: " << *I << "\n");
2723 Worklist.insert(I);
2724 }
2725
2726 // Expand the worklist by looking through any bitcasts and getelementptr
2727 // instructions we've already identified as scalar. This is similar to the
2728 // expansion step in collectLoopUniforms(); however, here we're only
2729 // expanding to include additional bitcasts and getelementptr instructions.
2730 unsigned Idx = 0;
2731 while (Idx != Worklist.size()) {
2732 Instruction *Dst = Worklist[Idx++];
2733 if (!IsLoopVaryingGEP(Dst->getOperand(0)))
2734 continue;
2735 auto *Src = cast<Instruction>(Dst->getOperand(0));
2736 if (llvm::all_of(Src->users(), [&](User *U) -> bool {
2737 auto *J = cast<Instruction>(U);
2738 return !TheLoop->contains(J) || Worklist.count(J) ||
2739 ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
2740 IsScalarUse(J, Src));
2741 })) {
2742 Worklist.insert(Src);
2743 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
2744 }
2745 }
2746
2747 // An induction variable will remain scalar if all users of the induction
2748 // variable and induction variable update remain scalar.
2749 for (const auto &Induction : Legal->getInductionVars()) {
2750 auto *Ind = Induction.first;
2751 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
2752
2753 // If tail-folding is applied, the primary induction variable will be used
2754 // to feed a vector compare.
2755 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
2756 continue;
2757
2758 // Returns true if \p Indvar is a pointer induction that is used directly by
2759 // load/store instruction \p I.
2760 auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar,
2761 Instruction *I) {
2762 return Induction.second.getKind() ==
2765 Indvar == getLoadStorePointerOperand(I) && IsScalarUse(I, Indvar);
2766 };
2767
2768 // Determine if all users of the induction variable are scalar after
2769 // vectorization.
2770 bool ScalarInd = all_of(Ind->users(), [&](User *U) -> bool {
2771 auto *I = cast<Instruction>(U);
2772 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
2773 IsDirectLoadStoreFromPtrIndvar(Ind, I);
2774 });
2775 if (!ScalarInd)
2776 continue;
2777
2778 // If the induction variable update is a fixed-order recurrence, neither the
2779 // induction variable nor its update should be marked scalar after
2780 // vectorization.
2781 auto *IndUpdatePhi = dyn_cast<PHINode>(IndUpdate);
2782 if (IndUpdatePhi && Legal->isFixedOrderRecurrence(IndUpdatePhi))
2783 continue;
2784
2785 // Determine if all users of the induction variable update instruction are
2786 // scalar after vectorization.
2787 bool ScalarIndUpdate = all_of(IndUpdate->users(), [&](User *U) -> bool {
2788 auto *I = cast<Instruction>(U);
2789 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
2790 IsDirectLoadStoreFromPtrIndvar(IndUpdate, I);
2791 });
2792 if (!ScalarIndUpdate)
2793 continue;
2794
2795 // The induction variable and its update instruction will remain scalar.
2796 Worklist.insert(Ind);
2797 Worklist.insert(IndUpdate);
2798 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
2799 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
2800 << "\n");
2801 }
2802
2803 Scalars[VF].insert_range(Worklist);
2804}
2805
2807 Instruction *I, ElementCount VF) const {
2808 if (!isPredicatedInst(I))
2809 return false;
2810
2811 // Do we have a non-scalar lowering for this predicated
2812 // instruction? If not, it is scalar with predication.
2813 switch(I->getOpcode()) {
2814 default:
2815 return true;
2816 case Instruction::Call:
2817 if (VF.isScalar())
2818 return true;
2820 case Instruction::Load:
2821 case Instruction::Store: {
2823 auto *Ty = getLoadStoreType(I);
2824 unsigned AS = getLoadStoreAddressSpace(I);
2825 Type *VTy = Ty;
2826 if (VF.isVector())
2827 VTy = VectorType::get(Ty, VF);
2828 const Align Alignment = getLoadStoreAlignment(I);
2829 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment, AS) ||
2830 TTI.isLegalMaskedGather(VTy, Alignment))
2831 : !(isLegalMaskedStore(Ty, Ptr, Alignment, AS) ||
2832 TTI.isLegalMaskedScatter(VTy, Alignment));
2833 }
2834 case Instruction::UDiv:
2835 case Instruction::SDiv:
2836 case Instruction::SRem:
2837 case Instruction::URem: {
2838 // We have the option to use the safe-divisor idiom to avoid predication.
2839 // The cost-based decision here will always select safe-divisor for
2840 // scalable vectors as scalarization isn't legal.
2841 const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
2842 return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost);
2843 }
2844 }
2845}
2846
2847// TODO: Fold into LoopVectorizationLegality::isMaskRequired.
2849 // TODO: We can use the loop-preheader as context point here and get
2850 // context-sensitive reasoning for isSafeToSpeculativelyExecute.
2852 (isa<LoadInst, StoreInst, CallInst>(I) && !Legal->isMaskRequired(I)) ||
2854 return false;
2855
2856 // If the instruction was executed conditionally in the original scalar loop,
2857 // predication is needed with a mask whose lanes are all possibly inactive.
2858 if (Legal->blockNeedsPredication(I->getParent()))
2859 return true;
2860
2861 // If we're not folding the tail by masking, predication is unnecessary.
2862 if (!foldTailByMasking())
2863 return false;
2864
2865 // All that remain are instructions with side-effects originally executed in
2866 // the loop unconditionally, but which now execute under a tail-fold mask that
2867 // is only known to have at least one active lane (the first). If the
2868 // side-effects of the instruction are invariant, executing it without the
2869 // tail-folding mask is safe - it will cause the same side-effects as when masked.
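  // For example, under tail folding a store of a loop-invariant value to a
  // loop-invariant address writes the same value no matter which lanes are
  // active, so it needs no predication, whereas a store of a loop-varying
  // value does.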
2870 switch(I->getOpcode()) {
2871 default:
2873 "instruction should have been considered by earlier checks");
2874 case Instruction::Call:
2875 // Side-effects of a Call are assumed to be non-invariant, needing a
2876 // (fold-tail) mask.
2877 assert(Legal->isMaskRequired(I) &&
2878 "should have returned earlier for calls not needing a mask");
2879 return true;
2880 case Instruction::Load:
2881 // If the address is loop invariant no predication is needed.
2882 return !Legal->isInvariant(getLoadStorePointerOperand(I));
2883 case Instruction::Store: {
2884 // For stores, we need to prove both speculation safety (which follows from
2885 // the same argument as for loads) and that the value being stored is correct.
2886 // The easiest form of the latter is to require that all values stored are the
2887 // same.
2888 return !(Legal->isInvariant(getLoadStorePointerOperand(I)) &&
2889 TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand()));
2890 }
2891 case Instruction::UDiv:
2892 case Instruction::SDiv:
2893 case Instruction::SRem:
2894 case Instruction::URem:
2895 // If the divisor is loop-invariant no predication is needed.
2896 return !Legal->isInvariant(I->getOperand(1));
2897 }
2898}
2899
2900std::pair<InstructionCost, InstructionCost>
2902 ElementCount VF) const {
2903 assert(I->getOpcode() == Instruction::UDiv ||
2904 I->getOpcode() == Instruction::SDiv ||
2905 I->getOpcode() == Instruction::SRem ||
2906 I->getOpcode() == Instruction::URem);
2908
2909 // Scalarization isn't legal for scalable vector types
2910 InstructionCost ScalarizationCost = InstructionCost::getInvalid();
2911 if (!VF.isScalable()) {
2912 // Get the scalarization cost and scale this amount by the probability of
2913 // executing the predicated block. If the instruction is not predicated,
2914 // we fall through to the next case.
2915 ScalarizationCost = 0;
2916
2917 // These instructions have a non-void type, so account for the phi nodes
2918 // that we will create. This cost is likely to be zero. The phi node
2919 // cost, if any, should be scaled by the block probability because it
2920 // models a copy at the end of each predicated block.
2921 ScalarizationCost +=
2922 VF.getFixedValue() * TTI.getCFInstrCost(Instruction::PHI, CostKind);
2923
2924 // The cost of the non-predicated instruction.
2925 ScalarizationCost +=
2926 VF.getFixedValue() *
2927 TTI.getArithmeticInstrCost(I->getOpcode(), I->getType(), CostKind);
2928
2929 // The cost of insertelement and extractelement instructions needed for
2930 // scalarization.
2931 ScalarizationCost += getScalarizationOverhead(I, VF);
2932
2933 // Scale the cost by the probability of executing the predicated blocks.
2934 // This assumes the predicated block for each vector lane is equally
2935 // likely.
2936 ScalarizationCost =
2937 ScalarizationCost / getPredBlockCostDivisor(CostKind, I->getParent());
2938 }
2939
2940 InstructionCost SafeDivisorCost = 0;
2941 auto *VecTy = toVectorTy(I->getType(), VF);
2942 // The cost of the select guard to ensure all lanes are well defined
2943 // after we speculate above any internal control flow.
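  // For example (an illustrative sketch of the safe-divisor idiom), a
  // predicated `udiv %x, %d` can instead be emitted as
  // `udiv %x, (select %mask, %d, 1)`, which keeps every lane well defined
  // without introducing control flow.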
2944 SafeDivisorCost +=
2945 TTI.getCmpSelInstrCost(Instruction::Select, VecTy,
2946 toVectorTy(Type::getInt1Ty(I->getContext()), VF),
2948
2949 SmallVector<const Value *, 4> Operands(I->operand_values());
2950 SafeDivisorCost += TTI.getArithmeticInstrCost(
2951 I->getOpcode(), VecTy, CostKind,
2952 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
2953 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
2954 Operands, I);
2955 return {ScalarizationCost, SafeDivisorCost};
2956}
2957
2959 Instruction *I, ElementCount VF) const {
2960 assert(isAccessInterleaved(I) && "Expecting interleaved access.");
2962 "Decision should not be set yet.");
2963 auto *Group = getInterleavedAccessGroup(I);
2964 assert(Group && "Must have a group.");
2965 unsigned InterleaveFactor = Group->getFactor();
2966
2967 // If the instruction's allocated size doesn't equal its type size, it
2968 // requires padding and will be scalarized.
2969 auto &DL = I->getDataLayout();
2970 auto *ScalarTy = getLoadStoreType(I);
2971 if (hasIrregularType(ScalarTy, DL))
2972 return false;
2973
2974 // For scalable vectors, the interleave factors must be <= 8 since we require
2975 // the (de)interleaveN intrinsics instead of shufflevectors.
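  // For illustration, a factor-2 load group {A[2*i], A[2*i+1]} is lowered to a
  // single wide load followed by shufflevectors for fixed-width VFs, or by the
  // llvm.vector.deinterleave2 intrinsic for scalable VFs.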
2976 if (VF.isScalable() && InterleaveFactor > 8)
2977 return false;
2978
2979 // If the group involves a non-integral pointer, we may not be able to
2980 // losslessly cast all values to a common type.
2981 bool ScalarNI = DL.isNonIntegralPointerType(ScalarTy);
2982 for (unsigned Idx = 0; Idx < InterleaveFactor; Idx++) {
2983 Instruction *Member = Group->getMember(Idx);
2984 if (!Member)
2985 continue;
2986 auto *MemberTy = getLoadStoreType(Member);
2987 bool MemberNI = DL.isNonIntegralPointerType(MemberTy);
2988 // Don't coerce non-integral pointers to integers or vice versa.
2989 if (MemberNI != ScalarNI)
2990 // TODO: Consider adding special nullptr value case here
2991 return false;
2992 if (MemberNI && ScalarNI &&
2993 ScalarTy->getPointerAddressSpace() !=
2994 MemberTy->getPointerAddressSpace())
2995 return false;
2996 }
2997
2998 // Check if masking is required.
2999 // A Group may need masking for one of two reasons: it resides in a block that
3000 // needs predication, or it was decided to use masking to deal with gaps
3001 // (either a gap at the end of a load-access that may result in a speculative
3002 // load, or any gaps in a store-access).
3003 bool PredicatedAccessRequiresMasking =
3004 blockNeedsPredicationForAnyReason(I->getParent()) &&
3005 Legal->isMaskRequired(I);
3006 bool LoadAccessWithGapsRequiresEpilogMasking =
3007 isa<LoadInst>(I) && Group->requiresScalarEpilogue() &&
3009 bool StoreAccessWithGapsRequiresMasking =
3010 isa<StoreInst>(I) && !Group->isFull();
3011 if (!PredicatedAccessRequiresMasking &&
3012 !LoadAccessWithGapsRequiresEpilogMasking &&
3013 !StoreAccessWithGapsRequiresMasking)
3014 return true;
3015
3016 // If masked interleaving is required, we expect that the user/target had
3017 // enabled it, because otherwise it either wouldn't have been created or
3018 // it should have been invalidated by the CostModel.
3020 "Masked interleave-groups for predicated accesses are not enabled.");
3021
3022 if (Group->isReverse())
3023 return false;
3024
3025 // TODO: Support interleaved access that requires a gap mask for scalable VFs.
3026 bool NeedsMaskForGaps = LoadAccessWithGapsRequiresEpilogMasking ||
3027 StoreAccessWithGapsRequiresMasking;
3028 if (VF.isScalable() && NeedsMaskForGaps)
3029 return false;
3030
3031 auto *Ty = getLoadStoreType(I);
3032 const Align Alignment = getLoadStoreAlignment(I);
3033 unsigned AS = getLoadStoreAddressSpace(I);
3034 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment, AS)
3035 : TTI.isLegalMaskedStore(Ty, Alignment, AS);
3036}
3037
3039 Instruction *I, ElementCount VF) {
3040 // Get and ensure we have a valid memory instruction.
3041 assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");
3042
3044 auto *ScalarTy = getLoadStoreType(I);
3045
3046 // First of all, in order to be widened, the pointer must be consecutive.
3047 if (!Legal->isConsecutivePtr(ScalarTy, Ptr))
3048 return false;
3049
3050 // If the instruction is a store located in a predicated block, it will be
3051 // scalarized.
3052 if (isScalarWithPredication(I, VF))
3053 return false;
3054
3055 // If the instruction's allocated size doesn't equal its type size, it
3056 // requires padding and will be scalarized.
3057 auto &DL = I->getDataLayout();
3058 if (hasIrregularType(ScalarTy, DL))
3059 return false;
3060
3061 return true;
3062}
3063
3064void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
3065 // We should not collect Uniforms more than once per VF. Right now,
3066 // this function is called from collectUniformsAndScalars(), which
3067 // already does this check. Collecting Uniforms for VF=1 does not make any
3068 // sense.
3069
3070 assert(VF.isVector() && !Uniforms.contains(VF) &&
3071 "This function should not be visited twice for the same VF");
3072
3073 // Create the entry for this VF up front so we do not analyze it again even
3074 // if no uniform values are found; Uniforms.count(VF) will then return 1.
3075 Uniforms[VF].clear();
3076
3077 // Now we know that the loop is vectorizable!
3078 // Collect instructions inside the loop that will remain uniform after
3079 // vectorization.
3080
3081 // Global values, params and instructions outside of current loop are out of
3082 // scope.
3083 auto IsOutOfScope = [&](Value *V) -> bool {
3085 return (!I || !TheLoop->contains(I));
3086 };
3087
3088 // Worklist containing uniform instructions demanding lane 0.
3089 SetVector<Instruction *> Worklist;
3090
3091 // Add uniform instructions demanding lane 0 to the worklist. Instructions
3092 // that require predication must not be considered uniform after
3093 // vectorization, because that would create an erroneous replicating region
3094 // where only a single instance out of VF should be formed.
3095 auto AddToWorklistIfAllowed = [&](Instruction *I) -> void {
3096 if (IsOutOfScope(I)) {
3097 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
3098 << *I << "\n");
3099 return;
3100 }
3101 if (isPredicatedInst(I)) {
3102 LLVM_DEBUG(
3103 dbgs() << "LV: Found not uniform due to requiring predication: " << *I
3104 << "\n");
3105 return;
3106 }
3107 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
3108 Worklist.insert(I);
3109 };
3110
3111 // Start with the conditional branches exiting the loop. If the branch
3112 // condition is an instruction contained in the loop that is only used by the
3113 // branch, it is uniform. Note conditions from uncountable early exits are not
3114 // uniform.
3115 SmallVector<BasicBlock *> Exiting;
3116 TheLoop->getExitingBlocks(Exiting);
3117 for (BasicBlock *E : Exiting) {
3118 if (Legal->hasUncountableEarlyExit() && TheLoop->getLoopLatch() != E)
3119 continue;
3120 auto *Cmp = dyn_cast<Instruction>(E->getTerminator()->getOperand(0));
3121 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
3122 AddToWorklistIfAllowed(Cmp);
3123 }
3124
3125 auto PrevVF = VF.divideCoefficientBy(2);
3126 // Return true if all lanes perform the same memory operation, and we can
3127 // thus choose to execute only one.
3128 auto IsUniformMemOpUse = [&](Instruction *I) {
3129 // If the value was already known to not be uniform for the previous
3130 // (smaller VF), it cannot be uniform for the larger VF.
3131 if (PrevVF.isVector()) {
3132 auto Iter = Uniforms.find(PrevVF);
3133 if (Iter != Uniforms.end() && !Iter->second.contains(I))
3134 return false;
3135 }
3136 if (!Legal->isUniformMemOp(*I, VF))
3137 return false;
3138 if (isa<LoadInst>(I))
3139 // Loading the same address always produces the same result - at least
3140 // assuming aliasing and ordering which have already been checked.
3141 return true;
3142 // Storing the same value on every iteration.
3143 return TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand());
3144 };
3145
3146 auto IsUniformDecision = [&](Instruction *I, ElementCount VF) {
3147 InstWidening WideningDecision = getWideningDecision(I, VF);
3148 assert(WideningDecision != CM_Unknown &&
3149 "Widening decision should be ready at this moment");
3150
3151 if (IsUniformMemOpUse(I))
3152 return true;
3153
3154 return (WideningDecision == CM_Widen ||
3155 WideningDecision == CM_Widen_Reverse ||
3156 WideningDecision == CM_Interleave);
3157 };
3158
3159 // Returns true if Ptr is the pointer operand of a memory access instruction
3160 // I, I is known to not require scalarization, and the pointer is not also
3161 // stored.
3162 auto IsVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
3163 if (isa<StoreInst>(I) && I->getOperand(0) == Ptr)
3164 return false;
3165 return getLoadStorePointerOperand(I) == Ptr &&
3166 (IsUniformDecision(I, VF) || Legal->isInvariant(Ptr));
3167 };
3168
3169 // Holds a list of values which are known to have at least one uniform use.
3170 // Note that there may be other uses which aren't uniform. A "uniform use"
3171 // here is something which only demands lane 0 of the unrolled iterations;
3172 // it does not imply that all lanes produce the same value (e.g. this is not
3173 // the usual meaning of uniform)
3174 SetVector<Value *> HasUniformUse;
3175
3176 // Scan the loop for instructions which are either a) known to have only
3177 // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
3178 for (auto *BB : TheLoop->blocks())
3179 for (auto &I : *BB) {
3180 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) {
3181 switch (II->getIntrinsicID()) {
3182 case Intrinsic::sideeffect:
3183 case Intrinsic::experimental_noalias_scope_decl:
3184 case Intrinsic::assume:
3185 case Intrinsic::lifetime_start:
3186 case Intrinsic::lifetime_end:
3187 if (TheLoop->hasLoopInvariantOperands(&I))
3188 AddToWorklistIfAllowed(&I);
3189 break;
3190 default:
3191 break;
3192 }
3193 }
3194
3195 if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) {
3196 if (IsOutOfScope(EVI->getAggregateOperand())) {
3197 AddToWorklistIfAllowed(EVI);
3198 continue;
3199 }
3200 // Only ExtractValue instructions where the aggregate value comes from a
3201 // call are allowed to be non-uniform.
3202 assert(isa<CallInst>(EVI->getAggregateOperand()) &&
3203 "Expected aggregate value to be call return value");
3204 }
3205
3206 // If there's no pointer operand, there's nothing to do.
3207 auto *Ptr = getLoadStorePointerOperand(&I);
3208 if (!Ptr)
3209 continue;
3210
3211 // If the pointer can be proven to be uniform, always add it to the
3212 // worklist.
3213 if (isa<Instruction>(Ptr) && Legal->isUniform(Ptr, VF))
3214 AddToWorklistIfAllowed(cast<Instruction>(Ptr));
3215
3216 if (IsUniformMemOpUse(&I))
3217 AddToWorklistIfAllowed(&I);
3218
3219 if (IsVectorizedMemAccessUse(&I, Ptr))
3220 HasUniformUse.insert(Ptr);
3221 }
3222
3223 // Add to the worklist any operands which have *only* uniform (e.g. lane 0
3224 // demanding) users. Since loops are assumed to be in LCSSA form, this
3225 // disallows uses outside the loop as well.
3226 for (auto *V : HasUniformUse) {
3227 if (IsOutOfScope(V))
3228 continue;
3229 auto *I = cast<Instruction>(V);
3230 bool UsersAreMemAccesses = all_of(I->users(), [&](User *U) -> bool {
3231 auto *UI = cast<Instruction>(U);
3232 return TheLoop->contains(UI) && IsVectorizedMemAccessUse(UI, V);
3233 });
3234 if (UsersAreMemAccesses)
3235 AddToWorklistIfAllowed(I);
3236 }
3237
3238 // Expand Worklist in topological order: whenever a new instruction
3239 // is added, its users should already be inside Worklist. This ensures
3240 // that a uniform instruction will only be used by uniform instructions.
3241 unsigned Idx = 0;
3242 while (Idx != Worklist.size()) {
3243 Instruction *I = Worklist[Idx++];
3244
3245 for (auto *OV : I->operand_values()) {
3246 // isOutOfScope operands cannot be uniform instructions.
3247 if (IsOutOfScope(OV))
3248 continue;
3249 // First-order recurrence phis should typically be considered
3250 // non-uniform.
3251 auto *OP = dyn_cast<PHINode>(OV);
3252 if (OP && Legal->isFixedOrderRecurrence(OP))
3253 continue;
3254 // If all the users of the operand are uniform, then add the
3255 // operand into the uniform worklist.
3256 auto *OI = cast<Instruction>(OV);
3257 if (llvm::all_of(OI->users(), [&](User *U) -> bool {
3258 auto *J = cast<Instruction>(U);
3259 return Worklist.count(J) || IsVectorizedMemAccessUse(J, OI);
3260 }))
3261 AddToWorklistIfAllowed(OI);
3262 }
3263 }
3264
3265 // For an instruction to be added into Worklist above, all its users inside
3266 // the loop should also be in Worklist. However, this condition cannot be
3267 // true for phi nodes that form a cyclic dependence. We must process phi
3268 // nodes separately. An induction variable will remain uniform if all users
3269 // of the induction variable and induction variable update remain uniform.
3270 // The code below handles both pointer and non-pointer induction variables.
3271 BasicBlock *Latch = TheLoop->getLoopLatch();
3272 for (const auto &Induction : Legal->getInductionVars()) {
3273 auto *Ind = Induction.first;
3274 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
3275
3276 // Determine if all users of the induction variable are uniform after
3277 // vectorization.
3278 bool UniformInd = all_of(Ind->users(), [&](User *U) -> bool {
3279 auto *I = cast<Instruction>(U);
3280 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
3281 IsVectorizedMemAccessUse(I, Ind);
3282 });
3283 if (!UniformInd)
3284 continue;
3285
3286 // Determine if all users of the induction variable update instruction are
3287 // uniform after vectorization.
3288 bool UniformIndUpdate = all_of(IndUpdate->users(), [&](User *U) -> bool {
3289 auto *I = cast<Instruction>(U);
3290 return I == Ind || Worklist.count(I) ||
3291 IsVectorizedMemAccessUse(I, IndUpdate);
3292 });
3293 if (!UniformIndUpdate)
3294 continue;
3295
3296 // The induction variable and its update instruction will remain uniform.
3297 AddToWorklistIfAllowed(Ind);
3298 AddToWorklistIfAllowed(IndUpdate);
3299 }
3300
3301 Uniforms[VF].insert_range(Worklist);
3302}
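// Illustrative example (a sketch, not a specific regression test): in
//   for (i = 0; i < n; i++) A[i] = B[i] + 42;
// the GEPs feeding A[i] and B[i] are used only as addresses of consecutive,
// widened accesses, so they land in Worklist and only their lane-0 values
// are generated, while the loads and stores themselves are vectorized.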
3303
3305 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
3306
3307 if (Legal->getRuntimePointerChecking()->Need) {
3308 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
3309 "runtime pointer checks needed. Enable vectorization of this "
3310 "loop with '#pragma clang loop vectorize(enable)' when "
3311 "compiling with -Os/-Oz",
3312 "CantVersionLoopWithOptForSize", ORE, TheLoop);
3313 return true;
3314 }
3315
3316 if (!PSE.getPredicate().isAlwaysTrue()) {
3317 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
3318 "runtime SCEV checks needed. Enable vectorization of this "
3319 "loop with '#pragma clang loop vectorize(enable)' when "
3320 "compiling with -Os/-Oz",
3321 "CantVersionLoopWithOptForSize", ORE, TheLoop);
3322 return true;
3323 }
3324
3325 // FIXME: Avoid specializing for stride==1 instead of bailing out.
3326 if (!Legal->getLAI()->getSymbolicStrides().empty()) {
3327 reportVectorizationFailure("Runtime stride check for small trip count",
3328 "runtime stride == 1 checks needed. Enable vectorization of "
3329 "this loop without such check by compiling with -Os/-Oz",
3330 "CantVersionLoopWithOptForSize", ORE, TheLoop);
3331 return true;
3332 }
3333
3334 return false;
3335}
3336
3337bool LoopVectorizationCostModel::isScalableVectorizationAllowed() {
3338 if (IsScalableVectorizationAllowed)
3339 return *IsScalableVectorizationAllowed;
3340
3341 IsScalableVectorizationAllowed = false;
3342 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors)
3343 return false;
3344
3345 if (Hints->isScalableVectorizationDisabled()) {
3346 reportVectorizationInfo("Scalable vectorization is explicitly disabled",
3347 "ScalableVectorizationDisabled", ORE, TheLoop);
3348 return false;
3349 }
3350
3351 LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n");
3352
3353 auto MaxScalableVF = ElementCount::getScalable(
3354 std::numeric_limits<ElementCount::ScalarTy>::max());
3355
3356 // Test that the loop-vectorizer can legalize all operations for this MaxVF.
3357 // FIXME: While for scalable vectors this is currently sufficient, this should
3358 // be replaced by a more detailed mechanism that filters out specific VFs,
3359 // instead of invalidating vectorization for a whole set of VFs based on the
3360 // MaxVF.
3361
3362 // Disable scalable vectorization if the loop contains unsupported reductions.
3363 if (!canVectorizeReductions(MaxScalableVF)) {
3365 "Scalable vectorization not supported for the reduction "
3366 "operations found in this loop.",
3367 "ScalableVFUnfeasible", ORE, TheLoop);
3368 return false;
3369 }
3370
3371 // Disable scalable vectorization if the loop contains any instructions
3372 // with element types not supported for scalable vectors.
3373 if (any_of(ElementTypesInLoop, [&](Type *Ty) {
3374 return !Ty->isVoidTy() &&
3375 !TTI.isElementTypeLegalForScalableVector(Ty);
3376 })) {
3377 reportVectorizationInfo("Scalable vectorization is not supported "
3378 "for all element types found in this loop.",
3379 "ScalableVFUnfeasible", ORE, TheLoop);
3380 return false;
3381 }
3382
3383 if (!Legal->isSafeForAnyVectorWidth() && !getMaxVScale(*TheFunction, TTI)) {
3384 reportVectorizationInfo("The target does not provide maximum vscale value "
3385 "for safe distance analysis.",
3386 "ScalableVFUnfeasible", ORE, TheLoop);
3387 return false;
3388 }
3389
3390 IsScalableVectorizationAllowed = true;
3391 return true;
3392}
3393
3394ElementCount
3395LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
3396 if (!isScalableVectorizationAllowed())
3397 return ElementCount::getScalable(0);
3398
3399 auto MaxScalableVF = ElementCount::getScalable(
3400 std::numeric_limits<ElementCount::ScalarTy>::max());
3401 if (Legal->isSafeForAnyVectorWidth())
3402 return MaxScalableVF;
3403
3404 std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI);
3405 // Limit MaxScalableVF by the maximum safe dependence distance.
3406 MaxScalableVF = ElementCount::getScalable(MaxSafeElements / *MaxVScale);
3407
3408 if (!MaxScalableVF)
3410 "Max legal vector width too small, scalable vectorization "
3411 "unfeasible.",
3412 "ScalableVFUnfeasible", ORE, TheLoop);
3413
3414 return MaxScalableVF;
3415}
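// Illustrative example: with MaxSafeElements = 32 and a target-reported
// maximum vscale of 16, the clamp above yields a max legal VF of vscale x 2,
// since vscale x 2 can never exceed 32 elements at runtime.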
3416
3417FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
3418 unsigned MaxTripCount, ElementCount UserVF, bool FoldTailByMasking) {
3419 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
3420 unsigned SmallestType, WidestType;
3421 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
3422
3423 // Get the maximum safe dependence distance in bits computed by LAA.
3424 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
3425 // the memory accesses that is most restrictive (involved in the smallest
3426 // dependence distance).
3427 unsigned MaxSafeElementsPowerOf2 =
3428 bit_floor(Legal->getMaxSafeVectorWidthInBits() / WidestType);
3429 if (!Legal->isSafeForAnyStoreLoadForwardDistances()) {
3430 unsigned SLDist = Legal->getMaxStoreLoadForwardSafeDistanceInBits();
3431 MaxSafeElementsPowerOf2 =
3432 std::min(MaxSafeElementsPowerOf2, SLDist / WidestType);
3433 }
3434 auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElementsPowerOf2);
3435 auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElementsPowerOf2);
3436
3437 if (!Legal->isSafeForAnyVectorWidth())
3438 this->MaxSafeElements = MaxSafeElementsPowerOf2;
3439
3440 LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
3441 << ".\n");
3442 LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF
3443 << ".\n");
3444
3445 // First analyze the UserVF, fall back if the UserVF should be ignored.
3446 if (UserVF) {
3447 auto MaxSafeUserVF =
3448 UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;
3449
3450 if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) {
3451 // If `VF=vscale x N` is safe, then so is `VF=N`
3452 if (UserVF.isScalable())
3453 return FixedScalableVFPair(
3454 ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF);
3455
3456 return UserVF;
3457 }
3458
3459 assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));
3460
3461 // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it
3462 // is better to ignore the hint and let the compiler choose a suitable VF.
3463 if (!UserVF.isScalable()) {
3464 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
3465 << " is unsafe, clamping to max safe VF="
3466 << MaxSafeFixedVF << ".\n");
3467 ORE->emit([&]() {
3468 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
3469 TheLoop->getStartLoc(),
3470 TheLoop->getHeader())
3471 << "User-specified vectorization factor "
3472 << ore::NV("UserVectorizationFactor", UserVF)
3473 << " is unsafe, clamping to maximum safe vectorization factor "
3474 << ore::NV("VectorizationFactor", MaxSafeFixedVF);
3475 });
3476 return MaxSafeFixedVF;
3477 }
3478
3479 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) {
3480 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
3481 << " is ignored because scalable vectors are not "
3482 "available.\n");
3483 ORE->emit([&]() {
3484 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
3485 TheLoop->getStartLoc(),
3486 TheLoop->getHeader())
3487 << "User-specified vectorization factor "
3488 << ore::NV("UserVectorizationFactor", UserVF)
3489 << " is ignored because the target does not support scalable "
3490 "vectors. The compiler will pick a more suitable value.";
3491 });
3492 } else {
3493 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
3494 << " is unsafe. Ignoring scalable UserVF.\n");
3495 ORE->emit([&]() {
3496 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
3497 TheLoop->getStartLoc(),
3498 TheLoop->getHeader())
3499 << "User-specified vectorization factor "
3500 << ore::NV("UserVectorizationFactor", UserVF)
3501 << " is unsafe. Ignoring the hint to let the compiler pick a "
3502 "more suitable value.";
3503 });
3504 }
3505 }
3506
3507 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
3508 << " / " << WidestType << " bits.\n");
3509
3510 FixedScalableVFPair Result(ElementCount::getFixed(1),
3511 ElementCount::getScalable(1));
3512 if (auto MaxVF =
3513 getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
3514 MaxSafeFixedVF, FoldTailByMasking))
3515 Result.FixedVF = MaxVF;
3516
3517 if (auto MaxVF =
3518 getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
3519 MaxSafeScalableVF, FoldTailByMasking))
3520 if (MaxVF.isScalable()) {
3521 Result.ScalableVF = MaxVF;
3522 LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
3523 << "\n");
3524 }
3525
3526 return Result;
3527}
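// Illustrative example: if LAA reports a 384-bit maximum safe vector width
// and the widest access type is i32, then 384 / 32 = 12 elements are safe
// and bit_floor(12) = 8, so the fixed clamp above becomes VF 8 (the scalable
// clamp is derived from the same element count via getMaxLegalScalableVF).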
3528
3529FixedScalableVFPair
3530 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
3531 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
3532 // TODO: It may be useful to do this, since the check is still likely to be
3533 // dynamically uniform if the target can skip it.
3534 reportVectorizationFailure(
3535 "Not inserting runtime ptr check for divergent target",
3536 "runtime pointer checks needed. Not enabled for divergent target",
3537 "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
3538 return FixedScalableVFPair::getNone();
3539 }
3540
3541 ScalarEvolution *SE = PSE.getSE();
3542 ElementCount TC = getSmallConstantTripCount(SE, TheLoop);
3543 unsigned MaxTC = PSE.getSmallConstantMaxTripCount();
3544 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
3545 if (TC != ElementCount::getFixed(MaxTC))
3546 LLVM_DEBUG(dbgs() << "LV: Found maximum trip count: " << MaxTC << '\n');
3547 if (TC.isScalar()) {
3548 reportVectorizationFailure("Single iteration (non) loop",
3549 "loop trip count is one, irrelevant for vectorization",
3550 "SingleIterationLoop", ORE, TheLoop);
3551 return FixedScalableVFPair::getNone();
3552 }
3553
3554 // If BTC matches the widest induction type and is -1 then the trip count
3555 // computation will wrap to 0 and the vector trip count will be 0. Do not try
3556 // to vectorize.
3557 const SCEV *BTC = SE->getBackedgeTakenCount(TheLoop);
3558 if (!isa<SCEVCouldNotCompute>(BTC) &&
3559 BTC->getType()->getScalarSizeInBits() >=
3560 Legal->getWidestInductionType()->getScalarSizeInBits() &&
3561 SE->isKnownPredicate(CmpInst::ICMP_EQ, BTC,
3562 SE->getMinusOne(BTC->getType()))) {
3563 reportVectorizationFailure(
3564 "Trip count computation wrapped",
3565 "backedge-taken count is -1, loop trip count wrapped to 0",
3566 "TripCountWrapped", ORE, TheLoop);
3567 return FixedScalableVFPair::getNone();
3568 }
3569
3570 switch (ScalarEpilogueStatus) {
3571 case CM_ScalarEpilogueAllowed:
3572 return computeFeasibleMaxVF(MaxTC, UserVF, false);
3573 case CM_ScalarEpilogueNotAllowedUsePredicate:
3574 [[fallthrough]];
3575 case CM_ScalarEpilogueNotNeededUsePredicate:
3576 LLVM_DEBUG(
3577 dbgs() << "LV: vector predicate hint/switch found.\n"
3578 << "LV: Not allowing scalar epilogue, creating predicated "
3579 << "vector loop.\n");
3580 break;
3581 case CM_ScalarEpilogueNotAllowedLowTripLoop:
3582 // fallthrough as a special case of OptForSize
3583 case CM_ScalarEpilogueNotAllowedOptSize:
3584 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
3585 LLVM_DEBUG(
3586 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
3587 else
3588 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
3589 << "count.\n");
3590
3591 // Bail if runtime checks are required, which are not good when optimising
3592 // for size.
3593 if (runtimeChecksRequired())
3594 return FixedScalableVFPair::getNone();
3595
3596 break;
3597 }
3598
3599 // Now try the tail folding
3600
3601 // Invalidate interleave groups that require an epilogue if we can't mask
3602 // the interleave-group.
3603 if (!useMaskedInterleavedAccesses(TTI)) {
3604 assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
3605 "No decisions should have been taken at this point");
3606 // Note: There is no need to invalidate any cost modeling decisions here, as
3607 // none were taken so far.
3608 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
3609 }
3610
3611 FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(MaxTC, UserVF, true);
3612
3613 // Avoid tail folding if the trip count is known to be a multiple of any VF
3614 // we choose.
3615 std::optional<unsigned> MaxPowerOf2RuntimeVF =
3616 MaxFactors.FixedVF.getFixedValue();
3617 if (MaxFactors.ScalableVF) {
3618 std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI);
3619 if (MaxVScale && TTI.isVScaleKnownToBeAPowerOfTwo()) {
3620 MaxPowerOf2RuntimeVF = std::max<unsigned>(
3621 *MaxPowerOf2RuntimeVF,
3622 *MaxVScale * MaxFactors.ScalableVF.getKnownMinValue());
3623 } else
3624 MaxPowerOf2RuntimeVF = std::nullopt; // Stick with tail-folding for now.
3625 }
3626
3627 auto NoScalarEpilogueNeeded = [this, &UserIC](unsigned MaxVF) {
3628 // Return false if the loop is neither a single-latch-exit loop nor an
3629 // early-exit loop as tail-folding is not supported in that case.
3630 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch() &&
3631 !Legal->hasUncountableEarlyExit())
3632 return false;
3633 unsigned MaxVFtimesIC = UserIC ? MaxVF * UserIC : MaxVF;
3634 ScalarEvolution *SE = PSE.getSE();
3635 // Calling getSymbolicMaxBackedgeTakenCount enables support for loops
3636 // with uncountable exits. For countable loops, the symbolic maximum must
3637 // remain identical to the known back-edge taken count.
3638 const SCEV *BackedgeTakenCount = PSE.getSymbolicMaxBackedgeTakenCount();
3639 assert((Legal->hasUncountableEarlyExit() ||
3640 BackedgeTakenCount == PSE.getBackedgeTakenCount()) &&
3641 "Invalid loop count");
3642 const SCEV *ExitCount = SE->getAddExpr(
3643 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
3644 const SCEV *Rem = SE->getURemExpr(
3645 SE->applyLoopGuards(ExitCount, TheLoop),
3646 SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
3647 return Rem->isZero();
3648 };
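// Illustrative example: with a guard-refined exit count of 64, UserIC = 0
// and MaxVF = 8, Rem = 64 urem 8 = 0, so no scalar tail remains and tail
// folding is not needed for that VF.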
3649
3650 if (MaxPowerOf2RuntimeVF > 0u) {
3651 assert((UserVF.isNonZero() || isPowerOf2_32(*MaxPowerOf2RuntimeVF)) &&
3652 "MaxFixedVF must be a power of 2");
3653 if (NoScalarEpilogueNeeded(*MaxPowerOf2RuntimeVF)) {
3654 // Accept MaxFixedVF if we do not have a tail.
3655 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
3656 return MaxFactors;
3657 }
3658 }
3659
3660 auto ExpectedTC = getSmallBestKnownTC(PSE, TheLoop);
3661 if (ExpectedTC && ExpectedTC->isFixed() &&
3662 ExpectedTC->getFixedValue() <=
3663 TTI.getMinTripCountTailFoldingThreshold()) {
3664 if (MaxPowerOf2RuntimeVF > 0u) {
3665 // If we have a low-trip-count, and the fixed-width VF is known to divide
3666 // the trip count but the scalable factor does not, use the fixed-width
3667 // factor in preference to allow the generation of a non-predicated loop.
3668 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedLowTripLoop &&
3669 NoScalarEpilogueNeeded(MaxFactors.FixedVF.getFixedValue())) {
3670 LLVM_DEBUG(dbgs() << "LV: Picking a fixed-width so that no tail will "
3671 "remain for any chosen VF.\n");
3672 MaxFactors.ScalableVF = ElementCount::getScalable(0);
3673 return MaxFactors;
3674 }
3675 }
3676
3678 "The trip count is below the minial threshold value.",
3679 "loop trip count is too low, avoiding vectorization", "LowTripCount",
3680 ORE, TheLoop);
3682 }
3683
3684 // If we don't know the precise trip count, or if the trip count that we
3685 // found modulo the vectorization factor is not zero, try to fold the tail
3686 // by masking.
3687 // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
3688 bool ContainsScalableVF = MaxFactors.ScalableVF.isNonZero();
3689 setTailFoldingStyles(ContainsScalableVF, UserIC);
3690 if (foldTailByMasking()) {
3691 if (foldTailWithEVL()) {
3692 LLVM_DEBUG(
3693 dbgs()
3694 << "LV: tail is folded with EVL, forcing unroll factor to be 1. Will "
3695 "try to generate VP Intrinsics with scalable vector "
3696 "factors only.\n");
3697 // Tail folded loop using VP intrinsics restricts the VF to be scalable
3698 // for now.
3699 // TODO: extend it for fixed vectors, if required.
3700 assert(ContainsScalableVF && "Expected scalable vector factor.");
3701
3702 MaxFactors.FixedVF = ElementCount::getFixed(1);
3703 }
3704 return MaxFactors;
3705 }
3706
3707 // If there was a tail-folding hint/switch, but we can't fold the tail by
3708 // masking, fallback to a vectorization with a scalar epilogue.
3709 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
3710 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
3711 "scalar epilogue instead.\n");
3712 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
3713 return MaxFactors;
3714 }
3715
3716 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
3717 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
3719 }
3720
3721 if (TC.isZero()) {
3723 "unable to calculate the loop count due to complex control flow",
3724 "UnknownLoopCountComplexCFG", ORE, TheLoop);
3726 }
3727
3729 "Cannot optimize for size and vectorize at the same time.",
3730 "cannot optimize for size and vectorize at the same time. "
3731 "Enable vectorization of this loop with '#pragma clang loop "
3732 "vectorize(enable)' when compiling with -Os/-Oz",
3733 "NoTailLoopWithOptForSize", ORE, TheLoop);
3735}
3736
3737 bool LoopVectorizationCostModel::shouldConsiderRegPressureForVF(
3738 ElementCount VF) {
3739 if (ConsiderRegPressure.getNumOccurrences())
3740 return ConsiderRegPressure;
3741
3742 // TODO: We should eventually consider register pressure for all targets. The
3743 // TTI hook is temporary whilst target-specific issues are being fixed.
3744 if (TTI.shouldConsiderVectorizationRegPressure())
3745 return true;
3746
3747 if (!useMaxBandwidth(VF.isScalable()
3748 ? TargetTransformInfo::RGK_ScalableVector
3749 : TargetTransformInfo::RGK_FixedWidthVector))
3750 return false;
3751 // Only calculate register pressure for VFs enabled by MaxBandwidth.
3752 return ElementCount::isKnownLE(
3753 VF, VF.isScalable() ? MaxPermissibleVFWithoutMaxBW.ScalableVF
3754 : MaxPermissibleVFWithoutMaxBW.FixedVF);
3755 }
3756
3757 bool LoopVectorizationCostModel::useMaxBandwidth(
3758 TargetTransformInfo::RegisterKind RegKind) {
3759 return MaximizeBandwidth || (MaximizeBandwidth.getNumOccurrences() == 0 &&
3760 (TTI.shouldMaximizeVectorBandwidth(RegKind) ||
3761 (UseWiderVFIfCallVariantsPresent &&
3762 Legal->hasVectorCallVariants())));
3763 }
3764
3765ElementCount LoopVectorizationCostModel::clampVFByMaxTripCount(
3766 ElementCount VF, unsigned MaxTripCount, bool FoldTailByMasking) const {
3767 unsigned EstimatedVF = VF.getKnownMinValue();
3768 if (VF.isScalable() && TheFunction->hasFnAttribute(Attribute::VScaleRange)) {
3769 auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange);
3770 auto Min = Attr.getVScaleRangeMin();
3771 EstimatedVF *= Min;
3772 }
3773
3774 // When a scalar epilogue is required, at least one iteration of the scalar
3775 // loop has to execute. Adjust MaxTripCount accordingly to avoid picking a
3776 // max VF that results in a dead vector loop.
3777 if (MaxTripCount > 0 && requiresScalarEpilogue(true))
3778 MaxTripCount -= 1;
3779
3780 if (MaxTripCount && MaxTripCount <= EstimatedVF &&
3781 (!FoldTailByMasking || isPowerOf2_32(MaxTripCount))) {
3782 // If upper bound loop trip count (TC) is known at compile time there is no
3783 // point in choosing VF greater than TC (as done in the loop below). Select
3784 // maximum power of two which doesn't exceed TC. If VF is
3785 // scalable, we only fall back on a fixed VF when the TC is less than or
3786 // equal to the known number of lanes.
3787 auto ClampedUpperTripCount = llvm::bit_floor(MaxTripCount);
3788 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
3789 "exceeding the constant trip count: "
3790 << ClampedUpperTripCount << "\n");
3791 return ElementCount::get(ClampedUpperTripCount,
3792 FoldTailByMasking ? VF.isScalable() : false);
3793 }
3794 return VF;
3795}
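// Illustrative example: with MaxTripCount = 7 and a required scalar
// epilogue, at most 6 iterations are left for the vector loop, so an
// estimated VF of 8 would leave the vector loop dead; the clamp above would
// return bit_floor(6) = 4 instead.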
3796
3797ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
3798 unsigned MaxTripCount, unsigned SmallestType, unsigned WidestType,
3799 ElementCount MaxSafeVF, bool FoldTailByMasking) {
3800 bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
3801 const TypeSize WidestRegister = TTI.getRegisterBitWidth(
3802 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
3803 : TargetTransformInfo::RGK_FixedWidthVector);
3804
3805 // Convenience function to return the minimum of two ElementCounts.
3806 auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
3807 assert((LHS.isScalable() == RHS.isScalable()) &&
3808 "Scalable flags must match");
3809 return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS;
3810 };
3811
3812 // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
3813 // Note that both WidestRegister and WidestType may not be powers of 2.
3814 auto MaxVectorElementCount = ElementCount::get(
3815 llvm::bit_floor(WidestRegister.getKnownMinValue() / WidestType),
3816 ComputeScalableMaxVF);
3817 MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
3818 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
3819 << (MaxVectorElementCount * WidestType) << " bits.\n");
3820
3821 if (!MaxVectorElementCount) {
3822 LLVM_DEBUG(dbgs() << "LV: The target has no "
3823 << (ComputeScalableMaxVF ? "scalable" : "fixed")
3824 << " vector registers.\n");
3825 return ElementCount::getFixed(1);
3826 }
3827
3828 ElementCount MaxVF = clampVFByMaxTripCount(MaxVectorElementCount,
3829 MaxTripCount, FoldTailByMasking);
3830 // If the MaxVF was already clamped, there's no point in trying to pick a
3831 // larger one.
3832 if (MaxVF != MaxVectorElementCount)
3833 return MaxVF;
3834
3835 TargetTransformInfo::RegisterKind RegKind =
3836 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
3837 : TargetTransformInfo::RGK_FixedWidthVector;
3838
3839 if (MaxVF.isScalable())
3840 MaxPermissibleVFWithoutMaxBW.ScalableVF = MaxVF;
3841 else
3842 MaxPermissibleVFWithoutMaxBW.FixedVF = MaxVF;
3843
3844 if (useMaxBandwidth(RegKind)) {
3845 auto MaxVectorElementCountMaxBW = ElementCount::get(
3846 llvm::bit_floor(WidestRegister.getKnownMinValue() / SmallestType),
3847 ComputeScalableMaxVF);
3848 MaxVF = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);
3849
3850 if (ElementCount MinVF =
3851 TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) {
3852 if (ElementCount::isKnownLT(MaxVF, MinVF)) {
3853 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
3854 << ") with target's minimum: " << MinVF << '\n');
3855 MaxVF = MinVF;
3856 }
3857 }
3858
3859 MaxVF = clampVFByMaxTripCount(MaxVF, MaxTripCount, FoldTailByMasking);
3860
3861 if (MaxVectorElementCount != MaxVF) {
3862 // Invalidate any widening decisions we might have made, in case the loop
3863 // requires predication (decided later), but we have already made some
3864 // load/store widening decisions.
3865 invalidateCostModelingDecisions();
3866 }
3867 }
3868 return MaxVF;
3869}
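// Illustrative example: with 128-bit vector registers, widest type i32 and
// smallest type i8, the initial cap is 128 / 32 = 4 lanes; if the target
// opts into maximizing bandwidth, the cap is recomputed as 128 / 8 = 16
// lanes and the wider i32 operations are later legalized into several parts.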
3870
3871bool LoopVectorizationPlanner::isMoreProfitable(const VectorizationFactor &A,
3872 const VectorizationFactor &B,
3873 const unsigned MaxTripCount,
3874 bool HasTail,
3875 bool IsEpilogue) const {
3876 InstructionCost CostA = A.Cost;
3877 InstructionCost CostB = B.Cost;
3878
3879 // Improve estimate for the vector width if it is scalable.
3880 unsigned EstimatedWidthA = A.Width.getKnownMinValue();
3881 unsigned EstimatedWidthB = B.Width.getKnownMinValue();
3882 if (std::optional<unsigned> VScale = CM.getVScaleForTuning()) {
3883 if (A.Width.isScalable())
3884 EstimatedWidthA *= *VScale;
3885 if (B.Width.isScalable())
3886 EstimatedWidthB *= *VScale;
3887 }
3888
3889 // When optimizing for size choose whichever is smallest, which will be the
3890 // one with the smallest cost for the whole loop. On a tie pick the larger
3891 // vector width, on the assumption that throughput will be greater.
3892 if (CM.CostKind == TTI::TCK_CodeSize)
3893 return CostA < CostB ||
3894 (CostA == CostB && EstimatedWidthA > EstimatedWidthB);
3895
3896 // Assume vscale may be larger than 1 (or the value being tuned for),
3897 // so that scalable vectorization is slightly favorable over fixed-width
3898 // vectorization.
3899 bool PreferScalable = !TTI.preferFixedOverScalableIfEqualCost(IsEpilogue) &&
3900 A.Width.isScalable() && !B.Width.isScalable();
3901
3902 auto CmpFn = [PreferScalable](const InstructionCost &LHS,
3903 const InstructionCost &RHS) {
3904 return PreferScalable ? LHS <= RHS : LHS < RHS;
3905 };
3906
3907 // To avoid the need for FP division:
3908 // (CostA / EstimatedWidthA) < (CostB / EstimatedWidthB)
3909 // <=> (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA)
3910 if (!MaxTripCount)
3911 return CmpFn(CostA * EstimatedWidthB, CostB * EstimatedWidthA);
3912
3913 auto GetCostForTC = [MaxTripCount, HasTail](unsigned VF,
3914 InstructionCost VectorCost,
3915 InstructionCost ScalarCost) {
3916 // If the trip count is a known (possibly small) constant, the trip count
3917 // will be rounded up to an integer number of iterations under
3918 // FoldTailByMasking. The total cost in that case will be
3919 // VecCost*ceil(TripCount/VF). When not folding the tail, the total
3920 // cost will be VecCost*floor(TC/VF) + ScalarCost*(TC%VF). There will be
3921 // some extra overheads, but for the purpose of comparing the costs of
3922 // different VFs we can use this to compare the total loop-body cost
3923 // expected after vectorization.
3924 if (HasTail)
3925 return VectorCost * (MaxTripCount / VF) +
3926 ScalarCost * (MaxTripCount % VF);
3927 return VectorCost * divideCeil(MaxTripCount, VF);
3928 };
3929
3930 auto RTCostA = GetCostForTC(EstimatedWidthA, CostA, A.ScalarCost);
3931 auto RTCostB = GetCostForTC(EstimatedWidthB, CostB, B.ScalarCost);
3932 return CmpFn(RTCostA, RTCostB);
3933}
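// Illustrative example: comparing A = {VF 4, Cost 20} with B = {VF 8,
// Cost 36} and no known trip count uses 20 * 8 = 160 vs. 36 * 4 = 144, so B
// is deemed more profitable; with MaxTripCount = 10 and a scalar tail the
// totals become 20 * (10 / 4) + ScalarCost * (10 % 4) for A and
// 36 * (10 / 8) + ScalarCost * (10 % 8) for B instead.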
3934
3935bool LoopVectorizationPlanner::isMoreProfitable(const VectorizationFactor &A,
3936 const VectorizationFactor &B,
3937 bool HasTail,
3938 bool IsEpilogue) const {
3939 const unsigned MaxTripCount = PSE.getSmallConstantMaxTripCount();
3940 return LoopVectorizationPlanner::isMoreProfitable(A, B, MaxTripCount, HasTail,
3941 IsEpilogue);
3942}
3943
3944 void LoopVectorizationPlanner::emitInvalidCostRemarks(
3945 OptimizationRemarkEmitter *ORE) {
3946 using RecipeVFPair = std::pair<VPRecipeBase *, ElementCount>;
3947 SmallVector<RecipeVFPair> InvalidCosts;
3948 for (const auto &Plan : VPlans) {
3949 for (ElementCount VF : Plan->vectorFactors()) {
3950 // The VPlan-based cost model is designed for computing vector costs.
3951 // Querying the VPlan-based cost model with a scalar VF will cause some
3952 // errors, because we expect the VF to be a vector for most of the widen
3953 // recipes.
3954 if (VF.isScalar())
3955 continue;
3956
3957 VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind,
3958 *CM.PSE.getSE(), OrigLoop);
3959 precomputeCosts(*Plan, VF, CostCtx);
3960 auto Iter = vp_depth_first_deep(Plan->getVectorLoopRegion()->getEntry());
3961 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
3962 for (auto &R : *VPBB) {
3963 if (!R.cost(VF, CostCtx).isValid())
3964 InvalidCosts.emplace_back(&R, VF);
3965 }
3966 }
3967 }
3968 }
3969 if (InvalidCosts.empty())
3970 return;
3971
3972 // Emit a report of VFs with invalid costs in the loop.
3973
3974 // Group the remarks per recipe, keeping the recipe order from InvalidCosts.
3975 DenseMap<VPRecipeBase *, unsigned> Numbering;
3976 unsigned I = 0;
3977 for (auto &Pair : InvalidCosts)
3978 if (Numbering.try_emplace(Pair.first, I).second)
3979 ++I;
3980
3981 // Sort the list, first on recipe(number) then on VF.
3982 sort(InvalidCosts, [&Numbering](RecipeVFPair &A, RecipeVFPair &B) {
3983 unsigned NA = Numbering[A.first];
3984 unsigned NB = Numbering[B.first];
3985 if (NA != NB)
3986 return NA < NB;
3987 return ElementCount::isKnownLT(A.second, B.second);
3988 });
3989
3990 // For a list of ordered recipe-VF pairs:
3991 // [(load, VF1), (load, VF2), (store, VF1)]
3992 // group the recipes together to emit separate remarks for:
3993 // load (VF1, VF2)
3994 // store (VF1)
3995 auto Tail = ArrayRef<RecipeVFPair>(InvalidCosts);
3996 auto Subset = ArrayRef<RecipeVFPair>();
3997 do {
3998 if (Subset.empty())
3999 Subset = Tail.take_front(1);
4000
4001 VPRecipeBase *R = Subset.front().first;
4002
4003 unsigned Opcode =
4004 TypeSwitch<const VPRecipeBase *, unsigned>(R)
4005 .Case<VPHeaderPHIRecipe>(
4006 [](const auto *R) { return Instruction::PHI; })
4007 .Case<VPWidenSelectRecipe>(
4008 [](const auto *R) { return Instruction::Select; })
4009 .Case<VPWidenStoreRecipe>(
4010 [](const auto *R) { return Instruction::Store; })
4011 .Case<VPWidenLoadRecipe>(
4012 [](const auto *R) { return Instruction::Load; })
4013 .Case<VPWidenCallRecipe, VPWidenIntrinsicRecipe>(
4014 [](const auto *R) { return Instruction::Call; })
4015 .Case<VPInstruction, VPWidenRecipe, VPReplicateRecipe,
4016 VPWidenCastRecipe>(
4017 [](const auto *R) { return R->getOpcode(); })
4018 .Case<VPInterleaveRecipe>([](const VPInterleaveRecipe *R) {
4019 return R->getStoredValues().empty() ? Instruction::Load
4020 : Instruction::Store;
4021 })
4022 .Case<VPReductionRecipe>([](const auto *R) {
4023 return RecurrenceDescriptor::getOpcode(R->getRecurrenceKind());
4024 });
4025
4026 // If the next recipe is different, or if there are no other pairs,
4027 // emit a remark for the collated subset. e.g.
4028 // [(load, VF1), (load, VF2))]
4029 // to emit:
4030 // remark: invalid costs for 'load' at VF=(VF1, VF2)
4031 if (Subset == Tail || Tail[Subset.size()].first != R) {
4032 std::string OutString;
4033 raw_string_ostream OS(OutString);
4034 assert(!Subset.empty() && "Unexpected empty range");
4035 OS << "Recipe with invalid costs prevented vectorization at VF=(";
4036 for (const auto &Pair : Subset)
4037 OS << (Pair.second == Subset.front().second ? "" : ", ") << Pair.second;
4038 OS << "):";
4039 if (Opcode == Instruction::Call) {
4040 StringRef Name = "";
4041 if (auto *Int = dyn_cast<VPWidenIntrinsicRecipe>(R)) {
4042 Name = Int->getIntrinsicName();
4043 } else {
4044 auto *WidenCall = dyn_cast<VPWidenCallRecipe>(R);
4045 Function *CalledFn =
4046 WidenCall ? WidenCall->getCalledScalarFunction()
4047 : cast<Function>(R->getOperand(R->getNumOperands() - 1)
4048 ->getLiveInIRValue());
4049 Name = CalledFn->getName();
4050 }
4051 OS << " call to " << Name;
4052 } else
4053 OS << " " << Instruction::getOpcodeName(Opcode);
4054 reportVectorizationInfo(OutString, "InvalidCost", ORE, OrigLoop, nullptr,
4055 R->getDebugLoc());
4056 Tail = Tail.drop_front(Subset.size());
4057 Subset = {};
4058 } else
4059 // Grow the subset by one element
4060 Subset = Tail.take_front(Subset.size() + 1);
4061 } while (!Tail.empty());
4062}
4063
4064/// Check if any recipe of \p Plan will generate a vector value, which will be
4065/// assigned a vector register.
4066 static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
4067 const TargetTransformInfo &TTI) {
4068 assert(VF.isVector() && "Checking a scalar VF?");
4069 VPTypeAnalysis TypeInfo(Plan);
4070 DenseSet<VPRecipeBase *> EphemeralRecipes;
4071 collectEphemeralRecipesForVPlan(Plan, EphemeralRecipes);
4072 // Set of already visited types.
4073 DenseSet<Type *> Visited;
4074 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
4075 vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) {
4076 for (VPRecipeBase &R : *VPBB) {
4077 if (EphemeralRecipes.contains(&R))
4078 continue;
4079 // Continue early if the recipe is considered to not produce a vector
4080 // result. Note that this includes VPInstruction where some opcodes may
4081 // produce a vector, to preserve existing behavior as VPInstructions model
4082 // aspects not directly mapped to existing IR instructions.
4083 switch (R.getVPDefID()) {
4084 case VPDef::VPDerivedIVSC:
4085 case VPDef::VPScalarIVStepsSC:
4086 case VPDef::VPReplicateSC:
4087 case VPDef::VPInstructionSC:
4088 case VPDef::VPCanonicalIVPHISC:
4089 case VPDef::VPVectorPointerSC:
4090 case VPDef::VPVectorEndPointerSC:
4091 case VPDef::VPExpandSCEVSC:
4092 case VPDef::VPEVLBasedIVPHISC:
4093 case VPDef::VPPredInstPHISC:
4094 case VPDef::VPBranchOnMaskSC:
4095 continue;
4096 case VPDef::VPReductionSC:
4097 case VPDef::VPActiveLaneMaskPHISC:
4098 case VPDef::VPWidenCallSC:
4099 case VPDef::VPWidenCanonicalIVSC:
4100 case VPDef::VPWidenCastSC:
4101 case VPDef::VPWidenGEPSC:
4102 case VPDef::VPWidenIntrinsicSC:
4103 case VPDef::VPWidenSC:
4104 case VPDef::VPWidenSelectSC:
4105 case VPDef::VPBlendSC:
4106 case VPDef::VPFirstOrderRecurrencePHISC:
4107 case VPDef::VPHistogramSC:
4108 case VPDef::VPWidenPHISC:
4109 case VPDef::VPWidenIntOrFpInductionSC:
4110 case VPDef::VPWidenPointerInductionSC:
4111 case VPDef::VPReductionPHISC:
4112 case VPDef::VPInterleaveEVLSC:
4113 case VPDef::VPInterleaveSC:
4114 case VPDef::VPWidenLoadEVLSC:
4115 case VPDef::VPWidenLoadSC:
4116 case VPDef::VPWidenStoreEVLSC:
4117 case VPDef::VPWidenStoreSC:
4118 break;
4119 default:
4120 llvm_unreachable("unhandled recipe");
4121 }
4122
4123 auto WillGenerateTargetVectors = [&TTI, VF](Type *VectorTy) {
4124 unsigned NumLegalParts = TTI.getNumberOfParts(VectorTy);
4125 if (!NumLegalParts)
4126 return false;
4127 if (VF.isScalable()) {
4128 // <vscale x 1 x iN> is assumed to be profitable over iN because
4129 // scalable registers are a distinct register class from scalar
4130 // ones. If we ever find a target which wants to lower scalable
4131 // vectors back to scalars, we'll need to update this code to
4132 // explicitly ask TTI about the register class uses for each part.
4133 return NumLegalParts <= VF.getKnownMinValue();
4134 }
4135 // Two or more elements sharing a single register are considered vectorized.
4136 return NumLegalParts < VF.getFixedValue();
4137 };
4138
4139 // If the recipe has no defs and is not a store (e.g., a branch), there is no value to check; continue.
4140 if (R.getNumDefinedValues() == 0 &&
4141 !isa<VPWidenStoreRecipe, VPWidenStoreEVLRecipe, VPInterleaveRecipe>(&R))
4142 continue;
4143 // For multi-def recipes (currently only interleaved loads), it suffices
4144 // to check the first def only.
4145 // For stores, check the stored value; for interleaved stores it suffices
4146 // to check the first stored value only. In all cases this is the second
4147 // operand.
4148 VPValue *ToCheck =
4149 R.getNumDefinedValues() >= 1 ? R.getVPValue(0) : R.getOperand(1);
4150 Type *ScalarTy = TypeInfo.inferScalarType(ToCheck);
4151 if (!Visited.insert({ScalarTy}).second)
4152 continue;
4153 Type *WideTy = toVectorizedTy(ScalarTy, VF);
4154 if (any_of(getContainedTypes(WideTy), WillGenerateTargetVectors))
4155 return true;
4156 }
4157 }
4158
4159 return false;
4160}
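// Illustrative example: at VF = 4 on a target with 128-bit vectors, a
// <4 x i8> result fits in a single legal part (1 < 4), so the recipe counts
// as producing a real vector; if every recipe needed at least VF parts, the
// plan would be treated as generating no vector instructions at all.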
4161
4162static bool hasReplicatorRegion(VPlan &Plan) {
4163 return any_of(VPBlockUtils::blocksOnly<VPRegionBlock>(vp_depth_first_shallow(
4164 Plan.getVectorLoopRegion()->getEntry())),
4165 [](auto *VPRB) { return VPRB->isReplicator(); });
4166}
4167
4168#ifndef NDEBUG
4169VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
4170 InstructionCost ExpectedCost = CM.expectedCost(ElementCount::getFixed(1));
4171 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
4172 assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
4173 assert(
4174 any_of(VPlans,
4175 [](std::unique_ptr<VPlan> &P) { return P->hasScalarVFOnly(); }) &&
4176 "Expected Scalar VF to be a candidate");
4177
4178 const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost,
4179 ExpectedCost);
4180 VectorizationFactor ChosenFactor = ScalarCost;
4181
4182 bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
4183 if (ForceVectorization &&
4184 (VPlans.size() > 1 || !VPlans[0]->hasScalarVFOnly())) {
4185 // Ignore scalar width, because the user explicitly wants vectorization.
4186 // Initialize cost to max so that VF = 2 is, at least, chosen during cost
4187 // evaluation.
4188 ChosenFactor.Cost = InstructionCost::getMax();
4189 }
4190
4191 for (auto &P : VPlans) {
4192 ArrayRef<ElementCount> VFs(P->vectorFactors().begin(),
4193 P->vectorFactors().end());
4194
4195 SmallVector<VPRegisterUsage, 8> RUs;
4196 if (any_of(VFs, [this](ElementCount VF) {
4197 return CM.shouldConsiderRegPressureForVF(VF);
4198 }))
4199 RUs = calculateRegisterUsageForPlan(*P, VFs, TTI, CM.ValuesToIgnore);
4200
4201 for (unsigned I = 0; I < VFs.size(); I++) {
4202 ElementCount VF = VFs[I];
4203 // The cost for scalar VF=1 is already calculated, so ignore it.
4204 if (VF.isScalar())
4205 continue;
4206
4207 /// If the register pressure needs to be considered for VF,
4208 /// don't consider the VF as valid if it exceeds the number
4209 /// of registers for the target.
4210 if (CM.shouldConsiderRegPressureForVF(VF) &&
4211 RUs[I].exceedsMaxNumRegs(TTI, ForceTargetNumVectorRegs))
4212 continue;
4213
4214 InstructionCost C = CM.expectedCost(VF);
4215
4216 // Add on other costs that are modelled in VPlan, but not in the legacy
4217 // cost model.
4218 VPCostContext CostCtx(CM.TTI, *CM.TLI, *P, CM, CM.CostKind,
4219 *CM.PSE.getSE(), OrigLoop);
4220 VPRegionBlock *VectorRegion = P->getVectorLoopRegion();
4221 assert(VectorRegion && "Expected to have a vector region!");
4222 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
4223 vp_depth_first_shallow(VectorRegion->getEntry()))) {
4224 for (VPRecipeBase &R : *VPBB) {
4225 auto *VPI = dyn_cast<VPInstruction>(&R);
4226 if (!VPI)
4227 continue;
4228 switch (VPI->getOpcode()) {
4229 // Selects are only modelled in the legacy cost model for safe
4230 // divisors.
4231 case Instruction::Select: {
4232 if (auto *WR =
4233 dyn_cast_or_null<VPWidenRecipe>(VPI->getSingleUser())) {
4234 switch (WR->getOpcode()) {
4235 case Instruction::UDiv:
4236 case Instruction::SDiv:
4237 case Instruction::URem:
4238 case Instruction::SRem:
4239 continue;
4240 default:
4241 break;
4242 }
4243 }
4244 C += VPI->cost(VF, CostCtx);
4245 break;
4246 }
4247 case VPInstruction::ActiveLaneMask: {
4248 unsigned Multiplier =
4249 cast<ConstantInt>(VPI->getOperand(2)->getLiveInIRValue())
4250 ->getZExtValue();
4251 C += VPI->cost(VF * Multiplier, CostCtx);
4252 break;
4253 }
4255 C += VPI->cost(VF, CostCtx);
4256 break;
4257 default:
4258 break;
4259 }
4260 }
4261 }
4262
4263 VectorizationFactor Candidate(VF, C, ScalarCost.ScalarCost);
4264 unsigned Width =
4265 estimateElementCount(Candidate.Width, CM.getVScaleForTuning());
4266 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << VF
4267 << " costs: " << (Candidate.Cost / Width));
4268 if (VF.isScalable())
4269 LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
4270 << CM.getVScaleForTuning().value_or(1) << ")");
4271 LLVM_DEBUG(dbgs() << ".\n");
4272
4273 if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) {
4274 LLVM_DEBUG(
4275 dbgs()
4276 << "LV: Not considering vector loop of width " << VF
4277 << " because it will not generate any vector instructions.\n");
4278 continue;
4279 }
4280
4281 if (CM.OptForSize && !ForceVectorization && hasReplicatorRegion(*P)) {
4282 LLVM_DEBUG(
4283 dbgs()
4284 << "LV: Not considering vector loop of width " << VF
4285 << " because it would cause replicated blocks to be generated,"
4286 << " which isn't allowed when optimizing for size.\n");
4287 continue;
4288 }
4289
4290 if (isMoreProfitable(Candidate, ChosenFactor, P->hasScalarTail()))
4291 ChosenFactor = Candidate;
4292 }
4293 }
4294
4295 if (!EnableCondStoresVectorization && CM.hasPredStores()) {
4297 "There are conditional stores.",
4298 "store that is conditionally executed prevents vectorization",
4299 "ConditionalStore", ORE, OrigLoop);
4300 ChosenFactor = ScalarCost;
4301 }
4302
4303 LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
4304 !isMoreProfitable(ChosenFactor, ScalarCost,
4305 !CM.foldTailByMasking())) dbgs()
4306 << "LV: Vectorization seems to be not beneficial, "
4307 << "but was forced by a user.\n");
4308 return ChosenFactor;
4309}
4310#endif
4311
4312bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
4313 ElementCount VF) const {
4314 // Cross iteration phis such as fixed-order recurrences and FMaxNum/FMinNum
4315 // reductions need special handling and are currently unsupported.
4316 if (any_of(OrigLoop->getHeader()->phis(), [&](PHINode &Phi) {
4317 if (!Legal->isReductionVariable(&Phi))
4318 return Legal->isFixedOrderRecurrence(&Phi);
4319 return RecurrenceDescriptor::isFPMinMaxNumRecurrenceKind(
4320 Legal->getRecurrenceDescriptor(&Phi).getRecurrenceKind());
4321 }))
4322 return false;
4323
4324 // Phis with uses outside of the loop require special handling and are
4325 // currently unsupported.
4326 for (const auto &Entry : Legal->getInductionVars()) {
4327 // Look for uses of the value of the induction at the last iteration.
4328 Value *PostInc =
4329 Entry.first->getIncomingValueForBlock(OrigLoop->getLoopLatch());
4330 for (User *U : PostInc->users())
4331 if (!OrigLoop->contains(cast<Instruction>(U)))
4332 return false;
4333 // Look for uses of penultimate value of the induction.
4334 for (User *U : Entry.first->users())
4335 if (!OrigLoop->contains(cast<Instruction>(U)))
4336 return false;
4337 }
4338
4339 // Epilogue vectorization code has not been audited to ensure it handles
4340 // non-latch exits properly. It may be fine, but it needs to be audited and
4341 // tested.
4342 // TODO: Add support for loops with an early exit.
4343 if (OrigLoop->getExitingBlock() != OrigLoop->getLoopLatch())
4344 return false;
4345
4346 return true;
4347}
4348
4349 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
4350 const ElementCount VF, const unsigned IC) const {
4351 // FIXME: We need a much better cost-model to take different parameters such
4352 // as register pressure, code size increase and cost of extra branches into
4353 // account. For now we apply a very crude heuristic and only consider loops
4354 // with vectorization factors larger than a certain value.
4355
4356 // Allow the target to opt out entirely.
4357 if (!TTI.preferEpilogueVectorization())
4358 return false;
4359
4360 // We also consider epilogue vectorization unprofitable for targets that don't
4361 // consider interleaving beneficial (e.g. MVE).
4362 if (TTI.getMaxInterleaveFactor(VF) <= 1)
4363 return false;
4364
4365 unsigned MinVFThreshold = EpilogueVectorizationMinVF.getNumOccurrences() > 0
4366 ? EpilogueVectorizationMinVF
4367 : TTI.getEpilogueVectorizationMinVF();
4368 return estimateElementCount(VF * IC, VScaleForTuning) >= MinVFThreshold;
4369}
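// Illustrative example: for a main loop with VF = vscale x 4, IC = 2 and a
// tuning vscale of 2, the estimated element count is 4 * 2 * 2 = 16, which
// is then compared against the target's epilogue-vectorization minimum VF.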
4370
4371 VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
4372 const ElementCount MainLoopVF, unsigned IC) {
4373 VectorizationFactor Result = VectorizationFactor::Disabled();
4374 if (!EnableEpilogueVectorization) {
4375 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n");
4376 return Result;
4377 }
4378
4379 if (!CM.isScalarEpilogueAllowed()) {
4380 LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because no "
4381 "epilogue is allowed.\n");
4382 return Result;
4383 }
4384
4385 // Not really a cost consideration, but check for unsupported cases here to
4386 // simplify the logic.
4387 if (!isCandidateForEpilogueVectorization(MainLoopVF)) {
4388 LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because the loop "
4389 "is not a supported candidate.\n");
4390 return Result;
4391 }
4392
4394 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n");
4396 if (hasPlanWithVF(ForcedEC))
4397 return {ForcedEC, 0, 0};
4398
4399 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization forced factor is not "
4400 "viable.\n");
4401 return Result;
4402 }
4403
4404 if (OrigLoop->getHeader()->getParent()->hasOptSize()) {
4405 LLVM_DEBUG(
4406 dbgs() << "LEV: Epilogue vectorization skipped due to opt for size.\n");
4407 return Result;
4408 }
4409
4410 if (!CM.isEpilogueVectorizationProfitable(MainLoopVF, IC)) {
4411 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "
4412 "this loop\n");
4413 return Result;
4414 }
4415
4416 // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know
4417 // the main loop handles 8 lanes per iteration. We could still benefit from
4418 // vectorizing the epilogue loop with VF=4.
4419 ElementCount EstimatedRuntimeVF = ElementCount::getFixed(
4420 estimateElementCount(MainLoopVF, CM.getVScaleForTuning()));
4421
4422 ScalarEvolution &SE = *PSE.getSE();
4423 Type *TCType = Legal->getWidestInductionType();
4424 const SCEV *RemainingIterations = nullptr;
4425 unsigned MaxTripCount = 0;
4426 const SCEV *TC =
4427 vputils::getSCEVExprForVPValue(getPlanFor(MainLoopVF).getTripCount(), SE);
4428 assert(!isa<SCEVCouldNotCompute>(TC) && "Trip count SCEV must be computable");
4429 const SCEV *KnownMinTC;
4430 bool ScalableTC = match(TC, m_scev_c_Mul(m_SCEV(KnownMinTC), m_SCEVVScale()));
4431 bool ScalableRemIter = false;
4432 // Use versions of TC and VF in which both are either scalable or fixed.
4433 if (ScalableTC == MainLoopVF.isScalable()) {
4434 ScalableRemIter = ScalableTC;
4435 RemainingIterations =
4436 SE.getURemExpr(TC, SE.getElementCount(TCType, MainLoopVF * IC));
4437 } else if (ScalableTC) {
4438 const SCEV *EstimatedTC = SE.getMulExpr(
4439 KnownMinTC,
4440 SE.getConstant(TCType, CM.getVScaleForTuning().value_or(1)));
4441 RemainingIterations = SE.getURemExpr(
4442 EstimatedTC, SE.getElementCount(TCType, MainLoopVF * IC));
4443 } else
4444 RemainingIterations =
4445 SE.getURemExpr(TC, SE.getElementCount(TCType, EstimatedRuntimeVF * IC));
4446
4447 // No iterations left to process in the epilogue.
4448 if (RemainingIterations->isZero())
4449 return Result;
4450
4451 if (MainLoopVF.isFixed()) {
4452 MaxTripCount = MainLoopVF.getFixedValue() * IC - 1;
4453 if (SE.isKnownPredicate(CmpInst::ICMP_ULT, RemainingIterations,
4454 SE.getConstant(TCType, MaxTripCount))) {
4455 MaxTripCount = SE.getUnsignedRangeMax(RemainingIterations).getZExtValue();
4456 }
4457 LLVM_DEBUG(dbgs() << "LEV: Maximum Trip Count for Epilogue: "
4458 << MaxTripCount << "\n");
4459 }
4460
4461 auto SkipVF = [&](const SCEV *VF, const SCEV *RemIter) -> bool {
4462 return SE.isKnownPredicate(CmpInst::ICMP_UGT, VF, RemIter);
4463 };
4464 for (auto &NextVF : ProfitableVFs) {
4465 // Skip candidate VFs without a corresponding VPlan.
4466 if (!hasPlanWithVF(NextVF.Width))
4467 continue;
4468
4469 // Skip candidate VFs with widths >= the (estimated) runtime VF (scalable
4470 // vectors) or > the VF of the main loop (fixed vectors).
4471 if ((!NextVF.Width.isScalable() && MainLoopVF.isScalable() &&
4472 ElementCount::isKnownGE(NextVF.Width, EstimatedRuntimeVF)) ||
4473 (NextVF.Width.isScalable() &&
4474 ElementCount::isKnownGE(NextVF.Width, MainLoopVF)) ||
4475 (!NextVF.Width.isScalable() && !MainLoopVF.isScalable() &&
4476 ElementCount::isKnownGT(NextVF.Width, MainLoopVF)))
4477 continue;
4478
4479 // If NextVF is greater than the number of remaining iterations, the
4480 // epilogue loop would be dead. Skip such factors.
4481 // TODO: We should also consider comparing against a scalable
4482 // RemainingIterations once SCEV is able to evaluate non-canonical
4483 // vscale-based expressions.
4484 if (!ScalableRemIter) {
4485 // Handle the case where NextVF and RemainingIterations are in different
4486 // numerical spaces.
4487 ElementCount EC = NextVF.Width;
4488 if (NextVF.Width.isScalable())
4489 EC = ElementCount::getFixed(
4490 estimateElementCount(NextVF.Width, CM.getVScaleForTuning()));
4491 if (SkipVF(SE.getElementCount(TCType, EC), RemainingIterations))
4492 continue;
4493 }
4494
4495 if (Result.Width.isScalar() ||
4496 isMoreProfitable(NextVF, Result, MaxTripCount, !CM.foldTailByMasking(),
4497 /*IsEpilogue*/ true))
4498 Result = NextVF;
4499 }
4500
4501 if (Result != VectorizationFactor::Disabled())
4502 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
4503 << Result.Width << "\n");
4504 return Result;
4505}
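// Illustrative example: if the main loop runs at VF = 8 with IC = 2 and the
// trip count is 100, then 100 urem 16 = 4 iterations remain, so only
// epilogue candidates of width <= 4 survive the SkipVF filter above.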
4506
4507std::pair<unsigned, unsigned>
4508 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
4509 unsigned MinWidth = -1U;
4510 unsigned MaxWidth = 8;
4511 const DataLayout &DL = TheFunction->getDataLayout();
4512 // For in-loop reductions, no element types are added to ElementTypesInLoop
4513 // if there are no loads/stores in the loop. In this case, check through the
4514 // reduction variables to determine the maximum width.
4515 if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) {
4516 for (const auto &PhiDescriptorPair : Legal->getReductionVars()) {
4517 const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second;
4518 // When finding the min width used by the recurrence we need to account
4519 // for casts on the input operands of the recurrence.
4520 MinWidth = std::min(
4521 MinWidth,
4522 std::min(RdxDesc.getMinWidthCastToRecurrenceTypeInBits(),
4523 RdxDesc.getRecurrenceType()->getScalarSizeInBits()));
4524 MaxWidth = std::max(MaxWidth,
4525 RdxDesc.getRecurrenceType()->getScalarSizeInBits());
4526 }
4527 } else {
4528 for (Type *T : ElementTypesInLoop) {
4529 MinWidth = std::min<unsigned>(
4530 MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue());
4531 MaxWidth = std::max<unsigned>(
4532 MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue());
4533 }
4534 }
4535 return {MinWidth, MaxWidth};
4536}
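// Illustrative example: a loop that loads i8 elements and accumulates them
// into an i32 in-loop reduction ends up with {MinWidth, MaxWidth} = {8, 32};
// these bounds feed the maximum-bandwidth VF computation above.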
4537
4538 void LoopVectorizationCostModel::collectElementTypesForWidening() {
4539 ElementTypesInLoop.clear();
4540 // For each block.
4541 for (BasicBlock *BB : TheLoop->blocks()) {
4542 // For each instruction in the loop.
4543 for (Instruction &I : BB->instructionsWithoutDebug()) {
4544 Type *T = I.getType();
4545
4546 // Skip ignored values.
4547 if (ValuesToIgnore.count(&I))
4548 continue;
4549
4550 // Only examine Loads, Stores and PHINodes.
4551 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
4552 continue;
4553
4554 // Examine PHI nodes that are reduction variables. Update the type to
4555 // account for the recurrence type.
4556 if (auto *PN = dyn_cast<PHINode>(&I)) {
4557 if (!Legal->isReductionVariable(PN))
4558 continue;
4559 const RecurrenceDescriptor &RdxDesc =
4560 Legal->getRecurrenceDescriptor(PN);
4561 if (PreferInLoopReductions || useOrderedReductions(RdxDesc) ||
4562 TTI.preferInLoopReduction(RdxDesc.getRecurrenceKind(),
4563 RdxDesc.getRecurrenceType()))
4564 continue;
4565 T = RdxDesc.getRecurrenceType();
4566 }
4567
4568 // Examine the stored values.
4569 if (auto *ST = dyn_cast<StoreInst>(&I))
4570 T = ST->getValueOperand()->getType();
4571
4572 assert(T->isSized() &&
4573 "Expected the load/store/recurrence type to be sized");
4574
4575 ElementTypesInLoop.insert(T);
4576 }
4577 }
4578}
4579
4580unsigned
4581 LoopVectorizationPlanner::selectInterleaveCount(VPlan &Plan, ElementCount VF,
4582 InstructionCost LoopCost) {
4583 // -- The interleave heuristics --
4584 // We interleave the loop in order to expose ILP and reduce the loop overhead.
4585 // There are many micro-architectural considerations that we can't predict
4586 // at this level. For example, frontend pressure (on decode or fetch) due to
4587 // code size, or the number and capabilities of the execution ports.
4588 //
4589 // We use the following heuristics to select the interleave count:
4590 // 1. If the code has reductions, then we interleave to break the cross
4591 // iteration dependency.
4592 // 2. If the loop is really small, then we interleave to reduce the loop
4593 // overhead.
4594 // 3. We don't interleave if we think that we will spill registers to memory
4595 // due to the increased register pressure.
4596
4597 // Only interleave tail-folded loops if wide lane masks are requested, as the
4598 // overhead of multiple instructions to calculate the predicate is likely
4599 // not beneficial. If a scalar epilogue is not allowed for any other reason,
4600 // do not interleave.
4601 if (!CM.isScalarEpilogueAllowed() &&
4602 !(CM.preferPredicatedLoop() && CM.useWideActiveLaneMask()))
4603 return 1;
4604
4605 // Do not interleave if EVL is preferred and no User IC is specified.
4606 if (CM.foldTailWithEVL()) {
4607 LLVM_DEBUG(dbgs() << "LV: Preference for VP intrinsics indicated. "
4608 "Unroll factor forced to be 1.\n");
4609 return 1;
4610 }
4611
4612 // The maximum safe dependence distance was already used to limit the VF, so do not interleave.
4613 if (!Legal->isSafeForAnyVectorWidth())
4614 return 1;
4615
4616 // We don't attempt to perform interleaving for loops with uncountable early
4617 // exits because the VPInstruction::AnyOf code cannot currently handle
4618 // multiple parts.
4619 if (Plan.hasEarlyExit())
4620 return 1;
4621
4622 const bool HasReductions =
4623 any_of(Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis(),
4624 IsaPred<VPReductionPHIRecipe>);
4625
4626 // If we did not calculate the cost for VF (because the user selected the VF)
4627 // then we calculate the cost of VF here.
4628 if (LoopCost == 0) {
4629 if (VF.isScalar())
4630 LoopCost = CM.expectedCost(VF);
4631 else
4632 LoopCost = cost(Plan, VF);
4633 assert(LoopCost.isValid() && "Expected to have chosen a VF with valid cost");
4634
4635 // Loop body is free and there is no need for interleaving.
4636 if (LoopCost == 0)
4637 return 1;
4638 }
4639
4640 VPRegisterUsage R =
4641 calculateRegisterUsageForPlan(Plan, {VF}, TTI, CM.ValuesToIgnore)[0];
4642 // We divide by these constants so assume that we have at least one
4643 // instruction that uses at least one register.
4644 for (auto &Pair : R.MaxLocalUsers) {
4645 Pair.second = std::max(Pair.second, 1U);
4646 }
4647
4648 // We calculate the interleave count using the following formula.
4649 // Subtract the number of loop invariants from the number of available
4650 // registers. These registers are used by all of the interleaved instances.
4651 // Next, divide the remaining registers by the number of registers that is
4652 // required by the loop, in order to estimate how many parallel instances
4653 // fit without causing spills. All of this is rounded down if necessary to be
4654 // a power of two. We want a power-of-two interleave count to simplify any
4655 // addressing operations or alignment considerations.
4656 // We also want power of two interleave counts to ensure that the induction
4657 // variable of the vector loop wraps to zero, when tail is folded by masking;
4658 // this currently happens when OptForSize, in which case IC is set to 1 above.
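// Illustrative only (numbers are hypothetical, not taken from any target):
// with 32 registers in a class, 2 loop-invariant values and a maximum of 7
// registers live inside the loop, the per-class estimate below would be
// bit_floor((32 - 2) / 7) = bit_floor(4) = 4 interleaved copies before
// spills are expected.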
4659 unsigned IC = UINT_MAX;
4660
4661 for (const auto &Pair : R.MaxLocalUsers) {
4662 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(Pair.first);
4663 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
4664 << " registers of "
4665 << TTI.getRegisterClassName(Pair.first)
4666 << " register class\n");
4667 if (VF.isScalar()) {
4668 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
4669 TargetNumRegisters = ForceTargetNumScalarRegs;
4670 } else {
4671 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
4672 TargetNumRegisters = ForceTargetNumVectorRegs;
4673 }
4674 unsigned MaxLocalUsers = Pair.second;
4675 unsigned LoopInvariantRegs = 0;
4676 if (R.LoopInvariantRegs.contains(Pair.first))
4677 LoopInvariantRegs = R.LoopInvariantRegs[Pair.first];
4678
4679 unsigned TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs) /
4680 MaxLocalUsers);
4681 // Don't count the induction variable as interleaved.
4682 if (EnableIndVarRegisterHeur) {
4683 TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs - 1) /
4684 std::max(1U, (MaxLocalUsers - 1)));
4685 }
4686
4687 IC = std::min(IC, TmpIC);
4688 }
4689
4690 // Clamp the interleave ranges to reasonable counts.
4691 unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
4692
4693 // Check if the user has overridden the max.
4694 if (VF.isScalar()) {
4695 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
4696 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
4697 } else {
4698 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
4699 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
4700 }
4701
4702 // Try to get the exact trip count, or, failing that, an estimate based on
4703 // profiling data or ConstantMax from PSE.
4704 auto BestKnownTC = getSmallBestKnownTC(PSE, OrigLoop);
4705
4706 // For fixed length VFs treat a scalable trip count as unknown.
4707 if (BestKnownTC && (BestKnownTC->isFixed() || VF.isScalable())) {
4708 // Re-evaluate trip counts and VFs to be in the same numerical space.
4709 unsigned AvailableTC =
4710 estimateElementCount(*BestKnownTC, CM.getVScaleForTuning());
4711 unsigned EstimatedVF = estimateElementCount(VF, CM.getVScaleForTuning());
4712
4713 // At least one iteration must be scalar when this constraint holds. So the
4714 // maximum available iterations for interleaving is one less.
4715 if (CM.requiresScalarEpilogue(VF.isVector()))
4716 --AvailableTC;
4717
4718 unsigned InterleaveCountLB = bit_floor(std::max(
4719 1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));
4720
4721 if (getSmallConstantTripCount(PSE.getSE(), OrigLoop).isNonZero()) {
4722 // If the best known trip count is exact, we select between two
4723 // prospective ICs, where
4724 //
4725 // 1) the aggressive IC is capped by the trip count divided by VF
4726 // 2) the conservative IC is capped by the trip count divided by (VF * 2)
4727 //
4728 // The final IC is selected in a way that the epilogue loop trip count is
4729 // minimized while maximizing the IC itself, so that we either run the
4730 // vector loop at least once if it generates a small epilogue loop, or
4731 // else we run the vector loop at least twice.
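// Worked example (illustrative values): with AvailableTC = 32, an
// EstimatedVF of 4 and a target maximum of 8, the conservative bound is
// bit_floor(min(32 / (4 * 2), 8)) = 4 and the aggressive bound is
// bit_floor(min(32 / 4, 8)) = 8; both leave a scalar tail of 0 iterations,
// so the larger bound is kept by the code below.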
4732
4733 unsigned InterleaveCountUB = bit_floor(std::max(
4734 1u, std::min(AvailableTC / EstimatedVF, MaxInterleaveCount)));
4735 MaxInterleaveCount = InterleaveCountLB;
4736
4737 if (InterleaveCountUB != InterleaveCountLB) {
4738 unsigned TailTripCountUB =
4739 (AvailableTC % (EstimatedVF * InterleaveCountUB));
4740 unsigned TailTripCountLB =
4741 (AvailableTC % (EstimatedVF * InterleaveCountLB));
4742 // If both produce the same scalar tail, maximize the IC to do the same
4743 // work in fewer vector loop iterations.
4744 if (TailTripCountUB == TailTripCountLB)
4745 MaxInterleaveCount = InterleaveCountUB;
4746 }
4747 } else {
4748 // If trip count is an estimated compile time constant, limit the
4749 // IC to be capped by the trip count divided by VF * 2, such that the
4750 // vector loop runs at least twice to make interleaving seem profitable
4751 // when there is an epilogue loop present. Since the exact trip count is
4752 // not known, we choose to be conservative in our IC estimate.
4753 MaxInterleaveCount = InterleaveCountLB;
4754 }
4755 }
4756
4757 assert(MaxInterleaveCount > 0 &&
4758 "Maximum interleave count must be greater than 0");
4759
4760 // Clamp the calculated IC to be between the 1 and the max interleave count
4761 // that the target and trip count allows.
4762 if (IC > MaxInterleaveCount)
4763 IC = MaxInterleaveCount;
4764 else
4765 // Make sure IC is greater than 0.
4766 IC = std::max(1u, IC);
4767
4768 assert(IC > 0 && "Interleave count must be greater than 0.");
4769
4770 // Interleave if we vectorized this loop and there is a reduction that could
4771 // benefit from interleaving.
4772 if (VF.isVector() && HasReductions) {
4773 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
4774 return IC;
4775 }
4776
4777 // For any scalar loop that either requires runtime checks or predication we
4778 // are better off leaving this to the unroller. Note that if we've already
4779 // vectorized the loop we will have done the runtime check and so interleaving
4780 // won't require further checks.
4781 bool ScalarInterleavingRequiresPredication =
4782 (VF.isScalar() && any_of(OrigLoop->blocks(), [this](BasicBlock *BB) {
4783 return Legal->blockNeedsPredication(BB);
4784 }));
4785 bool ScalarInterleavingRequiresRuntimePointerCheck =
4786 (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);
4787
4788 // We want to interleave small loops in order to reduce the loop overhead and
4789 // potentially expose ILP opportunities.
4790 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
4791 << "LV: IC is " << IC << '\n'
4792 << "LV: VF is " << VF << '\n');
4793 const bool AggressivelyInterleaveReductions =
4794 TTI.enableAggressiveInterleaving(HasReductions);
4795 if (!ScalarInterleavingRequiresRuntimePointerCheck &&
4796 !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) {
4797 // We assume that the cost overhead is 1 and we use the cost model
4798 // to estimate the cost of the loop and interleave until the cost of the
4799 // loop overhead is about 5% of the cost of the loop.
4800 unsigned SmallIC = std::min(IC, (unsigned)llvm::bit_floor<uint64_t>(
4801 SmallLoopCost / LoopCost.getValue()));
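// For illustration only, assuming SmallLoopCost retains its default of 20:
// a loop body costing 6 gives SmallIC = min(IC, bit_floor(20 / 6)) =
// min(IC, 2), i.e. cheap bodies are interleaved until their overhead is
// amortized.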
4802
4803 // Interleave until store/load ports (estimated by max interleave count) are
4804 // saturated.
4805 unsigned NumStores = 0;
4806 unsigned NumLoads = 0;
4809 for (VPRecipeBase &R : *VPBB) {
4811 NumLoads++;
4812 continue;
4813 }
4815 NumStores++;
4816 continue;
4817 }
4818
4819 if (auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(&R)) {
4820 if (unsigned StoreOps = InterleaveR->getNumStoreOperands())
4821 NumStores += StoreOps;
4822 else
4823 NumLoads += InterleaveR->getNumDefinedValues();
4824 continue;
4825 }
4826 if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
4827 NumLoads += isa<LoadInst>(RepR->getUnderlyingInstr());
4828 NumStores += isa<StoreInst>(RepR->getUnderlyingInstr());
4829 continue;
4830 }
4831 if (isa<VPHistogramRecipe>(&R)) {
4832 NumLoads++;
4833 NumStores++;
4834 continue;
4835 }
4836 }
4837 }
4838 unsigned StoresIC = IC / (NumStores ? NumStores : 1);
4839 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
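// Example with illustrative values: IC = 8, NumStores = 2 and NumLoads = 3
// give StoresIC = 8 / 2 = 4 and LoadsIC = 8 / 3 = 2, so the port-saturation
// check below would consider max(4, 2) = 4.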
4840
4841 // There is little point in interleaving for reductions containing selects
4842 // and compares when VF=1 since it may just create more overhead than it's
4843 // worth for loops with small trip counts. This is because we still have to
4844 // do the final reduction after the loop.
4845 bool HasSelectCmpReductions =
4846 HasReductions &&
4848 [](VPRecipeBase &R) {
4849 auto *RedR = dyn_cast<VPReductionPHIRecipe>(&R);
4850 return RedR && (RecurrenceDescriptor::isAnyOfRecurrenceKind(
4851 RedR->getRecurrenceKind()) ||
4852 RecurrenceDescriptor::isFindIVRecurrenceKind(
4853 RedR->getRecurrenceKind()));
4854 });
4855 if (HasSelectCmpReductions) {
4856 LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n");
4857 return 1;
4858 }
4859
4860 // If we have a scalar reduction (vector reductions are already dealt with
4861 // by this point), we can increase the critical path length if the loop
4862 // we're interleaving is inside another loop. For tree-wise reductions
4863 // set the limit to 2, and for ordered reductions it's best to disable
4864 // interleaving entirely.
4865 if (HasReductions && OrigLoop->getLoopDepth() > 1) {
4866 bool HasOrderedReductions =
4868 [](VPRecipeBase &R) {
4869 auto *RedR = dyn_cast<VPReductionPHIRecipe>(&R);
4870
4871 return RedR && RedR->isOrdered();
4872 });
4873 if (HasOrderedReductions) {
4874 LLVM_DEBUG(
4875 dbgs() << "LV: Not interleaving scalar ordered reductions.\n");
4876 return 1;
4877 }
4878
4879 unsigned F = MaxNestedScalarReductionIC;
4880 SmallIC = std::min(SmallIC, F);
4881 StoresIC = std::min(StoresIC, F);
4882 LoadsIC = std::min(LoadsIC, F);
4883 }
4884
4885 if (EnableLoadStoreRuntimeInterleave &&
4886 std::max(StoresIC, LoadsIC) > SmallIC) {
4887 LLVM_DEBUG(
4888 dbgs() << "LV: Interleaving to saturate store or load ports.\n");
4889 return std::max(StoresIC, LoadsIC);
4890 }
4891
4892 // If there are scalar reductions and TTI has enabled aggressive
4893 // interleaving for reductions, we will interleave to expose ILP.
4894 if (VF.isScalar() && AggressivelyInterleaveReductions) {
4895 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
4896 // Interleave no less than SmallIC but not as aggressive as the normal IC
4897 // to satisfy the rare situation when resources are too limited.
4898 return std::max(IC / 2, SmallIC);
4899 }
4900
4901 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
4902 return SmallIC;
4903 }
4904
4905 // Interleave if this is a large loop (small loops are already dealt with by
4906 // this point) that could benefit from interleaving.
4907 if (AggressivelyInterleaveReductions) {
4908 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
4909 return IC;
4910 }
4911
4912 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
4913 return 1;
4914}
4915
4916bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I,
4917 ElementCount VF) {
4918 // TODO: Cost model for emulated masked load/store is completely
4919 // broken. This hack guides the cost model to use an artificially
4920 // high enough value to practically disable vectorization with such
4921 // operations, except where previously deployed legality hack allowed
4922 // using very low cost values. This is to avoid regressions coming simply
4923 // from moving "masked load/store" check from legality to cost model.
4924 // Masked Load/Gather emulation was previously never allowed.
4925 // Limited number of Masked Store/Scatter emulation was allowed.
4926 assert((isPredicatedInst(I)) &&
4927 "Expecting a scalar emulated instruction");
4928 return isa<LoadInst>(I) ||
4929 (isa<StoreInst>(I) &&
4930 NumPredStores > NumberOfStoresToPredicate);
4931}
4932
4933void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
4934 assert(VF.isVector() && "Expected VF >= 2");
4935
4936 // If we've already collected the instructions to scalarize or the predicated
4937 // BBs after vectorization, there's nothing to do. Collection may already have
4938 // occurred if we have a user-selected VF and are now computing the expected
4939 // cost for interleaving.
4940 if (InstsToScalarize.contains(VF) ||
4941 PredicatedBBsAfterVectorization.contains(VF))
4942 return;
4943
4944 // Initialize a mapping for VF in InstsToScalarize. If we find that it's
4945 // not profitable to scalarize any instructions, the presence of VF in the
4946 // map will indicate that we've analyzed it already.
4947 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
4948
4949 // Find all the instructions that are scalar with predication in the loop and
4950 // determine if it would be better to not if-convert the blocks they are in.
4951 // If so, we also record the instructions to scalarize.
4952 for (BasicBlock *BB : TheLoop->blocks()) {
4953 if (!blockNeedsPredicationForAnyReason(BB))
4954 continue;
4955 for (Instruction &I : *BB)
4956 if (isScalarWithPredication(&I, VF)) {
4957 ScalarCostsTy ScalarCosts;
4958 // Do not apply discount logic for:
4959 // 1. Scalars after vectorization, as there will only be a single copy
4960 // of the instruction.
4961 // 2. Scalable VF, as that would lead to invalid scalarization costs.
4962 // 3. Emulated masked memrefs, if a hacked cost is needed.
4963 if (!isScalarAfterVectorization(&I, VF) && !VF.isScalable() &&
4964 !useEmulatedMaskMemRefHack(&I, VF) &&
4965 computePredInstDiscount(&I, ScalarCosts, VF) >= 0) {
4966 for (const auto &[I, IC] : ScalarCosts)
4967 ScalarCostsVF.insert({I, IC});
4968 // Check if we decided to scalarize a call. If so, update the widening
4969 // decision of the call to CM_Scalarize with the computed scalar cost.
4970 for (const auto &[I, Cost] : ScalarCosts) {
4971 auto *CI = dyn_cast<CallInst>(I);
4972 if (!CI || !CallWideningDecisions.contains({CI, VF}))
4973 continue;
4974 CallWideningDecisions[{CI, VF}].Kind = CM_Scalarize;
4975 CallWideningDecisions[{CI, VF}].Cost = Cost;
4976 }
4977 }
4978 // Remember that BB will remain after vectorization.
4979 PredicatedBBsAfterVectorization[VF].insert(BB);
4980 for (auto *Pred : predecessors(BB)) {
4981 if (Pred->getSingleSuccessor() == BB)
4982 PredicatedBBsAfterVectorization[VF].insert(Pred);
4983 }
4984 }
4985 }
4986}
4987
4988InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
4989 Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
4990 assert(!isUniformAfterVectorization(PredInst, VF) &&
4991 "Instruction marked uniform-after-vectorization will be predicated");
4992
4993 // Initialize the discount to zero, meaning that the scalar version and the
4994 // vector version cost the same.
4995 InstructionCost Discount = 0;
4996
4997 // Holds instructions to analyze. The instructions we visit are mapped in
4998 // ScalarCosts. Those instructions are the ones that would be scalarized if
4999 // we find that the scalar version costs less.
5001
5002 // Returns true if the given instruction can be scalarized.
5003 auto CanBeScalarized = [&](Instruction *I) -> bool {
5004 // We only attempt to scalarize instructions forming a single-use chain
5005 // from the original predicated block that would otherwise be vectorized.
5006 // Although not strictly necessary, we give up on instructions we know will
5007 // already be scalar to avoid traversing chains that are unlikely to be
5008 // beneficial.
5009 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
5010 isScalarAfterVectorization(I, VF))
5011 return false;
5012
5013 // If the instruction is scalar with predication, it will be analyzed
5014 // separately. We ignore it within the context of PredInst.
5015 if (isScalarWithPredication(I, VF))
5016 return false;
5017
5018 // If any of the instruction's operands are uniform after vectorization,
5019 // the instruction cannot be scalarized. This prevents, for example, a
5020 // masked load from being scalarized.
5021 //
5022 // We assume we will only emit a value for lane zero of an instruction
5023 // marked uniform after vectorization, rather than VF identical values.
5024 // Thus, if we scalarize an instruction that uses a uniform, we would
5025 // create uses of values corresponding to the lanes we aren't emitting code
5026 // for. This behavior can be changed by allowing getScalarValue to clone
5027 // the lane zero values for uniforms rather than asserting.
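// For example (hypothetical values): if %a is uniform-after-vectorization
// and %b = add %a, 1 were scalarized for VF = 4, lanes 1..3 of %b would
// need lanes 1..3 of %a, which are never emitted, so we must refuse to
// scalarize %b here.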
5028 for (Use &U : I->operands())
5029 if (auto *J = dyn_cast<Instruction>(U.get()))
5030 if (isUniformAfterVectorization(J, VF))
5031 return false;
5032
5033 // Otherwise, we can scalarize the instruction.
5034 return true;
5035 };
5036
5037 // Compute the expected cost discount from scalarizing the entire expression
5038 // feeding the predicated instruction. We currently only consider expressions
5039 // that are single-use instruction chains.
5040 Worklist.push_back(PredInst);
5041 while (!Worklist.empty()) {
5042 Instruction *I = Worklist.pop_back_val();
5043
5044 // If we've already analyzed the instruction, there's nothing to do.
5045 if (ScalarCosts.contains(I))
5046 continue;
5047
5048 // Cannot scalarize fixed-order recurrence phis at the moment.
5049 if (isa<PHINode>(I) && Legal->isFixedOrderRecurrence(cast<PHINode>(I)))
5050 continue;
5051
5052 // Compute the cost of the vector instruction. Note that this cost already
5053 // includes the scalarization overhead of the predicated instruction.
5054 InstructionCost VectorCost = getInstructionCost(I, VF);
5055
5056 // Compute the cost of the scalarized instruction. This cost is the cost of
5057 // the instruction as if it wasn't if-converted and instead remained in the
5058 // predicated block. We will scale this cost by block probability after
5059 // computing the scalarization overhead.
5060 InstructionCost ScalarCost =
5061 VF.getFixedValue() * getInstructionCost(I, ElementCount::getFixed(1));
5062
5063 // Compute the scalarization overhead of needed insertelement instructions
5064 // and phi nodes.
5065 if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) {
5066 Type *WideTy = toVectorizedTy(I->getType(), VF);
5067 for (Type *VectorTy : getContainedTypes(WideTy)) {
5068 ScalarCost += TTI.getScalarizationOverhead(
5069 cast<VectorType>(VectorTy), APInt::getAllOnes(VF.getFixedValue()),
5070 /*Insert=*/true,
5071 /*Extract=*/false, CostKind);
5072 }
5073 ScalarCost +=
5074 VF.getFixedValue() * TTI.getCFInstrCost(Instruction::PHI, CostKind);
5075 }
5076
5077 // Compute the scalarization overhead of needed extractelement
5078 // instructions. For each of the instruction's operands, if the operand can
5079 // be scalarized, add it to the worklist; otherwise, account for the
5080 // overhead.
5081 for (Use &U : I->operands())
5082 if (auto *J = dyn_cast<Instruction>(U.get())) {
5083 assert(canVectorizeTy(J->getType()) &&
5084 "Instruction has non-scalar type");
5085 if (CanBeScalarized(J))
5086 Worklist.push_back(J);
5087 else if (needsExtract(J, VF)) {
5088 Type *WideTy = toVectorizedTy(J->getType(), VF);
5089 for (Type *VectorTy : getContainedTypes(WideTy)) {
5090 ScalarCost += TTI.getScalarizationOverhead(
5091 cast<VectorType>(VectorTy),
5092 APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ false,
5093 /*Extract*/ true, CostKind);
5094 }
5095 }
5096 }
5097
5098 // Scale the total scalar cost by block probability.
5099 ScalarCost /= getPredBlockCostDivisor(CostKind, I->getParent());
5100
5101 // Compute the discount. A non-negative discount means the vector version
5102 // of the instruction costs more, and scalarizing would be beneficial.
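// E.g. (illustrative numbers only): a VectorCost of 10 against a
// probability-scaled ScalarCost of 6 grows the running Discount by 4,
// suggesting that scalarizing this chain is likely profitable.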
5103 Discount += VectorCost - ScalarCost;
5104 ScalarCosts[I] = ScalarCost;
5105 }
5106
5107 return Discount;
5108}
5109
5110InstructionCost LoopVectorizationCostModel::expectedCost(ElementCount VF) {
5111 InstructionCost Cost;
5112
5113 // If the vector loop gets executed exactly once with the given VF, ignore the
5114 // costs of comparison and induction instructions, as they'll get simplified
5115 // away.
5116 SmallPtrSet<Instruction *, 2> ValuesToIgnoreForVF;
5117 auto TC = getSmallConstantTripCount(PSE.getSE(), TheLoop);
5118 if (TC == VF && !foldTailByMasking())
5120 ValuesToIgnoreForVF);
5121
5122 // For each block.
5123 for (BasicBlock *BB : TheLoop->blocks()) {
5124 InstructionCost BlockCost;
5125
5126 // For each instruction in the old loop.
5127 for (Instruction &I : BB->instructionsWithoutDebug()) {
5128 // Skip ignored values.
5129 if (ValuesToIgnore.count(&I) || ValuesToIgnoreForVF.count(&I) ||
5130 (VF.isVector() && VecValuesToIgnore.count(&I)))
5131 continue;
5132
5134
5135 // Check if we should override the cost.
5136 if (C.isValid() && ForceTargetInstructionCost.getNumOccurrences() > 0)
5138
5139 BlockCost += C;
5140 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C << " for VF "
5141 << VF << " For instruction: " << I << '\n');
5142 }
5143
5144 // If we are vectorizing a predicated block, it will have been
5145 // if-converted. This means that the block's instructions (aside from
5146 // stores and instructions that may divide by zero) will now be
5147 // unconditionally executed. For the scalar case, we may not always execute
5148 // the predicated block, if it is an if-else block. Thus, scale the block's
5149 // cost by the probability of executing it.
5150 // getPredBlockCostDivisor will return 1 for blocks that are only predicated
5151 // by the header mask when folding the tail.
5152 if (VF.isScalar())
5153 BlockCost /= getPredBlockCostDivisor(CostKind, BB);
5154
5155 Cost += BlockCost;
5156 }
5157
5158 return Cost;
5159}
5160
5161/// Gets Address Access SCEV after verifying that the access pattern
5162/// is loop invariant except the induction variable dependence.
5163///
5164/// This SCEV can be sent to the Target in order to estimate the address
5165/// calculation cost.
5166static const SCEV *getAddressAccessSCEV(
5167 Value *Ptr,
5168 LoopVectorizationLegality *Legal,
5169 PredicatedScalarEvolution &PSE,
5170 const Loop *TheLoop) {
5171
5172 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
5173 if (!Gep)
5174 return nullptr;
5175
5176 // We are looking for a gep with all loop invariant indices except for one
5177 // which should be an induction variable.
5178 auto *SE = PSE.getSE();
5179 unsigned NumOperands = Gep->getNumOperands();
5180 for (unsigned Idx = 1; Idx < NumOperands; ++Idx) {
5181 Value *Opd = Gep->getOperand(Idx);
5182 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
5183 !Legal->isInductionVariable(Opd))
5184 return nullptr;
5185 }
5186
5187 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV.
5188 return PSE.getSCEV(Ptr);
5189}
5190
5191InstructionCost
5192LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
5193 ElementCount VF) {
5194 assert(VF.isVector() &&
5195 "Scalarization cost of instruction implies vectorization.");
5196 if (VF.isScalable())
5197 return InstructionCost::getInvalid();
5198
5199 Type *ValTy = getLoadStoreType(I);
5200 auto *SE = PSE.getSE();
5201
5202 unsigned AS = getLoadStoreAddressSpace(I);
5203 Value *Ptr = getLoadStorePointerOperand(I);
5204 Type *PtrTy = toVectorTy(Ptr->getType(), VF);
5205 // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost`
5206 // that it is being called from this specific place.
5207
5208 // Figure out whether the access is strided and get the stride value
5209 // if it's known in compile time
5210 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
5211
5212 // Get the cost of the scalar memory instruction and address computation.
5213 InstructionCost Cost = VF.getFixedValue() * TTI.getAddressComputationCost(
5214 PtrTy, SE, PtrSCEV, CostKind);
5215
5216 // Don't pass *I here, since it is scalar but will actually be part of a
5217 // vectorized loop where the user of it is a vectorized instruction.
5218 const Align Alignment = getLoadStoreAlignment(I);
5219 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
5220 Cost += VF.getFixedValue() *
5221 TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment,
5222 AS, CostKind, OpInfo);
5223
5224 // Get the overhead of the extractelement and insertelement instructions
5225 // we might create due to scalarization.
5227
5228 // If we have a predicated load/store, it will need extra i1 extracts and
5229 // conditional branches, but may not be executed for each vector lane. Scale
5230 // the cost by the probability of executing the predicated block.
5231 if (isPredicatedInst(I)) {
5232 Cost /= getPredBlockCostDivisor(CostKind, I->getParent());
5233
5234 // Add the cost of an i1 extract and a branch
5235 auto *VecI1Ty =
5236 VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF);
5237 Cost += TTI.getScalarizationOverhead(
5238 VecI1Ty, APInt::getAllOnes(VF.getFixedValue()),
5239 /*Insert=*/false, /*Extract=*/true, CostKind);
5240 Cost += TTI.getCFInstrCost(Instruction::Br, CostKind);
5241
5242 if (useEmulatedMaskMemRefHack(I, VF))
5243 // Artificially setting to a high enough value to practically disable
5244 // vectorization with such operations.
5245 Cost = 3000000;
5246 }
5247
5248 return Cost;
5249}
5250
5252LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
5253 ElementCount VF) {
5254 Type *ValTy = getLoadStoreType(I);
5255 auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
5257 unsigned AS = getLoadStoreAddressSpace(I);
5258 int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr);
5259
5260 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
5261 "Stride should be 1 or -1 for consecutive memory access");
5262 const Align Alignment = getLoadStoreAlignment(I);
5263 InstructionCost Cost = 0;
5264 if (Legal->isMaskRequired(I)) {
5265 unsigned IID = I->getOpcode() == Instruction::Load
5266 ? Intrinsic::masked_load
5267 : Intrinsic::masked_store;
5268 Cost += TTI.getMaskedMemoryOpCost({IID, VectorTy, Alignment, AS}, CostKind);
5269 } else {
5270 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
5271 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
5272 CostKind, OpInfo, I);
5273 }
5274
5275 bool Reverse = ConsecutiveStride < 0;
5276 if (Reverse)
5278 VectorTy, {}, CostKind, 0);
5279 return Cost;
5280}
5281
5282InstructionCost
5283LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
5284 ElementCount VF) {
5285 assert(Legal->isUniformMemOp(*I, VF));
5286
5287 Type *ValTy = getLoadStoreType(I);
5289 auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
5290 const Align Alignment = getLoadStoreAlignment(I);
5291 unsigned AS = getLoadStoreAddressSpace(I);
5292 if (isa<LoadInst>(I)) {
5293 return TTI.getAddressComputationCost(PtrTy, nullptr, nullptr, CostKind) +
5294 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
5295 CostKind) +
5297 VectorTy, {}, CostKind);
5298 }
5299 StoreInst *SI = cast<StoreInst>(I);
5300
5301 bool IsLoopInvariantStoreValue = Legal->isInvariant(SI->getValueOperand());
5302 // TODO: We have existing tests that request the cost of extracting element
5303 // VF.getKnownMinValue() - 1 from a scalable vector. This does not represent
5304 // the actual generated code, which involves extracting the last element of
5305 // a scalable vector where the lane to extract is unknown at compile time.
5307 TTI.getAddressComputationCost(PtrTy, nullptr, nullptr, CostKind) +
5308 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, CostKind);
5309 if (!IsLoopInvariantStoreValue)
5310 Cost += TTI.getIndexedVectorInstrCostFromEnd(Instruction::ExtractElement,
5311 VectorTy, CostKind, 0);
5312 return Cost;
5313}
5314
5315InstructionCost
5316LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
5317 ElementCount VF) {
5318 Type *ValTy = getLoadStoreType(I);
5319 auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
5320 const Align Alignment = getLoadStoreAlignment(I);
5322 Type *PtrTy = Ptr->getType();
5323
5324 if (!Legal->isUniform(Ptr, VF))
5325 PtrTy = toVectorTy(PtrTy, VF);
5326
5327 return TTI.getAddressComputationCost(PtrTy, nullptr, nullptr, CostKind) +
5328 TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr,
5329 Legal->isMaskRequired(I), Alignment,
5330 CostKind, I);
5331}
5332
5333InstructionCost
5334LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
5335 ElementCount VF) {
5336 const auto *Group = getInterleavedAccessGroup(I);
5337 assert(Group && "Fail to get an interleaved access group.");
5338
5339 Instruction *InsertPos = Group->getInsertPos();
5340 Type *ValTy = getLoadStoreType(InsertPos);
5341 auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
5342 unsigned AS = getLoadStoreAddressSpace(InsertPos);
5343
5344 unsigned InterleaveFactor = Group->getFactor();
5345 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
5346
5347 // Holds the indices of existing members in the interleaved group.
5348 SmallVector<unsigned, 4> Indices;
5349 for (unsigned IF = 0; IF < InterleaveFactor; IF++)
5350 if (Group->getMember(IF))
5351 Indices.push_back(IF);
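// For instance (illustrative): a group with factor 4 whose only members sit
// at positions 0 and 2 yields Indices = {0, 2}; only those members are
// charged in the interleaved cost query below.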
5352
5353 // Calculate the cost of the whole interleaved group.
5354 bool UseMaskForGaps =
5355 (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
5356 (isa<StoreInst>(I) && !Group->isFull());
5357 InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
5358 InsertPos->getOpcode(), WideVecTy, Group->getFactor(), Indices,
5359 Group->getAlign(), AS, CostKind, Legal->isMaskRequired(I),
5360 UseMaskForGaps);
5361
5362 if (Group->isReverse()) {
5363 // TODO: Add support for reversed masked interleaved access.
5364 assert(!Legal->isMaskRequired(I) &&
5365 "Reverse masked interleaved access not supported.");
5366 Cost += Group->getNumMembers() *
5368 VectorTy, {}, CostKind, 0);
5369 }
5370 return Cost;
5371}
5372
5373std::optional<InstructionCost>
5374LoopVectorizationCostModel::getReductionPatternCost(Instruction *I,
5375 ElementCount VF,
5376 Type *Ty) const {
5377 using namespace llvm::PatternMatch;
5378 // Early exit if there are no in-loop reductions.
5379 if (InLoopReductions.empty() || VF.isScalar() || !isa<VectorType>(Ty))
5380 return std::nullopt;
5381 auto *VectorTy = cast<VectorType>(Ty);
5382
5383 // We are looking for one of the following patterns, finding the minimal acceptable cost:
5384 // reduce(mul(ext(A), ext(B))) or
5385 // reduce(mul(A, B)) or
5386 // reduce(ext(A)) or
5387 // reduce(A).
5388 // The basic idea is that we walk down the tree to do that, finding the root
5389 // reduction instruction in InLoopReductionImmediateChains. From there we find
5390 // the pattern of mul/ext and test the cost of the entire pattern vs the cost
5391 // of the components. If the reduction cost is lower, then we return it for
5392 // the reduction instruction and 0 for the other instructions in the pattern.
5393 // If it is not, we return an invalid cost specifying the original cost method
5394 // should be used.
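// Sketch of the comparison performed below (costs are illustrative): for
// reduce.add(mul(ext(A), ext(B))), if the target reports a fused
// multiply-accumulate reduction cost of 2 while ext + ext + mul plus the
// base reduction would cost 1 + 1 + 1 + 2 = 5, the value 2 is returned for
// the reduction instruction and 0 for the other instructions in the chain.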
5395 Instruction *RetI = I;
5396 if (match(RetI, m_ZExtOrSExt(m_Value()))) {
5397 if (!RetI->hasOneUser())
5398 return std::nullopt;
5399 RetI = RetI->user_back();
5400 }
5401
5402 if (match(RetI, m_OneUse(m_Mul(m_Value(), m_Value()))) &&
5403 RetI->user_back()->getOpcode() == Instruction::Add) {
5404 RetI = RetI->user_back();
5405 }
5406
5407 // Test if the found instruction is a reduction, and if not return an invalid
5408 // cost specifying the parent to use the original cost modelling.
5409 Instruction *LastChain = InLoopReductionImmediateChains.lookup(RetI);
5410 if (!LastChain)
5411 return std::nullopt;
5412
5413 // Find the reduction this chain is a part of and calculate the basic cost of
5414 // the reduction on its own.
5415 Instruction *ReductionPhi = LastChain;
5416 while (!isa<PHINode>(ReductionPhi))
5417 ReductionPhi = InLoopReductionImmediateChains.at(ReductionPhi);
5418
5419 const RecurrenceDescriptor &RdxDesc =
5420 Legal->getRecurrenceDescriptor(cast<PHINode>(ReductionPhi));
5421
5422 InstructionCost BaseCost;
5423 RecurKind RK = RdxDesc.getRecurrenceKind();
5424 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK)) {
5425 Intrinsic::ID MinMaxID = getMinMaxReductionIntrinsicOp(RK);
5426 BaseCost = TTI.getMinMaxReductionCost(MinMaxID, VectorTy,
5427 RdxDesc.getFastMathFlags(), CostKind);
5428 } else {
5429 BaseCost = TTI.getArithmeticReductionCost(
5430 RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind);
5431 }
5432
5433 // For a call to the llvm.fmuladd intrinsic we need to add the cost of a
5434 // normal fmul instruction to the cost of the fadd reduction.
5435 if (RK == RecurKind::FMulAdd)
5436 BaseCost +=
5437 TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind);
5438
5439 // If we're using ordered reductions then we can just return the base cost
5440 // here, since getArithmeticReductionCost calculates the full ordered
5441 // reduction cost when FP reassociation is not allowed.
5442 if (useOrderedReductions(RdxDesc))
5443 return BaseCost;
5444
5445 // Get the operand that was not the reduction chain and match it to one of the
5446 // patterns, returning the better cost if it is found.
5447 Instruction *RedOp = RetI->getOperand(1) == LastChain
5448 ? dyn_cast<Instruction>(RetI->getOperand(0))
5449 : dyn_cast<Instruction>(RetI->getOperand(1));
5450
5451 VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy);
5452
5453 Instruction *Op0, *Op1;
5454 if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
5455 match(RedOp,
5456 m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) &&
5457 match(Op0, m_ZExtOrSExt(m_Value())) &&
5458 Op0->getOpcode() == Op1->getOpcode() &&
5459 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
5460 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) &&
5461 (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {
5462
5463 // Matched reduce.add(ext(mul(ext(A), ext(B)))
5464 // Note that the extend opcodes need to all match, or if A==B they will have
5465 // been converted to zext(mul(sext(A), sext(A))) as it is known positive,
5466 // which is equally fine.
5467 bool IsUnsigned = isa<ZExtInst>(Op0);
5468 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy);
5469 auto *MulType = VectorType::get(Op0->getType(), VectorTy);
5470
5471 InstructionCost ExtCost =
5472 TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType,
5474 InstructionCost MulCost =
5475 TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind);
5476 InstructionCost Ext2Cost =
5477 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType,
5479
5480 InstructionCost RedCost = TTI.getMulAccReductionCost(
5481 IsUnsigned, RdxDesc.getOpcode(), RdxDesc.getRecurrenceType(), ExtType,
5482 CostKind);
5483
5484 if (RedCost.isValid() &&
5485 RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost)
5486 return I == RetI ? RedCost : 0;
5487 } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) &&
5488 !TheLoop->isLoopInvariant(RedOp)) {
5489 // Matched reduce(ext(A))
5490 bool IsUnsigned = isa<ZExtInst>(RedOp);
5491 auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy);
5492 InstructionCost RedCost = TTI.getExtendedReductionCost(
5493 RdxDesc.getOpcode(), IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
5494 RdxDesc.getFastMathFlags(), CostKind);
5495
5496 InstructionCost ExtCost =
5497 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType,
5499 if (RedCost.isValid() && RedCost < BaseCost + ExtCost)
5500 return I == RetI ? RedCost : 0;
5501 } else if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
5502 match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) {
5503 if (match(Op0, m_ZExtOrSExt(m_Value())) &&
5504 Op0->getOpcode() == Op1->getOpcode() &&
5505 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) {
5506 bool IsUnsigned = isa<ZExtInst>(Op0);
5507 Type *Op0Ty = Op0->getOperand(0)->getType();
5508 Type *Op1Ty = Op1->getOperand(0)->getType();
5509 Type *LargestOpTy =
5510 Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty
5511 : Op0Ty;
5512 auto *ExtType = VectorType::get(LargestOpTy, VectorTy);
5513
5514 // Matched reduce.add(mul(ext(A), ext(B))), where the two ext may be of
5515 // different sizes. We take the largest type as the ext to reduce, and add
5516 // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))).
5517 InstructionCost ExtCost0 = TTI.getCastInstrCost(
5518 Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy),
5520 InstructionCost ExtCost1 = TTI.getCastInstrCost(
5521 Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy),
5523 InstructionCost MulCost =
5524 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
5525
5526 InstructionCost RedCost = TTI.getMulAccReductionCost(
5527 IsUnsigned, RdxDesc.getOpcode(), RdxDesc.getRecurrenceType(), ExtType,
5528 CostKind);
5529 InstructionCost ExtraExtCost = 0;
5530 if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) {
5531 Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1;
5532 ExtraExtCost = TTI.getCastInstrCost(
5533 ExtraExtOp->getOpcode(), ExtType,
5534 VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy),
5536 }
5537
5538 if (RedCost.isValid() &&
5539 (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost))
5540 return I == RetI ? RedCost : 0;
5541 } else if (!match(I, m_ZExtOrSExt(m_Value()))) {
5542 // Matched reduce.add(mul())
5543 InstructionCost MulCost =
5544 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
5545
5546 InstructionCost RedCost = TTI.getMulAccReductionCost(
5547 true, RdxDesc.getOpcode(), RdxDesc.getRecurrenceType(), VectorTy,
5548 CostKind);
5549
5550 if (RedCost.isValid() && RedCost < MulCost + BaseCost)
5551 return I == RetI ? RedCost : 0;
5552 }
5553 }
5554
5555 return I == RetI ? std::optional<InstructionCost>(BaseCost) : std::nullopt;
5556}
5557
5558InstructionCost
5559LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
5560 ElementCount VF) {
5561 // Calculate scalar cost only. Vectorization cost should be ready at this
5562 // moment.
5563 if (VF.isScalar()) {
5564 Type *ValTy = getLoadStoreType(I);
5566 const Align Alignment = getLoadStoreAlignment(I);
5567 unsigned AS = getLoadStoreAddressSpace(I);
5568
5569 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
5570 return TTI.getAddressComputationCost(PtrTy, nullptr, nullptr, CostKind) +
5571 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, CostKind,
5572 OpInfo, I);
5573 }
5574 return getWideningCost(I, VF);
5575}
5576
5577InstructionCost
5578LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
5579 ElementCount VF) const {
5580
5581 // There is no mechanism yet to create a scalable scalarization loop,
5582 // so this is currently Invalid.
5583 if (VF.isScalable())
5584 return InstructionCost::getInvalid();
5585
5586 if (VF.isScalar())
5587 return 0;
5588
5590 Type *RetTy = toVectorizedTy(I->getType(), VF);
5591 if (!RetTy->isVoidTy() &&
5593
5594 for (Type *VectorTy : getContainedTypes(RetTy)) {
5597 /*Insert=*/true,
5598 /*Extract=*/false, CostKind);
5599 }
5600 }
5601
5602 // Some targets keep addresses scalar.
5604 return Cost;
5605
5606 // Some targets support efficient element stores.
5608 return Cost;
5609
5610 // Collect operands to consider.
5611 CallInst *CI = dyn_cast<CallInst>(I);
5612 Instruction::op_range Ops = CI ? CI->args() : I->operands();
5613
5614 // Skip operands that do not require extraction/scalarization and do not incur
5615 // any overhead.
5617 for (auto *V : filterExtractingOperands(Ops, VF))
5618 Tys.push_back(maybeVectorizeType(V->getType(), VF));
5620}
5621
5622void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
5623 if (VF.isScalar())
5624 return;
5625 NumPredStores = 0;
5626 for (BasicBlock *BB : TheLoop->blocks()) {
5627 // For each instruction in the old loop.
5628 for (Instruction &I : *BB) {
5630 if (!Ptr)
5631 continue;
5632
5633 // TODO: We should generate better code and update the cost model for
5634 // predicated uniform stores. Today they are treated as any other
5635 // predicated store (see added test cases in
5636 // invariant-store-vectorization.ll).
5638 NumPredStores++;
5639
5640 if (Legal->isUniformMemOp(I, VF)) {
5641 auto IsLegalToScalarize = [&]() {
5642 if (!VF.isScalable())
5643 // Scalarization of fixed length vectors "just works".
5644 return true;
5645
5646 // We have dedicated lowering for unpredicated uniform loads and
5647 // stores. Note that even with tail folding we know that at least
5648 // one lane is active (i.e. generalized predication is not possible
5649 // here), and the logic below depends on this fact.
5650 if (!foldTailByMasking())
5651 return true;
5652
5653 // For scalable vectors, a uniform memop load is always
5654 // uniform-by-parts and we know how to scalarize that.
5655 if (isa<LoadInst>(I))
5656 return true;
5657
5658 // A uniform store isn't necessarily uniform-by-parts
5659 // and we can't assume scalarization.
5660 auto &SI = cast<StoreInst>(I);
5661 return TheLoop->isLoopInvariant(SI.getValueOperand());
5662 };
5663
5664 const InstructionCost GatherScatterCost =
5665 isLegalGatherOrScatter(&I, VF) ?
5666 getGatherScatterCost(&I, VF) : InstructionCost::getInvalid();
5667
5668 // Load: Scalar load + broadcast
5669 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
5670 // FIXME: This cost is a significant under-estimate for tail folded
5671 // memory ops.
5672 const InstructionCost ScalarizationCost =
5673 IsLegalToScalarize() ? getUniformMemOpCost(&I, VF)
5675
5676 // Choose the better solution for the current VF. Note that Invalid
5677 // costs compare as maximally large. If both are invalid, the result is
5678 // an invalid cost, which signals a failure and a vectorization abort.
5679 if (GatherScatterCost < ScalarizationCost)
5680 setWideningDecision(&I, VF, CM_GatherScatter, GatherScatterCost);
5681 else
5682 setWideningDecision(&I, VF, CM_Scalarize, ScalarizationCost);
5683 continue;
5684 }
5685
5686 // We assume that widening is the best solution when possible.
5687 if (memoryInstructionCanBeWidened(&I, VF)) {
5688 InstructionCost Cost = getConsecutiveMemOpCost(&I, VF);
5689 int ConsecutiveStride = Legal->isConsecutivePtr(
5690 getLoadStoreType(&I), getLoadStorePointerOperand(&I));
5691 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
5692 "Expected consecutive stride.");
5693 InstWidening Decision =
5694 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
5695 setWideningDecision(&I, VF, Decision, Cost);
5696 continue;
5697 }
5698
5699 // Choose between Interleaving, Gather/Scatter or Scalarization.
5700 InstructionCost InterleaveCost = InstructionCost::getInvalid();
5701 unsigned NumAccesses = 1;
5702 if (isAccessInterleaved(&I)) {
5703 const auto *Group = getInterleavedAccessGroup(&I);
5704 assert(Group && "Fail to get an interleaved access group.");
5705
5706 // Make one decision for the whole group.
5707 if (getWideningDecision(&I, VF) != CM_Unknown)
5708 continue;
5709
5710 NumAccesses = Group->getNumMembers();
5712 InterleaveCost = getInterleaveGroupCost(&I, VF);
5713 }
5714
5715 InstructionCost GatherScatterCost =
5716 isLegalGatherOrScatter(&I, VF)
5717 ? getGatherScatterCost(&I, VF) * NumAccesses
5718 : InstructionCost::getInvalid();
5719
5720 InstructionCost ScalarizationCost =
5721 getMemInstScalarizationCost(&I, VF) * NumAccesses;
5722
5723 // Choose better solution for the current VF,
5724 // write down this decision and use it during vectorization.
5725 InstructionCost Cost;
5726 InstWidening Decision;
5727 if (InterleaveCost <= GatherScatterCost &&
5728 InterleaveCost < ScalarizationCost) {
5729 Decision = CM_Interleave;
5730 Cost = InterleaveCost;
5731 } else if (GatherScatterCost < ScalarizationCost) {
5732 Decision = CM_GatherScatter;
5733 Cost = GatherScatterCost;
5734 } else {
5735 Decision = CM_Scalarize;
5736 Cost = ScalarizationCost;
5737 }
5738 // If the instruction belongs to an interleave group, the whole group
5739 // receives the same decision. The whole group receives the cost, but
5740 // the cost will actually be assigned to one instruction.
5741 if (const auto *Group = getInterleavedAccessGroup(&I)) {
5742 if (Decision == CM_Scalarize) {
5743 for (unsigned Idx = 0; Idx < Group->getFactor(); ++Idx) {
5744 if (auto *I = Group->getMember(Idx)) {
5745 setWideningDecision(I, VF, Decision,
5746 getMemInstScalarizationCost(I, VF));
5747 }
5748 }
5749 } else {
5750 setWideningDecision(Group, VF, Decision, Cost);
5751 }
5752 } else
5753 setWideningDecision(&I, VF, Decision, Cost);
5754 }
5755 }
5756
5757 // Make sure that any load of address and any other address computation
5758 // remains scalar unless there is gather/scatter support. This avoids
5759 // inevitable extracts into address registers, and also has the benefit of
5760 // activating LSR more, since that pass can't optimize vectorized
5761 // addresses.
5762 if (TTI.prefersVectorizedAddressing())
5763 return;
5764
5765 // Start with all scalar pointer uses.
5767 for (BasicBlock *BB : TheLoop->blocks())
5768 for (Instruction &I : *BB) {
5769 Instruction *PtrDef =
5770 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
5771 if (PtrDef && TheLoop->contains(PtrDef) &&
5772 getWideningDecision(&I, VF) != CM_GatherScatter)
5773 AddrDefs.insert(PtrDef);
5774 }
5775
5776 // Add all instructions used to generate the addresses.
5778 append_range(Worklist, AddrDefs);
5779 while (!Worklist.empty()) {
5780 Instruction *I = Worklist.pop_back_val();
5781 for (auto &Op : I->operands())
5782 if (auto *InstOp = dyn_cast<Instruction>(Op))
5783 if (TheLoop->contains(InstOp) && !isa<PHINode>(InstOp) &&
5784 AddrDefs.insert(InstOp).second)
5785 Worklist.push_back(InstOp);
5786 }
5787
5788 auto UpdateMemOpUserCost = [this, VF](LoadInst *LI) {
5789 // If there are direct memory op users of the newly scalarized load,
5790 // their cost may have changed because there's no scalarization
5791 // overhead for the operand. Update it.
5792 for (User *U : LI->users()) {
5794 continue;
5796 continue;
5799 getMemInstScalarizationCost(cast<Instruction>(U), VF));
5800 }
5801 };
5802 for (auto *I : AddrDefs) {
5803 if (isa<LoadInst>(I)) {
5804 // Setting the desired widening decision should ideally be handled by
5805 // cost functions, but since this involves the task of finding out
5806 // if the loaded register is involved in an address computation, it is
5807 // instead changed here when we know this is the case.
5808 InstWidening Decision = getWideningDecision(I, VF);
5809 if (Decision == CM_Widen || Decision == CM_Widen_Reverse ||
5810 (!isPredicatedInst(I) && !Legal->isUniformMemOp(*I, VF) &&
5811 Decision == CM_Scalarize)) {
5812 // Scalarize a widened load of address or update the cost of a scalar
5813 // load of an address.
5814 setWideningDecision(
5815 I, VF, CM_Scalarize,
5816 (VF.getKnownMinValue() *
5817 getMemoryInstructionCost(I, ElementCount::getFixed(1))));
5818 UpdateMemOpUserCost(cast<LoadInst>(I));
5819 } else if (const auto *Group = getInterleavedAccessGroup(I)) {
5820 // Scalarize all members of this interleaved group when any member
5821 // is used as an address. The address-used load skips scalarization
5822 // overhead, other members include it.
5823 for (unsigned Idx = 0; Idx < Group->getFactor(); ++Idx) {
5824 if (Instruction *Member = Group->getMember(Idx)) {
5826 AddrDefs.contains(Member)
5827 ? (VF.getKnownMinValue() *
5828 getMemoryInstructionCost(Member,
5830 : getMemInstScalarizationCost(Member, VF);
5832 UpdateMemOpUserCost(cast<LoadInst>(Member));
5833 }
5834 }
5835 }
5836 } else {
5837 // Cannot scalarize fixed-order recurrence phis at the moment.
5838 if (isa<PHINode>(I) && Legal->isFixedOrderRecurrence(cast<PHINode>(I)))
5839 continue;
5840
5841 // Make sure I gets scalarized and a cost estimate without
5842 // scalarization overhead.
5843 ForcedScalars[VF].insert(I);
5844 }
5845 }
5846}
5847
5848void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
5849 assert(!VF.isScalar() &&
5850 "Trying to set a vectorization decision for a scalar VF");
5851
5852 auto ForcedScalar = ForcedScalars.find(VF);
5853 for (BasicBlock *BB : TheLoop->blocks()) {
5854 // For each instruction in the old loop.
5855 for (Instruction &I : *BB) {
5857
5858 if (!CI)
5859 continue;
5860
5864 Function *ScalarFunc = CI->getCalledFunction();
5865 Type *ScalarRetTy = CI->getType();
5866 SmallVector<Type *, 4> Tys, ScalarTys;
5867 for (auto &ArgOp : CI->args())
5868 ScalarTys.push_back(ArgOp->getType());
5869
5870 // Estimate cost of scalarized vector call. The source operands are
5871 // assumed to be vectors, so we need to extract individual elements from
5872 // there, execute VF scalar calls, and then gather the result into the
5873 // vector return value.
5874 if (VF.isFixed()) {
5875 InstructionCost ScalarCallCost =
5876 TTI.getCallInstrCost(ScalarFunc, ScalarRetTy, ScalarTys, CostKind);
5877
5878 // Compute costs of unpacking argument values for the scalar calls and
5879 // packing the return values to a vector.
5880 InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF);
5881 ScalarCost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
5882 } else {
5883 // There is no point attempting to calculate the scalar cost for a
5884 // scalable VF as we know it will be Invalid.
5886 "Unexpected valid cost for scalarizing scalable vectors");
5887 ScalarCost = InstructionCost::getInvalid();
5888 }
5889
5890 // Honor ForcedScalars and UniformAfterVectorization decisions.
5891 // TODO: For calls, it might still be more profitable to widen. Use
5892 // VPlan-based cost model to compare different options.
5893 if (VF.isVector() && ((ForcedScalar != ForcedScalars.end() &&
5894 ForcedScalar->second.contains(CI)) ||
5895 isUniformAfterVectorization(CI, VF))) {
5896 setCallWideningDecision(CI, VF, CM_Scalarize, nullptr,
5897 Intrinsic::not_intrinsic, std::nullopt,
5898 ScalarCost);
5899 continue;
5900 }
5901
5902 bool MaskRequired = Legal->isMaskRequired(CI);
5903 // Compute corresponding vector type for return value and arguments.
5904 Type *RetTy = toVectorizedTy(ScalarRetTy, VF);
5905 for (Type *ScalarTy : ScalarTys)
5906 Tys.push_back(toVectorizedTy(ScalarTy, VF));
5907
5908 // An in-loop reduction using an fmuladd intrinsic is a special case;
5909 // we don't want the normal cost for that intrinsic.
5911 if (auto RedCost = getReductionPatternCost(CI, VF, RetTy)) {
5914 std::nullopt, *RedCost);
5915 continue;
5916 }
5917
5918 // Find the cost of vectorizing the call, if we can find a suitable
5919 // vector variant of the function.
5920 VFInfo FuncInfo;
5921 Function *VecFunc = nullptr;
5922 // Search through any available variants for one we can use at this VF.
5923 for (VFInfo &Info : VFDatabase::getMappings(*CI)) {
5924 // Must match requested VF.
5925 if (Info.Shape.VF != VF)
5926 continue;
5927
5928 // Must take a mask argument if one is required
5929 if (MaskRequired && !Info.isMasked())
5930 continue;
5931
5932 // Check that all parameter kinds are supported
5933 bool ParamsOk = true;
5934 for (VFParameter Param : Info.Shape.Parameters) {
5935 switch (Param.ParamKind) {
5937 break;
5939 Value *ScalarParam = CI->getArgOperand(Param.ParamPos);
5940 // Make sure the scalar parameter in the loop is invariant.
5941 if (!PSE.getSE()->isLoopInvariant(PSE.getSCEV(ScalarParam),
5942 TheLoop))
5943 ParamsOk = false;
5944 break;
5945 }
5947 Value *ScalarParam = CI->getArgOperand(Param.ParamPos);
5948 // Find the stride for the scalar parameter in this loop and see if
5949 // it matches the stride for the variant.
5950 // TODO: do we need to figure out the cost of an extract to get the
5951 // first lane? Or do we hope that it will be folded away?
5952 ScalarEvolution *SE = PSE.getSE();
5953 if (!match(SE->getSCEV(ScalarParam),
5955 m_SCEV(), m_scev_SpecificSInt(Param.LinearStepOrPos),
5957 ParamsOk = false;
5958 break;
5959 }
5961 break;
5962 default:
5963 ParamsOk = false;
5964 break;
5965 }
5966 }
5967
5968 if (!ParamsOk)
5969 continue;
5970
5971 // Found a suitable candidate, stop here.
5972 VecFunc = CI->getModule()->getFunction(Info.VectorName);
5973 FuncInfo = Info;
5974 break;
5975 }
5976
5977 if (TLI && VecFunc && !CI->isNoBuiltin())
5978 VectorCost = TTI.getCallInstrCost(nullptr, RetTy, Tys, CostKind);
5979
5980 // Find the cost of an intrinsic; some targets may have instructions that
5981 // perform the operation without needing an actual call.
5982 Intrinsic::ID IID = getVectorIntrinsicIDForCall(CI, TLI);
5983 if (IID != Intrinsic::not_intrinsic)
5984 IntrinsicCost = getVectorIntrinsicCost(CI, VF);
5985
5986 InstructionCost Cost = ScalarCost;
5987 InstWidening Decision = CM_Scalarize;
5988
5989 if (VectorCost <= Cost) {
5990 Cost = VectorCost;
5991 Decision = CM_VectorCall;
5992 }
5993
5994 if (IntrinsicCost <= Cost) {
5996 Decision = CM_IntrinsicCall;
5997 }
5998
5999 setCallWideningDecision(CI, VF, Decision, VecFunc, IID,
6001 }
6002 }
6003}
6004
6005bool LoopVectorizationCostModel::shouldConsiderInvariant(Value *Op) {
6006 if (!Legal->isInvariant(Op))
6007 return false;
6008 // Consider Op invariant if neither it nor its operands are predicated
6009 // instructions in the loop; otherwise it is not trivially hoistable.
6010 auto *OpI = dyn_cast<Instruction>(Op);
6011 return !OpI || !TheLoop->contains(OpI) ||
6012 (!isPredicatedInst(OpI) &&
6013 (!isa<PHINode>(OpI) || OpI->getParent() != TheLoop->getHeader()) &&
6014 all_of(OpI->operands(),
6015 [this](Value *Op) { return shouldConsiderInvariant(Op); }));
6016}
6017
6020 ElementCount VF) {
6021 // If we know that this instruction will remain uniform, check the cost of
6022 // the scalar version.
6024 VF = ElementCount::getFixed(1);
6025
6026 if (VF.isVector() && isProfitableToScalarize(I, VF))
6027 return InstsToScalarize[VF][I];
6028
6029 // Forced scalars do not have any scalarization overhead.
6030 auto ForcedScalar = ForcedScalars.find(VF);
6031 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
6032 auto InstSet = ForcedScalar->second;
6033 if (InstSet.count(I))
6035 VF.getKnownMinValue();
6036 }
6037
6038 Type *RetTy = I->getType();
6039 if (canTruncateToMinimalBitwidth(I, VF))
6040 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
6041 auto *SE = PSE.getSE();
6042
6043 Type *VectorTy;
6044 if (isScalarAfterVectorization(I, VF)) {
6045 [[maybe_unused]] auto HasSingleCopyAfterVectorization =
6046 [this](Instruction *I, ElementCount VF) -> bool {
6047 if (VF.isScalar())
6048 return true;
6049
6050 auto Scalarized = InstsToScalarize.find(VF);
6051 assert(Scalarized != InstsToScalarize.end() &&
6052 "VF not yet analyzed for scalarization profitability");
6053 return !Scalarized->second.count(I) &&
6054 llvm::all_of(I->users(), [&](User *U) {
6055 auto *UI = cast<Instruction>(U);
6056 return !Scalarized->second.count(UI);
6057 });
6058 };
6059
6060 // With the exception of GEPs and PHIs, after scalarization there should
6061 // only be one copy of the instruction generated in the loop. This is
6062 // because the VF is either 1, or any instructions that need scalarizing
6063 // have already been dealt with by the time we get here. As a result,
6064 // it means we don't have to multiply the instruction cost by VF.
6065 assert(I->getOpcode() == Instruction::GetElementPtr ||
6066 I->getOpcode() == Instruction::PHI ||
6067 (I->getOpcode() == Instruction::BitCast &&
6068 I->getType()->isPointerTy()) ||
6069 HasSingleCopyAfterVectorization(I, VF));
6070 VectorTy = RetTy;
6071 } else
6072 VectorTy = toVectorizedTy(RetTy, VF);
6073
6074 if (VF.isVector() && VectorTy->isVectorTy() &&
6075 !TTI.getNumberOfParts(VectorTy))
6077
6078 // TODO: We need to estimate the cost of intrinsic calls.
6079 switch (I->getOpcode()) {
6080 case Instruction::GetElementPtr:
6081 // We mark this instruction as zero-cost because the cost of GEPs in
6082 // vectorized code depends on whether the corresponding memory instruction
6083 // is scalarized or not. Therefore, we handle GEPs with the memory
6084 // instruction cost.
6085 return 0;
6086 case Instruction::Br: {
6087 // In cases of scalarized and predicated instructions, there will be VF
6088 // predicated blocks in the vectorized loop. Each branch around these
6089 // blocks requires also an extract of its vector compare i1 element.
6090 // Note that the conditional branch from the loop latch will be replaced by
6091 // a single branch controlling the loop, so there is no extra overhead from
6092 // scalarization.
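// For example (illustrative): with VF = 4, each such block is charged four
// extracts of the i1 condition plus four branches in the
// ScalarPredicatedBB case below.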
6093 bool ScalarPredicatedBB = false;
6094 BranchInst *BI = cast<BranchInst>(I);
6095 if (VF.isVector() && BI->isConditional() &&
6096 (PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(0)) ||
6097 PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(1))) &&
6098 BI->getParent() != TheLoop->getLoopLatch())
6099 ScalarPredicatedBB = true;
6100
6101 if (ScalarPredicatedBB) {
6102 // Not possible to scalarize scalable vector with predicated instructions.
6103 if (VF.isScalable())
6104 return InstructionCost::getInvalid();
6105 // Return cost for branches around scalarized and predicated blocks.
6106 auto *VecI1Ty =
6107 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
6108 return (
6109 TTI.getScalarizationOverhead(
6110 VecI1Ty, APInt::getAllOnes(VF.getFixedValue()),
6111 /*Insert*/ false, /*Extract*/ true, CostKind) +
6112 (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue()));
6113 }
6114
6115 if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
6116 // The back-edge branch will remain, as will all scalar branches.
6117 return TTI.getCFInstrCost(Instruction::Br, CostKind);
6118
6119 // This branch will be eliminated by if-conversion.
6120 return 0;
6121 // Note: We currently assume zero cost for an unconditional branch inside
6122 // a predicated block since it will become a fall-through, although we
6123 // may decide in the future to call TTI for all branches.
6124 }
6125 case Instruction::Switch: {
6126 if (VF.isScalar())
6127 return TTI.getCFInstrCost(Instruction::Switch, CostKind);
6128 auto *Switch = cast<SwitchInst>(I);
6129 return Switch->getNumCases() *
6130 TTI.getCmpSelInstrCost(
6131 Instruction::ICmp,
6132 toVectorTy(Switch->getCondition()->getType(), VF),
6133 toVectorTy(Type::getInt1Ty(I->getContext()), VF),
6134 CmpInst::ICMP_EQ, CostKind);
6135 }
6136 case Instruction::PHI: {
6137 auto *Phi = cast<PHINode>(I);
6138
6139 // First-order recurrences are replaced by vector shuffles inside the loop.
6140 if (VF.isVector() && Legal->isFixedOrderRecurrence(Phi)) {
6141 SmallVector<int> Mask(VF.getKnownMinValue());
6142 std::iota(Mask.begin(), Mask.end(), VF.getKnownMinValue() - 1);
6143 return TTI.getShuffleCost(TargetTransformInfo::SK_Splice,
6144 cast<VectorType>(VectorTy),
6145 cast<VectorType>(VectorTy), Mask, CostKind,
6146 VF.getKnownMinValue() - 1);
6147 }
6148
6149 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
6150 // converted into select instructions. We require N - 1 selects per phi
6151 // node, where N is the number of incoming values.
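// Illustrative example: an if-converted phi merging three incoming values
// %a, %b and %c becomes two vector selects, e.g.
//   %s1 = select <VF x i1> %m1, <VF x ty> %a, <VF x ty> %b
//   %s2 = select <VF x i1> %m2, <VF x ty> %s1, <VF x ty> %c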
6152 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) {
6153 Type *ResultTy = Phi->getType();
6154
6155 // All instructions in an Any-of reduction chain are narrowed to bool.
6156 // Check if that is the case for this phi node.
6157 auto *HeaderUser = cast_if_present<PHINode>(
6158 find_singleton<User>(Phi->users(), [this](User *U, bool) -> User * {
6159 auto *Phi = dyn_cast<PHINode>(U);
6160 if (Phi && Phi->getParent() == TheLoop->getHeader())
6161 return Phi;
6162 return nullptr;
6163 }));
6164 if (HeaderUser) {
6165 auto &ReductionVars = Legal->getReductionVars();
6166 auto Iter = ReductionVars.find(HeaderUser);
6167 if (Iter != ReductionVars.end() &&
6168 RecurrenceDescriptor::isAnyOfRecurrenceKind(
6169 Iter->second.getRecurrenceKind()))
6170 ResultTy = Type::getInt1Ty(Phi->getContext());
6171 }
6172 return (Phi->getNumIncomingValues() - 1) *
6173 TTI.getCmpSelInstrCost(
6174 Instruction::Select, toVectorTy(ResultTy, VF),
6175 toVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
6176 CmpInst::BAD_ICMP_PREDICATE, CostKind);
6177 }
6178
6179 // When tail folding with EVL, if the phi is part of an out of loop
6180 // reduction then it will be transformed into a wide vp_merge.
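// Illustrative example: for an out-of-loop reduction phi %red, EVL-based
// tail folding produces something like
//   %red.next = call <VF x ty> @llvm.vp.merge(<VF x i1> %mask,
//                                             <VF x ty> %new, <VF x ty> %red,
//                                             i32 %evl)
// which is why the cost of a vp_merge intrinsic is queried below.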
6181 if (VF.isVector() && foldTailWithEVL() &&
6182 Legal->getReductionVars().contains(Phi) && !isInLoopReduction(Phi)) {
6183 IntrinsicCostAttributes ICA(
6184 Intrinsic::vp_merge, toVectorTy(Phi->getType(), VF),
6185 {toVectorTy(Type::getInt1Ty(Phi->getContext()), VF)});
6186 return TTI.getIntrinsicInstrCost(ICA, CostKind);
6187 }
6188
6189 return TTI.getCFInstrCost(Instruction::PHI, CostKind);
6190 }
6191 case Instruction::UDiv:
6192 case Instruction::SDiv:
6193 case Instruction::URem:
6194 case Instruction::SRem:
6195 if (VF.isVector() && isPredicatedInst(I)) {
6196 const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
6197 return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost) ?
6198 ScalarCost : SafeDivisorCost;
6199 }
6200 // We've proven all lanes safe to speculate, fall through.
6201 [[fallthrough]];
6202 case Instruction::Add:
6203 case Instruction::Sub: {
6204 auto Info = Legal->getHistogramInfo(I);
6205 if (Info && VF.isVector()) {
6206 const HistogramInfo *HGram = Info.value();
6207 // Assume that a non-constant update value (or a constant != 1) requires
6208 // a multiply, and add that into the cost.
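// Illustrative example: for a histogram update 'buckets[idx[i]] += 2' the
// increment is not 1, so a multiply of the per-lane bucket increments is
// assumed in addition to the histogram intrinsic and the add/sub itself.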
6209 InstructionCost MulCost = TTI::TCC_Free;
6210 ConstantInt *RHS = dyn_cast<ConstantInt>(I->getOperand(1));
6211 if (!RHS || RHS->getZExtValue() != 1)
6212 MulCost =
6213 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6214
6215 // Find the cost of the histogram operation itself.
6216 Type *PtrTy = VectorType::get(HGram->Load->getPointerOperandType(), VF);
6217 Type *ScalarTy = I->getType();
6218 Type *MaskTy = VectorType::get(Type::getInt1Ty(I->getContext()), VF);
6219 IntrinsicCostAttributes ICA(Intrinsic::experimental_vector_histogram_add,
6220 Type::getVoidTy(I->getContext()),
6221 {PtrTy, ScalarTy, MaskTy});
6222
6223 // Add the costs together with the add/sub operation.
6224 return TTI.getIntrinsicInstrCost(ICA, CostKind) + MulCost +
6225 TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, CostKind);
6226 }
6227 [[fallthrough]];
6228 }
6229 case Instruction::FAdd:
6230 case Instruction::FSub:
6231 case Instruction::Mul:
6232 case Instruction::FMul:
6233 case Instruction::FDiv:
6234 case Instruction::FRem:
6235 case Instruction::Shl:
6236 case Instruction::LShr:
6237 case Instruction::AShr:
6238 case Instruction::And:
6239 case Instruction::Or:
6240 case Instruction::Xor: {
6241 // If we're speculating on the stride being 1, the multiplication may
6242 // fold away. We can generalize this for all operations using the notion
6243 // of neutral elements. (TODO)
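// Illustrative example: for 'offset = i * Stride' under a SCEV predicate
// that assumes Stride == 1, the multiply is treated as free below because
// it folds away to 'i' once the predicate holds.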
6244 if (I->getOpcode() == Instruction::Mul &&
6245 ((TheLoop->isLoopInvariant(I->getOperand(0)) &&
6246 PSE.getSCEV(I->getOperand(0))->isOne()) ||
6247 (TheLoop->isLoopInvariant(I->getOperand(1)) &&
6248 PSE.getSCEV(I->getOperand(1))->isOne())))
6249 return 0;
6250
6251 // Detect reduction patterns
6252 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy))
6253 return *RedCost;
6254
6255 // Certain instructions can be cheaper to vectorize if they have a constant
6256 // second vector operand. One example of this is shifts on x86.
6257 Value *Op2 = I->getOperand(1);
6258 if (!isa<Constant>(Op2) && TheLoop->isLoopInvariant(Op2) &&
6259 PSE.getSE()->isSCEVable(Op2->getType()) &&
6260 isa<SCEVConstant>(PSE.getSCEV(Op2))) {
6261 Op2 = cast<SCEVConstant>(PSE.getSCEV(Op2))->getValue();
6262 }
6263 auto Op2Info = TTI.getOperandInfo(Op2);
6264 if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
6265 shouldConsiderInvariant(Op2))
6266 Op2Info.Kind = TargetTransformInfo::OK_UniformValue;
6267
6268 SmallVector<const Value *, 4> Operands(I->operand_values());
6269 return TTI.getArithmeticInstrCost(
6270 I->getOpcode(), VectorTy, CostKind,
6271 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
6272 Op2Info, Operands, I, TLI);
6273 }
6274 case Instruction::FNeg: {
6275 return TTI.getArithmeticInstrCost(
6276 I->getOpcode(), VectorTy, CostKind,
6277 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
6278 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
6279 I->getOperand(0), I);
6280 }
6281 case Instruction::Select: {
6282 SelectInst *SI = cast<SelectInst>(I);
6283 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
6284 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
6285
6286 const Value *Op0, *Op1;
6287 using namespace llvm::PatternMatch;
6288 if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) ||
6289 match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) {
6290 // select x, y, false --> x & y
6291 // select x, true, y --> x | y
6292 const auto [Op1VK, Op1VP] = TTI::getOperandInfo(Op0);
6293 const auto [Op2VK, Op2VP] = TTI::getOperandInfo(Op1);
6294 assert(Op0->getType()->getScalarSizeInBits() == 1 &&
6295 Op1->getType()->getScalarSizeInBits() == 1);
6296
6297 return TTI.getArithmeticInstrCost(
6298 match(I, m_LogicalOr()) ? Instruction::Or : Instruction::And,
6299 VectorTy, CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, {Op0, Op1}, I);
6300 }
6301
6302 Type *CondTy = SI->getCondition()->getType();
6303 if (!ScalarCond)
6304 CondTy = VectorType::get(CondTy, VF);
6305
6306 CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
6307 if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition()))
6308 Pred = Cmp->getPredicate();
6309 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred,
6310 CostKind, {TTI::OK_AnyValue, TTI::OP_None},
6311 {TTI::OK_AnyValue, TTI::OP_None}, I);
6312 }
6313 case Instruction::ICmp:
6314 case Instruction::FCmp: {
6315 Type *ValTy = I->getOperand(0)->getType();
6316
6317 if (canTruncateToMinimalBitwidth(I, VF)) {
6318 [[maybe_unused]] Instruction *Op0AsInstruction =
6319 dyn_cast<Instruction>(I->getOperand(0));
6320 assert((!canTruncateToMinimalBitwidth(Op0AsInstruction, VF) ||
6321 MinBWs[I] == MinBWs[Op0AsInstruction]) &&
6322 "if both the operand and the compare are marked for "
6323 "truncation, they must have the same bitwidth");
6324 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[I]);
6325 }
6326
6327 VectorTy = toVectorTy(ValTy, VF);
6328 return TTI.getCmpSelInstrCost(
6329 I->getOpcode(), VectorTy, CmpInst::makeCmpResultType(VectorTy),
6330 cast<CmpInst>(I)->getPredicate(), CostKind,
6331 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None}, I);
6332 }
6333 case Instruction::Store:
6334 case Instruction::Load: {
6335 ElementCount Width = VF;
6336 if (Width.isVector()) {
6337 InstWidening Decision = getWideningDecision(I, Width);
6338 assert(Decision != CM_Unknown &&
6339 "CM decision should be taken at this point");
6340 if (getWideningCost(I, VF) == InstructionCost::getInvalid())
6341 return InstructionCost::getInvalid();
6342 if (Decision == CM_Scalarize)
6343 Width = ElementCount::getFixed(1);
6344 }
6345 VectorTy = toVectorTy(getLoadStoreType(I), Width);
6346 return getMemoryInstructionCost(I, VF);
6347 }
6348 case Instruction::BitCast:
6349 if (I->getType()->isPointerTy())
6350 return 0;
6351 [[fallthrough]];
6352 case Instruction::ZExt:
6353 case Instruction::SExt:
6354 case Instruction::FPToUI:
6355 case Instruction::FPToSI:
6356 case Instruction::FPExt:
6357 case Instruction::PtrToInt:
6358 case Instruction::IntToPtr:
6359 case Instruction::SIToFP:
6360 case Instruction::UIToFP:
6361 case Instruction::Trunc:
6362 case Instruction::FPTrunc: {
6363 // Computes the CastContextHint from a Load/Store instruction.
6364 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
6366 "Expected a load or a store!");
6367
6368 if (VF.isScalar() || !TheLoop->contains(I))
6369 return TTI::CastContextHint::Normal;
6370
6371 switch (getWideningDecision(I, VF)) {
6372 case LoopVectorizationCostModel::CM_GatherScatter:
6373 return TTI::CastContextHint::GatherScatter;
6374 case LoopVectorizationCostModel::CM_Interleave:
6375 return TTI::CastContextHint::Interleave;
6376 case LoopVectorizationCostModel::CM_Scalarize:
6377 case LoopVectorizationCostModel::CM_Widen:
6378 return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked
6379 : TTI::CastContextHint::Normal;
6380 case LoopVectorizationCostModel::CM_Widen_Reverse:
6381 return TTI::CastContextHint::Reversed;
6382 case LoopVectorizationCostModel::CM_Unknown:
6383 llvm_unreachable("Instr did not go through cost modelling?");
6384 case LoopVectorizationCostModel::CM_VectorCall:
6385 case LoopVectorizationCostModel::CM_IntrinsicCall:
6386 llvm_unreachable_internal("Instr has invalid widening decision");
6387 }
6388
6389 llvm_unreachable("Unhandled case!");
6390 };
6391
6392 unsigned Opcode = I->getOpcode();
6393 TTI::CastContextHint CCH = TTI::CastContextHint::None;
6394 // For Trunc, the context is the only user, which must be a StoreInst.
6395 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
6396 if (I->hasOneUse())
6397 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin()))
6398 CCH = ComputeCCH(Store);
6399 }
6400 // For Z/Sext, the context is the operand, which must be a LoadInst.
6401 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
6402 Opcode == Instruction::FPExt) {
6403 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0)))
6404 CCH = ComputeCCH(Load);
6405 }
6406
6407 // We optimize the truncation of induction variables having constant
6408 // integer steps. The cost of these truncations is the same as the scalar
6409 // operation.
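// Illustrative example: 'trunc i64 %iv to i32' where %iv steps by a constant
// becomes a truncated (i32) induction rather than a widened trunc, so only
// the scalar trunc cost is charged below.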
6410 if (isOptimizableIVTruncate(I, VF)) {
6411 auto *Trunc = cast<TruncInst>(I);
6412 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
6413 Trunc->getSrcTy(), CCH, CostKind, Trunc);
6414 }
6415
6416 // Detect reduction patterns
6417 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy))
6418 return *RedCost;
6419
6420 Type *SrcScalarTy = I->getOperand(0)->getType();
6421 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
6422 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
6423 SrcScalarTy =
6424 IntegerType::get(SrcScalarTy->getContext(), MinBWs[Op0AsInstruction]);
6425 Type *SrcVecTy =
6426 VectorTy->isVectorTy() ? toVectorTy(SrcScalarTy, VF) : SrcScalarTy;
6427
6428 if (canTruncateToMinimalBitwidth(I, VF)) {
6429 // If the result type is <= the source type, there will be no extend
6430 // after truncating the users to the minimal required bitwidth.
6431 if (VectorTy->getScalarSizeInBits() <= SrcVecTy->getScalarSizeInBits() &&
6432 (I->getOpcode() == Instruction::ZExt ||
6433 I->getOpcode() == Instruction::SExt))
6434 return 0;
6435 }
6436
6437 return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
6438 }
6439 case Instruction::Call:
6440 return getVectorCallCost(cast<CallInst>(I), VF);
6441 case Instruction::ExtractValue:
6442 return TTI.getInstructionCost(I, CostKind);
6443 case Instruction::Alloca:
6444 // We cannot easily widen alloca to a scalable alloca, as
6445 // the result would need to be a vector of pointers.
6446 if (VF.isScalable())
6447 return InstructionCost::getInvalid();
6448 [[fallthrough]];
6449 default:
6450 // This opcode is unknown. Assume that it is the same as 'mul'.
6451 return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6452 } // end of switch.
6453}
6454
6455void LoopVectorizationCostModel::collectValuesToIgnore() {
6456 // Ignore ephemeral values.
6457 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
6458
6459 SmallVector<Value *, 4> DeadInterleavePointerOps;
6460 SmallVector<Value *, 4> DeadOps;
6461
6462 // If a scalar epilogue is required, users outside the loop won't use
6463 // live-outs from the vector loop but from the scalar epilogue. Ignore them if
6464 // that is the case.
6465 bool RequiresScalarEpilogue = requiresScalarEpilogue(true);
6466 auto IsLiveOutDead = [this, RequiresScalarEpilogue](User *U) {
6467 return RequiresScalarEpilogue &&
6468 !TheLoop->contains(cast<Instruction>(U)->getParent());
6469 };
6470
6471 LoopBlocksDFS DFS(TheLoop);
6472 DFS.perform(LI);
6473 for (BasicBlock *BB : reverse(make_range(DFS.beginRPO(), DFS.endRPO())))
6474 for (Instruction &I : reverse(*BB)) {
6475 if (VecValuesToIgnore.contains(&I) || ValuesToIgnore.contains(&I))
6476 continue;
6477
6478 // Add instructions that would be trivially dead and are only used by
6479 // values already ignored to DeadOps to seed worklist.
6480 if (wouldInstructionBeTriviallyDead(&I, TLI) &&
6481 all_of(I.users(), [this, IsLiveOutDead](User *U) {
6482 return VecValuesToIgnore.contains(U) ||
6483 ValuesToIgnore.contains(U) || IsLiveOutDead(U);
6484 }))
6485 DeadOps.push_back(&I);
6486
6487 // For interleave groups, we only create a pointer for the start of the
6488 // interleave group. Queue up addresses of group members except the insert
6489 // position for further processing.
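// Illustrative example: for a group loading A[2*i] and A[2*i+1], only the
// pointer at the insert position survives vectorization; the other member's
// address computation is queued here and may be marked as free below.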
6490 if (isAccessInterleaved(&I)) {
6491 auto *Group = getInterleavedAccessGroup(&I);
6492 if (Group->getInsertPos() == &I)
6493 continue;
6494 Value *PointerOp = getLoadStorePointerOperand(&I);
6495 DeadInterleavePointerOps.push_back(PointerOp);
6496 }
6497
6498 // Queue branches for analysis. They are dead, if their successors only
6499 // contain dead instructions.
6500 if (auto *Br = dyn_cast<BranchInst>(&I)) {
6501 if (Br->isConditional())
6502 DeadOps.push_back(&I);
6503 }
6504 }
6505
6506 // Mark ops feeding interleave group members as free, if they are only used
6507 // by other dead computations.
6508 for (unsigned I = 0; I != DeadInterleavePointerOps.size(); ++I) {
6509 auto *Op = dyn_cast<Instruction>(DeadInterleavePointerOps[I]);
6510 if (!Op || !TheLoop->contains(Op) || any_of(Op->users(), [this](User *U) {
6511 Instruction *UI = cast<Instruction>(U);
6512 return !VecValuesToIgnore.contains(U) &&
6513 (!isAccessInterleaved(UI) ||
6514 getInterleavedAccessGroup(UI)->getInsertPos() == UI);
6515 }))
6516 continue;
6517 VecValuesToIgnore.insert(Op);
6518 append_range(DeadInterleavePointerOps, Op->operands());
6519 }
6520
6521 // Mark ops that would be trivially dead and are only used by ignored
6522 // instructions as free.
6523 BasicBlock *Header = TheLoop->getHeader();
6524
6525 // Returns true if the block contains only dead instructions. Such blocks will
6526 // be removed by VPlan-to-VPlan transforms and won't be considered by the
6527 // VPlan-based cost model, so skip them in the legacy cost-model as well.
6528 auto IsEmptyBlock = [this](BasicBlock *BB) {
6529 return all_of(*BB, [this](Instruction &I) {
6530 return ValuesToIgnore.contains(&I) || VecValuesToIgnore.contains(&I) ||
6531 (isa<BranchInst>(&I) && !cast<BranchInst>(&I)->isConditional());
6532 });
6533 };
6534 for (unsigned I = 0; I != DeadOps.size(); ++I) {
6535 auto *Op = dyn_cast<Instruction>(DeadOps[I]);
6536
6537 // Check if the branch should be considered dead.
6538 if (auto *Br = dyn_cast_or_null<BranchInst>(Op)) {
6539 BasicBlock *ThenBB = Br->getSuccessor(0);
6540 BasicBlock *ElseBB = Br->getSuccessor(1);
6541 // Don't consider branches leaving the loop for simplification.
6542 if (!TheLoop->contains(ThenBB) || !TheLoop->contains(ElseBB))
6543 continue;
6544 bool ThenEmpty = IsEmptyBlock(ThenBB);
6545 bool ElseEmpty = IsEmptyBlock(ElseBB);
6546 if ((ThenEmpty && ElseEmpty) ||
6547 (ThenEmpty && ThenBB->getSingleSuccessor() == ElseBB &&
6548 ElseBB->phis().empty()) ||
6549 (ElseEmpty && ElseBB->getSingleSuccessor() == ThenBB &&
6550 ThenBB->phis().empty())) {
6551 VecValuesToIgnore.insert(Br);
6552 DeadOps.push_back(Br->getCondition());
6553 }
6554 continue;
6555 }
6556
6557 // Skip any op that shouldn't be considered dead.
6558 if (!Op || !TheLoop->contains(Op) ||
6559 (isa<PHINode>(Op) && Op->getParent() == Header) ||
6560 !wouldInstructionBeTriviallyDead(Op, TLI) ||
6561 any_of(Op->users(), [this, IsLiveOutDead](User *U) {
6562 return !VecValuesToIgnore.contains(U) &&
6563 !ValuesToIgnore.contains(U) && !IsLiveOutDead(U);
6564 }))
6565 continue;
6566
6567 // If all of Op's users are in ValuesToIgnore, add it to ValuesToIgnore
6568 // which applies for both scalar and vector versions. Otherwise it is only
6569 // dead in vector versions, so only add it to VecValuesToIgnore.
6570 if (all_of(Op->users(),
6571 [this](User *U) { return ValuesToIgnore.contains(U); }))
6572 ValuesToIgnore.insert(Op);
6573
6574 VecValuesToIgnore.insert(Op);
6575 append_range(DeadOps, Op->operands());
6576 }
6577
6578 // Ignore type-promoting instructions we identified during reduction
6579 // detection.
6580 for (const auto &Reduction : Legal->getReductionVars()) {
6581 const RecurrenceDescriptor &RedDes = Reduction.second;
6582 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
6583 VecValuesToIgnore.insert_range(Casts);
6584 }
6585 // Ignore type-casting instructions we identified during induction
6586 // detection.
6587 for (const auto &Induction : Legal->getInductionVars()) {
6588 const InductionDescriptor &IndDes = Induction.second;
6589 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
6590 VecValuesToIgnore.insert_range(Casts);
6591 }
6592}
6593
6594void LoopVectorizationCostModel::collectInLoopReductions() {
6595 // Avoid duplicating work finding in-loop reductions.
6596 if (!InLoopReductions.empty())
6597 return;
6598
6599 for (const auto &Reduction : Legal->getReductionVars()) {
6600 PHINode *Phi = Reduction.first;
6601 const RecurrenceDescriptor &RdxDesc = Reduction.second;
6602
6603 // We don't collect reductions that are type promoted (yet).
6604 if (RdxDesc.getRecurrenceType() != Phi->getType())
6605 continue;
6606
6607 // In-loop AnyOf and FindIV reductions are not yet supported.
6608 RecurKind Kind = RdxDesc.getRecurrenceKind();
6609 if (RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) ||
6610 RecurrenceDescriptor::isFindIVRecurrenceKind(Kind))
6611 continue;
6612
6613 // If the target would prefer this reduction to happen "in-loop", then we
6614 // want to record it as such.
6615 if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) &&
6616 !TTI.preferInLoopReduction(Kind, Phi->getType()))
6617 continue;
6618
6619 // Check that we can correctly put the reductions into the loop, by
6620 // finding the chain of operations that leads from the phi to the loop
6621 // exit value.
6622 SmallVector<Instruction *, 4> ReductionOperations =
6623 RdxDesc.getReductionOpChain(Phi, TheLoop);
6624 bool InLoop = !ReductionOperations.empty();
6625
6626 if (InLoop) {
6627 InLoopReductions.insert(Phi);
6628 // Add the elements to InLoopReductionImmediateChains for cost modelling.
6629 Instruction *LastChain = Phi;
6630 for (auto *I : ReductionOperations) {
6631 InLoopReductionImmediateChains[I] = LastChain;
6632 LastChain = I;
6633 }
6634 }
6635 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
6636 << " reduction for phi: " << *Phi << "\n");
6637 }
6638}
6639
6640// This function will select a scalable VF if the target supports scalable
6641// vectors and a fixed one otherwise.
6642// TODO: we could return a pair of values that specify the max VF and
6643// min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
6644// `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment
6645// doesn't have a cost model that can choose which plan to execute if
6646// more than one is generated.
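// Illustrative example: with 128-bit vector registers and a widest element
// type of 32 bits, this returns a VF of 4 (or 'vscale x 4' when the target
// enables scalable vectorization).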
6647static ElementCount determineVPlanVF(const TargetTransformInfo &TTI,
6648 LoopVectorizationCostModel &CM) {
6649 unsigned WidestType;
6650 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
6651
6652 TargetTransformInfo::RegisterKind RegKind =
6653 TTI.enableScalableVectorization()
6654 ? TargetTransformInfo::RGK_ScalableVector
6655 : TargetTransformInfo::RGK_FixedWidthVector;
6656
6657 TypeSize RegSize = TTI.getRegisterBitWidth(RegKind);
6658 unsigned N = RegSize.getKnownMinValue() / WidestType;
6659 return ElementCount::get(N, RegSize.isScalable());
6660}
6661
6662VectorizationFactor
6663LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
6664 ElementCount VF = UserVF;
6665 // Outer loop handling: They may require CFG and instruction level
6666 // transformations before even evaluating whether vectorization is profitable.
6667 // Since we cannot modify the incoming IR, we need to build VPlan upfront in
6668 // the vectorization pipeline.
6669 if (!OrigLoop->isInnermost()) {
6670 // If the user doesn't provide a vectorization factor, determine a
6671 // reasonable one.
6672 if (UserVF.isZero()) {
6673 VF = determineVPlanVF(TTI, CM);
6674 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
6675
6676 // Make sure we have a VF > 1 for stress testing.
6677 if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
6678 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
6679 << "overriding computed VF.\n");
6680 VF = ElementCount::getFixed(4);
6681 }
6682 } else if (UserVF.isScalable() && !TTI.supportsScalableVectors() &&
6684 LLVM_DEBUG(dbgs() << "LV: Not vectorizing. Scalable VF requested, but "
6685 << "not supported by the target.\n");
6687 "Scalable vectorization requested but not supported by the target",
6688 "the scalable user-specified vectorization width for outer-loop "
6689 "vectorization cannot be used because the target does not support "
6690 "scalable vectors.",
6691 "ScalableVFUnfeasible", ORE, OrigLoop);
6692 return VectorizationFactor::Disabled();
6693 }
6694 assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
6696 "VF needs to be a power of two");
6697 LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
6698 << "VF " << VF << " to build VPlans.\n");
6699 buildVPlans(VF, VF);
6700
6701 if (VPlans.empty())
6702 return VectorizationFactor::Disabled();
6703
6704 // For VPlan build stress testing, we bail out after VPlan construction.
6705 if (VPlanBuildStressTest)
6706 return VectorizationFactor::Disabled();
6707
6708 return {VF, 0 /*Cost*/, 0 /* ScalarCost */};
6709 }
6710
6711 LLVM_DEBUG(
6712 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
6713 "VPlan-native path.\n");
6714 return VectorizationFactor::Disabled();
6715}
6716
6717void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
6718 assert(OrigLoop->isInnermost() && "Inner loop expected.");
6719 CM.collectValuesToIgnore();
6720 CM.collectElementTypesForWidening();
6721
6722 FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
6723 if (!MaxFactors) // Cases that should not be vectorized nor interleaved.
6724 return;
6725
6726 // Invalidate interleave groups if all blocks of loop will be predicated.
6727 if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) &&
6728 !useMaskedInterleavedAccesses(TTI)) {
6729 LLVM_DEBUG(
6730 dbgs()
6731 << "LV: Invalidate all interleaved groups due to fold-tail by masking "
6732 "which requires masked-interleaved support.\n");
6733 if (CM.InterleaveInfo.invalidateGroups())
6734 // Invalidating interleave groups also requires invalidating all decisions
6735 // based on them, which includes widening decisions and uniform and scalar
6736 // values.
6737 CM.invalidateCostModelingDecisions();
6738 }
6739
6740 if (CM.foldTailByMasking())
6741 Legal->prepareToFoldTailByMasking();
6742
6743 ElementCount MaxUserVF =
6744 UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
6745 if (UserVF) {
6746 if (!ElementCount::isKnownLE(UserVF, MaxUserVF)) {
6748 "UserVF ignored because it may be larger than the maximal safe VF",
6749 "InvalidUserVF", ORE, OrigLoop);
6750 } else {
6752 "VF needs to be a power of two");
6753 // Collect the instructions (and their associated costs) that will be more
6754 // profitable to scalarize.
6755 CM.collectInLoopReductions();
6756 if (CM.selectUserVectorizationFactor(UserVF)) {
6757 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
6758 buildVPlansWithVPRecipes(UserVF, UserVF);
6759 LLVM_DEBUG(printPlans(dbgs()));
6760 return;
6761 }
6762 reportVectorizationInfo("UserVF ignored because of invalid costs.",
6763 "InvalidCost", ORE, OrigLoop);
6764 }
6765 }
6766
6767 // Collect the Vectorization Factor Candidates.
6768 SmallVector<ElementCount> VFCandidates;
6769 for (auto VF = ElementCount::getFixed(1);
6770 ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2)
6771 VFCandidates.push_back(VF);
6772 for (auto VF = ElementCount::getScalable(1);
6773 ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2)
6774 VFCandidates.push_back(VF);
6775
6776 CM.collectInLoopReductions();
6777 for (const auto &VF : VFCandidates) {
6778 // Collect Uniform and Scalar instructions after vectorization with VF.
6779 CM.collectNonVectorizedAndSetWideningDecisions(VF);
6780 }
6781
6782 buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF);
6783 buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF);
6784
6785 LLVM_DEBUG(printPlans(dbgs()));
6786}
6787
6788InstructionCost VPCostContext::getLegacyCost(Instruction *UI,
6789 ElementCount VF) const {
6790 InstructionCost Cost = CM.getInstructionCost(UI, VF);
6791 if (Cost.isValid() && ForceTargetInstructionCost.getNumOccurrences())
6792 return InstructionCost(ForceTargetInstructionCost);
6793 return Cost;
6794}
6795
6796bool VPCostContext::isLegacyUniformAfterVectorization(Instruction *I,
6797 ElementCount VF) const {
6798 return CM.isUniformAfterVectorization(I, VF);
6799}
6800
6801bool VPCostContext::skipCostComputation(Instruction *UI, bool IsVector) const {
6802 return CM.ValuesToIgnore.contains(UI) ||
6803 (IsVector && CM.VecValuesToIgnore.contains(UI)) ||
6804 SkipCostComputation.contains(UI);
6805}
6806
6808 return CM.getPredBlockCostDivisor(CostKind, BB);
6809}
6810
6811InstructionCost
6812LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
6813 VPCostContext &CostCtx) const {
6814 InstructionCost Cost = 0;
6815 // Cost modeling for inductions is inaccurate in the legacy cost model
6816 // compared to the recipes that are generated. To match here initially during
6817 // VPlan cost model bring up directly use the induction costs from the legacy
6818 // cost model. Note that we do this as pre-processing; the VPlan may not have
6819 // any recipes associated with the original induction increment instruction
6820 // and may replace truncates with VPWidenIntOrFpInductionRecipe. We precompute
6821 // the cost of induction phis and increments (both that are represented by
6822 // recipes and those that are not), to avoid distinguishing between them here,
6823 // and skip all recipes that represent induction phis and increments (the
6824 // former case) later on, if they exist, to avoid counting them twice.
6825 // Similarly we pre-compute the cost of any optimized truncates.
6826 // TODO: Switch to more accurate costing based on VPlan.
6827 for (const auto &[IV, IndDesc] : Legal->getInductionVars()) {
6828 Instruction *IVInc = cast<Instruction>(
6829 IV->getIncomingValueForBlock(OrigLoop->getLoopLatch()));
6830 SmallVector<Instruction *> IVInsts = {IVInc};
6831 for (unsigned I = 0; I != IVInsts.size(); I++) {
6832 for (Value *Op : IVInsts[I]->operands()) {
6833 auto *OpI = dyn_cast<Instruction>(Op);
6834 if (Op == IV || !OpI || !OrigLoop->contains(OpI) || !Op->hasOneUse())
6835 continue;
6836 IVInsts.push_back(OpI);
6837 }
6838 }
6839 IVInsts.push_back(IV);
6840 for (User *U : IV->users()) {
6841 auto *CI = cast<Instruction>(U);
6842 if (!CostCtx.CM.isOptimizableIVTruncate(CI, VF))
6843 continue;
6844 IVInsts.push_back(CI);
6845 }
6846
6847 // If the vector loop gets executed exactly once with the given VF, ignore
6848 // the costs of comparison and induction instructions, as they'll get
6849 // simplified away.
6850 // TODO: Remove this code after stepping away from the legacy cost model and
6851 // adding code to simplify VPlans before calculating their costs.
6852 auto TC = getSmallConstantTripCount(PSE.getSE(), OrigLoop);
6853 if (TC == VF && !CM.foldTailByMasking())
6854 addFullyUnrolledInstructionsToIgnore(OrigLoop, Legal->getInductionVars(),
6855 CostCtx.SkipCostComputation);
6856
6857 for (Instruction *IVInst : IVInsts) {
6858 if (CostCtx.skipCostComputation(IVInst, VF.isVector()))
6859 continue;
6860 InstructionCost InductionCost = CostCtx.getLegacyCost(IVInst, VF);
6861 LLVM_DEBUG({
6862 dbgs() << "Cost of " << InductionCost << " for VF " << VF
6863 << ": induction instruction " << *IVInst << "\n";
6864 });
6865 Cost += InductionCost;
6866 CostCtx.SkipCostComputation.insert(IVInst);
6867 }
6868 }
6869
6870 /// Compute the cost of all exiting conditions of the loop using the legacy
6871 /// cost model. This is to match the legacy behavior, which adds the cost of
6872 /// all exit conditions. Note that this over-estimates the cost, as there will
6873 /// be a single condition to control the vector loop.
6874 SmallVector<BasicBlock *> Exiting;
6875 CM.TheLoop->getExitingBlocks(Exiting);
6876 SetVector<Instruction *> ExitInstrs;
6877 // Collect all exit conditions.
6878 for (BasicBlock *EB : Exiting) {
6879 auto *Term = dyn_cast<BranchInst>(EB->getTerminator());
6880 if (!Term || CostCtx.skipCostComputation(Term, VF.isVector()))
6881 continue;
6882 if (auto *CondI = dyn_cast<Instruction>(Term->getOperand(0))) {
6883 ExitInstrs.insert(CondI);
6884 }
6885 }
6886 // Compute the cost of all instructions only feeding the exit conditions.
6887 for (unsigned I = 0; I != ExitInstrs.size(); ++I) {
6888 Instruction *CondI = ExitInstrs[I];
6889 if (!OrigLoop->contains(CondI) ||
6890 !CostCtx.SkipCostComputation.insert(CondI).second)
6891 continue;
6892 InstructionCost CondICost = CostCtx.getLegacyCost(CondI, VF);
6893 LLVM_DEBUG({
6894 dbgs() << "Cost of " << CondICost << " for VF " << VF
6895 << ": exit condition instruction " << *CondI << "\n";
6896 });
6897 Cost += CondICost;
6898 for (Value *Op : CondI->operands()) {
6899 auto *OpI = dyn_cast<Instruction>(Op);
6900 if (!OpI || CostCtx.skipCostComputation(OpI, VF.isVector()) ||
6901 any_of(OpI->users(), [&ExitInstrs, this](User *U) {
6902 return OrigLoop->contains(cast<Instruction>(U)->getParent()) &&
6903 !ExitInstrs.contains(cast<Instruction>(U));
6904 }))
6905 continue;
6906 ExitInstrs.insert(OpI);
6907 }
6908 }
6909
6910 // Pre-compute the costs for branches except for the backedge, as the number
6911 // of replicate regions in a VPlan may not directly match the number of
6912 // branches, which would lead to different decisions.
6913 // TODO: Compute cost of branches for each replicate region in the VPlan,
6914 // which is more accurate than the legacy cost model.
6915 for (BasicBlock *BB : OrigLoop->blocks()) {
6916 if (CostCtx.skipCostComputation(BB->getTerminator(), VF.isVector()))
6917 continue;
6918 CostCtx.SkipCostComputation.insert(BB->getTerminator());
6919 if (BB == OrigLoop->getLoopLatch())
6920 continue;
6921 auto BranchCost = CostCtx.getLegacyCost(BB->getTerminator(), VF);
6922 Cost += BranchCost;
6923 }
6924
6925 // Pre-compute costs for instructions that are forced-scalar or profitable to
6926 // scalarize. Their costs will be computed separately in the legacy cost
6927 // model.
6928 for (Instruction *ForcedScalar : CM.ForcedScalars[VF]) {
6929 if (CostCtx.skipCostComputation(ForcedScalar, VF.isVector()))
6930 continue;
6931 CostCtx.SkipCostComputation.insert(ForcedScalar);
6932 InstructionCost ForcedCost = CostCtx.getLegacyCost(ForcedScalar, VF);
6933 LLVM_DEBUG({
6934 dbgs() << "Cost of " << ForcedCost << " for VF " << VF
6935 << ": forced scalar " << *ForcedScalar << "\n";
6936 });
6937 Cost += ForcedCost;
6938 }
6939 for (const auto &[Scalarized, ScalarCost] : CM.InstsToScalarize[VF]) {
6940 if (CostCtx.skipCostComputation(Scalarized, VF.isVector()))
6941 continue;
6942 CostCtx.SkipCostComputation.insert(Scalarized);
6943 LLVM_DEBUG({
6944 dbgs() << "Cost of " << ScalarCost << " for VF " << VF
6945 << ": profitable to scalarize " << *Scalarized << "\n";
6946 });
6947 Cost += ScalarCost;
6948 }
6949
6950 return Cost;
6951}
6952
6953InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
6954 ElementCount VF) const {
6955 VPCostContext CostCtx(CM.TTI, *CM.TLI, Plan, CM, CM.CostKind, *PSE.getSE(),
6956 OrigLoop);
6957 InstructionCost Cost = precomputeCosts(Plan, VF, CostCtx);
6958
6959 // Now compute and add the VPlan-based cost.
6960 Cost += Plan.cost(VF, CostCtx);
6961#ifndef NDEBUG
6962 unsigned EstimatedWidth = estimateElementCount(VF, CM.getVScaleForTuning());
6963 LLVM_DEBUG(dbgs() << "Cost for VF " << VF << ": " << Cost
6964 << " (Estimated cost per lane: ");
6965 if (Cost.isValid()) {
6966 double CostPerLane = double(Cost.getValue()) / EstimatedWidth;
6967 LLVM_DEBUG(dbgs() << format("%.1f", CostPerLane));
6968 } else /* No point dividing an invalid cost - it will still be invalid */
6969 LLVM_DEBUG(dbgs() << "Invalid");
6970 LLVM_DEBUG(dbgs() << ")\n");
6971#endif
6972 return Cost;
6973}
6974
6975#ifndef NDEBUG
6976/// Return true if the original loop \p TheLoop contains any instructions that do
6977/// not have corresponding recipes in \p Plan and are not marked to be ignored
6978/// in \p CostCtx. This means the VPlan contains simplification that the legacy
6979/// cost-model did not account for.
6980static bool planContainsAdditionalSimplifications(VPlan &Plan,
6981 VPCostContext &CostCtx,
6982 Loop *TheLoop,
6983 ElementCount VF) {
6984 // First collect all instructions for the recipes in Plan.
6985 auto GetInstructionForCost = [](const VPRecipeBase *R) -> Instruction * {
6986 if (auto *S = dyn_cast<VPSingleDefRecipe>(R))
6987 return dyn_cast_or_null<Instruction>(S->getUnderlyingValue());
6988 if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(R))
6989 return &WidenMem->getIngredient();
6990 return nullptr;
6991 };
6992
6993 // Check if a select for a safe divisor was hoisted to the pre-header. If so,
6994 // the select doesn't need to be considered for the vector loop cost; go with
6995 // the more accurate VPlan-based cost model.
6996 for (VPRecipeBase &R : *Plan.getVectorPreheader()) {
6997 auto *VPI = dyn_cast<VPInstruction>(&R);
6998 if (!VPI || VPI->getOpcode() != Instruction::Select)
6999 continue;
7000
7001 if (auto *WR = dyn_cast_or_null<VPWidenRecipe>(VPI->getSingleUser())) {
7002 switch (WR->getOpcode()) {
7003 case Instruction::UDiv:
7004 case Instruction::SDiv:
7005 case Instruction::URem:
7006 case Instruction::SRem:
7007 return true;
7008 default:
7009 break;
7010 }
7011 }
7012 }
7013
7014 DenseSet<Instruction *> SeenInstrs;
7015 auto Iter = vp_depth_first_deep(Plan.getVectorLoopRegion()->getEntry());
7016 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
7017 for (VPRecipeBase &R : *VPBB) {
7018 if (auto *IR = dyn_cast<VPInterleaveRecipe>(&R)) {
7019 auto *IG = IR->getInterleaveGroup();
7020 unsigned NumMembers = IG->getNumMembers();
7021 for (unsigned I = 0; I != NumMembers; ++I) {
7022 if (Instruction *M = IG->getMember(I))
7023 SeenInstrs.insert(M);
7024 }
7025 continue;
7026 }
7027 // Unused FOR splices are removed by VPlan transforms, so the VPlan-based
7028 // cost model won't cost it whilst the legacy will.
7029 if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R)) {
7030 using namespace VPlanPatternMatch;
7031 if (none_of(FOR->users(),
7032 match_fn(m_VPInstruction<
7033 VPInstruction::FirstOrderRecurrenceSplice>())))
7034 return true;
7035 }
7036 // The VPlan-based cost model is more accurate for partial reduction and
7037 // comparing against the legacy cost isn't desirable.
7038 if (isa<VPPartialReductionRecipe>(&R))
7039 return true;
7040
7041 // The VPlan-based cost model can analyze if recipes are scalar
7042 // recursively, but the legacy cost model cannot.
7043 if (auto *WidenMemR = dyn_cast<VPWidenMemoryRecipe>(&R)) {
7044 auto *AddrI = dyn_cast<Instruction>(
7045 getLoadStorePointerOperand(&WidenMemR->getIngredient()));
7046 if (AddrI && vputils::isSingleScalar(WidenMemR->getAddr()) !=
7047 CostCtx.isLegacyUniformAfterVectorization(AddrI, VF))
7048 return true;
7049 }
7050
7051 /// If a VPlan transform folded a recipe to one producing a single-scalar,
7052 /// but the original instruction wasn't uniform-after-vectorization in the
7053 /// legacy cost model, the legacy cost overestimates the actual cost.
7054 if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
7055 if (RepR->isSingleScalar() &&
7056 !CostCtx.isLegacyUniformAfterVectorization(
7057 RepR->getUnderlyingInstr(), VF))
7058 return true;
7059 }
7060 if (Instruction *UI = GetInstructionForCost(&R)) {
7061 // If we adjusted the predicate of the recipe, the cost in the legacy
7062 // cost model may be different.
7063 using namespace VPlanPatternMatch;
7064 CmpPredicate Pred;
7065 if (match(&R, m_Cmp(Pred, m_VPValue(), m_VPValue())) &&
7066 cast<VPRecipeWithIRFlags>(R).getPredicate() !=
7067 cast<CmpInst>(UI)->getPredicate())
7068 return true;
7069 SeenInstrs.insert(UI);
7070 }
7071 }
7072 }
7073
7074 // Return true if the loop contains any instructions that are not also part of
7075 // the VPlan or are skipped for VPlan-based cost computations. This indicates
7076 // that the VPlan contains extra simplifications.
7077 return any_of(TheLoop->blocks(), [&SeenInstrs, &CostCtx,
7078 TheLoop](BasicBlock *BB) {
7079 return any_of(*BB, [&SeenInstrs, &CostCtx, TheLoop, BB](Instruction &I) {
7080 // Skip induction phis when checking for simplifications, as they may not
7081 // be lowered directly to a corresponding PHI recipe.
7082 if (isa<PHINode>(&I) && BB == TheLoop->getHeader() &&
7083 CostCtx.CM.Legal->isInductionPhi(cast<PHINode>(&I)))
7084 return false;
7085 return !SeenInstrs.contains(&I) && !CostCtx.skipCostComputation(&I, true);
7086 });
7087 });
7088}
7089#endif
7090
7091VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
7092 if (VPlans.empty())
7093 return VectorizationFactor::Disabled();
7094 // If there is a single VPlan with a single VF, return it directly.
7095 VPlan &FirstPlan = *VPlans[0];
7096 if (VPlans.size() == 1 && size(FirstPlan.vectorFactors()) == 1)
7097 return {*FirstPlan.vectorFactors().begin(), 0, 0};
7098
7099 LLVM_DEBUG(dbgs() << "LV: Computing best VF using cost kind: "
7100 << (CM.CostKind == TTI::TCK_RecipThroughput
7101 ? "Reciprocal Throughput\n"
7102 : CM.CostKind == TTI::TCK_Latency
7103 ? "Instruction Latency\n"
7104 : CM.CostKind == TTI::TCK_CodeSize ? "Code Size\n"
7105 : CM.CostKind == TTI::TCK_SizeAndLatency
7106 ? "Code Size and Latency\n"
7107 : "Unknown\n"));
7108
7109 ElementCount ScalarVF = ElementCount::getFixed(1);
7110 assert(hasPlanWithVF(ScalarVF) &&
7111 "More than a single plan/VF w/o any plan having scalar VF");
7112
7113 // TODO: Compute scalar cost using VPlan-based cost model.
7114 InstructionCost ScalarCost = CM.expectedCost(ScalarVF);
7115 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ScalarCost << ".\n");
7116 VectorizationFactor ScalarFactor(ScalarVF, ScalarCost, ScalarCost);
7117 VectorizationFactor BestFactor = ScalarFactor;
7118
7119 bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
7120 if (ForceVectorization) {
7121 // Ignore scalar width, because the user explicitly wants vectorization.
7122 // Initialize cost to max so that VF = 2 is, at least, chosen during cost
7123 // evaluation.
7124 BestFactor.Cost = InstructionCost::getMax();
7125 }
7126
7127 for (auto &P : VPlans) {
7128 ArrayRef<ElementCount> VFs(P->vectorFactors().begin(),
7129 P->vectorFactors().end());
7130
7132 if (any_of(VFs, [this](ElementCount VF) {
7133 return CM.shouldConsiderRegPressureForVF(VF);
7134 }))
7135 RUs = calculateRegisterUsageForPlan(*P, VFs, TTI, CM.ValuesToIgnore);
7136
7137 for (unsigned I = 0; I < VFs.size(); I++) {
7138 ElementCount VF = VFs[I];
7139 if (VF.isScalar())
7140 continue;
7141 if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) {
7142 LLVM_DEBUG(
7143 dbgs()
7144 << "LV: Not considering vector loop of width " << VF
7145 << " because it will not generate any vector instructions.\n");
7146 continue;
7147 }
7148 if (CM.OptForSize && !ForceVectorization && hasReplicatorRegion(*P)) {
7149 LLVM_DEBUG(
7150 dbgs()
7151 << "LV: Not considering vector loop of width " << VF
7152 << " because it would cause replicated blocks to be generated,"
7153 << " which isn't allowed when optimizing for size.\n");
7154 continue;
7155 }
7156
7157 InstructionCost Cost = cost(*P, VF);
7158 VectorizationFactor CurrentFactor(VF, Cost, ScalarCost);
7159
7160 if (CM.shouldConsiderRegPressureForVF(VF) &&
7161 RUs[I].exceedsMaxNumRegs(TTI, ForceTargetNumVectorRegs)) {
7162 LLVM_DEBUG(dbgs() << "LV(REG): Not considering vector loop of width "
7163 << VF << " because it uses too many registers\n");
7164 continue;
7165 }
7166
7167 if (isMoreProfitable(CurrentFactor, BestFactor, P->hasScalarTail()))
7168 BestFactor = CurrentFactor;
7169
7170 // If profitable add it to ProfitableVF list.
7171 if (isMoreProfitable(CurrentFactor, ScalarFactor, P->hasScalarTail()))
7172 ProfitableVFs.push_back(CurrentFactor);
7173 }
7174 }
7175
7176#ifndef NDEBUG
7177 // Select the optimal vectorization factor according to the legacy cost-model.
7178 // This is now only used to verify the decisions by the new VPlan-based
7179 // cost-model and will be retired once the VPlan-based cost-model is
7180 // stabilized.
7181 VectorizationFactor LegacyVF = selectVectorizationFactor();
7182 VPlan &BestPlan = getPlanFor(BestFactor.Width);
7183
7184 // Pre-compute the cost and use it to check if BestPlan contains any
7185 // simplifications not accounted for in the legacy cost model. If that's the
7186 // case, don't trigger the assertion, as the extra simplifications may cause a
7187 // different VF to be picked by the VPlan-based cost model.
7188 VPCostContext CostCtx(CM.TTI, *CM.TLI, BestPlan, CM, CM.CostKind,
7189 *CM.PSE.getSE(), OrigLoop);
7190 precomputeCosts(BestPlan, BestFactor.Width, CostCtx);
7191 // Verify that the VPlan-based and legacy cost models agree, except for VPlans
7192 // with early exits and plans with additional VPlan simplifications. The
7193 // legacy cost model doesn't properly model costs for such loops.
7194 assert((BestFactor.Width == LegacyVF.Width || BestPlan.hasEarlyExit() ||
7195 !Legal->getLAI()->getSymbolicStrides().empty() ||
7196 planContainsAdditionalSimplifications(getPlanFor(BestFactor.Width),
7197 CostCtx, OrigLoop,
7198 BestFactor.Width) ||
7199 planContainsAdditionalSimplifications(
7200 getPlanFor(LegacyVF.Width), CostCtx, OrigLoop, LegacyVF.Width)) &&
7201 " VPlan cost model and legacy cost model disagreed");
7202 assert((BestFactor.Width.isScalar() || BestFactor.ScalarCost > 0) &&
7203 "when vectorizing, the scalar cost must be computed.");
7204#endif
7205
7206 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << BestFactor.Width << ".\n");
7207 return BestFactor;
7208}
7209
7210static Value *getStartValueFromReductionResult(VPInstruction *RdxResult) {
7211 using namespace VPlanPatternMatch;
7212 assert(RdxResult->getOpcode() == VPInstruction::ComputeFindIVResult &&
7213 "RdxResult must be ComputeFindIVResult");
7214 VPValue *StartVPV = RdxResult->getOperand(1);
7215 match(StartVPV, m_Freeze(m_VPValue(StartVPV)));
7216 return StartVPV->getLiveInIRValue();
7217}
7218
7219// If \p EpiResumePhiR is the resume VPPhi for a reduction when vectorizing the
7220// epilog loop, fix the reduction's scalar PHI node by adding the incoming value
7221// from the main vector loop.
7222static void fixReductionScalarResumeWhenVectorizingEpilog(
7223 VPPhi *EpiResumePhiR, PHINode &EpiResumePhi, BasicBlock *BypassBlock) {
7224 // Get the VPInstruction computing the reduction result in the middle block.
7225 // The first operand may not be from the middle block if it is not connected
7226 // to the scalar preheader. In that case, there's nothing to fix.
7227 VPValue *Incoming = EpiResumePhiR->getOperand(0);
7230 auto *EpiRedResult = dyn_cast<VPInstruction>(Incoming);
7231 if (!EpiRedResult ||
7232 (EpiRedResult->getOpcode() != VPInstruction::ComputeAnyOfResult &&
7233 EpiRedResult->getOpcode() != VPInstruction::ComputeReductionResult &&
7234 EpiRedResult->getOpcode() != VPInstruction::ComputeFindIVResult))
7235 return;
7236
7237 auto *EpiRedHeaderPhi =
7238 cast<VPReductionPHIRecipe>(EpiRedResult->getOperand(0));
7239 RecurKind Kind = EpiRedHeaderPhi->getRecurrenceKind();
7240 Value *MainResumeValue;
7241 if (auto *VPI = dyn_cast<VPInstruction>(EpiRedHeaderPhi->getStartValue())) {
7242 assert((VPI->getOpcode() == VPInstruction::Broadcast ||
7243 VPI->getOpcode() == VPInstruction::ReductionStartVector) &&
7244 "unexpected start recipe");
7245 MainResumeValue = VPI->getOperand(0)->getUnderlyingValue();
7246 } else
7247 MainResumeValue = EpiRedHeaderPhi->getStartValue()->getUnderlyingValue();
7248 if (RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind)) {
7249 [[maybe_unused]] Value *StartV =
7250 EpiRedResult->getOperand(1)->getLiveInIRValue();
7251 auto *Cmp = cast<ICmpInst>(MainResumeValue);
7252 assert(Cmp->getPredicate() == CmpInst::ICMP_NE &&
7253 "AnyOf expected to start with ICMP_NE");
7254 assert(Cmp->getOperand(1) == StartV &&
7255 "AnyOf expected to start by comparing main resume value to original "
7256 "start value");
7257 MainResumeValue = Cmp->getOperand(0);
7258 } else if (RecurrenceDescriptor::isFindIVRecurrenceKind(Kind)) {
7259 Value *StartV = getStartValueFromReductionResult(EpiRedResult);
7260 Value *SentinelV = EpiRedResult->getOperand(2)->getLiveInIRValue();
7261 using namespace llvm::PatternMatch;
7262 Value *Cmp, *OrigResumeV, *CmpOp;
7263 [[maybe_unused]] bool IsExpectedPattern =
7264 match(MainResumeValue,
7265 m_Select(m_OneUse(m_Value(Cmp)), m_Specific(SentinelV),
7266 m_Value(OrigResumeV))) &&
7268 m_Value(CmpOp))) &&
7269 ((CmpOp == StartV && isGuaranteedNotToBeUndefOrPoison(CmpOp))));
7270 assert(IsExpectedPattern && "Unexpected reduction resume pattern");
7271 MainResumeValue = OrigResumeV;
7272 }
7273 PHINode *MainResumePhi = cast<PHINode>(MainResumeValue);
7274
7275 // When fixing reductions in the epilogue loop we should already have
7276 // created a bc.merge.rdx Phi after the main vector body. Ensure that we carry
7277 // over the incoming values correctly.
7278 EpiResumePhi.setIncomingValueForBlock(
7279 BypassBlock, MainResumePhi->getIncomingValueForBlock(BypassBlock));
7280}
7281
7282DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
7283 ElementCount BestVF, unsigned BestUF, VPlan &BestVPlan,
7284 InnerLoopVectorizer &ILV, DominatorTree *DT, bool VectorizingEpilogue) {
7285 assert(BestVPlan.hasVF(BestVF) &&
7286 "Trying to execute plan with unsupported VF");
7287 assert(BestVPlan.hasUF(BestUF) &&
7288 "Trying to execute plan with unsupported UF");
7289 if (BestVPlan.hasEarlyExit())
7290 ++LoopsEarlyExitVectorized;
7291 // TODO: Move to VPlan transform stage once the transition to the VPlan-based
7292 // cost model is complete for better cost estimates.
7295 BestVPlan);
7298 bool HasBranchWeights =
7299 hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator());
7300 if (HasBranchWeights) {
7301 std::optional<unsigned> VScale = CM.getVScaleForTuning();
7303 BestVPlan, BestVF, VScale);
7304 }
7305
7306 // Checks are the same for all VPlans, added to BestVPlan only for
7307 // compactness.
7308 attachRuntimeChecks(BestVPlan, ILV.RTChecks, HasBranchWeights);
7309
7310 // Retrieving VectorPH now when it's easier while VPlan still has Regions.
7311 VPBasicBlock *VectorPH = cast<VPBasicBlock>(BestVPlan.getVectorPreheader());
7312
7313 VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
7316 if (BestVPlan.getEntry()->getSingleSuccessor() ==
7317 BestVPlan.getScalarPreheader()) {
7318 // TODO: The vector loop would be dead, should not even try to vectorize.
7319 ORE->emit([&]() {
7320 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationDead",
7321 OrigLoop->getStartLoc(),
7322 OrigLoop->getHeader())
7323 << "Created vector loop never executes due to insufficient trip "
7324 "count.";
7325 });
7327 }
7328
7330 BestVPlan, BestVF,
7331 TTI.getRegisterBitWidth(BestVF.isScalable()
7335
7337 // Regions are dissolved after optimizing for VF and UF, which completely
7338 // removes unneeded loop regions first.
7340 // Canonicalize EVL loops after regions are dissolved.
7344 BestVPlan, VectorPH, CM.foldTailByMasking(),
7345 CM.requiresScalarEpilogue(BestVF.isVector()));
7346 VPlanTransforms::materializeVFAndVFxUF(BestVPlan, VectorPH, BestVF);
7347 VPlanTransforms::cse(BestVPlan);
7349
7350 // 0. Generate SCEV-dependent code in the entry, including TripCount, before
7351 // making any changes to the CFG.
7352 DenseMap<const SCEV *, Value *> ExpandedSCEVs =
7353 VPlanTransforms::expandSCEVs(BestVPlan, *PSE.getSE());
7354 if (!ILV.getTripCount())
7355 ILV.setTripCount(BestVPlan.getTripCount()->getLiveInIRValue());
7356 else
7357 assert(VectorizingEpilogue && "should only re-use the existing trip "
7358 "count during epilogue vectorization");
7359
7360 // Perform the actual loop transformation.
7361 VPTransformState State(&TTI, BestVF, LI, DT, ILV.AC, ILV.Builder, &BestVPlan,
7362 OrigLoop->getParentLoop(),
7363 Legal->getWidestInductionType());
7364
7365#ifdef EXPENSIVE_CHECKS
7366 assert(DT->verify(DominatorTree::VerificationLevel::Fast));
7367#endif
7368
7369 // 1. Set up the skeleton for vectorization, including vector pre-header and
7370 // middle block. The vector loop is created during VPlan execution.
7371 State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
7373 State.CFG.PrevBB->getSingleSuccessor(), &BestVPlan);
7375
7376 assert(verifyVPlanIsValid(BestVPlan, true /*VerifyLate*/) &&
7377 "final VPlan is invalid");
7378
7379 // After vectorization, the exit blocks of the original loop will have
7380 // additional predecessors. Invalidate SCEVs for the exit phis in case SE
7381 // looked through single-entry phis.
7382 ScalarEvolution &SE = *PSE.getSE();
7383 for (VPIRBasicBlock *Exit : BestVPlan.getExitBlocks()) {
7384 if (!Exit->hasPredecessors())
7385 continue;
7386 for (VPRecipeBase &PhiR : Exit->phis())
7387 SE.forgetLcssaPhiWithNewPredecessor(OrigLoop,
7388 &cast<VPIRPhi>(PhiR).getIRPhi());
7389 }
7390 // Forget the original loop and block dispositions.
7391 SE.forgetLoop(OrigLoop);
7392 SE.forgetBlockAndLoopDispositions();
7393
7394 ILV.printDebugTracesAtStart();
7395
7396 //===------------------------------------------------===//
7397 //
7398 // Notice: any optimization or new instruction that go
7399 // into the code below should also be implemented in
7400 // the cost-model.
7401 //
7402 //===------------------------------------------------===//
7403
7404 // Retrieve loop information before executing the plan, which may remove the
7405 // original loop, if it becomes unreachable.
7406 MDNode *LID = OrigLoop->getLoopID();
7407 unsigned OrigLoopInvocationWeight = 0;
7408 std::optional<unsigned> OrigAverageTripCount =
7409 getLoopEstimatedTripCount(OrigLoop, &OrigLoopInvocationWeight);
7410
7411 BestVPlan.execute(&State);
7412
7413 // 2.6. Maintain Loop Hints
7414 // Keep all loop hints from the original loop on the vector loop (we'll
7415 // replace the vectorizer-specific hints below).
7416 VPBasicBlock *HeaderVPBB = vputils::getFirstLoopHeader(BestVPlan, State.VPDT);
7417 // Add metadata to disable runtime unrolling a scalar loop when there
7418 // are no runtime checks about strides and memory. A scalar loop that is
7419 // rarely used is not worth unrolling.
7420 bool DisableRuntimeUnroll = !ILV.RTChecks.hasChecks() && !BestVF.isScalar();
7422 HeaderVPBB ? LI->getLoopFor(State.CFG.VPBB2IRBB.lookup(HeaderVPBB))
7423 : nullptr,
7424 HeaderVPBB, BestVPlan, VectorizingEpilogue, LID, OrigAverageTripCount,
7425 OrigLoopInvocationWeight,
7426 estimateElementCount(BestVF * BestUF, CM.getVScaleForTuning()),
7427 DisableRuntimeUnroll);
7428
7429 // 3. Fix the vectorized code: take care of header phi's, live-outs,
7430 // predication, updating analyses.
7431 ILV.fixVectorizedLoop(State);
7432
7434
7435 return ExpandedSCEVs;
7436}
7437
7438//===--------------------------------------------------------------------===//
7439// EpilogueVectorizerMainLoop
7440//===--------------------------------------------------------------------===//
7441
7442/// This function is partially responsible for generating the control flow
7443/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
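/// A rough sketch of the check order generated below (see the link above for
/// the full picture): the entry block first checks whether even the epilogue
/// VF is feasible ("iter.check"), and only then checks the main vector loop's
/// trip count ("vector.main.loop.iter.check"), keeping the path through the
/// vector epilogue short.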
7445 BasicBlock *ScalarPH = createScalarPreheader("");
7446 BasicBlock *VectorPH = ScalarPH->getSinglePredecessor();
7447
7448 // Generate the code to check the minimum iteration count of the vector
7449 // epilogue (see below).
7450 EPI.EpilogueIterationCountCheck =
7451 emitIterationCountCheck(VectorPH, ScalarPH, true);
7452 EPI.EpilogueIterationCountCheck->setName("iter.check");
7453
7454 VectorPH = cast<BranchInst>(EPI.EpilogueIterationCountCheck->getTerminator())
7455 ->getSuccessor(1);
7456 // Generate the iteration count check for the main loop, *after* the check
7457 // for the epilogue loop, so that the path-length is shorter for the case
7458 // that goes directly through the vector epilogue. The longer-path length for
7459 // the main loop is compensated for, by the gain from vectorizing the larger
7460 // trip count. Note: the branch will get updated later on when we vectorize
7461 // the epilogue.
7462 EPI.MainLoopIterationCountCheck =
7463 emitIterationCountCheck(VectorPH, ScalarPH, false);
7464
7465 return cast<BranchInst>(EPI.MainLoopIterationCountCheck->getTerminator())
7466 ->getSuccessor(1);
7467}
7468
7469void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
7470 LLVM_DEBUG({
7471 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
7472 << "Main Loop VF:" << EPI.MainLoopVF
7473 << ", Main Loop UF:" << EPI.MainLoopUF
7474 << ", Epilogue Loop VF:" << EPI.EpilogueVF
7475 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7476 });
7477}
7478
7481 dbgs() << "intermediate fn:\n"
7482 << *OrigLoop->getHeader()->getParent() << "\n";
7483 });
7484}
7485
7486BasicBlock *EpilogueVectorizerMainLoop::emitIterationCountCheck(
7487 BasicBlock *VectorPH, BasicBlock *Bypass, bool ForEpilogue) {
7488 assert(Bypass && "Expected valid bypass basic block.");
7491 Value *CheckMinIters = createIterationCountCheck(
7492 VectorPH, ForEpilogue ? EPI.EpilogueVF : EPI.MainLoopVF,
7493 ForEpilogue ? EPI.EpilogueUF : EPI.MainLoopUF);
7494
7495 BasicBlock *const TCCheckBlock = VectorPH;
7496 if (!ForEpilogue)
7497 TCCheckBlock->setName("vector.main.loop.iter.check");
7498
7499 // Create new preheader for vector loop.
7500 VectorPH = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
7501 static_cast<DominatorTree *>(nullptr), LI, nullptr,
7502 "vector.ph");
7503 if (ForEpilogue) {
7504 // Save the trip count so we don't have to regenerate it in the
7505 // vec.epilog.iter.check. This is safe to do because the trip count
7506 // generated here dominates the vector epilog iter check.
7507 EPI.TripCount = Count;
7508 } else {
7510 }
7511
7512 BranchInst &BI = *BranchInst::Create(Bypass, VectorPH, CheckMinIters);
7513 if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator()))
7514 setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false);
7515 ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
7516
7517 // When vectorizing the main loop, its trip-count check is placed in a new
7518 // block, whereas the overall trip-count check is placed in the VPlan entry
7519 // block. When vectorizing the epilogue loop, its trip-count check is placed
7520 // in the VPlan entry block.
7521 if (!ForEpilogue)
7522 introduceCheckBlockInVPlan(TCCheckBlock);
7523 return TCCheckBlock;
7524}
7525
7526//===--------------------------------------------------------------------===//
7527// EpilogueVectorizerEpilogueLoop
7528//===--------------------------------------------------------------------===//
7529
7530/// This function creates a new scalar preheader, using the previous one as
7531/// entry block to the epilogue VPlan. The minimum iteration check is being
7532/// represented in VPlan.
7534 BasicBlock *NewScalarPH = createScalarPreheader("vec.epilog.");
7535 BasicBlock *OriginalScalarPH = NewScalarPH->getSinglePredecessor();
7536 OriginalScalarPH->setName("vec.epilog.iter.check");
7537 VPIRBasicBlock *NewEntry = Plan.createVPIRBasicBlock(OriginalScalarPH);
7538 VPBasicBlock *OldEntry = Plan.getEntry();
7539 for (auto &R : make_early_inc_range(*OldEntry)) {
7540 // Skip moving VPIRInstructions (including VPIRPhis), which are unmovable by
7541 // definition.
7542 if (isa<VPIRInstruction>(&R))
7543 continue;
7544 R.moveBefore(*NewEntry, NewEntry->end());
7545 }
7546
7547 VPBlockUtils::reassociateBlocks(OldEntry, NewEntry);
7548 Plan.setEntry(NewEntry);
7549 // OldEntry is now dead and will be cleaned up when the plan gets destroyed.
7550
7551 return OriginalScalarPH;
7552}
7553
7554void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() {
7555 LLVM_DEBUG({
7556 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
7557 << "Epilogue Loop VF:" << EPI.EpilogueVF
7558 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7559 });
7560}
7561
7564 dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n";
7565 });
7566}
7567
7568VPWidenMemoryRecipe *VPRecipeBuilder::tryToWidenMemory(VPInstruction *VPI,
7569 VFRange &Range) {
7570 assert((VPI->getOpcode() == Instruction::Load ||
7571 VPI->getOpcode() == Instruction::Store) &&
7572 "Must be called with either a load or store");
7573 Instruction *I = VPI->getUnderlyingInstr();
7574
7575 auto WillWiden = [&](ElementCount VF) -> bool {
7576 LoopVectorizationCostModel::InstWidening Decision =
7577 CM.getWideningDecision(I, VF);
7579 "CM decision should be taken at this point.");
7580 if (Decision == LoopVectorizationCostModel::CM_Interleave)
7581 return true;
7582 if (CM.isScalarAfterVectorization(I, VF) ||
7583 CM.isProfitableToScalarize(I, VF))
7584 return false;
7585 return Decision != LoopVectorizationCostModel::CM_Scalarize;
7586 };
7587
7588 if (!LoopVectorizationPlanner::getDecisionAndClampRange(WillWiden, Range))
7589 return nullptr;
7590
7591 VPValue *Mask = nullptr;
7592 if (Legal->isMaskRequired(I))
7593 Mask = getBlockInMask(Builder.getInsertBlock());
7594
7595 // Determine if the pointer operand of the access is either consecutive or
7596 // reverse consecutive.
7598 CM.getWideningDecision(I, Range.Start);
7600 bool Consecutive =
7602
7603 VPValue *Ptr = VPI->getOpcode() == Instruction::Load ? VPI->getOperand(0)
7604 : VPI->getOperand(1);
7605 if (Consecutive) {
7606 auto *GEP = dyn_cast<GetElementPtrInst>(
7607 Ptr->getUnderlyingValue()->stripPointerCasts());
7608 VPSingleDefRecipe *VectorPtr;
7609 if (Reverse) {
7610      // When folding the tail, we may compute an address that we would not compute
7611      // in the original scalar loop: drop the GEP no-wrap flags in this case.
7612 // Otherwise preserve existing flags without no-unsigned-wrap, as we will
7613 // emit negative indices.
7614 GEPNoWrapFlags Flags =
7615 CM.foldTailByMasking() || !GEP
7617 : GEP->getNoWrapFlags().withoutNoUnsignedWrap();
7618 VectorPtr = new VPVectorEndPointerRecipe(
7619 Ptr, &Plan.getVF(), getLoadStoreType(I),
7620 /*Stride*/ -1, Flags, VPI->getDebugLoc());
7621 } else {
7622 VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I),
7623 GEP ? GEP->getNoWrapFlags()
7625 VPI->getDebugLoc());
7626 }
7627 Builder.insert(VectorPtr);
7628 Ptr = VectorPtr;
7629 }
7630 if (VPI->getOpcode() == Instruction::Load) {
7631 auto *Load = cast<LoadInst>(I);
7632 return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse, *VPI,
7633 VPI->getDebugLoc());
7634 }
7635
7636 StoreInst *Store = cast<StoreInst>(I);
7637 return new VPWidenStoreRecipe(*Store, Ptr, VPI->getOperand(0), Mask,
7638 Consecutive, Reverse, *VPI, VPI->getDebugLoc());
7639}
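// For illustration only (a sketch, with made-up array names): the first loop
// below has a consecutive access, the second a reverse consecutive one, for
// which a VPVectorEndPointerRecipe with stride -1 is created and negative
// indices are emitted off the end pointer.
//
//   for (int I = 0; I < N; ++I)        // consecutive: A[0], A[1], ...
//     A[I] = B[I] + 1;
//   for (int I = N - 1; I >= 0; --I)   // reverse consecutive: A[N-1], A[N-2], ...
//     A[I] = B[I] + 1;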
7640
7641/// Creates a VPWidenIntOrFpInductionRecipe for \p PhiR. If needed, it will
7642/// also insert a recipe to expand the step for the induction recipe.
7643static VPWidenIntOrFpInductionRecipe *
7645 const InductionDescriptor &IndDesc, VPlan &Plan,
7646 ScalarEvolution &SE, Loop &OrigLoop) {
7647 assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) &&
7648 "step must be loop invariant");
7649
7650 VPValue *Start = PhiR->getOperand(0);
7651 assert(Plan.getLiveIn(IndDesc.getStartValue()) == Start &&
7652 "Start VPValue must match IndDesc's start value");
7653
7654 // It is always safe to copy over the NoWrap and FastMath flags. In
7655 // particular, when folding tail by masking, the masked-off lanes are never
7656 // used, so it is safe.
7657 VPIRFlags Flags = vputils::getFlagsFromIndDesc(IndDesc);
7658 VPValue *Step =
7660
7661 // Update wide induction increments to use the same step as the corresponding
7662 // wide induction. This enables detecting induction increments directly in
7663 // VPlan and removes redundant splats.
7664 using namespace llvm::VPlanPatternMatch;
7665 if (match(PhiR->getOperand(1), m_Add(m_Specific(PhiR), m_VPValue())))
7666 PhiR->getOperand(1)->getDefiningRecipe()->setOperand(1, Step);
7667
7669 return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, &Plan.getVF(),
7670 IndDesc, Flags, PhiR->getDebugLoc());
7671}
7672
7673VPHeaderPHIRecipe *
7674VPRecipeBuilder::tryToOptimizeInductionPHI(VPInstruction *VPI, VFRange &Range) {
7675 auto *Phi = cast<PHINode>(VPI->getUnderlyingInstr());
7676
7677 // Check if this is an integer or fp induction. If so, build the recipe that
7678 // produces its scalar and vector values.
7679 if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi))
7680 return createWidenInductionRecipes(VPI, *II, Plan, *PSE.getSE(), *OrigLoop);
7681
7682 // Check if this is pointer induction. If so, build the recipe for it.
7683 if (auto *II = Legal->getPointerInductionDescriptor(Phi)) {
7684 VPValue *Step = vputils::getOrCreateVPValueForSCEVExpr(Plan, II->getStep());
7685 return new VPWidenPointerInductionRecipe(
7686 Phi, VPI->getOperand(0), Step, &Plan.getVFxUF(), *II,
7688 [&](ElementCount VF) {
7689 return CM.isScalarAfterVectorization(Phi, VF);
7690 },
7691 Range),
7692 VPI->getDebugLoc());
7693 }
7694 return nullptr;
7695}
7696
7697VPWidenIntOrFpInductionRecipe *
7698VPRecipeBuilder::tryToOptimizeInductionTruncate(VPInstruction *VPI,
7699 VFRange &Range) {
7700 auto *I = cast<TruncInst>(VPI->getUnderlyingInstr());
7701 // Optimize the special case where the source is a constant integer
7702 // induction variable. Notice that we can only optimize the 'trunc' case
7703 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
7704 // (c) other casts depend on pointer size.
7705
7706 // Determine whether \p K is a truncation based on an induction variable that
7707 // can be optimized.
7708 auto IsOptimizableIVTruncate =
7709 [&](Instruction *K) -> std::function<bool(ElementCount)> {
7710 return [=](ElementCount VF) -> bool {
7711 return CM.isOptimizableIVTruncate(K, VF);
7712 };
7713 };
7714
7716 IsOptimizableIVTruncate(I), Range))
7717 return nullptr;
7718
7720 VPI->getOperand(0)->getDefiningRecipe());
7721 PHINode *Phi = WidenIV->getPHINode();
7722 VPValue *Start = WidenIV->getStartValue();
7723 const InductionDescriptor &IndDesc = WidenIV->getInductionDescriptor();
7724
7725 // It is always safe to copy over the NoWrap and FastMath flags. In
7726 // particular, when folding tail by masking, the masked-off lanes are never
7727 // used, so it is safe.
7728 VPIRFlags Flags = vputils::getFlagsFromIndDesc(IndDesc);
7729 VPValue *Step =
7731 return new VPWidenIntOrFpInductionRecipe(
7732 Phi, Start, Step, &Plan.getVF(), IndDesc, I, Flags, VPI->getDebugLoc());
7733}
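// A sketch of the pattern handled here (illustrative only): a wide induction
// variable truncated to a narrower type inside the loop, where the truncate
// can be folded into a narrower widened induction instead of widening the
// trunc itself.
//
//   for (long I = 0; I < N; ++I)
//     A[I] = (int)I;   // trunc of the IV; see isOptimizableIVTruncate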
7734
7735VPSingleDefRecipe *VPRecipeBuilder::tryToWidenCall(VPInstruction *VPI,
7736 VFRange &Range) {
7737 CallInst *CI = cast<CallInst>(VPI->getUnderlyingInstr());
7739 [this, CI](ElementCount VF) {
7740 return CM.isScalarWithPredication(CI, VF);
7741 },
7742 Range);
7743
7744 if (IsPredicated)
7745 return nullptr;
7746
7748 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
7749 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
7750 ID == Intrinsic::pseudoprobe ||
7751 ID == Intrinsic::experimental_noalias_scope_decl))
7752 return nullptr;
7753
7755 VPI->op_begin() + CI->arg_size());
7756
7757 // Is it beneficial to perform intrinsic call compared to lib call?
7758 bool ShouldUseVectorIntrinsic =
7760 [&](ElementCount VF) -> bool {
7761 return CM.getCallWideningDecision(CI, VF).Kind ==
7763 },
7764 Range);
7765 if (ShouldUseVectorIntrinsic)
7766 return new VPWidenIntrinsicRecipe(*CI, ID, Ops, CI->getType(), *VPI, *VPI,
7767 VPI->getDebugLoc());
7768
7769 Function *Variant = nullptr;
7770 std::optional<unsigned> MaskPos;
7771  // Is it better to call a vectorized version of the function than to scalarize
7772 // the call?
7773 auto ShouldUseVectorCall = LoopVectorizationPlanner::getDecisionAndClampRange(
7774 [&](ElementCount VF) -> bool {
7775 // The following case may be scalarized depending on the VF.
7776        // The flag shows whether we can use a regular call for the vectorized
7777        // version of the instruction.
7778
7779 // If we've found a variant at a previous VF, then stop looking. A
7780 // vectorized variant of a function expects input in a certain shape
7781 // -- basically the number of input registers, the number of lanes
7782 // per register, and whether there's a mask required.
7783 // We store a pointer to the variant in the VPWidenCallRecipe, so
7784 // once we have an appropriate variant it's only valid for that VF.
7785 // This will force a different vplan to be generated for each VF that
7786 // finds a valid variant.
7787 if (Variant)
7788 return false;
7789 LoopVectorizationCostModel::CallWideningDecision Decision =
7790 CM.getCallWideningDecision(CI, VF);
7792 Variant = Decision.Variant;
7793 MaskPos = Decision.MaskPos;
7794 return true;
7795 }
7796
7797 return false;
7798 },
7799 Range);
7800 if (ShouldUseVectorCall) {
7801 if (MaskPos.has_value()) {
7802 // We have 2 cases that would require a mask:
7803 // 1) The block needs to be predicated, either due to a conditional
7804 // in the scalar loop or use of an active lane mask with
7805 // tail-folding, and we use the appropriate mask for the block.
7806 // 2) No mask is required for the block, but the only available
7807 // vector variant at this VF requires a mask, so we synthesize an
7808 // all-true mask.
7809 VPValue *Mask = nullptr;
7810 if (Legal->isMaskRequired(CI))
7811 Mask = getBlockInMask(Builder.getInsertBlock());
7812 else
7813 Mask = Plan.getOrAddLiveIn(
7814 ConstantInt::getTrue(IntegerType::getInt1Ty(Plan.getContext())));
7815
7816 Ops.insert(Ops.begin() + *MaskPos, Mask);
7817 }
7818
7819 Ops.push_back(VPI->getOperand(VPI->getNumOperands() - 1));
7820 return new VPWidenCallRecipe(CI, Variant, Ops, *VPI, *VPI,
7821 VPI->getDebugLoc());
7822 }
7823
7824 return nullptr;
7825}
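// For illustration (a hypothetical example, not an exhaustive list of cases):
// a call like the one below may be widened to a vector intrinsic (llvm.sqrt on
// a vector type) or to a library-provided vector variant, depending on the
// call-widening decision taken by the cost model.
//
//   for (int I = 0; I < N; ++I)
//     A[I] = sqrtf(B[I]);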
7826
7827bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
7829 !isa<StoreInst>(I) && "Instruction should have been handled earlier");
7830 // Instruction should be widened, unless it is scalar after vectorization,
7831 // scalarization is profitable or it is predicated.
7832 auto WillScalarize = [this, I](ElementCount VF) -> bool {
7833 return CM.isScalarAfterVectorization(I, VF) ||
7834 CM.isProfitableToScalarize(I, VF) ||
7835 CM.isScalarWithPredication(I, VF);
7836 };
7838 Range);
7839}
7840
7841VPWidenRecipe *VPRecipeBuilder::tryToWiden(VPInstruction *VPI) {
7842 auto *I = VPI->getUnderlyingInstr();
7843 switch (VPI->getOpcode()) {
7844 default:
7845 return nullptr;
7846 case Instruction::SDiv:
7847 case Instruction::UDiv:
7848 case Instruction::SRem:
7849 case Instruction::URem: {
7850 // If not provably safe, use a select to form a safe divisor before widening the
7851 // div/rem operation itself. Otherwise fall through to general handling below.
7852 if (CM.isPredicatedInst(I)) {
7854 VPValue *Mask = getBlockInMask(Builder.getInsertBlock());
7855 VPValue *One = Plan.getConstantInt(I->getType(), 1u);
7856 auto *SafeRHS =
7857 Builder.createSelect(Mask, Ops[1], One, VPI->getDebugLoc());
7858 Ops[1] = SafeRHS;
7859 return new VPWidenRecipe(*I, Ops, *VPI, *VPI, VPI->getDebugLoc());
7860 }
7861 [[fallthrough]];
7862 }
7863 case Instruction::Add:
7864 case Instruction::And:
7865 case Instruction::AShr:
7866 case Instruction::FAdd:
7867 case Instruction::FCmp:
7868 case Instruction::FDiv:
7869 case Instruction::FMul:
7870 case Instruction::FNeg:
7871 case Instruction::FRem:
7872 case Instruction::FSub:
7873 case Instruction::ICmp:
7874 case Instruction::LShr:
7875 case Instruction::Mul:
7876 case Instruction::Or:
7877 case Instruction::Select:
7878 case Instruction::Shl:
7879 case Instruction::Sub:
7880 case Instruction::Xor:
7881 case Instruction::Freeze: {
7882 SmallVector<VPValue *> NewOps(VPI->operands());
7883 if (Instruction::isBinaryOp(VPI->getOpcode())) {
7884 // The legacy cost model uses SCEV to check if some of the operands are
7885 // constants. To match the legacy cost model's behavior, use SCEV to try
7886 // to replace operands with constants.
7887 ScalarEvolution &SE = *PSE.getSE();
7888 auto GetConstantViaSCEV = [this, &SE](VPValue *Op) {
7889 if (!Op->isLiveIn())
7890 return Op;
7891 Value *V = Op->getUnderlyingValue();
7892 if (isa<Constant>(V) || !SE.isSCEVable(V->getType()))
7893 return Op;
7894 auto *C = dyn_cast<SCEVConstant>(SE.getSCEV(V));
7895 if (!C)
7896 return Op;
7897 return Plan.getOrAddLiveIn(C->getValue());
7898 };
7899 // For Mul, the legacy cost model checks both operands.
7900 if (VPI->getOpcode() == Instruction::Mul)
7901 NewOps[0] = GetConstantViaSCEV(NewOps[0]);
7902 // For other binops, the legacy cost model only checks the second operand.
7903 NewOps[1] = GetConstantViaSCEV(NewOps[1]);
7904 }
7905 return new VPWidenRecipe(*I, NewOps, *VPI, *VPI, VPI->getDebugLoc());
7906 }
7907 case Instruction::ExtractValue: {
7908 SmallVector<VPValue *> NewOps(VPI->operands());
7909 auto *EVI = cast<ExtractValueInst>(I);
7910 assert(EVI->getNumIndices() == 1 && "Expected one extractvalue index");
7911 unsigned Idx = EVI->getIndices()[0];
7912 NewOps.push_back(Plan.getConstantInt(32, Idx));
7913 return new VPWidenRecipe(*I, NewOps, *VPI, *VPI, VPI->getDebugLoc());
7914 }
7915 };
7916}
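// A source-level sketch of the div/rem handling above (illustrative only):
// when the division executes conditionally, lanes where the mask is false are
// given a dummy divisor of 1 via a select, so the widened division cannot trap.
//
//   for (int I = 0; I < N; ++I)
//     if (C[I])
//       A[I] = B[I] / D[I];
//   // per lane, conceptually: SafeD = Mask ? D[I] : 1; Tmp = B[I] / SafeD;
//   // with the result only stored where Mask is true.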
7917
7918VPHistogramRecipe *VPRecipeBuilder::tryToWidenHistogram(const HistogramInfo *HI,
7919 VPInstruction *VPI) {
7920 // FIXME: Support other operations.
7921 unsigned Opcode = HI->Update->getOpcode();
7922 assert((Opcode == Instruction::Add || Opcode == Instruction::Sub) &&
7923 "Histogram update operation must be an Add or Sub");
7924
7926 // Bucket address.
7927 HGramOps.push_back(VPI->getOperand(1));
7928 // Increment value.
7929 HGramOps.push_back(getVPValueOrAddLiveIn(HI->Update->getOperand(1)));
7930
7931 // In case of predicated execution (due to tail-folding, or conditional
7932 // execution, or both), pass the relevant mask.
7933 if (Legal->isMaskRequired(HI->Store))
7934 HGramOps.push_back(getBlockInMask(Builder.getInsertBlock()));
7935
7936 return new VPHistogramRecipe(Opcode, HGramOps, VPI->getDebugLoc());
7937}
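// For illustration, a loop of the shape VPHistogramRecipe targets (a sketch;
// the Buckets and Indices names are made up):
//
//   for (int I = 0; I < N; ++I)
//     Buckets[Indices[I]] += 1;   // the update is an Add of the loaded bucket
//
// The recipe receives the bucket address and the increment value, plus the
// block-in mask when the store is executed conditionally.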
7938
7940 VFRange &Range) {
7941 auto *I = VPI->getUnderlyingInstr();
7943 [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
7944 Range);
7945
7946 bool IsPredicated = CM.isPredicatedInst(I);
7947
7948 // Even if the instruction is not marked as uniform, there are certain
7949 // intrinsic calls that can be effectively treated as such, so we check for
7950 // them here. Conservatively, we only do this for scalable vectors, since
7951 // for fixed-width VFs we can always fall back on full scalarization.
7952 if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) {
7953 switch (cast<IntrinsicInst>(I)->getIntrinsicID()) {
7954 case Intrinsic::assume:
7955 case Intrinsic::lifetime_start:
7956 case Intrinsic::lifetime_end:
7957 // For scalable vectors if one of the operands is variant then we still
7958 // want to mark as uniform, which will generate one instruction for just
7959 // the first lane of the vector. We can't scalarize the call in the same
7960 // way as for fixed-width vectors because we don't know how many lanes
7961 // there are.
7962 //
7963 // The reasons for doing it this way for scalable vectors are:
7964 // 1. For the assume intrinsic generating the instruction for the first
7965      //    lane is still better than not generating any at all. For
7966 // example, the input may be a splat across all lanes.
7967 // 2. For the lifetime start/end intrinsics the pointer operand only
7968 // does anything useful when the input comes from a stack object,
7969 // which suggests it should always be uniform. For non-stack objects
7970 // the effect is to poison the object, which still allows us to
7971 // remove the call.
7972 IsUniform = true;
7973 break;
7974 default:
7975 break;
7976 }
7977 }
7978 VPValue *BlockInMask = nullptr;
7979 if (!IsPredicated) {
7980 // Finalize the recipe for Instr, first if it is not predicated.
7981 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
7982 } else {
7983 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
7984 // Instructions marked for predication are replicated and a mask operand is
7985 // added initially. Masked replicate recipes will later be placed under an
7986 // if-then construct to prevent side-effects. Generate recipes to compute
7987 // the block mask for this region.
7988 BlockInMask = getBlockInMask(Builder.getInsertBlock());
7989 }
7990
7991 // Note that there is some custom logic to mark some intrinsics as uniform
7992 // manually above for scalable vectors, which this assert needs to account for
7993 // as well.
7994 assert((Range.Start.isScalar() || !IsUniform || !IsPredicated ||
7995 (Range.Start.isScalable() && isa<IntrinsicInst>(I))) &&
7996 "Should not predicate a uniform recipe");
7997 auto *Recipe =
7998 new VPReplicateRecipe(I, VPI->operands(), IsUniform, BlockInMask, *VPI,
7999 *VPI, VPI->getDebugLoc());
8000 return Recipe;
8001}
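// A sketch of an instruction that typically becomes a predicated replicate
// recipe (illustrative only): the conditional store below is scalarized per
// lane and later sunk into an if-then region guarded by the block-in mask.
//
//   for (int I = 0; I < N; ++I)
//     if (B[I] > 0)
//       A[B[I]] = I;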
8002
8003/// Find all possible partial reductions in the loop and track all of those that
8004/// are valid so recipes can be formed later.
8006 // Find all possible partial reductions.
8008 PartialReductionChains;
8009 for (const auto &[Phi, RdxDesc] : Legal->getReductionVars()) {
8010 getScaledReductions(Phi, RdxDesc.getLoopExitInstr(), Range,
8011 PartialReductionChains);
8012 }
8013
8014 // A partial reduction is invalid if any of its extends are used by
8015 // something that isn't another partial reduction. This is because the
8016 // extends are intended to be lowered along with the reduction itself.
8017
8018 // Build up a set of partial reduction ops for efficient use checking.
8019 SmallPtrSet<User *, 4> PartialReductionOps;
8020 for (const auto &[PartialRdx, _] : PartialReductionChains)
8021 PartialReductionOps.insert(PartialRdx.ExtendUser);
8022
8023 auto ExtendIsOnlyUsedByPartialReductions =
8024 [&PartialReductionOps](Instruction *Extend) {
8025 return all_of(Extend->users(), [&](const User *U) {
8026 return PartialReductionOps.contains(U);
8027 });
8028 };
8029
8030 // Check if each use of a chain's two extends is a partial reduction
8031 // and only add those that don't have non-partial reduction users.
8032 for (auto Pair : PartialReductionChains) {
8033 PartialReductionChain Chain = Pair.first;
8034 if (ExtendIsOnlyUsedByPartialReductions(Chain.ExtendA) &&
8035 (!Chain.ExtendB || ExtendIsOnlyUsedByPartialReductions(Chain.ExtendB)))
8036 ScaledReductionMap.try_emplace(Chain.Reduction, Pair.second);
8037 }
8038
8039 // Check that all partial reductions in a chain are only used by other
8040 // partial reductions with the same scale factor. Otherwise we end up creating
8041 // users of scaled reductions where the types of the other operands don't
8042 // match.
8043 for (const auto &[Chain, Scale] : PartialReductionChains) {
8044 auto AllUsersPartialRdx = [ScaleVal = Scale, this](const User *U) {
8045 auto *UI = cast<Instruction>(U);
8046 if (isa<PHINode>(UI) && UI->getParent() == OrigLoop->getHeader()) {
8047 return all_of(UI->users(), [ScaleVal, this](const User *U) {
8048 auto *UI = cast<Instruction>(U);
8049 return ScaledReductionMap.lookup_or(UI, 0) == ScaleVal;
8050 });
8051 }
8052 return ScaledReductionMap.lookup_or(UI, 0) == ScaleVal ||
8053 !OrigLoop->contains(UI->getParent());
8054 };
8055 if (!all_of(Chain.Reduction->users(), AllUsersPartialRdx))
8056 ScaledReductionMap.erase(Chain.Reduction);
8057 }
8058}
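// For illustration, the kind of chain collected above (a sketch with made-up
// names): an i32 accumulator fed by a product of sign-extended i8 values. The
// scale factor is the ratio of the accumulator width to the extended operand
// width, here 32 / 8 = 4.
//
//   int Sum = 0;
//   for (int I = 0; I < N; ++I)        // A and B are signed char arrays
//     Sum += (int)A[I] * (int)B[I];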
8059
8060bool VPRecipeBuilder::getScaledReductions(
8061 Instruction *PHI, Instruction *RdxExitInstr, VFRange &Range,
8062 SmallVectorImpl<std::pair<PartialReductionChain, unsigned>> &Chains) {
8063 if (!CM.TheLoop->contains(RdxExitInstr))
8064 return false;
8065
8066 auto *Update = dyn_cast<BinaryOperator>(RdxExitInstr);
8067 if (!Update)
8068 return false;
8069
8070 Value *Op = Update->getOperand(0);
8071 Value *PhiOp = Update->getOperand(1);
8072 if (Op == PHI)
8073 std::swap(Op, PhiOp);
8074
8075 // Try and get a scaled reduction from the first non-phi operand.
8076 // If one is found, we use the discovered reduction instruction in
8077 // place of the accumulator for costing.
8078 if (auto *OpInst = dyn_cast<Instruction>(Op)) {
8079 if (getScaledReductions(PHI, OpInst, Range, Chains)) {
8080 PHI = Chains.rbegin()->first.Reduction;
8081
8082 Op = Update->getOperand(0);
8083 PhiOp = Update->getOperand(1);
8084 if (Op == PHI)
8085 std::swap(Op, PhiOp);
8086 }
8087 }
8088 if (PhiOp != PHI)
8089 return false;
8090
8091 using namespace llvm::PatternMatch;
8092
8093 // If the update is a binary operator, check both of its operands to see if
8094 // they are extends. Otherwise, see if the update comes directly from an
8095 // extend.
8096 Instruction *Exts[2] = {nullptr};
8097 BinaryOperator *ExtendUser = dyn_cast<BinaryOperator>(Op);
8098 std::optional<unsigned> BinOpc;
8099 Type *ExtOpTypes[2] = {nullptr};
8101
8102 auto CollectExtInfo = [this, &Exts, &ExtOpTypes,
8103 &ExtKinds](SmallVectorImpl<Value *> &Ops) -> bool {
8104 for (const auto &[I, OpI] : enumerate(Ops)) {
8105 const APInt *C;
8106 if (I > 0 && match(OpI, m_APInt(C)) &&
8107 canConstantBeExtended(C, ExtOpTypes[0], ExtKinds[0])) {
8108 ExtOpTypes[I] = ExtOpTypes[0];
8109 ExtKinds[I] = ExtKinds[0];
8110 continue;
8111 }
8112 Value *ExtOp;
8113 if (!match(OpI, m_ZExtOrSExt(m_Value(ExtOp))))
8114 return false;
8115 Exts[I] = cast<Instruction>(OpI);
8116
8117 // TODO: We should be able to support live-ins.
8118 if (!CM.TheLoop->contains(Exts[I]))
8119 return false;
8120
8121 ExtOpTypes[I] = ExtOp->getType();
8122 ExtKinds[I] = TTI::getPartialReductionExtendKind(Exts[I]);
8123 }
8124 return true;
8125 };
8126
8127 if (ExtendUser) {
8128 if (!ExtendUser->hasOneUse())
8129 return false;
8130
8131    // Rely on the side effect of match to replace ExtendUser with the inner binop
8132    // when the pattern matches; the return value of match itself is not checked.
8133 match(ExtendUser, m_Neg(m_BinOp(ExtendUser)));
8134
8135 SmallVector<Value *> Ops(ExtendUser->operands());
8136 if (!CollectExtInfo(Ops))
8137 return false;
8138
8139 BinOpc = std::make_optional(ExtendUser->getOpcode());
8140 } else if (match(Update, m_Add(m_Value(), m_Value()))) {
8141 // We already know the operands for Update are Op and PhiOp.
8143 if (!CollectExtInfo(Ops))
8144 return false;
8145
8146 ExtendUser = Update;
8147 BinOpc = std::nullopt;
8148 } else
8149 return false;
8150
8151 PartialReductionChain Chain(RdxExitInstr, Exts[0], Exts[1], ExtendUser);
8152
8153 TypeSize PHISize = PHI->getType()->getPrimitiveSizeInBits();
8154 TypeSize ASize = ExtOpTypes[0]->getPrimitiveSizeInBits();
8155 if (!PHISize.hasKnownScalarFactor(ASize))
8156 return false;
8157 unsigned TargetScaleFactor = PHISize.getKnownScalarFactor(ASize);
8158
8160 [&](ElementCount VF) {
8162 Update->getOpcode(), ExtOpTypes[0], ExtOpTypes[1],
8163 PHI->getType(), VF, ExtKinds[0], ExtKinds[1], BinOpc,
8164 CM.CostKind);
8165 return Cost.isValid();
8166 },
8167 Range)) {
8168 Chains.emplace_back(Chain, TargetScaleFactor);
8169 return true;
8170 }
8171
8172 return false;
8173}
8174
8176 VFRange &Range) {
8177 // First, check for specific widening recipes that deal with inductions, Phi
8178 // nodes, calls and memory operations.
8179 VPRecipeBase *Recipe;
8180 if (auto *PhiR = dyn_cast<VPPhi>(R)) {
8181 VPBasicBlock *Parent = PhiR->getParent();
8182 [[maybe_unused]] VPRegionBlock *LoopRegionOf =
8183 Parent->getEnclosingLoopRegion();
8184 assert(LoopRegionOf && LoopRegionOf->getEntry() == Parent &&
8185 "Non-header phis should have been handled during predication");
8186 auto *Phi = cast<PHINode>(R->getUnderlyingInstr());
8187 assert(R->getNumOperands() == 2 && "Must have 2 operands for header phis");
8188 if ((Recipe = tryToOptimizeInductionPHI(PhiR, Range)))
8189 return Recipe;
8190
8191 VPHeaderPHIRecipe *PhiRecipe = nullptr;
8192 assert((Legal->isReductionVariable(Phi) ||
8193 Legal->isFixedOrderRecurrence(Phi)) &&
8194 "can only widen reductions and fixed-order recurrences here");
8195 VPValue *StartV = R->getOperand(0);
8196 if (Legal->isReductionVariable(Phi)) {
8197 const RecurrenceDescriptor &RdxDesc = Legal->getRecurrenceDescriptor(Phi);
8198 assert(RdxDesc.getRecurrenceStartValue() ==
8199 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
8200
8201 // If the PHI is used by a partial reduction, set the scale factor.
8202 unsigned ScaleFactor =
8203 getScalingForReduction(RdxDesc.getLoopExitInstr()).value_or(1);
8204 PhiRecipe = new VPReductionPHIRecipe(
8205 Phi, RdxDesc.getRecurrenceKind(), *StartV, CM.isInLoopReduction(Phi),
8206 CM.useOrderedReductions(RdxDesc), ScaleFactor);
8207 } else {
8208 // TODO: Currently fixed-order recurrences are modeled as chains of
8209 // first-order recurrences. If there are no users of the intermediate
8210 // recurrences in the chain, the fixed order recurrence should be modeled
8211 // directly, enabling more efficient codegen.
8212 PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV);
8213 }
8214 // Add backedge value.
8215 PhiRecipe->addOperand(R->getOperand(1));
8216 return PhiRecipe;
8217 }
8218 assert(!R->isPhi() && "only VPPhi nodes expected at this point");
8219
8220 auto *VPI = cast<VPInstruction>(R);
8221 Instruction *Instr = R->getUnderlyingInstr();
8222 if (VPI->getOpcode() == Instruction::Trunc &&
8223 (Recipe = tryToOptimizeInductionTruncate(VPI, Range)))
8224 return Recipe;
8225
8226 // All widen recipes below deal only with VF > 1.
8228 [&](ElementCount VF) { return VF.isScalar(); }, Range))
8229 return nullptr;
8230
8231 if (VPI->getOpcode() == Instruction::Call)
8232 return tryToWidenCall(VPI, Range);
8233
8234 if (VPI->getOpcode() == Instruction::Store)
8235 if (auto HistInfo = Legal->getHistogramInfo(cast<StoreInst>(Instr)))
8236 return tryToWidenHistogram(*HistInfo, VPI);
8237
8238 if (VPI->getOpcode() == Instruction::Load ||
8239 VPI->getOpcode() == Instruction::Store)
8240 return tryToWidenMemory(VPI, Range);
8241
8242 if (std::optional<unsigned> ScaleFactor = getScalingForReduction(Instr))
8243 return tryToCreatePartialReduction(VPI, ScaleFactor.value());
8244
8245 if (!shouldWiden(Instr, Range))
8246 return nullptr;
8247
8248 if (VPI->getOpcode() == Instruction::GetElementPtr)
8249 return new VPWidenGEPRecipe(cast<GetElementPtrInst>(Instr), R->operands(),
8250 *VPI, VPI->getDebugLoc());
8251
8252 if (VPI->getOpcode() == Instruction::Select)
8253 return new VPWidenSelectRecipe(cast<SelectInst>(Instr), R->operands(), *VPI,
8254 *VPI, VPI->getDebugLoc());
8255
8256 if (Instruction::isCast(VPI->getOpcode())) {
8257 auto *CI = cast<CastInst>(Instr);
8258 auto *CastR = cast<VPInstructionWithType>(VPI);
8259 return new VPWidenCastRecipe(CI->getOpcode(), VPI->getOperand(0),
8260 CastR->getResultType(), CI, *VPI, *VPI,
8261 VPI->getDebugLoc());
8262 }
8263
8264 return tryToWiden(VPI);
8265}
8266
8269 unsigned ScaleFactor) {
8270 assert(Reduction->getNumOperands() == 2 &&
8271 "Unexpected number of operands for partial reduction");
8272
8273 VPValue *BinOp = Reduction->getOperand(0);
8274 VPValue *Accumulator = Reduction->getOperand(1);
8276 std::swap(BinOp, Accumulator);
8277
8278 assert(ScaleFactor ==
8279 vputils::getVFScaleFactor(Accumulator->getDefiningRecipe()) &&
8280 "all accumulators in chain must have same scale factor");
8281
8282 unsigned ReductionOpcode = Reduction->getOpcode();
8283 auto *ReductionI = Reduction->getUnderlyingInstr();
8284 if (ReductionOpcode == Instruction::Sub) {
8285 auto *const Zero = ConstantInt::get(ReductionI->getType(), 0);
8287 Ops.push_back(Plan.getOrAddLiveIn(Zero));
8288 Ops.push_back(BinOp);
8289 BinOp = new VPWidenRecipe(*ReductionI, Ops, VPIRFlags(*ReductionI),
8290 VPIRMetadata(), ReductionI->getDebugLoc());
8291 Builder.insert(BinOp->getDefiningRecipe());
8292 ReductionOpcode = Instruction::Add;
8293 }
8294
8295 VPValue *Cond = nullptr;
8296 if (CM.blockNeedsPredicationForAnyReason(ReductionI->getParent()))
8297 Cond = getBlockInMask(Builder.getInsertBlock());
8298 return new VPPartialReductionRecipe(ReductionOpcode, Accumulator, BinOp, Cond,
8299 ScaleFactor, ReductionI);
8300}
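// A source-level sketch of the Instruction::Sub handling above (illustrative
// only): a subtracting accumulator is rewritten so that the partial reduction
// itself always adds.
//
//   for (int I = 0; I < N; ++I)
//     Sum -= (int)A[I] * (int)B[I];
//   // treated as: Sum += 0 - ((int)A[I] * (int)B[I]);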
8301
8302void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
8303 ElementCount MaxVF) {
8304 if (ElementCount::isKnownGT(MinVF, MaxVF))
8305 return;
8306
8307 assert(OrigLoop->isInnermost() && "Inner loop expected.");
8308
8309 const LoopAccessInfo *LAI = Legal->getLAI();
8311 OrigLoop, LI, DT, PSE.getSE());
8312 if (!LAI->getRuntimePointerChecking()->getChecks().empty() &&
8314 // Only use noalias metadata when using memory checks guaranteeing no
8315 // overlap across all iterations.
8316 LVer.prepareNoAliasMetadata();
8317 }
8318
8319 // Create initial base VPlan0, to serve as common starting point for all
8320 // candidates built later for specific VF ranges.
8321 auto VPlan0 = VPlanTransforms::buildVPlan0(
8322 OrigLoop, *LI, Legal->getWidestInductionType(),
8323 getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()), PSE, &LVer);
8324
8325 auto MaxVFTimes2 = MaxVF * 2;
8326 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) {
8327 VFRange SubRange = {VF, MaxVFTimes2};
8328 if (auto Plan = tryToBuildVPlanWithVPRecipes(
8329 std::unique_ptr<VPlan>(VPlan0->duplicate()), SubRange, &LVer)) {
8330 // Now optimize the initial VPlan.
8332 *Plan, CM.getMinimalBitwidths());
8334 // TODO: try to put it close to addActiveLaneMask().
8335 if (CM.foldTailWithEVL())
8337 *Plan, CM.getMaxSafeElements());
8338 assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
8339 VPlans.push_back(std::move(Plan));
8340 }
8341 VF = SubRange.End;
8342 }
8343}
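// A worked example of the VF sub-range loop above (assuming MinVF = 4 and
// MaxVF = 16, all fixed-width): MaxVFTimes2 = 32, so the first candidate is
// built for SubRange = {4, 32}. Each tryToBuildVPlanWithVPRecipes call clamps
// SubRange.End to the first VF that needs a different plan, so the loop may
// end up producing plans covering, e.g., [4, 8), [8, 16) and [16, 32).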
8344
8345VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
8346 VPlanPtr Plan, VFRange &Range, LoopVersioning *LVer) {
8347
8348 using namespace llvm::VPlanPatternMatch;
8349 SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
8350
8351 // ---------------------------------------------------------------------------
8352 // Build initial VPlan: Scan the body of the loop in a topological order to
8353 // visit each basic block after having visited its predecessor basic blocks.
8354 // ---------------------------------------------------------------------------
8355
8356 bool RequiresScalarEpilogueCheck =
8358 [this](ElementCount VF) {
8359 return !CM.requiresScalarEpilogue(VF.isVector());
8360 },
8361 Range);
8362 VPlanTransforms::handleEarlyExits(*Plan, Legal->hasUncountableEarlyExit());
8363 VPlanTransforms::addMiddleCheck(*Plan, RequiresScalarEpilogueCheck,
8364 CM.foldTailByMasking());
8365
8367
8368  // Don't use getDecisionAndClampRange here, because we don't know the UF, so
8369  // it is better to be conservative here rather than to split this up into
8370  // different VPlans.
8371 // TODO: Consider using getDecisionAndClampRange here to split up VPlans.
8372 bool IVUpdateMayOverflow = false;
8373 for (ElementCount VF : Range)
8374 IVUpdateMayOverflow |= !isIndvarOverflowCheckKnownFalse(&CM, VF);
8375
8376 TailFoldingStyle Style = CM.getTailFoldingStyle(IVUpdateMayOverflow);
8377 // Use NUW for the induction increment if we proved that it won't overflow in
8378  // the vector loop or when not folding the tail. In the latter case, we know
8379 // that the canonical induction increment will not overflow as the vector trip
8380 // count is >= increment and a multiple of the increment.
8381 VPRegionBlock *LoopRegion = Plan->getVectorLoopRegion();
8382 bool HasNUW = !IVUpdateMayOverflow || Style == TailFoldingStyle::None;
8383 if (!HasNUW) {
8384 auto *IVInc =
8385 LoopRegion->getExitingBasicBlock()->getTerminator()->getOperand(0);
8386 assert(match(IVInc,
8387 m_VPInstruction<Instruction::Add>(
8388 m_Specific(LoopRegion->getCanonicalIV()), m_VPValue())) &&
8389 "Did not find the canonical IV increment");
8390 cast<VPRecipeWithIRFlags>(IVInc)->dropPoisonGeneratingFlags();
8391 }
8392
8393 // ---------------------------------------------------------------------------
8394 // Pre-construction: record ingredients whose recipes we'll need to further
8395 // process after constructing the initial VPlan.
8396 // ---------------------------------------------------------------------------
8397
8398 // For each interleave group which is relevant for this (possibly trimmed)
8399 // Range, add it to the set of groups to be later applied to the VPlan and add
8400 // placeholders for its members' Recipes which we'll be replacing with a
8401 // single VPInterleaveRecipe.
8402 for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
8403 auto ApplyIG = [IG, this](ElementCount VF) -> bool {
8404 bool Result = (VF.isVector() && // Query is illegal for VF == 1
8405 CM.getWideningDecision(IG->getInsertPos(), VF) ==
8407 // For scalable vectors, the interleave factors must be <= 8 since we
8408 // require the (de)interleaveN intrinsics instead of shufflevectors.
8409 assert((!Result || !VF.isScalable() || IG->getFactor() <= 8) &&
8410 "Unsupported interleave factor for scalable vectors");
8411 return Result;
8412 };
8413 if (!getDecisionAndClampRange(ApplyIG, Range))
8414 continue;
8415 InterleaveGroups.insert(IG);
8416 }
8417
8418 // ---------------------------------------------------------------------------
8419 // Predicate and linearize the top-level loop region.
8420 // ---------------------------------------------------------------------------
8421 auto BlockMaskCache = VPlanTransforms::introduceMasksAndLinearize(
8422 *Plan, CM.foldTailByMasking());
8423
8424 // ---------------------------------------------------------------------------
8425 // Construct wide recipes and apply predication for original scalar
8426 // VPInstructions in the loop.
8427 // ---------------------------------------------------------------------------
8428 VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, &TTI, Legal, CM, PSE,
8429 Builder, BlockMaskCache);
8430 // TODO: Handle partial reductions with EVL tail folding.
8431 if (!CM.foldTailWithEVL())
8432 RecipeBuilder.collectScaledReductions(Range);
8433
8434 // Scan the body of the loop in a topological order to visit each basic block
8435 // after having visited its predecessor basic blocks.
8436 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
8437 ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
8438 HeaderVPBB);
8439
8440 auto *MiddleVPBB = Plan->getMiddleBlock();
8441 VPBasicBlock::iterator MBIP = MiddleVPBB->getFirstNonPhi();
8442 // Mapping from VPValues in the initial plan to their widened VPValues. Needed
8443 // temporarily to update created block masks.
8444 DenseMap<VPValue *, VPValue *> Old2New;
8445 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
8446 // Convert input VPInstructions to widened recipes.
8447 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
8448 auto *SingleDef = cast<VPSingleDefRecipe>(&R);
8449 auto *UnderlyingValue = SingleDef->getUnderlyingValue();
8450 // Skip recipes that do not need transforming, including canonical IV,
8451 // wide canonical IV and VPInstructions without underlying values. The
8452 // latter are added above for masking.
8453 // FIXME: Migrate code relying on the underlying instruction from VPlan0
8454 // to construct recipes below to not use the underlying instruction.
8456 &R) ||
8457 (isa<VPInstruction>(&R) && !UnderlyingValue))
8458 continue;
8459 assert(isa<VPInstruction>(&R) && UnderlyingValue && "unsupported recipe");
8460
8461 // TODO: Gradually replace uses of underlying instruction by analyses on
8462 // VPlan.
8463 Instruction *Instr = cast<Instruction>(UnderlyingValue);
8464 Builder.setInsertPoint(SingleDef);
8465
8466 // The stores with invariant address inside the loop will be deleted, and
8467 // in the exit block, a uniform store recipe will be created for the final
8468 // invariant store of the reduction.
8469 StoreInst *SI;
8470 if ((SI = dyn_cast<StoreInst>(Instr)) &&
8471 Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) {
8472 // Only create recipe for the final invariant store of the reduction.
8473 if (Legal->isInvariantStoreOfReduction(SI)) {
8474 auto *VPI = cast<VPInstruction>(SingleDef);
8475 auto *Recipe = new VPReplicateRecipe(
8476 SI, R.operands(), true /* IsUniform */, nullptr /*Mask*/, *VPI,
8477 *VPI, VPI->getDebugLoc());
8478 Recipe->insertBefore(*MiddleVPBB, MBIP);
8479 }
8480 R.eraseFromParent();
8481 continue;
8482 }
8483
8484 VPRecipeBase *Recipe =
8485 RecipeBuilder.tryToCreateWidenRecipe(SingleDef, Range);
8486 if (!Recipe)
8487 Recipe = RecipeBuilder.handleReplication(cast<VPInstruction>(SingleDef),
8488 Range);
8489
8490 RecipeBuilder.setRecipe(Instr, Recipe);
8491 if (isa<VPWidenIntOrFpInductionRecipe>(Recipe) && isa<TruncInst>(Instr)) {
8492 // Optimized a truncate to VPWidenIntOrFpInductionRecipe. It needs to be
8493 // moved to the phi section in the header.
8494 Recipe->insertBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi());
8495 } else {
8496 Builder.insert(Recipe);
8497 }
8498 if (Recipe->getNumDefinedValues() == 1) {
8499 SingleDef->replaceAllUsesWith(Recipe->getVPSingleValue());
8500 Old2New[SingleDef] = Recipe->getVPSingleValue();
8501 } else {
8502 assert(Recipe->getNumDefinedValues() == 0 &&
8503 "Unexpected multidef recipe");
8504 R.eraseFromParent();
8505 }
8506 }
8507 }
8508
8509 // replaceAllUsesWith above may invalidate the block masks. Update them here.
8510 // TODO: Include the masks as operands in the predicated VPlan directly
8511 // to remove the need to keep a map of masks beyond the predication
8512 // transform.
8513 RecipeBuilder.updateBlockMaskCache(Old2New);
8514 for (VPValue *Old : Old2New.keys())
8515 Old->getDefiningRecipe()->eraseFromParent();
8516
8517 assert(isa<VPRegionBlock>(LoopRegion) &&
8518 !LoopRegion->getEntryBasicBlock()->empty() &&
8519 "entry block must be set to a VPRegionBlock having a non-empty entry "
8520 "VPBasicBlock");
8521
8522 // TODO: We can't call runPass on these transforms yet, due to verifier
8523 // failures.
8525 DenseMap<VPValue *, VPValue *> IVEndValues;
8526 VPlanTransforms::addScalarResumePhis(*Plan, RecipeBuilder, IVEndValues);
8527
8528 // ---------------------------------------------------------------------------
8529 // Transform initial VPlan: Apply previously taken decisions, in order, to
8530 // bring the VPlan to its final state.
8531 // ---------------------------------------------------------------------------
8532
8533 // Adjust the recipes for any inloop reductions.
8534 adjustRecipesForReductions(Plan, RecipeBuilder, Range.Start);
8535
8536 // Apply mandatory transformation to handle FP maxnum/minnum reduction with
8537 // NaNs if possible, bail out otherwise.
8539 *Plan))
8540 return nullptr;
8541
8542 // Transform recipes to abstract recipes if it is legal and beneficial and
8543 // clamp the range for better cost estimation.
8544 // TODO: Enable following transform when the EVL-version of extended-reduction
8545 // and mulacc-reduction are implemented.
8546 if (!CM.foldTailWithEVL()) {
8547 VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind,
8548 *CM.PSE.getSE(), OrigLoop);
8550 CostCtx, Range);
8551 }
8552
8553 for (ElementCount VF : Range)
8554 Plan->addVF(VF);
8555 Plan->setName("Initial VPlan");
8556
8557 // Interleave memory: for each Interleave Group we marked earlier as relevant
8558 // for this VPlan, replace the Recipes widening its memory instructions with a
8559 // single VPInterleaveRecipe at its insertion point.
8561 InterleaveGroups, RecipeBuilder,
8562 CM.isScalarEpilogueAllowed());
8563
8564 // Replace VPValues for known constant strides.
8566 Legal->getLAI()->getSymbolicStrides());
8567
8568 auto BlockNeedsPredication = [this](BasicBlock *BB) {
8569 return Legal->blockNeedsPredication(BB);
8570 };
8572 BlockNeedsPredication);
8573
8574 // Sink users of fixed-order recurrence past the recipe defining the previous
8575 // value and introduce FirstOrderRecurrenceSplice VPInstructions.
8577 *Plan, Builder))
8578 return nullptr;
8579
8580 if (useActiveLaneMask(Style)) {
8581 // TODO: Move checks to VPlanTransforms::addActiveLaneMask once
8582 // TailFoldingStyle is visible there.
8583 bool ForControlFlow = useActiveLaneMaskForControlFlow(Style);
8584 bool WithoutRuntimeCheck =
8585 Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
8586 VPlanTransforms::addActiveLaneMask(*Plan, ForControlFlow,
8587 WithoutRuntimeCheck);
8588 }
8589 VPlanTransforms::optimizeInductionExitUsers(*Plan, IVEndValues, *PSE.getSE());
8590
8591 assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
8592 return Plan;
8593}
8594
8595VPlanPtr LoopVectorizationPlanner::tryToBuildVPlan(VFRange &Range) {
8596  // Outer loop handling: outer loops may require CFG and instruction level
8597 // transformations before even evaluating whether vectorization is profitable.
8598 // Since we cannot modify the incoming IR, we need to build VPlan upfront in
8599 // the vectorization pipeline.
8600 assert(!OrigLoop->isInnermost());
8601 assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
8602
8603 auto Plan = VPlanTransforms::buildVPlan0(
8604 OrigLoop, *LI, Legal->getWidestInductionType(),
8605 getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()), PSE);
8607 /*HasUncountableExit*/ false);
8608 VPlanTransforms::addMiddleCheck(*Plan, /*RequiresScalarEpilogue*/ true,
8609 /*TailFolded*/ false);
8610
8612
8613 for (ElementCount VF : Range)
8614 Plan->addVF(VF);
8615
8617 *Plan,
8618 [this](PHINode *P) {
8619 return Legal->getIntOrFpInductionDescriptor(P);
8620 },
8621 *TLI))
8622 return nullptr;
8623
8624 // Collect mapping of IR header phis to header phi recipes, to be used in
8625 // addScalarResumePhis.
8626 DenseMap<VPBasicBlock *, VPValue *> BlockMaskCache;
8627 VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, &TTI, Legal, CM, PSE,
8628 Builder, BlockMaskCache);
8629 for (auto &R : Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
8631 continue;
8632 auto *HeaderR = cast<VPHeaderPHIRecipe>(&R);
8633 RecipeBuilder.setRecipe(HeaderR->getUnderlyingInstr(), HeaderR);
8634 }
8635 DenseMap<VPValue *, VPValue *> IVEndValues;
8636 // TODO: IVEndValues are not used yet in the native path, to optimize exit
8637 // values.
8638 // TODO: We can't call runPass on the transform yet, due to verifier
8639 // failures.
8640 VPlanTransforms::addScalarResumePhis(*Plan, RecipeBuilder, IVEndValues);
8641
8642 assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
8643 return Plan;
8644}
8645
8646// Adjust the recipes for reductions. For in-loop reductions the chain of
8647// instructions leading from the loop exit instr to the phi needs to be converted
8648// to reductions, with one operand being vector and the other being the scalar
8649// reduction chain. For other reductions, a select is introduced between the phi
8650// and users outside the vector region when folding the tail.
8651//
8652// A ComputeReductionResult recipe is added to the middle block, also for
8653// in-loop reductions which compute their result in-loop, because generating
8654// the subsequent bc.merge.rdx phi is driven by ComputeReductionResult recipes.
8655//
8656// Adjust AnyOf reductions; replace the reduction phi for the selected value
8657// with a boolean reduction phi node to check if the condition is true in any
8658// iteration. The final value is selected by the final ComputeReductionResult.
8659void LoopVectorizationPlanner::adjustRecipesForReductions(
8660 VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, ElementCount MinVF) {
8661 using namespace VPlanPatternMatch;
8662 VPTypeAnalysis TypeInfo(*Plan);
8663 VPRegionBlock *VectorLoopRegion = Plan->getVectorLoopRegion();
8664 VPBasicBlock *Header = VectorLoopRegion->getEntryBasicBlock();
8665 VPBasicBlock *MiddleVPBB = Plan->getMiddleBlock();
8667
8668 for (VPRecipeBase &R : Header->phis()) {
8669 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
8670 if (!PhiR || !PhiR->isInLoop() || (MinVF.isScalar() && !PhiR->isOrdered()))
8671 continue;
8672
8673 RecurKind Kind = PhiR->getRecurrenceKind();
8674 assert(
8677 "AnyOf and FindIV reductions are not allowed for in-loop reductions");
8678
8679 // Collect the chain of "link" recipes for the reduction starting at PhiR.
8680 SetVector<VPSingleDefRecipe *> Worklist;
8681 Worklist.insert(PhiR);
8682 for (unsigned I = 0; I != Worklist.size(); ++I) {
8683 VPSingleDefRecipe *Cur = Worklist[I];
8684 for (VPUser *U : Cur->users()) {
8685 auto *UserRecipe = cast<VPSingleDefRecipe>(U);
8686 if (!UserRecipe->getParent()->getEnclosingLoopRegion()) {
8687 assert((UserRecipe->getParent() == MiddleVPBB ||
8688 UserRecipe->getParent() == Plan->getScalarPreheader()) &&
8689 "U must be either in the loop region, the middle block or the "
8690 "scalar preheader.");
8691 continue;
8692 }
8693 Worklist.insert(UserRecipe);
8694 }
8695 }
8696
8697 // Visit operation "Links" along the reduction chain top-down starting from
8698 // the phi until LoopExitValue. We keep track of the previous item
8699 // (PreviousLink) to tell which of the two operands of a Link will remain
8700 // scalar and which will be reduced. For minmax by select(cmp), Link will be
8701 // the select instructions. Blend recipes of in-loop reduction phi's will
8702 // get folded to their non-phi operand, as the reduction recipe handles the
8703 // condition directly.
8704 VPSingleDefRecipe *PreviousLink = PhiR; // Aka Worklist[0].
8705 for (VPSingleDefRecipe *CurrentLink : drop_begin(Worklist)) {
8706 if (auto *Blend = dyn_cast<VPBlendRecipe>(CurrentLink)) {
8707 assert(Blend->getNumIncomingValues() == 2 &&
8708 "Blend must have 2 incoming values");
8709 if (Blend->getIncomingValue(0) == PhiR) {
8710 Blend->replaceAllUsesWith(Blend->getIncomingValue(1));
8711 } else {
8712 assert(Blend->getIncomingValue(1) == PhiR &&
8713 "PhiR must be an operand of the blend");
8714 Blend->replaceAllUsesWith(Blend->getIncomingValue(0));
8715 }
8716 continue;
8717 }
8718
8719 Instruction *CurrentLinkI = CurrentLink->getUnderlyingInstr();
8720
8721 // Index of the first operand which holds a non-mask vector operand.
8722 unsigned IndexOfFirstOperand;
8723 // Recognize a call to the llvm.fmuladd intrinsic.
8724 bool IsFMulAdd = (Kind == RecurKind::FMulAdd);
8725 VPValue *VecOp;
8726 VPBasicBlock *LinkVPBB = CurrentLink->getParent();
8727 if (IsFMulAdd) {
8728 assert(
8730 "Expected instruction to be a call to the llvm.fmuladd intrinsic");
8731 assert(((MinVF.isScalar() && isa<VPReplicateRecipe>(CurrentLink)) ||
8732 isa<VPWidenIntrinsicRecipe>(CurrentLink)) &&
8733 CurrentLink->getOperand(2) == PreviousLink &&
8734 "expected a call where the previous link is the added operand");
8735
8736 // If the instruction is a call to the llvm.fmuladd intrinsic then we
8737 // need to create an fmul recipe (multiplying the first two operands of
8738 // the fmuladd together) to use as the vector operand for the fadd
8739 // reduction.
8740 VPInstruction *FMulRecipe = new VPInstruction(
8741 Instruction::FMul,
8742 {CurrentLink->getOperand(0), CurrentLink->getOperand(1)},
8743 CurrentLinkI->getFastMathFlags());
8744 LinkVPBB->insert(FMulRecipe, CurrentLink->getIterator());
8745 VecOp = FMulRecipe;
8746 } else if (PhiR->isInLoop() && Kind == RecurKind::AddChainWithSubs &&
8747 match(CurrentLink, m_Sub(m_VPValue(), m_VPValue()))) {
8748 Type *PhiTy = TypeInfo.inferScalarType(PhiR);
8749 auto *Zero = Plan->getConstantInt(PhiTy, 0);
8750 VPWidenRecipe *Sub = new VPWidenRecipe(
8751 Instruction::Sub, {Zero, CurrentLink->getOperand(1)}, {},
8752 VPIRMetadata(), CurrentLinkI->getDebugLoc());
8753 Sub->setUnderlyingValue(CurrentLinkI);
8754 LinkVPBB->insert(Sub, CurrentLink->getIterator());
8755 VecOp = Sub;
8756 } else {
8758 if (match(CurrentLink, m_Cmp(m_VPValue(), m_VPValue())))
8759 continue;
8760 assert(isa<VPWidenSelectRecipe>(CurrentLink) &&
8761 "must be a select recipe");
8762 IndexOfFirstOperand = 1;
8763 } else {
8764 assert((MinVF.isScalar() || isa<VPWidenRecipe>(CurrentLink)) &&
8765 "Expected to replace a VPWidenSC");
8766 IndexOfFirstOperand = 0;
8767 }
8768 // Note that for non-commutable operands (cmp-selects), the semantics of
8769 // the cmp-select are captured in the recurrence kind.
8770 unsigned VecOpId =
8771 CurrentLink->getOperand(IndexOfFirstOperand) == PreviousLink
8772 ? IndexOfFirstOperand + 1
8773 : IndexOfFirstOperand;
8774 VecOp = CurrentLink->getOperand(VecOpId);
8775 assert(VecOp != PreviousLink &&
8776 CurrentLink->getOperand(CurrentLink->getNumOperands() - 1 -
8777 (VecOpId - IndexOfFirstOperand)) ==
8778 PreviousLink &&
8779 "PreviousLink must be the operand other than VecOp");
8780 }
8781
8782 VPValue *CondOp = nullptr;
8783 if (CM.blockNeedsPredicationForAnyReason(CurrentLinkI->getParent()))
8784 CondOp = RecipeBuilder.getBlockInMask(CurrentLink->getParent());
8785
8786 // TODO: Retrieve FMFs from recipes directly.
8787 RecurrenceDescriptor RdxDesc = Legal->getRecurrenceDescriptor(
8788 cast<PHINode>(PhiR->getUnderlyingInstr()));
8789 // Non-FP RdxDescs will have all fast math flags set, so clear them.
8790 FastMathFlags FMFs = isa<FPMathOperator>(CurrentLinkI)
8791 ? RdxDesc.getFastMathFlags()
8792 : FastMathFlags();
8793 auto *RedRecipe = new VPReductionRecipe(
8794 Kind, FMFs, CurrentLinkI, PreviousLink, VecOp, CondOp,
8795 PhiR->isOrdered(), CurrentLinkI->getDebugLoc());
8796 // Append the recipe to the end of the VPBasicBlock because we need to
8797      // ensure that it comes after all of its inputs, including CondOp.
8798 // Delete CurrentLink as it will be invalid if its operand is replaced
8799 // with a reduction defined at the bottom of the block in the next link.
8800 if (LinkVPBB->getNumSuccessors() == 0)
8801 RedRecipe->insertBefore(&*std::prev(std::prev(LinkVPBB->end())));
8802 else
8803 LinkVPBB->appendRecipe(RedRecipe);
8804
8805 CurrentLink->replaceAllUsesWith(RedRecipe);
8806 ToDelete.push_back(CurrentLink);
8807 PreviousLink = RedRecipe;
8808 }
8809 }
8810 VPBasicBlock *LatchVPBB = VectorLoopRegion->getExitingBasicBlock();
8811 Builder.setInsertPoint(&*std::prev(std::prev(LatchVPBB->end())));
8812 VPBasicBlock::iterator IP = MiddleVPBB->getFirstNonPhi();
8813 for (VPRecipeBase &R :
8814 Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
8815 VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
8816 if (!PhiR)
8817 continue;
8818
8819 const RecurrenceDescriptor &RdxDesc = Legal->getRecurrenceDescriptor(
8821 Type *PhiTy = TypeInfo.inferScalarType(PhiR);
8822 // If tail is folded by masking, introduce selects between the phi
8823 // and the users outside the vector region of each reduction, at the
8824 // beginning of the dedicated latch block.
8825 auto *OrigExitingVPV = PhiR->getBackedgeValue();
8826 auto *NewExitingVPV = PhiR->getBackedgeValue();
8827 // Don't output selects for partial reductions because they have an output
8828 // with fewer lanes than the VF. So the operands of the select would have
8829 // different numbers of lanes. Partial reductions mask the input instead.
8830 if (!PhiR->isInLoop() && CM.foldTailByMasking() &&
8831 !isa<VPPartialReductionRecipe>(OrigExitingVPV)) {
8832 VPValue *Cond = RecipeBuilder.getBlockInMask(PhiR->getParent());
8833 std::optional<FastMathFlags> FMFs =
8834 PhiTy->isFloatingPointTy()
8835 ? std::make_optional(RdxDesc.getFastMathFlags())
8836 : std::nullopt;
8837 NewExitingVPV =
8838 Builder.createSelect(Cond, OrigExitingVPV, PhiR, {}, "", FMFs);
8839 OrigExitingVPV->replaceUsesWithIf(NewExitingVPV, [](VPUser &U, unsigned) {
8840 return isa<VPInstruction>(&U) &&
8841 (cast<VPInstruction>(&U)->getOpcode() ==
8843 cast<VPInstruction>(&U)->getOpcode() ==
8845 cast<VPInstruction>(&U)->getOpcode() ==
8847 });
8848 if (CM.usePredicatedReductionSelect())
8849 PhiR->setOperand(1, NewExitingVPV);
8850 }
8851
8852    // We want code in the middle block to appear to execute at the location of
8853 // the scalar loop's latch terminator because: (a) it is all compiler
8854 // generated, (b) these instructions are always executed after evaluating
8855 // the latch conditional branch, and (c) other passes may add new
8856 // predecessors which terminate on this line. This is the easiest way to
8857 // ensure we don't accidentally cause an extra step back into the loop while
8858 // debugging.
8859 DebugLoc ExitDL = OrigLoop->getLoopLatch()->getTerminator()->getDebugLoc();
8860
8861 // TODO: At the moment ComputeReductionResult also drives creation of the
8862 // bc.merge.rdx phi nodes, hence it needs to be created unconditionally here
8863 // even for in-loop reductions, until the reduction resume value handling is
8864 // also modeled in VPlan.
8865 VPInstruction *FinalReductionResult;
8866 VPBuilder::InsertPointGuard Guard(Builder);
8867 Builder.setInsertPoint(MiddleVPBB, IP);
8868 RecurKind RecurrenceKind = PhiR->getRecurrenceKind();
8870 VPValue *Start = PhiR->getStartValue();
8871 VPValue *Sentinel = Plan->getOrAddLiveIn(RdxDesc.getSentinelValue());
8872 FinalReductionResult =
8873 Builder.createNaryOp(VPInstruction::ComputeFindIVResult,
8874 {PhiR, Start, Sentinel, NewExitingVPV}, ExitDL);
8875 } else if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RecurrenceKind)) {
8876 VPValue *Start = PhiR->getStartValue();
8877 FinalReductionResult =
8878 Builder.createNaryOp(VPInstruction::ComputeAnyOfResult,
8879 {PhiR, Start, NewExitingVPV}, ExitDL);
8880 } else {
8881 VPIRFlags Flags =
8883 ? VPIRFlags(RdxDesc.getFastMathFlags())
8884 : VPIRFlags();
8885 FinalReductionResult =
8886 Builder.createNaryOp(VPInstruction::ComputeReductionResult,
8887 {PhiR, NewExitingVPV}, Flags, ExitDL);
8888 }
8889 // If the vector reduction can be performed in a smaller type, we truncate
8890 // then extend the loop exit value to enable InstCombine to evaluate the
8891 // entire expression in the smaller type.
8892 if (MinVF.isVector() && PhiTy != RdxDesc.getRecurrenceType() &&
8894 assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
8896 "Unexpected truncated min-max recurrence!");
8897 Type *RdxTy = RdxDesc.getRecurrenceType();
8898 VPWidenCastRecipe *Trunc;
8899 Instruction::CastOps ExtendOpc =
8900 RdxDesc.isSigned() ? Instruction::SExt : Instruction::ZExt;
8901 VPWidenCastRecipe *Extnd;
8902 {
8903 VPBuilder::InsertPointGuard Guard(Builder);
8904 Builder.setInsertPoint(
8905 NewExitingVPV->getDefiningRecipe()->getParent(),
8906 std::next(NewExitingVPV->getDefiningRecipe()->getIterator()));
8907 Trunc =
8908 Builder.createWidenCast(Instruction::Trunc, NewExitingVPV, RdxTy);
8909 Extnd = Builder.createWidenCast(ExtendOpc, Trunc, PhiTy);
8910 }
8911 if (PhiR->getOperand(1) == NewExitingVPV)
8912 PhiR->setOperand(1, Extnd->getVPSingleValue());
8913
8914 // Update ComputeReductionResult with the truncated exiting value and
8915 // extend its result.
8916 FinalReductionResult->setOperand(1, Trunc);
8917 FinalReductionResult =
8918 Builder.createScalarCast(ExtendOpc, FinalReductionResult, PhiTy, {});
8919 }
8920
8921 // Update all users outside the vector region. Also replace redundant
8922 // ExtractLastElement.
8923 for (auto *U : to_vector(OrigExitingVPV->users())) {
8924 auto *Parent = cast<VPRecipeBase>(U)->getParent();
8925 if (FinalReductionResult == U || Parent->getParent())
8926 continue;
8927 U->replaceUsesOfWith(OrigExitingVPV, FinalReductionResult);
8929 cast<VPInstruction>(U)->replaceAllUsesWith(FinalReductionResult);
8930 }
8931
8932 // Adjust AnyOf reductions; replace the reduction phi for the selected value
8933 // with a boolean reduction phi node to check if the condition is true in
8934 // any iteration. The final value is selected by the final
8935 // ComputeReductionResult.
8936 if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RecurrenceKind)) {
8937 auto *Select = cast<VPRecipeBase>(*find_if(PhiR->users(), [](VPUser *U) {
8938 return isa<VPWidenSelectRecipe>(U) ||
8939 (isa<VPReplicateRecipe>(U) &&
8940 cast<VPReplicateRecipe>(U)->getUnderlyingInstr()->getOpcode() ==
8941 Instruction::Select);
8942 }));
8943 VPValue *Cmp = Select->getOperand(0);
8944 // If the compare is checking the reduction PHI node, adjust it to check
8945 // the start value.
8946 if (VPRecipeBase *CmpR = Cmp->getDefiningRecipe())
8947 CmpR->replaceUsesOfWith(PhiR, PhiR->getStartValue());
8948 Builder.setInsertPoint(Select);
8949
8950 // If the true value of the select is the reduction phi, the new value is
8951 // selected if the negated condition is true in any iteration.
8952 if (Select->getOperand(1) == PhiR)
8953 Cmp = Builder.createNot(Cmp);
8954 VPValue *Or = Builder.createOr(PhiR, Cmp);
8955 Select->getVPSingleValue()->replaceAllUsesWith(Or);
8956 // Delete Select now that it has invalid types.
8957 ToDelete.push_back(Select);
8958
8959 // Convert the reduction phi to operate on bools.
8960 PhiR->setOperand(0, Plan->getFalse());
8961 continue;
8962 }
8963
8965 RdxDesc.getRecurrenceKind())) {
8966 // Adjust the start value for FindFirstIV/FindLastIV recurrences to use
8967 // the sentinel value after generating the ResumePhi recipe, which uses
8968 // the original start value.
8969 PhiR->setOperand(0, Plan->getOrAddLiveIn(RdxDesc.getSentinelValue()));
8970 }
8971 RecurKind RK = RdxDesc.getRecurrenceKind();
8975 VPBuilder PHBuilder(Plan->getVectorPreheader());
8976 VPValue *Iden = Plan->getOrAddLiveIn(
8977 getRecurrenceIdentity(RK, PhiTy, RdxDesc.getFastMathFlags()));
8978 // If the PHI is used by a partial reduction, set the scale factor.
8979 unsigned ScaleFactor =
8980 RecipeBuilder.getScalingForReduction(RdxDesc.getLoopExitInstr())
8981 .value_or(1);
8982 auto *ScaleFactorVPV = Plan->getConstantInt(32, ScaleFactor);
8983 VPValue *StartV = PHBuilder.createNaryOp(
8985 {PhiR->getStartValue(), Iden, ScaleFactorVPV},
8986 PhiTy->isFloatingPointTy() ? RdxDesc.getFastMathFlags()
8987 : FastMathFlags());
8988 PhiR->setOperand(0, StartV);
8989 }
8990 }
8991 for (VPRecipeBase *R : ToDelete)
8992 R->eraseFromParent();
8993
8995}
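// For illustration, an AnyOf reduction of the shape adjusted above (a sketch;
// Found and Threshold are made-up names):
//
//   int Found = 0;
//   for (int I = 0; I < N; ++I)
//     if (A[I] > Threshold)
//       Found = 1;               // select(cmp, 1, Found) in the scalar loop
//
// The reduction phi is converted to operate on booleans by or-ing the compare
// results, and the final value is selected by ComputeAnyOfResult in the middle
// block.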
8996
8997void LoopVectorizationPlanner::attachRuntimeChecks(
8998 VPlan &Plan, GeneratedRTChecks &RTChecks, bool HasBranchWeights) const {
8999 const auto &[SCEVCheckCond, SCEVCheckBlock] = RTChecks.getSCEVChecks();
9000 if (SCEVCheckBlock && SCEVCheckBlock->hasNPredecessors(0)) {
9001 assert((!CM.OptForSize ||
9002 CM.Hints->getForce() == LoopVectorizeHints::FK_Enabled) &&
9003 "Cannot SCEV check stride or overflow when optimizing for size");
9004 VPlanTransforms::attachCheckBlock(Plan, SCEVCheckCond, SCEVCheckBlock,
9005 HasBranchWeights);
9006 }
9007 const auto &[MemCheckCond, MemCheckBlock] = RTChecks.getMemRuntimeChecks();
9008 if (MemCheckBlock && MemCheckBlock->hasNPredecessors(0)) {
9009 // VPlan-native path does not do any analysis for runtime checks
9010 // currently.
9011 assert((!EnableVPlanNativePath || OrigLoop->isInnermost()) &&
9012 "Runtime checks are not supported for outer loops yet");
9013
9014 if (CM.OptForSize) {
9015 assert(
9016 CM.Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
9017 "Cannot emit memory checks when optimizing for size, unless forced "
9018 "to vectorize.");
9019 ORE->emit([&]() {
9020 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
9021 OrigLoop->getStartLoc(),
9022 OrigLoop->getHeader())
9023 << "Code-size may be reduced by not forcing "
9024 "vectorization, or by source-code modifications "
9025 "eliminating the need for runtime checks "
9026 "(e.g., adding 'restrict').";
9027 });
9028 }
9029 VPlanTransforms::attachCheckBlock(Plan, MemCheckCond, MemCheckBlock,
9030 HasBranchWeights);
9031 }
9032}
9033
9034 void LoopVectorizationPlanner::addMinimumIterationCheck(
9035     VPlan &Plan, ElementCount VF, unsigned UF,
9036 ElementCount MinProfitableTripCount) const {
9037 // vscale is not necessarily a power-of-2, which means we cannot guarantee
9038 // an overflow to zero when updating induction variables and so an
9039 // additional overflow check is required before entering the vector loop.
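// Illustrative example (assumption, not from the original source): with
// VF = vscale x 4, UF = 2 and, say, vscale = 3 at runtime, the induction step
// is 24 elements. Since 24 is not a power of two, the 2^N value range of the
// induction variable is not a multiple of the step, so wrap-around cannot be
// guaranteed to land exactly on zero; hence the extra overflow check requested
// below. A power-of-two vscale keeps the step a power of two and makes the
// check unnecessary.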
9040 bool IsIndvarOverflowCheckNeededForVF =
9041 VF.isScalable() && !TTI.isVScaleKnownToBeAPowerOfTwo() &&
9042 !isIndvarOverflowCheckKnownFalse(&CM, VF, UF) &&
9043 CM.getTailFoldingStyle() !=
9045   const uint32_t *BranchWeights =
9046       hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())
9047           ? MinItersBypassWeights
9048           : nullptr;
9050 Plan, VF, UF, MinProfitableTripCount,
9051 CM.requiresScalarEpilogue(VF.isVector()), CM.foldTailByMasking(),
9052       IsIndvarOverflowCheckNeededForVF, OrigLoop, BranchWeights,
9053 OrigLoop->getLoopPredecessor()->getTerminator()->getDebugLoc(),
9054 *PSE.getSE());
9055}
9056
9057 void VPDerivedIVRecipe::execute(VPTransformState &State) {
9058   assert(!State.Lane && "VPDerivedIVRecipe being replicated.");
9059
9060 // Fast-math-flags propagate from the original induction instruction.
9061 IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
9062 if (FPBinOp)
9063 State.Builder.setFastMathFlags(FPBinOp->getFastMathFlags());
9064
9065 Value *Step = State.get(getStepValue(), VPLane(0));
9066 Value *Index = State.get(getOperand(1), VPLane(0));
9067 Value *DerivedIV = emitTransformedIndex(
9068 State.Builder, Index, getStartValue()->getLiveInIRValue(), Step, Kind,
9070 DerivedIV->setName(Name);
9071 State.set(this, DerivedIV, VPLane(0));
9072}
9073
9074// Determine how to lower the scalar epilogue, which depends on 1) optimising
9075// for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
9076// predication, and 4) a TTI hook that analyses whether the loop is suitable
9077// for predication.
9082 // 1) OptSize takes precedence over all other options, i.e. if this is set,
9083 // don't look at hints or options, and don't request a scalar epilogue.
9084 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
9085 // LoopAccessInfo (due to code dependency and not being able to reliably get
9086 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
9087 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
9088 // versioning when the vectorization is forced, unlike hasOptSize. So revert
9089 // back to the old way and vectorize with versioning when forced. See D81345.)
9090 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
9094
9095 // 2) If set, obey the directives
9096 if (PreferPredicateOverEpilogue.getNumOccurrences()) {
9104 };
9105 }
9106
9107 // 3) If set, obey the hints
9108 switch (Hints.getPredicate()) {
9113 };
9114
9115 // 4) if the TTI hook indicates this is profitable, request predication.
9116 TailFoldingInfo TFI(TLI, &LVL, IAI);
9117 if (TTI->preferPredicateOverEpilogue(&TFI))
9119
9121}
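// Illustrative recap (not from the original source; restates the comments
// above): the precedence is 1) optimizing for size (hasOptSize / PGSO), 2) the
// PreferPredicateOverEpilogue command-line option, 3) the loop's predication
// hint (Hints.getPredicate()), and 4) the TTI->preferPredicateOverEpilogue()
// hook. For example, a function compiled for minimum size is never given a
// scalar epilogue regardless of hints, while a loop hint requesting
// predication is honored even if the TTI hook would not ask for it.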
9122
9123// Process the loop in the VPlan-native vectorization path. This path builds
9124 // VPlan upfront in the vectorization pipeline, which allows applying
9125// VPlan-to-VPlan transformations from the very beginning without modifying the
9126// input LLVM IR.
9133 LoopVectorizationRequirements &Requirements) {
9134
9136 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
9137 return false;
9138 }
9139 assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
9140 Function *F = L->getHeader()->getParent();
9141 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
9142
9144 getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, *LVL, &IAI);
9145
9146 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
9147 &Hints, IAI, PSI, BFI);
9148 // Use the planner for outer loop vectorization.
9149 // TODO: CM is not used at this point inside the planner. Turn CM into an
9150 // optional argument if we don't need it in the future.
9151 LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, LVL, CM, IAI, PSE, Hints,
9152 ORE);
9153
9154 // Get user vectorization factor.
9155 ElementCount UserVF = Hints.getWidth();
9156
9158
9159 // Plan how to best vectorize, return the best VF and its cost.
9160 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
9161
9162 // If we are stress testing VPlan builds, do not attempt to generate vector
9163 // code. Masked vector code generation support will follow soon.
9164 // Also, do not attempt to vectorize if no vector code will be produced.
9166 return false;
9167
9168 VPlan &BestPlan = LVP.getPlanFor(VF.Width);
9169
9170 {
9171 GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(), CM.CostKind);
9172 InnerLoopVectorizer LB(L, PSE, LI, DT, TTI, AC, VF.Width, /*UF=*/1, &CM,
9173 BFI, PSI, Checks, BestPlan);
9174 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
9175 << L->getHeader()->getParent()->getName() << "\"\n");
9176 LVP.addMinimumIterationCheck(BestPlan, VF.Width, /*UF=*/1,
9178
9179 LVP.executePlan(VF.Width, /*UF=*/1, BestPlan, LB, DT, false);
9180 }
9181
9182 reportVectorization(ORE, L, VF, 1);
9183
9184 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
9185 return true;
9186}
9187
9188// Emit a remark if there are stores to floats that required a floating point
9189 // extension. If the vectorized loop was generated with floating point, there
9190// will be a performance penalty from the conversion overhead and the change in
9191// the vector width.
9194 for (BasicBlock *BB : L->getBlocks()) {
9195 for (Instruction &Inst : *BB) {
9196 if (auto *S = dyn_cast<StoreInst>(&Inst)) {
9197 if (S->getValueOperand()->getType()->isFloatTy())
9198 Worklist.push_back(S);
9199 }
9200 }
9201 }
9202
9203   // Traverse the floating point stores upwards, searching for floating point
9204   // conversions.
9207 while (!Worklist.empty()) {
9208 auto *I = Worklist.pop_back_val();
9209 if (!L->contains(I))
9210 continue;
9211 if (!Visited.insert(I).second)
9212 continue;
9213
9214 // Emit a remark if the floating point store required a floating
9215 // point conversion.
9216 // TODO: More work could be done to identify the root cause such as a
9217 // constant or a function return type and point the user to it.
9218 if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second)
9219 ORE->emit([&]() {
9220 return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision",
9221 I->getDebugLoc(), L->getHeader())
9222 << "floating point conversion changes vector width. "
9223 << "Mixed floating point precision requires an up/down "
9224 << "cast that will negatively impact performance.";
9225 });
9226
9227 for (Use &Op : I->operands())
9228 if (auto *OpI = dyn_cast<Instruction>(Op))
9229 Worklist.push_back(OpI);
9230 }
9231}
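// Illustrative example (not from the original source; assumed code): a typical
// source pattern triggering the remark above is
//
//   float a[N], b[N];
//   for (int i = 0; i < n; ++i)
//     a[i] = b[i] + 1.0;   // 1.0 is a double constant
//
// Here b[i] is extended to double (fpext), added, truncated back to float and
// stored. Walking up from the float store finds the fpext and emits the
// VectorMixedPrecision remark, since the double-precision arithmetic halves
// the number of lanes per vector register.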
9232
9233/// For loops with uncountable early exits, find the cost of doing work when
9234/// exiting the loop early, such as calculating the final exit values of
9235/// variables used outside the loop.
9236/// TODO: This is currently overly pessimistic because the loop may not take
9237/// the early exit, but better to keep this conservative for now. In future,
9238/// it might be possible to relax this by using branch probabilities.
9240 VPlan &Plan, ElementCount VF) {
9241 InstructionCost Cost = 0;
9242 for (auto *ExitVPBB : Plan.getExitBlocks()) {
9243 for (auto *PredVPBB : ExitVPBB->getPredecessors()) {
9244 // If the predecessor is not the middle.block, then it must be the
9245 // vector.early.exit block, which may contain work to calculate the exit
9246 // values of variables used outside the loop.
9247 if (PredVPBB != Plan.getMiddleBlock()) {
9248 LLVM_DEBUG(dbgs() << "Calculating cost of work in exit block "
9249 << PredVPBB->getName() << ":\n");
9250 Cost += PredVPBB->cost(VF, CostCtx);
9251 }
9252 }
9253 }
9254 return Cost;
9255}
9256
9257/// This function determines whether or not it's still profitable to vectorize
9258/// the loop given the extra work we have to do outside of the loop:
9259/// 1. Perform the runtime checks before entering the loop to ensure it's safe
9260/// to vectorize.
9261/// 2. In the case of loops with uncountable early exits, we may have to do
9262/// extra work when exiting the loop early, such as calculating the final
9263/// exit values of variables used outside the loop.
9264static bool isOutsideLoopWorkProfitable(GeneratedRTChecks &Checks,
9265 VectorizationFactor &VF, Loop *L,
9267 VPCostContext &CostCtx, VPlan &Plan,
9269 std::optional<unsigned> VScale) {
9270 InstructionCost TotalCost = Checks.getCost();
9271 if (!TotalCost.isValid())
9272 return false;
9273
9274 // Add on the cost of any work required in the vector early exit block, if
9275 // one exists.
9276 TotalCost += calculateEarlyExitCost(CostCtx, Plan, VF.Width);
9277
9278   // When interleaving only, the scalar and vector costs will be equal, which
9279   // in turn would lead to a divide by 0. Fall back to a hard threshold.
9280 if (VF.Width.isScalar()) {
9281 // TODO: Should we rename VectorizeMemoryCheckThreshold?
9282 if (TotalCost > VectorizeMemoryCheckThreshold) {
9283 LLVM_DEBUG(
9284 dbgs()
9285 << "LV: Interleaving only is not profitable due to runtime checks\n");
9286 return false;
9287 }
9288 return true;
9289 }
9290
9291 // The scalar cost should only be 0 when vectorizing with a user specified
9292 // VF/IC. In those cases, runtime checks should always be generated.
9293 uint64_t ScalarC = VF.ScalarCost.getValue();
9294 if (ScalarC == 0)
9295 return true;
9296
9297 // First, compute the minimum iteration count required so that the vector
9298 // loop outperforms the scalar loop.
9299 // The total cost of the scalar loop is
9300 // ScalarC * TC
9301 // where
9302 // * TC is the actual trip count of the loop.
9303 // * ScalarC is the cost of a single scalar iteration.
9304 //
9305 // The total cost of the vector loop is
9306 // RtC + VecC * (TC / VF) + EpiC
9307 // where
9308 // * RtC is the cost of the generated runtime checks plus the cost of
9309 // performing any additional work in the vector.early.exit block for loops
9310 // with uncountable early exits.
9311 // * VecC is the cost of a single vector iteration.
9312 // * TC is the actual trip count of the loop
9313 // * VF is the vectorization factor
9314 //  * EpiC is the cost of the generated epilogue, including the cost
9315 // of the remaining scalar operations.
9316 //
9317 // Vectorization is profitable once the total vector cost is less than the
9318 // total scalar cost:
9319 // RtC + VecC * (TC / VF) + EpiC < ScalarC * TC
9320 //
9321 // Now we can compute the minimum required trip count TC as
9322 // VF * (RtC + EpiC) / (ScalarC * VF - VecC) < TC
9323 //
9324 // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that
9325 // the computations below are performed with integer arithmetic and the
9326 // divisions are rounded up, hence we get an upper estimate of the TC.
9327 unsigned IntVF = estimateElementCount(VF.Width, VScale);
9328 uint64_t RtC = TotalCost.getValue();
9329 uint64_t Div = ScalarC * IntVF - VF.Cost.getValue();
9330 uint64_t MinTC1 = Div == 0 ? 0 : divideCeil(RtC * IntVF, Div);
9331
9332 // Second, compute a minimum iteration count so that the cost of the
9333 // runtime checks is only a fraction of the total scalar loop cost. This
9334 // adds a loop-dependent bound on the overhead incurred if the runtime
9335 // checks fail. In case the runtime checks fail, the cost is RtC + ScalarC
9336 // * TC. To bound the runtime check to be a fraction 1/X of the scalar
9337 // cost, compute
9338 // RtC < ScalarC * TC * (1 / X) ==> RtC * X / ScalarC < TC
9339 uint64_t MinTC2 = divideCeil(RtC * 10, ScalarC);
9340
9341 // Now pick the larger minimum. If it is not a multiple of VF and a scalar
9342 // epilogue is allowed, choose the next closest multiple of VF. This should
9343 // partly compensate for ignoring the epilogue cost.
9344 uint64_t MinTC = std::max(MinTC1, MinTC2);
9345 if (SEL == CM_ScalarEpilogueAllowed)
9346 MinTC = alignTo(MinTC, IntVF);
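// Worked example (not from the original source; illustrative numbers only):
// with ScalarC = 4, VF.Cost = 12, IntVF = 4 and RtC = 28:
//   Div    = 4 * 4 - 12        = 4
//   MinTC1 = ceil(28 * 4 / 4)  = 28  (vector loop amortizes the checks)
//   MinTC2 = ceil(28 * 10 / 4) = 70  (checks cost at most 1/10 of scalar loop)
//   MinTC  = max(28, 70)       = 70, rounded up to 72 (the next multiple of
//                                IntVF) when a scalar epilogue is allowed.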
9348
9349 LLVM_DEBUG(
9350 dbgs() << "LV: Minimum required TC for runtime checks to be profitable:"
9351 << VF.MinProfitableTripCount << "\n");
9352
9353 // Skip vectorization if the expected trip count is less than the minimum
9354 // required trip count.
9355 if (auto ExpectedTC = getSmallBestKnownTC(PSE, L)) {
9356 if (ElementCount::isKnownLT(*ExpectedTC, VF.MinProfitableTripCount)) {
9357 LLVM_DEBUG(dbgs() << "LV: Vectorization is not beneficial: expected "
9358 "trip count < minimum profitable VF ("
9359 << *ExpectedTC << " < " << VF.MinProfitableTripCount
9360 << ")\n");
9361
9362 return false;
9363 }
9364 }
9365 return true;
9366}
9367
9369 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
9371 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
9373
9374/// Prepare \p MainPlan for vectorizing the main vector loop during epilogue
9375/// vectorization. Remove ResumePhis from \p MainPlan for inductions that
9376/// don't have a corresponding wide induction in \p EpiPlan.
9377static void preparePlanForMainVectorLoop(VPlan &MainPlan, VPlan &EpiPlan) {
9378 // Collect PHI nodes of widened phis in the VPlan for the epilogue. Those
9379 // will need their resume-values computed in the main vector loop. Others
9380 // can be removed from the main VPlan.
9381 SmallPtrSet<PHINode *, 2> EpiWidenedPhis;
9382 for (VPRecipeBase &R :
9385 continue;
9386 EpiWidenedPhis.insert(
9387 cast<PHINode>(R.getVPSingleValue()->getUnderlyingValue()));
9388 }
9389 for (VPRecipeBase &R :
9390 make_early_inc_range(MainPlan.getScalarHeader()->phis())) {
9391 auto *VPIRInst = cast<VPIRPhi>(&R);
9392 if (EpiWidenedPhis.contains(&VPIRInst->getIRPhi()))
9393 continue;
9394 // There is no corresponding wide induction in the epilogue plan that would
9395 // need a resume value. Remove the VPIRInst wrapping the scalar header phi
9396 // together with the corresponding ResumePhi. The resume values for the
9397 // scalar loop will be created during execution of EpiPlan.
9398 VPRecipeBase *ResumePhi = VPIRInst->getOperand(0)->getDefiningRecipe();
9399 VPIRInst->eraseFromParent();
9400 ResumePhi->eraseFromParent();
9401 }
9403
9404 using namespace VPlanPatternMatch;
9405 // When vectorizing the epilogue, FindFirstIV & FindLastIV reductions can
9406 // introduce multiple uses of undef/poison. If the reduction start value may
9407 // be undef or poison it needs to be frozen and the frozen start has to be
9408 // used when computing the reduction result. We also need to use the frozen
9409 // value in the resume phi generated by the main vector loop, as this is also
9410 // used to compute the reduction result after the epilogue vector loop.
9411 auto AddFreezeForFindLastIVReductions = [](VPlan &Plan,
9412 bool UpdateResumePhis) {
9413 VPBuilder Builder(Plan.getEntry());
9414 for (VPRecipeBase &R : *Plan.getMiddleBlock()) {
9415 auto *VPI = dyn_cast<VPInstruction>(&R);
9416 if (!VPI || VPI->getOpcode() != VPInstruction::ComputeFindIVResult)
9417 continue;
9418 VPValue *OrigStart = VPI->getOperand(1);
9420 continue;
9421 VPInstruction *Freeze =
9422 Builder.createNaryOp(Instruction::Freeze, {OrigStart}, {}, "fr");
9423 VPI->setOperand(1, Freeze);
9424 if (UpdateResumePhis)
9425 OrigStart->replaceUsesWithIf(Freeze, [Freeze](VPUser &U, unsigned) {
9426 return Freeze != &U && isa<VPPhi>(&U);
9427 });
9428 }
9429 };
9430 AddFreezeForFindLastIVReductions(MainPlan, true);
9431 AddFreezeForFindLastIVReductions(EpiPlan, false);
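// Illustrative note (not from the original source): if the FindFirstIV /
// FindLastIV start value is undef or poison, the main loop's
// ComputeFindIVResult, the resume phi and the epilogue loop's result
// computation would otherwise each constitute an independent use of undef and
// could observe different values. Creating a single freeze in the entry block
// and routing those users through it pins them all to one consistent value.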
9432
9433 VPBasicBlock *MainScalarPH = MainPlan.getScalarPreheader();
9434 VPValue *VectorTC = &MainPlan.getVectorTripCount();
9435 // If there is a suitable resume value for the canonical induction in the
9436 // scalar (which will become vector) epilogue loop, use it and move it to the
9437 // beginning of the scalar preheader. Otherwise create it below.
9438 auto ResumePhiIter =
9439 find_if(MainScalarPH->phis(), [VectorTC](VPRecipeBase &R) {
9440 return match(&R, m_VPInstruction<Instruction::PHI>(m_Specific(VectorTC),
9441 m_ZeroInt()));
9442 });
9443 VPPhi *ResumePhi = nullptr;
9444 if (ResumePhiIter == MainScalarPH->phis().end()) {
9445 VPBuilder ScalarPHBuilder(MainScalarPH, MainScalarPH->begin());
9446 ResumePhi = ScalarPHBuilder.createScalarPhi(
9447 {VectorTC,
9449 {}, "vec.epilog.resume.val");
9450 } else {
9451 ResumePhi = cast<VPPhi>(&*ResumePhiIter);
9452 if (MainScalarPH->begin() == MainScalarPH->end())
9453 ResumePhi->moveBefore(*MainScalarPH, MainScalarPH->end());
9454 else if (&*MainScalarPH->begin() != ResumePhi)
9455 ResumePhi->moveBefore(*MainScalarPH, MainScalarPH->begin());
9456 }
9457   // Add a user to make sure the resume phi won't get removed.
9458 VPBuilder(MainScalarPH)
9460}
9461
9462/// Prepare \p Plan for vectorizing the epilogue loop. That is, re-use expanded
9463/// SCEVs from \p ExpandedSCEVs and set resume values for header recipes. Some
9464/// reductions require creating new instructions to compute the resume values.
9465 /// They are collected in a vector and returned. They must be moved to the
9466 /// preheader of the vector epilogue loop after it has been created by the
9467 /// execution of \p Plan.
9469 VPlan &Plan, Loop *L, const SCEV2ValueTy &ExpandedSCEVs,
9471 ScalarEvolution &SE) {
9472 VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
9473 VPBasicBlock *Header = VectorLoop->getEntryBasicBlock();
9474 Header->setName("vec.epilog.vector.body");
9475
9476 VPCanonicalIVPHIRecipe *IV = VectorLoop->getCanonicalIV();
9477 // When vectorizing the epilogue loop, the canonical induction needs to be
9478 // adjusted by the value after the main vector loop. Find the resume value
9479 // created during execution of the main VPlan. It must be the first phi in the
9480 // loop preheader. Use the value to increment the canonical IV, and update all
9481 // users in the loop region to use the adjusted value.
9482 // FIXME: Improve modeling for canonical IV start values in the epilogue
9483 // loop.
9484 using namespace llvm::PatternMatch;
9485 PHINode *EPResumeVal = &*L->getLoopPreheader()->phis().begin();
9486 for (Value *Inc : EPResumeVal->incoming_values()) {
9487 if (match(Inc, m_SpecificInt(0)))
9488 continue;
9489 assert(!EPI.VectorTripCount &&
9490 "Must only have a single non-zero incoming value");
9491 EPI.VectorTripCount = Inc;
9492 }
9493 // If we didn't find a non-zero vector trip count, all incoming values
9494 // must be zero, which also means the vector trip count is zero. Pick the
9495 // first zero as vector trip count.
9496 // TODO: We should not choose VF * UF so the main vector loop is known to
9497 // be dead.
9498 if (!EPI.VectorTripCount) {
9499 assert(EPResumeVal->getNumIncomingValues() > 0 &&
9500 all_of(EPResumeVal->incoming_values(),
9501 [](Value *Inc) { return match(Inc, m_SpecificInt(0)); }) &&
9502 "all incoming values must be 0");
9503 EPI.VectorTripCount = EPResumeVal->getOperand(0);
9504 }
9505 VPValue *VPV = Plan.getOrAddLiveIn(EPResumeVal);
9506 assert(all_of(IV->users(),
9507 [](const VPUser *U) {
9508 return isa<VPScalarIVStepsRecipe>(U) ||
9509 isa<VPDerivedIVRecipe>(U) ||
9510 cast<VPRecipeBase>(U)->isScalarCast() ||
9511 cast<VPInstruction>(U)->getOpcode() ==
9512 Instruction::Add;
9513 }) &&
9514 "the canonical IV should only be used by its increment or "
9515 "ScalarIVSteps when resetting the start value");
9516 VPBuilder Builder(Header, Header->getFirstNonPhi());
9517 VPInstruction *Add = Builder.createNaryOp(Instruction::Add, {IV, VPV});
9518 IV->replaceAllUsesWith(Add);
9519 Add->setOperand(0, IV);
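// Illustrative note (not from the original source; assumed numbers): if the
// main vector loop covered the first 96 scalar iterations, EPResumeVal is 96
// and the add created above offsets every former user of the canonical IV by
// 96. replaceAllUsesWith also rewrites the add's own first operand to point at
// itself, so setOperand(0, IV) immediately restores that operand to the
// canonical IV.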
9520
9522 SmallVector<Instruction *> InstsToMove;
9523 // Ensure that the start values for all header phi recipes are updated before
9524 // vectorizing the epilogue loop. Skip the canonical IV, which has been
9525 // handled above.
9526 for (VPRecipeBase &R : drop_begin(Header->phis())) {
9527 Value *ResumeV = nullptr;
9528 // TODO: Move setting of resume values to prepareToExecute.
9529 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) {
9530 auto *RdxResult =
9531 cast<VPInstruction>(*find_if(ReductionPhi->users(), [](VPUser *U) {
9532 auto *VPI = dyn_cast<VPInstruction>(U);
9533 return VPI &&
9534 (VPI->getOpcode() == VPInstruction::ComputeAnyOfResult ||
9535 VPI->getOpcode() == VPInstruction::ComputeReductionResult ||
9536 VPI->getOpcode() == VPInstruction::ComputeFindIVResult);
9537 }));
9538 ResumeV = cast<PHINode>(ReductionPhi->getUnderlyingInstr())
9539 ->getIncomingValueForBlock(L->getLoopPreheader());
9540 RecurKind RK = ReductionPhi->getRecurrenceKind();
9542 Value *StartV = RdxResult->getOperand(1)->getLiveInIRValue();
9543 // VPReductionPHIRecipes for AnyOf reductions expect a boolean as
9544 // start value; compare the final value from the main vector loop
9545 // to the start value.
9546 BasicBlock *PBB = cast<Instruction>(ResumeV)->getParent();
9547 IRBuilder<> Builder(PBB, PBB->getFirstNonPHIIt());
9548 ResumeV = Builder.CreateICmpNE(ResumeV, StartV);
9549 if (auto *I = dyn_cast<Instruction>(ResumeV))
9550 InstsToMove.push_back(I);
9552 Value *StartV = getStartValueFromReductionResult(RdxResult);
9553 ToFrozen[StartV] = cast<PHINode>(ResumeV)->getIncomingValueForBlock(
9555
9556 // VPReductionPHIRecipe for FindFirstIV/FindLastIV reductions requires
9557 // an adjustment to the resume value. The resume value is adjusted to
9558 // the sentinel value when the final value from the main vector loop
9559 // equals the start value. This ensures correctness when the start value
9560 // might not be less than the minimum value of a monotonically
9561 // increasing induction variable.
9562 BasicBlock *ResumeBB = cast<Instruction>(ResumeV)->getParent();
9563 IRBuilder<> Builder(ResumeBB, ResumeBB->getFirstNonPHIIt());
9564 Value *Cmp = Builder.CreateICmpEQ(ResumeV, ToFrozen[StartV]);
9565 if (auto *I = dyn_cast<Instruction>(Cmp))
9566 InstsToMove.push_back(I);
9567 Value *Sentinel = RdxResult->getOperand(2)->getLiveInIRValue();
9568 ResumeV = Builder.CreateSelect(Cmp, Sentinel, ResumeV);
9569 if (auto *I = dyn_cast<Instruction>(ResumeV))
9570 InstsToMove.push_back(I);
9571 } else {
9572 VPValue *StartVal = Plan.getOrAddLiveIn(ResumeV);
9573 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
9574 if (auto *VPI = dyn_cast<VPInstruction>(PhiR->getStartValue())) {
9576 "unexpected start value");
9577 VPI->setOperand(0, StartVal);
9578 continue;
9579 }
9580 }
9581 } else {
9582 // Retrieve the induction resume values for wide inductions from
9583 // their original phi nodes in the scalar loop.
9584 PHINode *IndPhi = cast<VPWidenInductionRecipe>(&R)->getPHINode();
9585 // Hook up to the PHINode generated by a ResumePhi recipe of main
9586 // loop VPlan, which feeds the scalar loop.
9587 ResumeV = IndPhi->getIncomingValueForBlock(L->getLoopPreheader());
9588 }
9589 assert(ResumeV && "Must have a resume value");
9590 VPValue *StartVal = Plan.getOrAddLiveIn(ResumeV);
9591 cast<VPHeaderPHIRecipe>(&R)->setStartValue(StartVal);
9592 }
9593
9594 // For some VPValues in the epilogue plan we must re-use the generated IR
9595 // values from the main plan. Replace them with live-in VPValues.
9596 // TODO: This is a workaround needed for epilogue vectorization and it
9597 // should be removed once induction resume value creation is done
9598 // directly in VPlan.
9599 for (auto &R : make_early_inc_range(*Plan.getEntry())) {
9600 // Re-use frozen values from the main plan for Freeze VPInstructions in the
9601 // epilogue plan. This ensures all users use the same frozen value.
9602 auto *VPI = dyn_cast<VPInstruction>(&R);
9603 if (VPI && VPI->getOpcode() == Instruction::Freeze) {
9605 ToFrozen.lookup(VPI->getOperand(0)->getLiveInIRValue())));
9606 continue;
9607 }
9608
9609 // Re-use the trip count and steps expanded for the main loop, as
9610     // skeleton creation needs them as values that dominate both the scalar
9611     // and vector epilogue loops.
9612 auto *ExpandR = dyn_cast<VPExpandSCEVRecipe>(&R);
9613 if (!ExpandR)
9614 continue;
9615 VPValue *ExpandedVal =
9616 Plan.getOrAddLiveIn(ExpandedSCEVs.lookup(ExpandR->getSCEV()));
9617 ExpandR->replaceAllUsesWith(ExpandedVal);
9618 if (Plan.getTripCount() == ExpandR)
9619 Plan.resetTripCount(ExpandedVal);
9620 ExpandR->eraseFromParent();
9621 }
9622
9623 auto VScale = CM.getVScaleForTuning();
9624 unsigned MainLoopStep =
9625 estimateElementCount(EPI.MainLoopVF * EPI.MainLoopUF, VScale);
9626 unsigned EpilogueLoopStep =
9627 estimateElementCount(EPI.EpilogueVF * EPI.EpilogueUF, VScale);
9629 Plan, EPI.TripCount, EPI.VectorTripCount,
9631 EPI.EpilogueUF, MainLoopStep, EpilogueLoopStep, SE);
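// Illustrative example (not from the original source; assumed values): for a
// main loop with VF = vscale x 4, UF = 2 and a vscale-for-tuning estimate of
// 2, MainLoopStep is estimated as 4 * 2 * 2 = 16 scalar iterations per
// main-loop iteration; a fixed epilogue VF of 4 with UF = 1 gives
// EpilogueLoopStep = 4. Estimates are needed here because the exact value of
// vscale is unknown at compile time.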
9632
9633 return InstsToMove;
9634}
9635
9636// Generate bypass values from the additional bypass block. Note that when the
9637 // vectorized epilogue is skipped due to the iteration count check, the
9638// resume value for the induction variable comes from the trip count of the
9639// main vector loop, passed as the second argument.
9641 PHINode *OrigPhi, const InductionDescriptor &II, IRBuilder<> &BypassBuilder,
9642 const SCEV2ValueTy &ExpandedSCEVs, Value *MainVectorTripCount,
9643 Instruction *OldInduction) {
9644 Value *Step = getExpandedStep(II, ExpandedSCEVs);
9645 // For the primary induction the additional bypass end value is known.
9646 // Otherwise it is computed.
9647 Value *EndValueFromAdditionalBypass = MainVectorTripCount;
9648 if (OrigPhi != OldInduction) {
9649 auto *BinOp = II.getInductionBinOp();
9650 // Fast-math-flags propagate from the original induction instruction.
9652 BypassBuilder.setFastMathFlags(BinOp->getFastMathFlags());
9653
9654 // Compute the end value for the additional bypass.
9655 EndValueFromAdditionalBypass =
9656 emitTransformedIndex(BypassBuilder, MainVectorTripCount,
9657 II.getStartValue(), Step, II.getKind(), BinOp);
9658 EndValueFromAdditionalBypass->setName("ind.end");
9659 }
9660 return EndValueFromAdditionalBypass;
9661}
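// Illustrative example (not from the original source; assumed numbers): for a
// non-primary integer induction with start 5 and step 2, and
// MainVectorTripCount = 96, emitTransformedIndex produces
// ind.end = 5 + 96 * 2 = 197, the value the induction has reached after the
// main vector loop. The primary induction needs no computation because its
// end value is the main vector trip count itself.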
9662
9664 VPlan &BestEpiPlan,
9666 const SCEV2ValueTy &ExpandedSCEVs,
9667 Value *MainVectorTripCount) {
9668 // Fix reduction resume values from the additional bypass block.
9669 BasicBlock *PH = L->getLoopPreheader();
9670 for (auto *Pred : predecessors(PH)) {
9671 for (PHINode &Phi : PH->phis()) {
9672 if (Phi.getBasicBlockIndex(Pred) != -1)
9673 continue;
9674 Phi.addIncoming(Phi.getIncomingValueForBlock(BypassBlock), Pred);
9675 }
9676 }
9677 auto *ScalarPH = cast<VPIRBasicBlock>(BestEpiPlan.getScalarPreheader());
9678 if (ScalarPH->hasPredecessors()) {
9679 // If ScalarPH has predecessors, we may need to update its reduction
9680 // resume values.
9681 for (const auto &[R, IRPhi] :
9682 zip(ScalarPH->phis(), ScalarPH->getIRBasicBlock()->phis())) {
9684 BypassBlock);
9685 }
9686 }
9687
9688 // Fix induction resume values from the additional bypass block.
9689 IRBuilder<> BypassBuilder(BypassBlock, BypassBlock->getFirstInsertionPt());
9690 for (const auto &[IVPhi, II] : LVL.getInductionVars()) {
9691 auto *Inc = cast<PHINode>(IVPhi->getIncomingValueForBlock(PH));
9693 IVPhi, II, BypassBuilder, ExpandedSCEVs, MainVectorTripCount,
9694 LVL.getPrimaryInduction());
9695 // TODO: Directly add as extra operand to the VPResumePHI recipe.
9696 Inc->setIncomingValueForBlock(BypassBlock, V);
9697 }
9698}
9699
9700/// Connect the epilogue vector loop generated for \p EpiPlan to the main vector
9701 /// loop, after both plans have executed, updating branches from the iteration
9702 /// and runtime checks of the main loop, as well as updating various phis. \p
9703 /// InstsToMove contains instructions that need to be moved to the preheader of
9704 /// the epilogue vector loop.
9706 VPlan &EpiPlan, Loop *L, EpilogueLoopVectorizationInfo &EPI,
9708 DenseMap<const SCEV *, Value *> &ExpandedSCEVs, GeneratedRTChecks &Checks,
9709 ArrayRef<Instruction *> InstsToMove) {
9710 BasicBlock *VecEpilogueIterationCountCheck =
9711 cast<VPIRBasicBlock>(EpiPlan.getEntry())->getIRBasicBlock();
9712
9713 BasicBlock *VecEpiloguePreHeader =
9714 cast<BranchInst>(VecEpilogueIterationCountCheck->getTerminator())
9715 ->getSuccessor(1);
9716 // Adjust the control flow taking the state info from the main loop
9717 // vectorization into account.
9719 "expected this to be saved from the previous pass.");
9720 DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
9722 VecEpilogueIterationCountCheck, VecEpiloguePreHeader);
9723
9725 VecEpilogueIterationCountCheck},
9727 VecEpiloguePreHeader}});
9728
9729 BasicBlock *ScalarPH =
9730 cast<VPIRBasicBlock>(EpiPlan.getScalarPreheader())->getIRBasicBlock();
9732 VecEpilogueIterationCountCheck, ScalarPH);
9733 DTU.applyUpdates(
9735 VecEpilogueIterationCountCheck},
9737
9738 // Adjust the terminators of runtime check blocks and phis using them.
9739 BasicBlock *SCEVCheckBlock = Checks.getSCEVChecks().second;
9740 BasicBlock *MemCheckBlock = Checks.getMemRuntimeChecks().second;
9741 if (SCEVCheckBlock) {
9742 SCEVCheckBlock->getTerminator()->replaceUsesOfWith(
9743 VecEpilogueIterationCountCheck, ScalarPH);
9744 DTU.applyUpdates({{DominatorTree::Delete, SCEVCheckBlock,
9745 VecEpilogueIterationCountCheck},
9746 {DominatorTree::Insert, SCEVCheckBlock, ScalarPH}});
9747 }
9748 if (MemCheckBlock) {
9749 MemCheckBlock->getTerminator()->replaceUsesOfWith(
9750 VecEpilogueIterationCountCheck, ScalarPH);
9751 DTU.applyUpdates(
9752 {{DominatorTree::Delete, MemCheckBlock, VecEpilogueIterationCountCheck},
9753 {DominatorTree::Insert, MemCheckBlock, ScalarPH}});
9754 }
9755
9756 // The vec.epilog.iter.check block may contain Phi nodes from inductions
9757 // or reductions which merge control-flow from the latch block and the
9758 // middle block. Update the incoming values here and move the Phi into the
9759 // preheader.
9760 SmallVector<PHINode *, 4> PhisInBlock(
9761 llvm::make_pointer_range(VecEpilogueIterationCountCheck->phis()));
9762
9763 for (PHINode *Phi : PhisInBlock) {
9764 Phi->moveBefore(VecEpiloguePreHeader->getFirstNonPHIIt());
9765 Phi->replaceIncomingBlockWith(
9766 VecEpilogueIterationCountCheck->getSinglePredecessor(),
9767 VecEpilogueIterationCountCheck);
9768
9769 // If the phi doesn't have an incoming value from the
9770 // EpilogueIterationCountCheck, we are done. Otherwise remove the
9771 // incoming value and also those from other check blocks. This is needed
9772 // for reduction phis only.
9773 if (none_of(Phi->blocks(), [&](BasicBlock *IncB) {
9774 return EPI.EpilogueIterationCountCheck == IncB;
9775 }))
9776 continue;
9777 Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck);
9778 if (SCEVCheckBlock)
9779 Phi->removeIncomingValue(SCEVCheckBlock);
9780 if (MemCheckBlock)
9781 Phi->removeIncomingValue(MemCheckBlock);
9782 }
9783
9784 auto IP = VecEpiloguePreHeader->getFirstNonPHIIt();
9785 for (auto *I : InstsToMove)
9786 I->moveBefore(IP);
9787
9788 // VecEpilogueIterationCountCheck conditionally skips over the epilogue loop
9789 // after executing the main loop. We need to update the resume values of
9790 // inductions and reductions during epilogue vectorization.
9791 fixScalarResumeValuesFromBypass(VecEpilogueIterationCountCheck, L, EpiPlan,
9792 LVL, ExpandedSCEVs, EPI.VectorTripCount);
9793}
9794
9796 assert((EnableVPlanNativePath || L->isInnermost()) &&
9797 "VPlan-native path is not enabled. Only process inner loops.");
9798
9799 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '"
9800 << L->getHeader()->getParent()->getName() << "' from "
9801 << L->getLocStr() << "\n");
9802
9803 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI);
9804
9805 LLVM_DEBUG(
9806 dbgs() << "LV: Loop hints:"
9807 << " force="
9809 ? "disabled"
9811 ? "enabled"
9812 : "?"))
9813 << " width=" << Hints.getWidth()
9814 << " interleave=" << Hints.getInterleave() << "\n");
9815
9816 // Function containing loop
9817 Function *F = L->getHeader()->getParent();
9818
9819 // Looking at the diagnostic output is the only way to determine if a loop
9820 // was vectorized (other than looking at the IR or machine code), so it
9821 // is important to generate an optimization remark for each loop. Most of
9822 // these messages are generated as OptimizationRemarkAnalysis. Remarks
9823   // generated as OptimizationRemark and OptimizationRemarkMissed are less
9824   // verbose and report vectorized loops and unvectorized loops that may
9825   // benefit from vectorization, respectively.
9826
9827 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
9828 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
9829 return false;
9830 }
9831
9832 PredicatedScalarEvolution PSE(*SE, *L);
9833
9834 // Check if it is legal to vectorize the loop.
9835 LoopVectorizationRequirements Requirements;
9836 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, F, *LAIs, LI, ORE,
9837 &Requirements, &Hints, DB, AC, BFI, PSI, AA);
9839 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
9840 Hints.emitRemarkWithHints();
9841 return false;
9842 }
9843
9845 reportVectorizationFailure("Auto-vectorization of loops with uncountable "
9846 "early exit is not enabled",
9847 "UncountableEarlyExitLoopsDisabled", ORE, L);
9848 return false;
9849 }
9850
9851 if (!LVL.getPotentiallyFaultingLoads().empty()) {
9852 reportVectorizationFailure("Auto-vectorization of loops with potentially "
9853 "faulting load is not supported",
9854 "PotentiallyFaultingLoadsNotSupported", ORE, L);
9855 return false;
9856 }
9857
9858 // Entrance to the VPlan-native vectorization path. Outer loops are processed
9859 // here. They may require CFG and instruction level transformations before
9860 // even evaluating whether vectorization is profitable. Since we cannot modify
9861 // the incoming IR, we need to build VPlan upfront in the vectorization
9862 // pipeline.
9863 if (!L->isInnermost())
9864 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
9865 ORE, BFI, PSI, Hints, Requirements);
9866
9867 assert(L->isInnermost() && "Inner loop expected.");
9868
9869 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
9870 bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
9871
9872 // If an override option has been passed in for interleaved accesses, use it.
9873 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
9874 UseInterleaved = EnableInterleavedMemAccesses;
9875
9876 // Analyze interleaved memory accesses.
9877 if (UseInterleaved)
9879
9880 if (LVL.hasUncountableEarlyExit()) {
9881 BasicBlock *LoopLatch = L->getLoopLatch();
9882 if (IAI.requiresScalarEpilogue() ||
9884 [LoopLatch](BasicBlock *BB) { return BB != LoopLatch; })) {
9885 reportVectorizationFailure("Auto-vectorization of early exit loops "
9886 "requiring a scalar epilogue is unsupported",
9887 "UncountableEarlyExitUnsupported", ORE, L);
9888 return false;
9889 }
9890 }
9891
9892 // Check the function attributes and profiles to find out if this function
9893 // should be optimized for size.
9895 getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, LVL, &IAI);
9896
9897 // Check the loop for a trip count threshold: vectorize loops with a tiny trip
9898 // count by optimizing for size, to minimize overheads.
9899 auto ExpectedTC = getSmallBestKnownTC(PSE, L);
9900 if (ExpectedTC && ExpectedTC->isFixed() &&
9901 ExpectedTC->getFixedValue() < TinyTripCountVectorThreshold) {
9902 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
9903 << "This loop is worth vectorizing only if no scalar "
9904 << "iteration overheads are incurred.");
9906 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
9907 else {
9908 LLVM_DEBUG(dbgs() << "\n");
9909 // Predicate tail-folded loops are efficient even when the loop
9910 // iteration count is low. However, setting the epilogue policy to
9911 // `CM_ScalarEpilogueNotAllowedLowTripLoop` prevents vectorizing loops
9912 // with runtime checks. It's more effective to let
9913 // `isOutsideLoopWorkProfitable` determine if vectorization is
9914 // beneficial for the loop.
9917 }
9918 }
9919
9920 // Check the function attributes to see if implicit floats or vectors are
9921 // allowed.
9922 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
9924 "Can't vectorize when the NoImplicitFloat attribute is used",
9925 "loop not vectorized due to NoImplicitFloat attribute",
9926 "NoImplicitFloat", ORE, L);
9927 Hints.emitRemarkWithHints();
9928 return false;
9929 }
9930
9931 // Check if the target supports potentially unsafe FP vectorization.
9932 // FIXME: Add a check for the type of safety issue (denormal, signaling)
9933 // for the target we're vectorizing for, to make sure none of the
9934 // additional fp-math flags can help.
9935 if (Hints.isPotentiallyUnsafe() &&
9936 TTI->isFPVectorizationPotentiallyUnsafe()) {
9938 "Potentially unsafe FP op prevents vectorization",
9939 "loop not vectorized due to unsafe FP support.",
9940 "UnsafeFP", ORE, L);
9941 Hints.emitRemarkWithHints();
9942 return false;
9943 }
9944
9945 bool AllowOrderedReductions;
9946 // If the flag is set, use that instead and override the TTI behaviour.
9947 if (ForceOrderedReductions.getNumOccurrences() > 0)
9948 AllowOrderedReductions = ForceOrderedReductions;
9949 else
9950 AllowOrderedReductions = TTI->enableOrderedReductions();
9951 if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) {
9952 ORE->emit([&]() {
9953 auto *ExactFPMathInst = Requirements.getExactFPInst();
9954 return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
9955 ExactFPMathInst->getDebugLoc(),
9956 ExactFPMathInst->getParent())
9957 << "loop not vectorized: cannot prove it is safe to reorder "
9958 "floating-point operations";
9959 });
9960 LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to "
9961 "reorder floating-point operations\n");
9962 Hints.emitRemarkWithHints();
9963 return false;
9964 }
9965
9966 // Use the cost model.
9967 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
9968 F, &Hints, IAI, PSI, BFI);
9969 // Use the planner for vectorization.
9970 LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, &LVL, CM, IAI, PSE, Hints,
9971 ORE);
9972
9973 // Get user vectorization factor and interleave count.
9974 ElementCount UserVF = Hints.getWidth();
9975 unsigned UserIC = Hints.getInterleave();
9976 if (UserIC > 1 && !LVL.isSafeForAnyVectorWidth())
9977 UserIC = 1;
9978
9979 // Plan how to best vectorize.
9980 LVP.plan(UserVF, UserIC);
9982 unsigned IC = 1;
9983
9984 if (ORE->allowExtraAnalysis(LV_NAME))
9986
9987 GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(), CM.CostKind);
9988 if (LVP.hasPlanWithVF(VF.Width)) {
9989 // Select the interleave count.
9990 IC = LVP.selectInterleaveCount(LVP.getPlanFor(VF.Width), VF.Width, VF.Cost);
9991
9992 unsigned SelectedIC = std::max(IC, UserIC);
9993 // Optimistically generate runtime checks if they are needed. Drop them if
9994 // they turn out to not be profitable.
9995 if (VF.Width.isVector() || SelectedIC > 1) {
9996 Checks.create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC);
9997
9998 // Bail out early if either the SCEV or memory runtime checks are known to
9999 // fail. In that case, the vector loop would never execute.
10000 using namespace llvm::PatternMatch;
10001 if (Checks.getSCEVChecks().first &&
10002 match(Checks.getSCEVChecks().first, m_One()))
10003 return false;
10004 if (Checks.getMemRuntimeChecks().first &&
10005 match(Checks.getMemRuntimeChecks().first, m_One()))
10006 return false;
10007 }
10008
10009 // Check if it is profitable to vectorize with runtime checks.
10010 bool ForceVectorization =
10012 VPCostContext CostCtx(CM.TTI, *CM.TLI, LVP.getPlanFor(VF.Width), CM,
10013 CM.CostKind, *CM.PSE.getSE(), L);
10014 if (!ForceVectorization &&
10015 !isOutsideLoopWorkProfitable(Checks, VF, L, PSE, CostCtx,
10016 LVP.getPlanFor(VF.Width), SEL,
10017 CM.getVScaleForTuning())) {
10018 ORE->emit([&]() {
10020 DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),
10021 L->getHeader())
10022 << "loop not vectorized: cannot prove it is safe to reorder "
10023 "memory operations";
10024 });
10025 LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
10026 Hints.emitRemarkWithHints();
10027 return false;
10028 }
10029 }
10030
10031 // Identify the diagnostic messages that should be produced.
10032 std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
10033 bool VectorizeLoop = true, InterleaveLoop = true;
10034 if (VF.Width.isScalar()) {
10035 LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
10036 VecDiagMsg = {
10037 "VectorizationNotBeneficial",
10038 "the cost-model indicates that vectorization is not beneficial"};
10039 VectorizeLoop = false;
10040 }
10041
10042 if (UserIC == 1 && Hints.getInterleave() > 1) {
10044 "UserIC should only be ignored due to unsafe dependencies");
10045 LLVM_DEBUG(dbgs() << "LV: Ignoring user-specified interleave count.\n");
10046 IntDiagMsg = {"InterleavingUnsafe",
10047 "Ignoring user-specified interleave count due to possibly "
10048 "unsafe dependencies in the loop."};
10049 InterleaveLoop = false;
10050 } else if (!LVP.hasPlanWithVF(VF.Width) && UserIC > 1) {
10051 // Tell the user interleaving was avoided up-front, despite being explicitly
10052 // requested.
10053 LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
10054 "interleaving should be avoided up front\n");
10055 IntDiagMsg = {"InterleavingAvoided",
10056 "Ignoring UserIC, because interleaving was avoided up front"};
10057 InterleaveLoop = false;
10058 } else if (IC == 1 && UserIC <= 1) {
10059 // Tell the user interleaving is not beneficial.
10060 LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
10061 IntDiagMsg = {
10062 "InterleavingNotBeneficial",
10063 "the cost-model indicates that interleaving is not beneficial"};
10064 InterleaveLoop = false;
10065 if (UserIC == 1) {
10066 IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
10067 IntDiagMsg.second +=
10068 " and is explicitly disabled or interleave count is set to 1";
10069 }
10070 } else if (IC > 1 && UserIC == 1) {
10071     // Tell the user interleaving is beneficial, but it is explicitly disabled.
10072 LLVM_DEBUG(dbgs() << "LV: Interleaving is beneficial but is explicitly "
10073 "disabled.\n");
10074 IntDiagMsg = {"InterleavingBeneficialButDisabled",
10075 "the cost-model indicates that interleaving is beneficial "
10076 "but is explicitly disabled or interleave count is set to 1"};
10077 InterleaveLoop = false;
10078 }
10079
10080 // If there is a histogram in the loop, do not just interleave without
10081 // vectorizing. The order of operations will be incorrect without the
10082 // histogram intrinsics, which are only used for recipes with VF > 1.
10083 if (!VectorizeLoop && InterleaveLoop && LVL.hasHistograms()) {
10084 LLVM_DEBUG(dbgs() << "LV: Not interleaving without vectorization due "
10085 << "to histogram operations.\n");
10086 IntDiagMsg = {
10087 "HistogramPreventsScalarInterleaving",
10088 "Unable to interleave without vectorization due to constraints on "
10089 "the order of histogram operations"};
10090 InterleaveLoop = false;
10091 }
10092
10093 // Override IC if user provided an interleave count.
10094 IC = UserIC > 0 ? UserIC : IC;
10095
10096 // Emit diagnostic messages, if any.
10097 const char *VAPassName = Hints.vectorizeAnalysisPassName();
10098 if (!VectorizeLoop && !InterleaveLoop) {
10099     // Do not vectorize or interleave the loop.
10100 ORE->emit([&]() {
10101 return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
10102 L->getStartLoc(), L->getHeader())
10103 << VecDiagMsg.second;
10104 });
10105 ORE->emit([&]() {
10106 return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
10107 L->getStartLoc(), L->getHeader())
10108 << IntDiagMsg.second;
10109 });
10110 return false;
10111 }
10112
10113 if (!VectorizeLoop && InterleaveLoop) {
10114 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10115 ORE->emit([&]() {
10116 return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
10117 L->getStartLoc(), L->getHeader())
10118 << VecDiagMsg.second;
10119 });
10120 } else if (VectorizeLoop && !InterleaveLoop) {
10121 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10122 << ") in " << L->getLocStr() << '\n');
10123 ORE->emit([&]() {
10124 return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
10125 L->getStartLoc(), L->getHeader())
10126 << IntDiagMsg.second;
10127 });
10128 } else if (VectorizeLoop && InterleaveLoop) {
10129 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10130 << ") in " << L->getLocStr() << '\n');
10131 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10132 }
10133
10134 // Report the vectorization decision.
10135 if (VF.Width.isScalar()) {
10136 using namespace ore;
10137 assert(IC > 1);
10138 ORE->emit([&]() {
10139 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
10140 L->getHeader())
10141 << "interleaved loop (interleaved count: "
10142 << NV("InterleaveCount", IC) << ")";
10143 });
10144 } else {
10145 // Report the vectorization decision.
10146 reportVectorization(ORE, L, VF, IC);
10147 }
10148 if (ORE->allowExtraAnalysis(LV_NAME))
10150
10151 // If we decided that it is *legal* to interleave or vectorize the loop, then
10152 // do it.
10153
10154 VPlan &BestPlan = LVP.getPlanFor(VF.Width);
10155 // Consider vectorizing the epilogue too if it's profitable.
10156 VectorizationFactor EpilogueVF =
10158 if (EpilogueVF.Width.isVector()) {
10159 std::unique_ptr<VPlan> BestMainPlan(BestPlan.duplicate());
10160
10161 // The first pass vectorizes the main loop and creates a scalar epilogue
10162 // to be vectorized by executing the plan (potentially with a different
10163 // factor) again shortly afterwards.
10164 VPlan &BestEpiPlan = LVP.getPlanFor(EpilogueVF.Width);
10165 BestEpiPlan.getMiddleBlock()->setName("vec.epilog.middle.block");
10166 BestEpiPlan.getVectorPreheader()->setName("vec.epilog.ph");
10167 preparePlanForMainVectorLoop(*BestMainPlan, BestEpiPlan);
10168 EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1,
10169 BestEpiPlan);
10170 EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TTI, AC, EPI, &CM, BFI,
10171 PSI, Checks, *BestMainPlan);
10172 auto ExpandedSCEVs = LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF,
10173 *BestMainPlan, MainILV, DT, false);
10174 ++LoopsVectorized;
10175
10176 // Second pass vectorizes the epilogue and adjusts the control flow
10177 // edges from the first pass.
10178 EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TTI, AC, EPI, &CM,
10179 BFI, PSI, Checks, BestEpiPlan);
10181 BestEpiPlan, L, ExpandedSCEVs, EPI, CM, *PSE.getSE());
10182 LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV, DT,
10183 true);
10184 connectEpilogueVectorLoop(BestEpiPlan, L, EPI, DT, LVL, ExpandedSCEVs,
10185 Checks, InstsToMove);
10186 ++LoopsEpilogueVectorized;
10187 } else {
10188 InnerLoopVectorizer LB(L, PSE, LI, DT, TTI, AC, VF.Width, IC, &CM, BFI, PSI,
10189 Checks, BestPlan);
10190 // TODO: Move to general VPlan pipeline once epilogue loops are also
10191 // supported.
10194 IC, PSE);
10195 LVP.addMinimumIterationCheck(BestPlan, VF.Width, IC,
10197
10198 LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false);
10199 ++LoopsVectorized;
10200 }
10201
10202 assert(DT->verify(DominatorTree::VerificationLevel::Fast) &&
10203 "DT not preserved correctly");
10204 assert(!verifyFunction(*F, &dbgs()));
10205
10206 return true;
10207}
10208
10210
10211 // Don't attempt if
10212 // 1. the target claims to have no vector registers, and
10213 // 2. interleaving won't help ILP.
10214 //
10215 // The second condition is necessary because, even if the target has no
10216 // vector registers, loop vectorization may still enable scalar
10217 // interleaving.
10218 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
10219 TTI->getMaxInterleaveFactor(ElementCount::getFixed(1)) < 2)
10220 return LoopVectorizeResult(false, false);
10221
10222 bool Changed = false, CFGChanged = false;
10223
10224 // The vectorizer requires loops to be in simplified form.
10225 // Since simplification may add new inner loops, it has to run before the
10226 // legality and profitability checks. This means running the loop vectorizer
10227 // will simplify all loops, regardless of whether anything end up being
10228   // will simplify all loops, regardless of whether anything ends up being
10229 for (const auto &L : *LI)
10230 Changed |= CFGChanged |=
10231 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
10232
10233 // Build up a worklist of inner-loops to vectorize. This is necessary as
10234 // the act of vectorizing or partially unrolling a loop creates new loops
10235 // and can invalidate iterators across the loops.
10236 SmallVector<Loop *, 8> Worklist;
10237
10238 for (Loop *L : *LI)
10239 collectSupportedLoops(*L, LI, ORE, Worklist);
10240
10241 LoopsAnalyzed += Worklist.size();
10242
10243 // Now walk the identified inner loops.
10244 while (!Worklist.empty()) {
10245 Loop *L = Worklist.pop_back_val();
10246
10247 // For the inner loops we actually process, form LCSSA to simplify the
10248 // transform.
10249 Changed |= formLCSSARecursively(*L, *DT, LI, SE);
10250
10251 Changed |= CFGChanged |= processLoop(L);
10252
10253 if (Changed) {
10254 LAIs->clear();
10255
10256#ifndef NDEBUG
10257 if (VerifySCEV)
10258 SE->verify();
10259#endif
10260 }
10261 }
10262
10263 // Process each loop nest in the function.
10264 return LoopVectorizeResult(Changed, CFGChanged);
10265}
10266
10269 LI = &AM.getResult<LoopAnalysis>(F);
10270 // There are no loops in the function. Return before computing other
10271 // expensive analyses.
10272 if (LI->empty())
10273 return PreservedAnalyses::all();
10282 AA = &AM.getResult<AAManager>(F);
10283
10284 auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
10285 PSI = MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
10286 BFI = nullptr;
10287 if (PSI && PSI->hasProfileSummary())
10289 LoopVectorizeResult Result = runImpl(F);
10290 if (!Result.MadeAnyChange)
10291 return PreservedAnalyses::all();
10293
10294 if (isAssignmentTrackingEnabled(*F.getParent())) {
10295 for (auto &BB : F)
10297 }
10298
10299 PA.preserve<LoopAnalysis>();
10303
10304 if (Result.MadeCFGChange) {
10305 // Making CFG changes likely means a loop got vectorized. Indicate that
10306 // extra simplification passes should be run.
10307     // TODO: MadeCFGChange is not a perfect proxy. Extra passes should only
10308 // be run if runtime checks have been added.
10311 } else {
10313 }
10314 return PA;
10315}
10316
10318 raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
10319 static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
10320 OS, MapClassName2PassName);
10321
10322 OS << '<';
10323 OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
10324 OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
10325 OS << '>';
10326}
static void connectEpilogueVectorLoop(VPlan &EpiPlan, Loop *L, EpilogueLoopVectorizationInfo &EPI, DominatorTree *DT, LoopVectorizationLegality &LVL, DenseMap< const SCEV *, Value * > &ExpandedSCEVs, GeneratedRTChecks &Checks, ArrayRef< Instruction * > InstsToMove)
Connect the epilogue vector loop generated for EpiPlan to the main vector.
static bool planContainsAdditionalSimplifications(VPlan &Plan, VPCostContext &CostCtx, Loop *TheLoop, ElementCount VF)
Return true if the original loop TheLoop contains any instructions that do not have corresponding r...
static cl::opt< unsigned > ForceTargetNumVectorRegs("force-target-num-vector-regs", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's number of vector registers."))
static bool isExplicitVecOuterLoop(Loop *OuterLp, OptimizationRemarkEmitter *ORE)
static cl::opt< bool > EnableIndVarRegisterHeur("enable-ind-var-reg-heur", cl::init(true), cl::Hidden, cl::desc("Count the induction variable only once when interleaving"))
static cl::opt< TailFoldingStyle > ForceTailFoldingStyle("force-tail-folding-style", cl::desc("Force the tail folding style"), cl::init(TailFoldingStyle::None), cl::values(clEnumValN(TailFoldingStyle::None, "none", "Disable tail folding"), clEnumValN(TailFoldingStyle::Data, "data", "Create lane mask for data only, using active.lane.mask intrinsic"), clEnumValN(TailFoldingStyle::DataWithoutLaneMask, "data-without-lane-mask", "Create lane mask with compare/stepvector"), clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control", "Create lane mask using active.lane.mask intrinsic, and use " "it for both data and control flow"), clEnumValN(TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck, "data-and-control-without-rt-check", "Similar to data-and-control, but remove the runtime check"), clEnumValN(TailFoldingStyle::DataWithEVL, "data-with-evl", "Use predicated EVL instructions for tail folding. If EVL " "is unsupported, fallback to data-without-lane-mask.")))
static cl::opt< bool > EnableEpilogueVectorization("enable-epilogue-vectorization", cl::init(true), cl::Hidden, cl::desc("Enable vectorization of epilogue loops."))
static ScalarEpilogueLowering getScalarEpilogueLowering(Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, LoopVectorizationLegality &LVL, InterleavedAccessInfo *IAI)
static cl::opt< bool > PreferPredicatedReductionSelect("prefer-predicated-reduction-select", cl::init(false), cl::Hidden, cl::desc("Prefer predicating a reduction operation over an after loop select."))
static cl::opt< bool > PreferInLoopReductions("prefer-inloop-reductions", cl::init(false), cl::Hidden, cl::desc("Prefer in-loop vector reductions, " "overriding the targets preference."))
static SmallVector< Instruction * > preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L, const SCEV2ValueTy &ExpandedSCEVs, EpilogueLoopVectorizationInfo &EPI, LoopVectorizationCostModel &CM, ScalarEvolution &SE)
Prepare Plan for vectorizing the epilogue loop.
static cl::opt< bool > EnableLoadStoreRuntimeInterleave("enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden, cl::desc("Enable runtime interleaving until load/store ports are saturated"))
static cl::opt< bool > VPlanBuildStressTest("vplan-build-stress-test", cl::init(false), cl::Hidden, cl::desc("Build VPlan for every supported loop nest in the function and bail " "out right after the build (stress test the VPlan H-CFG construction " "in the VPlan-native vectorization path)."))
static bool hasIrregularType(Type *Ty, const DataLayout &DL)
A helper function that returns true if the given type is irregular.
static cl::opt< bool > LoopVectorizeWithBlockFrequency("loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden, cl::desc("Enable the use of the block frequency analysis to access PGO " "heuristics minimizing code growth in cold regions and being more " "aggressive in hot regions."))
static std::optional< ElementCount > getSmallBestKnownTC(PredicatedScalarEvolution &PSE, Loop *L, bool CanUseConstantMax=true)
Returns "best known" trip count, which is either a valid positive trip count or std::nullopt when an ...
static Value * getExpandedStep(const InductionDescriptor &ID, const SCEV2ValueTy &ExpandedSCEVs)
Return the expanded step for ID using ExpandedSCEVs to look up SCEV expansion results.
static bool useActiveLaneMask(TailFoldingStyle Style)
static bool hasReplicatorRegion(VPlan &Plan)
static bool isIndvarOverflowCheckKnownFalse(const LoopVectorizationCostModel *Cost, ElementCount VF, std::optional< unsigned > UF=std::nullopt)
For the given VF and UF and maximum trip count computed for the loop, return whether the induction va...
static void addFullyUnrolledInstructionsToIgnore(Loop *L, const LoopVectorizationLegality::InductionList &IL, SmallPtrSetImpl< Instruction * > &InstsToIgnore)
Knowing that loop L executes a single vector iteration, add instructions that will get simplified and...
static cl::opt< PreferPredicateTy::Option > PreferPredicateOverEpilogue("prefer-predicate-over-epilogue", cl::init(PreferPredicateTy::ScalarEpilogue), cl::Hidden, cl::desc("Tail-folding and predication preferences over creating a scalar " "epilogue loop."), cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue, "scalar-epilogue", "Don't tail-predicate loops, create scalar epilogue"), clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue, "predicate-else-scalar-epilogue", "prefer tail-folding, create scalar epilogue if tail " "folding fails."), clEnumValN(PreferPredicateTy::PredicateOrDontVectorize, "predicate-dont-vectorize", "prefers tail-folding, don't attempt vectorization if " "tail-folding fails.")))
static cl::opt< bool > EnableInterleavedMemAccesses("enable-interleaved-mem-accesses", cl::init(false), cl::Hidden, cl::desc("Enable vectorization on interleaved memory accesses in a loop"))
static cl::opt< bool > EnableMaskedInterleavedMemAccesses("enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden, cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"))
An interleave-group may need masking if it resides in a block that needs predication,...
static cl::opt< bool > ForceOrderedReductions("force-ordered-reductions", cl::init(false), cl::Hidden, cl::desc("Enable the vectorisation of loops with in-order (strict) " "FP reductions"))
static const SCEV * getAddressAccessSCEV(Value *Ptr, LoopVectorizationLegality *Legal, PredicatedScalarEvolution &PSE, const Loop *TheLoop)
Gets Address Access SCEV after verifying that the access pattern is loop invariant except the inducti...
static cl::opt< cl::boolOrDefault > ForceSafeDivisor("force-widen-divrem-via-safe-divisor", cl::Hidden, cl::desc("Override cost based safe divisor widening for div/rem instructions"))
static InstructionCost calculateEarlyExitCost(VPCostContext &CostCtx, VPlan &Plan, ElementCount VF)
For loops with uncountable early exits, find the cost of doing work when exiting the loop early,...
static cl::opt< unsigned > ForceTargetMaxVectorInterleaveFactor("force-target-max-vector-interleave", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's max interleave factor for " "vectorized loops."))
static bool processLoopInVPlanNativePath(Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints, LoopVectorizationRequirements &Requirements)
static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI)
static cl::opt< unsigned > NumberOfStoresToPredicate("vectorize-num-stores-pred", cl::init(1), cl::Hidden, cl::desc("Max number of stores to be predicated behind an if."))
The number of stores in a loop that are allowed to need predication.
static cl::opt< unsigned > MaxNestedScalarReductionIC("max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden, cl::desc("The maximum interleave count to use when interleaving a scalar " "reduction in a nested loop."))
static cl::opt< unsigned > ForceTargetMaxScalarInterleaveFactor("force-target-max-scalar-interleave", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's max interleave factor for " "scalar loops."))
static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE)
static bool willGenerateVectors(VPlan &Plan, ElementCount VF, const TargetTransformInfo &TTI)
Check if any recipe of Plan will generate a vector value, which will be assigned a vector register.
static bool isOutsideLoopWorkProfitable(GeneratedRTChecks &Checks, VectorizationFactor &VF, Loop *L, PredicatedScalarEvolution &PSE, VPCostContext &CostCtx, VPlan &Plan, ScalarEpilogueLowering SEL, std::optional< unsigned > VScale)
This function determines whether or not it's still profitable to vectorize the loop given the extra w...
static void fixScalarResumeValuesFromBypass(BasicBlock *BypassBlock, Loop *L, VPlan &BestEpiPlan, LoopVectorizationLegality &LVL, const SCEV2ValueTy &ExpandedSCEVs, Value *MainVectorTripCount)
static cl::opt< bool > MaximizeBandwidth("vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden, cl::desc("Maximize bandwidth when selecting vectorization factor which " "will be determined by the smallest type in loop."))
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName, StringRef RemarkName, Loop *TheLoop, Instruction *I, DebugLoc DL={})
Create an analysis remark that explains why vectorization failed.
This file implements a map that provides insertion order iteration.
This file contains the declarations for metadata subclasses.
This file contains the declarations for profiling metadata utility functions.
static BinaryOperator * CreateMul(Value *S1, Value *S2, const Twine &Name, BasicBlock::iterator InsertBefore, Value *FlagsOp)
static BinaryOperator * CreateAdd(Value *S1, Value *S2, const Twine &Name, BasicBlock::iterator InsertBefore, Value *FlagsOp)
static InstructionCost getScalarizationOverhead(const TargetTransformInfo &TTI, Type *ScalarTy, VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={})
This is similar to TargetTransformInfo::getScalarizationOverhead, but if ScalarTy is a FixedVectorTyp...
This file contains some templates that are useful if you are working with the STL at all.
#define OP(OPC)
Definition Instruction.h:46
This file defines the SmallPtrSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
#define LLVM_DEBUG(...)
Definition Debug.h:114
#define DEBUG_WITH_TYPE(TYPE,...)
DEBUG_WITH_TYPE macro - This macro should be used by passes to emit debug information.
Definition Debug.h:72
This pass exposes codegen information to IR-level passes.
This file implements the TypeSwitch template, which mimics a switch() statement whose cases are type ...
This file contains the declarations of different VPlan-related auxiliary helpers.
This file provides utility VPlan to VPlan transformations.
This file declares the class VPlanVerifier, which contains utility functions to check the consistency...
This file contains the declarations of the Vectorization Plan base classes:
static const char PassName[]
Value * RHS
Value * LHS
A manager for alias analyses.
Class for arbitrary precision integers.
Definition APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition APInt.h:235
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1541
unsigned getActiveBits() const
Compute the number of active bits in the value.
Definition APInt.h:1513
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
LLVM_ABI unsigned getVScaleRangeMin() const
Returns the minimum value for the vscale_range attribute.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
iterator_range< const_phi_iterator > phis() const
Returns a range that iterates over the phis in the basic block.
Definition BasicBlock.h:528
LLVM_ABI const_iterator getFirstInsertionPt() const
Returns an iterator to the first instruction in this block that is suitable for inserting a non-PHI i...
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
LLVM_ABI InstListType::const_iterator getFirstNonPHIIt() const
Returns an iterator to the first instruction in this block that is not a PHINode instruction.
LLVM_ABI const BasicBlock * getSinglePredecessor() const
Return the predecessor of this block if it has a single predecessor block.
LLVM_ABI const BasicBlock * getSingleSuccessor() const
Return the successor of this block if it has a single successor.
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this basic block belongs to.
LLVM_ABI LLVMContext & getContext() const
Get the context in which this basic block lives.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition BasicBlock.h:233
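An illustrative sketch of combining the BasicBlock accessors above; countPhis is a hypothetical helper and assumes a well-formed block:

#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

static unsigned countPhis(BasicBlock &BB) {
  unsigned NumPhis = 0;
  for (PHINode &Phi : BB.phis()) {   // iterate the leading PHI nodes
    (void)Phi;
    ++NumPhis;
  }
  if (const Instruction *Term = BB.getTerminator())
    (void)Term;                      // br/ret/... when the block is well formed
  if (BasicBlock *Pred = BB.getSinglePredecessor())
    (void)Pred;                      // unique predecessor, or nullptr otherwise
  return NumPhis;
}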
BinaryOps getOpcode() const
Definition InstrTypes.h:374
Analysis pass which computes BlockFrequencyInfo.
BlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate IR basic block frequen...
Conditional or Unconditional Branch instruction.
bool isConditional() const
static BranchInst * Create(BasicBlock *IfTrue, InsertPosition InsertBefore=nullptr)
BasicBlock * getSuccessor(unsigned i) const
Represents analyses that only rely on functions' control flow.
Definition Analysis.h:73
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Value * getArgOperand(unsigned i) const
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Definition InstrTypes.h:982
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:676
@ ICMP_UGT
unsigned greater than
Definition InstrTypes.h:699
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:701
@ ICMP_NE
not equal
Definition InstrTypes.h:698
@ ICMP_ULE
unsigned less or equal
Definition InstrTypes.h:702
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition InstrTypes.h:789
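A small sketch of the predicate API above, using the static overload of getInversePredicate; the chosen predicate is illustrative:

#include "llvm/IR/InstrTypes.h"
using namespace llvm;

static void predicateDemo() {
  CmpInst::Predicate P = CmpInst::ICMP_ULT;                 // unsigned less than
  CmpInst::Predicate Inv = CmpInst::getInversePredicate(P); // ICMP_UGE
  (void)Inv;
}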
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
This is the shared class of boolean and integer constants.
Definition Constants.h:87
static LLVM_ABI ConstantInt * getTrue(LLVMContext &Context)
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:63
A debug info location.
Definition DebugLoc.h:124
static DebugLoc getTemporary()
Definition DebugLoc.h:161
static DebugLoc getUnknown()
Definition DebugLoc.h:162
An analysis that produces DemandedBits for a function.
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition DenseMap.h:205
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:178
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:248
iterator end()
Definition DenseMap.h:81
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition DenseMap.h:169
void insert_range(Range &&R)
Inserts range of 'std::pair<KeyT, ValueT>' values into the map.
Definition DenseMap.h:286
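A brief sketch of the DenseMap operations listed above; the key and value types are arbitrary illustrations:

#include "llvm/ADT/DenseMap.h"
using namespace llvm;

static void denseMapDemo() {
  DenseMap<int, unsigned> Counts;
  Counts.try_emplace(42, 1u);           // inserts only if the key is absent
  if (auto It = Counts.find(42); It != Counts.end())
    ++It->second;
  unsigned Missing = Counts.lookup(7);  // default-constructed 0 for an absent key
  bool Present = Counts.contains(42);
  (void)Missing; (void)Present;
}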
Implements a dense probed hash-table based set.
Definition DenseSet.h:279
Analysis pass which computes a DominatorTree.
Definition Dominators.h:283
void changeImmediateDominator(DomTreeNodeBase< NodeT > *N, DomTreeNodeBase< NodeT > *NewIDom)
changeImmediateDominator - This method is used to update the dominator tree information when a node's...
void eraseNode(NodeT *BB)
eraseNode - Removes a node from the dominator tree.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:164
constexpr bool isVector() const
One or more elements.
Definition TypeSize.h:324
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition TypeSize.h:312
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition TypeSize.h:309
static constexpr ElementCount get(ScalarTy MinVal, bool Scalable)
Definition TypeSize.h:315
constexpr bool isScalar() const
Exactly one element.
Definition TypeSize.h:320
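An illustrative sketch of constructing and querying ElementCount values, as used for fixed and scalable vectorization factors:

#include "llvm/Support/TypeSize.h"
using namespace llvm;

static void elementCountDemo() {
  ElementCount FixedVF = ElementCount::getFixed(8);        // exactly 8 lanes
  ElementCount ScalableVF = ElementCount::getScalable(4);  // vscale x 4 lanes
  bool IsVec = FixedVF.isVector();                         // true for 8 fixed lanes
  bool IsScalar = ElementCount::getFixed(1).isScalar();    // true: exactly one element
  (void)ScalableVF; (void)IsVec; (void)IsScalar;
}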
BasicBlock * createVectorizedLoopSkeleton() final
Implements the interface for creating a vectorized skeleton using the epilogue loop strategy (i....
EpilogueVectorizerEpilogueLoop(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetTransformInfo *TTI, AssumptionCache *AC, EpilogueLoopVectorizationInfo &EPI, LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &Checks, VPlan &Plan)
void printDebugTracesAtStart() override
Allow subclasses to override and print debug traces before/after vplan execution, when trace informat...
A specialized derived class of inner loop vectorizer that performs vectorization of main loops in the...
void introduceCheckBlockInVPlan(BasicBlock *CheckIRBB)
Introduces a new VPIRBasicBlock for CheckIRBB to Plan between the vector preheader and its predecesso...
BasicBlock * emitIterationCountCheck(BasicBlock *VectorPH, BasicBlock *Bypass, bool ForEpilogue)
Emits an iteration count bypass check once for the main loop (when ForEpilogue is false) and once for...
EpilogueVectorizerMainLoop(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetTransformInfo *TTI, AssumptionCache *AC, EpilogueLoopVectorizationInfo &EPI, LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &Check, VPlan &Plan)
Value * createIterationCountCheck(BasicBlock *VectorPH, ElementCount VF, unsigned UF) const
void printDebugTracesAtStart() override
Allow subclasses to override and print debug traces before/after vplan execution, when trace informat...
BasicBlock * createVectorizedLoopSkeleton() final
Implements the interface for creating a vectorized skeleton using the main loop strategy (i....
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:22
Class to represent function types.
param_iterator param_begin() const
param_iterator param_end() const
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition Function.h:209
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition Function.cpp:765
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition Function.cpp:730
Represents flags for the getelementptr instruction/expression.
static GEPNoWrapFlags none()
void applyUpdates(ArrayRef< UpdateT > Updates)
Submit updates to all available trees.
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
Definition IRBuilder.h:345
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2788
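A hedged sketch of typical IRBuilder use; emitFAdd and its parameters are hypothetical, and X and Y are assumed to be floating-point values that dominate the end of BB:

#include "llvm/IR/IRBuilder.h"
using namespace llvm;

static Value *emitFAdd(BasicBlock *BB, Value *X, Value *Y) {
  IRBuilder<> B(BB);           // insertion point: end of BB
  FastMathFlags FMF;
  FMF.setFast();
  B.setFastMathFlags(FMF);     // applied to FP ops created afterwards
  return B.CreateFAdd(X, Y, "sum");
}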
A struct for saving information about induction variables.
const SCEV * getStep() const
InductionKind
This enum represents the kinds of inductions that we support.
@ IK_NoInduction
Not an induction variable.
@ IK_FpInduction
Floating point induction variable.
@ IK_PtrInduction
Pointer induction var. Step = C.
@ IK_IntInduction
Integer induction variable. Step = C.
const SmallVectorImpl< Instruction * > & getCastInsts() const
Returns a reference to the type cast instructions in the induction update chain, that are redundant w...
Value * getStartValue() const
InnerLoopAndEpilogueVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetTransformInfo *TTI, AssumptionCache *AC, EpilogueLoopVectorizationInfo &EPI, LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &Checks, VPlan &Plan, ElementCount VecWidth, ElementCount MinProfitableTripCount, unsigned UnrollFactor)
EpilogueLoopVectorizationInfo & EPI
Holds and updates state information required to vectorize the main loop and its epilogue in two separ...
InnerLoopVectorizer vectorizes loops which contain only one basic block to a specified vectorization ...
virtual void printDebugTracesAtStart()
Allow subclasses to override and print debug traces before/after vplan execution, when trace informat...
Value * TripCount
Trip count of the original loop.
const TargetTransformInfo * TTI
Target Transform Info.
LoopVectorizationCostModel * Cost
The profitablity analysis.
BlockFrequencyInfo * BFI
BFI and PSI are used to check for profile guided size optimizations.
Value * getTripCount() const
Returns the original loop trip count.
friend class LoopVectorizationPlanner
PredicatedScalarEvolution & PSE
A wrapper around ScalarEvolution used to add runtime SCEV checks.
LoopInfo * LI
Loop Info.
ProfileSummaryInfo * PSI
DominatorTree * DT
Dominator Tree.
void setTripCount(Value *TC)
Used to set the trip count after ILV's construction and after the preheader block has been executed.
void fixVectorizedLoop(VPTransformState &State)
Fix the vectorized code, taking care of header phi's, and more.
virtual BasicBlock * createVectorizedLoopSkeleton()
Creates a basic block for the scalar preheader.
virtual void printDebugTracesAtEnd()
AssumptionCache * AC
Assumption Cache.
InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetTransformInfo *TTI, AssumptionCache *AC, ElementCount VecWidth, unsigned UnrollFactor, LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks, VPlan &Plan)
IRBuilder Builder
The builder that we use.
void fixNonInductionPHIs(VPTransformState &State)
Fix the non-induction PHIs in Plan.
VPBasicBlock * VectorPHVPBB
The vector preheader block of Plan, used as target for check blocks introduced during skeleton creati...
unsigned UF
The vectorization unroll factor to use.
GeneratedRTChecks & RTChecks
Structure to hold information about generated runtime checks, responsible for cleaning the checks,...
virtual ~InnerLoopVectorizer()=default
ElementCount VF
The vectorization SIMD factor to use.
Loop * OrigLoop
The original loop.
BasicBlock * createScalarPreheader(StringRef Prefix)
Create and return a new IR basic block for the scalar preheader whose name is prefixed with Prefix.
InstSimplifyFolder - Use InstructionSimplify to fold operations to existing values.
static InstructionCost getInvalid(CostType Val=0)
static InstructionCost getMax()
CostType getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
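A short sketch of working with InstructionCost, including the invalid state; the concrete values are illustrative:

#include "llvm/Support/InstructionCost.h"
using namespace llvm;

static void costDemo() {
  InstructionCost Cost = 4;                        // valid cost of 4
  Cost += 2;                                       // arithmetic preserves validity state
  if (Cost.isValid())
    (void)Cost.getValue();                         // only read the raw value when valid
  InstructionCost Unknown = InstructionCost::getInvalid();
  bool Cheaper = Cost < InstructionCost::getMax();
  (void)Unknown; (void)Cheaper;
}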
bool isCast() const
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
LLVM_ABI const Module * getModule() const
Return the module owning the function this instruction belongs to or nullptr if the function does not...
LLVM_ABI void moveBefore(InstListType::iterator InsertPos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
bool isBinaryOp() const
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Instruction * user_back()
Specialize the methods defined in Value, as we know that an instruction can only be used by other ins...
LLVM_ABI FastMathFlags getFastMathFlags() const LLVM_READONLY
Convenience function for getting all the fast-math flags, which must be an operator which supports th...
const char * getOpcodeName() const
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Class to represent integer types.
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:318
LLVM_ABI APInt getMask() const
For example, this is 0xFF for an 8 bit integer, 0xFFFF for i16, etc.
Definition Type.cpp:342
The group of interleaved loads/stores sharing the same stride and close to each other.
uint32_t getFactor() const
InstTy * getMember(uint32_t Index) const
Get the member with the given index Index.
InstTy * getInsertPos() const
uint32_t getNumMembers() const
Drive the analysis of interleaved memory accesses in the loop.
bool requiresScalarEpilogue() const
Returns true if an interleaved group that may access memory out-of-bounds requires a scalar epilogue ...
LLVM_ABI void analyzeInterleaving(bool EnableMaskedInterleavedGroup)
Analyze the interleaved accesses and collect them in interleave groups.
An instruction for reading from memory.
Type * getPointerOperandType() const
This analysis provides dependence information for the memory accesses of a loop.
Drive the analysis of memory accesses in the loop.
const RuntimePointerChecking * getRuntimePointerChecking() const
unsigned getNumRuntimePointerChecks() const
Number of memchecks required to prove independence of otherwise may-alias pointers.
Analysis pass that exposes the LoopInfo for a function.
Definition LoopInfo.h:569
bool contains(const LoopT *L) const
Return true if the specified loop is contained within in this loop.
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
bool isInnermost() const
Return true if the loop does not contain any (natural) loops.
void getExitingBlocks(SmallVectorImpl< BlockT * > &ExitingBlocks) const
Return all blocks inside the loop that have successors outside of the loop.
BlockT * getHeader() const
iterator_range< block_iterator > blocks() const
ArrayRef< BlockT * > getBlocks() const
Get a list of the basic blocks which make up this loop.
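A sketch of the block-level loop queries listed above, assuming a loop obtained from LoopInfo; inspectLoop is a hypothetical helper:

#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/LoopInfo.h"
using namespace llvm;

static void inspectLoop(const Loop &L) {
  BasicBlock *Header = L.getHeader();
  BasicBlock *Latch = L.getLoopLatch();     // nullptr if there is no single latch
  bool Innermost = L.isInnermost();
  SmallVector<BasicBlock *, 8> ExitingBlocks;
  L.getExitingBlocks(ExitingBlocks);        // blocks with successors outside the loop
  for (BasicBlock *BB : L.blocks())
    (void)BB;
  (void)Header; (void)Latch; (void)Innermost;
}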
Store the result of a depth first search within basic blocks contained by a single loop.
RPOIterator beginRPO() const
Reverse iterate over the cached postorder blocks.
void perform(const LoopInfo *LI)
Traverse the loop blocks and store the DFS result.
RPOIterator endRPO() const
Wrapper class to LoopBlocksDFS that provides a standard begin()/end() interface for the DFS reverse p...
void perform(const LoopInfo *LI)
Traverse the loop blocks and store the DFS result.
void removeBlock(BlockT *BB)
This method completely removes BB from all data structures, including all of the Loop objects it is n...
LoopVectorizationCostModel - estimates the expected speedups due to vectorization.
SmallPtrSet< Type *, 16 > ElementTypesInLoop
All element types found in the loop.
bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment, unsigned AddressSpace) const
Returns true if the target machine supports masked load operation for the given DataType and kind of ...
LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, LoopVectorizationLegality *Legal, const TargetTransformInfo &TTI, const TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, const Function *F, const LoopVectorizeHints *Hints, InterleavedAccessInfo &IAI, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI)
void collectElementTypesForWidening()
Collect all element types in the loop for which widening is needed.
bool canVectorizeReductions(ElementCount VF) const
Returns true if the target machine supports all of the reduction variables found for the given VF.
bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment, unsigned AddressSpace) const
Returns true if the target machine supports masked store operation for the given DataType and kind of...
bool isEpilogueVectorizationProfitable(const ElementCount VF, const unsigned IC) const
Returns true if epilogue vectorization is considered profitable, and false otherwise.
bool useWideActiveLaneMask() const
Returns true if the use of wide lane masks is requested and the loop is using tail-folding with a lan...
bool isPredicatedInst(Instruction *I) const
Returns true if I is an instruction that needs to be predicated at runtime.
void collectValuesToIgnore()
Collect values we want to ignore in the cost model.
void collectInLoopReductions()
Split reductions into those that happen in the loop, and those that happen outside.
std::pair< unsigned, unsigned > getSmallestAndWidestTypes()
bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const
Returns true if I is known to be uniform after vectorization.
void collectNonVectorizedAndSetWideningDecisions(ElementCount VF)
Collect values that will not be widened, including Uniforms, Scalars, and Instructions to Scalarize f...
PredicatedScalarEvolution & PSE
Predicated scalar evolution analysis.
const LoopVectorizeHints * Hints
Loop Vectorize Hint.
std::optional< unsigned > getMaxSafeElements() const
Return maximum safe number of elements to be processed per vector iteration, which do not prevent sto...
const TargetTransformInfo & TTI
Vector target information.
LoopVectorizationLegality * Legal
Vectorization legality.
std::optional< InstructionCost > getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy) const
Return the cost of instructions in an inloop reduction pattern, if I is part of that pattern.
InstructionCost getInstructionCost(Instruction *I, ElementCount VF)
Returns the execution time cost of an instruction for a given vector width.
DemandedBits * DB
Demanded bits analysis.
bool interleavedAccessCanBeWidened(Instruction *I, ElementCount VF) const
Returns true if I is a memory instruction in an interleaved-group of memory accesses that can be vect...
const TargetLibraryInfo * TLI
Target Library Info.
bool memoryInstructionCanBeWidened(Instruction *I, ElementCount VF)
Returns true if I is a memory instruction with consecutive memory access that can be widened.
const InterleaveGroup< Instruction > * getInterleavedAccessGroup(Instruction *Instr) const
Get the interleaved access group that Instr belongs to.
InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const
Estimate cost of an intrinsic call instruction CI if it were vectorized with factor VF.
bool OptForSize
Whether this loop should be optimized for size based on function attribute or profile information.
bool useMaxBandwidth(TargetTransformInfo::RegisterKind RegKind)
bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const
Returns true if I is known to be scalar after vectorization.
bool isOptimizableIVTruncate(Instruction *I, ElementCount VF)
Return True if instruction I is an optimizable truncate whose operand is an induction variable.
FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC)
bool shouldConsiderRegPressureForVF(ElementCount VF)
Loop * TheLoop
The loop that we evaluate.
TTI::TargetCostKind CostKind
The kind of cost that we are calculating.
TailFoldingStyle getTailFoldingStyle(bool IVUpdateMayOverflow=true) const
Returns the TailFoldingStyle that is best for the current loop.
InterleavedAccessInfo & InterleaveInfo
The interleave access information contains groups of interleaved accesses with the same stride and cl...
SmallPtrSet< const Value *, 16 > ValuesToIgnore
Values to ignore in the cost model.
void setVectorizedCallDecision(ElementCount VF)
A call may be vectorized in different ways depending on whether we have vectorized variants available...
void invalidateCostModelingDecisions()
Invalidates decisions already taken by the cost model.
bool isAccessInterleaved(Instruction *Instr) const
Check if Instr belongs to any interleaved access group.
bool selectUserVectorizationFactor(ElementCount UserVF)
Setup cost-based decisions for user vectorization factor.
std::optional< unsigned > getVScaleForTuning() const
Return the value of vscale used for tuning the cost model.
OptimizationRemarkEmitter * ORE
Interface to emit optimization remarks.
bool preferPredicatedLoop() const
Returns true if tail-folding is preferred over a scalar epilogue.
LoopInfo * LI
Loop Info analysis.
bool requiresScalarEpilogue(bool IsVectorizing) const
Returns true if we're required to use a scalar epilogue for at least the final iteration of the origi...
SmallPtrSet< const Value *, 16 > VecValuesToIgnore
Values to ignore in the cost model when VF > 1.
unsigned getPredBlockCostDivisor(TargetTransformInfo::TargetCostKind CostKind, BasicBlock *BB) const
A helper function that returns how much we should divide the cost of a predicated block by.
bool isInLoopReduction(PHINode *Phi) const
Returns true if the Phi is part of an inloop reduction.
bool isProfitableToScalarize(Instruction *I, ElementCount VF) const
void setWideningDecision(const InterleaveGroup< Instruction > *Grp, ElementCount VF, InstWidening W, InstructionCost Cost)
Save vectorization decision W and Cost taken by the cost model for interleaving group Grp and vector ...
const MapVector< Instruction *, uint64_t > & getMinimalBitwidths() const
CallWideningDecision getCallWideningDecision(CallInst *CI, ElementCount VF) const
bool isLegalGatherOrScatter(Value *V, ElementCount VF)
Returns true if the target machine can represent V as a masked gather or scatter operation.
bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const
bool shouldConsiderInvariant(Value *Op)
Returns true if Op should be considered invariant and if it is trivially hoistable.
bool foldTailByMasking() const
Returns true if all loop blocks should be masked to fold tail loop.
bool foldTailWithEVL() const
Returns true if VP intrinsics with explicit vector length support should be generated in the tail fol...
bool usePredicatedReductionSelect() const
Returns true if the predicated reduction select should be used to set the incoming value for the redu...
bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const
Returns true if the instructions in this block requires predication for any reason,...
void setCallWideningDecision(CallInst *CI, ElementCount VF, InstWidening Kind, Function *Variant, Intrinsic::ID IID, std::optional< unsigned > MaskPos, InstructionCost Cost)
void setTailFoldingStyles(bool IsScalableVF, unsigned UserIC)
Selects and saves TailFoldingStyle for 2 options - if IV update may overflow or not.
AssumptionCache * AC
Assumption cache.
void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W, InstructionCost Cost)
Save vectorization decision W and Cost taken by the cost model for instruction I and vector width VF.
InstWidening
Decision that was taken during cost calculation for memory instruction.
bool isScalarWithPredication(Instruction *I, ElementCount VF) const
Returns true if I is an instruction which requires predication and for which our chosen predication s...
InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF) const
Estimate cost of a call instruction CI if it were vectorized with factor VF.
bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const
Returns true if we should use strict in-order reductions for the given RdxDesc.
std::pair< InstructionCost, InstructionCost > getDivRemSpeculationCost(Instruction *I, ElementCount VF) const
Return the costs for our two available strategies for lowering a div/rem operation which requires spe...
bool isDivRemScalarWithPredication(InstructionCost ScalarCost, InstructionCost SafeDivisorCost) const
Given costs for both strategies, return true if the scalar predication lowering should be used for di...
InstructionCost expectedCost(ElementCount VF)
Returns the expected execution cost.
void setCostBasedWideningDecision(ElementCount VF)
Memory access instruction may be vectorized in more than one way.
InstWidening getWideningDecision(Instruction *I, ElementCount VF) const
Return the cost model decision for the given instruction I and vector width VF.
FixedScalableVFPair MaxPermissibleVFWithoutMaxBW
The highest VF possible for this loop, without using MaxBandwidth.
bool isScalarEpilogueAllowed() const
Returns true if a scalar epilogue is not allowed due to optsize or a loop hint annotation.
InstructionCost getWideningCost(Instruction *I, ElementCount VF)
Return the vectorization cost for the given instruction I and vector width VF.
void collectInstsToScalarize(ElementCount VF)
Collects the instructions to scalarize for each predicated instruction in the loop.
LoopVectorizationLegality checks if it is legal to vectorize a loop, and to what vectorization factor...
MapVector< PHINode *, InductionDescriptor > InductionList
InductionList saves induction variables and maps them to the induction descriptor.
const SmallPtrSetImpl< const Instruction * > & getPotentiallyFaultingLoads() const
Returns potentially faulting loads.
bool canVectorize(bool UseVPlanNativePath)
Returns true if it is legal to vectorize this loop.
bool canVectorizeFPMath(bool EnableStrictReductions)
Returns true if it is legal to vectorize the FP math operations in this loop.
PHINode * getPrimaryInduction()
Returns the primary induction variable.
const SmallVector< BasicBlock *, 4 > & getCountableExitingBlocks() const
Returns all exiting blocks with a countable exit, i.e.
const InductionList & getInductionVars() const
Returns the induction variables found in the loop.
bool hasUncountableEarlyExit() const
Returns true if the loop has exactly one uncountable early exit, i.e.
bool hasHistograms() const
Returns true if there are any known histogram operations in the loop.
const LoopAccessInfo * getLAI() const
Planner drives the vectorization process after having passed Legality checks.
VectorizationFactor selectEpilogueVectorizationFactor(const ElementCount MaxVF, unsigned IC)
VPlan & getPlanFor(ElementCount VF) const
Return the VPlan for VF.
Definition VPlan.cpp:1576
VectorizationFactor planInVPlanNativePath(ElementCount UserVF)
Use the VPlan-native path to plan how to best vectorize, return the best VF and its cost.
void updateLoopMetadataAndProfileInfo(Loop *VectorLoop, VPBasicBlock *HeaderVPBB, const VPlan &Plan, bool VectorizingEpilogue, MDNode *OrigLoopID, std::optional< unsigned > OrigAverageTripCount, unsigned OrigLoopInvocationWeight, unsigned EstimatedVFxUF, bool DisableRuntimeUnroll)
Update loop metadata and profile info for both the scalar remainder loop and VectorLoop,...
Definition VPlan.cpp:1627
void buildVPlans(ElementCount MinVF, ElementCount MaxVF)
Build VPlans for power-of-2 VF's between MinVF and MaxVF inclusive, according to the information gath...
Definition VPlan.cpp:1560
VectorizationFactor computeBestVF()
Compute and return the most profitable vectorization factor.
DenseMap< const SCEV *, Value * > executePlan(ElementCount VF, unsigned UF, VPlan &BestPlan, InnerLoopVectorizer &LB, DominatorTree *DT, bool VectorizingEpilogue)
Generate the IR code for the vectorized loop captured in VPlan BestPlan according to the best selecte...
unsigned selectInterleaveCount(VPlan &Plan, ElementCount VF, InstructionCost LoopCost)
void emitInvalidCostRemarks(OptimizationRemarkEmitter *ORE)
Emit remarks for recipes with invalid costs in the available VPlans.
static bool getDecisionAndClampRange(const std::function< bool(ElementCount)> &Predicate, VFRange &Range)
Test a Predicate on a Range of VF's.
Definition VPlan.cpp:1541
void printPlans(raw_ostream &O)
Definition VPlan.cpp:1705
void plan(ElementCount UserVF, unsigned UserIC)
Build VPlans for the specified UserVF and UserIC if they are non-zero or all applicable candidate VFs...
void addMinimumIterationCheck(VPlan &Plan, ElementCount VF, unsigned UF, ElementCount MinProfitableTripCount) const
Create a check to Plan to see if the vector loop should be executed based on its trip count.
bool hasPlanWithVF(ElementCount VF) const
Look through the existing plans and return true if we have one with vectorization factor VF.
This holds vectorization requirements that must be verified late in the process.
Utility class for getting and setting loop vectorizer hints in the form of loop metadata.
bool allowVectorization(Function *F, Loop *L, bool VectorizeOnlyWhenForced) const
void emitRemarkWithHints() const
Dumps all the hint information.
const char * vectorizeAnalysisPassName() const
If hints are provided that force vectorization, use the AlwaysPrint pass name to force the frontend t...
This class emits a version of the loop where run-time checks ensure that may-alias pointers can't ove...
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
bool hasLoopInvariantOperands(const Instruction *I) const
Return true if all the operands of the specified instruction are loop invariant.
Definition LoopInfo.cpp:67
DebugLoc getStartLoc() const
Return the debug location of the start of this loop.
Definition LoopInfo.cpp:632
bool isLoopInvariant(const Value *V) const
Return true if the specified value is loop invariant.
Definition LoopInfo.cpp:61
Metadata node.
Definition Metadata.h:1078
This class implements a map that also provides access to all stored values in a deterministic order.
Definition MapVector.h:36
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition MapVector.h:119
Function * getFunction(StringRef Name) const
Look up the specified function in the module symbol table.
Definition Module.cpp:230
Diagnostic information for optimization analysis remarks related to pointer aliasing.
Diagnostic information for optimization analysis remarks related to floating-point non-commutativity.
Diagnostic information for optimization analysis remarks.
The optimization diagnostic interface.
LLVM_ABI void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for missed-optimization remarks.
Diagnostic information for applied optimization remarks.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
op_range incoming_values()
void setIncomingValueForBlock(const BasicBlock *BB, Value *V)
Set every incoming value(s) for block BB to V.
Value * getIncomingValueForBlock(const BasicBlock *BB) const
unsigned getNumIncomingValues() const
Return the number of incoming edges.
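An illustrative sketch that builds a two-way phi with the members above; makeMergePhi and all of its parameters are hypothetical placeholders:

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

static PHINode *makeMergePhi(Type *Ty, BasicBlock *Header,
                             BasicBlock *PredA, Value *ValA,
                             BasicBlock *PredB, Value *ValB) {
  // Insert the new phi with the other phis at the top of Header.
  IRBuilder<> B(Header, Header->getFirstNonPHIIt());
  PHINode *Phi = B.CreatePHI(Ty, /*NumReservedValues=*/2, "merge");
  Phi->addIncoming(ValA, PredA);
  Phi->addIncoming(ValB, PredB);
  Value *FromA = Phi->getIncomingValueForBlock(PredA); // == ValA
  (void)FromA;
  return Phi;
}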
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
LLVM_ABI const SCEVPredicate & getPredicate() const
LLVM_ABI unsigned getSmallConstantMaxTripCount()
Returns the upper bound of the loop trip count as a normal unsigned value, or 0 if the trip count is ...
LLVM_ABI const SCEV * getBackedgeTakenCount()
Get the (predicated) backedge count for the analyzed loop.
LLVM_ABI const SCEV * getSCEV(Value *V)
Returns the SCEV expression of V, in the context of the current SCEV predicate.
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
Definition Analysis.h:151
PreservedAnalyses & preserve()
Mark an analysis as preserved.
Definition Analysis.h:132
An analysis pass based on the new PM to deliver ProfileSummaryInfo.
Analysis providing profile information.
The RecurrenceDescriptor is used to identify recurrences variables in a loop.
static bool isFMulAddIntrinsic(Instruction *I)
Returns true if the instruction is a call to the llvm.fmuladd intrinsic.
FastMathFlags getFastMathFlags() const
Instruction * getLoopExitInstr() const
static LLVM_ABI unsigned getOpcode(RecurKind Kind)
Returns the opcode corresponding to the RecurrenceKind.
Type * getRecurrenceType() const
Returns the type of the recurrence.
const SmallPtrSet< Instruction *, 8 > & getCastInsts() const
Returns a reference to the instructions used for type-promoting the recurrence.
unsigned getMinWidthCastToRecurrenceTypeInBits() const
Returns the minimum width used by the recurrence in bits.
TrackingVH< Value > getRecurrenceStartValue() const
LLVM_ABI SmallVector< Instruction *, 4 > getReductionOpChain(PHINode *Phi, Loop *L) const
Attempts to find a chain of operations from Phi to LoopExitInst that can be treated as a set of reduc...
static bool isAnyOfRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
bool isSigned() const
Returns true if all source operands of the recurrence are SExtInsts.
RecurKind getRecurrenceKind() const
bool isOrdered() const
Expose an ordered FP reduction to the instance users.
static LLVM_ABI bool isFloatingPointRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is a floating point kind.
static bool isFindIVRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
Value * getSentinelValue() const
Returns the sentinel value for FindFirstIV & FindLastIV recurrences to replace the start value.
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
std::optional< ArrayRef< PointerDiffInfo > > getDiffChecks() const
const SmallVectorImpl< RuntimePointerCheck > & getChecks() const
Returns the checks that generateChecks created.
This class uses information about analyze scalars to rewrite expressions in canonical form.
ScalarEvolution * getSE()
bool isInsertedInstruction(Instruction *I) const
Return true if the specified instruction was inserted by the code rewriter.
LLVM_ABI Value * expandCodeForPredicate(const SCEVPredicate *Pred, Instruction *Loc)
Generates a code sequence that evaluates this predicate.
void eraseDeadInstructions(Value *Root)
Remove inserted instructions that are dead, e.g.
virtual bool isAlwaysTrue() const =0
Returns true if the predicate is always true.
This class represents an analyzed expression in the program.
LLVM_ABI bool isZero() const
Return true if the expression is a constant zero.
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
LLVM_ABI const SCEV * getURemExpr(const SCEV *LHS, const SCEV *RHS)
Represents an unsigned remainder expression based on unsigned division.
LLVM_ABI const SCEV * getBackedgeTakenCount(const Loop *L, ExitCountKind Kind=Exact)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCould...
LLVM_ABI const SCEV * getConstant(ConstantInt *V)
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI const SCEV * getTripCountFromExitCount(const SCEV *ExitCount)
A version of getTripCountFromExitCount below which always picks an evaluation type which can not resu...
const SCEV * getOne(Type *Ty)
Return a SCEV for the constant 1 of a specific type.
LLVM_ABI void forgetLoop(const Loop *L)
This method should be called by the client when it has changed a loop in a way that may effect Scalar...
LLVM_ABI bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
LLVM_ABI bool isSCEVable(Type *Ty) const
Test if values of the given type are analyzable within the SCEV framework.
LLVM_ABI const SCEV * getElementCount(Type *Ty, ElementCount EC, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
LLVM_ABI void forgetValue(Value *V)
This method should be called by the client when it has changed a value in a way that may effect its v...
LLVM_ABI void forgetBlockAndLoopDispositions(Value *V=nullptr)
Called when the client has changed the disposition of values in a loop or block.
const SCEV * getMinusOne(Type *Ty)
Return a SCEV for the constant -1 of a specific type.
LLVM_ABI void forgetLcssaPhiWithNewPredecessor(Loop *L, PHINode *V)
Forget LCSSA phi node V of loop L to which a new predecessor was added, such that it may no longer be...
LLVM_ABI unsigned getSmallConstantTripCount(const Loop *L)
Returns the exact trip count of the loop if we can compute it, and the result is a small constant.
APInt getUnsignedRangeMax(const SCEV *S)
Determine the max of the unsigned range for a particular SCEV.
LLVM_ABI const SCEV * applyLoopGuards(const SCEV *Expr, const Loop *L)
Try to apply information from loop guards for L to Expr.
LLVM_ABI const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
LLVM_ABI const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
LLVM_ABI bool isKnownPredicate(CmpPredicate Pred, const SCEV *LHS, const SCEV *RHS)
Test if the given expression is known to satisfy the condition described by Pred, LHS,...
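A hedged sketch of querying ScalarEvolution with the members above; backedgeTakenAtLeastOnce is a hypothetical helper and assumes SE and L come from the usual analysis managers:

#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/IR/InstrTypes.h"
using namespace llvm;

static bool backedgeTakenAtLeastOnce(ScalarEvolution &SE, const Loop *L) {
  const SCEV *BTC = SE.getBackedgeTakenCount(L);
  if (isa<SCEVCouldNotCompute>(BTC))
    return false;                    // backedge-taken count is not computable
  const SCEV *One = SE.getOne(BTC->getType());
  // Backedge taken >= 1 means the loop body executes at least twice.
  return SE.isKnownPredicate(CmpInst::ICMP_UGE, BTC, One);
}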
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
Definition SetVector.h:58
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:101
void insert_range(Range &&R)
Definition SetVector.h:174
size_type count(const key_type &key) const
Count the number of elements of a given key in the SetVector.
Definition SetVector.h:260
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:149
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
A SetVector that performs no allocations if smaller than a certain size.
Definition SetVector.h:337
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
reference emplace_back(ArgTypes &&... Args)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
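A compact sketch of the ADT containers listed above (SmallVector, SmallPtrSet, SetVector); the element types are arbitrary illustrations:

#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
using namespace llvm;

static void containerDemo() {
  SmallVector<int, 4> Vec;        // inline storage for up to 4 elements
  Vec.push_back(1);
  Vec.emplace_back(2);

  int X = 0;
  SmallPtrSet<int *, 8> Seen;     // pointer set, small-size optimized
  Seen.insert(&X);
  bool Known = Seen.contains(&X); // true

  SetVector<int> Ordered;         // set semantics with insertion-order iteration
  Ordered.insert(3);
  Ordered.insert(3);              // duplicate, ignored
  size_t N = Ordered.size();      // 1
  (void)Known; (void)N;
}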
An instruction for storing to memory.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
LLVM_ABI std::optional< unsigned > getVScaleForTuning() const
LLVM_ABI InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}) const
Estimate the overhead of scalarizing an instruction.
LLVM_ABI bool supportsEfficientVectorElementLoadStore() const
If target has efficient vector element load/store instructions, it can return true here so that inser...
LLVM_ABI bool prefersVectorizedAddressing() const
Return true if target doesn't mind addresses in vectors.
LLVM_ABI TypeSize getRegisterBitWidth(RegisterKind K) const
LLVM_ABI bool preferFixedOverScalableIfEqualCost(bool IsEpilogue) const
LLVM_ABI InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueInfo OpdInfo={OK_AnyValue, OP_None}, const Instruction *I=nullptr) const
LLVM_ABI InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, bool UseMaskForCond=false, bool UseMaskForGaps=false) const
LLVM_ABI InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const
static LLVM_ABI PartialReductionExtendKind getPartialReductionExtendKind(Instruction *I)
Get the kind of extension that an instruction represents.
static LLVM_ABI OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
LLVM_ABI bool isElementTypeLegalForScalableVector(Type *Ty) const
LLVM_ABI ElementCount getMinimumVF(unsigned ElemWidth, bool IsScalable) const
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
@ TCK_Latency
The latency of the instruction.
LLVM_ABI InstructionCost getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, const SCEV *Ptr, TTI::TargetCostKind CostKind) const
LLVM_ABI InstructionCost getPartialReductionCost(unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType, ElementCount VF, PartialReductionExtendKind OpAExtend, PartialReductionExtendKind OpBExtend, std::optional< unsigned > BinOp, TTI::TargetCostKind CostKind) const
LLVM_ABI InstructionCost getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
LLVM_ABI InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, const Instruction *I=nullptr) const
LLVM_ABI bool supportsScalableVectors() const
@ TCC_Free
Expected to fold away in lowering.
LLVM_ABI InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TargetCostKind CostKind) const
Estimate the cost of a given IR user when lowered.
LLVM_ABI InstructionCost getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index) const
LLVM_ABI InstructionCost getOperandsScalarizationOverhead(ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind) const
Estimate the overhead of scalarizing operands with the given types.
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_Reverse
Reverse the order of the vector.
LLVM_ABI InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
CastContextHint
Represents a hint about the context in which a cast is used.
@ Reversed
The cast is used with a reversed load/store.
@ Masked
The cast is used with a masked load/store.
@ None
The cast is not used with a load/store of any kind.
@ Normal
The cast is used with a normal load/store.
@ Interleave
The cast is used with an interleaved load/store.
@ GatherScatter
The cast is used with a gather/scatter.
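
A sketch of how the TargetTransformInfo cost interfaces above are typically queried. It assumes a TargetTransformInfo reference is already in hand; the helper name is made up, and TCK_RecipThroughput is simply the default cost kind shown in the signatures above:

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/Support/Alignment.h"

using namespace llvm;

// Ask the target what a widened (vector) load would cost in terms of
// reciprocal throughput.
static InstructionCost getWideLoadCost(const TargetTransformInfo &TTI,
                                       VectorType *VecTy, Align Alignment,
                                       unsigned AddrSpace) {
  return TTI.getMemoryOpCost(Instruction::Load, VecTy, Alignment, AddrSpace,
                             TargetTransformInfo::TCK_RecipThroughput);
}
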
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
This class implements a switch-like dispatch statement for a value of 'T' using dyn_cast functionalit...
Definition TypeSwitch.h:88
TypeSwitch< T, ResultT > & Case(CallableT &&caseFn)
Add a case on the given type.
Definition TypeSwitch.h:97
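
TypeSwitch provides dyn_cast-based dispatch over a closed set of types. A purely illustrative sketch:

#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/TypeSwitch.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

// Classify an instruction by its dynamic type; each Case receives the
// already-casted pointer.
static StringRef classifyAccess(Instruction *I) {
  return TypeSwitch<Instruction *, StringRef>(I)
      .Case<LoadInst>([](LoadInst *) { return "load"; })
      .Case<StoreInst>([](StoreInst *) { return "store"; })
      .Default([](Instruction *) { return "other"; });
}
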
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
LLVM_ABI unsigned getIntegerBitWidth() const
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:273
static LLVM_ABI Type * getVoidTy(LLVMContext &C)
Definition Type.cpp:280
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:352
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:197
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:128
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:230
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:293
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:184
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:139
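
A small sketch combining the Type queries above with VectorType::get (listed further below) to widen a scalar element type; the helper and its guard are illustrative only:

#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Type.h"

using namespace llvm;

// Widen Scalar's element type to a vector with EC elements, or return null
// for element types that cannot be widened this way.
static Type *widenType(Type *Scalar, ElementCount EC) {
  Type *Elt = Scalar->getScalarType();
  if (!Elt->isIntegerTy() && !Elt->isFloatingPointTy())
    return nullptr; // e.g. void or aggregate element types
  return VectorType::get(Elt, EC);
}
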
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
op_range operands()
Definition User.h:292
LLVM_ABI bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
Definition User.cpp:24
Value * getOperand(unsigned i) const
Definition User.h:232
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
Definition VectorUtils.h:74
VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph.
Definition VPlan.h:3990
void appendRecipe(VPRecipeBase *Recipe)
Augment the existing recipes of a VPBasicBlock with an additional Recipe as the last recipe.
Definition VPlan.h:4065
RecipeListTy::iterator iterator
Instruction iterators...
Definition VPlan.h:4017
iterator end()
Definition VPlan.h:4027
iterator begin()
Recipe iterator methods.
Definition VPlan.h:4025
iterator_range< iterator > phis()
Returns an iterator range over the PHI-like recipes in the block.
Definition VPlan.h:4078
iterator getFirstNonPhi()
Return the position of the first non-phi node recipe in the block.
Definition VPlan.cpp:216
VPRegionBlock * getEnclosingLoopRegion()
Definition VPlan.cpp:578
VPRecipeBase * getTerminator()
If the block has multiple successors, return the branch recipe terminating the block.
Definition VPlan.cpp:623
void insert(VPRecipeBase *Recipe, iterator InsertPt)
Definition VPlan.h:4056
bool empty() const
Definition VPlan.h:4036
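
An illustrative walk over a VPBasicBlock using the iterator methods above; VPlan.h is an in-tree header of this pass, not a stable API, so treat this as a sketch:

#include "VPlan.h"
#include <iterator>
#include <utility>

// Count phi-like recipes vs. other recipes using begin()/getFirstNonPhi()/end().
static std::pair<unsigned, unsigned> countRecipes(VPBasicBlock *VPBB) {
  unsigned NumPhis = std::distance(VPBB->begin(), VPBB->getFirstNonPhi());
  unsigned NumOther = std::distance(VPBB->getFirstNonPhi(), VPBB->end());
  return {NumPhis, NumOther};
}
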
VPBlockBase is the building block of the Hierarchical Control-Flow Graph.
Definition VPlan.h:80
VPRegionBlock * getParent()
Definition VPlan.h:172
const VPBasicBlock * getExitingBasicBlock() const
Definition VPlan.cpp:186
void setName(const Twine &newName)
Definition VPlan.h:165
size_t getNumSuccessors() const
Definition VPlan.h:218
void swapSuccessors()
Swap successors of the block. The block must have exactly 2 successors.
Definition VPlan.h:321
size_t getNumPredecessors() const
Definition VPlan.h:219
VPlan * getPlan()
Definition VPlan.cpp:161
const VPBasicBlock * getEntryBasicBlock() const
Definition VPlan.cpp:166
VPBlockBase * getSingleSuccessor() const
Definition VPlan.h:208
const VPBlocksTy & getSuccessors() const
Definition VPlan.h:197
static auto blocksOnly(const T &Range)
Return an iterator range over Range which only includes BlockTy blocks.
Definition VPlanUtils.h:211
static void insertOnEdge(VPBlockBase *From, VPBlockBase *To, VPBlockBase *BlockPtr)
Inserts BlockPtr on the edge between From and To.
Definition VPlanUtils.h:232
static void connectBlocks(VPBlockBase *From, VPBlockBase *To, unsigned PredIdx=-1u, unsigned SuccIdx=-1u)
Connect VPBlockBases From and To bi-directionally.
Definition VPlanUtils.h:170
static void reassociateBlocks(VPBlockBase *Old, VPBlockBase *New)
Reassociate all the blocks connected to Old so that they now point to New.
Definition VPlanUtils.h:197
VPlan-based builder utility analogous to IRBuilder.
VPPhi * createScalarPhi(ArrayRef< VPValue * > IncomingValues, DebugLoc DL, const Twine &Name="")
VPInstruction * createNaryOp(unsigned Opcode, ArrayRef< VPValue * > Operands, Instruction *Inst=nullptr, const VPIRFlags &Flags={}, const VPIRMetadata &MD={}, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
Create an N-ary operation with Opcode, Operands and set Inst as its underlying Instruction.
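
A sketch of emitting a new VPlan-level instruction through VPBuilder::createNaryOp as declared above. It assumes the builder's insertion point is already set and that VPBuilder is visible through the vectorizer's VPlan headers (VPlanHelpers.h at the time of writing); the helper name is illustrative:

#include "VPlan.h"
#include "VPlanHelpers.h"
#include "llvm/IR/Instruction.h"

using namespace llvm;

// Emit "A + B" as a VPInstruction at the builder's current insertion point.
static VPInstruction *emitVPAdd(VPBuilder &Builder, VPValue *A, VPValue *B) {
  return Builder.createNaryOp(Instruction::Add, {A, B});
}
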
Canonical scalar induction phi of the vector loop.
Definition VPlan.h:3571
unsigned getNumDefinedValues() const
Returns the number of values defined by the VPDef.
Definition VPlanValue.h:432
VPValue * getVPSingleValue()
Returns the only VPValue defined by the VPDef.
Definition VPlanValue.h:405
void execute(VPTransformState &State) override
Generate the transformed value of the induction at offset StartValue (1.
VPValue * getStepValue() const
Definition VPlan.h:3790
VPValue * getStartValue() const
Definition VPlan.h:3789
A pure virtual base class for all recipes modeling header phis, including phis for first order recurr...
Definition VPlan.h:2055
virtual VPValue * getBackedgeValue()
Returns the incoming value from the loop backedge.
Definition VPlan.h:2097
VPValue * getStartValue()
Returns the start value of the phi, if one is set.
Definition VPlan.h:2086
A special type of VPBasicBlock that wraps an existing IR basic block.
Definition VPlan.h:4143
Class to record and manage LLVM IR flags.
Definition VPlan.h:609
Helper to manage IR metadata for recipes.
Definition VPlan.h:982
This is a concrete Recipe that models a single VPlan-level instruction.
Definition VPlan.h:1031
@ ComputeAnyOfResult
Compute the final result of an AnyOf reduction with select(cmp(),x,y), where one of (x,...
Definition VPlan.h:1069
@ ResumeForEpilogue
Explicit user for the resume phi of the canonical induction in the main VPlan, used by the epilogue v...
Definition VPlan.h:1118
@ ReductionStartVector
Start vector for reductions with 3 operands: the original start value, the identity value for the red...
Definition VPlan.h:1109
unsigned getOpcode() const
Definition VPlan.h:1170
VPInterleaveRecipe is a recipe for transforming an interleave group of load or stores into one wide l...
Definition VPlan.h:2661
In what follows, the term "input IR" refers to code that is fed into the vectorizer whereas the term ...
A recipe for forming partial reductions.
Definition VPlan.h:2851
detail::zippy< llvm::detail::zip_first, VPUser::const_operand_range, const_incoming_blocks_range > incoming_values_and_blocks() const
Returns an iterator range over pairs of incoming values and corresponding incoming blocks.
Definition VPlan.h:1347
VPRecipeBase is a base class modeling a sequence of one or more output IR instructions.
Definition VPlan.h:386
VPBasicBlock * getParent()
Definition VPlan.h:407
DebugLoc getDebugLoc() const
Returns the debug location of the recipe.
Definition VPlan.h:478
void moveBefore(VPBasicBlock &BB, iplist< VPRecipeBase >::iterator I)
Unlink this recipe and insert into BB before I.
void insertBefore(VPRecipeBase *InsertPos)
Insert an unlinked recipe into a basic block immediately before the specified recipe.
iplist< VPRecipeBase >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
VPRecipeBase * tryToCreateWidenRecipe(VPSingleDefRecipe *R, VFRange &Range)
Create and return a widened recipe for R if one can be created within the given VF Range.
VPValue * getBlockInMask(VPBasicBlock *VPBB) const
Returns the entry mask for block VPBB or null if the mask is all-true.
VPRecipeBase * tryToCreatePartialReduction(VPInstruction *Reduction, unsigned ScaleFactor)
Create and return a partial reduction recipe for a reduction instruction along with binary operation ...
std::optional< unsigned > getScalingForReduction(const Instruction *ExitInst)
void collectScaledReductions(VFRange &Range)
Find all possible partial reductions in the loop and track all of those that are valid so recipes can...
VPReplicateRecipe * handleReplication(VPInstruction *VPI, VFRange &Range)
Build a VPReplicateRecipe for VPI.
A recipe for handling reduction phis.
Definition VPlan.h:2414
bool isInLoop() const
Returns true, if the phi is part of an in-loop reduction.
Definition VPlan.h:2468
RecurKind getRecurrenceKind() const
Returns the recurrence kind of the reduction.
Definition VPlan.h:2462
VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks which form a Single-Entry-S...
Definition VPlan.h:4178
const VPBlockBase * getEntry() const
Definition VPlan.h:4214
VPCanonicalIVPHIRecipe * getCanonicalIV()
Returns the canonical induction recipe of the region.
Definition VPlan.h:4276
VPReplicateRecipe replicates a given instruction producing multiple scalar copies of the original sca...
Definition VPlan.h:2959
VPSingleDef is a base class for recipes for modeling a sequence of one or more output IR that define ...
Definition VPlan.h:530
Instruction * getUnderlyingInstr()
Returns the underlying instruction.
Definition VPlan.h:595
An analysis for type-inference for VPValues.
Type * inferScalarType(const VPValue *V)
Infer the type of V. Returns the scalar type of V.
This class augments VPValue with operands which provide the inverse def-use edges from VPValue's user...
Definition VPlanValue.h:207
operand_range operands()
Definition VPlanValue.h:275
void setOperand(unsigned I, VPValue *New)
Definition VPlanValue.h:251
unsigned getNumOperands() const
Definition VPlanValue.h:245
operand_iterator op_begin()
Definition VPlanValue.h:271
VPValue * getOperand(unsigned N) const
Definition VPlanValue.h:246
void addOperand(VPValue *Operand)
Definition VPlanValue.h:240
This is the base class of the VPlan Def/Use graph, used for modeling the data flow into,...
Definition VPlanValue.h:48
VPRecipeBase * getDefiningRecipe()
Returns the recipe defining this VPValue or nullptr if it is not defined by a recipe,...
Definition VPlan.cpp:131
Value * getLiveInIRValue() const
Returns the underlying IR value, if this VPValue is defined outside the scope of VPlan.
Definition VPlanValue.h:183
void replaceAllUsesWith(VPValue *New)
Definition VPlan.cpp:1377
void replaceUsesWithIf(VPValue *New, llvm::function_ref< bool(VPUser &U, unsigned Idx)> ShouldReplace)
Go through the uses list for this VPValue and make each use point to New if the callback ShouldReplac...
Definition VPlan.cpp:1381
user_range users()
Definition VPlanValue.h:134
A recipe to compute a pointer to the last element of each part of a widened memory access for widened...
Definition VPlan.h:1917
VPWidenCastRecipe is a recipe to create vector cast instructions.
Definition VPlan.h:1545
A recipe for handling GEP instructions.
Definition VPlan.h:1841
A recipe for handling phi nodes of integer and floating-point inductions, producing their vector valu...
Definition VPlan.h:2197
A common base class for widening memory operations.
Definition VPlan.h:3270
A recipe for widened phis.
Definition VPlan.h:2336
VPWidenRecipe is a recipe for producing a widened instruction using the opcode and operands of the re...
Definition VPlan.h:1497
VPlan models a candidate for vectorization, encoding various decisions taken to produce efficient outp...
Definition VPlan.h:4308
bool hasVF(ElementCount VF) const
Definition VPlan.h:4513
LLVMContext & getContext() const
Definition VPlan.h:4501
VPBasicBlock * getEntry()
Definition VPlan.h:4401
VPValue & getVectorTripCount()
The vector trip count.
Definition VPlan.h:4492
VPValue & getVFxUF()
Returns VF * UF of the vector loop region.
Definition VPlan.h:4499
VPValue & getVF()
Returns the VF of the vector loop region.
Definition VPlan.h:4495
VPValue * getTripCount() const
The trip count of the original loop.
Definition VPlan.h:4463
iterator_range< SmallSetVector< ElementCount, 2 >::iterator > vectorFactors() const
Returns an iterator range over all VFs of the plan.
Definition VPlan.h:4520
bool hasUF(unsigned UF) const
Definition VPlan.h:4531
ArrayRef< VPIRBasicBlock * > getExitBlocks() const
Return an ArrayRef containing VPIRBasicBlocks wrapping the exit blocks of the original scalar loop.
Definition VPlan.h:4453
VPValue * getConstantInt(Type *Ty, uint64_t Val, bool IsSigned=false)
Return a VPValue wrapping a ConstantInt with the given type and value.
Definition VPlan.h:4576
LLVM_ABI_FOR_TEST VPRegionBlock * getVectorLoopRegion()
Returns the VPRegionBlock of the vector loop.
Definition VPlan.cpp:1011
bool hasEarlyExit() const
Returns true if the VPlan is based on a loop with an early exit.
Definition VPlan.h:4669
InstructionCost cost(ElementCount VF, VPCostContext &Ctx)
Return the cost of this plan.
Definition VPlan.cpp:993
void resetTripCount(VPValue *NewTripCount)
Resets the trip count for the VPlan.
Definition VPlan.h:4477
VPBasicBlock * getMiddleBlock()
Returns the 'middle' block of the plan, that is the block that selects whether to execute the scalar ...
Definition VPlan.h:4426
VPValue * getOrAddLiveIn(Value *V)
Gets the live-in VPValue for V or adds a new live-in (if none exists yet) for V.
Definition VPlan.h:4555
VPBasicBlock * getScalarPreheader() const
Return the VPBasicBlock for the preheader of the scalar loop.
Definition VPlan.h:4444
void execute(VPTransformState *State)
Generate the IR code for this VPlan.
Definition VPlan.cpp:905
VPIRBasicBlock * getScalarHeader() const
Return the VPIRBasicBlock wrapping the header of the scalar loop.
Definition VPlan.h:4449
VPValue * getLiveIn(Value *V) const
Return the live-in VPValue for V, if there is one or nullptr otherwise.
Definition VPlan.h:4592
VPBasicBlock * getVectorPreheader()
Returns the preheader of the vector loop region, if one exists, or null otherwise.
Definition VPlan.h:4406
LLVM_ABI_FOR_TEST VPlan * duplicate()
Clone the current VPlan, update all VPValues of the new VPlan and cloned recipes to refer to the clon...
Definition VPlan.cpp:1153
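
A small illustrative query against a candidate plan using the VPlan accessors above:

#include "VPlan.h"

using namespace llvm;

// True if Plan models vectorization with factor VF and actually contains a
// vector loop region to execute it in.
static bool planCoversVF(VPlan &Plan, ElementCount VF) {
  return Plan.hasVF(VF) && Plan.getVectorLoopRegion() != nullptr;
}
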
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
LLVM_ABI bool hasOneUser() const
Return true if there is exactly one user of this value.
Definition Value.cpp:166
LLVM_ABI void setName(const Twine &Name)
Change the name of the value.
Definition Value.cpp:390
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:546
iterator_range< user_iterator > users()
Definition Value.h:426
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
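
The Value use/user APIs above in a short sketch, together with llvm::all_of and isa<> which also appear in this index; the helper is illustrative:

#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Value.h"

using namespace llvm;

// True if every user of V is a store - a typical use scan before deciding how
// a value must be materialized.
static bool onlyUsedByStores(const Value *V) {
  return all_of(V->users(), [](const User *U) { return isa<StoreInst>(U); });
}
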
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:202
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition DenseSet.h:175
constexpr bool hasKnownScalarFactor(const FixedOrScalableQuantity &RHS) const
Returns true if there exists a value X where RHS.multiplyCoefficientBy(X) will result in a value whos...
Definition TypeSize.h:269
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
static constexpr bool isKnownLE(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition TypeSize.h:230
constexpr bool isNonZero() const
Definition TypeSize.h:155
constexpr ScalarTy getKnownScalarFactor(const FixedOrScalableQuantity &RHS) const
Returns a value X where RHS.multiplyCoefficientBy(X) will result in a value whose quantity matches ou...
Definition TypeSize.h:277
static constexpr bool isKnownLT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition TypeSize.h:216
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:168
constexpr LeafTy multiplyCoefficientBy(ScalarTy RHS) const
Definition TypeSize.h:256
constexpr bool isFixed() const
Returns true if the quantity is not scaled by vscale.
Definition TypeSize.h:171
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
constexpr bool isZero() const
Definition TypeSize.h:153
static constexpr bool isKnownGT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition TypeSize.h:223
constexpr LeafTy divideCoefficientBy(ScalarTy RHS) const
We do not provide the '/' operator here because division for polynomial types does not work in the sa...
Definition TypeSize.h:252
static constexpr bool isKnownGE(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition TypeSize.h:237
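
ElementCount and TypeSize both derive from FixedOrScalableQuantity, so ordering must go through the isKnown* predicates above rather than a plain operator<. A minimal sketch with an illustrative helper name:

#include "llvm/Support/TypeSize.h"

using namespace llvm;

// Clamp a candidate vectorization factor to a maximum, e.g.
// clampVF(ElementCount::getFixed(16), ElementCount::getFixed(8)) yields 8.
// If the comparison is not "known" (mixed fixed/scalable), VF is kept as-is.
static ElementCount clampVF(ElementCount VF, ElementCount MaxVF) {
  return ElementCount::isKnownGT(VF, MaxVF) ? MaxVF : VF;
}
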
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
IteratorT end() const
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition raw_ostream.h:53
A raw_ostream that writes to an std::string.
Changed
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ Entry
Definition COFF.h:862
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition CallingConv.h:24
@ Tail
Attempts to make calls as fast as possible while guaranteeing that tail call optimization can always b...
Definition CallingConv.h:76
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
std::variant< std::monostate, Loc::Single, Loc::Multi, Loc::MMI, Loc::EntryValue > Variant
Alias for the std::variant specialization base class of DbgVariable.
Definition DwarfDebug.h:189
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SpecificConstantMatch, SrcTy, TargetOpcode::G_SUB > m_Neg(const SrcTy &&Src)
Matches a register negated by a G_SUB.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
OneOps_match< OpTy, Instruction::Freeze > m_Freeze(const OpTy &Op)
Matches FreezeInst.
ap_match< APInt > m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
bool match(Val *V, const Pattern &P)
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
SpecificCmpClass_match< LHS, RHS, ICmpInst > m_SpecificICmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
MatchFunctor< Val, Pattern > match_fn(const Pattern &P)
A match functor that can be used as a UnaryPredicate in functional algorithms like all_of.
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
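
The m_* matchers above compose into declarative patterns through match(). A purely illustrative example that recognizes a multiply whose second operand is a single-use add-of-one:

#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Value.h"

using namespace llvm;
using namespace llvm::PatternMatch;

// Binds X and Y on success; matching is structural, so operand order matters
// for the non-commutative m_Mul/m_Add forms used here.
static bool matchMulOfAddOne(Value *V, Value *&X, Value *&Y) {
  return match(V, m_Mul(m_Value(X), m_OneUse(m_Add(m_Value(Y), m_One()))));
}
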
class_match< const SCEVVScale > m_SCEVVScale()
bind_cst_ty m_scev_APInt(const APInt *&C)
Match an SCEV constant and bind it to an APInt.
specificloop_ty m_SpecificLoop(const Loop *L)
cst_pred_ty< is_specific_signed_cst > m_scev_SpecificSInt(int64_t V)
Match an SCEV constant with a plain signed integer (sign-extended value will be matched)
SCEVAffineAddRec_match< Op0_t, Op1_t, class_match< const Loop > > m_scev_AffineAddRec(const Op0_t &Op0, const Op1_t &Op1)
bind_ty< const SCEVMulExpr > m_scev_Mul(const SCEVMulExpr *&V)
bool match(const SCEV *S, const Pattern &P)
SCEVBinaryExpr_match< SCEVMulExpr, Op0_t, Op1_t, SCEV::FlagAnyWrap, true > m_scev_c_Mul(const Op0_t &Op0, const Op1_t &Op1)
class_match< const SCEV > m_SCEV()
match_combine_or< AllRecipe_match< Instruction::ZExt, Op0_t >, AllRecipe_match< Instruction::SExt, Op0_t > > m_ZExtOrSExt(const Op0_t &Op0)
VPInstruction_match< VPInstruction::ExtractLastElement, Op0_t > m_ExtractLastElement(const Op0_t &Op0)
class_match< VPValue > m_VPValue()
Match an arbitrary VPValue and ignore it.
ValuesClass values(OptsTy... Options)
Helper to build a ValuesClass by forwarding a variable number of arguments as an initializer list to ...
initializer< Ty > init(const Ty &Val)
Add a small namespace to avoid name clashes with the classes used in the streaming interface.
DiagnosticInfoOptimizationBase::Argument NV
NodeAddr< InstrNode * > Instr
Definition RDFGraph.h:389
NodeAddr< PhiNode * > Phi
Definition RDFGraph.h:390
friend class Instruction
Iterator for Instructions in a BasicBlock.
Definition BasicBlock.h:73
bool isSingleScalar(const VPValue *VPV)
Returns true if VPV is a single scalar, either because it produces the same value for all lanes or on...
VPValue * getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr)
Get or create a VPValue that corresponds to the expansion of Expr.
VPBasicBlock * getFirstLoopHeader(VPlan &Plan, VPDominatorTree &VPDT)
Returns the header block of the first, top-level loop, or null if none exist.
VPIRFlags getFlagsFromIndDesc(const InductionDescriptor &ID)
Extracts and returns NoWrap and FastMath flags from the induction binop in ID.
Definition VPlanUtils.h:85
unsigned getVFScaleFactor(VPRecipeBase *R)
Get the VF scaling factor applied to the recipe's output, if the recipe has one.
const SCEV * getSCEVExprForVPValue(const VPValue *V, ScalarEvolution &SE, const Loop *L=nullptr)
Return the SCEV expression for V.
This is an optimization pass for GlobalISel generic memory operations.
LLVM_ABI bool simplifyLoop(Loop *L, DominatorTree *DT, LoopInfo *LI, ScalarEvolution *SE, AssumptionCache *AC, MemorySSAUpdater *MSSAU, bool PreserveLCSSA)
Simplify each loop in a loop nest recursively.
LLVM_ABI void ReplaceInstWithInst(BasicBlock *BB, BasicBlock::iterator &BI, Instruction *I)
Replace the instruction specified by BI with the instruction specified by I.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
@ Offset
Definition DWP.cpp:532
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
Definition STLExtras.h:829
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
LLVM_ABI Value * addRuntimeChecks(Instruction *Loc, Loop *TheLoop, const SmallVectorImpl< RuntimePointerCheck > &PointerChecks, SCEVExpander &Expander, bool HoistRuntimeChecks=false)
Add code that checks at runtime if the accessed arrays in PointerChecks overlap.
auto cast_if_present(const Y &Val)
cast_if_present<X> - Functionally identical to cast, except that a null value is accepted.
Definition Casting.h:683
LLVM_ABI bool RemoveRedundantDbgInstrs(BasicBlock *BB)
Try to remove redundant dbg.value instructions from given basic block.
LLVM_ABI_FOR_TEST cl::opt< bool > VerifyEachVPlan
LLVM_ABI std::optional< unsigned > getLoopEstimatedTripCount(Loop *L, unsigned *EstimatedLoopInvocationWeight=nullptr)
Return either:
static void reportVectorization(OptimizationRemarkEmitter *ORE, Loop *TheLoop, VectorizationFactor VF, unsigned IC)
Report successful vectorization of the loop.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1725
unsigned getLoadStoreAddressSpace(const Value *I)
A helper function that returns the address space of the pointer operand of load or store instruction.
LLVM_ABI Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1655
LLVM_ABI_FOR_TEST bool verifyVPlanIsValid(const VPlan &Plan, bool VerifyLate=false)
Verify invariants for general VPlans.
LLVM_ABI Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
InstructionCost Cost
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2472
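
enumerate() from the range helpers above pairs each element with its index; a short illustrative sketch:

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/Value.h"

using namespace llvm;

// Find the index of the first null operand, if any.
static bool findFirstNullOperand(ArrayRef<Value *> Ops, unsigned &Index) {
  for (auto [Idx, Op] : enumerate(Ops)) {
    if (!Op) {
      Index = Idx;
      return true;
    }
  }
  return false;
}
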
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
LLVM_ABI bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
OuterAnalysisManagerProxy< ModuleAnalysisManager, Function > ModuleAnalysisManagerFunctionProxy
Provide the ModuleAnalysisManager to Function proxy.
Value * getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF)
Return the runtime value for VF.
LLVM_ABI bool formLCSSARecursively(Loop &L, const DominatorTree &DT, const LoopInfo *LI, ScalarEvolution *SE)
Put a loop nest into LCSSA form.
Definition LCSSA.cpp:449
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2136
LLVM_ABI bool shouldOptimizeForSize(const MachineFunction *MF, ProfileSummaryInfo *PSI, const MachineBlockFrequencyInfo *BFI, PGSOQueryType QueryType=PGSOQueryType::Other)
Returns true if machine function MF is suggested to be size-optimized based on the profile.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:632
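
make_early_inc_range is the idiom for erasing elements while iterating over them. A sketch combining it with wouldInstructionBeTriviallyDead from this index; the single forward pass is deliberately simplistic:

#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/Transforms/Utils/Local.h"

using namespace llvm;

// Erase unused, side-effect-free instructions; the early-inc range keeps the
// loop valid while instructions are removed from under it.
static void removeTriviallyDead(BasicBlock &BB, const TargetLibraryInfo *TLI) {
  for (Instruction &I : make_early_inc_range(BB))
    if (I.use_empty() && wouldInstructionBeTriviallyDead(&I, TLI))
      I.eraseFromParent();
}
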
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:284
Align getLoadStoreAlignment(const Value *I)
A helper function that returns the alignment of load or store instruction.
iterator_range< df_iterator< VPBlockShallowTraversalWrapper< VPBlockBase * > > > vp_depth_first_shallow(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order.
Definition VPlanCFG.h:216
LLVM_ABI bool VerifySCEV
LLVM_ABI bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr, bool UseVariableInfo=true, bool IgnoreUBImplyingAttrs=true)
Return true if the instruction does not have any effects besides calculating the result and does not ...
bool isa_and_nonnull(const Y &Val)
Definition Casting.h:676
iterator_range< df_iterator< VPBlockDeepTraversalWrapper< VPBlockBase * > > > vp_depth_first_deep(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order while traversing t...
Definition VPlanCFG.h:243
SmallVector< VPRegisterUsage, 8 > calculateRegisterUsageForPlan(VPlan &Plan, ArrayRef< ElementCount > VFs, const TargetTransformInfo &TTI, const SmallPtrSetImpl< const Value * > &ValuesToIgnore)
Estimate the register usage for Plan and vectorization factors in VFs by calculating the highest numb...
unsigned Log2_64(uint64_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:337
LLVM_ABI void setBranchWeights(Instruction &I, ArrayRef< uint32_t > Weights, bool IsExpected, bool ElideAllZero=false)
Create a new branch_weights metadata node and add or overwrite a prof metadata reference to instructi...
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1732
void collectEphemeralRecipesForVPlan(VPlan &Plan, DenseSet< VPRecipeBase * > &EphRecipes)
auto reverse(ContainerTy &&C)
Definition STLExtras.h:406
bool containsIrreducibleCFG(RPOTraversalT &RPOTraversal, const LoopInfoT &LI)
Return true if the control flow in RPOTraversal is irreducible.
Definition CFG.h:149
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1622
LLVM_ABI_FOR_TEST cl::opt< bool > EnableWideActiveLaneMask
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
LLVM_ABI cl::opt< bool > EnableLoopVectorization
LLVM_ABI bool wouldInstructionBeTriviallyDead(const Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction would have no side effects if it was not used.
Definition Local.cpp:421
FunctionAddr VTableAddr Count
Definition InstrProf.h:139
SmallVector< ValueTypeFromRangeType< R >, Size > to_vector(R &&Range)
Given a range of type R, iterate the entire range and return a SmallVector with elements of the vecto...
Type * toVectorizedTy(Type *Ty, ElementCount EC)
A helper for converting to vectorized types.
LLVM_ABI void llvm_unreachable_internal(const char *msg=nullptr, const char *file=nullptr, unsigned line=0)
This function calls abort(), and prints the optional message to stderr.
bool canConstantBeExtended(const APInt *C, Type *NarrowType, TTI::PartialReductionExtendKind ExtKind)
Check if a constant CI can be safely treated as having been extended from a narrower type with the gi...
Definition VPlan.cpp:1718
T * find_singleton(R &&Range, Predicate P, bool AllowRepeats=false)
Return the single value in Range that satisfies P(<member of Range> *, AllowRepeats)->T * returning n...
Definition STLExtras.h:1787
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
cl::opt< unsigned > ForceTargetInstructionCost
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition Format.h:129
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
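
The integer helpers in this index (isPowerOf2_64, Log2_64, divideCeil, alignTo) carry much of the trip-count and cost arithmetic. A tiny worked sketch with an illustrative helper:

#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstdint>

// Round a known trip count up to whole vector iterations, e.g. 100 iterations
// at VF=8 need divideCeil(100, 8) == 13 vector iterations.
static uint64_t numVectorIterations(uint64_t TripCount, uint64_t VF) {
  assert(llvm::isPowerOf2_64(VF) && "expected a power-of-two VF");
  return llvm::divideCeil(TripCount, VF);
}
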
bool canVectorizeTy(Type *Ty)
Returns true if Ty is a valid vector element type, void, or an unpacked literal struct where all elem...
TargetTransformInfo TTI
static void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag, OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I=nullptr, DebugLoc DL={})
Reports an informative message: print Msg for debugging purposes as well as an optimization remark.
LLVM_ABI bool isAssignmentTrackingEnabled(const Module &M)
Return true if assignment tracking is enabled for module M.
RecurKind
These are the kinds of recurrences that we support.
@ Or
Bitwise or logical OR of integers.
@ FMulAdd
Sum of float products with llvm.fmuladd(a * b + sum).
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
LLVM_ABI Value * getRecurrenceIdentity(RecurKind K, Type *Tp, FastMathFlags FMF)
Given information about a recurrence kind, return the identity for the @llvm.vector....
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
LLVM_ABI void reportVectorizationFailure(const StringRef DebugMsg, const StringRef OREMsg, const StringRef ORETag, OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I=nullptr)
Reports a vectorization failure: print DebugMsg for debugging purposes along with the corresponding o...
DWARFExpression::Operation Op
ScalarEpilogueLowering
@ CM_ScalarEpilogueNotAllowedLowTripLoop
@ CM_ScalarEpilogueNotNeededUsePredicate
@ CM_ScalarEpilogueNotAllowedOptSize
@ CM_ScalarEpilogueAllowed
@ CM_ScalarEpilogueNotAllowedUsePredicate
LLVM_ABI bool isGuaranteedNotToBeUndefOrPoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Return true if this function can prove that V does not have undef bits and is never poison.
ArrayRef(const T &OneElt) -> ArrayRef< T >
Value * createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF, int64_t Step)
Return a value for Step multiplied by VF.
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="", bool Before=false)
Split the specified block at the specified instruction.
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1758
auto predecessors(const MachineBasicBlock *BB)
iterator_range< pointer_iterator< WrappedIteratorT > > make_pointer_range(RangeT &&Range)
Definition iterator.h:363
cl::opt< bool > EnableVPlanNativePath
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
ArrayRef< Type * > getContainedTypes(Type *const &Ty)
Returns the types contained in Ty.
LLVM_ABI Value * addDiffRuntimeChecks(Instruction *Loc, ArrayRef< PointerDiffInfo > Checks, SCEVExpander &Expander, function_ref< Value *(IRBuilderBase &, unsigned)> GetVF, unsigned IC)
bool pred_empty(const BasicBlock *BB)
Definition CFG.h:119
@ DataAndControlFlowWithoutRuntimeCheck
Use predicate to control both data and control flow, but modify the trip count so that a runtime over...
@ None
Don't use tail folding.
@ DataWithEVL
Use predicated EVL instructions for tail-folding.
@ DataAndControlFlow
Use predicate to control both data and control flow.
@ DataWithoutLaneMask
Same as Data, but avoids using the get.active.lane.mask intrinsic to calculate the mask and instead i...
@ Data
Use predicate only to mask operations on data in the loop.
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
LLVM_ABI bool hasBranchWeightMD(const Instruction &I)
Checks if an instructions has Branch Weight Metadata.
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition Hashing.h:592
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition bit.h:330
Type * toVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
std::unique_ptr< VPlan > VPlanPtr
Definition VPlan.h:76
constexpr detail::IsaCheckPredicate< Types... > IsaPred
Function object wrapper for the llvm::isa type check.
Definition Casting.h:866
LLVM_ABI MapVector< Instruction *, uint64_t > computeMinimumValueSizes(ArrayRef< BasicBlock * > Blocks, DemandedBits &DB, const TargetTransformInfo *TTI=nullptr)
Compute a map of integer instructions to their minimum legal type size.
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition Hashing.h:466
LLVM_ABI cl::opt< bool > EnableLoopInterleaving
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:869
#define N
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
A special type used by analysis passes to provide an address that identifies that particular analysis...
Definition Analysis.h:29
static LLVM_ABI void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
An information struct used to provide DenseMap with the various necessary components for a given valu...
Encapsulate information regarding vectorization of a loop and its epilogue.
EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF, ElementCount EVF, unsigned EUF, VPlan &EpiloguePlan)
A class that represents two vectorization factors (initialized with 0 by default).
static FixedScalableVFPair getNone()
This holds details about a histogram operation – a load -> update -> store sequence where each lane i...
Incoming for lane mask phi as machine instruction, incoming register Reg and incoming block Block are...
TargetLibraryInfo * TLI
LLVM_ABI LoopVectorizeResult runImpl(Function &F)
LLVM_ABI bool processLoop(Loop *L)
ProfileSummaryInfo * PSI
LoopAccessInfoManager * LAIs
LLVM_ABI void printPipeline(raw_ostream &OS, function_ref< StringRef(StringRef)> MapClassName2PassName)
LLVM_ABI LoopVectorizePass(LoopVectorizeOptions Opts={})
BlockFrequencyInfo * BFI
ScalarEvolution * SE
AssumptionCache * AC
LLVM_ABI PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
OptimizationRemarkEmitter * ORE
TargetTransformInfo * TTI
Storage for information about made changes.
A chain of instructions that form a partial reduction.
Instruction * Reduction
The top-level binary operation that forms the reduction to a scalar after the loop body.
Instruction * ExtendA
The extension of each of the inner binary operation's operands.
A CRTP mix-in to automatically provide informational APIs needed for passes.
Definition PassManager.h:69
A marker analysis to determine if extra passes should be run after loop vectorization.
static LLVM_ABI AnalysisKey Key
Holds the VFShape for a specific scalar to vector function mapping.
std::optional< unsigned > getParamIndexForOptionalMask() const
Instruction Set Architecture.
Encapsulates information needed to describe a parameter.
A range of powers-of-2 vectorization factors with fixed start and adjustable end.
ElementCount End
Struct to hold various analysis needed for cost computations.
unsigned getPredBlockCostDivisor(BasicBlock *BB) const
LoopVectorizationCostModel & CM
bool isLegacyUniformAfterVectorization(Instruction *I, ElementCount VF) const
Return true if I is considered uniform-after-vectorization in the legacy cost model for VF.
bool skipCostComputation(Instruction *UI, bool IsVector) const
Return true if the cost for UI shouldn't be computed, e.g.
InstructionCost getLegacyCost(Instruction *UI, ElementCount VF) const
Return the cost for UI with VF using the legacy cost model as fallback until computing the cost of al...
TargetTransformInfo::TargetCostKind CostKind
SmallPtrSet< Instruction *, 8 > SkipCostComputation
A recipe for handling first-order recurrence phis.
Definition VPlan.h:2378
A struct that represents some properties of the register usage of a loop.
VPTransformState holds information passed down when "executing" a VPlan, needed for generating the ou...
A recipe for widening select instructions.
Definition VPlan.h:1794
static void materializeBroadcasts(VPlan &Plan)
Add explicit broadcasts for live-ins and VPValues defined in Plan's entry block if they are used as v...
static void materializePacksAndUnpacks(VPlan &Plan)
Add explicit Build[Struct]Vector recipes to Pack multiple scalar values into vectors and Unpack recip...
static LLVM_ABI_FOR_TEST std::unique_ptr< VPlan > buildVPlan0(Loop *TheLoop, LoopInfo &LI, Type *InductionTy, DebugLoc IVDL, PredicatedScalarEvolution &PSE, LoopVersioning *LVer=nullptr)
Create a base VPlan0, serving as the common starting point for all later candidates.
static void materializeBackedgeTakenCount(VPlan &Plan, VPBasicBlock *VectorPH)
Materialize the backedge-taken count to be computed explicitly using VPInstructions.
static void optimizeInductionExitUsers(VPlan &Plan, DenseMap< VPValue *, VPValue * > &EndValues, ScalarEvolution &SE)
If there's a single exit block, optimize its phi recipes that use exiting IV values by feeding them p...
static LLVM_ABI_FOR_TEST void handleEarlyExits(VPlan &Plan, bool HasUncountableExit)
Update Plan to account for all early exits.
static void addScalarResumePhis(VPlan &Plan, VPRecipeBuilder &Builder, DenseMap< VPValue *, VPValue * > &IVEndValues)
Create resume phis in the scalar preheader for first-order recurrences, reductions and inductions,...
static void canonicalizeEVLLoops(VPlan &Plan)
Transform EVL loops to use variable-length stepping after region dissolution.
static void dropPoisonGeneratingRecipes(VPlan &Plan, const std::function< bool(BasicBlock *)> &BlockNeedsPredication)
Drop poison flags from recipes that may generate a poison value that is used after vectorization,...
static void createInterleaveGroups(VPlan &Plan, const SmallPtrSetImpl< const InterleaveGroup< Instruction > * > &InterleaveGroups, VPRecipeBuilder &RecipeBuilder, const bool &ScalarEpilogueAllowed)
static bool runPass(bool(*Transform)(VPlan &, ArgsTy...), VPlan &Plan, typename std::remove_reference< ArgsTy >::type &...Args)
Helper to run a VPlan transform Transform on VPlan, forwarding extra arguments to the transform.
static void addBranchWeightToMiddleTerminator(VPlan &Plan, ElementCount VF, std::optional< unsigned > VScaleForTuning)
Add branch weight metadata, if the Plan's middle block is terminated by a BranchOnCond recipe.
static void narrowInterleaveGroups(VPlan &Plan, ElementCount VF, TypeSize VectorRegWidth)
Try to convert a plan with interleave groups with VF elements to a plan with the interleave groups re...
static void unrollByUF(VPlan &Plan, unsigned UF)
Explicitly unroll Plan by UF.
static DenseMap< const SCEV *, Value * > expandSCEVs(VPlan &Plan, ScalarEvolution &SE)
Expand VPExpandSCEVRecipes in Plan's entry block.
static void convertToConcreteRecipes(VPlan &Plan)
Lower abstract recipes to concrete ones, that can be codegen'd.
static void addMinimumIterationCheck(VPlan &Plan, ElementCount VF, unsigned UF, ElementCount MinProfitableTripCount, bool RequiresScalarEpilogue, bool TailFolded, bool CheckNeededWithTailFolding, Loop *OrigLoop, const uint32_t *MinItersBypassWeights, DebugLoc DL, ScalarEvolution &SE)
static void convertToAbstractRecipes(VPlan &Plan, VPCostContext &Ctx, VFRange &Range)
This function converts initial recipes to the abstract recipes and clamps Range based on cost model f...
static void materializeConstantVectorTripCount(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
static LLVM_ABI_FOR_TEST bool tryToConvertVPInstructionsToVPRecipes(VPlan &Plan, function_ref< const InductionDescriptor *(PHINode *)> GetIntOrFpInductionDescriptor, const TargetLibraryInfo &TLI)
Replaces the VPInstructions in Plan with corresponding widen recipes.
static void addExitUsersForFirstOrderRecurrences(VPlan &Plan, VFRange &Range)
Handle users in the original exit block for first-order recurrences.
static DenseMap< VPBasicBlock *, VPValue * > introduceMasksAndLinearize(VPlan &Plan, bool FoldTail)
Predicate and linearize the control-flow in the only loop region of Plan.
static void addExplicitVectorLength(VPlan &Plan, const std::optional< unsigned > &MaxEVLSafeElements)
Add a VPEVLBasedIVPHIRecipe and related recipes to Plan and replaces all uses except the canonical IV...
static void replaceSymbolicStrides(VPlan &Plan, PredicatedScalarEvolution &PSE, const DenseMap< Value *, const SCEV * > &StridesMap)
Replace symbolic strides from StridesMap in Plan with constants when possible.
static bool handleMaxMinNumReductions(VPlan &Plan)
Check if Plan contains any FMaxNum or FMinNum reductions.
static void removeBranchOnConst(VPlan &Plan)
Remove BranchOnCond recipes with true or false conditions together with removing dead edges to their ...
static LLVM_ABI_FOR_TEST void createLoopRegions(VPlan &Plan)
Replace loops in Plan's flat CFG with VPRegionBlocks, turning Plan's flat CFG into a hierarchical CFG...
static void removeDeadRecipes(VPlan &Plan)
Remove dead recipes from Plan.
static void attachCheckBlock(VPlan &Plan, Value *Cond, BasicBlock *CheckBlock, bool AddBranchWeights)
Wrap runtime check block CheckBlock in a VPIRBB and Cond in a VPValue and connect the block to Plan,...
static void materializeVectorTripCount(VPlan &Plan, VPBasicBlock *VectorPHVPBB, bool TailByMasking, bool RequiresScalarEpilogue)
Materialize vector trip count computations to a set of VPInstructions.
static void simplifyRecipes(VPlan &Plan)
Perform instcombine-like simplifications on recipes in Plan.
static void replicateByVF(VPlan &Plan, ElementCount VF)
Replace each replicating VPReplicateRecipe and VPInstruction outside of any replicate region in Plan ...
static void clearReductionWrapFlags(VPlan &Plan)
Clear NSW/NUW flags from reduction instructions if necessary.
static void cse(VPlan &Plan)
Perform common-subexpression-elimination on Plan.
static void addActiveLaneMask(VPlan &Plan, bool UseActiveLaneMaskForControlFlow, bool DataAndControlFlowWithoutRuntimeCheck)
Replace (ICMP_ULE, wide canonical IV, backedge-taken-count) checks with an (active-lane-mask recipe,...
static LLVM_ABI_FOR_TEST void optimize(VPlan &Plan)
Apply VPlan-to-VPlan optimizations to Plan, including induction recipe optimizations,...
static void dissolveLoopRegions(VPlan &Plan)
Replace loop regions with explicit CFG.
static void truncateToMinimalBitwidths(VPlan &Plan, const MapVector< Instruction *, uint64_t > &MinBWs)
Insert truncates and extends for any truncated recipe.
static bool adjustFixedOrderRecurrences(VPlan &Plan, VPBuilder &Builder)
Try to have all users of fixed-order recurrences appear after the recipe defining their previous valu...
static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Optimize Plan based on BestVF and BestUF.
static void materializeVFAndVFxUF(VPlan &Plan, VPBasicBlock *VectorPH, ElementCount VF)
Materialize VF and VFxUF to be computed explicitly using VPInstructions.
static void addMinimumVectorEpilogueIterationCheck(VPlan &Plan, Value *TripCount, Value *VectorTripCount, bool RequiresScalarEpilogue, ElementCount EpilogueVF, unsigned EpilogueUF, unsigned MainLoopStep, unsigned EpilogueLoopStep, ScalarEvolution &SE)
Add a check to Plan to see if the epilogue vector loop should be executed.
static LLVM_ABI_FOR_TEST void addMiddleCheck(VPlan &Plan, bool RequiresScalarEpilogueCheck, bool TailFolded)
If a check is needed to guard executing the scalar epilogue loop, it will be added to the middle bloc...
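
The static transform declarations above belong to the VPlanTransforms struct (VPlanTransforms.h). An illustrative chain of a few of the simpler ones, using the single-argument signatures listed above; the ordering is not the pass's actual pipeline:

#include "VPlan.h"
#include "VPlanTransforms.h"

using namespace llvm;

// Run a few cleanup transforms on a candidate plan; the real pipeline is
// driven by LoopVectorizationPlanner and interleaves many more steps.
static void cleanupPlan(VPlan &Plan) {
  VPlanTransforms::simplifyRecipes(Plan);
  VPlanTransforms::removeDeadRecipes(Plan);
  VPlanTransforms::cse(Plan);
}
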
TODO: The following VectorizationFactor was pulled out of LoopVectorizationCostModel class.
InstructionCost Cost
Cost of the loop with that width.
ElementCount MinProfitableTripCount
The minimum trip count required to make vectorization profitable, e.g.
ElementCount Width
Vector width with best cost.
InstructionCost ScalarCost
Cost of the scalar loop.
static VectorizationFactor Disabled()
Width 1 means no vectorization, cost 0 means uncomputed cost.
static LLVM_ABI bool HoistRuntimeChecks