1//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10// and generates target-independent LLVM-IR.
11// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12// of instructions in order to estimate the profitability of vectorization.
13//
14// The loop vectorizer combines consecutive loop iterations into a single
15// 'wide' iteration. After this transformation the index is incremented
16// by the SIMD vector width, and not by one.
17//
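// As an illustration only (a sketch, not code from this pass): for a vector
// width of 4, a scalar loop such as
//   for (int i = 0; i < n; ++i)
//     a[i] = b[i] + c[i];
// is conceptually rewritten so that the induction variable advances by 4 and
// each iteration performs one wide load, add and store covering lanes
// i .. i+3, with the remaining n % 4 iterations handled by a scalar epilogue
// (or by a predicated, tail-folded vector iteration).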
18// This pass has four parts:
19// 1. The main loop pass that drives the different parts.
20// 2. LoopVectorizationLegality - A unit that checks for the legality
21// of the vectorization.
22// 3. InnerLoopVectorizer - A unit that performs the actual
23// widening of instructions.
24// 4. LoopVectorizationCostModel - A unit that checks for the profitability
25// of vectorization. It decides on the optimal vector width, which
26// can be one, if vectorization is not profitable.
27//
28// There is a development effort going on to migrate the loop vectorizer to the
29// VPlan infrastructure and to introduce outer loop vectorization support (see
30// docs/VectorizationPlan.rst and
31// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32// purpose, we temporarily introduced the VPlan-native vectorization path: an
33// alternative vectorization path that is natively implemented on top of the
34// VPlan infrastructure. See EnableVPlanNativePath for enabling.
35//
36//===----------------------------------------------------------------------===//
37//
38// The reduction-variable vectorization is based on the paper:
39// D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40//
41// Variable uniformity checks are inspired by:
42// Karrenberg, R. and Hack, S. Whole Function Vectorization.
43//
44// The interleaved access vectorization is based on the paper:
45// Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
46// Data for SIMD
47//
48// Other ideas/concepts are from:
49// A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50//
51// S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
52// Vectorizing Compilers.
53//
54//===----------------------------------------------------------------------===//
55
58#include "VPRecipeBuilder.h"
59#include "VPlan.h"
60#include "VPlanAnalysis.h"
61#include "VPlanCFG.h"
62#include "VPlanHelpers.h"
63#include "VPlanPatternMatch.h"
64#include "VPlanTransforms.h"
65#include "VPlanUtils.h"
66#include "VPlanVerifier.h"
67#include "llvm/ADT/APInt.h"
68#include "llvm/ADT/ArrayRef.h"
69#include "llvm/ADT/DenseMap.h"
71#include "llvm/ADT/Hashing.h"
72#include "llvm/ADT/MapVector.h"
73#include "llvm/ADT/STLExtras.h"
76#include "llvm/ADT/Statistic.h"
77#include "llvm/ADT/StringRef.h"
78#include "llvm/ADT/Twine.h"
79#include "llvm/ADT/TypeSwitch.h"
84#include "llvm/Analysis/CFG.h"
101#include "llvm/IR/Attributes.h"
102#include "llvm/IR/BasicBlock.h"
103#include "llvm/IR/CFG.h"
104#include "llvm/IR/Constant.h"
105#include "llvm/IR/Constants.h"
106#include "llvm/IR/DataLayout.h"
107#include "llvm/IR/DebugInfo.h"
108#include "llvm/IR/DebugLoc.h"
109#include "llvm/IR/DerivedTypes.h"
111#include "llvm/IR/Dominators.h"
112#include "llvm/IR/Function.h"
113#include "llvm/IR/IRBuilder.h"
114#include "llvm/IR/InstrTypes.h"
115#include "llvm/IR/Instruction.h"
116#include "llvm/IR/Instructions.h"
118#include "llvm/IR/Intrinsics.h"
119#include "llvm/IR/MDBuilder.h"
120#include "llvm/IR/Metadata.h"
121#include "llvm/IR/Module.h"
122#include "llvm/IR/Operator.h"
123#include "llvm/IR/PatternMatch.h"
125#include "llvm/IR/Type.h"
126#include "llvm/IR/Use.h"
127#include "llvm/IR/User.h"
128#include "llvm/IR/Value.h"
129#include "llvm/IR/Verifier.h"
130#include "llvm/Support/Casting.h"
132#include "llvm/Support/Debug.h"
147#include <algorithm>
148#include <cassert>
149#include <cstdint>
150#include <functional>
151#include <iterator>
152#include <limits>
153#include <memory>
154#include <string>
155#include <tuple>
156#include <utility>
157
158using namespace llvm;
159using namespace SCEVPatternMatch;
160
161#define LV_NAME "loop-vectorize"
162#define DEBUG_TYPE LV_NAME
163
164#ifndef NDEBUG
165const char VerboseDebug[] = DEBUG_TYPE "-verbose";
166#endif
167
168STATISTIC(LoopsVectorized, "Number of loops vectorized");
169STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
170STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
171STATISTIC(LoopsEarlyExitVectorized, "Number of early exit loops vectorized");
172
174 "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
175 cl::desc("Enable vectorization of epilogue loops."));
176
178 "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
179 cl::desc("When epilogue vectorization is enabled, and a value greater than "
180 "1 is specified, forces the given VF for all applicable epilogue "
181 "loops."));
182
184 "epilogue-vectorization-minimum-VF", cl::Hidden,
185 cl::desc("Only loops with vectorization factor equal to or larger than "
186 "the specified value are considered for epilogue vectorization."));
187
188/// Loops with a known constant trip count below this number are vectorized only
189/// if no scalar iteration overheads are incurred.
191 "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
192 cl::desc("Loops with a constant trip count that is smaller than this "
193 "value are vectorized only if no scalar iteration overheads "
194 "are incurred."));
195
197 "vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
198 cl::desc("The maximum allowed number of runtime memory checks"));
199
200// Option prefer-predicate-over-epilogue indicates that an epilogue is undesired,
201// that predication is preferred, and this lists all options. I.e., the
202// vectorizer will try to fold the tail-loop (epilogue) into the vector body
203// and predicate the instructions accordingly. If tail-folding fails, there are
204// different fallback strategies depending on these values:
211} // namespace PreferPredicateTy
212
214 "prefer-predicate-over-epilogue",
217 cl::desc("Tail-folding and predication preferences over creating a scalar "
218 "epilogue loop."),
220 "scalar-epilogue",
221 "Don't tail-predicate loops, create scalar epilogue"),
223 "predicate-else-scalar-epilogue",
224 "prefer tail-folding, create scalar epilogue if tail "
225 "folding fails."),
227 "predicate-dont-vectorize",
228 "prefers tail-folding, don't attempt vectorization if "
229 "tail-folding fails.")));
230
232 "force-tail-folding-style", cl::desc("Force the tail folding style"),
235 clEnumValN(TailFoldingStyle::None, "none", "Disable tail folding"),
238 "Create lane mask for data only, using active.lane.mask intrinsic"),
240 "data-without-lane-mask",
241 "Create lane mask with compare/stepvector"),
243 "Create lane mask using active.lane.mask intrinsic, and use "
244 "it for both data and control flow"),
246 "data-and-control-without-rt-check",
247 "Similar to data-and-control, but remove the runtime check"),
249 "Use predicated EVL instructions for tail folding. If EVL "
250 "is unsupported, fallback to data-without-lane-mask.")));
251
253 "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
254 cl::desc("Maximize bandwidth when selecting vectorization factor which "
255 "will be determined by the smallest type in loop."));
256
258 "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
259 cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
260
261/// An interleave-group may need masking if it resides in a block that needs
262/// predication, or in order to mask away gaps.
264 "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
265 cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
266
268 "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
269 cl::desc("A flag that overrides the target's number of scalar registers."));
270
272 "force-target-num-vector-regs", cl::init(0), cl::Hidden,
273 cl::desc("A flag that overrides the target's number of vector registers."));
274
276 "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
277 cl::desc("A flag that overrides the target's max interleave factor for "
278 "scalar loops."));
279
281 "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
282 cl::desc("A flag that overrides the target's max interleave factor for "
283 "vectorized loops."));
284
286 "force-target-instruction-cost", cl::init(0), cl::Hidden,
287 cl::desc("A flag that overrides the target's expected cost for "
288 "an instruction to a single constant value. Mostly "
289 "useful for getting consistent testing."));
290
292 "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
293 cl::desc(
294 "Pretend that scalable vectors are supported, even if the target does "
295 "not support them. This flag should only be used for testing."));
296
298 "small-loop-cost", cl::init(20), cl::Hidden,
299 cl::desc(
300 "The cost of a loop that is considered 'small' by the interleaver."));
301
303 "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
304 cl::desc("Enable the use of the block frequency analysis to access PGO "
305 "heuristics minimizing code growth in cold regions and being more "
306 "aggressive in hot regions."));
307
308// Runtime interleave loops for load/store throughput.
310 "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
311 cl::desc(
312 "Enable runtime interleaving until load/store ports are saturated"));
313
314/// The number of stores in a loop that are allowed to need predication.
316 "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
317 cl::desc("Max number of stores to be predicated behind an if."));
318
320 "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
321 cl::desc("Count the induction variable only once when interleaving"));
322
324 "enable-cond-stores-vec", cl::init(true), cl::Hidden,
325 cl::desc("Enable if predication of stores during vectorization."));
326
328 "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
329 cl::desc("The maximum interleave count to use when interleaving a scalar "
330 "reduction in a nested loop."));
331
332static cl::opt<bool>
333 PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
335 cl::desc("Prefer in-loop vector reductions, "
336 "overriding the targets preference."));
337
339 "force-ordered-reductions", cl::init(false), cl::Hidden,
340 cl::desc("Enable the vectorisation of loops with in-order (strict) "
341 "FP reductions"));
342
344 "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
345 cl::desc(
346 "Prefer predicating a reduction operation over an after loop select."));
347
349 "enable-vplan-native-path", cl::Hidden,
350 cl::desc("Enable VPlan-native vectorization path with "
351 "support for outer loop vectorization."));
352
354 llvm::VerifyEachVPlan("vplan-verify-each",
355#ifdef EXPENSIVE_CHECKS
356 cl::init(true),
357#else
358 cl::init(false),
359#endif
361 cl::desc("Verfiy VPlans after VPlan transforms."));
362
363// This flag enables the stress testing of the VPlan H-CFG construction in the
364// VPlan-native vectorization path. It must be used in conjunction with
365// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
366// verification of the H-CFGs built.
368 "vplan-build-stress-test", cl::init(false), cl::Hidden,
369 cl::desc(
370 "Build VPlan for every supported loop nest in the function and bail "
371 "out right after the build (stress test the VPlan H-CFG construction "
372 "in the VPlan-native vectorization path)."));
373
375 "interleave-loops", cl::init(true), cl::Hidden,
376 cl::desc("Enable loop interleaving in Loop vectorization passes"));
378 "vectorize-loops", cl::init(true), cl::Hidden,
379 cl::desc("Run the Loop vectorization passes"));
380
382 "force-widen-divrem-via-safe-divisor", cl::Hidden,
383 cl::desc(
384 "Override cost based safe divisor widening for div/rem instructions"));
385
387 "vectorizer-maximize-bandwidth-for-vector-calls", cl::init(true),
389 cl::desc("Try wider VFs if they enable the use of vector variants"));
390
392 "enable-early-exit-vectorization", cl::init(true), cl::Hidden,
393 cl::desc(
394 "Enable vectorization of early exit loops with uncountable exits."));
395
397 "vectorizer-consider-reg-pressure", cl::init(false), cl::Hidden,
398 cl::desc("Discard VFs if their register pressure is too high."));
399
400// Likelihood of bypassing the vectorized loop because there are zero trips left
401// after the prologue. See `emitIterationCountCheck`.
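// Interpreted as branch weights (an estimate, not a measured probability),
// {1, 127} means the bypass to the scalar path is assumed to be taken roughly
// 1 time in 128, i.e. the vector loop is expected to execute almost always.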
402static constexpr uint32_t MinItersBypassWeights[] = {1, 127};
403
404/// A helper function that returns true if the given type is irregular. The
405/// type is irregular if its allocated size doesn't equal the store size of an
406/// element of the corresponding vector type.
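/// For example (illustrative, not from this file): on common X86 data layouts
/// x86_fp80 has a type size of 80 bits but an alloc size of 96 or 128 bits,
/// so an array of x86_fp80 is not laid out like <N x x86_fp80> and the type
/// is treated as irregular.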
407static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
408 // Determine if an array of N elements of type Ty is "bitcast compatible"
409 // with a <N x Ty> vector.
410 // This is only true if there is no padding between the array elements.
411 return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
412}
413
414/// A version of ScalarEvolution::getSmallConstantTripCount that returns an
415/// ElementCount to include loops whose trip count is a function of vscale.
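/// For example (illustrative): a loop with a constant trip count of 100 yields
/// ElementCount::getFixed(100), while a loop known to execute exactly
/// 4 * vscale iterations can yield ElementCount::getScalable(4) when the
/// multiplication is known not to wrap.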
417 const Loop *L) {
418 if (unsigned ExpectedTC = SE->getSmallConstantTripCount(L))
419 return ElementCount::getFixed(ExpectedTC);
420
421 const SCEV *BTC = SE->getBackedgeTakenCount(L);
423 return ElementCount::getFixed(0);
424
425 const SCEV *ExitCount = SE->getTripCountFromExitCount(BTC, BTC->getType(), L);
426 if (isa<SCEVVScale>(ExitCount))
428
429 const APInt *Scale;
430 if (match(ExitCount, m_scev_Mul(m_scev_APInt(Scale), m_SCEVVScale())))
431 if (cast<SCEVMulExpr>(ExitCount)->hasNoUnsignedWrap())
432 if (Scale->getActiveBits() <= 32)
434
435 return ElementCount::getFixed(0);
436}
437
438/// Returns "best known" trip count, which is either a valid positive trip count
439/// or std::nullopt when an estimate cannot be made (including when the trip
440/// count would overflow), for the specified loop \p L as defined by the
441/// following procedure:
442/// 1) Returns exact trip count if it is known.
443/// 2) Returns expected trip count according to profile data if any.
444/// 3) Returns upper bound estimate if known, and if \p CanUseConstantMax.
445/// 4) Returns std::nullopt if all of the above failed.
446static std::optional<ElementCount>
448 bool CanUseConstantMax = true) {
449 // Check if exact trip count is known.
450 if (auto ExpectedTC = getSmallConstantTripCount(PSE.getSE(), L))
451 return ExpectedTC;
452
453 // Check if there is an expected trip count available from profile data.
455 if (auto EstimatedTC = getLoopEstimatedTripCount(L))
456 return ElementCount::getFixed(*EstimatedTC);
457
458 if (!CanUseConstantMax)
459 return std::nullopt;
460
461 // Check if upper bound estimate is known.
462 if (unsigned ExpectedTC = PSE.getSmallConstantMaxTripCount())
463 return ElementCount::getFixed(ExpectedTC);
464
465 return std::nullopt;
466}
467
468namespace {
469// Forward declare GeneratedRTChecks.
470class GeneratedRTChecks;
471
472using SCEV2ValueTy = DenseMap<const SCEV *, Value *>;
473} // namespace
474
475namespace llvm {
476
478
479/// InnerLoopVectorizer vectorizes loops which contain only one basic
480/// block to a specified vectorization factor (VF).
481/// This class performs the widening of scalars into vectors, or multiple
482/// scalars. This class also implements the following features:
483/// * It inserts an epilogue loop for handling loops that don't have iteration
484/// counts that are known to be a multiple of the vectorization factor.
485/// * It handles the code generation for reduction variables.
486/// * Scalarization (implementation using scalars) of un-vectorizable
487/// instructions.
488/// InnerLoopVectorizer does not perform any vectorization-legality
489/// checks, and relies on the caller to check for the different legality
490/// aspects. The InnerLoopVectorizer relies on the
491/// LoopVectorizationLegality class to provide information about the induction
492/// and reduction variables that were found to a given vectorization factor.
494public:
498 ElementCount VecWidth, unsigned UnrollFactor,
500 ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks,
501 VPlan &Plan)
502 : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TTI(TTI), AC(AC),
503 VF(VecWidth), UF(UnrollFactor), Builder(PSE.getSE()->getContext()),
506 Plan.getVectorLoopRegion()->getSinglePredecessor())) {}
507
508 virtual ~InnerLoopVectorizer() = default;
509
510 /// Creates a basic block for the scalar preheader. Both
511 /// EpilogueVectorizerMainLoop and EpilogueVectorizerEpilogueLoop override
512 /// the method to create additional blocks and checks needed for epilogue
513 /// vectorization.
515
516 /// Fix the vectorized code, taking care of header phi's, and more.
518
519 /// Fix the non-induction PHIs in \p Plan.
521
522 /// Returns the original loop trip count.
523 Value *getTripCount() const { return TripCount; }
524
525 /// Used to set the trip count after ILV's construction and after the
526 /// preheader block has been executed. Note that this always holds the trip
527 /// count of the original loop for both main loop and epilogue vectorization.
528 void setTripCount(Value *TC) { TripCount = TC; }
529
530protected:
532
533 /// Create and return a new IR basic block for the scalar preheader whose name
534 /// is prefixed with \p Prefix.
536
537 /// Allow subclasses to override and print debug traces before/after vplan
538 /// execution, when trace information is requested.
539 virtual void printDebugTracesAtStart() {}
540 virtual void printDebugTracesAtEnd() {}
541
542 /// The original loop.
544
545 /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
546 /// dynamic knowledge to simplify SCEV expressions and converts them to a
547 /// more usable form.
549
550 /// Loop Info.
552
553 /// Dominator Tree.
555
556 /// Target Transform Info.
558
559 /// Assumption Cache.
561
562 /// The vectorization SIMD factor to use. Each vector will have this many
563 /// vector elements.
565
566 /// The vectorization unroll factor to use. Each scalar is vectorized to this
567 /// many different vector instructions.
568 unsigned UF;
569
570 /// The builder that we use
572
573 // --- Vectorization state ---
574
575 /// Trip count of the original loop.
576 Value *TripCount = nullptr;
577
578 /// The profitability analysis.
580
581 /// BFI and PSI are used to check for profile guided size optimizations.
584
585 /// Structure to hold information about generated runtime checks, responsible
586 /// for cleaning the checks, if vectorization turns out unprofitable.
587 GeneratedRTChecks &RTChecks;
588
590
591 /// The vector preheader block of \p Plan, used as target for check blocks
592 /// introduced during skeleton creation.
594};
595
596/// Encapsulate information regarding vectorization of a loop and its epilogue.
597/// This information is meant to be updated and used across two stages of
598/// epilogue vectorization.
601 unsigned MainLoopUF = 0;
603 unsigned EpilogueUF = 0;
606 Value *TripCount = nullptr;
609
611 ElementCount EVF, unsigned EUF,
613 : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF),
615 assert(EUF == 1 &&
616 "A high UF for the epilogue loop is likely not beneficial.");
617 }
618};
619
620/// An extension of the inner loop vectorizer that creates a skeleton for a
621/// vectorized loop that has its epilogue (residual) also vectorized.
622 /// The idea is to run the VPlan on a given loop twice: first to set up the
623 /// skeleton and vectorize the main loop, and second to complete the skeleton
624/// from the first step and vectorize the epilogue. This is achieved by
625/// deriving two concrete strategy classes from this base class and invoking
626/// them in succession from the loop vectorizer planner.
628public:
639
640 /// Holds and updates state information required to vectorize the main loop
641 /// and its epilogue in two separate passes. This setup helps us avoid
642 /// regenerating and recomputing runtime safety checks. It also helps us to
643 /// shorten the iteration-count-check path length for the cases where the
644 /// iteration count of the loop is so small that the main vector loop is
645 /// completely skipped.
647
648protected:
650};
651
652/// A specialized derived class of inner loop vectorizer that performs
653/// vectorization of *main* loops in the process of vectorizing loops and their
654/// epilogues.
656public:
668 /// Implements the interface for creating a vectorized skeleton using the
669 /// *main loop* strategy (i.e., the first pass of VPlan execution).
671
672protected:
673 /// Introduces a new VPIRBasicBlock for \p CheckIRBB to Plan between the
674 /// vector preheader and its predecessor, also connecting the new block to the
675 /// scalar preheader.
676 void introduceCheckBlockInVPlan(BasicBlock *CheckIRBB);
677
678 // Create a check to see if the main vector loop should be executed
680 unsigned UF) const;
681
682 /// Emits an iteration count bypass check once for the main loop (when \p
683 /// ForEpilogue is false) and once for the epilogue loop (when \p
684 /// ForEpilogue is true).
686 bool ForEpilogue);
687 void printDebugTracesAtStart() override;
688 void printDebugTracesAtEnd() override;
689};
690
691// A specialized derived class of inner loop vectorizer that performs
692// vectorization of *epilogue* loops in the process of vectorizing loops and
693// their epilogues.
695public:
705 /// Implements the interface for creating a vectorized skeleton using the
706 /// *epilogue loop* strategy (i.e., the second pass of VPlan execution).
708
709protected:
710 void printDebugTracesAtStart() override;
711 void printDebugTracesAtEnd() override;
712};
713} // end namespace llvm
714
715/// Look for a meaningful debug location on the instruction or its operands.
717 if (!I)
718 return DebugLoc::getUnknown();
719
721 if (I->getDebugLoc() != Empty)
722 return I->getDebugLoc();
723
724 for (Use &Op : I->operands()) {
725 if (Instruction *OpInst = dyn_cast<Instruction>(Op))
726 if (OpInst->getDebugLoc() != Empty)
727 return OpInst->getDebugLoc();
728 }
729
730 return I->getDebugLoc();
731}
732
733/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
734/// is passed, the message relates to that particular instruction.
735#ifndef NDEBUG
736static void debugVectorizationMessage(const StringRef Prefix,
737 const StringRef DebugMsg,
738 Instruction *I) {
739 dbgs() << "LV: " << Prefix << DebugMsg;
740 if (I != nullptr)
741 dbgs() << " " << *I;
742 else
743 dbgs() << '.';
744 dbgs() << '\n';
745}
746#endif
747
748/// Create an analysis remark that explains why vectorization failed
749///
750/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
751/// RemarkName is the identifier for the remark. If \p I is passed it is an
752/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
753/// the location of the remark. If \p DL is passed, use it as debug location for
754/// the remark. \return the remark object that can be streamed to.
755static OptimizationRemarkAnalysis
756createLVAnalysis(const char *PassName, StringRef RemarkName, Loop *TheLoop,
757 Instruction *I, DebugLoc DL = {}) {
758 BasicBlock *CodeRegion = I ? I->getParent() : TheLoop->getHeader();
759 // If debug location is attached to the instruction, use it. Otherwise if DL
760 // was not provided, use the loop's.
761 if (I && I->getDebugLoc())
762 DL = I->getDebugLoc();
763 else if (!DL)
764 DL = TheLoop->getStartLoc();
765
766 return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
767}
768
769namespace llvm {
770
771/// Return a value for Step multiplied by VF.
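/// For example (illustrative): with VF = vscale x 4 and Step = 2 this yields
/// the runtime value vscale * 8, emitted as a left shift of vscale by 3 since
/// Step is a power of 2; with a fixed VF = 8 and Step = 3 it yields the
/// constant 24.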
773 int64_t Step) {
774 assert(Ty->isIntegerTy() && "Expected an integer step");
775 ElementCount VFxStep = VF.multiplyCoefficientBy(Step);
776 assert(isPowerOf2_64(VF.getKnownMinValue()) && "must pass power-of-2 VF");
777 if (VF.isScalable() && isPowerOf2_64(Step)) {
778 return B.CreateShl(
779 B.CreateVScale(Ty),
780 ConstantInt::get(Ty, Log2_64(VFxStep.getKnownMinValue())), "", true);
781 }
782 return B.CreateElementCount(Ty, VFxStep);
783}
784
785/// Return the runtime value for VF.
787 return B.CreateElementCount(Ty, VF);
788}
789
791 const StringRef OREMsg, const StringRef ORETag,
792 OptimizationRemarkEmitter *ORE, Loop *TheLoop,
793 Instruction *I) {
794 LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
795 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
796 ORE->emit(
797 createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
798 << "loop not vectorized: " << OREMsg);
799}
800
801/// Reports an informative message: print \p Msg for debugging purposes as well
802/// as an optimization remark. Uses either \p I as location of the remark, or
803 /// otherwise \p TheLoop. If \p DL is passed, use it as debug location for the
804 /// remark.
805static void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
807 Loop *TheLoop, Instruction *I = nullptr,
808 DebugLoc DL = {}) {
810 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
811 ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop,
812 I, DL)
813 << Msg);
814}
815
816/// Report successful vectorization of the loop. In case an outer loop is
817/// vectorized, prepend "outer" to the vectorization remark.
819 VectorizationFactor VF, unsigned IC) {
821 "Vectorizing: ", TheLoop->isInnermost() ? "innermost loop" : "outer loop",
822 nullptr));
823 StringRef LoopType = TheLoop->isInnermost() ? "" : "outer ";
824 ORE->emit([&]() {
825 return OptimizationRemark(LV_NAME, "Vectorized", TheLoop->getStartLoc(),
826 TheLoop->getHeader())
827 << "vectorized " << LoopType << "loop (vectorization width: "
828 << ore::NV("VectorizationFactor", VF.Width)
829 << ", interleaved count: " << ore::NV("InterleaveCount", IC) << ")";
830 });
831}
832
833} // end namespace llvm
834
835namespace llvm {
836
837// Loop vectorization cost-model hints how the scalar epilogue loop should be
838// lowered.
840
841 // The default: allowing scalar epilogues.
843
844 // Vectorization with OptForSize: don't allow epilogues.
846
847 // A special case of vectorization with OptForSize: loops with a very small
848 // trip count are considered for vectorization under OptForSize, thereby
849 // making sure the cost of their loop body is dominant, free of runtime
850 // guards and scalar iteration overheads.
852
853 // Loop hint predicate indicating an epilogue is undesired.
855
856 // Directive indicating we must either tail fold or not vectorize
858};
859
860/// LoopVectorizationCostModel - estimates the expected speedups due to
861/// vectorization.
862/// In many cases vectorization is not profitable. This can happen because of
863/// a number of reasons. In this class we mainly attempt to predict the
864/// expected speedup/slowdowns due to the supported instruction set. We use the
865/// TargetTransformInfo to query the different backends for the cost of
866/// different operations.
869
870public:
881 : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
882 TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
883 Hints(Hints), InterleaveInfo(IAI) {
884 if (TTI.supportsScalableVectors() || ForceTargetSupportsScalableVectors)
885 initializeVScaleForTuning();
887 // Query this against the original loop and save it here because the profile
888 // of the original loop header may change as the transformation happens.
889 OptForSize = llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
891 }
892
893 /// \return An upper bound for the vectorization factors (both fixed and
894 /// scalable). If the factors are 0, vectorization and interleaving should be
895 /// avoided up front.
896 FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);
897
898 /// \return True if runtime checks are required for vectorization, and false
899 /// otherwise.
900 bool runtimeChecksRequired();
901
902 /// Setup cost-based decisions for user vectorization factor.
903 /// \return true if the UserVF is a feasible VF to be chosen.
906 return expectedCost(UserVF).isValid();
907 }
908
909 /// \return True if maximizing vector bandwidth is enabled by the target or
910 /// user options, for the given register kind.
911 bool useMaxBandwidth(TargetTransformInfo::RegisterKind RegKind);
912
913 /// \return True if register pressure should be considered for the given VF.
914 bool shouldConsiderRegPressureForVF(ElementCount VF);
915
916 /// \return The size (in bits) of the smallest and widest types in the code
917 /// that needs to be vectorized. We ignore values that remain scalar such as
918 /// 64 bit loop indices.
919 std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
920
921 /// Memory access instruction may be vectorized in more than one way.
922 /// Form of instruction after vectorization depends on cost.
923 /// This function takes cost-based decisions for Load/Store instructions
924 /// and collects them in a map. This decisions map is used for building
925 /// the lists of loop-uniform and loop-scalar instructions.
926 /// The calculated cost is saved with widening decision in order to
927 /// avoid redundant calculations.
928 void setCostBasedWideningDecision(ElementCount VF);
929
930 /// A call may be vectorized in different ways depending on whether we have
931 /// vectorized variants available and whether the target supports masking.
932 /// This function analyzes all calls in the function at the supplied VF,
933 /// makes a decision based on the costs of available options, and stores that
934 /// decision in a map for use in planning and plan execution.
935 void setVectorizedCallDecision(ElementCount VF);
936
937 /// Collect values we want to ignore in the cost model.
938 void collectValuesToIgnore();
939
940 /// Collect all element types in the loop for which widening is needed.
941 void collectElementTypesForWidening();
942
943 /// Split reductions into those that happen in the loop, and those that happen
944 /// outside. In-loop reductions are collected into InLoopReductions.
945 void collectInLoopReductions();
946
947 /// Returns true if we should use strict in-order reductions for the given
948 /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
949 /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
950 /// of FP operations.
951 bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const {
952 return !Hints->allowReordering() && RdxDesc.isOrdered();
953 }
954
955 /// \returns The smallest bitwidth each instruction can be represented with.
956 /// The vector equivalents of these instructions should be truncated to this
957 /// type.
959 return MinBWs;
960 }
961
962 /// \returns True if it is more profitable to scalarize instruction \p I for
963 /// vectorization factor \p VF.
965 assert(VF.isVector() &&
966 "Profitable to scalarize relevant only for VF > 1.");
967 assert(
968 TheLoop->isInnermost() &&
969 "cost-model should not be used for outer loops (in VPlan-native path)");
970
971 auto Scalars = InstsToScalarize.find(VF);
972 assert(Scalars != InstsToScalarize.end() &&
973 "VF not yet analyzed for scalarization profitability");
974 return Scalars->second.contains(I);
975 }
976
977 /// Returns true if \p I is known to be uniform after vectorization.
979 assert(
980 TheLoop->isInnermost() &&
981 "cost-model should not be used for outer loops (in VPlan-native path)");
982 // Pseudo probe needs to be duplicated for each unrolled iteration and
983 // vector lane so that profiled loop trip count can be accurately
984 // accumulated instead of being under counted.
986 return false;
987
988 if (VF.isScalar())
989 return true;
990
991 auto UniformsPerVF = Uniforms.find(VF);
992 assert(UniformsPerVF != Uniforms.end() &&
993 "VF not yet analyzed for uniformity");
994 return UniformsPerVF->second.count(I);
995 }
996
997 /// Returns true if \p I is known to be scalar after vectorization.
999 assert(
1000 TheLoop->isInnermost() &&
1001 "cost-model should not be used for outer loops (in VPlan-native path)");
1002 if (VF.isScalar())
1003 return true;
1004
1005 auto ScalarsPerVF = Scalars.find(VF);
1006 assert(ScalarsPerVF != Scalars.end() &&
1007 "Scalar values are not calculated for VF");
1008 return ScalarsPerVF->second.count(I);
1009 }
1010
1011 /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1012 /// for vectorization factor \p VF.
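/// For example (illustrative): if an i32 add is only ever used by an i8
/// truncate, MinBWs may record a minimal bitwidth of 8 for it, and the widened
/// add can then be performed on <VF x i8> rather than <VF x i32>.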
1014 // Truncs must truncate at most to their destination type.
1015 if (isa_and_nonnull<TruncInst>(I) && MinBWs.contains(I) &&
1016 I->getType()->getScalarSizeInBits() < MinBWs.lookup(I))
1017 return false;
1018 return VF.isVector() && MinBWs.contains(I) &&
1019 !isProfitableToScalarize(I, VF) &&
1021 }
1022
1023 /// Decision that was taken during cost calculation for memory instruction.
1026 CM_Widen, // For consecutive accesses with stride +1.
1027 CM_Widen_Reverse, // For consecutive accesses with stride -1.
1033 };
1034
1035 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1036 /// instruction \p I and vector width \p VF.
1039 assert(VF.isVector() && "Expected VF >=2");
1040 WideningDecisions[{I, VF}] = {W, Cost};
1041 }
1042
1043 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1044 /// interleaving group \p Grp and vector width \p VF.
1048 assert(VF.isVector() && "Expected VF >=2");
1049 /// Broadcast this decision to all instructions inside the group.
1050 /// When interleaving, the cost will only be assigned to one instruction, the
1051 /// insert position. For other cases, add the appropriate fraction of the
1052 /// total cost to each instruction. This ensures accurate costs are used,
1053 /// even if the insert position instruction is not used.
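/// For example (illustrative): for a group with 4 members and a total cost of
/// 8, CM_Interleave assigns 8 to the insert position and 0 to the other
/// members, while any other decision assigns 8 / 4 = 2 to every member.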
1054 InstructionCost InsertPosCost = Cost;
1055 InstructionCost OtherMemberCost = 0;
1056 if (W != CM_Interleave)
1057 OtherMemberCost = InsertPosCost = Cost / Grp->getNumMembers();
1058
1059 for (unsigned Idx = 0; Idx < Grp->getFactor(); ++Idx) {
1060 if (auto *I = Grp->getMember(Idx)) {
1061 if (Grp->getInsertPos() == I)
1062 WideningDecisions[{I, VF}] = {W, InsertPosCost};
1063 else
1064 WideningDecisions[{I, VF}] = {W, OtherMemberCost};
1065 }
1066 }
1067 }
1068
1069 /// Return the cost model decision for the given instruction \p I and vector
1070 /// width \p VF. Return CM_Unknown if this instruction did not pass
1071 /// through the cost modeling.
1073 assert(VF.isVector() && "Expected VF to be a vector VF");
1074 assert(
1075 TheLoop->isInnermost() &&
1076 "cost-model should not be used for outer loops (in VPlan-native path)");
1077
1078 std::pair<Instruction *, ElementCount> InstOnVF(I, VF);
1079 auto Itr = WideningDecisions.find(InstOnVF);
1080 if (Itr == WideningDecisions.end())
1081 return CM_Unknown;
1082 return Itr->second.first;
1083 }
1084
1085 /// Return the vectorization cost for the given instruction \p I and vector
1086 /// width \p VF.
1088 assert(VF.isVector() && "Expected VF >=2");
1089 std::pair<Instruction *, ElementCount> InstOnVF(I, VF);
1090 assert(WideningDecisions.contains(InstOnVF) &&
1091 "The cost is not calculated");
1092 return WideningDecisions[InstOnVF].second;
1093 }
1094
1102
1104 Function *Variant, Intrinsic::ID IID,
1105 std::optional<unsigned> MaskPos,
1107 assert(!VF.isScalar() && "Expected vector VF");
1108 CallWideningDecisions[{CI, VF}] = {Kind, Variant, IID, MaskPos, Cost};
1109 }
1110
1112 ElementCount VF) const {
1113 assert(!VF.isScalar() && "Expected vector VF");
1114 auto I = CallWideningDecisions.find({CI, VF});
1115 if (I == CallWideningDecisions.end())
1116 return {CM_Unknown, nullptr, Intrinsic::not_intrinsic, std::nullopt, 0};
1117 return I->second;
1118 }
1119
1120 /// Return True if instruction \p I is an optimizable truncate whose operand
1121 /// is an induction variable. Such a truncate will be removed by adding a new
1122 /// induction variable with the destination type.
1124 // If the instruction is not a truncate, return false.
1125 auto *Trunc = dyn_cast<TruncInst>(I);
1126 if (!Trunc)
1127 return false;
1128
1129 // Get the source and destination types of the truncate.
1130 Type *SrcTy = toVectorTy(Trunc->getSrcTy(), VF);
1131 Type *DestTy = toVectorTy(Trunc->getDestTy(), VF);
1132
1133 // If the truncate is free for the given types, return false. Replacing a
1134 // free truncate with an induction variable would add an induction variable
1135 // update instruction to each iteration of the loop. We exclude from this
1136 // check the primary induction variable since it will need an update
1137 // instruction regardless.
1138 Value *Op = Trunc->getOperand(0);
1139 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1140 return false;
1141
1142 // If the truncated value is not an induction variable, return false.
1143 return Legal->isInductionPhi(Op);
1144 }
1145
1146 /// Collects the instructions to scalarize for each predicated instruction in
1147 /// the loop.
1148 void collectInstsToScalarize(ElementCount VF);
1149
1150 /// Collect values that will not be widened, including Uniforms, Scalars, and
1151 /// Instructions to Scalarize for the given \p VF.
1152 /// The sets depend on CM decision for Load/Store instructions
1153 /// that may be vectorized as interleave, gather-scatter or scalarized.
1154 /// Also make a decision on what to do about call instructions in the loop
1155 /// at that VF -- scalarize, call a known vector routine, or call a
1156 /// vector intrinsic.
1158 // Do the analysis once.
1159 if (VF.isScalar() || Uniforms.contains(VF))
1160 return;
1162 collectLoopUniforms(VF);
1164 collectLoopScalars(VF);
1166 }
1167
1168 /// Returns true if the target machine supports masked store operation
1169 /// for the given \p DataType and kind of access to \p Ptr.
1170 bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment,
1171 unsigned AddressSpace) const {
1172 return Legal->isConsecutivePtr(DataType, Ptr) &&
1173 TTI.isLegalMaskedStore(DataType, Alignment, AddressSpace);
1174 }
1175
1176 /// Returns true if the target machine supports masked load operation
1177 /// for the given \p DataType and kind of access to \p Ptr.
1178 bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment,
1179 unsigned AddressSpace) const {
1180 return Legal->isConsecutivePtr(DataType, Ptr) &&
1181 TTI.isLegalMaskedLoad(DataType, Alignment, AddressSpace);
1182 }
1183
1184 /// Returns true if the target machine can represent \p V as a masked gather
1185 /// or scatter operation.
1187 bool LI = isa<LoadInst>(V);
1188 bool SI = isa<StoreInst>(V);
1189 if (!LI && !SI)
1190 return false;
1191 auto *Ty = getLoadStoreType(V);
1193 if (VF.isVector())
1194 Ty = VectorType::get(Ty, VF);
1195 return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
1196 (SI && TTI.isLegalMaskedScatter(Ty, Align));
1197 }
1198
1199 /// Returns true if the target machine supports all of the reduction
1200 /// variables found for the given VF.
1202 return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
1203 const RecurrenceDescriptor &RdxDesc = Reduction.second;
1204 return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
1205 }));
1206 }
1207
1208 /// Given costs for both strategies, return true if the scalar predication
1209 /// lowering should be used for div/rem. This incorporates an override
1210 /// option so it is not simply a cost comparison.
1212 InstructionCost SafeDivisorCost) const {
1213 switch (ForceSafeDivisor) {
1214 case cl::BOU_UNSET:
1215 return ScalarCost < SafeDivisorCost;
1216 case cl::BOU_TRUE:
1217 return false;
1218 case cl::BOU_FALSE:
1219 return true;
1220 }
1221 llvm_unreachable("impossible case value");
1222 }
1223
1224 /// Returns true if \p I is an instruction which requires predication and
1225 /// for which our chosen predication strategy is scalarization (i.e. we
1226 /// don't have an alternate strategy such as masking available).
1227 /// \p VF is the vectorization factor that will be used to vectorize \p I.
1228 bool isScalarWithPredication(Instruction *I, ElementCount VF) const;
1229
1230 /// Returns true if \p I is an instruction that needs to be predicated
1231 /// at runtime. The result is independent of the predication mechanism.
1232 /// Superset of instructions that return true for isScalarWithPredication.
1233 bool isPredicatedInst(Instruction *I) const;
1234
1235 /// A helper function that returns how much we should divide the cost of a
1236 /// predicated block by. Typically this is the reciprocal of the block
1237 /// probability, i.e. if we return X we are assuming the predicated block will
1238 /// execute once for every X iterations of the loop header so the block should
1239 /// only contribute 1/X of its cost to the total cost calculation, but when
1240 /// optimizing for code size it will just be 1 as code size costs don't depend
1241 /// on execution probabilities.
1242 ///
1243 /// TODO: We should use actual block probability here, if available.
1244 /// Currently, we always assume predicated blocks have a 50% chance of
1245 /// executing, apart from blocks that are only predicated due to tail folding.
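/// For example (illustrative): with the returned divisor of 2, a predicated
/// block whose instructions cost 10 contributes 10 / 2 = 5 to the loop cost
/// under a throughput cost kind, but the full 10 under TTI::TCK_CodeSize.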
1246 inline unsigned
1248 BasicBlock *BB) const {
1249 // If a block wasn't originally predicated but was predicated due to
1250 // e.g. tail folding, don't divide the cost. Tail folded loops may still be
1251 // predicated in the final vector loop iteration, but for most loops that
1252 // don't have low trip counts we can expect their probability to be close to
1253 // zero.
1254 if (!Legal->blockNeedsPredication(BB))
1255 return 1;
1256 return CostKind == TTI::TCK_CodeSize ? 1 : 2;
1257 }
1258
1259 /// Return the costs for our two available strategies for lowering a
1260 /// div/rem operation which requires speculating at least one lane.
1261 /// First result is for scalarization (will be invalid for scalable
1262 /// vectors); second is for the safe-divisor strategy.
1263 std::pair<InstructionCost, InstructionCost>
1264 getDivRemSpeculationCost(Instruction *I,
1265 ElementCount VF) const;
1266
1267 /// Returns true if \p I is a memory instruction with consecutive memory
1268 /// access that can be widened.
1269 bool memoryInstructionCanBeWidened(Instruction *I, ElementCount VF);
1270
1271 /// Returns true if \p I is a memory instruction in an interleaved-group
1272 /// of memory accesses that can be vectorized with wide vector loads/stores
1273 /// and shuffles.
1274 bool interleavedAccessCanBeWidened(Instruction *I, ElementCount VF) const;
1275
1276 /// Check if \p Instr belongs to any interleaved access group.
1278 return InterleaveInfo.isInterleaved(Instr);
1279 }
1280
1281 /// Get the interleaved access group that \p Instr belongs to.
1284 return InterleaveInfo.getInterleaveGroup(Instr);
1285 }
1286
1287 /// Returns true if we're required to use a scalar epilogue for at least
1288 /// the final iteration of the original loop.
1289 bool requiresScalarEpilogue(bool IsVectorizing) const {
1290 if (!isScalarEpilogueAllowed()) {
1291 LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
1292 return false;
1293 }
1294 // If we might exit from anywhere but the latch and early exit vectorization
1295 // is disabled, we must run the exiting iteration in scalar form.
1296 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch() &&
1297 !(EnableEarlyExitVectorization && Legal->hasUncountableEarlyExit())) {
1298 LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: not exiting "
1299 "from latch block\n");
1300 return true;
1301 }
1302 if (IsVectorizing && InterleaveInfo.requiresScalarEpilogue()) {
1303 LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: "
1304 "interleaved group requires scalar epilogue\n");
1305 return true;
1306 }
1307 LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
1308 return false;
1309 }
1310
1311 /// Returns true if a scalar epilogue is not allowed due to optsize or a
1312 /// loop hint annotation.
1314 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1315 }
1316
1317 /// Returns the TailFoldingStyle that is best for the current loop.
1318 TailFoldingStyle getTailFoldingStyle(bool IVUpdateMayOverflow = true) const {
1319 if (!ChosenTailFoldingStyle)
1321 return IVUpdateMayOverflow ? ChosenTailFoldingStyle->first
1322 : ChosenTailFoldingStyle->second;
1323 }
1324
1325 /// Selects and saves TailFoldingStyle for 2 options - if IV update may
1326 /// overflow or not.
1327 /// \param IsScalableVF true if scalable vector factors enabled.
1328 /// \param UserIC User specific interleave count.
1329 void setTailFoldingStyles(bool IsScalableVF, unsigned UserIC) {
1330 assert(!ChosenTailFoldingStyle && "Tail folding must not be selected yet.");
1331 if (!Legal->canFoldTailByMasking()) {
1332 ChosenTailFoldingStyle = {TailFoldingStyle::None, TailFoldingStyle::None};
1333 return;
1334 }
1335
1336 // Default to TTI preference, but allow command line override.
1337 ChosenTailFoldingStyle = {
1338 TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/true),
1339 TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/false)};
1340 if (ForceTailFoldingStyle.getNumOccurrences())
1341 ChosenTailFoldingStyle = {ForceTailFoldingStyle.getValue(),
1342 ForceTailFoldingStyle.getValue()};
1343
1344 if (ChosenTailFoldingStyle->first != TailFoldingStyle::DataWithEVL &&
1345 ChosenTailFoldingStyle->second != TailFoldingStyle::DataWithEVL)
1346 return;
1347 // Override EVL styles if needed.
1348 // FIXME: Investigate opportunity for fixed vector factor.
1349 bool EVLIsLegal = UserIC <= 1 && IsScalableVF &&
1350 TTI.hasActiveVectorLength() && !EnableVPlanNativePath;
1351 if (EVLIsLegal)
1352 return;
1353 // If for some reason EVL mode is unsupported, fallback to a scalar epilogue
1354 // if it's allowed, or DataWithoutLaneMask otherwise.
1355 if (ScalarEpilogueStatus == CM_ScalarEpilogueAllowed ||
1356 ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate)
1357 ChosenTailFoldingStyle = {TailFoldingStyle::None, TailFoldingStyle::None};
1358 else
1359 ChosenTailFoldingStyle = {TailFoldingStyle::DataWithoutLaneMask,
1361
1362 LLVM_DEBUG(
1363 dbgs() << "LV: Preference for VP intrinsics indicated. Will "
1364 "not try to generate VP Intrinsics "
1365 << (UserIC > 1
1366 ? "since interleave count specified is greater than 1.\n"
1367 : "due to non-interleaving reasons.\n"));
1368 }
1369
1370 /// Returns true if all loop blocks should be masked to fold tail loop.
1371 bool foldTailByMasking() const {
1372 // TODO: check if it is possible to check for None style independent of
1373 // IVUpdateMayOverflow flag in getTailFoldingStyle.
1375 }
1376
1377 /// Return maximum safe number of elements to be processed per vector
1378 /// iteration, which do not prevent store-load forwarding and are safe with
1379 /// regard to the memory dependencies. Required for EVL-based VPlans to
1380 /// correctly calculate AVL (application vector length) as min(remaining AVL,
1381 /// MaxSafeElements).
1382 /// TODO: need to consider adjusting cost model to use this value as a
1383 /// vectorization factor for EVL-based vectorization.
1384 std::optional<unsigned> getMaxSafeElements() const { return MaxSafeElements; }
1385
1386 /// Returns true if the instructions in this block require predication
1387 /// for any reason, e.g. because tail folding now requires a predicate
1388 /// or because the block in the original loop was predicated.
1390 return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1391 }
1392
1393 /// Returns true if VP intrinsics with explicit vector length support should
1394 /// be generated in the tail folded loop.
1398
1399 /// Returns true if the Phi is part of an inloop reduction.
1400 bool isInLoopReduction(PHINode *Phi) const {
1401 return InLoopReductions.contains(Phi);
1402 }
1403
1404 /// Returns true if the predicated reduction select should be used to set the
1405 /// incoming value for the reduction phi.
1407 // Force to use predicated reduction select since the EVL of the
1408 // second-to-last iteration might not be VF*UF.
1409 if (foldTailWithEVL())
1410 return true;
1412 TTI.preferPredicatedReductionSelect();
1413 }
1414
1415 /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1416 /// with factor VF. Return the cost of the instruction, including
1417 /// scalarization overhead if it's needed.
1418 InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;
1419
1420 /// Estimate cost of a call instruction CI if it were vectorized with factor
1421 /// VF. Return the cost of the instruction, including scalarization overhead
1422 /// if it's needed.
1423 InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF) const;
1424
1425 /// Invalidates decisions already taken by the cost model.
1427 WideningDecisions.clear();
1428 CallWideningDecisions.clear();
1429 Uniforms.clear();
1430 Scalars.clear();
1431 }
1432
1433 /// Returns the expected execution cost. The unit of the cost does
1434 /// not matter because we use the 'cost' units to compare different
1435 /// vector widths. The cost that is returned is *not* normalized by
1436 /// the factor width.
1437 InstructionCost expectedCost(ElementCount VF);
1438
1439 bool hasPredStores() const { return NumPredStores > 0; }
1440
1441 /// Returns true if epilogue vectorization is considered profitable, and
1442 /// false otherwise.
1443 /// \p VF is the vectorization factor chosen for the original loop.
1444 /// \p Multiplier is an additional scaling factor applied to VF before
1445 /// comparing to EpilogueVectorizationMinVF.
1446 bool isEpilogueVectorizationProfitable(const ElementCount VF,
1447 const unsigned IC) const;
1448
1449 /// Returns the execution time cost of an instruction for a given vector
1450 /// width. Vector width of one means scalar.
1451 InstructionCost getInstructionCost(Instruction *I, ElementCount VF);
1452
1453 /// Return the cost of instructions in an inloop reduction pattern, if I is
1454 /// part of that pattern.
1455 std::optional<InstructionCost> getReductionPatternCost(Instruction *I,
1456 ElementCount VF,
1457 Type *VectorTy) const;
1458
1459 /// Returns true if \p Op should be considered invariant and if it is
1460 /// trivially hoistable.
1461 bool shouldConsiderInvariant(Value *Op);
1462
1463 /// Return the value of vscale used for tuning the cost model.
1464 std::optional<unsigned> getVScaleForTuning() const { return VScaleForTuning; }
1465
1466private:
1467 unsigned NumPredStores = 0;
1468
1469 /// Used to store the value of vscale used for tuning the cost model. It is
1470 /// initialized during object construction.
1471 std::optional<unsigned> VScaleForTuning;
1472
1473 /// Initializes the value of vscale used for tuning the cost model. If
1474 /// vscale_range.min == vscale_range.max then return vscale_range.max, else
1475 /// return the value returned by the corresponding TTI method.
1476 void initializeVScaleForTuning() {
1477 const Function *Fn = TheLoop->getHeader()->getParent();
1478 if (Fn->hasFnAttribute(Attribute::VScaleRange)) {
1479 auto Attr = Fn->getFnAttribute(Attribute::VScaleRange);
1480 auto Min = Attr.getVScaleRangeMin();
1481 auto Max = Attr.getVScaleRangeMax();
1482 if (Max && Min == Max) {
1483 VScaleForTuning = Max;
1484 return;
1485 }
1486 }
1487
1488 VScaleForTuning = TTI.getVScaleForTuning();
1489 }
1490
1491 /// \return An upper bound for the vectorization factors for both
1492 /// fixed and scalable vectorization, where the minimum-known number of
1493 /// elements is a power-of-2 larger than zero. If scalable vectorization is
1494 /// disabled or unsupported, then the scalable part will be equal to
1495 /// ElementCount::getScalable(0).
1496 FixedScalableVFPair computeFeasibleMaxVF(unsigned MaxTripCount,
1497 ElementCount UserVF,
1498 bool FoldTailByMasking);
1499
1500 /// If \p VF > MaxTripCount, clamps it to the next lower VF that is <=
1501 /// MaxTripCount.
1502 ElementCount clampVFByMaxTripCount(ElementCount VF, unsigned MaxTripCount,
1503 bool FoldTailByMasking) const;
1504
1505 /// \return the maximized element count based on the target's vector
1506 /// registers and the loop trip-count, but limited to a maximum safe VF.
1507 /// This is a helper function of computeFeasibleMaxVF.
1508 ElementCount getMaximizedVFForTarget(unsigned MaxTripCount,
1509 unsigned SmallestType,
1510 unsigned WidestType,
1511 ElementCount MaxSafeVF,
1512 bool FoldTailByMasking);
1513
1514 /// Checks if scalable vectorization is supported and enabled. Caches the
1515 /// result to avoid repeated debug dumps for repeated queries.
1516 bool isScalableVectorizationAllowed();
1517
1518 /// \return the maximum legal scalable VF, based on the safe max number
1519 /// of elements.
1520 ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
1521
1522 /// Calculate vectorization cost of memory instruction \p I.
1523 InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
1524
1525 /// The cost computation for scalarized memory instruction.
1526 InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1527
1528 /// The cost computation for interleaving group of memory instructions.
1529 InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
1530
1531 /// The cost computation for Gather/Scatter instruction.
1532 InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
1533
1534 /// The cost computation for widening instruction \p I with consecutive
1535 /// memory access.
1536 InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1537
1538 /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1539 /// Load: scalar load + broadcast.
1540 /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1541 /// element)
1542 InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
1543
1544 /// Estimate the overhead of scalarizing an instruction. This is a
1545 /// convenience wrapper for the type-based getScalarizationOverhead API.
1547 ElementCount VF) const;
1548
1549 /// Returns true if an artificially high cost for emulated masked memrefs
1550 /// should be used.
1551 bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);
1552
1553 /// Map of scalar integer values to the smallest bitwidth they can be legally
1554 /// represented as. The vector equivalents of these values should be truncated
1555 /// to this type.
1556 MapVector<Instruction *, uint64_t> MinBWs;
1557
1558 /// A type representing the costs for instructions if they were to be
1559 /// scalarized rather than vectorized. The entries are Instruction-Cost
1560 /// pairs.
1561 using ScalarCostsTy = MapVector<Instruction *, InstructionCost>;
1562
1563 /// A set containing all BasicBlocks that are known to be present after
1564 /// vectorization as a predicated block.
1565 DenseMap<ElementCount, SmallPtrSet<BasicBlock *, 4>>
1566 PredicatedBBsAfterVectorization;
1567
1568 /// Records whether it is allowed to have the original scalar loop execute at
1569 /// least once. This may be needed as a fallback loop in case runtime
1570 /// aliasing/dependence checks fail, or to handle the tail/remainder
1571 /// iterations when the trip count is unknown or doesn't divide by the VF,
1572 /// or as a peel-loop to handle gaps in interleave-groups.
1573 /// Under optsize and when the trip count is very small we don't allow any
1574 /// iterations to execute in the scalar loop.
1575 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1576
1577 /// Controls the finally chosen tail folding style. The first element is used
1578 /// if the IV update may overflow, the second if it does not.
1579 std::optional<std::pair<TailFoldingStyle, TailFoldingStyle>>
1580 ChosenTailFoldingStyle;
1581
1582 /// true if scalable vectorization is supported and enabled.
1583 std::optional<bool> IsScalableVectorizationAllowed;
1584
1585 /// Maximum safe number of elements to be processed per vector iteration,
1586 /// chosen so that store-load forwarding is not prevented and memory
1587 /// dependencies are respected. Required for EVL-based vectorization, where
1588 /// this value is used as the upper bound of the safe AVL.
1589 std::optional<unsigned> MaxSafeElements;
1590
1591 /// A map holding scalar costs for different vectorization factors. The
1592 /// presence of a cost for an instruction in the mapping indicates that the
1593 /// instruction will be scalarized when vectorizing with the associated
1594 /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1595 MapVector<ElementCount, ScalarCostsTy> InstsToScalarize;
1596
1597 /// Holds the instructions known to be uniform after vectorization.
1598 /// The data is collected per VF.
1599 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
1600
1601 /// Holds the instructions known to be scalar after vectorization.
1602 /// The data is collected per VF.
1603 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
1604
1605 /// Holds the instructions (address computations) that are forced to be
1606 /// scalarized.
1607 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1608
1609 /// PHINodes of the reductions that should be expanded in-loop.
1610 SmallPtrSet<PHINode *, 4> InLoopReductions;
1611
1612 /// A map of in-loop reduction operations and their immediate chain operands.
1613 /// FIXME: This can be removed once reductions can be costed correctly in
1614 /// VPlan. This was added to allow quick lookup of the inloop operations.
1615 DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
1616
1617 /// Returns the expected difference in cost from scalarizing the expression
1618 /// feeding a predicated instruction \p PredInst. The instructions to
1619 /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1620 /// non-negative return value implies the expression will be scalarized.
1621 /// Currently, only single-use chains are considered for scalarization.
1622 InstructionCost computePredInstDiscount(Instruction *PredInst,
1623 ScalarCostsTy &ScalarCosts,
1624 ElementCount VF);
1625
1626 /// Collect the instructions that are uniform after vectorization. An
1627 /// instruction is uniform if we represent it with a single scalar value in
1628 /// the vectorized loop corresponding to each vector iteration. Examples of
1629 /// uniform instructions include pointer operands of consecutive or
1630 /// interleaved memory accesses. Note that although uniformity implies an
1631 /// instruction will be scalar, the reverse is not true. In general, a
1632 /// scalarized instruction will be represented by VF scalar values in the
1633 /// vectorized loop, each corresponding to an iteration of the original
1634 /// scalar loop.
1635 void collectLoopUniforms(ElementCount VF);
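  // For illustration: in
  //   for (i = 0; i < n; i++)
  //     a[i] = b[i] + 1;
  // the GEPs feeding the consecutive loads/stores are uniform; a single
  // scalar address per vector iteration suffices while the accesses
  // themselves are widened. By contrast, an instruction that is scalarized
  // (e.g. the address computation of a scalarized access) is represented by
  // VF scalar values, one per original iteration: scalar, but not uniform.
  // This is only a sketch of the distinction described above.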
1636
1637 /// Collect the instructions that are scalar after vectorization. An
1638 /// instruction is scalar if it is known to be uniform or will be scalarized
1639 /// during vectorization. collectLoopScalars should only add non-uniform nodes
1640 /// to the list if they are used by a load/store instruction that is marked as
1641 /// CM_Scalarize. Non-uniform scalarized instructions will be represented by
1642 /// VF values in the vectorized loop, each corresponding to an iteration of
1643 /// the original scalar loop.
1644 void collectLoopScalars(ElementCount VF);
1645
1646 /// Keeps cost model vectorization decision and cost for instructions.
1647 /// Right now it is used for memory instructions only.
1648 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
1649 std::pair<InstWidening, InstructionCost>>;
1650
1651 DecisionList WideningDecisions;
1652
1653 using CallDecisionList =
1654 DenseMap<std::pair<CallInst *, ElementCount>, CallWideningDecision>;
1655
1656 CallDecisionList CallWideningDecisions;
1657
1658 /// Returns true if \p V is expected to be vectorized and it needs to be
1659 /// extracted.
1660 bool needsExtract(Value *V, ElementCount VF) const {
1661 Instruction *I = dyn_cast<Instruction>(V);
1662 if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1663 TheLoop->isLoopInvariant(I) ||
1664 getWideningDecision(I, VF) == CM_Scalarize ||
1665 (isa<CallInst>(I) &&
1666 getCallWideningDecision(cast<CallInst>(I), VF).Kind == CM_Scalarize))
1667 return false;
1668
1669 // Assume we can vectorize V (and hence we need extraction) if the
1670 // scalars are not computed yet. This can happen, because it is called
1671 // via getScalarizationOverhead from setCostBasedWideningDecision, before
1672 // the scalars are collected. That should be a safe assumption in most
1673 // cases, because we check if the operands have vectorizable types
1674 // beforehand in LoopVectorizationLegality.
1675 return !Scalars.contains(VF) || !isScalarAfterVectorization(I, VF);
1676 };
1677
1678 /// Returns a range containing only operands needing to be extracted.
1679 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1680 ElementCount VF) const {
1681
1682 SmallPtrSet<const Value *, 4> UniqueOperands;
1683 SmallVector<Value *, 4> Res;
1684 for (Value *Op : Ops) {
1685 if (isa<Constant>(Op) || !UniqueOperands.insert(Op).second ||
1686 !needsExtract(Op, VF))
1687 continue;
1688 Res.push_back(Op);
1689 }
1690 return Res;
1691 }
1692
1693public:
1694 /// The loop that we evaluate.
1696
1697 /// Predicated scalar evolution analysis.
1699
1700 /// Loop Info analysis.
1702
1703 /// Vectorization legality.
1705
1706 /// Vector target information.
1708
1709 /// Target Library Info.
1711
1712 /// Demanded bits analysis.
1714
1715 /// Assumption cache.
1717
1718 /// Interface to emit optimization remarks.
1720
1722
1723 /// Loop Vectorize Hint.
1725
1726 /// The interleave access information contains groups of interleaved accesses
1727 /// with the same stride and close to each other.
1729
1730 /// Values to ignore in the cost model.
1732
1733 /// Values to ignore in the cost model when VF > 1.
1735
1736 /// All element types found in the loop.
1738
1739 /// The kind of cost that we are calculating
1741
1742 /// Whether this loop should be optimized for size based on function attribute
1743 /// or profile information.
1745
1746 /// The highest VF possible for this loop, without using MaxBandwidth.
1748};
1749} // end namespace llvm
1750
1751namespace {
1752/// Helper struct to manage generating runtime checks for vectorization.
1753///
1754 /// The runtime checks are created up-front in temporary blocks, un-linked from
1755 /// the existing IR, to allow better cost estimation. After deciding to
1756 /// vectorize, the checks are moved back; if we decide not to vectorize, the
1757 /// temporary blocks are removed completely.
1758class GeneratedRTChecks {
1759 /// Basic block which contains the generated SCEV checks, if any.
1760 BasicBlock *SCEVCheckBlock = nullptr;
1761
1762 /// The value representing the result of the generated SCEV checks. If it is
1763 /// nullptr no SCEV checks have been generated.
1764 Value *SCEVCheckCond = nullptr;
1765
1766 /// Basic block which contains the generated memory runtime checks, if any.
1767 BasicBlock *MemCheckBlock = nullptr;
1768
1769 /// The value representing the result of the generated memory runtime checks.
1770 /// If it is nullptr no memory runtime checks have been generated.
1771 Value *MemRuntimeCheckCond = nullptr;
1772
1773 DominatorTree *DT;
1774 LoopInfo *LI;
1776
1777 SCEVExpander SCEVExp;
1778 SCEVExpander MemCheckExp;
1779
1780 bool CostTooHigh = false;
1781
1782 Loop *OuterLoop = nullptr;
1783
1785
1786 /// The kind of cost that we are calculating
1788
1789public:
1790 GeneratedRTChecks(PredicatedScalarEvolution &PSE, DominatorTree *DT,
1793 : DT(DT), LI(LI), TTI(TTI),
1794 SCEVExp(*PSE.getSE(), DL, "scev.check", /*PreserveLCSSA=*/false),
1795 MemCheckExp(*PSE.getSE(), DL, "scev.check", /*PreserveLCSSA=*/false),
1796 PSE(PSE), CostKind(CostKind) {}
1797
1798 /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
1799 /// accurately estimate the cost of the runtime checks. The blocks are
1800 /// un-linked from the IR and are added back during vector code generation. If
1801 /// there is no vector code generation, the check blocks are removed
1802 /// completely.
1803 void create(Loop *L, const LoopAccessInfo &LAI,
1804 const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) {
1805
1806 // Hard cutoff to limit compile-time increase in case a very large number of
1807 // runtime checks needs to be generated.
1808 // TODO: Skip cutoff if the loop is guaranteed to execute, e.g. due to
1809 // profile info.
1810 CostTooHigh =
1812 if (CostTooHigh)
1813 return;
1814
1815 BasicBlock *LoopHeader = L->getHeader();
1816 BasicBlock *Preheader = L->getLoopPreheader();
1817
1818 // Use SplitBlock to create blocks for SCEV & memory runtime checks to
1819 // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
1820 // may be used by SCEVExpander. The blocks will be un-linked from their
1821 // predecessors and removed from LI & DT at the end of the function.
1822 if (!UnionPred.isAlwaysTrue()) {
1823 SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
1824 nullptr, "vector.scevcheck");
1825
1826 SCEVCheckCond = SCEVExp.expandCodeForPredicate(
1827 &UnionPred, SCEVCheckBlock->getTerminator());
1828 if (isa<Constant>(SCEVCheckCond)) {
1829 // Clean up directly after expanding the predicate to a constant, to
1830 // avoid further expansions re-using anything left over from SCEVExp.
1831 SCEVExpanderCleaner SCEVCleaner(SCEVExp);
1832 SCEVCleaner.cleanup();
1833 }
1834 }
1835
1836 const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
1837 if (RtPtrChecking.Need) {
1838 auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
1839 MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
1840 "vector.memcheck");
1841
1842 auto DiffChecks = RtPtrChecking.getDiffChecks();
1843 if (DiffChecks) {
1844 Value *RuntimeVF = nullptr;
1845 MemRuntimeCheckCond = addDiffRuntimeChecks(
1846 MemCheckBlock->getTerminator(), *DiffChecks, MemCheckExp,
1847 [VF, &RuntimeVF](IRBuilderBase &B, unsigned Bits) {
1848 if (!RuntimeVF)
1849 RuntimeVF = getRuntimeVF(B, B.getIntNTy(Bits), VF);
1850 return RuntimeVF;
1851 },
1852 IC);
1853 } else {
1854 MemRuntimeCheckCond = addRuntimeChecks(
1855 MemCheckBlock->getTerminator(), L, RtPtrChecking.getChecks(),
1857 }
1858 assert(MemRuntimeCheckCond &&
1859 "no RT checks generated although RtPtrChecking "
1860 "claimed checks are required");
1861 }
1862
1863 SCEVExp.eraseDeadInstructions(SCEVCheckCond);
1864
1865 if (!MemCheckBlock && !SCEVCheckBlock)
1866 return;
1867
1868 // Unhook the temporary block with the checks, update various places
1869 // accordingly.
1870 if (SCEVCheckBlock)
1871 SCEVCheckBlock->replaceAllUsesWith(Preheader);
1872 if (MemCheckBlock)
1873 MemCheckBlock->replaceAllUsesWith(Preheader);
1874
1875 if (SCEVCheckBlock) {
1876 SCEVCheckBlock->getTerminator()->moveBefore(
1877 Preheader->getTerminator()->getIterator());
1878 auto *UI = new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
1879 UI->setDebugLoc(DebugLoc::getTemporary());
1880 Preheader->getTerminator()->eraseFromParent();
1881 }
1882 if (MemCheckBlock) {
1883 MemCheckBlock->getTerminator()->moveBefore(
1884 Preheader->getTerminator()->getIterator());
1885 auto *UI = new UnreachableInst(Preheader->getContext(), MemCheckBlock);
1886 UI->setDebugLoc(DebugLoc::getTemporary());
1887 Preheader->getTerminator()->eraseFromParent();
1888 }
1889
1890 DT->changeImmediateDominator(LoopHeader, Preheader);
1891 if (MemCheckBlock) {
1892 DT->eraseNode(MemCheckBlock);
1893 LI->removeBlock(MemCheckBlock);
1894 }
1895 if (SCEVCheckBlock) {
1896 DT->eraseNode(SCEVCheckBlock);
1897 LI->removeBlock(SCEVCheckBlock);
1898 }
1899
1900 // Outer loop is used as part of the later cost calculations.
1901 OuterLoop = L->getParentLoop();
1902 }
1903
1904 InstructionCost getCost() {
1905 if (SCEVCheckBlock || MemCheckBlock)
1906 LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n");
1907
1908 if (CostTooHigh) {
1909 InstructionCost Cost;
1910 Cost.setInvalid();
1911 LLVM_DEBUG(dbgs() << " number of checks exceeded threshold\n");
1912 return Cost;
1913 }
1914
1915 InstructionCost RTCheckCost = 0;
1916 if (SCEVCheckBlock)
1917 for (Instruction &I : *SCEVCheckBlock) {
1918 if (SCEVCheckBlock->getTerminator() == &I)
1919 continue;
1920 InstructionCost C = TTI->getInstructionCost(&I, CostKind);
1921 LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n");
1922 RTCheckCost += C;
1923 }
1924 if (MemCheckBlock) {
1925 InstructionCost MemCheckCost = 0;
1926 for (Instruction &I : *MemCheckBlock) {
1927 if (MemCheckBlock->getTerminator() == &I)
1928 continue;
1929 InstructionCost C = TTI->getInstructionCost(&I, CostKind);
1930 LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n");
1931 MemCheckCost += C;
1932 }
1933
1934 // If the runtime memory checks are being created inside an outer loop
1935 // we should find out if these checks are outer loop invariant. If so,
1936 // the checks will likely be hoisted out and so the effective cost will
1937 // reduce according to the outer loop trip count.
1938 if (OuterLoop) {
1939 ScalarEvolution *SE = MemCheckExp.getSE();
1940 // TODO: If profitable, we could refine this further by analysing every
1941 // individual memory check, since there could be a mixture of loop
1942 // variant and invariant checks that mean the final condition is
1943 // variant.
1944 const SCEV *Cond = SE->getSCEV(MemRuntimeCheckCond);
1945 if (SE->isLoopInvariant(Cond, OuterLoop)) {
1946 // It seems reasonable to assume that we can reduce the effective
1947 // cost of the checks even when we know nothing about the trip
1948 // count. Assume that the outer loop executes at least twice.
1949 unsigned BestTripCount = 2;
1950
1951 // Get the best known TC estimate.
1952 if (auto EstimatedTC = getSmallBestKnownTC(
1953 PSE, OuterLoop, /* CanUseConstantMax = */ false))
1954 if (EstimatedTC->isFixed())
1955 BestTripCount = EstimatedTC->getFixedValue();
1956
1957 InstructionCost NewMemCheckCost = MemCheckCost / BestTripCount;
1958
1959 // Let's ensure the cost is always at least 1.
1960 NewMemCheckCost = std::max(NewMemCheckCost.getValue(),
1961 (InstructionCost::CostType)1);
1962
1963 if (BestTripCount > 1)
1965 << "We expect runtime memory checks to be hoisted "
1966 << "out of the outer loop. Cost reduced from "
1967 << MemCheckCost << " to " << NewMemCheckCost << '\n');
1968
1969 MemCheckCost = NewMemCheckCost;
1970 }
1971 }
1972
1973 RTCheckCost += MemCheckCost;
1974 }
1975
1976 if (SCEVCheckBlock || MemCheckBlock)
1977 LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost
1978 << "\n");
1979
1980 return RTCheckCost;
1981 }
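  // Worked example of the hoisting discount above (illustrative numbers): if
  // the memory checks cost 8 and the outer loop's best known trip count
  // estimate is 4, the effective per-invocation cost becomes 8 / 4 = 2. With
  // no estimate at all, the conservative assumption of 2 outer iterations
  // halves the cost instead, and the result is always clamped to at least 1.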
1982
1983 /// Remove the created SCEV & memory runtime check blocks & instructions, if
1984 /// unused.
1985 ~GeneratedRTChecks() {
1986 SCEVExpanderCleaner SCEVCleaner(SCEVExp);
1987 SCEVExpanderCleaner MemCheckCleaner(MemCheckExp);
1988 bool SCEVChecksUsed = !SCEVCheckBlock || !pred_empty(SCEVCheckBlock);
1989 bool MemChecksUsed = !MemCheckBlock || !pred_empty(MemCheckBlock);
1990 if (SCEVChecksUsed)
1991 SCEVCleaner.markResultUsed();
1992
1993 if (MemChecksUsed) {
1994 MemCheckCleaner.markResultUsed();
1995 } else {
1996 auto &SE = *MemCheckExp.getSE();
1997 // Memory runtime check generation creates compares that use expanded
1998 // values. Remove them before running the SCEVExpanderCleaners.
1999 for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) {
2000 if (MemCheckExp.isInsertedInstruction(&I))
2001 continue;
2002 SE.forgetValue(&I);
2003 I.eraseFromParent();
2004 }
2005 }
2006 MemCheckCleaner.cleanup();
2007 SCEVCleaner.cleanup();
2008
2009 if (!SCEVChecksUsed)
2010 SCEVCheckBlock->eraseFromParent();
2011 if (!MemChecksUsed)
2012 MemCheckBlock->eraseFromParent();
2013 }
2014
2015 /// Retrieves the SCEVCheckCond and SCEVCheckBlock that were generated as IR
2016 /// outside VPlan.
2017 std::pair<Value *, BasicBlock *> getSCEVChecks() const {
2018 using namespace llvm::PatternMatch;
2019 if (!SCEVCheckCond || match(SCEVCheckCond, m_ZeroInt()))
2020 return {nullptr, nullptr};
2021
2022 return {SCEVCheckCond, SCEVCheckBlock};
2023 }
2024
2025 /// Retrieves the MemCheckCond and MemCheckBlock that were generated as IR
2026 /// outside VPlan.
2027 std::pair<Value *, BasicBlock *> getMemRuntimeChecks() const {
2028 using namespace llvm::PatternMatch;
2029 if (MemRuntimeCheckCond && match(MemRuntimeCheckCond, m_ZeroInt()))
2030 return {nullptr, nullptr};
2031 return {MemRuntimeCheckCond, MemCheckBlock};
2032 }
2033
2034 /// Return true if any runtime checks have been added
2035 bool hasChecks() const {
2036 return getSCEVChecks().first || getMemRuntimeChecks().first;
2037 }
2038};
2039} // namespace
2040
2046
2051
2052// Return true if \p OuterLp is an outer loop annotated with hints for explicit
2053// vectorization. The loop needs to be annotated with #pragma omp simd
2054// simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
2055// vector length information is not provided, vectorization is not considered
2056// explicit. Interleave hints are not allowed either. These limitations will be
2057// relaxed in the future.
2058 // Please note that we are currently forced to abuse the pragma 'clang
2059// vectorize' semantics. This pragma provides *auto-vectorization hints*
2060// (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
2061// provides *explicit vectorization hints* (LV can bypass legal checks and
2062// assume that vectorization is legal). However, both hints are implemented
2063// using the same metadata (llvm.loop.vectorize, processed by
2064// LoopVectorizeHints). This will be fixed in the future when the native IR
2065// representation for pragma 'omp simd' is introduced.
2066static bool isExplicitVecOuterLoop(Loop *OuterLp,
2067 OptimizationRemarkEmitter *ORE) {
2068 assert(!OuterLp->isInnermost() && "This is not an outer loop");
2069 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
2070
2071 // Only outer loops with an explicit vectorization hint are supported.
2072 // Unannotated outer loops are ignored.
2073 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
2074 return false;
2075
2076 Function *Fn = OuterLp->getHeader()->getParent();
2077 if (!Hints.allowVectorization(Fn, OuterLp,
2078 true /*VectorizeOnlyWhenForced*/)) {
2079 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
2080 return false;
2081 }
2082
2083 if (Hints.getInterleave() > 1) {
2084 // TODO: Interleave support is future work.
2085 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
2086 "outer loops.\n");
2087 Hints.emitRemarkWithHints();
2088 return false;
2089 }
2090
2091 return true;
2092}
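// For illustration, an outer loop that this helper would accept (the exact
// source-level pragma syntax is a front-end detail, not something checked
// here):
//   #pragma clang loop vectorize(enable) vectorize_width(4)
//   for (int i = 0; i < n; i++)        // outer loop, explicitly annotated
//     for (int j = 0; j < m; j++)      // inner loop
//       a[i][j] = 0;
// An unannotated outer loop, or one that also requests interleaving, is
// rejected above.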
2093
2097 // Collect inner loops and outer loops without irreducible control flow. For
2098 // now, only collect outer loops that have explicit vectorization hints. If we
2099 // are stress testing the VPlan H-CFG construction, we collect the outermost
2100 // loop of every loop nest.
2101 if (L.isInnermost() || VPlanBuildStressTest ||
2103 LoopBlocksRPO RPOT(&L);
2104 RPOT.perform(LI);
2106 V.push_back(&L);
2107 // TODO: Collect inner loops inside marked outer loops in case
2108 // vectorization fails for the outer loop. Do not invoke
2109 // 'containsIrreducibleCFG' again for inner loops when the outer loop is
2110 // already known to be reducible. We can use an inherited attribute for
2111 // that.
2112 return;
2113 }
2114 }
2115 for (Loop *InnerL : L)
2116 collectSupportedLoops(*InnerL, LI, ORE, V);
2117}
2118
2119//===----------------------------------------------------------------------===//
2120// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
2121// LoopVectorizationCostModel and LoopVectorizationPlanner.
2122//===----------------------------------------------------------------------===//
2123
2124/// Compute the transformed value of Index at offset StartValue using step
2125/// StepValue.
2126/// For integer induction, returns StartValue + Index * StepValue.
2127/// For pointer induction, returns StartValue[Index * StepValue].
2128/// FIXME: The newly created binary instructions should contain nsw/nuw
2129/// flags, which can be found from the original scalar operations.
2130static Value *
2131 emitTransformedIndex(IRBuilderBase &B, Value *Index, Value *StartValue,
2132 Value *Step,
2133 InductionDescriptor::InductionKind InductionKind,
2134 const BinaryOperator *InductionBinOp) {
2135 using namespace llvm::PatternMatch;
2136 Type *StepTy = Step->getType();
2137 Value *CastedIndex = StepTy->isIntegerTy()
2138 ? B.CreateSExtOrTrunc(Index, StepTy)
2139 : B.CreateCast(Instruction::SIToFP, Index, StepTy);
2140 if (CastedIndex != Index) {
2141 CastedIndex->setName(CastedIndex->getName() + ".cast");
2142 Index = CastedIndex;
2143 }
2144
2145 // Note: the IR at this point is broken. We cannot use SE to create any new
2146 // SCEV and then expand it, hoping that SCEV's simplification will give us
2147 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may
2148 // lead to various SCEV crashes. So all we can do is to use builder and rely
2149 // on InstCombine for future simplifications. Here we handle some trivial
2150 // cases only.
2151 auto CreateAdd = [&B](Value *X, Value *Y) {
2152 assert(X->getType() == Y->getType() && "Types don't match!");
2153 if (match(X, m_ZeroInt()))
2154 return Y;
2155 if (match(Y, m_ZeroInt()))
2156 return X;
2157 return B.CreateAdd(X, Y);
2158 };
2159
2160 // We allow X to be a vector type, in which case Y will potentially be
2161 // splatted into a vector with the same element count.
2162 auto CreateMul = [&B](Value *X, Value *Y) {
2163 assert(X->getType()->getScalarType() == Y->getType() &&
2164 "Types don't match!");
2165 if (match(X, m_One()))
2166 return Y;
2167 if (match(Y, m_One()))
2168 return X;
2169 VectorType *XVTy = dyn_cast<VectorType>(X->getType());
2170 if (XVTy && !isa<VectorType>(Y->getType()))
2171 Y = B.CreateVectorSplat(XVTy->getElementCount(), Y);
2172 return B.CreateMul(X, Y);
2173 };
2174
2175 switch (InductionKind) {
2176 case InductionDescriptor::IK_IntInduction: {
2177 assert(!isa<VectorType>(Index->getType()) &&
2178 "Vector indices not supported for integer inductions yet");
2179 assert(Index->getType() == StartValue->getType() &&
2180 "Index type does not match StartValue type");
2181 if (isa<ConstantInt>(Step) && cast<ConstantInt>(Step)->isMinusOne())
2182 return B.CreateSub(StartValue, Index);
2183 auto *Offset = CreateMul(Index, Step);
2184 return CreateAdd(StartValue, Offset);
2185 }
2186 case InductionDescriptor::IK_PtrInduction:
2187 return B.CreatePtrAdd(StartValue, CreateMul(Index, Step));
2188 case InductionDescriptor::IK_FpInduction: {
2189 assert(!isa<VectorType>(Index->getType()) &&
2190 "Vector indices not supported for FP inductions yet");
2191 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2192 assert(InductionBinOp &&
2193 (InductionBinOp->getOpcode() == Instruction::FAdd ||
2194 InductionBinOp->getOpcode() == Instruction::FSub) &&
2195 "Original bin op should be defined for FP induction");
2196
2197 Value *MulExp = B.CreateFMul(Step, Index);
2198 return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
2199 "induction");
2200 }
2201 case InductionDescriptor::IK_NoInduction:
2202 return nullptr;
2203 }
2204 llvm_unreachable("invalid enum");
2205}
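// Worked example for the transform above (illustrative values): with an
// integer induction, StartValue = 5, Step = 3 and Index = 4 yield
// 5 + 4 * 3 = 17; with a pointer induction the same values produce a ptradd
// of StartValue by the offset 4 * 3 = 12; with an FP induction using fadd,
// the result is StartValue + Step * Index as a floating-point operation.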
2206
2207static std::optional<unsigned> getMaxVScale(const Function &F,
2208 const TargetTransformInfo &TTI) {
2209 if (std::optional<unsigned> MaxVScale = TTI.getMaxVScale())
2210 return MaxVScale;
2211
2212 if (F.hasFnAttribute(Attribute::VScaleRange))
2213 return F.getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax();
2214
2215 return std::nullopt;
2216}
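// For example, on a target that reports no fixed maximum vscale but whose
// function carries vscale_range(1,16), this returns 16; if neither source of
// information is available, std::nullopt is returned and callers must be
// conservative.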
2217
2218 /// For the given VF and UF and maximum trip count computed for the loop, return
2219 /// true if the induction variable cannot overflow in the vectorized loop, i.e.
2220 /// the runtime overflow check is known to always evaluate to false and can be
2221 /// removed.
2222 static bool isIndvarOverflowCheckKnownFalse(
2223 const LoopVectorizationCostModel *Cost,
2224 ElementCount VF, std::optional<unsigned> UF = std::nullopt) {
2225 // Always be conservative if we don't know the exact unroll factor.
2226 unsigned MaxUF = UF ? *UF : Cost->TTI.getMaxInterleaveFactor(VF);
2227
2228 IntegerType *IdxTy = Cost->Legal->getWidestInductionType();
2229 APInt MaxUIntTripCount = IdxTy->getMask();
2230
2231 // We know the runtime overflow check is known false iff the (max) trip-count
2232 // is known and (max) trip-count + (VF * UF) does not overflow in the type of
2233 // the vector loop induction variable.
2234 if (unsigned TC = Cost->PSE.getSmallConstantMaxTripCount()) {
2235 uint64_t MaxVF = VF.getKnownMinValue();
2236 if (VF.isScalable()) {
2237 std::optional<unsigned> MaxVScale =
2238 getMaxVScale(*Cost->TheFunction, Cost->TTI);
2239 if (!MaxVScale)
2240 return false;
2241 MaxVF *= *MaxVScale;
2242 }
2243
2244 return (MaxUIntTripCount - TC).ugt(MaxVF * MaxUF);
2245 }
2246
2247 return false;
2248}
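// Worked example (illustrative numbers): with an i8 widest induction type, a
// known maximum trip count of 200, a fixed VF of 8 and UF of 2, we have
// MaxUIntTripCount - TC = 255 - 200 = 55 > 8 * 2 = 16, so the overflow check
// is known false and can be dropped. With a maximum trip count of 250,
// 255 - 250 = 5 <= 16 and the runtime check must stay.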
2249
2250// Return whether we allow using masked interleave-groups (for dealing with
2251// strided loads/stores that reside in predicated blocks, or for dealing
2252// with gaps).
2254 // If an override option has been passed in for interleaved accesses, use it.
2255 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2256 return EnableMaskedInterleavedMemAccesses;
2257
2258 return TTI.enableMaskedInterleavedAccessVectorization();
2259}
2260
2262 BasicBlock *CheckIRBB) {
2263 // Note: The block with the minimum trip-count check is already connected
2264 // during earlier VPlan construction.
2265 VPBlockBase *ScalarPH = Plan.getScalarPreheader();
2266 VPBlockBase *PreVectorPH = VectorPHVPBB->getSinglePredecessor();
2267 assert(PreVectorPH->getNumSuccessors() == 2 && "Expected 2 successors");
2268 assert(PreVectorPH->getSuccessors()[0] == ScalarPH && "Unexpected successor");
2269 VPIRBasicBlock *CheckVPIRBB = Plan.createVPIRBasicBlock(CheckIRBB);
2270 VPBlockUtils::insertOnEdge(PreVectorPH, VectorPHVPBB, CheckVPIRBB);
2271 PreVectorPH = CheckVPIRBB;
2272 VPBlockUtils::connectBlocks(PreVectorPH, ScalarPH);
2273 PreVectorPH->swapSuccessors();
2274
2275 // We just connected a new block to the scalar preheader. Update all
2276 // VPPhis by adding an incoming value for it, replicating the last value.
2277 unsigned NumPredecessors = ScalarPH->getNumPredecessors();
2278 for (VPRecipeBase &R : cast<VPBasicBlock>(ScalarPH)->phis()) {
2279 assert(isa<VPPhi>(&R) && "Phi expected to be VPPhi");
2280 assert(cast<VPPhi>(&R)->getNumIncoming() == NumPredecessors - 1 &&
2281 "must have incoming values for all operands");
2282 R.addOperand(R.getOperand(NumPredecessors - 2));
2283 }
2284}
2285
2287 BasicBlock *VectorPH, ElementCount VF, unsigned UF) const {
2288 // Generate code to check if the loop's trip count is less than VF * UF, or
2289 // equal to it in case a scalar epilogue is required; this implies that the
2290 // vector trip count is zero. This check also covers the case where adding one
2291 // to the backedge-taken count overflowed leading to an incorrect trip count
2292 // of zero. In this case we will also jump to the scalar loop.
2293 auto P = Cost->requiresScalarEpilogue(VF.isVector()) ? ICmpInst::ICMP_ULE
2294 : ICmpInst::ICMP_ULT;
2295
2296 // Reuse existing vector loop preheader for TC checks.
2297 // Note that new preheader block is generated for vector loop.
2298 BasicBlock *const TCCheckBlock = VectorPH;
2300 TCCheckBlock->getContext(),
2301 InstSimplifyFolder(TCCheckBlock->getDataLayout()));
2302 Builder.SetInsertPoint(TCCheckBlock->getTerminator());
2303
2304 // If tail is to be folded, vector loop takes care of all iterations.
2306 Type *CountTy = Count->getType();
2307 Value *CheckMinIters = Builder.getFalse();
2308 auto CreateStep = [&]() -> Value * {
2309 // Create step with max(MinProTripCount, UF * VF).
2310 if (UF * VF.getKnownMinValue() >= MinProfitableTripCount.getKnownMinValue())
2311 return createStepForVF(Builder, CountTy, VF, UF);
2312
2313 Value *MinProfTC =
2314 Builder.CreateElementCount(CountTy, MinProfitableTripCount);
2315 if (!VF.isScalable())
2316 return MinProfTC;
2317 return Builder.CreateBinaryIntrinsic(
2318 Intrinsic::umax, MinProfTC, createStepForVF(Builder, CountTy, VF, UF));
2319 };
2320
2321 TailFoldingStyle Style = Cost->getTailFoldingStyle();
2322 if (Style == TailFoldingStyle::None) {
2323 Value *Step = CreateStep();
2324 ScalarEvolution &SE = *PSE.getSE();
2325 // TODO: Emit unconditional branch to vector preheader instead of
2326 // conditional branch with known condition.
2327 const SCEV *TripCountSCEV = SE.applyLoopGuards(SE.getSCEV(Count), OrigLoop);
2328 // Check if the trip count is < the step.
2329 if (SE.isKnownPredicate(P, TripCountSCEV, SE.getSCEV(Step))) {
2330 // TODO: Ensure step is at most the trip count when determining max VF and
2331 // UF, w/o tail folding.
2332 CheckMinIters = Builder.getTrue();
2333 } else if (!SE.isKnownPredicate(CmpInst::getInversePredicate(P),
2334 TripCountSCEV, SE.getSCEV(Step))) {
2335 // Generate the minimum iteration check only if we cannot prove the
2336 // check is known to be true, or known to be false.
2337 CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check");
2338 } // else step known to be < trip count, use CheckMinIters preset to false.
2339 } else if (VF.isScalable() && !TTI->isVScaleKnownToBeAPowerOfTwo() &&
2342 // vscale is not necessarily a power-of-2, which means we cannot guarantee
2343 // an overflow to zero when updating induction variables and so an
2344 // additional overflow check is required before entering the vector loop.
2345
2346 // Get the maximum unsigned value for the type.
2347 Value *MaxUIntTripCount =
2348 ConstantInt::get(CountTy, cast<IntegerType>(CountTy)->getMask());
2349 Value *LHS = Builder.CreateSub(MaxUIntTripCount, Count);
2350
2351 // Don't execute the vector loop if (UMax - n) < (VF * UF).
2352 CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, CreateStep());
2353 }
2354 return CheckMinIters;
2355}
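// For illustration (a sketch with made-up numbers): with a fixed VF of 4, UF
// of 2 and a MinProfitableTripCount of 16, CreateStep() returns
// max(16, 4 * 2) = 16, so the guard branches to the scalar loop whenever the
// trip count is below 16 (or below-or-equal, if a scalar epilogue is
// required). If SCEV can already prove the comparison either way, the check
// folds to a constant as handled above.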
2356
2357/// Replace \p VPBB with a VPIRBasicBlock wrapping \p IRBB. All recipes from \p
2358/// VPBB are moved to the end of the newly created VPIRBasicBlock. All
2359/// predecessors and successors of VPBB, if any, are rewired to the new
2360/// VPIRBasicBlock. If \p VPBB may be unreachable, \p Plan must be passed.
2362 BasicBlock *IRBB,
2363 VPlan *Plan = nullptr) {
2364 if (!Plan)
2365 Plan = VPBB->getPlan();
2366 VPIRBasicBlock *IRVPBB = Plan->createVPIRBasicBlock(IRBB);
2367 auto IP = IRVPBB->begin();
2368 for (auto &R : make_early_inc_range(VPBB->phis()))
2369 R.moveBefore(*IRVPBB, IP);
2370
2371 for (auto &R :
2373 R.moveBefore(*IRVPBB, IRVPBB->end());
2374
2375 VPBlockUtils::reassociateBlocks(VPBB, IRVPBB);
2376 // VPBB is now dead and will be cleaned up when the plan gets destroyed.
2377 return IRVPBB;
2378}
2379
2381 BasicBlock *VectorPH = OrigLoop->getLoopPreheader();
2382 assert(VectorPH && "Invalid loop structure");
2383 assert((OrigLoop->getUniqueLatchExitBlock() ||
2384 Cost->requiresScalarEpilogue(VF.isVector())) &&
2385 "loops not exiting via the latch without required epilogue?");
2386
2387 // NOTE: The Plan's scalar preheader VPBB isn't replaced with a VPIRBasicBlock
2388 // wrapping the newly created scalar preheader here at the moment, because the
2389 // Plan's scalar preheader may be unreachable at this point. Instead it is
2390 // replaced in executePlan.
2391 return SplitBlock(VectorPH, VectorPH->getTerminator(), DT, LI, nullptr,
2392 Twine(Prefix) + "scalar.ph");
2393}
2394
2395/// Return the expanded step for \p ID using \p ExpandedSCEVs to look up SCEV
2396/// expansion results.
2397 static Value *getExpandedStep(const InductionDescriptor &ID,
2398 const SCEV2ValueTy &ExpandedSCEVs) {
2399 const SCEV *Step = ID.getStep();
2400 if (auto *C = dyn_cast<SCEVConstant>(Step))
2401 return C->getValue();
2402 if (auto *U = dyn_cast<SCEVUnknown>(Step))
2403 return U->getValue();
2404 Value *V = ExpandedSCEVs.lookup(Step);
2405 assert(V && "SCEV must be expanded at this point");
2406 return V;
2407}
2408
2409/// Knowing that loop \p L executes a single vector iteration, add instructions
2410 /// that will get simplified (and thus should not have any cost) to \p
2411 /// InstsToIgnore.
2414 SmallPtrSetImpl<Instruction *> &InstsToIgnore) {
2415 auto *Cmp = L->getLatchCmpInst();
2416 if (Cmp)
2417 InstsToIgnore.insert(Cmp);
2418 for (const auto &KV : IL) {
2419 // Extract the key by hand so that it can be used in the lambda below. Note
2420 // that captured structured bindings are a C++20 extension.
2421 const PHINode *IV = KV.first;
2422
2423 // Get next iteration value of the induction variable.
2424 Instruction *IVInst =
2425 cast<Instruction>(IV->getIncomingValueForBlock(L->getLoopLatch()));
2426 if (all_of(IVInst->users(),
2427 [&](const User *U) { return U == IV || U == Cmp; }))
2428 InstsToIgnore.insert(IVInst);
2429 }
2430}
2431
2433 // Create a new IR basic block for the scalar preheader.
2434 BasicBlock *ScalarPH = createScalarPreheader("");
2435 return ScalarPH->getSinglePredecessor();
2436}
2437
2438namespace {
2439
2440struct CSEDenseMapInfo {
2441 static bool canHandle(const Instruction *I) {
2444 }
2445
2446 static inline Instruction *getEmptyKey() {
2447 return DenseMapInfo<Instruction *>::getEmptyKey();
2448 }
2449
2450 static inline Instruction *getTombstoneKey() {
2451 return DenseMapInfo<Instruction *>::getTombstoneKey();
2452 }
2453
2454 static unsigned getHashValue(const Instruction *I) {
2455 assert(canHandle(I) && "Unknown instruction!");
2456 return hash_combine(I->getOpcode(),
2457 hash_combine_range(I->operand_values()));
2458 }
2459
2460 static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
2461 if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
2462 LHS == getTombstoneKey() || RHS == getTombstoneKey())
2463 return LHS == RHS;
2464 return LHS->isIdenticalTo(RHS);
2465 }
2466};
2467
2468} // end anonymous namespace
2469
2470/// FIXME: This legacy common-subexpression-elimination routine is scheduled for
2471/// removal, in favor of the VPlan-based one.
2472static void legacyCSE(BasicBlock *BB) {
2473 // Perform simple CSE.
2474 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
2475 for (Instruction &In : llvm::make_early_inc_range(*BB)) {
2476 if (!CSEDenseMapInfo::canHandle(&In))
2477 continue;
2478
2479 // Check if we can replace this instruction with any of the
2480 // visited instructions.
2481 if (Instruction *V = CSEMap.lookup(&In)) {
2482 In.replaceAllUsesWith(V);
2483 In.eraseFromParent();
2484 continue;
2485 }
2486
2487 CSEMap[&In] = &In;
2488 }
2489}
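// For illustration: if the vector header ends up containing two identical
// instructions, e.g.
//   %a = add i64 %x, %y
//   %b = add i64 %x, %y
// the second one is RAUW'd with the first and erased. Only opcodes accepted
// by CSEDenseMapInfo::canHandle participate.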
2490
2491/// This function attempts to return a value that represents the ElementCount
2492/// at runtime. For fixed-width VFs we know this precisely at compile
2493/// time, but for scalable VFs we calculate it based on an estimate of the
2494/// vscale value.
2496 std::optional<unsigned> VScale) {
2497 unsigned EstimatedVF = VF.getKnownMinValue();
2498 if (VF.isScalable())
2499 if (VScale)
2500 EstimatedVF *= *VScale;
2501 assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1");
2502 return EstimatedVF;
2503}
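// For example, a scalable VF of <vscale x 4> with an estimated vscale of 2
// yields an estimate of 8 elements per vector iteration, while a fixed VF of
// 8 is returned as-is. The result is only an estimate for scalable VFs and
// must not be treated as exact.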
2504
2507 ElementCount VF) const {
2508 // We only need to calculate a cost if the VF is scalar; for actual vectors
2509 // we should already have a pre-calculated cost at each VF.
2510 if (!VF.isScalar())
2511 return getCallWideningDecision(CI, VF).Cost;
2512
2513 Type *RetTy = CI->getType();
2515 if (auto RedCost = getReductionPatternCost(CI, VF, RetTy))
2516 return *RedCost;
2517
2518 SmallVector<Type *, 4> Tys;
2519 for (auto &ArgOp : CI->args())
2520 Tys.push_back(ArgOp->getType());
2521
2522 InstructionCost ScalarCallCost =
2523 TTI.getCallInstrCost(CI->getCalledFunction(), RetTy, Tys, CostKind);
2524
2525 // If this is an intrinsic we may have a lower cost for it.
2526 if (getVectorIntrinsicIDForCall(CI, TLI)) {
2527 InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
2528 return std::min(ScalarCallCost, IntrinsicCost);
2529 }
2530 return ScalarCallCost;
2531}
2532
2533 static Type *maybeVectorizeType(Type *Ty, ElementCount VF) {
2534 if (VF.isScalar() || !canVectorizeTy(Ty))
2535 return Ty;
2536 return toVectorizedTy(Ty, VF);
2537}
2538
2541 ElementCount VF) const {
2542 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
2543 assert(ID && "Expected intrinsic call!");
2544 Type *RetTy = maybeVectorizeType(CI->getType(), VF);
2545 FastMathFlags FMF;
2546 if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
2547 FMF = FPMO->getFastMathFlags();
2548
2551 SmallVector<Type *> ParamTys;
2552 std::transform(FTy->param_begin(), FTy->param_end(),
2553 std::back_inserter(ParamTys),
2554 [&](Type *Ty) { return maybeVectorizeType(Ty, VF); });
2555
2556 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
2559 return TTI.getIntrinsicInstrCost(CostAttrs, CostKind);
2560}
2561
2563 // Fix widened non-induction PHIs by setting up the PHI operands.
2564 fixNonInductionPHIs(State);
2565
2566 // Don't apply optimizations below when no (vector) loop remains, as they all
2567 // require one at the moment.
2568 VPBasicBlock *HeaderVPBB =
2569 vputils::getFirstLoopHeader(*State.Plan, State.VPDT);
2570 if (!HeaderVPBB)
2571 return;
2572
2573 BasicBlock *HeaderBB = State.CFG.VPBB2IRBB[HeaderVPBB];
2574
2575 // Remove redundant induction instructions.
2576 legacyCSE(HeaderBB);
2577}
2578
2580 auto Iter = vp_depth_first_shallow(Plan.getEntry());
2582 for (VPRecipeBase &P : VPBB->phis()) {
2584 if (!VPPhi)
2585 continue;
2586 PHINode *NewPhi = cast<PHINode>(State.get(VPPhi));
2587 // Make sure the builder has a valid insert point.
2588 Builder.SetInsertPoint(NewPhi);
2589 for (const auto &[Inc, VPBB] : VPPhi->incoming_values_and_blocks())
2590 NewPhi->addIncoming(State.get(Inc), State.CFG.VPBB2IRBB[VPBB]);
2591 }
2592 }
2593}
2594
2595void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
2596 // We should not collect Scalars more than once per VF. Right now, this
2597 // function is called from collectUniformsAndScalars(), which already does
2598 // this check. Collecting Scalars for VF=1 does not make any sense.
2599 assert(VF.isVector() && !Scalars.contains(VF) &&
2600 "This function should not be visited twice for the same VF");
2601
2602 // This avoids any chances of creating a REPLICATE recipe during planning
2603 // since that would result in generation of scalarized code during execution,
2604 // which is not supported for scalable vectors.
2605 if (VF.isScalable()) {
2606 Scalars[VF].insert_range(Uniforms[VF]);
2607 return;
2608 }
2609
2611
2612 // These sets are used to seed the analysis with pointers used by memory
2613 // accesses that will remain scalar.
2615 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
2616 auto *Latch = TheLoop->getLoopLatch();
2617
2618 // A helper that returns true if the use of Ptr by MemAccess will be scalar.
2619 // The pointer operands of loads and stores will be scalar as long as the
2620 // memory access is not a gather or scatter operation. The value operand of a
2621 // store will remain scalar if the store is scalarized.
2622 auto IsScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
2623 InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
2624 assert(WideningDecision != CM_Unknown &&
2625 "Widening decision should be ready at this moment");
2626 if (auto *Store = dyn_cast<StoreInst>(MemAccess))
2627 if (Ptr == Store->getValueOperand())
2628 return WideningDecision == CM_Scalarize;
2629 assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
2630 "Ptr is neither a value or pointer operand");
2631 return WideningDecision != CM_GatherScatter;
2632 };
2633
2634 // A helper that returns true if the given value is a getelementptr
2635 // instruction contained in the loop.
2636 auto IsLoopVaryingGEP = [&](Value *V) {
2637 return isa<GetElementPtrInst>(V) && !TheLoop->isLoopInvariant(V);
2638 };
2639
2640 // A helper that evaluates a memory access's use of a pointer. If the use will
2641 // be a scalar use and the pointer is only used by memory accesses, we place
2642 // the pointer in ScalarPtrs. Otherwise, the pointer is placed in
2643 // PossibleNonScalarPtrs.
2644 auto EvaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
2645 // We only care about loop-varying getelementptr instructions contained in
2646 // the loop.
2647 if (!IsLoopVaryingGEP(Ptr))
2648 return;
2649
2650 // If the pointer has already been identified as scalar (e.g., if it was
2651 // also identified as uniform), there's nothing to do.
2652 auto *I = cast<Instruction>(Ptr);
2653 if (Worklist.count(I))
2654 return;
2655
2656 // If the use of the pointer will be a scalar use, and all users of the
2657 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
2658 // place the pointer in PossibleNonScalarPtrs.
2659 if (IsScalarUse(MemAccess, Ptr) &&
2661 ScalarPtrs.insert(I);
2662 else
2663 PossibleNonScalarPtrs.insert(I);
2664 };
2665
2666 // We seed the scalars analysis with two classes of instructions: (1)
2667 // instructions marked uniform-after-vectorization and (2) bitcast,
2668 // getelementptr and (pointer) phi instructions used by memory accesses
2669 // requiring a scalar use.
2670 //
2671 // (1) Add to the worklist all instructions that have been identified as
2672 // uniform-after-vectorization.
2673 Worklist.insert_range(Uniforms[VF]);
2674
2675 // (2) Add to the worklist all bitcast and getelementptr instructions used by
2676 // memory accesses requiring a scalar use. The pointer operands of loads and
2677 // stores will be scalar unless the operation is a gather or scatter.
2678 // The value operand of a store will remain scalar if the store is scalarized.
2679 for (auto *BB : TheLoop->blocks())
2680 for (auto &I : *BB) {
2681 if (auto *Load = dyn_cast<LoadInst>(&I)) {
2682 EvaluatePtrUse(Load, Load->getPointerOperand());
2683 } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
2684 EvaluatePtrUse(Store, Store->getPointerOperand());
2685 EvaluatePtrUse(Store, Store->getValueOperand());
2686 }
2687 }
2688 for (auto *I : ScalarPtrs)
2689 if (!PossibleNonScalarPtrs.count(I)) {
2690 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
2691 Worklist.insert(I);
2692 }
2693
2694 // Insert the forced scalars.
2695 // FIXME: Currently VPWidenPHIRecipe() often creates a dead vector
2696 // induction variable when the PHI user is scalarized.
2697 auto ForcedScalar = ForcedScalars.find(VF);
2698 if (ForcedScalar != ForcedScalars.end())
2699 for (auto *I : ForcedScalar->second) {
2700 LLVM_DEBUG(dbgs() << "LV: Found (forced) scalar instruction: " << *I << "\n");
2701 Worklist.insert(I);
2702 }
2703
2704 // Expand the worklist by looking through any bitcasts and getelementptr
2705 // instructions we've already identified as scalar. This is similar to the
2706 // expansion step in collectLoopUniforms(); however, here we're only
2707 // expanding to include additional bitcasts and getelementptr instructions.
2708 unsigned Idx = 0;
2709 while (Idx != Worklist.size()) {
2710 Instruction *Dst = Worklist[Idx++];
2711 if (!IsLoopVaryingGEP(Dst->getOperand(0)))
2712 continue;
2713 auto *Src = cast<Instruction>(Dst->getOperand(0));
2714 if (llvm::all_of(Src->users(), [&](User *U) -> bool {
2715 auto *J = cast<Instruction>(U);
2716 return !TheLoop->contains(J) || Worklist.count(J) ||
2717 ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
2718 IsScalarUse(J, Src));
2719 })) {
2720 Worklist.insert(Src);
2721 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
2722 }
2723 }
2724
2725 // An induction variable will remain scalar if all users of the induction
2726 // variable and induction variable update remain scalar.
2727 for (const auto &Induction : Legal->getInductionVars()) {
2728 auto *Ind = Induction.first;
2729 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
2730
2731 // If tail-folding is applied, the primary induction variable will be used
2732 // to feed a vector compare.
2733 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
2734 continue;
2735
2736 // Returns true if \p Indvar is a pointer induction that is used directly by
2737 // load/store instruction \p I.
2738 auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar,
2739 Instruction *I) {
2740 return Induction.second.getKind() ==
2743 Indvar == getLoadStorePointerOperand(I) && IsScalarUse(I, Indvar);
2744 };
2745
2746 // Determine if all users of the induction variable are scalar after
2747 // vectorization.
2748 bool ScalarInd = all_of(Ind->users(), [&](User *U) -> bool {
2749 auto *I = cast<Instruction>(U);
2750 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
2751 IsDirectLoadStoreFromPtrIndvar(Ind, I);
2752 });
2753 if (!ScalarInd)
2754 continue;
2755
2756 // If the induction variable update is a fixed-order recurrence, neither the
2757 // induction variable nor its update should be marked scalar after
2758 // vectorization.
2759 auto *IndUpdatePhi = dyn_cast<PHINode>(IndUpdate);
2760 if (IndUpdatePhi && Legal->isFixedOrderRecurrence(IndUpdatePhi))
2761 continue;
2762
2763 // Determine if all users of the induction variable update instruction are
2764 // scalar after vectorization.
2765 bool ScalarIndUpdate = all_of(IndUpdate->users(), [&](User *U) -> bool {
2766 auto *I = cast<Instruction>(U);
2767 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
2768 IsDirectLoadStoreFromPtrIndvar(IndUpdate, I);
2769 });
2770 if (!ScalarIndUpdate)
2771 continue;
2772
2773 // The induction variable and its update instruction will remain scalar.
2774 Worklist.insert(Ind);
2775 Worklist.insert(IndUpdate);
2776 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
2777 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
2778 << "\n");
2779 }
2780
2781 Scalars[VF].insert_range(Worklist);
2782}
2783
2784 bool LoopVectorizationCostModel::isScalarWithPredication(
2785 Instruction *I, ElementCount VF) const {
2786 if (!isPredicatedInst(I))
2787 return false;
2788
2789 // Do we have a non-scalar lowering for this predicated
2790 // instruction? No - it is scalar with predication.
2791 switch(I->getOpcode()) {
2792 default:
2793 return true;
2794 case Instruction::Call:
2795 if (VF.isScalar())
2796 return true;
2797 [[fallthrough]];
2798 case Instruction::Load:
2799 case Instruction::Store: {
2800 auto *Ptr = getLoadStorePointerOperand(I);
2801 auto *Ty = getLoadStoreType(I);
2802 unsigned AS = getLoadStoreAddressSpace(I);
2803 Type *VTy = Ty;
2804 if (VF.isVector())
2805 VTy = VectorType::get(Ty, VF);
2806 const Align Alignment = getLoadStoreAlignment(I);
2807 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment, AS) ||
2808 TTI.isLegalMaskedGather(VTy, Alignment))
2809 : !(isLegalMaskedStore(Ty, Ptr, Alignment, AS) ||
2810 TTI.isLegalMaskedScatter(VTy, Alignment));
2811 }
2812 case Instruction::UDiv:
2813 case Instruction::SDiv:
2814 case Instruction::SRem:
2815 case Instruction::URem: {
2816 // We have the option to use the safe-divisor idiom to avoid predication.
2817 // The cost based decision here will always select safe-divisor for
2818 // scalable vectors as scalarization isn't legal.
2819 const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
2820 return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost);
2821 }
2822 }
2823}
2824
2825// TODO: Fold into LoopVectorizationLegality::isMaskRequired.
2826 bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const {
2827 // TODO: We can use the loop-preheader as context point here and get
2828 // context sensitive reasoning for isSafeToSpeculativelyExecute.
2830 (isa<LoadInst, StoreInst, CallInst>(I) && !Legal->isMaskRequired(I)) ||
2832 return false;
2833
2834 // If the instruction was executed conditionally in the original scalar loop,
2835 // predication is needed with a mask whose lanes are all possibly inactive.
2836 if (Legal->blockNeedsPredication(I->getParent()))
2837 return true;
2838
2839 // If we're not folding the tail by masking, predication is unnecessary.
2840 if (!foldTailByMasking())
2841 return false;
2842
2843 // All that remains are instructions with side effects that were originally
2844 // executed unconditionally in the loop but now execute under a tail-fold
2845 // mask that is only guaranteed to have at least one active lane (the first).
2846 // If the side effects of the instruction are invariant, executing it without
2847 // the (tail-folding) mask is safe: it causes the same side effects as masked.
2848 switch(I->getOpcode()) {
2849 default:
2851 "instruction should have been considered by earlier checks");
2852 case Instruction::Call:
2853 // Side-effects of a Call are assumed to be non-invariant, needing a
2854 // (fold-tail) mask.
2855 assert(Legal->isMaskRequired(I) &&
2856 "should have returned earlier for calls not needing a mask");
2857 return true;
2858 case Instruction::Load:
2859 // If the address is loop invariant no predication is needed.
2860 return !Legal->isInvariant(getLoadStorePointerOperand(I));
2861 case Instruction::Store: {
2862 // For stores, we need to prove both speculation safety (which follows from
2863 // the same argument as loads) and that the value being stored is correct.
2864 // The easiest form of the latter is to require that all values stored are
2865 // the same.
2866 return !(Legal->isInvariant(getLoadStorePointerOperand(I)) &&
2867 TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand()));
2868 }
2869 case Instruction::UDiv:
2870 case Instruction::SDiv:
2871 case Instruction::SRem:
2872 case Instruction::URem:
2873 // If the divisor is loop-invariant no predication is needed.
2874 return !Legal->isInvariant(I->getOperand(1));
2875 }
2876}
2877
2878std::pair<InstructionCost, InstructionCost>
2879 LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I,
2880 ElementCount VF) const {
2881 assert(I->getOpcode() == Instruction::UDiv ||
2882 I->getOpcode() == Instruction::SDiv ||
2883 I->getOpcode() == Instruction::SRem ||
2884 I->getOpcode() == Instruction::URem);
2886
2887 // Scalarization isn't legal for scalable vector types
2888 InstructionCost ScalarizationCost = InstructionCost::getInvalid();
2889 if (!VF.isScalable()) {
2890 // Get the scalarization cost and scale this amount by the probability of
2891 // executing the predicated block. If the instruction is not predicated,
2892 // we fall through to the next case.
2893 ScalarizationCost = 0;
2894
2895 // These instructions have a non-void type, so account for the phi nodes
2896 // that we will create. This cost is likely to be zero. The phi node
2897 // cost, if any, should be scaled by the block probability because it
2898 // models a copy at the end of each predicated block.
2899 ScalarizationCost +=
2900 VF.getFixedValue() * TTI.getCFInstrCost(Instruction::PHI, CostKind);
2901
2902 // The cost of the non-predicated instruction.
2903 ScalarizationCost +=
2904 VF.getFixedValue() *
2905 TTI.getArithmeticInstrCost(I->getOpcode(), I->getType(), CostKind);
2906
2907 // The cost of insertelement and extractelement instructions needed for
2908 // scalarization.
2909 ScalarizationCost += getScalarizationOverhead(I, VF);
2910
2911 // Scale the cost by the probability of executing the predicated blocks.
2912 // This assumes the predicated block for each vector lane is equally
2913 // likely.
2914 ScalarizationCost =
2915 ScalarizationCost / getPredBlockCostDivisor(CostKind, I->getParent());
2916 }
2917
2918 InstructionCost SafeDivisorCost = 0;
2919 auto *VecTy = toVectorTy(I->getType(), VF);
2920 // The cost of the select guard to ensure all lanes are well defined
2921 // after we speculate above any internal control flow.
2922 SafeDivisorCost +=
2923 TTI.getCmpSelInstrCost(Instruction::Select, VecTy,
2924 toVectorTy(Type::getInt1Ty(I->getContext()), VF),
2926
2927 SmallVector<const Value *, 4> Operands(I->operand_values());
2928 SafeDivisorCost += TTI.getArithmeticInstrCost(
2929 I->getOpcode(), VecTy, CostKind,
2930 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
2931 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
2932 Operands, I);
2933 return {ScalarizationCost, SafeDivisorCost};
2934}
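// For illustration of the two strategies costed above, consider (sketch):
//   for (i = 0; i < n; i++)
//     if (d[i] != 0)
//       q[i] = a[i] / d[i];
// Scalarization emits VF guarded scalar divisions plus the extracts, inserts
// and phis that implies; the safe-divisor variant instead selects a
// known-safe divisor for the masked-off lanes, roughly
//   %safe = select <VF x i1> %mask, <VF x i32> %d, <VF x i32> splat(1)
//   %q    = udiv <VF x i32> %a, %safe
// and keeps the operation wide.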
2935
2936 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
2937 Instruction *I, ElementCount VF) const {
2938 assert(isAccessInterleaved(I) && "Expecting interleaved access.");
2940 "Decision should not be set yet.");
2941 auto *Group = getInterleavedAccessGroup(I);
2942 assert(Group && "Must have a group.");
2943 unsigned InterleaveFactor = Group->getFactor();
2944
2945 // If the instruction's allocated size doesn't equal its type size, it
2946 // requires padding and will be scalarized.
2947 auto &DL = I->getDataLayout();
2948 auto *ScalarTy = getLoadStoreType(I);
2949 if (hasIrregularType(ScalarTy, DL))
2950 return false;
2951
2952 // For scalable vectors, the interleave factors must be <= 8 since we require
2953 // the (de)interleaveN intrinsics instead of shufflevectors.
2954 if (VF.isScalable() && InterleaveFactor > 8)
2955 return false;
2956
2957 // If the group involves a non-integral pointer, we may not be able to
2958 // losslessly cast all values to a common type.
2959 bool ScalarNI = DL.isNonIntegralPointerType(ScalarTy);
2960 for (unsigned Idx = 0; Idx < InterleaveFactor; Idx++) {
2961 Instruction *Member = Group->getMember(Idx);
2962 if (!Member)
2963 continue;
2964 auto *MemberTy = getLoadStoreType(Member);
2965 bool MemberNI = DL.isNonIntegralPointerType(MemberTy);
2966 // Don't coerce non-integral pointers to integers or vice versa.
2967 if (MemberNI != ScalarNI)
2968 // TODO: Consider adding special nullptr value case here
2969 return false;
2970 if (MemberNI && ScalarNI &&
2971 ScalarTy->getPointerAddressSpace() !=
2972 MemberTy->getPointerAddressSpace())
2973 return false;
2974 }
2975
2976 // Check if masking is required.
2977 // A Group may need masking for one of two reasons: it resides in a block that
2978 // needs predication, or it was decided to use masking to deal with gaps
2979 // (either a gap at the end of a load-access that may result in a speculative
2980 // load, or any gaps in a store-access).
2981 bool PredicatedAccessRequiresMasking =
2982 blockNeedsPredicationForAnyReason(I->getParent()) &&
2983 Legal->isMaskRequired(I);
2984 bool LoadAccessWithGapsRequiresEpilogMasking =
2985 isa<LoadInst>(I) && Group->requiresScalarEpilogue() &&
2986 !isScalarEpilogueAllowed();
2987 bool StoreAccessWithGapsRequiresMasking =
2988 isa<StoreInst>(I) && !Group->isFull();
2989 if (!PredicatedAccessRequiresMasking &&
2990 !LoadAccessWithGapsRequiresEpilogMasking &&
2991 !StoreAccessWithGapsRequiresMasking)
2992 return true;
2993
2994 // If masked interleaving is required, we expect that the user/target had
2995 // enabled it, because otherwise it either wouldn't have been created or
2996 // it should have been invalidated by the CostModel.
2998 "Masked interleave-groups for predicated accesses are not enabled.");
2999
3000 if (Group->isReverse())
3001 return false;
3002
3003 // TODO: Support interleaved access that requires a gap mask for scalable VFs.
3004 bool NeedsMaskForGaps = LoadAccessWithGapsRequiresEpilogMasking ||
3005 StoreAccessWithGapsRequiresMasking;
3006 if (VF.isScalable() && NeedsMaskForGaps)
3007 return false;
3008
3009 auto *Ty = getLoadStoreType(I);
3010 const Align Alignment = getLoadStoreAlignment(I);
3011 unsigned AS = getLoadStoreAddressSpace(I);
3012 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment, AS)
3013 : TTI.isLegalMaskedStore(Ty, Alignment, AS);
3014}
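// For illustration, a factor-2 interleave group that could be widened
// (sketch):
//   for (i = 0; i < n; i++) {
//     sum += a[2 * i];       // member 0
//     sum += a[2 * i + 1];   // member 1
//   }
// With a fixed VF of 4 this conceptually becomes one wide load of 8 elements
// followed by a de-interleaving shuffle into the even and odd lanes, provided
// the checks above (no irregular types, legal masking if required, etc.)
// pass.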
3015
3016 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
3017 Instruction *I, ElementCount VF) {
3018 // Get and ensure we have a valid memory instruction.
3019 assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");
3020
3022 auto *ScalarTy = getLoadStoreType(I);
3023
3024 // In order to be widened, the pointer should be consecutive, first of all.
3025 if (!Legal->isConsecutivePtr(ScalarTy, Ptr))
3026 return false;
3027
3028 // If the instruction is a store located in a predicated block, it will be
3029 // scalarized.
3030 if (isScalarWithPredication(I, VF))
3031 return false;
3032
3033 // If the instruction's allocated size doesn't equal its type size, it
3034 // requires padding and will be scalarized.
3035 auto &DL = I->getDataLayout();
3036 if (hasIrregularType(ScalarTy, DL))
3037 return false;
3038
3039 return true;
3040}
3041
3042void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
3043 // We should not collect Uniforms more than once per VF. Right now,
3044 // this function is called from collectUniformsAndScalars(), which
3045 // already does this check. Collecting Uniforms for VF=1 does not make any
3046 // sense.
3047
3048 assert(VF.isVector() && !Uniforms.contains(VF) &&
3049 "This function should not be visited twice for the same VF");
3050
3051 // Initialize the entry for this VF. Even if we find no uniform values, we
3052 // won't analyze it again; Uniforms.count(VF) will return 1.
3053 Uniforms[VF].clear();
3054
3055 // Now we know that the loop is vectorizable!
3056 // Collect instructions inside the loop that will remain uniform after
3057 // vectorization.
3058
3059 // Global values, params and instructions outside of current loop are out of
3060 // scope.
3061 auto IsOutOfScope = [&](Value *V) -> bool {
3062 Instruction *I = dyn_cast<Instruction>(V);
3063 return (!I || !TheLoop->contains(I));
3064 };
3065
3066 // Worklist containing uniform instructions demanding lane 0.
3067 SetVector<Instruction *> Worklist;
3068
3069 // Add uniform instructions demanding lane 0 to the worklist. Instructions
3070 // that require predication must not be considered uniform after
3071 // vectorization, because that would create an erroneous replicating region
3072 // where only a single instance out of VF should be formed.
3073 auto AddToWorklistIfAllowed = [&](Instruction *I) -> void {
3074 if (IsOutOfScope(I)) {
3075 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
3076 << *I << "\n");
3077 return;
3078 }
3079 if (isPredicatedInst(I)) {
3080 LLVM_DEBUG(
3081 dbgs() << "LV: Found not uniform due to requiring predication: " << *I
3082 << "\n");
3083 return;
3084 }
3085 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
3086 Worklist.insert(I);
3087 };
3088
3089 // Start with the conditional branches exiting the loop. If the branch
3090 // condition is an instruction contained in the loop that is only used by the
3091 // branch, it is uniform. Note conditions from uncountable early exits are not
3092 // uniform.
3093 SmallVector<BasicBlock *, 8> Exiting;
3094 TheLoop->getExitingBlocks(Exiting);
3095 for (BasicBlock *E : Exiting) {
3096 if (Legal->hasUncountableEarlyExit() && TheLoop->getLoopLatch() != E)
3097 continue;
3098 auto *Cmp = dyn_cast<Instruction>(E->getTerminator()->getOperand(0));
3099 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
3100 AddToWorklistIfAllowed(Cmp);
3101 }
3102
3103 auto PrevVF = VF.divideCoefficientBy(2);
3104 // Return true if all lanes perform the same memory operation, and we can
3105 // thus choose to execute only one.
3106 auto IsUniformMemOpUse = [&](Instruction *I) {
3107 // If the value was already known to not be uniform for the previous
3108 // (smaller VF), it cannot be uniform for the larger VF.
3109 if (PrevVF.isVector()) {
3110 auto Iter = Uniforms.find(PrevVF);
3111 if (Iter != Uniforms.end() && !Iter->second.contains(I))
3112 return false;
3113 }
3114 if (!Legal->isUniformMemOp(*I, VF))
3115 return false;
3116 if (isa<LoadInst>(I))
3117 // Loading the same address always produces the same result - at least
3118 // assuming aliasing and ordering which have already been checked.
3119 return true;
3120 // Storing the same value on every iteration.
3121 return TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand());
3122 };
3123
3124 auto IsUniformDecision = [&](Instruction *I, ElementCount VF) {
3125 InstWidening WideningDecision = getWideningDecision(I, VF);
3126 assert(WideningDecision != CM_Unknown &&
3127 "Widening decision should be ready at this moment");
3128
3129 if (IsUniformMemOpUse(I))
3130 return true;
3131
3132 return (WideningDecision == CM_Widen ||
3133 WideningDecision == CM_Widen_Reverse ||
3134 WideningDecision == CM_Interleave);
3135 };
3136
3137 // Returns true if Ptr is the pointer operand of a memory access instruction
3138 // I, I is known to not require scalarization, and the pointer is not also
3139 // stored.
3140 auto IsVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
3141 if (isa<StoreInst>(I) && I->getOperand(0) == Ptr)
3142 return false;
3143 return getLoadStorePointerOperand(I) == Ptr &&
3144 (IsUniformDecision(I, VF) || Legal->isInvariant(Ptr));
3145 };
3146
3147 // Holds a list of values which are known to have at least one uniform use.
3148 // Note that there may be other uses which aren't uniform. A "uniform use"
3149 // here is something which only demands lane 0 of the unrolled iterations;
3150 // it does not imply that all lanes produce the same value (i.e. this is not
3151 // the usual meaning of uniform).
3152 SetVector<Value *> HasUniformUse;
3153
3154 // Scan the loop for instructions which are either a) known to have only
3155 // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
3156 for (auto *BB : TheLoop->blocks())
3157 for (auto &I : *BB) {
3158 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) {
3159 switch (II->getIntrinsicID()) {
3160 case Intrinsic::sideeffect:
3161 case Intrinsic::experimental_noalias_scope_decl:
3162 case Intrinsic::assume:
3163 case Intrinsic::lifetime_start:
3164 case Intrinsic::lifetime_end:
3165 if (TheLoop->hasLoopInvariantOperands(&I))
3166 AddToWorklistIfAllowed(&I);
3167 break;
3168 default:
3169 break;
3170 }
3171 }
3172
3173 if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) {
3174 if (IsOutOfScope(EVI->getAggregateOperand())) {
3175 AddToWorklistIfAllowed(EVI);
3176 continue;
3177 }
3178 // Only ExtractValue instructions where the aggregate value comes from a
3179 // call are allowed to be non-uniform.
3180 assert(isa<CallInst>(EVI->getAggregateOperand()) &&
3181 "Expected aggregate value to be call return value");
3182 }
3183
3184 // If there's no pointer operand, there's nothing to do.
3185 auto *Ptr = getLoadStorePointerOperand(&I);
3186 if (!Ptr)
3187 continue;
3188
3189 // If the pointer can be proven to be uniform, always add it to the
3190 // worklist.
3191 if (isa<Instruction>(Ptr) && Legal->isUniform(Ptr, VF))
3192 AddToWorklistIfAllowed(cast<Instruction>(Ptr));
3193
3194 if (IsUniformMemOpUse(&I))
3195 AddToWorklistIfAllowed(&I);
3196
3197 if (IsVectorizedMemAccessUse(&I, Ptr))
3198 HasUniformUse.insert(Ptr);
3199 }
3200
3201 // Add to the worklist any operands which have *only* uniform (e.g. lane 0
3202 // demanding) users. Since loops are assumed to be in LCSSA form, this
3203 // disallows uses outside the loop as well.
3204 for (auto *V : HasUniformUse) {
3205 if (IsOutOfScope(V))
3206 continue;
3207 auto *I = cast<Instruction>(V);
3208 bool UsersAreMemAccesses = all_of(I->users(), [&](User *U) -> bool {
3209 auto *UI = cast<Instruction>(U);
3210 return TheLoop->contains(UI) && IsVectorizedMemAccessUse(UI, V);
3211 });
3212 if (UsersAreMemAccesses)
3213 AddToWorklistIfAllowed(I);
3214 }
3215
3216 // Expand Worklist in topological order: whenever a new instruction
3217 // is added, its users should already be inside the Worklist. This ensures
3218 // a uniform instruction will only be used by uniform instructions.
3219 unsigned Idx = 0;
3220 while (Idx != Worklist.size()) {
3221 Instruction *I = Worklist[Idx++];
3222
3223 for (auto *OV : I->operand_values()) {
3224 // Out-of-scope operands cannot be uniform instructions.
3225 if (IsOutOfScope(OV))
3226 continue;
3227 // First-order recurrence phis should typically be considered
3228 // non-uniform.
3229 auto *OP = dyn_cast<PHINode>(OV);
3230 if (OP && Legal->isFixedOrderRecurrence(OP))
3231 continue;
3232 // If all the users of the operand are uniform, then add the
3233 // operand into the uniform worklist.
3234 auto *OI = cast<Instruction>(OV);
3235 if (llvm::all_of(OI->users(), [&](User *U) -> bool {
3236 auto *J = cast<Instruction>(U);
3237 return Worklist.count(J) || IsVectorizedMemAccessUse(J, OI);
3238 }))
3239 AddToWorklistIfAllowed(OI);
3240 }
3241 }
3242
3243 // For an instruction to be added into Worklist above, all its users inside
3244 // the loop should also be in Worklist. However, this condition cannot be
3245 // true for phi nodes that form a cyclic dependence. We must process phi
3246 // nodes separately. An induction variable will remain uniform if all users
3247 // of the induction variable and induction variable update remain uniform.
3248 // The code below handles both pointer and non-pointer induction variables.
3249 BasicBlock *Latch = TheLoop->getLoopLatch();
3250 for (const auto &Induction : Legal->getInductionVars()) {
3251 auto *Ind = Induction.first;
3252 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
3253
3254 // Determine if all users of the induction variable are uniform after
3255 // vectorization.
3256 bool UniformInd = all_of(Ind->users(), [&](User *U) -> bool {
3257 auto *I = cast<Instruction>(U);
3258 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
3259 IsVectorizedMemAccessUse(I, Ind);
3260 });
3261 if (!UniformInd)
3262 continue;
3263
3264 // Determine if all users of the induction variable update instruction are
3265 // uniform after vectorization.
3266 bool UniformIndUpdate = all_of(IndUpdate->users(), [&](User *U) -> bool {
3267 auto *I = cast<Instruction>(U);
3268 return I == Ind || Worklist.count(I) ||
3269 IsVectorizedMemAccessUse(I, IndUpdate);
3270 });
3271 if (!UniformIndUpdate)
3272 continue;
3273
3274 // The induction variable and its update instruction will remain uniform.
3275 AddToWorklistIfAllowed(Ind);
3276 AddToWorklistIfAllowed(IndUpdate);
3277 }
3278
3279 Uniforms[VF].insert_range(Worklist);
3280}
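// Editor's note (hypothetical example, not from the original source): in a
// loop such as
//   for (int I = 0; I < N; ++I)
//     Sum += *Ptr;   // Ptr is loop-invariant
// the load reads the same address on every iteration, so it is a uniform
// memory op: only lane 0 of it is demanded after vectorization, and it (and
// any in-loop address computation used only by such accesses) ends up in
// Uniforms[VF].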
3281
3282bool LoopVectorizationCostModel::runtimeChecksRequired() {
3283 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
3284
3285 if (Legal->getRuntimePointerChecking()->Need) {
3286 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
3287 "runtime pointer checks needed. Enable vectorization of this "
3288 "loop with '#pragma clang loop vectorize(enable)' when "
3289 "compiling with -Os/-Oz",
3290 "CantVersionLoopWithOptForSize", ORE, TheLoop);
3291 return true;
3292 }
3293
3294 if (!PSE.getPredicate().isAlwaysTrue()) {
3295 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
3296 "runtime SCEV checks needed. Enable vectorization of this "
3297 "loop with '#pragma clang loop vectorize(enable)' when "
3298 "compiling with -Os/-Oz",
3299 "CantVersionLoopWithOptForSize", ORE, TheLoop);
3300 return true;
3301 }
3302
3303 // FIXME: Avoid specializing for stride==1 instead of bailing out.
3304 if (!Legal->getLAI()->getSymbolicStrides().empty()) {
3305 reportVectorizationFailure("Runtime stride check for small trip count",
3306 "runtime stride == 1 checks needed. Enable vectorization of "
3307 "this loop without such check by compiling with -Os/-Oz",
3308 "CantVersionLoopWithOptForSize", ORE, TheLoop);
3309 return true;
3310 }
3311
3312 return false;
3313}
3314
3315bool LoopVectorizationCostModel::isScalableVectorizationAllowed() {
3316 if (IsScalableVectorizationAllowed)
3317 return *IsScalableVectorizationAllowed;
3318
3319 IsScalableVectorizationAllowed = false;
3320 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors)
3321 return false;
3322
3323 if (Hints->isScalableVectorizationDisabled()) {
3324 reportVectorizationInfo("Scalable vectorization is explicitly disabled",
3325 "ScalableVectorizationDisabled", ORE, TheLoop);
3326 return false;
3327 }
3328
3329 LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n");
3330
3331 auto MaxScalableVF = ElementCount::getScalable(
3332 std::numeric_limits<ElementCount::ScalarTy>::max());
3333
3334 // Test that the loop-vectorizer can legalize all operations for this MaxVF.
3335 // FIXME: While for scalable vectors this is currently sufficient, this should
3336 // be replaced by a more detailed mechanism that filters out specific VFs,
3337 // instead of invalidating vectorization for a whole set of VFs based on the
3338 // MaxVF.
3339
3340 // Disable scalable vectorization if the loop contains unsupported reductions.
3341 if (!canVectorizeReductions(MaxScalableVF)) {
3342 reportVectorizationInfo(
3343 "Scalable vectorization not supported for the reduction "
3344 "operations found in this loop.",
3345 "ScalableVFUnfeasible", ORE, TheLoop);
3346 return false;
3347 }
3348
3349 // Disable scalable vectorization if the loop contains any instructions
3350 // with element types not supported for scalable vectors.
3351 if (any_of(ElementTypesInLoop, [&](Type *Ty) {
3352 return !Ty->isVoidTy() &&
3353 !TTI.isElementTypeLegalForScalableVector(Ty);
3354 })) {
3355 reportVectorizationInfo("Scalable vectorization is not supported "
3356 "for all element types found in this loop.",
3357 "ScalableVFUnfeasible", ORE, TheLoop);
3358 return false;
3359 }
3360
3361 if (!Legal->isSafeForAnyVectorWidth() && !getMaxVScale(*TheFunction, TTI)) {
3362 reportVectorizationInfo("The target does not provide maximum vscale value "
3363 "for safe distance analysis.",
3364 "ScalableVFUnfeasible", ORE, TheLoop);
3365 return false;
3366 }
3367
3368 IsScalableVectorizationAllowed = true;
3369 return true;
3370}
3371
3372ElementCount
3373LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
3374 if (!isScalableVectorizationAllowed())
3375 return ElementCount::getScalable(0);
3376
3377 auto MaxScalableVF = ElementCount::getScalable(
3378 std::numeric_limits<ElementCount::ScalarTy>::max());
3379 if (Legal->isSafeForAnyVectorWidth())
3380 return MaxScalableVF;
3381
3382 std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI);
3383 // Limit MaxScalableVF by the maximum safe dependence distance.
3384 MaxScalableVF = ElementCount::getScalable(MaxSafeElements / *MaxVScale);
3385
3386 if (!MaxScalableVF)
3387 reportVectorizationInfo(
3388 "Max legal vector width too small, scalable vectorization "
3389 "unfeasible.",
3390 "ScalableVFUnfeasible", ORE, TheLoop);
3391
3392 return MaxScalableVF;
3393}
3394
3395FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
3396 unsigned MaxTripCount, ElementCount UserVF, bool FoldTailByMasking) {
3397 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
3398 unsigned SmallestType, WidestType;
3399 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
3400
3401 // Get the maximum safe dependence distance in bits computed by LAA.
3402 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
3403 // the memory access that is most restrictive (involved in the smallest
3404 // dependence distance).
3405 unsigned MaxSafeElementsPowerOf2 =
3406 bit_floor(Legal->getMaxSafeVectorWidthInBits() / WidestType);
3407 if (!Legal->isSafeForAnyStoreLoadForwardDistances()) {
3408 unsigned SLDist = Legal->getMaxStoreLoadForwardSafeDistanceInBits();
3409 MaxSafeElementsPowerOf2 =
3410 std::min(MaxSafeElementsPowerOf2, SLDist / WidestType);
3411 }
3412 auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElementsPowerOf2);
3413 auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElementsPowerOf2);
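// Editor's note (hypothetical worked example, not from the original source):
// if LAA reports a maximum safe dependence width of 384 bits and the widest
// loaded/stored type is 32 bits wide, then 384 / 32 = 12 and bit_floor(12) = 8,
// so MaxSafeFixedVF becomes 8 elements; the scalable counterpart is derived
// from the same element bound via getMaxLegalScalableVF above.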
3414
3415 if (!Legal->isSafeForAnyVectorWidth())
3416 this->MaxSafeElements = MaxSafeElementsPowerOf2;
3417
3418 LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
3419 << ".\n");
3420 LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF
3421 << ".\n");
3422
3423 // First analyze the UserVF, fall back if the UserVF should be ignored.
3424 if (UserVF) {
3425 auto MaxSafeUserVF =
3426 UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;
3427
3428 if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) {
3429 // If `VF=vscale x N` is safe, then so is `VF=N`
3430 if (UserVF.isScalable())
3431 return FixedScalableVFPair(
3432 ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF);
3433
3434 return UserVF;
3435 }
3436
3437 assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));
3438
3439 // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it
3440 // is better to ignore the hint and let the compiler choose a suitable VF.
3441 if (!UserVF.isScalable()) {
3442 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
3443 << " is unsafe, clamping to max safe VF="
3444 << MaxSafeFixedVF << ".\n");
3445 ORE->emit([&]() {
3446 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
3447 TheLoop->getStartLoc(),
3448 TheLoop->getHeader())
3449 << "User-specified vectorization factor "
3450 << ore::NV("UserVectorizationFactor", UserVF)
3451 << " is unsafe, clamping to maximum safe vectorization factor "
3452 << ore::NV("VectorizationFactor", MaxSafeFixedVF);
3453 });
3454 return MaxSafeFixedVF;
3455 }
3456
3457 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) {
3458 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
3459 << " is ignored because scalable vectors are not "
3460 "available.\n");
3461 ORE->emit([&]() {
3462 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
3463 TheLoop->getStartLoc(),
3464 TheLoop->getHeader())
3465 << "User-specified vectorization factor "
3466 << ore::NV("UserVectorizationFactor", UserVF)
3467 << " is ignored because the target does not support scalable "
3468 "vectors. The compiler will pick a more suitable value.";
3469 });
3470 } else {
3471 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
3472 << " is unsafe. Ignoring scalable UserVF.\n");
3473 ORE->emit([&]() {
3474 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
3475 TheLoop->getStartLoc(),
3476 TheLoop->getHeader())
3477 << "User-specified vectorization factor "
3478 << ore::NV("UserVectorizationFactor", UserVF)
3479 << " is unsafe. Ignoring the hint to let the compiler pick a "
3480 "more suitable value.";
3481 });
3482 }
3483 }
3484
3485 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
3486 << " / " << WidestType << " bits.\n");
3487
3488 FixedScalableVFPair Result(ElementCount::getFixed(1),
3489 ElementCount::getScalable(1));
3490 if (auto MaxVF =
3491 getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
3492 MaxSafeFixedVF, FoldTailByMasking))
3493 Result.FixedVF = MaxVF;
3494
3495 if (auto MaxVF =
3496 getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
3497 MaxSafeScalableVF, FoldTailByMasking))
3498 if (MaxVF.isScalable()) {
3499 Result.ScalableVF = MaxVF;
3500 LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
3501 << "\n");
3502 }
3503
3504 return Result;
3505}
3506
3507FixedScalableVFPair
3508LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
3509 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
3510 // TODO: It may be useful to do this, since the check is still likely to be
3511 // dynamically uniform if the target can skip it.
3512 reportVectorizationFailure(
3513 "Not inserting runtime ptr check for divergent target",
3514 "runtime pointer checks needed. Not enabled for divergent target",
3515 "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
3516 return FixedScalableVFPair::getNone();
3517 }
3518
3519 ScalarEvolution *SE = PSE.getSE();
3520 ElementCount TC = getSmallConstantTripCount(SE, TheLoop);
3521 unsigned MaxTC = PSE.getSmallConstantMaxTripCount();
3522 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
3523 if (TC != ElementCount::getFixed(MaxTC))
3524 LLVM_DEBUG(dbgs() << "LV: Found maximum trip count: " << MaxTC << '\n');
3525 if (TC.isScalar()) {
3526 reportVectorizationFailure("Single iteration (non) loop",
3527 "loop trip count is one, irrelevant for vectorization",
3528 "SingleIterationLoop", ORE, TheLoop);
3529 return FixedScalableVFPair::getNone();
3530 }
3531
3532 // If BTC matches the widest induction type and is -1 then the trip count
3533 // computation will wrap to 0 and the vector trip count will be 0. Do not try
3534 // to vectorize.
3535 const SCEV *BTC = SE->getBackedgeTakenCount(TheLoop);
3536 if (!isa<SCEVCouldNotCompute>(BTC) &&
3537 BTC->getType()->getScalarSizeInBits() >=
3538 Legal->getWidestInductionType()->getScalarSizeInBits() &&
3539 SE->isKnownPredicate(CmpInst::ICMP_EQ, BTC,
3540 SE->getMinusOne(BTC->getType()))) {
3541 reportVectorizationFailure(
3542 "Trip count computation wrapped",
3543 "backedge-taken count is -1, loop trip count wrapped to 0",
3544 "TripCountWrapped", ORE, TheLoop);
3545 return FixedScalableVFPair::getNone();
3546 }
3547
3548 switch (ScalarEpilogueStatus) {
3549 case CM_ScalarEpilogueAllowed:
3550 return computeFeasibleMaxVF(MaxTC, UserVF, false);
3551 case CM_ScalarEpilogueNotAllowedUsePredicate:
3552 [[fallthrough]];
3553 case CM_ScalarEpilogueNotNeededUsePredicate:
3554 LLVM_DEBUG(
3555 dbgs() << "LV: vector predicate hint/switch found.\n"
3556 << "LV: Not allowing scalar epilogue, creating predicated "
3557 << "vector loop.\n");
3558 break;
3559 case CM_ScalarEpilogueNotAllowedLowTripLoop:
3560 // fallthrough as a special case of OptForSize
3561 case CM_ScalarEpilogueNotAllowedOptSize:
3562 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
3563 LLVM_DEBUG(
3564 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
3565 else
3566 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
3567 << "count.\n");
3568
3569 // Bail if runtime checks are required, which are not good when optimizing
3570 // for size.
3571 if (runtimeChecksRequired())
3572 return FixedScalableVFPair::getNone();
3573
3574 break;
3575 }
3576
3577 // Now try the tail folding
3578
3579 // Invalidate interleave groups that require an epilogue if we can't mask
3580 // the interleave-group.
3581 if (!useMaskedInterleavedAccesses(TTI)) {
3582 assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
3583 "No decisions should have been taken at this point");
3584 // Note: There is no need to invalidate any cost modeling decisions here, as
3585 // none were taken so far.
3586 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
3587 }
3588
3589 FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(MaxTC, UserVF, true);
3590
3591 // Avoid tail folding if the trip count is known to be a multiple of any VF
3592 // we choose.
3593 std::optional<unsigned> MaxPowerOf2RuntimeVF =
3594 MaxFactors.FixedVF.getFixedValue();
3595 if (MaxFactors.ScalableVF) {
3596 std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI);
3597 if (MaxVScale && TTI.isVScaleKnownToBeAPowerOfTwo()) {
3598 MaxPowerOf2RuntimeVF = std::max<unsigned>(
3599 *MaxPowerOf2RuntimeVF,
3600 *MaxVScale * MaxFactors.ScalableVF.getKnownMinValue());
3601 } else
3602 MaxPowerOf2RuntimeVF = std::nullopt; // Stick with tail-folding for now.
3603 }
3604
3605 auto NoScalarEpilogueNeeded = [this, &UserIC](unsigned MaxVF) {
3606 // Return false if the loop is neither a single-latch-exit loop nor an
3607 // early-exit loop as tail-folding is not supported in that case.
3608 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch() &&
3609 !Legal->hasUncountableEarlyExit())
3610 return false;
3611 unsigned MaxVFtimesIC = UserIC ? MaxVF * UserIC : MaxVF;
3612 ScalarEvolution *SE = PSE.getSE();
3613 // Calling getSymbolicMaxBackedgeTakenCount enables support for loops
3614 // with uncountable exits. For countable loops, the symbolic maximum must
3615 // remain identical to the known back-edge taken count.
3616 const SCEV *BackedgeTakenCount = PSE.getSymbolicMaxBackedgeTakenCount();
3617 assert((Legal->hasUncountableEarlyExit() ||
3618 BackedgeTakenCount == PSE.getBackedgeTakenCount()) &&
3619 "Invalid loop count");
3620 const SCEV *ExitCount = SE->getAddExpr(
3621 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
3622 const SCEV *Rem = SE->getURemExpr(
3623 SE->applyLoopGuards(ExitCount, TheLoop),
3624 SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
3625 return Rem->isZero();
3626 };
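// Editor's note (hypothetical worked example, not from the original source):
// with a known trip count of 1024, MaxVF = 8 and a user interleave count of 2,
// MaxVFtimesIC = 16 and (BTC + 1) urem 16 == 0, so NoScalarEpilogueNeeded
// returns true and tail folding can be skipped for that VF.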
3627
3628 if (MaxPowerOf2RuntimeVF > 0u) {
3629 assert((UserVF.isNonZero() || isPowerOf2_32(*MaxPowerOf2RuntimeVF)) &&
3630 "MaxFixedVF must be a power of 2");
3631 if (NoScalarEpilogueNeeded(*MaxPowerOf2RuntimeVF)) {
3632 // Accept MaxFixedVF if we do not have a tail.
3633 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
3634 return MaxFactors;
3635 }
3636 }
3637
3638 auto ExpectedTC = getSmallBestKnownTC(PSE, TheLoop);
3639 if (ExpectedTC && ExpectedTC->isFixed() &&
3640 ExpectedTC->getFixedValue() <=
3641 TTI.getMinTripCountTailFoldingThreshold()) {
3642 if (MaxPowerOf2RuntimeVF > 0u) {
3643 // If we have a low-trip-count, and the fixed-width VF is known to divide
3644 // the trip count but the scalable factor does not, use the fixed-width
3645 // factor in preference to allow the generation of a non-predicated loop.
3646 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedLowTripLoop &&
3647 NoScalarEpilogueNeeded(MaxFactors.FixedVF.getFixedValue())) {
3648 LLVM_DEBUG(dbgs() << "LV: Picking a fixed-width so that no tail will "
3649 "remain for any chosen VF.\n");
3650 MaxFactors.ScalableVF = ElementCount::getScalable(0);
3651 return MaxFactors;
3652 }
3653 }
3654
3655 reportVectorizationFailure(
3656 "The trip count is below the minimal threshold value.",
3657 "loop trip count is too low, avoiding vectorization", "LowTripCount",
3658 ORE, TheLoop);
3659 return FixedScalableVFPair::getNone();
3660 }
3661
3662 // If we don't know the precise trip count, or if the trip count that we
3663 // found modulo the vectorization factor is not zero, try to fold the tail
3664 // by masking.
3665 // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
3666 bool ContainsScalableVF = MaxFactors.ScalableVF.isNonZero();
3667 setTailFoldingStyles(ContainsScalableVF, UserIC);
3668 if (foldTailByMasking()) {
3669 if (foldTailWithEVL()) {
3670 LLVM_DEBUG(
3671 dbgs()
3672 << "LV: tail is folded with EVL, forcing unroll factor to be 1. Will "
3673 "try to generate VP Intrinsics with scalable vector "
3674 "factors only.\n");
3675 // Tail folded loop using VP intrinsics restricts the VF to be scalable
3676 // for now.
3677 // TODO: extend it for fixed vectors, if required.
3678 assert(ContainsScalableVF && "Expected scalable vector factor.");
3679
3680 MaxFactors.FixedVF = ElementCount::getFixed(1);
3681 }
3682 return MaxFactors;
3683 }
3684
3685 // If there was a tail-folding hint/switch, but we can't fold the tail by
3686 // masking, fallback to a vectorization with a scalar epilogue.
3687 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
3688 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
3689 "scalar epilogue instead.\n");
3690 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
3691 return MaxFactors;
3692 }
3693
3694 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
3695 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
3696 return FixedScalableVFPair::getNone();
3697 }
3698
3699 if (TC.isZero()) {
3700 reportVectorizationFailure(
3701 "unable to calculate the loop count due to complex control flow",
3702 "UnknownLoopCountComplexCFG", ORE, TheLoop);
3703 return FixedScalableVFPair::getNone();
3704 }
3705
3706 reportVectorizationFailure(
3707 "Cannot optimize for size and vectorize at the same time.",
3708 "cannot optimize for size and vectorize at the same time. "
3709 "Enable vectorization of this loop with '#pragma clang loop "
3710 "vectorize(enable)' when compiling with -Os/-Oz",
3711 "NoTailLoopWithOptForSize", ORE, TheLoop);
3712 return FixedScalableVFPair::getNone();
3713}
3714
3715bool LoopVectorizationCostModel::shouldConsiderRegPressureForVF(
3716 ElementCount VF) {
3717 if (ConsiderRegPressure.getNumOccurrences())
3718 return ConsiderRegPressure;
3719
3720 // TODO: We should eventually consider register pressure for all targets. The
3721 // TTI hook is temporary whilst target-specific issues are being fixed.
3722 if (TTI.shouldConsiderVectorizationRegPressure())
3723 return true;
3724
3725 if (!useMaxBandwidth(VF.isScalable()
3726 ? TargetTransformInfo::RGK_ScalableVector
3727 : TargetTransformInfo::RGK_FixedWidthVector))
3728 return false;
3729 // Only calculate register pressure for VFs enabled by MaxBandwidth.
3730 return ElementCount::isKnownGT(
3731 VF, VF.isScalable() ? MaxPermissibleVFWithoutMaxBW.ScalableVF
3732 : MaxPermissibleVFWithoutMaxBW.FixedVF);
3733}
3734
3735bool LoopVectorizationCostModel::useMaxBandwidth(
3736 TargetTransformInfo::RegisterKind RegKind) {
3737 return MaximizeBandwidth || (MaximizeBandwidth.getNumOccurrences() == 0 &&
3738 (TTI.shouldMaximizeVectorBandwidth(RegKind) ||
3739 (UseWiderVFIfCallVariantsPresent &&
3740 Legal->hasVectorCallVariants())));
3741}
3742
3743ElementCount LoopVectorizationCostModel::clampVFByMaxTripCount(
3744 ElementCount VF, unsigned MaxTripCount, bool FoldTailByMasking) const {
3745 unsigned EstimatedVF = VF.getKnownMinValue();
3746 if (VF.isScalable() && TheFunction->hasFnAttribute(Attribute::VScaleRange)) {
3747 auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange);
3748 auto Min = Attr.getVScaleRangeMin();
3749 EstimatedVF *= Min;
3750 }
3751
3752 // When a scalar epilogue is required, at least one iteration of the scalar
3753 // loop has to execute. Adjust MaxTripCount accordingly to avoid picking a
3754 // max VF that results in a dead vector loop.
3755 if (MaxTripCount > 0 && requiresScalarEpilogue(true))
3756 MaxTripCount -= 1;
3757
3758 if (MaxTripCount && MaxTripCount <= EstimatedVF &&
3759 (!FoldTailByMasking || isPowerOf2_32(MaxTripCount))) {
3760 // If the upper bound on the loop trip count (TC) is known at compile time,
3761 // there is no point in choosing a VF greater than TC. Select the
3762 // maximum power of two which doesn't exceed TC. If VF is
3763 // scalable, we only fall back on a fixed VF when the TC is less than or
3764 // equal to the known number of lanes.
3765 auto ClampedUpperTripCount = llvm::bit_floor(MaxTripCount);
3766 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
3767 "exceeding the constant trip count: "
3768 << ClampedUpperTripCount << "\n");
3769 return ElementCount::get(ClampedUpperTripCount,
3770 FoldTailByMasking ? VF.isScalable() : false);
3771 }
3772 return VF;
3773}
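// Editor's note (hypothetical worked example, not from the original source):
// if the maximum trip count is 20 and the estimated VF is 32 lanes (and the
// tail is not folded by masking), the trip count is the binding constraint and
// the VF is clamped to bit_floor(20) = 16; when folding the tail by masking,
// the clamp is only applied for power-of-two trip counts.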
3774
3775ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
3776 unsigned MaxTripCount, unsigned SmallestType, unsigned WidestType,
3777 ElementCount MaxSafeVF, bool FoldTailByMasking) {
3778 bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
3779 const TypeSize WidestRegister = TTI.getRegisterBitWidth(
3780 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
3781 : TargetTransformInfo::RGK_FixedWidthVector);
3782
3783 // Convenience function to return the minimum of two ElementCounts.
3784 auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
3785 assert((LHS.isScalable() == RHS.isScalable()) &&
3786 "Scalable flags must match");
3787 return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS;
3788 };
3789
3790 // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
3791 // Note that both WidestRegister and WidestType may not be powers of 2.
3792 auto MaxVectorElementCount = ElementCount::get(
3793 llvm::bit_floor(WidestRegister.getKnownMinValue() / WidestType),
3794 ComputeScalableMaxVF);
3795 MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
3796 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
3797 << (MaxVectorElementCount * WidestType) << " bits.\n");
3798
3799 if (!MaxVectorElementCount) {
3800 LLVM_DEBUG(dbgs() << "LV: The target has no "
3801 << (ComputeScalableMaxVF ? "scalable" : "fixed")
3802 << " vector registers.\n");
3803 return ElementCount::getFixed(1);
3804 }
3805
3806 ElementCount MaxVF = clampVFByMaxTripCount(MaxVectorElementCount,
3807 MaxTripCount, FoldTailByMasking);
3808 // If the MaxVF was already clamped, there's no point in trying to pick a
3809 // larger one.
3810 if (MaxVF != MaxVectorElementCount)
3811 return MaxVF;
3812
3813 TargetTransformInfo::RegisterKind RegKind =
3814 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
3815 : TargetTransformInfo::RGK_FixedWidthVector;
3816
3817 if (MaxVF.isScalable())
3818 MaxPermissibleVFWithoutMaxBW.ScalableVF = MaxVF;
3819 else
3820 MaxPermissibleVFWithoutMaxBW.FixedVF = MaxVF;
3821
3822 if (useMaxBandwidth(RegKind)) {
3823 auto MaxVectorElementCountMaxBW = ElementCount::get(
3824 llvm::bit_floor(WidestRegister.getKnownMinValue() / SmallestType),
3825 ComputeScalableMaxVF);
3826 MaxVF = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);
3827
3828 if (ElementCount MinVF =
3829 TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) {
3830 if (ElementCount::isKnownLT(MaxVF, MinVF)) {
3831 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
3832 << ") with target's minimum: " << MinVF << '\n');
3833 MaxVF = MinVF;
3834 }
3835 }
3836
3837 MaxVF = clampVFByMaxTripCount(MaxVF, MaxTripCount, FoldTailByMasking);
3838
3839 if (MaxVectorElementCount != MaxVF) {
3840 // Invalidate any widening decisions we might have made, in case the loop
3841 // requires prediction (decided later), but we have already made some
3842 // load/store widening decisions.
3843 invalidateCostModelingDecisions();
3844 }
3845 }
3846 return MaxVF;
3847}
3848
3849bool LoopVectorizationPlanner::isMoreProfitable(const VectorizationFactor &A,
3850 const VectorizationFactor &B,
3851 const unsigned MaxTripCount,
3852 bool HasTail,
3853 bool IsEpilogue) const {
3854 InstructionCost CostA = A.Cost;
3855 InstructionCost CostB = B.Cost;
3856
3857 // Improve estimate for the vector width if it is scalable.
3858 unsigned EstimatedWidthA = A.Width.getKnownMinValue();
3859 unsigned EstimatedWidthB = B.Width.getKnownMinValue();
3860 if (std::optional<unsigned> VScale = CM.getVScaleForTuning()) {
3861 if (A.Width.isScalable())
3862 EstimatedWidthA *= *VScale;
3863 if (B.Width.isScalable())
3864 EstimatedWidthB *= *VScale;
3865 }
3866
3867 // When optimizing for size, choose whichever is smallest, which will be the
3868 // one with the smallest cost for the whole loop. On a tie pick the larger
3869 // vector width, on the assumption that throughput will be greater.
3870 if (CM.CostKind == TTI::TCK_CodeSize)
3871 return CostA < CostB ||
3872 (CostA == CostB && EstimatedWidthA > EstimatedWidthB);
3873
3874 // Assume vscale may be larger than 1 (or the value being tuned for),
3875 // so that scalable vectorization is slightly favorable over fixed-width
3876 // vectorization.
3877 bool PreferScalable = !TTI.preferFixedOverScalableIfEqualCost(IsEpilogue) &&
3878 A.Width.isScalable() && !B.Width.isScalable();
3879
3880 auto CmpFn = [PreferScalable](const InstructionCost &LHS,
3881 const InstructionCost &RHS) {
3882 return PreferScalable ? LHS <= RHS : LHS < RHS;
3883 };
3884
3885 // To avoid the need for FP division:
3886 // (CostA / EstimatedWidthA) < (CostB / EstimatedWidthB)
3887 // <=> (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA)
3888 if (!MaxTripCount)
3889 return CmpFn(CostA * EstimatedWidthB, CostB * EstimatedWidthA);
3890
3891 auto GetCostForTC = [MaxTripCount, HasTail](unsigned VF,
3892 InstructionCost VectorCost,
3893 InstructionCost ScalarCost) {
3894 // If the trip count is a known (possibly small) constant, the trip count
3895 // will be rounded up to an integer number of iterations under
3896 // FoldTailByMasking. The total cost in that case will be
3897 // VecCost*ceil(TripCount/VF). When not folding the tail, the total
3898 // cost will be VecCost*floor(TC/VF) + ScalarCost*(TC%VF). There will be
3899 // some extra overheads, but for the purpose of comparing the costs of
3900 // different VFs we can use this to compare the total loop-body cost
3901 // expected after vectorization.
3902 if (HasTail)
3903 return VectorCost * (MaxTripCount / VF) +
3904 ScalarCost * (MaxTripCount % VF);
3905 return VectorCost * divideCeil(MaxTripCount, VF);
3906 };
3907
3908 auto RTCostA = GetCostForTC(EstimatedWidthA, CostA, A.ScalarCost);
3909 auto RTCostB = GetCostForTC(EstimatedWidthB, CostB, B.ScalarCost);
3910 return CmpFn(RTCostA, RTCostB);
3911}
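// Editor's note (hypothetical worked example, not from the original source):
// with MaxTripCount = 10, a per-iteration scalar cost of 4 and a scalar tail
// (HasTail), a VF=4 body costing 8 gives 8*(10/4) + 4*(10%4) = 16 + 8 = 24,
// while a VF=8 body costing 14 gives 14*(10/8) + 4*(10%8) = 14 + 8 = 22, so
// the wider factor wins here despite its higher per-iteration vector cost.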
3912
3913bool LoopVectorizationPlanner::isMoreProfitable(const VectorizationFactor &A,
3914 const VectorizationFactor &B,
3915 bool HasTail,
3916 bool IsEpilogue) const {
3917 const unsigned MaxTripCount = PSE.getSmallConstantMaxTripCount();
3918 return LoopVectorizationPlanner::isMoreProfitable(A, B, MaxTripCount, HasTail,
3919 IsEpilogue);
3920}
3921
3922void LoopVectorizationPlanner::emitInvalidCostRemarks(
3923 OptimizationRemarkEmitter *ORE) {
3924 using RecipeVFPair = std::pair<VPRecipeBase *, ElementCount>;
3925 SmallVector<RecipeVFPair> InvalidCosts;
3926 for (const auto &Plan : VPlans) {
3927 for (ElementCount VF : Plan->vectorFactors()) {
3928 // The VPlan-based cost model is designed for computing vector cost.
3929 // Querying the VPlan-based cost model with a scalar VF will cause some
3930 // errors because we expect the VF to be a vector VF for most of the widen
3931 // recipes.
3932 if (VF.isScalar())
3933 continue;
3934
3935 VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind,
3936 *CM.PSE.getSE(), OrigLoop);
3937 precomputeCosts(*Plan, VF, CostCtx);
3938 auto Iter = vp_depth_first_deep(Plan->getVectorLoopRegion()->getEntry());
3939 for (auto *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
3940 for (auto &R : *VPBB) {
3941 if (!R.cost(VF, CostCtx).isValid())
3942 InvalidCosts.emplace_back(&R, VF);
3943 }
3944 }
3945 }
3946 }
3947 if (InvalidCosts.empty())
3948 return;
3949
3950 // Emit a report of VFs with invalid costs in the loop.
3951
3952 // Group the remarks per recipe, keeping the recipe order from InvalidCosts.
3953 DenseMap<VPRecipeBase *, unsigned> Numbering;
3954 unsigned I = 0;
3955 for (auto &Pair : InvalidCosts)
3956 if (Numbering.try_emplace(Pair.first, I).second)
3957 ++I;
3958
3959 // Sort the list, first on recipe(number) then on VF.
3960 sort(InvalidCosts, [&Numbering](RecipeVFPair &A, RecipeVFPair &B) {
3961 unsigned NA = Numbering[A.first];
3962 unsigned NB = Numbering[B.first];
3963 if (NA != NB)
3964 return NA < NB;
3965 return ElementCount::isKnownLT(A.second, B.second);
3966 });
3967
3968 // For a list of ordered recipe-VF pairs:
3969 // [(load, VF1), (load, VF2), (store, VF1)]
3970 // group the recipes together to emit separate remarks for:
3971 // load (VF1, VF2)
3972 // store (VF1)
3973 auto Tail = ArrayRef<RecipeVFPair>(InvalidCosts);
3974 auto Subset = ArrayRef<RecipeVFPair>();
3975 do {
3976 if (Subset.empty())
3977 Subset = Tail.take_front(1);
3978
3979 VPRecipeBase *R = Subset.front().first;
3980
3981 unsigned Opcode =
3984 [](const auto *R) { return Instruction::PHI; })
3985 .Case<VPWidenSelectRecipe>(
3986 [](const auto *R) { return Instruction::Select; })
3987 .Case<VPWidenStoreRecipe>(
3988 [](const auto *R) { return Instruction::Store; })
3989 .Case<VPWidenLoadRecipe>(
3990 [](const auto *R) { return Instruction::Load; })
3991 .Case<VPWidenCallRecipe, VPWidenIntrinsicRecipe>(
3992 [](const auto *R) { return Instruction::Call; })
3995 [](const auto *R) { return R->getOpcode(); })
3996 .Case<VPInterleaveRecipe>([](const VPInterleaveRecipe *R) {
3997 return R->getStoredValues().empty() ? Instruction::Load
3998 : Instruction::Store;
3999 });
4000
4001 // If the next recipe is different, or if there are no other pairs,
4002 // emit a remark for the collated subset. e.g.
4003 // [(load, VF1), (load, VF2))]
4004 // to emit:
4005 // remark: invalid costs for 'load' at VF=(VF1, VF2)
4006 if (Subset == Tail || Tail[Subset.size()].first != R) {
4007 std::string OutString;
4008 raw_string_ostream OS(OutString);
4009 assert(!Subset.empty() && "Unexpected empty range");
4010 OS << "Recipe with invalid costs prevented vectorization at VF=(";
4011 for (const auto &Pair : Subset)
4012 OS << (Pair.second == Subset.front().second ? "" : ", ") << Pair.second;
4013 OS << "):";
4014 if (Opcode == Instruction::Call) {
4015 StringRef Name = "";
4016 if (auto *Int = dyn_cast<VPWidenIntrinsicRecipe>(R)) {
4017 Name = Int->getIntrinsicName();
4018 } else {
4019 auto *WidenCall = dyn_cast<VPWidenCallRecipe>(R);
4020 Function *CalledFn =
4021 WidenCall ? WidenCall->getCalledScalarFunction()
4022 : cast<Function>(R->getOperand(R->getNumOperands() - 1)
4023 ->getLiveInIRValue());
4024 Name = CalledFn->getName();
4025 }
4026 OS << " call to " << Name;
4027 } else
4028 OS << " " << Instruction::getOpcodeName(Opcode);
4029 reportVectorizationInfo(OutString, "InvalidCost", ORE, OrigLoop, nullptr,
4030 R->getDebugLoc());
4031 Tail = Tail.drop_front(Subset.size());
4032 Subset = {};
4033 } else
4034 // Grow the subset by one element
4035 Subset = Tail.take_front(Subset.size() + 1);
4036 } while (!Tail.empty());
4037}
4038
4039/// Check if any recipe of \p Plan will generate a vector value, which will be
4040/// assigned a vector register.
4041static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
4042 const TargetTransformInfo &TTI) {
4043 assert(VF.isVector() && "Checking a scalar VF?");
4044 VPTypeAnalysis TypeInfo(Plan);
4045 DenseSet<VPRecipeBase *> EphemeralRecipes;
4046 collectEphemeralRecipesForVPlan(Plan, EphemeralRecipes);
4047 // Set of already visited types.
4048 DenseSet<Type *> Visited;
4049 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
4050 vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) {
4051 for (VPRecipeBase &R : *VPBB) {
4052 if (EphemeralRecipes.contains(&R))
4053 continue;
4054 // Continue early if the recipe is considered to not produce a vector
4055 // result. Note that this includes VPInstruction where some opcodes may
4056 // produce a vector, to preserve existing behavior as VPInstructions model
4057 // aspects not directly mapped to existing IR instructions.
4058 switch (R.getVPDefID()) {
4059 case VPDef::VPDerivedIVSC:
4060 case VPDef::VPScalarIVStepsSC:
4061 case VPDef::VPReplicateSC:
4062 case VPDef::VPInstructionSC:
4063 case VPDef::VPCanonicalIVPHISC:
4064 case VPDef::VPVectorPointerSC:
4065 case VPDef::VPVectorEndPointerSC:
4066 case VPDef::VPExpandSCEVSC:
4067 case VPDef::VPEVLBasedIVPHISC:
4068 case VPDef::VPPredInstPHISC:
4069 case VPDef::VPBranchOnMaskSC:
4070 continue;
4071 case VPDef::VPReductionSC:
4072 case VPDef::VPActiveLaneMaskPHISC:
4073 case VPDef::VPWidenCallSC:
4074 case VPDef::VPWidenCanonicalIVSC:
4075 case VPDef::VPWidenCastSC:
4076 case VPDef::VPWidenGEPSC:
4077 case VPDef::VPWidenIntrinsicSC:
4078 case VPDef::VPWidenSC:
4079 case VPDef::VPWidenSelectSC:
4080 case VPDef::VPBlendSC:
4081 case VPDef::VPFirstOrderRecurrencePHISC:
4082 case VPDef::VPHistogramSC:
4083 case VPDef::VPWidenPHISC:
4084 case VPDef::VPWidenIntOrFpInductionSC:
4085 case VPDef::VPWidenPointerInductionSC:
4086 case VPDef::VPReductionPHISC:
4087 case VPDef::VPInterleaveEVLSC:
4088 case VPDef::VPInterleaveSC:
4089 case VPDef::VPWidenLoadEVLSC:
4090 case VPDef::VPWidenLoadSC:
4091 case VPDef::VPWidenStoreEVLSC:
4092 case VPDef::VPWidenStoreSC:
4093 break;
4094 default:
4095 llvm_unreachable("unhandled recipe");
4096 }
4097
4098 auto WillGenerateTargetVectors = [&TTI, VF](Type *VectorTy) {
4099 unsigned NumLegalParts = TTI.getNumberOfParts(VectorTy);
4100 if (!NumLegalParts)
4101 return false;
4102 if (VF.isScalable()) {
4103 // <vscale x 1 x iN> is assumed to be profitable over iN because
4104 // scalable registers are a distinct register class from scalar
4105 // ones. If we ever find a target which wants to lower scalable
4106 // vectors back to scalars, we'll need to update this code to
4107 // explicitly ask TTI about the register class uses for each part.
4108 return NumLegalParts <= VF.getKnownMinValue();
4109 }
4110 // Two or more elements sharing a register are considered vectorized.
4111 return NumLegalParts < VF.getFixedValue();
4112 };
4113
4114 // If there is no def and it is not a store (e.g. a branch), continue - no value to check.
4115 if (R.getNumDefinedValues() == 0 &&
4117 continue;
4118 // For multi-def recipes (currently only interleaved loads), it suffices to
4119 // check the first def only.
4120 // For stores, check their stored value; for interleaved stores it suffices
4121 // to check the first stored value only. In all cases this is the second
4122 // operand.
4123 VPValue *ToCheck =
4124 R.getNumDefinedValues() >= 1 ? R.getVPValue(0) : R.getOperand(1);
4125 Type *ScalarTy = TypeInfo.inferScalarType(ToCheck);
4126 if (!Visited.insert({ScalarTy}).second)
4127 continue;
4128 Type *WideTy = toVectorizedTy(ScalarTy, VF);
4129 if (any_of(getContainedTypes(WideTy), WillGenerateTargetVectors))
4130 return true;
4131 }
4132 }
4133
4134 return false;
4135}
4136
4137static bool hasReplicatorRegion(VPlan &Plan) {
4138 return any_of(VPBlockUtils::blocksOnly<VPRegionBlock>(vp_depth_first_deep(
4139 Plan.getVectorLoopRegion()->getEntry())),
4140 [](auto *VPRB) { return VPRB->isReplicator(); });
4141}
4142
4143#ifndef NDEBUG
4144VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
4145 InstructionCost ExpectedCost = CM.expectedCost(ElementCount::getFixed(1));
4146 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
4147 assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
4148 assert(
4149 any_of(VPlans,
4150 [](std::unique_ptr<VPlan> &P) { return P->hasScalarVFOnly(); }) &&
4151 "Expected Scalar VF to be a candidate");
4152
4153 const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost,
4154 ExpectedCost);
4155 VectorizationFactor ChosenFactor = ScalarCost;
4156
4157 bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
4158 if (ForceVectorization &&
4159 (VPlans.size() > 1 || !VPlans[0]->hasScalarVFOnly())) {
4160 // Ignore scalar width, because the user explicitly wants vectorization.
4161 // Initialize cost to max so that VF = 2 is, at least, chosen during cost
4162 // evaluation.
4163 ChosenFactor.Cost = InstructionCost::getMax();
4164 }
4165
4166 for (auto &P : VPlans) {
4167 ArrayRef<ElementCount> VFs(P->vectorFactors().begin(),
4168 P->vectorFactors().end());
4169
4170 SmallVector<VPRegisterUsage, 8> RUs;
4171 if (any_of(VFs, [this](ElementCount VF) {
4172 return CM.shouldConsiderRegPressureForVF(VF);
4173 }))
4174 RUs = calculateRegisterUsageForPlan(*P, VFs, TTI, CM.ValuesToIgnore);
4175
4176 for (unsigned I = 0; I < VFs.size(); I++) {
4177 ElementCount VF = VFs[I];
4178 // The cost for scalar VF=1 is already calculated, so ignore it.
4179 if (VF.isScalar())
4180 continue;
4181
4182 // If the register pressure needs to be considered for VF,
4183 // don't consider the VF as valid if it exceeds the number
4184 // of registers for the target.
4185 if (CM.shouldConsiderRegPressureForVF(VF) &&
4186 RUs[I].exceedsMaxNumRegs(TTI, ForceTargetNumVectorRegs))
4187 continue;
4188
4189 InstructionCost C = CM.expectedCost(VF);
4190
4191 // Add on other costs that are modelled in VPlan, but not in the legacy
4192 // cost model.
4193 VPCostContext CostCtx(CM.TTI, *CM.TLI, *P, CM, CM.CostKind,
4194 *CM.PSE.getSE(), OrigLoop);
4195 VPRegionBlock *VectorRegion = P->getVectorLoopRegion();
4196 assert(VectorRegion && "Expected to have a vector region!");
4197 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
4198 vp_depth_first_shallow(VectorRegion->getEntry()))) {
4199 for (VPRecipeBase &R : *VPBB) {
4200 auto *VPI = dyn_cast<VPInstruction>(&R);
4201 if (!VPI)
4202 continue;
4203 switch (VPI->getOpcode()) {
4204 // Selects are only modelled in the legacy cost model for safe
4205 // divisors.
4206 case Instruction::Select: {
4207 VPValue *VPV = VPI->getVPSingleValue();
4208 if (VPV->getNumUsers() == 1) {
4209 if (auto *WR = dyn_cast<VPWidenRecipe>(*VPV->user_begin())) {
4210 switch (WR->getOpcode()) {
4211 case Instruction::UDiv:
4212 case Instruction::SDiv:
4213 case Instruction::URem:
4214 case Instruction::SRem:
4215 continue;
4216 default:
4217 break;
4218 }
4219 }
4220 }
4221 C += VPI->cost(VF, CostCtx);
4222 break;
4223 }
4225 unsigned Multiplier =
4226 cast<ConstantInt>(VPI->getOperand(2)->getLiveInIRValue())
4227 ->getZExtValue();
4228 C += VPI->cost(VF * Multiplier, CostCtx);
4229 break;
4230 }
4232 C += VPI->cost(VF, CostCtx);
4233 break;
4234 default:
4235 break;
4236 }
4237 }
4238 }
4239
4240 VectorizationFactor Candidate(VF, C, ScalarCost.ScalarCost);
4241 unsigned Width =
4242 estimateElementCount(Candidate.Width, CM.getVScaleForTuning());
4243 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << VF
4244 << " costs: " << (Candidate.Cost / Width));
4245 if (VF.isScalable())
4246 LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
4247 << CM.getVScaleForTuning().value_or(1) << ")");
4248 LLVM_DEBUG(dbgs() << ".\n");
4249
4250 if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) {
4251 LLVM_DEBUG(
4252 dbgs()
4253 << "LV: Not considering vector loop of width " << VF
4254 << " because it will not generate any vector instructions.\n");
4255 continue;
4256 }
4257
4258 if (CM.OptForSize && !ForceVectorization && hasReplicatorRegion(*P)) {
4259 LLVM_DEBUG(
4260 dbgs()
4261 << "LV: Not considering vector loop of width " << VF
4262 << " because it would cause replicated blocks to be generated,"
4263 << " which isn't allowed when optimizing for size.\n");
4264 continue;
4265 }
4266
4267 if (isMoreProfitable(Candidate, ChosenFactor, P->hasScalarTail()))
4268 ChosenFactor = Candidate;
4269 }
4270 }
4271
4272 if (!EnableCondStoresVectorization && CM.hasPredStores()) {
4273 reportVectorizationFailure(
4274 "There are conditional stores.",
4275 "store that is conditionally executed prevents vectorization",
4276 "ConditionalStore", ORE, OrigLoop);
4277 ChosenFactor = ScalarCost;
4278 }
4279
4280 LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
4281 !isMoreProfitable(ChosenFactor, ScalarCost,
4282 !CM.foldTailByMasking())) dbgs()
4283 << "LV: Vectorization seems to be not beneficial, "
4284 << "but was forced by a user.\n");
4285 return ChosenFactor;
4286}
4287#endif
4288
4289bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
4290 ElementCount VF) const {
4291 // Cross iteration phis such as fixed-order recurrences and FMaxNum/FMinNum
4292 // reductions need special handling and are currently unsupported.
4293 if (any_of(OrigLoop->getHeader()->phis(), [&](PHINode &Phi) {
4294 if (!Legal->isReductionVariable(&Phi))
4295 return Legal->isFixedOrderRecurrence(&Phi);
4296 return RecurrenceDescriptor::isFPMinMaxNumRecurrenceKind(
4297 Legal->getRecurrenceDescriptor(&Phi).getRecurrenceKind());
4298 }))
4299 return false;
4300
4301 // Phis with uses outside of the loop require special handling and are
4302 // currently unsupported.
4303 for (const auto &Entry : Legal->getInductionVars()) {
4304 // Look for uses of the value of the induction at the last iteration.
4305 Value *PostInc =
4306 Entry.first->getIncomingValueForBlock(OrigLoop->getLoopLatch());
4307 for (User *U : PostInc->users())
4308 if (!OrigLoop->contains(cast<Instruction>(U)))
4309 return false;
4310 // Look for uses of penultimate value of the induction.
4311 for (User *U : Entry.first->users())
4312 if (!OrigLoop->contains(cast<Instruction>(U)))
4313 return false;
4314 }
4315
4316 // Epilogue vectorization code has not been audited to ensure it handles
4317 // non-latch exits properly. It may be fine, but it needs to be audited and
4318 // tested.
4319 // TODO: Add support for loops with an early exit.
4320 if (OrigLoop->getExitingBlock() != OrigLoop->getLoopLatch())
4321 return false;
4322
4323 return true;
4324}
4325
4326bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
4327 const ElementCount VF, const unsigned IC) const {
4328 // FIXME: We need a much better cost-model to take different parameters such
4329 // as register pressure, code size increase and cost of extra branches into
4330 // account. For now we apply a very crude heuristic and only consider loops
4331 // with vectorization factors larger than a certain value.
4332
4333 // Allow the target to opt out entirely.
4334 if (!TTI.preferEpilogueVectorization())
4335 return false;
4336
4337 // We also consider epilogue vectorization unprofitable for targets that don't
4338 // consider interleaving beneficial (e.g. MVE).
4339 if (TTI.getMaxInterleaveFactor(VF) <= 1)
4340 return false;
4341
4342 unsigned MinVFThreshold = EpilogueVectorizationMinVF.getNumOccurrences() > 0
4343 ? EpilogueVectorizationMinVF
4344 : TTI.getEpilogueVectorizationMinVF();
4345 return estimateElementCount(VF * IC, VScaleForTuning) >= MinVFThreshold;
4346}
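// Editor's note (hypothetical worked example, not from the original source):
// for a main loop with VF = vscale x 2, IC = 2 and a tuning vscale of 4, the
// estimated element count is 2 * 4 * 2 = 16, so epilogue vectorization is
// considered profitable whenever the threshold above is 16 or less.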
4347
4348VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
4349 const ElementCount MainLoopVF, unsigned IC) {
4350 VectorizationFactor Result = VectorizationFactor::Disabled();
4351 if (!EnableEpilogueVectorization) {
4352 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n");
4353 return Result;
4354 }
4355
4356 if (!CM.isScalarEpilogueAllowed()) {
4357 LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because no "
4358 "epilogue is allowed.\n");
4359 return Result;
4360 }
4361
4362 // Not really a cost consideration, but check for unsupported cases here to
4363 // simplify the logic.
4364 if (!isCandidateForEpilogueVectorization(MainLoopVF)) {
4365 LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because the loop "
4366 "is not a supported candidate.\n");
4367 return Result;
4368 }
4369
4370 if (EpilogueVectorizationForceVF > 1) {
4371 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n");
4372 ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF);
4373 if (hasPlanWithVF(ForcedEC))
4374 return {ForcedEC, 0, 0};
4375
4376 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization forced factor is not "
4377 "viable.\n");
4378 return Result;
4379 }
4380
4381 if (OrigLoop->getHeader()->getParent()->hasOptSize()) {
4382 LLVM_DEBUG(
4383 dbgs() << "LEV: Epilogue vectorization skipped due to opt for size.\n");
4384 return Result;
4385 }
4386
4387 if (!CM.isEpilogueVectorizationProfitable(MainLoopVF, IC)) {
4388 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "
4389 "this loop\n");
4390 return Result;
4391 }
4392
4393 // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know
4394 // the main loop handles 8 lanes per iteration. We could still benefit from
4395 // vectorizing the epilogue loop with VF=4.
4396 ElementCount EstimatedRuntimeVF = ElementCount::getFixed(
4397 estimateElementCount(MainLoopVF, CM.getVScaleForTuning()));
4398
4399 ScalarEvolution &SE = *PSE.getSE();
4400 Type *TCType = Legal->getWidestInductionType();
4401 const SCEV *RemainingIterations = nullptr;
4402 unsigned MaxTripCount = 0;
4403 const SCEV *TC =
4404 vputils::getSCEVExprForVPValue(getPlanFor(MainLoopVF).getTripCount(), SE);
4405 assert(!isa<SCEVCouldNotCompute>(TC) && "Trip count SCEV must be computable");
4406 const SCEV *KnownMinTC;
4407 bool ScalableTC = match(TC, m_scev_c_Mul(m_SCEV(KnownMinTC), m_SCEVVScale()));
4408 // Use versions of TC and VF in which both are either scalable or fixed.
4409 if (ScalableTC == MainLoopVF.isScalable())
4410 RemainingIterations =
4411 SE.getURemExpr(TC, SE.getElementCount(TCType, MainLoopVF * IC));
4412 else if (ScalableTC) {
4413 const SCEV *EstimatedTC = SE.getMulExpr(
4414 KnownMinTC,
4415 SE.getConstant(TCType, CM.getVScaleForTuning().value_or(1)));
4416 RemainingIterations = SE.getURemExpr(
4417 EstimatedTC, SE.getElementCount(TCType, MainLoopVF * IC));
4418 } else
4419 RemainingIterations =
4420 SE.getURemExpr(TC, SE.getElementCount(TCType, EstimatedRuntimeVF * IC));
4421
4422 // No iterations left to process in the epilogue.
4423 if (RemainingIterations->isZero())
4424 return Result;
4425
4426 if (MainLoopVF.isFixed()) {
4427 MaxTripCount = MainLoopVF.getFixedValue() * IC - 1;
4428 if (SE.isKnownPredicate(CmpInst::ICMP_ULT, RemainingIterations,
4429 SE.getConstant(TCType, MaxTripCount))) {
4430 MaxTripCount = SE.getUnsignedRangeMax(RemainingIterations).getZExtValue();
4431 }
4432 LLVM_DEBUG(dbgs() << "LEV: Maximum Trip Count for Epilogue: "
4433 << MaxTripCount << "\n");
4434 }
4435
4436 for (auto &NextVF : ProfitableVFs) {
4437 // Skip candidate VFs without a corresponding VPlan.
4438 if (!hasPlanWithVF(NextVF.Width))
4439 continue;
4440
4441 // Skip candidate VFs with widths >= the (estimated) runtime VF (scalable
4442 // vectors) or > the VF of the main loop (fixed vectors).
4443 if ((!NextVF.Width.isScalable() && MainLoopVF.isScalable() &&
4444 ElementCount::isKnownGE(NextVF.Width, EstimatedRuntimeVF)) ||
4445 (NextVF.Width.isScalable() &&
4446 ElementCount::isKnownGE(NextVF.Width, MainLoopVF)) ||
4447 (!NextVF.Width.isScalable() && !MainLoopVF.isScalable() &&
4448 ElementCount::isKnownGT(NextVF.Width, MainLoopVF)))
4449 continue;
4450
4451 // If NextVF is greater than the number of remaining iterations, the
4452 // epilogue loop would be dead. Skip such factors.
4453 if (RemainingIterations && !NextVF.Width.isScalable()) {
4454 if (SE.isKnownPredicate(
4455 CmpInst::ICMP_UGT,
4456 SE.getConstant(TCType, NextVF.Width.getFixedValue()),
4457 RemainingIterations))
4458 continue;
4459 }
4460
4461 if (Result.Width.isScalar() ||
4462 isMoreProfitable(NextVF, Result, MaxTripCount, !CM.foldTailByMasking(),
4463 /*IsEpilogue*/ true))
4464 Result = NextVF;
4465 }
4466
4467 if (Result != VectorizationFactor::Disabled())
4468 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
4469 << Result.Width << "\n");
4470 return Result;
4471}
4472
4473std::pair<unsigned, unsigned>
4474LoopVectorizationCostModel::getSmallestAndWidestTypes() {
4475 unsigned MinWidth = -1U;
4476 unsigned MaxWidth = 8;
4477 const DataLayout &DL = TheFunction->getDataLayout();
4478 // For in-loop reductions, no element types are added to ElementTypesInLoop
4479 // if there are no loads/stores in the loop. In this case, check through the
4480 // reduction variables to determine the maximum width.
4481 if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) {
4482 for (const auto &PhiDescriptorPair : Legal->getReductionVars()) {
4483 const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second;
4484 // When finding the min width used by the recurrence we need to account
4485 // for casts on the input operands of the recurrence.
4486 MinWidth = std::min(
4487 MinWidth,
4488 std::min(RdxDesc.getMinWidthCastToRecurrenceTypeInBits(),
4489 RdxDesc.getRecurrenceType()->getScalarSizeInBits()));
4490 MaxWidth = std::max(MaxWidth,
4491 RdxDesc.getRecurrenceType()->getScalarSizeInBits());
4492 }
4493 } else {
4494 for (Type *T : ElementTypesInLoop) {
4495 MinWidth = std::min<unsigned>(
4496 MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue());
4497 MaxWidth = std::max<unsigned>(
4498 MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue());
4499 }
4500 }
4501 return {MinWidth, MaxWidth};
4502}
4503
4504void LoopVectorizationCostModel::collectElementTypesForWidening() {
4505 ElementTypesInLoop.clear();
4506 // For each block.
4507 for (BasicBlock *BB : TheLoop->blocks()) {
4508 // For each instruction in the loop.
4509 for (Instruction &I : BB->instructionsWithoutDebug()) {
4510 Type *T = I.getType();
4511
4512 // Skip ignored values.
4513 if (ValuesToIgnore.count(&I))
4514 continue;
4515
4516 // Only examine Loads, Stores and PHINodes.
4517 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
4518 continue;
4519
4520 // Examine PHI nodes that are reduction variables. Update the type to
4521 // account for the recurrence type.
4522 if (auto *PN = dyn_cast<PHINode>(&I)) {
4523 if (!Legal->isReductionVariable(PN))
4524 continue;
4525 const RecurrenceDescriptor &RdxDesc =
4526 Legal->getRecurrenceDescriptor(PN);
4527 if (PreferInLoopReductions || useOrderedReductions(RdxDesc) ||
4528 TTI.preferInLoopReduction(RdxDesc.getRecurrenceKind(),
4529 RdxDesc.getRecurrenceType()))
4530 continue;
4531 T = RdxDesc.getRecurrenceType();
4532 }
4533
4534 // Examine the stored values.
4535 if (auto *ST = dyn_cast<StoreInst>(&I))
4536 T = ST->getValueOperand()->getType();
4537
4538 assert(T->isSized() &&
4539 "Expected the load/store/recurrence type to be sized");
4540
4541 ElementTypesInLoop.insert(T);
4542 }
4543 }
4544}
4545
4546unsigned
4547LoopVectorizationPlanner::selectInterleaveCount(VPlan &Plan, ElementCount VF,
4548 InstructionCost LoopCost) {
4549 // -- The interleave heuristics --
4550 // We interleave the loop in order to expose ILP and reduce the loop overhead.
4551 // There are many micro-architectural considerations that we can't predict
4552 // at this level. For example, frontend pressure (on decode or fetch) due to
4553 // code size, or the number and capabilities of the execution ports.
4554 //
4555 // We use the following heuristics to select the interleave count:
4556 // 1. If the code has reductions, then we interleave to break the cross
4557 // iteration dependency.
4558 // 2. If the loop is really small, then we interleave to reduce the loop
4559 // overhead.
4560 // 3. We don't interleave if we think that we will spill registers to memory
4561 // due to the increased register pressure.
4562
4563 if (!CM.isScalarEpilogueAllowed())
4564 return 1;
4565
4566 // Do not interleave if EVL is preferred and no user interleave count is set.
4567 if (CM.foldTailWithEVL()) {
4568 LLVM_DEBUG(dbgs() << "LV: Preference for VP intrinsics indicated. "
4569 "Unroll factor forced to be 1.\n");
4570 return 1;
4571 }
4572
4573 // The maximum safe dependence distance already bounds the VF; do not interleave further.
4574 if (!Legal->isSafeForAnyVectorWidth())
4575 return 1;
4576
4577 // We don't attempt to perform interleaving for loops with uncountable early
4578 // exits because the VPInstruction::AnyOf code cannot currently handle
4579 // multiple parts.
4580 if (Plan.hasEarlyExit())
4581 return 1;
4582
4583 const bool HasReductions =
4586
4587 // If we did not calculate the cost for VF (because the user selected the VF)
4588 // then we calculate the cost of VF here.
4589 if (LoopCost == 0) {
4590 if (VF.isScalar())
4591 LoopCost = CM.expectedCost(VF);
4592 else
4593 LoopCost = cost(Plan, VF);
4594 assert(LoopCost.isValid() && "Expected to have chosen a VF with valid cost");
4595
4596 // Loop body is free and there is no need for interleaving.
4597 if (LoopCost == 0)
4598 return 1;
4599 }
4600
4601 VPRegisterUsage R =
4602 calculateRegisterUsageForPlan(Plan, {VF}, TTI, CM.ValuesToIgnore)[0];
4603 // We divide by these constants so assume that we have at least one
4604 // instruction that uses at least one register.
4605 for (auto &Pair : R.MaxLocalUsers) {
4606 Pair.second = std::max(Pair.second, 1U);
4607 }
4608
4609 // We calculate the interleave count using the following formula.
4610 // Subtract the number of loop invariants from the number of available
4611 // registers. These registers are used by all of the interleaved instances.
4612 // Next, divide the remaining registers by the number of registers that is
4613 // required by the loop, in order to estimate how many parallel instances
4614 // fit without causing spills. All of this is rounded down if necessary to be
4615 // a power of two. We want a power-of-two interleave count to simplify any
4616 // addressing operations and alignment considerations.
4617 // We also want a power-of-two interleave count to ensure that the induction
4618 // variable of the vector loop wraps to zero when the tail is folded by masking;
4619 // this currently happens when OptForSize, in which case IC is set to 1 above.
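// For illustration, with hypothetical numbers: on a target with 32 vector
// registers, 2 loop-invariant values, and 6 simultaneously live values per
// iteration, the computation below gives (32 - 2) / 6 = 5, rounded down to
// the power of two 4; with the induction-variable heuristic it is
// (32 - 2 - 1) / (6 - 1) = 5, again rounded down to 4. The final IC is the
// minimum of such values over all register classes.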
4620 unsigned IC = UINT_MAX;
4621
4622 for (const auto &Pair : R.MaxLocalUsers) {
4623 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(Pair.first);
4624 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
4625 << " registers of "
4626 << TTI.getRegisterClassName(Pair.first)
4627 << " register class\n");
4628 if (VF.isScalar()) {
4629 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
4630 TargetNumRegisters = ForceTargetNumScalarRegs;
4631 } else {
4632 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
4633 TargetNumRegisters = ForceTargetNumVectorRegs;
4634 }
4635 unsigned MaxLocalUsers = Pair.second;
4636 unsigned LoopInvariantRegs = 0;
4637 if (R.LoopInvariantRegs.contains(Pair.first))
4638 LoopInvariantRegs = R.LoopInvariantRegs[Pair.first];
4639
4640 unsigned TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs) /
4641 MaxLocalUsers);
4642 // Don't count the induction variable as interleaved.
4643 if (EnableIndVarRegisterHeur) {
4644 TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs - 1) /
4645 std::max(1U, (MaxLocalUsers - 1)));
4646 }
4647
4648 IC = std::min(IC, TmpIC);
4649 }
4650
4651 // Clamp the interleave ranges to reasonable counts.
4652 unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
4653
4654 // Check if the user has overridden the max.
4655 if (VF.isScalar()) {
4656 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
4657 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
4658 } else {
4659 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
4660 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
4661 }
4662
4663 // Try to get the exact trip count or, failing that, an estimate based on
4664 // profiling data or the ConstantMax from PSE.
4665 auto BestKnownTC = getSmallBestKnownTC(PSE, OrigLoop);
4666
4667 // For fixed length VFs treat a scalable trip count as unknown.
4668 if (BestKnownTC && (BestKnownTC->isFixed() || VF.isScalable())) {
4669 // Re-evaluate trip counts and VFs to be in the same numerical space.
4670 unsigned AvailableTC =
4671 estimateElementCount(*BestKnownTC, CM.getVScaleForTuning());
4672 unsigned EstimatedVF = estimateElementCount(VF, CM.getVScaleForTuning());
4673
4674 // At least one iteration must be scalar when this constraint holds. So the
4675 // maximum available iterations for interleaving is one less.
4676 if (CM.requiresScalarEpilogue(VF.isVector()))
4677 --AvailableTC;
4678
4679 unsigned InterleaveCountLB = bit_floor(std::max(
4680 1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));
4681
4682 if (getSmallConstantTripCount(PSE.getSE(), OrigLoop).isNonZero()) {
4683 // If the best known trip count is exact, we select between two
4684 // prospective ICs, where
4685 //
4686 // 1) the aggressive IC is capped by the trip count divided by VF
4687 // 2) the conservative IC is capped by the trip count divided by (VF * 2)
4688 //
4689 // The final IC is selected in a way that the epilogue loop trip count is
4690 // minimized while maximizing the IC itself, so that we either run the
4691 // vector loop at least once if it generates a small epilogue loop, or
4692 // else we run the vector loop at least twice.
4693
4694 unsigned InterleaveCountUB = bit_floor(std::max(
4695 1u, std::min(AvailableTC / EstimatedVF, MaxInterleaveCount)));
4696 MaxInterleaveCount = InterleaveCountLB;
4697
4698 if (InterleaveCountUB != InterleaveCountLB) {
4699 unsigned TailTripCountUB =
4700 (AvailableTC % (EstimatedVF * InterleaveCountUB));
4701 unsigned TailTripCountLB =
4702 (AvailableTC % (EstimatedVF * InterleaveCountLB));
4703 // If both produce the same scalar tail, maximize the IC to do the same
4704 // work in fewer vector loop iterations.
4705 if (TailTripCountUB == TailTripCountLB)
4706 MaxInterleaveCount = InterleaveCountUB;
4707 }
4708 } else {
4709 // If the trip count is only an estimated compile-time constant, cap the
4710 // IC at the trip count divided by VF * 2, so that the vector loop runs at
4711 // least twice. This makes interleaving look profitable when there is an
4712 // epilogue loop present. Since the exact trip count is not known, we
4713 // choose to be conservative in our IC estimate.
4714 MaxInterleaveCount = InterleaveCountLB;
4715 }
4716 }
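// A hypothetical example of the clamping above, assuming no scalar epilogue
// iteration is required: with an exact trip count of 24, an estimated VF of
// 4, and a target maximum of 8, the conservative bound is
// bit_floor(min(24 / (4 * 2), 8)) = 2 and the aggressive bound is
// bit_floor(min(24 / 4, 8)) = 4. The scalar tails are 24 % (4 * 4) = 8 and
// 24 % (4 * 2) = 0 respectively; they differ, so the conservative bound of 2
// is kept. Had both tails been equal, the aggressive bound would be used.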
4717
4718 assert(MaxInterleaveCount > 0 &&
4719 "Maximum interleave count must be greater than 0");
4720
4721 // Clamp the calculated IC to be between the 1 and the max interleave count
4722 // that the target and trip count allows.
4723 if (IC > MaxInterleaveCount)
4724 IC = MaxInterleaveCount;
4725 else
4726 // Make sure IC is greater than 0.
4727 IC = std::max(1u, IC);
4728
4729 assert(IC > 0 && "Interleave count must be greater than 0.");
4730
4731 // Interleave if we vectorized this loop and there is a reduction that could
4732 // benefit from interleaving.
4733 if (VF.isVector() && HasReductions) {
4734 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
4735 return IC;
4736 }
4737
4738 // For any scalar loop that either requires runtime checks or predication we
4739 // are better off leaving this to the unroller. Note that if we've already
4740 // vectorized the loop we will have done the runtime check and so interleaving
4741 // won't require further checks.
4742 bool ScalarInterleavingRequiresPredication =
4743 (VF.isScalar() && any_of(OrigLoop->blocks(), [this](BasicBlock *BB) {
4744 return Legal->blockNeedsPredication(BB);
4745 }));
4746 bool ScalarInterleavingRequiresRuntimePointerCheck =
4747 (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);
4748
4749 // We want to interleave small loops in order to reduce the loop overhead and
4750 // potentially expose ILP opportunities.
4751 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
4752 << "LV: IC is " << IC << '\n'
4753 << "LV: VF is " << VF << '\n');
4754 const bool AggressivelyInterleaveReductions =
4755 TTI.enableAggressiveInterleaving(HasReductions);
4756 if (!ScalarInterleavingRequiresRuntimePointerCheck &&
4757 !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) {
4758 // We assume that the cost overhead is 1 and we use the cost model
4759 // to estimate the cost of the loop and interleave until the cost of the
4760 // loop overhead is about 5% of the cost of the loop.
4761 unsigned SmallIC = std::min(IC, (unsigned)llvm::bit_floor<uint64_t>(
4762 SmallLoopCost / LoopCost.getValue()));
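// For example, with hypothetical values: if SmallLoopCost were 20 and the
// loop body cost 6, then bit_floor(20 / 6) = 2 and SmallIC = min(IC, 2),
// i.e. interleave only as much as keeps the loop overhead small relative to
// the body cost.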
4763
4764 // Interleave until store/load ports (estimated by max interleave count) are
4765 // saturated.
4766 unsigned NumStores = 0;
4767 unsigned NumLoads = 0;
4770 for (VPRecipeBase &R : *VPBB) {
4772 NumLoads++;
4773 continue;
4774 }
4776 NumStores++;
4777 continue;
4778 }
4779
4780 if (auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(&R)) {
4781 if (unsigned StoreOps = InterleaveR->getNumStoreOperands())
4782 NumStores += StoreOps;
4783 else
4784 NumLoads += InterleaveR->getNumDefinedValues();
4785 continue;
4786 }
4787 if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
4788 NumLoads += isa<LoadInst>(RepR->getUnderlyingInstr());
4789 NumStores += isa<StoreInst>(RepR->getUnderlyingInstr());
4790 continue;
4791 }
4792 if (isa<VPHistogramRecipe>(&R)) {
4793 NumLoads++;
4794 NumStores++;
4795 continue;
4796 }
4797 }
4798 }
4799 unsigned StoresIC = IC / (NumStores ? NumStores : 1);
4800 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
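// For example, with hypothetical values: IC = 8 with 2 stores and 4 loads
// gives StoresIC = 4 and LoadsIC = 2; if runtime load/store interleaving is
// enabled and max(StoresIC, LoadsIC) exceeds SmallIC, the larger value (4)
// is returned below to saturate the memory ports.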
4801
4802 // There is little point in interleaving for reductions containing selects
4803 // and compares when VF=1 since it may just create more overhead than it's
4804 // worth for loops with small trip counts. This is because we still have to
4805 // do the final reduction after the loop.
4806 bool HasSelectCmpReductions =
4807 HasReductions &&
4809 [](VPRecipeBase &R) {
4810 auto *RedR = dyn_cast<VPReductionPHIRecipe>(&R);
4811 return RedR && (RecurrenceDescriptor::isAnyOfRecurrenceKind(
4812 RedR->getRecurrenceKind()) ||
4813 RecurrenceDescriptor::isFindIVRecurrenceKind(
4814 RedR->getRecurrenceKind()));
4815 });
4816 if (HasSelectCmpReductions) {
4817 LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n");
4818 return 1;
4819 }
4820
4821 // If we have a scalar reduction (vector reductions are already dealt with
4822 // by this point), we can increase the critical path length if the loop
4823 // we're interleaving is inside another loop. For tree-wise reductions
4824 // set the limit to 2, and for ordered reductions it's best to disable
4825 // interleaving entirely.
4826 if (HasReductions && OrigLoop->getLoopDepth() > 1) {
4827 bool HasOrderedReductions =
4829 [](VPRecipeBase &R) {
4830 auto *RedR = dyn_cast<VPReductionPHIRecipe>(&R);
4831
4832 return RedR && RedR->isOrdered();
4833 });
4834 if (HasOrderedReductions) {
4835 LLVM_DEBUG(
4836 dbgs() << "LV: Not interleaving scalar ordered reductions.\n");
4837 return 1;
4838 }
4839
4840 unsigned F = MaxNestedScalarReductionIC;
4841 SmallIC = std::min(SmallIC, F);
4842 StoresIC = std::min(StoresIC, F);
4843 LoadsIC = std::min(LoadsIC, F);
4844 }
4845
4846 if (EnableLoadStoreRuntimeInterleave &&
4847 std::max(StoresIC, LoadsIC) > SmallIC) {
4848 LLVM_DEBUG(
4849 dbgs() << "LV: Interleaving to saturate store or load ports.\n");
4850 return std::max(StoresIC, LoadsIC);
4851 }
4852
4853 // If there are scalar reductions and TTI has enabled aggressive
4854 // interleaving for reductions, we will interleave to expose ILP.
4855 if (VF.isScalar() && AggressivelyInterleaveReductions) {
4856 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
4857 // Interleave no less than SmallIC but not as aggressive as the normal IC
4858 // to satisfy the rare situation when resources are too limited.
4859 return std::max(IC / 2, SmallIC);
4860 }
4861
4862 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
4863 return SmallIC;
4864 }
4865
4866 // Interleave if this is a large loop (small loops are already dealt with by
4867 // this point) that could benefit from interleaving.
4868 if (AggressivelyInterleaveReductions) {
4869 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
4870 return IC;
4871 }
4872
4873 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
4874 return 1;
4875}
4876
4877bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I,
4878 ElementCount VF) {
4879 // TODO: Cost model for emulated masked load/store is completely
4880 // broken. This hack guides the cost model to use an artificially
4881 // high enough value to practically disable vectorization with such
4882 // operations, except where the previously deployed legality hack allowed
4883 // using very low cost values. This is to avoid regressions coming simply
4884 // from moving the "masked load/store" check from legality to the cost model.
4885 // Masked load/gather emulation was previously never allowed.
4886 // Only a limited amount of masked store/scatter emulation was allowed.
4887 assert((isPredicatedInst(I)) &&
4888 "Expecting a scalar emulated instruction");
4889 return isa<LoadInst>(I) ||
4890 (isa<StoreInst>(I) &&
4891 NumPredStores > NumberOfStoresToPredicate);
4892}
4893
4894 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
4895 assert(VF.isVector() && "Expected VF >= 2");
4896
4897 // If we've already collected the instructions to scalarize or the predicated
4898 // BBs after vectorization, there's nothing to do. Collection may already have
4899 // occurred if we have a user-selected VF and are now computing the expected
4900 // cost for interleaving.
4901 if (InstsToScalarize.contains(VF) ||
4902 PredicatedBBsAfterVectorization.contains(VF))
4903 return;
4904
4905 // Initialize a mapping for VF in InstsToScalarize. If we find that it's
4906 // not profitable to scalarize any instructions, the presence of VF in the
4907 // map will indicate that we've analyzed it already.
4908 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
4909
4910 // Find all the instructions that are scalar with predication in the loop and
4911 // determine if it would be better to not if-convert the blocks they are in.
4912 // If so, we also record the instructions to scalarize.
4913 for (BasicBlock *BB : TheLoop->blocks()) {
4914 if (!blockNeedsPredicationForAnyReason(BB))
4915 continue;
4916 for (Instruction &I : *BB)
4917 if (isScalarWithPredication(&I, VF)) {
4918 ScalarCostsTy ScalarCosts;
4919 // Do not apply discount logic for:
4920 // 1. Scalars after vectorization, as there will only be a single copy
4921 // of the instruction.
4922 // 2. Scalable VF, as that would lead to invalid scalarization costs.
4923 // 3. Emulated masked memrefs, if a hacked cost is needed.
4924 if (!isScalarAfterVectorization(&I, VF) && !VF.isScalable() &&
4925 !useEmulatedMaskMemRefHack(&I, VF) &&
4926 computePredInstDiscount(&I, ScalarCosts, VF) >= 0) {
4927 for (const auto &[I, IC] : ScalarCosts)
4928 ScalarCostsVF.insert({I, IC});
4929 // Check if we decided to scalarize a call. If so, update the widening
4930 // decision of the call to CM_Scalarize with the computed scalar cost.
4931 for (const auto &[I, Cost] : ScalarCosts) {
4932 auto *CI = dyn_cast<CallInst>(I);
4933 if (!CI || !CallWideningDecisions.contains({CI, VF}))
4934 continue;
4935 CallWideningDecisions[{CI, VF}].Kind = CM_Scalarize;
4936 CallWideningDecisions[{CI, VF}].Cost = Cost;
4937 }
4938 }
4939 // Remember that BB will remain after vectorization.
4940 PredicatedBBsAfterVectorization[VF].insert(BB);
4941 for (auto *Pred : predecessors(BB)) {
4942 if (Pred->getSingleSuccessor() == BB)
4943 PredicatedBBsAfterVectorization[VF].insert(Pred);
4944 }
4945 }
4946 }
4947}
4948
4949InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
4950 Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
4951 assert(!isUniformAfterVectorization(PredInst, VF) &&
4952 "Instruction marked uniform-after-vectorization will be predicated");
4953
4954 // Initialize the discount to zero, meaning that the scalar version and the
4955 // vector version cost the same.
4956 InstructionCost Discount = 0;
4957
4958 // Holds instructions to analyze. The instructions we visit are mapped in
4959 // ScalarCosts. Those instructions are the ones that would be scalarized if
4960 // we find that the scalar version costs less.
4961 SmallVector<Instruction *, 8> Worklist;
4962 
4963 // Returns true if the given instruction can be scalarized.
4964 auto CanBeScalarized = [&](Instruction *I) -> bool {
4965 // We only attempt to scalarize instructions forming a single-use chain
4966 // from the original predicated block that would otherwise be vectorized.
4967 // Although not strictly necessary, we give up on instructions we know will
4968 // already be scalar to avoid traversing chains that are unlikely to be
4969 // beneficial.
4970 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
4971 isScalarAfterVectorization(I, VF))
4972 return false;
4973
4974 // If the instruction is scalar with predication, it will be analyzed
4975 // separately. We ignore it within the context of PredInst.
4976 if (isScalarWithPredication(I, VF))
4977 return false;
4978
4979 // If any of the instruction's operands are uniform after vectorization,
4980 // the instruction cannot be scalarized. This prevents, for example, a
4981 // masked load from being scalarized.
4982 //
4983 // We assume we will only emit a value for lane zero of an instruction
4984 // marked uniform after vectorization, rather than VF identical values.
4985 // Thus, if we scalarize an instruction that uses a uniform, we would
4986 // create uses of values corresponding to the lanes we aren't emitting code
4987 // for. This behavior can be changed by allowing getScalarValue to clone
4988 // the lane zero values for uniforms rather than asserting.
4989 for (Use &U : I->operands())
4990 if (auto *J = dyn_cast<Instruction>(U.get()))
4991 if (isUniformAfterVectorization(J, VF))
4992 return false;
4993
4994 // Otherwise, we can scalarize the instruction.
4995 return true;
4996 };
4997
4998 // Compute the expected cost discount from scalarizing the entire expression
4999 // feeding the predicated instruction. We currently only consider expressions
5000 // that are single-use instruction chains.
5001 Worklist.push_back(PredInst);
5002 while (!Worklist.empty()) {
5003 Instruction *I = Worklist.pop_back_val();
5004
5005 // If we've already analyzed the instruction, there's nothing to do.
5006 if (ScalarCosts.contains(I))
5007 continue;
5008
5009 // Cannot scalarize fixed-order recurrence phis at the moment.
5010 if (isa<PHINode>(I) && Legal->isFixedOrderRecurrence(cast<PHINode>(I)))
5011 continue;
5012
5013 // Compute the cost of the vector instruction. Note that this cost already
5014 // includes the scalarization overhead of the predicated instruction.
5015 InstructionCost VectorCost = getInstructionCost(I, VF);
5016
5017 // Compute the cost of the scalarized instruction. This cost is the cost of
5018 // the instruction as if it wasn't if-converted and instead remained in the
5019 // predicated block. We will scale this cost by block probability after
5020 // computing the scalarization overhead.
5021 InstructionCost ScalarCost =
5022 VF.getFixedValue() * getInstructionCost(I, ElementCount::getFixed(1));
5023
5024 // Compute the scalarization overhead of needed insertelement instructions
5025 // and phi nodes.
5026 if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) {
5027 Type *WideTy = toVectorizedTy(I->getType(), VF);
5028 for (Type *VectorTy : getContainedTypes(WideTy)) {
5029 ScalarCost += TTI.getScalarizationOverhead(
5030 cast<VectorType>(VectorTy), APInt::getAllOnes(VF.getFixedValue()),
5031 /*Insert=*/true,
5032 /*Extract=*/false, CostKind);
5033 }
5034 ScalarCost +=
5035 VF.getFixedValue() * TTI.getCFInstrCost(Instruction::PHI, CostKind);
5036 }
5037
5038 // Compute the scalarization overhead of needed extractelement
5039 // instructions. For each of the instruction's operands, if the operand can
5040 // be scalarized, add it to the worklist; otherwise, account for the
5041 // overhead.
5042 for (Use &U : I->operands())
5043 if (auto *J = dyn_cast<Instruction>(U.get())) {
5044 assert(canVectorizeTy(J->getType()) &&
5045 "Instruction has non-scalar type");
5046 if (CanBeScalarized(J))
5047 Worklist.push_back(J);
5048 else if (needsExtract(J, VF)) {
5049 Type *WideTy = toVectorizedTy(J->getType(), VF);
5050 for (Type *VectorTy : getContainedTypes(WideTy)) {
5051 ScalarCost += TTI.getScalarizationOverhead(
5052 cast<VectorType>(VectorTy),
5053 APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ false,
5054 /*Extract*/ true, CostKind);
5055 }
5056 }
5057 }
5058
5059 // Scale the total scalar cost by block probability.
5060 ScalarCost /= getPredBlockCostDivisor(CostKind, I->getParent());
5061
5062 // Compute the discount. A non-negative discount means the vector version
5063 // of the instruction costs more, and scalarizing would be beneficial.
5064 Discount += VectorCost - ScalarCost;
5065 ScalarCosts[I] = ScalarCost;
5066 }
5067
5068 return Discount;
5069}
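// A rough illustration of the discount computed above, with hypothetical
// costs and ignoring the insert/extract overhead: for VF = 4, a predicated
// instruction whose vector form costs 12 and whose scalar form costs 2 has a
// raw scalar cost of 4 * 2 = 8; with a block-cost divisor of 2 this becomes
// 4, so the instruction contributes 12 - 4 = 8 to the discount. A
// non-negative total discount means scalarizing the chain is at least as
// cheap as keeping it vectorized.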
5070
5071 InstructionCost LoopVectorizationCostModel::expectedCost(ElementCount VF) {
5072 InstructionCost Cost;
5073 
5074 // If the vector loop gets executed exactly once with the given VF, ignore the
5075 // costs of comparison and induction instructions, as they'll get simplified
5076 // away.
5077 SmallPtrSet<Instruction *, 2> ValuesToIgnoreForVF;
5078 auto TC = getSmallConstantTripCount(PSE.getSE(), TheLoop);
5079 if (TC == VF && !foldTailByMasking())
5081 ValuesToIgnoreForVF);
5082
5083 // For each block.
5084 for (BasicBlock *BB : TheLoop->blocks()) {
5085 InstructionCost BlockCost;
5086
5087 // For each instruction in the old loop.
5088 for (Instruction &I : BB->instructionsWithoutDebug()) {
5089 // Skip ignored values.
5090 if (ValuesToIgnore.count(&I) || ValuesToIgnoreForVF.count(&I) ||
5091 (VF.isVector() && VecValuesToIgnore.count(&I)))
5092 continue;
5093
5094 InstructionCost C = getInstructionCost(&I, VF);
5095 
5096 // Check if we should override the cost.
5097 if (C.isValid() && ForceTargetInstructionCost.getNumOccurrences() > 0)
5098 C = InstructionCost(ForceTargetInstructionCost);
5099 
5100 BlockCost += C;
5101 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C << " for VF "
5102 << VF << " For instruction: " << I << '\n');
5103 }
5104
5105 // If we are vectorizing a predicated block, it will have been
5106 // if-converted. This means that the block's instructions (aside from
5107 // stores and instructions that may divide by zero) will now be
5108 // unconditionally executed. For the scalar case, we may not always execute
5109 // the predicated block, if it is an if-else block. Thus, scale the block's
5110 // cost by the probability of executing it.
5111 // getPredBlockCostDivisor will return 1 for blocks that are only predicated
5112 // by the header mask when folding the tail.
5113 if (VF.isScalar())
5114 BlockCost /= getPredBlockCostDivisor(CostKind, BB);
5115
5116 Cost += BlockCost;
5117 }
5118
5119 return Cost;
5120}
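// As a small illustration of the scaling above, with hypothetical costs: for
// the scalar VF, a predicated block whose instructions sum to 10 contributes
// 10 / 2 = 5 to the total when the block-cost divisor is 2, while
// unpredicated blocks contribute their full cost.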
5121
5122/// Gets Address Access SCEV after verifying that the access pattern
5123/// is loop invariant except the induction variable dependence.
5124///
5125/// This SCEV can be sent to the Target in order to estimate the address
5126/// calculation cost.
5127 static const SCEV *getAddressAccessSCEV(
5128 Value *Ptr,
5129 LoopVectorizationLegality *Legal,
5130 PredicatedScalarEvolution &PSE,
5131 const Loop *TheLoop) {
5132
5133 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
5134 if (!Gep)
5135 return nullptr;
5136
5137 // We are looking for a gep with all loop invariant indices except for one
5138 // which should be an induction variable.
5139 auto *SE = PSE.getSE();
5140 unsigned NumOperands = Gep->getNumOperands();
5141 for (unsigned Idx = 1; Idx < NumOperands; ++Idx) {
5142 Value *Opd = Gep->getOperand(Idx);
5143 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
5144 !Legal->isInductionVariable(Opd))
5145 return nullptr;
5146 }
5147
5148 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV.
5149 return PSE.getSCEV(Ptr);
5150}
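// For example, a GEP such as
//   getelementptr inbounds [1024 x i32], ptr %base, i64 0, i64 %iv
// (hypothetical IR), where %iv is an induction variable and every other
// index is loop invariant, qualifies and its SCEV is returned; a GEP whose
// index is produced by a load inside the loop does not, and nullptr is
// returned instead.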
5151
5153LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
5154 ElementCount VF) {
5155 assert(VF.isVector() &&
5156 "Scalarization cost of instruction implies vectorization.");
5157 if (VF.isScalable())
5158 return InstructionCost::getInvalid();
5159
5160 Type *ValTy = getLoadStoreType(I);
5161 auto *SE = PSE.getSE();
5162
5163 unsigned AS = getLoadStoreAddressSpace(I);
5165 Type *PtrTy = toVectorTy(Ptr->getType(), VF);
5166 // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost`
5167 // that it is being called from this specific place.
5168
5169 // Figure out whether the access is strided and get the stride value
5170 // if it is known at compile time.
5171 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
5172
5173 // Get the cost of the scalar memory instruction and address computation.
5175 PtrTy, SE, PtrSCEV, CostKind);
5176
5177 // Don't pass *I here, since it is scalar but will actually be part of a
5178 // vectorized loop where the user of it is a vectorized instruction.
5179 const Align Alignment = getLoadStoreAlignment(I);
5180 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
5181 Cost += VF.getFixedValue() *
5182 TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment,
5183 AS, CostKind, OpInfo);
5184
5185 // Get the overhead of the extractelement and insertelement instructions
5186 // we might create due to scalarization.
5188
5189 // If we have a predicated load/store, it will need extra i1 extracts and
5190 // conditional branches, but may not be executed for each vector lane. Scale
5191 // the cost by the probability of executing the predicated block.
5192 if (isPredicatedInst(I)) {
5193 Cost /= getPredBlockCostDivisor(CostKind, I->getParent());
5194
5195 // Add the cost of an i1 extract and a branch
5196 auto *VecI1Ty =
5197 VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF);
5199 VecI1Ty, APInt::getAllOnes(VF.getFixedValue()),
5200 /*Insert=*/false, /*Extract=*/true, CostKind);
5201 Cost += TTI.getCFInstrCost(Instruction::Br, CostKind);
5202
5203 if (useEmulatedMaskMemRefHack(I, VF))
5204 // Artificially setting to a high enough value to practically disable
5205 // vectorization with such operations.
5206 Cost = 3000000;
5207 }
5208
5209 return Cost;
5210}
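// Roughly, for a fixed VF of 4 the cost above is the address computation
// plus 4 scalar memory operations plus the insert/extract overhead. If the
// access is predicated, that sum is divided by the block-cost divisor and
// the cost of extracting 4 i1 mask bits plus a branch is added; when the
// emulated-mask hack applies, the cost is pinned at 3000000 to effectively
// forbid vectorization.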
5211
5213LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
5214 ElementCount VF) {
5215 Type *ValTy = getLoadStoreType(I);
5216 auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
5218 unsigned AS = getLoadStoreAddressSpace(I);
5219 int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr);
5220
5221 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
5222 "Stride should be 1 or -1 for consecutive memory access");
5223 const Align Alignment = getLoadStoreAlignment(I);
5225 if (Legal->isMaskRequired(I)) {
5226 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
5227 CostKind);
5228 } else {
5229 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
5230 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
5231 CostKind, OpInfo, I);
5232 }
5233
5234 bool Reverse = ConsecutiveStride < 0;
5235 if (Reverse)
5237 VectorTy, {}, CostKind, 0);
5238 return Cost;
5239}
5240
5242LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
5243 ElementCount VF) {
5244 assert(Legal->isUniformMemOp(*I, VF));
5245
5246 Type *ValTy = getLoadStoreType(I);
5248 auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
5249 const Align Alignment = getLoadStoreAlignment(I);
5250 unsigned AS = getLoadStoreAddressSpace(I);
5251 if (isa<LoadInst>(I)) {
5252 return TTI.getAddressComputationCost(PtrTy, nullptr, nullptr, CostKind) +
5253 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
5254 CostKind) +
5256 VectorTy, {}, CostKind);
5257 }
5258 StoreInst *SI = cast<StoreInst>(I);
5259
5260 bool IsLoopInvariantStoreValue = Legal->isInvariant(SI->getValueOperand());
5261 // TODO: We have existing tests that request the cost of extracting element
5262 // VF.getKnownMinValue() - 1 from a scalable vector. This does not represent
5263 // the actual generated code, which involves extracting the last element of
5264 // a scalable vector where the lane to extract is unknown at compile time.
5266 TTI.getAddressComputationCost(PtrTy, nullptr, nullptr, CostKind) +
5267 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, CostKind);
5268 if (!IsLoopInvariantStoreValue)
5269 Cost += TTI.getIndexedVectorInstrCostFromEnd(Instruction::ExtractElement,
5270 VectorTy, CostKind, 0);
5271 return Cost;
5272}
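// In short, with hypothetical unit costs: a uniform load is one scalar load
// plus one broadcast, a uniform store of a loop-invariant value is just one
// scalar store, and a uniform store of a varying value additionally pays for
// extracting the last vector element.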
5273
5275LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
5276 ElementCount VF) {
5277 Type *ValTy = getLoadStoreType(I);
5278 auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
5279 const Align Alignment = getLoadStoreAlignment(I);
5281 Type *PtrTy = Ptr->getType();
5282
5283 if (!Legal->isUniform(Ptr, VF))
5284 PtrTy = toVectorTy(PtrTy, VF);
5285
5286 return TTI.getAddressComputationCost(PtrTy, nullptr, nullptr, CostKind) +
5287 TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr,
5288 Legal->isMaskRequired(I), Alignment,
5289 CostKind, I);
5290}
5291
5293LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
5294 ElementCount VF) {
5295 const auto *Group = getInterleavedAccessGroup(I);
5296 assert(Group && "Fail to get an interleaved access group.");
5297
5298 Instruction *InsertPos = Group->getInsertPos();
5299 Type *ValTy = getLoadStoreType(InsertPos);
5300 auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
5301 unsigned AS = getLoadStoreAddressSpace(InsertPos);
5302
5303 unsigned InterleaveFactor = Group->getFactor();
5304 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
5305
5306 // Holds the indices of existing members in the interleaved group.
5307 SmallVector<unsigned, 4> Indices;
5308 for (unsigned IF = 0; IF < InterleaveFactor; IF++)
5309 if (Group->getMember(IF))
5310 Indices.push_back(IF);
5311
5312 // Calculate the cost of the whole interleaved group.
5313 bool UseMaskForGaps =
5314 (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
5315 (isa<StoreInst>(I) && !Group->isFull());
5317 InsertPos->getOpcode(), WideVecTy, Group->getFactor(), Indices,
5318 Group->getAlign(), AS, CostKind, Legal->isMaskRequired(I),
5319 UseMaskForGaps);
5320
5321 if (Group->isReverse()) {
5322 // TODO: Add support for reversed masked interleaved access.
5323 assert(!Legal->isMaskRequired(I) &&
5324 "Reverse masked interleaved access not supported.");
5325 Cost += Group->getNumMembers() *
5327 VectorTy, {}, CostKind, 0);
5328 }
5329 return Cost;
5330}
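// For example, a factor-2 group with both members present at VF = 4 is
// costed as one wide access on an 8-element vector with member indices
// {0, 1}; a group with missing members or a required scalar epilogue may pay
// for masking the gaps (UseMaskForGaps), and a reversed group additionally
// pays one reverse shuffle per member.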
5331
5332std::optional<InstructionCost>
5334 ElementCount VF,
5335 Type *Ty) const {
5336 using namespace llvm::PatternMatch;
5337 // Early exit for no inloop reductions
5338 if (InLoopReductions.empty() || VF.isScalar() || !isa<VectorType>(Ty))
5339 return std::nullopt;
5340 auto *VectorTy = cast<VectorType>(Ty);
5341
5342 // We are looking for a pattern of, and finding the minimal acceptable cost for:
5343 // reduce(mul(ext(A), ext(B))) or
5344 // reduce(mul(A, B)) or
5345 // reduce(ext(A)) or
5346 // reduce(A).
5347 // The basic idea is that we walk down the tree to do that, finding the root
5348 // reduction instruction in InLoopReductionImmediateChains. From there we find
5349 // the pattern of mul/ext and test the cost of the entire pattern vs the cost
5350 // of the components. If the reduction cost is lower then we return it for the
5351 // reduction instruction and 0 for the other instructions in the pattern. If
5352 // it is not, we return an invalid cost specifying that the original cost
5353 // method should be used.
5354 Instruction *RetI = I;
5355 if (match(RetI, m_ZExtOrSExt(m_Value()))) {
5356 if (!RetI->hasOneUser())
5357 return std::nullopt;
5358 RetI = RetI->user_back();
5359 }
5360
5361 if (match(RetI, m_OneUse(m_Mul(m_Value(), m_Value()))) &&
5362 RetI->user_back()->getOpcode() == Instruction::Add) {
5363 RetI = RetI->user_back();
5364 }
5365
5366 // Test if the found instruction is a reduction, and if not return an invalid
5367 // cost specifying the parent to use the original cost modelling.
5368 Instruction *LastChain = InLoopReductionImmediateChains.lookup(RetI);
5369 if (!LastChain)
5370 return std::nullopt;
5371
5372 // Find the reduction this chain is a part of and calculate the basic cost of
5373 // the reduction on its own.
5374 Instruction *ReductionPhi = LastChain;
5375 while (!isa<PHINode>(ReductionPhi))
5376 ReductionPhi = InLoopReductionImmediateChains.at(ReductionPhi);
5377
5378 const RecurrenceDescriptor &RdxDesc =
5379 Legal->getRecurrenceDescriptor(cast<PHINode>(ReductionPhi));
5380
5381 InstructionCost BaseCost;
5382 RecurKind RK = RdxDesc.getRecurrenceKind();
5385 BaseCost = TTI.getMinMaxReductionCost(MinMaxID, VectorTy,
5386 RdxDesc.getFastMathFlags(), CostKind);
5387 } else {
5388 BaseCost = TTI.getArithmeticReductionCost(
5389 RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind);
5390 }
5391
5392 // For a call to the llvm.fmuladd intrinsic we need to add the cost of a
5393 // normal fmul instruction to the cost of the fadd reduction.
5394 if (RK == RecurKind::FMulAdd)
5395 BaseCost +=
5396 TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind);
5397
5398 // If we're using ordered reductions then we can just return the base cost
5399 // here, since getArithmeticReductionCost calculates the full ordered
5400 // reduction cost when FP reassociation is not allowed.
5401 if (useOrderedReductions(RdxDesc))
5402 return BaseCost;
5403
5404 // Get the operand that was not the reduction chain and match it to one of the
5405 // patterns, returning the better cost if it is found.
5406 Instruction *RedOp = RetI->getOperand(1) == LastChain
5409
5410 VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy);
5411
5412 Instruction *Op0, *Op1;
5413 if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
5414 match(RedOp,
5416 match(Op0, m_ZExtOrSExt(m_Value())) &&
5417 Op0->getOpcode() == Op1->getOpcode() &&
5418 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
5419 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) &&
5420 (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {
5421
5422 // Matched reduce.add(ext(mul(ext(A), ext(B)))
5423 // Note that the extend opcodes need to all match, or if A==B they will have
5424 // been converted to zext(mul(sext(A), sext(A))) as it is known positive,
5425 // which is equally fine.
5426 bool IsUnsigned = isa<ZExtInst>(Op0);
5427 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy);
5428 auto *MulType = VectorType::get(Op0->getType(), VectorTy);
5429
5430 InstructionCost ExtCost =
5431 TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType,
5433 InstructionCost MulCost =
5434 TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind);
5435 InstructionCost Ext2Cost =
5436 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType,
5438
5439 InstructionCost RedCost = TTI.getMulAccReductionCost(
5440 IsUnsigned, RdxDesc.getOpcode(), RdxDesc.getRecurrenceType(), ExtType,
5441 CostKind);
5442
5443 if (RedCost.isValid() &&
5444 RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost)
5445 return I == RetI ? RedCost : 0;
5446 } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) &&
5447 !TheLoop->isLoopInvariant(RedOp)) {
5448 // Matched reduce(ext(A))
5449 bool IsUnsigned = isa<ZExtInst>(RedOp);
5450 auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy);
5451 InstructionCost RedCost = TTI.getExtendedReductionCost(
5452 RdxDesc.getOpcode(), IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
5453 RdxDesc.getFastMathFlags(), CostKind);
5454
5455 InstructionCost ExtCost =
5456 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType,
5458 if (RedCost.isValid() && RedCost < BaseCost + ExtCost)
5459 return I == RetI ? RedCost : 0;
5460 } else if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
5461 match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) {
5462 if (match(Op0, m_ZExtOrSExt(m_Value())) &&
5463 Op0->getOpcode() == Op1->getOpcode() &&
5464 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) {
5465 bool IsUnsigned = isa<ZExtInst>(Op0);
5466 Type *Op0Ty = Op0->getOperand(0)->getType();
5467 Type *Op1Ty = Op1->getOperand(0)->getType();
5468 Type *LargestOpTy =
5469 Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty
5470 : Op0Ty;
5471 auto *ExtType = VectorType::get(LargestOpTy, VectorTy);
5472
5473 // Matched reduce.add(mul(ext(A), ext(B))), where the two ext may be of
5474 // different sizes. We take the largest type as the ext to reduce, and add
5475 // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))).
5476 InstructionCost ExtCost0 = TTI.getCastInstrCost(
5477 Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy),
5479 InstructionCost ExtCost1 = TTI.getCastInstrCost(
5480 Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy),
5482 InstructionCost MulCost =
5483 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
5484
5485 InstructionCost RedCost = TTI.getMulAccReductionCost(
5486 IsUnsigned, RdxDesc.getOpcode(), RdxDesc.getRecurrenceType(), ExtType,
5487 CostKind);
5488 InstructionCost ExtraExtCost = 0;
5489 if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) {
5490 Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1;
5491 ExtraExtCost = TTI.getCastInstrCost(
5492 ExtraExtOp->getOpcode(), ExtType,
5493 VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy),
5495 }
5496
5497 if (RedCost.isValid() &&
5498 (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost))
5499 return I == RetI ? RedCost : 0;
5500 } else if (!match(I, m_ZExtOrSExt(m_Value()))) {
5501 // Matched reduce.add(mul())
5502 InstructionCost MulCost =
5503 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
5504
5505 InstructionCost RedCost = TTI.getMulAccReductionCost(
5506 true, RdxDesc.getOpcode(), RdxDesc.getRecurrenceType(), VectorTy,
5507 CostKind);
5508
5509 if (RedCost.isValid() && RedCost < MulCost + BaseCost)
5510 return I == RetI ? RedCost : 0;
5511 }
5512 }
5513
5514 return I == RetI ? std::optional<InstructionCost>(BaseCost) : std::nullopt;
5515}
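// As an illustration of the pattern matched above, a scalar loop body
// containing (simplified, hypothetical value names)
//   %a.ext = sext i8 %a to i32
//   %b.ext = sext i8 %b to i32
//   %mul   = mul nsw i32 %a.ext, %b.ext
//   %sum   = add i32 %sum.phi, %mul
// where %sum.phi is an in-loop reduction phi, may be costed as a single
// multiply-accumulate reduction when TTI reports a cheaper cost for it; the
// add then carries the whole reduction cost and the sext/mul instructions
// report a cost of 0.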
5516
5518LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
5519 ElementCount VF) {
5520 // Calculate scalar cost only. Vectorization cost should be ready at this
5521 // moment.
5522 if (VF.isScalar()) {
5523 Type *ValTy = getLoadStoreType(I);
5525 const Align Alignment = getLoadStoreAlignment(I);
5526 unsigned AS = getLoadStoreAddressSpace(I);
5527
5528 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
5529 return TTI.getAddressComputationCost(PtrTy, nullptr, nullptr, CostKind) +
5530 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, CostKind,
5531 OpInfo, I);
5532 }
5533 return getWideningCost(I, VF);
5534}
5535
5537LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
5538 ElementCount VF) const {
5539
5540 // There is no mechanism yet to create a scalable scalarization loop,
5541 // so this is currently Invalid.
5542 if (VF.isScalable())
5543 return InstructionCost::getInvalid();
5544
5545 if (VF.isScalar())
5546 return 0;
5547
5549 Type *RetTy = toVectorizedTy(I->getType(), VF);
5550 if (!RetTy->isVoidTy() &&
5552
5553 for (Type *VectorTy : getContainedTypes(RetTy)) {
5556 /*Insert=*/true,
5557 /*Extract=*/false, CostKind);
5558 }
5559 }
5560
5561 // Some targets keep addresses scalar.
5563 return Cost;
5564
5565 // Some targets support efficient element stores.
5567 return Cost;
5568
5569 // Collect operands to consider.
5570 CallInst *CI = dyn_cast<CallInst>(I);
5571 Instruction::op_range Ops = CI ? CI->args() : I->operands();
5572
5573 // Skip operands that do not require extraction/scalarization and do not incur
5574 // any overhead.
5576 for (auto *V : filterExtractingOperands(Ops, VF))
5577 Tys.push_back(maybeVectorizeType(V->getType(), VF));
5579}
5580
5582 if (VF.isScalar())
5583 return;
5584 NumPredStores = 0;
5585 for (BasicBlock *BB : TheLoop->blocks()) {
5586 // For each instruction in the old loop.
5587 for (Instruction &I : *BB) {
5589 if (!Ptr)
5590 continue;
5591
5592 // TODO: We should generate better code and update the cost model for
5593 // predicated uniform stores. Today they are treated as any other
5594 // predicated store (see added test cases in
5595 // invariant-store-vectorization.ll).
5597 NumPredStores++;
5598
5599 if (Legal->isUniformMemOp(I, VF)) {
5600 auto IsLegalToScalarize = [&]() {
5601 if (!VF.isScalable())
5602 // Scalarization of fixed length vectors "just works".
5603 return true;
5604
5605 // We have dedicated lowering for unpredicated uniform loads and
5606 // stores. Note that even with tail folding we know that at least
5607 // one lane is active (i.e. generalized predication is not possible
5608 // here), and the logic below depends on this fact.
5609 if (!foldTailByMasking())
5610 return true;
5611
5612 // For scalable vectors, a uniform memop load is always
5613 // uniform-by-parts and we know how to scalarize that.
5614 if (isa<LoadInst>(I))
5615 return true;
5616
5617 // A uniform store isn't necessarily uniform-by-parts,
5618 // so we can't assume scalarization.
5619 auto &SI = cast<StoreInst>(I);
5620 return TheLoop->isLoopInvariant(SI.getValueOperand());
5621 };
5622
5623 const InstructionCost GatherScatterCost =
5625 getGatherScatterCost(&I, VF) : InstructionCost::getInvalid();
5626
5627 // Load: Scalar load + broadcast
5628 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
5629 // FIXME: This cost is a significant under-estimate for tail folded
5630 // memory ops.
5631 const InstructionCost ScalarizationCost =
5632 IsLegalToScalarize() ? getUniformMemOpCost(&I, VF)
5634
5635 // Choose the better solution for the current VF. Note that Invalid
5636 // costs compare as maximally large. If both are invalid, we get a
5637 // scalable invalid cost, which signals a failure and a vectorization abort.
5638 if (GatherScatterCost < ScalarizationCost)
5639 setWideningDecision(&I, VF, CM_GatherScatter, GatherScatterCost);
5640 else
5641 setWideningDecision(&I, VF, CM_Scalarize, ScalarizationCost);
5642 continue;
5643 }
5644
5645 // We assume that widening is the best solution when possible.
5646 if (memoryInstructionCanBeWidened(&I, VF)) {
5647 InstructionCost Cost = getConsecutiveMemOpCost(&I, VF);
5648 int ConsecutiveStride = Legal->isConsecutivePtr(
5650 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
5651 "Expected consecutive stride.");
5652 InstWidening Decision =
5653 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
5654 setWideningDecision(&I, VF, Decision, Cost);
5655 continue;
5656 }
5657
5658 // Choose between Interleaving, Gather/Scatter or Scalarization.
5660 unsigned NumAccesses = 1;
5661 if (isAccessInterleaved(&I)) {
5662 const auto *Group = getInterleavedAccessGroup(&I);
5663 assert(Group && "Fail to get an interleaved access group.");
5664
5665 // Make one decision for the whole group.
5666 if (getWideningDecision(&I, VF) != CM_Unknown)
5667 continue;
5668
5669 NumAccesses = Group->getNumMembers();
5671 InterleaveCost = getInterleaveGroupCost(&I, VF);
5672 }
5673
5674 InstructionCost GatherScatterCost =
5676 ? getGatherScatterCost(&I, VF) * NumAccesses
5678
5679 InstructionCost ScalarizationCost =
5680 getMemInstScalarizationCost(&I, VF) * NumAccesses;
5681
5682 // Choose the better solution for the current VF, write down this
5683 // decision, and use it during vectorization.
5684 InstructionCost Cost;
5685 InstWidening Decision;
5686 if (InterleaveCost <= GatherScatterCost &&
5687 InterleaveCost < ScalarizationCost) {
5688 Decision = CM_Interleave;
5689 Cost = InterleaveCost;
5690 } else if (GatherScatterCost < ScalarizationCost) {
5691 Decision = CM_GatherScatter;
5692 Cost = GatherScatterCost;
5693 } else {
5694 Decision = CM_Scalarize;
5695 Cost = ScalarizationCost;
5696 }
5697 // If the instructions belongs to an interleave group, the whole group
5698 // receives the same decision. The whole group receives the cost, but
5699 // the cost will actually be assigned to one instruction.
5700 if (const auto *Group = getInterleavedAccessGroup(&I)) {
5701 if (Decision == CM_Scalarize) {
5702 for (unsigned Idx = 0; Idx < Group->getFactor(); ++Idx) {
5703 if (auto *I = Group->getMember(Idx)) {
5704 setWideningDecision(I, VF, Decision,
5705 getMemInstScalarizationCost(I, VF));
5706 }
5707 }
5708 } else {
5709 setWideningDecision(Group, VF, Decision, Cost);
5710 }
5711 } else
5712 setWideningDecision(&I, VF, Decision, Cost);
5713 }
5714 }
5715
5716 // Make sure that any load of address and any other address computation
5717 // remains scalar unless there is gather/scatter support. This avoids
5718 // inevitable extracts into address registers, and also has the benefit of
5719 // activating LSR more, since that pass can't optimize vectorized
5720 // addresses.
5721 if (TTI.prefersVectorizedAddressing())
5722 return;
5723
5724 // Start with all scalar pointer uses.
5726 for (BasicBlock *BB : TheLoop->blocks())
5727 for (Instruction &I : *BB) {
5728 Instruction *PtrDef =
5730 if (PtrDef && TheLoop->contains(PtrDef) &&
5732 AddrDefs.insert(PtrDef);
5733 }
5734
5735 // Add all instructions used to generate the addresses.
5737 append_range(Worklist, AddrDefs);
5738 while (!Worklist.empty()) {
5739 Instruction *I = Worklist.pop_back_val();
5740 for (auto &Op : I->operands())
5741 if (auto *InstOp = dyn_cast<Instruction>(Op))
5742 if (TheLoop->contains(InstOp) && !isa<PHINode>(InstOp) &&
5743 AddrDefs.insert(InstOp).second)
5744 Worklist.push_back(InstOp);
5745 }
5746
5747 auto UpdateMemOpUserCost = [this, VF](LoadInst *LI) {
5748 // If there are direct memory op users of the newly scalarized load,
5749 // their cost may have changed because there's no scalarization
5750 // overhead for the operand. Update it.
5751 for (User *U : LI->users()) {
5753 continue;
5755 continue;
5758 getMemInstScalarizationCost(cast<Instruction>(U), VF));
5759 }
5760 };
5761 for (auto *I : AddrDefs) {
5762 if (isa<LoadInst>(I)) {
5763 // Setting the desired widening decision should ideally be handled by
5764 // the cost functions, but since this involves the task of finding out
5765 // whether the loaded register is involved in an address computation, it is
5766 // instead changed here when we know this is the case.
5767 InstWidening Decision = getWideningDecision(I, VF);
5768 if (Decision == CM_Widen || Decision == CM_Widen_Reverse ||
5769 (!isPredicatedInst(I) && !Legal->isUniformMemOp(*I, VF) &&
5770 Decision == CM_Scalarize)) {
5771 // Scalarize a widened load of address or update the cost of a scalar
5772 // load of an address.
5774 I, VF, CM_Scalarize,
5775 (VF.getKnownMinValue() *
5776 getMemoryInstructionCost(I, ElementCount::getFixed(1))));
5777 UpdateMemOpUserCost(cast<LoadInst>(I));
5778 } else if (const auto *Group = getInterleavedAccessGroup(I)) {
5779 // Scalarize all members of this interleaved group when any member
5780 // is used as an address. The address-used load skips scalarization
5781 // overhead, other members include it.
5782 for (unsigned Idx = 0; Idx < Group->getFactor(); ++Idx) {
5783 if (Instruction *Member = Group->getMember(Idx)) {
5785 AddrDefs.contains(Member)
5786 ? (VF.getKnownMinValue() *
5787 getMemoryInstructionCost(Member,
5789 : getMemInstScalarizationCost(Member, VF);
5791 UpdateMemOpUserCost(cast<LoadInst>(Member));
5792 }
5793 }
5794 }
5795 } else {
5796 // Cannot scalarize fixed-order recurrence phis at the moment.
5797 if (isa<PHINode>(I) && Legal->isFixedOrderRecurrence(cast<PHINode>(I)))
5798 continue;
5799
5800 // Make sure I gets scalarized and a cost estimate without
5801 // scalarization overhead.
5802 ForcedScalars[VF].insert(I);
5803 }
5804 }
5805}
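// For example, with hypothetical costs for one interleave-group member:
// InterleaveCost = 10, GatherScatterCost = 14, and ScalarizationCost = 22
// lead to a CM_Interleave decision at cost 10 for the whole group; had
// scalarization been cheapest, each member of the group would instead be
// assigned its own scalarization cost.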
5806
5808 assert(!VF.isScalar() &&
5809 "Trying to set a vectorization decision for a scalar VF");
5810
5811 auto ForcedScalar = ForcedScalars.find(VF);
5812 for (BasicBlock *BB : TheLoop->blocks()) {
5813 // For each instruction in the old loop.
5814 for (Instruction &I : *BB) {
5816
5817 if (!CI)
5818 continue;
5819
5823 Function *ScalarFunc = CI->getCalledFunction();
5824 Type *ScalarRetTy = CI->getType();
5825 SmallVector<Type *, 4> Tys, ScalarTys;
5826 for (auto &ArgOp : CI->args())
5827 ScalarTys.push_back(ArgOp->getType());
5828
5829 // Estimate cost of scalarized vector call. The source operands are
5830 // assumed to be vectors, so we need to extract individual elements from
5831 // there, execute VF scalar calls, and then gather the result into the
5832 // vector return value.
5833 if (VF.isFixed()) {
5834 InstructionCost ScalarCallCost =
5835 TTI.getCallInstrCost(ScalarFunc, ScalarRetTy, ScalarTys, CostKind);
5836
5837 // Compute costs of unpacking argument values for the scalar calls and
5838 // packing the return values to a vector.
5839 InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF);
5840 ScalarCost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
5841 } else {
5842 // There is no point attempting to calculate the scalar cost for a
5843 // scalable VF as we know it will be Invalid.
5845 "Unexpected valid cost for scalarizing scalable vectors");
5846 ScalarCost = InstructionCost::getInvalid();
5847 }
5848
5849 // Honor ForcedScalars and UniformAfterVectorization decisions.
5850 // TODO: For calls, it might still be more profitable to widen. Use
5851 // VPlan-based cost model to compare different options.
5852 if (VF.isVector() && ((ForcedScalar != ForcedScalars.end() &&
5853 ForcedScalar->second.contains(CI)) ||
5854 isUniformAfterVectorization(CI, VF))) {
5855 setCallWideningDecision(CI, VF, CM_Scalarize, nullptr,
5856 Intrinsic::not_intrinsic, std::nullopt,
5857 ScalarCost);
5858 continue;
5859 }
5860
5861 bool MaskRequired = Legal->isMaskRequired(CI);
5862 // Compute corresponding vector type for return value and arguments.
5863 Type *RetTy = toVectorizedTy(ScalarRetTy, VF);
5864 for (Type *ScalarTy : ScalarTys)
5865 Tys.push_back(toVectorizedTy(ScalarTy, VF));
5866
5867 // An in-loop reduction using an fmuladd intrinsic is a special case;
5868 // we don't want the normal cost for that intrinsic.
5870 if (auto RedCost = getReductionPatternCost(CI, VF, RetTy)) {
5873 std::nullopt, *RedCost);
5874 continue;
5875 }
5876
5877 // Find the cost of vectorizing the call, if we can find a suitable
5878 // vector variant of the function.
5879 VFInfo FuncInfo;
5880 Function *VecFunc = nullptr;
5881 // Search through any available variants for one we can use at this VF.
5882 for (VFInfo &Info : VFDatabase::getMappings(*CI)) {
5883 // Must match requested VF.
5884 if (Info.Shape.VF != VF)
5885 continue;
5886
5887 // Must take a mask argument if one is required
5888 if (MaskRequired && !Info.isMasked())
5889 continue;
5890
5891 // Check that all parameter kinds are supported
5892 bool ParamsOk = true;
5893 for (VFParameter Param : Info.Shape.Parameters) {
5894 switch (Param.ParamKind) {
5896 break;
5898 Value *ScalarParam = CI->getArgOperand(Param.ParamPos);
5899 // Make sure the scalar parameter in the loop is invariant.
5900 if (!PSE.getSE()->isLoopInvariant(PSE.getSCEV(ScalarParam),
5901 TheLoop))
5902 ParamsOk = false;
5903 break;
5904 }
5906 Value *ScalarParam = CI->getArgOperand(Param.ParamPos);
5907 // Find the stride for the scalar parameter in this loop and see if
5908 // it matches the stride for the variant.
5909 // TODO: do we need to figure out the cost of an extract to get the
5910 // first lane? Or do we hope that it will be folded away?
5911 ScalarEvolution *SE = PSE.getSE();
5912 if (!match(SE->getSCEV(ScalarParam),
5914 m_SCEV(), m_scev_SpecificSInt(Param.LinearStepOrPos),
5916 ParamsOk = false;
5917 break;
5918 }
5920 break;
5921 default:
5922 ParamsOk = false;
5923 break;
5924 }
5925 }
5926
5927 if (!ParamsOk)
5928 continue;
5929
5930 // Found a suitable candidate, stop here.
5931 VecFunc = CI->getModule()->getFunction(Info.VectorName);
5932 FuncInfo = Info;
5933 break;
5934 }
5935
5936 if (TLI && VecFunc && !CI->isNoBuiltin())
5937 VectorCost = TTI.getCallInstrCost(nullptr, RetTy, Tys, CostKind);
5938
5939 // Find the cost of an intrinsic; some targets may have instructions that
5940 // perform the operation without needing an actual call.
5942 if (IID != Intrinsic::not_intrinsic)
5944
5945 InstructionCost Cost = ScalarCost;
5946 InstWidening Decision = CM_Scalarize;
5947
5948 if (VectorCost <= Cost) {
5949 Cost = VectorCost;
5950 Decision = CM_VectorCall;
5951 }
5952
5953 if (IntrinsicCost <= Cost) {
5955 Decision = CM_IntrinsicCall;
5956 }
5957
5958 setCallWideningDecision(CI, VF, Decision, VecFunc, IID,
5960 }
5961 }
5962}
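// For example, with hypothetical costs at VF = 4: a scalarized call costing
// 20 (4 scalar calls plus packing/unpacking), a matching vector variant
// costing 10, and an intrinsic lowering costing 6 result in a
// CM_IntrinsicCall decision at cost 6. Because the comparisons above use <=,
// ties favour the intrinsic over the vector call, and both over
// scalarization.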
5963
5965 if (!Legal->isInvariant(Op))
5966 return false;
5967 // Consider Op invariant, if it or its operands aren't predicated
5968 // instruction in the loop. In that case, it is not trivially hoistable.
5969 auto *OpI = dyn_cast<Instruction>(Op);
5970 return !OpI || !TheLoop->contains(OpI) ||
5971 (!isPredicatedInst(OpI) &&
5972 (!isa<PHINode>(OpI) || OpI->getParent() != TheLoop->getHeader()) &&
5973 all_of(OpI->operands(),
5974 [this](Value *Op) { return shouldConsiderInvariant(Op); }));
5975}
5976
5979 ElementCount VF) {
5980 // If we know that this instruction will remain uniform, check the cost of
5981 // the scalar version.
5983 VF = ElementCount::getFixed(1);
5984
5985 if (VF.isVector() && isProfitableToScalarize(I, VF))
5986 return InstsToScalarize[VF][I];
5987
5988 // Forced scalars do not have any scalarization overhead.
5989 auto ForcedScalar = ForcedScalars.find(VF);
5990 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
5991 auto InstSet = ForcedScalar->second;
5992 if (InstSet.count(I))
5994 VF.getKnownMinValue();
5995 }
5996
5997 Type *RetTy = I->getType();
5999 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
6000 auto *SE = PSE.getSE();
6001
6002 Type *VectorTy;
6003 if (isScalarAfterVectorization(I, VF)) {
6004 [[maybe_unused]] auto HasSingleCopyAfterVectorization =
6005 [this](Instruction *I, ElementCount VF) -> bool {
6006 if (VF.isScalar())
6007 return true;
6008
6009 auto Scalarized = InstsToScalarize.find(VF);
6010 assert(Scalarized != InstsToScalarize.end() &&
6011 "VF not yet analyzed for scalarization profitability");
6012 return !Scalarized->second.count(I) &&
6013 llvm::all_of(I->users(), [&](User *U) {
6014 auto *UI = cast<Instruction>(U);
6015 return !Scalarized->second.count(UI);
6016 });
6017 };
6018
6019 // With the exception of GEPs and PHIs, after scalarization there should
6020 // only be one copy of the instruction generated in the loop. This is
6021 // because the VF is either 1, or any instructions that need scalarizing
6022 // have already been dealt with by the time we get here. As a result,
6023 // we don't have to multiply the instruction cost by VF.
6024 assert(I->getOpcode() == Instruction::GetElementPtr ||
6025 I->getOpcode() == Instruction::PHI ||
6026 (I->getOpcode() == Instruction::BitCast &&
6027 I->getType()->isPointerTy()) ||
6028 HasSingleCopyAfterVectorization(I, VF));
6029 VectorTy = RetTy;
6030 } else
6031 VectorTy = toVectorizedTy(RetTy, VF);
6032
6033 if (VF.isVector() && VectorTy->isVectorTy() &&
6034 !TTI.getNumberOfParts(VectorTy))
6036
6037 // TODO: We need to estimate the cost of intrinsic calls.
6038 switch (I->getOpcode()) {
6039 case Instruction::GetElementPtr:
6040 // We mark this instruction as zero-cost because the cost of GEPs in
6041 // vectorized code depends on whether the corresponding memory instruction
6042 // is scalarized or not. Therefore, we handle GEPs with the memory
6043 // instruction cost.
6044 return 0;
6045 case Instruction::Br: {
6046 // In cases of scalarized and predicated instructions, there will be VF
6047 // predicated blocks in the vectorized loop. Each branch around these
6048 // blocks requires also an extract of its vector compare i1 element.
6049 // Note that the conditional branch from the loop latch will be replaced by
6050 // a single branch controlling the loop, so there is no extra overhead from
6051 // scalarization.
6052 bool ScalarPredicatedBB = false;
6054 if (VF.isVector() && BI->isConditional() &&
6055 (PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(0)) ||
6056 PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(1))) &&
6057 BI->getParent() != TheLoop->getLoopLatch())
6058 ScalarPredicatedBB = true;
6059
6060 if (ScalarPredicatedBB) {
6061 // Not possible to scalarize scalable vector with predicated instructions.
6062 if (VF.isScalable())
6064 // Return cost for branches around scalarized and predicated blocks.
6065 auto *VecI1Ty =
6067 return (
6068 TTI.getScalarizationOverhead(
6069 VecI1Ty, APInt::getAllOnes(VF.getFixedValue()),
6070 /*Insert*/ false, /*Extract*/ true, CostKind) +
6071 (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue()));
6072 }
6073
6074 if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
6075 // The back-edge branch will remain, as will all scalar branches.
6076 return TTI.getCFInstrCost(Instruction::Br, CostKind);
6077
6078 // This branch will be eliminated by if-conversion.
6079 return 0;
6080 // Note: We currently assume zero cost for an unconditional branch inside
6081 // a predicated block since it will become a fall-through, although we
6082 // may decide in the future to call TTI for all branches.
6083 }
6084 case Instruction::Switch: {
6085 if (VF.isScalar())
6086 return TTI.getCFInstrCost(Instruction::Switch, CostKind);
6087 auto *Switch = cast<SwitchInst>(I);
6088 return Switch->getNumCases() *
6089 TTI.getCmpSelInstrCost(
6090 Instruction::ICmp,
6091 toVectorTy(Switch->getCondition()->getType(), VF),
6092 toVectorTy(Type::getInt1Ty(I->getContext()), VF),
6094 }
6095 case Instruction::PHI: {
6096 auto *Phi = cast<PHINode>(I);
6097
6098 // First-order recurrences are replaced by vector shuffles inside the loop.
6099 if (VF.isVector() && Legal->isFixedOrderRecurrence(Phi)) {
6101 std::iota(Mask.begin(), Mask.end(), VF.getKnownMinValue() - 1);
6102 return TTI.getShuffleCost(TargetTransformInfo::SK_Splice,
6103 cast<VectorType>(VectorTy),
6104 cast<VectorType>(VectorTy), Mask, CostKind,
6105 VF.getKnownMinValue() - 1);
6106 }
6107
6108 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
6109 // converted into select instructions. We require N - 1 selects per phi
6110 // node, where N is the number of incoming values.
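// E.g. (illustrative IR) a phi merging three incoming values,
//   %p = phi i32 [ %a, %bb1 ], [ %b, %bb2 ], [ %c, %bb3 ]
// is modeled as 2 wide selects on an i1 vector condition after
// if-conversion.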
6111 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) {
6112 Type *ResultTy = Phi->getType();
6113
6114 // All instructions in an Any-of reduction chain are narrowed to bool.
6115 // Check if that is the case for this phi node.
6116 auto *HeaderUser = cast_if_present<PHINode>(
6117 find_singleton<User>(Phi->users(), [this](User *U, bool) -> User * {
6118 auto *Phi = dyn_cast<PHINode>(U);
6119 if (Phi && Phi->getParent() == TheLoop->getHeader())
6120 return Phi;
6121 return nullptr;
6122 }));
6123 if (HeaderUser) {
6124 auto &ReductionVars = Legal->getReductionVars();
6125 auto Iter = ReductionVars.find(HeaderUser);
6126 if (Iter != ReductionVars.end() &&
6128 Iter->second.getRecurrenceKind()))
6129 ResultTy = Type::getInt1Ty(Phi->getContext());
6130 }
6131 return (Phi->getNumIncomingValues() - 1) *
6132 TTI.getCmpSelInstrCost(
6133 Instruction::Select, toVectorTy(ResultTy, VF),
6134 toVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
6136 }
6137
6138 // When tail folding with EVL, if the phi is part of an out of loop
6139 // reduction then it will be transformed into a wide vp_merge.
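// Roughly, the costed pattern is an intrinsic call such as
//   llvm.vp.merge.nxv4i32(%mask, %new.rdx, %phi, i32 %evl)
// where the concrete element type and VF are only an example.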
6140 if (VF.isVector() && foldTailWithEVL() &&
6141 Legal->getReductionVars().contains(Phi) && !isInLoopReduction(Phi)) {
6143 Intrinsic::vp_merge, toVectorTy(Phi->getType(), VF),
6144 {toVectorTy(Type::getInt1Ty(Phi->getContext()), VF)});
6145 return TTI.getIntrinsicInstrCost(ICA, CostKind);
6146 }
6147
6148 return TTI.getCFInstrCost(Instruction::PHI, CostKind);
6149 }
6150 case Instruction::UDiv:
6151 case Instruction::SDiv:
6152 case Instruction::URem:
6153 case Instruction::SRem:
6154 if (VF.isVector() && isPredicatedInst(I)) {
6155 const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
6156 return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost) ?
6157 ScalarCost : SafeDivisorCost;
6158 }
6159 // We've proven all lanes safe to speculate, fall through.
6160 [[fallthrough]];
6161 case Instruction::Add:
6162 case Instruction::Sub: {
6163 auto Info = Legal->getHistogramInfo(I);
6164 if (Info && VF.isVector()) {
6165 const HistogramInfo *HGram = Info.value();
6166 // Assume that a non-constant update value (or a constant != 1) requires
6167 // a multiply, and add that into the cost.
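// Illustrative source-level shape of the loops this models (names are
// examples only):
//   for (i = 0; i < n; i++)
//     buckets[idx[i]] += w[i];   // an update value w[i] != 1 adds the mul
// The total below is histogram-intrinsic + optional mul + the add/sub.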
6169 ConstantInt *RHS = dyn_cast<ConstantInt>(I->getOperand(1));
6170 if (!RHS || RHS->getZExtValue() != 1)
6171 MulCost =
6172 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6173
6174 // Find the cost of the histogram operation itself.
6175 Type *PtrTy = VectorType::get(HGram->Load->getPointerOperandType(), VF);
6176 Type *ScalarTy = I->getType();
6177 Type *MaskTy = VectorType::get(Type::getInt1Ty(I->getContext()), VF);
6178 IntrinsicCostAttributes ICA(Intrinsic::experimental_vector_histogram_add,
6179 Type::getVoidTy(I->getContext()),
6180 {PtrTy, ScalarTy, MaskTy});
6181
6182 // Add the costs together with the add/sub operation.
6183 return TTI.getIntrinsicInstrCost(ICA, CostKind) + MulCost +
6184 TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, CostKind);
6185 }
6186 [[fallthrough]];
6187 }
6188 case Instruction::FAdd:
6189 case Instruction::FSub:
6190 case Instruction::Mul:
6191 case Instruction::FMul:
6192 case Instruction::FDiv:
6193 case Instruction::FRem:
6194 case Instruction::Shl:
6195 case Instruction::LShr:
6196 case Instruction::AShr:
6197 case Instruction::And:
6198 case Instruction::Or:
6199 case Instruction::Xor: {
6200 // If we're speculating on the stride being 1, the multiplication may
6201 // fold away. We can generalize this for all operations using the notion
6202 // of neutral elements. (TODO)
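// E.g. (sketch): if %stride is speculated to be 1 via a SCEV predicate,
//   %offset = mul i64 %i, %stride
// folds to %i in the vectorized loop, so the multiply is given zero cost.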
6203 if (I->getOpcode() == Instruction::Mul &&
6204 ((TheLoop->isLoopInvariant(I->getOperand(0)) &&
6205 PSE.getSCEV(I->getOperand(0))->isOne()) ||
6206 (TheLoop->isLoopInvariant(I->getOperand(1)) &&
6207 PSE.getSCEV(I->getOperand(1))->isOne())))
6208 return 0;
6209
6210 // Detect reduction patterns
6211 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy))
6212 return *RedCost;
6213
6214 // Certain instructions can be cheaper to vectorize if they have a constant
6215 // second vector operand. One example of this is shifts on x86.
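// For instance, a shift such as 'x >> 5' becomes a vector shift by a
// constant splat, which targets often cost lower than a shift by a
// variable vector amount. The SCEV check below also catches operands that
// are only known to be constant through predicated SCEV.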
6216 Value *Op2 = I->getOperand(1);
6217 if (!isa<Constant>(Op2) && TheLoop->isLoopInvariant(Op2) &&
6218 PSE.getSE()->isSCEVable(Op2->getType()) &&
6219 isa<SCEVConstant>(PSE.getSCEV(Op2))) {
6220 Op2 = cast<SCEVConstant>(PSE.getSCEV(Op2))->getValue();
6221 }
6222 auto Op2Info = TTI.getOperandInfo(Op2);
6223 if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
6226
6227 SmallVector<const Value *, 4> Operands(I->operand_values());
6228 return TTI.getArithmeticInstrCost(
6229 I->getOpcode(), VectorTy, CostKind,
6230 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
6231 Op2Info, Operands, I, TLI);
6232 }
6233 case Instruction::FNeg: {
6234 return TTI.getArithmeticInstrCost(
6235 I->getOpcode(), VectorTy, CostKind,
6236 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
6237 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
6238 I->getOperand(0), I);
6239 }
6240 case Instruction::Select: {
6242 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
6243 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
6244
6245 const Value *Op0, *Op1;
6246 using namespace llvm::PatternMatch;
6247 if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) ||
6248 match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) {
6249 // select x, y, false --> x & y
6250 // select x, true, y --> x | y
6251 const auto [Op1VK, Op1VP] = TTI::getOperandInfo(Op0);
6252 const auto [Op2VK, Op2VP] = TTI::getOperandInfo(Op1);
6253 assert(Op0->getType()->getScalarSizeInBits() == 1 &&
6254 Op1->getType()->getScalarSizeInBits() == 1);
6255
6256 return TTI.getArithmeticInstrCost(
6257 match(I, m_LogicalOr()) ? Instruction::Or : Instruction::And,
6258 VectorTy, CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, {Op0, Op1}, I);
6259 }
6260
6261 Type *CondTy = SI->getCondition()->getType();
6262 if (!ScalarCond)
6263 CondTy = VectorType::get(CondTy, VF);
6264
6266 if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition()))
6267 Pred = Cmp->getPredicate();
6268 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred,
6269 CostKind, {TTI::OK_AnyValue, TTI::OP_None},
6270 {TTI::OK_AnyValue, TTI::OP_None}, I);
6271 }
6272 case Instruction::ICmp:
6273 case Instruction::FCmp: {
6274 Type *ValTy = I->getOperand(0)->getType();
6275
6277 [[maybe_unused]] Instruction *Op0AsInstruction =
6278 dyn_cast<Instruction>(I->getOperand(0));
6279 assert((!canTruncateToMinimalBitwidth(Op0AsInstruction, VF) ||
6280 MinBWs[I] == MinBWs[Op0AsInstruction]) &&
6281 "if both the operand and the compare are marked for "
6282 "truncation, they must have the same bitwidth");
6283 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[I]);
6284 }
6285
6286 VectorTy = toVectorTy(ValTy, VF);
6287 return TTI.getCmpSelInstrCost(
6288 I->getOpcode(), VectorTy, CmpInst::makeCmpResultType(VectorTy),
6289 cast<CmpInst>(I)->getPredicate(), CostKind,
6290 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None}, I);
6291 }
6292 case Instruction::Store:
6293 case Instruction::Load: {
6294 ElementCount Width = VF;
6295 if (Width.isVector()) {
6296 InstWidening Decision = getWideningDecision(I, Width);
6297 assert(Decision != CM_Unknown &&
6298 "CM decision should be taken at this point");
6301 if (Decision == CM_Scalarize)
6302 Width = ElementCount::getFixed(1);
6303 }
6304 VectorTy = toVectorTy(getLoadStoreType(I), Width);
6305 return getMemoryInstructionCost(I, VF);
6306 }
6307 case Instruction::BitCast:
6308 if (I->getType()->isPointerTy())
6309 return 0;
6310 [[fallthrough]];
6311 case Instruction::ZExt:
6312 case Instruction::SExt:
6313 case Instruction::FPToUI:
6314 case Instruction::FPToSI:
6315 case Instruction::FPExt:
6316 case Instruction::PtrToInt:
6317 case Instruction::IntToPtr:
6318 case Instruction::SIToFP:
6319 case Instruction::UIToFP:
6320 case Instruction::Trunc:
6321 case Instruction::FPTrunc: {
6322 // Computes the CastContextHint from a Load/Store instruction.
6323 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
6325 "Expected a load or a store!");
6326
6327 if (VF.isScalar() || !TheLoop->contains(I))
6329
6330 switch (getWideningDecision(I, VF)) {
6342 llvm_unreachable("Instr did not go through cost modelling?");
6345 llvm_unreachable_internal("Instr has invalid widening decision");
6346 }
6347
6348 llvm_unreachable("Unhandled case!");
6349 };
6350
6351 unsigned Opcode = I->getOpcode();
6353 // For Trunc, the context is the only user, which must be a StoreInst.
6354 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
6355 if (I->hasOneUse())
6356 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin()))
6357 CCH = ComputeCCH(Store);
6358 }
6359 // For Z/Sext, the context is the operand, which must be a LoadInst.
6360 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
6361 Opcode == Instruction::FPExt) {
6362 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0)))
6363 CCH = ComputeCCH(Load);
6364 }
6365
6366 // We optimize the truncation of induction variables having constant
6367 // integer steps. The cost of these truncations is the same as the scalar
6368 // operation.
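// Sketch: for an i64 induction %iv with a constant step, a use such as
//   %t = trunc i64 %iv to i32
// is generated as a separate i32 induction rather than a widened trunc,
// so only the scalar trunc cost is charged here.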
6369 if (isOptimizableIVTruncate(I, VF)) {
6370 auto *Trunc = cast<TruncInst>(I);
6371 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
6372 Trunc->getSrcTy(), CCH, CostKind, Trunc);
6373 }
6374
6375 // Detect reduction patterns
6376 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy))
6377 return *RedCost;
6378
6379 Type *SrcScalarTy = I->getOperand(0)->getType();
6380 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
6381 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
6382 SrcScalarTy =
6383 IntegerType::get(SrcScalarTy->getContext(), MinBWs[Op0AsInstruction]);
6384 Type *SrcVecTy =
6385 VectorTy->isVectorTy() ? toVectorTy(SrcScalarTy, VF) : SrcScalarTy;
6386
6388 // If the result type is <= the source type, there will be no extend
6389 // after truncating the users to the minimal required bitwidth.
6390 if (VectorTy->getScalarSizeInBits() <= SrcVecTy->getScalarSizeInBits() &&
6391 (I->getOpcode() == Instruction::ZExt ||
6392 I->getOpcode() == Instruction::SExt))
6393 return 0;
6394 }
6395
6396 return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
6397 }
6398 case Instruction::Call:
6399 return getVectorCallCost(cast<CallInst>(I), VF);
6400 case Instruction::ExtractValue:
6401 return TTI.getInstructionCost(I, CostKind);
6402 case Instruction::Alloca:
6403 // We cannot easily widen alloca to a scalable alloca, as
6404 // the result would need to be a vector of pointers.
6405 if (VF.isScalable())
6407 [[fallthrough]];
6408 default:
6409 // This opcode is unknown. Assume that it is the same as 'mul'.
6410 return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6411 } // end of switch.
6412}
6413
6415 // Ignore ephemeral values.
6417
6418 SmallVector<Value *, 4> DeadInterleavePointerOps;
6420
6421 // If a scalar epilogue is required, users outside the loop won't use
6422 // live-outs from the vector loop but from the scalar epilogue. Ignore them if
6423 // that is the case.
6424 bool RequiresScalarEpilogue = requiresScalarEpilogue(true);
6425 auto IsLiveOutDead = [this, RequiresScalarEpilogue](User *U) {
6426 return RequiresScalarEpilogue &&
6427 !TheLoop->contains(cast<Instruction>(U)->getParent());
6428 };
6429
6431 DFS.perform(LI);
6432 for (BasicBlock *BB : reverse(make_range(DFS.beginRPO(), DFS.endRPO())))
6433 for (Instruction &I : reverse(*BB)) {
6434 if (VecValuesToIgnore.contains(&I) || ValuesToIgnore.contains(&I))
6435 continue;
6436
6437 // Add instructions that would be trivially dead and are only used by
6438 // values already ignored to DeadOps to seed the worklist.
6440 all_of(I.users(), [this, IsLiveOutDead](User *U) {
6441 return VecValuesToIgnore.contains(U) ||
6442 ValuesToIgnore.contains(U) || IsLiveOutDead(U);
6443 }))
6444 DeadOps.push_back(&I);
6445
6446 // For interleave groups, we only create a pointer for the start of the
6447 // interleave group. Queue up addresses of group members except the insert
6448 // position for further processing.
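// E.g. for a group loading A[i] and A[i + 1], only the insert position's
// address is materialized after vectorization; the other member's address
// computation (and anything feeding only it) may be treated as dead below.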
6449 if (isAccessInterleaved(&I)) {
6450 auto *Group = getInterleavedAccessGroup(&I);
6451 if (Group->getInsertPos() == &I)
6452 continue;
6453 Value *PointerOp = getLoadStorePointerOperand(&I);
6454 DeadInterleavePointerOps.push_back(PointerOp);
6455 }
6456
6457 // Queue branches for analysis. They are dead if their successors only
6458 // contain dead instructions.
6459 if (auto *Br = dyn_cast<BranchInst>(&I)) {
6460 if (Br->isConditional())
6461 DeadOps.push_back(&I);
6462 }
6463 }
6464
6465 // Mark ops feeding interleave group members as free, if they are only used
6466 // by other dead computations.
6467 for (unsigned I = 0; I != DeadInterleavePointerOps.size(); ++I) {
6468 auto *Op = dyn_cast<Instruction>(DeadInterleavePointerOps[I]);
6469 if (!Op || !TheLoop->contains(Op) || any_of(Op->users(), [this](User *U) {
6470 Instruction *UI = cast<Instruction>(U);
6471 return !VecValuesToIgnore.contains(U) &&
6472 (!isAccessInterleaved(UI) ||
6473 getInterleavedAccessGroup(UI)->getInsertPos() == UI);
6474 }))
6475 continue;
6476 VecValuesToIgnore.insert(Op);
6477 append_range(DeadInterleavePointerOps, Op->operands());
6478 }
6479
6480 // Mark ops that would be trivially dead and are only used by ignored
6481 // instructions as free.
6482 BasicBlock *Header = TheLoop->getHeader();
6483
6484 // Returns true if the block contains only dead instructions. Such blocks will
6485 // be removed by VPlan-to-VPlan transforms and won't be considered by the
6486 // VPlan-based cost model, so skip them in the legacy cost-model as well.
6487 auto IsEmptyBlock = [this](BasicBlock *BB) {
6488 return all_of(*BB, [this](Instruction &I) {
6489 return ValuesToIgnore.contains(&I) || VecValuesToIgnore.contains(&I) ||
6490 (isa<BranchInst>(&I) && !cast<BranchInst>(&I)->isConditional());
6491 });
6492 };
6493 for (unsigned I = 0; I != DeadOps.size(); ++I) {
6494 auto *Op = dyn_cast<Instruction>(DeadOps[I]);
6495
6496 // Check if the branch should be considered dead.
6497 if (auto *Br = dyn_cast_or_null<BranchInst>(Op)) {
6498 BasicBlock *ThenBB = Br->getSuccessor(0);
6499 BasicBlock *ElseBB = Br->getSuccessor(1);
6500 // Don't consider branches leaving the loop for simplification.
6501 if (!TheLoop->contains(ThenBB) || !TheLoop->contains(ElseBB))
6502 continue;
6503 bool ThenEmpty = IsEmptyBlock(ThenBB);
6504 bool ElseEmpty = IsEmptyBlock(ElseBB);
6505 if ((ThenEmpty && ElseEmpty) ||
6506 (ThenEmpty && ThenBB->getSingleSuccessor() == ElseBB &&
6507 ElseBB->phis().empty()) ||
6508 (ElseEmpty && ElseBB->getSingleSuccessor() == ThenBB &&
6509 ThenBB->phis().empty())) {
6510 VecValuesToIgnore.insert(Br);
6511 DeadOps.push_back(Br->getCondition());
6512 }
6513 continue;
6514 }
6515
6516 // Skip any op that shouldn't be considered dead.
6517 if (!Op || !TheLoop->contains(Op) ||
6518 (isa<PHINode>(Op) && Op->getParent() == Header) ||
6520 any_of(Op->users(), [this, IsLiveOutDead](User *U) {
6521 return !VecValuesToIgnore.contains(U) &&
6522 !ValuesToIgnore.contains(U) && !IsLiveOutDead(U);
6523 }))
6524 continue;
6525
6526 // If all of Op's users are in ValuesToIgnore, add it to ValuesToIgnore
6527 // which applies for both scalar and vector versions. Otherwise it is only
6528 // dead in vector versions, so only add it to VecValuesToIgnore.
6529 if (all_of(Op->users(),
6530 [this](User *U) { return ValuesToIgnore.contains(U); }))
6531 ValuesToIgnore.insert(Op);
6532
6533 VecValuesToIgnore.insert(Op);
6534 append_range(DeadOps, Op->operands());
6535 }
6536
6537 // Ignore type-promoting instructions we identified during reduction
6538 // detection.
6539 for (const auto &Reduction : Legal->getReductionVars()) {
6540 const RecurrenceDescriptor &RedDes = Reduction.second;
6541 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
6542 VecValuesToIgnore.insert_range(Casts);
6543 }
6544 // Ignore type-casting instructions we identified during induction
6545 // detection.
6546 for (const auto &Induction : Legal->getInductionVars()) {
6547 const InductionDescriptor &IndDes = Induction.second;
6548 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
6549 VecValuesToIgnore.insert_range(Casts);
6550 }
6551}
6552
6554 // Avoid duplicating work finding in-loop reductions.
6555 if (!InLoopReductions.empty())
6556 return;
6557
6558 for (const auto &Reduction : Legal->getReductionVars()) {
6559 PHINode *Phi = Reduction.first;
6560 const RecurrenceDescriptor &RdxDesc = Reduction.second;
6561
6562 // We don't collect reductions that are type promoted (yet).
6563 if (RdxDesc.getRecurrenceType() != Phi->getType())
6564 continue;
6565
6566 // If the target would prefer this reduction to happen "in-loop", then we
6567 // want to record it as such.
6568 RecurKind Kind = RdxDesc.getRecurrenceKind();
6569 if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) &&
6570 !TTI.preferInLoopReduction(Kind, Phi->getType()))
6571 continue;
6572
6573 // Check that we can correctly put the reductions into the loop, by
6574 // finding the chain of operations that leads from the phi to the loop
6575 // exit value.
6576 SmallVector<Instruction *, 4> ReductionOperations =
6577 RdxDesc.getReductionOpChain(Phi, TheLoop);
6578 bool InLoop = !ReductionOperations.empty();
6579
6580 if (InLoop) {
6581 InLoopReductions.insert(Phi);
6582 // Add the elements to InLoopReductionImmediateChains for cost modelling.
6583 Instruction *LastChain = Phi;
6584 for (auto *I : ReductionOperations) {
6585 InLoopReductionImmediateChains[I] = LastChain;
6586 LastChain = I;
6587 }
6588 }
6589 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
6590 << " reduction for phi: " << *Phi << "\n");
6591 }
6592}
6593
6594// This function will select a scalable VF if the target supports scalable
6595// vectors and a fixed one otherwise.
6596// TODO: we could return a pair of values that specify the max VF and
6597// min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
6598// `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment
6599// doesn't have a cost model that can choose which plan to execute if
6600// more than one is generated.
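// Illustrative arithmetic (numbers are examples only): with a 512-bit
// fixed-width register and a widest element type of 32 bits this returns
// 512 / 32 = 16, i.e. VF = 16; with a scalable register of known minimum
// size 128 bits it would return vscale x 4.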
6603 unsigned WidestType;
6604 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
6605
6607 TTI.enableScalableVectorization()
6610
6611 TypeSize RegSize = TTI.getRegisterBitWidth(RegKind);
6612 unsigned N = RegSize.getKnownMinValue() / WidestType;
6613 return ElementCount::get(N, RegSize.isScalable());
6614}
6615
6618 ElementCount VF = UserVF;
6619 // Outer loop handling: They may require CFG and instruction level
6620 // transformations before even evaluating whether vectorization is profitable.
6621 // Since we cannot modify the incoming IR, we need to build VPlan upfront in
6622 // the vectorization pipeline.
6623 if (!OrigLoop->isInnermost()) {
6624 // If the user doesn't provide a vectorization factor, determine a
6625 // reasonable one.
6626 if (UserVF.isZero()) {
6627 VF = determineVPlanVF(TTI, CM);
6628 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
6629
6630 // Make sure we have a VF > 1 for stress testing.
6631 if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
6632 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
6633 << "overriding computed VF.\n");
6634 VF = ElementCount::getFixed(4);
6635 }
6636 } else if (UserVF.isScalable() && !TTI.supportsScalableVectors() &&
6638 LLVM_DEBUG(dbgs() << "LV: Not vectorizing. Scalable VF requested, but "
6639 << "not supported by the target.\n");
6641 "Scalable vectorization requested but not supported by the target",
6642 "the scalable user-specified vectorization width for outer-loop "
6643 "vectorization cannot be used because the target does not support "
6644 "scalable vectors.",
6645 "ScalableVFUnfeasible", ORE, OrigLoop);
6647 }
6648 assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
6650 "VF needs to be a power of two");
6651 LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
6652 << "VF " << VF << " to build VPlans.\n");
6653 buildVPlans(VF, VF);
6654
6655 if (VPlans.empty())
6657
6658 // For VPlan build stress testing, we bail out after VPlan construction.
6661
6662 return {VF, 0 /*Cost*/, 0 /* ScalarCost */};
6663 }
6664
6665 LLVM_DEBUG(
6666 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
6667 "VPlan-native path.\n");
6669}
6670
6671void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
6672 assert(OrigLoop->isInnermost() && "Inner loop expected.");
6673 CM.collectValuesToIgnore();
6674 CM.collectElementTypesForWidening();
6675
6676 FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
6677 if (!MaxFactors) // Cases that should not be vectorized or interleaved.
6678 return;
6679
6680 // Invalidate interleave groups if all blocks of loop will be predicated.
6681 if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) &&
6683 LLVM_DEBUG(
6684 dbgs()
6685 << "LV: Invalidate all interleaved groups due to fold-tail by masking "
6686 "which requires masked-interleaved support.\n");
6687 if (CM.InterleaveInfo.invalidateGroups())
6688 // Invalidating interleave groups also requires invalidating all decisions
6689 // based on them, which includes widening decisions and uniform and scalar
6690 // values.
6691 CM.invalidateCostModelingDecisions();
6692 }
6693
6694 if (CM.foldTailByMasking())
6695 Legal->prepareToFoldTailByMasking();
6696
6697 ElementCount MaxUserVF =
6698 UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
6699 if (UserVF) {
6700 if (!ElementCount::isKnownLE(UserVF, MaxUserVF)) {
6702 "UserVF ignored because it may be larger than the maximal safe VF",
6703 "InvalidUserVF", ORE, OrigLoop);
6704 } else {
6706 "VF needs to be a power of two");
6707 // Collect the instructions (and their associated costs) that will be more
6708 // profitable to scalarize.
6709 CM.collectInLoopReductions();
6710 if (CM.selectUserVectorizationFactor(UserVF)) {
6711 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
6712 buildVPlansWithVPRecipes(UserVF, UserVF);
6714 return;
6715 }
6716 reportVectorizationInfo("UserVF ignored because of invalid costs.",
6717 "InvalidCost", ORE, OrigLoop);
6718 }
6719 }
6720
6721 // Collect the Vectorization Factor Candidates.
6722 SmallVector<ElementCount> VFCandidates;
6723 for (auto VF = ElementCount::getFixed(1);
6724 ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2)
6725 VFCandidates.push_back(VF);
6726 for (auto VF = ElementCount::getScalable(1);
6727 ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2)
6728 VFCandidates.push_back(VF);
6729
6730 CM.collectInLoopReductions();
6731 for (const auto &VF : VFCandidates) {
6732 // Collect Uniform and Scalar instructions after vectorization with VF.
6733 CM.collectNonVectorizedAndSetWideningDecisions(VF);
6734 }
6735
6736 buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF);
6737 buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF);
6738
6740}
6741
6743 ElementCount VF) const {
6744 InstructionCost Cost = CM.getInstructionCost(UI, VF);
6745 if (Cost.isValid() && ForceTargetInstructionCost.getNumOccurrences())
6747 return Cost;
6748}
6749
6751 ElementCount VF) const {
6752 return CM.isUniformAfterVectorization(I, VF);
6753}
6754
6755bool VPCostContext::skipCostComputation(Instruction *UI, bool IsVector) const {
6756 return CM.ValuesToIgnore.contains(UI) ||
6757 (IsVector && CM.VecValuesToIgnore.contains(UI)) ||
6758 SkipCostComputation.contains(UI);
6759}
6760
6762 return CM.getPredBlockCostDivisor(CostKind, BB);
6763}
6764
6766LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
6767 VPCostContext &CostCtx) const {
6769 // Cost modeling for inductions is inaccurate in the legacy cost model
6770 // compared to the recipes that are generated. To match it here initially during
6771 // VPlan cost-model bring-up, directly use the induction costs from the legacy
6772 // cost model. Note that we do this as pre-processing; the VPlan may not have
6773 // any recipes associated with the original induction increment instruction
6774 // and may replace truncates with VPWidenIntOrFpInductionRecipe. We precompute
6775 // the cost of induction phis and increments (both that are represented by
6776 // recipes and those that are not), to avoid distinguishing between them here,
6777 // and skip all recipes that represent induction phis and increments (the
6778 // former case) later on, if they exist, to avoid counting them twice.
6779 // Similarly we pre-compute the cost of any optimized truncates.
6780 // TODO: Switch to more accurate costing based on VPlan.
6781 for (const auto &[IV, IndDesc] : Legal->getInductionVars()) {
6783 IV->getIncomingValueForBlock(OrigLoop->getLoopLatch()));
6784 SmallVector<Instruction *> IVInsts = {IVInc};
6785 for (unsigned I = 0; I != IVInsts.size(); I++) {
6786 for (Value *Op : IVInsts[I]->operands()) {
6787 auto *OpI = dyn_cast<Instruction>(Op);
6788 if (Op == IV || !OpI || !OrigLoop->contains(OpI) || !Op->hasOneUse())
6789 continue;
6790 IVInsts.push_back(OpI);
6791 }
6792 }
6793 IVInsts.push_back(IV);
6794 for (User *U : IV->users()) {
6795 auto *CI = cast<Instruction>(U);
6796 if (!CostCtx.CM.isOptimizableIVTruncate(CI, VF))
6797 continue;
6798 IVInsts.push_back(CI);
6799 }
6800
6801 // If the vector loop gets executed exactly once with the given VF, ignore
6802 // the costs of comparison and induction instructions, as they'll get
6803 // simplified away.
6804 // TODO: Remove this code after stepping away from the legacy cost model and
6805 // adding code to simplify VPlans before calculating their costs.
6806 auto TC = getSmallConstantTripCount(PSE.getSE(), OrigLoop);
6807 if (TC == VF && !CM.foldTailByMasking())
6808 addFullyUnrolledInstructionsToIgnore(OrigLoop, Legal->getInductionVars(),
6809 CostCtx.SkipCostComputation);
6810
6811 for (Instruction *IVInst : IVInsts) {
6812 if (CostCtx.skipCostComputation(IVInst, VF.isVector()))
6813 continue;
6814 InstructionCost InductionCost = CostCtx.getLegacyCost(IVInst, VF);
6815 LLVM_DEBUG({
6816 dbgs() << "Cost of " << InductionCost << " for VF " << VF
6817 << ": induction instruction " << *IVInst << "\n";
6818 });
6819 Cost += InductionCost;
6820 CostCtx.SkipCostComputation.insert(IVInst);
6821 }
6822 }
6823
6824 /// Compute the cost of all exiting conditions of the loop using the legacy
6825 /// cost model. This is to match the legacy behavior, which adds the cost of
6826 /// all exit conditions. Note that this over-estimates the cost, as there will
6827 /// be a single condition to control the vector loop.
6829 CM.TheLoop->getExitingBlocks(Exiting);
6830 SetVector<Instruction *> ExitInstrs;
6831 // Collect all exit conditions.
6832 for (BasicBlock *EB : Exiting) {
6833 auto *Term = dyn_cast<BranchInst>(EB->getTerminator());
6834 if (!Term || CostCtx.skipCostComputation(Term, VF.isVector()))
6835 continue;
6836 if (auto *CondI = dyn_cast<Instruction>(Term->getOperand(0))) {
6837 ExitInstrs.insert(CondI);
6838 }
6839 }
6840 // Compute the cost of all instructions only feeding the exit conditions.
6841 for (unsigned I = 0; I != ExitInstrs.size(); ++I) {
6842 Instruction *CondI = ExitInstrs[I];
6843 if (!OrigLoop->contains(CondI) ||
6844 !CostCtx.SkipCostComputation.insert(CondI).second)
6845 continue;
6846 InstructionCost CondICost = CostCtx.getLegacyCost(CondI, VF);
6847 LLVM_DEBUG({
6848 dbgs() << "Cost of " << CondICost << " for VF " << VF
6849 << ": exit condition instruction " << *CondI << "\n";
6850 });
6851 Cost += CondICost;
6852 for (Value *Op : CondI->operands()) {
6853 auto *OpI = dyn_cast<Instruction>(Op);
6854 if (!OpI || CostCtx.skipCostComputation(OpI, VF.isVector()) ||
6855 any_of(OpI->users(), [&ExitInstrs, this](User *U) {
6856 return OrigLoop->contains(cast<Instruction>(U)->getParent()) &&
6857 !ExitInstrs.contains(cast<Instruction>(U));
6858 }))
6859 continue;
6860 ExitInstrs.insert(OpI);
6861 }
6862 }
6863
6864 // Pre-compute the costs for branches except for the backedge, as the number
6865 // of replicate regions in a VPlan may not directly match the number of
6866 // branches, which would lead to different decisions.
6867 // TODO: Compute cost of branches for each replicate region in the VPlan,
6868 // which is more accurate than the legacy cost model.
6869 for (BasicBlock *BB : OrigLoop->blocks()) {
6870 if (CostCtx.skipCostComputation(BB->getTerminator(), VF.isVector()))
6871 continue;
6872 CostCtx.SkipCostComputation.insert(BB->getTerminator());
6873 if (BB == OrigLoop->getLoopLatch())
6874 continue;
6875 auto BranchCost = CostCtx.getLegacyCost(BB->getTerminator(), VF);
6876 Cost += BranchCost;
6877 }
6878
6879 // Pre-compute costs for instructions that are forced-scalar or profitable to
6880 // scalarize. Their costs will be computed separately in the legacy cost
6881 // model.
6882 for (Instruction *ForcedScalar : CM.ForcedScalars[VF]) {
6883 if (CostCtx.skipCostComputation(ForcedScalar, VF.isVector()))
6884 continue;
6885 CostCtx.SkipCostComputation.insert(ForcedScalar);
6886 InstructionCost ForcedCost = CostCtx.getLegacyCost(ForcedScalar, VF);
6887 LLVM_DEBUG({
6888 dbgs() << "Cost of " << ForcedCost << " for VF " << VF
6889 << ": forced scalar " << *ForcedScalar << "\n";
6890 });
6891 Cost += ForcedCost;
6892 }
6893 for (const auto &[Scalarized, ScalarCost] : CM.InstsToScalarize[VF]) {
6894 if (CostCtx.skipCostComputation(Scalarized, VF.isVector()))
6895 continue;
6896 CostCtx.SkipCostComputation.insert(Scalarized);
6897 LLVM_DEBUG({
6898 dbgs() << "Cost of " << ScalarCost << " for VF " << VF
6899 << ": profitable to scalarize " << *Scalarized << "\n";
6900 });
6901 Cost += ScalarCost;
6902 }
6903
6904 return Cost;
6905}
6906
6907InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
6908 ElementCount VF) const {
6909 VPCostContext CostCtx(CM.TTI, *CM.TLI, Plan, CM, CM.CostKind, *PSE.getSE(),
6910 OrigLoop);
6911 InstructionCost Cost = precomputeCosts(Plan, VF, CostCtx);
6912
6913 // Now compute and add the VPlan-based cost.
6914 Cost += Plan.cost(VF, CostCtx);
6915#ifndef NDEBUG
6916 unsigned EstimatedWidth = estimateElementCount(VF, CM.getVScaleForTuning());
6917 LLVM_DEBUG(dbgs() << "Cost for VF " << VF << ": " << Cost
6918 << " (Estimated cost per lane: ");
6919 if (Cost.isValid()) {
6920 double CostPerLane = double(Cost.getValue()) / EstimatedWidth;
6921 LLVM_DEBUG(dbgs() << format("%.1f", CostPerLane));
6922 } else /* No point dividing an invalid cost - it will still be invalid */
6923 LLVM_DEBUG(dbgs() << "Invalid");
6924 LLVM_DEBUG(dbgs() << ")\n");
6925#endif
6926 return Cost;
6927}
6928
6929#ifndef NDEBUG
6930 /// Return true if the original loop \p TheLoop contains any instructions that do
6931/// not have corresponding recipes in \p Plan and are not marked to be ignored
6932/// in \p CostCtx. This means the VPlan contains simplification that the legacy
6933/// cost-model did not account for.
6935 VPCostContext &CostCtx,
6936 Loop *TheLoop,
6937 ElementCount VF) {
6938 // First collect all instructions for the recipes in Plan.
6939 auto GetInstructionForCost = [](const VPRecipeBase *R) -> Instruction * {
6940 if (auto *S = dyn_cast<VPSingleDefRecipe>(R))
6941 return dyn_cast_or_null<Instruction>(S->getUnderlyingValue());
6942 if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(R))
6943 return &WidenMem->getIngredient();
6944 return nullptr;
6945 };
6946
6947 // Check if a select for a safe divisor was hoisted to the pre-header. If so,
6948 // the select doesn't need to be considered for the vector loop cost; go with
6949 // the more accurate VPlan-based cost model.
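// The hoisted pattern is roughly a pre-header select such as
//   %safe.div = select i1 %c, i32 %divisor, i32 1
// feeding a widened udiv/sdiv/urem/srem in the loop body, which is what the
// loop below looks for.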
6950 for (VPRecipeBase &R : *Plan.getVectorPreheader()) {
6951 auto *VPI = dyn_cast<VPInstruction>(&R);
6952 if (!VPI || VPI->getOpcode() != Instruction::Select ||
6953 VPI->getNumUsers() != 1)
6954 continue;
6955
6956 if (auto *WR = dyn_cast<VPWidenRecipe>(*VPI->user_begin())) {
6957 switch (WR->getOpcode()) {
6958 case Instruction::UDiv:
6959 case Instruction::SDiv:
6960 case Instruction::URem:
6961 case Instruction::SRem:
6962 return true;
6963 default:
6964 break;
6965 }
6966 }
6967 }
6968
6969 DenseSet<Instruction *> SeenInstrs;
6970 auto Iter = vp_depth_first_deep(Plan.getVectorLoopRegion()->getEntry());
6972 for (VPRecipeBase &R : *VPBB) {
6973 if (auto *IR = dyn_cast<VPInterleaveRecipe>(&R)) {
6974 auto *IG = IR->getInterleaveGroup();
6975 unsigned NumMembers = IG->getNumMembers();
6976 for (unsigned I = 0; I != NumMembers; ++I) {
6977 if (Instruction *M = IG->getMember(I))
6978 SeenInstrs.insert(M);
6979 }
6980 continue;
6981 }
6982 // Unused FOR splices are removed by VPlan transforms, so the VPlan-based
6983 // cost model won't cost it whilst the legacy will.
6984 if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R)) {
6985 using namespace VPlanPatternMatch;
6986 if (none_of(FOR->users(),
6987 match_fn(m_VPInstruction<
6989 return true;
6990 }
6991 // The VPlan-based cost model is more accurate for partial reduction and
6992 // comparing against the legacy cost isn't desirable.
6994 return true;
6995
6996 // The VPlan-based cost model can analyze if recipes are scalar
6997 // recursively, but the legacy cost model cannot.
6998 if (auto *WidenMemR = dyn_cast<VPWidenMemoryRecipe>(&R)) {
6999 auto *AddrI = dyn_cast<Instruction>(
7000 getLoadStorePointerOperand(&WidenMemR->getIngredient()));
7001 if (AddrI && vputils::isSingleScalar(WidenMemR->getAddr()) !=
7002 CostCtx.isLegacyUniformAfterVectorization(AddrI, VF))
7003 return true;
7004 }
7005
7006 /// If a VPlan transform folded a recipe to one producing a single-scalar,
7007 /// but the original instruction wasn't uniform-after-vectorization in the
7008 /// legacy cost model, the legacy cost overestimates the actual cost.
7009 if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
7010 if (RepR->isSingleScalar() &&
7012 RepR->getUnderlyingInstr(), VF))
7013 return true;
7014 }
7015 if (Instruction *UI = GetInstructionForCost(&R)) {
7016 // If we adjusted the predicate of the recipe, the cost in the legacy
7017 // cost model may be different.
7018 using namespace VPlanPatternMatch;
7019 CmpPredicate Pred;
7020 if (match(&R, m_Cmp(Pred, m_VPValue(), m_VPValue())) &&
7021 cast<VPRecipeWithIRFlags>(R).getPredicate() !=
7022 cast<CmpInst>(UI)->getPredicate())
7023 return true;
7024 SeenInstrs.insert(UI);
7025 }
7026 }
7027 }
7028
7029 // Return true if the loop contains any instructions that are not also part of
7030 // the VPlan or are skipped for VPlan-based cost computations. This indicates
7031 // that the VPlan contains extra simplifications.
7032 return any_of(TheLoop->blocks(), [&SeenInstrs, &CostCtx,
7033 TheLoop](BasicBlock *BB) {
7034 return any_of(*BB, [&SeenInstrs, &CostCtx, TheLoop, BB](Instruction &I) {
7035 // Skip induction phis when checking for simplifications, as they may not
7036 // be lowered directly to a corresponding PHI recipe.
7037 if (isa<PHINode>(&I) && BB == TheLoop->getHeader() &&
7038 CostCtx.CM.Legal->isInductionPhi(cast<PHINode>(&I)))
7039 return false;
7040 return !SeenInstrs.contains(&I) && !CostCtx.skipCostComputation(&I, true);
7041 });
7042 });
7043}
7044#endif
7045
7047 if (VPlans.empty())
7049 // If there is a single VPlan with a single VF, return it directly.
7050 VPlan &FirstPlan = *VPlans[0];
7051 if (VPlans.size() == 1 && size(FirstPlan.vectorFactors()) == 1)
7052 return {*FirstPlan.vectorFactors().begin(), 0, 0};
7053
7054 LLVM_DEBUG(dbgs() << "LV: Computing best VF using cost kind: "
7055 << (CM.CostKind == TTI::TCK_RecipThroughput
7056 ? "Reciprocal Throughput\n"
7057 : CM.CostKind == TTI::TCK_Latency
7058 ? "Instruction Latency\n"
7059 : CM.CostKind == TTI::TCK_CodeSize ? "Code Size\n"
7060 : CM.CostKind == TTI::TCK_SizeAndLatency
7061 ? "Code Size and Latency\n"
7062 : "Unknown\n"));
7063
7065 assert(hasPlanWithVF(ScalarVF) &&
7066 "More than a single plan/VF w/o any plan having scalar VF");
7067
7068 // TODO: Compute scalar cost using VPlan-based cost model.
7069 InstructionCost ScalarCost = CM.expectedCost(ScalarVF);
7070 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ScalarCost << ".\n");
7071 VectorizationFactor ScalarFactor(ScalarVF, ScalarCost, ScalarCost);
7072 VectorizationFactor BestFactor = ScalarFactor;
7073
7074 bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
7075 if (ForceVectorization) {
7076 // Ignore scalar width, because the user explicitly wants vectorization.
7077 // Initialize cost to max so that VF = 2 is, at least, chosen during cost
7078 // evaluation.
7079 BestFactor.Cost = InstructionCost::getMax();
7080 }
7081
7082 for (auto &P : VPlans) {
7083 ArrayRef<ElementCount> VFs(P->vectorFactors().begin(),
7084 P->vectorFactors().end());
7085
7087 if (any_of(VFs, [this](ElementCount VF) {
7088 return CM.shouldConsiderRegPressureForVF(VF);
7089 }))
7090 RUs = calculateRegisterUsageForPlan(*P, VFs, TTI, CM.ValuesToIgnore);
7091
7092 for (unsigned I = 0; I < VFs.size(); I++) {
7093 ElementCount VF = VFs[I];
7094 if (VF.isScalar())
7095 continue;
7096 if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) {
7097 LLVM_DEBUG(
7098 dbgs()
7099 << "LV: Not considering vector loop of width " << VF
7100 << " because it will not generate any vector instructions.\n");
7101 continue;
7102 }
7103 if (CM.OptForSize && !ForceVectorization && hasReplicatorRegion(*P)) {
7104 LLVM_DEBUG(
7105 dbgs()
7106 << "LV: Not considering vector loop of width " << VF
7107 << " because it would cause replicated blocks to be generated,"
7108 << " which isn't allowed when optimizing for size.\n");
7109 continue;
7110 }
7111
7112 InstructionCost Cost = cost(*P, VF);
7113 VectorizationFactor CurrentFactor(VF, Cost, ScalarCost);
7114
7115 if (CM.shouldConsiderRegPressureForVF(VF) &&
7116 RUs[I].exceedsMaxNumRegs(TTI, ForceTargetNumVectorRegs)) {
7117 LLVM_DEBUG(dbgs() << "LV(REG): Not considering vector loop of width "
7118 << VF << " because it uses too many registers\n");
7119 continue;
7120 }
7121
7122 if (isMoreProfitable(CurrentFactor, BestFactor, P->hasScalarTail()))
7123 BestFactor = CurrentFactor;
7124
7125 // If profitable add it to ProfitableVF list.
7126 if (isMoreProfitable(CurrentFactor, ScalarFactor, P->hasScalarTail()))
7127 ProfitableVFs.push_back(CurrentFactor);
7128 }
7129 }
7130
7131#ifndef NDEBUG
7132 // Select the optimal vectorization factor according to the legacy cost-model.
7133 // This is now only used to verify the decisions by the new VPlan-based
7134 // cost-model and will be retired once the VPlan-based cost-model is
7135 // stabilized.
7136 VectorizationFactor LegacyVF = selectVectorizationFactor();
7137 VPlan &BestPlan = getPlanFor(BestFactor.Width);
7138
7139 // Pre-compute the cost and use it to check if BestPlan contains any
7140 // simplifications not accounted for in the legacy cost model. If that's the
7141 // case, don't trigger the assertion, as the extra simplifications may cause a
7142 // different VF to be picked by the VPlan-based cost model.
7143 VPCostContext CostCtx(CM.TTI, *CM.TLI, BestPlan, CM, CM.CostKind,
7144 *CM.PSE.getSE(), OrigLoop);
7145 precomputeCosts(BestPlan, BestFactor.Width, CostCtx);
7146 // Verify that the VPlan-based and legacy cost models agree, except for VPlans
7147 // with early exits and plans with additional VPlan simplifications. The
7148 // legacy cost model doesn't properly model costs for such loops.
7149 assert((BestFactor.Width == LegacyVF.Width || BestPlan.hasEarlyExit() ||
7150 !Legal->getLAI()->getSymbolicStrides().empty() ||
7152 CostCtx, OrigLoop,
7153 BestFactor.Width) ||
7155 getPlanFor(LegacyVF.Width), CostCtx, OrigLoop, LegacyVF.Width)) &&
7156 " VPlan cost model and legacy cost model disagreed");
7157 assert((BestFactor.Width.isScalar() || BestFactor.ScalarCost > 0) &&
7158 "when vectorizing, the scalar cost must be computed.");
7159#endif
7160
7161 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << BestFactor.Width << ".\n");
7162 return BestFactor;
7163}
7164
7166 using namespace VPlanPatternMatch;
7168 "RdxResult must be ComputeFindIVResult");
7169 VPValue *StartVPV = RdxResult->getOperand(1);
7170 match(StartVPV, m_Freeze(m_VPValue(StartVPV)));
7171 return StartVPV->getLiveInIRValue();
7172}
7173
7174 // If \p EpiResumePhiR is the resume VPPhi for a reduction when vectorizing the
7175// epilog loop, fix the reduction's scalar PHI node by adding the incoming value
7176// from the main vector loop.
7178 VPPhi *EpiResumePhiR, PHINode &EpiResumePhi, BasicBlock *BypassBlock) {
7179 // Get the VPInstruction computing the reduction result in the middle block.
7180 // The first operand may not be from the middle block if it is not connected
7181 // to the scalar preheader. In that case, there's nothing to fix.
7182 VPValue *Incoming = EpiResumePhiR->getOperand(0);
7185 auto *EpiRedResult = dyn_cast<VPInstruction>(Incoming);
7186 if (!EpiRedResult ||
7187 (EpiRedResult->getOpcode() != VPInstruction::ComputeAnyOfResult &&
7188 EpiRedResult->getOpcode() != VPInstruction::ComputeReductionResult &&
7189 EpiRedResult->getOpcode() != VPInstruction::ComputeFindIVResult))
7190 return;
7191
7192 auto *EpiRedHeaderPhi =
7193 cast<VPReductionPHIRecipe>(EpiRedResult->getOperand(0));
7194 RecurKind Kind = EpiRedHeaderPhi->getRecurrenceKind();
7195 Value *MainResumeValue;
7196 if (auto *VPI = dyn_cast<VPInstruction>(EpiRedHeaderPhi->getStartValue())) {
7197 assert((VPI->getOpcode() == VPInstruction::Broadcast ||
7198 VPI->getOpcode() == VPInstruction::ReductionStartVector) &&
7199 "unexpected start recipe");
7200 MainResumeValue = VPI->getOperand(0)->getUnderlyingValue();
7201 } else
7202 MainResumeValue = EpiRedHeaderPhi->getStartValue()->getUnderlyingValue();
7204 [[maybe_unused]] Value *StartV =
7205 EpiRedResult->getOperand(1)->getLiveInIRValue();
7206 auto *Cmp = cast<ICmpInst>(MainResumeValue);
7207 assert(Cmp->getPredicate() == CmpInst::ICMP_NE &&
7208 "AnyOf expected to start with ICMP_NE");
7209 assert(Cmp->getOperand(1) == StartV &&
7210 "AnyOf expected to start by comparing main resume value to original "
7211 "start value");
7212 MainResumeValue = Cmp->getOperand(0);
7214 Value *StartV = getStartValueFromReductionResult(EpiRedResult);
7215 Value *SentinelV = EpiRedResult->getOperand(2)->getLiveInIRValue();
7216 using namespace llvm::PatternMatch;
7217 Value *Cmp, *OrigResumeV, *CmpOp;
7218 [[maybe_unused]] bool IsExpectedPattern =
7219 match(MainResumeValue,
7220 m_Select(m_OneUse(m_Value(Cmp)), m_Specific(SentinelV),
7221 m_Value(OrigResumeV))) &&
7223 m_Value(CmpOp))) &&
7224 ((CmpOp == StartV && isGuaranteedNotToBeUndefOrPoison(CmpOp))));
7225 assert(IsExpectedPattern && "Unexpected reduction resume pattern");
7226 MainResumeValue = OrigResumeV;
7227 }
7228 PHINode *MainResumePhi = cast<PHINode>(MainResumeValue);
7229
7230 // When fixing reductions in the epilogue loop we should already have
7231 // created a bc.merge.rdx Phi after the main vector body. Ensure that we carry
7232 // over the incoming values correctly.
7233 EpiResumePhi.setIncomingValueForBlock(
7234 BypassBlock, MainResumePhi->getIncomingValueForBlock(BypassBlock));
7235}
7236
7238 ElementCount BestVF, unsigned BestUF, VPlan &BestVPlan,
7239 InnerLoopVectorizer &ILV, DominatorTree *DT, bool VectorizingEpilogue) {
7240 assert(BestVPlan.hasVF(BestVF) &&
7241 "Trying to execute plan with unsupported VF");
7242 assert(BestVPlan.hasUF(BestUF) &&
7243 "Trying to execute plan with unsupported UF");
7244 if (BestVPlan.hasEarlyExit())
7245 ++LoopsEarlyExitVectorized;
7246 // TODO: Move to VPlan transform stage once the transition to the VPlan-based
7247 // cost model is complete for better cost estimates.
7250 BestVPlan);
7253 bool HasBranchWeights =
7254 hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator());
7255 if (HasBranchWeights) {
7256 std::optional<unsigned> VScale = CM.getVScaleForTuning();
7258 BestVPlan, BestVF, VScale);
7259 }
7260
7261 // Checks are the same for all VPlans, added to BestVPlan only for
7262 // compactness.
7263 attachRuntimeChecks(BestVPlan, ILV.RTChecks, HasBranchWeights);
7264
7266 // Retrieve VectorPH now, while it is easier because VPlan still has Regions.
7266 VPBasicBlock *VectorPH = cast<VPBasicBlock>(BestVPlan.getVectorPreheader());
7267
7268 VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
7271 if (BestVPlan.getEntry()->getSingleSuccessor() ==
7272 BestVPlan.getScalarPreheader()) {
7273 // TODO: The vector loop would be dead, should not even try to vectorize.
7274 ORE->emit([&]() {
7275 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationDead",
7276 OrigLoop->getStartLoc(),
7277 OrigLoop->getHeader())
7278 << "Created vector loop never executes due to insufficient trip "
7279 "count.";
7280 });
7282 }
7283
7285 BestVPlan, BestVF,
7286 TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector));
7288
7290 // Regions are dissolved after optimizing for VF and UF, which completely
7291 // removes unneeded loop regions first.
7293 // Canonicalize EVL loops after regions are dissolved.
7297 BestVPlan, VectorPH, CM.foldTailByMasking(),
7298 CM.requiresScalarEpilogue(BestVF.isVector()));
7299 VPlanTransforms::materializeVFAndVFxUF(BestVPlan, VectorPH, BestVF);
7300 VPlanTransforms::cse(BestVPlan);
7302
7303 // 0. Generate SCEV-dependent code in the entry, including TripCount, before
7304 // making any changes to the CFG.
7305 DenseMap<const SCEV *, Value *> ExpandedSCEVs =
7306 VPlanTransforms::expandSCEVs(BestVPlan, *PSE.getSE());
7307 if (!ILV.getTripCount())
7308 ILV.setTripCount(BestVPlan.getTripCount()->getLiveInIRValue());
7309 else
7310 assert(VectorizingEpilogue && "should only re-use the existing trip "
7311 "count during epilogue vectorization");
7312
7313 // Perform the actual loop transformation.
7314 VPTransformState State(&TTI, BestVF, LI, DT, ILV.AC, ILV.Builder, &BestVPlan,
7315 OrigLoop->getParentLoop(),
7316 Legal->getWidestInductionType());
7317
7318#ifdef EXPENSIVE_CHECKS
7319 assert(DT->verify(DominatorTree::VerificationLevel::Fast));
7320#endif
7321
7322 // 1. Set up the skeleton for vectorization, including vector pre-header and
7323 // middle block. The vector loop is created during VPlan execution.
7324 State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
7326 State.CFG.PrevBB->getSingleSuccessor(), &BestVPlan);
7328
7329 assert(verifyVPlanIsValid(BestVPlan, true /*VerifyLate*/) &&
7330 "final VPlan is invalid");
7331
7332 // After vectorization, the exit blocks of the original loop will have
7333 // additional predecessors. Invalidate SCEVs for the exit phis in case SE
7334 // looked through single-entry phis.
7335 ScalarEvolution &SE = *PSE.getSE();
7336 for (VPIRBasicBlock *Exit : BestVPlan.getExitBlocks()) {
7337 if (!Exit->hasPredecessors())
7338 continue;
7339 for (VPRecipeBase &PhiR : Exit->phis())
7341 &cast<VPIRPhi>(PhiR).getIRPhi());
7342 }
7343 // Forget the original loop and block dispositions.
7344 SE.forgetLoop(OrigLoop);
7346
7348
7349 //===------------------------------------------------===//
7350 //
7351 // Notice: any optimization or new instruction that goes
7352 // into the code below should also be implemented in
7353 // the cost-model.
7354 //
7355 //===------------------------------------------------===//
7356
7357 // Retrieve loop information before executing the plan, which may remove the
7358 // original loop, if it becomes unreachable.
7359 MDNode *LID = OrigLoop->getLoopID();
7360 unsigned OrigLoopInvocationWeight = 0;
7361 std::optional<unsigned> OrigAverageTripCount =
7362 getLoopEstimatedTripCount(OrigLoop, &OrigLoopInvocationWeight);
7363
7364 BestVPlan.execute(&State);
7365
7366 // 2.6. Maintain Loop Hints
7367 // Keep all loop hints from the original loop on the vector loop (we'll
7368 // replace the vectorizer-specific hints below).
7369 VPBasicBlock *HeaderVPBB = vputils::getFirstLoopHeader(BestVPlan, State.VPDT);
7370 // Add metadata to disable runtime unrolling a scalar loop when there
7371 // are no runtime checks about strides and memory. A scalar loop that is
7372 // rarely used is not worth unrolling.
7373 bool DisableRuntimeUnroll = !ILV.RTChecks.hasChecks() && !BestVF.isScalar();
7375 HeaderVPBB ? LI->getLoopFor(State.CFG.VPBB2IRBB.lookup(HeaderVPBB))
7376 : nullptr,
7377 HeaderVPBB, BestVPlan, VectorizingEpilogue, LID, OrigAverageTripCount,
7378 OrigLoopInvocationWeight,
7379 estimateElementCount(BestVF * BestUF, CM.getVScaleForTuning()),
7380 DisableRuntimeUnroll);
7381
7382 // 3. Fix the vectorized code: take care of header phi's, live-outs,
7383 // predication, updating analyses.
7384 ILV.fixVectorizedLoop(State);
7385
7387
7388 return ExpandedSCEVs;
7389}
7390
7391//===--------------------------------------------------------------------===//
7392// EpilogueVectorizerMainLoop
7393//===--------------------------------------------------------------------===//
7394
7395/// This function is partially responsible for generating the control flow
7396/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7398 BasicBlock *ScalarPH = createScalarPreheader("");
7399 BasicBlock *VectorPH = ScalarPH->getSinglePredecessor();
7400
7401 // Generate the code to check the minimum iteration count of the vector
7402 // epilogue (see below).
7403 EPI.EpilogueIterationCountCheck =
7404 emitIterationCountCheck(VectorPH, ScalarPH, true);
7405 EPI.EpilogueIterationCountCheck->setName("iter.check");
7406
7407 VectorPH = cast<BranchInst>(EPI.EpilogueIterationCountCheck->getTerminator())
7408 ->getSuccessor(1);
7409 // Generate the iteration count check for the main loop, *after* the check
7410 // for the epilogue loop, so that the path-length is shorter for the case
7411 // that goes directly through the vector epilogue. The longer-path length for
7412 // the main loop is compensated for by the gain from vectorizing the larger
7413 // trip count. Note: the branch will get updated later on when we vectorize
7414 // the epilogue.
7415 EPI.MainLoopIterationCountCheck =
7416 emitIterationCountCheck(VectorPH, ScalarPH, false);
7417
7418 return cast<BranchInst>(EPI.MainLoopIterationCountCheck->getTerminator())
7419 ->getSuccessor(1);
7420}
7421
7423 LLVM_DEBUG({
7424 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
7425 << "Main Loop VF:" << EPI.MainLoopVF
7426 << ", Main Loop UF:" << EPI.MainLoopUF
7427 << ", Epilogue Loop VF:" << EPI.EpilogueVF
7428 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7429 });
7430}
7431
7434 dbgs() << "intermediate fn:\n"
7435 << *OrigLoop->getHeader()->getParent() << "\n";
7436 });
7437}
7438
7440 BasicBlock *VectorPH, BasicBlock *Bypass, bool ForEpilogue) {
7441 assert(Bypass && "Expected valid bypass basic block.");
7444 Value *CheckMinIters = createIterationCountCheck(
7445 VectorPH, ForEpilogue ? EPI.EpilogueVF : EPI.MainLoopVF,
7446 ForEpilogue ? EPI.EpilogueUF : EPI.MainLoopUF);
7447
7448 BasicBlock *const TCCheckBlock = VectorPH;
7449 if (!ForEpilogue)
7450 TCCheckBlock->setName("vector.main.loop.iter.check");
7451
7452 // Create new preheader for vector loop.
7453 VectorPH = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
7454 static_cast<DominatorTree *>(nullptr), LI, nullptr,
7455 "vector.ph");
7456 if (ForEpilogue) {
7457 // Save the trip count so we don't have to regenerate it in the
7458 // vec.epilog.iter.check. This is safe to do because the trip count
7459 // generated here dominates the vector epilog iter check.
7460 EPI.TripCount = Count;
7461 } else {
7463 }
7464
7465 BranchInst &BI = *BranchInst::Create(Bypass, VectorPH, CheckMinIters);
7466 if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator()))
7467 setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false);
7468 ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
7469
7470 // When vectorizing the main loop, its trip-count check is placed in a new
7471 // block, whereas the overall trip-count check is placed in the VPlan entry
7472 // block. When vectorizing the epilogue loop, its trip-count check is placed
7473 // in the VPlan entry block.
7474 if (!ForEpilogue)
7475 introduceCheckBlockInVPlan(TCCheckBlock);
7476 return TCCheckBlock;
7477}
7478
7479//===--------------------------------------------------------------------===//
7480// EpilogueVectorizerEpilogueLoop
7481//===--------------------------------------------------------------------===//
7482
7483 /// This function creates a new scalar preheader, using the previous one as the
7484 /// entry block to the epilogue VPlan. The minimum iteration check is
7485/// represented in VPlan.
7487 BasicBlock *NewScalarPH = createScalarPreheader("vec.epilog.");
7488 BasicBlock *OriginalScalarPH = NewScalarPH->getSinglePredecessor();
7489 OriginalScalarPH->setName("vec.epilog.iter.check");
7490 VPIRBasicBlock *NewEntry = Plan.createVPIRBasicBlock(OriginalScalarPH);
7491 VPBasicBlock *OldEntry = Plan.getEntry();
7492 for (auto &R : make_early_inc_range(*OldEntry)) {
7493 // Skip moving VPIRInstructions (including VPIRPhis), which are unmovable by
7494 // definition.
7495 if (isa<VPIRInstruction>(&R))
7496 continue;
7497 R.moveBefore(*NewEntry, NewEntry->end());
7498 }
7499
7500 VPBlockUtils::reassociateBlocks(OldEntry, NewEntry);
7501 Plan.setEntry(NewEntry);
7502 // OldEntry is now dead and will be cleaned up when the plan gets destroyed.
7503
7504 return OriginalScalarPH;
7505}
7506
7508 LLVM_DEBUG({
7509 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
7510 << "Epilogue Loop VF:" << EPI.EpilogueVF
7511 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7512 });
7513}
7514
7517 dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n";
7518 });
7519}
7520
7521VPWidenMemoryRecipe *VPRecipeBuilder::tryToWidenMemory(VPInstruction *VPI,
7522 VFRange &Range) {
7523 assert((VPI->getOpcode() == Instruction::Load ||
7524 VPI->getOpcode() == Instruction::Store) &&
7525 "Must be called with either a load or store");
7527
7528 auto WillWiden = [&](ElementCount VF) -> bool {
7530 CM.getWideningDecision(I, VF);
7532 "CM decision should be taken at this point.");
7534 return true;
7535 if (CM.isScalarAfterVectorization(I, VF) ||
7536 CM.isProfitableToScalarize(I, VF))
7537 return false;
7539 };
7540
7542 return nullptr;
7543
7544 VPValue *Mask = nullptr;
7545 if (Legal->isMaskRequired(I))
7546 Mask = getBlockInMask(Builder.getInsertBlock());
7547
7548 // Determine if the pointer operand of the access is either consecutive or
7549 // reverse consecutive.
7551 CM.getWideningDecision(I, Range.Start);
7553 bool Consecutive =
7555
7556 VPValue *Ptr = VPI->getOpcode() == Instruction::Load ? VPI->getOperand(0)
7557 : VPI->getOperand(1);
7558 if (Consecutive) {
7560 Ptr->getUnderlyingValue()->stripPointerCasts());
7561 VPSingleDefRecipe *VectorPtr;
7562 if (Reverse) {
7563 // When folding the tail, we may compute an address that we don't compute in
7564 // the original scalar loop: drop the GEP no-wrap flags in this case.
7565 // Otherwise preserve existing flags without no-unsigned-wrap, as we will
7566 // emit negative indices.
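// Sketch of the reverse case: the wide access is anchored at its last lane,
// at roughly 'base + (1 - VF)' elements, i.e. via a negative index, which is
// why no-unsigned-wrap cannot be preserved on the GEP.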
7567 GEPNoWrapFlags Flags =
7568 CM.foldTailByMasking() || !GEP
7570 : GEP->getNoWrapFlags().withoutNoUnsignedWrap();
7571 VectorPtr = new VPVectorEndPointerRecipe(
7572 Ptr, &Plan.getVF(), getLoadStoreType(I),
7573 /*Stride*/ -1, Flags, VPI->getDebugLoc());
7574 } else {
7575 VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I),
7576 GEP ? GEP->getNoWrapFlags()
7578 VPI->getDebugLoc());
7579 }
7580 Builder.insert(VectorPtr);
7581 Ptr = VectorPtr;
7582 }
7583 if (VPI->getOpcode() == Instruction::Load) {
7584 auto *Load = cast<LoadInst>(I);
7585 return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse,
7586 VPIRMetadata(*Load, LVer), I->getDebugLoc());
7587 }
7588
7589 StoreInst *Store = cast<StoreInst>(I);
7590 return new VPWidenStoreRecipe(*Store, Ptr, VPI->getOperand(0), Mask,
7591 Consecutive, Reverse,
7592 VPIRMetadata(*Store, LVer), VPI->getDebugLoc());
7593}
7594
7595/// Creates a VPWidenIntOrFpInductionRecipe for \p PhiR. If needed, it will
7596/// also insert a recipe to expand the step for the induction recipe.
7597static VPWidenIntOrFpInductionRecipe *
7599 const InductionDescriptor &IndDesc, VPlan &Plan,
7600 ScalarEvolution &SE, Loop &OrigLoop) {
7601 assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) &&
7602 "step must be loop invariant");
7603
7604 VPValue *Start = PhiR->getOperand(0);
7605 assert(Plan.getLiveIn(IndDesc.getStartValue()) == Start &&
7606 "Start VPValue must match IndDesc's start value");
7607
7608 VPValue *Step =
7609       vputils::getOrCreateVPValueForSCEVExpr(Plan, IndDesc.getStep());
7610   PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingInstr());
7611   return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, &Plan.getVF(),
7612 IndDesc, PhiR->getDebugLoc());
7613}
7614
7615VPHeaderPHIRecipe *
7616VPRecipeBuilder::tryToOptimizeInductionPHI(VPInstruction *VPI, VFRange &Range) {
7617 auto *Phi = cast<PHINode>(VPI->getUnderlyingInstr());
7618
7619 // Check if this is an integer or fp induction. If so, build the recipe that
7620 // produces its scalar and vector values.
7621 if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi))
7622 return createWidenInductionRecipes(VPI, *II, Plan, *PSE.getSE(), *OrigLoop);
7623
7624 // Check if this is pointer induction. If so, build the recipe for it.
7625 if (auto *II = Legal->getPointerInductionDescriptor(Phi)) {
7626 VPValue *Step = vputils::getOrCreateVPValueForSCEVExpr(Plan, II->getStep());
7627 return new VPWidenPointerInductionRecipe(
7628 Phi, VPI->getOperand(0), Step, &Plan.getVFxUF(), *II,
7629         LoopVectorizationPlanner::getDecisionAndClampRange(
7630             [&](ElementCount VF) {
7631 return CM.isScalarAfterVectorization(Phi, VF);
7632 },
7633 Range),
7634 VPI->getDebugLoc());
7635 }
7636 return nullptr;
7637}
7638
7639VPWidenIntOrFpInductionRecipe *
7640VPRecipeBuilder::tryToOptimizeInductionTruncate(VPInstruction *VPI,
7641 VFRange &Range) {
7642 auto *I = cast<TruncInst>(VPI->getUnderlyingInstr());
7643 // Optimize the special case where the source is a constant integer
7644 // induction variable. Notice that we can only optimize the 'trunc' case
7645 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
7646 // (c) other casts depend on pointer size.
7647
7648 // Determine whether \p K is a truncation based on an induction variable that
7649 // can be optimized.
7650 auto IsOptimizableIVTruncate =
7651 [&](Instruction *K) -> std::function<bool(ElementCount)> {
7652 return [=](ElementCount VF) -> bool {
7653 return CM.isOptimizableIVTruncate(K, VF);
7654 };
7655 };
7656
7657   if (!LoopVectorizationPlanner::getDecisionAndClampRange(
7658           IsOptimizableIVTruncate(I), Range))
7659 return nullptr;
7660
7661   auto *WidenIV = cast<VPWidenIntOrFpInductionRecipe>(
7662       VPI->getOperand(0)->getDefiningRecipe());
7663 PHINode *Phi = WidenIV->getPHINode();
7664 VPValue *Start = WidenIV->getStartValue();
7665 const InductionDescriptor &IndDesc = WidenIV->getInductionDescriptor();
7666 VPValue *Step =
7667       vputils::getOrCreateVPValueForSCEVExpr(Plan, IndDesc.getStep());
7668   return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, &Plan.getVF(),
7669 IndDesc, I, VPI->getDebugLoc());
7670}
7671
7672VPSingleDefRecipe *VPRecipeBuilder::tryToWidenCall(VPInstruction *VPI,
7673 VFRange &Range) {
7674 CallInst *CI = cast<CallInst>(VPI->getUnderlyingInstr());
7675   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
7676       [this, CI](ElementCount VF) {
7677 return CM.isScalarWithPredication(CI, VF);
7678 },
7679 Range);
7680
7681 if (IsPredicated)
7682 return nullptr;
7683
7684   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
7685   if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
7686 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
7687 ID == Intrinsic::pseudoprobe ||
7688 ID == Intrinsic::experimental_noalias_scope_decl))
7689 return nullptr;
7690
7691   SmallVector<VPValue *> Ops(VPI->op_begin(),
7692                              VPI->op_begin() + CI->arg_size());
7693
7694 // Is it beneficial to perform intrinsic call compared to lib call?
7695 bool ShouldUseVectorIntrinsic =
7696       ID && LoopVectorizationPlanner::getDecisionAndClampRange(
7697                 [&](ElementCount VF) -> bool {
7698 return CM.getCallWideningDecision(CI, VF).Kind ==
7699                       LoopVectorizationCostModel::CM_IntrinsicCall;
7700                 },
7701 Range);
7702 if (ShouldUseVectorIntrinsic)
7703 return new VPWidenIntrinsicRecipe(*CI, ID, Ops, CI->getType(),
7704 VPI->getDebugLoc());
7705
7706 Function *Variant = nullptr;
7707 std::optional<unsigned> MaskPos;
7708   // Is it better to call a vectorized version of the function than to
7709   // scalarize the call?
7710 auto ShouldUseVectorCall = LoopVectorizationPlanner::getDecisionAndClampRange(
7711 [&](ElementCount VF) -> bool {
7712 // The following case may be scalarized depending on the VF.
7713 // The flag shows whether we can use a usual Call for vectorized
7714 // version of the instruction.
7715
7716 // If we've found a variant at a previous VF, then stop looking. A
7717 // vectorized variant of a function expects input in a certain shape
7718 // -- basically the number of input registers, the number of lanes
7719 // per register, and whether there's a mask required.
7720 // We store a pointer to the variant in the VPWidenCallRecipe, so
7721 // once we have an appropriate variant it's only valid for that VF.
7722 // This will force a different vplan to be generated for each VF that
7723 // finds a valid variant.
7724 if (Variant)
7725 return false;
7726 LoopVectorizationCostModel::CallWideningDecision Decision =
7727 CM.getCallWideningDecision(CI, VF);
7728           if (Decision.Kind == LoopVectorizationCostModel::CM_VectorCall) {
7729             Variant = Decision.Variant;
7730 MaskPos = Decision.MaskPos;
7731 return true;
7732 }
7733
7734 return false;
7735 },
7736 Range);
7737 if (ShouldUseVectorCall) {
7738 if (MaskPos.has_value()) {
7739 // We have 2 cases that would require a mask:
7740 // 1) The block needs to be predicated, either due to a conditional
7741 // in the scalar loop or use of an active lane mask with
7742 // tail-folding, and we use the appropriate mask for the block.
7743 // 2) No mask is required for the block, but the only available
7744 // vector variant at this VF requires a mask, so we synthesize an
7745 // all-true mask.
7746 VPValue *Mask = nullptr;
7747 if (Legal->isMaskRequired(CI))
7748 Mask = getBlockInMask(Builder.getInsertBlock());
7749 else
7750 Mask = Plan.getOrAddLiveIn(
7751 ConstantInt::getTrue(IntegerType::getInt1Ty(Plan.getContext())));
7752
7753 Ops.insert(Ops.begin() + *MaskPos, Mask);
7754 }
7755
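    // The last operand of the original call VPInstruction is the callee; carry
    // it over as the final operand of the widened call recipe.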
7756 Ops.push_back(VPI->getOperand(VPI->getNumOperands() - 1));
7757 return new VPWidenCallRecipe(CI, Variant, Ops, VPI->getDebugLoc());
7758 }
7759
7760 return nullptr;
7761}
7762
7763bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
7764   assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
7765          !isa<StoreInst>(I) && "Instruction should have been handled earlier");
7766 // Instruction should be widened, unless it is scalar after vectorization,
7767 // scalarization is profitable or it is predicated.
7768 auto WillScalarize = [this, I](ElementCount VF) -> bool {
7769 return CM.isScalarAfterVectorization(I, VF) ||
7770 CM.isProfitableToScalarize(I, VF) ||
7771 CM.isScalarWithPredication(I, VF);
7772 };
7773   return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize,
7774                                                              Range);
7775}
7776
7777VPWidenRecipe *VPRecipeBuilder::tryToWiden(VPInstruction *VPI) {
7778 auto *I = VPI->getUnderlyingInstr();
7779 switch (VPI->getOpcode()) {
7780 default:
7781 return nullptr;
7782 case Instruction::SDiv:
7783 case Instruction::UDiv:
7784 case Instruction::SRem:
7785 case Instruction::URem: {
7786 // If not provably safe, use a select to form a safe divisor before widening the
7787 // div/rem operation itself. Otherwise fall through to general handling below.
7788 if (CM.isPredicatedInst(I)) {
7789       SmallVector<VPValue *> Ops(VPI->operands());
7790       VPValue *Mask = getBlockInMask(Builder.getInsertBlock());
7791 VPValue *One = Plan.getConstantInt(I->getType(), 1u);
7792 auto *SafeRHS =
7793 Builder.createSelect(Mask, Ops[1], One, VPI->getDebugLoc());
7794 Ops[1] = SafeRHS;
7795 return new VPWidenRecipe(*I, Ops);
7796 }
7797 [[fallthrough]];
7798 }
7799 case Instruction::Add:
7800 case Instruction::And:
7801 case Instruction::AShr:
7802 case Instruction::FAdd:
7803 case Instruction::FCmp:
7804 case Instruction::FDiv:
7805 case Instruction::FMul:
7806 case Instruction::FNeg:
7807 case Instruction::FRem:
7808 case Instruction::FSub:
7809 case Instruction::ICmp:
7810 case Instruction::LShr:
7811 case Instruction::Mul:
7812 case Instruction::Or:
7813 case Instruction::Select:
7814 case Instruction::Shl:
7815 case Instruction::Sub:
7816 case Instruction::Xor:
7817 case Instruction::Freeze: {
7818 SmallVector<VPValue *> NewOps(VPI->operands());
7819 if (Instruction::isBinaryOp(VPI->getOpcode())) {
7820 // The legacy cost model uses SCEV to check if some of the operands are
7821 // constants. To match the legacy cost model's behavior, use SCEV to try
7822 // to replace operands with constants.
7823 ScalarEvolution &SE = *PSE.getSE();
7824 auto GetConstantViaSCEV = [this, &SE](VPValue *Op) {
7825 if (!Op->isLiveIn())
7826 return Op;
7827 Value *V = Op->getUnderlyingValue();
7828 if (isa<Constant>(V) || !SE.isSCEVable(V->getType()))
7829 return Op;
7830 auto *C = dyn_cast<SCEVConstant>(SE.getSCEV(V));
7831 if (!C)
7832 return Op;
7833 return Plan.getOrAddLiveIn(C->getValue());
7834 };
7835 // For Mul, the legacy cost model checks both operands.
7836 if (VPI->getOpcode() == Instruction::Mul)
7837 NewOps[0] = GetConstantViaSCEV(NewOps[0]);
7838 // For other binops, the legacy cost model only checks the second operand.
7839 NewOps[1] = GetConstantViaSCEV(NewOps[1]);
7840 }
7841 return new VPWidenRecipe(*I, NewOps);
7842 }
7843 case Instruction::ExtractValue: {
7844 SmallVector<VPValue *> NewOps(VPI->operands());
7845 auto *EVI = cast<ExtractValueInst>(I);
7846 assert(EVI->getNumIndices() == 1 && "Expected one extractvalue index");
7847 unsigned Idx = EVI->getIndices()[0];
7848 NewOps.push_back(Plan.getConstantInt(32, Idx));
7849 return new VPWidenRecipe(*I, NewOps);
7850 }
7851 };
7852}
7853
7854VPHistogramRecipe *VPRecipeBuilder::tryToWidenHistogram(const HistogramInfo *HI,
7855 VPInstruction *VPI) {
7856 // FIXME: Support other operations.
7857 unsigned Opcode = HI->Update->getOpcode();
7858 assert((Opcode == Instruction::Add || Opcode == Instruction::Sub) &&
7859 "Histogram update operation must be an Add or Sub");
7860
7861   SmallVector<VPValue *, 3> HGramOps;
7862   // Bucket address.
7863 HGramOps.push_back(VPI->getOperand(1));
7864 // Increment value.
7865 HGramOps.push_back(getVPValueOrAddLiveIn(HI->Update->getOperand(1)));
7866
7867 // In case of predicated execution (due to tail-folding, or conditional
7868 // execution, or both), pass the relevant mask.
7869 if (Legal->isMaskRequired(HI->Store))
7870 HGramOps.push_back(getBlockInMask(Builder.getInsertBlock()));
7871
7872 return new VPHistogramRecipe(Opcode, HGramOps, VPI->getDebugLoc());
7873}
7874
7875 VPReplicateRecipe *VPRecipeBuilder::handleReplication(VPInstruction *VPI,
7876                                                       VFRange &Range) {
7877 auto *I = VPI->getUnderlyingInstr();
7878   bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
7879       [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
7880 Range);
7881
7882 bool IsPredicated = CM.isPredicatedInst(I);
7883
7884 // Even if the instruction is not marked as uniform, there are certain
7885 // intrinsic calls that can be effectively treated as such, so we check for
7886 // them here. Conservatively, we only do this for scalable vectors, since
7887 // for fixed-width VFs we can always fall back on full scalarization.
7888 if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) {
7889 switch (cast<IntrinsicInst>(I)->getIntrinsicID()) {
7890 case Intrinsic::assume:
7891 case Intrinsic::lifetime_start:
7892 case Intrinsic::lifetime_end:
7893 // For scalable vectors if one of the operands is variant then we still
7894 // want to mark as uniform, which will generate one instruction for just
7895 // the first lane of the vector. We can't scalarize the call in the same
7896 // way as for fixed-width vectors because we don't know how many lanes
7897 // there are.
7898 //
7899 // The reasons for doing it this way for scalable vectors are:
7900 // 1. For the assume intrinsic generating the instruction for the first
7901       //    lane is still better than not generating any at all. For
7902 // example, the input may be a splat across all lanes.
7903 // 2. For the lifetime start/end intrinsics the pointer operand only
7904 // does anything useful when the input comes from a stack object,
7905 // which suggests it should always be uniform. For non-stack objects
7906 // the effect is to poison the object, which still allows us to
7907 // remove the call.
7908 IsUniform = true;
7909 break;
7910 default:
7911 break;
7912 }
7913 }
7914 VPValue *BlockInMask = nullptr;
7915 if (!IsPredicated) {
7916 // Finalize the recipe for Instr, first if it is not predicated.
7917 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
7918 } else {
7919 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
7920 // Instructions marked for predication are replicated and a mask operand is
7921 // added initially. Masked replicate recipes will later be placed under an
7922 // if-then construct to prevent side-effects. Generate recipes to compute
7923 // the block mask for this region.
7924 BlockInMask = getBlockInMask(Builder.getInsertBlock());
7925 }
7926
7927 // Note that there is some custom logic to mark some intrinsics as uniform
7928 // manually above for scalable vectors, which this assert needs to account for
7929 // as well.
7930 assert((Range.Start.isScalar() || !IsUniform || !IsPredicated ||
7931 (Range.Start.isScalable() && isa<IntrinsicInst>(I))) &&
7932 "Should not predicate a uniform recipe");
7933 auto *Recipe = new VPReplicateRecipe(I, VPI->operands(), IsUniform,
7934 BlockInMask, VPIRMetadata(*I, LVer));
7935 return Recipe;
7936}
7937
7938/// Find all possible partial reductions in the loop and track all of those that
7939/// are valid so recipes can be formed later.
7940 void VPRecipeBuilder::collectScaledReductions(VFRange &Range) {
7941   // Find all possible partial reductions.
7942   SmallVector<std::pair<PartialReductionChain, unsigned>>
7943       PartialReductionChains;
7944 for (const auto &[Phi, RdxDesc] : Legal->getReductionVars()) {
7945 getScaledReductions(Phi, RdxDesc.getLoopExitInstr(), Range,
7946 PartialReductionChains);
7947 }
7948
7949 // A partial reduction is invalid if any of its extends are used by
7950 // something that isn't another partial reduction. This is because the
7951 // extends are intended to be lowered along with the reduction itself.
7952
7953 // Build up a set of partial reduction ops for efficient use checking.
7954 SmallPtrSet<User *, 4> PartialReductionOps;
7955 for (const auto &[PartialRdx, _] : PartialReductionChains)
7956 PartialReductionOps.insert(PartialRdx.ExtendUser);
7957
7958 auto ExtendIsOnlyUsedByPartialReductions =
7959 [&PartialReductionOps](Instruction *Extend) {
7960 return all_of(Extend->users(), [&](const User *U) {
7961 return PartialReductionOps.contains(U);
7962 });
7963 };
7964
7965 // Check if each use of a chain's two extends is a partial reduction
7966 // and only add those that don't have non-partial reduction users.
7967 for (auto Pair : PartialReductionChains) {
7968 PartialReductionChain Chain = Pair.first;
7969 if (ExtendIsOnlyUsedByPartialReductions(Chain.ExtendA) &&
7970 (!Chain.ExtendB || ExtendIsOnlyUsedByPartialReductions(Chain.ExtendB)))
7971 ScaledReductionMap.try_emplace(Chain.Reduction, Pair.second);
7972 }
7973
7974 // Check that all partial reductions in a chain are only used by other
7975 // partial reductions with the same scale factor. Otherwise we end up creating
7976 // users of scaled reductions where the types of the other operands don't
7977 // match.
7978 for (const auto &[Chain, Scale] : PartialReductionChains) {
7979 auto AllUsersPartialRdx = [ScaleVal = Scale, this](const User *U) {
7980 auto *UI = cast<Instruction>(U);
7981 if (isa<PHINode>(UI) && UI->getParent() == OrigLoop->getHeader()) {
7982 return all_of(UI->users(), [ScaleVal, this](const User *U) {
7983 auto *UI = cast<Instruction>(U);
7984 return ScaledReductionMap.lookup_or(UI, 0) == ScaleVal;
7985 });
7986 }
7987 return ScaledReductionMap.lookup_or(UI, 0) == ScaleVal ||
7988 !OrigLoop->contains(UI->getParent());
7989 };
7990 if (!all_of(Chain.Reduction->users(), AllUsersPartialRdx))
7991 ScaledReductionMap.erase(Chain.Reduction);
7992 }
7993}
7994
7995bool VPRecipeBuilder::getScaledReductions(
7996 Instruction *PHI, Instruction *RdxExitInstr, VFRange &Range,
7997 SmallVectorImpl<std::pair<PartialReductionChain, unsigned>> &Chains) {
7998 if (!CM.TheLoop->contains(RdxExitInstr))
7999 return false;
8000
8001 auto *Update = dyn_cast<BinaryOperator>(RdxExitInstr);
8002 if (!Update)
8003 return false;
8004
8005 Value *Op = Update->getOperand(0);
8006 Value *PhiOp = Update->getOperand(1);
8007 if (Op == PHI)
8008 std::swap(Op, PhiOp);
8009
8010   // Try to get a scaled reduction from the first non-phi operand.
8011 // If one is found, we use the discovered reduction instruction in
8012 // place of the accumulator for costing.
8013 if (auto *OpInst = dyn_cast<Instruction>(Op)) {
8014 if (getScaledReductions(PHI, OpInst, Range, Chains)) {
8015 PHI = Chains.rbegin()->first.Reduction;
8016
8017 Op = Update->getOperand(0);
8018 PhiOp = Update->getOperand(1);
8019 if (Op == PHI)
8020 std::swap(Op, PhiOp);
8021 }
8022 }
8023 if (PhiOp != PHI)
8024 return false;
8025
8026 using namespace llvm::PatternMatch;
8027
8028 // If the update is a binary operator, check both of its operands to see if
8029 // they are extends. Otherwise, see if the update comes directly from an
8030 // extend.
8031 Instruction *Exts[2] = {nullptr};
8032 BinaryOperator *ExtendUser = dyn_cast<BinaryOperator>(Op);
8033 std::optional<unsigned> BinOpc;
8034 Type *ExtOpTypes[2] = {nullptr};
8035   TTI::PartialReductionExtendKind ExtKinds[2] = {TTI::PR_None, TTI::PR_None};
8036
8037 auto CollectExtInfo = [this, &Exts, &ExtOpTypes,
8038 &ExtKinds](SmallVectorImpl<Value *> &Ops) -> bool {
8039 for (const auto &[I, OpI] : enumerate(Ops)) {
8040 const APInt *C;
8041 if (I > 0 && match(OpI, m_APInt(C)) &&
8042 canConstantBeExtended(C, ExtOpTypes[0], ExtKinds[0])) {
8043 ExtOpTypes[I] = ExtOpTypes[0];
8044 ExtKinds[I] = ExtKinds[0];
8045 continue;
8046 }
8047 Value *ExtOp;
8048 if (!match(OpI, m_ZExtOrSExt(m_Value(ExtOp))))
8049 return false;
8050 Exts[I] = cast<Instruction>(OpI);
8051
8052 // TODO: We should be able to support live-ins.
8053 if (!CM.TheLoop->contains(Exts[I]))
8054 return false;
8055
8056 ExtOpTypes[I] = ExtOp->getType();
8057 ExtKinds[I] = TTI::getPartialReductionExtendKind(Exts[I]);
8058 }
8059 return true;
8060 };
8061
8062 if (ExtendUser) {
8063 if (!ExtendUser->hasOneUse())
8064 return false;
8065
8066 // Use the side-effect of match to replace BinOp only if the pattern is
8067 // matched, we don't care at this point whether it actually matched.
8068 match(ExtendUser, m_Neg(m_BinOp(ExtendUser)));
8069
8070 SmallVector<Value *> Ops(ExtendUser->operands());
8071 if (!CollectExtInfo(Ops))
8072 return false;
8073
8074 BinOpc = std::make_optional(ExtendUser->getOpcode());
8075 } else if (match(Update, m_Add(m_Value(), m_Value()))) {
8076 // We already know the operands for Update are Op and PhiOp.
8077     SmallVector<Value *> Ops({Op});
8078     if (!CollectExtInfo(Ops))
8079 return false;
8080
8081 ExtendUser = Update;
8082 BinOpc = std::nullopt;
8083 } else
8084 return false;
8085
8086 PartialReductionChain Chain(RdxExitInstr, Exts[0], Exts[1], ExtendUser);
8087
8088 TypeSize PHISize = PHI->getType()->getPrimitiveSizeInBits();
8089 TypeSize ASize = ExtOpTypes[0]->getPrimitiveSizeInBits();
8090 if (!PHISize.hasKnownScalarFactor(ASize))
8091 return false;
8092 unsigned TargetScaleFactor = PHISize.getKnownScalarFactor(ASize);
8093
8094   if (LoopVectorizationPlanner::getDecisionAndClampRange(
8095           [&](ElementCount VF) {
8096             InstructionCost Cost = TTI->getPartialReductionCost(
8097                 Update->getOpcode(), ExtOpTypes[0], ExtOpTypes[1],
8098 PHI->getType(), VF, ExtKinds[0], ExtKinds[1], BinOpc,
8099 CM.CostKind);
8100 return Cost.isValid();
8101 },
8102 Range)) {
8103 Chains.emplace_back(Chain, TargetScaleFactor);
8104 return true;
8105 }
8106
8107 return false;
8108}
8109
8110 VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(VPSingleDefRecipe *R,
8111                                                       VFRange &Range) {
8112 // First, check for specific widening recipes that deal with inductions, Phi
8113 // nodes, calls and memory operations.
8114 VPRecipeBase *Recipe;
8115 if (auto *PhiR = dyn_cast<VPPhi>(R)) {
8116 VPBasicBlock *Parent = PhiR->getParent();
8117 [[maybe_unused]] VPRegionBlock *LoopRegionOf =
8118 Parent->getEnclosingLoopRegion();
8119 assert(LoopRegionOf && LoopRegionOf->getEntry() == Parent &&
8120 "Non-header phis should have been handled during predication");
8121 auto *Phi = cast<PHINode>(R->getUnderlyingInstr());
8122 assert(R->getNumOperands() == 2 && "Must have 2 operands for header phis");
8123 if ((Recipe = tryToOptimizeInductionPHI(PhiR, Range)))
8124 return Recipe;
8125
8126 VPHeaderPHIRecipe *PhiRecipe = nullptr;
8127 assert((Legal->isReductionVariable(Phi) ||
8128 Legal->isFixedOrderRecurrence(Phi)) &&
8129 "can only widen reductions and fixed-order recurrences here");
8130 VPValue *StartV = R->getOperand(0);
8131 if (Legal->isReductionVariable(Phi)) {
8132 const RecurrenceDescriptor &RdxDesc = Legal->getRecurrenceDescriptor(Phi);
8133 assert(RdxDesc.getRecurrenceStartValue() ==
8134 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
8135
8136 // If the PHI is used by a partial reduction, set the scale factor.
8137 unsigned ScaleFactor =
8138 getScalingForReduction(RdxDesc.getLoopExitInstr()).value_or(1);
8139 PhiRecipe = new VPReductionPHIRecipe(
8140 Phi, RdxDesc.getRecurrenceKind(), *StartV, CM.isInLoopReduction(Phi),
8141 CM.useOrderedReductions(RdxDesc), ScaleFactor);
8142 } else {
8143 // TODO: Currently fixed-order recurrences are modeled as chains of
8144 // first-order recurrences. If there are no users of the intermediate
8145 // recurrences in the chain, the fixed order recurrence should be modeled
8146 // directly, enabling more efficient codegen.
8147 PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV);
8148 }
8149 // Add backedge value.
8150 PhiRecipe->addOperand(R->getOperand(1));
8151 return PhiRecipe;
8152 }
8153 assert(!R->isPhi() && "only VPPhi nodes expected at this point");
8154
8155 auto *VPI = cast<VPInstruction>(R);
8156 Instruction *Instr = R->getUnderlyingInstr();
8157 if (VPI->getOpcode() == Instruction::Trunc &&
8158 (Recipe = tryToOptimizeInductionTruncate(VPI, Range)))
8159 return Recipe;
8160
8161 // All widen recipes below deal only with VF > 1.
8162   if (LoopVectorizationPlanner::getDecisionAndClampRange(
8163           [&](ElementCount VF) { return VF.isScalar(); }, Range))
8164 return nullptr;
8165
8166 if (VPI->getOpcode() == Instruction::Call)
8167 return tryToWidenCall(VPI, Range);
8168
8169 if (VPI->getOpcode() == Instruction::Store)
8170 if (auto HistInfo = Legal->getHistogramInfo(cast<StoreInst>(Instr)))
8171 return tryToWidenHistogram(*HistInfo, VPI);
8172
8173 if (VPI->getOpcode() == Instruction::Load ||
8174 VPI->getOpcode() == Instruction::Store)
8175 return tryToWidenMemory(VPI, Range);
8176
8177 if (std::optional<unsigned> ScaleFactor = getScalingForReduction(Instr))
8178 return tryToCreatePartialReduction(VPI, ScaleFactor.value());
8179
8180 if (!shouldWiden(Instr, Range))
8181 return nullptr;
8182
8183 if (VPI->getOpcode() == Instruction::GetElementPtr)
8184 return new VPWidenGEPRecipe(cast<GetElementPtrInst>(Instr), R->operands());
8185
8186 if (VPI->getOpcode() == Instruction::Select)
8187 return new VPWidenSelectRecipe(*cast<SelectInst>(Instr), R->operands());
8188
8189 if (Instruction::isCast(VPI->getOpcode())) {
8190 auto *CastR = cast<VPInstructionWithType>(R);
8191 auto *CI = cast<CastInst>(Instr);
8192 return new VPWidenCastRecipe(CI->getOpcode(), VPI->getOperand(0),
8193 CastR->getResultType(), *CI);
8194 }
8195
8196 return tryToWiden(VPI);
8197}
8198
8199 VPRecipeBase *
8200 VPRecipeBuilder::tryToCreatePartialReduction(VPInstruction *Reduction,
8201                                              unsigned ScaleFactor) {
8202 assert(Reduction->getNumOperands() == 2 &&
8203 "Unexpected number of operands for partial reduction");
8204
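  // Identify the accumulator: the operand defined by the reduction phi or by
  // another partial reduction earlier in the chain. The remaining operand is
  // the value to accumulate.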
8205 VPValue *BinOp = Reduction->getOperand(0);
8206 VPValue *Accumulator = Reduction->getOperand(1);
8207 VPRecipeBase *BinOpRecipe = BinOp->getDefiningRecipe();
8208 if (isa<VPReductionPHIRecipe>(BinOpRecipe) ||
8209 isa<VPPartialReductionRecipe>(BinOpRecipe))
8210 std::swap(BinOp, Accumulator);
8211
8212 assert(ScaleFactor ==
8213 vputils::getVFScaleFactor(Accumulator->getDefiningRecipe()) &&
8214 "all accumulators in chain must have same scale factor");
8215
8216 unsigned ReductionOpcode = Reduction->getOpcode();
8217 auto *ReductionI = Reduction->getUnderlyingInstr();
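  // A subtracting reduction is handled by negating the input (0 - BinOp) with
  // a widened Sub and then emitting the partial reduction itself as an Add.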
8218 if (ReductionOpcode == Instruction::Sub) {
8219 auto *const Zero = ConstantInt::get(ReductionI->getType(), 0);
8220     SmallVector<VPValue *, 2> Ops;
8221     Ops.push_back(Plan.getOrAddLiveIn(Zero));
8222 Ops.push_back(BinOp);
8223 BinOp = new VPWidenRecipe(*ReductionI, Ops);
8224 Builder.insert(BinOp->getDefiningRecipe());
8225 ReductionOpcode = Instruction::Add;
8226 }
8227
8228 VPValue *Cond = nullptr;
8229 if (CM.blockNeedsPredicationForAnyReason(ReductionI->getParent())) {
8230 assert((ReductionOpcode == Instruction::Add ||
8231 ReductionOpcode == Instruction::Sub) &&
8232 "Expected an ADD or SUB operation for predicated partial "
8233 "reductions (because the neutral element in the mask is zero)!");
8234 Cond = getBlockInMask(Builder.getInsertBlock());
8235 VPValue *Zero = Plan.getConstantInt(ReductionI->getType(), 0);
8236 BinOp = Builder.createSelect(Cond, BinOp, Zero, Reduction->getDebugLoc());
8237 }
8238 return new VPPartialReductionRecipe(ReductionOpcode, Accumulator, BinOp, Cond,
8239 ScaleFactor, ReductionI);
8240}
8241
8242void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
8243 ElementCount MaxVF) {
8244 if (ElementCount::isKnownGT(MinVF, MaxVF))
8245 return;
8246
8247 assert(OrigLoop->isInnermost() && "Inner loop expected.");
8248
8249 const LoopAccessInfo *LAI = Legal->getLAI();
8250   LoopVersioning LVer(*LAI, LAI->getRuntimePointerChecking()->getChecks(),
8251                       OrigLoop, LI, DT, PSE.getSE());
8252 if (!LAI->getRuntimePointerChecking()->getChecks().empty() &&
8253       !LAI->getRuntimePointerChecking()->getDiffChecks()) {
8254     // Only use noalias metadata when using memory checks guaranteeing no
8255 // overlap across all iterations.
8256 LVer.prepareNoAliasMetadata();
8257 }
8258
8259 // Create initial base VPlan0, to serve as common starting point for all
8260 // candidates built later for specific VF ranges.
8261 auto VPlan0 = VPlanTransforms::buildVPlan0(
8262 OrigLoop, *LI, Legal->getWidestInductionType(),
8263 getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()), PSE);
8264
8265 auto MaxVFTimes2 = MaxVF * 2;
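  // Build one candidate VPlan per VF sub-range; each call clamps SubRange.End
  // so that all VFs in the sub-range share the same widening decisions, and the
  // loop continues from the clamped end.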
8266 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) {
8267 VFRange SubRange = {VF, MaxVFTimes2};
8268 if (auto Plan = tryToBuildVPlanWithVPRecipes(
8269 std::unique_ptr<VPlan>(VPlan0->duplicate()), SubRange, &LVer)) {
8270 // Now optimize the initial VPlan.
8271       VPlanTransforms::runPass(VPlanTransforms::truncateToMinimalBitwidths,
8272                                *Plan, CM.getMinimalBitwidths());
8273       VPlanTransforms::runPass(VPlanTransforms::optimize, *Plan);
8274 // TODO: try to put it close to addActiveLaneMask().
8275 if (CM.foldTailWithEVL())
8276         VPlanTransforms::runPass(VPlanTransforms::addExplicitVectorLength,
8277                                  *Plan, CM.getMaxSafeElements());
8278 assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
8279 VPlans.push_back(std::move(Plan));
8280 }
8281 VF = SubRange.End;
8282 }
8283}
8284
8285VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
8286 VPlanPtr Plan, VFRange &Range, LoopVersioning *LVer) {
8287
8288 using namespace llvm::VPlanPatternMatch;
8289 SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
8290
8291 // ---------------------------------------------------------------------------
8292 // Build initial VPlan: Scan the body of the loop in a topological order to
8293 // visit each basic block after having visited its predecessor basic blocks.
8294 // ---------------------------------------------------------------------------
8295
8296 bool RequiresScalarEpilogueCheck =
8297       LoopVectorizationPlanner::getDecisionAndClampRange(
8298           [this](ElementCount VF) {
8299 return !CM.requiresScalarEpilogue(VF.isVector());
8300 },
8301 Range);
8302 VPlanTransforms::handleEarlyExits(*Plan, Legal->hasUncountableEarlyExit());
8303 VPlanTransforms::addMiddleCheck(*Plan, RequiresScalarEpilogueCheck,
8304 CM.foldTailByMasking());
8305
8306   VPlanTransforms::createLoopRegions(*Plan);
8307
8308   // Don't use getDecisionAndClampRange here, because we don't know the UF
8309   // yet; it is better to be conservative here rather than to split this up
8310   // into different VPlans.
8311 // TODO: Consider using getDecisionAndClampRange here to split up VPlans.
8312 bool IVUpdateMayOverflow = false;
8313 for (ElementCount VF : Range)
8314 IVUpdateMayOverflow |= !isIndvarOverflowCheckKnownFalse(&CM, VF);
8315
8316 TailFoldingStyle Style = CM.getTailFoldingStyle(IVUpdateMayOverflow);
8317 // Use NUW for the induction increment if we proved that it won't overflow in
8318 // the vector loop or when not folding the tail. In the later case, we know
8319 // that the canonical induction increment will not overflow as the vector trip
8320 // count is >= increment and a multiple of the increment.
8321 VPRegionBlock *LoopRegion = Plan->getVectorLoopRegion();
8322 bool HasNUW = !IVUpdateMayOverflow || Style == TailFoldingStyle::None;
8323 if (!HasNUW) {
8324 auto *IVInc =
8325 LoopRegion->getExitingBasicBlock()->getTerminator()->getOperand(0);
8326 assert(match(IVInc,
8327 m_VPInstruction<Instruction::Add>(
8328 m_Specific(LoopRegion->getCanonicalIV()), m_VPValue())) &&
8329 "Did not find the canonical IV increment");
8330 cast<VPRecipeWithIRFlags>(IVInc)->dropPoisonGeneratingFlags();
8331 }
8332
8333 // ---------------------------------------------------------------------------
8334 // Pre-construction: record ingredients whose recipes we'll need to further
8335 // process after constructing the initial VPlan.
8336 // ---------------------------------------------------------------------------
8337
8338 // For each interleave group which is relevant for this (possibly trimmed)
8339 // Range, add it to the set of groups to be later applied to the VPlan and add
8340 // placeholders for its members' Recipes which we'll be replacing with a
8341 // single VPInterleaveRecipe.
8342 for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
8343 auto ApplyIG = [IG, this](ElementCount VF) -> bool {
8344 bool Result = (VF.isVector() && // Query is illegal for VF == 1
8345 CM.getWideningDecision(IG->getInsertPos(), VF) ==
8346                        LoopVectorizationCostModel::CM_Interleave);
8347     // For scalable vectors, the interleave factors must be <= 8 since we
8348 // require the (de)interleaveN intrinsics instead of shufflevectors.
8349 assert((!Result || !VF.isScalable() || IG->getFactor() <= 8) &&
8350 "Unsupported interleave factor for scalable vectors");
8351 return Result;
8352 };
8353 if (!getDecisionAndClampRange(ApplyIG, Range))
8354 continue;
8355 InterleaveGroups.insert(IG);
8356 }
8357
8358 // ---------------------------------------------------------------------------
8359 // Predicate and linearize the top-level loop region.
8360 // ---------------------------------------------------------------------------
8361 auto BlockMaskCache = VPlanTransforms::introduceMasksAndLinearize(
8362 *Plan, CM.foldTailByMasking());
8363
8364 // ---------------------------------------------------------------------------
8365 // Construct wide recipes and apply predication for original scalar
8366 // VPInstructions in the loop.
8367 // ---------------------------------------------------------------------------
8368 VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, &TTI, Legal, CM, PSE,
8369 Builder, BlockMaskCache, LVer);
8370 RecipeBuilder.collectScaledReductions(Range);
8371
8372 // Scan the body of the loop in a topological order to visit each basic block
8373 // after having visited its predecessor basic blocks.
8374 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
8375 ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
8376 HeaderVPBB);
8377
8378 auto *MiddleVPBB = Plan->getMiddleBlock();
8379 VPBasicBlock::iterator MBIP = MiddleVPBB->getFirstNonPhi();
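  // MBIP is the insertion point in the middle block for the uniform stores of
  // invariant-address reduction results created below.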
8380 // Mapping from VPValues in the initial plan to their widened VPValues. Needed
8381 // temporarily to update created block masks.
8382 DenseMap<VPValue *, VPValue *> Old2New;
8383 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
8384 // Convert input VPInstructions to widened recipes.
8385 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
8386 auto *SingleDef = cast<VPSingleDefRecipe>(&R);
8387 auto *UnderlyingValue = SingleDef->getUnderlyingValue();
8388 // Skip recipes that do not need transforming, including canonical IV,
8389 // wide canonical IV and VPInstructions without underlying values. The
8390 // latter are added above for masking.
8391 // FIXME: Migrate code relying on the underlying instruction from VPlan0
8392 // to construct recipes below to not use the underlying instruction.
8393       if (isa<VPCanonicalIVPHIRecipe, VPWidenCanonicalIVRecipe>(
8394               &R) ||
8395 (isa<VPInstruction>(&R) && !UnderlyingValue))
8396 continue;
8397 assert(isa<VPInstruction>(&R) && UnderlyingValue && "unsupported recipe");
8398
8399 // TODO: Gradually replace uses of underlying instruction by analyses on
8400 // VPlan.
8401 Instruction *Instr = cast<Instruction>(UnderlyingValue);
8402 Builder.setInsertPoint(SingleDef);
8403
8404 // The stores with invariant address inside the loop will be deleted, and
8405 // in the exit block, a uniform store recipe will be created for the final
8406 // invariant store of the reduction.
8407 StoreInst *SI;
8408 if ((SI = dyn_cast<StoreInst>(Instr)) &&
8409 Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) {
8410 // Only create recipe for the final invariant store of the reduction.
8411 if (Legal->isInvariantStoreOfReduction(SI)) {
8412 auto *Recipe =
8413 new VPReplicateRecipe(SI, R.operands(), true /* IsUniform */,
8414 nullptr /*Mask*/, VPIRMetadata(*SI, LVer));
8415 Recipe->insertBefore(*MiddleVPBB, MBIP);
8416 }
8417 R.eraseFromParent();
8418 continue;
8419 }
8420
8421 VPRecipeBase *Recipe =
8422 RecipeBuilder.tryToCreateWidenRecipe(SingleDef, Range);
8423 if (!Recipe)
8424 Recipe = RecipeBuilder.handleReplication(cast<VPInstruction>(SingleDef),
8425 Range);
8426
8427 RecipeBuilder.setRecipe(Instr, Recipe);
8428 if (isa<VPWidenIntOrFpInductionRecipe>(Recipe) && isa<TruncInst>(Instr)) {
8429 // Optimized a truncate to VPWidenIntOrFpInductionRecipe. It needs to be
8430 // moved to the phi section in the header.
8431 Recipe->insertBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi());
8432 } else {
8433 Builder.insert(Recipe);
8434 }
8435 if (Recipe->getNumDefinedValues() == 1) {
8436 SingleDef->replaceAllUsesWith(Recipe->getVPSingleValue());
8437 Old2New[SingleDef] = Recipe->getVPSingleValue();
8438 } else {
8439 assert(Recipe->getNumDefinedValues() == 0 &&
8440 "Unexpected multidef recipe");
8441 R.eraseFromParent();
8442 }
8443 }
8444 }
8445
8446 // replaceAllUsesWith above may invalidate the block masks. Update them here.
8447 // TODO: Include the masks as operands in the predicated VPlan directly
8448 // to remove the need to keep a map of masks beyond the predication
8449 // transform.
8450 RecipeBuilder.updateBlockMaskCache(Old2New);
8451 for (VPValue *Old : Old2New.keys())
8452 Old->getDefiningRecipe()->eraseFromParent();
8453
8454 assert(isa<VPRegionBlock>(LoopRegion) &&
8455 !LoopRegion->getEntryBasicBlock()->empty() &&
8456 "entry block must be set to a VPRegionBlock having a non-empty entry "
8457 "VPBasicBlock");
8458
8459 // Update wide induction increments to use the same step as the corresponding
8460 // wide induction. This enables detecting induction increments directly in
8461 // VPlan and removes redundant splats.
8462 for (const auto &[Phi, ID] : Legal->getInductionVars()) {
8463 auto *IVInc = cast<Instruction>(
8464 Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch()));
8465 if (IVInc->getOperand(0) != Phi || IVInc->getOpcode() != Instruction::Add)
8466 continue;
8467 VPWidenInductionRecipe *WideIV =
8468 cast<VPWidenInductionRecipe>(RecipeBuilder.getRecipe(Phi));
8469 VPRecipeBase *R = RecipeBuilder.getRecipe(IVInc);
8470 R->setOperand(1, WideIV->getStepValue());
8471 }
8472
8473 // TODO: We can't call runPass on these transforms yet, due to verifier
8474 // failures.
8476 DenseMap<VPValue *, VPValue *> IVEndValues;
8477 VPlanTransforms::addScalarResumePhis(*Plan, RecipeBuilder, IVEndValues);
8478
8479 // ---------------------------------------------------------------------------
8480 // Transform initial VPlan: Apply previously taken decisions, in order, to
8481 // bring the VPlan to its final state.
8482 // ---------------------------------------------------------------------------
8483
8484 // Adjust the recipes for any inloop reductions.
8485 adjustRecipesForReductions(Plan, RecipeBuilder, Range.Start);
8486
8487 // Apply mandatory transformation to handle FP maxnum/minnum reduction with
8488 // NaNs if possible, bail out otherwise.
8490 *Plan))
8491 return nullptr;
8492
8493 // Transform recipes to abstract recipes if it is legal and beneficial and
8494 // clamp the range for better cost estimation.
8495 // TODO: Enable following transform when the EVL-version of extended-reduction
8496 // and mulacc-reduction are implemented.
8497 if (!CM.foldTailWithEVL()) {
8498 VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind,
8499 *CM.PSE.getSE(), OrigLoop);
8500     VPlanTransforms::runPass(VPlanTransforms::convertToAbstractRecipes, *Plan,
8501                              CostCtx, Range);
8502 }
8503
8504 for (ElementCount VF : Range)
8505 Plan->addVF(VF);
8506 Plan->setName("Initial VPlan");
8507
8508 // Interleave memory: for each Interleave Group we marked earlier as relevant
8509 // for this VPlan, replace the Recipes widening its memory instructions with a
8510 // single VPInterleaveRecipe at its insertion point.
8511   VPlanTransforms::createInterleaveGroups(*Plan,
8512                                           InterleaveGroups, RecipeBuilder,
8513 CM.isScalarEpilogueAllowed());
8514
8515 // Replace VPValues for known constant strides.
8517 Legal->getLAI()->getSymbolicStrides());
8518
8519 auto BlockNeedsPredication = [this](BasicBlock *BB) {
8520 return Legal->blockNeedsPredication(BB);
8521 };
8522   VPlanTransforms::runPass(VPlanTransforms::dropPoisonGeneratingRecipes, *Plan,
8523                            BlockNeedsPredication);
8524
8525 // Sink users of fixed-order recurrence past the recipe defining the previous
8526 // value and introduce FirstOrderRecurrenceSplice VPInstructions.
8527   if (!VPlanTransforms::runPass(VPlanTransforms::adjustFixedOrderRecurrences,
8528                                 *Plan, Builder))
8529 return nullptr;
8530
8531 if (useActiveLaneMask(Style)) {
8532 // TODO: Move checks to VPlanTransforms::addActiveLaneMask once
8533 // TailFoldingStyle is visible there.
8534 bool ForControlFlow = useActiveLaneMaskForControlFlow(Style);
8535 bool WithoutRuntimeCheck =
8536 Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
8537 VPlanTransforms::addActiveLaneMask(*Plan, ForControlFlow,
8538 WithoutRuntimeCheck);
8539 }
8540 VPlanTransforms::optimizeInductionExitUsers(*Plan, IVEndValues, *PSE.getSE());
8541
8542 assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
8543 return Plan;
8544}
8545
8546VPlanPtr LoopVectorizationPlanner::tryToBuildVPlan(VFRange &Range) {
8547 // Outer loop handling: They may require CFG and instruction level
8548 // transformations before even evaluating whether vectorization is profitable.
8549 // Since we cannot modify the incoming IR, we need to build VPlan upfront in
8550 // the vectorization pipeline.
8551 assert(!OrigLoop->isInnermost());
8552 assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
8553
8554 auto Plan = VPlanTransforms::buildVPlan0(
8555 OrigLoop, *LI, Legal->getWidestInductionType(),
8556 getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()), PSE);
8557   VPlanTransforms::handleEarlyExits(*Plan,
8558                                     /*HasUncountableExit*/ false);
8559 VPlanTransforms::addMiddleCheck(*Plan, /*RequiresScalarEpilogue*/ true,
8560 /*TailFolded*/ false);
8561
8562   VPlanTransforms::createLoopRegions(*Plan);
8563
8564 for (ElementCount VF : Range)
8565 Plan->addVF(VF);
8566
8567   if (!VPlanTransforms::tryToConvertVPInstructionsToVPRecipes(
8568           *Plan,
8569 [this](PHINode *P) {
8570 return Legal->getIntOrFpInductionDescriptor(P);
8571 },
8572 *TLI))
8573 return nullptr;
8574
8575 // Collect mapping of IR header phis to header phi recipes, to be used in
8576 // addScalarResumePhis.
8577 DenseMap<VPBasicBlock *, VPValue *> BlockMaskCache;
8578 VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, &TTI, Legal, CM, PSE,
8579 Builder, BlockMaskCache, nullptr /*LVer*/);
8580 for (auto &R : Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
8581     if (isa<VPCanonicalIVPHIRecipe>(&R))
8582       continue;
8583 auto *HeaderR = cast<VPHeaderPHIRecipe>(&R);
8584 RecipeBuilder.setRecipe(HeaderR->getUnderlyingInstr(), HeaderR);
8585 }
8586 DenseMap<VPValue *, VPValue *> IVEndValues;
8587 // TODO: IVEndValues are not used yet in the native path, to optimize exit
8588 // values.
8589 // TODO: We can't call runPass on the transform yet, due to verifier
8590 // failures.
8591 VPlanTransforms::addScalarResumePhis(*Plan, RecipeBuilder, IVEndValues);
8592
8593 assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
8594 return Plan;
8595}
8596
8597// Adjust the recipes for reductions. For in-loop reductions the chain of
8598 // instructions leading from the loop exit instr to the phi needs to be converted
8599// to reductions, with one operand being vector and the other being the scalar
8600// reduction chain. For other reductions, a select is introduced between the phi
8601// and users outside the vector region when folding the tail.
8602//
8603// A ComputeReductionResult recipe is added to the middle block, also for
8604// in-loop reductions which compute their result in-loop, because generating
8605// the subsequent bc.merge.rdx phi is driven by ComputeReductionResult recipes.
8606//
8607// Adjust AnyOf reductions; replace the reduction phi for the selected value
8608// with a boolean reduction phi node to check if the condition is true in any
8609// iteration. The final value is selected by the final ComputeReductionResult.
8610void LoopVectorizationPlanner::adjustRecipesForReductions(
8611 VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, ElementCount MinVF) {
8612 using namespace VPlanPatternMatch;
8613 VPRegionBlock *VectorLoopRegion = Plan->getVectorLoopRegion();
8614 VPBasicBlock *Header = VectorLoopRegion->getEntryBasicBlock();
8615 VPBasicBlock *MiddleVPBB = Plan->getMiddleBlock();
8616   SmallVector<VPRecipeBase *> ToDelete;
8617
8618 for (VPRecipeBase &R : Header->phis()) {
8619 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
8620 if (!PhiR || !PhiR->isInLoop() || (MinVF.isScalar() && !PhiR->isOrdered()))
8621 continue;
8622
8623 RecurKind Kind = PhiR->getRecurrenceKind();
8624 assert(
8625         !RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) &&
8626         !RecurrenceDescriptor::isFindIVRecurrenceKind(Kind) &&
8627         "AnyOf and FindIV reductions are not allowed for in-loop reductions");
8628
8629 // Collect the chain of "link" recipes for the reduction starting at PhiR.
8630 SetVector<VPSingleDefRecipe *> Worklist;
8631 Worklist.insert(PhiR);
8632 for (unsigned I = 0; I != Worklist.size(); ++I) {
8633 VPSingleDefRecipe *Cur = Worklist[I];
8634 for (VPUser *U : Cur->users()) {
8635 auto *UserRecipe = cast<VPSingleDefRecipe>(U);
8636 if (!UserRecipe->getParent()->getEnclosingLoopRegion()) {
8637 assert((UserRecipe->getParent() == MiddleVPBB ||
8638 UserRecipe->getParent() == Plan->getScalarPreheader()) &&
8639 "U must be either in the loop region, the middle block or the "
8640 "scalar preheader.");
8641 continue;
8642 }
8643 Worklist.insert(UserRecipe);
8644 }
8645 }
8646
8647 // Visit operation "Links" along the reduction chain top-down starting from
8648 // the phi until LoopExitValue. We keep track of the previous item
8649 // (PreviousLink) to tell which of the two operands of a Link will remain
8650 // scalar and which will be reduced. For minmax by select(cmp), Link will be
8651 // the select instructions. Blend recipes of in-loop reduction phi's will
8652 // get folded to their non-phi operand, as the reduction recipe handles the
8653 // condition directly.
8654 VPSingleDefRecipe *PreviousLink = PhiR; // Aka Worklist[0].
8655 for (VPSingleDefRecipe *CurrentLink : drop_begin(Worklist)) {
8656 if (auto *Blend = dyn_cast<VPBlendRecipe>(CurrentLink)) {
8657 assert(Blend->getNumIncomingValues() == 2 &&
8658 "Blend must have 2 incoming values");
8659 if (Blend->getIncomingValue(0) == PhiR) {
8660 Blend->replaceAllUsesWith(Blend->getIncomingValue(1));
8661 } else {
8662 assert(Blend->getIncomingValue(1) == PhiR &&
8663 "PhiR must be an operand of the blend");
8664 Blend->replaceAllUsesWith(Blend->getIncomingValue(0));
8665 }
8666 continue;
8667 }
8668
8669 Instruction *CurrentLinkI = CurrentLink->getUnderlyingInstr();
8670
8671 // Index of the first operand which holds a non-mask vector operand.
8672 unsigned IndexOfFirstOperand;
8673 // Recognize a call to the llvm.fmuladd intrinsic.
8674 bool IsFMulAdd = (Kind == RecurKind::FMulAdd);
8675 VPValue *VecOp;
8676 VPBasicBlock *LinkVPBB = CurrentLink->getParent();
8677 if (IsFMulAdd) {
8678 assert(
8679             RecurrenceDescriptor::isFMulAddIntrinsic(CurrentLinkI) &&
8680             "Expected instruction to be a call to the llvm.fmuladd intrinsic");
8681 assert(((MinVF.isScalar() && isa<VPReplicateRecipe>(CurrentLink)) ||
8682 isa<VPWidenIntrinsicRecipe>(CurrentLink)) &&
8683 CurrentLink->getOperand(2) == PreviousLink &&
8684 "expected a call where the previous link is the added operand");
8685
8686 // If the instruction is a call to the llvm.fmuladd intrinsic then we
8687 // need to create an fmul recipe (multiplying the first two operands of
8688 // the fmuladd together) to use as the vector operand for the fadd
8689 // reduction.
8690 VPInstruction *FMulRecipe = new VPInstruction(
8691 Instruction::FMul,
8692 {CurrentLink->getOperand(0), CurrentLink->getOperand(1)},
8693 CurrentLinkI->getFastMathFlags());
8694 LinkVPBB->insert(FMulRecipe, CurrentLink->getIterator());
8695 VecOp = FMulRecipe;
8696 } else if (PhiR->isInLoop() && Kind == RecurKind::AddChainWithSubs &&
8697 CurrentLinkI->getOpcode() == Instruction::Sub) {
8698 Type *PhiTy = PhiR->getUnderlyingValue()->getType();
8699 auto *Zero = Plan->getConstantInt(PhiTy, 0);
8700 VPWidenRecipe *Sub = new VPWidenRecipe(
8701 Instruction::Sub, {Zero, CurrentLink->getOperand(1)}, {},
8702 VPIRMetadata(), CurrentLinkI->getDebugLoc());
8703 Sub->setUnderlyingValue(CurrentLinkI);
8704 LinkVPBB->insert(Sub, CurrentLink->getIterator());
8705 VecOp = Sub;
8706 } else {
8707         if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
8708           if (isa<VPWidenRecipe>(CurrentLink)) {
8709 assert(isa<CmpInst>(CurrentLinkI) &&
8710 "need to have the compare of the select");
8711 continue;
8712 }
8713 assert(isa<VPWidenSelectRecipe>(CurrentLink) &&
8714 "must be a select recipe");
8715 IndexOfFirstOperand = 1;
8716 } else {
8717 assert((MinVF.isScalar() || isa<VPWidenRecipe>(CurrentLink)) &&
8718 "Expected to replace a VPWidenSC");
8719 IndexOfFirstOperand = 0;
8720 }
8721 // Note that for non-commutable operands (cmp-selects), the semantics of
8722 // the cmp-select are captured in the recurrence kind.
8723 unsigned VecOpId =
8724 CurrentLink->getOperand(IndexOfFirstOperand) == PreviousLink
8725 ? IndexOfFirstOperand + 1
8726 : IndexOfFirstOperand;
8727 VecOp = CurrentLink->getOperand(VecOpId);
8728 assert(VecOp != PreviousLink &&
8729 CurrentLink->getOperand(CurrentLink->getNumOperands() - 1 -
8730 (VecOpId - IndexOfFirstOperand)) ==
8731 PreviousLink &&
8732 "PreviousLink must be the operand other than VecOp");
8733 }
8734
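      // If the reduction operation executes conditionally, pass the block mask
      // as a condition operand so masked-out lanes do not contribute to the
      // reduction.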
8735 VPValue *CondOp = nullptr;
8736 if (CM.blockNeedsPredicationForAnyReason(CurrentLinkI->getParent()))
8737 CondOp = RecipeBuilder.getBlockInMask(CurrentLink->getParent());
8738
8739 // TODO: Retrieve FMFs from recipes directly.
8740 RecurrenceDescriptor RdxDesc = Legal->getRecurrenceDescriptor(
8741 cast<PHINode>(PhiR->getUnderlyingInstr()));
8742 // Non-FP RdxDescs will have all fast math flags set, so clear them.
8743 FastMathFlags FMFs = isa<FPMathOperator>(CurrentLinkI)
8744 ? RdxDesc.getFastMathFlags()
8745 : FastMathFlags();
8746 auto *RedRecipe = new VPReductionRecipe(
8747 Kind, FMFs, CurrentLinkI, PreviousLink, VecOp, CondOp,
8748 PhiR->isOrdered(), CurrentLinkI->getDebugLoc());
8749 // Append the recipe to the end of the VPBasicBlock because we need to
8750       // ensure that it comes after all of its inputs, including CondOp.
8751 // Delete CurrentLink as it will be invalid if its operand is replaced
8752 // with a reduction defined at the bottom of the block in the next link.
8753 if (LinkVPBB->getNumSuccessors() == 0)
8754 RedRecipe->insertBefore(&*std::prev(std::prev(LinkVPBB->end())));
8755 else
8756 LinkVPBB->appendRecipe(RedRecipe);
8757
8758 CurrentLink->replaceAllUsesWith(RedRecipe);
8759 ToDelete.push_back(CurrentLink);
8760 PreviousLink = RedRecipe;
8761 }
8762 }
8763 VPBasicBlock *LatchVPBB = VectorLoopRegion->getExitingBasicBlock();
8764 Builder.setInsertPoint(&*std::prev(std::prev(LatchVPBB->end())));
8765 VPBasicBlock::iterator IP = MiddleVPBB->getFirstNonPhi();
8766 for (VPRecipeBase &R :
8767 Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
8768 VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
8769 if (!PhiR)
8770 continue;
8771
8772 const RecurrenceDescriptor &RdxDesc = Legal->getRecurrenceDescriptor(
8773         cast<PHINode>(PhiR->getUnderlyingInstr()));
8774     Type *PhiTy = PhiR->getUnderlyingValue()->getType();
8775 // If tail is folded by masking, introduce selects between the phi
8776 // and the users outside the vector region of each reduction, at the
8777 // beginning of the dedicated latch block.
8778 auto *OrigExitingVPV = PhiR->getBackedgeValue();
8779 auto *NewExitingVPV = PhiR->getBackedgeValue();
8780 // Don't output selects for partial reductions because they have an output
8781 // with fewer lanes than the VF. So the operands of the select would have
8782 // different numbers of lanes. Partial reductions mask the input instead.
8783 if (!PhiR->isInLoop() && CM.foldTailByMasking() &&
8784 !isa<VPPartialReductionRecipe>(OrigExitingVPV->getDefiningRecipe())) {
8785 VPValue *Cond = RecipeBuilder.getBlockInMask(PhiR->getParent());
8786 std::optional<FastMathFlags> FMFs =
8787 PhiTy->isFloatingPointTy()
8788 ? std::make_optional(RdxDesc.getFastMathFlags())
8789 : std::nullopt;
8790 NewExitingVPV =
8791 Builder.createSelect(Cond, OrigExitingVPV, PhiR, {}, "", FMFs);
8792 OrigExitingVPV->replaceUsesWithIf(NewExitingVPV, [](VPUser &U, unsigned) {
8793 return isa<VPInstruction>(&U) &&
8794 (cast<VPInstruction>(&U)->getOpcode() ==
8795                     VPInstruction::ComputeReductionResult ||
8796                 cast<VPInstruction>(&U)->getOpcode() ==
8797                     VPInstruction::ComputeAnyOfResult ||
8798                 cast<VPInstruction>(&U)->getOpcode() ==
8799                     VPInstruction::ComputeFindIVResult);
8800       });
8801 if (CM.usePredicatedReductionSelect())
8802 PhiR->setOperand(1, NewExitingVPV);
8803 }
8804
8805 // We want code in the middle block to appear to execute on the location of
8806 // the scalar loop's latch terminator because: (a) it is all compiler
8807 // generated, (b) these instructions are always executed after evaluating
8808 // the latch conditional branch, and (c) other passes may add new
8809 // predecessors which terminate on this line. This is the easiest way to
8810 // ensure we don't accidentally cause an extra step back into the loop while
8811 // debugging.
8812 DebugLoc ExitDL = OrigLoop->getLoopLatch()->getTerminator()->getDebugLoc();
8813
8814 // TODO: At the moment ComputeReductionResult also drives creation of the
8815 // bc.merge.rdx phi nodes, hence it needs to be created unconditionally here
8816 // even for in-loop reductions, until the reduction resume value handling is
8817 // also modeled in VPlan.
8818 VPInstruction *FinalReductionResult;
8819 VPBuilder::InsertPointGuard Guard(Builder);
8820 Builder.setInsertPoint(MiddleVPBB, IP);
8821 RecurKind RecurrenceKind = PhiR->getRecurrenceKind();
8822     if (RecurrenceDescriptor::isFindIVRecurrenceKind(RecurrenceKind)) {
8823       VPValue *Start = PhiR->getStartValue();
8824 VPValue *Sentinel = Plan->getOrAddLiveIn(RdxDesc.getSentinelValue());
8825 FinalReductionResult =
8826 Builder.createNaryOp(VPInstruction::ComputeFindIVResult,
8827 {PhiR, Start, Sentinel, NewExitingVPV}, ExitDL);
8828 } else if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RecurrenceKind)) {
8829 VPValue *Start = PhiR->getStartValue();
8830 FinalReductionResult =
8831 Builder.createNaryOp(VPInstruction::ComputeAnyOfResult,
8832 {PhiR, Start, NewExitingVPV}, ExitDL);
8833 } else {
8834 VPIRFlags Flags =
8835           PhiTy->isFloatingPointTy()
8836               ? VPIRFlags(RdxDesc.getFastMathFlags())
8837 : VPIRFlags();
8838 FinalReductionResult =
8839 Builder.createNaryOp(VPInstruction::ComputeReductionResult,
8840 {PhiR, NewExitingVPV}, Flags, ExitDL);
8841 }
8842 // If the vector reduction can be performed in a smaller type, we truncate
8843 // then extend the loop exit value to enable InstCombine to evaluate the
8844 // entire expression in the smaller type.
8845 if (MinVF.isVector() && PhiTy != RdxDesc.getRecurrenceType() &&
8846         !RecurrenceDescriptor::isAnyOfRecurrenceKind(RecurrenceKind)) {
8847       assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
8848       assert(!RecurrenceDescriptor::isMinMaxRecurrenceKind(RecurrenceKind) &&
8849              "Unexpected truncated min-max recurrence!");
8850 Type *RdxTy = RdxDesc.getRecurrenceType();
8851 VPWidenCastRecipe *Trunc;
8852 Instruction::CastOps ExtendOpc =
8853 RdxDesc.isSigned() ? Instruction::SExt : Instruction::ZExt;
8854 VPWidenCastRecipe *Extnd;
8855 {
8856 VPBuilder::InsertPointGuard Guard(Builder);
8857 Builder.setInsertPoint(
8858 NewExitingVPV->getDefiningRecipe()->getParent(),
8859 std::next(NewExitingVPV->getDefiningRecipe()->getIterator()));
8860 Trunc =
8861 Builder.createWidenCast(Instruction::Trunc, NewExitingVPV, RdxTy);
8862 Extnd = Builder.createWidenCast(ExtendOpc, Trunc, PhiTy);
8863 }
8864 if (PhiR->getOperand(1) == NewExitingVPV)
8865 PhiR->setOperand(1, Extnd->getVPSingleValue());
8866
8867 // Update ComputeReductionResult with the truncated exiting value and
8868 // extend its result.
8869 FinalReductionResult->setOperand(1, Trunc);
8870 FinalReductionResult =
8871 Builder.createScalarCast(ExtendOpc, FinalReductionResult, PhiTy, {});
8872 }
8873
8874 // Update all users outside the vector region. Also replace redundant
8875 // ExtractLastElement.
8876 for (auto *U : to_vector(OrigExitingVPV->users())) {
8877 auto *Parent = cast<VPRecipeBase>(U)->getParent();
8878 if (FinalReductionResult == U || Parent->getParent())
8879 continue;
8880 U->replaceUsesOfWith(OrigExitingVPV, FinalReductionResult);
8881       if (match(U, m_ExtractLastElement(m_VPValue())))
8882         cast<VPInstruction>(U)->replaceAllUsesWith(FinalReductionResult);
8883 }
8884
8885 // Adjust AnyOf reductions; replace the reduction phi for the selected value
8886 // with a boolean reduction phi node to check if the condition is true in
8887 // any iteration. The final value is selected by the final
8888 // ComputeReductionResult.
8889 if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RecurrenceKind)) {
8890 auto *Select = cast<VPRecipeBase>(*find_if(PhiR->users(), [](VPUser *U) {
8891 return isa<VPWidenSelectRecipe>(U) ||
8892 (isa<VPReplicateRecipe>(U) &&
8893 cast<VPReplicateRecipe>(U)->getUnderlyingInstr()->getOpcode() ==
8894 Instruction::Select);
8895 }));
8896 VPValue *Cmp = Select->getOperand(0);
8897 // If the compare is checking the reduction PHI node, adjust it to check
8898 // the start value.
8899 if (VPRecipeBase *CmpR = Cmp->getDefiningRecipe())
8900 CmpR->replaceUsesOfWith(PhiR, PhiR->getStartValue());
8901 Builder.setInsertPoint(Select);
8902
8903 // If the true value of the select is the reduction phi, the new value is
8904 // selected if the negated condition is true in any iteration.
8905 if (Select->getOperand(1) == PhiR)
8906 Cmp = Builder.createNot(Cmp);
8907 VPValue *Or = Builder.createOr(PhiR, Cmp);
8908 Select->getVPSingleValue()->replaceAllUsesWith(Or);
8909 // Delete Select now that it has invalid types.
8910 ToDelete.push_back(Select);
8911
8912 // Convert the reduction phi to operate on bools.
8913 PhiR->setOperand(0, Plan->getFalse());
8914 continue;
8915 }
8916
8917     if (RecurrenceDescriptor::isFindIVRecurrenceKind(
8918             RdxDesc.getRecurrenceKind())) {
8919 // Adjust the start value for FindFirstIV/FindLastIV recurrences to use
8920 // the sentinel value after generating the ResumePhi recipe, which uses
8921 // the original start value.
8922 PhiR->setOperand(0, Plan->getOrAddLiveIn(RdxDesc.getSentinelValue()));
8923 }
8924 RecurKind RK = RdxDesc.getRecurrenceKind();
8925     if (!RecurrenceDescriptor::isAnyOfRecurrenceKind(RK) &&
8926         !RecurrenceDescriptor::isFindIVRecurrenceKind(RK) &&
8927         !RecurrenceDescriptor::isMinMaxRecurrenceKind(RK)) {
8928       VPBuilder PHBuilder(Plan->getVectorPreheader());
8929 VPValue *Iden = Plan->getOrAddLiveIn(
8930 getRecurrenceIdentity(RK, PhiTy, RdxDesc.getFastMathFlags()));
8931 // If the PHI is used by a partial reduction, set the scale factor.
8932 unsigned ScaleFactor =
8933 RecipeBuilder.getScalingForReduction(RdxDesc.getLoopExitInstr())
8934 .value_or(1);
8935 auto *ScaleFactorVPV = Plan->getConstantInt(32, ScaleFactor);
8936 VPValue *StartV = PHBuilder.createNaryOp(
8937           VPInstruction::ReductionStartVector,
8938           {PhiR->getStartValue(), Iden, ScaleFactorVPV},
8939 PhiTy->isFloatingPointTy() ? RdxDesc.getFastMathFlags()
8940 : FastMathFlags());
8941 PhiR->setOperand(0, StartV);
8942 }
8943 }
8944 for (VPRecipeBase *R : ToDelete)
8945 R->eraseFromParent();
8946
8948}
8949
8950void LoopVectorizationPlanner::attachRuntimeChecks(
8951 VPlan &Plan, GeneratedRTChecks &RTChecks, bool HasBranchWeights) const {
8952 const auto &[SCEVCheckCond, SCEVCheckBlock] = RTChecks.getSCEVChecks();
8953 if (SCEVCheckBlock && SCEVCheckBlock->hasNPredecessors(0)) {
8954 assert((!CM.OptForSize ||
8955 CM.Hints->getForce() == LoopVectorizeHints::FK_Enabled) &&
8956 "Cannot SCEV check stride or overflow when optimizing for size");
8957 VPlanTransforms::attachCheckBlock(Plan, SCEVCheckCond, SCEVCheckBlock,
8958 HasBranchWeights);
8959 }
8960 const auto &[MemCheckCond, MemCheckBlock] = RTChecks.getMemRuntimeChecks();
8961 if (MemCheckBlock && MemCheckBlock->hasNPredecessors(0)) {
8962 // VPlan-native path does not do any analysis for runtime checks
8963 // currently.
8964 assert((!EnableVPlanNativePath || OrigLoop->isInnermost()) &&
8965 "Runtime checks are not supported for outer loops yet");
8966
8967 if (CM.OptForSize) {
8968 assert(
8969 CM.Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
8970 "Cannot emit memory checks when optimizing for size, unless forced "
8971 "to vectorize.");
8972 ORE->emit([&]() {
8973 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
8974 OrigLoop->getStartLoc(),
8975 OrigLoop->getHeader())
8976 << "Code-size may be reduced by not forcing "
8977 "vectorization, or by source-code modifications "
8978 "eliminating the need for runtime checks "
8979 "(e.g., adding 'restrict').";
8980 });
8981 }
8982 VPlanTransforms::attachCheckBlock(Plan, MemCheckCond, MemCheckBlock,
8983 HasBranchWeights);
8984 }
8985}
8986
8988 VPlan &Plan, ElementCount VF, unsigned UF,
8989 ElementCount MinProfitableTripCount) const {
8990 // vscale is not necessarily a power-of-2, which means we cannot guarantee
8991 // an overflow to zero when updating induction variables and so an
8992 // additional overflow check is required before entering the vector loop.
8993 bool IsIndvarOverflowCheckNeededForVF =
8994 VF.isScalable() && !TTI.isVScaleKnownToBeAPowerOfTwo() &&
8995 !isIndvarOverflowCheckKnownFalse(&CM, VF, UF) &&
8996 CM.getTailFoldingStyle() !=
8997           TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
8998   const uint32_t *BranchWeigths =
8999 hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())
9000           ? MinItersBypassWeights
9001           : nullptr;
9003 Plan, VF, UF, MinProfitableTripCount,
9004 CM.requiresScalarEpilogue(VF.isVector()), CM.foldTailByMasking(),
9005 IsIndvarOverflowCheckNeededForVF, OrigLoop, BranchWeights,
9006 OrigLoop->getLoopPredecessor()->getTerminator()->getDebugLoc(),
9007 *PSE.getSE());
9008}
9009
9011 assert(!State.Lane && "VPDerivedIVRecipe being replicated.");
9012
9013 // Fast-math-flags propagate from the original induction instruction.
9014 IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
9015 if (FPBinOp)
9016 State.Builder.setFastMathFlags(FPBinOp->getFastMathFlags());
9017
9018 Value *Step = State.get(getStepValue(), VPLane(0));
9019 Value *Index = State.get(getOperand(1), VPLane(0));
9020 Value *DerivedIV = emitTransformedIndex(
9021 State.Builder, Index, getStartValue()->getLiveInIRValue(), Step, Kind,
9023 DerivedIV->setName(Name);
9024 State.set(this, DerivedIV, VPLane(0));
9025}
9026
9027// Determine how to lower the scalar epilogue, which depends on 1) optimising
9028// for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
9029// predication, and 4) a TTI hook that analyses whether the loop is suitable
9030// for predication.
9035 // 1) OptSize takes precedence over all other options, i.e. if this is set,
9036 // don't look at hints or options, and don't request a scalar epilogue.
9037 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
9038 // LoopAccessInfo (due to code dependency and not being able to reliably get
9039 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
9040 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
9041 // versioning when the vectorization is forced, unlike hasOptSize. So revert
9042 // back to the old way and vectorize with versioning when forced. See D81345.)
9043 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
9047
9048 // 2) If set, obey the directives
9049 if (PreferPredicateOverEpilogue.getNumOccurrences()) {
9057 };
9058 }
9059
9060 // 3) If set, obey the hints
9061 switch (Hints.getPredicate()) {
9066 };
9067
9068 // 4) If the TTI hook indicates this is profitable, request predication.
9069 TailFoldingInfo TFI(TLI, &LVL, IAI);
9070 if (TTI->preferPredicateOverEpilogue(&TFI))
9072
9074}
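// Illustrative note (not part of the pass): the loop hint mentioned in 3) of
// the comment above can be requested from source, e.g. in Clang:
//
//   #pragma clang loop vectorize_predicate(enable)
//   for (int i = 0; i < n; ++i)
//     a[i] = b[i] + c[i];
//
// which is recorded as llvm.loop.vectorize.predicate.enable metadata and read
// back by LoopVectorizeHints.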
9075
9076// Process the loop in the VPlan-native vectorization path. This path builds
9077// VPlan upfront in the vectorization pipeline, which allows applying
9078// VPlan-to-VPlan transformations from the very beginning without modifying the
9079// input LLVM IR.
9086 LoopVectorizationRequirements &Requirements) {
9087
9089 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
9090 return false;
9091 }
9092 assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
9093 Function *F = L->getHeader()->getParent();
9094 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
9095
9097 getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, *LVL, &IAI);
9098
9099 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
9100 &Hints, IAI, PSI, BFI);
9101 // Use the planner for outer loop vectorization.
9102 // TODO: CM is not used at this point inside the planner. Turn CM into an
9103 // optional argument if we don't need it in the future.
9104 LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, LVL, CM, IAI, PSE, Hints,
9105 ORE);
9106
9107 // Get user vectorization factor.
9108 ElementCount UserVF = Hints.getWidth();
9109
9111
9112 // Plan how to best vectorize, return the best VF and its cost.
9113 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
9114
9115 // If we are stress testing VPlan builds, do not attempt to generate vector
9116 // code. Masked vector code generation support will follow soon.
9117 // Also, do not attempt to vectorize if no vector code will be produced.
9119 return false;
9120
9121 VPlan &BestPlan = LVP.getPlanFor(VF.Width);
9122
9123 {
9124 GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(), CM.CostKind);
9125 InnerLoopVectorizer LB(L, PSE, LI, DT, TTI, AC, VF.Width, /*UF=*/1, &CM,
9126 BFI, PSI, Checks, BestPlan);
9127 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
9128 << L->getHeader()->getParent()->getName() << "\"\n");
9129 LVP.addMinimumIterationCheck(BestPlan, VF.Width, /*UF=*/1,
9131
9132 LVP.executePlan(VF.Width, /*UF=*/1, BestPlan, LB, DT, false);
9133 }
9134
9135 reportVectorization(ORE, L, VF, 1);
9136
9137 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
9138 return true;
9139}
9140
9141// Emit a remark if there are stores to floats that required a floating point
9142// extension. If the vectorized loop performs its arithmetic in a wider
9143// floating point type, there will be a performance penalty from the
9144// conversion overhead and the change in the vector width.
9147 for (BasicBlock *BB : L->getBlocks()) {
9148 for (Instruction &Inst : *BB) {
9149 if (auto *S = dyn_cast<StoreInst>(&Inst)) {
9150 if (S->getValueOperand()->getType()->isFloatTy())
9151 Worklist.push_back(S);
9152 }
9153 }
9154 }
9155
9156 // Traverse the floating point stores upwards, searching for floating point
9157 // conversions.
9160 while (!Worklist.empty()) {
9161 auto *I = Worklist.pop_back_val();
9162 if (!L->contains(I))
9163 continue;
9164 if (!Visited.insert(I).second)
9165 continue;
9166
9167 // Emit a remark if the floating point store required a floating
9168 // point conversion.
9169 // TODO: More work could be done to identify the root cause such as a
9170 // constant or a function return type and point the user to it.
9171 if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second)
9172 ORE->emit([&]() {
9173 return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision",
9174 I->getDebugLoc(), L->getHeader())
9175 << "floating point conversion changes vector width. "
9176 << "Mixed floating point precision requires an up/down "
9177 << "cast that will negatively impact performance.";
9178 });
9179
9180 for (Use &Op : I->operands())
9181 if (auto *OpI = dyn_cast<Instruction>(Op))
9182 Worklist.push_back(OpI);
9183 }
9184}
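// Illustrative example of source that triggers the remark above (a sketch with
// made-up names): the float element is extended to double for the multiply and
// truncated back for the store, so an fpext feeds the float store:
//
//   void scale(float *x, double s, int n) {
//     for (int i = 0; i < n; ++i)
//       x[i] = x[i] * s;  // fpext float->double, fmul double, fptrunc, store
//   }
//
// Keeping the computation in float (e.g. 'x[i] * (float)s') avoids the
// conversions and keeps the full float vector width.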
9185
9186/// For loops with uncountable early exits, find the cost of doing work when
9187/// exiting the loop early, such as calculating the final exit values of
9188/// variables used outside the loop.
9189/// TODO: This is currently overly pessimistic because the loop may not take
9190/// the early exit, but better to keep this conservative for now. In future,
9191/// it might be possible to relax this by using branch probabilities.
9193 VPlan &Plan, ElementCount VF) {
9194 InstructionCost Cost = 0;
9195 for (auto *ExitVPBB : Plan.getExitBlocks()) {
9196 for (auto *PredVPBB : ExitVPBB->getPredecessors()) {
9197 // If the predecessor is not the middle.block, then it must be the
9198 // vector.early.exit block, which may contain work to calculate the exit
9199 // values of variables used outside the loop.
9200 if (PredVPBB != Plan.getMiddleBlock()) {
9201 LLVM_DEBUG(dbgs() << "Calculating cost of work in exit block "
9202 << PredVPBB->getName() << ":\n");
9203 Cost += PredVPBB->cost(VF, CostCtx);
9204 }
9205 }
9206 }
9207 return Cost;
9208}
9209
9210/// This function determines whether or not it's still profitable to vectorize
9211/// the loop given the extra work we have to do outside of the loop:
9212/// 1. Perform the runtime checks before entering the loop to ensure it's safe
9213/// to vectorize.
9214/// 2. In the case of loops with uncountable early exits, we may have to do
9215/// extra work when exiting the loop early, such as calculating the final
9216/// exit values of variables used outside the loop.
9217static bool isOutsideLoopWorkProfitable(GeneratedRTChecks &Checks,
9218 VectorizationFactor &VF, Loop *L,
9220 VPCostContext &CostCtx, VPlan &Plan,
9222 std::optional<unsigned> VScale) {
9223 InstructionCost TotalCost = Checks.getCost();
9224 if (!TotalCost.isValid())
9225 return false;
9226
9227 // Add on the cost of any work required in the vector early exit block, if
9228 // one exists.
9229 TotalCost += calculateEarlyExitCost(CostCtx, Plan, VF.Width);
9230
9231 // When interleaving only, the scalar and vector cost will be equal, which
9232 // in turn would lead to a divide by 0. Fall back to a hard threshold.
9233 if (VF.Width.isScalar()) {
9234 // TODO: Should we rename VectorizeMemoryCheckThreshold?
9235 if (TotalCost > VectorizeMemoryCheckThreshold) {
9236 LLVM_DEBUG(
9237 dbgs()
9238 << "LV: Interleaving only is not profitable due to runtime checks\n");
9239 return false;
9240 }
9241 return true;
9242 }
9243
9244 // The scalar cost should only be 0 when vectorizing with a user specified
9245 // VF/IC. In those cases, runtime checks should always be generated.
9246 uint64_t ScalarC = VF.ScalarCost.getValue();
9247 if (ScalarC == 0)
9248 return true;
9249
9250 // First, compute the minimum iteration count required so that the vector
9251 // loop outperforms the scalar loop.
9252 // The total cost of the scalar loop is
9253 // ScalarC * TC
9254 // where
9255 // * TC is the actual trip count of the loop.
9256 // * ScalarC is the cost of a single scalar iteration.
9257 //
9258 // The total cost of the vector loop is
9259 // RtC + VecC * (TC / VF) + EpiC
9260 // where
9261 // * RtC is the cost of the generated runtime checks plus the cost of
9262 // performing any additional work in the vector.early.exit block for loops
9263 // with uncountable early exits.
9264 // * VecC is the cost of a single vector iteration.
9265 // * TC is the actual trip count of the loop
9266 // * VF is the vectorization factor
9267 // * EpiC is the cost of the generated epilogue, including the cost
9268 // of the remaining scalar operations.
9269 //
9270 // Vectorization is profitable once the total vector cost is less than the
9271 // total scalar cost:
9272 // RtC + VecC * (TC / VF) + EpiC < ScalarC * TC
9273 //
9274 // Now we can compute the minimum required trip count TC as
9275 // VF * (RtC + EpiC) / (ScalarC * VF - VecC) < TC
9276 //
9277 // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that
9278 // the divisions below are rounded up, hence we get an upper estimate of
9279 // the minimum required TC.
9280 unsigned IntVF = estimateElementCount(VF.Width, VScale);
9281 uint64_t RtC = TotalCost.getValue();
9282 uint64_t Div = ScalarC * IntVF - VF.Cost.getValue();
9283 uint64_t MinTC1 = Div == 0 ? 0 : divideCeil(RtC * IntVF, Div);
9284
9285 // Second, compute a minimum iteration count so that the cost of the
9286 // runtime checks is only a fraction of the total scalar loop cost. This
9287 // adds a loop-dependent bound on the overhead incurred if the runtime
9288 // checks fail. In case the runtime checks fail, the cost is RtC + ScalarC
9289 // * TC. To bound the runtime check to be a fraction 1/X of the scalar
9290 // cost, compute
9291 // RtC < ScalarC * TC * (1 / X) ==> RtC * X / ScalarC < TC
9292 uint64_t MinTC2 = divideCeil(RtC * 10, ScalarC);
9293
9294 // Now pick the larger minimum. If it is not a multiple of VF and a scalar
9295 // epilogue is allowed, choose the next closest multiple of VF. This should
9296 // partly compensate for ignoring the epilogue cost.
9297 uint64_t MinTC = std::max(MinTC1, MinTC2);
9298 if (SEL == CM_ScalarEpilogueAllowed)
9299 MinTC = alignTo(MinTC, IntVF);
9301
9302 LLVM_DEBUG(
9303 dbgs() << "LV: Minimum required TC for runtime checks to be profitable:"
9304 << VF.MinProfitableTripCount << "\n");
9305
9306 // Skip vectorization if the expected trip count is less than the minimum
9307 // required trip count.
9308 if (auto ExpectedTC = getSmallBestKnownTC(PSE, L)) {
9309 if (ElementCount::isKnownLT(*ExpectedTC, VF.MinProfitableTripCount)) {
9310 LLVM_DEBUG(dbgs() << "LV: Vectorization is not beneficial: expected "
9311 "trip count < minimum profitable trip count ("
9312 << *ExpectedTC << " < " << VF.MinProfitableTripCount
9313 << ")\n");
9314
9315 return false;
9316 }
9317 }
9318 return true;
9319}
9320
9322 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
9324 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
9326
9327/// Prepare \p MainPlan for vectorizing the main vector loop during epilogue
9328/// vectorization. Remove ResumePhis from \p MainPlan for inductions that
9329/// don't have a corresponding wide induction in \p EpiPlan.
9330static void preparePlanForMainVectorLoop(VPlan &MainPlan, VPlan &EpiPlan) {
9331 // Collect PHI nodes of widened phis in the VPlan for the epilogue. Those
9332 // will need their resume-values computed in the main vector loop. Others
9333 // can be removed from the main VPlan.
9334 SmallPtrSet<PHINode *, 2> EpiWidenedPhis;
9335 for (VPRecipeBase &R :
9338 continue;
9339 EpiWidenedPhis.insert(
9340 cast<PHINode>(R.getVPSingleValue()->getUnderlyingValue()));
9341 }
9342 for (VPRecipeBase &R :
9343 make_early_inc_range(MainPlan.getScalarHeader()->phis())) {
9344 auto *VPIRInst = cast<VPIRPhi>(&R);
9345 if (EpiWidenedPhis.contains(&VPIRInst->getIRPhi()))
9346 continue;
9347 // There is no corresponding wide induction in the epilogue plan that would
9348 // need a resume value. Remove the VPIRInst wrapping the scalar header phi
9349 // together with the corresponding ResumePhi. The resume values for the
9350 // scalar loop will be created during execution of EpiPlan.
9351 VPRecipeBase *ResumePhi = VPIRInst->getOperand(0)->getDefiningRecipe();
9352 VPIRInst->eraseFromParent();
9353 ResumePhi->eraseFromParent();
9354 }
9356
9357 using namespace VPlanPatternMatch;
9358 // When vectorizing the epilogue, FindFirstIV & FindLastIV reductions can
9359 // introduce multiple uses of undef/poison. If the reduction start value may
9360 // be undef or poison it needs to be frozen and the frozen start has to be
9361 // used when computing the reduction result. We also need to use the frozen
9362 // value in the resume phi generated by the main vector loop, as this is also
9363 // used to compute the reduction result after the epilogue vector loop.
9364 auto AddFreezeForFindLastIVReductions = [](VPlan &Plan,
9365 bool UpdateResumePhis) {
9366 VPBuilder Builder(Plan.getEntry());
9367 for (VPRecipeBase &R : *Plan.getMiddleBlock()) {
9368 auto *VPI = dyn_cast<VPInstruction>(&R);
9369 if (!VPI || VPI->getOpcode() != VPInstruction::ComputeFindIVResult)
9370 continue;
9371 VPValue *OrigStart = VPI->getOperand(1);
9373 continue;
9374 VPInstruction *Freeze =
9375 Builder.createNaryOp(Instruction::Freeze, {OrigStart}, {}, "fr");
9376 VPI->setOperand(1, Freeze);
9377 if (UpdateResumePhis)
9378 OrigStart->replaceUsesWithIf(Freeze, [Freeze](VPUser &U, unsigned) {
9379 return Freeze != &U && isa<VPPhi>(&U);
9380 });
9381 }
9382 };
9383 AddFreezeForFindLastIVReductions(MainPlan, true);
9384 AddFreezeForFindLastIVReductions(EpiPlan, false);
9385
9386 VPBasicBlock *MainScalarPH = MainPlan.getScalarPreheader();
9387 VPValue *VectorTC = &MainPlan.getVectorTripCount();
9388 // If there is a suitable resume value for the canonical induction in the
9389 // scalar (which will become vector) epilogue loop, use it and move it to the
9390 // beginning of the scalar preheader. Otherwise create it below.
9391 auto ResumePhiIter =
9392 find_if(MainScalarPH->phis(), [VectorTC](VPRecipeBase &R) {
9393 return match(&R, m_VPInstruction<Instruction::PHI>(m_Specific(VectorTC),
9394 m_ZeroInt()));
9395 });
9396 VPPhi *ResumePhi = nullptr;
9397 if (ResumePhiIter == MainScalarPH->phis().end()) {
9398 VPBuilder ScalarPHBuilder(MainScalarPH, MainScalarPH->begin());
9399 ResumePhi = ScalarPHBuilder.createScalarPhi(
9400 {VectorTC,
9402 {}, "vec.epilog.resume.val");
9403 } else {
9404 ResumePhi = cast<VPPhi>(&*ResumePhiIter);
9405 if (MainScalarPH->begin() == MainScalarPH->end())
9406 ResumePhi->moveBefore(*MainScalarPH, MainScalarPH->end());
9407 else if (&*MainScalarPH->begin() != ResumePhi)
9408 ResumePhi->moveBefore(*MainScalarPH, MainScalarPH->begin());
9409 }
9410 // Add a user to make sure the resume phi won't get removed.
9411 VPBuilder(MainScalarPH)
9413}
9414
9415/// Prepare \p Plan for vectorizing the epilogue loop. That is, re-use expanded
9416/// SCEVs from \p ExpandedSCEVs and set resume values for header recipes. Some
9417/// reductions require creating new instructions to compute the resume values.
9418/// They are collected in a vector and returned. They must be moved to the
9419/// preheader of the vector epilogue loop, once that preheader has been
9420/// created by the execution of \p Plan.
9422 VPlan &Plan, Loop *L, const SCEV2ValueTy &ExpandedSCEVs,
9424 ScalarEvolution &SE) {
9425 VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
9426 VPBasicBlock *Header = VectorLoop->getEntryBasicBlock();
9427 Header->setName("vec.epilog.vector.body");
9428
9429 VPCanonicalIVPHIRecipe *IV = VectorLoop->getCanonicalIV();
9430 // When vectorizing the epilogue loop, the canonical induction needs to be
9431 // adjusted by the value after the main vector loop. Find the resume value
9432 // created during execution of the main VPlan. It must be the first phi in the
9433 // loop preheader. Use the value to increment the canonical IV, and update all
9434 // users in the loop region to use the adjusted value.
9435 // FIXME: Improve modeling for canonical IV start values in the epilogue
9436 // loop.
9437 using namespace llvm::PatternMatch;
9438 PHINode *EPResumeVal = &*L->getLoopPreheader()->phis().begin();
9439 for (Value *Inc : EPResumeVal->incoming_values()) {
9440 if (match(Inc, m_SpecificInt(0)))
9441 continue;
9442 assert(!EPI.VectorTripCount &&
9443 "Must only have a single non-zero incoming value");
9444 EPI.VectorTripCount = Inc;
9445 }
9446 // If we didn't find a non-zero vector trip count, all incoming values
9447 // must be zero, which also means the vector trip count is zero. Pick the
9448 // first zero as vector trip count.
9449 // TODO: We should not choose VF * UF so the main vector loop is known to
9450 // be dead.
9451 if (!EPI.VectorTripCount) {
9452 assert(EPResumeVal->getNumIncomingValues() > 0 &&
9453 all_of(EPResumeVal->incoming_values(),
9454 [](Value *Inc) { return match(Inc, m_SpecificInt(0)); }) &&
9455 "all incoming values must be 0");
9456 EPI.VectorTripCount = EPResumeVal->getOperand(0);
9457 }
9458 VPValue *VPV = Plan.getOrAddLiveIn(EPResumeVal);
9459 assert(all_of(IV->users(),
9460 [](const VPUser *U) {
9461 return isa<VPScalarIVStepsRecipe>(U) ||
9462 isa<VPDerivedIVRecipe>(U) ||
9463 cast<VPRecipeBase>(U)->isScalarCast() ||
9464 cast<VPInstruction>(U)->getOpcode() ==
9465 Instruction::Add;
9466 }) &&
9467 "the canonical IV should only be used by its increment or "
9468 "ScalarIVSteps when resetting the start value");
9469 VPBuilder Builder(Header, Header->getFirstNonPhi());
9470 VPInstruction *Add = Builder.createNaryOp(Instruction::Add, {IV, VPV});
9471 IV->replaceAllUsesWith(Add);
9472 Add->setOperand(0, IV);
9473
9475 SmallVector<Instruction *> InstsToMove;
9476 // Ensure that the start values for all header phi recipes are updated before
9477 // vectorizing the epilogue loop. Skip the canonical IV, which has been
9478 // handled above.
9479 for (VPRecipeBase &R : drop_begin(Header->phis())) {
9480 Value *ResumeV = nullptr;
9481 // TODO: Move setting of resume values to prepareToExecute.
9482 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) {
9483 auto *RdxResult =
9484 cast<VPInstruction>(*find_if(ReductionPhi->users(), [](VPUser *U) {
9485 auto *VPI = dyn_cast<VPInstruction>(U);
9486 return VPI &&
9487 (VPI->getOpcode() == VPInstruction::ComputeAnyOfResult ||
9488 VPI->getOpcode() == VPInstruction::ComputeReductionResult ||
9489 VPI->getOpcode() == VPInstruction::ComputeFindIVResult);
9490 }));
9491 ResumeV = cast<PHINode>(ReductionPhi->getUnderlyingInstr())
9492 ->getIncomingValueForBlock(L->getLoopPreheader());
9493 RecurKind RK = ReductionPhi->getRecurrenceKind();
9495 Value *StartV = RdxResult->getOperand(1)->getLiveInIRValue();
9496 // VPReductionPHIRecipes for AnyOf reductions expect a boolean as
9497 // start value; compare the final value from the main vector loop
9498 // to the start value.
9499 BasicBlock *PBB = cast<Instruction>(ResumeV)->getParent();
9500 IRBuilder<> Builder(PBB, PBB->getFirstNonPHIIt());
9501 ResumeV = Builder.CreateICmpNE(ResumeV, StartV);
9502 if (auto *I = dyn_cast<Instruction>(ResumeV))
9503 InstsToMove.push_back(I);
9505 Value *StartV = getStartValueFromReductionResult(RdxResult);
9506 ToFrozen[StartV] = cast<PHINode>(ResumeV)->getIncomingValueForBlock(
9508
9509 // VPReductionPHIRecipe for FindFirstIV/FindLastIV reductions requires
9510 // an adjustment to the resume value. The resume value is adjusted to
9511 // the sentinel value when the final value from the main vector loop
9512 // equals the start value. This ensures correctness when the start value
9513 // might not be less than the minimum value of a monotonically
9514 // increasing induction variable.
9515 BasicBlock *ResumeBB = cast<Instruction>(ResumeV)->getParent();
9516 IRBuilder<> Builder(ResumeBB, ResumeBB->getFirstNonPHIIt());
9517 Value *Cmp = Builder.CreateICmpEQ(ResumeV, ToFrozen[StartV]);
9518 if (auto *I = dyn_cast<Instruction>(Cmp))
9519 InstsToMove.push_back(I);
9520 Value *Sentinel = RdxResult->getOperand(2)->getLiveInIRValue();
9521 ResumeV = Builder.CreateSelect(Cmp, Sentinel, ResumeV);
9522 if (auto *I = dyn_cast<Instruction>(ResumeV))
9523 InstsToMove.push_back(I);
9524 } else {
9525 VPValue *StartVal = Plan.getOrAddLiveIn(ResumeV);
9526 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
9527 if (auto *VPI = dyn_cast<VPInstruction>(PhiR->getStartValue())) {
9529 "unexpected start value");
9530 VPI->setOperand(0, StartVal);
9531 continue;
9532 }
9533 }
9534 } else {
9535 // Retrieve the induction resume values for wide inductions from
9536 // their original phi nodes in the scalar loop.
9537 PHINode *IndPhi = cast<VPWidenInductionRecipe>(&R)->getPHINode();
9538 // Hook up to the PHINode generated by a ResumePhi recipe of main
9539 // loop VPlan, which feeds the scalar loop.
9540 ResumeV = IndPhi->getIncomingValueForBlock(L->getLoopPreheader());
9541 }
9542 assert(ResumeV && "Must have a resume value");
9543 VPValue *StartVal = Plan.getOrAddLiveIn(ResumeV);
9544 cast<VPHeaderPHIRecipe>(&R)->setStartValue(StartVal);
9545 }
9546
9547 // For some VPValues in the epilogue plan we must re-use the generated IR
9548 // values from the main plan. Replace them with live-in VPValues.
9549 // TODO: This is a workaround needed for epilogue vectorization and it
9550 // should be removed once induction resume value creation is done
9551 // directly in VPlan.
9552 for (auto &R : make_early_inc_range(*Plan.getEntry())) {
9553 // Re-use frozen values from the main plan for Freeze VPInstructions in the
9554 // epilogue plan. This ensures all users use the same frozen value.
9555 auto *VPI = dyn_cast<VPInstruction>(&R);
9556 if (VPI && VPI->getOpcode() == Instruction::Freeze) {
9558 ToFrozen.lookup(VPI->getOperand(0)->getLiveInIRValue())));
9559 continue;
9560 }
9561
9562 // Re-use the trip count and steps expanded for the main loop, as
9563 // skeleton creation needs them as values that dominate both the scalar
9564 // and vector epilogue loops.
9565 auto *ExpandR = dyn_cast<VPExpandSCEVRecipe>(&R);
9566 if (!ExpandR)
9567 continue;
9568 VPValue *ExpandedVal =
9569 Plan.getOrAddLiveIn(ExpandedSCEVs.lookup(ExpandR->getSCEV()));
9570 ExpandR->replaceAllUsesWith(ExpandedVal);
9571 if (Plan.getTripCount() == ExpandR)
9572 Plan.resetTripCount(ExpandedVal);
9573 ExpandR->eraseFromParent();
9574 }
9575
9576 auto VScale = CM.getVScaleForTuning();
9577 unsigned MainLoopStep =
9578 estimateElementCount(EPI.MainLoopVF * EPI.MainLoopUF, VScale);
9579 unsigned EpilogueLoopStep =
9580 estimateElementCount(EPI.EpilogueVF * EPI.EpilogueUF, VScale);
9582 Plan, EPI.TripCount, EPI.VectorTripCount,
9584 EPI.EpilogueUF, MainLoopStep, EpilogueLoopStep, SE);
9585
9586 return InstsToMove;
9587}
9588
9589// Generate bypass values from the additional bypass block. Note that when the
9590// vectorized epilogue is skipped due to the iteration count check, the
9591// resume value for the induction variable comes from the trip count of the
9592// main vector loop, passed as the second argument.
9594 PHINode *OrigPhi, const InductionDescriptor &II, IRBuilder<> &BypassBuilder,
9595 const SCEV2ValueTy &ExpandedSCEVs, Value *MainVectorTripCount,
9596 Instruction *OldInduction) {
9597 Value *Step = getExpandedStep(II, ExpandedSCEVs);
9598 // For the primary induction the additional bypass end value is known.
9599 // Otherwise it is computed.
9600 Value *EndValueFromAdditionalBypass = MainVectorTripCount;
9601 if (OrigPhi != OldInduction) {
9602 auto *BinOp = II.getInductionBinOp();
9603 // Fast-math-flags propagate from the original induction instruction.
9605 BypassBuilder.setFastMathFlags(BinOp->getFastMathFlags());
9606
9607 // Compute the end value for the additional bypass.
9608 EndValueFromAdditionalBypass =
9609 emitTransformedIndex(BypassBuilder, MainVectorTripCount,
9610 II.getStartValue(), Step, II.getKind(), BinOp);
9611 EndValueFromAdditionalBypass->setName("ind.end");
9612 }
9613 return EndValueFromAdditionalBypass;
9614}
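// Worked example for the bypass end value above (illustrative values): for a
// non-primary integer induction with start value S and step C, the transformed
// index is
//   ind.end = S + MainVectorTripCount * C
// e.g. an induction starting at 100 with step -3 and a main vector trip count
// of 64 resumes in the epilogue at 100 + 64 * (-3) = -92.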
9615
9617 VPlan &BestEpiPlan,
9619 const SCEV2ValueTy &ExpandedSCEVs,
9620 Value *MainVectorTripCount) {
9621 // Fix reduction resume values from the additional bypass block.
9622 BasicBlock *PH = L->getLoopPreheader();
9623 for (auto *Pred : predecessors(PH)) {
9624 for (PHINode &Phi : PH->phis()) {
9625 if (Phi.getBasicBlockIndex(Pred) != -1)
9626 continue;
9627 Phi.addIncoming(Phi.getIncomingValueForBlock(BypassBlock), Pred);
9628 }
9629 }
9630 auto *ScalarPH = cast<VPIRBasicBlock>(BestEpiPlan.getScalarPreheader());
9631 if (ScalarPH->hasPredecessors()) {
9632 // If ScalarPH has predecessors, we may need to update its reduction
9633 // resume values.
9634 for (const auto &[R, IRPhi] :
9635 zip(ScalarPH->phis(), ScalarPH->getIRBasicBlock()->phis())) {
9637 BypassBlock);
9638 }
9639 }
9640
9641 // Fix induction resume values from the additional bypass block.
9642 IRBuilder<> BypassBuilder(BypassBlock, BypassBlock->getFirstInsertionPt());
9643 for (const auto &[IVPhi, II] : LVL.getInductionVars()) {
9644 auto *Inc = cast<PHINode>(IVPhi->getIncomingValueForBlock(PH));
9646 IVPhi, II, BypassBuilder, ExpandedSCEVs, MainVectorTripCount,
9647 LVL.getPrimaryInduction());
9648 // TODO: Directly add as extra operand to the VPResumePHI recipe.
9649 Inc->setIncomingValueForBlock(BypassBlock, V);
9650 }
9651}
9652
9653/// Connect the epilogue vector loop generated for \p EpiPlan to the main vector
9654/// loop, after both plans have executed, updating branches from the iteration
9655/// and runtime checks of the main loop, as well as updating various phis. \p
9656/// InstsToMove contains instructions that need to be moved to the preheader of
9657/// the epilogue vector loop.
9659 VPlan &EpiPlan, Loop *L, EpilogueLoopVectorizationInfo &EPI,
9661 DenseMap<const SCEV *, Value *> &ExpandedSCEVs, GeneratedRTChecks &Checks,
9662 ArrayRef<Instruction *> InstsToMove) {
9663 BasicBlock *VecEpilogueIterationCountCheck =
9664 cast<VPIRBasicBlock>(EpiPlan.getEntry())->getIRBasicBlock();
9665
9666 BasicBlock *VecEpiloguePreHeader =
9667 cast<BranchInst>(VecEpilogueIterationCountCheck->getTerminator())
9668 ->getSuccessor(1);
9669 // Adjust the control flow taking the state info from the main loop
9670 // vectorization into account.
9672 "expected this to be saved from the previous pass.");
9673 DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
9675 VecEpilogueIterationCountCheck, VecEpiloguePreHeader);
9676
9678 VecEpilogueIterationCountCheck},
9680 VecEpiloguePreHeader}});
9681
9682 BasicBlock *ScalarPH =
9683 cast<VPIRBasicBlock>(EpiPlan.getScalarPreheader())->getIRBasicBlock();
9685 VecEpilogueIterationCountCheck, ScalarPH);
9686 DTU.applyUpdates(
9688 VecEpilogueIterationCountCheck},
9690
9691 // Adjust the terminators of runtime check blocks and phis using them.
9692 BasicBlock *SCEVCheckBlock = Checks.getSCEVChecks().second;
9693 BasicBlock *MemCheckBlock = Checks.getMemRuntimeChecks().second;
9694 if (SCEVCheckBlock) {
9695 SCEVCheckBlock->getTerminator()->replaceUsesOfWith(
9696 VecEpilogueIterationCountCheck, ScalarPH);
9697 DTU.applyUpdates({{DominatorTree::Delete, SCEVCheckBlock,
9698 VecEpilogueIterationCountCheck},
9699 {DominatorTree::Insert, SCEVCheckBlock, ScalarPH}});
9700 }
9701 if (MemCheckBlock) {
9702 MemCheckBlock->getTerminator()->replaceUsesOfWith(
9703 VecEpilogueIterationCountCheck, ScalarPH);
9704 DTU.applyUpdates(
9705 {{DominatorTree::Delete, MemCheckBlock, VecEpilogueIterationCountCheck},
9706 {DominatorTree::Insert, MemCheckBlock, ScalarPH}});
9707 }
9708
9709 // The vec.epilog.iter.check block may contain Phi nodes from inductions
9710 // or reductions which merge control-flow from the latch block and the
9711 // middle block. Update the incoming values here and move the Phi into the
9712 // preheader.
9713 SmallVector<PHINode *, 4> PhisInBlock(
9714 llvm::make_pointer_range(VecEpilogueIterationCountCheck->phis()));
9715
9716 for (PHINode *Phi : PhisInBlock) {
9717 Phi->moveBefore(VecEpiloguePreHeader->getFirstNonPHIIt());
9718 Phi->replaceIncomingBlockWith(
9719 VecEpilogueIterationCountCheck->getSinglePredecessor(),
9720 VecEpilogueIterationCountCheck);
9721
9722 // If the phi doesn't have an incoming value from the
9723 // EpilogueIterationCountCheck, we are done. Otherwise remove the
9724 // incoming value and also those from other check blocks. This is needed
9725 // for reduction phis only.
9726 if (none_of(Phi->blocks(), [&](BasicBlock *IncB) {
9727 return EPI.EpilogueIterationCountCheck == IncB;
9728 }))
9729 continue;
9730 Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck);
9731 if (SCEVCheckBlock)
9732 Phi->removeIncomingValue(SCEVCheckBlock);
9733 if (MemCheckBlock)
9734 Phi->removeIncomingValue(MemCheckBlock);
9735 }
9736
9737 auto IP = VecEpiloguePreHeader->getFirstNonPHIIt();
9738 for (auto *I : InstsToMove)
9739 I->moveBefore(IP);
9740
9741 // VecEpilogueIterationCountCheck conditionally skips over the epilogue loop
9742 // after executing the main loop. We need to update the resume values of
9743 // inductions and reductions during epilogue vectorization.
9744 fixScalarResumeValuesFromBypass(VecEpilogueIterationCountCheck, L, EpiPlan,
9745 LVL, ExpandedSCEVs, EPI.VectorTripCount);
9746}
9747
9749 assert((EnableVPlanNativePath || L->isInnermost()) &&
9750 "VPlan-native path is not enabled. Only process inner loops.");
9751
9752 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '"
9753 << L->getHeader()->getParent()->getName() << "' from "
9754 << L->getLocStr() << "\n");
9755
9756 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI);
9757
9758 LLVM_DEBUG(
9759 dbgs() << "LV: Loop hints:"
9760 << " force="
9762 ? "disabled"
9764 ? "enabled"
9765 : "?"))
9766 << " width=" << Hints.getWidth()
9767 << " interleave=" << Hints.getInterleave() << "\n");
9768
9769 // Function containing loop
9770 Function *F = L->getHeader()->getParent();
9771
9772 // Looking at the diagnostic output is the only way to determine if a loop
9773 // was vectorized (other than looking at the IR or machine code), so it
9774 // is important to generate an optimization remark for each loop. Most of
9775 // these messages are generated as OptimizationRemarkAnalysis. Remarks
9776 // generated as OptimizationRemark and OptimizationRemarkMissed are
9777 // less verbose, reporting vectorized loops and unvectorized loops that may
9778 // benefit from vectorization, respectively.
9779
9780 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
9781 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
9782 return false;
9783 }
9784
9785 PredicatedScalarEvolution PSE(*SE, *L);
9786
9787 // Check if it is legal to vectorize the loop.
9788 LoopVectorizationRequirements Requirements;
9789 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, F, *LAIs, LI, ORE,
9790 &Requirements, &Hints, DB, AC, BFI, PSI, AA);
9792 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
9793 Hints.emitRemarkWithHints();
9794 return false;
9795 }
9796
9798 reportVectorizationFailure("Auto-vectorization of loops with uncountable "
9799 "early exit is not enabled",
9800 "UncountableEarlyExitLoopsDisabled", ORE, L);
9801 return false;
9802 }
9803
9804 if (!LVL.getPotentiallyFaultingLoads().empty()) {
9805 reportVectorizationFailure("Auto-vectorization of loops with potentially "
9806 "faulting load is not supported",
9807 "PotentiallyFaultingLoadsNotSupported", ORE, L);
9808 return false;
9809 }
9810
9811 // Entrance to the VPlan-native vectorization path. Outer loops are processed
9812 // here. They may require CFG and instruction level transformations before
9813 // even evaluating whether vectorization is profitable. Since we cannot modify
9814 // the incoming IR, we need to build VPlan upfront in the vectorization
9815 // pipeline.
9816 if (!L->isInnermost())
9817 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
9818 ORE, BFI, PSI, Hints, Requirements);
9819
9820 assert(L->isInnermost() && "Inner loop expected.");
9821
9822 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
9823 bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
9824
9825 // If an override option has been passed in for interleaved accesses, use it.
9826 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
9827 UseInterleaved = EnableInterleavedMemAccesses;
9828
9829 // Analyze interleaved memory accesses.
9830 if (UseInterleaved)
9832
9833 if (LVL.hasUncountableEarlyExit()) {
9834 BasicBlock *LoopLatch = L->getLoopLatch();
9835 if (IAI.requiresScalarEpilogue() ||
9837 [LoopLatch](BasicBlock *BB) { return BB != LoopLatch; })) {
9838 reportVectorizationFailure("Auto-vectorization of early exit loops "
9839 "requiring a scalar epilogue is unsupported",
9840 "UncountableEarlyExitUnsupported", ORE, L);
9841 return false;
9842 }
9843 }
9844
9845 // Check the function attributes and profiles to find out if this function
9846 // should be optimized for size.
9848 getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, LVL, &IAI);
9849
9850 // Check the loop for a trip count threshold: vectorize loops with a tiny trip
9851 // count by optimizing for size, to minimize overheads.
9852 auto ExpectedTC = getSmallBestKnownTC(PSE, L);
9853 if (ExpectedTC && ExpectedTC->isFixed() &&
9854 ExpectedTC->getFixedValue() < TinyTripCountVectorThreshold) {
9855 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
9856 << "This loop is worth vectorizing only if no scalar "
9857 << "iteration overheads are incurred.");
9859 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
9860 else {
9861 LLVM_DEBUG(dbgs() << "\n");
9862 // Predicate tail-folded loops are efficient even when the loop
9863 // iteration count is low. However, setting the epilogue policy to
9864 // `CM_ScalarEpilogueNotAllowedLowTripLoop` prevents vectorizing loops
9865 // with runtime checks. It's more effective to let
9866 // `isOutsideLoopWorkProfitable` determine if vectorization is
9867 // beneficial for the loop.
9870 }
9871 }
9872
9873 // Check the function attributes to see if implicit floats or vectors are
9874 // allowed.
9875 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
9877 "Can't vectorize when the NoImplicitFloat attribute is used",
9878 "loop not vectorized due to NoImplicitFloat attribute",
9879 "NoImplicitFloat", ORE, L);
9880 Hints.emitRemarkWithHints();
9881 return false;
9882 }
9883
9884 // Check if the target supports potentially unsafe FP vectorization.
9885 // FIXME: Add a check for the type of safety issue (denormal, signaling)
9886 // for the target we're vectorizing for, to make sure none of the
9887 // additional fp-math flags can help.
9888 if (Hints.isPotentiallyUnsafe() &&
9889 TTI->isFPVectorizationPotentiallyUnsafe()) {
9891 "Potentially unsafe FP op prevents vectorization",
9892 "loop not vectorized due to unsafe FP support.",
9893 "UnsafeFP", ORE, L);
9894 Hints.emitRemarkWithHints();
9895 return false;
9896 }
9897
9898 bool AllowOrderedReductions;
9899 // If the flag is set, use that instead and override the TTI behaviour.
9900 if (ForceOrderedReductions.getNumOccurrences() > 0)
9901 AllowOrderedReductions = ForceOrderedReductions;
9902 else
9903 AllowOrderedReductions = TTI->enableOrderedReductions();
9904 if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) {
9905 ORE->emit([&]() {
9906 auto *ExactFPMathInst = Requirements.getExactFPInst();
9907 return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
9908 ExactFPMathInst->getDebugLoc(),
9909 ExactFPMathInst->getParent())
9910 << "loop not vectorized: cannot prove it is safe to reorder "
9911 "floating-point operations";
9912 });
9913 LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to "
9914 "reorder floating-point operations\n");
9915 Hints.emitRemarkWithHints();
9916 return false;
9917 }
9918
9919 // Use the cost model.
9920 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
9921 F, &Hints, IAI, PSI, BFI);
9922 // Use the planner for vectorization.
9923 LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, &LVL, CM, IAI, PSE, Hints,
9924 ORE);
9925
9926 // Get user vectorization factor and interleave count.
9927 ElementCount UserVF = Hints.getWidth();
9928 unsigned UserIC = Hints.getInterleave();
9929 if (UserIC > 1 && !LVL.isSafeForAnyVectorWidth())
9930 UserIC = 1;
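// For context (illustrative, not part of the pass): UserVF and UserIC
// typically come from source-level loop hints, e.g. in Clang
//
//   #pragma clang loop vectorize_width(4) interleave_count(2)
//   for (int i = 0; i < n; ++i)
//     a[i] = b[i] + 1;
//
// which are recorded as llvm.loop.vectorize.width and
// llvm.loop.interleave.count metadata and read back by LoopVectorizeHints.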
9931
9932 // Plan how to best vectorize.
9933 LVP.plan(UserVF, UserIC);
9935 unsigned IC = 1;
9936
9937 if (ORE->allowExtraAnalysis(LV_NAME))
9939
9940 GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(), CM.CostKind);
9941 if (LVP.hasPlanWithVF(VF.Width)) {
9942 // Select the interleave count.
9943 IC = LVP.selectInterleaveCount(LVP.getPlanFor(VF.Width), VF.Width, VF.Cost);
9944
9945 unsigned SelectedIC = std::max(IC, UserIC);
9946 // Optimistically generate runtime checks if they are needed. Drop them if
9947 // they turn out to not be profitable.
9948 if (VF.Width.isVector() || SelectedIC > 1) {
9949 Checks.create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC);
9950
9951 // Bail out early if either the SCEV or memory runtime checks are known to
9952 // fail. In that case, the vector loop would never execute.
9953 using namespace llvm::PatternMatch;
9954 if (Checks.getSCEVChecks().first &&
9955 match(Checks.getSCEVChecks().first, m_One()))
9956 return false;
9957 if (Checks.getMemRuntimeChecks().first &&
9958 match(Checks.getMemRuntimeChecks().first, m_One()))
9959 return false;
9960 }
9961
9962 // Check if it is profitable to vectorize with runtime checks.
9963 bool ForceVectorization =
9965 VPCostContext CostCtx(CM.TTI, *CM.TLI, LVP.getPlanFor(VF.Width), CM,
9966 CM.CostKind, *CM.PSE.getSE(), L);
9967 if (!ForceVectorization &&
9968 !isOutsideLoopWorkProfitable(Checks, VF, L, PSE, CostCtx,
9969 LVP.getPlanFor(VF.Width), SEL,
9970 CM.getVScaleForTuning())) {
9971 ORE->emit([&]() {
9973 DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),
9974 L->getHeader())
9975 << "loop not vectorized: cannot prove it is safe to reorder "
9976 "memory operations";
9977 });
9978 LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
9979 Hints.emitRemarkWithHints();
9980 return false;
9981 }
9982 }
9983
9984 // Identify the diagnostic messages that should be produced.
9985 std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
9986 bool VectorizeLoop = true, InterleaveLoop = true;
9987 if (VF.Width.isScalar()) {
9988 LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
9989 VecDiagMsg = {
9990 "VectorizationNotBeneficial",
9991 "the cost-model indicates that vectorization is not beneficial"};
9992 VectorizeLoop = false;
9993 }
9994
9995 if (UserIC == 1 && Hints.getInterleave() > 1) {
9997 "UserIC should only be ignored due to unsafe dependencies");
9998 LLVM_DEBUG(dbgs() << "LV: Ignoring user-specified interleave count.\n");
9999 IntDiagMsg = {"InterleavingUnsafe",
10000 "Ignoring user-specified interleave count due to possibly "
10001 "unsafe dependencies in the loop."};
10002 InterleaveLoop = false;
10003 } else if (!LVP.hasPlanWithVF(VF.Width) && UserIC > 1) {
10004 // Tell the user interleaving was avoided up-front, despite being explicitly
10005 // requested.
10006 LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
10007 "interleaving should be avoided up front\n");
10008 IntDiagMsg = {"InterleavingAvoided",
10009 "Ignoring UserIC, because interleaving was avoided up front"};
10010 InterleaveLoop = false;
10011 } else if (IC == 1 && UserIC <= 1) {
10012 // Tell the user interleaving is not beneficial.
10013 LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
10014 IntDiagMsg = {
10015 "InterleavingNotBeneficial",
10016 "the cost-model indicates that interleaving is not beneficial"};
10017 InterleaveLoop = false;
10018 if (UserIC == 1) {
10019 IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
10020 IntDiagMsg.second +=
10021 " and is explicitly disabled or interleave count is set to 1";
10022 }
10023 } else if (IC > 1 && UserIC == 1) {
10024 // Tell the user interleaving is beneficial, but it is explicitly disabled.
10025 LLVM_DEBUG(dbgs() << "LV: Interleaving is beneficial but is explicitly "
10026 "disabled.\n");
10027 IntDiagMsg = {"InterleavingBeneficialButDisabled",
10028 "the cost-model indicates that interleaving is beneficial "
10029 "but is explicitly disabled or interleave count is set to 1"};
10030 InterleaveLoop = false;
10031 }
10032
10033 // If there is a histogram in the loop, do not just interleave without
10034 // vectorizing. The order of operations will be incorrect without the
10035 // histogram intrinsics, which are only used for recipes with VF > 1.
10036 if (!VectorizeLoop && InterleaveLoop && LVL.hasHistograms()) {
10037 LLVM_DEBUG(dbgs() << "LV: Not interleaving without vectorization due "
10038 << "to histogram operations.\n");
10039 IntDiagMsg = {
10040 "HistogramPreventsScalarInterleaving",
10041 "Unable to interleave without vectorization due to constraints on "
10042 "the order of histogram operations"};
10043 InterleaveLoop = false;
10044 }
10045
10046 // Override IC if user provided an interleave count.
10047 IC = UserIC > 0 ? UserIC : IC;
10048
10049 // Emit diagnostic messages, if any.
10050 const char *VAPassName = Hints.vectorizeAnalysisPassName();
10051 if (!VectorizeLoop && !InterleaveLoop) {
10052 // Do not vectorize or interleave the loop.
10053 ORE->emit([&]() {
10054 return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
10055 L->getStartLoc(), L->getHeader())
10056 << VecDiagMsg.second;
10057 });
10058 ORE->emit([&]() {
10059 return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
10060 L->getStartLoc(), L->getHeader())
10061 << IntDiagMsg.second;
10062 });
10063 return false;
10064 }
10065
10066 if (!VectorizeLoop && InterleaveLoop) {
10067 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10068 ORE->emit([&]() {
10069 return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
10070 L->getStartLoc(), L->getHeader())
10071 << VecDiagMsg.second;
10072 });
10073 } else if (VectorizeLoop && !InterleaveLoop) {
10074 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10075 << ") in " << L->getLocStr() << '\n');
10076 ORE->emit([&]() {
10077 return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
10078 L->getStartLoc(), L->getHeader())
10079 << IntDiagMsg.second;
10080 });
10081 } else if (VectorizeLoop && InterleaveLoop) {
10082 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10083 << ") in " << L->getLocStr() << '\n');
10084 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10085 }
10086
10087 // Report the vectorization decision.
10088 if (VF.Width.isScalar()) {
10089 using namespace ore;
10090 assert(IC > 1);
10091 ORE->emit([&]() {
10092 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
10093 L->getHeader())
10094 << "interleaved loop (interleaved count: "
10095 << NV("InterleaveCount", IC) << ")";
10096 });
10097 } else {
10098 // Report the vectorization decision.
10099 reportVectorization(ORE, L, VF, IC);
10100 }
10101 if (ORE->allowExtraAnalysis(LV_NAME))
10103
10104 // If we decided that it is *legal* to interleave or vectorize the loop, then
10105 // do it.
10106
10107 VPlan &BestPlan = LVP.getPlanFor(VF.Width);
10108 // Consider vectorizing the epilogue too if it's profitable.
10109 VectorizationFactor EpilogueVF =
10111 if (EpilogueVF.Width.isVector()) {
10112 std::unique_ptr<VPlan> BestMainPlan(BestPlan.duplicate());
10113
10114 // The first pass vectorizes the main loop and creates a scalar epilogue
10115 // to be vectorized by executing the plan (potentially with a different
10116 // factor) again shortly afterwards.
10117 VPlan &BestEpiPlan = LVP.getPlanFor(EpilogueVF.Width);
10118 BestEpiPlan.getMiddleBlock()->setName("vec.epilog.middle.block");
10119 BestEpiPlan.getVectorPreheader()->setName("vec.epilog.ph");
10120 preparePlanForMainVectorLoop(*BestMainPlan, BestEpiPlan);
10121 EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1,
10122 BestEpiPlan);
10123 EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TTI, AC, EPI, &CM, BFI,
10124 PSI, Checks, *BestMainPlan);
10125 auto ExpandedSCEVs = LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF,
10126 *BestMainPlan, MainILV, DT, false);
10127 ++LoopsVectorized;
10128
10129 // Second pass vectorizes the epilogue and adjusts the control flow
10130 // edges from the first pass.
10131 EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TTI, AC, EPI, &CM,
10132 BFI, PSI, Checks, BestEpiPlan);
10134 BestEpiPlan, L, ExpandedSCEVs, EPI, CM, *PSE.getSE());
10135 LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV, DT,
10136 true);
10137 connectEpilogueVectorLoop(BestEpiPlan, L, EPI, DT, LVL, ExpandedSCEVs,
10138 Checks, InstsToMove);
10139 ++LoopsEpilogueVectorized;
10140 } else {
10141 InnerLoopVectorizer LB(L, PSE, LI, DT, TTI, AC, VF.Width, IC, &CM, BFI, PSI,
10142 Checks, BestPlan);
10143 // TODO: Move to general VPlan pipeline once epilogue loops are also
10144 // supported.
10147 IC, PSE);
10148 LVP.addMinimumIterationCheck(BestPlan, VF.Width, IC,
10150
10151 LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false);
10152 ++LoopsVectorized;
10153 }
10154
10155 assert(DT->verify(DominatorTree::VerificationLevel::Fast) &&
10156 "DT not preserved correctly");
10157 assert(!verifyFunction(*F, &dbgs()));
10158
10159 return true;
10160}
10161
10163
10164 // Don't attempt if
10165 // 1. the target claims to have no vector registers, and
10166 // 2. interleaving won't help ILP.
10167 //
10168 // The second condition is necessary because, even if the target has no
10169 // vector registers, loop vectorization may still enable scalar
10170 // interleaving.
10171 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
10172 TTI->getMaxInterleaveFactor(ElementCount::getFixed(1)) < 2)
10173 return LoopVectorizeResult(false, false);
10174
10175 bool Changed = false, CFGChanged = false;
10176
10177 // The vectorizer requires loops to be in simplified form.
10178 // Since simplification may add new inner loops, it has to run before the
10179 // legality and profitability checks. This means running the loop vectorizer
10180 // will simplify all loops, regardless of whether anything ends up being
10181 // vectorized.
10182 for (const auto &L : *LI)
10183 Changed |= CFGChanged |=
10184 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
10185
10186 // Build up a worklist of inner-loops to vectorize. This is necessary as
10187 // the act of vectorizing or partially unrolling a loop creates new loops
10188 // and can invalidate iterators across the loops.
10189 SmallVector<Loop *, 8> Worklist;
10190
10191 for (Loop *L : *LI)
10192 collectSupportedLoops(*L, LI, ORE, Worklist);
10193
10194 LoopsAnalyzed += Worklist.size();
10195
10196 // Now walk the identified inner loops.
10197 while (!Worklist.empty()) {
10198 Loop *L = Worklist.pop_back_val();
10199
10200 // For the inner loops we actually process, form LCSSA to simplify the
10201 // transform.
10202 Changed |= formLCSSARecursively(*L, *DT, LI, SE);
10203
10204 Changed |= CFGChanged |= processLoop(L);
10205
10206 if (Changed) {
10207 LAIs->clear();
10208
10209#ifndef NDEBUG
10210 if (VerifySCEV)
10211 SE->verify();
10212#endif
10213 }
10214 }
10215
10216 // Process each loop nest in the function.
10217 return LoopVectorizeResult(Changed, CFGChanged);
10218}
10219
10222 LI = &AM.getResult<LoopAnalysis>(F);
10223 // There are no loops in the function. Return before computing other
10224 // expensive analyses.
10225 if (LI->empty())
10226 return PreservedAnalyses::all();
10235 AA = &AM.getResult<AAManager>(F);
10236
10237 auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
10238 PSI = MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
10239 BFI = nullptr;
10240 if (PSI && PSI->hasProfileSummary())
10242 LoopVectorizeResult Result = runImpl(F);
10243 if (!Result.MadeAnyChange)
10244 return PreservedAnalyses::all();
10246
10247 if (isAssignmentTrackingEnabled(*F.getParent())) {
10248 for (auto &BB : F)
10250 }
10251
10252 PA.preserve<LoopAnalysis>();
10256
10257 if (Result.MadeCFGChange) {
10258 // Making CFG changes likely means a loop got vectorized. Indicate that
10259 // extra simplification passes should be run.
10260 // TODO: MadeCFGChange is not a perfect proxy. Extra passes should only
10261 // be run if runtime checks have been added.
10264 } else {
10266 }
10267 return PA;
10268}
10269
10271 raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
10272 static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
10273 OS, MapClassName2PassName);
10274
10275 OS << '<';
10276 OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
10277 OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
10278 OS << '>';
10279}
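// For reference, the fragment printed above round-trips through the new pass
// manager's textual pipeline syntax, e.g. (illustrative invocation):
//
//   opt -passes='loop-vectorize<no-interleave-forced-only;no-vectorize-forced-only>' \
//       -S input.ll
//
// matching the two boolean options emitted by printPipeline.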
static cl::opt< unsigned, true > VectorizationFactor("force-vector-width", cl::Hidden, cl::desc("Sets the SIMD width. Zero is autoselect."), cl::location(VectorizerParams::VectorizationFactor))
This header provides classes for managing per-loop analyses.
static cl::opt< bool > WidenIV("loop-flatten-widen-iv", cl::Hidden, cl::init(true), cl::desc("Widen the loop induction variables, if possible, so " "overflow checks won't reject flattening"))
static const char * VerboseDebug
#define LV_NAME
This file defines the LoopVectorizationLegality class.
This file provides a LoopVectorizationPlanner class.
static void collectSupportedLoops(Loop &L, LoopInfo *LI, OptimizationRemarkEmitter *ORE, SmallVectorImpl< Loop * > &V)
static cl::opt< unsigned > EpilogueVectorizationMinVF("epilogue-vectorization-minimum-VF", cl::Hidden, cl::desc("Only loops with vectorization factor equal to or larger than " "the specified value are considered for epilogue vectorization."))
static cl::opt< unsigned > EpilogueVectorizationForceVF("epilogue-vectorization-force-VF", cl::init(1), cl::Hidden, cl::desc("When epilogue vectorization is enabled, and a value greater than " "1 is specified, forces the given VF for all applicable epilogue " "loops."))
static Type * maybeVectorizeType(Type *Ty, ElementCount VF)
static ElementCount determineVPlanVF(const TargetTransformInfo &TTI, LoopVectorizationCostModel &CM)
static ElementCount getSmallConstantTripCount(ScalarEvolution *SE, const Loop *L)
A version of ScalarEvolution::getSmallConstantTripCount that returns an ElementCount to include loops...
static cl::opt< unsigned > VectorizeMemoryCheckThreshold("vectorize-memory-check-threshold", cl::init(128), cl::Hidden, cl::desc("The maximum allowed number of runtime memory checks"))
static void preparePlanForMainVectorLoop(VPlan &MainPlan, VPlan &EpiPlan)
Prepare MainPlan for vectorizing the main vector loop during epilogue vectorization.
static cl::opt< unsigned > TinyTripCountVectorThreshold("vectorizer-min-trip-count", cl::init(16), cl::Hidden, cl::desc("Loops with a constant trip count that is smaller than this " "value are vectorized only if no scalar iteration overheads " "are incurred."))
Loops with a known constant trip count below this number are vectorized only if no scalar iteration o...
static void debugVectorizationMessage(const StringRef Prefix, const StringRef DebugMsg, Instruction *I)
Write a DebugMsg about vectorization to the debug output stream.
static cl::opt< bool > EnableCondStoresVectorization("enable-cond-stores-vec", cl::init(true), cl::Hidden, cl::desc("Enable if predication of stores during vectorization."))
static void legacyCSE(BasicBlock *BB)
FIXME: This legacy common-subexpression-elimination routine is scheduled for removal,...
static VPIRBasicBlock * replaceVPBBWithIRVPBB(VPBasicBlock *VPBB, BasicBlock *IRBB, VPlan *Plan=nullptr)
Replace VPBB with a VPIRBasicBlock wrapping IRBB.
static Value * emitTransformedIndex(IRBuilderBase &B, Value *Index, Value *StartValue, Value *Step, InductionDescriptor::InductionKind InductionKind, const BinaryOperator *InductionBinOp)
Compute the transformed value of Index at offset StartValue using step StepValue.
static DebugLoc getDebugLocFromInstOrOperands(Instruction *I)
Look for a meaningful debug location on the instruction or its operands.
static Value * createInductionAdditionalBypassValues(PHINode *OrigPhi, const InductionDescriptor &II, IRBuilder<> &BypassBuilder, const SCEV2ValueTy &ExpandedSCEVs, Value *MainVectorTripCount, Instruction *OldInduction)
static void fixReductionScalarResumeWhenVectorizingEpilog(VPPhi *EpiResumePhiR, PHINode &EpiResumePhi, BasicBlock *BypassBlock)
static Value * getStartValueFromReductionResult(VPInstruction *RdxResult)
static cl::opt< bool > ForceTargetSupportsScalableVectors("force-target-supports-scalable-vectors", cl::init(false), cl::Hidden, cl::desc("Pretend that scalable vectors are supported, even if the target does " "not support them. This flag should only be used for testing."))
static bool useActiveLaneMaskForControlFlow(TailFoldingStyle Style)
static VPWidenIntOrFpInductionRecipe * createWidenInductionRecipes(VPInstruction *PhiR, const InductionDescriptor &IndDesc, VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop)
Creates a VPWidenIntOrFpInductionRecipe for PhiR.
static cl::opt< bool > EnableEarlyExitVectorization("enable-early-exit-vectorization", cl::init(true), cl::Hidden, cl::desc("Enable vectorization of early exit loops with uncountable exits."))
static cl::opt< bool > ConsiderRegPressure("vectorizer-consider-reg-pressure", cl::init(false), cl::Hidden, cl::desc("Discard VFs if their register pressure is too high."))
static unsigned estimateElementCount(ElementCount VF, std::optional< unsigned > VScale)
This function attempts to return a value that represents the ElementCount at runtime.
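Conceptually, a fixed VF already is the runtime element count, while a scalable VF must be scaled by an estimate of vscale. A hedged sketch of that idea (names are illustrative):
// Sketch: turn an ElementCount into a plain estimate of elements processed
// per vector iteration. For scalable VFs, fall back to vscale = 1 when no
// tuning value is known (an assumption made for illustration).
static unsigned estimateElements(ElementCount VF,
                                 std::optional<unsigned> VScale) {
  unsigned Estimate = VF.getKnownMinValue();
  if (VF.isScalable())
    Estimate *= VScale.value_or(1);
  return Estimate;
}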
static constexpr uint32_t MinItersBypassWeights[]
static cl::opt< unsigned > ForceTargetNumScalarRegs("force-target-num-scalar-regs", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's number of scalar registers."))
static cl::opt< bool > UseWiderVFIfCallVariantsPresent("vectorizer-maximize-bandwidth-for-vector-calls", cl::init(true), cl::Hidden, cl::desc("Try wider VFs if they enable the use of vector variants"))
static std::optional< unsigned > getMaxVScale(const Function &F, const TargetTransformInfo &TTI)
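A plausible shape for this query, consistent with the TTI hook and the function's vscale_range attribute; a sketch, not necessarily the exact implementation:
// Sketch: prefer the target's answer, then the IR-level vscale_range
// attribute, otherwise report "unknown".
static std::optional<unsigned> maxVScaleSketch(const Function &F,
                                               const TargetTransformInfo &TTI) {
  if (std::optional<unsigned> MaxVScale = TTI.getMaxVScale())
    return MaxVScale;
  if (F.hasFnAttribute(Attribute::VScaleRange))
    return F.getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax();
  return std::nullopt;
}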
static cl::opt< unsigned > SmallLoopCost("small-loop-cost", cl::init(20), cl::Hidden, cl::desc("The cost of a loop that is considered 'small' by the interleaver."))
static void connectEpilogueVectorLoop(VPlan &EpiPlan, Loop *L, EpilogueLoopVectorizationInfo &EPI, DominatorTree *DT, LoopVectorizationLegality &LVL, DenseMap< const SCEV *, Value * > &ExpandedSCEVs, GeneratedRTChecks &Checks, ArrayRef< Instruction * > InstsToMove)
Connect the epilogue vector loop generated for EpiPlan to the main vector loop.
static bool planContainsAdditionalSimplifications(VPlan &Plan, VPCostContext &CostCtx, Loop *TheLoop, ElementCount VF)
Return true if the original loop TheLoop contains any instructions that do not have corresponding r...
static cl::opt< unsigned > ForceTargetNumVectorRegs("force-target-num-vector-regs", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's number of vector registers."))
static bool isExplicitVecOuterLoop(Loop *OuterLp, OptimizationRemarkEmitter *ORE)
static cl::opt< bool > EnableIndVarRegisterHeur("enable-ind-var-reg-heur", cl::init(true), cl::Hidden, cl::desc("Count the induction variable only once when interleaving"))
static cl::opt< TailFoldingStyle > ForceTailFoldingStyle("force-tail-folding-style", cl::desc("Force the tail folding style"), cl::init(TailFoldingStyle::None), cl::values(clEnumValN(TailFoldingStyle::None, "none", "Disable tail folding"), clEnumValN(TailFoldingStyle::Data, "data", "Create lane mask for data only, using active.lane.mask intrinsic"), clEnumValN(TailFoldingStyle::DataWithoutLaneMask, "data-without-lane-mask", "Create lane mask with compare/stepvector"), clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control", "Create lane mask using active.lane.mask intrinsic, and use " "it for both data and control flow"), clEnumValN(TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck, "data-and-control-without-rt-check", "Similar to data-and-control, but remove the runtime check"), clEnumValN(TailFoldingStyle::DataWithEVL, "data-with-evl", "Use predicated EVL instructions for tail folding. If EVL " "is unsupported, fallback to data-without-lane-mask.")))
static cl::opt< bool > EnableEpilogueVectorization("enable-epilogue-vectorization", cl::init(true), cl::Hidden, cl::desc("Enable vectorization of epilogue loops."))
static ScalarEpilogueLowering getScalarEpilogueLowering(Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, LoopVectorizationLegality &LVL, InterleavedAccessInfo *IAI)
static cl::opt< bool > PreferPredicatedReductionSelect("prefer-predicated-reduction-select", cl::init(false), cl::Hidden, cl::desc("Prefer predicating a reduction operation over an after loop select."))
static cl::opt< bool > PreferInLoopReductions("prefer-inloop-reductions", cl::init(false), cl::Hidden, cl::desc("Prefer in-loop vector reductions, " "overriding the targets preference."))
static SmallVector< Instruction * > preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L, const SCEV2ValueTy &ExpandedSCEVs, EpilogueLoopVectorizationInfo &EPI, LoopVectorizationCostModel &CM, ScalarEvolution &SE)
Prepare Plan for vectorizing the epilogue loop.
static cl::opt< bool > EnableLoadStoreRuntimeInterleave("enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden, cl::desc("Enable runtime interleaving until load/store ports are saturated"))
static cl::opt< bool > VPlanBuildStressTest("vplan-build-stress-test", cl::init(false), cl::Hidden, cl::desc("Build VPlan for every supported loop nest in the function and bail " "out right after the build (stress test the VPlan H-CFG construction " "in the VPlan-native vectorization path)."))
static bool hasIrregularType(Type *Ty, const DataLayout &DL)
A helper function that returns true if the given type is irregular.
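"Irregular" here means the type's in-memory allocation size differs from its nominal bit size, so consecutive accesses cannot simply be widened. One plausible check under that reading (a sketch, not a quote of the helper):
// Sketch: a type is "irregular" if padding makes its allocation size differ
// from its nominal size (e.g. i1 or i48), which breaks simple widening.
static bool hasIrregularTypeSketch(Type *Ty, const DataLayout &DL) {
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}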
static cl::opt< bool > LoopVectorizeWithBlockFrequency("loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden, cl::desc("Enable the use of the block frequency analysis to access PGO " "heuristics minimizing code growth in cold regions and being more " "aggressive in hot regions."))
static std::optional< ElementCount > getSmallBestKnownTC(PredicatedScalarEvolution &PSE, Loop *L, bool CanUseConstantMax=true)
Returns "best known" trip count, which is either a valid positive trip count or std::nullopt when an ...
static Value * getExpandedStep(const InductionDescriptor &ID, const SCEV2ValueTy &ExpandedSCEVs)
Return the expanded step for ID using ExpandedSCEVs to look up SCEV expansion results.
static bool useActiveLaneMask(TailFoldingStyle Style)
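Given the TailFoldingStyle values enumerated for -force-tail-folding-style above, this predicate and its control-flow counterpart useActiveLaneMaskForControlFlow reduce to simple style checks. A hedged sketch based on those listed styles:
// Sketch: a lane mask is used for the "data" and "data-and-control" styles,
// and drives control flow only for the latter two. Function names are
// illustrative.
static bool usesLaneMask(TailFoldingStyle Style) {
  return Style == TailFoldingStyle::Data ||
         Style == TailFoldingStyle::DataAndControlFlow ||
         Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
}
static bool usesLaneMaskForControlFlow(TailFoldingStyle Style) {
  return Style == TailFoldingStyle::DataAndControlFlow ||
         Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
}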
static bool hasReplicatorRegion(VPlan &Plan)
static bool isIndvarOverflowCheckKnownFalse(const LoopVectorizationCostModel *Cost, ElementCount VF, std::optional< unsigned > UF=std::nullopt)
For the given VF and UF and maximum trip count computed for the loop, return whether the induction va...
static void addFullyUnrolledInstructionsToIgnore(Loop *L, const LoopVectorizationLegality::InductionList &IL, SmallPtrSetImpl< Instruction * > &InstsToIgnore)
Knowing that loop L executes a single vector iteration, add instructions that will get simplified and...
static cl::opt< PreferPredicateTy::Option > PreferPredicateOverEpilogue("prefer-predicate-over-epilogue", cl::init(PreferPredicateTy::ScalarEpilogue), cl::Hidden, cl::desc("Tail-folding and predication preferences over creating a scalar " "epilogue loop."), cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue, "scalar-epilogue", "Don't tail-predicate loops, create scalar epilogue"), clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue, "predicate-else-scalar-epilogue", "prefer tail-folding, create scalar epilogue if tail " "folding fails."), clEnumValN(PreferPredicateTy::PredicateOrDontVectorize, "predicate-dont-vectorize", "prefers tail-folding, don't attempt vectorization if " "tail-folding fails.")))
static cl::opt< bool > EnableInterleavedMemAccesses("enable-interleaved-mem-accesses", cl::init(false), cl::Hidden, cl::desc("Enable vectorization on interleaved memory accesses in a loop"))
static cl::opt< bool > EnableMaskedInterleavedMemAccesses("enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden, cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"))
An interleave-group may need masking if it resides in a block that needs predication,...
static cl::opt< bool > ForceOrderedReductions("force-ordered-reductions", cl::init(false), cl::Hidden, cl::desc("Enable the vectorisation of loops with in-order (strict) " "FP reductions"))
static const SCEV * getAddressAccessSCEV(Value *Ptr, LoopVectorizationLegality *Legal, PredicatedScalarEvolution &PSE, const Loop *TheLoop)
Gets Address Access SCEV after verifying that the access pattern is loop invariant except the inducti...
static cl::opt< cl::boolOrDefault > ForceSafeDivisor("force-widen-divrem-via-safe-divisor", cl::Hidden, cl::desc("Override cost based safe divisor widening for div/rem instructions"))
static InstructionCost calculateEarlyExitCost(VPCostContext &CostCtx, VPlan &Plan, ElementCount VF)
For loops with uncountable early exits, find the cost of doing work when exiting the loop early,...
static cl::opt< unsigned > ForceTargetMaxVectorInterleaveFactor("force-target-max-vector-interleave", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's max interleave factor for " "vectorized loops."))
static bool processLoopInVPlanNativePath(Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints, LoopVectorizationRequirements &Requirements)
static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI)
static cl::opt< unsigned > NumberOfStoresToPredicate("vectorize-num-stores-pred", cl::init(1), cl::Hidden, cl::desc("Max number of stores to be predicated behind an if."))
The number of stores in a loop that are allowed to need predication.
static cl::opt< unsigned > MaxNestedScalarReductionIC("max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden, cl::desc("The maximum interleave count to use when interleaving a scalar " "reduction in a nested loop."))
static cl::opt< unsigned > ForceTargetMaxScalarInterleaveFactor("force-target-max-scalar-interleave", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's max interleave factor for " "scalar loops."))
static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE)
static bool willGenerateVectors(VPlan &Plan, ElementCount VF, const TargetTransformInfo &TTI)
Check if any recipe of Plan will generate a vector value, which will be assigned a vector register.
static bool isOutsideLoopWorkProfitable(GeneratedRTChecks &Checks, VectorizationFactor &VF, Loop *L, PredicatedScalarEvolution &PSE, VPCostContext &CostCtx, VPlan &Plan, ScalarEpilogueLowering SEL, std::optional< unsigned > VScale)
This function determines whether or not it's still profitable to vectorize the loop given the extra w...
static void fixScalarResumeValuesFromBypass(BasicBlock *BypassBlock, Loop *L, VPlan &BestEpiPlan, LoopVectorizationLegality &LVL, const SCEV2ValueTy &ExpandedSCEVs, Value *MainVectorTripCount)
static cl::opt< bool > MaximizeBandwidth("vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden, cl::desc("Maximize bandwidth when selecting vectorization factor which " "will be determined by the smallest type in loop."))
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName, StringRef RemarkName, Loop *TheLoop, Instruction *I, DebugLoc DL={})
Create an analysis remark that explains why vectorization failed.
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
This file implements a map that provides insertion order iteration.
This file contains the declarations for metadata subclasses.
#define T
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))
uint64_t IntrinsicInst * II
#define P(N)
This file contains the declarations for profiling metadata utility functions.
const SmallVectorImpl< MachineOperand > & Cond
static BinaryOperator * CreateMul(Value *S1, Value *S2, const Twine &Name, BasicBlock::iterator InsertBefore, Value *FlagsOp)
static BinaryOperator * CreateAdd(Value *S1, Value *S2, const Twine &Name, BasicBlock::iterator InsertBefore, Value *FlagsOp)
static bool isValid(const char C)
Returns true if C is a valid mangled character: <0-9a-zA-Z_>.
static InstructionCost getScalarizationOverhead(const TargetTransformInfo &TTI, Type *ScalarTy, VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={})
This is similar to TargetTransformInfo::getScalarizationOverhead, but if ScalarTy is a FixedVectorTyp...
This file contains some templates that are useful if you are working with the STL at all.
#define OP(OPC)
Definition Instruction.h:46
This file defines the SmallPtrSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
#define LLVM_DEBUG(...)
Definition Debug.h:114
#define DEBUG_WITH_TYPE(TYPE,...)
DEBUG_WITH_TYPE macro - This macro should be used by passes to emit debug information.
Definition Debug.h:72
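These statistics and debug macros follow the usual LLVM pattern: define DEBUG_TYPE (this pass uses LV_NAME, its "loop-vectorize" debug-type string) and then count events with STATISTIC and guard messages with LLVM_DEBUG. An illustrative sketch; the statistic name and message are examples, not quotes from the pass:
// Illustrative sketch of the STATISTIC / LLVM_DEBUG pattern.
#define DEBUG_TYPE "loop-vectorize"
#include "llvm/ADT/Statistic.h"
#include "llvm/Support/Debug.h"
STATISTIC(NumLoopsWidened, "Number of loops widened (example counter)");
static void reportVectorized() {
  ++NumLoopsWidened;
  LLVM_DEBUG(llvm::dbgs() << "LV: vectorized a loop\n");
}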
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
This pass exposes codegen information to IR-level passes.
LocallyHashedType DenseMapInfo< LocallyHashedType >::Empty
This file implements the TypeSwitch template, which mimics a switch() statement whose cases are type ...
This file contains the declarations of different VPlan-related auxiliary helpers.
This file provides utility VPlan to VPlan transformations.
This file declares the class VPlanVerifier, which contains utility functions to check the consistency...
This file contains the declarations of the Vectorization Plan base classes:
static const char PassName[]
Value * RHS
Value * LHS
static const uint32_t IV[8]
Definition blake3_impl.h:83
A manager for alias analyses.
Class for arbitrary precision integers.
Definition APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition APInt.h:235
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1541
unsigned getActiveBits() const
Compute the number of active bits in the value.
Definition APInt.h:1513
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition ArrayRef.h:143
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
LLVM_ABI unsigned getVScaleRangeMin() const
Returns the minimum value for the vscale_range attribute.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
iterator_range< const_phi_iterator > phis() const
Returns a range that iterates over the phis in the basic block.
Definition BasicBlock.h:528
LLVM_ABI const_iterator getFirstInsertionPt() const
Returns an iterator to the first instruction in this block that is suitable for inserting a non-PHI i...
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
LLVM_ABI InstListType::const_iterator getFirstNonPHIIt() const
Returns an iterator to the first instruction in this block that is not a PHINode instruction.
LLVM_ABI const BasicBlock * getSinglePredecessor() const
Return the predecessor of this block if it has a single predecessor block.
LLVM_ABI const BasicBlock * getSingleSuccessor() const
Return the successor of this block if it has a single successor.
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this basic block belongs to.
LLVM_ABI LLVMContext & getContext() const
Get the context in which this basic block lives.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition BasicBlock.h:233
BinaryOps getOpcode() const
Definition InstrTypes.h:374
Analysis pass which computes BlockFrequencyInfo.
BlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate IR basic block frequen...
Conditional or Unconditional Branch instruction.
bool isConditional() const
static BranchInst * Create(BasicBlock *IfTrue, InsertPosition InsertBefore=nullptr)
BasicBlock * getSuccessor(unsigned i) const
Represents analyses that only rely on functions' control flow.
Definition Analysis.h:73
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Value * getArgOperand(unsigned i) const
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Definition InstrTypes.h:982
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:676
@ ICMP_UGT
unsigned greater than
Definition InstrTypes.h:699
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:701
@ ICMP_NE
not equal
Definition InstrTypes.h:698
@ ICMP_ULE
unsigned less or equal
Definition InstrTypes.h:702
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition InstrTypes.h:789
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
This is the shared class of boolean and integer constants.
Definition Constants.h:87
static LLVM_ABI ConstantInt * getTrue(LLVMContext &Context)
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:63
A debug info location.
Definition DebugLoc.h:124
static DebugLoc getTemporary()
Definition DebugLoc.h:161
static DebugLoc getUnknown()
Definition DebugLoc.h:162
An analysis that produces DemandedBits for a function.
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition DenseMap.h:205
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:178
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:248
iterator end()
Definition DenseMap.h:81
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition DenseMap.h:169
void insert_range(Range &&R)
Inserts range of 'std::pair<KeyT, ValueT>' values into the map.
Definition DenseMap.h:286
Implements a dense probed hash-table based set.
Definition DenseSet.h:279
Analysis pass which computes a DominatorTree.
Definition Dominators.h:283
void changeImmediateDominator(DomTreeNodeBase< NodeT > *N, DomTreeNodeBase< NodeT > *NewIDom)
changeImmediateDominator - This method is used to update the dominator tree information when a node's...
void eraseNode(NodeT *BB)
eraseNode - Removes a node from the dominator tree.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:164
constexpr bool isVector() const
One or more elements.
Definition TypeSize.h:324
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition TypeSize.h:312
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition TypeSize.h:309
static constexpr ElementCount get(ScalarTy MinVal, bool Scalable)
Definition TypeSize.h:315
constexpr bool isScalar() const
Exactly one element.
Definition TypeSize.h:320
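ElementCount is how every vectorization factor in this file is expressed: a known minimum element count plus a scalable flag. A small usage sketch:
// Sketch: a fixed VF of 4 elements vs. a scalable VF of (vscale x 4) elements.
ElementCount FixedVF = ElementCount::getFixed(4);
ElementCount ScalableVF = ElementCount::getScalable(4);
// FixedVF.getKnownMinValue() == 4 and !FixedVF.isScalable();
// ScalableVF.isScalable() and ScalableVF.isVector().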
BasicBlock * createVectorizedLoopSkeleton() final
Implements the interface for creating a vectorized skeleton using the epilogue loop strategy (i....
EpilogueVectorizerEpilogueLoop(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetTransformInfo *TTI, AssumptionCache *AC, EpilogueLoopVectorizationInfo &EPI, LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &Checks, VPlan &Plan)
void printDebugTracesAtStart() override
Allow subclasses to override and print debug traces before/after vplan execution, when trace informat...
A specialized derived class of inner loop vectorizer that performs vectorization of main loops in the...
void introduceCheckBlockInVPlan(BasicBlock *CheckIRBB)
Introduces a new VPIRBasicBlock for CheckIRBB to Plan between the vector preheader and its predecesso...
BasicBlock * emitIterationCountCheck(BasicBlock *VectorPH, BasicBlock *Bypass, bool ForEpilogue)
Emits an iteration count bypass check once for the main loop (when ForEpilogue is false) and once for...
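The bypass check amounts to comparing the trip count against the number of elements one vector iteration consumes and branching to the scalar path when the count is too small. A hedged IRBuilder sketch; the wrapper, VFTimesUF, ScalarPH and VectorPH are placeholders, and the real check also handles tail folding and overflow subtleties:
// Sketch only: branch to ScalarPH when Count < VF * UF.
static void emitMinItersCheckSketch(IRBuilderBase &Builder, Value *Count,
                                    uint64_t VFTimesUF, BasicBlock *ScalarPH,
                                    BasicBlock *VectorPH) {
  Value *Step = ConstantInt::get(Count->getType(), VFTimesUF);
  Value *Check = Builder.CreateICmpULT(Count, Step, "min.iters.check");
  Builder.CreateCondBr(Check, ScalarPH, VectorPH);
}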
EpilogueVectorizerMainLoop(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetTransformInfo *TTI, AssumptionCache *AC, EpilogueLoopVectorizationInfo &EPI, LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &Check, VPlan &Plan)
Value * createIterationCountCheck(BasicBlock *VectorPH, ElementCount VF, unsigned UF) const
void printDebugTracesAtStart() override
Allow subclasses to override and print debug traces before/after vplan execution, when trace informat...
BasicBlock * createVectorizedLoopSkeleton() final
Implements the interface for creating a vectorized skeleton using the main loop strategy (i....
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:22
Class to represent function types.
param_iterator param_begin() const
param_iterator param_end() const
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition Function.h:209
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition Function.cpp:762
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition Function.cpp:727
Represents flags for the getelementptr instruction/expression.
static GEPNoWrapFlags none()
void applyUpdates(ArrayRef< UpdateT > Updates)
Submit updates to all available trees.
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
Definition IRBuilder.h:345
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2788
A struct for saving information about induction variables.
const SCEV * getStep() const
InductionKind
This enum represents the kinds of inductions that we support.
@ IK_NoInduction
Not an induction variable.
@ IK_FpInduction
Floating point induction variable.
@ IK_PtrInduction
Pointer induction var. Step = C.
@ IK_IntInduction
Integer induction variable. Step = C.
const SmallVectorImpl< Instruction * > & getCastInsts() const
Returns a reference to the type cast instructions in the induction update chain, that are redundant w...
Value * getStartValue() const
InnerLoopAndEpilogueVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetTransformInfo *TTI, AssumptionCache *AC, EpilogueLoopVectorizationInfo &EPI, LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &Checks, VPlan &Plan, ElementCount VecWidth, ElementCount MinProfitableTripCount, unsigned UnrollFactor)
EpilogueLoopVectorizationInfo & EPI
Holds and updates state information required to vectorize the main loop and its epilogue in two separ...
InnerLoopVectorizer vectorizes loops which contain only one basic block to a specified vectorization ...
virtual void printDebugTracesAtStart()
Allow subclasses to override and print debug traces before/after vplan execution, when trace informat...
Value * TripCount
Trip count of the original loop.
const TargetTransformInfo * TTI
Target Transform Info.
LoopVectorizationCostModel * Cost
The profitability analysis.
BlockFrequencyInfo * BFI
BFI and PSI are used to check for profile guided size optimizations.
Value * getTripCount() const
Returns the original loop trip count.
friend class LoopVectorizationPlanner
PredicatedScalarEvolution & PSE
A wrapper around ScalarEvolution used to add runtime SCEV checks.
LoopInfo * LI
Loop Info.
ProfileSummaryInfo * PSI
DominatorTree * DT
Dominator Tree.
void setTripCount(Value *TC)
Used to set the trip count after ILV's construction and after the preheader block has been executed.
void fixVectorizedLoop(VPTransformState &State)
Fix the vectorized code, taking care of header phi's, and more.
virtual BasicBlock * createVectorizedLoopSkeleton()
Creates a basic block for the scalar preheader.
virtual void printDebugTracesAtEnd()
AssumptionCache * AC
Assumption Cache.
InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetTransformInfo *TTI, AssumptionCache *AC, ElementCount VecWidth, unsigned UnrollFactor, LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks, VPlan &Plan)
IRBuilder Builder
The builder that we use.
void fixNonInductionPHIs(VPTransformState &State)
Fix the non-induction PHIs in Plan.
VPBasicBlock * VectorPHVPBB
The vector preheader block of Plan, used as target for check blocks introduced during skeleton creati...
unsigned UF
The vectorization unroll factor to use.
GeneratedRTChecks & RTChecks
Structure to hold information about generated runtime checks, responsible for cleaning the checks,...
virtual ~InnerLoopVectorizer()=default
ElementCount VF
The vectorization SIMD factor to use.
Loop * OrigLoop
The original loop.
BasicBlock * createScalarPreheader(StringRef Prefix)
Create and return a new IR basic block for the scalar preheader whose name is prefixed with Prefix.
InstSimplifyFolder - Use InstructionSimplify to fold operations to existing values.
static InstructionCost getInvalid(CostType Val=0)
static InstructionCost getMax()
CostType getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
bool isCast() const
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
LLVM_ABI const Module * getModule() const
Return the module owning the function this instruction belongs to or nullptr if the function does not...
LLVM_ABI void moveBefore(InstListType::iterator InsertPos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
bool isBinaryOp() const
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Instruction * user_back()
Specialize the methods defined in Value, as we know that an instruction can only be used by other ins...
LLVM_ABI FastMathFlags getFastMathFlags() const LLVM_READONLY
Convenience function for getting all the fast-math flags, which must be an operator which supports th...
const char * getOpcodeName() const
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Class to represent integer types.
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:319
LLVM_ABI APInt getMask() const
For example, this is 0xFF for an 8 bit integer, 0xFFFF for i16, etc.
Definition Type.cpp:343
The group of interleaved loads/stores sharing the same stride and close to each other.
uint32_t getFactor() const
InstTy * getMember(uint32_t Index) const
Get the member with the given index Index.
InstTy * getInsertPos() const
uint32_t getNumMembers() const
Drive the analysis of interleaved memory accesses in the loop.
bool requiresScalarEpilogue() const
Returns true if an interleaved group that may access memory out-of-bounds requires a scalar epilogue ...
LLVM_ABI void analyzeInterleaving(bool EnableMaskedInterleavedGroup)
Analyze the interleaved accesses and collect them in interleave groups.
An instruction for reading from memory.
Type * getPointerOperandType() const
This analysis provides dependence information for the memory accesses of a loop.
Drive the analysis of memory accesses in the loop.
const RuntimePointerChecking * getRuntimePointerChecking() const
unsigned getNumRuntimePointerChecks() const
Number of memchecks required to prove independence of otherwise may-alias pointers.
Analysis pass that exposes the LoopInfo for a function.
Definition LoopInfo.h:569
bool contains(const LoopT *L) const
Return true if the specified loop is contained within this loop.
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
bool isInnermost() const
Return true if the loop does not contain any (natural) loops.
void getExitingBlocks(SmallVectorImpl< BlockT * > &ExitingBlocks) const
Return all blocks inside the loop that have successors outside of the loop.
BlockT * getHeader() const
iterator_range< block_iterator > blocks() const
ArrayRef< BlockT * > getBlocks() const
Get a list of the basic blocks which make up this loop.
Store the result of a depth first search within basic blocks contained by a single loop.
RPOIterator beginRPO() const
Reverse iterate over the cached postorder blocks.
void perform(const LoopInfo *LI)
Traverse the loop blocks and store the DFS result.
RPOIterator endRPO() const
Wrapper class to LoopBlocksDFS that provides a standard begin()/end() interface for the DFS reverse p...
void perform(const LoopInfo *LI)
Traverse the loop blocks and store the DFS result.
void removeBlock(BlockT *BB)
This method completely removes BB from all data structures, including all of the Loop objects it is n...
LoopVectorizationCostModel - estimates the expected speedups due to vectorization.
SmallPtrSet< Type *, 16 > ElementTypesInLoop
All element types found in the loop.
bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment, unsigned AddressSpace) const
Returns true if the target machine supports masked load operation for the given DataType and kind of ...
LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, LoopVectorizationLegality *Legal, const TargetTransformInfo &TTI, const TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, const Function *F, const LoopVectorizeHints *Hints, InterleavedAccessInfo &IAI, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI)
void collectElementTypesForWidening()
Collect all element types in the loop for which widening is needed.
bool canVectorizeReductions(ElementCount VF) const
Returns true if the target machine supports all of the reduction variables found for the given VF.
bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment, unsigned AddressSpace) const
Returns true if the target machine supports masked store operation for the given DataType and kind of...
bool isEpilogueVectorizationProfitable(const ElementCount VF, const unsigned IC) const
Returns true if epilogue vectorization is considered profitable, and false otherwise.
bool isPredicatedInst(Instruction *I) const
Returns true if I is an instruction that needs to be predicated at runtime.
void collectValuesToIgnore()
Collect values we want to ignore in the cost model.
void collectInLoopReductions()
Split reductions into those that happen in the loop, and those that happen outside.
std::pair< unsigned, unsigned > getSmallestAndWidestTypes()
bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const
Returns true if I is known to be uniform after vectorization.
void collectNonVectorizedAndSetWideningDecisions(ElementCount VF)
Collect values that will not be widened, including Uniforms, Scalars, and Instructions to Scalarize f...
PredicatedScalarEvolution & PSE
Predicated scalar evolution analysis.
const LoopVectorizeHints * Hints
Loop Vectorize Hint.
std::optional< unsigned > getMaxSafeElements() const
Return maximum safe number of elements to be processed per vector iteration, which do not prevent sto...
const TargetTransformInfo & TTI
Vector target information.
LoopVectorizationLegality * Legal
Vectorization legality.
std::optional< InstructionCost > getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy) const
Return the cost of instructions in an inloop reduction pattern, if I is part of that pattern.
InstructionCost getInstructionCost(Instruction *I, ElementCount VF)
Returns the execution time cost of an instruction for a given vector width.
DemandedBits * DB
Demanded bits analysis.
bool interleavedAccessCanBeWidened(Instruction *I, ElementCount VF) const
Returns true if I is a memory instruction in an interleaved-group of memory accesses that can be vect...
const TargetLibraryInfo * TLI
Target Library Info.
bool memoryInstructionCanBeWidened(Instruction *I, ElementCount VF)
Returns true if I is a memory instruction with consecutive memory access that can be widened.
const InterleaveGroup< Instruction > * getInterleavedAccessGroup(Instruction *Instr) const
Get the interleaved access group that Instr belongs to.
InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const
Estimate cost of an intrinsic call instruction CI if it were vectorized with factor VF.
bool OptForSize
Whether this loop should be optimized for size based on function attribute or profile information.
bool useMaxBandwidth(TargetTransformInfo::RegisterKind RegKind)
bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const
Returns true if I is known to be scalar after vectorization.
bool isOptimizableIVTruncate(Instruction *I, ElementCount VF)
Return True if instruction I is an optimizable truncate whose operand is an induction variable.
FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC)
bool shouldConsiderRegPressureForVF(ElementCount VF)
Loop * TheLoop
The loop that we evaluate.
TTI::TargetCostKind CostKind
The kind of cost that we are calculating.
TailFoldingStyle getTailFoldingStyle(bool IVUpdateMayOverflow=true) const
Returns the TailFoldingStyle that is best for the current loop.
InterleavedAccessInfo & InterleaveInfo
The interleave access information contains groups of interleaved accesses with the same stride and cl...
SmallPtrSet< const Value *, 16 > ValuesToIgnore
Values to ignore in the cost model.
void setVectorizedCallDecision(ElementCount VF)
A call may be vectorized in different ways depending on whether we have vectorized variants available...
void invalidateCostModelingDecisions()
Invalidates decisions already taken by the cost model.
bool isAccessInterleaved(Instruction *Instr) const
Check if Instr belongs to any interleaved access group.
bool selectUserVectorizationFactor(ElementCount UserVF)
Setup cost-based decisions for user vectorization factor.
std::optional< unsigned > getVScaleForTuning() const
Return the value of vscale used for tuning the cost model.
OptimizationRemarkEmitter * ORE
Interface to emit optimization remarks.
LoopInfo * LI
Loop Info analysis.
bool requiresScalarEpilogue(bool IsVectorizing) const
Returns true if we're required to use a scalar epilogue for at least the final iteration of the origi...
SmallPtrSet< const Value *, 16 > VecValuesToIgnore
Values to ignore in the cost model when VF > 1.
unsigned getPredBlockCostDivisor(TargetTransformInfo::TargetCostKind CostKind, BasicBlock *BB) const
A helper function that returns how much we should divide the cost of a predicated block by.
bool isInLoopReduction(PHINode *Phi) const
Returns true if the Phi is part of an inloop reduction.
bool isProfitableToScalarize(Instruction *I, ElementCount VF) const
void setWideningDecision(const InterleaveGroup< Instruction > *Grp, ElementCount VF, InstWidening W, InstructionCost Cost)
Save vectorization decision W and Cost taken by the cost model for interleaving group Grp and vector ...
const MapVector< Instruction *, uint64_t > & getMinimalBitwidths() const
CallWideningDecision getCallWideningDecision(CallInst *CI, ElementCount VF) const
bool isLegalGatherOrScatter(Value *V, ElementCount VF)
Returns true if the target machine can represent V as a masked gather or scatter operation.
bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const
bool shouldConsiderInvariant(Value *Op)
Returns true if Op should be considered invariant and if it is trivially hoistable.
bool foldTailByMasking() const
Returns true if all loop blocks should be masked to fold tail loop.
bool foldTailWithEVL() const
Returns true if VP intrinsics with explicit vector length support should be generated in the tail fol...
bool usePredicatedReductionSelect() const
Returns true if the predicated reduction select should be used to set the incoming value for the redu...
bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const
Returns true if the instructions in this block requires predication for any reason,...
void setCallWideningDecision(CallInst *CI, ElementCount VF, InstWidening Kind, Function *Variant, Intrinsic::ID IID, std::optional< unsigned > MaskPos, InstructionCost Cost)
void setTailFoldingStyles(bool IsScalableVF, unsigned UserIC)
Selects and saves TailFoldingStyle for 2 options - if IV update may overflow or not.
AssumptionCache * AC
Assumption cache.
void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W, InstructionCost Cost)
Save vectorization decision W and Cost taken by the cost model for instruction I and vector width VF.
InstWidening
Decision that was taken during cost calculation for memory instruction.
bool isScalarWithPredication(Instruction *I, ElementCount VF) const
Returns true if I is an instruction which requires predication and for which our chosen predication s...
InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF) const
Estimate cost of a call instruction CI if it were vectorized with factor VF.
bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const
Returns true if we should use strict in-order reductions for the given RdxDesc.
std::pair< InstructionCost, InstructionCost > getDivRemSpeculationCost(Instruction *I, ElementCount VF) const
Return the costs for our two available strategies for lowering a div/rem operation which requires spe...
bool isDivRemScalarWithPredication(InstructionCost ScalarCost, InstructionCost SafeDivisorCost) const
Given costs for both strategies, return true if the scalar predication lowering should be used for di...
InstructionCost expectedCost(ElementCount VF)
Returns the expected execution cost.
void setCostBasedWideningDecision(ElementCount VF)
Memory access instruction may be vectorized in more than one way.
InstWidening getWideningDecision(Instruction *I, ElementCount VF) const
Return the cost model decision for the given instruction I and vector width VF.
FixedScalableVFPair MaxPermissibleVFWithoutMaxBW
The highest VF possible for this loop, without using MaxBandwidth.
bool isScalarEpilogueAllowed() const
Returns true if a scalar epilogue is not allowed due to optsize or a loop hint annotation.
InstructionCost getWideningCost(Instruction *I, ElementCount VF)
Return the vectorization cost for the given instruction I and vector width VF.
void collectInstsToScalarize(ElementCount VF)
Collects the instructions to scalarize for each predicated instruction in the loop.
LoopVectorizationLegality checks if it is legal to vectorize a loop, and to what vectorization factor...
MapVector< PHINode *, InductionDescriptor > InductionList
InductionList saves induction variables and maps them to the induction descriptor.
const SmallPtrSetImpl< const Instruction * > & getPotentiallyFaultingLoads() const
Returns potentially faulting loads.
bool canVectorize(bool UseVPlanNativePath)
Returns true if it is legal to vectorize this loop.
bool canVectorizeFPMath(bool EnableStrictReductions)
Returns true if it is legal to vectorize the FP math operations in this loop.
PHINode * getPrimaryInduction()
Returns the primary induction variable.
const SmallVector< BasicBlock *, 4 > & getCountableExitingBlocks() const
Returns all exiting blocks with a countable exit, i.e.
const InductionList & getInductionVars() const
Returns the induction variables found in the loop.
bool hasUncountableEarlyExit() const
Returns true if the loop has exactly one uncountable early exit, i.e.
bool hasHistograms() const
Returns true if the loop contains any known histogram operations.
const LoopAccessInfo * getLAI() const
Planner drives the vectorization process after having passed Legality checks.
VectorizationFactor selectEpilogueVectorizationFactor(const ElementCount MaxVF, unsigned IC)
VPlan & getPlanFor(ElementCount VF) const
Return the VPlan for VF.
Definition VPlan.cpp:1577
VectorizationFactor planInVPlanNativePath(ElementCount UserVF)
Use the VPlan-native path to plan how to best vectorize, return the best VF and its cost.
void updateLoopMetadataAndProfileInfo(Loop *VectorLoop, VPBasicBlock *HeaderVPBB, const VPlan &Plan, bool VectorizingEpilogue, MDNode *OrigLoopID, std::optional< unsigned > OrigAverageTripCount, unsigned OrigLoopInvocationWeight, unsigned EstimatedVFxUF, bool DisableRuntimeUnroll)
Update loop metadata and profile info for both the scalar remainder loop and VectorLoop,...
Definition VPlan.cpp:1628
void buildVPlans(ElementCount MinVF, ElementCount MaxVF)
Build VPlans for power-of-2 VF's between MinVF and MaxVF inclusive, according to the information gath...
Definition VPlan.cpp:1561
VectorizationFactor computeBestVF()
Compute and return the most profitable vectorization factor.
DenseMap< const SCEV *, Value * > executePlan(ElementCount VF, unsigned UF, VPlan &BestPlan, InnerLoopVectorizer &LB, DominatorTree *DT, bool VectorizingEpilogue)
Generate the IR code for the vectorized loop captured in VPlan BestPlan according to the best selecte...
unsigned selectInterleaveCount(VPlan &Plan, ElementCount VF, InstructionCost LoopCost)
void emitInvalidCostRemarks(OptimizationRemarkEmitter *ORE)
Emit remarks for recipes with invalid costs in the available VPlans.
static bool getDecisionAndClampRange(const std::function< bool(ElementCount)> &Predicate, VFRange &Range)
Test a Predicate on a Range of VF's.
Definition VPlan.cpp:1542
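This helper is the standard way recipe construction asks a per-VF question across a whole VF range, clamping the range at the first VF where the answer flips. A hedged usage sketch; CM, I and Range stand in for the cost model, the instruction being planned and the VFRange being clamped:
// Sketch: decide once for the whole clamped range whether instruction I
// must be scalarized with predication.
static bool needsPredicatedScalarization(LoopVectorizationCostModel &CM,
                                         Instruction *I, VFRange &Range) {
  return LoopVectorizationPlanner::getDecisionAndClampRange(
      [&](ElementCount VF) { return CM.isScalarWithPredication(I, VF); },
      Range);
}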
void printPlans(raw_ostream &O)
Definition VPlan.cpp:1706
void plan(ElementCount UserVF, unsigned UserIC)
Build VPlans for the specified UserVF and UserIC if they are non-zero or all applicable candidate VFs...
void addMinimumIterationCheck(VPlan &Plan, ElementCount VF, unsigned UF, ElementCount MinProfitableTripCount) const
Create a check in Plan to see if the vector loop should be executed based on its trip count.
bool hasPlanWithVF(ElementCount VF) const
Look through the existing plans and return true if we have one with vectorization factor VF.
This holds vectorization requirements that must be verified late in the process.
Utility class for getting and setting loop vectorizer hints in the form of loop metadata.
bool allowVectorization(Function *F, Loop *L, bool VectorizeOnlyWhenForced) const
void emitRemarkWithHints() const
Dumps all the hint information.
const char * vectorizeAnalysisPassName() const
If hints are provided that force vectorization, use the AlwaysPrint pass name to force the frontend t...
This class emits a version of the loop where run-time checks ensure that may-alias pointers can't ove...
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
bool hasLoopInvariantOperands(const Instruction *I) const
Return true if all the operands of the specified instruction are loop invariant.
Definition LoopInfo.cpp:67
DebugLoc getStartLoc() const
Return the debug location of the start of this loop.
Definition LoopInfo.cpp:632
bool isLoopInvariant(const Value *V) const
Return true if the specified value is loop invariant.
Definition LoopInfo.cpp:61
Metadata node.
Definition Metadata.h:1078
This class implements a map that also provides access to all stored values in a deterministic order.
Definition MapVector.h:36
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition MapVector.h:119
Function * getFunction(StringRef Name) const
Look up the specified function in the module symbol table.
Definition Module.cpp:230
Diagnostic information for optimization analysis remarks related to pointer aliasing.
Diagnostic information for optimization analysis remarks related to floating-point non-commutativity.
Diagnostic information for optimization analysis remarks.
The optimization diagnostic interface.
LLVM_ABI void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for missed-optimization remarks.
Diagnostic information for applied optimization remarks.
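The remark classes above are emitted through OptimizationRemarkEmitter::emit. A hedged sketch of reporting a missed vectorization; the remark name and message are illustrative, not the pass's actual remark identifiers:
// Sketch: report why a loop was not vectorized.
static void reportNotVectorized(OptimizationRemarkEmitter *ORE, Loop *L) {
  ORE->emit([&]() {
    return OptimizationRemarkMissed("loop-vectorize", "ExampleRemark",
                                    L->getStartLoc(), L->getHeader())
           << "loop not vectorized (example message)";
  });
}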
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
op_range incoming_values()
void setIncomingValueForBlock(const BasicBlock *BB, Value *V)
Set every incoming value(s) for block BB to V.
Value * getIncomingValueForBlock(const BasicBlock *BB) const
unsigned getNumIncomingValues() const
Return the number of incoming edges.
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
LLVM_ABI const SCEVPredicate & getPredicate() const
LLVM_ABI unsigned getSmallConstantMaxTripCount()
Returns the upper bound of the loop trip count as a normal unsigned value, or 0 if the trip count is ...
LLVM_ABI const SCEV * getBackedgeTakenCount()
Get the (predicated) backedge count for the analyzed loop.
LLVM_ABI const SCEV * getSCEV(Value *V)
Returns the SCEV expression of V, in the context of the current SCEV predicate.
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
Definition Analysis.h:151
PreservedAnalyses & preserve()
Mark an analysis as preserved.
Definition Analysis.h:132
An analysis pass based on the new PM to deliver ProfileSummaryInfo.
Analysis providing profile information.
The RecurrenceDescriptor is used to identify recurrences variables in a loop.
static bool isFMulAddIntrinsic(Instruction *I)
Returns true if the instruction is a call to the llvm.fmuladd intrinsic.
FastMathFlags getFastMathFlags() const
Instruction * getLoopExitInstr() const
static LLVM_ABI unsigned getOpcode(RecurKind Kind)
Returns the opcode corresponding to the RecurrenceKind.
Type * getRecurrenceType() const
Returns the type of the recurrence.
const SmallPtrSet< Instruction *, 8 > & getCastInsts() const
Returns a reference to the instructions used for type-promoting the recurrence.
unsigned getMinWidthCastToRecurrenceTypeInBits() const
Returns the minimum width used by the recurrence in bits.
TrackingVH< Value > getRecurrenceStartValue() const
LLVM_ABI SmallVector< Instruction *, 4 > getReductionOpChain(PHINode *Phi, Loop *L) const
Attempts to find a chain of operations from Phi to LoopExitInst that can be treated as a set of reduc...
static bool isAnyOfRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
bool isSigned() const
Returns true if all source operands of the recurrence are SExtInsts.
RecurKind getRecurrenceKind() const
bool isOrdered() const
Expose an ordered FP reduction to the instance users.
static LLVM_ABI bool isFloatingPointRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is a floating point kind.
static bool isFindIVRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
Value * getSentinelValue() const
Returns the sentinel value for FindFirstIV & FindLastIV recurrences to replace the start value.
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
std::optional< ArrayRef< PointerDiffInfo > > getDiffChecks() const
const SmallVectorImpl< RuntimePointerCheck > & getChecks() const
Returns the checks that generateChecks created.
This class uses information about analyze scalars to rewrite expressions in canonical form.
ScalarEvolution * getSE()
bool isInsertedInstruction(Instruction *I) const
Return true if the specified instruction was inserted by the code rewriter.
LLVM_ABI Value * expandCodeForPredicate(const SCEVPredicate *Pred, Instruction *Loc)
Generates a code sequence that evaluates this predicate.
void eraseDeadInstructions(Value *Root)
Remove inserted instructions that are dead, e.g.
virtual bool isAlwaysTrue() const =0
Returns true if the predicate is always true.
This class represents an analyzed expression in the program.
LLVM_ABI bool isZero() const
Return true if the expression is a constant zero.
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
LLVM_ABI const SCEV * getURemExpr(const SCEV *LHS, const SCEV *RHS)
Represents an unsigned remainder expression based on unsigned division.
LLVM_ABI const SCEV * getBackedgeTakenCount(const Loop *L, ExitCountKind Kind=Exact)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCould...
LLVM_ABI const SCEV * getConstant(ConstantInt *V)
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI const SCEV * getTripCountFromExitCount(const SCEV *ExitCount)
A version of getTripCountFromExitCount below which always picks an evaluation type which can not resu...
const SCEV * getOne(Type *Ty)
Return a SCEV for the constant 1 of a specific type.
LLVM_ABI void forgetLoop(const Loop *L)
This method should be called by the client when it has changed a loop in a way that may affect Scalar...
LLVM_ABI bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
LLVM_ABI bool isSCEVable(Type *Ty) const
Test if values of the given type are analyzable within the SCEV framework.
LLVM_ABI const SCEV * getElementCount(Type *Ty, ElementCount EC, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
LLVM_ABI void forgetValue(Value *V)
This method should be called by the client when it has changed a value in a way that may affect its v...
LLVM_ABI void forgetBlockAndLoopDispositions(Value *V=nullptr)
Called when the client has changed the disposition of values in a loop or block.
const SCEV * getMinusOne(Type *Ty)
Return a SCEV for the constant -1 of a specific type.
LLVM_ABI void forgetLcssaPhiWithNewPredecessor(Loop *L, PHINode *V)
Forget LCSSA phi node V of loop L to which a new predecessor was added, such that it may no longer be...
LLVM_ABI unsigned getSmallConstantTripCount(const Loop *L)
Returns the exact trip count of the loop if we can compute it, and the result is a small constant.
APInt getUnsignedRangeMax(const SCEV *S)
Determine the max of the unsigned range for a particular SCEV.
LLVM_ABI const SCEV * applyLoopGuards(const SCEV *Expr, const Loop *L)
Try to apply information from loop guards for L to Expr.
LLVM_ABI const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
LLVM_ABI const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
LLVM_ABI bool isKnownPredicate(CmpPredicate Pred, const SCEV *LHS, const SCEV *RHS)
Test if the given expression is known to satisfy the condition described by Pred, LHS,...
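Several of these ScalarEvolution queries combine naturally when the planner reasons about how many scalar iterations remain after the vector loop. A hedged sketch, assuming the backedge-taken count is computable; SE, L, VF and UF stand in for the analysis, the loop and the chosen vectorization/unroll factors:
// Sketch: iterations left for the scalar epilogue, i.e. TC mod (VF * UF).
static const SCEV *remainingIterationsSketch(ScalarEvolution &SE, const Loop *L,
                                             ElementCount VF, unsigned UF) {
  const SCEV *BTC = SE.getBackedgeTakenCount(L);
  const SCEV *TC = SE.getTripCountFromExitCount(BTC);
  const SCEV *VFxUF =
      SE.getElementCount(TC->getType(), VF.multiplyCoefficientBy(UF));
  return SE.getURemExpr(TC, VFxUF);
}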
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
Definition SetVector.h:59
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:102
void insert_range(Range &&R)
Definition SetVector.h:175
size_type count(const key_type &key) const
Count the number of elements of a given key in the SetVector.
Definition SetVector.h:261
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:150
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
A SetVector that performs no allocations if smaller than a certain size.
Definition SetVector.h:338
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
reference emplace_back(ArgTypes &&... Args)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
LLVM_ABI std::optional< unsigned > getVScaleForTuning() const
LLVM_ABI InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}) const
Estimate the overhead of scalarizing an instruction.
LLVM_ABI bool supportsEfficientVectorElementLoadStore() const
If target has efficient vector element load/store instructions, it can return true here so that inser...
LLVM_ABI bool prefersVectorizedAddressing() const
Return true if target doesn't mind addresses in vectors.
LLVM_ABI TypeSize getRegisterBitWidth(RegisterKind K) const
LLVM_ABI bool preferFixedOverScalableIfEqualCost(bool IsEpilogue) const
LLVM_ABI InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueInfo OpdInfo={OK_AnyValue, OP_None}, const Instruction *I=nullptr) const
LLVM_ABI InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, bool UseMaskForCond=false, bool UseMaskForGaps=false) const
LLVM_ABI InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const
static LLVM_ABI PartialReductionExtendKind getPartialReductionExtendKind(Instruction *I)
Get the kind of extension that an instruction represents.
static LLVM_ABI OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
LLVM_ABI bool isElementTypeLegalForScalableVector(Type *Ty) const
LLVM_ABI ElementCount getMinimumVF(unsigned ElemWidth, bool IsScalable) const
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
@ TCK_Latency
The latency of instruction.
LLVM_ABI InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
LLVM_ABI InstructionCost getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, const SCEV *Ptr, TTI::TargetCostKind CostKind) const
LLVM_ABI InstructionCost getPartialReductionCost(unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType, ElementCount VF, PartialReductionExtendKind OpAExtend, PartialReductionExtendKind OpBExtend, std::optional< unsigned > BinOp, TTI::TargetCostKind CostKind) const
LLVM_ABI InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, const Instruction *I=nullptr) const
LLVM_ABI bool supportsScalableVectors() const
@ TCC_Free
Expected to fold away in lowering.
LLVM_ABI InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TargetCostKind CostKind) const
Estimate the cost of a given IR user when lowered.
LLVM_ABI InstructionCost getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index) const
LLVM_ABI InstructionCost getOperandsScalarizationOverhead(ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind) const
Estimate the overhead of scalarizing operands with the given types.
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_Reverse
Reverse the order of the vector.
LLVM_ABI InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
CastContextHint
Represents a hint about the context in which a cast is used.
@ Reversed
The cast is used with a reversed load/store.
@ Masked
The cast is used with a masked load/store.
@ None
The cast is not used with a load/store of any kind.
@ Normal
The cast is used with a normal load/store.
@ Interleave
The cast is used with an interleaved load/store.
@ GatherScatter
The cast is used with a gather/scatter.
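As a rough illustration of the TargetTransformInfo cost queries listed above, the following sketch asks for the reciprocal-throughput cost of widening a store; widenedStoreCost is a hypothetical name, and the alignment and address space are simply taken from the scalar store.
  #include "llvm/Analysis/TargetTransformInfo.h"
  #include "llvm/IR/Instructions.h"
  using namespace llvm;

  // Hypothetical helper: cost of storing VecTy with the scalar store's
  // alignment and address space, using the TCK_RecipThroughput cost kind.
  InstructionCost widenedStoreCost(const TargetTransformInfo &TTI,
                                   StoreInst *SI, VectorType *VecTy) {
    return TTI.getMemoryOpCost(Instruction::Store, VecTy, SI->getAlign(),
                               SI->getPointerAddressSpace(),
                               TargetTransformInfo::TCK_RecipThroughput);
  }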
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
This class implements a switch-like dispatch statement for a value of 'T' using dyn_cast functionalit...
Definition TypeSwitch.h:88
TypeSwitch< T, ResultT > & Case(CallableT &&caseFn)
Add a case on the given type.
Definition TypeSwitch.h:97
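The TypeSwitch entries above replace manual dyn_cast chains; a minimal sketch (classifyMemOp is a hypothetical name) might look like:
  #include "llvm/ADT/StringRef.h"
  #include "llvm/ADT/TypeSwitch.h"
  #include "llvm/IR/Instructions.h"
  using namespace llvm;

  // Hypothetical helper: dispatch on the dynamic type of an instruction.
  StringRef classifyMemOp(Instruction *I) {
    return TypeSwitch<Instruction *, StringRef>(I)
        .Case<LoadInst>([](LoadInst *) { return "load"; })
        .Case<StoreInst>([](StoreInst *) { return "store"; })
        .Default([](Instruction *) { return "other"; });
  }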
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
LLVM_ABI unsigned getIntegerBitWidth() const
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:273
static LLVM_ABI Type * getVoidTy(LLVMContext &C)
Definition Type.cpp:281
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:352
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:198
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:128
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:231
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:294
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:184
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:139
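Tying the Type queries above together, here is a small sketch (elementBitWidth is a hypothetical helper) that looks through vector types to the element's bit width:
  #include "llvm/IR/Type.h"
  using namespace llvm;

  // Hypothetical helper: element bit width of Ty, looking through vectors.
  unsigned elementBitWidth(Type *Ty) {
    Type *Scalar = Ty->getScalarType(); // element type, or Ty itself
    if (Scalar->isIntegerTy() || Scalar->isFloatingPointTy())
      return Scalar->getScalarSizeInBits();
    return 0; // pointers, void, etc. are not handled in this sketch
  }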
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
op_range operands()
Definition User.h:292
LLVM_ABI bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
Definition User.cpp:24
Value * getOperand(unsigned i) const
Definition User.h:232
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
Definition VectorUtils.h:74
VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph.
Definition VPlan.h:3824
void appendRecipe(VPRecipeBase *Recipe)
Augment the existing recipes of a VPBasicBlock with an additional Recipe as the last recipe.
Definition VPlan.h:3899
RecipeListTy::iterator iterator
Instruction iterators...
Definition VPlan.h:3851
iterator end()
Definition VPlan.h:3861
iterator begin()
Recipe iterator methods.
Definition VPlan.h:3859
iterator_range< iterator > phis()
Returns an iterator range over the PHI-like recipes in the block.
Definition VPlan.h:3912
iterator getFirstNonPhi()
Return the position of the first non-phi node recipe in the block.
Definition VPlan.cpp:216
VPRegionBlock * getEnclosingLoopRegion()
Definition VPlan.cpp:578
VPRecipeBase * getTerminator()
If the block has multiple successors, return the branch recipe terminating the block.
Definition VPlan.cpp:623
void insert(VPRecipeBase *Recipe, iterator InsertPt)
Definition VPlan.h:3890
bool empty() const
Definition VPlan.h:3870
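A minimal sketch over the VPBasicBlock interface listed above, walking the phi-like recipes and appending a new recipe at the end; the helper name is hypothetical and the include assumes the vectorizer's local VPlan.h header.
  #include "VPlan.h" // local vectorizer header (assumption)
  using namespace llvm;

  // Hypothetical helper: count phi-like recipes, then append NewR so that it
  // becomes the last recipe of VPBB.
  unsigned countPhisAndAppend(VPBasicBlock *VPBB, VPRecipeBase *NewR) {
    unsigned NumPhis = 0;
    for (VPRecipeBase &R : VPBB->phis()) {
      (void)R;
      ++NumPhis;
    }
    VPBB->appendRecipe(NewR);
    return NumPhis;
  }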
VPBlockBase is the building block of the Hierarchical Control-Flow Graph.
Definition VPlan.h:80
VPRegionBlock * getParent()
Definition VPlan.h:172
const VPBasicBlock * getExitingBasicBlock() const
Definition VPlan.cpp:186
void setName(const Twine &newName)
Definition VPlan.h:165
size_t getNumSuccessors() const
Definition VPlan.h:218
void swapSuccessors()
Swap successors of the block. The block must have exactly 2 successors.
Definition VPlan.h:321
size_t getNumPredecessors() const
Definition VPlan.h:219
VPlan * getPlan()
Definition VPlan.cpp:161
const VPBasicBlock * getEntryBasicBlock() const
Definition VPlan.cpp:166
VPBlockBase * getSingleSuccessor() const
Definition VPlan.h:208
const VPBlocksTy & getSuccessors() const
Definition VPlan.h:197
static auto blocksOnly(const T &Range)
Return an iterator range over Range which only includes BlockTy blocks.
Definition VPlanUtils.h:187
static void insertOnEdge(VPBlockBase *From, VPBlockBase *To, VPBlockBase *BlockPtr)
Inserts BlockPtr on the edge between From and To.
Definition VPlanUtils.h:208
static void connectBlocks(VPBlockBase *From, VPBlockBase *To, unsigned PredIdx=-1u, unsigned SuccIdx=-1u)
Connect VPBlockBases From and To bi-directionally.
Definition VPlanUtils.h:146
static void reassociateBlocks(VPBlockBase *Old, VPBlockBase *New)
Reassociate all the blocks connected to Old so that they now point to New.
Definition VPlanUtils.h:173
VPlan-based builder utility analogous to IRBuilder.
VPPhi * createScalarPhi(ArrayRef< VPValue * > IncomingValues, DebugLoc DL, const Twine &Name="")
VPInstruction * createNaryOp(unsigned Opcode, ArrayRef< VPValue * > Operands, Instruction *Inst=nullptr, const Twine &Name="")
Create an N-ary operation with Opcode, Operands and set Inst as its underlying Instruction.
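Based only on the createNaryOp signature listed above, a heavily hedged sketch of emitting a new VPInstruction at the builder's insertion point; emitVPAdd is a hypothetical name and the exact header providing VPBuilder is an assumption.
  #include "VPlan.h" // local vectorizer headers; VPBuilder location assumed
  #include "llvm/IR/Instruction.h"
  using namespace llvm;

  // Hypothetical helper: create A + B as a VPInstruction using VPBuilder.
  VPInstruction *emitVPAdd(VPBuilder &Builder, VPValue *A, VPValue *B) {
    return Builder.createNaryOp(Instruction::Add, {A, B});
  }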
Canonical scalar induction phi of the vector loop.
Definition VPlan.h:3480
unsigned getNumDefinedValues() const
Returns the number of values defined by the VPDef.
Definition VPlanValue.h:424
VPValue * getVPSingleValue()
Returns the only VPValue defined by the VPDef.
Definition VPlanValue.h:397
void execute(VPTransformState &State) override
Generate the transformed value of the induction at offset StartValue (1.
VPValue * getStepValue() const
Definition VPlan.h:3701
VPValue * getStartValue() const
Definition VPlan.h:3700
A pure virtual base class for all recipes modeling header phis, including phis for first order recurr...
Definition VPlan.h:1987
virtual VPValue * getBackedgeValue()
Returns the incoming value from the loop backedge.
Definition VPlan.h:2035
VPValue * getStartValue()
Returns the start value of the phi, if one is set.
Definition VPlan.h:2024
A special type of VPBasicBlock that wraps an existing IR basic block.
Definition VPlan.h:3977
Helper to manage IR metadata for recipes.
Definition VPlan.h:938
This is a concrete Recipe that models a single VPlan-level instruction.
Definition VPlan.h:981
@ ComputeAnyOfResult
Compute the final result of an AnyOf reduction with select(cmp(),x,y), where one of (x,...
Definition VPlan.h:1019
@ ResumeForEpilogue
Explicit user for the resume phi of the canonical induction in the main VPlan, used by the epilogue v...
Definition VPlan.h:1068
@ FirstOrderRecurrenceSplice
Definition VPlan.h:987
@ ReductionStartVector
Start vector for reductions with 3 operands: the original start value, the identity value for the red...
Definition VPlan.h:1059
unsigned getOpcode() const
Definition VPlan.h:1125
VPInterleaveRecipe is a recipe for transforming an interleave group of load or stores into one wide l...
Definition VPlan.h:2585
In what follows, the term "input IR" refers to code that is fed into the vectorizer whereas the term ...
A recipe for forming partial reductions.
Definition VPlan.h:2772
detail::zippy< llvm::detail::zip_first, VPUser::const_operand_range, const_incoming_blocks_range > incoming_values_and_blocks() const
Returns an iterator range over pairs of incoming values and corresponding incoming blocks.
Definition VPlan.h:1302
VPRecipeBase is a base class modeling a sequence of one or more output IR instructions.
Definition VPlan.h:386
VPBasicBlock * getParent()
Definition VPlan.h:407
DebugLoc getDebugLoc() const
Returns the debug location of the recipe.
Definition VPlan.h:478
void moveBefore(VPBasicBlock &BB, iplist< VPRecipeBase >::iterator I)
Unlink this recipe and insert into BB before I.
void insertBefore(VPRecipeBase *InsertPos)
Insert an unlinked recipe into a basic block immediately before the specified recipe.
iplist< VPRecipeBase >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
VPRecipeBase * tryToCreateWidenRecipe(VPSingleDefRecipe *R, VFRange &Range)
Create and return a widened recipe for R if one can be created within the given VF Range.
VPValue * getBlockInMask(VPBasicBlock *VPBB) const
Returns the entry mask for block VPBB or null if the mask is all-true.
VPRecipeBase * tryToCreatePartialReduction(VPInstruction *Reduction, unsigned ScaleFactor)
Create and return a partial reduction recipe for a reduction instruction along with binary operation ...
std::optional< unsigned > getScalingForReduction(const Instruction *ExitInst)
void collectScaledReductions(VFRange &Range)
Find all possible partial reductions in the loop and track all of those that are valid so recipes can...
VPReplicateRecipe * handleReplication(VPInstruction *VPI, VFRange &Range)
Build a VPReplicateRecipe for VPI.
A recipe for handling reduction phis.
Definition VPlan.h:2340
bool isInLoop() const
Returns true, if the phi is part of an in-loop reduction.
Definition VPlan.h:2400
RecurKind getRecurrenceKind() const
Returns the recurrence kind of the reduction.
Definition VPlan.h:2394
VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks which form a Single-Entry-S...
Definition VPlan.h:4012
const VPBlockBase * getEntry() const
Definition VPlan.h:4048
VPCanonicalIVPHIRecipe * getCanonicalIV()
Returns the canonical induction recipe of the region.
Definition VPlan.h:4110
VPReplicateRecipe replicates a given instruction producing multiple scalar copies of the original sca...
Definition VPlan.h:2878
VPSingleDef is a base class for recipes for modeling a sequence of one or more output IR that define ...
Definition VPlan.h:517
Instruction * getUnderlyingInstr()
Returns the underlying instruction.
Definition VPlan.h:582
An analysis for type-inference for VPValues.
Type * inferScalarType(const VPValue *V)
Infer the type of V. Returns the scalar type of V.
This class augments VPValue with operands which provide the inverse def-use edges from VPValue's user...
Definition VPlanValue.h:199
operand_range operands()
Definition VPlanValue.h:267
void setOperand(unsigned I, VPValue *New)
Definition VPlanValue.h:243
unsigned getNumOperands() const
Definition VPlanValue.h:237
operand_iterator op_begin()
Definition VPlanValue.h:263
VPValue * getOperand(unsigned N) const
Definition VPlanValue.h:238
void addOperand(VPValue *Operand)
Definition VPlanValue.h:232
This is the base class of the VPlan Def/Use graph, used for modeling the data flow into,...
Definition VPlanValue.h:48
VPRecipeBase * getDefiningRecipe()
Returns the recipe defining this VPValue or nullptr if it is not defined by a recipe,...
Definition VPlan.cpp:131
Value * getLiveInIRValue() const
Returns the underlying IR value, if this VPValue is defined outside the scope of VPlan.
Definition VPlanValue.h:176
Value * getUnderlyingValue() const
Return the underlying Value attached to this VPValue.
Definition VPlanValue.h:85
void replaceAllUsesWith(VPValue *New)
Definition VPlan.cpp:1378
user_iterator user_begin()
Definition VPlanValue.h:130
unsigned getNumUsers() const
Definition VPlanValue.h:113
void replaceUsesWithIf(VPValue *New, llvm::function_ref< bool(VPUser &U, unsigned Idx)> ShouldReplace)
Go through the uses list for this VPValue and make each use point to New if the callback ShouldReplac...
Definition VPlan.cpp:1382
user_range users()
Definition VPlanValue.h:134
A recipe to compute a pointer to the last element of each part of a widened memory access for widened...
Definition VPlan.h:1851
VPWidenCastRecipe is a recipe to create vector cast instructions.
Definition VPlan.h:1494
A recipe for handling GEP instructions.
Definition VPlan.h:1779
VPValue * getStepValue()
Returns the step value of the induction.
Definition VPlan.h:2080
A recipe for handling phi nodes of integer and floating-point inductions, producing their vector valu...
Definition VPlan.h:2127
A common base class for widening memory operations.
Definition VPlan.h:3184
A recipe for widened phis.
Definition VPlan.h:2263
VPWidenRecipe is a recipe for producing a widened instruction using the opcode and operands of the re...
Definition VPlan.h:1451
VPlan models a candidate for vectorization, encoding various decisions taken to produce efficient outp...
Definition VPlan.h:4142
bool hasVF(ElementCount VF) const
Definition VPlan.h:4348
LLVMContext & getContext() const
Definition VPlan.h:4336
VPBasicBlock * getEntry()
Definition VPlan.h:4236
VPValue & getVectorTripCount()
The vector trip count.
Definition VPlan.h:4327
VPValue & getVFxUF()
Returns VF * UF of the vector loop region.
Definition VPlan.h:4334
VPValue & getVF()
Returns the VF of the vector loop region.
Definition VPlan.h:4330
VPValue * getTripCount() const
The trip count of the original loop.
Definition VPlan.h:4298
iterator_range< SmallSetVector< ElementCount, 2 >::iterator > vectorFactors() const
Returns an iterator range over all VFs of the plan.
Definition VPlan.h:4355
bool hasUF(unsigned UF) const
Definition VPlan.h:4366
ArrayRef< VPIRBasicBlock * > getExitBlocks() const
Return an ArrayRef containing VPIRBasicBlocks wrapping the exit blocks of the original scalar loop.
Definition VPlan.h:4288
VPValue * getConstantInt(Type *Ty, uint64_t Val, bool IsSigned=false)
Return a VPValue wrapping a ConstantInt with the given type and value.
Definition VPlan.h:4411
LLVM_ABI_FOR_TEST VPRegionBlock * getVectorLoopRegion()
Returns the VPRegionBlock of the vector loop.
Definition VPlan.cpp:1012
bool hasEarlyExit() const
Returns true if the VPlan is based on a loop with an early exit.
Definition VPlan.h:4504
InstructionCost cost(ElementCount VF, VPCostContext &Ctx)
Return the cost of this plan.
Definition VPlan.cpp:994
void resetTripCount(VPValue *NewTripCount)
Resets the trip count for the VPlan.
Definition VPlan.h:4312
VPBasicBlock * getMiddleBlock()
Returns the 'middle' block of the plan, that is the block that selects whether to execute the scalar ...
Definition VPlan.h:4261
VPValue * getOrAddLiveIn(Value *V)
Gets the live-in VPValue for V or adds a new live-in (if none exists yet) for V.
Definition VPlan.h:4390
VPBasicBlock * getScalarPreheader() const
Return the VPBasicBlock for the preheader of the scalar loop.
Definition VPlan.h:4279
void execute(VPTransformState *State)
Generate the IR code for this VPlan.
Definition VPlan.cpp:906
VPIRBasicBlock * getScalarHeader() const
Return the VPIRBasicBlock wrapping the header of the scalar loop.
Definition VPlan.h:4284
VPValue * getLiveIn(Value *V) const
Return the live-in VPValue for V, if there is one or nullptr otherwise.
Definition VPlan.h:4427
VPBasicBlock * getVectorPreheader()
Returns the preheader of the vector loop region, if one exists, or null otherwise.
Definition VPlan.h:4241
VPlan * duplicate()
Clone the current VPlan, update all VPValues of the new VPlan and cloned recipes to refer to the clon...
Definition VPlan.cpp:1154
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
LLVM_ABI bool hasOneUser() const
Return true if there is exactly one user of this value.
Definition Value.cpp:166
LLVM_ABI void setName(const Twine &Name)
Change the name of the value.
Definition Value.cpp:390
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:546
iterator_range< user_iterator > users()
Definition Value.h:426
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
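The Value entries above are commonly combined when rewriting IR; a minimal sketch (replaceAndErase is a hypothetical helper, and it assumes From has no side effects worth keeping):
  #include "llvm/IR/Instruction.h"
  #include "llvm/IR/Value.h"
  using namespace llvm;

  // Hypothetical helper: redirect every use of From to To, then delete From
  // once it is unused.
  void replaceAndErase(Instruction *From, Value *To) {
    From->replaceAllUsesWith(To);
    if (From->use_empty())
      From->eraseFromParent();
  }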
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct a VectorType.
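A short sketch of VectorType::get with ElementCount, building a fixed and a scalable vector of the same element type; makeVectorTypes is a hypothetical name.
  #include "llvm/IR/DerivedTypes.h"
  #include "llvm/Support/TypeSize.h"
  #include <utility>
  using namespace llvm;

  // Hypothetical helper: build <4 x ElemTy> and <vscale x 4 x ElemTy>.
  std::pair<VectorType *, VectorType *> makeVectorTypes(Type *ElemTy) {
    auto *Fixed = VectorType::get(ElemTy, ElementCount::getFixed(4));
    auto *Scalable = VectorType::get(ElemTy, ElementCount::getScalable(4));
    return {Fixed, Scalable};
  }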
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:202
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition DenseSet.h:175
constexpr bool hasKnownScalarFactor(const FixedOrScalableQuantity &RHS) const
Returns true if there exists a value X where RHS.multiplyCoefficientBy(X) will result in a value whos...
Definition TypeSize.h:269
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
static constexpr bool isKnownLE(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition TypeSize.h:230
constexpr bool isNonZero() const
Definition TypeSize.h:155
constexpr ScalarTy getKnownScalarFactor(const FixedOrScalableQuantity &RHS) const
Returns a value X where RHS.multiplyCoefficientBy(X) will result in a value whose quantity matches ou...
Definition TypeSize.h:277
static constexpr bool isKnownLT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition TypeSize.h:216
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:168
constexpr LeafTy multiplyCoefficientBy(ScalarTy RHS) const
Definition TypeSize.h:256
constexpr bool isFixed() const
Returns true if the quantity is not scaled by vscale.
Definition TypeSize.h:171
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
constexpr bool isZero() const
Definition TypeSize.h:153
static constexpr bool isKnownGT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition TypeSize.h:223
constexpr LeafTy divideCoefficientBy(ScalarTy RHS) const
We do not provide the '/' operator here because division for polynomial types does not work in the sa...
Definition TypeSize.h:252
static constexpr bool isKnownGE(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition TypeSize.h:237
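Because fixed and scalable quantities are only partially ordered, comparisons go through the isKnown* predicates listed above rather than plain relational operators; a minimal sketch (clampVF is a hypothetical helper):
  #include "llvm/Support/TypeSize.h"
  using namespace llvm;

  // Hypothetical helper: clamp VF to MaxVF when the relation is provable.
  ElementCount clampVF(ElementCount VF, ElementCount MaxVF) {
    if (ElementCount::isKnownGT(VF, MaxVF))
      return MaxVF;
    return VF;
  }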
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
IteratorT end() const
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition raw_ostream.h:53
A raw_ostream that writes to an std::string.
Changed
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ Entry
Definition COFF.h:862
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ Tail
Attempts to make calls as fast as possible while guaranteeing that tail call optimization can always b...
Definition CallingConv.h:76
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
std::variant< std::monostate, Loc::Single, Loc::Multi, Loc::MMI, Loc::EntryValue > Variant
Alias for the std::variant specialization base class of DbgVariable.
Definition DwarfDebug.h:189
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SpecificConstantMatch, SrcTy, TargetOpcode::G_SUB > m_Neg(const SrcTy &&Src)
Matches a register negated by a G_SUB.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
OneOps_match< OpTy, Instruction::Freeze > m_Freeze(const OpTy &Op)
Matches FreezeInst.
ap_match< APInt > m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
bool match(Val *V, const Pattern &P)
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
SpecificCmpClass_match< LHS, RHS, ICmpInst > m_SpecificICmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
MatchFunctor< Val, Pattern > match_fn(const Pattern &P)
A match functor that can be used as a UnaryPredicate in functional algorithms like all_of.
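The matchers above come from several matcher namespaces (GlobalISel, IR, SCEV, and VPlan pattern matching); the following sketch uses the IR-level llvm::PatternMatch ones, with matchMulByPow2 as a hypothetical helper.
  #include "llvm/ADT/APInt.h"
  #include "llvm/IR/PatternMatch.h"
  #include "llvm/IR/Value.h"

  // Hypothetical helper: recognize "X * <power-of-two constant>" and bind
  // both the variable operand and the constant.
  bool matchMulByPow2(llvm::Value *V, llvm::Value *&X, const llvm::APInt *&C) {
    using namespace llvm::PatternMatch;
    return match(V, m_Mul(m_Value(X), m_APInt(C))) && C->isPowerOf2();
  }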
class_match< const SCEVVScale > m_SCEVVScale()
bind_cst_ty m_scev_APInt(const APInt *&C)
Match an SCEV constant and bind it to an APInt.
specificloop_ty m_SpecificLoop(const Loop *L)
cst_pred_ty< is_specific_signed_cst > m_scev_SpecificSInt(int64_t V)
Match an SCEV constant with a plain signed integer (sign-extended value will be matched)
SCEVAffineAddRec_match< Op0_t, Op1_t, class_match< const Loop > > m_scev_AffineAddRec(const Op0_t &Op0, const Op1_t &Op1)
bind_ty< const SCEVMulExpr > m_scev_Mul(const SCEVMulExpr *&V)
bool match(const SCEV *S, const Pattern &P)
SCEVBinaryExpr_match< SCEVMulExpr, Op0_t, Op1_t, SCEV::FlagAnyWrap, true > m_scev_c_Mul(const Op0_t &Op0, const Op1_t &Op1)
class_match< const SCEV > m_SCEV()
match_combine_or< AllRecipe_match< Instruction::ZExt, Op0_t >, AllRecipe_match< Instruction::SExt, Op0_t > > m_ZExtOrSExt(const Op0_t &Op0)
VPInstruction_match< VPInstruction::ExtractLastElement, Op0_t > m_ExtractLastElement(const Op0_t &Op0)
class_match< VPValue > m_VPValue()
Match an arbitrary VPValue and ignore it.
ValuesClass values(OptsTy... Options)
Helper to build a ValuesClass by forwarding a variable number of arguments as an initializer list to ...
initializer< Ty > init(const Ty &Val)
Add a small namespace to avoid name clashes with the classes used in the streaming interface.
DiagnosticInfoOptimizationBase::Argument NV
NodeAddr< InstrNode * > Instr
Definition RDFGraph.h:389
NodeAddr< PhiNode * > Phi
Definition RDFGraph.h:390
friend class Instruction
Iterator for Instructions in a BasicBlock.
Definition BasicBlock.h:73
bool isSingleScalar(const VPValue *VPV)
Returns true if VPV is a single scalar, either because it produces the same value for all lanes or on...
VPValue * getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr)
Get or create a VPValue that corresponds to the expansion of Expr.
VPBasicBlock * getFirstLoopHeader(VPlan &Plan, VPDominatorTree &VPDT)
Returns the header block of the first, top-level loop, or null if none exist.
unsigned getVFScaleFactor(VPRecipeBase *R)
Get the VF scaling factor applied to the recipe's output, if the recipe has one.
const SCEV * getSCEVExprForVPValue(const VPValue *V, ScalarEvolution &SE, const Loop *L=nullptr)
Return the SCEV expression for V.
This is an optimization pass for GlobalISel generic memory operations.
LLVM_ABI bool simplifyLoop(Loop *L, DominatorTree *DT, LoopInfo *LI, ScalarEvolution *SE, AssumptionCache *AC, MemorySSAUpdater *MSSAU, bool PreserveLCSSA)
Simplify each loop in a loop nest recursively.
LLVM_ABI void ReplaceInstWithInst(BasicBlock *BB, BasicBlock::iterator &BI, Instruction *I)
Replace the instruction specified by BI with the instruction specified by I.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
@ Offset
Definition DWP.cpp:477
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
Definition STLExtras.h:829
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
LLVM_ABI Value * addRuntimeChecks(Instruction *Loc, Loop *TheLoop, const SmallVectorImpl< RuntimePointerCheck > &PointerChecks, SCEVExpander &Expander, bool HoistRuntimeChecks=false)
Add code that checks at runtime if the accessed arrays in PointerChecks overlap.
auto cast_if_present(const Y &Val)
cast_if_present<X> - Functionally identical to cast, except that a null value is accepted.
Definition Casting.h:683
LLVM_ABI bool RemoveRedundantDbgInstrs(BasicBlock *BB)
Try to remove redundant dbg.value instructions from given basic block.
cl::opt< bool > VerifyEachVPlan
LLVM_ABI std::optional< unsigned > getLoopEstimatedTripCount(Loop *L, unsigned *EstimatedLoopInvocationWeight=nullptr)
Return either:
static void reportVectorization(OptimizationRemarkEmitter *ORE, Loop *TheLoop, VectorizationFactor VF, unsigned IC)
Report successful vectorization of the loop.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1725
unsigned getLoadStoreAddressSpace(const Value *I)
A helper function that returns the address space of the pointer operand of load or store instruction.
LLVM_ABI Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1655
LLVM_ABI_FOR_TEST bool verifyVPlanIsValid(const VPlan &Plan, bool VerifyLate=false)
Verify invariants for general VPlans.
LLVM_ABI Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
InstructionCost Cost
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2472
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
LLVM_ABI bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
OuterAnalysisManagerProxy< ModuleAnalysisManager, Function > ModuleAnalysisManagerFunctionProxy
Provide the ModuleAnalysisManager to Function proxy.
Value * getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF)
Return the runtime value for VF.
LLVM_ABI bool formLCSSARecursively(Loop &L, const DominatorTree &DT, const LoopInfo *LI, ScalarEvolution *SE)
Put a loop nest into LCSSA form.
Definition LCSSA.cpp:449
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2136
LLVM_ABI bool shouldOptimizeForSize(const MachineFunction *MF, ProfileSummaryInfo *PSI, const MachineBlockFrequencyInfo *BFI, PGSOQueryType QueryType=PGSOQueryType::Other)
Returns true if machine function MF is suggested to be size-optimized based on the profile.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:632
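make_early_inc_range is the idiomatic way to mutate a range while walking it; a minimal sketch (dropTriviallyDead is a hypothetical helper) combines it with wouldInstructionBeTriviallyDead, listed further down, to erase dead instructions in place.
  #include "llvm/ADT/STLExtras.h"
  #include "llvm/IR/BasicBlock.h"
  #include "llvm/IR/Instruction.h"
  #include "llvm/Transforms/Utils/Local.h"
  using namespace llvm;

  // Hypothetical helper: erase trivially dead instructions while iterating.
  // The iterator is advanced before the body runs, so erasing is safe.
  void dropTriviallyDead(BasicBlock &BB) {
    for (Instruction &I : make_early_inc_range(BB))
      if (I.use_empty() && wouldInstructionBeTriviallyDead(&I))
        I.eraseFromParent();
  }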
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:284
Align getLoadStoreAlignment(const Value *I)
A helper function that returns the alignment of load or store instruction.
iterator_range< df_iterator< VPBlockShallowTraversalWrapper< VPBlockBase * > > > vp_depth_first_shallow(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order.
Definition VPlanCFG.h:216
LLVM_ABI bool VerifySCEV
LLVM_ABI bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr, bool UseVariableInfo=true, bool IgnoreUBImplyingAttrs=true)
Return true if the instruction does not have any effects besides calculating the result and does not ...
bool isa_and_nonnull(const Y &Val)
Definition Casting.h:676
iterator_range< df_iterator< VPBlockDeepTraversalWrapper< VPBlockBase * > > > vp_depth_first_deep(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order while traversing t...
Definition VPlanCFG.h:243
SmallVector< VPRegisterUsage, 8 > calculateRegisterUsageForPlan(VPlan &Plan, ArrayRef< ElementCount > VFs, const TargetTransformInfo &TTI, const SmallPtrSetImpl< const Value * > &ValuesToIgnore)
Estimate the register usage for Plan and vectorization factors in VFs by calculating the highest numb...
unsigned Log2_64(uint64_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:337
LLVM_ABI void setBranchWeights(Instruction &I, ArrayRef< uint32_t > Weights, bool IsExpected, bool ElideAllZero=false)
Create a new branch_weights metadata node and add or overwrite a prof metadata reference to instructi...
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1732
void collectEphemeralRecipesForVPlan(VPlan &Plan, DenseSet< VPRecipeBase * > &EphRecipes)
auto reverse(ContainerTy &&C)
Definition STLExtras.h:406
bool containsIrreducibleCFG(RPOTraversalT &RPOTraversal, const LoopInfoT &LI)
Return true if the control flow in RPOTraversal is irreducible.
Definition CFG.h:149
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1622
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
LLVM_ABI cl::opt< bool > EnableLoopVectorization
LLVM_ABI bool wouldInstructionBeTriviallyDead(const Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction would have no side effects if it was not used.
Definition Local.cpp:421
FunctionAddr VTableAddr Count
Definition InstrProf.h:139
SmallVector< ValueTypeFromRangeType< R >, Size > to_vector(R &&Range)
Given a range of type R, iterate the entire range and return a SmallVector with elements of the vecto...
Type * toVectorizedTy(Type *Ty, ElementCount EC)
A helper for converting to vectorized types.
LLVM_ABI void llvm_unreachable_internal(const char *msg=nullptr, const char *file=nullptr, unsigned line=0)
This function calls abort(), and prints the optional message to stderr.
bool canConstantBeExtended(const APInt *C, Type *NarrowType, TTI::PartialReductionExtendKind ExtKind)
Check if a constant CI can be safely treated as having been extended from a narrower type with the gi...
Definition VPlan.cpp:1719
T * find_singleton(R &&Range, Predicate P, bool AllowRepeats=false)
Return the single value in Range that satisfies P(<member of Range> *, AllowRepeats)->T * returning n...
Definition STLExtras.h:1787
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
cl::opt< unsigned > ForceTargetInstructionCost
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition Format.h:129
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
bool canVectorizeTy(Type *Ty)
Returns true if Ty is a valid vector element type, void, or an unpacked literal struct where all elem...
TargetTransformInfo TTI
static void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag, OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I=nullptr, DebugLoc DL={})
Reports an informative message: print Msg for debugging purposes as well as an optimization remark.
LLVM_ABI bool isAssignmentTrackingEnabled(const Module &M)
Return true if assignment tracking is enabled for module M.
RecurKind
These are the kinds of recurrences that we support.
@ Or
Bitwise or logical OR of integers.
@ FMulAdd
Sum of float products with llvm.fmuladd(a * b + sum).
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
LLVM_ABI Value * getRecurrenceIdentity(RecurKind K, Type *Tp, FastMathFlags FMF)
Given information about a recurrence kind, return the identity for the @llvm.vector....
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
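A small worked use of the alignment and power-of-two helpers above; paddedSize is a hypothetical name.
  #include "llvm/Support/Alignment.h"
  #include "llvm/Support/MathExtras.h"
  #include <cassert>
  #include <cstdint>
  using namespace llvm;

  // Hypothetical helper: round a byte count up to a power-of-two alignment.
  uint64_t paddedSize(uint64_t Bytes, uint64_t AlignBytes) {
    assert(isPowerOf2_64(AlignBytes) && "alignment must be a power of two");
    return alignTo(Bytes, Align(AlignBytes));
  }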
LLVM_ABI void reportVectorizationFailure(const StringRef DebugMsg, const StringRef OREMsg, const StringRef ORETag, OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I=nullptr)
Reports a vectorization failure: print DebugMsg for debugging purposes along with the corresponding o...
DWARFExpression::Operation Op
ScalarEpilogueLowering
@ CM_ScalarEpilogueNotAllowedLowTripLoop
@ CM_ScalarEpilogueNotNeededUsePredicate
@ CM_ScalarEpilogueNotAllowedOptSize
@ CM_ScalarEpilogueAllowed
@ CM_ScalarEpilogueNotAllowedUsePredicate
LLVM_ABI bool isGuaranteedNotToBeUndefOrPoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Return true if this function can prove that V does not have undef bits and is never poison.
ArrayRef(const T &OneElt) -> ArrayRef< T >
Value * createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF, int64_t Step)
Return a value for Step multiplied by VF.
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
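Contrasting the casting utilities above: isa<> only tests, dyn_cast<> returns null on mismatch, and cast<> asserts validity. A minimal sketch with a hypothetical helper:
  #include "llvm/IR/Instructions.h"
  #include "llvm/Support/Casting.h"
  using namespace llvm;

  // Hypothetical helper: return the stored value if I is a store, else null.
  Value *storedValueOrNull(Instruction *I) {
    if (auto *SI = dyn_cast<StoreInst>(I)) // null if I is not a StoreInst
      return SI->getValueOperand();
    return nullptr;
  }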
LLVM_ABI BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="", bool Before=false)
Split the specified block at the specified instruction.
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1758
auto predecessors(const MachineBasicBlock *BB)
iterator_range< pointer_iterator< WrappedIteratorT > > make_pointer_range(RangeT &&Range)
Definition iterator.h:363
cl::opt< bool > EnableVPlanNativePath
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
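The getLoadStore* helpers (type, pointer operand, alignment, address space) listed in this section are typically used together; a minimal sketch, assuming they are available via llvm/IR/Instructions.h, with a hypothetical MemAccess struct:
  #include "llvm/IR/Instructions.h"
  #include "llvm/Support/Alignment.h"
  using namespace llvm;

  // Hypothetical summary of a load or store built from the helpers above.
  struct MemAccess {
    const Value *Ptr;
    Type *AccessTy;
    Align Alignment;
    unsigned AddrSpace;
  };

  MemAccess describeAccess(const Instruction *I) {
    return {getLoadStorePointerOperand(I), getLoadStoreType(I),
            getLoadStoreAlignment(I), getLoadStoreAddressSpace(I)};
  }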
ArrayRef< Type * > getContainedTypes(Type *const &Ty)
Returns the types contained in Ty.
LLVM_ABI Value * addDiffRuntimeChecks(Instruction *Loc, ArrayRef< PointerDiffInfo > Checks, SCEVExpander &Expander, function_ref< Value *(IRBuilderBase &, unsigned)> GetVF, unsigned IC)
bool pred_empty(const BasicBlock *BB)
Definition CFG.h:119
@ DataAndControlFlowWithoutRuntimeCheck
Use predicate to control both data and control flow, but modify the trip count so that a runtime over...
@ None
Don't use tail folding.
@ DataWithEVL
Use predicated EVL instructions for tail-folding.
@ DataAndControlFlow
Use predicate to control both data and control flow.
@ DataWithoutLaneMask
Same as Data, but avoids using the get.active.lane.mask intrinsic to calculate the mask and instead i...
@ Data
Use predicate only to mask operations on data in the loop.
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
LLVM_ABI bool hasBranchWeightMD(const Instruction &I)
Checks if an instructions has Branch Weight Metadata.
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition Hashing.h:592
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition bit.h:330
Type * toVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
std::unique_ptr< VPlan > VPlanPtr
Definition VPlan.h:76
constexpr detail::IsaCheckPredicate< Types... > IsaPred
Function object wrapper for the llvm::isa type check.
Definition Casting.h:866
LLVM_ABI MapVector< Instruction *, uint64_t > computeMinimumValueSizes(ArrayRef< BasicBlock * > Blocks, DemandedBits &DB, const TargetTransformInfo *TTI=nullptr)
Compute a map of integer instructions to their minimum legal type size.
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition Hashing.h:466
LLVM_ABI cl::opt< bool > EnableLoopInterleaving
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:869
#define N
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
A special type used by analysis passes to provide an address that identifies that particular analysis...
Definition Analysis.h:29
static LLVM_ABI void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
An information struct used to provide DenseMap with the various necessary components for a given valu...
Encapsulate information regarding vectorization of a loop and its epilogue.
EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF, ElementCount EVF, unsigned EUF, VPlan &EpiloguePlan)
A class that represents two vectorization factors (initialized with 0 by default).
static FixedScalableVFPair getNone()
This holds details about a histogram operation – a load -> update -> store sequence where each lane i...
Incoming for lane mask phi as machine instruction, incoming register Reg and incoming block Block are...
TargetLibraryInfo * TLI
LLVM_ABI LoopVectorizeResult runImpl(Function &F)
LLVM_ABI bool processLoop(Loop *L)
ProfileSummaryInfo * PSI
LoopAccessInfoManager * LAIs
LLVM_ABI void printPipeline(raw_ostream &OS, function_ref< StringRef(StringRef)> MapClassName2PassName)
LLVM_ABI LoopVectorizePass(LoopVectorizeOptions Opts={})
BlockFrequencyInfo * BFI
ScalarEvolution * SE
AssumptionCache * AC
LLVM_ABI PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
OptimizationRemarkEmitter * ORE
TargetTransformInfo * TTI
Storage for information about made changes.
A chain of instructions that form a partial reduction.
Instruction * Reduction
The top-level binary operation that forms the reduction to a scalar after the loop body.
Instruction * ExtendA
The extension of each of the inner binary operation's operands.
A CRTP mix-in to automatically provide informational APIs needed for passes.
Definition PassManager.h:70
A marker analysis to determine if extra passes should be run after loop vectorization.
static LLVM_ABI AnalysisKey Key
Holds the VFShape for a specific scalar to vector function mapping.
std::optional< unsigned > getParamIndexForOptionalMask() const
Instruction Set Architecture.
Encapsulates information needed to describe a parameter.
A range of powers-of-2 vectorization factors with fixed start and adjustable end.
ElementCount End
Struct to hold various analysis needed for cost computations.
unsigned getPredBlockCostDivisor(BasicBlock *BB) const
LoopVectorizationCostModel & CM
bool isLegacyUniformAfterVectorization(Instruction *I, ElementCount VF) const
Return true if I is considered uniform-after-vectorization in the legacy cost model for VF.
bool skipCostComputation(Instruction *UI, bool IsVector) const
Return true if the cost for UI shouldn't be computed, e.g.
InstructionCost getLegacyCost(Instruction *UI, ElementCount VF) const
Return the cost for UI with VF using the legacy cost model as fallback until computing the cost of al...
TargetTransformInfo::TargetCostKind CostKind
SmallPtrSet< Instruction *, 8 > SkipCostComputation
A recipe for handling first-order recurrence phis.
Definition VPlan.h:2305
A struct that represents some properties of the register usage of a loop.
VPTransformState holds information passed down when "executing" a VPlan, needed for generating the ou...
A recipe for widening select instructions.
Definition VPlan.h:1737
static void materializeBroadcasts(VPlan &Plan)
Add explicit broadcasts for live-ins and VPValues defined in Plan's entry block if they are used as v...
static void materializePacksAndUnpacks(VPlan &Plan)
Add explicit Build[Struct]Vector recipes to Pack multiple scalar values into vectors and Unpack recip...
static void materializeBackedgeTakenCount(VPlan &Plan, VPBasicBlock *VectorPH)
Materialize the backedge-taken count to be computed explicitly using VPInstructions.
static LLVM_ABI_FOR_TEST std::unique_ptr< VPlan > buildVPlan0(Loop *TheLoop, LoopInfo &LI, Type *InductionTy, DebugLoc IVDL, PredicatedScalarEvolution &PSE)
Create a base VPlan0, serving as the common starting point for all later candidates.
static void optimizeInductionExitUsers(VPlan &Plan, DenseMap< VPValue *, VPValue * > &EndValues, ScalarEvolution &SE)
If there's a single exit block, optimize its phi recipes that use exiting IV values by feeding them p...
static LLVM_ABI_FOR_TEST void handleEarlyExits(VPlan &Plan, bool HasUncountableExit)
Update Plan to account for all early exits.
static void addScalarResumePhis(VPlan &Plan, VPRecipeBuilder &Builder, DenseMap< VPValue *, VPValue * > &IVEndValues)
Create resume phis in the scalar preheader for first-order recurrences, reductions and inductions,...
static void canonicalizeEVLLoops(VPlan &Plan)
Transform EVL loops to use variable-length stepping after region dissolution.
static void dropPoisonGeneratingRecipes(VPlan &Plan, const std::function< bool(BasicBlock *)> &BlockNeedsPredication)
Drop poison flags from recipes that may generate a poison value that is used after vectorization,...
static void createInterleaveGroups(VPlan &Plan, const SmallPtrSetImpl< const InterleaveGroup< Instruction > * > &InterleaveGroups, VPRecipeBuilder &RecipeBuilder, const bool &ScalarEpilogueAllowed)
static bool runPass(bool(*Transform)(VPlan &, ArgsTy...), VPlan &Plan, typename std::remove_reference< ArgsTy >::type &...Args)
Helper to run a VPlan transform Transform on VPlan, forwarding extra arguments to the transform.
static void addBranchWeightToMiddleTerminator(VPlan &Plan, ElementCount VF, std::optional< unsigned > VScaleForTuning)
Add branch weight metadata, if the Plan's middle block is terminated by a BranchOnCond recipe.
static void unrollByUF(VPlan &Plan, unsigned UF)
Explicitly unroll Plan by UF.
static DenseMap< const SCEV *, Value * > expandSCEVs(VPlan &Plan, ScalarEvolution &SE)
Expand VPExpandSCEVRecipes in Plan's entry block.
static void convertToConcreteRecipes(VPlan &Plan)
Lower abstract recipes to concrete ones, that can be codegen'd.
static void addMinimumIterationCheck(VPlan &Plan, ElementCount VF, unsigned UF, ElementCount MinProfitableTripCount, bool RequiresScalarEpilogue, bool TailFolded, bool CheckNeededWithTailFolding, Loop *OrigLoop, const uint32_t *MinItersBypassWeights, DebugLoc DL, ScalarEvolution &SE)
static void convertToAbstractRecipes(VPlan &Plan, VPCostContext &Ctx, VFRange &Range)
This function converts initial recipes to the abstract recipes and clamps Range based on cost model f...
static void materializeConstantVectorTripCount(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
static LLVM_ABI_FOR_TEST bool tryToConvertVPInstructionsToVPRecipes(VPlan &Plan, function_ref< const InductionDescriptor *(PHINode *)> GetIntOrFpInductionDescriptor, const TargetLibraryInfo &TLI)
Replaces the VPInstructions in Plan with corresponding widen recipes.
static void addExitUsersForFirstOrderRecurrences(VPlan &Plan, VFRange &Range)
Handle users in the exit block for first order reductions in the original exit block.
static DenseMap< VPBasicBlock *, VPValue * > introduceMasksAndLinearize(VPlan &Plan, bool FoldTail)
Predicate and linearize the control-flow in the only loop region of Plan.
static void addExplicitVectorLength(VPlan &Plan, const std::optional< unsigned > &MaxEVLSafeElements)
Add a VPEVLBasedIVPHIRecipe and related recipes to Plan and replaces all uses except the canonical IV...
static void replaceSymbolicStrides(VPlan &Plan, PredicatedScalarEvolution &PSE, const DenseMap< Value *, const SCEV * > &StridesMap)
Replace symbolic strides from StridesMap in Plan with constants when possible.
static bool handleMaxMinNumReductions(VPlan &Plan)
Check if Plan contains any FMaxNum or FMinNum reductions.
static void removeBranchOnConst(VPlan &Plan)
Remove BranchOnCond recipes with true or false conditions together with removing dead edges to their ...
static LLVM_ABI_FOR_TEST void createLoopRegions(VPlan &Plan)
Replace loops in Plan's flat CFG with VPRegionBlocks, turning Plan's flat CFG into a hierarchical CFG...
static void removeDeadRecipes(VPlan &Plan)
Remove dead recipes from Plan.
static void attachCheckBlock(VPlan &Plan, Value *Cond, BasicBlock *CheckBlock, bool AddBranchWeights)
Wrap runtime check block CheckBlock in a VPIRBB and Cond in a VPValue and connect the block to Plan,...
static void materializeVectorTripCount(VPlan &Plan, VPBasicBlock *VectorPHVPBB, bool TailByMasking, bool RequiresScalarEpilogue)
Materialize vector trip count computations to a set of VPInstructions.
static void simplifyRecipes(VPlan &Plan)
Perform instcombine-like simplifications on recipes in Plan.
static void replicateByVF(VPlan &Plan, ElementCount VF)
Replace each replicating VPReplicateRecipe and VPInstruction outside of any replicate region in Plan ...
static void clearReductionWrapFlags(VPlan &Plan)
Clear NSW/NUW flags from reduction instructions if necessary.
static void cse(VPlan &Plan)
Perform common-subexpression-elimination on Plan.
static void addActiveLaneMask(VPlan &Plan, bool UseActiveLaneMaskForControlFlow, bool DataAndControlFlowWithoutRuntimeCheck)
Replace (ICMP_ULE, wide canonical IV, backedge-taken-count) checks with an (active-lane-mask recipe,...
static void optimize(VPlan &Plan)
Apply VPlan-to-VPlan optimizations to Plan, including induction recipe optimizations,...
static void dissolveLoopRegions(VPlan &Plan)
Replace loop regions with explicit CFG.
static void narrowInterleaveGroups(VPlan &Plan, ElementCount VF, unsigned VectorRegWidth)
Try to convert a plan with interleave groups with VF elements to a plan with the interleave groups re...
static void truncateToMinimalBitwidths(VPlan &Plan, const MapVector< Instruction *, uint64_t > &MinBWs)
Insert truncates and extends for any truncated recipe.
static bool adjustFixedOrderRecurrences(VPlan &Plan, VPBuilder &Builder)
Try to have all users of fixed-order recurrences appear after the recipe defining their previous valu...
static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Optimize Plan based on BestVF and BestUF.
static void materializeVFAndVFxUF(VPlan &Plan, VPBasicBlock *VectorPH, ElementCount VF)
Materialize VF and VFxUF to be computed explicitly using VPInstructions.
static void addMinimumVectorEpilogueIterationCheck(VPlan &Plan, Value *TripCount, Value *VectorTripCount, bool RequiresScalarEpilogue, ElementCount EpilogueVF, unsigned EpilogueUF, unsigned MainLoopStep, unsigned EpilogueLoopStep, ScalarEvolution &SE)
Add a check to Plan to see if the epilogue vector loop should be executed.
static LLVM_ABI_FOR_TEST void addMiddleCheck(VPlan &Plan, bool RequiresScalarEpilogueCheck, bool TailFolded)
If a check is needed to guard executing the scalar epilogue loop, it will be added to the middle bloc...
TODO: The following VectorizationFactor was pulled out of LoopVectorizationCostModel class.
InstructionCost Cost
Cost of the loop with that width.
ElementCount MinProfitableTripCount
The minimum trip count required to make vectorization profitable, e.g.
ElementCount Width
Vector width with best cost.
InstructionCost ScalarCost
Cost of the scalar loop.
static VectorizationFactor Disabled()
Width 1 means no vectorization, cost 0 means uncomputed cost.
static LLVM_ABI bool HoistRuntimeChecks