LLVM 23.0.0git
LoopVectorize.cpp
Go to the documentation of this file.
1//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10// and generates target-independent LLVM-IR.
11// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12// of instructions in order to estimate the profitability of vectorization.
13//
14// The loop vectorizer combines consecutive loop iterations into a single
15// 'wide' iteration. After this transformation the index is incremented
16// by the SIMD vector width, and not by one.
17//
18// This pass has three parts:
19// 1. The main loop pass that drives the different parts.
20// 2. LoopVectorizationLegality - A unit that checks for the legality
21// of the vectorization.
22// 3. InnerLoopVectorizer - A unit that performs the actual
23// widening of instructions.
24// 4. LoopVectorizationCostModel - A unit that checks for the profitability
25// of vectorization. It decides on the optimal vector width, which
26// can be one, if vectorization is not profitable.
27//
28// There is a development effort going on to migrate loop vectorizer to the
29// VPlan infrastructure and to introduce outer loop vectorization support (see
30// docs/VectorizationPlan.rst and
31// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32// purpose, we temporarily introduced the VPlan-native vectorization path: an
33// alternative vectorization path that is natively implemented on top of the
34// VPlan infrastructure. See EnableVPlanNativePath for enabling.
35//
36//===----------------------------------------------------------------------===//
37//
38// The reduction-variable vectorization is based on the paper:
39// D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40//
41// Variable uniformity checks are inspired by:
42// Karrenberg, R. and Hack, S. Whole Function Vectorization.
43//
44// The interleaved access vectorization is based on the paper:
45// Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
46// Data for SIMD
47//
48// Other ideas/concepts are from:
49// A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50//
51// S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
52// Vectorizing Compilers.
53//
54//===----------------------------------------------------------------------===//
55
58#include "VPRecipeBuilder.h"
59#include "VPlan.h"
60#include "VPlanAnalysis.h"
61#include "VPlanCFG.h"
62#include "VPlanHelpers.h"
63#include "VPlanPatternMatch.h"
64#include "VPlanTransforms.h"
65#include "VPlanUtils.h"
66#include "VPlanVerifier.h"
67#include "llvm/ADT/APInt.h"
68#include "llvm/ADT/ArrayRef.h"
69#include "llvm/ADT/DenseMap.h"
71#include "llvm/ADT/Hashing.h"
72#include "llvm/ADT/MapVector.h"
73#include "llvm/ADT/STLExtras.h"
76#include "llvm/ADT/Statistic.h"
77#include "llvm/ADT/StringRef.h"
78#include "llvm/ADT/Twine.h"
79#include "llvm/ADT/TypeSwitch.h"
84#include "llvm/Analysis/CFG.h"
101#include "llvm/IR/Attributes.h"
102#include "llvm/IR/BasicBlock.h"
103#include "llvm/IR/CFG.h"
104#include "llvm/IR/Constant.h"
105#include "llvm/IR/Constants.h"
106#include "llvm/IR/DataLayout.h"
107#include "llvm/IR/DebugInfo.h"
108#include "llvm/IR/DebugLoc.h"
109#include "llvm/IR/DerivedTypes.h"
111#include "llvm/IR/Dominators.h"
112#include "llvm/IR/Function.h"
113#include "llvm/IR/IRBuilder.h"
114#include "llvm/IR/InstrTypes.h"
115#include "llvm/IR/Instruction.h"
116#include "llvm/IR/Instructions.h"
118#include "llvm/IR/Intrinsics.h"
119#include "llvm/IR/MDBuilder.h"
120#include "llvm/IR/Metadata.h"
121#include "llvm/IR/Module.h"
122#include "llvm/IR/Operator.h"
123#include "llvm/IR/PatternMatch.h"
125#include "llvm/IR/Type.h"
126#include "llvm/IR/Use.h"
127#include "llvm/IR/User.h"
128#include "llvm/IR/Value.h"
129#include "llvm/IR/Verifier.h"
130#include "llvm/Support/Casting.h"
132#include "llvm/Support/Debug.h"
147#include <algorithm>
148#include <cassert>
149#include <cmath>
150#include <cstdint>
151#include <functional>
152#include <iterator>
153#include <limits>
154#include <memory>
155#include <string>
156#include <tuple>
157#include <utility>
158
159using namespace llvm;
160using namespace SCEVPatternMatch;
161
162#define LV_NAME "loop-vectorize"
163#define DEBUG_TYPE LV_NAME
164
165#ifndef NDEBUG
166const char VerboseDebug[] = DEBUG_TYPE "-verbose";
167#endif
168
169STATISTIC(LoopsVectorized, "Number of loops vectorized");
170STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
171STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
172STATISTIC(LoopsEarlyExitVectorized, "Number of early exit loops vectorized");
173
175 "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
176 cl::desc("Enable vectorization of epilogue loops."));
177
179 "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
180 cl::desc("When epilogue vectorization is enabled, and a value greater than "
181 "1 is specified, forces the given VF for all applicable epilogue "
182 "loops."));
183
185 "epilogue-vectorization-minimum-VF", cl::Hidden,
186 cl::desc("Only loops with vectorization factor equal to or larger than "
187 "the specified value are considered for epilogue vectorization."));
188
189/// Loops with a known constant trip count below this number are vectorized only
190/// if no scalar iteration overheads are incurred.
192 "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
193 cl::desc("Loops with a constant trip count that is smaller than this "
194 "value are vectorized only if no scalar iteration overheads "
195 "are incurred."));
196
198 "vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
199 cl::desc("The maximum allowed number of runtime memory checks"));
200
201// Option prefer-predicate-over-epilogue indicates that an epilogue is undesired,
202// that predication is preferred, and this lists all options. I.e., the
203// vectorizer will try to fold the tail-loop (epilogue) into the vector body
204// and predicate the instructions accordingly. If tail-folding fails, there are
205// different fallback strategies depending on these values:
212} // namespace PreferPredicateTy
213
215 "prefer-predicate-over-epilogue",
218 cl::desc("Tail-folding and predication preferences over creating a scalar "
219 "epilogue loop."),
221 "scalar-epilogue",
222 "Don't tail-predicate loops, create scalar epilogue"),
224 "predicate-else-scalar-epilogue",
225 "prefer tail-folding, create scalar epilogue if tail "
226 "folding fails."),
228 "predicate-dont-vectorize",
229 "prefers tail-folding, don't attempt vectorization if "
230 "tail-folding fails.")));
231
233 "force-tail-folding-style", cl::desc("Force the tail folding style"),
236 clEnumValN(TailFoldingStyle::None, "none", "Disable tail folding"),
239 "Create lane mask for data only, using active.lane.mask intrinsic"),
241 "data-without-lane-mask",
242 "Create lane mask with compare/stepvector"),
244 "Create lane mask using active.lane.mask intrinsic, and use "
245 "it for both data and control flow"),
247 "data-and-control-without-rt-check",
248 "Similar to data-and-control, but remove the runtime check"),
250 "Use predicated EVL instructions for tail folding. If EVL "
251 "is unsupported, fallback to data-without-lane-mask.")));
252
254 "enable-wide-lane-mask", cl::init(false), cl::Hidden,
255 cl::desc("Enable use of wide lane masks when used for control flow in "
256 "tail-folded loops"));
257
259 "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
260 cl::desc("Maximize bandwidth when selecting vectorization factor which "
261 "will be determined by the smallest type in loop."));
262
264 "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
265 cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
266
267/// An interleave-group may need masking if it resides in a block that needs
268/// predication, or in order to mask away gaps.
270 "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
271 cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
272
274 "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
275 cl::desc("A flag that overrides the target's number of scalar registers."));
276
278 "force-target-num-vector-regs", cl::init(0), cl::Hidden,
279 cl::desc("A flag that overrides the target's number of vector registers."));
280
282 "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
283 cl::desc("A flag that overrides the target's max interleave factor for "
284 "scalar loops."));
285
287 "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
288 cl::desc("A flag that overrides the target's max interleave factor for "
289 "vectorized loops."));
290
292 "force-target-instruction-cost", cl::init(0), cl::Hidden,
293 cl::desc("A flag that overrides the target's expected cost for "
294 "an instruction to a single constant value. Mostly "
295 "useful for getting consistent testing."));
296
298 "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
299 cl::desc(
300 "Pretend that scalable vectors are supported, even if the target does "
301 "not support them. This flag should only be used for testing."));
302
304 "small-loop-cost", cl::init(20), cl::Hidden,
305 cl::desc(
306 "The cost of a loop that is considered 'small' by the interleaver."));
307
309 "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
310 cl::desc("Enable the use of the block frequency analysis to access PGO "
311 "heuristics minimizing code growth in cold regions and being more "
312 "aggressive in hot regions."));
313
314// Runtime interleave loops for load/store throughput.
316 "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
317 cl::desc(
318 "Enable runtime interleaving until load/store ports are saturated"));
319
320/// The number of stores in a loop that are allowed to need predication.
322 "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
323 cl::desc("Max number of stores to be predicated behind an if."));
324
326 "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
327 cl::desc("Count the induction variable only once when interleaving"));
328
330 "enable-cond-stores-vec", cl::init(true), cl::Hidden,
331 cl::desc("Enable if predication of stores during vectorization."));
332
334 "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
335 cl::desc("The maximum interleave count to use when interleaving a scalar "
336 "reduction in a nested loop."));
337
338static cl::opt<bool>
339 PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
341 cl::desc("Prefer in-loop vector reductions, "
342 "overriding the targets preference."));
343
345 "force-ordered-reductions", cl::init(false), cl::Hidden,
346 cl::desc("Enable the vectorisation of loops with in-order (strict) "
347 "FP reductions"));
348
350 "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
351 cl::desc(
352 "Prefer predicating a reduction operation over an after loop select."));
353
355 "enable-vplan-native-path", cl::Hidden,
356 cl::desc("Enable VPlan-native vectorization path with "
357 "support for outer loop vectorization."));
358
360 llvm::VerifyEachVPlan("vplan-verify-each",
361#ifdef EXPENSIVE_CHECKS
362 cl::init(true),
363#else
364 cl::init(false),
365#endif
367 cl::desc("Verify VPlans after VPlan transforms."));
368
369#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
371 "vplan-print-after-all", cl::init(false), cl::Hidden,
372 cl::desc("Print VPlans after all VPlan transformations."));
373
375 "vplan-print-after", cl::Hidden,
376 cl::desc("Print VPlans after specified VPlan transformations (regexp)."));
377
379 "vplan-print-vector-region-scope", cl::init(false), cl::Hidden,
380 cl::desc("Limit VPlan printing to vector loop region in "
381 "`-vplan-print-after*` if the plan has one."));
382#endif
383
384// This flag enables the stress testing of the VPlan H-CFG construction in the
 385// VPlan-native vectorization path. It must be used in conjunction with
386// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
387// verification of the H-CFGs built.
389 "vplan-build-stress-test", cl::init(false), cl::Hidden,
390 cl::desc(
391 "Build VPlan for every supported loop nest in the function and bail "
392 "out right after the build (stress test the VPlan H-CFG construction "
393 "in the VPlan-native vectorization path)."));
394
396 "interleave-loops", cl::init(true), cl::Hidden,
397 cl::desc("Enable loop interleaving in Loop vectorization passes"));
399 "vectorize-loops", cl::init(true), cl::Hidden,
400 cl::desc("Run the Loop vectorization passes"));
401
403 "force-widen-divrem-via-safe-divisor", cl::Hidden,
404 cl::desc(
405 "Override cost based safe divisor widening for div/rem instructions"));
406
408 "vectorizer-maximize-bandwidth-for-vector-calls", cl::init(true),
410 cl::desc("Try wider VFs if they enable the use of vector variants"));
411
413 "enable-early-exit-vectorization", cl::init(true), cl::Hidden,
414 cl::desc(
415 "Enable vectorization of early exit loops with uncountable exits."));
416
418 "vectorizer-consider-reg-pressure", cl::init(false), cl::Hidden,
419 cl::desc("Discard VFs if their register pressure is too high."));
420
 421// Likelihood of bypassing the vectorized loop because there are zero trips left
422// after prolog. See `emitIterationCountCheck`.
423static constexpr uint32_t MinItersBypassWeights[] = {1, 127};
424
425/// A helper function that returns true if the given type is irregular. The
426/// type is irregular if its allocated size doesn't equal the store size of an
427/// element of the corresponding vector type.
428static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
429 // Determine if an array of N elements of type Ty is "bitcast compatible"
430 // with a <N x Ty> vector.
431 // This is only true if there is no padding between the array elements.
432 return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
433}
434
435/// A version of ScalarEvolution::getSmallConstantTripCount that returns an
436/// ElementCount to include loops whose trip count is a function of vscale.
438 const Loop *L) {
439 if (unsigned ExpectedTC = SE->getSmallConstantTripCount(L))
440 return ElementCount::getFixed(ExpectedTC);
441
442 const SCEV *BTC = SE->getBackedgeTakenCount(L);
444 return ElementCount::getFixed(0);
445
446 const SCEV *ExitCount = SE->getTripCountFromExitCount(BTC, BTC->getType(), L);
447 if (isa<SCEVVScale>(ExitCount))
449
450 const APInt *Scale;
451 if (match(ExitCount, m_scev_Mul(m_scev_APInt(Scale), m_SCEVVScale())))
452 if (cast<SCEVMulExpr>(ExitCount)->hasNoUnsignedWrap())
453 if (Scale->getActiveBits() <= 32)
455
456 return ElementCount::getFixed(0);
457}
458
459/// Returns "best known" trip count, which is either a valid positive trip count
460/// or std::nullopt when an estimate cannot be made (including when the trip
461/// count would overflow), for the specified loop \p L as defined by the
462/// following procedure:
463/// 1) Returns exact trip count if it is known.
464/// 2) Returns expected trip count according to profile data if any.
465/// 3) Returns upper bound estimate if known, and if \p CanUseConstantMax.
466/// 4) Returns std::nullopt if all of the above failed.
467static std::optional<ElementCount>
469 bool CanUseConstantMax = true) {
470 // Check if exact trip count is known.
471 if (auto ExpectedTC = getSmallConstantTripCount(PSE.getSE(), L))
472 return ExpectedTC;
473
474 // Check if there is an expected trip count available from profile data.
476 if (auto EstimatedTC = getLoopEstimatedTripCount(L))
477 return ElementCount::getFixed(*EstimatedTC);
478
479 if (!CanUseConstantMax)
480 return std::nullopt;
481
482 // Check if upper bound estimate is known.
483 if (unsigned ExpectedTC = PSE.getSmallConstantMaxTripCount())
484 return ElementCount::getFixed(ExpectedTC);
485
486 return std::nullopt;
487}
488
489namespace {
490// Forward declare GeneratedRTChecks.
491class GeneratedRTChecks;
492
493using SCEV2ValueTy = DenseMap<const SCEV *, Value *>;
494} // namespace
495
496namespace llvm {
497
499
500/// InnerLoopVectorizer vectorizes loops which contain only one basic
501/// block to a specified vectorization factor (VF).
502/// This class performs the widening of scalars into vectors, or multiple
503/// scalars. This class also implements the following features:
504/// * It inserts an epilogue loop for handling loops that don't have iteration
505/// counts that are known to be a multiple of the vectorization factor.
506/// * It handles the code generation for reduction variables.
507/// * Scalarization (implementation using scalars) of un-vectorizable
508/// instructions.
509/// InnerLoopVectorizer does not perform any vectorization-legality
510/// checks, and relies on the caller to check for the different legality
511/// aspects. The InnerLoopVectorizer relies on the
512/// LoopVectorizationLegality class to provide information about the induction
513/// and reduction variables that were found to a given vectorization factor.
515public:
519 ElementCount VecWidth, unsigned UnrollFactor,
521 GeneratedRTChecks &RTChecks, VPlan &Plan)
522 : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TTI(TTI), AC(AC),
523 VF(VecWidth), UF(UnrollFactor), Builder(PSE.getSE()->getContext()),
526 Plan.getVectorLoopRegion()->getSinglePredecessor())) {}
527
528 virtual ~InnerLoopVectorizer() = default;
529
530 /// Creates a basic block for the scalar preheader. Both
531 /// EpilogueVectorizerMainLoop and EpilogueVectorizerEpilogueLoop overwrite
532 /// the method to create additional blocks and checks needed for epilogue
533 /// vectorization.
535
536 /// Fix the vectorized code, taking care of header phi's, and more.
538
539 /// Fix the non-induction PHIs in \p Plan.
541
542 /// Returns the original loop trip count.
543 Value *getTripCount() const { return TripCount; }
544
545 /// Used to set the trip count after ILV's construction and after the
546 /// preheader block has been executed. Note that this always holds the trip
547 /// count of the original loop for both main loop and epilogue vectorization.
548 void setTripCount(Value *TC) { TripCount = TC; }
549
550protected:
552
553 /// Create and return a new IR basic block for the scalar preheader whose name
554 /// is prefixed with \p Prefix.
556
557 /// Allow subclasses to override and print debug traces before/after vplan
558 /// execution, when trace information is requested.
559 virtual void printDebugTracesAtStart() {}
560 virtual void printDebugTracesAtEnd() {}
561
562 /// The original loop.
564
565 /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
566 /// dynamic knowledge to simplify SCEV expressions and converts them to a
567 /// more usable form.
569
570 /// Loop Info.
572
573 /// Dominator Tree.
575
576 /// Target Transform Info.
578
579 /// Assumption Cache.
581
582 /// The vectorization SIMD factor to use. Each vector will have this many
583 /// vector elements.
585
586 /// The vectorization unroll factor to use. Each scalar is vectorized to this
587 /// many different vector instructions.
588 unsigned UF;
589
590 /// The builder that we use
592
593 // --- Vectorization state ---
594
595 /// Trip count of the original loop.
596 Value *TripCount = nullptr;
597
598 /// The profitablity analysis.
600
601 /// Structure to hold information about generated runtime checks, responsible
602 /// for cleaning the checks, if vectorization turns out unprofitable.
603 GeneratedRTChecks &RTChecks;
604
606
607 /// The vector preheader block of \p Plan, used as target for check blocks
608 /// introduced during skeleton creation.
610};
611
612/// Encapsulate information regarding vectorization of a loop and its epilogue.
613/// This information is meant to be updated and used across two stages of
614/// epilogue vectorization.
617 unsigned MainLoopUF = 0;
619 unsigned EpilogueUF = 0;
622 Value *TripCount = nullptr;
625
627 ElementCount EVF, unsigned EUF,
629 : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF),
631 assert(EUF == 1 &&
632 "A high UF for the epilogue loop is likely not beneficial.");
633 }
634};
635
636/// An extension of the inner loop vectorizer that creates a skeleton for a
637/// vectorized loop that has its epilogue (residual) also vectorized.
638/// The idea is to run the vplan on a given loop twice, firstly to setup the
639/// skeleton and vectorize the main loop, and secondly to complete the skeleton
640/// from the first step and vectorize the epilogue. This is achieved by
641/// deriving two concrete strategy classes from this base class and invoking
642/// them in succession from the loop vectorizer planner.
644public:
654
655 /// Holds and updates state information required to vectorize the main loop
656 /// and its epilogue in two separate passes. This setup helps us avoid
657 /// regenerating and recomputing runtime safety checks. It also helps us to
658 /// shorten the iteration-count-check path length for the cases where the
659 /// iteration count of the loop is so small that the main vector loop is
660 /// completely skipped.
662
663protected:
665};
666
667/// A specialized derived class of inner loop vectorizer that performs
668/// vectorization of *main* loops in the process of vectorizing loops and their
669/// epilogues.
671public:
682 /// Implements the interface for creating a vectorized skeleton using the
683 /// *main loop* strategy (i.e., the first pass of VPlan execution).
685
686protected:
687 /// Introduces a new VPIRBasicBlock for \p CheckIRBB to Plan between the
688 /// vector preheader and its predecessor, also connecting the new block to the
689 /// scalar preheader.
690 void introduceCheckBlockInVPlan(BasicBlock *CheckIRBB);
691
692 // Create a check to see if the main vector loop should be executed
694 unsigned UF) const;
695
696 /// Emits an iteration count bypass check once for the main loop (when \p
697 /// ForEpilogue is false) and once for the epilogue loop (when \p
698 /// ForEpilogue is true).
700 bool ForEpilogue);
701 void printDebugTracesAtStart() override;
702 void printDebugTracesAtEnd() override;
703};
704
705// A specialized derived class of inner loop vectorizer that performs
706// vectorization of *epilogue* loops in the process of vectorizing loops and
707// their epilogues.
709public:
716 GeneratedRTChecks &Checks, VPlan &Plan)
718 Checks, Plan, EPI.EpilogueVF,
719 EPI.EpilogueVF, EPI.EpilogueUF) {}
720 /// Implements the interface for creating a vectorized skeleton using the
721 /// *epilogue loop* strategy (i.e., the second pass of VPlan execution).
723
724protected:
725 void printDebugTracesAtStart() override;
726 void printDebugTracesAtEnd() override;
727};
728} // end namespace llvm
729
730/// Look for a meaningful debug location on the instruction or its operands.
732 if (!I)
733 return DebugLoc::getUnknown();
734
736 if (I->getDebugLoc() != Empty)
737 return I->getDebugLoc();
738
739 for (Use &Op : I->operands()) {
740 if (Instruction *OpInst = dyn_cast<Instruction>(Op))
741 if (OpInst->getDebugLoc() != Empty)
742 return OpInst->getDebugLoc();
743 }
744
745 return I->getDebugLoc();
746}
747
748/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
749/// is passed, the message relates to that particular instruction.
750#ifndef NDEBUG
751static void debugVectorizationMessage(const StringRef Prefix,
752 const StringRef DebugMsg,
753 Instruction *I) {
754 dbgs() << "LV: " << Prefix << DebugMsg;
755 if (I != nullptr)
756 dbgs() << " " << *I;
757 else
758 dbgs() << '.';
759 dbgs() << '\n';
760}
761#endif
762
763/// Create an analysis remark that explains why vectorization failed
764///
765/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
766/// RemarkName is the identifier for the remark. If \p I is passed it is an
767/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
768/// the location of the remark. If \p DL is passed, use it as debug location for
769/// the remark. \return the remark object that can be streamed to.
770static OptimizationRemarkAnalysis
771createLVAnalysis(const char *PassName, StringRef RemarkName, Loop *TheLoop,
772 Instruction *I, DebugLoc DL = {}) {
773 BasicBlock *CodeRegion = I ? I->getParent() : TheLoop->getHeader();
774 // If debug location is attached to the instruction, use it. Otherwise if DL
775 // was not provided, use the loop's.
776 if (I && I->getDebugLoc())
777 DL = I->getDebugLoc();
778 else if (!DL)
779 DL = TheLoop->getStartLoc();
780
781 return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
782}
783
784namespace llvm {
785
786/// Return a value for Step multiplied by VF.
788 int64_t Step) {
789 assert(Ty->isIntegerTy() && "Expected an integer step");
790 ElementCount VFxStep = VF.multiplyCoefficientBy(Step);
791 assert(isPowerOf2_64(VF.getKnownMinValue()) && "must pass power-of-2 VF");
792 if (VF.isScalable() && isPowerOf2_64(Step)) {
793 return B.CreateShl(
794 B.CreateVScale(Ty),
795 ConstantInt::get(Ty, Log2_64(VFxStep.getKnownMinValue())), "", true);
796 }
797 return B.CreateElementCount(Ty, VFxStep);
798}
799
800/// Return the runtime value for VF.
802 return B.CreateElementCount(Ty, VF);
803}
804
806 const StringRef OREMsg, const StringRef ORETag,
807 OptimizationRemarkEmitter *ORE, Loop *TheLoop,
808 Instruction *I) {
809 LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
810 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
811 ORE->emit(
812 createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
813 << "loop not vectorized: " << OREMsg);
814}
815
816/// Reports an informative message: print \p Msg for debugging purposes as well
817/// as an optimization remark. Uses either \p I as location of the remark, or
818/// otherwise \p TheLoop. If \p DL is passed, use it as debug location for the
819/// remark. If \p DL is passed, use it as debug location for the remark.
820static void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
822 Loop *TheLoop, Instruction *I = nullptr,
823 DebugLoc DL = {}) {
825 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
826 ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop,
827 I, DL)
828 << Msg);
829}
830
831/// Report successful vectorization of the loop. In case an outer loop is
832/// vectorized, prepend "outer" to the vectorization remark.
834 VectorizationFactor VF, unsigned IC) {
836 "Vectorizing: ", TheLoop->isInnermost() ? "innermost loop" : "outer loop",
837 nullptr));
838 StringRef LoopType = TheLoop->isInnermost() ? "" : "outer ";
839 ORE->emit([&]() {
840 return OptimizationRemark(LV_NAME, "Vectorized", TheLoop->getStartLoc(),
841 TheLoop->getHeader())
842 << "vectorized " << LoopType << "loop (vectorization width: "
843 << ore::NV("VectorizationFactor", VF.Width)
844 << ", interleaved count: " << ore::NV("InterleaveCount", IC) << ")";
845 });
846}
847
848} // end namespace llvm
849
850namespace llvm {
851
852// Loop vectorization cost-model hints how the scalar epilogue loop should be
853// lowered.
855
856 // The default: allowing scalar epilogues.
858
859 // Vectorization with OptForSize: don't allow epilogues.
861
862 // A special case of vectorisation with OptForSize: loops with a very small
863 // trip count are considered for vectorization under OptForSize, thereby
864 // making sure the cost of their loop body is dominant, free of runtime
865 // guards and scalar iteration overheads.
867
868 // Loop hint predicate indicating an epilogue is undesired.
870
871 // Directive indicating we must either tail fold or not vectorize
873};
874
875/// LoopVectorizationCostModel - estimates the expected speedups due to
876/// vectorization.
877/// In many cases vectorization is not profitable. This can happen because of
878/// a number of reasons. In this class we mainly attempt to predict the
879/// expected speedup/slowdowns due to the supported instruction set. We use the
880/// TargetTransformInfo to query the different backends for the cost of
881/// different operations.
884
885public:
893 std::function<BlockFrequencyInfo &()> GetBFI,
894 const Function *F, const LoopVectorizeHints *Hints,
896 : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
897 TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), GetBFI(GetBFI),
900 if (TTI.supportsScalableVectors() || ForceTargetSupportsScalableVectors)
901 initializeVScaleForTuning();
903 }
904
905 /// \return An upper bound for the vectorization factors (both fixed and
906 /// scalable). If the factors are 0, vectorization and interleaving should be
907 /// avoided up front.
908 FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);
909
910 /// \return True if runtime checks are required for vectorization, and false
911 /// otherwise.
912 bool runtimeChecksRequired();
913
914 /// Setup cost-based decisions for user vectorization factor.
915 /// \return true if the UserVF is a feasible VF to be chosen.
918 return expectedCost(UserVF).isValid();
919 }
920
921 /// \return True if maximizing vector bandwidth is enabled by the target or
922 /// user options, for the given register kind.
923 bool useMaxBandwidth(TargetTransformInfo::RegisterKind RegKind);
924
925 /// \return True if register pressure should be considered for the given VF.
926 bool shouldConsiderRegPressureForVF(ElementCount VF);
927
928 /// \return The size (in bits) of the smallest and widest types in the code
929 /// that needs to be vectorized. We ignore values that remain scalar such as
930 /// 64 bit loop indices.
931 std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
932
933 /// Memory access instruction may be vectorized in more than one way.
934 /// Form of instruction after vectorization depends on cost.
935 /// This function takes cost-based decisions for Load/Store instructions
936 /// and collects them in a map. This decisions map is used for building
937 /// the lists of loop-uniform and loop-scalar instructions.
938 /// The calculated cost is saved with widening decision in order to
939 /// avoid redundant calculations.
940 void setCostBasedWideningDecision(ElementCount VF);
941
942 /// A call may be vectorized in different ways depending on whether we have
943 /// vectorized variants available and whether the target supports masking.
944 /// This function analyzes all calls in the function at the supplied VF,
945 /// makes a decision based on the costs of available options, and stores that
946 /// decision in a map for use in planning and plan execution.
947 void setVectorizedCallDecision(ElementCount VF);
948
949 /// Collect values we want to ignore in the cost model.
950 void collectValuesToIgnore();
951
952 /// Collect all element types in the loop for which widening is needed.
953 void collectElementTypesForWidening();
954
955 /// Split reductions into those that happen in the loop, and those that happen
956 /// outside. In loop reductions are collected into InLoopReductions.
957 void collectInLoopReductions();
958
959 /// Returns true if we should use strict in-order reductions for the given
960 /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
961 /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
962 /// of FP operations.
963 bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const {
964 return !Hints->allowReordering() && RdxDesc.isOrdered();
965 }
966
967 /// \returns The smallest bitwidth each instruction can be represented with.
968 /// The vector equivalents of these instructions should be truncated to this
969 /// type.
971 return MinBWs;
972 }
973
974 /// \returns True if it is more profitable to scalarize instruction \p I for
975 /// vectorization factor \p VF.
977 assert(VF.isVector() &&
978 "Profitable to scalarize relevant only for VF > 1.");
979 assert(
980 TheLoop->isInnermost() &&
981 "cost-model should not be used for outer loops (in VPlan-native path)");
982
983 auto Scalars = InstsToScalarize.find(VF);
984 assert(Scalars != InstsToScalarize.end() &&
985 "VF not yet analyzed for scalarization profitability");
986 return Scalars->second.contains(I);
987 }
988
989 /// Returns true if \p I is known to be uniform after vectorization.
991 assert(
992 TheLoop->isInnermost() &&
993 "cost-model should not be used for outer loops (in VPlan-native path)");
994 // Pseudo probe needs to be duplicated for each unrolled iteration and
995 // vector lane so that profiled loop trip count can be accurately
996 // accumulated instead of being under counted.
998 return false;
999
1000 if (VF.isScalar())
1001 return true;
1002
1003 auto UniformsPerVF = Uniforms.find(VF);
1004 assert(UniformsPerVF != Uniforms.end() &&
1005 "VF not yet analyzed for uniformity");
1006 return UniformsPerVF->second.count(I);
1007 }
1008
1009 /// Returns true if \p I is known to be scalar after vectorization.
1011 assert(
1012 TheLoop->isInnermost() &&
1013 "cost-model should not be used for outer loops (in VPlan-native path)");
1014 if (VF.isScalar())
1015 return true;
1016
1017 auto ScalarsPerVF = Scalars.find(VF);
1018 assert(ScalarsPerVF != Scalars.end() &&
1019 "Scalar values are not calculated for VF");
1020 return ScalarsPerVF->second.count(I);
1021 }
1022
1023 /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1024 /// for vectorization factor \p VF.
1026 // Truncs must truncate at most to their destination type.
1027 if (isa_and_nonnull<TruncInst>(I) && MinBWs.contains(I) &&
1028 I->getType()->getScalarSizeInBits() < MinBWs.lookup(I))
1029 return false;
1030 return VF.isVector() && MinBWs.contains(I) &&
1031 !isProfitableToScalarize(I, VF) &&
1033 }
1034
1035 /// Decision that was taken during cost calculation for memory instruction.
1038 CM_Widen, // For consecutive accesses with stride +1.
1039 CM_Widen_Reverse, // For consecutive accesses with stride -1.
1045 };
1046
1047 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1048 /// instruction \p I and vector width \p VF.
1051 assert(VF.isVector() && "Expected VF >=2");
1052 WideningDecisions[{I, VF}] = {W, Cost};
1053 }
1054
1055 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1056 /// interleaving group \p Grp and vector width \p VF.
1060 assert(VF.isVector() && "Expected VF >=2");
 1062    /// Broadcast this decision to all instructions inside the group.
1062 /// When interleaving, the cost will only be assigned one instruction, the
1063 /// insert position. For other cases, add the appropriate fraction of the
1064 /// total cost to each instruction. This ensures accurate costs are used,
1065 /// even if the insert position instruction is not used.
1066 InstructionCost InsertPosCost = Cost;
1067 InstructionCost OtherMemberCost = 0;
1068 if (W != CM_Interleave)
1069 OtherMemberCost = InsertPosCost = Cost / Grp->getNumMembers();
1070 ;
1071 for (unsigned Idx = 0; Idx < Grp->getFactor(); ++Idx) {
1072 if (auto *I = Grp->getMember(Idx)) {
1073 if (Grp->getInsertPos() == I)
1074 WideningDecisions[{I, VF}] = {W, InsertPosCost};
1075 else
1076 WideningDecisions[{I, VF}] = {W, OtherMemberCost};
1077 }
1078 }
1079 }
1080
1081 /// Return the cost model decision for the given instruction \p I and vector
1082 /// width \p VF. Return CM_Unknown if this instruction did not pass
1083 /// through the cost modeling.
1085 assert(VF.isVector() && "Expected VF to be a vector VF");
1086 assert(
1087 TheLoop->isInnermost() &&
1088 "cost-model should not be used for outer loops (in VPlan-native path)");
1089
1090 std::pair<Instruction *, ElementCount> InstOnVF(I, VF);
1091 auto Itr = WideningDecisions.find(InstOnVF);
1092 if (Itr == WideningDecisions.end())
1093 return CM_Unknown;
1094 return Itr->second.first;
1095 }
1096
1097 /// Return the vectorization cost for the given instruction \p I and vector
1098 /// width \p VF.
1100 assert(VF.isVector() && "Expected VF >=2");
1101 std::pair<Instruction *, ElementCount> InstOnVF(I, VF);
1102 assert(WideningDecisions.contains(InstOnVF) &&
1103 "The cost is not calculated");
1104 return WideningDecisions[InstOnVF].second;
1105 }
1106
1114
1116 Function *Variant, Intrinsic::ID IID,
1117 std::optional<unsigned> MaskPos,
1119 assert(!VF.isScalar() && "Expected vector VF");
1120 CallWideningDecisions[{CI, VF}] = {Kind, Variant, IID, MaskPos, Cost};
1121 }
1122
1124 ElementCount VF) const {
1125 assert(!VF.isScalar() && "Expected vector VF");
1126 auto I = CallWideningDecisions.find({CI, VF});
1127 if (I == CallWideningDecisions.end())
1128 return {CM_Unknown, nullptr, Intrinsic::not_intrinsic, std::nullopt, 0};
1129 return I->second;
1130 }
1131
1132 /// Return True if instruction \p I is an optimizable truncate whose operand
1133 /// is an induction variable. Such a truncate will be removed by adding a new
1134 /// induction variable with the destination type.
1136 // If the instruction is not a truncate, return false.
1137 auto *Trunc = dyn_cast<TruncInst>(I);
1138 if (!Trunc)
1139 return false;
1140
1141 // Get the source and destination types of the truncate.
1142 Type *SrcTy = toVectorTy(Trunc->getSrcTy(), VF);
1143 Type *DestTy = toVectorTy(Trunc->getDestTy(), VF);
1144
1145 // If the truncate is free for the given types, return false. Replacing a
1146 // free truncate with an induction variable would add an induction variable
1147 // update instruction to each iteration of the loop. We exclude from this
1148 // check the primary induction variable since it will need an update
1149 // instruction regardless.
1150 Value *Op = Trunc->getOperand(0);
1151 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1152 return false;
1153
1154 // If the truncated value is not an induction variable, return false.
1155 return Legal->isInductionPhi(Op);
1156 }
1157
1158 /// Collects the instructions to scalarize for each predicated instruction in
1159 /// the loop.
1160 void collectInstsToScalarize(ElementCount VF);
1161
1162 /// Collect values that will not be widened, including Uniforms, Scalars, and
1163 /// Instructions to Scalarize for the given \p VF.
1164 /// The sets depend on CM decision for Load/Store instructions
1165 /// that may be vectorized as interleave, gather-scatter or scalarized.
1166 /// Also make a decision on what to do about call instructions in the loop
1167 /// at that VF -- scalarize, call a known vector routine, or call a
1168 /// vector intrinsic.
1170 // Do the analysis once.
1171 if (VF.isScalar() || Uniforms.contains(VF))
1172 return;
1174 collectLoopUniforms(VF);
1176 collectLoopScalars(VF);
1178 }
1179
1180 /// Returns true if the target machine supports masked store operation
1181 /// for the given \p DataType and kind of access to \p Ptr.
1182 bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment,
1183 unsigned AddressSpace) const {
1184 return Legal->isConsecutivePtr(DataType, Ptr) &&
1185 TTI.isLegalMaskedStore(DataType, Alignment, AddressSpace);
1186 }
1187
1188 /// Returns true if the target machine supports masked load operation
1189 /// for the given \p DataType and kind of access to \p Ptr.
1190 bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment,
1191 unsigned AddressSpace) const {
1192 return Legal->isConsecutivePtr(DataType, Ptr) &&
1193 TTI.isLegalMaskedLoad(DataType, Alignment, AddressSpace);
1194 }
1195
1196 /// Returns true if the target machine can represent \p V as a masked gather
1197 /// or scatter operation.
1199 bool LI = isa<LoadInst>(V);
1200 bool SI = isa<StoreInst>(V);
1201 if (!LI && !SI)
1202 return false;
1203 auto *Ty = getLoadStoreType(V);
1205 if (VF.isVector())
1206 Ty = VectorType::get(Ty, VF);
1207 return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
1208 (SI && TTI.isLegalMaskedScatter(Ty, Align));
1209 }
1210
1211 /// Returns true if the target machine supports all of the reduction
1212 /// variables found for the given VF.
1214 return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
1215 const RecurrenceDescriptor &RdxDesc = Reduction.second;
1216 return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
1217 }));
1218 }
1219
1220 /// Given costs for both strategies, return true if the scalar predication
1221 /// lowering should be used for div/rem. This incorporates an override
1222 /// option so it is not simply a cost comparison.
1224 InstructionCost SafeDivisorCost) const {
1225 switch (ForceSafeDivisor) {
1226 case cl::BOU_UNSET:
1227 return ScalarCost < SafeDivisorCost;
1228 case cl::BOU_TRUE:
1229 return false;
1230 case cl::BOU_FALSE:
1231 return true;
1232 }
1233 llvm_unreachable("impossible case value");
1234 }
1235
1236 /// Returns true if \p I is an instruction which requires predication and
1237 /// for which our chosen predication strategy is scalarization (i.e. we
1238 /// don't have an alternate strategy such as masking available).
1239 /// \p VF is the vectorization factor that will be used to vectorize \p I.
1240 bool isScalarWithPredication(Instruction *I, ElementCount VF);
1241
1242 /// Returns true if \p I is an instruction that needs to be predicated
1243 /// at runtime. The result is independent of the predication mechanism.
1244 /// Superset of instructions that return true for isScalarWithPredication.
1245 bool isPredicatedInst(Instruction *I) const;
1246
1247 /// A helper function that returns how much we should divide the cost of a
1248 /// predicated block by. Typically this is the reciprocal of the block
1249 /// probability, i.e. if we return X we are assuming the predicated block will
1250 /// execute once for every X iterations of the loop header so the block should
1251 /// only contribute 1/X of its cost to the total cost calculation, but when
1252 /// optimizing for code size it will just be 1 as code size costs don't depend
1253 /// on execution probabilities.
1254 ///
1255 /// Note that if a block wasn't originally predicated but was predicated due
1256 /// to tail folding, the divisor will still be 1 because it will execute for
1257 /// every iteration of the loop header.
1258 inline uint64_t
1259 getPredBlockCostDivisor(TargetTransformInfo::TargetCostKind CostKind,
1260 const BasicBlock *BB);
1261
1262 /// Returns true if an artificially high cost for emulated masked memrefs
1263 /// should be used.
1264 bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);
1265
1266 /// Return the costs for our two available strategies for lowering a
1267 /// div/rem operation which requires speculating at least one lane.
1268 /// First result is for scalarization (will be invalid for scalable
1269 /// vectors); second is for the safe-divisor strategy.
1270 std::pair<InstructionCost, InstructionCost>
1271 getDivRemSpeculationCost(Instruction *I, ElementCount VF);
1272
1273 /// Returns true if \p I is a memory instruction with consecutive memory
1274 /// access that can be widened.
1275 bool memoryInstructionCanBeWidened(Instruction *I, ElementCount VF);
1276
1277 /// Returns true if \p I is a memory instruction in an interleaved-group
1278 /// of memory accesses that can be vectorized with wide vector loads/stores
1279 /// and shuffles.
1280 bool interleavedAccessCanBeWidened(Instruction *I, ElementCount VF) const;
1281
1282 /// Check if \p Instr belongs to any interleaved access group.
1284 return InterleaveInfo.isInterleaved(Instr);
1285 }
1286
1287 /// Get the interleaved access group that \p Instr belongs to.
1290 return InterleaveInfo.getInterleaveGroup(Instr);
1291 }
1292
1293 /// Returns true if we're required to use a scalar epilogue for at least
1294 /// the final iteration of the original loop.
1295 bool requiresScalarEpilogue(bool IsVectorizing) const {
1296 if (!isScalarEpilogueAllowed()) {
1297 LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
1298 return false;
1299 }
1300 // If we might exit from anywhere but the latch and early exit vectorization
1301 // is disabled, we must run the exiting iteration in scalar form.
1302 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch() &&
1303 !(EnableEarlyExitVectorization && Legal->hasUncountableEarlyExit())) {
1304 LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: not exiting "
1305 "from latch block\n");
1306 return true;
1307 }
1308 if (IsVectorizing && InterleaveInfo.requiresScalarEpilogue()) {
1309 LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: "
1310 "interleaved group requires scalar epilogue\n");
1311 return true;
1312 }
1313 LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
1314 return false;
1315 }
1316
1317 /// Returns true if a scalar epilogue is not allowed due to optsize or a
1318 /// loop hint annotation.
1320 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1321 }
1322
1323 /// Returns true if tail-folding is preferred over a scalar epilogue.
1325 return ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate ||
1326 ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate;
1327 }
1328
1329 /// Returns the TailFoldingStyle that is best for the current loop.
1330 TailFoldingStyle getTailFoldingStyle(bool IVUpdateMayOverflow = true) const {
1331 if (!ChosenTailFoldingStyle)
1333 return IVUpdateMayOverflow ? ChosenTailFoldingStyle->first
1334 : ChosenTailFoldingStyle->second;
1335 }
1336
1337 /// Selects and saves TailFoldingStyle for 2 options - if IV update may
1338 /// overflow or not.
1339 /// \param IsScalableVF true if scalable vector factors enabled.
1340 /// \param UserIC User specific interleave count.
1341 void setTailFoldingStyles(bool IsScalableVF, unsigned UserIC) {
1342 assert(!ChosenTailFoldingStyle && "Tail folding must not be selected yet.");
1343 if (!Legal->canFoldTailByMasking()) {
1344 ChosenTailFoldingStyle = {TailFoldingStyle::None, TailFoldingStyle::None};
1345 return;
1346 }
1347
1348 // Default to TTI preference, but allow command line override.
1349 ChosenTailFoldingStyle = {
1350 TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/true),
1351 TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/false)};
1352 if (ForceTailFoldingStyle.getNumOccurrences())
1353 ChosenTailFoldingStyle = {ForceTailFoldingStyle.getValue(),
1354 ForceTailFoldingStyle.getValue()};
1355
1356 if (ChosenTailFoldingStyle->first != TailFoldingStyle::DataWithEVL &&
1357 ChosenTailFoldingStyle->second != TailFoldingStyle::DataWithEVL)
1358 return;
1359 // Override EVL styles if needed.
1360 // FIXME: Investigate opportunity for fixed vector factor.
1361 bool EVLIsLegal = UserIC <= 1 && IsScalableVF &&
1362 TTI.hasActiveVectorLength() && !EnableVPlanNativePath;
1363 if (EVLIsLegal)
1364 return;
1365 // If for some reason EVL mode is unsupported, fallback to a scalar epilogue
1366 // if it's allowed, or DataWithoutLaneMask otherwise.
1367 if (ScalarEpilogueStatus == CM_ScalarEpilogueAllowed ||
1368 ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate)
1369 ChosenTailFoldingStyle = {TailFoldingStyle::None, TailFoldingStyle::None};
1370 else
1371 ChosenTailFoldingStyle = {TailFoldingStyle::DataWithoutLaneMask,
1373
1374 LLVM_DEBUG(
1375 dbgs() << "LV: Preference for VP intrinsics indicated. Will "
1376 "not try to generate VP Intrinsics "
1377 << (UserIC > 1
1378 ? "since interleave count specified is greater than 1.\n"
1379 : "due to non-interleaving reasons.\n"));
1380 }
1381
1382 /// Returns true if all loop blocks should be masked to fold tail loop.
1383 bool foldTailByMasking() const {
1384 // TODO: check if it is possible to check for None style independent of
1385 // IVUpdateMayOverflow flag in getTailFoldingStyle.
1387 }
1388
1389 /// Returns true if the use of wide lane masks is requested and the loop is
1390 /// using tail-folding with a lane mask for control flow.
1399
1400 /// Return maximum safe number of elements to be processed per vector
1401 /// iteration, which do not prevent store-load forwarding and are safe with
1402 /// regard to the memory dependencies. Required for EVL-based VPlans to
1403 /// correctly calculate AVL (application vector length) as min(remaining AVL,
1404 /// MaxSafeElements).
1405 /// TODO: need to consider adjusting cost model to use this value as a
1406 /// vectorization factor for EVL-based vectorization.
1407 std::optional<unsigned> getMaxSafeElements() const { return MaxSafeElements; }
1408
1409 /// Returns true if the instructions in this block requires predication
1410 /// for any reason, e.g. because tail folding now requires a predicate
1411 /// or because the block in the original loop was predicated.
1413 return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1414 }
1415
1416 /// Returns true if VP intrinsics with explicit vector length support should
1417 /// be generated in the tail folded loop.
1421
1422 /// Returns true if the Phi is part of an inloop reduction.
1423 bool isInLoopReduction(PHINode *Phi) const {
1424 return InLoopReductions.contains(Phi);
1425 }
1426
1427 /// Returns the set of in-loop reduction PHIs.
1429 return InLoopReductions;
1430 }
1431
1432 /// Returns true if the predicated reduction select should be used to set the
1433 /// incoming value for the reduction phi.
1435 // Force to use predicated reduction select since the EVL of the
1436 // second-to-last iteration might not be VF*UF.
1437 if (foldTailWithEVL())
1438 return true;
1440 TTI.preferPredicatedReductionSelect();
1441 }
1442
1443 /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1444 /// with factor VF. Return the cost of the instruction, including
1445 /// scalarization overhead if it's needed.
1446 InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;
1447
1448 /// Estimate cost of a call instruction CI if it were vectorized with factor
1449 /// VF. Return the cost of the instruction, including scalarization overhead
1450 /// if it's needed.
1451 InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF) const;
1452
1453 /// Invalidates decisions already taken by the cost model.
1455 WideningDecisions.clear();
1456 CallWideningDecisions.clear();
1457 Uniforms.clear();
1458 Scalars.clear();
1459 }
1460
1461 /// Returns the expected execution cost. The unit of the cost does
1462 /// not matter because we use the 'cost' units to compare different
1463 /// vector widths. The cost that is returned is *not* normalized by
1464 /// the factor width.
1465 InstructionCost expectedCost(ElementCount VF);
1466
1467 bool hasPredStores() const { return NumPredStores > 0; }
1468
1469 /// Returns true if epilogue vectorization is considered profitable, and
1470 /// false otherwise.
1471 /// \p VF is the vectorization factor chosen for the original loop.
 1472  /// \p Multiplier is an additional scaling factor applied to VF before
1473 /// comparing to EpilogueVectorizationMinVF.
1474 bool isEpilogueVectorizationProfitable(const ElementCount VF,
1475 const unsigned IC) const;
1476
1477 /// Returns the execution time cost of an instruction for a given vector
1478 /// width. Vector width of one means scalar.
1479 InstructionCost getInstructionCost(Instruction *I, ElementCount VF);
1480
1481 /// Return the cost of instructions in an inloop reduction pattern, if I is
1482 /// part of that pattern.
1483 std::optional<InstructionCost> getReductionPatternCost(Instruction *I,
1484 ElementCount VF,
1485 Type *VectorTy) const;
1486
1487 /// Returns true if \p Op should be considered invariant and if it is
1488 /// trivially hoistable.
1489 bool shouldConsiderInvariant(Value *Op);
1490
1491 /// Return the value of vscale used for tuning the cost model.
1492 std::optional<unsigned> getVScaleForTuning() const { return VScaleForTuning; }
1493
1494private:
1495 unsigned NumPredStores = 0;
1496
1497 /// Used to store the value of vscale used for tuning the cost model. It is
1498 /// initialized during object construction.
1499 std::optional<unsigned> VScaleForTuning;
1500
1501 /// Initializes the value of vscale used for tuning the cost model. If
1502 /// vscale_range.min == vscale_range.max then return vscale_range.max, else
1503 /// return the value returned by the corresponding TTI method.
1504 void initializeVScaleForTuning() {
1505 const Function *Fn = TheLoop->getHeader()->getParent();
1506 if (Fn->hasFnAttribute(Attribute::VScaleRange)) {
1507 auto Attr = Fn->getFnAttribute(Attribute::VScaleRange);
1508 auto Min = Attr.getVScaleRangeMin();
1509 auto Max = Attr.getVScaleRangeMax();
1510 if (Max && Min == Max) {
1511 VScaleForTuning = Max;
1512 return;
1513 }
1514 }
1515
1516 VScaleForTuning = TTI.getVScaleForTuning();
1517 }
1518
1519 /// \return An upper bound for the vectorization factors for both
1520 /// fixed and scalable vectorization, where the minimum-known number of
1521 /// elements is a power-of-2 larger than zero. If scalable vectorization is
1522 /// disabled or unsupported, then the scalable part will be equal to
1523 /// ElementCount::getScalable(0).
1524 FixedScalableVFPair computeFeasibleMaxVF(unsigned MaxTripCount,
1525 ElementCount UserVF, unsigned UserIC,
1526 bool FoldTailByMasking);
1527
1528 /// If \p VF * \p UserIC > MaxTripcount, clamps VF to the next lower VF that
1529 /// results in VF * UserIC <= MaxTripCount.
1530 ElementCount clampVFByMaxTripCount(ElementCount VF, unsigned MaxTripCount,
1531 unsigned UserIC,
1532 bool FoldTailByMasking) const;
1533
1534 /// \return the maximized element count based on the targets vector
1535 /// registers and the loop trip-count, but limited to a maximum safe VF.
1536 /// This is a helper function of computeFeasibleMaxVF.
1537 ElementCount getMaximizedVFForTarget(unsigned MaxTripCount,
1538 unsigned SmallestType,
1539 unsigned WidestType,
1540 ElementCount MaxSafeVF, unsigned UserIC,
1541 bool FoldTailByMasking);
1542
1543 /// Checks if scalable vectorization is supported and enabled. Caches the
1544 /// result to avoid repeated debug dumps for repeated queries.
1545 bool isScalableVectorizationAllowed();
1546
1547 /// \return the maximum legal scalable VF, based on the safe max number
1548 /// of elements.
1549 ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
1550
1551 /// Calculate vectorization cost of memory instruction \p I.
1552 InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
1553
1554 /// The cost computation for scalarized memory instruction.
1555 InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1556
1557 /// The cost computation for interleaving group of memory instructions.
1558 InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
1559
1560 /// The cost computation for Gather/Scatter instruction.
1561 InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
1562
1563 /// The cost computation for widening instruction \p I with consecutive
1564 /// memory access.
1565 InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1566
1567 /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1568 /// Load: scalar load + broadcast.
1569 /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1570 /// element)
1571 InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
1572
1573 /// Estimate the overhead of scalarizing an instruction. This is a
1574 /// convenience wrapper for the type-based getScalarizationOverhead API.
1576 ElementCount VF) const;
1577
1578 /// Map of scalar integer values to the smallest bitwidth they can be legally
1579 /// represented as. The vector equivalents of these values should be truncated
1580 /// to this type.
1581 MapVector<Instruction *, uint64_t> MinBWs;
1582
1583 /// A type representing the costs for instructions if they were to be
1584 /// scalarized rather than vectorized. The entries are Instruction-Cost
1585 /// pairs.
1586 using ScalarCostsTy = MapVector<Instruction *, InstructionCost>;
1587
 1588  /// A set containing all BasicBlocks that are known to be present after
1589 /// vectorization as a predicated block.
1590 DenseMap<ElementCount, SmallPtrSet<BasicBlock *, 4>>
1591 PredicatedBBsAfterVectorization;
1592
1593 /// Records whether it is allowed to have the original scalar loop execute at
1594 /// least once. This may be needed as a fallback loop in case runtime
1595 /// aliasing/dependence checks fail, or to handle the tail/remainder
1596 /// iterations when the trip count is unknown or doesn't divide by the VF,
1597 /// or as a peel-loop to handle gaps in interleave-groups.
1598 /// Under optsize and when the trip count is very small we don't allow any
1599 /// iterations to execute in the scalar loop.
1600 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1601
1602 /// Control finally chosen tail folding style. The first element is used if
1603 /// the IV update may overflow, the second element - if it does not.
1604 std::optional<std::pair<TailFoldingStyle, TailFoldingStyle>>
1605 ChosenTailFoldingStyle;
1606
1607 /// true if scalable vectorization is supported and enabled.
1608 std::optional<bool> IsScalableVectorizationAllowed;
1609
1610 /// Maximum safe number of elements to be processed per vector iteration,
1611 /// which do not prevent store-load forwarding and are safe with regard to the
 1612  /// memory dependencies. Required for EVL-based vectorization, where this
1613 /// value is used as the upper bound of the safe AVL.
1614 std::optional<unsigned> MaxSafeElements;
1615
1616 /// A map holding scalar costs for different vectorization factors. The
1617 /// presence of a cost for an instruction in the mapping indicates that the
1618 /// instruction will be scalarized when vectorizing with the associated
1619 /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1620 MapVector<ElementCount, ScalarCostsTy> InstsToScalarize;
1621
1622 /// Holds the instructions known to be uniform after vectorization.
1623 /// The data is collected per VF.
1624 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
1625
1626 /// Holds the instructions known to be scalar after vectorization.
1627 /// The data is collected per VF.
1628 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
1629
1630 /// Holds the instructions (address computations) that are forced to be
1631 /// scalarized.
1632 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1633
1634 /// PHINodes of the reductions that should be expanded in-loop.
1635 SmallPtrSet<PHINode *, 4> InLoopReductions;
1636
1637 /// A Map of inloop reduction operations and their immediate chain operand.
1638 /// FIXME: This can be removed once reductions can be costed correctly in
1639 /// VPlan. This was added to allow quick lookup of the inloop operations.
1640 DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
1641
1642 /// Returns the expected difference in cost from scalarizing the expression
1643 /// feeding a predicated instruction \p PredInst. The instructions to
1644 /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1645 /// non-negative return value implies the expression will be scalarized.
1646 /// Currently, only single-use chains are considered for scalarization.
1647 InstructionCost computePredInstDiscount(Instruction *PredInst,
1648 ScalarCostsTy &ScalarCosts,
1649 ElementCount VF);
1650
1651 /// Collect the instructions that are uniform after vectorization. An
1652 /// instruction is uniform if we represent it with a single scalar value in
1653 /// the vectorized loop corresponding to each vector iteration. Examples of
1654 /// uniform instructions include pointer operands of consecutive or
1655 /// interleaved memory accesses. Note that although uniformity implies an
1656 /// instruction will be scalar, the reverse is not true. In general, a
1657 /// scalarized instruction will be represented by VF scalar values in the
1658 /// vectorized loop, each corresponding to an iteration of the original
1659 /// scalar loop.
1660 void collectLoopUniforms(ElementCount VF);
1661
1662 /// Collect the instructions that are scalar after vectorization. An
1663 /// instruction is scalar if it is known to be uniform or will be scalarized
1664 /// during vectorization. collectLoopScalars should only add non-uniform nodes
1665 /// to the list if they are used by a load/store instruction that is marked as
1666 /// CM_Scalarize. Non-uniform scalarized instructions will be represented by
1667 /// VF values in the vectorized loop, each corresponding to an iteration of
1668 /// the original scalar loop.
1669 void collectLoopScalars(ElementCount VF);
1670
1671 /// Keeps cost model vectorization decision and cost for instructions.
1672 /// Right now it is used for memory instructions only.
1673 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
1674 std::pair<InstWidening, InstructionCost>>;
1675
1676 DecisionList WideningDecisions;
1677
1678 using CallDecisionList =
1679 DenseMap<std::pair<CallInst *, ElementCount>, CallWideningDecision>;
1680
1681 CallDecisionList CallWideningDecisions;
1682
1683 /// Returns true if \p V is expected to be vectorized and it needs to be
1684 /// extracted.
1685 bool needsExtract(Value *V, ElementCount VF) const {
1687 if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1688 TheLoop->isLoopInvariant(I) ||
1689 getWideningDecision(I, VF) == CM_Scalarize ||
1690 (isa<CallInst>(I) &&
1691 getCallWideningDecision(cast<CallInst>(I), VF).Kind == CM_Scalarize))
1692 return false;
1693
1694 // Assume we can vectorize V (and hence we need extraction) if the
1695 // scalars are not computed yet. This can happen, because it is called
1696 // via getScalarizationOverhead from setCostBasedWideningDecision, before
1697 // the scalars are collected. That should be a safe assumption in most
1698 // cases, because we check if the operands have vectorizable types
1699 // beforehand in LoopVectorizationLegality.
1700 return !Scalars.contains(VF) || !isScalarAfterVectorization(I, VF);
1701 };
1702
1703 /// Returns a range containing only operands needing to be extracted.
1704 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1705 ElementCount VF) const {
1706
1707 SmallPtrSet<const Value *, 4> UniqueOperands;
1708 SmallVector<Value *, 4> Res;
1709 for (Value *Op : Ops) {
1710 if (isa<Constant>(Op) || !UniqueOperands.insert(Op).second ||
1711 !needsExtract(Op, VF))
1712 continue;
1713 Res.push_back(Op);
1714 }
1715 return Res;
1716 }
1717
1718public:
1719 /// The loop that we evaluate.
1721
1722 /// Predicated scalar evolution analysis.
1724
1725 /// Loop Info analysis.
1727
1728 /// Vectorization legality.
1730
1731 /// Vector target information.
1733
1734 /// Target Library Info.
1736
1737 /// Demanded bits analysis.
1739
1740 /// Assumption cache.
1742
1743 /// Interface to emit optimization remarks.
1745
1746 /// A function to lazily fetch BlockFrequencyInfo. This avoids computing it
1747 /// unless necessary, e.g. when the loop isn't legal to vectorize or when
1748 /// there is no predication.
1749 std::function<BlockFrequencyInfo &()> GetBFI;
1750 /// The BlockFrequencyInfo returned from GetBFI.
1752 /// Returns the BlockFrequencyInfo for the function if cached, otherwise
1753 /// fetches it via GetBFI. Avoids an indirect call to the std::function.
1755 if (!BFI)
1756 BFI = &GetBFI();
1757 return *BFI;
1758 }
1759
1761
1762 /// Loop Vectorize Hint.
1764
1765 /// The interleave access information contains groups of interleaved accesses
1766 /// with the same stride and close to each other.
1768
1769 /// Values to ignore in the cost model.
1771
1772 /// Values to ignore in the cost model when VF > 1.
1774
1775 /// All element types found in the loop.
1777
1778 /// The kind of cost that we are calculating
1780
1781 /// Whether this loop should be optimized for size based on function attribute
1782 /// or profile information.
1784
1785 /// The highest VF possible for this loop, without using MaxBandwidth.
1787};
1788} // end namespace llvm
1789
1790namespace {
1791/// Helper struct to manage generating runtime checks for vectorization.
1792///
1793/// The runtime checks are created up-front in temporary blocks to allow better
1794/// estimating the cost and un-linked from the existing IR. After deciding to
1795/// vectorize, the checks are moved back. If deciding not to vectorize, the
1796/// temporary blocks are completely removed.
1797class GeneratedRTChecks {
// NOTE(review): this listing elides several hyperlinked lines (e.g. the
// TTI/PSE/CostKind member declarations, parts of the constructor signature,
// and the getCost() signature). Verify any change against the full source.
1798 /// Basic block which contains the generated SCEV checks, if any.
1799 BasicBlock *SCEVCheckBlock = nullptr;
1800
1801 /// The value representing the result of the generated SCEV checks. If it is
1802 /// nullptr no SCEV checks have been generated.
1803 Value *SCEVCheckCond = nullptr;
1804
1805 /// Basic block which contains the generated memory runtime checks, if any.
1806 BasicBlock *MemCheckBlock = nullptr;
1807
1808 /// The value representing the result of the generated memory runtime checks.
1809 /// If it is nullptr no memory runtime checks have been generated.
1810 Value *MemRuntimeCheckCond = nullptr;
1811
 /// Dominator tree, kept up to date while check blocks are split off and
 /// later unlinked again.
1812 DominatorTree *DT;
 /// Loop info, updated when check blocks are added to / removed from loops.
1813 LoopInfo *LI;
1815
 /// Expander used for the SCEV predicate checks (SCEVCheckBlock).
1816 SCEVExpander SCEVExp;
 /// Separate expander used for the memory runtime checks (MemCheckBlock).
1817 SCEVExpander MemCheckExp;
1818
 /// Set by create() when the number of runtime checks exceeds the cutoff
 /// threshold; getCost() then reports an invalid cost.
1819 bool CostTooHigh = false;
1820
 /// Parent loop of the processed loop, if any; used by getCost() to discount
 /// memory checks that are invariant in (and hoistable out of) it.
1821 Loop *OuterLoop = nullptr;
1822
1824
1825 /// The kind of cost that we are calculating
1827
1828public:
1829 GeneratedRTChecks(PredicatedScalarEvolution &PSE, DominatorTree *DT,
1832 : DT(DT), LI(LI), TTI(TTI),
1833 SCEVExp(*PSE.getSE(), "scev.check", /*PreserveLCSSA=*/false),
1834 MemCheckExp(*PSE.getSE(), "scev.check", /*PreserveLCSSA=*/false),
1835 PSE(PSE), CostKind(CostKind) {}
1836
1837 /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
1838 /// accurately estimate the cost of the runtime checks. The blocks are
1839 /// un-linked from the IR and are added back during vector code generation. If
1840 /// there is no vector code generation, the check blocks are removed
1841 /// completely.
1842 void create(Loop *L, const LoopAccessInfo &LAI,
1843 const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC,
1844 OptimizationRemarkEmitter &ORE) {
1845
1846 // Hard cutoff to limit compile-time increase in case a very large number of
1847 // runtime checks needs to be generated.
1848 // TODO: Skip cutoff if the loop is guaranteed to execute, e.g. due to
1849 // profile info.
1850 CostTooHigh =
1852 if (CostTooHigh) {
1853 // Mark runtime checks as never succeeding when they exceed the threshold.
1854 MemRuntimeCheckCond = ConstantInt::getTrue(L->getHeader()->getContext());
1855 SCEVCheckCond = ConstantInt::getTrue(L->getHeader()->getContext());
1856 ORE.emit([&]() {
1857 return OptimizationRemarkAnalysisAliasing(
1858 DEBUG_TYPE, "TooManyMemoryRuntimeChecks", L->getStartLoc(),
1859 L->getHeader())
1860 << "loop not vectorized: too many memory checks needed";
1861 });
1862 LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
1863 return;
1864 }
1865
1866 BasicBlock *LoopHeader = L->getHeader();
1867 BasicBlock *Preheader = L->getLoopPreheader();
1868
1869 // Use SplitBlock to create blocks for SCEV & memory runtime checks to
1870 // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
1871 // may be used by SCEVExpander. The blocks will be un-linked from their
1872 // predecessors and removed from LI & DT at the end of the function.
1873 if (!UnionPred.isAlwaysTrue()) {
1874 SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
1875 nullptr, "vector.scevcheck");
1876
1877 SCEVCheckCond = SCEVExp.expandCodeForPredicate(
1878 &UnionPred, SCEVCheckBlock->getTerminator());
1879 if (isa<Constant>(SCEVCheckCond)) {
1880 // Clean up directly after expanding the predicate to a constant, to
1881 // avoid further expansions re-using anything left over from SCEVExp.
1882 SCEVExpanderCleaner SCEVCleaner(SCEVExp);
1883 SCEVCleaner.cleanup();
1884 }
1885 }
1886
1887 const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
1888 if (RtPtrChecking.Need) {
 // Memory checks are emitted after the SCEV checks (if any), so the memory
 // check block is split off whichever block currently ends the chain.
1889 auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
1890 MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
1891 "vector.memcheck");
1892
1893 auto DiffChecks = RtPtrChecking.getDiffChecks();
1894 if (DiffChecks) {
1895 Value *RuntimeVF = nullptr;
1896 MemRuntimeCheckCond = addDiffRuntimeChecks(
1897 MemCheckBlock->getTerminator(), *DiffChecks, MemCheckExp,
 // Lazily materialize the runtime VF once and reuse it for all checks.
1898 [VF, &RuntimeVF](IRBuilderBase &B, unsigned Bits) {
1899 if (!RuntimeVF)
1900 RuntimeVF = getRuntimeVF(B, B.getIntNTy(Bits), VF);
1901 return RuntimeVF;
1902 },
1903 IC);
1904 } else {
1905 MemRuntimeCheckCond = addRuntimeChecks(
1906 MemCheckBlock->getTerminator(), L, RtPtrChecking.getChecks(),
1908 }
1909 assert(MemRuntimeCheckCond &&
1910 "no RT checks generated although RtPtrChecking "
1911 "claimed checks are required");
1912 }
1913
1914 SCEVExp.eraseDeadInstructions(SCEVCheckCond);
1915
1916 if (!MemCheckBlock && !SCEVCheckBlock)
1917 return;
1918
1919 // Unhook the temporary block with the checks, update various places
1920 // accordingly.
1921 if (SCEVCheckBlock)
1922 SCEVCheckBlock->replaceAllUsesWith(Preheader);
1923 if (MemCheckBlock)
1924 MemCheckBlock->replaceAllUsesWith(Preheader);
1925
1926 if (SCEVCheckBlock) {
1927 SCEVCheckBlock->getTerminator()->moveBefore(
1928 Preheader->getTerminator()->getIterator());
 // Leave the detached check block terminated by an unreachable placeholder.
1929 auto *UI = new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
1930 UI->setDebugLoc(DebugLoc::getTemporary());
1931 Preheader->getTerminator()->eraseFromParent();
1932 }
1933 if (MemCheckBlock) {
1934 MemCheckBlock->getTerminator()->moveBefore(
1935 Preheader->getTerminator()->getIterator());
1936 auto *UI = new UnreachableInst(Preheader->getContext(), MemCheckBlock);
1937 UI->setDebugLoc(DebugLoc::getTemporary());
1938 Preheader->getTerminator()->eraseFromParent();
1939 }
1940
1941 DT->changeImmediateDominator(LoopHeader, Preheader);
1942 if (MemCheckBlock) {
1943 DT->eraseNode(MemCheckBlock);
1944 LI->removeBlock(MemCheckBlock);
1945 }
1946 if (SCEVCheckBlock) {
1947 DT->eraseNode(SCEVCheckBlock);
1948 LI->removeBlock(SCEVCheckBlock);
1949 }
1950
1951 // Outer loop is used as part of the later cost calculations.
1952 OuterLoop = L->getParentLoop();
1953 }
1954
 /// Estimate the cost of the generated SCEV and memory runtime checks,
 /// discounting memory checks that are invariant in an enclosing outer loop.
 /// (Signature line elided in this listing.)
1956 if (SCEVCheckBlock || MemCheckBlock)
1957 LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n");
1958
1959 if (CostTooHigh) {
1961 Cost.setInvalid();
1962 LLVM_DEBUG(dbgs() << " number of checks exceeded threshold\n");
1963 return Cost;
1964 }
1965
1966 InstructionCost RTCheckCost = 0;
1967 if (SCEVCheckBlock)
1968 for (Instruction &I : *SCEVCheckBlock) {
 // The terminator is not part of the check computation itself.
1969 if (SCEVCheckBlock->getTerminator() == &I)
1970 continue;
1972 LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n");
1973 RTCheckCost += C;
1974 }
1975 if (MemCheckBlock) {
1976 InstructionCost MemCheckCost = 0;
1977 for (Instruction &I : *MemCheckBlock) {
1978 if (MemCheckBlock->getTerminator() == &I)
1979 continue;
1981 LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n");
1982 MemCheckCost += C;
1983 }
1984
1985 // If the runtime memory checks are being created inside an outer loop
1986 // we should find out if these checks are outer loop invariant. If so,
1987 // the checks will likely be hoisted out and so the effective cost will
1988 // reduce according to the outer loop trip count.
1989 if (OuterLoop) {
1990 ScalarEvolution *SE = MemCheckExp.getSE();
1991 // TODO: If profitable, we could refine this further by analysing every
1992 // individual memory check, since there could be a mixture of loop
1993 // variant and invariant checks that mean the final condition is
1994 // variant.
1995 const SCEV *Cond = SE->getSCEV(MemRuntimeCheckCond);
1996 if (SE->isLoopInvariant(Cond, OuterLoop)) {
1997 // It seems reasonable to assume that we can reduce the effective
1998 // cost of the checks even when we know nothing about the trip
1999 // count. Assume that the outer loop executes at least twice.
2000 unsigned BestTripCount = 2;
2001
2002 // Get the best known TC estimate.
2003 if (auto EstimatedTC = getSmallBestKnownTC(
2004 PSE, OuterLoop, /* CanUseConstantMax = */ false))
2005 if (EstimatedTC->isFixed())
2006 BestTripCount = EstimatedTC->getFixedValue();
2007
2008 InstructionCost NewMemCheckCost = MemCheckCost / BestTripCount;
2009
2010 // Let's ensure the cost is always at least 1.
2011 NewMemCheckCost = std::max(NewMemCheckCost.getValue(),
2012 (InstructionCost::CostType)1);
2013
2014 if (BestTripCount > 1)
2016 << "We expect runtime memory checks to be hoisted "
2017 << "out of the outer loop. Cost reduced from "
2018 << MemCheckCost << " to " << NewMemCheckCost << '\n');
2019
2020 MemCheckCost = NewMemCheckCost;
2021 }
2022 }
2023
2024 RTCheckCost += MemCheckCost;
2025 }
2026
2027 if (SCEVCheckBlock || MemCheckBlock)
2028 LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost
2029 << "\n");
2030
2031 return RTCheckCost;
2032 }
2033
2034 /// Remove the created SCEV & memory runtime check blocks & instructions, if
2035 /// unused.
2036 ~GeneratedRTChecks() {
2037 SCEVExpanderCleaner SCEVCleaner(SCEVExp);
2038 SCEVExpanderCleaner MemCheckCleaner(MemCheckExp);
 // A check block with predecessors was linked back into the CFG, so its
 // expanded instructions are in use and must be kept.
2039 bool SCEVChecksUsed = !SCEVCheckBlock || !pred_empty(SCEVCheckBlock);
2040 bool MemChecksUsed = !MemCheckBlock || !pred_empty(MemCheckBlock);
2041 if (SCEVChecksUsed)
2042 SCEVCleaner.markResultUsed();
2043
2044 if (MemChecksUsed) {
2045 MemCheckCleaner.markResultUsed();
2046 } else {
2047 auto &SE = *MemCheckExp.getSE();
2048 // Memory runtime check generation creates compares that use expanded
2049 // values. Remove them before running the SCEVExpanderCleaners.
2050 for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) {
2051 if (MemCheckExp.isInsertedInstruction(&I))
2052 continue;
2053 SE.forgetValue(&I);
2054 I.eraseFromParent();
2055 }
2056 }
2057 MemCheckCleaner.cleanup();
2058 SCEVCleaner.cleanup();
2059
2060 if (!SCEVChecksUsed)
2061 SCEVCheckBlock->eraseFromParent();
2062 if (!MemChecksUsed)
2063 MemCheckBlock->eraseFromParent();
2064 }
2065
2066 /// Retrieves the SCEVCheckCond and SCEVCheckBlock that were generated as IR
2067 /// outside VPlan.
2068 std::pair<Value *, BasicBlock *> getSCEVChecks() const {
2069 using namespace llvm::PatternMatch;
 // A constant-false condition means the checks trivially pass; report none.
2070 if (!SCEVCheckCond || match(SCEVCheckCond, m_ZeroInt()))
2071 return {nullptr, nullptr};
2072
2073 return {SCEVCheckCond, SCEVCheckBlock};
2074 }
2075
2076 /// Retrieves the MemCheckCond and MemCheckBlock that were generated as IR
2077 /// outside VPlan.
2078 std::pair<Value *, BasicBlock *> getMemRuntimeChecks() const {
2079 using namespace llvm::PatternMatch;
2080 if (MemRuntimeCheckCond && match(MemRuntimeCheckCond, m_ZeroInt()))
2081 return {nullptr, nullptr};
2082 return {MemRuntimeCheckCond, MemCheckBlock};
2083 }
2084
2085 /// Return true if any runtime checks have been added
2086 bool hasChecks() const {
2087 return getSCEVChecks().first || getMemRuntimeChecks().first;
2088 }
2089};
2090} // namespace
2091
2097
2102
2103// Return true if \p OuterLp is an outer loop annotated with hints for explicit
2104// vectorization. The loop needs to be annotated with #pragma omp simd
2105// simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
2106// vector length information is not provided, vectorization is not considered
2107// explicit. Interleave hints are not allowed either. These limitations will be
2108// relaxed in the future.
2109// Please, note that we are currently forced to abuse the pragma 'clang
2110// vectorize' semantics. This pragma provides *auto-vectorization hints*
2111// (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
2112// provides *explicit vectorization hints* (LV can bypass legal checks and
2113// assume that vectorization is legal). However, both hints are implemented
2114// using the same metadata (llvm.loop.vectorize, processed by
2115// LoopVectorizeHints). This will be fixed in the future when the native IR
2116// representation for pragma 'omp simd' is introduced.
2117static bool isExplicitVecOuterLoop(Loop *OuterLp,
2119 assert(!OuterLp->isInnermost() && "This is not an outer loop");
2120 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
2121
2122 // Only outer loops with an explicit vectorization hint are supported.
2123 // Unannotated outer loops are ignored.
 // NOTE(review): the guarding condition at the elided line above this return
 // (presumably testing the hint's force/width state) is not visible in this
 // listing — confirm against the full source before editing.
2125 return false;
2126
2127 Function *Fn = OuterLp->getHeader()->getParent();
2128 if (!Hints.allowVectorization(Fn, OuterLp,
2129 true /*VectorizeOnlyWhenForced*/)) {
2130 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
2131 return false;
2132 }
2133
 // Interleaving of outer loops is not implemented; reject with a remark.
2134 if (Hints.getInterleave() > 1) {
2135 // TODO: Interleave support is future work.
2136 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
2137 "outer loops.\n");
2138 Hints.emitRemarkWithHints();
2139 return false;
2140 }
2141
2142 return true;
2143}
2144
 // NOTE(review): the signature of collectSupportedLoops and the second half
 // of the condition below are elided in this listing; verify against source.
2148 // Collect inner loops and outer loops without irreducible control flow. For
2149 // now, only collect outer loops that have explicit vectorization hints. If we
2150 // are stress testing the VPlan H-CFG construction, we collect the outermost
2151 // loop of every loop nest.
2152 if (L.isInnermost() || VPlanBuildStressTest ||
2154 LoopBlocksRPO RPOT(&L);
2155 RPOT.perform(LI);
2157 V.push_back(&L);
2158 // TODO: Collect inner loops inside marked outer loops in case
2159 // vectorization fails for the outer loop. Do not invoke
2160 // 'containsIrreducibleCFG' again for inner loops when the outer loop is
2161 // already known to be reducible. We can use an inherited attribute for
2162 // that.
2163 return;
2164 }
2165 }
 // Recurse into the sub-loops of loops that were not collected themselves.
2166 for (Loop *InnerL : L)
2167 collectSupportedLoops(*InnerL, LI, ORE, V);
2168}
2169
2170//===----------------------------------------------------------------------===//
2171// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
2172// LoopVectorizationCostModel and LoopVectorizationPlanner.
2173//===----------------------------------------------------------------------===//
2174
2175/// FIXME: The newly created binary instructions should contain nsw/nuw
2176/// flags, which can be found from the original scalar operations.
/// Materialize StartValue + Index * Step (or the pointer / floating-point
/// equivalent) for the induction described by the arguments, using plain
/// IRBuilder arithmetic with a few trivial simplifications.
2177Value *
2179 Value *Step,
2181 const BinaryOperator *InductionBinOp) {
2182 using namespace llvm::PatternMatch;
 // Bring the index into the step's type: sext/trunc for integers, or an
 // int-to-FP conversion for floating-point inductions.
2183 Type *StepTy = Step->getType();
2184 Value *CastedIndex = StepTy->isIntegerTy()
2185 ? B.CreateSExtOrTrunc(Index, StepTy)
2186 : B.CreateCast(Instruction::SIToFP, Index, StepTy);
2187 if (CastedIndex != Index) {
2188 CastedIndex->setName(CastedIndex->getName() + ".cast");
2189 Index = CastedIndex;
2190 }
2191
2192 // Note: the IR at this point is broken. We cannot use SE to create any new
2193 // SCEV and then expand it, hoping that SCEV's simplification will give us
2194 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may
2195 // lead to various SCEV crashes. So all we can do is to use builder and rely
2196 // on InstCombine for future simplifications. Here we handle some trivial
2197 // cases only.
2198 auto CreateAdd = [&B](Value *X, Value *Y) {
2199 assert(X->getType() == Y->getType() && "Types don't match!");
2200 if (match(X, m_ZeroInt()))
2201 return Y;
2202 if (match(Y, m_ZeroInt()))
2203 return X;
2204 return B.CreateAdd(X, Y);
2205 };
2206
2207 // We allow X to be a vector type, in which case Y will potentially be
2208 // splatted into a vector with the same element count.
2209 auto CreateMul = [&B](Value *X, Value *Y) {
2210 assert(X->getType()->getScalarType() == Y->getType() &&
2211 "Types don't match!");
2212 if (match(X, m_One()))
2213 return Y;
2214 if (match(Y, m_One()))
2215 return X;
2216 VectorType *XVTy = dyn_cast<VectorType>(X->getType());
2217 if (XVTy && !isa<VectorType>(Y->getType()))
2218 Y = B.CreateVectorSplat(XVTy->getElementCount(), Y);
2219 return B.CreateMul(X, Y);
2220 };
2221
 // NOTE(review): the case labels of this switch (the induction-kind
 // enumerators) are elided in this listing.
2222 switch (InductionKind) {
2224 assert(!isa<VectorType>(Index->getType()) &&
2225 "Vector indices not supported for integer inductions yet");
2226 assert(Index->getType() == StartValue->getType() &&
2227 "Index type does not match StartValue type");
 // Step of -1 folds to a single subtract.
2228 if (isa<ConstantInt>(Step) && cast<ConstantInt>(Step)->isMinusOne())
2229 return B.CreateSub(StartValue, Index);
2230 auto *Offset = CreateMul(Index, Step);
2231 return CreateAdd(StartValue, Offset);
2232 }
2234 return B.CreatePtrAdd(StartValue, CreateMul(Index, Step));
2236 assert(!isa<VectorType>(Index->getType()) &&
2237 "Vector indices not supported for FP inductions yet");
2238 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2239 assert(InductionBinOp &&
2240 (InductionBinOp->getOpcode() == Instruction::FAdd ||
2241 InductionBinOp->getOpcode() == Instruction::FSub) &&
2242 "Original bin op should be defined for FP induction");
2243
 // Reuse the original FAdd/FSub opcode so FP semantics are preserved.
2244 Value *MulExp = B.CreateFMul(Step, Index);
2245 return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
2246 "induction");
2247 }
2249 return nullptr;
2250 }
2251 llvm_unreachable("invalid enum");
2252}
2253
2254static std::optional<unsigned> getMaxVScale(const Function &F,
2255 const TargetTransformInfo &TTI) {
2256 if (std::optional<unsigned> MaxVScale = TTI.getMaxVScale())
2257 return MaxVScale;
2258
2259 if (F.hasFnAttribute(Attribute::VScaleRange))
2260 return F.getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax();
2261
2262 return std::nullopt;
2263}
2264
2265/// For the given VF and UF and maximum trip count computed for the loop, return
2266/// whether the induction variable might overflow in the vectorized loop. If not,
2267/// then we know a runtime overflow check always evaluates to false and can be
2268/// removed.
/// (The function's name/signature line is elided in this listing.)
2270 const LoopVectorizationCostModel *Cost,
2271 ElementCount VF, std::optional<unsigned> UF = std::nullopt) {
2272 // Always be conservative if we don't know the exact unroll factor.
2273 unsigned MaxUF = UF ? *UF : Cost->TTI.getMaxInterleaveFactor(VF);
2274
 // All-ones mask of the widest induction type, i.e. its maximum value.
2275 IntegerType *IdxTy = Cost->Legal->getWidestInductionType();
2276 APInt MaxUIntTripCount = IdxTy->getMask();
2277
2278 // We know the runtime overflow check is known false iff the (max) trip-count
2279 // is known and (max) trip-count + (VF * UF) does not overflow in the type of
2280 // the vector loop induction variable.
2281 if (unsigned TC = Cost->PSE.getSmallConstantMaxTripCount()) {
2282 uint64_t MaxVF = VF.getKnownMinValue();
2283 if (VF.isScalable()) {
 // Without a vscale bound no upper bound on the step exists; stay
 // conservative and report that overflow is possible.
2284 std::optional<unsigned> MaxVScale =
2285 getMaxVScale(*Cost->TheFunction, Cost->TTI);
2286 if (!MaxVScale)
2287 return false;
2288 MaxVF *= *MaxVScale;
2289 }
2290
2291 return (MaxUIntTripCount - TC).ugt(MaxVF * MaxUF);
2292 }
2293
2294 return false;
2295}
2296
2297// Return whether we allow using masked interleave-groups (for dealing with
2298// strided loads/stores that reside in predicated blocks, or for dealing
2299// with gaps).
// NOTE(review): the function's signature line and the return of the override
// value are elided in this listing.
2301 // If an override option has been passed in for interleaved accesses, use it.
2302 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2304
 // Otherwise defer to the target's capability query.
2305 return TTI.enableMaskedInterleavedAccessVectorization();
2306}
2307
 // NOTE(review): the function's signature line (taking the VPlan and the
 // vector-preheader VPBB in addition to CheckIRBB) is elided in this listing.
2309 BasicBlock *CheckIRBB) {
2310 // Note: The block with the minimum trip-count check is already connected
2311 // during earlier VPlan construction.
2312 VPBlockBase *ScalarPH = Plan.getScalarPreheader();
2313 VPBlockBase *PreVectorPH = VectorPHVPBB->getSinglePredecessor();
2314 assert(PreVectorPH->getNumSuccessors() == 2 && "Expected 2 successors");
2315 assert(PreVectorPH->getSuccessors()[0] == ScalarPH && "Unexpected successor");
 // Wrap the IR check block and splice it onto the edge into the vector
 // preheader; it becomes the new two-way branch point (vector vs scalar).
2316 VPIRBasicBlock *CheckVPIRBB = Plan.createVPIRBasicBlock(CheckIRBB);
2317 VPBlockUtils::insertOnEdge(PreVectorPH, VectorPHVPBB, CheckVPIRBB);
2318 PreVectorPH = CheckVPIRBB;
2319 VPBlockUtils::connectBlocks(PreVectorPH, ScalarPH);
2320 PreVectorPH->swapSuccessors();
2321
2322 // We just connected a new block to the scalar preheader. Update all
2323 // VPPhis by adding an incoming value for it, replicating the last value.
2324 unsigned NumPredecessors = ScalarPH->getNumPredecessors();
2325 for (VPRecipeBase &R : cast<VPBasicBlock>(ScalarPH)->phis()) {
2326 assert(isa<VPPhi>(&R) && "Phi expected to be VPPhi");
2327 assert(cast<VPPhi>(&R)->getNumIncoming() == NumPredecessors - 1 &&
2328 "must have incoming values for all operands");
2329 R.addOperand(R.getOperand(NumPredecessors - 2));
2330 }
2331}
2332
2334 BasicBlock *VectorPH, ElementCount VF, unsigned UF) const {
2335 // Generate code to check if the loop's trip count is less than VF * UF, or
2336 // equal to it in case a scalar epilogue is required; this implies that the
2337 // vector trip count is zero. This check also covers the case where adding one
2338 // to the backedge-taken count overflowed leading to an incorrect trip count
2339 // of zero. In this case we will also jump to the scalar loop.
2340 auto P = Cost->requiresScalarEpilogue(VF.isVector()) ? ICmpInst::ICMP_ULE
2342
2343 // Reuse existing vector loop preheader for TC checks.
2344 // Note that new preheader block is generated for vector loop.
2345 BasicBlock *const TCCheckBlock = VectorPH;
2347 TCCheckBlock->getContext(),
2348 InstSimplifyFolder(TCCheckBlock->getDataLayout()));
2349 Builder.SetInsertPoint(TCCheckBlock->getTerminator());
2350
2351 // If tail is to be folded, vector loop takes care of all iterations.
2353 Type *CountTy = Count->getType();
2354 Value *CheckMinIters = Builder.getFalse();
2355 auto CreateStep = [&]() -> Value * {
2356 // Create step with max(MinProTripCount, UF * VF).
2357 if (UF * VF.getKnownMinValue() >= MinProfitableTripCount.getKnownMinValue())
2358 return createStepForVF(Builder, CountTy, VF, UF);
2359
2360 Value *MinProfTC =
2361 Builder.CreateElementCount(CountTy, MinProfitableTripCount);
2362 if (!VF.isScalable())
2363 return MinProfTC;
2364 return Builder.CreateBinaryIntrinsic(
2365 Intrinsic::umax, MinProfTC, createStepForVF(Builder, CountTy, VF, UF));
2366 };
2367
2368 TailFoldingStyle Style = Cost->getTailFoldingStyle();
2369 if (Style == TailFoldingStyle::None) {
2370 Value *Step = CreateStep();
2371 ScalarEvolution &SE = *PSE.getSE();
2372 // TODO: Emit unconditional branch to vector preheader instead of
2373 // conditional branch with known condition.
2374 const SCEV *TripCountSCEV = SE.applyLoopGuards(SE.getSCEV(Count), OrigLoop);
2375 // Check if the trip count is < the step.
2376 if (SE.isKnownPredicate(P, TripCountSCEV, SE.getSCEV(Step))) {
2377 // TODO: Ensure step is at most the trip count when determining max VF and
2378 // UF, w/o tail folding.
2379 CheckMinIters = Builder.getTrue();
2381 TripCountSCEV, SE.getSCEV(Step))) {
2382 // Generate the minimum iteration check only if we cannot prove the
2383 // check is known to be true, or known to be false.
2384 CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check");
2385 } // else step known to be < trip count, use CheckMinIters preset to false.
2386 }
2387
2388 return CheckMinIters;
2389}
2390
2391/// Replace \p VPBB with a VPIRBasicBlock wrapping \p IRBB. All recipes from \p
2392/// VPBB are moved to the end of the newly created VPIRBasicBlock. All
2393/// predecessors and successors of VPBB, if any, are rewired to the new
2394/// VPIRBasicBlock. If \p VPBB may be unreachable, \p Plan must be passed.
/// (The function's name/signature start line is elided in this listing.)
2396 BasicBlock *IRBB,
2397 VPlan *Plan = nullptr) {
2398 if (!Plan)
2399 Plan = VPBB->getPlan();
2400 VPIRBasicBlock *IRVPBB = Plan->createVPIRBasicBlock(IRBB);
 // Move phis first so they stay at the start of the new block.
2401 auto IP = IRVPBB->begin();
2402 for (auto &R : make_early_inc_range(VPBB->phis()))
2403 R.moveBefore(*IRVPBB, IP);
2404
 // The remaining (non-phi) recipes are appended after them; the range
 // expression on the elided line skips the already-moved phis.
2405 for (auto &R :
2407 R.moveBefore(*IRVPBB, IRVPBB->end());
2408
2409 VPBlockUtils::reassociateBlocks(VPBB, IRVPBB);
2410 // VPBB is now dead and will be cleaned up when the plan gets destroyed.
2411 return IRVPBB;
2412}
2413
 // NOTE(review): the function's signature line is elided in this listing.
 // Splits a new "<Prefix>scalar.ph" block off the original preheader and
 // returns it; the original preheader becomes the vector-loop entry.
2415 BasicBlock *VectorPH = OrigLoop->getLoopPreheader();
2416 assert(VectorPH && "Invalid loop structure");
2417 assert((OrigLoop->getUniqueLatchExitBlock() ||
2418 Cost->requiresScalarEpilogue(VF.isVector())) &&
2419 "loops not exiting via the latch without required epilogue?");
2420
2421 // NOTE: The Plan's scalar preheader VPBB isn't replaced with a VPIRBasicBlock
2422 // wrapping the newly created scalar preheader here at the moment, because the
2423 // Plan's scalar preheader may be unreachable at this point. Instead it is
2424 // replaced in executePlan.
2425 return SplitBlock(VectorPH, VectorPH->getTerminator(), DT, LI, nullptr,
2426 Twine(Prefix) + "scalar.ph");
2427}
2428
2429/// Return the expanded step for \p ID using \p ExpandedSCEVs to look up SCEV
2430/// expansion results.
/// (The function's name/signature start line is elided in this listing.)
2432 const SCEV2ValueTy &ExpandedSCEVs) {
2433 const SCEV *Step = ID.getStep();
 // Constant and unknown SCEVs wrap an existing Value directly; anything else
 // must already have been materialized by the expander.
2434 if (auto *C = dyn_cast<SCEVConstant>(Step))
2435 return C->getValue();
2436 if (auto *U = dyn_cast<SCEVUnknown>(Step))
2437 return U->getValue();
2438 Value *V = ExpandedSCEVs.lookup(Step);
2439 assert(V && "SCEV must be expanded at this point");
2440 return V;
2441}
2442
2443/// Knowing that loop \p L executes a single vector iteration, add instructions
2444/// that will get simplified and thus should not have any cost to \p
2445/// InstsToIgnore.
/// (The function's name and the first parameters are on elided lines.)
2448 SmallPtrSetImpl<Instruction *> &InstsToIgnore) {
 // The latch compare folds away when the loop runs exactly once.
2449 auto *Cmp = L->getLatchCmpInst();
2450 if (Cmp)
2451 InstsToIgnore.insert(Cmp);
2452 for (const auto &KV : IL) {
2453 // Extract the key by hand so that it can be used in the lambda below. Note
2454 // that captured structured bindings are a C++20 extension.
2455 const PHINode *IV = KV.first;
2456
2457 // Get next iteration value of the induction variable.
2458 Instruction *IVInst =
2459 cast<Instruction>(IV->getIncomingValueForBlock(L->getLoopLatch()));
 // Ignore the IV update only if it feeds nothing besides the phi and the
 // latch compare — both of which disappear after full unrolling.
2460 if (all_of(IVInst->users(),
2461 [&](const User *U) { return U == IV || U == Cmp; }))
2462 InstsToIgnore.insert(IVInst);
2463 }
2464}
2465
 // NOTE(review): the function's signature line is elided in this listing.
2467 // Create a new IR basic block for the scalar preheader.
2468 BasicBlock *ScalarPH = createScalarPreheader("");
 // The split leaves the scalar preheader with a single predecessor — the
 // block that will serve as the vector-loop entry — which is returned here.
2469 return ScalarPH->getSinglePredecessor();
2470}
2471
2472namespace {
2473
/// DenseMapInfo used for CSE over Instructions: hashes the opcode together
/// with the operand list and compares entries with isIdenticalTo, so
/// structurally identical instructions map to the same bucket.
/// NOTE(review): the bodies of canHandle() and getEmptyKey() are elided in
/// this listing.
2474struct CSEDenseMapInfo {
2475 static bool canHandle(const Instruction *I) {
2478 }
2479
2480 static inline Instruction *getEmptyKey() {
2482 }
2483
2484 static inline Instruction *getTombstoneKey() {
2485 return DenseMapInfo<Instruction *>::getTombstoneKey();
2486 }
2487
2488 static unsigned getHashValue(const Instruction *I) {
2489 assert(canHandle(I) && "Unknown instruction!");
2490 return hash_combine(I->getOpcode(),
2491 hash_combine_range(I->operand_values()));
2492 }
2493
2494 static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
 // Sentinel keys only compare equal to themselves.
2495 if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
2496 LHS == getTombstoneKey() || RHS == getTombstoneKey())
2497 return LHS == RHS;
2498 return LHS->isIdenticalTo(RHS);
2499 }
2500};
2501
2502} // end anonymous namespace
2503
2504/// FIXME: This legacy common-subexpression-elimination routine is scheduled for
2505/// removal, in favor of the VPlan-based one.
2506static void legacyCSE(BasicBlock *BB) {
2507 // Perform simple cse.
 // NOTE(review): the declaration of CSEMap (a DenseMap keyed with
 // CSEDenseMapInfo) sits on an elided line here.
2509 for (Instruction &In : llvm::make_early_inc_range(*BB)) {
2510 if (!CSEDenseMapInfo::canHandle(&In))
2511 continue;
2512
2513 // Check if we can replace this instruction with any of the
2514 // visited instructions.
2515 if (Instruction *V = CSEMap.lookup(&In)) {
2516 In.replaceAllUsesWith(V);
2517 In.eraseFromParent();
2518 continue;
2519 }
2520
 // First occurrence: remember it so later duplicates can reuse it.
2521 CSEMap[&In] = &In;
2522 }
2523}
2524
2525/// This function attempts to return a value that represents the ElementCount
2526/// at runtime. For fixed-width VFs we know this precisely at compile
2527/// time, but for scalable VFs we calculate it based on an estimate of the
2528/// vscale value.
/// (The function's name/signature start line is elided in this listing.)
2530 std::optional<unsigned> VScale) {
2531 unsigned EstimatedVF = VF.getKnownMinValue();
 // Without a vscale estimate the known-minimum lane count is used as-is.
2532 if (VF.isScalable())
2533 if (VScale)
2534 EstimatedVF *= *VScale;
2535 assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1");
2536 return EstimatedVF;
2537}
2538
 // NOTE(review): the function's signature start and two declarations (the
 // argument-type vector and the intrinsic-cost branch header) are on elided
 // lines in this listing.
2541 ElementCount VF) const {
2542 // We only need to calculate a cost if the VF is scalar; for actual vectors
2543 // we should already have a pre-calculated cost at each VF.
2544 if (!VF.isScalar())
2545 return getCallWideningDecision(CI, VF).Cost;
2546
 // Calls matching a recognized reduction pattern get the pattern's cost.
2547 Type *RetTy = CI->getType();
2549 if (auto RedCost = getReductionPatternCost(CI, VF, RetTy))
2550 return *RedCost;
2551
2553 for (auto &ArgOp : CI->args())
2554 Tys.push_back(ArgOp->getType());
2555
2556 InstructionCost ScalarCallCost =
2557 TTI.getCallInstrCost(CI->getCalledFunction(), RetTy, Tys, CostKind);
2558
2559 // If this is an intrinsic we may have a lower cost for it.
2562 return std::min(ScalarCallCost, IntrinsicCost);
2563 }
2564 return ScalarCallCost;
2565}
2566
 // Returns the type unchanged for scalar VFs or non-vectorizable types;
 // otherwise widens it to VF lanes. (Signature line elided in this listing.)
2568 if (VF.isScalar() || !canVectorizeTy(Ty))
2569 return Ty;
2570 return toVectorizedTy(Ty, VF);
2571}
2572
 // NOTE(review): the function's signature start, the intrinsic-ID lookup,
 // the arguments/function-type declarations, and part of the cost-attribute
 // construction are on elided lines in this listing.
2575 ElementCount VF) const {
2577 assert(ID && "Expected intrinsic call!");
 // Widen return and parameter types to VF lanes where possible.
2578 Type *RetTy = maybeVectorizeType(CI->getType(), VF);
2579 FastMathFlags FMF;
2580 if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
2581 FMF = FPMO->getFastMathFlags();
2582
2585 SmallVector<Type *> ParamTys;
2586 std::transform(FTy->param_begin(), FTy->param_end(),
2587 std::back_inserter(ParamTys),
2588 [&](Type *Ty) { return maybeVectorizeType(Ty, VF); });
2589
2590 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
2593 return TTI.getIntrinsicInstrCost(CostAttrs, CostKind);
2594}
2595
 // NOTE(review): the function's signature line is elided in this listing.
2597 // Fix widened non-induction PHIs by setting up the PHI operands.
2598 fixNonInductionPHIs(State);
2599
2600 // Don't apply optimizations below when no (vector) loop remains, as they all
2601 // require one at the moment.
2602 VPBasicBlock *HeaderVPBB =
2603 vputils::getFirstLoopHeader(*State.Plan, State.VPDT);
2604 if (!HeaderVPBB)
2605 return;
2606
 // Map the VPlan header block to the IR block it was lowered to.
2607 BasicBlock *HeaderBB = State.CFG.VPBB2IRBB[HeaderVPBB];
2608
2609 // Remove redundant induction instructions.
2610 legacyCSE(HeaderBB);
2611}
2612
 // NOTE(review): the function's signature line, the block-iteration loop
 // header, and the VPPhi dyn_cast line are elided in this listing.
2614 auto Iter = vp_depth_first_shallow(Plan.getEntry());
2616 for (VPRecipeBase &P : VPBB->phis()) {
2618 if (!VPPhi)
2619 continue;
2620 PHINode *NewPhi = cast<PHINode>(State.get(VPPhi));
2621 // Make sure the builder has a valid insert point.
2622 Builder.SetInsertPoint(NewPhi);
 // Populate the widened PHI with one incoming (value, block) pair per
 // VPlan predecessor, translated to their generated IR counterparts.
2623 for (const auto &[Inc, VPBB] : VPPhi->incoming_values_and_blocks())
2624 NewPhi->addIncoming(State.get(Inc), State.CFG.VPBB2IRBB[VPBB]);
2625 }
2626 }
2627}
2628
// Collect the instructions that will remain scalar (per-lane copies rather
// than vector operations) after vectorizing with factor \p VF, storing the
// result in Scalars[VF]. The analysis is seeded from Uniforms[VF], from
// pointers with scalar memory uses, and from ForcedScalars, then expanded
// through GEP chains and the loop's induction variables.
2629 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
2630 // We should not collect Scalars more than once per VF. Right now, this
2631 // function is called from collectUniformsAndScalars(), which already does
2632 // this check. Collecting Scalars for VF=1 does not make any sense.
2633 assert(VF.isVector() && !Scalars.contains(VF) &&
2634 "This function should not be visited twice for the same VF");
2635
2636 // This avoids any chances of creating a REPLICATE recipe during planning
2637 // since that would result in generation of scalarized code during execution,
2638 // which is not supported for scalable vectors.
2639 if (VF.isScalable()) {
2640 Scalars[VF].insert_range(Uniforms[VF]);
2641 return;
2642 }
2643
// NOTE(review): the declarations of the `Worklist` and `ScalarPtrs` sets used
// throughout this function appear on lines (2644, 2648) elided from this
// listing — confirm against the full source.
2645
2646 // These sets are used to seed the analysis with pointers used by memory
2647 // accesses that will remain scalar.
2649 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
2650 auto *Latch = TheLoop->getLoopLatch();
2651
2652 // A helper that returns true if the use of Ptr by MemAccess will be scalar.
2653 // The pointer operands of loads and stores will be scalar as long as the
2654 // memory access is not a gather or scatter operation. The value operand of a
2655 // store will remain scalar if the store is scalarized.
2656 auto IsScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
2657 InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
2658 assert(WideningDecision != CM_Unknown &&
2659 "Widening decision should be ready at this moment");
2660 if (auto *Store = dyn_cast<StoreInst>(MemAccess))
2661 if (Ptr == Store->getValueOperand())
2662 return WideningDecision == CM_Scalarize;
2663 assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
2664 "Ptr is neither a value or pointer operand");
2665 return WideningDecision != CM_GatherScatter;
2666 };
2667
2668 // A helper that returns true if the given value is a getelementptr
2669 // instruction contained in the loop.
2670 auto IsLoopVaryingGEP = [&](Value *V) {
2671 return isa<GetElementPtrInst>(V) && !TheLoop->isLoopInvariant(V);
2672 };
2673
2674 // A helper that evaluates a memory access's use of a pointer. If the use will
2675 // be a scalar use and the pointer is only used by memory accesses, we place
2676 // the pointer in ScalarPtrs. Otherwise, the pointer is placed in
2677 // PossibleNonScalarPtrs.
2678 auto EvaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
2679 // We only care about bitcast and getelementptr instructions contained in
2680 // the loop.
2681 if (!IsLoopVaryingGEP(Ptr))
2682 return;
2683
2684 // If the pointer has already been identified as scalar (e.g., if it was
2685 // also identified as uniform), there's nothing to do.
2686 auto *I = cast<Instruction>(Ptr);
2687 if (Worklist.count(I))
2688 return;
2689
2690 // If the use of the pointer will be a scalar use, and all users of the
2691 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
2692 // place the pointer in PossibleNonScalarPtrs.
// NOTE(review): the second operand of this && — per the comment above, the
// check that all users of the pointer are memory accesses — is on a line
// (2694) elided from this listing.
2693 if (IsScalarUse(MemAccess, Ptr) &&
2695 ScalarPtrs.insert(I);
2696 else
2697 PossibleNonScalarPtrs.insert(I);
2698 };
2699
2700 // We seed the scalars analysis with three classes of instructions: (1)
2701 // instructions marked uniform-after-vectorization and (2) bitcast,
2702 // getelementptr and (pointer) phi instructions used by memory accesses
2703 // requiring a scalar use.
2704 //
2705 // (1) Add to the worklist all instructions that have been identified as
2706 // uniform-after-vectorization.
2707 Worklist.insert_range(Uniforms[VF]);
2708
2709 // (2) Add to the worklist all bitcast and getelementptr instructions used by
2710 // memory accesses requiring a scalar use. The pointer operands of loads and
2711 // stores will be scalar unless the operation is a gather or scatter.
2712 // The value operand of a store will remain scalar if the store is scalarized.
2713 for (auto *BB : TheLoop->blocks())
2714 for (auto &I : *BB) {
2715 if (auto *Load = dyn_cast<LoadInst>(&I)) {
2716 EvaluatePtrUse(Load, Load->getPointerOperand());
2717 } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
2718 EvaluatePtrUse(Store, Store->getPointerOperand());
2719 EvaluatePtrUse(Store, Store->getValueOperand());
2720 }
2721 }
// Promote pointers to the worklist only when no use marked them possibly
// non-scalar.
2722 for (auto *I : ScalarPtrs)
2723 if (!PossibleNonScalarPtrs.count(I)) {
2724 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
2725 Worklist.insert(I);
2726 }
2727
2728 // Insert the forced scalars.
2729 // FIXME: Currently VPWidenPHIRecipe() often creates a dead vector
2730 // induction variable when the PHI user is scalarized.
2731 auto ForcedScalar = ForcedScalars.find(VF);
2732 if (ForcedScalar != ForcedScalars.end())
2733 for (auto *I : ForcedScalar->second) {
2734 LLVM_DEBUG(dbgs() << "LV: Found (forced) scalar instruction: " << *I << "\n");
2735 Worklist.insert(I);
2736 }
2737
2738 // Expand the worklist by looking through any bitcasts and getelementptr
2739 // instructions we've already identified as scalar. This is similar to the
2740 // expansion step in collectLoopUniforms(); however, here we're only
2741 // expanding to include additional bitcasts and getelementptr instructions.
2742 unsigned Idx = 0;
2743 while (Idx != Worklist.size()) {
2744 Instruction *Dst = Worklist[Idx++];
2745 if (!IsLoopVaryingGEP(Dst->getOperand(0)))
2746 continue;
2747 auto *Src = cast<Instruction>(Dst->getOperand(0));
// The operand itself becomes scalar only when every in-loop user is either
// already scalar or a load/store that uses it as a scalar pointer.
2748 if (llvm::all_of(Src->users(), [&](User *U) -> bool {
2749 auto *J = cast<Instruction>(U);
2750 return !TheLoop->contains(J) || Worklist.count(J) ||
2751 ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
2752 IsScalarUse(J, Src));
2753 })) {
2754 Worklist.insert(Src);
2755 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
2756 }
2757 }
2758
2759 // An induction variable will remain scalar if all users of the induction
2760 // variable and induction variable update remain scalar.
2761 for (const auto &Induction : Legal->getInductionVars()) {
2762 auto *Ind = Induction.first;
2763 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
2764
2765 // If tail-folding is applied, the primary induction variable will be used
2766 // to feed a vector compare.
2767 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
2768 continue;
2769
2770 // Returns true if \p Indvar is a pointer induction that is used directly by
2771 // load/store instruction \p I.
2772 auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar,
2773 Instruction *I) {
// NOTE(review): the right-hand side of this comparison — per the lambda's
// doc comment, the pointer-induction kind — is on lines (2775-2776) elided
// from this listing.
2774 return Induction.second.getKind() ==
2777 Indvar == getLoadStorePointerOperand(I) && IsScalarUse(I, Indvar);
2778 };
2779
2780 // Determine if all users of the induction variable are scalar after
2781 // vectorization.
2782 bool ScalarInd = all_of(Ind->users(), [&](User *U) -> bool {
2783 auto *I = cast<Instruction>(U);
2784 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
2785 IsDirectLoadStoreFromPtrIndvar(Ind, I);
2786 });
2787 if (!ScalarInd)
2788 continue;
2789
2790 // If the induction variable update is a fixed-order recurrence, neither the
2791 // induction variable or its update should be marked scalar after
2792 // vectorization.
2793 auto *IndUpdatePhi = dyn_cast<PHINode>(IndUpdate);
2794 if (IndUpdatePhi && Legal->isFixedOrderRecurrence(IndUpdatePhi))
2795 continue;
2796
2797 // Determine if all users of the induction variable update instruction are
2798 // scalar after vectorization.
2799 bool ScalarIndUpdate = all_of(IndUpdate->users(), [&](User *U) -> bool {
2800 auto *I = cast<Instruction>(U);
2801 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
2802 IsDirectLoadStoreFromPtrIndvar(IndUpdate, I);
2803 });
2804 if (!ScalarIndUpdate)
2805 continue;
2806
2807 // The induction variable and its update instruction will remain scalar.
2808 Worklist.insert(Ind);
2809 Worklist.insert(IndUpdate);
2810 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
2811 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
2812 << "\n");
2813 }
2814
2815 Scalars[VF].insert_range(Worklist);
2816}
2817
// NOTE(review): the signature line (2818) is elided from this listing. From
// the parameters used below, this takes (Instruction *I, ElementCount VF)
// and returns true when a predicated instruction has no non-scalar lowering
// (masked load/store, gather/scatter, or safe-divisor) and must therefore be
// scalarized and predicated.
2819 ElementCount VF) {
2820 if (!isPredicatedInst(I))
2821 return false;
2822
2823 // Do we have a non-scalar lowering for this predicated
2824 // instruction? No - it is scalar with predication.
2825 switch(I->getOpcode()) {
2826 default:
2827 return true;
2828 case Instruction::Call:
2829 if (VF.isScalar())
2830 return true;
// NOTE(review): the return for the vector-VF Call path (line 2831) is elided
// from this listing — confirm its handling against the full source.
2832 case Instruction::Load:
2833 case Instruction::Store: {
2834 auto *Ptr = getLoadStorePointerOperand(I);
2835 auto *Ty = getLoadStoreType(I);
2836 unsigned AS = getLoadStoreAddressSpace(I);
// Query legality with the widened type when VF is a vector, so the
// gather/scatter legality checks see the actual vector type.
2837 Type *VTy = Ty;
2838 if (VF.isVector())
2839 VTy = VectorType::get(Ty, VF);
2840 const Align Alignment = getLoadStoreAlignment(I);
// Scalar-with-predication iff neither a masked contiguous access nor a
// masked gather/scatter is legal for this access.
2841 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment, AS) ||
2842 TTI.isLegalMaskedGather(VTy, Alignment))
2843 : !(isLegalMaskedStore(Ty, Ptr, Alignment, AS) ||
2844 TTI.isLegalMaskedScatter(VTy, Alignment));
2845 }
2846 case Instruction::UDiv:
2847 case Instruction::SDiv:
2848 case Instruction::SRem:
2849 case Instruction::URem: {
2850 // We have the option to use the safe-divisor idiom to avoid predication.
2851 // The cost based decision here will always select safe-divisor for
2852 // scalable vectors as scalarization isn't legal.
2853 const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
2854 return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost);
2855 }
2856 }
2857}
2858
2859 // TODO: Fold into LoopVectorizationLegality::isMaskRequired.
// NOTE(review): the signature line (2860) and the surrounding early-exit
// condition lines (2863, 2865) are elided from this listing; line 2864 below
// is one operand of that condition. From the uses below, this decides whether
// instruction I must execute under a mask after vectorization.
2861 // TODO: We can use the loop-preheader as context point here and get
2862 // context sensitive reasoning for isSafeToSpeculativelyExecute.
2864 (isa<LoadInst, StoreInst, CallInst>(I) && !Legal->isMaskRequired(I)) ||
2866 return false;
2867
2868 // If the instruction was executed conditionally in the original scalar loop,
2869 // predication is needed with a mask whose lanes are all possibly inactive.
2870 if (Legal->blockNeedsPredication(I->getParent()))
2871 return true;
2872
2873 // If we're not folding the tail by masking, predication is unnecessary.
2874 if (!foldTailByMasking())
2875 return false;
2876
2877 // All that remain are instructions with side-effects originally executed in
2878 // the loop unconditionally, but now execute under a tail-fold mask (only)
2879 // having at least one active lane (the first). If the side-effects of the
2880 // instruction are invariant, executing it w/o (the tail-folding) mask is safe
2881 // - it will cause the same side-effects as when masked.
2882 switch(I->getOpcode()) {
2883 default:
// NOTE(review): line 2884 — the statement this message belongs to (an
// llvm_unreachable/assert by the look of it) — is elided from this listing.
2885 "instruction should have been considered by earlier checks");
2886 case Instruction::Call:
2887 // Side-effects of a Call are assumed to be non-invariant, needing a
2888 // (fold-tail) mask.
2889 assert(Legal->isMaskRequired(I) &&
2890 "should have returned earlier for calls not needing a mask");
2891 return true;
2892 case Instruction::Load:
2893 // If the address is loop invariant no predication is needed.
2894 return !Legal->isInvariant(getLoadStorePointerOperand(I));
2895 case Instruction::Store: {
2896 // For stores, we need to prove both speculation safety (which follows from
2897 // the same argument as loads), but also must prove the value being stored
2898 // is correct. The easiest form of the latter is to require that all values
2899 // stored are the same.
2900 return !(Legal->isInvariant(getLoadStorePointerOperand(I)) &&
2901 TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand()));
2902 }
2903 case Instruction::UDiv:
2904 case Instruction::URem:
2905 // If the divisor is loop-invariant no predication is needed.
2906 return !Legal->isInvariant(I->getOperand(1));
2907 case Instruction::SDiv:
2908 case Instruction::SRem:
2909 // Conservative for now, since masked-off lanes may be poison and could
2910 // trigger signed overflow.
2911 return true;
2912 }
2913}
2914
// NOTE(review): the signature and the condition guarding this first
// `return 1;` (lines 2915-2917) are elided from this listing. From the body,
// this returns the factor by which a predicated block's cost should be
// divided: the ratio of header frequency to the block's frequency, i.e. how
// rarely the block executes relative to the loop header.
2918 return 1;
2919 // If the block wasn't originally predicated then return early to avoid
2920 // computing BlockFrequencyInfo unnecessarily.
2921 if (!Legal->blockNeedsPredication(BB))
2922 return 1;
2923
2924 uint64_t HeaderFreq =
2925 getBFI().getBlockFreq(TheLoop->getHeader()).getFrequency();
2926 uint64_t BBFreq = getBFI().getBlockFreq(BB).getFrequency();
2927 assert(HeaderFreq >= BBFreq &&
2928 "Header has smaller block freq than dominated BB?");
// Round to the nearest integer divisor rather than truncating.
2929 return std::round((double)HeaderFreq / BBFreq);
2930}
2931
// Returns the pair {scalarization cost, safe-divisor cost} for a predicated
// div/rem instruction at VF, letting the caller pick the cheaper lowering.
// NOTE(review): the line carrying the function name and first parameter
// (2933) is elided from this listing; parameters are (Instruction *I,
// ElementCount VF) per the uses below.
2932 std::pair<InstructionCost, InstructionCost>
2934 ElementCount VF) {
2935 assert(I->getOpcode() == Instruction::UDiv ||
2936 I->getOpcode() == Instruction::SDiv ||
2937 I->getOpcode() == Instruction::SRem ||
2938 I->getOpcode() == Instruction::URem);
// NOTE(review): line 2939 is elided from this listing.
2940
2941 // Scalarization isn't legal for scalable vector types
2942 InstructionCost ScalarizationCost = InstructionCost::getInvalid();
2943 if (!VF.isScalable()) {
2944 // Get the scalarization cost and scale this amount by the probability of
2945 // executing the predicated block. If the instruction is not predicated,
2946 // we fall through to the next case.
2947 ScalarizationCost = 0;
2948
2949 // These instructions have a non-void type, so account for the phi nodes
2950 // that we will create. This cost is likely to be zero. The phi node
2951 // cost, if any, should be scaled by the block probability because it
2952 // models a copy at the end of each predicated block.
2953 ScalarizationCost +=
2954 VF.getFixedValue() * TTI.getCFInstrCost(Instruction::PHI, CostKind);
2955
2956 // The cost of the non-predicated instruction.
2957 ScalarizationCost +=
2958 VF.getFixedValue() *
2959 TTI.getArithmeticInstrCost(I->getOpcode(), I->getType(), CostKind);
2960
2961 // The cost of insertelement and extractelement instructions needed for
2962 // scalarization.
2963 ScalarizationCost += getScalarizationOverhead(I, VF);
2964
2965 // Scale the cost by the probability of executing the predicated blocks.
2966 // This assumes the predicated block for each vector lane is equally
2967 // likely.
2968 ScalarizationCost =
2969 ScalarizationCost / getPredBlockCostDivisor(CostKind, I->getParent());
2970 }
2971
2972 InstructionCost SafeDivisorCost = 0;
2973 auto *VecTy = toVectorTy(I->getType(), VF);
2974 // The cost of the select guard to ensure all lanes are well defined
2975 // after we speculate above any internal control flow.
// NOTE(review): the trailing arguments of this getCmpSelInstrCost call
// (line 2979) are elided from this listing.
2976 SafeDivisorCost +=
2977 TTI.getCmpSelInstrCost(Instruction::Select, VecTy,
2978 toVectorTy(Type::getInt1Ty(I->getContext()), VF),
2980
// Cost of the widened div/rem itself, with operand info forwarded so the
// target can account for the actual operands.
2981 SmallVector<const Value *, 4> Operands(I->operand_values());
2982 SafeDivisorCost += TTI.getArithmeticInstrCost(
2983 I->getOpcode(), VecTy, CostKind,
2984 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
2985 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
2986 Operands, I);
2987 return {ScalarizationCost, SafeDivisorCost};
2988}
2989
// NOTE(review): the line carrying the function name (2990) is elided from
// this listing. From the body, this decides whether the interleave group of
// memory instruction I can be widened at VF (legal element types, masking
// legality, no irregular padding, scalable-VF restrictions).
2991 Instruction *I, ElementCount VF) const {
2992 assert(isAccessInterleaved(I) && "Expecting interleaved access.");
// NOTE(review): the expression of a second assert (line 2993) pairing with
// the message below is elided from this listing.
2994 "Decision should not be set yet.");
2995 auto *Group = getInterleavedAccessGroup(I);
2996 assert(Group && "Must have a group.");
2997 unsigned InterleaveFactor = Group->getFactor();
2998
2999 // If the instruction's allocated size doesn't equal its type size, it
3000 // requires padding and will be scalarized.
3001 auto &DL = I->getDataLayout();
3002 auto *ScalarTy = getLoadStoreType(I);
3003 if (hasIrregularType(ScalarTy, DL))
3004 return false;
3005
3006 // For scalable vectors, the interleave factors must be <= 8 since we require
3007 // the (de)interleaveN intrinsics instead of shufflevectors.
3008 if (VF.isScalable() && InterleaveFactor > 8)
3009 return false;
3010
3011 // If the group involves a non-integral pointer, we may not be able to
3012 // losslessly cast all values to a common type.
3013 bool ScalarNI = DL.isNonIntegralPointerType(ScalarTy);
3014 for (unsigned Idx = 0; Idx < InterleaveFactor; Idx++) {
3015 Instruction *Member = Group->getMember(Idx);
3016 if (!Member)
3017 continue;
3018 auto *MemberTy = getLoadStoreType(Member);
3019 bool MemberNI = DL.isNonIntegralPointerType(MemberTy);
3020 // Don't coerce non-integral pointers to integers or vice versa.
3021 if (MemberNI != ScalarNI)
3022 // TODO: Consider adding special nullptr value case here
3023 return false;
3024 if (MemberNI && ScalarNI &&
3025 ScalarTy->getPointerAddressSpace() !=
3026 MemberTy->getPointerAddressSpace())
3027 return false;
3028 }
3029
3030 // Check if masking is required.
3031 // A Group may need masking for one of two reasons: it resides in a block that
3032 // needs predication, or it was decided to use masking to deal with gaps
3033 // (either a gap at the end of a load-access that may result in a speculative
3034 // load, or any gaps in a store-access).
3035 bool PredicatedAccessRequiresMasking =
3036 blockNeedsPredicationForAnyReason(I->getParent()) &&
3037 Legal->isMaskRequired(I);
// NOTE(review): the final operand of this && (line 3040) is elided from this
// listing.
3038 bool LoadAccessWithGapsRequiresEpilogMasking =
3039 isa<LoadInst>(I) && Group->requiresScalarEpilogue() &&
3041 bool StoreAccessWithGapsRequiresMasking =
3042 isa<StoreInst>(I) && !Group->isFull();
3043 if (!PredicatedAccessRequiresMasking &&
3044 !LoadAccessWithGapsRequiresEpilogMasking &&
3045 !StoreAccessWithGapsRequiresMasking)
3046 return true;
3047
3048 // If masked interleaving is required, we expect that the user/target had
3049 // enabled it, because otherwise it either wouldn't have been created or
3050 // it should have been invalidated by the CostModel.
// NOTE(review): the assert expression this message belongs to (line 3051) is
// elided from this listing.
3052 "Masked interleave-groups for predicated accesses are not enabled.");
3053
3054 if (Group->isReverse())
3055 return false;
3056
3057 // TODO: Support interleaved access that requires a gap mask for scalable VFs.
3058 bool NeedsMaskForGaps = LoadAccessWithGapsRequiresEpilogMasking ||
3059 StoreAccessWithGapsRequiresMasking;
3060 if (VF.isScalable() && NeedsMaskForGaps)
3061 return false;
3062
// Finally, the target must support a masked load/store of the element type.
3063 auto *Ty = getLoadStoreType(I);
3064 const Align Alignment = getLoadStoreAlignment(I);
3065 unsigned AS = getLoadStoreAddressSpace(I);
3066 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment, AS)
3067 : TTI.isLegalMaskedStore(Ty, Alignment, AS);
3068}
3069
// NOTE(review): the line carrying the function name (3070) is elided from
// this listing. From the body, this decides whether load/store I can be
// widened to a single vector access at VF: the pointer must be consecutive,
// the access must not be scalarized due to predication, and the element type
// must not require padding.
3071 Instruction *I, ElementCount VF) {
3072 // Get and ensure we have a valid memory instruction.
3073 assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");
3074
3075 auto *Ptr = getLoadStorePointerOperand(I);
3076 auto *ScalarTy = getLoadStoreType(I);
3077
3078 // In order to be widened, the pointer should be consecutive, first of all.
3079 if (!Legal->isConsecutivePtr(ScalarTy, Ptr))
3080 return false;
3081
3082 // If the instruction is a store located in a predicated block, it will be
3083 // scalarized.
3084 if (isScalarWithPredication(I, VF))
3085 return false;
3086
3087 // If the instruction's allocated size doesn't equal its type size, it
3088 // requires padding and will be scalarized.
3089 auto &DL = I->getDataLayout();
3090 if (hasIrregularType(ScalarTy, DL))
3091 return false;
3092
3093 return true;
3094}
3095
// Collect the instructions that only demand the first lane ("uniform") after
// vectorizing with factor \p VF, storing the result in Uniforms[VF]. Seeded
// from exit-branch conditions, uniform memory ops and scalar-used pointers,
// then expanded backwards through operands and the induction variables.
3096 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
3097 // We should not collect Uniforms more than once per VF. Right now,
3098 // this function is called from collectUniformsAndScalars(), which
3099 // already does this check. Collecting Uniforms for VF=1 does not make any
3100 // sense.
3101
3102 assert(VF.isVector() && !Uniforms.contains(VF) &&
3103 "This function should not be visited twice for the same VF");
3104
3105 // Visit the list of Uniforms. If we find no uniform value, we won't
3106 // analyze again. Uniforms.count(VF) will return 1.
3107 Uniforms[VF].clear();
3108
3109 // Now we know that the loop is vectorizable!
3110 // Collect instructions inside the loop that will remain uniform after
3111 // vectorization.
3112
3113 // Global values, params and instructions outside of current loop are out of
3114 // scope.
// NOTE(review): the dyn_cast producing `I` inside this lambda (line 3116) is
// elided from this listing.
3115 auto IsOutOfScope = [&](Value *V) -> bool {
3117 return (!I || !TheLoop->contains(I));
3118 };
3119
3120 // Worklist containing uniform instructions demanding lane 0.
3121 SetVector<Instruction *> Worklist;
3122
3123 // Add uniform instructions demanding lane 0 to the worklist. Instructions
3124 // that require predication must not be considered uniform after
3125 // vectorization, because that would create an erroneous replicating region
3126 // where only a single instance out of VF should be formed.
3127 auto AddToWorklistIfAllowed = [&](Instruction *I) -> void {
3128 if (IsOutOfScope(I)) {
3129 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
3130 << *I << "\n");
3131 return;
3132 }
3133 if (isPredicatedInst(I)) {
3134 LLVM_DEBUG(
3135 dbgs() << "LV: Found not uniform due to requiring predication: " << *I
3136 << "\n");
3137 return;
3138 }
3139 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
3140 Worklist.insert(I);
3141 };
3142
3143 // Start with the conditional branches exiting the loop. If the branch
3144 // condition is an instruction contained in the loop that is only used by the
3145 // branch, it is uniform. Note conditions from uncountable early exits are not
3146 // uniform.
// NOTE(review): the declaration of `Exiting` (line 3147) is elided from this
// listing.
3148 TheLoop->getExitingBlocks(Exiting);
3149 for (BasicBlock *E : Exiting) {
3150 if (Legal->hasUncountableEarlyExit() && TheLoop->getLoopLatch() != E)
3151 continue;
3152 auto *Cmp = dyn_cast<Instruction>(E->getTerminator()->getOperand(0));
3153 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
3154 AddToWorklistIfAllowed(Cmp);
3155 }
3156
3157 auto PrevVF = VF.divideCoefficientBy(2);
3158 // Return true if all lanes perform the same memory operation, and we can
3159 // thus choose to execute only one.
3160 auto IsUniformMemOpUse = [&](Instruction *I) {
3161 // If the value was already known to not be uniform for the previous
3162 // (smaller VF), it cannot be uniform for the larger VF.
3163 if (PrevVF.isVector()) {
3164 auto Iter = Uniforms.find(PrevVF);
3165 if (Iter != Uniforms.end() && !Iter->second.contains(I))
3166 return false;
3167 }
3168 if (!Legal->isUniformMemOp(*I, VF))
3169 return false;
3170 if (isa<LoadInst>(I))
3171 // Loading the same address always produces the same result - at least
3172 // assuming aliasing and ordering which have already been checked.
3173 return true;
3174 // Storing the same value on every iteration.
3175 return TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand());
3176 };
3177
3178 auto IsUniformDecision = [&](Instruction *I, ElementCount VF) {
3179 InstWidening WideningDecision = getWideningDecision(I, VF);
3180 assert(WideningDecision != CM_Unknown &&
3181 "Widening decision should be ready at this moment");
3182
3183 if (IsUniformMemOpUse(I))
3184 return true;
3185
3186 return (WideningDecision == CM_Widen ||
3187 WideningDecision == CM_Widen_Reverse ||
3188 WideningDecision == CM_Interleave);
3189 };
3190
3191 // Returns true if Ptr is the pointer operand of a memory access instruction
3192 // I, I is known to not require scalarization, and the pointer is not also
3193 // stored.
3194 auto IsVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
3195 if (isa<StoreInst>(I) && I->getOperand(0) == Ptr)
3196 return false;
3197 return getLoadStorePointerOperand(I) == Ptr &&
3198 (IsUniformDecision(I, VF) || Legal->isInvariant(Ptr));
3199 };
3200
3201 // Holds a list of values which are known to have at least one uniform use.
3202 // Note that there may be other uses which aren't uniform. A "uniform use"
3203 // here is something which only demands lane 0 of the unrolled iterations;
3204 // it does not imply that all lanes produce the same value (e.g. this is not
3205 // the usual meaning of uniform)
3206 SetVector<Value *> HasUniformUse;
3207
3208 // Scan the loop for instructions which are either a) known to have only
3209 // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
3210 for (auto *BB : TheLoop->blocks())
3211 for (auto &I : *BB) {
3212 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) {
3213 switch (II->getIntrinsicID()) {
3214 case Intrinsic::sideeffect:
3215 case Intrinsic::experimental_noalias_scope_decl:
3216 case Intrinsic::assume:
3217 case Intrinsic::lifetime_start:
3218 case Intrinsic::lifetime_end:
3219 if (TheLoop->hasLoopInvariantOperands(&I))
3220 AddToWorklistIfAllowed(&I);
3221 break;
3222 default:
3223 break;
3224 }
3225 }
3226
3227 if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) {
3228 if (IsOutOfScope(EVI->getAggregateOperand())) {
3229 AddToWorklistIfAllowed(EVI);
3230 continue;
3231 }
3232 // Only ExtractValue instructions where the aggregate value comes from a
3233 // call are allowed to be non-uniform.
3234 assert(isa<CallInst>(EVI->getAggregateOperand()) &&
3235 "Expected aggregate value to be call return value");
3236 }
3237
3238 // If there's no pointer operand, there's nothing to do.
3239 auto *Ptr = getLoadStorePointerOperand(&I);
3240 if (!Ptr)
3241 continue;
3242
3243 // If the pointer can be proven to be uniform, always add it to the
3244 // worklist.
3245 if (isa<Instruction>(Ptr) && Legal->isUniform(Ptr, VF))
3246 AddToWorklistIfAllowed(cast<Instruction>(Ptr));
3247
3248 if (IsUniformMemOpUse(&I))
3249 AddToWorklistIfAllowed(&I);
3250
3251 if (IsVectorizedMemAccessUse(&I, Ptr))
3252 HasUniformUse.insert(Ptr);
3253 }
3254
3255 // Add to the worklist any operands which have *only* uniform (e.g. lane 0
3256 // demanding) users. Since loops are assumed to be in LCSSA form, this
3257 // disallows uses outside the loop as well.
3258 for (auto *V : HasUniformUse) {
3259 if (IsOutOfScope(V))
3260 continue;
3261 auto *I = cast<Instruction>(V);
3262 bool UsersAreMemAccesses = all_of(I->users(), [&](User *U) -> bool {
3263 auto *UI = cast<Instruction>(U);
3264 return TheLoop->contains(UI) && IsVectorizedMemAccessUse(UI, V);
3265 });
3266 if (UsersAreMemAccesses)
3267 AddToWorklistIfAllowed(I);
3268 }
3269
3270 // Expand Worklist in topological order: whenever a new instruction
3271 // is added, its users should be already inside Worklist. It ensures
3272 // a uniform instruction will only be used by uniform instructions.
3273 unsigned Idx = 0;
3274 while (Idx != Worklist.size()) {
3275 Instruction *I = Worklist[Idx++];
3276
3277 for (auto *OV : I->operand_values()) {
3278 // isOutOfScope operands cannot be uniform instructions.
3279 if (IsOutOfScope(OV))
3280 continue;
3281 // First order recurrence Phi's should typically be considered
3282 // non-uniform.
3283 auto *OP = dyn_cast<PHINode>(OV);
3284 if (OP && Legal->isFixedOrderRecurrence(OP))
3285 continue;
3286 // If all the users of the operand are uniform, then add the
3287 // operand into the uniform worklist.
3288 auto *OI = cast<Instruction>(OV);
3289 if (llvm::all_of(OI->users(), [&](User *U) -> bool {
3290 auto *J = cast<Instruction>(U);
3291 return Worklist.count(J) || IsVectorizedMemAccessUse(J, OI);
3292 }))
3293 AddToWorklistIfAllowed(OI);
3294 }
3295 }
3296
3297 // For an instruction to be added into Worklist above, all its users inside
3298 // the loop should also be in Worklist. However, this condition cannot be
3299 // true for phi nodes that form a cyclic dependence. We must process phi
3300 // nodes separately. An induction variable will remain uniform if all users
3301 // of the induction variable and induction variable update remain uniform.
3302 // The code below handles both pointer and non-pointer induction variables.
3303 BasicBlock *Latch = TheLoop->getLoopLatch();
3304 for (const auto &Induction : Legal->getInductionVars()) {
3305 auto *Ind = Induction.first;
3306 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
3307
3308 // Determine if all users of the induction variable are uniform after
3309 // vectorization.
3310 bool UniformInd = all_of(Ind->users(), [&](User *U) -> bool {
3311 auto *I = cast<Instruction>(U);
3312 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
3313 IsVectorizedMemAccessUse(I, Ind);
3314 });
3315 if (!UniformInd)
3316 continue;
3317
3318 // Determine if all users of the induction variable update instruction are
3319 // uniform after vectorization.
3320 bool UniformIndUpdate = all_of(IndUpdate->users(), [&](User *U) -> bool {
3321 auto *I = cast<Instruction>(U);
3322 return I == Ind || Worklist.count(I) ||
3323 IsVectorizedMemAccessUse(I, IndUpdate);
3324 });
3325 if (!UniformIndUpdate)
3326 continue;
3327
3328 // The induction variable and its update instruction will remain uniform.
3329 AddToWorklistIfAllowed(Ind);
3330 AddToWorklistIfAllowed(IndUpdate);
3331 }
3332
3333 Uniforms[VF].insert_range(Worklist);
3334}
3335
// NOTE(review): the signature line (3336) is elided from this listing. From
// the body, this returns true (and emits a remark) when vectorization under
// size optimization (-Os/-Oz) would require runtime checks — pointer checks,
// SCEV predicate checks, or stride==1 checks — which are not allowed there.
3337 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
3338
3339 if (Legal->getRuntimePointerChecking()->Need) {
3340 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
3341 "runtime pointer checks needed. Enable vectorization of this "
3342 "loop with '#pragma clang loop vectorize(enable)' when "
3343 "compiling with -Os/-Oz",
3344 "CantVersionLoopWithOptForSize", ORE, TheLoop);
3345 return true;
3346 }
3347
// A non-trivially-true SCEV predicate would also need a runtime check.
3348 if (!PSE.getPredicate().isAlwaysTrue()) {
3349 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
3350 "runtime SCEV checks needed. Enable vectorization of this "
3351 "loop with '#pragma clang loop vectorize(enable)' when "
3352 "compiling with -Os/-Oz",
3353 "CantVersionLoopWithOptForSize", ORE, TheLoop);
3354 return true;
3355 }
3356
3357 // FIXME: Avoid specializing for stride==1 instead of bailing out.
3358 if (!Legal->getLAI()->getSymbolicStrides().empty()) {
3359 reportVectorizationFailure("Runtime stride check for small trip count",
3360 "runtime stride == 1 checks needed. Enable vectorization of "
3361 "this loop without such check by compiling with -Os/-Oz",
3362 "CantVersionLoopWithOptForSize", ORE, TheLoop);
3363 return true;
3364 }
3365
3366 return false;
3367}
3368
// Returns true if scalable vectorization is feasible for this loop, caching
// the answer in IsScalableVectorizationAllowed. Checks target support, user
// hints, reduction legality, element-type legality, and (for loops with a
// dependence-distance limit) availability of a max vscale value.
3369 bool LoopVectorizationCostModel::isScalableVectorizationAllowed() {
// Cached result from a previous query, if any.
3370 if (IsScalableVectorizationAllowed)
3371 return *IsScalableVectorizationAllowed;
3372
3373 IsScalableVectorizationAllowed = false;
3374 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors)
3375 return false;
3376
3377 if (Hints->isScalableVectorizationDisabled()) {
3378 reportVectorizationInfo("Scalable vectorization is explicitly disabled",
3379 "ScalableVectorizationDisabled", ORE, TheLoop);
3380 return false;
3381 }
3382
3383 LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n");
3384
3385 auto MaxScalableVF = ElementCount::getScalable(
3386 std::numeric_limits<ElementCount::ScalarTy>::max());
3387
3388 // Test that the loop-vectorizer can legalize all operations for this MaxVF.
3389 // FIXME: While for scalable vectors this is currently sufficient, this should
3390 // be replaced by a more detailed mechanism that filters out specific VFs,
3391 // instead of invalidating vectorization for a whole set of VFs based on the
3392 // MaxVF.
3393
3394 // Disable scalable vectorization if the loop contains unsupported reductions.
// NOTE(review): the call start of the report below (line 3396) is elided
// from this listing.
3395 if (!canVectorizeReductions(MaxScalableVF)) {
3397 "Scalable vectorization not supported for the reduction "
3398 "operations found in this loop.",
3399 "ScalableVFUnfeasible", ORE, TheLoop);
3400 return false;
3401 }
3402
3403 // Disable scalable vectorization if the loop contains any instructions
3404 // with element types not supported for scalable vectors.
// NOTE(review): the legality query forming the second operand of this &&
// (line 3407) is elided from this listing.
3405 if (any_of(ElementTypesInLoop, [&](Type *Ty) {
3406 return !Ty->isVoidTy() &&
3408 })) {
3409 reportVectorizationInfo("Scalable vectorization is not supported "
3410 "for all element types found in this loop.",
3411 "ScalableVFUnfeasible", ORE, TheLoop);
3412 return false;
3413 }
3414
// When a maximum safe dependence distance applies, a known max vscale is
// required so the distance can be translated into a scalable VF bound.
3415 if (!Legal->isSafeForAnyVectorWidth() && !getMaxVScale(*TheFunction, TTI)) {
3416 reportVectorizationInfo("The target does not provide maximum vscale value "
3417 "for safe distance analysis.",
3418 "ScalableVFUnfeasible", ORE, TheLoop);
3419 return false;
3420 }
3421
3422 IsScalableVectorizationAllowed = true;
3423 return true;
3424}
3425
// Returns the maximum legal scalable VF for this loop, or a zero scalable
// count when scalable vectorization is not allowed. When a dependence
// distance limits the width, the bound is MaxSafeElements divided by the
// target's max vscale.
3426 ElementCount
3427 LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
3428 if (!isScalableVectorizationAllowed())
3429 return ElementCount::getScalable(0);
3430
3431 auto MaxScalableVF = ElementCount::getScalable(
3432 std::numeric_limits<ElementCount::ScalarTy>::max());
3433 if (Legal->isSafeForAnyVectorWidth())
3434 return MaxScalableVF;
3435
// The unchecked dereference below appears to rely on
// isScalableVectorizationAllowed() having already rejected the case where
// !isSafeForAnyVectorWidth() and getMaxVScale() is empty — confirm that
// invariant holds on all paths.
3436 std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI);
3437 // Limit MaxScalableVF by the maximum safe dependence distance.
3438 MaxScalableVF = ElementCount::getScalable(MaxSafeElements / *MaxVScale);
3439
// NOTE(review): the call start of the report below (line 3441) is elided
// from this listing.
3440 if (!MaxScalableVF)
3442 "Max legal vector width too small, scalable vectorization "
3443 "unfeasible.",
3444 "ScalableVFUnfeasible", ORE, TheLoop);
3445
3446 return MaxScalableVF;
3447}
3448
// Computes the pair of feasible maximum VFs (fixed and scalable) from the
// dependence-distance limits reported by LAA, honoring a user-specified VF
// when it is safe, clamping or ignoring it when it is not.
3449FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
3450 unsigned MaxTripCount, ElementCount UserVF, unsigned UserIC,
3451 bool FoldTailByMasking) {
3452 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
3453 unsigned SmallestType, WidestType;
3454 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
3455
3456 // Get the maximum safe dependence distance in bits computed by LAA.
3457 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
3458 // the memory accesses that is most restrictive (involved in the smallest
3459 // dependence distance).
3460 unsigned MaxSafeElementsPowerOf2 =
3461 bit_floor(Legal->getMaxSafeVectorWidthInBits() / WidestType);
// Additionally clamp by the safe store-load forwarding distance, if any.
3462 if (!Legal->isSafeForAnyStoreLoadForwardDistances()) {
3463 unsigned SLDist = Legal->getMaxStoreLoadForwardSafeDistanceInBits();
3464 MaxSafeElementsPowerOf2 =
3465 std::min(MaxSafeElementsPowerOf2, SLDist / WidestType);
3466 }
3467 auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElementsPowerOf2);
3468 auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElementsPowerOf2);
3469
3470 if (!Legal->isSafeForAnyVectorWidth())
3471 this->MaxSafeElements = MaxSafeElementsPowerOf2;
3472
3473 LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
3474 << ".\n");
3475 LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF
3476 << ".\n");
3477
3478 // First analyze the UserVF, fall back if the UserVF should be ignored.
3479 if (UserVF) {
3480 auto MaxSafeUserVF =
3481 UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;
3482
3483 if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) {
3484 // If `VF=vscale x N` is safe, then so is `VF=N`
3485 if (UserVF.isScalable())
3486 return FixedScalableVFPair(
3487 ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF);
3488
3489 return UserVF;
3490 }
3491
3492 assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));
3493
3494 // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it
3495 // is better to ignore the hint and let the compiler choose a suitable VF.
3496 if (!UserVF.isScalable()) {
3497 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
3498 << " is unsafe, clamping to max safe VF="
3499 << MaxSafeFixedVF << ".\n");
3500 ORE->emit([&]() {
3501 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
3502 TheLoop->getStartLoc(),
3503 TheLoop->getHeader())
3504 << "User-specified vectorization factor "
3505 << ore::NV("UserVectorizationFactor", UserVF)
3506 << " is unsafe, clamping to maximum safe vectorization factor "
3507 << ore::NV("VectorizationFactor", MaxSafeFixedVF);
3508 });
3509 return MaxSafeFixedVF;
3510 }
3511
// NOTE(review): original line 3512 was dropped by the extraction; it is
// presumably the condition guarding this branch (a check that the target does
// not support scalable vectors) — confirm against the upstream source.
3513 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
3514 << " is ignored because scalable vectors are not "
3515 "available.\n");
3516 ORE->emit([&]() {
3517 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
3518 TheLoop->getStartLoc(),
3519 TheLoop->getHeader())
3520 << "User-specified vectorization factor "
3521 << ore::NV("UserVectorizationFactor", UserVF)
3522 << " is ignored because the target does not support scalable "
3523 "vectors. The compiler will pick a more suitable value.";
3524 });
3525 } else {
3526 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
3527 << " is unsafe. Ignoring scalable UserVF.\n");
3528 ORE->emit([&]() {
3529 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
3530 TheLoop->getStartLoc(),
3531 TheLoop->getHeader())
3532 << "User-specified vectorization factor "
3533 << ore::NV("UserVectorizationFactor", UserVF)
3534 << " is unsafe. Ignoring the hint to let the compiler pick a "
3535 "more suitable value.";
3536 });
3537 }
3538 }
3539
3540 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
3541 << " / " << WidestType << " bits.\n");
3542
// Default result is scalar (VF=1); each half is upgraded below when the
// target can do better. NOTE(review): original line 3544 (the second
// constructor argument) was dropped by the extraction.
3543 FixedScalableVFPair Result(ElementCount::getFixed(1),
3545 if (auto MaxVF =
3546 getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
3547 MaxSafeFixedVF, UserIC, FoldTailByMasking))
3548 Result.FixedVF = MaxVF;
3549
3550 if (auto MaxVF =
3551 getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
3552 MaxSafeScalableVF, UserIC, FoldTailByMasking))
3553 if (MaxVF.isScalable()) {
3554 Result.ScalableVF = MaxVF;
3555 LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
3556 << "\n");
3557 }
3558
3559 return Result;
3560}
3561
// Top-level driver that decides the maximum VF pair for this loop: it rejects
// unvectorizable cases (divergent targets needing runtime checks, trip count
// of one, wrapped trip counts), then decides between a scalar epilogue and
// tail folding. NOTE(review): this listing is a lossy extraction — several
// original lines were dropped (e.g. 3563 with the function name, 3567, 3571,
// 3575, 3584, 3594, 3596, 3600, 3604, 3606, 3608, 3614, 3616, 3626-3627,
// 3636, 3711, 3715, 3752, 3756, 3759, 3762, 3768); the surviving code is
// preserved verbatim below. Confirm gaps against the upstream source.
3562FixedScalableVFPair
3564 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
3565 // TODO: It may be useful to do since it's still likely to be dynamically
3566 // uniform if the target can skip.
3568 "Not inserting runtime ptr check for divergent target",
3569 "runtime pointer checks needed. Not enabled for divergent target",
3570 "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
3572 }
3573
3574 ScalarEvolution *SE = PSE.getSE();
3576 unsigned MaxTC = PSE.getSmallConstantMaxTripCount();
3577 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
3578 if (TC != ElementCount::getFixed(MaxTC))
3579 LLVM_DEBUG(dbgs() << "LV: Found maximum trip count: " << MaxTC << '\n');
3580 if (TC.isScalar()) {
3581 reportVectorizationFailure("Single iteration (non) loop",
3582 "loop trip count is one, irrelevant for vectorization",
3583 "SingleIterationLoop", ORE, TheLoop);
3585 }
3586
3587 // If BTC matches the widest induction type and is -1 then the trip count
3588 // computation will wrap to 0 and the vector trip count will be 0. Do not try
3589 // to vectorize.
3590 const SCEV *BTC = SE->getBackedgeTakenCount(TheLoop);
3591 if (!isa<SCEVCouldNotCompute>(BTC) &&
3592 BTC->getType()->getScalarSizeInBits() >=
3593 Legal->getWidestInductionType()->getScalarSizeInBits() &&
3595 SE->getMinusOne(BTC->getType()))) {
3597 "Trip count computation wrapped",
3598 "backedge-taken count is -1, loop trip count wrapped to 0",
3599 "TripCountWrapped", ORE, TheLoop);
3601 }
3602
// Dispatch on the scalar-epilogue policy; case labels were dropped by the
// extraction (lines 3604, 3606, 3608, 3614, 3616).
3603 switch (ScalarEpilogueStatus) {
3605 return computeFeasibleMaxVF(MaxTC, UserVF, UserIC, false);
3607 [[fallthrough]];
3609 LLVM_DEBUG(
3610 dbgs() << "LV: vector predicate hint/switch found.\n"
3611 << "LV: Not allowing scalar epilogue, creating predicated "
3612 << "vector loop.\n");
3613 break;
3615 // fallthrough as a special case of OptForSize
3617 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
3618 LLVM_DEBUG(
3619 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
3620 else
3621 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
3622 << "count.\n");
3623
3624 // Bail if runtime checks are required, which are not good when optimising
3625 // for size.
3628
3629 break;
3630 }
3631
3632 // Now try the tail folding
3633
3634 // Invalidate interleave groups that require an epilogue if we can't mask
3635 // the interleave-group.
3637 assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
3638 "No decisions should have been taken at this point");
3639 // Note: There is no need to invalidate any cost modeling decisions here, as
3640 // none were taken so far.
3641 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
3642 }
3643
3644 FixedScalableVFPair MaxFactors =
3645 computeFeasibleMaxVF(MaxTC, UserVF, UserIC, true);
3646
3647 // Avoid tail folding if the trip count is known to be a multiple of any VF
3648 // we choose.
3649 std::optional<unsigned> MaxPowerOf2RuntimeVF =
3650 MaxFactors.FixedVF.getFixedValue();
3651 if (MaxFactors.ScalableVF) {
3652 std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI);
3653 if (MaxVScale) {
3654 MaxPowerOf2RuntimeVF = std::max<unsigned>(
3655 *MaxPowerOf2RuntimeVF,
3656 *MaxVScale * MaxFactors.ScalableVF.getKnownMinValue());
3657 } else
3658 MaxPowerOf2RuntimeVF = std::nullopt; // Stick with tail-folding for now.
3659 }
3660
// Returns true when (trip count) % (MaxVF * IC) == 0, i.e. the vector loop
// leaves no remainder iterations and no scalar epilogue is needed.
3661 auto NoScalarEpilogueNeeded = [this, &UserIC](unsigned MaxVF) {
3662 // Return false if the loop is neither a single-latch-exit loop nor an
3663 // early-exit loop as tail-folding is not supported in that case.
3664 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch() &&
3665 !Legal->hasUncountableEarlyExit())
3666 return false;
3667 unsigned MaxVFtimesIC = UserIC ? MaxVF * UserIC : MaxVF;
3668 ScalarEvolution *SE = PSE.getSE();
3669 // Calling getSymbolicMaxBackedgeTakenCount enables support for loops
3670 // with uncountable exits. For countable loops, the symbolic maximum must
3671 // remain identical to the known back-edge taken count.
3672 const SCEV *BackedgeTakenCount = PSE.getSymbolicMaxBackedgeTakenCount();
3673 assert((Legal->hasUncountableEarlyExit() ||
3674 BackedgeTakenCount == PSE.getBackedgeTakenCount()) &&
3675 "Invalid loop count");
3676 const SCEV *ExitCount = SE->getAddExpr(
3677 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
3678 const SCEV *Rem = SE->getURemExpr(
3679 SE->applyLoopGuards(ExitCount, TheLoop),
3680 SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
3681 return Rem->isZero();
3682 };
3683
3684 if (MaxPowerOf2RuntimeVF > 0u) {
3685 assert((UserVF.isNonZero() || isPowerOf2_32(*MaxPowerOf2RuntimeVF)) &&
3686 "MaxFixedVF must be a power of 2");
3687 if (NoScalarEpilogueNeeded(*MaxPowerOf2RuntimeVF)) {
3688 // Accept MaxFixedVF if we do not have a tail.
3689 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
3690 return MaxFactors;
3691 }
3692 }
3693
3694 auto ExpectedTC = getSmallBestKnownTC(PSE, TheLoop);
3695 if (ExpectedTC && ExpectedTC->isFixed() &&
3696 ExpectedTC->getFixedValue() <=
3697 TTI.getMinTripCountTailFoldingThreshold()) {
3698 if (MaxPowerOf2RuntimeVF > 0u) {
3699 // If we have a low-trip-count, and the fixed-width VF is known to divide
3700 // the trip count but the scalable factor does not, use the fixed-width
3701 // factor in preference to allow the generation of a non-predicated loop.
3702 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedLowTripLoop &&
3703 NoScalarEpilogueNeeded(MaxFactors.FixedVF.getFixedValue())) {
3704 LLVM_DEBUG(dbgs() << "LV: Picking a fixed-width so that no tail will "
3705 "remain for any chosen VF.\n");
3706 MaxFactors.ScalableVF = ElementCount::getScalable(0);
3707 return MaxFactors;
3708 }
3709 }
3710
// NOTE(review): original line 3711 (the head of the failure-report call) was
// dropped by the extraction; note the "minial" typo below is in the original
// remark string and must not be silently "fixed" here, as it is runtime text.
3712 "The trip count is below the minial threshold value.",
3713 "loop trip count is too low, avoiding vectorization", "LowTripCount",
3714 ORE, TheLoop);
3716 }
3717
3718 // If we don't know the precise trip count, or if the trip count that we
3719 // found modulo the vectorization factor is not zero, try to fold the tail
3720 // by masking.
3721 // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
3722 bool ContainsScalableVF = MaxFactors.ScalableVF.isNonZero();
3723 setTailFoldingStyles(ContainsScalableVF, UserIC);
3724 if (foldTailByMasking()) {
3725 if (foldTailWithEVL()) {
3726 LLVM_DEBUG(
3727 dbgs()
3728 << "LV: tail is folded with EVL, forcing unroll factor to be 1. Will "
3729 "try to generate VP Intrinsics with scalable vector "
3730 "factors only.\n");
3731 // Tail folded loop using VP intrinsics restricts the VF to be scalable
3732 // for now.
3733 // TODO: extend it for fixed vectors, if required.
3734 assert(ContainsScalableVF && "Expected scalable vector factor.");
3735
3736 MaxFactors.FixedVF = ElementCount::getFixed(1);
3737 }
3738 return MaxFactors;
3739 }
3740
3741 // If there was a tail-folding hint/switch, but we can't fold the tail by
3742 // masking, fallback to a vectorization with a scalar epilogue.
3743 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
3744 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
3745 "scalar epilogue instead.\n");
3746 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
3747 return MaxFactors;
3748 }
3749
3750 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
3751 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
3753 }
3754
3755 if (TC.isZero()) {
3757 "unable to calculate the loop count due to complex control flow",
3758 "UnknownLoopCountComplexCFG", ORE, TheLoop);
3760 }
3761
// Final fallback: tail folding failed and a scalar epilogue is forbidden by
// -Os/-Oz, so vectorization is reported as infeasible.
3763 "Cannot optimize for size and vectorize at the same time.",
3764 "cannot optimize for size and vectorize at the same time. "
3765 "Enable vectorization of this loop with '#pragma clang loop "
3766 "vectorize(enable)' when compiling with -Os/-Oz",
3767 "NoTailLoopWithOptForSize", ORE, TheLoop);
3769}
3770
// NOTE(review): the function-name line (original 3771) was dropped by the
// extraction. Presumably this is
// LoopVectorizationCostModel::shouldConsiderRegPressureForVF, based on the
// call `CM.shouldConsiderRegPressureForVF(VF)` in selectVectorizationFactor
// below — confirm against the upstream source. It decides whether register
// pressure must be computed for the given VF: forced by the command-line
// flag, requested by TTI, or required because the VF was only enabled by the
// max-bandwidth heuristic.
3772 ElementCount VF) {
3773 if (ConsiderRegPressure.getNumOccurrences())
3774 return ConsiderRegPressure;
3775
3776 // TODO: We should eventually consider register pressure for all targets. The
3777 // TTI hook is temporary whilst target-specific issues are being fixed.
3778 if (TTI.shouldConsiderVectorizationRegPressure())
3779 return true;
3780
// NOTE(review): lines 3782-3783 (the second useMaxBandwidth argument and the
// closing of this condition) were dropped by the extraction.
3781 if (!useMaxBandwidth(VF.isScalable()
3784 return false;
3785 // Only calculate register pressure for VFs enabled by MaxBandwidth.
// NOTE(review): lines 3786 and 3788 (the comparison call around these
// arguments) were dropped by the extraction.
3787 VF, VF.isScalable() ? MaxPermissibleVFWithoutMaxBW.ScalableVF
3789}
3790
// NOTE(review): the signature lines (original 3791-3792) were dropped by the
// extraction; this is the body of useMaxBandwidth (called with a RegKind from
// getMaximizedVFForTarget below). True when the -vectorizer-maximize-bandwidth
// flag is set, or when it is unset and the target (or presence of vector call
// variants) asks for maximized vector bandwidth. Line 3795 is also missing.
3793 return MaximizeBandwidth || (MaximizeBandwidth.getNumOccurrences() == 0 &&
3794 (TTI.shouldMaximizeVectorBandwidth(RegKind) ||
3796 Legal->hasVectorCallVariants())));
3797}
3798
// Clamps the candidate VF so that VF * IC does not exceed the (possibly
// adjusted) maximum trip count, avoiding a vector loop that would never run.
3799ElementCount LoopVectorizationCostModel::clampVFByMaxTripCount(
3800 ElementCount VF, unsigned MaxTripCount, unsigned UserIC,
3801 bool FoldTailByMasking) const {
// Estimate the runtime element count of a scalable VF using the function's
// vscale_range minimum, when present.
3802 unsigned EstimatedVF = VF.getKnownMinValue();
3803 if (VF.isScalable() && TheFunction->hasFnAttribute(Attribute::VScaleRange)) {
3804 auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange);
3805 auto Min = Attr.getVScaleRangeMin();
3806 EstimatedVF *= Min;
3807 }
3808
3809 // When a scalar epilogue is required, at least one iteration of the scalar
3810 // loop has to execute. Adjust MaxTripCount accordingly to avoid picking a
3811 // max VF that results in a dead vector loop.
3812 if (MaxTripCount > 0 && requiresScalarEpilogue(true))
3813 MaxTripCount -= 1;
3814
3815 // When the user specifies an interleave count, we need to ensure that
3816 // VF * UserIC <= MaxTripCount to avoid a dead vector loop.
3817 unsigned IC = UserIC > 0 ? UserIC : 1;
3818 unsigned EstimatedVFTimesIC = EstimatedVF * IC;
3819
3820 if (MaxTripCount && MaxTripCount <= EstimatedVFTimesIC &&
3821 (!FoldTailByMasking || isPowerOf2_32(MaxTripCount))) {
3822 // If upper bound loop trip count (TC) is known at compile time there is no
3823 // point in choosing VF greater than TC / IC (as done in the loop below).
3824 // Select maximum power of two which doesn't exceed TC / IC. If VF is
3825 // scalable, we only fall back on a fixed VF when the TC is less than or
3826 // equal to the known number of lanes.
3827 auto ClampedUpperTripCount = llvm::bit_floor(MaxTripCount / IC);
// Guard against MaxTripCount < IC, where bit_floor(0) would yield VF=0.
3828 if (ClampedUpperTripCount == 0)
3829 ClampedUpperTripCount = 1;
3830 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
3831 "exceeding the constant trip count"
3832 << (UserIC > 0 ? " divided by UserIC" : "") << ": "
3833 << ClampedUpperTripCount << "\n");
// The clamped VF stays scalable only under tail folding; otherwise it falls
// back to a fixed count.
3834 return ElementCount::get(ClampedUpperTripCount,
3835 FoldTailByMasking ? VF.isScalable() : false);
3836 }
3837 return VF;
3838}
3839
// Derives the largest profitable VF for the target from register width and
// element types, clamped by the safe VF and the maximum trip count;
// optionally widens further under the max-bandwidth heuristic.
3840ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
3841 unsigned MaxTripCount, unsigned SmallestType, unsigned WidestType,
3842 ElementCount MaxSafeVF, unsigned UserIC, bool FoldTailByMasking) {
3843 bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
// NOTE(review): original line 3846 (the RGK_FixedWidthVector alternative of
// this conditional) was dropped by the extraction.
3844 const TypeSize WidestRegister = TTI.getRegisterBitWidth(
3845 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
3847
3848 // Convenience function to return the minimum of two ElementCounts.
3849 auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
3850 assert((LHS.isScalable() == RHS.isScalable()) &&
3851 "Scalable flags must match");
3852 return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS;
3853 };
3854
3855 // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
3856 // Note that both WidestRegister and WidestType may not be a powers of 2.
3857 auto MaxVectorElementCount = ElementCount::get(
3858 llvm::bit_floor(WidestRegister.getKnownMinValue() / WidestType),
3859 ComputeScalableMaxVF);
3860 MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
3861 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
3862 << (MaxVectorElementCount * WidestType) << " bits.\n");
3863
3864 if (!MaxVectorElementCount) {
3865 LLVM_DEBUG(dbgs() << "LV: The target has no "
3866 << (ComputeScalableMaxVF ? "scalable" : "fixed")
3867 << " vector registers.\n");
3868 return ElementCount::getFixed(1);
3869 }
3870
3871 ElementCount MaxVF = clampVFByMaxTripCount(
3872 MaxVectorElementCount, MaxTripCount, UserIC, FoldTailByMasking);
3873 // If the MaxVF was already clamped, there's no point in trying to pick a
3874 // larger one.
3875 if (MaxVF != MaxVectorElementCount)
3876 return MaxVF;
3877
// NOTE(review): lines 3878 and 3880 (the declaration of RegKind around this
// conditional) were dropped by the extraction; RegKind is used by
// useMaxBandwidth(RegKind) below.
3879 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
3881
// Record the pre-max-bandwidth VF so that shouldConsiderRegPressureForVF can
// later tell which VFs were only enabled by the max-bandwidth heuristic.
3882 if (MaxVF.isScalable())
3883 MaxPermissibleVFWithoutMaxBW.ScalableVF = MaxVF;
3884 else
3885 MaxPermissibleVFWithoutMaxBW.FixedVF = MaxVF;
3886
3887 if (useMaxBandwidth(RegKind)) {
// Under max-bandwidth, size the VF by the smallest element type instead of
// the widest, allowing more lanes per register.
3888 auto MaxVectorElementCountMaxBW = ElementCount::get(
3889 llvm::bit_floor(WidestRegister.getKnownMinValue() / SmallestType),
3890 ComputeScalableMaxVF);
3891 MaxVF = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);
3892
3893 if (ElementCount MinVF =
3894 TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) {
3895 if (ElementCount::isKnownLT(MaxVF, MinVF)) {
3896 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
3897 << ") with target's minimum: " << MinVF << '\n');
3898 MaxVF = MinVF;
3899 }
3900 }
3901
3902 MaxVF =
3903 clampVFByMaxTripCount(MaxVF, MaxTripCount, UserIC, FoldTailByMasking);
3904
3905 if (MaxVectorElementCount != MaxVF) {
3906 // Invalidate any widening decisions we might have made, in case the loop
3907 // requires prediction (decided later), but we have already made some
3908 // load/store widening decisions.
3909 invalidateCostModelingDecisions();
3910 }
3911 }
3912 return MaxVF;
3913}
3914
// Returns true when vectorization factor A is more profitable than B,
// comparing per-lane costs (or total loop cost when a constant max trip count
// is known), with a tie-break that slightly favors scalable VFs.
3915bool LoopVectorizationPlanner::isMoreProfitable(const VectorizationFactor &A,
3916 const VectorizationFactor &B,
3917 const unsigned MaxTripCount,
3918 bool HasTail,
3919 bool IsEpilogue) const {
3920 InstructionCost CostA = A.Cost;
3921 InstructionCost CostB = B.Cost;
3922
3923 // Improve estimate for the vector width if it is scalable.
3924 unsigned EstimatedWidthA = A.Width.getKnownMinValue();
3925 unsigned EstimatedWidthB = B.Width.getKnownMinValue();
3926 if (std::optional<unsigned> VScale = CM.getVScaleForTuning()) {
3927 if (A.Width.isScalable())
3928 EstimatedWidthA *= *VScale;
3929 if (B.Width.isScalable())
3930 EstimatedWidthB *= *VScale;
3931 }
3932
3933 // When optimizing for size choose whichever is smallest, which will be the
3934 // one with the smallest cost for the whole loop. On a tie pick the larger
3935 // vector width, on the assumption that throughput will be greater.
3936 if (CM.CostKind == TTI::TCK_CodeSize)
3937 return CostA < CostB ||
3938 (CostA == CostB && EstimatedWidthA > EstimatedWidthB);
3939
3940 // Assume vscale may be larger than 1 (or the value being tuned for),
3941 // so that scalable vectorization is slightly favorable over fixed-width
3942 // vectorization.
3943 bool PreferScalable = !TTI.preferFixedOverScalableIfEqualCost(IsEpilogue) &&
3944 A.Width.isScalable() && !B.Width.isScalable();
3945
// "<=" instead of "<" implements the scalable preference: equal cost counts
// as a win for the scalable candidate A.
3946 auto CmpFn = [PreferScalable](const InstructionCost &LHS,
3947 const InstructionCost &RHS) {
3948 return PreferScalable ? LHS <= RHS : LHS < RHS;
3949 };
3950
3951 // To avoid the need for FP division:
3952 // (CostA / EstimatedWidthA) < (CostB / EstimatedWidthB)
3953 // <=> (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA)
3954 if (!MaxTripCount)
3955 return CmpFn(CostA * EstimatedWidthB, CostB * EstimatedWidthA);
3956
3957 auto GetCostForTC = [MaxTripCount, HasTail](unsigned VF,
3958 InstructionCost VectorCost,
3959 InstructionCost ScalarCost) {
3960 // If the trip count is a known (possibly small) constant, the trip count
3961 // will be rounded up to an integer number of iterations under
3962 // FoldTailByMasking. The total cost in that case will be
3963 // VecCost*ceil(TripCount/VF). When not folding the tail, the total
3964 // cost will be VecCost*floor(TC/VF) + ScalarCost*(TC%VF). There will be
3965 // some extra overheads, but for the purpose of comparing the costs of
3966 // different VFs we can use this to compare the total loop-body cost
3967 // expected after vectorization.
3968 if (HasTail)
3969 return VectorCost * (MaxTripCount / VF) +
3970 ScalarCost * (MaxTripCount % VF);
3971 return VectorCost * divideCeil(MaxTripCount, VF);
3972 };
3973
3974 auto RTCostA = GetCostForTC(EstimatedWidthA, CostA, A.ScalarCost);
3975 auto RTCostB = GetCostForTC(EstimatedWidthB, CostB, B.ScalarCost);
3976 return CmpFn(RTCostA, RTCostB);
3977}
3978
// Convenience overload: forwards to the five-argument isMoreProfitable with
// the loop's small constant maximum trip count from PSE.
3979bool LoopVectorizationPlanner::isMoreProfitable(const VectorizationFactor &A,
3980 const VectorizationFactor &B,
3981 bool HasTail,
3982 bool IsEpilogue) const {
3983 const unsigned MaxTripCount = PSE.getSmallConstantMaxTripCount();
3984 return LoopVectorizationPlanner::isMoreProfitable(A, B, MaxTripCount, HasTail,
3985 IsEpilogue);
3986}
3987
// NOTE(review): the signature lines (original 3988-3989) were dropped by the
// extraction; this body collects all (recipe, VF) pairs whose VPlan-based
// cost is invalid and emits one optimization remark per recipe, grouping the
// VFs — confirm the function name against the upstream source.
3990 using RecipeVFPair = std::pair<VPRecipeBase *, ElementCount>;
3991 SmallVector<RecipeVFPair> InvalidCosts;
3992 for (const auto &Plan : VPlans) {
3993 for (ElementCount VF : Plan->vectorFactors()) {
3994 // The VPlan-based cost model is designed for computing vector cost.
3995 // Querying VPlan-based cost model with a scarlar VF will cause some
3996 // errors because we expect the VF is vector for most of the widen
3997 // recipes.
3998 if (VF.isScalar())
3999 continue;
4000
4001 VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind, CM.PSE,
4002 OrigLoop);
4003 precomputeCosts(*Plan, VF, CostCtx);
// NOTE(review): original line 4005 (the basic-block loop header over Iter)
// was dropped by the extraction.
4004 auto Iter = vp_depth_first_deep(Plan->getVectorLoopRegion()->getEntry());
4006 for (auto &R : *VPBB) {
4007 if (!R.cost(VF, CostCtx).isValid())
4008 InvalidCosts.emplace_back(&R, VF);
4009 }
4010 }
4011 }
4012 }
4013 if (InvalidCosts.empty())
4014 return;
4015
4016 // Emit a report of VFs with invalid costs in the loop.
4017
4018 // Group the remarks per recipe, keeping the recipe order from InvalidCosts.
// NOTE(review): original line 4019 (the declaration of the Numbering map) was
// dropped by the extraction.
4020 unsigned I = 0;
4021 for (auto &Pair : InvalidCosts)
4022 if (Numbering.try_emplace(Pair.first, I).second)
4023 ++I;
4024
4025 // Sort the list, first on recipe(number) then on VF.
4026 sort(InvalidCosts, [&Numbering](RecipeVFPair &A, RecipeVFPair &B) {
4027 unsigned NA = Numbering[A.first];
4028 unsigned NB = Numbering[B.first];
4029 if (NA != NB)
4030 return NA < NB;
4031 return ElementCount::isKnownLT(A.second, B.second);
4032 });
4033
4034 // For a list of ordered recipe-VF pairs:
4035 // [(load, VF1), (load, VF2), (store, VF1)]
4036 // group the recipes together to emit separate remarks for:
4037 // load (VF1, VF2)
4038 // store (VF1)
4039 auto Tail = ArrayRef<RecipeVFPair>(InvalidCosts);
4040 auto Subset = ArrayRef<RecipeVFPair>();
4041 do {
4042 if (Subset.empty())
4043 Subset = Tail.take_front(1);
4044
4045 VPRecipeBase *R = Subset.front().first;
4046
// Map the recipe kind to an IR opcode for the remark text. NOTE(review):
// lines 4048 and 4055-4056 (the TypeSwitch head and one .Case<> list) were
// dropped by the extraction.
4047 unsigned Opcode =
4049 .Case([](const VPHeaderPHIRecipe *R) { return Instruction::PHI; })
4050 .Case(
4051 [](const VPWidenStoreRecipe *R) { return Instruction::Store; })
4052 .Case([](const VPWidenLoadRecipe *R) { return Instruction::Load; })
4053 .Case<VPWidenCallRecipe, VPWidenIntrinsicRecipe>(
4054 [](const auto *R) { return Instruction::Call; })
4057 [](const auto *R) { return R->getOpcode(); })
4058 .Case([](const VPInterleaveRecipe *R) {
4059 return R->getStoredValues().empty() ? Instruction::Load
4060 : Instruction::Store;
4061 })
4062 .Case([](const VPReductionRecipe *R) {
4063 return RecurrenceDescriptor::getOpcode(R->getRecurrenceKind());
4064 });
4065
4066 // If the next recipe is different, or if there are no other pairs,
4067 // emit a remark for the collated subset. e.g.
4068 // [(load, VF1), (load, VF2))]
4069 // to emit:
4070 // remark: invalid costs for 'load' at VF=(VF1, VF2)
4071 if (Subset == Tail || Tail[Subset.size()].first != R) {
4072 std::string OutString;
4073 raw_string_ostream OS(OutString);
4074 assert(!Subset.empty() && "Unexpected empty range");
4075 OS << "Recipe with invalid costs prevented vectorization at VF=(";
4076 for (const auto &Pair : Subset)
4077 OS << (Pair.second == Subset.front().second ? "" : ", ") << Pair.second;
4078 OS << "):";
4079 if (Opcode == Instruction::Call) {
4080 StringRef Name = "";
4081 if (auto *Int = dyn_cast<VPWidenIntrinsicRecipe>(R)) {
4082 Name = Int->getIntrinsicName();
4083 } else {
4084 auto *WidenCall = dyn_cast<VPWidenCallRecipe>(R);
4085 Function *CalledFn =
4086 WidenCall ? WidenCall->getCalledScalarFunction()
4087 : cast<Function>(R->getOperand(R->getNumOperands() - 1)
4088 ->getLiveInIRValue());
4089 Name = CalledFn->getName();
4090 }
4091 OS << " call to " << Name;
4092 } else
4093 OS << " " << Instruction::getOpcodeName(Opcode);
4094 reportVectorizationInfo(OutString, "InvalidCost", ORE, OrigLoop, nullptr,
4095 R->getDebugLoc());
4096 Tail = Tail.drop_front(Subset.size());
4097 Subset = {};
4098 } else
4099 // Grow the subset by one element
4100 Subset = Tail.take_front(Subset.size() + 1);
4101 } while (!Tail.empty());
4102}
4103
4104/// Check if any recipe of \p Plan will generate a vector value, which will be
4105/// assigned a vector register.
// NOTE(review): original line 4106 (the rest of the signature — the function
// name and first parameters) was dropped by the extraction; this is
// willGenerateVectors, per the call `willGenerateVectors(*P, VF, TTI)` in
// selectVectorizationFactor below.
4107 const TargetTransformInfo &TTI) {
4108 assert(VF.isVector() && "Checking a scalar VF?");
4109 VPTypeAnalysis TypeInfo(Plan);
4110 DenseSet<VPRecipeBase *> EphemeralRecipes;
4111 collectEphemeralRecipesForVPlan(Plan, EphemeralRecipes);
4112 // Set of already visited types.
4113 DenseSet<Type *> Visited;
// NOTE(review): lines 4114-4115 (the loop over the plan's basic blocks) were
// dropped by the extraction.
4116 for (VPRecipeBase &R : *VPBB) {
4117 if (EphemeralRecipes.contains(&R))
4118 continue;
4119 // Continue early if the recipe is considered to not produce a vector
4120 // result. Note that this includes VPInstruction where some opcodes may
4121 // produce a vector, to preserve existing behavior as VPInstructions model
4122 // aspects not directly mapped to existing IR instructions.
4123 switch (R.getVPRecipeID()) {
4124 case VPRecipeBase::VPDerivedIVSC:
4125 case VPRecipeBase::VPScalarIVStepsSC:
4126 case VPRecipeBase::VPReplicateSC:
4127 case VPRecipeBase::VPInstructionSC:
4128 case VPRecipeBase::VPCanonicalIVPHISC:
4129 case VPRecipeBase::VPCurrentIterationPHISC:
4130 case VPRecipeBase::VPVectorPointerSC:
4131 case VPRecipeBase::VPVectorEndPointerSC:
4132 case VPRecipeBase::VPExpandSCEVSC:
4133 case VPRecipeBase::VPPredInstPHISC:
4134 case VPRecipeBase::VPBranchOnMaskSC:
4135 continue;
4136 case VPRecipeBase::VPReductionSC:
4137 case VPRecipeBase::VPActiveLaneMaskPHISC:
4138 case VPRecipeBase::VPWidenCallSC:
4139 case VPRecipeBase::VPWidenCanonicalIVSC:
4140 case VPRecipeBase::VPWidenCastSC:
4141 case VPRecipeBase::VPWidenGEPSC:
4142 case VPRecipeBase::VPWidenIntrinsicSC:
4143 case VPRecipeBase::VPWidenSC:
4144 case VPRecipeBase::VPBlendSC:
4145 case VPRecipeBase::VPFirstOrderRecurrencePHISC:
4146 case VPRecipeBase::VPHistogramSC:
4147 case VPRecipeBase::VPWidenPHISC:
4148 case VPRecipeBase::VPWidenIntOrFpInductionSC:
4149 case VPRecipeBase::VPWidenPointerInductionSC:
4150 case VPRecipeBase::VPReductionPHISC:
4151 case VPRecipeBase::VPInterleaveEVLSC:
4152 case VPRecipeBase::VPInterleaveSC:
4153 case VPRecipeBase::VPWidenLoadEVLSC:
4154 case VPRecipeBase::VPWidenLoadSC:
4155 case VPRecipeBase::VPWidenStoreEVLSC:
4156 case VPRecipeBase::VPWidenStoreSC:
4157 break;
4158 default:
4159 llvm_unreachable("unhandled recipe");
4160 }
4161
// True when legalizing VectorTy at this VF still yields target vector
// registers rather than scalarized parts.
4162 auto WillGenerateTargetVectors = [&TTI, VF](Type *VectorTy) {
4163 unsigned NumLegalParts = TTI.getNumberOfParts(VectorTy);
4164 if (!NumLegalParts)
4165 return false;
4166 if (VF.isScalable()) {
4167 // <vscale x 1 x iN> is assumed to be profitable over iN because
4168 // scalable registers are a distinct register class from scalar
4169 // ones. If we ever find a target which wants to lower scalable
4170 // vectors back to scalars, we'll need to update this code to
4171 // explicitly ask TTI about the register class uses for each part.
4172 return NumLegalParts <= VF.getKnownMinValue();
4173 }
4174 // Two or more elements that share a register - are vectorized.
4175 return NumLegalParts < VF.getFixedValue();
4176 };
4177
4178 // If no def nor is a store, e.g., branches, continue - no value to check.
// NOTE(review): original line 4180 (the second half of this condition) was
// dropped by the extraction.
4179 if (R.getNumDefinedValues() == 0 &&
4181 continue;
4182 // For multi-def recipes, currently only interleaved loads, suffice to
4183 // check first def only.
4184 // For stores check their stored value; for interleaved stores suffice
4185 // the check first stored value only. In all cases this is the second
4186 // operand.
4187 VPValue *ToCheck =
4188 R.getNumDefinedValues() >= 1 ? R.getVPValue(0) : R.getOperand(1);
4189 Type *ScalarTy = TypeInfo.inferScalarType(ToCheck);
4190 if (!Visited.insert({ScalarTy}).second)
4191 continue;
4192 Type *WideTy = toVectorizedTy(ScalarTy, VF);
4193 if (any_of(getContainedTypes(WideTy), WillGenerateTargetVectors))
4194 return true;
4195 }
4196 }
4197
4198 return false;
4199}
4200
// Returns true when the plan's vector loop region contains a replicator
// region (per-lane replicated block). NOTE(review): original line 4202
// (presumably the head of an any_of over region blocks reached from the
// entry) was dropped by the extraction.
4201static bool hasReplicatorRegion(VPlan &Plan) {
4203 Plan.getVectorLoopRegion()->getEntry())),
4204 [](auto *VPRB) { return VPRB->isReplicator(); });
4205}
4206
4207#ifndef NDEBUG
// Debug-build (legacy cost model) VF selection: evaluates each candidate VF
// of every VPlan against the scalar baseline and returns the most profitable
// factor. NOTE(review): this listing is a lossy extraction — original lines
// 4234, 4286, 4292 and 4334 were dropped; the surviving code is preserved
// verbatim. Confirm gaps against the upstream source.
4208VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
4209 InstructionCost ExpectedCost = CM.expectedCost(ElementCount::getFixed(1));
4210 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
4211 assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
4212 assert(
4213 any_of(VPlans,
4214 [](std::unique_ptr<VPlan> &P) { return P->hasScalarVFOnly(); }) &&
4215 "Expected Scalar VF to be a candidate");
4216
4217 const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost,
4218 ExpectedCost);
4219 VectorizationFactor ChosenFactor = ScalarCost;
4220
4221 bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
4222 if (ForceVectorization &&
4223 (VPlans.size() > 1 || !VPlans[0]->hasScalarVFOnly())) {
4224 // Ignore scalar width, because the user explicitly wants vectorization.
4225 // Initialize cost to max so that VF = 2 is, at least, chosen during cost
4226 // evaluation.
4227 ChosenFactor.Cost = InstructionCost::getMax();
4228 }
4229
4230 for (auto &P : VPlans) {
4231 ArrayRef<ElementCount> VFs(P->vectorFactors().begin(),
4232 P->vectorFactors().end());
4233
// NOTE(review): original line 4234 (presumably the declaration of RUs, the
// per-VF register-usage vector) was dropped by the extraction.
4235 if (any_of(VFs, [this](ElementCount VF) {
4236 return CM.shouldConsiderRegPressureForVF(VF);
4237 }))
4238 RUs = calculateRegisterUsageForPlan(*P, VFs, TTI, CM.ValuesToIgnore);
4239
4240 for (unsigned I = 0; I < VFs.size(); I++) {
4241 ElementCount VF = VFs[I];
4242 // The cost for scalar VF=1 is already calculated, so ignore it.
4243 if (VF.isScalar())
4244 continue;
4245
4246 /// If the register pressure needs to be considered for VF,
4247 /// don't consider the VF as valid if it exceeds the number
4248 /// of registers for the target.
4249 if (CM.shouldConsiderRegPressureForVF(VF) &&
4250 RUs[I].exceedsMaxNumRegs(TTI, ForceTargetNumVectorRegs))
4251 continue;
4252
4253 InstructionCost C = CM.expectedCost(VF);
4254
4255 // Add on other costs that are modelled in VPlan, but not in the legacy
4256 // cost model.
4257 VPCostContext CostCtx(CM.TTI, *CM.TLI, *P, CM, CM.CostKind, CM.PSE,
4258 OrigLoop);
4259 VPRegionBlock *VectorRegion = P->getVectorLoopRegion();
4260 assert(VectorRegion && "Expected to have a vector region!");
4261 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
4262 vp_depth_first_shallow(VectorRegion->getEntry()))) {
4263 for (VPRecipeBase &R : *VPBB) {
4264 auto *VPI = dyn_cast<VPInstruction>(&R);
4265 if (!VPI)
4266 continue;
4267 switch (VPI->getOpcode()) {
4268 // Selects are only modelled in the legacy cost model for safe
4269 // divisors.
4270 case Instruction::Select: {
4271 if (auto *WR =
4272 dyn_cast_or_null<VPWidenRecipe>(VPI->getSingleUser())) {
4273 switch (WR->getOpcode()) {
4274 case Instruction::UDiv:
4275 case Instruction::SDiv:
4276 case Instruction::URem:
4277 case Instruction::SRem:
4278 continue;
4279 default:
4280 break;
4281 }
4282 }
4283 C += VPI->cost(VF, CostCtx);
4284 break;
4285 }
// NOTE(review): original lines 4286 and 4292 (the case labels for these two
// VPInstruction opcodes) were dropped by the extraction.
4287 unsigned Multiplier =
4288 cast<VPConstantInt>(VPI->getOperand(2))->getZExtValue();
4289 C += VPI->cost(VF * Multiplier, CostCtx);
4290 break;
4291 }
4293 C += VPI->cost(VF, CostCtx);
4294 break;
4295 default:
4296 break;
4297 }
4298 }
4299 }
4300
4301 VectorizationFactor Candidate(VF, C, ScalarCost.ScalarCost);
4302 unsigned Width =
4303 estimateElementCount(Candidate.Width, CM.getVScaleForTuning());
4304 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << VF
4305 << " costs: " << (Candidate.Cost / Width));
4306 if (VF.isScalable())
4307 LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
4308 << CM.getVScaleForTuning().value_or(1) << ")");
4309 LLVM_DEBUG(dbgs() << ".\n");
4310
4311 if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) {
4312 LLVM_DEBUG(
4313 dbgs()
4314 << "LV: Not considering vector loop of width " << VF
4315 << " because it will not generate any vector instructions.\n");
4316 continue;
4317 }
4318
4319 if (CM.OptForSize && !ForceVectorization && hasReplicatorRegion(*P)) {
4320 LLVM_DEBUG(
4321 dbgs()
4322 << "LV: Not considering vector loop of width " << VF
4323 << " because it would cause replicated blocks to be generated,"
4324 << " which isn't allowed when optimizing for size.\n");
4325 continue;
4326 }
4327
4328 if (isMoreProfitable(Candidate, ChosenFactor, P->hasScalarTail()))
4329 ChosenFactor = Candidate;
4330 }
4331 }
4332
// Conditional stores force a fallback to the scalar factor when their
// vectorization is disabled. NOTE(review): line 4334 (the report-call head)
// was dropped by the extraction.
4333 if (!EnableCondStoresVectorization && CM.hasPredStores()) {
4335 "There are conditional stores.",
4336 "store that is conditionally executed prevents vectorization",
4337 "ConditionalStore", ORE, OrigLoop);
4338 ChosenFactor = ScalarCost;
4339 }
4340
4341 LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
4342 !isMoreProfitable(ChosenFactor, ScalarCost,
4343 !CM.foldTailByMasking())) dbgs()
4344 << "LV: Vectorization seems to be not beneficial, "
4345 << "but was forced by a user.\n");
4346 return ChosenFactor;
4347}
4348#endif
4349
/// Returns true if the VPlan contains a VPReductionPHIRecipe with
/// FindLast recurrence kind.
static bool hasFindLastReductionPhi(VPlan &Plan) {
                  // Predicate: the recipe is a reduction header phi whose
                  // recurrence kind is one of the FindLast kinds.
                  [](VPRecipeBase &R) {
                    auto *RedPhi = dyn_cast<VPReductionPHIRecipe>(&R);
                    return RedPhi &&
                           RecurrenceDescriptor::isFindLastRecurrenceKind(
                               RedPhi->getRecurrenceKind());
                  });
}
4361
/// Returns true if the VPlan contains header phi recipes that are not currently
/// supported for epilogue vectorization.
  return any_of(
      // Predicate over the plan's header phi recipes.
      [](VPRecipeBase &R) {
        // A widened int/FP induction is unsupported when it has no
        // underlying IR PHI node to anchor on.
        if (auto *WidenInd = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R))
          return !WidenInd->getPHINode();
        // A reduction phi is unsupported when it is a FindLast reduction or
        // lacks an underlying IR value.
        auto *RedPhi = dyn_cast<VPReductionPHIRecipe>(&R);
        return RedPhi && (RecurrenceDescriptor::isFindLastRecurrenceKind(
                              RedPhi->getRecurrenceKind()) ||
                          !RedPhi->getUnderlyingValue());
      });
}
4376
4377bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
4378 ElementCount VF) const {
4379 // Cross iteration phis such as fixed-order recurrences and FMaxNum/FMinNum
4380 // reductions need special handling and are currently unsupported.
4381 if (any_of(OrigLoop->getHeader()->phis(), [&](PHINode &Phi) {
4382 if (!Legal->isReductionVariable(&Phi))
4383 return Legal->isFixedOrderRecurrence(&Phi);
4384 RecurKind Kind =
4385 Legal->getRecurrenceDescriptor(&Phi).getRecurrenceKind();
4386 return RecurrenceDescriptor::isFPMinMaxNumRecurrenceKind(Kind);
4387 }))
4388 return false;
4389
4390 // FindLast reductions and inductions without underlying PHI require special
4391 // handling and are currently not supported for epilogue vectorization.
4392 if (hasUnsupportedHeaderPhiRecipe(getPlanFor(VF)))
4393 return false;
4394
4395 // Phis with uses outside of the loop require special handling and are
4396 // currently unsupported.
4397 for (const auto &Entry : Legal->getInductionVars()) {
4398 // Look for uses of the value of the induction at the last iteration.
4399 Value *PostInc =
4400 Entry.first->getIncomingValueForBlock(OrigLoop->getLoopLatch());
4401 for (User *U : PostInc->users())
4402 if (!OrigLoop->contains(cast<Instruction>(U)))
4403 return false;
4404 // Look for uses of penultimate value of the induction.
4405 for (User *U : Entry.first->users())
4406 if (!OrigLoop->contains(cast<Instruction>(U)))
4407 return false;
4408 }
4409
4410 // Epilogue vectorization code has not been auditted to ensure it handles
4411 // non-latch exits properly. It may be fine, but it needs auditted and
4412 // tested.
4413 // TODO: Add support for loops with an early exit.
4414 if (OrigLoop->getExitingBlock() != OrigLoop->getLoopLatch())
4415 return false;
4416
4417 return true;
4418}
4419
    const ElementCount VF, const unsigned IC) const {
  // FIXME: We need a much better cost-model to take different parameters such
  // as register pressure, code size increase and cost of extra branches into
  // account. For now we apply a very crude heuristic and only consider loops
  // with vectorization factors larger than a certain value.

  // Allow the target to opt out.
  if (!TTI.preferEpilogueVectorization(VF * IC))
    return false;

  // A command-line override of the minimum VF, when given, takes precedence
  // over the target's value.
  unsigned MinVFThreshold = EpilogueVectorizationMinVF.getNumOccurrences() > 0
                                : TTI.getEpilogueVectorizationMinVF();
  // Profitable when the main loop's estimated per-iteration element count
  // (VF * IC, scaled by vscale for scalable VFs) reaches the threshold.
  return estimateElementCount(VF * IC, VScaleForTuning) >= MinVFThreshold;
}
4436
    const ElementCount MainLoopVF, unsigned IC) {
    LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n");
    return Result;
  }

  // Epilogue vectorization requires a scalar epilogue to attach to.
  if (!CM.isScalarEpilogueAllowed()) {
    LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because no "
                         "epilogue is allowed.\n");
    return Result;
  }

  // Not really a cost consideration, but check for unsupported cases here to
  // simplify the logic.
  if (!isCandidateForEpilogueVectorization(MainLoopVF)) {
    LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because the loop "
                         "is not a supported candidate.\n");
    return Result;
  }

    LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n");
    // Honor a forced factor only if a VPlan exists for it.
    if (hasPlanWithVF(ForcedEC))
      return {ForcedEC, 0, 0};

    LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization forced factor is not "
                         "viable.\n");
    return Result;
  }

  if (OrigLoop->getHeader()->getParent()->hasOptSize()) {
    LLVM_DEBUG(
        dbgs() << "LEV: Epilogue vectorization skipped due to opt for size.\n");
    return Result;
  }

  if (!CM.isEpilogueVectorizationProfitable(MainLoopVF, IC)) {
    LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "
                         "this loop\n");
    return Result;
  }

  // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know
  // the main loop handles 8 lanes per iteration. We could still benefit from
  // vectorizing the epilogue loop with VF=4.
  ElementCount EstimatedRuntimeVF = ElementCount::getFixed(
      estimateElementCount(MainLoopVF, CM.getVScaleForTuning()));

  Type *TCType = Legal->getWidestInductionType();
  const SCEV *RemainingIterations = nullptr;
  unsigned MaxTripCount = 0;
      getPlanFor(MainLoopVF).getTripCount(), PSE);
  assert(!isa<SCEVCouldNotCompute>(TC) && "Trip count SCEV must be computable");
  const SCEV *KnownMinTC;
  // Detect a trip count of the form KnownMinTC * vscale.
  bool ScalableTC = match(TC, m_scev_c_Mul(m_SCEV(KnownMinTC), m_SCEVVScale()));
  bool ScalableRemIter = false;
  ScalarEvolution &SE = *PSE.getSE();
  // Use versions of TC and VF in which both are either scalable or fixed.
  if (ScalableTC == MainLoopVF.isScalable()) {
    ScalableRemIter = ScalableTC;
    // Remaining iterations = TC urem (VF * IC).
    RemainingIterations =
        SE.getURemExpr(TC, SE.getElementCount(TCType, MainLoopVF * IC));
  } else if (ScalableTC) {
    // Scalable TC but fixed VF: estimate TC with the tuning vscale first.
    const SCEV *EstimatedTC = SE.getMulExpr(
        KnownMinTC,
        SE.getConstant(TCType, CM.getVScaleForTuning().value_or(1)));
    RemainingIterations = SE.getURemExpr(
        EstimatedTC, SE.getElementCount(TCType, MainLoopVF * IC));
  } else
    RemainingIterations =
        SE.getURemExpr(TC, SE.getElementCount(TCType, EstimatedRuntimeVF * IC));

  // No iterations left to process in the epilogue.
  if (RemainingIterations->isZero())
    return Result;

  if (MainLoopVF.isFixed()) {
    // The epilogue runs at most VF * IC - 1 iterations; tighten using SCEV's
    // unsigned range on the remainder when possible.
    MaxTripCount = MainLoopVF.getFixedValue() * IC - 1;
    if (SE.isKnownPredicate(CmpInst::ICMP_ULT, RemainingIterations,
                            SE.getConstant(TCType, MaxTripCount))) {
      MaxTripCount = SE.getUnsignedRangeMax(RemainingIterations).getZExtValue();
    }
    LLVM_DEBUG(dbgs() << "LEV: Maximum Trip Count for Epilogue: "
                      << MaxTripCount << "\n");
  }

  // True when VF is provably greater than the remaining iterations.
  auto SkipVF = [&](const SCEV *VF, const SCEV *RemIter) -> bool {
    return SE.isKnownPredicate(CmpInst::ICMP_UGT, VF, RemIter);
  };
  for (auto &NextVF : ProfitableVFs) {
    // Skip candidate VFs without a corresponding VPlan.
    if (!hasPlanWithVF(NextVF.Width))
      continue;

    // Skip candidate VFs with widths >= the (estimated) runtime VF (scalable
    // vectors) or > the VF of the main loop (fixed vectors).
    if ((!NextVF.Width.isScalable() && MainLoopVF.isScalable() &&
         ElementCount::isKnownGE(NextVF.Width, EstimatedRuntimeVF)) ||
        (NextVF.Width.isScalable() &&
         ElementCount::isKnownGE(NextVF.Width, MainLoopVF)) ||
        (!NextVF.Width.isScalable() && !MainLoopVF.isScalable() &&
         ElementCount::isKnownGT(NextVF.Width, MainLoopVF)))
      continue;

    // If NextVF is greater than the number of remaining iterations, the
    // epilogue loop would be dead. Skip such factors.
    // TODO: We should also consider comparing against a scalable
    // RemainingIterations when SCEV be able to evaluate non-canonical
    // vscale-based expressions.
    if (!ScalableRemIter) {
      // Handle the case where NextVF and RemainingIterations are in different
      // numerical spaces.
      ElementCount EC = NextVF.Width;
      if (NextVF.Width.isScalable())
            estimateElementCount(NextVF.Width, CM.getVScaleForTuning()));
      if (SkipVF(SE.getElementCount(TCType, EC), RemainingIterations))
        continue;
    }

    // Keep the most profitable viable candidate; any vector result beats the
    // initial scalar placeholder.
    if (Result.Width.isScalar() ||
        isMoreProfitable(NextVF, Result, MaxTripCount, !CM.foldTailByMasking(),
                         /*IsEpilogue*/ true))
      Result = NextVF;
  }

  if (Result != VectorizationFactor::Disabled())
    LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
                      << Result.Width << "\n");
  return Result;
}
4572
std::pair<unsigned, unsigned>
  // Smallest element width seen so far; -1U so the first candidate wins.
  unsigned MinWidth = -1U;
  // Widest element width seen so far; result is floored at 8 bits.
  unsigned MaxWidth = 8;
  const DataLayout &DL = TheFunction->getDataLayout();
  // For in-loop reductions, no element types are added to ElementTypesInLoop
  // if there are no loads/stores in the loop. In this case, check through the
  // reduction variables to determine the maximum width.
  if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) {
    for (const auto &PhiDescriptorPair : Legal->getReductionVars()) {
      const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second;
      // When finding the min width used by the recurrence we need to account
      // for casts on the input operands of the recurrence.
      MinWidth = std::min(
          MinWidth,
          std::min(RdxDesc.getMinWidthCastToRecurrenceTypeInBits(),
      MaxWidth = std::max(MaxWidth,
    }
  } else {
    // Otherwise derive widths from the scalar element types recorded by
    // collectElementTypesForWidening (loads/stores/reduction phis).
    for (Type *T : ElementTypesInLoop) {
      MinWidth = std::min<unsigned>(
          MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue());
      MaxWidth = std::max<unsigned>(
          MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue());
    }
  }
  // {smallest, widest} element type width in bits.
  return {MinWidth, MaxWidth};
}
4603
  // Reset any previously collected element types before re-scanning the loop.
  ElementTypesInLoop.clear();
  // For each block.
  for (BasicBlock *BB : TheLoop->blocks()) {
    // For each instruction in the loop.
    for (Instruction &I : BB->instructionsWithoutDebug()) {
      Type *T = I.getType();

      // Skip ignored values.
      if (ValuesToIgnore.count(&I))
        continue;

      // Only examine Loads, Stores and PHINodes.
      if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
        continue;

      // Examine PHI nodes that are reduction variables. Update the type to
      // account for the recurrence type.
      if (auto *PN = dyn_cast<PHINode>(&I)) {
        if (!Legal->isReductionVariable(PN))
          continue;
        const RecurrenceDescriptor &RdxDesc =
            Legal->getRecurrenceDescriptor(PN);
            TTI.preferInLoopReduction(RdxDesc.getRecurrenceKind(),
                                      RdxDesc.getRecurrenceType()))
          continue;
        T = RdxDesc.getRecurrenceType();
      }

      // Examine the stored values.
      if (auto *ST = dyn_cast<StoreInst>(&I))
        T = ST->getValueOperand()->getType();

      assert(T->isSized() &&
             "Expected the load/store/recurrence type to be sized");

      ElementTypesInLoop.insert(T);
    }
  }
}
4645
unsigned
    InstructionCost LoopCost) {
  // -- The interleave heuristics --
  // We interleave the loop in order to expose ILP and reduce the loop overhead.
  // There are many micro-architectural considerations that we can't predict
  // at this level. For example, frontend pressure (on decode or fetch) due to
  // code size, or the number and capabilities of the execution ports.
  //
  // We use the following heuristics to select the interleave count:
  // 1. If the code has reductions, then we interleave to break the cross
  // iteration dependency.
  // 2. If the loop is really small, then we interleave to reduce the loop
  // overhead.
  // 3. We don't interleave if we think that we will spill registers to memory
  // due to the increased register pressure.

  // Only interleave tail-folded loops if wide lane masks are requested, as the
  // overhead of multiple instructions to calculate the predicate is likely
  // not beneficial. If a scalar epilogue is not allowed for any other reason,
  // do not interleave.
  if (!CM.isScalarEpilogueAllowed() &&
      !(CM.preferPredicatedLoop() && CM.useWideActiveLaneMask()))
    return 1;

    LLVM_DEBUG(dbgs() << "LV: Loop requires variable-length step. "
                         "Unroll factor forced to be 1.\n");
    return 1;
  }

  // We used the distance for the interleave count.
  if (!Legal->isSafeForAnyVectorWidth())
    return 1;

  // We don't attempt to perform interleaving for loops with uncountable early
  // exits because the VPInstruction::AnyOf code cannot currently handle
  // multiple parts.
  if (Plan.hasEarlyExit())
    return 1;

  const bool HasReductions =

  // FIXME: implement interleaving for FindLast transform correctly.
  if (hasFindLastReductionPhi(Plan))
    return 1;

  // If we did not calculate the cost for VF (because the user selected the VF)
  // then we calculate the cost of VF here.
  if (LoopCost == 0) {
    if (VF.isScalar())
      LoopCost = CM.expectedCost(VF);
    else
      LoopCost = cost(Plan, VF);
    assert(LoopCost.isValid() && "Expected to have chosen a VF with valid cost");

    // Loop body is free and there is no need for interleaving.
    if (LoopCost == 0)
      return 1;
  }

  // Register usage of the plan at this VF, used to bound IC so interleaving
  // does not cause spilling.
  VPRegisterUsage R =
      calculateRegisterUsageForPlan(Plan, {VF}, TTI, CM.ValuesToIgnore)[0];
  // We divide by these constants so assume that we have at least one
  // instruction that uses at least one register.
  for (auto &Pair : R.MaxLocalUsers) {
    Pair.second = std::max(Pair.second, 1U);
  }

  // We calculate the interleave count using the following formula.
  // Subtract the number of loop invariants from the number of available
  // registers. These registers are used by all of the interleaved instances.
  // Next, divide the remaining registers by the number of registers that is
  // required by the loop, in order to estimate how many parallel instances
  // fit without causing spills. All of this is rounded down if necessary to be
  // a power of two. We want power of two interleave count to simplify any
  // addressing operations or alignment considerations.
  // We also want power of two interleave counts to ensure that the induction
  // variable of the vector loop wraps to zero, when tail is folded by masking;
  // this currently happens when OptForSize, in which case IC is set to 1 above.
  unsigned IC = UINT_MAX;

  for (const auto &Pair : R.MaxLocalUsers) {
    unsigned TargetNumRegisters = TTI.getNumberOfRegisters(Pair.first);
    LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
                      << " registers of "
                      << TTI.getRegisterClassName(Pair.first)
                      << " register class\n");
    if (VF.isScalar()) {
      if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
        TargetNumRegisters = ForceTargetNumScalarRegs;
    } else {
      if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
        TargetNumRegisters = ForceTargetNumVectorRegs;
    }
    unsigned MaxLocalUsers = Pair.second;
    unsigned LoopInvariantRegs = 0;
    if (R.LoopInvariantRegs.contains(Pair.first))
      LoopInvariantRegs = R.LoopInvariantRegs[Pair.first];

    unsigned TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs) /
                                     MaxLocalUsers);
    // Don't count the induction variable as interleaved.
      TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs - 1) /
                              std::max(1U, (MaxLocalUsers - 1)));
    }

    // The final IC is the minimum over all register classes.
    IC = std::min(IC, TmpIC);
  }

  // Clamp the interleave ranges to reasonable counts.
  unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);

  // Check if the user has overridden the max.
  if (VF.isScalar()) {
    if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
      MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
  } else {
    if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
      MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
  }

  // Try to get the exact trip count, or an estimate based on profiling data or
  // ConstantMax from PSE, failing that.
  auto BestKnownTC = getSmallBestKnownTC(PSE, OrigLoop);

  // For fixed length VFs treat a scalable trip count as unknown.
  if (BestKnownTC && (BestKnownTC->isFixed() || VF.isScalable())) {
    // Re-evaluate trip counts and VFs to be in the same numerical space.
    unsigned AvailableTC =
        estimateElementCount(*BestKnownTC, CM.getVScaleForTuning());
    unsigned EstimatedVF = estimateElementCount(VF, CM.getVScaleForTuning());

    // At least one iteration must be scalar when this constraint holds. So the
    // maximum available iterations for interleaving is one less.
    if (CM.requiresScalarEpilogue(VF.isVector()))
      --AvailableTC;

    // Conservative bound: make the vector loop run at least twice.
    unsigned InterleaveCountLB = bit_floor(std::max(
        1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));

    if (getSmallConstantTripCount(PSE.getSE(), OrigLoop).isNonZero()) {
      // If the best known trip count is exact, we select between two
      // prospective ICs, where
      //
      // 1) the aggressive IC is capped by the trip count divided by VF
      // 2) the conservative IC is capped by the trip count divided by (VF * 2)
      //
      // The final IC is selected in a way that the epilogue loop trip count is
      // minimized while maximizing the IC itself, so that we either run the
      // vector loop at least once if it generates a small epilogue loop, or
      // else we run the vector loop at least twice.

      unsigned InterleaveCountUB = bit_floor(std::max(
          1u, std::min(AvailableTC / EstimatedVF, MaxInterleaveCount)));
      MaxInterleaveCount = InterleaveCountLB;

      if (InterleaveCountUB != InterleaveCountLB) {
        unsigned TailTripCountUB =
            (AvailableTC % (EstimatedVF * InterleaveCountUB));
        unsigned TailTripCountLB =
            (AvailableTC % (EstimatedVF * InterleaveCountLB));
        // If both produce same scalar tail, maximize the IC to do the same work
        // in fewer vector loop iterations
        if (TailTripCountUB == TailTripCountLB)
          MaxInterleaveCount = InterleaveCountUB;
      }
    } else {
      // If trip count is an estimated compile time constant, limit the
      // IC to be capped by the trip count divided by VF * 2, such that the
      // vector loop runs at least twice to make interleaving seem profitable
      // when there is an epilogue loop present. Since exact Trip count is not
      // known we choose to be conservative in our IC estimate.
      MaxInterleaveCount = InterleaveCountLB;
    }
  }

  assert(MaxInterleaveCount > 0 &&
         "Maximum interleave count must be greater than 0");

  // Clamp the calculated IC to be between the 1 and the max interleave count
  // that the target and trip count allows.
  if (IC > MaxInterleaveCount)
    IC = MaxInterleaveCount;
  else
    // Make sure IC is greater than 0.
    IC = std::max(1u, IC);

  assert(IC > 0 && "Interleave count must be greater than 0.");

  // Interleave if we vectorized this loop and there is a reduction that could
  // benefit from interleaving.
  if (VF.isVector() && HasReductions) {
    LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
    return IC;
  }

  // For any scalar loop that either requires runtime checks or predication we
  // are better off leaving this to the unroller. Note that if we've already
  // vectorized the loop we will have done the runtime check and so interleaving
  // won't require further checks.
  bool ScalarInterleavingRequiresPredication =
      (VF.isScalar() && any_of(OrigLoop->blocks(), [this](BasicBlock *BB) {
         return Legal->blockNeedsPredication(BB);
       }));
  bool ScalarInterleavingRequiresRuntimePointerCheck =
      (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);

  // We want to interleave small loops in order to reduce the loop overhead and
  // potentially expose ILP opportunities.
  LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
                    << "LV: IC is " << IC << '\n'
                    << "LV: VF is " << VF << '\n');
  const bool AggressivelyInterleave =
      TTI.enableAggressiveInterleaving(HasReductions);
  if (!ScalarInterleavingRequiresRuntimePointerCheck &&
      !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) {
    // We assume that the cost overhead is 1 and we use the cost model
    // to estimate the cost of the loop and interleave until the cost of the
    // loop overhead is about 5% of the cost of the loop.
    unsigned SmallIC = std::min(IC, (unsigned)llvm::bit_floor<uint64_t>(
                                        SmallLoopCost / LoopCost.getValue()));

    // Interleave until store/load ports (estimated by max interleave count) are
    // saturated.
    unsigned NumStores = 0;
    unsigned NumLoads = 0;
      for (VPRecipeBase &R : *VPBB) {
          NumLoads++;
          continue;
        }
          NumStores++;
          continue;
        }

        // Interleave groups count one memory op per store operand, or one per
        // defined value for load groups.
        if (auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(&R)) {
          if (unsigned StoreOps = InterleaveR->getNumStoreOperands())
            NumStores += StoreOps;
          else
            NumLoads += InterleaveR->getNumDefinedValues();
          continue;
        }
        if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
          NumLoads += isa<LoadInst>(RepR->getUnderlyingInstr());
          NumStores += isa<StoreInst>(RepR->getUnderlyingInstr());
          continue;
        }
        // A histogram performs both a load and a store.
        if (isa<VPHistogramRecipe>(&R)) {
          NumLoads++;
          NumStores++;
          continue;
        }
      }
    }
    unsigned StoresIC = IC / (NumStores ? NumStores : 1);
    unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);

    // There is little point in interleaving for reductions containing selects
    // and compares when VF=1 since it may just create more overhead than it's
    // worth for loops with small trip counts. This is because we still have to
    // do the final reduction after the loop.
    bool HasSelectCmpReductions =
        HasReductions &&
               [](VPRecipeBase &R) {
                 auto *RedR = dyn_cast<VPReductionPHIRecipe>(&R);
                 return RedR && (RecurrenceDescriptor::isAnyOfRecurrenceKind(
                                     RedR->getRecurrenceKind()) ||
                                 RecurrenceDescriptor::isFindIVRecurrenceKind(
                                     RedR->getRecurrenceKind()));
               });
    if (HasSelectCmpReductions) {
      LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n");
      return 1;
    }

    // If we have a scalar reduction (vector reductions are already dealt with
    // by this point), we can increase the critical path length if the loop
    // we're interleaving is inside another loop. For tree-wise reductions
    // set the limit to 2, and for ordered reductions it's best to disable
    // interleaving entirely.
    if (HasReductions && OrigLoop->getLoopDepth() > 1) {
      bool HasOrderedReductions =
              [](VPRecipeBase &R) {
                auto *RedR = dyn_cast<VPReductionPHIRecipe>(&R);

                return RedR && RedR->isOrdered();
              });
      if (HasOrderedReductions) {
        LLVM_DEBUG(
            dbgs() << "LV: Not interleaving scalar ordered reductions.\n");
        return 1;
      }

      // Cap all candidate ICs for nested scalar reductions.
      unsigned F = MaxNestedScalarReductionIC;
      SmallIC = std::min(SmallIC, F);
      StoresIC = std::min(StoresIC, F);
      LoadsIC = std::min(LoadsIC, F);
    }

        std::max(StoresIC, LoadsIC) > SmallIC) {
      LLVM_DEBUG(
          dbgs() << "LV: Interleaving to saturate store or load ports.\n");
      return std::max(StoresIC, LoadsIC);
    }

    // If there are scalar reductions and TTI has enabled aggressive
    // interleaving for reductions, we will interleave to expose ILP.
    if (VF.isScalar() && AggressivelyInterleave) {
      LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
      // Interleave no less than SmallIC but not as aggressive as the normal IC
      // to satisfy the rare situation when resources are too limited.
      return std::max(IC / 2, SmallIC);
    }

    LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
    return SmallIC;
  }

  // Interleave if this is a large loop (small loops are already dealt with by
  // this point) that could benefit from interleaving.
  if (AggressivelyInterleave) {
    LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
    return IC;
  }

  LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
  return 1;
}
4985
    ElementCount VF) {
  // TODO: Cost model for emulated masked load/store is completely
  // broken. This hack guides the cost model to use an artificially
  // high enough value to practically disable vectorization with such
  // operations, except where previously deployed legality hack allowed
  // using very low cost values. This is to avoid regressions coming simply
  // from moving "masked load/store" check from legality to cost model.
  // Masked Load/Gather emulation was previously never allowed.
  // Limited number of Masked Store/Scatter emulation was allowed.
         "Expecting a scalar emulated instruction");
  // Loads always take the hacked (prohibitive) cost; stores only once the
  // number of predicated stores exceeds NumberOfStoresToPredicate.
  return isa<LoadInst>(I) ||
         (isa<StoreInst>(I) &&
          NumPredStores > NumberOfStoresToPredicate);
}
5002
  assert(VF.isVector() && "Expected VF >= 2");

  // If we've already collected the instructions to scalarize or the predicated
  // BBs after vectorization, there's nothing to do. Collection may already have
  // occurred if we have a user-selected VF and are now computing the expected
  // cost for interleaving.
  if (InstsToScalarize.contains(VF) ||
      PredicatedBBsAfterVectorization.contains(VF))
    return;

  // Initialize a mapping for VF in InstsToScalalarize. If we find that it's
  // not profitable to scalarize any instructions, the presence of VF in the
  // map will indicate that we've analyzed it already.
  ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];

  // Find all the instructions that are scalar with predication in the loop and
  // determine if it would be better to not if-convert the blocks they are in.
  // If so, we also record the instructions to scalarize.
  for (BasicBlock *BB : TheLoop->blocks()) {
      continue;
    for (Instruction &I : *BB)
      if (isScalarWithPredication(&I, VF)) {
        ScalarCostsTy ScalarCosts;
        // Do not apply discount logic for:
        // 1. Scalars after vectorization, as there will only be a single copy
        // of the instruction.
        // 2. Scalable VF, as that would lead to invalid scalarization costs.
        // 3. Emulated masked memrefs, if a hacked cost is needed.
        if (!isScalarAfterVectorization(&I, VF) && !VF.isScalable() &&
            computePredInstDiscount(&I, ScalarCosts, VF) >= 0) {
          // A non-negative discount means scalarization pays off; record the
          // per-instruction scalar costs.
          for (const auto &[I, IC] : ScalarCosts)
            ScalarCostsVF.insert({I, IC});
          // Check if we decided to scalarize a call. If so, update the widening
          // decision of the call to CM_Scalarize with the computed scalar cost.
          for (const auto &[I, Cost] : ScalarCosts) {
            auto *CI = dyn_cast<CallInst>(I);
            if (!CI || !CallWideningDecisions.contains({CI, VF}))
              continue;
            CallWideningDecisions[{CI, VF}].Kind = CM_Scalarize;
            CallWideningDecisions[{CI, VF}].Cost = Cost;
          }
        }
        // Remember that BB will remain after vectorization.
        PredicatedBBsAfterVectorization[VF].insert(BB);
        // Predecessors whose only successor is BB stay predicated as well.
        for (auto *Pred : predecessors(BB)) {
          if (Pred->getSingleSuccessor() == BB)
            PredicatedBBsAfterVectorization[VF].insert(Pred);
        }
      }
  }
}
5057
5058InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
5059 Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
5060 assert(!isUniformAfterVectorization(PredInst, VF) &&
5061 "Instruction marked uniform-after-vectorization will be predicated");
5062
5063 // Initialize the discount to zero, meaning that the scalar version and the
5064 // vector version cost the same.
5065 InstructionCost Discount = 0;
5066
5067 // Holds instructions to analyze. The instructions we visit are mapped in
5068 // ScalarCosts. Those instructions are the ones that would be scalarized if
5069 // we find that the scalar version costs less.
5071
5072 // Returns true if the given instruction can be scalarized.
5073 auto CanBeScalarized = [&](Instruction *I) -> bool {
5074 // We only attempt to scalarize instructions forming a single-use chain
5075 // from the original predicated block that would otherwise be vectorized.
5076 // Although not strictly necessary, we give up on instructions we know will
5077 // already be scalar to avoid traversing chains that are unlikely to be
5078 // beneficial.
5079 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
5080 isScalarAfterVectorization(I, VF))
5081 return false;
5082
5083 // If the instruction is scalar with predication, it will be analyzed
5084 // separately. We ignore it within the context of PredInst.
5085 if (isScalarWithPredication(I, VF))
5086 return false;
5087
5088 // If any of the instruction's operands are uniform after vectorization,
5089 // the instruction cannot be scalarized. This prevents, for example, a
5090 // masked load from being scalarized.
5091 //
5092 // We assume we will only emit a value for lane zero of an instruction
5093 // marked uniform after vectorization, rather than VF identical values.
5094 // Thus, if we scalarize an instruction that uses a uniform, we would
5095 // create uses of values corresponding to the lanes we aren't emitting code
5096 // for. This behavior can be changed by allowing getScalarValue to clone
5097 // the lane zero values for uniforms rather than asserting.
5098 for (Use &U : I->operands())
5099 if (auto *J = dyn_cast<Instruction>(U.get()))
5100 if (isUniformAfterVectorization(J, VF))
5101 return false;
5102
5103 // Otherwise, we can scalarize the instruction.
5104 return true;
5105 };
5106
5107 // Compute the expected cost discount from scalarizing the entire expression
5108 // feeding the predicated instruction. We currently only consider expressions
5109 // that are single-use instruction chains.
5110 Worklist.push_back(PredInst);
5111 while (!Worklist.empty()) {
5112 Instruction *I = Worklist.pop_back_val();
5113
5114 // If we've already analyzed the instruction, there's nothing to do.
5115 if (ScalarCosts.contains(I))
5116 continue;
5117
5118 // Cannot scalarize fixed-order recurrence phis at the moment.
5119 if (isa<PHINode>(I) && Legal->isFixedOrderRecurrence(cast<PHINode>(I)))
5120 continue;
5121
5122 // Compute the cost of the vector instruction. Note that this cost already
5123 // includes the scalarization overhead of the predicated instruction.
5124 InstructionCost VectorCost = getInstructionCost(I, VF);
5125
5126 // Compute the cost of the scalarized instruction. This cost is the cost of
5127 // the instruction as if it wasn't if-converted and instead remained in the
5128 // predicated block. We will scale this cost by block probability after
5129 // computing the scalarization overhead.
5130 InstructionCost ScalarCost =
5131 VF.getFixedValue() * getInstructionCost(I, ElementCount::getFixed(1));
5132
5133 // Compute the scalarization overhead of needed insertelement instructions
5134 // and phi nodes.
5135 if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) {
5136 Type *WideTy = toVectorizedTy(I->getType(), VF);
5137 for (Type *VectorTy : getContainedTypes(WideTy)) {
5138 ScalarCost += TTI.getScalarizationOverhead(
5140 /*Insert=*/true,
5141 /*Extract=*/false, CostKind);
5142 }
5143 ScalarCost +=
5144 VF.getFixedValue() * TTI.getCFInstrCost(Instruction::PHI, CostKind);
5145 }
5146
5147 // Compute the scalarization overhead of needed extractelement
5148 // instructions. For each of the instruction's operands, if the operand can
5149 // be scalarized, add it to the worklist; otherwise, account for the
5150 // overhead.
5151 for (Use &U : I->operands())
5152 if (auto *J = dyn_cast<Instruction>(U.get())) {
5153 assert(canVectorizeTy(J->getType()) &&
5154 "Instruction has non-scalar type");
5155 if (CanBeScalarized(J))
5156 Worklist.push_back(J);
5157 else if (needsExtract(J, VF)) {
5158 Type *WideTy = toVectorizedTy(J->getType(), VF);
5159 for (Type *VectorTy : getContainedTypes(WideTy)) {
5160 ScalarCost += TTI.getScalarizationOverhead(
5161 cast<VectorType>(VectorTy),
5162 APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ false,
5163 /*Extract*/ true, CostKind);
5164 }
5165 }
5166 }
5167
5168 // Scale the total scalar cost by block probability.
5169 ScalarCost /= getPredBlockCostDivisor(CostKind, I->getParent());
5170
5171 // Compute the discount. A non-negative discount means the vector version
5172 // of the instruction costs more, and scalarizing would be beneficial.
5173 Discount += VectorCost - ScalarCost;
5174 ScalarCosts[I] = ScalarCost;
5175 }
5176
5177 return Discount;
5178}
5179
// Sums the estimated cost of every instruction in the original loop for the
// given vectorization factor, scaling predicated-block costs by their
// execution probability in the scalar case. Presumably this is
// LoopVectorizationCostModel::expectedCost(ElementCount VF) — the signature
// (original lines ~5180-5181) is dropped from this extracted listing, as are
// several body lines (5188-5189, 5203, 5211, 5215); confirm against the real
// source. Annotations below describe only the visible code.
5182
5183   // If the vector loop gets executed exactly once with the given VF, ignore the
5184   // costs of comparison and induction instructions, as they'll get simplified
5185   // away.
5186   SmallPtrSet<Instruction *, 2> ValuesToIgnoreForVF;
5187   auto TC = getSmallConstantTripCount(PSE.getSE(), TheLoop);
// A constant trip count equal to VF with no tail folding means the vector
// body executes exactly once, so those instructions fold away.
5188   if (TC == VF && !foldTailByMasking())
// NOTE(review): the call that populates ValuesToIgnoreForVF begins on a
// dropped line (original 5189); only its trailing argument is visible.
5190       ValuesToIgnoreForVF);
5191
5192   // For each block.
5193   for (BasicBlock *BB : TheLoop->blocks()) {
5194     InstructionCost BlockCost;
5195
5196     // For each instruction in the old loop.
5197     for (Instruction &I : BB->instructionsWithoutDebug()) {
5198       // Skip ignored values.
5199       if (ValuesToIgnore.count(&I) || ValuesToIgnoreForVF.count(&I) ||
5200           (VF.isVector() && VecValuesToIgnore.count(&I)))
5201         continue;
5202
// NOTE(review): the declaration of the per-instruction cost `C` (original
// line 5203, presumably `InstructionCost C = getInstructionCost(&I, VF);`)
// is missing from this listing — confirm.
5204
5205       // Check if we should override the cost.
5206       if (C.isValid() && ForceTargetInstructionCost.getNumOccurrences() > 0) {
5207         // For interleave groups, use ForceTargetInstructionCost once for the
5208         // whole group.
5209         if (VF.isVector() && getWideningDecision(&I, VF) == CM_Interleave) {
5210           if (getInterleavedAccessGroup(&I)->getInsertPos() == &I)
// NOTE(review): the assignments applying the forced cost (original lines
// 5211 and 5215) are dropped; non-insert-position group members get cost 0
// so the forced cost is counted once per group.
5212           else
5213             C = InstructionCost(0);
5214         } else {
5216         }
5217       }
5218
5219       BlockCost += C;
5220       LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C << " for VF "
5221                         << VF << " For instruction: " << I << '\n');
5222     }
5223
5224     // If we are vectorizing a predicated block, it will have been
5225     // if-converted. This means that the block's instructions (aside from
5226     // stores and instructions that may divide by zero) will now be
5227     // unconditionally executed. For the scalar case, we may not always execute
5228     // the predicated block, if it is an if-else block. Thus, scale the block's
5229     // cost by the probability of executing it.
5230     // getPredBlockCostDivisor will return 1 for blocks that are only predicated
5231     // by the header mask when folding the tail.
5232     if (VF.isScalar())
5233       BlockCost /= getPredBlockCostDivisor(CostKind, BB);
5234
5235     Cost += BlockCost;
5236   }
5237
5238   return Cost;
5239 }
5240
5241 /// Gets the address access SCEV for Ptr, if it should be used for cost modeling
5242 /// according to isAddressSCEVForCost.
5243 ///
5244 /// This SCEV can be sent to the Target in order to estimate the address
5245 /// calculation cost.
5246 ///
5247 /// \returns the SCEV of \p Ptr when vputils::isAddressSCEVForCost accepts
5248 /// it, otherwise nullptr.
// NOTE(review): the extracted listing drops the return-type/name line and
// the PSE parameter line (original 5246 and 5248); presumably
// `static const SCEV *getAddressAccessSCEV(Value *Ptr,
// PredicatedScalarEvolution &PSE, const Loop *TheLoop)` — confirm.
5247 Value *Ptr,
5249 const Loop *TheLoop) {
5250 const SCEV *Addr = PSE.getSCEV(Ptr);
5251 return vputils::isAddressSCEVForCost(Addr, *PSE.getSE(), TheLoop) ? Addr
5252 : nullptr;
5253 }
5254
// Estimates the cost of executing load/store \p I as VF independent scalar
// memory operations inside the vector loop (CM_Scalarize). Scalable VFs
// cannot be scalarized, so they get an invalid cost. NOTE(review): this
// extracted listing drops the return-type line (5255) and several statement
// heads (5267, 5277, 5290, 5301), including the declarations of `Ptr` and
// `Cost` — confirm against the real source.
5256 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
5257 ElementCount VF) {
5258 assert(VF.isVector() &&
5259 "Scalarization cost of instruction implies vectorization.");
// There is no mechanism to emit a runtime-length scalarization loop, so a
// scalable VF cannot be costed here.
5260 if (VF.isScalable())
5261 return InstructionCost::getInvalid();
5262
5263 Type *ValTy = getLoadStoreType(I);
5264 auto *SE = PSE.getSE();
5265
5266 unsigned AS = getLoadStoreAddressSpace(I);
// NOTE(review): the line defining `Ptr` (original 5267, presumably
// `Value *Ptr = getLoadStorePointerOperand(I);`) is dropped here.
5268 Type *PtrTy = toVectorTy(Ptr->getType(), VF);
5269 // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost`
5270 // that it is being called from this specific place.
5271
5272 // Figure out whether the access is strided and get the stride value
5273 // if it's known in compile time
5274 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, PSE, TheLoop);
5275
5276 // Get the cost of the scalar memory instruction and address computation.
// NOTE(review): the head of the `Cost` initialization (original line 5277,
// presumably calling TTI.getAddressComputationCost) is dropped here.
5278 PtrTy, SE, PtrSCEV, CostKind);
5279
5280 // Don't pass *I here, since it is scalar but will actually be part of a
5281 // vectorized loop where the user of it is a vectorized instruction.
5282 const Align Alignment = getLoadStoreAlignment(I);
5283 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
// One scalar memory op per lane: multiply the scalar op cost by VF.
5284 Cost += VF.getFixedValue() *
5285 TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment,
5286 AS, CostKind, OpInfo);
5287
5288 // Get the overhead of the extractelement and insertelement instructions
5289 // we might create due to scalarization.
// NOTE(review): the statement adding the scalarization overhead (original
// line 5290) is dropped from this listing.
5291
5292 // If we have a predicated load/store, it will need extra i1 extracts and
5293 // conditional branches, but may not be executed for each vector lane. Scale
5294 // the cost by the probability of executing the predicated block.
5295 if (isPredicatedInst(I)) {
5296 Cost /= getPredBlockCostDivisor(CostKind, I->getParent());
5297
5298 // Add the cost of an i1 extract and a branch
5299 auto *VecI1Ty =
5300 VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF);
// NOTE(review): the head of this extraction-cost statement (original line
// 5301, presumably `Cost += TTI.getScalarizationOverhead(`) is dropped.
5302 VecI1Ty, APInt::getAllOnes(VF.getFixedValue()),
5303 /*Insert=*/false, /*Extract=*/true, CostKind);
5304 Cost += TTI.getCFInstrCost(Instruction::Br, CostKind);
5305
5306 if (useEmulatedMaskMemRefHack(I, VF))
5307 // Artificially setting to a high enough value to practically disable
5308 // vectorization with such operations.
5309 Cost = 3000000;
5310 }
5311
5312 return Cost;
5313 }
5314
// Costs a consecutive (unit-stride, forward or reverse) widened load/store:
// a masked intrinsic when a mask is required, a plain wide memory op
// otherwise, plus a reverse shuffle for negative stride. NOTE(review): the
// extracted listing drops the return-type line (5315) and the declarations/
// statement heads at original lines 5320, 5327, 5332, 5342 (including `Ptr`
// and `Cost`) — confirm against the real source.
5316 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
5317 ElementCount VF) {
5318 Type *ValTy = getLoadStoreType(I);
5319 auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
5321 unsigned AS = getLoadStoreAddressSpace(I);
5322 int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr);
5323
5324 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
5325 "Stride should be 1 or -1 for consecutive memory access");
5326 const Align Alignment = getLoadStoreAlignment(I);
5328 if (Legal->isMaskRequired(I)) {
// Masked accesses are costed as the corresponding masked load/store
// intrinsic rather than a plain memory op.
5329 unsigned IID = I->getOpcode() == Instruction::Load
5330 ? Intrinsic::masked_load
5331 : Intrinsic::masked_store;
5333 MemIntrinsicCostAttributes(IID, VectorTy, Alignment, AS), CostKind);
5334 } else {
5335 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
5336 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
5337 CostKind, OpInfo, I);
5338 }
5339
// A negative stride requires reversing the loaded/stored vector; the
// statement head at original line 5342 (presumably a shuffle/reverse cost
// added to Cost) is dropped from this listing.
5340 bool Reverse = ConsecutiveStride < 0;
5341 if (Reverse)
5343 VectorTy, {}, CostKind, 0);
5344 return Cost;
5345 }
5346
// Costs a uniform (loop-invariant address) memory operation: a scalar load
// plus broadcast, or a scalar store plus — when the stored value is not
// loop-invariant — an extract of the last vector element. NOTE(review): the
// extracted listing drops original lines 5347, 5353, 5361 and 5371,
// including the declarations of `PtrTy` and `Cost` and the broadcast-cost
// statement head — confirm against the real source.
5348 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
5349 ElementCount VF) {
5350 assert(Legal->isUniformMemOp(*I, VF));
5351
5352 Type *ValTy = getLoadStoreType(I);
5354 auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
5355 const Align Alignment = getLoadStoreAlignment(I);
5356 unsigned AS = getLoadStoreAddressSpace(I);
5357 if (isa<LoadInst>(I)) {
// Uniform load = address computation + one scalar load + a broadcast of the
// loaded value into a vector (broadcast call head dropped at line 5361).
5358 return TTI.getAddressComputationCost(PtrTy, nullptr, nullptr, CostKind) +
5359 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
5360 CostKind) +
5362 VectorTy, {}, CostKind);
5363 }
5364 StoreInst *SI = cast<StoreInst>(I);
5365
// A loop-invariant stored value needs no extract; otherwise the last lane
// must be extracted before the scalar store.
5366 bool IsLoopInvariantStoreValue = Legal->isInvariant(SI->getValueOperand());
5367 // TODO: We have existing tests that request the cost of extracting element
5368 // VF.getKnownMinValue() - 1 from a scalable vector. This does not represent
5369 // the actual generated code, which involves extracting the last element of
5370 // a scalable vector where the lane to extract is unknown at compile time.
5372 TTI.getAddressComputationCost(PtrTy, nullptr, nullptr, CostKind) +
5373 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, CostKind);
5374 if (!IsLoopInvariantStoreValue)
5375 Cost += TTI.getIndexedVectorInstrCostFromEnd(Instruction::ExtractElement,
5376 VectorTy, CostKind, 0);
5377 return Cost;
5378 }
5379
// Costs load/store \p I as a masked gather/scatter intrinsic: address
// computation (on a vector-of-pointers type unless the pointer is uniform)
// plus the intrinsic itself. NOTE(review): the extracted listing drops the
// return-type line (5380), the `Ptr` declaration (5386) and the intrinsic
// cost call head (5396) — confirm against the real source.
5381 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
5382 ElementCount VF) {
5383 Type *ValTy = getLoadStoreType(I);
5384 auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
5385 const Align Alignment = getLoadStoreAlignment(I);
5387 Type *PtrTy = Ptr->getType();
5388
// A non-uniform pointer becomes a vector of per-lane addresses; that is
// what makes the address computation more expensive for gathers/scatters.
5389 if (!Legal->isUniform(Ptr, VF))
5390 PtrTy = toVectorTy(PtrTy, VF);
5391
5392 unsigned IID = I->getOpcode() == Instruction::Load
5393 ? Intrinsic::masked_gather
5394 : Intrinsic::masked_scatter;
5395 return TTI.getAddressComputationCost(PtrTy, nullptr, nullptr, CostKind) +
5397 MemIntrinsicCostAttributes(IID, VectorTy, Ptr,
5398 Legal->isMaskRequired(I), Alignment, I),
5399 CostKind);
5400 }
5401
// Costs the whole interleaved access group containing \p I as one wide
// memory operation over VF * InterleaveFactor elements, plus reverse
// shuffles per member for reversed groups. NOTE(review): the extracted
// listing drops the return-type line (5402) and the heads of the two cost
// statements (original 5426 and 5436), including the declaration of `Cost`
// — confirm against the real source.
5403 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
5404 ElementCount VF) {
5405 const auto *Group = getInterleavedAccessGroup(I);
5406 assert(Group && "Fail to get an interleaved access group.");
5407
// The group is costed at its insert position, so type/address-space come
// from that member rather than from I.
5408 Instruction *InsertPos = Group->getInsertPos();
5409 Type *ValTy = getLoadStoreType(InsertPos);
5410 auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
5411 unsigned AS = getLoadStoreAddressSpace(InsertPos);
5412
5413 unsigned InterleaveFactor = Group->getFactor();
5414 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
5415
5416 // Holds the indices of existing members in the interleaved group.
5417 SmallVector<unsigned, 4> Indices;
5418 for (unsigned IF = 0; IF < InterleaveFactor; IF++)
5419 if (Group->getMember(IF))
5420 Indices.push_back(IF);
5421
5422 // Calculate the cost of the whole interleaved group.
// Gaps must be masked when the group would otherwise read/write past its
// members: a scalar epilogue is disallowed, or a store group is not full.
5423 bool UseMaskForGaps =
5424 (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
5425 (isa<StoreInst>(I) && !Group->isFull());
5427 InsertPos->getOpcode(), WideVecTy, Group->getFactor(), Indices,
5428 Group->getAlign(), AS, CostKind, Legal->isMaskRequired(I),
5429 UseMaskForGaps);
5430
5431 if (Group->isReverse()) {
5432 // TODO: Add support for reversed masked interleaved access.
5433 assert(!Legal->isMaskRequired(I) &&
5434 "Reverse masked interleaved access not supported.");
// Each member needs its own reverse shuffle (shuffle-cost call head at
// original line 5436 is dropped from this listing).
5435 Cost += Group->getNumMembers() *
5437 VectorTy, {}, CostKind, 0);
5438 }
5439 return Cost;
5440 }
5441
// Tries to cost \p I as part of an in-loop reduction pattern —
// reduce(mul(ext,ext)), reduce(mul), reduce(ext) or reduce(A) — returning
// the pattern cost for the root and 0 for the other pattern members when
// the fused reduction is cheaper than its components, or std::nullopt when
// the ordinary per-instruction costing should be used. Presumably this is
// LoopVectorizationCostModel::getReductionPatternCost — the name/parameter
// line (original 5443) and several statement heads (5493-5494, 5517-5518,
// 5525, 5542, 5547, 5567, 5588, 5591, 5604) are dropped from this extracted
// listing; confirm against the real source.
5442 std::optional<InstructionCost>
5444 ElementCount VF,
5445 Type *Ty) const {
5446 using namespace llvm::PatternMatch;
5447 // Early exit for no inloop reductions
5448 if (InLoopReductions.empty() || VF.isScalar() || !isa<VectorType>(Ty))
5449 return std::nullopt;
5450 auto *VectorTy = cast<VectorType>(Ty);
5451
5452 // We are looking for a pattern of, and finding the minimal acceptable cost:
5453 // reduce(mul(ext(A), ext(B))) or
5454 // reduce(mul(A, B)) or
5455 // reduce(ext(A)) or
5456 // reduce(A).
5457 // The basic idea is that we walk down the tree to do that, finding the root
5458 // reduction instruction in InLoopReductionImmediateChains. From there we find
5459 // the pattern of mul/ext and test the cost of the entire pattern vs the cost
5460 // of the components. If the reduction cost is lower then we return it for the
5461 // reduction instruction and 0 for the other instructions in the pattern. If
5462 // it is not we return an invalid cost specifying the orignal cost method
5463 // should be used.
// Walk down from I through a possible ext and mul to reach the add/reduce
// root of the chain.
5464 Instruction *RetI = I;
5465 if (match(RetI, m_ZExtOrSExt(m_Value()))) {
5466 if (!RetI->hasOneUser())
5467 return std::nullopt;
5468 RetI = RetI->user_back();
5469 }
5470
5471 if (match(RetI, m_OneUse(m_Mul(m_Value(), m_Value()))) &&
5472 RetI->user_back()->getOpcode() == Instruction::Add) {
5473 RetI = RetI->user_back();
5474 }
5475
5476 // Test if the found instruction is a reduction, and if not return an invalid
5477 // cost specifying the parent to use the original cost modelling.
5478 Instruction *LastChain = InLoopReductionImmediateChains.lookup(RetI);
5479 if (!LastChain)
5480 return std::nullopt;
5481
5482 // Find the reduction this chain is a part of and calculate the basic cost of
5483 // the reduction on its own.
5484 Instruction *ReductionPhi = LastChain;
5485 while (!isa<PHINode>(ReductionPhi))
5486 ReductionPhi = InLoopReductionImmediateChains.at(ReductionPhi);
5487
5488 const RecurrenceDescriptor &RdxDesc =
5489 Legal->getRecurrenceDescriptor(cast<PHINode>(ReductionPhi));
5490
// Baseline: the cost of the reduction alone, before trying to fuse in the
// surrounding mul/ext pattern. NOTE(review): the min/max branch condition
// and the MinMaxID definition (original 5493-5494) are dropped here.
5491 InstructionCost BaseCost;
5492 RecurKind RK = RdxDesc.getRecurrenceKind();
5495 BaseCost = TTI.getMinMaxReductionCost(MinMaxID, VectorTy,
5496 RdxDesc.getFastMathFlags(), CostKind);
5497 } else {
5498 BaseCost = TTI.getArithmeticReductionCost(
5499 RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind);
5500 }
5501
5502 // For a call to the llvm.fmuladd intrinsic we need to add the cost of a
5503 // normal fmul instruction to the cost of the fadd reduction.
5504 if (RK == RecurKind::FMulAdd)
5505 BaseCost +=
5506 TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind);
5507
5508 // If we're using ordered reductions then we can just return the base cost
5509 // here, since getArithmeticReductionCost calculates the full ordered
5510 // reduction cost when FP reassociation is not allowed.
5511 if (useOrderedReductions(RdxDesc))
5512 return BaseCost;
5513
5514 // Get the operand that was not the reduction chain and match it to one of the
5515 // patterns, returning the better cost if it is found.
// NOTE(review): the tail of this conditional expression (original
// 5517-5518, selecting operand 0 vs 1) is dropped from this listing.
5516 Instruction *RedOp = RetI->getOperand(1) == LastChain
5519
5520 VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy);
5521
5522 Instruction *Op0, *Op1;
5523 if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
5524 match(RedOp,
5526 match(Op0, m_ZExtOrSExt(m_Value())) &&
5527 Op0->getOpcode() == Op1->getOpcode() &&
5528 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
5529 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) &&
5530 (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {
5531
5532 // Matched reduce.add(ext(mul(ext(A), ext(B)))
5533 // Note that the extend opcodes need to all match, or if A==B they will have
5534 // been converted to zext(mul(sext(A), sext(A))) as it is known positive,
5535 // which is equally fine.
5536 bool IsUnsigned = isa<ZExtInst>(Op0);
5537 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy);
5538 auto *MulType = VectorType::get(Op0->getType(), VectorTy);
5539
5540 InstructionCost ExtCost =
5541 TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType,
5543 InstructionCost MulCost =
5544 TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind);
5545 InstructionCost Ext2Cost =
5546 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType,
5548
// Compare the fused multiply-accumulate reduction against the sum of the
// individual ext/mul/reduce costs.
5549 InstructionCost RedCost = TTI.getMulAccReductionCost(
5550 IsUnsigned, RdxDesc.getOpcode(), RdxDesc.getRecurrenceType(), ExtType,
5551 CostKind);
5552
5553 if (RedCost.isValid() &&
5554 RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost)
5555 return I == RetI ? RedCost : 0;
5556 } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) &&
5557 !TheLoop->isLoopInvariant(RedOp)) {
5558 // Matched reduce(ext(A))
5559 bool IsUnsigned = isa<ZExtInst>(RedOp);
5560 auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy);
5561 InstructionCost RedCost = TTI.getExtendedReductionCost(
5562 RdxDesc.getOpcode(), IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
5563 RdxDesc.getFastMathFlags(), CostKind);
5564
5565 InstructionCost ExtCost =
5566 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType,
5568 if (RedCost.isValid() && RedCost < BaseCost + ExtCost)
5569 return I == RetI ? RedCost : 0;
5570 } else if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
5571 match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) {
5572 if (match(Op0, m_ZExtOrSExt(m_Value())) &&
5573 Op0->getOpcode() == Op1->getOpcode() &&
5574 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) {
5575 bool IsUnsigned = isa<ZExtInst>(Op0);
5576 Type *Op0Ty = Op0->getOperand(0)->getType();
5577 Type *Op1Ty = Op1->getOperand(0)->getType();
5578 Type *LargestOpTy =
5579 Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty
5580 : Op0Ty;
5581 auto *ExtType = VectorType::get(LargestOpTy, VectorTy);
5582
5583 // Matched reduce.add(mul(ext(A), ext(B))), where the two ext may be of
5584 // different sizes. We take the largest type as the ext to reduce, and add
5585 // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))).
5586 InstructionCost ExtCost0 = TTI.getCastInstrCost(
5587 Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy),
5589 InstructionCost ExtCost1 = TTI.getCastInstrCost(
5590 Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy),
5592 InstructionCost MulCost =
5593 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
5594
5595 InstructionCost RedCost = TTI.getMulAccReductionCost(
5596 IsUnsigned, RdxDesc.getOpcode(), RdxDesc.getRecurrenceType(), ExtType,
5597 CostKind);
// When the two extends have different source widths, the narrower operand
// needs one extra extend up to the common (largest) type.
5598 InstructionCost ExtraExtCost = 0;
5599 if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) {
5600 Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1;
5601 ExtraExtCost = TTI.getCastInstrCost(
5602 ExtraExtOp->getOpcode(), ExtType,
5603 VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy),
5605 }
5606
5607 if (RedCost.isValid() &&
5608 (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost))
5609 return I == RetI ? RedCost : 0;
5610 } else if (!match(I, m_ZExtOrSExt(m_Value()))) {
5611 // Matched reduce.add(mul())
5612 InstructionCost MulCost =
5613 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
5614
5615 InstructionCost RedCost = TTI.getMulAccReductionCost(
5616 true, RdxDesc.getOpcode(), RdxDesc.getRecurrenceType(), VectorTy,
5617 CostKind);
5618
5619 if (RedCost.isValid() && RedCost < MulCost + BaseCost)
5620 return I == RetI ? RedCost : 0;
5621 }
5622 }
5623
// No profitable fused pattern: the root still gets the baseline reduction
// cost; other instructions fall back to ordinary costing.
5624 return I == RetI ? std::optional<InstructionCost>(BaseCost) : std::nullopt;
5625 }
5626
// Returns the cost of memory instruction \p I: computed directly for the
// scalar VF, otherwise the previously recorded widening-decision cost.
// NOTE(review): the extracted listing drops the return-type line (5627) and
// the `PtrTy` declaration (original 5634) — confirm against the real
// source.
5628 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
5629 ElementCount VF) {
5630 // Calculate scalar cost only. Vectorization cost should be ready at this
5631 // moment.
5632 if (VF.isScalar()) {
5633 Type *ValTy = getLoadStoreType(I);
5635 const Align Alignment = getLoadStoreAlignment(I);
5636 unsigned AS = getLoadStoreAddressSpace(I);
5637
5638 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
5639 return TTI.getAddressComputationCost(PtrTy, nullptr, nullptr, CostKind) +
5640 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, CostKind,
5641 OpInfo, I);
5642 }
// Vector VFs: the cost was cached when the widening decision was made.
5643 return getWideningCost(I, VF);
5644 }
5645
// Estimates the insert/extract overhead incurred when instruction \p I is
// scalarized at vector VF: inserts to rebuild the (possibly struct) result
// vector, plus extracts for each operand that must be taken out of a
// vector. Invalid for scalable VFs (no scalable scalarization loop exists).
// NOTE(review): this extracted listing drops many lines (original 5658,
// 5661-5667, 5670-5671, 5678, 5682, 5691, 5695-5697), including the `Cost`
// declaration, the vector-instruction-context setup and the early-return
// conditions for scalar addresses / element stores — confirm against the
// real source.
5647 LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
5648 ElementCount VF) const {
5649
5650 // There is no mechanism yet to create a scalable scalarization loop,
5651 // so this is currently Invalid.
5652 if (VF.isScalable())
5653 return InstructionCost::getInvalid();
5654
// A scalar VF performs no packing/unpacking, so there is no overhead.
5655 if (VF.isScalar())
5656 return 0;
5657
5659 Type *RetTy = toVectorizedTy(I->getType(), VF);
5660 if (!RetTy->isVoidTy() &&
5662
5664 if (isa<LoadInst>(I))
5666 else if (isa<StoreInst>(I))
5668
// Struct-typed results are costed per contained vector type.
5669 for (Type *VectorTy : getContainedTypes(RetTy)) {
5672 /*Insert=*/true, /*Extract=*/false, CostKind,
5673 /*ForPoisonSrc=*/true, {}, VIC);
5674 }
5675 }
5676
5677 // Some targets keep addresses scalar.
5679 return Cost;
5680
5681 // Some targets support efficient element stores.
5683 return Cost;
5684
5685 // Collect operands to consider.
// For calls, only the arguments matter (not the callee operand).
5686 CallInst *CI = dyn_cast<CallInst>(I);
5687 Instruction::op_range Ops = CI ? CI->args() : I->operands();
5688
5689 // Skip operands that do not require extraction/scalarization and do not incur
5690 // any overhead.
5692 for (auto *V : filterExtractingOperands(Ops, VF))
5693 Tys.push_back(maybeVectorizeType(V->getType(), VF));
5694
5698 return Cost + TTI.getOperandsScalarizationOverhead(Tys, CostKind, OperandVIC);
5699 }
5700
// For every memory instruction in the loop, picks the cheapest widening
// strategy at the given vector VF — widen (consecutive), widen-reverse,
// interleave, gather/scatter or scalarize — and records the decision plus
// its cost. Afterwards, if the target does not prefer vectorized
// addressing, it forces address-feeding computations (and loads of
// addresses) to stay scalar. Presumably this is
// LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF)
// — the signature line (original 5701) is dropped from this extracted
// listing, as are many statement heads throughout (5708, 5716, 5744, 5753,
// 5779, 5790, 5795-5797, 5804, 5845, 5849, 5851, 5856, 5872-5877, 5893,
// 5904, 5908, 5910); annotations describe only the visible code.
5702 if (VF.isScalar())
5703 return;
5704 NumPredStores = 0;
5705 for (BasicBlock *BB : TheLoop->blocks()) {
5706 // For each instruction in the old loop.
5707 for (Instruction &I : *BB) {
// NOTE(review): the line defining `Ptr` (original 5708, presumably
// `Value *Ptr = getLoadStorePointerOperand(&I);`) is dropped here; only
// memory instructions (non-null Ptr) are considered below.
5709 if (!Ptr)
5710 continue;
5711
5712 // TODO: We should generate better code and update the cost model for
5713 // predicated uniform stores. Today they are treated as any other
5714 // predicated store (see added test cases in
5715 // invariant-store-vectorization.ll).
5717 NumPredStores++;
5718
5719 if (Legal->isUniformMemOp(I, VF)) {
5720 auto IsLegalToScalarize = [&]() {
5721 if (!VF.isScalable())
5722 // Scalarization of fixed length vectors "just works".
5723 return true;
5724
5725 // We have dedicated lowering for unpredicated uniform loads and
5726 // stores. Note that even with tail folding we know that at least
5727 // one lane is active (i.e. generalized predication is not possible
5728 // here), and the logic below depends on this fact.
5729 if (!foldTailByMasking())
5730 return true;
5731
5732 // For scalable vectors, a uniform memop load is always
5733 // uniform-by-parts and we know how to scalarize that.
5734 if (isa<LoadInst>(I))
5735 return true;
5736
5737 // A uniform store isn't neccessarily uniform-by-part
5738 // and we can't assume scalarization.
5739 auto &SI = cast<StoreInst>(I);
5740 return TheLoop->isLoopInvariant(SI.getValueOperand());
5741 };
5742
5743 const InstructionCost GatherScatterCost =
5745 getGatherScatterCost(&I, VF) : InstructionCost::getInvalid();
5746
5747 // Load: Scalar load + broadcast
5748 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
5749 // FIXME: This cost is a significant under-estimate for tail folded
5750 // memory ops.
5751 const InstructionCost ScalarizationCost =
5752 IsLegalToScalarize() ? getUniformMemOpCost(&I, VF)
5754
5755 // Choose better solution for the current VF, Note that Invalid
5756 // costs compare as maximumal large. If both are invalid, we get
5757 // scalable invalid which signals a failure and a vectorization abort.
5758 if (GatherScatterCost < ScalarizationCost)
5759 setWideningDecision(&I, VF, CM_GatherScatter, GatherScatterCost);
5760 else
5761 setWideningDecision(&I, VF, CM_Scalarize, ScalarizationCost);
5762 continue;
5763 }
5764
5765 // We assume that widening is the best solution when possible.
5766 if (memoryInstructionCanBeWidened(&I, VF)) {
5767 InstructionCost Cost = getConsecutiveMemOpCost(&I, VF);
5768 int ConsecutiveStride = Legal->isConsecutivePtr(
5770 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
5771 "Expected consecutive stride.");
5772 InstWidening Decision =
5773 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
5774 setWideningDecision(&I, VF, Decision, Cost);
5775 continue;
5776 }
5777
5778 // Choose between Interleaving, Gather/Scatter or Scalarization.
5780 unsigned NumAccesses = 1;
5781 if (isAccessInterleaved(&I)) {
5782 const auto *Group = getInterleavedAccessGroup(&I);
5783 assert(Group && "Fail to get an interleaved access group.");
5784
5785 // Make one decision for the whole group.
5786 if (getWideningDecision(&I, VF) != CM_Unknown)
5787 continue;
5788
// The group cost is compared against NumAccesses copies of the per-member
// gather/scatter and scalarization costs.
5789 NumAccesses = Group->getNumMembers();
5791 InterleaveCost = getInterleaveGroupCost(&I, VF);
5792 }
5793
5794 InstructionCost GatherScatterCost =
5796 ? getGatherScatterCost(&I, VF) * NumAccesses
5798
5799 InstructionCost ScalarizationCost =
5800 getMemInstScalarizationCost(&I, VF) * NumAccesses;
5801
5802 // Choose better solution for the current VF,
5803 // write down this decision and use it during vectorization.
5805 InstWidening Decision;
5806 if (InterleaveCost <= GatherScatterCost &&
5807 InterleaveCost < ScalarizationCost) {
5808 Decision = CM_Interleave;
5809 Cost = InterleaveCost;
5810 } else if (GatherScatterCost < ScalarizationCost) {
5811 Decision = CM_GatherScatter;
5812 Cost = GatherScatterCost;
5813 } else {
5814 Decision = CM_Scalarize;
5815 Cost = ScalarizationCost;
5816 }
5817 // If the instructions belongs to an interleave group, the whole group
5818 // receives the same decision. The whole group receives the cost, but
5819 // the cost will actually be assigned to one instruction.
5820 if (const auto *Group = getInterleavedAccessGroup(&I)) {
5821 if (Decision == CM_Scalarize) {
// Scalarization is per-member: each member gets its own scalarization cost.
5822 for (unsigned Idx = 0; Idx < Group->getFactor(); ++Idx) {
5823 if (auto *I = Group->getMember(Idx)) {
5824 setWideningDecision(I, VF, Decision,
5825 getMemInstScalarizationCost(I, VF));
5826 }
5827 }
5828 } else {
5829 setWideningDecision(Group, VF, Decision, Cost);
5830 }
5831 } else
5832 setWideningDecision(&I, VF, Decision, Cost);
5833 }
5834 }
5835
5836 // Make sure that any load of address and any other address computation
5837 // remains scalar unless there is gather/scatter support. This avoids
5838 // inevitable extracts into address registers, and also has the benefit of
5839 // activating LSR more, since that pass can't optimize vectorized
5840 // addresses.
5841 if (TTI.prefersVectorizedAddressing())
5842 return;
5843
5844 // Start with all scalar pointer uses.
// NOTE(review): the declaration of `AddrDefs` (original line 5845,
// presumably a SmallPtrSet of Instruction*) is dropped from this listing.
5846 for (BasicBlock *BB : TheLoop->blocks())
5847 for (Instruction &I : *BB) {
5848 Instruction *PtrDef =
5850 if (PtrDef && TheLoop->contains(PtrDef) &&
5852 AddrDefs.insert(PtrDef);
5853 }
5854
5855 // Add all instructions used to generate the addresses.
// Transitive closure: walk operands of every address definition, staying
// inside the loop and stopping at PHIs.
5857 append_range(Worklist, AddrDefs);
5858 while (!Worklist.empty()) {
5859 Instruction *I = Worklist.pop_back_val();
5860 for (auto &Op : I->operands())
5861 if (auto *InstOp = dyn_cast<Instruction>(Op))
5862 if (TheLoop->contains(InstOp) && !isa<PHINode>(InstOp) &&
5863 AddrDefs.insert(InstOp).second)
5864 Worklist.push_back(InstOp);
5865 }
5866
5867 auto UpdateMemOpUserCost = [this, VF](LoadInst *LI) {
5868 // If there are direct memory op users of the newly scalarized load,
5869 // their cost may have changed because there's no scalarization
5870 // overhead for the operand. Update it.
// NOTE(review): the filter conditions and the setWideningDecision head
// inside this loop (original 5872-5877) are dropped from this listing.
5871 for (User *U : LI->users()) {
5873 continue;
5875 continue;
5878 getMemInstScalarizationCost(cast<Instruction>(U), VF));
5879 }
5880 };
5881 for (auto *I : AddrDefs) {
5882 if (isa<LoadInst>(I)) {
5883 // Setting the desired widening decision should ideally be handled in
5884 // by cost functions, but since this involves the task of finding out
5885 // if the loaded register is involved in an address computation, it is
5886 // instead changed here when we know this is the case.
5887 InstWidening Decision = getWideningDecision(I, VF);
5888 if (!isPredicatedInst(I) &&
5889 (Decision == CM_Widen || Decision == CM_Widen_Reverse ||
5890 (!Legal->isUniformMemOp(*I, VF) && Decision == CM_Scalarize))) {
5891 // Scalarize a widened load of address or update the cost of a scalar
5892 // load of an address.
5894 I, VF, CM_Scalarize,
// Cost model: VF copies of the scalar memory-op cost (no scalarization
// overhead, since the loaded value feeds an address).
5895 (VF.getKnownMinValue() *
5896 getMemoryInstructionCost(I, ElementCount::getFixed(1))));
5897 UpdateMemOpUserCost(cast<LoadInst>(I));
5898 } else if (const auto *Group = getInterleavedAccessGroup(I)) {
5899 // Scalarize all members of this interleaved group when any member
5900 // is used as an address. The address-used load skips scalarization
5901 // overhead, other members include it.
5902 for (unsigned Idx = 0; Idx < Group->getFactor(); ++Idx) {
5903 if (Instruction *Member = Group->getMember(Idx)) {
5905 AddrDefs.contains(Member)
5906 ? (VF.getKnownMinValue() *
5907 getMemoryInstructionCost(Member,
5909 : getMemInstScalarizationCost(Member, VF);
5911 UpdateMemOpUserCost(cast<LoadInst>(Member));
5912 }
5913 }
5914 }
5915 } else {
5916 // Cannot scalarize fixed-order recurrence phis at the moment.
5917 if (isa<PHINode>(I) && Legal->isFixedOrderRecurrence(cast<PHINode>(I)))
5918 continue;
5919
5920 // Make sure I gets scalarized and a cost estimate without
5921 // scalarization overhead.
5922 ForcedScalars[VF].insert(I);
5923 }
5924 }
5925 }
5926
5928 assert(!VF.isScalar() &&
5929 "Trying to set a vectorization decision for a scalar VF");
5930
5931 auto ForcedScalar = ForcedScalars.find(VF);
5932 for (BasicBlock *BB : TheLoop->blocks()) {
5933 // For each instruction in the old loop.
5934 for (Instruction &I : *BB) {
5936
5937 if (!CI)
5938 continue;
5939
5943 Function *ScalarFunc = CI->getCalledFunction();
5944 Type *ScalarRetTy = CI->getType();
5945 SmallVector<Type *, 4> Tys, ScalarTys;
5946 for (auto &ArgOp : CI->args())
5947 ScalarTys.push_back(ArgOp->getType());
5948
5949 // Estimate cost of scalarized vector call. The source operands are
5950 // assumed to be vectors, so we need to extract individual elements from
5951 // there, execute VF scalar calls, and then gather the result into the
5952 // vector return value.
5953 if (VF.isFixed()) {
5954 InstructionCost ScalarCallCost =
5955 TTI.getCallInstrCost(ScalarFunc, ScalarRetTy, ScalarTys, CostKind);
5956
5957 // Compute costs of unpacking argument values for the scalar calls and
5958 // packing the return values to a vector.
5959 InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF);
5960 ScalarCost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
5961 } else {
5962 // There is no point attempting to calculate the scalar cost for a
5963 // scalable VF as we know it will be Invalid.
5965 "Unexpected valid cost for scalarizing scalable vectors");
5966 ScalarCost = InstructionCost::getInvalid();
5967 }
5968
5969 // Honor ForcedScalars and UniformAfterVectorization decisions.
5970 // TODO: For calls, it might still be more profitable to widen. Use
5971 // VPlan-based cost model to compare different options.
5972 if (VF.isVector() && ((ForcedScalar != ForcedScalars.end() &&
5973 ForcedScalar->second.contains(CI)) ||
5974 isUniformAfterVectorization(CI, VF))) {
5975 setCallWideningDecision(CI, VF, CM_Scalarize, nullptr,
5976 Intrinsic::not_intrinsic, std::nullopt,
5977 ScalarCost);
5978 continue;
5979 }
5980
5981 bool MaskRequired = Legal->isMaskRequired(CI);
5982 // Compute corresponding vector type for return value and arguments.
5983 Type *RetTy = toVectorizedTy(ScalarRetTy, VF);
5984 for (Type *ScalarTy : ScalarTys)
5985 Tys.push_back(toVectorizedTy(ScalarTy, VF));
5986
5987 // An in-loop reduction using an fmuladd intrinsic is a special case;
5988 // we don't want the normal cost for that intrinsic.
5990 if (auto RedCost = getReductionPatternCost(CI, VF, RetTy)) {
5993 std::nullopt, *RedCost);
5994 continue;
5995 }
5996
5997 // Find the cost of vectorizing the call, if we can find a suitable
5998 // vector variant of the function.
5999 VFInfo FuncInfo;
6000 Function *VecFunc = nullptr;
6001 // Search through any available variants for one we can use at this VF.
6002 for (VFInfo &Info : VFDatabase::getMappings(*CI)) {
6003 // Must match requested VF.
6004 if (Info.Shape.VF != VF)
6005 continue;
6006
6007 // Must take a mask argument if one is required
6008 if (MaskRequired && !Info.isMasked())
6009 continue;
6010
6011 // Check that all parameter kinds are supported
6012 bool ParamsOk = true;
6013 for (VFParameter Param : Info.Shape.Parameters) {
6014 switch (Param.ParamKind) {
6016 break;
6018 Value *ScalarParam = CI->getArgOperand(Param.ParamPos);
6019 // Make sure the scalar parameter in the loop is invariant.
6020 if (!PSE.getSE()->isLoopInvariant(PSE.getSCEV(ScalarParam),
6021 TheLoop))
6022 ParamsOk = false;
6023 break;
6024 }
6026 Value *ScalarParam = CI->getArgOperand(Param.ParamPos);
6027 // Find the stride for the scalar parameter in this loop and see if
6028 // it matches the stride for the variant.
6029 // TODO: do we need to figure out the cost of an extract to get the
6030 // first lane? Or do we hope that it will be folded away?
6031 ScalarEvolution *SE = PSE.getSE();
6032 if (!match(SE->getSCEV(ScalarParam),
6034 m_SCEV(), m_scev_SpecificSInt(Param.LinearStepOrPos),
6036 ParamsOk = false;
6037 break;
6038 }
6040 break;
6041 default:
6042 ParamsOk = false;
6043 break;
6044 }
6045 }
6046
6047 if (!ParamsOk)
6048 continue;
6049
6050 // Found a suitable candidate, stop here.
6051 VecFunc = CI->getModule()->getFunction(Info.VectorName);
6052 FuncInfo = Info;
6053 break;
6054 }
6055
6056 if (TLI && VecFunc && !CI->isNoBuiltin())
6057 VectorCost = TTI.getCallInstrCost(nullptr, RetTy, Tys, CostKind);
6058
6059 // Find the cost of an intrinsic; some targets may have instructions that
6060 // perform the operation without needing an actual call.
6062 if (IID != Intrinsic::not_intrinsic)
6064
6065 InstructionCost Cost = ScalarCost;
6066 InstWidening Decision = CM_Scalarize;
6067
6068 if (VectorCost.isValid() && VectorCost <= Cost) {
6069 Cost = VectorCost;
6070 Decision = CM_VectorCall;
6071 }
6072
6073 if (IntrinsicCost.isValid() && IntrinsicCost <= Cost) {
6075 Decision = CM_IntrinsicCall;
6076 }
6077
6078 setCallWideningDecision(CI, VF, Decision, VecFunc, IID,
6080 }
6081 }
6082}
6083
// Decide whether Op should be treated as loop-invariant for cost-modeling
// purposes. Requires Legal to already classify Op as invariant, and
// additionally requires that Op is not (and does not transitively depend on)
// a predicated instruction inside the loop, since such values are not
// trivially hoistable.
// NOTE(review): the defining signature line is elided in this extract;
// presumably LoopVectorizationCostModel::shouldConsiderInvariant(Value *Op).
6085 if (!Legal->isInvariant(Op))
6086 return false;
6087 // Consider Op invariant, if it or its operands aren't predicated
6088 // instruction in the loop. In that case, it is not trivially hoistable.
6089 auto *OpI = dyn_cast<Instruction>(Op);
// Invariant if Op is not an instruction contained in the loop at all, or if
// it is unpredicated, is not a header phi, and all of its operands are
// themselves considered invariant (checked recursively).
6090 return !OpI || !TheLoop->contains(OpI) ||
6091 (!isPredicatedInst(OpI) &&
6092 (!isa<PHINode>(OpI) || OpI->getParent() != TheLoop->getHeader()) &&
6093 all_of(OpI->operands(),
6094 [this](Value *Op) { return shouldConsiderInvariant(Op); }));
6095}
6096
// Legacy cost model: return the estimated cost of instruction I when the
// loop is vectorized with factor VF. Dispatches on opcode; special-cases
// scalarized/uniform instructions, reductions, histograms, truncations of
// inductions, and memory operations.
// NOTE(review): this extract elides several original source lines (the
// function signature, some guards/asserts, case labels, and sub-expressions);
// the code below is reproduced exactly as extracted — do not assume the
// elided lines are absent in the real file.
6099 ElementCount VF) {
6100 // If we know that this instruction will remain uniform, check the cost of
6101 // the scalar version.
// (An elided guard above presumably tests isUniformAfterVectorization(I, VF)
// before collapsing VF to 1 — TODO confirm against the full source.)
6103 VF = ElementCount::getFixed(1);
6104
6105 if (VF.isVector() && isProfitableToScalarize(I, VF))
6106 return InstsToScalarize[VF][I];
6107
6108 // Forced scalars do not have any scalarization overhead.
6109 auto ForcedScalar = ForcedScalars.find(VF);
6110 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
6111 auto InstSet = ForcedScalar->second;
6112 if (InstSet.count(I))
// Forced-scalar cost: the (elided) scalar cost expression is multiplied by
// the number of lanes, i.e. VF.getKnownMinValue().
6114 VF.getKnownMinValue();
6115 }
6116
6117 Type *RetTy = I->getType();
// Narrow the result type when a minimal bit-width was recorded for I
// (the enclosing condition line is elided in this extract).
6119 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
6120 auto *SE = PSE.getSE();
6121
6122 Type *VectorTy;
6123 if (isScalarAfterVectorization(I, VF)) {
// Debug-only helper: verifies that exactly one scalar copy of I remains,
// i.e. neither I nor any of its users is marked for scalarization at VF.
6124 [[maybe_unused]] auto HasSingleCopyAfterVectorization =
6125 [this](Instruction *I, ElementCount VF) -> bool {
6126 if (VF.isScalar())
6127 return true;
6128
6129 auto Scalarized = InstsToScalarize.find(VF);
6130 assert(Scalarized != InstsToScalarize.end() &&
6131 "VF not yet analyzed for scalarization profitability");
6132 return !Scalarized->second.count(I) &&
6133 llvm::all_of(I->users(), [&](User *U) {
6134 auto *UI = cast<Instruction>(U);
6135 return !Scalarized->second.count(UI);
6136 });
6137 };
6138
6139 // With the exception of GEPs and PHIs, after scalarization there should
6140 // only be one copy of the instruction generated in the loop. This is
6141 // because the VF is either 1, or any instructions that need scalarizing
6142 // have already been dealt with by the time we get here. As a result,
6143 // it means we don't have to multiply the instruction cost by VF.
6144 assert(I->getOpcode() == Instruction::GetElementPtr ||
6145 I->getOpcode() == Instruction::PHI ||
6146 (I->getOpcode() == Instruction::BitCast &&
6147 I->getType()->isPointerTy()) ||
6148 HasSingleCopyAfterVectorization(I, VF));
6149 VectorTy = RetTy;
6150 } else
6151 VectorTy = toVectorizedTy(RetTy, VF);
6152
// Bail out (the return statement is elided here) when the target cannot
// legalize the vector type into a finite number of parts.
6153 if (VF.isVector() && VectorTy->isVectorTy() &&
6154 !TTI.getNumberOfParts(VectorTy))
6156
6157 // TODO: We need to estimate the cost of intrinsic calls.
6158 switch (I->getOpcode()) {
6159 case Instruction::GetElementPtr:
6160 // We mark this instruction as zero-cost because the cost of GEPs in
6161 // vectorized code depends on whether the corresponding memory instruction
6162 // is scalarized or not. Therefore, we handle GEPs with the memory
6163 // instruction cost.
6164 return 0;
6165 case Instruction::Br: {
6166 // In cases of scalarized and predicated instructions, there will be VF
6167 // predicated blocks in the vectorized loop. Each branch around these
6168 // blocks requires also an extract of its vector compare i1 element.
6169 // Note that the conditional branch from the loop latch will be replaced by
6170 // a single branch controlling the loop, so there is no extra overhead from
6171 // scalarization.
6172 bool ScalarPredicatedBB = false;
6174 if (VF.isVector() && BI->isConditional() &&
6175 (PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(0)) ||
6176 PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(1))) &&
6177 BI->getParent() != TheLoop->getLoopLatch())
6178 ScalarPredicatedBB = true;
6179
6180 if (ScalarPredicatedBB) {
6181 // Not possible to scalarize scalable vector with predicated instructions.
6182 if (VF.isScalable())
// (Elided line presumably returns an invalid cost for the scalable case.)
6184 // Return cost for branches around scalarized and predicated blocks.
6185 auto *VecI1Ty =
// Cost = extracting each i1 lane of the compare + one branch per lane.
6187 return (
6188 TTI.getScalarizationOverhead(
6189 VecI1Ty, APInt::getAllOnes(VF.getFixedValue()),
6190 /*Insert*/ false, /*Extract*/ true, CostKind) +
6191 (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue()));
6192 }
6193
6194 if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
6195 // The back-edge branch will remain, as will all scalar branches.
6196 return TTI.getCFInstrCost(Instruction::Br, CostKind);
6197
6198 // This branch will be eliminated by if-conversion.
6199 return 0;
6200 // Note: We currently assume zero cost for an unconditional branch inside
6201 // a predicated block since it will become a fall-through, although we
6202 // may decide in the future to call TTI for all branches.
6203 }
6204 case Instruction::Switch: {
6205 if (VF.isScalar())
6206 return TTI.getCFInstrCost(Instruction::Switch, CostKind);
// Vectorized switch: modeled as one vector compare per case.
6207 auto *Switch = cast<SwitchInst>(I);
6208 return Switch->getNumCases() *
6209 TTI.getCmpSelInstrCost(
6210 Instruction::ICmp,
6211 toVectorTy(Switch->getCondition()->getType(), VF),
6212 toVectorTy(Type::getInt1Ty(I->getContext()), VF),
6214 }
6215 case Instruction::PHI: {
6216 auto *Phi = cast<PHINode>(I);
6217
6218 // First-order recurrences are replaced by vector shuffles inside the loop.
6219 if (VF.isVector() && Legal->isFixedOrderRecurrence(Phi)) {
// Splice mask: take the last lane of the previous iteration followed by
// the first VF-1 lanes of the current one (mask declaration elided).
6221 std::iota(Mask.begin(), Mask.end(), VF.getKnownMinValue() - 1);
6222 return TTI.getShuffleCost(TargetTransformInfo::SK_Splice,
6223 cast<VectorType>(VectorTy),
6224 cast<VectorType>(VectorTy), Mask, CostKind,
6225 VF.getKnownMinValue() - 1);
6226 }
6227
6228 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
6229 // converted into select instructions. We require N - 1 selects per phi
6230 // node, where N is the number of incoming values.
6231 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) {
6232 Type *ResultTy = Phi->getType();
6233
6234 // All instructions in an Any-of reduction chain are narrowed to bool.
6235 // Check if that is the case for this phi node.
6236 auto *HeaderUser = cast_if_present<PHINode>(
6237 find_singleton<User>(Phi->users(), [this](User *U, bool) -> User * {
6238 auto *Phi = dyn_cast<PHINode>(U);
6239 if (Phi && Phi->getParent() == TheLoop->getHeader())
6240 return Phi;
6241 return nullptr;
6242 }));
6243 if (HeaderUser) {
6244 auto &ReductionVars = Legal->getReductionVars();
6245 auto Iter = ReductionVars.find(HeaderUser);
6246 if (Iter != ReductionVars.end() &&
6248 Iter->second.getRecurrenceKind()))
6249 ResultTy = Type::getInt1Ty(Phi->getContext());
6250 }
6251 return (Phi->getNumIncomingValues() - 1) *
6252 TTI.getCmpSelInstrCost(
6253 Instruction::Select, toVectorTy(ResultTy, VF),
6254 toVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
6256 }
6257
6258 // When tail folding with EVL, if the phi is part of an out of loop
6259 // reduction then it will be transformed into a wide vp_merge.
6260 if (VF.isVector() && foldTailWithEVL() &&
6261 Legal->getReductionVars().contains(Phi) && !isInLoopReduction(Phi)) {
6263 Intrinsic::vp_merge, toVectorTy(Phi->getType(), VF),
6264 {toVectorTy(Type::getInt1Ty(Phi->getContext()), VF)});
6265 return TTI.getIntrinsicInstrCost(ICA, CostKind);
6266 }
6267
6268 return TTI.getCFInstrCost(Instruction::PHI, CostKind);
6269 }
6270 case Instruction::UDiv:
6271 case Instruction::SDiv:
6272 case Instruction::URem:
6273 case Instruction::SRem:
// Predicated div/rem: pick the cheaper of scalarizing with predication vs.
// selecting a safe divisor and executing the vector op unconditionally.
6274 if (VF.isVector() && isPredicatedInst(I)) {
6275 const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
6276 return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost) ?
6277 ScalarCost : SafeDivisorCost;
6278 }
6279 // We've proven all lanes safe to speculate, fall through.
6280 [[fallthrough]];
6281 case Instruction::Add:
6282 case Instruction::Sub: {
6283 auto Info = Legal->getHistogramInfo(I);
6284 if (Info && VF.isVector()) {
6285 const HistogramInfo *HGram = Info.value();
6286 // Assume that a non-constant update value (or a constant != 1) requires
6287 // a multiply, and add that into the cost.
6289 ConstantInt *RHS = dyn_cast<ConstantInt>(I->getOperand(1));
6290 if (!RHS || RHS->getZExtValue() != 1)
6291 MulCost =
6292 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6293
6294 // Find the cost of the histogram operation itself.
6295 Type *PtrTy = VectorType::get(HGram->Load->getPointerOperandType(), VF);
6296 Type *ScalarTy = I->getType();
6297 Type *MaskTy = VectorType::get(Type::getInt1Ty(I->getContext()), VF);
6298 IntrinsicCostAttributes ICA(Intrinsic::experimental_vector_histogram_add,
6299 Type::getVoidTy(I->getContext()),
6300 {PtrTy, ScalarTy, MaskTy});
6301
6302 // Add the costs together with the add/sub operation.
6303 return TTI.getIntrinsicInstrCost(ICA, CostKind) + MulCost +
6304 TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, CostKind);
6305 }
6306 [[fallthrough]];
6307 }
6308 case Instruction::FAdd:
6309 case Instruction::FSub:
6310 case Instruction::Mul:
6311 case Instruction::FMul:
6312 case Instruction::FDiv:
6313 case Instruction::FRem:
6314 case Instruction::Shl:
6315 case Instruction::LShr:
6316 case Instruction::AShr:
6317 case Instruction::And:
6318 case Instruction::Or:
6319 case Instruction::Xor: {
6320 // If we're speculating on the stride being 1, the multiplication may
6321 // fold away. We can generalize this for all operations using the notion
6322 // of neutral elements. (TODO)
6323 if (I->getOpcode() == Instruction::Mul &&
6324 ((TheLoop->isLoopInvariant(I->getOperand(0)) &&
6325 PSE.getSCEV(I->getOperand(0))->isOne()) ||
6326 (TheLoop->isLoopInvariant(I->getOperand(1)) &&
6327 PSE.getSCEV(I->getOperand(1))->isOne())))
6328 return 0;
6329
6330 // Detect reduction patterns
6331 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy))
6332 return *RedCost;
6333
6334 // Certain instructions can be cheaper to vectorize if they have a constant
6335 // second vector operand. One example of this are shifts on x86.
6336 Value *Op2 = I->getOperand(1);
6337 if (!isa<Constant>(Op2) && TheLoop->isLoopInvariant(Op2) &&
6338 PSE.getSE()->isSCEVable(Op2->getType()) &&
6339 isa<SCEVConstant>(PSE.getSCEV(Op2))) {
6340 Op2 = cast<SCEVConstant>(PSE.getSCEV(Op2))->getValue();
6341 }
6342 auto Op2Info = TTI.getOperandInfo(Op2);
// (Elided lines presumably upgrade Op2Info for uniform operands.)
6343 if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
6346
6347 SmallVector<const Value *, 4> Operands(I->operand_values());
6348 return TTI.getArithmeticInstrCost(
6349 I->getOpcode(), VectorTy, CostKind,
6350 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
6351 Op2Info, Operands, I, TLI);
6352 }
6353 case Instruction::FNeg: {
6354 return TTI.getArithmeticInstrCost(
6355 I->getOpcode(), VectorTy, CostKind,
6356 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
6357 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
6358 I->getOperand(0), I);
6359 }
6360 case Instruction::Select: {
// Loop-invariant conditions stay scalar; otherwise the condition is widened.
6362 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
6363 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
6364
6365 const Value *Op0, *Op1;
6366 using namespace llvm::PatternMatch;
6367 if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) ||
6368 match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) {
6369 // select x, y, false --> x & y
6370 // select x, true, y --> x | y
6371 const auto [Op1VK, Op1VP] = TTI::getOperandInfo(Op0);
6372 const auto [Op2VK, Op2VP] = TTI::getOperandInfo(Op1);
6373 assert(Op0->getType()->getScalarSizeInBits() == 1 &&
6374 Op1->getType()->getScalarSizeInBits() == 1);
6375
6376 return TTI.getArithmeticInstrCost(
6377 match(I, m_LogicalOr()) ? Instruction::Or : Instruction::And,
6378 VectorTy, CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, {Op0, Op1}, I);
6379 }
6380
6381 Type *CondTy = SI->getCondition()->getType();
6382 if (!ScalarCond)
6383 CondTy = VectorType::get(CondTy, VF);
6384
6386 if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition()))
6387 Pred = Cmp->getPredicate();
6388 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred,
6389 CostKind, {TTI::OK_AnyValue, TTI::OP_None},
6390 {TTI::OK_AnyValue, TTI::OP_None}, I);
6391 }
6392 case Instruction::ICmp:
6393 case Instruction::FCmp: {
6394 Type *ValTy = I->getOperand(0)->getType();
6395
// (An elided guard presumably enters this scope only when the compare is
// marked for bit-width minimization — TODO confirm.)
6397 [[maybe_unused]] Instruction *Op0AsInstruction =
6398 dyn_cast<Instruction>(I->getOperand(0));
6399 assert((!canTruncateToMinimalBitwidth(Op0AsInstruction, VF) ||
6400 MinBWs[I] == MinBWs[Op0AsInstruction]) &&
6401 "if both the operand and the compare are marked for "
6402 "truncation, they must have the same bitwidth");
6403 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[I]);
6404 }
6405
6406 VectorTy = toVectorTy(ValTy, VF);
6407 return TTI.getCmpSelInstrCost(
6408 I->getOpcode(), VectorTy, CmpInst::makeCmpResultType(VectorTy),
6409 cast<CmpInst>(I)->getPredicate(), CostKind,
6410 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None}, I);
6411 }
6412 case Instruction::Store:
6413 case Instruction::Load: {
6414 ElementCount Width = VF;
6415 if (Width.isVector()) {
6416 InstWidening Decision = getWideningDecision(I, Width);
6417 assert(Decision != CM_Unknown &&
6418 "CM decision should be taken at this point");
// (Elided lines here; a scalarized access is costed at VF 1 below.)
6421 if (Decision == CM_Scalarize)
6422 Width = ElementCount::getFixed(1);
6423 }
6424 VectorTy = toVectorTy(getLoadStoreType(I), Width);
6425 return getMemoryInstructionCost(I, VF);
6426 }
6427 case Instruction::BitCast:
6428 if (I->getType()->isPointerTy())
6429 return 0;
6430 [[fallthrough]];
6431 case Instruction::ZExt:
6432 case Instruction::SExt:
6433 case Instruction::FPToUI:
6434 case Instruction::FPToSI:
6435 case Instruction::FPExt:
6436 case Instruction::PtrToInt:
6437 case Instruction::IntToPtr:
6438 case Instruction::SIToFP:
6439 case Instruction::UIToFP:
6440 case Instruction::Trunc:
6441 case Instruction::FPTrunc: {
6442 // Computes the CastContextHint from a Load/Store instruction.
6443 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
6445 "Expected a load or a store!");
6446
6447 if (VF.isScalar() || !TheLoop->contains(I))
6449
// The switch maps each widening decision to a CastContextHint; most of its
// case labels are elided in this extract.
6450 switch (getWideningDecision(I, VF)) {
6462 llvm_unreachable("Instr did not go through cost modelling?");
6465 llvm_unreachable_internal("Instr has invalid widening decision");
6466 }
6467
6468 llvm_unreachable("Unhandled case!");
6469 };
6470
6471 unsigned Opcode = I->getOpcode();
6473 // For Trunc, the context is the only user, which must be a StoreInst.
6474 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
6475 if (I->hasOneUse())
6476 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin()))
6477 CCH = ComputeCCH(Store);
6478 }
6479 // For Z/Sext, the context is the operand, which must be a LoadInst.
6480 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
6481 Opcode == Instruction::FPExt) {
6482 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0)))
6483 CCH = ComputeCCH(Load);
6484 }
6485
6486 // We optimize the truncation of induction variables having constant
6487 // integer steps. The cost of these truncations is the same as the scalar
6488 // operation.
6489 if (isOptimizableIVTruncate(I, VF)) {
6490 auto *Trunc = cast<TruncInst>(I);
6491 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
6492 Trunc->getSrcTy(), CCH, CostKind, Trunc);
6493 }
6494
6495 // Detect reduction patterns
6496 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy))
6497 return *RedCost;
6498
6499 Type *SrcScalarTy = I->getOperand(0)->getType();
6500 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
6501 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
6502 SrcScalarTy =
6503 IntegerType::get(SrcScalarTy->getContext(), MinBWs[Op0AsInstruction]);
6504 Type *SrcVecTy =
6505 VectorTy->isVectorTy() ? toVectorTy(SrcScalarTy, VF) : SrcScalarTy;
6506
6508 // If the result type is <= the source type, there will be no extend
6509 // after truncating the users to the minimal required bitwidth.
6510 if (VectorTy->getScalarSizeInBits() <= SrcVecTy->getScalarSizeInBits() &&
6511 (I->getOpcode() == Instruction::ZExt ||
6512 I->getOpcode() == Instruction::SExt))
6513 return 0;
6514 }
6515
6516 return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
6517 }
6518 case Instruction::Call:
6519 return getVectorCallCost(cast<CallInst>(I), VF);
6520 case Instruction::ExtractValue:
6521 return TTI.getInstructionCost(I, CostKind);
6522 case Instruction::Alloca:
6523 // We cannot easily widen alloca to a scalable alloca, as
6524 // the result would need to be a vector of pointers.
6525 if (VF.isScalable())
// (Elided line presumably returns an invalid cost for the scalable case.)
6527 return TTI.getArithmeticInstrCost(Instruction::Mul, RetTy, CostKind);
6528 default:
6529 // This opcode is unknown. Assume that it is the same as 'mul'.
6530 return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6531 } // end of switch.
6532}
6533
// Collect instructions whose cost should be ignored because they become dead
// or are folded away after vectorization. ValuesToIgnore applies to both the
// scalar and vector versions of the loop; VecValuesToIgnore only to vector
// versions.
// NOTE(review): the defining signature line is elided in this extract;
// presumably LoopVectorizationCostModel::collectValuesToIgnore(). A few
// interior lines are elided as well; code is reproduced as extracted.
6535 // Ignore ephemeral values.
6537
6538 SmallVector<Value *, 4> DeadInterleavePointerOps;
6540
6541 // If a scalar epilogue is required, users outside the loop won't use
6542 // live-outs from the vector loop but from the scalar epilogue. Ignore them if
6543 // that is the case.
6544 bool RequiresScalarEpilogue = requiresScalarEpilogue(true);
6545 auto IsLiveOutDead = [this, RequiresScalarEpilogue](User *U) {
6546 return RequiresScalarEpilogue &&
6547 !TheLoop->contains(cast<Instruction>(U)->getParent());
6548 };
6549
// Visit blocks in reverse post-order, instructions bottom-up, so that users
// are classified before their operands.
6551 DFS.perform(LI);
6552 for (BasicBlock *BB : reverse(make_range(DFS.beginRPO(), DFS.endRPO())))
6553 for (Instruction &I : reverse(*BB)) {
6554 if (VecValuesToIgnore.contains(&I) || ValuesToIgnore.contains(&I))
6555 continue;
6556
6557 // Add instructions that would be trivially dead and are only used by
6558 // values already ignored to DeadOps to seed worklist.
6560 all_of(I.users(), [this, IsLiveOutDead](User *U) {
6561 return VecValuesToIgnore.contains(U) ||
6562 ValuesToIgnore.contains(U) || IsLiveOutDead(U);
6563 }))
6564 DeadOps.push_back(&I);
6565
6566 // For interleave groups, we only create a pointer for the start of the
6567 // interleave group. Queue up addresses of group members except the insert
6568 // position for further processing.
6569 if (isAccessInterleaved(&I)) {
6570 auto *Group = getInterleavedAccessGroup(&I);
6571 if (Group->getInsertPos() == &I)
6572 continue;
6573 Value *PointerOp = getLoadStorePointerOperand(&I);
6574 DeadInterleavePointerOps.push_back(PointerOp);
6575 }
6576
6577 // Queue branches for analysis. They are dead, if their successors only
6578 // contain dead instructions.
6579 if (auto *Br = dyn_cast<BranchInst>(&I)) {
6580 if (Br->isConditional())
6581 DeadOps.push_back(&I);
6582 }
6583 }
6584
6585 // Mark ops feeding interleave group members as free, if they are only used
6586 // by other dead computations.
// Worklist loop: indices, not iterators, because the vector grows while we
// iterate (append_range below).
6587 for (unsigned I = 0; I != DeadInterleavePointerOps.size(); ++I) {
6588 auto *Op = dyn_cast<Instruction>(DeadInterleavePointerOps[I]);
6589 if (!Op || !TheLoop->contains(Op) || any_of(Op->users(), [this](User *U) {
6590 Instruction *UI = cast<Instruction>(U);
6591 return !VecValuesToIgnore.contains(U) &&
6592 (!isAccessInterleaved(UI) ||
6593 getInterleavedAccessGroup(UI)->getInsertPos() == UI);
6594 }))
6595 continue;
6596 VecValuesToIgnore.insert(Op);
6597 append_range(DeadInterleavePointerOps, Op->operands());
6598 }
6599
6600 // Mark ops that would be trivially dead and are only used by ignored
6601 // instructions as free.
6602 BasicBlock *Header = TheLoop->getHeader();
6603
6604 // Returns true if the block contains only dead instructions. Such blocks will
6605 // be removed by VPlan-to-VPlan transforms and won't be considered by the
6606 // VPlan-based cost model, so skip them in the legacy cost-model as well.
6607 auto IsEmptyBlock = [this](BasicBlock *BB) {
6608 return all_of(*BB, [this](Instruction &I) {
6609 return ValuesToIgnore.contains(&I) || VecValuesToIgnore.contains(&I) ||
6610 (isa<BranchInst>(&I) && !cast<BranchInst>(&I)->isConditional());
6611 });
6612 };
6613 for (unsigned I = 0; I != DeadOps.size(); ++I) {
6614 auto *Op = dyn_cast<Instruction>(DeadOps[I]);
6615
6616 // Check if the branch should be considered dead.
6617 if (auto *Br = dyn_cast_or_null<BranchInst>(Op)) {
6618 BasicBlock *ThenBB = Br->getSuccessor(0);
6619 BasicBlock *ElseBB = Br->getSuccessor(1);
6620 // Don't considers branches leaving the loop for simplification.
6621 if (!TheLoop->contains(ThenBB) || !TheLoop->contains(ElseBB))
6622 continue;
6623 bool ThenEmpty = IsEmptyBlock(ThenBB);
6624 bool ElseEmpty = IsEmptyBlock(ElseBB);
// Branch is dead when both successors are empty, or one is empty and falls
// straight through to the other without intervening phis.
6625 if ((ThenEmpty && ElseEmpty) ||
6626 (ThenEmpty && ThenBB->getSingleSuccessor() == ElseBB &&
6627 ElseBB->phis().empty()) ||
6628 (ElseEmpty && ElseBB->getSingleSuccessor() == ThenBB &&
6629 ThenBB->phis().empty())) {
6630 VecValuesToIgnore.insert(Br);
6631 DeadOps.push_back(Br->getCondition());
6632 }
6633 continue;
6634 }
6635
6636 // Skip any op that shouldn't be considered dead.
6637 if (!Op || !TheLoop->contains(Op) ||
6638 (isa<PHINode>(Op) && Op->getParent() == Header) ||
6640 any_of(Op->users(), [this, IsLiveOutDead](User *U) {
6641 return !VecValuesToIgnore.contains(U) &&
6642 !ValuesToIgnore.contains(U) && !IsLiveOutDead(U);
6643 }))
6644 continue;
6645
6646 // If all of Op's users are in ValuesToIgnore, add it to ValuesToIgnore
6647 // which applies for both scalar and vector versions. Otherwise it is only
6648 // dead in vector versions, so only add it to VecValuesToIgnore.
6649 if (all_of(Op->users(),
6650 [this](User *U) { return ValuesToIgnore.contains(U); }))
6651 ValuesToIgnore.insert(Op);
6652
6653 VecValuesToIgnore.insert(Op);
6654 append_range(DeadOps, Op->operands());
6655 }
6656
6657 // Ignore type-promoting instructions we identified during reduction
6658 // detection.
6659 for (const auto &Reduction : Legal->getReductionVars()) {
6660 const RecurrenceDescriptor &RedDes = Reduction.second;
6661 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
6662 VecValuesToIgnore.insert_range(Casts);
6663 }
6664 // Ignore type-casting instructions we identified during induction
6665 // detection.
6666 for (const auto &Induction : Legal->getInductionVars()) {
6667 const InductionDescriptor &IndDes = Induction.second;
6668 VecValuesToIgnore.insert_range(IndDes.getCastInsts());
6669 }
6670}
6671
// Decide which reduction phis should be vectorized as in-loop reductions and
// record their operation chains (InLoopReductionImmediateChains) for cost
// modelling. Idempotent: returns early if already computed.
// NOTE(review): the defining signature line is elided in this extract;
// presumably LoopVectorizationCostModel::collectInLoopReductions().
6673 // Avoid duplicating work finding in-loop reductions.
6674 if (!InLoopReductions.empty())
6675 return;
6676
6677 for (const auto &Reduction : Legal->getReductionVars()) {
6678 PHINode *Phi = Reduction.first;
6679 const RecurrenceDescriptor &RdxDesc = Reduction.second;
6680
6681 // Multi-use reductions (e.g., used in FindLastIV patterns) are handled
6682 // separately and should not be considered for in-loop reductions.
6683 if (RdxDesc.hasUsesOutsideReductionChain())
6684 continue;
6685
6686 // We don't collect reductions that are type promoted (yet).
6687 if (RdxDesc.getRecurrenceType() != Phi->getType())
6688 continue;
6689
6690 // In-loop AnyOf and FindIV reductions are not yet supported.
6691 RecurKind Kind = RdxDesc.getRecurrenceKind();
// (The AnyOf/FindIV predicate lines are elided in this extract.)
6695 continue;
6696
6697 // If the target would prefer this reduction to happen "in-loop", then we
6698 // want to record it as such.
6699 if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) &&
6700 !TTI.preferInLoopReduction(Kind, Phi->getType()))
6701 continue;
6702
6703 // Check that we can correctly put the reductions into the loop, by
6704 // finding the chain of operations that leads from the phi to the loop
6705 // exit value.
6706 SmallVector<Instruction *, 4> ReductionOperations =
6707 RdxDesc.getReductionOpChain(Phi, TheLoop);
6708 bool InLoop = !ReductionOperations.empty();
6709
6710 if (InLoop) {
6711 InLoopReductions.insert(Phi);
6712 // Add the elements to InLoopReductionImmediateChains for cost modelling.
// Map each chain instruction to its predecessor in the chain, starting
// from the phi itself.
6713 Instruction *LastChain = Phi;
6714 for (auto *I : ReductionOperations) {
6715 InLoopReductionImmediateChains[I] = LastChain;
6716 LastChain = I;
6717 }
6718 }
6719 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
6720 << " reduction for phi: " << *Phi << "\n");
6721 }
6722}
6723
6724// This function will select a scalable VF if the target supports scalable
6725// vectors and a fixed one otherwise.
6726// TODO: we could return a pair of values that specify the max VF and
6727// min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
6728// `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment
6729// doesn't have a cost model that can choose which plan to execute if
6730// more than one is generated.
// NOTE(review): the function signature and the RegKind selection lines are
// elided in this extract; RegKind is presumably chosen from
// TTI.enableScalableVectorization() — confirm against the full source.
6733 unsigned WidestType;
6734 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
6735
6737 TTI.enableScalableVectorization()
6740
// VF = (bits in the widest applicable register) / (widest element type).
6741 TypeSize RegSize = TTI.getRegisterBitWidth(RegKind);
6742 unsigned N = RegSize.getKnownMinValue() / WidestType;
6743 return ElementCount::get(N, RegSize.isScalable());
6744}
6745
// Plan vectorization for the VPlan-native (outer-loop) path: pick a VF
// (user-provided or computed), build VPlans for it, and return the chosen
// factor; inner loops are rejected on this path.
// NOTE(review): the defining signature line and several interior lines
// (error returns, power-of-two assert condition) are elided in this extract.
6748 ElementCount VF = UserVF;
6749 // Outer loop handling: They may require CFG and instruction level
6750 // transformations before even evaluating whether vectorization is profitable.
6751 // Since we cannot modify the incoming IR, we need to build VPlan upfront in
6752 // the vectorization pipeline.
6753 if (!OrigLoop->isInnermost()) {
6754 // If the user doesn't provide a vectorization factor, determine a
6755 // reasonable one.
6756 if (UserVF.isZero()) {
6757 VF = determineVPlanVF(TTI, CM);
6758 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
6759
6760 // Make sure we have a VF > 1 for stress testing.
6761 if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
6762 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
6763 << "overriding computed VF.\n");
6764 VF = ElementCount::getFixed(4);
6765 }
6766 } else if (UserVF.isScalable() && !TTI.supportsScalableVectors() &&
6768 LLVM_DEBUG(dbgs() << "LV: Not vectorizing. Scalable VF requested, but "
6769 << "not supported by the target.\n");
// Emit a user-visible analysis remark explaining why scalable VF is refused.
6771 "Scalable vectorization requested but not supported by the target",
6772 "the scalable user-specified vectorization width for outer-loop "
6773 "vectorization cannot be used because the target does not support "
6774 "scalable vectors.",
6775 "ScalableVFUnfeasible", ORE, OrigLoop);
6777 }
6778 assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
6780 "VF needs to be a power of two");
6781 LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
6782 << "VF " << VF << " to build VPlans.\n");
6783 buildVPlans(VF, VF);
6784
6785 if (VPlans.empty())
6787
6788 // For VPlan build stress testing, we bail out after VPlan construction.
6791
6792 return {VF, 0 /*Cost*/, 0 /* ScalarCost */};
6793 }
6794
6795 LLVM_DEBUG(
6796 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
6797 "VPlan-native path.\n");
6799}
6800
// Top-level planning for inner loops: compute the maximum feasible VFs,
// handle a user-forced VF if given, collect per-VF cost-model state, and
// build VPlans for all fixed and scalable VF candidates.
// NOTE(review): several interior lines (e.g. the masked-interleave condition
// and some asserts) are elided in this extract; code reproduced as extracted.
6801void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
6802 assert(OrigLoop->isInnermost() && "Inner loop expected.");
6803 CM.collectValuesToIgnore();
6804 CM.collectElementTypesForWidening();
6805
6806 FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
6807 if (!MaxFactors) // Cases that should not to be vectorized nor interleaved.
6808 return;
6809
6810 // Invalidate interleave groups if all blocks of loop will be predicated.
6811 if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) &&
6813 LLVM_DEBUG(
6814 dbgs()
6815 << "LV: Invalidate all interleaved groups due to fold-tail by masking "
6816 "which requires masked-interleaved support.\n");
6817 if (CM.InterleaveInfo.invalidateGroups())
6818 // Invalidating interleave groups also requires invalidating all decisions
6819 // based on them, which includes widening decisions and uniform and scalar
6820 // values.
6821 CM.invalidateCostModelingDecisions();
6822 }
6823
6824 if (CM.foldTailByMasking())
6825 Legal->prepareToFoldTailByMasking();
6826
6827 ElementCount MaxUserVF =
6828 UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
6829 if (UserVF) {
// A user VF above the maximal safe VF is ignored with a remark; otherwise
// we try to honor it directly and plan only for that single factor.
6830 if (!ElementCount::isKnownLE(UserVF, MaxUserVF)) {
6832 "UserVF ignored because it may be larger than the maximal safe VF",
6833 "InvalidUserVF", ORE, OrigLoop);
6834 } else {
6836 "VF needs to be a power of two");
6837 // Collect the instructions (and their associated costs) that will be more
6838 // profitable to scalarize.
6839 CM.collectInLoopReductions();
6840 if (CM.selectUserVectorizationFactor(UserVF)) {
6841 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
6842 buildVPlansWithVPRecipes(UserVF, UserVF);
6844 return;
6845 }
6846 reportVectorizationInfo("UserVF ignored because of invalid costs.",
6847 "InvalidCost", ORE, OrigLoop);
6848 }
6849 }
6850
6851 // Collect the Vectorization Factor Candidates.
// Candidates are all powers of two up to the fixed and scalable maxima.
6852 SmallVector<ElementCount> VFCandidates;
6853 for (auto VF = ElementCount::getFixed(1);
6854 ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2)
6855 VFCandidates.push_back(VF);
6856 for (auto VF = ElementCount::getScalable(1);
6857 ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2)
6858 VFCandidates.push_back(VF);
6859
6860 CM.collectInLoopReductions();
6861 for (const auto &VF : VFCandidates) {
6862 // Collect Uniform and Scalar instructions after vectorization with VF.
6863 CM.collectNonVectorizedAndSetWideningDecisions(VF);
6864 }
6865
6866 buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF);
6867 buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF);
6868
6870}
6871
// Query the legacy cost model for UI at VF; an elided line presumably
// substitutes the -force-target-instruction-cost override when the computed
// cost is valid and the flag was given — TODO confirm against full source.
// NOTE(review): the defining signature line is elided in this extract.
6873 ElementCount VF) const {
6874 InstructionCost Cost = CM.getInstructionCost(UI, VF);
6875 if (Cost.isValid() && ForceTargetInstructionCost.getNumOccurrences())
6877 return Cost;
6878}
6879
// Thin forwarder to the legacy cost model's uniform-after-vectorization
// query for instruction I at VF.
// NOTE(review): the defining signature line is elided in this extract.
6881 ElementCount VF) const {
6882 return CM.isUniformAfterVectorization(I, VF);
6883}
6884
6885bool VPCostContext::skipCostComputation(Instruction *UI, bool IsVector) const {
6886 return CM.ValuesToIgnore.contains(UI) ||
6887 (IsVector && CM.VecValuesToIgnore.contains(UI)) ||
6888 SkipCostComputation.contains(UI);
6889}
6890
6892 return CM.getPredBlockCostDivisor(CostKind, BB);
6893}
6894
6896LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
6897 VPCostContext &CostCtx) const {
6899 // Cost modeling for inductions is inaccurate in the legacy cost model
6900 // compared to the recipes that are generated. To match here initially during
6901 // VPlan cost model bring up directly use the induction costs from the legacy
6902 // cost model. Note that we do this as pre-processing; the VPlan may not have
6903 // any recipes associated with the original induction increment instruction
6904 // and may replace truncates with VPWidenIntOrFpInductionRecipe. We precompute
6905 // the cost of induction phis and increments (both that are represented by
6906 // recipes and those that are not), to avoid distinguishing between them here,
6907 // and skip all recipes that represent induction phis and increments (the
6908 // former case) later on, if they exist, to avoid counting them twice.
6909 // Similarly we pre-compute the cost of any optimized truncates.
6910 // TODO: Switch to more accurate costing based on VPlan.
6911 for (const auto &[IV, IndDesc] : Legal->getInductionVars()) {
6913 IV->getIncomingValueForBlock(OrigLoop->getLoopLatch()));
6914 SmallVector<Instruction *> IVInsts = {IVInc};
6915 for (unsigned I = 0; I != IVInsts.size(); I++) {
6916 for (Value *Op : IVInsts[I]->operands()) {
6917 auto *OpI = dyn_cast<Instruction>(Op);
6918 if (Op == IV || !OpI || !OrigLoop->contains(OpI) || !Op->hasOneUse())
6919 continue;
6920 IVInsts.push_back(OpI);
6921 }
6922 }
6923 IVInsts.push_back(IV);
6924 for (User *U : IV->users()) {
6925 auto *CI = cast<Instruction>(U);
6926 if (!CostCtx.CM.isOptimizableIVTruncate(CI, VF))
6927 continue;
6928 IVInsts.push_back(CI);
6929 }
6930
6931 // If the vector loop gets executed exactly once with the given VF, ignore
6932 // the costs of comparison and induction instructions, as they'll get
6933 // simplified away.
6934 // TODO: Remove this code after stepping away from the legacy cost model and
6935 // adding code to simplify VPlans before calculating their costs.
6936 auto TC = getSmallConstantTripCount(PSE.getSE(), OrigLoop);
6937 if (TC == VF && !CM.foldTailByMasking())
6938 addFullyUnrolledInstructionsToIgnore(OrigLoop, Legal->getInductionVars(),
6939 CostCtx.SkipCostComputation);
6940
6941 for (Instruction *IVInst : IVInsts) {
6942 if (CostCtx.skipCostComputation(IVInst, VF.isVector()))
6943 continue;
6944 InstructionCost InductionCost = CostCtx.getLegacyCost(IVInst, VF);
6945 LLVM_DEBUG({
6946 dbgs() << "Cost of " << InductionCost << " for VF " << VF
6947 << ": induction instruction " << *IVInst << "\n";
6948 });
6949 Cost += InductionCost;
6950 CostCtx.SkipCostComputation.insert(IVInst);
6951 }
6952 }
6953
6954 /// Compute the cost of all exiting conditions of the loop using the legacy
6955 /// cost model. This is to match the legacy behavior, which adds the cost of
6956 /// all exit conditions. Note that this over-estimates the cost, as there will
6957 /// be a single condition to control the vector loop.
6959 CM.TheLoop->getExitingBlocks(Exiting);
6960 SetVector<Instruction *> ExitInstrs;
6961 // Collect all exit conditions.
6962 for (BasicBlock *EB : Exiting) {
6963 auto *Term = dyn_cast<BranchInst>(EB->getTerminator());
6964 if (!Term || CostCtx.skipCostComputation(Term, VF.isVector()))
6965 continue;
6966 if (auto *CondI = dyn_cast<Instruction>(Term->getOperand(0))) {
6967 ExitInstrs.insert(CondI);
6968 }
6969 }
6970 // Compute the cost of all instructions only feeding the exit conditions.
6971 for (unsigned I = 0; I != ExitInstrs.size(); ++I) {
6972 Instruction *CondI = ExitInstrs[I];
6973 if (!OrigLoop->contains(CondI) ||
6974 !CostCtx.SkipCostComputation.insert(CondI).second)
6975 continue;
6976 InstructionCost CondICost = CostCtx.getLegacyCost(CondI, VF);
6977 LLVM_DEBUG({
6978 dbgs() << "Cost of " << CondICost << " for VF " << VF
6979 << ": exit condition instruction " << *CondI << "\n";
6980 });
6981 Cost += CondICost;
6982 for (Value *Op : CondI->operands()) {
6983 auto *OpI = dyn_cast<Instruction>(Op);
6984 if (!OpI || CostCtx.skipCostComputation(OpI, VF.isVector()) ||
6985 any_of(OpI->users(), [&ExitInstrs](User *U) {
6986 return !ExitInstrs.contains(cast<Instruction>(U));
6987 }))
6988 continue;
6989 ExitInstrs.insert(OpI);
6990 }
6991 }
6992
6993 // Pre-compute the costs for branches except for the backedge, as the number
6994 // of replicate regions in a VPlan may not directly match the number of
6995 // branches, which would lead to different decisions.
6996 // TODO: Compute cost of branches for each replicate region in the VPlan,
6997 // which is more accurate than the legacy cost model.
6998 for (BasicBlock *BB : OrigLoop->blocks()) {
6999 if (CostCtx.skipCostComputation(BB->getTerminator(), VF.isVector()))
7000 continue;
7001 CostCtx.SkipCostComputation.insert(BB->getTerminator());
7002 if (BB == OrigLoop->getLoopLatch())
7003 continue;
7004 auto BranchCost = CostCtx.getLegacyCost(BB->getTerminator(), VF);
7005 Cost += BranchCost;
7006 }
7007
7008 // Don't apply special costs when instruction cost is forced to make sure the
7009 // forced cost is used for each recipe.
7010 if (ForceTargetInstructionCost.getNumOccurrences())
7011 return Cost;
7012
7013 // Pre-compute costs for instructions that are forced-scalar or profitable to
7014 // scalarize. Their costs will be computed separately in the legacy cost
7015 // model.
7016 for (Instruction *ForcedScalar : CM.ForcedScalars[VF]) {
7017 if (CostCtx.skipCostComputation(ForcedScalar, VF.isVector()))
7018 continue;
7019 CostCtx.SkipCostComputation.insert(ForcedScalar);
7020 InstructionCost ForcedCost = CostCtx.getLegacyCost(ForcedScalar, VF);
7021 LLVM_DEBUG({
7022 dbgs() << "Cost of " << ForcedCost << " for VF " << VF
7023 << ": forced scalar " << *ForcedScalar << "\n";
7024 });
7025 Cost += ForcedCost;
7026 }
7027 for (const auto &[Scalarized, ScalarCost] : CM.InstsToScalarize[VF]) {
7028 if (CostCtx.skipCostComputation(Scalarized, VF.isVector()))
7029 continue;
7030 CostCtx.SkipCostComputation.insert(Scalarized);
7031 LLVM_DEBUG({
7032 dbgs() << "Cost of " << ScalarCost << " for VF " << VF
7033 << ": profitable to scalarize " << *Scalarized << "\n";
7034 });
7035 Cost += ScalarCost;
7036 }
7037
7038 return Cost;
7039}
7040
7041InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
7042 ElementCount VF) const {
7043 VPCostContext CostCtx(CM.TTI, *CM.TLI, Plan, CM, CM.CostKind, PSE, OrigLoop);
7044 InstructionCost Cost = precomputeCosts(Plan, VF, CostCtx);
7045
7046 // Now compute and add the VPlan-based cost.
7047 Cost += Plan.cost(VF, CostCtx);
7048#ifndef NDEBUG
7049 unsigned EstimatedWidth = estimateElementCount(VF, CM.getVScaleForTuning());
7050 LLVM_DEBUG(dbgs() << "Cost for VF " << VF << ": " << Cost
7051 << " (Estimated cost per lane: ");
7052 if (Cost.isValid()) {
7053 double CostPerLane = double(Cost.getValue()) / EstimatedWidth;
7054 LLVM_DEBUG(dbgs() << format("%.1f", CostPerLane));
7055 } else /* No point dividing an invalid cost - it will still be invalid */
7056 LLVM_DEBUG(dbgs() << "Invalid");
7057 LLVM_DEBUG(dbgs() << ")\n");
7058#endif
7059 return Cost;
7060}
7061
7062#ifndef NDEBUG
7063/// Return true if the original loop \ TheLoop contains any instructions that do
7064/// not have corresponding recipes in \p Plan and are not marked to be ignored
7065/// in \p CostCtx. This means the VPlan contains simplification that the legacy
7066/// cost-model did not account for.
7068 VPCostContext &CostCtx,
7069 Loop *TheLoop,
7070 ElementCount VF) {
7071 // First collect all instructions for the recipes in Plan.
7072 auto GetInstructionForCost = [](const VPRecipeBase *R) -> Instruction * {
7073 if (auto *S = dyn_cast<VPSingleDefRecipe>(R))
7074 return dyn_cast_or_null<Instruction>(S->getUnderlyingValue());
7075 if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(R))
7076 return &WidenMem->getIngredient();
7077 return nullptr;
7078 };
7079
7080 // Check if a select for a safe divisor was hoisted to the pre-header. If so,
7081 // the select doesn't need to be considered for the vector loop cost; go with
7082 // the more accurate VPlan-based cost model.
7083 for (VPRecipeBase &R : *Plan.getVectorPreheader()) {
7084 auto *VPI = dyn_cast<VPInstruction>(&R);
7085 if (!VPI || VPI->getOpcode() != Instruction::Select)
7086 continue;
7087
7088 if (auto *WR = dyn_cast_or_null<VPWidenRecipe>(VPI->getSingleUser())) {
7089 switch (WR->getOpcode()) {
7090 case Instruction::UDiv:
7091 case Instruction::SDiv:
7092 case Instruction::URem:
7093 case Instruction::SRem:
7094 return true;
7095 default:
7096 break;
7097 }
7098 }
7099 }
7100
7101 DenseSet<Instruction *> SeenInstrs;
7102 auto Iter = vp_depth_first_deep(Plan.getVectorLoopRegion()->getEntry());
7104 for (VPRecipeBase &R : *VPBB) {
7105 if (auto *IR = dyn_cast<VPInterleaveRecipe>(&R)) {
7106 auto *IG = IR->getInterleaveGroup();
7107 unsigned NumMembers = IG->getNumMembers();
7108 for (unsigned I = 0; I != NumMembers; ++I) {
7109 if (Instruction *M = IG->getMember(I))
7110 SeenInstrs.insert(M);
7111 }
7112 continue;
7113 }
7114 // Unused FOR splices are removed by VPlan transforms, so the VPlan-based
7115 // cost model won't cost it whilst the legacy will.
7116 if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R)) {
7117 using namespace VPlanPatternMatch;
7118 if (none_of(FOR->users(),
7119 match_fn(m_VPInstruction<
7121 return true;
7122 }
7123 // The VPlan-based cost model is more accurate for partial reductions and
7124 // comparing against the legacy cost isn't desirable.
7125 if (auto *VPR = dyn_cast<VPReductionRecipe>(&R))
7126 if (VPR->isPartialReduction())
7127 return true;
7128
7129 // The VPlan-based cost model can analyze if recipes are scalar
7130 // recursively, but the legacy cost model cannot.
7131 if (auto *WidenMemR = dyn_cast<VPWidenMemoryRecipe>(&R)) {
7132 auto *AddrI = dyn_cast<Instruction>(
7133 getLoadStorePointerOperand(&WidenMemR->getIngredient()));
7134 if (AddrI && vputils::isSingleScalar(WidenMemR->getAddr()) !=
7135 CostCtx.isLegacyUniformAfterVectorization(AddrI, VF))
7136 return true;
7137
7138 if (WidenMemR->isReverse()) {
7139 // If the stored value of a reverse store is invariant, LICM will
7140 // hoist the reverse operation to the preheader. In this case, the
7141 // result of the VPlan-based cost model will diverge from that of
7142 // the legacy model.
7143 if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(WidenMemR))
7144 if (StoreR->getStoredValue()->isDefinedOutsideLoopRegions())
7145 return true;
7146
7147 if (auto *StoreR = dyn_cast<VPWidenStoreEVLRecipe>(WidenMemR))
7148 if (StoreR->getStoredValue()->isDefinedOutsideLoopRegions())
7149 return true;
7150 }
7151 }
7152
7153 // The legacy cost model costs non-header phis with a scalar VF as a phi,
7154 // but scalar unrolled VPlans will have VPBlendRecipes which emit selects.
7155 if (isa<VPBlendRecipe>(&R) &&
7156 vputils::onlyFirstLaneUsed(R.getVPSingleValue()))
7157 return true;
7158
7159 /// If a VPlan transform folded a recipe to one producing a single-scalar,
7160 /// but the original instruction wasn't uniform-after-vectorization in the
7161 /// legacy cost model, the legacy cost overestimates the actual cost.
7162 if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
7163 if (RepR->isSingleScalar() &&
7165 RepR->getUnderlyingInstr(), VF))
7166 return true;
7167 }
7168 if (Instruction *UI = GetInstructionForCost(&R)) {
7169 // If we adjusted the predicate of the recipe, the cost in the legacy
7170 // cost model may be different.
7171 using namespace VPlanPatternMatch;
7172 CmpPredicate Pred;
7173 if (match(&R, m_Cmp(Pred, m_VPValue(), m_VPValue())) &&
7174 cast<VPRecipeWithIRFlags>(R).getPredicate() !=
7175 cast<CmpInst>(UI)->getPredicate())
7176 return true;
7177
7178 // Recipes with underlying instructions being moved out of the loop
7179 // region by LICM may cause discrepancies between the legacy cost model
7180 // and the VPlan-based cost model.
7181 if (!VPBB->getEnclosingLoopRegion())
7182 return true;
7183
7184 SeenInstrs.insert(UI);
7185 }
7186 }
7187 }
7188
7189 // Return true if the loop contains any instructions that are not also part of
7190 // the VPlan or are skipped for VPlan-based cost computations. This indicates
7191 // that the VPlan contains extra simplifications.
7192 return any_of(TheLoop->blocks(), [&SeenInstrs, &CostCtx,
7193 TheLoop](BasicBlock *BB) {
7194 return any_of(*BB, [&SeenInstrs, &CostCtx, TheLoop, BB](Instruction &I) {
7195 // Skip induction phis when checking for simplifications, as they may not
7196 // be lowered directly be lowered to a corresponding PHI recipe.
7197 if (isa<PHINode>(&I) && BB == TheLoop->getHeader() &&
7198 CostCtx.CM.Legal->isInductionPhi(cast<PHINode>(&I)))
7199 return false;
7200 return !SeenInstrs.contains(&I) && !CostCtx.skipCostComputation(&I, true);
7201 });
7202 });
7203}
7204#endif
7205
7207 if (VPlans.empty())
7209 // If there is a single VPlan with a single VF, return it directly.
7210 VPlan &FirstPlan = *VPlans[0];
7211 if (VPlans.size() == 1 && size(FirstPlan.vectorFactors()) == 1)
7212 return {*FirstPlan.vectorFactors().begin(), 0, 0};
7213
7214 LLVM_DEBUG(dbgs() << "LV: Computing best VF using cost kind: "
7215 << (CM.CostKind == TTI::TCK_RecipThroughput
7216 ? "Reciprocal Throughput\n"
7217 : CM.CostKind == TTI::TCK_Latency
7218 ? "Instruction Latency\n"
7219 : CM.CostKind == TTI::TCK_CodeSize ? "Code Size\n"
7220 : CM.CostKind == TTI::TCK_SizeAndLatency
7221 ? "Code Size and Latency\n"
7222 : "Unknown\n"));
7223
7225 assert(hasPlanWithVF(ScalarVF) &&
7226 "More than a single plan/VF w/o any plan having scalar VF");
7227
7228 // TODO: Compute scalar cost using VPlan-based cost model.
7229 InstructionCost ScalarCost = CM.expectedCost(ScalarVF);
7230 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ScalarCost << ".\n");
7231 VectorizationFactor ScalarFactor(ScalarVF, ScalarCost, ScalarCost);
7232 VectorizationFactor BestFactor = ScalarFactor;
7233
7234 bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
7235 if (ForceVectorization) {
7236 // Ignore scalar width, because the user explicitly wants vectorization.
7237 // Initialize cost to max so that VF = 2 is, at least, chosen during cost
7238 // evaluation.
7239 BestFactor.Cost = InstructionCost::getMax();
7240 }
7241
7242 for (auto &P : VPlans) {
7243 ArrayRef<ElementCount> VFs(P->vectorFactors().begin(),
7244 P->vectorFactors().end());
7245
7247 if (any_of(VFs, [this](ElementCount VF) {
7248 return CM.shouldConsiderRegPressureForVF(VF);
7249 }))
7250 RUs = calculateRegisterUsageForPlan(*P, VFs, TTI, CM.ValuesToIgnore);
7251
7252 for (unsigned I = 0; I < VFs.size(); I++) {
7253 ElementCount VF = VFs[I];
7254 if (VF.isScalar())
7255 continue;
7256 if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) {
7257 LLVM_DEBUG(
7258 dbgs()
7259 << "LV: Not considering vector loop of width " << VF
7260 << " because it will not generate any vector instructions.\n");
7261 continue;
7262 }
7263 if (CM.OptForSize && !ForceVectorization && hasReplicatorRegion(*P)) {
7264 LLVM_DEBUG(
7265 dbgs()
7266 << "LV: Not considering vector loop of width " << VF
7267 << " because it would cause replicated blocks to be generated,"
7268 << " which isn't allowed when optimizing for size.\n");
7269 continue;
7270 }
7271
7272 InstructionCost Cost = cost(*P, VF);
7273 VectorizationFactor CurrentFactor(VF, Cost, ScalarCost);
7274
7275 if (CM.shouldConsiderRegPressureForVF(VF) &&
7276 RUs[I].exceedsMaxNumRegs(TTI, ForceTargetNumVectorRegs)) {
7277 LLVM_DEBUG(dbgs() << "LV(REG): Not considering vector loop of width "
7278 << VF << " because it uses too many registers\n");
7279 continue;
7280 }
7281
7282 if (isMoreProfitable(CurrentFactor, BestFactor, P->hasScalarTail()))
7283 BestFactor = CurrentFactor;
7284
7285 // If profitable add it to ProfitableVF list.
7286 if (isMoreProfitable(CurrentFactor, ScalarFactor, P->hasScalarTail()))
7287 ProfitableVFs.push_back(CurrentFactor);
7288 }
7289 }
7290
7291#ifndef NDEBUG
7292 // Select the optimal vectorization factor according to the legacy cost-model.
7293 // This is now only used to verify the decisions by the new VPlan-based
7294 // cost-model and will be retired once the VPlan-based cost-model is
7295 // stabilized.
7296 VectorizationFactor LegacyVF = selectVectorizationFactor();
7297 VPlan &BestPlan = getPlanFor(BestFactor.Width);
7298
7299 // Pre-compute the cost and use it to check if BestPlan contains any
7300 // simplifications not accounted for in the legacy cost model. If that's the
7301 // case, don't trigger the assertion, as the extra simplifications may cause a
7302 // different VF to be picked by the VPlan-based cost model.
7303 VPCostContext CostCtx(CM.TTI, *CM.TLI, BestPlan, CM, CM.CostKind, CM.PSE,
7304 OrigLoop);
7305 precomputeCosts(BestPlan, BestFactor.Width, CostCtx);
7306 // Verify that the VPlan-based and legacy cost models agree, except for
7307 // * VPlans with early exits,
7308 // * VPlans with additional VPlan simplifications,
7309 // * EVL-based VPlans with gather/scatters (the VPlan-based cost model uses
7310 // vp_scatter/vp_gather).
7311 // The legacy cost model doesn't properly model costs for such loops.
7312 bool UsesEVLGatherScatter =
7314 BestPlan.getVectorLoopRegion()->getEntry())),
7315 [](VPBasicBlock *VPBB) {
7316 return any_of(*VPBB, [](VPRecipeBase &R) {
7317 return isa<VPWidenLoadEVLRecipe, VPWidenStoreEVLRecipe>(&R) &&
7318 !cast<VPWidenMemoryRecipe>(&R)->isConsecutive();
7319 });
7320 });
7321 assert(
7322 (BestFactor.Width == LegacyVF.Width || BestPlan.hasEarlyExit() ||
7323 !Legal->getLAI()->getSymbolicStrides().empty() || UsesEVLGatherScatter ||
7325 getPlanFor(BestFactor.Width), CostCtx, OrigLoop, BestFactor.Width) ||
7327 getPlanFor(LegacyVF.Width), CostCtx, OrigLoop, LegacyVF.Width)) &&
7328 " VPlan cost model and legacy cost model disagreed");
7329 assert((BestFactor.Width.isScalar() || BestFactor.ScalarCost > 0) &&
7330 "when vectorizing, the scalar cost must be computed.");
7331#endif
7332
7333 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << BestFactor.Width << ".\n");
7334 return BestFactor;
7335}
7336
7337// If \p EpiResumePhiR is resume VPPhi for a reduction when vectorizing the
7338// epilog loop, fix the reduction's scalar PHI node by adding the incoming value
7339// from the main vector loop.
7341 VPPhi *EpiResumePhiR, PHINode &EpiResumePhi, BasicBlock *BypassBlock) {
7342 using namespace VPlanPatternMatch;
7343 // Get the VPInstruction computing the reduction result in the middle block.
7344 // The first operand may not be from the middle block if it is not connected
7345 // to the scalar preheader. In that case, there's nothing to fix.
7346 VPValue *Incoming = EpiResumePhiR->getOperand(0);
7349 auto *EpiRedResult = dyn_cast<VPInstruction>(Incoming);
7350 if (!EpiRedResult)
7351 return;
7352
7353 VPValue *BackedgeVal;
7354 bool IsFindIV = false;
7355 if (EpiRedResult->getOpcode() == VPInstruction::ComputeAnyOfResult ||
7356 EpiRedResult->getOpcode() == VPInstruction::ComputeReductionResult)
7357 BackedgeVal = EpiRedResult->getOperand(EpiRedResult->getNumOperands() - 1);
7358 else if (matchFindIVResult(EpiRedResult, m_VPValue(BackedgeVal), m_VPValue()))
7359 IsFindIV = true;
7360 else
7361 return;
7362
7363 auto *EpiRedHeaderPhi = cast_if_present<VPReductionPHIRecipe>(
7365 if (!EpiRedHeaderPhi) {
7366 match(BackedgeVal,
7368 VPlanPatternMatch::m_VPValue(BackedgeVal),
7370 EpiRedHeaderPhi = cast<VPReductionPHIRecipe>(
7372 }
7373
7374 Value *MainResumeValue;
7375 if (auto *VPI = dyn_cast<VPInstruction>(EpiRedHeaderPhi->getStartValue())) {
7376 assert((VPI->getOpcode() == VPInstruction::Broadcast ||
7377 VPI->getOpcode() == VPInstruction::ReductionStartVector) &&
7378 "unexpected start recipe");
7379 MainResumeValue = VPI->getOperand(0)->getUnderlyingValue();
7380 } else
7381 MainResumeValue = EpiRedHeaderPhi->getStartValue()->getUnderlyingValue();
7382 if (EpiRedResult->getOpcode() == VPInstruction::ComputeAnyOfResult) {
7383 [[maybe_unused]] Value *StartV =
7384 EpiRedResult->getOperand(0)->getLiveInIRValue();
7385 auto *Cmp = cast<ICmpInst>(MainResumeValue);
7386 assert(Cmp->getPredicate() == CmpInst::ICMP_NE &&
7387 "AnyOf expected to start with ICMP_NE");
7388 assert(Cmp->getOperand(1) == StartV &&
7389 "AnyOf expected to start by comparing main resume value to original "
7390 "start value");
7391 MainResumeValue = Cmp->getOperand(0);
7392 } else if (IsFindIV) {
7393 MainResumeValue = cast<SelectInst>(MainResumeValue)->getFalseValue();
7394 }
7395 PHINode *MainResumePhi = cast<PHINode>(MainResumeValue);
7396
7397 // When fixing reductions in the epilogue loop we should already have
7398 // created a bc.merge.rdx Phi after the main vector body. Ensure that we carry
7399 // over the incoming values correctly.
7400 EpiResumePhi.setIncomingValueForBlock(
7401 BypassBlock, MainResumePhi->getIncomingValueForBlock(BypassBlock));
7402}
7403
7405 ElementCount BestVF, unsigned BestUF, VPlan &BestVPlan,
7406 InnerLoopVectorizer &ILV, DominatorTree *DT, bool VectorizingEpilogue) {
7407 assert(BestVPlan.hasVF(BestVF) &&
7408 "Trying to execute plan with unsupported VF");
7409 assert(BestVPlan.hasUF(BestUF) &&
7410 "Trying to execute plan with unsupported UF");
7411 if (BestVPlan.hasEarlyExit())
7412 ++LoopsEarlyExitVectorized;
7413 // TODO: Move to VPlan transform stage once the transition to the VPlan-based
7414 // cost model is complete for better cost estimates.
7415 RUN_VPLAN_PASS(VPlanTransforms::unrollByUF, BestVPlan, BestUF);
7419 bool HasBranchWeights =
7420 hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator());
7421 if (HasBranchWeights) {
7422 std::optional<unsigned> VScale = CM.getVScaleForTuning();
7424 BestVPlan, BestVF, VScale);
7425 }
7426
7427 // Checks are the same for all VPlans, added to BestVPlan only for
7428 // compactness.
7429 attachRuntimeChecks(BestVPlan, ILV.RTChecks, HasBranchWeights);
7430
7431 // Retrieving VectorPH now when it's easier while VPlan still has Regions.
7432 VPBasicBlock *VectorPH = cast<VPBasicBlock>(BestVPlan.getVectorPreheader());
7433
7434 VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
7437 if (BestVPlan.getEntry()->getSingleSuccessor() ==
7438 BestVPlan.getScalarPreheader()) {
7439 // TODO: The vector loop would be dead, should not even try to vectorize.
7440 ORE->emit([&]() {
7441 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationDead",
7442 OrigLoop->getStartLoc(),
7443 OrigLoop->getHeader())
7444 << "Created vector loop never executes due to insufficient trip "
7445 "count.";
7446 });
7448 }
7449
7451
7453 // Convert the exit condition to AVLNext == 0 for EVL tail folded loops.
7455 // Regions are dissolved after optimizing for VF and UF, which completely
7456 // removes unneeded loop regions first.
7458 // Expand BranchOnTwoConds after dissolution, when latch has direct access to
7459 // its successors.
7461 // Convert loops with variable-length stepping after regions are dissolved.
7465 BestVPlan, VectorPH, CM.foldTailByMasking(),
7466 CM.requiresScalarEpilogue(BestVF.isVector()));
7467 VPlanTransforms::materializeFactors(BestVPlan, VectorPH, BestVF);
7468 VPlanTransforms::cse(BestVPlan);
7470
7471 // 0. Generate SCEV-dependent code in the entry, including TripCount, before
7472 // making any changes to the CFG.
7473 DenseMap<const SCEV *, Value *> ExpandedSCEVs =
7474 VPlanTransforms::expandSCEVs(BestVPlan, *PSE.getSE());
7475 if (!ILV.getTripCount()) {
7476 ILV.setTripCount(BestVPlan.getTripCount()->getLiveInIRValue());
7477 } else {
7478 assert(VectorizingEpilogue && "should only re-use the existing trip "
7479 "count during epilogue vectorization");
7480 }
7481
7482 // Perform the actual loop transformation.
7483 VPTransformState State(&TTI, BestVF, LI, DT, ILV.AC, ILV.Builder, &BestVPlan,
7484 OrigLoop->getParentLoop(),
7485 Legal->getWidestInductionType());
7486
7487#ifdef EXPENSIVE_CHECKS
7488 assert(DT->verify(DominatorTree::VerificationLevel::Fast));
7489#endif
7490
7491 // 1. Set up the skeleton for vectorization, including vector pre-header and
7492 // middle block. The vector loop is created during VPlan execution.
7493 State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
7495 State.CFG.PrevBB->getSingleSuccessor(), &BestVPlan);
7497
7498 assert(verifyVPlanIsValid(BestVPlan) && "final VPlan is invalid");
7499
7500 // After vectorization, the exit blocks of the original loop will have
7501 // additional predecessors. Invalidate SCEVs for the exit phis in case SE
7502 // looked through single-entry phis.
7503 ScalarEvolution &SE = *PSE.getSE();
7504 for (VPIRBasicBlock *Exit : BestVPlan.getExitBlocks()) {
7505 if (!Exit->hasPredecessors())
7506 continue;
7507 for (VPRecipeBase &PhiR : Exit->phis())
7509 &cast<VPIRPhi>(PhiR).getIRPhi());
7510 }
7511 // Forget the original loop and block dispositions.
7512 SE.forgetLoop(OrigLoop);
7514
7516
7517 //===------------------------------------------------===//
7518 //
7519 // Notice: any optimization or new instruction that go
7520 // into the code below should also be implemented in
7521 // the cost-model.
7522 //
7523 //===------------------------------------------------===//
7524
7525 // Retrieve loop information before executing the plan, which may remove the
7526 // original loop, if it becomes unreachable.
7527 MDNode *LID = OrigLoop->getLoopID();
7528 unsigned OrigLoopInvocationWeight = 0;
7529 std::optional<unsigned> OrigAverageTripCount =
7530 getLoopEstimatedTripCount(OrigLoop, &OrigLoopInvocationWeight);
7531
7532 BestVPlan.execute(&State);
7533
7534 // 2.6. Maintain Loop Hints
7535 // Keep all loop hints from the original loop on the vector loop (we'll
7536 // replace the vectorizer-specific hints below).
7537 VPBasicBlock *HeaderVPBB = vputils::getFirstLoopHeader(BestVPlan, State.VPDT);
7538 // Add metadata to disable runtime unrolling a scalar loop when there
7539 // are no runtime checks about strides and memory. A scalar loop that is
7540 // rarely used is not worth unrolling.
7541 bool DisableRuntimeUnroll = !ILV.RTChecks.hasChecks() && !BestVF.isScalar();
7543 HeaderVPBB ? LI->getLoopFor(State.CFG.VPBB2IRBB.lookup(HeaderVPBB))
7544 : nullptr,
7545 HeaderVPBB, BestVPlan, VectorizingEpilogue, LID, OrigAverageTripCount,
7546 OrigLoopInvocationWeight,
7547 estimateElementCount(BestVF * BestUF, CM.getVScaleForTuning()),
7548 DisableRuntimeUnroll);
7549
7550 // 3. Fix the vectorized code: take care of header phi's, live-outs,
7551 // predication, updating analyses.
7552 ILV.fixVectorizedLoop(State);
7553
7555
7556 return ExpandedSCEVs;
7557}
7558
7559//===--------------------------------------------------------------------===//
7560// EpilogueVectorizerMainLoop
7561//===--------------------------------------------------------------------===//
7562
7563/// This function is partially responsible for generating the control flow
7564/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7566 BasicBlock *ScalarPH = createScalarPreheader("");
7567 BasicBlock *VectorPH = ScalarPH->getSinglePredecessor();
7568
7569 // Generate the code to check the minimum iteration count of the vector
7570 // epilogue (see below).
7571 EPI.EpilogueIterationCountCheck =
7572 emitIterationCountCheck(VectorPH, ScalarPH, true);
7573 EPI.EpilogueIterationCountCheck->setName("iter.check");
7574
7575 VectorPH = cast<BranchInst>(EPI.EpilogueIterationCountCheck->getTerminator())
7576 ->getSuccessor(1);
7577 // Generate the iteration count check for the main loop, *after* the check
7578 // for the epilogue loop, so that the path-length is shorter for the case
7579 // that goes directly through the vector epilogue. The longer-path length for
7580 // the main loop is compensated for, by the gain from vectorizing the larger
7581 // trip count. Note: the branch will get updated later on when we vectorize
7582 // the epilogue.
7583 EPI.MainLoopIterationCountCheck =
7584 emitIterationCountCheck(VectorPH, ScalarPH, false);
7585
7586 return cast<BranchInst>(EPI.MainLoopIterationCountCheck->getTerminator())
7587 ->getSuccessor(1);
7588}
7589
7591 LLVM_DEBUG({
7592 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
7593 << "Main Loop VF:" << EPI.MainLoopVF
7594 << ", Main Loop UF:" << EPI.MainLoopUF
7595 << ", Epilogue Loop VF:" << EPI.EpilogueVF
7596 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7597 });
7598}
7599
7602 dbgs() << "intermediate fn:\n"
7603 << *OrigLoop->getHeader()->getParent() << "\n";
7604 });
7605}
7606
7608 BasicBlock *VectorPH, BasicBlock *Bypass, bool ForEpilogue) {
7609 assert(Bypass && "Expected valid bypass basic block.");
7612 Value *CheckMinIters = createIterationCountCheck(
7613 VectorPH, ForEpilogue ? EPI.EpilogueVF : EPI.MainLoopVF,
7614 ForEpilogue ? EPI.EpilogueUF : EPI.MainLoopUF);
7615
7616 BasicBlock *const TCCheckBlock = VectorPH;
7617 if (!ForEpilogue)
7618 TCCheckBlock->setName("vector.main.loop.iter.check");
7619
7620 // Create new preheader for vector loop.
7621 VectorPH = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
7622 static_cast<DominatorTree *>(nullptr), LI, nullptr,
7623 "vector.ph");
7624 if (ForEpilogue) {
7625 // Save the trip count so we don't have to regenerate it in the
7626 // vec.epilog.iter.check. This is safe to do because the trip count
7627 // generated here dominates the vector epilog iter check.
7628 EPI.TripCount = Count;
7629 } else {
7631 }
7632
7633 BranchInst &BI = *BranchInst::Create(Bypass, VectorPH, CheckMinIters);
7634 if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator()))
7635 setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false);
7636 ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
7637
7638 // When vectorizing the main loop, its trip-count check is placed in a new
7639 // block, whereas the overall trip-count check is placed in the VPlan entry
7640 // block. When vectorizing the epilogue loop, its trip-count check is placed
7641 // in the VPlan entry block.
7642 if (!ForEpilogue)
7643 introduceCheckBlockInVPlan(TCCheckBlock);
7644 return TCCheckBlock;
7645}
7646
7647//===--------------------------------------------------------------------===//
7648// EpilogueVectorizerEpilogueLoop
7649//===--------------------------------------------------------------------===//
7650
7651/// This function creates a new scalar preheader, using the previous one as
7652/// entry block to the epilogue VPlan. The minimum iteration check is being
7653/// represented in VPlan.
7655 BasicBlock *NewScalarPH = createScalarPreheader("vec.epilog.");
7656 BasicBlock *OriginalScalarPH = NewScalarPH->getSinglePredecessor();
7657 OriginalScalarPH->setName("vec.epilog.iter.check");
7658 VPIRBasicBlock *NewEntry = Plan.createVPIRBasicBlock(OriginalScalarPH);
7659 VPBasicBlock *OldEntry = Plan.getEntry();
7660 for (auto &R : make_early_inc_range(*OldEntry)) {
7661 // Skip moving VPIRInstructions (including VPIRPhis), which are unmovable by
7662 // defining.
7663 if (isa<VPIRInstruction>(&R))
7664 continue;
7665 R.moveBefore(*NewEntry, NewEntry->end());
7666 }
7667
7668 VPBlockUtils::reassociateBlocks(OldEntry, NewEntry);
7669 Plan.setEntry(NewEntry);
7670 // OldEntry is now dead and will be cleaned up when the plan gets destroyed.
7671
7672 return OriginalScalarPH;
7673}
7674
7676 LLVM_DEBUG({
7677 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
7678 << "Epilogue Loop VF:" << EPI.EpilogueVF
7679 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7680 });
7681}
7682
7685 dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n";
7686 });
7687}
7688
/// Try to widen the load/store \p VPI into a VPWidenLoadRecipe or
/// VPWidenStoreRecipe for the clamped \p Range, or return nullptr if the
/// access will be handled otherwise (e.g. scalarized).
// NOTE(review): several lines are elided in this listing (gaps in the embedded
// numbering, e.g. 7694, 7697-7701), so parts of the widening-decision logic
// (including the definitions of I, Reverse and GEP) are not visible here.
7689VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(VPInstruction *VPI,
7690 VFRange &Range) {
7691 assert((VPI->getOpcode() == Instruction::Load ||
7692 VPI->getOpcode() == Instruction::Store) &&
7693 "Must be called with either a load or store");
7695
// Per-VF predicate: widen unless the cost model wants this access scalar.
7696 auto WillWiden = [&](ElementCount VF) -> bool {
7698 CM.getWideningDecision(I, VF);
7700 "CM decision should be taken at this point.");
7702 return true;
7703 if (CM.isScalarAfterVectorization(I, VF) ||
7704 CM.isProfitableToScalarize(I, VF))
7705 return false;
7707 };
7708
7710 return nullptr;
7711
7712 // If a mask is not required, drop it - use unmasked version for safe loads.
7713 // TODO: Determine if mask is needed in VPlan.
7714 VPValue *Mask = Legal->isMaskRequired(I) ? VPI->getMask() : nullptr;
7715
7716 // Determine if the pointer operand of the access is either consecutive or
7717 // reverse consecutive.
7719 CM.getWideningDecision(I, Range.Start);
7721 bool Consecutive =
7723
// Loads take the pointer as operand 0; stores as operand 1.
7724 VPValue *Ptr = VPI->getOpcode() == Instruction::Load ? VPI->getOperand(0)
7725 : VPI->getOperand(1);
7726 if (Consecutive) {
7729 VPSingleDefRecipe *VectorPtr;
7730 if (Reverse) {
7731 // When folding the tail, we may compute an address that we don't in the
7732 // original scalar loop: drop the GEP no-wrap flags in this case.
7733 // Otherwise preserve existing flags without no-unsigned-wrap, as we will
7734 // emit negative indices.
7735 GEPNoWrapFlags Flags =
7736 CM.foldTailByMasking() || !GEP
7738 : GEP->getNoWrapFlags().withoutNoUnsignedWrap();
7739 VectorPtr = new VPVectorEndPointerRecipe(
7740 Ptr, &Plan.getVF(), getLoadStoreType(I),
7741 /*Stride*/ -1, Flags, VPI->getDebugLoc());
7742 } else {
7743 VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I),
7744 GEP ? GEP->getNoWrapFlags()
7746 VPI->getDebugLoc());
7747 }
7748 Builder.insert(VectorPtr);
7749 Ptr = VectorPtr;
7750 }
7751
7752 if (VPI->getOpcode() == Instruction::Load) {
7753 auto *Load = cast<LoadInst>(I);
7754 auto *LoadR = new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse,
7755 *VPI, Load->getDebugLoc());
7756 if (Reverse) {
// A reversed load is emitted as a forward wide load followed by an
// explicit VPInstruction::Reverse of the loaded vector.
7757 Builder.insert(LoadR);
7758 return new VPInstruction(VPInstruction::Reverse, LoadR, {}, {},
7759 LoadR->getDebugLoc());
7760 }
7761 return LoadR;
7762 }
7763
7764 StoreInst *Store = cast<StoreInst>(I);
7765 VPValue *StoredVal = VPI->getOperand(0);
// For a reversed store, reverse the stored value before the forward store.
7766 if (Reverse)
7767 StoredVal = Builder.createNaryOp(VPInstruction::Reverse, StoredVal,
7768 Store->getDebugLoc());
7769 return new VPWidenStoreRecipe(*Store, Ptr, StoredVal, Mask, Consecutive,
7770 Reverse, *VPI, Store->getDebugLoc());
7771}
7772
/// Try to replace a truncate of a widened integer induction by a widened
/// induction producing the truncated type directly, when the cost model deems
/// this legal for all VFs in \p Range; returns nullptr otherwise.
// NOTE(review): the return-type line (7773) and a few interior lines (7791,
// 7795, 7806) are elided in this listing.
7774VPRecipeBuilder::tryToOptimizeInductionTruncate(VPInstruction *VPI,
7775 VFRange &Range) {
7776 auto *I = cast<TruncInst>(VPI->getUnderlyingInstr());
7777 // Optimize the special case where the source is a constant integer
7778 // induction variable. Notice that we can only optimize the 'trunc' case
7779 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
7780 // (c) other casts depend on pointer size.
7781
7782 // Determine whether \p K is a truncation based on an induction variable that
7783 // can be optimized.
7784 auto IsOptimizableIVTruncate =
7785 [&](Instruction *K) -> std::function<bool(ElementCount)> {
7786 return [=](ElementCount VF) -> bool {
7787 return CM.isOptimizableIVTruncate(K, VF);
7788 };
7789 };
7790
7792 IsOptimizableIVTruncate(I), Range))
7793 return nullptr;
7794
// Pull phi, start value and induction descriptor off the widened induction
// that defines the truncate's source operand.
7796 VPI->getOperand(0)->getDefiningRecipe());
7797 PHINode *Phi = WidenIV->getPHINode();
7798 VPIRValue *Start = WidenIV->getStartValue();
7799 const InductionDescriptor &IndDesc = WidenIV->getInductionDescriptor();
7800
7801 // It is always safe to copy over the NoWrap and FastMath flags. In
7802 // particular, when folding tail by masking, the masked-off lanes are never
7803 // used, so it is safe.
7804 VPIRFlags Flags = vputils::getFlagsFromIndDesc(IndDesc);
7805 VPValue *Step =
7807 return new VPWidenIntOrFpInductionRecipe(
7808 Phi, Start, Step, &Plan.getVF(), IndDesc, I, Flags, VPI->getDebugLoc());
7809}
7810
/// Try to widen the call \p VPI as either a vector intrinsic call or a call
/// to a vectorized library variant; returns nullptr if the call must be
/// scalarized/predicated or is one of the intrinsics that need no widening.
// NOTE(review): some lines are elided in this listing (e.g. the IsPredicated,
// ID and Ops initializations at 7814, 7823, 7830), per gaps in the embedded
// numbering.
7811VPSingleDefRecipe *VPRecipeBuilder::tryToWidenCall(VPInstruction *VPI,
7812 VFRange &Range) {
7813 CallInst *CI = cast<CallInst>(VPI->getUnderlyingInstr());
7815 [this, CI](ElementCount VF) {
7816 return CM.isScalarWithPredication(CI, VF);
7817 },
7818 Range);
7819
7820 if (IsPredicated)
7821 return nullptr;
7822
// These intrinsics have no vector form and no widening is needed for them.
7824 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
7825 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
7826 ID == Intrinsic::pseudoprobe ||
7827 ID == Intrinsic::experimental_noalias_scope_decl))
7828 return nullptr;
7829
7831 VPI->op_begin() + CI->arg_size());
7832
7833 // Is it beneficial to perform intrinsic call compared to lib call?
7834 bool ShouldUseVectorIntrinsic =
7836 [&](ElementCount VF) -> bool {
7837 return CM.getCallWideningDecision(CI, VF).Kind ==
7839 },
7840 Range);
7841 if (ShouldUseVectorIntrinsic)
7842 return new VPWidenIntrinsicRecipe(*CI, ID, Ops, CI->getType(), *VPI, *VPI,
7843 VPI->getDebugLoc());
7844
7845 Function *Variant = nullptr;
7846 std::optional<unsigned> MaskPos;
7847 // Is it better to call a vectorized version of the function than to
7848 // scalarize the call?
7849 auto ShouldUseVectorCall = LoopVectorizationPlanner::getDecisionAndClampRange(
7850 [&](ElementCount VF) -> bool {
7851 // The following case may be scalarized depending on the VF.
7852 // The flag shows whether we can use a usual Call for vectorized
7853 // version of the instruction.
7854
7855 // If we've found a variant at a previous VF, then stop looking. A
7856 // vectorized variant of a function expects input in a certain shape
7857 // -- basically the number of input registers, the number of lanes
7858 // per register, and whether there's a mask required.
7859 // We store a pointer to the variant in the VPWidenCallRecipe, so
7860 // once we have an appropriate variant it's only valid for that VF.
7861 // This will force a different vplan to be generated for each VF that
7862 // finds a valid variant.
7863 if (Variant)
7864 return false;
7865 LoopVectorizationCostModel::CallWideningDecision Decision =
7866 CM.getCallWideningDecision(CI, VF);
7868 Variant = Decision.Variant;
7869 MaskPos = Decision.MaskPos;
7870 return true;
7871 }
7872
7873 return false;
7874 },
7875 Range);
7876 if (ShouldUseVectorCall) {
7877 if (MaskPos.has_value()) {
7878 // We have 2 cases that would require a mask:
7879 // 1) The call needs to be predicated, either due to a conditional
7880 // in the scalar loop or use of an active lane mask with
7881 // tail-folding, and we use the appropriate mask for the block.
7882 // 2) No mask is required for the call instruction, but the only
7883 // available vector variant at this VF requires a mask, so we
7884 // synthesize an all-true mask.
7885 VPValue *Mask = VPI->isMasked() ? VPI->getMask() : Plan.getTrue();
7886
7887 Ops.insert(Ops.begin() + *MaskPos, Mask);
7888 }
7889
// Append the called-function operand (last non-mask operand of the call).
7890 Ops.push_back(VPI->getOperand(VPI->getNumOperandsWithoutMask() - 1));
7891 return new VPWidenCallRecipe(CI, Variant, Ops, *VPI, *VPI,
7892 VPI->getDebugLoc());
7893 }
7894
7895 return nullptr;
7896}
7897
/// Return true if \p I should be widened (rather than scalarized) for all VFs
/// in the clamped \p Range.
// NOTE(review): the assert's first line (7899) and the final return
// expression's first line (7908) are elided in this listing.
7898bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
7900 !isa<StoreInst>(I) && "Instruction should have been handled earlier");
7901 // Instruction should be widened, unless it is scalar after vectorization,
7902 // scalarization is profitable or it is predicated.
7903 auto WillScalarize = [this, I](ElementCount VF) -> bool {
7904 return CM.isScalarAfterVectorization(I, VF) ||
7905 CM.isProfitableToScalarize(I, VF) ||
7906 CM.isScalarWithPredication(I, VF);
7907 };
7909 Range);
7910}
7911
/// Build a VPWidenRecipe for \p VPI when its opcode has a generic widened
/// form; returns nullptr for opcodes handled elsewhere. Predicated div/rem
/// get a select-based safe divisor first.
// NOTE(review): the Ops/NewOps initializations (lines 7924, 7956) are elided
// in this listing.
7912VPWidenRecipe *VPRecipeBuilder::tryToWiden(VPInstruction *VPI) {
7913 auto *I = VPI->getUnderlyingInstr();
7914 switch (VPI->getOpcode()) {
7915 default:
7916 return nullptr;
7917 case Instruction::SDiv:
7918 case Instruction::UDiv:
7919 case Instruction::SRem:
7920 case Instruction::URem: {
7921 // If not provably safe, use a select to form a safe divisor before widening the
7922 // div/rem operation itself. Otherwise fall through to general handling below.
7923 if (CM.isPredicatedInst(I)) {
// Replace the divisor with 1 on masked-off lanes to avoid UB (div by 0).
7925 VPValue *Mask = VPI->getMask();
7926 VPValue *One = Plan.getConstantInt(I->getType(), 1u);
7927 auto *SafeRHS =
7928 Builder.createSelect(Mask, Ops[1], One, VPI->getDebugLoc());
7929 Ops[1] = SafeRHS;
7930 return new VPWidenRecipe(*I, Ops, *VPI, *VPI, VPI->getDebugLoc());
7931 }
7932 [[fallthrough]];
7933 }
7934 case Instruction::Add:
7935 case Instruction::And:
7936 case Instruction::AShr:
7937 case Instruction::FAdd:
7938 case Instruction::FCmp:
7939 case Instruction::FDiv:
7940 case Instruction::FMul:
7941 case Instruction::FNeg:
7942 case Instruction::FRem:
7943 case Instruction::FSub:
7944 case Instruction::ICmp:
7945 case Instruction::LShr:
7946 case Instruction::Mul:
7947 case Instruction::Or:
7948 case Instruction::Select:
7949 case Instruction::Shl:
7950 case Instruction::Sub:
7951 case Instruction::Xor:
7952 case Instruction::Freeze:
7953 return new VPWidenRecipe(*I, VPI->operandsWithoutMask(), *VPI, *VPI,
7954 VPI->getDebugLoc());
7955 case Instruction::ExtractValue: {
// The single extractvalue index is materialized as a constant operand.
7957 auto *EVI = cast<ExtractValueInst>(I);
7958 assert(EVI->getNumIndices() == 1 && "Expected one extractvalue index");
7959 unsigned Idx = EVI->getIndices()[0];
7960 NewOps.push_back(Plan.getConstantInt(32, Idx));
7961 return new VPWidenRecipe(*I, NewOps, *VPI, *VPI, VPI->getDebugLoc());
7962 }
7963 };
7964}
7965
/// Build a VPHistogramRecipe for the histogram described by \p HI, using the
/// bucket address and increment, plus a mask when the store is predicated.
// NOTE(review): the HGramOps initialization (line 7973) is elided in this
// listing.
7966VPHistogramRecipe *VPRecipeBuilder::tryToWidenHistogram(const HistogramInfo *HI,
7967 VPInstruction *VPI) {
7968 // FIXME: Support other operations.
7969 unsigned Opcode = HI->Update->getOpcode();
7970 assert((Opcode == Instruction::Add || Opcode == Instruction::Sub) &&
7971 "Histogram update operation must be an Add or Sub");
7972
7974 // Bucket address.
7975 HGramOps.push_back(VPI->getOperand(1));
7976 // Increment value.
7977 HGramOps.push_back(getVPValueOrAddLiveIn(HI->Update->getOperand(1)));
7978
7979 // In case of predicated execution (due to tail-folding, or conditional
7980 // execution, or both), pass the relevant mask.
7981 if (Legal->isMaskRequired(HI->Store))
7982 HGramOps.push_back(VPI->getMask());
7983
7984 return new VPHistogramRecipe(Opcode, HGramOps, VPI->getDebugLoc());
7985}
7986
// NOTE(review): the signature's first line (7987) is elided in this listing;
// per the call site below (RecipeBuilder.handleReplication) this is
// VPRecipeBuilder::handleReplication, which builds a VPReplicateRecipe for an
// instruction that will be replicated per lane, possibly predicated.
7988 VFRange &Range) {
7989 auto *I = VPI->getUnderlyingInstr();
7991 [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
7992 Range);
7993
7994 bool IsPredicated = CM.isPredicatedInst(I);
7995
7996 // Even if the instruction is not marked as uniform, there are certain
7997 // intrinsic calls that can be effectively treated as such, so we check for
7998 // them here. Conservatively, we only do this for scalable vectors, since
7999 // for fixed-width VFs we can always fall back on full scalarization.
8000 if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) {
8001 switch (cast<IntrinsicInst>(I)->getIntrinsicID()) {
8002 case Intrinsic::assume:
8003 case Intrinsic::lifetime_start:
8004 case Intrinsic::lifetime_end:
8005 // For scalable vectors if one of the operands is variant then we still
8006 // want to mark as uniform, which will generate one instruction for just
8007 // the first lane of the vector. We can't scalarize the call in the same
8008 // way as for fixed-width vectors because we don't know how many lanes
8009 // there are.
8010 //
8011 // The reasons for doing it this way for scalable vectors are:
8012 // 1. For the assume intrinsic generating the instruction for the first
8013 // lane is still better than not generating any at all. For
8014 // example, the input may be a splat across all lanes.
8015 // 2. For the lifetime start/end intrinsics the pointer operand only
8016 // does anything useful when the input comes from a stack object,
8017 // which suggests it should always be uniform. For non-stack objects
8018 // the effect is to poison the object, which still allows us to
8019 // remove the call.
8020 IsUniform = true;
8021 break;
8022 default:
8023 break;
8024 }
8025 }
8026 VPValue *BlockInMask = nullptr;
8027 if (!IsPredicated) {
8028 // Finalize the recipe for Instr, first if it is not predicated.
8029 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
8030 } else {
8031 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
8032 // Instructions marked for predication are replicated and a mask operand is
8033 // added initially. Masked replicate recipes will later be placed under an
8034 // if-then construct to prevent side-effects. Generate recipes to compute
8035 // the block mask for this region.
8036 BlockInMask = VPI->getMask();
8037 }
8038
8039 // Note that there is some custom logic to mark some intrinsics as uniform
8040 // manually above for scalable vectors, which this assert needs to account for
8041 // as well.
8042 assert((Range.Start.isScalar() || !IsUniform || !IsPredicated ||
8043 (Range.Start.isScalable() && isa<IntrinsicInst>(I))) &&
8044 "Should not predicate a uniform recipe");
8045 auto *Recipe =
8046 new VPReplicateRecipe(I, VPI->operandsWithoutMask(), IsUniform,
8047 BlockInMask, *VPI, *VPI, VPI->getDebugLoc());
8048 return Recipe;
8049}
8050
// NOTE(review): the signature's first lines (8051-8052) are elided in this
// listing; per the call site this is
// VPRecipeBuilder::tryToCreateWidenNonPhiRecipe -- dispatch a non-phi
// VPInstruction to the appropriate specialized widening routine, returning
// nullptr when the instruction should be replicated instead.
8053 VFRange &Range) {
8054 assert(!R->isPhi() && "phis must be handled earlier");
8055 // First, check for specific widening recipes that deal with optimizing
8056 // truncates, calls and memory operations.
8057
8058 VPRecipeBase *Recipe;
8059 auto *VPI = cast<VPInstruction>(R);
8060 if (VPI->getOpcode() == Instruction::Trunc &&
8061 (Recipe = tryToOptimizeInductionTruncate(VPI, Range)))
8062 return Recipe;
8063
8064 // All widen recipes below deal only with VF > 1.
8066 [&](ElementCount VF) { return VF.isScalar(); }, Range))
8067 return nullptr;
8068
8069 if (VPI->getOpcode() == Instruction::Call)
8070 return tryToWidenCall(VPI, Range);
8071
8072 Instruction *Instr = R->getUnderlyingInstr();
// Stores feeding a recognized histogram pattern get a dedicated recipe.
8073 if (VPI->getOpcode() == Instruction::Store)
8074 if (auto HistInfo = Legal->getHistogramInfo(cast<StoreInst>(Instr)))
8075 return tryToWidenHistogram(*HistInfo, VPI);
8076
8077 if (VPI->getOpcode() == Instruction::Load ||
8078 VPI->getOpcode() == Instruction::Store)
8079 return tryToWidenMemory(VPI, Range);
8080
8081 if (!shouldWiden(Instr, Range))
8082 return nullptr;
8083
8084 if (VPI->getOpcode() == Instruction::GetElementPtr)
8085 return new VPWidenGEPRecipe(cast<GetElementPtrInst>(Instr),
8086 VPI->operandsWithoutMask(), *VPI,
8087 VPI->getDebugLoc());
8088
8089 if (Instruction::isCast(VPI->getOpcode())) {
8090 auto *CI = cast<CastInst>(Instr);
8091 auto *CastR = cast<VPInstructionWithType>(VPI);
8092 return new VPWidenCastRecipe(CI->getOpcode(), VPI->getOperand(0),
8093 CastR->getResultType(), CI, *VPI, *VPI,
8094 VPI->getDebugLoc());
8095 }
8096
8097 return tryToWiden(VPI);
8098}
8099
/// Build VPlans with wide recipes for every clamped VF sub-range of
/// [\p MinVF, \p MaxVF], starting from a shared VPlan0 that is duplicated per
/// sub-range, then optimize each candidate plan.
// NOTE(review): several lines are elided in this listing (gaps in the
// embedded numbering, e.g. 8108, 8111, 8124, 8129, 8139-8146).
8100void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
8101 ElementCount MaxVF) {
8102 if (ElementCount::isKnownGT(MinVF, MaxVF))
8103 return;
8104
8105 assert(OrigLoop->isInnermost() && "Inner loop expected.");
8106
8107 const LoopAccessInfo *LAI = Legal->getLAI();
8109 OrigLoop, LI, DT, PSE.getSE());
8110 if (!LAI->getRuntimePointerChecking()->getChecks().empty() &&
8112 // Only use noalias metadata when using memory checks guaranteeing no
8113 // overlap across all iterations.
8114 LVer.prepareNoAliasMetadata();
8115 }
8116
8117 // Create initial base VPlan0, to serve as common starting point for all
8118 // candidates built later for specific VF ranges.
8119 auto VPlan0 = VPlanTransforms::buildVPlan0(
8120 OrigLoop, *LI, Legal->getWidestInductionType(),
8121 getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()), PSE, &LVer);
8122
8123 // Create recipes for header phis.
8125 *VPlan0, PSE, *OrigLoop, Legal->getInductionVars(),
8126 Legal->getReductionVars(), Legal->getFixedOrderRecurrences(),
8127 CM.getInLoopReductions(), Hints.allowReordering());
8128
8130
// Iterate over [MinVF, 2*MaxVF); each successful build clamps SubRange.End,
// which advances VF to the start of the next unhandled sub-range.
8131 auto MaxVFTimes2 = MaxVF * 2;
8132 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) {
8133 VFRange SubRange = {VF, MaxVFTimes2};
8134 if (auto Plan = tryToBuildVPlanWithVPRecipes(
8135 std::unique_ptr<VPlan>(VPlan0->duplicate()), SubRange, &LVer)) {
8136 // Now optimize the initial VPlan.
8137 VPlanTransforms::hoistPredicatedLoads(*Plan, PSE, OrigLoop);
8138 VPlanTransforms::sinkPredicatedStores(*Plan, PSE, OrigLoop);
8140 CM.getMinimalBitwidths());
8142 // TODO: try to put addExplicitVectorLength close to addActiveLaneMask
8143 if (CM.foldTailWithEVL()) {
8145 CM.getMaxSafeElements());
8147 }
8148
8149 if (auto P = VPlanTransforms::narrowInterleaveGroups(*Plan, TTI))
8150 VPlans.push_back(std::move(P));
8151
8152 assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
8153 VPlans.push_back(std::move(Plan));
8154 }
8155 VF = SubRange.End;
8156 }
8157}
8158
/// Convert the duplicated VPlan0 \p Plan into a plan with widened recipes for
/// the clamped \p Range: set up exit/middle handling, record interleave
/// groups, build wide or replicate recipes for every scalar VPInstruction,
/// then apply the remaining mandatory/optimizing transforms. Returns nullptr
/// when a mandatory transform bails out.
// NOTE(review): many lines are elided in this listing (gaps in the embedded
// numbering), so several statements below are visibly incomplete.
8159VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
8160 VPlanPtr Plan, VFRange &Range, LoopVersioning *LVer) {
8161
8162 using namespace llvm::VPlanPatternMatch;
8163 SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
8164
8165 // ---------------------------------------------------------------------------
8166 // Build initial VPlan: Scan the body of the loop in a topological order to
8167 // visit each basic block after having visited its predecessor basic blocks.
8168 // ---------------------------------------------------------------------------
8169
8170 bool RequiresScalarEpilogueCheck =
8172 [this](ElementCount VF) {
8173 return !CM.requiresScalarEpilogue(VF.isVector());
8174 },
8175 Range);
8176 VPlanTransforms::handleEarlyExits(*Plan, Legal->hasUncountableEarlyExit());
8177 VPlanTransforms::addMiddleCheck(*Plan, RequiresScalarEpilogueCheck,
8178 CM.foldTailByMasking());
8179
8181
8182 // Don't use getDecisionAndClampRange here, because we don't know the UF
8183 // so this function is better to be conservative, rather than to split
8184 // it up into different VPlans.
8185 // TODO: Consider using getDecisionAndClampRange here to split up VPlans.
8186 bool IVUpdateMayOverflow = false;
8187 for (ElementCount VF : Range)
8188 IVUpdateMayOverflow |= !isIndvarOverflowCheckKnownFalse(&CM, VF);
8189
8190 TailFoldingStyle Style = CM.getTailFoldingStyle(IVUpdateMayOverflow);
8191 // Use NUW for the induction increment if we proved that it won't overflow in
8192 // the vector loop or when not folding the tail. In the latter case, we know
8193 // that the canonical induction increment will not overflow as the vector trip
8194 // count is >= increment and a multiple of the increment.
8195 VPRegionBlock *LoopRegion = Plan->getVectorLoopRegion();
8196 bool HasNUW = !IVUpdateMayOverflow || Style == TailFoldingStyle::None;
8197 if (!HasNUW) {
8198 auto *IVInc =
8199 LoopRegion->getExitingBasicBlock()->getTerminator()->getOperand(0);
8200 assert(match(IVInc,
8201 m_VPInstruction<Instruction::Add>(
8202 m_Specific(LoopRegion->getCanonicalIV()), m_VPValue())) &&
8203 "Did not find the canonical IV increment");
8204 cast<VPRecipeWithIRFlags>(IVInc)->dropPoisonGeneratingFlags();
8205 }
8206
8207 // ---------------------------------------------------------------------------
8208 // Pre-construction: record ingredients whose recipes we'll need to further
8209 // process after constructing the initial VPlan.
8210 // ---------------------------------------------------------------------------
8211
8212 // For each interleave group which is relevant for this (possibly trimmed)
8213 // Range, add it to the set of groups to be later applied to the VPlan and add
8214 // placeholders for its members' Recipes which we'll be replacing with a
8215 // single VPInterleaveRecipe.
8216 for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
8217 auto ApplyIG = [IG, this](ElementCount VF) -> bool {
8218 bool Result = (VF.isVector() && // Query is illegal for VF == 1
8219 CM.getWideningDecision(IG->getInsertPos(), VF) ==
8221 // For scalable vectors, the interleave factors must be <= 8 since we
8222 // require the (de)interleaveN intrinsics instead of shufflevectors.
8223 assert((!Result || !VF.isScalable() || IG->getFactor() <= 8) &&
8224 "Unsupported interleave factor for scalable vectors");
8225 return Result;
8226 };
8227 if (!getDecisionAndClampRange(ApplyIG, Range))
8228 continue;
8229 InterleaveGroups.insert(IG);
8230 }
8231
8232 // ---------------------------------------------------------------------------
8233 // Predicate and linearize the top-level loop region.
8234 // ---------------------------------------------------------------------------
8236 CM.foldTailByMasking());
8237
8238 // ---------------------------------------------------------------------------
8239 // Construct wide recipes and apply predication for original scalar
8240 // VPInstructions in the loop.
8241 // ---------------------------------------------------------------------------
8242 VPRecipeBuilder RecipeBuilder(*Plan, TLI, Legal, CM, Builder);
8243
8244 // Scan the body of the loop in a topological order to visit each basic block
8245 // after having visited its predecessor basic blocks.
8246 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
8247 ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
8248 HeaderVPBB);
8249
8250 auto *MiddleVPBB = Plan->getMiddleBlock();
8251 VPBasicBlock::iterator MBIP = MiddleVPBB->getFirstNonPhi();
8252
8253 // Collect blocks that need predication for in-loop reduction recipes.
8254 DenseSet<BasicBlock *> BlocksNeedingPredication;
8255 for (BasicBlock *BB : OrigLoop->blocks())
8256 if (CM.blockNeedsPredicationForAnyReason(BB))
8257 BlocksNeedingPredication.insert(BB);
8258
8259 VPlanTransforms::createInLoopReductionRecipes(*Plan, BlocksNeedingPredication,
8260 Range.Start);
8261
8262 // Now process all other blocks and instructions.
8263 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
8264 // Convert input VPInstructions to widened recipes.
8265 for (VPRecipeBase &R : make_early_inc_range(
8266 make_range(VPBB->getFirstNonPhi(), VPBB->end()))) {
8267 // Skip recipes that do not need transforming.
8269 continue;
8270 auto *VPI = cast<VPInstruction>(&R);
8271 if (!VPI->getUnderlyingValue())
8272 continue;
8273
8274 // TODO: Gradually replace uses of underlying instruction by analyses on
8275 // VPlan. Migrate code relying on the underlying instruction from VPlan0
8276 // to construct recipes below to not use the underlying instruction.
8278 Builder.setInsertPoint(VPI);
8279
8280 // The stores with invariant address inside the loop will be deleted, and
8281 // in the exit block, a uniform store recipe will be created for the final
8282 // invariant store of the reduction.
8283 StoreInst *SI;
8284 if ((SI = dyn_cast<StoreInst>(Instr)) &&
8285 Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) {
8286 // Only create recipe for the final invariant store of the reduction.
8287 if (Legal->isInvariantStoreOfReduction(SI)) {
8288 auto *Recipe = new VPReplicateRecipe(
8289 SI, VPI->operandsWithoutMask(), true /* IsUniform */,
8290 nullptr /*Mask*/, *VPI, *VPI, VPI->getDebugLoc());
8291 Recipe->insertBefore(*MiddleVPBB, MBIP);
8292 }
8293 R.eraseFromParent();
8294 continue;
8295 }
8296
// Prefer a specialized widened recipe; fall back to per-lane replication.
8297 VPRecipeBase *Recipe =
8298 RecipeBuilder.tryToCreateWidenNonPhiRecipe(VPI, Range);
8299 if (!Recipe)
8300 Recipe =
8301 RecipeBuilder.handleReplication(cast<VPInstruction>(VPI), Range);
8302
8303 RecipeBuilder.setRecipe(Instr, Recipe);
8304 if (isa<VPWidenIntOrFpInductionRecipe>(Recipe) && isa<TruncInst>(Instr)) {
8305 // Optimized a truncate to VPWidenIntOrFpInductionRecipe. It needs to be
8306 // moved to the phi section in the header.
8307 Recipe->insertBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi());
8308 } else {
8309 Builder.insert(Recipe);
8310 }
8311 if (Recipe->getNumDefinedValues() == 1) {
8312 VPI->replaceAllUsesWith(Recipe->getVPSingleValue());
8313 } else {
8314 assert(Recipe->getNumDefinedValues() == 0 &&
8315 "Unexpected multidef recipe");
8316 }
8317 R.eraseFromParent();
8318 }
8319 }
8320
8321 assert(isa<VPRegionBlock>(LoopRegion) &&
8322 !LoopRegion->getEntryBasicBlock()->empty() &&
8323 "entry block must be set to a VPRegionBlock having a non-empty entry "
8324 "VPBasicBlock");
8325
8326 // TODO: We can't call runPass on these transforms yet, due to verifier
8327 // failures.
8329 DenseMap<VPValue *, VPValue *> IVEndValues;
8330 VPlanTransforms::updateScalarResumePhis(*Plan, IVEndValues);
8331
8332 // ---------------------------------------------------------------------------
8333 // Transform initial VPlan: Apply previously taken decisions, in order, to
8334 // bring the VPlan to its final state.
8335 // ---------------------------------------------------------------------------
8336
8337 addReductionResultComputation(Plan, RecipeBuilder, Range.Start);
8338
8339 // Optimize FindIV reductions to use sentinel-based approach when possible.
8341 *OrigLoop);
8342
8343 // Apply mandatory transformation to handle reductions with multiple in-loop
8344 // uses if possible, bail out otherwise.
8346 OrigLoop))
8347 return nullptr;
8348 // Apply mandatory transformation to handle FP maxnum/minnum reduction with
8349 // NaNs if possible, bail out otherwise.
8351 return nullptr;
8352
8353 // Create whole-vector selects for find-last recurrences.
8355 return nullptr;
8356
8357 // Create partial reduction recipes for scaled reductions and transform
8358 // recipes to abstract recipes if it is legal and beneficial and clamp the
8359 // range for better cost estimation.
8360 // TODO: Enable following transform when the EVL-version of extended-reduction
8361 // and mulacc-reduction are implemented.
8362 if (!CM.foldTailWithEVL()) {
8363 VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind, CM.PSE,
8364 OrigLoop);
8366 Range);
8368 Range);
8369 }
8370
8371 for (ElementCount VF : Range)
8372 Plan->addVF(VF);
8373 Plan->setName("Initial VPlan");
8374
8375 // Interleave memory: for each Interleave Group we marked earlier as relevant
8376 // for this VPlan, replace the Recipes widening its memory instructions with a
8377 // single VPInterleaveRecipe at its insertion point.
8379 InterleaveGroups, RecipeBuilder, CM.isScalarEpilogueAllowed());
8380
8381 // Replace VPValues for known constant strides.
8383 Legal->getLAI()->getSymbolicStrides());
8384
8385 auto BlockNeedsPredication = [this](BasicBlock *BB) {
8386 return Legal->blockNeedsPredication(BB);
8387 };
8389 BlockNeedsPredication);
8390
8391 // Sink users of fixed-order recurrence past the recipe defining the previous
8392 // value and introduce FirstOrderRecurrenceSplice VPInstructions.
8394 Builder))
8395 return nullptr;
8396
8397 if (useActiveLaneMask(Style)) {
8398 // TODO: Move checks to VPlanTransforms::addActiveLaneMask once
8399 // TailFoldingStyle is visible there.
8400 bool ForControlFlow = useActiveLaneMaskForControlFlow(Style);
8401 bool WithoutRuntimeCheck =
8403 VPlanTransforms::addActiveLaneMask(*Plan, ForControlFlow,
8404 WithoutRuntimeCheck);
8405 }
8406 VPlanTransforms::optimizeInductionExitUsers(*Plan, IVEndValues, PSE);
8407
8408 assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
8409 return Plan;
8410}
8411
/// Build a VPlan for the VPlan-native (outer-loop) path for the VFs in
/// \p Range; no reductions or recurrences are modeled on this path.
// NOTE(review): a few lines are elided in this listing (e.g. 8424, 8429,
// 8434, 8439), per gaps in the embedded numbering.
8412VPlanPtr LoopVectorizationPlanner::tryToBuildVPlan(VFRange &Range) {
8413 // Outer loop handling: They may require CFG and instruction level
8414 // transformations before even evaluating whether vectorization is profitable.
8415 // Since we cannot modify the incoming IR, we need to build VPlan upfront in
8416 // the vectorization pipeline.
8417 assert(!OrigLoop->isInnermost());
8418 assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
8419
8420 auto Plan = VPlanTransforms::buildVPlan0(
8421 OrigLoop, *LI, Legal->getWidestInductionType(),
8422 getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()), PSE);
8423
// Header phis are built with empty reduction/recurrence sets: only
// inductions are modeled on the native path.
8425 *Plan, PSE, *OrigLoop, Legal->getInductionVars(),
8426 MapVector<PHINode *, RecurrenceDescriptor>(),
8427 SmallPtrSet<const PHINode *, 1>(), SmallPtrSet<PHINode *, 1>(),
8428 /*AllowReordering=*/false);
8430 /*HasUncountableExit*/ false);
8431 VPlanTransforms::addMiddleCheck(*Plan, /*RequiresScalarEpilogue*/ true,
8432 /*TailFolded*/ false);
8433
8435
8436 for (ElementCount VF : Range)
8437 Plan->addVF(VF);
8438
8440 return nullptr;
8441
8442 // TODO: IVEndValues are not used yet in the native path, to optimize exit
8443 // values.
8444 // TODO: We can't call runPass on the transform yet, due to verifier
8445 // failures.
8446 DenseMap<VPValue *, VPValue *> IVEndValues;
8447 VPlanTransforms::updateScalarResumePhis(*Plan, IVEndValues);
8448
8449 assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
8450 return Plan;
8451}
8452
8453void LoopVectorizationPlanner::addReductionResultComputation(
8454 VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, ElementCount MinVF) {
8455 using namespace VPlanPatternMatch;
8456 VPTypeAnalysis TypeInfo(*Plan);
8457 VPRegionBlock *VectorLoopRegion = Plan->getVectorLoopRegion();
8458 VPBasicBlock *MiddleVPBB = Plan->getMiddleBlock();
8460 VPBasicBlock *LatchVPBB = VectorLoopRegion->getExitingBasicBlock();
8461 Builder.setInsertPoint(&*std::prev(std::prev(LatchVPBB->end())));
8462 VPBasicBlock::iterator IP = MiddleVPBB->getFirstNonPhi();
8463 for (VPRecipeBase &R :
8464 Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
8465 VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
8466 // TODO: Remove check for constant incoming value once removeDeadRecipes is
8467 // used on VPlan0.
8468 if (!PhiR || isa<VPIRValue>(PhiR->getOperand(1)))
8469 continue;
8470
8471 const RecurrenceDescriptor &RdxDesc = Legal->getRecurrenceDescriptor(
8473 Type *PhiTy = TypeInfo.inferScalarType(PhiR);
8474 // If tail is folded by masking, introduce selects between the phi
8475 // and the users outside the vector region of each reduction, at the
8476 // beginning of the dedicated latch block.
8477 auto *OrigExitingVPV = PhiR->getBackedgeValue();
8478 auto *NewExitingVPV = PhiR->getBackedgeValue();
8479 // Don't output selects for partial reductions because they have an output
8480 // with fewer lanes than the VF. So the operands of the select would have
8481 // different numbers of lanes. Partial reductions mask the input instead.
8482 auto *RR = dyn_cast<VPReductionRecipe>(OrigExitingVPV->getDefiningRecipe());
8483 if (!PhiR->isInLoop() && CM.foldTailByMasking() &&
8484 (!RR || !RR->isPartialReduction())) {
8485 VPValue *Cond = vputils::findHeaderMask(*Plan);
8486 NewExitingVPV =
8487 Builder.createSelect(Cond, OrigExitingVPV, PhiR, {}, "", *PhiR);
8488 OrigExitingVPV->replaceUsesWithIf(NewExitingVPV, [](VPUser &U, unsigned) {
8489 using namespace VPlanPatternMatch;
8490 return match(
8491 &U, m_CombineOr(
8492 m_VPInstruction<VPInstruction::ComputeAnyOfResult>(),
8493 m_VPInstruction<VPInstruction::ComputeReductionResult>()));
8494 });
8495 if (CM.usePredicatedReductionSelect())
8496 PhiR->setOperand(1, NewExitingVPV);
8497 }
8498
8499 // We want code in the middle block to appear to execute on the location of
8500 // the scalar loop's latch terminator because: (a) it is all compiler
8501 // generated, (b) these instructions are always executed after evaluating
8502 // the latch conditional branch, and (c) other passes may add new
8503 // predecessors which terminate on this line. This is the easiest way to
8504 // ensure we don't accidentally cause an extra step back into the loop while
8505 // debugging.
8506 DebugLoc ExitDL = OrigLoop->getLoopLatch()->getTerminator()->getDebugLoc();
8507
8508 // TODO: At the moment ComputeReductionResult also drives creation of the
8509 // bc.merge.rdx phi nodes, hence it needs to be created unconditionally here
8510 // even for in-loop reductions, until the reduction resume value handling is
8511 // also modeled in VPlan.
8512 VPInstruction *FinalReductionResult;
8513 VPBuilder::InsertPointGuard Guard(Builder);
8514 Builder.setInsertPoint(MiddleVPBB, IP);
8515 RecurKind RecurrenceKind = PhiR->getRecurrenceKind();
8516 // For AnyOf reductions, find the select among PhiR's users. This is used
8517 // both to find NewVal for ComputeAnyOfResult and to adjust the reduction.
8518 VPRecipeBase *AnyOfSelect = nullptr;
8519 if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RecurrenceKind)) {
8520 AnyOfSelect = cast<VPRecipeBase>(*find_if(PhiR->users(), [](VPUser *U) {
8521 return match(U, m_Select(m_VPValue(), m_VPValue(), m_VPValue()));
8522 }));
8523 }
8524 if (AnyOfSelect) {
8525 VPValue *Start = PhiR->getStartValue();
8526 // NewVal is the non-phi operand of the select.
8527 VPValue *NewVal = AnyOfSelect->getOperand(1) == PhiR
8528 ? AnyOfSelect->getOperand(2)
8529 : AnyOfSelect->getOperand(1);
8530 FinalReductionResult =
8531 Builder.createNaryOp(VPInstruction::ComputeAnyOfResult,
8532 {Start, NewVal, NewExitingVPV}, ExitDL);
8533 } else {
8534 VPIRFlags Flags(RecurrenceKind, PhiR->isOrdered(), PhiR->isInLoop(),
8535 PhiR->getFastMathFlags());
8536 FinalReductionResult =
8537 Builder.createNaryOp(VPInstruction::ComputeReductionResult,
8538 {NewExitingVPV}, Flags, ExitDL);
8539 }
8540 // If the vector reduction can be performed in a smaller type, we truncate
8541 // then extend the loop exit value to enable InstCombine to evaluate the
8542 // entire expression in the smaller type.
8543 if (MinVF.isVector() && PhiTy != RdxDesc.getRecurrenceType() &&
8545 assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
8547 "Unexpected truncated min-max recurrence!");
8548 Type *RdxTy = RdxDesc.getRecurrenceType();
8549 VPWidenCastRecipe *Trunc;
8550 Instruction::CastOps ExtendOpc =
8551 RdxDesc.isSigned() ? Instruction::SExt : Instruction::ZExt;
8552 VPWidenCastRecipe *Extnd;
8553 {
8554 VPBuilder::InsertPointGuard Guard(Builder);
8555 Builder.setInsertPoint(
8556 NewExitingVPV->getDefiningRecipe()->getParent(),
8557 std::next(NewExitingVPV->getDefiningRecipe()->getIterator()));
8558 Trunc =
8559 Builder.createWidenCast(Instruction::Trunc, NewExitingVPV, RdxTy);
8560 Extnd = Builder.createWidenCast(ExtendOpc, Trunc, PhiTy);
8561 }
8562 if (PhiR->getOperand(1) == NewExitingVPV)
8563 PhiR->setOperand(1, Extnd->getVPSingleValue());
8564
8565 // Update ComputeReductionResult with the truncated exiting value and
8566 // extend its result. Operand 0 provides the values to be reduced.
8567 FinalReductionResult->setOperand(0, Trunc);
8568 FinalReductionResult =
8569 Builder.createScalarCast(ExtendOpc, FinalReductionResult, PhiTy, {});
8570 }
8571
8572 // Update all users outside the vector region. Also replace redundant
8573 // extracts.
8574 for (auto *U : to_vector(OrigExitingVPV->users())) {
8575 auto *Parent = cast<VPRecipeBase>(U)->getParent();
8576 if (FinalReductionResult == U || Parent->getParent())
8577 continue;
8578 // Skip FindIV reduction chain recipes (ComputeReductionResult, icmp).
8580 match(U, m_CombineOr(
8581 m_VPInstruction<VPInstruction::ComputeReductionResult>(),
8582 m_VPInstruction<Instruction::ICmp>())))
8583 continue;
8584 U->replaceUsesOfWith(OrigExitingVPV, FinalReductionResult);
8585
8586 // Look through ExtractLastPart.
8588 U = cast<VPInstruction>(U)->getSingleUser();
8589
8592 cast<VPInstruction>(U)->replaceAllUsesWith(FinalReductionResult);
8593 }
8594
8595 // Adjust AnyOf reductions; replace the reduction phi for the selected value
8596 // with a boolean reduction phi node to check if the condition is true in
8597 // any iteration. The final value is selected by the final
8598 // ComputeReductionResult.
8599 if (AnyOfSelect) {
8600 VPValue *Cmp = AnyOfSelect->getOperand(0);
8601 // If the compare is checking the reduction PHI node, adjust it to check
8602 // the start value.
8603 if (VPRecipeBase *CmpR = Cmp->getDefiningRecipe())
8604 CmpR->replaceUsesOfWith(PhiR, PhiR->getStartValue());
8605 Builder.setInsertPoint(AnyOfSelect);
8606
8607 // If the true value of the select is the reduction phi, the new value is
8608 // selected if the negated condition is true in any iteration.
8609 if (AnyOfSelect->getOperand(1) == PhiR)
8610 Cmp = Builder.createNot(Cmp);
8611 VPValue *Or = Builder.createOr(PhiR, Cmp);
8612 AnyOfSelect->getVPSingleValue()->replaceAllUsesWith(Or);
8613 // Delete AnyOfSelect now that it has invalid types.
8614 ToDelete.push_back(AnyOfSelect);
8615
8616 // Convert the reduction phi to operate on bools.
8617 PhiR->setOperand(0, Plan->getFalse());
8618 continue;
8619 }
8620
8621 RecurKind RK = PhiR->getRecurrenceKind();
8626 VPBuilder PHBuilder(Plan->getVectorPreheader());
8627 VPValue *Iden = Plan->getOrAddLiveIn(
8628 getRecurrenceIdentity(RK, PhiTy, PhiR->getFastMathFlags()));
8629 auto *ScaleFactorVPV = Plan->getConstantInt(32, 1);
8630 VPValue *StartV = PHBuilder.createNaryOp(
8632 {PhiR->getStartValue(), Iden, ScaleFactorVPV}, *PhiR);
8633 PhiR->setOperand(0, StartV);
8634 }
8635 }
8636 for (VPRecipeBase *R : ToDelete)
8637 R->eraseFromParent();
8638
8640}
8641
8642void LoopVectorizationPlanner::attachRuntimeChecks(
8643 VPlan &Plan, GeneratedRTChecks &RTChecks, bool HasBranchWeights) const {
8644 const auto &[SCEVCheckCond, SCEVCheckBlock] = RTChecks.getSCEVChecks();
8645 if (SCEVCheckBlock && SCEVCheckBlock->hasNPredecessors(0)) {
8646 assert((!CM.OptForSize ||
8647 CM.Hints->getForce() == LoopVectorizeHints::FK_Enabled) &&
8648 "Cannot SCEV check stride or overflow when optimizing for size");
8649 VPlanTransforms::attachCheckBlock(Plan, SCEVCheckCond, SCEVCheckBlock,
8650 HasBranchWeights);
8651 }
8652 const auto &[MemCheckCond, MemCheckBlock] = RTChecks.getMemRuntimeChecks();
8653 if (MemCheckBlock && MemCheckBlock->hasNPredecessors(0)) {
8654 // VPlan-native path does not do any analysis for runtime checks
8655 // currently.
8656 assert((!EnableVPlanNativePath || OrigLoop->isInnermost()) &&
8657 "Runtime checks are not supported for outer loops yet");
8658
8659 if (CM.OptForSize) {
8660 assert(
8661 CM.Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
8662 "Cannot emit memory checks when optimizing for size, unless forced "
8663 "to vectorize.");
8664 ORE->emit([&]() {
8665 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
8666 OrigLoop->getStartLoc(),
8667 OrigLoop->getHeader())
8668 << "Code-size may be reduced by not forcing "
8669 "vectorization, or by source-code modifications "
8670 "eliminating the need for runtime checks "
8671 "(e.g., adding 'restrict').";
8672 });
8673 }
8674 VPlanTransforms::attachCheckBlock(Plan, MemCheckCond, MemCheckBlock,
8675 HasBranchWeights);
8676 }
8677}
8678
8680 VPlan &Plan, ElementCount VF, unsigned UF,
8681 ElementCount MinProfitableTripCount) const {
// Forward the scalar latch's profile-derived branch weights, when present,
// to the minimum-iteration check; otherwise pass no weights.
// NOTE(review): this rendered listing elides the '?' arm of the conditional
// (l. 8684) and the callee's name line (l. 8686) — confirm against the
// original source before editing.
8682 const uint32_t *BranchWeights =
8683 hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())
8685 : nullptr;
8687 Plan, VF, UF, MinProfitableTripCount,
8688 CM.requiresScalarEpilogue(VF.isVector()), CM.foldTailByMasking(),
8689 OrigLoop, BranchWeights,
8690 OrigLoop->getLoopPredecessor()->getTerminator()->getDebugLoc(), PSE);
8691}
8692
8693// Determine how to lower the scalar epilogue, which depends on 1) optimising
8694// for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
8695// predication, and 4) a TTI hook that analyses whether the loop is suitable
8696// for predication.
// NOTE(review): the listing elides the function's signature line and every
// 'return CM_*' statement, as well as the bodies of the option/hint switches
// below — consult the original source for the returned enum values.
8698 Function *F, Loop *L, LoopVectorizeHints &Hints, bool OptForSize,
8701 // 1) OptSize takes precedence over all other options, i.e. if this is set,
8702 // don't look at hints or options, and don't request a scalar epilogue.
8703 if (F->hasOptSize() ||
8704 (OptForSize && Hints.getForce() != LoopVectorizeHints::FK_Enabled))
8706
8707 // 2) If set, obey the directives
// NOTE(review): the switch over PreferPredicateOverEpilogue's value
// (ll. 8709-8715) is elided in this listing.
8708 if (PreferPredicateOverEpilogue.getNumOccurrences()) {
8716 };
8717 }
8718
8719 // 3) If set, obey the hints
// NOTE(review): case labels (ll. 8721-8724) elided.
8720 switch (Hints.getPredicate()) {
8725 };
8726
8727 // 4) if the TTI hook indicates this is profitable, request predication.
8728 TailFoldingInfo TFI(TLI, &LVL, IAI);
8729 if (TTI->preferPredicateOverEpilogue(&TFI))
8731
8733}
8734
8735// Process the loop in the VPlan-native vectorization path. This path builds
8736// VPlan upfront in the vectorization pipeline, which allows to apply
8737// VPlan-to-VPlan transformations from the very beginning without modifying the
8738// input LLVM IR.
// NOTE(review): the listing elides the function's signature (ll. 8739-8743)
// and several condition lines below — verify against the original source.
8744 std::function<BlockFrequencyInfo &()> GetBFI, bool OptForSize,
8745 LoopVectorizeHints &Hints, LoopVectorizationRequirements &Requirements) {
8746
// Bail out if the outer-loop trip count cannot be computed (the guarding
// condition at l. 8747 is elided in this listing).
8748 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
8749 return false;
8750 }
8751 assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
8752 Function *F = L->getHeader()->getParent();
8753 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
8754
// Decide how the scalar epilogue is lowered (the assigned variable's
// declaration at l. 8755 is elided).
8756 getScalarEpilogueLowering(F, L, Hints, OptForSize, TTI, TLI, *LVL, &IAI);
8757
8758 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE,
8759 GetBFI, F, &Hints, IAI, OptForSize);
8760 // Use the planner for outer loop vectorization.
8761 // TODO: CM is not used at this point inside the planner. Turn CM into an
8762 // optional argument if we don't need it in the future.
8763 LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, LVL, CM, IAI, PSE, Hints,
8764 ORE);
8765
8766 // Get user vectorization factor.
8767 ElementCount UserVF = Hints.getWidth();
8768
8770
8771 // Plan how to best vectorize, return the best VF and its cost.
8772 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
8773
8774 // If we are stress testing VPlan builds, do not attempt to generate vector
8775 // code. Masked vector code generation support will follow soon.
8776 // Also, do not attempt to vectorize if no vector code will be produced.
8778 return false;
8779
8780 VPlan &BestPlan = LVP.getPlanFor(VF.Width);
8781
// Generate vector code for the chosen plan inside a scope so that the
// runtime-check bookkeeping is torn down before reporting.
8782 {
8783 GeneratedRTChecks Checks(PSE, DT, LI, TTI, CM.CostKind);
8784 InnerLoopVectorizer LB(L, PSE, LI, DT, TTI, AC, VF.Width, /*UF=*/1, &CM,
8785 Checks, BestPlan);
8786 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
8787 << L->getHeader()->getParent()->getName() << "\"\n");
8788 LVP.addMinimumIterationCheck(BestPlan, VF.Width, /*UF=*/1,
8790
8791 LVP.executePlan(VF.Width, /*UF=*/1, BestPlan, LB, DT, false);
8792 }
8793
8794 reportVectorization(ORE, L, VF, 1);
8795
8796 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
8797 return true;
8798}
8799
8800// Emit a remark if there are stores to floats that required a floating point
8801// extension. If the vectorized loop was generated with floating point there
8802// will be a performance penalty from the conversion overhead and the change in
8803// the vector width.
// NOTE(review): the listing elides the signature and the Worklist declaration
// (ll. 8804-8805) and the Visited/EmittedRemark set declarations
// (ll. 8817-8818).
// Seed the worklist with every in-loop store of a 'float' value.
8806 for (BasicBlock *BB : L->getBlocks()) {
8807 for (Instruction &Inst : *BB) {
8808 if (auto *S = dyn_cast<StoreInst>(&Inst)) {
8809 if (S->getValueOperand()->getType()->isFloatTy())
8810 Worklist.push_back(S);
8811 }
8812 }
8813 }
8814
8815 // Traverse the floating point stores upwards, searching for floating point
8816 // conversions.
8819 while (!Worklist.empty()) {
8820 auto *I = Worklist.pop_back_val();
// Stay within the loop and visit each instruction at most once.
8821 if (!L->contains(I))
8822 continue;
8823 if (!Visited.insert(I).second)
8824 continue;
8825
8826 // Emit a remark if the floating point store required a floating
8827 // point conversion.
8828 // TODO: More work could be done to identify the root cause such as a
8829 // constant or a function return type and point the user to it.
8830 if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second)
8831 ORE->emit([&]() {
8832 return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision",
8833 I->getDebugLoc(), L->getHeader())
8834 << "floating point conversion changes vector width. "
8835 << "Mixed floating point precision requires an up/down "
8836 << "cast that will negatively impact performance.";
8837 });
8838
// Continue the backwards traversal through the operands.
8839 for (Use &Op : I->operands())
8840 if (auto *OpI = dyn_cast<Instruction>(Op))
8841 Worklist.push_back(OpI);
8842 }
8843}
8844
8845/// For loops with uncountable early exits, find the cost of doing work when
8846/// exiting the loop early, such as calculating the final exit values of
8847/// variables used outside the loop.
8848/// TODO: This is currently overly pessimistic because the loop may not take
8849/// the early exit, but better to keep this conservative for now. In future,
8850/// it might be possible to relax this by using branch probabilities.
// NOTE(review): the signature's first line (l. 8851) is elided in this
// listing; the visible parameters are the plan and the vectorization factor.
8852 VPlan &Plan, ElementCount VF) {
8853 InstructionCost Cost = 0;
// Sum the cost of every exit-block predecessor that is not the middle block,
// i.e. the vector.early.exit blocks.
8854 for (auto *ExitVPBB : Plan.getExitBlocks()) {
8855 for (auto *PredVPBB : ExitVPBB->getPredecessors()) {
8856 // If the predecessor is not the middle.block, then it must be the
8857 // vector.early.exit block, which may contain work to calculate the exit
8858 // values of variables used outside the loop.
8859 if (PredVPBB != Plan.getMiddleBlock()) {
8860 LLVM_DEBUG(dbgs() << "Calculating cost of work in exit block "
8861 << PredVPBB->getName() << ":\n");
8862 Cost += PredVPBB->cost(VF, CostCtx);
8863 }
8864 }
8865 }
8866 return Cost;
8867}
8868
8869/// This function determines whether or not it's still profitable to vectorize
8870/// the loop given the extra work we have to do outside of the loop:
8871/// 1. Perform the runtime checks before entering the loop to ensure it's safe
8872/// to vectorize.
8873/// 2. In the case of loops with uncountable early exits, we may have to do
8874/// extra work when exiting the loop early, such as calculating the final
8875/// exit values of variables used outside the loop.
8876/// 3. The middle block.
// NOTE(review): this listing elides two parameter lines (ll. 8879, 8881).
8877static bool isOutsideLoopWorkProfitable(GeneratedRTChecks &Checks,
8878 VectorizationFactor &VF, Loop *L,
8880 VPCostContext &CostCtx, VPlan &Plan,
8882 std::optional<unsigned> VScale) {
// An invalid runtime-check cost means we cannot reason about profitability.
8883 InstructionCost RtC = Checks.getCost();
8884 if (!RtC.isValid())
8885 return false;
8886
8887 // When interleaving only scalar and vector cost will be equal, which in turn
8888 // would lead to a divide by 0. Fall back to hard threshold.
8889 if (VF.Width.isScalar()) {
8890 // TODO: Should we rename VectorizeMemoryCheckThreshold?
// NOTE(review): the threshold comparison guarding this early bail-out
// (l. 8891) is elided in this listing.
8892 LLVM_DEBUG(
8893 dbgs()
8894 << "LV: Interleaving only is not profitable due to runtime checks\n");
8895 return false;
8896 }
8897 return true;
8898 }
8899
8900 // The scalar cost should only be 0 when vectorizing with a user specified
8901 // VF/IC. In those cases, runtime checks should always be generated.
8902 uint64_t ScalarC = VF.ScalarCost.getValue();
8903 if (ScalarC == 0)
8904 return true;
8905
8906 InstructionCost TotalCost = RtC;
8907 // Add on the cost of any work required in the vector early exit block, if
8908 // one exists.
8909 TotalCost += calculateEarlyExitCost(CostCtx, Plan, VF.Width);
8910 TotalCost += Plan.getMiddleBlock()->cost(VF.Width, CostCtx);
8911
8912 // First, compute the minimum iteration count required so that the vector
8913 // loop outperforms the scalar loop.
8914 // The total cost of the scalar loop is
8915 // ScalarC * TC
8916 // where
8917 // * TC is the actual trip count of the loop.
8918 // * ScalarC is the cost of a single scalar iteration.
8919 //
8920 // The total cost of the vector loop is
8921 // TotalCost + VecC * (TC / VF) + EpiC
8922 // where
8923 // * TotalCost is the sum of the costs cost of
8924 // - the generated runtime checks, i.e. RtC
8925 // - performing any additional work in the vector.early.exit block for
8926 // loops with uncountable early exits.
8927 // - the middle block, if ExpectedTC <= VF.Width.
8928 // * VecC is the cost of a single vector iteration.
8929 // * TC is the actual trip count of the loop
8930 // * VF is the vectorization factor
8931 // * EpiCost is the cost of the generated epilogue, including the cost
8932 // of the remaining scalar operations.
8933 //
8934 // Vectorization is profitable once the total vector cost is less than the
8935 // total scalar cost:
8936 // TotalCost + VecC * (TC / VF) + EpiC < ScalarC * TC
8937 //
8938 // Now we can compute the minimum required trip count TC as
8939 // VF * (TotalCost + EpiC) / (ScalarC * VF - VecC) < TC
8940 //
8941 // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that
8942 // the computations are performed on doubles, not integers and the result
8943 // is rounded up, hence we get an upper estimate of the TC.
8944 unsigned IntVF = estimateElementCount(VF.Width, VScale);
8945 uint64_t Div = ScalarC * IntVF - VF.Cost.getValue();
8946 uint64_t MinTC1 =
8947 Div == 0 ? 0 : divideCeil(TotalCost.getValue() * IntVF, Div);
8948
8949 // Second, compute a minimum iteration count so that the cost of the
8950 // runtime checks is only a fraction of the total scalar loop cost. This
8951 // adds a loop-dependent bound on the overhead incurred if the runtime
8952 // checks fail. In case the runtime checks fail, the cost is RtC + ScalarC
8953 // * TC. To bound the runtime check to be a fraction 1/X of the scalar
8954 // cost, compute
8955 // RtC < ScalarC * TC * (1 / X) ==> RtC * X / ScalarC < TC
8956 uint64_t MinTC2 = divideCeil(RtC.getValue() * 10, ScalarC);
8957
8958 // Now pick the larger minimum. If it is not a multiple of VF and a scalar
8959 // epilogue is allowed, choose the next closest multiple of VF. This should
8960 // partly compensate for ignoring the epilogue cost.
8961 uint64_t MinTC = std::max(MinTC1, MinTC2);
8962 if (SEL == CM_ScalarEpilogueAllowed)
8963 MinTC = alignTo(MinTC, IntVF);
// NOTE(review): the store of MinTC into VF.MinProfitableTripCount (l. 8964)
// is elided in this listing; the debug output below reads that field.
8965
8966 LLVM_DEBUG(
8967 dbgs() << "LV: Minimum required TC for runtime checks to be profitable:"
8968 << VF.MinProfitableTripCount << "\n");
8969
8970 // Skip vectorization if the expected trip count is less than the minimum
8971 // required trip count.
8972 if (auto ExpectedTC = getSmallBestKnownTC(PSE, L)) {
8973 if (ElementCount::isKnownLT(*ExpectedTC, VF.MinProfitableTripCount)) {
8974 LLVM_DEBUG(dbgs() << "LV: Vectorization is not beneficial: expected "
8975 "trip count < minimum profitable VF ("
8976 << *ExpectedTC << " < " << VF.MinProfitableTripCount
8977 << ")\n");
8978
8979 return false;
8980 }
8981 }
8982 return true;
8983}
8984
// NOTE(review): LoopVectorizePass constructor initializer list; the listing
// elides the signature (l. 8985) and the right-hand operands of both '||'
// expressions (ll. 8987, 8989) — consult the original source.
8986 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
8988 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
8990
8991/// Prepare \p MainPlan for vectorizing the main vector loop during epilogue
8992/// vectorization. Remove ResumePhis from \p MainPlan for inductions that
8993/// don't have a corresponding wide induction in \p EpiPlan.
// NOTE(review): this listing elides several lines (e.g. ll. 9000-9001,
// 9018-9019, 9038, 9054, 9073, 9084) — verify against the original source.
8994static void preparePlanForMainVectorLoop(VPlan &MainPlan, VPlan &EpiPlan) {
8995 // Collect PHI nodes of widened phis in the VPlan for the epilogue. Those
8996 // will need their resume-values computed in the main vector loop. Others
8997 // can be removed from the main VPlan.
8998 SmallPtrSet<PHINode *, 2> EpiWidenedPhis;
8999 for (VPRecipeBase &R :
9002 continue;
9003 EpiWidenedPhis.insert(
9004 cast<PHINode>(R.getVPSingleValue()->getUnderlyingValue()));
9005 }
9006 for (VPRecipeBase &R :
9007 make_early_inc_range(MainPlan.getScalarHeader()->phis())) {
9008 auto *VPIRInst = cast<VPIRPhi>(&R);
9009 if (EpiWidenedPhis.contains(&VPIRInst->getIRPhi()))
9010 continue;
9011 // There is no corresponding wide induction in the epilogue plan that would
9012 // need a resume value. Remove the VPIRInst wrapping the scalar header phi
9013 // together with the corresponding ResumePhi. The resume values for the
9014 // scalar loop will be created during execution of EpiPlan.
9015 VPRecipeBase *ResumePhi = VPIRInst->getOperand(0)->getDefiningRecipe();
9016 VPIRInst->eraseFromParent();
9017 ResumePhi->eraseFromParent();
9018 }
9020
9021 using namespace VPlanPatternMatch;
9022 // When vectorizing the epilogue, FindFirstIV & FindLastIV reductions can
9023 // introduce multiple uses of undef/poison. If the reduction start value may
9024 // be undef or poison it needs to be frozen and the frozen start has to be
9025 // used when computing the reduction result. We also need to use the frozen
9026 // value in the resume phi generated by the main vector loop, as this is also
9027 // used to compute the reduction result after the epilogue vector loop.
9028 auto AddFreezeForFindLastIVReductions = [](VPlan &Plan,
9029 bool UpdateResumePhis) {
9030 VPBuilder Builder(Plan.getEntry());
9031 for (VPRecipeBase &R : *Plan.getMiddleBlock()) {
9032 auto *VPI = dyn_cast<VPInstruction>(&R);
9033 if (!VPI)
9034 continue;
9035 VPValue *OrigStart;
9036 if (!matchFindIVResult(VPI, m_VPValue(), m_VPValue(OrigStart)))
9037 continue;
9039 continue;
9040 VPInstruction *Freeze =
9041 Builder.createNaryOp(Instruction::Freeze, {OrigStart}, {}, "fr");
9042 VPI->setOperand(2, Freeze);
9043 if (UpdateResumePhis)
9044 OrigStart->replaceUsesWithIf(Freeze, [Freeze](VPUser &U, unsigned) {
9045 return Freeze != &U && isa<VPPhi>(&U);
9046 });
9047 }
9048 };
9049 AddFreezeForFindLastIVReductions(MainPlan, true);
9050 AddFreezeForFindLastIVReductions(EpiPlan, false);
9051
// Extract the vector trip count from the main loop's exiting branch.
9052 VPValue *VectorTC = nullptr;
9053 auto *Term =
9055 [[maybe_unused]] bool MatchedTC =
9056 match(Term, m_BranchOnCount(m_VPValue(), m_VPValue(VectorTC)));
9057 assert(MatchedTC && "must match vector trip count");
9058
9059 // If there is a suitable resume value for the canonical induction in the
9060 // scalar (which will become vector) epilogue loop, use it and move it to the
9061 // beginning of the scalar preheader. Otherwise create it below.
9062 VPBasicBlock *MainScalarPH = MainPlan.getScalarPreheader();
9063 auto ResumePhiIter =
9064 find_if(MainScalarPH->phis(), [VectorTC](VPRecipeBase &R) {
9065 return match(&R, m_VPInstruction<Instruction::PHI>(m_Specific(VectorTC),
9066 m_ZeroInt()));
9067 });
9068 VPPhi *ResumePhi = nullptr;
9069 if (ResumePhiIter == MainScalarPH->phis().end()) {
9070 VPBuilder ScalarPHBuilder(MainScalarPH, MainScalarPH->begin());
9071 ResumePhi = ScalarPHBuilder.createScalarPhi(
9072 {VectorTC,
9074 {}, "vec.epilog.resume.val");
9075 } else {
9076 ResumePhi = cast<VPPhi>(&*ResumePhiIter);
9077 if (MainScalarPH->begin() == MainScalarPH->end())
9078 ResumePhi->moveBefore(*MainScalarPH, MainScalarPH->end());
9079 else if (&*MainScalarPH->begin() != ResumePhi)
9080 ResumePhi->moveBefore(*MainScalarPH, MainScalarPH->begin());
9081 }
9082 // Add a user to make sure the resume phi won't get removed.
9083 VPBuilder(MainScalarPH)
9085}
9086
9087/// Prepare \p Plan for vectorizing the epilogue loop. That is, re-use expanded
9088/// SCEVs from \p ExpandedSCEVs and set resume values for header recipes. Some
9089/// reductions require creating new instructions to compute the resume values.
9090/// They are collected in a vector and returned. They must be moved to the
9091/// preheader of the vector epilogue loop, after being created by the execution
9092/// of \p Plan.
// NOTE(review): this listing elides a number of lines (e.g. ll. 9093, 9095,
// 9146, 9161-9162, 9196, 9215, 9227-9228, 9260, 9284, 9286) — verify against
// the original source before editing.
9094 VPlan &Plan, Loop *L, const SCEV2ValueTy &ExpandedSCEVs,
9096 ScalarEvolution &SE) {
9097 VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
9098 VPBasicBlock *Header = VectorLoop->getEntryBasicBlock();
9099 Header->setName("vec.epilog.vector.body");
9100
9101 VPCanonicalIVPHIRecipe *IV = VectorLoop->getCanonicalIV();
9102 // When vectorizing the epilogue loop, the canonical induction needs to be
9103 // adjusted by the value after the main vector loop. Find the resume value
9104 // created during execution of the main VPlan. It must be the first phi in the
9105 // loop preheader. Use the value to increment the canonical IV, and update all
9106 // users in the loop region to use the adjusted value.
9107 // FIXME: Improve modeling for canonical IV start values in the epilogue
9108 // loop.
9109 using namespace llvm::PatternMatch;
9110 PHINode *EPResumeVal = &*L->getLoopPreheader()->phis().begin();
9111 for (Value *Inc : EPResumeVal->incoming_values()) {
9112 if (match(Inc, m_SpecificInt(0)))
9113 continue;
9114 assert(!EPI.VectorTripCount &&
9115 "Must only have a single non-zero incoming value");
9116 EPI.VectorTripCount = Inc;
9117 }
9118 // If we didn't find a non-zero vector trip count, all incoming values
9119 // must be zero, which also means the vector trip count is zero. Pick the
9120 // first zero as vector trip count.
9121 // TODO: We should not choose VF * UF so the main vector loop is known to
9122 // be dead.
9123 if (!EPI.VectorTripCount) {
9124 assert(EPResumeVal->getNumIncomingValues() > 0 &&
9125 all_of(EPResumeVal->incoming_values(),
9126 [](Value *Inc) { return match(Inc, m_SpecificInt(0)); }) &&
9127 "all incoming values must be 0");
9128 EPI.VectorTripCount = EPResumeVal->getOperand(0);
9129 }
9130 VPValue *VPV = Plan.getOrAddLiveIn(EPResumeVal);
9131 assert(all_of(IV->users(),
9132 [](const VPUser *U) {
9133 return isa<VPScalarIVStepsRecipe>(U) ||
9134 isa<VPDerivedIVRecipe>(U) ||
9135 cast<VPRecipeBase>(U)->isScalarCast() ||
9136 cast<VPInstruction>(U)->getOpcode() ==
9137 Instruction::Add;
9138 }) &&
9139 "the canonical IV should only be used by its increment or "
9140 "ScalarIVSteps when resetting the start value");
// Offset the canonical IV by the main loop's resume value and redirect all
// in-loop users to the adjusted value.
9141 VPBuilder Builder(Header, Header->getFirstNonPhi());
9142 VPInstruction *Add = Builder.createAdd(IV, VPV);
9143 IV->replaceAllUsesWith(Add);
9144 Add->setOperand(0, IV);
9145
9147 SmallVector<Instruction *> InstsToMove;
9148 // Ensure that the start values for all header phi recipes are updated before
9149 // vectorizing the epilogue loop. Skip the canonical IV, which has been
9150 // handled above.
9151 for (VPRecipeBase &R : drop_begin(Header->phis())) {
9152 Value *ResumeV = nullptr;
9153 // TODO: Move setting of resume values to prepareToExecute.
9154 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) {
9155 // Find the reduction result by searching users of the phi or its backedge
9156 // value.
9157 auto IsReductionResult = [](VPRecipeBase *R) {
9158 auto *VPI = dyn_cast<VPInstruction>(R);
9159 if (!VPI)
9160 return false;
9163 };
9164 auto *RdxResult = cast<VPInstruction>(
9165 vputils::findRecipe(ReductionPhi->getBackedgeValue(), IsReductionResult));
9166 assert(RdxResult && "expected to find reduction result");
9167
9168 ResumeV = cast<PHINode>(ReductionPhi->getUnderlyingInstr())
9169 ->getIncomingValueForBlock(L->getLoopPreheader());
9170
9171 // Check for FindIV pattern by looking for icmp user of RdxResult.
9172 // The pattern is: select(icmp ne RdxResult, Sentinel), RdxResult, Start
9173 using namespace VPlanPatternMatch;
9174 VPValue *SentinelVPV = nullptr;
9175 bool IsFindIV = any_of(RdxResult->users(), [&](VPUser *U) {
9176 return match(U, VPlanPatternMatch::m_SpecificICmp(
9177 ICmpInst::ICMP_NE, m_Specific(RdxResult),
9178 m_VPValue(SentinelVPV)));
9179 });
9180
9181 if (RdxResult->getOpcode() == VPInstruction::ComputeAnyOfResult) {
9182 Value *StartV = RdxResult->getOperand(0)->getLiveInIRValue();
9183 // VPReductionPHIRecipes for AnyOf reductions expect a boolean as
9184 // start value; compare the final value from the main vector loop
9185 // to the start value.
9186 BasicBlock *PBB = cast<Instruction>(ResumeV)->getParent();
9187 IRBuilder<> Builder(PBB, PBB->getFirstNonPHIIt());
9188 ResumeV = Builder.CreateICmpNE(ResumeV, StartV);
9189 if (auto *I = dyn_cast<Instruction>(ResumeV))
9190 InstsToMove.push_back(I);
9191 } else if (IsFindIV) {
9192 assert(SentinelVPV && "expected to find icmp using RdxResult");
9193
9194 // Get the frozen start value from the main loop.
9195 Value *FrozenStartV = cast<PHINode>(ResumeV)->getIncomingValueForBlock(
9197 if (auto *FreezeI = dyn_cast<FreezeInst>(FrozenStartV))
9198 ToFrozen[FreezeI->getOperand(0)] = FrozenStartV;
9199
9200 // Adjust resume: select(icmp eq ResumeV, FrozenStartV), Sentinel,
9201 // ResumeV
9202 BasicBlock *ResumeBB = cast<Instruction>(ResumeV)->getParent();
9203 IRBuilder<> Builder(ResumeBB, ResumeBB->getFirstNonPHIIt());
9204 Value *Cmp = Builder.CreateICmpEQ(ResumeV, FrozenStartV);
9205 if (auto *I = dyn_cast<Instruction>(Cmp))
9206 InstsToMove.push_back(I);
9207 ResumeV =
9208 Builder.CreateSelect(Cmp, SentinelVPV->getLiveInIRValue(), ResumeV);
9209 if (auto *I = dyn_cast<Instruction>(ResumeV))
9210 InstsToMove.push_back(I);
9211 } else {
9212 VPValue *StartVal = Plan.getOrAddLiveIn(ResumeV);
9213 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
9214 if (auto *VPI = dyn_cast<VPInstruction>(PhiR->getStartValue())) {
9216 "unexpected start value");
9217 // Partial sub-reductions always start at 0 and account for the
9218 // reduction start value in a final subtraction. Update it to use the
9219 // resume value from the main vector loop.
9220 if (PhiR->getVFScaleFactor() > 1 &&
9221 PhiR->getRecurrenceKind() == RecurKind::Sub) {
9222 auto *Sub = cast<VPInstruction>(RdxResult->getSingleUser());
9223 assert(Sub->getOpcode() == Instruction::Sub && "Unexpected opcode");
9224 assert(isa<VPIRValue>(Sub->getOperand(0)) &&
9225 "Expected operand to match the original start value of the "
9226 "reduction");
9229 "Expected start value for partial sub-reduction to start at "
9230 "zero");
9231 Sub->setOperand(0, StartVal);
9232 } else
9233 VPI->setOperand(0, StartVal);
9234 continue;
9235 }
9236 }
9237 } else {
9238 // Retrieve the induction resume values for wide inductions from
9239 // their original phi nodes in the scalar loop.
9240 PHINode *IndPhi = cast<VPWidenInductionRecipe>(&R)->getPHINode();
9241 // Hook up to the PHINode generated by a ResumePhi recipe of main
9242 // loop VPlan, which feeds the scalar loop.
9243 ResumeV = IndPhi->getIncomingValueForBlock(L->getLoopPreheader());
9244 }
9245 assert(ResumeV && "Must have a resume value");
9246 VPValue *StartVal = Plan.getOrAddLiveIn(ResumeV);
9247 cast<VPHeaderPHIRecipe>(&R)->setStartValue(StartVal);
9248 }
9249
9250 // For some VPValues in the epilogue plan we must re-use the generated IR
9251 // values from the main plan. Replace them with live-in VPValues.
9252 // TODO: This is a workaround needed for epilogue vectorization and it
9253 // should be removed once induction resume value creation is done
9254 // directly in VPlan.
9255 for (auto &R : make_early_inc_range(*Plan.getEntry())) {
9256 // Re-use frozen values from the main plan for Freeze VPInstructions in the
9257 // epilogue plan. This ensures all users use the same frozen value.
9258 auto *VPI = dyn_cast<VPInstruction>(&R);
9259 if (VPI && VPI->getOpcode() == Instruction::Freeze) {
9261 ToFrozen.lookup(VPI->getOperand(0)->getLiveInIRValue())));
9262 continue;
9263 }
9264
9265 // Re-use the trip count and steps expanded for the main loop, as
9266 // skeleton creation needs it as a value that dominates both the scalar
9267 // and vector epilogue loops
9268 auto *ExpandR = dyn_cast<VPExpandSCEVRecipe>(&R);
9269 if (!ExpandR)
9270 continue;
9271 VPValue *ExpandedVal =
9272 Plan.getOrAddLiveIn(ExpandedSCEVs.lookup(ExpandR->getSCEV()));
9273 ExpandR->replaceAllUsesWith(ExpandedVal);
9274 if (Plan.getTripCount() == ExpandR)
9275 Plan.resetTripCount(ExpandedVal);
9276 ExpandR->eraseFromParent();
9277 }
9278
// Estimate per-iteration element counts for both loops and forward them,
// together with the trip counts, to the transform invoked below (its name
// line, l. 9284, is elided in this listing).
9279 auto VScale = CM.getVScaleForTuning();
9280 unsigned MainLoopStep =
9281 estimateElementCount(EPI.MainLoopVF * EPI.MainLoopUF, VScale);
9282 unsigned EpilogueLoopStep =
9283 estimateElementCount(EPI.EpilogueVF * EPI.EpilogueUF, VScale);
9285 Plan, EPI.TripCount, EPI.VectorTripCount,
9287 EPI.EpilogueUF, MainLoopStep, EpilogueLoopStep, SE);
9288
9289 return InstsToMove;
9290}
9291
9292// Generate bypass values from the additional bypass block. Note that when the
9293// vectorized epilogue is skipped due to iteration count check, then the
9294// resume value for the induction variable comes from the trip count of the
9295// main vector loop, passed as the second argument.
// NOTE(review): the signature's first line (l. 9296) and two interior lines
// (ll. 9307, 9309) are elided in this listing.
9297 PHINode *OrigPhi, const InductionDescriptor &II, IRBuilder<> &BypassBuilder,
9298 const SCEV2ValueTy &ExpandedSCEVs, Value *MainVectorTripCount,
9299 Instruction *OldInduction) {
9300 Value *Step = getExpandedStep(II, ExpandedSCEVs);
9301 // For the primary induction the additional bypass end value is known.
9302 // Otherwise it is computed.
9303 Value *EndValueFromAdditionalBypass = MainVectorTripCount;
9304 if (OrigPhi != OldInduction) {
9305 auto *BinOp = II.getInductionBinOp();
9306 // Fast-math-flags propagate from the original induction instruction.
9308 BypassBuilder.setFastMathFlags(BinOp->getFastMathFlags());
9310 // Compute the end value for the additional bypass.
9311 EndValueFromAdditionalBypass =
9312 emitTransformedIndex(BypassBuilder, MainVectorTripCount,
9313 II.getStartValue(), Step, II.getKind(), BinOp);
9314 EndValueFromAdditionalBypass->setName("ind.end");
9315 }
9316 return EndValueFromAdditionalBypass;
9317}
9318
// NOTE(review): the function's name line (l. 9319) and some parameters
// (l. 9321) are elided in this listing; from the body it fixes up reduction
// and induction resume values coming from the additional bypass block.
9320 VPlan &BestEpiPlan,
9322 const SCEV2ValueTy &ExpandedSCEVs,
9323 Value *MainVectorTripCount) {
9324 // Fix reduction resume values from the additional bypass block.
9325 BasicBlock *PH = L->getLoopPreheader();
// Give every preheader phi an incoming value for predecessors it does not
// know about yet, copying the value that arrives from the bypass block.
9326 for (auto *Pred : predecessors(PH)) {
9327 for (PHINode &Phi : PH->phis()) {
9328 if (Phi.getBasicBlockIndex(Pred) != -1)
9329 continue;
9330 Phi.addIncoming(Phi.getIncomingValueForBlock(BypassBlock), Pred);
9331 }
9332 }
9333 auto *ScalarPH = cast<VPIRBasicBlock>(BestEpiPlan.getScalarPreheader());
9334 if (ScalarPH->hasPredecessors()) {
9335 // If ScalarPH has predecessors, we may need to update its reduction
9336 // resume values.
// NOTE(review): the call performing the per-phi update (l. 9339) is elided.
9337 for (const auto &[R, IRPhi] :
9338 zip(ScalarPH->phis(), ScalarPH->getIRBasicBlock()->phis())) {
9340 BypassBlock);
9341 }
9342 }
9343
9344 // Fix induction resume values from the additional bypass block.
9345 IRBuilder<> BypassBuilder(BypassBlock, BypassBlock->getFirstInsertionPt());
9346 for (const auto &[IVPhi, II] : LVL.getInductionVars()) {
9347 auto *Inc = cast<PHINode>(IVPhi->getIncomingValueForBlock(PH));
// NOTE(review): the declaration of 'V' (l. 9348), presumably a call to
// createInductionAdditionalBypassValues, is elided in this listing.
9349 IVPhi, II, BypassBuilder, ExpandedSCEVs, MainVectorTripCount,
9350 LVL.getPrimaryInduction());
9351 // TODO: Directly add as extra operand to the VPResumePHI recipe.
9352 Inc->setIncomingValueForBlock(BypassBlock, V);
9353 }
9354}
9355
9356/// Connect the epilogue vector loop generated for \p EpiPlan to the main vector
9357/// loop, after both plans have executed, updating branches from the iteration
9358/// and runtime checks of the main loop, as well as updating various phis. \p
9359/// InstsToMove contains instructions that need to be moved to the preheader of
9360/// the epilogue vector loop.
// NOTE(review): this extraction drops several original lines — 9361 (the
// signature of `connectEpilogueVectorLoop`, shown in the file's index),
// 9363, 9374, 9377, 9380, 9382, 9387, 9390 and 9392 (parts of an assert,
// terminator rewrites and DTU update lists) — confirm against the original
// file before editing.
9362    VPlan &EpiPlan, Loop *L, EpilogueLoopVectorizationInfo &EPI,
9364    DenseMap<const SCEV *, Value *> &ExpandedSCEVs, GeneratedRTChecks &Checks,
9365    ArrayRef<Instruction *> InstsToMove) {
  // The IR block wrapping the epilogue plan's entry is the epilogue's
  // iteration-count check; its second successor is the epilogue preheader.
9366  BasicBlock *VecEpilogueIterationCountCheck =
9367      cast<VPIRBasicBlock>(EpiPlan.getEntry())->getIRBasicBlock();
9368
9369  BasicBlock *VecEpiloguePreHeader =
9370      cast<BranchInst>(VecEpilogueIterationCountCheck->getTerminator())
9371          ->getSuccessor(1);
9372  // Adjust the control flow taking the state info from the main loop
9373  // vectorization into account.
9375         "expected this to be saved from the previous pass.");
  // Eagerly keep the dominator tree in sync while edges are rewritten below.
9376  DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
9378      VecEpilogueIterationCountCheck, VecEpiloguePreHeader);
9379
9381                     VecEpilogueIterationCountCheck},
9383                     VecEpiloguePreHeader}});
9384
9385  BasicBlock *ScalarPH =
9386      cast<VPIRBasicBlock>(EpiPlan.getScalarPreheader())->getIRBasicBlock();
9388      VecEpilogueIterationCountCheck, ScalarPH);
9389  DTU.applyUpdates(
9391        VecEpilogueIterationCountCheck},
9393
9394  // Adjust the terminators of runtime check blocks and phis using them.
9395  BasicBlock *SCEVCheckBlock = Checks.getSCEVChecks().second;
9396  BasicBlock *MemCheckBlock = Checks.getMemRuntimeChecks().second;
9397  if (SCEVCheckBlock) {
    // Retarget the SCEV check's branch past the epilogue iteration-count
    // check, straight to the scalar preheader.
9398    SCEVCheckBlock->getTerminator()->replaceUsesOfWith(
9399        VecEpilogueIterationCountCheck, ScalarPH);
9400    DTU.applyUpdates({{DominatorTree::Delete, SCEVCheckBlock,
9401                       VecEpilogueIterationCountCheck},
9402                      {DominatorTree::Insert, SCEVCheckBlock, ScalarPH}});
9403  }
9404  if (MemCheckBlock) {
    // Same retargeting for the memory runtime-check block.
9405    MemCheckBlock->getTerminator()->replaceUsesOfWith(
9406        VecEpilogueIterationCountCheck, ScalarPH);
9407    DTU.applyUpdates(
9408        {{DominatorTree::Delete, MemCheckBlock, VecEpilogueIterationCountCheck},
9409         {DominatorTree::Insert, MemCheckBlock, ScalarPH}});
9410  }
9411
9412  // The vec.epilog.iter.check block may contain Phi nodes from inductions
9413  // or reductions which merge control-flow from the latch block and the
9414  // middle block. Update the incoming values here and move the Phi into the
9415  // preheader.
  // Snapshot the phis first: moving them while iterating would invalidate
  // the block's phi range.
9416  SmallVector<PHINode *, 4> PhisInBlock(
9417      llvm::make_pointer_range(VecEpilogueIterationCountCheck->phis()));
9418
9419  for (PHINode *Phi : PhisInBlock) {
9420    Phi->moveBefore(VecEpiloguePreHeader->getFirstNonPHIIt());
9421    Phi->replaceIncomingBlockWith(
9422        VecEpilogueIterationCountCheck->getSinglePredecessor(),
9423        VecEpilogueIterationCountCheck);
9424
9425    // If the phi doesn't have an incoming value from the
9426    // EpilogueIterationCountCheck, we are done. Otherwise remove the
9427    // incoming value and also those from other check blocks. This is needed
9428    // for reduction phis only.
9429    if (none_of(Phi->blocks(), [&](BasicBlock *IncB) {
9430          return EPI.EpilogueIterationCountCheck == IncB;
9431        }))
9432      continue;
9433    Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck);
9434    if (SCEVCheckBlock)
9435      Phi->removeIncomingValue(SCEVCheckBlock);
9436    if (MemCheckBlock)
9437      Phi->removeIncomingValue(MemCheckBlock);
9438  }
9439
  // Relocate the caller-provided instructions into the epilogue preheader.
9440  auto IP = VecEpiloguePreHeader->getFirstNonPHIIt();
9441  for (auto *I : InstsToMove)
9442    I->moveBefore(IP);
9443
9444  // VecEpilogueIterationCountCheck conditionally skips over the epilogue loop
9445  // after executing the main loop. We need to update the resume values of
9446  // inductions and reductions during epilogue vectorization.
9447  fixScalarResumeValuesFromBypass(VecEpilogueIterationCountCheck, L, EpiPlan,
9448                                  LVL, ExpandedSCEVs, EPI.VectorTripCount);
9449}
9450
// Per-loop driver of the vectorizer: checks hints and legality, runs the
// cost model / planner, emits diagnostics, and finally generates code for a
// single loop (optionally with a vectorized epilogue). Returns true if the
// IR was changed.
// NOTE(review): the extraction is missing original line 9451 (the function
// signature — presumably `bool LoopVectorizePass::processLoop(Loop *L)`
// given the bool returns and uses of member analyses like ORE/TTI/SE) and
// many interior lines wherever the embedded numbering jumps (e.g. 9464-9468,
// 9495, 9502, 9509, 9545, 9550, 9561, 9572, 9582-9583, 9590, 9604, 9648,
// 9652, 9679, 9687, 9711, 9826, 9834, 9857, 9869, 9872) — confirm against
// the original LoopVectorize.cpp before editing.
9452  assert((EnableVPlanNativePath || L->isInnermost()) &&
9453         "VPlan-native path is not enabled. Only process inner loops.");
9454
9455  LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '"
9456                    << L->getHeader()->getParent()->getName() << "' from "
9457                    << L->getLocStr() << "\n");
9458
9459  LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI);
9460
9461  LLVM_DEBUG(
9462      dbgs() << "LV: Loop hints:"
9463             << " force="
9465                     ? "disabled"
9467                         ? "enabled"
9468                         : "?"))
9469             << " width=" << Hints.getWidth()
9470             << " interleave=" << Hints.getInterleave() << "\n");
9471
9472  // Function containing loop
9473  Function *F = L->getHeader()->getParent();
9474
9475  // Looking at the diagnostic output is the only way to determine if a loop
9476  // was vectorized (other than looking at the IR or machine code), so it
9477  // is important to generate an optimization remark for each loop. Most of
9478  // these messages are generated as OptimizationRemarkAnalysis. Remarks
9479  // generated as OptimizationRemark and OptimizationRemarkMissed are
9480  // less verbose reporting vectorized loops and unvectorized loops that may
9481  // benefit from vectorization, respectively.
9482
9483  if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
9484    LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
9485    return false;
9486  }
9487
9488  PredicatedScalarEvolution PSE(*SE, *L);
9489
9490  // Query this against the original loop and save it here because the profile
9491  // of the original loop header may change as the transformation happens.
9492  bool OptForSize = llvm::shouldOptimizeForSize(
9493      L->getHeader(), PSI,
9494      PSI && PSI->hasProfileSummary() ? &GetBFI() : nullptr,
9496
9497  // Check if it is legal to vectorize the loop.
9498  LoopVectorizationRequirements Requirements;
9499  LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, F, *LAIs, LI, ORE,
9500                                &Requirements, &Hints, DB, AC,
9501                                /*AllowRuntimeSCEVChecks=*/!OptForSize, AA);
9503    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
9504    Hints.emitRemarkWithHints();
9505    return false;
9506  }
9507
  // Loops with an uncountable early exit are only handled when the feature
  // is enabled (the guarding condition on the elided line 9509).
9508  if (LVL.hasUncountableEarlyExit()) {
9510      reportVectorizationFailure("Auto-vectorization of loops with uncountable "
9511                                 "early exit is not enabled",
9512                                 "UncountableEarlyExitLoopsDisabled", ORE, L);
9513      return false;
9514    }
9515  }
9516
9517  if (!LVL.getPotentiallyFaultingLoads().empty()) {
9518    reportVectorizationFailure("Auto-vectorization of loops with potentially "
9519                               "faulting load is not supported",
9520                               "PotentiallyFaultingLoadsNotSupported", ORE, L);
9521    return false;
9522  }
9523
9524  // Entrance to the VPlan-native vectorization path. Outer loops are processed
9525  // here. They may require CFG and instruction level transformations before
9526  // even evaluating whether vectorization is profitable. Since we cannot modify
9527  // the incoming IR, we need to build VPlan upfront in the vectorization
9528  // pipeline.
9529  if (!L->isInnermost())
9530    return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
9531                                        ORE, GetBFI, OptForSize, Hints,
9532                                        Requirements);
9533
9534  assert(L->isInnermost() && "Inner loop expected.");
9535
9536  InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
9537  bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
9538
9539  // If an override option has been passed in for interleaved accesses, use it.
9540  if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
9541    UseInterleaved = EnableInterleavedMemAccesses;
9542
9543  // Analyze interleaved memory accesses.
9544  if (UseInterleaved)
9546
9547  if (LVL.hasUncountableEarlyExit()) {
9548    BasicBlock *LoopLatch = L->getLoopLatch();
9549    if (IAI.requiresScalarEpilogue() ||
9551                 [LoopLatch](BasicBlock *BB) { return BB != LoopLatch; })) {
9552      reportVectorizationFailure("Auto-vectorization of early exit loops "
9553                                 "requiring a scalar epilogue is unsupported",
9554                                 "UncountableEarlyExitUnsupported", ORE, L);
9555      return false;
9556    }
9557  }
9558
9559  // Check the function attributes and profiles to find out if this function
9560  // should be optimized for size.
9562      getScalarEpilogueLowering(F, L, Hints, OptForSize, TTI, TLI, LVL, &IAI);
9563
9564  // Check the loop for a trip count threshold: vectorize loops with a tiny trip
9565  // count by optimizing for size, to minimize overheads.
9566  auto ExpectedTC = getSmallBestKnownTC(PSE, L);
9567  if (ExpectedTC && ExpectedTC->isFixed() &&
9568      ExpectedTC->getFixedValue() < TinyTripCountVectorThreshold) {
9569    LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
9570                      << "This loop is worth vectorizing only if no scalar "
9571                      << "iteration overheads are incurred.");
9573      LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
9574    else {
9575      LLVM_DEBUG(dbgs() << "\n");
9576      // Predicate tail-folded loops are efficient even when the loop
9577      // iteration count is low. However, setting the epilogue policy to
9578      // `CM_ScalarEpilogueNotAllowedLowTripLoop` prevents vectorizing loops
9579      // with runtime checks. It's more effective to let
9580      // `isOutsideLoopWorkProfitable` determine if vectorization is
9581      // beneficial for the loop.
9584    }
9585  }
9586
9587  // Check the function attributes to see if implicit floats or vectors are
9588  // allowed.
9589  if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
9591        "Can't vectorize when the NoImplicitFloat attribute is used",
9592        "loop not vectorized due to NoImplicitFloat attribute",
9593        "NoImplicitFloat", ORE, L);
9594    Hints.emitRemarkWithHints();
9595    return false;
9596  }
9597
9598  // Check if the target supports potentially unsafe FP vectorization.
9599  // FIXME: Add a check for the type of safety issue (denormal, signaling)
9600  // for the target we're vectorizing for, to make sure none of the
9601  // additional fp-math flags can help.
9602  if (Hints.isPotentiallyUnsafe() &&
9603      TTI->isFPVectorizationPotentiallyUnsafe()) {
9605        "Potentially unsafe FP op prevents vectorization",
9606        "loop not vectorized due to unsafe FP support.",
9607        "UnsafeFP", ORE, L);
9608    Hints.emitRemarkWithHints();
9609    return false;
9610  }
9611
9612  bool AllowOrderedReductions;
9613  // If the flag is set, use that instead and override the TTI behaviour.
9614  if (ForceOrderedReductions.getNumOccurrences() > 0)
9615    AllowOrderedReductions = ForceOrderedReductions;
9616  else
9617    AllowOrderedReductions = TTI->enableOrderedReductions();
9618  if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) {
9619    ORE->emit([&]() {
9620      auto *ExactFPMathInst = Requirements.getExactFPInst();
9621      return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
9622                                                 ExactFPMathInst->getDebugLoc(),
9623                                                 ExactFPMathInst->getParent())
9624             << "loop not vectorized: cannot prove it is safe to reorder "
9625                "floating-point operations";
9626    });
9627    LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to "
9628                         "reorder floating-point operations\n");
9629    Hints.emitRemarkWithHints();
9630    return false;
9631  }
9632
9633  // Use the cost model.
9634  LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
9635                                GetBFI, F, &Hints, IAI, OptForSize);
9636  // Use the planner for vectorization.
9637  LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, &LVL, CM, IAI, PSE, Hints,
9638                               ORE);
9639
9640  // Get user vectorization factor and interleave count.
9641  ElementCount UserVF = Hints.getWidth();
9642  unsigned UserIC = Hints.getInterleave();
  // A user-requested interleave count > 1 is only honored when the loop is
  // safe for any vector width.
9643  if (UserIC > 1 && !LVL.isSafeForAnyVectorWidth())
9644    UserIC = 1;
9645
9646  // Plan how to best vectorize.
9647  LVP.plan(UserVF, UserIC);
9649  unsigned IC = 1;
9650
9651  if (ORE->allowExtraAnalysis(LV_NAME))
9653
9654  GeneratedRTChecks Checks(PSE, DT, LI, TTI, CM.CostKind);
9655  if (LVP.hasPlanWithVF(VF.Width)) {
9656    // Select the interleave count.
9657    IC = LVP.selectInterleaveCount(LVP.getPlanFor(VF.Width), VF.Width, VF.Cost);
9658
9659    unsigned SelectedIC = std::max(IC, UserIC);
9660    // Optimistically generate runtime checks if they are needed. Drop them if
9661    // they turn out to not be profitable.
9662    if (VF.Width.isVector() || SelectedIC > 1) {
9663      Checks.create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC,
9664                    *ORE);
9665
9666      // Bail out early if either the SCEV or memory runtime checks are known to
9667      // fail. In that case, the vector loop would never execute.
9668      using namespace llvm::PatternMatch;
9669      if (Checks.getSCEVChecks().first &&
9670          match(Checks.getSCEVChecks().first, m_One()))
9671        return false;
9672      if (Checks.getMemRuntimeChecks().first &&
9673          match(Checks.getMemRuntimeChecks().first, m_One()))
9674        return false;
9675    }
9676
9677    // Check if it is profitable to vectorize with runtime checks.
9678    bool ForceVectorization =
9680    VPCostContext CostCtx(CM.TTI, *CM.TLI, LVP.getPlanFor(VF.Width), CM,
9681                          CM.CostKind, CM.PSE, L);
9682    if (!ForceVectorization &&
9683        !isOutsideLoopWorkProfitable(Checks, VF, L, PSE, CostCtx,
9684                                     LVP.getPlanFor(VF.Width), SEL,
9685                                     CM.getVScaleForTuning())) {
9686      ORE->emit([&]() {
9688            DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),
9689            L->getHeader())
9690               << "loop not vectorized: cannot prove it is safe to reorder "
9691                  "memory operations";
9692      });
9693      LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
9694      Hints.emitRemarkWithHints();
9695      return false;
9696    }
9697  }
9698
9699  // Identify the diagnostic messages that should be produced.
9700  std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
9701  bool VectorizeLoop = true, InterleaveLoop = true;
9702  if (VF.Width.isScalar()) {
9703    LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
9704    VecDiagMsg = {
9705        "VectorizationNotBeneficial",
9706        "the cost-model indicates that vectorization is not beneficial"};
9707    VectorizeLoop = false;
9708  }
9709
9710  if (UserIC == 1 && Hints.getInterleave() > 1) {
9712           "UserIC should only be ignored due to unsafe dependencies");
9713    LLVM_DEBUG(dbgs() << "LV: Ignoring user-specified interleave count.\n");
9714    IntDiagMsg = {"InterleavingUnsafe",
9715                  "Ignoring user-specified interleave count due to possibly "
9716                  "unsafe dependencies in the loop."};
9717    InterleaveLoop = false;
9718  } else if (!LVP.hasPlanWithVF(VF.Width) && UserIC > 1) {
9719    // Tell the user interleaving was avoided up-front, despite being explicitly
9720    // requested.
9721    LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
9722                         "interleaving should be avoided up front\n");
9723    IntDiagMsg = {"InterleavingAvoided",
9724                  "Ignoring UserIC, because interleaving was avoided up front"};
9725    InterleaveLoop = false;
9726  } else if (IC == 1 && UserIC <= 1) {
9727    // Tell the user interleaving is not beneficial.
9728    LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
9729    IntDiagMsg = {
9730        "InterleavingNotBeneficial",
9731        "the cost-model indicates that interleaving is not beneficial"};
9732    InterleaveLoop = false;
9733    if (UserIC == 1) {
9734      IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
9735      IntDiagMsg.second +=
9736          " and is explicitly disabled or interleave count is set to 1";
9737    }
9738  } else if (IC > 1 && UserIC == 1) {
9739    // Tell the user interleaving is beneficial, but it is explicitly disabled.
9740    LLVM_DEBUG(dbgs() << "LV: Interleaving is beneficial but is explicitly "
9741                         "disabled.\n");
9742    IntDiagMsg = {"InterleavingBeneficialButDisabled",
9743                  "the cost-model indicates that interleaving is beneficial "
9744                  "but is explicitly disabled or interleave count is set to 1"};
9745    InterleaveLoop = false;
9746  }
9747
9748  // If there is a histogram in the loop, do not just interleave without
9749  // vectorizing. The order of operations will be incorrect without the
9750  // histogram intrinsics, which are only used for recipes with VF > 1.
9751  if (!VectorizeLoop && InterleaveLoop && LVL.hasHistograms()) {
9752    LLVM_DEBUG(dbgs() << "LV: Not interleaving without vectorization due "
9753                      << "to histogram operations.\n");
9754    IntDiagMsg = {
9755        "HistogramPreventsScalarInterleaving",
9756        "Unable to interleave without vectorization due to constraints on "
9757        "the order of histogram operations"};
9758    InterleaveLoop = false;
9759  }
9760
9761  // Override IC if user provided an interleave count.
9762  IC = UserIC > 0 ? UserIC : IC;
9763
9764  // FIXME: Enable interleaving for FindLast reductions.
9765  if (InterleaveLoop && hasFindLastReductionPhi(LVP.getPlanFor(VF.Width))) {
9766    LLVM_DEBUG(dbgs() << "LV: Not interleaving due to FindLast reduction.\n");
9767    IntDiagMsg = {"FindLastPreventsScalarInterleaving",
9768                  "Unable to interleave due to FindLast reduction."};
9769    InterleaveLoop = false;
9770    IC = 1;
9771  }
9772
9773  // Emit diagnostic messages, if any.
9774  const char *VAPassName = Hints.vectorizeAnalysisPassName();
9775  if (!VectorizeLoop && !InterleaveLoop) {
9776    // Do not vectorize or interleave the loop.
9777    ORE->emit([&]() {
9778      return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
9779                                      L->getStartLoc(), L->getHeader())
9780             << VecDiagMsg.second;
9781    });
9782    ORE->emit([&]() {
9783      return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
9784                                      L->getStartLoc(), L->getHeader())
9785             << IntDiagMsg.second;
9786    });
9787    return false;
9788  }
9789
9790  if (!VectorizeLoop && InterleaveLoop) {
9791    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
9792    ORE->emit([&]() {
9793      return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
9794                                        L->getStartLoc(), L->getHeader())
9795             << VecDiagMsg.second;
9796    });
9797  } else if (VectorizeLoop && !InterleaveLoop) {
9798    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
9799                      << ") in " << L->getLocStr() << '\n');
9800    ORE->emit([&]() {
9801      return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
9802                                        L->getStartLoc(), L->getHeader())
9803             << IntDiagMsg.second;
9804    });
9805  } else if (VectorizeLoop && InterleaveLoop) {
9806    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
9807                      << ") in " << L->getLocStr() << '\n');
9808    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
9809  }
9810
9811  // Report the vectorization decision.
9812  if (VF.Width.isScalar()) {
9813    using namespace ore;
9814    assert(IC > 1);
9815    ORE->emit([&]() {
9816      return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
9817                                L->getHeader())
9818             << "interleaved loop (interleaved count: "
9819             << NV("InterleaveCount", IC) << ")";
9820    });
9821  } else {
9822    // Report the vectorization decision.
9823    reportVectorization(ORE, L, VF, IC);
9824  }
9825  if (ORE->allowExtraAnalysis(LV_NAME))
9827
9828  // If we decided that it is *legal* to interleave or vectorize the loop, then
9829  // do it.
9830
9831  VPlan &BestPlan = LVP.getPlanFor(VF.Width);
9832  // Consider vectorizing the epilogue too if it's profitable.
9833  VectorizationFactor EpilogueVF =
9835  if (EpilogueVF.Width.isVector()) {
9836    std::unique_ptr<VPlan> BestMainPlan(BestPlan.duplicate());
9837
9838    // The first pass vectorizes the main loop and creates a scalar epilogue
9839    // to be vectorized by executing the plan (potentially with a different
9840    // factor) again shortly afterwards.
9841    VPlan &BestEpiPlan = LVP.getPlanFor(EpilogueVF.Width);
9842    BestEpiPlan.getMiddleBlock()->setName("vec.epilog.middle.block");
9843    BestEpiPlan.getVectorPreheader()->setName("vec.epilog.ph");
9844    preparePlanForMainVectorLoop(*BestMainPlan, BestEpiPlan);
9845    EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1,
9846                                      BestEpiPlan);
9847    EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TTI, AC, EPI, &CM,
9848                                       Checks, *BestMainPlan);
9849    auto ExpandedSCEVs = LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF,
9850                                         *BestMainPlan, MainILV, DT, false);
9851    ++LoopsVectorized;
9852
9853    // Second pass vectorizes the epilogue and adjusts the control flow
9854    // edges from the first pass.
9855    EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TTI, AC, EPI, &CM,
9856                                             Checks, BestEpiPlan);
9858        BestEpiPlan, L, ExpandedSCEVs, EPI, CM, *PSE.getSE());
9859    LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV, DT,
9860                    true);
9861    connectEpilogueVectorLoop(BestEpiPlan, L, EPI, DT, LVL, ExpandedSCEVs,
9862                              Checks, InstsToMove);
9863    ++LoopsEpilogueVectorized;
9864  } else {
9865    InnerLoopVectorizer LB(L, PSE, LI, DT, TTI, AC, VF.Width, IC, &CM, Checks,
9866                           BestPlan);
9867    // TODO: Move to general VPlan pipeline once epilogue loops are also
9868    // supported.
9870        BestPlan, VF.Width, IC, PSE);
9871    LVP.addMinimumIterationCheck(BestPlan, VF.Width, IC,
9873
9874    LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false);
9875    ++LoopsVectorized;
9876  }
9877
9878  assert(DT->verify(DominatorTree::VerificationLevel::Fast) &&
9879         "DT not preserved correctly");
9880  assert(!verifyFunction(*F, &dbgs()));
9881
9882  return true;
9883}
9884
// Function-level driver: simplifies all loops, collects vectorizable inner
// loops into a worklist, and runs processLoop on each. Returns whether any
// IR change and/or CFG change was made.
// NOTE(review): original line 9885 (the signature — presumably
// `LoopVectorizeResult LoopVectorize::runImpl(Function &F)` given the
// LoopVectorizeResult returns below) is missing from this extraction —
// confirm against the original file.
9886
9887  // Don't attempt if
9888  // 1. the target claims to have no vector registers, and
9889  // 2. interleaving won't help ILP.
9890  //
9891  // The second condition is necessary because, even if the target has no
9892  // vector registers, loop vectorization may still enable scalar
9893  // interleaving.
9894  if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
9895      TTI->getMaxInterleaveFactor(ElementCount::getFixed(1)) < 2)
9896    return LoopVectorizeResult(false, false);
9897
9898  bool Changed = false, CFGChanged = false;
9899
9900  // The vectorizer requires loops to be in simplified form.
9901  // Since simplification may add new inner loops, it has to run before the
9902  // legality and profitability checks. This means running the loop vectorizer
9903  // will simplify all loops, regardless of whether anything ends up being
9904  // vectorized.
9905  for (const auto &L : *LI)
9906    Changed |= CFGChanged |=
9907        simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
9908
9909  // Build up a worklist of inner-loops to vectorize. This is necessary as
9910  // the act of vectorizing or partially unrolling a loop creates new loops
9911  // and can invalidate iterators across the loops.
9912  SmallVector<Loop *, 8> Worklist;
9913
9914  for (Loop *L : *LI)
9915    collectSupportedLoops(*L, LI, ORE, Worklist);
9916
9917  LoopsAnalyzed += Worklist.size();
9918
9919  // Now walk the identified inner loops.
9920  while (!Worklist.empty()) {
9921    Loop *L = Worklist.pop_back_val();
9922
9923    // For the inner loops we actually process, form LCSSA to simplify the
9924    // transform.
9925    Changed |= formLCSSARecursively(*L, *DT, LI, SE);
9926
9927    Changed |= CFGChanged |= processLoop(L);
9928
    // Cached loop-access analyses are stale once any loop changed.
9929    if (Changed) {
9930      LAIs->clear();
9931
9932#ifndef NDEBUG
9933      if (VerifySCEV)
9934        SE->verify();
9935#endif
9936    }
9937  }
9938
9939  // Process each loop nest in the function.
9940  return LoopVectorizeResult(Changed, CFGChanged);
9941}
9942
// New pass-manager entry point: gathers analyses, runs the vectorizer via
// runImpl, and reports which analyses are preserved.
// NOTE(review): this extraction is missing original lines 9943-9944 (the
// signature — presumably `PreservedAnalyses LoopVectorizePass::run(Function
// &F, FunctionAnalysisManager &AM)` given the uses of F/AM below) and
// several interior lines (9950-9957: the other analysis-result fetches such
// as SE/DT/TTI/etc., 9963, 9968: the PreservedAnalyses declaration, 9972,
// 9976-9978, 9985-9986, 9988) — confirm against the original file.
9945  LI = &AM.getResult<LoopAnalysis>(F);
9946  // There are no loops in the function. Return before computing other
9947  // expensive analyses.
9948  if (LI->empty())
9949    return PreservedAnalyses::all();
9958  AA = &AM.getResult<AAManager>(F);
9959
  // Profile summary comes from the module level; only use BFI when a
  // profile summary is actually available (see GetBFI use in processLoop).
9960  auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
9961  PSI = MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
9962  GetBFI = [&AM, &F]() -> BlockFrequencyInfo & {
9964  };
9965  LoopVectorizeResult Result = runImpl(F);
9966  if (!Result.MadeAnyChange)
9967    return PreservedAnalyses::all();
9969
9970  if (isAssignmentTrackingEnabled(*F.getParent())) {
9971    for (auto &BB : F)
9973  }
9974
9975  PA.preserve<LoopAnalysis>();
9979
9980  if (Result.MadeCFGChange) {
9981    // Making CFG changes likely means a loop got vectorized. Indicate that
9982    // extra simplification passes should be run.
9983    // TODO: MadeCFGChanges is not a perfect proxy. Extra passes should only
9984    // be run if runtime checks have been added.
9987  } else {
9989  }
9990  return PA;
9991}
9992
// Print this pass's textual pipeline representation, including its two
// boolean options, e.g. `loop-vectorize<no-interleave-forced-only;...>`.
// NOTE(review): original line 9993 (the signature head —
// `void LoopVectorizePass::printPipeline(` judging by the delegation to
// `PassInfoMixin<LoopVectorizePass>::printPipeline` below) is missing from
// this extraction — confirm against the original file.
9994    raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
  // Emit the base (pass-name) portion first, then the option list in <>.
9995  static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
9996      OS, MapClassName2PassName);
9997
9998  OS << '<';
9999  OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
10000  OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
10001  OS << '>';
10002}
for(const MachineOperand &MO :llvm::drop_begin(OldMI.operands(), Desc.getNumOperands()))
static unsigned getIntrinsicID(const SDNode *N)
unsigned RegSize
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
aarch64 promote const
AMDGPU Lower Kernel Arguments
This file implements a class to represent arbitrary precision integral constant values and operations...
@ PostInc
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static bool isEqual(const Function &Caller, const Function &Callee)
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
This is the interface for LLVM's primary stateless and local alias analysis.
static bool IsEmptyBlock(MachineBasicBlock *MBB)
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
#define clEnumValN(ENUMVAL, FLAGNAME, DESC)
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
static cl::opt< IntrinsicCostStrategy > IntrinsicCost("intrinsic-cost-strategy", cl::desc("Costing strategy for intrinsic instructions"), cl::init(IntrinsicCostStrategy::InstructionCost), cl::values(clEnumValN(IntrinsicCostStrategy::InstructionCost, "instruction-cost", "Use TargetTransformInfo::getInstructionCost"), clEnumValN(IntrinsicCostStrategy::IntrinsicCost, "intrinsic-cost", "Use TargetTransformInfo::getIntrinsicInstrCost"), clEnumValN(IntrinsicCostStrategy::TypeBasedIntrinsicCost, "type-based-intrinsic-cost", "Calculate the intrinsic cost based only on argument types")))
static InstructionCost getCost(Instruction &Inst, TTI::TargetCostKind CostKind, TargetTransformInfo &TTI, TargetLibraryInfo &TLI)
Definition CostModel.cpp:74
This file defines DenseMapInfo traits for DenseMap.
This file defines the DenseMap class.
#define DEBUG_TYPE
This is the interface for a simple mod/ref and alias analysis over globals.
Hexagon Common GEP
This file provides various utilities for inspecting and working with the control flow graph in LLVM I...
Module.h This file contains the declarations for the Module class.
This defines the Use class.
static bool hasNoUnsignedWrap(BinaryOperator &I)
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
Legalize the Machine IR a function s Machine IR
Definition Legalizer.cpp:81
static cl::opt< unsigned, true > VectorizationFactor("force-vector-width", cl::Hidden, cl::desc("Sets the SIMD width. Zero is autoselect."), cl::location(VectorizerParams::VectorizationFactor))
This header provides classes for managing per-loop analyses.
static cl::opt< bool > WidenIV("loop-flatten-widen-iv", cl::Hidden, cl::init(true), cl::desc("Widen the loop induction variables, if possible, so " "overflow checks won't reject flattening"))
static const char * VerboseDebug
#define LV_NAME
This file defines the LoopVectorizationLegality class.
This file provides a LoopVectorizationPlanner class.
static void collectSupportedLoops(Loop &L, LoopInfo *LI, OptimizationRemarkEmitter *ORE, SmallVectorImpl< Loop * > &V)
static cl::opt< unsigned > EpilogueVectorizationMinVF("epilogue-vectorization-minimum-VF", cl::Hidden, cl::desc("Only loops with vectorization factor equal to or larger than " "the specified value are considered for epilogue vectorization."))
static cl::opt< unsigned > EpilogueVectorizationForceVF("epilogue-vectorization-force-VF", cl::init(1), cl::Hidden, cl::desc("When epilogue vectorization is enabled, and a value greater than " "1 is specified, forces the given VF for all applicable epilogue " "loops."))
static Type * maybeVectorizeType(Type *Ty, ElementCount VF)
static ElementCount determineVPlanVF(const TargetTransformInfo &TTI, LoopVectorizationCostModel &CM)
static ElementCount getSmallConstantTripCount(ScalarEvolution *SE, const Loop *L)
A version of ScalarEvolution::getSmallConstantTripCount that returns an ElementCount to include loops...
static bool hasUnsupportedHeaderPhiRecipe(VPlan &Plan)
Returns true if the VPlan contains header phi recipes that are not currently supported for epilogue v...
static cl::opt< unsigned > VectorizeMemoryCheckThreshold("vectorize-memory-check-threshold", cl::init(128), cl::Hidden, cl::desc("The maximum allowed number of runtime memory checks"))
static void preparePlanForMainVectorLoop(VPlan &MainPlan, VPlan &EpiPlan)
Prepare MainPlan for vectorizing the main vector loop during epilogue vectorization.
static cl::opt< unsigned > TinyTripCountVectorThreshold("vectorizer-min-trip-count", cl::init(16), cl::Hidden, cl::desc("Loops with a constant trip count that is smaller than this " "value are vectorized only if no scalar iteration overheads " "are incurred."))
Loops with a known constant trip count below this number are vectorized only if no scalar iteration o...
static void debugVectorizationMessage(const StringRef Prefix, const StringRef DebugMsg, Instruction *I)
Write a DebugMsg about vectorization to the debug output stream.
static cl::opt< bool > EnableCondStoresVectorization("enable-cond-stores-vec", cl::init(true), cl::Hidden, cl::desc("Enable if predication of stores during vectorization."))
static void legacyCSE(BasicBlock *BB)
FIXME: This legacy common-subexpression-elimination routine is scheduled for removal,...
static VPIRBasicBlock * replaceVPBBWithIRVPBB(VPBasicBlock *VPBB, BasicBlock *IRBB, VPlan *Plan=nullptr)
Replace VPBB with a VPIRBasicBlock wrapping IRBB.
static DebugLoc getDebugLocFromInstOrOperands(Instruction *I)
Look for a meaningful debug location on the instruction or its operands.
static Value * createInductionAdditionalBypassValues(PHINode *OrigPhi, const InductionDescriptor &II, IRBuilder<> &BypassBuilder, const SCEV2ValueTy &ExpandedSCEVs, Value *MainVectorTripCount, Instruction *OldInduction)
static void fixReductionScalarResumeWhenVectorizingEpilog(VPPhi *EpiResumePhiR, PHINode &EpiResumePhi, BasicBlock *BypassBlock)
static cl::opt< bool > ForceTargetSupportsScalableVectors("force-target-supports-scalable-vectors", cl::init(false), cl::Hidden, cl::desc("Pretend that scalable vectors are supported, even if the target does " "not support them. This flag should only be used for testing."))
static bool useActiveLaneMaskForControlFlow(TailFoldingStyle Style)
static cl::opt< bool > EnableEarlyExitVectorization("enable-early-exit-vectorization", cl::init(true), cl::Hidden, cl::desc("Enable vectorization of early exit loops with uncountable exits."))
static bool processLoopInVPlanNativePath(Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, std::function< BlockFrequencyInfo &()> GetBFI, bool OptForSize, LoopVectorizeHints &Hints, LoopVectorizationRequirements &Requirements)
static cl::opt< bool > ConsiderRegPressure("vectorizer-consider-reg-pressure", cl::init(false), cl::Hidden, cl::desc("Discard VFs if their register pressure is too high."))
static unsigned estimateElementCount(ElementCount VF, std::optional< unsigned > VScale)
This function attempts to return a value that represents the ElementCount at runtime.
static constexpr uint32_t MinItersBypassWeights[]
static cl::opt< unsigned > ForceTargetNumScalarRegs("force-target-num-scalar-regs", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's number of scalar registers."))
static cl::opt< bool > UseWiderVFIfCallVariantsPresent("vectorizer-maximize-bandwidth-for-vector-calls", cl::init(true), cl::Hidden, cl::desc("Try wider VFs if they enable the use of vector variants"))
static std::optional< unsigned > getMaxVScale(const Function &F, const TargetTransformInfo &TTI)
static cl::opt< unsigned > SmallLoopCost("small-loop-cost", cl::init(20), cl::Hidden, cl::desc("The cost of a loop that is considered 'small' by the interleaver."))
static void connectEpilogueVectorLoop(VPlan &EpiPlan, Loop *L, EpilogueLoopVectorizationInfo &EPI, DominatorTree *DT, LoopVectorizationLegality &LVL, DenseMap< const SCEV *, Value * > &ExpandedSCEVs, GeneratedRTChecks &Checks, ArrayRef< Instruction * > InstsToMove)
Connect the epilogue vector loop generated for EpiPlan to the main vector loop.
static bool planContainsAdditionalSimplifications(VPlan &Plan, VPCostContext &CostCtx, Loop *TheLoop, ElementCount VF)
Return true if the original loop \p TheLoop contains any instructions that do not have corresponding r...
static cl::opt< unsigned > ForceTargetNumVectorRegs("force-target-num-vector-regs", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's number of vector registers."))
static bool isExplicitVecOuterLoop(Loop *OuterLp, OptimizationRemarkEmitter *ORE)
static cl::opt< bool > EnableIndVarRegisterHeur("enable-ind-var-reg-heur", cl::init(true), cl::Hidden, cl::desc("Count the induction variable only once when interleaving"))
static cl::opt< TailFoldingStyle > ForceTailFoldingStyle("force-tail-folding-style", cl::desc("Force the tail folding style"), cl::init(TailFoldingStyle::None), cl::values(clEnumValN(TailFoldingStyle::None, "none", "Disable tail folding"), clEnumValN(TailFoldingStyle::Data, "data", "Create lane mask for data only, using active.lane.mask intrinsic"), clEnumValN(TailFoldingStyle::DataWithoutLaneMask, "data-without-lane-mask", "Create lane mask with compare/stepvector"), clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control", "Create lane mask using active.lane.mask intrinsic, and use " "it for both data and control flow"), clEnumValN(TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck, "data-and-control-without-rt-check", "Similar to data-and-control, but remove the runtime check"), clEnumValN(TailFoldingStyle::DataWithEVL, "data-with-evl", "Use predicated EVL instructions for tail folding. If EVL " "is unsupported, fallback to data-without-lane-mask.")))
static ScalarEpilogueLowering getScalarEpilogueLowering(Function *F, Loop *L, LoopVectorizeHints &Hints, bool OptForSize, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, LoopVectorizationLegality &LVL, InterleavedAccessInfo *IAI)
static cl::opt< bool > EnableEpilogueVectorization("enable-epilogue-vectorization", cl::init(true), cl::Hidden, cl::desc("Enable vectorization of epilogue loops."))
static cl::opt< bool > PreferPredicatedReductionSelect("prefer-predicated-reduction-select", cl::init(false), cl::Hidden, cl::desc("Prefer predicating a reduction operation over an after loop select."))
static cl::opt< bool > PreferInLoopReductions("prefer-inloop-reductions", cl::init(false), cl::Hidden, cl::desc("Prefer in-loop vector reductions, " "overriding the targets preference."))
static SmallVector< Instruction * > preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L, const SCEV2ValueTy &ExpandedSCEVs, EpilogueLoopVectorizationInfo &EPI, LoopVectorizationCostModel &CM, ScalarEvolution &SE)
Prepare Plan for vectorizing the epilogue loop.
static const SCEV * getAddressAccessSCEV(Value *Ptr, PredicatedScalarEvolution &PSE, const Loop *TheLoop)
Gets the address access SCEV for Ptr, if it should be used for cost modeling according to isAddressSC...
static cl::opt< bool > EnableLoadStoreRuntimeInterleave("enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden, cl::desc("Enable runtime interleaving until load/store ports are saturated"))
static cl::opt< bool > VPlanBuildStressTest("vplan-build-stress-test", cl::init(false), cl::Hidden, cl::desc("Build VPlan for every supported loop nest in the function and bail " "out right after the build (stress test the VPlan H-CFG construction " "in the VPlan-native vectorization path)."))
static bool hasIrregularType(Type *Ty, const DataLayout &DL)
A helper function that returns true if the given type is irregular.
static cl::opt< bool > LoopVectorizeWithBlockFrequency("loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden, cl::desc("Enable the use of the block frequency analysis to access PGO " "heuristics minimizing code growth in cold regions and being more " "aggressive in hot regions."))
static std::optional< ElementCount > getSmallBestKnownTC(PredicatedScalarEvolution &PSE, Loop *L, bool CanUseConstantMax=true)
Returns "best known" trip count, which is either a valid positive trip count or std::nullopt when an ...
static Value * getExpandedStep(const InductionDescriptor &ID, const SCEV2ValueTy &ExpandedSCEVs)
Return the expanded step for ID using ExpandedSCEVs to look up SCEV expansion results.
static bool useActiveLaneMask(TailFoldingStyle Style)
static bool hasReplicatorRegion(VPlan &Plan)
static bool isIndvarOverflowCheckKnownFalse(const LoopVectorizationCostModel *Cost, ElementCount VF, std::optional< unsigned > UF=std::nullopt)
For the given VF and UF and maximum trip count computed for the loop, return whether the induction va...
static void addFullyUnrolledInstructionsToIgnore(Loop *L, const LoopVectorizationLegality::InductionList &IL, SmallPtrSetImpl< Instruction * > &InstsToIgnore)
Knowing that loop L executes a single vector iteration, add instructions that will get simplified and...
static cl::opt< PreferPredicateTy::Option > PreferPredicateOverEpilogue("prefer-predicate-over-epilogue", cl::init(PreferPredicateTy::ScalarEpilogue), cl::Hidden, cl::desc("Tail-folding and predication preferences over creating a scalar " "epilogue loop."), cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue, "scalar-epilogue", "Don't tail-predicate loops, create scalar epilogue"), clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue, "predicate-else-scalar-epilogue", "prefer tail-folding, create scalar epilogue if tail " "folding fails."), clEnumValN(PreferPredicateTy::PredicateOrDontVectorize, "predicate-dont-vectorize", "prefers tail-folding, don't attempt vectorization if " "tail-folding fails.")))
static bool hasFindLastReductionPhi(VPlan &Plan)
Returns true if the VPlan contains a VPReductionPHIRecipe with FindLast recurrence kind.
static cl::opt< bool > EnableInterleavedMemAccesses("enable-interleaved-mem-accesses", cl::init(false), cl::Hidden, cl::desc("Enable vectorization on interleaved memory accesses in a loop"))
static cl::opt< bool > EnableMaskedInterleavedMemAccesses("enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden, cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"))
An interleave-group may need masking if it resides in a block that needs predication,...
static cl::opt< bool > ForceOrderedReductions("force-ordered-reductions", cl::init(false), cl::Hidden, cl::desc("Enable the vectorisation of loops with in-order (strict) " "FP reductions"))
static cl::opt< cl::boolOrDefault > ForceSafeDivisor("force-widen-divrem-via-safe-divisor", cl::Hidden, cl::desc("Override cost based safe divisor widening for div/rem instructions"))
static InstructionCost calculateEarlyExitCost(VPCostContext &CostCtx, VPlan &Plan, ElementCount VF)
For loops with uncountable early exits, find the cost of doing work when exiting the loop early,...
static cl::opt< unsigned > ForceTargetMaxVectorInterleaveFactor("force-target-max-vector-interleave", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's max interleave factor for " "vectorized loops."))
static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI)
cl::opt< unsigned > NumberOfStoresToPredicate("vectorize-num-stores-pred", cl::init(1), cl::Hidden, cl::desc("Max number of stores to be predicated behind an if."))
The number of stores in a loop that are allowed to need predication.
static cl::opt< unsigned > MaxNestedScalarReductionIC("max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden, cl::desc("The maximum interleave count to use when interleaving a scalar " "reduction in a nested loop."))
static cl::opt< unsigned > ForceTargetMaxScalarInterleaveFactor("force-target-max-scalar-interleave", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's max interleave factor for " "scalar loops."))
static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE)
static bool willGenerateVectors(VPlan &Plan, ElementCount VF, const TargetTransformInfo &TTI)
Check if any recipe of Plan will generate a vector value, which will be assigned a vector register.
static bool isOutsideLoopWorkProfitable(GeneratedRTChecks &Checks, VectorizationFactor &VF, Loop *L, PredicatedScalarEvolution &PSE, VPCostContext &CostCtx, VPlan &Plan, ScalarEpilogueLowering SEL, std::optional< unsigned > VScale)
This function determines whether or not it's still profitable to vectorize the loop given the extra w...
static void fixScalarResumeValuesFromBypass(BasicBlock *BypassBlock, Loop *L, VPlan &BestEpiPlan, LoopVectorizationLegality &LVL, const SCEV2ValueTy &ExpandedSCEVs, Value *MainVectorTripCount)
static cl::opt< bool > MaximizeBandwidth("vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden, cl::desc("Maximize bandwidth when selecting vectorization factor which " "will be determined by the smallest type in loop."))
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName, StringRef RemarkName, Loop *TheLoop, Instruction *I, DebugLoc DL={})
Create an analysis remark that explains why vectorization failed.
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
This file implements a map that provides insertion order iteration.
This file contains the declarations for metadata subclasses.
#define T
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))
uint64_t IntrinsicInst * II
#define P(N)
This file contains the declarations for profiling metadata utility functions.
const SmallVectorImpl< MachineOperand > & Cond
static BinaryOperator * CreateMul(Value *S1, Value *S2, const Twine &Name, BasicBlock::iterator InsertBefore, Value *FlagsOp)
static BinaryOperator * CreateAdd(Value *S1, Value *S2, const Twine &Name, BasicBlock::iterator InsertBefore, Value *FlagsOp)
static bool isValid(const char C)
Returns true if C is a valid mangled character: <0-9a-zA-Z_>.
static InstructionCost getScalarizationOverhead(const TargetTransformInfo &TTI, Type *ScalarTy, VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={})
This is similar to TargetTransformInfo::getScalarizationOverhead, but if ScalarTy is a FixedVectorTyp...
This file contains some templates that are useful if you are working with the STL at all.
#define OP(OPC)
Definition Instruction.h:46
This file defines the SmallPtrSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
#define LLVM_DEBUG(...)
Definition Debug.h:114
#define DEBUG_WITH_TYPE(TYPE,...)
DEBUG_WITH_TYPE macro - This macro should be used by passes to emit debug information.
Definition Debug.h:72
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
This pass exposes codegen information to IR-level passes.
LocallyHashedType DenseMapInfo< LocallyHashedType >::Empty
This file implements the TypeSwitch template, which mimics a switch() statement whose cases are type ...
This file contains the declarations of different VPlan-related auxiliary helpers.
This file provides utility VPlan to VPlan transformations.
#define RUN_VPLAN_PASS(PASS,...)
#define RUN_VPLAN_PASS_NO_VERIFY(PASS,...)
This file declares the class VPlanVerifier, which contains utility functions to check the consistency...
This file contains the declarations of the Vectorization Plan base classes:
static const char PassName[]
Value * RHS
Value * LHS
static const uint32_t IV[8]
Definition blake3_impl.h:83
A manager for alias analyses.
Class for arbitrary precision integers.
Definition APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition APInt.h:235
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1555
unsigned getActiveBits() const
Compute the number of active bits in the value.
Definition APInt.h:1527
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
LLVM_ABI unsigned getVScaleRangeMin() const
Returns the minimum value for the vscale_range attribute.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
iterator_range< const_phi_iterator > phis() const
Returns a range that iterates over the phis in the basic block.
Definition BasicBlock.h:539
LLVM_ABI const_iterator getFirstInsertionPt() const
Returns an iterator to the first instruction in this block that is suitable for inserting a non-PHI i...
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
LLVM_ABI InstListType::const_iterator getFirstNonPHIIt() const
Returns an iterator to the first instruction in this block that is not a PHINode instruction.
LLVM_ABI const BasicBlock * getSinglePredecessor() const
Return the predecessor of this block if it has a single predecessor block.
LLVM_ABI const BasicBlock * getSingleSuccessor() const
Return the successor of this block if it has a single successor.
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this basic block belongs to.
LLVM_ABI LLVMContext & getContext() const
Get the context in which this basic block lives.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition BasicBlock.h:233
BinaryOps getOpcode() const
Definition InstrTypes.h:374
Analysis pass which computes BlockFrequencyInfo.
BlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate IR basic block frequen...
Conditional or Unconditional Branch instruction.
bool isConditional() const
static BranchInst * Create(BasicBlock *IfTrue, InsertPosition InsertBefore=nullptr)
BasicBlock * getSuccessor(unsigned i) const
Represents analyses that only rely on functions' control flow.
Definition Analysis.h:73
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Value * getArgOperand(unsigned i) const
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Definition InstrTypes.h:982
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:676
@ ICMP_UGT
unsigned greater than
Definition InstrTypes.h:699
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:701
@ ICMP_NE
not equal
Definition InstrTypes.h:698
@ ICMP_ULE
unsigned less or equal
Definition InstrTypes.h:702
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition InstrTypes.h:789
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
This is the shared class of boolean and integer constants.
Definition Constants.h:87
static LLVM_ABI ConstantInt * getTrue(LLVMContext &Context)
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
A debug info location.
Definition DebugLoc.h:123
static DebugLoc getTemporary()
Definition DebugLoc.h:160
static DebugLoc getUnknown()
Definition DebugLoc.h:161
An analysis that produces DemandedBits for a function.
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition DenseMap.h:205
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:178
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:256
iterator end()
Definition DenseMap.h:81
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition DenseMap.h:169
void insert_range(Range &&R)
Inserts range of 'std::pair<KeyT, ValueT>' values into the map.
Definition DenseMap.h:294
Implements a dense probed hash-table based set.
Definition DenseSet.h:279
Analysis pass which computes a DominatorTree.
Definition Dominators.h:283
void changeImmediateDominator(DomTreeNodeBase< NodeT > *N, DomTreeNodeBase< NodeT > *NewIDom)
changeImmediateDominator - This method is used to update the dominator tree information when a node's...
void eraseNode(NodeT *BB)
eraseNode - Removes a node from the dominator tree.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:164
constexpr bool isVector() const
One or more elements.
Definition TypeSize.h:324
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition TypeSize.h:312
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition TypeSize.h:309
static constexpr ElementCount get(ScalarTy MinVal, bool Scalable)
Definition TypeSize.h:315
constexpr bool isScalar() const
Exactly one element.
Definition TypeSize.h:320
EpilogueVectorizerEpilogueLoop(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetTransformInfo *TTI, AssumptionCache *AC, EpilogueLoopVectorizationInfo &EPI, LoopVectorizationCostModel *CM, GeneratedRTChecks &Checks, VPlan &Plan)
BasicBlock * createVectorizedLoopSkeleton() final
Implements the interface for creating a vectorized skeleton using the epilogue loop strategy (i....
void printDebugTracesAtStart() override
Allow subclasses to override and print debug traces before/after vplan execution, when trace informat...
A specialized derived class of inner loop vectorizer that performs vectorization of main loops in the...
void introduceCheckBlockInVPlan(BasicBlock *CheckIRBB)
Introduces a new VPIRBasicBlock for CheckIRBB to Plan between the vector preheader and its predecesso...
BasicBlock * emitIterationCountCheck(BasicBlock *VectorPH, BasicBlock *Bypass, bool ForEpilogue)
Emits an iteration count bypass check once for the main loop (when ForEpilogue is false) and once for...
Value * createIterationCountCheck(BasicBlock *VectorPH, ElementCount VF, unsigned UF) const
void printDebugTracesAtStart() override
Allow subclasses to override and print debug traces before/after vplan execution, when trace informat...
EpilogueVectorizerMainLoop(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetTransformInfo *TTI, AssumptionCache *AC, EpilogueLoopVectorizationInfo &EPI, LoopVectorizationCostModel *CM, GeneratedRTChecks &Check, VPlan &Plan)
BasicBlock * createVectorizedLoopSkeleton() final
Implements the interface for creating a vectorized skeleton using the main loop strategy (i....
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:22
Class to represent function types.
param_iterator param_begin() const
param_iterator param_end() const
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition Function.h:211
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition Function.cpp:764
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition Function.cpp:729
Represents flags for the getelementptr instruction/expression.
static GEPNoWrapFlags none()
void applyUpdates(ArrayRef< UpdateT > Updates)
Submit updates to all available trees.
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
Definition IRBuilder.h:345
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2787
A struct for saving information about induction variables.
const SCEV * getStep() const
ArrayRef< Instruction * > getCastInsts() const
Returns an ArrayRef to the type cast instructions in the induction update chain, that are redundant w...
InductionKind
This enum represents the kinds of inductions that we support.
@ IK_NoInduction
Not an induction variable.
@ IK_FpInduction
Floating point induction variable.
@ IK_PtrInduction
Pointer induction var. Step = C.
@ IK_IntInduction
Integer induction variable. Step = C.
InnerLoopAndEpilogueVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetTransformInfo *TTI, AssumptionCache *AC, EpilogueLoopVectorizationInfo &EPI, LoopVectorizationCostModel *CM, GeneratedRTChecks &Checks, VPlan &Plan, ElementCount VecWidth, ElementCount MinProfitableTripCount, unsigned UnrollFactor)
EpilogueLoopVectorizationInfo & EPI
Holds and updates state information required to vectorize the main loop and its epilogue in two separ...
InnerLoopVectorizer vectorizes loops which contain only one basic block to a specified vectorization ...
virtual void printDebugTracesAtStart()
Allow subclasses to override and print debug traces before/after vplan execution, when trace informat...
Value * TripCount
Trip count of the original loop.
const TargetTransformInfo * TTI
Target Transform Info.
LoopVectorizationCostModel * Cost
The profitability analysis.
Value * getTripCount() const
Returns the original loop trip count.
friend class LoopVectorizationPlanner
InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetTransformInfo *TTI, AssumptionCache *AC, ElementCount VecWidth, unsigned UnrollFactor, LoopVectorizationCostModel *CM, GeneratedRTChecks &RTChecks, VPlan &Plan)
PredicatedScalarEvolution & PSE
A wrapper around ScalarEvolution used to add runtime SCEV checks.
LoopInfo * LI
Loop Info.
DominatorTree * DT
Dominator Tree.
void setTripCount(Value *TC)
Used to set the trip count after ILV's construction and after the preheader block has been executed.
void fixVectorizedLoop(VPTransformState &State)
Fix the vectorized code, taking care of header phi's, and more.
virtual BasicBlock * createVectorizedLoopSkeleton()
Creates a basic block for the scalar preheader.
virtual void printDebugTracesAtEnd()
AssumptionCache * AC
Assumption Cache.
IRBuilder Builder
The builder that we use.
void fixNonInductionPHIs(VPTransformState &State)
Fix the non-induction PHIs in Plan.
VPBasicBlock * VectorPHVPBB
The vector preheader block of Plan, used as target for check blocks introduced during skeleton creati...
unsigned UF
The vectorization unroll factor to use.
GeneratedRTChecks & RTChecks
Structure to hold information about generated runtime checks, responsible for cleaning the checks,...
virtual ~InnerLoopVectorizer()=default
ElementCount VF
The vectorization SIMD factor to use.
Loop * OrigLoop
The original loop.
BasicBlock * createScalarPreheader(StringRef Prefix)
Create and return a new IR basic block for the scalar preheader whose name is prefixed with Prefix.
InstSimplifyFolder - Use InstructionSimplify to fold operations to existing values.
static InstructionCost getInvalid(CostType Val=0)
static InstructionCost getMax()
CostType getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
bool isCast() const
LLVM_ABI const Module * getModule() const
Return the module owning the function this instruction belongs to, or nullptr if the function does not...
LLVM_ABI void moveBefore(InstListType::iterator InsertPos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Instruction * user_back()
Specialize the methods defined in Value, as we know that an instruction can only be used by other ins...
const char * getOpcodeName() const
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Class to represent integer types.
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:318
LLVM_ABI APInt getMask() const
For example, this is 0xFF for an 8 bit integer, 0xFFFF for i16, etc.
Definition Type.cpp:342
The group of interleaved loads/stores sharing the same stride and close to each other.
uint32_t getFactor() const
InstTy * getMember(uint32_t Index) const
Get the member with the given index Index.
InstTy * getInsertPos() const
uint32_t getNumMembers() const
Drive the analysis of interleaved memory accesses in the loop.
bool requiresScalarEpilogue() const
Returns true if an interleaved group that may access memory out-of-bounds requires a scalar epilogue ...
LLVM_ABI void analyzeInterleaving(bool EnableMaskedInterleavedGroup)
Analyze the interleaved accesses and collect them in interleave groups.
An instruction for reading from memory.
Type * getPointerOperandType() const
This analysis provides dependence information for the memory accesses of a loop.
Drive the analysis of memory accesses in the loop.
const RuntimePointerChecking * getRuntimePointerChecking() const
unsigned getNumRuntimePointerChecks() const
Number of memchecks required to prove independence of otherwise may-alias pointers.
Analysis pass that exposes the LoopInfo for a function.
Definition LoopInfo.h:569
bool contains(const LoopT *L) const
Return true if the specified loop is contained within in this loop.
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
bool isInnermost() const
Return true if the loop does not contain any (natural) loops.
void getExitingBlocks(SmallVectorImpl< BlockT * > &ExitingBlocks) const
Return all blocks inside the loop that have successors outside of the loop.
BlockT * getHeader() const
iterator_range< block_iterator > blocks() const
ArrayRef< BlockT * > getBlocks() const
Get a list of the basic blocks which make up this loop.
Store the result of a depth first search within basic blocks contained by a single loop.
RPOIterator beginRPO() const
Reverse iterate over the cached postorder blocks.
void perform(const LoopInfo *LI)
Traverse the loop blocks and store the DFS result.
RPOIterator endRPO() const
Wrapper class to LoopBlocksDFS that provides a standard begin()/end() interface for the DFS reverse p...
void perform(const LoopInfo *LI)
Traverse the loop blocks and store the DFS result.
void removeBlock(BlockT *BB)
This method completely removes BB from all data structures, including all of the Loop objects it is n...
LoopVectorizationCostModel - estimates the expected speedups due to vectorization.
SmallPtrSet< Type *, 16 > ElementTypesInLoop
All element types found in the loop.
bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment, unsigned AddressSpace) const
Returns true if the target machine supports masked load operation for the given DataType and kind of ...
void collectElementTypesForWidening()
Collect all element types in the loop for which widening is needed.
bool canVectorizeReductions(ElementCount VF) const
Returns true if the target machine supports all of the reduction variables found for the given VF.
bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment, unsigned AddressSpace) const
Returns true if the target machine supports masked store operation for the given DataType and kind of...
bool isEpilogueVectorizationProfitable(const ElementCount VF, const unsigned IC) const
Returns true if epilogue vectorization is considered profitable, and false otherwise.
bool useWideActiveLaneMask() const
Returns true if the use of wide lane masks is requested and the loop is using tail-folding with a lan...
bool isPredicatedInst(Instruction *I) const
Returns true if I is an instruction that needs to be predicated at runtime.
void collectValuesToIgnore()
Collect values we want to ignore in the cost model.
BlockFrequencyInfo * BFI
The BlockFrequencyInfo returned from GetBFI.
void collectInLoopReductions()
Split reductions into those that happen in the loop, and those that happen outside.
BlockFrequencyInfo & getBFI()
Returns the BlockFrequencyInfo for the function if cached, otherwise fetches it via GetBFI.
std::pair< unsigned, unsigned > getSmallestAndWidestTypes()
bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const
Returns true if I is known to be uniform after vectorization.
bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF)
Returns true if an artificially high cost for emulated masked memrefs should be used.
void collectNonVectorizedAndSetWideningDecisions(ElementCount VF)
Collect values that will not be widened, including Uniforms, Scalars, and Instructions to Scalarize f...
PredicatedScalarEvolution & PSE
Predicated scalar evolution analysis.
const LoopVectorizeHints * Hints
Loop Vectorize Hint.
std::optional< unsigned > getMaxSafeElements() const
Return maximum safe number of elements to be processed per vector iteration, which do not prevent sto...
const TargetTransformInfo & TTI
Vector target information.
LoopVectorizationLegality * Legal
Vectorization legality.
uint64_t getPredBlockCostDivisor(TargetTransformInfo::TargetCostKind CostKind, const BasicBlock *BB)
A helper function that returns how much we should divide the cost of a predicated block by.
std::optional< InstructionCost > getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy) const
Return the cost of instructions in an inloop reduction pattern, if I is part of that pattern.
InstructionCost getInstructionCost(Instruction *I, ElementCount VF)
Returns the execution time cost of an instruction for a given vector width.
DemandedBits * DB
Demanded bits analysis.
bool interleavedAccessCanBeWidened(Instruction *I, ElementCount VF) const
Returns true if I is a memory instruction in an interleaved-group of memory accesses that can be vect...
const TargetLibraryInfo * TLI
Target Library Info.
bool memoryInstructionCanBeWidened(Instruction *I, ElementCount VF)
Returns true if I is a memory instruction with consecutive memory access that can be widened.
const InterleaveGroup< Instruction > * getInterleavedAccessGroup(Instruction *Instr) const
Get the interleaved access group that Instr belongs to.
InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const
Estimate cost of an intrinsic call instruction CI if it were vectorized with factor VF.
bool OptForSize
Whether this loop should be optimized for size based on function attribute or profile information.
bool useMaxBandwidth(TargetTransformInfo::RegisterKind RegKind)
bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const
Returns true if I is known to be scalar after vectorization.
bool isOptimizableIVTruncate(Instruction *I, ElementCount VF)
Return True if instruction I is an optimizable truncate whose operand is an induction variable.
FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC)
bool shouldConsiderRegPressureForVF(ElementCount VF)
Loop * TheLoop
The loop that we evaluate.
TTI::TargetCostKind CostKind
The kind of cost that we are calculating.
TailFoldingStyle getTailFoldingStyle(bool IVUpdateMayOverflow=true) const
Returns the TailFoldingStyle that is best for the current loop.
InterleavedAccessInfo & InterleaveInfo
The interleave access information contains groups of interleaved accesses with the same stride and cl...
SmallPtrSet< const Value *, 16 > ValuesToIgnore
Values to ignore in the cost model.
void setVectorizedCallDecision(ElementCount VF)
A call may be vectorized in different ways depending on whether we have vectorized variants available...
void invalidateCostModelingDecisions()
Invalidates decisions already taken by the cost model.
bool isAccessInterleaved(Instruction *Instr) const
Check if Instr belongs to any interleaved access group.
bool selectUserVectorizationFactor(ElementCount UserVF)
Setup cost-based decisions for user vectorization factor.
std::optional< unsigned > getVScaleForTuning() const
Return the value of vscale used for tuning the cost model.
OptimizationRemarkEmitter * ORE
Interface to emit optimization remarks.
bool preferPredicatedLoop() const
Returns true if tail-folding is preferred over a scalar epilogue.
LoopInfo * LI
Loop Info analysis.
bool requiresScalarEpilogue(bool IsVectorizing) const
Returns true if we're required to use a scalar epilogue for at least the final iteration of the origi...
SmallPtrSet< const Value *, 16 > VecValuesToIgnore
Values to ignore in the cost model when VF > 1.
bool isInLoopReduction(PHINode *Phi) const
Returns true if the Phi is part of an inloop reduction.
bool isProfitableToScalarize(Instruction *I, ElementCount VF) const
void setWideningDecision(const InterleaveGroup< Instruction > *Grp, ElementCount VF, InstWidening W, InstructionCost Cost)
Save vectorization decision W and Cost taken by the cost model for interleaving group Grp and vector ...
const MapVector< Instruction *, uint64_t > & getMinimalBitwidths() const
CallWideningDecision getCallWideningDecision(CallInst *CI, ElementCount VF) const
bool isLegalGatherOrScatter(Value *V, ElementCount VF)
Returns true if the target machine can represent V as a masked gather or scatter operation.
bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const
bool shouldConsiderInvariant(Value *Op)
Returns true if Op should be considered invariant and if it is trivially hoistable.
bool foldTailByMasking() const
Returns true if all loop blocks should be masked to fold tail loop.
bool foldTailWithEVL() const
Returns true if VP intrinsics with explicit vector length support should be generated in the tail fol...
bool usePredicatedReductionSelect() const
Returns true if the predicated reduction select should be used to set the incoming value for the redu...
bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const
Returns true if the instructions in this block requires predication for any reason,...
void setCallWideningDecision(CallInst *CI, ElementCount VF, InstWidening Kind, Function *Variant, Intrinsic::ID IID, std::optional< unsigned > MaskPos, InstructionCost Cost)
void setTailFoldingStyles(bool IsScalableVF, unsigned UserIC)
Selects and saves TailFoldingStyle for 2 options - if IV update may overflow or not.
AssumptionCache * AC
Assumption cache.
void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W, InstructionCost Cost)
Save vectorization decision W and Cost taken by the cost model for instruction I and vector width VF.
InstWidening
Decision that was taken during cost calculation for memory instruction.
std::pair< InstructionCost, InstructionCost > getDivRemSpeculationCost(Instruction *I, ElementCount VF)
Return the costs for our two available strategies for lowering a div/rem operation which requires spe...
InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF) const
Estimate cost of a call instruction CI if it were vectorized with factor VF.
bool isScalarWithPredication(Instruction *I, ElementCount VF)
Returns true if I is an instruction which requires predication and for which our chosen predication s...
bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const
Returns true if we should use strict in-order reductions for the given RdxDesc.
bool isDivRemScalarWithPredication(InstructionCost ScalarCost, InstructionCost SafeDivisorCost) const
Given costs for both strategies, return true if the scalar predication lowering should be used for di...
std::function< BlockFrequencyInfo &()> GetBFI
A function to lazily fetch BlockFrequencyInfo.
LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, LoopVectorizationLegality *Legal, const TargetTransformInfo &TTI, const TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, std::function< BlockFrequencyInfo &()> GetBFI, const Function *F, const LoopVectorizeHints *Hints, InterleavedAccessInfo &IAI, bool OptForSize)
InstructionCost expectedCost(ElementCount VF)
Returns the expected execution cost.
void setCostBasedWideningDecision(ElementCount VF)
Memory access instruction may be vectorized in more than one way.
InstWidening getWideningDecision(Instruction *I, ElementCount VF) const
Return the cost model decision for the given instruction I and vector width VF.
FixedScalableVFPair MaxPermissibleVFWithoutMaxBW
The highest VF possible for this loop, without using MaxBandwidth.
const SmallPtrSetImpl< PHINode * > & getInLoopReductions() const
Returns the set of in-loop reduction PHIs.
bool isScalarEpilogueAllowed() const
Returns true if a scalar epilogue is not allowed due to optsize or a loop hint annotation.
InstructionCost getWideningCost(Instruction *I, ElementCount VF)
Return the vectorization cost for the given instruction I and vector width VF.
void collectInstsToScalarize(ElementCount VF)
Collects the instructions to scalarize for each predicated instruction in the loop.
LoopVectorizationLegality checks if it is legal to vectorize a loop, and to what vectorization factor...
MapVector< PHINode *, InductionDescriptor > InductionList
InductionList saves induction variables and maps them to the induction descriptor.
const SmallPtrSetImpl< const Instruction * > & getPotentiallyFaultingLoads() const
Returns potentially faulting loads.
bool canVectorize(bool UseVPlanNativePath)
Returns true if it is legal to vectorize this loop.
bool canVectorizeFPMath(bool EnableStrictReductions)
Returns true if it is legal to vectorize the FP math operations in this loop.
PHINode * getPrimaryInduction()
Returns the primary induction variable.
const SmallVector< BasicBlock *, 4 > & getCountableExitingBlocks() const
Returns all exiting blocks with a countable exit, i.e.
const InductionList & getInductionVars() const
Returns the induction variables found in the loop.
bool hasUncountableEarlyExit() const
Returns true if the loop has uncountable early exits, i.e.
bool hasHistograms() const
Returns true if there are any known histogram operations in the loop.
const LoopAccessInfo * getLAI() const
Planner drives the vectorization process after having passed Legality checks.
VectorizationFactor selectEpilogueVectorizationFactor(const ElementCount MainLoopVF, unsigned IC)
VPlan & getPlanFor(ElementCount VF) const
Return the VPlan for VF.
Definition VPlan.cpp:1604
VectorizationFactor planInVPlanNativePath(ElementCount UserVF)
Use the VPlan-native path to plan how to best vectorize, return the best VF and its cost.
void updateLoopMetadataAndProfileInfo(Loop *VectorLoop, VPBasicBlock *HeaderVPBB, const VPlan &Plan, bool VectorizingEpilogue, MDNode *OrigLoopID, std::optional< unsigned > OrigAverageTripCount, unsigned OrigLoopInvocationWeight, unsigned EstimatedVFxUF, bool DisableRuntimeUnroll)
Update loop metadata and profile info for both the scalar remainder loop and VectorLoop,...
Definition VPlan.cpp:1655
void buildVPlans(ElementCount MinVF, ElementCount MaxVF)
Build VPlans for power-of-2 VF's between MinVF and MaxVF inclusive, according to the information gath...
Definition VPlan.cpp:1588
VectorizationFactor computeBestVF()
Compute and return the most profitable vectorization factor.
DenseMap< const SCEV *, Value * > executePlan(ElementCount VF, unsigned UF, VPlan &BestPlan, InnerLoopVectorizer &LB, DominatorTree *DT, bool VectorizingEpilogue)
Generate the IR code for the vectorized loop captured in VPlan BestPlan according to the best selecte...
unsigned selectInterleaveCount(VPlan &Plan, ElementCount VF, InstructionCost LoopCost)
void emitInvalidCostRemarks(OptimizationRemarkEmitter *ORE)
Emit remarks for recipes with invalid costs in the available VPlans.
static bool getDecisionAndClampRange(const std::function< bool(ElementCount)> &Predicate, VFRange &Range)
Test a Predicate on a Range of VF's.
Definition VPlan.cpp:1569
void printPlans(raw_ostream &O)
Definition VPlan.cpp:1749
void plan(ElementCount UserVF, unsigned UserIC)
Build VPlans for the specified UserVF and UserIC if they are non-zero or all applicable candidate VFs...
void addMinimumIterationCheck(VPlan &Plan, ElementCount VF, unsigned UF, ElementCount MinProfitableTripCount) const
Create a check to Plan to see if the vector loop should be executed based on its trip count.
bool hasPlanWithVF(ElementCount VF) const
Look through the existing plans and return true if we have one with vectorization factor VF.
This holds vectorization requirements that must be verified late in the process.
Utility class for getting and setting loop vectorizer hints in the form of loop metadata.
bool allowVectorization(Function *F, Loop *L, bool VectorizeOnlyWhenForced) const
bool allowReordering() const
When enabling loop hints are provided we allow the vectorizer to change the order of operations that ...
void emitRemarkWithHints() const
Dumps all the hint information.
const char * vectorizeAnalysisPassName() const
If hints are provided that force vectorization, use the AlwaysPrint pass name to force the frontend t...
This class emits a version of the loop where run-time checks ensure that may-alias pointers can't ove...
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
bool hasLoopInvariantOperands(const Instruction *I) const
Return true if all the operands of the specified instruction are loop invariant.
Definition LoopInfo.cpp:67
DebugLoc getStartLoc() const
Return the debug location of the start of this loop.
Definition LoopInfo.cpp:632
bool isLoopInvariant(const Value *V) const
Return true if the specified value is loop invariant.
Definition LoopInfo.cpp:61
Metadata node.
Definition Metadata.h:1080
This class implements a map that also provides access to all stored values in a deterministic order.
Definition MapVector.h:36
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition MapVector.h:124
Function * getFunction(StringRef Name) const
Look up the specified function in the module symbol table.
Definition Module.cpp:235
Diagnostic information for optimization analysis remarks related to pointer aliasing.
Diagnostic information for optimization analysis remarks related to floating-point non-commutativity.
Diagnostic information for optimization analysis remarks.
The optimization diagnostic interface.
LLVM_ABI void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for missed-optimization remarks.
Diagnostic information for applied optimization remarks.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
op_range incoming_values()
void setIncomingValueForBlock(const BasicBlock *BB, Value *V)
Set every incoming value(s) for block BB to V.
Value * getIncomingValueForBlock(const BasicBlock *BB) const
unsigned getNumIncomingValues() const
Return the number of incoming edges.
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
LLVM_ABI const SCEVPredicate & getPredicate() const
LLVM_ABI unsigned getSmallConstantMaxTripCount()
Returns the upper bound of the loop trip count as a normal unsigned value, or 0 if the trip count is ...
LLVM_ABI const SCEV * getBackedgeTakenCount()
Get the (predicated) backedge count for the analyzed loop.
LLVM_ABI const SCEV * getSCEV(Value *V)
Returns the SCEV expression of V, in the context of the current SCEV predicate.
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
Definition Analysis.h:151
PreservedAnalyses & preserve()
Mark an analysis as preserved.
Definition Analysis.h:132
An analysis pass based on the new PM to deliver ProfileSummaryInfo.
The RecurrenceDescriptor is used to identify recurrences variables in a loop.
static bool isFMulAddIntrinsic(Instruction *I)
Returns true if the instruction is a call to the llvm.fmuladd intrinsic.
FastMathFlags getFastMathFlags() const
static LLVM_ABI unsigned getOpcode(RecurKind Kind)
Returns the opcode corresponding to the RecurrenceKind.
Type * getRecurrenceType() const
Returns the type of the recurrence.
bool hasUsesOutsideReductionChain() const
Returns true if the reduction PHI has any uses outside the reduction chain.
const SmallPtrSet< Instruction *, 8 > & getCastInsts() const
Returns a reference to the instructions used for type-promoting the recurrence.
static bool isFindLastRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
unsigned getMinWidthCastToRecurrenceTypeInBits() const
Returns the minimum width used by the recurrence in bits.
LLVM_ABI SmallVector< Instruction *, 4 > getReductionOpChain(PHINode *Phi, Loop *L) const
Attempts to find a chain of operations from Phi to LoopExitInst that can be treated as a set of reduc...
static bool isAnyOfRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
bool isSigned() const
Returns true if all source operands of the recurrence are SExtInsts.
RecurKind getRecurrenceKind() const
bool isOrdered() const
Returns true if this recurrence is an ordered (strict in-order) FP reduction.
static bool isFindIVRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
std::optional< ArrayRef< PointerDiffInfo > > getDiffChecks() const
const SmallVectorImpl< RuntimePointerCheck > & getChecks() const
Returns the checks that generateChecks created.
This class uses information about analyzed scalars to rewrite expressions in canonical form.
ScalarEvolution * getSE()
bool isInsertedInstruction(Instruction *I) const
Return true if the specified instruction was inserted by the code rewriter.
LLVM_ABI Value * expandCodeForPredicate(const SCEVPredicate *Pred, Instruction *Loc)
Generates a code sequence that evaluates this predicate.
void eraseDeadInstructions(Value *Root)
Remove inserted instructions that are dead, e.g.
virtual bool isAlwaysTrue() const =0
Returns true if the predicate is always true.
This class represents an analyzed expression in the program.
LLVM_ABI bool isZero() const
Return true if the expression is a constant zero.
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
LLVM_ABI const SCEV * getURemExpr(const SCEV *LHS, const SCEV *RHS)
Represents an unsigned remainder expression based on unsigned division.
LLVM_ABI const SCEV * getBackedgeTakenCount(const Loop *L, ExitCountKind Kind=Exact)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCould...
LLVM_ABI const SCEV * getConstant(ConstantInt *V)
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI const SCEV * getTripCountFromExitCount(const SCEV *ExitCount)
A version of getTripCountFromExitCount below which always picks an evaluation type which can not resu...
const SCEV * getOne(Type *Ty)
Return a SCEV for the constant 1 of a specific type.
LLVM_ABI void forgetLoop(const Loop *L)
This method should be called by the client when it has changed a loop in a way that may effect Scalar...
LLVM_ABI bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
LLVM_ABI const SCEV * getElementCount(Type *Ty, ElementCount EC, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
LLVM_ABI void forgetValue(Value *V)
This method should be called by the client when it has changed a value in a way that may effect its v...
LLVM_ABI void forgetBlockAndLoopDispositions(Value *V=nullptr)
Called when the client has changed the disposition of values in a loop or block.
const SCEV * getMinusOne(Type *Ty)
Return a SCEV for the constant -1 of a specific type.
LLVM_ABI void forgetLcssaPhiWithNewPredecessor(Loop *L, PHINode *V)
Forget LCSSA phi node V of loop L to which a new predecessor was added, such that it may no longer be...
LLVM_ABI unsigned getSmallConstantTripCount(const Loop *L)
Returns the exact trip count of the loop if we can compute it, and the result is a small constant.
APInt getUnsignedRangeMax(const SCEV *S)
Determine the max of the unsigned range for a particular SCEV.
LLVM_ABI const SCEV * applyLoopGuards(const SCEV *Expr, const Loop *L)
Try to apply information from loop guards for L to Expr.
LLVM_ABI const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
LLVM_ABI const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
LLVM_ABI bool isKnownPredicate(CmpPredicate Pred, const SCEV *LHS, const SCEV *RHS)
Test if the given expression is known to satisfy the condition described by Pred, LHS,...
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
Definition SetVector.h:57
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:103
void insert_range(Range &&R)
Definition SetVector.h:176
size_type count(const_arg_type key) const
Count the number of elements of a given key in the SetVector.
Definition SetVector.h:262
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:151
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
A SetVector that performs no allocations if smaller than a certain size.
Definition SetVector.h:339
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
reference emplace_back(ArgTypes &&... Args)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
LLVM_ABI std::optional< unsigned > getVScaleForTuning() const
LLVM_ABI bool supportsEfficientVectorElementLoadStore() const
If target has efficient vector element load/store instructions, it can return true here so that inser...
VectorInstrContext
Represents a hint about the context in which an insert/extract is used.
@ None
The insert/extract is not used with a load/store.
@ Load
The value being inserted comes from a load (InsertElement only).
@ Store
The extracted value is stored (ExtractElement only).
LLVM_ABI bool prefersVectorizedAddressing() const
Return true if target doesn't mind addresses in vectors.
LLVM_ABI TypeSize getRegisterBitWidth(RegisterKind K) const
LLVM_ABI InstructionCost getOperandsScalarizationOverhead(ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const
Estimate the overhead of scalarizing operands with the given types.
LLVM_ABI bool preferFixedOverScalableIfEqualCost(bool IsEpilogue) const
LLVM_ABI InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueInfo OpdInfo={OK_AnyValue, OP_None}, const Instruction *I=nullptr) const
LLVM_ABI InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, bool UseMaskForCond=false, bool UseMaskForGaps=false) const
LLVM_ABI InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const
static LLVM_ABI OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
LLVM_ABI bool isElementTypeLegalForScalableVector(Type *Ty) const
LLVM_ABI ElementCount getMinimumVF(unsigned ElemWidth, bool IsScalable) const
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
@ TCK_Latency
The latency of instruction.
LLVM_ABI InstructionCost getMemIntrinsicInstrCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const
LLVM_ABI InstructionCost getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, const SCEV *Ptr, TTI::TargetCostKind CostKind) const
LLVM_ABI bool supportsScalableVectors() const
@ TCC_Free
Expected to fold away in lowering.
LLVM_ABI InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TargetCostKind CostKind) const
Estimate the cost of a given IR user when lowered.
LLVM_ABI InstructionCost getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index) const
LLVM_ABI InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const
Estimate the overhead of scalarizing an instruction.
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_Reverse
Reverse the order of the vector.
LLVM_ABI InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
CastContextHint
Represents a hint about the context in which a cast is used.
@ Reversed
The cast is used with a reversed load/store.
@ Masked
The cast is used with a masked load/store.
@ Normal
The cast is used with a normal load/store.
@ Interleave
The cast is used with an interleaved load/store.
@ GatherScatter
The cast is used with a gather/scatter.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
This class implements a switch-like dispatch statement for a value of 'T' using dyn_cast functionalit...
Definition TypeSwitch.h:89
TypeSwitch< T, ResultT > & Case(CallableT &&caseFn)
Add a case on the given type.
Definition TypeSwitch.h:98
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
LLVM_ABI unsigned getIntegerBitWidth() const
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:273
static LLVM_ABI Type * getVoidTy(LLVMContext &C)
Definition Type.cpp:280
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:352
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:128
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:230
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:293
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:184
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:139
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
op_range operands()
Definition User.h:267
LLVM_ABI bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
Definition User.cpp:25
Value * getOperand(unsigned i) const
Definition User.h:207
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
Definition VectorUtils.h:74
VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph.
Definition VPlan.h:4177
RecipeListTy::iterator iterator
Instruction iterators...
Definition VPlan.h:4204
iterator end()
Definition VPlan.h:4214
iterator begin()
Recipe iterator methods.
Definition VPlan.h:4212
iterator_range< iterator > phis()
Returns an iterator range over the PHI-like recipes in the block.
Definition VPlan.h:4265
InstructionCost cost(ElementCount VF, VPCostContext &Ctx) override
Return the cost of this VPBasicBlock.
Definition VPlan.cpp:779
iterator getFirstNonPhi()
Return the position of the first non-phi node recipe in the block.
Definition VPlan.cpp:232
VPRecipeBase * getTerminator()
If the block has multiple successors, return the branch recipe terminating the block.
Definition VPlan.cpp:639
bool empty() const
Definition VPlan.h:4223
VPBlockBase is the building block of the Hierarchical Control-Flow Graph.
Definition VPlan.h:81
const VPBasicBlock * getExitingBasicBlock() const
Definition VPlan.cpp:202
void setName(const Twine &newName)
Definition VPlan.h:166
size_t getNumSuccessors() const
Definition VPlan.h:219
void swapSuccessors()
Swap successors of the block. The block must have exactly 2 successors.
Definition VPlan.h:322
size_t getNumPredecessors() const
Definition VPlan.h:220
VPlan * getPlan()
Definition VPlan.cpp:177
const VPBasicBlock * getEntryBasicBlock() const
Definition VPlan.cpp:182
VPBlockBase * getSingleSuccessor() const
Definition VPlan.h:209
const VPBlocksTy & getSuccessors() const
Definition VPlan.h:198
static auto blocksOnly(const T &Range)
Return an iterator range over Range which only includes BlockTy blocks.
Definition VPlanUtils.h:269
static void insertOnEdge(VPBlockBase *From, VPBlockBase *To, VPBlockBase *BlockPtr)
Inserts BlockPtr on the edge between From and To.
Definition VPlanUtils.h:290
static void connectBlocks(VPBlockBase *From, VPBlockBase *To, unsigned PredIdx=-1u, unsigned SuccIdx=-1u)
Connect VPBlockBases From and To bi-directionally.
Definition VPlanUtils.h:221
static void reassociateBlocks(VPBlockBase *Old, VPBlockBase *New)
Reassociate all the blocks connected to Old so that they now point to New.
Definition VPlanUtils.h:247
VPlan-based builder utility analogous to IRBuilder.
VPPhi * createScalarPhi(ArrayRef< VPValue * > IncomingValues, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="", const VPIRFlags &Flags={})
VPInstruction * createNaryOp(unsigned Opcode, ArrayRef< VPValue * > Operands, Instruction *Inst=nullptr, const VPIRFlags &Flags={}, const VPIRMetadata &MD={}, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
Create an N-ary operation with Opcode, Operands and set Inst as its underlying Instruction.
Canonical scalar induction phi of the vector loop.
Definition VPlan.h:3752
VPIRValue * getStartValue() const
Returns the start value of the canonical induction.
Definition VPlan.h:3774
unsigned getNumDefinedValues() const
Returns the number of values defined by the VPDef.
Definition VPlanValue.h:427
VPValue * getVPSingleValue()
Returns the only VPValue defined by the VPDef.
Definition VPlanValue.h:400
A pure virtual base class for all recipes modeling header phis, including phis for first order recurr...
Definition VPlan.h:2228
virtual VPValue * getBackedgeValue()
Returns the incoming value from the loop backedge.
Definition VPlan.h:2270
VPValue * getStartValue()
Returns the start value of the phi, if one is set.
Definition VPlan.h:2259
A recipe representing a sequence of load -> update -> store as part of a histogram operation.
Definition VPlan.h:1970
A special type of VPBasicBlock that wraps an existing IR basic block.
Definition VPlan.h:4330
LLVM_ABI_FOR_TEST FastMathFlags getFastMathFlags() const
This is a concrete Recipe that models a single VPlan-level instruction.
Definition VPlan.h:1155
unsigned getNumOperandsWithoutMask() const
Returns the number of operands, excluding the mask if the VPInstruction is masked.
Definition VPlan.h:1384
iterator_range< operand_iterator > operandsWithoutMask()
Returns an iterator range over the operands excluding the mask operand if present.
Definition VPlan.h:1404
@ ComputeAnyOfResult
Compute the final result of an AnyOf reduction with select(cmp(),x,y), where one of (x,...
Definition VPlan.h:1202
@ ResumeForEpilogue
Explicit user for the resume phi of the canonical induction in the main VPlan, used by the epilogue v...
Definition VPlan.h:1260
@ ReductionStartVector
Start vector for reductions with 3 operands: the original start value, the identity value for the red...
Definition VPlan.h:1251
unsigned getOpcode() const
Definition VPlan.h:1334
VPValue * getMask() const
Returns the mask for the VPInstruction.
Definition VPlan.h:1398
bool isMasked() const
Returns true if the VPInstruction has a mask operand.
Definition VPlan.h:1374
VPInterleaveRecipe is a recipe for transforming an interleave group of load or stores into one wide l...
Definition VPlan.h:2891
detail::zippy< llvm::detail::zip_first, VPUser::const_operand_range, const_incoming_blocks_range > incoming_values_and_blocks() const
Returns an iterator range over pairs of incoming values and corresponding incoming blocks.
Definition VPlan.h:1560
VPRecipeBase is a base class modeling a sequence of one or more output IR instructions.
Definition VPlan.h:387
DebugLoc getDebugLoc() const
Returns the debug location of the recipe.
Definition VPlan.h:536
void moveBefore(VPBasicBlock &BB, iplist< VPRecipeBase >::iterator I)
Unlink this recipe and insert into BB before I.
void insertBefore(VPRecipeBase *InsertPos)
Insert an unlinked recipe into a basic block immediately before the specified recipe.
iplist< VPRecipeBase >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Helper class to create VPRecipes from IR instructions.
VPRecipeBase * tryToCreateWidenNonPhiRecipe(VPSingleDefRecipe *R, VFRange &Range)
Create and return a widened recipe for a non-phi recipe R if one can be created within the given VF R...
VPValue * getVPValueOrAddLiveIn(Value *V)
VPReplicateRecipe * handleReplication(VPInstruction *VPI, VFRange &Range)
Build a VPReplicateRecipe for VPI.
bool isOrdered() const
Returns true, if the phi is part of an ordered reduction.
Definition VPlan.h:2682
unsigned getVFScaleFactor() const
Get the factor that the VF of this recipe's output should be scaled by, or 1 if it isn't scaled.
Definition VPlan.h:2661
bool isInLoop() const
Returns true if the phi is part of an in-loop reduction.
Definition VPlan.h:2685
RecurKind getRecurrenceKind() const
Returns the recurrence kind of the reduction.
Definition VPlan.h:2679
A recipe to represent inloop, ordered or partial reduction operations.
Definition VPlan.h:2984
VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks which form a Single-Entry-S...
Definition VPlan.h:4365
const VPBlockBase * getEntry() const
Definition VPlan.h:4401
VPCanonicalIVPHIRecipe * getCanonicalIV()
Returns the canonical induction recipe of the region.
Definition VPlan.h:4463
VPReplicateRecipe replicates a given instruction producing multiple scalar copies of the original sca...
Definition VPlan.h:3138
VPSingleDef is a base class for recipes for modeling a sequence of one or more output IR that define ...
Definition VPlan.h:588
Instruction * getUnderlyingInstr()
Returns the underlying instruction.
Definition VPlan.h:651
An analysis for type-inference for VPValues.
Type * inferScalarType(const VPValue *V)
Infer the type of V. Returns the scalar type of V.
This class augments VPValue with operands which provide the inverse def-use edges from VPValue's user...
Definition VPlanValue.h:258
void setOperand(unsigned I, VPValue *New)
Definition VPlanValue.h:302
operand_iterator op_begin()
Definition VPlanValue.h:322
VPValue * getOperand(unsigned N) const
Definition VPlanValue.h:297
This is the base class of the VPlan Def/Use graph, used for modeling the data flow into,...
Definition VPlanValue.h:46
Value * getLiveInIRValue() const
Return the underlying IR value for a VPIRValue.
Definition VPlan.cpp:137
VPRecipeBase * getDefiningRecipe()
Returns the recipe defining this VPValue or nullptr if it is not defined by a recipe,...
Definition VPlan.cpp:127
Value * getUnderlyingValue() const
Return the underlying Value attached to this VPValue.
Definition VPlanValue.h:71
void replaceAllUsesWith(VPValue *New)
Definition VPlan.cpp:1403
void replaceUsesWithIf(VPValue *New, llvm::function_ref< bool(VPUser &U, unsigned Idx)> ShouldReplace)
Go through the uses list for this VPValue and make each use point to New if the callback ShouldReplac...
Definition VPlan.cpp:1407
user_range users()
Definition VPlanValue.h:125
A recipe to compute a pointer to the last element of each part of a widened memory access for widened...
Definition VPlan.h:2076
VPWidenCastRecipe is a recipe to create vector cast instructions.
Definition VPlan.h:1762
A recipe for handling GEP instructions.
Definition VPlan.h:2012
A recipe for handling phi nodes of integer and floating-point inductions, producing their vector valu...
Definition VPlan.h:2376
A recipe for widened phis.
Definition VPlan.h:2512
VPWidenRecipe is a recipe for producing a widened instruction using the opcode and operands of the re...
Definition VPlan.h:1706
VPlan models a candidate for vectorization, encoding various decisions taken to produce efficient outp...
Definition VPlan.h:4495
bool hasVF(ElementCount VF) const
Definition VPlan.h:4704
VPBasicBlock * getEntry()
Definition VPlan.h:4587
VPValue & getVF()
Returns the VF of the vector loop region.
Definition VPlan.h:4677
VPValue * getTripCount() const
The trip count of the original loop.
Definition VPlan.h:4645
iterator_range< SmallSetVector< ElementCount, 2 >::iterator > vectorFactors() const
Returns an iterator range over all VFs of the plan.
Definition VPlan.h:4711
bool hasUF(unsigned UF) const
Definition VPlan.h:4722
ArrayRef< VPIRBasicBlock * > getExitBlocks() const
Return an ArrayRef containing VPIRBasicBlocks wrapping the exit blocks of the original scalar loop.
Definition VPlan.h:4635
VPIRValue * getOrAddLiveIn(Value *V)
Gets the live-in VPIRValue for V or adds a new live-in (if none exists yet) for V.
Definition VPlan.h:4747
LLVM_ABI_FOR_TEST VPRegionBlock * getVectorLoopRegion()
Returns the VPRegionBlock of the vector loop.
Definition VPlan.cpp:1033
bool hasEarlyExit() const
Returns true if the VPlan is based on a loop with an early exit.
Definition VPlan.h:4861
InstructionCost cost(ElementCount VF, VPCostContext &Ctx)
Return the cost of this plan.
Definition VPlan.cpp:1015
void resetTripCount(VPValue *NewTripCount)
Resets the trip count for the VPlan.
Definition VPlan.h:4659
VPBasicBlock * getMiddleBlock()
Returns the 'middle' block of the plan, that is the block that selects whether to execute the scalar ...
Definition VPlan.h:4612
VPBasicBlock * getScalarPreheader() const
Return the VPBasicBlock for the preheader of the scalar loop.
Definition VPlan.h:4626
void execute(VPTransformState *State)
Generate the IR code for this VPlan.
Definition VPlan.cpp:923
VPIRBasicBlock * getScalarHeader() const
Return the VPIRBasicBlock wrapping the header of the scalar loop.
Definition VPlan.h:4631
VPBasicBlock * getVectorPreheader()
Returns the preheader of the vector loop region, if one exists, or null otherwise.
Definition VPlan.h:4592
LLVM_ABI_FOR_TEST VPlan * duplicate()
Clone the current VPlan, update all VPValues of the new VPlan and cloned recipes to refer to the clon...
Definition VPlan.cpp:1181
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
LLVM_ABI bool hasOneUser() const
Return true if there is exactly one user of this value.
Definition Value.cpp:166
LLVM_ABI void setName(const Twine &Name)
Change the name of the value.
Definition Value.cpp:397
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:553
iterator_range< user_iterator > users()
Definition Value.h:426
LLVM_ABI const Value * stripPointerCasts() const
Strip off pointer casts, all-zero GEPs and address space casts.
Definition Value.cpp:708
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
Base class of all SIMD vector types.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct a VectorType.
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:202
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition DenseSet.h:175
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
static constexpr bool isKnownLE(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition TypeSize.h:230
constexpr bool isNonZero() const
Definition TypeSize.h:155
static constexpr bool isKnownLT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition TypeSize.h:216
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:168
constexpr LeafTy multiplyCoefficientBy(ScalarTy RHS) const
Definition TypeSize.h:256
constexpr bool isFixed() const
Returns true if the quantity is not scaled by vscale.
Definition TypeSize.h:171
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
constexpr bool isZero() const
Definition TypeSize.h:153
static constexpr bool isKnownGT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition TypeSize.h:223
constexpr LeafTy divideCoefficientBy(ScalarTy RHS) const
We do not provide the '/' operator here because division for polynomial types does not work in the sa...
Definition TypeSize.h:252
static constexpr bool isKnownGE(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition TypeSize.h:237
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
IteratorT end() const
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition raw_ostream.h:53
A raw_ostream that writes to an std::string.
Changed
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ Entry
Definition COFF.h:862
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ Tail
Attempts to make calls as fast as possible while guaranteeing that tail call optimization can always b...
Definition CallingConv.h:76
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
std::variant< std::monostate, Loc::Single, Loc::Multi, Loc::MMI, Loc::EntryValue > Variant
Alias for the std::variant specialization base class of DbgVariable.
Definition DwarfDebug.h:189
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
bool match(Val *V, const Pattern &P)
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
auto match_fn(const Pattern &P)
A match functor that can be used as a UnaryPredicate in functional algorithms like all_of.
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
class_match< const SCEVVScale > m_SCEVVScale()
bind_cst_ty m_scev_APInt(const APInt *&C)
Match an SCEV constant and bind it to an APInt.
specificloop_ty m_SpecificLoop(const Loop *L)
cst_pred_ty< is_specific_signed_cst > m_scev_SpecificSInt(int64_t V)
Match an SCEV constant with a plain signed integer (sign-extended value will be matched)
SCEVAffineAddRec_match< Op0_t, Op1_t, class_match< const Loop > > m_scev_AffineAddRec(const Op0_t &Op0, const Op1_t &Op1)
bind_ty< const SCEVMulExpr > m_scev_Mul(const SCEVMulExpr *&V)
bool match(const SCEV *S, const Pattern &P)
SCEVBinaryExpr_match< SCEVMulExpr, Op0_t, Op1_t, SCEV::FlagAnyWrap, true > m_scev_c_Mul(const Op0_t &Op0, const Op1_t &Op1)
class_match< const SCEV > m_SCEV()
AllRecipe_match< Instruction::Select, Op0_t, Op1_t, Op2_t > m_Select(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2)
int_pred_ty< is_zero_int > m_ZeroInt()
Match an integer 0 or a vector with all elements equal to 0.
bool matchFindIVResult(VPInstruction *VPI, Op0_t ReducedIV, Op1_t Start)
Match FindIV result pattern: select(icmp ne ComputeReductionResult(ReducedIV), Sentinel),...
match_combine_or< AllRecipe_match< Instruction::ZExt, Op0_t >, AllRecipe_match< Instruction::SExt, Op0_t > > m_ZExtOrSExt(const Op0_t &Op0)
VPInstruction_match< VPInstruction::ExtractLastLane, Op0_t > m_ExtractLastLane(const Op0_t &Op0)
VPInstruction_match< VPInstruction::BranchOnCount > m_BranchOnCount()
VPInstruction_match< VPInstruction::ExtractLastPart, Op0_t > m_ExtractLastPart(const Op0_t &Op0)
bool match(Val *V, const Pattern &P)
class_match< VPValue > m_VPValue()
Match an arbitrary VPValue and ignore it.
VPInstruction_match< VPInstruction::ExtractLane, Op0_t, Op1_t > m_ExtractLane(const Op0_t &Op0, const Op1_t &Op1)
ValuesClass values(OptsTy... Options)
Helper to build a ValuesClass by forwarding a variable number of arguments as an initializer list to ...
initializer< Ty > init(const Ty &Val)
Add a small namespace to avoid name clashes with the classes used in the streaming interface.
DiagnosticInfoOptimizationBase::Argument NV
NodeAddr< InstrNode * > Instr
Definition RDFGraph.h:389
NodeAddr< PhiNode * > Phi
Definition RDFGraph.h:390
friend class Instruction
Iterator for Instructions in a `BasicBlock.
Definition BasicBlock.h:73
bool isSingleScalar(const VPValue *VPV)
Returns true if VPV is a single scalar, either because it produces the same value for all lanes or on...
VPValue * getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr)
Get or create a VPValue that corresponds to the expansion of Expr.
VPBasicBlock * getFirstLoopHeader(VPlan &Plan, VPDominatorTree &VPDT)
Returns the header block of the first, top-level loop, or null if none exist.
bool isAddressSCEVForCost(const SCEV *Addr, ScalarEvolution &SE, const Loop *L)
Returns true if Addr is an address SCEV that can be passed to TTI::getAddressComputationCost,...
bool onlyFirstLaneUsed(const VPValue *Def)
Returns true if only the first lane of Def is used.
VPIRFlags getFlagsFromIndDesc(const InductionDescriptor &ID)
Extracts and returns NoWrap and FastMath flags from the induction binop in ID.
Definition VPlanUtils.h:94
VPRecipeBase * findRecipe(VPValue *Start, PredT Pred)
Search Start's users for a recipe satisfying Pred, looking through recipes with definitions.
Definition VPlanUtils.h:111
VPSingleDefRecipe * findHeaderMask(VPlan &Plan)
Collect the header mask with the pattern: (ICMP_ULE, WideCanonicalIV, backedge-taken-count) TODO: Int...
const SCEV * getSCEVExprForVPValue(const VPValue *V, PredicatedScalarEvolution &PSE, const Loop *L=nullptr)
Return the SCEV expression for V.
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
LLVM_ABI bool simplifyLoop(Loop *L, DominatorTree *DT, LoopInfo *LI, ScalarEvolution *SE, AssumptionCache *AC, MemorySSAUpdater *MSSAU, bool PreserveLCSSA)
Simplify each loop in a loop nest recursively.
LLVM_ABI void ReplaceInstWithInst(BasicBlock *BB, BasicBlock::iterator &BI, Instruction *I)
Replace the instruction specified by BI with the instruction specified by I.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
@ Offset
Definition DWP.cpp:532
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
Definition STLExtras.h:831
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
LLVM_ABI Value * addRuntimeChecks(Instruction *Loc, Loop *TheLoop, const SmallVectorImpl< RuntimePointerCheck > &PointerChecks, SCEVExpander &Expander, bool HoistRuntimeChecks=false)
Add code that checks at runtime if the accessed arrays in PointerChecks overlap.
auto cast_if_present(const Y &Val)
cast_if_present<X> - Functionally identical to cast, except that a null value is accepted.
Definition Casting.h:683
LLVM_ABI bool RemoveRedundantDbgInstrs(BasicBlock *BB)
Try to remove redundant dbg.value instructions from given basic block.
LLVM_ABI_FOR_TEST cl::opt< bool > VerifyEachVPlan
LLVM_ABI std::optional< unsigned > getLoopEstimatedTripCount(Loop *L, unsigned *EstimatedLoopInvocationWeight=nullptr)
Return either:
static void reportVectorization(OptimizationRemarkEmitter *ORE, Loop *TheLoop, VectorizationFactor VF, unsigned IC)
Report successful vectorization of the loop.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
unsigned getLoadStoreAddressSpace(const Value *I)
A helper function that returns the address space of the pointer operand of load or store instruction.
LLVM_ABI Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1669
LLVM_ABI Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
InstructionCost Cost
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
LLVM_ABI bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
OuterAnalysisManagerProxy< ModuleAnalysisManager, Function > ModuleAnalysisManagerFunctionProxy
Provide the ModuleAnalysisManager to Function proxy.
Value * getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF)
Return the runtime value for VF.
LLVM_ABI bool formLCSSARecursively(Loop &L, const DominatorTree &DT, const LoopInfo *LI, ScalarEvolution *SE)
Put a loop nest into LCSSA form.
Definition LCSSA.cpp:449
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2208
LLVM_ABI bool shouldOptimizeForSize(const MachineFunction *MF, ProfileSummaryInfo *PSI, const MachineBlockFrequencyInfo *BFI, PGSOQueryType QueryType=PGSOQueryType::Other)
Returns true if machine function MF is suggested to be size-optimized based on the profile.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:634
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:284
Align getLoadStoreAlignment(const Value *I)
A helper function that returns the alignment of load or store instruction.
iterator_range< df_iterator< VPBlockShallowTraversalWrapper< VPBlockBase * > > > vp_depth_first_shallow(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order.
Definition VPlanCFG.h:262
LLVM_ABI bool VerifySCEV
LLVM_ABI_FOR_TEST cl::opt< bool > VPlanPrintAfterAll
LLVM_ABI bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr, bool UseVariableInfo=true, bool IgnoreUBImplyingAttrs=true)
Return true if the instruction does not have any effects besides calculating the result and does not ...
bool isa_and_nonnull(const Y &Val)
Definition Casting.h:676
iterator_range< df_iterator< VPBlockDeepTraversalWrapper< VPBlockBase * > > > vp_depth_first_deep(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order while traversing t...
Definition VPlanCFG.h:289
SmallVector< VPRegisterUsage, 8 > calculateRegisterUsageForPlan(VPlan &Plan, ArrayRef< ElementCount > VFs, const TargetTransformInfo &TTI, const SmallPtrSetImpl< const Value * > &ValuesToIgnore)
Estimate the register usage for Plan and vectorization factors in VFs by calculating the highest numb...
unsigned Log2_64(uint64_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:337
LLVM_ABI void setBranchWeights(Instruction &I, ArrayRef< uint32_t > Weights, bool IsExpected, bool ElideAllZero=false)
Create a new branch_weights metadata node and add or overwrite a prof metadata reference to instructi...
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1746
void collectEphemeralRecipesForVPlan(VPlan &Plan, DenseSet< VPRecipeBase * > &EphRecipes)
auto reverse(ContainerTy &&C)
Definition STLExtras.h:408
bool containsIrreducibleCFG(RPOTraversalT &RPOTraversal, const LoopInfoT &LI)
Return true if the control flow in RPOTraversal is irreducible.
Definition CFG.h:149
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1636
LLVM_ABI_FOR_TEST cl::opt< bool > EnableWideActiveLaneMask
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1753
LLVM_ABI cl::opt< bool > EnableLoopVectorization
LLVM_ABI_FOR_TEST cl::list< std::string > VPlanPrintAfterPasses
LLVM_ABI bool wouldInstructionBeTriviallyDead(const Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction would have no side effects if it was not used.
Definition Local.cpp:425
FunctionAddr VTableAddr Count
Definition InstrProf.h:139
SmallVector< ValueTypeFromRangeType< R >, Size > to_vector(R &&Range)
Given a range of type R, iterate the entire range and return a SmallVector with elements of the vecto...
Type * toVectorizedTy(Type *Ty, ElementCount EC)
A helper for converting to vectorized types.
LLVM_ABI void llvm_unreachable_internal(const char *msg=nullptr, const char *file=nullptr, unsigned line=0)
This function calls abort(), and prints the optional message to stderr.
T * find_singleton(R &&Range, Predicate P, bool AllowRepeats=false)
Return the single value in Range that satisfies P(<member of Range> *, AllowRepeats)->T * returning n...
Definition STLExtras.h:1837
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
cl::opt< unsigned > ForceTargetInstructionCost
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition Format.h:129
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
bool canVectorizeTy(Type *Ty)
Returns true if Ty is a valid vector element type, void, or an unpacked literal struct where all elem...
TargetTransformInfo TTI
static void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag, OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I=nullptr, DebugLoc DL={})
Reports an informative message: print Msg for debugging purposes as well as an optimization remark.
LLVM_ABI bool isAssignmentTrackingEnabled(const Module &M)
Return true if assignment tracking is enabled for module M.
RecurKind
These are the kinds of recurrences that we support.
@ Or
Bitwise or logical OR of integers.
@ FMulAdd
Sum of float products with llvm.fmuladd(a * b + sum).
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
LLVM_ABI Value * getRecurrenceIdentity(RecurKind K, Type *Tp, FastMathFlags FMF)
Given information about an recurrence kind, return the identity for the @llvm.vector....
LLVM_ABI BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="")
Split the specified block at the specified instruction.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
LLVM_ABI void reportVectorizationFailure(const StringRef DebugMsg, const StringRef OREMsg, const StringRef ORETag, OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I=nullptr)
Reports a vectorization failure: print DebugMsg for debugging purposes along with the corresponding o...
DWARFExpression::Operation Op
ScalarEpilogueLowering
@ CM_ScalarEpilogueNotAllowedLowTripLoop
@ CM_ScalarEpilogueNotNeededUsePredicate
@ CM_ScalarEpilogueNotAllowedOptSize
@ CM_ScalarEpilogueAllowed
@ CM_ScalarEpilogueNotAllowedUsePredicate
LLVM_ABI bool isGuaranteedNotToBeUndefOrPoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Return true if this function can prove that V does not have undef bits and is never poison.
ArrayRef(const T &OneElt) -> ArrayRef< T >
Value * createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF, int64_t Step)
Return a value for Step multiplied by VF.
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1772
Value * emitTransformedIndex(IRBuilderBase &B, Value *Index, Value *StartValue, Value *Step, InductionDescriptor::InductionKind InductionKind, const BinaryOperator *InductionBinOp)
Compute the transformed value of Index at offset StartValue using step StepValue.
auto predecessors(const MachineBasicBlock *BB)
iterator_range< pointer_iterator< WrappedIteratorT > > make_pointer_range(RangeT &&Range)
Definition iterator.h:368
cl::opt< bool > EnableVPlanNativePath
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
ArrayRef< Type * > getContainedTypes(Type *const &Ty)
Returns the types contained in Ty.
LLVM_ABI Value * addDiffRuntimeChecks(Instruction *Loc, ArrayRef< PointerDiffInfo > Checks, SCEVExpander &Expander, function_ref< Value *(IRBuilderBase &, unsigned)> GetVF, unsigned IC)
bool pred_empty(const BasicBlock *BB)
Definition CFG.h:119
@ DataAndControlFlowWithoutRuntimeCheck
Use predicate to control both data and control flow, but modify the trip count so that a runtime over...
@ None
Don't use tail folding.
@ DataWithEVL
Use predicated EVL instructions for tail-folding.
@ DataAndControlFlow
Use predicate to control both data and control flow.
@ DataWithoutLaneMask
Same as Data, but avoids using the get.active.lane.mask intrinsic to calculate the mask and instead i...
@ Data
Use predicate only to mask operations on data in the loop.
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
LLVM_ABI bool hasBranchWeightMD(const Instruction &I)
Checks if an instructions has Branch Weight Metadata.
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition Hashing.h:592
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition bit.h:330
Type * toVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
std::unique_ptr< VPlan > VPlanPtr
Definition VPlan.h:77
constexpr detail::IsaCheckPredicate< Types... > IsaPred
Function object wrapper for the llvm::isa type check.
Definition Casting.h:866
LLVM_ABI_FOR_TEST bool verifyVPlanIsValid(const VPlan &Plan)
Verify invariants for general VPlans.
LLVM_ABI MapVector< Instruction *, uint64_t > computeMinimumValueSizes(ArrayRef< BasicBlock * > Blocks, DemandedBits &DB, const TargetTransformInfo *TTI=nullptr)
Compute a map of integer instructions to their minimum legal type size.
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition Hashing.h:466
LLVM_ABI_FOR_TEST cl::opt< bool > VPlanPrintVectorRegionScope
LLVM_ABI cl::opt< bool > EnableLoopInterleaving
#define N
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
A special type used by analysis passes to provide an address that identifies that particular analysis...
Definition Analysis.h:29
static LLVM_ABI void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
An information struct used to provide DenseMap with the various necessary components for a given valu...
Encapsulate information regarding vectorization of a loop and its epilogue.
EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF, ElementCount EVF, unsigned EUF, VPlan &EpiloguePlan)
A class that represents two vectorization factors (initialized with 0 by default).
static FixedScalableVFPair getNone()
This holds details about a histogram operation – a load -> update -> store sequence where each lane i...
Incoming for lane mask phi as machine instruction; incoming register Reg and incoming block Block are...
TargetLibraryInfo * TLI
LLVM_ABI LoopVectorizeResult runImpl(Function &F)
LLVM_ABI bool processLoop(Loop *L)
ProfileSummaryInfo * PSI
LoopAccessInfoManager * LAIs
LLVM_ABI void printPipeline(raw_ostream &OS, function_ref< StringRef(StringRef)> MapClassName2PassName)
LLVM_ABI LoopVectorizePass(LoopVectorizeOptions Opts={})
ScalarEvolution * SE
AssumptionCache * AC
LLVM_ABI PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
OptimizationRemarkEmitter * ORE
std::function< BlockFrequencyInfo &()> GetBFI
TargetTransformInfo * TTI
Storage for information about made changes.
A CRTP mix-in to automatically provide informational APIs needed for passes.
Definition PassManager.h:70
A marker analysis to determine if extra passes should be run after loop vectorization.
static LLVM_ABI AnalysisKey Key
Holds the VFShape for a specific scalar to vector function mapping.
std::optional< unsigned > getParamIndexForOptionalMask() const
Instruction Set Architecture.
Encapsulates information needed to describe a parameter.
A range of powers-of-2 vectorization factors with fixed start and adjustable end.
ElementCount End
Struct to hold various analysis needed for cost computations.
unsigned getPredBlockCostDivisor(BasicBlock *BB) const
LoopVectorizationCostModel & CM
bool isLegacyUniformAfterVectorization(Instruction *I, ElementCount VF) const
Return true if I is considered uniform-after-vectorization in the legacy cost model for VF.
bool skipCostComputation(Instruction *UI, bool IsVector) const
Return true if the cost for UI shouldn't be computed, e.g.
InstructionCost getLegacyCost(Instruction *UI, ElementCount VF) const
Return the cost for UI with VF using the legacy cost model as fallback until computing the cost of al...
TargetTransformInfo::TargetCostKind CostKind
SmallPtrSet< Instruction *, 8 > SkipCostComputation
A struct that represents some properties of the register usage of a loop.
VPTransformState holds information passed down when "executing" a VPlan, needed for generating the ou...
A recipe for widening load operations, using the address to load from and an optional mask.
Definition VPlan.h:3540
A recipe for widening store operations, using the stored value, the address to store to and an option...
Definition VPlan.h:3623
static LLVM_ABI_FOR_TEST bool tryToConvertVPInstructionsToVPRecipes(VPlan &Plan, const TargetLibraryInfo &TLI)
Replaces the VPInstructions in Plan with corresponding widen recipes.
static void materializeBroadcasts(VPlan &Plan)
Add explicit broadcasts for live-ins and VPValues defined in Plan's entry block if they are used as v...
static void materializePacksAndUnpacks(VPlan &Plan)
Add explicit Build[Struct]Vector recipes to Pack multiple scalar values into vectors and Unpack recip...
static LLVM_ABI_FOR_TEST std::unique_ptr< VPlan > buildVPlan0(Loop *TheLoop, LoopInfo &LI, Type *InductionTy, DebugLoc IVDL, PredicatedScalarEvolution &PSE, LoopVersioning *LVer=nullptr)
Create a base VPlan0, serving as the common starting point for all later candidates.
static void optimizeInductionExitUsers(VPlan &Plan, DenseMap< VPValue *, VPValue * > &EndValues, PredicatedScalarEvolution &PSE)
If there's a single exit block, optimize its phi recipes that use exiting IV values by feeding them p...
static void materializeFactors(VPlan &Plan, VPBasicBlock *VectorPH, ElementCount VF)
Materialize UF, VF and VFxUF to be computed explicitly using VPInstructions.
static void createInLoopReductionRecipes(VPlan &Plan, const DenseSet< BasicBlock * > &BlocksNeedingPredication, ElementCount MinVF)
Create VPReductionRecipes for in-loop reductions.
static void materializeBackedgeTakenCount(VPlan &Plan, VPBasicBlock *VectorPH)
Materialize the backedge-taken count to be computed explicitly using VPInstructions.
static LLVM_ABI_FOR_TEST void handleEarlyExits(VPlan &Plan, bool HasUncountableExit)
Update Plan to account for all early exits.
static bool handleMultiUseReductions(VPlan &Plan, OptimizationRemarkEmitter *ORE, Loop *TheLoop)
Try to legalize reductions with multiple in-loop uses.
static void dropPoisonGeneratingRecipes(VPlan &Plan, const std::function< bool(BasicBlock *)> &BlockNeedsPredication)
Drop poison flags from recipes that may generate a poison value that is used after vectorization,...
static void convertToVariableLengthStep(VPlan &Plan)
Transform loops with variable-length stepping after region dissolution.
static void createInterleaveGroups(VPlan &Plan, const SmallPtrSetImpl< const InterleaveGroup< Instruction > * > &InterleaveGroups, VPRecipeBuilder &RecipeBuilder, const bool &ScalarEpilogueAllowed)
static void addBranchWeightToMiddleTerminator(VPlan &Plan, ElementCount VF, std::optional< unsigned > VScaleForTuning)
Add branch weight metadata, if the Plan's middle block is terminated by a BranchOnCond recipe.
static std::unique_ptr< VPlan > narrowInterleaveGroups(VPlan &Plan, const TargetTransformInfo &TTI)
Try to find a single VF among Plan's VFs for which all interleave groups (with known minimum VF eleme...
static bool handleFindLastReductions(VPlan &Plan)
Check if Plan contains any FindLast reductions.
static void unrollByUF(VPlan &Plan, unsigned UF)
Explicitly unroll Plan by UF.
static DenseMap< const SCEV *, Value * > expandSCEVs(VPlan &Plan, ScalarEvolution &SE)
Expand VPExpandSCEVRecipes in Plan's entry block.
static void convertToConcreteRecipes(VPlan &Plan)
Lower abstract recipes to concrete ones, that can be codegen'd.
static void expandBranchOnTwoConds(VPlan &Plan)
Expand BranchOnTwoConds instructions into explicit CFG with BranchOnCond instructions.
static void hoistPredicatedLoads(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L)
Hoist predicated loads from the same address to the loop entry block, if they are guaranteed to execu...
static void optimizeFindIVReductions(VPlan &Plan, PredicatedScalarEvolution &PSE, Loop &L)
Optimize FindLast reductions selecting IVs (or expressions of IVs) by converting them to FindIV reduc...
static void convertToAbstractRecipes(VPlan &Plan, VPCostContext &Ctx, VFRange &Range)
This function converts initial recipes to the abstract recipes and clamps Range based on cost model f...
static void materializeConstantVectorTripCount(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
static void addExitUsersForFirstOrderRecurrences(VPlan &Plan, VFRange &Range)
Handle users in the exit block for first order reductions in the original exit block.
static void createHeaderPhiRecipes(VPlan &Plan, PredicatedScalarEvolution &PSE, Loop &OrigLoop, const MapVector< PHINode *, InductionDescriptor > &Inductions, const MapVector< PHINode *, RecurrenceDescriptor > &Reductions, const SmallPtrSetImpl< const PHINode * > &FixedOrderRecurrences, const SmallPtrSetImpl< PHINode * > &InLoopReductions, bool AllowReordering)
Replace VPPhi recipes in Plan's header with corresponding VPHeaderPHIRecipe subclasses for inductions...
static void addExplicitVectorLength(VPlan &Plan, const std::optional< unsigned > &MaxEVLSafeElements)
Add a VPCurrentIterationPHIRecipe and related recipes to Plan and replaces all uses except the canoni...
static void introduceMasksAndLinearize(VPlan &Plan, bool FoldTail)
Predicate and linearize the control-flow in the only loop region of Plan.
static void optimizeEVLMasks(VPlan &Plan)
Optimize recipes which use an EVL-based header mask to VP intrinsics, for example:
static void replaceSymbolicStrides(VPlan &Plan, PredicatedScalarEvolution &PSE, const DenseMap< Value *, const SCEV * > &StridesMap)
Replace symbolic strides from StridesMap in Plan with constants when possible.
static bool handleMaxMinNumReductions(VPlan &Plan)
Check if Plan contains any FMaxNum or FMinNum reductions.
static void removeBranchOnConst(VPlan &Plan)
Remove BranchOnCond recipes with true or false conditions together with removing dead edges to their ...
static LLVM_ABI_FOR_TEST void createLoopRegions(VPlan &Plan)
Replace loops in Plan's flat CFG with VPRegionBlocks, turning Plan's flat CFG into a hierarchical CFG...
static void removeDeadRecipes(VPlan &Plan)
Remove dead recipes from Plan.
static void attachCheckBlock(VPlan &Plan, Value *Cond, BasicBlock *CheckBlock, bool AddBranchWeights)
Wrap runtime check block CheckBlock in a VPIRBB and Cond in a VPValue and connect the block to Plan,...
static void materializeVectorTripCount(VPlan &Plan, VPBasicBlock *VectorPHVPBB, bool TailByMasking, bool RequiresScalarEpilogue)
Materialize vector trip count computations to a set of VPInstructions.
static void simplifyRecipes(VPlan &Plan)
Perform instcombine-like simplifications on recipes in Plan.
static void sinkPredicatedStores(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L)
Sink predicated stores to the same address with complementary predicates (P and NOT P) to an uncondit...
static void addMinimumIterationCheck(VPlan &Plan, ElementCount VF, unsigned UF, ElementCount MinProfitableTripCount, bool RequiresScalarEpilogue, bool TailFolded, Loop *OrigLoop, const uint32_t *MinItersBypassWeights, DebugLoc DL, PredicatedScalarEvolution &PSE)
static void replicateByVF(VPlan &Plan, ElementCount VF)
Replace each replicating VPReplicateRecipe and VPInstruction outside of any replicate region in Plan ...
static void clearReductionWrapFlags(VPlan &Plan)
Clear NSW/NUW flags from reduction instructions if necessary.
static void createPartialReductions(VPlan &Plan, VPCostContext &CostCtx, VFRange &Range)
Detect and create partial reduction recipes for scaled reductions in Plan.
static void cse(VPlan &Plan)
Perform common-subexpression-elimination on Plan.
static void addActiveLaneMask(VPlan &Plan, bool UseActiveLaneMaskForControlFlow, bool DataAndControlFlowWithoutRuntimeCheck)
Replace (ICMP_ULE, wide canonical IV, backedge-taken-count) checks with an (active-lane-mask recipe,...
static LLVM_ABI_FOR_TEST void optimize(VPlan &Plan)
Apply VPlan-to-VPlan optimizations to Plan, including induction recipe optimizations,...
static void dissolveLoopRegions(VPlan &Plan)
Replace loop regions with explicit CFG.
static void truncateToMinimalBitwidths(VPlan &Plan, const MapVector< Instruction *, uint64_t > &MinBWs)
Insert truncates and extends for any truncated recipe.
static bool adjustFixedOrderRecurrences(VPlan &Plan, VPBuilder &Builder)
Try to have all users of fixed-order recurrences appear after the recipe defining their previous valu...
static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Optimize Plan based on BestVF and BestUF.
static void addMinimumVectorEpilogueIterationCheck(VPlan &Plan, Value *TripCount, Value *VectorTripCount, bool RequiresScalarEpilogue, ElementCount EpilogueVF, unsigned EpilogueUF, unsigned MainLoopStep, unsigned EpilogueLoopStep, ScalarEvolution &SE)
Add a check to Plan to see if the epilogue vector loop should be executed.
static void updateScalarResumePhis(VPlan &Plan, DenseMap< VPValue *, VPValue * > &IVEndValues)
Update the resume phis in the scalar preheader after creating wide recipes for first-order recurrence...
static void convertEVLExitCond(VPlan &Plan)
Replaces the exit condition from (branch-on-cond eq CanonicalIVInc, VectorTripCount) to (branch-on-co...
static LLVM_ABI_FOR_TEST void addMiddleCheck(VPlan &Plan, bool RequiresScalarEpilogueCheck, bool TailFolded)
If a check is needed to guard executing the scalar epilogue loop, it will be added to the middle bloc...
TODO: The following VectorizationFactor was pulled out of LoopVectorizationCostModel class.
InstructionCost Cost
Cost of the loop with that width.
ElementCount MinProfitableTripCount
The minimum trip count required to make vectorization profitable, e.g.
ElementCount Width
Vector width with best cost.
InstructionCost ScalarCost
Cost of the scalar loop.
static VectorizationFactor Disabled()
Width 1 means no vectorization, cost 0 means uncomputed cost.
static LLVM_ABI bool HoistRuntimeChecks