1//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10// and generates target-independent LLVM-IR.
11// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12// of instructions in order to estimate the profitability of vectorization.
13//
14// The loop vectorizer combines consecutive loop iterations into a single
15// 'wide' iteration. After this transformation the index is incremented
16// by the SIMD vector width, and not by one.
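// As an illustrative sketch (not the exact IR this pass emits): a scalar loop
//   for (int i = 0; i < n; ++i)
//     A[i] = B[i] + C[i];
// is conceptually rewritten, for a vectorization factor VF, into
//   for (int i = 0; i + VF <= n; i += VF)
//     A[i..i+VF-1] = B[i..i+VF-1] + C[i..i+VF-1];   // one 'wide' iteration
// with the remaining n % VF iterations handled by a scalar epilogue loop or by
// tail folding (see below).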
17//
18// This pass has four parts:
19// 1. The main loop pass that drives the different parts.
20// 2. LoopVectorizationLegality - A unit that checks for the legality
21// of the vectorization.
22// 3. InnerLoopVectorizer - A unit that performs the actual
23// widening of instructions.
24// 4. LoopVectorizationCostModel - A unit that checks for the profitability
25// of vectorization. It decides on the optimal vector width, which
26// can be one, if vectorization is not profitable.
27//
28// There is a development effort going on to migrate the loop vectorizer to the
29// VPlan infrastructure and to introduce outer loop vectorization support (see
30// docs/VectorizationPlan.rst and
31// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32// purpose, we temporarily introduced the VPlan-native vectorization path: an
33// alternative vectorization path that is natively implemented on top of the
34// VPlan infrastructure. See EnableVPlanNativePath for enabling.
35//
36//===----------------------------------------------------------------------===//
37//
38// The reduction-variable vectorization is based on the paper:
39// D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40//
41// Variable uniformity checks are inspired by:
42// Karrenberg, R. and Hack, S. Whole Function Vectorization.
43//
44// The interleaved access vectorization is based on the paper:
45// Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
46// Data for SIMD
47//
48// Other ideas/concepts are from:
49// A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50//
51// S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
52// Vectorizing Compilers.
53//
54//===----------------------------------------------------------------------===//
55
58#include "VPRecipeBuilder.h"
59#include "VPlan.h"
60#include "VPlanAnalysis.h"
61#include "VPlanHCFGBuilder.h"
62#include "VPlanPatternMatch.h"
63#include "VPlanTransforms.h"
64#include "VPlanUtils.h"
65#include "VPlanVerifier.h"
66#include "llvm/ADT/APInt.h"
67#include "llvm/ADT/ArrayRef.h"
68#include "llvm/ADT/DenseMap.h"
70#include "llvm/ADT/Hashing.h"
71#include "llvm/ADT/MapVector.h"
72#include "llvm/ADT/STLExtras.h"
75#include "llvm/ADT/Statistic.h"
76#include "llvm/ADT/StringRef.h"
77#include "llvm/ADT/Twine.h"
78#include "llvm/ADT/TypeSwitch.h"
83#include "llvm/Analysis/CFG.h"
99#include "llvm/IR/Attributes.h"
100#include "llvm/IR/BasicBlock.h"
101#include "llvm/IR/CFG.h"
102#include "llvm/IR/Constant.h"
103#include "llvm/IR/Constants.h"
104#include "llvm/IR/DataLayout.h"
105#include "llvm/IR/DebugInfo.h"
106#include "llvm/IR/DebugLoc.h"
107#include "llvm/IR/DerivedTypes.h"
109#include "llvm/IR/Dominators.h"
110#include "llvm/IR/Function.h"
111#include "llvm/IR/IRBuilder.h"
112#include "llvm/IR/InstrTypes.h"
113#include "llvm/IR/Instruction.h"
114#include "llvm/IR/Instructions.h"
116#include "llvm/IR/Intrinsics.h"
117#include "llvm/IR/MDBuilder.h"
118#include "llvm/IR/Metadata.h"
119#include "llvm/IR/Module.h"
120#include "llvm/IR/Operator.h"
121#include "llvm/IR/PatternMatch.h"
123#include "llvm/IR/Type.h"
124#include "llvm/IR/Use.h"
125#include "llvm/IR/User.h"
126#include "llvm/IR/Value.h"
127#include "llvm/IR/Verifier.h"
128#include "llvm/Support/Casting.h"
130#include "llvm/Support/Debug.h"
145#include <algorithm>
146#include <cassert>
147#include <cstdint>
148#include <functional>
149#include <iterator>
150#include <limits>
151#include <memory>
152#include <string>
153#include <tuple>
154#include <utility>
155
156using namespace llvm;
157
158#define LV_NAME "loop-vectorize"
159#define DEBUG_TYPE LV_NAME
160
161#ifndef NDEBUG
162const char VerboseDebug[] = DEBUG_TYPE "-verbose";
163#endif
164
165/// @{
166/// Metadata attribute names
167const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
169 "llvm.loop.vectorize.followup_vectorized";
171 "llvm.loop.vectorize.followup_epilogue";
172/// @}
173
174STATISTIC(LoopsVectorized, "Number of loops vectorized");
175STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
176STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
177
179 "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
180 cl::desc("Enable vectorization of epilogue loops."));
181
183 "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
184 cl::desc("When epilogue vectorization is enabled, and a value greater than "
185 "1 is specified, forces the given VF for all applicable epilogue "
186 "loops."));
187
189 "epilogue-vectorization-minimum-VF", cl::Hidden,
190 cl::desc("Only loops with vectorization factor equal to or larger than "
191 "the specified value are considered for epilogue vectorization."));
192
193/// Loops with a known constant trip count below this number are vectorized only
194/// if no scalar iteration overheads are incurred.
196 "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
197 cl::desc("Loops with a constant trip count that is smaller than this "
198 "value are vectorized only if no scalar iteration overheads "
199 "are incurred."));
200
202 "vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
203 cl::desc("The maximum allowed number of runtime memory checks"));
204
205// The prefer-predicate-over-epilogue option indicates that an epilogue is
206// undesired and that predication is preferred; the enum below lists the
207// choices. I.e., the vectorizer will try to fold the tail loop (epilogue) into
208// the vector body and predicate the instructions accordingly. If tail-folding
209// fails, the fallback strategy depends on these values (see the sketch below):
211 enum Option {
215 };
216} // namespace PreferPredicateTy
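// Illustrative sketch of the two strategies (assuming a simple counted loop
// with trip count n and vectorization factor VF): with a scalar epilogue, the
// vector body runs floor(n / VF) wide iterations and a scalar loop finishes
// the remaining n % VF iterations; with tail folding, the vector body runs
// ceil(n / VF) wide iterations and a predicate (e.g. one produced by the
// llvm.get.active.lane.mask intrinsic) disables the lanes past n in the final
// iteration, so no scalar epilogue is needed.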
217
219 "prefer-predicate-over-epilogue",
222 cl::desc("Tail-folding and predication preferences over creating a scalar "
223 "epilogue loop."),
225 "scalar-epilogue",
226 "Don't tail-predicate loops, create scalar epilogue"),
228 "predicate-else-scalar-epilogue",
229 "prefer tail-folding, create scalar epilogue if tail "
230 "folding fails."),
232 "predicate-dont-vectorize",
233 "prefers tail-folding, don't attempt vectorization if "
234 "tail-folding fails.")));
235
237 "force-tail-folding-style", cl::desc("Force the tail folding style"),
238 cl::init(TailFoldingStyle::None),
240 clEnumValN(TailFoldingStyle::None, "none", "Disable tail folding"),
242 TailFoldingStyle::Data, "data",
243 "Create lane mask for data only, using active.lane.mask intrinsic"),
244 clEnumValN(TailFoldingStyle::DataWithoutLaneMask,
245 "data-without-lane-mask",
246 "Create lane mask with compare/stepvector"),
247 clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control",
248 "Create lane mask using active.lane.mask intrinsic, and use "
249 "it for both data and control flow"),
250 clEnumValN(TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck,
251 "data-and-control-without-rt-check",
252 "Similar to data-and-control, but remove the runtime check"),
253 clEnumValN(TailFoldingStyle::DataWithEVL, "data-with-evl",
254 "Use predicated EVL instructions for tail folding. If EVL "
255 "is unsupported, fallback to data-without-lane-mask.")));
256
258 "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
259 cl::desc("Maximize bandwidth when selecting vectorization factor which "
260 "will be determined by the smallest type in loop."));
261
263 "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
264 cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
265
266/// An interleave-group may need masking if it resides in a block that needs
267/// predication, or in order to mask away gaps.
269 "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
270 cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
271
273 "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
274 cl::desc("A flag that overrides the target's number of scalar registers."));
275
277 "force-target-num-vector-regs", cl::init(0), cl::Hidden,
278 cl::desc("A flag that overrides the target's number of vector registers."));
279
281 "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
282 cl::desc("A flag that overrides the target's max interleave factor for "
283 "scalar loops."));
284
286 "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
287 cl::desc("A flag that overrides the target's max interleave factor for "
288 "vectorized loops."));
289
291 "force-target-instruction-cost", cl::init(0), cl::Hidden,
292 cl::desc("A flag that overrides the target's expected cost for "
293 "an instruction to a single constant value. Mostly "
294 "useful for getting consistent testing."));
295
297 "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
298 cl::desc(
299 "Pretend that scalable vectors are supported, even if the target does "
300 "not support them. This flag should only be used for testing."));
301
303 "small-loop-cost", cl::init(20), cl::Hidden,
304 cl::desc(
305 "The cost of a loop that is considered 'small' by the interleaver."));
306
308 "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
309 cl::desc("Enable the use of the block frequency analysis to access PGO "
310 "heuristics minimizing code growth in cold regions and being more "
311 "aggressive in hot regions."));
312
313// Interleave loops at runtime to increase load/store throughput.
315 "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
316 cl::desc(
317 "Enable runtime interleaving until load/store ports are saturated"));
318
319/// The number of stores in a loop that are allowed to need predication.
321 "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
322 cl::desc("Max number of stores to be predicated behind an if."));
323
325 "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
326 cl::desc("Count the induction variable only once when interleaving"));
327
329 "enable-cond-stores-vec", cl::init(true), cl::Hidden,
330 cl::desc("Enable if predication of stores during vectorization."));
331
333 "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
334 cl::desc("The maximum interleave count to use when interleaving a scalar "
335 "reduction in a nested loop."));
336
337static cl::opt<bool>
338 PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
340 cl::desc("Prefer in-loop vector reductions, "
341 "overriding the targets preference."));
342
344 "force-ordered-reductions", cl::init(false), cl::Hidden,
345 cl::desc("Enable the vectorisation of loops with in-order (strict) "
346 "FP reductions"));
347
349 "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
350 cl::desc(
351 "Prefer predicating a reduction operation over an after loop select."));
352
353namespace llvm {
355 "enable-vplan-native-path", cl::Hidden,
356 cl::desc("Enable VPlan-native vectorization path with "
357 "support for outer loop vectorization."));
358} // namespace llvm
359
360// This flag enables the stress testing of the VPlan H-CFG construction in the
361// VPlan-native vectorization path. It must be used in conjunction with
362// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
363// verification of the H-CFGs built.
365 "vplan-build-stress-test", cl::init(false), cl::Hidden,
366 cl::desc(
367 "Build VPlan for every supported loop nest in the function and bail "
368 "out right after the build (stress test the VPlan H-CFG construction "
369 "in the VPlan-native vectorization path)."));
370
372 "interleave-loops", cl::init(true), cl::Hidden,
373 cl::desc("Enable loop interleaving in Loop vectorization passes"));
375 "vectorize-loops", cl::init(true), cl::Hidden,
376 cl::desc("Run the Loop vectorization passes"));
377
379 "force-widen-divrem-via-safe-divisor", cl::Hidden,
380 cl::desc(
381 "Override cost based safe divisor widening for div/rem instructions"));
382
384 "vectorizer-maximize-bandwidth-for-vector-calls", cl::init(true),
386 cl::desc("Try wider VFs if they enable the use of vector variants"));
387
389 "enable-early-exit-vectorization", cl::init(false), cl::Hidden,
390 cl::desc(
391 "Enable vectorization of early exit loops with uncountable exits."));
392
393// Likelihood of bypassing the vectorized loop because assumptions about SCEV
394// variables not overflowing do not hold. See `emitSCEVChecks`.
395static constexpr uint32_t SCEVCheckBypassWeights[] = {1, 127};
396// Likelihood of bypassing the vectorized loop because pointers overlap. See
397// `emitMemRuntimeChecks`.
398static constexpr uint32_t MemCheckBypassWeights[] = {1, 127};
399// Likelihood of bypassing the vectorized loop because there are zero trips left
400// after prolog. See `emitIterationCountCheck`.
401static constexpr uint32_t MinItersBypassWeights[] = {1, 127};
402
403/// A helper function that returns true if the given type is irregular. The
404/// type is irregular if its allocated size doesn't equal the store size of an
405/// element of the corresponding vector type.
406static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
407 // Determine if an array of N elements of type Ty is "bitcast compatible"
408 // with a <N x Ty> vector.
409 // This is only true if there is no padding between the array elements.
410 return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
411}
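// For example, with a typical DataLayout an i1 has a type size of 1 bit but an
// alloc size of 8 bits, so an array of N i1 values is not bitcast-compatible
// with <N x i1> and hasIrregularType returns true; for i32 both sizes are 32
// bits and it returns false. (Illustrative; the exact sizes depend on the
// target's DataLayout.)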
412
413/// Returns "best known" trip count for the specified loop \p L as defined by
414/// the following procedure:
415/// 1) Returns exact trip count if it is known.
416/// 2) Returns expected trip count according to profile data if any.
417/// 3) Returns upper bound estimate if known, and if \p CanUseConstantMax.
418/// 4) Returns std::nullopt if all of the above failed.
419static std::optional<unsigned>
421 bool CanUseConstantMax = true) {
422 // Check if exact trip count is known.
423 if (unsigned ExpectedTC = PSE.getSE()->getSmallConstantTripCount(L))
424 return ExpectedTC;
425
426 // Check if there is an expected trip count available from profile data.
428 if (auto EstimatedTC = getLoopEstimatedTripCount(L))
429 return *EstimatedTC;
430
431 if (!CanUseConstantMax)
432 return std::nullopt;
433
434 // Check if upper bound estimate is known.
435 if (unsigned ExpectedTC = PSE.getSmallConstantMaxTripCount())
436 return ExpectedTC;
437
438 return std::nullopt;
439}
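// For example (illustrative): if SCEV knows an exact trip count of 1000, step 1
// returns 1000 and profile data is not consulted; if the trip count is unknown
// but profile data estimates about 8 iterations, step 2 returns that estimate.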
440
441namespace {
442// Forward declare GeneratedRTChecks.
443class GeneratedRTChecks;
444
445using SCEV2ValueTy = DenseMap<const SCEV *, Value *>;
446} // namespace
447
448namespace llvm {
449
451
452/// InnerLoopVectorizer vectorizes loops which contain only one basic
453/// block to a specified vectorization factor (VF).
454/// This class performs the widening of scalars into vectors, or multiple
455/// scalars. This class also implements the following features:
456/// * It inserts an epilogue loop for handling loops that don't have iteration
457/// counts that are known to be a multiple of the vectorization factor.
458/// * It handles the code generation for reduction variables.
459/// * Scalarization (implementation using scalars) of un-vectorizable
460/// instructions.
461/// InnerLoopVectorizer does not perform any vectorization-legality
462/// checks, and relies on the caller to check for the different legality
463/// aspects. The InnerLoopVectorizer relies on the
464/// LoopVectorizationLegality class to provide information about the induction
465/// and reduction variables that were found to a given vectorization factor.
467public:
470 const TargetLibraryInfo *TLI,
474 unsigned UnrollFactor, LoopVectorizationLegality *LVL,
476 ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks,
477 VPlan &Plan)
478 : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
479 AC(AC), ORE(ORE), VF(VecWidth),
481 Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
483 VectorPHVPB(Plan.getEntry()->getSingleSuccessor()) {
484 // Query this against the original loop and save it here because the profile
485 // of the original loop header may change as the transformation happens.
488 }
489
490 virtual ~InnerLoopVectorizer() = default;
491
492 /// Create a new empty loop that will contain vectorized instructions later
493 /// on, while the old loop will be used as the scalar remainder. Control flow
494 /// is generated around the vectorized (and scalar epilogue) loops consisting
495 /// of various checks and bypasses. Return the pre-header block of the new
496/// loop. In the case of epilogue vectorization, this function is overridden to
497 /// handle the more complex control flow around the loops. \p ExpandedSCEVs is
498 /// used to look up SCEV expansions for expressions needed during skeleton
499 /// creation.
500 virtual BasicBlock *
501 createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs);
502
503 /// Fix the vectorized code, taking care of header phi's, and more.
505
506 // Return true if any runtime check is added.
508
509 /// A helper function to scalarize a single Instruction in the innermost loop.
510 /// Generates a sequence of scalar instances for each lane between \p MinLane
511 /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
512 /// inclusive. Uses the VPValue operands from \p RepRecipe instead of \p
513 /// Instr's operands.
514 void scalarizeInstruction(const Instruction *Instr,
515 VPReplicateRecipe *RepRecipe, const VPLane &Lane,
516 VPTransformState &State);
517
518 /// Fix the non-induction PHIs in \p Plan.
520
521 /// Returns the original loop trip count.
522 Value *getTripCount() const { return TripCount; }
523
524 /// Used to set the trip count after ILV's construction and after the
525 /// preheader block has been executed. Note that this always holds the trip
526 /// count of the original loop for both main loop and epilogue vectorization.
527 void setTripCount(Value *TC) { TripCount = TC; }
528
529 /// Retrieve the additional bypass value associated with an original
530 /// induction header phi.
532 return Induction2AdditionalBypassValue.at(OrigPhi);
533 }
534
535 /// Return the additional bypass block which targets the scalar loop by
536 /// skipping the epilogue loop after completing the main loop.
539 "Trying to access AdditionalBypassBlock but it has not been set");
541 }
542
543protected:
545
546 /// Iteratively sink the scalarized operands of a predicated instruction into
547 /// the block that was created for it.
548 void sinkScalarOperands(Instruction *PredInst);
549
550 /// Returns (and creates if needed) the trip count of the widened loop.
552
553 /// Emit a bypass check to see if the vector trip count is zero, including if
554 /// it overflows.
556
557 /// Emit a bypass check to see if all of the SCEV assumptions we've
558 /// had to make are correct. Returns the block containing the checks or
559 /// nullptr if no checks have been added.
561
562 /// Emit bypass checks to check any memory assumptions we may have made.
563 /// Returns the block containing the checks or nullptr if no checks have been
564 /// added.
566
567 /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
568 /// vector loop preheader, middle block and scalar preheader.
570
571 /// Create and record the values for induction variables to resume coming from
572 /// the additional bypass block.
573 void createInductionAdditionalBypassValues(const SCEV2ValueTy &ExpandedSCEVs,
574 Value *MainVectorTripCount);
575
576 /// Allow subclasses to override and print debug traces before/after vplan
577 /// execution, when trace information is requested.
578 virtual void printDebugTracesAtStart() {}
579 virtual void printDebugTracesAtEnd() {}
580
581 /// Introduces a new VPIRBasicBlock for \p CheckIRBB to Plan between the
582 /// vector preheader and its predecessor, also connecting the new block to the
583 /// scalar preheader.
584 void introduceCheckBlockInVPlan(BasicBlock *CheckIRBB);
585
586 /// The original loop.
588
589 /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
590 /// dynamic knowledge to simplify SCEV expressions and converts them to a
591 /// more usable form.
593
594 /// Loop Info.
596
597 /// Dominator Tree.
599
600 /// Target Library Info.
602
603 /// Target Transform Info.
605
606 /// Assumption Cache.
608
609 /// Interface to emit optimization remarks.
611
612 /// The vectorization SIMD factor to use. Each vector will have this many
613 /// vector elements.
615
617
618 /// The vectorization unroll factor to use. Each scalar is vectorized to this
619 /// many different vector instructions.
620 unsigned UF;
621
622 /// The builder that we use
624
625 // --- Vectorization state ---
626
627 /// The vector-loop preheader.
629
630 /// The scalar-loop preheader.
632
633 /// Middle Block between the vector and the scalar.
635
636 /// A list of all bypass blocks. The first block is the entry of the loop.
638
639 /// Store instructions that were predicated.
641
642 /// Trip count of the original loop.
643 Value *TripCount = nullptr;
644
645 /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
647
648 /// The legality analysis.
650
651 /// The profitability analysis.
653
654 // Record whether runtime checks are added.
655 bool AddedSafetyChecks = false;
656
657 /// BFI and PSI are used to check for profile guided size optimizations.
660
661 // Whether this loop should be optimized for size based on profile guided size
662 // optimizations.
664
665 /// Structure to hold information about generated runtime checks, responsible
666 /// for cleaning the checks, if vectorization turns out unprofitable.
667 GeneratedRTChecks &RTChecks;
668
669 /// Mapping of induction phis to their additional bypass values. They
670 /// need to be added as operands to phi nodes in the scalar loop preheader
671 /// after the epilogue skeleton has been created.
673
674 /// The additional bypass block which conditionally skips over the epilogue
675 /// loop after executing the main loop. Needed to resume inductions and
676 /// reductions during epilogue vectorization.
678
680
681 /// The vector preheader block of \p Plan, used as target for check blocks
682 /// introduced during skeleton creation.
684};
685
686/// Encapsulate information regarding vectorization of a loop and its epilogue.
687/// This information is meant to be updated and used across two stages of
688/// epilogue vectorization.
691 unsigned MainLoopUF = 0;
693 unsigned EpilogueUF = 0;
698 Value *TripCount = nullptr;
701
703 ElementCount EVF, unsigned EUF,
705 : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF),
707 assert(EUF == 1 &&
708 "A high UF for the epilogue loop is likely not beneficial.");
709 }
710};
711
712/// An extension of the inner loop vectorizer that creates a skeleton for a
713/// vectorized loop that has its epilogue (residual) also vectorized.
714/// The idea is to run the vplan on a given loop twice, firstly to set up the
715/// skeleton and vectorize the main loop, and secondly to complete the skeleton
716/// from the first step and vectorize the epilogue. This is achieved by
717/// deriving two concrete strategy classes from this base class and invoking
718/// them in succession from the loop vectorizer planner.
720public:
728 GeneratedRTChecks &Checks, VPlan &Plan)
730 EPI.MainLoopVF, EPI.MainLoopVF, EPI.MainLoopUF, LVL,
731 CM, BFI, PSI, Checks, Plan),
732 EPI(EPI) {}
733
734 // Override this function to handle the more complex control flow around the
735 // three loops.
736 BasicBlock *
737 createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final {
738 return createEpilogueVectorizedLoopSkeleton(ExpandedSCEVs);
739 }
740
741 /// The interface for creating a vectorized skeleton using one of two
742 /// different strategies, each corresponding to one execution of the vplan
743 /// as described above.
744 virtual BasicBlock *
745 createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) = 0;
746
747 /// Holds and updates state information required to vectorize the main loop
748 /// and its epilogue in two separate passes. This setup helps us avoid
749 /// regenerating and recomputing runtime safety checks. It also helps us to
750 /// shorten the iteration-count-check path length for the cases where the
751 /// iteration count of the loop is so small that the main vector loop is
752 /// completely skipped.
754};
755
756/// A specialized derived class of inner loop vectorizer that performs
757/// vectorization of *main* loops in the process of vectorizing loops and their
758/// epilogues.
760public:
768 GeneratedRTChecks &Check, VPlan &Plan)
770 EPI, LVL, CM, BFI, PSI, Check, Plan) {}
771 /// Implements the interface for creating a vectorized skeleton using the
772 /// *main loop* strategy (i.e. the first pass of vplan execution).
773 BasicBlock *
774 createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final;
775
776protected:
777 /// Emits an iteration count bypass check once for the main loop (when \p
778 /// ForEpilogue is false) and once for the epilogue loop (when \p
779 /// ForEpilogue is true).
780 BasicBlock *emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue);
781 void printDebugTracesAtStart() override;
782 void printDebugTracesAtEnd() override;
783};
784
785// A specialized derived class of inner loop vectorizer that performs
786// vectorization of *epilogue* loops in the process of vectorizing loops and
787// their epilogues.
789public:
797 GeneratedRTChecks &Checks, VPlan &Plan)
799 EPI, LVL, CM, BFI, PSI, Checks, Plan) {
801 }
802 /// Implements the interface for creating a vectorized skeleton using the
803 /// *epilogue loop* strategy (i.e. the second pass of vplan execution).
804 BasicBlock *
805 createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final;
806
807protected:
808 /// Emits an iteration count bypass check after the main vector loop has
809 /// finished to see if there are any iterations left to execute by either
810 /// the vector epilogue or the scalar epilogue.
812 BasicBlock *Bypass,
813 BasicBlock *Insert);
814 void printDebugTracesAtStart() override;
815 void printDebugTracesAtEnd() override;
816};
817} // end namespace llvm
818
819/// Look for a meaningful debug location on the instruction or its operands.
821 if (!I)
822 return DebugLoc();
823
824 DebugLoc Empty;
825 if (I->getDebugLoc() != Empty)
826 return I->getDebugLoc();
827
828 for (Use &Op : I->operands()) {
829 if (Instruction *OpInst = dyn_cast<Instruction>(Op))
830 if (OpInst->getDebugLoc() != Empty)
831 return OpInst->getDebugLoc();
832 }
833
834 return I->getDebugLoc();
835}
836
837/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
838/// is passed, the message relates to that particular instruction.
839#ifndef NDEBUG
840static void debugVectorizationMessage(const StringRef Prefix,
841 const StringRef DebugMsg,
842 Instruction *I) {
843 dbgs() << "LV: " << Prefix << DebugMsg;
844 if (I != nullptr)
845 dbgs() << " " << *I;
846 else
847 dbgs() << '.';
848 dbgs() << '\n';
849}
850#endif
851
852/// Create an analysis remark that explains why vectorization failed
853///
854/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
855/// RemarkName is the identifier for the remark. If \p I is passed it is an
856/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
857/// the location of the remark. If \p DL is passed, use it as debug location for
858/// the remark. \return the remark object that can be streamed to.
860createLVAnalysis(const char *PassName, StringRef RemarkName, Loop *TheLoop,
861 Instruction *I, DebugLoc DL = {}) {
862 Value *CodeRegion = I ? I->getParent() : TheLoop->getHeader();
863 // If debug location is attached to the instruction, use it. Otherwise if DL
864 // was not provided, use the loop's.
865 if (I && I->getDebugLoc())
866 DL = I->getDebugLoc();
867 else if (!DL)
868 DL = TheLoop->getStartLoc();
869
870 return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
871}
872
873namespace llvm {
874
875/// Return a value for Step multiplied by VF.
877 int64_t Step) {
878 assert(Ty->isIntegerTy() && "Expected an integer step");
879 return B.CreateElementCount(Ty, VF.multiplyCoefficientBy(Step));
880}
881
882/// Return the runtime value for VF.
884 return B.CreateElementCount(Ty, VF);
885}
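// For example (illustrative): with a fixed VF of 4 and Step 2, createStepForVF
// returns the constant 8; with a scalable VF of <vscale x 4> it returns a
// runtime value equivalent to vscale * 8, which is what CreateElementCount
// materializes for scalable element counts.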
886
888 const StringRef OREMsg, const StringRef ORETag,
889 OptimizationRemarkEmitter *ORE, Loop *TheLoop,
890 Instruction *I) {
891 LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
892 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
893 ORE->emit(
894 createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
895 << "loop not vectorized: " << OREMsg);
896}
897
898/// Reports an informative message: print \p Msg for debugging purposes as well
899/// as an optimization remark. Uses either \p I as location of the remark, or
900/// otherwise \p TheLoop. If \p DL is passed, use it as debug location for the
901/// remark.
902static void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
904 Loop *TheLoop, Instruction *I = nullptr,
905 DebugLoc DL = {}) {
907 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
908 ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop,
909 I, DL)
910 << Msg);
911}
912
913/// Report successful vectorization of the loop. In case an outer loop is
914/// vectorized, prepend "outer" to the vectorization remark.
916 VectorizationFactor VF, unsigned IC) {
918 "Vectorizing: ", TheLoop->isInnermost() ? "innermost loop" : "outer loop",
919 nullptr));
920 StringRef LoopType = TheLoop->isInnermost() ? "" : "outer ";
921 ORE->emit([&]() {
922 return OptimizationRemark(LV_NAME, "Vectorized", TheLoop->getStartLoc(),
923 TheLoop->getHeader())
924 << "vectorized " << LoopType << "loop (vectorization width: "
925 << ore::NV("VectorizationFactor", VF.Width)
926 << ", interleaved count: " << ore::NV("InterleaveCount", IC) << ")";
927 });
928}
929
930} // end namespace llvm
931
932namespace llvm {
933
934// Hints for the loop vectorization cost model about how the scalar epilogue
935// loop should be lowered.
937
938 // The default: allowing scalar epilogues.
940
941 // Vectorization with OptForSize: don't allow epilogues.
943
944 // A special case of vectorization with OptForSize: loops with a very small
945 // trip count are considered for vectorization under OptForSize, thereby
946 // making sure the cost of their loop body is dominant, free of runtime
947 // guards and scalar iteration overheads.
949
950 // Loop hint predicate indicating an epilogue is undesired.
952
953 // Directive indicating we must either tail fold or not vectorize
956
957using InstructionVFPair = std::pair<Instruction *, ElementCount>;
958
959/// LoopVectorizationCostModel - estimates the expected speedups due to
960/// vectorization.
961/// In many cases vectorization is not profitable. This can happen because of
962/// a number of reasons. In this class we mainly attempt to predict the
963/// expected speedup/slowdowns due to the supported instruction set. We use the
964/// TargetTransformInfo to query the different backends for the cost of
965/// different operations.
968
969public:
979 : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
980 TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
981 Hints(Hints), InterleaveInfo(IAI), CostKind(TTI::TCK_RecipThroughput) {}
982
983 /// \return An upper bound for the vectorization factors (both fixed and
984 /// scalable). If the factors are 0, vectorization and interleaving should be
985 /// avoided up front.
986 FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);
987
988 /// \return True if runtime checks are required for vectorization, and false
989 /// otherwise.
991
992 /// Setup cost-based decisions for user vectorization factor.
993 /// \return true if the UserVF is a feasible VF to be chosen.
997 return expectedCost(UserVF).isValid();
998 }
999
1000 /// \return The size (in bits) of the smallest and widest types in the code
1001 /// that needs to be vectorized. We ignore values that remain scalar such as
1002 /// 64 bit loop indices.
1003 std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1004
1005 /// \return The desired interleave count.
1006 /// If interleave count has been specified by metadata it will be returned.
1007 /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1008 /// are the selected vectorization factor and the cost of the selected VF.
1009 unsigned selectInterleaveCount(ElementCount VF, InstructionCost LoopCost);
1010
1011 /// A memory access instruction may be vectorized in more than one way; the
1012 /// form of the instruction after vectorization depends on cost.
1013 /// This function takes cost-based decisions for Load/Store instructions
1014 /// and collects them in a map. This decision map is used for building
1015 /// the lists of loop-uniform and loop-scalar instructions.
1016 /// The calculated cost is saved with the widening decision in order to
1017 /// avoid redundant calculations.
1019
1020 /// A call may be vectorized in different ways depending on whether we have
1021 /// vectorized variants available and whether the target supports masking.
1022 /// This function analyzes all calls in the function at the supplied VF,
1023 /// makes a decision based on the costs of available options, and stores that
1024 /// decision in a map for use in planning and plan execution.
1026
1027 /// A struct that represents some properties of the register usage
1028 /// of a loop.
1030 /// Holds the number of loop invariant values that are used in the loop.
1031 /// The key is ClassID of target-provided register class.
1033 /// Holds the maximum number of concurrent live intervals in the loop.
1034 /// The key is ClassID of target-provided register class.
1036 };
1037
1038 /// \return Returns information about the register usages of the loop for the
1039 /// given vectorization factors.
1042
1043 /// Collect values we want to ignore in the cost model.
1044 void collectValuesToIgnore();
1045
1046 /// Collect all element types in the loop for which widening is needed.
1048
1049 /// Split reductions into those that happen in the loop, and those that happen
1050 /// outside. In-loop reductions are collected into InLoopReductions.
1052
1053 /// Returns true if we should use strict in-order reductions for the given
1054 /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
1055 /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
1056 /// of FP operations.
1057 bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const {
1058 return !Hints->allowReordering() && RdxDesc.isOrdered();
1059 }
1060
1061 /// \returns The smallest bitwidth each instruction can be represented with.
1062 /// The vector equivalents of these instructions should be truncated to this
1063 /// type.
1065 return MinBWs;
1066 }
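// For example (illustrative): if an i64 value computed in the loop is known to
// need only 32 bits, MinBWs maps it to 32 and its vector equivalent can be
// computed as <VF x i32>, with extensions inserted only where the wider type
// is actually required.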
1067
1068 /// \returns True if it is more profitable to scalarize instruction \p I for
1069 /// vectorization factor \p VF.
1071 assert(VF.isVector() &&
1072 "Profitable to scalarize relevant only for VF > 1.");
1073 assert(
1074 TheLoop->isInnermost() &&
1075 "cost-model should not be used for outer loops (in VPlan-native path)");
1076
1077 auto Scalars = InstsToScalarize.find(VF);
1078 assert(Scalars != InstsToScalarize.end() &&
1079 "VF not yet analyzed for scalarization profitability");
1080 return Scalars->second.contains(I);
1081 }
1082
1083 /// Returns true if \p I is known to be uniform after vectorization.
1085 assert(
1086 TheLoop->isInnermost() &&
1087 "cost-model should not be used for outer loops (in VPlan-native path)");
1088 // Pseudo probe needs to be duplicated for each unrolled iteration and
1089 // vector lane so that profiled loop trip count can be accurately
1090 // accumulated instead of being undercounted.
1091 if (isa<PseudoProbeInst>(I))
1092 return false;
1093
1094 if (VF.isScalar())
1095 return true;
1096
1097 auto UniformsPerVF = Uniforms.find(VF);
1098 assert(UniformsPerVF != Uniforms.end() &&
1099 "VF not yet analyzed for uniformity");
1100 return UniformsPerVF->second.count(I);
1101 }
1102
1103 /// Returns true if \p I is known to be scalar after vectorization.
1105 assert(
1106 TheLoop->isInnermost() &&
1107 "cost-model should not be used for outer loops (in VPlan-native path)");
1108 if (VF.isScalar())
1109 return true;
1110
1111 auto ScalarsPerVF = Scalars.find(VF);
1112 assert(ScalarsPerVF != Scalars.end() &&
1113 "Scalar values are not calculated for VF");
1114 return ScalarsPerVF->second.count(I);
1115 }
1116
1117 /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1118 /// for vectorization factor \p VF.
1120 return VF.isVector() && MinBWs.contains(I) &&
1121 !isProfitableToScalarize(I, VF) &&
1123 }
1124
1125 /// Decision that was taken during cost calculation for memory instruction.
1128 CM_Widen, // For consecutive accesses with stride +1.
1129 CM_Widen_Reverse, // For consecutive accesses with stride -1.
1136
1137 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1138 /// instruction \p I and vector width \p VF.
1141 assert(VF.isVector() && "Expected VF >=2");
1142 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1143 }
1144
1145 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1146 /// interleaving group \p Grp and vector width \p VF.
1150 assert(VF.isVector() && "Expected VF >=2");
1151 /// Broadcast this decision to all instructions inside the group.
1152 /// When interleaving, the cost will only be assigned to one instruction, the
1153 /// insert position. For other cases, add the appropriate fraction of the
1154 /// total cost to each instruction. This ensures accurate costs are used,
1155 /// even if the insert position instruction is not used.
1156 InstructionCost InsertPosCost = Cost;
1157 InstructionCost OtherMemberCost = 0;
1158 if (W != CM_Interleave)
1159 OtherMemberCost = InsertPosCost = Cost / Grp->getNumMembers();
1161 for (unsigned Idx = 0; Idx < Grp->getFactor(); ++Idx) {
1162 if (auto *I = Grp->getMember(Idx)) {
1163 if (Grp->getInsertPos() == I)
1164 WideningDecisions[std::make_pair(I, VF)] =
1165 std::make_pair(W, InsertPosCost);
1166 else
1167 WideningDecisions[std::make_pair(I, VF)] =
1168 std::make_pair(W, OtherMemberCost);
1169 }
1170 }
1171 }
1172
1173 /// Return the cost model decision for the given instruction \p I and vector
1174 /// width \p VF. Return CM_Unknown if this instruction did not pass
1175 /// through the cost modeling.
1177 assert(VF.isVector() && "Expected VF to be a vector VF");
1178 assert(
1179 TheLoop->isInnermost() &&
1180 "cost-model should not be used for outer loops (in VPlan-native path)");
1181
1182 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1183 auto Itr = WideningDecisions.find(InstOnVF);
1184 if (Itr == WideningDecisions.end())
1185 return CM_Unknown;
1186 return Itr->second.first;
1187 }
1188
1189 /// Return the vectorization cost for the given instruction \p I and vector
1190 /// width \p VF.
1192 assert(VF.isVector() && "Expected VF >=2");
1193 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1194 assert(WideningDecisions.contains(InstOnVF) &&
1195 "The cost is not calculated");
1196 return WideningDecisions[InstOnVF].second;
1197 }
1198
1203 std::optional<unsigned> MaskPos;
1205 };
1206
1208 Function *Variant, Intrinsic::ID IID,
1209 std::optional<unsigned> MaskPos,
1211 assert(!VF.isScalar() && "Expected vector VF");
1212 CallWideningDecisions[std::make_pair(CI, VF)] = {Kind, Variant, IID,
1213 MaskPos, Cost};
1214 }
1215
1217 ElementCount VF) const {
1218 assert(!VF.isScalar() && "Expected vector VF");
1219 return CallWideningDecisions.at(std::make_pair(CI, VF));
1220 }
1221
1222 /// Return True if instruction \p I is an optimizable truncate whose operand
1223 /// is an induction variable. Such a truncate will be removed by adding a new
1224 /// induction variable with the destination type.
1226 // If the instruction is not a truncate, return false.
1227 auto *Trunc = dyn_cast<TruncInst>(I);
1228 if (!Trunc)
1229 return false;
1230
1231 // Get the source and destination types of the truncate.
1232 Type *SrcTy = toVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1233 Type *DestTy = toVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1234
1235 // If the truncate is free for the given types, return false. Replacing a
1236 // free truncate with an induction variable would add an induction variable
1237 // update instruction to each iteration of the loop. We exclude from this
1238 // check the primary induction variable since it will need an update
1239 // instruction regardless.
1240 Value *Op = Trunc->getOperand(0);
1241 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1242 return false;
1243
1244 // If the truncated value is not an induction variable, return false.
1245 return Legal->isInductionPhi(Op);
1246 }
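// For example (illustrative): given a loop with an i64 induction variable %i,
// a 'trunc i64 %i to i32' feeding the loop body is optimizable: rather than
// widening the truncate, the vectorizer can introduce a new i32 induction
// variable with the destination type, as described above.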
1247
1248 /// Collects the instructions to scalarize for each predicated instruction in
1249 /// the loop.
1251
1252 /// Collect Uniform and Scalar values for the given \p VF.
1253 /// The sets depend on CM decision for Load/Store instructions
1254 /// that may be vectorized as interleave, gather-scatter or scalarized.
1255 /// Also make a decision on what to do about call instructions in the loop
1256 /// at that VF -- scalarize, call a known vector routine, or call a
1257 /// vector intrinsic.
1259 // Do the analysis once.
1260 if (VF.isScalar() || Uniforms.contains(VF))
1261 return;
1263 collectLoopUniforms(VF);
1265 collectLoopScalars(VF);
1266 }
1267
1268 /// Returns true if the target machine supports masked store operation
1269 /// for the given \p DataType and kind of access to \p Ptr.
1270 bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const {
1271 return Legal->isConsecutivePtr(DataType, Ptr) &&
1272 TTI.isLegalMaskedStore(DataType, Alignment);
1273 }
1274
1275 /// Returns true if the target machine supports masked load operation
1276 /// for the given \p DataType and kind of access to \p Ptr.
1277 bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const {
1278 return Legal->isConsecutivePtr(DataType, Ptr) &&
1279 TTI.isLegalMaskedLoad(DataType, Alignment);
1280 }
1281
1282 /// Returns true if the target machine can represent \p V as a masked gather
1283 /// or scatter operation.
1285 bool LI = isa<LoadInst>(V);
1286 bool SI = isa<StoreInst>(V);
1287 if (!LI && !SI)
1288 return false;
1289 auto *Ty = getLoadStoreType(V);
1291 if (VF.isVector())
1292 Ty = VectorType::get(Ty, VF);
1293 return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
1294 (SI && TTI.isLegalMaskedScatter(Ty, Align));
1295 }
1296
1297 /// Returns true if the target machine supports all of the reduction
1298 /// variables found for the given VF.
1300 return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
1301 const RecurrenceDescriptor &RdxDesc = Reduction.second;
1302 return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
1303 }));
1304 }
1305
1306 /// Given costs for both strategies, return true if the scalar predication
1307 /// lowering should be used for div/rem. This incorporates an override
1308 /// option so it is not simply a cost comparison.
1310 InstructionCost SafeDivisorCost) const {
1311 switch (ForceSafeDivisor) {
1312 case cl::BOU_UNSET:
1313 return ScalarCost < SafeDivisorCost;
1314 case cl::BOU_TRUE:
1315 return false;
1316 case cl::BOU_FALSE:
1317 return true;
1318 }
1319 llvm_unreachable("impossible case value");
1320 }
1321
1322 /// Returns true if \p I is an instruction which requires predication and
1323 /// for which our chosen predication strategy is scalarization (i.e. we
1324 /// don't have an alternate strategy such as masking available).
1325 /// \p VF is the vectorization factor that will be used to vectorize \p I.
1327
1328 /// Returns true if \p I is an instruction that needs to be predicated
1329 /// at runtime. The result is independent of the predication mechanism.
1330 /// Superset of instructions that return true for isScalarWithPredication.
1331 bool isPredicatedInst(Instruction *I) const;
1332
1333 /// Return the costs for our two available strategies for lowering a
1334 /// div/rem operation which requires speculating at least one lane.
1335 /// First result is for scalarization (will be invalid for scalable
1336 /// vectors); second is for the safe-divisor strategy.
1337 std::pair<InstructionCost, InstructionCost>
1339 ElementCount VF) const;
1340
1341 /// Returns true if \p I is a memory instruction with consecutive memory
1342 /// access that can be widened.
1344
1345 /// Returns true if \p I is a memory instruction in an interleaved-group
1346 /// of memory accesses that can be vectorized with wide vector loads/stores
1347 /// and shuffles.
1349
1350 /// Check if \p Instr belongs to any interleaved access group.
1352 return InterleaveInfo.isInterleaved(Instr);
1353 }
1354
1355 /// Get the interleaved access group that \p Instr belongs to.
1358 return InterleaveInfo.getInterleaveGroup(Instr);
1359 }
1360
1361 /// Returns true if we're required to use a scalar epilogue for at least
1362 /// the final iteration of the original loop.
1363 bool requiresScalarEpilogue(bool IsVectorizing) const {
1364 if (!isScalarEpilogueAllowed()) {
1365 LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
1366 return false;
1367 }
1368 // If we might exit from anywhere but the latch and early exit vectorization
1369 // is disabled, we must run the exiting iteration in scalar form.
1372 LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: not exiting "
1373 "from latch block\n");
1374 return true;
1375 }
1376 if (IsVectorizing && InterleaveInfo.requiresScalarEpilogue()) {
1377 LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: "
1378 "interleaved group requires scalar epilogue\n");
1379 return true;
1380 }
1381 LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
1382 return false;
1383 }
1384
1385 /// Returns true if we're required to use a scalar epilogue for at least
1386 /// the final iteration of the original loop for all VFs in \p Range.
1387 /// A scalar epilogue must either be required for all VFs in \p Range or for
1388 /// none.
1390 auto RequiresScalarEpilogue = [this](ElementCount VF) {
1391 return requiresScalarEpilogue(VF.isVector());
1392 };
1393 bool IsRequired = all_of(Range, RequiresScalarEpilogue);
1394 assert(
1395 (IsRequired || none_of(Range, RequiresScalarEpilogue)) &&
1396 "all VFs in range must agree on whether a scalar epilogue is required");
1397 return IsRequired;
1398 }
1399
1400 /// Returns true if a scalar epilogue is not allowed due to optsize or a
1401 /// loop hint annotation.
1403 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1404 }
1405
1406 /// Returns the TailFoldingStyle that is best for the current loop.
1407 TailFoldingStyle getTailFoldingStyle(bool IVUpdateMayOverflow = true) const {
1408 if (!ChosenTailFoldingStyle)
1409 return TailFoldingStyle::None;
1410 return IVUpdateMayOverflow ? ChosenTailFoldingStyle->first
1411 : ChosenTailFoldingStyle->second;
1412 }
1413
1414 /// Selects and saves the TailFoldingStyle for two cases: whether the IV
1415 /// update may overflow or not.
1416 /// \param IsScalableVF true if scalable vector factors enabled.
1417 /// \param UserIC User specific interleave count.
1418 void setTailFoldingStyles(bool IsScalableVF, unsigned UserIC) {
1419 assert(!ChosenTailFoldingStyle && "Tail folding must not be selected yet.");
1420 if (!Legal->canFoldTailByMasking()) {
1421 ChosenTailFoldingStyle =
1423 return;
1424 }
1425
1426 if (!ForceTailFoldingStyle.getNumOccurrences()) {
1427 ChosenTailFoldingStyle = std::make_pair(
1428 TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/true),
1429 TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/false));
1430 return;
1431 }
1432
1433 // Set styles when forced.
1434 ChosenTailFoldingStyle = std::make_pair(ForceTailFoldingStyle.getValue(),
1435 ForceTailFoldingStyle.getValue());
1437 return;
1438 // Override forced styles if needed.
1439 // FIXME: use actual opcode/data type for analysis here.
1440 // FIXME: Investigate opportunity for fixed vector factor.
1441 // FIXME: support fixed-order recurrences by fixing splice of non VFxUF
1442 // penultimate EVL.
1443 bool EVLIsLegal =
1444 UserIC <= 1 && TTI.hasActiveVectorLength(0, nullptr, Align()) &&
1446 if (!EVLIsLegal) {
1447 // If for some reason EVL mode is unsupported, fall back to
1448 // DataWithoutLaneMask to try to vectorize the loop with a folded tail
1449 // in a generic way.
1450 ChosenTailFoldingStyle =
1453 LLVM_DEBUG(
1454 dbgs()
1455 << "LV: Preference for VP intrinsics indicated. Will "
1456 "not try to generate VP Intrinsics "
1457 << (UserIC > 1
1458 ? "since interleave count specified is greater than 1.\n"
1459 : "due to non-interleaving reasons.\n"));
1460 }
1461 }
1462
1463 /// Returns true if all loop blocks should be masked to fold tail loop.
1464 bool foldTailByMasking() const {
1465 // TODO: check if it is possible to check for None style independent of
1466 // IVUpdateMayOverflow flag in getTailFoldingStyle.
1468 }
1469
1470 /// Return maximum safe number of elements to be processed per vector
1471 /// iteration, which do not prevent store-load forwarding and are safe with
1472 /// regard to the memory dependencies. Required for EVL-based VPlans to
1473 /// correctly calculate AVL (application vector length) as min(remaining AVL,
1474 /// MaxSafeElements).
1475 /// TODO: need to consider adjusting cost model to use this value as a
1476 /// vectorization factor for EVL-based vectorization.
1477 std::optional<unsigned> getMaxSafeElements() const { return MaxSafeElements; }
1478
1479 /// Returns true if the instructions in this block requires predication
1480 /// for any reason, e.g. because tail folding now requires a predicate
1481 /// or because the block in the original loop was predicated.
1484 }
1485
1486 /// Returns true if VP intrinsics with explicit vector length support should
1487 /// be generated in the tail folded loop.
1488 bool foldTailWithEVL() const {
1490 }
1491
1492 /// Returns true if the Phi is part of an inloop reduction.
1493 bool isInLoopReduction(PHINode *Phi) const {
1494 return InLoopReductions.contains(Phi);
1495 }
1496
1497 /// Returns true if the predicated reduction select should be used to set the
1498 /// incoming value for the reduction phi.
1499 bool usePredicatedReductionSelect(unsigned Opcode, Type *PhiTy) const {
1500 // Force to use predicated reduction select since the EVL of the
1501 // second-to-last iteration might not be VF*UF.
1502 if (foldTailWithEVL())
1503 return true;
1506 Opcode, PhiTy, TargetTransformInfo::ReductionFlags());
1507 }
1508
1509 /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1510 /// with factor VF. Return the cost of the instruction, including
1511 /// scalarization overhead if it's needed.
1513
1514 /// Estimate cost of a call instruction CI if it were vectorized with factor
1515 /// VF. Return the cost of the instruction, including scalarization overhead
1516 /// if it's needed.
1518
1519 /// Invalidates decisions already taken by the cost model.
1521 WideningDecisions.clear();
1522 CallWideningDecisions.clear();
1523 Uniforms.clear();
1524 Scalars.clear();
1525 }
1526
1527 /// Returns the expected execution cost. The unit of the cost does
1528 /// not matter because we use the 'cost' units to compare different
1529 /// vector widths. The cost that is returned is *not* normalized by
1530 /// the factor width.
1532
1533 bool hasPredStores() const { return NumPredStores > 0; }
1534
1535 /// Returns true if epilogue vectorization is considered profitable, and
1536 /// false otherwise.
1537 /// \p VF is the vectorization factor chosen for the original loop.
1538 /// \p Multiplier is an additional scaling factor applied to VF before
1539 /// comparing to EpilogueVectorizationMinVF.
1541 const unsigned IC) const;
1542
1543 /// Returns the execution time cost of an instruction for a given vector
1544 /// width. Vector width of one means scalar.
1546
1547 /// Return the cost of instructions in an inloop reduction pattern, if I is
1548 /// part of that pattern.
1549 std::optional<InstructionCost> getReductionPatternCost(Instruction *I,
1550 ElementCount VF,
1551 Type *VectorTy) const;
1552
1553 /// Returns true if \p Op should be considered invariant and if it is
1554 /// trivially hoistable.
1556
1557private:
1558 unsigned NumPredStores = 0;
1559
1560 /// \return An upper bound for the vectorization factors for both
1561 /// fixed and scalable vectorization, where the minimum-known number of
1562 /// elements is a power-of-2 larger than zero. If scalable vectorization is
1563 /// disabled or unsupported, then the scalable part will be equal to
1564 /// ElementCount::getScalable(0).
1565 FixedScalableVFPair computeFeasibleMaxVF(unsigned MaxTripCount,
1566 ElementCount UserVF,
1567 bool FoldTailByMasking);
1568
1569 /// \return the maximized element count based on the targets vector
1570 /// registers and the loop trip-count, but limited to a maximum safe VF.
1571 /// This is a helper function of computeFeasibleMaxVF.
1572 ElementCount getMaximizedVFForTarget(unsigned MaxTripCount,
1573 unsigned SmallestType,
1574 unsigned WidestType,
1575 ElementCount MaxSafeVF,
1576 bool FoldTailByMasking);
1577
1578 /// Checks if scalable vectorization is supported and enabled. Caches the
1579 /// result to avoid repeated debug dumps for repeated queries.
1580 bool isScalableVectorizationAllowed();
1581
1582 /// \return the maximum legal scalable VF, based on the safe max number
1583 /// of elements.
1584 ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
1585
1586 /// Calculate vectorization cost of memory instruction \p I.
1587 InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
1588
1589 /// The cost computation for scalarized memory instruction.
1590 InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1591
1592 /// The cost computation for interleaving group of memory instructions.
1593 InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
1594
1595 /// The cost computation for Gather/Scatter instruction.
1596 InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
1597
1598 /// The cost computation for widening instruction \p I with consecutive
1599 /// memory access.
1600 InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1601
1602 /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1603 /// Load: scalar load + broadcast.
1604 /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1605 /// element)
1606 InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
1607
1608 /// Estimate the overhead of scalarizing an instruction. This is a
1609 /// convenience wrapper for the type-based getScalarizationOverhead API.
1610 InstructionCost getScalarizationOverhead(Instruction *I,
1611 ElementCount VF) const;
1612
1613 /// Returns true if an artificially high cost for emulated masked memrefs
1614 /// should be used.
1615 bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);
1616
1617 /// Map of scalar integer values to the smallest bitwidth they can be legally
1618 /// represented as. The vector equivalents of these values should be truncated
1619 /// to this type.
1621
1622 /// A type representing the costs for instructions if they were to be
1623 /// scalarized rather than vectorized. The entries are Instruction-Cost
1624 /// pairs.
1625 using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
1626
1627 /// A set containing all BasicBlocks that are known to be present after
1628 /// vectorization as predicated blocks.
1630 PredicatedBBsAfterVectorization;
1631
1632 /// Records whether it is allowed to have the original scalar loop execute at
1633 /// least once. This may be needed as a fallback loop in case runtime
1634 /// aliasing/dependence checks fail, or to handle the tail/remainder
1635 /// iterations when the trip count is unknown or is not divisible by the VF,
1636 /// or as a peel-loop to handle gaps in interleave-groups.
1637 /// Under optsize and when the trip count is very small we don't allow any
1638 /// iterations to execute in the scalar loop.
1639 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1640
1641 /// The finally chosen tail-folding style. The first element is used if the
1642 /// IV update may overflow; the second is used if it does not.
1643 std::optional<std::pair<TailFoldingStyle, TailFoldingStyle>>
1644 ChosenTailFoldingStyle;
1645
1646 /// True if scalable vectorization is supported and enabled.
1647 std::optional<bool> IsScalableVectorizationAllowed;
1648
1649 /// The maximum safe number of elements to be processed per vector iteration,
1650 /// i.e. a count that does not prevent store-load forwarding and is safe with
1651 /// regard to the memory dependencies. Required for EVL-based vectorization,
1652 /// where this value is used as the upper bound of the safe AVL.
1653 std::optional<unsigned> MaxSafeElements;
1654
1655 /// A map holding scalar costs for different vectorization factors. The
1656 /// presence of a cost for an instruction in the mapping indicates that the
1657 /// instruction will be scalarized when vectorizing with the associated
1658 /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1660
1661 /// Holds the instructions known to be uniform after vectorization.
1662 /// The data is collected per VF.
1664
1665 /// Holds the instructions known to be scalar after vectorization.
1666 /// The data is collected per VF.
1668
1669 /// Holds the instructions (address computations) that are forced to be
1670 /// scalarized.
1672
1673 /// PHINodes of the reductions that should be expanded in-loop.
1674 SmallPtrSet<PHINode *, 4> InLoopReductions;
1675
1676 /// A Map of inloop reduction operations and their immediate chain operand.
1677 /// FIXME: This can be removed once reductions can be costed correctly in
1678 /// VPlan. This was added to allow quick lookup of the inloop operations.
1679 DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
1680
1681 /// Returns the expected difference in cost from scalarizing the expression
1682 /// feeding a predicated instruction \p PredInst. The instructions to
1683 /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1684 /// non-negative return value implies the expression will be scalarized.
1685 /// Currently, only single-use chains are considered for scalarization.
1686 InstructionCost computePredInstDiscount(Instruction *PredInst,
1687 ScalarCostsTy &ScalarCosts,
1688 ElementCount VF);
1689
1690 /// Collect the instructions that are uniform after vectorization. An
1691 /// instruction is uniform if we represent it with a single scalar value in
1692 /// the vectorized loop corresponding to each vector iteration. Examples of
1693 /// uniform instructions include pointer operands of consecutive or
1694 /// interleaved memory accesses. Note that although uniformity implies an
1695 /// instruction will be scalar, the reverse is not true. In general, a
1696 /// scalarized instruction will be represented by VF scalar values in the
1697 /// vectorized loop, each corresponding to an iteration of the original
1698 /// scalar loop.
1699 void collectLoopUniforms(ElementCount VF);
1700
1701 /// Collect the instructions that are scalar after vectorization. An
1702 /// instruction is scalar if it is known to be uniform or will be scalarized
1703 /// during vectorization. collectLoopScalars should only add non-uniform nodes
1704 /// to the list if they are used by a load/store instruction that is marked as
1705 /// CM_Scalarize. Non-uniform scalarized instructions will be represented by
1706 /// VF values in the vectorized loop, each corresponding to an iteration of
1707 /// the original scalar loop.
1708 void collectLoopScalars(ElementCount VF);
1709
1710 /// Keeps the cost model's vectorization decision and cost for instructions.
1711 /// Right now it is used for memory instructions only.
1713 std::pair<InstWidening, InstructionCost>>;
1714
1715 DecisionList WideningDecisions;
1716
1717 using CallDecisionList =
1718 DenseMap<std::pair<CallInst *, ElementCount>, CallWideningDecision>;
1719
1720 CallDecisionList CallWideningDecisions;
1721
1722 /// Returns true if \p V is expected to be vectorized and it needs to be
1723 /// extracted.
1724 bool needsExtract(Value *V, ElementCount VF) const {
1725 Instruction *I = dyn_cast<Instruction>(V);
1726 if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1729 return false;
1730
1731 // Assume we can vectorize V (and hence we need extraction) if the
1732 // scalars are not computed yet. This can happen, because it is called
1733 // via getScalarizationOverhead from setCostBasedWideningDecision, before
1734 // the scalars are collected. That should be a safe assumption in most
1735 // cases, because we check if the operands have vectorizable types
1736 // beforehand in LoopVectorizationLegality.
1737 return !Scalars.contains(VF) || !isScalarAfterVectorization(I, VF);
1738 };
1739
1740 /// Returns a range containing only operands needing to be extracted.
1741 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1742 ElementCount VF) const {
1744 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1745 }
1746
1747public:
1748 /// The loop that we evaluate.
1750
1751 /// Predicated scalar evolution analysis.
1753
1754 /// Loop Info analysis.
1756
1757 /// Vectorization legality.
1759
1760 /// Vector target information.
1762
1763 /// Target Library Info.
1765
1766 /// Demanded bits analysis.
1768
1769 /// Assumption cache.
1771
1772 /// Interface to emit optimization remarks.
1774
1776
1777 /// Loop Vectorize Hint.
1779
1780 /// The interleave access information contains groups of interleaved accesses
1781 /// that have the same stride and are close to each other.
1783
1784 /// Values to ignore in the cost model.
1786
1787 /// Values to ignore in the cost model when VF > 1.
1789
1790 /// All element types found in the loop.
1792
1793 /// The kind of cost that we are calculating
1795};
1796} // end namespace llvm
1797
1798namespace {
1799/// Helper struct to manage generating runtime checks for vectorization.
1800///
1801 /// The runtime checks are created up-front in temporary blocks, un-linked from
1802 /// the existing IR, to allow a better cost estimate. After deciding to
1803 /// vectorize, the checks are moved back into place. If we decide not to
1804 /// vectorize, the temporary blocks are removed completely.
1805class GeneratedRTChecks {
1806 /// Basic block which contains the generated SCEV checks, if any.
1807 BasicBlock *SCEVCheckBlock = nullptr;
1808
1809 /// The value representing the result of the generated SCEV checks. If it is
1810 /// nullptr, either no SCEV checks have been generated or they have been used.
1811 Value *SCEVCheckCond = nullptr;
1812
1813 /// Basic block which contains the generated memory runtime checks, if any.
1814 BasicBlock *MemCheckBlock = nullptr;
1815
1816 /// The value representing the result of the generated memory runtime checks.
1817 /// If it is nullptr, either no memory runtime checks have been generated or
1818 /// they have been used.
1819 Value *MemRuntimeCheckCond = nullptr;
1820
1821 DominatorTree *DT;
1822 LoopInfo *LI;
1824
1825 SCEVExpander SCEVExp;
1826 SCEVExpander MemCheckExp;
1827
1828 bool CostTooHigh = false;
1829 const bool AddBranchWeights;
1830
1831 Loop *OuterLoop = nullptr;
1832
1834
1835 /// The kind of cost that we are calculating
1836 TTI::TargetCostKind CostKind;
1837
1838public:
1839 GeneratedRTChecks(PredicatedScalarEvolution &PSE, DominatorTree *DT,
1841 const DataLayout &DL, bool AddBranchWeights,
1842 TTI::TargetCostKind CostKind)
1843 : DT(DT), LI(LI), TTI(TTI), SCEVExp(*PSE.getSE(), DL, "scev.check"),
1844 MemCheckExp(*PSE.getSE(), DL, "scev.check"),
1845 AddBranchWeights(AddBranchWeights), PSE(PSE), CostKind(CostKind) {}
1846
1847 /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
1848 /// accurately estimate the cost of the runtime checks. The blocks are
1849 /// un-linked from the IR and are added back during vector code generation. If
1850 /// there is no vector code generation, the check blocks are removed
1851 /// completely.
1852 void create(Loop *L, const LoopAccessInfo &LAI,
1853 const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) {
1854
1855 // Hard cutoff to limit compile-time increase in case a very large number of
1856 // runtime checks needs to be generated.
1857 // TODO: Skip cutoff if the loop is guaranteed to execute, e.g. due to
1858 // profile info.
1859 CostTooHigh =
1861 if (CostTooHigh)
1862 return;
1863
1864 BasicBlock *LoopHeader = L->getHeader();
1865 BasicBlock *Preheader = L->getLoopPreheader();
1866
1867 // Use SplitBlock to create blocks for SCEV & memory runtime checks to
1868 // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
1869 // may be used by SCEVExpander. The blocks will be un-linked from their
1870 // predecessors and removed from LI & DT at the end of the function.
1871 if (!UnionPred.isAlwaysTrue()) {
1872 SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
1873 nullptr, "vector.scevcheck");
1874
1875 SCEVCheckCond = SCEVExp.expandCodeForPredicate(
1876 &UnionPred, SCEVCheckBlock->getTerminator());
1877 }
1878
1879 const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
1880 if (RtPtrChecking.Need) {
1881 auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
1882 MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
1883 "vector.memcheck");
1884
1885 auto DiffChecks = RtPtrChecking.getDiffChecks();
1886 if (DiffChecks) {
1887 Value *RuntimeVF = nullptr;
1888 MemRuntimeCheckCond = addDiffRuntimeChecks(
1889 MemCheckBlock->getTerminator(), *DiffChecks, MemCheckExp,
1890 [VF, &RuntimeVF](IRBuilderBase &B, unsigned Bits) {
1891 if (!RuntimeVF)
1892 RuntimeVF = getRuntimeVF(B, B.getIntNTy(Bits), VF);
1893 return RuntimeVF;
1894 },
1895 IC);
1896 } else {
1897 MemRuntimeCheckCond = addRuntimeChecks(
1898 MemCheckBlock->getTerminator(), L, RtPtrChecking.getChecks(),
1900 }
1901 assert(MemRuntimeCheckCond &&
1902 "no RT checks generated although RtPtrChecking "
1903 "claimed checks are required");
1904 }
1905
1906 if (!MemCheckBlock && !SCEVCheckBlock)
1907 return;
1908
1909 // Unhook the temporary blocks containing the checks and update the various
1910 // places accordingly.
1911 if (SCEVCheckBlock)
1912 SCEVCheckBlock->replaceAllUsesWith(Preheader);
1913 if (MemCheckBlock)
1914 MemCheckBlock->replaceAllUsesWith(Preheader);
1915
1916 if (SCEVCheckBlock) {
1917 SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
1918 new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
1919 Preheader->getTerminator()->eraseFromParent();
1920 }
1921 if (MemCheckBlock) {
1922 MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
1923 new UnreachableInst(Preheader->getContext(), MemCheckBlock);
1924 Preheader->getTerminator()->eraseFromParent();
1925 }
1926
1927 DT->changeImmediateDominator(LoopHeader, Preheader);
1928 if (MemCheckBlock) {
1929 DT->eraseNode(MemCheckBlock);
1930 LI->removeBlock(MemCheckBlock);
1931 }
1932 if (SCEVCheckBlock) {
1933 DT->eraseNode(SCEVCheckBlock);
1934 LI->removeBlock(SCEVCheckBlock);
1935 }
1936
1937 // Outer loop is used as part of the later cost calculations.
1938 OuterLoop = L->getParentLoop();
1939 }
1940
1941 InstructionCost getCost() {
1942 if (SCEVCheckBlock || MemCheckBlock)
1943 LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n");
1944
1945 if (CostTooHigh) {
1947 Cost.setInvalid();
1948 LLVM_DEBUG(dbgs() << " number of checks exceeded threshold\n");
1949 return Cost;
1950 }
1951
1952 InstructionCost RTCheckCost = 0;
1953 if (SCEVCheckBlock)
1954 for (Instruction &I : *SCEVCheckBlock) {
1955 if (SCEVCheckBlock->getTerminator() == &I)
1956 continue;
1957 InstructionCost C = TTI->getInstructionCost(&I, CostKind);
1958 LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n");
1959 RTCheckCost += C;
1960 }
1961 if (MemCheckBlock) {
1962 InstructionCost MemCheckCost = 0;
1963 for (Instruction &I : *MemCheckBlock) {
1964 if (MemCheckBlock->getTerminator() == &I)
1965 continue;
1966 InstructionCost C = TTI->getInstructionCost(&I, CostKind);
1967 LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n");
1968 MemCheckCost += C;
1969 }
1970
1971 // If the runtime memory checks are being created inside an outer loop
1972 // we should find out if these checks are outer loop invariant. If so,
1973 // the checks will likely be hoisted out and so the effective cost will
1974 // be reduced in proportion to the outer loop trip count.
1975 if (OuterLoop) {
1976 ScalarEvolution *SE = MemCheckExp.getSE();
1977 // TODO: If profitable, we could refine this further by analysing every
1978 // individual memory check, since there could be a mixture of loop
1979 // variant and invariant checks that mean the final condition is
1980 // variant.
1981 const SCEV *Cond = SE->getSCEV(MemRuntimeCheckCond);
1982 if (SE->isLoopInvariant(Cond, OuterLoop)) {
1983 // It seems reasonable to assume that we can reduce the effective
1984 // cost of the checks even when we know nothing about the trip
1985 // count. Assume that the outer loop executes at least twice.
1986 unsigned BestTripCount = 2;
1987
1988 // Get the best known TC estimate.
1989 if (auto EstimatedTC = getSmallBestKnownTC(
1990 PSE, OuterLoop, /* CanUseConstantMax = */ false))
1991 BestTripCount = *EstimatedTC;
1992
1993 BestTripCount = std::max(BestTripCount, 1U);
1994 InstructionCost NewMemCheckCost = MemCheckCost / BestTripCount;
1995
1996 // Let's ensure the cost is always at least 1.
1997 NewMemCheckCost = std::max(*NewMemCheckCost.getValue(),
1999
2000 if (BestTripCount > 1)
2002 << "We expect runtime memory checks to be hoisted "
2003 << "out of the outer loop. Cost reduced from "
2004 << MemCheckCost << " to " << NewMemCheckCost << '\n');
2005
2006 MemCheckCost = NewMemCheckCost;
2007 }
2008 }
2009
2010 RTCheckCost += MemCheckCost;
2011 }
2012
2013 if (SCEVCheckBlock || MemCheckBlock)
2014 LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost
2015 << "\n");
2016
2017 return RTCheckCost;
2018 }
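// Editor's worked example (illustrative, not from the source): the hoisting
// heuristic above amortizes the memory-check cost over the assumed outer-loop
// trip count. With MemCheckCost = 20 and BestTripCount = 4 the effective cost
// becomes 20 / 4 = 5; with MemCheckCost = 3 and BestTripCount = 8 the division
// yields 0, which is then clamped so the checks are never treated as free.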
2019
2020 /// Remove the created SCEV & memory runtime check blocks & instructions, if
2021 /// unused.
2022 ~GeneratedRTChecks() {
2023 SCEVExpanderCleaner SCEVCleaner(SCEVExp);
2024 SCEVExpanderCleaner MemCheckCleaner(MemCheckExp);
2025 if (!SCEVCheckCond)
2026 SCEVCleaner.markResultUsed();
2027
2028 if (!MemRuntimeCheckCond)
2029 MemCheckCleaner.markResultUsed();
2030
2031 if (MemRuntimeCheckCond) {
2032 auto &SE = *MemCheckExp.getSE();
2033 // Memory runtime check generation creates compares that use expanded
2034 // values. Remove them before running the SCEVExpanderCleaners.
2035 for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) {
2036 if (MemCheckExp.isInsertedInstruction(&I))
2037 continue;
2038 SE.forgetValue(&I);
2039 I.eraseFromParent();
2040 }
2041 }
2042 MemCheckCleaner.cleanup();
2043 SCEVCleaner.cleanup();
2044
2045 if (SCEVCheckCond)
2046 SCEVCheckBlock->eraseFromParent();
2047 if (MemRuntimeCheckCond)
2048 MemCheckBlock->eraseFromParent();
2049 }
2050
2051 /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and
2052 /// adjusts the branches to branch to the vector preheader or \p Bypass,
2053 /// depending on the generated condition.
2054 BasicBlock *emitSCEVChecks(BasicBlock *Bypass,
2055 BasicBlock *LoopVectorPreHeader) {
2056 if (!SCEVCheckCond)
2057 return nullptr;
2058
2059 Value *Cond = SCEVCheckCond;
2060 // Mark the check as used, to prevent it from being removed during cleanup.
2061 SCEVCheckCond = nullptr;
2062 if (auto *C = dyn_cast<ConstantInt>(Cond))
2063 if (C->isZero())
2064 return nullptr;
2065
2066 auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2067
2068 BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock);
2069 // Create new preheader for vector loop.
2070 if (OuterLoop)
2071 OuterLoop->addBasicBlockToLoop(SCEVCheckBlock, *LI);
2072
2073 SCEVCheckBlock->getTerminator()->eraseFromParent();
2074 SCEVCheckBlock->moveBefore(LoopVectorPreHeader);
2075 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2076 SCEVCheckBlock);
2077
2078 DT->addNewBlock(SCEVCheckBlock, Pred);
2079 DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock);
2080
2081 BranchInst &BI = *BranchInst::Create(Bypass, LoopVectorPreHeader, Cond);
2082 if (AddBranchWeights)
2083 setBranchWeights(BI, SCEVCheckBypassWeights, /*IsExpected=*/false);
2084 ReplaceInstWithInst(SCEVCheckBlock->getTerminator(), &BI);
2085 return SCEVCheckBlock;
2086 }
2087
2088 /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts
2089 /// the branches to branch to the vector preheader or \p Bypass, depending on
2090 /// the generated condition.
2091 BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass,
2092 BasicBlock *LoopVectorPreHeader) {
2093 // Check if we generated code that checks at runtime whether arrays overlap.
2094 if (!MemRuntimeCheckCond)
2095 return nullptr;
2096
2097 auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2098 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2099 MemCheckBlock);
2100
2101 DT->addNewBlock(MemCheckBlock, Pred);
2102 DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock);
2103 MemCheckBlock->moveBefore(LoopVectorPreHeader);
2104
2105 if (OuterLoop)
2106 OuterLoop->addBasicBlockToLoop(MemCheckBlock, *LI);
2107
2108 BranchInst &BI =
2109 *BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond);
2110 if (AddBranchWeights) {
2111 setBranchWeights(BI, MemCheckBypassWeights, /*IsExpected=*/false);
2112 }
2113 ReplaceInstWithInst(MemCheckBlock->getTerminator(), &BI);
2114 MemCheckBlock->getTerminator()->setDebugLoc(
2115 Pred->getTerminator()->getDebugLoc());
2116
2117 // Mark the check as used, to prevent it from being removed during cleanup.
2118 MemRuntimeCheckCond = nullptr;
2119 return MemCheckBlock;
2120 }
2121};
2122} // namespace
2123
2125 return Style == TailFoldingStyle::Data ||
2126 Style == TailFoldingStyle::DataAndControlFlow ||
2127 Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
2128}
2129
2131 return Style == TailFoldingStyle::DataAndControlFlow ||
2132 Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
2133}
2134
2135// Return true if \p OuterLp is an outer loop annotated with hints for explicit
2136// vectorization. The loop needs to be annotated with #pragma omp simd
2137// simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
2138// vector length information is not provided, vectorization is not considered
2139// explicit. Interleave hints are not allowed either. These limitations will be
2140// relaxed in the future.
2141 // Please note that we are currently forced to abuse the pragma 'clang
2142// vectorize' semantics. This pragma provides *auto-vectorization hints*
2143// (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
2144// provides *explicit vectorization hints* (LV can bypass legal checks and
2145// assume that vectorization is legal). However, both hints are implemented
2146// using the same metadata (llvm.loop.vectorize, processed by
2147// LoopVectorizeHints). This will be fixed in the future when the native IR
2148// representation for pragma 'omp simd' is introduced.
2149static bool isExplicitVecOuterLoop(Loop *OuterLp,
2151 assert(!OuterLp->isInnermost() && "This is not an outer loop");
2152 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
2153
2154 // Only outer loops with an explicit vectorization hint are supported.
2155 // Unannotated outer loops are ignored.
2157 return false;
2158
2159 Function *Fn = OuterLp->getHeader()->getParent();
2160 if (!Hints.allowVectorization(Fn, OuterLp,
2161 true /*VectorizeOnlyWhenForced*/)) {
2162 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
2163 return false;
2164 }
2165
2166 if (Hints.getInterleave() > 1) {
2167 // TODO: Interleave support is future work.
2168 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
2169 "outer loops.\n");
2170 Hints.emitRemarkWithHints();
2171 return false;
2172 }
2173
2174 return true;
2175}
2176
2180 // Collect inner loops and outer loops without irreducible control flow. For
2181 // now, only collect outer loops that have explicit vectorization hints. If we
2182 // are stress testing the VPlan H-CFG construction, we collect the outermost
2183 // loop of every loop nest.
2184 if (L.isInnermost() || VPlanBuildStressTest ||
2186 LoopBlocksRPO RPOT(&L);
2187 RPOT.perform(LI);
2188 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
2189 V.push_back(&L);
2190 // TODO: Collect inner loops inside marked outer loops in case
2191 // vectorization fails for the outer loop. Do not invoke
2192 // 'containsIrreducibleCFG' again for inner loops when the outer loop is
2193 // already known to be reducible. We can use an inherited attribute for
2194 // that.
2195 return;
2196 }
2197 }
2198 for (Loop *InnerL : L)
2199 collectSupportedLoops(*InnerL, LI, ORE, V);
2200}
2201
2202//===----------------------------------------------------------------------===//
2203// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
2204// LoopVectorizationCostModel and LoopVectorizationPlanner.
2205//===----------------------------------------------------------------------===//
2206
2207/// Compute the transformed value of Index at offset StartValue using step
2208/// StepValue.
2209/// For integer induction, returns StartValue + Index * StepValue.
2210/// For pointer induction, returns StartValue[Index * StepValue].
2211/// FIXME: The newly created binary instructions should contain nsw/nuw
2212/// flags, which can be found from the original scalar operations.
2213static Value *
2215 Value *Step,
2217 const BinaryOperator *InductionBinOp) {
2218 Type *StepTy = Step->getType();
2219 Value *CastedIndex = StepTy->isIntegerTy()
2220 ? B.CreateSExtOrTrunc(Index, StepTy)
2221 : B.CreateCast(Instruction::SIToFP, Index, StepTy);
2222 if (CastedIndex != Index) {
2223 CastedIndex->setName(CastedIndex->getName() + ".cast");
2224 Index = CastedIndex;
2225 }
2226
2227 // Note: the IR at this point is broken. We cannot use SE to create any new
2228 // SCEV and then expand it, hoping that SCEV's simplification will give us
2229 // more optimal code. Unfortunately, attempting to do so on invalid IR may
2230 // lead to various SCEV crashes. So all we can do is use the builder and rely
2231 // on InstCombine for future simplifications. Here we handle only some
2232 // trivial cases.
2233 auto CreateAdd = [&B](Value *X, Value *Y) {
2234 assert(X->getType() == Y->getType() && "Types don't match!");
2235 if (auto *CX = dyn_cast<ConstantInt>(X))
2236 if (CX->isZero())
2237 return Y;
2238 if (auto *CY = dyn_cast<ConstantInt>(Y))
2239 if (CY->isZero())
2240 return X;
2241 return B.CreateAdd(X, Y);
2242 };
2243
2244 // We allow X to be a vector type, in which case Y will potentially be
2245 // splatted into a vector with the same element count.
2246 auto CreateMul = [&B](Value *X, Value *Y) {
2247 assert(X->getType()->getScalarType() == Y->getType() &&
2248 "Types don't match!");
2249 if (auto *CX = dyn_cast<ConstantInt>(X))
2250 if (CX->isOne())
2251 return Y;
2252 if (auto *CY = dyn_cast<ConstantInt>(Y))
2253 if (CY->isOne())
2254 return X;
2255 VectorType *XVTy = dyn_cast<VectorType>(X->getType());
2256 if (XVTy && !isa<VectorType>(Y->getType()))
2257 Y = B.CreateVectorSplat(XVTy->getElementCount(), Y);
2258 return B.CreateMul(X, Y);
2259 };
2260
2261 switch (InductionKind) {
2263 assert(!isa<VectorType>(Index->getType()) &&
2264 "Vector indices not supported for integer inductions yet");
2265 assert(Index->getType() == StartValue->getType() &&
2266 "Index type does not match StartValue type");
2267 if (isa<ConstantInt>(Step) && cast<ConstantInt>(Step)->isMinusOne())
2268 return B.CreateSub(StartValue, Index);
2269 auto *Offset = CreateMul(Index, Step);
2270 return CreateAdd(StartValue, Offset);
2271 }
2273 return B.CreatePtrAdd(StartValue, CreateMul(Index, Step));
2275 assert(!isa<VectorType>(Index->getType()) &&
2276 "Vector indices not supported for FP inductions yet");
2277 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2278 assert(InductionBinOp &&
2279 (InductionBinOp->getOpcode() == Instruction::FAdd ||
2280 InductionBinOp->getOpcode() == Instruction::FSub) &&
2281 "Original bin op should be defined for FP induction");
2282
2283 Value *MulExp = B.CreateFMul(Step, Index);
2284 return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
2285 "induction");
2286 }
2288 return nullptr;
2289 }
2290 llvm_unreachable("invalid enum");
2291}
2292
2293std::optional<unsigned> getMaxVScale(const Function &F,
2294 const TargetTransformInfo &TTI) {
2295 if (std::optional<unsigned> MaxVScale = TTI.getMaxVScale())
2296 return MaxVScale;
2297
2298 if (F.hasFnAttribute(Attribute::VScaleRange))
2299 return F.getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax();
2300
2301 return std::nullopt;
2302}
2303
2304/// For the given VF and UF and maximum trip count computed for the loop, return
2305/// whether the induction variable might overflow in the vectorized loop. If not,
2306/// then we know a runtime overflow check always evaluates to false and can be
2307/// removed.
2310 ElementCount VF, std::optional<unsigned> UF = std::nullopt) {
2311 // Always be conservative if we don't know the exact unroll factor.
2312 unsigned MaxUF = UF ? *UF : Cost->TTI.getMaxInterleaveFactor(VF);
2313
2314 Type *IdxTy = Cost->Legal->getWidestInductionType();
2315 APInt MaxUIntTripCount = cast<IntegerType>(IdxTy)->getMask();
2316
2317 // The runtime overflow check is known to be false iff the (max) trip-count
2318 // is known and (max) trip-count + (VF * UF) does not overflow in the type of
2319 // the vector loop induction variable.
2320 if (unsigned TC = Cost->PSE.getSmallConstantMaxTripCount()) {
2321 uint64_t MaxVF = VF.getKnownMinValue();
2322 if (VF.isScalable()) {
2323 std::optional<unsigned> MaxVScale =
2324 getMaxVScale(*Cost->TheFunction, Cost->TTI);
2325 if (!MaxVScale)
2326 return false;
2327 MaxVF *= *MaxVScale;
2328 }
2329
2330 return (MaxUIntTripCount - TC).ugt(MaxVF * MaxUF);
2331 }
2332
2333 return false;
2334}
2335
2336// Return whether we allow using masked interleave-groups (for dealing with
2337// strided loads/stores that reside in predicated blocks, or for dealing
2338// with gaps).
2340 // If an override option has been passed in for interleaved accesses, use it.
2343
2345}
2346
2348 VPReplicateRecipe *RepRecipe,
2349 const VPLane &Lane,
2350 VPTransformState &State) {
2351 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2352
2353 // Does this instruction return a value?
2354 bool IsVoidRetTy = Instr->getType()->isVoidTy();
2355
2356 Instruction *Cloned = Instr->clone();
2357 if (!IsVoidRetTy) {
2358 Cloned->setName(Instr->getName() + ".cloned");
2359#if !defined(NDEBUG)
2360 // Verify that VPlan type inference results agree with the type of the
2361 // generated values.
2362 assert(State.TypeAnalysis.inferScalarType(RepRecipe) == Cloned->getType() &&
2363 "inferred type and type from generated instructions do not match");
2364#endif
2365 }
2366
2367 RepRecipe->setFlags(Cloned);
2368
2369 if (auto DL = Instr->getDebugLoc())
2370 State.setDebugLocFrom(DL);
2371
2372 // Replace the operands of the cloned instruction with their scalar
2373 // equivalents in the new loop.
2374 for (const auto &I : enumerate(RepRecipe->operands())) {
2375 auto InputLane = Lane;
2376 VPValue *Operand = I.value();
2378 InputLane = VPLane::getFirstLane();
2379 Cloned->setOperand(I.index(), State.get(Operand, InputLane));
2380 }
2381 State.addNewMetadata(Cloned, Instr);
2382
2383 // Place the cloned scalar in the new loop.
2384 State.Builder.Insert(Cloned);
2385
2386 State.set(RepRecipe, Cloned, Lane);
2387
2388 // If we just cloned a new assumption, add it to the assumption cache.
2389 if (auto *II = dyn_cast<AssumeInst>(Cloned))
2391
2392 // End if-block.
2393 VPRegionBlock *Parent = RepRecipe->getParent()->getParent();
2394 bool IfPredicateInstr = Parent ? Parent->isReplicator() : false;
2395 assert(
2396 (Parent || !RepRecipe->getParent()->getPlan()->getVectorLoopRegion() ||
2397 all_of(RepRecipe->operands(),
2398 [](VPValue *Op) { return Op->isDefinedOutsideLoopRegions(); })) &&
2399 "Expected a recipe is either within a region or all of its operands "
2400 "are defined outside the vectorized region.");
2401 if (IfPredicateInstr)
2402 PredicatedInstructions.push_back(Cloned);
2403}
2404
2405Value *
2407 if (VectorTripCount)
2408 return VectorTripCount;
2409
2410 Value *TC = getTripCount();
2411 IRBuilder<> Builder(InsertBlock->getTerminator());
2412
2413 Type *Ty = TC->getType();
2414 // This is where we can make the step a runtime constant.
2415 Value *Step = createStepForVF(Builder, Ty, VF, UF);
2416
2417 // If the tail is to be folded by masking, round the number of iterations N
2418 // up to a multiple of Step instead of rounding down. This is done by first
2419 // adding Step-1 and then rounding down. Note that it's ok if this addition
2420 // overflows: the vector induction variable will eventually wrap to zero given
2421 // that it starts at zero and its Step is a power of two; the loop will then
2422 // exit, with the last early-exit vector comparison also producing all-true.
2423 // For scalable vectors the VF is not guaranteed to be a power of 2, but this
2424 // is accounted for in emitIterationCountCheck that adds an overflow check.
2425 if (Cost->foldTailByMasking()) {
2427 "VF*UF must be a power of 2 when folding tail by masking");
2428 TC = Builder.CreateAdd(TC, Builder.CreateSub(Step, ConstantInt::get(Ty, 1)),
2429 "n.rnd.up");
2430 }
2431
2432 // Now we need to generate the expression for the part of the loop that the
2433 // vectorized body will execute. This is equal to N - (N % Step) if scalar
2434 // iterations are not required for correctness, or N - Step, otherwise. Step
2435 // is equal to the vectorization factor (number of SIMD elements) times the
2436 // unroll factor (number of SIMD instructions).
2437 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2438
2439 // There are cases where we *must* run at least one iteration in the remainder
2440 // loop. See the cost model for when this can happen. If the step evenly
2441 // divides the trip count, we set the remainder to be equal to the step. If
2442 // the step does not evenly divide the trip count, no adjustment is necessary
2443 // since there will already be scalar iterations. Note that the minimum
2444 // iterations check ensures that N >= Step.
2445 if (Cost->requiresScalarEpilogue(VF.isVector())) {
2446 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2447 R = Builder.CreateSelect(IsZero, Step, R);
2448 }
2449
2450 VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2451
2452 return VectorTripCount;
2453}
2454
2456 VPBlockBase *ScalarPH = Plan.getScalarPreheader();
2458 if (PreVectorPH->getNumSuccessors() != 1) {
2459 assert(PreVectorPH->getNumSuccessors() == 2 && "Expected 2 successors");
2460 assert(PreVectorPH->getSuccessors()[0] == ScalarPH &&
2461 "Unexpected successor");
2462 VPIRBasicBlock *CheckVPIRBB = Plan.createVPIRBasicBlock(CheckIRBB);
2463 VPBlockUtils::insertOnEdge(PreVectorPH, VectorPHVPB, CheckVPIRBB);
2464 PreVectorPH = CheckVPIRBB;
2465 }
2466 VPBlockUtils::connectBlocks(PreVectorPH, ScalarPH);
2467 PreVectorPH->swapSuccessors();
2468}
2469
2471 Value *Count = getTripCount();
2472 // Reuse existing vector loop preheader for TC checks.
2473 // Note that a new preheader block is generated for the vector loop.
2474 BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
2475 IRBuilder<> Builder(TCCheckBlock->getTerminator());
2476
2477 // Generate code to check if the loop's trip count is less than VF * UF, or
2478 // equal to it in case a scalar epilogue is required; this implies that the
2479 // vector trip count is zero. This check also covers the case where adding one
2480 // to the backedge-taken count overflowed, leading to an incorrect trip count
2481 // of zero. In this case we will also jump to the scalar loop.
2482 auto P = Cost->requiresScalarEpilogue(VF.isVector()) ? ICmpInst::ICMP_ULE
2484
2485 // If the tail is to be folded, the vector loop takes care of all iterations.
2486 Type *CountTy = Count->getType();
2487 Value *CheckMinIters = Builder.getFalse();
2488 auto CreateStep = [&]() -> Value * {
2489 // Create step with max(MinProfitableTripCount, UF * VF).
2491 return createStepForVF(Builder, CountTy, VF, UF);
2492
2493 Value *MinProfTC =
2495 if (!VF.isScalable())
2496 return MinProfTC;
2498 Intrinsic::umax, MinProfTC, createStepForVF(Builder, CountTy, VF, UF));
2499 };
2500
2501 TailFoldingStyle Style = Cost->getTailFoldingStyle();
2502 if (Style == TailFoldingStyle::None) {
2503 Value *Step = CreateStep();
2504 ScalarEvolution &SE = *PSE.getSE();
2505 // TODO: Emit unconditional branch to vector preheader instead of
2506 // conditional branch with known condition.
2507 const SCEV *TripCountSCEV = SE.applyLoopGuards(SE.getSCEV(Count), OrigLoop);
2508 // Check if the trip count is < the step.
2509 if (SE.isKnownPredicate(P, TripCountSCEV, SE.getSCEV(Step))) {
2510 // TODO: Ensure step is at most the trip count when determining max VF and
2511 // UF, w/o tail folding.
2512 CheckMinIters = Builder.getTrue();
2514 TripCountSCEV, SE.getSCEV(Step))) {
2515 // Generate the minimum iteration check only if we cannot prove the
2516 // check is known to be true, or known to be false.
2517 CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check");
2518 } // else step known to be < trip count, use CheckMinIters preset to false.
2519 } else if (VF.isScalable() &&
2522 // vscale is not necessarily a power-of-2, which means we cannot guarantee
2523 // an overflow to zero when updating induction variables and so an
2524 // additional overflow check is required before entering the vector loop.
2525
2526 // Get the maximum unsigned value for the type.
2527 Value *MaxUIntTripCount =
2528 ConstantInt::get(CountTy, cast<IntegerType>(CountTy)->getMask());
2529 Value *LHS = Builder.CreateSub(MaxUIntTripCount, Count);
2530
2531 // Don't execute the vector loop if (UMax - n) < (VF * UF).
2532 CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, CreateStep());
2533 }
2534
2535 // Create new preheader for vector loop.
2537 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
2538 "vector.ph");
2539
2540 assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
2541 DT->getNode(Bypass)->getIDom()) &&
2542 "TC check is expected to dominate Bypass");
2543
2544 BranchInst &BI =
2545 *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
2547 setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false);
2548 ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
2549 LoopBypassBlocks.push_back(TCCheckBlock);
2550
2551 // TODO: Wrap LoopVectorPreHeader in VPIRBasicBlock here.
2552 introduceCheckBlockInVPlan(TCCheckBlock);
2553}
2554
2556 BasicBlock *const SCEVCheckBlock =
2557 RTChecks.emitSCEVChecks(Bypass, LoopVectorPreHeader);
2558 if (!SCEVCheckBlock)
2559 return nullptr;
2560
2561 assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
2563 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
2564 "Cannot SCEV check stride or overflow when optimizing for size");
2565 assert(!LoopBypassBlocks.empty() &&
2566 "Should already be a bypass block due to iteration count check");
2567 LoopBypassBlocks.push_back(SCEVCheckBlock);
2568 AddedSafetyChecks = true;
2569
2570 introduceCheckBlockInVPlan(SCEVCheckBlock);
2571 return SCEVCheckBlock;
2572}
2573
2575 // VPlan-native path does not do any analysis for runtime checks currently.
2577 return nullptr;
2578
2579 BasicBlock *const MemCheckBlock =
2580 RTChecks.emitMemRuntimeChecks(Bypass, LoopVectorPreHeader);
2581
2582 // Check if we generated code that checks at runtime whether arrays overlap.
2583 // We put the checks into a separate block to make the more common case of few
2584 // elements faster.
2585 if (!MemCheckBlock)
2586 return nullptr;
2587
2588 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
2589 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
2590 "Cannot emit memory checks when optimizing for size, unless forced "
2591 "to vectorize.");
2592 ORE->emit([&]() {
2593 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
2596 << "Code-size may be reduced by not forcing "
2597 "vectorization, or by source-code modifications "
2598 "eliminating the need for runtime checks "
2599 "(e.g., adding 'restrict').";
2600 });
2601 }
2602
2603 LoopBypassBlocks.push_back(MemCheckBlock);
2604
2605 AddedSafetyChecks = true;
2606
2607 introduceCheckBlockInVPlan(MemCheckBlock);
2608 return MemCheckBlock;
2609}
2610
2611/// Replace \p VPBB with a VPIRBasicBlock wrapping \p IRBB. All recipes from \p
2612/// VPBB are moved to the end of the newly created VPIRBasicBlock. VPBB must
2613/// have a single predecessor, which is rewired to the new VPIRBasicBlock. All
2614/// successors of VPBB, if any, are rewired to the new VPIRBasicBlock.
2616 VPIRBasicBlock *IRVPBB = VPBB->getPlan()->createVPIRBasicBlock(IRBB);
2617 for (auto &R : make_early_inc_range(*VPBB)) {
2618 assert(!R.isPhi() && "Tried to move phi recipe to end of block");
2619 R.moveBefore(*IRVPBB, IRVPBB->end());
2620 }
2621
2622 VPBlockUtils::reassociateBlocks(VPBB, IRVPBB);
2623 // VPBB is now dead and will be cleaned up when the plan gets destroyed.
2624}
2625
2628 assert(LoopVectorPreHeader && "Invalid loop structure");
2630 Cost->requiresScalarEpilogue(VF.isVector())) &&
2631 "loops not exiting via the latch without required epilogue?");
2632
2635 LI, nullptr, Twine(Prefix) + "middle.block");
2639 nullptr, Twine(Prefix) + "scalar.ph");
2641}
2642
2643/// Return the expanded step for \p ID using \p ExpandedSCEVs to look up SCEV
2644/// expansion results.
2646 const SCEV2ValueTy &ExpandedSCEVs) {
2647 const SCEV *Step = ID.getStep();
2648 if (auto *C = dyn_cast<SCEVConstant>(Step))
2649 return C->getValue();
2650 if (auto *U = dyn_cast<SCEVUnknown>(Step))
2651 return U->getValue();
2652 auto I = ExpandedSCEVs.find(Step);
2653 assert(I != ExpandedSCEVs.end() && "SCEV must be expanded at this point");
2654 return I->second;
2655}
2656
2657/// Knowing that loop \p L executes a single vector iteration, add instructions
2658/// that will get simplified and thus should not have any cost to \p
2659/// InstsToIgnore.
2662 SmallPtrSetImpl<Instruction *> &InstsToIgnore) {
2663 auto *Cmp = L->getLatchCmpInst();
2664 if (Cmp)
2665 InstsToIgnore.insert(Cmp);
2666 for (const auto &KV : IL) {
2667 // Extract the key by hand so that it can be used in the lambda below. Note
2668 // that captured structured bindings are a C++20 extension.
2669 const PHINode *IV = KV.first;
2670
2671 // Get next iteration value of the induction variable.
2672 Instruction *IVInst =
2673 cast<Instruction>(IV->getIncomingValueForBlock(L->getLoopLatch()));
2674 if (all_of(IVInst->users(),
2675 [&](const User *U) { return U == IV || U == Cmp; }))
2676 InstsToIgnore.insert(IVInst);
2677 }
2678}
2679
2681 const SCEV2ValueTy &ExpandedSCEVs, Value *MainVectorTripCount) {
2682 assert(MainVectorTripCount && "Must have bypass information");
2683
2684 Instruction *OldInduction = Legal->getPrimaryInduction();
2685 IRBuilder<> BypassBuilder(getAdditionalBypassBlock(),
2686 getAdditionalBypassBlock()->getFirstInsertionPt());
2687 for (const auto &InductionEntry : Legal->getInductionVars()) {
2688 PHINode *OrigPhi = InductionEntry.first;
2689 const InductionDescriptor &II = InductionEntry.second;
2690 Value *Step = getExpandedStep(II, ExpandedSCEVs);
2691 // For the primary induction the additional bypass end value is known.
2692 // Otherwise it is computed.
2693 Value *EndValueFromAdditionalBypass = MainVectorTripCount;
2694 if (OrigPhi != OldInduction) {
2695 auto *BinOp = II.getInductionBinOp();
2696 // Fast-math-flags propagate from the original induction instruction.
2697 if (isa_and_nonnull<FPMathOperator>(BinOp))
2698 BypassBuilder.setFastMathFlags(BinOp->getFastMathFlags());
2699
2700 // Compute the end value for the additional bypass.
2701 EndValueFromAdditionalBypass =
2702 emitTransformedIndex(BypassBuilder, MainVectorTripCount,
2703 II.getStartValue(), Step, II.getKind(), BinOp);
2704 EndValueFromAdditionalBypass->setName("ind.end");
2705 }
2706
2707 // Store the bypass value here, as it needs to be added as an operand to its
2708 // scalar preheader phi node after the epilogue skeleton has been created.
2709 // TODO: Directly add as extra operand to the VPResumePHI recipe.
2710 assert(!Induction2AdditionalBypassValue.contains(OrigPhi) &&
2711 "entry for OrigPhi already exits");
2712 Induction2AdditionalBypassValue[OrigPhi] = EndValueFromAdditionalBypass;
2713 }
2714}
2715
2717 const SCEV2ValueTy &ExpandedSCEVs) {
2718 /*
2719 In this function we generate a new loop. The new loop will contain
2720 the vectorized instructions while the old loop will continue to run the
2721 scalar remainder.
2722
2723 [ ] <-- old preheader - loop iteration number check and SCEVs in Plan's
2724 / | preheader are expanded here. Eventually all required SCEV
2725 / | expansion should happen here.
2726 / v
2727 | [ ] <-- vector loop bypass (may consist of multiple blocks).
2728 | / |
2729 | / v
2730 || [ ] <-- vector pre header.
2731 |/ |
2732 | v
2733 | [ ] \
2734 | [ ]_| <-- vector loop (created during VPlan execution).
2735 | |
2736 | v
2737 \ -[ ] <--- middle-block (wrapped in VPIRBasicBlock with the branch to
2738 | | successors created during VPlan execution)
2739 \/ |
2740 /\ v
2741 | ->[ ] <--- new preheader (wrapped in VPIRBasicBlock).
2742 | |
2743 (opt) v <-- edge from middle to exit iff epilogue is not required.
2744 | [ ] \
2745 | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue, header
2746 | | wrapped in VPIRBasicBlock).
2747 \ |
2748 \ v
2749 >[ ] <-- exit block(s). (wrapped in VPIRBasicBlock)
2750 ...
2751 */
2752
2753 // Create an empty vector loop, and prepare basic blocks for the runtime
2754 // checks.
2756
2757 // Now, compare the new count to zero. If it is zero, skip the vector loop and
2758 // jump to the scalar loop. This check also covers the case where the
2759 // backedge-taken count is uint##_max: adding one to it will overflow, leading
2760 // to an incorrect trip count of zero. In this (rare) case we will also jump
2761 // to the scalar loop.
2763
2764 // Generate the code to check any assumptions that we've made for SCEV
2765 // expressions.
2767
2768 // Generate the code that checks at runtime whether arrays overlap. We put
2769 // the checks into a separate block to make the more common case of few
2770 // elements faster.
2772
2773 return LoopVectorPreHeader;
2774}
2775
2776namespace {
2777
2778struct CSEDenseMapInfo {
2779 static bool canHandle(const Instruction *I) {
2780 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
2781 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
2782 }
2783
2784 static inline Instruction *getEmptyKey() {
2786 }
2787
2788 static inline Instruction *getTombstoneKey() {
2790 }
2791
2792 static unsigned getHashValue(const Instruction *I) {
2793 assert(canHandle(I) && "Unknown instruction!");
2794 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
2795 I->value_op_end()));
2796 }
2797
2798 static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
2799 if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
2800 LHS == getTombstoneKey() || RHS == getTombstoneKey())
2801 return LHS == RHS;
2802 return LHS->isIdenticalTo(RHS);
2803 }
2804};
2805
2806} // end anonymous namespace
2807
2808 /// Perform CSE of induction variable instructions.
2809 static void cse(BasicBlock *BB) {
2810 // Perform simple CSE.
2812 for (Instruction &In : llvm::make_early_inc_range(*BB)) {
2813 if (!CSEDenseMapInfo::canHandle(&In))
2814 continue;
2815
2816 // Check if we can replace this instruction with any of the
2817 // visited instructions.
2818 if (Instruction *V = CSEMap.lookup(&In)) {
2819 In.replaceAllUsesWith(V);
2820 In.eraseFromParent();
2821 continue;
2822 }
2823
2824 CSEMap[&In] = &In;
2825 }
2826}
2827
2830 ElementCount VF) const {
2831 // We only need to calculate a cost if the VF is scalar; for actual vectors
2832 // we should already have a pre-calculated cost at each VF.
2833 if (!VF.isScalar())
2834 return CallWideningDecisions.at(std::make_pair(CI, VF)).Cost;
2835
2836 Type *RetTy = CI->getType();
2838 if (auto RedCost = getReductionPatternCost(CI, VF, RetTy))
2839 return *RedCost;
2840
2842 for (auto &ArgOp : CI->args())
2843 Tys.push_back(ArgOp->getType());
2844
2845 InstructionCost ScalarCallCost =
2847
2848 // If this is an intrinsic we may have a lower cost for it.
2850 InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
2851 return std::min(ScalarCallCost, IntrinsicCost);
2852 }
2853 return ScalarCallCost;
2854}
2855
2857 if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
2858 return Elt;
2859 return VectorType::get(Elt, VF);
2860}
2861
2864 ElementCount VF) const {
2866 assert(ID && "Expected intrinsic call!");
2867 Type *RetTy = maybeVectorizeType(CI->getType(), VF);
2868 FastMathFlags FMF;
2869 if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
2870 FMF = FPMO->getFastMathFlags();
2871
2874 SmallVector<Type *> ParamTys;
2875 std::transform(FTy->param_begin(), FTy->param_end(),
2876 std::back_inserter(ParamTys),
2877 [&](Type *Ty) { return maybeVectorizeType(Ty, VF); });
2878
2879 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
2880 dyn_cast<IntrinsicInst>(CI));
2881 return TTI.getIntrinsicInstrCost(CostAttrs, CostKind);
2882}
2883
2885 // Fix widened non-induction PHIs by setting up the PHI operands.
2887 fixNonInductionPHIs(State);
2888
2889 // Forget the original basic block.
2892
2893 // After vectorization, the exit blocks of the original loop will have
2894 // additional predecessors. Invalidate SCEVs for the exit phis in case SE
2895 // looked through single-entry phis.
2896 SmallVector<BasicBlock *> ExitBlocks;
2897 OrigLoop->getExitBlocks(ExitBlocks);
2898 for (BasicBlock *Exit : ExitBlocks)
2899 for (PHINode &PN : Exit->phis())
2901
2902 // Don't apply optimizations below when no vector region remains, as they all
2903 // require a vector loop at the moment.
2904 if (!State.Plan->getVectorLoopRegion())
2905 return;
2906
2908 sinkScalarOperands(&*PI);
2909
2910 VPRegionBlock *VectorRegion = State.Plan->getVectorLoopRegion();
2911 VPBasicBlock *HeaderVPBB = VectorRegion->getEntryBasicBlock();
2912 BasicBlock *HeaderBB = State.CFG.VPBB2IRBB[HeaderVPBB];
2913
2914 // Remove redundant induction instructions.
2915 cse(HeaderBB);
2916
2917 // Set/update profile weights for the vector and remainder loops as original
2918 // loop iterations are now distributed among them. Note that the original loop
2919 // becomes the scalar remainder loop after vectorization.
2920 //
2921 // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
2922 // end up with a slightly roughened result, but that should be OK since
2923 // profile is not inherently precise anyway. Note also possible bypass of
2924 // vector code caused by legality checks is ignored, assigning all the weight
2925 // to the vector loop, optimistically.
2926 //
2927 // For scalable vectorization we can't know at compile time how many
2928 // iterations of the loop are handled in one vector iteration, so instead
2929 // assume a pessimistic vscale of '1'.
2930 Loop *VectorLoop = LI->getLoopFor(HeaderBB);
2932 VF.getKnownMinValue() * UF);
2933}
2934
2936 // The basic block and loop containing the predicated instruction.
2937 auto *PredBB = PredInst->getParent();
2938 auto *VectorLoop = LI->getLoopFor(PredBB);
2939
2940 // Initialize a worklist with the operands of the predicated instruction.
2941 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
2942
2943 // Holds instructions that we need to analyze again. An instruction may be
2944 // reanalyzed if we don't yet know if we can sink it or not.
2945 SmallVector<Instruction *, 8> InstsToReanalyze;
2946
2947 // Returns true if a given use occurs in the predicated block. Phi nodes use
2948 // their operands in their corresponding predecessor blocks.
2949 auto IsBlockOfUsePredicated = [&](Use &U) -> bool {
2950 auto *I = cast<Instruction>(U.getUser());
2951 BasicBlock *BB = I->getParent();
2952 if (auto *Phi = dyn_cast<PHINode>(I))
2953 BB = Phi->getIncomingBlock(
2954 PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
2955 return BB == PredBB;
2956 };
2957
2958 // Iteratively sink the scalarized operands of the predicated instruction
2959 // into the block we created for it. When an instruction is sunk, its
2960 // operands are then added to the worklist. The algorithm terminates once a
2961 // full pass through the worklist sinks no further instructions.
2962 bool Changed;
2963 do {
2964 // Add the instructions that need to be reanalyzed to the worklist, and
2965 // reset the changed indicator.
2966 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
2967 InstsToReanalyze.clear();
2968 Changed = false;
2969
2970 while (!Worklist.empty()) {
2971 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
2972
2973 // We can't sink an instruction if it is a phi node, is not in the loop,
2974 // may have side effects or may read from memory.
2975 // TODO: Could do more granular checking to allow sinking
2976 // a load past non-store instructions.
2977 if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) ||
2978 I->mayHaveSideEffects() || I->mayReadFromMemory())
2979 continue;
2980
2981 // If the instruction is already in PredBB, check if we can sink its
2982 // operands. In that case, VPlan's sinkScalarOperands() succeeded in
2983 // sinking the scalar instruction I, hence it appears in PredBB; but it
2984 // may have failed to sink I's operands (recursively), which we try
2985 // (again) here.
2986 if (I->getParent() == PredBB) {
2987 Worklist.insert(I->op_begin(), I->op_end());
2988 continue;
2989 }
2990
2991 // It's legal to sink the instruction if all its uses occur in the
2992 // predicated block. Otherwise, there's nothing to do yet, and we may
2993 // need to reanalyze the instruction.
2994 if (!llvm::all_of(I->uses(), IsBlockOfUsePredicated)) {
2995 InstsToReanalyze.push_back(I);
2996 continue;
2997 }
2998
2999 // Move the instruction to the beginning of the predicated block, and add
3000 // its operands to the worklist.
3001 I->moveBefore(&*PredBB->getFirstInsertionPt());
3002 Worklist.insert(I->op_begin(), I->op_end());
3003
3004 // The sinking may have enabled other instructions to be sunk, so we will
3005 // need to iterate.
3006 Changed = true;
3007 }
3008 } while (Changed);
3009}
3010
3012 auto Iter = vp_depth_first_deep(Plan.getEntry());
3013 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
3014 for (VPRecipeBase &P : VPBB->phis()) {
3015 VPWidenPHIRecipe *VPPhi = dyn_cast<VPWidenPHIRecipe>(&P);
3016 if (!VPPhi)
3017 continue;
3018 PHINode *NewPhi = cast<PHINode>(State.get(VPPhi));
3019 // Make sure the builder has a valid insert point.
3020 Builder.SetInsertPoint(NewPhi);
3021 for (unsigned Idx = 0; Idx < VPPhi->getNumOperands(); ++Idx) {
3022 VPValue *Inc = VPPhi->getIncomingValue(Idx);
3023 VPBasicBlock *VPBB = VPPhi->getIncomingBlock(Idx);
3024 NewPhi->addIncoming(State.get(Inc), State.CFG.VPBB2IRBB[VPBB]);
3025 }
3026 }
3027 }
3028}
3029
3030void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
3031 // We should not collect Scalars more than once per VF. Right now, this
3032 // function is called from collectUniformsAndScalars(), which already does
3033 // this check. Collecting Scalars for VF=1 does not make any sense.
3034 assert(VF.isVector() && !Scalars.contains(VF) &&
3035 "This function should not be visited twice for the same VF");
3036
3037 // This avoids any chances of creating a REPLICATE recipe during planning
3038 // since that would result in generation of scalarized code during execution,
3039 // which is not supported for scalable vectors.
3040 if (VF.isScalable()) {
3041 Scalars[VF].insert(Uniforms[VF].begin(), Uniforms[VF].end());
3042 return;
3043 }
3044
3046
3047 // These sets are used to seed the analysis with pointers used by memory
3048 // accesses that will remain scalar.
3050 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
3051 auto *Latch = TheLoop->getLoopLatch();
3052
3053 // A helper that returns true if the use of Ptr by MemAccess will be scalar.
3054 // The pointer operands of loads and stores will be scalar as long as the
3055 // memory access is not a gather or scatter operation. The value operand of a
3056 // store will remain scalar if the store is scalarized.
3057 auto IsScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
3058 InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
3059 assert(WideningDecision != CM_Unknown &&
3060 "Widening decision should be ready at this moment");
3061 if (auto *Store = dyn_cast<StoreInst>(MemAccess))
3062 if (Ptr == Store->getValueOperand())
3063 return WideningDecision == CM_Scalarize;
3064 assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
3065 "Ptr is neither a value or pointer operand");
3066 return WideningDecision != CM_GatherScatter;
3067 };
3068
3069 // A helper that returns true if the given value is a getelementptr
3070 // instruction contained in the loop.
3071 auto IsLoopVaryingGEP = [&](Value *V) {
3072 return isa<GetElementPtrInst>(V) && !TheLoop->isLoopInvariant(V);
3073 };
3074
3075 // A helper that evaluates a memory access's use of a pointer. If the use will
3076 // be a scalar use and the pointer is only used by memory accesses, we place
3077 // the pointer in ScalarPtrs. Otherwise, the pointer is placed in
3078 // PossibleNonScalarPtrs.
3079 auto EvaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
3080 // We only care about getelementptr instructions contained in the loop whose
3081 // result is not loop-invariant.
3082 if (!IsLoopVaryingGEP(Ptr))
3083 return;
3084
3085 // If the pointer has already been identified as scalar (e.g., if it was
3086 // also identified as uniform), there's nothing to do.
3087 auto *I = cast<Instruction>(Ptr);
3088 if (Worklist.count(I))
3089 return;
3090
3091 // If the use of the pointer will be a scalar use, and all users of the
3092 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
3093 // place the pointer in PossibleNonScalarPtrs.
3094 if (IsScalarUse(MemAccess, Ptr) &&
3095 all_of(I->users(), IsaPred<LoadInst, StoreInst>))
3096 ScalarPtrs.insert(I);
3097 else
3098 PossibleNonScalarPtrs.insert(I);
3099 };
3100
3101 // We seed the scalars analysis with two classes of instructions: (1)
3102 // instructions marked uniform-after-vectorization and (2) bitcast,
3103 // getelementptr and (pointer) phi instructions used by memory accesses
3104 // requiring a scalar use.
3105 //
3106 // (1) Add to the worklist all instructions that have been identified as
3107 // uniform-after-vectorization.
3108 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
3109
3110 // (2) Add to the worklist all bitcast and getelementptr instructions used by
3111 // memory accesses requiring a scalar use. The pointer operands of loads and
3112 // stores will be scalar unless the operation is a gather or scatter.
3113 // The value operand of a store will remain scalar if the store is scalarized.
3114 for (auto *BB : TheLoop->blocks())
3115 for (auto &I : *BB) {
3116 if (auto *Load = dyn_cast<LoadInst>(&I)) {
3117 EvaluatePtrUse(Load, Load->getPointerOperand());
3118 } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
3119 EvaluatePtrUse(Store, Store->getPointerOperand());
3120 EvaluatePtrUse(Store, Store->getValueOperand());
3121 }
3122 }
3123 for (auto *I : ScalarPtrs)
3124 if (!PossibleNonScalarPtrs.count(I)) {
3125 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
3126 Worklist.insert(I);
3127 }
3128
3129 // Insert the forced scalars.
3130 // FIXME: Currently VPWidenPHIRecipe() often creates a dead vector
3131 // induction variable when the PHI user is scalarized.
3132 auto ForcedScalar = ForcedScalars.find(VF);
3133 if (ForcedScalar != ForcedScalars.end())
3134 for (auto *I : ForcedScalar->second) {
3135 LLVM_DEBUG(dbgs() << "LV: Found (forced) scalar instruction: " << *I << "\n");
3136 Worklist.insert(I);
3137 }
3138
3139 // Expand the worklist by looking through any bitcasts and getelementptr
3140 // instructions we've already identified as scalar. This is similar to the
3141 // expansion step in collectLoopUniforms(); however, here we're only
3142 // expanding to include additional bitcasts and getelementptr instructions.
3143 unsigned Idx = 0;
3144 while (Idx != Worklist.size()) {
3145 Instruction *Dst = Worklist[Idx++];
3146 if (!IsLoopVaryingGEP(Dst->getOperand(0)))
3147 continue;
3148 auto *Src = cast<Instruction>(Dst->getOperand(0));
3149 if (llvm::all_of(Src->users(), [&](User *U) -> bool {
3150 auto *J = cast<Instruction>(U);
3151 return !TheLoop->contains(J) || Worklist.count(J) ||
3152 ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
3153 IsScalarUse(J, Src));
3154 })) {
3155 Worklist.insert(Src);
3156 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
3157 }
3158 }
3159
3160 // An induction variable will remain scalar if all users of the induction
3161 // variable and induction variable update remain scalar.
3162 for (const auto &Induction : Legal->getInductionVars()) {
3163 auto *Ind = Induction.first;
3164 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
3165
3166 // If tail-folding is applied, the primary induction variable will be used
3167 // to feed a vector compare.
3168 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
3169 continue;
3170
3171 // Returns true if \p Indvar is a pointer induction that is used directly by
3172 // load/store instruction \p I.
3173 auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar,
3174 Instruction *I) {
3175 return Induction.second.getKind() ==
3177 (isa<LoadInst>(I) || isa<StoreInst>(I)) &&
3178 Indvar == getLoadStorePointerOperand(I) && IsScalarUse(I, Indvar);
3179 };
3180
3181 // Determine if all users of the induction variable are scalar after
3182 // vectorization.
3183 bool ScalarInd = all_of(Ind->users(), [&](User *U) -> bool {
3184 auto *I = cast<Instruction>(U);
3185 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
3186 IsDirectLoadStoreFromPtrIndvar(Ind, I);
3187 });
3188 if (!ScalarInd)
3189 continue;
3190
3191 // If the induction variable update is a fixed-order recurrence, neither the
3192 // induction variable nor its update should be marked scalar after
3193 // vectorization.
3194 auto *IndUpdatePhi = dyn_cast<PHINode>(IndUpdate);
3195 if (IndUpdatePhi && Legal->isFixedOrderRecurrence(IndUpdatePhi))
3196 continue;
3197
3198 // Determine if all users of the induction variable update instruction are
3199 // scalar after vectorization.
3200 bool ScalarIndUpdate = all_of(IndUpdate->users(), [&](User *U) -> bool {
3201 auto *I = cast<Instruction>(U);
3202 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
3203 IsDirectLoadStoreFromPtrIndvar(IndUpdate, I);
3204 });
3205 if (!ScalarIndUpdate)
3206 continue;
3207
3208 // The induction variable and its update instruction will remain scalar.
3209 Worklist.insert(Ind);
3210 Worklist.insert(IndUpdate);
3211 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
3212 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
3213 << "\n");
3214 }
3215
3216 Scalars[VF].insert(Worklist.begin(), Worklist.end());
3217}
3218
3220 Instruction *I, ElementCount VF) const {
3221 if (!isPredicatedInst(I))
3222 return false;
3223
3224 // Do we have a non-scalar lowering for this predicated
3225 // instruction? If not, it is scalar with predication.
3226 switch(I->getOpcode()) {
3227 default:
3228 return true;
3229 case Instruction::Call:
3230 if (VF.isScalar())
3231 return true;
3232 return CallWideningDecisions.at(std::make_pair(cast<CallInst>(I), VF))
3233 .Kind == CM_Scalarize;
3234 case Instruction::Load:
3235 case Instruction::Store: {
3237 auto *Ty = getLoadStoreType(I);
3238 Type *VTy = Ty;
3239 if (VF.isVector())
3240 VTy = VectorType::get(Ty, VF);
3241 const Align Alignment = getLoadStoreAlignment(I);
3242 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
3243 TTI.isLegalMaskedGather(VTy, Alignment))
3244 : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
3245 TTI.isLegalMaskedScatter(VTy, Alignment));
3246 }
3247 case Instruction::UDiv:
3248 case Instruction::SDiv:
3249 case Instruction::SRem:
3250 case Instruction::URem: {
3251 // We have the option to use the safe-divisor idiom to avoid predication.
3252 // The cost based decision here will always select safe-divisor for
3253 // scalable vectors as scalarization isn't legal.
3254 const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
3255 return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost);
3256 }
3257 }
3258}
3259
3260// TODO: Fold into LoopVectorizationLegality::isMaskRequired.
3262 // If predication is not needed, avoid it.
3263 // TODO: We can use the loop-preheader as context point here and get
3264 // context sensitive reasoning for isSafeToSpeculativelyExecute.
3265 if (!blockNeedsPredicationForAnyReason(I->getParent()) ||
3267 (isa<LoadInst, StoreInst, CallInst>(I) && !Legal->isMaskRequired(I)) ||
3268 isa<BranchInst, SwitchInst, PHINode, AllocaInst>(I))
3269 return false;
3270
3271 // If the instruction was executed conditionally in the original scalar loop,
3272 // predication is needed with a mask whose lanes are all possibly inactive.
3273 if (Legal->blockNeedsPredication(I->getParent()))
3274 return true;
3275
3276 // All that remain are instructions with side-effects originally executed in
3277 // the loop unconditionally, but now execute under a tail-fold mask (only)
3278 // having at least one active lane (the first). If the side-effects of the
3279 // instruction are invariant, executing it without the tail-folding mask is
3280 // safe - it will cause the same side-effects as when masked.
3281 switch(I->getOpcode()) {
3282 default:
3284 "instruction should have been considered by earlier checks");
3285 case Instruction::Call:
3286 // Side-effects of a Call are assumed to be non-invariant, needing a
3287 // (fold-tail) mask.
3289 "should have returned earlier for calls not needing a mask");
3290 return true;
3291 case Instruction::Load:
3292 // If the address is loop invariant no predication is needed.
3294 case Instruction::Store: {
3295 // For stores, we need to prove both speculation safety (which follows from
3296 // the same argument as loads) and that the value being stored is correct.
3297 // The easiest form of the latter is to require that all values stored are
3298 // the same.
3300 TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand()));
3301 }
3302 case Instruction::UDiv:
3303 case Instruction::SDiv:
3304 case Instruction::SRem:
3305 case Instruction::URem:
3306 // If the divisor is loop-invariant no predication is needed.
3307 return !TheLoop->isLoopInvariant(I->getOperand(1));
3308 }
3309}
3310
3311std::pair<InstructionCost, InstructionCost>
3313 ElementCount VF) const {
3314 assert(I->getOpcode() == Instruction::UDiv ||
3315 I->getOpcode() == Instruction::SDiv ||
3316 I->getOpcode() == Instruction::SRem ||
3317 I->getOpcode() == Instruction::URem);
3319
3320 // Scalarization isn't legal for scalable vector types
3321 InstructionCost ScalarizationCost = InstructionCost::getInvalid();
3322 if (!VF.isScalable()) {
3323 // Get the scalarization cost and scale this amount by the probability of
3324 // executing the predicated block. If the instruction is not predicated,
3325 // we fall through to the next case.
3326 ScalarizationCost = 0;
3327
3328 // These instructions have a non-void type, so account for the phi nodes
3329 // that we will create. This cost is likely to be zero. The phi node
3330 // cost, if any, should be scaled by the block probability because it
3331 // models a copy at the end of each predicated block.
3332 ScalarizationCost += VF.getKnownMinValue() *
3333 TTI.getCFInstrCost(Instruction::PHI, CostKind);
3334
3335 // The cost of the non-predicated instruction.
3336 ScalarizationCost += VF.getKnownMinValue() *
3337 TTI.getArithmeticInstrCost(I->getOpcode(), I->getType(), CostKind);
3338
3339 // The cost of insertelement and extractelement instructions needed for
3340 // scalarization.
3341 ScalarizationCost += getScalarizationOverhead(I, VF);
3342
3343 // Scale the cost by the probability of executing the predicated blocks.
3344 // This assumes the predicated block for each vector lane is equally
3345 // likely.
3346 ScalarizationCost = ScalarizationCost / getReciprocalPredBlockProb();
3347 }
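  // For intuition, with hypothetical costs at VF = 4 (PHI cost 0, scalar
  // divide cost 20, insert/extract overhead 8) the estimate is
  // (4*0 + 4*20 + 8) / 2 = 44, where the divide by getReciprocalPredBlockProb()
  // models the assumption that each predicated block executes about half the
  // time.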
3348 InstructionCost SafeDivisorCost = 0;
3349
3350 auto *VecTy = toVectorTy(I->getType(), VF);
3351
3352 // The cost of the select guard to ensure all lanes are well defined
3353 // after we speculate above any internal control flow.
3354 SafeDivisorCost +=
3355 TTI.getCmpSelInstrCost(Instruction::Select, VecTy,
3356 toVectorTy(Type::getInt1Ty(I->getContext()), VF),
3358
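  // Conceptually (illustrative IR, not emitted by this function), the
  // safe-divisor idiom turns a predicated
  //   %r = udiv <4 x i32> %a, %b
  // into
  //   %d = select <4 x i1> %mask, <4 x i32> %b, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  //   %r = udiv <4 x i32> %a, %d
  // so masked-off lanes divide by 1 and cannot trap; the select is the guard
  // whose cost was just added above.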
3359 // Certain instructions can be cheaper to vectorize if they have a constant
3360 // second vector operand. One example of this is shifts on x86.
3361 Value *Op2 = I->getOperand(1);
3362 auto Op2Info = TTI.getOperandInfo(Op2);
3363 if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
3364 Legal->isInvariant(Op2))
3366
3367 SmallVector<const Value *, 4> Operands(I->operand_values());
3368 SafeDivisorCost += TTI.getArithmeticInstrCost(
3369 I->getOpcode(), VecTy, CostKind,
3370 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
3371 Op2Info, Operands, I);
3372 return {ScalarizationCost, SafeDivisorCost};
3373}
3374
3376 Instruction *I, ElementCount VF) const {
3377 assert(isAccessInterleaved(I) && "Expecting interleaved access.");
3379 "Decision should not be set yet.");
3380 auto *Group = getInterleavedAccessGroup(I);
3381 assert(Group && "Must have a group.");
3382 unsigned InterleaveFactor = Group->getFactor();
3383
3384 // If the instruction's allocated size doesn't equal its type size, it
3385 // requires padding and will be scalarized.
3386 auto &DL = I->getDataLayout();
3387 auto *ScalarTy = getLoadStoreType(I);
3388 if (hasIrregularType(ScalarTy, DL))
3389 return false;
3390
3391 // We currently only know how to emit interleave/deinterleave with
3392 // Factor=2 for scalable vectors. This is purely an implementation
3393 // limit.
3394 if (VF.isScalable() && InterleaveFactor != 2)
3395 return false;
3396
3397 // If the group involves a non-integral pointer, we may not be able to
3398 // losslessly cast all values to a common type.
3399 bool ScalarNI = DL.isNonIntegralPointerType(ScalarTy);
3400 for (unsigned Idx = 0; Idx < InterleaveFactor; Idx++) {
3401 Instruction *Member = Group->getMember(Idx);
3402 if (!Member)
3403 continue;
3404 auto *MemberTy = getLoadStoreType(Member);
3405 bool MemberNI = DL.isNonIntegralPointerType(MemberTy);
3406 // Don't coerce non-integral pointers to integers or vice versa.
3407 if (MemberNI != ScalarNI)
3408 // TODO: Consider adding special nullptr value case here
3409 return false;
3410 if (MemberNI && ScalarNI &&
3411 ScalarTy->getPointerAddressSpace() !=
3412 MemberTy->getPointerAddressSpace())
3413 return false;
3414 }
3415
3416 // Check if masking is required.
3417 // A Group may need masking for one of two reasons: it resides in a block that
3418 // needs predication, or it was decided to use masking to deal with gaps
3419 // (either a gap at the end of a load-access that may result in a speculative
3420 // load, or any gaps in a store-access).
3421 bool PredicatedAccessRequiresMasking =
3422 blockNeedsPredicationForAnyReason(I->getParent()) &&
3424 bool LoadAccessWithGapsRequiresEpilogMasking =
3425 isa<LoadInst>(I) && Group->requiresScalarEpilogue() &&
3427 bool StoreAccessWithGapsRequiresMasking =
3428 isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor());
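  // For example (illustrative), a group that stores only to A[3*i] and
  // A[3*i+1] has Factor == 3 but just two members; the wide store would
  // clobber the untouched third element unless the gap lanes are masked off.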
3429 if (!PredicatedAccessRequiresMasking &&
3430 !LoadAccessWithGapsRequiresEpilogMasking &&
3431 !StoreAccessWithGapsRequiresMasking)
3432 return true;
3433
3434 // If masked interleaving is required, we expect that the user/target had
3435 // enabled it, because otherwise it either wouldn't have been created or
3436 // it should have been invalidated by the CostModel.
3438 "Masked interleave-groups for predicated accesses are not enabled.");
3439
3440 if (Group->isReverse())
3441 return false;
3442
3443 auto *Ty = getLoadStoreType(I);
3444 const Align Alignment = getLoadStoreAlignment(I);
3445 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
3446 : TTI.isLegalMaskedStore(Ty, Alignment);
3447}
3448
3450 Instruction *I, ElementCount VF) {
3451 // Get and ensure we have a valid memory instruction.
3452 assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");
3453
3455 auto *ScalarTy = getLoadStoreType(I);
3456
3457 // In order to be widened, the pointer should be consecutive, first of all.
3458 if (!Legal->isConsecutivePtr(ScalarTy, Ptr))
3459 return false;
3460
3461 // If the instruction is a store located in a predicated block, it will be
3462 // scalarized.
3463 if (isScalarWithPredication(I, VF))
3464 return false;
3465
3466 // If the instruction's allocated size doesn't equal its type size, it
3467 // requires padding and will be scalarized.
3468 auto &DL = I->getDataLayout();
3469 if (hasIrregularType(ScalarTy, DL))
3470 return false;
3471
3472 return true;
3473}
3474
3475void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
3476 // We should not collect Uniforms more than once per VF. Right now,
3477 // this function is called from collectUniformsAndScalars(), which
3478 // already does this check. Collecting Uniforms for VF=1 does not make any
3479 // sense.
3480
3481 assert(VF.isVector() && !Uniforms.contains(VF) &&
3482 "This function should not be visited twice for the same VF");
3483
3484 // Clear the entry for this VF up front: even if no uniform values are
3485 // found, Uniforms.count(VF) will return 1 and we won't analyze again.
3486 Uniforms[VF].clear();
3487
3488 // Now we know that the loop is vectorizable!
3489 // Collect instructions inside the loop that will remain uniform after
3490 // vectorization.
3491
3492 // Global values, params and instructions outside of current loop are out of
3493 // scope.
3494 auto IsOutOfScope = [&](Value *V) -> bool {
3495 Instruction *I = dyn_cast<Instruction>(V);
3496 return (!I || !TheLoop->contains(I));
3497 };
3498
3499 // Worklist containing uniform instructions demanding lane 0.
3500 SetVector<Instruction *> Worklist;
3501
3502 // Add uniform instructions demanding lane 0 to the worklist. Instructions
3503 // that require predication must not be considered uniform after
3504 // vectorization, because that would create an erroneous replicating region
3505 // where only a single instance out of VF should be formed.
3506 auto AddToWorklistIfAllowed = [&](Instruction *I) -> void {
3507 if (IsOutOfScope(I)) {
3508 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
3509 << *I << "\n");
3510 return;
3511 }
3512 if (isPredicatedInst(I)) {
3513 LLVM_DEBUG(
3514 dbgs() << "LV: Found not uniform due to requiring predication: " << *I
3515 << "\n");
3516 return;
3517 }
3518 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
3519 Worklist.insert(I);
3520 };
3521
3522 // Start with the conditional branches exiting the loop. If the branch
3523 // condition is an instruction contained in the loop that is only used by the
3524 // branch, it is uniform. Note conditions from uncountable early exits are not
3525 // uniform.
3527 TheLoop->getExitingBlocks(Exiting);
3528 for (BasicBlock *E : Exiting) {
3530 continue;
3531 auto *Cmp = dyn_cast<Instruction>(E->getTerminator()->getOperand(0));
3532 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
3533 AddToWorklistIfAllowed(Cmp);
3534 }
3535
3536 auto PrevVF = VF.divideCoefficientBy(2);
3537 // Return true if all lanes perform the same memory operation, and we can
3538 // thus choose to execute only one.
3539 auto IsUniformMemOpUse = [&](Instruction *I) {
3540 // If the value was already known to not be uniform for the previous
3541 // (smaller VF), it cannot be uniform for the larger VF.
3542 if (PrevVF.isVector()) {
3543 auto Iter = Uniforms.find(PrevVF);
3544 if (Iter != Uniforms.end() && !Iter->second.contains(I))
3545 return false;
3546 }
3547 if (!Legal->isUniformMemOp(*I, VF))
3548 return false;
3549 if (isa<LoadInst>(I))
3550 // Loading the same address always produces the same result - at least
3551 // assuming aliasing and ordering which have already been checked.
3552 return true;
3553 // Storing the same value on every iteration.
3554 return TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand());
3555 };
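  // For example (illustrative), a load from a loop-invariant address in
  //   for (i = 0; i < n; ++i) sum += *p;
  // or a store of a loop-invariant value to a loop-invariant address performs
  // the same operation on every lane, so executing it once (lane 0) suffices.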
3556
3557 auto IsUniformDecision = [&](Instruction *I, ElementCount VF) {
3558 InstWidening WideningDecision = getWideningDecision(I, VF);
3559 assert(WideningDecision != CM_Unknown &&
3560 "Widening decision should be ready at this moment");
3561
3562 if (IsUniformMemOpUse(I))
3563 return true;
3564
3565 return (WideningDecision == CM_Widen ||
3566 WideningDecision == CM_Widen_Reverse ||
3567 WideningDecision == CM_Interleave);
3568 };
3569
3570 // Returns true if Ptr is the pointer operand of a memory access instruction
3571 // I, I is known to not require scalarization, and the pointer is not also
3572 // stored.
3573 auto IsVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
3574 if (isa<StoreInst>(I) && I->getOperand(0) == Ptr)
3575 return false;
3576 return getLoadStorePointerOperand(I) == Ptr &&
3577 (IsUniformDecision(I, VF) || Legal->isInvariant(Ptr));
3578 };
3579
3580 // Holds a list of values which are known to have at least one uniform use.
3581 // Note that there may be other uses which aren't uniform. A "uniform use"
3582 // here is something which only demands lane 0 of the unrolled iterations;
3583 // it does not imply that all lanes produce the same value (e.g. this is not
3584 // the usual meaning of uniform).
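  // For example, the address feeding a consecutive (widened) load is only
  // demanded for lane 0 - the wide load is formed from the first lane's
  // address - even though each lane would nominally compute its own address.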
3585 SetVector<Value *> HasUniformUse;
3586
3587 // Scan the loop for instructions which are either a) known to have only
3588 // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
3589 for (auto *BB : TheLoop->blocks())
3590 for (auto &I : *BB) {
3591 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) {
3592 switch (II->getIntrinsicID()) {
3593 case Intrinsic::sideeffect:
3594 case Intrinsic::experimental_noalias_scope_decl:
3595 case Intrinsic::assume:
3596 case Intrinsic::lifetime_start:
3597 case Intrinsic::lifetime_end:
3599 AddToWorklistIfAllowed(&I);
3600 break;
3601 default:
3602 break;
3603 }
3604 }
3605
3606 // ExtractValue instructions must be uniform, because the operands are
3607 // known to be loop-invariant.
3608 if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) {
3609 assert(IsOutOfScope(EVI->getAggregateOperand()) &&
3610 "Expected aggregate value to be loop invariant");
3611 AddToWorklistIfAllowed(EVI);
3612 continue;
3613 }
3614
3615 // If there's no pointer operand, there's nothing to do.
3617 if (!Ptr)
3618 continue;
3619
3620 if (IsUniformMemOpUse(&I))
3621 AddToWorklistIfAllowed(&I);
3622
3623 if (IsVectorizedMemAccessUse(&I, Ptr))
3624 HasUniformUse.insert(Ptr);
3625 }
3626
3627 // Add to the worklist any operands which have *only* uniform (e.g. lane 0
3628 // demanding) users. Since loops are assumed to be in LCSSA form, this
3629 // disallows uses outside the loop as well.
3630 for (auto *V : HasUniformUse) {
3631 if (IsOutOfScope(V))
3632 continue;
3633 auto *I = cast<Instruction>(V);
3634 bool UsersAreMemAccesses = all_of(I->users(), [&](User *U) -> bool {
3635 auto *UI = cast<Instruction>(U);
3636 return TheLoop->contains(UI) && IsVectorizedMemAccessUse(UI, V);
3637 });
3638 if (UsersAreMemAccesses)
3639 AddToWorklistIfAllowed(I);
3640 }
3641
3642 // Expand Worklist in topological order: whenever a new instruction
3643 // is added, its users should already be inside Worklist. This ensures
3644 // that a uniform instruction will only be used by uniform instructions.
3645 unsigned Idx = 0;
3646 while (Idx != Worklist.size()) {
3647 Instruction *I = Worklist[Idx++];
3648
3649 for (auto *OV : I->operand_values()) {
3650 // Out-of-scope operands cannot be uniform instructions.
3651 if (IsOutOfScope(OV))
3652 continue;
3653 // First-order recurrence phis should typically be considered
3654 // non-uniform.
3655 auto *OP = dyn_cast<PHINode>(OV);
3657 continue;
3658 // If all the users of the operand are uniform, then add the
3659 // operand into the uniform worklist.
3660 auto *OI = cast<Instruction>(OV);
3661 if (llvm::all_of(OI->users(), [&](User *U) -> bool {
3662 auto *J = cast<Instruction>(U);
3663 return Worklist.count(J) || IsVectorizedMemAccessUse(J, OI);
3664 }))
3665 AddToWorklistIfAllowed(OI);
3666 }
3667 }
3668
3669 // For an instruction to be added into Worklist above, all its users inside
3670 // the loop should also be in Worklist. However, this condition cannot be
3671 // true for phi nodes that form a cyclic dependence. We must process phi
3672 // nodes separately. An induction variable will remain uniform if all users
3673 // of the induction variable and induction variable update remain uniform.
3674 // The code below handles both pointer and non-pointer induction variables.
3675 BasicBlock *Latch = TheLoop->getLoopLatch();
3676 for (const auto &Induction : Legal->getInductionVars()) {
3677 auto *Ind = Induction.first;
3678 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
3679
3680 // Determine if all users of the induction variable are uniform after
3681 // vectorization.
3682 bool UniformInd = all_of(Ind->users(), [&](User *U) -> bool {
3683 auto *I = cast<Instruction>(U);
3684 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
3685 IsVectorizedMemAccessUse(I, Ind);
3686 });
3687 if (!UniformInd)
3688 continue;
3689
3690 // Determine if all users of the induction variable update instruction are
3691 // uniform after vectorization.
3692 bool UniformIndUpdate = all_of(IndUpdate->users(), [&](User *U) -> bool {
3693 auto *I = cast<Instruction>(U);
3694 return I == Ind || Worklist.count(I) ||
3695 IsVectorizedMemAccessUse(I, IndUpdate);
3696 });
3697 if (!UniformIndUpdate)
3698 continue;
3699
3700 // The induction variable and its update instruction will remain uniform.
3701 AddToWorklistIfAllowed(Ind);
3702 AddToWorklistIfAllowed(IndUpdate);
3703 }
3704
3705 Uniforms[VF].insert(Worklist.begin(), Worklist.end());
3706}
3707
3709 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
3710
3712 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
3713 "runtime pointer checks needed. Enable vectorization of this "
3714 "loop with '#pragma clang loop vectorize(enable)' when "
3715 "compiling with -Os/-Oz",
3716 "CantVersionLoopWithOptForSize", ORE, TheLoop);
3717 return true;
3718 }
3719
3720 if (!PSE.getPredicate().isAlwaysTrue()) {
3721 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
3722 "runtime SCEV checks needed. Enable vectorization of this "
3723 "loop with '#pragma clang loop vectorize(enable)' when "
3724 "compiling with -Os/-Oz",
3725 "CantVersionLoopWithOptForSize", ORE, TheLoop);
3726 return true;
3727 }
3728
3729 // FIXME: Avoid specializing for stride==1 instead of bailing out.
3730 if (!Legal->getLAI()->getSymbolicStrides().empty()) {
3731 reportVectorizationFailure("Runtime stride check for small trip count",
3732 "runtime stride == 1 checks needed. Enable vectorization of "
3733 "this loop without such check by compiling with -Os/-Oz",
3734 "CantVersionLoopWithOptForSize", ORE, TheLoop);
3735 return true;
3736 }
3737
3738 return false;
3739}
3740
3741bool LoopVectorizationCostModel::isScalableVectorizationAllowed() {
3742 if (IsScalableVectorizationAllowed)
3743 return *IsScalableVectorizationAllowed;
3744
3745 IsScalableVectorizationAllowed = false;
3747 return false;
3748
3750 reportVectorizationInfo("Scalable vectorization is explicitly disabled",
3751 "ScalableVectorizationDisabled", ORE, TheLoop);
3752 return false;
3753 }
3754
3755 LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n");
3756
3757 auto MaxScalableVF = ElementCount::getScalable(
3758 std::numeric_limits<ElementCount::ScalarTy>::max());
3759
3760 // Test that the loop-vectorizer can legalize all operations for this MaxVF.
3761 // FIXME: While for scalable vectors this is currently sufficient, this should
3762 // be replaced by a more detailed mechanism that filters out specific VFs,
3763 // instead of invalidating vectorization for a whole set of VFs based on the
3764 // MaxVF.
3765
3766 // Disable scalable vectorization if the loop contains unsupported reductions.
3767 if (!canVectorizeReductions(MaxScalableVF)) {
3769 "Scalable vectorization not supported for the reduction "
3770 "operations found in this loop.",
3771 "ScalableVFUnfeasible", ORE, TheLoop);
3772 return false;
3773 }
3774
3775 // Disable scalable vectorization if the loop contains any instructions
3776 // with element types not supported for scalable vectors.
3777 if (any_of(ElementTypesInLoop, [&](Type *Ty) {
3778 return !Ty->isVoidTy() &&
3780 })) {
3781 reportVectorizationInfo("Scalable vectorization is not supported "
3782 "for all element types found in this loop.",
3783 "ScalableVFUnfeasible", ORE, TheLoop);
3784 return false;
3785 }
3786
3788 reportVectorizationInfo("The target does not provide maximum vscale value "
3789 "for safe distance analysis.",
3790 "ScalableVFUnfeasible", ORE, TheLoop);
3791 return false;
3792 }
3793
3794 IsScalableVectorizationAllowed = true;
3795 return true;
3796}
3797
3799LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
3800 if (!isScalableVectorizationAllowed())
3801 return ElementCount::getScalable(0);
3802
3803 auto MaxScalableVF = ElementCount::getScalable(
3804 std::numeric_limits<ElementCount::ScalarTy>::max());
3806 return MaxScalableVF;
3807
3808 std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI);
3809 // Limit MaxScalableVF by the maximum safe dependence distance.
3810 MaxScalableVF = ElementCount::getScalable(MaxSafeElements / *MaxVScale);
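  // For example, with MaxSafeElements == 1024 and a maximum vscale of 16, the
  // largest safe scalable VF is vscale x 64: at runtime it may cover at most
  // 16 * 64 = 1024 elements, which still respects the dependence distance.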
3811
3812 if (!MaxScalableVF)
3814 "Max legal vector width too small, scalable vectorization "
3815 "unfeasible.",
3816 "ScalableVFUnfeasible", ORE, TheLoop);
3817
3818 return MaxScalableVF;
3819}
3820
3821FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
3822 unsigned MaxTripCount, ElementCount UserVF, bool FoldTailByMasking) {
3824 unsigned SmallestType, WidestType;
3825 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
3826
3827 // Get the maximum safe dependence distance in bits computed by LAA.
3828 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
3829 // the memory access that is most restrictive (involved in the smallest
3830 // dependence distance).
3831 unsigned MaxSafeElements =
3833
3834 auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements);
3835 auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements);
3837 this->MaxSafeElements = MaxSafeElements;
3838
3839 LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
3840 << ".\n");
3841 LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF
3842 << ".\n");
3843
3844 // First analyze the UserVF, fall back if the UserVF should be ignored.
3845 if (UserVF) {
3846 auto MaxSafeUserVF =
3847 UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;
3848
3849 if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) {
3850 // If `VF=vscale x N` is safe, then so is `VF=N`
3851 if (UserVF.isScalable())
3852 return FixedScalableVFPair(
3853 ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF);
3854
3855 return UserVF;
3856 }
3857
3858 assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));
3859
3860 // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it
3861 // is better to ignore the hint and let the compiler choose a suitable VF.
3862 if (!UserVF.isScalable()) {
3863 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
3864 << " is unsafe, clamping to max safe VF="
3865 << MaxSafeFixedVF << ".\n");
3866 ORE->emit([&]() {
3867 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
3869 TheLoop->getHeader())
3870 << "User-specified vectorization factor "
3871 << ore::NV("UserVectorizationFactor", UserVF)
3872 << " is unsafe, clamping to maximum safe vectorization factor "
3873 << ore::NV("VectorizationFactor", MaxSafeFixedVF);
3874 });
3875 return MaxSafeFixedVF;
3876 }
3877
3879 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
3880 << " is ignored because scalable vectors are not "
3881 "available.\n");
3882 ORE->emit([&]() {
3883 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
3885 TheLoop->getHeader())
3886 << "User-specified vectorization factor "
3887 << ore::NV("UserVectorizationFactor", UserVF)
3888 << " is ignored because the target does not support scalable "
3889 "vectors. The compiler will pick a more suitable value.";
3890 });
3891 } else {
3892 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
3893 << " is unsafe. Ignoring scalable UserVF.\n");
3894 ORE->emit([&]() {
3895 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
3897 TheLoop->getHeader())
3898 << "User-specified vectorization factor "
3899 << ore::NV("UserVectorizationFactor", UserVF)
3900 << " is unsafe. Ignoring the hint to let the compiler pick a "
3901 "more suitable value.";
3902 });
3903 }
3904 }
3905
3906 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
3907 << " / " << WidestType << " bits.\n");
3908
3911 if (auto MaxVF =
3912 getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
3913 MaxSafeFixedVF, FoldTailByMasking))
3914 Result.FixedVF = MaxVF;
3915
3916 if (auto MaxVF =
3917 getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
3918 MaxSafeScalableVF, FoldTailByMasking))
3919 if (MaxVF.isScalable()) {
3920 Result.ScalableVF = MaxVF;
3921 LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
3922 << "\n");
3923 }
3924
3925 return Result;
3926}
3927
3931 // TODO: It may still be useful to do this, since the check is likely to be
3932 // dynamically uniform if the target can skip it.
3934 "Not inserting runtime ptr check for divergent target",
3935 "runtime pointer checks needed. Not enabled for divergent target",
3936 "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
3938 }
3939
3940 ScalarEvolution *SE = PSE.getSE();
3941 unsigned TC = SE->getSmallConstantTripCount(TheLoop);
3942 unsigned MaxTC = PSE.getSmallConstantMaxTripCount();
3943 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
3944 if (TC != MaxTC)
3945 LLVM_DEBUG(dbgs() << "LV: Found maximum trip count: " << MaxTC << '\n');
3946 if (TC == 1) {
3947 reportVectorizationFailure("Single iteration (non) loop",
3948 "loop trip count is one, irrelevant for vectorization",
3949 "SingleIterationLoop", ORE, TheLoop);
3951 }
3952
3953 // If BTC matches the widest induction type and is -1 then the trip count
3954 // computation will wrap to 0 and the vector trip count will be 0. Do not try
3955 // to vectorize.
3956 const SCEV *BTC = SE->getBackedgeTakenCount(TheLoop);
3957 if (!isa<SCEVCouldNotCompute>(BTC) &&
3958 BTC->getType()->getScalarSizeInBits() >=
3961 SE->getMinusOne(BTC->getType()))) {
3963 "Trip count computation wrapped",
3964 "backedge-taken count is -1, loop trip count wrapped to 0",
3965 "TripCountWrapped", ORE, TheLoop);
3967 }
3968
3969 switch (ScalarEpilogueStatus) {
3971 return computeFeasibleMaxVF(MaxTC, UserVF, false);
3973 [[fallthrough]];
3975 LLVM_DEBUG(
3976 dbgs() << "LV: vector predicate hint/switch found.\n"
3977 << "LV: Not allowing scalar epilogue, creating predicated "
3978 << "vector loop.\n");
3979 break;
3981 // fallthrough as a special case of OptForSize
3983 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
3984 LLVM_DEBUG(
3985 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
3986 else
3987 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
3988 << "count.\n");
3989
3990 // Bail if runtime checks are required, which are not good when optimising
3991 // for size.
3994
3995 break;
3996 }
3997
3998 // The only loops we can vectorize without a scalar epilogue are loops with
3999 // a bottom-test and a single exiting block. We'd have to handle the fact
4000 // that not every instruction executes on the last iteration. This will
4001 // require a lane mask which varies through the vector loop body. (TODO)
4003 // If there was a tail-folding hint/switch, but we can't fold the tail by
4004 // masking, fall back to a vectorization with a scalar epilogue.
4005 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
4006 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
4007 "scalar epilogue instead.\n");
4008 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
4009 return computeFeasibleMaxVF(MaxTC, UserVF, false);
4010 }
4012 }
4013
4014 // Now try the tail folding
4015
4016 // Invalidate interleave groups that require an epilogue if we can't mask
4017 // the interleave-group.
4019 assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
4020 "No decisions should have been taken at this point");
4021 // Note: There is no need to invalidate any cost modeling decisions here, as
4022 // none were taken so far.
4024 }
4025
4026 FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(MaxTC, UserVF, true);
4027
4028 // Avoid tail folding if the trip count is known to be a multiple of any VF
4029 // we choose.
4030 std::optional<unsigned> MaxPowerOf2RuntimeVF =
4031 MaxFactors.FixedVF.getFixedValue();
4032 if (MaxFactors.ScalableVF) {
4033 std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI);
4034 if (MaxVScale && TTI.isVScaleKnownToBeAPowerOfTwo()) {
4035 MaxPowerOf2RuntimeVF = std::max<unsigned>(
4036 *MaxPowerOf2RuntimeVF,
4037 *MaxVScale * MaxFactors.ScalableVF.getKnownMinValue());
4038 } else
4039 MaxPowerOf2RuntimeVF = std::nullopt; // Stick with tail-folding for now.
4040 }
4041
4042 if (MaxPowerOf2RuntimeVF && *MaxPowerOf2RuntimeVF > 0) {
4043 assert((UserVF.isNonZero() || isPowerOf2_32(*MaxPowerOf2RuntimeVF)) &&
4044 "MaxFixedVF must be a power of 2");
4045 unsigned MaxVFtimesIC =
4046 UserIC ? *MaxPowerOf2RuntimeVF * UserIC : *MaxPowerOf2RuntimeVF;
4047 ScalarEvolution *SE = PSE.getSE();
4048 // Currently only loops with countable exits are vectorized, but calling
4049 // getSymbolicMaxBackedgeTakenCount allows enablement work for loops with
4050 // uncountable exits whilst also ensuring the symbolic maximum and known
4051 // back-edge taken count remain identical for loops with countable exits.
4052 const SCEV *BackedgeTakenCount = PSE.getSymbolicMaxBackedgeTakenCount();
4053 assert(BackedgeTakenCount == PSE.getBackedgeTakenCount() &&
4054 "Invalid loop count");
4055 const SCEV *ExitCount = SE->getAddExpr(
4056 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
4057 const SCEV *Rem = SE->getURemExpr(
4058 SE->applyLoopGuards(ExitCount, TheLoop),
4059 SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
4060 if (Rem->isZero()) {
4061 // Accept MaxFixedVF if we do not have a tail.
4062 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
4063 return MaxFactors;
4064 }
4065 }
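  // For instance, a loop with a known trip count of 1024 and MaxVFtimesIC == 8
  // gives a remainder of 0, so no chosen VF leaves a scalar tail and tail
  // folding can be skipped entirely.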
4066
4067 // If we don't know the precise trip count, or if the trip count that we
4068 // found modulo the vectorization factor is not zero, try to fold the tail
4069 // by masking.
4070 // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
4071 setTailFoldingStyles(MaxFactors.ScalableVF.isScalable(), UserIC);
4072 if (foldTailByMasking()) {
4074 LLVM_DEBUG(
4075 dbgs()
4076 << "LV: tail is folded with EVL, forcing unroll factor to be 1. Will "
4077 "try to generate VP Intrinsics with scalable vector "
4078 "factors only.\n");
4079 // A tail-folded loop using VP intrinsics restricts the VF to be scalable
4080 // for now.
4081 // TODO: extend it for fixed vectors, if required.
4082 assert(MaxFactors.ScalableVF.isScalable() &&
4083 "Expected scalable vector factor.");
4084
4085 MaxFactors.FixedVF = ElementCount::getFixed(1);
4086 }
4087 return MaxFactors;
4088 }
4089
4090 // If there was a tail-folding hint/switch, but we can't fold the tail by
4091 // masking, fall back to a vectorization with a scalar epilogue.
4092 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
4093 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
4094 "scalar epilogue instead.\n");
4095 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
4096 return MaxFactors;
4097 }
4098
4099 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
4100 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
4102 }
4103
4104 if (TC == 0) {
4106 "unable to calculate the loop count due to complex control flow",
4107 "UnknownLoopCountComplexCFG", ORE, TheLoop);
4109 }
4110
4112 "Cannot optimize for size and vectorize at the same time.",
4113 "cannot optimize for size and vectorize at the same time. "
4114 "Enable vectorization of this loop with '#pragma clang loop "
4115 "vectorize(enable)' when compiling with -Os/-Oz",
4116 "NoTailLoopWithOptForSize", ORE, TheLoop);
4118}
4119
4120ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
4121 unsigned MaxTripCount, unsigned SmallestType, unsigned WidestType,
4122 ElementCount MaxSafeVF, bool FoldTailByMasking) {
4123 bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
4124 const TypeSize WidestRegister = TTI.getRegisterBitWidth(
4125 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
4127
4128 // Convenience function to return the minimum of two ElementCounts.
4129 auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
4130 assert((LHS.isScalable() == RHS.isScalable()) &&
4131 "Scalable flags must match");
4132 return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS;
4133 };
4134
4135 // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
4136 // Note that both WidestRegister and WidestType may not be powers of 2.
4137 auto MaxVectorElementCount = ElementCount::get(
4138 llvm::bit_floor(WidestRegister.getKnownMinValue() / WidestType),
4139 ComputeScalableMaxVF);
4140 MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
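  // For example, with 512-bit vector registers and a widest loop type of
  // 32 bits, this starts from bit_floor(512 / 32) = 16 lanes and is then
  // clamped to the dependence-safe maximum MaxSafeVF.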
4141 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
4142 << (MaxVectorElementCount * WidestType) << " bits.\n");
4143
4144 if (!MaxVectorElementCount) {
4145 LLVM_DEBUG(dbgs() << "LV: The target has no "
4146 << (ComputeScalableMaxVF ? "scalable" : "fixed")
4147 << " vector registers.\n");
4148 return ElementCount::getFixed(1);
4149 }
4150
4151 unsigned WidestRegisterMinEC = MaxVectorElementCount.getKnownMinValue();
4152 if (MaxVectorElementCount.isScalable() &&
4153 TheFunction->hasFnAttribute(Attribute::VScaleRange)) {
4154 auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange);
4155 auto Min = Attr.getVScaleRangeMin();
4156 WidestRegisterMinEC *= Min;
4157 }
4158
4159 // When a scalar epilogue is required, at least one iteration of the scalar
4160 // loop has to execute. Adjust MaxTripCount accordingly to avoid picking a
4161 // max VF that results in a dead vector loop.
4162 if (MaxTripCount > 0 && requiresScalarEpilogue(true))
4163 MaxTripCount -= 1;
4164
4165 if (MaxTripCount && MaxTripCount <= WidestRegisterMinEC &&
4166 (!FoldTailByMasking || isPowerOf2_32(MaxTripCount))) {
4167 // If the upper bound on the loop trip count (TC) is known at compile time,
4168 // there is no point in choosing a VF greater than TC (as done in the loop
4169 // below). Select the maximum power of two which doesn't exceed TC. If
4170 // MaxVectorElementCount is scalable, we only fall back on a fixed VF when
4171 // the TC is less than or equal to the known number of lanes.
4172 auto ClampedUpperTripCount = llvm::bit_floor(MaxTripCount);
4173 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
4174 "exceeding the constant trip count: "
4175 << ClampedUpperTripCount << "\n");
4176 return ElementCount::get(
4177 ClampedUpperTripCount,
4178 FoldTailByMasking ? MaxVectorElementCount.isScalable() : false);
4179 }
4180
4182 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
4184 ElementCount MaxVF = MaxVectorElementCount;
4185 if (MaximizeBandwidth ||
4189 auto MaxVectorElementCountMaxBW = ElementCount::get(
4190 llvm::bit_floor(WidestRegister.getKnownMinValue() / SmallestType),
4191 ComputeScalableMaxVF);
4192 MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);
4193
4194 // Collect all viable vectorization factors larger than the default MaxVF
4195 // (i.e. MaxVectorElementCount).
4197 for (ElementCount VS = MaxVectorElementCount * 2;
4198 ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2)
4199 VFs.push_back(VS);
4200
4201 // For each VF calculate its register usage.
4202 auto RUs = calculateRegisterUsage(VFs);
4203
4204 // Select the largest VF which doesn't require more registers than existing
4205 // ones.
4206 for (int I = RUs.size() - 1; I >= 0; --I) {
4207 const auto &MLU = RUs[I].MaxLocalUsers;
4208 if (all_of(MLU, [&](decltype(MLU.front()) &LU) {
4209 return LU.second <= TTI.getNumberOfRegisters(LU.first);
4210 })) {
4211 MaxVF = VFs[I];
4212 break;
4213 }
4214 }
4215 if (ElementCount MinVF =
4216 TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) {
4217 if (ElementCount::isKnownLT(MaxVF, MinVF)) {
4218 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
4219 << ") with target's minimum: " << MinVF << '\n');
4220 MaxVF = MinVF;
4221 }
4222 }
4223
4224 // Invalidate any widening decisions we might have made, in case the loop
4225 // requires predication (decided later), but we have already made some
4226 // load/store widening decisions.
4228 }
4229 return MaxVF;
4230}
4231
4232/// Convenience function that returns the value of vscale_range iff
4233/// vscale_range.min == vscale_range.max or otherwise returns the value
4234/// returned by the corresponding TTI method.
4235static std::optional<unsigned>
4237 const Function *Fn = L->getHeader()->getParent();
4238 if (Fn->hasFnAttribute(Attribute::VScaleRange)) {
4239 auto Attr = Fn->getFnAttribute(Attribute::VScaleRange);
4240 auto Min = Attr.getVScaleRangeMin();
4241 auto Max = Attr.getVScaleRangeMax();
4242 if (Max && Min == Max)
4243 return Max;
4244 }
4245
4246 return TTI.getVScaleForTuning();
4247}
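// For example, a function carrying the IR attribute vscale_range(2,2) yields 2
// here, whereas vscale_range(1,16) (min != max) defers to
// TTI.getVScaleForTuning().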
4248
4249/// This function attempts to return a value that represents the vectorization
4250/// factor at runtime. For fixed-width VFs we know this precisely at compile
4251/// time, but for scalable VFs we calculate it based on an estimate of the
4252/// vscale value.
4253static unsigned getEstimatedRuntimeVF(const Loop *L,
4254 const TargetTransformInfo &TTI,
4255 ElementCount VF) {
4256 unsigned EstimatedVF = VF.getKnownMinValue();
4257 if (VF.isScalable())
4258 if (std::optional<unsigned> VScale = getVScaleForTuning(L, TTI))
4259 EstimatedVF *= *VScale;
4260 assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1");
4261 return EstimatedVF;
4262}
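// E.g. for VF = vscale x 4 on a target tuned for vscale == 2 the estimate is
// 4 * 2 = 8, while for a fixed VF the known minimum value is already exact.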
4263
4264bool LoopVectorizationPlanner::isMoreProfitable(
4266 const unsigned MaxTripCount) const {
4267 InstructionCost CostA = A.Cost;
4268 InstructionCost CostB = B.Cost;
4269
4270 // Improve estimate for the vector width if it is scalable.
4271 unsigned EstimatedWidthA = A.Width.getKnownMinValue();
4272 unsigned EstimatedWidthB = B.Width.getKnownMinValue();
4273 if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI)) {
4274 if (A.Width.isScalable())
4275 EstimatedWidthA *= *VScale;
4276 if (B.Width.isScalable())
4277 EstimatedWidthB *= *VScale;
4278 }
4279
4280 // Assume vscale may be larger than 1 (or the value being tuned for),
4281 // so that scalable vectorization is slightly favorable over fixed-width
4282 // vectorization.
4283 bool PreferScalable = !TTI.preferFixedOverScalableIfEqualCost() &&
4284 A.Width.isScalable() && !B.Width.isScalable();
4285
4286 auto CmpFn = [PreferScalable](const InstructionCost &LHS,
4287 const InstructionCost &RHS) {
4288 return PreferScalable ? LHS <= RHS : LHS < RHS;
4289 };
4290
4291 // To avoid the need for FP division:
4292 // (CostA / EstimatedWidthA) < (CostB / EstimatedWidthB)
4293 // <=> (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA)
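  // For example, CostA = 10 at estimated width 4 versus CostB = 6 at width 2
  // compares 10 * 2 = 20 against 6 * 4 = 24, correctly preferring A's cost of
  // 2.5 per lane over B's 3.0.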
4294 if (!MaxTripCount)
4295 return CmpFn(CostA * EstimatedWidthB, CostB * EstimatedWidthA);
4296
4297 auto GetCostForTC = [MaxTripCount, this](unsigned VF,
4298 InstructionCost VectorCost,
4299 InstructionCost ScalarCost) {
4300 // If the trip count is a known (possibly small) constant, the trip count
4301 // will be rounded up to an integer number of iterations under
4302 // FoldTailByMasking. The total cost in that case will be
4303 // VecCost*ceil(TripCount/VF). When not folding the tail, the total
4304 // cost will be VecCost*floor(TC/VF) + ScalarCost*(TC%VF). There will be
4305 // some extra overheads, but for the purpose of comparing the costs of
4306 // different VFs we can use this to compare the total loop-body cost
4307 // expected after vectorization.
4308 if (CM.foldTailByMasking())
4309 return VectorCost * divideCeil(MaxTripCount, VF);
4310 return VectorCost * (MaxTripCount / VF) + ScalarCost * (MaxTripCount % VF);
4311 };
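  // E.g. with MaxTripCount = 10 and VF = 4: folding the tail costs
  // 3 * VectorCost (ceil(10/4) masked vector iterations), whereas a scalar
  // epilogue costs 2 * VectorCost + 2 * ScalarCost, since 10 = 2 * 4 + 2.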
4312
4313 auto RTCostA = GetCostForTC(EstimatedWidthA, CostA, A.ScalarCost);
4314 auto RTCostB = GetCostForTC(EstimatedWidthB, CostB, B.ScalarCost);
4315 return CmpFn(RTCostA, RTCostB);
4316}
4317
4318bool LoopVectorizationPlanner::isMoreProfitable(
4319 const VectorizationFactor &A, const VectorizationFactor &B) const {
4320 const unsigned MaxTripCount = PSE.getSmallConstantMaxTripCount();
4321 return LoopVectorizationPlanner::isMoreProfitable(A, B, MaxTripCount);
4322}
4323
4326 using RecipeVFPair = std::pair<VPRecipeBase *, ElementCount>;
4327 SmallVector<RecipeVFPair> InvalidCosts;
4328 for (const auto &Plan : VPlans) {
4329 for (ElementCount VF : Plan->vectorFactors()) {
4330 VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(),
4331 CM, CM.CostKind);
4332 precomputeCosts(*Plan, VF, CostCtx);
4333 auto Iter = vp_depth_first_deep(Plan->getVectorLoopRegion()->getEntry());
4334 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
4335 for (auto &R : *VPBB) {
4336 if (!R.cost(VF, CostCtx).isValid())
4337 InvalidCosts.emplace_back(&R, VF);
4338 }
4339 }
4340 }
4341 }
4342 if (InvalidCosts.empty())
4343 return;
4344
4345 // Emit a report of VFs with invalid costs in the loop.
4346
4347 // Group the remarks per recipe, keeping the recipe order from InvalidCosts.
4349 unsigned I = 0;
4350 for (auto &Pair : InvalidCosts)
4351 if (!Numbering.count(Pair.first))
4352 Numbering[Pair.first] = I++;
4353
4354 // Sort the list, first on recipe(number) then on VF.
4355 sort(InvalidCosts, [&Numbering](RecipeVFPair &A, RecipeVFPair &B) {
4356 if (Numbering[A.first] != Numbering[B.first])
4357 return Numbering[A.first] < Numbering[B.first];
4358 const auto &LHS = A.second;
4359 const auto &RHS = B.second;
4360 return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) <
4361 std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue());
4362 });
4363
4364 // For a list of ordered recipe-VF pairs:
4365 // [(load, VF1), (load, VF2), (store, VF1)]
4366 // group the recipes together to emit separate remarks for:
4367 // load (VF1, VF2)
4368 // store (VF1)
4369 auto Tail = ArrayRef<RecipeVFPair>(InvalidCosts);
4370 auto Subset = ArrayRef<RecipeVFPair>();
4371 do {
4372 if (Subset.empty())
4373 Subset = Tail.take_front(1);
4374
4375 VPRecipeBase *R = Subset.front().first;
4376
4377 unsigned Opcode =
4380 [](const auto *R) { return Instruction::PHI; })
4381 .Case<VPWidenSelectRecipe>(
4382 [](const auto *R) { return Instruction::Select; })
4383 .Case<VPWidenStoreRecipe>(
4384 [](const auto *R) { return Instruction::Store; })
4385 .Case<VPWidenLoadRecipe>(
4386 [](const auto *R) { return Instruction::Load; })
4387 .Case<VPWidenCallRecipe, VPWidenIntrinsicRecipe>(
4388 [](const auto *R) { return Instruction::Call; })
4391 [](const auto *R) { return R->getOpcode(); })
4392 .Case<VPInterleaveRecipe>([](const VPInterleaveRecipe *R) {
4393 return R->getStoredValues().empty() ? Instruction::Load
4394 : Instruction::Store;
4395 });
4396
4397 // If the next recipe is different, or if there are no other pairs,
4398 // emit a remark for the collated subset. e.g.
4399 // [(load, VF1), (load, VF2))]
4400 // to emit:
4401 // remark: invalid costs for 'load' at VF=(VF1, VF2)
4402 if (Subset == Tail || Tail[Subset.size()].first != R) {
4403 std::string OutString;
4404 raw_string_ostream OS(OutString);
4405 assert(!Subset.empty() && "Unexpected empty range");
4406 OS << "Recipe with invalid costs prevented vectorization at VF=(";
4407 for (const auto &Pair : Subset)
4408 OS << (Pair.second == Subset.front().second ? "" : ", ") << Pair.second;
4409 OS << "):";
4410 if (Opcode == Instruction::Call) {
4411 StringRef Name = "";
4412 if (auto *Int = dyn_cast<VPWidenIntrinsicRecipe>(R)) {
4413 Name = Int->getIntrinsicName();
4414 } else {
4415 auto *WidenCall = dyn_cast<VPWidenCallRecipe>(R);
4416 Function *CalledFn =
4417 WidenCall ? WidenCall->getCalledScalarFunction()
4418 : cast<Function>(R->getOperand(R->getNumOperands() - 1)
4419 ->getLiveInIRValue());
4420 Name = CalledFn->getName();
4421 }
4422 OS << " call to " << Name;
4423 } else
4424 OS << " " << Instruction::getOpcodeName(Opcode);
4425 reportVectorizationInfo(OutString, "InvalidCost", ORE, OrigLoop, nullptr,
4426 R->getDebugLoc());
4427 Tail = Tail.drop_front(Subset.size());
4428 Subset = {};
4429 } else
4430 // Grow the subset by one element
4431 Subset = Tail.take_front(Subset.size() + 1);
4432 } while (!Tail.empty());
4433}
4434
4435/// Check if any recipe of \p Plan will generate a vector value, which will be
4436/// assigned a vector register.
4438 const TargetTransformInfo &TTI) {
4439 assert(VF.isVector() && "Checking a scalar VF?");
4440 VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType());
4441 DenseSet<VPRecipeBase *> EphemeralRecipes;
4442 collectEphemeralRecipesForVPlan(Plan, EphemeralRecipes);
4443 // Set of already visited types.
4444 DenseSet<Type *> Visited;
4445 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
4447 for (VPRecipeBase &R : *VPBB) {
4448 if (EphemeralRecipes.contains(&R))
4449 continue;
4450 // Continue early if the recipe is considered to not produce a vector
4451 // result. Note that this includes VPInstruction where some opcodes may
4452 // produce a vector, to preserve existing behavior as VPInstructions model
4453 // aspects not directly mapped to existing IR instructions.
4454 switch (R.getVPDefID()) {
4455 case VPDef::VPDerivedIVSC:
4456 case VPDef::VPScalarIVStepsSC:
4457 case VPDef::VPScalarCastSC:
4458 case VPDef::VPReplicateSC:
4459 case VPDef::VPInstructionSC:
4460 case VPDef::VPCanonicalIVPHISC:
4461 case VPDef::VPVectorPointerSC:
4462 case VPDef::VPReverseVectorPointerSC:
4463 case VPDef::VPExpandSCEVSC:
4464 case VPDef::VPEVLBasedIVPHISC:
4465 case VPDef::VPPredInstPHISC:
4466 case VPDef::VPBranchOnMaskSC:
4467 continue;
4468 case VPDef::VPReductionSC:
4469 case VPDef::VPActiveLaneMaskPHISC:
4470 case VPDef::VPWidenCallSC:
4471 case VPDef::VPWidenCanonicalIVSC:
4472 case VPDef::VPWidenCastSC:
4473 case VPDef::VPWidenGEPSC:
4474 case VPDef::VPWidenIntrinsicSC:
4475 case VPDef::VPWidenSC:
4476 case VPDef::VPWidenSelectSC:
4477 case VPDef::VPBlendSC:
4478 case VPDef::VPFirstOrderRecurrencePHISC:
4479 case VPDef::VPWidenPHISC:
4480 case VPDef::VPWidenIntOrFpInductionSC:
4481 case VPDef::VPWidenPointerInductionSC:
4482 case VPDef::VPReductionPHISC:
4483 case VPDef::VPInterleaveSC:
4484 case VPDef::VPWidenLoadEVLSC:
4485 case VPDef::VPWidenLoadSC:
4486 case VPDef::VPWidenStoreEVLSC:
4487 case VPDef::VPWidenStoreSC:
4488 break;
4489 default:
4490 llvm_unreachable("unhandled recipe");
4491 }
4492
4493 auto WillWiden = [&TTI, VF](Type *ScalarTy) {
4494 Type *VectorTy = toVectorTy(ScalarTy, VF);
4495 unsigned NumLegalParts = TTI.getNumberOfParts(VectorTy);
4496 if (!NumLegalParts)
4497 return false;
4498 if (VF.isScalable()) {
4499 // <vscale x 1 x iN> is assumed to be profitable over iN because
4500 // scalable registers are a distinct register class from scalar
4501 // ones. If we ever find a target which wants to lower scalable
4502 // vectors back to scalars, we'll need to update this code to
4503 // explicitly ask TTI about the register class uses for each part.
4504 return NumLegalParts <= VF.getKnownMinValue();
4505 }
4507 // Two or more parts that share a register count as vectorized.
4507 return NumLegalParts < VF.getKnownMinValue();
4508 };
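      // E.g. on a hypothetical target with 256-bit vectors, <8 x i32>
      // legalizes to a single part (1 < 8), so it counts as widened; a type
      // the target scalarizes yields VF parts and does not.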
4509
4510 // If the recipe has no defs and is not a store (e.g., a branch), there is no value to check; continue.
4511 if (R.getNumDefinedValues() == 0 &&
4512 !isa<VPWidenStoreRecipe, VPWidenStoreEVLRecipe, VPInterleaveRecipe>(
4513 &R))
4514 continue;
4515 // For multi-def recipes (currently only interleaved loads), it suffices
4516 // to check the first def only.
4517 // For stores, check their stored value; for interleaved stores, it
4518 // suffices to check the first stored value only. In all cases this is
4519 // the second operand.
4520 VPValue *ToCheck =
4521 R.getNumDefinedValues() >= 1 ? R.getVPValue(0) : R.getOperand(1);
4522 Type *ScalarTy = TypeInfo.inferScalarType(ToCheck);
4523 if (!Visited.insert({ScalarTy}).second)
4524 continue;
4525 if (WillWiden(ScalarTy))
4526 return true;
4527 }
4528 }
4529
4530 return false;
4531}
4532
4533#ifndef NDEBUG
4534VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
4536 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
4537 assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
4538 assert(any_of(VPlans,
4539 [](std::unique_ptr<VPlan> &P) {
4540 return P->hasVF(ElementCount::getFixed(1));
4541 }) &&
4542 "Expected Scalar VF to be a candidate");
4543
4544 const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost,
4545 ExpectedCost);
4546 VectorizationFactor ChosenFactor = ScalarCost;
4547
4548 bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
4549 if (ForceVectorization &&
4550 (VPlans.size() > 1 || !VPlans[0]->hasScalarVFOnly())) {
4551 // Ignore scalar width, because the user explicitly wants vectorization.
4552 // Initialize cost to max so that VF = 2 is, at least, chosen during cost
4553 // evaluation.
4554 ChosenFactor.Cost = InstructionCost::getMax();
4555 }
4556
4557 for (auto &P : VPlans) {
4558 for (ElementCount VF : P->vectorFactors()) {
4559 // The cost for scalar VF=1 is already calculated, so ignore it.
4560 if (VF.isScalar())
4561 continue;
4562
4564 VectorizationFactor Candidate(VF, C, ScalarCost.ScalarCost);
4565
4566 unsigned Width = getEstimatedRuntimeVF(OrigLoop, TTI, Candidate.Width);
4567 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << VF
4568 << " costs: " << (Candidate.Cost / Width));
4569 if (VF.isScalable())
4570 LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
4571 << getVScaleForTuning(OrigLoop, TTI).value_or(1)
4572 << ")");
4573 LLVM_DEBUG(dbgs() << ".\n");
4574
4575 if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) {
4576 LLVM_DEBUG(
4577 dbgs()
4578 << "LV: Not considering vector loop of width " << VF
4579 << " because it will not generate any vector instructions.\n");
4580 continue;
4581 }
4582
4583 if (isMoreProfitable(Candidate, ChosenFactor))
4584 ChosenFactor = Candidate;
4585 }
4586 }
4587
4590 "There are conditional stores.",
4591 "store that is conditionally executed prevents vectorization",
4592 "ConditionalStore", ORE, OrigLoop);
4593 ChosenFactor = ScalarCost;
4594 }
4595
4596 LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
4597 !isMoreProfitable(ChosenFactor, ScalarCost)) dbgs()
4598 << "LV: Vectorization seems to be not beneficial, "
4599 << "but was forced by a user.\n");
4600 return ChosenFactor;
4601}
4602#endif
4603
4604bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
4605 ElementCount VF) const {
4606 // Cross iteration phis such as reductions need special handling and are
4607 // currently unsupported.
4608 if (any_of(OrigLoop->getHeader()->phis(),
4609 [&](PHINode &Phi) { return Legal->isFixedOrderRecurrence(&Phi); }))
4610 return false;
4611
4612 // Phis with uses outside of the loop require special handling and are
4613 // currently unsupported.
4614 for (const auto &Entry : Legal->getInductionVars()) {
4615 // Look for uses of the value of the induction at the last iteration.
4616 Value *PostInc =
4617 Entry.first->getIncomingValueForBlock(OrigLoop->getLoopLatch());
4618 for (User *U : PostInc->users())
4619 if (!OrigLoop->contains(cast<Instruction>(U)))
4620 return false;
4621 // Look for uses of penultimate value of the induction.
4622 for (User *U : Entry.first->users())
4623 if (!OrigLoop->contains(cast<Instruction>(U)))
4624 return false;
4625 }
4626
4627  // Epilogue vectorization code has not been audited to ensure it handles
4628  // non-latch exits properly. It may be fine, but it needs to be audited and
4629  // tested.
4630 // TODO: Add support for loops with an early exit.
4631 if (OrigLoop->getExitingBlock() != OrigLoop->getLoopLatch())
4632 return false;
4633
4634 return true;
4635}
4636
4638 const ElementCount VF, const unsigned IC) const {
4639 // FIXME: We need a much better cost-model to take different parameters such
4640 // as register pressure, code size increase and cost of extra branches into
4641 // account. For now we apply a very crude heuristic and only consider loops
4642 // with vectorization factors larger than a certain value.
4643
4644 // Allow the target to opt out entirely.
4646 return false;
4647
4648 // We also consider epilogue vectorization unprofitable for targets that don't
4649  // consider interleaving beneficial (e.g., MVE).
4650 if (TTI.getMaxInterleaveFactor(VF) <= 1)
4651 return false;
4652
4653 // TODO: PR #108190 introduced a discrepancy between fixed-width and scalable
4654 // VFs when deciding profitability.
4655 // See related "TODO: extend to support scalable VFs." in
4656 // selectEpilogueVectorizationFactor.
4657 unsigned Multiplier = VF.isFixed() ? IC : 1;
4658 unsigned MinVFThreshold = EpilogueVectorizationMinVF.getNumOccurrences() > 0
4661 return getEstimatedRuntimeVF(TheLoop, TTI, VF * Multiplier) >= MinVFThreshold;
4662}
4663
4665 const ElementCount MainLoopVF, unsigned IC) {
4668 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n");
4669 return Result;
4670 }
4671
4672 if (!CM.isScalarEpilogueAllowed()) {
4673 LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because no "
4674 "epilogue is allowed.\n");
4675 return Result;
4676 }
4677
4678 // Not really a cost consideration, but check for unsupported cases here to
4679 // simplify the logic.
4680 if (!isCandidateForEpilogueVectorization(MainLoopVF)) {
4681 LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because the loop "
4682 "is not a supported candidate.\n");
4683 return Result;
4684 }
4685
4687 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n");
4689 if (hasPlanWithVF(ForcedEC))
4690 return {ForcedEC, 0, 0};
4691
4692 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization forced factor is not "
4693 "viable.\n");
4694 return Result;
4695 }
4696
4697 if (OrigLoop->getHeader()->getParent()->hasOptSize() ||
4698 OrigLoop->getHeader()->getParent()->hasMinSize()) {
4699 LLVM_DEBUG(
4700 dbgs() << "LEV: Epilogue vectorization skipped due to opt for size.\n");
4701 return Result;
4702 }
4703
4704 if (!CM.isEpilogueVectorizationProfitable(MainLoopVF, IC)) {
4705 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "
4706 "this loop\n");
4707 return Result;
4708 }
4709
4710 // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know
4711 // the main loop handles 8 lanes per iteration. We could still benefit from
4712 // vectorizing the epilogue loop with VF=4.
4713 ElementCount EstimatedRuntimeVF =
4714 ElementCount::getFixed(getEstimatedRuntimeVF(OrigLoop, TTI, MainLoopVF));
4715
4716 ScalarEvolution &SE = *PSE.getSE();
4717 Type *TCType = Legal->getWidestInductionType();
4718 const SCEV *RemainingIterations = nullptr;
4719 unsigned MaxTripCount = 0;
4720 for (auto &NextVF : ProfitableVFs) {
4721 // Skip candidate VFs without a corresponding VPlan.
4722 if (!hasPlanWithVF(NextVF.Width))
4723 continue;
4724
4725 // Skip candidate VFs with widths >= the (estimated) runtime VF (scalable
4726 // vectors) or > the VF of the main loop (fixed vectors).
4727 if ((!NextVF.Width.isScalable() && MainLoopVF.isScalable() &&
4728 ElementCount::isKnownGE(NextVF.Width, EstimatedRuntimeVF)) ||
4729 (NextVF.Width.isScalable() &&
4730 ElementCount::isKnownGE(NextVF.Width, MainLoopVF)) ||
4731 (!NextVF.Width.isScalable() && !MainLoopVF.isScalable() &&
4732 ElementCount::isKnownGT(NextVF.Width, MainLoopVF)))
4733 continue;
4734
4735 // If NextVF is greater than the number of remaining iterations, the
4736 // epilogue loop would be dead. Skip such factors.
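    // For illustration (hypothetical numbers): with a scalar trip count of 37,
    // a fixed main-loop VF of 8 and IC of 2, the main vector loop consumes
    // iterations in chunks of 16, leaving 37 % 16 = 5 remaining iterations.
    // Candidate epilogue VFs of 8 or more would therefore never execute and
    // are skipped, while VF = 4 or VF = 2 remain viable candidates.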
4737 if (!MainLoopVF.isScalable() && !NextVF.Width.isScalable()) {
4738 // TODO: extend to support scalable VFs.
4739 if (!RemainingIterations) {
4741 getPlanFor(NextVF.Width).getTripCount(), SE);
4742 assert(!isa<SCEVCouldNotCompute>(TC) &&
4743 "Trip count SCEV must be computable");
4744 RemainingIterations = SE.getURemExpr(
4745 TC, SE.getConstant(TCType, MainLoopVF.getKnownMinValue() * IC));
4746 MaxTripCount = MainLoopVF.getKnownMinValue() * IC - 1;
4747 if (SE.isKnownPredicate(CmpInst::ICMP_ULT, RemainingIterations,
4748 SE.getConstant(TCType, MaxTripCount))) {
4749 MaxTripCount =
4750 SE.getUnsignedRangeMax(RemainingIterations).getZExtValue();
4751 }
4752 LLVM_DEBUG(dbgs() << "LEV: Maximum Trip Count for Epilogue: "
4753 << MaxTripCount << "\n");
4754 }
4755 if (SE.isKnownPredicate(
4757 SE.getConstant(TCType, NextVF.Width.getKnownMinValue()),
4758 RemainingIterations))
4759 continue;
4760 }
4761
4762 if (Result.Width.isScalar() ||
4763 isMoreProfitable(NextVF, Result, MaxTripCount))
4764 Result = NextVF;
4765 }
4766
4767 if (Result != VectorizationFactor::Disabled())
4768 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
4769 << Result.Width << "\n");
4770 return Result;
4771}
4772
4773std::pair<unsigned, unsigned>
4775 unsigned MinWidth = -1U;
4776 unsigned MaxWidth = 8;
4778 // For in-loop reductions, no element types are added to ElementTypesInLoop
4779 // if there are no loads/stores in the loop. In this case, check through the
4780 // reduction variables to determine the maximum width.
4781 if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) {
4782 // Reset MaxWidth so that we can find the smallest type used by recurrences
4783 // in the loop.
4784 MaxWidth = -1U;
4785 for (const auto &PhiDescriptorPair : Legal->getReductionVars()) {
4786 const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second;
4787 // When finding the min width used by the recurrence we need to account
4788 // for casts on the input operands of the recurrence.
4789 MaxWidth = std::min<unsigned>(
4790 MaxWidth, std::min<unsigned>(
4793 }
4794 } else {
4795 for (Type *T : ElementTypesInLoop) {
4796 MinWidth = std::min<unsigned>(
4797 MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue());
4798 MaxWidth = std::max<unsigned>(
4799 MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue());
4800 }
4801 }
4802 return {MinWidth, MaxWidth};
4803}
4804
4806 ElementTypesInLoop.clear();
4807 // For each block.
4808 for (BasicBlock *BB : TheLoop->blocks()) {
4809 // For each instruction in the loop.
4810 for (Instruction &I : BB->instructionsWithoutDebug()) {
4811 Type *T = I.getType();
4812
4813 // Skip ignored values.
4814 if (ValuesToIgnore.count(&I))
4815 continue;
4816
4817 // Only examine Loads, Stores and PHINodes.
4818 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
4819 continue;
4820
4821 // Examine PHI nodes that are reduction variables. Update the type to
4822 // account for the recurrence type.
4823 if (auto *PN = dyn_cast<PHINode>(&I)) {
4824 if (!Legal->isReductionVariable(PN))
4825 continue;
4826 const RecurrenceDescriptor &RdxDesc =
4827 Legal->getReductionVars().find(PN)->second;
4830 RdxDesc.getRecurrenceType(),
4832 continue;
4833 T = RdxDesc.getRecurrenceType();
4834 }
4835
4836 // Examine the stored values.
4837 if (auto *ST = dyn_cast<StoreInst>(&I))
4838 T = ST->getValueOperand()->getType();
4839
4840 assert(T->isSized() &&
4841 "Expected the load/store/recurrence type to be sized");
4842
4843 ElementTypesInLoop.insert(T);
4844 }
4845 }
4846}
4847
4848unsigned
4850 InstructionCost LoopCost) {
4851 // -- The interleave heuristics --
4852 // We interleave the loop in order to expose ILP and reduce the loop overhead.
4853 // There are many micro-architectural considerations that we can't predict
4854 // at this level. For example, frontend pressure (on decode or fetch) due to
4855 // code size, or the number and capabilities of the execution ports.
4856 //
4857 // We use the following heuristics to select the interleave count:
4858 // 1. If the code has reductions, then we interleave to break the cross
4859 // iteration dependency.
4860 // 2. If the loop is really small, then we interleave to reduce the loop
4861 // overhead.
4862 // 3. We don't interleave if we think that we will spill registers to memory
4863 // due to the increased register pressure.
4864
4866 return 1;
4867
4868 // Do not interleave if EVL is preferred and no User IC is specified.
4869 if (foldTailWithEVL()) {
4870 LLVM_DEBUG(dbgs() << "LV: Preference for VP intrinsics indicated. "
4871 "Unroll factor forced to be 1.\n");
4872 return 1;
4873 }
4874
4875 // We used the distance for the interleave count.
4877 return 1;
4878
4879 // We don't attempt to perform interleaving for loops with uncountable early
4880 // exits because the VPInstruction::AnyOf code cannot currently handle
4881 // multiple parts.
4883 return 1;
4884
4885 auto BestKnownTC = getSmallBestKnownTC(PSE, TheLoop);
4886 const bool HasReductions = !Legal->getReductionVars().empty();
4887
4888 // If we did not calculate the cost for VF (because the user selected the VF)
4889 // then we calculate the cost of VF here.
4890 if (LoopCost == 0) {
4891 LoopCost = expectedCost(VF);
4892 assert(LoopCost.isValid() && "Expected to have chosen a VF with valid cost");
4893
4894 // Loop body is free and there is no need for interleaving.
4895 if (LoopCost == 0)
4896 return 1;
4897 }
4898
4900 // We divide by these constants so assume that we have at least one
4901 // instruction that uses at least one register.
4902 for (auto &Pair : R.MaxLocalUsers) {
4903 Pair.second = std::max(Pair.second, 1U);
4904 }
4905
4906 // We calculate the interleave count using the following formula.
4907 // Subtract the number of loop invariants from the number of available
4908 // registers. These registers are used by all of the interleaved instances.
4909 // Next, divide the remaining registers by the number of registers that is
4910 // required by the loop, in order to estimate how many parallel instances
4911 // fit without causing spills. All of this is rounded down if necessary to be
4912  // a power of two. We want a power-of-two interleave count to simplify any
4913 // addressing operations or alignment considerations.
4914 // We also want power of two interleave counts to ensure that the induction
4915 // variable of the vector loop wraps to zero, when tail is folded by masking;
4916 // this currently happens when OptForSize, in which case IC is set to 1 above.
4917 unsigned IC = UINT_MAX;
4918
4919 for (const auto &Pair : R.MaxLocalUsers) {
4920 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(Pair.first);
4921 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
4922 << " registers of "
4923 << TTI.getRegisterClassName(Pair.first)
4924 << " register class\n");
4925 if (VF.isScalar()) {
4926 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
4927 TargetNumRegisters = ForceTargetNumScalarRegs;
4928 } else {
4929 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
4930 TargetNumRegisters = ForceTargetNumVectorRegs;
4931 }
4932 unsigned MaxLocalUsers = Pair.second;
4933 unsigned LoopInvariantRegs = 0;
4934 if (R.LoopInvariantRegs.find(Pair.first) != R.LoopInvariantRegs.end())
4935 LoopInvariantRegs = R.LoopInvariantRegs[Pair.first];
4936
4937 unsigned TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs) /
4938 MaxLocalUsers);
4939 // Don't count the induction variable as interleaved.
4941 TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs - 1) /
4942 std::max(1U, (MaxLocalUsers - 1)));
4943 }
4944
4945 IC = std::min(IC, TmpIC);
4946 }
4947
4948 // Clamp the interleave ranges to reasonable counts.
4949 unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
4950
4951 // Check if the user has overridden the max.
4952 if (VF.isScalar()) {
4953 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
4954 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
4955 } else {
4956 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
4957 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
4958 }
4959
4960 unsigned EstimatedVF = getEstimatedRuntimeVF(TheLoop, TTI, VF);
4961 unsigned KnownTC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
4962 if (KnownTC > 0) {
4963 // At least one iteration must be scalar when this constraint holds. So the
4964 // maximum available iterations for interleaving is one less.
4965 unsigned AvailableTC =
4966 requiresScalarEpilogue(VF.isVector()) ? KnownTC - 1 : KnownTC;
4967
4968 // If trip count is known we select between two prospective ICs, where
4969 // 1) the aggressive IC is capped by the trip count divided by VF
4970 // 2) the conservative IC is capped by the trip count divided by (VF * 2)
4971 // The final IC is selected in a way that the epilogue loop trip count is
4972 // minimized while maximizing the IC itself, so that we either run the
4973 // vector loop at least once if it generates a small epilogue loop, or else
4974 // we run the vector loop at least twice.
4975
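    // For illustration (hypothetical numbers): with AvailableTC = 32,
    // EstimatedVF = 4 and a target maximum of 8, the bounds below are
    // InterleaveCountUB = bit_floor(min(32/4, 8)) = 8 and
    // InterleaveCountLB = bit_floor(min(32/8, 8)) = 4; both leave a scalar
    // tail of 0 (32 % 32 == 32 % 16 == 0), so the larger bound 8 is kept.
    // With AvailableTC = 48 instead, the tails differ (16 vs. 0) and the
    // conservative bound 4 is used.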
4976 unsigned InterleaveCountUB = bit_floor(
4977 std::max(1u, std::min(AvailableTC / EstimatedVF, MaxInterleaveCount)));
4978 unsigned InterleaveCountLB = bit_floor(std::max(
4979 1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));
4980 MaxInterleaveCount = InterleaveCountLB;
4981
4982 if (InterleaveCountUB != InterleaveCountLB) {
4983 unsigned TailTripCountUB =
4984 (AvailableTC % (EstimatedVF * InterleaveCountUB));
4985 unsigned TailTripCountLB =
4986 (AvailableTC % (EstimatedVF * InterleaveCountLB));
4987      // If both produce the same scalar tail, maximize the IC to do the same
4988      // work in fewer vector loop iterations.
4989 if (TailTripCountUB == TailTripCountLB)
4990 MaxInterleaveCount = InterleaveCountUB;
4991 }
4992 } else if (BestKnownTC && *BestKnownTC > 0) {
4993 // At least one iteration must be scalar when this constraint holds. So the
4994 // maximum available iterations for interleaving is one less.
4995 unsigned AvailableTC = requiresScalarEpilogue(VF.isVector())
4996 ? (*BestKnownTC) - 1
4997 : *BestKnownTC;
4998
4999 // If trip count is an estimated compile time constant, limit the
5000 // IC to be capped by the trip count divided by VF * 2, such that the vector
5001 // loop runs at least twice to make interleaving seem profitable when there
5002    // is an epilogue loop present. Since the exact trip count is not known, we
5003    // choose to be conservative in our IC estimate.
5004 MaxInterleaveCount = bit_floor(std::max(
5005 1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));
5006 }
5007
5008 assert(MaxInterleaveCount > 0 &&
5009 "Maximum interleave count must be greater than 0");
5010
5011 // Clamp the calculated IC to be between the 1 and the max interleave count
5012 // that the target and trip count allows.
5013 if (IC > MaxInterleaveCount)
5014 IC = MaxInterleaveCount;
5015 else
5016 // Make sure IC is greater than 0.
5017 IC = std::max(1u, IC);
5018
5019 assert(IC > 0 && "Interleave count must be greater than 0.");
5020
5021 // Interleave if we vectorized this loop and there is a reduction that could
5022 // benefit from interleaving.
5023 if (VF.isVector() && HasReductions) {
5024 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
5025 return IC;
5026 }
5027
5028 // For any scalar loop that either requires runtime checks or predication we
5029 // are better off leaving this to the unroller. Note that if we've already
5030 // vectorized the loop we will have done the runtime check and so interleaving
5031 // won't require further checks.
5032 bool ScalarInterleavingRequiresPredication =
5033 (VF.isScalar() && any_of(TheLoop->blocks(), [this](BasicBlock *BB) {
5034 return Legal->blockNeedsPredication(BB);
5035 }));
5036 bool ScalarInterleavingRequiresRuntimePointerCheck =
5038
5039 // We want to interleave small loops in order to reduce the loop overhead and
5040 // potentially expose ILP opportunities.
5041 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
5042 << "LV: IC is " << IC << '\n'
5043 << "LV: VF is " << VF << '\n');
5044 const bool AggressivelyInterleaveReductions =
5045 TTI.enableAggressiveInterleaving(HasReductions);
5046 if (!ScalarInterleavingRequiresRuntimePointerCheck &&
5047 !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) {
5048 // We assume that the cost overhead is 1 and we use the cost model
5049 // to estimate the cost of the loop and interleave until the cost of the
5050 // loop overhead is about 5% of the cost of the loop.
5051 unsigned SmallIC = std::min(IC, (unsigned)llvm::bit_floor<uint64_t>(
5052 SmallLoopCost / *LoopCost.getValue()));
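    // For illustration (hypothetical numbers): with a small-loop cost
    // threshold of, say, 20 and a per-iteration loop cost of 6, the
    // computation above gives SmallIC = min(IC, bit_floor(20 / 6)) =
    // min(IC, 2), so the loop is interleaved at most twice on this path.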
5053
5054 // Interleave until store/load ports (estimated by max interleave count) are
5055 // saturated.
5056 unsigned NumStores = Legal->getNumStores();
5057 unsigned NumLoads = Legal->getNumLoads();
5058 unsigned StoresIC = IC / (NumStores ? NumStores : 1);
5059 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
5060
5061 // There is little point in interleaving for reductions containing selects
5062 // and compares when VF=1 since it may just create more overhead than it's
5063 // worth for loops with small trip counts. This is because we still have to
5064 // do the final reduction after the loop.
5065 bool HasSelectCmpReductions =
5066 HasReductions &&
5067 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
5068 const RecurrenceDescriptor &RdxDesc = Reduction.second;
5069 RecurKind RK = RdxDesc.getRecurrenceKind();
5070 return RecurrenceDescriptor::isAnyOfRecurrenceKind(RK) ||
5071 RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK);
5072 });
5073 if (HasSelectCmpReductions) {
5074 LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n");
5075 return 1;
5076 }
5077
5078 // If we have a scalar reduction (vector reductions are already dealt with
5079 // by this point), we can increase the critical path length if the loop
5080 // we're interleaving is inside another loop. For tree-wise reductions
5081 // set the limit to 2, and for ordered reductions it's best to disable
5082 // interleaving entirely.
5083 if (HasReductions && TheLoop->getLoopDepth() > 1) {
5084 bool HasOrderedReductions =
5085 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
5086 const RecurrenceDescriptor &RdxDesc = Reduction.second;
5087 return RdxDesc.isOrdered();
5088 });
5089 if (HasOrderedReductions) {
5090 LLVM_DEBUG(
5091 dbgs() << "LV: Not interleaving scalar ordered reductions.\n");
5092 return 1;
5093 }
5094
5095 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
5096 SmallIC = std::min(SmallIC, F);
5097 StoresIC = std::min(StoresIC, F);
5098 LoadsIC = std::min(LoadsIC, F);
5099 }
5100
5102 std::max(StoresIC, LoadsIC) > SmallIC) {
5103 LLVM_DEBUG(
5104 dbgs() << "LV: Interleaving to saturate store or load ports.\n");
5105 return std::max(StoresIC, LoadsIC);
5106 }
5107
5108 // If there are scalar reductions and TTI has enabled aggressive
5109 // interleaving for reductions, we will interleave to expose ILP.
5110 if (VF.isScalar() && AggressivelyInterleaveReductions) {
5111 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5112 // Interleave no less than SmallIC but not as aggressive as the normal IC
5113 // to satisfy the rare situation when resources are too limited.
5114 return std::max(IC / 2, SmallIC);
5115 }
5116
5117 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
5118 return SmallIC;
5119 }
5120
5121 // Interleave if this is a large loop (small loops are already dealt with by
5122 // this point) that could benefit from interleaving.
5123 if (AggressivelyInterleaveReductions) {
5124 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5125 return IC;
5126 }
5127
5128 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
5129 return 1;
5130}
5131
5134 // This function calculates the register usage by measuring the highest number
5135 // of values that are alive at a single location. Obviously, this is a very
5136  // rough estimation. We scan the loop in topological order and
5137 // assign a number to each instruction. We use RPO to ensure that defs are
5138 // met before their users. We assume that each instruction that has in-loop
5139 // users starts an interval. We record every time that an in-loop value is
5140 // used, so we have a list of the first and last occurrences of each
5141 // instruction. Next, we transpose this data structure into a multi map that
5142 // holds the list of intervals that *end* at a specific location. This multi
5143 // map allows us to perform a linear search. We scan the instructions linearly
5144 // and record each time that a new interval starts, by placing it in a set.
5145 // If we find this value in the multi-map then we remove it from the set.
5146 // The max register usage is the maximum size of the set.
5147 // We also search for instructions that are defined outside the loop, but are
5148 // used inside the loop. We need this number separately from the max-interval
5149 // usage number because when we unroll, loop-invariant values do not take
5150  // more registers.
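  // For illustration: if value %a is defined early in a block and last used
  // near its end, while %b is defined and last used in between, then both
  // intervals are open simultaneously in the middle of the block, so at least
  // two values of that register class are live there. The largest number of
  // simultaneously open intervals seen during the scan is the usage estimate.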
5152 DFS.perform(LI);
5153
5154 RegisterUsage RU;
5155
5156 // Each 'key' in the map opens a new interval. The values
5157 // of the map are the index of the 'last seen' usage of the
5158 // instruction that is the key.
5160
5161 // Maps instruction to its index.
5163 // Marks the end of each interval.
5164 IntervalMap EndPoint;
5165 // Saves the list of instruction indices that are used in the loop.
5167 // Saves the list of values that are used in the loop but are defined outside
5168 // the loop (not including non-instruction values such as arguments and
5169 // constants).
5170 SmallSetVector<Instruction *, 8> LoopInvariants;
5171
5172 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
5173 for (Instruction &I : BB->instructionsWithoutDebug()) {
5174 IdxToInstr.push_back(&I);
5175
5176 // Save the end location of each USE.
5177 for (Value *U : I.operands()) {
5178 auto *Instr = dyn_cast<Instruction>(U);
5179
5180 // Ignore non-instruction values such as arguments, constants, etc.
5181 // FIXME: Might need some motivation why these values are ignored. If
5182        // for example an argument is used inside the loop, it will increase the
5183        // register pressure (so shouldn't we add it to LoopInvariants?).
5184 if (!Instr)
5185 continue;
5186
5187 // If this instruction is outside the loop then record it and continue.
5188 if (!TheLoop->contains(Instr)) {
5189 LoopInvariants.insert(Instr);
5190 continue;
5191 }
5192
5193 // Overwrite previous end points.
5194 EndPoint[Instr] = IdxToInstr.size();
5195 Ends.insert(Instr);
5196 }
5197 }
5198 }
5199
5200 // Saves the list of intervals that end with the index in 'key'.
5201 using InstrList = SmallVector<Instruction *, 2>;
5203
5204 // Transpose the EndPoints to a list of values that end at each index.
5205 for (auto &Interval : EndPoint)
5206 TransposeEnds[Interval.second].push_back(Interval.first);
5207
5208 SmallPtrSet<Instruction *, 8> OpenIntervals;
5211
5212 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
5213
5214 const auto &TTICapture = TTI;
5215 auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned {
5216 if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty) ||
5217 (VF.isScalable() &&
5218 !TTICapture.isElementTypeLegalForScalableVector(Ty)))
5219 return 0;
5220 return TTICapture.getRegUsageForType(VectorType::get(Ty, VF));
5221 };
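  // For illustration only, on a hypothetical target with 128-bit vector
  // registers: an i32 element type at VF = 8 forms a 256-bit vector value,
  // which getRegUsageForType would typically report as two registers, whereas
  // a token type (or an element type that is illegal for scalable vectors
  // when VF is scalable) contributes nothing.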
5222
5223 for (unsigned int Idx = 0, Sz = IdxToInstr.size(); Idx < Sz; ++Idx) {
5224 Instruction *I = IdxToInstr[Idx];
5225
5226 // Remove all of the instructions that end at this location.
5227 InstrList &List = TransposeEnds[Idx];
5228 for (Instruction *ToRemove : List)
5229 OpenIntervals.erase(ToRemove);
5230
5231 // Ignore instructions that are never used within the loop.
5232 if (!Ends.count(I))
5233 continue;
5234
5235 // Skip ignored values.
5236 if (ValuesToIgnore.count(I))
5237 continue;
5238
5240
5241 // For each VF find the maximum usage of registers.
5242 for (unsigned J = 0, E = VFs.size(); J < E; ++J) {
5243 // Count the number of registers used, per register class, given all open
5244 // intervals.
5245 // Note that elements in this SmallMapVector will be default constructed
5246 // as 0. So we can use "RegUsage[ClassID] += n" in the code below even if
5247 // there is no previous entry for ClassID.
5249
5250 if (VFs[J].isScalar()) {
5251 for (auto *Inst : OpenIntervals) {
5252 unsigned ClassID =
5253 TTI.getRegisterClassForType(false, Inst->getType());
5254 // FIXME: The target might use more than one register for the type
5255 // even in the scalar case.
5256 RegUsage[ClassID] += 1;
5257 }
5258 } else {
5260 for (auto *Inst : OpenIntervals) {
5261 // Skip ignored values for VF > 1.
5262 if (VecValuesToIgnore.count(Inst))
5263 continue;
5264 if (isScalarAfterVectorization(Inst, VFs[J])) {
5265 unsigned ClassID =
5266 TTI.getRegisterClassForType(false, Inst->getType());
5267 // FIXME: The target might use more than one register for the type
5268 // even in the scalar case.
5269 RegUsage[ClassID] += 1;
5270 } else {
5271 unsigned ClassID =
5272 TTI.getRegisterClassForType(true, Inst->getType());
5273 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[J]);
5274 }
5275 }
5276 }
5277
5278 for (const auto &Pair : RegUsage) {
5279 auto &Entry = MaxUsages[J][Pair.first];
5280 Entry = std::max(Entry, Pair.second);
5281 }
5282 }
5283
5284 LLVM_DEBUG(dbgs() << "LV(REG): At #" << Idx << " Interval # "
5285 << OpenIntervals.size() << '\n');
5286
5287 // Add the current instruction to the list of open intervals.
5288 OpenIntervals.insert(I);
5289 }
5290
5291 for (unsigned Idx = 0, End = VFs.size(); Idx < End; ++Idx) {
5292 // Note that elements in this SmallMapVector will be default constructed
5293 // as 0. So we can use "Invariant[ClassID] += n" in the code below even if
5294 // there is no previous entry for ClassID.
5296
5297 for (auto *Inst : LoopInvariants) {
5298 // FIXME: The target might use more than one register for the type
5299 // even in the scalar case.
5300 bool IsScalar = all_of(Inst->users(), [&](User *U) {
5301 auto *I = cast<Instruction>(U);
5302 return TheLoop != LI->getLoopFor(I->getParent()) ||
5303 isScalarAfterVectorization(I, VFs[Idx]);
5304 });
5305
5306 ElementCount VF = IsScalar ? ElementCount::getFixed(1) : VFs[Idx];
5307 unsigned ClassID =
5308 TTI.getRegisterClassForType(VF.isVector(), Inst->getType());
5309 Invariant[ClassID] += GetRegUsage(Inst->getType(), VF);
5310 }
5311
5312 LLVM_DEBUG({
5313 dbgs() << "LV(REG): VF = " << VFs[Idx] << '\n';
5314 dbgs() << "LV(REG): Found max usage: " << MaxUsages[Idx].size()
5315 << " item\n";
5316 for (const auto &pair : MaxUsages[Idx]) {
5317 dbgs() << "LV(REG): RegisterClass: "
5318 << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5319 << " registers\n";
5320 }
5321 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
5322 << " item\n";
5323 for (const auto &pair : Invariant) {
5324 dbgs() << "LV(REG): RegisterClass: "
5325 << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5326 << " registers\n";
5327 }
5328 });
5329
5330 RU.LoopInvariantRegs = Invariant;
5331 RU.MaxLocalUsers = MaxUsages[Idx];
5332 RUs[Idx] = RU;
5333 }
5334
5335 return RUs;
5336}
5337
5338bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I,
5339 ElementCount VF) {
5340 // TODO: Cost model for emulated masked load/store is completely
5341 // broken. This hack guides the cost model to use an artificially
5342 // high enough value to practically disable vectorization with such
5343 // operations, except where previously deployed legality hack allowed
5344 // using very low cost values. This is to avoid regressions coming simply
5345 // from moving "masked load/store" check from legality to cost model.
5346 // Masked Load/Gather emulation was previously never allowed.
5347 // Limited number of Masked Store/Scatter emulation was allowed.
5349 "Expecting a scalar emulated instruction");
5350 return isa<LoadInst>(I) ||
5351 (isa<StoreInst>(I) &&
5352 NumPredStores > NumberOfStoresToPredicate);
5353}
5354
5356 // If we aren't vectorizing the loop, or if we've already collected the
5357 // instructions to scalarize, there's nothing to do. Collection may already
5358 // have occurred if we have a user-selected VF and are now computing the
5359 // expected cost for interleaving.
5360 if (VF.isScalar() || VF.isZero() || InstsToScalarize.contains(VF))
5361 return;
5362
5363  // Initialize a mapping for VF in InstsToScalarize. If we find that it's
5364 // not profitable to scalarize any instructions, the presence of VF in the
5365 // map will indicate that we've analyzed it already.
5366 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
5367
5368 PredicatedBBsAfterVectorization[VF].clear();
5369
5370 // Find all the instructions that are scalar with predication in the loop and
5371 // determine if it would be better to not if-convert the blocks they are in.
5372 // If so, we also record the instructions to scalarize.
5373 for (BasicBlock *BB : TheLoop->blocks()) {
5375 continue;
5376 for (Instruction &I : *BB)
5377 if (isScalarWithPredication(&I, VF)) {
5378 ScalarCostsTy ScalarCosts;
5379 // Do not apply discount logic for:
5380 // 1. Scalars after vectorization, as there will only be a single copy
5381 // of the instruction.
5382 // 2. Scalable VF, as that would lead to invalid scalarization costs.
5383 // 3. Emulated masked memrefs, if a hacked cost is needed.
5384 if (!isScalarAfterVectorization(&I, VF) && !VF.isScalable() &&
5385 !useEmulatedMaskMemRefHack(&I, VF) &&
5386 computePredInstDiscount(&I, ScalarCosts, VF) >= 0) {
5387 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
5388 // Check if we decided to scalarize a call. If so, update the widening
5389 // decision of the call to CM_Scalarize with the computed scalar cost.
5390 for (const auto &[I, _] : ScalarCosts) {
5391 auto *CI = dyn_cast<CallInst>(I);
5392 if (!CI || !CallWideningDecisions.contains({CI, VF}))
5393 continue;
5394 CallWideningDecisions[{CI, VF}].Kind = CM_Scalarize;
5395 CallWideningDecisions[{CI, VF}].Cost = ScalarCosts[CI];
5396 }
5397 }
5398 // Remember that BB will remain after vectorization.
5399 PredicatedBBsAfterVectorization[VF].insert(BB);
5400 for (auto *Pred : predecessors(BB)) {
5401 if (Pred->getSingleSuccessor() == BB)
5402 PredicatedBBsAfterVectorization[VF].insert(Pred);
5403 }
5404 }
5405 }
5406}
5407
5408InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
5409 Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
5410 assert(!isUniformAfterVectorization(PredInst, VF) &&
5411 "Instruction marked uniform-after-vectorization will be predicated");
5412
5413 // Initialize the discount to zero, meaning that the scalar version and the
5414 // vector version cost the same.
5415 InstructionCost Discount = 0;
5416
5417 // Holds instructions to analyze. The instructions we visit are mapped in
5418 // ScalarCosts. Those instructions are the ones that would be scalarized if
5419 // we find that the scalar version costs less.
5421
5422 // Returns true if the given instruction can be scalarized.
5423 auto CanBeScalarized = [&](Instruction *I) -> bool {
5424 // We only attempt to scalarize instructions forming a single-use chain
5425 // from the original predicated block that would otherwise be vectorized.
5426 // Although not strictly necessary, we give up on instructions we know will
5427 // already be scalar to avoid traversing chains that are unlikely to be
5428 // beneficial.
5429 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
5431 return false;
5432
5433 // If the instruction is scalar with predication, it will be analyzed
5434 // separately. We ignore it within the context of PredInst.
5435 if (isScalarWithPredication(I, VF))
5436 return false;
5437
5438 // If any of the instruction's operands are uniform after vectorization,
5439 // the instruction cannot be scalarized. This prevents, for example, a
5440 // masked load from being scalarized.
5441 //
5442 // We assume we will only emit a value for lane zero of an instruction
5443 // marked uniform after vectorization, rather than VF identical values.
5444 // Thus, if we scalarize an instruction that uses a uniform, we would
5445 // create uses of values corresponding to the lanes we aren't emitting code
5446 // for. This behavior can be changed by allowing getScalarValue to clone
5447 // the lane zero values for uniforms rather than asserting.
5448 for (Use &U : I->operands())
5449 if (auto *J = dyn_cast<Instruction>(U.get()))
5450 if (isUniformAfterVectorization(J, VF))
5451 return false;
5452
5453 // Otherwise, we can scalarize the instruction.
5454 return true;
5455 };
5456
5457 // Compute the expected cost discount from scalarizing the entire expression
5458 // feeding the predicated instruction. We currently only consider expressions
5459 // that are single-use instruction chains.
5460 Worklist.push_back(PredInst);
5461 while (!Worklist.empty()) {
5462 Instruction *I = Worklist.pop_back_val();
5463
5464 // If we've already analyzed the instruction, there's nothing to do.
5465 if (ScalarCosts.contains(I))
5466 continue;
5467
5468 // Compute the cost of the vector instruction. Note that this cost already
5469 // includes the scalarization overhead of the predicated instruction.
5470 InstructionCost VectorCost = getInstructionCost(I, VF);
5471
5472 // Compute the cost of the scalarized instruction. This cost is the cost of
5473 // the instruction as if it wasn't if-converted and instead remained in the
5474 // predicated block. We will scale this cost by block probability after
5475 // computing the scalarization overhead.
5476 InstructionCost ScalarCost =
5478
5479 // Compute the scalarization overhead of needed insertelement instructions
5480 // and phi nodes.
5481 if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) {
5482 ScalarCost += TTI.getScalarizationOverhead(
5483 cast<VectorType>(toVectorTy(I->getType(), VF)),
5484 APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ true,
5485 /*Extract*/ false, CostKind);
5486 ScalarCost +=
5487 VF.getFixedValue() * TTI.getCFInstrCost(Instruction::PHI, CostKind);
5488 }
5489
5490 // Compute the scalarization overhead of needed extractelement
5491 // instructions. For each of the instruction's operands, if the operand can
5492 // be scalarized, add it to the worklist; otherwise, account for the
5493 // overhead.
5494 for (Use &U : I->operands())
5495 if (auto *J = dyn_cast<Instruction>(U.get())) {
5496 assert(VectorType::isValidElementType(J->getType()) &&
5497 "Instruction has non-scalar type");
5498 if (CanBeScalarized(J))
5499 Worklist.push_back(J);
5500 else if (needsExtract(J, VF)) {
5501 ScalarCost += TTI.getScalarizationOverhead(
5502 cast<VectorType>(toVectorTy(J->getType(), VF)),
5503 APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ false,
5504 /*Extract*/ true, CostKind);
5505 }
5506 }
5507
5508 // Scale the total scalar cost by block probability.
5509 ScalarCost /= getReciprocalPredBlockProb();
5510
5511 // Compute the discount. A non-negative discount means the vector version
5512 // of the instruction costs more, and scalarizing would be beneficial.
5513 Discount += VectorCost - ScalarCost;
5514 ScalarCosts[I] = ScalarCost;
5515 }
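  // For illustration (hypothetical costs, assuming the usual reciprocal block
  // probability of 2): if an instruction's vector cost is 8 and its scalar
  // cost is 10 before scaling, the scaled scalar cost is 5 and the chain
  // contributes 8 - 5 = 3 to the discount, so keeping it scalar and
  // predicated is expected to be cheaper than widening it.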
5516
5517 return Discount;
5518}
5519
5522
5523 // If the vector loop gets executed exactly once with the given VF, ignore the
5524 // costs of comparison and induction instructions, as they'll get simplified
5525 // away.
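  // For example, if the trip count is known to be 8 and VF is fixed at 8 with
  // no tail folding, the vector loop body executes exactly once, so its
  // induction update and exit comparison fold away and are not costed.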
5526 SmallPtrSet<Instruction *, 2> ValuesToIgnoreForVF;
5528 if (VF.isFixed() && TC == VF.getFixedValue() && !foldTailByMasking())
5530 ValuesToIgnoreForVF);
5531
5532 // For each block.
5533 for (BasicBlock *BB : TheLoop->blocks()) {
5534 InstructionCost BlockCost;
5535
5536 // For each instruction in the old loop.
5537 for (Instruction &I : BB->instructionsWithoutDebug()) {
5538 // Skip ignored values.
5539 if (ValuesToIgnore.count(&I) || ValuesToIgnoreForVF.count(&I) ||
5540 (VF.isVector() && VecValuesToIgnore.count(&I)))
5541 continue;
5542
5544
5545 // Check if we should override the cost.
5546 if (C.isValid() && ForceTargetInstructionCost.getNumOccurrences() > 0)
5548
5549 BlockCost += C;
5550 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C << " for VF "
5551 << VF << " For instruction: " << I << '\n');
5552 }
5553
5554 // If we are vectorizing a predicated block, it will have been
5555 // if-converted. This means that the block's instructions (aside from
5556 // stores and instructions that may divide by zero) will now be
5557 // unconditionally executed. For the scalar case, we may not always execute
5558 // the predicated block, if it is an if-else block. Thus, scale the block's
5559 // cost by the probability of executing it. blockNeedsPredication from
5560 // Legal is used so as to not include all blocks in tail folded loops.
5561 if (VF.isScalar() && Legal->blockNeedsPredication(BB))
5562 BlockCost /= getReciprocalPredBlockProb();
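    // For illustration, assuming the common reciprocal block probability of 2:
    // a predicated block whose instructions cost 8 contributes only 4 to the
    // scalar loop cost, reflecting that it executes on roughly half of the
    // iterations.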
5563
5564 Cost += BlockCost;
5565 }
5566
5567 return Cost;
5568}
5569
5570/// Gets Address Access SCEV after verifying that the access pattern
5571/// is loop invariant except the induction variable dependence.
5572///
5573/// This SCEV can be sent to the Target in order to estimate the address
5574/// calculation cost.
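/// For illustration (hypothetical IR): given
///   %addr = getelementptr inbounds [1024 x i32], ptr %A, i64 %inv, i64 %iv
/// where %inv is loop invariant and %iv is an induction variable, the SCEV of
/// the address is returned; if any index is neither loop invariant nor an
/// induction variable, nullptr is returned and no address SCEV is passed on.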
5576 Value *Ptr,
5579 const Loop *TheLoop) {
5580
5581 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
5582 if (!Gep)
5583 return nullptr;
5584
5585 // We are looking for a gep with all loop invariant indices except for one
5586 // which should be an induction variable.
5587 auto *SE = PSE.getSE();
5588 unsigned NumOperands = Gep->getNumOperands();
5589 for (unsigned Idx = 1; Idx < NumOperands; ++Idx) {
5590 Value *Opd = Gep->getOperand(Idx);
5591 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
5592 !Legal->isInductionVariable(Opd))
5593 return nullptr;
5594 }
5595
5596  // Now we know we have a GEP ptr, %inv, %ind, %inv. Return the Ptr SCEV.
5597 return PSE.getSCEV(Ptr);
5598}
5599
5601LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
5602 ElementCount VF) {
5603 assert(VF.isVector() &&
5604 "Scalarization cost of instruction implies vectorization.");
5605 if (VF.isScalable())
5607
5608 Type *ValTy = getLoadStoreType(I);
5609 auto *SE = PSE.getSE();
5610
5611 unsigned AS = getLoadStoreAddressSpace(I);
5613 Type *PtrTy = toVectorTy(Ptr->getType(), VF);
5614 // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost`
5615 // that it is being called from this specific place.
5616
5617 // Figure out whether the access is strided and get the stride value
5618  // if it's known at compile time.
5619 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
5620
5621 // Get the cost of the scalar memory instruction and address computation.
5623 VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
5624
5625 // Don't pass *I here, since it is scalar but will actually be part of a
5626 // vectorized loop where the user of it is a vectorized instruction.
5627 const Align Alignment = getLoadStoreAlignment(I);
5628 Cost += VF.getKnownMinValue() * TTI.getMemoryOpCost(I->getOpcode(),
5629 ValTy->getScalarType(),
5630 Alignment, AS, CostKind);
5631
5632 // Get the overhead of the extractelement and insertelement instructions
5633 // we might create due to scalarization.
5634 Cost += getScalarizationOverhead(I, VF);
5635
5636 // If we have a predicated load/store, it will need extra i1 extracts and
5637 // conditional branches, but may not be executed for each vector lane. Scale
5638 // the cost by the probability of executing the predicated block.
5639 if (isPredicatedInst(I)) {
5641
5642 // Add the cost of an i1 extract and a branch
5643 auto *VecI1Ty =
5646 VecI1Ty, APInt::getAllOnes(VF.getKnownMinValue()),
5647 /*Insert=*/false, /*Extract=*/true, CostKind);
5648 Cost += TTI.getCFInstrCost(Instruction::Br, CostKind);
5649
5650 if (useEmulatedMaskMemRefHack(I, VF))
5651 // Artificially setting to a high enough value to practically disable
5652 // vectorization with such operations.
5653 Cost = 3000000;
5654 }
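  // For illustration at VF = 4: a scalarized store pays four address
  // computations, four scalar stores and the insert/extract overhead of
  // moving lane values between vector and scalar form; if it is predicated,
  // the cost is additionally scaled by the block's execution probability and
  // pays the i1 mask extracts for the four lanes plus a branch, and when the
  // emulated-mask hack applies the cost is pinned at 3000000 so the VF is
  // effectively never selected.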
5655
5656 return Cost;
5657}
5658
5660LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
5661 ElementCount VF) {
5662 Type *ValTy = getLoadStoreType(I);
5663 auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
5665 unsigned AS = getLoadStoreAddressSpace(I);
5666 int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr);
5667
5668 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
5669 "Stride should be 1 or -1 for consecutive memory access");
5670 const Align Alignment = getLoadStoreAlignment(I);
5672 if (Legal->isMaskRequired(I)) {
5673 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
5674 CostKind);
5675 } else {
5676 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
5677 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
5678 CostKind, OpInfo, I);
5679 }
5680
5681 bool Reverse = ConsecutiveStride < 0;
5682 if (Reverse)
5684 CostKind, 0);
5685 return Cost;
5686}
5687
5689LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
5690 ElementCount VF) {
5691 assert(Legal->isUniformMemOp(*I, VF));
5692
5693 Type *ValTy = getLoadStoreType(I);
5694 auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
5695 const Align Alignment = getLoadStoreAlignment(I);
5696 unsigned AS = getLoadStoreAddressSpace(I);
5697 if (isa<LoadInst>(I)) {
5698 return TTI.getAddressComputationCost(ValTy) +
5699 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
5700 CostKind) +
5702 CostKind);
5703 }
5704 StoreInst *SI = cast<StoreInst>(I);
5705
5706 bool IsLoopInvariantStoreValue = Legal->isInvariant(SI->getValueOperand());
5707 return TTI.getAddressComputationCost(ValTy) +
5708 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
5709 CostKind) +
5710 (IsLoopInvariantStoreValue
5711 ? 0
5712 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
5713 CostKind, VF.getKnownMinValue() - 1));
5714}
5715
5717LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
5718 ElementCount VF) {
5719 Type *ValTy = getLoadStoreType(I);
5720 auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
5721 const Align Alignment = getLoadStoreAlignment(I);
5723
5724 return TTI.getAddressComputationCost(VectorTy) +
5725 TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr,
5726 Legal->isMaskRequired(I), Alignment,
5727 CostKind, I);
5728}
5729
5731LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
5732 ElementCount VF) {
5733 const auto *Group = getInterleavedAccessGroup(I);
5734 assert(Group && "Fail to get an interleaved access group.");
5735
5736 Instruction *InsertPos = Group->getInsertPos();
5737 Type *ValTy = getLoadStoreType(InsertPos);
5738 auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
5739 unsigned AS = getLoadStoreAddressSpace(InsertPos);
5740
5741 unsigned InterleaveFactor = Group->getFactor();
5742 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
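  // For example, with VF = 4 and an interleave factor of 2, WideVecTy has
  // 8 elements, covering one VF-wide chunk of each member of the group.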
5743
5744 // Holds the indices of existing members in the interleaved group.
5746 for (unsigned IF = 0; IF < InterleaveFactor; IF++)
5747 if (Group->getMember(IF))
5748 Indices.push_back(IF);
5749
5750 // Calculate the cost of the whole interleaved group.
5751 bool UseMaskForGaps =
5752 (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
5753 (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()));
5755 InsertPos->getOpcode(), WideVecTy, Group->getFactor(), Indices,
5756 Group->getAlign(), AS, CostKind, Legal->isMaskRequired(I),
5757 UseMaskForGaps);
5758
5759 if (Group->isReverse()) {
5760 // TODO: Add support for reversed masked interleaved access.
5762 "Reverse masked interleaved access not supported.");
5763 Cost += Group->getNumMembers() *
5765 CostKind, 0);
5766 }
5767 return Cost;
5768}
5769
5770std::optional<InstructionCost>
5772 ElementCount VF,
5773 Type *Ty) const {
5774 using namespace llvm::PatternMatch;
5775 // Early exit for no inloop reductions
5776 if (InLoopReductions.empty() || VF.isScalar() || !isa<VectorType>(Ty))
5777 return std::nullopt;
5778 auto *VectorTy = cast<VectorType>(Ty);
5779
5780  // We are looking for one of the following patterns, and finding the minimal acceptable cost:
5781 // reduce(mul(ext(A), ext(B))) or
5782 // reduce(mul(A, B)) or
5783 // reduce(ext(A)) or
5784 // reduce(A).
5785 // The basic idea is that we walk down the tree to do that, finding the root
5786 // reduction instruction in InLoopReductionImmediateChains. From there we find
5787 // the pattern of mul/ext and test the cost of the entire pattern vs the cost
5788 // of the components. If the reduction cost is lower then we return it for the
5789  // of the components. If the reduction cost is lower, then we return it for
5790  // the reduction instruction and 0 for the other instructions in the pattern.
5791  // If it is not, we return an invalid cost indicating that the original cost method should be used.
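  // For illustration (hypothetical source): a loop computing
  //   sum += (int)a[i] * (int)b[i]
  // over i16 arrays forms reduce.add(mul(sext(A), sext(B))). If the target
  // reports a cheaper cost for the fused multiply-accumulate reduction than
  // for the separate ext, mul and reduce components, the reduction
  // instruction is costed with the fused value and the feeding ext/mul are
  // costed as 0; otherwise the original per-instruction costing is kept.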
5792 Instruction *RetI = I;
5793 if (match(RetI, m_ZExtOrSExt(m_Value()))) {
5794 if (!RetI->hasOneUser())
5795 return std::nullopt;
5796 RetI = RetI->user_back();
5797 }
5798
5799 if (match(RetI, m_OneUse(m_Mul(m_Value(), m_Value()))) &&
5800 RetI->user_back()->getOpcode() == Instruction::Add) {
5801 RetI = RetI->user_back();
5802 }
5803
5804 // Test if the found instruction is a reduction, and if not return an invalid
5805 // cost specifying the parent to use the original cost modelling.
5806 if (!InLoopReductionImmediateChains.count(RetI))
5807 return std::nullopt;
5808
5809 // Find the reduction this chain is a part of and calculate the basic cost of
5810 // the reduction on its own.
5811 Instruction *LastChain = InLoopReductionImmediateChains.at(RetI);
5812 Instruction *ReductionPhi = LastChain;
5813 while (!isa<PHINode>(ReductionPhi))
5814 ReductionPhi = InLoopReductionImmediateChains.at(ReductionPhi);
5815
5816 const RecurrenceDescriptor &RdxDesc =
5817 Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second;
5818
5819 InstructionCost BaseCost;
5820 RecurKind RK = RdxDesc.getRecurrenceKind();
5823 BaseCost = TTI.getMinMaxReductionCost(MinMaxID, VectorTy,
5824 RdxDesc.getFastMathFlags(), CostKind);
5825 } else {
5827 RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind);
5828 }
5829
5830 // For a call to the llvm.fmuladd intrinsic we need to add the cost of a
5831 // normal fmul instruction to the cost of the fadd reduction.
5832 if (RK == RecurKind::FMulAdd)
5833 BaseCost +=
5834 TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind);
5835
5836 // If we're using ordered reductions then we can just return the base cost
5837 // here, since getArithmeticReductionCost calculates the full ordered
5838 // reduction cost when FP reassociation is not allowed.
5839 if (useOrderedReductions(RdxDesc))
5840 return BaseCost;
5841
5842 // Get the operand that was not the reduction chain and match it to one of the
5843 // patterns, returning the better cost if it is found.
5844 Instruction *RedOp = RetI->getOperand(1) == LastChain
5845 ? dyn_cast<Instruction>(RetI->getOperand(0))
5846 : dyn_cast<Instruction>(RetI->getOperand(1));
5847
5848 VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy);
5849
5850 Instruction *Op0, *Op1;
5851 if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
5852 match(RedOp,
5854 match(Op0, m_ZExtOrSExt(m_Value())) &&
5855 Op0->getOpcode() == Op1->getOpcode() &&
5856 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
5858 (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {
5859
5860 // Matched reduce.add(ext(mul(ext(A), ext(B)))
5861 // Note that the extend opcodes need to all match, or if A==B they will have
5862 // been converted to zext(mul(sext(A), sext(A))) as it is known positive,
5863 // which is equally fine.
5864 bool IsUnsigned = isa<ZExtInst>(Op0);
5865 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy);
5866 auto *MulType = VectorType::get(Op0->getType(), VectorTy);
5867
5868 InstructionCost ExtCost =
5869 TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType,
5871 InstructionCost MulCost =
5872 TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind);
5873 InstructionCost Ext2Cost =
5874 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType,
5876
5878 IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind);
5879
5880 if (RedCost.isValid() &&
5881 RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost)
5882 return I == RetI ? RedCost : 0;
5883 } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) &&
5884 !TheLoop->isLoopInvariant(RedOp)) {
5885 // Matched reduce(ext(A))
5886 bool IsUnsigned = isa<ZExtInst>(RedOp);
5887 auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy);
5889 RdxDesc.getOpcode(), IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
5890 RdxDesc.getFastMathFlags(), CostKind);
5891
5892 InstructionCost ExtCost =
5893 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType,
5895 if (RedCost.isValid() && RedCost < BaseCost + ExtCost)
5896 return I == RetI ? RedCost : 0;
5897 } else if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
5898 match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) {
5899 if (match(Op0, m_ZExtOrSExt(m_Value())) &&
5900 Op0->getOpcode() == Op1->getOpcode() &&
5902 bool IsUnsigned = isa<ZExtInst>(Op0);
5903 Type *Op0Ty = Op0->getOperand(0)->getType();
5904 Type *Op1Ty = Op1->getOperand(0)->getType();
5905 Type *LargestOpTy =
5906 Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty
5907 : Op0Ty;
5908 auto *ExtType = VectorType::get(LargestOpTy, VectorTy);
5909
5910 // Matched reduce.add(mul(ext(A), ext(B))), where the two ext may be of
5911 // different sizes. We take the largest type as the ext to reduce, and add
5912 // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))).
5914 Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy),
5917 Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy),
5919 InstructionCost MulCost =
5920 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
5921
5923 IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind);
5924 InstructionCost ExtraExtCost = 0;
5925 if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) {
5926 Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1;
5927 ExtraExtCost = TTI.getCastInstrCost(
5928 ExtraExtOp->getOpcode(), ExtType,
5929 VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy),
5931 }
5932
5933 if (RedCost.isValid() &&
5934 (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost))
5935 return I == RetI ? RedCost : 0;
5936 } else if (!match(I, m_ZExtOrSExt(m_Value()))) {
5937 // Matched reduce.add(mul())
5938 InstructionCost MulCost =
5939 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
5940
5942 true, RdxDesc.getRecurrenceType(), VectorTy, CostKind);
5943
5944 if (RedCost.isValid() && RedCost < MulCost + BaseCost)
5945 return I == RetI ? RedCost : 0;
5946 }
5947 }
5948
5949 return I == RetI ? std::optional<InstructionCost>(BaseCost) : std::nullopt;
5950}
5951
5953LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
5954 ElementCount VF) {
5955 // Calculate scalar cost only. Vectorization cost should be ready at this
5956 // moment.
5957 if (VF.isScalar()) {
5958 Type *ValTy = getLoadStoreType(I);
5959 const Align Alignment = getLoadStoreAlignment(I);
5960 unsigned AS = getLoadStoreAddressSpace(I);
5961
5962 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
5963 return TTI.getAddressComputationCost(ValTy) +
5964 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, CostKind,
5965 OpInfo, I);
5966 }
5967 return getWideningCost(I, VF);
5968}
5969
5971LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
5972 ElementCount VF) const {
5973
5974 // There is no mechanism yet to create a scalable scalarization loop,
5975 // so this is currently Invalid.
5976 if (VF.isScalable())
5978
5979 if (VF.isScalar())
5980 return 0;
5981
5983 Type *RetTy = toVectorTy(I->getType(), VF);
5984 if (!RetTy->isVoidTy() &&
5985 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
5987 cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()),
5988 /*Insert*/ true,
5989 /*Extract*/ false, CostKind);
5990
5991 // Some targets keep addresses scalar.
5992 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
5993 return Cost;
5994
5995 // Some targets support efficient element stores.
5996 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
5997 return Cost;
5998
5999 // Collect operands to consider.
6000 CallInst *CI = dyn_cast<CallInst>(I);
6001 Instruction::op_range Ops = CI ? CI->args() : I->operands();
6002
6003 // Skip operands that do not require extraction/scalarization and do not incur
6004 // any overhead.
6006 for (auto *V : filterExtractingOperands(Ops, VF))
6007 Tys.push_back(maybeVectorizeType(V->getType(), VF));
6009 filterExtractingOperands(Ops, VF), Tys, CostKind);
6010}
6011
6013 if (VF.isScalar())
6014 return;
6015 NumPredStores = 0;
6016 for (BasicBlock *BB : TheLoop->blocks()) {
6017 // For each instruction in the old loop.
6018 for (Instruction &I : *BB) {
6020 if (!Ptr)
6021 continue;
6022
6023 // TODO: We should generate better code and update the cost model for
6024 // predicated uniform stores. Today they are treated as any other
6025 // predicated store (see added test cases in
6026 // invariant-store-vectorization.ll).
6027 if (isa<StoreInst>(&I) && isScalarWithPredication(&I, VF))
6028 NumPredStores++;
6029
6030 if (Legal->isUniformMemOp(I, VF)) {
6031 auto IsLegalToScalarize = [&]() {
6032 if (!VF.isScalable())
6033 // Scalarization of fixed length vectors "just works".
6034 return true;
6035
6036 // We have dedicated lowering for unpredicated uniform loads and
6037 // stores. Note that even with tail folding we know that at least
6038 // one lane is active (i.e. generalized predication is not possible
6039 // here), and the logic below depends on this fact.
6040 if (!foldTailByMasking())
6041 return true;
6042
6043 // For scalable vectors, a uniform memop load is always
6044 // uniform-by-parts and we know how to scalarize that.
6045 if (isa<LoadInst>(I))
6046 return true;
6047
6048          // A uniform store isn't necessarily uniform-by-parts
6049 // and we can't assume scalarization.
6050 auto &SI = cast<StoreInst>(I);
6051 return TheLoop->isLoopInvariant(SI.getValueOperand());
6052 };
6053
6054 const InstructionCost GatherScatterCost =
6056 getGatherScatterCost(&I, VF) : InstructionCost::getInvalid();
6057
6058 // Load: Scalar load + broadcast
6059 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
6060 // FIXME: This cost is a significant under-estimate for tail folded
6061 // memory ops.
6062 const InstructionCost ScalarizationCost =
6063 IsLegalToScalarize() ? getUniformMemOpCost(&I, VF)
6065
6066        // Choose the better solution for the current VF. Note that Invalid
6067        // costs compare as maximally large. If both are invalid, we get
6068        // scalable invalid, which signals a failure and a vectorization abort.
6069 if (GatherScatterCost < ScalarizationCost)
6070 setWideningDecision(&I, VF, CM_GatherScatter, GatherScatterCost);
6071 else
6072 setWideningDecision(&I, VF, CM_Scalarize, ScalarizationCost);
6073 continue;
6074 }
6075
6076 // We assume that widening is the best solution when possible.
6077 if (memoryInstructionCanBeWidened(&I, VF)) {
6078 InstructionCost Cost = getConsecutiveMemOpCost(&I, VF);
6079 int ConsecutiveStride = Legal->isConsecutivePtr(
6081 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6082 "Expected consecutive stride.");
6083 InstWidening Decision =
6084 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
6085 setWideningDecision(&I, VF, Decision, Cost);
6086 continue;
6087 }
6088
6089 // Choose between Interleaving, Gather/Scatter or Scalarization.
6091 unsigned NumAccesses = 1;
6092 if (isAccessInterleaved(&I)) {
6093 const auto *Group = getInterleavedAccessGroup(&I);
6094 assert(Group && "Fail to get an interleaved access group.");
6095
6096 // Make one decision for the whole group.
6097 if (getWideningDecision(&I, VF) != CM_Unknown)
6098 continue;
6099
6100 NumAccesses = Group->getNumMembers();
6101 if (interleavedAccessCanBeWidened(&I, VF))
6102 InterleaveCost = getInterleaveGroupCost(&I, VF);
6103 }
6104
6105 InstructionCost GatherScatterCost =
6106 isLegalGatherOrScatter(&I, VF)
6107 ? getGatherScatterCost(&I, VF) * NumAccesses
6108 : InstructionCost::getInvalid();
6109
6110 InstructionCost ScalarizationCost =
6111 getMemInstScalarizationCost(&I, VF) * NumAccesses;
6112
6113 // Choose the better solution for the current VF,
6114 // write down this decision and use it during vectorization.
6115 InstructionCost Cost;
6116 InstWidening Decision;
6117 if (InterleaveCost <= GatherScatterCost &&
6118 InterleaveCost < ScalarizationCost) {
6119 Decision = CM_Interleave;
6120 Cost = InterleaveCost;
6121 } else if (GatherScatterCost < ScalarizationCost) {
6122 Decision = CM_GatherScatter;
6123 Cost = GatherScatterCost;
6124 } else {
6125 Decision = CM_Scalarize;
6126 Cost = ScalarizationCost;
6127 }
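 // Worked example of the comparison above (costs are made up for
 // illustration): with InterleaveCost = 4, GatherScatterCost = 4 and
 // ScalarizationCost = 6, the tie between interleaving and gather/scatter is
 // broken in favour of CM_Interleave, because InterleaveCost <=
 // GatherScatterCost and InterleaveCost < ScalarizationCost.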
6128 // If the instruction belongs to an interleave group, the whole group
6129 // receives the same decision. The whole group also receives the cost, but
6130 // the cost will actually be assigned to one instruction.
6131 if (const auto *Group = getInterleavedAccessGroup(&I))
6132 setWideningDecision(Group, VF, Decision, Cost);
6133 else
6134 setWideningDecision(&I, VF, Decision, Cost);
6135 }
6136 }
6137
6138 // Make sure that any load of address and any other address computation
6139 // remains scalar unless there is gather/scatter support. This avoids
6140 // inevitable extracts into address registers, and also has the benefit of
6141 // activating LSR more, since that pass can't optimize vectorized
6142 // addresses.
6143 if (TTI.prefersVectorizedAddressing())
6144 return;
6145
6146 // Start with all scalar pointer uses.
6147 SmallPtrSet<Instruction *, 8> AddrDefs;
6148 for (BasicBlock *BB : TheLoop->blocks())
6149 for (Instruction &I : *BB) {
6150 Instruction *PtrDef =
6151 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
6152 if (PtrDef && TheLoop->contains(PtrDef) &&
6153 getWideningDecision(&I, VF) != CM_GatherScatter)
6154 AddrDefs.insert(PtrDef);
6155 }
6156
6157 // Add all instructions used to generate the addresses.
6158 SmallVector<Instruction *, 4> Worklist;
6159 append_range(Worklist, AddrDefs);
6160 while (!Worklist.empty()) {
6161 Instruction *I = Worklist.pop_back_val();
6162 for (auto &Op : I->operands())
6163 if (auto *InstOp = dyn_cast<Instruction>(Op))
6164 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
6165 AddrDefs.insert(InstOp).second)
6166 Worklist.push_back(InstOp);
6167 }
6168
6169 for (auto *I : AddrDefs) {
6170 if (isa<LoadInst>(I)) {
6171 // Setting the desired widening decision should ideally be handled
6172 // by cost functions, but since this involves the task of finding out
6173 // if the loaded register is involved in an address computation, it is
6174 // instead changed here when we know this is the case.
6175 InstWidening Decision = getWideningDecision(I, VF);
6176 if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
6177 // Scalarize a widened load of address.
6178 setWideningDecision(
6179 I, VF, CM_Scalarize,
6180 (VF.getKnownMinValue() *
6181 getMemoryInstructionCost(I, ElementCount::getFixed(1))));
6182 else if (const auto *Group = getInterleavedAccessGroup(I)) {
6183 // Scalarize an interleave group of address loads.
6184 for (unsigned I = 0; I < Group->getFactor(); ++I) {
6185 if (Instruction *Member = Group->getMember(I))
6186 setWideningDecision(
6187 Member, VF, CM_Scalarize,
6188 (VF.getKnownMinValue() *
6189 getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
6190 }
6191 }
6192 } else
6193 // Make sure I gets scalarized and receives a cost estimate without
6194 // scalarization overhead.
6195 ForcedScalars[VF].insert(I);
6196 }
6197}
6198
6199void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
6200 assert(!VF.isScalar() &&
6201 "Trying to set a vectorization decision for a scalar VF");
6202
6203 auto ForcedScalar = ForcedScalars.find(VF);
6204 for (BasicBlock *BB : TheLoop->blocks()) {
6205 // For each instruction in the old loop.
6206 for (Instruction &I : *BB) {
6207 CallInst *CI = dyn_cast<CallInst>(&I);
6208
6209 if (!CI)
6210 continue;
6211
6212 InstructionCost ScalarCost = InstructionCost::getInvalid();
6213 InstructionCost VectorCost = InstructionCost::getInvalid();
6214 InstructionCost IntrinsicCost = InstructionCost::getInvalid();
6215 Function *ScalarFunc = CI->getCalledFunction();
6216 Type *ScalarRetTy = CI->getType();
6217 SmallVector<Type *, 4> Tys, ScalarTys;
6218 for (auto &ArgOp : CI->args())
6219 ScalarTys.push_back(ArgOp->getType());
6220
6221 // Estimate cost of scalarized vector call. The source operands are
6222 // assumed to be vectors, so we need to extract individual elements from
6223 // there, execute VF scalar calls, and then gather the result into the
6224 // vector return value.
6225 InstructionCost ScalarCallCost =
6226 TTI.getCallInstrCost(ScalarFunc, ScalarRetTy, ScalarTys, CostKind);
6227
6228 // Compute costs of unpacking argument values for the scalar calls and
6229 // packing the return values to a vector.
6230 InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF);
6231
6232 ScalarCost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
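 // For illustration only (made-up numbers): with VF = 4, a scalar call cost
 // of 10 and a scalarization overhead of 6 for the argument extracts and the
 // result gather, ScalarCost = 10 * 4 + 6 = 46.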
6233 // Honor ForcedScalars and UniformAfterVectorization decisions.
6234 // TODO: For calls, it might still be more profitable to widen. Use
6235 // VPlan-based cost model to compare different options.
6236 if (VF.isVector() && ((ForcedScalar != ForcedScalars.end() &&
6237 ForcedScalar->second.contains(CI)) ||
6238 isUniformAfterVectorization(CI, VF))) {
6239 setCallWideningDecision(CI, VF, CM_Scalarize, nullptr,
6240 Intrinsic::not_intrinsic, std::nullopt,
6241 ScalarCost);
6242 continue;
6243 }
6244
6245 bool MaskRequired = Legal->isMaskRequired(CI);
6246 // Compute corresponding vector type for return value and arguments.
6247 Type *RetTy = toVectorTy(ScalarRetTy, VF);
6248 for (Type *ScalarTy : ScalarTys)
6249 Tys.push_back(toVectorTy(ScalarTy, VF));
6250
6251 // An in-loop reduction using an fmuladd intrinsic is a special case;
6252 // we don't want the normal cost for that intrinsic.
6253 if (RecurrenceDescriptor::isFMulAddIntrinsic(CI))
6254 if (auto RedCost = getReductionPatternCost(CI, VF, RetTy)) {
6255 setCallWideningDecision(CI, VF, CM_IntrinsicCall, nullptr,
6256 getVectorIntrinsicIDForCall(CI, TLI),
6257 std::nullopt, *RedCost);
6258 continue;
6259 }
6260
6261 // Find the cost of vectorizing the call, if we can find a suitable
6262 // vector variant of the function.
6263 bool UsesMask = false;
6264 VFInfo FuncInfo;
6265 Function *VecFunc = nullptr;
6266 // Search through any available variants for one we can use at this VF.
6267 for (VFInfo &Info : VFDatabase::getMappings(*CI)) {
6268 // Must match requested VF.
6269 if (Info.Shape.VF != VF)
6270 continue;
6271
6272 // Must take a mask argument if one is required
6273 if (MaskRequired && !Info.isMasked())
6274 continue;
6275
6276 // Check that all parameter kinds are supported
6277 bool ParamsOk = true;
6278 for (VFParameter Param : Info.Shape.Parameters) {
6279 switch (Param.ParamKind) {
6280 case VFParamKind::Vector:
6281 break;
6282 case VFParamKind::OMP_Uniform: {
6283 Value *ScalarParam = CI->getArgOperand(Param.ParamPos);
6284 // Make sure the scalar parameter in the loop is invariant.
6285 if (!PSE.getSE()->isLoopInvariant(PSE.getSCEV(ScalarParam),
6286 TheLoop))
6287 ParamsOk = false;
6288 break;
6289 }
6290 case VFParamKind::OMP_Linear: {
6291 Value *ScalarParam = CI->getArgOperand(Param.ParamPos);
6292 // Find the stride for the scalar parameter in this loop and see if
6293 // it matches the stride for the variant.
6294 // TODO: do we need to figure out the cost of an extract to get the
6295 // first lane? Or do we hope that it will be folded away?
6296 ScalarEvolution *SE = PSE.getSE();
6297 const auto *SAR =
6298 dyn_cast<SCEVAddRecExpr>(SE->getSCEV(ScalarParam));
6299
6300 if (!SAR || SAR->getLoop() != TheLoop) {
6301 ParamsOk = false;
6302 break;
6303 }
6304
6305 const SCEVConstant *Step =
6306 dyn_cast<SCEVConstant>(SAR->getStepRecurrence(*SE));
6307
6308 if (!Step ||
6309 Step->getAPInt().getSExtValue() != Param.LinearStepOrPos)
6310 ParamsOk = false;
6311
6312 break;
6313 }
6314 case VFParamKind::GlobalPredicate:
6315 UsesMask = true;
6316 break;
6317 default:
6318 ParamsOk = false;
6319 break;
6320 }
6321 }
6322
6323 if (!ParamsOk)
6324 continue;
6325
6326 // Found a suitable candidate, stop here.
6327 VecFunc = CI->getModule()->getFunction(Info.VectorName);
6328 FuncInfo = Info;
6329 break;
6330 }
6331
6332 // Add in the cost of synthesizing a mask if one wasn't required.
6333 InstructionCost MaskCost = 0;
6334 if (VecFunc && UsesMask && !MaskRequired)
6335 MaskCost = TTI.getShuffleCost(
6336 TargetTransformInfo::SK_Broadcast,
6337 VectorType::get(IntegerType::getInt1Ty(
6338 VecFunc->getFunctionType()->getContext()),
6339 VF),
6340 {}, CostKind);
6341
6342 if (TLI && VecFunc && !CI->isNoBuiltin())
6343 VectorCost =
6344 TTI.getCallInstrCost(nullptr, RetTy, Tys, CostKind) + MaskCost;
6345
6346 // Find the cost of an intrinsic; some targets may have instructions that
6347 // perform the operation without needing an actual call.
6348 Intrinsic::ID IID = getVectorIntrinsicIDForCall(CI, TLI);
6349 if (IID != Intrinsic::not_intrinsic)
6350 IntrinsicCost = getVectorIntrinsicCost(CI, VF);
6351
6352 InstructionCost Cost = ScalarCost;
6353 InstWidening Decision = CM_Scalarize;
6354
6355 if (VectorCost <= Cost) {
6356 Cost = VectorCost;
6357 Decision = CM_VectorCall;
6358 }
6359
6360 if (IntrinsicCost <= Cost) {
6361 Cost = IntrinsicCost;
6362 Decision = CM_IntrinsicCall;
6363 }
6364
6365 setCallWideningDecision(CI, VF, Decision, VecFunc, IID,
6366 FuncInfo.getParamIndexForOptionalMask(), Cost);
6367 }
6368 }
6369}
6370
6371bool LoopVectorizationCostModel::shouldConsiderInvariant(Value *Op) {
6372 if (!Legal->isInvariant(Op))
6373 return false;
6374 // Consider Op invariant only if neither it nor its operands are predicated
6375 // instructions in the loop; otherwise it is not trivially hoistable.
6376 auto *OpI = dyn_cast<Instruction>(Op);
6377 return !OpI || !TheLoop->contains(OpI) ||
6378 (!isPredicatedInst(OpI) &&
6379 (!isa<PHINode>(OpI) || OpI->getParent() != TheLoop->getHeader()) &&
6380 all_of(OpI->operands(),
6381 [this](Value *Op) { return shouldConsiderInvariant(Op); }));
6382}
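// Illustrative example (not part of the upstream comments): an add in the
// loop whose operands are both defined outside the loop is considered
// invariant and thus trivially hoistable, whereas a division that only
// executes under a predicate is not, even if its operands are invariant.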
6383
6384InstructionCost
6385LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6386 ElementCount VF) {
6387 // If we know that this instruction will remain uniform, check the cost of
6388 // the scalar version.
6389 if (isUniformAfterVectorization(I, VF))
6390 VF = ElementCount::getFixed(1);
6391
6392 if (VF.isVector() && isProfitableToScalarize(I, VF))
6393 return InstsToScalarize[VF][I];
6394
6395 // Forced scalars do not have any scalarization overhead.
6396 auto ForcedScalar = ForcedScalars.find(VF);
6397 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
6398 auto InstSet = ForcedScalar->second;
6399 if (InstSet.count(I))
6400 return getInstructionCost(I, ElementCount::getFixed(1)) *
6401 VF.getKnownMinValue();
6402 }
6403
6404 Type *RetTy = I->getType();
6405 if (canTruncateToMinimalBitwidth(I, VF))
6406 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
6407 auto *SE = PSE.getSE();
6408
6409 auto HasSingleCopyAfterVectorization = [this](Instruction *I,
6410 ElementCount VF) -> bool {
6411 if (VF.isScalar())
6412 return true;
6413
6414 auto Scalarized = InstsToScalarize.find(VF);
6415 assert(Scalarized != InstsToScalarize.end() &&
6416 "VF not yet analyzed for scalarization profitability");
6417 return !Scalarized->second.count(I) &&
6418 llvm::all_of(I->users(), [&](User *U) {
6419 auto *UI = cast<Instruction>(U);
6420 return !Scalarized->second.count(UI);
6421 });
6422 };
6423 (void)HasSingleCopyAfterVectorization;
6424
6425 Type *VectorTy;
6426 if (isScalarAfterVectorization(I, VF)) {
6427 // With the exception of GEPs and PHIs, after scalarization there should
6428 // only be one copy of the instruction generated in the loop. This is
6429 // because the VF is either 1, or any instructions that need scalarizing
6430 // have already been dealt with by the time we get here. As a result,
6431 // it means we don't have to multiply the instruction cost by VF.
6432 assert(I->getOpcode() == Instruction::GetElementPtr ||
6433 I->getOpcode() == Instruction::PHI ||
6434 (I->getOpcode() == Instruction::BitCast &&
6435 I->getType()->isPointerTy()) ||
6436 HasSingleCopyAfterVectorization(I, VF));
6437 VectorTy = RetTy;
6438 } else
6439 VectorTy = toVectorTy(RetTy, VF);
6440
6441 if (VF.isVector() && VectorTy->isVectorTy() &&
6442 !TTI.getNumberOfParts(VectorTy))
6443 return InstructionCost::getInvalid();
6444
6445 // TODO: We need to estimate the cost of intrinsic calls.
6446 switch (I->getOpcode()) {
6447 case Instruction::GetElementPtr:
6448 // We mark this instruction as zero-cost because the cost of GEPs in
6449 // vectorized code depends on whether the corresponding memory instruction
6450 // is scalarized or not. Therefore, we handle GEPs with the memory
6451 // instruction cost.
6452 return 0;
6453 case Instruction::Br: {
6454 // In cases of scalarized and predicated instructions, there will be VF
6455 // predicated blocks in the vectorized loop. Each branch around these
6456 // blocks also requires an extract of its vector compare i1 element.
6457 // Note that the conditional branch from the loop latch will be replaced by
6458 // a single branch controlling the loop, so there is no extra overhead from
6459 // scalarization.
6460 bool ScalarPredicatedBB = false;
6461 BranchInst *BI = cast<BranchInst>(I);
6462 if (VF.isVector() && BI->isConditional() &&
6463 (PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(0)) ||
6464 PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(1))) &&
6465 BI->getParent() != TheLoop->getLoopLatch())
6466 ScalarPredicatedBB = true;
6467
6468 if (ScalarPredicatedBB) {
6469 // Not possible to scalarize a scalable vector with predicated instructions.
6470 if (VF.isScalable())
6471 return InstructionCost::getInvalid();
6472 // Return cost for branches around scalarized and predicated blocks.
6473 auto *VecI1Ty =
6474 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
6475 return (
6476 TTI.getScalarizationOverhead(
6477 VecI1Ty, APInt::getAllOnes(VF.getFixedValue()),
6478 /*Insert*/ false, /*Extract*/ true, CostKind) +
6479 (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue()));
6480 }
6481
6482 if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
6483 // The back-edge branch will remain, as will all scalar branches.
6484 return TTI.getCFInstrCost(Instruction::Br, CostKind);
6485
6486 // This branch will be eliminated by if-conversion.
6487 return 0;
6488 // Note: We currently assume zero cost for an unconditional branch inside
6489 // a predicated block since it will become a fall-through, although we
6490 // may decide in the future to call TTI for all branches.
6491 }
6492 case Instruction::Switch: {
6493 if (VF.isScalar())
6494 return TTI.getCFInstrCost(Instruction::Switch, CostKind);
6495 auto *Switch = cast<SwitchInst>(I);
6496 return Switch->getNumCases() *
6497 TTI.getCmpSelInstrCost(
6498 Instruction::ICmp,
6499 toVectorTy(Switch->getCondition()->getType(), VF),
6500 toVectorTy(Type::getInt1Ty(I->getContext()), VF),
6501 CmpInst::ICMP_EQ, CostKind);
6502 }
6503 case Instruction::PHI: {
6504 auto *Phi = cast<PHINode>(I);
6505
6506 // First-order recurrences are replaced by vector shuffles inside the loop.
6507 if (VF.isVector() && Legal->isFixedOrderRecurrence(Phi)) {
6508 // For <vscale x 1 x i64>, if vscale = 1 we are unable to extract the
6509 // penultimate value of the recurrence.
6510 // TODO: Consider vscale_range info.
6511 if (VF.isScalable() && VF.getKnownMinValue() == 1)
6512 return InstructionCost::getInvalid();
6513 SmallVector<int> Mask(VF.getKnownMinValue());
6514 std::iota(Mask.begin(), Mask.end(), VF.getKnownMinValue() - 1);
6515 return TTI.getShuffleCost(TargetTransformInfo::SK_Splice,
6516 cast<VectorType>(VectorTy), Mask, CostKind,
6517 VF.getKnownMinValue() - 1);
6518 }
6519
6520 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
6521 // converted into select instructions. We require N - 1 selects per phi
6522 // node, where N is the number of incoming values.
6523 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) {
6524 Type *ResultTy = Phi->getType();
6525
6526 // All instructions in an Any-of reduction chain are narrowed to bool.
6527 // Check if that is the case for this phi node.
6528 auto *HeaderUser = cast_if_present<PHINode>(
6529 find_singleton<User>(Phi->users(), [this](User *U, bool) -> User * {
6530 auto *Phi = dyn_cast<PHINode>(U);
6531 if (Phi && Phi->getParent() == TheLoop->getHeader())
6532 return Phi;
6533 return nullptr;
6534 }));
6535 if (HeaderUser) {
6536 auto &ReductionVars = Legal->getReductionVars();
6537 auto Iter = ReductionVars.find(HeaderUser);
6538 if (Iter != ReductionVars.end() &&
6539 RecurrenceDescriptor::isAnyOfRecurrenceKind(
6540 Iter->second.getRecurrenceKind()))
6541 ResultTy = Type::getInt1Ty(Phi->getContext());
6542 }
6543 return (Phi->getNumIncomingValues() - 1) *
6544 TTI.getCmpSelInstrCost(
6545 Instruction::Select, toVectorTy(ResultTy, VF),
6546 toVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
6547 CmpInst::BAD_ICMP_PREDICATE, CostKind);
6548 }
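 // For example (illustrative): a phi in a non-header block with three
 // incoming values is converted into two vector selects, so it is charged
 // 2 * getCmpSelInstrCost(Instruction::Select, ...).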
6549
6550 // When tail folding with EVL, if the phi is part of an out of loop
6551 // reduction then it will be transformed into a wide vp_merge.
6552 if (VF.isVector() && foldTailWithEVL() &&
6553 Legal->getReductionVars().contains(Phi) && !isInLoopReduction(Phi)) {
6554 IntrinsicCostAttributes ICA(
6555 Intrinsic::vp_merge, toVectorTy(Phi->getType(), VF),
6556 {toVectorTy(Type::getInt1Ty(Phi->getContext()), VF)});
6557 return TTI.getIntrinsicInstrCost(ICA, CostKind);
6558 }
6559
6560 return TTI.getCFInstrCost(Instruction::PHI, CostKind);
6561 }
6562 case Instruction::UDiv:
6563 case Instruction::SDiv:
6564 case Instruction::URem:
6565 case Instruction::SRem:
6566 if (VF.isVector() && isPredicatedInst(I)) {
6567 const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
6568 return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost) ?
6569 ScalarCost : SafeDivisorCost;
6570 }
6571 // We've proven all lanes safe to speculate, fall through.
6572 [[fallthrough]];
6573 case Instruction::Add:
6574 case Instruction::Sub: {
6575 auto Info = Legal->getHistogramInfo(I);
6576 if (Info && VF.isVector()) {
6577 const HistogramInfo *HGram = Info.value();
6578 // Assume that a non-constant update value (or a constant != 1) requires
6579 // a multiply, and add that into the cost.
6580 InstructionCost MulCost = TTI::TCC_Free;
6581 ConstantInt *RHS = dyn_cast<ConstantInt>(I->getOperand(1));
6582 if (!RHS || RHS->getZExtValue() != 1)
6583 MulCost =
6584 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6585
6586 // Find the cost of the histogram operation itself.
6587 Type *PtrTy = VectorType::get(HGram->Load->getPointerOperandType(), VF);
6588 Type *ScalarTy = I->getType();
6589 Type *MaskTy = VectorType::get(Type::getInt1Ty(I->getContext()), VF);
6590 IntrinsicCostAttributes ICA(Intrinsic::experimental_vector_histogram_add,
6591 Type::getVoidTy(I->getContext()),
6592 {PtrTy, ScalarTy, MaskTy});
6593
6594 // Add the costs together with the add/sub operation.
6595 return TTI.getIntrinsicInstrCost(ICA, CostKind) + MulCost +
6596 TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, CostKind);
6597 }
6598 [[fallthrough]];
6599 }
6600 case Instruction::FAdd:
6601 case Instruction::FSub:
6602 case Instruction::Mul:
6603 case Instruction::FMul:
6604 case Instruction::FDiv:
6605 case Instruction::FRem:
6606 case Instruction::Shl:
6607 case Instruction::LShr:
6608 case Instruction::AShr:
6609 case Instruction::And:
6610 case Instruction::Or:
6611 case Instruction::Xor: {
6612 // If we're speculating on the stride being 1, the multiplication may
6613 // fold away. We can generalize this for all operations using the notion
6614 // of neutral elements. (TODO)
6615 if (I->getOpcode() == Instruction::Mul &&
6616 (PSE.getSCEV(I->getOperand(0))->isOne() ||
6617 PSE.getSCEV(I->getOperand(1))->isOne()))
6618 return 0;
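 // E.g. (illustrative): if the SCEV predicates establish that the stride %s
 // is 1, 'mul i64 %i, %s' simplifies to '%i', so charging it an arithmetic
 // cost would count work that never materializes in the vectorized loop.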
6619
6620 // Detect reduction patterns
6621 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy))
6622 return *RedCost;
6623
6624 // Certain instructions can be cheaper to vectorize if they have a constant
6625 // second vector operand. One example of this is shifts on x86.
6626 Value *Op2 = I->getOperand(1);
6627 if (!isa<Constant>(Op2) && PSE.getSE()->isSCEVable(Op2->getType()) &&
6628 isa<SCEVConstant>(PSE.getSCEV(Op2))) {
6629 Op2 = cast<SCEVConstant>(PSE.getSCEV(Op2))->getValue();
6630 }
6631 auto Op2Info = TTI.getOperandInfo(Op2);
6632 if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
6633 Legal->isInvariant(Op2))
6634 Op2Info.Kind = TargetTransformInfo::OK_UniformValue;
6635
6636 SmallVector<const Value *, 4> Operands(I->operand_values());
6637 return TTI.getArithmeticInstrCost(
6638 I->getOpcode(), VectorTy, CostKind,
6639 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
6640 Op2Info, Operands, I, TLI);
6641 }
6642 case Instruction::FNeg: {
6643 return TTI.getArithmeticInstrCost(
6644 I->getOpcode(), VectorTy, CostKind,
6645 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
6646 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
6647 I->getOperand(0), I);
6648 }
6649 case Instruction::Select: {
6650 SelectInst *SI = cast<SelectInst>(I);
6651 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
6652 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
6653
6654 const Value *Op0, *Op1;
6655 using namespace llvm::PatternMatch;
6656 if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) ||
6657 match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) {
6658 // select x, y, false --> x & y
6659 // select x, true, y --> x | y
6660 const auto [Op1VK, Op1VP] = TTI::getOperandInfo(Op0);
6661 const auto [Op2VK, Op2VP] = TTI::getOperandInfo(Op1);
6662 assert(Op0->getType()->getScalarSizeInBits() == 1 &&
6663 Op1->getType()->getScalarSizeInBits() == 1);
6664
6667 match(I, m_LogicalOr()) ? Instruction::Or : Instruction::And, VectorTy,
6668 CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, Operands, I);
6669 }
6670
6671 Type *CondTy = SI->getCondition()->getType();
6672 if (!ScalarCond)
6673 CondTy = VectorType::get(CondTy, VF);
6674
6676 if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition()))
6677 Pred = Cmp->getPredicate();
6678 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred,
6679 CostKind, {TTI::OK_AnyValue, TTI::OP_None},
6680 {TTI::OK_AnyValue, TTI::OP_None}, I);
6681 }
6682 case Instruction::ICmp:
6683 case Instruction::FCmp: {
6684 Type *ValTy = I->getOperand(0)->getType();
6685
6686 if (canTruncateToMinimalBitwidth(I, VF)) {
6687 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
6688 (void)Op0AsInstruction;
6689 assert((!canTruncateToMinimalBitwidth(Op0AsInstruction, VF) ||
6690 MinBWs[I] == MinBWs[Op0AsInstruction]) &&
6691 "if both the operand and the compare are marked for "
6692 "truncation, they must have the same bitwidth");
6693 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[I]);
6694 }
6695
6696 VectorTy = toVectorTy(ValTy, VF);
6697 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr,
6698 cast<CmpInst>(I)->getPredicate(), CostKind,
6699 {TTI::OK_AnyValue, TTI::OP_None},
6700 {TTI::OK_AnyValue, TTI::OP_None}, I);
6701 }
6702 case Instruction::Store:
6703 case Instruction::Load: {
6704 ElementCount Width = VF;
6705 if (Width.isVector()) {
6706 InstWidening Decision = getWideningDecision(I, Width);
6707 assert(Decision != CM_Unknown &&
6708 "CM decision should be taken at this point");
6709 if (getWideningCost(I, VF) == InstructionCost::getInvalid())
6710 return InstructionCost::getInvalid();
6711 if (Decision == CM_Scalarize)
6712 Width = ElementCount::getFixed(1);
6713 }
6714 VectorTy = toVectorTy(getLoadStoreType(I), Width);
6715 return getMemoryInstructionCost(I, VF);
6716 }
6717 case Instruction::BitCast:
6718 if (I->getType()->isPointerTy())
6719 return 0;
6720 [[fallthrough]];
6721 case Instruction::ZExt:
6722 case Instruction::SExt:
6723 case Instruction::FPToUI:
6724 case Instruction::FPToSI:
6725 case Instruction::FPExt:
6726 case Instruction::PtrToInt:
6727 case Instruction::IntToPtr:
6728 case Instruction::SIToFP:
6729 case Instruction::UIToFP:
6730 case Instruction::Trunc:
6731 case Instruction::FPTrunc: {
6732 // Computes the CastContextHint from a Load/Store instruction.
6733 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
6734 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
6735 "Expected a load or a store!");
6736
6737 if (VF.isScalar() || !TheLoop->contains(I))
6738 return TTI::CastContextHint::Normal;
6739
6740 switch (getWideningDecision(I, VF)) {
6741 case CM_GatherScatter:
6742 return TTI::CastContextHint::GatherScatter;
6743 case CM_Interleave:
6744 return TTI::CastContextHint::Interleave;
6745 case CM_Scalarize:
6746 case CM_Widen:
6747 return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked
6748 : TTI::CastContextHint::Normal;
6749 case CM_Widen_Reverse:
6750 return TTI::CastContextHint::Reversed;
6751 case CM_Unknown:
6752 llvm_unreachable("Instr did not go through cost modelling?");
6753 case CM_VectorCall:
6754 case CM_IntrinsicCall:
6755 llvm_unreachable_internal("Instr has invalid widening decision");
6756 }
6757
6758 llvm_unreachable("Unhandled case!");
6759 };
6760
6761 unsigned Opcode = I->getOpcode();
6762 TTI::CastContextHint CCH = TTI::CastContextHint::None;
6763 // For Trunc, the context is the only user, which must be a StoreInst.
6764 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
6765 if (I->hasOneUse())
6766 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin()))
6767 CCH = ComputeCCH(Store);
6768 }
6769 // For Z/Sext, the context is the operand, which must be a LoadInst.
6770 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
6771 Opcode == Instruction::FPExt) {
6772 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0)))
6773 CCH = ComputeCCH(Load);
6774 }
6775
6776 // We optimize the truncation of induction variables having constant
6777 // integer steps. The cost of these truncations is the same as the scalar
6778 // operation.
6779 if (isOptimizableIVTruncate(I, VF)) {
6780 auto *Trunc = cast<TruncInst>(I);
6781 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
6782 Trunc->getSrcTy(), CCH, CostKind, Trunc);
6783 }
6784
6785 // Detect reduction patterns
6786 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy))
6787 return *RedCost;
6788
6789 Type *SrcScalarTy = I->getOperand(0)->getType();
6790 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
6791 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
6792 SrcScalarTy =
6793 IntegerType::get(SrcScalarTy->getContext(), MinBWs[Op0AsInstruction]);
6794 Type *SrcVecTy =
6795 VectorTy->isVectorTy() ? toVectorTy(SrcScalarTy, VF) : SrcScalarTy;
6796
6797 if (canTruncateToMinimalBitwidth(I, VF)) {
6798 // If the result type is <= the source type, there will be no extend
6799 // after truncating the users to the minimal required bitwidth.
6800 if (VectorTy->getScalarSizeInBits() <= SrcVecTy->getScalarSizeInBits() &&
6801 (I->getOpcode() == Instruction::ZExt ||
6802 I->getOpcode() == Instruction::SExt))
6803 return 0;
6804 }
6805
6806 return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
6807 }
6808 case Instruction::Call:
6809 return getVectorCallCost(cast<CallInst>(I), VF);
6810 case Instruction::ExtractValue:
6812 case Instruction::Alloca:
6813 // We cannot easily widen alloca to a scalable alloca, as
6814 // the result would need to be a vector of pointers.
6815 if (VF.isScalable())
6816 return InstructionCost::getInvalid();
6817 [[fallthrough]];
6818 default:
6819 // This opcode is unknown. Assume that it is the same as 'mul'.
6820 return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6821 } // end of switch.
6822}
6823
6824void LoopVectorizationCostModel::collectValuesToIgnore() {
6825 // Ignore ephemeral values.
6826 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
6827
6828 SmallVector<Value *, 4> DeadInterleavePointerOps;
6829 SmallVector<Value *, 4> DeadOps;
6830
6831 // If a scalar epilogue is required, users outside the loop won't use
6832 // live-outs from the vector loop but from the scalar epilogue. Ignore them if
6833 // that is the case.
6834 bool RequiresScalarEpilogue = requiresScalarEpilogue(true);
6835 auto IsLiveOutDead = [this, RequiresScalarEpilogue](User *U) {
6836 return RequiresScalarEpilogue &&
6837 !TheLoop->contains(cast<Instruction>(U)->getParent());
6838 };
6839
6840 LoopBlocksDFS DFS(TheLoop);
6841 DFS.perform(LI);
6842 MapVector<Value *, SmallVector<Value *>> DeadInvariantStoreOps;
6843 for (BasicBlock *BB : reverse(make_range(DFS.beginRPO(), DFS.endRPO())))
6844 for (Instruction &I : reverse(*BB)) {
6845 // Find all stores to invariant variables. Since they are going to sink
6846 // outside the loop we do not need to calculate their cost.
6847 StoreInst *SI;
6848 if ((SI = dyn_cast<StoreInst>(&I)) &&
6849 Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) {
6850 ValuesToIgnore.insert(&I);
6851 DeadInvariantStoreOps[SI->getPointerOperand()].push_back(
6852 SI->getValueOperand());
6853 }
6854
6855 if (VecValuesToIgnore.contains(&I) || ValuesToIgnore.contains(&I))
6856 continue;
6857
6858 // Add instructions that would be trivially dead and are only used by
6859 // values already ignored to DeadOps to seed worklist.
6860 if (wouldInstructionBeTriviallyDead(&I, TLI) &&
6861 all_of(I.users(), [this, IsLiveOutDead](User *U) {
6862 return VecValuesToIgnore.contains(U) ||
6863 ValuesToIgnore.contains(U) || IsLiveOutDead(U);
6864 }))
6865 DeadOps.push_back(&I);
6866
6867 // For interleave groups, we only create a pointer for the start of the
6868 // interleave group. Queue up addresses of group members except the insert
6869 // position for further processing.
6870 if (isAccessInterleaved(&I)) {
6871 auto *Group = getInterleavedAccessGroup(&I);
6872 if (Group->getInsertPos() == &I)
6873 continue;
6874 Value *PointerOp = getLoadStorePointerOperand(&I);
6875 DeadInterleavePointerOps.push_back(PointerOp);
6876 }
6877
6878 // Queue branches for analysis. They are dead, if their successors only
6879 // contain dead instructions.
6880 if (auto *Br = dyn_cast<BranchInst>(&I)) {
6881 if (Br->isConditional())
6882 DeadOps.push_back(&I);
6883 }
6884 }
6885
6886 // Mark ops feeding interleave group members as free, if they are only used
6887 // by other dead computations.
6888 for (unsigned I = 0; I != DeadInterleavePointerOps.size(); ++I) {
6889 auto *Op = dyn_cast<Instruction>(DeadInterleavePointerOps[I]);
6890 if (!Op || !TheLoop->contains(Op) || any_of(Op->users(), [this](User *U) {
6891 Instruction *UI = cast<Instruction>(U);
6892 return !VecValuesToIgnore.contains(U) &&
6893 (!isAccessInterleaved(UI) ||
6894 getInterleavedAccessGroup(UI)->getInsertPos() == UI);
6895 }))
6896 continue;
6897 VecValuesToIgnore.insert(Op);
6898 DeadInterleavePointerOps.append(Op->op_begin(), Op->op_end());
6899 }
6900
6901 for (const auto &[_, Ops] : DeadInvariantStoreOps) {
6902 for (Value *Op : ArrayRef(Ops).drop_back())
6903 DeadOps.push_back(Op);
6904 }
6905 // Mark ops that would be trivially dead and are only used by ignored
6906 // instructions as free.
6907 BasicBlock *Header = TheLoop->getHeader();
6908
6909 // Returns true if the block contains only dead instructions. Such blocks will
6910 // be removed by VPlan-to-VPlan transforms and won't be considered by the
6911 // VPlan-based cost model, so skip them in the legacy cost-model as well.
6912 auto IsEmptyBlock = [this](BasicBlock *BB) {
6913 return all_of(*BB, [this](Instruction &I) {
6914 return ValuesToIgnore.contains(&I) || VecValuesToIgnore.contains(&I) ||
6915 (isa<BranchInst>(&I) && !cast<BranchInst>(&I)->isConditional());
6916 });
6917 };
6918 for (unsigned I = 0; I != DeadOps.size(); ++I) {
6919 auto *Op = dyn_cast<Instruction>(DeadOps[I]);
6920
6921 // Check if the branch should be considered dead.
6922 if (auto *Br = dyn_cast_or_null<BranchInst>(Op)) {
6923 BasicBlock *ThenBB = Br->getSuccessor(0);
6924 BasicBlock *ElseBB = Br->getSuccessor(1);
6925 // Don't consider branches leaving the loop for simplification.
6926 if (!TheLoop->contains(ThenBB) || !TheLoop->contains(ElseBB))
6927 continue;
6928 bool ThenEmpty = IsEmptyBlock(ThenBB);
6929 bool ElseEmpty = IsEmptyBlock(ElseBB);
6930 if ((ThenEmpty && ElseEmpty) ||
6931 (ThenEmpty && ThenBB->getSingleSuccessor() == ElseBB &&
6932 ElseBB->phis().empty()) ||
6933 (ElseEmpty && ElseBB->getSingleSuccessor() == ThenBB &&
6934 ThenBB->phis().empty())) {
6935 VecValuesToIgnore.insert(Br);
6936 DeadOps.push_back(Br->getCondition());
6937 }
6938 continue;
6939 }
6940
6941 // Skip any op that shouldn't be considered dead.
6942 if (!Op || !TheLoop->contains(Op) ||
6943 (isa<PHINode>(Op) && Op->getParent() == Header) ||
6944 !wouldInstructionBeTriviallyDead(Op, TLI) ||
6945 any_of(Op->users(), [this, IsLiveOutDead](User *U) {
6946 return !VecValuesToIgnore.contains(U) &&
6947 !ValuesToIgnore.contains(U) && !IsLiveOutDead(U);
6948 }))
6949 continue;
6950
6951 if (!TheLoop->contains(Op->getParent()))
6952 continue;
6953
6954 // If all of Op's users are in ValuesToIgnore, add it to ValuesToIgnore
6955 // which applies for both scalar and vector versions. Otherwise it is only
6956 // dead in vector versions, so only add it to VecValuesToIgnore.
6957 if (all_of(Op->users(),
6958 [this](User *U) { return ValuesToIgnore.contains(U); }))
6959 ValuesToIgnore.insert(Op);
6960
6961 VecValuesToIgnore.insert(Op);
6962 DeadOps.append(Op->op_begin(), Op->op_end());
6963 }
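 // Illustrative example: for 'if (%c) { ... }' where every instruction in the
 // conditional block is already in the ignore sets, the branch is added to
 // VecValuesToIgnore and '%c' is queued as a potentially dead op, so neither
 // needs to be costed for the vector loop.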
6964
6965 // Ignore type-promoting instructions we identified during reduction
6966 // detection.
6967 for (const auto &Reduction : Legal->getReductionVars()) {
6968 const RecurrenceDescriptor &RedDes = Reduction.second;
6969 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
6970 VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6971 }
6972 // Ignore type-casting instructions we identified during induction
6973 // detection.
6974 for (const auto &Induction : Legal->getInductionVars()) {
6975 const InductionDescriptor &IndDes = Induction.second;
6976 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
6977 VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6978 }
6979}
6980
6980
6981void LoopVectorizationCostModel::collectInLoopReductions() {
6982 for (const auto &Reduction : Legal->getReductionVars()) {
6983 PHINode *Phi = Reduction.first;
6984 const RecurrenceDescriptor &RdxDesc = Reduction.second;
6985
6986 // We don't collect reductions that are type promoted (yet).
6987 if (RdxDesc.getRecurrenceType() != Phi->getType())
6988 continue;
6989
6990 // If the target would prefer this reduction to happen "in-loop", then we
6991 // want to record it as such.
6992 unsigned Opcode = RdxDesc.getOpcode();
6993 if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) &&
6994 !TTI.preferInLoopReduction(Opcode, Phi->getType(),
6995 TargetTransformInfo::ReductionFlags()))
6996 continue;
6997
6998 // Check that we can correctly put the reductions into the loop, by
6999 // finding the chain of operations that leads from the phi to the loop
7000 // exit value.
7001 SmallVector<Instruction *, 4> ReductionOperations =
7002 RdxDesc.getReductionOpChain(Phi, TheLoop);
7003 bool InLoop = !ReductionOperations.empty();
7004
7005 if (InLoop) {
7006 InLoopReductions.insert(Phi);
7007 // Add the elements to InLoopReductionImmediateChains for cost modelling.
7008 Instruction *LastChain = Phi;
7009 for (auto *I : ReductionOperations) {
7010 InLoopReductionImmediateChains[I] = LastChain;
7011 LastChain = I;
7012 }
7013 }
7014 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
7015 << " reduction for phi: " << *Phi << "\n");
7016 }
7017}
7018
7019// This function will select a scalable VF if the target supports scalable
7020// vectors and a fixed one otherwise.
7021// TODO: we could return a pair of values that specify the max VF and
7022// min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
7023// `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment
7024// doesn't have a cost model that can choose which plan to execute if
7025// more than one is generated.
7026static ElementCount determineVPlanVF(const TargetTransformInfo &TTI,
7027 LoopVectorizationCostModel &CM) {
7028 unsigned WidestType;
7029 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
7030
7030 TargetTransformInfo::RegisterKind RegKind =
7031 TTI.enableScalableVectorization()
7032 ? TargetTransformInfo::RGK_ScalableVector
7033 : TargetTransformInfo::RGK_FixedWidthVector;
7034 TypeSize RegSize = TTI.getRegisterBitWidth(RegKind);
7035
7037 unsigned N = RegSize.getKnownMinValue() / WidestType;
7038 return ElementCount::get(N, RegSize.isScalable());
7039}
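// Worked example for determineVPlanVF above (illustrative): on a target with
// 128-bit fixed-width vector registers and a widest loop type of 32 bits,
// N = 128 / 32 = 4 and the function returns a fixed VF of 4.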
7040
7041VectorizationFactor
7042LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
7043 ElementCount VF = UserVF;
7044 // Outer loop handling: They may require CFG and instruction level
7045 // transformations before even evaluating whether vectorization is profitable.
7046 // Since we cannot modify the incoming IR, we need to build VPlan upfront in
7047 // the vectorization pipeline.
7048 if (!OrigLoop->isInnermost()) {
7049 // If the user doesn't provide a vectorization factor, determine a
7050 // reasonable one.
7051 if (UserVF.isZero()) {
7052 VF = determineVPlanVF(TTI, CM);
7053 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
7054
7055 // Make sure we have a VF > 1 for stress testing.
7056 if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
7057 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
7058 << "overriding computed VF.\n");
7059 VF = ElementCount::getFixed(4);
7060 }
7061 } else if (UserVF.isScalable() && !TTI.supportsScalableVectors() &&
7063 LLVM_DEBUG(dbgs() << "LV: Not vectorizing. Scalable VF requested, but "
7064 << "not supported by the target.\n");
7066 "Scalable vectorization requested but not supported by the target",
7067 "the scalable user-specified vectorization width for outer-loop "
7068 "vectorization cannot be used because the target does not support "
7069 "scalable vectors.",
7070 "ScalableVFUnfeasible", ORE, OrigLoop);
7071 return VectorizationFactor::Disabled();
7072 }
7073 assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7075 "VF needs to be a power of two");
7076 LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
7077 << "VF " << VF << " to build VPlans.\n");
7078 buildVPlans(VF, VF);
7079
7080 // For VPlan build stress testing, we bail out after VPlan construction.
7081 if (VPlanBuildStressTest)
7082 return VectorizationFactor::Disabled();
7083
7084 return {VF, 0 /*Cost*/, 0 /* ScalarCost */};
7085 }
7086
7087 LLVM_DEBUG(
7088 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
7089 "VPlan-native path.\n");
7090 return VectorizationFactor::Disabled();
7091}
7092
7093void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
7094 assert(OrigLoop->isInnermost() && "Inner loop expected.");
7095 CM.collectValuesToIgnore();
7096 CM.collectElementTypesForWidening();
7097
7098 FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
7099 if (!MaxFactors) // Cases that should not be vectorized nor interleaved.
7100 return;
7101
7102 // Invalidate interleave groups if all blocks of loop will be predicated.
7103 if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) &&
7104 !useMaskedInterleavedAccesses(TTI)) {
7105 LLVM_DEBUG(
7106 dbgs()
7107 << "LV: Invalidate all interleaved groups due to fold-tail by masking "
7108 "which requires masked-interleaved support.\n");
7109 CM.InterleaveInfo.invalidateGroups();
7110 // Invalidating interleave groups also requires invalidating all decisions
7111 // based on them, which includes widening decisions and uniform and scalar
7112 // values.
7113 CM.invalidateCostModelingDecisions();
7114 }
7115
7116 if (CM.foldTailByMasking())
7117 Legal->prepareToFoldTailByMasking();
7118
7119 ElementCount MaxUserVF =
7120 UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
7121 if (UserVF) {
7122 if (!ElementCount::isKnownLE(UserVF, MaxUserVF)) {
7124 "UserVF ignored because it may be larger than the maximal safe VF",
7125 "InvalidUserVF", ORE, OrigLoop);
7126 } else {
7128 "VF needs to be a power of two");
7129 // Collect the instructions (and their associated costs) that will be more
7130 // profitable to scalarize.
7131 CM.collectInLoopReductions();
7132 if (CM.selectUserVectorizationFactor(UserVF)) {
7133 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
7134 buildVPlansWithVPRecipes(UserVF, UserVF);
7135 LLVM_DEBUG(printPlans(dbgs()));
7136 return;
7137 }
7138 reportVectorizationInfo("UserVF ignored because of invalid costs.",
7139 "InvalidCost", ORE, OrigLoop);
7140 }
7141 }
7142
7143 // Collect the Vectorization Factor Candidates.
7144 SmallVector<ElementCount> VFCandidates;
7145 for (auto VF = ElementCount::getFixed(1);
7146 ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2)
7147 VFCandidates.push_back(VF);
7148 for (auto VF = ElementCount::getScalable(1);
7149 ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2)
7150 VFCandidates.push_back(VF);
7151
7152 CM.collectInLoopReductions();
7153 for (const auto &VF : VFCandidates) {
7154 // Collect Uniform and Scalar instructions after vectorization with VF.
7155 CM.collectUniformsAndScalars(VF);
7156
7157 // Collect the instructions (and their associated costs) that will be more
7158 // profitable to scalarize.
7159 if (VF.isVector())
7160 CM.collectInstsToScalarize(VF);
7161 }
7162
7163 buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF);
7164 buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF);
7165
7166 LLVM_DEBUG(printPlans(dbgs()));
7167}
7168
7169InstructionCost VPCostContext::getLegacyCost(Instruction *UI,
7170 ElementCount VF) const {
7171 if (ForceTargetInstructionCost.getNumOccurrences())
7172 return InstructionCost(ForceTargetInstructionCost.getNumOccurrences());
7173 return CM.getInstructionCost(UI, VF);
7174}
7175
7176bool VPCostContext::skipCostComputation(Instruction *UI, bool IsVector) const {
7177 return CM.ValuesToIgnore.contains(UI) ||
7178 (IsVector && CM.VecValuesToIgnore.contains(UI)) ||
7179 SkipCostComputation.contains(UI);
7180}
7181
7182InstructionCost
7183LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
7184 VPCostContext &CostCtx) const {
7185 InstructionCost Cost;
7186 // Cost modeling for inductions is inaccurate in the legacy cost model
7187 // compared to the recipes that are generated. To match here initially during
7188 // VPlan cost model bring up directly use the induction costs from the legacy
7189 // cost model. Note that we do this as pre-processing; the VPlan may not have
7190 // any recipes associated with the original induction increment instruction
7191 // and may replace truncates with VPWidenIntOrFpInductionRecipe. We precompute
7192 // the cost of induction phis and increments (both that are represented by
7193 // recipes and those that are not), to avoid distinguishing between them here,
7194 // and skip all recipes that represent induction phis and increments (the
7195 // former case) later on, if they exist, to avoid counting them twice.
7196 // Similarly we pre-compute the cost of any optimized truncates.
7197 // TODO: Switch to more accurate costing based on VPlan.
7198 for (const auto &[IV, IndDesc] : Legal->getInductionVars()) {
7199 Instruction *IVInc = cast<Instruction>(
7200 IV->getIncomingValueForBlock(OrigLoop->getLoopLatch()));
7201 SmallVector<Instruction *> IVInsts = {IVInc};
7202 for (unsigned I = 0; I != IVInsts.size(); I++) {
7203 for (Value *Op : IVInsts[I]->operands()) {
7204 auto *OpI = dyn_cast<Instruction>(Op);
7205 if (Op == IV || !OpI || !OrigLoop->contains(OpI) || !Op->hasOneUse())
7206 continue;
7207 IVInsts.push_back(OpI);
7208 }
7209 }
7210 IVInsts.push_back(IV);
7211 for (User *U : IV->users()) {
7212 auto *CI = cast<Instruction>(U);
7213 if (!CostCtx.CM.isOptimizableIVTruncate(CI, VF))
7214 continue;
7215 IVInsts.push_back(CI);
7216 }
7217
7218 // If the vector loop gets executed exactly once with the given VF, ignore
7219 // the costs of comparison and induction instructions, as they'll get
7220 // simplified away.
7221 // TODO: Remove this code after stepping away from the legacy cost model and
7222 // adding code to simplify VPlans before calculating their costs.
7223 auto TC = PSE.getSE()->getSmallConstantTripCount(OrigLoop);
7224 if (VF.isFixed() && TC == VF.getFixedValue() && !CM.foldTailByMasking())
7225 addFullyUnrolledInstructionsToIgnore(OrigLoop, Legal->getInductionVars(),
7226 CostCtx.SkipCostComputation);
7227
7228 for (Instruction *IVInst : IVInsts) {
7229 if (CostCtx.skipCostComputation(IVInst, VF.isVector()))
7230 continue;
7231 InstructionCost InductionCost = CostCtx.getLegacyCost(IVInst, VF);
7232 LLVM_DEBUG({
7233 dbgs() << "Cost of " << InductionCost << " for VF " << VF
7234 << ": induction instruction " << *IVInst << "\n";
7235 });
7236 Cost += InductionCost;
7237 CostCtx.SkipCostComputation.insert(IVInst);
7238 }
7239 }
7240
7241 // Compute the cost of all exiting conditions of the loop using the legacy
7242 // cost model. This is to match the legacy behavior, which adds the cost of
7243 // all exit conditions. Note that this over-estimates the cost, as there will
7244 // be a single condition to control the vector loop.
7245 SmallVector<BasicBlock *> Exiting;
7246 CM.TheLoop->getExitingBlocks(Exiting);
7247 SetVector<Instruction *> ExitInstrs;
7248 // Collect all exit conditions.
7249 for (BasicBlock *EB : Exiting) {
7250 auto *Term = dyn_cast<BranchInst>(EB->getTerminator());
7251 if (!Term)
7252 continue;
7253 if (auto *CondI = dyn_cast<Instruction>(Term->getOperand(0))) {
7254 ExitInstrs.insert(CondI);
7255 }
7256 }
7257 // Compute the cost of all instructions only feeding the exit conditions.
7258 for (unsigned I = 0; I != ExitInstrs.size(); ++I) {
7259 Instruction *CondI = ExitInstrs[I];
7260 if (!OrigLoop->contains(CondI) ||
7261 !CostCtx.SkipCostComputation.insert(CondI).second)
7262 continue;
7263 InstructionCost CondICost = CostCtx.getLegacyCost(CondI, VF);
7264 LLVM_DEBUG({
7265 dbgs() << "Cost of " << CondICost << " for VF " << VF
7266 << ": exit condition instruction " << *CondI << "\n";
7267 });
7268 Cost += CondICost;
7269 for (Value *Op : CondI->operands()) {
7270 auto *OpI = dyn_cast<Instruction>(Op);
7271 if (!OpI || any_of(OpI->users(), [&ExitInstrs, this](User *U) {
7272 return OrigLoop->contains(cast<Instruction>(U)->getParent()) &&
7273 !ExitInstrs.contains(cast<Instruction>(U));
7274 }))
7275 continue;
7276 ExitInstrs.insert(OpI);
7277 }
7278 }
7279
7280 // The legacy cost model has special logic to compute the cost of in-loop
7281 // reductions, which may be smaller than the sum of all instructions involved
7282 // in the reduction.
7283 // TODO: Switch to costing based on VPlan once the logic has been ported.
7284 for (const auto &[RedPhi, RdxDesc] : Legal->getReductionVars()) {
7285 if (ForceTargetInstructionCost.getNumOccurrences())
7286 continue;
7287
7288 if (!CM.isInLoopReduction(RedPhi))
7289 continue;
7290
7291 const auto &ChainOps = RdxDesc.getReductionOpChain(RedPhi, OrigLoop);
7292 SetVector<Instruction *> ChainOpsAndOperands(ChainOps.begin(),
7293 ChainOps.end());
7294 auto IsZExtOrSExt = [](const unsigned Opcode) -> bool {
7295 return Opcode == Instruction::ZExt || Opcode == Instruction::SExt;
7296 };
7297 // Also include the operands of instructions in the chain, as the cost-model
7298 // may mark extends as free.
7299 //
7300 // For ARM, some of the instructions can be folded into the reduction
7301 // instruction. So we need to mark all folded instructions free.
7302 // For example: We can fold reduce(mul(ext(A), ext(B))) into one
7303 // instruction.
7304 for (auto *ChainOp : ChainOps) {
7305 for (Value *Op : ChainOp->operands()) {
7306 if (auto *I = dyn_cast<Instruction>(Op)) {
7307 ChainOpsAndOperands.insert(I);
7308 if (I->getOpcode() == Instruction::Mul) {
7309 auto *Ext0 = dyn_cast<Instruction>(I->getOperand(0));
7310 auto *Ext1 = dyn_cast<Instruction>(I->getOperand(1));
7311 if (Ext0 && IsZExtOrSExt(Ext0->getOpcode()) && Ext1 &&
7312 Ext0->getOpcode() == Ext1->getOpcode()) {
7313 ChainOpsAndOperands.insert(Ext0);
7314 ChainOpsAndOperands.insert(Ext1);
7315 }
7316 }
7317 }
7318 }
7319 }
7320
7321 // Pre-compute the cost for I, if it has a reduction pattern cost.
7322 for (Instruction *I : ChainOpsAndOperands) {
7323 auto ReductionCost =
7324 CM.getReductionPatternCost(I, VF, toVectorTy(I->getType(), VF));
7325 if (!ReductionCost)
7326 continue;
7327
7328 assert(!CostCtx.SkipCostComputation.contains(I) &&
7329 "reduction op visited multiple times");
7330 CostCtx.SkipCostComputation.insert(I);
7331 LLVM_DEBUG(dbgs() << "Cost of " << ReductionCost << " for VF " << VF
7332 << ":\n in-loop reduction " << *I << "\n");
7333 Cost += *ReductionCost;
7334 }
7335 }
7336
7337 // Pre-compute the costs for branches except for the backedge, as the number
7338 // of replicate regions in a VPlan may not directly match the number of
7339 // branches, which would lead to different decisions.
7340 // TODO: Compute cost of branches for each replicate region in the VPlan,
7341 // which is more accurate than the legacy cost model.
7342 for (BasicBlock *BB : OrigLoop->blocks()) {
7343 if (CostCtx.skipCostComputation(BB->getTerminator(), VF.isVector()))
7344 continue;
7345 CostCtx.SkipCostComputation.insert(BB->getTerminator());
7346 if (BB == OrigLoop->getLoopLatch())
7347 continue;
7348 auto BranchCost = CostCtx.getLegacyCost(BB->getTerminator(), VF);
7349 Cost += BranchCost;
7350 }
7351
7352 // Pre-compute costs for instructions that are forced-scalar or profitable to
7353 // scalarize. Their costs will be computed separately in the legacy cost
7354 // model.
7355 for (Instruction *ForcedScalar : CM.ForcedScalars[VF]) {
7356 if (CostCtx.skipCostComputation(ForcedScalar, VF.isVector()))
7357 continue;
7358 CostCtx.SkipCostComputation.insert(ForcedScalar);
7359 InstructionCost ForcedCost = CostCtx.getLegacyCost(ForcedScalar, VF);
7360 LLVM_DEBUG({
7361 dbgs() << "Cost of " << ForcedCost << " for VF " << VF
7362 << ": forced scalar " << *ForcedScalar << "\n";
7363 });
7364 Cost += ForcedCost;
7365 }
7366 for (const auto &[Scalarized, ScalarCost] : CM.InstsToScalarize[VF]) {
7367 if (CostCtx.skipCostComputation(Scalarized, VF.isVector()))
7368 continue;
7369 CostCtx.SkipCostComputation.insert(Scalarized);
7370 LLVM_DEBUG({
7371 dbgs() << "Cost of " << ScalarCost << " for VF " << VF
7372 << ": profitable to scalarize " << *Scalarized << "\n";
7373 });
7374 Cost += ScalarCost;
7375 }
7376
7377 return Cost;
7378}
7379
7380InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
7381 ElementCount VF) const {
7382 VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM,
7383 CM.CostKind);
7384 InstructionCost Cost = precomputeCosts(Plan, VF, CostCtx);
7385
7386 // Now compute and add the VPlan-based cost.
7387 Cost += Plan.cost(VF, CostCtx);
7388#ifndef NDEBUG
7389 unsigned EstimatedWidth = getEstimatedRuntimeVF(OrigLoop, CM.TTI, VF);
7390 LLVM_DEBUG(dbgs() << "Cost for VF " << VF << ": " << Cost
7391 << " (Estimated cost per lane: ");
7392 if (Cost.isValid()) {
7393 double CostPerLane = double(*Cost.getValue()) / EstimatedWidth;
7394 LLVM_DEBUG(dbgs() << format("%.1f", CostPerLane));
7395 } else /* No point dividing an invalid cost - it will still be invalid */
7396 LLVM_DEBUG(dbgs() << "Invalid");
7397 LLVM_DEBUG(dbgs() << ")\n");
7398#endif
7399 return Cost;
7400}
7401
7402#ifndef NDEBUG
7403/// Return true if the original loop \p TheLoop contains any instructions that do
7404/// not have corresponding recipes in \p Plan and are not marked to be ignored
7405/// in \p CostCtx. This means the VPlan contains simplifications that the legacy
7406/// cost-model did not account for.
7407static bool planContainsAdditionalSimplifications(VPlan &Plan,
7408 VPCostContext &CostCtx,
7409 Loop *TheLoop) {
7410 // First collect all instructions for the recipes in Plan.
7411 auto GetInstructionForCost = [](const VPRecipeBase *R) -> Instruction * {
7412 if (auto *S = dyn_cast<VPSingleDefRecipe>(R))
7413 return dyn_cast_or_null<Instruction>(S->getUnderlyingValue());
7414 if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(R))
7415 return &WidenMem->getIngredient();
7416 return nullptr;
7417 };
7418
7419 DenseSet<Instruction *> SeenInstrs;
7420 auto Iter = vp_depth_first_deep(Plan.getVectorLoopRegion()->getEntry());
7421 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
7422 for (VPRecipeBase &R : *VPBB) {
7423 if (auto *IR = dyn_cast<VPInterleaveRecipe>(&R)) {
7424 auto *IG = IR->getInterleaveGroup();
7425 unsigned NumMembers = IG->getNumMembers();
7426 for (unsigned I = 0; I != NumMembers; ++I) {
7427 if (Instruction *M = IG->getMember(I))
7428 SeenInstrs.insert(M);
7429 }
7430 continue;
7431 }
7432 // The VPlan-based cost model is more accurate for partial reduction and
7433 // comparing against the legacy cost isn't desirable.
7434 if (isa<VPPartialReductionRecipe>(&R))
7435 return true;
7436 if (Instruction *UI = GetInstructionForCost(&R))
7437 SeenInstrs.insert(UI);
7438 }
7439 }
7440
7441 // Return true if the loop contains any instructions that are not also part of
7442 // the VPlan or are skipped for VPlan-based cost computations. This indicates
7443 // that the VPlan contains extra simplifications.
7444 return any_of(TheLoop->blocks(), [&SeenInstrs, &CostCtx,
7445 TheLoop](BasicBlock *BB) {
7446 return any_of(*BB, [&SeenInstrs, &CostCtx, TheLoop, BB](Instruction &I) {
7447 if (isa<PHINode>(&I) && BB == TheLoop->getHeader())
7448 return false;
7449 return !SeenInstrs.contains(&I) && !CostCtx.skipCostComputation(&I, true);
7450 });
7451 });
7452}
7453#endif
7454
7455VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
7456 if (VPlans.empty())
7457 return VectorizationFactor::Disabled();
7458 // If there is a single VPlan with a single VF, return it directly.
7459 VPlan &FirstPlan = *VPlans[0];
7460 if (VPlans.size() == 1 && size(FirstPlan.vectorFactors()) == 1)
7461 return {*FirstPlan.vectorFactors().begin(), 0, 0};
7462
7463 LLVM_DEBUG(dbgs() << "LV: Computing best VF using cost kind: "
7465 ? "Reciprocal Throughput\n"
7467 ? "Instruction Latency\n"
7468 : CM.CostKind == TTI::TCK_CodeSize ? "Code Size\n"
7470 ? "Code Size and Latency\n"
7471 : "Unknown\n"));
7472
7473 ElementCount ScalarVF = ElementCount::getFixed(1);
7474 assert(hasPlanWithVF(ScalarVF) &&
7475 "More than a single plan/VF w/o any plan having scalar VF");
7476
7477 // TODO: Compute scalar cost using VPlan-based cost model.
7478 InstructionCost ScalarCost = CM.expectedCost(ScalarVF);
7479 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ScalarCost << ".\n");
7480 VectorizationFactor ScalarFactor(ScalarVF, ScalarCost, ScalarCost);
7481 VectorizationFactor BestFactor = ScalarFactor;
7482
7483 bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
7484 if (ForceVectorization) {
7485 // Ignore scalar width, because the user explicitly wants vectorization.
7486 // Initialize cost to max so that VF = 2 is, at least, chosen during cost
7487 // evaluation.
7488 BestFactor.Cost = InstructionCost::getMax();
7489 }
7490
7491 for (auto &P : VPlans) {
7492 for (ElementCount VF : P->vectorFactors()) {
7493 if (VF.isScalar())
7494 continue;
7495 if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) {
7496 LLVM_DEBUG(
7497 dbgs()
7498 << "LV: Not considering vector loop of width " << VF
7499 << " because it will not generate any vector instructions.\n");
7500 continue;
7501 }
7502
7503 InstructionCost Cost = cost(*P, VF);
7504 VectorizationFactor CurrentFactor(VF, Cost, ScalarCost);
7505 if (isMoreProfitable(CurrentFactor, BestFactor))
7506 BestFactor = CurrentFactor;
7507
7508 // If profitable, add it to the ProfitableVFs list.
7509 if (isMoreProfitable(CurrentFactor, ScalarFactor))
7510 ProfitableVFs.push_back(CurrentFactor);
7511 }
7512 }
7513
7514#ifndef NDEBUG
7515 // Select the optimal vectorization factor according to the legacy cost-model.
7516 // This is now only used to verify the decisions by the new VPlan-based
7517 // cost-model and will be retired once the VPlan-based cost-model is
7518 // stabilized.
7519 VectorizationFactor LegacyVF = selectVectorizationFactor();
7520 VPlan &BestPlan = getPlanFor(BestFactor.Width);
7521
7522 // Pre-compute the cost and use it to check if BestPlan contains any
7523 // simplifications not accounted for in the legacy cost model. If that's the
7524 // case, don't trigger the assertion, as the extra simplifications may cause a
7525 // different VF to be picked by the VPlan-based cost model.
7526 VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM,
7527 CM.CostKind);
7528 precomputeCosts(BestPlan, BestFactor.Width, CostCtx);
7529 assert((BestFactor.Width == LegacyVF.Width ||
7530 planContainsAdditionalSimplifications(getPlanFor(BestFactor.Width),
7531 CostCtx, OrigLoop) ||
7532 planContainsAdditionalSimplifications(getPlanFor(LegacyVF.Width),
7533 CostCtx, OrigLoop)) &&
7534 " VPlan cost model and legacy cost model disagreed");
7535 assert((BestFactor.Width.isScalar() || BestFactor.ScalarCost > 0) &&
7536 "when vectorizing, the scalar cost must be computed.");
7537#endif
7538
7539 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << BestFactor.Width << ".\n");
7540 return BestFactor;
7541}
7542
7543static void addRuntimeUnrollDisableMetaData(Loop *L) {
7544 SmallVector<Metadata *, 4> MDs;
7545 // Reserve first location for self reference to the LoopID metadata node.
7546 MDs.push_back(nullptr);
7547 bool IsUnrollMetadata = false;
7548 MDNode *LoopID = L->getLoopID();
7549 if (LoopID) {
7550 // First find existing loop unrolling disable metadata.
7551 for (unsigned I = 1, IE = LoopID->getNumOperands(); I < IE; ++I) {
7552 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(I));
7553 if (MD) {
7554 const auto *S = dyn_cast<MDString>(MD->getOperand(0));
7555 IsUnrollMetadata =
7556 S && S->getString().starts_with("llvm.loop.unroll.disable");
7557 }
7558 MDs.push_back(LoopID->getOperand(I));
7559 }
7560 }
7561
7562 if (!IsUnrollMetadata) {
7563 // Add runtime unroll disable metadata.
7564 LLVMContext &Context = L->getHeader()->getContext();
7565 SmallVector<Metadata *, 1> DisableOperands;
7566 DisableOperands.push_back(
7567 MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
7568 MDNode *DisableNode = MDNode::get(Context, DisableOperands);
7569 MDs.push_back(DisableNode);
7570 MDNode *NewLoopID = MDNode::get(Context, MDs);
7571 // Set operand 0 to refer to the loop id itself.
7572 NewLoopID->replaceOperandWith(0, NewLoopID);
7573 L->setLoopID(NewLoopID);
7574 }
7575}
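// The resulting loop metadata has roughly the following shape (illustrative):
//   br i1 %cond, label %header, label %exit, !llvm.loop !0
//   !0 = distinct !{!0, ..., !1}
//   !1 = !{!"llvm.loop.unroll.runtime.disable"}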
7576
7577// If \p R is a ComputeReductionResult when vectorizing the epilog loop,
7578// fix the reduction's scalar PHI node by adding the incoming value from the
7579// main vector loop.
7580static void fixReductionScalarResumeWhenVectorizingEpilog(
7581 VPRecipeBase *R, VPTransformState &State, BasicBlock *LoopMiddleBlock,
7582 BasicBlock *BypassBlock) {
7583 auto *EpiRedResult = dyn_cast<VPInstruction>(R);
7584 if (!EpiRedResult ||
7585 EpiRedResult->getOpcode() != VPInstruction::ComputeReductionResult)
7586 return;
7587
7588 auto *EpiRedHeaderPhi =
7589 cast<VPReductionPHIRecipe>(EpiRedResult->getOperand(0));
7590 const RecurrenceDescriptor &RdxDesc =
7591 EpiRedHeaderPhi->getRecurrenceDescriptor();
7592 Value *MainResumeValue =
7593 EpiRedHeaderPhi->getStartValue()->getUnderlyingValue();
7594 if (RecurrenceDescriptor::isAnyOfRecurrenceKind(
7595 RdxDesc.getRecurrenceKind())) {
7596 auto *Cmp = cast<ICmpInst>(MainResumeValue);
7597 assert(Cmp->getPredicate() == CmpInst::ICMP_NE &&
7598 "AnyOf expected to start with ICMP_NE");
7599 assert(Cmp->getOperand(1) == RdxDesc.getRecurrenceStartValue() &&
7600 "AnyOf expected to start by comparing main resume value to original "
7601 "start value");
7602 MainResumeValue = Cmp->getOperand(0);
7603 } else if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(
7604 RdxDesc.getRecurrenceKind())) {
7605 using namespace llvm::PatternMatch;
7606 Value *Cmp, *OrigResumeV;
7607 bool IsExpectedPattern =
7608 match(MainResumeValue, m_Select(m_OneUse(m_Value(Cmp)),
7609 m_Specific(RdxDesc.getSentinelValue()),
7610 m_Value(OrigResumeV))) &&
7611 match(Cmp,
7614 assert(IsExpectedPattern && "Unexpected reduction resume pattern");
7615 (void)IsExpectedPattern;
7616 MainResumeValue = OrigResumeV;
7617 }
7618 PHINode *MainResumePhi = cast<PHINode>(MainResumeValue);
7619
7620 // When fixing reductions in the epilogue loop we should already have
7621 // created a bc.merge.rdx Phi after the main vector body. Ensure that we carry
7622 // over the incoming values correctly.
7623 using namespace VPlanPatternMatch;
7624 auto IsResumePhi = [](VPUser *U) {
7625 return match(
7626 U, m_VPInstruction<VPInstruction::ResumePhi>(m_VPValue(), m_VPValue()));
7627 };
7628 assert(count_if(EpiRedResult->users(), IsResumePhi) == 1 &&
7629 "ResumePhi must have a single user");
7630 auto *EpiResumePhiVPI =
7631 cast<VPInstruction>(*find_if(EpiRedResult->users(), IsResumePhi));
7632 auto *EpiResumePhi = cast<PHINode>(State.get(EpiResumePhiVPI, true));
7633 EpiResumePhi->setIncomingValueForBlock(
7634 BypassBlock, MainResumePhi->getIncomingValueForBlock(BypassBlock));
7635}
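// For illustration only (value names made up): for an any-of reduction, the
// resume value produced by the main vector loop has the shorthand form
//
//   %bc.merge.rdx = phi i32 [ ... ]                 ; merge phi after main loop
//   %rdx.cmp      = icmp ne i32 %bc.merge.rdx, %orig.start
//
// and the code above peels off the compare to recover the underlying merge
// phi before copying its incoming value for the bypass block into the
// epilogue's resume phi.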
7636
7638 ElementCount BestVF, unsigned BestUF, VPlan &BestVPlan,
7639 InnerLoopVectorizer &ILV, DominatorTree *DT, bool VectorizingEpilogue,
7640 const DenseMap<const SCEV *, Value *> *ExpandedSCEVs) {
7641 assert(BestVPlan.hasVF(BestVF) &&
7642 "Trying to execute plan with unsupported VF");
7643 assert(BestVPlan.hasUF(BestUF) &&
7644 "Trying to execute plan with unsupported UF");
7645 assert(
7646 ((VectorizingEpilogue && ExpandedSCEVs) ||
7647 (!VectorizingEpilogue && !ExpandedSCEVs)) &&
7648 "expanded SCEVs to reuse can only be used during epilogue vectorization");
7649
7650 // TODO: Move to VPlan transform stage once the transition to the VPlan-based
7651 // cost model is complete for better cost estimates.
7652 VPlanTransforms::unrollByUF(BestVPlan, BestUF,
7653 OrigLoop->getHeader()->getContext());
7654 VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
7656
7657 // Perform the actual loop transformation.
7658 VPTransformState State(&TTI, BestVF, BestUF, LI, DT, ILV.Builder, &ILV,
7659 &BestVPlan, OrigLoop->getParentLoop(),
7660 Legal->getWidestInductionType());
7661
7662#ifdef EXPENSIVE_CHECKS
7663 assert(DT->verify(DominatorTree::VerificationLevel::Fast));
7664#endif
7665
7666 // 0. Generate SCEV-dependent code in the entry, including TripCount, before
7667 // making any changes to the CFG.
7668 if (!BestVPlan.getEntry()->empty())
7669 BestVPlan.getEntry()->execute(&State);
7670
7671 if (!ILV.getTripCount())
7672 ILV.setTripCount(State.get(BestVPlan.getTripCount(), VPLane(0)));
7673 else
7674 assert(VectorizingEpilogue && "should only re-use the existing trip "
7675 "count during epilogue vectorization");
7676
7677 // 1. Set up the skeleton for vectorization, including vector pre-header and
7678 // middle block. The vector loop is created during VPlan execution.
7679 VPBasicBlock *VectorPH =
7680 cast<VPBasicBlock>(BestVPlan.getEntry()->getSingleSuccessor());
7682 ExpandedSCEVs ? *ExpandedSCEVs : State.ExpandedSCEVs);
7683 if (VectorizingEpilogue)
7685
7686 // Only use noalias metadata when using memory checks guaranteeing no overlap
7687 // across all iterations.
7688 const LoopAccessInfo *LAI = ILV.Legal->getLAI();
7689 std::unique_ptr<LoopVersioning> LVer = nullptr;
7690 if (LAI && !LAI->getRuntimePointerChecking()->getChecks().empty() &&
7692
7693 // We currently don't use LoopVersioning for the actual loop cloning but we
7694 // still use it to add the noalias metadata.
7695 // TODO: Find a better way to re-use LoopVersioning functionality to add
7696 // metadata.
7697 LVer = std::make_unique<LoopVersioning>(
7698 *LAI, LAI->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, DT,
7699 PSE.getSE());
7700 State.LVer = &*LVer;
7702 }
7703
7705
7706 //===------------------------------------------------===//
7707 //
 7708 // Notice: any optimization or new instruction that goes
7709 // into the code below should also be implemented in
7710 // the cost-model.
7711 //
7712 //===------------------------------------------------===//
7713
7714 // 2. Copy and widen instructions from the old loop into the new loop.
7715 BestVPlan.prepareToExecute(
7716 ILV.getTripCount(),
7718 replaceVPBBWithIRVPBB(VectorPH, State.CFG.PrevBB);
7719
7720 BestVPlan.execute(&State);
7721
7722 auto *MiddleVPBB = BestVPlan.getMiddleBlock();
7723 // 2.5 When vectorizing the epilogue, fix reduction and induction resume
7724 // values from the additional bypass block.
7725 if (VectorizingEpilogue) {
7727 "Epilogue vectorisation not yet supported with early exits");
7728 BasicBlock *BypassBlock = ILV.getAdditionalBypassBlock();
7729 for (VPRecipeBase &R : *MiddleVPBB) {
7731 &R, State, State.CFG.VPBB2IRBB[MiddleVPBB], BypassBlock);
7732 }
7733 BasicBlock *PH = OrigLoop->getLoopPreheader();
7734 for (const auto &[IVPhi, _] : Legal->getInductionVars()) {
7735 auto *Inc = cast<PHINode>(IVPhi->getIncomingValueForBlock(PH));
7737 Inc->setIncomingValueForBlock(BypassBlock, V);
7738 }
7739 }
7740
7741 // 2.6. Maintain Loop Hints
7742 // Keep all loop hints from the original loop on the vector loop (we'll
7743 // replace the vectorizer-specific hints below).
7744 if (auto *LoopRegion = BestVPlan.getVectorLoopRegion()) {
7745 MDNode *OrigLoopID = OrigLoop->getLoopID();
7746
7747 std::optional<MDNode *> VectorizedLoopID =
7750
7751 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
7752 Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]);
7753 if (VectorizedLoopID) {
7754 L->setLoopID(*VectorizedLoopID);
7755 } else {
7756 // Keep all loop hints from the original loop on the vector loop (we'll
7757 // replace the vectorizer-specific hints below).
7758 if (MDNode *LID = OrigLoop->getLoopID())
7759 L->setLoopID(LID);
7760
7761 LoopVectorizeHints Hints(L, true, *ORE);
7762 Hints.setAlreadyVectorized();
7763 }
7765 TTI.getUnrollingPreferences(L, *PSE.getSE(), UP, ORE);
7766 if (!UP.UnrollVectorizedLoop || VectorizingEpilogue)
7768 }
7769
7770 // 3. Fix the vectorized code: take care of header phi's, live-outs,
7771 // predication, updating analyses.
7772 ILV.fixVectorizedLoop(State);
7773
7775
7776 // 4. Adjust branch weight of the branch in the middle block.
7777 if (BestVPlan.getVectorLoopRegion()) {
7778 auto *MiddleVPBB = BestVPlan.getMiddleBlock();
7779 auto *MiddleTerm =
7780 cast<BranchInst>(State.CFG.VPBB2IRBB[MiddleVPBB]->getTerminator());
7781 if (MiddleTerm->isConditional() &&
7782 hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) {
7783 // Assume that `Count % VectorTripCount` is equally distributed.
7784 unsigned TripCount = BestVPlan.getUF() * State.VF.getKnownMinValue();
7785 assert(TripCount > 0 && "trip count should not be zero");
7786 const uint32_t Weights[] = {1, TripCount - 1};
7787 setBranchWeights(*MiddleTerm, Weights, /*IsExpected=*/false);
7788 }
7789 }
7790
7791 return State.ExpandedSCEVs;
7792}
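// Worked example for the branch-weight assignment above (illustrative): with
// VF = 4 and UF = 2 the vector step is 8, so under the assumption that the
// remainder `Count % VectorTripCount` is uniformly distributed it is zero in
// roughly 1 out of 8 cases, and the middle-block branch gets weights {1, 7}.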
7793
7794//===--------------------------------------------------------------------===//
7795// EpilogueVectorizerMainLoop
7796//===--------------------------------------------------------------------===//
7797
7798/// This function is partially responsible for generating the control flow
7799/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7801 const SCEV2ValueTy &ExpandedSCEVs) {
7803
7804 // Generate the code to check the minimum iteration count of the vector
7805 // epilogue (see below).
7809
7810 // Generate the code to check any assumptions that we've made for SCEV
7811 // expressions.
7813
7814 // Generate the code that checks at runtime if arrays overlap. We put the
7815 // checks into a separate block to make the more common case of few elements
7816 // faster.
7818
7819 // Generate the iteration count check for the main loop, *after* the check
7820 // for the epilogue loop, so that the path-length is shorter for the case
 7821 // that goes directly through the vector epilogue. The longer path length for
 7822 // the main loop is compensated for by the gain from vectorizing the larger
7823 // trip count. Note: the branch will get updated later on when we vectorize
7824 // the epilogue.
7827
7828 // Generate the induction variable.
7830
7831 return LoopVectorPreHeader;
7832}
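// Rough control-flow sketch of the skeleton built above (block names are the
// ones typically seen in the generated IR and are shown only for orientation;
// the Vectorizers documentation linked above has the authoritative picture):
//
//   iter.check                     ; min-iters check against the epilogue VF * UF
//     |
//   vector.main.loop.iter.check    ; min-iters check against the main VF * UF
//     |
//   vector.ph -> vector.body -> middle.block
//                                   |
//                     vec.epilog.iter.check -> vec.epilog.ph -> vec.epilog.vector.body
//                                   |
//                               scalar loop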
7833
7835 LLVM_DEBUG({
7836 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
7837 << "Main Loop VF:" << EPI.MainLoopVF
7838 << ", Main Loop UF:" << EPI.MainLoopUF
7839 << ", Epilogue Loop VF:" << EPI.EpilogueVF
7840 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7841 });
7842}
7843
7846 dbgs() << "intermediate fn:\n"
7847 << *OrigLoop->getHeader()->getParent() << "\n";
7848 });
7849}
7850
7851BasicBlock *
7853 bool ForEpilogue) {
7854 assert(Bypass && "Expected valid bypass basic block.");
7855 ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF;
7856 unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF;
7857 Value *Count = getTripCount();
7858 // Reuse existing vector loop preheader for TC checks.
7859 // Note that new preheader block is generated for vector loop.
7860 BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
7861 IRBuilder<> Builder(TCCheckBlock->getTerminator());
7862
7863 // Generate code to check if the loop's trip count is less than VF * UF of the
7864 // main vector loop.
7865 auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF.isVector()
7866 : VF.isVector())
7869
7870 Value *CheckMinIters = Builder.CreateICmp(
7871 P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor),
7872 "min.iters.check");
7873
7874 if (!ForEpilogue)
7875 TCCheckBlock->setName("vector.main.loop.iter.check");
7876
7877 // Create new preheader for vector loop.
7878 LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
7879 DT, LI, nullptr, "vector.ph");
7880
7881 if (ForEpilogue) {
7882 assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
7883 DT->getNode(Bypass)->getIDom()) &&
7884 "TC check is expected to dominate Bypass");
7885
7886 LoopBypassBlocks.push_back(TCCheckBlock);
7887
7888 // Save the trip count so we don't have to regenerate it in the
7889 // vec.epilog.iter.check. This is safe to do because the trip count
7890 // generated here dominates the vector epilog iter check.
7891 EPI.TripCount = Count;
7892 }
7893
7894 BranchInst &BI =
7895 *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
7897 setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false);
7898 ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
7899
7900 introduceCheckBlockInVPlan(TCCheckBlock);
7901 return TCCheckBlock;
7902}
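// For illustration only (value names made up): for a fixed VF of 4 and UF of
// 2, and assuming no final scalar iteration has to be kept back, the check
// emitted above is roughly
//
//   %min.iters.check = icmp ult i64 %trip.count, 8
//   br i1 %min.iters.check, label %<bypass>, label %vector.ph
//
// where %<bypass> is whichever bypass block was passed in, and the predicate
// becomes ule when a scalar epilogue iteration must always execute.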
7903
7904//===--------------------------------------------------------------------===//
7905// EpilogueVectorizerEpilogueLoop
7906//===--------------------------------------------------------------------===//
7907
7908/// This function is partially responsible for generating the control flow
7909/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7910BasicBlock *
7912 const SCEV2ValueTy &ExpandedSCEVs) {
7913 createVectorLoopSkeleton("vec.epilog.");
7914
7915 // Now, compare the remaining count and if there aren't enough iterations to
 7916 // execute the vectorized epilogue, skip to the scalar part.
7917 LoopVectorPreHeader->setName("vec.epilog.ph");
7918 BasicBlock *VecEpilogueIterationCountCheck =
7920 nullptr, "vec.epilog.iter.check", true);
7922 VecEpilogueIterationCountCheck);
7923 AdditionalBypassBlock = VecEpilogueIterationCountCheck;
7924
7925 // Adjust the control flow taking the state info from the main loop
7926 // vectorization into account.
7928 "expected this to be saved from the previous pass.");
7930 VecEpilogueIterationCountCheck, LoopVectorPreHeader);
7931
7933 VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7934
7935 if (EPI.SCEVSafetyCheck)
7937 VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7938 if (EPI.MemSafetyCheck)
7940 VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7941
7944 // Keep track of bypass blocks, as they feed start values to the induction and
7945 // reduction phis in the scalar loop preheader.
7946 if (EPI.SCEVSafetyCheck)
7948 if (EPI.MemSafetyCheck)
7951
7952 // The vec.epilog.iter.check block may contain Phi nodes from inductions or
7953 // reductions which merge control-flow from the latch block and the middle
7954 // block. Update the incoming values here and move the Phi into the preheader.
7955 SmallVector<PHINode *, 4> PhisInBlock;
7956 for (PHINode &Phi : VecEpilogueIterationCountCheck->phis())
7957 PhisInBlock.push_back(&Phi);
7958
7959 for (PHINode *Phi : PhisInBlock) {
7960 Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHI());
7961 Phi->replaceIncomingBlockWith(
7962 VecEpilogueIterationCountCheck->getSinglePredecessor(),
7963 VecEpilogueIterationCountCheck);
7964
7965 // If the phi doesn't have an incoming value from the
7966 // EpilogueIterationCountCheck, we are done. Otherwise remove the incoming
7967 // value and also those from other check blocks. This is needed for
7968 // reduction phis only.
7969 if (none_of(Phi->blocks(), [&](BasicBlock *IncB) {
7970 return EPI.EpilogueIterationCountCheck == IncB;
7971 }))
7972 continue;
7973 Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck);
7974 if (EPI.SCEVSafetyCheck)
7975 Phi->removeIncomingValue(EPI.SCEVSafetyCheck);
7976 if (EPI.MemSafetyCheck)
7977 Phi->removeIncomingValue(EPI.MemSafetyCheck);
7978 }
7979
7980 // Generate bypass values from the additional bypass block. Note that when the
 7981 // vectorized epilogue is skipped due to the iteration count check, the
7982 // resume value for the induction variable comes from the trip count of the
7983 // main vector loop, passed as the second argument.
7985 return LoopVectorPreHeader;
7986}
7987
7988BasicBlock *
7990 BasicBlock *Bypass, BasicBlock *Insert) {
7991
7993 "Expected trip count to have been saved in the first pass.");
7994 assert(
7995 (!isa<Instruction>(EPI.TripCount) ||
7996 DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
7997 "saved trip count does not dominate insertion point.");
7998 Value *TC = EPI.TripCount;
7999 IRBuilder<> Builder(Insert->getTerminator());
8000 Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");
8001
8002 // Generate code to check if the loop's trip count is less than VF * UF of the
8003 // vector epilogue loop.
8004 auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector())
8007
8008 Value *CheckMinIters =
8009 Builder.CreateICmp(P, Count,
8012 "min.epilog.iters.check");
8013
8014 BranchInst &BI =
8015 *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
8017 unsigned MainLoopStep = UF * VF.getKnownMinValue();
8018 unsigned EpilogueLoopStep =
8020 // We assume the remaining `Count` is equally distributed in
8021 // [0, MainLoopStep)
8022 // So the probability for `Count < EpilogueLoopStep` should be
8023 // min(MainLoopStep, EpilogueLoopStep) / MainLoopStep
8024 unsigned EstimatedSkipCount = std::min(MainLoopStep, EpilogueLoopStep);
8025 const uint32_t Weights[] = {EstimatedSkipCount,
8026 MainLoopStep - EstimatedSkipCount};
8027 setBranchWeights(BI, Weights, /*IsExpected=*/false);
8028 }
8029 ReplaceInstWithInst(Insert->getTerminator(), &BI);
8030 LoopBypassBlocks.push_back(Insert);
8031
8032 // A new entry block has been created for the epilogue VPlan. Hook it in, as
8033 // otherwise we would try to modify the entry to the main vector loop.
8034 VPIRBasicBlock *NewEntry = Plan.createVPIRBasicBlock(Insert);
8035 VPBasicBlock *OldEntry = Plan.getEntry();
8036 VPBlockUtils::reassociateBlocks(OldEntry, NewEntry);
8037 Plan.setEntry(NewEntry);
8038 // OldEntry is now dead and will be cleaned up when the plan gets destroyed.
8039
8041 return Insert;
8042}
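// Worked example for the weights above (illustrative): with a main loop of
// VF = 8, UF = 2 (MainLoopStep = 16) and an epilogue of VF = 4, UF = 1
// (EpilogueLoopStep = 4), EstimatedSkipCount = min(16, 4) = 4, so the branch
// that skips the vectorized epilogue gets weights {4, 12}, i.e. a probability
// of roughly 4/16 under the equal-distribution assumption stated above.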
8043
8045 LLVM_DEBUG({
8046 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
8047 << "Epilogue Loop VF:" << EPI.EpilogueVF
8048 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
8049 });
8050}
8051
8054 dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n";
8055 });
8056}
8057
8058iterator_range<mapped_iterator<Use *, std::function<VPValue *(Value *)>>>
8060 std::function<VPValue *(Value *)> Fn = [this](Value *Op) {
8061 return getVPValueOrAddLiveIn(Op);
8062 };
8063 return map_range(Operands, Fn);
8064}
8065
8067 BasicBlock *Src = SI->getParent();
8068 assert(!OrigLoop->isLoopExiting(Src) &&
8069 all_of(successors(Src),
8070 [this](BasicBlock *Succ) {
8071 return OrigLoop->getHeader() != Succ;
8072 }) &&
8073 "unsupported switch either exiting loop or continuing to header");
 8074 // Create masks where the terminator in Src is a switch. We create masks for
 8075 // all edges at the same time. This is more efficient, as we can create and
 8076 // collect the compares for all cases at once.
8077 VPValue *Cond = getVPValueOrAddLiveIn(SI->getCondition());
8078 BasicBlock *DefaultDst = SI->getDefaultDest();
8080 for (auto &C : SI->cases()) {
8081 BasicBlock *Dst = C.getCaseSuccessor();
8082 assert(!EdgeMaskCache.contains({Src, Dst}) && "Edge masks already created");
8083 // Cases whose destination is the same as default are redundant and can be
8084 // ignored - they will get there anyhow.
8085 if (Dst == DefaultDst)
8086 continue;
8087 auto &Compares = Dst2Compares[Dst];
8088 VPValue *V = getVPValueOrAddLiveIn(C.getCaseValue());
8089 Compares.push_back(Builder.createICmp(CmpInst::ICMP_EQ, Cond, V));
8090 }
8091
8092 // We need to handle 2 separate cases below for all entries in Dst2Compares,
8093 // which excludes destinations matching the default destination.
8094 VPValue *SrcMask = getBlockInMask(Src);
8095 VPValue *DefaultMask = nullptr;
8096 for (const auto &[Dst, Conds] : Dst2Compares) {
8097 // 1. Dst is not the default destination. Dst is reached if any of the cases
8098 // with destination == Dst are taken. Join the conditions for each case
8099 // whose destination == Dst using an OR.
8100 VPValue *Mask = Conds[0];
8101 for (VPValue *V : ArrayRef<VPValue *>(Conds).drop_front())
8102 Mask = Builder.createOr(Mask, V);
8103 if (SrcMask)
8104 Mask = Builder.createLogicalAnd(SrcMask, Mask);
8105 EdgeMaskCache[{Src, Dst}] = Mask;
8106
8107 // 2. Create the mask for the default destination, which is reached if none
8108 // of the cases with destination != default destination are taken. Join the
 8109 // conditions for each case whose destination is not the default using an OR and
8110 // negate it.
8111 DefaultMask = DefaultMask ? Builder.createOr(DefaultMask, Mask) : Mask;
8112 }
8113
8114 if (DefaultMask) {
8115 DefaultMask = Builder.createNot(DefaultMask);
8116 if (SrcMask)
8117 DefaultMask = Builder.createLogicalAnd(SrcMask, DefaultMask);
8118 }
8119 EdgeMaskCache[{Src, DefaultDst}] = DefaultMask;
8120}
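// For illustration only (names made up): given a block Src with block-in mask
// %m terminated by
//
//   switch i32 %c, label %dflt [ i32 0, label %bb.a
//                                i32 1, label %bb.a
//                                i32 2, label %bb.b ]
//
// the masks created above are roughly
//
//   mask(Src -> bb.a) = %m && ((%c == 0) || (%c == 1))
//   mask(Src -> bb.b) = %m && (%c == 2)
//   mask(Src -> dflt) = %m && !(mask(Src -> bb.a) || mask(Src -> bb.b))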
8121
8123 assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
8124
8125 // Look for cached value.
8126 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
8127 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
8128 if (ECEntryIt != EdgeMaskCache.end())
8129 return ECEntryIt->second;
8130
8131 if (auto *SI = dyn_cast<SwitchInst>(Src->getTerminator())) {
8133 assert(EdgeMaskCache.contains(Edge) && "Mask for Edge not created?");
8134 return EdgeMaskCache[Edge];
8135 }
8136
8137 VPValue *SrcMask = getBlockInMask(Src);
8138
8139 // The terminator has to be a branch inst!
8140 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
8141 assert(BI && "Unexpected terminator found");
8142 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
8143 return EdgeMaskCache[Edge] = SrcMask;
8144
8145 // If source is an exiting block, we know the exit edge is dynamically dead
8146 // in the vector loop, and thus we don't need to restrict the mask. Avoid
8147 // adding uses of an otherwise potentially dead instruction unless we are
8148 // vectorizing a loop with uncountable exits. In that case, we always
8149 // materialize the mask.
8150 if (OrigLoop->isLoopExiting(Src) &&
8151 Src != Legal->getUncountableEarlyExitingBlock())
8152 return EdgeMaskCache[Edge] = SrcMask;
8153
8154 VPValue *EdgeMask = getVPValueOrAddLiveIn(BI->getCondition());
8155 assert(EdgeMask && "No Edge Mask found for condition");
8156
8157 if (BI->getSuccessor(0) != Dst)
8158 EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc());
8159
8160 if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND.
8161 // The bitwise 'And' of SrcMask and EdgeMask introduces new UB if SrcMask
8162 // is false and EdgeMask is poison. Avoid that by using 'LogicalAnd'
8163 // instead which generates 'select i1 SrcMask, i1 EdgeMask, i1 false'.
8164 EdgeMask = Builder.createLogicalAnd(SrcMask, EdgeMask, BI->getDebugLoc());
8165 }
8166
8167 return EdgeMaskCache[Edge] = EdgeMask;
8168}
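// Shorthand example of the result (illustrative): for a block Src with
// block-in mask %m terminated by `br i1 %cond, label %then, label %else`,
// the edge masks are generated as
//
//   mask(Src -> then) = select i1 %m, i1 %cond, i1 false
//   mask(Src -> else) = select i1 %m, i1 (not %cond), i1 false
//
// using the poison-safe logical-and form mentioned in the comment above.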
8169
8171 assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
8172
8173 // Look for cached value.
8174 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
8175 EdgeMaskCacheTy::const_iterator ECEntryIt = EdgeMaskCache.find(Edge);
8176 assert(ECEntryIt != EdgeMaskCache.end() &&
8177 "looking up mask for edge which has not been created");
8178 return ECEntryIt->second;
8179}
8180
8182 BasicBlock *Header = OrigLoop->getHeader();
8183
8184 // When not folding the tail, use nullptr to model all-true mask.
8185 if (!CM.foldTailByMasking()) {
8186 BlockMaskCache[Header] = nullptr;
8187 return;
8188 }
8189
8190 // Introduce the early-exit compare IV <= BTC to form header block mask.
8191 // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by
8192 // constructing the desired canonical IV in the header block as its first
8193 // non-phi instructions.
8194
8195 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
8196 auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi();
8197 auto *IV = new VPWidenCanonicalIVRecipe(Plan.getCanonicalIV());
8198 HeaderVPBB->insert(IV, NewInsertionPoint);
8199
8200 VPBuilder::InsertPointGuard Guard(Builder);
8201 Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint);
8202 VPValue *BlockMask = nullptr;
8204 BlockMask = Builder.createICmp(CmpInst::ICMP_ULE, IV, BTC);
8205 BlockMaskCache[Header] = BlockMask;
8206}
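// Worked example of the header mask (illustrative): for a scalar trip count
// of 10 and VF = 4, the backedge-taken count is 9, so in the third vector
// iteration the widened canonical IV is <8, 9, 10, 11> and the header mask
// `icmp ule IV, <9, 9, 9, 9>` is <1, 1, 0, 0>, disabling the two extra lanes.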
8207
8209 // Return the cached value.
8210 BlockMaskCacheTy::const_iterator BCEntryIt = BlockMaskCache.find(BB);
8211 assert(BCEntryIt != BlockMaskCache.end() &&
8212 "Trying to access mask for block without one.");
8213 return BCEntryIt->second;
8214}
8215
8217 assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
8218 assert(BlockMaskCache.count(BB) == 0 && "Mask for block already computed");
8219 assert(OrigLoop->getHeader() != BB &&
8220 "Loop header must have cached block mask");
8221
8222 // All-one mask is modelled as no-mask following the convention for masked
8223 // load/store/gather/scatter. Initialize BlockMask to no-mask.
8224 VPValue *BlockMask = nullptr;
8225 // This is the block mask. We OR all unique incoming edges.
8226 for (auto *Predecessor :
8228 VPValue *EdgeMask = createEdgeMask(Predecessor, BB);
8229 if (!EdgeMask) { // Mask of predecessor is all-one so mask of block is too.
8230 BlockMaskCache[BB] = EdgeMask;
8231 return;
8232 }
8233
8234 if (!BlockMask) { // BlockMask has its initialized nullptr value.
8235 BlockMask = EdgeMask;
8236 continue;
8237 }
8238
8239 BlockMask = Builder.createOr(BlockMask, EdgeMask, {});
8240 }
8241
8242 BlockMaskCache[BB] = BlockMask;
8243}
8244
8246VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
8247 VFRange &Range) {
8248 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
8249 "Must be called with either a load or store");
8250
8251 auto WillWiden = [&](ElementCount VF) -> bool {
8253 CM.getWideningDecision(I, VF);
8255 "CM decision should be taken at this point.");
8257 return true;
8258 if (CM.isScalarAfterVectorization(I, VF) ||
8259 CM.isProfitableToScalarize(I, VF))
8260 return false;
8262 };
8263
8265 return nullptr;
8266
8267 VPValue *Mask = nullptr;
8268 if (Legal->isMaskRequired(I))
8269 Mask = getBlockInMask(I->getParent());
8270
8271 // Determine if the pointer operand of the access is either consecutive or
8272 // reverse consecutive.
8274 CM.getWideningDecision(I, Range.Start);
8276 bool Consecutive =
8278
8279 VPValue *Ptr = isa<LoadInst>(I) ? Operands[0] : Operands[1];
8280 if (Consecutive) {
8281 auto *GEP = dyn_cast<GetElementPtrInst>(
8282 Ptr->getUnderlyingValue()->stripPointerCasts());
8283 VPSingleDefRecipe *VectorPtr;
8284 if (Reverse) {
 8285 // When folding the tail, we may compute an address that we would not compute
 8286 // in the original scalar loop, and it may not be inbounds. Drop Inbounds in that
8287 // case.
8288 GEPNoWrapFlags Flags =
8289 (CM.foldTailByMasking() || !GEP || !GEP->isInBounds())
8292 VectorPtr = new VPReverseVectorPointerRecipe(
8293 Ptr, &Plan.getVF(), getLoadStoreType(I), Flags, I->getDebugLoc());
8294 } else {
8295 VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I),
8296 GEP ? GEP->getNoWrapFlags()
8298 I->getDebugLoc());
8299 }
8300 Builder.getInsertBlock()->appendRecipe(VectorPtr);
8301 Ptr = VectorPtr;
8302 }
8303 if (LoadInst *Load = dyn_cast<LoadInst>(I))
8304 return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse,
8305 I->getDebugLoc());
8306
8307 StoreInst *Store = cast<StoreInst>(I);
8308 return new VPWidenStoreRecipe(*Store, Ptr, Operands[0], Mask, Consecutive,
8309 Reverse, I->getDebugLoc());
8310}
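// Illustrative examples of the pointer handling above (assuming the cost
// model classified both accesses as widenable):
//
//   for (i = 0; i < n; ++i) s += a[i];          // consecutive: VPVectorPointerRecipe
//                                               // feeding a wide load
//   for (i = 0; i < n; ++i) s += a[n - 1 - i];  // reverse: VPReverseVectorPointerRecipe
//                                               // feeding a reversed wide load,
//                                               // with inbounds dropped when
//                                               // folding the tail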
8311
8312/// Creates a VPWidenIntOrFpInductionRecipe for \p Phi. If needed, it will also
8313/// insert a recipe to expand the step for the induction recipe.
8316 VPValue *Start, const InductionDescriptor &IndDesc,
8317 VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop) {
8318 assert(IndDesc.getStartValue() ==
8319 Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader()));
8320 assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) &&
8321 "step must be loop invariant");
8322
8323 VPValue *Step =
8325 if (auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) {
8326 return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, &Plan.getVF(),
8327 IndDesc, TruncI,
8328 TruncI->getDebugLoc());
8329 }
8330 assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here");
8331 return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, &Plan.getVF(),
8332 IndDesc, Phi->getDebugLoc());
8333}
8334
8335VPHeaderPHIRecipe *VPRecipeBuilder::tryToOptimizeInductionPHI(
8337
8338 // Check if this is an integer or fp induction. If so, build the recipe that
8339 // produces its scalar and vector values.
8340 if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi))
8341 return createWidenInductionRecipes(Phi, Phi, Operands[0], *II, Plan,
8342 *PSE.getSE(), *OrigLoop);
8343
8344 // Check if this is pointer induction. If so, build the recipe for it.
8345 if (auto *II = Legal->getPointerInductionDescriptor(Phi)) {
8346 VPValue *Step = vputils::getOrCreateVPValueForSCEVExpr(Plan, II->getStep(),
8347 *PSE.getSE());
8349 Phi, Operands[0], Step, *II,
8351 [&](ElementCount VF) {
8352 return CM.isScalarAfterVectorization(Phi, VF);
8353 },
8354 Range),
8355 Phi->getDebugLoc());
8356 }
8357 return nullptr;
8358}
8359
8360VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
8362 // Optimize the special case where the source is a constant integer
8363 // induction variable. Notice that we can only optimize the 'trunc' case
8364 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
8365 // (c) other casts depend on pointer size.
8366
8367 // Determine whether \p K is a truncation based on an induction variable that
8368 // can be optimized.
8369 auto IsOptimizableIVTruncate =
8370 [&](Instruction *K) -> std::function<bool(ElementCount)> {
8371 return [=](ElementCount VF) -> bool {
8372 return CM.isOptimizableIVTruncate(K, VF);
8373 };
8374 };
8375
8377 IsOptimizableIVTruncate(I), Range)) {
8378
8379 auto *Phi = cast<PHINode>(I->getOperand(0));
8381 VPValue *Start = Plan.getOrAddLiveIn(II.getStartValue());
8382 return createWidenInductionRecipes(Phi, I, Start, II, Plan, *PSE.getSE(),
8383 *OrigLoop);
8384 }
8385 return nullptr;
8386}
8387
8388VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi,
8390 unsigned NumIncoming = Phi->getNumIncomingValues();
8391
8392 // We know that all PHIs in non-header blocks are converted into selects, so
8393 // we don't have to worry about the insertion order and we can just use the
8394 // builder. At this point we generate the predication tree. There may be
8395 // duplications since this is a simple recursive scan, but future
8396 // optimizations will clean it up.
8397 SmallVector<VPValue *, 2> OperandsWithMask;
8398
8399 for (unsigned In = 0; In < NumIncoming; In++) {
8400 OperandsWithMask.push_back(Operands[In]);
8401 VPValue *EdgeMask =
8402 getEdgeMask(Phi->getIncomingBlock(In), Phi->getParent());
8403 if (!EdgeMask) {
8404 assert(In == 0 && "Both null and non-null edge masks found");
8406 "Distinct incoming values with one having a full mask");
8407 break;
8408 }
8409 OperandsWithMask.push_back(EdgeMask);
8410 }
8411 return new VPBlendRecipe(Phi, OperandsWithMask);
8412}
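// For illustration only: a phi merging an if-then-else inside the loop, e.g.
//
//   for (i = 0; i < n; ++i)
//     x[i] = c[i] ? a[i] : b[i];
//
// becomes a VPBlendRecipe whose operands alternate incoming values and their
// edge masks; it is ultimately lowered to vector selects on those masks.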
8413
8414VPSingleDefRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
8416 VFRange &Range) {
8418 [this, CI](ElementCount VF) {
8419 return CM.isScalarWithPredication(CI, VF);
8420 },
8421 Range);
8422
8423 if (IsPredicated)
8424 return nullptr;
8425
8427 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
8428 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
8429 ID == Intrinsic::pseudoprobe ||
8430 ID == Intrinsic::experimental_noalias_scope_decl))
8431 return nullptr;
8432
8433 SmallVector<VPValue *, 4> Ops(Operands.take_front(CI->arg_size()));
8434
8435 // Is it beneficial to perform intrinsic call compared to lib call?
8436 bool ShouldUseVectorIntrinsic =
8438 [&](ElementCount VF) -> bool {
8439 return CM.getCallWideningDecision(CI, VF).Kind ==
8441 },
8442 Range);
8443 if (ShouldUseVectorIntrinsic)
8444 return new VPWidenIntrinsicRecipe(*CI, ID, Ops, CI->getType(),
8445 CI->getDebugLoc());
8446
8447 Function *Variant = nullptr;
8448 std::optional<unsigned> MaskPos;
 8449 // Is it better to call a vectorized version of the function than to
 8450 // scalarize the call?
8451 auto ShouldUseVectorCall = LoopVectorizationPlanner::getDecisionAndClampRange(
8452 [&](ElementCount VF) -> bool {
8453 // The following case may be scalarized depending on the VF.
8454 // The flag shows whether we can use a usual Call for vectorized
8455 // version of the instruction.
8456
8457 // If we've found a variant at a previous VF, then stop looking. A
8458 // vectorized variant of a function expects input in a certain shape
8459 // -- basically the number of input registers, the number of lanes
8460 // per register, and whether there's a mask required.
8461 // We store a pointer to the variant in the VPWidenCallRecipe, so
8462 // once we have an appropriate variant it's only valid for that VF.
8463 // This will force a different vplan to be generated for each VF that
8464 // finds a valid variant.
8465 if (Variant)
8466 return false;
8468 CM.getCallWideningDecision(CI, VF);
8470 Variant = Decision.Variant;
8471 MaskPos = Decision.MaskPos;
8472 return true;
8473 }
8474
8475 return false;
8476 },
8477 Range);
8478 if (ShouldUseVectorCall) {
8479 if (MaskPos.has_value()) {
8480 // We have 2 cases that would require a mask:
8481 // 1) The block needs to be predicated, either due to a conditional
8482 // in the scalar loop or use of an active lane mask with
8483 // tail-folding, and we use the appropriate mask for the block.
8484 // 2) No mask is required for the block, but the only available
8485 // vector variant at this VF requires a mask, so we synthesize an
8486 // all-true mask.
8487 VPValue *Mask = nullptr;
8488 if (Legal->isMaskRequired(CI))
8489 Mask = getBlockInMask(CI->getParent());
8490 else
8491 Mask = Plan.getOrAddLiveIn(
8493
8494 Ops.insert(Ops.begin() + *MaskPos, Mask);
8495 }
8496
8497 Ops.push_back(Operands.back());
8498 return new VPWidenCallRecipe(CI, Variant, Ops, CI->getDebugLoc());
8499 }
8500
8501 return nullptr;
8502}
8503
8504bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
8505 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
8506 !isa<StoreInst>(I) && "Instruction should have been handled earlier");
8507 // Instruction should be widened, unless it is scalar after vectorization,
8508 // scalarization is profitable or it is predicated.
8509 auto WillScalarize = [this, I](ElementCount VF) -> bool {
8510 return CM.isScalarAfterVectorization(I, VF) ||
8511 CM.isProfitableToScalarize(I, VF) ||
8512 CM.isScalarWithPredication(I, VF);
8513 };
8515 Range);
8516}
8517
8518VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
8520 VPBasicBlock *VPBB) {
8521 switch (I->getOpcode()) {
8522 default:
8523 return nullptr;
8524 case Instruction::SDiv:
8525 case Instruction::UDiv:
8526 case Instruction::SRem:
8527 case Instruction::URem: {
8528 // If not provably safe, use a select to form a safe divisor before widening the
8529 // div/rem operation itself. Otherwise fall through to general handling below.
8530 if (CM.isPredicatedInst(I)) {
8532 VPValue *Mask = getBlockInMask(I->getParent());
8533 VPValue *One =
8534 Plan.getOrAddLiveIn(ConstantInt::get(I->getType(), 1u, false));
8535 auto *SafeRHS = Builder.createSelect(Mask, Ops[1], One, I->getDebugLoc());
8536 Ops[1] = SafeRHS;
8537 return new VPWidenRecipe(*I, make_range(Ops.begin(), Ops.end()));
8538 }
8539 [[fallthrough]];
8540 }
8541 case Instruction::Add:
8542 case Instruction::And:
8543 case Instruction::AShr:
8544 case Instruction::FAdd:
8545 case Instruction::FCmp:
8546 case Instruction::FDiv:
8547 case Instruction::FMul:
8548 case Instruction::FNeg:
8549 case Instruction::FRem:
8550 case Instruction::FSub:
8551 case Instruction::ICmp:
8552 case Instruction::LShr:
8553 case Instruction::Mul:
8554 case Instruction::Or:
8555 case Instruction::Select:
8556 case Instruction::Shl:
8557 case Instruction::Sub:
8558 case Instruction::Xor:
8559 case Instruction::Freeze:
8561 if (Instruction::isBinaryOp(I->getOpcode())) {
8562 // The legacy cost model uses SCEV to check if some of the operands are
8563 // constants. To match the legacy cost model's behavior, use SCEV to try
8564 // to replace operands with constants.
8565 ScalarEvolution &SE = *PSE.getSE();
8566 auto GetConstantViaSCEV = [this, &SE](VPValue *Op) {
8567 Value *V = Op->getUnderlyingValue();
8568 if (isa<Constant>(V) || !SE.isSCEVable(V->getType()))
8569 return Op;
8570 auto *C = dyn_cast<SCEVConstant>(SE.getSCEV(V));
8571 if (!C)
8572 return Op;
8573 return Plan.getOrAddLiveIn(C->getValue());
8574 };
8575 // For Mul, the legacy cost model checks both operands.
8576 if (I->getOpcode() == Instruction::Mul)
8577 NewOps[0] = GetConstantViaSCEV(NewOps[0]);
8578 // For other binops, the legacy cost model only checks the second operand.
8579 NewOps[1] = GetConstantViaSCEV(NewOps[1]);
8580 }
8581 return new VPWidenRecipe(*I, make_range(NewOps.begin(), NewOps.end()));
 8582 }
8583}
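// For illustration only (types and names made up): a predicated division like
//
//   if (p[i] != 0) q[i] = x[i] / p[i];
//
// with block mask %m is widened via the safe-divisor select created above, so
// masked-off lanes never divide by zero:
//
//   %safe.div = select <4 x i1> %m, <4 x i32> %p, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
//   %div      = udiv <4 x i32> %x, %safe.div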
8584
8586VPRecipeBuilder::tryToWidenHistogram(const HistogramInfo *HI,
8588 // FIXME: Support other operations.
8589 unsigned Opcode = HI->Update->getOpcode();
8590 assert((Opcode == Instruction::Add || Opcode == Instruction::Sub) &&
8591 "Histogram update operation must be an Add or Sub");
8592
8594 // Bucket address.
8595 HGramOps.push_back(Operands[1]);
8596 // Increment value.
8597 HGramOps.push_back(getVPValueOrAddLiveIn(HI->Update->getOperand(1)));
8598
8599 // In case of predicated execution (due to tail-folding, or conditional
8600 // execution, or both), pass the relevant mask.
8601 if (Legal->isMaskRequired(HI->Store))
8602 HGramOps.push_back(getBlockInMask(HI->Store->getParent()));
8603
8604 return new VPHistogramRecipe(Opcode,
8605 make_range(HGramOps.begin(), HGramOps.end()),
8606 HI->Store->getDebugLoc());
8607}
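// For illustration only: the histogram pattern handled above has the form
//
//   for (i = 0; i < n; ++i)
//     buckets[indices[i]] += 1;
//
// where HGramOps carries the widened bucket address and the increment value,
// plus the block mask when the store is predicated.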
8608
8610 BasicBlock *OrigLatch = OrigLoop->getLoopLatch();
8611 for (VPHeaderPHIRecipe *R : PhisToFix) {
8612 auto *PN = cast<PHINode>(R->getUnderlyingValue());
8613 VPRecipeBase *IncR =
8614 getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch)));
8615 R->addOperand(IncR->getVPSingleValue());
8616 }
8617}
8618
8620 VFRange &Range) {
8622 [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
8623 Range);
8624
8625 bool IsPredicated = CM.isPredicatedInst(I);
8626
8627 // Even if the instruction is not marked as uniform, there are certain
8628 // intrinsic calls that can be effectively treated as such, so we check for
8629 // them here. Conservatively, we only do this for scalable vectors, since
8630 // for fixed-width VFs we can always fall back on full scalarization.
8631 if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) {
8632 switch (cast<IntrinsicInst>(I)->getIntrinsicID()) {
8633 case Intrinsic::assume:
8634 case Intrinsic::lifetime_start:
8635 case Intrinsic::lifetime_end:
8636 // For scalable vectors if one of the operands is variant then we still
8637 // want to mark as uniform, which will generate one instruction for just
8638 // the first lane of the vector. We can't scalarize the call in the same
8639 // way as for fixed-width vectors because we don't know how many lanes
8640 // there are.
8641 //
8642 // The reasons for doing it this way for scalable vectors are:
8643 // 1. For the assume intrinsic generating the instruction for the first
 8644 // lane is still better than not generating any at all. For
8645 // example, the input may be a splat across all lanes.
8646 // 2. For the lifetime start/end intrinsics the pointer operand only
8647 // does anything useful when the input comes from a stack object,
8648 // which suggests it should always be uniform. For non-stack objects
8649 // the effect is to poison the object, which still allows us to
8650 // remove the call.
8651 IsUniform = true;
8652 break;
8653 default:
8654 break;
8655 }
8656 }
8657 VPValue *BlockInMask = nullptr;
8658 if (!IsPredicated) {
8659 // Finalize the recipe for Instr, first if it is not predicated.
8660 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
8661 } else {
8662 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
8663 // Instructions marked for predication are replicated and a mask operand is
8664 // added initially. Masked replicate recipes will later be placed under an
8665 // if-then construct to prevent side-effects. Generate recipes to compute
8666 // the block mask for this region.
8667 BlockInMask = getBlockInMask(I->getParent());
8668 }
8669
8670 // Note that there is some custom logic to mark some intrinsics as uniform
8671 // manually above for scalable vectors, which this assert needs to account for
8672 // as well.
8673 assert((Range.Start.isScalar() || !IsUniform || !IsPredicated ||
8674 (Range.Start.isScalable() && isa<IntrinsicInst>(I))) &&
8675 "Should not predicate a uniform recipe");
8676 auto *Recipe = new VPReplicateRecipe(I, mapToVPValues(I->operands()),
8677 IsUniform, BlockInMask);
8678 return Recipe;
8679}
8680
8681/// Find all possible partial reductions in the loop and track all of those that
8682/// are valid so recipes can be formed later.
8684 // Find all possible partial reductions.
8686 PartialReductionChains;
8687 for (const auto &[Phi, RdxDesc] : Legal->getReductionVars())
8688 if (std::optional<std::pair<PartialReductionChain, unsigned>> Pair =
8689 getScaledReduction(Phi, RdxDesc, Range))
8690 PartialReductionChains.push_back(*Pair);
8691
8692 // A partial reduction is invalid if any of its extends are used by
8693 // something that isn't another partial reduction. This is because the
8694 // extends are intended to be lowered along with the reduction itself.
8695
8696 // Build up a set of partial reduction bin ops for efficient use checking.
8697 SmallSet<User *, 4> PartialReductionBinOps;
8698 for (const auto &[PartialRdx, _] : PartialReductionChains)
8699 PartialReductionBinOps.insert(PartialRdx.BinOp);
8700
8701 auto ExtendIsOnlyUsedByPartialReductions =
8702 [&PartialReductionBinOps](Instruction *Extend) {
8703 return all_of(Extend->users(), [&](const User *U) {
8704 return PartialReductionBinOps.contains(U);
8705 });
8706 };
8707
8708 // Check if each use of a chain's two extends is a partial reduction
8709 // and only add those that don't have non-partial reduction users.
8710 for (auto Pair : PartialReductionChains) {
8711 PartialReductionChain Chain = Pair.first;
8712 if (ExtendIsOnlyUsedByPartialReductions(Chain.ExtendA) &&
8713 ExtendIsOnlyUsedByPartialReductions(Chain.ExtendB))
8714 ScaledReductionMap.insert(std::make_pair(Chain.Reduction, Pair.second));
8715 }
8716}
8717
8718std::optional<std::pair<PartialReductionChain, unsigned>>
8719VPRecipeBuilder::getScaledReduction(PHINode *PHI,
8720 const RecurrenceDescriptor &Rdx,
8721 VFRange &Range) {
8722 // TODO: Allow scaling reductions when predicating. The select at
8723 // the end of the loop chooses between the phi value and most recent
8724 // reduction result, both of which have different VFs to the active lane
8725 // mask when scaling.
8727 return std::nullopt;
8728
8729 auto *Update = dyn_cast<BinaryOperator>(Rdx.getLoopExitInstr());
8730 if (!Update)
8731 return std::nullopt;
8732
8733 Value *Op = Update->getOperand(0);
8734 Value *PhiOp = Update->getOperand(1);
8735 if (Op == PHI) {
8736 Op = Update->getOperand(1);
8737 PhiOp = Update->getOperand(0);
8738 }
8739 if (PhiOp != PHI)
8740 return std::nullopt;
8741
8742 auto *BinOp = dyn_cast<BinaryOperator>(Op);
8743 if (!BinOp || !BinOp->hasOneUse())
8744 return std::nullopt;
8745
8746 using namespace llvm::PatternMatch;
8747 Value *A, *B;
8748 if (!match(BinOp->getOperand(0), m_ZExtOrSExt(m_Value(A))) ||
8749 !match(BinOp->getOperand(1), m_ZExtOrSExt(m_Value(B))))
8750 return std::nullopt;
8751
8752 Instruction *ExtA = cast<Instruction>(BinOp->getOperand(0));
8753 Instruction *ExtB = cast<Instruction>(BinOp->getOperand(1));
8754
8759
8760 PartialReductionChain Chain(Rdx.getLoopExitInstr(), ExtA, ExtB, BinOp);
8761
8762 unsigned TargetScaleFactor =
8763 PHI->getType()->getPrimitiveSizeInBits().getKnownScalarFactor(
8764 A->getType()->getPrimitiveSizeInBits());
8765
8767 [&](ElementCount VF) {
8769 Update->getOpcode(), A->getType(), B->getType(), PHI->getType(),
8770 VF, OpAExtend, OpBExtend,
8771 std::make_optional(BinOp->getOpcode()));
8772 return Cost.isValid();
8773 },
8774 Range))
8775 return std::make_pair(Chain, TargetScaleFactor);
8776
8777 return std::nullopt;
8778}
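// For illustration only: a chain accepted above looks like
//
//   int32_t sum = 0;
//   for (i = 0; i < n; ++i)
//     sum += (int32_t)a[i] * (int32_t)b[i];   // a, b have 8-bit elements
//
// where Update is the add, BinOp is the mul, ExtendA/ExtendB are the two
// extends, and the scale factor is 32 / 8 = 4: four scalar elements are
// accumulated into each lane of the narrower partial-reduction vector.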
8779
8783 VFRange &Range, VPBasicBlock *VPBB) {
8784 // First, check for specific widening recipes that deal with inductions, Phi
8785 // nodes, calls and memory operations.
8786 VPRecipeBase *Recipe;
8787 if (auto *Phi = dyn_cast<PHINode>(Instr)) {
8788 if (Phi->getParent() != OrigLoop->getHeader())
8789 return tryToBlend(Phi, Operands);
8790
8791 if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, Range)))
8792 return Recipe;
8793
8794 VPHeaderPHIRecipe *PhiRecipe = nullptr;
8795 assert((Legal->isReductionVariable(Phi) ||
8796 Legal->isFixedOrderRecurrence(Phi)) &&
8797 "can only widen reductions and fixed-order recurrences here");
8798 VPValue *StartV = Operands[0];
8799 if (Legal->isReductionVariable(Phi)) {
8800 const RecurrenceDescriptor &RdxDesc =
8801 Legal->getReductionVars().find(Phi)->second;
8802 assert(RdxDesc.getRecurrenceStartValue() ==
8803 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
8804
8805 // If the PHI is used by a partial reduction, set the scale factor.
8806 unsigned ScaleFactor =
8807 getScalingForReduction(RdxDesc.getLoopExitInstr()).value_or(1);
8808 PhiRecipe = new VPReductionPHIRecipe(
8809 Phi, RdxDesc, *StartV, CM.isInLoopReduction(Phi),
8810 CM.useOrderedReductions(RdxDesc), ScaleFactor);
8811 } else {
8812 // TODO: Currently fixed-order recurrences are modeled as chains of
8813 // first-order recurrences. If there are no users of the intermediate
8814 // recurrences in the chain, the fixed order recurrence should be modeled
8815 // directly, enabling more efficient codegen.
8816 PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV);
8817 }
8818
8819 PhisToFix.push_back(PhiRecipe);
8820 return PhiRecipe;
8821 }
8822
8823 if (isa<TruncInst>(Instr) && (Recipe = tryToOptimizeInductionTruncate(
8824 cast<TruncInst>(Instr), Operands, Range)))
8825 return Recipe;
8826
8827 // All widen recipes below deal only with VF > 1.
8829 [&](ElementCount VF) { return VF.isScalar(); }, Range))
8830 return nullptr;
8831
8832 if (auto *CI = dyn_cast<CallInst>(Instr))
8833 return tryToWidenCall(CI, Operands, Range);
8834
8835 if (StoreInst *SI = dyn_cast<StoreInst>(Instr))
8836 if (auto HistInfo = Legal->getHistogramInfo(SI))
8837 return tryToWidenHistogram(*HistInfo, Operands);
8838
8839 if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
8840 return tryToWidenMemory(Instr, Operands, Range);
8841
8842 if (getScalingForReduction(Instr))
8844
8845 if (!shouldWiden(Instr, Range))
8846 return nullptr;
8847
8848 if (auto *GEP = dyn_cast<GetElementPtrInst>(Instr))
8849 return new VPWidenGEPRecipe(GEP,
8850 make_range(Operands.begin(), Operands.end()));
8851
8852 if (auto *SI = dyn_cast<SelectInst>(Instr)) {
8853 return new VPWidenSelectRecipe(
8854 *SI, make_range(Operands.begin(), Operands.end()));
8855 }
8856
8857 if (auto *CI = dyn_cast<CastInst>(Instr)) {
8858 return new VPWidenCastRecipe(CI->getOpcode(), Operands[0], CI->getType(),
8859 *CI);
8860 }
8861
8862 return tryToWiden(Instr, Operands, VPBB);
8863}
8864
8868 assert(Operands.size() == 2 &&
8869 "Unexpected number of operands for partial reduction");
8870
8871 VPValue *BinOp = Operands[0];
8872 VPValue *Phi = Operands[1];
8873 if (isa<VPReductionPHIRecipe>(BinOp->getDefiningRecipe()))
8874 std::swap(BinOp, Phi);
8875
8876 return new VPPartialReductionRecipe(Reduction->getOpcode(), BinOp, Phi,
8877 Reduction);
8878}
8879
8880void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
8881 ElementCount MaxVF) {
8882 assert(OrigLoop->isInnermost() && "Inner loop expected.");
8883
8884 auto MaxVFTimes2 = MaxVF * 2;
8885 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) {
8886 VFRange SubRange = {VF, MaxVFTimes2};
8887 if (auto Plan = tryToBuildVPlanWithVPRecipes(SubRange)) {
8888 // Now optimize the initial VPlan.
8889 if (!Plan->hasVF(ElementCount::getFixed(1)))
8891 CM.getMinimalBitwidths());
8893 // TODO: try to put it close to addActiveLaneMask().
8894 // Discard the plan if it is not EVL-compatible
8896 *Plan, CM.getMaxSafeElements()))
8897 break;
8898 assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
8899 VPlans.push_back(std::move(Plan));
8900 }
8901 VF = SubRange.End;
8902 }
8903}
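// Worked example of the VF iteration above (illustrative): with MinVF = 4 and
// MaxVF = 16, the first attempted range is [4, 32). While recipes are built,
// getDecisionAndClampRange may clamp it (say to [4, 8)); a plan is emitted for
// the clamped range and the loop then continues from SubRange.End (here 8).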
8904
8905// Add the necessary canonical IV and branch recipes required to control the
8906// loop.
8907static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW,
8908 DebugLoc DL) {
8909 Value *StartIdx = ConstantInt::get(IdxTy, 0);
8910 auto *StartV = Plan.getOrAddLiveIn(StartIdx);
8911
8912 // Add a VPCanonicalIVPHIRecipe starting at 0 to the header.
8913 auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL);
8914 VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
8915 VPBasicBlock *Header = TopRegion->getEntryBasicBlock();
8916 Header->insert(CanonicalIVPHI, Header->begin());
8917
8918 VPBuilder Builder(TopRegion->getExitingBasicBlock());
8919 // Add a VPInstruction to increment the scalar canonical IV by VF * UF.
8920 auto *CanonicalIVIncrement = Builder.createOverflowingOp(
8921 Instruction::Add, {CanonicalIVPHI, &Plan.getVFxUF()}, {HasNUW, false}, DL,
8922 "index.next");
8923 CanonicalIVPHI->addOperand(CanonicalIVIncrement);
8924
8925 // Add the BranchOnCount VPInstruction to the latch.
8927 {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL);
8928}
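// Shorthand for the recipes added above (VPlan-style pseudo output, for
// illustration only):
//
//   vector.body:
//     EMIT %index = CANONICAL-INDUCTION ir<0>, %index.next
//     ...
//   (latch)
//     EMIT %index.next = add %index, %VFxUF    ; with nuw when HasNUW
//     EMIT branch-on-count %index.next, %vector-trip-count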
8929
8930/// Create and return a ResumePhi for \p WideIV, unless it is truncated. If the
8931/// induction recipe is not canonical, creates a VPDerivedIVRecipe to compute
8932/// the end value of the induction.
8934 VPWidenInductionRecipe *WideIV, VPBuilder &VectorPHBuilder,
8935 VPBuilder &ScalarPHBuilder, VPTypeAnalysis &TypeInfo, VPValue *VectorTC) {
8936 auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
8937 // Truncated wide inductions resume from the last lane of their vector value
8938 // in the last vector iteration which is handled elsewhere.
8939 if (WideIntOrFp && WideIntOrFp->getTruncInst())
8940 return nullptr;
8941
8942 VPValue *Start = WideIV->getStartValue();
8943 VPValue *Step = WideIV->getStepValue();
8945 VPValue *EndValue = VectorTC;
8946 if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
8947 EndValue = VectorPHBuilder.createDerivedIV(
8948 ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
8949 Start, VectorTC, Step);
8950 }
8951
8952 // EndValue is derived from the vector trip count (which has the same type as
8953 // the widest induction) and thus may be wider than the induction here.
8954 Type *ScalarTypeOfWideIV = TypeInfo.inferScalarType(WideIV);
8955 if (ScalarTypeOfWideIV != TypeInfo.inferScalarType(EndValue)) {
8956 EndValue = VectorPHBuilder.createScalarCast(Instruction::Trunc, EndValue,
8957 ScalarTypeOfWideIV,
8958 WideIV->getDebugLoc());
8959 }
8960
8961 auto *ResumePhiRecipe =
8962 ScalarPHBuilder.createNaryOp(VPInstruction::ResumePhi, {EndValue, Start},
8963 WideIV->getDebugLoc(), "bc.resume.val");
8964 return ResumePhiRecipe;
8965}
8966
8967/// Create resume phis in the scalar preheader for first-order recurrences,
8968/// reductions and inductions, and update the VPIRInstructions wrapping the
8969/// original phis in the scalar header. End values for inductions are added to
8970/// \p IVEndValues.
8971static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan,
8972 DenseMap<VPValue *, VPValue *> &IVEndValues) {
8973 VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType());
8974 auto *ScalarPH = Plan.getScalarPreheader();
8975 auto *MiddleVPBB = cast<VPBasicBlock>(ScalarPH->getSinglePredecessor());
8976 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
8977 VPBuilder VectorPHBuilder(
8978 cast<VPBasicBlock>(VectorRegion->getSinglePredecessor()));
8979 VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
8980 VPBuilder ScalarPHBuilder(ScalarPH);
8981 VPValue *OneVPV = Plan.getOrAddLiveIn(
8982 ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 1));
8983 for (VPRecipeBase &ScalarPhiR : *Plan.getScalarHeader()) {
8984 auto *ScalarPhiIRI = cast<VPIRInstruction>(&ScalarPhiR);
8985 auto *ScalarPhiI = dyn_cast<PHINode>(&ScalarPhiIRI->getInstruction());
8986 if (!ScalarPhiI)
8987 break;
8988
8989 // TODO: Extract final value from induction recipe initially, optimize to
8990 // pre-computed end value together in optimizeInductionExitUsers.
8991 auto *VectorPhiR = cast<VPHeaderPHIRecipe>(Builder.getRecipe(ScalarPhiI));
8992 if (auto *WideIVR = dyn_cast<VPWidenInductionRecipe>(VectorPhiR)) {
8994 WideIVR, VectorPHBuilder, ScalarPHBuilder, TypeInfo,
8995 &Plan.getVectorTripCount())) {
8996 assert(ResumePhi->getOpcode() == VPInstruction::ResumePhi &&
8997 "Expected a ResumePhi");
8998 IVEndValues[WideIVR] = ResumePhi->getOperand(0);
8999 ScalarPhiIRI->addOperand(ResumePhi);
9000 continue;
9001 }
9002 // TODO: Also handle truncated inductions here. Computing end-values
9003 // separately should be done as VPlan-to-VPlan optimization, after
9004 // legalizing all resume values to use the last lane from the loop.
9005 assert(cast<VPWidenIntOrFpInductionRecipe>(VectorPhiR)->getTruncInst() &&
9006 "should only skip truncated wide inductions");
9007 continue;
9008 }
9009
9010 // The backedge value provides the value to resume coming out of a loop,
9011 // which for FORs is a vector whose last element needs to be extracted. The
9012 // start value provides the value if the loop is bypassed.
9013 bool IsFOR = isa<VPFirstOrderRecurrencePHIRecipe>(VectorPhiR);
9014 auto *ResumeFromVectorLoop = VectorPhiR->getBackedgeValue();
9015 assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
9016 "Cannot handle loops with uncountable early exits");
9017 if (IsFOR)
9018 ResumeFromVectorLoop = MiddleBuilder.createNaryOp(
9019 VPInstruction::ExtractFromEnd, {ResumeFromVectorLoop, OneVPV}, {},
9020 "vector.recur.extract");
9021 StringRef Name = IsFOR ? "scalar.recur.init" : "bc.merge.rdx";
9022 auto *ResumePhiR = ScalarPHBuilder.createNaryOp(
9024 {ResumeFromVectorLoop, VectorPhiR->getStartValue()}, {}, Name);
9025 ScalarPhiIRI->addOperand(ResumePhiR);
9026 }
9027}
9028
9029// Collect VPIRInstructions for phis in the exit blocks that are modeled
9030// in VPlan and add the exiting VPValue as operand.
9033 VPlan &Plan) {
9034 auto *MiddleVPBB = Plan.getMiddleBlock();
9035 SetVector<VPIRInstruction *> ExitUsersToFix;
9036 for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks()) {
9037 for (VPRecipeBase &R : *ExitVPBB) {
9038 auto *ExitIRI = dyn_cast<VPIRInstruction>(&R);
9039 if (!ExitIRI)
9040 continue;
9041 auto *ExitPhi = dyn_cast<PHINode>(&ExitIRI->getInstruction());
9042 if (!ExitPhi)
9043 break;
9044 for (VPBlockBase *PredVPBB : ExitVPBB->getPredecessors()) {
9045 BasicBlock *ExitingBB = OrigLoop->getLoopLatch();
9046 if (PredVPBB != MiddleVPBB) {
9047 SmallVector<BasicBlock *> ExitingBlocks;
9048 OrigLoop->getExitingBlocks(ExitingBlocks);
9049 assert(ExitingBlocks.size() == 2 && "only support 2 exiting blocks");
9050 ExitingBB = ExitingBB == ExitingBlocks[0] ? ExitingBlocks[1]
9051 : ExitingBlocks[0];
9052 }
9053 Value *IncomingValue = ExitPhi->getIncomingValueForBlock(ExitingBB);
9054 VPValue *V = Builder.getVPValueOrAddLiveIn(IncomingValue);
9055 ExitUsersToFix.insert(ExitIRI);
9056 ExitIRI->addOperand(V);
9057 }
9058 }
9059 }
9060 return ExitUsersToFix;
9061}
9062
9063// Add exit values to \p Plan. Extracts are added for each entry in \p
9064// ExitUsersToFix if needed and their operands are updated. Returns true if all
9065// exit users can be handled, otherwise return false.
9066static bool
9068 const SetVector<VPIRInstruction *> &ExitUsersToFix) {
9069 if (ExitUsersToFix.empty())
9070 return true;
9071
9072 auto *MiddleVPBB = Plan.getMiddleBlock();
9073 VPBuilder B(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
9074 VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType());
9075
9076 // Introduce extract for exiting values and update the VPIRInstructions
9077 // modeling the corresponding LCSSA phis.
9078 for (VPIRInstruction *ExitIRI : ExitUsersToFix) {
9079 for (const auto &[Idx, Op] : enumerate(ExitIRI->operands())) {
9080 // Pass live-in values used by exit phis directly through to their users
9081 // in the exit block.
9082 if (Op->isLiveIn())
9083 continue;
9084
9085 // Currently only live-ins can be used by exit values from blocks not
9086 // exiting via the vector latch through to the middle block.
9087 if (ExitIRI->getParent()->getSinglePredecessor() != MiddleVPBB)
9088 return false;
9089
9090 LLVMContext &Ctx = ExitIRI->getInstruction().getContext();
9091 VPValue *Ext = B.createNaryOp(VPInstruction::ExtractFromEnd,
9092 {Op, Plan.getOrAddLiveIn(ConstantInt::get(
9093 IntegerType::get(Ctx, 32), 1))});
9094 ExitIRI->setOperand(Idx, Ext);
9095 }
9096 }
9097 return true;
9098}
9099
9100/// Handle users in the exit block for first order reductions in the original
9101/// exit block. The penultimate value of recurrences is fed to their LCSSA phi
9102/// users in the original exit block using the VPIRInstruction wrapping to the
9103/// LCSSA phi.
9105 VPlan &Plan, SetVector<VPIRInstruction *> &ExitUsersToFix) {
9106 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
9107 auto *ScalarPHVPBB = Plan.getScalarPreheader();
9108 auto *MiddleVPBB = Plan.getMiddleBlock();
9109 VPBuilder ScalarPHBuilder(ScalarPHVPBB);
9110 VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
9111 VPValue *TwoVPV = Plan.getOrAddLiveIn(
9112 ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 2));
9113
9114 for (auto &HeaderPhi : VectorRegion->getEntryBasicBlock()->phis()) {
9115 auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&HeaderPhi);
9116 if (!FOR)
9117 continue;
9118
9119 assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
9120 "Cannot handle loops with uncountable early exits");
9121
9122 // This is the second phase of vectorizing first-order recurrences, creating
 9123 // extracts for users outside the loop. An overview of the transformation is
9124 // described below. Suppose we have the following loop with some use after
9125 // the loop of the last a[i-1],
9126 //
9127 // for (int i = 0; i < n; ++i) {
9128 // t = a[i - 1];
9129 // b[i] = a[i] - t;
9130 // }
9131 // use t;
9132 //
9133 // There is a first-order recurrence on "a". For this loop, the shorthand
9134 // scalar IR looks like:
9135 //
9136 // scalar.ph:
9137 // s.init = a[-1]
9138 // br scalar.body
9139 //
9140 // scalar.body:
9141 // i = phi [0, scalar.ph], [i+1, scalar.body]
9142 // s1 = phi [s.init, scalar.ph], [s2, scalar.body]
9143 // s2 = a[i]
9144 // b[i] = s2 - s1
9145 // br cond, scalar.body, exit.block
9146 //
9147 // exit.block:
9148 // use = lcssa.phi [s1, scalar.body]
9149 //
 9150 // In this example, s1 is a recurrence because its value depends on the
9151 // previous iteration. In the first phase of vectorization, we created a
9152 // VPFirstOrderRecurrencePHIRecipe v1 for s1. Now we create the extracts
9153 // for users in the scalar preheader and exit block.
9154 //
9155 // vector.ph:
9156 // v_init = vector(..., ..., ..., a[-1])
9157 // br vector.body
9158 //
9159 // vector.body
9160 // i = phi [0, vector.ph], [i+4, vector.body]
9161 // v1 = phi [v_init, vector.ph], [v2, vector.body]
9162 // v2 = a[i, i+1, i+2, i+3]
9163 // b[i] = v2 - v1
9164 // // Next, third phase will introduce v1' = splice(v1(3), v2(0, 1, 2))
9165 // b[i, i+1, i+2, i+3] = v2 - v1
9166 // br cond, vector.body, middle.block
9167 //
9168 // middle.block:
9169 // vector.recur.extract.for.phi = v2(2)
9170 // vector.recur.extract = v2(3)
9171 // br cond, scalar.ph, exit.block
9172 //
9173 // scalar.ph:
9174 // scalar.recur.init = phi [vector.recur.extract, middle.block],
9175 // [s.init, otherwise]
9176 // br scalar.body
9177 //
9178 // scalar.body:
9179 // i = phi [0, scalar.ph], [i+1, scalar.body]
9180 // s1 = phi [scalar.recur.init, scalar.ph], [s2, scalar.body]
9181 // s2 = a[i]
9182 // b[i] = s2 - s1
9183 // br cond, scalar.body, exit.block
9184 //
9185 // exit.block:
9186 // lo = lcssa.phi [s1, scalar.body],
9187 // [vector.recur.extract.for.phi, middle.block]
9188 //
9189 // Now update VPIRInstructions modeling LCSSA phis in the exit block.
9190 // Extract the penultimate value of the recurrence and use it as the operand for
9191 // the VPIRInstruction modeling the phi.
9192 for (VPIRInstruction *ExitIRI : ExitUsersToFix) {
9193 if (ExitIRI->getOperand(0) != FOR)
9194 continue;
9195 VPValue *PenultimateElement = MiddleBuilder.createNaryOp(
9196 VPInstruction::ExtractFromEnd, {FOR->getBackedgeValue(), TwoVPV}, {},
9197 "vector.recur.extract.for.phi");
9198 ExitIRI->setOperand(0, PenultimateElement);
9199 ExitUsersToFix.remove(ExitIRI);
9200 }
9201 }
9202}
9203
9204VPlanPtr
9205LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
9206
9208
9209 // ---------------------------------------------------------------------------
9210 // Build initial VPlan: Scan the body of the loop in a topological order to
9211 // visit each basic block after having visited its predecessor basic blocks.
9212 // ---------------------------------------------------------------------------
9213
9214 // Create initial VPlan skeleton, having a basic block for the pre-header
9215 // which contains SCEV expansions that need to happen before the CFG is
9216 // modified; a basic block for the vector pre-header, followed by a region for
9217 // the vector loop, followed by the middle basic block. The skeleton vector
9218 // loop region contains a header and latch basic blocks.
9219
9220 bool RequiresScalarEpilogueCheck =
9222 [this](ElementCount VF) {
9223 return !CM.requiresScalarEpilogue(VF.isVector());
9224 },
9225 Range);
9227 PSE, RequiresScalarEpilogueCheck,
9228 CM.foldTailByMasking(), OrigLoop);
9229
9230 // Don't use getDecisionAndClampRange here, because we don't know the UF,
9231 // so it is better for this function to be conservative rather than to
9232 // split the range up into different VPlans.
9233 // TODO: Consider using getDecisionAndClampRange here to split up VPlans.
9234 bool IVUpdateMayOverflow = false;
9235 for (ElementCount VF : Range)
9236 IVUpdateMayOverflow |= !isIndvarOverflowCheckKnownFalse(&CM, VF);
9237
9239 TailFoldingStyle Style = CM.getTailFoldingStyle(IVUpdateMayOverflow);
9240 // Use NUW for the induction increment if we proved that it won't overflow in
9241 // the vector loop or when not folding the tail. In the latter case, we know
9242 // that the canonical induction increment will not overflow as the vector trip
9243 // count is >= increment and a multiple of the increment.
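  // Illustrative sketch (not part of the source): with VF = 4 and no tail
  // folding, a trip count of 10 yields a vector trip count of 8, so the
  // canonical IV takes the values 0, 4 and 8 and then exits; the increment by
  // 4 can never wrap and may therefore carry the NUW flag.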
9244 bool HasNUW = !IVUpdateMayOverflow || Style == TailFoldingStyle::None;
9245 addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, DL);
9246
9247 VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, &TTI, Legal, CM, PSE,
9248 Builder);
9249
9250 // ---------------------------------------------------------------------------
9251 // Pre-construction: record ingredients whose recipes we'll need to further
9252 // process after constructing the initial VPlan.
9253 // ---------------------------------------------------------------------------
9254
9255 // For each interleave group which is relevant for this (possibly trimmed)
9256 // Range, add it to the set of groups to be later applied to the VPlan and add
9257 // placeholders for its members' Recipes which we'll be replacing with a
9258 // single VPInterleaveRecipe.
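  // Illustrative example (assumption, not taken from the source): in
  //   for (i) { x = A[2*i]; y = A[2*i+1]; ... }
  // the two strided loads form an interleave group of factor 2; their widened
  // load recipes are later replaced by a single wide load of 2*VF elements
  // plus shuffles (or a (de)interleave2 intrinsic for scalable vectors).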
9260 auto ApplyIG = [IG, this](ElementCount VF) -> bool {
9261 bool Result = (VF.isVector() && // Query is illegal for VF == 1
9262 CM.getWideningDecision(IG->getInsertPos(), VF) ==
9264 // For scalable vectors, the only interleave factor currently supported
9265 // is 2 since we require the (de)interleave2 intrinsics instead of
9266 // shufflevectors.
9267 assert((!Result || !VF.isScalable() || IG->getFactor() == 2) &&
9268 "Unsupported interleave factor for scalable vectors");
9269 return Result;
9270 };
9271 if (!getDecisionAndClampRange(ApplyIG, Range))
9272 continue;
9273 InterleaveGroups.insert(IG);
9274 }
9275
9276 // ---------------------------------------------------------------------------
9277 // Construct recipes for the instructions in the loop
9278 // ---------------------------------------------------------------------------
9279
9280 // Scan the body of the loop in a topological order to visit each basic block
9281 // after having visited its predecessor basic blocks.
9282 LoopBlocksDFS DFS(OrigLoop);
9283 DFS.perform(LI);
9284
9285 VPBasicBlock *HeaderVPBB = Plan->getVectorLoopRegion()->getEntryBasicBlock();
9286 VPBasicBlock *VPBB = HeaderVPBB;
9287 BasicBlock *HeaderBB = OrigLoop->getHeader();
9288 bool NeedsMasks =
9289 CM.foldTailByMasking() ||
9290 any_of(OrigLoop->blocks(), [this, HeaderBB](BasicBlock *BB) {
9291 bool NeedsBlends = BB != HeaderBB && !BB->phis().empty();
9292 return Legal->blockNeedsPredication(BB) || NeedsBlends;
9293 });
9294
9295 RecipeBuilder.collectScaledReductions(Range);
9296
9297 auto *MiddleVPBB = Plan->getMiddleBlock();
9298 VPBasicBlock::iterator MBIP = MiddleVPBB->getFirstNonPhi();
9299 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
9300 // Relevant instructions from basic block BB will be grouped into VPRecipe
9301 // ingredients and fill a new VPBasicBlock.
9302 if (VPBB != HeaderVPBB)
9303 VPBB->setName(BB->getName());
9304 Builder.setInsertPoint(VPBB);
9305
9306 if (VPBB == HeaderVPBB)
9307 RecipeBuilder.createHeaderMask();
9308 else if (NeedsMasks)
9309 RecipeBuilder.createBlockInMask(BB);
9310
9311 // Introduce each ingredient into VPlan.
9312 // TODO: Model and preserve debug intrinsics in VPlan.
9313 for (Instruction &I : drop_end(BB->instructionsWithoutDebug(false))) {
9314 Instruction *Instr = &I;
9316 auto *Phi = dyn_cast<PHINode>(Instr);
9317 if (Phi && Phi->getParent() == HeaderBB) {
9318 Operands.push_back(Plan->getOrAddLiveIn(
9319 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())));
9320 } else {
9321 auto OpRange = RecipeBuilder.mapToVPValues(Instr->operands());
9322 Operands = {OpRange.begin(), OpRange.end()};
9323 }
9324
9325 // Stores with an invariant address inside the loop will be deleted, and
9326 // in the exit block, a uniform store recipe will be created for the final
9327 // invariant store of the reduction.
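  // Illustrative example (assumption, not taken from the source): in
  //   for (i = 0; i < n; ++i) { sum += a[i]; *p = sum; }
  // with p loop-invariant, only the final value of 'sum' is observable, so
  // the in-loop stores are dropped and a single uniform store of the
  // reduction result is emitted after the vector loop.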
9328 StoreInst *SI;
9329 if ((SI = dyn_cast<StoreInst>(&I)) &&
9330 Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) {
9331 // Only create recipe for the final invariant store of the reduction.
9332 if (!Legal->isInvariantStoreOfReduction(SI))
9333 continue;
9334 auto *Recipe = new VPReplicateRecipe(
9335 SI, RecipeBuilder.mapToVPValues(Instr->operands()),
9336 true /* IsUniform */);
9337 Recipe->insertBefore(*MiddleVPBB, MBIP);
9338 continue;
9339 }
9340
9341 VPRecipeBase *Recipe =
9342 RecipeBuilder.tryToCreateWidenRecipe(Instr, Operands, Range, VPBB);
9343 if (!Recipe)
9344 Recipe = RecipeBuilder.handleReplication(Instr, Range);
9345
9346 RecipeBuilder.setRecipe(Instr, Recipe);
9347 if (isa<VPHeaderPHIRecipe>(Recipe)) {
9348 // VPHeaderPHIRecipes must be kept in the phi section of HeaderVPBB. In
9349 // the following cases, VPHeaderPHIRecipes may be created after non-phi
9350 // recipes and need to be moved to the phi section of HeaderVPBB:
9351 // * tail-folding (non-phi recipes computing the header mask are
9352 // introduced earlier than regular header phi recipes, and should appear
9353 // after them)
9354 // * Optimizing truncates to VPWidenIntOrFpInductionRecipe.
9355
9356 assert((HeaderVPBB->getFirstNonPhi() == VPBB->end() ||
9357 CM.foldTailByMasking() || isa<TruncInst>(Instr)) &&
9358 "unexpected recipe needs moving");
9359 Recipe->insertBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi());
9360 } else
9361 VPBB->appendRecipe(Recipe);
9362 }
9363
9364 VPBlockUtils::insertBlockAfter(Plan->createVPBasicBlock(""), VPBB);
9365 VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor());
9366 }
9367
9368 // After here, VPBB should not be used.
9369 VPBB = nullptr;
9370
9371 assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) &&
9372 !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() &&
9373 "entry block must be set to a VPRegionBlock having a non-empty entry "
9374 "VPBasicBlock");
9375 RecipeBuilder.fixHeaderPhis();
9376
9377 // Update wide induction increments to use the same step as the corresponding
9378 // wide induction. This enables detecting induction increments directly in
9379 // VPlan and removes redundant splats.
9380 for (const auto &[Phi, ID] : Legal->getInductionVars()) {
9381 auto *IVInc = cast<Instruction>(
9382 Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch()));
9383 if (IVInc->getOperand(0) != Phi || IVInc->getOpcode() != Instruction::Add)
9384 continue;
9385 VPWidenInductionRecipe *WideIV =
9386 cast<VPWidenInductionRecipe>(RecipeBuilder.getRecipe(Phi));
9387 VPRecipeBase *R = RecipeBuilder.getRecipe(IVInc);
9388 R->setOperand(1, WideIV->getStepValue());
9389 }
9390
9391 if (auto *UncountableExitingBlock =
9394 *Plan, *PSE.getSE(), OrigLoop, UncountableExitingBlock, RecipeBuilder);
9395 }
9397 addScalarResumePhis(RecipeBuilder, *Plan, IVEndValues);
9398 SetVector<VPIRInstruction *> ExitUsersToFix =
9399 collectUsersInExitBlocks(OrigLoop, RecipeBuilder, *Plan);
9400 addExitUsersForFirstOrderRecurrences(*Plan, ExitUsersToFix);
9401 if (!addUsersInExitBlocks(*Plan, ExitUsersToFix)) {
9403 "Some exit values in loop with uncountable exit not supported yet",
9404 "UncountableEarlyExitLoopsUnsupportedExitValue", ORE, OrigLoop);
9405 return nullptr;
9406 }
9407
9408 // ---------------------------------------------------------------------------
9409 // Transform initial VPlan: Apply previously taken decisions, in order, to
9410 // bring the VPlan to its final state.
9411 // ---------------------------------------------------------------------------
9412
9413 // Adjust the recipes for any inloop reductions.
9414 adjustRecipesForReductions(Plan, RecipeBuilder, Range.Start);
9415
9416 // Interleave memory: for each Interleave Group we marked earlier as relevant
9417 // for this VPlan, replace the Recipes widening its memory instructions with a
9418 // single VPInterleaveRecipe at its insertion point.
9420 *Plan, InterleaveGroups, RecipeBuilder, CM.isScalarEpilogueAllowed());
9421
9422 for (ElementCount VF : Range)
9423 Plan->addVF(VF);
9424 Plan->setName("Initial VPlan");
9425
9426 // Replace VPValues for known constant strides guaranteed by predicated
9427 // scalar evolution.
9428 auto CanUseVersionedStride = [&Plan](VPUser &U, unsigned) {
9429 auto *R = cast<VPRecipeBase>(&U);
9430 return R->getParent()->getParent() ||
9431 R->getParent() ==
9432 Plan->getVectorLoopRegion()->getSinglePredecessor();
9433 };
9434 for (auto [_, Stride] : Legal->getLAI()->getSymbolicStrides()) {
9435 auto *StrideV = cast<SCEVUnknown>(Stride)->getValue();
9436 auto *ScevStride = dyn_cast<SCEVConstant>(PSE.getSCEV(StrideV));
9437 // Only handle constant strides for now.
9438 if (!ScevStride)
9439 continue;
9440
9441 auto *CI = Plan->getOrAddLiveIn(
9442 ConstantInt::get(Stride->getType(), ScevStride->getAPInt()));
9443 if (VPValue *StrideVPV = Plan->getLiveIn(StrideV))
9444 StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
9445
9446 // The versioned value may not be used in the loop directly but through a
9447 // sext/zext. Add new live-ins in those cases.
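  // Illustrative example (assumption, not taken from the source): if the SCEV
  // predicate guarantees an i32 stride %s == 1 but the loop only uses
  // (zext i32 %s to i64), the zext's live-in VPValue is replaced by a new i64
  // live-in constant 1.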
9448 for (Value *U : StrideV->users()) {
9449 if (!isa<SExtInst, ZExtInst>(U))
9450 continue;
9451 VPValue *StrideVPV = Plan->getLiveIn(U);
9452 if (!StrideVPV)
9453 continue;
9454 unsigned BW = U->getType()->getScalarSizeInBits();
9455 APInt C = isa<SExtInst>(U) ? ScevStride->getAPInt().sext(BW)
9456 : ScevStride->getAPInt().zext(BW);
9457 VPValue *CI = Plan->getOrAddLiveIn(ConstantInt::get(U->getType(), C));
9458 StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
9459 }
9460 }
9461
9463 return Legal->blockNeedsPredication(BB);
9464 });
9465
9466 // Sink users of fixed-order recurrences past the recipe defining the previous
9467 // value and introduce FirstOrderRecurrenceSplice VPInstructions.
9469 return nullptr;
9470
9471 if (useActiveLaneMask(Style)) {
9472 // TODO: Move checks to VPlanTransforms::addActiveLaneMask once
9473 // TailFoldingStyle is visible there.
9474 bool ForControlFlow = useActiveLaneMaskForControlFlow(Style);
9475 bool WithoutRuntimeCheck =
9477 VPlanTransforms::addActiveLaneMask(*Plan, ForControlFlow,
9478 WithoutRuntimeCheck);
9479 }
9481
9482 assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
9483 return Plan;
9484}
9485
9486VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
9487 // Outer loop handling: outer loops may require CFG and instruction level
9488 // transformations before even evaluating whether vectorization is profitable.
9489 // Since we cannot modify the incoming IR, we need to build VPlan upfront in
9490 // the vectorization pipeline.
9491 assert(!OrigLoop->isInnermost());
9492 assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
9493
9494 // Create new empty VPlan
9495 auto Plan = VPlan::createInitialVPlan(Legal->getWidestInductionType(), PSE,
9496 true, false, OrigLoop);
9497
9498 // Build hierarchical CFG
9499 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
9500 HCFGBuilder.buildHierarchicalCFG();
9501
9502 for (ElementCount VF : Range)
9503 Plan->addVF(VF);
9504
9506 Plan,
9507 [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); },
9508 *PSE.getSE(), *TLI);
9509
9510 // Remove the existing terminator of the exiting block of the top-most region.
9511 // A BranchOnCount will be added instead when adding the canonical IV recipes.
9512 auto *Term =
9513 Plan->getVectorLoopRegion()->getExitingBasicBlock()->getTerminator();
9514 Term->eraseFromParent();
9515
9516 // Tail folding is not supported for outer loops, so the induction increment
9517 // is guaranteed to not wrap.
9518 bool HasNUW = true;
9519 addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW,
9520 DebugLoc());
9521
9522 // Collect mapping of IR header phis to header phi recipes, to be used in
9523 // addScalarResumePhis.
9524 VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, &TTI, Legal, CM, PSE,
9525 Builder);
9526 for (auto &R : Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
9527 if (isa<VPCanonicalIVPHIRecipe>(&R))
9528 continue;
9529 auto *HeaderR = cast<VPHeaderPHIRecipe>(&R);
9530 RecipeBuilder.setRecipe(HeaderR->getUnderlyingInstr(), HeaderR);
9531 }
9533 // TODO: IVEndValues are not used yet in the native path, to optimize exit
9534 // values.
9535 addScalarResumePhis(RecipeBuilder, *Plan, IVEndValues);
9536
9537 assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
9538 return Plan;
9539}
9540
9541// Adjust the recipes for reductions. For in-loop reductions the chain of
9542// instructions leading from the loop exit instr to the phi needs to be converted
9543// to reductions, with one operand being vector and the other being the scalar
9544// reduction chain. For other reductions, a select is introduced between the phi
9545// and users outside the vector region when folding the tail.
9546//
9547// A ComputeReductionResult recipe is added to the middle block, also for
9548// in-loop reductions which compute their result in-loop, because generating
9549// the subsequent bc.merge.rdx phi is driven by ComputeReductionResult recipes.
9550//
9551// Adjust AnyOf reductions; replace the reduction phi for the selected value
9552// with a boolean reduction phi node to check if the condition is true in any
9553// iteration. The final value is selected by the final ComputeReductionResult.
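//
// Illustrative sketch (assumption, not taken from the source): for an in-loop
// integer add reduction
//   %red = phi [ 0, %ph ], [ %red.next, %loop ]
//   %red.next = add %red, %val
// the widened add link is replaced by a VPReductionRecipe taking the scalar
// chain value (%red), the vector operand for %val and an optional block mask,
// so each vector iteration produces an updated scalar partial result.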
9554void LoopVectorizationPlanner::adjustRecipesForReductions(
9555 VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, ElementCount MinVF) {
9556 using namespace VPlanPatternMatch;
9557 VPRegionBlock *VectorLoopRegion = Plan->getVectorLoopRegion();
9558 VPBasicBlock *Header = VectorLoopRegion->getEntryBasicBlock();
9559 VPBasicBlock *MiddleVPBB = Plan->getMiddleBlock();
9561
9562 for (VPRecipeBase &R : Header->phis()) {
9563 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
9564 if (!PhiR || !PhiR->isInLoop() || (MinVF.isScalar() && !PhiR->isOrdered()))
9565 continue;
9566
9567 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
9568 RecurKind Kind = RdxDesc.getRecurrenceKind();
9569 assert(
9572 "AnyOf and FindLast reductions are not allowed for in-loop reductions");
9573
9574 // Collect the chain of "link" recipes for the reduction starting at PhiR.
9576 Worklist.insert(PhiR);
9577 for (unsigned I = 0; I != Worklist.size(); ++I) {
9578 VPSingleDefRecipe *Cur = Worklist[I];
9579 for (VPUser *U : Cur->users()) {
9580 auto *UserRecipe = cast<VPSingleDefRecipe>(U);
9581 if (!UserRecipe->getParent()->getEnclosingLoopRegion()) {
9582 assert((UserRecipe->getParent() == MiddleVPBB ||
9583 UserRecipe->getParent() == Plan->getScalarPreheader()) &&
9584 "U must be either in the loop region, the middle block or the "
9585 "scalar preheader.");
9586 continue;
9587 }
9588 Worklist.insert(UserRecipe);
9589 }
9590 }
9591
9592 // Visit operation "Links" along the reduction chain top-down starting from
9593 // the phi until LoopExitValue. We keep track of the previous item
9594 // (PreviousLink) to tell which of the two operands of a Link will remain
9595 // scalar and which will be reduced. For minmax by select(cmp), Link will be
9596 // the select instructions. Blend recipes of in-loop reduction phi's will
9597 // get folded to their non-phi operand, as the reduction recipe handles the
9598 // condition directly.
9599 VPSingleDefRecipe *PreviousLink = PhiR; // Aka Worklist[0].
9600 for (VPSingleDefRecipe *CurrentLink : Worklist.getArrayRef().drop_front()) {
9601 Instruction *CurrentLinkI = CurrentLink->getUnderlyingInstr();
9602
9603 // Index of the first operand which holds a non-mask vector operand.
9604 unsigned IndexOfFirstOperand;
9605 // Recognize a call to the llvm.fmuladd intrinsic.
9606 bool IsFMulAdd = (Kind == RecurKind::FMulAdd);
9607 VPValue *VecOp;
9608 VPBasicBlock *LinkVPBB = CurrentLink->getParent();
9609 if (IsFMulAdd) {
9610 assert(
9612 "Expected instruction to be a call to the llvm.fmuladd intrinsic");
9613 assert(((MinVF.isScalar() && isa<VPReplicateRecipe>(CurrentLink)) ||
9614 isa<VPWidenIntrinsicRecipe>(CurrentLink)) &&
9615 CurrentLink->getOperand(2) == PreviousLink &&
9616 "expected a call where the previous link is the added operand");
9617
9618 // If the instruction is a call to the llvm.fmuladd intrinsic then we
9619 // need to create an fmul recipe (multiplying the first two operands of
9620 // the fmuladd together) to use as the vector operand for the fadd
9621 // reduction.
9622 VPInstruction *FMulRecipe = new VPInstruction(
9623 Instruction::FMul,
9624 {CurrentLink->getOperand(0), CurrentLink->getOperand(1)},
9625 CurrentLinkI->getFastMathFlags());
9626 LinkVPBB->insert(FMulRecipe, CurrentLink->getIterator());
9627 VecOp = FMulRecipe;
9628 } else {
9629 auto *Blend = dyn_cast<VPBlendRecipe>(CurrentLink);
9630 if (PhiR->isInLoop() && Blend) {
9631 assert(Blend->getNumIncomingValues() == 2 &&
9632 "Blend must have 2 incoming values");
9633 if (Blend->getIncomingValue(0) == PhiR)
9634 Blend->replaceAllUsesWith(Blend->getIncomingValue(1));
9635 else {
9636 assert(Blend->getIncomingValue(1) == PhiR &&
9637 "PhiR must be an operand of the blend");
9638 Blend->replaceAllUsesWith(Blend->getIncomingValue(0));
9639 }
9640 continue;
9641 }
9642
9644 if (isa<VPWidenRecipe>(CurrentLink)) {
9645 assert(isa<CmpInst>(CurrentLinkI) &&
9646 "need to have the compare of the select");
9647 continue;
9648 }
9649 assert(isa<VPWidenSelectRecipe>(CurrentLink) &&
9650 "must be a select recipe");
9651 IndexOfFirstOperand = 1;
9652 } else {
9653 assert((MinVF.isScalar() || isa<VPWidenRecipe>(CurrentLink)) &&
9654 "Expected to replace a VPWidenSC");
9655 IndexOfFirstOperand = 0;
9656 }
9657 // Note that for non-commutable operands (cmp-selects), the semantics of
9658 // the cmp-select are captured in the recurrence kind.
9659 unsigned VecOpId =
9660 CurrentLink->getOperand(IndexOfFirstOperand) == PreviousLink
9661 ? IndexOfFirstOperand + 1
9662 : IndexOfFirstOperand;
9663 VecOp = CurrentLink->getOperand(VecOpId);
9664 assert(VecOp != PreviousLink &&
9665 CurrentLink->getOperand(CurrentLink->getNumOperands() - 1 -
9666 (VecOpId - IndexOfFirstOperand)) ==
9667 PreviousLink &&
9668 "PreviousLink must be the operand other than VecOp");
9669 }
9670
9671 BasicBlock *BB = CurrentLinkI->getParent();
9672 VPValue *CondOp = nullptr;
9674 CondOp = RecipeBuilder.getBlockInMask(BB);
9675
9676 auto *RedRecipe = new VPReductionRecipe(
9677 RdxDesc, CurrentLinkI, PreviousLink, VecOp, CondOp,
9678 CM.useOrderedReductions(RdxDesc), CurrentLinkI->getDebugLoc());
9679 // Append the recipe to the end of the VPBasicBlock because we need to
9680 // ensure that it comes after all of its inputs, including CondOp.
9681 // Delete CurrentLink as it will be invalid if its operand is replaced
9682 // with a reduction defined at the bottom of the block in the next link.
9683 LinkVPBB->appendRecipe(RedRecipe);
9684 CurrentLink->replaceAllUsesWith(RedRecipe);
9685 ToDelete.push_back(CurrentLink);
9686 PreviousLink = RedRecipe;
9687 }
9688 }
9689 VPBasicBlock *LatchVPBB = VectorLoopRegion->getExitingBasicBlock();
9690 Builder.setInsertPoint(&*LatchVPBB->begin());
9691 VPBasicBlock::iterator IP = MiddleVPBB->getFirstNonPhi();
9692 for (VPRecipeBase &R :
9693 Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
9694 VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
9695 if (!PhiR)
9696 continue;
9697
9698 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
9699 // If the tail is folded by masking, introduce selects between the phi of
9700 // each reduction and its users outside the vector region, at the
9701 // beginning of the dedicated latch block.
9702 auto *OrigExitingVPV = PhiR->getBackedgeValue();
9703 auto *NewExitingVPV = PhiR->getBackedgeValue();
9704 if (!PhiR->isInLoop() && CM.foldTailByMasking()) {
9705 VPValue *Cond = RecipeBuilder.getBlockInMask(OrigLoop->getHeader());
9706 assert(OrigExitingVPV->getDefiningRecipe()->getParent() != LatchVPBB &&
9707 "reduction recipe must be defined before latch");
9708 Type *PhiTy = PhiR->getOperand(0)->getLiveInIRValue()->getType();
9709 std::optional<FastMathFlags> FMFs =
9710 PhiTy->isFloatingPointTy()
9711 ? std::make_optional(RdxDesc.getFastMathFlags())
9712 : std::nullopt;
9713 NewExitingVPV =
9714 Builder.createSelect(Cond, OrigExitingVPV, PhiR, {}, "", FMFs);
9715 OrigExitingVPV->replaceUsesWithIf(NewExitingVPV, [](VPUser &U, unsigned) {
9716 return isa<VPInstruction>(&U) &&
9717 cast<VPInstruction>(&U)->getOpcode() ==
9719 });
9721 PhiR->getRecurrenceDescriptor().getOpcode(), PhiTy))
9722 PhiR->setOperand(1, NewExitingVPV);
9723 }
9724
9725 // If the vector reduction can be performed in a smaller type, we truncate
9726 // then extend the loop exit value to enable InstCombine to evaluate the
9727 // entire expression in the smaller type.
9728 Type *PhiTy = PhiR->getStartValue()->getLiveInIRValue()->getType();
9729 if (MinVF.isVector() && PhiTy != RdxDesc.getRecurrenceType() &&
9731 RdxDesc.getRecurrenceKind())) {
9732 assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
9733 Type *RdxTy = RdxDesc.getRecurrenceType();
9734 auto *Trunc =
9735 new VPWidenCastRecipe(Instruction::Trunc, NewExitingVPV, RdxTy);
9736 auto *Extnd =
9737 RdxDesc.isSigned()
9738 ? new VPWidenCastRecipe(Instruction::SExt, Trunc, PhiTy)
9739 : new VPWidenCastRecipe(Instruction::ZExt, Trunc, PhiTy);
9740
9741 Trunc->insertAfter(NewExitingVPV->getDefiningRecipe());
9742 Extnd->insertAfter(Trunc);
9743 if (PhiR->getOperand(1) == NewExitingVPV)
9744 PhiR->setOperand(1, Extnd->getVPSingleValue());
9745 NewExitingVPV = Extnd;
9746 }
9747
9748 // We want code in the middle block to appear to execute at the location of
9749 // the scalar loop's latch terminator because: (a) it is all compiler
9750 // generated, (b) these instructions are always executed after evaluating
9751 // the latch conditional branch, and (c) other passes may add new
9752 // predecessors which terminate on this line. This is the easiest way to
9753 // ensure we don't accidentally cause an extra step back into the loop while
9754 // debugging.
9755 DebugLoc ExitDL = OrigLoop->getLoopLatch()->getTerminator()->getDebugLoc();
9756
9757 // TODO: At the moment ComputeReductionResult also drives creation of the
9758 // bc.merge.rdx phi nodes, hence it needs to be created unconditionally here
9759 // even for in-loop reductions, until the reduction resume value handling is
9760 // also modeled in VPlan.
9761 auto *FinalReductionResult = new VPInstruction(
9762 VPInstruction::ComputeReductionResult, {PhiR, NewExitingVPV}, ExitDL);
9763 // Update all users outside the vector region.
9764 OrigExitingVPV->replaceUsesWithIf(
9765 FinalReductionResult, [](VPUser &User, unsigned) {
9766 auto *Parent = cast<VPRecipeBase>(&User)->getParent();
9767 return Parent && !Parent->getParent();
9768 });
9769 FinalReductionResult->insertBefore(*MiddleVPBB, IP);
9770
9771 // Adjust AnyOf reductions; replace the reduction phi for the selected value
9772 // with a boolean reduction phi node to check if the condition is true in
9773 // any iteration. The final value is selected by the final
9774 // ComputeReductionResult.
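  // Illustrative sketch (assumption, not taken from the source): for
  //   %r = phi [ %init, %ph ], [ %sel, %loop ]
  //   %sel = select %cmp, %inv, %r   ; %inv loop-invariant
  // the phi is rewritten to reduce booleans, roughly
  //   %any = phi [ false, %ph ], [ %any.next, %loop ]
  //   %any.next = or %any, %cmp
  // and the final ComputeReductionResult selects %inv or %init based on %any.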
9776 RdxDesc.getRecurrenceKind())) {
9777 auto *Select = cast<VPRecipeBase>(*find_if(PhiR->users(), [](VPUser *U) {
9778 return isa<VPWidenSelectRecipe>(U) ||
9779 (isa<VPReplicateRecipe>(U) &&
9780 cast<VPReplicateRecipe>(U)->getUnderlyingInstr()->getOpcode() ==
9781 Instruction::Select);
9782 }));
9783 VPValue *Cmp = Select->getOperand(0);
9784 // If the compare is checking the reduction PHI node, adjust it to check
9785 // the start value.
9786 if (VPRecipeBase *CmpR = Cmp->getDefiningRecipe()) {
9787 for (unsigned I = 0; I != CmpR->getNumOperands(); ++I)
9788 if (CmpR->getOperand(I) == PhiR)
9789 CmpR->setOperand(I, PhiR->getStartValue());
9790 }
9791 VPBuilder::InsertPointGuard Guard(Builder);
9792 Builder.setInsertPoint(Select);
9793
9794 // If the true value of the select is the reduction phi, the new value is
9795 // selected if the negated condition is true in any iteration.
9796 if (Select->getOperand(1) == PhiR)
9797 Cmp = Builder.createNot(Cmp);
9798 VPValue *Or = Builder.createOr(PhiR, Cmp);
9799 Select->getVPSingleValue()->replaceAllUsesWith(Or);
9800 // Delete Select now that it has invalid types.
9801 ToDelete.push_back(Select);
9802
9803 // Convert the reduction phi to operate on bools.
9804 PhiR->setOperand(0, Plan->getOrAddLiveIn(ConstantInt::getFalse(
9805 OrigLoop->getHeader()->getContext())));
9806 continue;
9807 }
9808
9810 RdxDesc.getRecurrenceKind())) {
9811 // Adjust the start value for FindLastIV recurrences to use the sentinel
9812 // value after generating the ResumePhi recipe, which uses the original
9813 // start value.
9814 PhiR->setOperand(0, Plan->getOrAddLiveIn(RdxDesc.getSentinelValue()));
9815 }
9816 }
9817
9819 for (VPRecipeBase *R : ToDelete)
9820 R->eraseFromParent();
9821}
9822
9824 assert(!State.Lane && "VPDerivedIVRecipe being replicated.");
9825
9826 // Fast-math-flags propagate from the original induction instruction.
9828 if (FPBinOp)
9829 State.Builder.setFastMathFlags(FPBinOp->getFastMathFlags());
9830
9831 Value *Step = State.get(getStepValue(), VPLane(0));
9832 Value *Index = State.get(getOperand(1), VPLane(0));
9833 Value *DerivedIV = emitTransformedIndex(
9834 State.Builder, Index, getStartValue()->getLiveInIRValue(), Step, Kind,
9835 cast_if_present<BinaryOperator>(FPBinOp));
9836 DerivedIV->setName(Name);
9837 // If index is the vector trip count, the concrete value will only be set in
9838 // prepareToExecute, leading to missed simplifications, e.g. if it is 0.
9839 // TODO: Remove the special case for the vector trip count once it is computed
9840 // in VPlan and can be used during VPlan simplification.
9841 assert((DerivedIV != Index ||
9842 getOperand(1) == &getParent()->getPlan()->getVectorTripCount()) &&
9843 "IV didn't need transforming?");
9844 State.set(this, DerivedIV, VPLane(0));
9845}
9846
9849 if (State.Lane) { // Generate a single instance.
9850 assert((State.VF.isScalar() || !isUniform()) &&
9851 "uniform recipe shouldn't be predicated");
9852 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
9853 State.ILV->scalarizeInstruction(UI, this, *State.Lane, State);
9854 // Insert scalar instance packing it into a vector.
9855 if (State.VF.isVector() && shouldPack()) {
9856 // If we're constructing lane 0, initialize to start from poison.
9857 if (State.Lane->isFirstLane()) {
9858 assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
9860 VectorType::get(UI->getType(), State.VF));
9861 State.set(this, Poison);
9862 }
9863 State.packScalarIntoVectorValue(this, *State.Lane);
9864 }
9865 return;
9866 }
9867
9868 if (IsUniform) {
9869 // Uniform within VL means we need to generate lane 0.
9870 State.ILV->scalarizeInstruction(UI, this, VPLane(0), State);
9871 return;
9872 }
9873
9874 // A store of a loop varying value to a uniform address only needs the last
9875 // copy of the store.
9876 if (isa<StoreInst>(UI) &&
9878 auto Lane = VPLane::getLastLaneForVF(State.VF);
9879 State.ILV->scalarizeInstruction(UI, this, VPLane(Lane), State);
9880 return;
9881 }
9882
9883 // Generate scalar instances for all VF lanes.
9884 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
9885 const unsigned EndLane = State.VF.getKnownMinValue();
9886 for (unsigned Lane = 0; Lane < EndLane; ++Lane)
9887 State.ILV->scalarizeInstruction(UI, this, VPLane(Lane), State);
9888}
9889
9890// Determine how to lower the scalar epilogue, which depends on 1) optimising
9891// for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
9892// predication, and 4) a TTI hook that analyses whether the loop is suitable
9893// for predication.
9898 // 1) OptSize takes precedence over all other options, i.e. if this is set,
9899 // don't look at hints or options, and don't request a scalar epilogue.
9900 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
9901 // LoopAccessInfo (due to code dependency and not being able to reliably get
9902 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
9903 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
9904 // versioning when the vectorization is forced, unlike hasOptSize. So revert
9905 // back to the old way and vectorize with versioning when forced. See D81345.)
9906 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
9910
9911 // 2) If set, obey the directives
9912 if (PreferPredicateOverEpilogue.getNumOccurrences()) {
9920 };
9921 }
9922
9923 // 3) If set, obey the hints
9924 switch (Hints.getPredicate()) {
9929 };
9930
9931 // 4) if the TTI hook indicates this is profitable, request predication.
9932 TailFoldingInfo TFI(TLI, &LVL, IAI);
9935
9937}
9938
9939// Process the loop in the VPlan-native vectorization path. This path builds
9940// VPlan upfront in the vectorization pipeline, which allows to apply
9941// VPlan-to-VPlan transformations from the very beginning without modifying the
9942// input LLVM IR.
9949 LoopVectorizationRequirements &Requirements) {
9950
9951 if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
9952 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
9953 return false;
9954 }
9955 assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
9956 Function *F = L->getHeader()->getParent();
9957 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
9958
9960 getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, *LVL, &IAI);
9961
9962 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
9963 &Hints, IAI);
9964 // Use the planner for outer loop vectorization.
9965 // TODO: CM is not used at this point inside the planner. Turn CM into an
9966 // optional argument if we don't need it in the future.
9967 LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, LVL, CM, IAI, PSE, Hints,
9968 ORE);
9969
9970 // Get user vectorization factor.
9971 ElementCount UserVF = Hints.getWidth();
9972
9974
9975 // Plan how to best vectorize, return the best VF and its cost.
9976 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
9977
9978 // If we are stress testing VPlan builds, do not attempt to generate vector
9979 // code. Masked vector code generation support will follow soon.
9980 // Also, do not attempt to vectorize if no vector code will be produced.
9982 return false;
9983
9984 VPlan &BestPlan = LVP.getPlanFor(VF.Width);
9985
9986 {
9987 bool AddBranchWeights =
9988 hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
9989 GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(),
9990 AddBranchWeights, CM.CostKind);
9991 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
9992 VF.Width, 1, LVL, &CM, BFI, PSI, Checks, BestPlan);
9993 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
9994 << L->getHeader()->getParent()->getName() << "\"\n");
9995 LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false);
9996 }
9997
9998 reportVectorization(ORE, L, VF, 1);
9999
10000 // Mark the loop as already vectorized to avoid vectorizing again.
10001 Hints.setAlreadyVectorized();
10002 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
10003 return true;
10004}
10005
10006// Emit a remark if there are stores to floats that required a floating point
10007// extension. If the vectorized loop was generated with floating point there
10008// will be a performance penalty from the conversion overhead and the change in
10009// the vector width.
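// Illustrative example (assumption, not taken from the source):
//   float a[N]; for (i) a[i] = a[i] + 1.5;
// promotes each element to double, adds, and truncates back to float, so the
// vectorized body needs fpext/fptrunc pairs and effectively operates on half
// as many lanes per vector register.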
10012 for (BasicBlock *BB : L->getBlocks()) {
10013 for (Instruction &Inst : *BB) {
10014 if (auto *S = dyn_cast<StoreInst>(&Inst)) {
10015 if (S->getValueOperand()->getType()->isFloatTy())
10016 Worklist.push_back(S);
10017 }
10018 }
10019 }
10020
10021 // Traverse the floating point stores upwards, searching for floating point
10022 // conversions.
10025 while (!Worklist.empty()) {
10026 auto *I = Worklist.pop_back_val();
10027 if (!L->contains(I))
10028 continue;
10029 if (!Visited.insert(I).second)
10030 continue;
10031
10032 // Emit a remark if the floating point store required a floating
10033 // point conversion.
10034 // TODO: More work could be done to identify the root cause such as a
10035 // constant or a function return type and point the user to it.
10036 if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second)
10037 ORE->emit([&]() {
10038 return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision",
10039 I->getDebugLoc(), L->getHeader())
10040 << "floating point conversion changes vector width. "
10041 << "Mixed floating point precision requires an up/down "
10042 << "cast that will negatively impact performance.";
10043 });
10044
10045 for (Use &Op : I->operands())
10046 if (auto *OpI = dyn_cast<Instruction>(Op))
10047 Worklist.push_back(OpI);
10048 }
10049}
10050
10051static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
10052 VectorizationFactor &VF, Loop *L,
10053 const TargetTransformInfo &TTI,
10056 InstructionCost CheckCost = Checks.getCost();
10057 if (!CheckCost.isValid())
10058 return false;
10059
10060 // When only interleaving, the scalar and vector costs will be equal, which in
10061 // turn would lead to a divide by 0. Fall back to a hard threshold.
10062 if (VF.Width.isScalar()) {
10063 if (CheckCost > VectorizeMemoryCheckThreshold) {
10064 LLVM_DEBUG(
10065 dbgs()
10066 << "LV: Interleaving only is not profitable due to runtime checks\n");
10067 return false;
10068 }
10069 return true;
10070 }
10071
10072 // The scalar cost should only be 0 when vectorizing with a user specified VF/IC. In those cases, runtime checks should always be generated.
10073 uint64_t ScalarC = *VF.ScalarCost.getValue();
10074 if (ScalarC == 0)
10075 return true;
10076
10077 // First, compute the minimum iteration count required so that the vector
10078 // loop outperforms the scalar loop.
10079 // The total cost of the scalar loop is
10080 // ScalarC * TC
10081 // where
10082 // * TC is the actual trip count of the loop.
10083 // * ScalarC is the cost of a single scalar iteration.
10084 //
10085 // The total cost of the vector loop is
10086 // RtC + VecC * (TC / VF) + EpiC
10087 // where
10088 // * RtC is the cost of the generated runtime checks
10089 // * VecC is the cost of a single vector iteration.
10090 // * TC is the actual trip count of the loop
10091 // * VF is the vectorization factor
10092 // * EpiC is the cost of the generated epilogue, including the cost
10093 // of the remaining scalar operations.
10094 //
10095 // Vectorization is profitable once the total vector cost is less than the
10096 // total scalar cost:
10097 // RtC + VecC * (TC / VF) + EpiC < ScalarC * TC
10098 //
10099 // Now we can compute the minimum required trip count TC as
10100 // VF * (RtC + EpiC) / (ScalarC * VF - VecC) < TC
10101 //
10102 // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that
10103 // the divisions below are rounded up, hence we get an upper estimate of
10104 // the minimum TC.
10105 unsigned IntVF = getEstimatedRuntimeVF(L, TTI, VF.Width);
10106 uint64_t RtC = *CheckCost.getValue();
10107 uint64_t Div = ScalarC * IntVF - *VF.Cost.getValue();
10108 uint64_t MinTC1 = Div == 0 ? 0 : divideCeil(RtC * IntVF, Div);
10109
10110 // Second, compute a minimum iteration count so that the cost of the
10111 // runtime checks is only a fraction of the total scalar loop cost. This
10112 // adds a loop-dependent bound on the overhead incurred if the runtime
10113 // checks fail. In case the runtime checks fail, the cost is RtC + ScalarC
10114 // * TC. To bound the runtime check to be a fraction 1/X of the scalar
10115 // cost, compute
10116 // RtC < ScalarC * TC * (1 / X) ==> RtC * X / ScalarC < TC
10117 uint64_t MinTC2 = divideCeil(RtC * 10, ScalarC);
10118
10119 // Now pick the larger minimum. If it is not a multiple of VF and a scalar
10120 // epilogue is allowed, choose the next closest multiple of VF. This should
10121 // partly compensate for ignoring the epilogue cost.
10122 uint64_t MinTC = std::max(MinTC1, MinTC2);
10123 if (SEL == CM_ScalarEpilogueAllowed)
10124 MinTC = alignTo(MinTC, IntVF);
10126
10127 LLVM_DEBUG(
10128 dbgs() << "LV: Minimum required TC for runtime checks to be profitable:"
10129 << VF.MinProfitableTripCount << "\n");
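  // Illustrative worked example (numbers are assumptions, not from the
  // source): with ScalarC = 4, VecC = 6, RtC = 20 and an estimated VF of 4,
  // MinTC1 = ceil(20 * 4 / (4 * 4 - 6)) = 8 and MinTC2 = ceil(20 * 10 / 4) =
  // 50, so MinTC = 50, rounded up to 52 when a scalar epilogue is allowed.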
10130
10131 // Skip vectorization if the expected trip count is less than the minimum
10132 // required trip count.
10133 if (auto ExpectedTC = getSmallBestKnownTC(PSE, L)) {
10136 LLVM_DEBUG(dbgs() << "LV: Vectorization is not beneficial: expected "
10137 "trip count < minimum profitable VF ("
10138 << *ExpectedTC << " < " << VF.MinProfitableTripCount
10139 << ")\n");
10140
10141 return false;
10142 }
10143 }
10144 return true;
10145}
10146
10148 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
10150 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
10152
10153/// Prepare \p MainPlan for vectorizing the main vector loop during epilogue
10154/// vectorization. Remove ResumePhis from \p MainPlan for inductions that
10155/// don't have a corresponding wide induction in \p EpiPlan.
10156static void preparePlanForMainVectorLoop(VPlan &MainPlan, VPlan &EpiPlan) {
10157 // Collect PHI nodes of widened phis in the VPlan for the epilogue. Those
10158 // will need their resume-values computed in the main vector loop. Others
10159 // can be removed from the main VPlan.
10160 SmallPtrSet<PHINode *, 2> EpiWidenedPhis;
10161 for (VPRecipeBase &R :
10163 if (isa<VPCanonicalIVPHIRecipe>(&R))
10164 continue;
10165 EpiWidenedPhis.insert(
10166 cast<PHINode>(R.getVPSingleValue()->getUnderlyingValue()));
10167 }
10169 *cast<VPIRBasicBlock>(MainPlan.getScalarHeader()))) {
10170 auto *VPIRInst = cast<VPIRInstruction>(&R);
10171 auto *IRI = dyn_cast<PHINode>(&VPIRInst->getInstruction());
10172 if (!IRI)
10173 break;
10174 if (EpiWidenedPhis.contains(IRI))
10175 continue;
10176 // There is no corresponding wide induction in the epilogue plan that would
10177 // need a resume value. Remove the VPIRInst wrapping the scalar header phi
10178 // together with the corresponding ResumePhi. The resume values for the
10179 // scalar loop will be created during execution of EpiPlan.
10180 VPRecipeBase *ResumePhi = VPIRInst->getOperand(0)->getDefiningRecipe();
10181 VPIRInst->eraseFromParent();
10182 ResumePhi->eraseFromParent();
10183 }
10185
10186 using namespace VPlanPatternMatch;
10187 VPBasicBlock *MainScalarPH = MainPlan.getScalarPreheader();
10188 VPValue *VectorTC = &MainPlan.getVectorTripCount();
10189 // If there is a suitable resume value for the canonical induction in the
10190 // scalar (which will become vector) epilogue loop we are done. Otherwise
10191 // create it below.
10192 if (any_of(*MainScalarPH, [VectorTC](VPRecipeBase &R) {
10193 return match(&R, m_VPInstruction<VPInstruction::ResumePhi>(
10194 m_Specific(VectorTC), m_SpecificInt(0)));
10195 }))
10196 return;
10197 VPBuilder ScalarPHBuilder(MainScalarPH, MainScalarPH->begin());
10198 ScalarPHBuilder.createNaryOp(
10200 {VectorTC, MainPlan.getCanonicalIV()->getStartValue()}, {},
10201 "vec.epilog.resume.val");
10202}
10203
10204/// Prepare \p Plan for vectorizing the epilogue loop. That is, re-use expanded
10205/// SCEVs from \p ExpandedSCEVs and set resume values for header recipes.
10206static void
10208 const SCEV2ValueTy &ExpandedSCEVs,
10209 const EpilogueLoopVectorizationInfo &EPI) {
10210 VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
10211 VPBasicBlock *Header = VectorLoop->getEntryBasicBlock();
10212 Header->setName("vec.epilog.vector.body");
10213
10214 // Re-use the trip count and steps expanded for the main loop, as
10215 // skeleton creation needs them as values that dominate both the scalar
10216 // and vector epilogue loops.
10217 // TODO: This is a workaround needed for epilogue vectorization and it
10218 // should be removed once induction resume value creation is done
10219 // directly in VPlan.
10220 for (auto &R : make_early_inc_range(*Plan.getEntry())) {
10221 auto *ExpandR = dyn_cast<VPExpandSCEVRecipe>(&R);
10222 if (!ExpandR)
10223 continue;
10224 auto *ExpandedVal =
10225 Plan.getOrAddLiveIn(ExpandedSCEVs.find(ExpandR->getSCEV())->second);
10226 ExpandR->replaceAllUsesWith(ExpandedVal);
10227 if (Plan.getTripCount() == ExpandR)
10228 Plan.resetTripCount(ExpandedVal);
10229 ExpandR->eraseFromParent();
10230 }
10231
10232 // Ensure that the start values for all header phi recipes are updated before
10233 // vectorizing the epilogue loop.
10234 for (VPRecipeBase &R : Header->phis()) {
10235 if (auto *IV = dyn_cast<VPCanonicalIVPHIRecipe>(&R)) {
10236 // When vectorizing the epilogue loop, the canonical induction start
10237 // value needs to be changed from zero to the value after the main
10238 // vector loop. Find the resume value created during execution of the main
10239 // VPlan.
10240 // FIXME: Improve modeling for canonical IV start values in the epilogue
10241 // loop.
10242 BasicBlock *MainMiddle = find_singleton<BasicBlock>(
10243 predecessors(L->getLoopPreheader()),
10244 [&EPI](BasicBlock *BB, bool) -> BasicBlock * {
10245 if (BB != EPI.MainLoopIterationCountCheck &&
10246 BB != EPI.EpilogueIterationCountCheck &&
10247 BB != EPI.SCEVSafetyCheck && BB != EPI.MemSafetyCheck)
10248 return BB;
10249 return nullptr;
10250 });
10251 using namespace llvm::PatternMatch;
10252 Type *IdxTy = IV->getScalarType();
10253 PHINode *EPResumeVal = find_singleton<PHINode>(
10254 L->getLoopPreheader()->phis(),
10255 [&EPI, IdxTy, MainMiddle](PHINode &P, bool) -> PHINode * {
10256 if (P.getType() == IdxTy &&
10257 P.getIncomingValueForBlock(MainMiddle) == EPI.VectorTripCount &&
10258 match(
10259 P.getIncomingValueForBlock(EPI.MainLoopIterationCountCheck),
10260 m_SpecificInt(0)))
10261 return &P;
10262 return nullptr;
10263 });
10264 assert(EPResumeVal && "must have a resume value for the canonical IV");
10265 VPValue *VPV = Plan.getOrAddLiveIn(EPResumeVal);
10266 assert(all_of(IV->users(),
10267 [](const VPUser *U) {
10268 return isa<VPScalarIVStepsRecipe>(U) ||
10269 isa<VPScalarCastRecipe>(U) ||
10270 isa<VPDerivedIVRecipe>(U) ||
10271 cast<VPInstruction>(U)->getOpcode() ==
10272 Instruction::Add;
10273 }) &&
10274 "the canonical IV should only be used by its increment or "
10275 "ScalarIVSteps when resetting the start value");
10276 IV->setOperand(0, VPV);
10277 continue;
10278 }
10279
10280 Value *ResumeV = nullptr;
10281 // TODO: Move setting of resume values to prepareToExecute.
10282 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) {
10283 ResumeV = cast<PHINode>(ReductionPhi->getUnderlyingInstr())
10284 ->getIncomingValueForBlock(L->getLoopPreheader());
10285 const RecurrenceDescriptor &RdxDesc =
10286 ReductionPhi->getRecurrenceDescriptor();
10287 RecurKind RK = RdxDesc.getRecurrenceKind();
10289 // VPReductionPHIRecipes for AnyOf reductions expect a boolean as
10290 // start value; compare the final value from the main vector loop
10291 // to the start value.
10292 IRBuilder<> Builder(
10293 cast<Instruction>(ResumeV)->getParent()->getFirstNonPHI());
10294 ResumeV =
10295 Builder.CreateICmpNE(ResumeV, RdxDesc.getRecurrenceStartValue());
10297 // VPReductionPHIRecipe for FindLastIV reductions requires an adjustment
10298 // to the resume value. The resume value is adjusted to the sentinel
10299 // value when the final value from the main vector loop equals the start
10300 // value. This ensures correctness when the start value might not be
10301 // less than the minimum value of a monotonically increasing induction
10302 // variable.
10303 IRBuilder<> Builder(
10304 cast<Instruction>(ResumeV)->getParent()->getFirstNonPHI());
10305 Value *Cmp =
10306 Builder.CreateICmpEQ(ResumeV, RdxDesc.getRecurrenceStartValue());
10307 ResumeV =
10308 Builder.CreateSelect(Cmp, RdxDesc.getSentinelValue(), ResumeV);
10309 }
10310 } else {
10311 // Retrieve the induction resume values for wide inductions from
10312 // their original phi nodes in the scalar loop.
10313 PHINode *IndPhi = cast<VPWidenInductionRecipe>(&R)->getPHINode();
10314 // Hook up to the PHINode generated by a ResumePhi recipe of main
10315 // loop VPlan, which feeds the scalar loop.
10316 ResumeV = IndPhi->getIncomingValueForBlock(L->getLoopPreheader());
10317 }
10318 assert(ResumeV && "Must have a resume value");
10319 VPValue *StartVal = Plan.getOrAddLiveIn(ResumeV);
10320 cast<VPHeaderPHIRecipe>(&R)->setStartValue(StartVal);
10321 }
10322}
10323
10325 assert((EnableVPlanNativePath || L->isInnermost()) &&
10326 "VPlan-native path is not enabled. Only process inner loops.");
10327
10328 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '"
10329 << L->getHeader()->getParent()->getName() << "' from "
10330 << L->getLocStr() << "\n");
10331
10332 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI);
10333
10334 LLVM_DEBUG(
10335 dbgs() << "LV: Loop hints:"
10336 << " force="
10338 ? "disabled"
10340 ? "enabled"
10341 : "?"))
10342 << " width=" << Hints.getWidth()
10343 << " interleave=" << Hints.getInterleave() << "\n");
10344
10345 // Function containing loop
10346 Function *F = L->getHeader()->getParent();
10347
10348 // Looking at the diagnostic output is the only way to determine if a loop
10349 // was vectorized (other than looking at the IR or machine code), so it
10350 // is important to generate an optimization remark for each loop. Most of
10351 // these messages are generated as OptimizationRemarkAnalysis. Remarks
10352 // generated as OptimizationRemark and OptimizationRemarkMissed are
10353 // less verbose and report vectorized loops and unvectorized loops that may
10354 // benefit from vectorization, respectively.
10355
10356 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
10357 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
10358 return false;
10359 }
10360
10361 PredicatedScalarEvolution PSE(*SE, *L);
10362
10363 // Check if it is legal to vectorize the loop.
10364 LoopVectorizationRequirements Requirements;
10365 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, F, *LAIs, LI, ORE,
10366 &Requirements, &Hints, DB, AC, BFI, PSI);
10368 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
10369 Hints.emitRemarkWithHints();
10370 return false;
10371 }
10372
10374 reportVectorizationFailure("Auto-vectorization of loops with uncountable "
10375 "early exit is not enabled",
10376 "UncountableEarlyExitLoopsDisabled", ORE, L);
10377 return false;
10378 }
10379
10380 if (LVL.hasStructVectorCall()) {
10381 reportVectorizationFailure("Auto-vectorization of calls that return struct "
10382 "types is not yet supported",
10383 "StructCallVectorizationUnsupported", ORE, L);
10384 return false;
10385 }
10386
10387 // Entrance to the VPlan-native vectorization path. Outer loops are processed
10388 // here. They may require CFG and instruction level transformations before
10389 // even evaluating whether vectorization is profitable. Since we cannot modify
10390 // the incoming IR, we need to build VPlan upfront in the vectorization
10391 // pipeline.
10392 if (!L->isInnermost())
10393 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
10394 ORE, BFI, PSI, Hints, Requirements);
10395
10396 assert(L->isInnermost() && "Inner loop expected.");
10397
10398 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
10399 bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
10400
10401 // If an override option has been passed in for interleaved accesses, use it.
10403 UseInterleaved = EnableInterleavedMemAccesses;
10404
10405 // Analyze interleaved memory accesses.
10406 if (UseInterleaved)
10408
10409 if (LVL.hasUncountableEarlyExit()) {
10410 BasicBlock *LoopLatch = L->getLoopLatch();
10411 if (IAI.requiresScalarEpilogue() ||
10413 [LoopLatch](BasicBlock *BB) { return BB != LoopLatch; })) {
10414 reportVectorizationFailure("Auto-vectorization of early exit loops "
10415 "requiring a scalar epilogue is unsupported",
10416 "UncountableEarlyExitUnsupported", ORE, L);
10417 return false;
10418 }
10419 }
10420
10421 // Check the function attributes and profiles to find out if this function
10422 // should be optimized for size.
10424 getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, LVL, &IAI);
10425
10426 // Check the loop for a trip count threshold: vectorize loops with a tiny trip
10427 // count by optimizing for size, to minimize overheads.
10428 auto ExpectedTC = getSmallBestKnownTC(PSE, L);
10429 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
10430 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
10431 << "This loop is worth vectorizing only if no scalar "
10432 << "iteration overheads are incurred.");
10434 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
10435 else {
10436 if (*ExpectedTC > TTI->getMinTripCountTailFoldingThreshold()) {
10437 LLVM_DEBUG(dbgs() << "\n");
10438 // Predicated tail-folded loops are efficient even when the loop
10439 // iteration count is low. However, setting the epilogue policy to
10440 // `CM_ScalarEpilogueNotAllowedLowTripLoop` prevents vectorizing loops
10441 // with runtime checks. It's more effective to let
10442 // `areRuntimeChecksProfitable` determine if vectorization is beneficial
10443 // for the loop.
10446 } else {
10447 LLVM_DEBUG(dbgs() << " But the target considers the trip count too "
10448 "small to consider vectorizing.\n");
10450 "The trip count is below the minimal threshold value.",
10451 "loop trip count is too low, avoiding vectorization",
10452 "LowTripCount", ORE, L);
10453 Hints.emitRemarkWithHints();
10454 return false;
10455 }
10456 }
10457 }
10458
10459 // Check the function attributes to see if implicit floats or vectors are
10460 // allowed.
10461 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
10463 "Can't vectorize when the NoImplicitFloat attribute is used",
10464 "loop not vectorized due to NoImplicitFloat attribute",
10465 "NoImplicitFloat", ORE, L);
10466 Hints.emitRemarkWithHints();
10467 return false;
10468 }
10469
10470 // Check if the target supports potentially unsafe FP vectorization.
10471 // FIXME: Add a check for the type of safety issue (denormal, signaling)
10472 // for the target we're vectorizing for, to make sure none of the
10473 // additional fp-math flags can help.
10474 if (Hints.isPotentiallyUnsafe() &&
10477 "Potentially unsafe FP op prevents vectorization",
10478 "loop not vectorized due to unsafe FP support.",
10479 "UnsafeFP", ORE, L);
10480 Hints.emitRemarkWithHints();
10481 return false;
10482 }
10483
10484 bool AllowOrderedReductions;
10485 // If the flag is set, use that instead and override the TTI behaviour.
10487 AllowOrderedReductions = ForceOrderedReductions;
10488 else
10489 AllowOrderedReductions = TTI->enableOrderedReductions();
10490 if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) {
10491 ORE->emit([&]() {
10492 auto *ExactFPMathInst = Requirements.getExactFPInst();
10493 return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
10494 ExactFPMathInst->getDebugLoc(),
10495 ExactFPMathInst->getParent())
10496 << "loop not vectorized: cannot prove it is safe to reorder "
10497 "floating-point operations";
10498 });
10499 LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to "
10500 "reorder floating-point operations\n");
10501 Hints.emitRemarkWithHints();
10502 return false;
10503 }
10504
10505 // Use the cost model.
10506 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
10507 F, &Hints, IAI);
10508 // Use the planner for vectorization.
10509 LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, &LVL, CM, IAI, PSE, Hints,
10510 ORE);
10511
10512 // Get user vectorization factor and interleave count.
10513 ElementCount UserVF = Hints.getWidth();
10514 unsigned UserIC = Hints.getInterleave();
10515
10516 // Plan how to best vectorize.
10517 LVP.plan(UserVF, UserIC);
10519 unsigned IC = 1;
10520
10523
10524 bool AddBranchWeights =
10525 hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
10526 GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(),
10527 AddBranchWeights, CM.CostKind);
10528 if (LVP.hasPlanWithVF(VF.Width)) {
10529 // Select the interleave count.
10530 IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
10531
10532 unsigned SelectedIC = std::max(IC, UserIC);
10533 // Optimistically generate runtime checks if they are needed. Drop them if
10534 // they turn out to not be profitable.
10535 if (VF.Width.isVector() || SelectedIC > 1)
10536 Checks.create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC);
10537
10538 // Check if it is profitable to vectorize with runtime checks.
10539 bool ForceVectorization =
10541 if (!ForceVectorization &&
10542 !areRuntimeChecksProfitable(Checks, VF, L, *TTI, PSE, SEL)) {
10543 ORE->emit([&]() {
10545 DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),
10546 L->getHeader())
10547 << "loop not vectorized: cannot prove it is safe to reorder "
10548 "memory operations";
10549 });
10550 LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
10551 Hints.emitRemarkWithHints();
10552 return false;
10553 }
10554 }
10555
10556 // Identify the diagnostic messages that should be produced.
10557 std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
10558 bool VectorizeLoop = true, InterleaveLoop = true;
10559 if (VF.Width.isScalar()) {
10560 LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
10561 VecDiagMsg = std::make_pair(
10562 "VectorizationNotBeneficial",
10563 "the cost-model indicates that vectorization is not beneficial");
10564 VectorizeLoop = false;
10565 }
10566
10567 if (!LVP.hasPlanWithVF(VF.Width) && UserIC > 1) {
10568 // Tell the user interleaving was avoided up-front, despite being explicitly
10569 // requested.
10570 LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
10571 "interleaving should be avoided up front\n");
10572 IntDiagMsg = std::make_pair(
10573 "InterleavingAvoided",
10574 "Ignoring UserIC, because interleaving was avoided up front");
10575 InterleaveLoop = false;
10576 } else if (IC == 1 && UserIC <= 1) {
10577 // Tell the user interleaving is not beneficial.
10578 LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
10579 IntDiagMsg = std::make_pair(
10580 "InterleavingNotBeneficial",
10581 "the cost-model indicates that interleaving is not beneficial");
10582 InterleaveLoop = false;
10583 if (UserIC == 1) {
10584 IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
10585 IntDiagMsg.second +=
10586 " and is explicitly disabled or interleave count is set to 1";
10587 }
10588 } else if (IC > 1 && UserIC == 1) {
10589 // Tell the user interleaving is beneficial, but it is explicitly disabled.
10590 LLVM_DEBUG(
10591 dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
10592 IntDiagMsg = std::make_pair(
10593 "InterleavingBeneficialButDisabled",
10594 "the cost-model indicates that interleaving is beneficial "
10595 "but is explicitly disabled or interleave count is set to 1");
10596 InterleaveLoop = false;
10597 }
10598
10599 // If there is a histogram in the loop, do not just interleave without
10600 // vectorizing. The order of operations will be incorrect without the
10601 // histogram intrinsics, which are only used for recipes with VF > 1.
10602 if (!VectorizeLoop && InterleaveLoop && LVL.hasHistograms()) {
10603 LLVM_DEBUG(dbgs() << "LV: Not interleaving without vectorization due "
10604 << "to histogram operations.\n");
10605 IntDiagMsg = std::make_pair(
10606 "HistogramPreventsScalarInterleaving",
10607 "Unable to interleave without vectorization due to constraints on "
10608 "the order of histogram operations");
10609 InterleaveLoop = false;
10610 }
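// A "histogram" here is an indirect update such as
//   for (int i = 0; i < n; ++i) counts[idx[i]]++;
// where idx[i] may repeat within a single vector iteration. The widened code
// stays correct only through the histogram intrinsics, which are emitted for
// VF > 1, so interleaving a purely scalar body is refused above.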
10611
10612 // Override IC if user provided an interleave count.
10613 IC = UserIC > 0 ? UserIC : IC;
10614
10615 // Emit diagnostic messages, if any.
10616 const char *VAPassName = Hints.vectorizeAnalysisPassName();
10617 if (!VectorizeLoop && !InterleaveLoop) {
10618 // Do not vectorize or interleave the loop.
10619 ORE->emit([&]() {
10620 return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
10621 L->getStartLoc(), L->getHeader())
10622 << VecDiagMsg.second;
10623 });
10624 ORE->emit([&]() {
10625 return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
10626 L->getStartLoc(), L->getHeader())
10627 << IntDiagMsg.second;
10628 });
10629 return false;
10630 }
10631
10632 if (!VectorizeLoop && InterleaveLoop) {
10633 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10634 ORE->emit([&]() {
10635 return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
10636 L->getStartLoc(), L->getHeader())
10637 << VecDiagMsg.second;
10638 });
10639 } else if (VectorizeLoop && !InterleaveLoop) {
10640 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10641 << ") in " << L->getLocStr() << '\n');
10642 ORE->emit([&]() {
10643 return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
10644 L->getStartLoc(), L->getHeader())
10645 << IntDiagMsg.second;
10646 });
10647 } else if (VectorizeLoop && InterleaveLoop) {
10648 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10649 << ") in " << L->getLocStr() << '\n');
10650 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10651 }
10652
10653 bool DisableRuntimeUnroll = false;
10654 MDNode *OrigLoopID = L->getLoopID();
10655 {
10656 using namespace ore;
10657 if (!VectorizeLoop) {
10658 assert(IC > 1 && "interleave count should not be 1 or 0");
10659 // If we decided that vectorizing the loop is not profitable, then
10660 // interleave it instead.
10661 VPlan &BestPlan = LVP.getPlanFor(VF.Width);
10662 InnerLoopVectorizer Unroller(
10663 L, PSE, LI, DT, TLI, TTI, AC, ORE, ElementCount::getFixed(1),
10664 ElementCount::getFixed(1), IC, &LVL, &CM, BFI, PSI, Checks, BestPlan);
10665
10666 LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false);
10667
10668 ORE->emit([&]() {
10669 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
10670 L->getHeader())
10671 << "interleaved loop (interleaved count: "
10672 << NV("InterleaveCount", IC) << ")";
10673 });
10674 } else {
10675 // If we decided that it is both legal and profitable to vectorize the loop, do it.
10676
10677 VPlan &BestPlan = LVP.getPlanFor(VF.Width);
10678 // Consider vectorizing the epilogue too if it's profitable.
10679 VectorizationFactor EpilogueVF =
10680 LVP.selectEpilogueVectorizationFactor(VF.Width, IC);
10681 if (EpilogueVF.Width.isVector()) {
10682 std::unique_ptr<VPlan> BestMainPlan(BestPlan.duplicate());
10683
10684 // The first pass vectorizes the main loop and creates a scalar epilogue
10685 // to be vectorized by executing the plan (potentially with a different
10686 // factor) again shortly afterwards.
10687 VPlan &BestEpiPlan = LVP.getPlanFor(EpilogueVF.Width);
10688 preparePlanForMainVectorLoop(*BestMainPlan, BestEpiPlan);
10689 EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1,
10690 BestEpiPlan);
10691 EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
10692 EPI, &LVL, &CM, BFI, PSI, Checks,
10693 *BestMainPlan);
10694 auto ExpandedSCEVs = LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF,
10695 *BestMainPlan, MainILV, DT, false);
10696 ++LoopsVectorized;
10697
10698 // Second pass vectorizes the epilogue and adjusts the control flow
10699 // edges from the first pass.
10700 EPI.MainLoopVF = EPI.EpilogueVF;
10701 EPI.MainLoopUF = EPI.EpilogueUF;
10702 EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
10703 ORE, EPI, &LVL, &CM, BFI, PSI,
10704 Checks, BestEpiPlan);
10705 EpilogILV.setTripCount(MainILV.getTripCount());
10706 preparePlanForEpilogueVectorLoop(BestEpiPlan, L, ExpandedSCEVs, EPI);
10707
10708 LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
10709 DT, true, &ExpandedSCEVs);
10710 ++LoopsEpilogueVectorized;
10711
10712 if (!MainILV.areSafetyChecksAdded())
10713 DisableRuntimeUnroll = true;
10714 } else {
10715 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
10716 VF.MinProfitableTripCount, IC, &LVL, &CM, BFI,
10717 PSI, Checks, BestPlan);
10718 LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false);
10719 ++LoopsVectorized;
10720
10721 // Add metadata to disable runtime unrolling a scalar loop when there
10722 // are no runtime checks about strides and memory. A scalar loop that is
10723 // rarely used is not worth unrolling.
10724 if (!LB.areSafetyChecksAdded())
10725 DisableRuntimeUnroll = true;
10726 }
10727 // Report the vectorization decision.
10728 reportVectorization(ORE, L, VF, IC);
10729 }
10730
10731 if (ORE->allowExtraAnalysis(LV_NAME))
10732 checkMixedPrecision(L, ORE);
10733 }
10734
10735 assert(DT->verify(DominatorTree::VerificationLevel::Fast) &&
10736 "DT not preserved correctly");
10737
10738 std::optional<MDNode *> RemainderLoopID =
10739 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
10740 LLVMLoopVectorizeFollowupEpilogue});
10741 if (RemainderLoopID) {
10742 L->setLoopID(*RemainderLoopID);
10743 } else {
10744 if (DisableRuntimeUnroll)
10745 addRuntimeUnrollDisableMetaData(L);
10746
10747 // Mark the loop as already vectorized to avoid vectorizing again.
10748 Hints.setAlreadyVectorized();
10749 }
10750
10751 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
10752 return true;
10753}
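For reference, the UserVF and UserIC consumed by processLoop (via Hints.getWidth() and Hints.getInterleave()) typically originate from loop pragmas in the source program, which clang lowers to llvm.loop metadata read by LoopVectorizeHints. A minimal illustrative example, with a hypothetical function name:

void saxpy(float *y, const float *x, float a, int n) {
#pragma clang loop vectorize_width(8) interleave_count(2)
  for (int i = 0; i < n; ++i)
    y[i] = a * x[i] + y[i];   // UserVF = fixed 8, UserIC = 2 for this loop
}

The legality and profitability checks above still apply; the remarks are emitted if the requested width or interleave count cannot be honoured.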
10754
10755 LoopVectorizeResult LoopVectorizePass::runImpl(Function &F) {
10756
10757 // Don't attempt if
10758 // 1. the target claims to have no vector registers, and
10759 // 2. interleaving won't help ILP.
10760 //
10761 // The second condition is necessary because, even if the target has no
10762 // vector registers, loop vectorization may still enable scalar
10763 // interleaving.
10764 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
10765 TTI->getMaxInterleaveFactor(ElementCount::getFixed(1)) < 2)
10766 return LoopVectorizeResult(false, false);
10767
10768 bool Changed = false, CFGChanged = false;
10769
10770 // The vectorizer requires loops to be in simplified form.
10771 // Since simplification may add new inner loops, it has to run before the
10772 // legality and profitability checks. This means running the loop vectorizer
10773 // will simplify all loops, regardless of whether anything ends up being
10774 // vectorized.
10775 for (const auto &L : *LI)
10776 Changed |= CFGChanged |=
10777 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
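// Simplified (loop-simplify) form guarantees a preheader, a single
// backedge/latch and dedicated exit blocks; the skeleton creation and the
// L->getLoopLatch() uses in processLoop rely on these invariants.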
10778
10779 // Build up a worklist of inner-loops to vectorize. This is necessary as
10780 // the act of vectorizing or partially unrolling a loop creates new loops
10781 // and can invalidate iterators across the loops.
10782 SmallVector<Loop *, 8> Worklist;
10783
10784 for (Loop *L : *LI)
10785 collectSupportedLoops(*L, LI, ORE, Worklist);
10786
10787 LoopsAnalyzed += Worklist.size();
10788
10789 // Now walk the identified inner loops.
10790 while (!Worklist.empty()) {
10791 Loop *L = Worklist.pop_back_val();
10792
10793 // For the inner loops we actually process, form LCSSA to simplify the
10794 // transform.
10795 Changed |= formLCSSARecursively(*L, *DT, LI, SE);
10796
10797 Changed |= CFGChanged |= processLoop(L);
10798
10799 if (Changed) {
10800 LAIs->clear();
10801
10802#ifndef NDEBUG
10803 if (VerifySCEV)
10804 SE->verify();
10805#endif
10806 }
10807 }
10808
10809 // Process each loop nest in the function.
10810 return LoopVectorizeResult(Changed, CFGChanged);
10811}
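On the default path, collectSupportedLoops queues only innermost loops; outer loops are considered solely when explicit outer-loop vectorization is requested and the VPlan-native path is enabled. In the following sketch (hypothetical names), only the inner j-loop becomes a Worklist candidate:

void scale_rows(float **a, const float *s, int n, int m) {
  for (int i = 0; i < n; ++i)     // outer loop: not queued on the default path
    for (int j = 0; j < m; ++j)   // innermost loop: queued for vectorization
      a[i][j] *= s[i];
}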
10812
10813 PreservedAnalyses LoopVectorizePass::run(Function &F,
10814 FunctionAnalysisManager &AM) {
10815 LI = &AM.getResult<LoopAnalysis>(F);
10816 // There are no loops in the function. Return before computing other
10817 // expensive analyses.
10818 if (LI->empty())
10819 return PreservedAnalyses::all();
10820 SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
10821 TTI = &AM.getResult<TargetIRAnalysis>(F);
10822 DT = &AM.getResult<DominatorTreeAnalysis>(F);
10823 TLI = &AM.getResult<TargetLibraryAnalysis>(F);
10824 AC = &AM.getResult<AssumptionAnalysis>(F);
10825 DB = &AM.getResult<DemandedBitsAnalysis>(F);
10826 ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
10827 LAIs = &AM.getResult<LoopAccessAnalysis>(F);
10828
10829 auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
10830 PSI = MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
10831 BFI = nullptr;
10832 if (PSI && PSI->hasProfileSummary())
10833 BFI = &AM.getResult<BlockFrequencyAnalysis>(F);
10834 LoopVectorizeResult Result = runImpl(F);
10835 if (!Result.MadeAnyChange)
10836 return PreservedAnalyses::all();
10837 PreservedAnalyses PA;
10838
10839 if (isAssignmentTrackingEnabled(*F.getParent())) {
10840 for (auto &BB : F)
10841 RemoveRedundantDbgInstrs(&BB);
10842 }
10843
10844 PA.preserve<LoopAnalysis>();
10845 PA.preserve<DominatorTreeAnalysis>();
10846 PA.preserve<ScalarEvolutionAnalysis>();
10847 PA.preserve<LoopAccessAnalysis>();
10848
10849 if (Result.MadeCFGChange) {
10850 // Making CFG changes likely means a loop got vectorized. Indicate that
10851 // extra simplification passes should be run.
10852 // TODO: MadeCFGChange is not a perfect proxy. Extra passes should only
10853 // be run if runtime checks have been added.
10854 AM.getResult<ShouldRunExtraVectorPasses>(F);
10855 PA.preserve<ShouldRunExtraVectorPasses>();
10856 } else {
10857 PA.preserveSet<CFGAnalyses>();
10858 }
10859 return PA;
10860}
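As a usage sketch (not part of this file), LoopVectorizePass can be exercised standalone through the new pass manager. Every API used below is a standard LLVM one; only the wrapper function itself is hypothetical:

#include "llvm/IR/Module.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Transforms/Vectorize/LoopVectorize.h"
using namespace llvm;

// Hypothetical helper: run just the loop vectorizer over every function in M.
static void runLoopVectorizeOn(Module &M) {
  LoopAnalysisManager LAM;
  FunctionAnalysisManager FAM;
  CGSCCAnalysisManager CGAM;
  ModuleAnalysisManager MAM;
  PassBuilder PB;
  PB.registerModuleAnalyses(MAM);
  PB.registerCGSCCAnalyses(CGAM);
  PB.registerFunctionAnalyses(FAM);
  PB.registerLoopAnalyses(LAM);
  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);
  FunctionPassManager FPM;
  FPM.addPass(LoopVectorizePass());
  for (Function &F : M)
    if (!F.isDeclaration())
      FPM.run(F, FAM);
}

Note that run() only reads ProfileSummaryAnalysis as a cached module result, so the PGO-guided size heuristics are active only if that analysis was computed beforehand; otherwise PSI is null and BFI stays unset.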
10861
10862 void LoopVectorizePass::printPipeline(
10863 raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
10864 static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
10865 OS, MapClassName2PassName);
10866
10867 OS << '<';
10868 OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
10869 OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
10870 OS << '>';
10871}
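The textual form printed here round-trips through PassBuilder's pipeline parser: with both flags false, the body printed between '<' and '>' is no-interleave-forced-only;no-vectorize-forced-only; and the same option syntax can be handed back to the parser. A brief sketch, reusing the PassBuilder PB and analysis managers from the previous example:

ModulePassManager MPM;
if (Error Err = PB.parsePassPipeline(
        MPM, "function(loop-vectorize<no-interleave-forced-only;"
             "no-vectorize-forced-only>)"))
  report_fatal_error(std::move(Err));
// MPM.run(M, MAM) then runs the configured LoopVectorizePass on each function.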
Save vectorization decision W and Cost taken by the cost model for interleaving group Grp and vector ...
const MapVector< Instruction *, uint64_t > & getMinimalBitwidths() const
CallWideningDecision getCallWideningDecision(CallInst *CI, ElementCount VF) const
bool isLegalGatherOrScatter(Value *V, ElementCount VF)
Returns true if the target machine can represent V as a masked gather or scatter operation.
bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const
bool shouldConsiderInvariant(Value *Op)
Returns true if Op should be considered invariant and if it is trivially hoistable.
bool foldTailByMasking() const
Returns true if all loop blocks should be masked to fold tail loop.
bool foldTailWithEVL() const
Returns true if VP intrinsics with explicit vector length support should be generated in the tail fol...
void collectUniformsAndScalars(ElementCount VF)
Collect Uniform and Scalar values for the given VF.
bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const
Returns true if the instructions in this block require predication for any reason,...
void setCallWideningDecision(CallInst *CI, ElementCount VF, InstWidening Kind, Function *Variant, Intrinsic::ID IID, std::optional< unsigned > MaskPos, InstructionCost Cost)
void setTailFoldingStyles(bool IsScalableVF, unsigned UserIC)
Selects and saves the TailFoldingStyle for two cases: whether or not the IV update may overflow.
AssumptionCache * AC
Assumption cache.
void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W, InstructionCost Cost)
Save vectorization decision W and Cost taken by the cost model for instruction I and vector width VF.
InstWidening
Decision that was taken during cost calculation for memory instruction.
bool isScalarWithPredication(Instruction *I, ElementCount VF) const
Returns true if I is an instruction which requires predication and for which our chosen predication s...
InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF) const
Estimate cost of a call instruction CI if it were vectorized with factor VF.
bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const
Returns true if we should use strict in-order reductions for the given RdxDesc.
std::pair< InstructionCost, InstructionCost > getDivRemSpeculationCost(Instruction *I, ElementCount VF) const
Return the costs for our two available strategies for lowering a div/rem operation which requires spe...
bool isDivRemScalarWithPredication(InstructionCost ScalarCost, InstructionCost SafeDivisorCost) const
Given costs for both strategies, return true if the scalar predication lowering should be used for di...
InstructionCost expectedCost(ElementCount VF)
Returns the expected execution cost.
void setCostBasedWideningDecision(ElementCount VF)
Memory access instruction may be vectorized in more than one way.
InstWidening getWideningDecision(Instruction *I, ElementCount VF) const
Return the cost model decision for the given instruction I and vector width VF.
bool isScalarEpilogueAllowed() const
Returns true if a scalar epilogue is not allowed due to optsize or a loop hint annotation.
InstructionCost getWideningCost(Instruction *I, ElementCount VF)
Return the vectorization cost for the given instruction I and vector width VF.
unsigned selectInterleaveCount(ElementCount VF, InstructionCost LoopCost)
void collectInstsToScalarize(ElementCount VF)
Collects the instructions to scalarize for each predicated instruction in the loop.
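The cost-model members above are consulted once per candidate vectorization factor. A rough sketch of that per-VF query order, grounded only in the declarations listed here (the real call sites live inside LoopVectorize.cpp, where the class is defined; queryCostModelFor is an illustrative name):

static void queryCostModelFor(llvm::LoopVectorizationCostModel &CM,
                              llvm::ElementCount VF) {
  CM.collectUniformsAndScalars(VF);    // classify uniform and scalar values
  CM.collectInstsToScalarize(VF);      // per-instruction scalarization choices
  CM.setCostBasedWideningDecision(VF); // widening strategy for memory ops
  CM.setVectorizedCallDecision(VF);    // widening strategy for calls
  llvm::InstructionCost Cost = CM.expectedCost(VF);
  (void)Cost; // compared against other VFs by the planner
}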
LoopVectorizationLegality checks if it is legal to vectorize a loop, and to what vectorization factor...
bool isInvariantStoreOfReduction(StoreInst *SI)
Returns True if given store is a final invariant store of one of the reductions found in the loop.
bool hasVectorCallVariants() const
Returns true if there is at least one function call in the loop which has a vectorized variant availa...
RecurrenceSet & getFixedOrderRecurrences()
Return the fixed-order recurrences found in the loop.
bool isInvariantAddressOfReduction(Value *V)
Returns True if given address is invariant and is used to store recurrent expression.
bool blockNeedsPredication(BasicBlock *BB) const
Return true if the block BB needs to be predicated in order for the loop to be vectorized.
bool canVectorize(bool UseVPlanNativePath)
Returns true if it is legal to vectorize this loop.
int isConsecutivePtr(Type *AccessTy, Value *Ptr) const
Check if this pointer is consecutive when vectorizing.
std::optional< const HistogramInfo * > getHistogramInfo(Instruction *I) const
Returns a HistogramInfo* for the given instruction if it was determined to be part of a load -> updat...
bool canVectorizeFPMath(bool EnableStrictReductions)
Returns true if it is legal to vectorize the FP math operations in this loop.
bool isReductionVariable(PHINode *PN) const
Returns True if PN is a reduction variable in this loop.
bool isFixedOrderRecurrence(const PHINode *Phi) const
Returns True if Phi is a fixed-order recurrence in this loop.
const InductionDescriptor * getPointerInductionDescriptor(PHINode *Phi) const
Returns a pointer to the induction descriptor, if Phi is pointer induction.
const InductionDescriptor * getIntOrFpInductionDescriptor(PHINode *Phi) const
Returns a pointer to the induction descriptor, if Phi is an integer or floating point induction.
bool isInductionPhi(const Value *V) const
Returns True if V is a Phi node of an induction variable in this loop.
PHINode * getPrimaryInduction()
Returns the primary induction variable.
const SmallVector< BasicBlock *, 4 > & getCountableExitingBlocks() const
Returns all exiting blocks with a countable exit, i.e.
const InductionList & getInductionVars() const
Returns the induction variables found in the loop.
bool hasStructVectorCall() const
Returns true if there is at least one function call in the loop which returns a struct type and needs...
bool isInvariant(Value *V) const
Returns true if V is invariant across all loop iterations according to SCEV.
const ReductionList & getReductionVars() const
Returns the reduction variables found in the loop.
bool canFoldTailByMasking() const
Return true if we can vectorize this loop while folding its tail by masking.
void prepareToFoldTailByMasking()
Mark all respective loads/stores for masking.
Type * getWidestInductionType()
Returns the widest induction type.
bool hasUncountableEarlyExit() const
Returns true if the loop has exactly one uncountable early exit, i.e.
bool hasHistograms() const
Returns true if any known histogram operations were found in the loop.
const LoopAccessInfo * getLAI() const
bool isUniformMemOp(Instruction &I, ElementCount VF) const
A uniform memory op is a load or store which accesses the same memory location on all VF lanes,...
BasicBlock * getUncountableEarlyExitingBlock() const
Returns the uncountable early exiting block, if there is exactly one.
bool isMaskRequired(const Instruction *I) const
Returns true if vector representation of the instruction I requires mask.
const RuntimePointerChecking * getRuntimePointerChecking() const
Returns the information that we collected about runtime memory check.
Planner drives the vectorization process after having passed Legality checks.
VectorizationFactor selectEpilogueVectorizationFactor(const ElementCount MaxVF, unsigned IC)
VPlan & getPlanFor(ElementCount VF) const
Return the VPlan for VF.
Definition: VPlan.cpp:1637
VectorizationFactor planInVPlanNativePath(ElementCount UserVF)
Use the VPlan-native path to plan how to best vectorize, return the best VF and its cost.
void buildVPlans(ElementCount MinVF, ElementCount MaxVF)
Build VPlans for power-of-2 VF's between MinVF and MaxVF inclusive, according to the information gath...
Definition: VPlan.cpp:1625
VectorizationFactor computeBestVF()
Compute and return the most profitable vectorization factor.
void emitInvalidCostRemarks(OptimizationRemarkEmitter *ORE)
Emit remarks for recipes with invalid costs in the available VPlans.
static bool getDecisionAndClampRange(const std::function< bool(ElementCount)> &Predicate, VFRange &Range)
Test a Predicate on a Range of VF's.
Definition: VPlan.cpp:1606
void printPlans(raw_ostream &O)
Definition: VPlan.cpp:1651
void plan(ElementCount UserVF, unsigned UserIC)
Build VPlans for the specified UserVF and UserIC if they are non-zero or all applicable candidate VFs...
DenseMap< const SCEV *, Value * > executePlan(ElementCount VF, unsigned UF, VPlan &BestPlan, InnerLoopVectorizer &LB, DominatorTree *DT, bool VectorizingEpilogue, const DenseMap< const SCEV *, Value * > *ExpandedSCEVs=nullptr)
Generate the IR code for the vectorized loop captured in VPlan BestPlan according to the best selecte...
bool hasPlanWithVF(ElementCount VF) const
Look through the existing plans and return true if we have one with vectorization factor VF.
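Taken together, the planner members above suggest the driver sequence: build candidate VPlans, pick the best factor, then execute the chosen plan. A hedged sketch, assuming LVP, ILV, DT and the unroll factor UF are provided by the enclosing pass:

static void runPlanner(llvm::LoopVectorizationPlanner &LVP,
                       llvm::InnerLoopVectorizer &ILV, llvm::DominatorTree *DT,
                       llvm::ElementCount UserVF, unsigned UserIC,
                       unsigned UF) {
  LVP.plan(UserVF, UserIC);                           // build candidate VPlans
  llvm::VectorizationFactor BestVF = LVP.computeBestVF();
  llvm::VPlan &BestPlan = LVP.getPlanFor(BestVF.Width); // plan for chosen VF
  LVP.executePlan(BestVF.Width, UF, BestPlan, ILV, DT,
                  /*VectorizingEpilogue=*/false);     // generate the vector loop
}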
This holds vectorization requirements that must be verified late in the process.
Utility class for getting and setting loop vectorizer hints in the form of loop metadata.
bool allowVectorization(Function *F, Loop *L, bool VectorizeOnlyWhenForced) const
bool allowReordering() const
When enabling loop hints are provided, we allow the vectorizer to change the order of operations that ...
void emitRemarkWithHints() const
Dumps all the hint information.
void setAlreadyVectorized()
Mark the loop L as already vectorized by setting the width to 1.
const char * vectorizeAnalysisPassName() const
If hints are provided that force vectorization, use the AlwaysPrint pass name to force the frontend t...
void prepareNoAliasMetadata()
Set up the aliasing scopes based on the memchecks.
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:39
bool hasLoopInvariantOperands(const Instruction *I) const
Return true if all the operands of the specified instruction are loop invariant.
Definition: LoopInfo.cpp:67
DebugLoc getStartLoc() const
Return the debug location of the start of this loop.
Definition: LoopInfo.cpp:632
bool isLoopInvariant(const Value *V) const
Return true if the specified value is loop invariant.
Definition: LoopInfo.cpp:61
MDNode * getLoopID() const
Return the llvm.loop loop id metadata node for this loop if it is present.
Definition: LoopInfo.cpp:502
Metadata node.
Definition: Metadata.h:1073
void replaceOperandWith(unsigned I, Metadata *New)
Replace a specific operand.
Definition: Metadata.cpp:1077
const MDOperand & getOperand(unsigned I) const
Definition: Metadata.h:1434
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition: Metadata.h:1549
unsigned getNumOperands() const
Return number of MDNode operands.
Definition: Metadata.h:1440
static MDString * get(LLVMContext &Context, StringRef Str)
Definition: Metadata.cpp:606
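The Loop and MDNode accessors above are how passes read llvm.loop hints. A minimal sketch for scanning a loop's metadata for a named hint (hasLoopStringHint is a hypothetical helper, not an LLVM API):

#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/Metadata.h"

static bool hasLoopStringHint(const llvm::Loop *L, llvm::StringRef Name) {
  llvm::MDNode *LoopID = L->getLoopID();
  if (!LoopID)
    return false;
  // Operand 0 is the self-referential node; hint operands start at 1.
  for (unsigned I = 1, E = LoopID->getNumOperands(); I != E; ++I) {
    auto *Hint = llvm::dyn_cast<llvm::MDNode>(LoopID->getOperand(I));
    if (!Hint || Hint->getNumOperands() == 0)
      continue;
    if (auto *S = llvm::dyn_cast<llvm::MDString>(Hint->getOperand(0)))
      if (S->getString() == Name)
        return true;
  }
  return false;
}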
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:36
iterator find(const KeyT &Key)
Definition: MapVector.h:167
bool contains(const KeyT &Key) const
Definition: MapVector.h:163
bool empty() const
Definition: MapVector.h:79
size_type size() const
Definition: MapVector.h:60
Function * getFunction(StringRef Name) const
Look up the specified function in the module symbol table.
Definition: Module.cpp:228
Diagnostic information for optimization analysis remarks related to pointer aliasing.
Diagnostic information for optimization analysis remarks related to floating-point non-commutativity.
Diagnostic information for optimization analysis remarks.
The optimization diagnostic interface.
bool allowExtraAnalysis(StringRef PassName) const
Whether we allow for extra compile-time budget to perform more analysis to produce fewer false positi...
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for missed-optimization remarks.
Diagnostic information for applied optimization remarks.
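A minimal sketch of emitting a remark through the interface above, assuming ORE and L are available in the enclosing pass ("my-pass" and the remark text are placeholders):

#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/IR/DiagnosticInfo.h"

static void reportIllustrativeRemark(llvm::OptimizationRemarkEmitter &ORE,
                                     llvm::Loop *L) {
  ORE.emit([&]() {
    return llvm::OptimizationRemark("my-pass", "Vectorized", L->getStartLoc(),
                                    L->getHeader())
           << "loop vectorized (illustrative remark)";
  });
}

Passing a lambda keeps remark construction off the hot path: the callable only runs when remarks are enabled for the pass.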
An analysis over an "inner" IR unit that provides access to an analysis manager over an "outer" IR unit.
Definition: PassManager.h:692
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Value * getIncomingValueForBlock(const BasicBlock *BB) const
static unsigned getIncomingValueNumForOperand(unsigned i)
static PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
Definition: Constants.cpp:1878
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
const SCEVPredicate & getPredicate() const
unsigned getSmallConstantMaxTripCount()
Returns the upper bound of the loop trip count as a normal unsigned value, or 0 if the trip count is ...
const SCEV * getBackedgeTakenCount()
Get the (predicated) backedge count for the analyzed loop.
const SCEV * getSymbolicMaxBackedgeTakenCount()
Get the (predicated) symbolic max backedge count for the analyzed loop.
const SCEV * getSCEV(Value *V)
Returns the SCEV expression of V, in the context of the current SCEV predicate.
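A small sketch of the predicated-SCEV queries listed above, assuming PSE is the instance owned by the vectorizer:

#include "llvm/Analysis/ScalarEvolution.h"

static bool
hasPredicateFreeBackedgeCount(llvm::PredicatedScalarEvolution &PSE) {
  const llvm::SCEV *BTC = PSE.getBackedgeTakenCount();
  bool NoRuntimePredicates = PSE.getPredicate().isAlwaysTrue();
  return NoRuntimePredicates && !llvm::isa<llvm::SCEVCouldNotCompute>(BTC);
}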
A set of analyses that are preserved following a run of a transformation pass.
Definition: Analysis.h:111
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: Analysis.h:117
void preserveSet()
Mark an analysis set as preserved.
Definition: Analysis.h:146
void preserve()
Mark an analysis as preserved.
Definition: Analysis.h:131
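A sketch of the standard new-pass-manager idiom for reporting preserved analyses (MyLoopPass is a placeholder, not part of this file):

#include "llvm/IR/PassManager.h"

struct MyLoopPass : llvm::PassInfoMixin<MyLoopPass> {
  llvm::PreservedAnalyses run(llvm::Function &F,
                              llvm::FunctionAnalysisManager &AM) {
    bool Changed = false; // set by the (omitted) transformation
    if (!Changed)
      return llvm::PreservedAnalyses::all();
    llvm::PreservedAnalyses PA;
    PA.preserveSet<llvm::CFGAnalyses>(); // the CFG was left intact
    return PA;
  }
};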
An analysis pass based on the new PM to deliver ProfileSummaryInfo.
Analysis providing profile information.
bool hasProfileSummary() const
Returns true if profile summary is available.
The RecurrenceDescriptor is used to identify recurrence variables in a loop.
Definition: IVDescriptors.h:77
static bool isFMulAddIntrinsic(Instruction *I)
Returns true if the instruction is a call to the llvm.fmuladd intrinsic.
FastMathFlags getFastMathFlags() const
Instruction * getLoopExitInstr() const
static unsigned getOpcode(RecurKind Kind)
Returns the opcode corresponding to the RecurrenceKind.
Type * getRecurrenceType() const
Returns the type of the recurrence.
const SmallPtrSet< Instruction *, 8 > & getCastInsts() const
Returns a reference to the instructions used for type-promoting the recurrence.
unsigned getMinWidthCastToRecurrenceTypeInBits() const
Returns the minimum width used by the recurrence in bits.
TrackingVH< Value > getRecurrenceStartValue() const
SmallVector< Instruction *, 4 > getReductionOpChain(PHINode *Phi, Loop *L) const
Attempts to find a chain of operations from Phi to LoopExitInst that can be treated as a set of reduc...
static bool isAnyOfRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
static bool isFindLastIVRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
bool isSigned() const
Returns true if all source operands of the recurrence are SExtInsts.
RecurKind getRecurrenceKind() const
bool isOrdered() const
Expose an ordered FP reduction to the instance users.
Value * getSentinelValue() const
Returns the sentinel value for FindLastIV recurrences to replace the start value.
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
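A sketch of the recurrence queries above, assuming RdxDesc was populated by the legality analysis (isOrderedFPAddReduction is an illustrative helper):

#include "llvm/Analysis/IVDescriptors.h"

static bool
isOrderedFPAddReduction(const llvm::RecurrenceDescriptor &RdxDesc) {
  return RdxDesc.getRecurrenceKind() == llvm::RecurKind::FAdd &&
         RdxDesc.isOrdered() &&
         RdxDesc.getRecurrenceType()->isFloatingPointTy();
}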
bool Need
This flag indicates if we need to add the runtime check.
std::optional< ArrayRef< PointerDiffInfo > > getDiffChecks() const
const SmallVectorImpl< RuntimePointerCheck > & getChecks() const
Returns the checks that generateChecks created.
This class represents a constant integer value.
const APInt & getAPInt() const
Helper to remove instructions inserted during SCEV expansion, unless they are marked as used.
This class uses information from the scalar evolution analysis to rewrite expressions in canonical form.
ScalarEvolution * getSE()
bool isInsertedInstruction(Instruction *I) const
Return true if the specified instruction was inserted by the code rewriter.
Value * expandCodeForPredicate(const SCEVPredicate *Pred, Instruction *Loc)
Generates a code sequence that evaluates this predicate.
This class represents an assumption made using SCEV expressions which can be checked at run-time.
virtual bool isAlwaysTrue() const =0
Returns true if the predicate is always true.
This class represents an analyzed expression in the program.
bool isOne() const
Return true if the expression is a constant one.
bool isZero() const
Return true if the expression is a constant zero.
Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
const SCEV * getURemExpr(const SCEV *LHS, const SCEV *RHS)
Represents an unsigned remainder expression based on unsigned division.
const SCEV * getBackedgeTakenCount(const Loop *L, ExitCountKind Kind=Exact)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCould...
const SCEV * getConstant(ConstantInt *V)
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
const SCEV * getOne(Type *Ty)
Return a SCEV for the constant 1 of a specific type.
void forgetLoop(const Loop *L)
This method should be called by the client when it has changed a loop in a way that may affect Scalar...
bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
bool isSCEVable(Type *Ty) const
Test if values of the given type are analyzable within the SCEV framework.
void forgetValue(Value *V)
This method should be called by the client when it has changed a value in a way that may affect its v...
void forgetBlockAndLoopDispositions(Value *V=nullptr)
Called when the client has changed the disposition of values in a loop or block.
const SCEV * getMinusOne(Type *Ty)
Return a SCEV for the constant -1 of a specific type.
void forgetLcssaPhiWithNewPredecessor(Loop *L, PHINode *V)
Forget LCSSA phi node V of loop L to which a new predecessor was added, such that it may no longer be...
unsigned getSmallConstantTripCount(const Loop *L)
Returns the exact trip count of the loop if we can compute it, and the result is a small constant.
APInt getUnsignedRangeMax(const SCEV *S)
Determine the max of the unsigned range for a particular SCEV.
const SCEV * applyLoopGuards(const SCEV *Expr, const Loop *L)
Try to apply information from loop guards for L to Expr.
const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
bool isKnownPredicate(CmpPredicate Pred, const SCEV *LHS, const SCEV *RHS)
Test if the given expression is known to satisfy the condition described by Pred, LHS,...
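A sketch of the trip-count queries above, assuming SE and L come from the enclosing pass:

#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"

static bool hasComputableTripCount(llvm::ScalarEvolution &SE,
                                   const llvm::Loop *L) {
  if (SE.getSmallConstantTripCount(L))
    return true; // exact, small constant trip count
  const llvm::SCEV *BTC = SE.getBackedgeTakenCount(L);
  return !llvm::isa<llvm::SCEVCouldNotCompute>(BTC); // symbolic but computable
}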
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
Definition: SetVector.h:57
ArrayRef< value_type > getArrayRef() const
Definition: SetVector.h:84
size_type size() const
Determine the number of elements in the SetVector.
Definition: SetVector.h:98
iterator end()
Get an iterator to the end of the SetVector.
Definition: SetVector.h:113
size_type count(const key_type &key) const
Count the number of elements of a given key in the SetVector.
Definition: SetVector.h:264
bool empty() const
Determine if the SetVector is empty or not.
Definition: SetVector.h:93
iterator begin()
Get an iterator to the beginning of the SetVector.
Definition: SetVector.h:103
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:162
value_type pop_back_val()
Definition: SetVector.h:285
size_type size() const
Definition: SmallPtrSet.h:94
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:363
bool erase(PtrType Ptr)
Remove pointer from the set.
Definition: SmallPtrSet.h:401
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:452
iterator end() const
Definition: SmallPtrSet.h:477
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:384
iterator begin() const
Definition: SmallPtrSet.h:472
bool contains(ConstPtrType Ptr) const
Definition: SmallPtrSet.h:458
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:519
A SetVector that performs no allocations if smaller than a certain size.
Definition: SetVector.h:370
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:132
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:181
bool empty() const
Definition: SmallVector.h:81
size_t size() const
Definition: SmallVector.h:78
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:573
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:937
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:683
void push_back(const T &Elt)
Definition: SmallVector.h:413
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1196
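The ADT containers above appear throughout this file in the classic de-duplicating worklist pattern; a self-contained sketch (countReachableUsers is illustrative only):

#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Value.h"

static unsigned countReachableUsers(llvm::Value *Root) {
  llvm::SmallVector<llvm::Value *, 8> Worklist;
  llvm::SmallPtrSet<llvm::Value *, 8> Visited;
  Worklist.push_back(Root);
  while (!Worklist.empty()) {
    llvm::Value *V = Worklist.pop_back_val();
    if (!Visited.insert(V).second)
      continue; // already processed
    for (llvm::User *U : V->users())
      Worklist.push_back(U);
  }
  return Visited.size() - 1; // everything reachable except Root itself
}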
An instruction for storing to memory.
Definition: Instructions.h:292
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:51
Multiway switch.
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
std::optional< unsigned > getVScaleForTuning() const
bool isLegalMaskedScatter(Type *DataType, Align Alignment) const
Return true if the target supports masked scatter.
bool enableAggressiveInterleaving(bool LoopHasReductions) const
Don't restrict interleaved unrolling to small loops.
bool preferInLoopReduction(unsigned Opcode, Type *Ty, ReductionFlags Flags) const
bool supportsEfficientVectorElementLoadStore() const
If the target has efficient vector element load/store instructions, it can return true here so that inser...
bool prefersVectorizedAddressing() const
Return true if target doesn't mind addresses in vectors.
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueInfo Op1Info={OK_AnyValue, OP_None}, OperandValueInfo Op2Info={OK_AnyValue, OP_None}, const Instruction *I=nullptr) const
bool hasBranchDivergence(const Function *F=nullptr) const
Return true if branch divergence exists.
InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *SE=nullptr, const SCEV *Ptr=nullptr) const
void getUnrollingPreferences(Loop *L, ScalarEvolution &, UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const
Get target-customized preferences for the generic loop unrolling transformation.
InstructionCost getOperandsScalarizationOverhead(ArrayRef< const Value * > Args, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind) const
Estimate the overhead of scalarizing an instructions unique non-constant operands.
TypeSize getRegisterBitWidth(RegisterKind K) const
bool isLegalMaskedGather(Type *DataType, Align Alignment) const
Return true if the target supports masked gather.
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueInfo OpdInfo={OK_AnyValue, OP_None}, const Instruction *I=nullptr) const
std::optional< unsigned > getMaxVScale() const
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, bool UseMaskForCond=false, bool UseMaskForGaps=false) const
bool enableOrderedReductions() const
Return true if we should be enabling ordered reductions for the target.
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of vector reduction intrinsics.
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
static PartialReductionExtendKind getPartialReductionExtendKind(Instruction *I)
Get the kind of extension that an instruction represents.
bool shouldMaximizeVectorBandwidth(TargetTransformInfo::RegisterKind K) const
TailFoldingStyle getPreferredTailFoldingStyle(bool IVUpdateMayOverflow=true) const
Query the target what the preferred style of tail folding is.
unsigned getRegUsageForType(Type *Ty) const
Returns the estimated number of registers required to represent Ty.
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of an extended reduction pattern, similar to getArithmeticReductionCost of a reduc...
static OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
InstructionCost getMulAccReductionCost(bool IsUnsigned, Type *ResTy, VectorType *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of an extended reduction pattern, similar to getArithmeticReductionCost of an Add ...
unsigned getRegisterClassForType(bool Vector, Type *Ty=nullptr) const
bool isElementTypeLegalForScalableVector(Type *Ty) const
ElementCount getMinimumVF(unsigned ElemWidth, bool IsScalable) const
bool enableMaskedInterleavedAccessVectorization() const
Enable matching of interleaved access groups that contain predicated accesses or gaps and therefore v...
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF=FastMathFlags(), TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
@ TCK_Latency
The latency of instruction.
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr, const TargetLibraryInfo *TLibInfo=nullptr) const
This is an approximation of reciprocal throughput of a math/logic op.
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
bool preferPredicatedReductionSelect(unsigned Opcode, Type *Ty, ReductionFlags Flags) const
InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const
const char * getRegisterClassName(unsigned ClassID) const
bool preferEpilogueVectorization() const
Return true if the loop vectorizer should consider vectorizing an otherwise scalar epilogue loop.
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, const Instruction *I=nullptr) const
bool hasActiveVectorLength(unsigned Opcode, Type *DataType, Align Alignment) const
unsigned getEpilogueVectorizationMinVF() const
unsigned getNumberOfRegisters(unsigned ClassID) const
bool isFPVectorizationPotentiallyUnsafe() const
Indicate that it is potentially unsafe to automatically vectorize floating-point operations because t...
bool isLegalMaskedStore(Type *DataType, Align Alignment) const
Return true if the target supports masked store.
@ TCC_Free
Expected to fold away in lowering.
InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, ArrayRef< Value * > VL={}) const
Estimate the overhead of scalarizing an instruction.
bool enableInterleavedAccessVectorization() const
Enable matching of interleaved access groups.
unsigned getMinTripCountTailFoldingThreshold() const
InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TargetCostKind CostKind) const
Estimate the cost of a given IR user when lowered.
unsigned getMaxInterleaveFactor(ElementCount VF) const
unsigned getNumberOfParts(Type *Tp) const
InstructionCost getPartialReductionCost(unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType, ElementCount VF, PartialReductionExtendKind OpAExtend, PartialReductionExtendKind OpBExtend, std::optional< unsigned > BinOp=std::nullopt) const
bool isTruncateFree(Type *Ty1, Type *Ty2) const
Return true if it's free to truncate a value of type Ty1 to type Ty2.
bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) const
Query the target whether it would be preferred to create a predicated vector loop, which can avoid the...
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index=-1, Value *Op0=nullptr, Value *Op1=nullptr) const
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_Reverse
Reverse the order of the vector.
InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency) const
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
CastContextHint
Represents a hint about the context in which a cast is used.
@ Reversed
The cast is used with a reversed load/store.
@ Masked
The cast is used with a masked load/store.
@ None
The cast is not used with a load/store of any kind.
@ Normal
The cast is used with a normal load/store.
@ Interleave
The cast is used with an interleaved load/store.
@ GatherScatter
The cast is used with a gather/scatter.
bool isLegalMaskedLoad(Type *DataType, Align Alignment) const
Return true if the target supports masked load.
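A sketch of typical cost queries against the TTI interface above; the opcodes, alignment and address space are illustrative values only:

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/Support/Alignment.h"

static llvm::InstructionCost
costWidenedAddAndLoad(const llvm::TargetTransformInfo &TTI, llvm::Type *ElemTy,
                      llvm::ElementCount VF) {
  auto *VecTy = llvm::VectorType::get(ElemTy, VF);
  llvm::InstructionCost Cost = 0;
  Cost += TTI.getArithmeticInstrCost(
      llvm::Instruction::Add, VecTy,
      llvm::TargetTransformInfo::TCK_RecipThroughput);
  Cost += TTI.getMemoryOpCost(llvm::Instruction::Load, VecTy, llvm::Align(4),
                              /*AddressSpace=*/0,
                              llvm::TargetTransformInfo::TCK_RecipThroughput);
  return Cost;
}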
This class represents a truncation of integer types.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
This class implements a switch-like dispatch statement for a value of 'T' using dyn_cast functionalit...
Definition: TypeSwitch.h:87
TypeSwitch< T, ResultT > & Case(CallableT &&caseFn)
Add a case on the given type.
Definition: TypeSwitch.h:96
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
unsigned getIntegerBitWidth() const
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:270
static IntegerType * getInt1Ty(LLVMContext &C)
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
static Type * getVoidTy(LLVMContext &C)
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:128
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:184
bool isIntOrPtrTy() const
Return true if this is an integer type or a pointer type.
Definition: Type.h:252
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:237
bool isTokenTy() const
Return true if this is 'token'.
Definition: Type.h:234
bool isVoidTy() const
Return true if this is 'void'.
Definition: Type.h:139
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:355
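A sketch of the scalar-type queries above, of the kind the cost model performs when collecting element types for widening (isWidenableElementType is a hypothetical helper):

#include "llvm/IR/Type.h"

static bool isWidenableElementType(llvm::Type *Ty) {
  llvm::Type *ScalarTy = Ty->getScalarType(); // element type if Ty is a vector
  if (ScalarTy->isFloatingPointTy())
    return true;
  return ScalarTy->isIntegerTy() && ScalarTy->getScalarSizeInBits() <= 64;
}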
This function has undefined behavior.
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
op_range operands()
Definition: User.h:288
bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
Definition: User.cpp:21
op_iterator op_begin()
Definition: User.h:280
void setOperand(unsigned i, Value *Val)
Definition: User.h:233
Value * getOperand(unsigned i) const
Definition: User.h:228
op_iterator op_end()
Definition: User.h:282
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
Definition: VectorUtils.h:72
VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph.
Definition: VPlan.h:3536
void appendRecipe(VPRecipeBase *Recipe)
Augment the existing recipes of a VPBasicBlock with an additional Recipe as the last recipe.
Definition: VPlan.h:3611
RecipeListTy::iterator iterator
Instruction iterators...
Definition: VPlan.h:3563
void execute(VPTransformState *State) override
The method which generates the output IR instructions that correspond to this VPBasicBlock,...
Definition: VPlan.cpp:480
iterator end()
Definition: VPlan.h:3573
iterator begin()
Recipe iterator methods.
Definition: VPlan.h:3571
iterator_range< iterator > phis()
Returns an iterator range over the PHI-like recipes in the block.
Definition: VPlan.h:3624
iterator getFirstNonPhi()
Return the position of the first non-phi node recipe in the block.
Definition: VPlan.cpp:208
void insert(VPRecipeBase *Recipe, iterator InsertPt)
Definition: VPlan.h:3602
bool empty() const
Definition: VPlan.h:3582
A recipe for vectorizing a phi-node as a sequence of mask-based select instructions.
Definition: VPlan.h:2494
VPBlockBase is the building block of the Hierarchical Control-Flow Graph.
Definition: VPlan.h:397
VPRegionBlock * getParent()
Definition: VPlan.h:489
const VPBasicBlock * getExitingBasicBlock() const
Definition: VPlan.cpp:178
void setName(const Twine &newName)
Definition: VPlan.h:482
size_t getNumSuccessors() const
Definition: VPlan.h:535
void swapSuccessors()
Swap successors of the block. The block must have exactly 2 successors.
Definition: VPlan.h:628
const VPBlocksTy & getPredecessors() const
Definition: VPlan.h:520
VPlan * getPlan()
Definition: VPlan.cpp:153
VPBlockBase * getSinglePredecessor() const
Definition: VPlan.h:531
const VPBasicBlock * getEntryBasicBlock() const
Definition: VPlan.cpp:158
VPBlockBase * getSingleSuccessor() const
Definition: VPlan.h:525
const VPBlocksTy & getSuccessors() const
Definition: VPlan.h:514
static void insertBlockAfter(VPBlockBase *NewBlock, VPBlockBase *BlockPtr)
Insert disconnected VPBlockBase NewBlock after BlockPtr.
Definition: VPlan.h:4219
static void insertOnEdge(VPBlockBase *From, VPBlockBase *To, VPBlockBase *BlockPtr)
Inserts BlockPtr on the edge between From and To.
Definition: VPlan.h:4335
static void connectBlocks(VPBlockBase *From, VPBlockBase *To, unsigned PredIdx=-1u, unsigned SuccIdx=-1u)
Connect VPBlockBases From and To bi-directionally.
Definition: VPlan.h:4273
static void reassociateBlocks(VPBlockBase *Old, VPBlockBase *New)
Reassociate all the blocks connected to Old so that they now point to New.
Definition: VPlan.h:4300
RAII object that stores the current insertion point and restores it when the object is destroyed.
VPlan-based builder utility analogous to IRBuilder.
VPValue * createICmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B, DebugLoc DL={}, const Twine &Name="")
Create a new ICmp VPInstruction with predicate Pred and operands A and B.
VPValue * createOr(VPValue *LHS, VPValue *RHS, DebugLoc DL={}, const Twine &Name="")
VPBasicBlock * getInsertBlock() const
VPDerivedIVRecipe * createDerivedIV(InductionDescriptor::InductionKind Kind, FPMathOperator *FPBinOp, VPValue *Start, VPValue *Current, VPValue *Step, const Twine &Name="")
Convert the input value Current to the corresponding value of an induction with Start and Step values...
VPScalarCastRecipe * createScalarCast(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy, DebugLoc DL)
VPInstruction * createOverflowingOp(unsigned Opcode, std::initializer_list< VPValue * > Operands, VPRecipeWithIRFlags::WrapFlagsTy WrapFlags, DebugLoc DL={}, const Twine &Name="")
VPInstruction * createNaryOp(unsigned Opcode, ArrayRef< VPValue * > Operands, Instruction *Inst=nullptr, const Twine &Name="")
Create an N-ary operation with Opcode, Operands and set Inst as its underlying Instruction.
VPValue * createNot(VPValue *Operand, DebugLoc DL={}, const Twine &Name="")
VPValue * createLogicalAnd(VPValue *LHS, VPValue *RHS, DebugLoc DL={}, const Twine &Name="")
VPValue * createSelect(VPValue *Cond, VPValue *TrueVal, VPValue *FalseVal, DebugLoc DL={}, const Twine &Name="", std::optional< FastMathFlags > FMFs=std::nullopt)
void setInsertPoint(VPBasicBlock *TheBB)
This specifies that created VPInstructions should be appended to the end of the specified block.
Canonical scalar induction phi of the vector loop.
Definition: VPlan.h:3233
Type * getScalarType() const
Returns the scalar type of the induction.
Definition: VPlan.h:3264
VPValue * getVPSingleValue()
Returns the only VPValue defined by the VPDef.
Definition: VPlanValue.h:394
void execute(VPTransformState &State) override
Generate the transformed value of the induction at offset StartValue (1.
VPValue * getStepValue() const
Definition: VPlan.h:3466
VPValue * getStartValue() const
Definition: VPlan.h:3465
A pure virtual base class for all recipes modeling header phis, including phis for first order recurr...
Definition: VPlan.h:2033
virtual VPValue * getBackedgeValue()
Returns the incoming value from the loop backedge.
Definition: VPlan.h:2081
VPValue * getStartValue()
Returns the start value of the phi, if one is set.
Definition: VPlan.h:2070
A recipe representing a sequence of load -> update -> store as part of a histogram operation.
Definition: VPlan.h:1783
A special type of VPBasicBlock that wraps an existing IR basic block.
Definition: VPlan.h:3678
A recipe to wrap an original IR instruction that is not to be modified during execution, except for PHIs.
Definition: VPlan.h:1382
This is a concrete Recipe that models a single VPlan-level instruction.
Definition: VPlan.h:1194
@ ResumePhi
Creates a scalar phi in a leaf VPBB with a single predecessor in VPlan.
Definition: VPlan.h:1212
VPInterleaveRecipe is a recipe for transforming an interleave group of load or stores into one wide l...
Definition: VPlan.h:2561
In what follows, the term "input IR" refers to code that is fed into the vectorizer whereas the term ...
Definition: VPlan.h:153
static VPLane getLastLaneForVF(const ElementCount &VF)
Definition: VPlan.h:194
static VPLane getFirstLane()
Definition: VPlan.h:178
A recipe for forming partial reductions.
Definition: VPlan.h:2451
VPRecipeBase is a base class modeling a sequence of one or more output IR instructions.
Definition: VPlan.h:716
VPBasicBlock * getParent()
Definition: VPlan.h:741
DebugLoc getDebugLoc() const
Returns the debug location of the recipe.
Definition: VPlan.h:810
void insertBefore(VPRecipeBase *InsertPos)
Insert an unlinked recipe into a basic block immediately before the specified recipe.
void insertAfter(VPRecipeBase *InsertPos)
Insert an unlinked Recipe into a basic block immediately after the specified Recipe.
iplist< VPRecipeBase >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Helper class to create VPRecipes from IR instructions.
VPRecipeBase * tryToCreatePartialReduction(Instruction *Reduction, ArrayRef< VPValue * > Operands)
Create and return a partial reduction recipe for a reduction instruction along with binary operation ...
VPValue * createEdgeMask(BasicBlock *Src, BasicBlock *Dst)
A helper function that computes the predicate of the edge between SRC and DST.
VPReplicateRecipe * handleReplication(Instruction *I, VFRange &Range)
Build a VPReplicateRecipe for I.
void createSwitchEdgeMasks(SwitchInst *SI)
Create an edge mask for every destination of cases and/or default.
VPValue * getBlockInMask(BasicBlock *BB) const
Returns the entry mask for the block BB.
VPValue * getEdgeMask(BasicBlock *Src, BasicBlock *Dst) const
A helper that returns the previously computed predicate of the edge between SRC and DST.
iterator_range< mapped_iterator< Use *, std::function< VPValue *(Value *)> > > mapToVPValues(User::op_range Operands)
Returns a range mapping the values of the range Operands to their corresponding VPValues.
void fixHeaderPhis()
Add the incoming values from the backedge to reduction & first-order recurrence cross-iteration phis.
VPRecipeBase * tryToCreateWidenRecipe(Instruction *Instr, ArrayRef< VPValue * > Operands, VFRange &Range, VPBasicBlock *VPBB)
Create and return a widened recipe for I if one can be created within the given VF Range.
VPValue * getVPValueOrAddLiveIn(Value *V)
void createHeaderMask()
Create the mask for the vector loop header block.
std::optional< unsigned > getScalingForReduction(const Instruction *ExitInst)
void createBlockInMask(BasicBlock *BB)
A helper function that computes the predicate of the block BB, assuming that the header block of the ...
void collectScaledReductions(VFRange &Range)
Find all possible partial reductions in the loop and track all of those that are valid so recipes can...
VPRecipeBase * getRecipe(Instruction *I)
Return the recipe created for given ingredient.
void setFlags(Instruction *I) const
Set the IR flags for I.
Definition: VPlan.h:1108
A recipe for handling reduction phis.
Definition: VPlan.h:2385
bool isInLoop() const
Returns true, if the phi is part of an in-loop reduction.
Definition: VPlan.h:2444
const RecurrenceDescriptor & getRecurrenceDescriptor() const
Definition: VPlan.h:2436
A recipe to represent inloop reduction operations, performing a reduction on a vector operand into a ...
Definition: VPlan.h:2656
VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks which form a Single-Entry-S...
Definition: VPlan.h:3713
const VPBlockBase * getEntry() const
Definition: VPlan.h:3749
bool isReplicator() const
An indicator whether this region is to generate multiple replicated instances of output IR correspond...
Definition: VPlan.h:3781
VPReplicateRecipe replicates a given instruction producing multiple scalar copies of the original sca...
Definition: VPlan.h:2777
void execute(VPTransformState &State) override
Generate replicas of the desired Ingredient.
bool isUniform() const
Definition: VPlan.h:2821
bool shouldPack() const
Returns true if the recipe is used by a widened recipe via an intervening VPPredInstPHIRecipe.
A recipe to compute the pointers for widened memory accesses of IndexTy in reverse order.
Definition: VPlan.h:1910
VPSingleDef is a base class for recipes for modeling a sequence of one or more output IR that define ...
Definition: VPlan.h:843
Instruction * getUnderlyingInstr()
Returns the underlying instruction.
Definition: VPlan.h:913
An analysis for type-inference for VPValues.
Definition: VPlanAnalysis.h:40
Type * inferScalarType(const VPValue *V)
Infer the type of V. Returns the scalar type of V.
This class augments VPValue with operands which provide the inverse def-use edges from VPValue's user...
Definition: VPlanValue.h:206
operand_range operands()
Definition: VPlanValue.h:263
void setOperand(unsigned I, VPValue *New)
Definition: VPlanValue.h:248
unsigned getNumOperands() const
Definition: VPlanValue.h:242
VPValue * getOperand(unsigned N) const
Definition: VPlanValue.h:243
void addOperand(VPValue *Operand)
Definition: VPlanValue.h:237
VPRecipeBase * getDefiningRecipe()
Returns the recipe defining this VPValue or nullptr if it is not defined by a recipe,...
Definition: VPlan.cpp:123
void replaceAllUsesWith(VPValue *New)
Definition: VPlan.cpp:1420
Value * getLiveInIRValue()
Returns the underlying IR value, if this VPValue is defined outside the scope of VPlan.
Definition: VPlanValue.h:178
void replaceUsesWithIf(VPValue *New, llvm::function_ref< bool(VPUser &U, unsigned Idx)> ShouldReplace)
Go through the uses list for this VPValue and make each use point to New if the callback ShouldReplac...
Definition: VPlan.cpp:1424
user_range users()
Definition: VPlanValue.h:138
A recipe to compute the pointers for widened memory accesses of IndexTy.
Definition: VPlan.h:1963
A recipe for widening Call instructions using library calls.
Definition: VPlan.h:1727
A Recipe for widening the canonical induction variable of the vector loop.
Definition: VPlan.h:3374
VPWidenCastRecipe is a recipe to create vector cast instructions.
Definition: VPlan.h:1535
A recipe for handling GEP instructions.
Definition: VPlan.h:1861
Base class for widened induction (VPWidenIntOrFpInductionRecipe and VPWidenPointerInductionRecipe),...
Definition: VPlan.h:2095
VPValue * getStepValue()
Returns the step value of the induction.
Definition: VPlan.h:2123
const InductionDescriptor & getInductionDescriptor() const
Returns the induction descriptor for the recipe.
Definition: VPlan.h:2129
A recipe for handling phi nodes of integer and floating-point inductions, producing their vector valu...
Definition: VPlan.h:2148
A recipe for widening vector intrinsics.
Definition: VPlan.h:1635
A common base class for widening memory operations.
Definition: VPlan.h:2950
A recipe for handling phis that are widened in the vector loop.
Definition: VPlan.h:2308
VPValue * getIncomingValue(unsigned I)
Returns the I th incoming VPValue.
Definition: VPlan.h:2347
VPBasicBlock * getIncomingBlock(unsigned I)
Returns the I th incoming VPBasicBlock.
Definition: VPlan.h:2344
VPWidenRecipe is a recipe for producing a widened instruction using the opcode and operands of the re...
Definition: VPlan.h:1437
Main class to build the VPlan H-CFG for an incoming IR.
VPlan models a candidate for vectorization, encoding various decisions taken to produce efficient outp...
Definition: VPlan.h:3812
void prepareToExecute(Value *TripCount, Value *VectorTripCount, VPTransformState &State)
Prepare the plan for execution, setting up the required live-in values.
Definition: VPlan.cpp:924
VPBasicBlock * getEntry()
Definition: VPlan.h:3925
VPValue & getVectorTripCount()
The vector trip count.
Definition: VPlan.h:3987
VPValue & getVFxUF()
Returns VF * UF of the vector loop region.
Definition: VPlan.h:3993
VPValue & getVF()
Returns the VF of the vector loop region.
Definition: VPlan.h:3990
VPValue * getTripCount() const
The trip count of the original loop.
Definition: VPlan.h:3966
VPValue * getOrCreateBackedgeTakenCount()
The backedge taken count of the original loop.
Definition: VPlan.h:3980
iterator_range< SmallSetVector< ElementCount, 2 >::iterator > vectorFactors() const
Returns an iterator range over all VFs of the plan.
Definition: VPlan.h:4010
unsigned getUF() const
Definition: VPlan.h:4018
static VPlanPtr createInitialVPlan(Type *InductionTy, PredicatedScalarEvolution &PSE, bool RequiresScalarEpilogueCheck, bool TailFolded, Loop *TheLoop)
Create initial VPlan, having an "entry" VPBasicBlock (wrapping original scalar pre-header) which cont...
Definition: VPlan.cpp:845
bool hasVF(ElementCount VF)
Definition: VPlan.h:4003
bool hasUF(unsigned UF) const
Definition: VPlan.h:4016
auto getExitBlocks()
Return an iterator range over the VPIRBasicBlock wrapping the exit blocks of the VPlan,...
Definition: VPlanCFG.h:309
VPRegionBlock * getVectorLoopRegion()
Returns the VPRegionBlock of the vector loop.
Definition: VPlan.cpp:1052
InstructionCost cost(ElementCount VF, VPCostContext &Ctx)
Return the cost of this plan.
Definition: VPlan.cpp:1046
const VPBasicBlock * getMiddleBlock() const
Returns the 'middle' block of the plan, that is the block that selects whether to execute the scalar ...
Definition: VPlan.h:3944
void resetTripCount(VPValue *NewTripCount)
Resets the trip count for the VPlan.
Definition: VPlan.h:3973
void setEntry(VPBasicBlock *VPBB)
Definition: VPlan.h:3895
VPIRBasicBlock * createVPIRBasicBlock(BasicBlock *IRBB)
Create a VPIRBasicBlock from IRBB containing VPIRInstructions for all instructions in IRBB,...
Definition: VPlan.cpp:1252
VPValue * getOrAddLiveIn(Value *V)
Gets the live-in VPValue for V or adds a new live-in (if none exists yet) for V.
Definition: VPlan.h:4036
VPBasicBlock * getScalarPreheader() const
Return the VPBasicBlock for the preheader of the scalar loop.
Definition: VPlan.h:3952
void execute(VPTransformState *State)
Generate the IR code for this VPlan.
Definition: VPlan.cpp:956
VPCanonicalIVPHIRecipe * getCanonicalIV()
Returns the canonical induction recipe of the vector loop.
Definition: VPlan.h:4070
VPIRBasicBlock * getScalarHeader() const
Return the VPIRBasicBlock wrapping the header of the scalar loop.
Definition: VPlan.h:3957
VPlan * duplicate()
Clone the current VPlan, update all VPValues of the new VPlan and cloned recipes to refer to the clon...
Definition: VPlan.cpp:1192
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
bool hasOneUser() const
Return true if there is exactly one user of this value.
Definition: Value.cpp:157
void setName(const Twine &Name)
Change the name of the value.
Definition: Value.cpp:377
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:534
iterator_range< user_iterator > users()
Definition: Value.h:421
void replaceUsesWithIf(Value *New, llvm::function_ref< bool(Use &U)> ShouldReplace)
Go through the uses list for this definition and make each use point to "V" if the callback ShouldRep...
Definition: Value.cpp:542
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:1075
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
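A sketch of the Value-level bookkeeping above, as used when a scalar result is replaced by its widened counterpart (replaceAndName is illustrative only):

#include "llvm/ADT/Twine.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Value.h"

static void replaceAndName(llvm::Instruction *Old, llvm::Value *New) {
  New->setName(Old->getName() + ".widened"); // carry the old name forward
  Old->replaceAllUsesWith(New);              // rewrite every use of Old
}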
static VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
static bool isValidElementType(Type *ElemTy)
Return true if the specified type is valid as a element type.
int getNumOccurrences() const
Definition: CommandLine.h:399
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:213
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition: DenseSet.h:193
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:202
static constexpr bool isKnownLE(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition: TypeSize.h:232
constexpr bool isNonZero() const
Definition: TypeSize.h:158
static constexpr bool isKnownLT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition: TypeSize.h:218
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition: TypeSize.h:171
constexpr LeafTy multiplyCoefficientBy(ScalarTy RHS) const
Definition: TypeSize.h:258
constexpr bool isFixed() const
Returns true if the quantity is not scaled by vscale.
Definition: TypeSize.h:174
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition: TypeSize.h:168
constexpr bool isZero() const
Definition: TypeSize.h:156
static constexpr bool isKnownGT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition: TypeSize.h:225
constexpr LeafTy divideCoefficientBy(ScalarTy RHS) const
We do not provide the '/' operator here because division for polynomial types does not work in the sa...
Definition: TypeSize.h:254
static constexpr bool isKnownGE(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition: TypeSize.h:239
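A sketch of the fixed versus scalable quantities above and the "known" comparisons used when clamping VF ranges:

#include "llvm/Support/TypeSize.h"

static void elementCountExamples() {
  llvm::ElementCount Fixed4 = llvm::ElementCount::getFixed(4);   // <4 x Ty>
  llvm::ElementCount Scal2 = llvm::ElementCount::getScalable(2); // <vscale x 2 x Ty>
  bool IsScalable = Scal2.isScalable();                          // true
  // "Known" comparisons only hold when true for every possible vscale.
  bool KnownLE =
      llvm::ElementCount::isKnownLE(llvm::ElementCount::getFixed(2), Fixed4);
  (void)IsScalable;
  (void)KnownLE; // true
}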
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
Definition: ilist_node.h:32
A range adaptor for a pair of iterators.
IteratorT end() const
IteratorT begin() const
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:52
A raw_ostream that writes to an std::string.
Definition: raw_ostream.h:661
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:125
@ Entry
Definition: COFF.h:844
@ Tail
Attempts to make calls as fast as possible while guaranteeing that tail call optimization can always b...
Definition: CallingConv.h:76
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
ID ArrayRef< Type * > Tys
Definition: Intrinsics.h:102
std::variant< std::monostate, Loc::Single, Loc::Multi, Loc::MMI, Loc::EntryValue > Variant
Alias for the std::variant specialization base class of DbgVariable.
Definition: DwarfDebug.h:190
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
Definition: PatternMatch.h:982
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
Definition: PatternMatch.h:826
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
Definition: PatternMatch.h:885
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
OneUse_match< T > m_OneUse(const T &SubPattern)
Definition: PatternMatch.h:67
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
SpecificCmpClass_match< LHS, RHS, ICmpInst > m_SpecificICmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
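As an illustration of the PatternMatch helpers listed above, a sketch (isWideningMul is a made-up name) that recognizes a single-use multiply of zero- or sign-extended operands:
#include "llvm/IR/PatternMatch.h"
using namespace llvm;
using namespace llvm::PatternMatch;

// Sketch: does V compute a one-use mul of two zext/sext'd values? Capture the
// narrow operands in A and B on success.
static bool isWideningMul(Value *V, Value *&A, Value *&B) {
  return match(V, m_OneUse(m_Mul(m_ZExtOrSExt(m_Value(A)),
                                 m_ZExtOrSExt(m_Value(B)))));
}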
ValuesClass values(OptsTy... Options)
Helper to build a ValuesClass by forwarding a variable number of arguments as an initializer list to ...
Definition: CommandLine.h:711
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
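cl::init and cl::desc combine with cl::opt in the same way this pass defines its own flags; the option below is purely hypothetical and exists only to show the shape:
#include "llvm/Support/CommandLine.h"
using namespace llvm;

// Sketch only: "example-small-loop-threshold" is a made-up flag, not one defined by this pass.
static cl::opt<unsigned> ExampleSmallLoopThreshold(
    "example-small-loop-threshold", cl::init(16), cl::Hidden,
    cl::desc("Hypothetical trip-count threshold, for illustration only"));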
DiagnosticInfoOptimizationBase::Argument NV
NodeAddr< InstrNode * > Instr
Definition: RDFGraph.h:389
NodeAddr< PhiNode * > Phi
Definition: RDFGraph.h:390
const_iterator begin(StringRef path LLVM_LIFETIME_BOUND, Style style=Style::native)
Get begin iterator over path.
Definition: Path.cpp:226
const_iterator end(StringRef path LLVM_LIFETIME_BOUND)
Get end iterator over path.
Definition: Path.cpp:235
bool isUniformAfterVectorization(const VPValue *VPV)
Returns true if VPV is uniform after vectorization.
Definition: VPlanUtils.h:39
VPValue * getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr, ScalarEvolution &SE)
Get or create a VPValue that corresponds to the expansion of Expr.
Definition: VPlanUtils.cpp:26
const SCEV * getSCEVExprForVPValue(VPValue *V, ScalarEvolution &SE)
Return the SCEV expression for V.
Definition: VPlanUtils.cpp:65
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
bool simplifyLoop(Loop *L, DominatorTree *DT, LoopInfo *LI, ScalarEvolution *SE, AssumptionCache *AC, MemorySSAUpdater *MSSAU, bool PreserveLCSSA)
Simplify each loop in a loop nest recursively.
void ReplaceInstWithInst(BasicBlock *BB, BasicBlock::iterator &BI, Instruction *I)
Replace the instruction specified by BI with the instruction specified by I.
@ Offset
Definition: DWP.cpp:480
Value * addRuntimeChecks(Instruction *Loc, Loop *TheLoop, const SmallVectorImpl< RuntimePointerCheck > &PointerChecks, SCEVExpander &Expander, bool HoistRuntimeChecks=false)
Add code that checks at runtime if the accessed arrays in PointerChecks overlap.
Definition: LoopUtils.cpp:1954
bool RemoveRedundantDbgInstrs(BasicBlock *BB)
Try to remove redundant dbg.value instructions from given basic block.
std::optional< unsigned > getLoopEstimatedTripCount(Loop *L, unsigned *EstimatedLoopInvocationWeight=nullptr)
Returns a loop's estimated trip count based on branch weight metadata.
Definition: LoopUtils.cpp:850
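A sketch of how the estimated trip count can gate a decision; the threshold and the helper name are hypothetical, not the cost model's actual logic:
#include "llvm/Transforms/Utils/LoopUtils.h"
#include <optional>
using namespace llvm;

// Sketch: treat a loop as worth vectorizing only if profile data suggests it
// runs at least a few iterations (8 is an arbitrary example threshold).
static bool hasUsefulTripCount(Loop *L) {
  std::optional<unsigned> EstimatedTC = getLoopEstimatedTripCount(L);
  return EstimatedTC && *EstimatedTC >= 8;
}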
static void reportVectorization(OptimizationRemarkEmitter *ORE, Loop *TheLoop, VectorizationFactor VF, unsigned IC)
Report successful vectorization of the loop.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1739
unsigned getLoadStoreAddressSpace(const Value *I)
A helper function that returns the address space of the pointer operand of a load or store instruction.
Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
Definition: LoopUtils.cpp:989
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1697
Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition: STLExtras.h:2448
auto pred_end(const MachineBasicBlock *BB)
bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
Definition: Verifier.cpp:7297
auto successors(const MachineBasicBlock *BB)
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
std::pair< Instruction *, ElementCount > InstructionVFPair
Value * getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF)
Return the runtime value for VF.
bool formLCSSARecursively(Loop &L, const DominatorTree &DT, const LoopInfo *LI, ScalarEvolution *SE)
Put a loop nest into LCSSA form.
Definition: LCSSA.cpp:465
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
std::optional< MDNode * > makeFollowupLoopID(MDNode *OrigLoopID, ArrayRef< StringRef > FollowupAttrs, const char *InheritOptionsAttrsPrefix="", bool AlwaysNew=false)
Create a new loop identifier for a loop created from a loop transformation.
Definition: LoopUtils.cpp:263
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition: STLExtras.h:2115
bool shouldOptimizeForSize(const MachineFunction *MF, ProfileSummaryInfo *PSI, const MachineBlockFrequencyInfo *BFI, PGSOQueryType QueryType=PGSOQueryType::Other)
Returns true if machine function MF is suggested to be size-optimized based on the profile.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition: STLExtras.h:657
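make_early_inc_range pairs naturally with wouldInstructionBeTriviallyDead (listed further below) when erasing instructions while iterating; a minimal sketch:
#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;

// Sketch: erase trivially dead instructions without invalidating the iterator,
// because the iterator is advanced before the current instruction is removed.
static void removeTriviallyDead(BasicBlock &BB) {
  for (Instruction &I : make_early_inc_range(BB))
    if (wouldInstructionBeTriviallyDead(&I))
      I.eraseFromParent();
}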
Align getLoadStoreAlignment(const Value *I)
A helper function that returns the alignment of a load or store instruction.
iterator_range< df_iterator< VPBlockShallowTraversalWrapper< VPBlockBase * > > > vp_depth_first_shallow(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order.
Definition: VPlanCFG.h:214
bool VerifySCEV
iterator_range< df_iterator< VPBlockDeepTraversalWrapper< VPBlockBase * > > > vp_depth_first_deep(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order while traversing t...
Definition: VPlanCFG.h:226
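A sketch of the two VPlan traversal helpers in use, walking every VPBasicBlock reachable from a plan's entry; the include paths assume the vectorizer's internal headers, as in this file:
#include "VPlan.h"
#include "VPlanCFG.h"
using namespace llvm;

// Sketch: visit every VPBasicBlock in the plan, including blocks nested inside regions.
static void visitAllVPBasicBlocks(VPlan &Plan) {
  for (VPBlockBase *Block : vp_depth_first_deep(Plan.getEntry()))
    if (auto *VPBB = dyn_cast<VPBasicBlock>(Block)) {
      // ... inspect or transform the recipes in VPBB ...
      (void)VPBB;
    }
}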
auto map_range(ContainerTy &&C, FuncTy F)
Definition: STLExtras.h:377
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1746
void collectEphemeralRecipesForVPlan(VPlan &Plan, DenseSet< VPRecipeBase * > &EphRecipes)
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:420
void setBranchWeights(Instruction &I, ArrayRef< uint32_t > Weights, bool IsExpected)
Create a new branch_weights metadata node and add or overwrite a prof metadata reference to instructi...
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:293
cl::opt< bool > EnableVPlanNativePath("enable-vplan-native-path", cl::Hidden, cl::desc("Enable VPlan-native vectorization path with " "support for outer loop vectorization."))
Definition: VPlan.cpp:53
void sort(IteratorTy Start, IteratorTy End)
Definition: STLExtras.h:1664
std::unique_ptr< VPlan > VPlanPtr
Definition: VPlan.h:144
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1753
cl::opt< bool > EnableLoopVectorization
bool wouldInstructionBeTriviallyDead(const Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction would have no side effects if it was not used.
Definition: Local.cpp:425
bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr, bool UseVariableInfo=true)
Return true if the instruction does not have any effects besides calculating the result and does not ...
iterator_range< filter_iterator< detail::IterOfRange< RangeT >, PredicateT > > make_filter_range(RangeT &&Range, PredicateT Pred)
Convenience function that takes a range of elements and a predicate, and return a new filter_iterator...
Definition: STLExtras.h:573
void llvm_unreachable_internal(const char *msg=nullptr, const char *file=nullptr, unsigned line=0)
This function calls abort(), and prints the optional message to stderr.
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition: Format.h:125
auto drop_end(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the last N elements excluded.
Definition: STLExtras.h:336
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:405
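divideCeil is the usual way to turn a scalar trip count into a number of vector iterations; a trivial sketch with hypothetical arguments:
#include "llvm/Support/MathExtras.h"
#include <cstdint>
using namespace llvm;

// Sketch: 10 iterations at VF=4 need divideCeil(10, 4) == 3 vector iterations.
static uint64_t numVectorIterations(uint64_t TripCount, uint64_t VFLanes) {
  return divideCeil(TripCount, VFLanes);
}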
TargetTransformInfo TTI
static void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag, OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I=nullptr, DebugLoc DL={})
Reports an informative message: print Msg for debugging purposes as well as an optimization remark.
bool isAssignmentTrackingEnabled(const Module &M)
Return true if assignment tracking is enabled for module M.
Definition: DebugInfo.cpp:2299
RecurKind
These are the kinds of recurrences that we support.
Definition: IVDescriptors.h:33
@ Or
Bitwise or logical OR of integers.
@ FMulAdd
Sum of float products with llvm.fmuladd(a * b + sum).
void setProfileInfoAfterUnrolling(Loop *OrigLoop, Loop *UnrolledLoop, Loop *RemainderLoop, uint64_t UF)
Set weights for UnrolledLoop and RemainderLoop based on weights for OrigLoop and the following distri...
Definition: LoopUtils.cpp:1761
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
void reportVectorizationFailure(const StringRef DebugMsg, const StringRef OREMsg, const StringRef ORETag, OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I=nullptr)
Reports a vectorization failure: print DebugMsg for debugging purposes along with the corresponding o...
DWARFExpression::Operation Op
ScalarEpilogueLowering
@ CM_ScalarEpilogueNotAllowedLowTripLoop
@ CM_ScalarEpilogueNotNeededUsePredicate
@ CM_ScalarEpilogueNotAllowedOptSize
@ CM_ScalarEpilogueAllowed
@ CM_ScalarEpilogueNotAllowedUsePredicate
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition: STLExtras.h:1945
auto pred_begin(const MachineBasicBlock *BB)
Value * createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF, int64_t Step)
Return a value for Step multiplied by VF.
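A sketch of createStepForVF (and its sibling getRuntimeVF above): for a fixed VF it folds to a constant, for a scalable VF it emits vscale-based arithmetic. The insertion point, VF and UF are hypothetical, and the include assumes the declaration comes from the vectorizer's internal VPlan.h, as in this file:
#include "VPlan.h"
#include "llvm/IR/IRBuilder.h"
using namespace llvm;

// Sketch: materialize the index step VF * UF before InsertPt.
static Value *emitIndexStep(Instruction *InsertPt, ElementCount VF, unsigned UF) {
  IRBuilder<> Builder(InsertPt);
  return createStepForVF(Builder, Builder.getInt64Ty(), VF, /*Step=*/UF);
}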
BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="", bool Before=false)
Split the specified block at the specified instruction.
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1766
auto predecessors(const MachineBasicBlock *BB)
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1903
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
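The four getLoadStore* helpers listed in this section (type, pointer operand, alignment, address space) are typically queried together; a sketch assuming I is already known to be a load or store:
#include "llvm/IR/Instructions.h"
using namespace llvm;

// Sketch: collect the memory-access properties of a load or store instruction I.
static void describeAccess(const Instruction *I) {
  Type *AccessTy = getLoadStoreType(I);              // scalar type being loaded/stored
  const Value *Ptr = getLoadStorePointerOperand(I);  // address operand
  Align Alignment = getLoadStoreAlignment(I);        // declared alignment
  unsigned AddrSpace = getLoadStoreAddressSpace(I);  // pointer address space
  (void)AccessTy; (void)Ptr; (void)Alignment; (void)AddrSpace;
}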
Value * addDiffRuntimeChecks(Instruction *Loc, ArrayRef< PointerDiffInfo > Checks, SCEVExpander &Expander, function_ref< Value *(IRBuilderBase &, unsigned)> GetVF, unsigned IC)
Definition: LoopUtils.cpp:2012
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer lists are equal or the list is empty.
Definition: STLExtras.h:2087
InstructionCost Cost
@ DataAndControlFlowWithoutRuntimeCheck
Use predicate to control both data and control flow, but modify the trip count so that a runtime over...
@ None
Don't use tail folding.
@ DataWithEVL
Use predicated EVL instructions for tail-folding.
@ DataWithoutLaneMask
Same as Data, but avoids using the get.active.lane.mask intrinsic to calculate the mask and instead i...
unsigned getReciprocalPredBlockProb()
A helper function that returns the reciprocal of the block probability of predicated blocks.
Definition: VPlan.h:92
bool hasBranchWeightMD(const Instruction &I)
Checks if an instructions has Branch Weight Metadata.
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition: Hashing.h:590
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition: bit.h:327
Type * toVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
bool verifyVPlanIsValid(const VPlan &Plan)
Verify invariants for general VPlans.
MapVector< Instruction *, uint64_t > computeMinimumValueSizes(ArrayRef< BasicBlock * > Blocks, DemandedBits &DB, const TargetTransformInfo *TTI=nullptr)
Compute a map of integer instructions to their minimum legal type size.
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition: Hashing.h:468
cl::opt< bool > EnableLoopInterleaving
Implement std::hash so that hash_code can be used in STL containers.
Definition: BitVector.h:858
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
A special type used by analysis passes to provide an address that identifies that particular analysis...
Definition: Analysis.h:28
static void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
Definition: CodeMetrics.cpp:71
An information struct used to provide DenseMap with the various necessary components for a given valu...
Definition: DenseMapInfo.h:52
Encapsulate information regarding vectorization of a loop and its epilogue.
EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF, ElementCount EVF, unsigned EUF, VPlan &EpiloguePlan)
A class that represents two vectorization factors (initialized with 0 by default).
static FixedScalableVFPair getNone()
This holds details about a histogram operation – a load -> update -> store sequence where each lane i...
A struct that represents some properties of the register usage of a loop.
SmallMapVector< unsigned, unsigned, 4 > MaxLocalUsers
Holds the maximum number of concurrent live intervals in the loop.
SmallMapVector< unsigned, unsigned, 4 > LoopInvariantRegs
Holds the number of loop invariant values that are used in the loop.
TargetLibraryInfo * TLI
LoopVectorizeResult runImpl(Function &F)
ProfileSummaryInfo * PSI
LoopAccessInfoManager * LAIs
void printPipeline(raw_ostream &OS, function_ref< StringRef(StringRef)> MapClassName2PassName)
LoopVectorizePass(LoopVectorizeOptions Opts={})
BlockFrequencyInfo * BFI
ScalarEvolution * SE
AssumptionCache * AC
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
OptimizationRemarkEmitter * ORE
Storage for information about made changes.
A chain of instructions that form a partial reduction.
Instruction * Reduction
The top-level binary operation that forms the reduction to a scalar after the loop body.
Instruction * ExtendA
The extension of each of the inner binary operation's operands.
A CRTP mix-in to automatically provide informational APIs needed for passes.
Definition: PassManager.h:69
A marker analysis to determine if extra passes should be run after loop vectorization.
A MapVector that performs no allocations if smaller than a certain size.
Definition: MapVector.h:254
Flags describing the kind of vector reduction.
Parameters that control the generic loop unrolling transformation.
bool UnrollVectorizedLoop
Don't disable runtime unroll for the loops which were vectorized.
Holds the VFShape for a specific scalar to vector function mapping.
std::optional< unsigned > getParamIndexForOptionalMask() const
Instruction Set Architecture.
Encapsulates information needed to describe a parameter.
A range of powers-of-2 vectorization factors with fixed start and adjustable end.
Definition: VPlan.h:97
ElementCount End
Definition: VPlan.h:102
Struct to hold various analysis needed for cost computations.
Definition: VPlan.h:682
LoopVectorizationCostModel & CM
Definition: VPlan.h:687
bool skipCostComputation(Instruction *UI, bool IsVector) const
Return true if the cost for UI shouldn't be computed, e.g.
InstructionCost getLegacyCost(Instruction *UI, ElementCount VF) const
Return the cost for UI with VF using the legacy cost model as fallback until computing the cost of al...
SmallPtrSet< Instruction *, 8 > SkipCostComputation
Definition: VPlan.h:688
A recipe for handling first-order recurrence phis.
Definition: VPlan.h:2353
BasicBlock * PrevBB
The previous IR BasicBlock created or used.
Definition: VPlan.h:344
SmallDenseMap< VPBasicBlock *, BasicBlock * > VPBB2IRBB
A mapping of each VPBasicBlock to the corresponding BasicBlock.
Definition: VPlan.h:352
VPTransformState holds information passed down when "executing" a VPlan, needed for generating the ou...
Definition: VPlan.h:236
DenseMap< const SCEV *, Value * > ExpandedSCEVs
Map SCEVs to their expanded values.
Definition: VPlan.h:389
VPTypeAnalysis TypeAnalysis
VPlan-based type analysis.
Definition: VPlan.h:392
void packScalarIntoVectorValue(VPValue *Def, const VPLane &Lane)
Construct the vector value of a scalarized value V one lane at a time.
Definition: VPlan.cpp:394
Value * get(VPValue *Def, bool IsScalar=false)
Get the generated vector Value for a given VPValue Def if IsScalar is false, otherwise return the gen...
Definition: VPlan.cpp:249
struct llvm::VPTransformState::CFGState CFG
LoopVersioning * LVer
LoopVersioning.
Definition: VPlan.h:385
void addNewMetadata(Instruction *To, const Instruction *Orig)
Add additional metadata to To that was not present on Orig.
Definition: VPlan.cpp:353
std::optional< VPLane > Lane
Hold the index to generate specific scalar instructions.
Definition: VPlan.h:250
IRBuilderBase & Builder
Hold a reference to the IRBuilder used to generate output IR code.
Definition: VPlan.h:369
VPlan * Plan
Pointer to the VPlan code is generated for.
Definition: VPlan.h:375
InnerLoopVectorizer * ILV
Hold a pointer to InnerLoopVectorizer to reuse its IR generation methods.
Definition: VPlan.h:372
ElementCount VF
The chosen Vectorization Factor of the loop being vectorized.
Definition: VPlan.h:245
void setDebugLocFrom(DebugLoc DL)
Set the debug location in the builder using the debug location DL.
Definition: VPlan.cpp:372
void set(VPValue *Def, Value *V, bool IsScalar=false)
Set the generated vector Value for a given VPValue, if IsScalar is false.
Definition: VPlan.h:279
A recipe for widening load operations, using the address to load from and an optional mask.
Definition: VPlan.h:3030
A recipe for widening select instructions.
Definition: VPlan.h:1824
A recipe for widening store operations, using the stored value, the address to store to and an option...
Definition: VPlan.h:3108
static void handleUncountableEarlyExit(VPlan &Plan, ScalarEvolution &SE, Loop *OrigLoop, BasicBlock *UncountableExitingBlock, VPRecipeBuilder &RecipeBuilder)
Update Plan to account for the uncountable early exit block in UncountableExitingBlock by.
static void optimizeInductionExitUsers(VPlan &Plan, DenseMap< VPValue *, VPValue * > &EndValues)
If there's a single exit block, optimize its phi recipes that use exiting IV values by feeding them p...
static void convertToConcreteRecipes(VPlan &Plan)
Lower abstract recipes to concrete ones that can be codegen'd.
static void unrollByUF(VPlan &Plan, unsigned UF, LLVMContext &Ctx)
Explicitly unroll Plan by UF.
static void dropPoisonGeneratingRecipes(VPlan &Plan, function_ref< bool(BasicBlock *)> BlockNeedsPredication)
Drop poison flags from recipes that may generate a poison value that is used after vectorization,...
static void createInterleaveGroups(VPlan &Plan, const SmallPtrSetImpl< const InterleaveGroup< Instruction > * > &InterleaveGroups, VPRecipeBuilder &RecipeBuilder, bool ScalarEpilogueAllowed)
static void removeDeadRecipes(VPlan &Plan)
Remove dead recipes from Plan.
static void clearReductionWrapFlags(VPlan &Plan)
Clear NSW/NUW flags from reduction instructions if necessary.
static bool tryAddExplicitVectorLength(VPlan &Plan, const std::optional< unsigned > &MaxEVLSafeElements)
Add a VPEVLBasedIVPHIRecipe and related recipes to Plan and replace all uses except the canonical IV...
static void VPInstructionsToVPRecipes(VPlanPtr &Plan, function_ref< const InductionDescriptor *(PHINode *)> GetIntOrFpInductionDescriptor, ScalarEvolution &SE, const TargetLibraryInfo &TLI)
Replaces the VPInstructions in Plan with corresponding widen recipes.
static void addActiveLaneMask(VPlan &Plan, bool UseActiveLaneMaskForControlFlow, bool DataAndControlFlowWithoutRuntimeCheck)
Replace (ICMP_ULE, wide canonical IV, backedge-taken-count) checks with an (active-lane-mask recipe,...
static void optimize(VPlan &Plan)
Apply VPlan-to-VPlan optimizations to Plan, including induction recipe optimizations,...
static void truncateToMinimalBitwidths(VPlan &Plan, const MapVector< Instruction *, uint64_t > &MinBWs)
Insert truncates and extends for any truncated recipe.
static bool adjustFixedOrderRecurrences(VPlan &Plan, VPBuilder &Builder)
Try to have all users of fixed-order recurrences appear after the recipe defining their previous valu...
static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Optimize Plan based on BestVF and BestUF.
TODO: The following VectorizationFactor was pulled out of the LoopVectorizationCostModel class.
InstructionCost Cost
Cost of the loop with that width.
ElementCount MinProfitableTripCount
The minimum trip count required to make vectorization profitable, e.g.
ElementCount Width
Vector width with best cost.
InstructionCost ScalarCost
Cost of the scalar loop.
static VectorizationFactor Disabled()
Width 1 means no vectorization, cost 0 means uncomputed cost.