LoopVectorize.cpp
1//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10// and generates target-independent LLVM-IR.
11// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12// of instructions in order to estimate the profitability of vectorization.
13//
14// The loop vectorizer combines consecutive loop iterations into a single
15// 'wide' iteration. After this transformation the index is incremented
16// by the SIMD vector width, and not by one.
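//
// For example (illustrative only), with a vector width of 4 a scalar loop such as
//   for (i = 0; i < n; i += 1) a[i] = b[i] + 42;
// is conceptually rewritten so each 'wide' iteration handles four elements:
//   for (i = 0; i + 3 < n; i += 4) a[i:i+3] = b[i:i+3] + <42, 42, 42, 42>;
// with any remaining iterations executed by a scalar (epilogue) loop.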
17//
18// This pass has four parts:
19// 1. The main loop pass that drives the different parts.
20// 2. LoopVectorizationLegality - A unit that checks for the legality
21// of the vectorization.
22// 3. InnerLoopVectorizer - A unit that performs the actual
23// widening of instructions.
24// 4. LoopVectorizationCostModel - A unit that checks for the profitability
25// of vectorization. It decides on the optimal vector width, which
26// can be one, if vectorization is not profitable.
27//
28// There is a development effort going on to migrate the loop vectorizer to the
29// VPlan infrastructure and to introduce outer loop vectorization support (see
30// docs/VectorizationPlan.rst and
31// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32// purpose, we temporarily introduced the VPlan-native vectorization path: an
33// alternative vectorization path that is natively implemented on top of the
34// VPlan infrastructure. See EnableVPlanNativePath for enabling.
35//
36//===----------------------------------------------------------------------===//
37//
38// The reduction-variable vectorization is based on the paper:
39// D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40//
41// Variable uniformity checks are inspired by:
42// Karrenberg, R. and Hack, S. Whole Function Vectorization.
43//
44// The interleaved access vectorization is based on the paper:
45// Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
46// Data for SIMD
47//
48// Other ideas/concepts are from:
49// A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50//
51// S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
52// Vectorizing Compilers.
53//
54//===----------------------------------------------------------------------===//
55
58#include "VPRecipeBuilder.h"
59#include "VPlan.h"
60#include "VPlanAnalysis.h"
61#include "VPlanHCFGBuilder.h"
62#include "VPlanPatternMatch.h"
63#include "VPlanTransforms.h"
64#include "VPlanUtils.h"
65#include "VPlanVerifier.h"
66#include "llvm/ADT/APInt.h"
67#include "llvm/ADT/ArrayRef.h"
68#include "llvm/ADT/DenseMap.h"
70#include "llvm/ADT/Hashing.h"
71#include "llvm/ADT/MapVector.h"
72#include "llvm/ADT/STLExtras.h"
75#include "llvm/ADT/Statistic.h"
76#include "llvm/ADT/StringRef.h"
77#include "llvm/ADT/Twine.h"
78#include "llvm/ADT/TypeSwitch.h"
83#include "llvm/Analysis/CFG.h"
99#include "llvm/IR/Attributes.h"
100#include "llvm/IR/BasicBlock.h"
101#include "llvm/IR/CFG.h"
102#include "llvm/IR/Constant.h"
103#include "llvm/IR/Constants.h"
104#include "llvm/IR/DataLayout.h"
105#include "llvm/IR/DebugInfo.h"
106#include "llvm/IR/DebugLoc.h"
107#include "llvm/IR/DerivedTypes.h"
109#include "llvm/IR/Dominators.h"
110#include "llvm/IR/Function.h"
111#include "llvm/IR/IRBuilder.h"
112#include "llvm/IR/InstrTypes.h"
113#include "llvm/IR/Instruction.h"
114#include "llvm/IR/Instructions.h"
116#include "llvm/IR/Intrinsics.h"
117#include "llvm/IR/MDBuilder.h"
118#include "llvm/IR/Metadata.h"
119#include "llvm/IR/Module.h"
120#include "llvm/IR/Operator.h"
121#include "llvm/IR/PatternMatch.h"
123#include "llvm/IR/Type.h"
124#include "llvm/IR/Use.h"
125#include "llvm/IR/User.h"
126#include "llvm/IR/Value.h"
127#include "llvm/IR/Verifier.h"
128#include "llvm/Support/Casting.h"
130#include "llvm/Support/Debug.h"
145#include <algorithm>
146#include <cassert>
147#include <cstdint>
148#include <functional>
149#include <iterator>
150#include <limits>
151#include <memory>
152#include <string>
153#include <tuple>
154#include <utility>
155
156using namespace llvm;
157
158#define LV_NAME "loop-vectorize"
159#define DEBUG_TYPE LV_NAME
160
161#ifndef NDEBUG
162const char VerboseDebug[] = DEBUG_TYPE "-verbose";
163#endif
164
165/// @{
166/// Metadata attribute names
167const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
168const char LLVMLoopVectorizeFollowupVectorized[] =
169 "llvm.loop.vectorize.followup_vectorized";
170const char LLVMLoopVectorizeFollowupEpilogue[] =
171 "llvm.loop.vectorize.followup_epilogue";
172/// @}
173
174STATISTIC(LoopsVectorized, "Number of loops vectorized");
175STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
176STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
177
179 "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
180 cl::desc("Enable vectorization of epilogue loops."));
181
183 "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
184 cl::desc("When epilogue vectorization is enabled, and a value greater than "
185 "1 is specified, forces the given VF for all applicable epilogue "
186 "loops."));
187
189 "epilogue-vectorization-minimum-VF", cl::Hidden,
190 cl::desc("Only loops with vectorization factor equal to or larger than "
191 "the specified value are considered for epilogue vectorization."));
192
193/// Loops with a known constant trip count below this number are vectorized only
194/// if no scalar iteration overheads are incurred.
196 "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
197 cl::desc("Loops with a constant trip count that is smaller than this "
198 "value are vectorized only if no scalar iteration overheads "
199 "are incurred."));
200
202 "vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
203 cl::desc("The maximum allowed number of runtime memory checks"));
204
205// Option prefer-predicate-over-epilogue indicates that an epilogue is undesired,
206// that predication is preferred, and this lists all options. I.e., the
207// vectorizer will try to fold the tail-loop (epilogue) into the vector body
208// and predicate the instructions accordingly. If tail-folding fails, there are
209// different fallback strategies depending on these values:
210namespace PreferPredicateTy {
211 enum Option {
212 ScalarEpilogue = 0,
213 PredicateElseScalarEpilogue,
214 PredicateOrDontVectorize
215 };
216} // namespace PreferPredicateTy
217
219 "prefer-predicate-over-epilogue",
222 cl::desc("Tail-folding and predication preferences over creating a scalar "
223 "epilogue loop."),
225 "scalar-epilogue",
226 "Don't tail-predicate loops, create scalar epilogue"),
228 "predicate-else-scalar-epilogue",
229 "prefer tail-folding, create scalar epilogue if tail "
230 "folding fails."),
232 "predicate-dont-vectorize",
233 "prefers tail-folding, don't attempt vectorization if "
234 "tail-folding fails.")));
235
237 "force-tail-folding-style", cl::desc("Force the tail folding style"),
238 cl::init(TailFoldingStyle::None),
240 clEnumValN(TailFoldingStyle::None, "none", "Disable tail folding"),
242 TailFoldingStyle::Data, "data",
243 "Create lane mask for data only, using active.lane.mask intrinsic"),
244 clEnumValN(TailFoldingStyle::DataWithoutLaneMask,
245 "data-without-lane-mask",
246 "Create lane mask with compare/stepvector"),
247 clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control",
248 "Create lane mask using active.lane.mask intrinsic, and use "
249 "it for both data and control flow"),
250 clEnumValN(TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck,
251 "data-and-control-without-rt-check",
252 "Similar to data-and-control, but remove the runtime check"),
253 clEnumValN(TailFoldingStyle::DataWithEVL, "data-with-evl",
254 "Use predicated EVL instructions for tail folding. If EVL "
255 "is unsupported, fallback to data-without-lane-mask.")));
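// As a rough illustration, with the "data" style the tail is folded into the
// vector body by masking each memory access with an active-lane mask instead
// of branching to a scalar remainder, e.g.:
//   %mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 %i, i64 %n)
//   %vals = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %p, i32 4,
//                                                     <4 x i1> %mask, <4 x i32> poison)
// The "data-and-control" styles additionally use the mask to decide whether
// another vector iteration is executed.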
256
258 "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
259 cl::desc("Maximize bandwidth when selecting vectorization factor which "
260 "will be determined by the smallest type in loop."));
261
263 "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
264 cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
265
266/// An interleave-group may need masking if it resides in a block that needs
267/// predication, or in order to mask away gaps.
269 "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
270 cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
271
273 "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
274 cl::desc("A flag that overrides the target's number of scalar registers."));
275
277 "force-target-num-vector-regs", cl::init(0), cl::Hidden,
278 cl::desc("A flag that overrides the target's number of vector registers."));
279
281 "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
282 cl::desc("A flag that overrides the target's max interleave factor for "
283 "scalar loops."));
284
286 "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
287 cl::desc("A flag that overrides the target's max interleave factor for "
288 "vectorized loops."));
289
291 "force-target-instruction-cost", cl::init(0), cl::Hidden,
292 cl::desc("A flag that overrides the target's expected cost for "
293 "an instruction to a single constant value. Mostly "
294 "useful for getting consistent testing."));
295
297 "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
298 cl::desc(
299 "Pretend that scalable vectors are supported, even if the target does "
300 "not support them. This flag should only be used for testing."));
301
303 "small-loop-cost", cl::init(20), cl::Hidden,
304 cl::desc(
305 "The cost of a loop that is considered 'small' by the interleaver."));
306
308 "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
309 cl::desc("Enable the use of the block frequency analysis to access PGO "
310 "heuristics minimizing code growth in cold regions and being more "
311 "aggressive in hot regions."));
312
313// Runtime interleave loops for load/store throughput.
315 "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
316 cl::desc(
317 "Enable runtime interleaving until load/store ports are saturated"));
318
319/// The number of stores in a loop that are allowed to need predication.
321 "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
322 cl::desc("Max number of stores to be predicated behind an if."));
323
325 "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
326 cl::desc("Count the induction variable only once when interleaving"));
327
329 "enable-cond-stores-vec", cl::init(true), cl::Hidden,
330 cl::desc("Enable if predication of stores during vectorization."));
331
333 "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
334 cl::desc("The maximum interleave count to use when interleaving a scalar "
335 "reduction in a nested loop."));
336
337static cl::opt<bool>
338 PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
340 cl::desc("Prefer in-loop vector reductions, "
341 "overriding the target's preference."));
342
344 "force-ordered-reductions", cl::init(false), cl::Hidden,
345 cl::desc("Enable the vectorization of loops with in-order (strict) "
346 "FP reductions"));
347
349 "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
350 cl::desc(
351 "Prefer predicating a reduction operation over an after loop select."));
352
353namespace llvm {
355 "enable-vplan-native-path", cl::Hidden,
356 cl::desc("Enable VPlan-native vectorization path with "
357 "support for outer loop vectorization."));
358} // namespace llvm
359
360// This flag enables the stress testing of the VPlan H-CFG construction in the
361// VPlan-native vectorization path. It must be used in conjunction with
362// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
363// verification of the H-CFGs built.
365 "vplan-build-stress-test", cl::init(false), cl::Hidden,
366 cl::desc(
367 "Build VPlan for every supported loop nest in the function and bail "
368 "out right after the build (stress test the VPlan H-CFG construction "
369 "in the VPlan-native vectorization path)."));
370
372 "interleave-loops", cl::init(true), cl::Hidden,
373 cl::desc("Enable loop interleaving in Loop vectorization passes"));
375 "vectorize-loops", cl::init(true), cl::Hidden,
376 cl::desc("Run the Loop vectorization passes"));
377
379 "force-widen-divrem-via-safe-divisor", cl::Hidden,
380 cl::desc(
381 "Override cost based safe divisor widening for div/rem instructions"));
382
384 "vectorizer-maximize-bandwidth-for-vector-calls", cl::init(true),
386 cl::desc("Try wider VFs if they enable the use of vector variants"));
387
389 "enable-early-exit-vectorization", cl::init(false), cl::Hidden,
390 cl::desc(
391 "Enable vectorization of early exit loops with uncountable exits."));
392
393// Likelihood of bypassing the vectorized loop because assumptions about SCEV
394// variables not overflowing do not hold. See `emitSCEVChecks`.
395static constexpr uint32_t SCEVCheckBypassWeights[] = {1, 127};
396// Likelihood of bypassing the vectorized loop because pointers overlap. See
397// `emitMemRuntimeChecks`.
398static constexpr uint32_t MemCheckBypassWeights[] = {1, 127};
399// Likelihood of bypassing the vectorized loop because there are zero trips left
400// after prolog. See `emitIterationCountCheck`.
401static constexpr uint32_t MinItersBypassWeights[] = {1, 127};
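// With weights {1, 127}, each of these bypass branches is annotated as taken
// roughly once in 128 times (under 1%), so the vectorized path is treated as
// the overwhelmingly likely case.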
402
403/// A helper function that returns true if the given type is irregular. The
404/// type is irregular if its allocated size doesn't equal the store size of an
405/// element of the corresponding vector type.
406static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
407 // Determine if an array of N elements of type Ty is "bitcast compatible"
408 // with a <N x Ty> vector.
409 // This is only true if there is no padding between the array elements.
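 // For example, under a typical x86-64 data layout, i1 (1 bit of data in an
 // 8-bit allocation unit) and x86_fp80 (80 bits of data in a 128-bit unit)
 // are irregular, while i32 and float are not.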
410 return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
411}
412
413/// Returns "best known" trip count for the specified loop \p L as defined by
414/// the following procedure:
415/// 1) Returns exact trip count if it is known.
416/// 2) Returns expected trip count according to profile data if any.
417/// 3) Returns upper bound estimate if known, and if \p CanUseConstantMax.
418/// 4) Returns std::nullopt if all of the above failed.
419static std::optional<unsigned>
420getSmallBestKnownTC(PredicatedScalarEvolution &PSE, Loop *L,
421 bool CanUseConstantMax = true) {
422 // Check if exact trip count is known.
423 if (unsigned ExpectedTC = PSE.getSE()->getSmallConstantTripCount(L))
424 return ExpectedTC;
425
426 // Check if there is an expected trip count available from profile data.
428 if (auto EstimatedTC = getLoopEstimatedTripCount(L))
429 return *EstimatedTC;
430
431 if (!CanUseConstantMax)
432 return std::nullopt;
433
434 // Check if upper bound estimate is known.
435 if (unsigned ExpectedTC = PSE.getSmallConstantMaxTripCount())
436 return ExpectedTC;
437
438 return std::nullopt;
439}
440
441namespace {
442// Forward declare GeneratedRTChecks.
443class GeneratedRTChecks;
444
445using SCEV2ValueTy = DenseMap<const SCEV *, Value *>;
446} // namespace
447
448namespace llvm {
449
451
452/// InnerLoopVectorizer vectorizes loops which contain only one basic
453/// block to a specified vectorization factor (VF).
454/// This class performs the widening of scalars into vectors, or multiple
455/// scalars. This class also implements the following features:
456/// * It inserts an epilogue loop for handling loops that don't have iteration
457/// counts that are known to be a multiple of the vectorization factor.
458/// * It handles the code generation for reduction variables.
459/// * Scalarization (implementation using scalars) of un-vectorizable
460/// instructions.
461/// InnerLoopVectorizer does not perform any vectorization-legality
462/// checks, and relies on the caller to check for the different legality
463/// aspects. The InnerLoopVectorizer relies on the
464/// LoopVectorizationLegality class to provide information about the induction
465/// and reduction variables that were found for a given vectorization factor.
467public:
470 const TargetLibraryInfo *TLI,
474 unsigned UnrollFactor, LoopVectorizationLegality *LVL,
476 ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks,
477 VPlan &Plan)
478 : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
479 AC(AC), ORE(ORE), VF(VecWidth),
481 Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
483 VectorPHVPB(Plan.getEntry()->getSingleSuccessor()) {
484 // Query this against the original loop and save it here because the profile
485 // of the original loop header may change as the transformation happens.
488 }
489
490 virtual ~InnerLoopVectorizer() = default;
491
492 /// Create a new empty loop that will contain vectorized instructions later
493 /// on, while the old loop will be used as the scalar remainder. Control flow
494 /// is generated around the vectorized (and scalar epilogue) loops consisting
495 /// of various checks and bypasses. Return the pre-header block of the new
496/// loop. In the case of epilogue vectorization, this function is overridden to
497 /// handle the more complex control flow around the loops. \p ExpandedSCEVs is
498 /// used to look up SCEV expansions for expressions needed during skeleton
499 /// creation.
500 virtual BasicBlock *
501 createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs);
502
503 /// Fix the vectorized code, taking care of header phi's, and more.
505
506 /// Return true if any runtime check is added.
508
509 /// A helper function to scalarize a single Instruction in the innermost loop.
510 /// Generates a sequence of scalar instances for each lane between \p MinLane
511 /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
512 /// inclusive. Uses the VPValue operands from \p RepRecipe instead of \p
513 /// Instr's operands.
514 void scalarizeInstruction(const Instruction *Instr,
515 VPReplicateRecipe *RepRecipe, const VPLane &Lane,
516 VPTransformState &State);
517
518 /// Fix the non-induction PHIs in \p Plan.
520
521 /// Returns the original loop trip count.
522 Value *getTripCount() const { return TripCount; }
523
524 /// Used to set the trip count after ILV's construction and after the
525 /// preheader block has been executed. Note that this always holds the trip
526 /// count of the original loop for both main loop and epilogue vectorization.
527 void setTripCount(Value *TC) { TripCount = TC; }
528
529 /// Retrieve the additional bypass value associated with an original
530 /// induction header phi.
532 return Induction2AdditionalBypassValue.at(OrigPhi);
533 }
534
535 /// Return the additional bypass block which targets the scalar loop by
536 /// skipping the epilogue loop after completing the main loop.
539 "Trying to access AdditionalBypassBlock but it has not been set");
541 }
542
543protected:
545
546 /// Set up the values of the IVs correctly when exiting the vector loop.
547 virtual void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
548 Value *VectorTripCount, BasicBlock *MiddleBlock,
549 VPTransformState &State);
550
551 /// Iteratively sink the scalarized operands of a predicated instruction into
552 /// the block that was created for it.
553 void sinkScalarOperands(Instruction *PredInst);
554
555 /// Returns (and creates if needed) the trip count of the widened loop.
557
558 /// Emit a bypass check to see if the vector trip count is zero, including if
559 /// it overflows.
561
562 /// Emit a bypass check to see if all of the SCEV assumptions we've
563 /// had to make are correct. Returns the block containing the checks or
564 /// nullptr if no checks have been added.
566
567 /// Emit bypass checks to check any memory assumptions we may have made.
568 /// Returns the block containing the checks or nullptr if no checks have been
569 /// added.
571
572 /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
573 /// vector loop preheader, middle block and scalar preheader.
575
576 /// Create and record the values for induction variables to resume coming from
577 /// the additional bypass block.
578 void createInductionAdditionalBypassValues(const SCEV2ValueTy &ExpandedSCEVs,
579 Value *MainVectorTripCount);
580
581 /// Allow subclasses to override and print debug traces before/after vplan
582 /// execution, when trace information is requested.
583 virtual void printDebugTracesAtStart() {}
584 virtual void printDebugTracesAtEnd() {}
585
586 /// Introduces a new VPIRBasicBlock for \p CheckIRBB to Plan between the
587 /// vector preheader and its predecessor, also connecting the new block to the
588 /// scalar preheader.
589 void introduceCheckBlockInVPlan(BasicBlock *CheckIRBB);
590
591 /// The original loop.
593
594 /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
595 /// dynamic knowledge to simplify SCEV expressions and converts them to a
596 /// more usable form.
598
599 /// Loop Info.
601
602 /// Dominator Tree.
604
605 /// Target Library Info.
607
608 /// Target Transform Info.
610
611 /// Assumption Cache.
613
614 /// Interface to emit optimization remarks.
616
617 /// The vectorization SIMD factor to use. Each vector will have this many
618 /// vector elements.
620
622
623 /// The vectorization unroll factor to use. Each scalar is vectorized to this
624 /// many different vector instructions.
625 unsigned UF;
626
627 /// The builder that we use
629
630 // --- Vectorization state ---
631
632 /// The vector-loop preheader.
634
635 /// The scalar-loop preheader.
637
638 /// Middle Block between the vector and the scalar.
640
641 /// A list of all bypass blocks. The first block is the entry of the loop.
643
644 /// Store instructions that were predicated.
646
647 /// Trip count of the original loop.
648 Value *TripCount = nullptr;
649
650 /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
652
653 /// The legality analysis.
655
656 /// The profitability analysis.
658
659 // Record whether runtime checks are added.
660 bool AddedSafetyChecks = false;
661
662 /// BFI and PSI are used to check for profile guided size optimizations.
665
666 // Whether this loop should be optimized for size based on profile guided size
667 // optimizations.
669
670 /// Structure to hold information about generated runtime checks, responsible
671 /// for cleaning the checks, if vectorization turns out unprofitable.
672 GeneratedRTChecks &RTChecks;
673
674 /// Mapping of induction phis to their additional bypass values. They
675 /// need to be added as operands to phi nodes in the scalar loop preheader
676 /// after the epilogue skeleton has been created.
678
679 /// The additional bypass block which conditionally skips over the epilogue
680 /// loop after executing the main loop. Needed to resume inductions and
681 /// reductions during epilogue vectorization.
683
685
686 /// The vector preheader block of \p Plan, used as target for check blocks
687 /// introduced during skeleton creation.
689};
690
691/// Encapsulate information regarding vectorization of a loop and its epilogue.
692/// This information is meant to be updated and used across two stages of
693/// epilogue vectorization.
696 unsigned MainLoopUF = 0;
698 unsigned EpilogueUF = 0;
703 Value *TripCount = nullptr;
706
708 ElementCount EVF, unsigned EUF,
710 : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF),
712 assert(EUF == 1 &&
713 "A high UF for the epilogue loop is likely not beneficial.");
714 }
715};
716
717/// An extension of the inner loop vectorizer that creates a skeleton for a
718/// vectorized loop that has its epilogue (residual) also vectorized.
719/// The idea is to run the vplan on a given loop twice, first to set up the
720/// skeleton and vectorize the main loop, and second to complete the skeleton
721/// from the first step and vectorize the epilogue. This is achieved by
722/// deriving two concrete strategy classes from this base class and invoking
723/// them in succession from the loop vectorizer planner.
725public:
733 GeneratedRTChecks &Checks, VPlan &Plan)
735 EPI.MainLoopVF, EPI.MainLoopVF, EPI.MainLoopUF, LVL,
736 CM, BFI, PSI, Checks, Plan),
737 EPI(EPI) {}
738
739 // Override this function to handle the more complex control flow around the
740 // three loops.
741 BasicBlock *
742 createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final {
743 return createEpilogueVectorizedLoopSkeleton(ExpandedSCEVs);
744 }
745
746 /// The interface for creating a vectorized skeleton using one of two
747 /// different strategies, each corresponding to one execution of the vplan
748 /// as described above.
749 virtual BasicBlock *
750 createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) = 0;
751
752 /// Holds and updates state information required to vectorize the main loop
753 /// and its epilogue in two separate passes. This setup helps us avoid
754 /// regenerating and recomputing runtime safety checks. It also helps us to
755 /// shorten the iteration-count-check path length for the cases where the
756 /// iteration count of the loop is so small that the main vector loop is
757 /// completely skipped.
759};
760
761/// A specialized derived class of inner loop vectorizer that performs
762/// vectorization of *main* loops in the process of vectorizing loops and their
763/// epilogues.
765public:
773 GeneratedRTChecks &Check, VPlan &Plan)
775 EPI, LVL, CM, BFI, PSI, Check, Plan) {}
776 /// Implements the interface for creating a vectorized skeleton using the
777 /// *main loop* strategy (i.e. the first pass of vplan execution).
778 BasicBlock *
779 createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final;
780
781protected:
782 /// Emits an iteration count bypass check once for the main loop (when \p
783 /// ForEpilogue is false) and once for the epilogue loop (when \p
784 /// ForEpilogue is true).
785 BasicBlock *emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue);
786 void printDebugTracesAtStart() override;
787 void printDebugTracesAtEnd() override;
788
790 Value *VectorTripCount, BasicBlock *MiddleBlock,
791 VPTransformState &State) override {};
792};
793
794// A specialized derived class of inner loop vectorizer that performs
795// vectorization of *epilogue* loops in the process of vectorizing loops and
796// their epilogues.
798public:
806 GeneratedRTChecks &Checks, VPlan &Plan)
808 EPI, LVL, CM, BFI, PSI, Checks, Plan) {
810 }
811 /// Implements the interface for creating a vectorized skeleton using the
812 /// *epilogue loop* strategy (i.e. the second pass of vplan execution).
813 BasicBlock *
814 createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final;
815
816protected:
817 /// Emits an iteration count bypass check after the main vector loop has
818 /// finished to see if there are any iterations left to execute by either
819 /// the vector epilogue or the scalar epilogue.
821 BasicBlock *Bypass,
822 BasicBlock *Insert);
823 void printDebugTracesAtStart() override;
824 void printDebugTracesAtEnd() override;
825};
826} // end namespace llvm
827
828/// Look for a meaningful debug location on the instruction or its operands.
830 if (!I)
831 return DebugLoc();
832
833 DebugLoc Empty;
834 if (I->getDebugLoc() != Empty)
835 return I->getDebugLoc();
836
837 for (Use &Op : I->operands()) {
838 if (Instruction *OpInst = dyn_cast<Instruction>(Op))
839 if (OpInst->getDebugLoc() != Empty)
840 return OpInst->getDebugLoc();
841 }
842
843 return I->getDebugLoc();
844}
845
846/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
847/// is passed, the message relates to that particular instruction.
848#ifndef NDEBUG
849static void debugVectorizationMessage(const StringRef Prefix,
850 const StringRef DebugMsg,
851 Instruction *I) {
852 dbgs() << "LV: " << Prefix << DebugMsg;
853 if (I != nullptr)
854 dbgs() << " " << *I;
855 else
856 dbgs() << '.';
857 dbgs() << '\n';
858}
859#endif
860
861/// Create an analysis remark that explains why vectorization failed
862///
863/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
864/// RemarkName is the identifier for the remark. If \p I is passed it is an
865/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
866/// the location of the remark. If \p DL is passed, use it as debug location for
867/// the remark. \return the remark object that can be streamed to.
869createLVAnalysis(const char *PassName, StringRef RemarkName, Loop *TheLoop,
870 Instruction *I, DebugLoc DL = {}) {
871 Value *CodeRegion = I ? I->getParent() : TheLoop->getHeader();
872 // If debug location is attached to the instruction, use it. Otherwise if DL
873 // was not provided, use the loop's.
874 if (I && I->getDebugLoc())
875 DL = I->getDebugLoc();
876 else if (!DL)
877 DL = TheLoop->getStartLoc();
878
879 return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
880}
881
882namespace llvm {
883
884/// Return a value for Step multiplied by VF.
886 int64_t Step) {
887 assert(Ty->isIntegerTy() && "Expected an integer step");
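  // E.g. for a fixed VF of 4 and Step 2 this folds to the constant 8, while
  // for a scalable VF of <vscale x 4> it emits code computing 8 * vscale
  // (via the llvm.vscale intrinsic).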
888 return B.CreateElementCount(Ty, VF.multiplyCoefficientBy(Step));
889}
890
891/// Return the runtime value for VF.
893 return B.CreateElementCount(Ty, VF);
894}
895
897 const StringRef OREMsg, const StringRef ORETag,
898 OptimizationRemarkEmitter *ORE, Loop *TheLoop,
899 Instruction *I) {
900 LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
901 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
902 ORE->emit(
903 createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
904 << "loop not vectorized: " << OREMsg);
905}
906
907/// Reports an informative message: print \p Msg for debugging purposes as well
908/// as an optimization remark. Uses either \p I as location of the remark, or
909/// otherwise \p TheLoop. If \p DL is passed, use it as debug location for the
910/// remark.
911static void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
913 Loop *TheLoop, Instruction *I = nullptr,
914 DebugLoc DL = {}) {
916 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
917 ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop,
918 I, DL)
919 << Msg);
920}
921
922/// Report successful vectorization of the loop. In case an outer loop is
923/// vectorized, prepend "outer" to the vectorization remark.
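/// A typical remark produced here reads, e.g.,
/// "vectorized loop (vectorization width: 4, interleaved count: 2)".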
925 VectorizationFactor VF, unsigned IC) {
927 "Vectorizing: ", TheLoop->isInnermost() ? "innermost loop" : "outer loop",
928 nullptr));
929 StringRef LoopType = TheLoop->isInnermost() ? "" : "outer ";
930 ORE->emit([&]() {
931 return OptimizationRemark(LV_NAME, "Vectorized", TheLoop->getStartLoc(),
932 TheLoop->getHeader())
933 << "vectorized " << LoopType << "loop (vectorization width: "
934 << ore::NV("VectorizationFactor", VF.Width)
935 << ", interleaved count: " << ore::NV("InterleaveCount", IC) << ")";
936 });
937}
938
939} // end namespace llvm
940
941namespace llvm {
942
943// Loop vectorization cost-model hints how the scalar epilogue loop should be
944// lowered.
946
947 // The default: allowing scalar epilogues.
949
950 // Vectorization with OptForSize: don't allow epilogues.
952
953 // A special case of vectorization with OptForSize: loops with a very small
954 // trip count are considered for vectorization under OptForSize, thereby
955 // making sure the cost of their loop body is dominant, free of runtime
956 // guards and scalar iteration overheads.
958
959 // Loop hint predicate indicating an epilogue is undesired.
961
962 // Directive indicating we must either tail fold or not vectorize
965
966using InstructionVFPair = std::pair<Instruction *, ElementCount>;
967
968/// LoopVectorizationCostModel - estimates the expected speedups due to
969/// vectorization.
970/// In many cases vectorization is not profitable. This can happen because of
971/// a number of reasons. In this class we mainly attempt to predict the
972/// expected speedup/slowdowns due to the supported instruction set. We use the
973/// TargetTransformInfo to query the different backends for the cost of
974/// different operations.
977
978public:
988 : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
989 TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
990 Hints(Hints), InterleaveInfo(IAI) {}
991
992 /// \return An upper bound for the vectorization factors (both fixed and
993 /// scalable). If the factors are 0, vectorization and interleaving should be
994 /// avoided up front.
995 FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);
996
997 /// \return True if runtime checks are required for vectorization, and false
998 /// otherwise.
1000
1001 /// Setup cost-based decisions for user vectorization factor.
1002 /// \return true if the UserVF is a feasible VF to be chosen.
1006 return expectedCost(UserVF).isValid();
1007 }
1008
1009 /// \return The size (in bits) of the smallest and widest types in the code
1010 /// that needs to be vectorized. We ignore values that remain scalar such as
1011 /// 64 bit loop indices.
1012 std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1013
1014 /// \return The desired interleave count.
1015 /// If interleave count has been specified by metadata it will be returned.
1016 /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1017 /// are the selected vectorization factor and the cost of the selected VF.
1018 unsigned selectInterleaveCount(ElementCount VF, InstructionCost LoopCost);
1019
1020 /// Memory access instruction may be vectorized in more than one way.
1021 /// Form of instruction after vectorization depends on cost.
1022 /// This function takes cost-based decisions for Load/Store instructions
1023 /// and collects them in a map. This decisions map is used for building
1024 /// the lists of loop-uniform and loop-scalar instructions.
1025 /// The calculated cost is saved with widening decision in order to
1026 /// avoid redundant calculations.
1028
1029 /// A call may be vectorized in different ways depending on whether we have
1030 /// vectorized variants available and whether the target supports masking.
1031 /// This function analyzes all calls in the function at the supplied VF,
1032 /// makes a decision based on the costs of available options, and stores that
1033 /// decision in a map for use in planning and plan execution.
1035
1036 /// A struct that represents some properties of the register usage
1037 /// of a loop.
1039 /// Holds the number of loop invariant values that are used in the loop.
1040 /// The key is ClassID of target-provided register class.
1042 /// Holds the maximum number of concurrent live intervals in the loop.
1043 /// The key is ClassID of target-provided register class.
1045 };
1046
1047 /// \return Returns information about the register usages of the loop for the
1048 /// given vectorization factors.
1051
1052 /// Collect values we want to ignore in the cost model.
1053 void collectValuesToIgnore();
1054
1055 /// Collect all element types in the loop for which widening is needed.
1057
1058 /// Split reductions into those that happen in the loop, and those that happen
1059 /// outside. In-loop reductions are collected into InLoopReductions.
1061
1062 /// Returns true if we should use strict in-order reductions for the given
1063 /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
1064 /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
1065 /// of FP operations.
1066 bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const {
1067 return !Hints->allowReordering() && RdxDesc.isOrdered();
1068 }
1069
1070 /// \returns The smallest bitwidth each instruction can be represented with.
1071 /// The vector equivalents of these instructions should be truncated to this
1072 /// type.
1074 return MinBWs;
1075 }
1076
1077 /// \returns True if it is more profitable to scalarize instruction \p I for
1078 /// vectorization factor \p VF.
1080 assert(VF.isVector() &&
1081 "Profitable to scalarize relevant only for VF > 1.");
1082 assert(
1083 TheLoop->isInnermost() &&
1084 "cost-model should not be used for outer loops (in VPlan-native path)");
1085
1086 auto Scalars = InstsToScalarize.find(VF);
1087 assert(Scalars != InstsToScalarize.end() &&
1088 "VF not yet analyzed for scalarization profitability");
1089 return Scalars->second.contains(I);
1090 }
1091
1092 /// Returns true if \p I is known to be uniform after vectorization.
1094 assert(
1095 TheLoop->isInnermost() &&
1096 "cost-model should not be used for outer loops (in VPlan-native path)");
1097 // Pseudo probe needs to be duplicated for each unrolled iteration and
1098 // vector lane so that profiled loop trip count can be accurately
1099 // accumulated instead of being under counted.
1100 if (isa<PseudoProbeInst>(I))
1101 return false;
1102
1103 if (VF.isScalar())
1104 return true;
1105
1106 auto UniformsPerVF = Uniforms.find(VF);
1107 assert(UniformsPerVF != Uniforms.end() &&
1108 "VF not yet analyzed for uniformity");
1109 return UniformsPerVF->second.count(I);
1110 }
1111
1112 /// Returns true if \p I is known to be scalar after vectorization.
1114 assert(
1115 TheLoop->isInnermost() &&
1116 "cost-model should not be used for outer loops (in VPlan-native path)");
1117 if (VF.isScalar())
1118 return true;
1119
1120 auto ScalarsPerVF = Scalars.find(VF);
1121 assert(ScalarsPerVF != Scalars.end() &&
1122 "Scalar values are not calculated for VF");
1123 return ScalarsPerVF->second.count(I);
1124 }
1125
1126 /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1127 /// for vectorization factor \p VF.
1129 return VF.isVector() && MinBWs.contains(I) &&
1130 !isProfitableToScalarize(I, VF) &&
1132 }
1133
1134 /// Decision that was taken during cost calculation for memory instruction.
1137 CM_Widen, // For consecutive accesses with stride +1.
1138 CM_Widen_Reverse, // For consecutive accesses with stride -1.
1145
1146 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1147 /// instruction \p I and vector width \p VF.
1150 assert(VF.isVector() && "Expected VF >=2");
1151 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1152 }
1153
1154 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1155 /// interleaving group \p Grp and vector width \p VF.
1159 assert(VF.isVector() && "Expected VF >=2");
1160 /// Broadcast this decision to all instructions inside the group.
1161 /// When interleaving, the cost will only be assigned to one instruction, the
1162 /// insert position. For other cases, add the appropriate fraction of the
1163 /// total cost to each instruction. This ensures accurate costs are used,
1164 /// even if the insert position instruction is not used.
1165 InstructionCost InsertPosCost = Cost;
1166 InstructionCost OtherMemberCost = 0;
1167 if (W != CM_Interleave)
1168 OtherMemberCost = InsertPosCost = Cost / Grp->getNumMembers();
1170 for (unsigned Idx = 0; Idx < Grp->getFactor(); ++Idx) {
1171 if (auto *I = Grp->getMember(Idx)) {
1172 if (Grp->getInsertPos() == I)
1173 WideningDecisions[std::make_pair(I, VF)] =
1174 std::make_pair(W, InsertPosCost);
1175 else
1176 WideningDecisions[std::make_pair(I, VF)] =
1177 std::make_pair(W, OtherMemberCost);
1178 }
1179 }
1180 }
1181
1182 /// Return the cost model decision for the given instruction \p I and vector
1183 /// width \p VF. Return CM_Unknown if this instruction did not pass
1184 /// through the cost modeling.
1186 assert(VF.isVector() && "Expected VF to be a vector VF");
1187 assert(
1188 TheLoop->isInnermost() &&
1189 "cost-model should not be used for outer loops (in VPlan-native path)");
1190
1191 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1192 auto Itr = WideningDecisions.find(InstOnVF);
1193 if (Itr == WideningDecisions.end())
1194 return CM_Unknown;
1195 return Itr->second.first;
1196 }
1197
1198 /// Return the vectorization cost for the given instruction \p I and vector
1199 /// width \p VF.
1201 assert(VF.isVector() && "Expected VF >=2");
1202 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1203 assert(WideningDecisions.contains(InstOnVF) &&
1204 "The cost is not calculated");
1205 return WideningDecisions[InstOnVF].second;
1206 }
1207
1212 std::optional<unsigned> MaskPos;
1214 };
1215
1217 Function *Variant, Intrinsic::ID IID,
1218 std::optional<unsigned> MaskPos,
1220 assert(!VF.isScalar() && "Expected vector VF");
1221 CallWideningDecisions[std::make_pair(CI, VF)] = {Kind, Variant, IID,
1222 MaskPos, Cost};
1223 }
1224
1226 ElementCount VF) const {
1227 assert(!VF.isScalar() && "Expected vector VF");
1228 return CallWideningDecisions.at(std::make_pair(CI, VF));
1229 }
1230
1231 /// Return True if instruction \p I is an optimizable truncate whose operand
1232 /// is an induction variable. Such a truncate will be removed by adding a new
1233 /// induction variable with the destination type.
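 /// For example, a "trunc i64 %iv to i32" of an induction variable that is
 /// only used as a narrower index can be replaced by a separate i32 induction
 /// variable rather than truncating on every iteration.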
1234 bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
1235 // If the instruction is not a truncate, return false.
1236 auto *Trunc = dyn_cast<TruncInst>(I);
1237 if (!Trunc)
1238 return false;
1239
1240 // Get the source and destination types of the truncate.
1241 Type *SrcTy = toVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1242 Type *DestTy = toVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1243
1244 // If the truncate is free for the given types, return false. Replacing a
1245 // free truncate with an induction variable would add an induction variable
1246 // update instruction to each iteration of the loop. We exclude from this
1247 // check the primary induction variable since it will need an update
1248 // instruction regardless.
1249 Value *Op = Trunc->getOperand(0);
1250 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1251 return false;
1252
1253 // If the truncated value is not an induction variable, return false.
1254 return Legal->isInductionPhi(Op);
1255 }
1256
1257 /// Collects the instructions to scalarize for each predicated instruction in
1258 /// the loop.
1260
1261 /// Collect Uniform and Scalar values for the given \p VF.
1262 /// The sets depend on CM decision for Load/Store instructions
1263 /// that may be vectorized as interleave, gather-scatter or scalarized.
1264 /// Also make a decision on what to do about call instructions in the loop
1265 /// at that VF -- scalarize, call a known vector routine, or call a
1266 /// vector intrinsic.
1268 // Do the analysis once.
1269 if (VF.isScalar() || Uniforms.contains(VF))
1270 return;
1272 collectLoopUniforms(VF);
1274 collectLoopScalars(VF);
1275 }
1276
1277 /// Returns true if the target machine supports masked store operation
1278 /// for the given \p DataType and kind of access to \p Ptr.
1279 bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const {
1280 return Legal->isConsecutivePtr(DataType, Ptr) &&
1281 TTI.isLegalMaskedStore(DataType, Alignment);
1282 }
1283
1284 /// Returns true if the target machine supports masked load operation
1285 /// for the given \p DataType and kind of access to \p Ptr.
1286 bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const {
1287 return Legal->isConsecutivePtr(DataType, Ptr) &&
1288 TTI.isLegalMaskedLoad(DataType, Alignment);
1289 }
1290
1291 /// Returns true if the target machine can represent \p V as a masked gather
1292 /// or scatter operation.
1294 bool LI = isa<LoadInst>(V);
1295 bool SI = isa<StoreInst>(V);
1296 if (!LI && !SI)
1297 return false;
1298 auto *Ty = getLoadStoreType(V);
1300 if (VF.isVector())
1301 Ty = VectorType::get(Ty, VF);
1302 return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
1303 (SI && TTI.isLegalMaskedScatter(Ty, Align));
1304 }
1305
1306 /// Returns true if the target machine supports all of the reduction
1307 /// variables found for the given VF.
1309 return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
1310 const RecurrenceDescriptor &RdxDesc = Reduction.second;
1311 return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
1312 }));
1313 }
1314
1315 /// Given costs for both strategies, return true if the scalar predication
1316 /// lowering should be used for div/rem. This incorporates an override
1317 /// option so it is not simply a cost comparison.
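 /// (Setting -force-widen-divrem-via-safe-divisor to true always selects the
 /// safe-divisor lowering; setting it to false always selects scalarization.)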
1319 InstructionCost SafeDivisorCost) const {
1320 switch (ForceSafeDivisor) {
1321 case cl::BOU_UNSET:
1322 return ScalarCost < SafeDivisorCost;
1323 case cl::BOU_TRUE:
1324 return false;
1325 case cl::BOU_FALSE:
1326 return true;
1327 }
1328 llvm_unreachable("impossible case value");
1329 }
1330
1331 /// Returns true if \p I is an instruction which requires predication and
1332 /// for which our chosen predication strategy is scalarization (i.e. we
1333 /// don't have an alternate strategy such as masking available).
1334 /// \p VF is the vectorization factor that will be used to vectorize \p I.
1336
1337 /// Returns true if \p I is an instruction that needs to be predicated
1338 /// at runtime. The result is independent of the predication mechanism.
1339 /// Superset of instructions that return true for isScalarWithPredication.
1340 bool isPredicatedInst(Instruction *I) const;
1341
1342 /// Return the costs for our two available strategies for lowering a
1343 /// div/rem operation which requires speculating at least one lane.
1344 /// First result is for scalarization (will be invalid for scalable
1345 /// vectors); second is for the safe-divisor strategy.
1346 std::pair<InstructionCost, InstructionCost>
1348 ElementCount VF) const;
1349
1350 /// Returns true if \p I is a memory instruction with consecutive memory
1351 /// access that can be widened.
1353
1354 /// Returns true if \p I is a memory instruction in an interleaved-group
1355 /// of memory accesses that can be vectorized with wide vector loads/stores
1356 /// and shuffles.
1358
1359 /// Check if \p Instr belongs to any interleaved access group.
1361 return InterleaveInfo.isInterleaved(Instr);
1362 }
1363
1364 /// Get the interleaved access group that \p Instr belongs to.
1367 return InterleaveInfo.getInterleaveGroup(Instr);
1368 }
1369
1370 /// Returns true if we're required to use a scalar epilogue for at least
1371 /// the final iteration of the original loop.
1372 bool requiresScalarEpilogue(bool IsVectorizing) const {
1373 if (!isScalarEpilogueAllowed()) {
1374 LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
1375 return false;
1376 }
1377 // If we might exit from anywhere but the latch and early exit vectorization
1378 // is disabled, we must run the exiting iteration in scalar form.
1381 LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: not exiting "
1382 "from latch block\n");
1383 return true;
1384 }
1385 if (IsVectorizing && InterleaveInfo.requiresScalarEpilogue()) {
1386 LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: "
1387 "interleaved group requires scalar epilogue\n");
1388 return true;
1389 }
1390 LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
1391 return false;
1392 }
1393
1394 /// Returns true if we're required to use a scalar epilogue for at least
1395 /// the final iteration of the original loop for all VFs in \p Range.
1396 /// A scalar epilogue must either be required for all VFs in \p Range or for
1397 /// none.
1399 auto RequiresScalarEpilogue = [this](ElementCount VF) {
1400 return requiresScalarEpilogue(VF.isVector());
1401 };
1402 bool IsRequired = all_of(Range, RequiresScalarEpilogue);
1403 assert(
1404 (IsRequired || none_of(Range, RequiresScalarEpilogue)) &&
1405 "all VFs in range must agree on whether a scalar epilogue is required");
1406 return IsRequired;
1407 }
1408
1409 /// Returns true if a scalar epilogue is not allowed due to optsize or a
1410 /// loop hint annotation.
1412 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1413 }
1414
1415 /// Returns the TailFoldingStyle that is best for the current loop.
1416 TailFoldingStyle getTailFoldingStyle(bool IVUpdateMayOverflow = true) const {
1417 if (!ChosenTailFoldingStyle)
1419 return IVUpdateMayOverflow ? ChosenTailFoldingStyle->first
1420 : ChosenTailFoldingStyle->second;
1421 }
1422
1423 /// Selects and saves TailFoldingStyle for 2 options - if IV update may
1424 /// overflow or not.
1425 /// \param IsScalableVF true if scalable vector factors enabled.
1426 /// \param UserIC User specific interleave count.
1427 void setTailFoldingStyles(bool IsScalableVF, unsigned UserIC) {
1428 assert(!ChosenTailFoldingStyle && "Tail folding must not be selected yet.");
1429 if (!Legal->canFoldTailByMasking()) {
1430 ChosenTailFoldingStyle =
1432 return;
1433 }
1434
1435 if (!ForceTailFoldingStyle.getNumOccurrences()) {
1436 ChosenTailFoldingStyle = std::make_pair(
1437 TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/true),
1438 TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/false));
1439 return;
1440 }
1441
1442 // Set styles when forced.
1443 ChosenTailFoldingStyle = std::make_pair(ForceTailFoldingStyle.getValue(),
1444 ForceTailFoldingStyle.getValue());
1446 return;
1447 // Override forced styles if needed.
1448 // FIXME: use actual opcode/data type for analysis here.
1449 // FIXME: Investigate opportunity for fixed vector factor.
1450 bool EVLIsLegal = UserIC <= 1 &&
1451 TTI.hasActiveVectorLength(0, nullptr, Align()) &&
1453 if (!EVLIsLegal) {
1454 // If for some reason EVL mode is unsupported, fallback to
1455 // DataWithoutLaneMask to try to vectorize the loop with folded tail
1456 // in a generic way.
1457 ChosenTailFoldingStyle =
1460 LLVM_DEBUG(
1461 dbgs()
1462 << "LV: Preference for VP intrinsics indicated. Will "
1463 "not try to generate VP Intrinsics "
1464 << (UserIC > 1
1465 ? "since interleave count specified is greater than 1.\n"
1466 : "due to non-interleaving reasons.\n"));
1467 }
1468 }
1469
1470 /// Returns true if all loop blocks should be masked to fold tail loop.
1471 bool foldTailByMasking() const {
1472 // TODO: check if it is possible to check for None style independent of
1473 // IVUpdateMayOverflow flag in getTailFoldingStyle.
1475 }
1476
1477 /// Return maximum safe number of elements to be processed per vector
1478 /// iteration, which do not prevent store-load forwarding and are safe with
1479 /// regard to the memory dependencies. Required for EVL-based VPlans to
1480 /// correctly calculate AVL (application vector length) as min(remaining AVL,
1481 /// MaxSafeElements).
1482 /// TODO: need to consider adjusting cost model to use this value as a
1483 /// vectorization factor for EVL-based vectorization.
1484 std::optional<unsigned> getMaxSafeElements() const { return MaxSafeElements; }
1485
1486 /// Returns true if the instructions in this block requires predication
1487 /// for any reason, e.g. because tail folding now requires a predicate
1488 /// or because the block in the original loop was predicated.
1491 }
1492
1493 /// Returns true if VP intrinsics with explicit vector length support should
1494 /// be generated in the tail folded loop.
1495 bool foldTailWithEVL() const {
1497 }
1498
1499 /// Returns true if the Phi is part of an inloop reduction.
1500 bool isInLoopReduction(PHINode *Phi) const {
1501 return InLoopReductions.contains(Phi);
1502 }
1503
1504 /// Returns true if the predicated reduction select should be used to set the
1505 /// incoming value for the reduction phi.
1506 bool usePredicatedReductionSelect(unsigned Opcode, Type *PhiTy) const {
1507 // Force to use predicated reduction select since the EVL of the
1508 // second-to-last iteration might not be VF*UF.
1509 if (foldTailWithEVL())
1510 return true;
1513 Opcode, PhiTy, TargetTransformInfo::ReductionFlags());
1514 }
1515
1516 /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1517 /// with factor VF. Return the cost of the instruction, including
1518 /// scalarization overhead if it's needed.
1520
1521 /// Estimate cost of a call instruction CI if it were vectorized with factor
1522 /// VF. Return the cost of the instruction, including scalarization overhead
1523 /// if it's needed.
1525
1526 /// Invalidates decisions already taken by the cost model.
1528 WideningDecisions.clear();
1529 CallWideningDecisions.clear();
1530 Uniforms.clear();
1531 Scalars.clear();
1532 }
1533
1534 /// Returns the expected execution cost. The unit of the cost does
1535 /// not matter because we use the 'cost' units to compare different
1536 /// vector widths. The cost that is returned is *not* normalized by
1537 /// the factor width.
1539
1540 bool hasPredStores() const { return NumPredStores > 0; }
1541
1542 /// Returns true if epilogue vectorization is considered profitable, and
1543 /// false otherwise.
1544 /// \p VF is the vectorization factor chosen for the original loop.
1545 /// \p Multiplier is an additional scaling factor applied to VF before
1546 /// comparing to EpilogueVectorizationMinVF.
1548 const unsigned IC) const;
1549
1550 /// Returns the execution time cost of an instruction for a given vector
1551 /// width. Vector width of one means scalar.
1553
1554 /// Return the cost of instructions in an inloop reduction pattern, if I is
1555 /// part of that pattern.
1556 std::optional<InstructionCost>
1559
1560 /// Returns true if \p Op should be considered invariant and if it is
1561 /// trivially hoistable.
1563
1564private:
1565 unsigned NumPredStores = 0;
1566
1567 /// \return An upper bound for the vectorization factors for both
1568 /// fixed and scalable vectorization, where the minimum-known number of
1569 /// elements is a power-of-2 larger than zero. If scalable vectorization is
1570 /// disabled or unsupported, then the scalable part will be equal to
1571 /// ElementCount::getScalable(0).
1572 FixedScalableVFPair computeFeasibleMaxVF(unsigned MaxTripCount,
1573 ElementCount UserVF,
1574 bool FoldTailByMasking);
1575
1576 /// \return the maximized element count based on the target's vector
1577 /// registers and the loop trip-count, but limited to a maximum safe VF.
1578 /// This is a helper function of computeFeasibleMaxVF.
1579 ElementCount getMaximizedVFForTarget(unsigned MaxTripCount,
1580 unsigned SmallestType,
1581 unsigned WidestType,
1582 ElementCount MaxSafeVF,
1583 bool FoldTailByMasking);
1584
1585 /// Checks if scalable vectorization is supported and enabled. Caches the
1586 /// result to avoid repeated debug dumps for repeated queries.
1587 bool isScalableVectorizationAllowed();
1588
1589 /// \return the maximum legal scalable VF, based on the safe max number
1590 /// of elements.
1591 ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
1592
1593 /// Calculate vectorization cost of memory instruction \p I.
1594 InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
1595
1596 /// The cost computation for scalarized memory instruction.
1597 InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1598
1599 /// The cost computation for interleaving group of memory instructions.
1600 InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
1601
1602 /// The cost computation for Gather/Scatter instruction.
1603 InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
1604
1605 /// The cost computation for widening instruction \p I with consecutive
1606 /// memory access.
1607 InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1608
1609 /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1610 /// Load: scalar load + broadcast.
1611 /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1612 /// element)
1613 InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
1614
1615 /// Estimate the overhead of scalarizing an instruction. This is a
1616 /// convenience wrapper for the type-based getScalarizationOverhead API.
1617 InstructionCost getScalarizationOverhead(Instruction *I, ElementCount VF,
1619
1620 /// Returns true if an artificially high cost for emulated masked memrefs
1621 /// should be used.
1622 bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);
1623
1624 /// Map of scalar integer values to the smallest bitwidth they can be legally
1625 /// represented as. The vector equivalents of these values should be truncated
1626 /// to this type.
1628
1629 /// A type representing the costs for instructions if they were to be
1630 /// scalarized rather than vectorized. The entries are Instruction-Cost
1631 /// pairs.
1632 using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
1633
1634 /// A set containing all BasicBlocks that are known to be present after
1635 /// vectorization as predicated blocks.
1637 PredicatedBBsAfterVectorization;
1638
1639 /// Records whether it is allowed to have the original scalar loop execute at
1640 /// least once. This may be needed as a fallback loop in case runtime
1641 /// aliasing/dependence checks fail, or to handle the tail/remainder
1642 /// iterations when the trip count is unknown or doesn't divide by the VF,
1643 /// or as a peel-loop to handle gaps in interleave-groups.
1644 /// Under optsize and when the trip count is very small we don't allow any
1645 /// iterations to execute in the scalar loop.
1646 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1647
1648 /// Controls the finally chosen tail-folding style. The first element is used
1649 /// if the IV update may overflow; the second is used if it does not.
1650 std::optional<std::pair<TailFoldingStyle, TailFoldingStyle>>
1651 ChosenTailFoldingStyle;
1652
1653 /// true if scalable vectorization is supported and enabled.
1654 std::optional<bool> IsScalableVectorizationAllowed;
1655
1656 /// Maximum safe number of elements to be processed per vector iteration,
1657 /// which do not prevent store-load forwarding and are safe with regard to the
1658 /// memory dependencies. Required for EVL-based vectorization, where this
1659 /// value is used as the upper bound of the safe AVL.
1660 std::optional<unsigned> MaxSafeElements;
1661
1662 /// A map holding scalar costs for different vectorization factors. The
1663 /// presence of a cost for an instruction in the mapping indicates that the
1664 /// instruction will be scalarized when vectorizing with the associated
1665 /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1667
1668 /// Holds the instructions known to be uniform after vectorization.
1669 /// The data is collected per VF.
1671
1672 /// Holds the instructions known to be scalar after vectorization.
1673 /// The data is collected per VF.
1675
1676 /// Holds the instructions (address computations) that are forced to be
1677 /// scalarized.
1679
1680 /// PHINodes of the reductions that should be expanded in-loop.
1681 SmallPtrSet<PHINode *, 4> InLoopReductions;
1682
1683 /// A Map of inloop reduction operations and their immediate chain operand.
1684 /// FIXME: This can be removed once reductions can be costed correctly in
1685 /// VPlan. This was added to allow quick lookup of the inloop operations.
1686 DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
1687
1688 /// Returns the expected difference in cost from scalarizing the expression
1689 /// feeding a predicated instruction \p PredInst. The instructions to
1690 /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1691 /// non-negative return value implies the expression will be scalarized.
1692 /// Currently, only single-use chains are considered for scalarization.
1693 InstructionCost computePredInstDiscount(Instruction *PredInst,
1694 ScalarCostsTy &ScalarCosts,
1695 ElementCount VF);
1696
1697 /// Collect the instructions that are uniform after vectorization. An
1698 /// instruction is uniform if we represent it with a single scalar value in
1699 /// the vectorized loop corresponding to each vector iteration. Examples of
1700 /// uniform instructions include pointer operands of consecutive or
1701 /// interleaved memory accesses. Note that although uniformity implies an
1702 /// instruction will be scalar, the reverse is not true. In general, a
1703 /// scalarized instruction will be represented by VF scalar values in the
1704 /// vectorized loop, each corresponding to an iteration of the original
1705 /// scalar loop.
1706 void collectLoopUniforms(ElementCount VF);
1707
1708 /// Collect the instructions that are scalar after vectorization. An
1709 /// instruction is scalar if it is known to be uniform or will be scalarized
1710 /// during vectorization. collectLoopScalars should only add non-uniform nodes
1711 /// to the list if they are used by a load/store instruction that is marked as
1712 /// CM_Scalarize. Non-uniform scalarized instructions will be represented by
1713 /// VF values in the vectorized loop, each corresponding to an iteration of
1714 /// the original scalar loop.
1715 void collectLoopScalars(ElementCount VF);
1716
1717 /// Keeps cost model vectorization decision and cost for instructions.
1718 /// Right now it is used for memory instructions only.
1720 std::pair<InstWidening, InstructionCost>>;
1721
1722 DecisionList WideningDecisions;
1723
1724 using CallDecisionList =
1725 DenseMap<std::pair<CallInst *, ElementCount>, CallWideningDecision>;
1726
1727 CallDecisionList CallWideningDecisions;
1728
1729 /// Returns true if \p V is expected to be vectorized and it needs to be
1730 /// extracted.
1731 bool needsExtract(Value *V, ElementCount VF) const {
1732 Instruction *I = dyn_cast<Instruction>(V);
1733 if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1736 return false;
1737
1738 // Assume we can vectorize V (and hence we need extraction) if the
1739 // scalars are not computed yet. This can happen, because it is called
1740 // via getScalarizationOverhead from setCostBasedWideningDecision, before
1741 // the scalars are collected. That should be a safe assumption in most
1742 // cases, because we check if the operands have vectorizable types
1743 // beforehand in LoopVectorizationLegality.
1744 return !Scalars.contains(VF) || !isScalarAfterVectorization(I, VF);
1745 };
1746
1747 /// Returns a range containing only operands needing to be extracted.
1748 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1749 ElementCount VF) const {
1751 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1752 }
1753
1754public:
1755 /// The loop that we evaluate.
1757
1758 /// Predicated scalar evolution analysis.
1760
1761 /// Loop Info analysis.
1763
1764 /// Vectorization legality.
1766
1767 /// Vector target information.
1769
1770 /// Target Library Info.
1772
1773 /// Demanded bits analysis.
1775
1776 /// Assumption cache.
1778
1779 /// Interface to emit optimization remarks.
1781
1783
1784 /// Loop Vectorize Hint.
1786
1787 /// The interleave access information contains groups of interleaved accesses
1788 /// with the same stride and close to each other.
1790
1791 /// Values to ignore in the cost model.
1793
1794 /// Values to ignore in the cost model when VF > 1.
1796
1797 /// All element types found in the loop.
1799};
1800} // end namespace llvm
1801
1802namespace {
1803/// Helper struct to manage generating runtime checks for vectorization.
1804///
1805 /// The runtime checks are created up-front in temporary blocks, un-linked from
1806 /// the existing IR, to allow better estimation of their cost. After deciding to
1807/// vectorize, the checks are moved back. If deciding not to vectorize, the
1808/// temporary blocks are completely removed.
1809class GeneratedRTChecks {
1810 /// Basic block which contains the generated SCEV checks, if any.
1811 BasicBlock *SCEVCheckBlock = nullptr;
1812
1813 /// The value representing the result of the generated SCEV checks. If it is
1814 /// nullptr, either no SCEV checks have been generated or they have been used.
1815 Value *SCEVCheckCond = nullptr;
1816
1817 /// Basic block which contains the generated memory runtime checks, if any.
1818 BasicBlock *MemCheckBlock = nullptr;
1819
1820 /// The value representing the result of the generated memory runtime checks.
1821 /// If it is nullptr, either no memory runtime checks have been generated or
1822 /// they have been used.
1823 Value *MemRuntimeCheckCond = nullptr;
1824
1825 DominatorTree *DT;
1826 LoopInfo *LI;
1828
1829 SCEVExpander SCEVExp;
1830 SCEVExpander MemCheckExp;
1831
1832 bool CostTooHigh = false;
1833 const bool AddBranchWeights;
1834
1835 Loop *OuterLoop = nullptr;
1836
1838
1839public:
1840 GeneratedRTChecks(PredicatedScalarEvolution &PSE, DominatorTree *DT,
1842 const DataLayout &DL, bool AddBranchWeights)
1843 : DT(DT), LI(LI), TTI(TTI), SCEVExp(*PSE.getSE(), DL, "scev.check"),
1844 MemCheckExp(*PSE.getSE(), DL, "scev.check"),
1845 AddBranchWeights(AddBranchWeights), PSE(PSE) {}
1846
1847 /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
1848 /// accurately estimate the cost of the runtime checks. The blocks are
1849 /// un-linked from the IR and are added back during vector code generation. If
1850 /// there is no vector code generation, the check blocks are removed
1851 /// completely.
1852 void create(Loop *L, const LoopAccessInfo &LAI,
1853 const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) {
1854
1855 // Hard cutoff to limit compile-time increase in case a very large number of
1856 // runtime checks needs to be generated.
1857 // TODO: Skip cutoff if the loop is guaranteed to execute, e.g. due to
1858 // profile info.
1859 CostTooHigh =
1861 if (CostTooHigh)
1862 return;
1863
1864 BasicBlock *LoopHeader = L->getHeader();
1865 BasicBlock *Preheader = L->getLoopPreheader();
1866
1867 // Use SplitBlock to create blocks for SCEV & memory runtime checks to
1868 // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
1869 // may be used by SCEVExpander. The blocks will be un-linked from their
1870 // predecessors and removed from LI & DT at the end of the function.
1871 if (!UnionPred.isAlwaysTrue()) {
1872 SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
1873 nullptr, "vector.scevcheck");
1874
1875 SCEVCheckCond = SCEVExp.expandCodeForPredicate(
1876 &UnionPred, SCEVCheckBlock->getTerminator());
1877 }
1878
1879 const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
1880 if (RtPtrChecking.Need) {
1881 auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
1882 MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
1883 "vector.memcheck");
1884
1885 auto DiffChecks = RtPtrChecking.getDiffChecks();
1886 if (DiffChecks) {
1887 Value *RuntimeVF = nullptr;
1888 MemRuntimeCheckCond = addDiffRuntimeChecks(
1889 MemCheckBlock->getTerminator(), *DiffChecks, MemCheckExp,
1890 [VF, &RuntimeVF](IRBuilderBase &B, unsigned Bits) {
1891 if (!RuntimeVF)
1892 RuntimeVF = getRuntimeVF(B, B.getIntNTy(Bits), VF);
1893 return RuntimeVF;
1894 },
1895 IC);
1896 } else {
1897 MemRuntimeCheckCond = addRuntimeChecks(
1898 MemCheckBlock->getTerminator(), L, RtPtrChecking.getChecks(),
1900 }
1901 assert(MemRuntimeCheckCond &&
1902 "no RT checks generated although RtPtrChecking "
1903 "claimed checks are required");
1904 }
1905
1906 if (!MemCheckBlock && !SCEVCheckBlock)
1907 return;
1908
1909 // Unhook the temporary block with the checks, update various places
1910 // accordingly.
1911 if (SCEVCheckBlock)
1912 SCEVCheckBlock->replaceAllUsesWith(Preheader);
1913 if (MemCheckBlock)
1914 MemCheckBlock->replaceAllUsesWith(Preheader);
1915
1916 if (SCEVCheckBlock) {
1917 SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
1918 new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
1919 Preheader->getTerminator()->eraseFromParent();
1920 }
1921 if (MemCheckBlock) {
1922 MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
1923 new UnreachableInst(Preheader->getContext(), MemCheckBlock);
1924 Preheader->getTerminator()->eraseFromParent();
1925 }
1926
1927 DT->changeImmediateDominator(LoopHeader, Preheader);
1928 if (MemCheckBlock) {
1929 DT->eraseNode(MemCheckBlock);
1930 LI->removeBlock(MemCheckBlock);
1931 }
1932 if (SCEVCheckBlock) {
1933 DT->eraseNode(SCEVCheckBlock);
1934 LI->removeBlock(SCEVCheckBlock);
1935 }
1936
1937 // Outer loop is used as part of the later cost calculations.
1938 OuterLoop = L->getParentLoop();
1939 }
1940
1941 InstructionCost getCost() {
1942 if (SCEVCheckBlock || MemCheckBlock)
1943 LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n");
1944
1945 if (CostTooHigh) {
1947 Cost.setInvalid();
1948 LLVM_DEBUG(dbgs() << " number of checks exceeded threshold\n");
1949 return Cost;
1950 }
1951
1952 InstructionCost RTCheckCost = 0;
1953 if (SCEVCheckBlock)
1954 for (Instruction &I : *SCEVCheckBlock) {
1955 if (SCEVCheckBlock->getTerminator() == &I)
1956 continue;
1959 LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n");
1960 RTCheckCost += C;
1961 }
1962 if (MemCheckBlock) {
1963 InstructionCost MemCheckCost = 0;
1964 for (Instruction &I : *MemCheckBlock) {
1965 if (MemCheckBlock->getTerminator() == &I)
1966 continue;
1969 LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n");
1970 MemCheckCost += C;
1971 }
1972
1973 // If the runtime memory checks are being created inside an outer loop
1974 // we should find out if these checks are outer loop invariant. If so,
1975 // the checks will likely be hoisted out and so the effective cost will
1976 // reduce according to the outer loop trip count.
1977 if (OuterLoop) {
1978 ScalarEvolution *SE = MemCheckExp.getSE();
1979 // TODO: If profitable, we could refine this further by analysing every
1980 // individual memory check, since there could be a mixture of loop
1981 // variant and invariant checks that mean the final condition is
1982 // variant.
1983 const SCEV *Cond = SE->getSCEV(MemRuntimeCheckCond);
1984 if (SE->isLoopInvariant(Cond, OuterLoop)) {
1985 // It seems reasonable to assume that we can reduce the effective
1986 // cost of the checks even when we know nothing about the trip
1987 // count. Assume that the outer loop executes at least twice.
1988 unsigned BestTripCount = 2;
1989
1990 // Get the best known TC estimate.
1991 if (auto EstimatedTC = getSmallBestKnownTC(
1992 PSE, OuterLoop, /* CanUseConstantMax = */ false))
1993 BestTripCount = *EstimatedTC;
1994
1995 BestTripCount = std::max(BestTripCount, 1U);
1996 InstructionCost NewMemCheckCost = MemCheckCost / BestTripCount;
1997
1998 // Let's ensure the cost is always at least 1.
1999 NewMemCheckCost = std::max(*NewMemCheckCost.getValue(),
2001
2002 if (BestTripCount > 1)
2004 << "We expect runtime memory checks to be hoisted "
2005 << "out of the outer loop. Cost reduced from "
2006 << MemCheckCost << " to " << NewMemCheckCost << '\n');
2007
2008 MemCheckCost = NewMemCheckCost;
2009 }
2010 }
2011
2012 RTCheckCost += MemCheckCost;
2013 }
2014
2015 if (SCEVCheckBlock || MemCheckBlock)
2016 LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost
2017 << "\n");
2018
2019 return RTCheckCost;
2020 }
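
  // Illustrative sketch (not part of the pass; exampleHoistedMemCheckCost is a
  // hypothetical helper added for exposition only): when the memory checks are
  // invariant in an enclosing outer loop, their cost is amortized over the
  // best-known outer trip count and kept at a minimum of 1, mirroring the
  // adjustment performed in getCost() above.
  static unsigned exampleHoistedMemCheckCost(unsigned MemCheckCost,
                                             unsigned BestOuterTripCount) {
    if (BestOuterTripCount == 0)
      BestOuterTripCount = 1;                  // guard against division by zero
    unsigned NewCost = MemCheckCost / BestOuterTripCount;
    return NewCost == 0 ? 1 : NewCost;         // cost is always at least 1
  }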
2021
2022 /// Remove the created SCEV & memory runtime check blocks & instructions, if
2023 /// unused.
2024 ~GeneratedRTChecks() {
2025 SCEVExpanderCleaner SCEVCleaner(SCEVExp);
2026 SCEVExpanderCleaner MemCheckCleaner(MemCheckExp);
2027 if (!SCEVCheckCond)
2028 SCEVCleaner.markResultUsed();
2029
2030 if (!MemRuntimeCheckCond)
2031 MemCheckCleaner.markResultUsed();
2032
2033 if (MemRuntimeCheckCond) {
2034 auto &SE = *MemCheckExp.getSE();
2035 // Memory runtime check generation creates compares that use expanded
2036 // values. Remove them before running the SCEVExpanderCleaners.
2037 for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) {
2038 if (MemCheckExp.isInsertedInstruction(&I))
2039 continue;
2040 SE.forgetValue(&I);
2041 I.eraseFromParent();
2042 }
2043 }
2044 MemCheckCleaner.cleanup();
2045 SCEVCleaner.cleanup();
2046
2047 if (SCEVCheckCond)
2048 SCEVCheckBlock->eraseFromParent();
2049 if (MemRuntimeCheckCond)
2050 MemCheckBlock->eraseFromParent();
2051 }
2052
2053 /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and
2054 /// adjusts the branches to branch to the vector preheader or \p Bypass,
2055 /// depending on the generated condition.
2056 BasicBlock *emitSCEVChecks(BasicBlock *Bypass,
2057 BasicBlock *LoopVectorPreHeader) {
2058 if (!SCEVCheckCond)
2059 return nullptr;
2060
2061 Value *Cond = SCEVCheckCond;
2062 // Mark the check as used, to prevent it from being removed during cleanup.
2063 SCEVCheckCond = nullptr;
2064 if (auto *C = dyn_cast<ConstantInt>(Cond))
2065 if (C->isZero())
2066 return nullptr;
2067
2068 auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2069
2070 BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock);
2071 // Create new preheader for vector loop.
2072 if (OuterLoop)
2073 OuterLoop->addBasicBlockToLoop(SCEVCheckBlock, *LI);
2074
2075 SCEVCheckBlock->getTerminator()->eraseFromParent();
2076 SCEVCheckBlock->moveBefore(LoopVectorPreHeader);
2077 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2078 SCEVCheckBlock);
2079
2080 DT->addNewBlock(SCEVCheckBlock, Pred);
2081 DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock);
2082
2083 BranchInst &BI = *BranchInst::Create(Bypass, LoopVectorPreHeader, Cond);
2084 if (AddBranchWeights)
2085 setBranchWeights(BI, SCEVCheckBypassWeights, /*IsExpected=*/false);
2086 ReplaceInstWithInst(SCEVCheckBlock->getTerminator(), &BI);
2087 return SCEVCheckBlock;
2088 }
2089
2090 /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts
2091 /// the branches to branch to the vector preheader or \p Bypass, depending on
2092 /// the generated condition.
2093 BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass,
2094 BasicBlock *LoopVectorPreHeader) {
2095 // Check if we generated code that checks at runtime whether arrays overlap.
2096 if (!MemRuntimeCheckCond)
2097 return nullptr;
2098
2099 auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2100 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2101 MemCheckBlock);
2102
2103 DT->addNewBlock(MemCheckBlock, Pred);
2104 DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock);
2105 MemCheckBlock->moveBefore(LoopVectorPreHeader);
2106
2107 if (OuterLoop)
2108 OuterLoop->addBasicBlockToLoop(MemCheckBlock, *LI);
2109
2110 BranchInst &BI =
2111 *BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond);
2112 if (AddBranchWeights) {
2113 setBranchWeights(BI, MemCheckBypassWeights, /*IsExpected=*/false);
2114 }
2115 ReplaceInstWithInst(MemCheckBlock->getTerminator(), &BI);
2116 MemCheckBlock->getTerminator()->setDebugLoc(
2117 Pred->getTerminator()->getDebugLoc());
2118
2119 // Mark the check as used, to prevent it from being removed during cleanup.
2120 MemRuntimeCheckCond = nullptr;
2121 return MemCheckBlock;
2122 }
2123};
2124} // namespace
2125
2127 return Style == TailFoldingStyle::Data ||
2128 Style == TailFoldingStyle::DataAndControlFlow ||
2129 Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
2130}
2131
2133 return Style == TailFoldingStyle::DataAndControlFlow ||
2134 Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
2135}
2136
2137// Return true if \p OuterLp is an outer loop annotated with hints for explicit
2138// vectorization. The loop needs to be annotated with #pragma omp simd
2139// simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
2140// vector length information is not provided, vectorization is not considered
2141// explicit. Interleave hints are not allowed either. These limitations will be
2142// relaxed in the future.
2143 // Please note that we are currently forced to abuse the pragma 'clang
2144// vectorize' semantics. This pragma provides *auto-vectorization hints*
2145// (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
2146// provides *explicit vectorization hints* (LV can bypass legal checks and
2147// assume that vectorization is legal). However, both hints are implemented
2148// using the same metadata (llvm.loop.vectorize, processed by
2149// LoopVectorizeHints). This will be fixed in the future when the native IR
2150// representation for pragma 'omp simd' is introduced.
2151static bool isExplicitVecOuterLoop(Loop *OuterLp,
2153 assert(!OuterLp->isInnermost() && "This is not an outer loop");
2154 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
2155
2156 // Only outer loops with an explicit vectorization hint are supported.
2157 // Unannotated outer loops are ignored.
2159 return false;
2160
2161 Function *Fn = OuterLp->getHeader()->getParent();
2162 if (!Hints.allowVectorization(Fn, OuterLp,
2163 true /*VectorizeOnlyWhenForced*/)) {
2164 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
2165 return false;
2166 }
2167
2168 if (Hints.getInterleave() > 1) {
2169 // TODO: Interleave support is future work.
2170 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
2171 "outer loops.\n");
2172 Hints.emitRemarkWithHints();
2173 return false;
2174 }
2175
2176 return true;
2177}
2178
2182 // Collect inner loops and outer loops without irreducible control flow. For
2183 // now, only collect outer loops that have explicit vectorization hints. If we
2184 // are stress testing the VPlan H-CFG construction, we collect the outermost
2185 // loop of every loop nest.
2186 if (L.isInnermost() || VPlanBuildStressTest ||
2188 LoopBlocksRPO RPOT(&L);
2189 RPOT.perform(LI);
2190 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
2191 V.push_back(&L);
2192 // TODO: Collect inner loops inside marked outer loops in case
2193 // vectorization fails for the outer loop. Do not invoke
2194 // 'containsIrreducibleCFG' again for inner loops when the outer loop is
2195 // already known to be reducible. We can use an inherited attribute for
2196 // that.
2197 return;
2198 }
2199 }
2200 for (Loop *InnerL : L)
2201 collectSupportedLoops(*InnerL, LI, ORE, V);
2202}
2203
2204//===----------------------------------------------------------------------===//
2205// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
2206// LoopVectorizationCostModel and LoopVectorizationPlanner.
2207//===----------------------------------------------------------------------===//
2208
2209/// Compute the transformed value of Index at offset StartValue using step
2210/// StepValue.
2211/// For integer induction, returns StartValue + Index * StepValue.
2212/// For pointer induction, returns StartValue[Index * StepValue].
2213/// FIXME: The newly created binary instructions should contain nsw/nuw
2214/// flags, which can be found from the original scalar operations.
2215static Value *
2217 Value *Step,
2219 const BinaryOperator *InductionBinOp) {
2220 Type *StepTy = Step->getType();
2221 Value *CastedIndex = StepTy->isIntegerTy()
2222 ? B.CreateSExtOrTrunc(Index, StepTy)
2223 : B.CreateCast(Instruction::SIToFP, Index, StepTy);
2224 if (CastedIndex != Index) {
2225 CastedIndex->setName(CastedIndex->getName() + ".cast");
2226 Index = CastedIndex;
2227 }
2228
2229 // Note: the IR at this point is broken. We cannot use SE to create any new
2230 // SCEV and then expand it, hoping that SCEV's simplification will give us
2231 // more optimal code. Unfortunately, attempting to do so on invalid IR may
2232 // lead to various SCEV crashes. So all we can do is use the builder and rely
2233 // on InstCombine for future simplifications. Here we handle some trivial
2234 // cases only.
2235 auto CreateAdd = [&B](Value *X, Value *Y) {
2236 assert(X->getType() == Y->getType() && "Types don't match!");
2237 if (auto *CX = dyn_cast<ConstantInt>(X))
2238 if (CX->isZero())
2239 return Y;
2240 if (auto *CY = dyn_cast<ConstantInt>(Y))
2241 if (CY->isZero())
2242 return X;
2243 return B.CreateAdd(X, Y);
2244 };
2245
2246 // We allow X to be a vector type, in which case Y will potentially be
2247 // splatted into a vector with the same element count.
2248 auto CreateMul = [&B](Value *X, Value *Y) {
2249 assert(X->getType()->getScalarType() == Y->getType() &&
2250 "Types don't match!");
2251 if (auto *CX = dyn_cast<ConstantInt>(X))
2252 if (CX->isOne())
2253 return Y;
2254 if (auto *CY = dyn_cast<ConstantInt>(Y))
2255 if (CY->isOne())
2256 return X;
2257 VectorType *XVTy = dyn_cast<VectorType>(X->getType());
2258 if (XVTy && !isa<VectorType>(Y->getType()))
2259 Y = B.CreateVectorSplat(XVTy->getElementCount(), Y);
2260 return B.CreateMul(X, Y);
2261 };
2262
2263 switch (InductionKind) {
2265 assert(!isa<VectorType>(Index->getType()) &&
2266 "Vector indices not supported for integer inductions yet");
2267 assert(Index->getType() == StartValue->getType() &&
2268 "Index type does not match StartValue type");
2269 if (isa<ConstantInt>(Step) && cast<ConstantInt>(Step)->isMinusOne())
2270 return B.CreateSub(StartValue, Index);
2271 auto *Offset = CreateMul(Index, Step);
2272 return CreateAdd(StartValue, Offset);
2273 }
2275 return B.CreatePtrAdd(StartValue, CreateMul(Index, Step));
2277 assert(!isa<VectorType>(Index->getType()) &&
2278 "Vector indices not supported for FP inductions yet");
2279 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2280 assert(InductionBinOp &&
2281 (InductionBinOp->getOpcode() == Instruction::FAdd ||
2282 InductionBinOp->getOpcode() == Instruction::FSub) &&
2283 "Original bin op should be defined for FP induction");
2284
2285 Value *MulExp = B.CreateFMul(Step, Index);
2286 return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
2287 "induction");
2288 }
2290 return nullptr;
2291 }
2292 llvm_unreachable("invalid enum");
2293}
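
// Illustrative sketch (not part of the pass; exampleTransformedIntIndex is a
// hypothetical helper for exposition only): for an integer induction the
// transformed value is StartValue + Index * StepValue, e.g. Start = 10,
// Step = 3, Index = 4 yields 22. Pointer and FP inductions use the analogous
// ptradd / fadd-fsub forms handled above.
static long long exampleTransformedIntIndex(long long Start, long long Step,
                                            long long Index) {
  return Start + Index * Step;
}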
2294
2295std::optional<unsigned> getMaxVScale(const Function &F,
2296 const TargetTransformInfo &TTI) {
2297 if (std::optional<unsigned> MaxVScale = TTI.getMaxVScale())
2298 return MaxVScale;
2299
2300 if (F.hasFnAttribute(Attribute::VScaleRange))
2301 return F.getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax();
2302
2303 return std::nullopt;
2304}
2305
2306/// For the given VF and UF and maximum trip count computed for the loop, return
2307/// whether the induction variable might overflow in the vectorized loop. If not,
2308/// then we know a runtime overflow check always evaluates to false and can be
2309/// removed.
2312 ElementCount VF, std::optional<unsigned> UF = std::nullopt) {
2313 // Always be conservative if we don't know the exact unroll factor.
2314 unsigned MaxUF = UF ? *UF : Cost->TTI.getMaxInterleaveFactor(VF);
2315
2316 Type *IdxTy = Cost->Legal->getWidestInductionType();
2317 APInt MaxUIntTripCount = cast<IntegerType>(IdxTy)->getMask();
2318
2319 // The runtime overflow check is known to be false iff the (max) trip-count
2320 // is known and (max) trip-count + (VF * UF) does not overflow in the type of
2321 // the vector loop induction variable.
2322 if (unsigned TC = Cost->PSE.getSmallConstantMaxTripCount()) {
2323 uint64_t MaxVF = VF.getKnownMinValue();
2324 if (VF.isScalable()) {
2325 std::optional<unsigned> MaxVScale =
2326 getMaxVScale(*Cost->TheFunction, Cost->TTI);
2327 if (!MaxVScale)
2328 return false;
2329 MaxVF *= *MaxVScale;
2330 }
2331
2332 return (MaxUIntTripCount - TC).ugt(MaxVF * MaxUF);
2333 }
2334
2335 return false;
2336}
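
// Illustrative sketch (not part of the pass; exampleIndvarOverflowKnownFalse is
// a hypothetical helper for exposition only): with a known constant (max) trip
// count TC, the overflow check is known false iff MaxUIntTripCount - TC is
// strictly greater than MaxVF * MaxUF, where MaxVF is already scaled by the
// maximum vscale for scalable VFs.
static bool exampleIndvarOverflowKnownFalse(unsigned long long MaxUIntTripCount,
                                            unsigned long long TC,
                                            unsigned long long MaxVF,
                                            unsigned long long MaxUF) {
  if (TC == 0) // unknown trip count: stay conservative
    return false;
  return (MaxUIntTripCount - TC) > (MaxVF * MaxUF);
}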
2337
2338// Return whether we allow using masked interleave-groups (for dealing with
2339// strided loads/stores that reside in predicated blocks, or for dealing
2340// with gaps).
2342 // If an override option has been passed in for interleaved accesses, use it.
2345
2347}
2348
2350 VPReplicateRecipe *RepRecipe,
2351 const VPLane &Lane,
2352 VPTransformState &State) {
2353 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2354
2355 // Does this instruction return a value?
2356 bool IsVoidRetTy = Instr->getType()->isVoidTy();
2357
2358 Instruction *Cloned = Instr->clone();
2359 if (!IsVoidRetTy) {
2360 Cloned->setName(Instr->getName() + ".cloned");
2361#if !defined(NDEBUG)
2362 // Verify that VPlan type inference results agree with the type of the
2363 // generated values.
2364 assert(State.TypeAnalysis.inferScalarType(RepRecipe) == Cloned->getType() &&
2365 "inferred type and type from generated instructions do not match");
2366#endif
2367 }
2368
2369 RepRecipe->setFlags(Cloned);
2370
2371 if (auto DL = Instr->getDebugLoc())
2372 State.setDebugLocFrom(DL);
2373
2374 // Replace the operands of the cloned instructions with their scalar
2375 // equivalents in the new loop.
2376 for (const auto &I : enumerate(RepRecipe->operands())) {
2377 auto InputLane = Lane;
2378 VPValue *Operand = I.value();
2380 InputLane = VPLane::getFirstLane();
2381 Cloned->setOperand(I.index(), State.get(Operand, InputLane));
2382 }
2383 State.addNewMetadata(Cloned, Instr);
2384
2385 // Place the cloned scalar in the new loop.
2386 State.Builder.Insert(Cloned);
2387
2388 State.set(RepRecipe, Cloned, Lane);
2389
2390 // If we just cloned a new assumption, add it to the assumption cache.
2391 if (auto *II = dyn_cast<AssumeInst>(Cloned))
2393
2394 // End if-block.
2395 VPRegionBlock *Parent = RepRecipe->getParent()->getParent();
2396 bool IfPredicateInstr = Parent ? Parent->isReplicator() : false;
2397 assert(
2398 (Parent || !RepRecipe->getParent()->getPlan()->getVectorLoopRegion() ||
2399 all_of(RepRecipe->operands(),
2400 [](VPValue *Op) { return Op->isDefinedOutsideLoopRegions(); })) &&
2401 "Expected a recipe to be either within a region or to have all of its "
2402 "operands defined outside the vectorized region.");
2403 if (IfPredicateInstr)
2404 PredicatedInstructions.push_back(Cloned);
2405}
2406
2407Value *
2409 if (VectorTripCount)
2410 return VectorTripCount;
2411
2412 Value *TC = getTripCount();
2413 IRBuilder<> Builder(InsertBlock->getTerminator());
2414
2415 Type *Ty = TC->getType();
2416 // This is where we can make the step a runtime constant.
2417 Value *Step = createStepForVF(Builder, Ty, VF, UF);
2418
2419 // If the tail is to be folded by masking, round the number of iterations N
2420 // up to a multiple of Step instead of rounding down. This is done by first
2421 // adding Step-1 and then rounding down. Note that it's ok if this addition
2422 // overflows: the vector induction variable will eventually wrap to zero given
2423 // that it starts at zero and its Step is a power of two; the loop will then
2424 // exit, with the last early-exit vector comparison also producing all-true.
2425 // For scalable vectors the VF is not guaranteed to be a power of 2, but this
2426 // is accounted for in emitIterationCountCheck that adds an overflow check.
2427 if (Cost->foldTailByMasking()) {
2429 "VF*UF must be a power of 2 when folding tail by masking");
2430 TC = Builder.CreateAdd(TC, Builder.CreateSub(Step, ConstantInt::get(Ty, 1)),
2431 "n.rnd.up");
2432 }
2433
2434 // Now we need to generate the expression for the part of the loop that the
2435 // vectorized body will execute. This is equal to N - (N % Step) if scalar
2436 // iterations are not required for correctness, or N - Step, otherwise. Step
2437 // is equal to the vectorization factor (number of SIMD elements) times the
2438 // unroll factor (number of SIMD instructions).
2439 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2440
2441 // There are cases where we *must* run at least one iteration in the remainder
2442 // loop. See the cost model for when this can happen. If the step evenly
2443 // divides the trip count, we set the remainder to be equal to the step. If
2444 // the step does not evenly divide the trip count, no adjustment is necessary
2445 // since there will already be scalar iterations. Note that the minimum
2446 // iterations check ensures that N >= Step.
2447 if (Cost->requiresScalarEpilogue(VF.isVector())) {
2448 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2449 R = Builder.CreateSelect(IsZero, Step, R);
2450 }
2451
2452 VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2453
2454 return VectorTripCount;
2455}
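
// Illustrative sketch (not part of the pass; exampleVectorTripCount is a
// hypothetical helper for exposition only): the vector trip count is N rounded
// down to a multiple of Step = VF * UF. With tail folding N is first rounded
// up, and when a scalar epilogue is required a zero remainder is bumped to a
// full Step so the epilogue executes at least once.
static unsigned long long exampleVectorTripCount(unsigned long long N,
                                                 unsigned long long Step,
                                                 bool FoldTailByMasking,
                                                 bool RequiresScalarEpilogue) {
  if (FoldTailByMasking)
    N += Step - 1;                 // n.rnd.up; wrapping is benign for pow-2 Step
  unsigned long long R = N % Step; // n.mod.vf
  if (RequiresScalarEpilogue && R == 0)
    R = Step;                      // force at least one scalar iteration
  return N - R;                    // n.vec
}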
2456
2458 VPBlockBase *ScalarPH = Plan.getScalarPreheader();
2460 if (PreVectorPH->getNumSuccessors() != 1) {
2461 assert(PreVectorPH->getNumSuccessors() == 2 && "Expected 2 successors");
2462 assert(PreVectorPH->getSuccessors()[0] == ScalarPH &&
2463 "Unexpected successor");
2464 VPIRBasicBlock *CheckVPIRBB = Plan.createVPIRBasicBlock(CheckIRBB);
2465 VPBlockUtils::insertOnEdge(PreVectorPH, VectorPHVPB, CheckVPIRBB);
2466 PreVectorPH = CheckVPIRBB;
2467 }
2468 VPBlockUtils::connectBlocks(PreVectorPH, ScalarPH);
2469 PreVectorPH->swapSuccessors();
2470}
2471
2473 Value *Count = getTripCount();
2474 // Reuse existing vector loop preheader for TC checks.
2475 // Note that new preheader block is generated for vector loop.
2476 BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
2477 IRBuilder<> Builder(TCCheckBlock->getTerminator());
2478
2479 // Generate code to check if the loop's trip count is less than VF * UF, or
2480 // equal to it in case a scalar epilogue is required; this implies that the
2481 // vector trip count is zero. This check also covers the case where adding one
2482 // to the backedge-taken count overflowed leading to an incorrect trip count
2483 // of zero. In this case we will also jump to the scalar loop.
2484 auto P = Cost->requiresScalarEpilogue(VF.isVector()) ? ICmpInst::ICMP_ULE
2486
2487 // If tail is to be folded, vector loop takes care of all iterations.
2488 Type *CountTy = Count->getType();
2489 Value *CheckMinIters = Builder.getFalse();
2490 auto CreateStep = [&]() -> Value * {
2491 // Create step with max(MinProfTripCount, UF * VF).
2493 return createStepForVF(Builder, CountTy, VF, UF);
2494
2495 Value *MinProfTC =
2497 if (!VF.isScalable())
2498 return MinProfTC;
2500 Intrinsic::umax, MinProfTC, createStepForVF(Builder, CountTy, VF, UF));
2501 };
2502
2503 TailFoldingStyle Style = Cost->getTailFoldingStyle();
2504 if (Style == TailFoldingStyle::None) {
2505 Value *Step = CreateStep();
2506 ScalarEvolution &SE = *PSE.getSE();
2507 // TODO: Emit unconditional branch to vector preheader instead of
2508 // conditional branch with known condition.
2509 const SCEV *TripCountSCEV = SE.applyLoopGuards(SE.getSCEV(Count), OrigLoop);
2510 // Check if the trip count is < the step.
2511 if (SE.isKnownPredicate(P, TripCountSCEV, SE.getSCEV(Step))) {
2512 // TODO: Ensure step is at most the trip count when determining max VF and
2513 // UF, w/o tail folding.
2514 CheckMinIters = Builder.getTrue();
2516 TripCountSCEV, SE.getSCEV(Step))) {
2517 // Generate the minimum iteration check only if we cannot prove the
2518 // check is known to be true, or known to be false.
2519 CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check");
2520 } // else step known to be < trip count, use CheckMinIters preset to false.
2521 } else if (VF.isScalable() &&
2524 // vscale is not necessarily a power-of-2, which means we cannot guarantee
2525 // an overflow to zero when updating induction variables and so an
2526 // additional overflow check is required before entering the vector loop.
2527
2528 // Get the maximum unsigned value for the type.
2529 Value *MaxUIntTripCount =
2530 ConstantInt::get(CountTy, cast<IntegerType>(CountTy)->getMask());
2531 Value *LHS = Builder.CreateSub(MaxUIntTripCount, Count);
2532
2533 // Don't execute the vector loop if (UMax - n) < (VF * UF).
2534 CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, CreateStep());
2535 }
2536
2537 // Create new preheader for vector loop.
2539 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
2540 "vector.ph");
2541
2542 assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
2543 DT->getNode(Bypass)->getIDom()) &&
2544 "TC check is expected to dominate Bypass");
2545
2546 BranchInst &BI =
2547 *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
2549 setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false);
2550 ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
2551 LoopBypassBlocks.push_back(TCCheckBlock);
2552
2553 // TODO: Wrap LoopVectorPreHeader in VPIRBasicBlock here.
2554 introduceCheckBlockInVPlan(TCCheckBlock);
2555}
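
// Illustrative sketch (not part of the pass; exampleBypassVectorLoop is a
// hypothetical helper for exposition only): the minimum-iteration check
// bypasses the vector loop when the trip count is below
// max(MinProfitableTripCount, VF * UF), or equal to it when a scalar epilogue
// is required, since the epilogue then needs at least one iteration.
static bool exampleBypassVectorLoop(unsigned long long TripCount,
                                    unsigned long long VFxUF,
                                    unsigned long long MinProfitableTripCount,
                                    bool RequiresScalarEpilogue) {
  unsigned long long Step =
      VFxUF > MinProfitableTripCount ? VFxUF : MinProfitableTripCount;
  return RequiresScalarEpilogue ? TripCount <= Step : TripCount < Step;
}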
2556
2558 BasicBlock *const SCEVCheckBlock =
2559 RTChecks.emitSCEVChecks(Bypass, LoopVectorPreHeader);
2560 if (!SCEVCheckBlock)
2561 return nullptr;
2562
2563 assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
2565 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
2566 "Cannot SCEV check stride or overflow when optimizing for size");
2567 assert(!LoopBypassBlocks.empty() &&
2568 "Should already be a bypass block due to iteration count check");
2569 LoopBypassBlocks.push_back(SCEVCheckBlock);
2570 AddedSafetyChecks = true;
2571
2572 introduceCheckBlockInVPlan(SCEVCheckBlock);
2573 return SCEVCheckBlock;
2574}
2575
2577 // VPlan-native path does not do any analysis for runtime checks currently.
2579 return nullptr;
2580
2581 BasicBlock *const MemCheckBlock =
2582 RTChecks.emitMemRuntimeChecks(Bypass, LoopVectorPreHeader);
2583
2584 // Check if we generated code that checks at runtime whether arrays overlap.
2585 // We put the checks into a separate block to make the more common case of few
2586 // elements faster.
2587 if (!MemCheckBlock)
2588 return nullptr;
2589
2590 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
2591 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
2592 "Cannot emit memory checks when optimizing for size, unless forced "
2593 "to vectorize.");
2594 ORE->emit([&]() {
2595 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
2598 << "Code-size may be reduced by not forcing "
2599 "vectorization, or by source-code modifications "
2600 "eliminating the need for runtime checks "
2601 "(e.g., adding 'restrict').";
2602 });
2603 }
2604
2605 LoopBypassBlocks.push_back(MemCheckBlock);
2606
2607 AddedSafetyChecks = true;
2608
2609 introduceCheckBlockInVPlan(MemCheckBlock);
2610 return MemCheckBlock;
2611}
2612
2613/// Replace \p VPBB with a VPIRBasicBlock wrapping \p IRBB. All recipes from \p
2614/// VPBB are moved to the end of the newly created VPIRBasicBlock. VPBB must
2615/// have a single predecessor, which is rewired to the new VPIRBasicBlock. All
2616/// successors of VPBB, if any, are rewired to the new VPIRBasicBlock.
2618 VPIRBasicBlock *IRVPBB = VPBB->getPlan()->createVPIRBasicBlock(IRBB);
2619 for (auto &R : make_early_inc_range(*VPBB)) {
2620 assert(!R.isPhi() && "Tried to move phi recipe to end of block");
2621 R.moveBefore(*IRVPBB, IRVPBB->end());
2622 }
2623
2624 VPBlockUtils::reassociateBlocks(VPBB, IRVPBB);
2625 // VPBB is now dead and will be cleaned up when the plan gets destroyed.
2626}
2627
2630 assert(LoopVectorPreHeader && "Invalid loop structure");
2632 Cost->requiresScalarEpilogue(VF.isVector())) &&
2633 "loops not exiting via the latch without required epilogue?");
2634
2637 LI, nullptr, Twine(Prefix) + "middle.block");
2641 nullptr, Twine(Prefix) + "scalar.ph");
2643}
2644
2645/// Return the expanded step for \p ID using \p ExpandedSCEVs to look up SCEV
2646/// expansion results.
2648 const SCEV2ValueTy &ExpandedSCEVs) {
2649 const SCEV *Step = ID.getStep();
2650 if (auto *C = dyn_cast<SCEVConstant>(Step))
2651 return C->getValue();
2652 if (auto *U = dyn_cast<SCEVUnknown>(Step))
2653 return U->getValue();
2654 auto I = ExpandedSCEVs.find(Step);
2655 assert(I != ExpandedSCEVs.end() && "SCEV must be expanded at this point");
2656 return I->second;
2657}
2658
2659/// Knowing that loop \p L executes a single vector iteration, add instructions
2660/// that will get simplified and thus should not have any cost to \p
2661/// InstsToIgnore.
2664 SmallPtrSetImpl<Instruction *> &InstsToIgnore) {
2665 auto *Cmp = L->getLatchCmpInst();
2666 if (Cmp)
2667 InstsToIgnore.insert(Cmp);
2668 for (const auto &KV : IL) {
2669 // Extract the key by hand so that it can be used in the lambda below. Note
2670 // that captured structured bindings are a C++20 extension.
2671 const PHINode *IV = KV.first;
2672
2673 // Get next iteration value of the induction variable.
2674 Instruction *IVInst =
2675 cast<Instruction>(IV->getIncomingValueForBlock(L->getLoopLatch()));
2676 if (all_of(IVInst->users(),
2677 [&](const User *U) { return U == IV || U == Cmp; }))
2678 InstsToIgnore.insert(IVInst);
2679 }
2680}
2681
2683 const SCEV2ValueTy &ExpandedSCEVs, Value *MainVectorTripCount) {
2684 assert(MainVectorTripCount && "Must have bypass information");
2685
2686 Instruction *OldInduction = Legal->getPrimaryInduction();
2687 IRBuilder<> BypassBuilder(getAdditionalBypassBlock(),
2688 getAdditionalBypassBlock()->getFirstInsertionPt());
2689 for (const auto &InductionEntry : Legal->getInductionVars()) {
2690 PHINode *OrigPhi = InductionEntry.first;
2691 const InductionDescriptor &II = InductionEntry.second;
2692 Value *Step = getExpandedStep(II, ExpandedSCEVs);
2693 // For the primary induction the additional bypass end value is known.
2694 // Otherwise it is computed.
2695 Value *EndValueFromAdditionalBypass = MainVectorTripCount;
2696 if (OrigPhi != OldInduction) {
2697 auto *BinOp = II.getInductionBinOp();
2698 // Fast-math-flags propagate from the original induction instruction.
2699 if (isa_and_nonnull<FPMathOperator>(BinOp))
2700 BypassBuilder.setFastMathFlags(BinOp->getFastMathFlags());
2701
2702 // Compute the end value for the additional bypass.
2703 EndValueFromAdditionalBypass =
2704 emitTransformedIndex(BypassBuilder, MainVectorTripCount,
2705 II.getStartValue(), Step, II.getKind(), BinOp);
2706 EndValueFromAdditionalBypass->setName("ind.end");
2707 }
2708
2709 // Store the bypass value here, as it needs to be added as an operand to its
2710 // scalar preheader phi node after the epilogue skeleton has been created.
2711 // TODO: Directly add as extra operand to the VPResumePHI recipe.
2712 assert(!Induction2AdditionalBypassValue.contains(OrigPhi) &&
2713 "entry for OrigPhi already exists");
2714 Induction2AdditionalBypassValue[OrigPhi] = EndValueFromAdditionalBypass;
2715 }
2716}
2717
2719 const SCEV2ValueTy &ExpandedSCEVs) {
2720 /*
2721 In this function we generate a new loop. The new loop will contain
2722 the vectorized instructions while the old loop will continue to run the
2723 scalar remainder.
2724
2725 [ ] <-- old preheader - loop iteration number check and SCEVs in Plan's
2726 / | preheader are expanded here. Eventually all required SCEV
2727 / | expansion should happen here.
2728 / v
2729 | [ ] <-- vector loop bypass (may consist of multiple blocks).
2730 | / |
2731 | / v
2732 || [ ] <-- vector pre header.
2733 |/ |
2734 | v
2735 | [ ] \
2736 | [ ]_| <-- vector loop (created during VPlan execution).
2737 | |
2738 | v
2739 \ -[ ] <--- middle-block (wrapped in VPIRBasicBlock with the branch to
2740 | | successors created during VPlan execution)
2741 \/ |
2742 /\ v
2743 | ->[ ] <--- new preheader (wrapped in VPIRBasicBlock).
2744 | |
2745 (opt) v <-- edge from middle to exit iff epilogue is not required.
2746 | [ ] \
2747 | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue, header
2748 | | wrapped in VPIRBasicBlock).
2749 \ |
2750 \ v
2751 >[ ] <-- exit block(s). (wrapped in VPIRBasicBlock)
2752 ...
2753 */
2754
2755 // Create an empty vector loop, and prepare basic blocks for the runtime
2756 // checks.
2758
2759 // Now, compare the new count to zero. If it is zero skip the vector loop and
2760 // jump to the scalar loop. This check also covers the case where the
2761 // backedge-taken count is uint##_max: adding one to it will overflow leading
2762 // to an incorrect trip count of zero. In this (rare) case we will also jump
2763 // to the scalar loop.
2765
2766 // Generate the code to check any assumptions that we've made for SCEV
2767 // expressions.
2769
2770 // Generate the code that checks at runtime whether arrays overlap. We put the
2771 // checks into a separate block to make the more common case of few elements
2772 // faster.
2774
2775 return LoopVectorPreHeader;
2776}
2777
2778// Fix up external users of the induction variable. At this point, we are
2779// in LCSSA form, with all external PHIs that use the IV having one input value,
2780// coming from the remainder loop. We need those PHIs to also have a correct
2781// value for the IV when arriving directly from the middle block.
2783 const InductionDescriptor &II,
2784 Value *VectorTripCount,
2785 BasicBlock *MiddleBlock,
2786 VPTransformState &State) {
2787 // There are two kinds of external IV usages - those that use the value
2788 // computed in the last iteration (the PHI) and those that use the penultimate
2789 // value (the value that feeds into the phi from the loop latch).
2790 // We allow both, but they obviously have different values.
2791
2792 DenseMap<Value *, Value *> MissingVals;
2793
2794 Value *EndValue = cast<PHINode>(OrigPhi->getIncomingValueForBlock(
2796 ->getIncomingValueForBlock(MiddleBlock);
2797
2798 // An external user of the last iteration's value should see the value that
2799 // the remainder loop uses to initialize its own IV.
2801 for (User *U : PostInc->users()) {
2802 Instruction *UI = cast<Instruction>(U);
2803 if (!OrigLoop->contains(UI)) {
2804 assert(isa<PHINode>(UI) && "Expected LCSSA form");
2805 MissingVals[UI] = EndValue;
2806 }
2807 }
2808
2809 // An external user of the penultimate value needs to see EndValue - Step.
2810 // The simplest way to get this is to recompute it from the constituent SCEVs,
2811 // that is Start + (Step * (CRD - 1)).
2812 for (User *U : OrigPhi->users()) {
2813 auto *UI = cast<Instruction>(U);
2814 if (!OrigLoop->contains(UI)) {
2815 assert(isa<PHINode>(UI) && "Expected LCSSA form");
2816 IRBuilder<> B(MiddleBlock->getTerminator());
2817
2818 // Fast-math-flags propagate from the original induction instruction.
2819 if (isa_and_nonnull<FPMathOperator>(II.getInductionBinOp()))
2820 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
2821
2822 VPValue *StepVPV = Plan.getSCEVExpansion(II.getStep());
2823 assert(StepVPV && "step must have been expanded during VPlan execution");
2824 Value *Step = StepVPV->isLiveIn() ? StepVPV->getLiveInIRValue()
2825 : State.get(StepVPV, VPLane(0));
2826 Value *Escape = nullptr;
2827 if (EndValue->getType()->isIntegerTy())
2828 Escape = B.CreateSub(EndValue, Step);
2829 else if (EndValue->getType()->isPointerTy())
2830 Escape = B.CreatePtrAdd(EndValue, B.CreateNeg(Step));
2831 else {
2832 assert(EndValue->getType()->isFloatingPointTy() &&
2833 "Unexpected induction type");
2834 Escape = B.CreateBinOp(II.getInductionBinOp()->getOpcode() ==
2835 Instruction::FAdd
2836 ? Instruction::FSub
2837 : Instruction::FAdd,
2838 EndValue, Step);
2839 }
2840 Escape->setName("ind.escape");
2841 MissingVals[UI] = Escape;
2842 }
2843 }
2844
2845 assert((MissingVals.empty() ||
2846 all_of(MissingVals,
2847 [MiddleBlock, this](const std::pair<Value *, Value *> &P) {
2848 return all_of(
2849 predecessors(cast<Instruction>(P.first)->getParent()),
2850 [MiddleBlock, this](BasicBlock *Pred) {
2851 return Pred == MiddleBlock ||
2852 Pred == OrigLoop->getLoopLatch();
2853 });
2854 })) &&
2855 "Expected escaping values from latch/middle.block only");
2856
2857 for (auto &I : MissingVals) {
2858 PHINode *PHI = cast<PHINode>(I.first);
2859 // One corner case we have to handle is two IVs "chasing" each other,
2860 // that is %IV2 = phi [...], [ %IV1, %latch ]
2861 // In this case, if IV1 has an external use, we need to avoid adding both
2862 // "last value of IV1" and "penultimate value of IV2". So, verify that we
2863 // don't already have an incoming value for the middle block.
2864 if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
2865 PHI->addIncoming(I.second, MiddleBlock);
2866 }
2867}
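
// Illustrative sketch (not part of the pass; examplePenultimateIVValue is a
// hypothetical helper for exposition only): an external user of the last IV
// value sees EndValue itself, while a user of the penultimate value sees
// EndValue - Step for an integer IV (pointer and FP IVs use the analogous
// operation, as handled above).
static long long examplePenultimateIVValue(long long EndValue, long long Step) {
  return EndValue - Step; // ind.escape
}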
2868
2869namespace {
2870
2871struct CSEDenseMapInfo {
2872 static bool canHandle(const Instruction *I) {
2873 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
2874 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
2875 }
2876
2877 static inline Instruction *getEmptyKey() {
2879 }
2880
2881 static inline Instruction *getTombstoneKey() {
2883 }
2884
2885 static unsigned getHashValue(const Instruction *I) {
2886 assert(canHandle(I) && "Unknown instruction!");
2887 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
2888 I->value_op_end()));
2889 }
2890
2891 static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
2892 if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
2893 LHS == getTombstoneKey() || RHS == getTombstoneKey())
2894 return LHS == RHS;
2895 return LHS->isIdenticalTo(RHS);
2896 }
2897};
2898
2899} // end anonymous namespace
2900
2901 /// Perform CSE of induction variable instructions.
2902static void cse(BasicBlock *BB) {
2903 // Perform simple CSE.
2905 for (Instruction &In : llvm::make_early_inc_range(*BB)) {
2906 if (!CSEDenseMapInfo::canHandle(&In))
2907 continue;
2908
2909 // Check if we can replace this instruction with any of the
2910 // visited instructions.
2911 if (Instruction *V = CSEMap.lookup(&In)) {
2912 In.replaceAllUsesWith(V);
2913 In.eraseFromParent();
2914 continue;
2915 }
2916
2917 CSEMap[&In] = &In;
2918 }
2919}
2920
2923 ElementCount VF) const {
2924 // We only need to calculate a cost if the VF is scalar; for actual vectors
2925 // we should already have a pre-calculated cost at each VF.
2926 if (!VF.isScalar())
2927 return CallWideningDecisions.at(std::make_pair(CI, VF)).Cost;
2928
2930 Type *RetTy = CI->getType();
2932 if (auto RedCost = getReductionPatternCost(CI, VF, RetTy, CostKind))
2933 return *RedCost;
2934
2936 for (auto &ArgOp : CI->args())
2937 Tys.push_back(ArgOp->getType());
2938
2939 InstructionCost ScalarCallCost =
2941
2942 // If this is an intrinsic we may have a lower cost for it.
2944 InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
2945 return std::min(ScalarCallCost, IntrinsicCost);
2946 }
2947 return ScalarCallCost;
2948}
2949
2951 if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
2952 return Elt;
2953 return VectorType::get(Elt, VF);
2954}
2955
2958 ElementCount VF) const {
2960 assert(ID && "Expected intrinsic call!");
2961 Type *RetTy = maybeVectorizeType(CI->getType(), VF);
2962 FastMathFlags FMF;
2963 if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
2964 FMF = FPMO->getFastMathFlags();
2965
2968 SmallVector<Type *> ParamTys;
2969 std::transform(FTy->param_begin(), FTy->param_end(),
2970 std::back_inserter(ParamTys),
2971 [&](Type *Ty) { return maybeVectorizeType(Ty, VF); });
2972
2973 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
2974 dyn_cast<IntrinsicInst>(CI));
2975 return TTI.getIntrinsicInstrCost(CostAttrs,
2977}
2978
2980 // Fix widened non-induction PHIs by setting up the PHI operands.
2982 fixNonInductionPHIs(State);
2983
2984 // Forget the original basic block.
2987
2988 // After vectorization, the exit blocks of the original loop will have
2989 // additional predecessors. Invalidate SCEVs for the exit phis in case SE
2990 // looked through single-entry phis.
2991 SmallVector<BasicBlock *> ExitBlocks;
2992 OrigLoop->getExitBlocks(ExitBlocks);
2993 for (BasicBlock *Exit : ExitBlocks)
2994 for (PHINode &PN : Exit->phis())
2996
2997 if (Cost->requiresScalarEpilogue(VF.isVector())) {
2998 // No edge from the middle block to the unique exit block has been inserted
2999 // and there is nothing to fix from vector loop; phis should have incoming
3000 // from scalar loop only.
3001 } else {
3002 // TODO: Check in VPlan to see if IV users need fixing instead of checking
3003 // the cost model.
3004
3005 // If we inserted an edge from the middle block to the unique exit block,
3006 // update uses outside the loop (phis) to account for the newly inserted
3007 // edge.
3008
3009 // Fix-up external users of the induction variables.
3010 for (const auto &Entry : Legal->getInductionVars())
3011 fixupIVUsers(Entry.first, Entry.second,
3013 }
3014
3015 // Don't apply optimizations below when no vector region remains, as they all
3016 // require a vector loop at the moment.
3017 if (!State.Plan->getVectorLoopRegion())
3018 return;
3019
3021 sinkScalarOperands(&*PI);
3022
3023 VPRegionBlock *VectorRegion = State.Plan->getVectorLoopRegion();
3024 VPBasicBlock *HeaderVPBB = VectorRegion->getEntryBasicBlock();
3025 BasicBlock *HeaderBB = State.CFG.VPBB2IRBB[HeaderVPBB];
3026
3027 // Remove redundant induction instructions.
3028 cse(HeaderBB);
3029
3030 // Set/update profile weights for the vector and remainder loops as original
3031 // loop iterations are now distributed among them. Note that original loop
3032 // becomes the scalar remainder loop after vectorization.
3033 //
3034 // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
3035 // end up getting a slightly roughened result, but that should be OK since
3036 // the profile is not inherently precise anyway. Note also that a possible
3037 // bypass of the vector code caused by legality checks is ignored, optimistically
3038 // assigning all the weight to the vector loop.
3039 //
3040 // For scalable vectorization we can't know at compile time how many
3041 // iterations of the loop are handled in one vector iteration, so instead
3042 // assume a pessimistic vscale of '1'.
3043 Loop *VectorLoop = LI->getLoopFor(HeaderBB);
3045 VF.getKnownMinValue() * UF);
3046}
3047
3049 // The basic block and loop containing the predicated instruction.
3050 auto *PredBB = PredInst->getParent();
3051 auto *VectorLoop = LI->getLoopFor(PredBB);
3052
3053 // Initialize a worklist with the operands of the predicated instruction.
3054 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
3055
3056 // Holds instructions that we need to analyze again. An instruction may be
3057 // reanalyzed if we don't yet know if we can sink it or not.
3058 SmallVector<Instruction *, 8> InstsToReanalyze;
3059
3060 // Returns true if a given use occurs in the predicated block. Phi nodes use
3061 // their operands in their corresponding predecessor blocks.
3062 auto IsBlockOfUsePredicated = [&](Use &U) -> bool {
3063 auto *I = cast<Instruction>(U.getUser());
3064 BasicBlock *BB = I->getParent();
3065 if (auto *Phi = dyn_cast<PHINode>(I))
3066 BB = Phi->getIncomingBlock(
3067 PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
3068 return BB == PredBB;
3069 };
3070
3071 // Iteratively sink the scalarized operands of the predicated instruction
3072 // into the block we created for it. When an instruction is sunk, its
3073 // operands are then added to the worklist. The algorithm ends when one full
3074 // pass over the worklist fails to sink a single instruction.
3075 bool Changed;
3076 do {
3077 // Add the instructions that need to be reanalyzed to the worklist, and
3078 // reset the changed indicator.
3079 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
3080 InstsToReanalyze.clear();
3081 Changed = false;
3082
3083 while (!Worklist.empty()) {
3084 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
3085
3086 // We can't sink an instruction if it is a phi node, is not in the loop,
3087 // may have side effects or may read from memory.
3088 // TODO: Could do more granular checking to allow sinking
3089 // a load past non-store instructions.
3090 if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) ||
3091 I->mayHaveSideEffects() || I->mayReadFromMemory())
3092 continue;
3093
3094 // If the instruction is already in PredBB, check if we can sink its
3095 // operands. In that case, VPlan's sinkScalarOperands() succeeded in
3096 // sinking the scalar instruction I, hence it appears in PredBB; but it
3097 // may have failed to sink I's operands (recursively), which we try
3098 // (again) here.
3099 if (I->getParent() == PredBB) {
3100 Worklist.insert(I->op_begin(), I->op_end());
3101 continue;
3102 }
3103
3104 // It's legal to sink the instruction if all its uses occur in the
3105 // predicated block. Otherwise, there's nothing to do yet, and we may
3106 // need to reanalyze the instruction.
3107 if (!llvm::all_of(I->uses(), IsBlockOfUsePredicated)) {
3108 InstsToReanalyze.push_back(I);
3109 continue;
3110 }
3111
3112 // Move the instruction to the beginning of the predicated block, and add
3113 // its operands to the worklist.
3114 I->moveBefore(&*PredBB->getFirstInsertionPt());
3115 Worklist.insert(I->op_begin(), I->op_end());
3116
3117 // The sinking may have enabled other instructions to be sunk, so we will
3118 // need to iterate.
3119 Changed = true;
3120 }
3121 } while (Changed);
3122}
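
// Illustrative sketch (not part of the pass; exampleFixedPointSink is a
// hypothetical helper for exposition only): the sinking above is a fixed-point
// worklist algorithm. In this array-based analogue, item I may be sunk once
// its sole user is already in the predicated block, and the scan repeats until
// a full pass makes no change.
static void exampleFixedPointSink(bool *InPredBB, const unsigned *SoleUser,
                                  unsigned NumItems) {
  // SoleUser[I] == I encodes "no user"; otherwise it is the index of I's user.
  bool Changed;
  do {
    Changed = false;
    for (unsigned I = 0; I != NumItems; ++I)
      if (!InPredBB[I] && SoleUser[I] != I && InPredBB[SoleUser[I]]) {
        InPredBB[I] = true; // sink I into the predicated block
        Changed = true;
      }
  } while (Changed);
}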
3123
3125 auto Iter = vp_depth_first_deep(Plan.getEntry());
3126 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
3127 for (VPRecipeBase &P : VPBB->phis()) {
3128 VPWidenPHIRecipe *VPPhi = dyn_cast<VPWidenPHIRecipe>(&P);
3129 if (!VPPhi)
3130 continue;
3131 PHINode *NewPhi = cast<PHINode>(State.get(VPPhi));
3132 // Make sure the builder has a valid insert point.
3133 Builder.SetInsertPoint(NewPhi);
3134 for (unsigned Idx = 0; Idx < VPPhi->getNumOperands(); ++Idx) {
3135 VPValue *Inc = VPPhi->getIncomingValue(Idx);
3136 VPBasicBlock *VPBB = VPPhi->getIncomingBlock(Idx);
3137 NewPhi->addIncoming(State.get(Inc), State.CFG.VPBB2IRBB[VPBB]);
3138 }
3139 }
3140 }
3141}
3142
3143void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
3144 // We should not collect Scalars more than once per VF. Right now, this
3145 // function is called from collectUniformsAndScalars(), which already does
3146 // this check. Collecting Scalars for VF=1 does not make any sense.
3147 assert(VF.isVector() && !Scalars.contains(VF) &&
3148 "This function should not be visited twice for the same VF");
3149
3150 // This avoids any chances of creating a REPLICATE recipe during planning
3151 // since that would result in generation of scalarized code during execution,
3152 // which is not supported for scalable vectors.
3153 if (VF.isScalable()) {
3154 Scalars[VF].insert(Uniforms[VF].begin(), Uniforms[VF].end());
3155 return;
3156 }
3157
3158 SmallSetVector<Instruction *, 8> Worklist;
3159
3160 // These sets are used to seed the analysis with pointers used by memory
3161 // accesses that will remain scalar.
3162 SmallSetVector<Instruction *, 8> ScalarPtrs;
3163 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
3164 auto *Latch = TheLoop->getLoopLatch();
3165
3166 // A helper that returns true if the use of Ptr by MemAccess will be scalar.
3167 // The pointer operands of loads and stores will be scalar as long as the
3168 // memory access is not a gather or scatter operation. The value operand of a
3169 // store will remain scalar if the store is scalarized.
3170 auto IsScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
3171 InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
3172 assert(WideningDecision != CM_Unknown &&
3173 "Widening decision should be ready at this moment");
3174 if (auto *Store = dyn_cast<StoreInst>(MemAccess))
3175 if (Ptr == Store->getValueOperand())
3176 return WideningDecision == CM_Scalarize;
3177 assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
3178 "Ptr is neither a value nor a pointer operand");
3179 return WideningDecision != CM_GatherScatter;
3180 };
3181
3182 // A helper that returns true if the given value is a getelementptr
3183 // instruction contained in the loop.
3184 auto IsLoopVaryingGEP = [&](Value *V) {
3185 return isa<GetElementPtrInst>(V) && !TheLoop->isLoopInvariant(V);
3186 };
3187
3188 // A helper that evaluates a memory access's use of a pointer. If the use will
3189 // be a scalar use and the pointer is only used by memory accesses, we place
3190 // the pointer in ScalarPtrs. Otherwise, the pointer is placed in
3191 // PossibleNonScalarPtrs.
3192 auto EvaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
3193 // We only care about bitcast and getelementptr instructions contained in
3194 // the loop.
3195 if (!IsLoopVaryingGEP(Ptr))
3196 return;
3197
3198 // If the pointer has already been identified as scalar (e.g., if it was
3199 // also identified as uniform), there's nothing to do.
3200 auto *I = cast<Instruction>(Ptr);
3201 if (Worklist.count(I))
3202 return;
3203
3204 // If the use of the pointer will be a scalar use, and all users of the
3205 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
3206 // place the pointer in PossibleNonScalarPtrs.
3207 if (IsScalarUse(MemAccess, Ptr) &&
3208 all_of(I->users(), IsaPred<LoadInst, StoreInst>))
3209 ScalarPtrs.insert(I);
3210 else
3211 PossibleNonScalarPtrs.insert(I);
3212 };
3213
3214 // We seed the scalars analysis with two classes of instructions: (1)
3215 // instructions marked uniform-after-vectorization and (2) bitcast,
3216 // getelementptr and (pointer) phi instructions used by memory accesses
3217 // requiring a scalar use.
3218 //
3219 // (1) Add to the worklist all instructions that have been identified as
3220 // uniform-after-vectorization.
3221 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
3222
3223 // (2) Add to the worklist all bitcast and getelementptr instructions used by
3224 // memory accesses requiring a scalar use. The pointer operands of loads and
3225 // stores will be scalar unless the operation is a gather or scatter.
3226 // The value operand of a store will remain scalar if the store is scalarized.
3227 for (auto *BB : TheLoop->blocks())
3228 for (auto &I : *BB) {
3229 if (auto *Load = dyn_cast<LoadInst>(&I)) {
3230 EvaluatePtrUse(Load, Load->getPointerOperand());
3231 } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
3232 EvaluatePtrUse(Store, Store->getPointerOperand());
3233 EvaluatePtrUse(Store, Store->getValueOperand());
3234 }
3235 }
3236 for (auto *I : ScalarPtrs)
3237 if (!PossibleNonScalarPtrs.count(I)) {
3238 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
3239 Worklist.insert(I);
3240 }
3241
3242 // Insert the forced scalars.
3243 // FIXME: Currently VPWidenPHIRecipe() often creates a dead vector
3244 // induction variable when the PHI user is scalarized.
3245 auto ForcedScalar = ForcedScalars.find(VF);
3246 if (ForcedScalar != ForcedScalars.end())
3247 for (auto *I : ForcedScalar->second) {
3248 LLVM_DEBUG(dbgs() << "LV: Found (forced) scalar instruction: " << *I << "\n");
3249 Worklist.insert(I);
3250 }
3251
3252 // Expand the worklist by looking through any bitcasts and getelementptr
3253 // instructions we've already identified as scalar. This is similar to the
3254 // expansion step in collectLoopUniforms(); however, here we're only
3255 // expanding to include additional bitcasts and getelementptr instructions.
3256 unsigned Idx = 0;
3257 while (Idx != Worklist.size()) {
3258 Instruction *Dst = Worklist[Idx++];
3259 if (!IsLoopVaryingGEP(Dst->getOperand(0)))
3260 continue;
3261 auto *Src = cast<Instruction>(Dst->getOperand(0));
3262 if (llvm::all_of(Src->users(), [&](User *U) -> bool {
3263 auto *J = cast<Instruction>(U);
3264 return !TheLoop->contains(J) || Worklist.count(J) ||
3265 ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
3266 IsScalarUse(J, Src));
3267 })) {
3268 Worklist.insert(Src);
3269 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
3270 }
3271 }
3272
3273 // An induction variable will remain scalar if all users of the induction
3274 // variable and induction variable update remain scalar.
3275 for (const auto &Induction : Legal->getInductionVars()) {
3276 auto *Ind = Induction.first;
3277 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
3278
3279 // If tail-folding is applied, the primary induction variable will be used
3280 // to feed a vector compare.
3281 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
3282 continue;
3283
3284 // Returns true if \p Indvar is a pointer induction that is used directly by
3285 // load/store instruction \p I.
3286 auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar,
3287 Instruction *I) {
3288 return Induction.second.getKind() ==
3289 InductionDescriptor::IK_PtrInduction &&
3290 (isa<LoadInst>(I) || isa<StoreInst>(I)) &&
3291 Indvar == getLoadStorePointerOperand(I) && IsScalarUse(I, Indvar);
3292 };
3293
3294 // Determine if all users of the induction variable are scalar after
3295 // vectorization.
3296 bool ScalarInd = all_of(Ind->users(), [&](User *U) -> bool {
3297 auto *I = cast<Instruction>(U);
3298 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
3299 IsDirectLoadStoreFromPtrIndvar(Ind, I);
3300 });
3301 if (!ScalarInd)
3302 continue;
3303
3304 // If the induction variable update is a fixed-order recurrence, neither the
3305 // induction variable nor its update should be marked scalar after
3306 // vectorization.
3307 auto *IndUpdatePhi = dyn_cast<PHINode>(IndUpdate);
3308 if (IndUpdatePhi && Legal->isFixedOrderRecurrence(IndUpdatePhi))
3309 continue;
3310
3311 // Determine if all users of the induction variable update instruction are
3312 // scalar after vectorization.
3313 bool ScalarIndUpdate = all_of(IndUpdate->users(), [&](User *U) -> bool {
3314 auto *I = cast<Instruction>(U);
3315 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
3316 IsDirectLoadStoreFromPtrIndvar(IndUpdate, I);
3317 });
3318 if (!ScalarIndUpdate)
3319 continue;
3320
3321 // The induction variable and its update instruction will remain scalar.
3322 Worklist.insert(Ind);
3323 Worklist.insert(IndUpdate);
3324 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
3325 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
3326 << "\n");
3327 }
3328
3329 Scalars[VF].insert(Worklist.begin(), Worklist.end());
3330}
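
// Editor's note (simplified sketch, not LLVM API): the induction handling
// above keeps Ind and IndUpdate scalar only if every user of either one is
// already known scalar, allowing each of the pair to use the other. This
// sketch omits the out-of-loop-user and pointer-induction load/store cases.

#include <algorithm>
#include <set>
#include <vector>

struct SketchInst {
  std::vector<SketchInst *> Users;
};

static bool inductionPairRemainsScalar(SketchInst *Ind, SketchInst *IndUpdate,
                                       const std::set<SketchInst *> &Scalar) {
  auto UsersScalar = [&](SketchInst *Def, SketchInst *Other) {
    return std::all_of(Def->Users.begin(), Def->Users.end(),
                       [&](SketchInst *U) {
                         return U == Other || Scalar.count(U) != 0;
                       });
  };
  // Both checks must pass before either instruction is marked scalar.
  return UsersScalar(Ind, IndUpdate) && UsersScalar(IndUpdate, Ind);
}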
3331
3332bool LoopVectorizationCostModel::isScalarWithPredication(
3333 Instruction *I, ElementCount VF) const {
3334 if (!isPredicatedInst(I))
3335 return false;
3336
3337 // Do we have a non-scalar lowering for this predicated
3338 // instruction? No - it is scalar with predication.
3339 switch(I->getOpcode()) {
3340 default:
3341 return true;
3342 case Instruction::Call:
3343 if (VF.isScalar())
3344 return true;
3345 return CallWideningDecisions.at(std::make_pair(cast<CallInst>(I), VF))
3346 .Kind == CM_Scalarize;
3347 case Instruction::Load:
3348 case Instruction::Store: {
3349 auto *Ptr = getLoadStorePointerOperand(I);
3350 auto *Ty = getLoadStoreType(I);
3351 Type *VTy = Ty;
3352 if (VF.isVector())
3353 VTy = VectorType::get(Ty, VF);
3354 const Align Alignment = getLoadStoreAlignment(I);
3355 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
3356 TTI.isLegalMaskedGather(VTy, Alignment))
3357 : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
3358 TTI.isLegalMaskedScatter(VTy, Alignment));
3359 }
3360 case Instruction::UDiv:
3361 case Instruction::SDiv:
3362 case Instruction::SRem:
3363 case Instruction::URem: {
3364 // We have the option to use the safe-divisor idiom to avoid predication.
3365 // The cost based decision here will always select safe-divisor for
3366 // scalable vectors as scalarization isn't legal.
3367 const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
3368 return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost);
3369 }
3370 }
3371}
3372
3373// TODO: Fold into LoopVectorizationLegality::isMaskRequired.
3374bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const {
3375 // If predication is not needed, avoid it.
3376 // TODO: We can use the loop-preheader as context point here and get
3377 // context sensitive reasoning for isSafeToSpeculativelyExecute.
3378 if (!blockNeedsPredicationForAnyReason(I->getParent()) ||
3379 isSafeToSpeculativelyExecute(I) ||
3380 (isa<LoadInst, StoreInst, CallInst>(I) && !Legal->isMaskRequired(I)) ||
3381 isa<BranchInst, SwitchInst, PHINode, AllocaInst>(I))
3382 return false;
3383
3384 // If the instruction was executed conditionally in the original scalar loop,
3385 // predication is needed with a mask whose lanes are all possibly inactive.
3386 if (Legal->blockNeedsPredication(I->getParent()))
3387 return true;
3388
3389 // All that remain are instructions with side-effects originally executed in
3390 // the loop unconditionally, but now execute under a tail-fold mask (only)
3391 // having at least one active lane (the first). If the side-effects of the
3392 // instruction are invariant, executing it without the (tail-folding) mask
3393 // is safe - it will cause the same side-effects as when masked.
3394 switch(I->getOpcode()) {
3395 default:
3396 llvm_unreachable(
3397 "instruction should have been considered by earlier checks");
3398 case Instruction::Call:
3399 // Side-effects of a Call are assumed to be non-invariant, needing a
3400 // (fold-tail) mask.
3402 "should have returned earlier for calls not needing a mask");
3403 return true;
3404 case Instruction::Load:
3405 // If the address is loop invariant no predication is needed.
3406 return !Legal->isInvariant(getLoadStorePointerOperand(I));
3407 case Instruction::Store: {
3408 // For stores, we need to prove both speculation safety (which follows
3409 // from the same argument as for loads) and that the value being stored is
3410 // correct. The easiest form of the latter is to require that all values
3411 // stored are the same.
3412 return !(Legal->isInvariant(getLoadStorePointerOperand(I)) &&
3413 TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand()));
3414 }
3415 case Instruction::UDiv:
3416 case Instruction::SDiv:
3417 case Instruction::SRem:
3418 case Instruction::URem:
3419 // If the divisor is loop-invariant no predication is needed.
3420 return !TheLoop->isLoopInvariant(I->getOperand(1));
3421 }
3422}
3423
3424std::pair<InstructionCost, InstructionCost>
3426 ElementCount VF) const {
3427 assert(I->getOpcode() == Instruction::UDiv ||
3428 I->getOpcode() == Instruction::SDiv ||
3429 I->getOpcode() == Instruction::SRem ||
3430 I->getOpcode() == Instruction::URem);
3432
3434
3435 // Scalarization isn't legal for scalable vector types
3436 InstructionCost ScalarizationCost = InstructionCost::getInvalid();
3437 if (!VF.isScalable()) {
3438 // Get the scalarization cost and scale this amount by the probability of
3439 // executing the predicated block. If the instruction is not predicated,
3440 // we fall through to the next case.
3441 ScalarizationCost = 0;
3442
3443 // These instructions have a non-void type, so account for the phi nodes
3444 // that we will create. This cost is likely to be zero. The phi node
3445 // cost, if any, should be scaled by the block probability because it
3446 // models a copy at the end of each predicated block.
3447 ScalarizationCost += VF.getKnownMinValue() *
3448 TTI.getCFInstrCost(Instruction::PHI, CostKind);
3449
3450 // The cost of the non-predicated instruction.
3451 ScalarizationCost += VF.getKnownMinValue() *
3452 TTI.getArithmeticInstrCost(I->getOpcode(), I->getType(), CostKind);
3453
3454 // The cost of insertelement and extractelement instructions needed for
3455 // scalarization.
3456 ScalarizationCost += getScalarizationOverhead(I, VF, CostKind);
3457
3458 // Scale the cost by the probability of executing the predicated blocks.
3459 // This assumes the predicated block for each vector lane is equally
3460 // likely.
3461 ScalarizationCost = ScalarizationCost / getReciprocalPredBlockProb();
3462 }
3463 InstructionCost SafeDivisorCost = 0;
3464
3465 auto *VecTy = toVectorTy(I->getType(), VF);
3466
3467 // The cost of the select guard to ensure all lanes are well defined
3468 // after we speculate above any internal control flow.
3469 SafeDivisorCost +=
3470 TTI.getCmpSelInstrCost(Instruction::Select, VecTy,
3471 toVectorTy(Type::getInt1Ty(I->getContext()), VF),
3473
3474 // Certain instructions can be cheaper to vectorize if they have a constant
3475 // second vector operand. One example of this is shifts on x86.
3476 Value *Op2 = I->getOperand(1);
3477 auto Op2Info = TTI.getOperandInfo(Op2);
3478 if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
3479 Legal->isInvariant(Op2))
3481
3482 SmallVector<const Value *, 4> Operands(I->operand_values());
3483 SafeDivisorCost += TTI.getArithmeticInstrCost(
3484 I->getOpcode(), VecTy, CostKind,
3485 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
3486 Op2Info, Operands, I);
3487 return {ScalarizationCost, SafeDivisorCost};
3488}
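
// Editor's note: an illustrative comparison with made-up costs (not taken
// from any target's cost tables), assuming VF = 4 and a reciprocal
// predicated-block probability of 2, mirroring the two quantities computed
// above.

static bool preferSafeDivisorExample() {
  const unsigned VF = 4, PhiCost = 0, ScalarDivCost = 20, InsExtCost = 8;
  const unsigned PredBlockProbInv = 2; // Predicated block runs ~1/2 the time.
  // Per-lane phi + divide plus insert/extract overhead, scaled by the block
  // probability: (4 * (0 + 20) + 8) / 2 = 44.
  unsigned ScalarizationCost =
      (VF * (PhiCost + ScalarDivCost) + InsExtCost) / PredBlockProbInv;
  // One vector select guarding the divisor plus one vector divide: 42.
  const unsigned VecSelectCost = 2, VecDivCost = 40;
  unsigned SafeDivisorCost = VecSelectCost + VecDivCost;
  return SafeDivisorCost < ScalarizationCost; // Here the safe divisor wins.
}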
3489
3490bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
3491 Instruction *I, ElementCount VF) const {
3492 assert(isAccessInterleaved(I) && "Expecting interleaved access.");
3494 "Decision should not be set yet.");
3495 auto *Group = getInterleavedAccessGroup(I);
3496 assert(Group && "Must have a group.");
3497 unsigned InterleaveFactor = Group->getFactor();
3498
3499 // If the instruction's allocated size doesn't equal its type size, it
3500 // requires padding and will be scalarized.
3501 auto &DL = I->getDataLayout();
3502 auto *ScalarTy = getLoadStoreType(I);
3503 if (hasIrregularType(ScalarTy, DL))
3504 return false;
3505
3506 // We currently only know how to emit interleave/deinterleave with
3507 // Factor=2 for scalable vectors. This is purely an implementation
3508 // limit.
3509 if (VF.isScalable() && InterleaveFactor != 2)
3510 return false;
3511
3512 // If the group involves a non-integral pointer, we may not be able to
3513 // losslessly cast all values to a common type.
3514 bool ScalarNI = DL.isNonIntegralPointerType(ScalarTy);
3515 for (unsigned Idx = 0; Idx < InterleaveFactor; Idx++) {
3516 Instruction *Member = Group->getMember(Idx);
3517 if (!Member)
3518 continue;
3519 auto *MemberTy = getLoadStoreType(Member);
3520 bool MemberNI = DL.isNonIntegralPointerType(MemberTy);
3521 // Don't coerce non-integral pointers to integers or vice versa.
3522 if (MemberNI != ScalarNI)
3523 // TODO: Consider adding special nullptr value case here
3524 return false;
3525 if (MemberNI && ScalarNI &&
3526 ScalarTy->getPointerAddressSpace() !=
3527 MemberTy->getPointerAddressSpace())
3528 return false;
3529 }
3530
3531 // Check if masking is required.
3532 // A Group may need masking for one of two reasons: it resides in a block that
3533 // needs predication, or it was decided to use masking to deal with gaps
3534 // (either a gap at the end of a load-access that may result in a speculative
3535 // load, or any gaps in a store-access).
3536 bool PredicatedAccessRequiresMasking =
3537 blockNeedsPredicationForAnyReason(I->getParent()) &&
3539 bool LoadAccessWithGapsRequiresEpilogMasking =
3540 isa<LoadInst>(I) && Group->requiresScalarEpilogue() &&
3542 bool StoreAccessWithGapsRequiresMasking =
3543 isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor());
3544 if (!PredicatedAccessRequiresMasking &&
3545 !LoadAccessWithGapsRequiresEpilogMasking &&
3546 !StoreAccessWithGapsRequiresMasking)
3547 return true;
3548
3549 // If masked interleaving is required, we expect that the user/target had
3550 // enabled it, because otherwise it either wouldn't have been created or
3551 // it should have been invalidated by the CostModel.
3553 "Masked interleave-groups for predicated accesses are not enabled.");
3554
3555 if (Group->isReverse())
3556 return false;
3557
3558 auto *Ty = getLoadStoreType(I);
3559 const Align Alignment = getLoadStoreAlignment(I);
3560 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
3561 : TTI.isLegalMaskedStore(Ty, Alignment);
3562}
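
// Editor's note (illustrative types, not LLVM API): one of the masking
// triggers above is a store group with gaps, i.e. fewer members than its
// interleave factor. A minimal sketch of that particular check:

struct SketchInterleaveGroup {
  unsigned Factor;     // Stride between interleaved members.
  unsigned NumMembers; // Members actually present in the group.
};

static bool storeGroupNeedsGapMask(const SketchInterleaveGroup &G) {
  // E.g. stores to A[3*i] and A[3*i+2] but not A[3*i+1]: Factor = 3,
  // NumMembers = 2, so the group has a gap and must be masked (or rejected).
  return G.NumMembers < G.Factor;
}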
3563
3564bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
3565 Instruction *I, ElementCount VF) {
3566 // Get and ensure we have a valid memory instruction.
3567 assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");
3568
3570 auto *ScalarTy = getLoadStoreType(I);
3571
3572 // In order to be widened, the pointer should be consecutive, first of all.
3573 if (!Legal->isConsecutivePtr(ScalarTy, Ptr))
3574 return false;
3575
3576 // If the instruction is a store located in a predicated block, it will be
3577 // scalarized.
3578 if (isScalarWithPredication(I, VF))
3579 return false;
3580
3581 // If the instruction's allocated size doesn't equal its type size, it
3582 // requires padding and will be scalarized.
3583 auto &DL = I->getDataLayout();
3584 if (hasIrregularType(ScalarTy, DL))
3585 return false;
3586
3587 return true;
3588}
3589
3590void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
3591 // We should not collect Uniforms more than once per VF. Right now,
3592 // this function is called from collectUniformsAndScalars(), which
3593 // already does this check. Collecting Uniforms for VF=1 does not make any
3594 // sense.
3595
3596 assert(VF.isVector() && !Uniforms.contains(VF) &&
3597 "This function should not be visited twice for the same VF");
3598
3599 // Initialize Uniforms[VF] so that, even if we find no uniform value, we
3600 // won't analyze this VF again; Uniforms.count(VF) will return 1.
3601 Uniforms[VF].clear();
3602
3603 // Now we know that the loop is vectorizable!
3604 // Collect instructions inside the loop that will remain uniform after
3605 // vectorization.
3606
3607 // Global values, params and instructions outside of the current loop are
3608 // out of scope.
3609 auto IsOutOfScope = [&](Value *V) -> bool {
3610 Instruction *I = dyn_cast<Instruction>(V);
3611 return (!I || !TheLoop->contains(I));
3612 };
3613
3614 // Worklist containing uniform instructions demanding lane 0.
3615 SetVector<Instruction *> Worklist;
3616
3617 // Add uniform instructions demanding lane 0 to the worklist. Instructions
3618 // that require predication must not be considered uniform after
3619 // vectorization, because that would create an erroneous replicating region
3620 // where only a single instance out of VF should be formed.
3621 auto AddToWorklistIfAllowed = [&](Instruction *I) -> void {
3622 if (IsOutOfScope(I)) {
3623 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
3624 << *I << "\n");
3625 return;
3626 }
3627 if (isPredicatedInst(I)) {
3628 LLVM_DEBUG(
3629 dbgs() << "LV: Found not uniform due to requiring predication: " << *I
3630 << "\n");
3631 return;
3632 }
3633 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
3634 Worklist.insert(I);
3635 };
3636
3637 // Start with the conditional branches exiting the loop. If the branch
3638 // condition is an instruction contained in the loop that is only used by the
3639 // branch, it is uniform. Note conditions from uncountable early exits are not
3640 // uniform.
3641 SmallVector<BasicBlock *> Exiting;
3642 TheLoop->getExitingBlocks(Exiting);
3643 for (BasicBlock *E : Exiting) {
3645 continue;
3646 auto *Cmp = dyn_cast<Instruction>(E->getTerminator()->getOperand(0));
3647 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
3648 AddToWorklistIfAllowed(Cmp);
3649 }
3650
3651 auto PrevVF = VF.divideCoefficientBy(2);
3652 // Return true if all lanes perform the same memory operation, and we can
3653 // thus choose to execute only one.
3654 auto IsUniformMemOpUse = [&](Instruction *I) {
3655 // If the value was already known to not be uniform for the previous
3656 // (smaller) VF, it cannot be uniform for the larger VF.
3657 if (PrevVF.isVector()) {
3658 auto Iter = Uniforms.find(PrevVF);
3659 if (Iter != Uniforms.end() && !Iter->second.contains(I))
3660 return false;
3661 }
3662 if (!Legal->isUniformMemOp(*I, VF))
3663 return false;
3664 if (isa<LoadInst>(I))
3665 // Loading the same address always produces the same result - at least
3666 // assuming aliasing and ordering which have already been checked.
3667 return true;
3668 // Storing the same value on every iteration.
3669 return TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand());
3670 };
3671
3672 auto IsUniformDecision = [&](Instruction *I, ElementCount VF) {
3673 InstWidening WideningDecision = getWideningDecision(I, VF);
3674 assert(WideningDecision != CM_Unknown &&
3675 "Widening decision should be ready at this moment");
3676
3677 if (IsUniformMemOpUse(I))
3678 return true;
3679
3680 return (WideningDecision == CM_Widen ||
3681 WideningDecision == CM_Widen_Reverse ||
3682 WideningDecision == CM_Interleave);
3683 };
3684
3685 // Returns true if Ptr is the pointer operand of a memory access instruction
3686 // I, I is known to not require scalarization, and the pointer is not also
3687 // stored.
3688 auto IsVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
3689 if (isa<StoreInst>(I) && I->getOperand(0) == Ptr)
3690 return false;
3691 return getLoadStorePointerOperand(I) == Ptr &&
3692 (IsUniformDecision(I, VF) || Legal->isInvariant(Ptr));
3693 };
3694
3695 // Holds a list of values which are known to have at least one uniform use.
3696 // Note that there may be other uses which aren't uniform. A "uniform use"
3697 // here is something which only demands lane 0 of the unrolled iterations;
3698 // it does not imply that all lanes produce the same value (e.g. this is not
3699 // the usual meaning of uniform)
3700 SetVector<Value *> HasUniformUse;
3701
3702 // Scan the loop for instructions which are either a) known to have only
3703 // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
3704 for (auto *BB : TheLoop->blocks())
3705 for (auto &I : *BB) {
3706 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) {
3707 switch (II->getIntrinsicID()) {
3708 case Intrinsic::sideeffect:
3709 case Intrinsic::experimental_noalias_scope_decl:
3710 case Intrinsic::assume:
3711 case Intrinsic::lifetime_start:
3712 case Intrinsic::lifetime_end:
3714 AddToWorklistIfAllowed(&I);
3715 break;
3716 default:
3717 break;
3718 }
3719 }
3720
3721 // ExtractValue instructions must be uniform, because the operands are
3722 // known to be loop-invariant.
3723 if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) {
3724 assert(IsOutOfScope(EVI->getAggregateOperand()) &&
3725 "Expected aggregate value to be loop invariant");
3726 AddToWorklistIfAllowed(EVI);
3727 continue;
3728 }
3729
3730 // If there's no pointer operand, there's nothing to do.
3731 auto *Ptr = getLoadStorePointerOperand(&I);
3732 if (!Ptr)
3733 continue;
3734
3735 if (IsUniformMemOpUse(&I))
3736 AddToWorklistIfAllowed(&I);
3737
3738 if (IsVectorizedMemAccessUse(&I, Ptr))
3739 HasUniformUse.insert(Ptr);
3740 }
3741
3742 // Add to the worklist any operands which have *only* uniform (e.g. lane 0
3743 // demanding) users. Since loops are assumed to be in LCSSA form, this
3744 // disallows uses outside the loop as well.
3745 for (auto *V : HasUniformUse) {
3746 if (IsOutOfScope(V))
3747 continue;
3748 auto *I = cast<Instruction>(V);
3749 bool UsersAreMemAccesses = all_of(I->users(), [&](User *U) -> bool {
3750 auto *UI = cast<Instruction>(U);
3751 return TheLoop->contains(UI) && IsVectorizedMemAccessUse(UI, V);
3752 });
3753 if (UsersAreMemAccesses)
3754 AddToWorklistIfAllowed(I);
3755 }
3756
3757 // Expand Worklist in topological order: whenever a new instruction
3758 // is added, its users should already be inside Worklist. This ensures that
3759 // a uniform instruction will only be used by uniform instructions.
3760 unsigned Idx = 0;
3761 while (Idx != Worklist.size()) {
3762 Instruction *I = Worklist[Idx++];
3763
3764 for (auto *OV : I->operand_values()) {
3765 // Out-of-scope operands cannot be uniform instructions.
3766 if (IsOutOfScope(OV))
3767 continue;
3768 // First-order recurrence phis should typically be considered
3769 // non-uniform.
3770 auto *OP = dyn_cast<PHINode>(OV);
3771 if (OP && Legal->isFixedOrderRecurrence(OP))
3772 continue;
3773 // If all the users of the operand are uniform, then add the
3774 // operand into the uniform worklist.
3775 auto *OI = cast<Instruction>(OV);
3776 if (llvm::all_of(OI->users(), [&](User *U) -> bool {
3777 auto *J = cast<Instruction>(U);
3778 return Worklist.count(J) || IsVectorizedMemAccessUse(J, OI);
3779 }))
3780 AddToWorklistIfAllowed(OI);
3781 }
3782 }
3783
3784 // For an instruction to be added into Worklist above, all its users inside
3785 // the loop should also be in Worklist. However, this condition cannot be
3786 // true for phi nodes that form a cyclic dependence. We must process phi
3787 // nodes separately. An induction variable will remain uniform if all users
3788 // of the induction variable and induction variable update remain uniform.
3789 // The code below handles both pointer and non-pointer induction variables.
3790 BasicBlock *Latch = TheLoop->getLoopLatch();
3791 for (const auto &Induction : Legal->getInductionVars()) {
3792 auto *Ind = Induction.first;
3793 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
3794
3795 // Determine if all users of the induction variable are uniform after
3796 // vectorization.
3797 bool UniformInd = all_of(Ind->users(), [&](User *U) -> bool {
3798 auto *I = cast<Instruction>(U);
3799 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
3800 IsVectorizedMemAccessUse(I, Ind);
3801 });
3802 if (!UniformInd)
3803 continue;
3804
3805 // Determine if all users of the induction variable update instruction are
3806 // uniform after vectorization.
3807 bool UniformIndUpdate = all_of(IndUpdate->users(), [&](User *U) -> bool {
3808 auto *I = cast<Instruction>(U);
3809 return I == Ind || Worklist.count(I) ||
3810 IsVectorizedMemAccessUse(I, IndUpdate);
3811 });
3812 if (!UniformIndUpdate)
3813 continue;
3814
3815 // The induction variable and its update instruction will remain uniform.
3816 AddToWorklistIfAllowed(Ind);
3817 AddToWorklistIfAllowed(IndUpdate);
3818 }
3819
3820 Uniforms[VF].insert(Worklist.begin(), Worklist.end());
3821}
3822
3823bool LoopVectorizationCostModel::runtimeChecksRequired() {
3824 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
3825
3827 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
3828 "runtime pointer checks needed. Enable vectorization of this "
3829 "loop with '#pragma clang loop vectorize(enable)' when "
3830 "compiling with -Os/-Oz",
3831 "CantVersionLoopWithOptForSize", ORE, TheLoop);
3832 return true;
3833 }
3834
3835 if (!PSE.getPredicate().isAlwaysTrue()) {
3836 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
3837 "runtime SCEV checks needed. Enable vectorization of this "
3838 "loop with '#pragma clang loop vectorize(enable)' when "
3839 "compiling with -Os/-Oz",
3840 "CantVersionLoopWithOptForSize", ORE, TheLoop);
3841 return true;
3842 }
3843
3844 // FIXME: Avoid specializing for stride==1 instead of bailing out.
3845 if (!Legal->getLAI()->getSymbolicStrides().empty()) {
3846 reportVectorizationFailure("Runtime stride check for small trip count",
3847 "runtime stride == 1 checks needed. Enable vectorization of "
3848 "this loop without such check by compiling with -Os/-Oz",
3849 "CantVersionLoopWithOptForSize", ORE, TheLoop);
3850 return true;
3851 }
3852
3853 return false;
3854}
3855
3856bool LoopVectorizationCostModel::isScalableVectorizationAllowed() {
3857 if (IsScalableVectorizationAllowed)
3858 return *IsScalableVectorizationAllowed;
3859
3860 IsScalableVectorizationAllowed = false;
3862 return false;
3863
3865 reportVectorizationInfo("Scalable vectorization is explicitly disabled",
3866 "ScalableVectorizationDisabled", ORE, TheLoop);
3867 return false;
3868 }
3869
3870 LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n");
3871
3872 auto MaxScalableVF = ElementCount::getScalable(
3873 std::numeric_limits<ElementCount::ScalarTy>::max());
3874
3875 // Test that the loop-vectorizer can legalize all operations for this MaxVF.
3876 // FIXME: While for scalable vectors this is currently sufficient, this should
3877 // be replaced by a more detailed mechanism that filters out specific VFs,
3878 // instead of invalidating vectorization for a whole set of VFs based on the
3879 // MaxVF.
3880
3881 // Disable scalable vectorization if the loop contains unsupported reductions.
3882 if (!canVectorizeReductions(MaxScalableVF)) {
3884 "Scalable vectorization not supported for the reduction "
3885 "operations found in this loop.",
3886 "ScalableVFUnfeasible", ORE, TheLoop);
3887 return false;
3888 }
3889
3890 // Disable scalable vectorization if the loop contains any instructions
3891 // with element types not supported for scalable vectors.
3892 if (any_of(ElementTypesInLoop, [&](Type *Ty) {
3893 return !Ty->isVoidTy() &&
3895 })) {
3896 reportVectorizationInfo("Scalable vectorization is not supported "
3897 "for all element types found in this loop.",
3898 "ScalableVFUnfeasible", ORE, TheLoop);
3899 return false;
3900 }
3901
3903 reportVectorizationInfo("The target does not provide maximum vscale value "
3904 "for safe distance analysis.",
3905 "ScalableVFUnfeasible", ORE, TheLoop);
3906 return false;
3907 }
3908
3909 IsScalableVectorizationAllowed = true;
3910 return true;
3911}
3912
3913ElementCount
3914LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
3915 if (!isScalableVectorizationAllowed())
3916 return ElementCount::getScalable(0);
3917
3918 auto MaxScalableVF = ElementCount::getScalable(
3919 std::numeric_limits<ElementCount::ScalarTy>::max());
3921 return MaxScalableVF;
3922
3923 std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI);
3924 // Limit MaxScalableVF by the maximum safe dependence distance.
3925 MaxScalableVF = ElementCount::getScalable(MaxSafeElements / *MaxVScale);
3926
3927 if (!MaxScalableVF)
3929 "Max legal vector width too small, scalable vectorization "
3930 "unfeasible.",
3931 "ScalableVFUnfeasible", ORE, TheLoop);
3932
3933 return MaxScalableVF;
3934}
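
// Editor's note: illustrative arithmetic only, mirroring the clamping above.
// If the maximum safe number of elements is 32 and the target guarantees
// vscale <= 16, the largest legal scalable VF is vscale x (32 / 16), i.e.
// vscale x 2; a result of 0 corresponds to the "unfeasible" remark.

static unsigned maxLegalScalableMinLanes(unsigned MaxSafeElements,
                                         unsigned MaxVScale) {
  return MaxSafeElements / MaxVScale; // Known-min lane count of the clamped VF.
}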
3935
3936FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
3937 unsigned MaxTripCount, ElementCount UserVF, bool FoldTailByMasking) {
3939 unsigned SmallestType, WidestType;
3940 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
3941
3942 // Get the maximum safe dependence distance in bits computed by LAA.
3943 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
3944 // the memory accesses that is most restrictive (involved in the smallest
3945 // dependence distance).
3946 unsigned MaxSafeElements =
3948
3949 auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements);
3950 auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements);
3952 this->MaxSafeElements = MaxSafeElements;
3953
3954 LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
3955 << ".\n");
3956 LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF
3957 << ".\n");
3958
3959 // First analyze the UserVF, fall back if the UserVF should be ignored.
3960 if (UserVF) {
3961 auto MaxSafeUserVF =
3962 UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;
3963
3964 if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) {
3965 // If `VF=vscale x N` is safe, then so is `VF=N`
3966 if (UserVF.isScalable())
3967 return FixedScalableVFPair(
3968 ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF);
3969
3970 return UserVF;
3971 }
3972
3973 assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));
3974
3975 // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it
3976 // is better to ignore the hint and let the compiler choose a suitable VF.
3977 if (!UserVF.isScalable()) {
3978 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
3979 << " is unsafe, clamping to max safe VF="
3980 << MaxSafeFixedVF << ".\n");
3981 ORE->emit([&]() {
3982 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
3984 TheLoop->getHeader())
3985 << "User-specified vectorization factor "
3986 << ore::NV("UserVectorizationFactor", UserVF)
3987 << " is unsafe, clamping to maximum safe vectorization factor "
3988 << ore::NV("VectorizationFactor", MaxSafeFixedVF);
3989 });
3990 return MaxSafeFixedVF;
3991 }
3992
3994 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
3995 << " is ignored because scalable vectors are not "
3996 "available.\n");
3997 ORE->emit([&]() {
3998 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
4000 TheLoop->getHeader())
4001 << "User-specified vectorization factor "
4002 << ore::NV("UserVectorizationFactor", UserVF)
4003 << " is ignored because the target does not support scalable "
4004 "vectors. The compiler will pick a more suitable value.";
4005 });
4006 } else {
4007 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
4008 << " is unsafe. Ignoring scalable UserVF.\n");
4009 ORE->emit([&]() {
4010 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
4012 TheLoop->getHeader())
4013 << "User-specified vectorization factor "
4014 << ore::NV("UserVectorizationFactor", UserVF)
4015 << " is unsafe. Ignoring the hint to let the compiler pick a "
4016 "more suitable value.";
4017 });
4018 }
4019 }
4020
4021 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
4022 << " / " << WidestType << " bits.\n");
4023
4026 if (auto MaxVF =
4027 getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
4028 MaxSafeFixedVF, FoldTailByMasking))
4029 Result.FixedVF = MaxVF;
4030
4031 if (auto MaxVF =
4032 getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
4033 MaxSafeScalableVF, FoldTailByMasking))
4034 if (MaxVF.isScalable()) {
4035 Result.ScalableVF = MaxVF;
4036 LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
4037 << "\n");
4038 }
4039
4040 return Result;
4041}
4042
4046 // TODO: It may be useful to do since it's still likely to be dynamically
4047 // uniform if the target can skip.
4049 "Not inserting runtime ptr check for divergent target",
4050 "runtime pointer checks needed. Not enabled for divergent target",
4051 "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
4053 }
4054
4055 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
4056 unsigned MaxTC = PSE.getSmallConstantMaxTripCount();
4057 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
4058 if (TC != MaxTC)
4059 LLVM_DEBUG(dbgs() << "LV: Found maximum trip count: " << MaxTC << '\n');
4060 if (TC == 1) {
4061 reportVectorizationFailure("Single iteration (non) loop",
4062 "loop trip count is one, irrelevant for vectorization",
4063 "SingleIterationLoop", ORE, TheLoop);
4065 }
4066
4067 switch (ScalarEpilogueStatus) {
4069 return computeFeasibleMaxVF(MaxTC, UserVF, false);
4071 [[fallthrough]];
4073 LLVM_DEBUG(
4074 dbgs() << "LV: vector predicate hint/switch found.\n"
4075 << "LV: Not allowing scalar epilogue, creating predicated "
4076 << "vector loop.\n");
4077 break;
4079 // fallthrough as a special case of OptForSize
4081 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
4082 LLVM_DEBUG(
4083 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
4084 else
4085 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
4086 << "count.\n");
4087
4088 // Bail if runtime checks are required, which are not good when optimising
4089 // for size.
4092
4093 break;
4094 }
4095
4096 // The only loops we can vectorize without a scalar epilogue are loops with
4097 // a bottom-test and a single exiting block. We'd have to handle the fact
4098 // that not every instruction executes on the last iteration. This will
4099 // require a lane mask which varies through the vector loop body. (TODO)
4101 // If there was a tail-folding hint/switch, but we can't fold the tail by
4102 // masking, fallback to a vectorization with a scalar epilogue.
4103 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
4104 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
4105 "scalar epilogue instead.\n");
4106 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
4107 return computeFeasibleMaxVF(MaxTC, UserVF, false);
4108 }
4110 }
4111
4112 // Now try the tail folding
4113
4114 // Invalidate interleave groups that require an epilogue if we can't mask
4115 // the interleave-group.
4117 assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
4118 "No decisions should have been taken at this point");
4119 // Note: There is no need to invalidate any cost modeling decisions here, as
4120 // none were taken so far.
4122 }
4123
4124 FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(MaxTC, UserVF, true);
4125
4126 // Avoid tail folding if the trip count is known to be a multiple of any VF
4127 // we choose.
4128 std::optional<unsigned> MaxPowerOf2RuntimeVF =
4129 MaxFactors.FixedVF.getFixedValue();
4130 if (MaxFactors.ScalableVF) {
4131 std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI);
4132 if (MaxVScale && TTI.isVScaleKnownToBeAPowerOfTwo()) {
4133 MaxPowerOf2RuntimeVF = std::max<unsigned>(
4134 *MaxPowerOf2RuntimeVF,
4135 *MaxVScale * MaxFactors.ScalableVF.getKnownMinValue());
4136 } else
4137 MaxPowerOf2RuntimeVF = std::nullopt; // Stick with tail-folding for now.
4138 }
4139
4140 if (MaxPowerOf2RuntimeVF && *MaxPowerOf2RuntimeVF > 0) {
4141 assert((UserVF.isNonZero() || isPowerOf2_32(*MaxPowerOf2RuntimeVF)) &&
4142 "MaxFixedVF must be a power of 2");
4143 unsigned MaxVFtimesIC =
4144 UserIC ? *MaxPowerOf2RuntimeVF * UserIC : *MaxPowerOf2RuntimeVF;
4145 ScalarEvolution *SE = PSE.getSE();
4146 // Currently only loops with countable exits are vectorized, but calling
4147 // getSymbolicMaxBackedgeTakenCount allows enablement work for loops with
4148 // uncountable exits whilst also ensuring the symbolic maximum and known
4149 // back-edge taken count remain identical for loops with countable exits.
4150 const SCEV *BackedgeTakenCount = PSE.getSymbolicMaxBackedgeTakenCount();
4151 assert(BackedgeTakenCount == PSE.getBackedgeTakenCount() &&
4152 "Invalid loop count");
4153 const SCEV *ExitCount = SE->getAddExpr(
4154 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
4155 const SCEV *Rem = SE->getURemExpr(
4156 SE->applyLoopGuards(ExitCount, TheLoop),
4157 SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
4158 if (Rem->isZero()) {
4159 // Accept MaxFixedVF if we do not have a tail.
4160 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
4161 return MaxFactors;
4162 }
4163 }
4164
4165 // If we don't know the precise trip count, or if the trip count that we
4166 // found modulo the vectorization factor is not zero, try to fold the tail
4167 // by masking.
4168 // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
4169 setTailFoldingStyles(MaxFactors.ScalableVF.isScalable(), UserIC);
4170 if (foldTailByMasking()) {
4172 LLVM_DEBUG(
4173 dbgs()
4174 << "LV: tail is folded with EVL, forcing unroll factor to be 1. Will "
4175 "try to generate VP Intrinsics with scalable vector "
4176 "factors only.\n");
4177 // Tail folded loop using VP intrinsics restricts the VF to be scalable
4178 // for now.
4179 // TODO: extend it for fixed vectors, if required.
4180 assert(MaxFactors.ScalableVF.isScalable() &&
4181 "Expected scalable vector factor.");
4182
4183 MaxFactors.FixedVF = ElementCount::getFixed(1);
4184 }
4185 return MaxFactors;
4186 }
4187
4188 // If there was a tail-folding hint/switch, but we can't fold the tail by
4189 // masking, fallback to a vectorization with a scalar epilogue.
4190 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
4191 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
4192 "scalar epilogue instead.\n");
4193 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
4194 return MaxFactors;
4195 }
4196
4197 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
4198 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
4200 }
4201
4202 if (TC == 0) {
4204 "unable to calculate the loop count due to complex control flow",
4205 "UnknownLoopCountComplexCFG", ORE, TheLoop);
4207 }
4208
4210 "Cannot optimize for size and vectorize at the same time.",
4211 "cannot optimize for size and vectorize at the same time. "
4212 "Enable vectorization of this loop with '#pragma clang loop "
4213 "vectorize(enable)' when compiling with -Os/-Oz",
4214 "NoTailLoopWithOptForSize", ORE, TheLoop);
4216}
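
// Editor's note: illustrative only, ignoring the SCEV machinery and loop
// guards used above. The divisibility check asks whether the trip count is a
// known multiple of VF * IC; if so, no tail remains and folding is skipped.
// E.g. TC = 1024 with VF = 8, IC = 2 leaves no tail, while TC = 1000 with
// VF = 16 leaves 1000 % 16 = 8 scalar iterations.

static bool noTailRemains(unsigned TripCount, unsigned VF, unsigned IC) {
  return TripCount % (VF * IC) == 0;
}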
4217
4218ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
4219 unsigned MaxTripCount, unsigned SmallestType, unsigned WidestType,
4220 ElementCount MaxSafeVF, bool FoldTailByMasking) {
4221 bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
4222 const TypeSize WidestRegister = TTI.getRegisterBitWidth(
4223 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
4225
4226 // Convenience function to return the minimum of two ElementCounts.
4227 auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
4228 assert((LHS.isScalable() == RHS.isScalable()) &&
4229 "Scalable flags must match");
4230 return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS;
4231 };
4232
4233 // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
4234 // Note that both WidestRegister and WidestType may not be powers of 2.
4235 auto MaxVectorElementCount = ElementCount::get(
4236 llvm::bit_floor(WidestRegister.getKnownMinValue() / WidestType),
4237 ComputeScalableMaxVF);
4238 MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
4239 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
4240 << (MaxVectorElementCount * WidestType) << " bits.\n");
4241
4242 if (!MaxVectorElementCount) {
4243 LLVM_DEBUG(dbgs() << "LV: The target has no "
4244 << (ComputeScalableMaxVF ? "scalable" : "fixed")
4245 << " vector registers.\n");
4246 return ElementCount::getFixed(1);
4247 }
4248
4249 unsigned WidestRegisterMinEC = MaxVectorElementCount.getKnownMinValue();
4250 if (MaxVectorElementCount.isScalable() &&
4251 TheFunction->hasFnAttribute(Attribute::VScaleRange)) {
4252 auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange);
4253 auto Min = Attr.getVScaleRangeMin();
4254 WidestRegisterMinEC *= Min;
4255 }
4256
4257 // When a scalar epilogue is required, at least one iteration of the scalar
4258 // loop has to execute. Adjust MaxTripCount accordingly to avoid picking a
4259 // max VF that results in a dead vector loop.
4260 if (MaxTripCount > 0 && requiresScalarEpilogue(true))
4261 MaxTripCount -= 1;
4262
4263 if (MaxTripCount && MaxTripCount <= WidestRegisterMinEC &&
4264 (!FoldTailByMasking || isPowerOf2_32(MaxTripCount))) {
4265 // If upper bound loop trip count (TC) is known at compile time there is no
4266 // point in choosing VF greater than TC (as done in the loop below). Select
4267 // maximum power of two which doesn't exceed TC. If MaxVectorElementCount is
4268 // scalable, we only fall back on a fixed VF when the TC is less than or
4269 // equal to the known number of lanes.
4270 auto ClampedUpperTripCount = llvm::bit_floor(MaxTripCount);
4271 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
4272 "exceeding the constant trip count: "
4273 << ClampedUpperTripCount << "\n");
4274 return ElementCount::get(
4275 ClampedUpperTripCount,
4276 FoldTailByMasking ? MaxVectorElementCount.isScalable() : false);
4277 }
4278
4280 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
4282 ElementCount MaxVF = MaxVectorElementCount;
4283 if (MaximizeBandwidth ||
4287 auto MaxVectorElementCountMaxBW = ElementCount::get(
4288 llvm::bit_floor(WidestRegister.getKnownMinValue() / SmallestType),
4289 ComputeScalableMaxVF);
4290 MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);
4291
4292 // Collect all viable vectorization factors larger than the default MaxVF
4293 // (i.e. MaxVectorElementCount).
4295 for (ElementCount VS = MaxVectorElementCount * 2;
4296 ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2)
4297 VFs.push_back(VS);
4298
4299 // For each VF calculate its register usage.
4300 auto RUs = calculateRegisterUsage(VFs);
4301
4302 // Select the largest VF which doesn't require more registers than existing
4303 // ones.
4304 for (int I = RUs.size() - 1; I >= 0; --I) {
4305 const auto &MLU = RUs[I].MaxLocalUsers;
4306 if (all_of(MLU, [&](decltype(MLU.front()) &LU) {
4307 return LU.second <= TTI.getNumberOfRegisters(LU.first);
4308 })) {
4309 MaxVF = VFs[I];
4310 break;
4311 }
4312 }
4313 if (ElementCount MinVF =
4314 TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) {
4315 if (ElementCount::isKnownLT(MaxVF, MinVF)) {
4316 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
4317 << ") with target's minimum: " << MinVF << '\n');
4318 MaxVF = MinVF;
4319 }
4320 }
4321
4322 // Invalidate any widening decisions we might have made, in case the loop
4323 // requires predication (decided later), but we have already made some
4324 // load/store widening decisions.
4326 }
4327 return MaxVF;
4328}
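
// Editor's note: illustrative arithmetic matching the clamping logic above,
// written with std::bit_floor from C++20 <bit> (the in-tree code uses
// llvm::bit_floor).

#include <bit>

static unsigned exampleMaxVF(unsigned WidestRegisterBits,
                             unsigned WidestTypeBits, unsigned MaxTripCount) {
  // 256-bit registers with a widest element type of i32 give at most 8 lanes.
  unsigned MaxLanes = std::bit_floor(WidestRegisterBits / WidestTypeBits);
  // A small known trip count clamps further, e.g. MaxTripCount = 6 -> VF = 4.
  if (MaxTripCount && MaxTripCount <= MaxLanes)
    return std::bit_floor(MaxTripCount);
  return MaxLanes;
}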
4329
4330/// Convenience function that returns the value of vscale_range if
4331/// vscale_range.min == vscale_range.max, or otherwise the value returned
4332/// by the corresponding TTI method.
4333static std::optional<unsigned>
4335 const Function *Fn = L->getHeader()->getParent();
4336 if (Fn->hasFnAttribute(Attribute::VScaleRange)) {
4337 auto Attr = Fn->getFnAttribute(Attribute::VScaleRange);
4338 auto Min = Attr.getVScaleRangeMin();
4339 auto Max = Attr.getVScaleRangeMax();
4340 if (Max && Min == Max)
4341 return Max;
4342 }
4343
4344 return TTI.getVScaleForTuning();
4345}
4346
4347/// This function attempts to return a value that represents the vectorization
4348/// factor at runtime. For fixed-width VFs we know this precisely at compile
4349/// time, but for scalable VFs we calculate it based on an estimate of the
4350/// vscale value.
4351static unsigned getEstimatedRuntimeVF(const Loop *L,
4352 const TargetTransformInfo &TTI,
4353 ElementCount VF) {
4354 unsigned EstimatedVF = VF.getKnownMinValue();
4355 if (VF.isScalable())
4356 if (std::optional<unsigned> VScale = getVScaleForTuning(L, TTI))
4357 EstimatedVF *= *VScale;
4358 assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1");
4359 return EstimatedVF;
4360}
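
// Editor's note: illustrative only. For VF = vscale x 4 on a target tuned for
// vscale = 2 (e.g. 256-bit scalable registers), the estimated runtime VF is
// 4 * 2 = 8; for a fixed-width VF the estimate is just the lane count.

static unsigned estimateRuntimeVFExample(unsigned KnownMinLanes, bool Scalable,
                                         unsigned TuningVScale) {
  return Scalable ? KnownMinLanes * TuningVScale : KnownMinLanes;
}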
4361
4362bool LoopVectorizationPlanner::isMoreProfitable(
4363 const VectorizationFactor &A, const VectorizationFactor &B,
4364 const unsigned MaxTripCount) const {
4365 InstructionCost CostA = A.Cost;
4366 InstructionCost CostB = B.Cost;
4367
4368 // Improve estimate for the vector width if it is scalable.
4369 unsigned EstimatedWidthA = A.Width.getKnownMinValue();
4370 unsigned EstimatedWidthB = B.Width.getKnownMinValue();
4371 if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI)) {
4372 if (A.Width.isScalable())
4373 EstimatedWidthA *= *VScale;
4374 if (B.Width.isScalable())
4375 EstimatedWidthB *= *VScale;
4376 }
4377
4378 // Assume vscale may be larger than 1 (or the value being tuned for),
4379 // so that scalable vectorization is slightly favorable over fixed-width
4380 // vectorization.
4381 bool PreferScalable = !TTI.preferFixedOverScalableIfEqualCost() &&
4382 A.Width.isScalable() && !B.Width.isScalable();
4383
4384 auto CmpFn = [PreferScalable](const InstructionCost &LHS,
4385 const InstructionCost &RHS) {
4386 return PreferScalable ? LHS <= RHS : LHS < RHS;
4387 };
4388
4389 // To avoid the need for FP division:
4390 // (CostA / EstimatedWidthA) < (CostB / EstimatedWidthB)
4391 // <=> (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA)
4392 if (!MaxTripCount)
4393 return CmpFn(CostA * EstimatedWidthB, CostB * EstimatedWidthA);
4394
4395 auto GetCostForTC = [MaxTripCount, this](unsigned VF,
4396 InstructionCost VectorCost,
4397 InstructionCost ScalarCost) {
4398 // If the trip count is a known (possibly small) constant, the trip count
4399 // will be rounded up to an integer number of iterations under
4400 // FoldTailByMasking. The total cost in that case will be
4401 // VecCost*ceil(TripCount/VF). When not folding the tail, the total
4402 // cost will be VecCost*floor(TC/VF) + ScalarCost*(TC%VF). There will be
4403 // some extra overheads, but for the purpose of comparing the costs of
4404 // different VFs we can use this to compare the total loop-body cost
4405 // expected after vectorization.
4406 if (CM.foldTailByMasking())
4407 return VectorCost * divideCeil(MaxTripCount, VF);
4408 return VectorCost * (MaxTripCount / VF) + ScalarCost * (MaxTripCount % VF);
4409 };
4410
4411 auto RTCostA = GetCostForTC(EstimatedWidthA, CostA, A.ScalarCost);
4412 auto RTCostB = GetCostForTC(EstimatedWidthB, CostB, B.ScalarCost);
4413 return CmpFn(RTCostA, RTCostB);
4414}
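
// Editor's note: a worked example with made-up costs showing both the
// division-free per-lane comparison and the trip-count-aware totals used
// above. Candidate A is VF=4 with vector cost 10, candidate B is VF=8 with
// vector cost 18, the scalar iteration cost is 4, and the trip count is 22.

#include <cstdint>

static bool exampleIsMoreProfitable() {
  uint64_t CostA = 10, WidthA = 4, CostB = 18, WidthB = 8;
  // (CostA / WidthA) < (CostB / WidthB)  <=>  CostA * WidthB < CostB * WidthA:
  // 80 < 72 is false, so B is cheaper per lane.
  bool AWinsPerLane = CostA * WidthB < CostB * WidthA;
  // With a known trip count and no tail folding, the totals also pay for the
  // scalar remainder: Cost * (TC / VF) + ScalarCost * (TC % VF).
  uint64_t TC = 22, ScalarCost = 4;
  uint64_t TotalA = CostA * (TC / WidthA) + ScalarCost * (TC % WidthA); // 58
  uint64_t TotalB = CostB * (TC / WidthB) + ScalarCost * (TC % WidthB); // 60
  // The small trip count flips the decision back to A.
  return !AWinsPerLane && TotalA < TotalB;
}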
4415
4416bool LoopVectorizationPlanner::isMoreProfitable(
4417 const VectorizationFactor &A, const VectorizationFactor &B) const {
4418 const unsigned MaxTripCount = PSE.getSmallConstantMaxTripCount();
4419 return LoopVectorizationPlanner::isMoreProfitable(A, B, MaxTripCount);
4420}
4421
4422void LoopVectorizationPlanner::emitInvalidCostRemarks(
4423 OptimizationRemarkEmitter *ORE) {
4424 using RecipeVFPair = std::pair<VPRecipeBase *, ElementCount>;
4425 SmallVector<RecipeVFPair> InvalidCosts;
4426 for (const auto &Plan : VPlans) {
4427 for (ElementCount VF : Plan->vectorFactors()) {
4428 VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(),
4429 CM);
4430 precomputeCosts(*Plan, VF, CostCtx);
4431 auto Iter = vp_depth_first_deep(Plan->getVectorLoopRegion()->getEntry());
4432 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
4433 for (auto &R : *VPBB) {
4434 if (!R.cost(VF, CostCtx).isValid())
4435 InvalidCosts.emplace_back(&R, VF);
4436 }
4437 }
4438 }
4439 }
4440 if (InvalidCosts.empty())
4441 return;
4442
4443 // Emit a report of VFs with invalid costs in the loop.
4444
4445 // Group the remarks per recipe, keeping the recipe order from InvalidCosts.
4447 unsigned I = 0;
4448 for (auto &Pair : InvalidCosts)
4449 if (!Numbering.count(Pair.first))
4450 Numbering[Pair.first] = I++;
4451
4452 // Sort the list, first on recipe(number) then on VF.
4453 sort(InvalidCosts, [&Numbering](RecipeVFPair &A, RecipeVFPair &B) {
4454 if (Numbering[A.first] != Numbering[B.first])
4455 return Numbering[A.first] < Numbering[B.first];
4456 const auto &LHS = A.second;
4457 const auto &RHS = B.second;
4458 return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) <
4459 std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue());
4460 });
4461
4462 // For a list of ordered recipe-VF pairs:
4463 // [(load, VF1), (load, VF2), (store, VF1)]
4464 // group the recipes together to emit separate remarks for:
4465 // load (VF1, VF2)
4466 // store (VF1)
4467 auto Tail = ArrayRef<RecipeVFPair>(InvalidCosts);
4468 auto Subset = ArrayRef<RecipeVFPair>();
4469 do {
4470 if (Subset.empty())
4471 Subset = Tail.take_front(1);
4472
4473 VPRecipeBase *R = Subset.front().first;
4474
4475 unsigned Opcode =
4478 [](const auto *R) { return Instruction::PHI; })
4479 .Case<VPWidenSelectRecipe>(
4480 [](const auto *R) { return Instruction::Select; })
4481 .Case<VPWidenStoreRecipe>(
4482 [](const auto *R) { return Instruction::Store; })
4483 .Case<VPWidenLoadRecipe>(
4484 [](const auto *R) { return Instruction::Load; })
4485 .Case<VPWidenCallRecipe, VPWidenIntrinsicRecipe>(
4486 [](const auto *R) { return Instruction::Call; })
4489 [](const auto *R) { return R->getOpcode(); })
4490 .Case<VPInterleaveRecipe>([](const VPInterleaveRecipe *R) {
4491 return R->getStoredValues().empty() ? Instruction::Load
4492 : Instruction::Store;
4493 });
4494
4495 // If the next recipe is different, or if there are no other pairs,
4496 // emit a remark for the collated subset. e.g.
4497 // [(load, VF1), (load, VF2))]
4498 // to emit:
4499 // remark: invalid costs for 'load' at VF=(VF1, VF2)
4500 if (Subset == Tail || Tail[Subset.size()].first != R) {
4501 std::string OutString;
4502 raw_string_ostream OS(OutString);
4503 assert(!Subset.empty() && "Unexpected empty range");
4504 OS << "Recipe with invalid costs prevented vectorization at VF=(";
4505 for (const auto &Pair : Subset)
4506 OS << (Pair.second == Subset.front().second ? "" : ", ") << Pair.second;
4507 OS << "):";
4508 if (Opcode == Instruction::Call) {
4509 StringRef Name = "";
4510 if (auto *Int = dyn_cast<VPWidenIntrinsicRecipe>(R)) {
4511 Name = Int->getIntrinsicName();
4512 } else {
4513 auto *WidenCall = dyn_cast<VPWidenCallRecipe>(R);
4514 Function *CalledFn =
4515 WidenCall ? WidenCall->getCalledScalarFunction()
4516 : cast<Function>(R->getOperand(R->getNumOperands() - 1)
4517 ->getLiveInIRValue());
4518 Name = CalledFn->getName();
4519 }
4520 OS << " call to " << Name;
4521 } else
4522 OS << " " << Instruction::getOpcodeName(Opcode);
4523 reportVectorizationInfo(OutString, "InvalidCost", ORE, OrigLoop, nullptr,
4524 R->getDebugLoc());
4525 Tail = Tail.drop_front(Subset.size());
4526 Subset = {};
4527 } else
4528 // Grow the subset by one element
4529 Subset = Tail.take_front(Subset.size() + 1);
4530 } while (!Tail.empty());
4531}
4532
4533/// Check if any recipe of \p Plan will generate a vector value, which will be
4534/// assigned a vector register.
4535static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
4536 const TargetTransformInfo &TTI) {
4537 assert(VF.isVector() && "Checking a scalar VF?");
4538 VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType());
4539 DenseSet<VPRecipeBase *> EphemeralRecipes;
4540 collectEphemeralRecipesForVPlan(Plan, EphemeralRecipes);
4541 // Set of already visited types.
4542 DenseSet<Type *> Visited;
4543 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
4545 for (VPRecipeBase &R : *VPBB) {
4546 if (EphemeralRecipes.contains(&R))
4547 continue;
4548 // Continue early if the recipe is considered to not produce a vector
4549 // result. Note that this includes VPInstruction where some opcodes may
4550 // produce a vector, to preserve existing behavior as VPInstructions model
4551 // aspects not directly mapped to existing IR instructions.
4552 switch (R.getVPDefID()) {
4553 case VPDef::VPDerivedIVSC:
4554 case VPDef::VPScalarIVStepsSC:
4555 case VPDef::VPScalarCastSC:
4556 case VPDef::VPReplicateSC:
4557 case VPDef::VPInstructionSC:
4558 case VPDef::VPCanonicalIVPHISC:
4559 case VPDef::VPVectorPointerSC:
4560 case VPDef::VPReverseVectorPointerSC:
4561 case VPDef::VPExpandSCEVSC:
4562 case VPDef::VPEVLBasedIVPHISC:
4563 case VPDef::VPPredInstPHISC:
4564 case VPDef::VPBranchOnMaskSC:
4565 continue;
4566 case VPDef::VPReductionSC:
4567 case VPDef::VPActiveLaneMaskPHISC:
4568 case VPDef::VPWidenCallSC:
4569 case VPDef::VPWidenCanonicalIVSC:
4570 case VPDef::VPWidenCastSC:
4571 case VPDef::VPWidenGEPSC:
4572 case VPDef::VPWidenIntrinsicSC:
4573 case VPDef::VPWidenSC:
4574 case VPDef::VPWidenSelectSC:
4575 case VPDef::VPBlendSC:
4576 case VPDef::VPFirstOrderRecurrencePHISC:
4577 case VPDef::VPWidenPHISC:
4578 case VPDef::VPWidenIntOrFpInductionSC:
4579 case VPDef::VPWidenPointerInductionSC:
4580 case VPDef::VPReductionPHISC:
4581 case VPDef::VPInterleaveSC:
4582 case VPDef::VPWidenLoadEVLSC:
4583 case VPDef::VPWidenLoadSC:
4584 case VPDef::VPWidenStoreEVLSC:
4585 case VPDef::VPWidenStoreSC:
4586 break;
4587 default:
4588 llvm_unreachable("unhandled recipe");
4589 }
4590
4591 auto WillWiden = [&TTI, VF](Type *ScalarTy) {
4592 Type *VectorTy = toVectorTy(ScalarTy, VF);
4593 unsigned NumLegalParts = TTI.getNumberOfParts(VectorTy);
4594 if (!NumLegalParts)
4595 return false;
4596 if (VF.isScalable()) {
4597 // <vscale x 1 x iN> is assumed to be profitable over iN because
4598 // scalable registers are a distinct register class from scalar
4599 // ones. If we ever find a target which wants to lower scalable
4600 // vectors back to scalars, we'll need to update this code to
4601 // explicitly ask TTI about the register class uses for each part.
4602 return NumLegalParts <= VF.getKnownMinValue();
4603 }
4604 // Two or more parts that share a register are vectorized.
4605 return NumLegalParts < VF.getKnownMinValue();
4606 };
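      // For example, with VF = 4 and i32 elements on a target with 128-bit
      // vector registers, <4 x i32> legalizes to a single part (1 < 4), so it
      // counts as a genuine vector value. If the type instead legalized to 4
      // or more parts, each element would occupy its own register and the
      // recipe would not be considered to produce a vector result.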
4607
4608      // If the recipe has no defs and is not a store (e.g. a branch), there is no value to check; continue.
4609 if (R.getNumDefinedValues() == 0 &&
4610 !isa<VPWidenStoreRecipe, VPWidenStoreEVLRecipe, VPInterleaveRecipe>(
4611 &R))
4612 continue;
4613      // For multi-def recipes (currently only interleaved loads), it
4614      // suffices to check only the first defined value.
4615      // For stores, check the stored value; for interleaved stores it
4616      // suffices to check only the first stored value. In all cases this
4617      // is the second operand.
4618 VPValue *ToCheck =
4619 R.getNumDefinedValues() >= 1 ? R.getVPValue(0) : R.getOperand(1);
4620 Type *ScalarTy = TypeInfo.inferScalarType(ToCheck);
4621 if (!Visited.insert({ScalarTy}).second)
4622 continue;
4623 if (WillWiden(ScalarTy))
4624 return true;
4625 }
4626 }
4627
4628 return false;
4629}
4630
4631#ifndef NDEBUG
4632VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
4633  InstructionCost ExpectedCost = CM.expectedCost(ElementCount::getFixed(1));
4634  LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
4635 assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
4636 assert(any_of(VPlans,
4637 [](std::unique_ptr<VPlan> &P) {
4638 return P->hasVF(ElementCount::getFixed(1));
4639 }) &&
4640 "Expected Scalar VF to be a candidate");
4641
4642 const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost,
4643 ExpectedCost);
4644 VectorizationFactor ChosenFactor = ScalarCost;
4645
4646 bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
4647 if (ForceVectorization &&
4648 (VPlans.size() > 1 || !VPlans[0]->hasScalarVFOnly())) {
4649 // Ignore scalar width, because the user explicitly wants vectorization.
4650 // Initialize cost to max so that VF = 2 is, at least, chosen during cost
4651 // evaluation.
4652 ChosenFactor.Cost = InstructionCost::getMax();
4653 }
4654
4655 for (auto &P : VPlans) {
4656 for (ElementCount VF : P->vectorFactors()) {
4657 // The cost for scalar VF=1 is already calculated, so ignore it.
4658 if (VF.isScalar())
4659 continue;
4660
4662 VectorizationFactor Candidate(VF, C, ScalarCost.ScalarCost);
4663
4664 unsigned Width = getEstimatedRuntimeVF(OrigLoop, TTI, Candidate.Width);
4665 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << VF
4666 << " costs: " << (Candidate.Cost / Width));
4667 if (VF.isScalable())
4668 LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
4669 << getVScaleForTuning(OrigLoop, TTI).value_or(1)
4670 << ")");
4671 LLVM_DEBUG(dbgs() << ".\n");
4672
4673 if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) {
4674 LLVM_DEBUG(
4675 dbgs()
4676 << "LV: Not considering vector loop of width " << VF
4677 << " because it will not generate any vector instructions.\n");
4678 continue;
4679 }
4680
4681 if (isMoreProfitable(Candidate, ChosenFactor))
4682 ChosenFactor = Candidate;
4683 }
4684 }
4685
4686  if (!EnableCondStoresVectorization && CM.hasPredStores()) {
4687    reportVectorizationFailure(
4688        "There are conditional stores.",
4689 "store that is conditionally executed prevents vectorization",
4690 "ConditionalStore", ORE, OrigLoop);
4691 ChosenFactor = ScalarCost;
4692 }
4693
4694 LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
4695 !isMoreProfitable(ChosenFactor, ScalarCost)) dbgs()
4696 << "LV: Vectorization seems to be not beneficial, "
4697 << "but was forced by a user.\n");
4698 return ChosenFactor;
4699}
4700#endif
4701
4702bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
4703 ElementCount VF) const {
4704 // Cross iteration phis such as reductions need special handling and are
4705 // currently unsupported.
4706 if (any_of(OrigLoop->getHeader()->phis(),
4707 [&](PHINode &Phi) { return Legal->isFixedOrderRecurrence(&Phi); }))
4708 return false;
4709
4710 // Phis with uses outside of the loop require special handling and are
4711 // currently unsupported.
4712 for (const auto &Entry : Legal->getInductionVars()) {
4713 // Look for uses of the value of the induction at the last iteration.
4714 Value *PostInc =
4715 Entry.first->getIncomingValueForBlock(OrigLoop->getLoopLatch());
4716 for (User *U : PostInc->users())
4717 if (!OrigLoop->contains(cast<Instruction>(U)))
4718 return false;
4719 // Look for uses of penultimate value of the induction.
4720 for (User *U : Entry.first->users())
4721 if (!OrigLoop->contains(cast<Instruction>(U)))
4722 return false;
4723 }
4724
4725  // Epilogue vectorization code has not been audited to ensure it handles
4726  // non-latch exits properly. It may be fine, but it needs to be audited and
4727  // tested.
4728 // TODO: Add support for loops with an early exit.
4729 if (OrigLoop->getExitingBlock() != OrigLoop->getLoopLatch())
4730 return false;
4731
4732 return true;
4733}
4734
4735bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
4736    const ElementCount VF, const unsigned IC) const {
4737 // FIXME: We need a much better cost-model to take different parameters such
4738 // as register pressure, code size increase and cost of extra branches into
4739 // account. For now we apply a very crude heuristic and only consider loops
4740 // with vectorization factors larger than a certain value.
4741
4742 // Allow the target to opt out entirely.
4743  if (!TTI.preferEpilogueVectorization())
4744    return false;
4745
4746  // We also consider epilogue vectorization unprofitable for targets that don't
4747  // consider interleaving beneficial (e.g. MVE).
4748 if (TTI.getMaxInterleaveFactor(VF) <= 1)
4749 return false;
4750
4751 // TODO: PR #108190 introduced a discrepancy between fixed-width and scalable
4752 // VFs when deciding profitability.
4753 // See related "TODO: extend to support scalable VFs." in
4754 // selectEpilogueVectorizationFactor.
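  // For example, a fixed main-loop VF of 8 interleaved by 2 is treated as an
  // effective width of 16 when compared against the minimum-VF threshold
  // below, whereas a scalable VF is compared without the interleave
  // multiplier.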
4755 unsigned Multiplier = VF.isFixed() ? IC : 1;
4756 unsigned MinVFThreshold = EpilogueVectorizationMinVF.getNumOccurrences() > 0
4759 return getEstimatedRuntimeVF(TheLoop, TTI, VF * Multiplier) >= MinVFThreshold;
4760}
4761
4762VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
4763    const ElementCount MainLoopVF, unsigned IC) {
4764  VectorizationFactor Result = VectorizationFactor::Disabled();
4765  if (!EnableEpilogueVectorization) {
4766    LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n");
4767 return Result;
4768 }
4769
4770 if (!CM.isScalarEpilogueAllowed()) {
4771 LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because no "
4772 "epilogue is allowed.\n");
4773 return Result;
4774 }
4775
4776 // Not really a cost consideration, but check for unsupported cases here to
4777 // simplify the logic.
4778 if (!isCandidateForEpilogueVectorization(MainLoopVF)) {
4779 LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because the loop "
4780 "is not a supported candidate.\n");
4781 return Result;
4782 }
4783
4784  if (EpilogueVectorizationForceVF > 1) {
4785    LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n");
4786    ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF);
4787    if (hasPlanWithVF(ForcedEC))
4788 return {ForcedEC, 0, 0};
4789
4790 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization forced factor is not "
4791 "viable.\n");
4792 return Result;
4793 }
4794
4795 if (OrigLoop->getHeader()->getParent()->hasOptSize() ||
4796 OrigLoop->getHeader()->getParent()->hasMinSize()) {
4797 LLVM_DEBUG(
4798 dbgs() << "LEV: Epilogue vectorization skipped due to opt for size.\n");
4799 return Result;
4800 }
4801
4802 if (!CM.isEpilogueVectorizationProfitable(MainLoopVF, IC)) {
4803 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "
4804 "this loop\n");
4805 return Result;
4806 }
4807
4808 // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know
4809 // the main loop handles 8 lanes per iteration. We could still benefit from
4810 // vectorizing the epilogue loop with VF=4.
4811 ElementCount EstimatedRuntimeVF =
4812 ElementCount::getFixed(getEstimatedRuntimeVF(OrigLoop, TTI, MainLoopVF));
4813
4814 ScalarEvolution &SE = *PSE.getSE();
4815 Type *TCType = Legal->getWidestInductionType();
4816 const SCEV *RemainingIterations = nullptr;
4817 unsigned MaxTripCount = 0;
4818 for (auto &NextVF : ProfitableVFs) {
4819 // Skip candidate VFs without a corresponding VPlan.
4820 if (!hasPlanWithVF(NextVF.Width))
4821 continue;
4822
4823 // Skip candidate VFs with widths >= the (estimated) runtime VF (scalable
4824 // vectors) or > the VF of the main loop (fixed vectors).
4825 if ((!NextVF.Width.isScalable() && MainLoopVF.isScalable() &&
4826 ElementCount::isKnownGE(NextVF.Width, EstimatedRuntimeVF)) ||
4827 (NextVF.Width.isScalable() &&
4828 ElementCount::isKnownGE(NextVF.Width, MainLoopVF)) ||
4829 (!NextVF.Width.isScalable() && !MainLoopVF.isScalable() &&
4830 ElementCount::isKnownGT(NextVF.Width, MainLoopVF)))
4831 continue;
4832
4833 // If NextVF is greater than the number of remaining iterations, the
4834 // epilogue loop would be dead. Skip such factors.
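    // For example, with a trip count of 20, MainLoopVF = 8 and IC = 2 the main
    // vector loop leaves 20 % 16 = 4 remaining iterations, so an epilogue VF
    // of 8 would never execute and is skipped.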
4835 if (!MainLoopVF.isScalable() && !NextVF.Width.isScalable()) {
4836 // TODO: extend to support scalable VFs.
4837 if (!RemainingIterations) {
4839 getPlanFor(NextVF.Width).getTripCount(), SE);
4840 assert(!isa<SCEVCouldNotCompute>(TC) &&
4841 "Trip count SCEV must be computable");
4842 RemainingIterations = SE.getURemExpr(
4843 TC, SE.getConstant(TCType, MainLoopVF.getKnownMinValue() * IC));
4844 MaxTripCount = MainLoopVF.getKnownMinValue() * IC - 1;
4845 if (SE.isKnownPredicate(CmpInst::ICMP_ULT, RemainingIterations,
4846 SE.getConstant(TCType, MaxTripCount))) {
4847 MaxTripCount =
4848 SE.getUnsignedRangeMax(RemainingIterations).getZExtValue();
4849 }
4850 LLVM_DEBUG(dbgs() << "LEV: Maximum Trip Count for Epilogue: "
4851 << MaxTripCount << "\n");
4852 }
4853 if (SE.isKnownPredicate(
4854            CmpInst::ICMP_UGT,
4855            SE.getConstant(TCType, NextVF.Width.getKnownMinValue()),
4856 RemainingIterations))
4857 continue;
4858 }
4859
4860 if (Result.Width.isScalar() ||
4861 isMoreProfitable(NextVF, Result, MaxTripCount))
4862 Result = NextVF;
4863 }
4864
4865 if (Result != VectorizationFactor::Disabled())
4866 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
4867 << Result.Width << "\n");
4868 return Result;
4869}
4870
4871std::pair<unsigned, unsigned>
4872LoopVectorizationCostModel::getSmallestAndWidestTypes() {
4873  unsigned MinWidth = -1U;
4874 unsigned MaxWidth = 8;
4876 // For in-loop reductions, no element types are added to ElementTypesInLoop
4877 // if there are no loads/stores in the loop. In this case, check through the
4878 // reduction variables to determine the maximum width.
4879 if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) {
4880 // Reset MaxWidth so that we can find the smallest type used by recurrences
4881 // in the loop.
4882 MaxWidth = -1U;
4883 for (const auto &PhiDescriptorPair : Legal->getReductionVars()) {
4884 const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second;
4885 // When finding the min width used by the recurrence we need to account
4886 // for casts on the input operands of the recurrence.
4887 MaxWidth = std::min<unsigned>(
4888          MaxWidth, std::min<unsigned>(
4889                        RdxDesc.getMinWidthCastToRecurrenceTypeInBits(),
4890                        RdxDesc.getRecurrenceType()->getScalarSizeInBits()));
4891    }
4892 } else {
4893 for (Type *T : ElementTypesInLoop) {
4894 MinWidth = std::min<unsigned>(
4895 MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue());
4896 MaxWidth = std::max<unsigned>(
4897 MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue());
4898 }
4899 }
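  // For example, a loop whose memory accesses load i8 values and store i32
  // values yields {8, 32}.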
4900 return {MinWidth, MaxWidth};
4901}
4902
4903void LoopVectorizationCostModel::collectElementTypesForWidening() {
4904  ElementTypesInLoop.clear();
4905 // For each block.
4906 for (BasicBlock *BB : TheLoop->blocks()) {
4907 // For each instruction in the loop.
4908 for (Instruction &I : BB->instructionsWithoutDebug()) {
4909 Type *T = I.getType();
4910
4911 // Skip ignored values.
4912 if (ValuesToIgnore.count(&I))
4913 continue;
4914
4915 // Only examine Loads, Stores and PHINodes.
4916 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
4917 continue;
4918
4919 // Examine PHI nodes that are reduction variables. Update the type to
4920 // account for the recurrence type.
4921 if (auto *PN = dyn_cast<PHINode>(&I)) {
4922 if (!Legal->isReductionVariable(PN))
4923 continue;
4924 const RecurrenceDescriptor &RdxDesc =
4925 Legal->getReductionVars().find(PN)->second;
4928 RdxDesc.getRecurrenceType(),
4930 continue;
4931 T = RdxDesc.getRecurrenceType();
4932 }
4933
4934 // Examine the stored values.
4935 if (auto *ST = dyn_cast<StoreInst>(&I))
4936 T = ST->getValueOperand()->getType();
4937
4938 assert(T->isSized() &&
4939 "Expected the load/store/recurrence type to be sized");
4940
4941 ElementTypesInLoop.insert(T);
4942 }
4943 }
4944}
4945
4946unsigned
4948 InstructionCost LoopCost) {
4949 // -- The interleave heuristics --
4950 // We interleave the loop in order to expose ILP and reduce the loop overhead.
4951 // There are many micro-architectural considerations that we can't predict
4952 // at this level. For example, frontend pressure (on decode or fetch) due to
4953 // code size, or the number and capabilities of the execution ports.
4954 //
4955 // We use the following heuristics to select the interleave count:
4956 // 1. If the code has reductions, then we interleave to break the cross
4957 // iteration dependency.
4958 // 2. If the loop is really small, then we interleave to reduce the loop
4959 // overhead.
4960 // 3. We don't interleave if we think that we will spill registers to memory
4961 // due to the increased register pressure.
4962
4963  if (!isScalarEpilogueAllowed())
4964    return 1;
4965
4966 // Do not interleave if EVL is preferred and no User IC is specified.
4967 if (foldTailWithEVL()) {
4968 LLVM_DEBUG(dbgs() << "LV: Preference for VP intrinsics indicated. "
4969 "Unroll factor forced to be 1.\n");
4970 return 1;
4971 }
4972
4973 // We used the distance for the interleave count.
4974  if (!Legal->isSafeForAnyVectorWidth())
4975    return 1;
4976
4977 // We don't attempt to perform interleaving for loops with uncountable early
4978 // exits because the VPInstruction::AnyOf code cannot currently handle
4979 // multiple parts.
4981 return 1;
4982
4983 auto BestKnownTC = getSmallBestKnownTC(PSE, TheLoop);
4984 const bool HasReductions = !Legal->getReductionVars().empty();
4985
4986 // If we did not calculate the cost for VF (because the user selected the VF)
4987 // then we calculate the cost of VF here.
4988 if (LoopCost == 0) {
4989 LoopCost = expectedCost(VF);
4990 assert(LoopCost.isValid() && "Expected to have chosen a VF with valid cost");
4991
4992 // Loop body is free and there is no need for interleaving.
4993 if (LoopCost == 0)
4994 return 1;
4995 }
4996
4997  RegisterUsage R = calculateRegisterUsage({VF})[0];
4998  // We divide by these constants so assume that we have at least one
4999 // instruction that uses at least one register.
5000 for (auto &Pair : R.MaxLocalUsers) {
5001 Pair.second = std::max(Pair.second, 1U);
5002 }
5003
5004 // We calculate the interleave count using the following formula.
5005 // Subtract the number of loop invariants from the number of available
5006 // registers. These registers are used by all of the interleaved instances.
5007 // Next, divide the remaining registers by the number of registers that is
5008 // required by the loop, in order to estimate how many parallel instances
5009 // fit without causing spills. All of this is rounded down if necessary to be
5010 // a power of two. We want power of two interleave count to simplify any
5011 // addressing operations or alignment considerations.
5012 // We also want power of two interleave counts to ensure that the induction
5013 // variable of the vector loop wraps to zero, when tail is folded by masking;
5014 // this currently happens when OptForSize, in which case IC is set to 1 above.
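  // For example, with 32 available registers in a class, 2 of them holding
  // loop-invariant values and at most 6 values of that class live at once,
  // (32 - 2) / 6 = 5 is rounded down to an interleave count of 4.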
5015 unsigned IC = UINT_MAX;
5016
5017 for (const auto &Pair : R.MaxLocalUsers) {
5018 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(Pair.first);
5019 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
5020 << " registers of "
5021 << TTI.getRegisterClassName(Pair.first)
5022 << " register class\n");
5023 if (VF.isScalar()) {
5024 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
5025 TargetNumRegisters = ForceTargetNumScalarRegs;
5026 } else {
5027 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
5028 TargetNumRegisters = ForceTargetNumVectorRegs;
5029 }
5030 unsigned MaxLocalUsers = Pair.second;
5031 unsigned LoopInvariantRegs = 0;
5032 if (R.LoopInvariantRegs.find(Pair.first) != R.LoopInvariantRegs.end())
5033 LoopInvariantRegs = R.LoopInvariantRegs[Pair.first];
5034
5035 unsigned TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs) /
5036 MaxLocalUsers);
5037 // Don't count the induction variable as interleaved.
5038    if (EnableIndVarRegisterHeur) {
5039      TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs - 1) /
5040 std::max(1U, (MaxLocalUsers - 1)));
5041 }
5042
5043 IC = std::min(IC, TmpIC);
5044 }
5045
5046 // Clamp the interleave ranges to reasonable counts.
5047 unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
5048
5049 // Check if the user has overridden the max.
5050 if (VF.isScalar()) {
5051 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
5052 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
5053 } else {
5054 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
5055 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
5056 }
5057
5058 unsigned EstimatedVF = getEstimatedRuntimeVF(TheLoop, TTI, VF);
5059 unsigned KnownTC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5060 if (KnownTC > 0) {
5061 // At least one iteration must be scalar when this constraint holds. So the
5062 // maximum available iterations for interleaving is one less.
5063 unsigned AvailableTC =
5064 requiresScalarEpilogue(VF.isVector()) ? KnownTC - 1 : KnownTC;
5065
5066 // If trip count is known we select between two prospective ICs, where
5067 // 1) the aggressive IC is capped by the trip count divided by VF
5068 // 2) the conservative IC is capped by the trip count divided by (VF * 2)
5069 // The final IC is selected in a way that the epilogue loop trip count is
5070 // minimized while maximizing the IC itself, so that we either run the
5071 // vector loop at least once if it generates a small epilogue loop, or else
5072 // we run the vector loop at least twice.
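    // For example, with AvailableTC = 32, EstimatedVF = 8 and a target maximum
    // of 8, the aggressive bound is bit_floor(min(32 / 8, 8)) = 4 and the
    // conservative bound is bit_floor(min(32 / 16, 8)) = 2. Both leave a
    // scalar tail of 0 iterations, so the larger count of 4 is used.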
5073
5074 unsigned InterleaveCountUB = bit_floor(
5075 std::max(1u, std::min(AvailableTC / EstimatedVF, MaxInterleaveCount)));
5076 unsigned InterleaveCountLB = bit_floor(std::max(
5077 1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));
5078 MaxInterleaveCount = InterleaveCountLB;
5079
5080 if (InterleaveCountUB != InterleaveCountLB) {
5081 unsigned TailTripCountUB =
5082 (AvailableTC % (EstimatedVF * InterleaveCountUB));
5083 unsigned TailTripCountLB =
5084 (AvailableTC % (EstimatedVF * InterleaveCountLB));
5085      // If both produce the same scalar tail, maximize the IC to do the same
5086      // work in fewer vector loop iterations.
5087 if (TailTripCountUB == TailTripCountLB)
5088 MaxInterleaveCount = InterleaveCountUB;
5089 }
5090 } else if (BestKnownTC && *BestKnownTC > 0) {
5091 // At least one iteration must be scalar when this constraint holds. So the
5092 // maximum available iterations for interleaving is one less.
5093 unsigned AvailableTC = requiresScalarEpilogue(VF.isVector())
5094 ? (*BestKnownTC) - 1
5095 : *BestKnownTC;
5096
5097    // If the trip count is an estimated compile-time constant, cap the IC at
5098    // the trip count divided by VF * 2, such that the vector loop runs at
5099    // least twice. This makes interleaving seem profitable when there is an
5100    // epilogue loop present. Since the exact trip count is not known, we
5101    // choose to be conservative in our IC estimate.
5102 MaxInterleaveCount = bit_floor(std::max(
5103 1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));
5104 }
5105
5106 assert(MaxInterleaveCount > 0 &&
5107 "Maximum interleave count must be greater than 0");
5108
5109 // Clamp the calculated IC to be between the 1 and the max interleave count
5110 // that the target and trip count allows.
5111 if (IC > MaxInterleaveCount)
5112 IC = MaxInterleaveCount;
5113 else
5114 // Make sure IC is greater than 0.
5115 IC = std::max(1u, IC);
5116
5117 assert(IC > 0 && "Interleave count must be greater than 0.");
5118
5119 // Interleave if we vectorized this loop and there is a reduction that could
5120 // benefit from interleaving.
5121 if (VF.isVector() && HasReductions) {
5122 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
5123 return IC;
5124 }
5125
5126 // For any scalar loop that either requires runtime checks or predication we
5127 // are better off leaving this to the unroller. Note that if we've already
5128 // vectorized the loop we will have done the runtime check and so interleaving
5129 // won't require further checks.
5130 bool ScalarInterleavingRequiresPredication =
5131 (VF.isScalar() && any_of(TheLoop->blocks(), [this](BasicBlock *BB) {
5132 return Legal->blockNeedsPredication(BB);
5133 }));
5134  bool ScalarInterleavingRequiresRuntimePointerCheck =
5135      (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);
5136
5137 // We want to interleave small loops in order to reduce the loop overhead and
5138 // potentially expose ILP opportunities.
5139 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
5140 << "LV: IC is " << IC << '\n'
5141 << "LV: VF is " << VF << '\n');
5142 const bool AggressivelyInterleaveReductions =
5143 TTI.enableAggressiveInterleaving(HasReductions);
5144 if (!ScalarInterleavingRequiresRuntimePointerCheck &&
5145 !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) {
5146 // We assume that the cost overhead is 1 and we use the cost model
5147 // to estimate the cost of the loop and interleave until the cost of the
5148 // loop overhead is about 5% of the cost of the loop.
5149 unsigned SmallIC = std::min(IC, (unsigned)llvm::bit_floor<uint64_t>(
5150 SmallLoopCost / *LoopCost.getValue()));
5151
5152 // Interleave until store/load ports (estimated by max interleave count) are
5153 // saturated.
5154 unsigned NumStores = Legal->getNumStores();
5155 unsigned NumLoads = Legal->getNumLoads();
5156 unsigned StoresIC = IC / (NumStores ? NumStores : 1);
5157 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
5158
5159 // There is little point in interleaving for reductions containing selects
5160 // and compares when VF=1 since it may just create more overhead than it's
5161 // worth for loops with small trip counts. This is because we still have to
5162 // do the final reduction after the loop.
5163 bool HasSelectCmpReductions =
5164 HasReductions &&
5165 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
5166 const RecurrenceDescriptor &RdxDesc = Reduction.second;
5167 RecurKind RK = RdxDesc.getRecurrenceKind();
5168 return RecurrenceDescriptor::isAnyOfRecurrenceKind(RK) ||
5169 RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK);
5170 });
5171 if (HasSelectCmpReductions) {
5172 LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n");
5173 return 1;
5174 }
5175
5176 // If we have a scalar reduction (vector reductions are already dealt with
5177 // by this point), we can increase the critical path length if the loop
5178 // we're interleaving is inside another loop. For tree-wise reductions
5179 // set the limit to 2, and for ordered reductions it's best to disable
5180 // interleaving entirely.
5181 if (HasReductions && TheLoop->getLoopDepth() > 1) {
5182 bool HasOrderedReductions =
5183 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
5184 const RecurrenceDescriptor &RdxDesc = Reduction.second;
5185 return RdxDesc.isOrdered();
5186 });
5187 if (HasOrderedReductions) {
5188 LLVM_DEBUG(
5189 dbgs() << "LV: Not interleaving scalar ordered reductions.\n");
5190 return 1;
5191 }
5192
5193 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
5194 SmallIC = std::min(SmallIC, F);
5195 StoresIC = std::min(StoresIC, F);
5196 LoadsIC = std::min(LoadsIC, F);
5197 }
5198
5199    if (EnableLoadStoreRuntimeInterleave &&
5200        std::max(StoresIC, LoadsIC) > SmallIC) {
5201 LLVM_DEBUG(
5202 dbgs() << "LV: Interleaving to saturate store or load ports.\n");
5203 return std::max(StoresIC, LoadsIC);
5204 }
5205
5206 // If there are scalar reductions and TTI has enabled aggressive
5207 // interleaving for reductions, we will interleave to expose ILP.
5208 if (VF.isScalar() && AggressivelyInterleaveReductions) {
5209 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5210 // Interleave no less than SmallIC but not as aggressive as the normal IC
5211 // to satisfy the rare situation when resources are too limited.
5212 return std::max(IC / 2, SmallIC);
5213 }
5214
5215 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
5216 return SmallIC;
5217 }
5218
5219 // Interleave if this is a large loop (small loops are already dealt with by
5220 // this point) that could benefit from interleaving.
5221 if (AggressivelyInterleaveReductions) {
5222 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5223 return IC;
5224 }
5225
5226 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
5227 return 1;
5228}
5229
5232 // This function calculates the register usage by measuring the highest number
5233 // of values that are alive at a single location. Obviously, this is a very
5234  // rough estimation. We scan the loop in topological order and
5235 // assign a number to each instruction. We use RPO to ensure that defs are
5236 // met before their users. We assume that each instruction that has in-loop
5237 // users starts an interval. We record every time that an in-loop value is
5238 // used, so we have a list of the first and last occurrences of each
5239 // instruction. Next, we transpose this data structure into a multi map that
5240 // holds the list of intervals that *end* at a specific location. This multi
5241 // map allows us to perform a linear search. We scan the instructions linearly
5242 // and record each time that a new interval starts, by placing it in a set.
5243 // If we find this value in the multi-map then we remove it from the set.
5244 // The max register usage is the maximum size of the set.
5245 // We also search for instructions that are defined outside the loop, but are
5246 // used inside the loop. We need this number separately from the max-interval
5247  // usage number because when we unroll, loop-invariant values do not take
5248  // more registers.
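  // For example, if %a is defined at index 1 and last used at index 4, its
  // interval stays open while %b (defined at index 2, last used at index 3)
  // is processed, so both contribute to the usage estimate at that point.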
5249  LoopBlocksDFS DFS(TheLoop);
5250  DFS.perform(LI);
5251
5252 RegisterUsage RU;
5253
5254 // Each 'key' in the map opens a new interval. The values
5255 // of the map are the index of the 'last seen' usage of the
5256 // instruction that is the key.
5257  using IntervalMap = SmallDenseMap<Instruction *, unsigned, 16>;
5258
5259 // Maps instruction to its index.
5260  SmallVector<Instruction *, 64> IdxToInstr;
5261  // Marks the end of each interval.
5262 IntervalMap EndPoint;
5263 // Saves the list of instruction indices that are used in the loop.
5264  SmallPtrSet<Instruction *, 8> Ends;
5265  // Saves the list of values that are used in the loop but are defined outside
5266 // the loop (not including non-instruction values such as arguments and
5267 // constants).
5268 SmallSetVector<Instruction *, 8> LoopInvariants;
5269
5270 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
5271 for (Instruction &I : BB->instructionsWithoutDebug()) {
5272 IdxToInstr.push_back(&I);
5273
5274 // Save the end location of each USE.
5275 for (Value *U : I.operands()) {
5276 auto *Instr = dyn_cast<Instruction>(U);
5277
5278 // Ignore non-instruction values such as arguments, constants, etc.
5279 // FIXME: Might need some motivation why these values are ignored. If
5280 // for example an argument is used inside the loop it will increase the
5281 // register pressure (so shouldn't we add it to LoopInvariants).
5282 if (!Instr)
5283 continue;
5284
5285 // If this instruction is outside the loop then record it and continue.
5286 if (!TheLoop->contains(Instr)) {
5287 LoopInvariants.insert(Instr);
5288 continue;
5289 }
5290
5291 // Overwrite previous end points.
5292 EndPoint[Instr] = IdxToInstr.size();
5293 Ends.insert(Instr);
5294 }
5295 }
5296 }
5297
5298 // Saves the list of intervals that end with the index in 'key'.
5299 using InstrList = SmallVector<Instruction *, 2>;
5300  SmallDenseMap<unsigned, InstrList, 16> TransposeEnds;
5301
5302 // Transpose the EndPoints to a list of values that end at each index.
5303 for (auto &Interval : EndPoint)
5304 TransposeEnds[Interval.second].push_back(Interval.first);
5305
5306 SmallPtrSet<Instruction *, 8> OpenIntervals;
5307  SmallVector<RegisterUsage, 8> RUs(VFs.size());
5308  SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
5309
5310 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
5311
5312 const auto &TTICapture = TTI;
5313 auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned {
5314 if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty) ||
5315 (VF.isScalable() &&
5316 !TTICapture.isElementTypeLegalForScalableVector(Ty)))
5317 return 0;
5318 return TTICapture.getRegUsageForType(VectorType::get(Ty, VF));
5319 };
5320
5321 for (unsigned int Idx = 0, Sz = IdxToInstr.size(); Idx < Sz; ++Idx) {
5322 Instruction *I = IdxToInstr[Idx];
5323
5324 // Remove all of the instructions that end at this location.
5325 InstrList &List = TransposeEnds[Idx];
5326 for (Instruction *ToRemove : List)
5327 OpenIntervals.erase(ToRemove);
5328
5329 // Ignore instructions that are never used within the loop.
5330 if (!Ends.count(I))
5331 continue;
5332
5333 // Skip ignored values.
5334 if (ValuesToIgnore.count(I))
5335 continue;
5336
5338
5339 // For each VF find the maximum usage of registers.
5340 for (unsigned J = 0, E = VFs.size(); J < E; ++J) {
5341 // Count the number of registers used, per register class, given all open
5342 // intervals.
5343 // Note that elements in this SmallMapVector will be default constructed
5344 // as 0. So we can use "RegUsage[ClassID] += n" in the code below even if
5345 // there is no previous entry for ClassID.
5347      SmallMapVector<unsigned, unsigned, 4> RegUsage;
5348 if (VFs[J].isScalar()) {
5349 for (auto *Inst : OpenIntervals) {
5350 unsigned ClassID =
5351 TTI.getRegisterClassForType(false, Inst->getType());
5352 // FIXME: The target might use more than one register for the type
5353 // even in the scalar case.
5354 RegUsage[ClassID] += 1;
5355 }
5356 } else {
5357        collectUniformsAndScalars(VFs[J]);
5358        for (auto *Inst : OpenIntervals) {
5359 // Skip ignored values for VF > 1.
5360 if (VecValuesToIgnore.count(Inst))
5361 continue;
5362 if (isScalarAfterVectorization(Inst, VFs[J])) {
5363 unsigned ClassID =
5364 TTI.getRegisterClassForType(false, Inst->getType());
5365 // FIXME: The target might use more than one register for the type
5366 // even in the scalar case.
5367 RegUsage[ClassID] += 1;
5368 } else {
5369 unsigned ClassID =
5370 TTI.getRegisterClassForType(true, Inst->getType());
5371 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[J]);
5372 }
5373 }
5374 }
5375
5376 for (const auto &Pair : RegUsage) {
5377 auto &Entry = MaxUsages[J][Pair.first];
5378 Entry = std::max(Entry, Pair.second);
5379 }
5380 }
5381
5382 LLVM_DEBUG(dbgs() << "LV(REG): At #" << Idx << " Interval # "
5383 << OpenIntervals.size() << '\n');
5384
5385 // Add the current instruction to the list of open intervals.
5386 OpenIntervals.insert(I);
5387 }
5388
5389 for (unsigned Idx = 0, End = VFs.size(); Idx < End; ++Idx) {
5390 // Note that elements in this SmallMapVector will be default constructed
5391 // as 0. So we can use "Invariant[ClassID] += n" in the code below even if
5392 // there is no previous entry for ClassID.
5394    SmallMapVector<unsigned, unsigned, 4> Invariant;
5395 for (auto *Inst : LoopInvariants) {
5396 // FIXME: The target might use more than one register for the type
5397 // even in the scalar case.
5398 bool IsScalar = all_of(Inst->users(), [&](User *U) {
5399 auto *I = cast<Instruction>(U);
5400 return TheLoop != LI->getLoopFor(I->getParent()) ||
5401 isScalarAfterVectorization(I, VFs[Idx]);
5402 });
5403
5404 ElementCount VF = IsScalar ? ElementCount::getFixed(1) : VFs[Idx];
5405 unsigned ClassID =
5406 TTI.getRegisterClassForType(VF.isVector(), Inst->getType());
5407 Invariant[ClassID] += GetRegUsage(Inst->getType(), VF);
5408 }
5409
5410 LLVM_DEBUG({
5411 dbgs() << "LV(REG): VF = " << VFs[Idx] << '\n';
5412 dbgs() << "LV(REG): Found max usage: " << MaxUsages[Idx].size()
5413 << " item\n";
5414 for (const auto &pair : MaxUsages[Idx]) {
5415 dbgs() << "LV(REG): RegisterClass: "
5416 << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5417 << " registers\n";
5418 }
5419 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
5420 << " item\n";
5421 for (const auto &pair : Invariant) {
5422 dbgs() << "LV(REG): RegisterClass: "
5423 << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5424 << " registers\n";
5425 }
5426 });
5427
5428 RU.LoopInvariantRegs = Invariant;
5429 RU.MaxLocalUsers = MaxUsages[Idx];
5430 RUs[Idx] = RU;
5431 }
5432
5433 return RUs;
5434}
5435
5436bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I,
5437 ElementCount VF) {
5438 // TODO: Cost model for emulated masked load/store is completely
5439 // broken. This hack guides the cost model to use an artificially
5440 // high enough value to practically disable vectorization with such
5441 // operations, except where previously deployed legality hack allowed
5442 // using very low cost values. This is to avoid regressions coming simply
5443 // from moving "masked load/store" check from legality to cost model.
5444 // Masked Load/Gather emulation was previously never allowed.
5445 // Limited number of Masked Store/Scatter emulation was allowed.
5446  assert(isPredicatedInst(I) &&
5447         "Expecting a scalar emulated instruction");
5448 return isa<LoadInst>(I) ||
5449 (isa<StoreInst>(I) &&
5450 NumPredStores > NumberOfStoresToPredicate);
5451}
5452
5453void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
5454  // If we aren't vectorizing the loop, or if we've already collected the
5455 // instructions to scalarize, there's nothing to do. Collection may already
5456 // have occurred if we have a user-selected VF and are now computing the
5457 // expected cost for interleaving.
5458 if (VF.isScalar() || VF.isZero() || InstsToScalarize.contains(VF))
5459 return;
5460
5461  // Initialize a mapping for VF in InstsToScalarize. If we find that it's
5462 // not profitable to scalarize any instructions, the presence of VF in the
5463 // map will indicate that we've analyzed it already.
5464 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
5465
5466 PredicatedBBsAfterVectorization[VF].clear();
5467
5468 // Find all the instructions that are scalar with predication in the loop and
5469 // determine if it would be better to not if-convert the blocks they are in.
5470 // If so, we also record the instructions to scalarize.
5471 for (BasicBlock *BB : TheLoop->blocks()) {
5472    if (!blockNeedsPredicationForAnyReason(BB))
5473      continue;
5474 for (Instruction &I : *BB)
5475 if (isScalarWithPredication(&I, VF)) {
5476 ScalarCostsTy ScalarCosts;
5477 // Do not apply discount logic for:
5478 // 1. Scalars after vectorization, as there will only be a single copy
5479 // of the instruction.
5480 // 2. Scalable VF, as that would lead to invalid scalarization costs.
5481 // 3. Emulated masked memrefs, if a hacked cost is needed.
5482 if (!isScalarAfterVectorization(&I, VF) && !VF.isScalable() &&
5483 !useEmulatedMaskMemRefHack(&I, VF) &&
5484 computePredInstDiscount(&I, ScalarCosts, VF) >= 0) {
5485 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
5486 // Check if we decided to scalarize a call. If so, update the widening
5487 // decision of the call to CM_Scalarize with the computed scalar cost.
5488 for (const auto &[I, _] : ScalarCosts) {
5489 auto *CI = dyn_cast<CallInst>(I);
5490 if (!CI || !CallWideningDecisions.contains({CI, VF}))
5491 continue;
5492 CallWideningDecisions[{CI, VF}].Kind = CM_Scalarize;
5493 CallWideningDecisions[{CI, VF}].Cost = ScalarCosts[CI];
5494 }
5495 }
5496 // Remember that BB will remain after vectorization.
5497 PredicatedBBsAfterVectorization[VF].insert(BB);
5498 for (auto *Pred : predecessors(BB)) {
5499 if (Pred->getSingleSuccessor() == BB)
5500 PredicatedBBsAfterVectorization[VF].insert(Pred);
5501 }
5502 }
5503 }
5504}
5505
5506InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
5507 Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
5508 assert(!isUniformAfterVectorization(PredInst, VF) &&
5509 "Instruction marked uniform-after-vectorization will be predicated");
5510
5511 // Initialize the discount to zero, meaning that the scalar version and the
5512 // vector version cost the same.
5513 InstructionCost Discount = 0;
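  // For example, if the vector form of a chain costs 10 and its scalarized
  // form costs 6 after scaling by the block probability, the discount is 4
  // and scalarizing the chain is considered beneficial.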
5514
5515 // Holds instructions to analyze. The instructions we visit are mapped in
5516 // ScalarCosts. Those instructions are the ones that would be scalarized if
5517 // we find that the scalar version costs less.
5519  SmallVector<Instruction *, 8> Worklist;
5520 // Returns true if the given instruction can be scalarized.
5521 auto CanBeScalarized = [&](Instruction *I) -> bool {
5522 // We only attempt to scalarize instructions forming a single-use chain
5523 // from the original predicated block that would otherwise be vectorized.
5524 // Although not strictly necessary, we give up on instructions we know will
5525 // already be scalar to avoid traversing chains that are unlikely to be
5526 // beneficial.
5527 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
5529 return false;
5530
5531 // If the instruction is scalar with predication, it will be analyzed
5532 // separately. We ignore it within the context of PredInst.
5533 if (isScalarWithPredication(I, VF))
5534 return false;
5535
5536 // If any of the instruction's operands are uniform after vectorization,
5537 // the instruction cannot be scalarized. This prevents, for example, a
5538 // masked load from being scalarized.
5539 //
5540 // We assume we will only emit a value for lane zero of an instruction
5541 // marked uniform after vectorization, rather than VF identical values.
5542 // Thus, if we scalarize an instruction that uses a uniform, we would
5543 // create uses of values corresponding to the lanes we aren't emitting code
5544 // for. This behavior can be changed by allowing getScalarValue to clone
5545 // the lane zero values for uniforms rather than asserting.
5546 for (Use &U : I->operands())
5547 if (auto *J = dyn_cast<Instruction>(U.get()))
5548 if (isUniformAfterVectorization(J, VF))
5549 return false;
5550
5551 // Otherwise, we can scalarize the instruction.
5552 return true;
5553 };
5554
5555 // Compute the expected cost discount from scalarizing the entire expression
5556 // feeding the predicated instruction. We currently only consider expressions
5557 // that are single-use instruction chains.
5558 Worklist.push_back(PredInst);
5559 while (!Worklist.empty()) {
5560 Instruction *I = Worklist.pop_back_val();
5561
5562 // If we've already analyzed the instruction, there's nothing to do.
5563 if (ScalarCosts.contains(I))
5564 continue;
5565
5566 // Compute the cost of the vector instruction. Note that this cost already
5567 // includes the scalarization overhead of the predicated instruction.
5568 InstructionCost VectorCost = getInstructionCost(I, VF);
5569
5570 // Compute the cost of the scalarized instruction. This cost is the cost of
5571 // the instruction as if it wasn't if-converted and instead remained in the
5572 // predicated block. We will scale this cost by block probability after
5573 // computing the scalarization overhead.
5574 InstructionCost ScalarCost =
5575        VF.getFixedValue() * getInstructionCost(I, ElementCount::getFixed(1));
5576
5577 // Compute the scalarization overhead of needed insertelement instructions
5578 // and phi nodes.
5580 if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) {
5581 ScalarCost += TTI.getScalarizationOverhead(
5582 cast<VectorType>(toVectorTy(I->getType(), VF)),
5583 APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ true,
5584 /*Extract*/ false, CostKind);
5585 ScalarCost +=
5586 VF.getFixedValue() * TTI.getCFInstrCost(Instruction::PHI, CostKind);
5587 }
5588
5589 // Compute the scalarization overhead of needed extractelement
5590 // instructions. For each of the instruction's operands, if the operand can
5591 // be scalarized, add it to the worklist; otherwise, account for the
5592 // overhead.
5593 for (Use &U : I->operands())
5594 if (auto *J = dyn_cast<Instruction>(U.get())) {
5595 assert(VectorType::isValidElementType(J->getType()) &&
5596 "Instruction has non-scalar type");
5597 if (CanBeScalarized(J))
5598 Worklist.push_back(J);
5599 else if (needsExtract(J, VF)) {
5600 ScalarCost += TTI.getScalarizationOverhead(
5601 cast<VectorType>(toVectorTy(J->getType(), VF)),
5602 APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ false,
5603 /*Extract*/ true, CostKind);
5604 }
5605 }
5606
5607 // Scale the total scalar cost by block probability.
5608 ScalarCost /= getReciprocalPredBlockProb();
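    // getReciprocalPredBlockProb() returns the assumed reciprocal execution
    // probability of a predicated block (2, i.e. executed every other
    // iteration), so the accumulated scalar cost is divided accordingly.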
5609
5610 // Compute the discount. A non-negative discount means the vector version
5611 // of the instruction costs more, and scalarizing would be beneficial.
5612 Discount += VectorCost - ScalarCost;
5613 ScalarCosts[I] = ScalarCost;
5614 }
5615
5616 return Discount;
5617}
5618
5621
5622 // If the vector loop gets executed exactly once with the given VF, ignore the
5623 // costs of comparison and induction instructions, as they'll get simplified
5624 // away.
5625 SmallPtrSet<Instruction *, 2> ValuesToIgnoreForVF;
5627 if (VF.isFixed() && TC == VF.getFixedValue() && !foldTailByMasking())
5629 ValuesToIgnoreForVF);
5630
5631 // For each block.
5632 for (BasicBlock *BB : TheLoop->blocks()) {
5633 InstructionCost BlockCost;
5634
5635 // For each instruction in the old loop.
5636 for (Instruction &I : BB->instructionsWithoutDebug()) {
5637 // Skip ignored values.
5638 if (ValuesToIgnore.count(&I) || ValuesToIgnoreForVF.count(&I) ||
5639 (VF.isVector() && VecValuesToIgnore.count(&I)))
5640 continue;
5641
5643      InstructionCost C = getInstructionCost(&I, VF);
5644 // Check if we should override the cost.
5645 if (C.isValid() && ForceTargetInstructionCost.getNumOccurrences() > 0)
5647
5648 BlockCost += C;
5649 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C << " for VF "
5650 << VF << " For instruction: " << I << '\n');
5651 }
5652
5653 // If we are vectorizing a predicated block, it will have been
5654 // if-converted. This means that the block's instructions (aside from
5655 // stores and instructions that may divide by zero) will now be
5656 // unconditionally executed. For the scalar case, we may not always execute
5657 // the predicated block, if it is an if-else block. Thus, scale the block's
5658 // cost by the probability of executing it. blockNeedsPredication from
5659 // Legal is used so as to not include all blocks in tail folded loops.
5660 if (VF.isScalar() && Legal->blockNeedsPredication(BB))
5661 BlockCost /= getReciprocalPredBlockProb();
5662
5663 Cost += BlockCost;
5664 }
5665
5666 return Cost;
5667}
5668
5669/// Gets Address Access SCEV after verifying that the access pattern
5670/// is loop invariant except the induction variable dependence.
5671///
5672/// This SCEV can be sent to the Target in order to estimate the address
5673/// calculation cost.
5674static const SCEV *getAddressAccessSCEV(
5675              Value *Ptr,
5676              LoopVectorizationLegality *Legal,
5677              PredicatedScalarEvolution &PSE,
5678 const Loop *TheLoop) {
5679
5680 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
5681 if (!Gep)
5682 return nullptr;
5683
5684 // We are looking for a gep with all loop invariant indices except for one
5685 // which should be an induction variable.
5686 auto *SE = PSE.getSE();
5687 unsigned NumOperands = Gep->getNumOperands();
5688 for (unsigned Idx = 1; Idx < NumOperands; ++Idx) {
5689 Value *Opd = Gep->getOperand(Idx);
5690 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
5691 !Legal->isInductionVariable(Opd))
5692 return nullptr;
5693 }
5694
5695 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV.
5696 return PSE.getSCEV(Ptr);
5697}
5698
5700LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
5701 ElementCount VF) {
5702 assert(VF.isVector() &&
5703 "Scalarization cost of instruction implies vectorization.");
5704 if (VF.isScalable())
5705    return InstructionCost::getInvalid();
5706
5707 Type *ValTy = getLoadStoreType(I);
5708 auto *SE = PSE.getSE();
5709
5710 unsigned AS = getLoadStoreAddressSpace(I);
5711  Value *Ptr = getLoadStorePointerOperand(I);
5712  Type *PtrTy = toVectorTy(Ptr->getType(), VF);
5713 // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost`
5714 // that it is being called from this specific place.
5715
5716  // Figure out whether the access is strided and get the stride value
5717  // if it's known at compile time.
5718 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
5719
5720 // Get the cost of the scalar memory instruction and address computation.
5721  InstructionCost Cost =
5722      VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
5723
5724 // Don't pass *I here, since it is scalar but will actually be part of a
5725 // vectorized loop where the user of it is a vectorized instruction.
5727 const Align Alignment = getLoadStoreAlignment(I);
5728 Cost += VF.getKnownMinValue() * TTI.getMemoryOpCost(I->getOpcode(),
5729 ValTy->getScalarType(),
5730 Alignment, AS, CostKind);
5731
5732 // Get the overhead of the extractelement and insertelement instructions
5733 // we might create due to scalarization.
5734 Cost += getScalarizationOverhead(I, VF, CostKind);
5735
5736 // If we have a predicated load/store, it will need extra i1 extracts and
5737 // conditional branches, but may not be executed for each vector lane. Scale
5738 // the cost by the probability of executing the predicated block.
5739 if (isPredicatedInst(I)) {
5740    Cost /= getReciprocalPredBlockProb();
5741
5742 // Add the cost of an i1 extract and a branch
5743    auto *VecI1Ty =
5744        VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF);
5745    Cost += TTI.getScalarizationOverhead(
5746        VecI1Ty, APInt::getAllOnes(VF.getKnownMinValue()),
5747 /*Insert=*/false, /*Extract=*/true, CostKind);
5748 Cost += TTI.getCFInstrCost(Instruction::Br, CostKind);
5749
5750 if (useEmulatedMaskMemRefHack(I, VF))
5751 // Artificially setting to a high enough value to practically disable
5752 // vectorization with such operations.
5753 Cost = 3000000;
5754 }
5755
5756 return Cost;
5757}
5758
5760LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
5761 ElementCount VF) {
5762 Type *ValTy = getLoadStoreType(I);
5763 auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
5764  Value *Ptr = getLoadStorePointerOperand(I);
5765  unsigned AS = getLoadStoreAddressSpace(I);
5766 int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr);
5768
5769 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
5770 "Stride should be 1 or -1 for consecutive memory access");
5771 const Align Alignment = getLoadStoreAlignment(I);
5772  InstructionCost Cost = 0;
5773  if (Legal->isMaskRequired(I)) {
5774 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
5775 CostKind);
5776 } else {
5777 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
5778 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
5779 CostKind, OpInfo, I);
5780 }
5781
5782 bool Reverse = ConsecutiveStride < 0;
5783 if (Reverse)
5785 CostKind, 0);
5786 return Cost;
5787}
5788
5790LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
5791 ElementCount VF) {
5792 assert(Legal->isUniformMemOp(*I, VF));
5793
5794 Type *ValTy = getLoadStoreType(I);
5795 auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
5796 const Align Alignment = getLoadStoreAlignment(I);
5797 unsigned AS = getLoadStoreAddressSpace(I);
5799 if (isa<LoadInst>(I)) {
5800 return TTI.getAddressComputationCost(ValTy) +
5801 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
5802 CostKind) +
5804 }
5805 StoreInst *SI = cast<StoreInst>(I);
5806
5807 bool IsLoopInvariantStoreValue = Legal->isInvariant(SI->getValueOperand());
5808 return TTI.getAddressComputationCost(ValTy) +
5809 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
5810 CostKind) +
5811 (IsLoopInvariantStoreValue
5812 ? 0
5813 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
5814 CostKind, VF.getKnownMinValue() - 1));
5815}
5816
5818LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
5819 ElementCount VF) {
5820 Type *ValTy = getLoadStoreType(I);
5821 auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
5822 const Align Alignment = getLoadStoreAlignment(I);
5824
5825 return TTI.getAddressComputationCost(VectorTy) +
5827 I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
5829}
5830
5832LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
5833 ElementCount VF) {
5834 const auto *Group = getInterleavedAccessGroup(I);
5835 assert(Group && "Fail to get an interleaved access group.");
5836
5837 Instruction *InsertPos = Group->getInsertPos();
5838 Type *ValTy = getLoadStoreType(InsertPos);
5839 auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
5840 unsigned AS = getLoadStoreAddressSpace(InsertPos);
5842
5843 unsigned InterleaveFactor = Group->getFactor();
5844 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
5845
5846 // Holds the indices of existing members in the interleaved group.
5847  SmallVector<unsigned, 4> Indices;
5848  for (unsigned IF = 0; IF < InterleaveFactor; IF++)
5849 if (Group->getMember(IF))
5850 Indices.push_back(IF);
5851
5852 // Calculate the cost of the whole interleaved group.
5853 bool UseMaskForGaps =
5854 (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
5855 (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()));
5856  InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
5857      InsertPos->getOpcode(), WideVecTy, Group->getFactor(), Indices,
5858 Group->getAlign(), AS, CostKind, Legal->isMaskRequired(I),
5859 UseMaskForGaps);
5860
5861 if (Group->isReverse()) {
5862 // TODO: Add support for reversed masked interleaved access.
5863    assert(!Legal->isMaskRequired(I) &&
5864           "Reverse masked interleaved access not supported.");
5865 Cost += Group->getNumMembers() *
5867 CostKind, 0);
5868 }
5869 return Cost;
5870}
5871
5872std::optional<InstructionCost>
5874 Instruction *I, ElementCount VF, Type *Ty,
5876 using namespace llvm::PatternMatch;
5877  // Early exit if there are no in-loop reductions.
5878 if (InLoopReductions.empty() || VF.isScalar() || !isa<VectorType>(Ty))
5879 return std::nullopt;
5880 auto *VectorTy = cast<VectorType>(Ty);
5881
5882  // We are looking for one of the following patterns and compute the minimal acceptable cost for it:
5883 // reduce(mul(ext(A), ext(B))) or
5884 // reduce(mul(A, B)) or
5885 // reduce(ext(A)) or
5886 // reduce(A).
5887 // The basic idea is that we walk down the tree to do that, finding the root
5888 // reduction instruction in InLoopReductionImmediateChains. From there we find
5889 // the pattern of mul/ext and test the cost of the entire pattern vs the cost
5890  // of the components. If the reduction cost is lower, then we return it for
5891  // the reduction instruction and 0 for the other instructions in the pattern.
5892  // If it is not, we return an invalid cost specifying that the original cost
5893  // method should be used.
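  // For example, a scalar chain of the form
  //   %a.ext = sext i8 %a to i32
  //   %b.ext = sext i8 %b to i32
  //   %mul   = mul i32 %a.ext, %b.ext
  //   %sum   = add i32 %mul, %phi   ; in-loop reduction
  // matches reduce.add(mul(ext(A), ext(B))) and may be costed as a single
  // extended multiply-accumulate reduction rather than as separate ext, mul
  // and add instructions.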
5894 Instruction *RetI = I;
5895 if (match(RetI, m_ZExtOrSExt(m_Value()))) {
5896 if (!RetI->hasOneUser())
5897 return std::nullopt;
5898 RetI = RetI->user_back();
5899 }
5900
5901 if (match(RetI, m_OneUse(m_Mul(m_Value(), m_Value()))) &&
5902 RetI->user_back()->getOpcode() == Instruction::Add) {
5903 RetI = RetI->user_back();
5904 }
5905
5906 // Test if the found instruction is a reduction, and if not return an invalid
5907 // cost specifying the parent to use the original cost modelling.
5908 if (!InLoopReductionImmediateChains.count(RetI))
5909 return std::nullopt;
5910
5911 // Find the reduction this chain is a part of and calculate the basic cost of
5912 // the reduction on its own.
5913 Instruction *LastChain = InLoopReductionImmediateChains.at(RetI);
5914 Instruction *ReductionPhi = LastChain;
5915 while (!isa<PHINode>(ReductionPhi))
5916 ReductionPhi = InLoopReductionImmediateChains.at(ReductionPhi);
5917
5918 const RecurrenceDescriptor &RdxDesc =
5919 Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second;
5920
5921 InstructionCost BaseCost;
5922 RecurKind RK = RdxDesc.getRecurrenceKind();
5925 BaseCost = TTI.getMinMaxReductionCost(MinMaxID, VectorTy,
5926 RdxDesc.getFastMathFlags(), CostKind);
5927 } else {
5929 RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind);
5930 }
5931
5932 // For a call to the llvm.fmuladd intrinsic we need to add the cost of a
5933 // normal fmul instruction to the cost of the fadd reduction.
5934 if (RK == RecurKind::FMulAdd)
5935 BaseCost +=
5936 TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind);
5937
5938 // If we're using ordered reductions then we can just return the base cost
5939 // here, since getArithmeticReductionCost calculates the full ordered
5940 // reduction cost when FP reassociation is not allowed.
5941 if (useOrderedReductions(RdxDesc))
5942 return BaseCost;
5943
5944 // Get the operand that was not the reduction chain and match it to one of the
5945 // patterns, returning the better cost if it is found.
5946 Instruction *RedOp = RetI->getOperand(1) == LastChain
5947 ? dyn_cast<Instruction>(RetI->getOperand(0))
5948 : dyn_cast<Instruction>(RetI->getOperand(1));
5949
5950 VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy);
5951
5952 Instruction *Op0, *Op1;
5953 if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
5954 match(RedOp,
5956 match(Op0, m_ZExtOrSExt(m_Value())) &&
5957 Op0->getOpcode() == Op1->getOpcode() &&
5958 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
5960 (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {
5961
5962 // Matched reduce.add(ext(mul(ext(A), ext(B)))
5963 // Note that the extend opcodes need to all match, or if A==B they will have
5964 // been converted to zext(mul(sext(A), sext(A))) as it is known positive,
5965 // which is equally fine.
5966 bool IsUnsigned = isa<ZExtInst>(Op0);
5967 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy);
5968 auto *MulType = VectorType::get(Op0->getType(), VectorTy);
5969
5970 InstructionCost ExtCost =
5971 TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType,
5973 InstructionCost MulCost =
5974 TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind);
5975 InstructionCost Ext2Cost =
5976 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType,
5978
5980 IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind);
5981
5982 if (RedCost.isValid() &&
5983 RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost)
5984 return I == RetI ? RedCost : 0;
5985 } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) &&
5986 !TheLoop->isLoopInvariant(RedOp)) {
5987 // Matched reduce(ext(A))
5988 bool IsUnsigned = isa<ZExtInst>(RedOp);
5989 auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy);
5991 RdxDesc.getOpcode(), IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
5992 RdxDesc.getFastMathFlags(), CostKind);
5993
5994 InstructionCost ExtCost =
5995 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType,
5997 if (RedCost.isValid() && RedCost < BaseCost + ExtCost)
5998 return I == RetI ? RedCost : 0;
5999 } else if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
6000 match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) {
6001 if (match(Op0, m_ZExtOrSExt(m_Value())) &&
6002 Op0->getOpcode() == Op1->getOpcode() &&
6004 bool IsUnsigned = isa<ZExtInst>(Op0);
6005 Type *Op0Ty = Op0->getOperand(0)->getType();
6006 Type *Op1Ty = Op1->getOperand(0)->getType();
6007 Type *LargestOpTy =
6008 Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty
6009 : Op0Ty;
6010 auto *ExtType = VectorType::get(LargestOpTy, VectorTy);
6011
6012 // Matched reduce.add(mul(ext(A), ext(B))), where the two ext may be of
6013 // different sizes. We take the largest type as the ext to reduce, and add
6014 // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))).
6016 Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy),
6019 Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy),
6021 InstructionCost MulCost =
6022 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6023
6025 IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind);
6026 InstructionCost ExtraExtCost = 0;
6027 if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) {
6028 Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1;
6029 ExtraExtCost = TTI.getCastInstrCost(
6030 ExtraExtOp->getOpcode(), ExtType,
6031 VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy),
6033 }
6034
6035 if (RedCost.isValid() &&
6036 (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost))
6037 return I == RetI ? RedCost : 0;
6038 } else if (!match(I, m_ZExtOrSExt(m_Value()))) {
6039 // Matched reduce.add(mul())
6040 InstructionCost MulCost =
6041 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6042
6044 true, RdxDesc.getRecurrenceType(), VectorTy, CostKind);
6045
6046 if (RedCost.isValid() && RedCost < MulCost + BaseCost)
6047 return I == RetI ? RedCost : 0;
6048 }
6049 }
6050
6051 return I == RetI ? std::optional<InstructionCost>(BaseCost) : std::nullopt;
6052}
6053
6055LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
6056 ElementCount VF) {
6057 // Calculate scalar cost only. Vectorization cost should be ready at this
6058 // moment.
6059 if (VF.isScalar()) {
6060 Type *ValTy = getLoadStoreType(I);
6061 const Align Alignment = getLoadStoreAlignment(I);
6062 unsigned AS = getLoadStoreAddressSpace(I);
6063
6064 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
6065 return TTI.getAddressComputationCost(ValTy) +
6066 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
6067 TTI::TCK_RecipThroughput, OpInfo, I);
6068 }
6069 return getWideningCost(I, VF);
6070}
6071
6072InstructionCost LoopVectorizationCostModel::getScalarizationOverhead(
6074
6075 // There is no mechanism yet to create a scalable scalarization loop,
6076 // so this is currently Invalid.
6077 if (VF.isScalable())
6079
6080 if (VF.isScalar())
6081 return 0;
6082
6084 Type *RetTy = toVectorTy(I->getType(), VF);
6085 if (!RetTy->isVoidTy() &&
6086 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
6088 cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()),
6089 /*Insert*/ true,
6090 /*Extract*/ false, CostKind);
6091
6092 // Some targets keep addresses scalar.
6093 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
6094 return Cost;
6095
6096 // Some targets support efficient element stores.
6097 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
6098 return Cost;
6099
6100 // Collect operands to consider.
6101 CallInst *CI = dyn_cast<CallInst>(I);
6102 Instruction::op_range Ops = CI ? CI->args() : I->operands();
6103
6104 // Skip operands that do not require extraction/scalarization and do not incur
6105 // any overhead.
6107 for (auto *V : filterExtractingOperands(Ops, VF))
6108 Tys.push_back(maybeVectorizeType(V->getType(), VF));
6110 filterExtractingOperands(Ops, VF), Tys, CostKind);
6111}
6112
6114 if (VF.isScalar())
6115 return;
6116 NumPredStores = 0;
6117 for (BasicBlock *BB : TheLoop->blocks()) {
6118 // For each instruction in the old loop.
6119 for (Instruction &I : *BB) {
6121 if (!Ptr)
6122 continue;
6123
6124 // TODO: We should generate better code and update the cost model for
6125 // predicated uniform stores. Today they are treated as any other
6126 // predicated store (see added test cases in
6127 // invariant-store-vectorization.ll).
6128 if (isa<StoreInst>(&I) && isScalarWithPredication(&I, VF))
6129 NumPredStores++;
6130
6131 if (Legal->isUniformMemOp(I, VF)) {
6132 auto IsLegalToScalarize = [&]() {
6133 if (!VF.isScalable())
6134 // Scalarization of fixed length vectors "just works".
6135 return true;
6136
6137 // We have dedicated lowering for unpredicated uniform loads and
6138 // stores. Note that even with tail folding we know that at least
6139 // one lane is active (i.e. generalized predication is not possible
6140 // here), and the logic below depends on this fact.
6141 if (!foldTailByMasking())
6142 return true;
6143
6144 // For scalable vectors, a uniform memop load is always
6145 // uniform-by-parts and we know how to scalarize that.
6146 if (isa<LoadInst>(I))
6147 return true;
6148
6149 // A uniform store isn't necessarily uniform-by-parts
6150 // and we can't assume scalarization.
6151 auto &SI = cast<StoreInst>(I);
6152 return TheLoop->isLoopInvariant(SI.getValueOperand());
6153 };
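// Illustrative cases (not from the source): under tail folding with a
// scalable VF, a uniform store of a loop-invariant value such as "*p = c" can
// still be scalarized, whereas "*p = a[i]" cannot, roughly because the value
// of the last active lane would be required.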
6154
6155 const InstructionCost GatherScatterCost =
6157 getGatherScatterCost(&I, VF) : InstructionCost::getInvalid();
6158
6159 // Load: Scalar load + broadcast
6160 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
6161 // FIXME: This cost is a significant under-estimate for tail folded
6162 // memory ops.
6163 const InstructionCost ScalarizationCost =
6164 IsLegalToScalarize() ? getUniformMemOpCost(&I, VF)
6166
6167 // Choose the better solution for the current VF. Note that Invalid
6168 // costs compare as maximally large. If both are invalid, we end up with
6169 // an invalid cost, which signals a failure and a vectorization abort.
6170 if (GatherScatterCost < ScalarizationCost)
6171 setWideningDecision(&I, VF, CM_GatherScatter, GatherScatterCost);
6172 else
6173 setWideningDecision(&I, VF, CM_Scalarize, ScalarizationCost);
6174 continue;
6175 }
6176
6177 // We assume that widening is the best solution when possible.
6178 if (memoryInstructionCanBeWidened(&I, VF)) {
6179 InstructionCost Cost = getConsecutiveMemOpCost(&I, VF);
6180 int ConsecutiveStride = Legal->isConsecutivePtr(
6182 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6183 "Expected consecutive stride.");
6184 InstWidening Decision =
6185 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
6186 setWideningDecision(&I, VF, Decision, Cost);
6187 continue;
6188 }
6189
6190 // Choose between Interleaving, Gather/Scatter or Scalarization.
6192 unsigned NumAccesses = 1;
6193 if (isAccessInterleaved(&I)) {
6194 const auto *Group = getInterleavedAccessGroup(&I);
6195 assert(Group && "Fail to get an interleaved access group.");
6196
6197 // Make one decision for the whole group.
6198 if (getWideningDecision(&I, VF) != CM_Unknown)
6199 continue;
6200
6201 NumAccesses = Group->getNumMembers();
6203 InterleaveCost = getInterleaveGroupCost(&I, VF);
6204 }
6205
6206 InstructionCost GatherScatterCost =
6208 ? getGatherScatterCost(&I, VF) * NumAccesses
6210
6211 InstructionCost ScalarizationCost =
6212 getMemInstScalarizationCost(&I, VF) * NumAccesses;
6213
6214 // Choose the better solution for the current VF, write down this
6215 // decision, and use it during vectorization.
6217 InstWidening Decision;
6218 if (InterleaveCost <= GatherScatterCost &&
6219 InterleaveCost < ScalarizationCost) {
6220 Decision = CM_Interleave;
6221 Cost = InterleaveCost;
6222 } else if (GatherScatterCost < ScalarizationCost) {
6223 Decision = CM_GatherScatter;
6224 Cost = GatherScatterCost;
6225 } else {
6226 Decision = CM_Scalarize;
6227 Cost = ScalarizationCost;
6228 }
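// Worked example with made-up costs: InterleaveCost=6, GatherScatterCost=6,
// ScalarizationCost=8 selects CM_Interleave; the "<=" above prefers
// interleaving when it ties with gather/scatter.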
6229 // If the instruction belongs to an interleave group, the whole group
6230 // receives the same decision. The whole group receives the cost, but
6231 // the cost will actually be assigned to one instruction.
6232 if (const auto *Group = getInterleavedAccessGroup(&I))
6233 setWideningDecision(Group, VF, Decision, Cost);
6234 else
6235 setWideningDecision(&I, VF, Decision, Cost);
6236 }
6237 }
6238
6239 // Make sure that any load of an address and any other address computation
6240 // remains scalar unless there is gather/scatter support. This avoids
6241 // inevitable extracts into address registers, and also has the benefit of
6242 // activating LSR more, since that pass can't optimize vectorized
6243 // addresses.
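// For example, a loaded value that only feeds a GEP used as a load/store
// pointer is better kept scalar; widening it would force an extract for every
// address computation.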
6245 return;
6246
6247 // Start with all scalar pointer uses.
6249 for (BasicBlock *BB : TheLoop->blocks())
6250 for (Instruction &I : *BB) {
6251 Instruction *PtrDef =
6252 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
6253 if (PtrDef && TheLoop->contains(PtrDef) &&
6255 AddrDefs.insert(PtrDef);
6256 }
6257
6258 // Add all instructions used to generate the addresses.
6260 append_range(Worklist, AddrDefs);
6261 while (!Worklist.empty()) {
6262 Instruction *I = Worklist.pop_back_val();
6263 for (auto &Op : I->operands())
6264 if (auto *InstOp = dyn_cast<Instruction>(Op))
6265 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
6266 AddrDefs.insert(InstOp).second)
6267 Worklist.push_back(InstOp);
6268 }
6269
6270 for (auto *I : AddrDefs) {
6271 if (isa<LoadInst>(I)) {
6272 // Setting the desired widening decision should ideally be handled
6273 // by cost functions, but since this involves the task of finding out
6274 // if the loaded register is involved in an address computation, it is
6275 // instead changed here when we know this is the case.
6276 InstWidening Decision = getWideningDecision(I, VF);
6277 if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
6278 // Scalarize a widened load of an address.
6280 I, VF, CM_Scalarize,
6281 (VF.getKnownMinValue() *
6282 getMemoryInstructionCost(I, ElementCount::getFixed(1))));
6283 else if (const auto *Group = getInterleavedAccessGroup(I)) {
6284 // Scalarize an interleave group of address loads.
6285 for (unsigned I = 0; I < Group->getFactor(); ++I) {
6286 if (Instruction *Member = Group->getMember(I))
6288 Member, VF, CM_Scalarize,
6289 (VF.getKnownMinValue() *
6290 getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
6291 }
6292 }
6293 } else
6294 // Make sure I gets scalarized and gets a cost estimate without
6295 // scalarization overhead.
6296 ForcedScalars[VF].insert(I);
6297 }
6298}
6299
6301 assert(!VF.isScalar() &&
6302 "Trying to set a vectorization decision for a scalar VF");
6303
6304 auto ForcedScalar = ForcedScalars.find(VF);
6305 for (BasicBlock *BB : TheLoop->blocks()) {
6306 // For each instruction in the old loop.
6307 for (Instruction &I : *BB) {
6308 CallInst *CI = dyn_cast<CallInst>(&I);
6309
6310 if (!CI)
6311 continue;
6312
6317 Function *ScalarFunc = CI->getCalledFunction();
6318 Type *ScalarRetTy = CI->getType();
6319 SmallVector<Type *, 4> Tys, ScalarTys;
6320 for (auto &ArgOp : CI->args())
6321 ScalarTys.push_back(ArgOp->getType());
6322
6323 // Estimate cost of scalarized vector call. The source operands are
6324 // assumed to be vectors, so we need to extract individual elements from
6325 // there, execute VF scalar calls, and then gather the result into the
6326 // vector return value.
6327 InstructionCost ScalarCallCost =
6328 TTI.getCallInstrCost(ScalarFunc, ScalarRetTy, ScalarTys, CostKind);
6329
6330 // Compute costs of unpacking argument values for the scalar calls and
6331 // packing the return values to a vector.
6332 InstructionCost ScalarizationCost =
6333 getScalarizationOverhead(CI, VF, CostKind);
6334
6335 ScalarCost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
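// E.g., with hypothetical numbers at VF=4: a scalar call of cost 10 and a
// scalarization overhead of 6 give ScalarCost = 4 * 10 + 6 = 46.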
6336 // Honor ForcedScalars and UniformAfterVectorization decisions.
6337 // TODO: For calls, it might still be more profitable to widen. Use
6338 // VPlan-based cost model to compare different options.
6339 if (VF.isVector() && ((ForcedScalar != ForcedScalars.end() &&
6340 ForcedScalar->second.contains(CI)) ||
6341 isUniformAfterVectorization(CI, VF))) {
6342 setCallWideningDecision(CI, VF, CM_Scalarize, nullptr,
6343 Intrinsic::not_intrinsic, std::nullopt,
6344 ScalarCost);
6345 continue;
6346 }
6347
6348 bool MaskRequired = Legal->isMaskRequired(CI);
6349 // Compute corresponding vector type for return value and arguments.
6350 Type *RetTy = toVectorTy(ScalarRetTy, VF);
6351 for (Type *ScalarTy : ScalarTys)
6352 Tys.push_back(toVectorTy(ScalarTy, VF));
6353
6354 // An in-loop reduction using an fmuladd intrinsic is a special case;
6355 // we don't want the normal cost for that intrinsic.
6357 if (auto RedCost = getReductionPatternCost(CI, VF, RetTy, CostKind)) {
6360 std::nullopt, *RedCost);
6361 continue;
6362 }
6363
6364 // Find the cost of vectorizing the call, if we can find a suitable
6365 // vector variant of the function.
6366 bool UsesMask = false;
6367 VFInfo FuncInfo;
6368 Function *VecFunc = nullptr;
6369 // Search through any available variants for one we can use at this VF.
6370 for (VFInfo &Info : VFDatabase::getMappings(*CI)) {
6371 // Must match requested VF.
6372 if (Info.Shape.VF != VF)
6373 continue;
6374
6375 // Must take a mask argument if one is required
6376 if (MaskRequired && !Info.isMasked())
6377 continue;
6378
6379 // Check that all parameter kinds are supported
6380 bool ParamsOk = true;
6381 for (VFParameter Param : Info.Shape.Parameters) {
6382 switch (Param.ParamKind) {
6384 break;
6386 Value *ScalarParam = CI->getArgOperand(Param.ParamPos);
6387 // Make sure the scalar parameter in the loop is invariant.
6388 if (!PSE.getSE()->isLoopInvariant(PSE.getSCEV(ScalarParam),
6389 TheLoop))
6390 ParamsOk = false;
6391 break;
6392 }
6394 Value *ScalarParam = CI->getArgOperand(Param.ParamPos);
6395 // Find the stride for the scalar parameter in this loop and see if
6396 // it matches the stride for the variant.
6397 // TODO: do we need to figure out the cost of an extract to get the
6398 // first lane? Or do we hope that it will be folded away?
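// Sketch with a hypothetical variant: a mapping declared linear with step 4
// for this parameter is only usable if the argument's SCEV is an
// add-recurrence in TheLoop whose constant step is 4, which is what the
// checks below verify.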
6399 ScalarEvolution *SE = PSE.getSE();
6400 const auto *SAR =
6401 dyn_cast<SCEVAddRecExpr>(SE->getSCEV(ScalarParam));
6402
6403 if (!SAR || SAR->getLoop() != TheLoop) {
6404 ParamsOk = false;
6405 break;
6406 }
6407
6408 const SCEVConstant *Step =
6409 dyn_cast<SCEVConstant>(SAR->getStepRecurrence(*SE));
6410
6411 if (!Step ||
6412 Step->getAPInt().getSExtValue() != Param.LinearStepOrPos)
6413 ParamsOk = false;
6414
6415 break;
6416 }
6418 UsesMask = true;
6419 break;
6420 default:
6421 ParamsOk = false;
6422 break;
6423 }
6424 }
6425
6426 if (!ParamsOk)
6427 continue;
6428
6429 // Found a suitable candidate, stop here.
6430 VecFunc = CI->getModule()->getFunction(Info.VectorName);
6431 FuncInfo = Info;
6432 break;
6433 }
6434
6435 // Add in the cost of synthesizing a mask if one wasn't required.
6436 InstructionCost MaskCost = 0;
6437 if (VecFunc && UsesMask && !MaskRequired)
6438 MaskCost = TTI.getShuffleCost(
6441 VecFunc->getFunctionType()->getContext()),
6442 VF));
6443
6444 if (TLI && VecFunc && !CI->isNoBuiltin())
6445 VectorCost =
6446 TTI.getCallInstrCost(nullptr, RetTy, Tys, CostKind) + MaskCost;
6447
6448 // Find the cost of an intrinsic; some targets may have instructions that
6449 // perform the operation without needing an actual call.
6451 if (IID != Intrinsic::not_intrinsic)
6452 IntrinsicCost = getVectorIntrinsicCost(CI, VF);
6453
6454 InstructionCost Cost = ScalarCost;
6455 InstWidening Decision = CM_Scalarize;
6456
6457 if (VectorCost <= Cost) {
6458 Cost = VectorCost;
6459 Decision = CM_VectorCall;
6460 }
6461
6462 if (IntrinsicCost <= Cost) {
6463 Cost = IntrinsicCost;
6464 Decision = CM_IntrinsicCall;
6465 }
6466
6467 setCallWideningDecision(CI, VF, Decision, VecFunc, IID,
6469 }
6470 }
6471}
6472
6474 if (!Legal->isInvariant(Op))
6475 return false;
6476 // Consider Op invariant only if neither it nor its operands are predicated
6477 // instructions in the loop; otherwise it is not trivially hoistable.
6478 auto *OpI = dyn_cast<Instruction>(Op);
6479 return !OpI || !TheLoop->contains(OpI) ||
6480 (!isPredicatedInst(OpI) &&
6481 (!isa<PHINode>(OpI) || OpI->getParent() != TheLoop->getHeader()) &&
6482 all_of(OpI->operands(),
6483 [this](Value *Op) { return shouldConsiderInvariant(Op); }));
6484}
6485
6488 ElementCount VF) {
6489 // If we know that this instruction will remain uniform, check the cost of
6490 // the scalar version.
6492 VF = ElementCount::getFixed(1);
6493
6494 if (VF.isVector() && isProfitableToScalarize(I, VF))
6495 return InstsToScalarize[VF][I];
6496
6497 // Forced scalars do not have any scalarization overhead.
6498 auto ForcedScalar = ForcedScalars.find(VF);
6499 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
6500 auto InstSet = ForcedScalar->second;
6501 if (InstSet.count(I))
6503 VF.getKnownMinValue();
6504 }
6505
6506 Type *RetTy = I->getType();
6508 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
6509 auto *SE = PSE.getSE();
6511
6512 auto HasSingleCopyAfterVectorization = [this](Instruction *I,
6513 ElementCount VF) -> bool {
6514 if (VF.isScalar())
6515 return true;
6516
6517 auto Scalarized = InstsToScalarize.find(VF);
6518 assert(Scalarized != InstsToScalarize.end() &&
6519 "VF not yet analyzed for scalarization profitability");
6520 return !Scalarized->second.count(I) &&
6521 llvm::all_of(I->users(), [&](User *U) {
6522 auto *UI = cast<Instruction>(U);
6523 return !Scalarized->second.count(UI);
6524 });
6525 };
6526 (void)HasSingleCopyAfterVectorization;
6527
6528 Type *VectorTy;
6529 if (isScalarAfterVectorization(I, VF)) {
6530 // With the exception of GEPs and PHIs, after scalarization there should
6531 // only be one copy of the instruction generated in the loop. This is
6532 // because the VF is either 1, or any instructions that need scalarizing
6533 // have already been dealt with by the time we get here. As a result,
6535 // we don't have to multiply the instruction cost by VF.
6535 assert(I->getOpcode() == Instruction::GetElementPtr ||
6536 I->getOpcode() == Instruction::PHI ||
6537 (I->getOpcode() == Instruction::BitCast &&
6538 I->getType()->isPointerTy()) ||
6539 HasSingleCopyAfterVectorization(I, VF));
6540 VectorTy = RetTy;
6541 } else
6542 VectorTy = toVectorTy(RetTy, VF);
6543
6544 if (VF.isVector() && VectorTy->isVectorTy() &&
6545 !TTI.getNumberOfParts(VectorTy))
6547
6548 // TODO: We need to estimate the cost of intrinsic calls.
6549 switch (I->getOpcode()) {
6550 case Instruction::GetElementPtr:
6551 // We mark this instruction as zero-cost because the cost of GEPs in
6552 // vectorized code depends on whether the corresponding memory instruction
6553 // is scalarized or not. Therefore, we handle GEPs with the memory
6554 // instruction cost.
6555 return 0;
6556 case Instruction::Br: {
6557 // In cases of scalarized and predicated instructions, there will be VF
6558 // predicated blocks in the vectorized loop. Each branch around these
6559 // blocks also requires an extract of its vector compare i1 element.
6560 // Note that the conditional branch from the loop latch will be replaced by
6561 // a single branch controlling the loop, so there is no extra overhead from
6562 // scalarization.
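// E.g., at VF=4 this amounts to four i1 extracts from the vector compare plus
// four scalar branch costs, matching the expression returned below.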
6563 bool ScalarPredicatedBB = false;
6564 BranchInst *BI = cast<BranchInst>(I);
6565 if (VF.isVector() && BI->isConditional() &&
6566 (PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(0)) ||
6567 PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(1))) &&
6568 BI->getParent() != TheLoop->getLoopLatch())
6569 ScalarPredicatedBB = true;
6570
6571 if (ScalarPredicatedBB) {
6572 // Not possible to scalarize a scalable vector with predicated instructions.
6573 if (VF.isScalable())
6575 // Return cost for branches around scalarized and predicated blocks.
6576 auto *VecI1Ty =
6577 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
6578 return (
6580 VecI1Ty, APInt::getAllOnes(VF.getFixedValue()),
6581 /*Insert*/ false, /*Extract*/ true, CostKind) +
6582 (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue()));
6583 }
6584
6585 if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
6586 // The back-edge branch will remain, as will all scalar branches.
6587 return TTI.getCFInstrCost(Instruction::Br, CostKind);
6588
6589 // This branch will be eliminated by if-conversion.
6590 return 0;
6591 // Note: We currently assume zero cost for an unconditional branch inside
6592 // a predicated block since it will become a fall-through, although we
6593 // may decide in the future to call TTI for all branches.
6594 }
6595 case Instruction::Switch: {
6596 if (VF.isScalar())
6597 return TTI.getCFInstrCost(Instruction::Switch, CostKind);
6598 auto *Switch = cast<SwitchInst>(I);
6599 return Switch->getNumCases() *
6601 Instruction::ICmp,
6602 toVectorTy(Switch->getCondition()->getType(), VF),
6603 toVectorTy(Type::getInt1Ty(I->getContext()), VF),
6605 }
6606 case Instruction::PHI: {
6607 auto *Phi = cast<PHINode>(I);
6608
6609 // First-order recurrences are replaced by vector shuffles inside the loop.
6610 if (VF.isVector() && Legal->isFixedOrderRecurrence(Phi)) {
6611 // For <vscale x 1 x i64>, if vscale = 1 we are unable to extract the
6612 // penultimate value of the recurrence.
6613 // TODO: Consider vscale_range info.
6614 if (VF.isScalable() && VF.getKnownMinValue() == 1)
6617 std::iota(Mask.begin(), Mask.end(), VF.getKnownMinValue() - 1);
6619 cast<VectorType>(VectorTy), Mask, CostKind,
6620 VF.getKnownMinValue() - 1);
6621 }
6622
6623 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
6624 // converted into select instructions. We require N - 1 selects per phi
6625 // node, where N is the number of incoming values.
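// E.g., an if-converted phi merging values from three predecessors is lowered
// to two vector selects, so it is charged 2 * (select cost) at this VF.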
6626 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) {
6627 Type *ResultTy = Phi->getType();
6628
6629 // All instructions in an Any-of reduction chain are narrowed to bool.
6630 // Check if that is the case for this phi node.
6631 auto *HeaderUser = cast_if_present<PHINode>(
6632 find_singleton<User>(Phi->users(), [this](User *U, bool) -> User * {
6633 auto *Phi = dyn_cast<PHINode>(U);
6634 if (Phi && Phi->getParent() == TheLoop->getHeader())
6635 return Phi;
6636 return nullptr;
6637 }));
6638 if (HeaderUser) {
6639 auto &ReductionVars = Legal->getReductionVars();
6640 auto Iter = ReductionVars.find(HeaderUser);
6641 if (Iter != ReductionVars.end() &&
6643 Iter->second.getRecurrenceKind()))
6644 ResultTy = Type::getInt1Ty(Phi->getContext());
6645 }
6646 return (Phi->getNumIncomingValues() - 1) *
6648 Instruction::Select, toVectorTy(ResultTy, VF),
6649 toVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
6651 }
6652
6653 // When tail folding with EVL, if the phi is part of an out of loop
6654 // reduction then it will be transformed into a wide vp_merge.
6655 if (VF.isVector() && foldTailWithEVL() &&
6658 Intrinsic::vp_merge, toVectorTy(Phi->getType(), VF),
6659 {toVectorTy(Type::getInt1Ty(Phi->getContext()), VF)});
6660 return TTI.getIntrinsicInstrCost(ICA, CostKind);
6661 }
6662
6663 return TTI.getCFInstrCost(Instruction::PHI, CostKind);
6664 }
6665 case Instruction::UDiv:
6666 case Instruction::SDiv:
6667 case Instruction::URem:
6668 case Instruction::SRem:
6669 if (VF.isVector() && isPredicatedInst(I)) {
6670 const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
6671 return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost) ?
6672 ScalarCost : SafeDivisorCost;
6673 }
6674 // We've proven all lanes safe to speculate, fall through.
6675 [[fallthrough]];
6676 case Instruction::Add:
6677 case Instruction::Sub: {
6678 auto Info = Legal->getHistogramInfo(I);
6679 if (Info && VF.isVector()) {
6680 const HistogramInfo *HGram = Info.value();
6681 // Assume that a non-constant update value (or a constant != 1) requires
6682 // a multiply, and add that into the cost.
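// Illustrative update values (not from the source): "hist[idx] += 3" needs
// the extra multiply, while the common "hist[idx] += 1" does not.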
6684 ConstantInt *RHS = dyn_cast<ConstantInt>(I->getOperand(1));
6685 if (!RHS || RHS->getZExtValue() != 1)
6686 MulCost = TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy);
6687
6688 // Find the cost of the histogram operation itself.
6689 Type *PtrTy = VectorType::get(HGram->Load->getPointerOperandType(), VF);
6690 Type *ScalarTy = I->getType();
6691 Type *MaskTy = VectorType::get(Type::getInt1Ty(I->getContext()), VF);
6692 IntrinsicCostAttributes ICA(Intrinsic::experimental_vector_histogram_add,
6693 Type::getVoidTy(I->getContext()),
6694 {PtrTy, ScalarTy, MaskTy});
6695
6696 // Add the costs together with the add/sub operation.
6699 MulCost + TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy);
6700 }
6701 [[fallthrough]];
6702 }
6703 case Instruction::FAdd:
6704 case Instruction::FSub:
6705 case Instruction::Mul:
6706 case Instruction::FMul:
6707 case Instruction::FDiv:
6708 case Instruction::FRem:
6709 case Instruction::Shl:
6710 case Instruction::LShr:
6711 case Instruction::AShr:
6712 case Instruction::And:
6713 case Instruction::Or:
6714 case Instruction::Xor: {
6715 // If we're speculating on the stride being 1, the multiplication may
6716 // fold away. We can generalize this for all operations using the notion
6717 // of neutral elements. (TODO)
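// E.g., "i * Stride" where a runtime check already assumes Stride == 1: PSE
// then folds one operand's SCEV to the constant 1 and the multiply is costed
// as free.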
6718 if (I->getOpcode() == Instruction::Mul &&
6719 (PSE.getSCEV(I->getOperand(0))->isOne() ||
6720 PSE.getSCEV(I->getOperand(1))->isOne()))
6721 return 0;
6722
6723 // Detect reduction patterns
6724 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
6725 return *RedCost;
6726
6727 // Certain instructions can be cheaper to vectorize if they have a constant
6728 // second vector operand. One example of this is shifts on x86.
6729 Value *Op2 = I->getOperand(1);
6730 if (!isa<Constant>(Op2) && PSE.getSE()->isSCEVable(Op2->getType()) &&
6731 isa<SCEVConstant>(PSE.getSCEV(Op2))) {
6732 Op2 = cast<SCEVConstant>(PSE.getSCEV(Op2))->getValue();
6733 }
6734 auto Op2Info = TTI.getOperandInfo(Op2);
6735 if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
6738
6739 SmallVector<const Value *, 4> Operands(I->operand_values());
6741 I->getOpcode(), VectorTy, CostKind,
6742 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
6743 Op2Info, Operands, I, TLI);
6744 }
6745 case Instruction::FNeg: {
6747 I->getOpcode(), VectorTy, CostKind,
6748 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
6749 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
6750 I->getOperand(0), I);
6751 }
6752 case Instruction::Select: {
6753 SelectInst *SI = cast<SelectInst>(I);
6754 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
6755 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
6756
6757 const Value *Op0, *Op1;
6758 using namespace llvm::PatternMatch;
6759 if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) ||
6760 match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) {
6761 // select x, y, false --> x & y
6762 // select x, true, y --> x | y
6763 const auto [Op1VK, Op1VP] = TTI::getOperandInfo(Op0);
6764 const auto [Op2VK, Op2VP] = TTI::getOperandInfo(Op1);
6765 assert(Op0->getType()->getScalarSizeInBits() == 1 &&
6766 Op1->getType()->getScalarSizeInBits() == 1);
6767
6770 match(I, m_LogicalOr()) ? Instruction::Or : Instruction::And, VectorTy,
6771 CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, Operands, I);
6772 }
6773
6774 Type *CondTy = SI->getCondition()->getType();
6775 if (!ScalarCond)
6776 CondTy = VectorType::get(CondTy, VF);
6777
6779 if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition()))
6780 Pred = Cmp->getPredicate();
6781 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred,
6782 CostKind, {TTI::OK_AnyValue, TTI::OP_None},
6783 {TTI::OK_AnyValue, TTI::OP_None}, I);
6784 }
6785 case Instruction::ICmp:
6786 case Instruction::FCmp: {
6787 Type *ValTy = I->getOperand(0)->getType();
6788
6790 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
6791 (void)Op0AsInstruction;
6792 assert((!canTruncateToMinimalBitwidth(Op0AsInstruction, VF) ||
6793 MinBWs[I] == MinBWs[Op0AsInstruction]) &&
6794 "if both the operand and the compare are marked for "
6795 "truncation, they must have the same bitwidth");
6796 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[I]);
6797 }
6798
6799 VectorTy = toVectorTy(ValTy, VF);
6800 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr,
6801 cast<CmpInst>(I)->getPredicate(), CostKind,
6802 {TTI::OK_AnyValue, TTI::OP_None},
6803 {TTI::OK_AnyValue, TTI::OP_None}, I);
6804 }
6805 case Instruction::Store:
6806 case Instruction::Load: {
6807 ElementCount Width = VF;
6808 if (Width.isVector()) {
6809 InstWidening Decision = getWideningDecision(I, Width);
6810 assert(Decision != CM_Unknown &&
6811 "CM decision should be taken at this point");
6814 if (Decision == CM_Scalarize)
6815 Width = ElementCount::getFixed(1);
6816 }
6817 VectorTy = toVectorTy(getLoadStoreType(I), Width);
6818 return getMemoryInstructionCost(I, VF);
6819 }
6820 case Instruction::BitCast:
6821 if (I->getType()->isPointerTy())
6822 return 0;
6823 [[fallthrough]];
6824 case Instruction::ZExt:
6825 case Instruction::SExt:
6826 case Instruction::FPToUI:
6827 case Instruction::FPToSI:
6828 case Instruction::FPExt:
6829 case Instruction::PtrToInt:
6830 case Instruction::IntToPtr:
6831 case Instruction::SIToFP:
6832 case Instruction::UIToFP:
6833 case Instruction::Trunc:
6834 case Instruction::FPTrunc: {
6835 // Computes the CastContextHint from a Load/Store instruction.
6836 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
6837 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
6838 "Expected a load or a store!");
6839
6840 if (VF.isScalar() || !TheLoop->contains(I))
6842
6843 switch (getWideningDecision(I, VF)) {
6855 llvm_unreachable("Instr did not go through cost modelling?");
6858 llvm_unreachable_internal("Instr has invalid widening decision");
6859 }
6860
6861 llvm_unreachable("Unhandled case!");
6862 };
6863
6864 unsigned Opcode = I->getOpcode();
6866 // For Trunc, the context is the only user, which must be a StoreInst.
6867 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
6868 if (I->hasOneUse())
6869 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin()))
6870 CCH = ComputeCCH(Store);
6871 }
6872 // For Z/Sext, the context is the operand, which must be a LoadInst.
6873 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
6874 Opcode == Instruction::FPExt) {
6875 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0)))
6876 CCH = ComputeCCH(Load);
6877 }
6878
6879 // We optimize the truncation of induction variables having constant
6880 // integer steps. The cost of these truncations is the same as the scalar
6881 // operation.
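// E.g., "trunc i64 %iv to i32" for an induction with a constant step: the
// truncated value can be produced directly as an i32 induction, so only the
// scalar trunc cost is charged below.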
6882 if (isOptimizableIVTruncate(I, VF)) {
6883 auto *Trunc = cast<TruncInst>(I);
6884 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
6885 Trunc->getSrcTy(), CCH, CostKind, Trunc);
6886 }
6887
6888 // Detect reduction patterns
6889 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
6890 return *RedCost;
6891
6892 Type *SrcScalarTy = I->getOperand(0)->getType();
6893 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
6894 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
6895 SrcScalarTy =
6896 IntegerType::get(SrcScalarTy->getContext(), MinBWs[Op0AsInstruction]);
6897 Type *SrcVecTy =
6898 VectorTy->isVectorTy() ? toVectorTy(SrcScalarTy, VF) : SrcScalarTy;
6899
6901 // If the result type is <= the source type, there will be no extend
6902 // after truncating the users to the minimal required bitwidth.
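// E.g., if this zext's result has been narrowed via MinBWs to a width no
// larger than its source, the extend disappears once users are truncated,
// hence the zero cost.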
6903 if (VectorTy->getScalarSizeInBits() <= SrcVecTy->getScalarSizeInBits() &&
6904 (I->getOpcode() == Instruction::ZExt ||
6905 I->getOpcode() == Instruction::SExt))
6906 return 0;
6907 }
6908
6909 return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
6910 }
6911 case Instruction::Call:
6912 return getVectorCallCost(cast<CallInst>(I), VF);
6913 case Instruction::ExtractValue:
6915 case Instruction::Alloca:
6916 // We cannot easily widen alloca to a scalable alloca, as
6917 // the result would need to be a vector of pointers.
6918 if (VF.isScalable())
6920 [[fallthrough]];
6921 default:
6922 // This opcode is unknown. Assume that it is the same as 'mul'.
6923 return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6924 } // end of switch.
6925}
6926
6928 // Ignore ephemeral values.
6930
6931 SmallVector<Value *, 4> DeadInterleavePointerOps;
6933
6934 // If a scalar epilogue is required, users outside the loop won't use
6935 // live-outs from the vector loop but from the scalar epilogue. Ignore them if
6936 // that is the case.
6937 bool RequiresScalarEpilogue = requiresScalarEpilogue(true);
6938 auto IsLiveOutDead = [this, RequiresScalarEpilogue](User *U) {
6939 return RequiresScalarEpilogue &&
6940 !TheLoop->contains(cast<Instruction>(U)->getParent());
6941 };
6942
6944 DFS.perform(LI);
6945 MapVector<Value *, SmallVector<Value *>> DeadInvariantStoreOps;
6946 for (BasicBlock *BB : reverse(make_range(DFS.beginRPO(), DFS.endRPO())))
6947 for (Instruction &I : reverse(*BB)) {
6948 // Find all stores to invariant variables. Since they are going to sink
6949 // outside the loop, we do not need to calculate the cost for them.
6950 StoreInst *SI;
6951 if ((SI = dyn_cast<StoreInst>(&I)) &&
6952 Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) {
6953 ValuesToIgnore.insert(&I);
6954 DeadInvariantStoreOps[SI->getPointerOperand()].push_back(
6955 SI->getValueOperand());
6956 }
6957
6958 if (VecValuesToIgnore.contains(&I) || ValuesToIgnore.contains(&I))
6959 continue;
6960
6961 // Add instructions that would be trivially dead and are only used by
6962 // values already ignored to DeadOps, to seed the worklist.
6964 all_of(I.users(), [this, IsLiveOutDead](User *U) {
6965 return VecValuesToIgnore.contains(U) ||
6966 ValuesToIgnore.contains(U) || IsLiveOutDead(U);
6967 }))
6968 DeadOps.push_back(&I);
6969
6970 // For interleave groups, we only create a pointer for the start of the
6971 // interleave group. Queue up addresses of group members except the insert
6972 // position for further processing.
6973 if (isAccessInterleaved(&I)) {
6974 auto *Group = getInterleavedAccessGroup(&I);
6975 if (Group->getInsertPos() == &I)
6976 continue;
6977 Value *PointerOp = getLoadStorePointerOperand(&I);
6978 DeadInterleavePointerOps.push_back(PointerOp);
6979 }
6980
6981 // Queue branches for analysis. They are dead if their successors only
6982 // contain dead instructions.
6983 if (auto *Br = dyn_cast<BranchInst>(&I)) {
6984 if (Br->isConditional())
6985 DeadOps.push_back(&I);
6986 }
6987 }
6988
6989 // Mark ops feeding interleave group members as free, if they are only used
6990 // by other dead computations.
6991 for (unsigned I = 0; I != DeadInterleavePointerOps.size(); ++I) {
6992 auto *Op = dyn_cast<Instruction>(DeadInterleavePointerOps[I]);
6993 if (!Op || !TheLoop->contains(Op) || any_of(Op->users(), [this](User *U) {
6994 Instruction *UI = cast<Instruction>(U);
6995 return !VecValuesToIgnore.contains(U) &&
6996 (!isAccessInterleaved(UI) ||
6997 getInterleavedAccessGroup(UI)->getInsertPos() == UI);
6998 }))
6999 continue;
7000 VecValuesToIgnore.insert(Op);
7001 DeadInterleavePointerOps.append(Op->op_begin(), Op->op_end());
7002 }
7003
7004 for (const auto &[_, Ops] : DeadInvariantStoreOps) {
7005 for (Value *Op : ArrayRef(Ops).drop_back())
7006 DeadOps.push_back(Op);
7007 }
7008 // Mark ops that would be trivially dead and are only used by ignored
7009 // instructions as free.
7010 BasicBlock *Header = TheLoop->getHeader();
7011
7012 // Returns true if the block contains only dead instructions. Such blocks will
7013 // be removed by VPlan-to-VPlan transforms and won't be considered by the
7014 // VPlan-based cost model, so skip them in the legacy cost-model as well.
7015 auto IsEmptyBlock = [this](BasicBlock *BB) {
7016 return all_of(*BB, [this](Instruction &I) {
7017 return ValuesToIgnore.contains(&I) || VecValuesToIgnore.contains(&I) ||
7018 (isa<BranchInst>(&I) && !cast<BranchInst>(&I)->isConditional());
7019 });
7020 };
7021 for (unsigned I = 0; I != DeadOps.size(); ++I) {
7022 auto *Op = dyn_cast<Instruction>(DeadOps[I]);
7023
7024 // Check if the branch should be considered dead.
7025 if (auto *Br = dyn_cast_or_null<BranchInst>(Op)) {
7026 BasicBlock *ThenBB = Br->getSuccessor(0);
7027 BasicBlock *ElseBB = Br->getSuccessor(1);
7028 // Don't consider branches leaving the loop for simplification.
7029 if (!TheLoop->contains(ThenBB) || !TheLoop->contains(ElseBB))
7030 continue;
7031 bool ThenEmpty = IsEmptyBlock(ThenBB);
7032 bool ElseEmpty = IsEmptyBlock(ElseBB);
7033 if ((ThenEmpty && ElseEmpty) ||
7034 (ThenEmpty && ThenBB->getSingleSuccessor() == ElseBB &&
7035 ElseBB->phis().empty()) ||
7036 (ElseEmpty && ElseBB->getSingleSuccessor() == ThenBB &&
7037 ThenBB->phis().empty())) {
7038 VecValuesToIgnore.insert(Br);
7039 DeadOps.push_back(Br->getCondition());
7040 }
7041 continue;
7042 }
7043
7044 // Skip any op that shouldn't be considered dead.
7045 if (!Op || !TheLoop->contains(Op) ||
7046 (isa<PHINode>(Op) && Op->getParent() == Header) ||
7048 any_of(Op->users(), [this, IsLiveOutDead](User *U) {
7049 return !VecValuesToIgnore.contains(U) &&
7050 !ValuesToIgnore.contains(U) && !IsLiveOutDead(U);
7051 }))
7052 continue;
7053
7054 if (!TheLoop->contains(Op->getParent()))
7055 continue;
7056
7057 // If all of Op's users are in ValuesToIgnore, add it to ValuesToIgnore,
7058 // which applies to both scalar and vector versions. Otherwise it is only
7059 // dead in vector versions, so only add it to VecValuesToIgnore.
7060 if (all_of(Op->users(),
7061 [this](User *U) { return ValuesToIgnore.contains(U); }))
7062 ValuesToIgnore.insert(Op);
7063
7064 VecValuesToIgnore.insert(Op);
7065 DeadOps.append(Op->op_begin(), Op->op_end());
7066 }
7067
7068 // Ignore type-promoting instructions we identified during reduction
7069 // detection.
7070 for (const auto &Reduction : Legal->getReductionVars()) {
7071 const RecurrenceDescriptor &RedDes = Reduction.second;
7072 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
7073 VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7074 }
7075 // Ignore type-casting instructions we identified during induction
7076 // detection.
7077 for (const auto &Induction : Legal->getInductionVars()) {
7078 const InductionDescriptor &IndDes = Induction.second;
7079 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
7080 VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7081 }
7082}
7083
7085 for (const auto &Reduction : Legal->getReductionVars()) {
7086 PHINode *Phi = Reduction.first;
7087 const RecurrenceDescriptor &RdxDesc = Reduction.second;
7088
7089 // We don't collect reductions that are type promoted (yet).
7090 if (RdxDesc.getRecurrenceType() != Phi->getType())
7091 continue;
7092
7093 // If the target would prefer this reduction to happen "in-loop", then we
7094 // want to record it as such.
7095 unsigned Opcode = RdxDesc.getOpcode();
7096 if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) &&
7097 !TTI.preferInLoopReduction(Opcode, Phi->getType(),
7099 continue;
7100
7101 // Check that we can correctly put the reductions into the loop, by
7102 // finding the chain of operations that leads from the phi to the loop
7103 // exit value.
7104 SmallVector<Instruction *, 4> ReductionOperations =
7105 RdxDesc.getReductionOpChain(Phi, TheLoop);
7106 bool InLoop = !ReductionOperations.empty();
7107
7108 if (InLoop) {
7109 InLoopReductions.insert(Phi);
7110 // Add the elements to InLoopReductionImmediateChains for cost modelling.
7111 Instruction *LastChain = Phi;
7112 for (auto *I : ReductionOperations) {
7113 InLoopReductionImmediateChains[I] = LastChain;
7114 LastChain = I;
7115 }
7116 }
7117 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
7118 << " reduction for phi: " << *Phi << "\n");
7119 }
7120}
7121
7122// This function will select a scalable VF if the target supports scalable
7123// vectors and a fixed one otherwise.
7124// TODO: we could return a pair of values that specify the max VF and
7125// min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
7126 // `buildVPlans(VF, VF)`. We cannot do it because VPlan at the moment
7127// doesn't have a cost model that can choose which plan to execute if
7128// more than one is generated.
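// For instance, on a hypothetical target whose widest vector register is 128
// bits and whose widest loop type is i32, this picks N = 128 / 32 = 4, i.e. a
// VF of 4 (scalable if the register size itself is scalable).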
7131 unsigned WidestType;
7132 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
7133
7138
7140 unsigned N = RegSize.getKnownMinValue() / WidestType;
7141 return ElementCount::get(N, RegSize.isScalable());
7142}
7143
7146 ElementCount VF = UserVF;
7147 // Outer loop handling: outer loops may require CFG and instruction-level
7148 // transformations before even evaluating whether vectorization is profitable.
7149 // Since we cannot modify the incoming IR, we need to build VPlan upfront in
7150 // the vectorization pipeline.
7151 if (!OrigLoop->isInnermost()) {
7152 // If the user doesn't provide a vectorization factor, determine a
7153 // reasonable one.
7154 if (UserVF.isZero()) {
7155 VF = determineVPlanVF(TTI, CM);
7156 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
7157
7158 // Make sure we have a VF > 1 for stress testing.
7159 if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
7160 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
7161 << "overriding computed VF.\n");
7162 VF = ElementCount::getFixed(4);
7163 }
7164 } else if (UserVF.isScalable() && !TTI.supportsScalableVectors() &&
7166 LLVM_DEBUG(dbgs() << "LV: Not vectorizing. Scalable VF requested, but "
7167 << "not supported by the target.\n");
7169 "Scalable vectorization requested but not supported by the target",
7170 "the scalable user-specified vectorization width for outer-loop "
7171 "vectorization cannot be used because the target does not support "
7172 "scalable vectors.",
7173 "ScalableVFUnfeasible", ORE, OrigLoop);
7175 }
7176 assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7178 "VF needs to be a power of two");
7179 LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
7180 << "VF " << VF << " to build VPlans.\n");
7181 buildVPlans(VF, VF);
7182
7183 // For VPlan build stress testing, we bail out after VPlan construction.
7186
7187 return {VF, 0 /*Cost*/, 0 /* ScalarCost */};
7188 }
7189
7190 LLVM_DEBUG(
7191 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
7192 "VPlan-native path.\n");
7194}
7195
7196void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
7197 assert(OrigLoop->isInnermost() && "Inner loop expected.");
7200
7201 FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
7202 if (!MaxFactors) // Cases that should not be vectorized nor interleaved.
7203 return;
7204
7205 // Invalidate interleave groups if all blocks of the loop will be predicated.
7206 if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) &&
7208 LLVM_DEBUG(
7209 dbgs()
7210 << "LV: Invalidate all interleaved groups due to fold-tail by masking "
7211 "which requires masked-interleaved support.\n");
7213 // Invalidating interleave groups also requires invalidating all decisions
7214 // based on them, which includes widening decisions and uniform and scalar
7215 // values.
7217 }
7218
7219 if (CM.foldTailByMasking())
7221
7222 ElementCount MaxUserVF =
7223 UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
7224 if (UserVF) {
7225 if (!ElementCount::isKnownLE(UserVF, MaxUserVF)) {
7227 "UserVF ignored because it may be larger than the maximal safe VF",
7228 "InvalidUserVF", ORE, OrigLoop);
7229 } else {
7231 "VF needs to be a power of two");
7232 // Collect the instructions (and their associated costs) that will be more
7233 // profitable to scalarize.
7235 if (CM.selectUserVectorizationFactor(UserVF)) {
7236 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
7237 buildVPlansWithVPRecipes(UserVF, UserVF);
7239 return;
7240 }
7241 reportVectorizationInfo("UserVF ignored because of invalid costs.",
7242 "InvalidCost", ORE, OrigLoop);
7243 }
7244 }
7245
7246 // Collect the Vectorization Factor Candidates.
7247 SmallVector<ElementCount> VFCandidates;
7248 for (auto VF = ElementCount::getFixed(1);
7249 ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2)
7250 VFCandidates.push_back(VF);
7251 for (auto VF = ElementCount::getScalable(1);
7252 ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2)
7253 VFCandidates.push_back(VF);
7254
7256 for (const auto &VF : VFCandidates) {
7257 // Collect Uniform and Scalar instructions after vectorization with VF.
7259
7260 // Collect the instructions (and their associated costs) that will be more
7261 // profitable to scalarize.
7262 if (VF.isVector())
7264 }
7265
7266 buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF);
7267 buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF);
7268
7270}
7271
7273 ElementCount VF) const {
7274 if (ForceTargetInstructionCost.getNumOccurrences())
7275 return InstructionCost(ForceTargetInstructionCost.getNumOccurrences());
7276 return CM.getInstructionCost(UI, VF);
7277}
7278
7279bool VPCostContext::skipCostComputation(Instruction *UI, bool IsVector) const {
7280 return CM.ValuesToIgnore.contains(UI) ||
7281 (IsVector && CM.VecValuesToIgnore.contains(UI)) ||
7282 SkipCostComputation.contains(UI);
7283}
7284
7286LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
7287 VPCostContext &CostCtx) const {
7289 // Cost modeling for inductions is inaccurate in the legacy cost model
7290 // compared to the recipes that are generated. To match here initially during
7291 // VPlan cost-model bring-up, directly use the induction costs from the legacy
7292 // cost model. Note that we do this as pre-processing; the VPlan may not have
7293 // any recipes associated with the original induction increment instruction
7294 // and may replace truncates with VPWidenIntOrFpInductionRecipe. We precompute
7295 // the cost of induction phis and increments (both that are represented by
7296 // recipes and those that are not), to avoid distinguishing between them here,
7297 // and skip all recipes that represent induction phis and increments (the
7298 // former case) later on, if they exist, to avoid counting them twice.
7299 // Similarly we pre-compute the cost of any optimized truncates.
7300 // TODO: Switch to more accurate costing based on VPlan.
7301 for (const auto &[IV, IndDesc] : Legal->getInductionVars()) {
7302 Instruction *IVInc = cast<Instruction>(
7303 IV->getIncomingValueForBlock(OrigLoop->getLoopLatch()));
7304 SmallVector<Instruction *> IVInsts = {IVInc};
7305 for (unsigned I = 0; I != IVInsts.size(); I++) {
7306 for (Value *Op : IVInsts[I]->operands()) {
7307 auto *OpI = dyn_cast<Instruction>(Op);
7308 if (Op == IV || !OpI || !OrigLoop->contains(OpI) || !Op->hasOneUse())
7309 continue;
7310 IVInsts.push_back(OpI);
7311 }
7312 }
7313 IVInsts.push_back(IV);
7314 for (User *U : IV->users()) {
7315 auto *CI = cast<Instruction>(U);
7316 if (!CostCtx.CM.isOptimizableIVTruncate(CI, VF))
7317 continue;
7318 IVInsts.push_back(CI);
7319 }
7320
7321 // If the vector loop gets executed exactly once with the given VF, ignore
7322 // the costs of comparison and induction instructions, as they'll get
7323 // simplified away.
7324 // TODO: Remove this code after stepping away from the legacy cost model and
7325 // adding code to simplify VPlans before calculating their costs.
7326 auto TC = PSE.getSE()->getSmallConstantTripCount(OrigLoop);
7327 if (VF.isFixed() && TC == VF.getFixedValue() && !CM.foldTailByMasking())
7329 CostCtx.SkipCostComputation);
7330
7331 for (Instruction *IVInst : IVInsts) {
7332 if (CostCtx.skipCostComputation(IVInst, VF.isVector()))
7333 continue;
7334 InstructionCost InductionCost = CostCtx.getLegacyCost(IVInst, VF);
7335 LLVM_DEBUG({
7336 dbgs() << "Cost of " << InductionCost << " for VF " << VF
7337 << ": induction instruction " << *IVInst << "\n";
7338 });
7339 Cost += InductionCost;
7340 CostCtx.SkipCostComputation.insert(IVInst);
7341 }
7342 }
7343
7344 /// Compute the cost of all exiting conditions of the loop using the legacy
7345 /// cost model. This is to match the legacy behavior, which adds the cost of
7346 /// all exit conditions. Note that this over-estimates the cost, as there will
7347 /// be a single condition to control the vector loop.
7349 CM.TheLoop->getExitingBlocks(Exiting);
7350 SetVector<Instruction *> ExitInstrs;
7351 // Collect all exit conditions.
7352 for (BasicBlock *EB : Exiting) {
7353 auto *Term = dyn_cast<BranchInst>(EB->getTerminator());
7354 if (!Term)
7355 continue;
7356 if (auto *CondI = dyn_cast<Instruction>(Term->getOperand(0))) {
7357 ExitInstrs.insert(CondI);
7358 }
7359 }
7360 // Compute the cost of all instructions only feeding the exit conditions.
7361 for (unsigned I = 0; I != ExitInstrs.size(); ++I) {
7362 Instruction *CondI = ExitInstrs[I];
7363 if (!OrigLoop->contains(CondI) ||
7364 !CostCtx.SkipCostComputation.insert(CondI).second)
7365 continue;
7366 InstructionCost CondICost = CostCtx.getLegacyCost(CondI, VF);
7367 LLVM_DEBUG({
7368 dbgs() << "Cost of " << CondICost << " for VF " << VF
7369 << ": exit condition instruction " << *CondI << "\n";
7370 });
7371 Cost += CondICost;
7372 for (Value *Op : CondI->operands()) {
7373 auto *OpI = dyn_cast<Instruction>(Op);
7374 if (!OpI || any_of(OpI->users(), [&ExitInstrs, this](User *U) {
7375 return OrigLoop->contains(cast<Instruction>(U)->getParent()) &&
7376 !ExitInstrs.contains(cast<Instruction>(U));
7377 }))
7378 continue;
7379 ExitInstrs.insert(OpI);
7380 }
7381 }
7382
7383 // The legacy cost model has special logic to compute the cost of in-loop
7384 // reductions, which may be smaller than the sum of all instructions involved
7385 // in the reduction.
7386 // TODO: Switch to costing based on VPlan once the logic has been ported.
7387 for (const auto &[RedPhi, RdxDesc] : Legal->getReductionVars()) {
7388 if (ForceTargetInstructionCost.getNumOccurrences())
7389 continue;
7390
7391 if (!CM.isInLoopReduction(RedPhi))
7392 continue;
7393
7394 const auto &ChainOps = RdxDesc.getReductionOpChain(RedPhi, OrigLoop);
7395 SetVector<Instruction *> ChainOpsAndOperands(ChainOps.begin(),
7396 ChainOps.end());
7397 auto IsZExtOrSExt = [](const unsigned Opcode) -> bool {
7398 return Opcode == Instruction::ZExt || Opcode == Instruction::SExt;
7399 };
7400 // Also include the operands of instructions in the chain, as the cost-model
7401 // may mark extends as free.
7402 //
7403 // For ARM, some of the instructions can be folded into the reduction
7404 // instruction, so we need to mark all folded instructions as free.
7405 // For example, we can fold reduce(mul(ext(A), ext(B))) into one
7406 // instruction.
7407 for (auto *ChainOp : ChainOps) {
7408 for (Value *Op : ChainOp->operands()) {
7409 if (auto *I = dyn_cast<Instruction>(Op)) {
7410 ChainOpsAndOperands.insert(I);
7411 if (I->getOpcode() == Instruction::Mul) {
7412 auto *Ext0 = dyn_cast<Instruction>(I->getOperand(0));
7413 auto *Ext1 = dyn_cast<Instruction>(I->getOperand(1));
7414 if (Ext0 && IsZExtOrSExt(Ext0->getOpcode()) && Ext1 &&
7415 Ext0->getOpcode() == Ext1->getOpcode()) {
7416 ChainOpsAndOperands.insert(Ext0);
7417 ChainOpsAndOperands.insert(Ext1);
7418 }
7419 }
7420 }
7421 }
7422 }
7423
7424 // Pre-compute the cost for I, if it has a reduction pattern cost.
7425 for (Instruction *I : ChainOpsAndOperands) {
7426 auto ReductionCost = CM.getReductionPatternCost(
7427 I, VF, toVectorTy(I->getType(), VF), TTI::TCK_RecipThroughput);
7428 if (!ReductionCost)
7429 continue;
7430
7431 assert(!CostCtx.SkipCostComputation.contains(I) &&
7432 "reduction op visited multiple times");
7433 CostCtx.SkipCostComputation.insert(I);
7434 LLVM_DEBUG(dbgs() << "Cost of " << ReductionCost << " for VF " << VF
7435 << ":\n in-loop reduction " << *I << "\n");
7436 Cost += *ReductionCost;
7437 }
7438 }
7439
7440 // Pre-compute the costs for branches except for the backedge, as the number
7441 // of replicate regions in a VPlan may not directly match the number of
7442 // branches, which would lead to different decisions.
7443 // TODO: Compute cost of branches for each replicate region in the VPlan,
7444 // which is more accurate than the legacy cost model.
7445 for (BasicBlock *BB : OrigLoop->blocks()) {
7446 if (CostCtx.skipCostComputation(BB->getTerminator(), VF.isVector()))
7447 continue;
7448 CostCtx.SkipCostComputation.insert(BB->getTerminator());
7449 if (BB == OrigLoop->getLoopLatch())
7450 continue;
7451 auto BranchCost = CostCtx.getLegacyCost(BB->getTerminator(), VF);
7452 Cost += BranchCost;
7453 }
7454
7455 // Pre-compute costs for instructions that are forced-scalar or profitable to
7456 // scalarize. Their costs will be computed separately in the legacy cost
7457 // model.
7458 for (Instruction *ForcedScalar : CM.ForcedScalars[VF]) {
7459 if (CostCtx.skipCostComputation(ForcedScalar, VF.isVector()))
7460 continue;
7461 CostCtx.SkipCostComputation.insert(ForcedScalar);
7462 InstructionCost ForcedCost = CostCtx.getLegacyCost(ForcedScalar, VF);
7463 LLVM_DEBUG({
7464 dbgs() << "Cost of " << ForcedCost << " for VF " << VF
7465 << ": forced scalar " << *ForcedScalar << "\n";
7466 });
7467 Cost += ForcedCost;
7468 }
7469 for (const auto &[Scalarized, ScalarCost] : CM.InstsToScalarize[VF]) {
7470 if (CostCtx.skipCostComputation(Scalarized, VF.isVector()))
7471 continue;
7472 CostCtx.SkipCostComputation.insert(Scalarized);
7473 LLVM_DEBUG({
7474 dbgs() << "Cost of " << ScalarCost << " for VF " << VF
7475 << ": profitable to scalarize " << *Scalarized << "\n";
7476 });
7477 Cost += ScalarCost;
7478 }
7479
7480 return Cost;
7481}
7482
7483InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
7484 ElementCount VF) const {
7485 VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM);
7486 InstructionCost Cost = precomputeCosts(Plan, VF, CostCtx);
7487
7488 // Now compute and add the VPlan-based cost.
7489 Cost += Plan.cost(VF, CostCtx);
7490#ifndef NDEBUG
7491 unsigned EstimatedWidth = getEstimatedRuntimeVF(OrigLoop, CM.TTI, VF);
7492 LLVM_DEBUG(dbgs() << "Cost for VF " << VF << ": " << Cost
7493 << " (Estimated cost per lane: ");
7494 if (Cost.isValid()) {
7495 double CostPerLane = double(*Cost.getValue()) / EstimatedWidth;
7496 LLVM_DEBUG(dbgs() << format("%.1f", CostPerLane));
7497 } else /* No point dividing an invalid cost - it will still be invalid */
7498 LLVM_DEBUG(dbgs() << "Invalid");
7499 LLVM_DEBUG(dbgs() << ")\n");
7500#endif
7501 return Cost;
7502}
7503
7504#ifndef NDEBUG
7505 /// Return true if the original loop \p TheLoop contains any instructions that do
7506/// not have corresponding recipes in \p Plan and are not marked to be ignored
7507 /// in \p CostCtx. This means the VPlan contains simplifications that the legacy
7508/// cost-model did not account for.
7510 VPCostContext &CostCtx,
7511 Loop *TheLoop) {
7512 // First collect all instructions for the recipes in Plan.
7513 auto GetInstructionForCost = [](const VPRecipeBase *R) -> Instruction * {
7514 if (auto *S = dyn_cast<VPSingleDefRecipe>(R))
7515 return dyn_cast_or_null<Instruction>(S->getUnderlyingValue());
7516 if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(R))
7517 return &WidenMem->getIngredient();
7518 return nullptr;
7519 };
7520
7521 DenseSet<Instruction *> SeenInstrs;
7522 auto Iter = vp_depth_first_deep(Plan.getVectorLoopRegion()->getEntry());
7523 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
7524 for (VPRecipeBase &R : *VPBB) {
7525 if (auto *IR = dyn_cast<VPInterleaveRecipe>(&R)) {
7526 auto *IG = IR->getInterleaveGroup();
7527 unsigned NumMembers = IG->getNumMembers();
7528 for (unsigned I = 0; I != NumMembers; ++I) {
7529 if (Instruction *M = IG->getMember(I))
7530 SeenInstrs.insert(M);
7531 }
7532 continue;
7533 }
7534 // The VPlan-based cost model is more accurate for partial reductions, and
7535 // comparing against the legacy cost isn't desirable.
7536 if (isa<VPPartialReductionRecipe>(&R))
7537 return true;
7538 if (Instruction *UI = GetInstructionForCost(&R))
7539 SeenInstrs.insert(UI);
7540 }
7541 }
7542
7543 // Return true if the loop contains any instructions that are not also part of
7544 // the VPlan or are skipped for VPlan-based cost computations. This indicates
7545 // that the VPlan contains extra simplifications.
7546 return any_of(TheLoop->blocks(), [&SeenInstrs, &CostCtx,
7547 TheLoop](BasicBlock *BB) {
7548 return any_of(*BB, [&SeenInstrs, &CostCtx, TheLoop, BB](Instruction &I) {
7549 if (isa<PHINode>(&I) && BB == TheLoop->getHeader())
7550 return false;
7551 return !SeenInstrs.contains(&I) && !CostCtx.skipCostComputation(&I, true);
7552 });
7553 });
7554}
7555#endif
7556
7558 if (VPlans.empty())
7560 // If there is a single VPlan with a single VF, return it directly.
7561 VPlan &FirstPlan = *VPlans[0];
7562 if (VPlans.size() == 1 && size(FirstPlan.vectorFactors()) == 1)
7563 return {*FirstPlan.vectorFactors().begin(), 0, 0};
7564
7566 assert(hasPlanWithVF(ScalarVF) &&
7567 "More than a single plan/VF w/o any plan having scalar VF");
7568
7569 // TODO: Compute scalar cost using VPlan-based cost model.
7570 InstructionCost ScalarCost = CM.expectedCost(ScalarVF);
7571 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ScalarCost << ".\n");
7572 VectorizationFactor ScalarFactor(ScalarVF, ScalarCost, ScalarCost);
7573 VectorizationFactor BestFactor = ScalarFactor;
7574
7575 bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
7576 if (ForceVectorization) {
7577 // Ignore scalar width, because the user explicitly wants vectorization.
7578 // Initialize cost to max so that VF = 2 is, at least, chosen during cost
7579 // evaluation.
7580 BestFactor.Cost = InstructionCost::getMax();
7581 }
7582
7583 for (auto &P : VPlans) {
7584 for (ElementCount VF : P->vectorFactors()) {
7585 if (VF.isScalar())
7586 continue;
7587 if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) {
7588 LLVM_DEBUG(
7589 dbgs()
7590 << "LV: Not considering vector loop of width " << VF
7591 << " because it will not generate any vector instructions.\n");
7592 continue;
7593 }
7594
7595 InstructionCost Cost = cost(*P, VF);
7596 VectorizationFactor CurrentFactor(VF, Cost, ScalarCost);
7597 if (isMoreProfitable(CurrentFactor, BestFactor))
7598 BestFactor = CurrentFactor;
7599
7600 // If profitable, add it to the ProfitableVFs list.
7601 if (isMoreProfitable(CurrentFactor, ScalarFactor))
7602 ProfitableVFs.push_back(CurrentFactor);
7603 }
7604 }
7605
7606#ifndef NDEBUG
7607 // Select the optimal vectorization factor according to the legacy cost-model.
7608 // This is now only used to verify the decisions by the new VPlan-based
7609 // cost-model and will be retired once the VPlan-based cost-model is
7610 // stabilized.
7611 VectorizationFactor LegacyVF = selectVectorizationFactor();
7612 VPlan &BestPlan = getPlanFor(BestFactor.Width);
7613
7614 // Pre-compute the cost and use it to check if BestPlan contains any
7615 // simplifications not accounted for in the legacy cost model. If that's the
7616 // case, don't trigger the assertion, as the extra simplifications may cause a
7617 // different VF to be picked by the VPlan-based cost model.
7618 VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM);
7619 precomputeCosts(BestPlan, BestFactor.Width, CostCtx);
7620 assert((BestFactor.Width == LegacyVF.Width ||
7622 CostCtx, OrigLoop) ||
7624 CostCtx, OrigLoop)) &&
7625 " VPlan cost model and legacy cost model disagreed");
7626 assert((BestFactor.Width.isScalar() || BestFactor.ScalarCost > 0) &&
7627 "when vectorizing, the scalar cost must be computed.");
7628#endif
7629
7630 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << BestFactor.Width << ".\n");
7631 return BestFactor;
7632}
7633
7636 // Reserve first location for self reference to the LoopID metadata node.
7637 MDs.push_back(nullptr);
7638 bool IsUnrollMetadata = false;
7639 MDNode *LoopID = L->getLoopID();
7640 if (LoopID) {
7641 // First find existing loop unrolling disable metadata.
7642 for (unsigned I = 1, IE = LoopID->getNumOperands(); I < IE; ++I) {
7643 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(I));
7644 if (MD) {
7645 const auto *S = dyn_cast<MDString>(MD->getOperand(0));
7646 IsUnrollMetadata =
7647 S && S->getString().starts_with("llvm.loop.unroll.disable");
7648 }
7649 MDs.push_back(LoopID->getOperand(I));
7650 }
7651 }
7652
7653 if (!IsUnrollMetadata) {
7654 // Add runtime unroll disable metadata.
7655 LLVMContext &Context = L->getHeader()->getContext();
7656 SmallVector<Metadata *, 1> DisableOperands;
7657 DisableOperands.push_back(
7658 MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
7659 MDNode *DisableNode = MDNode::get(Context, DisableOperands);
7660 MDs.push_back(DisableNode);
7661 MDNode *NewLoopID = MDNode::get(Context, MDs);
7662 // Set operand 0 to refer to the loop id itself.
7663 NewLoopID->replaceOperandWith(0, NewLoopID);
7664 L->setLoopID(NewLoopID);
7665 }
7666}
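// Illustrative sketch (not part of the original source): the self-referential
// LoopID pattern built above, reduced to its essentials; the helper name is
// hypothetical.
static MDNode *runtimeUnrollDisableLoopIDSketch(LLVMContext &Ctx) {
  SmallVector<Metadata *, 2> MDs;
  MDs.push_back(nullptr); // Operand 0 is reserved for the self-reference.
  Metadata *Disable = MDString::get(Ctx, "llvm.loop.unroll.runtime.disable");
  MDs.push_back(MDNode::get(Ctx, Disable));
  MDNode *LoopID = MDNode::get(Ctx, MDs);
  LoopID->replaceOperandWith(0, LoopID); // Make the node refer to itself.
  return LoopID;
}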
7667
7668// If \p R is a ComputeReductionResult when vectorizing the epilog loop,
7669// fix the reduction's scalar PHI node by adding the incoming value from the
7670// main vector loop.
7672 VPRecipeBase *R, VPTransformState &State, BasicBlock *LoopMiddleBlock,
7673 BasicBlock *BypassBlock) {
7674 auto *EpiRedResult = dyn_cast<VPInstruction>(R);
7675 if (!EpiRedResult ||
7676 EpiRedResult->getOpcode() != VPInstruction::ComputeReductionResult)
7677 return;
7678
7679 auto *EpiRedHeaderPhi =
7680 cast<VPReductionPHIRecipe>(EpiRedResult->getOperand(0));
7681 const RecurrenceDescriptor &RdxDesc =
7682 EpiRedHeaderPhi->getRecurrenceDescriptor();
7683 Value *MainResumeValue =
7684 EpiRedHeaderPhi->getStartValue()->getUnderlyingValue();
7686 RdxDesc.getRecurrenceKind())) {
7687 auto *Cmp = cast<ICmpInst>(MainResumeValue);
7688 assert(Cmp->getPredicate() == CmpInst::ICMP_NE &&
7689 "AnyOf expected to start with ICMP_NE");
7690 assert(Cmp->getOperand(1) == RdxDesc.getRecurrenceStartValue() &&
7691 "AnyOf expected to start by comparing main resume value to original "
7692 "start value");
7693 MainResumeValue = Cmp->getOperand(0);
7695 RdxDesc.getRecurrenceKind())) {
7696 using namespace llvm::PatternMatch;
7697 Value *Cmp, *OrigResumeV;
7698 bool IsExpectedPattern =
7699 match(MainResumeValue, m_Select(m_OneUse(m_Value(Cmp)),
7700 m_Specific(RdxDesc.getSentinelValue()),
7701 m_Value(OrigResumeV))) &&
7702 match(Cmp,
7705 assert(IsExpectedPattern && "Unexpected reduction resume pattern");
7706 (void)IsExpectedPattern;
7707 MainResumeValue = OrigResumeV;
7708 }
7709 PHINode *MainResumePhi = cast<PHINode>(MainResumeValue);
7710
7711 // When fixing reductions in the epilogue loop we should already have
7712 // created a bc.merge.rdx Phi after the main vector body. Ensure that we carry
7713 // over the incoming values correctly.
7714 using namespace VPlanPatternMatch;
7715 auto IsResumePhi = [](VPUser *U) {
7716 return match(
7717 U, m_VPInstruction<VPInstruction::ResumePhi>(m_VPValue(), m_VPValue()));
7718 };
7719 assert(count_if(EpiRedResult->users(), IsResumePhi) == 1 &&
7720 "ResumePhi must have a single user");
7721 auto *EpiResumePhiVPI =
7722 cast<VPInstruction>(*find_if(EpiRedResult->users(), IsResumePhi));
7723 auto *EpiResumePhi = cast<PHINode>(State.get(EpiResumePhiVPI, true));
7724 EpiResumePhi->setIncomingValueForBlock(
7725 BypassBlock, MainResumePhi->getIncomingValueForBlock(BypassBlock));
7726}
7727
7729 ElementCount BestVF, unsigned BestUF, VPlan &BestVPlan,
7730 InnerLoopVectorizer &ILV, DominatorTree *DT, bool VectorizingEpilogue,
7731 const DenseMap<const SCEV *, Value *> *ExpandedSCEVs) {
7732 assert(BestVPlan.hasVF(BestVF) &&
7733 "Trying to execute plan with unsupported VF");
7734 assert(BestVPlan.hasUF(BestUF) &&
7735 "Trying to execute plan with unsupported UF");
7736 assert(
7737 ((VectorizingEpilogue && ExpandedSCEVs) ||
7738 (!VectorizingEpilogue && !ExpandedSCEVs)) &&
7739 "expanded SCEVs to reuse can only be used during epilogue vectorization");
7740
7741 // TODO: Move to VPlan transform stage once the transition to the VPlan-based
7742 // cost model is complete for better cost estimates.
7743 VPlanTransforms::unrollByUF(BestVPlan, BestUF,
7744 OrigLoop->getHeader()->getContext());
7745 VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
7747
7748 // Perform the actual loop transformation.
7749 VPTransformState State(&TTI, BestVF, BestUF, LI, DT, ILV.Builder, &ILV,
7750 &BestVPlan, OrigLoop->getParentLoop(),
7751 Legal->getWidestInductionType());
7752
7753#ifdef EXPENSIVE_CHECKS
7754 assert(DT->verify(DominatorTree::VerificationLevel::Fast));
7755#endif
7756
7757 // 0. Generate SCEV-dependent code in the entry, including TripCount, before
7758 // making any changes to the CFG.
7759 if (!BestVPlan.getEntry()->empty())
7760 BestVPlan.getEntry()->execute(&State);
7761
7762 if (!ILV.getTripCount())
7763 ILV.setTripCount(State.get(BestVPlan.getTripCount(), VPLane(0)));
7764 else
7765 assert(VectorizingEpilogue && "should only re-use the existing trip "
7766 "count during epilogue vectorization");
7767
7768 // 1. Set up the skeleton for vectorization, including vector pre-header and
7769 // middle block. The vector loop is created during VPlan execution.
7770 VPBasicBlock *VectorPH =
7771 cast<VPBasicBlock>(BestVPlan.getEntry()->getSingleSuccessor());
7773 ExpandedSCEVs ? *ExpandedSCEVs : State.ExpandedSCEVs);
7774 if (VectorizingEpilogue)
7776
7777 // Only use noalias metadata when using memory checks guaranteeing no overlap
7778 // across all iterations.
7779 const LoopAccessInfo *LAI = ILV.Legal->getLAI();
7780 std::unique_ptr<LoopVersioning> LVer = nullptr;
7781 if (LAI && !LAI->getRuntimePointerChecking()->getChecks().empty() &&
7783
7784 // We currently don't use LoopVersioning for the actual loop cloning but we
7785 // still use it to add the noalias metadata.
7786 // TODO: Find a better way to re-use LoopVersioning functionality to add
7787 // metadata.
7788 LVer = std::make_unique<LoopVersioning>(
7789 *LAI, LAI->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, DT,
7790 PSE.getSE());
7791 State.LVer = &*LVer;
7793 }
7794
7796
7797 //===------------------------------------------------===//
7798 //
7799 // Notice: any optimization or new instruction that goes
7800 // into the code below should also be implemented in
7801 // the cost-model.
7802 //
7803 //===------------------------------------------------===//
7804
7805 // 2. Copy and widen instructions from the old loop into the new loop.
7806 BestVPlan.prepareToExecute(
7807 ILV.getTripCount(),
7809 replaceVPBBWithIRVPBB(VectorPH, State.CFG.PrevBB);
7810
7811 BestVPlan.execute(&State);
7812
7813 auto *MiddleVPBB = BestVPlan.getMiddleBlock();
7814 // 2.5 When vectorizing the epilogue, fix reduction and induction resume
7815 // values from the additional bypass block.
7816 if (VectorizingEpilogue) {
7818 "Epilogue vectorisation not yet supported with early exits");
7819 BasicBlock *BypassBlock = ILV.getAdditionalBypassBlock();
7820 for (VPRecipeBase &R : *MiddleVPBB) {
7822 &R, State, State.CFG.VPBB2IRBB[MiddleVPBB], BypassBlock);
7823 }
7824 BasicBlock *PH = OrigLoop->getLoopPreheader();
7825 for (const auto &[IVPhi, _] : Legal->getInductionVars()) {
7826 auto *Inc = cast<PHINode>(IVPhi->getIncomingValueForBlock(PH));
7828 Inc->setIncomingValueForBlock(BypassBlock, V);
7829 }
7830 }
7831
7832 // 2.6. Maintain Loop Hints
7833 // Keep all loop hints from the original loop on the vector loop (we'll
7834 // replace the vectorizer-specific hints below).
7835 if (auto *LoopRegion = BestVPlan.getVectorLoopRegion()) {
7836 MDNode *OrigLoopID = OrigLoop->getLoopID();
7837
7838 std::optional<MDNode *> VectorizedLoopID =
7841
7842 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
7843 Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]);
7844 if (VectorizedLoopID) {
7845 L->setLoopID(*VectorizedLoopID);
7846 } else {
7847 // Keep all loop hints from the original loop on the vector loop (we'll
7848 // replace the vectorizer-specific hints below).
7849 if (MDNode *LID = OrigLoop->getLoopID())
7850 L->setLoopID(LID);
7851
7852 LoopVectorizeHints Hints(L, true, *ORE);
7853 Hints.setAlreadyVectorized();
7854 }
7856 TTI.getUnrollingPreferences(L, *PSE.getSE(), UP, ORE);
7857 if (!UP.UnrollVectorizedLoop || VectorizingEpilogue)
7859 }
7860
7861 // 3. Fix the vectorized code: take care of header phi's, live-outs,
7862 // predication, updating analyses.
7863 ILV.fixVectorizedLoop(State);
7864
7866
7867 // 4. Adjust branch weight of the branch in the middle block.
7868 if (BestVPlan.getVectorLoopRegion()) {
7869 auto *MiddleVPBB = BestVPlan.getMiddleBlock();
7870 auto *MiddleTerm =
7871 cast<BranchInst>(State.CFG.VPBB2IRBB[MiddleVPBB]->getTerminator());
7872 if (MiddleTerm->isConditional() &&
7873 hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) {
7874 // Assume that `Count % VectorTripCount` is equally distributed.
7875 unsigned TripCount = BestVPlan.getUF() * State.VF.getKnownMinValue();
7876 assert(TripCount > 0 && "trip count should not be zero");
7877 const uint32_t Weights[] = {1, TripCount - 1};
7878 setBranchWeights(*MiddleTerm, Weights, /*IsExpected=*/false);
7879 }
7880 }
7881
7882 return State.ExpandedSCEVs;
7883}
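// Illustrative sketch (not part of the original source): the middle-block
// branch weights computed in step 4 above, under the stated assumption that the
// remainder of the trip count modulo the vector step is uniformly distributed.
// For example, VF = 4 and UF = 2 give weights {1, 7}; the helper name is
// hypothetical and <utility>/<cstdint> are assumed available.
static std::pair<uint32_t, uint32_t> middleBlockWeightsSketch(unsigned VF,
                                                              unsigned UF) {
  unsigned VectorStep = VF * UF; // Scalar iterations retired per vector iteration.
  // The scalar remainder is entered once for every VectorStep - 1 times it is
  // skipped.
  return {1, VectorStep - 1};
}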
7884
7885//===--------------------------------------------------------------------===//
7886// EpilogueVectorizerMainLoop
7887//===--------------------------------------------------------------------===//
7888
7889/// This function is partially responsible for generating the control flow
7890/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7892 const SCEV2ValueTy &ExpandedSCEVs) {
7894
7895 // Generate the code to check the minimum iteration count of the vector
7896 // epilogue (see below).
7900
7901 // Generate the code to check any assumptions that we've made for SCEV
7902 // expressions.
7904
7905 // Generate the code that checks at runtime if arrays overlap. We put the
7906 // checks into a separate block to make the more common case of few elements
7907 // faster.
7909
7910 // Generate the iteration count check for the main loop, *after* the check
7911 // for the epilogue loop, so that the path-length is shorter for the case
7912 // that goes directly through the vector epilogue. The longer path length for
7913 // the main loop is compensated for by the gain from vectorizing the larger
7914 // trip count. Note: the branch will get updated later on when we vectorize
7915 // the epilogue.
7918
7919 // Generate the induction variable.
7921
7922 return LoopVectorPreHeader;
7923}
7924
7926 LLVM_DEBUG({
7927 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
7928 << "Main Loop VF:" << EPI.MainLoopVF
7929 << ", Main Loop UF:" << EPI.MainLoopUF
7930 << ", Epilogue Loop VF:" << EPI.EpilogueVF
7931 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7932 });
7933}
7934
7937 dbgs() << "intermediate fn:\n"
7938 << *OrigLoop->getHeader()->getParent() << "\n";
7939 });
7940}
7941
7942BasicBlock *
7944 bool ForEpilogue) {
7945 assert(Bypass && "Expected valid bypass basic block.");
7946 ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF;
7947 unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF;
7948 Value *Count = getTripCount();
7949 // Reuse existing vector loop preheader for TC checks.
7951 // Note that a new preheader block is generated for the vector loop.
7951 BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
7952 IRBuilder<> Builder(TCCheckBlock->getTerminator());
7953
7954 // Generate code to check if the loop's trip count is less than VF * UF of the
7955 // main vector loop.
7956 auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF.isVector()
7957 : VF.isVector())
7960
7961 Value *CheckMinIters = Builder.CreateICmp(
7962 P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor),
7963 "min.iters.check");
7964
7965 if (!ForEpilogue)
7966 TCCheckBlock->setName("vector.main.loop.iter.check");
7967
7968 // Create new preheader for vector loop.
7969 LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
7970 DT, LI, nullptr, "vector.ph");
7971
7972 if (ForEpilogue) {
7973 assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
7974 DT->getNode(Bypass)->getIDom()) &&
7975 "TC check is expected to dominate Bypass");
7976
7977 LoopBypassBlocks.push_back(TCCheckBlock);
7978
7979 // Save the trip count so we don't have to regenerate it in the
7980 // vec.epilog.iter.check. This is safe to do because the trip count
7981 // generated here dominates the vector epilog iter check.
7982 EPI.TripCount = Count;
7983 }
7984
7985 BranchInst &BI =
7986 *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
7988 setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false);
7989 ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
7990
7991 introduceCheckBlockInVPlan(TCCheckBlock);
7992 return TCCheckBlock;
7993}
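// Illustrative sketch (not part of the original source): the shape of the
// minimum-iteration check emitted above, assuming ULE when a scalar epilogue is
// required and ULT otherwise. createStepForVF materializes VF * UF (using
// vscale for scalable VFs); the helper name and parameters are hypothetical.
static Value *minItersCheckSketch(IRBuilder<> &Builder, Value *Count,
                                  ElementCount VF, unsigned UF,
                                  bool RequiresScalarEpilogue) {
  Value *Step = Builder.CreateMul(
      Builder.CreateElementCount(Count->getType(), VF),
      ConstantInt::get(Count->getType(), UF));
  CmpInst::Predicate P =
      RequiresScalarEpilogue ? CmpInst::ICMP_ULE : CmpInst::ICMP_ULT;
  return Builder.CreateICmp(P, Count, Step, "min.iters.check");
}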
7994
7995//===--------------------------------------------------------------------===//
7996// EpilogueVectorizerEpilogueLoop
7997//===--------------------------------------------------------------------===//
7998
7999/// This function is partially responsible for generating the control flow
8000/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
8001BasicBlock *
8003 const SCEV2ValueTy &ExpandedSCEVs) {
8004 createVectorLoopSkeleton("vec.epilog.");
8005
8006 // Now, compare the remaining count; if there aren't enough iterations to
8007 // execute the vectorized epilogue, skip to the scalar part.
8008 LoopVectorPreHeader->setName("vec.epilog.ph");
8009 BasicBlock *VecEpilogueIterationCountCheck =
8011 nullptr, "vec.epilog.iter.check", true);
8013 VecEpilogueIterationCountCheck);
8014 AdditionalBypassBlock = VecEpilogueIterationCountCheck;
8015
8016 // Adjust the control flow taking the state info from the main loop
8017 // vectorization into account.
8019 "expected this to be saved from the previous pass.");
8021 VecEpilogueIterationCountCheck, LoopVectorPreHeader);
8022
8024 VecEpilogueIterationCountCheck, LoopScalarPreHeader);
8025
8026 if (EPI.SCEVSafetyCheck)
8028 VecEpilogueIterationCountCheck, LoopScalarPreHeader);
8029 if (EPI.MemSafetyCheck)
8031 VecEpilogueIterationCountCheck, LoopScalarPreHeader);
8032
8035 // Keep track of bypass blocks, as they feed start values to the induction and
8036 // reduction phis in the scalar loop preheader.
8037 if (EPI.SCEVSafetyCheck)
8039 if (EPI.MemSafetyCheck)
8042
8043 // The vec.epilog.iter.check block may contain Phi nodes from inductions or
8044 // reductions which merge control-flow from the latch block and the middle
8045 // block. Update the incoming values here and move the Phi into the preheader.
8046 SmallVector<PHINode *, 4> PhisInBlock;
8047 for (PHINode &Phi : VecEpilogueIterationCountCheck->phis())
8048 PhisInBlock.push_back(&Phi);
8049
8050 for (PHINode *Phi : PhisInBlock) {
8051 Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHI());
8052 Phi->replaceIncomingBlockWith(
8053 VecEpilogueIterationCountCheck->getSinglePredecessor(),
8054 VecEpilogueIterationCountCheck);
8055
8056 // If the phi doesn't have an incoming value from the
8057 // EpilogueIterationCountCheck, we are done. Otherwise remove the incoming
8058 // value and also those from other check blocks. This is needed for
8059 // reduction phis only.
8060 if (none_of(Phi->blocks(), [&](BasicBlock *IncB) {
8061 return EPI.EpilogueIterationCountCheck == IncB;
8062 }))
8063 continue;
8064 Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck);
8065 if (EPI.SCEVSafetyCheck)
8066 Phi->removeIncomingValue(EPI.SCEVSafetyCheck);
8067 if (EPI.MemSafetyCheck)
8068 Phi->removeIncomingValue(EPI.MemSafetyCheck);
8069 }
8070
8071 // Generate bypass values from the additional bypass block. Note that when the
8072 // vectorized epilogue is skipped due to the iteration count check, the
8073 // resume value for the induction variable comes from the trip count of the
8074 // main vector loop, passed as the second argument.
8076 return LoopVectorPreHeader;
8077}
8078
8079BasicBlock *
8081 BasicBlock *Bypass, BasicBlock *Insert) {
8082
8084 "Expected trip count to have been saved in the first pass.");
8085 assert(
8086 (!isa<Instruction>(EPI.TripCount) ||
8087 DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
8088 "saved trip count does not dominate insertion point.");
8089 Value *TC = EPI.TripCount;
8090 IRBuilder<> Builder(Insert->getTerminator());
8091 Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");
8092
8093 // Generate code to check if the loop's trip count is less than VF * UF of the
8094 // vector epilogue loop.
8095 auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector())
8098
8099 Value *CheckMinIters =
8100 Builder.CreateICmp(P, Count,
8103 "min.epilog.iters.check");
8104
8105 BranchInst &BI =
8106 *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
8108 unsigned MainLoopStep = UF * VF.getKnownMinValue();
8109 unsigned EpilogueLoopStep =
8111 // We assume the remaining `Count` is equally distributed in
8112 // [0, MainLoopStep)
8113 // So the probability for `Count < EpilogueLoopStep` should be
8114 // min(MainLoopStep, EpilogueLoopStep) / MainLoopStep
8115 unsigned EstimatedSkipCount = std::min(MainLoopStep, EpilogueLoopStep);
8116 const uint32_t Weights[] = {EstimatedSkipCount,
8117 MainLoopStep - EstimatedSkipCount};
8118 setBranchWeights(BI, Weights, /*IsExpected=*/false);
8119 }
8120 ReplaceInstWithInst(Insert->getTerminator(), &BI);
8121 LoopBypassBlocks.push_back(Insert);
8122
8123 // A new entry block has been created for the epilogue VPlan. Hook it in, as
8124 // otherwise we would try to modify the entry to the main vector loop.
8125 VPIRBasicBlock *NewEntry = Plan.createVPIRBasicBlock(Insert);
8126 VPBasicBlock *OldEntry = Plan.getEntry();
8127 VPBlockUtils::reassociateBlocks(OldEntry, NewEntry);
8128 Plan.setEntry(NewEntry);
8129 // OldEntry is now dead and will be cleaned up when the plan gets destroyed.
8130
8132 return Insert;
8133}
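// Illustrative sketch (not part of the original source): the branch-weight
// estimate derived above. With the remaining count assumed uniform in
// [0, MainLoopStep), the epilogue is skipped with probability
// min(MainLoopStep, EpilogueLoopStep) / MainLoopStep; the helper name is
// hypothetical and <algorithm>/<utility>/<cstdint> are assumed available.
static std::pair<uint32_t, uint32_t>
epilogueSkipWeightsSketch(unsigned MainLoopStep, unsigned EpilogueLoopStep) {
  unsigned EstimatedSkipCount = std::min(MainLoopStep, EpilogueLoopStep);
  return {EstimatedSkipCount, MainLoopStep - EstimatedSkipCount};
}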
8134
8136 LLVM_DEBUG({
8137 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
8138 << "Epilogue Loop VF:" << EPI.EpilogueVF
8139 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
8140 });
8141}
8142
8145 dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n";
8146 });
8147}
8148
8149iterator_range<mapped_iterator<Use *, std::function<VPValue *(Value *)>>>
8151 std::function<VPValue *(Value *)> Fn = [this](Value *Op) {
8152 return getVPValueOrAddLiveIn(Op);
8153 };
8154 return map_range(Operands, Fn);
8155}
8156
8158 BasicBlock *Src = SI->getParent();
8159 assert(!OrigLoop->isLoopExiting(Src) &&
8160 all_of(successors(Src),
8161 [this](BasicBlock *Succ) {
8162 return OrigLoop->getHeader() != Succ;
8163 }) &&
8164 "unsupported switch either exiting loop or continuing to header");
8165 // Create masks where the terminator in Src is a switch. We create masks for
8166 // all edges at the same time. This is more efficient, as we can create and
8167 // collect the compares for all cases at once.
8168 VPValue *Cond = getVPValueOrAddLiveIn(SI->getCondition());
8169 BasicBlock *DefaultDst = SI->getDefaultDest();
8171 for (auto &C : SI->cases()) {
8172 BasicBlock *Dst = C.getCaseSuccessor();
8173 assert(!EdgeMaskCache.contains({Src, Dst}) && "Edge masks already created");
8174 // Cases whose destination is the same as default are redundant and can be
8175 // ignored - they will get there anyhow.
8176 if (Dst == DefaultDst)
8177 continue;
8178 auto &Compares = Dst2Compares[Dst];
8179 VPValue *V = getVPValueOrAddLiveIn(C.getCaseValue());
8180 Compares.push_back(Builder.createICmp(CmpInst::ICMP_EQ, Cond, V));
8181 }
8182
8183 // We need to handle 2 separate cases below for all entries in Dst2Compares,
8184 // which excludes destinations matching the default destination.
8185 VPValue *SrcMask = getBlockInMask(Src);
8186 VPValue *DefaultMask = nullptr;
8187 for (const auto &[Dst, Conds] : Dst2Compares) {
8188 // 1. Dst is not the default destination. Dst is reached if any of the cases
8189 // with destination == Dst are taken. Join the conditions for each case
8190 // whose destination == Dst using an OR.
8191 VPValue *Mask = Conds[0];
8192 for (VPValue *V : ArrayRef<VPValue *>(Conds).drop_front())
8193 Mask = Builder.createOr(Mask, V);
8194 if (SrcMask)
8195 Mask = Builder.createLogicalAnd(SrcMask, Mask);
8196 EdgeMaskCache[{Src, Dst}] = Mask;
8197
8198 // 2. Create the mask for the default destination, which is reached if none
8199 // of the cases with destination != default destination are taken. Join the
8200 // conditions for all cases whose destination is not the default destination
8201 // using an OR and negate the result.
8202 DefaultMask = DefaultMask ? Builder.createOr(DefaultMask, Mask) : Mask;
8203 }
8204
8205 if (DefaultMask) {
8206 DefaultMask = Builder.createNot(DefaultMask);
8207 if (SrcMask)
8208 DefaultMask = Builder.createLogicalAnd(SrcMask, DefaultMask);
8209 }
8210 EdgeMaskCache[{Src, DefaultDst}] = DefaultMask;
8211}
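// Illustrative sketch (not part of the original source): the boolean structure
// of the switch masks created above, for two non-default cases targeting the
// same destination. All names are hypothetical.
static void switchEdgeMaskSketch(bool SrcMask, int Cond, int CaseA, int CaseB,
                                 bool &CaseDstMask, bool &DefaultDstMask) {
  bool AnyCaseTaken = (Cond == CaseA) || (Cond == CaseB); // OR of the per-case compares.
  CaseDstMask = SrcMask && AnyCaseTaken;     // Logical AND with the block-in mask.
  DefaultDstMask = SrcMask && !AnyCaseTaken; // Default: no non-default case taken.
}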
8212
8214 assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
8215
8216 // Look for cached value.
8217 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
8218 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
8219 if (ECEntryIt != EdgeMaskCache.end())
8220 return ECEntryIt->second;
8221
8222 if (auto *SI = dyn_cast<SwitchInst>(Src->getTerminator())) {
8224 assert(EdgeMaskCache.contains(Edge) && "Mask for Edge not created?");
8225 return EdgeMaskCache[Edge];
8226 }
8227
8228 VPValue *SrcMask = getBlockInMask(Src);
8229
8230 // The terminator has to be a branch inst!
8231 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
8232 assert(BI && "Unexpected terminator found");
8233 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
8234 return EdgeMaskCache[Edge] = SrcMask;
8235
8236 // If source is an exiting block, we know the exit edge is dynamically dead
8237 // in the vector loop, and thus we don't need to restrict the mask. Avoid
8238 // adding uses of an otherwise potentially dead instruction unless we are
8239 // vectorizing a loop with uncountable exits. In that case, we always
8240 // materialize the mask.
8241 if (OrigLoop->isLoopExiting(Src) &&
8242 Src != Legal->getUncountableEarlyExitingBlock())
8243 return EdgeMaskCache[Edge] = SrcMask;
8244
8245 VPValue *EdgeMask = getVPValueOrAddLiveIn(BI->getCondition());
8246 assert(EdgeMask && "No Edge Mask found for condition");
8247
8248 if (BI->getSuccessor(0) != Dst)
8249 EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc());
8250
8251 if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND.
8252 // The bitwise 'And' of SrcMask and EdgeMask introduces new UB if SrcMask
8253 // is false and EdgeMask is poison. Avoid that by using 'LogicalAnd'
8254 // instead which generates 'select i1 SrcMask, i1 EdgeMask, i1 false'.
8255 EdgeMask = Builder.createLogicalAnd(SrcMask, EdgeMask, BI->getDebugLoc());
8256 }
8257
8258 return EdgeMaskCache[Edge] = EdgeMask;
8259}
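// Illustrative sketch (not part of the original source): the select-based
// logical AND preferred above, expressed on scalars. Unlike a bitwise 'and', it
// yields a well-defined 'false' whenever SrcMask is false, regardless of
// EdgeMask; the helper name is hypothetical.
static bool logicalAndSketch(bool SrcMask, bool EdgeMask) {
  return SrcMask ? EdgeMask : false; // select i1 SrcMask, i1 EdgeMask, i1 false
}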
8260
8262 assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
8263
8264 // Look for cached value.
8265 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
8266 EdgeMaskCacheTy::const_iterator ECEntryIt = EdgeMaskCache.find(Edge);
8267 assert(ECEntryIt != EdgeMaskCache.end() &&
8268 "looking up mask for edge which has not been created");
8269 return ECEntryIt->second;
8270}
8271
8273 BasicBlock *Header = OrigLoop->getHeader();
8274
8275 // When not folding the tail, use nullptr to model all-true mask.
8276 if (!CM.foldTailByMasking()) {
8277 BlockMaskCache[Header] = nullptr;
8278 return;
8279 }
8280
8281 // Introduce the early-exit compare IV <= BTC to form header block mask.
8282 // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by
8283 // constructing the desired canonical IV in the header block as its first
8284 // non-phi instructions.
8285
8286 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
8287 auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi();
8288 auto *IV = new VPWidenCanonicalIVRecipe(Plan.getCanonicalIV());
8289 HeaderVPBB->insert(IV, NewInsertionPoint);
8290
8291 VPBuilder::InsertPointGuard Guard(Builder);
8292 Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint);
8293 VPValue *BlockMask = nullptr;
8295 BlockMask = Builder.createICmp(CmpInst::ICMP_ULE, IV, BTC);
8296 BlockMaskCache[Header] = BlockMask;
8297}
8298
8300 // Return the cached value.
8301 BlockMaskCacheTy::const_iterator BCEntryIt = BlockMaskCache.find(BB);
8302 assert(BCEntryIt != BlockMaskCache.end() &&
8303 "Trying to access mask for block without one.");
8304 return BCEntryIt->second;
8305}
8306
8308 assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
8309 assert(BlockMaskCache.count(BB) == 0 && "Mask for block already computed");
8310 assert(OrigLoop->getHeader() != BB &&
8311 "Loop header must have cached block mask");
8312
8313 // All-one mask is modelled as no-mask following the convention for masked
8314 // load/store/gather/scatter. Initialize BlockMask to no-mask.
8315 VPValue *BlockMask = nullptr;
8316 // This is the block mask. We OR all unique incoming edges.
8317 for (auto *Predecessor :
8319 VPValue *EdgeMask = createEdgeMask(Predecessor, BB);
8320 if (!EdgeMask) { // Mask of predecessor is all-one so mask of block is too.
8321 BlockMaskCache[BB] = EdgeMask;
8322 return;
8323 }
8324
8325 if (!BlockMask) { // BlockMask has its initialized nullptr value.
8326 BlockMask = EdgeMask;
8327 continue;
8328 }
8329
8330 BlockMask = Builder.createOr(BlockMask, EdgeMask, {});
8331 }
8332
8333 BlockMaskCache[BB] = BlockMask;
8334}
8335
8337VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
8338 VFRange &Range) {
8339 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
8340 "Must be called with either a load or store");
8341
8342 auto WillWiden = [&](ElementCount VF) -> bool {
8344 CM.getWideningDecision(I, VF);
8346 "CM decision should be taken at this point.");
8348 return true;
8349 if (CM.isScalarAfterVectorization(I, VF) ||
8350 CM.isProfitableToScalarize(I, VF))
8351 return false;
8353 };
8354
8356 return nullptr;
8357
8358 VPValue *Mask = nullptr;
8359 if (Legal->isMaskRequired(I))
8360 Mask = getBlockInMask(I->getParent());
8361
8362 // Determine if the pointer operand of the access is either consecutive or
8363 // reverse consecutive.
8365 CM.getWideningDecision(I, Range.Start);
8367 bool Consecutive =
8369
8370 VPValue *Ptr = isa<LoadInst>(I) ? Operands[0] : Operands[1];
8371 if (Consecutive) {
8372 auto *GEP = dyn_cast<GetElementPtrInst>(
8373 Ptr->getUnderlyingValue()->stripPointerCasts());
8374 VPSingleDefRecipe *VectorPtr;
8375 if (Reverse) {
8376 // When folding the tail, we may compute an address that we don't compute in
8377 // the original scalar loop, and it may not be inbounds. Drop the inbounds
8378 // flag in that case.
8379 GEPNoWrapFlags Flags =
8380 (CM.foldTailByMasking() || !GEP || !GEP->isInBounds())
8383 VectorPtr = new VPReverseVectorPointerRecipe(
8384 Ptr, &Plan.getVF(), getLoadStoreType(I), Flags, I->getDebugLoc());
8385 } else {
8386 VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I),
8387 GEP ? GEP->getNoWrapFlags()
8389 I->getDebugLoc());
8390 }
8391 Builder.getInsertBlock()->appendRecipe(VectorPtr);
8392 Ptr = VectorPtr;
8393 }
8394 if (LoadInst *Load = dyn_cast<LoadInst>(I))
8395 return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse,
8396 I->getDebugLoc());
8397
8398 StoreInst *Store = cast<StoreInst>(I);
8399 return new VPWidenStoreRecipe(*Store, Ptr, Operands[0], Mask, Consecutive,
8400 Reverse, I->getDebugLoc());
8401}
8402
8403 /// Creates a VPWidenIntOrFpInductionRecipe for \p Phi. If needed, it will also
8404/// insert a recipe to expand the step for the induction recipe.
8407 VPValue *Start, const InductionDescriptor &IndDesc,
8408 VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop) {
8409 assert(IndDesc.getStartValue() ==
8410 Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader()));
8411 assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) &&
8412 "step must be loop invariant");
8413
8414 VPValue *Step =
8416 if (auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) {
8417 return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, &Plan.getVF(),
8418 IndDesc, TruncI,
8419 TruncI->getDebugLoc());
8420 }
8421 assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here");
8422 return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, &Plan.getVF(),
8423 IndDesc, Phi->getDebugLoc());
8424}
8425
8426VPHeaderPHIRecipe *VPRecipeBuilder::tryToOptimizeInductionPHI(
8428
8429 // Check if this is an integer or fp induction. If so, build the recipe that
8430 // produces its scalar and vector values.
8431 if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi))
8432 return createWidenInductionRecipes(Phi, Phi, Operands[0], *II, Plan,
8433 *PSE.getSE(), *OrigLoop);
8434
8435 // Check if this is pointer induction. If so, build the recipe for it.
8436 if (auto *II = Legal->getPointerInductionDescriptor(Phi)) {
8437 VPValue *Step = vputils::getOrCreateVPValueForSCEVExpr(Plan, II->getStep(),
8438 *PSE.getSE());
8440 Phi, Operands[0], Step, *II,
8442 [&](ElementCount VF) {
8443 return CM.isScalarAfterVectorization(Phi, VF);
8444 },
8445 Range),
8446 Phi->getDebugLoc());
8447 }
8448 return nullptr;
8449}
8450
8451VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
8453 // Optimize the special case where the source is a constant integer
8454 // induction variable. Notice that we can only optimize the 'trunc' case
8455 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
8456 // (c) other casts depend on pointer size.
8457
8458 // Determine whether \p K is a truncation based on an induction variable that
8459 // can be optimized.
8460 auto IsOptimizableIVTruncate =
8461 [&](Instruction *K) -> std::function<bool(ElementCount)> {
8462 return [=](ElementCount VF) -> bool {
8463 return CM.isOptimizableIVTruncate(K, VF);
8464 };
8465 };
8466
8468 IsOptimizableIVTruncate(I), Range)) {
8469
8470 auto *Phi = cast<PHINode>(I->getOperand(0));
8472 VPValue *Start = Plan.getOrAddLiveIn(II.getStartValue());
8473 return createWidenInductionRecipes(Phi, I, Start, II, Plan, *PSE.getSE(),
8474 *OrigLoop);
8475 }
8476 return nullptr;
8477}
8478
8479VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi,
8481 unsigned NumIncoming = Phi->getNumIncomingValues();
8482
8483 // We know that all PHIs in non-header blocks are converted into selects, so
8484 // we don't have to worry about the insertion order and we can just use the
8485 // builder. At this point we generate the predication tree. There may be
8486 // duplications since this is a simple recursive scan, but future
8487 // optimizations will clean it up.
8488 SmallVector<VPValue *, 2> OperandsWithMask;
8489
8490 for (unsigned In = 0; In < NumIncoming; In++) {
8491 OperandsWithMask.push_back(Operands[In]);
8492 VPValue *EdgeMask =
8493 getEdgeMask(Phi->getIncomingBlock(In), Phi->getParent());
8494 if (!EdgeMask) {
8495 assert(In == 0 && "Both null and non-null edge masks found");
8497 "Distinct incoming values with one having a full mask");
8498 break;
8499 }
8500 OperandsWithMask.push_back(EdgeMask);
8501 }
8502 return new VPBlendRecipe(Phi, OperandsWithMask);
8503}
8504
8505VPSingleDefRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
8507 VFRange &Range) {
8509 [this, CI](ElementCount VF) {
8510 return CM.isScalarWithPredication(CI, VF);
8511 },
8512 Range);
8513
8514 if (IsPredicated)
8515 return nullptr;
8516
8518 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
8519 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
8520 ID == Intrinsic::pseudoprobe ||
8521 ID == Intrinsic::experimental_noalias_scope_decl))
8522 return nullptr;
8523
8524 SmallVector<VPValue *, 4> Ops(Operands.take_front(CI->arg_size()));
8525
8526 // Is it beneficial to perform intrinsic call compared to lib call?
8527 bool ShouldUseVectorIntrinsic =
8529 [&](ElementCount VF) -> bool {
8530 return CM.getCallWideningDecision(CI, VF).Kind ==
8532 },
8533 Range);
8534 if (ShouldUseVectorIntrinsic)
8535 return new VPWidenIntrinsicRecipe(*CI, ID, Ops, CI->getType(),
8536 CI->getDebugLoc());
8537
8538 Function *Variant = nullptr;
8539 std::optional<unsigned> MaskPos;
8540 // Is it better to call a vectorized version of the function than to scalarize
8541 // the call?
8542 auto ShouldUseVectorCall = LoopVectorizationPlanner::getDecisionAndClampRange(
8543 [&](ElementCount VF) -> bool {
8544 // The following case may be scalarized depending on the VF.
8545 // The flag shows whether we can use a usual Call for the vectorized
8546 // version of the instruction.
8547
8548 // If we've found a variant at a previous VF, then stop looking. A
8549 // vectorized variant of a function expects input in a certain shape
8550 // -- basically the number of input registers, the number of lanes
8551 // per register, and whether there's a mask required.
8552 // We store a pointer to the variant in the VPWidenCallRecipe, so
8553 // once we have an appropriate variant it's only valid for that VF.
8554 // This will force a different vplan to be generated for each VF that
8555 // finds a valid variant.
8556 if (Variant)
8557 return false;
8559 CM.getCallWideningDecision(CI, VF);
8561 Variant = Decision.Variant;
8562 MaskPos = Decision.MaskPos;
8563 return true;
8564 }
8565
8566 return false;
8567 },
8568 Range);
8569 if (ShouldUseVectorCall) {
8570 if (MaskPos.has_value()) {
8571 // We have 2 cases that would require a mask:
8572 // 1) The block needs to be predicated, either due to a conditional
8573 // in the scalar loop or use of an active lane mask with
8574 // tail-folding, and we use the appropriate mask for the block.
8575 // 2) No mask is required for the block, but the only available
8576 // vector variant at this VF requires a mask, so we synthesize an
8577 // all-true mask.
8578 VPValue *Mask = nullptr;
8579 if (Legal->isMaskRequired(CI))
8580 Mask = getBlockInMask(CI->getParent());
8581 else
8582 Mask = Plan.getOrAddLiveIn(
8584
8585 Ops.insert(Ops.begin() + *MaskPos, Mask);
8586 }
8587
8588 Ops.push_back(Operands.back());
8589 return new VPWidenCallRecipe(CI, Variant, Ops, CI->getDebugLoc());
8590 }
8591
8592 return nullptr;
8593}
8594
8595bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
8596 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
8597 !isa<StoreInst>(I) && "Instruction should have been handled earlier");
8598 // Instruction should be widened, unless it is scalar after vectorization,
8599 // scalarization is profitable or it is predicated.
8600 auto WillScalarize = [this, I](ElementCount VF) -> bool {
8601 return CM.isScalarAfterVectorization(I, VF) ||
8602 CM.isProfitableToScalarize(I, VF) ||
8603 CM.isScalarWithPredication(I, VF);
8604 };
8606 Range);
8607}
8608
8609VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
8611 VPBasicBlock *VPBB) {
8612 switch (I->getOpcode()) {
8613 default:
8614 return nullptr;
8615 case Instruction::SDiv:
8616 case Instruction::UDiv:
8617 case Instruction::SRem:
8618 case Instruction::URem: {
8619 // If not provably safe, use a select to form a safe divisor before widening the
8620 // div/rem operation itself. Otherwise fall through to general handling below.
8621 if (CM.isPredicatedInst(I)) {
8623 VPValue *Mask = getBlockInMask(I->getParent());
8624 VPValue *One =
8625 Plan.getOrAddLiveIn(ConstantInt::get(I->getType(), 1u, false));
8626 auto *SafeRHS = Builder.createSelect(Mask, Ops[1], One, I->getDebugLoc());
8627 Ops[1] = SafeRHS;
8628 return new VPWidenRecipe(*I, make_range(Ops.begin(), Ops.end()));
8629 }
8630 [[fallthrough]];
8631 }
8632 case Instruction::Add:
8633 case Instruction::And:
8634 case Instruction::AShr:
8635 case Instruction::FAdd:
8636 case Instruction::FCmp:
8637 case Instruction::FDiv:
8638 case Instruction::FMul:
8639 case Instruction::FNeg:
8640 case Instruction::FRem:
8641 case Instruction::FSub:
8642 case Instruction::ICmp:
8643 case Instruction::LShr:
8644 case Instruction::Mul:
8645 case Instruction::Or:
8646 case Instruction::Select:
8647 case Instruction::Shl:
8648 case Instruction::Sub:
8649 case Instruction::Xor:
8650 case Instruction::Freeze:
8652 if (Instruction::isBinaryOp(I->getOpcode())) {
8653 // The legacy cost model uses SCEV to check if some of the operands are
8654 // constants. To match the legacy cost model's behavior, use SCEV to try
8655 // to replace operands with constants.
8656 ScalarEvolution &SE = *PSE.getSE();
8657 auto GetConstantViaSCEV = [this, &SE](VPValue *Op) {
8658 Value *V = Op->getUnderlyingValue();
8659 if (isa<Constant>(V) || !SE.isSCEVable(V->getType()))
8660 return Op;
8661 auto *C = dyn_cast<SCEVConstant>(SE.getSCEV(V));
8662 if (!C)
8663 return Op;
8664 return Plan.getOrAddLiveIn(C->getValue());
8665 };
8666 // For Mul, the legacy cost model checks both operands.
8667 if (I->getOpcode() == Instruction::Mul)
8668 NewOps[0] = GetConstantViaSCEV(NewOps[0]);
8669 // For other binops, the legacy cost model only checks the second operand.
8670 NewOps[1] = GetConstantViaSCEV(NewOps[1]);
8671 }
8672 return new VPWidenRecipe(*I, make_range(NewOps.begin(), NewOps.end()));
8673 };
8674}
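// Illustrative sketch (not part of the original source): the per-lane effect of
// the safe-divisor transform in the div/rem cases above. Masked-out lanes
// divide by 1, so they cannot trap or produce poison; the helper name is
// hypothetical.
static int safeDivisorSketch(bool LaneActive, int Dividend, int Divisor) {
  int SafeDivisor = LaneActive ? Divisor : 1; // select(mask, divisor, 1)
  return Dividend / SafeDivisor;
}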
8675
8677VPRecipeBuilder::tryToWidenHistogram(const HistogramInfo *HI,
8679 // FIXME: Support other operations.
8680 unsigned Opcode = HI->Update->getOpcode();
8681 assert((Opcode == Instruction::Add || Opcode == Instruction::Sub) &&
8682 "Histogram update operation must be an Add or Sub");
8683
8685 // Bucket address.
8686 HGramOps.push_back(Operands[1]);
8687 // Increment value.
8688 HGramOps.push_back(getVPValueOrAddLiveIn(HI->Update->getOperand(1)));
8689
8690 // In case of predicated execution (due to tail-folding, or conditional
8691 // execution, or both), pass the relevant mask.
8692 if (Legal->isMaskRequired(HI->Store))
8693 HGramOps.push_back(getBlockInMask(HI->Store->getParent()));
8694
8695 return new VPHistogramRecipe(Opcode,
8696 make_range(HGramOps.begin(), HGramOps.end()),
8697 HI->Store->getDebugLoc());
8698}
8699
8701 BasicBlock *OrigLatch = OrigLoop->getLoopLatch();
8702 for (VPHeaderPHIRecipe *R : PhisToFix) {
8703 auto *PN = cast<PHINode>(R->getUnderlyingValue());
8704 VPRecipeBase *IncR =
8705 getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch)));
8706 R->addOperand(IncR->getVPSingleValue());
8707 }
8708}
8709
8711 VFRange &Range) {
8713 [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
8714 Range);
8715
8716 bool IsPredicated = CM.isPredicatedInst(I);
8717
8718 // Even if the instruction is not marked as uniform, there are certain
8719 // intrinsic calls that can be effectively treated as such, so we check for
8720 // them here. Conservatively, we only do this for scalable vectors, since
8721 // for fixed-width VFs we can always fall back on full scalarization.
8722 if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) {
8723 switch (cast<IntrinsicInst>(I)->getIntrinsicID()) {
8724 case Intrinsic::assume:
8725 case Intrinsic::lifetime_start:
8726 case Intrinsic::lifetime_end:
8727 // For scalable vectors if one of the operands is variant then we still
8728 // want to mark as uniform, which will generate one instruction for just
8729 // the first lane of the vector. We can't scalarize the call in the same
8730 // way as for fixed-width vectors because we don't know how many lanes
8731 // there are.
8732 //
8733 // The reasons for doing it this way for scalable vectors are:
8734 // 1. For the assume intrinsic, generating the instruction for the first
8735 // lane is still better than not generating any at all. For
8736 // example, the input may be a splat across all lanes.
8737 // 2. For the lifetime start/end intrinsics the pointer operand only
8738 // does anything useful when the input comes from a stack object,
8739 // which suggests it should always be uniform. For non-stack objects
8740 // the effect is to poison the object, which still allows us to
8741 // remove the call.
8742 IsUniform = true;
8743 break;
8744 default:
8745 break;
8746 }
8747 }
8748 VPValue *BlockInMask = nullptr;
8749 if (!IsPredicated) {
8750 // Finalize the recipe for Instr, first if it is not predicated.
8751 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
8752 } else {
8753 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
8754 // Instructions marked for predication are replicated and a mask operand is
8755 // added initially. Masked replicate recipes will later be placed under an
8756 // if-then construct to prevent side-effects. Generate recipes to compute
8757 // the block mask for this region.
8758 BlockInMask = getBlockInMask(I->getParent());
8759 }
8760
8761 // Note that there is some custom logic to mark some intrinsics as uniform
8762 // manually above for scalable vectors, which this assert needs to account for
8763 // as well.
8764 assert((Range.Start.isScalar() || !IsUniform || !IsPredicated ||
8765 (Range.Start.isScalable() && isa<IntrinsicInst>(I))) &&
8766 "Should not predicate a uniform recipe");
8767 auto *Recipe = new VPReplicateRecipe(I, mapToVPValues(I->operands()),
8768 IsUniform, BlockInMask);
8769 return Recipe;
8770}
8771
8772/// Find all possible partial reductions in the loop and track all of those that
8773/// are valid so recipes can be formed later.
8775 // Find all possible partial reductions.
8777 PartialReductionChains;
8778 for (const auto &[Phi, RdxDesc] : Legal->getReductionVars())
8779 if (std::optional<std::pair<PartialReductionChain, unsigned>> Pair =
8780 getScaledReduction(Phi, RdxDesc, Range))
8781 PartialReductionChains.push_back(*Pair);
8782
8783 // A partial reduction is invalid if any of its extends are used by
8784 // something that isn't another partial reduction. This is because the
8785 // extends are intended to be lowered along with the reduction itself.
8786
8787 // Build up a set of partial reduction bin ops for efficient use checking.
8788 SmallSet<User *, 4> PartialReductionBinOps;
8789 for (const auto &[PartialRdx, _] : PartialReductionChains)
8790 PartialReductionBinOps.insert(PartialRdx.BinOp);
8791
8792 auto ExtendIsOnlyUsedByPartialReductions =
8793 [&PartialReductionBinOps](Instruction *Extend) {
8794 return all_of(Extend->users(), [&](const User *U) {
8795 return PartialReductionBinOps.contains(U);
8796 });
8797 };
8798
8799 // Check if each use of a chain's two extends is a partial reduction
8800 // and only add those that don't have non-partial reduction users.
8801 for (auto Pair : PartialReductionChains) {
8802 PartialReductionChain Chain = Pair.first;
8803 if (ExtendIsOnlyUsedByPartialReductions(Chain.ExtendA) &&
8804 ExtendIsOnlyUsedByPartialReductions(Chain.ExtendB))
8805 ScaledReductionExitInstrs.insert(std::make_pair(Chain.Reduction, Pair));
8806 }
8807}
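// Illustrative sketch (not part of the original source): the scalar shape of a
// partial-reduction chain collected above, i.e. a reduction phi updated by a
// binary op whose operands are both extends (a widening dot product). All names
// are hypothetical and <cstdint> is assumed available.
static int32_t partialReductionChainSketch(const int8_t *A, const int8_t *B,
                                           unsigned N) {
  int32_t Acc = 0;                        // Reduction phi.
  for (unsigned I = 0; I < N; ++I)
    Acc += int32_t(A[I]) * int32_t(B[I]); // ExtendA * ExtendB, then the update.
  return Acc;
}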
8808
8809std::optional<std::pair<PartialReductionChain, unsigned>>
8810VPRecipeBuilder::getScaledReduction(PHINode *PHI,
8811 const RecurrenceDescriptor &Rdx,
8812 VFRange &Range) {
8813 // TODO: Allow scaling reductions when predicating. The select at
8814 // the end of the loop chooses between the phi value and most recent
8815 // reduction result, both of which have different VFs to the active lane
8816 // mask when scaling.
8818 return std::nullopt;
8819
8820 auto *Update = dyn_cast<BinaryOperator>(Rdx.getLoopExitInstr());
8821 if (!Update)
8822 return std::nullopt;
8823
8824 Value *Op = Update->getOperand(0);
8825 Value *PhiOp = Update->getOperand(1);
8826 if (Op == PHI) {
8827 Op = Update->getOperand(1);
8828 PhiOp = Update->getOperand(0);
8829 }
8830 if (PhiOp != PHI)
8831 return std::nullopt;
8832
8833 auto *BinOp = dyn_cast<BinaryOperator>(Op);
8834 if (!BinOp || !BinOp->hasOneUse())
8835 return std::nullopt;
8836
8837 using namespace llvm::PatternMatch;
8838 Value *A, *B;
8839 if (!match(BinOp->getOperand(0), m_ZExtOrSExt(m_Value(A))) ||
8840 !match(BinOp->getOperand(1), m_ZExtOrSExt(m_Value(B))))
8841 return std::nullopt;
8842
8843 Instruction *ExtA = cast<Instruction>(BinOp->getOperand(0));
8844 Instruction *ExtB = cast<Instruction>(BinOp->getOperand(1));
8845
8850
8851 PartialReductionChain Chain(Rdx.getLoopExitInstr(), ExtA, ExtB, BinOp);
8852
8853 unsigned TargetScaleFactor =
8854 PHI->getType()->getPrimitiveSizeInBits().getKnownScalarFactor(
8855 A->getType()->getPrimitiveSizeInBits());
8856
8858 [&](ElementCount VF) {
8860 Update->getOpcode(), A->getType(), B->getType(), PHI->getType(),
8861 VF, OpAExtend, OpBExtend,
8862 std::make_optional(BinOp->getOpcode()));
8863 return Cost.isValid();
8864 },
8865 Range))
8866 return std::make_pair(Chain, TargetScaleFactor);
8867
8868 return std::nullopt;
8869}
8870
8874 VFRange &Range, VPBasicBlock *VPBB) {
8875 // First, check for specific widening recipes that deal with inductions, Phi
8876 // nodes, calls and memory operations.
8877 VPRecipeBase *Recipe;
8878 if (auto *Phi = dyn_cast<PHINode>(Instr)) {
8879 if (Phi->getParent() != OrigLoop->getHeader())
8880 return tryToBlend(Phi, Operands);
8881
8882 if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, Range)))
8883 return Recipe;
8884
8885 VPHeaderPHIRecipe *PhiRecipe = nullptr;
8886 assert((Legal->isReductionVariable(Phi) ||
8887 Legal->isFixedOrderRecurrence(Phi)) &&
8888 "can only widen reductions and fixed-order recurrences here");
8889 VPValue *StartV = Operands[0];
8890 if (Legal->isReductionVariable(Phi)) {
8891 const RecurrenceDescriptor &RdxDesc =
8892 Legal->getReductionVars().find(Phi)->second;
8893 assert(RdxDesc.getRecurrenceStartValue() ==
8894 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
8895
8896 // If the PHI is used by a partial reduction, set the scale factor.
8897 std::optional<std::pair<PartialReductionChain, unsigned>> Pair =
8899 unsigned ScaleFactor = Pair ? Pair->second : 1;
8900 PhiRecipe = new VPReductionPHIRecipe(
8901 Phi, RdxDesc, *StartV, CM.isInLoopReduction(Phi),
8902 CM.useOrderedReductions(RdxDesc), ScaleFactor);
8903 } else {
8904 // TODO: Currently fixed-order recurrences are modeled as chains of
8905 // first-order recurrences. If there are no users of the intermediate
8906 // recurrences in the chain, the fixed order recurrence should be modeled
8907 // directly, enabling more efficient codegen.
8908 PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV);
8909 }
8910
8911 PhisToFix.push_back(PhiRecipe);
8912 return PhiRecipe;
8913 }
8914
8915 if (isa<TruncInst>(Instr) && (Recipe = tryToOptimizeInductionTruncate(
8916 cast<TruncInst>(Instr), Operands, Range)))
8917 return Recipe;
8918
8919 // All widen recipes below deal only with VF > 1.
8921 [&](ElementCount VF) { return VF.isScalar(); }, Range))
8922 return nullptr;
8923
8924 if (auto *CI = dyn_cast<CallInst>(Instr))
8925 return tryToWidenCall(CI, Operands, Range);
8926
8927 if (StoreInst *SI = dyn_cast<StoreInst>(Instr))
8928 if (auto HistInfo = Legal->getHistogramInfo(SI))
8929 return tryToWidenHistogram(*HistInfo, Operands);
8930
8931 if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
8932 return tryToWidenMemory(Instr, Operands, Range);
8933
8934 if (getScaledReductionForInstr(Instr))
8936
8937 if (!shouldWiden(Instr, Range))
8938 return nullptr;
8939
8940 if (auto *GEP = dyn_cast<GetElementPtrInst>(Instr))
8941 return new VPWidenGEPRecipe(GEP,
8942 make_range(Operands.begin(), Operands.end()));
8943
8944 if (auto *SI = dyn_cast<SelectInst>(Instr)) {
8945 return new VPWidenSelectRecipe(
8946 *SI, make_range(Operands.begin(), Operands.end()));
8947 }
8948
8949 if (auto *CI = dyn_cast<CastInst>(Instr)) {
8950 return new VPWidenCastRecipe(CI->getOpcode(), Operands[0], CI->getType(),
8951 *CI);
8952 }
8953
8954 return tryToWiden(Instr, Operands, VPBB);
8955}
8956
8960 assert(Operands.size() == 2 &&
8961 "Unexpected number of operands for partial reduction");
8962
8963 VPValue *BinOp = Operands[0];
8964 VPValue *Phi = Operands[1];
8965 if (isa<VPReductionPHIRecipe>(BinOp->getDefiningRecipe()))
8966 std::swap(BinOp, Phi);
8967
8968 return new VPPartialReductionRecipe(Reduction->getOpcode(), BinOp, Phi,
8969 Reduction);
8970}
8971
8972void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
8973 ElementCount MaxVF) {
8974 assert(OrigLoop->isInnermost() && "Inner loop expected.");
8975
8976 auto MaxVFTimes2 = MaxVF * 2;
8977 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) {
8978 VFRange SubRange = {VF, MaxVFTimes2};
8979 if (auto Plan = tryToBuildVPlanWithVPRecipes(SubRange)) {
8980 // Now optimize the initial VPlan.
8981 if (!Plan->hasVF(ElementCount::getFixed(1)))
8983 CM.getMinimalBitwidths());
8985 // TODO: try to put it close to addActiveLaneMask().
8986 // Discard the plan if it is not EVL-compatible
8988 *Plan, CM.getMaxSafeElements()))
8989 break;
8990 assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
8991 VPlans.push_back(std::move(Plan));
8992 }
8993 VF = SubRange.End;
8994 }
8995}
8996
8997// Add the necessary canonical IV and branch recipes required to control the
8998// loop.
8999static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW,
9000 DebugLoc DL) {
9001 Value *StartIdx = ConstantInt::get(IdxTy, 0);
9002 auto *StartV = Plan.getOrAddLiveIn(StartIdx);
9003
9004 // Add a VPCanonicalIVPHIRecipe starting at 0 to the header.
9005 auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL);
9006 VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
9007 VPBasicBlock *Header = TopRegion->getEntryBasicBlock();
9008 Header->insert(CanonicalIVPHI, Header->begin());
9009
9010 VPBuilder Builder(TopRegion->getExitingBasicBlock());
9011 // Add a VPInstruction to increment the scalar canonical IV by VF * UF.
9012 auto *CanonicalIVIncrement = Builder.createOverflowingOp(
9013 Instruction::Add, {CanonicalIVPHI, &Plan.getVFxUF()}, {HasNUW, false}, DL,
9014 "index.next");
9015 CanonicalIVPHI->addOperand(CanonicalIVIncrement);
9016
9017 // Add the BranchOnCount VPInstruction to the latch.
9019 {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL);
9020}
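// Illustrative sketch (not part of the original source): the scalar control
// flow modeled by the canonical IV recipes above, assuming a non-zero vector
// trip count that is a multiple of VF * UF (as guaranteed by the
// minimum-iteration check). Names are hypothetical.
static void canonicalIVSketch(unsigned VFxUF, unsigned VectorTripCount) {
  unsigned Index = 0;                 // VPCanonicalIVPHIRecipe, starts at 0.
  do {
    // Vector loop body generated from the VPlan region.
    Index += VFxUF;                   // index.next = index + VF * UF.
  } while (Index != VectorTripCount); // BranchOnCount(index.next, vector TC).
}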
9021
9022/// Create and return a ResumePhi for \p WideIV, unless it is truncated. If the
9023/// induction recipe is not canonical, creates a VPDerivedIVRecipe to compute
9024/// the end value of the induction.
9026 VPBuilder &VectorPHBuilder,
9027 VPBuilder &ScalarPHBuilder,
9028 VPTypeAnalysis &TypeInfo,
9029 VPValue *VectorTC) {
9030 auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
9031 // Truncated wide inductions resume from the last lane of their vector value
9032 // in the last vector iteration which is handled elsewhere.
9033 if (WideIntOrFp && WideIntOrFp->getTruncInst())
9034 return nullptr;
9035
9036 VPValue *Start = WideIV->getStartValue();
9037 VPValue *Step = WideIV->getStepValue();
9039 VPValue *EndValue = VectorTC;
9040 if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
9041 EndValue = VectorPHBuilder.createDerivedIV(
9042 ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
9043 Start, VectorTC, Step);
9044 }
9045
9046 // EndValue is derived from the vector trip count (which has the same type as
9047 // the widest induction) and thus may be wider than the induction here.
9048 Type *ScalarTypeOfWideIV = TypeInfo.inferScalarType(WideIV);
9049 if (ScalarTypeOfWideIV != TypeInfo.inferScalarType(EndValue)) {
9050 EndValue = VectorPHBuilder.createScalarCast(Instruction::Trunc, EndValue,
9051 ScalarTypeOfWideIV,
9052 WideIV->getDebugLoc());
9053 }
9054
9055 auto *ResumePhiRecipe =
9056 ScalarPHBuilder.createNaryOp(VPInstruction::ResumePhi, {EndValue, Start},
9057 WideIV->getDebugLoc(), "bc.resume.val");
9058 return ResumePhiRecipe;
9059}
9060
9061/// Create resume phis in the scalar preheader for first-order recurrences,
9062/// reductions and inductions, and update the VPIRInstructions wrapping the
9063/// original phis in the scalar header.
9064static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan) {
9065 VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType());
9066 auto *ScalarPH = Plan.getScalarPreheader();
9067 auto *MiddleVPBB = cast<VPBasicBlock>(ScalarPH->getSinglePredecessor());
9068 VPBuilder VectorPHBuilder(
9069 cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSinglePredecessor()));
9070 VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
9071 VPBuilder ScalarPHBuilder(ScalarPH);
9072 VPValue *OneVPV = Plan.getOrAddLiveIn(
9073 ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 1));
9074 for (VPRecipeBase &ScalarPhiR : *Plan.getScalarHeader()) {
9075 auto *ScalarPhiIRI = cast<VPIRInstruction>(&ScalarPhiR);
9076 auto *ScalarPhiI = dyn_cast<PHINode>(&ScalarPhiIRI->getInstruction());
9077 if (!ScalarPhiI)
9078 break;
9079
9080 auto *VectorPhiR = cast<VPHeaderPHIRecipe>(Builder.getRecipe(ScalarPhiI));
9081 if (auto *WideIVR = dyn_cast<VPWidenInductionRecipe>(VectorPhiR)) {
9082 if (VPValue *ResumePhi = addResumePhiRecipeForInduction(
9083 WideIVR, VectorPHBuilder, ScalarPHBuilder, TypeInfo,
9084 &Plan.getVectorTripCount())) {
9085 ScalarPhiIRI->addOperand(ResumePhi);
9086 continue;
9087 }
9088 // TODO: Also handle truncated inductions here. Computing end-values
9089 // separately should be done as VPlan-to-VPlan optimization, after
9090 // legalizing all resume values to use the last lane from the loop.
9091 assert(cast<VPWidenIntOrFpInductionRecipe>(VectorPhiR)->getTruncInst() &&
9092 "should only skip truncated wide inductions");
9093 continue;
9094 }
9095
9096 // The backedge value provides the value to resume coming out of a loop,
9097 // which for FORs is a vector whose last element needs to be extracted. The
9098 // start value provides the value if the loop is bypassed.
9099 bool IsFOR = isa<VPFirstOrderRecurrencePHIRecipe>(VectorPhiR);
9100 auto *ResumeFromVectorLoop = VectorPhiR->getBackedgeValue();
9101 if (IsFOR)
9102 ResumeFromVectorLoop = MiddleBuilder.createNaryOp(
9103 VPInstruction::ExtractFromEnd, {ResumeFromVectorLoop, OneVPV}, {},
9104 "vector.recur.extract");
9105 StringRef Name = IsFOR ? "scalar.recur.init" : "bc.merge.rdx";
9106 auto *ResumePhiR = ScalarPHBuilder.createNaryOp(
9108 {ResumeFromVectorLoop, VectorPhiR->getStartValue()}, {}, Name);
9109 ScalarPhiIRI->addOperand(ResumePhiR);
9110 }
9111}
9112
9113/// Return true if \p VPV is an optimizable IV or IV use. That is, if \p VPV is
9114/// either an untruncated wide induction, or if it increments a wide induction
9115/// by its step.
9117 VPRecipeBase *Def = VPV->getDefiningRecipe();
9118 if (!Def)
9119 return false;
9120 auto *WideIV = dyn_cast<VPWidenInductionRecipe>(Def);
9121 if (WideIV) {
9122 // VPV itself is a wide induction; separately compute the end value for exit
9123 // users if it is not a truncated IV.
9124 return isa<VPWidenPointerInductionRecipe>(WideIV) ||
9125 !cast<VPWidenIntOrFpInductionRecipe>(WideIV)->getTruncInst();
9126 }
9127
9128 // Check if VPV is an optimizable induction increment.
9129 if (Def->getNumOperands() != 2)
9130 return false;
9131 WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(0));
9132 if (!WideIV)
9133 WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(1));
9134 if (!WideIV)
9135 return false;
9136
9137 using namespace VPlanPatternMatch;
9138 auto &ID = WideIV->getInductionDescriptor();
9139
9140 // Check if VPV increments the induction by the induction step.
9141 VPValue *IVStep = WideIV->getStepValue();
9142 switch (ID.getInductionOpcode()) {
9143 case Instruction::Add:
9144 return match(VPV, m_c_Binary<Instruction::Add>(m_Specific(WideIV),
9145 m_Specific(IVStep)));
9146 case Instruction::FAdd:
9147 return match(VPV, m_c_Binary<Instruction::FAdd>(m_Specific(WideIV),
9148 m_Specific(IVStep)));
9149 case Instruction::FSub:
9150 return match(VPV, m_Binary<Instruction::FSub>(m_Specific(WideIV),
9151 m_Specific(IVStep)));
9152 case Instruction::Sub: {
9153 // IVStep will be the negated step of the subtraction. Check if Step == -1 *
9154 // IVStep.
9155 VPValue *Step;
9156 if (!match(VPV, m_Binary<Instruction::Sub>(m_VPValue(), m_VPValue(Step))) ||
9157 !Step->isLiveIn() || !IVStep->isLiveIn())
9158 return false;
9159 auto *StepCI = dyn_cast<ConstantInt>(Step->getLiveInIRValue());
9160 auto *IVStepCI = dyn_cast<ConstantInt>(IVStep->getLiveInIRValue());
9161 return StepCI && IVStepCI &&
9162 StepCI->getValue() == (-1 * IVStepCI->getValue());
9163 }
9164 default:
9165 return ID.getKind() == InductionDescriptor::IK_PtrInduction &&
9166 match(VPV, m_GetElementPtr(m_Specific(WideIV),
9167 m_Specific(WideIV->getStepValue())));
9168 }
9169 llvm_unreachable("should have been covered by switch above");
9170}
9171
9172// Collect VPIRInstructions for phis in the exit blocks that are modeled
9173// in VPlan and add the exiting VPValue as operand. Some exiting values are not
9174// modeled explicitly yet and won't be included. Those are un-truncated
9175// VPWidenIntOrFpInductionRecipe, VPWidenPointerInductionRecipe and induction
9176// increments.
9177static SetVector<VPIRInstruction *>
9178collectUsersInExitBlocks(Loop *OrigLoop, VPRecipeBuilder &Builder,
9179 VPlan &Plan) {
9180 auto *MiddleVPBB = Plan.getMiddleBlock();
9181 SetVector<VPIRInstruction *> ExitUsersToFix;
9182 for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks()) {
9183 for (VPRecipeBase &R : *ExitVPBB) {
9184 auto *ExitIRI = dyn_cast<VPIRInstruction>(&R);
9185 if (!ExitIRI)
9186 continue;
9187 auto *ExitPhi = dyn_cast<PHINode>(&ExitIRI->getInstruction());
9188 if (!ExitPhi)
9189 break;
9190 for (VPBlockBase *PredVPBB : ExitVPBB->getPredecessors()) {
9191 BasicBlock *ExitingBB = OrigLoop->getLoopLatch();
9192 if (PredVPBB != MiddleVPBB) {
9193 SmallVector<BasicBlock *> ExitingBlocks;
9194 OrigLoop->getExitingBlocks(ExitingBlocks);
9195 assert(ExitingBlocks.size() == 2 && "only support 2 exiting blocks");
9196 ExitingBB = ExitingBB == ExitingBlocks[0] ? ExitingBlocks[1]
9197 : ExitingBlocks[0];
9198 }
9199 Value *IncomingValue = ExitPhi->getIncomingValueForBlock(ExitingBB);
9200 VPValue *V = Builder.getVPValueOrAddLiveIn(IncomingValue);
9201 // Exit values for inductions are computed and updated outside of VPlan
9202 // and independent of induction recipes.
9203 // TODO: Compute induction exit values in VPlan.
9204 if (isOptimizableIVOrUse(V) &&
9205 ExitVPBB->getSinglePredecessor() == MiddleVPBB)
9206 continue;
9207 ExitUsersToFix.insert(ExitIRI);
9208 ExitIRI->addOperand(V);
9209 }
9210 }
9211 }
9212 return ExitUsersToFix;
9213}
9214
9215// Add exit values to \p Plan. Extracts are added for each entry in \p
9216// ExitUsersToFix if needed and their operands are updated. Returns true if all
9217 // exit users can be handled, otherwise returns false.
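// For illustration (conceptual only, names invented): a vector-loop definition
// %v used by an LCSSA phi in the exit block gets an
//   ExtractFromEnd(%v, 1)
// added to the middle block (the last lane of the final vector iteration), and
// the VPIRInstruction modeling the phi is updated to use that extract instead.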
9218static bool
9219addUsersInExitBlocks(VPlan &Plan,
9220 const SetVector<VPIRInstruction *> &ExitUsersToFix) {
9221 if (ExitUsersToFix.empty())
9222 return true;
9223
9224 auto *MiddleVPBB = Plan.getMiddleBlock();
9225 VPBuilder B(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
9226
9227 // Introduce extract for exiting values and update the VPIRInstructions
9228 // modeling the corresponding LCSSA phis.
9229 for (VPIRInstruction *ExitIRI : ExitUsersToFix) {
9230 for (const auto &[Idx, Op] : enumerate(ExitIRI->operands())) {
9231 // Pass live-in values used by exit phis directly through to their users
9232 // in the exit block.
9233 if (Op->isLiveIn())
9234 continue;
9235
9236 // Currently, only live-ins can be used by exit values of blocks that do not
9237 // exit via the vector latch to the middle block.
9238 if (ExitIRI->getParent()->getSinglePredecessor() != MiddleVPBB)
9239 return false;
9240
9241 LLVMContext &Ctx = ExitIRI->getInstruction().getContext();
9242 VPValue *Ext = B.createNaryOp(VPInstruction::ExtractFromEnd,
9243 {Op, Plan.getOrAddLiveIn(ConstantInt::get(
9244 IntegerType::get(Ctx, 32), 1))});
9245 ExitIRI->setOperand(Idx, Ext);
9246 }
9247 }
9248 return true;
9249}
9250
9251/// Handle users in the exit block for first-order recurrences in the original
9252/// exit block. The penultimate value of recurrences is fed to their LCSSA phi
9253/// users in the original exit block via the VPIRInstruction wrapping the
9254/// LCSSA phi.
9255static void addExitUsersForFirstOrderRecurrences(
9256 VPlan &Plan, SetVector<VPIRInstruction *> &ExitUsersToFix) {
9257 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
9258 auto *ScalarPHVPBB = Plan.getScalarPreheader();
9259 auto *MiddleVPBB = Plan.getMiddleBlock();
9260 VPBuilder ScalarPHBuilder(ScalarPHVPBB);
9261 VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
9262 VPValue *TwoVPV = Plan.getOrAddLiveIn(
9263 ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 2));
9264
9265 for (auto &HeaderPhi : VectorRegion->getEntryBasicBlock()->phis()) {
9266 auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&HeaderPhi);
9267 if (!FOR)
9268 continue;
9269
9270 // This is the second phase of vectorizing first-order recurrences, creating
9271 // extracts for users outside the loop. An overview of the transformation is
9272 // described below. Suppose we have the following loop, with a use of the
9273 // last a[i-1] after the loop:
9274 //
9275 // for (int i = 0; i < n; ++i) {
9276 // t = a[i - 1];
9277 // b[i] = a[i] - t;
9278 // }
9279 // use t;
9280 //
9281 // There is a first-order recurrence on "a". For this loop, the shorthand
9282 // scalar IR looks like:
9283 //
9284 // scalar.ph:
9285 // s.init = a[-1]
9286 // br scalar.body
9287 //
9288 // scalar.body:
9289 // i = phi [0, scalar.ph], [i+1, scalar.body]
9290 // s1 = phi [s.init, scalar.ph], [s2, scalar.body]
9291 // s2 = a[i]
9292 // b[i] = s2 - s1
9293 // br cond, scalar.body, exit.block
9294 //
9295 // exit.block:
9296 // use = lcssa.phi [s1, scalar.body]
9297 //
9298 // In this example, s1 is a recurrence because its value depends on the
9299 // previous iteration. In the first phase of vectorization, we created a
9300 // VPFirstOrderRecurrencePHIRecipe v1 for s1. Now we create the extracts
9301 // for users in the scalar preheader and exit block.
9302 //
9303 // vector.ph:
9304 // v_init = vector(..., ..., ..., a[-1])
9305 // br vector.body
9306 //
9307 // vector.body
9308 // i = phi [0, vector.ph], [i+4, vector.body]
9309 // v1 = phi [v_init, vector.ph], [v2, vector.body]
9310 // v2 = a[i, i+1, i+2, i+3]
9311 // b[i] = v2 - v1
9312 // // Next, third phase will introduce v1' = splice(v1(3), v2(0, 1, 2))
9313 // b[i, i+1, i+2, i+3] = v2 - v1
9314 // br cond, vector.body, middle.block
9315 //
9316 // middle.block:
9317 // vector.recur.extract.for.phi = v2(2)
9318 // vector.recur.extract = v2(3)
9319 // br cond, scalar.ph, exit.block
9320 //
9321 // scalar.ph:
9322 // scalar.recur.init = phi [vector.recur.extract, middle.block],
9323 // [s.init, otherwise]
9324 // br scalar.body
9325 //
9326 // scalar.body:
9327 // i = phi [0, scalar.ph], [i+1, scalar.body]
9328 // s1 = phi [scalar.recur.init, scalar.ph], [s2, scalar.body]
9329 // s2 = a[i]
9330 // b[i] = s2 - s1
9331 // br cond, scalar.body, exit.block
9332 //
9333 // exit.block:
9334 // use = lcssa.phi [s1, scalar.body],
9335 // [vector.recur.extract.for.phi, middle.block]
9336 //
9337 // Now update VPIRInstructions modeling LCSSA phis in the exit block.
9338 // Extract the penultimate value of the recurrence and use it as operand for
9339 // the VPIRInstruction modeling the phi.
9340 for (VPIRInstruction *ExitIRI : ExitUsersToFix) {
9341 if (ExitIRI->getOperand(0) != FOR)
9342 continue;
9343 VPValue *PenultimateElement = MiddleBuilder.createNaryOp(
9344 VPInstruction::ExtractFromEnd, {FOR->getBackedgeValue(), TwoVPV}, {},
9345 "vector.recur.extract.for.phi");
9346 ExitIRI->setOperand(0, PenultimateElement);
9347 ExitUsersToFix.remove(ExitIRI);
9348 }
9349 }
9350}
9351
9352VPlanPtr
9353LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
9354
9356
9357 // ---------------------------------------------------------------------------
9358 // Build initial VPlan: Scan the body of the loop in a topological order to
9359 // visit each basic block after having visited its predecessor basic blocks.
9360 // ---------------------------------------------------------------------------
9361
9362 // Create initial VPlan skeleton, having a basic block for the pre-header
9363 // which contains SCEV expansions that need to happen before the CFG is
9364 // modified; a basic block for the vector pre-header, followed by a region for
9365 // the vector loop, followed by the middle basic block. The skeleton vector
9366 // loop region contains a header and latch basic blocks.
9367
9368 bool RequiresScalarEpilogueCheck =
9369 LoopVectorizationPlanner::getDecisionAndClampRange(
9370 [this](ElementCount VF) {
9371 return !CM.requiresScalarEpilogue(VF.isVector());
9372 },
9373 Range);
9374 VPlanPtr Plan = VPlan::createInitialVPlan(Legal->getWidestInductionType(),
9375 PSE, RequiresScalarEpilogueCheck,
9376 CM.foldTailByMasking(), OrigLoop);
9377
9378 // Don't use getDecisionAndClampRange here, because we don't know the UF
9379 // so this function is better to be conservative, rather than to split
9380 // it up into different VPlans.
9381 // TODO: Consider using getDecisionAndClampRange here to split up VPlans.
9382 bool IVUpdateMayOverflow = false;
9383 for (ElementCount VF : Range)
9384 IVUpdateMayOverflow |= !isIndvarOverflowCheckKnownFalse(&CM, VF);
9385
9386 DebugLoc DL = getDebugLocFromInstOrOperands(Legal->getPrimaryInduction());
9387 TailFoldingStyle Style = CM.getTailFoldingStyle(IVUpdateMayOverflow);
9388 // Use NUW for the induction increment if we proved that it won't overflow in
9389 // the vector loop or when not folding the tail. In the latter case, we know
9390 // that the canonical induction increment will not overflow as the vector trip
9391 // count is >= increment and a multiple of the increment.
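 // For example (illustrative): with VF = 4 and a vector trip count of 8, the
 // canonical IV takes the values 0, 4 and 8 before exiting, so the increment
 // never steps past the trip count and cannot wrap.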
9392 bool HasNUW = !IVUpdateMayOverflow || Style == TailFoldingStyle::None;
9393 addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, DL);
9394
9395 VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, &TTI, Legal, CM, PSE,
9396 Builder);
9397
9398 // ---------------------------------------------------------------------------
9399 // Pre-construction: record ingredients whose recipes we'll need to further
9400 // process after constructing the initial VPlan.
9401 // ---------------------------------------------------------------------------
9402
9403 // For each interleave group which is relevant for this (possibly trimmed)
9404 // Range, add it to the set of groups to be later applied to the VPlan and add
9405 // placeholders for its members' Recipes which we'll be replacing with a
9406 // single VPInterleaveRecipe.
9407 for (const auto *IG : IAI.getInterleaveGroups()) {
9408 auto ApplyIG = [IG, this](ElementCount VF) -> bool {
9409 bool Result = (VF.isVector() && // Query is illegal for VF == 1
9410 CM.getWideningDecision(IG->getInsertPos(), VF) ==
9411 LoopVectorizationCostModel::CM_Interleave);
9412 // For scalable vectors, the only interleave factor currently supported
9413 // is 2 since we require the (de)interleave2 intrinsics instead of
9414 // shufflevectors.
9415 assert((!Result || !VF.isScalable() || IG->getFactor() == 2) &&
9416 "Unsupported interleave factor for scalable vectors");
9417 return Result;
9418 };
9419 if (!getDecisionAndClampRange(ApplyIG, Range))
9420 continue;
9421 InterleaveGroups.insert(IG);
9422 }
9423
9424 // ---------------------------------------------------------------------------
9425 // Construct recipes for the instructions in the loop
9426 // ---------------------------------------------------------------------------
9427
9428 // Scan the body of the loop in a topological order to visit each basic block
9429 // after having visited its predecessor basic blocks.
9430 LoopBlocksDFS DFS(OrigLoop);
9431 DFS.perform(LI);
9432
9433 VPBasicBlock *HeaderVPBB = Plan->getVectorLoopRegion()->getEntryBasicBlock();
9434 VPBasicBlock *VPBB = HeaderVPBB;
9435 BasicBlock *HeaderBB = OrigLoop->getHeader();
9436 bool NeedsMasks =
9437 CM.foldTailByMasking() ||
9438 any_of(OrigLoop->blocks(), [this, HeaderBB](BasicBlock *BB) {
9439 bool NeedsBlends = BB != HeaderBB && !BB->phis().empty();
9440 return Legal->blockNeedsPredication(BB) || NeedsBlends;
9441 });
9442
9443 RecipeBuilder.collectScaledReductions(Range);
9444
9445 auto *MiddleVPBB = Plan->getMiddleBlock();
9446 VPBasicBlock::iterator MBIP = MiddleVPBB->getFirstNonPhi();
9447 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
9448 // Relevant instructions from basic block BB will be grouped into VPRecipe
9449 // ingredients and fill a new VPBasicBlock.
9450 if (VPBB != HeaderVPBB)
9451 VPBB->setName(BB->getName());
9452 Builder.setInsertPoint(VPBB);
9453
9454 if (VPBB == HeaderVPBB)
9455 RecipeBuilder.createHeaderMask();
9456 else if (NeedsMasks)
9457 RecipeBuilder.createBlockInMask(BB);
9458
9459 // Introduce each ingredient into VPlan.
9460 // TODO: Model and preserve debug intrinsics in VPlan.
9461 for (Instruction &I : drop_end(BB->instructionsWithoutDebug(false))) {
9462 Instruction *Instr = &I;
9463 SmallVector<VPValue *, 4> Operands;
9464 auto *Phi = dyn_cast<PHINode>(Instr);
9465 if (Phi && Phi->getParent() == HeaderBB) {
9466 Operands.push_back(Plan->getOrAddLiveIn(
9467 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())));
9468 } else {
9469 auto OpRange = RecipeBuilder.mapToVPValues(Instr->operands());
9470 Operands = {OpRange.begin(), OpRange.end()};
9471 }
9472
9473 // The stores with invariant address inside the loop will be deleted, and
9474 // in the exit block, a uniform store recipe will be created for the final
9475 // invariant store of the reduction.
9476 StoreInst *SI;
9477 if ((SI = dyn_cast<StoreInst>(&I)) &&
9478 Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) {
9479 // Only create recipe for the final invariant store of the reduction.
9480 if (!Legal->isInvariantStoreOfReduction(SI))
9481 continue;
9482 auto *Recipe = new VPReplicateRecipe(
9483 SI, RecipeBuilder.mapToVPValues(Instr->operands()),
9484 true /* IsUniform */);
9485 Recipe->insertBefore(*MiddleVPBB, MBIP);
9486 continue;
9487 }
9488
9489 VPRecipeBase *Recipe =
9490 RecipeBuilder.tryToCreateWidenRecipe(Instr, Operands, Range, VPBB);
9491 if (!Recipe)
9492 Recipe = RecipeBuilder.handleReplication(Instr, Range);
9493
9494 RecipeBuilder.setRecipe(Instr, Recipe);
9495 if (isa<VPHeaderPHIRecipe>(Recipe)) {
9496 // VPHeaderPHIRecipes must be kept in the phi section of HeaderVPBB. In
9497 // the following cases, VPHeaderPHIRecipes may be created after non-phi
9498 // recipes and need to be moved to the phi section of HeaderVPBB:
9499 // * tail-folding (non-phi recipes computing the header mask are
9500 // introduced earlier than regular header phi recipes, and should appear
9501 // after them)
9502 // * Optimizing truncates to VPWidenIntOrFpInductionRecipe.
9503
9504 assert((HeaderVPBB->getFirstNonPhi() == VPBB->end() ||
9505 CM.foldTailByMasking() || isa<TruncInst>(Instr)) &&
9506 "unexpected recipe needs moving");
9507 Recipe->insertBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi());
9508 } else
9509 VPBB->appendRecipe(Recipe);
9510 }
9511
9512 VPBlockUtils::insertBlockAfter(Plan->createVPBasicBlock(""), VPBB);
9513 VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor());
9514 }
9515
9516 // After here, VPBB should not be used.
9517 VPBB = nullptr;
9518
9519 assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) &&
9520 !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() &&
9521 "entry block must be set to a VPRegionBlock having a non-empty entry "
9522 "VPBasicBlock");
9523 RecipeBuilder.fixHeaderPhis();
9524
9525 // Update wide induction increments to use the same step as the corresponding
9526 // wide induction. This enables detecting induction increments directly in
9527 // VPlan and removes redundant splats.
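 // For example (illustrative): if the scalar increment is "%iv.next = add %iv,
 // %n" and its recipe currently uses a separate live-in for %n, pointing the
 // recipe's second operand at the wide induction's step VPValue lets later
 // transforms recognize the pair as induction plus increment without
 // materializing (splatting) the step twice.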
9528 for (const auto &[Phi, ID] : Legal->getInductionVars()) {
9529 auto *IVInc = cast<Instruction>(
9530 Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch()));
9531 if (IVInc->getOperand(0) != Phi || IVInc->getOpcode() != Instruction::Add)
9532 continue;
9533 VPWidenInductionRecipe *WideIV =
9534 cast<VPWidenInductionRecipe>(RecipeBuilder.getRecipe(Phi));
9535 VPRecipeBase *R = RecipeBuilder.getRecipe(IVInc);
9536 R->setOperand(1, WideIV->getStepValue());
9537 }
9538
9539 if (auto *UncountableExitingBlock =
9540 Legal->getUncountableEarlyExitingBlock()) {
9541 VPlanTransforms::handleUncountableEarlyExit(
9542 *Plan, *PSE.getSE(), OrigLoop, UncountableExitingBlock, RecipeBuilder);
9543 }
9544 addScalarResumePhis(RecipeBuilder, *Plan);
9545 SetVector<VPIRInstruction *> ExitUsersToFix =
9546 collectUsersInExitBlocks(OrigLoop, RecipeBuilder, *Plan);
9547 addExitUsersForFirstOrderRecurrences(*Plan, ExitUsersToFix);
9548 if (!addUsersInExitBlocks(*Plan, ExitUsersToFix)) {
9550 "Some exit values in loop with uncountable exit not supported yet",
9551 "UncountableEarlyExitLoopsUnsupportedExitValue", ORE, OrigLoop);
9552 return nullptr;
9553 }
9554
9555 // ---------------------------------------------------------------------------
9556 // Transform initial VPlan: Apply previously taken decisions, in order, to
9557 // bring the VPlan to its final state.
9558 // ---------------------------------------------------------------------------
9559
9560 // Adjust the recipes for any inloop reductions.
9561 adjustRecipesForReductions(Plan, RecipeBuilder, Range.Start);
9562
9563 // Interleave memory: for each Interleave Group we marked earlier as relevant
9564 // for this VPlan, replace the Recipes widening its memory instructions with a
9565 // single VPInterleaveRecipe at its insertion point.
9566 VPlanTransforms::createInterleaveGroups(
9567 *Plan, InterleaveGroups, RecipeBuilder, CM.isScalarEpilogueAllowed());
9568
9569 for (ElementCount VF : Range)
9570 Plan->addVF(VF);
9571 Plan->setName("Initial VPlan");
9572
9573 // Replace VPValues for known constant strides guaranteed by predicate scalar
9574 // evolution.
9575 auto CanUseVersionedStride = [&Plan](VPUser &U, unsigned) {
9576 auto *R = cast<VPRecipeBase>(&U);
9577 return R->getParent()->getParent() ||
9578 R->getParent() ==
9579 Plan->getVectorLoopRegion()->getSinglePredecessor();
9580 };
9581 for (auto [_, Stride] : Legal->getLAI()->getSymbolicStrides()) {
9582 auto *StrideV = cast<SCEVUnknown>(Stride)->getValue();
9583 auto *ScevStride = dyn_cast<SCEVConstant>(PSE.getSCEV(StrideV));
9584 // Only handle constant strides for now.
9585 if (!ScevStride)
9586 continue;
9587
9588 auto *CI = Plan->getOrAddLiveIn(
9589 ConstantInt::get(Stride->getType(), ScevStride->getAPInt()));
9590 if (VPValue *StrideVPV = Plan->getLiveIn(StrideV))
9591 StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
9592
9593 // The versioned value may not be used in the loop directly but through a
9594 // sext/zext. Add new live-ins in those cases.
9595 for (Value *U : StrideV->users()) {
9596 if (!isa<SExtInst, ZExtInst>(U))
9597 continue;
9598 VPValue *StrideVPV = Plan->getLiveIn(U);
9599 if (!StrideVPV)
9600 continue;
9601 unsigned BW = U->getType()->getScalarSizeInBits();
9602 APInt C = isa<SExtInst>(U) ? ScevStride->getAPInt().sext(BW)
9603 : ScevStride->getAPInt().zext(BW);
9604 VPValue *CI = Plan->getOrAddLiveIn(ConstantInt::get(U->getType(), C));
9605 StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
9606 }
9607 }
9608
9609 VPlanTransforms::dropPoisonGeneratingRecipes(*Plan, [this](BasicBlock *BB) {
9610 return Legal->blockNeedsPredication(BB);
9611 });
9612
9613 // Sink users of fixed-order recurrence past the recipe defining the previous
9614 // value and introduce FirstOrderRecurrenceSplice VPInstructions.
9615 if (!VPlanTransforms::adjustFixedOrderRecurrences(*Plan, Builder))
9616 return nullptr;
9617
9618 if (useActiveLaneMask(Style)) {
9619 // TODO: Move checks to VPlanTransforms::addActiveLaneMask once
9620 // TailFoldingStyle is visible there.
9621 bool ForControlFlow = useActiveLaneMaskForControlFlow(Style);
9622 bool WithoutRuntimeCheck =
9623 Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
9624 VPlanTransforms::addActiveLaneMask(*Plan, ForControlFlow,
9625 WithoutRuntimeCheck);
9626 }
9627
9628 assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
9629 return Plan;
9630}
9631
9632VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
9633 // Outer loop handling: outer loops may require CFG and instruction level
9634 // transformations before even evaluating whether vectorization is profitable.
9635 // Since we cannot modify the incoming IR, we need to build VPlan upfront in
9636 // the vectorization pipeline.
9637 assert(!OrigLoop->isInnermost());
9638 assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
9639
9640 // Create new empty VPlan
9641 auto Plan = VPlan::createInitialVPlan(Legal->getWidestInductionType(), PSE,
9642 true, false, OrigLoop);
9643
9644 // Build hierarchical CFG
9645 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
9646 HCFGBuilder.buildHierarchicalCFG();
9647
9648 for (ElementCount VF : Range)
9649 Plan->addVF(VF);
9650
9651 VPlanTransforms::VPInstructionsToVPRecipes(
9652 Plan,
9653 [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); },
9654 *PSE.getSE(), *TLI);
9655
9656 // Remove the existing terminator of the exiting block of the top-most region.
9657 // A BranchOnCount will be added instead when adding the canonical IV recipes.
9658 auto *Term =
9659 Plan->getVectorLoopRegion()->getExitingBasicBlock()->getTerminator();
9660 Term->eraseFromParent();
9661
9662 // Tail folding is not supported for outer loops, so the induction increment
9663 // is guaranteed to not wrap.
9664 bool HasNUW = true;
9665 addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW,
9666 DebugLoc());
9667
9668 // Collect mapping of IR header phis to header phi recipes, to be used in
9669 // addScalarResumePhis.
9670 VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, &TTI, Legal, CM, PSE,
9671 Builder);
9672 for (auto &R : Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
9673 if (isa<VPCanonicalIVPHIRecipe>(&R))
9674 continue;
9675 auto *HeaderR = cast<VPHeaderPHIRecipe>(&R);
9676 RecipeBuilder.setRecipe(HeaderR->getUnderlyingInstr(), HeaderR);
9677 }
9678 addScalarResumePhis(RecipeBuilder, *Plan);
9679
9680 assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
9681 return Plan;
9682}
9683
9684// Adjust the recipes for reductions. For in-loop reductions the chain of
9685 // instructions leading from the loop exit instr to the phi needs to be converted
9686// to reductions, with one operand being vector and the other being the scalar
9687// reduction chain. For other reductions, a select is introduced between the phi
9688// and users outside the vector region when folding the tail.
9689//
9690// A ComputeReductionResult recipe is added to the middle block, also for
9691// in-loop reductions which compute their result in-loop, because generating
9692// the subsequent bc.merge.rdx phi is driven by ComputeReductionResult recipes.
9693//
9694// Adjust AnyOf reductions; replace the reduction phi for the selected value
9695// with a boolean reduction phi node to check if the condition is true in any
9696// iteration. The final value is selected by the final ComputeReductionResult.
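// For illustration (simplified; not the exact textual VPlan output): an
// in-loop integer add reduction
//   red = phi [ start ], [ red.next ]
//   red.next = add red, <widened a[i]>
// has the add along the chain replaced by a VPReductionRecipe taking the
// scalar chain value (red), the widened vector operand and, if the block is
// predicated, the block-in mask as condition; a ComputeReductionResult in the
// middle block then produces the final scalar value.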
9697void LoopVectorizationPlanner::adjustRecipesForReductions(
9698 VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, ElementCount MinVF) {
9699 using namespace VPlanPatternMatch;
9700 VPRegionBlock *VectorLoopRegion = Plan->getVectorLoopRegion();
9701 VPBasicBlock *Header = VectorLoopRegion->getEntryBasicBlock();
9702 VPBasicBlock *MiddleVPBB = Plan->getMiddleBlock();
9703 SmallVector<VPRecipeBase *> ToDelete;
9704
9705 for (VPRecipeBase &R : Header->phis()) {
9706 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
9707 if (!PhiR || !PhiR->isInLoop() || (MinVF.isScalar() && !PhiR->isOrdered()))
9708 continue;
9709
9710 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
9711 RecurKind Kind = RdxDesc.getRecurrenceKind();
9712 assert(
9715 "AnyOf and FindLast reductions are not allowed for in-loop reductions");
9716
9717 // Collect the chain of "link" recipes for the reduction starting at PhiR.
9718 SetVector<VPSingleDefRecipe *> Worklist;
9719 Worklist.insert(PhiR);
9720 for (unsigned I = 0; I != Worklist.size(); ++I) {
9721 VPSingleDefRecipe *Cur = Worklist[I];
9722 for (VPUser *U : Cur->users()) {
9723 auto *UserRecipe = cast<VPSingleDefRecipe>(U);
9724 if (!UserRecipe->getParent()->getEnclosingLoopRegion()) {
9725 assert((UserRecipe->getParent() == MiddleVPBB ||
9726 UserRecipe->getParent() == Plan->getScalarPreheader()) &&
9727 "U must be either in the loop region, the middle block or the "
9728 "scalar preheader.");
9729 continue;
9730 }
9731 Worklist.insert(UserRecipe);
9732 }
9733 }
9734
9735 // Visit operation "Links" along the reduction chain top-down starting from
9736 // the phi until LoopExitValue. We keep track of the previous item
9737 // (PreviousLink) to tell which of the two operands of a Link will remain
9738 // scalar and which will be reduced. For minmax by select(cmp), Link will be
9739 // the select instructions. Blend recipes of in-loop reduction phi's will
9740 // get folded to their non-phi operand, as the reduction recipe handles the
9741 // condition directly.
9742 VPSingleDefRecipe *PreviousLink = PhiR; // Aka Worklist[0].
9743 for (VPSingleDefRecipe *CurrentLink : Worklist.getArrayRef().drop_front()) {
9744 Instruction *CurrentLinkI = CurrentLink->getUnderlyingInstr();
9745
9746 // Index of the first operand which holds a non-mask vector operand.
9747 unsigned IndexOfFirstOperand;
9748 // Recognize a call to the llvm.fmuladd intrinsic.
9749 bool IsFMulAdd = (Kind == RecurKind::FMulAdd);
9750 VPValue *VecOp;
9751 VPBasicBlock *LinkVPBB = CurrentLink->getParent();
9752 if (IsFMulAdd) {
9753 assert(
9755 "Expected instruction to be a call to the llvm.fmuladd intrinsic");
9756 assert(((MinVF.isScalar() && isa<VPReplicateRecipe>(CurrentLink)) ||
9757 isa<VPWidenIntrinsicRecipe>(CurrentLink)) &&
9758 CurrentLink->getOperand(2) == PreviousLink &&
9759 "expected a call where the previous link is the added operand");
9760
9761 // If the instruction is a call to the llvm.fmuladd intrinsic then we
9762 // need to create an fmul recipe (multiplying the first two operands of
9763 // the fmuladd together) to use as the vector operand for the fadd
9764 // reduction.
9765 VPInstruction *FMulRecipe = new VPInstruction(
9766 Instruction::FMul,
9767 {CurrentLink->getOperand(0), CurrentLink->getOperand(1)},
9768 CurrentLinkI->getFastMathFlags());
9769 LinkVPBB->insert(FMulRecipe, CurrentLink->getIterator());
9770 VecOp = FMulRecipe;
9771 } else {
9772 auto *Blend = dyn_cast<VPBlendRecipe>(CurrentLink);
9773 if (PhiR->isInLoop() && Blend) {
9774 assert(Blend->getNumIncomingValues() == 2 &&
9775 "Blend must have 2 incoming values");
9776 if (Blend->getIncomingValue(0) == PhiR)
9777 Blend->replaceAllUsesWith(Blend->getIncomingValue(1));
9778 else {
9779 assert(Blend->getIncomingValue(1) == PhiR &&
9780 "PhiR must be an operand of the blend");
9781 Blend->replaceAllUsesWith(Blend->getIncomingValue(0));
9782 }
9783 continue;
9784 }
9785
9786 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9787 if (isa<VPWidenRecipe>(CurrentLink)) {
9788 assert(isa<CmpInst>(CurrentLinkI) &&
9789 "need to have the compare of the select");
9790 continue;
9791 }
9792 assert(isa<VPWidenSelectRecipe>(CurrentLink) &&
9793 "must be a select recipe");
9794 IndexOfFirstOperand = 1;
9795 } else {
9796 assert((MinVF.isScalar() || isa<VPWidenRecipe>(CurrentLink)) &&
9797 "Expected to replace a VPWidenSC");
9798 IndexOfFirstOperand = 0;
9799 }
9800 // Note that for non-commutable operands (cmp-selects), the semantics of
9801 // the cmp-select are captured in the recurrence kind.
9802 unsigned VecOpId =
9803 CurrentLink->getOperand(IndexOfFirstOperand) == PreviousLink
9804 ? IndexOfFirstOperand + 1
9805 : IndexOfFirstOperand;
9806 VecOp = CurrentLink->getOperand(VecOpId);
9807 assert(VecOp != PreviousLink &&
9808 CurrentLink->getOperand(CurrentLink->getNumOperands() - 1 -
9809 (VecOpId - IndexOfFirstOperand)) ==
9810 PreviousLink &&
9811 "PreviousLink must be the operand other than VecOp");
9812 }
9813
9814 BasicBlock *BB = CurrentLinkI->getParent();
9815 VPValue *CondOp = nullptr;
9816 if (CM.blockNeedsPredicationForAnyReason(BB))
9817 CondOp = RecipeBuilder.getBlockInMask(BB);
9818
9819 auto *RedRecipe = new VPReductionRecipe(
9820 RdxDesc, CurrentLinkI, PreviousLink, VecOp, CondOp,
9821 CM.useOrderedReductions(RdxDesc), CurrentLinkI->getDebugLoc());
9822 // Append the recipe to the end of the VPBasicBlock because we need to
9823 // ensure that it comes after all of its inputs, including CondOp.
9824 // Delete CurrentLink as it will be invalid if its operand is replaced
9825 // with a reduction defined at the bottom of the block in the next link.
9826 LinkVPBB->appendRecipe(RedRecipe);
9827 CurrentLink->replaceAllUsesWith(RedRecipe);
9828 ToDelete.push_back(CurrentLink);
9829 PreviousLink = RedRecipe;
9830 }
9831 }
9832 VPBasicBlock *LatchVPBB = VectorLoopRegion->getExitingBasicBlock();
9833 Builder.setInsertPoint(&*LatchVPBB->begin());
9834 VPBasicBlock::iterator IP = MiddleVPBB->getFirstNonPhi();
9835 for (VPRecipeBase &R :
9836 Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
9837 VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
9838 if (!PhiR)
9839 continue;
9840
9841 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
9842 // If tail is folded by masking, introduce selects between the phi
9843 // and the users outside the vector region of each reduction, at the
9844 // beginning of the dedicated latch block.
9845 auto *OrigExitingVPV = PhiR->getBackedgeValue();
9846 auto *NewExitingVPV = PhiR->getBackedgeValue();
9847 if (!PhiR->isInLoop() && CM.foldTailByMasking()) {
9848 VPValue *Cond = RecipeBuilder.getBlockInMask(OrigLoop->getHeader());
9849 assert(OrigExitingVPV->getDefiningRecipe()->getParent() != LatchVPBB &&
9850 "reduction recipe must be defined before latch");
9851 Type *PhiTy = PhiR->getOperand(0)->getLiveInIRValue()->getType();
9852 std::optional<FastMathFlags> FMFs =
9853 PhiTy->isFloatingPointTy()
9854 ? std::make_optional(RdxDesc.getFastMathFlags())
9855 : std::nullopt;
9856 NewExitingVPV =
9857 Builder.createSelect(Cond, OrigExitingVPV, PhiR, {}, "", FMFs);
9858 OrigExitingVPV->replaceUsesWithIf(NewExitingVPV, [](VPUser &U, unsigned) {
9859 return isa<VPInstruction>(&U) &&
9860 cast<VPInstruction>(&U)->getOpcode() ==
9861 VPInstruction::ComputeReductionResult;
9862 });
9863 if (CM.usePredicatedReductionSelect(
9864 PhiR->getRecurrenceDescriptor().getOpcode(), PhiTy))
9865 PhiR->setOperand(1, NewExitingVPV);
9866 }
9867
9868 // If the vector reduction can be performed in a smaller type, we truncate
9869 // then extend the loop exit value to enable InstCombine to evaluate the
9870 // entire expression in the smaller type.
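 // For example (illustrative): an i32 add reduction whose values are known to
 // fit in i8 gets a trunc of the exit value to i8 followed by a sext/zext back
 // to i32, allowing InstCombine to narrow the whole reduction chain to i8.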
9871 Type *PhiTy = PhiR->getStartValue()->getLiveInIRValue()->getType();
9872 if (MinVF.isVector() && PhiTy != RdxDesc.getRecurrenceType() &&
9873 !RecurrenceDescriptor::isAnyOfRecurrenceKind(
9874 RdxDesc.getRecurrenceKind())) {
9875 assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
9876 Type *RdxTy = RdxDesc.getRecurrenceType();
9877 auto *Trunc =
9878 new VPWidenCastRecipe(Instruction::Trunc, NewExitingVPV, RdxTy);
9879 auto *Extnd =
9880 RdxDesc.isSigned()
9881 ? new VPWidenCastRecipe(Instruction::SExt, Trunc, PhiTy)
9882 : new VPWidenCastRecipe(Instruction::ZExt, Trunc, PhiTy);
9883
9884 Trunc->insertAfter(NewExitingVPV->getDefiningRecipe());
9885 Extnd->insertAfter(Trunc);
9886 if (PhiR->getOperand(1) == NewExitingVPV)
9887 PhiR->setOperand(1, Extnd->getVPSingleValue());
9888 NewExitingVPV = Extnd;
9889 }
9890
9891 // We want code in the middle block to appear to execute on the location of
9892 // the scalar loop's latch terminator because: (a) it is all compiler
9893 // generated, (b) these instructions are always executed after evaluating
9894 // the latch conditional branch, and (c) other passes may add new
9895 // predecessors which terminate on this line. This is the easiest way to
9896 // ensure we don't accidentally cause an extra step back into the loop while
9897 // debugging.
9898 DebugLoc ExitDL = OrigLoop->getLoopLatch()->getTerminator()->getDebugLoc();
9899
9900 // TODO: At the moment ComputeReductionResult also drives creation of the
9901 // bc.merge.rdx phi nodes, hence it needs to be created unconditionally here
9902 // even for in-loop reductions, until the reduction resume value handling is
9903 // also modeled in VPlan.
9904 auto *FinalReductionResult = new VPInstruction(
9905 VPInstruction::ComputeReductionResult, {PhiR, NewExitingVPV}, ExitDL);
9906 // Update all users outside the vector region.
9907 OrigExitingVPV->replaceUsesWithIf(
9908 FinalReductionResult, [](VPUser &User, unsigned) {
9909 auto *Parent = cast<VPRecipeBase>(&User)->getParent();
9910 return Parent && !Parent->getParent();
9911 });
9912 FinalReductionResult->insertBefore(*MiddleVPBB, IP);
9913
9914 // Adjust AnyOf reductions; replace the reduction phi for the selected value
9915 // with a boolean reduction phi node to check if the condition is true in
9916 // any iteration. The final value is selected by the final
9917 // ComputeReductionResult.
9918 if (RecurrenceDescriptor::isAnyOfRecurrenceKind(
9919 RdxDesc.getRecurrenceKind())) {
9920 auto *Select = cast<VPRecipeBase>(*find_if(PhiR->users(), [](VPUser *U) {
9921 return isa<VPWidenSelectRecipe>(U) ||
9922 (isa<VPReplicateRecipe>(U) &&
9923 cast<VPReplicateRecipe>(U)->getUnderlyingInstr()->getOpcode() ==
9924 Instruction::Select);
9925 }));
9926 VPValue *Cmp = Select->getOperand(0);
9927 // If the compare is checking the reduction PHI node, adjust it to check
9928 // the start value.
9929 if (VPRecipeBase *CmpR = Cmp->getDefiningRecipe()) {
9930 for (unsigned I = 0; I != CmpR->getNumOperands(); ++I)
9931 if (CmpR->getOperand(I) == PhiR)
9932 CmpR->setOperand(I, PhiR->getStartValue());
9933 }
9934 VPBuilder::InsertPointGuard Guard(Builder);
9935 Builder.setInsertPoint(Select);
9936
9937 // If the true value of the select is the reduction phi, the new value is
9938 // selected if the negated condition is true in any iteration.
9939 if (Select->getOperand(1) == PhiR)
9940 Cmp = Builder.createNot(Cmp);
9941 VPValue *Or = Builder.createOr(PhiR, Cmp);
9942 Select->getVPSingleValue()->replaceAllUsesWith(Or);
9943 // Delete Select now that it has invalid types.
9944 ToDelete.push_back(Select);
9945
9946 // Convert the reduction phi to operate on bools.
9947 PhiR->setOperand(0, Plan->getOrAddLiveIn(ConstantInt::getFalse(
9948 OrigLoop->getHeader()->getContext())));
9949 continue;
9950 }
9951
9952 if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(
9953 RdxDesc.getRecurrenceKind())) {
9954 // Adjust the start value for FindLastIV recurrences to use the sentinel
9955 // value after generating the ResumePhi recipe, which uses the original
9956 // start value.
9957 PhiR->setOperand(0, Plan->getOrAddLiveIn(RdxDesc.getSentinelValue()));
9958 }
9959 }
9960
9962 for (VPRecipeBase *R : ToDelete)
9963 R->eraseFromParent();
9964}
9965
9966void VPDerivedIVRecipe::execute(VPTransformState &State) {
9967 assert(!State.Lane && "VPDerivedIVRecipe being replicated.");
9968
9969 // Fast-math-flags propagate from the original induction instruction.
9970 IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
9971 if (FPBinOp)
9972 State.Builder.setFastMathFlags(FPBinOp->getFastMathFlags());
9973
9974 Value *Step = State.get(getStepValue(), VPLane(0));
9975 Value *Index = State.get(getOperand(1), VPLane(0));
9976 Value *DerivedIV = emitTransformedIndex(
9977 State.Builder, Index, getStartValue()->getLiveInIRValue(), Step, Kind,
9978 cast_if_present<BinaryOperator>(FPBinOp));
9979 DerivedIV->setName(Name);
9980 // If index is the vector trip count, the concrete value will only be set in
9981 // prepareToExecute, leading to missed simplifications, e.g. if it is 0.
9982 // TODO: Remove the special case for the vector trip count once it is computed
9983 // in VPlan and can be used during VPlan simplification.
9984 assert((DerivedIV != Index ||
9985 getOperand(1) == &getParent()->getPlan()->getVectorTripCount()) &&
9986 "IV didn't need transforming?");
9987 State.set(this, DerivedIV, VPLane(0));
9988}
9989
9990void VPReplicateRecipe::execute(VPTransformState &State) {
9991 Instruction *UI = getUnderlyingInstr();
9992 if (State.Lane) { // Generate a single instance.
9993 assert((State.VF.isScalar() || !isUniform()) &&
9994 "uniform recipe shouldn't be predicated");
9995 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
9996 State.ILV->scalarizeInstruction(UI, this, *State.Lane, State);
9997 // Insert scalar instance packing it into a vector.
9998 if (State.VF.isVector() && shouldPack()) {
9999 // If we're constructing lane 0, initialize to start from poison.
10000 if (State.Lane->isFirstLane()) {
10001 assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
10002 Value *Poison = PoisonValue::get(
10003 VectorType::get(UI->getType(), State.VF));
10004 State.set(this, Poison);
10005 }
10006 State.packScalarIntoVectorValue(this, *State.Lane);
10007 }
10008 return;
10009 }
10010
10011 if (IsUniform) {
10012 // Uniform within VL means we need to generate lane 0.
10013 State.ILV->scalarizeInstruction(UI, this, VPLane(0), State);
10014 return;
10015 }
10016
10017 // A store of a loop varying value to a uniform address only needs the last
10018 // copy of the store.
10019 if (isa<StoreInst>(UI) &&
10020 vputils::isUniformAfterVectorization(getOperand(1))) {
10021 auto Lane = VPLane::getLastLaneForVF(State.VF);
10022 State.ILV->scalarizeInstruction(UI, this, VPLane(Lane), State);
10023 return;
10024 }
10025
10026 // Generate scalar instances for all VF lanes.
10027 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
10028 const unsigned EndLane = State.VF.getKnownMinValue();
10029 for (unsigned Lane = 0; Lane < EndLane; ++Lane)
10030 State.ILV->scalarizeInstruction(UI, this, VPLane(Lane), State);
10031}
10032
10033// Determine how to lower the scalar epilogue, which depends on 1) optimising
10034// for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
10035// predication, and 4) a TTI hook that analyses whether the loop is suitable
10036// for predication.
10037static ScalarEpilogueLowering getScalarEpilogueLowering(
10038 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
10039 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
10040 LoopVectorizationLegality &LVL, InterleavedAccessInfo *IAI) {
10041 // 1) OptSize takes precedence over all other options, i.e. if this is set,
10042 // don't look at hints or options, and don't request a scalar epilogue.
10043 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
10044 // LoopAccessInfo (due to code dependency and not being able to reliably get
10045 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
10046 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
10047 // versioning when the vectorization is forced, unlike hasOptSize. So revert
10048 // back to the old way and vectorize with versioning when forced. See D81345.)
10049 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
10053
10054 // 2) If set, obey the directives
10055 if (PreferPredicateOverEpilogue.getNumOccurrences()) {
10063 };
10064 }
10065
10066 // 3) If set, obey the hints
10067 switch (Hints.getPredicate()) {
10072 };
10073
10074 // 4) if the TTI hook indicates this is profitable, request predication.
10075 TailFoldingInfo TFI(TLI, &LVL, IAI);
10076 if (TTI->preferPredicateOverEpilogue(&TFI))
10077 return CM_ScalarEpilogueNotNeededUsePredicate;
10078
10079 return CM_ScalarEpilogueAllowed;
10080}
10081
10082// Process the loop in the VPlan-native vectorization path. This path builds
10083// VPlan upfront in the vectorization pipeline, which allows to apply
10084// VPlan-to-VPlan transformations from the very beginning without modifying the
10085// input LLVM IR.
10092 LoopVectorizationRequirements &Requirements) {
10093
10094 if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
10095 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
10096 return false;
10097 }
10098 assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
10099 Function *F = L->getHeader()->getParent();
10100 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
10101
10102 ScalarEpilogueLowering SEL =
10103 getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, *LVL, &IAI);
10104
10105 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
10106 &Hints, IAI);
10107 // Use the planner for outer loop vectorization.
10108 // TODO: CM is not used at this point inside the planner. Turn CM into an
10109 // optional argument if we don't need it in the future.
10110 LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, LVL, CM, IAI, PSE, Hints,
10111 ORE);
10112
10113 // Get user vectorization factor.
10114 ElementCount UserVF = Hints.getWidth();
10115
10117
10118 // Plan how to best vectorize, return the best VF and its cost.
10119 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
10120
10121 // If we are stress testing VPlan builds, do not attempt to generate vector
10122 // code. Masked vector code generation support will follow soon.
10123 // Also, do not attempt to vectorize if no vector code will be produced.
10124 if (VPlanBuildStressTest || VectorizationFactor::Disabled() == VF)
10125 return false;
10126
10127 VPlan &BestPlan = LVP.getPlanFor(VF.Width);
10128
10129 {
10130 bool AddBranchWeights =
10131 hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
10132 GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(),
10133 AddBranchWeights);
10134 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
10135 VF.Width, 1, LVL, &CM, BFI, PSI, Checks, BestPlan);
10136 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
10137 << L->getHeader()->getParent()->getName() << "\"\n");
10138 LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false);
10139 }
10140
10141 reportVectorization(ORE, L, VF, 1);
10142
10143 // Mark the loop as already vectorized to avoid vectorizing again.
10144 Hints.setAlreadyVectorized();
10145 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
10146 return true;
10147}
10148
10149// Emit a remark if there are stores to floats that required a floating point
10150// extension. If the vectorized loop was generated with floating point there
10151// will be a performance penalty from the conversion overhead and the change in
10152// the vector width.
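// For example (illustrative C source): with "float *a, *b; double c;" the
// store in "b[i] = (float)(a[i] * c);" forces a[i] to be extended to double,
// so the vectorized loop mixes <N x float> and <N x double>, and the fpext is
// what triggers this remark.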
10153static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
10154 SmallVector<Instruction *, 4> Worklist;
10155 for (BasicBlock *BB : L->getBlocks()) {
10156 for (Instruction &Inst : *BB) {
10157 if (auto *S = dyn_cast<StoreInst>(&Inst)) {
10158 if (S->getValueOperand()->getType()->isFloatTy())
10159 Worklist.push_back(S);
10160 }
10161 }
10162 }
10163
10164 // Traverse the floating point stores upwards, searching for floating point
10165 // conversions.
10166 SmallPtrSet<const Instruction *, 4> Visited;
10167 SmallPtrSet<const Instruction *, 4> EmittedRemark;
10168 while (!Worklist.empty()) {
10169 auto *I = Worklist.pop_back_val();
10170 if (!L->contains(I))
10171 continue;
10172 if (!Visited.insert(I).second)
10173 continue;
10174
10175 // Emit a remark if the floating point store required a floating
10176 // point conversion.
10177 // TODO: More work could be done to identify the root cause such as a
10178 // constant or a function return type and point the user to it.
10179 if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second)
10180 ORE->emit([&]() {
10181 return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision",
10182 I->getDebugLoc(), L->getHeader())
10183 << "floating point conversion changes vector width. "
10184 << "Mixed floating point precision requires an up/down "
10185 << "cast that will negatively impact performance.";
10186 });
10187
10188 for (Use &Op : I->operands())
10189 if (auto *OpI = dyn_cast<Instruction>(Op))
10190 Worklist.push_back(OpI);
10191 }
10192}
10193
10194static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
10195 VectorizationFactor &VF, Loop *L,
10196 const TargetTransformInfo &TTI,
10197 PredicatedScalarEvolution &PSE,
10198 ScalarEpilogueLowering SEL) {
10199 InstructionCost CheckCost = Checks.getCost();
10200 if (!CheckCost.isValid())
10201 return false;
10202
10203 // When interleaving only, the scalar and vector cost will be equal, which
10204 // in turn would lead to a divide by 0. Fall back to a hard threshold.
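 // (For illustration: with VF = 1 and equal costs, the denominator
 // ScalarC * VF - VecC used in the MinTC1 computation below would be 0.)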
10205 if (VF.Width.isScalar()) {
10206 if (CheckCost > VectorizeMemoryCheckThreshold) {
10207 LLVM_DEBUG(
10208 dbgs()
10209 << "LV: Interleaving only is not profitable due to runtime checks\n");
10210 return false;
10211 }
10212 return true;
10213 }
10214
10215 // The scalar cost should only be 0 when vectorizing with a user specified VF/IC. In those cases, runtime checks should always be generated.
10216 uint64_t ScalarC = *VF.ScalarCost.getValue();
10217 if (ScalarC == 0)
10218 return true;
10219
10220 // First, compute the minimum iteration count required so that the vector
10221 // loop outperforms the scalar loop.
10222 // The total cost of the scalar loop is
10223 // ScalarC * TC
10224 // where
10225 // * TC is the actual trip count of the loop.
10226 // * ScalarC is the cost of a single scalar iteration.
10227 //
10228 // The total cost of the vector loop is
10229 // RtC + VecC * (TC / VF) + EpiC
10230 // where
10231 // * RtC is the cost of the generated runtime checks
10232 // * VecC is the cost of a single vector iteration.
10233 // * TC is the actual trip count of the loop
10234 // * VF is the vectorization factor
10235 // * EpiCost is the cost of the generated epilogue, including the cost
10236 // of the remaining scalar operations.
10237 //
10238 // Vectorization is profitable once the total vector cost is less than the
10239 // total scalar cost:
10240 // RtC + VecC * (TC / VF) + EpiC < ScalarC * TC
10241 //
10242 // Now we can compute the minimum required trip count TC as
10243 // VF * (RtC + EpiC) / (ScalarC * VF - VecC) < TC
10244 //
10245 // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that
10246 // the computations are performed on doubles, not integers and the result
10247 // is rounded up, hence we get an upper estimate of the TC.
10248 unsigned IntVF = getEstimatedRuntimeVF(L, TTI, VF.Width);
10249 uint64_t RtC = *CheckCost.getValue();
10250 uint64_t Div = ScalarC * IntVF - *VF.Cost.getValue();
10251 uint64_t MinTC1 = Div == 0 ? 0 : divideCeil(RtC * IntVF, Div);
10252
10253 // Second, compute a minimum iteration count so that the cost of the
10254 // runtime checks is only a fraction of the total scalar loop cost. This
10255 // adds a loop-dependent bound on the overhead incurred if the runtime
10256 // checks fail. In case the runtime checks fail, the cost is RtC + ScalarC
10257 // * TC. To bound the runtime check to be a fraction 1/X of the scalar
10258 // cost, compute
10259 // RtC < ScalarC * TC * (1 / X) ==> RtC * X / ScalarC < TC
10260 uint64_t MinTC2 = divideCeil(RtC * 10, ScalarC);
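 // Worked example (illustrative numbers only): with ScalarC = 4, VecC = 10,
 // IntVF = 4 and RtC = 20:
 //   MinTC1 = ceil(20 * 4 / (4 * 4 - 10)) = ceil(80 / 6)  = 14
 //   MinTC2 = ceil(20 * 10 / 4)           = ceil(200 / 4) = 50
 // so runtime checks only pay off from a trip count of max(14, 50) = 50
 // (rounded up to a multiple of VF, i.e. 52, if a scalar epilogue is allowed).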
10261
10262 // Now pick the larger minimum. If it is not a multiple of VF and a scalar
10263 // epilogue is allowed, choose the next closest multiple of VF. This should
10264 // partly compensate for ignoring the epilogue cost.
10265 uint64_t MinTC = std::max(MinTC1, MinTC2);
10266 if (SEL == CM_ScalarEpilogueAllowed)
10267 MinTC = alignTo(MinTC, IntVF);
10268 VF.MinProfitableTripCount = ElementCount::getFixed(MinTC);
10269
10270 LLVM_DEBUG(
10271 dbgs() << "LV: Minimum required TC for runtime checks to be profitable:"
10272 << VF.MinProfitableTripCount << "\n");
10273
10274 // Skip vectorization if the expected trip count is less than the minimum
10275 // required trip count.
10276 if (auto ExpectedTC = getSmallBestKnownTC(PSE, L)) {
10279 LLVM_DEBUG(dbgs() << "LV: Vectorization is not beneficial: expected "
10280 "trip count < minimum profitable VF ("
10281 << *ExpectedTC << " < " << VF.MinProfitableTripCount
10282 << ")\n");
10283
10284 return false;
10285 }
10286 }
10287 return true;
10288}
10289
10290LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
10291 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
10292 !EnableLoopInterleaving),
10293 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
10294 !EnableLoopVectorization) {}
10295
10296/// Prepare \p MainPlan for vectorizing the main vector loop during epilogue
10297/// vectorization. Remove ResumePhis from \p MainPlan for inductions that
10298/// don't have a corresponding wide induction in \p EpiPlan.
10299static void preparePlanForMainVectorLoop(VPlan &MainPlan, VPlan &EpiPlan) {
10300 // Collect PHI nodes of widened phis in the VPlan for the epilogue. Those
10301 // will need their resume-values computed in the main vector loop. Others
10302 // can be removed from the main VPlan.
10303 SmallPtrSet<PHINode *, 2> EpiWidenedPhis;
10304 for (VPRecipeBase &R :
10305 EpiPlan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
10306 if (isa<VPCanonicalIVPHIRecipe>(&R))
10307 continue;
10308 EpiWidenedPhis.insert(
10309 cast<PHINode>(R.getVPSingleValue()->getUnderlyingValue()));
10310 }
10311 for (VPRecipeBase &R : make_early_inc_range(
10312 *cast<VPIRBasicBlock>(MainPlan.getScalarHeader()))) {
10313 auto *VPIRInst = cast<VPIRInstruction>(&R);
10314 auto *IRI = dyn_cast<PHINode>(&VPIRInst->getInstruction());
10315 if (!IRI)
10316 break;
10317 if (EpiWidenedPhis.contains(IRI))
10318 continue;
10319 // There is no corresponding wide induction in the epilogue plan that would
10320 // need a resume value. Remove the VPIRInst wrapping the scalar header phi
10321 // together with the corresponding ResumePhi. The resume values for the
10322 // scalar loop will be created during execution of EpiPlan.
10323 VPRecipeBase *ResumePhi = VPIRInst->getOperand(0)->getDefiningRecipe();
10324 VPIRInst->eraseFromParent();
10325 ResumePhi->eraseFromParent();
10326 }
10327 VPlanTransforms::removeDeadRecipes(MainPlan);
10328
10329 using namespace VPlanPatternMatch;
10330 VPBasicBlock *MainScalarPH = MainPlan.getScalarPreheader();
10331 VPValue *VectorTC = &MainPlan.getVectorTripCount();
10332 // If there is a suitable resume value for the canonical induction in the
10333 // scalar (which will become vector) epilogue loop we are done. Otherwise
10334 // create it below.
10335 if (any_of(*MainScalarPH, [VectorTC](VPRecipeBase &R) {
10336 return match(&R, m_VPInstruction<VPInstruction::ResumePhi>(
10337 m_Specific(VectorTC), m_SpecificInt(0)));
10338 }))
10339 return;
10340 VPBuilder ScalarPHBuilder(MainScalarPH, MainScalarPH->begin());
10341 ScalarPHBuilder.createNaryOp(
10342 VPInstruction::ResumePhi,
10343 {VectorTC, MainPlan.getCanonicalIV()->getStartValue()}, {},
10344 "vec.epilog.resume.val");
10345}
10346
10347/// Prepare \p Plan for vectorizing the epilogue loop. That is, re-use expanded
10348/// SCEVs from \p ExpandedSCEVs and set resume values for header recipes.
10349static void
10350preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L,
10351 const SCEV2ValueTy &ExpandedSCEVs,
10352 const EpilogueLoopVectorizationInfo &EPI) {
10353 VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
10354 VPBasicBlock *Header = VectorLoop->getEntryBasicBlock();
10355 Header->setName("vec.epilog.vector.body");
10356
10357 // Re-use the trip count and steps expanded for the main loop, as
10358 // skeleton creation needs it as a value that dominates both the scalar
10359 // and vector epilogue loops
10360 // TODO: This is a workaround needed for epilogue vectorization and it
10361 // should be removed once induction resume value creation is done
10362 // directly in VPlan.
10363 for (auto &R : make_early_inc_range(*Plan.getEntry())) {
10364 auto *ExpandR = dyn_cast<VPExpandSCEVRecipe>(&R);
10365 if (!ExpandR)
10366 continue;
10367 auto *ExpandedVal =
10368 Plan.getOrAddLiveIn(ExpandedSCEVs.find(ExpandR->getSCEV())->second);
10369 ExpandR->replaceAllUsesWith(ExpandedVal);
10370 if (Plan.getTripCount() == ExpandR)
10371 Plan.resetTripCount(ExpandedVal);
10372 ExpandR->eraseFromParent();
10373 }
10374
10375 // Ensure that the start values for all header phi recipes are updated before
10376 // vectorizing the epilogue loop.
10377 for (VPRecipeBase &R : Header->phis()) {
10378 if (auto *IV = dyn_cast<VPCanonicalIVPHIRecipe>(&R)) {
10379 // When vectorizing the epilogue loop, the canonical induction start
10380 // value needs to be changed from zero to the value after the main
10381 // vector loop. Find the resume value created during execution of the main
10382 // VPlan.
10383 // FIXME: Improve modeling for canonical IV start values in the epilogue
10384 // loop.
10385 BasicBlock *MainMiddle = find_singleton<BasicBlock>(
10386 predecessors(L->getLoopPreheader()),
10387 [&EPI](BasicBlock *BB, bool) -> BasicBlock * {
10388 if (BB != EPI.MainLoopIterationCountCheck &&
10389 BB != EPI.EpilogueIterationCountCheck &&
10390 BB != EPI.SCEVSafetyCheck && BB != EPI.MemSafetyCheck)
10391 return BB;
10392 return nullptr;
10393 });
10394 using namespace llvm::PatternMatch;
10395 Type *IdxTy = IV->getScalarType();
10396 PHINode *EPResumeVal = find_singleton<PHINode>(
10397 L->getLoopPreheader()->phis(),
10398 [&EPI, IdxTy, MainMiddle](PHINode &P, bool) -> PHINode * {
10399 if (P.getType() == IdxTy &&
10400 P.getIncomingValueForBlock(MainMiddle) == EPI.VectorTripCount &&
10401 match(
10402 P.getIncomingValueForBlock(EPI.MainLoopIterationCountCheck),
10403 m_SpecificInt(0)))
10404 return &P;
10405 return nullptr;
10406 });
10407 assert(EPResumeVal && "must have a resume value for the canonical IV");
10408 VPValue *VPV = Plan.getOrAddLiveIn(EPResumeVal);
10409 assert(all_of(IV->users(),
10410 [](const VPUser *U) {
10411 return isa<VPScalarIVStepsRecipe>(U) ||
10412 isa<VPScalarCastRecipe>(U) ||
10413 isa<VPDerivedIVRecipe>(U) ||
10414 cast<VPInstruction>(U)->getOpcode() ==
10415 Instruction::Add;
10416 }) &&
10417 "the canonical IV should only be used by its increment or "
10418 "ScalarIVSteps when resetting the start value");
10419 IV->setOperand(0, VPV);
10420 continue;
10421 }
10422
10423 Value *ResumeV = nullptr;
10424 // TODO: Move setting of resume values to prepareToExecute.
10425 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) {
10426 ResumeV = cast<PHINode>(ReductionPhi->getUnderlyingInstr())
10427 ->getIncomingValueForBlock(L->getLoopPreheader());
10428 const RecurrenceDescriptor &RdxDesc =
10429 ReductionPhi->getRecurrenceDescriptor();
10430 RecurKind RK = RdxDesc.getRecurrenceKind();
10431 if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) {
10432 // VPReductionPHIRecipes for AnyOf reductions expect a boolean as
10433 // start value; compare the final value from the main vector loop
10434 // to the start value.
10435 IRBuilder<> Builder(
10436 cast<Instruction>(ResumeV)->getParent()->getFirstNonPHI());
10437 ResumeV =
10438 Builder.CreateICmpNE(ResumeV, RdxDesc.getRecurrenceStartValue());
10439 } else if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK)) {
10440 // VPReductionPHIRecipe for FindLastIV reductions requires an adjustment
10441 // to the resume value. The resume value is adjusted to the sentinel
10442 // value when the final value from the main vector loop equals the start
10443 // value. This ensures correctness when the start value might not be
10444 // less than the minimum value of a monotonically increasing induction
10445 // variable.
10446 IRBuilder<> Builder(
10447 cast<Instruction>(ResumeV)->getParent()->getFirstNonPHI());
10448 Value *Cmp =
10449 Builder.CreateICmpEQ(ResumeV, RdxDesc.getRecurrenceStartValue());
10450 ResumeV =
10451 Builder.CreateSelect(Cmp, RdxDesc.getSentinelValue(), ResumeV);
10452 }
10453 } else {
10454 // Retrieve the induction resume values for wide inductions from
10455 // their original phi nodes in the scalar loop.
10456 PHINode *IndPhi = cast<VPWidenInductionRecipe>(&R)->getPHINode();
10457 // Hook up to the PHINode generated by a ResumePhi recipe of main
10458 // loop VPlan, which feeds the scalar loop.
10459 ResumeV = IndPhi->getIncomingValueForBlock(L->getLoopPreheader());
10460 }
10461 assert(ResumeV && "Must have a resume value");
10462 VPValue *StartVal = Plan.getOrAddLiveIn(ResumeV);
10463 cast<VPHeaderPHIRecipe>(&R)->setStartValue(StartVal);
10464 }
10465}
10466
10467bool LoopVectorizePass::processLoop(Loop *L) {
10468 assert((EnableVPlanNativePath || L->isInnermost()) &&
10469 "VPlan-native path is not enabled. Only process inner loops.");
10470
10471 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '"
10472 << L->getHeader()->getParent()->getName() << "' from "
10473 << L->getLocStr() << "\n");
10474
10475 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI);
10476
10477 LLVM_DEBUG(
10478 dbgs() << "LV: Loop hints:"
10479 << " force="
10481 ? "disabled"
10483 ? "enabled"
10484 : "?"))
10485 << " width=" << Hints.getWidth()
10486 << " interleave=" << Hints.getInterleave() << "\n");
10487
10488 // Function containing loop
10489 Function *F = L->getHeader()->getParent();
10490
10491 // Looking at the diagnostic output is the only way to determine if a loop
10492 // was vectorized (other than looking at the IR or machine code), so it
10493 // is important to generate an optimization remark for each loop. Most of
10494 // these messages are generated as OptimizationRemarkAnalysis. Remarks
10495 // generated as OptimizationRemark and OptimizationRemarkMissed are
10496 // less verbose reporting vectorized loops and unvectorized loops that may
10497 // benefit from vectorization, respectively.
10498
10499 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
10500 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
10501 return false;
10502 }
10503
10504 PredicatedScalarEvolution PSE(*SE, *L);
10505
10506 // Check if it is legal to vectorize the loop.
10507 LoopVectorizationRequirements Requirements;
10508 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, F, *LAIs, LI, ORE,
10509 &Requirements, &Hints, DB, AC, BFI, PSI);
10510 if (!LVL.canVectorize(EnableVPlanNativePath)) {
10511 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
10512 Hints.emitRemarkWithHints();
10513 return false;
10514 }
10515
10516 if (LVL.hasUncountableEarlyExit() && !EnableEarlyExitVectorization) {
10517 reportVectorizationFailure("Auto-vectorization of loops with uncountable "
10518 "early exit is not enabled",
10519 "UncountableEarlyExitLoopsDisabled", ORE, L);
10520 return false;
10521 }
10522
10523 if (LVL.hasStructVectorCall()) {
10524 reportVectorizationFailure("Auto-vectorization of calls that return struct "
10525 "types is not yet supported",
10526 "StructCallVectorizationUnsupported", ORE, L);
10527 return false;
10528 }
10529
10530 // Entrance to the VPlan-native vectorization path. Outer loops are processed
10531 // here. They may require CFG and instruction level transformations before
10532 // even evaluating whether vectorization is profitable. Since we cannot modify
10533 // the incoming IR, we need to build VPlan upfront in the vectorization
10534 // pipeline.
10535 if (!L->isInnermost())
10536 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
10537 ORE, BFI, PSI, Hints, Requirements);
10538
10539 assert(L->isInnermost() && "Inner loop expected.");
10540
10541 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
10542 bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
10543
10544 // If an override option has been passed in for interleaved accesses, use it.
10545 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
10546 UseInterleaved = EnableInterleavedMemAccesses;
10547
10548 // Analyze interleaved memory accesses.
10549 if (UseInterleaved)
10550 IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
10551
10552 if (LVL.hasUncountableEarlyExit()) {
10553 BasicBlock *LoopLatch = L->getLoopLatch();
10554 if (IAI.requiresScalarEpilogue() ||
10556 [LoopLatch](BasicBlock *BB) { return BB != LoopLatch; })) {
10557 reportVectorizationFailure("Auto-vectorization of early exit loops "
10558 "requiring a scalar epilogue is unsupported",
10559 "UncountableEarlyExitUnsupported", ORE, L);
10560 return false;
10561 }
10562 }
10563
10564 // Check the function attributes and profiles to find out if this function
10565 // should be optimized for size.
10566 ScalarEpilogueLowering SEL =
10567 getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, LVL, &IAI);
10568
10569 // Check the loop for a trip count threshold: vectorize loops with a tiny trip
10570 // count by optimizing for size, to minimize overheads.
10571 auto ExpectedTC = getSmallBestKnownTC(PSE, L);
10572 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
10573 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
10574 << "This loop is worth vectorizing only if no scalar "
10575 << "iteration overheads are incurred.");
10576 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
10577 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
10578 else {
10579 if (*ExpectedTC > TTI->getMinTripCountTailFoldingThreshold()) {
10580 LLVM_DEBUG(dbgs() << "\n");
10581 // Predicate tail-folded loops are efficient even when the loop
10582 // iteration count is low. However, setting the epilogue policy to
10583 // `CM_ScalarEpilogueNotAllowedLowTripLoop` prevents vectorizing loops
10584 // with runtime checks. It's more effective to let
10585 // `areRuntimeChecksProfitable` determine if vectorization is beneficial
10586 // for the loop.
10587 if (SEL != CM_ScalarEpilogueNotNeededUsePredicate)
10588 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
10589 } else {
10590 LLVM_DEBUG(dbgs() << " But the target considers the trip count too "
10591 "small to consider vectorizing.\n");
10593 "The trip count is below the minial threshold value.",
10594 "loop trip count is too low, avoiding vectorization",
10595 "LowTripCount", ORE, L);
10596 Hints.emitRemarkWithHints();
10597 return false;
10598 }
10599 }
10600 }
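// Illustrative example (not from this file): TinyTripCountVectorThreshold
// corresponds to the -vectorizer-min-trip-count option (default 16), so the
// threshold used above can be lowered for testing, e.g.:
//
//   opt -passes=loop-vectorize -vectorizer-min-trip-count=4 in.ll -S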
10601
10602 // Check the function attributes to see if implicit floats or vectors are
10603 // allowed.
10604 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
10606 "Can't vectorize when the NoImplicitFloat attribute is used",
10607 "loop not vectorized due to NoImplicitFloat attribute",
10608 "NoImplicitFloat", ORE, L);
10609 Hints.emitRemarkWithHints();
10610 return false;
10611 }
10612
10613 // Check if the target supports potentially unsafe FP vectorization.
10614 // FIXME: Add a check for the type of safety issue (denormal, signaling)
10615 // for the target we're vectorizing for, to make sure none of the
10616 // additional fp-math flags can help.
10617 if (Hints.isPotentiallyUnsafe() &&
10620 "Potentially unsafe FP op prevents vectorization",
10621 "loop not vectorized due to unsafe FP support.",
10622 "UnsafeFP", ORE, L);
10623 Hints.emitRemarkWithHints();
10624 return false;
10625 }
10626
10627 bool AllowOrderedReductions;
10628 // If the flag is set, use that instead and override the TTI behaviour.
10629 if (ForceOrderedReductions.getNumOccurrences() > 0)
10630 AllowOrderedReductions = ForceOrderedReductions;
10631 else
10632 AllowOrderedReductions = TTI->enableOrderedReductions();
10633 if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) {
10634 ORE->emit([&]() {
10635 auto *ExactFPMathInst = Requirements.getExactFPInst();
10636 return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
10637 ExactFPMathInst->getDebugLoc(),
10638 ExactFPMathInst->getParent())
10639 << "loop not vectorized: cannot prove it is safe to reorder "
10640 "floating-point operations";
10641 });
10642 LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to "
10643 "reorder floating-point operations\n");
10644 Hints.emitRemarkWithHints();
10645 return false;
10646 }
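// Illustrative example (not from this file): on targets that do not enable
// ordered reductions, in-order (strict) FP reductions can still be requested
// via the ForceOrderedReductions flag consulted above, e.g.:
//
//   opt -passes=loop-vectorize -force-ordered-reductions in.ll -S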
10647
10648 // Use the cost model.
10649 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
10650 F, &Hints, IAI);
10651 // Use the planner for vectorization.
10652 LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, &LVL, CM, IAI, PSE, Hints,
10653 ORE);
10654
10655 // Get user vectorization factor and interleave count.
10656 ElementCount UserVF = Hints.getWidth();
10657 unsigned UserIC = Hints.getInterleave();
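// Illustrative example (not from this file): UserVF and UserIC typically
// originate from source-level pragmas such as
//
//   #pragma clang loop vectorize_width(4) interleave_count(2)
//
// which clang lowers to the llvm.loop.vectorize.width and
// llvm.loop.interleave.count loop metadata read by LoopVectorizeHints.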
10658
10659 // Plan how to best vectorize.
10660 LVP.plan(UserVF, UserIC);
10661 VectorizationFactor VF = LVP.computeBestVF();
10662 unsigned IC = 1;
10663
10664 if (ORE->allowExtraAnalysis(LV_NAME))
10665 LVP.emitInvalidCostRemarks(ORE);
10666
10667 bool AddBranchWeights =
10668 hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
10669 GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(),
10670 AddBranchWeights);
10671 if (LVP.hasPlanWithVF(VF.Width)) {
10672 // Select the interleave count.
10673 IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
10674
10675 unsigned SelectedIC = std::max(IC, UserIC);
10676 // Optimistically generate runtime checks if they are needed. Drop them if
10677 // they turn out to not be profitable.
10678 if (VF.Width.isVector() || SelectedIC > 1)
10679 Checks.create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC);
10680
10681 // Check if it is profitable to vectorize with runtime checks.
10682 bool ForceVectorization =
10683 Hints.getForce() == LoopVectorizeHints::FK_Enabled;
10684 if (!ForceVectorization &&
10685 !areRuntimeChecksProfitable(Checks, VF, L, *TTI, PSE, SEL)) {
10686 ORE->emit([&]() {
10687 return OptimizationRemarkAnalysisAliasing(
10688 DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),
10689 L->getHeader())
10690 << "loop not vectorized: cannot prove it is safe to reorder "
10691 "memory operations";
10692 });
10693 LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
10694 Hints.emitRemarkWithHints();
10695 return false;
10696 }
10697 }
10698
10699 // Identify the diagnostic messages that should be produced.
10700 std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
10701 bool VectorizeLoop = true, InterleaveLoop = true;
10702 if (VF.Width.isScalar()) {
10703 LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
10704 VecDiagMsg = std::make_pair(
10705 "VectorizationNotBeneficial",
10706 "the cost-model indicates that vectorization is not beneficial");
10707 VectorizeLoop = false;
10708 }
10709
10710 if (!LVP.hasPlanWithVF(VF.Width) && UserIC > 1) {
10711 // Tell the user interleaving was avoided up-front, despite being explicitly
10712 // requested.
10713 LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
10714 "interleaving should be avoided up front\n");
10715 IntDiagMsg = std::make_pair(
10716 "InterleavingAvoided",
10717 "Ignoring UserIC, because interleaving was avoided up front");
10718 InterleaveLoop = false;
10719 } else if (IC == 1 && UserIC <= 1) {
10720 // Tell the user interleaving is not beneficial.
10721 LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
10722 IntDiagMsg = std::make_pair(
10723 "InterleavingNotBeneficial",
10724 "the cost-model indicates that interleaving is not beneficial");
10725 InterleaveLoop = false;
10726 if (UserIC == 1) {
10727 IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
10728 IntDiagMsg.second +=
10729 " and is explicitly disabled or interleave count is set to 1";
10730 }
10731 } else if (IC > 1 && UserIC == 1) {
10732 // Tell the user interleaving is beneficial, but it is explicitly disabled.
10733 LLVM_DEBUG(
10734 dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
10735 IntDiagMsg = std::make_pair(
10736 "InterleavingBeneficialButDisabled",
10737 "the cost-model indicates that interleaving is beneficial "
10738 "but is explicitly disabled or interleave count is set to 1");
10739 InterleaveLoop = false;
10740 }
10741
10742 // If there is a histogram in the loop, do not just interleave without
10743 // vectorizing. The order of operations will be incorrect without the
10744 // histogram intrinsics, which are only used for recipes with VF > 1.
10745 if (!VectorizeLoop && InterleaveLoop && LVL.hasHistograms()) {
10746 LLVM_DEBUG(dbgs() << "LV: Not interleaving without vectorization due "
10747 << "to histogram operations.\n");
10748 IntDiagMsg = std::make_pair(
10749 "HistogramPreventsScalarInterleaving",
10750 "Unable to interleave without vectorization due to constraints on "
10751 "the order of histogram operations");
10752 InterleaveLoop = false;
10753 }
10754
10755 // Override IC if user provided an interleave count.
10756 IC = UserIC > 0 ? UserIC : IC;
10757
10758 // Emit diagnostic messages, if any.
10759 const char *VAPassName = Hints.vectorizeAnalysisPassName();
10760 if (!VectorizeLoop && !InterleaveLoop) {
10761 // Do not vectorize or interleave the loop.
10762 ORE->emit([&]() {
10763 return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
10764 L->getStartLoc(), L->getHeader())
10765 << VecDiagMsg.second;
10766 });
10767 ORE->emit([&]() {
10768 return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
10769 L->getStartLoc(), L->getHeader())
10770 << IntDiagMsg.second;
10771 });
10772 return false;
10773 }
10774
10775 if (!VectorizeLoop && InterleaveLoop) {
10776 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10777 ORE->emit([&]() {
10778 return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
10779 L->getStartLoc(), L->getHeader())
10780 << VecDiagMsg.second;
10781 });
10782 } else if (VectorizeLoop && !InterleaveLoop) {
10783 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10784 << ") in " << L->getLocStr() << '\n');
10785 ORE->emit([&]() {
10786 return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
10787 L->getStartLoc(), L->getHeader())
10788 << IntDiagMsg.second;
10789 });
10790 } else if (VectorizeLoop && InterleaveLoop) {
10791 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10792 << ") in " << L->getLocStr() << '\n');
10793 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10794 }
10795
10796 bool DisableRuntimeUnroll = false;
10797 MDNode *OrigLoopID = L->getLoopID();
10798 {
10799 using namespace ore;
10800 if (!VectorizeLoop) {
10801 assert(IC > 1 && "interleave count should not be 1 or 0");
10802 // If we decided that it is not profitable to vectorize the loop, then
10803 // interleave it.
10804 VPlan &BestPlan = LVP.getPlanFor(VF.Width);
10805 InnerLoopVectorizer Unroller(
10806 L, PSE, LI, DT, TLI, TTI, AC, ORE, ElementCount::getFixed(1),
10807 ElementCount::getFixed(1), IC, &LVL, &CM, BFI, PSI, Checks, BestPlan);
10808
10809 LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false);
10810
10811 ORE->emit([&]() {
10812 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
10813 L->getHeader())
10814 << "interleaved loop (interleaved count: "
10815 << NV("InterleaveCount", IC) << ")";
10816 });
10817 } else {
10818 // If we decided that it is *profitable* to vectorize the loop, then do it.
10819
10820 VPlan &BestPlan = LVP.getPlanFor(VF.Width);
10821 // Consider vectorizing the epilogue too if it's profitable.
10822 VectorizationFactor EpilogueVF =
10823 LVP.selectEpilogueVectorizationFactor(VF.Width, IC);
10824 if (EpilogueVF.Width.isVector()) {
10825 std::unique_ptr<VPlan> BestMainPlan(BestPlan.duplicate());
10826
10827 // The first pass vectorizes the main loop and creates a scalar epilogue
10828 // to be vectorized by executing the plan (potentially with a different
10829 // factor) again shortly afterwards.
10830 VPlan &BestEpiPlan = LVP.getPlanFor(EpilogueVF.Width);
10831 preparePlanForMainVectorLoop(*BestMainPlan, BestEpiPlan);
10832 EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1,
10833 BestEpiPlan);
10834 EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
10835 EPI, &LVL, &CM, BFI, PSI, Checks,
10836 *BestMainPlan);
10837 auto ExpandedSCEVs = LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF,
10838 *BestMainPlan, MainILV, DT, false);
10839 ++LoopsVectorized;
10840
10841 // Second pass vectorizes the epilogue and adjusts the control flow
10842 // edges from the first pass.
10843 EPI.MainLoopVF = EPI.EpilogueVF;
10844 EPI.MainLoopUF = EPI.EpilogueUF;
10845 EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
10846 ORE, EPI, &LVL, &CM, BFI, PSI,
10847 Checks, BestEpiPlan);
10848 EpilogILV.setTripCount(MainILV.getTripCount());
10849 preparePlanForEpilogueVectorLoop(BestEpiPlan, L, ExpandedSCEVs, EPI);
10850
10851 LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
10852 DT, true, &ExpandedSCEVs);
10853 ++LoopsEpilogueVectorized;
10854
10855 if (!MainILV.areSafetyChecksAdded())
10856 DisableRuntimeUnroll = true;
10857 } else {
10858 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
10859 VF.MinProfitableTripCount, IC, &LVL, &CM, BFI,
10860 PSI, Checks, BestPlan);
10861 LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false);
10862 ++LoopsVectorized;
10863
10864 // Add metadata to disable runtime unrolling a scalar loop when there
10865 // are no runtime checks about strides and memory. A scalar loop that is
10866 // rarely used is not worth unrolling.
10867 if (!LB.areSafetyChecksAdded())
10868 DisableRuntimeUnroll = true;
10869 }
10870 // Report the vectorization decision.
10871 reportVectorization(ORE, L, VF, IC);
10872 }
10873
10874 if (ORE->allowExtraAnalysis(LV_NAME))
10875 checkMixedPrecision(L, ORE);
10876 }
10877
10878 assert(DT->verify(DominatorTree::VerificationLevel::Fast) &&
10879 "DT not preserved correctly");
10880
10881 std::optional<MDNode *> RemainderLoopID =
10882 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
10883 LLVMLoopVectorizeFollowupEpilogue});
10884 if (RemainderLoopID) {
10885 L->setLoopID(*RemainderLoopID);
10886 } else {
10887 if (DisableRuntimeUnroll)
10888 addRuntimeUnrollDisableMetaData(L);
10889
10890 // Mark the loop as already vectorized to avoid vectorizing again.
10891 Hints.setAlreadyVectorized();
10892 }
10893
10894 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
10895 return true;
10896}
10897
10898 LoopVectorizeResult LoopVectorizePass::runImpl(Function &F) {
10899
10900 // Don't attempt if
10901 // 1. the target claims to have no vector registers, and
10902 // 2. interleaving won't help ILP.
10903 //
10904 // The second condition is necessary because, even if the target has no
10905 // vector registers, loop vectorization may still enable scalar
10906 // interleaving.
10907 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
10908 TTI->getMaxInterleaveFactor(ElementCount::getFixed(1)) < 2)
10909 return LoopVectorizeResult(false, false);
10910
10911 bool Changed = false, CFGChanged = false;
10912
10913 // The vectorizer requires loops to be in simplified form.
10914 // Since simplification may add new inner loops, it has to run before the
10915 // legality and profitability checks. This means running the loop vectorizer
10916 // will simplify all loops, regardless of whether anything ends up being
10917 // vectorized.
10918 for (const auto &L : *LI)
10919 Changed |= CFGChanged |=
10920 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
10921
10922 // Build up a worklist of inner-loops to vectorize. This is necessary as
10923 // the act of vectorizing or partially unrolling a loop creates new loops
10924 // and can invalidate iterators across the loops.
10925 SmallVector<Loop *, 8> Worklist;
10926
10927 for (Loop *L : *LI)
10928 collectSupportedLoops(*L, LI, ORE, Worklist);
10929
10930 LoopsAnalyzed += Worklist.size();
10931
10932 // Now walk the identified inner loops.
10933 while (!Worklist.empty()) {
10934 Loop *L = Worklist.pop_back_val();
10935
10936 // For the inner loops we actually process, form LCSSA to simplify the
10937 // transform.
10938 Changed |= formLCSSARecursively(*L, *DT, LI, SE);
10939
10940 Changed |= CFGChanged |= processLoop(L);
10941
10942 if (Changed) {
10943 LAIs->clear();
10944
10945#ifndef NDEBUG
10946 if (VerifySCEV)
10947 SE->verify();
10948#endif
10949 }
10950 }
10951
10952 // Process each loop nest in the function.
10953 return LoopVectorizeResult(Changed, CFGChanged);
10954}
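// Illustrative example (not from this file): the per-loop decisions made by
// processLoop can be traced via the "LV:" debug messages above, assuming an
// assertions-enabled build, e.g.:
//
//   opt -passes=loop-vectorize -debug-only=loop-vectorize in.ll -S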
10955
10956 PreservedAnalyses LoopVectorizePass::run(Function &F,
10957 FunctionAnalysisManager &AM) {
10958 LI = &AM.getResult<LoopAnalysis>(F);
10959 // There are no loops in the function. Return before computing other
10960 // expensive analyses.
10961 if (LI->empty())
10962 return PreservedAnalyses::all();
10963 SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
10964 TTI = &AM.getResult<TargetIRAnalysis>(F);
10965 DT = &AM.getResult<DominatorTreeAnalysis>(F);
10966 TLI = &AM.getResult<TargetLibraryAnalysis>(F);
10967 AC = &AM.getResult<AssumptionAnalysis>(F);
10968 DB = &AM.getResult<DemandedBitsAnalysis>(F);
10969 ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
10970 LAIs = &AM.getResult<LoopAccessAnalysis>(F);
10971
10972 auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
10973 PSI = MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
10974 BFI = nullptr;
10975 if (PSI && PSI->hasProfileSummary())
10976 BFI = &AM.getResult<BlockFrequencyAnalysis>(F);
10977 LoopVectorizeResult Result = runImpl(F);
10978 if (!Result.MadeAnyChange)
10979 return PreservedAnalyses::all();
10980 PreservedAnalyses PA;
10981
10982 if (isAssignmentTrackingEnabled(*F.getParent())) {
10983 for (auto &BB : F)
10984 RemoveRedundantDbgInstrs(&BB);
10985 }
10986
10987 PA.preserve<LoopAnalysis>();
10988 PA.preserve<DominatorTreeAnalysis>();
10989 PA.preserve<ScalarEvolutionAnalysis>();
10990 PA.preserve<LoopAccessAnalysis>();
10991
10992 if (Result.MadeCFGChange) {
10993 // Making CFG changes likely means a loop got vectorized. Indicate that
10994 // extra simplification passes should be run.
10995 // TODO: MadeCFGChange is not a perfect proxy. Extra passes should only
10996 // be run if runtime checks have been added.
10997 AM.getResult<ShouldRunExtraVectorPasses>(F);
10998 PA.preserve<ShouldRunExtraVectorPasses>();
10999 } else {
11000 PA.preserveSet<CFGAnalyses>();
11001 }
11002 return PA;
11003}
11004
11005 void LoopVectorizePass::printPipeline(
11006 raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
11007 static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
11008 OS, MapClassName2PassName);
11009
11010 OS << '<';
11011 OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
11012 OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
11013 OS << '>';
11014}
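// Illustrative example (not from this file): with both options at their
// defaults, printPipeline above produces a pass string of the form
//
//   loop-vectorize<no-interleave-forced-only;no-vectorize-forced-only>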
@ Poison
static unsigned getIntrinsicID(const SDNode *N)
unsigned RegSize
AMDGPU Lower Kernel Arguments
AMDGPU Register Bank Select
Rewrite undef for PHI
This file implements a class to represent arbitrary precision integral constant values and operations...
@ PostInc
ReachingDefAnalysis InstSet & ToRemove
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static bool isEqual(const Function &Caller, const Function &Callee)
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
This is the interface for LLVM's primary stateless and local alias analysis.
static bool IsEmptyBlock(MachineBasicBlock *MBB)
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
#define clEnumValN(ENUMVAL, FLAGNAME, DESC)
Definition: CommandLine.h:686
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
return RetTy
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define LLVM_DEBUG(...)
Definition: Debug.h:106
#define DEBUG_WITH_TYPE(TYPE,...)
DEBUG_WITH_TYPE macro - This macro should be used by passes to emit debug information.
Definition: Debug.h:64
This file defines DenseMapInfo traits for DenseMap.
This file defines the DenseMap class.
std::string Name
bool End
Definition: ELF_riscv.cpp:480
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
#define Check(C,...)
#define DEBUG_TYPE
This is the interface for a simple mod/ref and alias analysis over globals.
Hexagon Common GEP
#define _
This file provides various utilities for inspecting and working with the control flow graph in LLVM I...
Module.h This file contains the declarations for the Module class.
This defines the Use class.
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
Legalize the Machine IR a function s Machine IR
Definition: Legalizer.cpp:80
This header provides classes for managing per-loop analyses.
static const char * VerboseDebug
loop Loop Strength Reduction
#define LV_NAME
This file defines the LoopVectorizationLegality class.
This file provides a LoopVectorizationPlanner class.
static void collectSupportedLoops(Loop &L, LoopInfo *LI, OptimizationRemarkEmitter *ORE, SmallVectorImpl< Loop * > &V)
static cl::opt< unsigned > EpilogueVectorizationMinVF("epilogue-vectorization-minimum-VF", cl::Hidden, cl::desc("Only loops with vectorization factor equal to or larger than " "the specified value are considered for epilogue vectorization."))
static cl::opt< unsigned > EpilogueVectorizationForceVF("epilogue-vectorization-force-VF", cl::init(1), cl::Hidden, cl::desc("When epilogue vectorization is enabled, and a value greater than " "1 is specified, forces the given VF for all applicable epilogue " "loops."))
static void addRuntimeUnrollDisableMetaData(Loop *L)
static bool isOptimizableIVOrUse(VPValue *VPV)
Return true if VPV is an optimizable IV or IV use.
static ElementCount determineVPlanVF(const TargetTransformInfo &TTI, LoopVectorizationCostModel &CM)
static cl::opt< unsigned > VectorizeMemoryCheckThreshold("vectorize-memory-check-threshold", cl::init(128), cl::Hidden, cl::desc("The maximum allowed number of runtime memory checks"))
static void preparePlanForMainVectorLoop(VPlan &MainPlan, VPlan &EpiPlan)
Prepare MainPlan for vectorizing the main vector loop during epilogue vectorization.
static cl::opt< unsigned > TinyTripCountVectorThreshold("vectorizer-min-trip-count", cl::init(16), cl::Hidden, cl::desc("Loops with a constant trip count that is smaller than this " "value are vectorized only if no scalar iteration overheads " "are incurred."))
Loops with a known constant trip count below this number are vectorized only if no scalar iteration o...
static void debugVectorizationMessage(const StringRef Prefix, const StringRef DebugMsg, Instruction *I)
Write a DebugMsg about vectorization to the debug output stream.
static cl::opt< bool > EnableCondStoresVectorization("enable-cond-stores-vec", cl::init(true), cl::Hidden, cl::desc("Enable if predication of stores during vectorization."))
static Value * emitTransformedIndex(IRBuilderBase &B, Value *Index, Value *StartValue, Value *Step, InductionDescriptor::InductionKind InductionKind, const BinaryOperator *InductionBinOp)
Compute the transformed value of Index at offset StartValue using step StepValue.
static DebugLoc getDebugLocFromInstOrOperands(Instruction *I)
Look for a meaningful debug location on the instruction or its operands.
static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks, VectorizationFactor &VF, Loop *L, const TargetTransformInfo &TTI, PredicatedScalarEvolution &PSE, ScalarEpilogueLowering SEL)
static void replaceVPBBWithIRVPBB(VPBasicBlock *VPBB, BasicBlock *IRBB)
Replace VPBB with a VPIRBasicBlock wrapping IRBB.
const char LLVMLoopVectorizeFollowupAll[]
static SetVector< VPIRInstruction * > collectUsersInExitBlocks(Loop *OrigLoop, VPRecipeBuilder &Builder, VPlan &Plan)
static void addExitUsersForFirstOrderRecurrences(VPlan &Plan, SetVector< VPIRInstruction * > &ExitUsersToFix)
Handle users in the exit block for first order reductions in the original exit block.
static cl::opt< bool > ForceTargetSupportsScalableVectors("force-target-supports-scalable-vectors", cl::init(false), cl::Hidden, cl::desc("Pretend that scalable vectors are supported, even if the target does " "not support them. This flag should only be used for testing."))
static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW, DebugLoc DL)
static std::optional< unsigned > getVScaleForTuning(const Loop *L, const TargetTransformInfo &TTI)
Convenience function that returns the value of vscale_range iff vscale_range.min == vscale_range....
static bool useActiveLaneMaskForControlFlow(TailFoldingStyle Style)
static constexpr uint32_t MemCheckBypassWeights[]
cl::opt< unsigned > ForceTargetInstructionCost("force-target-instruction-cost", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's expected cost for " "an instruction to a single constant value. Mostly " "useful for getting consistent testing."))
std::optional< unsigned > getMaxVScale(const Function &F, const TargetTransformInfo &TTI)
static constexpr uint32_t MinItersBypassWeights[]
static cl::opt< unsigned > ForceTargetNumScalarRegs("force-target-num-scalar-regs", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's number of scalar registers."))
static cl::opt< bool > UseWiderVFIfCallVariantsPresent("vectorizer-maximize-bandwidth-for-vector-calls", cl::init(true), cl::Hidden, cl::desc("Try wider VFs if they enable the use of vector variants"))
static bool planContainsAdditionalSimplifications(VPlan &Plan, VPCostContext &CostCtx, Loop *TheLoop)
Return true if the original loop \ TheLoop contains any instructions that do not have corresponding r...
static cl::opt< unsigned > SmallLoopCost("small-loop-cost", cl::init(20), cl::Hidden, cl::desc("The cost of a loop that is considered 'small' by the interleaver."))
static cl::opt< unsigned > ForceTargetNumVectorRegs("force-target-num-vector-regs", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's number of vector registers."))
static VPValue * addResumePhiRecipeForInduction(VPWidenInductionRecipe *WideIV, VPBuilder &VectorPHBuilder, VPBuilder &ScalarPHBuilder, VPTypeAnalysis &TypeInfo, VPValue *VectorTC)
Create and return a ResumePhi for WideIV, unless it is truncated.
static bool isExplicitVecOuterLoop(Loop *OuterLp, OptimizationRemarkEmitter *ORE)
static cl::opt< bool > EnableIndVarRegisterHeur("enable-ind-var-reg-heur", cl::init(true), cl::Hidden, cl::desc("Count the induction variable only once when interleaving"))
static Type * maybeVectorizeType(Type *Elt, ElementCount VF)
static std::optional< unsigned > getSmallBestKnownTC(PredicatedScalarEvolution &PSE, Loop *L, bool CanUseConstantMax=true)
Returns "best known" trip count for the specified loop L as defined by the following procedure: 1) Re...
static cl::opt< TailFoldingStyle > ForceTailFoldingStyle("force-tail-folding-style", cl::desc("Force the tail folding style"), cl::init(TailFoldingStyle::None), cl::values(clEnumValN(TailFoldingStyle::None, "none", "Disable tail folding"), clEnumValN(TailFoldingStyle::Data, "data", "Create lane mask for data only, using active.lane.mask intrinsic"), clEnumValN(TailFoldingStyle::DataWithoutLaneMask, "data-without-lane-mask", "Create lane mask with compare/stepvector"), clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control", "Create lane mask using active.lane.mask intrinsic, and use " "it for both data and control flow"), clEnumValN(TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck, "data-and-control-without-rt-check", "Similar to data-and-control, but remove the runtime check"), clEnumValN(TailFoldingStyle::DataWithEVL, "data-with-evl", "Use predicated EVL instructions for tail folding. If EVL " "is unsupported, fallback to data-without-lane-mask.")))
static cl::opt< bool > EnableEpilogueVectorization("enable-epilogue-vectorization", cl::init(true), cl::Hidden, cl::desc("Enable vectorization of epilogue loops."))
static ScalarEpilogueLowering getScalarEpilogueLowering(Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, LoopVectorizationLegality &LVL, InterleavedAccessInfo *IAI)
const char VerboseDebug[]
static void fixReductionScalarResumeWhenVectorizingEpilog(VPRecipeBase *R, VPTransformState &State, BasicBlock *LoopMiddleBlock, BasicBlock *BypassBlock)
static cl::opt< bool > PreferPredicatedReductionSelect("prefer-predicated-reduction-select", cl::init(false), cl::Hidden, cl::desc("Prefer predicating a reduction operation over an after loop select."))
static VPWidenIntOrFpInductionRecipe * createWidenInductionRecipes(PHINode *Phi, Instruction *PhiOrTrunc, VPValue *Start, const InductionDescriptor &IndDesc, VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop)
Creates a VPWidenIntOrFpInductionRecpipe for Phi.
static constexpr uint32_t SCEVCheckBypassWeights[]
static cl::opt< bool > PreferInLoopReductions("prefer-inloop-reductions", cl::init(false), cl::Hidden, cl::desc("Prefer in-loop vector reductions, " "overriding the targets preference."))
const char LLVMLoopVectorizeFollowupVectorized[]
static cl::opt< bool > EnableLoadStoreRuntimeInterleave("enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden, cl::desc("Enable runtime interleaving until load/store ports are saturated"))
static cl::opt< bool > VPlanBuildStressTest("vplan-build-stress-test", cl::init(false), cl::Hidden, cl::desc("Build VPlan for every supported loop nest in the function and bail " "out right after the build (stress test the VPlan H-CFG construction " "in the VPlan-native vectorization path)."))
static bool hasIrregularType(Type *Ty, const DataLayout &DL)
A helper function that returns true if the given type is irregular.
static cl::opt< bool > LoopVectorizeWithBlockFrequency("loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden, cl::desc("Enable the use of the block frequency analysis to access PGO " "heuristics minimizing code growth in cold regions and being more " "aggressive in hot regions."))
static Value * getExpandedStep(const InductionDescriptor &ID, const SCEV2ValueTy &ExpandedSCEVs)
Return the expanded step for ID using ExpandedSCEVs to look up SCEV expansion results.
const char LLVMLoopVectorizeFollowupEpilogue[]
static void preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L, const SCEV2ValueTy &ExpandedSCEVs, const EpilogueLoopVectorizationInfo &EPI)
Prepare Plan for vectorizing the epilogue loop.
static bool useActiveLaneMask(TailFoldingStyle Style)
static unsigned getEstimatedRuntimeVF(const Loop *L, const TargetTransformInfo &TTI, ElementCount VF)
This function attempts to return a value that represents the vectorization factor at runtime.
static bool isIndvarOverflowCheckKnownFalse(const LoopVectorizationCostModel *Cost, ElementCount VF, std::optional< unsigned > UF=std::nullopt)
For the given VF and UF and maximum trip count computed for the loop, return whether the induction va...
static void addFullyUnrolledInstructionsToIgnore(Loop *L, const LoopVectorizationLegality::InductionList &IL, SmallPtrSetImpl< Instruction * > &InstsToIgnore)
Knowing that loop L executes a single vector iteration, add instructions that will get simplified and...
static cl::opt< PreferPredicateTy::Option > PreferPredicateOverEpilogue("prefer-predicate-over-epilogue", cl::init(PreferPredicateTy::ScalarEpilogue), cl::Hidden, cl::desc("Tail-folding and predication preferences over creating a scalar " "epilogue loop."), cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue, "scalar-epilogue", "Don't tail-predicate loops, create scalar epilogue"), clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue, "predicate-else-scalar-epilogue", "prefer tail-folding, create scalar epilogue if tail " "folding fails."), clEnumValN(PreferPredicateTy::PredicateOrDontVectorize, "predicate-dont-vectorize", "prefers tail-folding, don't attempt vectorization if " "tail-folding fails.")))
static cl::opt< bool > EnableInterleavedMemAccesses("enable-interleaved-mem-accesses", cl::init(false), cl::Hidden, cl::desc("Enable vectorization on interleaved memory accesses in a loop"))
static bool addUsersInExitBlocks(VPlan &Plan, const SetVector< VPIRInstruction * > &ExitUsersToFix)
static cl::opt< bool > EnableMaskedInterleavedMemAccesses("enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden, cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"))
An interleave-group may need masking if it resides in a block that needs predication,...
static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan)
Create resume phis in the scalar preheader for first-order recurrences, reductions and inductions,...
static cl::opt< bool > ForceOrderedReductions("force-ordered-reductions", cl::init(false), cl::Hidden, cl::desc("Enable the vectorisation of loops with in-order (strict) " "FP reductions"))
static void cse(BasicBlock *BB)
Perform cse of induction variable instructions.
static const SCEV * getAddressAccessSCEV(Value *Ptr, LoopVectorizationLegality *Legal, PredicatedScalarEvolution &PSE, const Loop *TheLoop)
Gets Address Access SCEV after verifying that the access pattern is loop invariant except the inducti...
static cl::opt< cl::boolOrDefault > ForceSafeDivisor("force-widen-divrem-via-safe-divisor", cl::Hidden, cl::desc("Override cost based safe divisor widening for div/rem instructions"))
#define DEBUG_TYPE
static cl::opt< unsigned > ForceTargetMaxVectorInterleaveFactor("force-target-max-vector-interleave", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's max interleave factor for " "vectorized loops."))
static bool processLoopInVPlanNativePath(Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints, LoopVectorizationRequirements &Requirements)
static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI)
static cl::opt< unsigned > NumberOfStoresToPredicate("vectorize-num-stores-pred", cl::init(1), cl::Hidden, cl::desc("Max number of stores to be predicated behind an if."))
The number of stores in a loop that are allowed to need predication.
static cl::opt< unsigned > MaxNestedScalarReductionIC("max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden, cl::desc("The maximum interleave count to use when interleaving a scalar " "reduction in a nested loop."))
static cl::opt< unsigned > ForceTargetMaxScalarInterleaveFactor("force-target-max-scalar-interleave", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's max interleave factor for " "scalar loops."))
static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE)
static bool willGenerateVectors(VPlan &Plan, ElementCount VF, const TargetTransformInfo &TTI)
Check if any recipe of Plan will generate a vector value, which will be assigned a vector register.
static cl::opt< bool > EnableEarlyExitVectorization("enable-early-exit-vectorization", cl::init(false), cl::Hidden, cl::desc("Enable vectorization of early exit loops with uncountable exits."))
static cl::opt< bool > MaximizeBandwidth("vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden, cl::desc("Maximize bandwidth when selecting vectorization factor which " "will be determined by the smallest type in loop."))
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName, StringRef RemarkName, Loop *TheLoop, Instruction *I, DebugLoc DL={})
Create an analysis remark that explains why vectorization failed.
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
mir Rename Register Operands
This file implements a map that provides insertion order iteration.
std::pair< uint64_t, uint64_t > Interval
This file contains the declarations for metadata subclasses.
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))
uint64_t IntrinsicInst * II
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
#define P(N)
if(PassOpts->AAPipeline)
This file contains the declarations for profiling metadata utility functions.
const SmallVectorImpl< MachineOperand > & Cond
static BinaryOperator * CreateMul(Value *S1, Value *S2, const Twine &Name, BasicBlock::iterator InsertBefore, Value *FlagsOp)
static BinaryOperator * CreateAdd(Value *S1, Value *S2, const Twine &Name, BasicBlock::iterator InsertBefore, Value *FlagsOp)
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file contains some templates that are useful if you are working with the STL at all.
raw_pwrite_stream & OS
#define OP(OPC)
Definition: Instruction.h:45
This file defines the SmallPtrSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:166
This pass exposes codegen information to IR-level passes.
This file implements the TypeSwitch template, which mimics a switch() statement whose cases are type ...
This file defines the VPlanHCFGBuilder class which contains the public interface (buildHierarchicalCF...
This file provides utility VPlan to VPlan transformations.
This file declares the class VPlanVerifier, which contains utility functions to check the consistency...
This file contains the declarations of the Vectorization Plan base classes:
static const char PassName[]
Value * RHS
Value * LHS
static const uint32_t IV[8]
Definition: blake3_impl.h:78
Class for arbitrary precision integers.
Definition: APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition: APInt.h:234
uint64_t getZExtValue() const
Get zero extended value.
Definition: APInt.h:1520
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1542
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:253
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:410
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:168
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
void registerAssumption(AssumeInst *CI)
Add an @llvm.assume intrinsic to this function's cache.
unsigned getVScaleRangeMin() const
Returns the minimum value for the vscale_range attribute.
Definition: Attributes.cpp:460
LLVM Basic Block Representation.
Definition: BasicBlock.h:61
iterator begin()
Instruction iterator methods.
Definition: BasicBlock.h:448
iterator_range< const_phi_iterator > phis() const
Returns a range that iterates over the phis in the basic block.
Definition: BasicBlock.h:517
const Instruction * getFirstNonPHI() const
Returns a pointer to the first instruction in this block that is not a PHINode instruction.
Definition: BasicBlock.cpp:367
const BasicBlock * getSinglePredecessor() const
Return the predecessor of this block if it has a single predecessor block.
Definition: BasicBlock.cpp:459
const BasicBlock * getSingleSuccessor() const
Return the successor of this block if it has a single successor.
Definition: BasicBlock.cpp:489
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:219
LLVMContext & getContext() const
Get the context in which this basic block lives.
Definition: BasicBlock.cpp:168
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.h:239
BinaryOps getOpcode() const
Definition: InstrTypes.h:370
Analysis pass which computes BlockFrequencyInfo.
BlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate IR basic block frequen...
Conditional or Unconditional Branch instruction.
bool isConditional() const
static BranchInst * Create(BasicBlock *IfTrue, InsertPosition InsertBefore=nullptr)
BasicBlock * getSuccessor(unsigned i) const
Value * getCondition() const
Represents analyses that only rely on functions' control flow.
Definition: Analysis.h:72
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
Definition: InstrTypes.h:1870
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1341
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1286
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
Definition: InstrTypes.h:1277
unsigned arg_size() const
Definition: InstrTypes.h:1284
This class represents a function call, abstracting a target machine's calling convention.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:673
@ ICMP_UGT
unsigned greater than
Definition: InstrTypes.h:696
@ ICMP_ULT
unsigned less than
Definition: InstrTypes.h:698
@ ICMP_EQ
equal
Definition: InstrTypes.h:694
@ ICMP_NE
not equal
Definition: InstrTypes.h:695
@ ICMP_ULE
unsigned less or equal
Definition: InstrTypes.h:699
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition: InstrTypes.h:787
This is the shared class of boolean and integer constants.
Definition: Constants.h:83
static ConstantInt * getTrue(LLVMContext &Context)
Definition: Constants.cpp:866
static ConstantInt * getFalse(LLVMContext &Context)
Definition: Constants.cpp:873
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:63
A debug info location.
Definition: DebugLoc.h:33
An analysis that produces DemandedBits for a function.
Definition: DemandedBits.h:103
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: DenseMap.h:194
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:156
bool empty() const
Definition: DenseMap.h:98
iterator begin()
Definition: DenseMap.h:75
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition: DenseMap.h:152
iterator end()
Definition: DenseMap.h:84
const ValueT & at(const_arg_type_t< KeyT > Val) const
at - Return the entry for the specified key, or abort if no such entry exists.
Definition: DenseMap.h:202
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition: DenseMap.h:147
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:211
Implements a dense probed hash-table based set.
Definition: DenseSet.h:278
DomTreeNodeBase * getIDom() const
Analysis pass which computes a DominatorTree.
Definition: Dominators.h:279
bool verify(VerificationLevel VL=VerificationLevel::Full) const
verify - checks if the tree is correct.
void changeImmediateDominator(DomTreeNodeBase< NodeT > *N, DomTreeNodeBase< NodeT > *NewIDom)
changeImmediateDominator - This method is used to update the dominator tree information when a node's...
DomTreeNodeBase< NodeT > * addNewBlock(NodeT *BB, NodeT *DomBB)
Add a new node to the dominator tree information.
void eraseNode(NodeT *BB)
eraseNode - Removes a node from the dominator tree.
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
bool properlyDominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
properlyDominates - Returns true iff A dominates B and A != B.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:162
bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
Definition: Dominators.cpp:122
constexpr bool isVector() const
One or more elements.
Definition: TypeSize.h:326
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition: TypeSize.h:314
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition: TypeSize.h:311
static constexpr ElementCount get(ScalarTy MinVal, bool Scalable)
Definition: TypeSize.h:317
constexpr bool isScalar() const
Exactly one element.
Definition: TypeSize.h:322
BasicBlock * emitMinimumVectorEpilogueIterCountCheck(BasicBlock *Bypass, BasicBlock *Insert)
Emits an iteration count bypass check after the main vector loop has finished to see if there are any...
EpilogueVectorizerEpilogueLoop(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &Checks, VPlan &Plan)
void printDebugTracesAtStart() override
Allow subclasses to override and print debug traces before/after vplan execution, when trace informat...
BasicBlock * createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final
Implements the interface for creating a vectorized skeleton using the epilogue loop strategy (ie the ...
A specialized derived class of inner loop vectorizer that performs vectorization of main loops in the...
BasicBlock * createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final
Implements the interface for creating a vectorized skeleton using the main loop strategy (ie the firs...
EpilogueVectorizerMainLoop(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &Check, VPlan &Plan)
void printDebugTracesAtStart() override
Allow subclasses to override and print debug traces before/after vplan execution, when trace informat...
void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II, Value *VectorTripCount, BasicBlock *MiddleBlock, VPTransformState &State) override
Set up the values of the IVs correctly when exiting the vector loop.
BasicBlock * emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue)
Emits an iteration count bypass check once for the main loop (when ForEpilogue is false) and once for...
FastMathFlags getFastMathFlags() const
Convenience function for getting all the fast-math flags.
Definition: Operator.h:338
Convenience struct for specifying and reasoning about fast-math flags.
Definition: FMF.h:20
Class to represent function types.
Definition: DerivedTypes.h:105
param_iterator param_begin() const
Definition: DerivedTypes.h:130
param_iterator param_end() const
Definition: DerivedTypes.h:131
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition: Function.h:707
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition: Function.h:216
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
Definition: Function.cpp:373
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition: Function.cpp:766
bool hasMinSize() const
Optimize this function for minimum size (-Oz).
Definition: Function.h:704
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition: Function.cpp:731
Represents flags for the getelementptr instruction/expression.
static GEPNoWrapFlags inBounds()
static GEPNoWrapFlags none()
Common base class shared among various IRBuilders.
Definition: IRBuilder.h:113
ConstantInt * getTrue()
Get the constant value for i1 true.
Definition: IRBuilder.h:485
Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Definition: IRBuilder.cpp:1043
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
Definition: IRBuilder.h:330
Value * CreateICmpNE(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2273
Value * CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 2 operands which is mangled on the first type.
Definition: IRBuilder.cpp:879
Value * CreateICmpEQ(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2269
InstTy * Insert(InstTy *I, const Twine &Name="") const
Insert and return the specified instruction.
Definition: IRBuilder.h:164
Value * CreateSub(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1386
Value * CreateAdd(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1369
ConstantInt * getFalse()
Get the constant value for i1 false.
Definition: IRBuilder.h:490
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition: IRBuilder.h:199
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2379
Value * CreateURem(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1446
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2704
A struct for saving information about induction variables.
const SCEV * getStep() const
InductionKind
This enum represents the kinds of inductions that we support.
@ IK_NoInduction
Not an induction variable.
@ IK_FpInduction
Floating point induction variable.
@ IK_PtrInduction
Pointer induction var. Step = C.
@ IK_IntInduction
Integer induction variable. Step = C.
const SmallVectorImpl< Instruction * > & getCastInsts() const
Returns a reference to the type cast instructions in the induction update chain, that are redundant w...
Value * getStartValue() const
An extension of the inner loop vectorizer that creates a skeleton for a vectorized loop that has its ...
InnerLoopAndEpilogueVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &Checks, VPlan &Plan)
BasicBlock * createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final
Create a new empty loop that will contain vectorized instructions later on, while the old loop will b...
virtual BasicBlock * createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs)=0
The interface for creating a vectorized skeleton using one of two different strategies,...
EpilogueLoopVectorizationInfo & EPI
Holds and updates state information required to vectorize the main loop and its epilogue in two separ...
InnerLoopVectorizer vectorizes loops which contain only one basic block to a specified vectorization ...
virtual void printDebugTracesAtStart()
Allow subclasses to override and print debug traces before/after vplan execution, when trace informat...
Value * TripCount
Trip count of the original loop.
void sinkScalarOperands(Instruction *PredInst)
Iteratively sink the scalarized operands of a predicated instruction into the block that was created ...
const TargetLibraryInfo * TLI
Target Library Info.
virtual void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II, Value *VectorTripCount, BasicBlock *MiddleBlock, VPTransformState &State)
Set up the values of the IVs correctly when exiting the vector loop.
ElementCount MinProfitableTripCount
const TargetTransformInfo * TTI
Target Transform Info.
Value * VectorTripCount
Trip count of the widened loop (TripCount - TripCount % (VF*UF))
BasicBlock * emitSCEVChecks(BasicBlock *Bypass)
Emit a bypass check to see if all of the SCEV assumptions we've had to make are correct.
virtual BasicBlock * createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs)
Create a new empty loop that will contain vectorized instructions later on, while the old loop will b...
LoopVectorizationCostModel * Cost
The profitablity analysis.
BasicBlock * AdditionalBypassBlock
The additional bypass block which conditionally skips over the epilogue loop after executing the main...
BlockFrequencyInfo * BFI
BFI and PSI are used to check for profile guided size optimizations.
Value * getTripCount() const
Returns the original loop trip count.
BasicBlock * LoopMiddleBlock
Middle Block between the vector and the scalar.
OptimizationRemarkEmitter * ORE
Interface to emit optimization remarks.
void scalarizeInstruction(const Instruction *Instr, VPReplicateRecipe *RepRecipe, const VPLane &Lane, VPTransformState &State)
A helper function to scalarize a single Instruction in the innermost loop.
SmallVector< Instruction *, 4 > PredicatedInstructions
Store instructions that were predicated.
DenseMap< PHINode *, Value * > Induction2AdditionalBypassValue
Mapping of induction phis to their additional bypass values.
void introduceCheckBlockInVPlan(BasicBlock *CheckIRBB)
Introduces a new VPIRBasicBlock for CheckIRBB to Plan between the vector preheader and its predecesso...
void createVectorLoopSkeleton(StringRef Prefix)
Emit basic blocks (prefixed with Prefix) for the iteration check, vector loop preheader,...
BasicBlock * emitMemRuntimeChecks(BasicBlock *Bypass)
Emit bypass checks to check any memory assumptions we may have made.
BasicBlock * LoopScalarPreHeader
The scalar-loop preheader.
void createInductionAdditionalBypassValues(const SCEV2ValueTy &ExpandedSCEVs, Value *MainVectorTripCount)
Create and record the values for induction variables to resume coming from the additional bypass bloc...
VPBlockBase * VectorPHVPB
The vector preheader block of Plan, used as target for check blocks introduced during skeleton creati...
LoopVectorizationLegality * Legal
The legality analysis.
InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, ElementCount VecWidth, ElementCount MinProfitableTripCount, unsigned UnrollFactor, LoopVectorizationLegality *LVL, LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks, VPlan &Plan)
void emitIterationCountCheck(BasicBlock *Bypass)
Emit a bypass check to see if the vector trip count is zero, including if it overflows.
PredicatedScalarEvolution & PSE
A wrapper around ScalarEvolution used to add runtime SCEV checks.
LoopInfo * LI
Loop Info.
ProfileSummaryInfo * PSI
Value * getInductionAdditionalBypassValue(PHINode *OrigPhi) const
induction header phi.
BasicBlock * getAdditionalBypassBlock() const
Return the additional bypass block which targets the scalar loop by skipping the epilogue loop after ...
DominatorTree * DT
Dominator Tree.
void setTripCount(Value *TC)
Used to set the trip count after ILV's construction and after the preheader block has been executed.
void fixVectorizedLoop(VPTransformState &State)
Fix the vectorized code, taking care of header phi's, and more.
BasicBlock * LoopVectorPreHeader
The vector-loop preheader.
virtual void printDebugTracesAtEnd()
AssumptionCache * AC
Assumption Cache.
Value * getOrCreateVectorTripCount(BasicBlock *InsertBlock)
Returns (and creates if needed) the trip count of the widened loop.
IRBuilder Builder
The builder that we use.
void fixNonInductionPHIs(VPTransformState &State)
Fix the non-induction PHIs in Plan.
unsigned UF
The vectorization unroll factor to use.
SmallVector< BasicBlock *, 4 > LoopBypassBlocks
A list of all bypass blocks. The first block is the entry of the loop.
GeneratedRTChecks & RTChecks
Structure to hold information about generated runtime checks, responsible for cleaning the checks,...
virtual ~InnerLoopVectorizer()=default
ElementCount VF
The vectorization SIMD factor to use.
Loop * OrigLoop
The original loop.
static InstructionCost getInvalid(CostType Val=0)
static InstructionCost getMax()
std::optional< CostType > getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
Definition: Instruction.h:475
const Module * getModule() const
Return the module owning the function this instruction belongs to or nullptr it the function does not...
Definition: Instruction.cpp:68
bool isBinaryOp() const
Definition: Instruction.h:279
InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Definition: Instruction.cpp:94
void replaceSuccessorWith(BasicBlock *OldBB, BasicBlock *NewBB)
Replace specified successor OldBB to point at the provided block.
Instruction * user_back()
Specialize the methods defined in Value, as we know that an instruction can only be used by other ins...
Definition: Instruction.h:169
FastMathFlags getFastMathFlags() const LLVM_READONLY
Convenience function for getting all the fast-math flags, which must be an operator which supports th...
const char * getOpcodeName() const
Definition: Instruction.h:276
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:274
void moveBefore(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:311
The group of interleaved loads/stores sharing the same stride and close to each other.
Definition: VectorUtils.h:488
uint32_t getFactor() const
Definition: VectorUtils.h:504
InstTy * getMember(uint32_t Index) const
Get the member with the given index Index.
Definition: VectorUtils.h:558
InstTy * getInsertPos() const
Definition: VectorUtils.h:574
uint32_t getNumMembers() const
Definition: VectorUtils.h:506
Drive the analysis of interleaved memory accesses in the loop.
Definition: VectorUtils.h:630
InterleaveGroup< Instruction > * getInterleaveGroup(const Instruction *Instr) const
Get the interleave group that Instr belongs to.
Definition: VectorUtils.h:675
bool requiresScalarEpilogue() const
Returns true if an interleaved group that may access memory out-of-bounds requires a scalar epilogue ...
Definition: VectorUtils.h:686
bool isInterleaved(Instruction *Instr) const
Check if Instr belongs to any interleave group.
Definition: VectorUtils.h:667
bool invalidateGroups()
Invalidate groups, e.g., in case all blocks in loop will be predicated contrary to original assumptio...
Definition: VectorUtils.h:650
iterator_range< SmallPtrSetIterator< llvm::InterleaveGroup< Instruction > * > > getInterleaveGroups()
Definition: VectorUtils.h:680
void analyzeInterleaving(bool EnableMaskedInterleavedGroup)
Analyze the interleaved accesses and collect them in interleave groups.
void invalidateGroupsRequiringScalarEpilogue()
Invalidate groups that require a scalar epilogue (due to gaps).
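A minimal sketch of consuming this interleaved-access API, assuming `using namespace llvm;` as in this file, an already analyzed InterleavedAccessInfo, and headers the file already includes; the helper name and counting logic are illustrative only.

static unsigned countInterleaveMembers(InterleavedAccessInfo &IAI) {
  unsigned NumMembers = 0;
  for (InterleaveGroup<Instruction> *Group : IAI.getInterleaveGroups()) {
    // getFactor() is the stride between members; gaps appear as null members.
    for (unsigned I = 0, F = Group->getFactor(); I < F; ++I)
      if (Group->getMember(I))
        ++NumMembers;
  }
  return NumMembers;
}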
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:48
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
An instruction for reading from memory.
Definition: Instructions.h:176
Type * getPointerOperandType() const
Definition: Instructions.h:258
This analysis provides dependence information for the memory accesses of a loop.
Drive the analysis of memory accesses in the loop.
const RuntimePointerChecking * getRuntimePointerChecking() const
unsigned getNumRuntimePointerChecks() const
Number of memchecks required to prove independence of otherwise may-alias pointers.
const DenseMap< Value *, const SCEV * > & getSymbolicStrides() const
If an access has a symbolic stride, this maps the pointer value to the stride symbol.
Analysis pass that exposes the LoopInfo for a function.
Definition: LoopInfo.h:566
bool contains(const LoopT *L) const
Return true if the specified loop is contained within this loop.
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
bool isInnermost() const
Return true if the loop does not contain any (natural) loops.
void getExitBlocks(SmallVectorImpl< BlockT * > &ExitBlocks) const
Return all of the successor blocks of this loop.
BlockT * getUniqueLatchExitBlock() const
Return the unique exit block for the latch, or null if there are multiple different exit blocks or th...
void getExitingBlocks(SmallVectorImpl< BlockT * > &ExitingBlocks) const
Return all blocks inside the loop that have successors outside of the loop.
BlockT * getHeader() const
unsigned getLoopDepth() const
Return the nesting level of this loop.
void addBasicBlockToLoop(BlockT *NewBB, LoopInfoBase< BlockT, LoopT > &LI)
This method is used by other analyses to update loop information.
iterator_range< block_iterator > blocks() const
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
ArrayRef< BlockT * > getBlocks() const
Get a list of the basic blocks which make up this loop.
BlockT * getExitingBlock() const
If getExitingBlocks would return exactly one block, return that block.
LoopT * getParentLoop() const
Return the parent loop if it exists or nullptr for top level loops.
bool isLoopExiting(const BlockT *BB) const
True if terminator in the block can branch to another block that is outside of the current loop.
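A hedged sketch of the loop-shape queries listed above, assuming `using namespace llvm;` as in this file; the helper name and the particular checks are illustrative, not the vectorizer's actual legality logic.

static bool hasSimpleShape(const Loop *L) {
  // A preheader, a single latch, and an innermost loop are among the basic
  // shape requirements the surrounding checks rely on.
  if (!L->getLoopPreheader() || !L->getLoopLatch() || !L->isInnermost())
    return false;
  SmallVector<BasicBlock *, 4> ExitingBlocks;
  L->getExitingBlocks(ExitingBlocks);
  return ExitingBlocks.size() == 1; // a single exiting block keeps things simple
}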
Store the result of a depth first search within basic blocks contained by a single loop.
Definition: LoopIterator.h:97
RPOIterator beginRPO() const
Reverse iterate over the cached postorder blocks.
Definition: LoopIterator.h:136
void perform(const LoopInfo *LI)
Traverse the loop blocks and store the DFS result.
Definition: LoopInfo.cpp:1254
RPOIterator endRPO() const
Definition: LoopIterator.h:140
Wrapper class to LoopBlocksDFS that provides a standard begin()/end() interface for the DFS reverse p...
Definition: LoopIterator.h:172
void perform(const LoopInfo *LI)
Traverse the loop blocks and store the DFS result.
Definition: LoopIterator.h:180
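A minimal sketch of the cached DFS traversal above, assuming `using namespace llvm;`; it visits a loop's blocks in reverse post-order, the order used when processing blocks one after another. The helper name is hypothetical.

static void visitInRPO(Loop *TheLoop, LoopInfo *LI) {
  LoopBlocksDFS DFS(TheLoop);
  DFS.perform(LI); // compute and cache the DFS once
  for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
    (void)BB; // process BB here in reverse post-order
  }
}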
void removeBlock(BlockT *BB)
This method completely removes BB from all data structures, including all of the Loop objects it is n...
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
LoopVectorizationCostModel - estimates the expected speedups due to vectorization.
SmallPtrSet< Type *, 16 > ElementTypesInLoop
All element types found in the loop.
void collectElementTypesForWidening()
Collect all element types in the loop for which widening is needed.
bool canVectorizeReductions(ElementCount VF) const
Returns true if the target machine supports all of the reduction variables found for the given VF.
bool isEpilogueVectorizationProfitable(const ElementCount VF, const unsigned IC) const
Returns true if epilogue vectorization is considered profitable, and false otherwise.
bool requiresScalarEpilogue(VFRange Range) const
Returns true if we're required to use a scalar epilogue for at least the final iteration of the origi...
bool isPredicatedInst(Instruction *I) const
Returns true if I is an instruction that needs to be predicated at runtime.
void collectValuesToIgnore()
Collect values we want to ignore in the cost model.
void collectInLoopReductions()
Split reductions into those that happen in the loop, and those that happen outside.
std::pair< unsigned, unsigned > getSmallestAndWidestTypes()
bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const
Returns true if I is known to be uniform after vectorization.
bool usePredicatedReductionSelect(unsigned Opcode, Type *PhiTy) const
Returns true if the predicated reduction select should be used to set the incoming value for the redu...
PredicatedScalarEvolution & PSE
Predicated scalar evolution analysis.
const LoopVectorizeHints * Hints
Loop Vectorize Hint.
std::optional< unsigned > getMaxSafeElements() const
Return maximum safe number of elements to be processed per vector iteration, which do not prevent sto...
std::optional< InstructionCost > getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy, TTI::TargetCostKind CostKind) const
Return the cost of instructions in an inloop reduction pattern, if I is part of that pattern.
const TargetTransformInfo & TTI
Vector target information.
LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, LoopVectorizationLegality *Legal, const TargetTransformInfo &TTI, const TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, const Function *F, const LoopVectorizeHints *Hints, InterleavedAccessInfo &IAI)
LoopVectorizationLegality * Legal
Vectorization legality.
bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const
Returns true if the target machine supports masked load operation for the given DataType and kind of ...
InstructionCost getInstructionCost(Instruction *I, ElementCount VF)
Returns the execution time cost of an instruction for a given vector width.
DemandedBits * DB
Demanded bits analysis.
bool interleavedAccessCanBeWidened(Instruction *I, ElementCount VF) const
Returns true if I is a memory instruction in an interleaved-group of memory accesses that can be vect...
const TargetLibraryInfo * TLI
Target Library Info.
bool memoryInstructionCanBeWidened(Instruction *I, ElementCount VF)
Returns true if I is a memory instruction with consecutive memory access that can be widened.
const InterleaveGroup< Instruction > * getInterleavedAccessGroup(Instruction *Instr) const
Get the interleaved access group that Instr belongs to.
InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const
Estimate cost of an intrinsic call instruction CI if it were vectorized with factor VF.
bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const
Returns true if I is known to be scalar after vectorization.
bool isOptimizableIVTruncate(Instruction *I, ElementCount VF)
Return True if instruction I is an optimizable truncate whose operand is an induction variable.
FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC)
Loop * TheLoop
The loop that we evaluate.
TailFoldingStyle getTailFoldingStyle(bool IVUpdateMayOverflow=true) const
Returns the TailFoldingStyle that is best for the current loop.
InterleavedAccessInfo & InterleaveInfo
The interleave access information contains groups of interleaved accesses with the same stride and cl...
SmallPtrSet< const Value *, 16 > ValuesToIgnore
Values to ignore in the cost model.
void setVectorizedCallDecision(ElementCount VF)
A call may be vectorized in different ways depending on whether we have vectorized variants available...
void invalidateCostModelingDecisions()
Invalidates decisions already taken by the cost model.
bool isAccessInterleaved(Instruction *Instr) const
Check if Instr belongs to any interleaved access group.
bool selectUserVectorizationFactor(ElementCount UserVF)
Setup cost-based decisions for user vectorization factor.
OptimizationRemarkEmitter * ORE
Interface to emit optimization remarks.
bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const
Returns true if the target machine supports masked store operation for the given DataType and kind of...
LoopInfo * LI
Loop Info analysis.
bool requiresScalarEpilogue(bool IsVectorizing) const
Returns true if we're required to use a scalar epilogue for at least the final iteration of the origi...
SmallVector< RegisterUsage, 8 > calculateRegisterUsage(ArrayRef< ElementCount > VFs)
SmallPtrSet< const Value *, 16 > VecValuesToIgnore
Values to ignore in the cost model when VF > 1.
bool isInLoopReduction(PHINode *Phi) const
Returns true if the Phi is part of an inloop reduction.
bool isProfitableToScalarize(Instruction *I, ElementCount VF) const
void setWideningDecision(const InterleaveGroup< Instruction > *Grp, ElementCount VF, InstWidening W, InstructionCost Cost)
Save vectorization decision W and Cost taken by the cost model for interleaving group Grp and vector ...
const MapVector< Instruction *, uint64_t > & getMinimalBitwidths() const
CallWideningDecision getCallWideningDecision(CallInst *CI, ElementCount VF) const
bool isLegalGatherOrScatter(Value *V, ElementCount VF)
Returns true if the target machine can represent V as a masked gather or scatter operation.
bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const
bool shouldConsiderInvariant(Value *Op)
Returns true if Op should be considered invariant and if it is trivially hoistable.
bool foldTailByMasking() const
Returns true if all loop blocks should be masked to fold tail loop.
bool foldTailWithEVL() const
Returns true if VP intrinsics with explicit vector length support should be generated in the tail fol...
void collectUniformsAndScalars(ElementCount VF)
Collect Uniform and Scalar values for the given VF.
bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const
Returns true if the instructions in this block require predication for any reason,...
void setCallWideningDecision(CallInst *CI, ElementCount VF, InstWidening Kind, Function *Variant, Intrinsic::ID IID, std::optional< unsigned > MaskPos, InstructionCost Cost)
void setTailFoldingStyles(bool IsScalableVF, unsigned UserIC)
Selects and saves the TailFoldingStyle for two cases: whether or not the IV update may overflow.
AssumptionCache * AC
Assumption cache.
void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W, InstructionCost Cost)
Save vectorization decision W and Cost taken by the cost model for instruction I and vector width VF.
InstWidening
Decision that was taken during cost calculation for memory instruction.
bool isScalarWithPredication(Instruction *I, ElementCount VF) const
Returns true if I is an instruction which requires predication and for which our chosen predication s...
InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF) const
Estimate cost of a call instruction CI if it were vectorized with factor VF.
bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const
Returns true if we should use strict in-order reductions for the given RdxDesc.
std::pair< InstructionCost, InstructionCost > getDivRemSpeculationCost(Instruction *I, ElementCount VF) const
Return the costs for our two available strategies for lowering a div/rem operation which requires spe...
bool isDivRemScalarWithPredication(InstructionCost ScalarCost, InstructionCost SafeDivisorCost) const
Given costs for both strategies, return true if the scalar predication lowering should be used for di...
InstructionCost expectedCost(ElementCount VF)
Returns the expected execution cost.
void setCostBasedWideningDecision(ElementCount VF)
Memory access instruction may be vectorized in more than one way.
InstWidening getWideningDecision(Instruction *I, ElementCount VF) const
Return the cost model decision for the given instruction I and vector width VF.
bool isScalarEpilogueAllowed() const
Returns true if a scalar epilogue is not allowed due to optsize or a loop hint annotation.
InstructionCost getWideningCost(Instruction *I, ElementCount VF)
Return the vectorization cost for the given instruction I and vector width VF.
unsigned selectInterleaveCount(ElementCount VF, InstructionCost LoopCost)
void collectInstsToScalarize(ElementCount VF)
Collects the instructions to scalarize for each predicated instruction in the loop.
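A hedged sketch against the cost-model interface summarized above; it only makes sense inside this translation unit, and the constructed LoopVectorizationCostModel, the memory instruction, and the vector factor are all assumed to exist. The helper name is illustrative.

static void queryWidening(LoopVectorizationCostModel &CM, Instruction *I,
                          ElementCount VF) {
  CM.collectUniformsAndScalars(VF);    // per-VF uniform/scalar analysis
  CM.setCostBasedWideningDecision(VF); // record a decision per memory op
  if (CM.memoryInstructionCanBeWidened(I, VF)) {
    InstructionCost Cost = CM.getWideningCost(I, VF);
    (void)Cost; // cost of the decision recorded for (I, VF)
  }
}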
LoopVectorizationLegality checks if it is legal to vectorize a loop, and to what vectorization factor...
bool isInvariantStoreOfReduction(StoreInst *SI)
Returns True if given store is a final invariant store of one of the reductions found in the loop.
bool hasVectorCallVariants() const
Returns true if there is at least one function call in the loop which has a vectorized variant availa...
bool isInvariantAddressOfReduction(Value *V)
Returns True if given address is invariant and is used to store recurrent expression.
bool blockNeedsPredication(BasicBlock *BB) const
Return true if the block BB needs to be predicated in order for the loop to be vectorized.
bool canVectorize(bool UseVPlanNativePath)
Returns true if it is legal to vectorize this loop.
int isConsecutivePtr(Type *AccessTy, Value *Ptr) const
Check if this pointer is consecutive when vectorizing.
std::optional< const HistogramInfo * > getHistogramInfo(Instruction *I) const
Returns a HistogramInfo* for the given instruction if it was determined to be part of a load -> updat...
bool canVectorizeFPMath(bool EnableStrictReductions)
Returns true if it is legal to vectorize the FP math operations in this loop.
bool isReductionVariable(PHINode *PN) const
Returns True if PN is a reduction variable in this loop.
bool isFixedOrderRecurrence(const PHINode *Phi) const
Returns True if Phi is a fixed-order recurrence in this loop.
const InductionDescriptor * getPointerInductionDescriptor(PHINode *Phi) const
Returns a pointer to the induction descriptor, if Phi is pointer induction.
const InductionDescriptor * getIntOrFpInductionDescriptor(PHINode *Phi) const
Returns a pointer to the induction descriptor, if Phi is an integer or floating point induction.
bool isInductionPhi(const Value *V) const
Returns True if V is a Phi node of an induction variable in this loop.
PHINode * getPrimaryInduction()
Returns the primary induction variable.
const SmallVector< BasicBlock *, 4 > & getCountableExitingBlocks() const
Returns all exiting blocks with a countable exit, i.e.
const InductionList & getInductionVars() const
Returns the induction variables found in the loop.
bool hasStructVectorCall() const
Returns true if there is at least one function call in the loop which returns a struct type and needs...
bool isInvariant(Value *V) const
Returns true if V is invariant across all loop iterations according to SCEV.
const ReductionList & getReductionVars() const
Returns the reduction variables found in the loop.
bool canFoldTailByMasking() const
Return true if we can vectorize this loop while folding its tail by masking.
void prepareToFoldTailByMasking()
Mark all respective loads/stores for masking.
Type * getWidestInductionType()
Returns the widest induction type.
bool hasUncountableEarlyExit() const
Returns true if the loop has an uncountable early exit, i.e.
bool hasHistograms() const
Returns true if the loop contains at least one histogram operation.
const LoopAccessInfo * getLAI() const
bool isUniformMemOp(Instruction &I, ElementCount VF) const
A uniform memory op is a load or store which accesses the same memory location on all VF lanes,...
BasicBlock * getUncountableEarlyExitingBlock() const
Returns the uncountable early exiting block.
bool isMaskRequired(const Instruction *I) const
Returns true if vector representation of the instruction I requires mask.
const RuntimePointerChecking * getRuntimePointerChecking() const
Returns the information that we collected about runtime memory check.
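A minimal sketch, assuming `using namespace llvm;` and a populated LoopVectorizationLegality, of how the induction and reduction queries listed above classify a header phi; the helper name and the early returns are illustrative.

static void classifyHeaderPhi(LoopVectorizationLegality &Legal, PHINode *Phi) {
  if (Legal.isReductionVariable(Phi))
    return; // handled by the reduction lowering path
  if (const InductionDescriptor *ID =
          Legal.getIntOrFpInductionDescriptor(Phi)) {
    (void)ID; // integer/FP induction: start value, step and kind live here
    return;
  }
  if (Legal.isFixedOrderRecurrence(Phi)) {
    // would be lowered as a first-order recurrence splice
  }
}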
Planner drives the vectorization process after having passed Legality checks.
VectorizationFactor selectEpilogueVectorizationFactor(const ElementCount MaxVF, unsigned IC)
VPlan & getPlanFor(ElementCount VF) const
Return the VPlan for VF.
Definition: VPlan.cpp:1637
VectorizationFactor planInVPlanNativePath(ElementCount UserVF)
Use the VPlan-native path to plan how to best vectorize, return the best VF and its cost.
void buildVPlans(ElementCount MinVF, ElementCount MaxVF)
Build VPlans for power-of-2 VF's between MinVF and MaxVF inclusive, according to the information gath...
Definition: VPlan.cpp:1625
VectorizationFactor computeBestVF()
Compute and return the most profitable vectorization factor.
void emitInvalidCostRemarks(OptimizationRemarkEmitter *ORE)
Emit remarks for recipes with invalid costs in the available VPlans.
static bool getDecisionAndClampRange(const std::function< bool(ElementCount)> &Predicate, VFRange &Range)
Test a Predicate on a Range of VF's.
Definition: VPlan.cpp:1606
void printPlans(raw_ostream &O)
Definition: VPlan.cpp:1651
void plan(ElementCount UserVF, unsigned UserIC)
Build VPlans for the specified UserVF and UserIC if they are non-zero or all applicable candidate VFs...
DenseMap< const SCEV *, Value * > executePlan(ElementCount VF, unsigned UF, VPlan &BestPlan, InnerLoopVectorizer &LB, DominatorTree *DT, bool VectorizingEpilogue, const DenseMap< const SCEV *, Value * > *ExpandedSCEVs=nullptr)
Generate the IR code for the vectorized loop captured in VPlan BestPlan according to the best selecte...
bool hasPlanWithVF(ElementCount VF) const
Look through the existing plans and return true if we have one with vectorization factor VF.
This holds vectorization requirements that must be verified late in the process.
Utility class for getting and setting loop vectorizer hints in the form of loop metadata.
bool allowVectorization(Function *F, Loop *L, bool VectorizeOnlyWhenForced) const
bool allowReordering() const
When vectorization-enabling loop hints are provided, we allow the vectorizer to change the order of operations that ...
void emitRemarkWithHints() const
Dumps all the hint information.
void setAlreadyVectorized()
Mark the loop L as already vectorized by setting the width to 1.
const char * vectorizeAnalysisPassName() const
If hints are provided that force vectorization, use the AlwaysPrint pass name to force the frontend t...
void prepareNoAliasMetadata()
Set up the aliasing scopes based on the memchecks.
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:39
bool hasLoopInvariantOperands(const Instruction *I) const
Return true if all the operands of the specified instruction are loop invariant.
Definition: LoopInfo.cpp:67
DebugLoc getStartLoc() const
Return the debug location of the start of this loop.
Definition: LoopInfo.cpp:632
bool isLoopInvariant(const Value *V) const
Return true if the specified value is loop invariant.
Definition: LoopInfo.cpp:61
MDNode * getLoopID() const
Return the llvm.loop loop id metadata node for this loop if it is present.
Definition: LoopInfo.cpp:502
Metadata node.
Definition: Metadata.h:1069
void replaceOperandWith(unsigned I, Metadata *New)
Replace a specific operand.
Definition: Metadata.cpp:1077
const MDOperand & getOperand(unsigned I) const
Definition: Metadata.h:1430
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition: Metadata.h:1545
unsigned getNumOperands() const
Return number of MDNode operands.
Definition: Metadata.h:1436
static MDString * get(LLVMContext &Context, StringRef Str)
Definition: Metadata.cpp:606
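A hedged sketch of the llvm.loop metadata idiom the hint entries above rely on, assuming `using namespace llvm;`; it builds an "llvm.loop.isvectorized" hint and a self-referential loop ID. The helper name is illustrative and error handling is omitted.

static MDNode *buildIsVectorizedID(LLVMContext &Ctx) {
  Metadata *Vectorized[] = {
      MDString::get(Ctx, "llvm.loop.isvectorized"),
      ConstantAsMetadata::get(ConstantInt::get(Type::getInt32Ty(Ctx), 1))};
  SmallVector<Metadata *, 2> Ops;
  Ops.push_back(nullptr); // reserve the self-reference slot
  Ops.push_back(MDNode::get(Ctx, Vectorized));
  MDNode *LoopID = MDNode::getDistinct(Ctx, Ops);
  LoopID->replaceOperandWith(0, LoopID); // llvm.loop IDs refer to themselves
  return LoopID;
}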
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:36
iterator find(const KeyT &Key)
Definition: MapVector.h:167
bool contains(const KeyT &Key) const
Definition: MapVector.h:163
bool empty() const
Definition: MapVector.h:79
size_type size() const
Definition: MapVector.h:60
Function * getFunction(StringRef Name) const
Look up the specified function in the module symbol table.
Definition: Module.cpp:228
Diagnostic information for optimization analysis remarks related to pointer aliasing.
Diagnostic information for optimization analysis remarks related to floating-point non-commutativity.
Diagnostic information for optimization analysis remarks.
The optimization diagnostic interface.
bool allowExtraAnalysis(StringRef PassName) const
Whether we allow for extra compile-time budget to perform more analysis to produce fewer false positi...
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for missed-optimization remarks.
Diagnostic information for applied optimization remarks.
An analysis over an "inner" IR unit that provides access to an analysis manager over an "outer" IR uni...
Definition: PassManager.h:692
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Value * getIncomingValueForBlock(const BasicBlock *BB) const
static unsigned getIncomingValueNumForOperand(unsigned i)
static PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
Definition: Constants.cpp:1878
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
const SCEVPredicate & getPredicate() const
unsigned getSmallConstantMaxTripCount()
Returns the upper bound of the loop trip count as a normal unsigned value, or 0 if the trip count is ...
const SCEV * getBackedgeTakenCount()
Get the (predicated) backedge count for the analyzed loop.
const SCEV * getSymbolicMaxBackedgeTakenCount()
Get the (predicated) symbolic max backedge count for the analyzed loop.
const SCEV * getSCEV(Value *V)
Returns the SCEV expression of V, in the context of the current SCEV predicate.
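A minimal sketch, assuming `using namespace llvm;`, of querying the predicated backedge-taken count exposed above and bailing out when it is unknown; the helper name is illustrative.

static bool hasKnownBackedgeCount(PredicatedScalarEvolution &PSE) {
  const SCEV *BTC = PSE.getBackedgeTakenCount();
  // SCEVCouldNotCompute signals that no (predicated) count could be derived.
  return !isa<SCEVCouldNotCompute>(BTC);
}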
A set of analyses that are preserved following a run of a transformation pass.
Definition: Analysis.h:111
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: Analysis.h:117
void preserveSet()
Mark an analysis set as preserved.
Definition: Analysis.h:146
void preserve()
Mark an analysis as preserved.
Definition: Analysis.h:131
An analysis pass based on the new PM to deliver ProfileSummaryInfo.
Analysis providing profile information.
bool hasProfileSummary() const
Returns true if profile summary is available.
The RecurrenceDescriptor is used to identify recurrence variables in a loop.
Definition: IVDescriptors.h:77
static bool isFMulAddIntrinsic(Instruction *I)
Returns true if the instruction is a call to the llvm.fmuladd intrinsic.
FastMathFlags getFastMathFlags() const
Instruction * getLoopExitInstr() const
static unsigned getOpcode(RecurKind Kind)
Returns the opcode corresponding to the RecurrenceKind.
Type * getRecurrenceType() const
Returns the type of the recurrence.
const SmallPtrSet< Instruction *, 8 > & getCastInsts() const
Returns a reference to the instructions used for type-promoting the recurrence.
unsigned getMinWidthCastToRecurrenceTypeInBits() const
Returns the minimum width used by the recurrence in bits.
TrackingVH< Value > getRecurrenceStartValue() const
SmallVector< Instruction *, 4 > getReductionOpChain(PHINode *Phi, Loop *L) const
Attempts to find a chain of operations from Phi to LoopExitInst that can be treated as a set of reduc...
static bool isAnyOfRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
static bool isFindLastIVRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
bool isSigned() const
Returns true if all source operands of the recurrence are SExtInsts.
RecurKind getRecurrenceKind() const
bool isOrdered() const
Expose an ordered FP reduction to the instance users.
Value * getSentinelValue() const
Returns the sentinel value for FindLastIV recurrences to replace the start value.
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
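A hedged sketch, assuming `using namespace llvm;` and a populated LoopVectorizationLegality, of walking the discovered reductions and querying each descriptor as the entries above describe; the helper name is illustrative.

static void inspectReductions(const LoopVectorizationLegality &Legal) {
  for (const auto &Entry : Legal.getReductionVars()) {
    const RecurrenceDescriptor &RdxDesc = Entry.second;
    RecurKind Kind = RdxDesc.getRecurrenceKind();
    if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
      continue; // lowered with compare+select rather than a plain binary op
    unsigned Opcode = RecurrenceDescriptor::getOpcode(Kind);
    (void)Opcode; // the scalar opcode the reduction chain repeats
  }
}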
bool Need
This flag indicates if we need to add the runtime check.
std::optional< ArrayRef< PointerDiffInfo > > getDiffChecks() const
const SmallVectorImpl< RuntimePointerCheck > & getChecks() const
Returns the checks that generateChecks created.
This class represents a constant integer value.
const APInt & getAPInt() const
Helper to remove instructions inserted during SCEV expansion, unless they are marked as used.
This class uses information about analyze scalars to rewrite expressions in canonical form.
ScalarEvolution * getSE()
bool isInsertedInstruction(Instruction *I) const
Return true if the specified instruction was inserted by the code rewriter.
Value * expandCodeForPredicate(const SCEVPredicate *Pred, Instruction *Loc)
Generates a code sequence that evaluates this predicate.
This class represents an assumption made using SCEV expressions which can be checked at run-time.
virtual bool isAlwaysTrue() const =0
Returns true if the predicate is always true.
This class represents an analyzed expression in the program.
bool isOne() const
Return true if the expression is a constant one.
bool isZero() const
Return true if the expression is a constant zero.
Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
const SCEV * getURemExpr(const SCEV *LHS, const SCEV *RHS)
Represents an unsigned remainder expression based on unsigned division.
const SCEV * getConstant(ConstantInt *V)
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
const SCEV * getOne(Type *Ty)
Return a SCEV for the constant 1 of a specific type.
void forgetLoop(const Loop *L)
This method should be called by the client when it has changed a loop in a way that may affect Scalar...
bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
bool isKnownPredicate(ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS)
Test if the given expression is known to satisfy the condition described by Pred, LHS,...
bool isSCEVable(Type *Ty) const
Test if values of the given type are analyzable within the SCEV framework.
void forgetValue(Value *V)
This method should be called by the client when it has changed a value in a way that may affect its v...
void forgetBlockAndLoopDispositions(Value *V=nullptr)
Called when the client has changed the disposition of values in a loop or block.
void forgetLcssaPhiWithNewPredecessor(Loop *L, PHINode *V)
Forget LCSSA phi node V of loop L to which a new predecessor was added, such that it may no longer be...
unsigned getSmallConstantTripCount(const Loop *L)
Returns the exact trip count of the loop if we can compute it, and the result is a small constant.
APInt getUnsignedRangeMax(const SCEV *S)
Determine the max of the unsigned range for a particular SCEV.
const SCEV * applyLoopGuards(const SCEV *Expr, const Loop *L)
Try to apply information from loop guards for L to Expr.
const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
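A hedged sketch, assuming `using namespace llvm;`, of forming "trip count mod VF" as a SCEV, the kind of remainder expression used when reasoning about leftover tail iterations; the helper name is illustrative and the Type/uint64_t getConstant overload is assumed to be available.

static const SCEV *tripCountRemainder(ScalarEvolution &SE,
                                      const SCEV *TripCount, unsigned VF) {
  const SCEV *VFConst = SE.getConstant(TripCount->getType(), VF);
  return SE.getURemExpr(TripCount, VFConst); // TripCount urem VF
}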
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
Definition: SetVector.h:57
ArrayRef< value_type > getArrayRef() const
Definition: SetVector.h:84
size_type size() const
Determine the number of elements in the SetVector.
Definition: SetVector.h:98
iterator end()
Get an iterator to the end of the SetVector.
Definition: SetVector.h:113
size_type count(const key_type &key) const
Count the number of elements of a given key in the SetVector.
Definition: SetVector.h:264
bool empty() const
Determine if the SetVector is empty or not.
Definition: SetVector.h:93
iterator begin()
Get an iterator to the beginning of the SetVector.
Definition: SetVector.h:103
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:162
value_type pop_back_val()
Definition: SetVector.h:285
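A minimal sketch, assuming `using namespace llvm;`, of the worklist idiom the SetVector entries above support: each value is enqueued at most once, in deterministic order, so the loop terminates even on cyclic use graphs. The helper name is illustrative.

static void collectTransitiveUsers(Value *Root) {
  SmallSetVector<Value *, 8> Worklist;
  Worklist.insert(Root);
  // Index-based iteration lets the worklist grow while it is being walked;
  // insert() silently ignores values that are already present.
  for (unsigned I = 0; I != Worklist.size(); ++I)
    for (User *U : Worklist[I]->users())
      Worklist.insert(U);
}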
size_type size() const
Definition: SmallPtrSet.h:94
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:363
bool erase(PtrType Ptr)
Remove pointer from the set.
Definition: SmallPtrSet.h:401
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:452
iterator end() const
Definition: SmallPtrSet.h:477
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:384
iterator begin() const
Definition: SmallPtrSet.h:472
bool contains(ConstPtrType Ptr) const
Definition: SmallPtrSet.h:458
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:519
A SetVector that performs no allocations if smaller than a certain size.
Definition: SetVector.h:370
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:132
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:181
bool empty() const
Definition: SmallVector.h:81
size_t size() const
Definition: SmallVector.h:78
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:573
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:937
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:683
void push_back(const T &Elt)
Definition: SmallVector.h:413
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1196
An instruction for storing to memory.
Definition: Instructions.h:292
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:51
Multiway switch.
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
std::optional< unsigned > getVScaleForTuning() const
bool isLegalMaskedScatter(Type *DataType, Align Alignment) const
Return true if the target supports masked scatter.
bool enableAggressiveInterleaving(bool LoopHasReductions) const
Don't restrict interleaved unrolling to small loops.
bool preferInLoopReduction(unsigned Opcode, Type *Ty, ReductionFlags Flags) const
bool supportsEfficientVectorElementLoadStore() const
If target has efficient vector element load/store instructions, it can return true here so that inser...
bool prefersVectorizedAddressing() const
Return true if target doesn't mind addresses in vectors.
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueInfo Op1Info={OK_AnyValue, OP_None}, OperandValueInfo Op2Info={OK_AnyValue, OP_None}, const Instruction *I=nullptr) const
bool hasBranchDivergence(const Function *F=nullptr) const
Return true if branch divergence exists.
InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *SE=nullptr, const SCEV *Ptr=nullptr) const
void getUnrollingPreferences(Loop *L, ScalarEvolution &, UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const
Get target-customized preferences for the generic loop unrolling transformation.
InstructionCost getOperandsScalarizationOverhead(ArrayRef< const Value * > Args, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind) const
Estimate the overhead of scalarizing an instruction's unique non-constant operands.
TypeSize getRegisterBitWidth(RegisterKind K) const
bool isLegalMaskedGather(Type *DataType, Align Alignment) const
Return true if the target supports masked gather.
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueInfo OpdInfo={OK_AnyValue, OP_None}, const Instruction *I=nullptr) const
std::optional< unsigned > getMaxVScale() const
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, bool UseMaskForCond=false, bool UseMaskForGaps=false) const
bool enableOrderedReductions() const
Return true if we should be enabling ordered reductions for the target.
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of vector reduction intrinsics.
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
static PartialReductionExtendKind getPartialReductionExtendKind(Instruction *I)
Get the kind of extension that an instruction represents.
bool shouldMaximizeVectorBandwidth(TargetTransformInfo::RegisterKind K) const
TailFoldingStyle getPreferredTailFoldingStyle(bool IVUpdateMayOverflow=true) const
Query the target what the preferred style of tail folding is.
unsigned getRegUsageForType(Type *Ty) const
Returns the estimated number of registers required to represent Ty.
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of an extended reduction pattern, similar to getArithmeticReductionCost of a reduc...
static OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
InstructionCost getMulAccReductionCost(bool IsUnsigned, Type *ResTy, VectorType *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of an extended reduction pattern, similar to getArithmeticReductionCost of an Add ...
unsigned getRegisterClassForType(bool Vector, Type *Ty=nullptr) const
bool isElementTypeLegalForScalableVector(Type *Ty) const
ElementCount getMinimumVF(unsigned ElemWidth, bool IsScalable) const
bool enableMaskedInterleavedAccessVectorization() const
Enable matching of interleaved access groups that contain predicated accesses or gaps and therefore v...
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF=FastMathFlags(), TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr, const TargetLibraryInfo *TLibInfo=nullptr) const
This is an approximation of reciprocal throughput of a math/logic op.
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
bool preferPredicatedReductionSelect(unsigned Opcode, Type *Ty, ReductionFlags Flags) const
InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const
const char * getRegisterClassName(unsigned ClassID) const
bool preferEpilogueVectorization() const
Return true if the loop vectorizer should consider vectorizing an otherwise scalar epilogue loop.
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, const Instruction *I=nullptr) const
bool hasActiveVectorLength(unsigned Opcode, Type *DataType, Align Alignment) const
unsigned getEpilogueVectorizationMinVF() const
unsigned getNumberOfRegisters(unsigned ClassID) const
bool isFPVectorizationPotentiallyUnsafe() const
Indicate that it is potentially unsafe to automatically vectorize floating-point operations because t...
bool isLegalMaskedStore(Type *DataType, Align Alignment) const
Return true if the target supports masked store.
@ TCC_Free
Expected to fold away in lowering.
InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, ArrayRef< Value * > VL={}) const
Estimate the overhead of scalarizing an instruction.
bool enableInterleavedAccessVectorization() const
Enable matching of interleaved access groups.
unsigned getMinTripCountTailFoldingThreshold() const
InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TargetCostKind CostKind) const
Estimate the cost of a given IR user when lowered.
unsigned getMaxInterleaveFactor(ElementCount VF) const
unsigned getNumberOfParts(Type *Tp) const
InstructionCost getPartialReductionCost(unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType, ElementCount VF, PartialReductionExtendKind OpAExtend, PartialReductionExtendKind OpBExtend, std::optional< unsigned > BinOp=std::nullopt) const
bool isTruncateFree(Type *Ty1, Type *Ty2) const
Return true if it's free to truncate a value of type Ty1 to type Ty2.
bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) const
Query the target whether it would be preferred to create a predicated vector loop, which can avoid the...
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index=-1, Value *Op0=nullptr, Value *Op1=nullptr) const
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_Reverse
Reverse the order of the vector.
InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency) const
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
CastContextHint
Represents a hint about the context in which a cast is used.
@ Reversed
The cast is used with a reversed load/store.
@ Masked
The cast is used with a masked load/store.
@ None
The cast is not used with a load/store of any kind.
@ Normal
The cast is used with a normal load/store.
@ Interleave
The cast is used with an interleaved load/store.
@ GatherScatter
The cast is used with a gather/scatter.
bool isLegalMaskedLoad(Type *DataType, Align Alignment) const
Return true if the target supports masked load.
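A minimal sketch, assuming `using namespace llvm;`, of combining the TTI cost and legality queries listed above for a wide load; the helper name, alignment, and address space are illustrative.

static InstructionCost wideLoadCost(const TargetTransformInfo &TTI,
                                    VectorType *VecTy, bool NeedsMask) {
  const Align Alignment(4);     // illustrative alignment
  const unsigned AddrSpace = 0; // illustrative address space
  if (NeedsMask && !TTI.isLegalMaskedLoad(VecTy, Alignment))
    return InstructionCost::getInvalid();
  if (NeedsMask)
    return TTI.getMaskedMemoryOpCost(Instruction::Load, VecTy, Alignment,
                                     AddrSpace,
                                     TargetTransformInfo::TCK_RecipThroughput);
  return TTI.getMemoryOpCost(Instruction::Load, VecTy, Alignment, AddrSpace,
                             TargetTransformInfo::TCK_RecipThroughput);
}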
This class represents a truncation of integer types.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
This class implements a switch-like dispatch statement for a value of 'T' using dyn_cast functionalit...
Definition: TypeSwitch.h:87
TypeSwitch< T, ResultT > & Case(CallableT &&caseFn)
Add a case on the given type.
Definition: TypeSwitch.h:96
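A minimal TypeSwitch sketch, assuming `using namespace llvm;`: dispatch on the dynamic type of an instruction with dyn_cast-based cases instead of an if/else chain. The classification and helper name are illustrative.

static bool touchesMemory(const Instruction *I) {
  return TypeSwitch<const Instruction *, bool>(I)
      .Case<LoadInst>([](const LoadInst *) { return true; })
      .Case<StoreInst>([](const StoreInst *) { return true; })
      .Default([](const Instruction *) { return false; });
}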
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
unsigned getIntegerBitWidth() const
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:270
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:264
static IntegerType * getInt1Ty(LLVMContext &C)
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
static Type * getVoidTy(LLVMContext &C)
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:128
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:184
bool isIntOrPtrTy() const
Return true if this is an integer type or a pointer type.
Definition: Type.h:252
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:237
bool isTokenTy() const
Return true if this is 'token'.
Definition: Type.h:234
bool isVoidTy() const
Return true if this is 'void'.
Definition: Type.h:139
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:355
This function has undefined behavior.
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
op_range operands()
Definition: User.h:288
bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
Definition: User.cpp:21
op_iterator op_begin()
Definition: User.h:280
void setOperand(unsigned i, Value *Val)
Definition: User.h:233
Value * getOperand(unsigned i) const
Definition: User.h:228
op_iterator op_end()
Definition: User.h:282
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
Definition: VectorUtils.h:72
VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph.
Definition: VPlan.h:3529
void appendRecipe(VPRecipeBase *Recipe)
Augment the existing recipes of a VPBasicBlock with an additional Recipe as the last recipe.
Definition: VPlan.h:3601
RecipeListTy::iterator iterator
Instruction iterators...
Definition: VPlan.h:3553
void execute(VPTransformState *State) override
The method which generates the output IR instructions that correspond to this VPBasicBlock,...
Definition: VPlan.cpp:480
iterator end()
Definition: VPlan.h:3563
iterator begin()
Recipe iterator methods.
Definition: VPlan.h:3561
iterator_range< iterator > phis()
Returns an iterator range over the PHI-like recipes in the block.
Definition: VPlan.h:3614
iterator getFirstNonPhi()
Return the position of the first non-phi node recipe in the block.
Definition: VPlan.cpp:208
void insert(VPRecipeBase *Recipe, iterator InsertPt)
Definition: VPlan.h:3592
bool empty() const
Definition: VPlan.h:3572
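A hedged sketch against the VPBasicBlock interface above; VPlan classes are internal to the vectorizer and evolve quickly, and the helper name is illustrative. It walks the PHI-like recipes first, then the rest of the block.

static void walkBlock(VPBasicBlock *VPBB) {
  for (VPRecipeBase &PhiR : VPBB->phis())
    (void)PhiR; // header/blend/reduction phi recipes come first
  for (auto It = VPBB->getFirstNonPhi(), E = VPBB->end(); It != E; ++It) {
    VPRecipeBase &R = *It;
    (void)R; // non-phi recipes in block order
  }
}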
A recipe for vectorizing a phi-node as a sequence of mask-based select instructions.
Definition: VPlan.h:2487
VPBlockBase is the building block of the Hierarchical Control-Flow Graph.
Definition: VPlan.h:397
VPRegionBlock * getParent()
Definition: VPlan.h:489
const VPBasicBlock * getExitingBasicBlock() const
Definition: VPlan.cpp:178
void setName(const Twine &newName)
Definition: VPlan.h:482
size_t getNumSuccessors() const
Definition: VPlan.h:535
void swapSuccessors()
Swap successors of the block. The block must have exactly 2 successors.
Definition: VPlan.h:628
const VPBlocksTy & getPredecessors() const
Definition: VPlan.h:520
VPlan * getPlan()
Definition: VPlan.cpp:153
VPBlockBase * getSinglePredecessor() const
Definition: VPlan.h:531
const VPBasicBlock * getEntryBasicBlock() const
Definition: VPlan.cpp:158
VPBlockBase * getSingleSuccessor() const
Definition: VPlan.h:525
const VPBlocksTy & getSuccessors() const
Definition: VPlan.h:514
static void insertBlockAfter(VPBlockBase *NewBlock, VPBlockBase *BlockPtr)
Insert disconnected VPBlockBase NewBlock after BlockPtr.
Definition: VPlan.h:4203
static void insertOnEdge(VPBlockBase *From, VPBlockBase *To, VPBlockBase *BlockPtr)
Inserts BlockPtr on the edge between From and To.
Definition: VPlan.h:4319
static void connectBlocks(VPBlockBase *From, VPBlockBase *To, unsigned PredIdx=-1u, unsigned SuccIdx=-1u)
Connect VPBlockBases From and To bi-directionally.
Definition: VPlan.h:4257
static void reassociateBlocks(VPBlockBase *Old, VPBlockBase *New)
Reassociate all the blocks connected to Old so that they now point to New.
Definition: VPlan.h:4284
RAII object that stores the current insertion point and restores it when the object is destroyed.
VPlan-based builder utility analogous to IRBuilder.
VPValue * createICmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B, DebugLoc DL={}, const Twine &Name="")
Create a new ICmp VPInstruction with predicate Pred and operands A and B.
VPValue * createOr(VPValue *LHS, VPValue *RHS, DebugLoc DL={}, const Twine &Name="")
VPBasicBlock * getInsertBlock() const
VPDerivedIVRecipe * createDerivedIV(InductionDescriptor::InductionKind Kind, FPMathOperator *FPBinOp, VPValue *Start, VPValue *Current, VPValue *Step, const Twine &Name="")
Convert the input value Current to the corresponding value of an induction with Start and Step values...
VPScalarCastRecipe * createScalarCast(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy, DebugLoc DL)
VPInstruction * createOverflowingOp(unsigned Opcode, std::initializer_list< VPValue * > Operands, VPRecipeWithIRFlags::WrapFlagsTy WrapFlags, DebugLoc DL={}, const Twine &Name="")
VPInstruction * createNaryOp(unsigned Opcode, ArrayRef< VPValue * > Operands, Instruction *Inst=nullptr, const Twine &Name="")
Create an N-ary operation with Opcode, Operands and set Inst as its underlying Instruction.
VPValue * createNot(VPValue *Operand, DebugLoc DL={}, const Twine &Name="")
VPValue * createLogicalAnd(VPValue *LHS, VPValue *RHS, DebugLoc DL={}, const Twine &Name="")
VPValue * createSelect(VPValue *Cond, VPValue *TrueVal, VPValue *FalseVal, DebugLoc DL={}, const Twine &Name="", std::optional< FastMathFlags > FMFs=std::nullopt)
void setInsertPoint(VPBasicBlock *TheBB)
This specifies that created VPInstructions should be appended to the end of the specified block.
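A hedged sketch of the VPBuilder entries above (internal API whose exact signatures change between releases): position the builder at a block and emit a compare followed by a select over two existing VPValues, yielding their unsigned minimum. The helper name is illustrative.

static VPValue *emitUMin(VPBuilder &Builder, VPBasicBlock *VPBB, VPValue *A,
                         VPValue *B) {
  Builder.setInsertPoint(VPBB);
  VPValue *Cmp = Builder.createICmp(CmpInst::ICMP_ULE, A, B);
  return Builder.createSelect(Cmp, A, B); // select(A <= B, A, B)
}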
Canonical scalar induction phi of the vector loop.
Definition: VPlan.h:3226
Type * getScalarType() const
Returns the scalar type of the induction.
Definition: VPlan.h:3257
VPValue * getVPSingleValue()
Returns the only VPValue defined by the VPDef.
Definition: VPlanValue.h:388
void execute(VPTransformState &State) override
Generate the transformed value of the induction at offset StartValue (1st operand of the recipe) + IV (2nd operand) * StepValue (3rd operand).
VPValue * getStepValue() const
Definition: VPlan.h:3459
VPValue * getStartValue() const
Definition: VPlan.h:3458
A pure virtual base class for all recipes modeling header phis, including phis for first order recurr...
Definition: VPlan.h:2026
virtual VPValue * getBackedgeValue()
Returns the incoming value from the loop backedge.
Definition: VPlan.h:2074
VPValue * getStartValue()
Returns the start value of the phi, if one is set.
Definition: VPlan.h:2063
A recipe representing a sequence of load -> update -> store as part of a histogram operation.
Definition: VPlan.h:1775
A special type of VPBasicBlock that wraps an existing IR basic block.
Definition: VPlan.h:3668
A recipe to wrap an original IR instruction not to be modified during execution, except for PHIs.
Definition: VPlan.h:1380
This is a concrete Recipe that models a single VPlan-level instruction.
Definition: VPlan.h:1192
@ ResumePhi
Creates a scalar phi in a leaf VPBB with a single predecessor in VPlan.
Definition: VPlan.h:1210
VPInterleaveRecipe is a recipe for transforming an interleave group of load or stores into one wide l...
Definition: VPlan.h:2554
In what follows, the term "input IR" refers to code that is fed into the vectorizer whereas the term ...
Definition: VPlan.h:153
static VPLane getLastLaneForVF(const ElementCount &VF)
Definition: VPlan.h:194
static VPLane getFirstLane()
Definition: VPlan.h:178
A recipe for forming partial reductions.
Definition: VPlan.h:2444
VPRecipeBase is a base class modeling a sequence of one or more output IR instructions.
Definition: VPlan.h:714
VPBasicBlock * getParent()
Definition: VPlan.h:739
DebugLoc getDebugLoc() const
Returns the debug location of the recipe.
Definition: VPlan.h:808
void insertBefore(VPRecipeBase *InsertPos)
Insert an unlinked recipe into a basic block immediately before the specified recipe.
void insertAfter(VPRecipeBase *InsertPos)
Insert an unlinked Recipe into a basic block immediately after the specified Recipe.
iplist< VPRecipeBase >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Helper class to create VPRecipes from IR instructions.
VPRecipeBase * tryToCreatePartialReduction(Instruction *Reduction, ArrayRef< VPValue * > Operands)
Create and return a partial reduction recipe for a reduction instruction along with binary operation ...
VPValue * createEdgeMask(BasicBlock *Src, BasicBlock *Dst)
A helper function that computes the predicate of the edge between SRC and DST.
VPReplicateRecipe * handleReplication(Instruction *I, VFRange &Range)
Build a VPReplicateRecipe for I.
void createSwitchEdgeMasks(SwitchInst *SI)
Create an edge mask for every destination of cases and/or default.
std::optional< std::pair< PartialReductionChain, unsigned > > getScaledReductionForInstr(const Instruction *ExitInst)
VPValue * getBlockInMask(BasicBlock *BB) const
Returns the entry mask for the block BB.
VPValue * getEdgeMask(BasicBlock *Src, BasicBlock *Dst) const
A helper that returns the previously computed predicate of the edge between SRC and DST.
iterator_range< mapped_iterator< Use *, std::function< VPValue *(Value *)> > > mapToVPValues(User::op_range Operands)
Returns a range mapping the values of the range Operands to their corresponding VPValues.
void fixHeaderPhis()
Add the incoming values from the backedge to reduction & first-order recurrence cross-iteration phis.
VPRecipeBase * tryToCreateWidenRecipe(Instruction *Instr, ArrayRef< VPValue * > Operands, VFRange &Range, VPBasicBlock *VPBB)
Create and return a widened recipe for I if one can be created within the given VF Range.
VPValue * getVPValueOrAddLiveIn(Value *V)
void createHeaderMask()
Create the mask for the vector loop header block.
void createBlockInMask(BasicBlock *BB)
A helper function that computes the predicate of the block BB, assuming that the header block of the ...
void collectScaledReductions(VFRange &Range)
Find all possible partial reductions in the loop and track all of those that are valid so recipes can...
VPRecipeBase * getRecipe(Instruction *I)
Return the recipe created for given ingredient.
void setFlags(Instruction *I) const
Set the IR flags for I.
Definition: VPlan.h:1106
A recipe for handling reduction phis.
Definition: VPlan.h:2378
bool isInLoop() const
Returns true, if the phi is part of an in-loop reduction.
Definition: VPlan.h:2437
const RecurrenceDescriptor & getRecurrenceDescriptor() const
Definition: VPlan.h:2429
A recipe to represent inloop reduction operations, performing a reduction on a vector operand into a ...
Definition: VPlan.h:2649
VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks which form a Single-Entry-S...
Definition: VPlan.h:3700
const VPBlockBase * getEntry() const
Definition: VPlan.h:3733
bool isReplicator() const
An indicator whether this region is to generate multiple replicated instances of output IR correspond...
Definition: VPlan.h:3765
VPReplicateRecipe replicates a given instruction producing multiple scalar copies of the original sca...
Definition: VPlan.h:2770
void execute(VPTransformState &State) override
Generate replicas of the desired Ingredient.
bool isUniform() const
Definition: VPlan.h:2814
bool shouldPack() const
Returns true if the recipe is used by a widened recipe via an intervening VPPredInstPHIRecipe.
A recipe to compute the pointers for widened memory accesses of IndexTy in reverse order.
Definition: VPlan.h:1903
VPSingleDef is a base class for recipes for modeling a sequence of one or more output IR that define ...
Definition: VPlan.h:841
Instruction * getUnderlyingInstr()
Returns the underlying instruction.
Definition: VPlan.h:911
An analysis for type-inference for VPValues.
Definition: VPlanAnalysis.h:40
Type * inferScalarType(const VPValue *V)
Infer the type of V. Returns the scalar type of V.
This class augments VPValue with operands which provide the inverse def-use edges from VPValue's user...
Definition: VPlanValue.h:200
operand_range operands()
Definition: VPlanValue.h:257
void setOperand(unsigned I, VPValue *New)
Definition: VPlanValue.h:242
unsigned getNumOperands() const
Definition: VPlanValue.h:236
VPValue * getOperand(unsigned N) const
Definition: VPlanValue.h:237
void addOperand(VPValue *Operand)
Definition: VPlanValue.h:231
VPRecipeBase * getDefiningRecipe()
Returns the recipe defining this VPValue or nullptr if it is not defined by a recipe,...
Definition: VPlan.cpp:123
void replaceAllUsesWith(VPValue *New)
Definition: VPlan.cpp:1420
Value * getLiveInIRValue()
Returns the underlying IR value, if this VPValue is defined outside the scope of VPlan.
Definition: VPlanValue.h:172
bool isLiveIn() const
Returns true if this VPValue is a live-in, i.e. defined outside the VPlan.
Definition: VPlanValue.h:167
void replaceUsesWithIf(VPValue *New, llvm::function_ref< bool(VPUser &U, unsigned Idx)> ShouldReplace)
Go through the uses list for this VPValue and make each use point to New if the callback ShouldReplac...
Definition: VPlan.cpp:1424
user_range users()
Definition: VPlanValue.h:132
A recipe to compute the pointers for widened memory accesses of IndexTy.
Definition: VPlan.h:1956
A recipe for widening Call instructions using library calls.
Definition: VPlan.h:1719
A Recipe for widening the canonical induction variable of the vector loop.
Definition: VPlan.h:3367
VPWidenCastRecipe is a recipe to create vector cast instructions.
Definition: VPlan.h:1527
A recipe for handling GEP instructions.
Definition: VPlan.h:1854
Base class for widened induction (VPWidenIntOrFpInductionRecipe and VPWidenPointerInductionRecipe),...
Definition: VPlan.h:2088
VPValue * getStepValue()
Returns the step value of the induction.
Definition: VPlan.h:2116
const InductionDescriptor & getInductionDescriptor() const
Returns the induction descriptor for the recipe.
Definition: VPlan.h:2122
A recipe for handling phi nodes of integer and floating-point inductions, producing their vector valu...
Definition: VPlan.h:2141
A recipe for widening vector intrinsics.
Definition: VPlan.h:1627
A common base class for widening memory operations.
Definition: VPlan.h:2943
A recipe for handling phis that are widened in the vector loop.
Definition: VPlan.h:2301
VPValue * getIncomingValue(unsigned I)
Returns the I th incoming VPValue.
Definition: VPlan.h:2340
VPBasicBlock * getIncomingBlock(unsigned I)
Returns the I th incoming VPBasicBlock.
Definition: VPlan.h:2337
VPWidenRecipe is a recipe for producing a widened instruction using the opcode and operands of the re...
Definition: VPlan.h:1429
Main class to build the VPlan H-CFG for an incoming IR.
VPlan models a candidate for vectorization, encoding various decisions taken to produce efficient outp...
Definition: VPlan.h:3796
void prepareToExecute(Value *TripCount, Value *VectorTripCount, VPTransformState &State)
Prepare the plan for execution, setting up the required live-in values.
Definition: VPlan.cpp:924
VPBasicBlock * getEntry()
Definition: VPlan.h:3909
VPValue & getVectorTripCount()
The vector trip count.
Definition: VPlan.h:3971
VPValue & getVFxUF()
Returns VF * UF of the vector loop region.
Definition: VPlan.h:3977
VPValue & getVF()
Returns the VF of the vector loop region.
Definition: VPlan.h:3974
VPValue * getTripCount() const
The trip count of the original loop.
Definition: VPlan.h:3950
VPValue * getOrCreateBackedgeTakenCount()
The backedge taken count of the original loop.
Definition: VPlan.h:3964
iterator_range< SmallSetVector< ElementCount, 2 >::iterator > vectorFactors() const
Returns an iterator range over all VFs of the plan.
Definition: VPlan.h:3994
unsigned getUF() const
Definition: VPlan.h:4002
static VPlanPtr createInitialVPlan(Type *InductionTy, PredicatedScalarEvolution &PSE, bool RequiresScalarEpilogueCheck, bool TailFolded, Loop *TheLoop)
Create initial VPlan, having an "entry" VPBasicBlock (wrapping original scalar pre-header) which cont...
Definition: VPlan.cpp:845
bool hasVF(ElementCount VF)
Definition: VPlan.h:3987
bool hasUF(unsigned UF) const
Definition: VPlan.h:4000
auto getExitBlocks()
Return an iterator range over the VPIRBasicBlock wrapping the exit blocks of the VPlan,...
Definition: VPlanCFG.h:309
VPRegionBlock * getVectorLoopRegion()
Returns the VPRegionBlock of the vector loop.
Definition: VPlan.cpp:1052
InstructionCost cost(ElementCount VF, VPCostContext &Ctx)
Return the cost of this plan.
Definition: VPlan.cpp:1046
const VPBasicBlock * getMiddleBlock() const
Returns the 'middle' block of the plan, that is the block that selects whether to execute the scalar ...
Definition: VPlan.h:3928
void resetTripCount(VPValue *NewTripCount)
Resets the trip count for the VPlan.
Definition: VPlan.h:3957
void setEntry(VPBasicBlock *VPBB)
Definition: VPlan.h:3879
VPIRBasicBlock * createVPIRBasicBlock(BasicBlock *IRBB)
Create a VPIRBasicBlock from IRBB containing VPIRInstructions for all instructions in IRBB,...
Definition: VPlan.cpp:1252
VPValue * getOrAddLiveIn(Value *V)
Gets the live-in VPValue for V or adds a new live-in (if none exists yet) for V.
Definition: VPlan.h:4020
VPBasicBlock * getScalarPreheader() const
Return the VPBasicBlock for the preheader of the scalar loop.
Definition: VPlan.h:3936
void execute(VPTransformState *State)
Generate the IR code for this VPlan.
Definition: VPlan.cpp:956
VPCanonicalIVPHIRecipe * getCanonicalIV()
Returns the canonical induction recipe of the vector loop.
Definition: VPlan.h:4054
VPIRBasicBlock * getScalarHeader() const
Return the VPIRBasicBlock wrapping the header of the scalar loop.
Definition: VPlan.h:3941
VPValue * getSCEVExpansion(const SCEV *S) const
Definition: VPlan.h:4063
VPlan * duplicate()
Clone the current VPlan, update all VPValues of the new VPlan and cloned recipes to refer to the clon...
Definition: VPlan.cpp:1192
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
bool hasOneUser() const
Return true if there is exactly one user of this value.
Definition: Value.cpp:157
void setName(const Twine &Name)
Change the name of the value.
Definition: Value.cpp:377
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:534
iterator_range< user_iterator > users()
Definition: Value.h:421
void replaceUsesWithIf(Value *New, llvm::function_ref< bool(Use &U)> ShouldReplace)
Go through the uses list for this definition and make each use point to "V" if the callback ShouldRep...
Definition: Value.cpp:542
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:1075
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
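A minimal usage sketch (not part of the generated listing) of the llvm::Value use-list APIs listed above. The helper name replaceUsesInBlock is hypothetical; it redirects only the uses of OldV that sit inside a given basic block, the kind of selective RAUW that replaceUsesWithIf enables.

  // Hedged sketch: rewrite only the uses of OldV that are inside BB.
  #include "llvm/ADT/Twine.h"
  #include "llvm/IR/BasicBlock.h"
  #include "llvm/IR/Instruction.h"
  #include "llvm/IR/Value.h"
  #include "llvm/Support/Casting.h"
  #include <cassert>

  static void replaceUsesInBlock(llvm::Value *OldV, llvm::Value *NewV,
                                 llvm::BasicBlock *BB) {
    assert(OldV->getType() == NewV->getType() && "replacement must have the same type");
    NewV->setName(OldV->getName() + ".repl"); // carry the old name over
    OldV->replaceUsesWithIf(NewV, [BB](llvm::Use &U) {
      auto *UserI = llvm::dyn_cast<llvm::Instruction>(U.getUser());
      return UserI && UserI->getParent() == BB; // uses outside BB stay untouched
    });
  }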
static VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
static bool isValidElementType(Type *ElemTy)
Return true if the specified type is valid as an element type.
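As a hedged illustration of the two VectorType entries above, a small sketch that builds a fixed or scalable vector type from a scalar element type; makeVecTy is a made-up helper name.

  #include "llvm/IR/DerivedTypes.h"

  static llvm::VectorType *makeVecTy(llvm::Type *Scalar, unsigned MinLanes,
                                     bool Scalable) {
    if (!llvm::VectorType::isValidElementType(Scalar))
      return nullptr; // e.g. void or label types cannot be vector elements
    llvm::ElementCount EC = Scalable ? llvm::ElementCount::getScalable(MinLanes)
                                     : llvm::ElementCount::getFixed(MinLanes);
    // Yields <MinLanes x Scalar> or <vscale x MinLanes x Scalar>.
    return llvm::VectorType::get(Scalar, EC);
  }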
int getNumOccurrences() const
Definition: CommandLine.h:399
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:213
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition: DenseSet.h:193
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:202
static constexpr bool isKnownLE(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition: TypeSize.h:232
constexpr bool isNonZero() const
Definition: TypeSize.h:158
static constexpr bool isKnownLT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition: TypeSize.h:218
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition: TypeSize.h:171
constexpr LeafTy multiplyCoefficientBy(ScalarTy RHS) const
Definition: TypeSize.h:258
constexpr bool isFixed() const
Returns true if the quantity is not scaled by vscale.
Definition: TypeSize.h:174
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition: TypeSize.h:168
constexpr bool isZero() const
Definition: TypeSize.h:156
static constexpr bool isKnownGT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition: TypeSize.h:225
constexpr LeafTy divideCoefficientBy(ScalarTy RHS) const
We do not provide the '/' operator here because division for polynomial types does not work in the sa...
Definition: TypeSize.h:254
static constexpr bool isKnownGE(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition: TypeSize.h:239
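A hedged sketch of how the FixedOrScalableQuantity API above composes for ElementCount values such as vectorization factors; the function is illustrative only and the concrete numbers are arbitrary.

  #include "llvm/Support/TypeSize.h"

  static bool elementCountDemo() {
    llvm::ElementCount VF  = llvm::ElementCount::getScalable(4); // <vscale x 4>
    llvm::ElementCount VF2 = VF.multiplyCoefficientBy(2);        // <vscale x 8>
    unsigned MinLanes = VF2.getKnownMinValue();                  // 8 (times vscale at runtime)
    // Comparisons are only "known" when they hold for every possible vscale.
    return VF2.isScalable() && MinLanes == 8 &&
           llvm::ElementCount::isKnownLE(VF, VF2);
  }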
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
Definition: ilist_node.h:32
A range adaptor for a pair of iterators.
IteratorT end() const
IteratorT begin() const
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:52
A raw_ostream that writes to an std::string.
Definition: raw_ostream.h:661
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:125
@ Entry
Definition: COFF.h:844
@ Tail
Attempts to make calls as fast as possible while guaranteeing that tail call optimization can always b...
Definition: CallingConv.h:76
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
ID ArrayRef< Type * > Tys
Definition: Intrinsics.h:102
std::variant< std::monostate, Loc::Single, Loc::Multi, Loc::MMI, Loc::EntryValue > Variant
Alias for the std::variant specialization base class of DbgVariable.
Definition: DwarfDebug.h:190
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
Definition: PatternMatch.h:982
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
Definition: PatternMatch.h:826
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
Definition: PatternMatch.h:885
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
OneUse_match< T > m_OneUse(const T &SubPattern)
Definition: PatternMatch.h:67
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
SpecificCmpClass_match< LHS, RHS, ICmpInst > m_SpecificICmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
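A hedged example of the PatternMatch helpers listed above; isSingleUseMulBy4 is a hypothetical predicate that recognizes a one-use multiply by the constant 4 and captures its variable operand.

  #include "llvm/IR/PatternMatch.h"
  #include "llvm/IR/Value.h"

  static bool isSingleUseMulBy4(llvm::Value *V, llvm::Value *&X) {
    using namespace llvm::PatternMatch;
    // Matches (mul X, 4) where the multiply result has exactly one use.
    return match(V, m_OneUse(m_Mul(m_Value(X), m_SpecificInt(4))));
  }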
ValuesClass values(OptsTy... Options)
Helper to build a ValuesClass by forwarding a variable number of arguments as an initializer list to ...
Definition: CommandLine.h:711
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
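For illustration, a hypothetical command-line option declared with cl::init and cl::desc in the same style as the EnableVPlanNativePath option shown further down this listing; the flag name and default value are made up.

  #include "llvm/Support/CommandLine.h"

  static llvm::cl::opt<unsigned> ExampleSmallLoopBound(
      "example-small-loop-bound", llvm::cl::init(16), llvm::cl::Hidden,
      llvm::cl::desc("Hypothetical threshold, shown only to illustrate cl::init"));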
DiagnosticInfoOptimizationBase::Argument NV
NodeAddr< InstrNode * > Instr
Definition: RDFGraph.h:389
NodeAddr< PhiNode * > Phi
Definition: RDFGraph.h:390
const_iterator begin(StringRef path LLVM_LIFETIME_BOUND, Style style=Style::native)
Get begin iterator over path.
Definition: Path.cpp:226
const_iterator end(StringRef path LLVM_LIFETIME_BOUND)
Get end iterator over path.
Definition: Path.cpp:235
bool isUniformAfterVectorization(const VPValue *VPV)
Returns true if VPV is uniform after vectorization.
Definition: VPlanUtils.h:39
VPValue * getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr, ScalarEvolution &SE)
Get or create a VPValue that corresponds to the expansion of Expr.
Definition: VPlanUtils.cpp:26
const SCEV * getSCEVExprForVPValue(VPValue *V, ScalarEvolution &SE)
Return the SCEV expression for V.
Definition: VPlanUtils.cpp:65
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
bool simplifyLoop(Loop *L, DominatorTree *DT, LoopInfo *LI, ScalarEvolution *SE, AssumptionCache *AC, MemorySSAUpdater *MSSAU, bool PreserveLCSSA)
Simplify each loop in a loop nest recursively.
void ReplaceInstWithInst(BasicBlock *BB, BasicBlock::iterator &BI, Instruction *I)
Replace the instruction specified by BI with the instruction specified by I.
@ Offset
Definition: DWP.cpp:480
Value * addRuntimeChecks(Instruction *Loc, Loop *TheLoop, const SmallVectorImpl< RuntimePointerCheck > &PointerChecks, SCEVExpander &Expander, bool HoistRuntimeChecks=false)
Add code that checks at runtime if the accessed arrays in PointerChecks overlap.
Definition: LoopUtils.cpp:1954
bool RemoveRedundantDbgInstrs(BasicBlock *BB)
Try to remove redundant dbg.value instructions from given basic block.
std::optional< unsigned > getLoopEstimatedTripCount(Loop *L, unsigned *EstimatedLoopInvocationWeight=nullptr)
Returns a loop's estimated trip count based on branch weight metadata.
Definition: LoopUtils.cpp:850
static void reportVectorization(OptimizationRemarkEmitter *ORE, Loop *TheLoop, VectorizationFactor VF, unsigned IC)
Report successful vectorization of the loop.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1739
unsigned getLoadStoreAddressSpace(const Value *I)
A helper function that returns the address space of the pointer operand of a load or store instruction.
Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
Definition: LoopUtils.cpp:989
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1697
Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition: STLExtras.h:2448
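A hedged sketch combining the range helpers listed above (all_of over an enumerate range); allOperandsNonNull is a made-up helper that checks every (index, value) pair of a container.

  #include "llvm/ADT/STLExtras.h"
  #include <vector>

  static bool allOperandsNonNull(const std::vector<int *> &Ops) {
    // enumerate() yields elements exposing .index() and .value().
    return llvm::all_of(llvm::enumerate(Ops), [](const auto &IdxAndOp) {
      return IdxAndOp.value() != nullptr; // IdxAndOp.index() is also available
    });
  }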
auto pred_end(const MachineBasicBlock *BB)
bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
Definition: Verifier.cpp:7297
auto successors(const MachineBasicBlock *BB)
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
std::pair< Instruction *, ElementCount > InstructionVFPair
Value * getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF)
Return the runtime value for VF.
bool formLCSSARecursively(Loop &L, const DominatorTree &DT, const LoopInfo *LI, ScalarEvolution *SE)
Put a loop nest into LCSSA form.
Definition: LCSSA.cpp:465
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
std::optional< MDNode * > makeFollowupLoopID(MDNode *OrigLoopID, ArrayRef< StringRef > FollowupAttrs, const char *InheritOptionsAttrsPrefix="", bool AlwaysNew=false)
Create a new loop identifier for a loop created from a loop transformation.
Definition: LoopUtils.cpp:263
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition: STLExtras.h:2115
bool shouldOptimizeForSize(const MachineFunction *MF, ProfileSummaryInfo *PSI, const MachineBlockFrequencyInfo *BFI, PGSOQueryType QueryType=PGSOQueryType::Other)
Returns true if machine function MF is suggested to be size-optimized based on the profile.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition: STLExtras.h:657
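A hedged sketch of the make_early_inc_range idiom above: the iterator is advanced before the loop body runs, so the current element may be erased safely. The cleanup helper below is hypothetical and deliberately conservative.

  #include "llvm/ADT/STLExtras.h"
  #include "llvm/IR/BasicBlock.h"
  #include "llvm/IR/Instruction.h"

  static void eraseUnusedSideEffectFreeInstrs(llvm::BasicBlock &BB) {
    for (llvm::Instruction &I : llvm::make_early_inc_range(BB))
      if (I.use_empty() && !I.mayHaveSideEffects())
        I.eraseFromParent(); // safe: iteration has already moved past I
  }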
Align getLoadStoreAlignment(const Value *I)
A helper function that returns the alignment of a load or store instruction.
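A hedged sketch of the getLoadStore* helpers that appear throughout this listing; describeMemAccess is a hypothetical function that pulls the pointer operand, accessed type, alignment, and address space out of a load or store in a uniform way.

  #include "llvm/IR/Instructions.h"
  #include <cassert>

  static void describeMemAccess(const llvm::Instruction *I) {
    assert((llvm::isa<llvm::LoadInst>(I) || llvm::isa<llvm::StoreInst>(I)) &&
           "the helpers below expect a load or store");
    const llvm::Value *Ptr = llvm::getLoadStorePointerOperand(I);
    llvm::Type *AccessTy   = llvm::getLoadStoreType(I);
    llvm::Align Alignment  = llvm::getLoadStoreAlignment(I);
    unsigned AddrSpace     = llvm::getLoadStoreAddressSpace(I);
    (void)Ptr; (void)AccessTy; (void)Alignment; (void)AddrSpace;
  }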
iterator_range< df_iterator< VPBlockShallowTraversalWrapper< VPBlockBase * > > > vp_depth_first_shallow(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order.
Definition: VPlanCFG.h:214
bool VerifySCEV
iterator_range< df_iterator< VPBlockDeepTraversalWrapper< VPBlockBase * > > > vp_depth_first_deep(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order while traversing t...
Definition: VPlanCFG.h:226
auto map_range(ContainerTy &&C, FuncTy F)
Definition: STLExtras.h:377
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1746
void collectEphemeralRecipesForVPlan(VPlan &Plan, DenseSet< VPRecipeBase * > &EphRecipes)
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:420
void setBranchWeights(Instruction &I, ArrayRef< uint32_t > Weights, bool IsExpected)
Create a new branch_weights metadata node and add or overwrite a prof metadata reference to instructi...
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:291
cl::opt< bool > EnableVPlanNativePath("enable-vplan-native-path", cl::Hidden, cl::desc("Enable VPlan-native vectorization path with " "support for outer loop vectorization."))
Definition: VPlan.cpp:53
void sort(IteratorTy Start, IteratorTy End)
Definition: STLExtras.h:1664
std::unique_ptr< VPlan > VPlanPtr
Definition: VPlan.h:144
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1753
cl::opt< bool > EnableLoopVectorization
bool wouldInstructionBeTriviallyDead(const Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction would have no side effects if it was not used.
Definition: Local.cpp:425
bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr, bool UseVariableInfo=true)
Return true if the instruction does not have any effects besides calculating the result and does not ...
iterator_range< filter_iterator< detail::IterOfRange< RangeT >, PredicateT > > make_filter_range(RangeT &&Range, PredicateT Pred)
Convenience function that takes a range of elements and a predicate, and return a new filter_iterator...
Definition: STLExtras.h:573
void llvm_unreachable_internal(const char *msg=nullptr, const char *file=nullptr, unsigned line=0)
This function calls abort(), and prints the optional message to stderr.
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition: Format.h:125
auto drop_end(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the last N elements excluded.
Definition: STLExtras.h:336
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:403
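A hedged arithmetic sketch using isPowerOf2_32 and divideCeil from the entries above, in the flavor of trip-count math: with a folded tail, a trip count of TripCount lanes needs ceil(TripCount / VF) vector iterations. The helper name is hypothetical.

  #include "llvm/Support/MathExtras.h"
  #include <cassert>

  static unsigned tailFoldedVectorIterations(unsigned TripCount, unsigned VF) {
    assert(llvm::isPowerOf2_32(VF) && "fixed vector widths are powers of two here");
    return llvm::divideCeil(TripCount, VF); // e.g. divideCeil(10, 4) == 3
  }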
TargetTransformInfo TTI
static void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag, OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I=nullptr, DebugLoc DL={})
Reports an informative message: print Msg for debugging purposes as well as an optimization remark.
bool isAssignmentTrackingEnabled(const Module &M)
Return true if assignment tracking is enabled for module M.
Definition: DebugInfo.cpp:2298
RecurKind
These are the kinds of recurrences that we support.
Definition: IVDescriptors.h:33
@ Or
Bitwise or logical OR of integers.
@ FMulAdd
Sum of float products with llvm.fmuladd(a * b + sum).
void setProfileInfoAfterUnrolling(Loop *OrigLoop, Loop *UnrolledLoop, Loop *RemainderLoop, uint64_t UF)
Set weights for UnrolledLoop and RemainderLoop based on weights for OrigLoop and the following distri...
Definition: LoopUtils.cpp:1761
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
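A hedged sketch of alignTo together with the Align struct described further down this listing: round a byte size up to the next multiple of a power-of-two alignment.

  #include "llvm/Support/Alignment.h"
  #include <cstdint>

  static uint64_t paddedSize(uint64_t SizeInBytes) {
    llvm::Align A(16);                    // 16-byte alignment (must be a power of two)
    return llvm::alignTo(SizeInBytes, A); // alignTo(20, Align(16)) == 32
  }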
void reportVectorizationFailure(const StringRef DebugMsg, const StringRef OREMsg, const StringRef ORETag, OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I=nullptr)
Reports a vectorization failure: print DebugMsg for debugging purposes along with the corresponding o...
DWARFExpression::Operation Op
ScalarEpilogueLowering
@ CM_ScalarEpilogueNotAllowedLowTripLoop
@ CM_ScalarEpilogueNotNeededUsePredicate
@ CM_ScalarEpilogueNotAllowedOptSize
@ CM_ScalarEpilogueAllowed
@ CM_ScalarEpilogueNotAllowedUsePredicate
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition: STLExtras.h:1945
auto pred_begin(const MachineBasicBlock *BB)
Value * createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF, int64_t Step)
Return a value for Step multiplied by VF.
BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="", bool Before=false)
Split the specified block at the specified instruction.
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1766
auto predecessors(const MachineBasicBlock *BB)
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1903
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
Value * addDiffRuntimeChecks(Instruction *Loc, ArrayRef< PointerDiffInfo > Checks, SCEVExpander &Expander, function_ref< Value *(IRBuilderBase &, unsigned)> GetVF, unsigned IC)
Definition: LoopUtils.cpp:2012
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer lists are equal or the list is empty.
Definition: STLExtras.h:2087
InstructionCost Cost
@ DataAndControlFlowWithoutRuntimeCheck
Use predicate to control both data and control flow, but modify the trip count so that a runtime over...
@ None
Don't use tail folding.
@ DataWithEVL
Use predicated EVL instructions for tail-folding.
@ DataWithoutLaneMask
Same as Data, but avoids using the get.active.lane.mask intrinsic to calculate the mask and instead i...
unsigned getReciprocalPredBlockProb()
A helper function that returns the reciprocal of the block probability of predicated blocks.
Definition: VPlan.h:92
bool hasBranchWeightMD(const Instruction &I)
Checks if an instruction has Branch Weight Metadata.
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition: Hashing.h:590
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition: bit.h:327
Type * toVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
bool verifyVPlanIsValid(const VPlan &Plan)
Verify invariants for general VPlans.
MapVector< Instruction *, uint64_t > computeMinimumValueSizes(ArrayRef< BasicBlock * > Blocks, DemandedBits &DB, const TargetTransformInfo *TTI=nullptr)
Compute a map of integer instructions to their minimum legal type size.
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition: Hashing.h:468
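A hedged sketch of hash_combine and hash_combine_range from the entries above; hashOpcodeAndIds is a made-up keying function that folds a scalar, a length, and a sequence into one hash_code.

  #include "llvm/ADT/Hashing.h"
  #include <vector>

  static llvm::hash_code hashOpcodeAndIds(unsigned Opcode,
                                          const std::vector<int> &Ids) {
    return llvm::hash_combine(Opcode, Ids.size(),
                              llvm::hash_combine_range(Ids.begin(), Ids.end()));
  }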
cl::opt< bool > EnableLoopInterleaving
Implement std::hash so that hash_code can be used in STL containers.
Definition: BitVector.h:858
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
A special type used by analysis passes to provide an address that identifies that particular analysis...
Definition: Analysis.h:28
static void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
Definition: CodeMetrics.cpp:71
An information struct used to provide DenseMap with the various necessary components for a given valu...
Definition: DenseMapInfo.h:52
Encapsulate information regarding vectorization of a loop and its epilogue.
EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF, ElementCount EVF, unsigned EUF, VPlan &EpiloguePlan)
A class that represents two vectorization factors (initialized with 0 by default).
static FixedScalableVFPair getNone()
This holds details about a histogram operation – a load -> update -> store sequence where each lane i...
A struct that represents some properties of the register usage of a loop.
SmallMapVector< unsigned, unsigned, 4 > MaxLocalUsers
Holds the maximum number of concurrent live intervals in the loop.
SmallMapVector< unsigned, unsigned, 4 > LoopInvariantRegs
Holds the number of loop invariant values that are used in the loop.
TargetLibraryInfo * TLI
LoopVectorizeResult runImpl(Function &F)
ProfileSummaryInfo * PSI
LoopAccessInfoManager * LAIs
void printPipeline(raw_ostream &OS, function_ref< StringRef(StringRef)> MapClassName2PassName)
LoopVectorizePass(LoopVectorizeOptions Opts={})
BlockFrequencyInfo * BFI
ScalarEvolution * SE
AssumptionCache * AC
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
OptimizationRemarkEmitter * ORE
Storage for information about made changes.
A chain of instructions that form a partial reduction.
Instruction * Reduction
The top-level binary operation that forms the reduction to a scalar after the loop body.
Instruction * ExtendA
The extension of each of the inner binary operation's operands.
A CRTP mix-in to automatically provide informational APIs needed for passes.
Definition: PassManager.h:69
A marker analysis to determine if extra passes should be run after loop vectorization.
A MapVector that performs no allocations if smaller than a certain size.
Definition: MapVector.h:254
Flags describing the kind of vector reduction.
Parameters that control the generic loop unrolling transformation.
bool UnrollVectorizedLoop
Don't disable runtime unroll for the loops which were vectorized.
Holds the VFShape for a specific scalar to vector function mapping.
std::optional< unsigned > getParamIndexForOptionalMask() const
Instruction Set Architecture.
Encapsulates information needed to describe a parameter.
A range of powers-of-2 vectorization factors with fixed start and adjustable end.
Definition: VPlan.h:97
ElementCount End
Definition: VPlan.h:102
Struct to hold various analysis needed for cost computations.
Definition: VPlan.h:682
LoopVectorizationCostModel & CM
Definition: VPlan.h:687
bool skipCostComputation(Instruction *UI, bool IsVector) const
Return true if the cost for UI shouldn't be computed, e.g.
InstructionCost getLegacyCost(Instruction *UI, ElementCount VF) const
Return the cost for UI with VF using the legacy cost model as fallback until computing the cost of al...
SmallPtrSet< Instruction *, 8 > SkipCostComputation
Definition: VPlan.h:688
A recipe for handling first-order recurrence phis.
Definition: VPlan.h:2346
BasicBlock * PrevBB
The previous IR BasicBlock created or used.
Definition: VPlan.h:344
SmallDenseMap< VPBasicBlock *, BasicBlock * > VPBB2IRBB
A mapping of each VPBasicBlock to the corresponding BasicBlock.
Definition: VPlan.h:352
VPTransformState holds information passed down when "executing" a VPlan, needed for generating the ou...
Definition: VPlan.h:236
DenseMap< const SCEV *, Value * > ExpandedSCEVs
Map SCEVs to their expanded values.
Definition: VPlan.h:389
VPTypeAnalysis TypeAnalysis
VPlan-based type analysis.
Definition: VPlan.h:392
void packScalarIntoVectorValue(VPValue *Def, const VPLane &Lane)
Construct the vector value of a scalarized value V one lane at a time.
Definition: VPlan.cpp:394
Value * get(VPValue *Def, bool IsScalar=false)
Get the generated vector Value for a given VPValue Def if IsScalar is false, otherwise return the gen...
Definition: VPlan.cpp:249
struct llvm::VPTransformState::CFGState CFG
LoopVersioning * LVer
LoopVersioning.
Definition: VPlan.h:385
void addNewMetadata(Instruction *To, const Instruction *Orig)
Add additional metadata to To that was not present on Orig.
Definition: VPlan.cpp:353
std::optional< VPLane > Lane
Hold the index to generate specific scalar instructions.
Definition: VPlan.h:250
IRBuilderBase & Builder
Hold a reference to the IRBuilder used to generate output IR code.
Definition: VPlan.h:369
VPlan * Plan
Pointer to the VPlan that code is generated for.
Definition: VPlan.h:375
InnerLoopVectorizer * ILV
Hold a pointer to InnerLoopVectorizer to reuse its IR generation methods.
Definition: VPlan.h:372
ElementCount VF
The chosen Vectorization Factor of the loop being vectorized.
Definition: VPlan.h:245
void setDebugLocFrom(DebugLoc DL)
Set the debug location in the builder using the debug location DL.
Definition: VPlan.cpp:372
void set(VPValue *Def, Value *V, bool IsScalar=false)
Set the generated vector Value for a given VPValue, if IsScalar is false.
Definition: VPlan.h:279
A recipe for widening load operations, using the address to load from and an optional mask.
Definition: VPlan.h:3023
A recipe for widening select instructions.
Definition: VPlan.h:1816
A recipe for widening store operations, using the stored value, the address to store to and an option...
Definition: VPlan.h:3101
static void handleUncountableEarlyExit(VPlan &Plan, ScalarEvolution &SE, Loop *OrigLoop, BasicBlock *UncountableExitingBlock, VPRecipeBuilder &RecipeBuilder)
Update Plan to account for the uncountable early exit block in UncountableExitingBlock by ...
static void convertToConcreteRecipes(VPlan &Plan)
Lower abstract recipes to concrete ones that can be codegen'd.
static void unrollByUF(VPlan &Plan, unsigned UF, LLVMContext &Ctx)
Explicitly unroll Plan by UF.
static void dropPoisonGeneratingRecipes(VPlan &Plan, function_ref< bool(BasicBlock *)> BlockNeedsPredication)
Drop poison flags from recipes that may generate a poison value that is used after vectorization,...
static void createInterleaveGroups(VPlan &Plan, const SmallPtrSetImpl< const InterleaveGroup< Instruction > * > &InterleaveGroups, VPRecipeBuilder &RecipeBuilder, bool ScalarEpilogueAllowed)
static void removeDeadRecipes(VPlan &Plan)
Remove dead recipes from Plan.
static void clearReductionWrapFlags(VPlan &Plan)
Clear NSW/NUW flags from reduction instructions if necessary.
static bool tryAddExplicitVectorLength(VPlan &Plan, const std::optional< unsigned > &MaxEVLSafeElements)
Add a VPEVLBasedIVPHIRecipe and related recipes to Plan and replace all uses except the canonical IV...
static void VPInstructionsToVPRecipes(VPlanPtr &Plan, function_ref< const InductionDescriptor *(PHINode *)> GetIntOrFpInductionDescriptor, ScalarEvolution &SE, const TargetLibraryInfo &TLI)
Replaces the VPInstructions in Plan with corresponding widen recipes.
static void addActiveLaneMask(VPlan &Plan, bool UseActiveLaneMaskForControlFlow, bool DataAndControlFlowWithoutRuntimeCheck)
Replace (ICMP_ULE, wide canonical IV, backedge-taken-count) checks with an (active-lane-mask recipe,...
static void optimize(VPlan &Plan)
Apply VPlan-to-VPlan optimizations to Plan, including induction recipe optimizations,...
static void truncateToMinimalBitwidths(VPlan &Plan, const MapVector< Instruction *, uint64_t > &MinBWs)
Insert truncates and extends for any truncated recipe.
static bool adjustFixedOrderRecurrences(VPlan &Plan, VPBuilder &Builder)
Try to have all users of fixed-order recurrences appear after the recipe defining their previous valu...
static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Optimize Plan based on BestVF and BestUF.
TODO: The following VectorizationFactor was pulled out of LoopVectorizationCostModel class.
InstructionCost Cost
Cost of the loop with that width.
ElementCount MinProfitableTripCount
The minimum trip count required to make vectorization profitable, e.g.
ElementCount Width
Vector width with best cost.
InstructionCost ScalarCost
Cost of the scalar loop.
static VectorizationFactor Disabled()
Width 1 means no vectorization, cost 0 means uncomputed cost.